# mmd/rss_feeds.py

"""
rss_feeds.py — Danske finansielle RSS feeds til MoneyMaker

Feeds:
  Børsen             https://borsen.dk/rss
  Finans.dk top      https://feeds.finans.dk/topnyheder
  Finans.dk seneste  https://feeds.finans.dk/seneste
  Politiken øko      https://politiken.dk/rss/oekonomi.rss

Artikler gemmes i samme `articles` tabel som Ground News.
`source_count` sættes til feedets kredibilitets-vægt (ikke antal medier,
men et indikativt tal der giver coverage_spread > 0 i pipeline).

Full text (title + description + content:encoded) gemmes i `page_cache`
med url-nøgle `rss:{slug}` så analyze.py kan hente det i Phase 3.
"""

import html
import re
import sqlite3
import time
import xml.etree.ElementTree as ET
import zlib
from datetime import datetime, timezone
from email.utils import parsedate_to_datetime
from pathlib import Path

import httpx

from db import get_conn, DBConn

# ---------------------------------------------------------------------------
# Feed-katalog — tilføj nye her
# ---------------------------------------------------------------------------

# Feed catalogue, keyed by feed id. Each row below is
# (feed_id, url, label, weight); `weight` is the credibility weight that is
# passed to _parse_feed() and stored as the article's `source_count`.
# To add a feed, append a row. Berlingske erhverv (suggested weight 7) is not
# listed yet because its feed URL has not been found.
FEEDS: dict[str, dict] = {
    feed_id: {"url": url, "label": label, "weight": weight}
    for feed_id, url, label, weight in (
        # Børsen: leading Danish business outlet.
        ("borsen", "https://borsen.dk/rss", "Børsen", 8),
        ("finans-top", "https://feeds.finans.dk/topnyheder", "Finans.dk (top)", 7),
        ("finans-seneste", "https://feeds.finans.dk/seneste", "Finans.dk (seneste)", 6),
        ("politiken-oekonomi", "https://politiken.dk/rss/oekonomi.rss", "Politiken økonomi", 6),
    )
}

# Seconds a fetched feed counts as fresh before it is downloaded again
# (same TTL as Ground News).
CACHE_TTL = 30 * 60

# Namespace URIs for the prefixed RSS elements this module reads
# (content:encoded and dc:date); _ns() looks prefixes up here.
NS = {
    "content": "http://purl.org/rss/1.0/modules/content/",
    "dc":      "http://purl.org/dc/elements/1.1/",
}


# ---------------------------------------------------------------------------
# Hjælpefunktioner
# ---------------------------------------------------------------------------

def get_db() -> DBConn:
    """Open a database handle via db.get_conn(); the schema is managed by db.py."""
    conn = get_conn()
    return conn


def _ensure_rss_cache_table(db: DBConn) -> None:
    """Do nothing: the schema is now managed by db.py init_schema()."""
    return None


def _is_cached(db: DBConn, feed_id: str) -> bool:
    """Tell whether `feed_id` was fetched less than CACHE_TTL seconds ago.

    Returns False when `rss_feed_cache` has no row for the feed.
    """
    cursor = db.execute(
        "SELECT fetched_at FROM rss_feed_cache WHERE feed_id = ?", (feed_id,)
    )
    row = cursor.fetchone()
    if not row:
        return False
    age_seconds = time.time() - row["fetched_at"]
    return age_seconds < CACHE_TTL


def _mark_cached(db: DBConn, feed_id: str) -> None:
    """Store the current time as the fetch time of `feed_id` and commit."""
    fetched_at = int(time.time())
    columns = ["feed_id", "fetched_at"]
    db.upsert("rss_feed_cache", "feed_id", columns, (feed_id, fetched_at))
    db.commit()


def _ns(prefix: str, tag: str) -> str:
    """Build the ElementTree-qualified name ``{namespace-uri}tag`` for a prefix in NS."""
    namespace_uri = NS[prefix]
    return "{" + namespace_uri + "}" + tag


def _text(item: ET.Element, tag: str, ns_prefix: str | None = None) -> str:
    el = item.find(_ns(ns_prefix, tag) if ns_prefix else tag)
    return (el.text or "").strip() if el is not None else ""


def _strip_html(s: str) -> str:
    s = re.sub(r"<[^>]+>", " ", s)
    s = re.sub(r"&[a-z]+;", " ", s)
    return re.sub(r"\s+", " ", s).strip()


def _make_slug(feed_id: str, url: str) -> str:
    """Lav et unikt slug fra feed-navn + URL-sti."""
    path = url.split("?")[0].rstrip("/").split("/")[-1]
    path = re.sub(r"^ECE\d+-", "", path)           # Finans.dk ECE-id
    path = re.sub(r"\.rss$|\.html$", "", path)
    path = re.sub(r"[^a-z0-9\-]", "-", path.lower())[:55]
    path = path.strip("-")
    return f"{feed_id}-{path}" if path else f"{feed_id}-{abs(hash(url)) % 100000}"


def _parse_date(s: str) -> str:
    if not s:
        return datetime.now(timezone.utc).strftime("%Y-%m-%d")
    try:
        return parsedate_to_datetime(s).strftime("%Y-%m-%d")
    except Exception:
        return s[:10] if len(s) >= 10 else datetime.now(timezone.utc).strftime("%Y-%m-%d")


# ---------------------------------------------------------------------------
# Parse + upsert
# ---------------------------------------------------------------------------

def _parse_feed(feed_id: str, xml_text: str, weight: int) -> list[dict]:
    """Parse an RSS document into article dicts ready for _upsert().

    Every ``<item>`` with a ``<link>`` (or, failing that, a ``<guid>``) becomes
    one dict; items with neither are skipped. HTML is stripped from the title,
    the description (cut to 600 characters) and ``content:encoded`` (cut to
    3000 characters).

    Args:
        feed_id: Key of the feed in FEEDS; used for the slug prefix and the
            ``rss:<feed_id>`` category.
        xml_text: The feed document as text.
        weight: Credibility weight of the feed, stored as ``source_count``.

    Returns:
        A list of dicts with the keys ``slug``, ``title``, ``description``,
        ``full_text``, ``start_date``, ``source_count``, ``categories``,
        ``first_seen`` and ``last_seen``.

    Raises:
        xml.etree.ElementTree.ParseError: If `xml_text` is not well-formed XML.
    """
    # NOTE(review): the stdlib XML parser is documented as not hardened against
    # maliciously constructed documents; the feeds here are assumed trusted.
    root = ET.fromstring(xml_text)

    # Read the clock once so first_seen and last_seen always agree for every
    # article parsed from this document.
    now = int(time.time())
    category = f"rss:{feed_id}"
    articles = []

    for item in root.findall(".//item"):
        link = _text(item, "link") or _text(item, "guid")
        if not link:
            continue

        title = _strip_html(_text(item, "title"))
        desc = _strip_html(_text(item, "description"))[:600]
        encoded = _strip_html(_text(item, "encoded", "content"))[:3000]
        pub = _text(item, "pubDate") or _text(item, "date", "dc")

        # Full text for NLP: the title plus the richest body available.
        full_text = f"{title}. {encoded or desc}".strip()

        articles.append({
            "slug":        _make_slug(feed_id, link),
            "title":       title,
            "description": desc,
            "full_text":   full_text,
            "start_date":  _parse_date(pub),
            "source_count": weight,
            "categories":  category,
            "first_seen":  now,
            "last_seen":   now,
        })

    return articles


def _upsert(db: DBConn, articles: list[dict]) -> int:
    """Write parsed articles to the database and return how many were new.

    An article whose slug already exists in `articles` only gets its
    `last_seen` refreshed; an unknown slug is inserted. In both cases the full
    text is upserted into `page_cache` under the key ``rss:<slug>``, so that
    analyze.py can fetch it in Phase 3. Everything is committed once at the end.
    """
    now = int(time.time())
    inserted = 0

    for article in articles:
        slug = article["slug"]
        known = db.execute(
            "SELECT 1 FROM articles WHERE slug = ?", (slug,)
        ).fetchone()

        if not known:
            db.execute(
                """INSERT INTO articles
                   (slug, title, description, start_date,
                    source_count, categories, first_seen, last_seen)
                   VALUES (?, ?, ?, ?, ?, ?, ?, ?)""",
                (
                    slug,
                    article["title"],
                    article["description"],
                    article["start_date"],
                    article["source_count"],
                    article["categories"],
                    article["first_seen"],
                    article["last_seen"],
                ),
            )
            inserted += 1
        else:
            db.execute(
                "UPDATE articles SET last_seen = ? WHERE slug = ?",
                (now, slug),
            )

        # Runs for new and known articles alike.
        cache_row = (f"rss:{slug}", "rss", now, article["full_text"])
        db.upsert(
            "page_cache", "url",
            ["url", "page_type", "fetched_at", "content"],
            cache_row,
        )

    db.commit()
    return inserted


# ---------------------------------------------------------------------------
# Hoved-funktion
# ---------------------------------------------------------------------------

def fetch_all_rss(db: DBConn, force: bool = False) -> int:
    """Fetch every feed in FEEDS, store its articles and return the number of new ones.

    A feed that is still fresh according to _is_cached() is skipped unless
    `force` is true. A failure in one feed is printed and does not stop the
    remaining feeds.
    """
    _ensure_rss_cache_table(db)
    request_headers = {"User-Agent": "MoneyMaker/1.0 RSS reader (+https://github.com)"}
    new_counts: list[int] = []

    for feed_id, cfg in FEEDS.items():
        # With force=True the cache table is not consulted at all.
        still_fresh = (not force) and _is_cached(db, feed_id)
        if still_fresh:
            print(f"  💾 {cfg['label']:<30} (cache)")
            continue

        try:
            resp = httpx.get(
                cfg["url"],
                headers=request_headers,
                follow_redirects=True,
                timeout=15,
            )
            resp.raise_for_status()
            articles = _parse_feed(feed_id, resp.text, cfg["weight"])
            new = _upsert(db, articles)
            _mark_cached(db, feed_id)
            new_counts.append(new)
            print(f"  🌐 {cfg['label']:<30} {len(articles):2} artikler  (+{new} nye)")
        except Exception as e:
            # Best effort: report the failure and move on to the next feed.
            print(f"  ✗  {cfg['label']:<30} FEJL: {e}")

    return sum(new_counts)


if __name__ == "__main__":
    # Script entry point: refresh every feed, bypassing the freshness cache.
    db = get_db()
    try:
        print("[rss] Henter feeds …")
        n = fetch_all_rss(db, force=True)
        print(f"[rss] Færdig. {n} nye artikler.")
    finally:
        # Close the handle even when fetching or printing raises.
        db.close()