rss_feeds.py

"""
rss_feeds.py — Danish financial RSS feeds for MoneyMaker

Feeds:
  Børsen              https://borsen.dk/rss
  Finans.dk top       https://feeds.finans.dk/topnyheder
  Finans.dk seneste   https://feeds.finans.dk/seneste
  Politiken øko       https://politiken.dk/rss/oekonomi.rss

Articles are stored in the same `articles` table as Ground News.
`source_count` is set to the feed's credibility weight (not a number of
outlets, but an indicative figure that yields coverage_spread > 0 in the
pipeline).

The full text (title + content:encoded, or the description when the item has
no content:encoded) is stored in `page_cache` under the url key `rss:{slug}`
so that analyze.py can fetch it in Phase 3.
"""

import hashlib
import html
import re
import sqlite3
import time
import xml.etree.ElementTree as ET
from datetime import datetime, timezone
from email.utils import parsedate_to_datetime
from pathlib import Path

import httpx

from db import get_conn, DBConn

# ---------------------------------------------------------------------------
# Feed-katalog — tilføj nye her
# ---------------------------------------------------------------------------

# Catalogue of the feeds to poll, keyed by feed id. Within this module the
# feed id is used as the key in `rss_feed_cache`, as the slug prefix and in
# the `rss:{feed_id}` category tag; `label` is only used in console output
# and `weight` is stored as the article's `source_count`.
FEEDS: dict[str, dict] = {
    "borsen": {
        "url":    "https://borsen.dk/rss",
        "label":  "Børsen",
        "weight": 8,   # leading Danish business outlet
    },
    "finans-top": {
        "url":    "https://feeds.finans.dk/topnyheder",
        "label":  "Finans.dk (top)",
        "weight": 7,
    },
    "finans-seneste": {
        "url":    "https://feeds.finans.dk/seneste",
        "label":  "Finans.dk (seneste)",
        "weight": 6,
    },
    "politiken-oekonomi": {
        "url":    "https://politiken.dk/rss/oekonomi.rss",
        "label":  "Politiken økonomi",
        "weight": 6,
    },
    # Berlingske: add the URL once it has been found
    # "berlingske-erhverv": {
    #     "url":    "https://...",
    #     "label":  "Berlingske erhverv",
    #     "weight": 7,
    # },
}

CACHE_TTL = 30 * 60   # seconds a feed counts as fresh; same TTL as Ground News

# Namespace URIs for the prefixed RSS extension tags read by this module
# (`content:encoded` and `dc:date`).
NS = {
    "content": "http://purl.org/rss/1.0/modules/content/",
    "dc":      "http://purl.org/dc/elements/1.1/",
}


# ---------------------------------------------------------------------------
# Hjælpefunktioner
# ---------------------------------------------------------------------------

def get_db() -> DBConn:
    """Open the project database and hand back its connection wrapper.

    This simply delegates to ``get_conn`` from ``db``; no schema work is
    done here (the schema is managed by db.py).
    """
    conn = get_conn()
    return conn


def _ensure_rss_cache_table(db: DBConn) -> None:
    """Do nothing; retained so existing call sites keep working.

    Schema creation is now handled by ``init_schema()`` in db.py, so the
    ``db`` argument is accepted but ignored.
    """
    return None


def _is_cached(db: DBConn, feed_id: str) -> bool:
    """Tell whether ``feed_id`` was fetched less than ``CACHE_TTL`` seconds ago.

    Returns False when the feed has no row in ``rss_feed_cache``.
    """
    query = "SELECT fetched_at FROM rss_feed_cache WHERE feed_id = ?"
    record = db.execute(query, (feed_id,)).fetchone()
    if not record:
        # Never fetched before, so nothing is cached.
        return False
    age_seconds = time.time() - record["fetched_at"]
    return age_seconds < CACHE_TTL


def _mark_cached(db: DBConn, feed_id: str) -> None:
    """Record the current time as the latest fetch of ``feed_id`` and commit."""
    fetched_at = int(time.time())
    row = (feed_id, fetched_at)
    db.upsert("rss_feed_cache", "feed_id", ["feed_id", "fetched_at"], row)
    db.commit()


def _ns(prefix: str, tag: str) -> str:
    """Return the ``{namespace-uri}tag`` form of a prefixed tag name.

    ``prefix`` must be a key of ``NS``; an unknown prefix raises ``KeyError``.
    """
    uri = NS[prefix]
    return "{" + uri + "}" + tag


def _text(item: ET.Element, tag: str, ns_prefix: str | None = None) -> str:
    el = item.find(_ns(ns_prefix, tag) if ns_prefix else tag)
    return (el.text or "").strip() if el is not None else ""


def _strip_html(s: str) -> str:
    """Reduce an HTML fragment to plain, single-spaced text.

    Tags are replaced by a space, character references are decoded (named
    ones such as ``&oslash;`` as well as numeric ones such as ``&#248;``),
    and runs of whitespace are collapsed to one space.

    Args:
        s: Text that may contain HTML markup and entities.

    Returns:
        The cleaned text with no leading or trailing whitespace.
    """
    without_tags = re.sub(r"<[^>]+>", " ", s)
    # Decode entities instead of blanking them: replacing "&oslash;" with a
    # space would split Danish words ("B&oslash;rsen" -> "B rsen").
    # Decoding happens after tag removal so an escaped "&lt;b&gt;" stays
    # visible as literal text rather than being stripped as a tag.
    decoded = html.unescape(without_tags)
    # "&nbsp;" decodes to U+00A0, which "\s" matches for str patterns, so it
    # is folded into an ordinary space here.
    return re.sub(r"\s+", " ", decoded).strip()


def _make_slug(feed_id: str, url: str) -> str:
    """Build an article slug from the feed id and the last URL path segment.

    The query string and trailing slashes are dropped, a leading Finans.dk
    ``ECE<digits>-`` id and a trailing ``.rss``/``.html`` are removed, and
    the rest is lower-cased with every character outside ``[a-z0-9-]``
    replaced by ``-`` (truncated to 55 characters).

    Args:
        feed_id: Key of the feed in ``FEEDS``; used as the slug prefix.
        url: Article link (or guid) taken from the feed item.

    Returns:
        ``"{feed_id}-{segment}"``, or ``"{feed_id}-{digest}"`` when no
        usable segment is left, where ``digest`` is a stable 12-character
        hex prefix of the URL's SHA-256.
    """
    path = url.split("?")[0].rstrip("/").split("/")[-1]
    path = re.sub(r"^ECE\d+-", "", path)           # Finans.dk ECE id
    path = re.sub(r"\.rss$|\.html$", "", path)
    path = re.sub(r"[^a-z0-9\-]", "-", path.lower())[:55]
    path = path.strip("-")
    if path:
        return f"{feed_id}-{path}"
    # The built-in hash() is salted per process for str, so it would give
    # the same URL a different slug on every run and re-insert the article.
    # A cryptographic digest is stable across runs.
    digest = hashlib.sha256(url.encode("utf-8")).hexdigest()[:12]
    return f"{feed_id}-{digest}"


def _parse_date(s: str) -> str:
    """Convert a feed date string to ``YYYY-MM-DD``.

    RFC 2822 dates (the ``pubDate`` format) are tried first; the date is
    taken in the timezone given in the string. Otherwise the first ten
    characters are accepted if they form a valid ISO ``YYYY-MM-DD`` date,
    which covers ``dc:date`` timestamps. Empty or unparseable input falls
    back to today's date in UTC.

    Args:
        s: Raw date text from the feed; may be empty.

    Returns:
        A date string in ``YYYY-MM-DD`` form.
    """
    today = datetime.now(timezone.utc).strftime("%Y-%m-%d")
    if not s:
        return today
    try:
        return parsedate_to_datetime(s).strftime("%Y-%m-%d")
    except (TypeError, ValueError, IndexError, OverflowError):
        # Not RFC 2822 (older Pythons raise TypeError here, newer ones
        # ValueError); try the ISO prefix below.
        pass
    try:
        # Validate the prefix rather than slicing blindly, so arbitrary
        # text such as "not a date" is never stored as a date.
        return datetime.strptime(s[:10], "%Y-%m-%d").strftime("%Y-%m-%d")
    except ValueError:
        return today


# ---------------------------------------------------------------------------
# Parse + upsert
# ---------------------------------------------------------------------------

def _parse_feed(feed_id: str, xml_text: str, weight: int) -> list[dict]:
    """Parse an RSS document into article dicts ready for ``_upsert``.

    Every ``<item>`` element with a ``link`` (or, failing that, a ``guid``)
    yields one dict; items with neither are skipped.

    Args:
        feed_id: Key of the feed in ``FEEDS``; used for the slug prefix and
            the ``rss:{feed_id}`` category tag.
        xml_text: The raw feed XML.
        weight: Feed weight, stored as the article's ``source_count``.

    Returns:
        A list of dicts with the keys ``slug``, ``title``, ``description``,
        ``full_text``, ``start_date``, ``source_count``, ``categories``,
        ``first_seen`` and ``last_seen``.

    Raises:
        xml.etree.ElementTree.ParseError: If ``xml_text`` is not
            well-formed XML.
    """
    root = ET.fromstring(xml_text)
    # Read the clock once per parse: two separate calls per item could land
    # on different seconds and give a new article first_seen != last_seen.
    seen_at = int(time.time())
    articles = []

    for item in root.findall(".//item"):
        link = _text(item, "link") or _text(item, "guid")
        if not link:
            continue

        title   = _strip_html(_text(item, "title"))
        desc    = _strip_html(_text(item, "description"))[:600]
        encoded = _strip_html(_text(item, "encoded", "content"))[:3000]
        # Use the RSS pubDate when present, otherwise Dublin Core dc:date.
        pub     = _text(item, "pubDate") or _text(item, "date", "dc")
        slug    = _make_slug(feed_id, link)

        # Text for NLP: the title plus the richest body available
        # (content:encoded when present, otherwise the description).
        full_text = f"{title}. {encoded or desc}".strip()

        articles.append({
            "slug":        slug,
            "title":       title,
            "description": desc,
            "full_text":   full_text,
            "start_date":  _parse_date(pub),
            "source_count": weight,
            "categories":  f"rss:{feed_id}",
            "first_seen":  seen_at,
            "last_seen":   seen_at,
        })

    return articles


def _upsert(db: DBConn, articles: list[dict]) -> int:
    """Store parsed articles and their full text, then commit.

    An article whose slug already exists in ``articles`` only has its
    ``last_seen`` refreshed; an unknown slug is inserted as a new row. In
    both cases the full text is upserted into ``page_cache`` under the key
    ``rss:{slug}``.

    Args:
        db: Open database wrapper.
        articles: Dicts as produced by ``_parse_feed``.

    Returns:
        The number of newly inserted articles.
    """
    lookup_sql = "SELECT 1 FROM articles WHERE slug = ?"
    touch_sql = "UPDATE articles SET last_seen = ? WHERE slug = ?"
    insert_sql = """INSERT INTO articles
                   (slug, title, description, start_date,
                    source_count, categories, first_seen, last_seen)
                   VALUES (?, ?, ?, ?, ?, ?, ?, ?)"""
    # Dict keys in the same order as the INSERT column list.
    insert_fields = (
        "slug", "title", "description", "start_date",
        "source_count", "categories", "first_seen", "last_seen",
    )

    stamp = int(time.time())
    inserted = 0

    for article in articles:
        slug = article["slug"]
        already_known = db.execute(lookup_sql, (slug,)).fetchone()

        if not already_known:
            db.execute(insert_sql, tuple(article[field] for field in insert_fields))
            inserted += 1
        else:
            db.execute(touch_sql, (stamp, slug))

        # Keep the full text in page_cache so analyze.py can read it in Phase 3.
        db.upsert(
            "page_cache", "url",
            ["url", "page_type", "fetched_at", "content"],
            (f"rss:{slug}", "rss", stamp, article["full_text"]),
        )

    db.commit()
    return inserted


# ---------------------------------------------------------------------------
# Hoved-funktion
# ---------------------------------------------------------------------------

def fetch_all_rss(db: DBConn, force: bool = False) -> int:
    """Fetch every feed in ``FEEDS`` and store its articles in the database.

    A feed fetched within the last ``CACHE_TTL`` seconds is skipped unless
    ``force`` is true. A failure while fetching, parsing or storing one
    feed is printed and does not stop the remaining feeds.

    Args:
        db: Open database wrapper.
        force: Ignore the per-feed cache and fetch everything.

    Returns:
        The total number of newly inserted articles.
    """
    _ensure_rss_cache_table(db)
    request_headers = {"User-Agent": "MoneyMaker/1.0 RSS reader (+https://github.com)"}
    added_total = 0

    for feed_id, cfg in FEEDS.items():
        label = cfg["label"]

        still_fresh = (not force) and _is_cached(db, feed_id)
        if still_fresh:
            print(f"  💾 {label:<30} (cache)")
            continue

        try:
            resp = httpx.get(
                cfg["url"],
                timeout=15,
                follow_redirects=True,
                headers=request_headers,
            )
            resp.raise_for_status()
            parsed = _parse_feed(feed_id, resp.text, cfg["weight"])
            added = _upsert(db, parsed)
            # Mark the feed as fetched only after its articles are stored.
            _mark_cached(db, feed_id)
            added_total += added
            print(f"  🌐 {label:<30} {len(parsed):2} artikler  (+{added} nye)")
        except Exception as e:
            # Best effort: report the broken feed and move on to the next.
            print(f"  ✗  {label:<30} FEJL: {e}")

    return added_total


if __name__ == "__main__":
    # Manual run: fetch every feed, ignoring the cache, and report how many
    # new articles were stored.
    db = get_db()
    try:
        print("[rss] Henter feeds …")
        n = fetch_all_rss(db, force=True)
        print(f"[rss] Færdig. {n} nye artikler.")
    finally:
        # Close the connection even if the fetch raises.
        db.close()
First commit 2026-05-26 22:21:27 +02:00			`"""`
			`rss_feeds.py — Danske finansielle RSS feeds til MoneyMaker`

			`Feeds:`
			`Børsen https://borsen.dk/rss`
			`Finans.dk top https://feeds.finans.dk/topnyheder`
			`Politiken øko https://politiken.dk/rss/oekonomi.rss`

			Artikler gemmes i samme `articles` tabel som Ground News.
			`source_count` sættes til feedets kredibilitets-vægt (ikke antal medier,
			`men et indikativt tal der giver coverage_spread > 0 i pipeline).`

			Full text (title + description + content:encoded) gemmes i `page_cache`
			med url-nøgle `rss:{slug}` så analyze.py kan hente det i Phase 3.
			`"""`

			`import re`
			`import time`
			`import sqlite3`
			`import xml.etree.ElementTree as ET`
			`from email.utils import parsedate_to_datetime`
			`from datetime import datetime, timezone`
			`from pathlib import Path`

			`import httpx`
			`from db import get_conn, DBConn`

			`# ---------------------------------------------------------------------------`
			`# Feed-katalog — tilføj nye her`
			`# ---------------------------------------------------------------------------`

			`FEEDS: dict[str, dict] = {`
			`"borsen": {`
			`"url": "https://borsen.dk/rss",`
			`"label": "Børsen",`
			`"weight": 8, # Førende dansk erhvervsmedie`
			`},`
			`"finans-top": {`
			`"url": "https://feeds.finans.dk/topnyheder",`
			`"label": "Finans.dk (top)",`
			`"weight": 7,`
			`},`
			`"finans-seneste": {`
			`"url": "https://feeds.finans.dk/seneste",`
			`"label": "Finans.dk (seneste)",`
			`"weight": 6,`
			`},`
			`"politiken-oekonomi": {`
			`"url": "https://politiken.dk/rss/oekonomi.rss",`
			`"label": "Politiken økonomi",`
			`"weight": 6,`
			`},`
			`# Berlingske: tilføj URL når den er fundet`
			`# "berlingske-erhverv": {`
			`# "url": "https://...",`
			`# "label": "Berlingske erhverv",`
			`# "weight": 7,`
			`# },`
			`}`

			`CACHE_TTL = 30 * 60 # samme TTL som Ground News`

			`NS = {`
			`"content": "http://purl.org/rss/1.0/modules/content/",`
			`"dc": "http://purl.org/dc/elements/1.1/",`
			`}`


			`# ---------------------------------------------------------------------------`
			`# Hjælpefunktioner`
			`# ---------------------------------------------------------------------------`

			`def get_db() -> DBConn:`
			`"""Return a DBConn wrapper. Schema is managed by db.py."""`
			`return get_conn()`


			`def _ensure_rss_cache_table(db: DBConn) -> None:`
			`"""No-op: schema is now managed by db.py init_schema()."""`
			`pass`


			`def _is_cached(db: DBConn, feed_id: str) -> bool:`
			`row = db.execute(`
			`"SELECT fetched_at FROM rss_feed_cache WHERE feed_id = ?", (feed_id,)`
			`).fetchone()`
			`return bool(row and (time.time() - row["fetched_at"]) < CACHE_TTL)`


			`def _mark_cached(db: DBConn, feed_id: str) -> None:`
			`db.upsert(`
			`"rss_feed_cache", "feed_id",`
			`["feed_id", "fetched_at"],`
			`(feed_id, int(time.time())),`
			`)`
			`db.commit()`


			`def _ns(prefix: str, tag: str) -> str:`
			`return f"{{{NS[prefix]}}}{tag}"`


			`def _text(item: ET.Element, tag: str, ns_prefix: str \| None = None) -> str:`
			`el = item.find(_ns(ns_prefix, tag) if ns_prefix else tag)`
			`return (el.text or "").strip() if el is not None else ""`


			`def _strip_html(s: str) -> str:`
			`s = re.sub(r"<[^>]+>", " ", s)`
			`s = re.sub(r"&[a-z]+;", " ", s)`
			`return re.sub(r"\s+", " ", s).strip()`


			`def _make_slug(feed_id: str, url: str) -> str:`
			`"""Lav et unikt slug fra feed-navn + URL-sti."""`
			`path = url.split("?")[0].rstrip("/").split("/")[-1]`
			`path = re.sub(r"^ECE\d+-", "", path) # Finans.dk ECE-id`
			`path = re.sub(r"\.rss$\|\.html$", "", path)`
			`path = re.sub(r"[^a-z0-9\-]", "-", path.lower())[:55]`
			`path = path.strip("-")`
			`return f"{feed_id}-{path}" if path else f"{feed_id}-{abs(hash(url)) % 100000}"`


			`def _parse_date(s: str) -> str:`
			`if not s:`
			`return datetime.now(timezone.utc).strftime("%Y-%m-%d")`
			`try:`
			`return parsedate_to_datetime(s).strftime("%Y-%m-%d")`
			`except Exception:`
			`return s[:10] if len(s) >= 10 else datetime.now(timezone.utc).strftime("%Y-%m-%d")`


			`# ---------------------------------------------------------------------------`
			`# Parse + upsert`
			`# ---------------------------------------------------------------------------`

			`def _parse_feed(feed_id: str, xml_text: str, weight: int) -> list[dict]:`
			`root = ET.fromstring(xml_text)`
			`articles = []`

			`for item in root.findall(".//item"):`
			`link = _text(item, "link") or _text(item, "guid")`
			`if not link:`
			`continue`

			`title = _strip_html(_text(item, "title"))`
			`desc = _strip_html(_text(item, "description"))[:600]`
			`encoded = _strip_html(_text(item, "encoded", "content"))[:3000]`
			`pub = _text(item, "pubDate") or _text(item, "date", "dc")`
			`slug = _make_slug(feed_id, link)`

			`# Fuld tekst til NLP: alt vi har`
			`full_text = f"{title}. {encoded or desc}".strip()`

			`articles.append({`
			`"slug": slug,`
			`"title": title,`
			`"description": desc,`
			`"full_text": full_text,`
			`"start_date": _parse_date(pub),`
			`"source_count": weight,`
			`"categories": f"rss:{feed_id}",`
			`"first_seen": int(time.time()),`
			`"last_seen": int(time.time()),`
			`})`

			`return articles`


			`def _upsert(db: DBConn, articles: list[dict]) -> int:`
			`new = 0`
			`now = int(time.time())`
			`for a in articles:`
			`exists = db.execute(`
			`"SELECT 1 FROM articles WHERE slug = ?", (a["slug"],)`
			`).fetchone()`

			`if exists:`
			`db.execute(`
			`"UPDATE articles SET last_seen = ? WHERE slug = ?",`
			`(now, a["slug"]),`
			`)`
			`else:`
			`db.execute(`
			`"""INSERT INTO articles`
			`(slug, title, description, start_date,`
			`source_count, categories, first_seen, last_seen)`
			`VALUES (?, ?, ?, ?, ?, ?, ?, ?)""",`
			`(a["slug"], a["title"], a["description"], a["start_date"],`
			`a["source_count"], a["categories"], a["first_seen"], a["last_seen"]),`
			`)`
			`new += 1`

			`# Gem full text i page_cache så analyze.py kan hente det i Phase 3`
			`db.upsert(`
			`"page_cache", "url",`
			`["url", "page_type", "fetched_at", "content"],`
			`(f"rss:{a['slug']}", "rss", now, a["full_text"]),`
			`)`

			`db.commit()`
			`return new`


			`# ---------------------------------------------------------------------------`
			`# Hoved-funktion`
			`# ---------------------------------------------------------------------------`

			`def fetch_all_rss(db: DBConn, force: bool = False) -> int:`
			`"""`
			`Hent alle RSS feeds og gem i DB.`
			`Returnerer antal nye artikler.`
			`"""`
			`_ensure_rss_cache_table(db)`
			`total_new = 0`

			`for feed_id, cfg in FEEDS.items():`
			`if not force and _is_cached(db, feed_id):`
			`print(f" 💾 {cfg['label']:<30} (cache)")`
			`continue`

			`try:`
			`resp = httpx.get(`
			`cfg["url"], timeout=15, follow_redirects=True,`
			`headers={"User-Agent": "MoneyMaker/1.0 RSS reader (+https://github.com)"},`
			`)`
			`resp.raise_for_status()`
			`articles = _parse_feed(feed_id, resp.text, cfg["weight"])`
			`new = _upsert(db, articles)`
			`_mark_cached(db, feed_id)`
			`total_new += new`
			`print(f" 🌐 {cfg['label']:<30} {len(articles):2} artikler (+{new} nye)")`
			`except Exception as e:`
			`print(f" ✗ {cfg['label']:<30} FEJL: {e}")`

			`return total_new`


			`if __name__ == "__main__":`
			`db = get_db()`
			`print("[rss] Henter feeds …")`
			`n = fetch_all_rss(db, force=True)`
			`print(f"[rss] Færdig. {n} nye artikler.")`
			`db.close()`