First commit

2026-05-26 22:21:27 +02:00
parent 2743a236b2
commit 05eed51e7d
90 changed files with 8690 additions and 0 deletions
--- a/rss_feeds.py
+++ b/rss_feeds.py
@@ -0,0 +1,244 @@
+"""
+rss_feeds.py — Danske finansielle RSS feeds til MoneyMaker
+
+Feeds:
+  Børsen             https://borsen.dk/rss
+  Finans.dk top      https://feeds.finans.dk/topnyheder
+  Finans.dk seneste  https://feeds.finans.dk/seneste
+  Politiken øko      https://politiken.dk/rss/oekonomi.rss
+
+Artikler gemmes i samme `articles` tabel som Ground News.
+`source_count` sættes til feedets kredibilitets-vægt (ikke antal medier,
+men et indikativt tal der giver coverage_spread > 0 i pipeline).
+
+Full text (title + description + content:encoded) gemmes i `page_cache`
+med url-nøgle `rss:{slug}` så analyze.py kan hente det i Phase 3.
+"""
+
+import re
+import time
+import sqlite3
+import xml.etree.ElementTree as ET
+from email.utils import parsedate_to_datetime
+from datetime import datetime, timezone
+from pathlib import Path
+
+import httpx
+from db import get_conn, DBConn
+
+# ---------------------------------------------------------------------------
+# Feed-katalog — tilføj nye her
+# ---------------------------------------------------------------------------
+
+# Feed id -> fetch URL, display label and credibility weight.
+# The weight is what ends up in the article's source_count column.
+FEEDS: dict[str, dict] = {
+    "borsen": dict(
+        url="https://borsen.dk/rss",
+        label="Børsen",
+        weight=8,  # leading Danish business outlet
+    ),
+    "finans-top": dict(
+        url="https://feeds.finans.dk/topnyheder",
+        label="Finans.dk (top)",
+        weight=7,
+    ),
+    "finans-seneste": dict(
+        url="https://feeds.finans.dk/seneste",
+        label="Finans.dk (seneste)",
+        weight=6,
+    ),
+    "politiken-oekonomi": dict(
+        url="https://politiken.dk/rss/oekonomi.rss",
+        label="Politiken økonomi",
+        weight=6,
+    ),
+    # Berlingske: add the URL once it has been found
+    # "berlingske-erhverv": dict(
+    #     url="https://...",
+    #     label="Berlingske erhverv",
+    #     weight=7,
+    # ),
+}
+
+# Seconds a fetched feed counts as fresh (30 minutes); same TTL as Ground News.
+CACHE_TTL = 60 * 30
+
+# Namespace URIs for the prefixed RSS extension elements, looked up by _ns().
+NS = dict(
+    content="http://purl.org/rss/1.0/modules/content/",
+    dc="http://purl.org/dc/elements/1.1/",
+)
+
+
+# ---------------------------------------------------------------------------
+# Hjælpefunktioner
+# ---------------------------------------------------------------------------
+
+def get_db() -> DBConn:
+    """Open the project database through ``get_conn`` and hand it back.
+
+    The schema is managed by db.py, not by this module.
+    """
+    conn = get_conn()
+    return conn
+
+
+def _ensure_rss_cache_table(db: DBConn) -> None:
+    """Do nothing: the schema is now created by db.py's init_schema()."""
+    return None
+
+
+def _is_cached(db: DBConn, feed_id: str) -> bool:
+    """Tell whether ``feed_id`` was fetched less than ``CACHE_TTL`` seconds ago.
+
+    Reads the feed's ``fetched_at`` from ``rss_feed_cache``; a feed without
+    a row there counts as not cached.
+    """
+    query = "SELECT fetched_at FROM rss_feed_cache WHERE feed_id = ?"
+    hit = db.execute(query, (feed_id,)).fetchone()
+    if not hit:
+        return False
+    age = time.time() - hit["fetched_at"]
+    return age < CACHE_TTL
+
+
+def _mark_cached(db: DBConn, feed_id: str) -> None:
+    """Record the current Unix time as ``feed_id``'s last fetch and commit."""
+    fetched_at = int(time.time())
+    columns = ["feed_id", "fetched_at"]
+    db.upsert("rss_feed_cache", "feed_id", columns, (feed_id, fetched_at))
+    db.commit()
+
+
+def _ns(prefix: str, tag: str) -> str:
+    """Build the ElementTree ``{namespace-uri}tag`` name for a prefix in ``NS``."""
+    uri = NS[prefix]
+    return "{" + uri + "}" + tag
+
+
+def _text(item: ET.Element, tag: str, ns_prefix: str | None = None) -> str:
+    """Return the stripped text of the first ``tag`` child of ``item``.
+
+    When ``ns_prefix`` is given, the tag is looked up in that namespace.
+    A missing child or an element without text yields "".
+    """
+    name = _ns(ns_prefix, tag) if ns_prefix else tag
+    child = item.find(name)
+    if child is None:
+        return ""
+    raw = child.text or ""
+    return raw.strip()
+
+
+def _strip_html(s: str) -> str:
+    """Reduce an HTML fragment to plain text on a single line.
+
+    Tags are replaced by spaces, character references are decoded (so
+    ``&oslash;`` becomes ``ø`` and ``&#229;`` becomes ``å`` instead of being
+    blanked out), and runs of whitespace collapse to a single space.
+
+    Args:
+        s: Text that may contain HTML markup and character references.
+
+    Returns:
+        The cleaned text without leading or trailing whitespace.
+    """
+    import html  # local import keeps this block self-contained
+
+    without_tags = re.sub(r"<[^>]+>", " ", s)
+    # Decode after tag removal so an escaped "&lt;b&gt;" stays literal text
+    # instead of being mistaken for markup and stripped.
+    decoded = html.unescape(without_tags)
+    # \s also matches the non-breaking space that &nbsp; decodes to.
+    return re.sub(r"\s+", " ", decoded).strip()
+
+
+def _make_slug(feed_id: str, url: str) -> str:
+    """Build a slug from the feed id and the last path segment of ``url``.
+
+    The query string, a leading Finans.dk ``ECE<digits>-`` id and a trailing
+    ``.rss``/``.html`` are dropped; the rest is lower-cased, every character
+    outside ``a-z0-9-`` becomes ``-``, and the result is cut to 55 characters.
+
+    Args:
+        feed_id: Feed identifier used as the slug prefix.
+        url: Article link (or guid) taken from the feed item.
+
+    Returns:
+        ``"{feed_id}-{segment}"``, or ``"{feed_id}-{number}"`` with a
+        checksum of the URL when no usable segment is left.
+    """
+    import zlib  # local import keeps this block self-contained
+
+    path = url.split("?")[0].rstrip("/").split("/")[-1]
+    path = re.sub(r"^ECE\d+-", "", path)           # Finans.dk ECE id
+    path = re.sub(r"\.rss$|\.html$", "", path)
+    path = re.sub(r"[^a-z0-9\-]", "-", path.lower())[:55]
+    path = path.strip("-")
+    if path:
+        return f"{feed_id}-{path}"
+    # Builtin hash() is salted per process for strings, so it would give the
+    # same URL a different slug on every run; CRC32 is stable across runs.
+    return f"{feed_id}-{zlib.crc32(url.encode('utf-8')) % 100000}"
+
+
+def _parse_date(s: str) -> str:
+    """Convert a feed timestamp to a ``YYYY-MM-DD`` string.
+
+    RFC 2822 dates (``pubDate``) are parsed with ``parsedate_to_datetime``
+    and keep the calendar date of their own UTC offset. Anything else is
+    accepted only if its first ten characters form a valid ``YYYY-MM-DD``
+    date, which covers ISO 8601 values.
+
+    Args:
+        s: Raw date text from the feed; may be empty.
+
+    Returns:
+        The parsed date, or today's UTC date when ``s`` is empty or cannot
+        be read as a date.
+    """
+    today = datetime.now(timezone.utc).strftime("%Y-%m-%d")
+    s = (s or "").strip()
+    if not s:
+        return today
+    try:
+        return parsedate_to_datetime(s).strftime("%Y-%m-%d")
+    except (TypeError, ValueError, IndexError, OverflowError):
+        # Not an RFC 2822 date; try the ISO prefix below.
+        pass
+    try:
+        # Validate the prefix rather than returning it blindly, so arbitrary
+        # text is never stored as a date.
+        return datetime.strptime(s[:10], "%Y-%m-%d").strftime("%Y-%m-%d")
+    except ValueError:
+        return today
+
+
+# ---------------------------------------------------------------------------
+# Parse + upsert
+# ---------------------------------------------------------------------------
+
+def _parse_feed(feed_id: str, xml_text: str | bytes, weight: int) -> list[dict]:
+    """Turn an RSS document into article dicts ready for ``_upsert``.
+
+    Args:
+        feed_id: Feed identifier; prefixes every slug and is recorded in
+            ``categories`` as ``rss:{feed_id}``.
+        xml_text: The feed's XML source, as text or raw bytes
+            (``ET.fromstring`` accepts both).
+        weight: Value stored as each article's ``source_count``.
+
+    Returns:
+        One dict per ``<item>`` that has a ``<link>`` or ``<guid>``; items
+        with neither are skipped.
+
+    Raises:
+        xml.etree.ElementTree.ParseError: If ``xml_text`` is not well-formed.
+    """
+    root = ET.fromstring(xml_text)
+    # One clock read for the whole batch, so first_seen and last_seen of an
+    # article can never differ by a second.
+    now = int(time.time())
+    articles = []
+
+    for item in root.findall(".//item"):
+        link = _text(item, "link") or _text(item, "guid")
+        if not link:
+            continue
+
+        title   = _strip_html(_text(item, "title"))
+        desc    = _strip_html(_text(item, "description"))[:600]
+        encoded = _strip_html(_text(item, "encoded", "content"))[:3000]
+        pub     = _text(item, "pubDate") or _text(item, "date", "dc")
+        slug    = _make_slug(feed_id, link)
+
+        # Full text for NLP: everything we have (content:encoded preferred
+        # over the shorter description).
+        full_text = f"{title}. {encoded or desc}".strip()
+
+        articles.append({
+            "slug":        slug,
+            "title":       title,
+            "description": desc,
+            "full_text":   full_text,
+            "start_date":  _parse_date(pub),
+            "source_count": weight,
+            "categories":  f"rss:{feed_id}",
+            "first_seen":  now,
+            "last_seen":   now,
+        })
+
+    return articles
+
+
+def _upsert(db: DBConn, articles: list[dict]) -> int:
+    """Store parsed articles plus their full text; return how many were new.
+
+    A slug already present in ``articles`` only gets its ``last_seen``
+    refreshed; an unknown slug is inserted. Either way the article's full
+    text is written to ``page_cache`` under ``rss:{slug}`` so analyze.py can
+    fetch it in Phase 3. Everything is committed once at the end.
+    """
+    stamp = int(time.time())
+    inserted = 0
+    for art in articles:
+        slug = art["slug"]
+        known = db.execute(
+            "SELECT 1 FROM articles WHERE slug = ?", (slug,)
+        ).fetchone()
+
+        if not known:
+            db.execute(
+                """INSERT INTO articles
+                   (slug, title, description, start_date,
+                    source_count, categories, first_seen, last_seen)
+                   VALUES (?, ?, ?, ?, ?, ?, ?, ?)""",
+                (slug, art["title"], art["description"], art["start_date"],
+                 art["source_count"], art["categories"],
+                 art["first_seen"], art["last_seen"]),
+            )
+            inserted += 1
+        else:
+            db.execute(
+                "UPDATE articles SET last_seen = ? WHERE slug = ?",
+                (stamp, slug),
+            )
+
+        # Keep the full text in page_cache, keyed by the rss: pseudo-URL.
+        cache_row = (f"rss:{slug}", "rss", stamp, art["full_text"])
+        db.upsert(
+            "page_cache", "url",
+            ["url", "page_type", "fetched_at", "content"],
+            cache_row,
+        )
+
+    db.commit()
+    return inserted
+
+
+# ---------------------------------------------------------------------------
+# Hoved-funktion
+# ---------------------------------------------------------------------------
+
+def fetch_all_rss(db: DBConn, force: bool = False) -> int:
+    """Fetch every feed in ``FEEDS`` and store its articles in the database.
+
+    A feed fetched within the last ``CACHE_TTL`` seconds is skipped unless
+    ``force`` is true. A failure on one feed is printed and does not stop
+    the remaining feeds.
+
+    Args:
+        db: Open database connection.
+        force: Ignore the per-feed cache and fetch everything.
+
+    Returns:
+        The number of newly inserted articles across all feeds.
+    """
+    _ensure_rss_cache_table(db)
+    total_new = 0
+
+    for feed_id, cfg in FEEDS.items():
+        if not force and _is_cached(db, feed_id):
+            print(f"  💾 {cfg['label']:<30} (cache)")
+            continue
+
+        try:
+            resp = httpx.get(
+                cfg["url"], timeout=15, follow_redirects=True,
+                headers={"User-Agent": "MoneyMaker/1.0 RSS reader (+https://github.com)"},
+            )
+            resp.raise_for_status()
+            # Hand the parser the raw bytes, not resp.text: httpx decodes
+            # text from the HTTP header charset (UTF-8 when absent), which
+            # garbles feeds that declare another encoding only in their XML
+            # prolog. The XML parser reads that declaration itself.
+            articles = _parse_feed(feed_id, resp.content, cfg["weight"])
+            new = _upsert(db, articles)
+            _mark_cached(db, feed_id)
+            total_new += new
+            print(f"  🌐 {cfg['label']:<30} {len(articles):2} artikler  (+{new} nye)")
+        except Exception as e:
+            # Deliberate best-effort boundary: report and move on to the
+            # next feed.
+            print(f"  ✗  {cfg['label']:<30} FEJL: {e}")
+
+    return total_new
+
+
+if __name__ == "__main__":
+    # Script entry point: force-refresh every feed and report the count.
+    db = get_db()
+    try:
+        print("[rss] Henter feeds …")
+        n = fetch_all_rss(db, force=True)
+        print(f"[rss] Færdig. {n} nye artikler.")
+    finally:
+        # Release the connection even if a fetch step raises.
+        db.close()