""" rss_feeds.py — Danske finansielle RSS feeds til MoneyMaker Feeds: Børsen https://borsen.dk/rss Finans.dk top https://feeds.finans.dk/topnyheder Politiken øko https://politiken.dk/rss/oekonomi.rss Artikler gemmes i samme `articles` tabel som Ground News. `source_count` sættes til feedets kredibilitets-vægt (ikke antal medier, men et indikativt tal der giver coverage_spread > 0 i pipeline). Full text (title + description + content:encoded) gemmes i `page_cache` med url-nøgle `rss:{slug}` så analyze.py kan hente det i Phase 3. """ import re import time import sqlite3 import xml.etree.ElementTree as ET from email.utils import parsedate_to_datetime from datetime import datetime, timezone from pathlib import Path import httpx from db import get_conn, DBConn # --------------------------------------------------------------------------- # Feed-katalog — tilføj nye her # --------------------------------------------------------------------------- FEEDS: dict[str, dict] = { "borsen": { "url": "https://borsen.dk/rss", "label": "Børsen", "weight": 8, # Førende dansk erhvervsmedie }, "finans-top": { "url": "https://feeds.finans.dk/topnyheder", "label": "Finans.dk (top)", "weight": 7, }, "finans-seneste": { "url": "https://feeds.finans.dk/seneste", "label": "Finans.dk (seneste)", "weight": 6, }, "politiken-oekonomi": { "url": "https://politiken.dk/rss/oekonomi.rss", "label": "Politiken økonomi", "weight": 6, }, # Berlingske: tilføj URL når den er fundet # "berlingske-erhverv": { # "url": "https://...", # "label": "Berlingske erhverv", # "weight": 7, # }, } CACHE_TTL = 30 * 60 # samme TTL som Ground News NS = { "content": "http://purl.org/rss/1.0/modules/content/", "dc": "http://purl.org/dc/elements/1.1/", } # --------------------------------------------------------------------------- # Hjælpefunktioner # --------------------------------------------------------------------------- def get_db() -> DBConn: """Return a DBConn wrapper. Schema is managed by db.py.""" return get_conn() def _ensure_rss_cache_table(db: DBConn) -> None: """No-op: schema is now managed by db.py init_schema().""" pass def _is_cached(db: DBConn, feed_id: str) -> bool: row = db.execute( "SELECT fetched_at FROM rss_feed_cache WHERE feed_id = ?", (feed_id,) ).fetchone() return bool(row and (time.time() - row["fetched_at"]) < CACHE_TTL) def _mark_cached(db: DBConn, feed_id: str) -> None: db.upsert( "rss_feed_cache", "feed_id", ["feed_id", "fetched_at"], (feed_id, int(time.time())), ) db.commit() def _ns(prefix: str, tag: str) -> str: return f"{{{NS[prefix]}}}{tag}" def _text(item: ET.Element, tag: str, ns_prefix: str | None = None) -> str: el = item.find(_ns(ns_prefix, tag) if ns_prefix else tag) return (el.text or "").strip() if el is not None else "" def _strip_html(s: str) -> str: s = re.sub(r"<[^>]+>", " ", s) s = re.sub(r"&[a-z]+;", " ", s) return re.sub(r"\s+", " ", s).strip() def _make_slug(feed_id: str, url: str) -> str: """Lav et unikt slug fra feed-navn + URL-sti.""" path = url.split("?")[0].rstrip("/").split("/")[-1] path = re.sub(r"^ECE\d+-", "", path) # Finans.dk ECE-id path = re.sub(r"\.rss$|\.html$", "", path) path = re.sub(r"[^a-z0-9\-]", "-", path.lower())[:55] path = path.strip("-") return f"{feed_id}-{path}" if path else f"{feed_id}-{abs(hash(url)) % 100000}" def _parse_date(s: str) -> str: if not s: return datetime.now(timezone.utc).strftime("%Y-%m-%d") try: return parsedate_to_datetime(s).strftime("%Y-%m-%d") except Exception: return s[:10] if len(s) >= 10 else datetime.now(timezone.utc).strftime("%Y-%m-%d") # --------------------------------------------------------------------------- # Parse + upsert # --------------------------------------------------------------------------- def _parse_feed(feed_id: str, xml_text: str, weight: int) -> list[dict]: root = ET.fromstring(xml_text) articles = [] for item in root.findall(".//item"): link = _text(item, "link") or _text(item, "guid") if not link: continue title = _strip_html(_text(item, "title")) desc = _strip_html(_text(item, "description"))[:600] encoded = _strip_html(_text(item, "encoded", "content"))[:3000] pub = _text(item, "pubDate") or _text(item, "date", "dc") slug = _make_slug(feed_id, link) # Fuld tekst til NLP: alt vi har full_text = f"{title}. {encoded or desc}".strip() articles.append({ "slug": slug, "title": title, "description": desc, "full_text": full_text, "start_date": _parse_date(pub), "source_count": weight, "categories": f"rss:{feed_id}", "first_seen": int(time.time()), "last_seen": int(time.time()), }) return articles def _upsert(db: DBConn, articles: list[dict]) -> int: new = 0 now = int(time.time()) for a in articles: exists = db.execute( "SELECT 1 FROM articles WHERE slug = ?", (a["slug"],) ).fetchone() if exists: db.execute( "UPDATE articles SET last_seen = ? WHERE slug = ?", (now, a["slug"]), ) else: db.execute( """INSERT INTO articles (slug, title, description, start_date, source_count, categories, first_seen, last_seen) VALUES (?, ?, ?, ?, ?, ?, ?, ?)""", (a["slug"], a["title"], a["description"], a["start_date"], a["source_count"], a["categories"], a["first_seen"], a["last_seen"]), ) new += 1 # Gem full text i page_cache så analyze.py kan hente det i Phase 3 db.upsert( "page_cache", "url", ["url", "page_type", "fetched_at", "content"], (f"rss:{a['slug']}", "rss", now, a["full_text"]), ) db.commit() return new # --------------------------------------------------------------------------- # Hoved-funktion # --------------------------------------------------------------------------- def fetch_all_rss(db: DBConn, force: bool = False) -> int: """ Hent alle RSS feeds og gem i DB. Returnerer antal nye artikler. """ _ensure_rss_cache_table(db) total_new = 0 for feed_id, cfg in FEEDS.items(): if not force and _is_cached(db, feed_id): print(f" 💾 {cfg['label']:<30} (cache)") continue try: resp = httpx.get( cfg["url"], timeout=15, follow_redirects=True, headers={"User-Agent": "MoneyMaker/1.0 RSS reader (+https://github.com)"}, ) resp.raise_for_status() articles = _parse_feed(feed_id, resp.text, cfg["weight"]) new = _upsert(db, articles) _mark_cached(db, feed_id) total_new += new print(f" 🌐 {cfg['label']:<30} {len(articles):2} artikler (+{new} nye)") except Exception as e: print(f" ✗ {cfg['label']:<30} FEJL: {e}") return total_new if __name__ == "__main__": db = get_db() print("[rss] Henter feeds …") n = fetch_all_rss(db, force=True) print(f"[rss] Færdig. {n} nye artikler.") db.close()