Files
mmd/rss_feeds.py
Henrik Jess Nielsen 05eed51e7d First commit
2026-05-26 22:21:27 +02:00

245 lines
7.6 KiB
Python

"""
rss_feeds.py — Danske finansielle RSS feeds til MoneyMaker
Feeds:
Børsen https://borsen.dk/rss
Finans.dk top https://feeds.finans.dk/topnyheder
Politiken øko https://politiken.dk/rss/oekonomi.rss
Artikler gemmes i samme `articles` tabel som Ground News.
`source_count` sættes til feedets kredibilitets-vægt (ikke antal medier,
men et indikativt tal der giver coverage_spread > 0 i pipeline).
Full text (title + description + content:encoded) gemmes i `page_cache`
med url-nøgle `rss:{slug}` så analyze.py kan hente det i Phase 3.
"""
import re
import time
import sqlite3
import xml.etree.ElementTree as ET
from email.utils import parsedate_to_datetime
from datetime import datetime, timezone
from pathlib import Path
import httpx
from db import get_conn, DBConn
# ---------------------------------------------------------------------------
# Feed-katalog — tilføj nye her
# ---------------------------------------------------------------------------
# Catalog of the RSS feeds to poll, keyed by feed id. Each entry holds the
# feed URL, a display label used in console output, and a credibility weight
# that is stored as the article's `source_count`.
FEEDS: dict[str, dict] = {
    "borsen": {
        "url": "https://borsen.dk/rss",
        "label": "Børsen",
        "weight": 8,  # Leading Danish business outlet
    },
    "finans-top": {
        "url": "https://feeds.finans.dk/topnyheder",
        "label": "Finans.dk (top)",
        "weight": 7,
    },
    "finans-seneste": {
        "url": "https://feeds.finans.dk/seneste",
        "label": "Finans.dk (seneste)",
        "weight": 6,
    },
    "politiken-oekonomi": {
        "url": "https://politiken.dk/rss/oekonomi.rss",
        "label": "Politiken økonomi",
        "weight": 6,
    },
    # Berlingske: add the URL once it has been found
    # "berlingske-erhverv": {
    #     "url": "https://...",
    #     "label": "Berlingske erhverv",
    #     "weight": 7,
    # },
}
# Seconds a fetched feed counts as fresh before it is fetched again
# (same TTL as Ground News).
CACHE_TTL = 30 * 60
# Namespace URIs for the prefixed RSS elements, looked up by prefix in _ns().
NS = {
    "content": "http://purl.org/rss/1.0/modules/content/",
    "dc": "http://purl.org/dc/elements/1.1/",
}
# ---------------------------------------------------------------------------
# Hjælpefunktioner
# ---------------------------------------------------------------------------
def get_db() -> DBConn:
    """Open and return the project's DBConn wrapper.

    The schema is not touched here; it is managed by db.py.
    """
    connection = get_conn()
    return connection
def _ensure_rss_cache_table(db: DBConn) -> None:
    """Do nothing; the schema is now managed by db.py's init_schema().

    Still called at the start of fetch_all_rss().
    """
    return None
def _is_cached(db: DBConn, feed_id: str) -> bool:
    """Return True when `feed_id` was fetched less than CACHE_TTL seconds ago.

    A feed with no row in `rss_feed_cache` is reported as not cached.
    """
    cached = db.execute(
        "SELECT fetched_at FROM rss_feed_cache WHERE feed_id = ?", (feed_id,)
    ).fetchone()
    if not cached:
        return False
    age_seconds = time.time() - cached["fetched_at"]
    return age_seconds < CACHE_TTL
def _mark_cached(db: DBConn, feed_id: str) -> None:
    """Store the current time as the fetch time of `feed_id` and commit."""
    fetched_at = int(time.time())
    columns = ["feed_id", "fetched_at"]
    db.upsert("rss_feed_cache", "feed_id", columns, (feed_id, fetched_at))
    db.commit()
def _ns(prefix: str, tag: str) -> str:
    """Return `tag` qualified as `{namespace-uri}tag` for a prefix listed in NS.

    Raises KeyError when `prefix` is not a key of NS.
    """
    uri = NS[prefix]
    return "{%s}%s" % (uri, tag)
def _text(item: ET.Element, tag: str, ns_prefix: str | None = None) -> str:
el = item.find(_ns(ns_prefix, tag) if ns_prefix else tag)
return (el.text or "").strip() if el is not None else ""
def _strip_html(s: str) -> str:
    """Return `s` as plain text.

    Tags are replaced by a space, HTML character references (named such as
    `&oslash;` and numeric such as `&#248;`) are decoded to their characters,
    and runs of whitespace are collapsed to a single space.

    Tags are removed before decoding so that an escaped `&lt;b&gt;` in the
    text survives as the literal characters instead of being treated as markup.
    """
    # Local import so the block works without touching the file's import list.
    from html import unescape

    without_tags = re.sub(r"<[^>]+>", " ", s)
    # Decoding instead of blanking keeps words such as "B&oslash;rsen" intact;
    # &nbsp; becomes U+00A0, which the \s+ collapse below turns into a space.
    decoded = unescape(without_tags)
    return re.sub(r"\s+", " ", decoded).strip()
def _make_slug(feed_id: str, url: str) -> str:
    """Build a slug from the feed id and the last path segment of `url`.

    The query string, a leading Finans.dk `ECE<digits>-` id and a trailing
    `.rss`/`.html` are dropped; the rest is lower-cased, every character
    outside `[a-z0-9-]` becomes `-`, and the result is cut to 55 characters.

    When nothing usable remains, the slug falls back to a CRC32 of the URL
    (modulo 100000), which is the same on every run.
    """
    # Local import so the block works without touching the file's import list.
    import zlib

    path = url.split("?")[0].rstrip("/").split("/")[-1]
    path = re.sub(r"^ECE\d+-", "", path)  # Finans.dk ECE id
    path = re.sub(r"\.rss$|\.html$", "", path)
    path = re.sub(r"[^a-z0-9\-]", "-", path.lower())[:55].strip("-")
    if path:
        return f"{feed_id}-{path}"
    # The built-in hash() of a str is salted per process, so it would give the
    # same article a different slug on every run; a CRC32 is deterministic.
    checksum = zlib.crc32(url.encode("utf-8", "replace")) % 100000
    return f"{feed_id}-{checksum}"
def _parse_date(s: str) -> str:
    """Return the date part of a feed timestamp as `YYYY-MM-DD`.

    `s` is first parsed as an RFC 2822 date (the RSS `pubDate` format); the
    date is taken in the timestamp's own UTC offset. If that fails, the first
    10 characters are accepted only when they form a valid `YYYY-MM-DD` date,
    which covers ISO 8601 values. Empty or unparseable input yields today's
    date in UTC.
    """
    today = datetime.now(timezone.utc).strftime("%Y-%m-%d")
    if not s:
        return today
    try:
        return parsedate_to_datetime(s).strftime("%Y-%m-%d")
    except (TypeError, ValueError, IndexError, OverflowError):
        # Not RFC 2822 (older Pythons raise TypeError here, newer ValueError);
        # try the ISO prefix below.
        pass
    try:
        # Validate the prefix instead of returning arbitrary text as a date.
        return datetime.strptime(s[:10], "%Y-%m-%d").strftime("%Y-%m-%d")
    except ValueError:
        return today
# ---------------------------------------------------------------------------
# Parse + upsert
# ---------------------------------------------------------------------------
def _parse_feed(feed_id: str, xml_text: str, weight: int) -> list[dict]:
    """Turn the XML of one RSS feed into a list of article dicts.

    Every `<item>` with a link (or, failing that, a guid) yields one dict
    holding the slug, the cleaned title, the description (at most 600
    characters), the full text for NLP, the publication date, `weight` as
    `source_count`, the category `rss:<feed_id>` and the current time as
    both `first_seen` and `last_seen`. Items without link and guid are
    skipped.

    Raises whatever ET.fromstring raises on malformed XML.
    """
    parsed: list[dict] = []
    for entry in ET.fromstring(xml_text).findall(".//item"):
        url = _text(entry, "link") or _text(entry, "guid")
        if url:
            headline = _strip_html(_text(entry, "title"))
            summary = _strip_html(_text(entry, "description"))[:600]
            body = _strip_html(_text(entry, "encoded", "content"))[:3000]
            published = _text(entry, "pubDate") or _text(entry, "date", "dc")
            # Full text for NLP: the title plus content:encoded when present,
            # otherwise the description.
            text_for_nlp = f"{headline}. {body or summary}".strip()
            parsed.append({
                "slug": _make_slug(feed_id, url),
                "title": headline,
                "description": summary,
                "full_text": text_for_nlp,
                "start_date": _parse_date(published),
                "source_count": weight,
                "categories": f"rss:{feed_id}",
                "first_seen": int(time.time()),
                "last_seen": int(time.time()),
            })
    return parsed
def _upsert(db: DBConn, articles: list[dict]) -> int:
    """Write `articles` to the database and return how many were new.

    An article whose slug already exists only gets its `last_seen` bumped;
    an unknown slug is inserted. In both cases the full text is upserted into
    `page_cache` under `rss:<slug>` so analyze.py can fetch it in Phase 3.
    Everything is committed once at the end.
    """
    stamp = int(time.time())
    inserted = 0
    for art in articles:
        slug = art["slug"]
        found = db.execute(
            "SELECT 1 FROM articles WHERE slug = ?", (slug,)
        ).fetchone()
        if not found:
            db.execute(
                "INSERT INTO articles\n"
                "(slug, title, description, start_date,\n"
                "source_count, categories, first_seen, last_seen)\n"
                "VALUES (?, ?, ?, ?, ?, ?, ?, ?)",
                (
                    slug,
                    art["title"],
                    art["description"],
                    art["start_date"],
                    art["source_count"],
                    art["categories"],
                    art["first_seen"],
                    art["last_seen"],
                ),
            )
            inserted += 1
        else:
            db.execute(
                "UPDATE articles SET last_seen = ? WHERE slug = ?",
                (stamp, slug),
            )
        # The cached text is refreshed for new and existing articles alike.
        db.upsert(
            "page_cache", "url",
            ["url", "page_type", "fetched_at", "content"],
            (f"rss:{slug}", "rss", stamp, art["full_text"]),
        )
    db.commit()
    return inserted
# ---------------------------------------------------------------------------
# Hoved-funktion
# ---------------------------------------------------------------------------
def fetch_all_rss(db: DBConn, force: bool = False) -> int:
    """Fetch every feed in FEEDS and store its articles in the database.

    A feed fetched within CACHE_TTL is skipped unless `force` is true. Any
    exception raised while fetching, parsing or storing one feed is printed
    and the loop moves on to the next feed.

    Returns the number of newly inserted articles across all feeds.
    """
    _ensure_rss_cache_table(db)
    request_headers = {
        "User-Agent": "MoneyMaker/1.0 RSS reader (+https://github.com)",
    }
    added = 0
    for feed_id in FEEDS:
        cfg = FEEDS[feed_id]
        if force or not _is_cached(db, feed_id):
            try:
                response = httpx.get(
                    cfg["url"],
                    timeout=15,
                    follow_redirects=True,
                    headers=request_headers,
                )
                response.raise_for_status()
                articles = _parse_feed(feed_id, response.text, cfg["weight"])
                fresh = _upsert(db, articles)
                _mark_cached(db, feed_id)
                added += fresh
                print(f" 🌐 {cfg['label']:<30} {len(articles):2} artikler (+{fresh} nye)")
            except Exception as exc:
                print(f"{cfg['label']:<30} FEJL: {exc}")
        else:
            print(f" 💾 {cfg['label']:<30} (cache)")
    return added
if __name__ == "__main__":
db = get_db()
print("[rss] Henter feeds …")
n = fetch_all_rss(db, force=True)
print(f"[rss] Færdig. {n} nye artikler.")
db.close()