245 lines
7.6 KiB
Python
245 lines
7.6 KiB
Python
|
|
"""
rss_feeds.py — Danish financial RSS feeds for MoneyMaker

Feeds:
  Børsen            https://borsen.dk/rss
  Finans.dk top     https://feeds.finans.dk/topnyheder
  Finans.dk latest  https://feeds.finans.dk/seneste
  Politiken econ.   https://politiken.dk/rss/oekonomi.rss

Articles are stored in the same `articles` table as Ground News.
`source_count` is set to the feed's credibility weight (not a number of
outlets, but an indicative figure that yields coverage_spread > 0 in the
pipeline).

Full text (title + description + content:encoded) is stored in `page_cache`
under the url key `rss:{slug}` so analyze.py can fetch it in Phase 3.
"""
|
||
|
|
|
||
|
|
import html
import re
import sqlite3
import time
import xml.etree.ElementTree as ET
import zlib
from datetime import datetime, timezone
from email.utils import parsedate_to_datetime
from pathlib import Path

import httpx

from db import get_conn, DBConn
|
||
|
|
|
||
|
|
# ---------------------------------------------------------------------------
|
||
|
|
# Feed-katalog — tilføj nye her
|
||
|
|
# ---------------------------------------------------------------------------
|
||
|
|
|
||
|
|
# Feed catalogue: feed_id -> {"url", "label", "weight"}.
# Each row below is (feed_id, url, label, weight); add new feeds as new rows.
FEEDS: dict[str, dict] = {
    feed_id: {"url": url, "label": label, "weight": weight}
    for feed_id, url, label, weight in (
        # Leading Danish business outlet
        ("borsen", "https://borsen.dk/rss", "Børsen", 8),
        ("finans-top", "https://feeds.finans.dk/topnyheder", "Finans.dk (top)", 7),
        ("finans-seneste", "https://feeds.finans.dk/seneste", "Finans.dk (seneste)", 6),
        (
            "politiken-oekonomi",
            "https://politiken.dk/rss/oekonomi.rss",
            "Politiken økonomi",
            6,
        ),
        # Berlingske: add once the feed URL has been found, e.g.
        # ("berlingske-erhverv", "https://...", "Berlingske erhverv", 7),
    )
}
|
||
|
|
|
||
|
|
# Seconds a feed counts as freshly fetched (30 minutes); per the original
# note this is the same TTL as Ground News.
CACHE_TTL = 60 * 30

# XML namespace URIs, keyed by the prefix they are looked up under.
NS = dict(
    content="http://purl.org/rss/1.0/modules/content/",
    dc="http://purl.org/dc/elements/1.1/",
)
|
||
|
|
|
||
|
|
|
||
|
|
# ---------------------------------------------------------------------------
|
||
|
|
# Hjælpefunktioner
|
||
|
|
# ---------------------------------------------------------------------------
|
||
|
|
|
||
|
|
def get_db() -> DBConn:
    """Open the project database and hand back its ``DBConn`` wrapper.

    This is a thin alias for ``db.get_conn()``. Per the original note, the
    schema itself is managed by db.py, so nothing is created here.
    """
    conn = get_conn()
    return conn
|
||
|
|
|
||
|
|
|
||
|
|
def _ensure_rss_cache_table(db: DBConn) -> None:
|
||
|
|
"""No-op: schema is now managed by db.py init_schema()."""
|
||
|
|
pass
|
||
|
|
|
||
|
|
|
||
|
|
def _is_cached(db: DBConn, feed_id: str) -> bool:
    """Tell whether ``feed_id`` was fetched less than ``CACHE_TTL`` seconds ago.

    Reads ``fetched_at`` for the feed from ``rss_feed_cache``. A feed with no
    row counts as not cached.
    """
    cursor = db.execute(
        "SELECT fetched_at FROM rss_feed_cache WHERE feed_id = ?", (feed_id,)
    )
    hit = cursor.fetchone()
    if not hit:
        return False
    age = time.time() - hit["fetched_at"]
    return bool(age < CACHE_TTL)
|
||
|
|
|
||
|
|
|
||
|
|
def _mark_cached(db: DBConn, feed_id: str) -> None:
    """Record the current time as the last fetch of ``feed_id`` and commit.

    Writes to ``rss_feed_cache`` through ``db.upsert``; presumably that call
    takes (table, key column, column names, values) — confirm against db.py.
    """
    stamp = int(time.time())
    columns = ["feed_id", "fetched_at"]
    db.upsert("rss_feed_cache", "feed_id", columns, (feed_id, stamp))
    db.commit()
|
||
|
|
|
||
|
|
|
||
|
|
def _ns(prefix: str, tag: str) -> str:
    """Return ``tag`` qualified as ``{namespace-uri}tag`` for ElementTree lookups.

    ``prefix`` must be a key of ``NS``; an unknown prefix raises ``KeyError``.
    """
    uri = NS[prefix]
    return "{" + uri + "}" + tag
|
||
|
|
|
||
|
|
|
||
|
|
def _text(item: ET.Element, tag: str, ns_prefix: str | None = None) -> str:
|
||
|
|
el = item.find(_ns(ns_prefix, tag) if ns_prefix else tag)
|
||
|
|
return (el.text or "").strip() if el is not None else ""
|
||
|
|
|
||
|
|
|
||
|
|
def _strip_html(s: str) -> str:
|
||
|
|
s = re.sub(r"<[^>]+>", " ", s)
|
||
|
|
s = re.sub(r"&[a-z]+;", " ", s)
|
||
|
|
return re.sub(r"\s+", " ", s).strip()
|
||
|
|
|
||
|
|
|
||
|
|
def _make_slug(feed_id: str, url: str) -> str:
|
||
|
|
"""Lav et unikt slug fra feed-navn + URL-sti."""
|
||
|
|
path = url.split("?")[0].rstrip("/").split("/")[-1]
|
||
|
|
path = re.sub(r"^ECE\d+-", "", path) # Finans.dk ECE-id
|
||
|
|
path = re.sub(r"\.rss$|\.html$", "", path)
|
||
|
|
path = re.sub(r"[^a-z0-9\-]", "-", path.lower())[:55]
|
||
|
|
path = path.strip("-")
|
||
|
|
return f"{feed_id}-{path}" if path else f"{feed_id}-{abs(hash(url)) % 100000}"
|
||
|
|
|
||
|
|
|
||
|
|
def _parse_date(s: str) -> str:
|
||
|
|
if not s:
|
||
|
|
return datetime.now(timezone.utc).strftime("%Y-%m-%d")
|
||
|
|
try:
|
||
|
|
return parsedate_to_datetime(s).strftime("%Y-%m-%d")
|
||
|
|
except Exception:
|
||
|
|
return s[:10] if len(s) >= 10 else datetime.now(timezone.utc).strftime("%Y-%m-%d")
|
||
|
|
|
||
|
|
|
||
|
|
# ---------------------------------------------------------------------------
|
||
|
|
# Parse + upsert
|
||
|
|
# ---------------------------------------------------------------------------
|
||
|
|
|
||
|
|
def _parse_feed(feed_id: str, xml_text: str, weight: int) -> list[dict]:
    """Parse an RSS document into article dicts for ``_upsert``.

    Every ``<item>`` element in the document is considered. Items with
    neither a ``<link>`` nor a ``<guid>`` are skipped, because the slug is
    derived from that value.

    Args:
        feed_id: Feed identifier; used as the slug prefix and in the
            ``categories`` value (``rss:{feed_id}``).
        xml_text: Raw RSS XML.
        weight: Value stored as ``source_count`` on every article.

    Returns:
        One dict per usable item with the keys ``slug``, ``title``,
        ``description``, ``full_text``, ``start_date``, ``source_count``,
        ``categories``, ``first_seen`` and ``last_seen``.

    Raises:
        xml.etree.ElementTree.ParseError: If ``xml_text`` is not well-formed.
    """
    # NOTE(review): stdlib ElementTree is not hardened against maliciously
    # crafted XML; the configured feeds are assumed to be trusted.
    root = ET.fromstring(xml_text)

    # One timestamp for the whole parse, so first_seen and last_seen always
    # agree on a freshly parsed article.
    now = int(time.time())
    articles = []

    for item in root.findall(".//item"):
        link = _text(item, "link") or _text(item, "guid")
        if not link:
            continue

        title = _strip_html(_text(item, "title"))
        desc = _strip_html(_text(item, "description"))[:600]
        encoded = _strip_html(_text(item, "encoded", "content"))[:3000]
        pub = _text(item, "pubDate") or _text(item, "date", "dc")
        slug = _make_slug(feed_id, link)

        # Full text for NLP: title plus the richest body available
        # (content:encoded when present, otherwise the description).
        full_text = f"{title}. {encoded or desc}".strip()

        articles.append({
            "slug": slug,
            "title": title,
            "description": desc,
            "full_text": full_text,
            "start_date": _parse_date(pub),
            "source_count": weight,
            "categories": f"rss:{feed_id}",
            "first_seen": now,
            "last_seen": now,
        })

    return articles
|
||
|
|
|
||
|
|
|
||
|
|
def _upsert(db: DBConn, articles: list[dict]) -> int:
    """Store parsed articles and report how many were new.

    For each article, a row with a known slug only has ``last_seen``
    refreshed; an unknown slug is inserted into ``articles``. In both cases
    the article's full text is written to ``page_cache`` under the key
    ``rss:{slug}`` so analyze.py can fetch it in Phase 3. Everything is
    committed once at the end.

    Args:
        db: Database wrapper providing ``execute``, ``upsert`` and ``commit``.
        articles: Article dicts as produced by ``_parse_feed``.

    Returns:
        The number of articles that were inserted.
    """
    inserted = 0
    stamp = int(time.time())

    for a in articles:
        lookup = db.execute(
            "SELECT 1 FROM articles WHERE slug = ?", (a["slug"],)
        )
        already_known = lookup.fetchone()

        if not already_known:
            row = (
                a["slug"], a["title"], a["description"], a["start_date"],
                a["source_count"], a["categories"], a["first_seen"], a["last_seen"],
            )
            db.execute(
                """INSERT INTO articles
                   (slug, title, description, start_date,
                    source_count, categories, first_seen, last_seen)
                   VALUES (?, ?, ?, ?, ?, ?, ?, ?)""",
                row,
            )
            inserted += 1
        else:
            db.execute(
                "UPDATE articles SET last_seen = ? WHERE slug = ?",
                (stamp, a["slug"]),
            )

        # Keep the full text in page_cache so analyze.py can fetch it in Phase 3.
        db.upsert(
            "page_cache", "url",
            ["url", "page_type", "fetched_at", "content"],
            (f"rss:{a['slug']}", "rss", stamp, a["full_text"]),
        )

    db.commit()
    return inserted
|
||
|
|
|
||
|
|
|
||
|
|
# ---------------------------------------------------------------------------
|
||
|
|
# Hoved-funktion
|
||
|
|
# ---------------------------------------------------------------------------
|
||
|
|
|
||
|
|
def fetch_all_rss(db: DBConn, force: bool = False) -> int:
    """Fetch every feed in ``FEEDS`` and store its articles in the database.

    A feed whose cache entry is still fresh is skipped unless ``force`` is
    true. A failure while fetching, parsing or storing one feed is printed
    and does not stop the remaining feeds.

    Args:
        db: Database wrapper used for the cache check and all writes.
        force: When true, ignore the feed cache and fetch everything.

    Returns:
        The total number of newly inserted articles.
    """
    _ensure_rss_cache_table(db)
    added_total = 0
    request_headers = {"User-Agent": "MoneyMaker/1.0 RSS reader (+https://github.com)"}

    for feed_id, cfg in FEEDS.items():
        still_fresh = (not force) and _is_cached(db, feed_id)
        if still_fresh:
            print(f" 💾 {cfg['label']:<30} (cache)")
            continue

        try:
            response = httpx.get(
                cfg["url"],
                timeout=15,
                follow_redirects=True,
                headers=request_headers,
            )
            response.raise_for_status()
            articles = _parse_feed(feed_id, response.text, cfg["weight"])
            new = _upsert(db, articles)
            _mark_cached(db, feed_id)
            added_total += new
            print(f" 🌐 {cfg['label']:<30} {len(articles):2} artikler (+{new} nye)")
        except Exception as e:
            # Best effort: report the failing feed and carry on with the rest.
            print(f" ✗ {cfg['label']:<30} FEJL: {e}")

    return added_total
|
||
|
|
|
||
|
|
|
||
|
|
if __name__ == "__main__":
    # Manual run: fetch every feed with the cache bypassed and report how
    # many new articles were stored.
    db = get_db()
    try:
        print("[rss] Henter feeds …")
        n = fetch_all_rss(db, force=True)
        print(f"[rss] Færdig. {n} nye artikler.")
    finally:
        # Close the connection even if the fetch raises or is interrupted.
        db.close()
|