First commit
This commit is contained in:
244
rss_feeds.py
Normal file
244
rss_feeds.py
Normal file
@@ -0,0 +1,244 @@
|
||||
"""
|
||||
rss_feeds.py — Danske finansielle RSS feeds til MoneyMaker
|
||||
|
||||
Feeds:
|
||||
Børsen https://borsen.dk/rss
|
||||
Finans.dk top https://feeds.finans.dk/topnyheder
|
||||
Politiken øko https://politiken.dk/rss/oekonomi.rss
|
||||
|
||||
Artikler gemmes i samme `articles` tabel som Ground News.
|
||||
`source_count` sættes til feedets kredibilitets-vægt (ikke antal medier,
|
||||
men et indikativt tal der giver coverage_spread > 0 i pipeline).
|
||||
|
||||
Full text (title + description + content:encoded) gemmes i `page_cache`
|
||||
med url-nøgle `rss:{slug}` så analyze.py kan hente det i Phase 3.
|
||||
"""
|
||||
|
||||
import re
|
||||
import time
|
||||
import sqlite3
|
||||
import xml.etree.ElementTree as ET
|
||||
from email.utils import parsedate_to_datetime
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
|
||||
import httpx
|
||||
from db import get_conn, DBConn
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Feed-katalog — tilføj nye her
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Feed catalogue keyed by feed id. Each entry holds the feed URL, a display
# label and a credibility weight (passed on as the articles' `source_count`).
FEEDS: dict[str, dict] = {
    "borsen": dict(
        url="https://borsen.dk/rss",
        label="Børsen",
        weight=8,  # leading Danish business outlet
    ),
    "finans-top": dict(
        url="https://feeds.finans.dk/topnyheder",
        label="Finans.dk (top)",
        weight=7,
    ),
    "finans-seneste": dict(
        url="https://feeds.finans.dk/seneste",
        label="Finans.dk (seneste)",
        weight=6,
    ),
    "politiken-oekonomi": dict(
        url="https://politiken.dk/rss/oekonomi.rss",
        label="Politiken økonomi",
        weight=6,
    ),
    # Berlingske: add the URL once it has been found
    # "berlingske-erhverv": dict(
    #     url="https://...",
    #     label="Berlingske erhverv",
    #     weight=7,
    # ),
}

# Seconds a fetched feed is considered fresh (same TTL as Ground News).
CACHE_TTL = 60 * 30

# Namespace URIs for the prefixed RSS extension elements read by this module.
NS = dict(
    content="http://purl.org/rss/1.0/modules/content/",
    dc="http://purl.org/dc/elements/1.1/",
)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Hjælpefunktioner
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def get_db() -> DBConn:
    """Open a database connection through ``db.get_conn``.

    The schema is managed by db.py, so nothing is created here.
    """
    conn = get_conn()
    return conn
|
||||
|
||||
|
||||
def _ensure_rss_cache_table(db: DBConn) -> None:
    """Do nothing; the schema is now managed by db.py ``init_schema()``.

    Still called at the start of ``fetch_all_rss``.
    """
    return None
|
||||
|
||||
|
||||
def _is_cached(db: DBConn, feed_id: str) -> bool:
    """Tell whether *feed_id* was fetched less than ``CACHE_TTL`` seconds ago.

    Returns False when the feed has no row in ``rss_feed_cache``.
    """
    cached = db.execute(
        "SELECT fetched_at FROM rss_feed_cache WHERE feed_id = ?", (feed_id,)
    ).fetchone()
    if not cached:
        return False
    age = time.time() - cached["fetched_at"]
    return age < CACHE_TTL
|
||||
|
||||
|
||||
def _mark_cached(db: DBConn, feed_id: str) -> None:
    """Store the current time as the last fetch of *feed_id* and commit."""
    fetched_at = int(time.time())
    columns = ["feed_id", "fetched_at"]
    db.upsert("rss_feed_cache", "feed_id", columns, (feed_id, fetched_at))
    db.commit()
|
||||
|
||||
|
||||
def _ns(prefix: str, tag: str) -> str:
    """Build the ``{namespace-uri}tag`` name ElementTree uses for *prefix*:*tag*.

    Raises:
        KeyError: if *prefix* is not a key of ``NS``.
    """
    uri = NS[prefix]
    return f"{{{uri}}}{tag}"
|
||||
|
||||
|
||||
def _text(item: ET.Element, tag: str, ns_prefix: str | None = None) -> str:
|
||||
el = item.find(_ns(ns_prefix, tag) if ns_prefix else tag)
|
||||
return (el.text or "").strip() if el is not None else ""
|
||||
|
||||
|
||||
def _strip_html(s: str) -> str:
    """Reduce an HTML fragment to plain, single-spaced text.

    Tags are replaced by a space, character references are decoded to the
    characters they stand for, and every run of whitespace collapses to one
    space. Leading and trailing whitespace is removed.

    Decoding matters for Danish feeds: blanking entities out turned
    ``B&oslash;rsen`` into ``B rsen`` and dropped ``&amp;`` entirely, while
    numeric references such as ``&#248;`` were left in the text untouched.
    """
    import html  # local import keeps this block self-contained

    without_tags = re.sub(r"<[^>]+>", " ", s)
    # Decode after removing tags so an escaped "&lt;" is never taken for markup.
    decoded = html.unescape(without_tags)
    # \s also matches the non-breaking space that &nbsp; decodes to.
    return re.sub(r"\s+", " ", decoded).strip()
|
||||
|
||||
|
||||
def _make_slug(feed_id: str, url: str) -> str:
    """Build an article slug from the feed id and the URL's last path segment.

    The query string and trailing slashes are dropped, a leading Finans.dk
    ``ECE<digits>-`` id and a trailing ``.rss``/``.html`` are removed, and the
    rest is lower-cased with every character outside ``[a-z0-9-]`` turned into
    ``-`` (truncated to 55 characters, outer dashes stripped).

    If nothing usable remains, the slug falls back to a checksum of the URL.
    The checksum is CRC-32 rather than the built-in ``hash()``: string hashes
    are salted per interpreter process, so ``hash(url)`` gave the same article
    a different slug on every run.

    Returns:
        ``"{feed_id}-{segment}"``, or ``"{feed_id}-{number}"`` for the fallback.
    """
    import zlib  # local import: only the fallback branch needs it

    path = url.split("?")[0].rstrip("/").split("/")[-1]
    path = re.sub(r"^ECE\d+-", "", path)  # Finans.dk ECE id
    path = re.sub(r"\.rss$|\.html$", "", path)
    path = re.sub(r"[^a-z0-9\-]", "-", path.lower())[:55]
    path = path.strip("-")
    if path:
        return f"{feed_id}-{path}"
    # Stable across runs, unlike hash(); keeps the original "% 100000" shape.
    checksum = zlib.crc32(url.encode("utf-8", "replace")) % 100000
    return f"{feed_id}-{checksum}"
|
||||
|
||||
|
||||
def _parse_date(s: str) -> str:
    """Normalise a feed date string to ``YYYY-MM-DD``.

    Tries, in order:
      1. an RFC 2822 date as used by RSS ``pubDate``;
      2. a string starting with an ISO ``YYYY-MM-DD`` date (e.g. ``dc:date``);
      3. today's date in UTC when the input is empty or unparseable.

    The ISO prefix is validated, so arbitrary text is no longer returned as
    if it were a date.
    """
    today = datetime.now(timezone.utc).strftime("%Y-%m-%d")
    s = (s or "").strip()
    if not s:
        return today

    try:
        return parsedate_to_datetime(s).strftime("%Y-%m-%d")
    except (TypeError, ValueError, IndexError, OverflowError):
        # Not RFC 2822 (older Pythons raise TypeError here); try ISO next.
        pass

    try:
        return datetime.strptime(s[:10], "%Y-%m-%d").strftime("%Y-%m-%d")
    except ValueError:
        return today
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Parse + upsert
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _parse_feed(feed_id: str, xml_text: str, weight: int) -> list[dict]:
    """Parse RSS XML into article dicts ready for ``_upsert``.

    Every ``<item>`` with a ``link`` (or, failing that, a ``guid``) becomes one
    dict; items with neither are skipped.

    Args:
        feed_id: Feed id, used in the slug and in ``categories`` (``rss:{feed_id}``).
        xml_text: The raw feed document.
        weight: Credibility weight, stored as ``source_count``.

    Returns:
        A list of dicts with the keys ``slug``, ``title``, ``description``,
        ``full_text``, ``start_date``, ``source_count``, ``categories``,
        ``first_seen`` and ``last_seen``.

    Raises:
        xml.etree.ElementTree.ParseError: if *xml_text* is not well-formed XML.
    """
    # NOTE(review): ElementTree is not hardened against maliciously crafted
    # XML; as called from fetch_all_rss the input is a downloaded feed body.
    root = ET.fromstring(xml_text)

    # One timestamp for the whole batch so first_seen and last_seen always agree.
    now = int(time.time())
    articles = []

    for item in root.findall(".//item"):
        link = _text(item, "link") or _text(item, "guid")
        if not link:
            continue

        title = _strip_html(_text(item, "title"))
        desc = _strip_html(_text(item, "description"))[:600]
        encoded = _strip_html(_text(item, "encoded", "content"))[:3000]
        pub = _text(item, "pubDate") or _text(item, "date", "dc")

        # Full text for NLP: prefer content:encoded, else the description.
        body = encoded or desc
        # Without a title, use the body alone rather than a leading ". ".
        full_text = f"{title}. {body}".strip() if title else body

        articles.append({
            "slug": _make_slug(feed_id, link),
            "title": title,
            "description": desc,
            "full_text": full_text,
            "start_date": _parse_date(pub),
            "source_count": weight,
            "categories": f"rss:{feed_id}",
            "first_seen": now,
            "last_seen": now,
        })

    return articles
|
||||
|
||||
|
||||
def _upsert(db: DBConn, articles: list[dict]) -> int:
    """Write *articles* to the database and return how many were new.

    An article whose slug already exists only gets its ``last_seen`` bumped;
    an unknown slug is inserted. In both cases the full text is upserted into
    ``page_cache`` under ``rss:{slug}`` so analyze.py can fetch it in Phase 3.
    All writes are committed once, after the loop.
    """
    stamp = int(time.time())
    inserted = 0

    for art in articles:
        slug = art["slug"]
        known = db.execute(
            "SELECT 1 FROM articles WHERE slug = ?", (slug,)
        ).fetchone()

        if not known:
            db.execute(
                """INSERT INTO articles
                (slug, title, description, start_date,
                source_count, categories, first_seen, last_seen)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?)""",
                (
                    slug,
                    art["title"],
                    art["description"],
                    art["start_date"],
                    art["source_count"],
                    art["categories"],
                    art["first_seen"],
                    art["last_seen"],
                ),
            )
            inserted += 1
        else:
            db.execute(
                "UPDATE articles SET last_seen = ? WHERE slug = ?",
                (stamp, slug),
            )

        # Cache the full text for both new and already-known articles.
        db.upsert(
            "page_cache",
            "url",
            ["url", "page_type", "fetched_at", "content"],
            (f"rss:{slug}", "rss", stamp, art["full_text"]),
        )

    db.commit()
    return inserted
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Hoved-funktion
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def fetch_all_rss(db: DBConn, force: bool = False) -> int:
    """Fetch every feed in ``FEEDS`` and store its articles in the database.

    A feed still within its cache TTL is skipped unless *force* is true. A
    failure while fetching, parsing or storing one feed is printed and does
    not stop the remaining feeds.

    Returns:
        The number of newly inserted articles across all feeds.
    """
    _ensure_rss_cache_table(db)
    request_headers = {"User-Agent": "MoneyMaker/1.0 RSS reader (+https://github.com)"}
    added = 0

    for feed_id in FEEDS:
        cfg = FEEDS[feed_id]

        if force or not _is_cached(db, feed_id):
            try:
                resp = httpx.get(
                    cfg["url"],
                    timeout=15,
                    follow_redirects=True,
                    headers=request_headers,
                )
                resp.raise_for_status()
                articles = _parse_feed(feed_id, resp.text, cfg["weight"])
                fresh = _upsert(db, articles)
                _mark_cached(db, feed_id)
                added += fresh
                print(f" 🌐 {cfg['label']:<30} {len(articles):2} artikler (+{fresh} nye)")
            except Exception as err:
                # Per-feed boundary: report the failure and move on.
                print(f" ✗ {cfg['label']:<30} FEJL: {err}")
        else:
            print(f" 💾 {cfg['label']:<30} (cache)")

    return added
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: refresh every feed, bypassing the TTL cache.
    db = get_db()
    try:
        print("[rss] Henter feeds …")
        n = fetch_all_rss(db, force=True)
        print(f"[rss] Færdig. {n} nye artikler.")
    finally:
        # Close the connection even if the run raises.
        db.close()
|
||||
Reference in New Issue
Block a user