First commit

2026-05-26 22:21:27 +02:00
parent 2743a236b2
commit 05eed51e7d
90 changed files with 8690 additions and 0 deletions
--- a/ground_news.py
+++ b/ground_news.py
@@ -0,0 +1,559 @@
+"""
+ground_news.py — Ground News article fetcher + local DB store (SQLite or Postgres, via db.py)
+
+Key design:
+  - RSC payload trick: send  RSC: 1  header to get Next.js App Router data
+  - page_cache table: raw RSC payloads with TTL (pages still fresh are not re-fetched)
+  - articles table: all extracted fields, categories merged across pages
+  - fetch_article(slug)        — single article, rich data
+  - fetch_category(slug, db)   — all stories on an interest page (~15 stories)
+  - fetch_all(db)              — all known interest categories in parallel
+  - top_articles(db, n, days)  — query DB for top-N by source_count
+"""
+
+import re
+import json
+import time
+import sqlite3
+import httpx
+import concurrent.futures
+from pathlib import Path
+from db import get_conn, DBConn
+
+# ---------------------------------------------------------------------------
+# Config
+# ---------------------------------------------------------------------------
+
+# NOTE(review): DB_PATH is not referenced anywhere else in this module
+# (connections come from db.get_conn()) — presumably a leftover; confirm.
+DB_PATH   = Path(__file__).parent / "ground_news.db"
+# Site root; page URLs are built as BASE_URL + "/article/<slug>" or
+# BASE_URL + "/interest/<slug>".
+BASE_URL  = "https://ground.news"
+
+# Maximum age, in seconds, of a page_cache row before fetch_cached() re-fetches
+# it. Keyed by the page_type argument; unknown types fall back to 1800 s there.
+CACHE_TTL = {
+    "interest": 30 * 60,   # category pages: 30 min
+    "article":  6 * 60 * 60,  # single articles: 6 h
+}
+
+# Headers sent with every request made by fetch_cached().
+# "RSC: 1" is the header the module docstring relies on to get the Next.js
+# App Router payload. Next-Router-State-Tree is URL-encoded JSON; decoded it
+# reads  ["",{"children":["__PAGE__",{}]},null,null,true].
+HEADERS = {
+    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36",
+    "Accept": "text/html,application/xhtml+xml",
+    "RSC": "1",
+    "Next-Router-State-Tree": (
+        "%5B%22%22%2C%7B%22children%22%3A%5B%22__PAGE__%22%2C%7B%7D%5D%7D%2Cnull%2Cnull%2Ctrue%5D"
+    ),
+}
+
+# Interest slugs fetched by fetch_all() when no explicit list is given.
+# Maps the URL slug (as used in /interest/<slug>) to a display name.
+# First group: auto-discovered from the ground.news homepage 2026-05-24.
+# NOTE(review): the two later groups look hand-added and contain near-duplicate
+# slugs (pharma/pharmaceutical, denmark-economy/danish-economy) — presumably
+# guesses; verify each one resolves to a real interest page.
+KNOWN_INTERESTS: dict[str, str] = {
+    "europe":                       "Europe",
+    "europe-economy":               "Europe Economy",
+    "european-politics":            "European Politics",
+    "european-union":               "European Union",
+    "european-security-and-nato":   "European Security & NATO",
+    "uk-politics":                  "UK Politics",
+    "united-kingdom":               "United Kingdom",
+    "international":                "International",
+    "north-america":                "North America",
+    "south-america":                "South America",
+    "africa":                       "Africa",
+    "asia":                         "Asia",
+    "australia":                    "Australia",
+    "us-politics":                  "US Politics",
+    "united-states":                "United States",
+    "donald-trump":                 "Donald Trump",
+    "trump-administration":         "Trump Administration",
+    "israeli-palestinian-conflict": "Israeli-Palestinian Conflict",
+    "business-and-markets":         "Business & Markets",
+    "premier-league":               "Premier League",
+    "soccer":                       "Soccer",
+    "memorial-day":                 "Memorial Day",
+    # Financial / C25 relevant categories
+    "pharma":                       "Pharmaceuticals",
+    "energy":                       "Energy",
+    "renewable-energy":             "Renewable Energy",
+    "denmark":                      "Denmark",
+    "finance":                      "Finance",
+    "corporate":                    "Corporate",
+    "technology":                   "Technology",
+    "climate-change":               "Climate Change",
+    "shipping":                     "Shipping",
+    # Danish/Nordic specific, plus further sector and market topics
+    "biotech":                      "Biotech",
+    "healthcare":                   "Healthcare",
+    "pharmaceutical":               "Pharmaceutical",
+    "nordic":                       "Nordic",
+    "scandinavia":                  "Scandinavia",
+    "denmark-economy":              "Denmark Economy",
+    "danish-economy":               "Danish Economy",
+    "global-economy":               "Global Economy",
+    "global-markets":               "Global Markets",
+    "stock-market":                 "Stock Market",
+    "investing":                    "Investing",
+    "clean-energy":                 "Clean Energy",
+    "logistics":                    "Logistics",
+    "diabetes":                     "Diabetes",
+}
+
+# ---------------------------------------------------------------------------
+# Database
+# ---------------------------------------------------------------------------
+
+def get_db() -> DBConn:
+    """Return a DBConn wrapper (Postgres or SQLite). Schema is managed by db.py.
+
+    Thin alias for ``db.get_conn()``. The caller owns the returned connection
+    and is expected to ``close()`` it, as the functions in this module do.
+    """
+    return get_conn()
+
+
+# ---------------------------------------------------------------------------
+# HTTP fetch with cache
+# ---------------------------------------------------------------------------
+
+def fetch_cached(db: DBConn, url: str, page_type: str = "interest") -> tuple[str, bool]:
+    """Return ``(content, from_cache)`` for *url*, going through ``page_cache``.
+
+    A cached row is served while it is younger than ``CACHE_TTL[page_type]``
+    seconds (1800 s for page types not listed there). Otherwise the page is
+    requested with ``HEADERS``, written back to ``page_cache`` and committed.
+
+    Raises whatever ``httpx.get`` raises, plus the error from
+    ``raise_for_status()`` on an HTTP error response.
+    """
+    cached = db.execute(
+        "SELECT content, fetched_at FROM page_cache WHERE url=?", (url,)
+    ).fetchone()
+    max_age = CACHE_TTL.get(page_type, 1800)
+    timestamp = int(time.time())
+    if cached:
+        age = timestamp - cached["fetched_at"]
+        if age < max_age:
+            return cached["content"], True
+
+    # Cache miss or stale entry: fetch, then store under the timestamp taken above.
+    response = httpx.get(url, headers=HEADERS, follow_redirects=True, timeout=20)
+    response.raise_for_status()
+    body = response.text
+    db.upsert(
+        "page_cache", "url",
+        ["url", "page_type", "fetched_at", "content"],
+        (url, page_type, timestamp, body),
+    )
+    db.commit()
+    return body, False
+
+
+# ---------------------------------------------------------------------------
+# RSC payload parsers
+# ---------------------------------------------------------------------------
+
+# UUID v4 pattern (lowercase hex only; version nibble 4, variant nibble 8/9/a/b)
+_UUID = re.compile(r"[0-9a-f]{8}-[0-9a-f]{4}-4[0-9a-f]{3}-[89ab][0-9a-f]{3}-[0-9a-f]{12}")
+
+# blindspotData — has coverageProfileStatement + coverageProfileType before the numbers
+# Capture groups, in order: 1-3 = left / right / center percent,
+# 4-6 = left / right / center source count.
+# The skipped prefix is limited to 400 characters and cannot contain a "}".
+_BLIND = re.compile(
+    r'"blindspotData":\{[^}]{0,400}'   # skip coverageProfileStatement, coverageProfileType
+    r'"leftPercent":([\d.]+),"rightPercent":([\d.]+),"centerPercent":([\d.]+),'
+    r'"leftSrcCount":(\d+),"rightSrcCount":(\d+),"cntrSrcCount":(\d+)'
+)
+
+# Story anchor: start + title + slug + factuality (field order confirmed from RSC)
+# Capture groups: 1 = start timestamp, 2 = title (still JSON-escaped),
+# 3 = slug (at least 16 chars), 4 = raw body of the factuality object.
+# The title group is escape-aware, so a headline containing \" (a quotation)
+# still matches as a whole instead of the story being dropped.
+_STORY = re.compile(
+    r'"start":"(20\d{2}-[^"]+)",'
+    r'"title":"((?:[^"\\]|\\.){10,200})",'
+    r'"slug":"([a-z0-9][a-z0-9_-]{15,})",'
+    r'"factuality":\{([^}]+)\}'
+)
+
+# Escaped JSON string value: a double-quoted string whose body (group 1) may
+# contain backslash escapes, including \".
+# NOTE(review): not referenced by any function in this module — the parsers
+# inline the same sub-pattern instead; presumably kept for reuse. Confirm.
+_JSON_STR = re.compile(r'"((?:[^"\\]|\\.)*)"')
+
+
+def _decode(s: str) -> str:
+    """Decode the body of a JSON string literal (the text between the quotes).
+
+    Escapes such as ``\\"`` and ``\\uXXXX`` are resolved. If *s* is not a valid
+    string body (e.g. it contains an unescaped quote or ends in a lone
+    backslash) it is returned unchanged — decoding is best-effort.
+    """
+    try:
+        # strict=False: a raw control character (newline, tab) inside the
+        # captured text must not prevent the escapes around it from decoding.
+        return json.loads(f'"{s}"', strict=False)
+    except ValueError:  # json.JSONDecodeError is a ValueError subclass
+        return s
+
+
+def parse_stories(data: str, category: str) -> list[dict]:
+    """Build one dict per story anchor (``_STORY`` match) found in an RSC payload.
+
+    Fields that precede the anchor are looked up in a window of at most 8000
+    characters before it (narrower windows for the id, description and
+    interests); ``sourceCount`` is looked up in the 6000 characters after it.
+    *category* is copied unchanged into every result.
+    """
+    found = []
+    for anchor in _STORY.finditer(data):
+        start, title, slug, fact_raw = anchor.group(1, 2, 3, 4)
+        lo, hi = anchor.span()
+        preceding = data[max(0, lo - 8000):lo]
+        following = data[hi:hi + 6000]
+
+        # Story id: the last v4 UUID in the 4000 chars before the anchor.
+        candidates = _UUID.findall(preceding[-4000:])
+        story_id = candidates[-1] if candidates else None
+
+        # NOTE(review): the lookups below use re.search, i.e. the FIRST match
+        # in the window (the one farthest from the anchor), whereas the id
+        # above takes the last — confirm this asymmetry is intended.
+
+        # blindspotData percentages (already 0-100) and per-side source counts.
+        left_pct = right_pct = ctr_pct = None
+        left_cnt = right_cnt = ctr_cnt = None
+        blind = _BLIND.search(preceding)
+        if blind is not None:
+            left_pct, right_pct, ctr_pct = (float(g) for g in blind.group(1, 2, 3))
+            left_cnt, right_cnt, ctr_cnt = (int(g) for g in blind.group(4, 5, 6))
+
+        # biasSourceCount (0 when absent)
+        bsc_m = re.search(r'"biasSourceCount":(\d+)', preceding)
+        bias_src_count = 0 if bsc_m is None else int(bsc_m.group(1))
+
+        # overallBias score
+        ob_m = re.search(r'"overallBias":([-\d.]+)', preceding)
+        overall_bias = None if ob_m is None else float(ob_m.group(1))
+
+        # blindspot label
+        bs_m = re.search(r'"blindspot":"(left|right|center|none)"', preceding)
+        blindspot = None if bs_m is None else bs_m.group(1)
+
+        # description: JSON-escaped, searched only in the last 3000 chars
+        desc_m = re.search(r'"description":"((?:[^"\\]|\\.){0,600})"', preceding[-3000:])
+        description = None if desc_m is None else _decode(desc_m.group(1))
+
+        # sourceCount follows the anchor (0 when absent)
+        sc_m = re.search(r'"sourceCount":(\d+)', following)
+        source_count = 0 if sc_m is None else int(sc_m.group(1))
+
+        # factuality: every "name":integer pair inside the captured object body
+        factuality = {}
+        for key, count in re.findall(r'"(\w+)":(\d+)', fact_raw):
+            factuality[key] = int(count)
+
+        # interest UUIDs listed in the last 2000 chars before the anchor
+        int_m = re.search(r'"interests":\[([^\]]*)\]', preceding[-2000:])
+        interests = [] if int_m is None else _UUID.findall(int_m.group(1))
+
+        record = {
+            "slug":           slug,
+            "story_id":       story_id,
+            "title":          _decode(title),
+            "description":    description,
+            "start_date":     start[:10],
+            "source_count":   source_count,
+            "bias_src_count": bias_src_count,
+            "left_pct":       left_pct,
+            "ctr_pct":        ctr_pct,
+            "right_pct":      right_pct,
+            "left_src_count": left_cnt,
+            "ctr_src_count":  ctr_cnt,
+            "right_src_count":right_cnt,
+            "overall_bias":   overall_bias,
+            "blindspot":      blindspot,
+            "factuality":     factuality,
+            "interests":      interests,
+            "category":       category,
+        }
+        found.append(record)
+    return found
+
+
+def parse_single_article(data: str, slug: str) -> dict:
+    """Extract one article's fields from the RSC payload of its own page.
+
+    Richer than ``parse_stories``: also returns ``bias_breakdown``, a dict of
+    ``{side: {"sources": int, "percent": int}}`` for the sides that are found.
+    Every scalar field is the first match in *data*, or None when absent.
+    ``start_date`` is the full captured "start" string here (``parse_stories``
+    keeps only its first 10 characters).
+    """
+    def get(pattern, cast=str):
+        # First match of `pattern` in the payload converted with `cast`;
+        # None when nothing matches or the captured text does not convert.
+        m = re.search(pattern, data)
+        if m is None:
+            return None
+        try:
+            return cast(m.group(1))
+        except ValueError:
+            return None
+
+    # story_id: an "id" followed, within 200 non-"}" characters, by this slug;
+    # otherwise the first "id" in the payload.
+    id_m = re.search(r'"id":"([0-9a-f-]{36})"[^}]{0,200}"slug":"' + re.escape(slug), data, re.DOTALL)
+    story_id = id_m.group(1) if id_m else get(r'"id":"([0-9a-f-]{36})"')
+
+    # Title — must come before wireStoryRefs; otherwise the first title found.
+    # The pattern is escape-aware so a headline containing \" is not cut off
+    # at the quotation, and both paths decode the JSON escapes.
+    title_pat = r'"title":"((?:[^"\\]|\\.){10,200})"'
+    title_m = re.search(title_pat + r'[^}]{0,100}"wireStoryRefs"', data, re.DOTALL)
+    title = _decode(title_m.group(1)) if title_m else get(title_pat, _decode)
+
+    # blindspotData
+    blind = _BLIND.search(data)
+
+    # Bias side breakdown: first sourceCount / percent after each side's "id".
+    bias_breakdown = {}
+    for side in ("left", "center", "right"):
+        bm = re.search(
+            rf'"id":"{side}".*?"sourceCount":(\d+).*?"percent":(\d+)',
+            data, re.DOTALL
+        )
+        if bm:
+            bias_breakdown[side] = {"sources": int(bm.group(1)), "percent": int(bm.group(2))}
+
+    # factuality: every "name":integer pair inside the first factuality object
+    fm = re.search(r'"factuality":\{([^}]+)\}', data)
+    factuality = {k: int(v) for k, v in re.findall(r'"(\w+)":(\d+)', fm.group(1))} if fm else {}
+
+    desc_m = re.search(r'"description":"((?:[^"\\]|\\.){20,600})"', data)
+
+    return {
+        "slug":            slug,
+        "story_id":        story_id,
+        "title":           title,
+        "description":     _decode(desc_m.group(1)) if desc_m else None,
+        "start_date":      get(r'"start":"(20\d{2}-[^"]+)"'),
+        "source_count":    get(r'"sourceCount":(\d+)', int),
+        "bias_src_count":  get(r'"biasSourceCount":(\d+)', int),
+        "overall_bias":    get(r'"overallBias":([-\d.]+)', float),
+        "blindspot":       get(r'"blindspot":"(left|right|center|none)"'),
+        "left_pct":        float(blind.group(1)) if blind else None,
+        "right_pct":       float(blind.group(2)) if blind else None,
+        "ctr_pct":         float(blind.group(3)) if blind else None,
+        "left_src_count":  int(blind.group(4)) if blind else None,
+        "right_src_count": int(blind.group(5)) if blind else None,
+        "ctr_src_count":   int(blind.group(6)) if blind else None,
+        "factuality":      factuality,
+        "bias_breakdown":  bias_breakdown,
+    }
+
+
+# ---------------------------------------------------------------------------
+# DB upsert
+# ---------------------------------------------------------------------------
+
+def upsert_articles(db: DBConn, stories: list[dict]) -> int:
+    """Write parsed stories to ``articles`` and return how many were new.
+
+    A story whose slug is not yet stored is inserted with its single category.
+    For a known slug the counts, percentages, bias fields and ``last_seen`` are
+    overwritten, the story's category is merged into the stored comma-separated
+    list, and ``story_id`` / ``description`` are filled in only while still
+    NULL. Commits once after all stories are processed.
+    """
+    timestamp = int(time.time())
+    inserted = 0
+    for story in stories:
+        existing = db.execute(
+            "SELECT categories, first_seen FROM articles WHERE slug=?", (story["slug"],)
+        ).fetchone()
+
+        if not existing:
+            db.execute(
+                """INSERT INTO articles
+                   (slug, story_id, title, description, start_date,
+                    source_count, bias_src_count,
+                    left_pct, ctr_pct, right_pct,
+                    left_src_count, ctr_src_count, right_src_count,
+                    overall_bias, blindspot,
+                    factuality_json, interests_json,
+                    categories, first_seen, last_seen)
+                   VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)""",
+                (story["slug"], story["story_id"], story["title"],
+                 story["description"], story["start_date"],
+                 story["source_count"], story["bias_src_count"],
+                 story["left_pct"], story["ctr_pct"], story["right_pct"],
+                 story["left_src_count"], story["ctr_src_count"], story["right_src_count"],
+                 story["overall_bias"], story["blindspot"],
+                 json.dumps(story["factuality"]), json.dumps(story["interests"]),
+                 story["category"], timestamp, timestamp),
+            )
+            inserted += 1
+        else:
+            # Merge this page's category into the stored list (sorted, no blanks).
+            merged = {c for c in (existing["categories"] or "").split(",") if c}
+            merged.add(story["category"])
+            db.execute(
+                """UPDATE articles SET
+                       story_id=COALESCE(story_id, ?),
+                       source_count=?, bias_src_count=?,
+                       left_pct=?, ctr_pct=?, right_pct=?,
+                       left_src_count=?, ctr_src_count=?, right_src_count=?,
+                       overall_bias=?, blindspot=?,
+                       description=COALESCE(description, ?),
+                       categories=?, last_seen=?
+                   WHERE slug=?""",
+                (story["story_id"],
+                 story["source_count"], story["bias_src_count"],
+                 story["left_pct"], story["ctr_pct"], story["right_pct"],
+                 story["left_src_count"], story["ctr_src_count"], story["right_src_count"],
+                 story["overall_bias"], story["blindspot"],
+                 story["description"],
+                 ",".join(sorted(merged)), timestamp,
+                 story["slug"]),
+            )
+    db.commit()
+    return inserted
+
+
+# ---------------------------------------------------------------------------
+# Public API
+# ---------------------------------------------------------------------------
+
+def fetch_article_text(slug: str, db: DBConn | None = None) -> str:
+    """
+    Fetch full article RSC payload and return a clean text blob for NLP.
+    Extracts: main title + description + all source article headlines.
+
+    Every "title", "description"/"excerpt"/"summary" and "headline" string in
+    the payload is collected in that order; strings of 20 characters or fewer
+    and exact duplicates are skipped, as are titles and descriptions that look
+    like photo credits. The pieces are joined with single spaces.
+
+    When *db* is None a connection is opened for the call and always closed
+    again, even if the fetch raises.
+    """
+    own_db = db is None
+    if own_db:
+        db = get_db()
+    url  = f"{BASE_URL}/article/{slug}"
+    try:
+        data, _ = fetch_cached(db, url, "article")
+    finally:
+        # Without this a failed HTTP request would leak the connection.
+        if own_db:
+            db.close()
+
+    parts: list[str] = []
+    seen: set[str] = set()
+
+    def add(text: str) -> None:
+        # Keep only reasonably long strings, first occurrence only.
+        if text and len(text) > 20 and text not in seen:
+            seen.add(text)
+            parts.append(text)
+
+    # Main title
+    for m in re.finditer(r'"title":"((?:[^"\\]|\\.){10,300})"', data):
+        t = _decode(m.group(1))
+        if not re.search(r'Getty|AFP|\/AFP|PHOTO-TAG', t, re.I):
+            add(t)
+
+    # Descriptions / excerpts
+    for pattern in [
+        r'"description":"((?:[^"\\]|\\.){20,600})"',
+        r'"excerpt":"((?:[^"\\]|\\.){20,400})"',
+        r'"summary":"((?:[^"\\]|\\.){20,400})"',
+    ]:
+        for m in re.finditer(pattern, data):
+            t = _decode(m.group(1))
+            if not re.search(r'Getty|AFP|PHOTO-TAG|Author:', t, re.I):
+                add(t)
+
+    # Wire story / source article headlines
+    for m in re.finditer(r'"headline":"((?:[^"\\]|\\.){10,300})"', data):
+        add(_decode(m.group(1)))
+
+    return " ".join(parts)
+
+
+def fetch_article(slug: str, db: DBConn | None = None) -> dict:
+    """Fetch a single article page and return its parsed fields.
+
+    The raw page goes through ``fetch_cached`` (page_cache, "article" TTL);
+    the parsed result is returned only — it is not written to ``articles``.
+    When *db* is None a connection is opened for the call and always closed
+    again, even if fetching or parsing raises.
+    """
+    own_db = db is None
+    if own_db:
+        db = get_db()
+    url  = f"{BASE_URL}/article/{slug}"
+    try:
+        data, _ = fetch_cached(db, url, "article")
+        return parse_single_article(data, slug)
+    finally:
+        if own_db:
+            db.close()
+
+
+def _http_fetch_category(
+    category_slug: str,
+    *,
+    force: bool = False,
+) -> tuple[str, list[dict], bool]:
+    """
+    Fetch and parse one category page; does not touch the ``articles`` table.
+    Uses a per-thread DB connection (psycopg2 connections are not thread-safe)
+    for the page cache only, and always closes it — also when the request
+    fails, so failing workers in ``fetch_all`` do not leak connections.
+    With ``force=True`` the cached page is deleted first so it is re-fetched.
+    Returns (slug, stories, from_cache).
+    """
+    db = get_conn()
+    try:
+        url = f"{BASE_URL}/interest/{category_slug}"
+        if force:
+            db.execute("DELETE FROM page_cache WHERE url=?", (url,))
+            db.commit()
+        data, from_cache = fetch_cached(db, url, "interest")
+    finally:
+        db.close()
+    stories = parse_stories(data, category_slug)
+    return category_slug, stories, from_cache
+
+
+def fetch_category(
+    category_slug: str,
+    db: DBConn,
+    *,
+    force: bool = False,
+) -> tuple[list[dict], bool]:
+    """
+    Fetch an interest category page and upsert its stories into ``articles``.
+
+    The page is fetched (and cached) by ``_http_fetch_category``, which opens
+    its own connection via ``get_conn()``; *db* is used only for the upsert.
+    ``force`` is passed through to ``_http_fetch_category``.
+
+    Returns (stories, from_cache).
+    """
+    _, stories, from_cache = _http_fetch_category(category_slug, force=force)
+    upsert_articles(db, stories)
+    return stories, from_cache
+
+
+def fetch_all(
+    db: DBConn,
+    slugs: list[str] | None = None,
+    *,
+    force: bool = False,
+    workers: int = 12,
+) -> dict[str, list[dict]]:
+    """
+    Fetch the given interest categories (all of KNOWN_INTERESTS when *slugs*
+    is None or empty) on a thread pool of *workers* threads, upserting each
+    result into *db* from the calling thread as it completes.
+
+    One status line is printed per category. A category whose fetch or upsert
+    raises is reported and mapped to an empty list instead of aborting the run.
+    Returns {slug: [story, ...]} mapping.
+    """
+    wanted = slugs or list(KNOWN_INTERESTS)
+    collected: dict[str, list[dict]] = {}
+
+    with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as pool:
+        pending = {}
+        for name in wanted:
+            pending[pool.submit(_http_fetch_category, name, force=force)] = name
+
+        for done in concurrent.futures.as_completed(pending):
+            name = pending[done]
+            try:
+                _slug, stories, cached = done.result()
+                upsert_articles(db, stories)   # DB write in main thread
+                collected[name] = stories
+                marker = "💾" if cached else "🌐"
+                print(f"  {marker} {name:<38} {len(stories):2} stories")
+            except Exception as err:
+                print(f"  ✗ {name:<38} ERROR: {err}")
+                collected[name] = []
+
+    return collected
+
+
+def top_articles(
+    db: DBConn,
+    limit: int = 30,
+    days: int | None = 2,
+    min_sources: int = 0,
+) -> list[sqlite3.Row]:
+    """Query DB for top articles by source_count.
+
+    Returns at most *limit* rows with ``source_count >= min_sources``, highest
+    source_count first. Unless *days* is None, only articles whose
+    ``start_date`` is on or after (today in UTC minus *days* days) are included.
+
+    The cutoff date is computed here and bound as an ordinary parameter rather
+    than with SQLite's ``date('now', ?)``, so the query does not depend on a
+    SQLite-only function (get_db() may return a Postgres connection).
+    """
+    from datetime import datetime, timedelta, timezone
+
+    where = "WHERE source_count >= ?"
+    params: list = [min_sources]
+    if days is not None:
+        # Same value SQLite's date('now', '-N days') yields: a UTC YYYY-MM-DD.
+        cutoff = datetime.now(timezone.utc).date() - timedelta(days=days)
+        where += " AND start_date >= ?"
+        params.append(cutoff.isoformat())
+    return db.execute(
+        f"SELECT * FROM articles {where} ORDER BY source_count DESC LIMIT ?",
+        (*params, limit),
+    ).fetchall()
+
+
+# ---------------------------------------------------------------------------
+# Display
+# ---------------------------------------------------------------------------
+
+def print_top(rows: list[sqlite3.Row], header: str = "Top artikler") -> None:
+    """Print a numbered, human-readable listing of article rows to stdout.
+
+    A banner with *header* and the row count is followed, per article, by a
+    stats line (source count, optional left/center/right split, overall bias
+    and blindspot), the title cut to 80 characters, the description cut to 90
+    characters when present, the categories, and the article path.
+    """
+    rule = "=" * 76
+    print(f"\n{rule}")
+    print(f"  {header}  ({len(rows)} artikler)")
+    print(f"{rule}\n")
+    for rank, art in enumerate(rows, 1):
+        if art["left_pct"] is None:
+            split = ""
+        else:
+            split = f"  L{art['left_pct']:.0f}% C{art['ctr_pct']:.0f}% R{art['right_pct']:.0f}%"
+        categories = " · ".join((art["categories"] or "").split(","))
+        lean = "" if art["overall_bias"] is None else f"  bias={art['overall_bias']:+.2f}"
+        gap = f"  blindspot={art['blindspot']}" if art["blindspot"] else ""
+        print(f"{rank:2}. [{art['source_count']:4} src{split}{lean}{gap}] [{art['start_date']}]")
+        print(f"    {art['title'][:80]}")
+        summary = art["description"]
+        if summary:
+            print(f"    {summary[:90]}")
+        print(f"    [{categories}]")
+        print(f"    /article/{art['slug']}")
+        print()
+
+
+# ---------------------------------------------------------------------------
+# CLI
+# ---------------------------------------------------------------------------
+
+if __name__ == "__main__":
+    import sys
+
+    # Usage:
+    #   ground_news.py article <slug>    fetch one article, dump the parsed JSON
+    #   ground_news.py category <slug>   fetch one interest page, list its stories
+    #   ground_news.py [--force]         fetch all categories, print the top 30
+    command = sys.argv[1] if len(sys.argv) >= 2 else None
+    if command in ("article", "category") and len(sys.argv) < 3:
+        # Previously a missing slug surfaced as an IndexError traceback.
+        sys.exit(f"usage: {sys.argv[0]} {command} <slug>")
+
+    db = get_db()
+    try:
+        if command == "article":
+            slug = sys.argv[2]
+            url  = f"{BASE_URL}/article/{slug}"
+            data, cached = fetch_cached(db, url, "article")
+            result = parse_single_article(data, slug)
+            print(f"({'cached' if cached else 'fetched'})")
+            print(json.dumps(result, indent=2, ensure_ascii=False))
+
+        elif command == "category":
+            slug = sys.argv[2]
+            stories, cached = fetch_category(slug, db)
+            print(f"({'cached' if cached else 'fetched'})  {len(stories)} stories\n")
+            for s in sorted(stories, key=lambda x: x["source_count"], reverse=True):
+                print(f"  [{s['source_count']:4} src] {s['title'][:70]}")
+
+        else:
+            force = "--force" in sys.argv
+            days  = 3
+            print(f"Fetching all {len(KNOWN_INTERESTS)} categories (force={force})…\n")
+            fetch_all(db, force=force)
+            rows = top_articles(db, limit=30, days=days)
+            print_top(rows, f"Top 30 – seneste {days} dage")
+    finally:
+        # Close the connection even when a command fails.
+        db.close()