Files
mmd/ground_news.py

560 lines
21 KiB
Python
Raw Normal View History

2026-05-26 22:21:27 +02:00
"""
ground_news.py Ground News article fetcher + local SQLite store
Key design:
- RSC payload trick: send RSC: 1 header to get Next.js App Router data
- page_cache table: raw RSC payloads with TTL (don't re-fetch fresh pages)
- articles table: all extracted fields, categories merged across pages
- fetch_article(slug) single article, rich data
- fetch_category(slug) all stories on an interest page (~15 stories)
- fetch_all() all known interest categories in parallel
- top_articles(n, days) query DB for top-N by source_count
"""
import re
import json
import time
import sqlite3
import httpx
import concurrent.futures
from pathlib import Path
from db import get_conn, DBConn
# ---------------------------------------------------------------------------
# Config
# ---------------------------------------------------------------------------
# Path of a SQLite database file located next to this module.
# NOTE(review): nothing in the visible part of this file reads DB_PATH;
# connections are obtained through db.get_conn() — confirm it is still needed.
DB_PATH = Path(__file__).parent / "ground_news.db"
# Site root; article and interest page URLs are built by appending a path.
BASE_URL = "https://ground.news"
# Maximum age, in seconds, of a page_cache row per page type. fetch_cached()
# re-downloads a page once its cached copy is older than this.
CACHE_TTL = {
    "interest": 30 * 60,  # category pages: 30 min
    "article": 6 * 60 * 60,  # single articles: 6 h
}
# Request headers sent with every page download in fetch_cached().
HEADERS = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36",
    "Accept": "text/html,application/xhtml+xml",
    # Asks the server for the Next.js App Router (RSC) payload.
    "RSC": "1",
    # URL-encoded JSON: ["",{"children":["__PAGE__",{}]},null,null,true]
    "Next-Router-State-Tree": (
        "%5B%22%22%2C%7B%22children%22%3A%5B%22__PAGE__%22%2C%7B%7D%5D%7D%2Cnull%2Cnull%2Ctrue%5D"
    ),
}
# All known interest slugs (auto-discovered from ground.news homepage 2026-05-24)
# Maps an interest slug (used as the path segment in /interest/<slug>) to a
# display label. fetch_all() iterates the keys when no explicit slug list is
# given; the labels are not read anywhere in the visible part of this file.
# NOTE(review): several entries look like near-duplicates (e.g. "pharma" /
# "pharmaceutical", "denmark-economy" / "danish-economy") — presumably
# candidate slugs; verify which ones actually exist on the site.
KNOWN_INTERESTS: dict[str, str] = {
    "europe": "Europe",
    "europe-economy": "Europe Economy",
    "european-politics": "European Politics",
    "european-union": "European Union",
    "european-security-and-nato": "European Security & NATO",
    "uk-politics": "UK Politics",
    "united-kingdom": "United Kingdom",
    "international": "International",
    "north-america": "North America",
    "south-america": "South America",
    "africa": "Africa",
    "asia": "Asia",
    "australia": "Australia",
    "us-politics": "US Politics",
    "united-states": "United States",
    "donald-trump": "Donald Trump",
    "trump-administration": "Trump Administration",
    "israeli-palestinian-conflict": "Israeli-Palestinian Conflict",
    "business-and-markets": "Business & Markets",
    "premier-league": "Premier League",
    "soccer": "Soccer",
    "memorial-day": "Memorial Day",
    # Financial / C25 relevant categories
    "pharma": "Pharmaceuticals",
    "energy": "Energy",
    "renewable-energy": "Renewable Energy",
    "denmark": "Denmark",
    "finance": "Finance",
    "corporate": "Corporate",
    "technology": "Technology",
    "climate-change": "Climate Change",
    "shipping": "Shipping",
    # Danish/Nordic specific
    "biotech": "Biotech",
    "healthcare": "Healthcare",
    "pharmaceutical": "Pharmaceutical",
    "nordic": "Nordic",
    "scandinavia": "Scandinavia",
    "denmark-economy": "Denmark Economy",
    "danish-economy": "Danish Economy",
    "global-economy": "Global Economy",
    "global-markets": "Global Markets",
    "stock-market": "Stock Market",
    "investing": "Investing",
    "clean-energy": "Clean Energy",
    "logistics": "Logistics",
    "diabetes": "Diabetes",
}
# ---------------------------------------------------------------------------
# Database
# ---------------------------------------------------------------------------
def get_db() -> DBConn:
    """Open a database connection via db.get_conn().

    The returned DBConn wrapper may be backed by Postgres or SQLite; the
    schema itself is managed by db.py, not by this module.
    """
    connection = get_conn()
    return connection
# ---------------------------------------------------------------------------
# HTTP fetch with cache
# ---------------------------------------------------------------------------
def fetch_cached(db: DBConn, url: str, page_type: str = "interest") -> tuple[str, bool]:
    """Return the body of *url*, served from page_cache while still fresh.

    Args:
        db: Open connection holding the page_cache table.
        url: Absolute URL to fetch.
        page_type: Key into CACHE_TTL selecting the maximum cache age;
            unknown types fall back to 1800 seconds.

    Returns:
        (content, from_cache): the response text and True when it came from
        the cache, False when it was downloaded just now.

    Raises:
        httpx.HTTPStatusError: the server answered with an error status
            (raised by raise_for_status()).
    """
    cached = db.execute(
        "SELECT content, fetched_at FROM page_cache WHERE url=?", (url,)
    ).fetchone()
    max_age = CACHE_TTL.get(page_type, 1800)
    timestamp = int(time.time())

    # Cache hit: a row exists and is younger than the TTL for this page type.
    if cached:
        age = timestamp - cached["fetched_at"]
        if age < max_age:
            return cached["content"], True

    # Cache miss or stale row: download, then store the new payload.
    response = httpx.get(url, headers=HEADERS, follow_redirects=True, timeout=20)
    response.raise_for_status()
    body = response.text
    db.upsert(
        "page_cache",
        "url",
        ["url", "page_type", "fetched_at", "content"],
        (url, page_type, timestamp, body),
    )
    db.commit()
    return body, False
# ---------------------------------------------------------------------------
# RSC payload parsers
# ---------------------------------------------------------------------------
# UUID v4 pattern
_UUID = re.compile(r"[0-9a-f]{8}-[0-9a-f]{4}-4[0-9a-f]{3}-[89ab][0-9a-f]{3}-[0-9a-f]{12}")
# blindspotData — has coverageProfileStatement + coverageProfileType before the numbers
_BLIND = re.compile(
r'"blindspotData":\{[^}]{0,400}' # skip coverageProfileStatement, coverageProfileType
r'"leftPercent":([\d.]+),"rightPercent":([\d.]+),"centerPercent":([\d.]+),'
r'"leftSrcCount":(\d+),"rightSrcCount":(\d+),"cntrSrcCount":(\d+)'
)
# Story anchor: start + title + slug + factuality (field order confirmed from RSC)
_STORY = re.compile(
r'"start":"(20\d{2}-[^"]+)",'
r'"title":"([^"]{10,200})",'
r'"slug":"([a-z0-9][a-z0-9_-]{15,})",'
r'"factuality":\{([^}]+)\}'
)
# Escaped JSON string value
_JSON_STR = re.compile(r'"((?:[^"\\]|\\.)*)"')
def _decode(s: str) -> str:
"""Decode a JSON-escaped string value."""
try:
return json.loads(f'"{s}"')
except Exception:
return s
def parse_stories(data: str, category: str) -> list[dict]:
    """Extract every story found in an RSC payload.

    Each match of the _STORY anchor (start, title, slug, factuality) yields
    one dict. The remaining fields are looked up in fixed-size text windows
    around the anchor: up to 8000 characters before it and 6000 after it.

    NOTE(review): apart from the story id (last UUID in the window), each
    field takes the FIRST match inside its window, which could belong to a
    preceding story when stories are shorter than the window — confirm
    against a real payload before changing window sizes.

    Args:
        data: Raw RSC payload text.
        category: Interest slug recorded on each story as "category".

    Returns:
        One dict per anchor match, in payload order.
    """
    def first(pattern: str, text: str):
        # First regex match of *pattern* in *text*, or None.
        return re.search(pattern, text)

    found: list[dict] = []
    for anchor in _STORY.finditer(data):
        start, raw_title, slug, fact_raw = anchor.groups()
        window_lo = max(0, anchor.start() - 8000)
        preceding = data[window_lo:anchor.start()]  # at most 8000 chars
        following = data[anchor.end():anchor.end() + 6000]

        # Story id: the last v4 UUID in the 4000 chars before the anchor.
        id_candidates = _UUID.findall(preceding[-4000:])
        story_id = id_candidates[-1] if id_candidates else None

        # blindspotData precedes the anchor; percentages are already 0-100.
        blind = _BLIND.search(preceding)
        if blind:
            left_pct, right_pct, ctr_pct = (float(blind.group(n)) for n in (1, 2, 3))
            left_cnt, right_cnt, ctr_cnt = (int(blind.group(n)) for n in (4, 5, 6))
        else:
            left_pct = right_pct = ctr_pct = None
            left_cnt = right_cnt = ctr_cnt = None

        bias_hit = first(r'"biasSourceCount":(\d+)', preceding)
        overall_hit = first(r'"overallBias":([-\d.]+)', preceding)  # score -1 .. +1
        spot_hit = first(r'"blindspot":"(left|right|center|none)"', preceding)
        # Description may contain JSON escapes; only the last 3000 chars are searched.
        desc_hit = first(r'"description":"((?:[^"\\]|\\.){0,600})"', preceding[-3000:])
        # sourceCount follows the anchor (inside sources:[...]).
        count_hit = first(r'"sourceCount":(\d+)', following)
        # Ground News interest UUIDs this story belongs to.
        interests_hit = first(r'"interests":\[([^\]]*)\]', preceding[-2000:])

        story = {
            "slug": slug,
            "story_id": story_id,
            "title": _decode(raw_title),
            "description": _decode(desc_hit.group(1)) if desc_hit else None,
            "start_date": start[:10],
            "source_count": int(count_hit.group(1)) if count_hit else 0,
            "bias_src_count": int(bias_hit.group(1)) if bias_hit else 0,
            "left_pct": left_pct,
            "ctr_pct": ctr_pct,
            "right_pct": right_pct,
            "left_src_count": left_cnt,
            "ctr_src_count": ctr_cnt,
            "right_src_count": right_cnt,
            "overall_bias": float(overall_hit.group(1)) if overall_hit else None,
            "blindspot": spot_hit.group(1) if spot_hit else None,
            "factuality": {
                key: int(value)
                for key, value in re.findall(r'"(\w+)":(\d+)', fact_raw)
            },
            "interests": _UUID.findall(interests_hit.group(1)) if interests_hit else [],
            "category": category,
        }
        found.append(story)
    return found
def parse_single_article(data: str, slug: str) -> dict:
    """Extract the fields of one article from its own page payload.

    Unlike parse_stories(), every pattern is searched over the whole payload
    and the first match wins. The result additionally carries a
    "bias_breakdown" dict with per-side source counts and percentages.

    Args:
        data: Raw RSC payload of the article page.
        slug: Article slug; used to locate the matching "id" and echoed back.

    Returns:
        Dict of extracted fields; fields that were not found are None
        (or an empty dict for "factuality" / "bias_breakdown").
    """
    def first_group(pattern, cast=str):
        # Group 1 of the first match converted with *cast*; None when there
        # is no match or the conversion fails.
        found = re.search(pattern, data)
        if not found:
            return None
        try:
            return cast(found.group(1))
        except Exception:
            return None

    # Prefer the "id" that sits shortly before this article's slug; otherwise
    # fall back to the first "id" in the payload.
    id_near_slug = re.search(
        r'"id":"([0-9a-f-]{36})"[^}]{0,200}"slug":"' + re.escape(slug), data, re.DOTALL
    )
    if id_near_slug:
        story_id = id_near_slug.group(1)
    else:
        story_id = first_group(r'"id":"([0-9a-f-]{36})"')

    # Prefer the title that directly precedes wireStoryRefs.
    title_near_refs = re.search(
        r'"title":"([^"]{10,200})"[^}]{0,100}"wireStoryRefs"', data, re.DOTALL
    )
    if title_near_refs:
        title = _decode(title_near_refs.group(1))
    else:
        title = first_group(r'"title":"([^"]{10,200})"')

    blind = _BLIND.search(data)
    if blind:
        left_pct, right_pct, ctr_pct = (float(blind.group(n)) for n in (1, 2, 3))
        left_cnt, right_cnt, ctr_cnt = (int(blind.group(n)) for n in (4, 5, 6))
    else:
        left_pct = right_pct = ctr_pct = None
        left_cnt = right_cnt = ctr_cnt = None

    # Per-side breakdown; a side is omitted when its pattern does not match.
    side_hits = (
        (
            side,
            re.search(
                rf'"id":"{side}".*?"sourceCount":(\d+).*?"percent":(\d+)',
                data, re.DOTALL
            ),
        )
        for side in ("left", "center", "right")
    )
    bias_breakdown = {
        side: {"sources": int(hit.group(1)), "percent": int(hit.group(2))}
        for side, hit in side_hits
        if hit
    }

    fact_hit = re.search(r'"factuality":\{([^}]+)\}', data)
    if fact_hit:
        factuality = {
            key: int(value)
            for key, value in re.findall(r'"(\w+)":(\d+)', fact_hit.group(1))
        }
    else:
        factuality = {}

    desc_hit = re.search(r'"description":"((?:[^"\\]|\\.){20,600})"', data)
    description = _decode(desc_hit.group(1)) if desc_hit else None

    return {
        "slug": slug,
        "story_id": story_id,
        "title": title,
        "description": description,
        "start_date": first_group(r'"start":"(20\d{2}-[^"]+)"'),
        "source_count": first_group(r'"sourceCount":(\d+)', int),
        "bias_src_count": first_group(r'"biasSourceCount":(\d+)', int),
        "overall_bias": first_group(r'"overallBias":([-\d.]+)', float),
        "blindspot": first_group(r'"blindspot":"(left|right|center|none)"'),
        "left_pct": left_pct,
        "right_pct": right_pct,
        "ctr_pct": ctr_pct,
        "left_src_count": left_cnt,
        "right_src_count": right_cnt,
        "ctr_src_count": ctr_cnt,
        "factuality": factuality,
        "bias_breakdown": bias_breakdown,
    }
# ---------------------------------------------------------------------------
# DB upsert
# ---------------------------------------------------------------------------
def upsert_articles(db: DBConn, stories: list[dict]) -> int:
    """Write parsed stories to the articles table and commit once at the end.

    A story whose slug is not yet stored is inserted with first_seen and
    last_seen set to the current time. For an existing slug the metrics are
    overwritten, story_id and description are only filled in when still
    NULL, the story's category is merged into the comma-separated
    categories column, and last_seen is refreshed.

    Args:
        db: Open connection holding the articles table.
        stories: Story dicts as produced by parse_stories().

    Returns:
        Number of newly inserted rows.
    """
    select_sql = "SELECT categories, first_seen FROM articles WHERE slug=?"
    update_sql = """UPDATE articles SET
story_id=COALESCE(story_id, ?),
source_count=?, bias_src_count=?,
left_pct=?, ctr_pct=?, right_pct=?,
left_src_count=?, ctr_src_count=?, right_src_count=?,
overall_bias=?, blindspot=?,
description=COALESCE(description, ?),
categories=?, last_seen=?
WHERE slug=?"""
    insert_sql = """INSERT INTO articles
(slug, story_id, title, description, start_date,
source_count, bias_src_count,
left_pct, ctr_pct, right_pct,
left_src_count, ctr_src_count, right_src_count,
overall_bias, blindspot,
factuality_json, interests_json,
categories, first_seen, last_seen)
VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)"""

    timestamp = int(time.time())
    inserted = 0
    for story in stories:
        existing = db.execute(select_sql, (story["slug"],)).fetchone()

        # Merge the stored category list (if any) with this story's category.
        merged = set()
        if existing:
            merged.update(
                name for name in (existing["categories"] or "").split(",") if name
            )
        merged.add(story["category"])

        # Metric columns appear in the same order in both statements.
        metrics = (
            story["source_count"], story["bias_src_count"],
            story["left_pct"], story["ctr_pct"], story["right_pct"],
            story["left_src_count"], story["ctr_src_count"], story["right_src_count"],
            story["overall_bias"], story["blindspot"],
        )

        if existing:
            db.execute(
                update_sql,
                (
                    story["story_id"],
                    *metrics,
                    story["description"],
                    ",".join(sorted(merged)), timestamp,
                    story["slug"],
                ),
            )
            continue

        db.execute(
            insert_sql,
            (
                story["slug"], story["story_id"], story["title"],
                story["description"], story["start_date"],
                *metrics,
                json.dumps(story["factuality"]), json.dumps(story["interests"]),
                story["category"], timestamp, timestamp,
            ),
        )
        inserted += 1

    db.commit()
    return inserted
# ---------------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------------
def fetch_article_text(slug: str, db: DBConn | None = None) -> str:
    """Fetch an article's RSC payload and return a clean text blob for NLP.

    Collects, in this order and without duplicates: every "title" value,
    every "description" / "excerpt" / "summary" value, and every "headline"
    value (wire story / source article headlines). Titles and descriptions
    that look like photo credits are skipped, and only texts longer than
    20 characters are kept.

    Args:
        slug: Article slug (the path segment after /article/).
        db: Connection to use for the page cache. When None, a connection is
            opened for this call and closed again, even if the fetch fails.

    Returns:
        The collected texts joined by single spaces ("" when nothing matched).
    """
    own_db = db is None
    if own_db:
        db = get_db()
    try:
        url = f"{BASE_URL}/article/{slug}"
        data, _ = fetch_cached(db, url, "article")
    finally:
        # Only close a connection opened here; never the caller's.
        if own_db:
            db.close()

    parts: list[str] = []
    seen: set[str] = set()

    def add(text: str) -> None:
        # Keep non-trivial texts once, preserving first-seen order.
        if text and len(text) > 20 and text not in seen:
            seen.add(text)
            parts.append(text)

    # Main title
    for m in re.finditer(r'"title":"((?:[^"\\]|\\.){10,300})"', data):
        t = _decode(m.group(1))
        if not re.search(r'Getty|AFP|\/AFP|PHOTO-TAG', t, re.I):
            add(t)

    # Descriptions / excerpts
    for pattern in [
        r'"description":"((?:[^"\\]|\\.){20,600})"',
        r'"excerpt":"((?:[^"\\]|\\.){20,400})"',
        r'"summary":"((?:[^"\\]|\\.){20,400})"',
    ]:
        for m in re.finditer(pattern, data):
            t = _decode(m.group(1))
            if not re.search(r'Getty|AFP|PHOTO-TAG|Author:', t, re.I):
                add(t)

    # Wire story / source article headlines
    for m in re.finditer(r'"headline":"((?:[^"\\]|\\.){10,300})"', data):
        add(_decode(m.group(1)))

    return " ".join(parts)
def fetch_article(slug: str, db: DBConn | None = None) -> dict:
    """Fetch a single article page (through the page cache) and parse it.

    Args:
        slug: Article slug (the path segment after /article/).
        db: Connection to use for the page cache. When None, a connection is
            opened for this call and closed again, even if fetching or
            parsing fails.

    Returns:
        The dict produced by parse_single_article(). The parsed article is
        not written to the articles table here.
    """
    own_db = db is None
    if own_db:
        db = get_db()
    try:
        url = f"{BASE_URL}/article/{slug}"
        data, _ = fetch_cached(db, url, "article")
        return parse_single_article(data, slug)
    finally:
        # Only close a connection opened here; never the caller's.
        if own_db:
            db.close()
def _http_fetch_category(
    category_slug: str,
    *,
    force: bool = False,
) -> tuple[str, list[dict], bool]:
    """Fetch and parse one interest page; touches only the page cache.

    Uses its own DB connection so it can run in a worker thread (psycopg2
    connections are not thread-safe). The connection is always closed, also
    when the download fails, so failed workers do not leak connections.

    Args:
        category_slug: Interest slug (the path segment after /interest/).
        force: When True, drop the cached page first so it is re-downloaded.

    Returns:
        (slug, stories, from_cache).
    """
    db = get_conn()
    try:
        url = f"{BASE_URL}/interest/{category_slug}"
        if force:
            db.execute("DELETE FROM page_cache WHERE url=?", (url,))
            db.commit()
        data, from_cache = fetch_cached(db, url, "interest")
    finally:
        db.close()
    stories = parse_stories(data, category_slug)
    return category_slug, stories, from_cache
def fetch_category(
    category_slug: str,
    db: DBConn,
    *,
    force: bool = False,
) -> tuple[list[dict], bool]:
    """Fetch one interest page and store its stories.

    The page is downloaded and parsed by _http_fetch_category(); the parsed
    stories are then upserted through *db*.

    Args:
        category_slug: Interest slug to fetch.
        db: Connection used for writing the articles.
        force: When True, bypass the cached copy of the page.

    Returns:
        (stories, from_cache).
    """
    fetched = _http_fetch_category(category_slug, force=force)
    stories, from_cache = fetched[1], fetched[2]
    upsert_articles(db, stories)
    return stories, from_cache
def fetch_all(
    db: DBConn,
    slugs: list[str] | None = None,
    *,
    force: bool = False,
    workers: int = 12,
) -> dict[str, list[dict]]:
    """Fetch many interest categories concurrently and store their stories.

    Pages are downloaded and parsed in worker threads; each finished result
    is upserted through *db* from the calling thread. One progress line is
    printed per category. A category that fails is reported and mapped to an
    empty list instead of aborting the run.

    Args:
        db: Connection used for writing the articles.
        slugs: Interest slugs to fetch; None or an empty list means every
            key of KNOWN_INTERESTS.
        force: When True, bypass cached pages.
        workers: Size of the thread pool.

    Returns:
        {slug: [story, ...]} for every requested slug.
    """
    targets = slugs if slugs else list(KNOWN_INTERESTS.keys())
    results: dict[str, list[dict]] = {}
    pool = concurrent.futures.ThreadPoolExecutor(max_workers=workers)
    with pool:
        pending = {}
        for target in targets:
            pending[pool.submit(_http_fetch_category, target, force=force)] = target
        for finished in concurrent.futures.as_completed(pending):
            slug = pending[finished]
            try:
                _, stories, cached = finished.result()
                upsert_articles(db, stories)  # DB write in main thread
                results[slug] = stories
                if cached:
                    icon = "💾"
                else:
                    icon = "🌐"
                print(f" {icon} {slug:<38} {len(stories):2} stories")
            except Exception as e:
                print(f"{slug:<38} ERROR: {e}")
                results[slug] = []
    return results
def top_articles(
    db: DBConn,
    limit: int = 30,
    days: int | None = 2,
    min_sources: int = 0,
) -> list[sqlite3.Row]:
    """Return the stored articles with the highest source_count.

    Args:
        db: Connection holding the articles table.
        limit: Maximum number of rows to return.
        days: Only include articles whose start_date is at most this many
            days before today's UTC date; None disables the date filter.
        min_sources: Minimum source_count an article must have.

    Returns:
        Rows ordered by source_count, highest first.
    """
    from datetime import datetime, timedelta, timezone

    where = "WHERE source_count >= ?"
    params: list = [min_sources]
    if days is not None:
        # The cutoff is computed here rather than with SQLite's
        # date('now', ?) so the same query also runs on a Postgres backend.
        # date('now') is UTC, hence the UTC date. start_date is stored as
        # 'YYYY-MM-DD' (see parse_stories), so an ISO date compares correctly.
        cutoff = datetime.now(timezone.utc).date() - timedelta(days=days)
        where += " AND start_date >= ?"
        params.append(cutoff.isoformat())
    return db.execute(
        f"SELECT * FROM articles {where} ORDER BY source_count DESC LIMIT ?",
        (*params, limit),
    ).fetchall()
# ---------------------------------------------------------------------------
# Display
# ---------------------------------------------------------------------------
def print_top(rows: list[sqlite3.Row], header: str = "Top artikler") -> None:
    """Print a numbered, human-readable listing of article rows to stdout.

    Each entry shows the source count, the optional left/center/right split,
    overall bias and blindspot label, the start date, the title (first 80
    characters), the description (first 90 characters, when present), the
    categories and the article path.

    Args:
        rows: Article rows supporting access by column name.
        header: Heading printed above the listing.
    """
    print(f"\n{'='*76}")
    print(f" {header} ({len(rows)} artikler)")
    print(f"{'='*76}\n")
    for i, a in enumerate(rows, 1):
        # Optional fragments collapse to "" when their column is empty.
        bias = (
            f" L{a['left_pct']:.0f}% C{a['ctr_pct']:.0f}% R{a['right_pct']:.0f}%"
            if a["left_pct"] is not None
            else ""
        )
        cats = (a["categories"] or "").replace(",", " · ")
        ob = "" if a["overall_bias"] is None else f" bias={a['overall_bias']:+.2f}"
        bs = "" if not a["blindspot"] else f" blindspot={a['blindspot']}"

        print(f"{i:2}. [{a['source_count']:4} src{bias}{ob}{bs}] [{a['start_date']}]")
        print(f" {a['title'][:80]}")
        if a["description"]:
            print(f" {a['description'][:90]}")
        print(f" [{cats}]")
        print(f" /article/{a['slug']}")
        print()
# ---------------------------------------------------------------------------
# CLI
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    # Command-line entry point:
    #   article <slug>   fetch one article page and print its parsed fields
    #   category <slug>  fetch one interest page and list its stories
    #   [--force]        fetch every known interest, then print the top 30
    import sys

    command = sys.argv[1] if len(sys.argv) >= 2 else None
    # Both sub-commands need a slug; fail with a usage line before opening
    # the database instead of raising IndexError.
    if command in ("article", "category") and len(sys.argv) < 3:
        sys.exit(f"usage: {sys.argv[0]} {command} <slug>")

    db = get_db()
    try:
        if command == "article":
            slug = sys.argv[2]
            url = f"{BASE_URL}/article/{slug}"
            data, cached = fetch_cached(db, url, "article")
            result = parse_single_article(data, slug)
            print(f"({'cached' if cached else 'fetched'})")
            print(json.dumps(result, indent=2, ensure_ascii=False))
        elif command == "category":
            slug = sys.argv[2]
            stories, cached = fetch_category(slug, db)
            print(f"({'cached' if cached else 'fetched'}) {len(stories)} stories\n")
            for s in sorted(stories, key=lambda x: x["source_count"], reverse=True):
                print(f" [{s['source_count']:4} src] {s['title'][:70]}")
        else:
            force = "--force" in sys.argv
            days = 3
            print(f"Fetching all {len(KNOWN_INTERESTS)} categories (force={force})…\n")
            fetch_all(db, force=force)
            rows = top_articles(db, limit=30, days=days)
            print_top(rows, f"Top 30 seneste {days} dage")
    finally:
        # Release the connection even when a fetch or parse step fails.
        db.close()