First commit

2026-05-26 22:21:27 +02:00
parent 2743a236b2
commit 05eed51e7d
90 changed files with 8690 additions and 0 deletions
--- a/analyze.py
+++ b/analyze.py
@@ -0,0 +1,562 @@
+"""
+analyze.py — C25 Financial Signal Extractor
+
+Pipeline (v2):
+  1. Alias screen on title+desc for C25 mentions
+  2. Coverage-spread filter: skip low-quality / one-sided articles
+  3. NER upgrade: BERT-NER to confirm and expand matches
+  4. Full-text fetch + re-screen
+  5. FinBERT: quick sentiment — drop neutral < FINBERT_MIN_CONF
+  6. Claude: structured extraction (tickers, magnitude, timeframe)
+  7. yfinance: momentum check — direction alignment
+  8. signal_score = sentiment_confidence × coverage_spread × momentum_alignment
+  9. Alert if signal_score > ALERT_THRESHOLD
+
+Usage:
+  python3 analyze.py                  # analyze new articles only
+  python3 analyze.py --force          # re-analyze everything
+  python3 analyze.py --limit 20       # limit to 20 articles
+  python3 analyze.py --dry-run        # show matches without storing
+  python3 analyze.py --no-claude      # skip Claude step (no API cost)
+"""
+
+import re
+import os
+import sys
+import json
+import time
+import math
+import sqlite3
+import logging
+import warnings
+from pathlib import Path
+
+# Silence transformers / Hugging Face noise. These statements run before the
+# `torch` and `transformers` imports that follow them in this file.
+# setdefault() leaves any value that is already present in the environment as-is.
+os.environ.setdefault("TRANSFORMERS_VERBOSITY", "error")
+os.environ.setdefault("HF_HUB_DISABLE_PROGRESS_BARS", "1")
+# NOTE(review): this filter suppresses every Python warning in the process, not
+# only those raised by transformers — confirm that is intended.
+warnings.filterwarnings("ignore")
+logging.getLogger("transformers").setLevel(logging.ERROR)
+
+import torch
+from transformers import pipeline
+from dotenv import load_dotenv
+
+# Load .env from this file's directory, if it exists
+# (supports both ANTHROPIC_API_KEY and anthropic_api_key).
+# override=False asks load_dotenv not to replace variables that are already set.
+_env_file = Path(__file__).parent / ".env"
+if _env_file.exists():
+    load_dotenv(_env_file, override=False)
+    # Copy any case-variant of the key name to the canonical upper-case name,
+    # unless the upper-case name is already defined. list() snapshots the
+    # environment because the loop body may add a key while iterating.
+    # Note: this normalisation only runs when the .env file exists.
+    for _k, _v in list(os.environ.items()):
+        if _k.lower() == "anthropic_api_key" and "ANTHROPIC_API_KEY" not in os.environ:
+            os.environ["ANTHROPIC_API_KEY"] = _v
+
+# Ground News helpers
+# This file's directory is put first on sys.path — presumably so the
+# ground_news / rss_feeds modules imported below resolve from next to this file.
+sys.path.insert(0, str(Path(__file__).parent))
+from ground_news import get_db, fetch_article_text, fetch_all
+from rss_feeds import fetch_all_rss
+
+# ---------------------------------------------------------------------------
+# Config
+# ---------------------------------------------------------------------------
+
+C25_PATH = Path(__file__).parent / "c25.json"
+# JSON is UTF-8 by specification; decode explicitly so any non-ASCII text in
+# the file does not depend on the platform's default locale encoding.
+_c25_raw = json.loads(C25_PATH.read_text(encoding="utf-8"))
+# Top-level keys starting with "_" are not tickers and are skipped.
+C25: dict[str, dict] = {k: v for k, v in _c25_raw.items() if not k.startswith("_")}
+
+# Build alias → ticker lookup (lower-cased for matching)
+ALIAS_MAP: dict[str, str] = {}
+for _ticker, _data in C25.items():
+    for _alias in _data["aliases"]:
+        # setdefault keeps the first ticker registered for an alias:
+        # first alias wins (most specific first in c25.json)
+        ALIAS_MAP.setdefault(_alias.lower(), _ticker)
+
+# Device argument handed to both Hugging Face pipelines built in this file.
+DEVICE = -1  # always CPU — Quadro P400 (CC 6.1) too old + too little VRAM for these models
+
+# Quality thresholds
+# MIN_SOURCES:         coverage_spread_score() returns 0.0 below this source count.
+# MIN_COVERAGE_SPREAD: articles scoring below this are dropped in Phase 1.
+# FINBERT_MIN_CONF:    neutral FinBERT results below this confidence are dropped.
+# ALERT_THRESHOLD:     a non-neutral signal above this score raises an alert.
+MIN_SOURCES         = 1     # coverage_spread naturally weights single-source articles near zero
+MIN_COVERAGE_SPREAD = 0.0   # disabled: signal_score naturally zeros out single-source articles
+FINBERT_MIN_CONF    = 0.70  # drop neutral articles below this FinBERT confidence
+ALERT_THRESHOLD     = 0.35  # signal_score > this → alert
+
+
+# ---------------------------------------------------------------------------
+# Model loading
+# ---------------------------------------------------------------------------
+
+# Lazily-created pipeline singletons; filled in by get_ner() / get_finbert()
+# on first use so that importing this module does not load any model.
+_ner_model     = None
+_finbert_model = None
+
+
+def get_ner():
+    """Return the shared dslim/bert-base-NER pipeline, creating it on first call."""
+    global _ner_model
+    if _ner_model is not None:
+        return _ner_model
+
+    print("[analyze] Loading dslim/bert-base-NER …", flush=True)
+    ner_options = {
+        "model": "dslim/bert-base-NER",
+        "aggregation_strategy": "simple",
+        "device": DEVICE,
+    }
+    _ner_model = pipeline("ner", **ner_options)
+    return _ner_model
+
+
+def get_finbert():
+    """Return the shared ProsusAI/finbert sentiment pipeline, creating it on first call."""
+    global _finbert_model
+    if _finbert_model is not None:
+        return _finbert_model
+
+    print("[analyze] Loading ProsusAI/finbert …", flush=True)
+    finbert_options = {
+        "model": "ProsusAI/finbert",
+        "device": DEVICE,
+        # Inputs are truncated to at most 512 tokens.
+        "truncation": True,
+        "max_length": 512,
+    }
+    _finbert_model = pipeline("sentiment-analysis", **finbert_options)
+    return _finbert_model
+
+
+# ---------------------------------------------------------------------------
+# C25 alias matching
+# ---------------------------------------------------------------------------
+
+def match_c25(text: str) -> dict[str, float]:
+    """
+    Find C25 companies mentioned in text.
+
+    Matching is case-insensitive. An alias only counts when it is not directly
+    preceded or followed by a letter or digit. The boundary test is
+    Unicode-aware, so Danish letters (æ, ø, å) count as word characters: a
+    short alias cannot match inside a longer Danish word.
+
+    Returns {ticker: confidence_score}; an empty dict when text is empty/None
+    or nothing matches.
+    """
+    if not text:
+        return {}
+
+    text_lower = text.lower()
+    matches: dict[str, float] = {}
+
+    for alias_lower, ticker in ALIAS_MAP.items():
+        if ticker in matches:
+            continue  # already found this company
+
+        # Always use word boundaries — prevents "sonic" matching "hypersonic",
+        # "net" matching "internet", "iss" matching "mission", etc.
+        # [^\W_] is "any Unicode letter or digit": the same set as
+        # [a-zA-Z0-9] for ASCII text, but it also covers non-ASCII letters.
+        pat = r"(?<![^\W_])" + re.escape(alias_lower) + r"(?![^\W_])"
+
+        if re.search(pat, text_lower):
+            # Confidence: longer alias match = more reliable
+            matches[ticker] = min(0.99, 0.70 + len(alias_lower) * 0.01)
+
+    return matches
+
+
+def merge_ner_matches(ner_result: list[dict], base: dict[str, float]) -> dict[str, float]:
+    """
+    Cross-reference NER entities (groups ORG and PER) with the alias map.
+
+    Entity text and aliases are split into tokens on whitespace, "-", "_" and
+    "/". A ticker is credited when the shared tokens make up at least half of
+    the larger token set. Requires whole-token match to avoid 'EMA' matching
+    'd-ema-nt'. Aliases shorter than 4 characters are ignored here.
+
+    Returns a copy of base in which a ticker's score is raised to the NER
+    score whenever that is higher; base itself is not modified.
+    """
+    def _tokens(value: str) -> set[str]:
+        # Drop empty strings produced by leading/trailing separators; they
+        # would otherwise inflate the set size and could match each other.
+        return {tok for tok in re.split(r"[\s\-_/]+", value) if tok}
+
+    # Alias token sets do not depend on the entity, so build them once per call.
+    alias_token_sets = [
+        (ticker, _tokens(alias_lower))
+        for alias_lower, ticker in ALIAS_MAP.items()
+        if len(alias_lower) >= 4
+    ]
+
+    merged = dict(base)
+    for ent in ner_result:
+        if ent.get("entity_group") not in ("ORG", "PER"):
+            continue
+        # strip("#") removes leading/trailing '#' (word-piece markers).
+        word_tokens = _tokens(ent["word"].lower().strip("#"))
+        if not word_tokens:
+            continue
+        score = ent.get("score", 0.7)
+        for ticker, alias_tokens in alias_token_sets:
+            # Need significant token overlap, not just substring containment
+            overlap = alias_tokens & word_tokens
+            if overlap and len(overlap) / max(len(alias_tokens), len(word_tokens)) >= 0.5:
+                if score > merged.get(ticker, 0):
+                    merged[ticker] = score
+    return merged
+
+
+
+
+# ---------------------------------------------------------------------------
+# Coverage spread scoring
+# ---------------------------------------------------------------------------
+
+def coverage_spread_score(row) -> float:
+    """
+    Quality score (0–1) based on source count and bias diversity.
+    High = many sources from left + right + centre. Low = few or echo chamber.
+
+    row must support lookup of "source_count", "left_src_count",
+    "right_src_count" and "ctr_src_count"; missing (None) counts are read as 0.
+    Returns 0.0 when there are no sources or fewer than MIN_SOURCES.
+    """
+    src   = row["source_count"] or 0
+    left  = row["left_src_count"] or 0
+    right = row["right_src_count"] or 0
+    ctr   = row["ctr_src_count"] or 0
+
+    # The explicit src <= 0 guard keeps the divisions below safe even if
+    # MIN_SOURCES is configured as 0.
+    if src <= 0 or src < MIN_SOURCES:
+        return 0.0
+
+    # Log-scaled source count; reaches 1.0 at 50 sources.
+    quantity  = min(1.0, math.log(max(1, src)) / math.log(50))
+    fl, fr, fc = left / src, right / src, ctr / src
+    # Geometric mean of the three shares, scaled by 9 and clamped: it is 0 when
+    # any side is absent and reaches 1.0 once the geometric mean is >= 1/9
+    # (equal thirds gives 1/3, well inside the clamp).
+    diversity  = min(1.0, (fl * fr * fc) ** (1 / 3) * 9)
+
+    return round(quantity * 0.6 + diversity * 0.4, 3)
+
+
+# ---------------------------------------------------------------------------
+# Claude structured extraction
+# ---------------------------------------------------------------------------
+
+def claude_extract(title: str, text: str, tickers: list[str]) -> dict:
+    """
+    Use Claude Haiku to extract structured financial signal.
+    Returns {"confirmed_tickers", "magnitude", "timeframe", "reasoning"}.
+
+    Best-effort: when no ANTHROPIC_API_KEY is set, when the API call fails, or
+    when the reply is not a JSON object, a neutral fallback dict is returned
+    (all given tickers, magnitude 5, timeframe "days") with the reason in
+    "reasoning". Only the first 1500 characters of text are sent.
+    """
+    def _fallback(reason: str) -> dict:
+        return {"confirmed_tickers": tickers, "magnitude": 5, "timeframe": "days", "reasoning": reason}
+
+    api_key = os.environ.get("ANTHROPIC_API_KEY")
+    if not api_key:
+        return _fallback("(no API key)")
+
+    # Imported only once a key is known, so the no-key path does not require
+    # the anthropic package to be installed.
+    import anthropic
+
+    client = anthropic.Anthropic(api_key=api_key)
+    ticker_ctx = "\n".join(
+        f"  {t}: {C25[t]['name']} ({C25[t]['sector']})" for t in tickers if t in C25
+    )
+
+    prompt = f"""You are a financial analyst specializing in Scandinavian equities.
+
+Analyze this news article and assess its financial impact on the listed Danish C25 companies.
+
+## Companies to analyze:
+{ticker_ctx}
+
+## Article:
+Title: {title}
+{text[:1500]}
+
+Respond ONLY with valid JSON (no markdown fences):
+{{
+  "confirmed_tickers": ["NOVO-B"],
+  "magnitude": 7,
+  "timeframe": "days",
+  "reasoning": "Two sentences max on financial impact and direction."
+}}
+
+Fields:
+- confirmed_tickers: only companies truly affected (can be [])
+- magnitude: 1–10 (1=irrelevant, 10=major market mover)
+- timeframe: "hours", "days", "weeks", or "months"
+- reasoning: brief analyst note"""
+
+    try:
+        msg = client.messages.create(
+            model="claude-haiku-4-5",
+            max_tokens=256,
+            messages=[{"role": "user", "content": prompt}],
+        )
+        raw = msg.content[0].text.strip()
+        # Tolerate a reply wrapped in markdown code fences despite the prompt.
+        raw = re.sub(r"^```(?:json)?\n?", "", raw)
+        raw = re.sub(r"\n?```$", "", raw)
+        parsed = json.loads(raw)
+        # Callers use .get() on the result, so anything but a JSON object
+        # (e.g. a bare list or string) is treated as a failed extraction.
+        if not isinstance(parsed, dict):
+            raise ValueError(f"expected a JSON object, got {type(parsed).__name__}")
+        return parsed
+    except Exception as e:
+        print(f"  [warn] Claude failed: {e}")
+        return _fallback(str(e)[:120])
+
+
+# ---------------------------------------------------------------------------
+# yfinance momentum
+# ---------------------------------------------------------------------------
+
+# Per-process memo of momentum results, keyed by C25 ticker. Failed lookups
+# are cached too, so each ticker is requested at most once per run.
+_momentum_cache: dict[str, dict] = {}
+
+
+def momentum_check(ticker: str) -> dict:
+    """
+    5-day price momentum for a C25 ticker via yfinance.
+
+    Returns {"direction", "pct_5d", "pct_20d"}. direction is "up" / "down" when
+    the 5-session change exceeds +1.5 % / -1.5 %, "flat" in between, and
+    "unknown" (with both percentages 0.0) when the lookup fails or fewer than
+    six closing prices are available.
+    """
+    if ticker in _momentum_cache:
+        return _momentum_cache[ticker]
+
+    import yfinance as yf
+
+    company      = C25.get(ticker, {})
+    yahoo_ticker = company.get("yahoo_ticker", ticker + ".CO")
+    result: dict = {"direction": "unknown", "pct_5d": 0.0, "pct_20d": 0.0}
+
+    try:
+        hist = yf.Ticker(yahoo_ticker).history(period="1mo", auto_adjust=True)
+        if len(hist) >= 6:
+            # Ignore rows without a close so NaN cannot leak into the result.
+            close = hist["Close"].dropna()
+            if len(close) >= 6:
+                # A change over 5 sessions compares the latest close with the
+                # one 5 rows earlier, i.e. iloc[-6] (iloc[-5] spans only 4).
+                pct_5d    = float((close.iloc[-1] / close.iloc[-6] - 1) * 100)
+                # Change since the first row of the 1-month window; only
+                # reported when that window holds at least 20 closes.
+                pct_20d   = float((close.iloc[-1] / close.iloc[0] - 1) * 100) if len(close) >= 20 else 0.0
+                direction = "up" if pct_5d > 1.5 else ("down" if pct_5d < -1.5 else "flat")
+                result    = {"direction": direction, "pct_5d": round(pct_5d, 2), "pct_20d": round(pct_20d, 2)}
+    except Exception as e:
+        # Best-effort: keep the "unknown" result, but say why instead of
+        # hiding the failure.
+        print(f"  [warn] momentum {yahoo_ticker}: {e}")
+
+    _momentum_cache[ticker] = result
+    return result
+
+
+# ---------------------------------------------------------------------------
+# Signal score
+# ---------------------------------------------------------------------------
+
+def calc_signal_score(sent_score: float, sentiment: str, coverage: float, momentum: dict) -> float:
+    """
+    signal_score = sentiment_confidence × coverage_spread × momentum_alignment
+
+    momentum_alignment is derived from momentum["direction"]:
+      missing / "unknown" → 0.5, "flat" → 0.7,
+      sentiment agrees with the price direction → 1.0, anything else → 0.4.
+    The product is rounded to 3 decimals.
+    """
+    direction = momentum.get("direction", "unknown")
+    agrees = (sentiment, direction) in (("positive", "up"), ("negative", "down"))
+
+    if direction == "unknown":
+        factor = 0.5
+    elif direction == "flat":
+        factor = 0.7
+    else:
+        factor = 1.0 if agrees else 0.4   # 0.4 = contrarian
+
+    return round(sent_score * coverage * factor, 3)
+
+
+# ---------------------------------------------------------------------------
+# DB schema migration
+# ---------------------------------------------------------------------------
+
+def migrate_db(db) -> None:
+    """
+    Add any missing signal columns to the SQLite article_signals table.
+
+    Does nothing when db reports db_type == "postgres" (per the original note,
+    that schema is managed by db.py). Otherwise every column listed below that
+    PRAGMA table_info does not report is added with ALTER TABLE, and the
+    change is committed.
+    """
+    if getattr(db, "db_type", None) == "postgres":
+        return
+
+    wanted = {
+        "coverage_spread":  "REAL    DEFAULT 0",
+        "claude_tickers":   "TEXT",
+        "claude_magnitude": "INTEGER DEFAULT 5",
+        "claude_timeframe": "TEXT",
+        "claude_reasoning": "TEXT",
+        "momentum_dir":     "TEXT",
+        "momentum_pct_5d":  "REAL    DEFAULT 0",
+        "signal_score":     "REAL    DEFAULT 0",
+        "alert":            "INTEGER DEFAULT 0",
+    }
+
+    table_info = db.execute("PRAGMA table_info(article_signals)").fetchall()
+    present = {info[1] for info in table_info}  # field 1 of each row is the column name
+    missing = [name for name in wanted if name not in present]
+
+    for name in missing:
+        db.execute(f"ALTER TABLE article_signals ADD COLUMN {name} {wanted[name]}")
+    db.commit()
+
+
+# ---------------------------------------------------------------------------
+# Main pipeline
+# ---------------------------------------------------------------------------
+
+def analyze_articles(
+    *,
+    force: bool = False,
+    limit: int | None = None,
+    dry_run: bool = False,
+    use_claude: bool = True,
+    auto_fetch: bool = True,
+) -> None:
+    """
+    Run the full signal-extraction pipeline over stored articles.
+
+    Args:
+      force:      re-analyze every article instead of only those that have no
+                  row in article_signals yet.
+      limit:      process at most this many articles (highest source_count
+                  first); None or 0 means no limit.
+      dry_run:    print matches instead of storing them; also skips the feed
+                  refresh, the Claude call and the yfinance lookup.
+      use_claude: call Claude for structured extraction when an
+                  ANTHROPIC_API_KEY is set.
+      auto_fetch: refresh the Ground News and RSS feeds before analyzing.
+
+    The DB handle is closed on every exit path, including errors.
+    """
+    db = get_db()
+    try:
+        _run_pipeline(
+            db,
+            force=force,
+            limit=limit,
+            dry_run=dry_run,
+            use_claude=use_claude,
+            auto_fetch=auto_fetch,
+        )
+    finally:
+        db.close()
+
+
+def _run_pipeline(
+    db,
+    *,
+    force: bool,
+    limit: int | None,
+    dry_run: bool,
+    use_claude: bool,
+    auto_fetch: bool,
+) -> None:
+    """Run all pipeline phases on an open DB handle; the caller closes it."""
+    migrate_db(db)
+
+    # Auto-refresh articles (respects 30-min cache TTL)
+    if auto_fetch and not dry_run:
+        before = db.execute("SELECT COUNT(*) AS cnt FROM articles").fetchone()["cnt"]
+        print("[analyze] Refreshing Ground News feed …")
+        fetch_all(db)
+        print("[analyze] Henter danske RSS feeds …")
+        fetch_all_rss(db)
+        after = db.execute("SELECT COUNT(*) AS cnt FROM articles").fetchone()["cnt"]
+        if after > before:
+            print(f"[analyze] +{after - before} new articles")
+
+    base_q = """
+        SELECT slug, title, description, source_count,
+               left_src_count, right_src_count, ctr_src_count,
+               left_pct, right_pct, ctr_pct
+        FROM articles {where}
+        ORDER BY source_count DESC
+    """
+    rows = db.execute(
+        base_q.format(where="" if force else
+            "WHERE slug NOT IN (SELECT DISTINCT article_slug FROM article_signals)")
+    ).fetchall()
+    if limit:
+        rows = rows[:limit]
+
+    total = len(rows)
+    print(f"[analyze] {total} articles to process  (force={force}  dry_run={dry_run}  claude={use_claude})")
+    if total == 0:
+        print("[analyze] Nothing to do.")
+        return
+
+    # ------------------------------------------------------------------
+    # Phase 1 — Alias screen + coverage spread filter
+    # ------------------------------------------------------------------
+    print("[analyze] Phase 1: alias screen + coverage filter …")
+    screened: list[tuple] = []
+    dropped_cov = 0
+
+    for row in rows:
+        cov = coverage_spread_score(row)
+        if cov < MIN_COVERAGE_SPREAD:
+            dropped_cov += 1
+            continue
+        text    = f"{row['title']}. {row['description'] or ''}"
+        matches = match_c25(text)
+        if matches:
+            screened.append((row, matches, cov))
+
+    print(f"[analyze] {len(screened)}/{total} passed  ({dropped_cov} dropped by coverage filter)")
+    if not screened:
+        return
+
+    # ------------------------------------------------------------------
+    # Phase 2 — NER upgrade
+    # ------------------------------------------------------------------
+    print("[analyze] Phase 2: NER upgrade …")
+    ner   = get_ner()
+    texts = [f"{r['title']}. {r['description'] or ''}" for r, _, _ in screened]
+
+    BATCH   = 16
+    all_ner = []
+    for i in range(0, len(texts), BATCH):
+        all_ner.extend(ner(texts[i : i + BATCH]))
+
+    enriched = [
+        (row, merge_ner_matches(ner_res, base), cov)
+        for (row, base, cov), ner_res in zip(screened, all_ner)
+    ]
+
+    # ------------------------------------------------------------------
+    # Phase 3 — Full text + re-screen
+    # ------------------------------------------------------------------
+    print(f"[analyze] Phase 3: fetching full text for {len(enriched)} articles …")
+    final: list[tuple] = []
+
+    for idx, (row, matches, cov) in enumerate(enriched, 1):
+        slug = row["slug"]
+        if idx % 5 == 0 or idx == len(enriched):
+            print(f"  {idx}/{len(enriched)}: {slug[:55]}")
+
+        teaser = f"{row['title']}. {row['description'] or ''}"
+
+        # RSS articles keep their text in page_cache under the key "rss:{slug}".
+        # NOTE(review): the SELECT above does not list `categories`, so this
+        # branch looks unreachable unless the row type exposes extra keys —
+        # confirm, and add the column to the query if the RSS path is intended.
+        cats = row["categories"] if "categories" in row.keys() else ""
+        if cats and cats.startswith("rss:"):
+            cache_row = db.execute(
+                "SELECT content FROM page_cache WHERE url = ?",
+                (f"rss:{slug}",),
+            ).fetchone()
+            full_text = cache_row["content"] if cache_row else teaser
+        else:
+            full_text = fetch_article_text(slug, db)
+
+        # If no article body came back (None or empty), analyze title +
+        # description instead of crashing or scoring an empty string.
+        if not full_text:
+            full_text = teaser
+
+        full_matches = match_c25(full_text)
+        for ticker, score in full_matches.items():
+            if score > matches.get(ticker, 0):
+                matches[ticker] = score
+        if matches:
+            final.append((row, matches, cov, full_text))
+
+    print(f"[analyze] {len(final)} articles with confirmed C25 mentions")
+
+    # ------------------------------------------------------------------
+    # Phase 4 — FinBERT sentiment (confidence filter)
+    # ------------------------------------------------------------------
+    print("[analyze] Phase 4: FinBERT sentiment …")
+    finbert          = get_finbert()
+    now              = int(time.time())
+    signals_written  = 0
+    alerts_triggered = 0
+
+    for row, matches, cov, full_text in final:
+        slug  = row["slug"]
+        title = row["title"]
+
+        try:
+            # Only the first 400 whitespace-separated words are scored.
+            fb        = finbert(" ".join(full_text.split()[:400]))[0]
+            sentiment = fb["label"].lower()
+            sent_score = round(fb["score"], 4)
+        except Exception as e:
+            print(f"  [warn] FinBERT: {e}")
+            sentiment, sent_score = "neutral", 0.5
+
+        if sentiment == "neutral" and sent_score < FINBERT_MIN_CONF:
+            continue   # drop low-confidence neutral noise
+
+        # ------------------------------------------------------------------
+        # Phase 5 — Claude extraction
+        # ------------------------------------------------------------------
+        claude_data: dict = {}
+        if use_claude and not dry_run and os.environ.get("ANTHROPIC_API_KEY"):
+            print(f"  [claude] {slug[:50]}")
+            extracted = claude_extract(title, full_text, list(matches.keys()))
+            # The .get() calls below need a mapping; ignore any other shape.
+            claude_data = extracted if isinstance(extracted, dict) else {}
+
+        # ------------------------------------------------------------------
+        # Phase 6 — yfinance momentum + scoring
+        # ------------------------------------------------------------------
+        full_lower = full_text.lower()  # identical for every ticker of this article
+        for ticker, entity_score in matches.items():
+            company = C25[ticker]
+            # Count boundary-delimited alias occurrences. [^\W_] is "Unicode
+            # letter or digit", so Danish letters are treated as word characters.
+            mention_count = max(1, sum(
+                len(re.findall(
+                    r"(?<![^\W_])" + re.escape(a.lower()) + r"(?![^\W_])",
+                    full_lower,
+                ))
+                for a in company["aliases"]
+            ))
+
+            momentum  = momentum_check(ticker) if not dry_run else {}
+            sig_score = calc_signal_score(sent_score, sentiment, cov, momentum)
+            alert     = sig_score > ALERT_THRESHOLD and sentiment != "neutral"
+
+            if dry_run:
+                print(
+                    f"  DRY: {slug[:38]:<38} | {ticker:<8} | "
+                    f"{sentiment:<8} {sent_score:.2f} | cov={cov:.2f} | sig={sig_score:.3f}"
+                    f"{' ⚡' if alert else ''}"
+                )
+            else:
+                db.upsert(
+                    "article_signals",
+                    ["article_slug", "ticker"],
+                    [
+                        "article_slug", "ticker", "company_name", "sector",
+                        "sentiment", "sentiment_score", "entity_score",
+                        "mention_count", "full_text_used", "analyzed_at",
+                        "coverage_spread", "claude_tickers", "claude_magnitude",
+                        "claude_timeframe", "claude_reasoning",
+                        "momentum_dir", "momentum_pct_5d", "signal_score", "alert",
+                    ],
+                    (
+                        slug, ticker, company["name"], company["sector"],
+                        sentiment, float(sent_score), round(float(entity_score), 4),
+                        mention_count, 1, now,
+                        float(cov),
+                        # Stored as a JSON list; "[]" when Claude was skipped.
+                        json.dumps(claude_data.get("confirmed_tickers", [])),
+                        claude_data.get("magnitude", 5),
+                        claude_data.get("timeframe", "days"),
+                        claude_data.get("reasoning", ""),
+                        momentum.get("direction", "unknown"),
+                        float(momentum.get("pct_5d", 0.0)),
+                        float(sig_score),
+                        int(alert),
+                    ),
+                )
+                signals_written += 1
+                if alert:
+                    alerts_triggered += 1
+                    icon = "↑" if sentiment == "positive" else "↓"
+                    print(
+                        f"  ⚡ ALERT: {icon} {ticker} ({company['name']}) | "
+                        f"{sentiment} {sent_score:.2f} | sig={sig_score:.3f} | {slug[:40]}"
+                    )
+
+    if not dry_run:
+        db.commit()
+        print(f"[analyze] Done. {signals_written} signals written, {alerts_triggered} alerts triggered.")
+    else:
+        print(f"[analyze] Dry-run complete. {len(final)} articles matched.")
+
+
+# ---------------------------------------------------------------------------
+# CLI — use the Makefile instead of memorizing flags
+# ---------------------------------------------------------------------------
+
+def main() -> None:
+    """
+    Parse the command-line flags and run the analysis pipeline.
+
+    Supported flags: --force, --limit N, --dry-run (alias: --dry), --no-claude.
+    """
+    import argparse
+
+    parser = argparse.ArgumentParser(description="C25 Financial Signal Extractor")
+    parser.add_argument("--force", action="store_true",
+                        help="re-analyze everything")
+    parser.add_argument("--limit", type=int, default=None, metavar="N",
+                        help="limit to N articles")
+    # "--dry" is kept as an alias: it is the spelling this script used to test for.
+    parser.add_argument("--dry-run", "--dry", dest="dry_run", action="store_true",
+                        help="show matches without storing")
+    parser.add_argument("--no-claude", dest="use_claude", action="store_false",
+                        help="skip Claude step (no API cost)")
+    args = parser.parse_args()
+
+    analyze_articles(
+        force=args.force,
+        limit=args.limit,
+        dry_run=args.dry_run,
+        use_claude=args.use_claude,
+    )
+
+
+if __name__ == "__main__":
+    main()