2026-05-24 19:14:41 +02:00
|
|
|
#!/usr/bin/env python3
|
|
|
|
|
"""
|
|
|
|
|
AI-powered scoring of DBA listings using Claude.
|
|
|
|
|
|
|
|
|
|
Usage:
|
|
|
|
|
python3 score.py results_car_89a242.json
|
|
|
|
|
python3 score.py results_rtx_3090_623595.json
|
|
|
|
|
python3 score.py results_car_89a242.json --top 10 # show only top N
|
|
|
|
|
python3 score.py results_car_89a242.json --save # write ranked output to ranked.json next to the input file
|
|
|
|
|
python3 score.py results_car_89a242.json --force # ignore cache, re-score everything
|
|
|
|
|
python3 score.py results_car_89a242.json --prefs "Ikke franske biler"
|
|
|
|
|
|
|
|
|
|
Scores are cached in results_*.json — only new/unscored listings call Claude.
|
|
|
|
|
Change --prefs to invalidate cache and re-score with new preferences.
|
|
|
|
|
|
|
|
|
|
Requires:
|
|
|
|
|
ANTHROPIC_API_KEY env var
|
|
|
|
|
pip install anthropic
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
import hashlib, json, os, re, sys, uuid as _uuid
|
|
|
|
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
|
|
|
from datetime import datetime
|
|
|
|
|
from pathlib import Path
|
|
|
|
|
import anthropic
|
|
|
|
|
|
|
|
|
|
MODEL = "claude-haiku-4-5-20251001"  # fast + cheap; swap to sonnet for better ranking

# SECURITY: never commit credentials. The key is read from the environment
# only. The key that used to be hard-coded here must be treated as leaked and
# revoked in the Anthropic console.
API_KEY = os.environ.get("ANTHROPIC_API_KEY", "")

MAX_TOKENS = 2048  # upper bound on tokens generated per scoring request

BASE_DIR = Path(__file__).parent
DATA_DIR = BASE_DIR / "data"
SCORE_CACHE = BASE_DIR / "data" / "score_cache"  # persistent cross-search score cache

# Matches a lower-case, version-4 UUID (used to recognise search ids on the CLI).
UUID_RE = re.compile(r"^[0-9a-f]{8}-[0-9a-f]{4}-4[0-9a-f]{3}-[89ab][0-9a-f]{3}-[0-9a-f]{12}$")

METRICS_FILE = DATA_DIR / "metrics.json"

# Pricing: Claude Haiku 4.5 — https://www.anthropic.com/pricing
# NOTE(review): $0.80 / $4.00 per MTok look like the Claude 3.5 Haiku rates —
# confirm the current rates for MODEL on the pricing page.
_PRICE_INPUT_PER_TOKEN = 0.80 / 1_000_000  # $0.80 per MTok
_PRICE_OUTPUT_PER_TOKEN = 4.00 / 1_000_000  # $4.00 per MTok
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def calc_cost(input_tokens: int, output_tokens: int) -> float:
    """Return the estimated USD cost of a request, rounded to six decimals.

    The cost is the input token count times the per-token input price plus
    the output token count times the per-token output price.
    """
    input_cost = input_tokens * _PRICE_INPUT_PER_TOKEN
    output_cost = output_tokens * _PRICE_OUTPUT_PER_TOKEN
    return round(input_cost + output_cost, 6)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def update_metrics(search_id: str, input_tokens: int, output_tokens: int, listings_scored: int) -> None:
    """Save per-search metrics and update the global ``metrics.json``.

    Args:
        search_id: Name of the search directory below ``DATA_DIR``. When it is
            empty or does not name an existing directory, only the global
            totals are updated.
        input_tokens: Prompt tokens consumed by this scoring run.
        output_tokens: Completion tokens produced by this scoring run.
        listings_scored: Number of listings that received a new score.
    """
    cost = calc_cost(input_tokens, output_tokens)
    now = datetime.now().isoformat(timespec="seconds")

    # Per-search metrics.
    # An empty/"."/".." id would resolve to DATA_DIR (or above it) and make the
    # per-search file land on top of the global metrics file, so skip those.
    search_dir = DATA_DIR / search_id
    if search_id not in ("", ".", "..") and search_dir.is_dir():
        search_metrics = {
            "search_id": search_id,
            "scored_at": now,
            "model": MODEL,
            "listings_scored": listings_scored,
            "input_tokens": input_tokens,
            "output_tokens": output_tokens,
            "cost_usd": cost,
        }
        (search_dir / "metrics.json").write_text(json.dumps(search_metrics, indent=2))

    # Global metrics: start from the stored totals when they are readable.
    global_metrics: dict = {}
    try:
        loaded = json.loads(METRICS_FILE.read_text())
    except FileNotFoundError:
        loaded = {}
    except (OSError, ValueError) as exc:
        # Best effort: a corrupt file must not abort scoring, but say so
        # instead of silently resetting the totals.
        print(f" ⚠ Kunne ikke læse {METRICS_FILE}: {exc} — totaler nulstilles", file=sys.stderr)
        loaded = {}
    if isinstance(loaded, dict):
        global_metrics = loaded

    global_metrics["total_searches"] = global_metrics.get("total_searches", 0) + 1
    global_metrics["total_listings_scored"] = global_metrics.get("total_listings_scored", 0) + listings_scored
    global_metrics["total_input_tokens"] = global_metrics.get("total_input_tokens", 0) + input_tokens
    global_metrics["total_output_tokens"] = global_metrics.get("total_output_tokens", 0) + output_tokens
    global_metrics["total_cost_usd"] = round(global_metrics.get("total_cost_usd", 0.0) + cost, 6)
    global_metrics["last_updated"] = now

    # The data directory may not exist yet when scoring a file outside it.
    METRICS_FILE.parent.mkdir(parents=True, exist_ok=True)
    METRICS_FILE.write_text(json.dumps(global_metrics, indent=2))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def prefs_hash(prefs: str) -> str:
    """Return an 8-character MD5 prefix of the stripped preference text.

    A blank (empty or whitespace-only) preference string maps to ``"none"``.
    """
    cleaned = prefs.strip()
    if not cleaned:
        return "none"
    digest = hashlib.md5(cleaned.encode()).hexdigest()
    return digest[:8]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _score_cache_key(item_id: str, prefs: str, category: str) -> Path:
    """Build the score-cache file path for one listing in one scoring context.

    The file name combines the listing id, the preference hash and a
    6-character MD5 prefix of the category, and lives below ``SCORE_CACHE``.
    """
    prefs_part = prefs_hash(prefs)
    category_part = hashlib.md5(category.encode()).hexdigest()[:6]
    filename = f"{item_id}_{prefs_part}_{category_part}.json"
    return SCORE_CACHE / filename
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def load_score_cache(item_id: str, prefs: str, category: str) -> dict | None:
    """Return the cached score entry for a listing, or ``None`` if unusable.

    Args:
        item_id: Listing id, as a string.
        prefs: The buyer's preference text (part of the cache key).
        category: Detected category (part of the cache key).

    Returns:
        The cached dict when the cache file exists, parses as a JSON object
        and carries a non-null ``"score"``; otherwise ``None``, so the caller
        scores the listing again.
    """
    cache_path = _score_cache_key(item_id, prefs, category)
    try:
        payload = json.loads(cache_path.read_text())
    except (OSError, ValueError):
        # Missing, unreadable or corrupt cache file (JSON and decoding errors
        # are ValueError subclasses): treat as a cache miss.
        return None
    # An entry without a usable score would otherwise pin the listing as
    # "cached but unscored" forever.
    if not isinstance(payload, dict) or payload.get("score") is None:
        return None
    return payload
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def save_score_cache(item_id: str, prefs: str, category: str, score_data: dict) -> None:
    """Write a score entry to the persistent cache for this item/prefs/category.

    Creates the cache directory when needed and stores ``score_data`` as JSON
    with non-ASCII characters kept as-is.
    """
    SCORE_CACHE.mkdir(parents=True, exist_ok=True)
    target = _score_cache_key(item_id, prefs, category)
    serialized = json.dumps(score_data, ensure_ascii=False)
    target.write_text(serialized)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# ── Helpers ───────────────────────────────────────────────────────────────────
|
|
|
|
|
|
|
|
|
|
def trim_text(raw: str, max_chars: int = 800) -> str:
    """Drop the text before the description marker and cap the length.

    The markers are tried in priority order; the text is cut at the first
    marker that occurs anywhere in ``raw``. Text longer than ``max_chars`` is
    truncated and suffixed with an ellipsis. The result is whitespace-stripped.
    """
    markers = ("Varebeskrivelse", "Beskrivelse", "Specifikationer")
    # Lazily probe each marker; stop at the first one present in the text.
    start = next((pos for pos in map(raw.find, markers) if pos != -1), None)
    if start is not None:
        raw = raw[start:]

    clipped = raw if len(raw) <= max_chars else raw[:max_chars] + "…"
    return clipped.strip()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def extract_structured_fields(raw: str) -> dict:
    """Extract labelled fields (year, km, condition, ...) from listing text.

    Each field has one case-insensitive regex; the first capture group of the
    first match is stored, stripped. Fields without a match are left out.
    """
    field_patterns = (
        ("year", r"(?:Modelår|Årstal|Årgang)[^\d]*(\d{4})"),
        ("km", r"Kilometertal\s+([\d\.,]+ km)"),
        ("condition", r"Stand\s*:\s*([^\n|]{3,60})"),
        ("gear", r"Geartype\s+(\S+)"),
        ("fuel", r"Drivmiddel\s+(\S+)"),
        ("owners", r"Antal ejere\s+(\d+)"),
    )
    return {
        name: hit.group(1).strip()
        for name, regex in field_patterns
        if (hit := re.search(regex, raw, re.IGNORECASE))
    }
|
|
|
|
|
|
|
|
|
|
|
2026-05-24 19:35:43 +02:00
|
|
|
def normalize_listing(item: dict, category: str) -> dict:
    """Flag data-quality problems on a car listing.

    Only listings in the "brugte biler" category are inspected; any other
    category returns ``item`` untouched. Two checks are made:

    - Mileage below 500 km on a car older than one year (likely written in
      thousands, e.g. 130 instead of 130.000). An unknown model year counts
      as old.
    - A price above 0 and below 5000 (likely a monthly leasing rate).

    When at least one check fires, the messages are stored in-place under
    ``item["data_quality_flags"]``. Returns ``item``.
    """
    if category != "brugte biler":
        return item

    issues = []
    raw_text = item.get("details", {}).get("raw_text", "")
    parsed = extract_structured_fields(raw_text)

    # ── Mileage check ────────────────────────────────────────────────────────
    km_digits = re.sub(r"[^\d]", "", parsed.get("km", ""))
    if km_digits:
        km_val = int(km_digits)
        year_str = parsed.get("year", "")
        # Without a parseable year, assume the car is old (age 99).
        car_age = (datetime.now().year - int(year_str)) if year_str.isdigit() else 99
        if km_val < 500 and car_age > 1:
            corrected_km = km_val * 1000
            issues.append(
                f"⚠️ Kilometertallet ser forkert ud: sælger har skrevet '{km_val} km' "
                f"— sandsynligvis ment {corrected_km:,} km. "
                "Annoncen nedprioriteres pga. vildledende km-angivelse."
            )

    # ── Price check ──────────────────────────────────────────────────────────
    # NOTE(review): a comma is read as the decimal separator, so a string such
    # as "12.500" would parse as 12.5 — confirm the type/format of price_dkk.
    try:
        price = float(str(item.get("price_dkk", 0)).replace(",", "."))
    except (ValueError, TypeError):
        price = None
    if price is not None and 0 < price < 5000:
        issues.append(
            f"⚠️ Prisen på {int(price):,} kr. er usandsynlig lav for en bil — "
            "sandsynligvis en månedlig leasingydelse, ikke salgspris. "
            "Annoncen nedprioriteres pga. misvisende prisangivelse."
        )

    if issues:
        item["data_quality_flags"] = issues
    return item
|
|
|
|
|
|
|
|
|
|
|
2026-05-24 19:14:41 +02:00
|
|
|
def listing_summary(item: dict, idx: int) -> str:
    """Render one listing as a compact text block for the AI prompt.

    The block holds a header with the 1-based position and id, the name and
    price, an optional line of structured fields, an optional data-quality
    warning section, and the trimmed listing text.
    """
    raw = item.get("details", {}).get("raw_text", item.get("description", ""))
    fields = extract_structured_fields(raw)
    body = trim_text(raw)

    # (label shown in the prompt, key in the extracted fields), in display order.
    labelled = (
        ("Årgang", "year"),
        ("Km", "km"),
        ("Brændstof", "fuel"),
        ("Gear", "gear"),
        ("Ejere", "owners"),
        ("Stand", "condition"),
    )
    meta_line = " | ".join(
        f"{label}: {fields[key]}" for label, key in labelled if fields.get(key)
    )

    # Include any data quality flags so the AI factors them into scoring.
    flags = item.get("data_quality_flags", [])
    if flags:
        quality_block = "\nDATA KVALITETSPROBLEM (giv lav score 1-3):\n" + "\n".join(flags) + "\n"
    else:
        quality_block = ""

    pieces = [
        f"--- Annonce #{idx + 1} (ID: {item['id']}) ---\n",
        f"Navn: {item['name']}\n",
        f"Pris: {item['price_dkk']} DKK\n",
    ]
    if meta_line:
        pieces.append(f"{meta_line}\n")
    pieces.append(quality_block)
    pieces.append(f"{body}\n")
    return "".join(pieces)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def detect_category(items: list[dict]) -> str:
    """Pick the knowledge-context category for a batch of listings.

    Only the first listing is inspected: a "/mobility/" URL means used cars;
    otherwise the "Du er her" breadcrumb in its raw text is matched against
    the keyword table. Falls back to "brugte varer".
    """
    fallback = "brugte varer"
    if not items:
        return fallback

    first = items[0]
    if "/mobility/" in first.get("url", ""):
        return "brugte biler"

    # Extract the breadcrumb from raw_text to detect the subcategory.
    raw_text = first.get("details", {}).get("raw_text", "")
    found = re.search(r"Du er her\s+(.+?)(?:\n|Billedgalleri)", raw_text)
    breadcrumb = found.group(1).lower() if found else ""

    # First table entry with a keyword contained in the breadcrumb wins.
    return next(
        (
            context_key
            for keywords, context_key in _CATEGORY_MAP
            if any(word in breadcrumb for word in keywords)
        ),
        fallback,
    )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Category-specific guidance inserted into the scoring prompt.
# Keys are the category names returned by detect_category(); build_prompt()
# falls back to the "brugte varer" entry for any category not listed here.
# Each value is one multi-line string (adjacent literals are concatenated).
KNOWLEDGE_CONTEXT = {
    # Used cars
    "brugte biler": (
        "- Kendte reliabilitetsproblemer (fx Peugeot 1.2 PureTech timing-kæde, VW DSG-gearkasse, BMW N47 dieselmotor)\n"
        "- Km-stand og alder sat i forhold til markedsværdi for den specifikke model og variant\n"
        "- Kendte stærke og svage modeller (fx Toyota/Mazda høj reliabilitet, Renault/Citroën/Fiat lavere)\n"
        "- Typiske brugtpriser for modellen baseret på år og km"
    ),
    # Electronics
    "elektronik": (
        "- Produktgenerationens relative ydelse og markedsværdi (fx RTX 4070 > RTX 3080, iPhone 15 > 13)\n"
        "- Kendte problemer: mining-slid på GPU'er, batterinedgang på telefoner/laptops, kondensatorfejl\n"
        "- Hvad er en rimelig brugtpris for dette produkt i denne stand?\n"
        "- Stand er afgørende — 'Som ny' vs 'Brugt - med synlige brugsspor' bør veje tungt"
    ),
    # Sports equipment
    "sport": (
        "- Kendte mærker og deres relative kvalitet (fx Titleist/Callaway/TaylorMade til golf, Shimano-grupper til cykler)\n"
        "- Produktets alder og teknologisk forældelse (fx ældre golfkøller med stålskaft vs moderne grafit)\n"
        "- Stand er meget afgørende for sportsudstyr — slid påvirker ydeevne direkte\n"
        "- Hvad er en rimelig brugtpris for dette udstyr i denne stand og fra dette mærke?"
    ),
    # Furniture
    "møbler": (
        "- Kendte mærker og materialer (fx massivt træ > spånplade, dansk design har høj gensalgsværdi)\n"
        "- Stand og alder — patina kan være positivt for vintage, negativt for moderne møbler\n"
        "- Originale vs efterligninger (fx IKEA POÄNG vs original Fritz Hansen)\n"
        "- Hvad er en rimelig brugtpris baseret på stand, alder og mærke?"
    ),
    # Generic fallback for everything else
    "brugte varer": (
        "- Produktets markedsværdi brugt i denne stand\n"
        "- Kendte problemer eller svagheder ved denne model/variant\n"
        "- Stand er afgørende — 'Som ny' vs 'Brugt - med synlige brugsspor' bør veje tungt\n"
        "- Er varen komplet? Mangler tilbehør eller dokumentation?"
    ),
}
|
|
|
|
|
|
|
|
|
|
# Breadcrumb keywords → knowledge context key.
# Each entry is (keywords, context_key). detect_category() returns the
# context_key of the first entry that has a keyword occurring as a substring
# of the lower-cased breadcrumb, so the order of the entries sets priority.
_CATEGORY_MAP = [
    (["elektronik", "computer", "grafikkort", "telefon", "mobil", "tv", "hifi", "kamera"], "elektronik"),
    (["golf", "sport", "cykel", "fitness", "jagt", "fiskeri", "friluftsliv"], "sport"),
    (["møbel", "stol", "bord", "sofa", "seng", "reol", "lampe", "bolig", "indretning"], "møbler"),
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def build_prompt(items: list[dict], category: str, criteria: str, prefs: str = "") -> str:
    """Assemble the full scoring prompt for one batch of listings.

    The prompt combines the category-specific knowledge hints, an optional
    block with the buyer's own preferences, the scoring criteria, the fixed
    score scale and output-format instructions, and one summary per listing.
    """
    summaries = "\n".join(listing_summary(listing, pos) for pos, listing in enumerate(items))

    # The preference block is only present when the buyer supplied text.
    wishes = prefs.strip()
    if wishes:
        prefs_block = (
            "\n"
            "KØBERENS EGNE PRÆFERENCER (vigtig — vej disse tungt i din scoring):\n"
            f"{wishes}\n"
            "Annoncer der strider mod disse præferencer skal have markant lavere score.\n"
        )
    else:
        prefs_block = ""

    # Unknown categories use the generic hints.
    if category in KNOWLEDGE_CONTEXT:
        knowledge = KNOWLEDGE_CONTEXT[category]
    else:
        knowledge = KNOWLEDGE_CONTEXT["brugte varer"]

    # NOTE(review): the template text (including the unindented JSON example)
    # is reproduced exactly as it appears in the reviewed source — confirm the
    # example's indentation against the working copy.
    return f"""Du er en ekspert køberrådgiver for {category} på DBA.

Brug BÅDE annonceteksten OG din egen viden om produkterne:
{knowledge}
{prefs_block}
Scorer HVER annonce UAFHÆNGIGT på en absolut skala 1-10 baseret på disse kriterier:
{criteria}

ABSOLUT SCORESKALA (brug din viden om markedet — scoren må IKKE afhænge af de andre annoncer i denne batch):
- 9-10: Fremragende køb — markant under markedspris, pålidelig model, god stand/historik
- 7-8: Godt køb — fair pris, solid model, få eller ingen bekymringer
- 5-6: Middel — markedspris, eller visse risici/ukendte faktorer
- 3-4: Under middel — overpriset eller kendte modelproblem
- 1-2: Undgå — alvorlige røde flag, stor risiko eller klart overpriset

WARNINGS — list KUN konkrete, faktuelle røde flag der er direkte støttet af annonceteksten eller veldokumenterede modelproblemer:
- Nævn KUN ting der er bekræftet i annonceteksten (fx "sælger nævner støj", "ingen billeder", "kun afhentning")
- Eller veldokumenterede modelspecifikke problemer (fx "Turbo-variant har historisk køleproblemer")
- Skriv IKKE generiske advarsler om mining, stand etc. medmindre det eksplicit nævnes i annoncen
- Hvis ingen konkrete røde flag: tom streng ""

Returner KUN et JSON-array — ingen forklaringer udenfor JSON:
[
{{
"id": "annonce-ID",
"score": 8.5,
"reason": "Begrundelse på dansk (maks 2 sætninger). Nævn gerne konkret viden om modellen.",
"warnings": "Kun konkrete røde flag fra annonceteksten eller kendte modelproblemer. Tom streng hvis ingen."
}},
...
]

Alle {len(items)} annoncer skal med. Score er 1-10 (10 = suverænt køb).

ANNONCER:
{summaries}"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# ── Scoring ───────────────────────────────────────────────────────────────────
|
|
|
|
|
|
|
|
|
|
def _parse_batch_scores(text: str) -> dict[str, dict]:
    """Turn a model reply into ``{listing id: score entry}``.

    Entries that are not objects, lack an ``"id"`` or have no usable numeric
    ``"score"`` are dropped. A numeric string score is converted to ``float``.

    Raises:
        ValueError: if the reply contains no JSON array or the array does not
            parse (``json.JSONDecodeError`` is a ``ValueError``).
    """
    json_m = re.search(r"\[.*\]", text, re.DOTALL)
    if not json_m:
        raise ValueError("no JSON array found in model reply")

    parsed: dict[str, dict] = {}
    for entry in json.loads(json_m.group(0)):
        if not isinstance(entry, dict) or "id" not in entry:
            continue
        score = entry.get("score")
        if isinstance(score, str):
            try:
                score = float(score)
            except ValueError:
                continue
        # bool is an int subclass; NaN (score != score) cannot be sorted.
        if isinstance(score, bool) or not isinstance(score, (int, float)) or score != score:
            continue
        entry["score"] = score
        parsed[str(entry["id"])] = entry
    return parsed


def score_listings(
    items: list[dict],
    criteria: str,
    prefs: str = "",
    batch_size: int = 10,
    force: bool = False,
    source_file: Path | None = None,
) -> list[dict]:
    """Score listings with AI — skips items that are already cached. Runs batches in parallel.

    Args:
        items: Listing dicts; each needs an ``"id"``. Scores are written onto
            these dicts in place (``ai_score``, ``ai_rank``, ``ai_reason``,
            ``ai_warnings``, ``ai_prefs_hash``, ``ai_scored_at``).
        criteria: Scoring criteria text inserted into the prompt.
        prefs: The buyer's free-text preferences; part of the cache key.
        batch_size: Maximum number of listings per model request.
        force: When true, ignore both caches and re-score every listing.
        source_file: When given and new scores were requested, all listings
            are written back to this file and usage metrics are recorded.

    Returns:
        The listings that hold a score for the current preferences, sorted by
        score (best first) with ``ai_rank`` renumbered from 1. Empty when
        ``items`` is empty.

    Raises:
        Errors from the API client propagate. A reply that cannot be parsed
        only costs that batch its scores; it does not abort the run.
    """
    client = anthropic.Anthropic(api_key=os.environ.get("ANTHROPIC_API_KEY", API_KEY))
    category = detect_category(items)
    phash = prefs_hash(prefs)

    # ── Normalize data quality issues before any scoring ────────────────────
    for item in items:
        normalize_listing(item, category)

    # ── Split: persistent score cache → in-file cache → needs AI scoring ────
    to_score, cached = [], []
    now = datetime.now().isoformat(timespec="seconds")
    for item in items:
        if not force:
            # 1. Check persistent cross-search score cache.
            #    An entry without a score is treated as a miss.
            sc = load_score_cache(str(item["id"]), prefs, category)
            if sc and sc.get("score") is not None:
                item["ai_score"] = sc["score"]
                item["ai_rank"] = sc.get("rank")
                item["ai_reason"] = sc.get("reason", "")
                item["ai_warnings"] = sc.get("warnings", "")
                item["ai_prefs_hash"] = phash
                item["ai_scored_at"] = sc.get("scored_at", now)
                cached.append(item)
                continue
            # 2. In-file cache (same search UUID, already scored)
            if item.get("ai_score") is not None and item.get("ai_prefs_hash") == phash:
                cached.append(item)
                continue
        to_score.append(item)

    if cached:
        print(f" ♻️ {len(cached)} annoncer genbruger cache", file=sys.stderr)
    if to_score:
        print(f" 🤖 {len(to_score)} annoncer sendes til AI…", file=sys.stderr)
    elif not cached:
        print(" Ingen annoncer at score.", file=sys.stderr)
        return []

    # ── Score only uncached items — parallel batches ──────────────────────────
    all_scores: dict[str, dict] = {}
    if to_score:
        batches = [to_score[i:i + batch_size] for i in range(0, len(to_score), batch_size)]
        print(f" ({len(batches)} parallelle batches à max {batch_size})", file=sys.stderr)

        def score_batch(b_idx: int, batch: list[dict]) -> tuple[dict[str, dict], int, int]:
            """Score one batch; returns (scores by id, input tokens, output tokens)."""
            prompt = build_prompt(batch, category, criteria, prefs)
            response = client.messages.create(
                model=MODEL,
                max_tokens=MAX_TOKENS,
                temperature=0,
                messages=[{"role": "user", "content": prompt}],
            )
            inp = response.usage.input_tokens
            out = response.usage.output_tokens
            text = response.content[0].text.strip() if response.content else ""
            try:
                result = _parse_batch_scores(text)
            except ValueError:
                # Tokens were still spent, so report them even without scores.
                print(f" ⚠ Kunne ikke parse svar fra batch {b_idx + 1}:\n{text[:300]}", file=sys.stderr)
                return {}, inp, out
            print(f" ✓ Batch {b_idx + 1}/{len(batches)} færdig ({len(result)} scores, {inp}+{out} tok)", file=sys.stderr)
            return result, inp, out

        total_input = total_output = 0
        with ThreadPoolExecutor(max_workers=min(len(batches), 8)) as pool:
            futures = {pool.submit(score_batch, i, b): i for i, b in enumerate(batches)}
            for future in as_completed(futures):
                scores, inp, out = future.result()
                all_scores.update(scores)
                total_input += inp
                total_output += out

        # Write scores + cache metadata back onto items
        now = datetime.now().isoformat(timespec="seconds")
        for item in to_score:
            s = all_scores.get(str(item["id"]))
            if not s:
                # No fresh score: leave the item's old hash in place so a
                # stale score is never mistaken for one made with these prefs.
                continue
            item["ai_score"] = s["score"]
            item["ai_rank"] = s.get("rank")
            item["ai_reason"] = s.get("reason", "")
            item["ai_warnings"] = s.get("warnings", "")
            # Persist to cross-search score cache so same item never re-scored
            save_score_cache(str(item["id"]), prefs, category, {
                "score": s["score"],
                "rank": s.get("rank"),
                "reason": s.get("reason", ""),
                "warnings": s.get("warnings", ""),
                "scored_at": now,
            })
            item["ai_prefs_hash"] = phash
            item["ai_scored_at"] = now

        # Auto-save scores back into source file so cache persists next run
        if source_file:
            # `items` is exactly cached + to_score; iterating it keeps the
            # file in its original listing order (de-duplicated by id).
            all_items_map = {str(i["id"]): i for i in items}
            source_file.write_text(json.dumps(list(all_items_map.values()), ensure_ascii=False, indent=2))
            scored_count = sum(1 for i in to_score if all_scores.get(str(i["id"])))
            print(f" 💾 {scored_count} nye scores gemt → {source_file}", file=sys.stderr)
            cost = calc_cost(total_input, total_output)
            print(f" 💰 {total_input}+{total_output} tokens → ${cost:.4f}", file=sys.stderr)
            update_metrics(source_file.parent.name, total_input, total_output, scored_count)

    # ── Combine, re-sort, re-rank ─────────────────────────────────────────────
    # Only scores made for the current preferences are ranked; an item whose
    # batch failed may still carry a score from an earlier, different run.
    combined = [
        i for i in (cached + to_score)
        if i.get("ai_score") is not None and i.get("ai_prefs_hash") == phash
    ]
    combined.sort(key=lambda x: x["ai_score"], reverse=True)
    for rank, item in enumerate(combined, 1):
        item["ai_rank"] = rank

    return combined
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# ── Output ────────────────────────────────────────────────────────────────────
|
|
|
|
|
|
|
|
|
|
def print_results(ranked: list[dict], top: int | None = None) -> None:
|
|
|
|
|
show = ranked[:top] if top else ranked
|
|
|
|
|
print(f"\n{'═' * 60}")
|
|
|
|
|
print(f" TOP {len(show)} ANNONCER (af {len(ranked)} scoret)")
|
|
|
|
|
print(f"{'═' * 60}\n")
|
|
|
|
|
for item in show:
|
|
|
|
|
score = item.get("ai_score", "?")
|
|
|
|
|
bar = "█" * int(score) + "░" * (10 - int(score)) if isinstance(score, (int, float)) else ""
|
|
|
|
|
print(
|
|
|
|
|
f"#{item['ai_rank']:>2} [{score:4.1f}] {bar} {item['name']}\n"
|
|
|
|
|
f" Pris: {item['price_dkk']} DKK | {item['url']}\n"
|
|
|
|
|
f" ✅ {item.get('ai_reason','')}\n"
|
|
|
|
|
)
|
|
|
|
|
if item.get("ai_warnings"):
|
|
|
|
|
print(f" ⚠️ {item['ai_warnings']}\n")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# ── Main ──────────────────────────────────────────────────────────────────────
|
|
|
|
|
|
|
|
|
|
# Scoring criteria text inserted into the prompt, keyed by domain.
# main() selects "mobility" when the first listing's URL contains
# "/mobility/" and "recommerce" otherwise.
CRITERIA = {
    # Vehicles
    "mobility": (
        "- Pris ift. markedsværdi for den specifikke model/år/km (brug din viden)\n"
        "- Modelreliabilitet og kendte svagheder (timing-kæde, gearkasse, rust etc.)\n"
        "- Km-stand og alder (Årgang og Kilometertal er angivet hvis tilgængeligt)\n"
        "- Privat sælger foretrukket (forhandler = højere pris, ingen reklamationsret ved brugt)\n"
        "- Servicehistorik, nysynet, tandrem nævnt?\n"
        "- Udstyrsniveau og antal ejere"
    ),
    # All other second-hand goods
    "recommerce": (
        "- Pris ift. aktuel markedsværdi for produktet (brug din viden om typiske priser)\n"
        "- Produktgenerationens relative ydelse og værdi (fx GPU-generationer, produktionsår)\n"
        "- Stand (DBA's standbeskrivelse er angivet: 'Som ny', 'Brugt - men i god stand', 'Brugt - med synlige brugsspor')\n"
        "- Kendte problemer med denne model/variant\n"
        "- Er varen komplet? Mangler tilbehør?\n"
        "- Privat sælger foretrukket"
    ),
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def main() -> None:
    """Command-line entry point: load listings, score them, print the ranking.

    Arguments (``--top N`` and ``--top=N`` forms are both accepted):
        source      Path to a listings JSON file, or a search UUID under the
                    data directory. Defaults to the most recently modified
                    search that has a ``listings.json``.
        --top N     Show only the best N listings.
        --prefs S   Buyer preferences; disables the interactive prompt.
        --force     Ignore cached scores on the first round.
        --save      Also write the ranking to ``ranked.json`` next to the input.

    Unknown options are ignored. A missing or non-integer option value is
    reported as a usage error instead of being dropped or raising.
    When stdin is a terminal and no ``--prefs`` was given, up to three rounds
    are run, asking for additional preferences between rounds.
    """
    import argparse  # local import: only the CLI entry point needs it

    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
        allow_abbrev=False,
    )
    parser.add_argument("source", nargs="*")
    parser.add_argument("--top", type=int, default=None)
    parser.add_argument("--prefs", default="")
    parser.add_argument("--force", action="store_true")
    parser.add_argument("--save", action="store_true")
    # parse_known_args keeps the old behaviour of ignoring unknown options.
    args, _unknown = parser.parse_known_args()

    if "ANTHROPIC_API_KEY" not in os.environ and not API_KEY:
        print("Fejl: ANTHROPIC_API_KEY er ikke sat.", file=sys.stderr)
        sys.exit(1)

    top_n = args.top
    prefs = args.prefs
    force = args.force
    save = args.save
    positional = args.source

    if not positional:
        # Auto-detect: most recent data/<uuid>/listings.json
        searches = sorted(
            (d for d in DATA_DIR.iterdir() if (d / "listings.json").exists()),
            key=lambda d: d.stat().st_mtime, reverse=True
        ) if DATA_DIR.exists() else []
        if not searches:
            print("Ingen søgninger fundet. Kør fetch_dba.py <url> først.", file=sys.stderr)
            sys.exit(1)
        search_dir = searches[0]
        results_file = search_dir / "listings.json"
        print(f"Bruger nyeste søgning: {search_dir.name}", file=sys.stderr)
    else:
        # Only the first positional argument is used.
        ref = positional[0]
        if UUID_RE.match(ref):
            results_file = DATA_DIR / ref / "listings.json"
        else:
            results_file = Path(ref)

    if not results_file.exists():
        print(f"Fejl: {results_file} ikke fundet.", file=sys.stderr)
        sys.exit(1)

    try:
        items = json.loads(results_file.read_text())
    except (OSError, ValueError) as exc:
        print(f"Fejl: kunne ikke læse {results_file}: {exc}", file=sys.stderr)
        sys.exit(1)
    if not isinstance(items, list):
        print(f"Fejl: {results_file} indeholder ikke en liste af annoncer.", file=sys.stderr)
        sys.exit(1)
    print(f"Loaded {len(items)} annoncer fra {results_file}", file=sys.stderr)

    domain = "mobility" if items and "/mobility/" in items[0].get("url", "") else "recommerce"
    criteria = CRITERIA[domain]

    # ── Interactive refinement loop (up to 3 attempts) ───────────────────────
    MAX_ROUNDS = 3
    interactive = sys.stdin.isatty() and not prefs

    for attempt in range(MAX_ROUNDS):
        if prefs:
            print(f"\n🎯 Præferencer: {prefs}", file=sys.stderr)

        ranked = score_listings(items, criteria, prefs, force=force, source_file=results_file)
        # After first run, don't force re-score on subsequent interactive rounds
        force = False
        print_results(ranked, top_n)

        if save:
            out = results_file.parent / "ranked.json"
            out.write_text(json.dumps(ranked, ensure_ascii=False, indent=2))
            print(f"\n💾 Ranked output gemt → {out}", file=sys.stderr)

        if not interactive or attempt >= MAX_ROUNDS - 1:
            break

        remaining = MAX_ROUNDS - attempt - 1
        print(f"\n{'─' * 60}")
        print(f" Tilføj præferencer for at re-score ({remaining} forsøg tilbage)")
        print(" Eks: 'Ikke franske biler' / 'Helst manuel gear' / 'Max 50 km fra Aarhus'")
        print(" (Tryk Enter for at afslutte)")
        print(f"{'─' * 60}")

        try:
            new_prefs = input(" > ").strip()
        except (EOFError, KeyboardInterrupt):
            break

        if not new_prefs:
            break

        # New preferences are appended to the ones already in effect.
        prefs = f"{prefs}\n{new_prefs}".strip() if prefs else new_prefs
        # Force re-score when prefs change (cache hash will differ anyway, but be explicit)
        force = True
        print("\n🔄 Re-scorer med dine præferencer…\n", file=sys.stderr)


if __name__ == "__main__":
    main()
|