From 12c3bc3e15a1e777482ed913a292fcb235ab224d Mon Sep 17 00:00:00 2001
From: Henrik Jess Nielsen
Date: Sun, 24 May 2026 19:35:43 +0200
Subject: [PATCH] Add data quality normalization for cars: mileage in thousands and leasing prices
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- detect km < 500 on non-new cars (e.g. '130' instead of '130.000')
- detect prices < 5.000 kr (likely monthly leasing rate not sale price)
- flags shown in AI prompt → forces low score (1-3)
- orange warning box in UI distinct from AI warnings
---
 score.py             | 99 ++++++++++++++++++++++++++++++++++++++++++++
 templates/index.html |  5 +++
 2 files changed, 104 insertions(+)

diff --git a/score.py b/score.py
index f772336..d81c429 100644
--- a/score.py
+++ b/score.py
@@ -143,6 +143,94 @@ def extract_structured_fields(raw: str) -> dict:
     return fields
 
 
+def _parse_price_dkk(value):
+    """Return a listing price as a float, or None when it cannot be parsed.
+
+    Numbers pass through unchanged. Strings are read with Danish conventions:
+    '.' groups thousands and ',' marks decimals, so "130.000" is 130000.0
+    rather than 130.0.
+    """
+    if isinstance(value, bool):
+        return None
+    try:
+        if isinstance(value, (int, float)):
+            return float(value)
+        text = str(value).strip()
+        # Drop '.' only when it clearly groups thousands ("1.250.000,50"), so
+        # a plain decimal such as "4500.50" still parses as a decimal.
+        if re.fullmatch(r"\d{1,3}(?:\.\d{3})+(?:,\d+)?", text):
+            text = text.replace(".", "")
+        return float(text.replace(",", "."))
+    except (ValueError, OverflowError):
+        return None
+
+
+def normalize_listing(item: dict, category: str) -> dict:
+    """
+    Detect and flag data quality issues in a used-car listing.
+
+    Checks run only when category is "brugte biler":
+    - Mileage that looks written in thousands (130 instead of 130.000) on a
+      car more than one year old. An unknown year is treated as old.
+    - A price above 0 and below 5.000 kr., which is more likely a monthly
+      leasing rate than a sale price.
+
+    Args:
+        item: Listing dict; item["details"]["raw_text"] and item["price_dkk"]
+            are read, and either may be missing or None.
+        category: Listing category; any value other than "brugte biler"
+            returns item untouched.
+
+    Returns:
+        The same dict, mutated in place. For car listings the key
+        'data_quality_flags' holds the warnings, and is removed when there
+        are none so a re-run never leaves stale flags behind.
+    """
+    if category != "brugte biler":
+        return item
+
+    flags = []
+    # "details" or "raw_text" can be present but None, so .get() defaults
+    # alone would not protect the lookups below.
+    raw = (item.get("details") or {}).get("raw_text") or ""
+    fields = extract_structured_fields(raw)
+
+    # ── Mileage check ─────────────────────────────────────────────────────────
+    km_digits = re.sub(r"[^\d]", "", str(fields.get("km") or ""))
+    if km_digits:
+        km_val = int(km_digits)
+        year_str = str(fields.get("year") or "")
+        # isdecimal() rather than isdigit(): it only accepts what int() parses.
+        car_age = (datetime.now().year - int(year_str)) if year_str.isdecimal() else 99
+        # 1-499 km on a car older than one year was most likely typed in
+        # thousands. 0 km is skipped: scaling it by 1000 would correct nothing.
+        if 0 < km_val < 500 and car_age > 1:
+            corrected_km = f"{km_val * 1000:,}".replace(",", ".")
+            flags.append(
+                f"⚠️ Kilometertallet ser forkert ud: sælger har skrevet '{km_val} km' "
+                f"— sandsynligvis ment {corrected_km} km. "
+                f"Annoncen nedprioriteres pga. vildledende km-angivelse."
+            )
+
+    # ── Price check ───────────────────────────────────────────────────────────
+    price = _parse_price_dkk(item.get("price_dkk", 0))
+    if price is not None and 0 < price < 5000:
+        # Danish thousands separator ('.'), as in the km message above.
+        shown_price = f"{int(price):,}".replace(",", ".")
+        flags.append(
+            f"⚠️ Prisen på {shown_price} kr. er usandsynlig lav for en bil — "
+            f"sandsynligvis en månedlig leasingydelse, ikke salgspris. "
+            f"Annoncen nedprioriteres pga. misvisende prisangivelse."
+        )
+
+    if flags:
+        item["data_quality_flags"] = flags
+    else:
+        # Drop flags left by an earlier run so they cannot outlive the issue.
+        item.pop("data_quality_flags", None)
+    return item
+
+
 def listing_summary(item: dict, idx: int) -> str:
     """Compact text representation of a listing for the AI prompt."""
     raw = item.get("details", {}).get("raw_text", item.get("description", ""))
@@ -165,11 +253,18 @@ def listing_summary(item: dict, idx: int) -> str:
 
     meta_line = " | ".join(meta_parts)
 
+    # Include any data quality flags so AI factors them into scoring
+    quality_issues = item.get("data_quality_flags", [])
+    quality_block = ""
+    if quality_issues:
+        quality_block = "\nDATA KVALITETSPROBLEM (giv lav score 1-3):\n" + "\n".join(quality_issues) + "\n"
+
     return (
         f"--- Annonce #{idx + 1} (ID: {item['id']}) ---\n"
         f"Navn: {item['name']}\n"
         f"Pris: {item['price_dkk']} DKK\n"
         + (f"{meta_line}\n" if meta_line else "")
+        + quality_block
         + f"{text}\n"
     )
 
@@ -301,6 +396,10 @@ def score_listings(
     category = detect_category(items)
     phash = prefs_hash(prefs)
 
+    # ── Normalize data quality issues before any scoring ────────────────────
+    for item in items:
+        normalize_listing(item, category)
+
     # ── Split: persistent score cache → in-file cache → needs AI scoring ────
     to_score, cached = [], []
     now = datetime.now().isoformat(timespec="seconds")
diff --git a/templates/index.html b/templates/index.html
index 9fea81c..77e0986 100644
--- a/templates/index.html
+++ b/templates/index.html
@@ -425,6 +425,10 @@
       const pct = Math.round(score * 10);
       const warn = item.ai_warnings ? `

↑ ${item.ai_warnings}

` : ""; + const qualityFlags = (item.data_quality_flags || []).length > 0 + ? `
+ ${item.data_quality_flags.map(f => `

${f}

`).join("")} +
` : ""; const tag = rankTag(score); const card = document.createElement("div"); @@ -447,6 +451,7 @@

${item.ai_reason || ""}

${warn} + ${qualityFlags}

${Number(item.price_dkk || 0).toLocaleString("da-DK")}kr