Add data quality normalization for cars: mileage in thousands and leasing prices

- detect km < 500 on non-new cars (e.g. '130' instead of '130.000')
- detect prices < 5.000 kr (likely a monthly leasing rate, not the sale price)
- flags are shown in the AI prompt → forces a low score (1-3)
- orange warning box in the UI, distinct from AI warnings
2026-05-24 19:35:43 +02:00
parent 9299506523
commit 12c3bc3e15
2 changed files with 67 additions and 0 deletions
--- a/score.py
+++ b/score.py
@@ -143,6 +143,57 @@ def extract_structured_fields(raw: str) -> dict:
    return fields


+def normalize_listing(item: dict, category: str) -> dict:
+    """
+    Detect and flag data quality issues on a used-car listing.
+
+    Two checks run, and only when ``category`` is "brugte biler":
+    - Mileage that looks like it was entered in thousands (130 instead of
+      130.000) on a car that is not brand new.
+    - A price below 5.000 kr, which is more likely a monthly leasing rate
+      than a full sale price.
+
+    Args:
+        item: Listing dict. Reads ``details.raw_text`` and ``price_dkk``.
+        category: Listing category; any value other than "brugte biler"
+            returns ``item`` untouched.
+
+    Returns:
+        The same ``item`` object, mutated in place. ``data_quality_flags`` is
+        set to the list of warnings, or removed when nothing is flagged so a
+        stale warning from an earlier run does not linger on the listing.
+    """
+    if category != "brugte biler":
+        return item
+
+    flags = []
+    # 'details' / 'raw_text' may be missing or explicitly None.
+    raw = (item.get("details") or {}).get("raw_text") or ""
+    fields = extract_structured_fields(raw)
+
+    # ── Mileage check ─────────────────────────────────────────────────────────
+    km_digits = re.sub(r"\D", "", str(fields.get("km") or ""))
+    if km_digits:
+        km_val = int(km_digits)
+        year_str = str(fields.get("year") or "").strip()
+        # An unknown year is treated as an old car (age 99) so the check applies.
+        if year_str.isdecimal():
+            car_age = datetime.now().year - int(year_str)
+        else:
+            car_age = 99
+        # Likely written in thousands: 130 km instead of 130.000 km.
+        # 0 km is excluded because scaling it by 1000 suggests no correction.
+        if 0 < km_val < 500 and car_age > 1:
+            corrected_km = km_val * 1000
+            flags.append(
+                f"⚠️ Kilometertallet ser forkert ud: sælger har skrevet '{km_val} km' "
+                f"— sandsynligvis ment {_format_dk_int(corrected_km)} km. "
+                "Annoncen nedprioriteres pga. vildledende km-angivelse."
+            )
+
+    # ── Price check ───────────────────────────────────────────────────────────
+    price = _parse_price_dkk(item.get("price_dkk", 0))
+    if price is not None and 0 < price < 5000:
+        flags.append(
+            f"⚠️ Prisen på {_format_dk_int(int(price))} kr. er usandsynlig lav for en bil — "
+            "sandsynligvis en månedlig leasingydelse, ikke salgspris. "
+            "Annoncen nedprioriteres pga. misvisende prisangivelse."
+        )
+
+    if flags:
+        item["data_quality_flags"] = flags
+    else:
+        item.pop("data_quality_flags", None)
+    return item
+
+
+def _parse_price_dkk(value):
+    """Return ``value`` as a float price, or None when it is not numeric.
+
+    Strings using Danish grouping ("45.000", "1.250.000,50") are read with "."
+    as the thousands separator; in any other string "," is the decimal mark.
+    """
+    if isinstance(value, (int, float)) and not isinstance(value, bool):
+        try:
+            return float(value)
+        except OverflowError:
+            return None
+    text = str(value).strip()
+    # Only strip dots when every dot is followed by exactly three digits;
+    # otherwise "45.000" would be read as 45 kr. and falsely flagged.
+    if re.fullmatch(r"\d{1,3}(?:\.\d{3})+(?:,\d+)?", text):
+        text = text.replace(".", "")
+    try:
+        return float(text.replace(",", "."))
+    except ValueError:
+        return None
+
+
+def _format_dk_int(value: int) -> str:
+    """Format an integer with "." as the thousands separator (Danish style)."""
+    return f"{value:,}".replace(",", ".")
+
+
 def listing_summary(item: dict, idx: int) -> str:
    """Compact text representation of a listing for the AI prompt."""
    raw    = item.get("details", {}).get("raw_text", item.get("description", ""))
@@ -165,11 +216,18 @@ def listing_summary(item: dict, idx: int) -> str:

    meta_line = " | ".join(meta_parts)

+    # Include any data quality flags so AI factors them into scoring
+    quality_issues = item.get("data_quality_flags", [])
+    quality_block = ""
+    if quality_issues:
+        quality_block = "\nDATA KVALITETSPROBLEM (giv lav score 1-3):\n" + "\n".join(quality_issues) + "\n"
+
    return (
        f"--- Annonce #{idx + 1} (ID: {item['id']}) ---\n"
        f"Navn: {item['name']}\n"
        f"Pris: {item['price_dkk']} DKK\n"
        + (f"{meta_line}\n" if meta_line else "")
+        + quality_block
        + f"{text}\n"
    )

@@ -301,6 +359,10 @@ def score_listings(
    category = detect_category(items)
    phash = prefs_hash(prefs)

+    # ── Normalize data quality issues before any scoring ────────────────────
+    for item in items:
+        normalize_listing(item, category)
+
    # ── Split: persistent score cache → in-file cache → needs AI scoring ────
    to_score, cached = [], []
    now = datetime.now().isoformat(timespec="seconds")