Add data quality normalization for cars: mileage in thousands and leasing prices
All checks were successful
Build and Deploy BlaaAi / build-and-deploy (push) Successful in 4m24s

- detect km < 500 on non-new cars (e.g. '130' instead of '130.000')
- detect prices < 5.000 kr (likely a monthly leasing rate, not a sale price)
- flags shown in AI prompt → forces low score (1-3)
- orange warning box in UI distinct from AI warnings
This commit is contained in:
Henrik Jess Nielsen
2026-05-24 19:35:43 +02:00
parent 9299506523
commit 12c3bc3e15
2 changed files with 67 additions and 0 deletions

View File

@@ -143,6 +143,57 @@ def extract_structured_fields(raw: str) -> dict:
return fields
def normalize_listing(item: dict, category: str) -> dict:
    """Flag likely data-entry mistakes on used-car listings.

    Two heuristics are applied, and only when ``category`` is exactly
    ``"brugte biler"``:

    * Mileage written in thousands: a positive odometer value below 500 km
      on a car whose model year is more than one year back (e.g. ``130``
      typed instead of ``130.000``). A missing or unparsable year is
      treated as "old", so the check still fires.
    * Implausibly low price: a positive ``price_dkk`` below 5.000 kr.,
      which is more likely a monthly leasing rate than a sale price.

    Args:
        item: Listing dict. Reads ``item["details"]["raw_text"]`` and
            ``item["price_dkk"]``; either may be absent.
        category: Category name of the listing batch.

    Returns:
        The same ``item`` object. When at least one issue is found, the
        Danish warning texts are stored in ``item["data_quality_flags"]``;
        otherwise the item is not modified.
    """
    if category != "brugte biler":
        return item

    flags = []
    # `details` / `raw_text` may be missing or explicitly None.
    raw = (item.get("details") or {}).get("raw_text") or ""
    fields = extract_structured_fields(raw)

    # ── Mileage check ────────────────────────────────────────────────────
    km_digits = re.sub(r"[^\d]", "", str(fields.get("km") or ""))
    if km_digits:
        km_val = int(km_digits)
        # 0 km is excluded: multiplying it by 1000 yields no meaningful
        # correction, so a warning would only be noise.
        if 0 < km_val < 500:
            try:
                car_age = datetime.now().year - int(str(fields.get("year") or ""))
            except ValueError:
                # Unknown model year: assume the car is not brand new.
                car_age = 99
            if car_age > 1:
                # Danish convention uses "." as the thousands separator.
                corrected_km = f"{km_val * 1000:,}".replace(",", ".")
                flags.append(
                    f"⚠️ Kilometertallet ser forkert ud: sælger har skrevet '{km_val} km' "
                    f"— sandsynligvis ment {corrected_km} km. "
                    f"Annoncen nedprioriteres pga. vildledende km-angivelse."
                )

    # ── Price check ──────────────────────────────────────────────────────
    try:
        # Accept a decimal comma ("4500,50") as well as plain numbers.
        price = float(str(item.get("price_dkk", 0)).replace(",", "."))
    except (ValueError, TypeError):
        # Unparsable price: best effort, skip the check.
        price = 0.0
    if 0 < price < 5000:
        shown_price = f"{int(price):,}".replace(",", ".")
        flags.append(
            f"⚠️ Prisen på {shown_price} kr. er usandsynlig lav for en bil — "
            f"sandsynligvis en månedlig leasingydelse, ikke salgspris. "
            f"Annoncen nedprioriteres pga. misvisende prisangivelse."
        )

    if flags:
        item["data_quality_flags"] = flags
    return item
def listing_summary(item: dict, idx: int) -> str:
"""Compact text representation of a listing for the AI prompt."""
raw = item.get("details", {}).get("raw_text", item.get("description", ""))
@@ -165,11 +216,18 @@ def listing_summary(item: dict, idx: int) -> str:
meta_line = " | ".join(meta_parts)
# Include any data quality flags so AI factors them into scoring
quality_issues = item.get("data_quality_flags", [])
quality_block = ""
if quality_issues:
quality_block = "\nDATA KVALITETSPROBLEM (giv lav score 1-3):\n" + "\n".join(quality_issues) + "\n"
return (
f"--- Annonce #{idx + 1} (ID: {item['id']}) ---\n"
f"Navn: {item['name']}\n"
f"Pris: {item['price_dkk']} DKK\n"
+ (f"{meta_line}\n" if meta_line else "")
+ quality_block
+ f"{text}\n"
)
@@ -301,6 +359,10 @@ def score_listings(
category = detect_category(items)
phash = prefs_hash(prefs)
# ── Normalize data quality issues before any scoring ────────────────────
for item in items:
normalize_listing(item, category)
# ── Split: persistent score cache → in-file cache → needs AI scoring ────
to_score, cached = [], []
now = datetime.now().isoformat(timespec="seconds")

View File

@@ -425,6 +425,10 @@
const pct = Math.round(score * 10);
const warn = item.ai_warnings
? `<p class="text-xs mt-2" style="color:#dc2626">↑ ${item.ai_warnings}</p>` : "";
const qualityFlags = (item.data_quality_flags || []).length > 0
? `<div class="mt-2 rounded-lg px-3 py-2" style="background:#fff7ed;border:1px solid #fed7aa">
${item.data_quality_flags.map(f => `<p class="text-xs" style="color:#c2410c">${f}</p>`).join("")}
</div>` : "";
const tag = rankTag(score);
const card = document.createElement("div");
@@ -447,6 +451,7 @@
</div>
<p class="text-xs leading-relaxed" style="color:#78716c">${item.ai_reason || ""}</p>
${warn}
${qualityFlags}
</div>
<div class="shrink-0 text-right ml-2">
<p class="font-semibold text-sm tabular-nums">${Number(item.price_dkk || 0).toLocaleString("da-DK")}<span class="text-xs font-normal ml-0.5" style="color:#a8a29e">kr</span></p>