Add data quality normalization for cars: mileage in thousands and leasing prices
All checks were successful
Build and Deploy BlaaAi / build-and-deploy (push) Successful in 4m24s
All checks were successful
Build and Deploy BlaaAi / build-and-deploy (push) Successful in 4m24s
- detect km < 500 on cars more than a year old (e.g. '130' written instead of '130.000') - detect prices < 5.000 kr (likely a monthly leasing rate, not a sale price) - flags are shown in the AI prompt, which asks the model for a low score (1-3) - orange warning box in the UI, distinct from AI warnings
This commit is contained in:
62
score.py
62
score.py
@@ -143,6 +143,57 @@ def extract_structured_fields(raw: str) -> dict:
|
||||
return fields
|
||||
|
||||
|
||||
def _format_dk(number: int) -> str:
    """Format an integer with Danish thousands separators (130000 -> '130.000')."""
    return f"{number:,}".replace(",", ".")


def _parse_price_dkk(value) -> float:
    """Parse a listing price into kroner.

    Accepts numbers and strings. Strings using Danish dot-grouped thousands
    ("125.000", optionally with a decimal comma: "125.000,50") are read as
    whole thousands rather than as a decimal fraction.

    Raises:
        ValueError: if the value cannot be interpreted as a number.
    """
    # bool is an int subclass; send it down the string path so it is
    # rejected by float() instead of being read as 0/1 kr.
    if isinstance(value, (int, float)) and not isinstance(value, bool):
        return float(value)
    text = str(value).strip()
    if re.fullmatch(r"\d{1,3}(?:\.\d{3})+(?:,\d+)?", text):
        text = text.replace(".", "")
    return float(text.replace(",", "."))


def normalize_listing(item: dict, category: str) -> dict:
    """Detect and flag data quality issues on a used-car listing.

    Two checks are performed, and only when ``category`` is "brugte biler":

    - Mileage written in thousands (e.g. "130" instead of "130.000") on a
      car that is more than one year old.
    - A price below 5000 kr., which is more likely a monthly leasing rate
      than a sale price.

    Args:
        item: The listing. Reads ``item["details"]["raw_text"]`` and
            ``item["price_dkk"]``; a missing or ``None`` value is tolerated.
        category: The detected listing category.

    Returns:
        The same ``item`` object. When at least one issue is found, a
        ``data_quality_flags`` list of Danish warning strings is set on it
        in place; otherwise the item is left untouched.
    """
    if category != "brugte biler":
        return item

    flags = []
    # "details" may be present but None, so fall back explicitly.
    raw = (item.get("details") or {}).get("raw_text") or ""
    # Without any raw text there is nothing to extract fields from.
    fields = extract_structured_fields(raw) if raw else {}

    # -- Mileage check -------------------------------------------------------
    km_digits = re.sub(r"\D", "", str(fields.get("km") or ""))
    if km_digits:
        km_val = int(km_digits)
        year_str = str(fields.get("year") or "").strip()
        try:
            car_age = datetime.now().year - int(year_str)
        except ValueError:
            # Unknown year: treat the car as old so a tiny mileage is flagged.
            car_age = 99
        # A reading of 0 is excluded: multiplying it by 1000 cannot produce
        # a plausible "intended" mileage, so this heuristic does not apply.
        if 0 < km_val < 500 and car_age > 1:
            corrected_km = km_val * 1000
            flags.append(
                f"⚠️ Kilometertallet ser forkert ud: sælger har skrevet '{km_val} km' "
                f"— sandsynligvis ment {_format_dk(corrected_km)} km. "
                "Annoncen nedprioriteres pga. vildledende km-angivelse."
            )

    # -- Price check ---------------------------------------------------------
    try:
        price = _parse_price_dkk(item.get("price_dkk", 0))
    except (ValueError, TypeError):
        # Best effort: an unparseable price simply is not checked.
        price = 0.0
    if 0 < price < 5000:
        flags.append(
            f"⚠️ Prisen på {_format_dk(int(price))} kr. er usandsynlig lav for en bil — "
            "sandsynligvis en månedlig leasingydelse, ikke salgspris. "
            "Annoncen nedprioriteres pga. misvisende prisangivelse."
        )

    if flags:
        item["data_quality_flags"] = flags
    return item
|
||||
|
||||
|
||||
def listing_summary(item: dict, idx: int) -> str:
|
||||
"""Compact text representation of a listing for the AI prompt."""
|
||||
raw = item.get("details", {}).get("raw_text", item.get("description", ""))
|
||||
@@ -165,11 +216,18 @@ def listing_summary(item: dict, idx: int) -> str:
|
||||
|
||||
meta_line = " | ".join(meta_parts)
|
||||
|
||||
# Include any data quality flags so AI factors them into scoring
|
||||
quality_issues = item.get("data_quality_flags", [])
|
||||
quality_block = ""
|
||||
if quality_issues:
|
||||
quality_block = "\nDATA KVALITETSPROBLEM (giv lav score 1-3):\n" + "\n".join(quality_issues) + "\n"
|
||||
|
||||
return (
|
||||
f"--- Annonce #{idx + 1} (ID: {item['id']}) ---\n"
|
||||
f"Navn: {item['name']}\n"
|
||||
f"Pris: {item['price_dkk']} DKK\n"
|
||||
+ (f"{meta_line}\n" if meta_line else "")
|
||||
+ quality_block
|
||||
+ f"{text}\n"
|
||||
)
|
||||
|
||||
@@ -301,6 +359,10 @@ def score_listings(
|
||||
category = detect_category(items)
|
||||
phash = prefs_hash(prefs)
|
||||
|
||||
# ── Normalize data quality issues before any scoring ────────────────────
|
||||
for item in items:
|
||||
normalize_listing(item, category)
|
||||
|
||||
# ── Split: persistent score cache → in-file cache → needs AI scoring ────
|
||||
to_score, cached = [], []
|
||||
now = datetime.now().isoformat(timespec="seconds")
|
||||
|
||||
@@ -425,6 +425,10 @@
|
||||
const pct = Math.round(score * 10);
|
||||
const warn = item.ai_warnings
|
||||
? `<p class="text-xs mt-2" style="color:#dc2626">↑ ${item.ai_warnings}</p>` : "";
|
||||
const qualityFlags = (item.data_quality_flags || []).length > 0
|
||||
? `<div class="mt-2 rounded-lg px-3 py-2" style="background:#fff7ed;border:1px solid #fed7aa">
|
||||
${item.data_quality_flags.map(f => `<p class="text-xs" style="color:#c2410c">${f}</p>`).join("")}
|
||||
</div>` : "";
|
||||
const tag = rankTag(score);
|
||||
|
||||
const card = document.createElement("div");
|
||||
@@ -447,6 +451,7 @@
|
||||
</div>
|
||||
<p class="text-xs leading-relaxed" style="color:#78716c">${item.ai_reason || ""}</p>
|
||||
${warn}
|
||||
${qualityFlags}
|
||||
</div>
|
||||
<div class="shrink-0 text-right ml-2">
|
||||
<p class="font-semibold text-sm tabular-nums">${Number(item.price_dkk || 0).toLocaleString("da-DK")}<span class="text-xs font-normal ml-0.5" style="color:#a8a29e">kr</span></p>
|
||||
|
||||
Reference in New Issue
Block a user