Add data quality normalization for cars: mileage in thousands and leasing prices
All checks were successful
Build and Deploy BlaaAi / build-and-deploy (push) Successful in 4m24s
All checks were successful
Build and Deploy BlaaAi / build-and-deploy (push) Successful in 4m24s
- detect km < 500 on non-new cars (e.g. '130' instead of '130.000')
- detect prices < 5.000 kr (likely a monthly leasing rate, not the sale price)
- flags are shown in the AI prompt, which forces a low score (1-3)
- orange warning box in the UI, distinct from AI warnings
This commit is contained in:
62
score.py
62
score.py
@@ -143,6 +143,57 @@ def extract_structured_fields(raw: str) -> dict:
|
||||
return fields
|
||||
|
||||
|
||||
def _format_dk_thousands(number: int) -> str:
    """Format an integer with '.' as thousands separator (130000 -> '130.000')."""
    # The flag texts are Danish, where ',' is the decimal mark; '130,000 km'
    # would read as 130 km, so the grouping character has to be '.'.
    return f"{number:,}".replace(",", ".")


def _parse_price_dkk(value):
    """
    Convert a listing price to a float; return None when it is not numeric.

    String prices that use '.' as thousands separator ('125.000' or
    '125.000,50') are read as whole amounts instead of being mistaken for a
    decimal number. Any other value goes through float() with ',' treated as
    the decimal mark.
    """
    text = str(value).strip()
    if isinstance(value, str) and re.fullmatch(r"\d{1,3}(?:\.\d{3})+(?:,\d+)?", text):
        # Drop the grouping dots so '125.000' becomes 125000, not 125.0.
        text = text.replace(".", "")
    try:
        return float(text.replace(",", "."))
    except (ValueError, TypeError):
        return None


def normalize_listing(item: dict, category: str) -> dict:
    """
    Detect and flag data quality issues for car listings.

    Two checks are made, and only when ``category`` is "brugte biler"
    (Danish for "used cars"):

    - Mileage written in thousands (130 instead of 130.000): a mileage
      between 1 and 499 km on a car that is more than one year old, or whose
      year cannot be read.
    - Suspiciously low price: a price above 0 and below 5000, which is likely
      a monthly leasing rate rather than the full sale price.

    Args:
        item: Listing dict. Reads ``item["details"]["raw_text"]`` and
            ``item["price_dkk"]``; both may be missing or None.
        category: Category name of the listing batch.

    Returns:
        The same ``item`` object. When at least one issue was found, the key
        'data_quality_flags' is set in-place to a list of Danish warning
        strings; otherwise the item is left untouched.
    """
    if category != "brugte biler":
        return item

    flags = []

    # `details` or `raw_text` may be present but None; treat both as "no text".
    raw = (item.get("details") or {}).get("raw_text") or ""
    # With no text there is no mileage to check, so the parser is skipped.
    # NOTE(review): assumes extract_structured_fields("") yields no mileage.
    fields = extract_structured_fields(raw) if raw else {}

    # ── Mileage check ─────────────────────────────────────────────────────────
    km_digits = re.sub(r"[^\d]", "", str(fields.get("km") or ""))
    if km_digits:
        km_val = int(km_digits)
        year_text = str(fields.get("year") or "")
        # isdecimal() only accepts characters that int() can parse. An
        # unreadable year counts as an old car (age 99) so the check still runs.
        if year_text.isdecimal():
            car_age = datetime.now().year - int(year_text)
        else:
            car_age = 99
        # Likely written in thousands: 130 km instead of 130.000 km.
        # 0 km is excluded: "probably meant 0 km" would be a meaningless hint.
        if 0 < km_val < 500 and car_age > 1:
            corrected_km = _format_dk_thousands(km_val * 1000)
            flags.append(
                f"⚠️ Kilometertallet ser forkert ud: sælger har skrevet '{km_val} km' "
                f"— sandsynligvis ment {corrected_km} km. "
                f"Annoncen nedprioriteres pga. vildledende km-angivelse."
            )

    # ── Price check ───────────────────────────────────────────────────────────
    price = _parse_price_dkk(item.get("price_dkk", 0))
    if price is not None and 0 < price < 5000:
        shown_price = _format_dk_thousands(int(price))
        flags.append(
            f"⚠️ Prisen på {shown_price} kr. er usandsynlig lav for en bil — "
            f"sandsynligvis en månedlig leasingydelse, ikke salgspris. "
            f"Annoncen nedprioriteres pga. misvisende prisangivelse."
        )

    if flags:
        item["data_quality_flags"] = flags
    return item
|
||||
|
||||
|
||||
def listing_summary(item: dict, idx: int) -> str:
|
||||
"""Compact text representation of a listing for the AI prompt."""
|
||||
raw = item.get("details", {}).get("raw_text", item.get("description", ""))
|
||||
@@ -165,11 +216,18 @@ def listing_summary(item: dict, idx: int) -> str:
|
||||
|
||||
meta_line = " | ".join(meta_parts)
|
||||
|
||||
# Include any data quality flags so AI factors them into scoring
|
||||
quality_issues = item.get("data_quality_flags", [])
|
||||
quality_block = ""
|
||||
if quality_issues:
|
||||
quality_block = "\nDATA KVALITETSPROBLEM (giv lav score 1-3):\n" + "\n".join(quality_issues) + "\n"
|
||||
|
||||
return (
|
||||
f"--- Annonce #{idx + 1} (ID: {item['id']}) ---\n"
|
||||
f"Navn: {item['name']}\n"
|
||||
f"Pris: {item['price_dkk']} DKK\n"
|
||||
+ (f"{meta_line}\n" if meta_line else "")
|
||||
+ quality_block
|
||||
+ f"{text}\n"
|
||||
)
|
||||
|
||||
@@ -301,6 +359,10 @@ def score_listings(
|
||||
category = detect_category(items)
|
||||
phash = prefs_hash(prefs)
|
||||
|
||||
# ── Normalize data quality issues before any scoring ────────────────────
|
||||
for item in items:
|
||||
normalize_listing(item, category)
|
||||
|
||||
# ── Split: persistent score cache → in-file cache → needs AI scoring ────
|
||||
to_score, cached = [], []
|
||||
now = datetime.now().isoformat(timespec="seconds")
|
||||
|
||||
Reference in New Issue
Block a user