From 58210207ea4de9fe480e1baa38a810670832966e Mon Sep 17 00:00:00 2001 From: Henrik Jess Nielsen Date: Fri, 5 Jun 2026 19:57:39 +0200 Subject: [PATCH] feat: add taxonomy classify service + /classify endpoint MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - scripts/taxonomy.py: shared taxonomy with 14 categories, keyword scorer and classify_text() function - scripts/classify_server.py: FastAPI service — forwards to kreuzberg /extract, applies taxonomy, returns category/subcategory/confidence alongside full kreuzberg response - Dockerfile.classify: lightweight Python image for classify service - classify.nomad: Nomad job → classify.i80.dk - .gitea/workflows/classify.yml: CI/CD pipeline (build + deploy) - analyse_familie.py: refactored to import from taxonomy.py (no duplication) - .gitignore: exclude dokumenter_keywords.* and extract_all.log --- .gitea/workflows/classify.yml | 52 ++++ .gitignore | 6 + Dockerfile.classify | 13 + classify.nomad | 96 +++++++ scripts/analyse_familie.py | 512 ++++++++++++++++++++++++++++++++++ scripts/classify_server.py | 117 ++++++++ scripts/taxonomy.py | 231 +++++++++++++++ 7 files changed, 1027 insertions(+) create mode 100644 .gitea/workflows/classify.yml create mode 100644 Dockerfile.classify create mode 100644 classify.nomad create mode 100755 scripts/analyse_familie.py create mode 100644 scripts/classify_server.py create mode 100644 scripts/taxonomy.py diff --git a/.gitea/workflows/classify.yml b/.gitea/workflows/classify.yml new file mode 100644 index 0000000..f8701ed --- /dev/null +++ b/.gitea/workflows/classify.yml @@ -0,0 +1,52 @@ +name: Deploy classify service + +on: + push: + branches: + - main + paths: + - "scripts/classify_server.py" + - "scripts/taxonomy.py" + - "Dockerfile.classify" + - "classify.nomad" + workflow_dispatch: + +env: + REGISTRY: ghcr.io + IMAGE_NAME: hjess/kreuzberg-classify + +jobs: + build-and-deploy: + runs-on: debian-host + + env: + PATH: 
/usr/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/sbin:/bin:/snap/bin + + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Build Docker image + run: | + docker build -f Dockerfile.classify -t ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:latest . + + - name: Push to registry + run: | + docker push ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:latest + + - name: Validate Nomad job + run: nomad job validate classify.nomad + env: + NOMAD_ADDR: "https://nomad.i80.dk:4646" + + - name: Deploy to Nomad + run: nomad job run classify.nomad + env: + NOMAD_ADDR: "https://nomad.i80.dk:4646" + + - name: Check deployment status + run: | + sleep 10 + nomad job status classify + env: + NOMAD_ADDR: "https://nomad.i80.dk:4646" diff --git a/.gitignore b/.gitignore index b2e3fee..407cfad 100644 --- a/.gitignore +++ b/.gitignore @@ -245,6 +245,12 @@ obj/ pkg/ +# Doc classifier output files +dokumenter_keywords.json +dokumenter_keywords.classified.csv +dokumenter_keywords.misplaced.csv +extract_all.log + # Local dev artifacts docs/demo-dev.html docs/serve.json diff --git a/Dockerfile.classify b/Dockerfile.classify new file mode 100644 index 0000000..db656f3 --- /dev/null +++ b/Dockerfile.classify @@ -0,0 +1,13 @@ +FROM python:3.12-slim + +WORKDIR /app + +RUN pip install --no-cache-dir fastapi uvicorn[standard] httpx + +COPY scripts/taxonomy.py . +COPY scripts/classify_server.py . 
+ +ENV KREUZBERG_URL=https://check.i80.dk +ENV PORT=8000 + +CMD ["sh", "-c", "uvicorn classify_server:app --host 0.0.0.0 --port ${PORT}"] diff --git a/classify.nomad b/classify.nomad new file mode 100644 index 0000000..3f91237 --- /dev/null +++ b/classify.nomad @@ -0,0 +1,96 @@ +job "classify" { + region = "global" + datacenters = ["dc1"] + type = "service" + + meta { + uuid = uuidv4() + deployed_at = "[[ timeNowUTC ]]" + } + + update { + stagger = "30s" + max_parallel = 1 + auto_revert = true + progress_deadline = "10m" + } + + group "classify-group" { + count = 1 + + constraint { + attribute = "${node.unique.name}" + value = "int" + } + + update { + canary = 1 + auto_promote = true + min_healthy_time = "20s" + healthy_deadline = "10m" + progress_deadline = "15m" + auto_revert = true + } + + network { + port "http" {} + } + + reschedule { + attempts = 5 + interval = "10m" + delay = "30s" + delay_function = "exponential" + max_delay = "120s" + unlimited = false + } + + service { + provider = "consul" + name = "classify" + port = "http" + + tags = [ + "traefik.enable=true", + "traefik.http.routers.classify.rule=Host(`classify.i80.dk`)", + "traefik.http.routers.classify.tls=true", + ] + + check { + name = "http_health" + type = "http" + port = "http" + path = "/health" + interval = "15s" + timeout = "5s" + } + } + + task "classify-task" { + driver = "docker" + + config { + image = "ghcr.io/hjess/kreuzberg-classify:latest" + ports = ["http"] + force_pull = true + } + + env { + KREUZBERG_URL = "https://check.i80.dk" + PORT = "${NOMAD_PORT_http}" + } + + restart { + attempts = 5 + interval = "10m" + delay = "20s" + mode = "fail" + } + + resources { + cpu = 200 + memory = 256 + } + } + } +} diff --git a/scripts/analyse_familie.py b/scripts/analyse_familie.py new file mode 100755 index 0000000..8e2d226 --- /dev/null +++ b/scripts/analyse_familie.py @@ -0,0 +1,512 @@ +#!/usr/bin/env python3 +""" +Document keyword analyser — misplacement detection across ~/Dokumenter. 
+ +Phase 1 (extract): Hits kreuzberg API for all files, saves progress to JSON. +Phase 2 (analyse): Builds folder profiles, flags files likely in the wrong folder. +Phase 3 (classify): Zero-shot taxonomy classification via sentence-transformers. + +Usage: + python3 analyse_familie.py extract + python3 analyse_familie.py analyse + python3 analyse_familie.py classify + python3 analyse_familie.py extract --workers 6 --output my_results.json + python3 analyse_familie.py analyse --threshold 0.25 + python3 analyse_familie.py classify --results dokumenter_keywords.json +""" + +import csv +import json +import argparse +import re +import time +from pathlib import Path +from collections import defaultdict, Counter +from concurrent.futures import ThreadPoolExecutor, as_completed + +import urllib.request +import urllib.error + +API_URL = "https://check.i80.dk/extract" +DEFAULT_DIR = "/home/hjess/Dokumenter" +DEFAULT_OUTPUT = "dokumenter_keywords.json" + +SUPPORTED_EXTS = {".pdf", ".doc", ".docx", ".odt", ".ods", ".ppt", ".pptx", ".txt", ".rtf"} + +# Generic words that don't help categorise documents +STOPWORDS = { + # Danish function words + "den", "det", "der", "som", "til", "fra", "med", "for", "og", "i", "er", + "en", "et", "af", "på", "at", "de", "har", "ikke", "vi", "hun", "han", + "skal", "kan", "var", "men", "når", "også", "bare", "blev", "bare", + # Form field noise + "navn", "dato", "side", "fulde", "nummer", "adresse", "telefon", "email", + "cpr", "cpr-nr", "postnr", "underskrift", "forælder", "barnet", "oplysninger", + "telefonnummer", "telefonnummer i dagtimerne", + # Email/phone footer boilerplate + "sendt fra", "sendt fra min", "fra min", "fra min iphone", + "min iphone", "min iphone den", "iphone den", + "skrev henrik", "skrev henrik jess", "henrik jess", "henrik", + "ganstar nielsen wrote", "nielsen wrote", + # Names — too generic across this specific corpus + "ganstar", "nielsen", "ganstar nielsen", "tanya ganstar", "tanya ganstar nielsen", + "henrik jess 
nielsen", "jess nielsen", + # Kaiten mail system noise + "kaiten", "kaiten mail", "med kaiten mail", "via kaiten", +} + +SIDER_PATTERN = "Aktindsigt/Sider" +SIDER_SAMPLE = 20 # how many pages to sample from the bulk Sider/ group +BULK_DIR_THRESHOLD = 50 # dirs with more files than this get sampled instead of fully processed + + +# --------------------------------------------------------------------------- +# HTTP helper (stdlib only — no requests dependency) +# --------------------------------------------------------------------------- + +def _post_multipart(url, filepath: Path, config: dict, timeout=90) -> dict | None: + """POST a file as multipart/form-data using stdlib only. Retries on 502.""" + boundary = "----KreuzbergBoundary7MA4YWxkTrZu" + config_json = json.dumps(config).encode() + + with open(filepath, "rb") as fh: + file_data = fh.read() + + body = ( + f"--{boundary}\r\n" + f'Content-Disposition: form-data; name="files"; filename="{filepath.name}"\r\n' + f"Content-Type: application/octet-stream\r\n\r\n" + ).encode() + file_data + ( + f"\r\n--{boundary}\r\n" + f'Content-Disposition: form-data; name="config"\r\n\r\n' + ).encode() + config_json + f"\r\n--{boundary}--\r\n".encode() + + req = urllib.request.Request(url, data=body) + req.add_header("Content-Type", f"multipart/form-data; boundary={boundary}") + req.add_header("Content-Length", str(len(body))) + + retries = 3 + for attempt in range(retries): + try: + with urllib.request.urlopen(req, timeout=timeout) as resp: + return json.loads(resp.read().decode()) + except urllib.error.HTTPError as e: + if e.code == 502 and attempt < retries - 1: + wait = 5 * (2 ** attempt) # 5s, 10s, 20s + time.sleep(wait) + continue + return {"error": f"HTTP {e.code}"} + except Exception as e: + return {"error": str(e)} + return {"error": "HTTP 502 (max retries exceeded)"} + + +# --------------------------------------------------------------------------- +# Extraction +# 
--------------------------------------------------------------------------- + +def extract_file(filepath: Path, max_keywords: int = 15) -> dict: + config = {"keywords": {"algorithm": "yake", "max_keywords": max_keywords}} + data = _post_multipart(API_URL, filepath, config) + + if data is None or "error" in (data if isinstance(data, dict) else {}): + error = (data or {}).get("error", "unknown") + return {"file": str(filepath), "keywords": [], "languages": [], "content_length": 0, "error": error} + + results = data if isinstance(data, list) else [data] + if not results: + return {"file": str(filepath), "keywords": [], "languages": [], "content_length": 0, "error": "empty response"} + + r = results[0] + return { + "file": str(filepath), + "keywords": [(k["text"], round(k["score"], 4)) for k in (r.get("extracted_keywords") or [])], + "languages": r.get("detected_languages") or [], + "content_length": len(r.get("content") or ""), + "error": None, + } + + +def collect_files(directory: Path, bulk_threshold: int = BULK_DIR_THRESHOLD) -> tuple[list[Path], list[Path]]: + """Return (regular_files, bulk_dirs) where bulk dirs have >bulk_threshold files.""" + # First pass: count files per directory + dir_files: dict[Path, list[Path]] = defaultdict(list) + for f in sorted(directory.rglob("*")): + if f.is_file() and f.suffix.lower() in SUPPORTED_EXTS: + dir_files[f.parent].append(f) + + regular: list[Path] = [] + bulk_dirs: list[Path] = [] + seen_bulk: set[Path] = set() + + for parent, files in dir_files.items(): + # Check if any ancestor is already a bulk dir + if any(b in parent.parents or b == parent for b in seen_bulk): + continue + if len(files) >= bulk_threshold: + bulk_dirs.append(parent) + seen_bulk.add(parent) + else: + regular.extend(files) + + return regular, bulk_dirs + + +def analyse_bulk_group(bulk_dir: Path) -> dict: + """Sample SIDER_SAMPLE files from a large directory and return a merged group result.""" + all_files = [f for f in sorted(bulk_dir.rglob("*")) if 
def run_extract(directory: str, output_file: str, workers: int, bulk_threshold: int = BULK_DIR_THRESHOLD) -> None:
    """Phase 1: extract keywords for every supported file under ``directory``.

    Results are kept in a JSON list at ``output_file``. When that file
    already exists the run resumes: records are reused unless their error is
    a transient 502, in which case the file is processed again. Regular
    files are sent through ``extract_file`` on a thread pool; directories
    reported as bulk by ``collect_files`` are summarised once through
    ``analyse_bulk_group``.

    Args:
        directory: Root directory to scan.
        output_file: Path of the JSON results file (read for resume,
            rewritten as progress is made).
        workers: Number of parallel extraction threads.
        bulk_threshold: Forwarded to ``collect_files``.
    """
    dir_path = Path(directory)
    out_path = Path(output_file)
    files, bulk_dirs = collect_files(dir_path, bulk_threshold=bulk_threshold)

    # Resume from existing output — only skip files with successful results (no error)
    results: dict[str, dict] = {}
    retriable_errors = {"HTTP 502", "HTTP 502 (max retries exceeded)"}
    if out_path.exists():
        # The file is written with ensure_ascii=False, so pin the encoding
        # instead of relying on the locale default.
        with open(out_path, encoding="utf-8") as fh:
            for r in json.load(fh):
                # Retry transient server errors; keep permanent errors (422 etc.)
                if r.get("error") in retriable_errors:
                    continue
                results[r["file"]] = r
        print(f"Resuming — {len(results)} files already done")

    todo = [f for f in files if str(f) not in results]
    total = len(todo)
    print(f"Files to process: {total} (skipping {len(files) - total} already done)")
    if bulk_dirs:
        print(f"Bulk directories (sampled): {[str(d) for d in bulk_dirs]}\n")

    done = 0
    errors = 0

    def save():
        with open(out_path, "w", encoding="utf-8") as fh:
            json.dump(list(results.values()), fh, ensure_ascii=False, indent=2)

    with ThreadPoolExecutor(max_workers=workers) as ex:
        futures = {ex.submit(extract_file, f): f for f in todo}
        for future in as_completed(futures):
            result = future.result()
            results[result["file"]] = result
            done += 1
            if result["error"]:
                errors += 1
                print(f" ❌ [{done}/{total}] {Path(result['file']).name}: {result['error']}")
            else:
                preview = ", ".join(kw for kw, _ in result["keywords"][:3])
                print(f" ✅ [{done}/{total}] {Path(result['file']).name} → {preview}")
            # Checkpoint periodically so an interrupted run can resume.
            if done % 20 == 0:
                save()

    save()

    # Handle bulk dirs as sampled groups
    for bulk_dir in bulk_dirs:
        # Group records are keyed "<dir>/ [GROUP — N files, sampled M]", so
        # match on the stable prefix; an exact "<dir>/ [GROUP]" key never
        # exists and would force a re-sample on every run.
        group_prefix = str(bulk_dir) + "/ [GROUP"
        if any(key.startswith(group_prefix) for key in results):
            continue
        print(f"\nAnalysing bulk group: {bulk_dir.name}/")
        group = analyse_bulk_group(bulk_dir)
        results[group["file"]] = group
        save()
        print(f" Top keywords: {', '.join(kw for kw, _ in group['keywords'][:5])}")

    print(f"\n✅ Finished: {done} files, {errors} errors → {out_path}")
    print(f" Run 'python3 {__file__} analyse' to find misplaced files")
def run_analyse(results_file: str, base_dir: str, threshold: float, min_folder_docs: int) -> None:
    """Phase 2: flag files whose keywords do not fit their current folder.

    Builds a keyword-frequency profile per folder from the extract results,
    then scores every file against its own folder. A file is reported as
    potentially misplaced when the share of its keywords that occur at least
    twice in the folder profile is below ``threshold`` and the folder holds
    at least ``min_folder_docs`` documents. For each such file the folder
    with the highest summed keyword frequency is suggested instead.

    A summary is printed and the full list is written next to
    ``results_file`` with the suffix ``.misplaced.csv``.

    Args:
        results_file: JSON file produced by the extract phase.
        base_dir: Root directory the file paths are made relative to.
        threshold: Overlap ratio below which a file is flagged.
        min_folder_docs: Minimum number of documents a folder needs to be
            used as a reference profile.
    """
    # The results hold non-ASCII text; decode as UTF-8 rather than with the
    # locale default.
    with open(results_file, encoding="utf-8") as fh:
        results = json.load(fh)

    base = Path(base_dir)
    print(f"Analysing {len(results)} document records…")

    def _is_ukendt(name: str | None) -> bool:
        """Return True for the generic "Ukendt" dump folders."""
        return name is not None and "Ukendt" in name

    # Group records summarise whole directories and are not individual files.
    documents = [r for r in results if not r.get("is_group")]

    # Build folder profiles: folder → Counter(keyword → freq)
    folder_profiles: dict[str, Counter] = defaultdict(Counter)
    folder_doc_counts: Counter = Counter()
    file_kws: dict[str, list[str]] = {}

    for r in documents:
        kws = _clean_keywords(r.get("keywords") or [])
        file_kws[r["file"]] = kws
        folder = _folder_key(Path(r["file"]), base)
        folder_doc_counts[folder] += 1
        if folder:
            for kw in kws:
                folder_profiles[folder][kw] += 1

    # Filter out thin folders (too few docs to be meaningful)
    valid_folders = {f for f, c in folder_doc_counts.items() if c >= min_folder_docs}
    print(f"Folder profiles built: {len(valid_folders)} folders with {min_folder_docs}+ documents\n")

    # Score each file against its own folder, flag low-overlap files
    misplaced: list[dict] = []
    unclassified: list[dict] = []

    for r in documents:
        fp = Path(r["file"])
        folder = _folder_key(fp, base)
        kws = file_kws.get(r["file"], [])

        if not kws:
            continue

        if folder == "." or folder == "":
            # File sits directly in the root — needs a home
            unclassified.append(r)
            continue

        profile = folder_profiles.get(folder, Counter())
        # Overlap: share of file's keywords that appear ≥2 times in folder profile
        shared = sum(1 for kw in kws if profile[kw] >= 2)
        overlap = shared / len(kws)

        if overlap >= threshold or folder not in valid_folders:
            continue

        # Find best matching alternative folder. Candidates are sorted so
        # ties are broken the same way on every run (set order is not stable).
        best_folder = max(
            sorted(valid_folders - {folder}),
            key=lambda fd: sum(folder_profiles[fd][kw] for kw in kws),
            default=None,
        )
        best_score = sum(folder_profiles[best_folder][kw] for kw in kws) if best_folder else 0

        # Skip if both current and suggested are generic "Ukendt" dump folders —
        # moving Ukendt/PDF/2011 → Ukendt/PDF/2009 is not an improvement
        if _is_ukendt(folder) and _is_ukendt(best_folder):
            continue

        misplaced.append({
            "file": r["file"],
            "filename": fp.name,
            "current_folder": folder,
            "overlap": round(overlap, 2),
            "suggested_folder": best_folder or "",
            "suggestion_score": best_score,
            "top_keywords": "; ".join(kws[:6]),
        })

    misplaced.sort(key=lambda x: x["overlap"])

    # Print summary
    print(f"{'─'*65}")
    print(f"Potentially misplaced: {len(misplaced)} files (overlap < {threshold:.0%})\n")

    for m in misplaced[:40]:
        print(f" 📄 {m['filename']}")
        print(f" Current: {m['current_folder']}")
        print(f" Suggested: {m['suggested_folder']} (overlap={m['overlap']:.0%})")
        print(f" Keywords: {m['top_keywords']}")
        print()

    if len(misplaced) > 40:
        print(f" … and {len(misplaced) - 40} more — see CSV for full list\n")

    # Root-level files were collected above; report them so they are not
    # silently dropped from the analysis.
    if unclassified:
        print(f"Unclassified (directly in the root folder): {len(unclassified)} files\n")

    # Save CSV for easy review / filtering in a spreadsheet
    csv_path = Path(results_file).with_suffix(".misplaced.csv")
    with open(csv_path, "w", newline="", encoding="utf-8") as csvf:
        writer = csv.DictWriter(
            csvf,
            fieldnames=["filename", "current_folder", "overlap", "suggested_folder",
                        "suggestion_score", "top_keywords", "file"],
        )
        writer.writeheader()
        writer.writerows(misplaced)

    print(f"{'─'*65}")
    print(f"✅ Saved {len(misplaced)} misplaced records → {csv_path}")
    print(f" Open in LibreOffice Calc, sort by 'overlap' asc to prioritise.")
+ """ + results_path = Path(results_file) + if not results_path.exists(): + print(f"❌ Results file not found: {results_file}") + print(" Run 'extract' phase first.") + return + + with open(results_path, encoding="utf-8") as fh: + results: list[dict] = json.load(fh) + + # Include files even without YAKE keywords — filename+folder alone can classify + classifiable = [r for r in results if not r.get("is_group") and not r.get("error")] + print(f"Classifying {len(classifiable)} documents…") + + base = Path(base_dir) + output_rows: list[dict] = [] + + for r in classifiable: + fp = Path(r["file"]) + # Build document text from filename tokens + full folder path + YAKE keywords + stem_tokens = re.sub(r"[-_.]", " ", fp.stem).lower() + folder_tokens = " ".join(fp.parent.parts).lower() + kw_text = " ".join(kw for kw, _ in r.get("keywords", [])) + doc_text = f"{stem_tokens} {folder_tokens} {kw_text}" + + scores: dict[str, float] = { + cat: _keyword_score(doc_text, kws) + for cat, kws in TAXONOMY.items() + } + + best_label = max(scores, key=lambda c: scores[c]) + best_score = scores[best_label] + + sorted_cats = sorted(scores, key=lambda c: scores[c], reverse=True) + runner_up = sorted_cats[1] if len(sorted_cats) > 1 else "" + runner_score = scores[runner_up] if runner_up else 0.0 + + current_folder = _folder_key(fp, base) + label = best_label if best_score >= min_score else "Ukendt" + suggested_folder = TAXONOMY_TO_FOLDER.get(label, "") if label != "Ukendt" else "" + + output_rows.append({ + "filename": fp.name, + "current_folder": current_folder, + "taxonomy_label": label, + "confidence": best_score, + "runner_up": runner_up, + "runner_up_score": runner_score, + "suggested_folder": suggested_folder, + "top_keywords": "; ".join(kw for kw, _ in r.get("keywords", [])[:6]), + "file": r["file"], + }) + + # Sort by confidence ascending — lowest confidence = needs most attention + output_rows.sort(key=lambda x: x["confidence"]) + + csv_path = 
Path(results_file).with_suffix(".classified.csv") + with open(csv_path, "w", newline="", encoding="utf-8") as csvf: + writer = csv.DictWriter( + csvf, + fieldnames=["filename", "current_folder", "taxonomy_label", "confidence", + "runner_up", "runner_up_score", "suggested_folder", "top_keywords", "file"], + ) + writer.writeheader() + writer.writerows(output_rows) + + # Print distribution summary + label_counts: Counter = Counter(r["taxonomy_label"] for r in output_rows) + print(f"\n{'─'*65}") + print(f"Taxonomy distribution ({len(output_rows)} documents):\n") + for label, count in label_counts.most_common(): + bar = "█" * (count * 30 // max(label_counts.values())) + print(f" {label:<30} {count:>4} {bar}") + + low_conf = sum(1 for r in output_rows if r["confidence"] < min_score) + print(f"\n Low score (<{min_score}): {low_conf} files → labelled 'Ukendt'") + print(f"\n{'─'*65}") + print(f"✅ Saved {len(output_rows)} classifications → {csv_path}") + print(f" Columns: filename, current_folder, taxonomy_label, confidence, suggested_folder") + + +# --------------------------------------------------------------------------- +# CLI +# --------------------------------------------------------------------------- + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Document keyword extraction + misplacement detector", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=__doc__, + ) + sub = parser.add_subparsers(dest="cmd", required=True) + + ep = sub.add_parser("extract", help="Phase 1: extract keywords from all files") + ep.add_argument("--dir", default=DEFAULT_DIR, help="Root directory to scan") + ep.add_argument("--output", default=DEFAULT_OUTPUT, help="Output JSON file") + ep.add_argument("--workers", type=int, default=3, help="Parallel API workers (default 3 — keep low to avoid 502s)") + ep.add_argument("--bulk-threshold", type=int, default=BULK_DIR_THRESHOLD, + help=f"Dirs with this many files are sampled instead of fully processed 
(default {BULK_DIR_THRESHOLD}). Use 999999 to process all files.") + + ap = sub.add_parser("analyse", help="Phase 2: find misplaced files using folder profiles") + ap.add_argument("--results", default=DEFAULT_OUTPUT, help="JSON file from extract phase") + ap.add_argument("--dir", default=DEFAULT_DIR, help="Root directory (same as extract)") + ap.add_argument("--threshold", type=float, default=0.25, + help="Overlap threshold below which a file is flagged (default 0.25 = 25%%)") + ap.add_argument("--min-folder-docs", type=int, default=5, + help="Minimum docs in a folder to be used as a reference profile (default 5)") + + cp = sub.add_parser("classify", help="Phase 3: keyword-scoring taxonomy classification") + cp.add_argument("--results", default=DEFAULT_OUTPUT, help="JSON file from extract phase") + cp.add_argument("--dir", default=DEFAULT_DIR, help="Root directory (same as extract)") + cp.add_argument("--min-score", type=float, default=1.5, + help="Minimum keyword score to assign a label (default 1.5). Below this → 'Ukendt'.") + + args = parser.parse_args() + + if args.cmd == "extract": + run_extract(args.dir, args.output, args.workers, args.bulk_threshold) + elif args.cmd == "analyse": + run_analyse(args.results, args.dir, args.threshold, args.min_folder_docs) + else: + run_classify(args.results, args.dir, args.min_score) diff --git a/scripts/classify_server.py b/scripts/classify_server.py new file mode 100644 index 0000000..1c3e756 --- /dev/null +++ b/scripts/classify_server.py @@ -0,0 +1,117 @@ +"""classify_server.py — FastAPI service that adds taxonomy classification to kreuzberg /extract. + +Exposes POST /classify — same multipart interface as kreuzberg /extract, +returns the full kreuzberg response plus category/subcategory/confidence fields. 
@app.post("/classify")
async def classify(
    files: Annotated[list[UploadFile], File()],
    config: Annotated[str | None, Form()] = None,
    folder: Annotated[str | None, Form()] = None,
):
    """Extract text + keywords via kreuzberg, then classify into taxonomy.

    Args:
        files: One or more document files (PDF, DOCX, etc.)
        config: Optional JSON object with kreuzberg config; its top-level
            keys override the YAKE defaults. Anything that is not a valid
            JSON object is ignored.
        folder: Optional current folder path for context hint.

    Returns:
        List of results — one per file — with all kreuzberg fields plus the
        fields returned by ``classify_text``. If kreuzberg cannot be reached,
        answers with an error status, or returns a non-JSON body, a 502
        response with an ``error`` message is returned instead.
    """
    # Merge caller config with our YAKE defaults
    merged_config = dict(YAKE_CONFIG)
    if config:
        try:
            caller_cfg = json.loads(config)
        except json.JSONDecodeError:
            caller_cfg = None  # best-effort: fall back to the defaults
        # Valid JSON that is not an object (e.g. a list) cannot be merged.
        if isinstance(caller_cfg, dict):
            merged_config.update(caller_cfg)

    # Read every upload once and build the multipart parts for kreuzberg.
    upstream_files = []
    for upload in files:
        payload = await upload.read()
        mime = upload.content_type or "application/octet-stream"
        upstream_files.append(("files", (upload.filename, payload, mime)))

    # Forward files to kreuzberg /extract
    try:
        response = await app.state.client.post(
            f"{KREUZBERG_URL}/extract",
            files=upstream_files,
            data={"config": json.dumps(merged_config)},
        )
        response.raise_for_status()
        kreuzberg_results = response.json()
    except httpx.HTTPError as exc:
        return JSONResponse(status_code=502, content={"error": f"kreuzberg error: {exc}"})
    except ValueError as exc:
        # The body was not valid JSON; report it as an upstream failure too.
        return JSONResponse(status_code=502, content={"error": f"kreuzberg error: {exc}"})

    # Ensure list
    if isinstance(kreuzberg_results, dict):
        kreuzberg_results = [kreuzberg_results]

    folder_hint = folder or ""

    results = []
    for result in kreuzberg_results:
        content_text = result.get("content", "") or ""

        # The batch script in this repository reads keywords from
        # "extracted_keywords" entries carrying a "text" field; the older
        # "keywords" / "keyword" / "phrase" names are still accepted.
        raw_keywords = result.get("extracted_keywords") or result.get("keywords") or []
        kw_strings = []
        for entry in raw_keywords:
            if isinstance(entry, dict):
                text = entry.get("text") or entry.get("keyword") or entry.get("phrase") or ""
            else:
                text = str(entry)
            if text:
                kw_strings.append(text)

        classification = classify_text(
            content=content_text,
            keywords=kw_strings,
            folder_hint=folder_hint,
        )

        results.append({**result, **classification})

    return results
# Shared taxonomy and keyword scorer.
# Used by both analyse_familie.py (batch classify) and classify_server.py
# (API endpoint).

from __future__ import annotations

# Category label -> list of (keyword, weight) pairs. Keywords are matched
# case-insensitively by keyword_score(); a keyword that contains a space is
# treated as a phrase, anything else must match as a whole word.
TAXONOMY: dict[str, list[tuple[str, float]]] = {
    "Familie og børn": [
        ("familie", 1.5), ("familieliv", 2.0), ("samvær", 2.0), ("samværsaftale", 2.5),
        ("børn", 1.5), ("barn", 1.5), ("forældre", 2.0), ("forældremyndighed", 2.5),
        ("skilsmisse", 2.5), ("separation", 2.0), ("barsel", 2.0), ("barnets", 1.5),
        ("søskende", 2.0), ("mor", 1.0), ("far", 1.0), ("mor og far", 2.5),
        ("dåb", 2.0), ("konfirmation", 2.5), ("bryllup", 2.0), ("vielse", 2.0),
        # "fodselsdag" (ASCII spelling) is kept for transliterated file names;
        # the correctly spelled form is what actually occurs in Danish text.
        ("fodselsdag", 1.5), ("fødselsdag", 1.5), ("fødselsdagskort", 2.0),
    ],
    "Skole og uddannelse": [
        ("skole", 1.5), ("uddannelse", 1.5), ("gymnasium", 2.5), ("universitetet", 2.0),
        ("eksamen", 2.0), ("studieplan", 2.5), ("karakter", 2.0), ("lektier", 2.5),
        ("opgave", 1.5), ("matematik", 2.0), ("dansk", 1.0), ("noter", 1.5),
        ("pensum", 2.5), ("studie", 1.5), ("kursus", 1.5), ("folkeskole", 2.5),
        ("htx", 2.5), ("hf", 2.0), ("hhx", 2.5), ("stx", 2.5), ("eux", 2.5),
        ("karakterblad", 3.0), ("eksamensbevis", 3.0), ("studiekort", 3.0),
        ("answer key", 2.5), ("quiz", 2.5), ("assessment", 2.5), ("learning", 1.5),
        ("lecture", 2.0), ("course", 2.0), ("lesson", 2.0), ("worksheet", 2.5),
    ],
    "Arbejde og karriere": [
        ("ansøgning", 1.5), ("job", 1.5), ("jobansøgning", 2.5), ("cv", 2.5),
        ("curriculum vitae", 3.0), ("opsigelse", 2.5), ("løn", 1.5), ("lønforhandling", 2.5),
        ("ansættelseskontrakt", 3.0), ("ansættelse", 2.0), ("arbejdsplads", 2.0),
        ("kollega", 2.0), ("arbejdsgiver", 2.5), ("medarbejder", 2.0), ("fagforening", 2.5),
        ("a-kasse", 2.5), ("dagpenge", 2.5), ("jobcenter", 2.5), ("referenceliste", 3.0),
        ("karriere", 2.0), ("rekruttering", 2.5), ("personaleafdeling", 2.5),
        ("arbejde", 1.5), ("projektleder", 2.5), ("møde", 1.5), ("mødedagsorden", 2.5),
        ("scrum", 2.5), ("agile", 2.5), ("backlog", 2.5), ("sprint", 2.5),
        ("konference", 2.0), ("kompetencer", 2.0),
    ],
    "Økonomi og regninger": [
        ("faktura", 2.5), ("regning", 2.0), ("betaling", 2.0), ("bank", 2.0),
        ("skat", 2.0), ("pension", 2.0), ("opsparing", 2.5), ("gæld", 2.5),
        ("lån", 2.5), ("kredit", 2.0), ("inkasso", 3.0), ("afdrag", 2.5),
        ("akkord", 3.0), ("restgæld", 3.0), ("kreditor", 2.5), ("økonomi", 1.5),
        ("budget", 2.0), ("forsikring", 2.0), ("rykkerbrev", 3.0), ("udbetaling", 2.0),
        ("sparekasse", 2.5), ("betalingsservice", 3.0), ("gældstyrelsen", 3.0),
        ("netto bank", 2.5), ("netbank", 2.5), ("kontoudtog", 3.0), ("årsopgørelse", 2.5),
        ("restskat", 3.0), ("årsopgørelse skat", 3.0), ("momsangivelse", 3.0),
    ],
    "Hjem og bolig": [
        ("bolig", 2.0), ("hus", 1.5), ("lejlighed", 2.5), ("ejendom", 2.0),
        ("husleje", 3.0), ("vedligeholdelse", 2.5), ("renovation", 2.5),
        ("el", 1.0), ("vand", 1.0), ("varme", 1.5), ("fjernvarme", 2.5),
        ("ejerforening", 3.0), ("andelsbolig", 3.0), ("lejekontrakt", 3.0),
        ("fremlejning", 2.5), ("nøgle", 1.5), ("flytning", 2.0),
        ("indretning", 2.0), ("have", 1.5), ("grundejerforening", 3.0),
        ("BBR", 2.5), ("byggetilladelse", 3.0),
    ],
    "Jura og kontrakter": [
        ("kontrakt", 2.0), ("aftale", 2.0), ("kontrakter", 2.0), ("juridisk", 2.5),
        ("advokat", 2.5), ("testamente", 3.0), ("retssag", 3.0), ("dom", 2.0),
        ("stævning", 3.0), ("klage", 2.0), ("tinglysning", 3.0), ("pantebrev", 3.0),
        ("tilbud", 1.5), ("vilkår", 2.0), ("betingelser", 2.0), ("fuldmagt", 2.5),
        ("forlig", 2.5), ("forsikringsbetingelser", 3.0), ("police", 2.0),
    ],
    "Sundhed og medicin": [
        ("recept", 2.5), ("medicin", 2.5), ("læge", 2.5), ("hospital", 2.5),
        ("sygdom", 2.5), ("behandling", 2.0), ("diagnose", 3.0), ("operation", 2.5),
        ("symptomer", 2.5), ("sundhed", 2.0), ("journaloplysning", 3.0),
        ("patientjournal", 3.0), ("laboratorium", 2.5), ("blodprøve", 3.0),
        ("røntgen", 3.0), ("psykolog", 3.0), ("psykiater", 3.0), ("terapi", 2.5),
        ("tandlæge", 3.0), ("optiker", 2.5), ("vaccination", 3.0),
    ],
    "IT og teknologi": [
        ("software", 2.5), ("server", 2.0), ("netværk", 2.5), ("database", 2.5),
        ("programmering", 2.5), ("kode", 2.0), ("linux", 3.0), ("cloud", 2.5),
        ("it", 1.5), ("computer", 2.0), ("laptop", 2.5), ("password", 2.5),
        ("installation", 2.0), ("konfiguration", 2.0), ("log", 1.5), ("backup", 2.5),
        ("docker", 3.0), ("kubernetes", 3.0), ("python", 3.0), ("github", 3.0),
        ("azure", 2.5), ("windows", 2.0), ("macos", 3.0), ("licens", 2.0),
        ("api", 2.5), ("dokumentation", 1.5), ("teknologi", 2.0), ("system", 1.5),
        ("web", 2.0), ("app", 1.5), ("program", 1.5), ("firmware", 3.0),
        ("internet", 2.0), ("cybersikkerhed", 3.0),
        ("bitcoin", 3.0), ("blockchain", 3.0), ("kryptovaluta", 3.0), ("jupyter", 3.0),
        ("notebook", 2.5), ("monitor", 2.0), ("display", 2.0), ("remote control", 2.5),
        ("user manual", 2.5), ("dataanalyse", 2.5), ("data analysis", 2.5),
        ("django", 3.0), ("javascript", 2.5), ("jquery", 2.5), ("typescript", 3.0),
        ("html", 2.0), ("css", 2.0), ("react", 2.5), ("nodejs", 3.0), ("java", 2.5),
        ("csharp", 3.0), ("datamatiker", 3.0), ("sql", 2.5), ("rest api", 3.0),
        ("programming", 2.5), ("developer", 2.0), ("debugging", 2.5), ("testing", 2.0),
    ],
    "Bøger og litteratur": [
        ("isbn", 3.0), ("forlag", 1.5), ("roman", 3.0), ("novelle", 3.0),
        ("biografi", 3.0), ("poesi", 3.0), ("digtsamling", 3.0), ("bog", 2.0),
        ("litteratur", 2.5), ("forfatter", 3.0), ("kapitel", 2.0), ("bogklub", 3.0),
        ("bibliotek", 2.5), ("e-bog", 3.0), ("lydbog", 3.0), ("udgivelse", 2.0),
        ("biography", 2.5), ("novel", 2.5), ("author", 2.0), ("chapter", 2.0),
        ("publisher", 2.0), ("edition", 2.0), ("paperback", 3.0), ("hardcover", 3.0),
        ("fiction", 3.0), ("nonfiction", 3.0), ("memoir", 3.0),
    ],
    "Rejse og transport": [
        ("rejse", 2.0), ("ferie", 2.0), ("fly", 2.5), ("hotel", 2.5),
        ("booking", 2.5), ("rejseplan", 2.5), ("pas", 2.0), ("visum", 3.0),
        ("bil", 1.5), ("kørekort", 3.0), ("tog", 2.0), ("billet", 2.5),
        ("flyrejse", 3.0), ("afgangsgate", 3.0), ("baggage", 2.5), ("cruise", 3.0),
        ("afrejse", 2.5), ("ankomst", 2.0), ("itinerary", 3.0), ("pakketur", 2.5),
    ],
    "Offentlige myndigheder": [
        ("kommune", 2.5), ("stat", 1.5), ("styrelse", 2.5), ("forvaltning", 2.5),
        ("gældstyrelsen", 3.0), ("skat", 2.0), ("udbetaling danmark", 3.0),
        ("borger.dk", 3.0), ("digitalpost", 2.5), ("afgørelse", 2.5),
        ("offentlig myndighed", 3.0), ("ministeri", 2.5), ("ministeriet", 2.5),
        ("politi", 2.5), ("domstol", 2.5), ("retsinformation", 3.0),
        ("folketing", 2.5), ("region", 2.0), ("jobcenter", 2.5),
        ("borger", 1.5), ("ansøgning kommune", 3.0), ("nykøbingvej", 2.5),
        ("sakskøbing", 2.5), ("akkordansøgning", 3.0),
    ],
    "Projekter og hobby": [
        ("hobby", 2.5), ("projekt", 2.0), ("frivillig", 2.5), ("klub", 2.0),
        ("aktivitet", 2.0), ("sport", 2.5), ("musik", 2.5), ("opskrift", 2.5),
        ("træning", 2.0), ("kreativ", 2.5), ("håndværk", 2.5), ("fotografi", 2.5),
        ("spil", 2.0), ("gaming", 3.0), ("maleri", 2.5), ("tegning", 2.0),
        ("golf", 3.0), ("fitness", 2.5), ("klippekort", 2.5), ("svømning", 2.5),
        ("cykling", 2.5), ("løb", 1.5), ("boldspil", 2.5), ("fodbold", 2.5),
        ("concert", 2.0), ("festival", 2.5),
    ],
    "Teknik og ingeniørfag": [
        ("tegning", 2.0), ("teknisk tegning", 3.0), ("ingeniør", 2.5), ("konstruktion", 2.5),
        ("maskine", 2.5), ("elektroteknik", 3.0), ("specifikation", 2.0),
        ("diagram", 2.0), ("brugsanvisning", 3.0), ("manual", 2.5), ("datablad", 3.0),
        ("CE-mærkning", 3.0), ("ISO", 2.0), ("norm", 2.0), ("standard", 1.5),
        ("user manual", 2.5), ("installation guide", 3.0), ("technical specification", 3.0),
        ("product guide", 2.5), ("service manual", 3.0),
    ],
    "Erhverv og business": [
        ("virksomhed", 2.5), ("erhverv", 2.5), ("CVR", 3.0), ("faktura", 2.5),
        ("ordre", 2.0), ("leverandør", 2.5), ("kunde", 2.0), ("salg", 2.0),
        ("moms", 2.5), ("regnskab", 2.5), ("årsregnskab", 3.0), ("balance", 2.0),
        ("resultatopgørelse", 3.0), ("aktieselskab", 3.0), ("iværksætter", 2.5),
        ("forretning", 2.0), ("selskab", 2.0), ("ApS", 3.0), ("A/S", 3.0),
    ],
}

# Category label -> value returned as "subcategory" by classify_text().
TAXONOMY_TO_FOLDER: dict[str, str] = {
    "Familie og børn": "Privat/Familie",
    "Skole og uddannelse": "Privat/Personlig/Uddannelse",
    "Arbejde og karriere": "Privat/Personlig/Arbejde",
    "Økonomi og regninger": "Privat/Økonomi",
    "Hjem og bolig": "Privat/Hjem/Bolig",
    "Jura og kontrakter": "Privat/Jura",
    "Sundhed og medicin": "Privat/Personlig/Sundhed",
    "IT og teknologi": "Arkiv/Teknisk",
    "Bøger og litteratur": "Arkiv/Bøger",
    "Rejse og transport": "Privat/Rejser",
    "Offentlige myndigheder": "Privat/Jura/Myndigheder",
    "Projekter og hobby": "Projekter",
    "Teknik og ingeniørfag": "Arkiv/Teknisk",
    "Erhverv og business": "Arkiv/Erhverv",
}

# Default threshold below which classify_text() reports "Ukendt".
MIN_SCORE: float = 1.5

# Path separators in a folder hint are turned into spaces so that folder names
# such as "udbetaling_danmark" can match multi-word phrases.
_FOLDER_SEPARATORS = str.maketrans("/_-", "   ")


def _normalize(text: str | None) -> str:
    """Lower-case *text* and collapse every whitespace run to a single space.

    ``None`` is treated as the empty string. Collapsing whitespace lets a
    phrase match even when the source text wraps it across lines.
    """
    return " ".join((text or "").lower().split())


def _has_whole_word(text: str, word: str) -> bool:
    """Return True if *word* occurs in *text* with no letter directly around it."""
    start = text.find(word)
    while start != -1:
        end = start + len(word)
        # The edges of the text count as boundaries.
        before = text[start - 1] if start > 0 else " "
        after = text[end] if end < len(text) else " "
        if not before.isalpha() and not after.isalpha():
            return True
        # This hit was embedded in a longer word; keep looking further right.
        start = text.find(word, start + 1)
    return False


def _score_normalized(text: str, keywords: list[tuple[str, float]]) -> float:
    """Sum the weights of all *keywords* found in already-normalized *text*."""
    total = 0.0
    for keyword, weight in keywords:
        needle = _normalize(keyword)
        if not needle:
            # An empty keyword would "match" at every non-letter position.
            continue
        if " " in needle:
            matched = needle in text
        else:
            matched = _has_whole_word(text, needle)
        if matched:
            total += weight
    return round(total, 3)


def keyword_score(doc_text: str, keywords: list[tuple[str, float]]) -> float:
    """Score a document against a keyword list.

    Matching is case-insensitive and whitespace-insensitive (runs of spaces,
    tabs and newlines count as one space). Multi-word phrases are matched as
    substrings; single words are matched as whole words, i.e. not directly
    preceded or followed by a letter, to avoid false positives (e.g. 'bil' in
    'mobil'). Each keyword contributes its weight at most once, however many
    times it occurs. Empty keywords are ignored.

    Args:
        doc_text: Text to search; ``None`` is treated as empty.
        keywords: ``(keyword, weight)`` pairs.

    Returns:
        The sum of weights for all matching entries, rounded to 3 decimals.
    """
    return _score_normalized(_normalize(doc_text), keywords)


def classify_text(
    content: str,
    keywords: list[str],
    folder_hint: str = "",
    min_score: float = MIN_SCORE,
) -> dict[str, str | float]:
    """Classify document text + keywords against the taxonomy.

    The content, the folder hint (with ``/``, ``_`` and ``-`` turned into
    spaces) and the keywords are concatenated and scored against every
    category in ``TAXONOMY``. Ties are resolved in ``TAXONOMY`` order.

    Args:
        content: Extracted document text. ``None`` is treated as empty.
        keywords: Keyword strings for the document. ``None`` is treated as
            empty.
        folder_hint: Current folder path (used as additional context signal).
        min_score: Minimum score to assign a label (else 'Ukendt').

    Returns:
        dict with:
            category: Best-scoring label, or 'Ukendt' below ``min_score``.
            subcategory: ``TAXONOMY_TO_FOLDER`` entry for the category, or "".
            confidence: Score of the best-scoring label.
            runner_up: Second-best label; when the category is 'Ukendt' this
                is the best label that failed the threshold. "" when that
                label scored nothing.
            runner_up_score: Score belonging to ``runner_up``.
    """
    kw_text = " ".join(keywords or ())
    folder_tokens = (folder_hint or "").translate(_FOLDER_SEPARATORS)
    # Normalize once here rather than once per category.
    doc_text = _normalize(f"{content or ''} {folder_tokens} {kw_text}")

    scores = {
        label: _score_normalized(doc_text, entries)
        for label, entries in TAXONOMY.items()
    }
    # sorted() is stable, so equal scores keep their TAXONOMY order.
    ranked = sorted(scores.items(), key=lambda item: item[1], reverse=True)

    best_label, best_score = ranked[0] if ranked else ("", 0.0)
    second_label, second_score = ranked[1] if len(ranked) > 1 else ("", 0.0)

    if ranked and best_score >= min_score:
        category = best_label
        subcategory = TAXONOMY_TO_FOLDER.get(best_label, "")
        alt_label, alt_score = second_label, second_score
    else:
        category = "Ukendt"
        subcategory = ""
        # The rejected best guess is reported together with its own score.
        alt_label, alt_score = best_label, best_score

    if alt_score <= 0:
        # A label that matched nothing is only first by dict order.
        alt_label = ""

    return {
        "category": category,
        "subcategory": subcategory,
        "confidence": round(best_score, 3),
        "runner_up": alt_label,
        "runner_up_score": round(alt_score, 3),
    }