feat: add taxonomy classify service + /classify endpoint
Some checks failed
Deploy classify service / build-and-deploy (push) Failing after 24s
Deploy fil (kreuzberg) / deploy (push) Successful in 53s

- scripts/taxonomy.py: shared taxonomy with 14 categories, keyword scorer
  and classify_text() function
- scripts/classify_server.py: FastAPI service — forwards to kreuzberg /extract,
  applies taxonomy, returns category/subcategory/confidence alongside full kreuzberg response
- Dockerfile.classify: lightweight Python image for classify service
- classify.nomad: Nomad job → classify.i80.dk
- .gitea/workflows/classify.yml: CI/CD pipeline (build + deploy)
- analyse_familie.py: refactored to import from taxonomy.py (no duplication)
- .gitignore: exclude dokumenter_keywords.* and extract_all.log
This commit is contained in:
Henrik Jess Nielsen
2026-06-05 19:57:39 +02:00
parent f0300b586b
commit 58210207ea
7 changed files with 1027 additions and 0 deletions

View File

@@ -0,0 +1,52 @@
# CI/CD for the classify service: build the image, push it, then roll it out via Nomad.
name: Deploy classify service
on:
  push:
    branches:
      - main
    # Only run when a file that ends up in the image, or the job spec itself, changes.
    paths:
      - "scripts/classify_server.py"
      - "scripts/taxonomy.py"
      - "Dockerfile.classify"
      - "classify.nomad"
  # Allows a manual run from the UI.
  workflow_dispatch:
env:
  REGISTRY: ghcr.io
  IMAGE_NAME: hjess/kreuzberg-classify
jobs:
  build-and-deploy:
    runs-on: debian-host
    env:
      PATH: /usr/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/sbin:/bin:/snap/bin
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
      # The image is only ever tagged `latest`; classify.nomad pulls that tag with force_pull.
      - name: Build Docker image
        run: |
          docker build -f Dockerfile.classify -t ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:latest .
      # NOTE(review): no `docker login` step is visible here — presumably the runner is
      # already authenticated against the registry; confirm.
      - name: Push to registry
        run: |
          docker push ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:latest
      - name: Validate Nomad job
        run: nomad job validate classify.nomad
        env:
          NOMAD_ADDR: "https://nomad.i80.dk:4646"
      - name: Deploy to Nomad
        run: nomad job run classify.nomad
        env:
          NOMAD_ADDR: "https://nomad.i80.dk:4646"
      # Fixed 10 s pause before printing the job status; this step only reports,
      # it does not wait for the deployment to become healthy.
      - name: Check deployment status
        run: |
          sleep 10
          nomad job status classify
        env:
          NOMAD_ADDR: "https://nomad.i80.dk:4646"

6
.gitignore vendored
View File

@@ -245,6 +245,12 @@ obj/
pkg/
# Doc classifier output files
dokumenter_keywords.json
dokumenter_keywords.classified.csv
dokumenter_keywords.misplaced.csv
extract_all.log
# Local dev artifacts
docs/demo-dev.html
docs/serve.json

13
Dockerfile.classify Normal file
View File

@@ -0,0 +1,13 @@
# Image for the classify service (FastAPI app in classify_server.py).
FROM python:3.12-slim
WORKDIR /app
# NOTE(review): dependencies are installed without version pins, so rebuilds may pick up new releases.
RUN pip install --no-cache-dir fastapi uvicorn[standard] httpx
# Both modules are copied flat into /app, which is why classify_server imports `taxonomy` directly.
COPY scripts/taxonomy.py .
COPY scripts/classify_server.py .
# Defaults; classify.nomad sets both variables again in its env block.
ENV KREUZBERG_URL=https://check.i80.dk
ENV PORT=8000
# Run through `sh -c` so ${PORT} is expanded when the container starts.
CMD ["sh", "-c", "uvicorn classify_server:app --host 0.0.0.0 --port ${PORT}"]

96
classify.nomad Normal file
View File

@@ -0,0 +1,96 @@
# Nomad service job for the classify API, registered in Consul and routed by Traefik.
job "classify" {
  region      = "global"
  datacenters = ["dc1"]
  type        = "service"

  meta {
    # A fresh UUID per submission — presumably to make every `nomad job run` register
    # a new job version even when nothing else changed; confirm.
    uuid = uuidv4()
    # NOTE(review): `[[ ... ]]` looks like template syntax from an external renderer;
    # confirm something renders it before submission, otherwise it stays a literal string.
    deployed_at = "[[ timeNowUTC ]]"
  }

  update {
    stagger           = "30s"
    max_parallel      = 1
    auto_revert       = true
    progress_deadline = "10m"
  }

  group "classify-group" {
    count = 1

    # Only place the group on the node whose unique name is "int".
    constraint {
      attribute = "${node.unique.name}"
      value     = "int"
    }

    # One canary, promoted automatically; reverts automatically on failure.
    update {
      canary            = 1
      auto_promote      = true
      min_healthy_time  = "20s"
      healthy_deadline  = "10m"
      progress_deadline = "15m"
      auto_revert       = true
    }

    network {
      # Dynamic port; handed to the container as PORT in the task env below.
      port "http" {}
    }

    reschedule {
      attempts       = 5
      interval       = "10m"
      delay          = "30s"
      delay_function = "exponential"
      max_delay      = "120s"
      unlimited      = false
    }

    service {
      provider = "consul"
      name     = "classify"
      port     = "http"
      # Traefik routing: classify.i80.dk with TLS enabled.
      tags = [
        "traefik.enable=true",
        "traefik.http.routers.classify.rule=Host(`classify.i80.dk`)",
        "traefik.http.routers.classify.tls=true",
      ]

      # Health check against the service's /health endpoint.
      check {
        name     = "http_health"
        type     = "http"
        port     = "http"
        path     = "/health"
        interval = "15s"
        timeout  = "5s"
      }
    }

    task "classify-task" {
      driver = "docker"

      config {
        image = "ghcr.io/hjess/kreuzberg-classify:latest"
        ports = ["http"]
        # The tag is always `latest`, so force a pull on every start.
        force_pull = true
      }

      env {
        KREUZBERG_URL = "https://check.i80.dk"
        # The image's CMD starts uvicorn on ${PORT}.
        PORT = "${NOMAD_PORT_http}"
      }

      restart {
        attempts = 5
        interval = "10m"
        delay    = "20s"
        mode     = "fail"
      }

      resources {
        cpu    = 200
        memory = 256
      }
    }
  }
}

512
scripts/analyse_familie.py Executable file
View File

@@ -0,0 +1,512 @@
#!/usr/bin/env python3
"""
Document keyword analyser — misplacement detection across ~/Dokumenter.
Phase 1 (extract): Hits kreuzberg API for all files, saves progress to JSON.
Phase 2 (analyse): Builds folder profiles, flags files likely in the wrong folder.
Phase 3 (classify): Taxonomy classification via deterministic keyword scoring (see taxonomy.py).
Usage:
python3 analyse_familie.py extract
python3 analyse_familie.py analyse
python3 analyse_familie.py classify
python3 analyse_familie.py extract --workers 6 --output my_results.json
python3 analyse_familie.py analyse --threshold 0.25
python3 analyse_familie.py classify --results dokumenter_keywords.json
"""
import csv
import json
import argparse
import re
import time
from pathlib import Path
from collections import defaultdict, Counter
from concurrent.futures import ThreadPoolExecutor, as_completed
import urllib.request
import urllib.error
# Extraction endpoint that every file is POSTed to.
API_URL = "https://check.i80.dk/extract"
# Default root directory to scan and default progress/result file.
DEFAULT_DIR = "/home/hjess/Dokumenter"
DEFAULT_OUTPUT = "dokumenter_keywords.json"
# File suffixes (compared lower-cased) that are sent for extraction.
SUPPORTED_EXTS = {".pdf", ".doc", ".docx", ".odt", ".ods", ".ppt", ".pptx", ".txt", ".rtf"}
# Generic words that don't help categorise documents
# NOTE(review): the set contains an empty-string entry and lists "bare" twice;
# both are harmless in a set, but the empty entry may be a lost word — confirm.
STOPWORDS = {
    # Danish function words
    "den", "det", "der", "som", "til", "fra", "med", "for", "og", "i", "er",
    "en", "et", "af", "", "at", "de", "har", "ikke", "vi", "hun", "han",
    "skal", "kan", "var", "men", "når", "også", "bare", "blev", "bare",
    # Form field noise
    "navn", "dato", "side", "fulde", "nummer", "adresse", "telefon", "email",
    "cpr", "cpr-nr", "postnr", "underskrift", "forælder", "barnet", "oplysninger",
    "telefonnummer", "telefonnummer i dagtimerne",
    # Email/phone footer boilerplate
    "sendt fra", "sendt fra min", "fra min", "fra min iphone",
    "min iphone", "min iphone den", "iphone den",
    "skrev henrik", "skrev henrik jess", "henrik jess", "henrik",
    "ganstar nielsen wrote", "nielsen wrote",
    # Names — too generic across this specific corpus
    "ganstar", "nielsen", "ganstar nielsen", "tanya ganstar", "tanya ganstar nielsen",
    "henrik jess nielsen", "jess nielsen",
    # Kaiten mail system noise
    "kaiten", "kaiten mail", "med kaiten mail", "via kaiten",
}
# NOTE(review): SIDER_PATTERN is not referenced by any code visible in this file.
SIDER_PATTERN = "Aktindsigt/Sider"
SIDER_SAMPLE = 20  # how many files to sample from a bulk directory
BULK_DIR_THRESHOLD = 50  # dirs with at least this many files get sampled instead of fully processed
# ---------------------------------------------------------------------------
# HTTP helper (stdlib only — no requests dependency)
# ---------------------------------------------------------------------------
def _post_multipart(url, filepath: Path, config: dict, timeout=90) -> dict | None:
    """POST a file as multipart/form-data using only the standard library.

    The file is sent as the ``files`` part and ``config`` (JSON-encoded) as the
    ``config`` part. A 502 response is retried with exponential back-off.

    Args:
        url: Endpoint to POST to.
        filepath: File whose bytes are uploaded.
        config: JSON-serialisable configuration sent alongside the file.
        timeout: Per-request timeout in seconds.

    Returns:
        The decoded JSON response body on success, otherwise a dict of the
        form ``{"error": "<reason>"}``. HTTP failures are reported as
        ``"HTTP <code>"``; a 502 that survives all attempts is reported as
        ``"HTTP 502 (max retries exceeded)"``.
    """
    boundary = "----KreuzbergBoundary7MA4YWxkTrZu"
    config_json = json.dumps(config).encode()
    with open(filepath, "rb") as fh:
        file_data = fh.read()

    # A double quote or line break in the name would corrupt the part header,
    # so percent-escape them the way browsers do for form submissions.
    safe_name = (
        filepath.name.replace('"', "%22").replace("\r", "%0D").replace("\n", "%0A")
    )
    file_header = (
        f"--{boundary}\r\n"
        f'Content-Disposition: form-data; name="files"; filename="{safe_name}"\r\n'
        f"Content-Type: application/octet-stream\r\n\r\n"
    ).encode()
    config_header = (
        f"\r\n--{boundary}\r\n"
        f'Content-Disposition: form-data; name="config"\r\n\r\n'
    ).encode()
    closing = f"\r\n--{boundary}--\r\n".encode()
    body = b"".join((file_header, file_data, config_header, config_json, closing))

    req = urllib.request.Request(url, data=body)
    req.add_header("Content-Type", f"multipart/form-data; boundary={boundary}")
    req.add_header("Content-Length", str(len(body)))

    retries = 3
    for attempt in range(retries):
        try:
            with urllib.request.urlopen(req, timeout=timeout) as resp:
                return json.loads(resp.read().decode())
        except urllib.error.HTTPError as e:
            if e.code != 502:
                return {"error": f"HTTP {e.code}"}
            # Back off before the next attempt (5s, then 10s); no sleep after the last one.
            if attempt < retries - 1:
                time.sleep(5 * (2 ** attempt))
        except Exception as e:
            # Best effort: connection problems, timeouts and invalid JSON are
            # reported to the caller as an error record instead of raising.
            return {"error": str(e)}
    return {"error": "HTTP 502 (max retries exceeded)"}
# ---------------------------------------------------------------------------
# Extraction
# ---------------------------------------------------------------------------
def extract_file(filepath: Path, max_keywords: int = 15) -> dict:
    """Send one file to the extraction API and normalise the answer.

    Args:
        filepath: File to upload.
        max_keywords: Number of YAKE keywords requested from the API.

    Returns:
        A record with the keys ``file``, ``keywords`` (list of
        ``(text, score)`` pairs), ``languages``, ``content_length`` and
        ``error`` (``None`` on success, otherwise a reason string).
    """

    def _failure(reason) -> dict:
        # Uniform shape for every failed extraction.
        return {
            "file": str(filepath),
            "keywords": [],
            "languages": [],
            "content_length": 0,
            "error": reason,
        }

    config = {"keywords": {"algorithm": "yake", "max_keywords": max_keywords}}
    data = _post_multipart(API_URL, filepath, config)

    if data is None:
        return _failure("unknown")
    if isinstance(data, dict) and "error" in data:
        return _failure(data.get("error", "unknown"))

    results = data if isinstance(data, list) else [data]
    if not results:
        return _failure("empty response")

    first = results[0]
    if not isinstance(first, dict):
        # Guard against a response whose entries are not objects; without this
        # the attribute access below would raise inside a worker thread.
        return _failure("unexpected response shape")

    keywords = [
        (kw["text"], round(kw["score"], 4))
        for kw in (first.get("extracted_keywords") or [])
        # Skip malformed keyword entries rather than failing the whole file.
        if isinstance(kw, dict) and "text" in kw and "score" in kw
    ]
    return {
        "file": str(filepath),
        "keywords": keywords,
        "languages": first.get("detected_languages") or [],
        "content_length": len(first.get("content") or ""),
        "error": None,
    }
def collect_files(directory: Path, bulk_threshold: int = BULK_DIR_THRESHOLD) -> tuple[list[Path], list[Path]]:
    """Split the supported files under ``directory`` into regular files and bulk dirs.

    A directory is "bulk" when it directly contains at least ``bulk_threshold``
    supported files. Bulk directories are sampled recursively elsewhere
    (``analyse_bulk_group`` walks them with ``rglob``), so everything at or
    below a bulk directory is excluded from the regular list, and only the
    top-most bulk directory of a nested chain is reported.

    Args:
        directory: Root directory to scan recursively.
        bulk_threshold: Minimum number of direct files that makes a dir bulk.

    Returns:
        ``(regular_files, bulk_dirs)``.
    """
    # First pass: group supported files by their direct parent directory.
    dir_files: dict[Path, list[Path]] = defaultdict(list)
    for f in sorted(directory.rglob("*")):
        if f.is_file() and f.suffix.lower() in SUPPORTED_EXTS:
            dir_files[f.parent].append(f)

    # Decide the bulk directories up front, so the outcome does not depend on
    # whether a sub-directory happens to be visited before its parent.
    candidates = [d for d, files in dir_files.items() if len(files) >= bulk_threshold]
    candidate_set = set(candidates)
    bulk_dirs: list[Path] = [
        d for d in candidates if not any(p in candidate_set for p in d.parents)
    ]
    bulk_set = set(bulk_dirs)

    # Second pass: keep only files that are not at or below a bulk directory.
    regular: list[Path] = []
    for parent, files in dir_files.items():
        if parent in bulk_set or any(p in bulk_set for p in parent.parents):
            continue
        regular.extend(files)
    return regular, bulk_dirs
def analyse_bulk_group(bulk_dir: Path) -> dict:
    """Summarise a large directory from a sample of its files.

    The first ``SIDER_SAMPLE`` supported files (in sorted order, searched
    recursively) are extracted with 10 keywords each. Each keyword's score in
    the result is the fraction of sampled files it occurred in.

    Args:
        bulk_dir: Directory to sample.

    Returns:
        A single group record (``is_group`` is True) shaped like a file record.
    """
    all_files = []
    for candidate in sorted(bulk_dir.rglob("*")):
        if candidate.is_file() and candidate.suffix.lower() in SUPPORTED_EXTS:
            all_files.append(candidate)
    sample = all_files[:SIDER_SAMPLE]
    total_count, sample_count = len(all_files), len(sample)
    print(f" Sampling {sample_count}/{total_count} files from {bulk_dir.name}/…")

    # Count, per lower-cased keyword, how many extracted keyword entries mention it.
    keyword_counter: Counter = Counter()
    for path in sample:
        extracted = extract_file(path, max_keywords=10)
        keyword_counter.update(text.lower() for text, _ in extracted["keywords"])

    top_keywords = []
    for text, hits in keyword_counter.most_common(15):
        top_keywords.append((text, round(hits / sample_count, 3)))

    group_record = {
        "file": str(bulk_dir) + f"/ [GROUP — {total_count} files, sampled {sample_count}]",
        "keywords": top_keywords,
        "languages": ["dan"],
        "content_length": -1,
        "is_group": True,
        "error": None,
    }
    return group_record
def run_extract(directory: str, output_file: str, workers: int, bulk_threshold: int = BULK_DIR_THRESHOLD) -> None:
    """Phase 1: extract keywords for every file and persist progress as JSON.

    Existing results in ``output_file`` are reused, so an interrupted run can
    be resumed. Records whose error is a transient 502 are retried; all other
    records (including permanent errors) are kept.

    Args:
        directory: Root directory to scan.
        output_file: JSON file used both for resuming and for the final result.
        workers: Number of parallel extraction threads.
        bulk_threshold: Directories with at least this many files are sampled
            as one group instead of being processed file by file.
    """
    dir_path = Path(directory)
    out_path = Path(output_file)
    files, bulk_dirs = collect_files(dir_path, bulk_threshold=bulk_threshold)

    # Resume from existing output — only transient server errors are retried.
    results: dict[str, dict] = {}
    retriable_errors = {"HTTP 502", "HTTP 502 (max retries exceeded)"}
    if out_path.exists():
        with open(out_path, encoding="utf-8") as fh:
            for r in json.load(fh):
                if r.get("error") in retriable_errors:
                    continue
                results[r["file"]] = r
        print(f"Resuming — {len(results)} files already done")

    todo = [f for f in files if str(f) not in results]
    total = len(todo)
    print(f"Files to process: {total} (skipping {len(files) - total} already done)")
    if bulk_dirs:
        print(f"Bulk directories (sampled): {[str(d) for d in bulk_dirs]}\n")

    done = 0
    errors = 0

    def save() -> None:
        # Write to a sibling temp file and swap it in, so an interrupted run
        # never leaves a half-written progress file behind. The file is written
        # with ensure_ascii=False, hence the explicit UTF-8 encoding.
        tmp_path = out_path.with_name(out_path.name + ".tmp")
        with open(tmp_path, "w", encoding="utf-8") as fh:
            json.dump(list(results.values()), fh, ensure_ascii=False, indent=2)
        tmp_path.replace(out_path)

    with ThreadPoolExecutor(max_workers=workers) as ex:
        futures = {ex.submit(extract_file, f): f for f in todo}
        for future in as_completed(futures):
            try:
                result = future.result()
            except Exception as exc:
                # One unreadable file must not abort the whole run; record it
                # as a failed extraction instead.
                result = {
                    "file": str(futures[future]),
                    "keywords": [],
                    "languages": [],
                    "content_length": 0,
                    "error": str(exc),
                }
            results[result["file"]] = result
            done += 1
            name = Path(result["file"]).name
            if result["error"]:
                errors += 1
                print(f" ❌ [{done}/{total}] {name}: {result['error']}")
            else:
                preview = ", ".join(kw for kw, _ in result["keywords"][:3])
                print(f" ✅ [{done}/{total}] {name} — {preview}")
            if done % 20 == 0:
                save()
    save()

    # Handle bulk dirs as sampled groups
    for bulk_dir in bulk_dirs:
        # Group records are stored under "<dir>/ [GROUP — N files, sampled M]",
        # so match on the common prefix to detect an already analysed group.
        group_prefix = str(bulk_dir) + "/ [GROUP"
        if any(key.startswith(group_prefix) for key in results):
            continue
        print(f"\nAnalysing bulk group: {bulk_dir.name}/")
        group = analyse_bulk_group(bulk_dir)
        results[group["file"]] = group
        save()
        print(f" Top keywords: {', '.join(kw for kw, _ in group['keywords'][:5])}")

    print(f"\n✅ Finished: {done} files, {errors} errors → {out_path}")
    print(f" Run 'python3 {__file__} analyse' to find misplaced files")
# ---------------------------------------------------------------------------
# Phase 2: Misplacement detection
# ---------------------------------------------------------------------------
def _clean_keywords(raw: list[tuple[str, float]]) -> list[str]:
    """Normalise extracted keywords and drop the uninformative ones.

    Each keyword is lower-cased and stripped; it is kept only if the
    normalised form has at least 4 characters and is not in STOPWORDS.

    Args:
        raw: ``(keyword, score)`` pairs; ``None`` or empty yields ``[]``.

    Returns:
        The normalised keyword strings in their original order.
    """
    cleaned: list[str] = []
    for kw, _ in (raw or []):
        # Normalise first so surrounding whitespace cannot push a short
        # keyword over the length limit.
        normalised = kw.lower().strip()
        if len(normalised) >= 4 and normalised not in STOPWORDS:
            cleaned.append(normalised)
    return cleaned
def _folder_key(filepath: Path, base: Path) -> str:
    """Give the folder of ``filepath`` relative to ``base`` (e.g. 'Privat/Økonomi/Gæld').

    Returns '.' for a file directly inside ``base`` and an empty string when
    ``filepath`` is not located under ``base``.
    """
    try:
        relative = filepath.relative_to(base)
    except ValueError:
        # Not under base at all.
        return ""
    return str(relative.parent)
def run_analyse(results_file: str, base_dir: str, threshold: float, min_folder_docs: int) -> None:
    """Phase 2: flag files whose keywords do not fit the folder they sit in.

    A keyword profile (keyword → number of occurrences) is built per folder.
    A file's overlap is the share of its keywords that occur at least twice in
    its folder's profile. Files below ``threshold`` in a sufficiently large
    folder are reported, together with the best-scoring alternative folder,
    on stdout and in ``<results_file stem>.misplaced.csv``.

    Args:
        results_file: JSON file produced by the extract phase.
        base_dir: Root directory the folder keys are made relative to.
        threshold: Overlap below which a file is flagged.
        min_folder_docs: Minimum number of documents for a folder to be used
            as a reference profile.
    """
    rule = "─" * 65

    def _is_ukendt(f) -> bool:
        return f is not None and "Ukendt" in f

    with open(results_file, encoding="utf-8") as fh:
        results = json.load(fh)
    base = Path(base_dir)
    print(f"Analysing {len(results)} document records…")

    # Sampled bulk groups are not individual documents and are ignored here.
    documents = [r for r in results if not r.get("is_group")]

    # Build folder profiles: folder → Counter(keyword → freq), and count docs per folder.
    folder_profiles: dict[str, Counter] = defaultdict(Counter)
    folder_doc_counts: Counter = Counter()
    file_kws: dict[str, list[str]] = {}
    for r in documents:
        kws = _clean_keywords(r.get("keywords") or [])
        file_kws[r["file"]] = kws
        folder = _folder_key(Path(r["file"]), base)
        folder_doc_counts[folder] += 1
        if folder:
            folder_profiles[folder].update(kws)

    # Filter out thin folders (too few docs to be meaningful)
    valid_folders = {f for f, c in folder_doc_counts.items() if c >= min_folder_docs}
    print(f"Folder profiles built: {len(valid_folders)} folders with {min_folder_docs}+ documents\n")

    # Score each file against its own folder, flag low-overlap files
    misplaced: list[dict] = []
    unclassified: list[dict] = []
    for r in documents:
        fp = Path(r["file"])
        folder = _folder_key(fp, base)
        kws = file_kws.get(r["file"], [])
        if not kws:
            continue
        if folder == "." or folder == "":
            # File sits directly in the root — needs a home
            unclassified.append(r)
            continue

        profile = folder_profiles.get(folder, Counter())
        # Overlap: share of file's keywords that appear ≥2 times in folder profile
        # (the file's own keywords are part of the profile, hence ≥2).
        shared = sum(1 for kw in kws if profile[kw] >= 2)
        overlap = shared / len(kws)
        if overlap >= threshold or folder not in valid_folders:
            continue

        # Find the best matching alternative folder. The root ('.') and the
        # "outside base" key ('') are never useful targets, and sorting makes
        # the choice between equally good folders deterministic.
        candidates = sorted(valid_folders - {folder, "", "."})
        best_folder = max(
            candidates,
            key=lambda fd: sum(folder_profiles[fd][kw] for kw in kws),
            default=None,
        )
        best_score = sum(folder_profiles[best_folder][kw] for kw in kws) if best_folder else 0

        # Skip if both current and suggested are generic "Ukendt" dump folders —
        # moving Ukendt/PDF/2011 → Ukendt/PDF/2009 is not an improvement
        if _is_ukendt(folder) and _is_ukendt(best_folder):
            continue

        misplaced.append({
            "file": r["file"],
            "filename": fp.name,
            "current_folder": folder,
            "overlap": round(overlap, 2),
            "suggested_folder": best_folder or "",
            "suggestion_score": best_score,
            "top_keywords": "; ".join(kws[:6]),
        })

    misplaced.sort(key=lambda x: x["overlap"])

    # Print summary
    print(rule)
    print(f"Potentially misplaced: {len(misplaced)} files (overlap < {threshold:.0%})")
    print(f"Unclassified (directly in root): {len(unclassified)} files\n")
    for m in misplaced[:40]:
        print(f" 📄 {m['filename']}")
        print(f" Current: {m['current_folder']}")
        print(f" Suggested: {m['suggested_folder']} (overlap={m['overlap']:.0%})")
        print(f" Keywords: {m['top_keywords']}")
        print()
    if len(misplaced) > 40:
        print(f" … and {len(misplaced) - 40} more — see CSV for full list\n")

    # Save CSV for easy review / filtering in a spreadsheet
    csv_path = Path(results_file).with_suffix(".misplaced.csv")
    with open(csv_path, "w", newline="", encoding="utf-8") as csvf:
        writer = csv.DictWriter(
            csvf,
            fieldnames=["filename", "current_folder", "overlap", "suggested_folder",
                        "suggestion_score", "top_keywords", "file"],
        )
        writer.writeheader()
        writer.writerows(misplaced)
    print(rule)
    print(f"✅ Saved {len(misplaced)} misplaced records → {csv_path}")
    print(f" Open in LibreOffice Calc, sort by 'overlap' asc to prioritise.")
# ---------------------------------------------------------------------------
# Phase 3: taxonomy classification via keyword scoring
# ---------------------------------------------------------------------------
# Taxonomy data and scorer live in taxonomy.py (shared with classify_server.py).
import sys as _sys
import os as _os
_sys.path.insert(0, _os.path.dirname(__file__))
from taxonomy import TAXONOMY, TAXONOMY_TO_FOLDER, keyword_score as _keyword_score # noqa: E402
def run_classify(results_file: str, base_dir: str = DEFAULT_DIR, min_score: float = 1.5) -> None:
    """Phase 3: classify each document into a taxonomy category using keyword scoring.

    Scoring is deterministic: each category has a weighted keyword list; the
    document text (filename + folder path + YAKE keywords) is scored against
    every category and the highest score wins. No ML model required.

    The result is written to ``<results_file stem>.classified.csv`` and a
    distribution summary is printed.

    Args:
        results_file: JSON file produced by the extract phase.
        base_dir: Root directory used to compute each file's current folder.
        min_score: Minimum winning score; below it the label is 'Ukendt'.
    """
    rule = "─" * 65
    fieldnames = ["filename", "current_folder", "taxonomy_label", "confidence",
                  "runner_up", "runner_up_score", "suggested_folder", "top_keywords", "file"]

    results_path = Path(results_file)
    if not results_path.exists():
        print(f"❌ Results file not found: {results_file}")
        print(" Run 'extract' phase first.")
        return
    with open(results_path, encoding="utf-8") as fh:
        results: list[dict] = json.load(fh)

    # Include files even without YAKE keywords — filename+folder alone can classify
    classifiable = [r for r in results if not r.get("is_group") and not r.get("error")]
    print(f"Classifying {len(classifiable)} documents…")
    base = Path(base_dir)

    output_rows: list[dict] = []
    for r in classifiable:
        fp = Path(r["file"])
        # `keywords` may be missing or null in a record; treat both as empty.
        keywords = [kw for kw, _ in (r.get("keywords") or [])]

        # Build document text from filename tokens + full folder path + YAKE keywords
        stem_tokens = re.sub(r"[-_.]", " ", fp.stem).lower()
        folder_tokens = " ".join(fp.parent.parts).lower()
        kw_text = " ".join(keywords)
        doc_text = f"{stem_tokens} {folder_tokens} {kw_text}"

        scores: dict[str, float] = {
            cat: _keyword_score(doc_text, kws)
            for cat, kws in TAXONOMY.items()
        }
        # Stable sort: among equal scores the category defined first wins.
        ranked = sorted(scores, key=lambda c: scores[c], reverse=True)
        best_label = ranked[0]
        best_score = scores[best_label]
        runner_up = ranked[1] if len(ranked) > 1 else ""
        runner_score = scores[runner_up] if runner_up else 0.0

        label = best_label if best_score >= min_score else "Ukendt"
        suggested_folder = TAXONOMY_TO_FOLDER.get(label, "") if label != "Ukendt" else ""
        output_rows.append({
            "filename": fp.name,
            "current_folder": _folder_key(fp, base),
            "taxonomy_label": label,
            "confidence": best_score,
            "runner_up": runner_up,
            "runner_up_score": runner_score,
            "suggested_folder": suggested_folder,
            "top_keywords": "; ".join(keywords[:6]),
            "file": r["file"],
        })

    # Sort by confidence ascending — lowest confidence = needs most attention
    output_rows.sort(key=lambda x: x["confidence"])

    csv_path = Path(results_file).with_suffix(".classified.csv")
    with open(csv_path, "w", newline="", encoding="utf-8") as csvf:
        writer = csv.DictWriter(csvf, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(output_rows)

    # Print distribution summary
    label_counts: Counter = Counter(row["taxonomy_label"] for row in output_rows)
    largest = max(label_counts.values(), default=1)
    print(f"\n{rule}")
    print(f"Taxonomy distribution ({len(output_rows)} documents):\n")
    for label, count in label_counts.most_common():
        # Bar scaled so the most frequent label gets 30 cells.
        bar = "█" * (count * 30 // largest)
        print(f" {label:<30} {count:>4} {bar}")
    low_conf = sum(1 for row in output_rows if row["confidence"] < min_score)
    print(f"\n Low score (<{min_score}): {low_conf} files → labelled 'Ukendt'")
    print(f"\n{rule}")
    print(f"✅ Saved {len(output_rows)} classifications → {csv_path}")
    print(f" Columns: {', '.join(fieldnames)}")
# ---------------------------------------------------------------------------
# CLI
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    # Command-line entry point: one sub-command per phase.
    parser = argparse.ArgumentParser(
        description="Document keyword extraction + misplacement detector",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__,
    )
    subcommands = parser.add_subparsers(dest="cmd", required=True)

    # Phase 1 options
    extract_cmd = subcommands.add_parser("extract", help="Phase 1: extract keywords from all files")
    extract_cmd.add_argument("--dir", default=DEFAULT_DIR, help="Root directory to scan")
    extract_cmd.add_argument("--output", default=DEFAULT_OUTPUT, help="Output JSON file")
    extract_cmd.add_argument(
        "--workers",
        type=int,
        default=3,
        help="Parallel API workers (default 3 — keep low to avoid 502s)",
    )
    extract_cmd.add_argument(
        "--bulk-threshold",
        type=int,
        default=BULK_DIR_THRESHOLD,
        help=f"Dirs with this many files are sampled instead of fully processed (default {BULK_DIR_THRESHOLD}). Use 999999 to process all files.",
    )

    # Phase 2 options
    analyse_cmd = subcommands.add_parser("analyse", help="Phase 2: find misplaced files using folder profiles")
    analyse_cmd.add_argument("--results", default=DEFAULT_OUTPUT, help="JSON file from extract phase")
    analyse_cmd.add_argument("--dir", default=DEFAULT_DIR, help="Root directory (same as extract)")
    analyse_cmd.add_argument(
        "--threshold",
        type=float,
        default=0.25,
        help="Overlap threshold below which a file is flagged (default 0.25 = 25%%)",
    )
    analyse_cmd.add_argument(
        "--min-folder-docs",
        type=int,
        default=5,
        help="Minimum docs in a folder to be used as a reference profile (default 5)",
    )

    # Phase 3 options
    classify_cmd = subcommands.add_parser("classify", help="Phase 3: keyword-scoring taxonomy classification")
    classify_cmd.add_argument("--results", default=DEFAULT_OUTPUT, help="JSON file from extract phase")
    classify_cmd.add_argument("--dir", default=DEFAULT_DIR, help="Root directory (same as extract)")
    classify_cmd.add_argument(
        "--min-score",
        type=float,
        default=1.5,
        help="Minimum keyword score to assign a label (default 1.5). Below this → 'Ukendt'.",
    )

    args = parser.parse_args()

    # Dispatch on the chosen sub-command; anything that is neither "extract"
    # nor "analyse" falls through to classification.
    handlers = {
        "extract": lambda: run_extract(args.dir, args.output, args.workers, args.bulk_threshold),
        "analyse": lambda: run_analyse(args.results, args.dir, args.threshold, args.min_folder_docs),
    }
    fallback = lambda: run_classify(args.results, args.dir, args.min_score)  # noqa: E731
    handlers.get(args.cmd, fallback)()

117
scripts/classify_server.py Normal file
View File

@@ -0,0 +1,117 @@
"""classify_server.py — FastAPI service that adds taxonomy classification to kreuzberg /extract.
Exposes POST /classify — same multipart interface as kreuzberg /extract,
returns the full kreuzberg response plus category/subcategory/confidence fields.
Usage:
cd scripts && uvicorn classify_server:app --host 0.0.0.0 --port 8001
"""
from __future__ import annotations
import json
import os
from contextlib import asynccontextmanager
from typing import Annotated
import httpx
from fastapi import FastAPI, File, Form, UploadFile
from fastapi.responses import JSONResponse
from taxonomy import classify_text
KREUZBERG_URL = os.getenv("KREUZBERG_URL", "https://check.i80.dk")
YAKE_CONFIG = {"keywords": {"algorithm": "yake", "max_keywords": 15}}
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Create one shared HTTP client for the app's lifetime and close it on shutdown.

    The client is stored on ``app.state.client`` with a 60 second timeout.
    """
    client = httpx.AsyncClient(timeout=60.0)
    app.state.client = client
    try:
        yield
    finally:
        # Close the connection pool even if an exception is thrown at the yield.
        await client.aclose()
# ASGI application object; `lifespan` manages the shared HTTP client stored on app.state.
app = FastAPI(
    title="kreuzberg-classify",
    description="Taxonomy classification on top of kreuzberg /extract",
    version="1.0.0",
    lifespan=lifespan,
)
@app.get("/health")
async def health():
    """Report a static healthy status together with the configured kreuzberg URL."""
    payload = {
        "status": "healthy",
        "kreuzberg_url": KREUZBERG_URL,
    }
    return payload
@app.post("/classify")
async def classify(
    files: Annotated[list[UploadFile], File()],
    config: Annotated[str | None, Form()] = None,
    folder: Annotated[str | None, Form()] = None,
):
    """Extract text + keywords via kreuzberg, then classify into taxonomy.

    Args:
        files: One or more document files (PDF, DOCX, etc.)
        config: Optional JSON object with kreuzberg config; its top-level keys
            override the YAKE defaults. Anything that is not a JSON object is
            ignored.
        folder: Optional current folder path for context hint.

    Returns:
        List of results — one per file — with all kreuzberg fields plus the
        fields produced by ``classify_text``. If kreuzberg cannot be reached,
        answers with an error status or returns a body that is not JSON, a
        502 response with an ``error`` message is returned instead.
    """
    # Merge caller config with our YAKE defaults (best effort: a malformed or
    # non-object config is ignored rather than failing the request).
    merged_config = dict(YAKE_CONFIG)
    if config:
        try:
            caller_cfg = json.loads(config)
        except json.JSONDecodeError:
            caller_cfg = None
        if isinstance(caller_cfg, dict):
            merged_config.update(caller_cfg)

    # Read every upload once and forward it under the same field name.
    outgoing = []
    for upload in files:
        payload = await upload.read()
        mime = upload.content_type or "application/octet-stream"
        outgoing.append(("files", (upload.filename, payload, mime)))

    try:
        response = await app.state.client.post(
            f"{KREUZBERG_URL}/extract",
            files=outgoing,
            data={"config": json.dumps(merged_config)},
        )
        response.raise_for_status()
        kreuzberg_results = response.json()
    except httpx.HTTPError as exc:
        return JSONResponse(status_code=502, content={"error": f"kreuzberg error: {exc}"})
    except ValueError as exc:
        # The upstream body was not valid JSON.
        return JSONResponse(status_code=502, content={"error": f"kreuzberg error: {exc}"})

    # Ensure list
    if isinstance(kreuzberg_results, dict):
        kreuzberg_results = [kreuzberg_results]
    if not isinstance(kreuzberg_results, list):
        return JSONResponse(
            status_code=502,
            content={"error": "kreuzberg error: unexpected response shape"},
        )

    folder_hint = folder or ""
    results = []
    for result in kreuzberg_results:
        if not isinstance(result, dict):
            # Nothing to classify; pass the entry through unchanged.
            results.append(result)
            continue

        content_text = result.get("content", "") or ""

        # The batch extractor in analyse_familie.py reads keywords from
        # `extracted_keywords` entries with a `text` field; accept that shape
        # as well as the `keywords` / `keyword` / `phrase` variants.
        raw_keywords = result.get("extracted_keywords") or result.get("keywords") or []
        kw_strings = []
        for entry in raw_keywords:
            if isinstance(entry, dict):
                text = entry.get("text") or entry.get("keyword") or entry.get("phrase") or ""
            else:
                text = str(entry)
            if text:
                kw_strings.append(text)

        classification = classify_text(
            content=content_text,
            keywords=kw_strings,
            folder_hint=folder_hint,
        )
        results.append({**result, **classification})
    return results

231
scripts/taxonomy.py Normal file
View File

@@ -0,0 +1,231 @@
"""Shared taxonomy: weighted keyword lists + folder mappings + scorer.
Used by both analyse_familie.py (batch classify) and classify_server.py (API endpoint).
"""
from __future__ import annotations
# Category → weighted keyword list. A document's score for a category is the
# sum of the weights of the entries found in its text (see keyword_score).
# Entries without a space are matched as whole words, so inflected forms must
# be listed explicitly; entries containing a space are matched as substrings.
TAXONOMY: dict[str, list[tuple[str, float]]] = {
    "Familie og børn": [
        ("familie", 1.5), ("familieliv", 2.0), ("samvær", 2.0), ("samværsaftale", 2.5),
        ("børn", 1.5), ("barn", 1.5), ("forældre", 2.0), ("forældremyndighed", 2.5),
        ("skilsmisse", 2.5), ("separation", 2.0), ("barsel", 2.0), ("barnets", 1.5),
        ("søskende", 2.0), ("mor", 1.0), ("far", 1.0), ("mor og far", 2.5),
        ("dåb", 2.0), ("konfirmation", 2.5), ("bryllup", 2.0), ("vielse", 2.0),
        # Correct spelling plus the ASCII variant that may occur in file names.
        ("fødselsdag", 1.5), ("fodselsdag", 1.5), ("fødselsdagskort", 2.0),
    ],
    "Skole og uddannelse": [
        ("skole", 1.5), ("uddannelse", 1.5), ("gymnasium", 2.5), ("universitetet", 2.0),
        ("eksamen", 2.0), ("studieplan", 2.5), ("karakter", 2.0), ("lektier", 2.5),
        ("opgave", 1.5), ("matematik", 2.0), ("dansk", 1.0), ("noter", 1.5),
        ("pensum", 2.5), ("studie", 1.5), ("kursus", 1.5), ("folkeskole", 2.5),
        ("htx", 2.5), ("hf", 2.0), ("hhx", 2.5), ("stx", 2.5), ("eux", 2.5),
        ("karakterblad", 3.0), ("eksamensbevis", 3.0), ("studiekort", 3.0),
        ("answer key", 2.5), ("quiz", 2.5), ("assessment", 2.5), ("learning", 1.5),
        ("lecture", 2.0), ("course", 2.0), ("lesson", 2.0), ("worksheet", 2.5),
    ],
    "Arbejde og karriere": [
        ("ansøgning", 1.5), ("job", 1.5), ("jobansøgning", 2.5), ("cv", 2.5),
        ("curriculum vitae", 3.0), ("opsigelse", 2.5), ("løn", 1.5), ("lønforhandling", 2.5),
        ("ansættelseskontrakt", 3.0), ("ansættelse", 2.0), ("arbejdsplads", 2.0),
        ("kollega", 2.0), ("arbejdsgiver", 2.5), ("medarbejder", 2.0), ("fagforening", 2.5),
        ("a-kasse", 2.5), ("dagpenge", 2.5), ("jobcenter", 2.5), ("referenceliste", 3.0),
        ("karriere", 2.0), ("rekruttering", 2.5), ("personaleafdeling", 2.5),
        ("arbejde", 1.5), ("projektleder", 2.5), ("møde", 1.5), ("mødedagsorden", 2.5),
        ("scrum", 2.5), ("agile", 2.5), ("backlog", 2.5), ("sprint", 2.5),
        ("konference", 2.0), ("kompetencer", 2.0),
    ],
    "Økonomi og regninger": [
        ("faktura", 2.5), ("regning", 2.0), ("betaling", 2.0), ("bank", 2.0),
        ("skat", 2.0), ("pension", 2.0), ("opsparing", 2.5), ("gæld", 2.5),
        ("lån", 2.5), ("kredit", 2.0), ("inkasso", 3.0), ("afdrag", 2.5),
        ("akkord", 3.0), ("restgæld", 3.0), ("kreditor", 2.5), ("økonomi", 1.5),
        ("budget", 2.0), ("forsikring", 2.0), ("rykkerbrev", 3.0), ("udbetaling", 2.0),
        ("sparekasse", 2.5), ("betalingsservice", 3.0), ("gældstyrelsen", 3.0),
        ("netto bank", 2.5), ("netbank", 2.5), ("kontoudtog", 3.0), ("årsopgørelse", 2.5),
        ("restskat", 3.0), ("årsopgørelse skat", 3.0), ("momsangivelse", 3.0),
    ],
    "Hjem og bolig": [
        ("bolig", 2.0), ("hus", 1.5), ("lejlighed", 2.5), ("ejendom", 2.0),
        ("husleje", 3.0), ("vedligeholdelse", 2.5), ("renovation", 2.5),
        ("el", 1.0), ("vand", 1.0), ("varme", 1.5), ("fjernvarme", 2.5),
        ("ejerforening", 3.0), ("andelsbolig", 3.0), ("lejekontrakt", 3.0),
        ("fremlejning", 2.5), ("nøgle", 1.5), ("flytning", 2.0),
        ("indretning", 2.0), ("have", 1.5), ("grundejerforening", 3.0),
        ("BBR", 2.5), ("byggetilladelse", 3.0),
    ],
    "Jura og kontrakter": [
        ("kontrakt", 2.0), ("aftale", 2.0), ("kontrakter", 2.0), ("juridisk", 2.5),
        ("advokat", 2.5), ("testamente", 3.0), ("retssag", 3.0), ("dom", 2.0),
        ("stævning", 3.0), ("klage", 2.0), ("tinglysning", 3.0), ("pantebrev", 3.0),
        ("tilbud", 1.5), ("vilkår", 2.0), ("betingelser", 2.0), ("fuldmagt", 2.5),
        ("forlig", 2.5), ("forsikringsbetingelser", 3.0), ("police", 2.0),
    ],
    "Sundhed og medicin": [
        ("recept", 2.5), ("medicin", 2.5), ("læge", 2.5), ("hospital", 2.5),
        ("sygdom", 2.5), ("behandling", 2.0), ("diagnose", 3.0), ("operation", 2.5),
        ("symptomer", 2.5), ("sundhed", 2.0), ("journaloplysning", 3.0),
        ("patientjournal", 3.0), ("laboratorium", 2.5), ("blodprøve", 3.0),
        ("røntgen", 3.0), ("psykolog", 3.0), ("psykiater", 3.0), ("terapi", 2.5),
        ("tandlæge", 3.0), ("optiker", 2.5), ("vaccination", 3.0),
    ],
    "IT og teknologi": [
        ("software", 2.5), ("server", 2.0), ("netværk", 2.5), ("database", 2.5),
        ("programmering", 2.5), ("kode", 2.0), ("linux", 3.0), ("cloud", 2.5),
        ("it", 1.5), ("computer", 2.0), ("laptop", 2.5), ("password", 2.5),
        ("installation", 2.0), ("konfiguration", 2.0), ("log", 1.5), ("backup", 2.5),
        ("docker", 3.0), ("kubernetes", 3.0), ("python", 3.0), ("github", 3.0),
        ("azure", 2.5), ("windows", 2.0), ("macos", 3.0), ("licens", 2.0),
        ("api", 2.5), ("dokumentation", 1.5), ("teknologi", 2.0), ("system", 1.5),
        ("web", 2.0), ("app", 1.5), ("program", 1.5), ("firmware", 3.0),
        ("internet", 2.0), ("cybersikkerhed", 3.0),
        ("bitcoin", 3.0), ("blockchain", 3.0), ("kryptovaluta", 3.0), ("jupyter", 3.0),
        ("notebook", 2.5), ("monitor", 2.0), ("display", 2.0), ("remote control", 2.5),
        ("user manual", 2.5), ("dataanalyse", 2.5), ("data analysis", 2.5),
        ("django", 3.0), ("javascript", 2.5), ("jquery", 2.5), ("typescript", 3.0),
        ("html", 2.0), ("css", 2.0), ("react", 2.5), ("nodejs", 3.0), ("java", 2.5),
        ("csharp", 3.0), ("datamatiker", 3.0), ("sql", 2.5), ("rest api", 3.0),
        ("programming", 2.5), ("developer", 2.0), ("debugging", 2.5), ("testing", 2.0),
    ],
    "Bøger og litteratur": [
        ("isbn", 3.0), ("forlag", 1.5), ("roman", 3.0), ("novelle", 3.0),
        ("biografi", 3.0), ("poesi", 3.0), ("digtsamling", 3.0), ("bog", 2.0),
        ("litteratur", 2.5), ("forfatter", 3.0), ("kapitel", 2.0), ("bogklub", 3.0),
        ("bibliotek", 2.5), ("e-bog", 3.0), ("lydbog", 3.0), ("udgivelse", 2.0),
        ("biography", 2.5), ("novel", 2.5), ("author", 2.0), ("chapter", 2.0),
        ("publisher", 2.0), ("edition", 2.0), ("paperback", 3.0), ("hardcover", 3.0),
        ("fiction", 3.0), ("nonfiction", 3.0), ("memoir", 3.0),
    ],
    "Rejse og transport": [
        ("rejse", 2.0), ("ferie", 2.0), ("fly", 2.5), ("hotel", 2.5),
        ("booking", 2.5), ("rejseplan", 2.5), ("pas", 2.0), ("visum", 3.0),
        ("bil", 1.5), ("kørekort", 3.0), ("tog", 2.0), ("billet", 2.5),
        # "baggage" is the English spelling; "bagage" is the Danish one.
        ("flyrejse", 3.0), ("afgangsgate", 3.0), ("baggage", 2.5), ("bagage", 2.5), ("cruise", 3.0),
        ("afrejse", 2.5), ("ankomst", 2.0), ("itinerary", 3.0), ("pakketur", 2.5),
    ],
    "Offentlige myndigheder": [
        ("kommune", 2.5), ("stat", 1.5), ("styrelse", 2.5), ("forvaltning", 2.5),
        ("gældstyrelsen", 3.0), ("skat", 2.0), ("udbetaling danmark", 3.0),
        ("borger.dk", 3.0), ("digitalpost", 2.5), ("afgørelse", 2.5),
        # The bare stem "ministeri" could never match as a whole word.
        ("offentlig myndighed", 3.0), ("ministerium", 2.5), ("ministeriet", 2.5),
        ("politi", 2.5), ("domstol", 2.5), ("retsinformation", 3.0),
        ("folketing", 2.5), ("region", 2.0), ("jobcenter", 2.5),
        ("borger", 1.5), ("ansøgning kommune", 3.0), ("nykøbingvej", 2.5),
        ("sakskøbing", 2.5), ("akkordansøgning", 3.0),
    ],
    "Projekter og hobby": [
        ("hobby", 2.5), ("projekt", 2.0), ("frivillig", 2.5), ("klub", 2.0),
        ("aktivitet", 2.0), ("sport", 2.5), ("musik", 2.5), ("opskrift", 2.5),
        ("træning", 2.0), ("kreativ", 2.5), ("håndværk", 2.5), ("fotografi", 2.5),
        ("spil", 2.0), ("gaming", 3.0), ("maleri", 2.5), ("tegning", 2.0),
        ("golf", 3.0), ("fitness", 2.5), ("klippekort", 2.5), ("svømning", 2.5),
        ("cykling", 2.5), ("løb", 1.5), ("boldspil", 2.5), ("fodbold", 2.5),
        # "concert" is the English spelling; "koncert" is the Danish one.
        ("concert", 2.0), ("koncert", 2.0), ("festival", 2.5),
    ],
    "Teknik og ingeniørfag": [
        ("tegning", 2.0), ("teknisk tegning", 3.0), ("ingeniør", 2.5), ("konstruktion", 2.5),
        ("maskine", 2.5), ("elektroteknik", 3.0), ("specifikation", 2.0),
        ("diagram", 2.0), ("brugsanvisning", 3.0), ("manual", 2.5), ("datablad", 3.0),
        ("CE-mærkning", 3.0), ("ISO", 2.0), ("norm", 2.0), ("standard", 1.5),
        ("user manual", 2.5), ("installation guide", 3.0), ("technical specification", 3.0),
        ("product guide", 2.5), ("service manual", 3.0),
    ],
    "Erhverv og business": [
        ("virksomhed", 2.5), ("erhverv", 2.5), ("CVR", 3.0), ("faktura", 2.5),
        ("ordre", 2.0), ("leverandør", 2.5), ("kunde", 2.0), ("salg", 2.0),
        ("moms", 2.5), ("regnskab", 2.5), ("årsregnskab", 3.0), ("balance", 2.0),
        ("resultatopgørelse", 3.0), ("aktieselskab", 3.0), ("iværksætter", 2.5),
        ("forretning", 2.0), ("selskab", 2.0), ("ApS", 3.0), ("A/S", 3.0),
    ],
}
# Category → suggested target folder (relative path).
# Note that "IT og teknologi" and "Teknik og ingeniørfag" share "Arkiv/Teknisk".
TAXONOMY_TO_FOLDER: dict[str, str] = {
    "Familie og børn": "Privat/Familie",
    "Skole og uddannelse": "Privat/Personlig/Uddannelse",
    "Arbejde og karriere": "Privat/Personlig/Arbejde",
    "Økonomi og regninger": "Privat/Økonomi",
    "Hjem og bolig": "Privat/Hjem/Bolig",
    "Jura og kontrakter": "Privat/Jura",
    "Sundhed og medicin": "Privat/Personlig/Sundhed",
    "IT og teknologi": "Arkiv/Teknisk",
    "Bøger og litteratur": "Arkiv/Bøger",
    "Rejse og transport": "Privat/Rejser",
    "Offentlige myndigheder": "Privat/Jura/Myndigheder",
    "Projekter og hobby": "Projekter",
    "Teknik og ingeniørfag": "Arkiv/Teknisk",
    "Erhverv og business": "Arkiv/Erhverv",
}
# Default minimum winning score for classify_text; below it the category is 'Ukendt'.
MIN_SCORE: float = 1.5
def keyword_score(doc_text: str, keywords: list[tuple[str, float]]) -> float:
    """Return the summed weight of every keyword entry found in *doc_text*.

    Matching is case-insensitive. An entry containing a space is treated as a
    phrase and counts when it occurs anywhere as a substring. Any other entry
    counts only when at least one occurrence has no letter directly before or
    after it, so 'bil' does not match inside 'mobil'. Each entry contributes
    its weight at most once, no matter how often it occurs.

    The total is rounded to three decimals.
    """
    haystack = doc_text.lower()
    haystack_len = len(haystack)

    def occurs_as_word(needle: str) -> bool:
        # Walk every occurrence until one is not glued to a neighbouring letter.
        start = haystack.find(needle)
        while start != -1:
            end = start + len(needle)
            letter_before = start > 0 and haystack[start - 1].isalpha()
            letter_after = end < haystack_len and haystack[end].isalpha()
            if not (letter_before or letter_after):
                return True
            start = haystack.find(needle, start + 1)
        return False

    score = 0.0
    for phrase, weight in keywords:
        needle = phrase.lower()
        if " " in needle:
            matched = needle in haystack
        else:
            matched = occurs_as_word(needle)
        if matched:
            score += weight
    return round(score, 3)
def classify_text(
    content: str,
    keywords: list[str],
    folder_hint: str = "",
    min_score: float = MIN_SCORE,
) -> dict:
    """Classify document text + keywords against the taxonomy.

    The content, the folder hint (with ``/``, ``_`` and ``-`` turned into
    spaces) and the keywords are joined into one text, which is scored against
    every category in ``TAXONOMY`` using ``keyword_score``.

    Args:
        content: Extracted document text. ``None`` or empty counts as no text.
        keywords: YAKE keyword strings from kreuzberg. ``None`` or empty
            counts as no keywords.
        folder_hint: Current folder path (used as additional context signal).
        min_score: Minimum score to assign a label (else 'Ukendt').

    Returns:
        dict with the keys:
            category: Best-scoring label, or 'Ukendt' when the best score is
                below ``min_score``.
            subcategory: ``TAXONOMY_TO_FOLDER`` entry for the category, or ''
                when the category is 'Ukendt' or has no entry.
            confidence: Score of the best-scoring label (also for 'Ukendt').
            runner_up: Second-best label; for 'Ukendt' results, the
                best-scoring label that missed the threshold.
            runner_up_score: Score of the label reported in ``runner_up``.
    """
    kw_text = " ".join(keywords or ())
    folder_tokens = (folder_hint or "").replace("/", " ").replace("_", " ").replace("-", " ")
    doc_text = f"{content or ''} {folder_tokens} {kw_text}"

    scores = {
        cat: keyword_score(doc_text, kws)
        for cat, kws in TAXONOMY.items()
    }
    # sorted() is stable, so equal scores keep their TAXONOMY order.
    ranked = sorted(scores.items(), key=lambda item: item[1], reverse=True)
    best_label, best_score = ranked[0]
    second_label, second_score = ranked[1] if len(ranked) > 1 else ("", 0.0)

    if best_score >= min_score:
        category = best_label
        subcategory = TAXONOMY_TO_FOLDER.get(best_label, "")
        runner_up, runner_up_score = second_label, second_score
    else:
        category = "Ukendt"
        subcategory = ""
        # The closest candidate becomes the runner-up; report ITS score so the
        # label and the score always describe the same category.
        runner_up, runner_up_score = best_label, best_score

    return {
        "category": category,
        "subcategory": subcategory,
        "confidence": round(best_score, 3),
        "runner_up": runner_up,
        "runner_up_score": round(runner_up_score, 3),
    }