feat: add taxonomy classify service + /classify endpoint
- scripts/taxonomy.py: shared taxonomy with 14 categories, keyword scorer and classify_text() function - scripts/classify_server.py: FastAPI service — forwards to kreuzberg /extract, applies taxonomy, returns category/subcategory/confidence alongside full kreuzberg response - Dockerfile.classify: lightweight Python image for classify service - classify.nomad: Nomad job → classify.i80.dk - .gitea/workflows/classify.yml: CI/CD pipeline (build + deploy) - analyse_familie.py: refactored to import from taxonomy.py (no duplication) - .gitignore: exclude dokumenter_keywords.* and extract_all.log
This commit is contained in:
52
.gitea/workflows/classify.yml
Normal file
52
.gitea/workflows/classify.yml
Normal file
@@ -0,0 +1,52 @@
|
|||||||
|
# CI/CD for the classify service: build the image, push it, then submit the Nomad job.
name: Deploy classify service

on:
  # Manual trigger from the Actions UI.
  workflow_dispatch:
  # Automatic trigger when any file that ends up in the image or the job spec changes.
  push:
    branches:
      - main
    paths:
      - "scripts/classify_server.py"
      - "scripts/taxonomy.py"
      - "Dockerfile.classify"
      - "classify.nomad"

env:
  REGISTRY: ghcr.io
  IMAGE_NAME: hjess/kreuzberg-classify

jobs:
  build-and-deploy:
    runs-on: debian-host

    env:
      PATH: /usr/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/sbin:/bin:/snap/bin
      # Shared by every `nomad` CLI step below.
      NOMAD_ADDR: "https://nomad.i80.dk:4646"

    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Build Docker image
        run: |
          docker build -f Dockerfile.classify -t ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:latest .

      # NOTE(review): no `docker login` step is visible here; presumably the runner
      # host is already authenticated against the registry. Confirm.
      - name: Push to registry
        run: |
          docker push ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:latest

      - name: Validate Nomad job
        run: nomad job validate classify.nomad

      - name: Deploy to Nomad
        run: nomad job run classify.nomad

      # Informational only: prints the job status after a short pause.
      - name: Check deployment status
        run: |
          sleep 10
          nomad job status classify
|
||||||
6
.gitignore
vendored
6
.gitignore
vendored
@@ -245,6 +245,12 @@ obj/
|
|||||||
pkg/
|
pkg/
|
||||||
|
|
||||||
|
|
||||||
|
# Doc classifier output files
|
||||||
|
dokumenter_keywords.json
|
||||||
|
dokumenter_keywords.classified.csv
|
||||||
|
dokumenter_keywords.misplaced.csv
|
||||||
|
extract_all.log
|
||||||
|
|
||||||
# Local dev artifacts
|
# Local dev artifacts
|
||||||
docs/demo-dev.html
|
docs/demo-dev.html
|
||||||
docs/serve.json
|
docs/serve.json
|
||||||
|
|||||||
13
Dockerfile.classify
Normal file
13
Dockerfile.classify
Normal file
@@ -0,0 +1,13 @@
|
|||||||
|
# Lightweight image for the classify service (FastAPI app served by uvicorn).
FROM python:3.12-slim

WORKDIR /app

# The extras requirement is quoted so the shell never treats "[standard]" as a glob pattern.
RUN pip install --no-cache-dir fastapi "uvicorn[standard]" httpx

# Both modules are copied flat into /app so `from taxonomy import ...` resolves.
COPY scripts/taxonomy.py .
COPY scripts/classify_server.py .

# Defaults; classify.nomad sets both of these in its env block at deploy time.
ENV KREUZBERG_URL=https://check.i80.dk
ENV PORT=8000

# A shell is needed to expand ${PORT}; `exec` then replaces that shell so uvicorn
# receives stop signals directly instead of through an intermediate `sh` process.
CMD ["sh", "-c", "exec uvicorn classify_server:app --host 0.0.0.0 --port ${PORT}"]
|
||||||
96
classify.nomad
Normal file
96
classify.nomad
Normal file
@@ -0,0 +1,96 @@
|
|||||||
|
# Nomad service job for the classify API, routed by Traefik at classify.i80.dk.
job "classify" {
  region      = "global"
  datacenters = ["dc1"]
  type        = "service"

  meta {
    # Changes on every submission; presumably there to force a new job version
    # even when the image tag (:latest) is unchanged. Confirm intent.
    uuid = uuidv4()
    # HCL2 function evaluated at submit time. The previous "[[ timeNowUTC ]]" is
    # Levant template syntax and is stored as literal text by `nomad job run`.
    deployed_at = timestamp()
  }

  # Job-level defaults; the group-level update block below sets its own
  # progress_deadline and auto_revert.
  update {
    stagger           = "30s"
    max_parallel      = 1
    auto_revert       = true
    progress_deadline = "10m"
  }

  group "classify-group" {
    count = 1

    # Pin the group to the node named "int".
    constraint {
      attribute = "${node.unique.name}"
      value     = "int"
    }

    # Canary deployment: start one canary, promote automatically once healthy,
    # revert automatically on failure.
    update {
      canary            = 1
      auto_promote      = true
      min_healthy_time  = "20s"
      healthy_deadline  = "10m"
      progress_deadline = "15m"
      auto_revert       = true
    }

    network {
      # Dynamic port; passed to the container as PORT in the task env below.
      port "http" {}
    }

    reschedule {
      attempts       = 5
      interval       = "10m"
      delay          = "30s"
      delay_function = "exponential"
      max_delay      = "120s"
      unlimited      = false
    }

    service {
      provider = "consul"
      name     = "classify"
      port     = "http"

      tags = [
        "traefik.enable=true",
        "traefik.http.routers.classify.rule=Host(`classify.i80.dk`)",
        "traefik.http.routers.classify.tls=true",
      ]

      # Polls the /health endpoint exposed by classify_server.py.
      check {
        name     = "http_health"
        type     = "http"
        port     = "http"
        path     = "/health"
        interval = "15s"
        timeout  = "5s"
      }
    }

    task "classify-task" {
      driver = "docker"

      config {
        image = "ghcr.io/hjess/kreuzberg-classify:latest"
        ports = ["http"]
        # Always re-pull, since the tag is the mutable :latest.
        force_pull = true
      }

      env {
        KREUZBERG_URL = "https://check.i80.dk"
        PORT          = "${NOMAD_PORT_http}"
      }

      restart {
        attempts = 5
        interval = "10m"
        delay    = "20s"
        mode     = "fail"
      }

      resources {
        cpu    = 200
        memory = 256
      }
    }
  }
}
|
||||||
512
scripts/analyse_familie.py
Executable file
512
scripts/analyse_familie.py
Executable file
@@ -0,0 +1,512 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Document keyword analyser — misplacement detection across ~/Dokumenter.
|
||||||
|
|
||||||
|
Phase 1 (extract): Hits kreuzberg API for all files, saves progress to JSON.
|
||||||
|
Phase 2 (analyse): Builds folder profiles, flags files likely in the wrong folder.
|
||||||
|
Phase 3 (classify): Deterministic taxonomy classification via weighted keyword scoring (taxonomy.py).
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python3 analyse_familie.py extract
|
||||||
|
python3 analyse_familie.py analyse
|
||||||
|
python3 analyse_familie.py classify
|
||||||
|
python3 analyse_familie.py extract --workers 6 --output my_results.json
|
||||||
|
python3 analyse_familie.py analyse --threshold 0.25
|
||||||
|
python3 analyse_familie.py classify --results dokumenter_keywords.json
|
||||||
|
"""
|
||||||
|
|
||||||
|
import csv
|
||||||
|
import json
|
||||||
|
import argparse
|
||||||
|
import re
|
||||||
|
import time
|
||||||
|
from pathlib import Path
|
||||||
|
from collections import defaultdict, Counter
|
||||||
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||||
|
|
||||||
|
import urllib.request
|
||||||
|
import urllib.error
|
||||||
|
|
||||||
|
# Endpoint and defaults for the extract phase.
API_URL = "https://check.i80.dk/extract"
DEFAULT_DIR = "/home/hjess/Dokumenter"
DEFAULT_OUTPUT = "dokumenter_keywords.json"

# File suffixes (lower-case, with leading dot) that are sent for extraction.
SUPPORTED_EXTS = {".pdf", ".doc", ".docx", ".odt", ".ods", ".ppt", ".pptx", ".txt", ".rtf"}

# Generic words that don't help categorise documents
STOPWORDS = {
    # Danish function words
    "den", "det", "der", "som", "til", "fra", "med", "for", "og", "i", "er",
    "en", "et", "af", "på", "at", "de", "har", "ikke", "vi", "hun", "han",
    "skal", "kan", "var", "men", "når", "også", "bare", "blev",
    # Form field noise
    "navn", "dato", "side", "fulde", "nummer", "adresse", "telefon", "email",
    "cpr", "cpr-nr", "postnr", "underskrift", "forælder", "barnet", "oplysninger",
    "telefonnummer", "telefonnummer i dagtimerne",
    # Email/phone footer boilerplate
    "sendt fra", "sendt fra min", "fra min", "fra min iphone",
    "min iphone", "min iphone den", "iphone den",
    "skrev henrik", "skrev henrik jess", "henrik jess", "henrik",
    "ganstar nielsen wrote", "nielsen wrote",
    # Names — too generic across this specific corpus
    "ganstar", "nielsen", "ganstar nielsen", "tanya ganstar", "tanya ganstar nielsen",
    "henrik jess nielsen", "jess nielsen",
    # Kaiten mail system noise
    "kaiten", "kaiten mail", "med kaiten mail", "via kaiten",
}

SIDER_PATTERN = "Aktindsigt/Sider"
SIDER_SAMPLE = 20  # how many files to sample from each bulk directory
BULK_DIR_THRESHOLD = 50  # dirs with at least this many files get sampled instead of fully processed
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# HTTP helper (stdlib only — no requests dependency)
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def _post_multipart(url, filepath: Path, config: dict, timeout=90) -> dict | None:
|
||||||
|
"""POST a file as multipart/form-data using stdlib only. Retries on 502."""
|
||||||
|
boundary = "----KreuzbergBoundary7MA4YWxkTrZu"
|
||||||
|
config_json = json.dumps(config).encode()
|
||||||
|
|
||||||
|
with open(filepath, "rb") as fh:
|
||||||
|
file_data = fh.read()
|
||||||
|
|
||||||
|
body = (
|
||||||
|
f"--{boundary}\r\n"
|
||||||
|
f'Content-Disposition: form-data; name="files"; filename="{filepath.name}"\r\n'
|
||||||
|
f"Content-Type: application/octet-stream\r\n\r\n"
|
||||||
|
).encode() + file_data + (
|
||||||
|
f"\r\n--{boundary}\r\n"
|
||||||
|
f'Content-Disposition: form-data; name="config"\r\n\r\n'
|
||||||
|
).encode() + config_json + f"\r\n--{boundary}--\r\n".encode()
|
||||||
|
|
||||||
|
req = urllib.request.Request(url, data=body)
|
||||||
|
req.add_header("Content-Type", f"multipart/form-data; boundary={boundary}")
|
||||||
|
req.add_header("Content-Length", str(len(body)))
|
||||||
|
|
||||||
|
retries = 3
|
||||||
|
for attempt in range(retries):
|
||||||
|
try:
|
||||||
|
with urllib.request.urlopen(req, timeout=timeout) as resp:
|
||||||
|
return json.loads(resp.read().decode())
|
||||||
|
except urllib.error.HTTPError as e:
|
||||||
|
if e.code == 502 and attempt < retries - 1:
|
||||||
|
wait = 5 * (2 ** attempt) # 5s, 10s, 20s
|
||||||
|
time.sleep(wait)
|
||||||
|
continue
|
||||||
|
return {"error": f"HTTP {e.code}"}
|
||||||
|
except Exception as e:
|
||||||
|
return {"error": str(e)}
|
||||||
|
return {"error": "HTTP 502 (max retries exceeded)"}
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Extraction
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def extract_file(filepath: Path, max_keywords: int = 15) -> dict:
    """Send one file to the extract API and flatten the reply into a record.

    Args:
        filepath: Document to upload.
        max_keywords: Upper bound passed to the YAKE keyword extractor.

    Returns:
        A dict with keys ``file``, ``keywords`` (list of ``(text, score)``
        with the score rounded to 4 decimals), ``languages``,
        ``content_length`` and ``error`` (``None`` on success). Only the
        first entry of a list response is used.
    """
    path_str = str(filepath)

    def failed(reason) -> dict:
        # Uniform shape for every failure so callers can treat records alike.
        return {"file": path_str, "keywords": [], "languages": [], "content_length": 0, "error": reason}

    payload = _post_multipart(
        API_URL,
        filepath,
        {"keywords": {"algorithm": "yake", "max_keywords": max_keywords}},
    )

    if payload is None:
        return failed("unknown")
    if isinstance(payload, dict) and "error" in payload:
        return failed(payload.get("error", "unknown"))

    # The API may answer with a single object or a list of them.
    items = payload if isinstance(payload, list) else [payload]
    if not items:
        return failed("empty response")

    first = items[0]
    keywords = []
    for entry in first.get("extracted_keywords") or []:
        keywords.append((entry["text"], round(entry["score"], 4)))

    return {
        "file": path_str,
        "keywords": keywords,
        "languages": first.get("detected_languages") or [],
        "content_length": len(first.get("content") or ""),
        "error": None,
    }
|
||||||
|
|
||||||
|
|
||||||
|
def collect_files(directory: Path, bulk_threshold: int = BULK_DIR_THRESHOLD) -> tuple[list[Path], list[Path]]:
    """Split the supported files under *directory* into regular files and bulk dirs.

    A directory is "bulk" when it directly contains at least *bulk_threshold*
    supported files and none of its ancestors is already bulk. Files in a
    bulk directory, or anywhere below one, are left out of the regular list.

    Args:
        directory: Root directory, scanned recursively.
        bulk_threshold: Minimum number of direct files that makes a directory bulk.

    Returns:
        ``(regular_files, bulk_dirs)``. Regular files keep sorted scan order;
        bulk dirs are ordered shallowest first.
    """
    # First pass: group supported files by their parent directory.
    dir_files: dict[Path, list[Path]] = defaultdict(list)
    for f in sorted(directory.rglob("*")):
        if f.is_file() and f.suffix.lower() in SUPPORTED_EXTS:
            dir_files[f.parent].append(f)

    # Second pass: pick bulk dirs shallowest-first. Sorted scan order can visit
    # "A/B/" before "A/"'s own files, so deciding in scan order would let files
    # of a subdirectory slip into `regular` before its ancestor is known to be bulk.
    bulk_dirs: list[Path] = []
    bulk_set: set[Path] = set()
    for parent in sorted(dir_files, key=lambda p: (len(p.parts), p)):
        if not bulk_set.isdisjoint(parent.parents):
            continue
        if len(dir_files[parent]) >= bulk_threshold:
            bulk_dirs.append(parent)
            bulk_set.add(parent)

    # Third pass: everything not inside a bulk dir is processed individually.
    regular: list[Path] = []
    for parent, files in dir_files.items():
        if parent in bulk_set or not bulk_set.isdisjoint(parent.parents):
            continue
        regular.extend(files)

    return regular, bulk_dirs
|
||||||
|
|
||||||
|
|
||||||
|
def analyse_bulk_group(bulk_dir: Path) -> dict:
    """Summarise a large directory from the first SIDER_SAMPLE of its files.

    The supported files under *bulk_dir* (recursive, sorted) are listed and
    the first SIDER_SAMPLE are extracted. The group's keywords are the 15
    most frequent lower-cased keywords across the sample, each scored by the
    share of sampled files it appeared in.

    Returns:
        A group record shaped like an ``extract_file`` result, with
        ``is_group`` set and ``content_length`` fixed to -1.
    """
    candidates = []
    for entry in sorted(bulk_dir.rglob("*")):
        if entry.is_file() and entry.suffix.lower() in SUPPORTED_EXTS:
            candidates.append(entry)

    picked = candidates[:SIDER_SAMPLE]
    total = len(candidates)
    n_picked = len(picked)
    print(f" Sampling {n_picked}/{total} files from {bulk_dir.name}/…")

    # Count in how many extractions each keyword shows up.
    tally: Counter = Counter()
    for path in picked:
        extraction = extract_file(path, max_keywords=10)
        tally.update(kw.lower() for kw, _ in extraction["keywords"])

    top_keywords = []
    for kw, count in tally.most_common(15):
        top_keywords.append((kw, round(count / n_picked, 3)))

    return {
        "file": str(bulk_dir) + f"/ [GROUP — {total} files, sampled {n_picked}]",
        "keywords": top_keywords,
        "languages": ["dan"],
        "content_length": -1,
        "is_group": True,
        "error": None,
    }
|
||||||
|
|
||||||
|
|
||||||
|
def run_extract(directory: str, output_file: str, workers: int, bulk_threshold: int = BULK_DIR_THRESHOLD) -> None:
    """Phase 1: extract keywords for every supported file and save them as JSON.

    Progress is written to *output_file* every 20 files and again at the end,
    so an interrupted run can be resumed. On resume, records whose error is a
    transient HTTP 502 are retried; all other records (including other
    errors) are kept. Bulk directories are summarised as one sampled group
    record each, and an existing group record is not recomputed.

    Args:
        directory: Root directory to scan.
        output_file: JSON progress/result file.
        workers: Number of parallel extraction threads.
        bulk_threshold: Passed through to ``collect_files``.
    """
    dir_path = Path(directory)
    out_path = Path(output_file)
    files, bulk_dirs = collect_files(dir_path, bulk_threshold=bulk_threshold)

    # Resume from existing output — skip everything except transient 502 failures.
    results: dict[str, dict] = {}
    retriable_errors = {"HTTP 502", "HTTP 502 (max retries exceeded)"}
    if out_path.exists():
        with open(out_path, encoding="utf-8") as fh:
            for r in json.load(fh):
                # Retry transient server errors; keep permanent errors (422 etc.)
                if r.get("error") in retriable_errors:
                    continue
                results[r["file"]] = r
        print(f"Resuming — {len(results)} files already done")

    todo = [f for f in files if str(f) not in results]
    total = len(todo)
    print(f"Files to process: {total} (skipping {len(files) - total} already done)")
    if bulk_dirs:
        print(f"Bulk directories (sampled): {[str(d) for d in bulk_dirs]}\n")

    done = 0
    errors = 0

    def save() -> None:
        # Write to a sibling temp file and swap it in, so an interrupted run
        # never leaves a truncated progress file behind.
        tmp_path = out_path.with_name(out_path.name + ".tmp")
        with open(tmp_path, "w", encoding="utf-8") as fh:
            json.dump(list(results.values()), fh, ensure_ascii=False, indent=2)
        tmp_path.replace(out_path)

    with ThreadPoolExecutor(max_workers=workers) as ex:
        futures = {ex.submit(extract_file, f): f for f in todo}
        for future in as_completed(futures):
            try:
                result = future.result()
            except Exception as exc:
                # One failing file must not abort the whole run; record it instead.
                result = {
                    "file": str(futures[future]),
                    "keywords": [],
                    "languages": [],
                    "content_length": 0,
                    "error": str(exc),
                }
            results[result["file"]] = result
            done += 1
            if result["error"]:
                errors += 1
                print(f" ❌ [{done}/{total}] {Path(result['file']).name}: {result['error']}")
            else:
                preview = ", ".join(kw for kw, _ in result["keywords"][:3])
                print(f" ✅ [{done}/{total}] {Path(result['file']).name} → {preview}")
            if done % 20 == 0:
                save()

    save()

    # Handle bulk dirs as sampled groups
    for bulk_dir in bulk_dirs:
        # Group records are keyed "<dir>/ [GROUP — N files, sampled M]", so an
        # existing record is found by prefix rather than by an exact key.
        group_prefix = str(bulk_dir) + "/ [GROUP"
        if any(key.startswith(group_prefix) for key in results):
            continue
        print(f"\nAnalysing bulk group: {bulk_dir.name}/")
        group = analyse_bulk_group(bulk_dir)
        results[group["file"]] = group
        save()
        print(f" Top keywords: {', '.join(kw for kw, _ in group['keywords'][:5])}")

    print(f"\n✅ Finished: {done} files, {errors} errors → {out_path}")
    print(f" Run 'python3 {__file__} analyse' to find misplaced files")
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Phase 2: Misplacement detection
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def _clean_keywords(raw: list[tuple[str, float]]) -> list[str]:
    """Normalise extracted keywords and drop the uninformative ones.

    Each keyword is lower-cased and stripped. It is kept only if the
    normalised form has at least 4 characters and is not in STOPWORDS.

    Args:
        raw: ``(keyword, score)`` pairs; ``None`` or empty yields ``[]``.

    Returns:
        The normalised keywords, in input order (duplicates preserved).
    """
    cleaned: list[str] = []
    for kw, _score in raw or []:
        norm = kw.lower().strip()
        # Measure the normalised form so padding cannot carry a short token past the filter.
        if len(norm) >= 4 and norm not in STOPWORDS:
            cleaned.append(norm)
    return cleaned
|
||||||
|
|
||||||
|
|
||||||
|
def _folder_key(filepath: Path, base: Path) -> str:
|
||||||
|
"""Return the relative folder path (e.g. 'Privat/Økonomi/Gæld')."""
|
||||||
|
try:
|
||||||
|
return str(filepath.relative_to(base).parent)
|
||||||
|
except ValueError:
|
||||||
|
return ""
|
||||||
|
|
||||||
|
|
||||||
|
def run_analyse(results_file: str, base_dir: str, threshold: float, min_folder_docs: int) -> None:
    """Phase 2: flag files whose keywords fit their current folder poorly.

    A keyword profile (keyword → number of occurrences) is built per folder.
    A file's overlap is the share of its cleaned keywords that occur at least
    twice in its own folder's profile. Files below *threshold* in a folder
    with at least *min_folder_docs* documents are reported. The suggestion is
    the other reference folder whose profile scores highest for the file's
    keywords, or empty when no folder scores above zero.

    The result is printed (first 40 entries) and written in full to
    ``<results_file stem>.misplaced.csv``.

    Args:
        results_file: JSON file produced by the extract phase.
        base_dir: Root directory the extract phase scanned.
        threshold: Overlap below which a file is flagged (0–1).
        min_folder_docs: Minimum documents for a folder to count as a reference.
    """
    with open(results_file, encoding="utf-8") as fh:
        results = json.load(fh)

    base = Path(base_dir)
    print(f"Analysing {len(results)} document records…")

    # Sampled group records describe directories, not files, so they are ignored.
    documents = [r for r in results if not r.get("is_group")]

    # Build folder profiles and per-folder document counts in one pass.
    folder_profiles: dict[str, Counter] = defaultdict(Counter)
    folder_doc_counts: Counter = Counter()
    file_kws: dict[str, list[str]] = {}
    for r in documents:
        kws = _clean_keywords(r.get("keywords") or [])
        folder = _folder_key(Path(r["file"]), base)
        file_kws[r["file"]] = kws
        folder_doc_counts[folder] += 1
        if folder:
            folder_profiles[folder].update(kws)

    # Reference folders: enough documents, and a real sub-folder. The root "."
    # and the outside-base marker "" are never useful move targets.
    valid_folders = {
        f for f, c in folder_doc_counts.items()
        if c >= min_folder_docs and f not in ("", ".")
    }
    print(f"Folder profiles built: {len(valid_folders)} folders with {min_folder_docs}+ documents\n")

    def _is_ukendt(f) -> bool:
        return f is not None and "Ukendt" in f

    # Score each file against its own folder, flag low-overlap files
    misplaced: list[dict] = []
    unclassified: list[dict] = []

    for r in documents:
        fp = Path(r["file"])
        folder = _folder_key(fp, base)
        kws = file_kws.get(r["file"], [])

        if not kws:
            continue

        if folder == "." or folder == "":
            # File sits directly in the root — needs a home
            unclassified.append(r)
            continue

        profile = folder_profiles.get(folder, Counter())
        # Overlap: share of file's keywords that appear ≥2 times in folder profile
        shared = sum(1 for kw in kws if profile[kw] >= 2)
        overlap = shared / len(kws)

        if overlap >= threshold or folder not in valid_folders:
            continue

        # Best alternative folder. Sorted iteration with a strict ">" makes the
        # choice deterministic, and no folder is suggested at score 0.
        best_folder = None
        best_score = 0
        for candidate in sorted(valid_folders - {folder}):
            score = sum(folder_profiles[candidate][kw] for kw in kws)
            if score > best_score:
                best_folder, best_score = candidate, score

        # Skip if both current and suggested are generic "Ukendt" dump folders —
        # moving Ukendt/PDF/2011 → Ukendt/PDF/2009 is not an improvement
        if _is_ukendt(folder) and _is_ukendt(best_folder):
            continue

        misplaced.append({
            "file": r["file"],
            "filename": fp.name,
            "current_folder": folder,
            "overlap": round(overlap, 2),
            "suggested_folder": best_folder or "",
            "suggestion_score": best_score,
            "top_keywords": "; ".join(kws[:6]),
        })

    misplaced.sort(key=lambda x: x["overlap"])

    # Print summary
    print(f"{'─'*65}")
    print(f"Potentially misplaced: {len(misplaced)} files (overlap < {threshold:.0%})\n")

    for m in misplaced[:40]:
        print(f" 📄 {m['filename']}")
        print(f" Current: {m['current_folder']}")
        print(f" Suggested: {m['suggested_folder']} (overlap={m['overlap']:.0%})")
        print(f" Keywords: {m['top_keywords']}")
        print()

    if len(misplaced) > 40:
        print(f" … and {len(misplaced) - 40} more — see CSV for full list\n")

    if unclassified:
        # Files directly in the root (or outside it) have no folder to compare against.
        print(f"Unfiled (no sub-folder): {len(unclassified)} files\n")

    # Save CSV for easy review / filtering in a spreadsheet
    csv_path = Path(results_file).with_suffix(".misplaced.csv")
    with open(csv_path, "w", newline="", encoding="utf-8") as csvf:
        writer = csv.DictWriter(
            csvf,
            fieldnames=["filename", "current_folder", "overlap", "suggested_folder",
                        "suggestion_score", "top_keywords", "file"],
        )
        writer.writeheader()
        writer.writerows(misplaced)

    print(f"{'─'*65}")
    print(f"✅ Saved {len(misplaced)} misplaced records → {csv_path}")
    print(" Open in LibreOffice Calc, sort by 'overlap' asc to prioritise.")
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Phase 3: taxonomy classification via keyword scoring
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Taxonomy data and scorer live in taxonomy.py (shared with classify_server.py).
|
||||||
|
|
||||||
|
import sys as _sys
|
||||||
|
import os as _os
|
||||||
|
_sys.path.insert(0, _os.path.dirname(__file__))
|
||||||
|
from taxonomy import TAXONOMY, TAXONOMY_TO_FOLDER, keyword_score as _keyword_score # noqa: E402
|
||||||
|
|
||||||
|
|
||||||
|
def run_classify(results_file: str, base_dir: str = DEFAULT_DIR, min_score: float = 1.5) -> None:
    """Phase 3: classify each document into a taxonomy category using keyword scoring.

    For every successfully extracted, non-group record, a text is built from
    the filename, the folder path and the YAKE keywords. That text is scored
    against every category in TAXONOMY and the highest score wins. Documents
    whose best score is below *min_score* are labelled 'Ukendt'. No ML model
    is required.

    The rows are written to ``<results_file stem>.classified.csv``, sorted by
    ascending score, and a label histogram is printed.

    Args:
        results_file: JSON file produced by the extract phase.
        base_dir: Root directory, used to compute each file's current folder.
        min_score: Minimum best score required to assign a label.
    """
    results_path = Path(results_file)
    if not results_path.exists():
        print(f"❌ Results file not found: {results_file}")
        print(" Run 'extract' phase first.")
        return

    with open(results_path, encoding="utf-8") as fh:
        results: list[dict] = json.load(fh)

    # Include files even without YAKE keywords — filename+folder alone can classify
    classifiable = [r for r in results if not r.get("is_group") and not r.get("error")]
    print(f"Classifying {len(classifiable)} documents…")

    base = Path(base_dir)
    output_rows: list[dict] = []

    for r in classifiable:
        fp = Path(r["file"])
        # "keywords" may be missing or JSON null; treat both as no keywords.
        keywords = [kw for kw, _ in (r.get("keywords") or [])]

        # Build document text from filename tokens + full folder path + YAKE keywords
        stem_tokens = re.sub(r"[-_.]", " ", fp.stem).lower()
        # NOTE(review): this uses the full parent path, so the components of
        # base_dir itself are part of every document's text. Confirm that is intended.
        folder_tokens = " ".join(fp.parent.parts).lower()
        kw_text = " ".join(keywords)
        doc_text = f"{stem_tokens} {folder_tokens} {kw_text}"

        scores: dict[str, float] = {
            cat: _keyword_score(doc_text, kws)
            for cat, kws in TAXONOMY.items()
        }

        # One stable descending sort yields both winner and runner-up; ties
        # keep TAXONOMY order, which matches what max() would pick.
        ranked = sorted(scores, key=lambda c: scores[c], reverse=True)
        best_label = ranked[0]
        best_score = scores[best_label]
        runner_up = ranked[1] if len(ranked) > 1 else ""
        runner_score = scores[runner_up] if runner_up else 0.0

        current_folder = _folder_key(fp, base)
        label = best_label if best_score >= min_score else "Ukendt"
        suggested_folder = TAXONOMY_TO_FOLDER.get(label, "") if label != "Ukendt" else ""

        output_rows.append({
            "filename": fp.name,
            "current_folder": current_folder,
            "taxonomy_label": label,
            "confidence": best_score,
            "runner_up": runner_up,
            "runner_up_score": runner_score,
            "suggested_folder": suggested_folder,
            "top_keywords": "; ".join(keywords[:6]),
            "file": r["file"],
        })

    # Sort by confidence ascending — lowest confidence = needs most attention
    output_rows.sort(key=lambda x: x["confidence"])

    csv_path = Path(results_file).with_suffix(".classified.csv")
    with open(csv_path, "w", newline="", encoding="utf-8") as csvf:
        writer = csv.DictWriter(
            csvf,
            fieldnames=["filename", "current_folder", "taxonomy_label", "confidence",
                        "runner_up", "runner_up_score", "suggested_folder", "top_keywords", "file"],
        )
        writer.writeheader()
        writer.writerows(output_rows)

    # Print distribution summary
    label_counts: Counter = Counter(row["taxonomy_label"] for row in output_rows)
    print(f"\n{'─'*65}")
    print(f"Taxonomy distribution ({len(output_rows)} documents):\n")
    # Bars are scaled so the most common label gets 30 blocks.
    largest = max(label_counts.values(), default=1)
    for label, count in label_counts.most_common():
        bar = "█" * (count * 30 // largest)
        print(f" {label:<30} {count:>4} {bar}")

    low_conf = sum(1 for row in output_rows if row["confidence"] < min_score)
    print(f"\n Low score (<{min_score}): {low_conf} files → labelled 'Ukendt'")
    print(f"\n{'─'*65}")
    print(f"✅ Saved {len(output_rows)} classifications → {csv_path}")
    print(" Columns: filename, current_folder, taxonomy_label, confidence, suggested_folder")
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# CLI
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Command-line entry point: one sub-command per phase.
    parser = argparse.ArgumentParser(
        description="Document keyword extraction + misplacement detector",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__,
    )
    commands = parser.add_subparsers(dest="cmd", required=True)

    # Phase 1
    extract_parser = commands.add_parser("extract", help="Phase 1: extract keywords from all files")
    extract_parser.add_argument("--dir", default=DEFAULT_DIR, help="Root directory to scan")
    extract_parser.add_argument("--output", default=DEFAULT_OUTPUT, help="Output JSON file")
    extract_parser.add_argument(
        "--workers",
        type=int,
        default=3,
        help="Parallel API workers (default 3 — keep low to avoid 502s)",
    )
    extract_parser.add_argument(
        "--bulk-threshold",
        type=int,
        default=BULK_DIR_THRESHOLD,
        help=f"Dirs with this many files are sampled instead of fully processed (default {BULK_DIR_THRESHOLD}). Use 999999 to process all files.",
    )

    # Phase 2
    analyse_parser = commands.add_parser("analyse", help="Phase 2: find misplaced files using folder profiles")
    analyse_parser.add_argument("--results", default=DEFAULT_OUTPUT, help="JSON file from extract phase")
    analyse_parser.add_argument("--dir", default=DEFAULT_DIR, help="Root directory (same as extract)")
    analyse_parser.add_argument(
        "--threshold",
        type=float,
        default=0.25,
        help="Overlap threshold below which a file is flagged (default 0.25 = 25%%)",
    )
    analyse_parser.add_argument(
        "--min-folder-docs",
        type=int,
        default=5,
        help="Minimum docs in a folder to be used as a reference profile (default 5)",
    )

    # Phase 3
    classify_parser = commands.add_parser("classify", help="Phase 3: keyword-scoring taxonomy classification")
    classify_parser.add_argument("--results", default=DEFAULT_OUTPUT, help="JSON file from extract phase")
    classify_parser.add_argument("--dir", default=DEFAULT_DIR, help="Root directory (same as extract)")
    classify_parser.add_argument(
        "--min-score",
        type=float,
        default=1.5,
        help="Minimum keyword score to assign a label (default 1.5). Below this → 'Ukendt'.",
    )

    args = parser.parse_args()

    # The sub-command is required, so args.cmd is always one of these three keys.
    dispatch = {
        "extract": lambda a: run_extract(a.dir, a.output, a.workers, a.bulk_threshold),
        "analyse": lambda a: run_analyse(a.results, a.dir, a.threshold, a.min_folder_docs),
        "classify": lambda a: run_classify(a.results, a.dir, a.min_score),
    }
    dispatch[args.cmd](args)
|
||||||
117
scripts/classify_server.py
Normal file
117
scripts/classify_server.py
Normal file
@@ -0,0 +1,117 @@
|
|||||||
|
"""classify_server.py — FastAPI service that adds taxonomy classification to kreuzberg /extract.
|
||||||
|
|
||||||
|
Exposes POST /classify — same multipart interface as kreuzberg /extract,
|
||||||
|
returns the full kreuzberg response plus category/subcategory/confidence fields.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
uvicorn --app-dir scripts classify_server:app --host 0.0.0.0 --port 8001
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
from contextlib import asynccontextmanager
|
||||||
|
from typing import Annotated
|
||||||
|
|
||||||
|
import httpx
|
||||||
|
from fastapi import FastAPI, File, Form, UploadFile
|
||||||
|
from fastapi.responses import JSONResponse
|
||||||
|
|
||||||
|
from taxonomy import classify_text
|
||||||
|
|
||||||
|
KREUZBERG_URL = os.getenv("KREUZBERG_URL", "https://check.i80.dk")
|
||||||
|
YAKE_CONFIG = {"keywords": {"algorithm": "yake", "max_keywords": 15}}
|
||||||
|
|
||||||
|
|
||||||
|
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Own the shared HTTP client for the lifetime of the application.

    One ``httpx.AsyncClient`` (60 s timeout) is created at startup and stored
    on ``app.state.client``. The ``async with`` block closes it when the
    application shuts down, including when shutdown is caused by an exception.
    """
    async with httpx.AsyncClient(timeout=60.0) as client:
        app.state.client = client
        yield
|
||||||
|
|
||||||
|
|
||||||
|
app = FastAPI(
|
||||||
|
title="kreuzberg-classify",
|
||||||
|
description="Taxonomy classification on top of kreuzberg /extract",
|
||||||
|
version="1.0.0",
|
||||||
|
lifespan=lifespan,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@app.get("/health")
async def health():
    """Report that the service is up, along with the configured kreuzberg URL.

    No request is made to kreuzberg; the URL is echoed back from configuration.
    """
    payload = {
        "status": "healthy",
        "kreuzberg_url": KREUZBERG_URL,
    }
    return payload
|
||||||
|
|
||||||
|
|
||||||
|
def _merge_config(config: str | None) -> dict:
    """Return the YAKE defaults overlaid with the caller's JSON config.

    Best effort: config that is not valid JSON, or whose top level is not a
    JSON object, is ignored and the defaults are returned unchanged.
    """
    merged = dict(YAKE_CONFIG)
    if not config:
        return merged
    try:
        caller_cfg = json.loads(config)
    except json.JSONDecodeError:
        return merged
    # Only a JSON object can be merged key-by-key; a list, string or number
    # would make dict.update() raise and turn the request into a 500.
    if isinstance(caller_cfg, dict):
        merged.update(caller_cfg)
    return merged


def _keyword_strings(raw_keywords) -> list[str]:
    """Flatten the ``keywords`` value of a kreuzberg result into plain strings.

    Dict entries contribute their ``keyword`` or ``phrase`` value; any other
    entry is stringified. Empty values are dropped.
    """
    if not isinstance(raw_keywords, (list, tuple)):
        return []
    strings = []
    for entry in raw_keywords:
        if isinstance(entry, dict):
            # "text" is accepted as an additional fallback key.
            # NOTE(review): confirm which key the deployed kreuzberg version emits.
            value = entry.get("keyword") or entry.get("phrase") or entry.get("text") or ""
        else:
            value = entry
        text = str(value) if value is not None else ""
        if text:
            strings.append(text)
    return strings


@app.post("/classify")
async def classify(
    files: Annotated[list[UploadFile], File()],
    config: Annotated[str | None, Form()] = None,
    folder: Annotated[str | None, Form()] = None,
):
    """Extract text + keywords via kreuzberg, then classify into taxonomy.

    Args:
        files: One or more document files (PDF, DOCX, etc.)
        config: Optional JSON config for kreuzberg (merged with YAKE defaults).
            Invalid JSON or a non-object value is ignored.
        folder: Optional current folder path for context hint.

    Returns:
        List of results — one per kreuzberg result — with all kreuzberg fields plus:
        category, subcategory, confidence, runner_up, runner_up_score.
        Classification fields take precedence over kreuzberg fields of the
        same name. If kreuzberg cannot be reached, answers with an error
        status, or returns something that is not a JSON object / list of
        objects, a 502 response with an ``error`` message is returned instead.
    """
    config_json = json.dumps(_merge_config(config))

    # Read every upload once and build the multipart payload for kreuzberg.
    outbound_files = []
    for upload in files:
        payload = await upload.read()
        mime_type = upload.content_type or "application/octet-stream"
        outbound_files.append(("files", (upload.filename, payload, mime_type)))

    try:
        response = await app.state.client.post(
            f"{KREUZBERG_URL}/extract",
            files=outbound_files,
            data={"config": config_json},
        )
        response.raise_for_status()
        kreuzberg_results = response.json()
    except httpx.HTTPError as exc:
        return JSONResponse(status_code=502, content={"error": f"kreuzberg error: {exc}"})
    except ValueError as exc:
        # response.json() raises a ValueError subclass when the body is not JSON;
        # that is an upstream failure, not an internal server error.
        return JSONResponse(
            status_code=502,
            content={"error": f"kreuzberg error: invalid JSON response: {exc}"},
        )

    # A single result may come back as a bare object; normalise to a list.
    if isinstance(kreuzberg_results, dict):
        kreuzberg_results = [kreuzberg_results]
    if not isinstance(kreuzberg_results, list) or not all(
        isinstance(item, dict) for item in kreuzberg_results
    ):
        return JSONResponse(
            status_code=502,
            content={"error": "kreuzberg error: unexpected response shape"},
        )

    folder_hint = folder or ""

    results = []
    for result in kreuzberg_results:
        classification = classify_text(
            content=result.get("content") or "",
            keywords=_keyword_strings(result.get("keywords")),
            folder_hint=folder_hint,
        )
        results.append({**result, **classification})

    return results
|
||||||
231
scripts/taxonomy.py
Normal file
231
scripts/taxonomy.py
Normal file
@@ -0,0 +1,231 @@
|
|||||||
|
"""Shared taxonomy: weighted keyword lists + folder mappings + scorer.
|
||||||
|
|
||||||
|
Used by both analyse_familie.py (batch classify) and classify_server.py (API endpoint).
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
# Category name -> list of (keyword, weight) pairs.
# keyword_score() lowercases both the keywords and the document; entries that
# contain a space are matched as substrings, all others as whole words.
TAXONOMY: dict[str, list[tuple[str, float]]] = {
    "Familie og børn": [
        ("familie", 1.5), ("familieliv", 2.0), ("samvær", 2.0), ("samværsaftale", 2.5),
        ("børn", 1.5), ("barn", 1.5), ("forældre", 2.0), ("forældremyndighed", 2.5),
        ("skilsmisse", 2.5), ("separation", 2.0), ("barsel", 2.0), ("barnets", 1.5),
        ("søskende", 2.0), ("mor", 1.0), ("far", 1.0), ("mor og far", 2.5),
        ("dåb", 2.0), ("konfirmation", 2.5), ("bryllup", 2.0), ("vielse", 2.0),
        # Spelled with "ø" like every other entry so it matches real Danish text.
        ("fødselsdag", 1.5), ("fødselsdagskort", 2.0),
    ],
    "Skole og uddannelse": [
        ("skole", 1.5), ("uddannelse", 1.5), ("gymnasium", 2.5), ("universitetet", 2.0),
        ("eksamen", 2.0), ("studieplan", 2.5), ("karakter", 2.0), ("lektier", 2.5),
        ("opgave", 1.5), ("matematik", 2.0), ("dansk", 1.0), ("noter", 1.5),
        ("pensum", 2.5), ("studie", 1.5), ("kursus", 1.5), ("folkeskole", 2.5),
        ("htx", 2.5), ("hf", 2.0), ("hhx", 2.5), ("stx", 2.5), ("eux", 2.5),
        ("karakterblad", 3.0), ("eksamensbevis", 3.0), ("studiekort", 3.0),
        ("answer key", 2.5), ("quiz", 2.5), ("assessment", 2.5), ("learning", 1.5),
        ("lecture", 2.0), ("course", 2.0), ("lesson", 2.0), ("worksheet", 2.5),
    ],
    "Arbejde og karriere": [
        ("ansøgning", 1.5), ("job", 1.5), ("jobansøgning", 2.5), ("cv", 2.5),
        ("curriculum vitae", 3.0), ("opsigelse", 2.5), ("løn", 1.5), ("lønforhandling", 2.5),
        ("ansættelseskontrakt", 3.0), ("ansættelse", 2.0), ("arbejdsplads", 2.0),
        ("kollega", 2.0), ("arbejdsgiver", 2.5), ("medarbejder", 2.0), ("fagforening", 2.5),
        ("a-kasse", 2.5), ("dagpenge", 2.5), ("jobcenter", 2.5), ("referenceliste", 3.0),
        ("karriere", 2.0), ("rekruttering", 2.5), ("personaleafdeling", 2.5),
        ("arbejde", 1.5), ("projektleder", 2.5), ("møde", 1.5), ("mødedagsorden", 2.5),
        ("scrum", 2.5), ("agile", 2.5), ("backlog", 2.5), ("sprint", 2.5),
        ("konference", 2.0), ("kompetencer", 2.0),
    ],
    "Økonomi og regninger": [
        ("faktura", 2.5), ("regning", 2.0), ("betaling", 2.0), ("bank", 2.0),
        ("skat", 2.0), ("pension", 2.0), ("opsparing", 2.5), ("gæld", 2.5),
        ("lån", 2.5), ("kredit", 2.0), ("inkasso", 3.0), ("afdrag", 2.5),
        ("akkord", 3.0), ("restgæld", 3.0), ("kreditor", 2.5), ("økonomi", 1.5),
        ("budget", 2.0), ("forsikring", 2.0), ("rykkerbrev", 3.0), ("udbetaling", 2.0),
        ("sparekasse", 2.5), ("betalingsservice", 3.0), ("gældstyrelsen", 3.0),
        ("netto bank", 2.5), ("netbank", 2.5), ("kontoudtog", 3.0), ("årsopgørelse", 2.5),
        ("restskat", 3.0), ("årsopgørelse skat", 3.0), ("momsangivelse", 3.0),
    ],
    "Hjem og bolig": [
        ("bolig", 2.0), ("hus", 1.5), ("lejlighed", 2.5), ("ejendom", 2.0),
        ("husleje", 3.0), ("vedligeholdelse", 2.5), ("renovation", 2.5),
        ("el", 1.0), ("vand", 1.0), ("varme", 1.5), ("fjernvarme", 2.5),
        ("ejerforening", 3.0), ("andelsbolig", 3.0), ("lejekontrakt", 3.0),
        ("fremlejning", 2.5), ("nøgle", 1.5), ("flytning", 2.0),
        ("indretning", 2.0), ("have", 1.5), ("grundejerforening", 3.0),
        ("BBR", 2.5), ("byggetilladelse", 3.0),
    ],
    "Jura og kontrakter": [
        ("kontrakt", 2.0), ("aftale", 2.0), ("kontrakter", 2.0), ("juridisk", 2.5),
        ("advokat", 2.5), ("testamente", 3.0), ("retssag", 3.0), ("dom", 2.0),
        ("stævning", 3.0), ("klage", 2.0), ("tinglysning", 3.0), ("pantebrev", 3.0),
        ("tilbud", 1.5), ("vilkår", 2.0), ("betingelser", 2.0), ("fuldmagt", 2.5),
        ("forlig", 2.5), ("forsikringsbetingelser", 3.0), ("police", 2.0),
    ],
    "Sundhed og medicin": [
        ("recept", 2.5), ("medicin", 2.5), ("læge", 2.5), ("hospital", 2.5),
        ("sygdom", 2.5), ("behandling", 2.0), ("diagnose", 3.0), ("operation", 2.5),
        ("symptomer", 2.5), ("sundhed", 2.0), ("journaloplysning", 3.0),
        ("patientjournal", 3.0), ("laboratorium", 2.5), ("blodprøve", 3.0),
        ("røntgen", 3.0), ("psykolog", 3.0), ("psykiater", 3.0), ("terapi", 2.5),
        ("tandlæge", 3.0), ("optiker", 2.5), ("vaccination", 3.0),
    ],
    "IT og teknologi": [
        ("software", 2.5), ("server", 2.0), ("netværk", 2.5), ("database", 2.5),
        ("programmering", 2.5), ("kode", 2.0), ("linux", 3.0), ("cloud", 2.5),
        ("it", 1.5), ("computer", 2.0), ("laptop", 2.5), ("password", 2.5),
        ("installation", 2.0), ("konfiguration", 2.0), ("log", 1.5), ("backup", 2.5),
        ("docker", 3.0), ("kubernetes", 3.0), ("python", 3.0), ("github", 3.0),
        ("azure", 2.5), ("windows", 2.0), ("macos", 3.0), ("licens", 2.0),
        ("api", 2.5), ("dokumentation", 1.5), ("teknologi", 2.0), ("system", 1.5),
        ("web", 2.0), ("app", 1.5), ("program", 1.5), ("firmware", 3.0),
        ("internet", 2.0), ("cybersikkerhed", 3.0),
        ("bitcoin", 3.0), ("blockchain", 3.0), ("kryptovaluta", 3.0), ("jupyter", 3.0),
        ("notebook", 2.5), ("monitor", 2.0), ("display", 2.0), ("remote control", 2.5),
        ("user manual", 2.5), ("dataanalyse", 2.5), ("data analysis", 2.5),
        ("django", 3.0), ("javascript", 2.5), ("jquery", 2.5), ("typescript", 3.0),
        ("html", 2.0), ("css", 2.0), ("react", 2.5), ("nodejs", 3.0), ("java", 2.5),
        ("csharp", 3.0), ("datamatiker", 3.0), ("sql", 2.5), ("rest api", 3.0),
        ("programming", 2.5), ("developer", 2.0), ("debugging", 2.5), ("testing", 2.0),
    ],
    "Bøger og litteratur": [
        ("isbn", 3.0), ("forlag", 1.5), ("roman", 3.0), ("novelle", 3.0),
        ("biografi", 3.0), ("poesi", 3.0), ("digtsamling", 3.0), ("bog", 2.0),
        ("litteratur", 2.5), ("forfatter", 3.0), ("kapitel", 2.0), ("bogklub", 3.0),
        ("bibliotek", 2.5), ("e-bog", 3.0), ("lydbog", 3.0), ("udgivelse", 2.0),
        ("biography", 2.5), ("novel", 2.5), ("author", 2.0), ("chapter", 2.0),
        ("publisher", 2.0), ("edition", 2.0), ("paperback", 3.0), ("hardcover", 3.0),
        ("fiction", 3.0), ("nonfiction", 3.0), ("memoir", 3.0),
    ],
    "Rejse og transport": [
        ("rejse", 2.0), ("ferie", 2.0), ("fly", 2.5), ("hotel", 2.5),
        ("booking", 2.5), ("rejseplan", 2.5), ("pas", 2.0), ("visum", 3.0),
        ("bil", 1.5), ("kørekort", 3.0), ("tog", 2.0), ("billet", 2.5),
        ("flyrejse", 3.0), ("afgangsgate", 3.0), ("baggage", 2.5), ("cruise", 3.0),
        ("afrejse", 2.5), ("ankomst", 2.0), ("itinerary", 3.0), ("pakketur", 2.5),
    ],
    "Offentlige myndigheder": [
        ("kommune", 2.5), ("stat", 1.5), ("styrelse", 2.5), ("forvaltning", 2.5),
        ("gældstyrelsen", 3.0), ("skat", 2.0), ("udbetaling danmark", 3.0),
        ("borger.dk", 3.0), ("digitalpost", 2.5), ("afgørelse", 2.5),
        # NOTE(review): "ministeri" looks like a stem, but single words are
        # matched as whole words, so it only hits that exact token — confirm intent.
        ("offentlig myndighed", 3.0), ("ministeri", 2.5), ("ministeriet", 2.5),
        ("politi", 2.5), ("domstol", 2.5), ("retsinformation", 3.0),
        ("folketing", 2.5), ("region", 2.0), ("jobcenter", 2.5),
        ("borger", 1.5), ("ansøgning kommune", 3.0), ("nykøbingvej", 2.5),
        ("sakskøbing", 2.5), ("akkordansøgning", 3.0),
    ],
    "Projekter og hobby": [
        ("hobby", 2.5), ("projekt", 2.0), ("frivillig", 2.5), ("klub", 2.0),
        ("aktivitet", 2.0), ("sport", 2.5), ("musik", 2.5), ("opskrift", 2.5),
        ("træning", 2.0), ("kreativ", 2.5), ("håndværk", 2.5), ("fotografi", 2.5),
        ("spil", 2.0), ("gaming", 3.0), ("maleri", 2.5), ("tegning", 2.0),
        ("golf", 3.0), ("fitness", 2.5), ("klippekort", 2.5), ("svømning", 2.5),
        ("cykling", 2.5), ("løb", 1.5), ("boldspil", 2.5), ("fodbold", 2.5),
        ("concert", 2.0), ("festival", 2.5),
    ],
    "Teknik og ingeniørfag": [
        ("tegning", 2.0), ("teknisk tegning", 3.0), ("ingeniør", 2.5), ("konstruktion", 2.5),
        ("maskine", 2.5), ("elektroteknik", 3.0), ("specifikation", 2.0),
        ("diagram", 2.0), ("brugsanvisning", 3.0), ("manual", 2.5), ("datablad", 3.0),
        ("CE-mærkning", 3.0), ("ISO", 2.0), ("norm", 2.0), ("standard", 1.5),
        ("user manual", 2.5), ("installation guide", 3.0), ("technical specification", 3.0),
        ("product guide", 2.5), ("service manual", 3.0),
    ],
    "Erhverv og business": [
        ("virksomhed", 2.5), ("erhverv", 2.5), ("CVR", 3.0), ("faktura", 2.5),
        ("ordre", 2.0), ("leverandør", 2.5), ("kunde", 2.0), ("salg", 2.0),
        ("moms", 2.5), ("regnskab", 2.5), ("årsregnskab", 3.0), ("balance", 2.0),
        ("resultatopgørelse", 3.0), ("aktieselskab", 3.0), ("iværksætter", 2.5),
        ("forretning", 2.0), ("selskab", 2.0), ("ApS", 3.0), ("A/S", 3.0),
    ],
}
|
||||||
|
|
||||||
|
# Category name -> folder path; classify_text() reports this path as the
# "subcategory" of a classified document.
# "IT og teknologi" and "Teknik og ingeniørfag" share the same folder.
TAXONOMY_TO_FOLDER: dict[str, str] = {
    "Familie og børn": "Privat/Familie",
    "Skole og uddannelse": "Privat/Personlig/Uddannelse",
    "Arbejde og karriere": "Privat/Personlig/Arbejde",
    "Økonomi og regninger": "Privat/Økonomi",
    "Hjem og bolig": "Privat/Hjem/Bolig",
    "Jura og kontrakter": "Privat/Jura",
    "Sundhed og medicin": "Privat/Personlig/Sundhed",
    "IT og teknologi": "Arkiv/Teknisk",
    "Bøger og litteratur": "Arkiv/Bøger",
    "Rejse og transport": "Privat/Rejser",
    "Offentlige myndigheder": "Privat/Jura/Myndigheder",
    "Projekter og hobby": "Projekter",
    "Teknik og ingeniørfag": "Arkiv/Teknisk",
    "Erhverv og business": "Arkiv/Erhverv",
}
|
||||||
|
|
||||||
|
# Default threshold for classify_text(): a best score below this is labelled "Ukendt".
MIN_SCORE: float = 1.5
|
||||||
|
|
||||||
|
|
||||||
|
def keyword_score(doc_text: str, keywords: list[tuple[str, float]]) -> float:
    """Sum the weights of all keywords that occur in a document.

    Matching is case-insensitive. A keyword containing a space counts when it
    appears anywhere in the text as a substring. A keyword without a space
    counts only when at least one occurrence has no alphabetic character
    directly before or after it (so 'bil' does not match inside 'mobil').
    Each keyword contributes its weight at most once.

    Args:
        doc_text: The text to score.
        keywords: (keyword, weight) pairs to look for.

    Returns:
        The total weight of matching keywords, rounded to three decimals.
    """
    haystack = doc_text.lower()
    size = len(haystack)

    def occurs_as_word(needle: str) -> bool:
        # Walk every occurrence until one is free of letters on both sides.
        width = len(needle)
        start = haystack.find(needle)
        while start >= 0:
            stop = start + width
            left = haystack[start - 1] if start else " "
            right = haystack[stop] if stop < size else " "
            if not (left.isalpha() or right.isalpha()):
                return True
            start = haystack.find(needle, start + 1)
        return False

    score = 0.0
    for term, weight in keywords:
        needle = term.lower()
        matched = needle in haystack if " " in needle else occurs_as_word(needle)
        if matched:
            score += weight
    return round(score, 3)
|
||||||
|
|
||||||
|
|
||||||
|
def classify_text(
    content: str,
    keywords: list[str],
    folder_hint: str = "",
    min_score: float = MIN_SCORE,
) -> dict:
    """Classify document text + keywords against the taxonomy.

    The content, the folder hint (with "/", "_" and "-" turned into spaces)
    and the keywords are joined into one text, which is scored against every
    taxonomy category with ``keyword_score``.

    Args:
        content: Extracted document text.
        keywords: YAKE keyword strings from kreuzberg.
        folder_hint: Current folder path (used as additional context signal).
        min_score: Minimum score to assign a label (else 'Ukendt').

    Returns:
        dict with category, subcategory, confidence, runner_up, runner_up_score.
        ``confidence`` is always the highest category score. When that score
        reaches ``min_score``, ``runner_up``/``runner_up_score`` describe the
        second-best category. Otherwise the category is 'Ukendt' and
        ``runner_up``/``runner_up_score`` describe the best (rejected)
        category, so the label and its score always belong together.
    """
    kw_text = " ".join(keywords)
    folder_tokens = folder_hint.translate(str.maketrans("/_-", "   "))
    doc_text = f"{content} {folder_tokens} {kw_text}"

    scores = {
        cat: keyword_score(doc_text, kws)
        for cat, kws in TAXONOMY.items()
    }

    ranked = sorted(scores.items(), key=lambda item: item[1], reverse=True)
    # Pad so unpacking works even with fewer than two categories.
    ranked += [("", 0.0)] * 2
    (best_label, best_score), (second_label, second_score) = ranked[:2]

    if best_score >= min_score:
        category = best_label
        subcategory = TAXONOMY_TO_FOLDER.get(best_label, "")
        runner_up, runner_up_score = second_label, second_score
    else:
        category = "Ukendt"
        subcategory = ""
        # The best candidate missed the threshold; report it, with its own
        # score, as the closest alternative.
        runner_up, runner_up_score = best_label, best_score

    return {
        "category": category,
        "subcategory": subcategory,
        "confidence": round(best_score, 3),
        "runner_up": runner_up,
        "runner_up_score": round(runner_up_score, 3),
    }
|
||||||
Reference in New Issue
Block a user