# fil/scripts/analyse_familie.py

#!/usr/bin/env python3
"""
Document keyword analyser — misplacement detection across ~/Dokumenter.

Phase 1 (extract): Hits kreuzberg API for all files, saves progress to JSON.
Phase 2 (analyse): Builds folder profiles, flags files likely in the wrong folder.
Phase 3 (classify): Deterministic taxonomy classification via weighted keyword scoring (see taxonomy.py).

Usage:
    python3 analyse_familie.py extract
    python3 analyse_familie.py analyse
    python3 analyse_familie.py classify
    python3 analyse_familie.py extract --workers 6 --output my_results.json
    python3 analyse_familie.py analyse --threshold 0.25
    python3 analyse_familie.py classify --results dokumenter_keywords.json
"""

import csv
import json
import argparse
import re
import time
from pathlib import Path
from collections import defaultdict, Counter
from concurrent.futures import ThreadPoolExecutor, as_completed

import urllib.request
import urllib.error

# Extraction endpoint every file is POSTed to.
API_URL = "https://check.i80.dk/extract"
DEFAULT_DIR = "/home/hjess/Dokumenter"
DEFAULT_OUTPUT = "dokumenter_keywords.json"

# File suffixes that are sent for extraction (compared against the lower-cased suffix).
SUPPORTED_EXTS = {".pdf", ".doc", ".docx", ".odt", ".ods", ".ppt", ".pptx", ".txt", ".rtf"}

# Generic words that don't help categorise documents.
# Entries are lower-case because keywords are lower-cased before the lookup.
STOPWORDS = {
    # Danish function words
    "den", "det", "der", "som", "til", "fra", "med", "for", "og", "i", "er",
    "en", "et", "af", "på", "at", "de", "har", "ikke", "vi", "hun", "han",
    "skal", "kan", "var", "men", "når", "også", "bare", "blev",
    # Form field noise
    "navn", "dato", "side", "fulde", "nummer", "adresse", "telefon", "email",
    "cpr", "cpr-nr", "postnr", "underskrift", "forælder", "barnet", "oplysninger",
    "telefonnummer", "telefonnummer i dagtimerne",
    # Email/phone footer boilerplate
    "sendt fra", "sendt fra min", "fra min", "fra min iphone",
    "min iphone", "min iphone den", "iphone den",
    "skrev henrik", "skrev henrik jess", "henrik jess", "henrik",
    "ganstar nielsen wrote", "nielsen wrote",
    # Names — too generic across this specific corpus
    "ganstar", "nielsen", "ganstar nielsen", "tanya ganstar", "tanya ganstar nielsen",
    "henrik jess nielsen", "jess nielsen",
    # Kaiten mail system noise
    "kaiten", "kaiten mail", "med kaiten mail", "via kaiten",
}

# NOTE(review): SIDER_PATTERN does not appear to be referenced in this module — confirm before removing.
SIDER_PATTERN = "Aktindsigt/Sider"
SIDER_SAMPLE = 20  # how many files to sample from each bulk directory
BULK_DIR_THRESHOLD = 50  # dirs with at least this many files get sampled instead of fully processed


# ---------------------------------------------------------------------------
# HTTP helper (stdlib only — no requests dependency)
# ---------------------------------------------------------------------------

def _post_multipart(url, filepath: Path, config: dict, timeout=90) -> dict | None:
    """POST a file as multipart/form-data using stdlib only. Retries on 502."""
    boundary = "----KreuzbergBoundary7MA4YWxkTrZu"
    config_json = json.dumps(config).encode()

    with open(filepath, "rb") as fh:
        file_data = fh.read()

    body = (
        f"--{boundary}\r\n"
        f'Content-Disposition: form-data; name="files"; filename="{filepath.name}"\r\n'
        f"Content-Type: application/octet-stream\r\n\r\n"
    ).encode() + file_data + (
        f"\r\n--{boundary}\r\n"
        f'Content-Disposition: form-data; name="config"\r\n\r\n'
    ).encode() + config_json + f"\r\n--{boundary}--\r\n".encode()

    req = urllib.request.Request(url, data=body)
    req.add_header("Content-Type", f"multipart/form-data; boundary={boundary}")
    req.add_header("Content-Length", str(len(body)))

    retries = 3
    for attempt in range(retries):
        try:
            with urllib.request.urlopen(req, timeout=timeout) as resp:
                return json.loads(resp.read().decode())
        except urllib.error.HTTPError as e:
            if e.code == 502 and attempt < retries - 1:
                wait = 5 * (2 ** attempt)  # 5s, 10s, 20s
                time.sleep(wait)
                continue
            return {"error": f"HTTP {e.code}"}
        except Exception as e:
            return {"error": str(e)}
    return {"error": "HTTP 502 (max retries exceeded)"}


# ---------------------------------------------------------------------------
# Extraction
# ---------------------------------------------------------------------------

def extract_file(filepath: Path, max_keywords: int = 15) -> dict:
    """Request YAKE keywords for one file and normalise the reply into a record.

    The returned dict always carries the keys ``file``, ``keywords``
    (``(text, score)`` pairs, score rounded to 4 decimals), ``languages``,
    ``content_length`` and ``error`` (``None`` on success).
    """

    def _failure(reason) -> dict:
        # Same record shape as a success, just without any extracted data
        return {"file": str(filepath), "keywords": [], "languages": [], "content_length": 0, "error": reason}

    request_config = {"keywords": {"algorithm": "yake", "max_keywords": max_keywords}}
    payload = _post_multipart(API_URL, filepath, request_config)

    if payload is None:
        return _failure("unknown")
    if isinstance(payload, dict) and "error" in payload:
        return _failure(payload.get("error", "unknown"))

    # The reply is either a single result object or a list of them
    entries = payload if isinstance(payload, list) else [payload]
    if not entries:
        return _failure("empty response")

    first = entries[0]
    keyword_pairs = [
        (item["text"], round(item["score"], 4))
        for item in (first.get("extracted_keywords") or [])
    ]
    return {
        "file": str(filepath),
        "keywords": keyword_pairs,
        "languages": first.get("detected_languages") or [],
        "content_length": len(first.get("content") or ""),
        "error": None,
    }


def collect_files(directory: Path, bulk_threshold: int = BULK_DIR_THRESHOLD) -> tuple[list[Path], list[Path]]:
    """Split the supported files under *directory* into regular files and bulk dirs.

    A directory is "bulk" when it directly contains at least *bulk_threshold*
    supported files.  Everything inside a bulk directory — including its
    sub-folders — is left out of the regular list, because the bulk group is
    sampled recursively later on.

    Returns:
        ``(regular_files, bulk_dirs)``; a bulk directory nested inside another
        bulk directory is not listed separately.
    """
    # First pass: group supported files by the directory that directly holds them
    dir_files: dict[Path, list[Path]] = defaultdict(list)
    for f in sorted(directory.rglob("*")):
        if f.is_file() and f.suffix.lower() in SUPPORTED_EXTS:
            dir_files[f.parent].append(f)

    # Decide which directories are bulk *before* assigning any files, so the
    # outcome does not depend on whether a sub-folder is visited before its parent.
    crowded = {parent for parent, files in dir_files.items() if len(files) >= bulk_threshold}

    regular: list[Path] = []
    bulk_dirs: list[Path] = []
    for parent, files in dir_files.items():
        if any(ancestor in crowded for ancestor in parent.parents):
            # Already covered by the sampled group of an enclosing bulk directory
            continue
        if parent in crowded:
            bulk_dirs.append(parent)
        else:
            regular.extend(files)

    return regular, bulk_dirs


def analyse_bulk_group(bulk_dir: Path) -> dict:
    """Summarise a large directory by extracting keywords from a sample of its files.

    The first SIDER_SAMPLE supported files (sorted, searched recursively) are
    extracted; each keyword is scored by the share of sampled files it appeared
    in, and the 15 most frequent keywords form the group record.
    """
    candidates = [
        path
        for path in sorted(bulk_dir.rglob("*"))
        if path.is_file() and path.suffix.lower() in SUPPORTED_EXTS
    ]
    picked = candidates[:SIDER_SAMPLE]
    total, sampled = len(candidates), len(picked)
    print(f"  Sampling {sampled}/{total} files from {bulk_dir.name}/…")

    # Number of keyword occurrences across the sampled files (case-folded)
    hits: Counter = Counter()
    for path in picked:
        extraction = extract_file(path, max_keywords=10)
        hits.update(kw.lower() for kw, _ in extraction["keywords"])

    ranked = [(kw, round(seen / sampled, 3)) for kw, seen in hits.most_common(15)]
    return {
        "file": f"{bulk_dir}/ [GROUP — {total} files, sampled {sampled}]",
        "keywords": ranked,
        "languages": ["dan"],
        "content_length": -1,
        "is_group": True,
        "error": None,
    }


def run_extract(directory: str, output_file: str, workers: int, bulk_threshold: int = BULK_DIR_THRESHOLD) -> None:
    """Phase 1: extract keywords for every supported file and persist them as JSON.

    Args:
        directory: Root directory to scan.
        output_file: JSON progress/result file; an existing file is resumed.
        workers: Number of parallel extraction requests.
        bulk_threshold: Passed to ``collect_files``; directories at or above
            this size are sampled as one group instead of processed per file.

    Progress is written every 20 files and once more at the end, so an
    interrupted run can be restarted with the same arguments.
    """
    dir_path = Path(directory)
    out_path = Path(output_file)
    files, bulk_dirs = collect_files(dir_path, bulk_threshold=bulk_threshold)

    # Resume from existing output. Records with a transient server error are
    # dropped so they get retried; successes and permanent errors (422 etc.) are kept.
    results: dict[str, dict] = {}
    retriable_errors = {"HTTP 502", "HTTP 502 (max retries exceeded)"}
    if out_path.exists():
        with open(out_path, encoding="utf-8") as fh:
            for r in json.load(fh):
                if r.get("error") in retriable_errors:
                    continue
                results[r["file"]] = r
        print(f"Resuming — {len(results)} files already done")

    todo = [f for f in files if str(f) not in results]
    total = len(todo)
    print(f"Files to process: {total}  (skipping {len(files) - total} already done)")
    if bulk_dirs:
        print(f"Bulk directories (sampled): {[str(d) for d in bulk_dirs]}\n")

    done = 0
    errors = 0

    def save():
        # Write to a sibling file and swap it in, so an interrupt during the
        # dump cannot leave a truncated progress file behind.
        tmp_path = out_path.with_name(out_path.name + ".tmp")
        with open(tmp_path, "w", encoding="utf-8") as fh:
            json.dump(list(results.values()), fh, ensure_ascii=False, indent=2)
        tmp_path.replace(out_path)

    with ThreadPoolExecutor(max_workers=workers) as ex:
        futures = {ex.submit(extract_file, f): f for f in todo}
        for future in as_completed(futures):
            try:
                result = future.result()
            except Exception as exc:
                # One failing file must not abort the batch; record it like any other error
                result = {
                    "file": str(futures[future]),
                    "keywords": [],
                    "languages": [],
                    "content_length": 0,
                    "error": f"{type(exc).__name__}: {exc}",
                }
            results[result["file"]] = result
            done += 1
            if result["error"]:
                errors += 1
                print(f"  ❌ [{done}/{total}] {Path(result['file']).name}: {result['error']}")
            else:
                preview = ", ".join(kw for kw, _ in result["keywords"][:3])
                print(f"  ✅ [{done}/{total}] {Path(result['file']).name} → {preview}")
            if done % 20 == 0:
                save()

    save()

    # Handle bulk dirs as sampled groups
    for bulk_dir in bulk_dirs:
        # Group records are stored under "<dir>/ [GROUP — N files, sampled M]",
        # so an earlier run is recognised by the prefix, not by an exact key.
        group_prefix = str(bulk_dir) + "/ [GROUP"
        if any(key.startswith(group_prefix) for key in results):
            continue
        print(f"\nAnalysing bulk group: {bulk_dir.name}/")
        group = analyse_bulk_group(bulk_dir)
        results[group["file"]] = group
        save()
        print(f"  Top keywords: {', '.join(kw for kw, _ in group['keywords'][:5])}")

    print(f"\n✅ Finished: {done} files, {errors} errors → {out_path}")
    print(f"   Run 'python3 {__file__} analyse' to find misplaced files")


# ---------------------------------------------------------------------------
# Phase 2: Misplacement detection
# ---------------------------------------------------------------------------

def _clean_keywords(raw: list[tuple[str, float]]) -> list[str]:
    """Return the lower-cased, stripped keyword texts worth comparing.

    Scores are discarded.  A keyword is dropped when its normalised form is
    shorter than 4 characters or is listed in STOPWORDS.  ``None`` or an empty
    list yields ``[]``.
    """
    # Normalise first so the length filter and the stopword lookup both see
    # the same text (surrounding whitespace must not count towards the length).
    normalised = (kw.lower().strip() for kw, _ in (raw or []))
    return [kw for kw in normalised if len(kw) >= 4 and kw not in STOPWORDS]


def _folder_key(filepath: Path, base: Path) -> str:
    """Return the relative folder path (e.g. 'Privat/Økonomi/Gæld')."""
    try:
        return str(filepath.relative_to(base).parent)
    except ValueError:
        return ""


def run_analyse(results_file: str, base_dir: str, threshold: float, min_folder_docs: int) -> None:
    """
    Build a keyword profile per folder, then flag files whose keywords
    don't overlap well with their current folder.

    Args:
        results_file: JSON file written by the extract phase.
        base_dir: Root directory that folder names are made relative to.
        threshold: A file is flagged when the share of its keywords found in
            its own folder's profile is below this fraction.
        min_folder_docs: Minimum number of records a folder needs before it is
            used as a reference (both for flagging and as a suggestion).

    Prints a summary of the first 40 hits and writes every flagged file to a
    CSV named after *results_file* with the suffix ``.misplaced.csv``.
    """
    with open(results_file, encoding="utf-8") as fh:
        results = json.load(fh)

    base = Path(base_dir)
    print(f"Analysing {len(results)} document records…")

    # Build folder profiles: folder → Counter(keyword → freq),
    # and count records per folder (group records are ignored throughout).
    folder_profiles: dict[str, Counter] = defaultdict(Counter)
    folder_doc_counts: Counter = Counter()
    file_kws: dict[str, list[str]] = {}

    for r in results:
        if r.get("is_group"):
            continue
        kws = _clean_keywords(r.get("keywords") or [])
        file_kws[r["file"]] = kws
        folder = _folder_key(Path(r["file"]), base)
        folder_doc_counts[folder] += 1
        if folder:
            for kw in kws:
                folder_profiles[folder][kw] += 1

    # Filter out thin folders (too few docs to be meaningful)
    valid_folders = {f for f, c in folder_doc_counts.items() if c >= min_folder_docs}
    print(f"Folder profiles built: {len(valid_folders)} folders with {min_folder_docs}+ documents\n")

    # Destinations that may be suggested. The root (".") and the bucket for
    # files outside base ("") are not real homes; sorting makes ties resolve
    # the same way on every run.
    suggestion_pool = sorted(valid_folders - {"", "."})

    def _is_ukendt(f: str | None) -> bool:
        return f is not None and "Ukendt" in f

    # Score each file against its own folder, flag low-overlap files
    misplaced: list[dict] = []
    unclassified: list[dict] = []

    for r in results:
        if r.get("is_group"):
            continue
        fp = Path(r["file"])
        folder = _folder_key(fp, base)
        kws = file_kws.get(r["file"], [])

        if not kws:
            continue

        if folder == "." or folder == "":
            # File sits directly in the root — needs a home
            unclassified.append(r)
            continue

        profile = folder_profiles.get(folder, Counter())
        # Overlap: share of file's keywords that appear ≥2 times in folder profile
        shared = sum(1 for kw in kws if profile[kw] >= 2)
        overlap = shared / len(kws)

        if overlap >= threshold or folder not in valid_folders:
            continue

        # Find the best matching alternative folder. A folder sharing no
        # keyword at all is not a suggestion, so the score must be positive.
        best_folder, best_score = None, 0
        for candidate in suggestion_pool:
            if candidate == folder:
                continue
            candidate_profile = folder_profiles[candidate]
            score = sum(candidate_profile[kw] for kw in kws)
            if score > best_score:
                best_folder, best_score = candidate, score

        # Skip if both current and suggested are generic "Ukendt" dump folders —
        # moving Ukendt/PDF/2011 → Ukendt/PDF/2009 is not an improvement
        if _is_ukendt(folder) and _is_ukendt(best_folder):
            continue

        misplaced.append({
            "file": r["file"],
            "filename": fp.name,
            "current_folder": folder,
            "overlap": round(overlap, 2),
            "suggested_folder": best_folder or "",
            "suggestion_score": best_score,
            "top_keywords": "; ".join(kws[:6]),
        })

    misplaced.sort(key=lambda x: x["overlap"])

    # Print summary
    print(f"{'─'*65}")
    print(f"Potentially misplaced: {len(misplaced)} files  (overlap < {threshold:.0%})\n")

    for m in misplaced[:40]:
        print(f"  📄 {m['filename']}")
        print(f"       Current:   {m['current_folder']}")
        print(f"       Suggested: {m['suggested_folder']}  (overlap={m['overlap']:.0%})")
        print(f"       Keywords:  {m['top_keywords']}")
        print()

    if len(misplaced) > 40:
        print(f"  … and {len(misplaced) - 40} more — see CSV for full list\n")

    if unclassified:
        # These were never compared against a folder profile, so report them separately
        print(f"Not in any sub-folder of {base}: {len(unclassified)} files with keywords\n")

    # Save CSV for easy review / filtering in a spreadsheet
    csv_path = Path(results_file).with_suffix(".misplaced.csv")
    with open(csv_path, "w", newline="", encoding="utf-8") as csvf:
        writer = csv.DictWriter(
            csvf,
            fieldnames=["filename", "current_folder", "overlap", "suggested_folder",
                        "suggestion_score", "top_keywords", "file"],
        )
        writer.writeheader()
        writer.writerows(misplaced)

    print(f"{'─'*65}")
    print(f"✅ Saved {len(misplaced)} misplaced records → {csv_path}")
    print(f"   Open in LibreOffice Calc, sort by 'overlap' asc to prioritise.")


# ---------------------------------------------------------------------------
# Phase 3: taxonomy classification via keyword scoring
# ---------------------------------------------------------------------------
# Taxonomy data and scorer live in taxonomy.py (shared with classify_server.py).

import sys as _sys
import os as _os
_sys.path.insert(0, _os.path.dirname(__file__))
from taxonomy import TAXONOMY, TAXONOMY_TO_FOLDER, keyword_score as _keyword_score  # noqa: E402


def run_classify(results_file: str, base_dir: str = DEFAULT_DIR, min_score: float = 1.5) -> None:
    """Phase 3: assign each extracted document a taxonomy category by keyword scoring.

    For every non-group, error-free record the text "file stem + folder path +
    YAKE keywords" is scored against each TAXONOMY category; the top-scoring
    category becomes the label, or 'Ukendt' when that score is below
    *min_score*.  Results go to a ``.classified.csv`` next to *results_file*,
    followed by a printed label distribution.  Deterministic — no ML model.
    """
    results_path = Path(results_file)
    if not results_path.exists():
        print(f"❌ Results file not found: {results_file}")
        print("   Run 'extract' phase first.")
        return

    with open(results_path, encoding="utf-8") as fh:
        results: list[dict] = json.load(fh)

    # Records without YAKE keywords stay in: filename + folder alone can classify
    classifiable = [rec for rec in results if not (rec.get("is_group") or rec.get("error"))]
    print(f"Classifying {len(classifiable)} documents…")

    root = Path(base_dir)
    rows: list[dict] = []

    for record in classifiable:
        doc_path = Path(record["file"])
        keywords = record.get("keywords", [])

        # Text to score: filename tokens, every folder component, then the keywords
        name_part = re.sub(r"[-_.]", " ", doc_path.stem).lower()
        folder_part = " ".join(doc_path.parent.parts).lower()
        keyword_part = " ".join(kw for kw, _ in keywords)
        doc_text = f"{name_part} {folder_part} {keyword_part}"

        scores: dict[str, float] = {}
        for category, category_keywords in TAXONOMY.items():
            scores[category] = _keyword_score(doc_text, category_keywords)

        best_label = max(scores, key=scores.get)
        best_score = scores[best_label]

        ranking = sorted(scores, key=scores.get, reverse=True)
        if len(ranking) > 1:
            runner_up = ranking[1]
        else:
            runner_up = ""
        runner_score = scores[runner_up] if runner_up else 0.0

        if best_score >= min_score:
            label = best_label
        else:
            label = "Ukendt"
        suggested_folder = "" if label == "Ukendt" else TAXONOMY_TO_FOLDER.get(label, "")

        rows.append({
            "filename": doc_path.name,
            "current_folder": _folder_key(doc_path, root),
            "taxonomy_label": label,
            "confidence": best_score,
            "runner_up": runner_up,
            "runner_up_score": runner_score,
            "suggested_folder": suggested_folder,
            "top_keywords": "; ".join(kw for kw, _ in keywords[:6]),
            "file": record["file"],
        })

    # Lowest score first — those documents need the most attention
    rows.sort(key=lambda row: row["confidence"])

    columns = ["filename", "current_folder", "taxonomy_label", "confidence",
               "runner_up", "runner_up_score", "suggested_folder", "top_keywords", "file"]
    csv_path = Path(results_file).with_suffix(".classified.csv")
    with open(csv_path, "w", newline="", encoding="utf-8") as csvf:
        writer = csv.DictWriter(csvf, fieldnames=columns)
        writer.writeheader()
        writer.writerows(rows)

    # Distribution summary with a bar scaled to the most common label
    label_counts: Counter = Counter(row["taxonomy_label"] for row in rows)
    peak = max(label_counts.values(), default=0)
    print(f"\n{'─'*65}")
    print(f"Taxonomy distribution ({len(rows)} documents):\n")
    for label, count in label_counts.most_common():
        bar = "█" * (count * 30 // peak)
        print(f"  {label:<30}  {count:>4}  {bar}")

    low_conf = len([row for row in rows if row["confidence"] < min_score])
    print(f"\n  Low score (<{min_score}): {low_conf} files → labelled 'Ukendt'")
    print(f"\n{'─'*65}")
    print(f"✅ Saved {len(rows)} classifications → {csv_path}")
    print("   Columns: filename, current_folder, taxonomy_label, confidence, suggested_folder")


# ---------------------------------------------------------------------------
# CLI
# ---------------------------------------------------------------------------

if __name__ == "__main__":
    # Command line: one sub-command per phase; the module docstring is shown as the help epilog.
    cli = argparse.ArgumentParser(
        description="Document keyword extraction + misplacement detector",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__,
    )
    commands = cli.add_subparsers(dest="cmd", required=True)

    extract_cmd = commands.add_parser("extract", help="Phase 1: extract keywords from all files")
    analyse_cmd = commands.add_parser("analyse", help="Phase 2: find misplaced files using folder profiles")
    classify_cmd = commands.add_parser("classify", help="Phase 3: keyword-scoring taxonomy classification")

    # extract options
    extract_cmd.add_argument("--dir", default=DEFAULT_DIR, help="Root directory to scan")
    extract_cmd.add_argument("--output", default=DEFAULT_OUTPUT, help="Output JSON file")
    extract_cmd.add_argument(
        "--workers", type=int, default=3,
        help="Parallel API workers (default 3 — keep low to avoid 502s)",
    )
    extract_cmd.add_argument(
        "--bulk-threshold", type=int, default=BULK_DIR_THRESHOLD,
        help=f"Dirs with this many files are sampled instead of fully processed (default {BULK_DIR_THRESHOLD}). Use 999999 to process all files.",
    )

    # options shared by the two phases that read the extract output
    for reader_cmd in (analyse_cmd, classify_cmd):
        reader_cmd.add_argument("--results", default=DEFAULT_OUTPUT, help="JSON file from extract phase")
        reader_cmd.add_argument("--dir", default=DEFAULT_DIR, help="Root directory (same as extract)")

    # analyse options
    analyse_cmd.add_argument(
        "--threshold", type=float, default=0.25,
        help="Overlap threshold below which a file is flagged (default 0.25 = 25%%)",
    )
    analyse_cmd.add_argument(
        "--min-folder-docs", type=int, default=5,
        help="Minimum docs in a folder to be used as a reference profile (default 5)",
    )

    # classify options
    classify_cmd.add_argument(
        "--min-score", type=float, default=1.5,
        help="Minimum keyword score to assign a label (default 1.5). Below this → 'Ukendt'.",
    )

    args = cli.parse_args()

    if args.cmd == "extract":
        run_extract(args.dir, args.output, args.workers, args.bulk_threshold)
    elif args.cmd == "analyse":
        run_analyse(args.results, args.dir, args.threshold, args.min_folder_docs)
    else:
        run_classify(args.results, args.dir, args.min_score)