Nomad changes

2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions
--- a/tools/benchmark-harness/scripts/generate_markdown_gt.py
+++ b/tools/benchmark-harness/scripts/generate_markdown_gt.py
@@ -0,0 +1,249 @@
+#!/usr/bin/env -S uv run --no-project --script
+# /// script
+# requires-python = ">=3.10"
+# dependencies = ["google-genai>=1.0"]
+# ///
+"""Generate proper markdown ground truth from PDF documents using Gemini.
+
+Reads benchmark fixture JSON files to locate PDFs, sends each to Gemini 2.5 Flash
+via Vertex AI, and saves the extracted markdown to the ground truth directory.
+
+Usage:
+    uv run tools/benchmark-harness/scripts/generate_markdown_gt.py [OPTIONS]
+
+Examples:
+    # Generate for all nougat + pdfa documents
+    uv run tools/benchmark-harness/scripts/generate_markdown_gt.py
+
+    # Generate for a specific document
+    uv run tools/benchmark-harness/scripts/generate_markdown_gt.py --filter nougat_001
+
+    # Dry run to see what would be processed
+    uv run tools/benchmark-harness/scripts/generate_markdown_gt.py --dry-run
+
+    # Force regeneration of existing files
+    uv run tools/benchmark-harness/scripts/generate_markdown_gt.py --force
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import signal
+import sys
+import time
+from pathlib import Path
+
+from google import genai
+from google.genai.types import GenerateContentConfig, Part
+
+EXTRACTION_PROMPT = """\
+Extract the complete text content of this PDF document as clean Markdown.
+
+Rules:
+- Use proper heading hierarchy (# for document title, ## for major sections, ### for subsections)
+- Render tables using markdown table syntax with | delimiters and --- separator row
+- Use numbered lists (1. 2. 3.) and bullet lists (- item) where the document uses them
+- Preserve emphasis: **bold** and *italic* where the original uses them
+- Use ``` code blocks for code snippets, formulas, or monospace content
+- Use <!-- image --> as a placeholder where figures or images appear
+- Omit page numbers, running headers/footers, and watermarks
+- Preserve the document's reading order
+- Do NOT invent or hallucinate content — only extract what is actually in the document
+- Do NOT wrap the output in a markdown code fence — return raw markdown directly
+- For multi-column layouts, read left column first, then right column
+- For forms with label-value pairs, use **Label:** Value format
+"""
+
+
+def get_repo_root() -> Path:
+    current = Path(__file__).resolve().parent
+    while current != current.parent:
+        if (current / "Cargo.toml").exists() and (current / "test_documents").exists():
+            return current
+        current = current.parent
+    raise RuntimeError("Could not find repository root")
+
+
+def discover_fixtures(fixtures_dir: Path, name_filter: str | None = None) -> list[dict]:
+    """Find PDF fixtures that need markdown ground truth."""
+    results = []
+    for fixture_path in sorted(fixtures_dir.rglob("*.json")):
+        try:
+            with open(fixture_path) as f:
+                fixture = json.load(f)
+        except (json.JSONDecodeError, OSError):
+            continue
+
+        if fixture.get("file_type") != "pdf":
+            continue
+
+        name = fixture_path.stem
+        if name_filter and name_filter not in name:
+            continue
+
+        doc_rel = fixture.get("document", "")
+        if not doc_rel:
+            continue
+
+        doc_path = (fixture_path.parent / doc_rel).resolve()
+        if not doc_path.exists():
+            continue
+
+        results.append(
+            {
+                "name": name,
+                "fixture_path": fixture_path,
+                "doc_path": doc_path,
+                "fixture": fixture,
+            }
+        )
+
+    return results
+
+
+class _Timeout(Exception):
+    pass
+
+
+def _timeout_handler(signum, frame):
+    raise _Timeout("API call timed out")
+
+
+def generate_markdown(
+    client: genai.Client,
+    pdf_path: Path,
+    model: str,
+    timeout: int = 120,
+) -> str:
+    """Send PDF to Gemini and get markdown extraction."""
+    pdf_bytes = pdf_path.read_bytes()
+
+    old_handler = signal.signal(signal.SIGALRM, _timeout_handler)
+    signal.alarm(timeout)
+    try:
+        response = client.models.generate_content(
+            model=model,
+            contents=[
+                Part.from_bytes(data=pdf_bytes, mime_type="application/pdf"),
+                EXTRACTION_PROMPT,
+            ],
+            config=GenerateContentConfig(
+                temperature=0.1,
+                max_output_tokens=8192,
+            ),
+        )
+    finally:
+        signal.alarm(0)
+        signal.signal(signal.SIGALRM, old_handler)
+
+    text = response.text or ""
+
+    # Strip markdown code fence wrapper if Gemini added one
+    if text.startswith("```markdown\n"):
+        text = text[len("```markdown\n") :]
+        text = text.removesuffix("\n```")
+    elif text.startswith("```md\n"):
+        text = text[len("```md\n") :]
+        text = text.removesuffix("\n```")
+    elif text.startswith("```\n"):
+        text = text[len("```\n") :]
+        text = text.removesuffix("\n```")
+
+    return text.strip() + "\n"
+
+
+def main() -> int:
+    parser = argparse.ArgumentParser(description="Generate markdown ground truth from PDFs using Gemini")
+    parser.add_argument(
+        "--filter", type=str, default=None, help="Only process fixtures whose name contains this string"
+    )
+    parser.add_argument("--dry-run", action="store_true", help="Show what would be processed without calling the API")
+    parser.add_argument("--force", action="store_true", help="Regenerate even if .md file already exists")
+    parser.add_argument(
+        "--model", type=str, default="gemini-2.0-flash", help="Gemini model to use (default: gemini-2.0-flash)"
+    )
+    parser.add_argument("--project", type=str, default="boxwood-spirit-479620-r5", help="GCP project ID")
+    parser.add_argument("--location", type=str, default="us-central1", help="Vertex AI location")
+    parser.add_argument("--delay", type=float, default=1.0, help="Delay between API calls in seconds (rate limiting)")
+    parser.add_argument("--timeout", type=int, default=120, help="Per-request timeout in seconds (default: 120)")
+    parser.add_argument("--max-size", type=int, default=None, help="Skip PDFs larger than this many KB")
+    args = parser.parse_args()
+
+    repo_root = get_repo_root()
+    fixtures_dir = repo_root / "tools" / "benchmark-harness" / "fixtures"
+    gt_dir = repo_root / "test_documents" / "ground_truth" / "pdf"
+
+    print(f"Repository root: {repo_root}")
+    print(f"Fixtures dir:    {fixtures_dir}")
+    print(f"Output dir:      {gt_dir}")
+    print(f"Model:           {args.model}")
+    if args.dry_run:
+        print("DRY RUN MODE\n")
+
+    fixtures = discover_fixtures(fixtures_dir, args.filter)
+    print(f"Found {len(fixtures)} PDF fixtures")
+
+    if not args.dry_run:
+        client = genai.Client(
+            vertexai=True,
+            project=args.project,
+            location=args.location,
+        )
+
+    stats = {"generated": 0, "skipped": 0, "errors": 0}
+
+    for item in fixtures:
+        name = item["name"]
+        md_path = gt_dir / f"{name}.md"
+        file_size_kb = item["doc_path"].stat().st_size / 1024
+
+        if md_path.exists() and not args.force:
+            stats["skipped"] += 1
+            continue
+
+        if args.max_size and file_size_kb > args.max_size:
+            print(f"  Skipping {name} ({file_size_kb:.0f} KB > {args.max_size} KB)")
+            stats["skipped"] += 1
+            continue
+
+        if args.dry_run:
+            print(f"  [DRY] {name} ({file_size_kb:.0f} KB)")
+            stats["generated"] += 1
+            continue
+
+        print(f"  Processing {name} ({file_size_kb:.0f} KB)...", end=" ", flush=True)
+        try:
+            start = time.time()
+            markdown = generate_markdown(client, item["doc_path"], args.model, timeout=args.timeout)
+            elapsed = time.time() - start
+
+            gt_dir.mkdir(parents=True, exist_ok=True)
+            md_path.write_text(markdown, encoding="utf-8")
+
+            # Quick quality check
+            lines = markdown.strip().split("\n")
+            headings = sum(1 for l in lines if l.startswith("#"))
+            tables = sum(1 for l in lines if "|" in l and "---" not in l)
+            print(f"OK ({elapsed:.1f}s, {len(lines)} lines, {headings} headings, {tables} table rows)")
+            stats["generated"] += 1
+
+            time.sleep(args.delay)
+
+        except _Timeout:
+            print(f"TIMEOUT ({args.timeout}s)")
+            stats["errors"] += 1
+        except Exception as e:
+            print(f"ERROR: {e}")
+            stats["errors"] += 1
+
+    print(f"\n{'=' * 50}")
+    print(f"Generated: {stats['generated']}")
+    print(f"Skipped:   {stats['skipped']} (already exist)")
+    print(f"Errors:    {stats['errors']}")
+
+    return 0 if stats["errors"] == 0 else 1
+
+
+if __name__ == "__main__":
+    sys.exit(main())