Nomad changes

2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions
--- a/tools/benchmark-harness/scripts/generate_vendored_baselines.py
+++ b/tools/benchmark-harness/scripts/generate_vendored_baselines.py
@@ -0,0 +1,172 @@
+# /// script
+# requires-python = ">=3.11"
+# dependencies = [
+#     "paddleocr>=3.4.0",
+#     "paddlepaddle>=3.3.0",
+#     "rapidocr-onnxruntime>=1.4.0",
+#     "pymupdf>=1.24.0",
+#     "pillow>=10.0.0",
+#     "numpy>=1.24.0",
+# ]
+# ///
+"""Generate vendored OCR baselines from PaddleOCR Python and RapidOCR.
+
+Usage:
+    uv run tools/benchmark-harness/scripts/generate_vendored_baselines.py
+    uv run tools/benchmark-harness/scripts/generate_vendored_baselines.py rapidocr
+    uv run tools/benchmark-harness/scripts/generate_vendored_baselines.py --force
+"""
+
+import json
+import os
+import sys
+import time
+from pathlib import Path
+
+import fitz
+import numpy as np
+
+FIXTURES_DIR = Path(__file__).resolve().parent.parent / "fixtures"
+VENDORED_DIR = Path(__file__).resolve().parent.parent / "vendored"
+
+OCR_FIXTURES = [
+    "pdf_image_only_german",
+    "pdf_non_searchable",
+    "pdf_ocr_rotated_270",
+    "pdf_ocr_rotated_90",
+    "pdf_ocr_rotated",
+    "pdf_ocr_test",
+    "pdf_scanned_ocr",
+]
+
+
+def pdf_to_images(pdf_path: str, dpi: int = 300) -> list[np.ndarray]:
+    """Convert PDF pages to numpy arrays (RGB, HWC)."""
+    import io
+
+    from PIL import Image
+
+    doc = fitz.open(pdf_path)
+    images = []
+    for page in doc:
+        mat = fitz.Matrix(dpi / 72, dpi / 72)
+        pix = page.get_pixmap(matrix=mat)
+        img = Image.open(io.BytesIO(pix.tobytes("png"))).convert("RGB")
+        images.append(np.array(img))
+    doc.close()
+    return images
+
+
+def lines_to_markdown(lines: list[str]) -> str:
+    """Each OCR text line becomes a markdown paragraph."""
+    paragraphs = [line.strip() for line in lines if line.strip()]
+    return "\n\n".join(paragraphs) + "\n" if paragraphs else ""
+
+
+def run_paddleocr_python(pdf_path: str) -> tuple[str, float]:
+    """Run PaddleOCR Python v3.4+ using the predict() API."""
+    os.environ["PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK"] = "True"
+    from paddleocr import PaddleOCR
+
+    ocr = PaddleOCR(use_textline_orientation=True, lang="en")
+    images = pdf_to_images(pdf_path)
+
+    start = time.monotonic()
+    all_lines: list[str] = []
+    for img in images:
+        # predict() returns list of OCRResult (dict-like) objects
+        for result in ocr.predict(img):
+            # OCRResult has 'rec_text' key with list of recognized texts
+            rec_texts = result.get("rec_text", [])
+            if isinstance(rec_texts, (list, tuple)):
+                for t in rec_texts:
+                    text = str(t).strip()
+                    if text:
+                        all_lines.append(text)
+    elapsed_ms = (time.monotonic() - start) * 1000
+
+    return lines_to_markdown(all_lines), elapsed_ms
+
+
+def run_rapidocr(pdf_path: str) -> tuple[str, float]:
+    """Run RapidOCR."""
+    from rapidocr_onnxruntime import RapidOCR
+
+    ocr = RapidOCR()
+    images = pdf_to_images(pdf_path)
+
+    start = time.monotonic()
+    all_lines: list[str] = []
+    for img in images:
+        result, _ = ocr(img)
+        if not result:
+            continue
+        for line in result:
+            if line and len(line) >= 2:
+                text = str(line[1]).strip()
+                if text:
+                    all_lines.append(text)
+    elapsed_ms = (time.monotonic() - start) * 1000
+
+    return lines_to_markdown(all_lines), elapsed_ms
+
+
+def save_vendored(pipeline_name: str, fixture_name: str, md: str, time_ms: float):
+    md_dir = VENDORED_DIR / pipeline_name / "md"
+    timing_dir = VENDORED_DIR / pipeline_name / "timing"
+    md_dir.mkdir(parents=True, exist_ok=True)
+    timing_dir.mkdir(parents=True, exist_ok=True)
+    (md_dir / f"{fixture_name}.md").write_text(md)
+    (timing_dir / f"{fixture_name}.ms").write_text(f"{time_ms:.1f}\n")
+
+
+def main():
+    pipelines = {
+        "paddleocr-python": run_paddleocr_python,
+        "rapidocr": run_rapidocr,
+    }
+
+    force = "--force" in sys.argv
+    args = [a for a in sys.argv[1:] if not a.startswith("--")]
+
+    if args:
+        selected = args[0]
+        if selected not in pipelines:
+            print(f"Unknown: {selected}. Choose: {list(pipelines.keys())}")
+            sys.exit(1)
+        pipelines = {selected: pipelines[selected]}
+
+    for fixture_name in OCR_FIXTURES:
+        fixture_path = FIXTURES_DIR / f"{fixture_name}.json"
+        if not fixture_path.exists():
+            print(f"  SKIP {fixture_name}: fixture not found")
+            continue
+
+        with open(fixture_path) as f:
+            fixture = json.load(f)
+
+        doc_path = str((FIXTURES_DIR / fixture["document"]).resolve())
+        if not os.path.exists(doc_path):
+            print(f"  SKIP {fixture_name}: document not found")
+            continue
+
+        for pipeline_name, run_fn in pipelines.items():
+            existing = VENDORED_DIR / pipeline_name / "md" / f"{fixture_name}.md"
+            if not force and existing.exists() and existing.stat().st_size > 0:
+                print(f"  CACHED {pipeline_name}/{fixture_name}")
+                continue
+
+            print(f"  RUN {pipeline_name}/{fixture_name} ...", end="", flush=True)
+            try:
+                md, time_ms = run_fn(doc_path)
+                save_vendored(pipeline_name, fixture_name, md, time_ms)
+                print(f" {time_ms:.0f}ms, {len(md)} chars")
+            except Exception as e:
+                print(f" ERROR: {e}")
+                import traceback
+
+                traceback.print_exc()
+
+
+if __name__ == "__main__":
+    main()