173 lines
5.3 KiB
Python
173 lines
5.3 KiB
Python
# /// script
|
|
# requires-python = ">=3.11"
|
|
# dependencies = [
|
|
# "paddleocr>=3.4.0",
|
|
# "paddlepaddle>=3.3.0",
|
|
# "rapidocr-onnxruntime>=1.4.0",
|
|
# "pymupdf>=1.24.0",
|
|
# "pillow>=10.0.0",
|
|
# "numpy>=1.24.0",
|
|
# ]
|
|
# ///
|
|
"""Generate vendored OCR baselines from PaddleOCR Python and RapidOCR.
|
|
|
|
Usage:
|
|
uv run tools/benchmark-harness/scripts/generate_vendored_baselines.py
|
|
uv run tools/benchmark-harness/scripts/generate_vendored_baselines.py rapidocr
|
|
uv run tools/benchmark-harness/scripts/generate_vendored_baselines.py --force
|
|
"""
|
|
|
|
import json
|
|
import os
|
|
import sys
|
|
import time
|
|
from pathlib import Path
|
|
|
|
import fitz
|
|
import numpy as np
|
|
|
|
FIXTURES_DIR = Path(__file__).resolve().parent.parent / "fixtures"
|
|
VENDORED_DIR = Path(__file__).resolve().parent.parent / "vendored"
|
|
|
|
OCR_FIXTURES = [
|
|
"pdf_image_only_german",
|
|
"pdf_non_searchable",
|
|
"pdf_ocr_rotated_270",
|
|
"pdf_ocr_rotated_90",
|
|
"pdf_ocr_rotated",
|
|
"pdf_ocr_test",
|
|
"pdf_scanned_ocr",
|
|
]
|
|
|
|
|
|
def pdf_to_images(pdf_path: str, dpi: int = 300) -> list[np.ndarray]:
|
|
"""Convert PDF pages to numpy arrays (RGB, HWC)."""
|
|
import io
|
|
|
|
from PIL import Image
|
|
|
|
doc = fitz.open(pdf_path)
|
|
images = []
|
|
for page in doc:
|
|
mat = fitz.Matrix(dpi / 72, dpi / 72)
|
|
pix = page.get_pixmap(matrix=mat)
|
|
img = Image.open(io.BytesIO(pix.tobytes("png"))).convert("RGB")
|
|
images.append(np.array(img))
|
|
doc.close()
|
|
return images
|
|
|
|
|
|
def lines_to_markdown(lines: list[str]) -> str:
|
|
"""Each OCR text line becomes a markdown paragraph."""
|
|
paragraphs = [line.strip() for line in lines if line.strip()]
|
|
return "\n\n".join(paragraphs) + "\n" if paragraphs else ""
|
|
|
|
|
|
def run_paddleocr_python(pdf_path: str) -> tuple[str, float]:
|
|
"""Run PaddleOCR Python v3.4+ using the predict() API."""
|
|
os.environ["PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK"] = "True"
|
|
from paddleocr import PaddleOCR
|
|
|
|
ocr = PaddleOCR(use_textline_orientation=True, lang="en")
|
|
images = pdf_to_images(pdf_path)
|
|
|
|
start = time.monotonic()
|
|
all_lines: list[str] = []
|
|
for img in images:
|
|
# predict() returns list of OCRResult (dict-like) objects
|
|
for result in ocr.predict(img):
|
|
# OCRResult has 'rec_text' key with list of recognized texts
|
|
rec_texts = result.get("rec_text", [])
|
|
if isinstance(rec_texts, (list, tuple)):
|
|
for t in rec_texts:
|
|
text = str(t).strip()
|
|
if text:
|
|
all_lines.append(text)
|
|
elapsed_ms = (time.monotonic() - start) * 1000
|
|
|
|
return lines_to_markdown(all_lines), elapsed_ms
|
|
|
|
|
|
def run_rapidocr(pdf_path: str) -> tuple[str, float]:
|
|
"""Run RapidOCR."""
|
|
from rapidocr_onnxruntime import RapidOCR
|
|
|
|
ocr = RapidOCR()
|
|
images = pdf_to_images(pdf_path)
|
|
|
|
start = time.monotonic()
|
|
all_lines: list[str] = []
|
|
for img in images:
|
|
result, _ = ocr(img)
|
|
if not result:
|
|
continue
|
|
for line in result:
|
|
if line and len(line) >= 2:
|
|
text = str(line[1]).strip()
|
|
if text:
|
|
all_lines.append(text)
|
|
elapsed_ms = (time.monotonic() - start) * 1000
|
|
|
|
return lines_to_markdown(all_lines), elapsed_ms
|
|
|
|
|
|
def save_vendored(pipeline_name: str, fixture_name: str, md: str, time_ms: float):
|
|
md_dir = VENDORED_DIR / pipeline_name / "md"
|
|
timing_dir = VENDORED_DIR / pipeline_name / "timing"
|
|
md_dir.mkdir(parents=True, exist_ok=True)
|
|
timing_dir.mkdir(parents=True, exist_ok=True)
|
|
(md_dir / f"{fixture_name}.md").write_text(md)
|
|
(timing_dir / f"{fixture_name}.ms").write_text(f"{time_ms:.1f}\n")
|
|
|
|
|
|
def main():
|
|
pipelines = {
|
|
"paddleocr-python": run_paddleocr_python,
|
|
"rapidocr": run_rapidocr,
|
|
}
|
|
|
|
force = "--force" in sys.argv
|
|
args = [a for a in sys.argv[1:] if not a.startswith("--")]
|
|
|
|
if args:
|
|
selected = args[0]
|
|
if selected not in pipelines:
|
|
print(f"Unknown: {selected}. Choose: {list(pipelines.keys())}")
|
|
sys.exit(1)
|
|
pipelines = {selected: pipelines[selected]}
|
|
|
|
for fixture_name in OCR_FIXTURES:
|
|
fixture_path = FIXTURES_DIR / f"{fixture_name}.json"
|
|
if not fixture_path.exists():
|
|
print(f" SKIP {fixture_name}: fixture not found")
|
|
continue
|
|
|
|
with open(fixture_path) as f:
|
|
fixture = json.load(f)
|
|
|
|
doc_path = str((FIXTURES_DIR / fixture["document"]).resolve())
|
|
if not os.path.exists(doc_path):
|
|
print(f" SKIP {fixture_name}: document not found")
|
|
continue
|
|
|
|
for pipeline_name, run_fn in pipelines.items():
|
|
existing = VENDORED_DIR / pipeline_name / "md" / f"{fixture_name}.md"
|
|
if not force and existing.exists() and existing.stat().st_size > 0:
|
|
print(f" CACHED {pipeline_name}/{fixture_name}")
|
|
continue
|
|
|
|
print(f" RUN {pipeline_name}/{fixture_name} ...", end="", flush=True)
|
|
try:
|
|
md, time_ms = run_fn(doc_path)
|
|
save_vendored(pipeline_name, fixture_name, md, time_ms)
|
|
print(f" {time_ms:.0f}ms, {len(md)} chars")
|
|
except Exception as e:
|
|
print(f" ERROR: {e}")
|
|
import traceback
|
|
|
|
traceback.print_exc()
|
|
|
|
|
|
if __name__ == "__main__":
|
|
main()
|