This commit is contained in:
172
tools/benchmark-harness/scripts/generate_vendored_baselines.py
Normal file
172
tools/benchmark-harness/scripts/generate_vendored_baselines.py
Normal file
@@ -0,0 +1,172 @@
|
||||
# /// script
# requires-python = ">=3.11"
# dependencies = [
#     "paddleocr>=3.4.0",
#     "paddlepaddle>=3.3.0",
#     "rapidocr-onnxruntime>=1.4.0",
#     "pymupdf>=1.24.0",
#     "pillow>=10.0.0",
#     "numpy>=1.24.0",
# ]
# ///
|
||||
"""Generate vendored OCR baselines from PaddleOCR Python and RapidOCR.

Usage:
    uv run tools/benchmark-harness/scripts/generate_vendored_baselines.py
    uv run tools/benchmark-harness/scripts/generate_vendored_baselines.py rapidocr
    uv run tools/benchmark-harness/scripts/generate_vendored_baselines.py --force
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
import fitz
|
||||
import numpy as np
|
||||
|
||||
# Both data directories live two levels above this file's resolved location.
_HARNESS_ROOT = Path(__file__).resolve().parents[1]
FIXTURES_DIR = _HARNESS_ROOT / "fixtures"
VENDORED_DIR = _HARNESS_ROOT / "vendored"

# Fixture names (without the ".json" suffix) that main() generates baselines for.
OCR_FIXTURES = [
    "pdf_image_only_german",
    "pdf_non_searchable",
    "pdf_ocr_rotated_270",
    "pdf_ocr_rotated_90",
    "pdf_ocr_rotated",
    "pdf_ocr_test",
    "pdf_scanned_ocr",
]
|
||||
|
||||
|
||||
def pdf_to_images(pdf_path: str, dpi: int = 300) -> list[np.ndarray]:
    """Convert PDF pages to numpy arrays (RGB, HWC).

    Each page is rasterized to a PNG by PyMuPDF, decoded with Pillow,
    converted to ``"RGB"`` mode and turned into an array.

    Args:
        pdf_path: Path of the PDF to rasterize.
        dpi: Target resolution; pages are scaled by ``dpi / 72`` on both axes.

    Returns:
        One array per page, in document order. Empty if the document has
        no pages.
    """
    import io

    from PIL import Image

    # The zoom is identical for every page, so build the matrix once.
    zoom = dpi / 72
    mat = fitz.Matrix(zoom, zoom)

    images: list[np.ndarray] = []
    # The context manager closes the document even if rendering a page raises.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            pix = page.get_pixmap(matrix=mat)
            img = Image.open(io.BytesIO(pix.tobytes("png"))).convert("RGB")
            images.append(np.array(img))
    return images
|
||||
|
||||
|
||||
def lines_to_markdown(lines: list[str]) -> str:
    """Render OCR text lines as markdown, one paragraph per non-blank line.

    Lines are stripped of surrounding whitespace; lines that are empty after
    stripping are dropped. Paragraphs are separated by a blank line and the
    result ends with a single newline. Returns ``""`` when nothing remains.
    """
    cleaned = [stripped for raw in lines if (stripped := raw.strip())]
    if not cleaned:
        return ""
    return "\n\n".join(cleaned) + "\n"
|
||||
|
||||
|
||||
def run_paddleocr_python(pdf_path: str) -> tuple[str, float]:
    """Run PaddleOCR Python v3.4+ using the predict() API.

    Args:
        pdf_path: Path of the PDF to OCR.

    Returns:
        ``(markdown, elapsed_ms)``. ``markdown`` holds the recognized text
        lines formatted by ``lines_to_markdown``. ``elapsed_ms`` covers only
        the ``predict()`` loop; model construction and PDF rasterization
        happen before the timer starts.
    """
    # Set before the import, as in the original ordering.
    # NOTE(review): presumably read by paddleocr/paddlex at import time — confirm.
    os.environ["PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK"] = "True"
    from paddleocr import PaddleOCR

    ocr = PaddleOCR(use_textline_orientation=True, lang="en")
    images = pdf_to_images(pdf_path)

    start = time.monotonic()
    all_lines: list[str] = []
    for img in images:
        # predict() returns dict-like result objects, one per input image.
        for result in ocr.predict(img):
            # The OCR pipeline result documents the recognized lines under
            # "rec_texts" (plural). Reading only "rec_text" yields nothing for
            # such results, so prefer the plural key and keep the singular
            # one as a fallback.
            rec_texts = result.get("rec_texts")
            if rec_texts is None:
                rec_texts = result.get("rec_text", [])
            if isinstance(rec_texts, str):
                # A bare string is a single recognized line, not a sequence.
                rec_texts = [rec_texts]
            if not isinstance(rec_texts, (list, tuple)):
                continue
            all_lines.extend(
                text for t in rec_texts if (text := str(t).strip())
            )
    elapsed_ms = (time.monotonic() - start) * 1000

    return lines_to_markdown(all_lines), elapsed_ms
|
||||
|
||||
|
||||
def run_rapidocr(pdf_path: str) -> tuple[str, float]:
    """OCR a PDF with RapidOCR and return ``(markdown, elapsed_ms)``.

    The engine is called once per rasterized page. From every non-empty
    result entry with at least two items, the second item is taken as the
    text. The timer spans only the per-page engine calls; engine creation
    and PDF rasterization are excluded.
    """
    from rapidocr_onnxruntime import RapidOCR

    engine = RapidOCR()
    pages = pdf_to_images(pdf_path)

    t0 = time.monotonic()
    collected: list[str] = []
    for page in pages:
        detections, _ = engine(page)
        # A falsy result (nothing returned for this page) contributes no lines.
        for entry in detections or ():
            if not entry or len(entry) < 2:
                continue
            stripped = str(entry[1]).strip()
            if stripped:
                collected.append(stripped)
    duration_ms = (time.monotonic() - t0) * 1000

    return lines_to_markdown(collected), duration_ms
|
||||
|
||||
|
||||
def save_vendored(pipeline_name: str, fixture_name: str, md: str, time_ms: float) -> None:
    """Write one pipeline's output for one fixture under ``VENDORED_DIR``.

    Creates ``<pipeline_name>/md`` and ``<pipeline_name>/timing`` if needed,
    then writes ``<fixture_name>.md`` (the markdown) and ``<fixture_name>.ms``
    (the time formatted with one decimal place plus a newline).

    Args:
        pipeline_name: Sub-directory name for the pipeline.
        fixture_name: Base name for both output files.
        md: Markdown text to store.
        time_ms: Elapsed time in milliseconds.
    """
    md_dir = VENDORED_DIR / pipeline_name / "md"
    timing_dir = VENDORED_DIR / pipeline_name / "timing"
    md_dir.mkdir(parents=True, exist_ok=True)
    timing_dir.mkdir(parents=True, exist_ok=True)
    # OCR text may contain non-ASCII characters; pin UTF-8 so the output does
    # not depend on the platform's locale encoding.
    (md_dir / f"{fixture_name}.md").write_text(md, encoding="utf-8")
    (timing_dir / f"{fixture_name}.ms").write_text(f"{time_ms:.1f}\n", encoding="utf-8")
|
||||
|
||||
|
||||
def main():
    """Generate vendored baselines for every fixture in ``OCR_FIXTURES``.

    Command line:
        * first positional argument (optional): name of a single pipeline to
          run; an unknown name prints a message and exits with status 1.
        * ``--force``: regenerate even when a non-empty ``.md`` output
          already exists.

    A fixture is skipped when its ``.json`` file or the document it names is
    missing. A pipeline failure is reported with a traceback and the run
    continues with the remaining work; if any pipeline failed, the process
    exits with status 1 once everything has been attempted.
    """
    import traceback

    pipelines = {
        "paddleocr-python": run_paddleocr_python,
        "rapidocr": run_rapidocr,
    }

    force = "--force" in sys.argv
    args = [a for a in sys.argv[1:] if not a.startswith("--")]

    if args:
        selected = args[0]
        if selected not in pipelines:
            print(f"Unknown: {selected}. Choose: {list(pipelines.keys())}")
            sys.exit(1)
        pipelines = {selected: pipelines[selected]}

    failures = 0
    for fixture_name in OCR_FIXTURES:
        fixture_path = FIXTURES_DIR / f"{fixture_name}.json"
        if not fixture_path.exists():
            print(f" SKIP {fixture_name}: fixture not found")
            continue

        # Explicit encoding so parsing does not depend on the locale.
        with open(fixture_path, encoding="utf-8") as f:
            fixture = json.load(f)

        # The "document" entry is joined onto the fixtures directory.
        doc_path = str((FIXTURES_DIR / fixture["document"]).resolve())
        if not os.path.exists(doc_path):
            print(f" SKIP {fixture_name}: document not found")
            continue

        for pipeline_name, run_fn in pipelines.items():
            existing = VENDORED_DIR / pipeline_name / "md" / f"{fixture_name}.md"
            # An empty output file is not treated as cached and is regenerated.
            if not force and existing.exists() and existing.stat().st_size > 0:
                print(f" CACHED {pipeline_name}/{fixture_name}")
                continue

            print(f" RUN {pipeline_name}/{fixture_name} ...", end="", flush=True)
            try:
                md, time_ms = run_fn(doc_path)
                save_vendored(pipeline_name, fixture_name, md, time_ms)
                print(f" {time_ms:.0f}ms, {len(md)} chars")
            except Exception as e:
                # Best effort: report the failure and keep going.
                print(f" ERROR: {e}")
                traceback.print_exc()
                failures += 1

    if failures:
        # Surface partial failure to callers instead of reporting success.
        sys.exit(1)
|
||||
|
||||
|
||||
# Entry guard: call main() only when this file is executed as a script.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user