250 lines
8.5 KiB
Python
250 lines
8.5 KiB
Python
|
|
#!/usr/bin/env -S uv run --no-project --script
|
||
|
|
# /// script
|
||
|
|
# requires-python = ">=3.10"
|
||
|
|
# dependencies = ["google-genai>=1.0"]
|
||
|
|
# ///
|
||
|
|
"""Generate proper markdown ground truth from PDF documents using Gemini.
|
||
|
|
|
||
|
|
Reads benchmark fixture JSON files to locate PDFs, sends each to Gemini 2.5 Flash
|
||
|
|
via Vertex AI, and saves the extracted markdown to the ground truth directory.
|
||
|
|
|
||
|
|
Usage:
|
||
|
|
uv run tools/benchmark-harness/scripts/generate_markdown_gt.py [OPTIONS]
|
||
|
|
|
||
|
|
Examples:
|
||
|
|
# Generate for all nougat + pdfa documents
|
||
|
|
uv run tools/benchmark-harness/scripts/generate_markdown_gt.py
|
||
|
|
|
||
|
|
# Generate for a specific document
|
||
|
|
uv run tools/benchmark-harness/scripts/generate_markdown_gt.py --filter nougat_001
|
||
|
|
|
||
|
|
# Dry run to see what would be processed
|
||
|
|
uv run tools/benchmark-harness/scripts/generate_markdown_gt.py --dry-run
|
||
|
|
|
||
|
|
# Force regeneration of existing files
|
||
|
|
uv run tools/benchmark-harness/scripts/generate_markdown_gt.py --force
|
||
|
|
"""
|
||
|
|
|
||
|
|
from __future__ import annotations
|
||
|
|
|
||
|
|
import argparse
|
||
|
|
import json
|
||
|
|
import signal
|
||
|
|
import sys
|
||
|
|
import time
|
||
|
|
from pathlib import Path
|
||
|
|
|
||
|
|
from google import genai
|
||
|
|
from google.genai.types import GenerateContentConfig, Part
|
||
|
|
|
||
|
|
EXTRACTION_PROMPT = """\
|
||
|
|
Extract the complete text content of this PDF document as clean Markdown.
|
||
|
|
|
||
|
|
Rules:
|
||
|
|
- Use proper heading hierarchy (# for document title, ## for major sections, ### for subsections)
|
||
|
|
- Render tables using markdown table syntax with | delimiters and --- separator row
|
||
|
|
- Use numbered lists (1. 2. 3.) and bullet lists (- item) where the document uses them
|
||
|
|
- Preserve emphasis: **bold** and *italic* where the original uses them
|
||
|
|
- Use ``` code blocks for code snippets, formulas, or monospace content
|
||
|
|
- Use <!-- image --> as a placeholder where figures or images appear
|
||
|
|
- Omit page numbers, running headers/footers, and watermarks
|
||
|
|
- Preserve the document's reading order
|
||
|
|
- Do NOT invent or hallucinate content — only extract what is actually in the document
|
||
|
|
- Do NOT wrap the output in a markdown code fence — return raw markdown directly
|
||
|
|
- For multi-column layouts, read left column first, then right column
|
||
|
|
- For forms with label-value pairs, use **Label:** Value format
|
||
|
|
"""
|
||
|
|
|
||
|
|
|
||
|
|
def get_repo_root() -> Path:
|
||
|
|
current = Path(__file__).resolve().parent
|
||
|
|
while current != current.parent:
|
||
|
|
if (current / "Cargo.toml").exists() and (current / "test_documents").exists():
|
||
|
|
return current
|
||
|
|
current = current.parent
|
||
|
|
raise RuntimeError("Could not find repository root")
|
||
|
|
|
||
|
|
|
||
|
|
def discover_fixtures(fixtures_dir: Path, name_filter: str | None = None) -> list[dict]:
|
||
|
|
"""Find PDF fixtures that need markdown ground truth."""
|
||
|
|
results = []
|
||
|
|
for fixture_path in sorted(fixtures_dir.rglob("*.json")):
|
||
|
|
try:
|
||
|
|
with open(fixture_path) as f:
|
||
|
|
fixture = json.load(f)
|
||
|
|
except (json.JSONDecodeError, OSError):
|
||
|
|
continue
|
||
|
|
|
||
|
|
if fixture.get("file_type") != "pdf":
|
||
|
|
continue
|
||
|
|
|
||
|
|
name = fixture_path.stem
|
||
|
|
if name_filter and name_filter not in name:
|
||
|
|
continue
|
||
|
|
|
||
|
|
doc_rel = fixture.get("document", "")
|
||
|
|
if not doc_rel:
|
||
|
|
continue
|
||
|
|
|
||
|
|
doc_path = (fixture_path.parent / doc_rel).resolve()
|
||
|
|
if not doc_path.exists():
|
||
|
|
continue
|
||
|
|
|
||
|
|
results.append(
|
||
|
|
{
|
||
|
|
"name": name,
|
||
|
|
"fixture_path": fixture_path,
|
||
|
|
"doc_path": doc_path,
|
||
|
|
"fixture": fixture,
|
||
|
|
}
|
||
|
|
)
|
||
|
|
|
||
|
|
return results
|
||
|
|
|
||
|
|
|
||
|
|
class _Timeout(Exception):
|
||
|
|
pass
|
||
|
|
|
||
|
|
|
||
|
|
def _timeout_handler(signum, frame):
|
||
|
|
raise _Timeout("API call timed out")
|
||
|
|
|
||
|
|
|
||
|
|
def generate_markdown(
|
||
|
|
client: genai.Client,
|
||
|
|
pdf_path: Path,
|
||
|
|
model: str,
|
||
|
|
timeout: int = 120,
|
||
|
|
) -> str:
|
||
|
|
"""Send PDF to Gemini and get markdown extraction."""
|
||
|
|
pdf_bytes = pdf_path.read_bytes()
|
||
|
|
|
||
|
|
old_handler = signal.signal(signal.SIGALRM, _timeout_handler)
|
||
|
|
signal.alarm(timeout)
|
||
|
|
try:
|
||
|
|
response = client.models.generate_content(
|
||
|
|
model=model,
|
||
|
|
contents=[
|
||
|
|
Part.from_bytes(data=pdf_bytes, mime_type="application/pdf"),
|
||
|
|
EXTRACTION_PROMPT,
|
||
|
|
],
|
||
|
|
config=GenerateContentConfig(
|
||
|
|
temperature=0.1,
|
||
|
|
max_output_tokens=8192,
|
||
|
|
),
|
||
|
|
)
|
||
|
|
finally:
|
||
|
|
signal.alarm(0)
|
||
|
|
signal.signal(signal.SIGALRM, old_handler)
|
||
|
|
|
||
|
|
text = response.text or ""
|
||
|
|
|
||
|
|
# Strip markdown code fence wrapper if Gemini added one
|
||
|
|
if text.startswith("```markdown\n"):
|
||
|
|
text = text[len("```markdown\n") :]
|
||
|
|
text = text.removesuffix("\n```")
|
||
|
|
elif text.startswith("```md\n"):
|
||
|
|
text = text[len("```md\n") :]
|
||
|
|
text = text.removesuffix("\n```")
|
||
|
|
elif text.startswith("```\n"):
|
||
|
|
text = text[len("```\n") :]
|
||
|
|
text = text.removesuffix("\n```")
|
||
|
|
|
||
|
|
return text.strip() + "\n"
|
||
|
|
|
||
|
|
|
||
|
|
def main() -> int:
|
||
|
|
parser = argparse.ArgumentParser(description="Generate markdown ground truth from PDFs using Gemini")
|
||
|
|
parser.add_argument(
|
||
|
|
"--filter", type=str, default=None, help="Only process fixtures whose name contains this string"
|
||
|
|
)
|
||
|
|
parser.add_argument("--dry-run", action="store_true", help="Show what would be processed without calling the API")
|
||
|
|
parser.add_argument("--force", action="store_true", help="Regenerate even if .md file already exists")
|
||
|
|
parser.add_argument(
|
||
|
|
"--model", type=str, default="gemini-2.0-flash", help="Gemini model to use (default: gemini-2.0-flash)"
|
||
|
|
)
|
||
|
|
parser.add_argument("--project", type=str, default="boxwood-spirit-479620-r5", help="GCP project ID")
|
||
|
|
parser.add_argument("--location", type=str, default="us-central1", help="Vertex AI location")
|
||
|
|
parser.add_argument("--delay", type=float, default=1.0, help="Delay between API calls in seconds (rate limiting)")
|
||
|
|
parser.add_argument("--timeout", type=int, default=120, help="Per-request timeout in seconds (default: 120)")
|
||
|
|
parser.add_argument("--max-size", type=int, default=None, help="Skip PDFs larger than this many KB")
|
||
|
|
args = parser.parse_args()
|
||
|
|
|
||
|
|
repo_root = get_repo_root()
|
||
|
|
fixtures_dir = repo_root / "tools" / "benchmark-harness" / "fixtures"
|
||
|
|
gt_dir = repo_root / "test_documents" / "ground_truth" / "pdf"
|
||
|
|
|
||
|
|
print(f"Repository root: {repo_root}")
|
||
|
|
print(f"Fixtures dir: {fixtures_dir}")
|
||
|
|
print(f"Output dir: {gt_dir}")
|
||
|
|
print(f"Model: {args.model}")
|
||
|
|
if args.dry_run:
|
||
|
|
print("DRY RUN MODE\n")
|
||
|
|
|
||
|
|
fixtures = discover_fixtures(fixtures_dir, args.filter)
|
||
|
|
print(f"Found {len(fixtures)} PDF fixtures")
|
||
|
|
|
||
|
|
if not args.dry_run:
|
||
|
|
client = genai.Client(
|
||
|
|
vertexai=True,
|
||
|
|
project=args.project,
|
||
|
|
location=args.location,
|
||
|
|
)
|
||
|
|
|
||
|
|
stats = {"generated": 0, "skipped": 0, "errors": 0}
|
||
|
|
|
||
|
|
for item in fixtures:
|
||
|
|
name = item["name"]
|
||
|
|
md_path = gt_dir / f"{name}.md"
|
||
|
|
file_size_kb = item["doc_path"].stat().st_size / 1024
|
||
|
|
|
||
|
|
if md_path.exists() and not args.force:
|
||
|
|
stats["skipped"] += 1
|
||
|
|
continue
|
||
|
|
|
||
|
|
if args.max_size and file_size_kb > args.max_size:
|
||
|
|
print(f" Skipping {name} ({file_size_kb:.0f} KB > {args.max_size} KB)")
|
||
|
|
stats["skipped"] += 1
|
||
|
|
continue
|
||
|
|
|
||
|
|
if args.dry_run:
|
||
|
|
print(f" [DRY] {name} ({file_size_kb:.0f} KB)")
|
||
|
|
stats["generated"] += 1
|
||
|
|
continue
|
||
|
|
|
||
|
|
print(f" Processing {name} ({file_size_kb:.0f} KB)...", end=" ", flush=True)
|
||
|
|
try:
|
||
|
|
start = time.time()
|
||
|
|
markdown = generate_markdown(client, item["doc_path"], args.model, timeout=args.timeout)
|
||
|
|
elapsed = time.time() - start
|
||
|
|
|
||
|
|
gt_dir.mkdir(parents=True, exist_ok=True)
|
||
|
|
md_path.write_text(markdown, encoding="utf-8")
|
||
|
|
|
||
|
|
# Quick quality check
|
||
|
|
lines = markdown.strip().split("\n")
|
||
|
|
headings = sum(1 for l in lines if l.startswith("#"))
|
||
|
|
tables = sum(1 for l in lines if "|" in l and "---" not in l)
|
||
|
|
print(f"OK ({elapsed:.1f}s, {len(lines)} lines, {headings} headings, {tables} table rows)")
|
||
|
|
stats["generated"] += 1
|
||
|
|
|
||
|
|
time.sleep(args.delay)
|
||
|
|
|
||
|
|
except _Timeout:
|
||
|
|
print(f"TIMEOUT ({args.timeout}s)")
|
||
|
|
stats["errors"] += 1
|
||
|
|
except Exception as e:
|
||
|
|
print(f"ERROR: {e}")
|
||
|
|
stats["errors"] += 1
|
||
|
|
|
||
|
|
print(f"\n{'=' * 50}")
|
||
|
|
print(f"Generated: {stats['generated']}")
|
||
|
|
print(f"Skipped: {stats['skipped']} (already exist)")
|
||
|
|
print(f"Errors: {stats['errors']}")
|
||
|
|
|
||
|
|
return 0 if stats["errors"] == 0 else 1
|
||
|
|
|
||
|
|
|
||
|
|
if __name__ == "__main__":
|
||
|
|
sys.exit(main())
|