This commit is contained in:
249
tools/benchmark-harness/scripts/generate_markdown_gt.py
Normal file
249
tools/benchmark-harness/scripts/generate_markdown_gt.py
Normal file
@@ -0,0 +1,249 @@
|
||||
#!/usr/bin/env -S uv run --no-project --script
|
||||
# /// script
|
||||
# requires-python = ">=3.10"
|
||||
# dependencies = ["google-genai>=1.0"]
|
||||
# ///
|
||||
"""Generate proper markdown ground truth from PDF documents using Gemini.
|
||||
|
||||
Reads benchmark fixture JSON files to locate PDFs, sends each to a Gemini model (--model, default: gemini-2.0-flash)
|
||||
via Vertex AI, and saves the extracted markdown to the ground truth directory.
|
||||
|
||||
Usage:
|
||||
uv run tools/benchmark-harness/scripts/generate_markdown_gt.py [OPTIONS]
|
||||
|
||||
Examples:
|
||||
# Generate for all nougat + pdfa documents
|
||||
uv run tools/benchmark-harness/scripts/generate_markdown_gt.py
|
||||
|
||||
# Generate for a specific document
|
||||
uv run tools/benchmark-harness/scripts/generate_markdown_gt.py --filter nougat_001
|
||||
|
||||
# Dry run to see what would be processed
|
||||
uv run tools/benchmark-harness/scripts/generate_markdown_gt.py --dry-run
|
||||
|
||||
# Force regeneration of existing files
|
||||
uv run tools/benchmark-harness/scripts/generate_markdown_gt.py --force
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import signal
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
from google import genai
|
||||
from google.genai.types import GenerateContentConfig, Part
|
||||
|
||||
# Instruction prompt sent to the model together with each PDF. It is passed
# verbatim as the text part of the request, so every character here (including
# blank lines) becomes part of what the model sees.
EXTRACTION_PROMPT = """\
Extract the complete text content of this PDF document as clean Markdown.

Rules:
- Use proper heading hierarchy (# for document title, ## for major sections, ### for subsections)
- Render tables using markdown table syntax with | delimiters and --- separator row
- Use numbered lists (1. 2. 3.) and bullet lists (- item) where the document uses them
- Preserve emphasis: **bold** and *italic* where the original uses them
- Use ``` code blocks for code snippets, formulas, or monospace content
- Use <!-- image --> as a placeholder where figures or images appear
- Omit page numbers, running headers/footers, and watermarks
- Preserve the document's reading order
- Do NOT invent or hallucinate content — only extract what is actually in the document
- Do NOT wrap the output in a markdown code fence — return raw markdown directly
- For multi-column layouts, read left column first, then right column
- For forms with label-value pairs, use **Label:** Value format
"""
|
||||
|
||||
|
||||
def get_repo_root() -> Path:
    """Locate the repository root by walking up from this script's directory.

    A directory qualifies when it contains both a ``Cargo.toml`` entry and a
    ``test_documents`` entry. The filesystem root itself is never tested.

    Returns:
        The first qualifying ancestor directory, nearest first.

    Raises:
        RuntimeError: If no ancestor directory qualifies.
    """
    script_dir = Path(__file__).resolve().parent
    for candidate in (script_dir, *script_dir.parents):
        if candidate == candidate.parent:
            # Reached the filesystem root, which the search does not consider.
            break
        has_manifest = (candidate / "Cargo.toml").exists()
        if has_manifest and (candidate / "test_documents").exists():
            return candidate
    raise RuntimeError("Could not find repository root")
|
||||
|
||||
|
||||
def discover_fixtures(fixtures_dir: Path, name_filter: str | None = None) -> list[dict]:
    """Find PDF fixtures that need markdown ground truth.

    Recursively scans ``fixtures_dir`` for ``*.json`` files (in sorted path
    order) and keeps those that describe a PDF whose document file exists.

    Args:
        fixtures_dir: Directory to search recursively for fixture JSON files.
        name_filter: When given (and non-empty), only fixtures whose file stem
            contains this substring are returned.

    Returns:
        One dict per usable fixture with the keys ``name`` (fixture file stem),
        ``fixture_path`` (path of the JSON file), ``doc_path`` (resolved path of
        the PDF) and ``fixture`` (the parsed JSON object).

    Fixtures are skipped silently when the file cannot be read or parsed, the
    top-level JSON value is not an object, ``file_type`` is not ``"pdf"``, or
    ``document`` is missing, not a string, or does not point at a regular file.
    """
    results = []
    for fixture_path in sorted(fixtures_dir.rglob("*.json")):
        try:
            # JSON is UTF-8 by specification; do not depend on the locale default.
            with open(fixture_path, encoding="utf-8") as f:
                fixture = json.load(f)
        except (ValueError, OSError):
            # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
            continue

        # A valid JSON file may hold a list/scalar; only objects can be fixtures.
        if not isinstance(fixture, dict):
            continue

        if fixture.get("file_type") != "pdf":
            continue

        name = fixture_path.stem
        if name_filter and name_filter not in name:
            continue

        doc_rel = fixture.get("document", "")
        if not isinstance(doc_rel, str) or not doc_rel:
            continue

        # The document path is interpreted relative to the fixture file's directory.
        doc_path = (fixture_path.parent / doc_rel).resolve()
        if not doc_path.is_file():
            continue

        results.append(
            {
                "name": name,
                "fixture_path": fixture_path,
                "doc_path": doc_path,
                "fixture": fixture,
            }
        )

    return results
|
||||
|
||||
|
||||
class _Timeout(Exception):
    """Signals that an API call ran past its allotted time."""
|
||||
|
||||
|
||||
def _timeout_handler(signum, frame):
    """Signal handler that aborts the interrupted call by raising ``_Timeout``.

    ``signum`` and ``frame`` are the standard signal-handler arguments; neither
    is inspected.
    """
    timeout_error = _Timeout("API call timed out")
    raise timeout_error
|
||||
|
||||
|
||||
def _strip_code_fence(text: str) -> str:
    """Remove a ```markdown / ```md / ``` wrapper around ``text``, if present.

    Surrounding whitespace is stripped first so that a leading blank line, CRLF
    line endings, or a trailing newline after the closing fence do not prevent
    the wrapper from being recognised. Text whose first line is any other fence
    (for example ```python) is returned unchanged apart from the strip.
    """
    body = text.strip()
    first_line, newline, rest = body.partition("\n")
    if not newline or first_line.rstrip() not in ("```markdown", "```md", "```"):
        return body

    rest = rest.rstrip()
    if rest == "```":
        # The wrapper enclosed nothing at all.
        return ""
    return rest.removesuffix("\n```").strip()


def generate_markdown(
    client: genai.Client,
    pdf_path: Path,
    model: str,
    timeout: int = 120,
) -> str:
    """Send PDF to Gemini and get markdown extraction.

    Args:
        client: google-genai client used to issue the request.
        pdf_path: Path of the PDF; the whole file is read into memory and sent inline.
        model: Model name passed to ``generate_content``.
        timeout: Seconds before the pending call is interrupted via ``SIGALRM``.

    Returns:
        The extracted markdown with any wrapping code fence removed, stripped of
        surrounding whitespace and terminated by exactly one newline. An empty
        response yields ``"\\n"``.

    Raises:
        _Timeout: If the call does not finish within ``timeout`` seconds.

    Note:
        The timeout relies on ``signal.SIGALRM``, which is only available on
        Unix and can only be installed from the main thread.
    """
    pdf_bytes = pdf_path.read_bytes()

    # Arm a one-shot alarm so a hung request raises _Timeout instead of blocking forever.
    old_handler = signal.signal(signal.SIGALRM, _timeout_handler)
    signal.alarm(timeout)
    try:
        response = client.models.generate_content(
            model=model,
            contents=[
                Part.from_bytes(data=pdf_bytes, mime_type="application/pdf"),
                EXTRACTION_PROMPT,
            ],
            config=GenerateContentConfig(
                temperature=0.1,
                # NOTE(review): output is capped at 8192 tokens; long PDFs may
                # come back truncated — confirm this is acceptable for ground truth.
                max_output_tokens=8192,
            ),
        )
    finally:
        # Always cancel the alarm and restore the previous handler, even on error.
        signal.alarm(0)
        signal.signal(signal.SIGALRM, old_handler)

    text = response.text or ""

    # Strip markdown code fence wrapper if Gemini added one despite the prompt.
    return _strip_code_fence(text) + "\n"
|
||||
|
||||
|
||||
def main(argv: list[str] | None = None) -> int:
    """Command-line entry point: generate markdown ground truth for PDF fixtures.

    Discovers PDF fixtures under ``tools/benchmark-harness/fixtures``, sends each
    PDF to the selected Gemini model and writes the result to
    ``test_documents/ground_truth/pdf/<fixture name>.md``. Existing files are
    left alone unless ``--force`` is given; ``--dry-run`` lists the work without
    creating a client or calling the API.

    Args:
        argv: Argument list to parse instead of ``sys.argv[1:]`` (the default
            when ``None``).

    Returns:
        ``0`` when no document failed, ``1`` otherwise.
    """
    parser = argparse.ArgumentParser(description="Generate markdown ground truth from PDFs using Gemini")
    parser.add_argument(
        "--filter", type=str, default=None, help="Only process fixtures whose name contains this string"
    )
    parser.add_argument("--dry-run", action="store_true", help="Show what would be processed without calling the API")
    parser.add_argument("--force", action="store_true", help="Regenerate even if .md file already exists")
    parser.add_argument(
        "--model", type=str, default="gemini-2.0-flash", help="Gemini model to use (default: gemini-2.0-flash)"
    )
    parser.add_argument("--project", type=str, default="boxwood-spirit-479620-r5", help="GCP project ID")
    parser.add_argument("--location", type=str, default="us-central1", help="Vertex AI location")
    parser.add_argument("--delay", type=float, default=1.0, help="Delay between API calls in seconds (rate limiting)")
    parser.add_argument("--timeout", type=int, default=120, help="Per-request timeout in seconds (default: 120)")
    parser.add_argument("--max-size", type=int, default=None, help="Skip PDFs larger than this many KB")
    args = parser.parse_args(argv)

    repo_root = get_repo_root()
    fixtures_dir = repo_root / "tools" / "benchmark-harness" / "fixtures"
    gt_dir = repo_root / "test_documents" / "ground_truth" / "pdf"

    print(f"Repository root: {repo_root}")
    print(f"Fixtures dir: {fixtures_dir}")
    print(f"Output dir: {gt_dir}")
    print(f"Model: {args.model}")
    if args.dry_run:
        print("DRY RUN MODE\n")

    fixtures = discover_fixtures(fixtures_dir, args.filter)
    print(f"Found {len(fixtures)} PDF fixtures")

    # The client is only needed (and only created) when real API calls are made.
    client = None
    if not args.dry_run:
        client = genai.Client(
            vertexai=True,
            project=args.project,
            location=args.location,
        )

    stats = {"generated": 0, "skipped": 0, "too_large": 0, "errors": 0}

    for item in fixtures:
        name = item["name"]
        md_path = gt_dir / f"{name}.md"

        if md_path.exists() and not args.force:
            stats["skipped"] += 1
            continue

        file_size_kb = item["doc_path"].stat().st_size / 1024

        # Compare against None explicitly so that "--max-size 0" is honoured.
        if args.max_size is not None and file_size_kb > args.max_size:
            print(f" Skipping {name} ({file_size_kb:.0f} KB > {args.max_size} KB)")
            stats["too_large"] += 1
            continue

        if args.dry_run:
            print(f" [DRY] {name} ({file_size_kb:.0f} KB)")
            stats["generated"] += 1
            continue

        print(f" Processing {name} ({file_size_kb:.0f} KB)...", end=" ", flush=True)
        try:
            start = time.monotonic()
            markdown = generate_markdown(client, item["doc_path"], args.model, timeout=args.timeout)
            elapsed = time.monotonic() - start

            if not markdown.strip():
                # Writing an empty file would make later runs skip this document
                # as "already generated", so treat it as a failure instead.
                print(f"ERROR: empty response ({elapsed:.1f}s)")
                stats["errors"] += 1
            else:
                gt_dir.mkdir(parents=True, exist_ok=True)
                md_path.write_text(markdown, encoding="utf-8")

                # Quick quality check
                lines = markdown.strip().split("\n")
                headings = sum(1 for line in lines if line.startswith("#"))
                tables = sum(1 for line in lines if "|" in line and "---" not in line)
                print(f"OK ({elapsed:.1f}s, {len(lines)} lines, {headings} headings, {tables} table rows)")
                stats["generated"] += 1

        except _Timeout:
            print(f"TIMEOUT ({args.timeout}s)")
            stats["errors"] += 1
        except Exception as e:
            # Top-level boundary: report the failure and carry on with the next document.
            print(f"ERROR: {e}")
            stats["errors"] += 1

        # Rate-limit after every API call, including failed ones, so that
        # errors such as quota rejections are not retried back-to-back.
        time.sleep(args.delay)

    print(f"\n{'=' * 50}")
    print(f"Generated: {stats['generated']}")
    print(f"Skipped: {stats['skipped']} (already exist)")
    if args.max_size is not None:
        print(f"Too large: {stats['too_large']} (over --max-size)")
    print(f"Errors: {stats['errors']}")

    return 0 if stats["errors"] == 0 else 1
|
||||
|
||||
|
||||
# Script entry point: run main() and use its return value as the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|
||||
Reference in New Issue
Block a user