408 lines
13 KiB
Python
408 lines
13 KiB
Python
|
|
"""Import OmniDocBench dataset into our benchmark fixture format.

Converts OmniDocBench's element-level JSON annotations into:
- Per-document fixture JSON files (tools/benchmark-harness/fixtures/pdf/omnidoc_<name>.json)
- Ground truth markdown files (test_documents/ground_truth/pdf/omnidoc_<name>.md)
- Ground truth text files (test_documents/ground_truth/pdf/omnidoc_<name>.txt)

<name> is the source document name taken from the page image path, with every
character outside [a-zA-Z0-9_-] replaced by an underscore.

Pages are grouped by source document. Each multi-page document produces one fixture.
Single-page documents produce one fixture per page.

Usage:
    python import_omnidocbench.py <omnidocbench_dir> <repo_root>

Where:
    omnidocbench_dir = tools/benchmark-harness/datasets/omnidocbench (contains OmniDocBench.json + ori_pdfs/)
    repo_root        = repository root (contains tools/ and test_documents/)
"""
|
||
|
|
|
||
|
|
from __future__ import annotations
|
||
|
|
|
||
|
|
import html
|
||
|
|
import json
|
||
|
|
import os
|
||
|
|
import re
|
||
|
|
import sys
|
||
|
|
from collections import defaultdict
|
||
|
|
from pathlib import Path
|
||
|
|
|
||
|
|
# OmniDocBench ``category_type`` values whose content belongs in the
# generated ground truth.
CONTENT_CATEGORIES = {
    # Body content
    "title", "text_block", "table", "equation_isolated", "code_txt",
    # Captions attached to body content
    "figure_caption", "table_caption", "equation_caption", "code_txt_caption",
    # Bibliography entries
    "reference",
}
|
||
|
|
|
||
|
|
# OmniDocBench ``category_type`` values that never reach the ground truth:
# page furniture, discarded regions, text-less figures and footnotes.
SKIP_CATEGORIES = {
    # Page furniture
    "header", "footer", "page_number", "page_footnote",
    # Regions the dataset marks as discarded
    "abandon",
    # Figures and element-level footnotes
    "figure", "figure_footnote", "table_footnote",
}
|
||
|
|
|
||
|
|
|
||
|
|
def html_table_to_markdown(html_str: str) -> str:
    """Convert a simple HTML table to a markdown (pipe) table.

    Rows are taken from ``<tr>`` elements and cells from ``<td>``/``<th>``
    elements using regular expressions, so nested tables and
    ``colspan``/``rowspan`` are not interpreted. The first row becomes the
    markdown header row and short rows are padded with empty cells.

    Args:
        html_str: HTML markup of the table; may be empty.

    Returns:
        The markdown table, ``""`` for empty input, or the entity-unescaped
        input when no table rows can be found.
    """
    if not html_str:
        return ""

    def extract_rows(markup: str, decode_cells: bool) -> list[list[str]]:
        """Return the non-empty rows of *markup* as lists of cell strings."""
        found: list[list[str]] = []
        for row_match in re.finditer(r"<tr[^>]*>(.*?)</tr>", markup, re.DOTALL):
            cells: list[str] = []
            for cell_match in re.finditer(r"<t[dh][^>]*>(.*?)</t[dh]>", row_match.group(1), re.DOTALL):
                cell_text = re.sub(r"<[^>]+>", "", cell_match.group(1))
                if decode_cells:
                    # Decode entities only AFTER tags are stripped; otherwise
                    # literal text such as "a &lt; b &gt; c" is mistaken for
                    # a tag and deleted.
                    cell_text = html.unescape(cell_text)
                # A markdown table row must stay on one line, so collapse
                # newlines and runs of whitespace inside the cell.
                cells.append(" ".join(cell_text.split()))
            if cells:
                found.append(cells)
        return found

    unescaped = html.unescape(html_str)
    rows = extract_rows(html_str, decode_cells=True)
    if not rows:
        # The markup itself may be entity-escaped ("&lt;tr&gt;..."); retry on
        # the decoded form without decoding the cells a second time.
        rows = extract_rows(unescaped, decode_cells=False)
    if not rows:
        return unescaped  # fallback: nothing that looks like a table

    # Normalize column count so every row has as many cells as the widest.
    max_cols = max(len(row) for row in rows)
    rows = [row + [""] * (max_cols - len(row)) for row in rows]

    header, *body = rows
    lines = [
        "| " + " | ".join(header) + " |",
        "|" + "|".join(["---"] * max_cols) + "|",
    ]
    lines.extend("| " + " | ".join(row) + " |" for row in body)
    return "\n".join(lines)
|
||
|
|
|
||
|
|
|
||
|
|
def annotation_to_markdown(ann: dict) -> str | None:
    """Convert a single OmniDocBench annotation to markdown text.

    Args:
        ann: One entry of a page's ``layout_dets`` list. The keys read are
            ``category_type``, ``ignore``, ``text``, ``html`` and ``latex``;
            any of them may be missing or null.

    Returns:
        The markdown for the annotation, or ``None`` when the annotation is
        in ``SKIP_CATEGORIES``, is flagged ``ignore``, or has no content.
    """
    cat = ann.get("category_type", "")

    if cat in SKIP_CATEGORIES or ann.get("ignore", False):
        return None

    # ``or ""`` (rather than a .get default) also covers keys that are
    # present but null, which would otherwise crash on ``.strip()``.
    text = (ann.get("text") or "").strip()

    if cat == "title":
        # OmniDocBench doesn't distinguish heading levels.
        # Use H2 as default (most titles are section-level, not document-level).
        return f"## {text}" if text else None

    if cat == "table":
        # Prefer the HTML representation for tables, fall back to text.
        html_str = ann.get("html") or ""
        if html_str:
            return html_table_to_markdown(html_str)
        return text or None

    if cat == "equation_isolated":
        latex = (ann.get("latex") or "").strip()
        # NOTE(review): annotations may already carry display-math
        # delimiters; strip one matching pair so the formula is not wrapped
        # twice. This is a no-op for undelimited LaTeX.
        for opener, closer in (("$$", "$$"), ("\\[", "\\]")):
            if (
                len(latex) >= len(opener) + len(closer)
                and latex.startswith(opener)
                and latex.endswith(closer)
            ):
                latex = latex[len(opener):-len(closer)].strip()
                break
        if latex:
            return f"$$\n{latex}\n$$"
        return text or None

    if cat == "code_txt":
        return f"```\n{text}\n```" if text else None

    # Text blocks, captions, references and unknown categories are all
    # emitted as plain paragraphs when they carry text.
    return text or None
|
||
|
|
|
||
|
|
|
||
|
|
def page_to_markdown(page: dict) -> str:
    """Convert a single OmniDocBench page to markdown.

    Annotations from ``page["layout_dets"]`` are emitted in reading order
    (their ``order`` value; missing or null orders sort last). Blocks that
    ``page["extra"]["relation"]`` marks as ``truncated`` continuations are
    folded into the text of the block they continue instead of being
    emitted on their own.

    Args:
        page: One page entry of the OmniDocBench JSON.

    Returns:
        The page's markdown blocks joined by blank lines; ``""`` when the
        page yields no content.
    """
    annotations = page.get("layout_dets") or []

    def reading_order(ann: dict):
        # A null ``order`` cannot be compared with ints; treat it like a
        # missing one and push the annotation to the end.
        order = ann.get("order")
        return 999 if order is None else order

    sorted_anns = sorted(annotations, key=reading_order)

    # Handle truncated blocks (merge them): continuation id -> id of the
    # block it continues.
    relations = (page.get("extra") or {}).get("relation") or []
    merge_targets: dict[int, int] = {}
    for rel in relations:
        if rel.get("relation") != "truncated":
            continue
        target_id = rel.get("target_anno_id")
        source_id = rel.get("source_anno_id")
        # Skip malformed or self-referential relations instead of crashing
        # or silently dropping the block.
        if target_id is None or source_id is None or target_id == source_id:
            continue
        merge_targets[target_id] = source_id

    def root_source(anno_id: int) -> int:
        """Follow a chain of truncations back to the first block."""
        seen = {anno_id}
        current = merge_targets[anno_id]
        # Without this walk, a block that is both a continuation and itself
        # continued would swallow the text of everything after it.
        while current in merge_targets and current not in seen:
            seen.add(current)
            current = merge_targets[current]
        return current

    # Collect continuation text per root block, in reading order.
    merged_text: dict[int, list[str]] = defaultdict(list)
    for ann in sorted_anns:
        anno_id = ann.get("anno_id", -1)
        if anno_id in merge_targets:
            text = (ann.get("text") or "").strip()
            if text:
                merged_text[root_source(anno_id)].append(text)

    blocks: list[str] = []
    for ann in sorted_anns:
        anno_id = ann.get("anno_id", -1)
        if anno_id in merge_targets:
            continue  # already folded into the block it continues

        # Append merged text from truncated continuations.
        if anno_id in merged_text:
            original_text = (ann.get("text") or "").strip()
            continuation = " ".join(merged_text[anno_id])
            ann = dict(ann)  # shallow copy so the input page is not mutated
            ann["text"] = f"{original_text} {continuation}".strip()

        md = annotation_to_markdown(ann)
        if md:
            blocks.append(md)

    return "\n\n".join(blocks)
|
||
|
|
|
||
|
|
|
||
|
|
def strip_markdown_to_text(md: str) -> str:
    """Strip markdown syntax to produce plain text.

    Heading markers, table pipes, table separator rows and ``*`` emphasis
    markers are removed and blank lines are dropped. The contents of fenced
    code blocks and ``$$`` display formulas are kept verbatim, without their
    fence/delimiter lines.

    Args:
        md: Markdown text, lines separated by ``"\\n"``.

    Returns:
        The plain-text lines joined by ``"\\n"``.
    """
    lines: list[str] = []
    in_code = False
    in_formula = False

    for line in md.split("\n"):
        if in_code:
            # Inside a fence only the closing fence is syntax; everything
            # else (including a "$$" line) is literal code.
            if line.startswith("```"):
                in_code = False
            else:
                lines.append(line)
            continue

        if line.startswith("```"):
            in_code = True
            continue

        if line.startswith("$$"):
            tail = line.rstrip()
            if len(tail) > 4 and tail.endswith("$$"):
                # One-line "$$ ... $$": keep the formula body and leave the
                # state alone, otherwise the rest of the document would be
                # treated as formula content.
                inner = tail[2:-2].strip()
                if inner:
                    lines.append(inner)
                continue
            in_formula = not in_formula
            continue

        if in_formula:
            lines.append(line)
            continue

        # Strip heading markers
        stripped = re.sub(r"^#{1,6}\s+", "", line)
        # Strip table pipes (keep cell content)
        if stripped.startswith("|") and stripped.endswith("|"):
            # Skip separator rows
            if re.match(r"^\|[-|: ]+\|$", stripped):
                continue
            stripped = re.sub(r"\s*\|\s*", " ", stripped).strip()
        # Strip bold/italic
        stripped = re.sub(r"\*{1,3}([^*]+)\*{1,3}", r"\1", stripped)

        if stripped:
            lines.append(stripped)

    return "\n".join(lines)
|
||
|
|
|
||
|
|
|
||
|
|
def group_pages_by_pdf(pages: list[dict]) -> dict[str, list[dict]]:
    """Bucket OmniDocBench pages by the PDF they were rendered from.

    The document name is the file stem of ``page_info.image_path`` with a
    trailing ``_p<digits>`` page suffix removed, e.g.
    ``"academic_literature/scihub_12345_p0.jpg"`` -> ``"scihub_12345"`` and
    ``"PPT2PDF/PPT_sample.png"`` -> ``"PPT_sample"``. Pages inside each
    bucket are ordered by ``page_info.page_no`` (0 when absent).
    """

    def page_number(entry: dict):
        return entry.get("page_info", {}).get("page_no", 0)

    by_document: dict[str, list[dict]] = defaultdict(list)

    for entry in pages:
        image_path = entry.get("page_info", {}).get("image_path", "")
        stem, _extension = os.path.splitext(os.path.basename(image_path))
        # Drop the per-page suffix (_p0, _p1, ...) to recover the PDF name.
        by_document[re.sub(r"_p\d+$", "", stem)].append(entry)

    for document_pages in by_document.values():
        document_pages.sort(key=page_number)

    return by_document
|
||
|
|
|
||
|
|
|
||
|
|
def find_pdf_for_document(pdf_name: str, pages: list[dict], ori_pdfs_dir: Path) -> Path | None:
    """Find the original PDF file for a document group.

    Lookup order:
      1. ``<ori_pdfs_dir>/<pdf_name>.pdf`` (or ``.PDF``).
      2. Any PDF below ``ori_pdfs_dir`` whose stem equals ``pdf_name``.
      3. A PDF in the subdirectory named after the first component of the
         first page's ``image_path`` whose stem is a prefix of ``pdf_name``
         or vice versa.

    Steps 2 and 3 accept the ``.pdf`` extension in any letter case and scan
    candidates in sorted order so the result is deterministic.

    Args:
        pdf_name: Document name as produced by ``group_pages_by_pdf``.
        pages: Pages of the document; only the first one is inspected.
        ori_pdfs_dir: Directory holding the original PDFs.

    Returns:
        Path to the matching PDF, or ``None`` when nothing matches, the
        directory does not exist, or ``pdf_name`` is empty.
    """
    # An empty name would prefix-match every file in step 3 and pair the
    # document with an arbitrary PDF.
    if not pdf_name or not ori_pdfs_dir.exists():
        return None

    def is_pdf(path: Path) -> bool:
        return path.suffix.lower() == ".pdf" and path.is_file()

    # Try direct name match
    for ext in (".pdf", ".PDF"):
        candidate = ori_pdfs_dir / f"{pdf_name}{ext}"
        if candidate.exists():
            return candidate

    # Try searching in subdirectories
    for pdf_file in sorted(ori_pdfs_dir.rglob("*")):
        if pdf_file.stem == pdf_name and is_pdf(pdf_file):
            return pdf_file

    # Try matching from image path
    if pages:
        image_path = (pages[0].get("page_info") or {}).get("image_path") or ""
        parts = image_path.split("/")
        if len(parts) >= 2:
            subdir_path = ori_pdfs_dir / parts[0]
            if subdir_path.is_dir():
                for pdf_file in sorted(subdir_path.iterdir()):
                    if not is_pdf(pdf_file):
                        continue
                    stem = pdf_file.stem
                    if pdf_name.startswith(stem) or stem.startswith(pdf_name):
                        return pdf_file

    return None
|
||
|
|
|
||
|
|
|
||
|
|
def main() -> None:
    """Command-line entry point: import OmniDocBench into fixture files.

    Reads ``<omnidocbench_dir>/OmniDocBench.json``, groups its pages by
    source PDF and, for every document that has a PDF under
    ``<omnidocbench_dir>/ori_pdfs`` and non-empty content, writes a fixture
    JSON plus markdown and plain-text ground truth files below
    ``<repo_root>``. Documents whose fixture file already exists are left
    untouched. Progress and a summary are printed to stderr.

    Exits with status 1 when arguments are missing or the JSON file is not
    found.
    """
    if len(sys.argv) < 3:
        print(
            "Usage: import_omnidocbench.py <omnidocbench_dir> <repo_root>",
            file=sys.stderr,
        )
        sys.exit(1)

    omnidoc_dir = Path(sys.argv[1]).resolve()
    repo_root = Path(sys.argv[2]).resolve()

    json_path = omnidoc_dir / "OmniDocBench.json"
    ori_pdfs_dir = omnidoc_dir / "ori_pdfs"

    if not json_path.exists():
        print(f"ERROR: {json_path} not found. Run download_omnidocbench.sh first.", file=sys.stderr)
        sys.exit(1)

    fixtures_dir = repo_root / "tools" / "benchmark-harness" / "fixtures" / "pdf"
    gt_dir = repo_root / "test_documents" / "ground_truth" / "pdf"
    fixtures_dir.mkdir(parents=True, exist_ok=True)
    gt_dir.mkdir(parents=True, exist_ok=True)

    print(f"Loading {json_path}...", file=sys.stderr)
    # Read as UTF-8 explicitly: the platform default encoding is not
    # guaranteed to be UTF-8 and the annotations may hold non-ASCII text.
    with json_path.open(encoding="utf-8") as f:
        pages = json.load(f)
    print(f"Loaded {len(pages)} pages", file=sys.stderr)

    # Group pages by document
    doc_groups = group_pages_by_pdf(pages)
    print(f"Found {len(doc_groups)} documents", file=sys.stderr)

    created = 0
    skipped_no_pdf = 0
    skipped_exists = 0
    skipped_empty = 0

    for pdf_name, doc_pages in sorted(doc_groups.items()):
        # Generate fixture name; sanitize by replacing every character that
        # is not alphanumeric, "_" or "-".
        fixture_name = re.sub(r"[^a-zA-Z0-9_-]", "_", f"omnidoc_{pdf_name}")

        fixture_path = fixtures_dir / f"{fixture_name}.json"
        gt_md_path = gt_dir / f"{fixture_name}.md"
        gt_txt_path = gt_dir / f"{fixture_name}.txt"

        # Skip if already imported
        if fixture_path.exists():
            skipped_exists += 1
            continue

        # Find the PDF
        pdf_path = find_pdf_for_document(pdf_name, doc_pages, ori_pdfs_dir)
        if pdf_path is None:
            skipped_no_pdf += 1
            continue

        # Generate markdown from all pages, dropping pages without content
        page_markdowns = []
        for page in doc_pages:
            md = page_to_markdown(page)
            if md.strip():
                page_markdowns.append(md)

        if not page_markdowns:
            skipped_empty += 1
            continue

        full_markdown = "\n\n".join(page_markdowns)
        full_text = strip_markdown_to_text(full_markdown)

        # Write ground truth files (UTF-8 regardless of the platform locale)
        gt_md_path.write_text(full_markdown, encoding="utf-8")
        gt_txt_path.write_text(full_text, encoding="utf-8")

        # Compute relative paths from fixture to document and ground truth
        doc_rel = os.path.relpath(pdf_path, fixtures_dir)
        gt_md_rel = os.path.relpath(gt_md_path, fixtures_dir)
        gt_txt_rel = os.path.relpath(gt_txt_path, fixtures_dir)

        # Page metadata for the fixture comes from the document's first page
        first_page = doc_pages[0].get("page_info", {})
        page_attr = first_page.get("page_attribute", {})

        file_size = pdf_path.stat().st_size  # stat once, used twice below

        fixture = {
            "document": doc_rel,
            "file_type": "pdf",
            "file_size": file_size,
            "expected_frameworks": ["kreuzberg"],
            "metadata": {
                "description": f"OmniDocBench: {page_attr.get('data_source', 'unknown')}",
                "source": "omnidocbench",
                "size_category": "small" if file_size < 500_000 else "medium",
                "language": page_attr.get("language", "unknown"),
                "layout": page_attr.get("layout", "unknown"),
                "data_source": page_attr.get("data_source", "unknown"),
                "page_count": len(doc_pages),
            },
            "ground_truth": {
                "text_file": gt_txt_rel,
                "markdown_file": gt_md_rel,
                "source": "omnidocbench",
            },
        }

        # json.dumps escapes non-ASCII by default, so the payload is ASCII;
        # the explicit encoding just keeps every write in this script uniform.
        fixture_path.write_text(json.dumps(fixture, indent=2) + "\n", encoding="utf-8")
        created += 1

        if created % 50 == 0:
            print(f" {created} fixtures created...", file=sys.stderr)

    print("\nDone:", file=sys.stderr)
    print(f" Created: {created}", file=sys.stderr)
    print(f" Skipped (already exists): {skipped_exists}", file=sys.stderr)
    print(f" Skipped (no PDF found): {skipped_no_pdf}", file=sys.stderr)
    print(f" Skipped (empty content): {skipped_empty}", file=sys.stderr)
    print(f" Fixtures: {fixtures_dir}", file=sys.stderr)
    print(f" Ground truth: {gt_dir}", file=sys.stderr)


if __name__ == "__main__":
    main()
|