Nomad changes
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s

This commit is contained in:
Henrik Jess Nielsen
2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions

View File

@@ -0,0 +1,394 @@
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.ocr.TesseractOCRConfig;
import org.apache.tika.sax.BodyContentHandler;
/**
 * Command-line wrapper that runs Apache Tika text extraction for the benchmark harness.
 *
 * <p>Three modes are supported, selected by the first positional argument:
 * <ul>
 *   <li>{@code sync} - extract one file and print a single JSON object;</li>
 *   <li>{@code batch} - extract several files and print a JSON array;</li>
 *   <li>{@code server} - print {@code READY}, then answer one JSON line per request line
 *       read from standard input.</li>
 * </ul>
 * The flags {@code --ocr} and {@code --no-ocr} may appear anywhere; the last one wins and
 * OCR is off by default. Setting the environment variable {@code TIKA_BENCHMARK_DEBUG=true}
 * enables diagnostic logging on standard error.
 */
public final class TikaExtract {
    private static final double NANOS_IN_MILLISECOND = 1_000_000.0;
    /** Length of the JSON key {@code "path"} including surrounding quotes. */
    private static final int PATH_KEY_LENGTH = 6;
    private static final char LAST_CONTROL_CHAR = 0x1F;
    /** Number of hexadecimal digits that follow the backslash-u marker of a JSON escape. */
    private static final int UNICODE_ESCAPE_DIGITS = 4;
    /** Radix used to decode the digits of a JSON unicode escape. */
    private static final int HEX_RADIX = 16;

    private TikaExtract() {
    }

    /**
     * Parses the command line and dispatches to the selected mode.
     *
     * <p>Exits with status 1 on usage errors or when processing throws.
     *
     * @param args optional {@code --ocr}/{@code --no-ocr} flags, the mode, then file paths
     */
    public static void main(String[] args) {
        boolean ocrEnabled = false;
        List<String> positionalArgs = new ArrayList<>();
        for (String arg : args) {
            if ("--ocr".equals(arg)) {
                ocrEnabled = true;
            } else if ("--no-ocr".equals(arg)) {
                ocrEnabled = false;
            } else {
                positionalArgs.add(arg);
            }
        }
        if (positionalArgs.isEmpty()) {
            System.err.println("Usage: TikaExtract [--ocr|--no-ocr] <mode> <file1> [file2] ...");
            System.err.println("Modes: sync, batch, server");
            System.exit(1);
        }
        String mode = positionalArgs.get(0);
        if (!"sync".equals(mode) && !"batch".equals(mode) && !"server".equals(mode)) {
            System.err.printf("Unsupported mode '%s'%n", mode);
            System.exit(1);
        }
        // Enable debug logging if TIKA_BENCHMARK_DEBUG is set
        boolean debug = "true".equalsIgnoreCase(System.getenv("TIKA_BENCHMARK_DEBUG"));
        if (debug) {
            debugLog("java.version", System.getProperty("java.version"));
            debugLog("os.name", System.getProperty("os.name"));
            debugLog("os.arch", System.getProperty("os.arch"));
            debugLog("Mode", mode);
            debugLog("OCR enabled", String.valueOf(ocrEnabled));
            debugLog("Files to process", String.valueOf(positionalArgs.size() - 1));
        }
        try {
            if ("sync".equals(mode)) {
                // Only the first file is used; any additional positional arguments are ignored.
                if (positionalArgs.size() < 2) {
                    System.err.println("Sync mode requires exactly one file");
                    System.exit(1);
                }
                processSyncMode(positionalArgs.get(1), ocrEnabled, debug);
            } else if ("batch".equals(mode)) {
                processBatchMode(positionalArgs, ocrEnabled, debug);
            } else {
                processServerMode(ocrEnabled, debug);
            }
        } catch (Exception e) {
            if (debug) {
                debugLog("Processing failed with exception", e.getClass().getName());
            }
            e.printStackTrace(System.err);
            System.exit(1);
        }
    }

    /**
     * Extracts a single file and prints its JSON result to standard output (no trailing newline).
     *
     * @param filePath path of the file to extract
     * @param ocrEnabled whether OCR was requested
     * @param debug whether to emit diagnostic logging on standard error
     * @throws Exception if the file is missing or Tika fails; the caller reports it
     */
    private static void processSyncMode(String filePath, boolean ocrEnabled, boolean debug) throws Exception {
        if (debug) {
            debugLog("Input file", filePath);
        }
        Path path = Path.of(filePath);
        ExtractionData data;
        long start = System.nanoTime();
        try {
            if (debug) {
                debugLog("Starting extraction", "");
            }
            data = extractFile(path.toFile(), ocrEnabled, debug);
            if (debug) {
                debugLog("Extraction completed", "");
            }
        } catch (Exception e) {
            // The stack trace is printed once by main(); only tag the failure here.
            if (debug) {
                debugLog("Extraction failed", e.getClass().getName());
            }
            throw e;
        }
        double elapsedMs = elapsedMillisSince(start);
        System.out.print(toJson(data, elapsedMs, ocrEnabled));
    }

    /**
     * Extracts every file after the mode argument and prints one JSON array with the successes.
     *
     * <p>Files that fail are reported on standard error and omitted from the array. If no file
     * succeeds the process exits with status 1 and prints nothing on standard output.
     *
     * @param positionalArgs the mode at index 0 followed by the file paths
     * @param ocrEnabled whether OCR was requested
     * @param debug whether to emit diagnostic logging on standard error
     */
    private static void processBatchMode(
            List<String> positionalArgs, boolean ocrEnabled, boolean debug) throws Exception {
        List<String> filePaths = positionalArgs.subList(1, positionalArgs.size());
        long batchStart = System.nanoTime();
        StringBuilder jsonArray = new StringBuilder();
        jsonArray.append('[');
        boolean first = true;
        for (String filePath : filePaths) {
            if (debug) {
                debugLog("Processing file", filePath);
            }
            try {
                Path path = Path.of(filePath);
                long start = System.nanoTime();
                ExtractionData data = extractFile(path.toFile(), ocrEnabled, debug);
                double elapsedMs = elapsedMillisSince(start);
                if (!first) {
                    jsonArray.append(',');
                }
                first = false;
                // Cumulative time since the batch started, measured after this file finished.
                double batchTotalMs = elapsedMillisSince(batchStart);
                jsonArray.append(toJsonWithBatch(data, elapsedMs, batchTotalMs, ocrEnabled));
                if (debug) {
                    debugLog("File processed", filePath);
                }
            } catch (Exception e) {
                if (debug) {
                    debugLog("Failed to process file", filePath);
                    debugLog("Exception", e.getClass().getName());
                    e.printStackTrace(System.err);
                } else {
                    System.err.printf("Error processing %s: %s%n", filePath, e.getMessage());
                }
            }
        }
        jsonArray.append(']');
        if (first) {
            System.err.println("No files were successfully processed");
            System.exit(1);
            return;
        }
        System.out.print(jsonArray.toString());
    }

    /**
     * Runs the persistent request loop: prints {@code READY}, then answers each non-empty line
     * read from standard input with exactly one JSON line on standard output.
     *
     * <p>A request is either a bare file path or a JSON object carrying a {@code "path"} field.
     * Failures are reported as a JSON object with an {@code "error"} field; the loop keeps running.
     *
     * @param ocrEnabled whether OCR was requested
     * @param debug whether to emit diagnostic logging on standard error
     */
    private static void processServerMode(boolean ocrEnabled, boolean debug) throws Exception {
        // Pre-create shared parser and OCR config to avoid per-file construction overhead.
        // Only the content handler, metadata and parse context are recreated per extraction.
        AutoDetectParser sharedParser = new AutoDetectParser();
        TesseractOCRConfig sharedOcrConfig = createOcrConfig(ocrEnabled);
        // Signal readiness after JVM + Tika parser initialization
        System.out.println("READY");
        System.out.flush();
        // Decode requests as UTF-8 explicitly instead of relying on the platform default charset.
        BufferedReader reader =
                new BufferedReader(new InputStreamReader(System.in, StandardCharsets.UTF_8));
        String line;
        while ((line = reader.readLine()) != null) {
            String filePath = line.trim();
            if (filePath.isEmpty()) {
                continue;
            }
            // Parse JSON request if the harness sends {"path":"...", "force_ocr": ...}
            if (filePath.startsWith("{")) {
                filePath = parseJsonPath(filePath);
            }
            try {
                Path path = Path.of(filePath);
                long start = System.nanoTime();
                ExtractionData data = extractFileWithParser(path.toFile(), sharedParser, sharedOcrConfig, debug);
                double elapsedMs = elapsedMillisSince(start);
                System.out.println(toJson(data, elapsedMs, ocrEnabled));
                System.out.flush();
            } catch (Exception e) {
                // Some exceptions carry no message; never emit "error":null, which a client
                // could mistake for success.
                String message = e.getMessage();
                if (message == null) {
                    message = e.getClass().getName();
                }
                String errorJson = String.format(
                        "{\"error\":%s,\"_extraction_time_ms\":0,\"_ocr_used\":false}",
                        quote(message));
                System.out.println(errorJson);
                System.out.flush();
            }
        }
    }

    /** Returns the milliseconds elapsed since {@code startNanos}, a {@link System#nanoTime()} reading. */
    private static double elapsedMillisSince(long startNanos) {
        return (System.nanoTime() - startNanos) / NANOS_IN_MILLISECOND;
    }

    /** Builds the Tesseract configuration: English OCR when enabled, OCR skipped otherwise. */
    private static TesseractOCRConfig createOcrConfig(boolean ocrEnabled) {
        TesseractOCRConfig ocrConfig = new TesseractOCRConfig();
        if (ocrEnabled) {
            ocrConfig.setLanguage("eng");
        } else {
            ocrConfig.setSkipOcr(true);
        }
        return ocrConfig;
    }

    /** Throws {@link IllegalArgumentException} when {@code file} does not exist. */
    private static void requireExists(File file) {
        if (!file.exists()) {
            throw new IllegalArgumentException("File does not exist: " + file.getAbsolutePath());
        }
    }

    /**
     * Extracts text and MIME type from {@code file} using a caller-supplied parser and OCR config.
     *
     * @param file the file to parse
     * @param parser parser to use (may be shared across calls)
     * @param ocrConfig OCR configuration placed into the parse context
     * @param debug currently unused; kept for signature compatibility
     * @return extracted content and detected MIME type ({@code application/octet-stream} if unknown)
     * @throws IllegalArgumentException if the file does not exist
     * @throws Exception if reading or parsing fails
     */
    private static ExtractionData extractFileWithParser(
            File file, AutoDetectParser parser, TesseractOCRConfig ocrConfig, boolean debug) throws Exception {
        requireExists(file);
        // -1 removes the handler's default write limit so large documents are not truncated.
        BodyContentHandler handler = new BodyContentHandler(-1);
        Metadata metadata = new Metadata();
        ParseContext context = new ParseContext();
        context.set(TesseractOCRConfig.class, ocrConfig);
        try (InputStream stream = new FileInputStream(file)) {
            parser.parse(stream, handler, metadata, context);
        }
        String mimeType = metadata.get(Metadata.CONTENT_TYPE);
        if (mimeType == null) {
            mimeType = "application/octet-stream";
        }
        return new ExtractionData(handler.toString(), mimeType);
    }

    /**
     * Extracts text and MIME type from {@code file} with a freshly constructed parser.
     *
     * @param file the file to parse
     * @param ocrEnabled whether OCR was requested
     * @param debug currently unused; kept for signature compatibility
     * @return extracted content and detected MIME type
     * @throws IllegalArgumentException if the file does not exist
     * @throws Exception if reading or parsing fails
     */
    private static ExtractionData extractFile(File file, boolean ocrEnabled, boolean debug) throws Exception {
        // Fail fast before paying for parser construction.
        requireExists(file);
        return extractFileWithParser(file, new AutoDetectParser(), createOcrConfig(ocrEnabled), debug);
    }

    /**
     * Determine if OCR was actually used based on MIME type and OCR config.
     * This is a heuristic: it reports true when OCR is enabled and the MIME type starts
     * with {@code image/}.
     */
    private static boolean determineOcrUsed(String mimeType, boolean ocrEnabled) {
        if (!ocrEnabled) {
            return false;
        }
        return mimeType != null && mimeType.startsWith("image/");
    }

    /** Serializes a single-file result as a JSON object. */
    private static String toJson(ExtractionData data, double elapsedMs, boolean ocrEnabled) {
        return buildJson(data, elapsedMs, null, ocrEnabled);
    }

    /** Serializes a batch result as a JSON object that also carries {@code _batch_total_ms}. */
    private static String toJsonWithBatch(
            ExtractionData data, double elapsedMs, double batchTotalMs, boolean ocrEnabled) {
        return buildJson(data, elapsedMs, batchTotalMs, ocrEnabled);
    }

    /**
     * Shared JSON writer for both result shapes.
     *
     * @param batchTotalMs cumulative batch time, or {@code null} to omit the field
     */
    private static String buildJson(
            ExtractionData data, double elapsedMs, Double batchTotalMs, boolean ocrEnabled) {
        StringBuilder builder = new StringBuilder();
        builder.append('{');
        builder.append("\"content\":").append(quote(data.getContent())).append(',');
        builder.append("\"metadata\":{");
        builder.append("\"mimeType\":").append(quote(data.getMimeType()));
        builder.append("},\"_extraction_time_ms\":").append(formatMillis(elapsedMs));
        if (batchTotalMs != null) {
            builder.append(",\"_batch_total_ms\":").append(formatMillis(batchTotalMs));
        }
        builder.append(",\"_ocr_used\":").append(determineOcrUsed(data.getMimeType(), ocrEnabled));
        builder.append('}');
        return builder.toString();
    }

    /**
     * Formats a duration with three decimals using {@link Locale#ROOT}, so the decimal separator
     * is always a dot. The default locale may use a comma, which would make the JSON invalid.
     */
    private static String formatMillis(double millis) {
        return String.format(Locale.ROOT, "%.3f", millis);
    }

    /**
     * Parse a JSON request line to extract the "path" field.
     * Minimal JSON parsing to avoid adding a dependency. The string value is decoded according
     * to the JSON escape rules (escaped quotes and backslashes, control escapes and four-digit
     * unicode escapes). When no well-formed path value can be found, the input is returned
     * unchanged.
     */
    private static String parseJsonPath(String json) {
        int idx = json.indexOf("\"path\"");
        if (idx < 0) {
            return json;
        }
        // Skip past "path" key, colon, optional whitespace, and opening quote
        idx = json.indexOf(':', idx + PATH_KEY_LENGTH);
        if (idx < 0) {
            return json;
        }
        idx = json.indexOf('"', idx + 1);
        if (idx < 0) {
            return json;
        }
        StringBuilder path = new StringBuilder();
        int pos = idx + 1;
        while (pos < json.length()) {
            char c = json.charAt(pos);
            if (c == '"') {
                // Unescaped quote terminates the string value.
                return path.toString();
            }
            if (c != '\\') {
                path.append(c);
                pos++;
                continue;
            }
            if (pos + 1 >= json.length()) {
                return json;
            }
            char escaped = json.charAt(pos + 1);
            pos += 2;
            switch (escaped) {
                case '"': path.append('"'); break;
                case '\\': path.append('\\'); break;
                case '/': path.append('/'); break;
                case 'n': path.append('\n'); break;
                case 'r': path.append('\r'); break;
                case 't': path.append('\t'); break;
                case 'b': path.append('\b'); break;
                case 'f': path.append('\f'); break;
                case 'u':
                    int codeUnit = parseHexCodeUnit(json, pos);
                    if (codeUnit < 0) {
                        return json;
                    }
                    path.append((char) codeUnit);
                    pos += UNICODE_ESCAPE_DIGITS;
                    break;
                default:
                    // Not a JSON escape: keep both characters rather than dropping data.
                    path.append('\\').append(escaped);
            }
        }
        // Ran off the end without a closing quote.
        return json;
    }

    /**
     * Decodes the four hex digits of a JSON unicode escape starting at {@code offset}.
     *
     * @return the UTF-16 code unit, or -1 if fewer than four hex digits are available
     */
    private static int parseHexCodeUnit(String json, int offset) {
        if (offset + UNICODE_ESCAPE_DIGITS > json.length()) {
            return -1;
        }
        int codeUnit = 0;
        for (int i = 0; i < UNICODE_ESCAPE_DIGITS; i++) {
            int digit = Character.digit(json.charAt(offset + i), HEX_RADIX);
            if (digit < 0) {
                return -1;
            }
            codeUnit = codeUnit * HEX_RADIX + digit;
        }
        return codeUnit;
    }

    // CPD-OFF: quote() is intentionally duplicated in standalone benchmark scripts (no shared classpath)
    /**
     * Renders {@code value} as a JSON string literal, or the JSON literal {@code null} when the
     * value is {@code null}. Quotes, backslashes and control characters are escaped.
     */
    private static String quote(String value) {
        if (value == null) {
            return "null";
        }
        StringBuilder sb = new StringBuilder(value.length() + 2);
        sb.append('"');
        for (int i = 0; i < value.length(); i++) {
            char c = value.charAt(i);
            switch (c) {
                case '\\': sb.append("\\\\"); break;
                case '"': sb.append("\\\""); break;
                case '\n': sb.append("\\n"); break;
                case '\r': sb.append("\\r"); break;
                case '\t': sb.append("\\t"); break;
                case '\b': sb.append("\\b"); break;
                case '\f': sb.append("\\f"); break;
                default:
                    if (c <= LAST_CONTROL_CHAR) {
                        sb.append(String.format(Locale.ROOT, "\\u%04x", (int) c));
                    } else {
                        sb.append(c);
                    }
            }
        }
        sb.append('"');
        return sb.toString();
    }
    // CPD-ON

    /** Writes one aligned {@code key = value} diagnostic line to standard error. */
    private static void debugLog(String key, String value) {
        if (value == null) {
            value = "(null)";
        }
        System.err.printf("[BENCHMARK_DEBUG] %-30s = %s%n", key, value);
    }

    /** Immutable pair of extracted text and detected MIME type. */
    private static class ExtractionData {
        private final String content;
        private final String mimeType;

        ExtractionData(String content, String mimeType) {
            this.content = content;
            this.mimeType = mimeType;
        }

        String getContent() {
            return content;
        }

        String getMimeType() {
            return mimeType;
        }
    }
}

View File

@@ -0,0 +1,277 @@
"""Docling extraction wrapper for benchmark harness.
Supports three modes:
- sync: convert() - synchronous single-file extraction
- batch: convert_all() - batch extraction for multiple files
- server: persistent mode reading paths from stdin
"""
from __future__ import annotations
import json
import multiprocessing as _mp
import os
import platform
import resource
import sys
import time
from typing import Any
from docling.document_converter import DocumentConverter
def _get_peak_memory_bytes() -> int:
"""Get peak memory usage in bytes using resource module."""
usage = resource.getrusage(resource.RUSAGE_SELF)
if platform.system() == "Linux":
return usage.ru_maxrss * 1024
return usage.ru_maxrss
def create_converter(ocr_enabled: bool) -> DocumentConverter:
    """Create a DocumentConverter, disabling OCR when ``ocr_enabled`` is False.

    Args:
        ocr_enabled: When True the converter is built with Docling's defaults.
            When False, OCR is switched off through the pipeline options.

    Returns:
        A ready-to-use ``DocumentConverter``.

    Disabling OCR is attempted first through the per-format options API
    (``format_options`` + ``PdfPipelineOptions``) and then through the older
    ``pipeline_options`` keyword. If neither API is available the default
    converter is returned and a warning is written to stderr, because in that
    case the no-OCR request cannot be honoured.
    """
    if ocr_enabled:
        return DocumentConverter()

    # Current Docling API: OCR is a PDF pipeline option passed per input format.
    # NOTE(review): only the PDF format is configured here; confirm whether image
    # inputs should receive the same options.
    try:
        from docling.datamodel.base_models import InputFormat
        from docling.datamodel.pipeline_options import PdfPipelineOptions
        from docling.document_converter import PdfFormatOption

        pdf_options = PdfPipelineOptions()
        pdf_options.do_ocr = False
        return DocumentConverter(
            format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=pdf_options)}
        )
    except (ImportError, TypeError):
        # Older Docling release without the per-format API: try the legacy keyword below.
        pass

    try:
        from docling.datamodel.pipeline_options import PipelineOptions

        options = PipelineOptions(do_ocr=False)
        return DocumentConverter(pipeline_options=options)
    except (ImportError, TypeError):
        # Neither API is available; make the degraded configuration visible instead of
        # silently benchmarking with OCR enabled.
        print(
            "Warning: could not disable OCR with this Docling version; using default converter",
            file=sys.stderr,
        )
        return DocumentConverter()
def _render(document: Any, output_format: str) -> str:
if output_format == "plaintext":
return document.export_to_text()
return document.export_to_markdown()
def extract_sync(file_path: str, converter: DocumentConverter, output_format: str = "markdown") -> dict[str, Any]:
    """Convert one file with ``converter.convert`` and package the result.

    The reported ``_extraction_time_ms`` covers both the conversion and the
    rendering of the document into ``output_format``.

    Returns:
        A dict with ``content``, ``metadata``, ``_extraction_time_ms`` and
        ``_peak_memory_bytes`` keys.
    """
    started_at = time.perf_counter()
    conversion = converter.convert(file_path)
    rendered = _render(conversion.document, output_format)
    elapsed_ms = 1000.0 * (time.perf_counter() - started_at)

    payload: dict[str, Any] = {"content": rendered}
    payload["metadata"] = {"framework": "docling", "output_format": output_format}
    payload["_extraction_time_ms"] = elapsed_ms
    payload["_peak_memory_bytes"] = _get_peak_memory_bytes()
    return payload
def extract_batch(
    file_paths: list[str], converter: DocumentConverter, output_format: str = "markdown"
) -> list[dict[str, Any]]:
    """Extract multiple files using the batch API (``convert_all``).

    Args:
        file_paths: Paths of the documents to convert.
        converter: The Docling converter to use.
        output_format: "plaintext" or "markdown" (see ``_render``).

    Returns:
        One dict per conversion result. Failed conversions carry empty
        ``content`` plus ``error`` and ``status`` entries in ``metadata``.
        ``_extraction_time_ms`` is the batch duration divided evenly by the
        number of input files; ``_batch_total_ms`` is the whole batch duration.
    """
    start = time.perf_counter()
    # convert_all() yields results lazily, so the conversions only run while the
    # iterator is consumed. Materialise it before stopping the timer; otherwise the
    # measured duration excludes the actual work.
    results = list(converter.convert_all(file_paths, raises_on_error=False))
    total_duration_ms = (time.perf_counter() - start) * 1000.0
    per_file_duration_ms = total_duration_ms / len(file_paths) if file_paths else 0

    outputs = []
    for result in results:
        if result.status.name == "SUCCESS":
            content = _render(result.document, output_format)
            metadata = {"framework": "docling", "output_format": output_format}
        else:
            content = ""
            metadata = {
                "framework": "docling",
                "error": str(result.errors) if result.errors else "Unknown error",
                "status": result.status.name,
            }
        outputs.append(
            {
                "content": content,
                "metadata": metadata,
                "_extraction_time_ms": per_file_duration_ms,
                "_batch_total_ms": total_duration_ms,
                "_peak_memory_bytes": _get_peak_memory_bytes(),
            }
        )
    return outputs
def _worker(fn, args, conn):
"""Run extraction in a forked child process.
Closes inherited stdin/stdout so the child cannot corrupt the
parent's line-based JSON protocol.
"""
try:
sys.stdin.close()
sys.stdout = open(os.devnull, "w")
except Exception:
pass
try:
result = fn(*args)
conn.send(result)
except Exception as e:
conn.send({"error": str(e), "_extraction_time_ms": 0})
finally:
conn.close()
def _run_with_timeout(fn, args, timeout):
    """Execute fn(*args) in a forked child with a timeout.

    On timeout the child is killed but the parent stays alive —
    no expensive process restart is needed.

    Args:
        fn: Callable to run in the child process.
        args: Positional arguments passed to ``fn``.
        timeout: Seconds to wait for the child to deliver a result.

    Returns:
        Whatever the child sent back (the result of ``fn`` or its error dict),
        or an error dict when the child timed out or its result could not be
        received. If setting up or supervising the child raises, ``fn`` is run
        directly in this process instead (without a timeout).
    """
    try:
        ctx = _mp.get_context("fork")
        # One-way pipe: the child writes its result, the parent only reads.
        parent_conn, child_conn = ctx.Pipe(duplex=False)
        p = ctx.Process(target=_worker, args=(fn, args, child_conn))
        p.start()
        # The parent does not use the write end; close its copy.
        child_conn.close()
        if parent_conn.poll(timeout=timeout):
            try:
                result = parent_conn.recv()
            except Exception:
                # poll() reported readiness but nothing could be received.
                result = {"error": "worker process crashed", "_extraction_time_ms": 0}
        else:
            # Nothing arrived within the timeout: terminate the child.
            p.kill()
            result = {
                "error": f"extraction timed out after {timeout}s",
                "_extraction_time_ms": timeout * 1000.0,
            }
        # Reap the child; force-kill it if it has not exited after 5 seconds.
        p.join(timeout=5)
        if p.is_alive():
            p.kill()
            p.join()
        parent_conn.close()
        return result
    except Exception:
        # Fork not available — fall back to in-process extraction
        try:
            return fn(*args)
        except Exception as e:
            return {"error": str(e), "_extraction_time_ms": 0}
def _parse_path(line: str) -> str:
"""Parse a request line: JSON object with path field, or plain file path."""
stripped = line.strip()
if stripped.startswith("{"):
try:
return json.loads(stripped).get("path", "")
except (json.JSONDecodeError, ValueError):
pass
return stripped
def run_server(converter: DocumentConverter, output_format: str, timeout=None) -> None:
    """Serve extraction requests over stdin/stdout until stdin is exhausted.

    Prints ``READY`` once, then answers every request line that yields a
    non-empty path with a single JSON line. With a ``timeout`` the extraction
    runs through ``_run_with_timeout``; without one it runs in-process and any
    exception is reported as an error object.
    """
    print("READY", flush=True)
    for raw_line in sys.stdin:
        requested_path = _parse_path(raw_line)
        if requested_path:
            if timeout is None:
                try:
                    response = extract_sync(requested_path, converter, output_format)
                except Exception as exc:
                    response = {"error": str(exc), "_extraction_time_ms": 0}
            else:
                response = _run_with_timeout(
                    extract_sync, (requested_path, converter, output_format), timeout
                )
            print(json.dumps(response), flush=True)
def main() -> None:
    """Parse the command line and run the requested mode.

    Flags (may appear anywhere): ``--ocr`` / ``--no-ocr`` (last one wins, OCR
    is off by default), ``--timeout=SECS`` (server mode only) and
    ``--format=VALUE`` or ``--format VALUE`` with VALUE in
    {"markdown", "plaintext"}. The first positional argument is the mode
    (``sync``, ``batch`` or ``server``); the rest are file paths.

    Exit codes: 64 for an invalid flag value, 1 for any other usage or
    extraction error. All arguments are validated before the converter is
    built, because constructing it is expensive.
    """
    ocr_enabled = False
    timeout = None
    output_format = "markdown"
    args: list[str] = []

    argv = sys.argv[1:]
    i = 0
    while i < len(argv):
        arg = argv[i]
        i += 1
        if arg == "--ocr":
            ocr_enabled = True
        elif arg == "--no-ocr":
            ocr_enabled = False
        elif arg.startswith("--timeout="):
            raw_timeout = arg.split("=", 1)[1]
            try:
                timeout = int(raw_timeout)
            except ValueError:
                print(
                    f"Error: --timeout must be an integer number of seconds; got '{raw_timeout}'",
                    file=sys.stderr,
                )
                sys.exit(64)
        elif arg.startswith("--format="):
            output_format = arg.split("=", 1)[1]
        elif arg == "--format":
            # Support `--format <value>` (space-separated): consume the next argument.
            if i >= len(argv):
                print("Error: --format requires a value ('markdown' or 'plaintext')", file=sys.stderr)
                sys.exit(64)
            output_format = argv[i]
            i += 1
        else:
            args.append(arg)

    if output_format not in ("markdown", "plaintext"):
        print(f"Error: --format must be 'markdown' or 'plaintext'; got '{output_format}'", file=sys.stderr)
        sys.exit(64)
    if len(args) < 1:
        print(
            "Usage: docling_extract.py [--ocr|--no-ocr] [--timeout=SECS] [--format markdown|plaintext] <mode> <file_path> [additional_files...]",
            file=sys.stderr,
        )
        print("Modes: sync, batch, server", file=sys.stderr)
        sys.exit(1)

    mode = args[0]
    file_paths = args[1:]

    # Reject bad invocations before paying for converter initialization.
    if mode not in ("sync", "batch", "server"):
        print(f"Error: Unknown mode '{mode}'. Use sync, batch, or server", file=sys.stderr)
        sys.exit(1)
    if mode == "sync" and len(file_paths) != 1:
        print("Error: sync mode requires exactly one file", file=sys.stderr)
        sys.exit(1)
    if mode == "batch" and len(file_paths) < 1:
        print("Error: batch mode requires at least one file", file=sys.stderr)
        sys.exit(1)

    # Create converter once (expensive initialization)
    converter = create_converter(ocr_enabled)
    try:
        if mode == "server":
            run_server(converter, output_format, timeout=timeout)
        elif mode == "sync":
            payload = extract_sync(file_paths[0], converter, output_format)
            print(json.dumps(payload), end="")
        else:
            results = extract_batch(file_paths, converter, output_format)
            # A single-file batch is emitted as one object rather than a one-element list.
            if len(file_paths) == 1:
                print(json.dumps(results[0]), end="")
            else:
                print(json.dumps(results), end="")
    except Exception as e:
        print(f"Error extracting with Docling: {e}", file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,63 @@
#!/usr/bin/env bash
# Download the OmniDocBench dataset (opendatalab/OmniDocBench) from HuggingFace.
#
# Usage:
# ./download_omnidocbench.sh [TARGET_DIR]
#
# Default target: tools/benchmark-harness/datasets/omnidocbench
#
# Requirements: curl, plus either huggingface-cli or git-lfs for the images.
# No HuggingFace account or API key needed (public dataset).
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
DEFAULT_DIR="${SCRIPT_DIR}/../datasets/omnidocbench"
TARGET_DIR="${1:-$DEFAULT_DIR}"
HF_BASE="https://huggingface.co/datasets/opendatalab/OmniDocBench/resolve/main"
HF_REPO_URL="https://huggingface.co/datasets/opendatalab/OmniDocBench"

mkdir -p "$TARGET_DIR"
# Resolve to an absolute path so a relative TARGET_DIR keeps pointing at the same place
# no matter what the working directory is later on.
TARGET_DIR="$(cd "$TARGET_DIR" && pwd)"

# Print the number of regular files directly inside directory $1.
count_files() {
  find "$1" -maxdepth 1 -type f 2>/dev/null | wc -l | tr -d ' '
}

# Download the main annotation file (65 MB)
if [ -f "$TARGET_DIR/OmniDocBench.json" ]; then
  echo "OmniDocBench.json already exists, skipping"
else
  echo "Downloading OmniDocBench.json (65 MB)..."
  # --fail: do not save an HTTP error page as the dataset. Download to a temporary name so an
  # interrupted transfer is not mistaken for a complete file on the next run.
  curl -fL -o "$TARGET_DIR/OmniDocBench.json.part" "$HF_BASE/OmniDocBench.json"
  mv "$TARGET_DIR/OmniDocBench.json.part" "$TARGET_DIR/OmniDocBench.json"
fi

# Download images directory via HF CLI if available, otherwise use git-lfs clone
if [ -d "$TARGET_DIR/images" ] && [ "$(count_files "$TARGET_DIR/images")" -gt 100 ]; then
  echo "images/ directory already populated ($(count_files "$TARGET_DIR/images") files), skipping"
else
  if command -v huggingface-cli &>/dev/null; then
    echo "Downloading full dataset via huggingface-cli..."
    huggingface-cli download opendatalab/OmniDocBench \
      --repo-type dataset \
      --local-dir "$TARGET_DIR" \
      --include "images/*" "ori_pdfs/*" "OmniDocBench.json"
  elif command -v git-lfs &>/dev/null || git lfs version &>/dev/null; then
    echo "Downloading via git-lfs clone..."
    TEMP_CLONE="$(mktemp -d)"
    # Remove the temporary clone even when a later step fails.
    trap 'rm -rf "$TEMP_CLONE"' EXIT
    git clone --depth 1 "$HF_REPO_URL" "$TEMP_CLONE"
    # -C keeps the script's working directory unchanged.
    git -C "$TEMP_CLONE" lfs pull
    for subdir in images ori_pdfs; do
      if [ -d "$TEMP_CLONE/$subdir" ]; then
        cp -r "$TEMP_CLONE/$subdir" "$TARGET_DIR/"
      else
        echo "WARNING: $subdir/ not found in the cloned dataset" >&2
      fi
    done
    rm -rf "$TEMP_CLONE"
    trap - EXIT
  else
    echo "ERROR: Need either huggingface-cli or git-lfs to download images." >&2
    echo "" >&2
    echo "Install one of:" >&2
    echo " pip install huggingface-hub # then: huggingface-cli" >&2
    echo " brew install git-lfs # then: git lfs install" >&2
    exit 1
  fi
fi

# Summary
echo ""
echo "OmniDocBench downloaded to: $TARGET_DIR"
echo " Annotations: $(wc -c <"$TARGET_DIR/OmniDocBench.json" | tr -d ' ') bytes"
# Use if-statements here: a trailing `[ -d ... ] && echo` would make the script exit with
# status 1 whenever the directory is absent.
if [ -d "$TARGET_DIR/images" ]; then
  echo " Images: $(count_files "$TARGET_DIR/images") files"
fi
if [ -d "$TARGET_DIR/ori_pdfs" ]; then
  echo " PDFs: $(count_files "$TARGET_DIR/ori_pdfs") files"
fi

View File

@@ -0,0 +1,789 @@
#!/usr/bin/env -S uv run --script
# /// script
# requires-python = ">=3.10"
# dependencies = [
# "beautifulsoup4>=4.12",
# "python-docx>=1.0",
# "python-pptx>=1.0",
# "openpyxl>=3.1",
# "nbformat>=5.9",
# "xlrd>=2.0",
# "extract-msg>=0.48",
# "lxml>=5.0",
# "odfpy>=1.4",
# ]
# ///
"""Generate ground truth text files for benchmark fixtures.
Walks all fixture JSONs, extracts text from source documents using independent
tools (not benchmarked frameworks), writes ground truth .txt files, patches
fixture JSONs with ground_truth field, and updates ground_truth_mapping.json.
PDF Ground Truth Methodology (updated Feb 2026):
PDF ground truth was regenerated using AI visual extraction (Claude Haiku
reading each PDF page as an image) for scanned/complex PDFs, and pdftotext
for born-digital PDFs with reliable embedded text. The previous approach of
using pdftotext for all PDFs produced incorrect ground truth for scanned
documents since pdftotext cannot read image-based text.
The handle_pdftotext() function below is retained for regenerating GT from
born-digital PDFs. For scanned PDFs, GT files were manually curated via AI
extraction and should not be overwritten by running this script with --force.
Usage:
uv run tools/benchmark-harness/scripts/generate_ground_truth.py [OPTIONS]
Options:
--dry-run Print planned actions without writing
--format-filter Comma-separated file types to process (e.g., md,txt,pdf)
--force Regenerate even if ground truth already exists
--skip-types Comma-separated file types to skip
"""
from __future__ import annotations
import argparse
import email
import json
import os
import subprocess
import sys
import xml.etree.ElementTree as ET
from pathlib import Path
# ---------------------------------------------------------------------------
# File type → handler mapping
# ---------------------------------------------------------------------------
# Text formats whose file content is itself the ground truth.
RAW_SOURCE_TYPES = frozenset(
    ["md", "txt", "rst", "org", "commonmark", "djot", "toml", "yaml", "json", "tsv", "bib", "csv", "svg"]
)
PDFTOTEXT_TYPES = frozenset(["pdf"])
PANDOC_TYPES = frozenset(["tex", "latex", "typ", "epub", "fb2", "docbook", "odt", "rtf", "opml"])
PYTHON_DOCX_TYPES = frozenset(["docx"])
PYTHON_PPTX_TYPES = frozenset(["pptx", "pptm", "ppsx"])
OPENPYXL_TYPES = frozenset(["xlsx", "xlsm"])
ODS_TYPES = frozenset(["ods"])
BEAUTIFULSOUP_TYPES = frozenset(["html"])
PYTHON_EMAIL_TYPES = frozenset(["eml"])
EXTRACT_MSG_TYPES = frozenset(["msg"])
NBFORMAT_TYPES = frozenset(["ipynb"])
XML_PARSE_TYPES = frozenset(["xml"])
XLRD_TYPES = frozenset(["xls"])
ANTIWORD_TYPES = frozenset(["doc"])
LIBREOFFICE_TYPES = frozenset(["ppt"])
DBF_TYPES = frozenset(["dbf"])
HWP_TYPES = frozenset(["hwp"])

# Archive and image types are excluded from ground truth generation
EXCLUDED_TYPES = frozenset(
    ["7z", "gz", "tar", "tgz", "zip", "lz4"]
    + ["gif", "jpeg", "jpg", "jp2", "png", "tiff", "webp", "bmp"]
    + ["pbm", "pgm", "pnm", "ppm"]
)

# Union of every file type that has a ground-truth source assigned.
ALL_HANDLED_TYPES = frozenset().union(
    RAW_SOURCE_TYPES,
    PDFTOTEXT_TYPES,
    PANDOC_TYPES,
    PYTHON_DOCX_TYPES,
    PYTHON_PPTX_TYPES,
    OPENPYXL_TYPES,
    BEAUTIFULSOUP_TYPES,
    PYTHON_EMAIL_TYPES,
    EXTRACT_MSG_TYPES,
    NBFORMAT_TYPES,
    XML_PARSE_TYPES,
    XLRD_TYPES,
    ANTIWORD_TYPES,
    LIBREOFFICE_TYPES,
    ODS_TYPES,
    DBF_TYPES,
    HWP_TYPES,
)


def get_source_type(file_type: str) -> str:
    """Map a file type to the name of the tool that produces its ground truth.

    File types that belong to none of the known groups (and the DBF/HWP
    groups) map to "manual".
    """
    labelled_groups = (
        (RAW_SOURCE_TYPES, "raw_source"),
        (PDFTOTEXT_TYPES, "pdftotext"),
        (PANDOC_TYPES, "pandoc"),
        (PYTHON_DOCX_TYPES, "python-docx"),
        (PYTHON_PPTX_TYPES, "python-pptx"),
        (OPENPYXL_TYPES, "openpyxl"),
        (BEAUTIFULSOUP_TYPES, "beautifulsoup"),
        (PYTHON_EMAIL_TYPES, "python_email"),
        (EXTRACT_MSG_TYPES, "extract_msg"),
        (NBFORMAT_TYPES, "nbformat"),
        (XML_PARSE_TYPES, "xml_parse"),
        (XLRD_TYPES, "xlrd"),
        (ANTIWORD_TYPES, "antiword"),
        (LIBREOFFICE_TYPES, "libreoffice"),
        (ODS_TYPES, "odfpy"),
        (DBF_TYPES, "manual"),
        (HWP_TYPES, "manual"),
    )
    for members, label in labelled_groups:
        if file_type in members:
            return label
    return "manual"
# ---------------------------------------------------------------------------
# Text extraction handlers
# ---------------------------------------------------------------------------
def handle_raw_source(doc_path: Path) -> str:
    """Return the file's text unchanged: for text-based formats the source IS the ground truth.

    The file is decoded as UTF-8; if that fails it is re-read as Latin-1.
    """
    try:
        with doc_path.open(encoding="utf-8") as handle:
            return handle.read()
    except UnicodeDecodeError:
        with doc_path.open(encoding="latin-1") as handle:
            return handle.read()
def handle_pdftotext(doc_path: Path) -> str:
    """Extract text from PDF using pdftotext (poppler-utils).

    Note: This works well for born-digital PDFs with embedded text layers.
    For scanned PDFs, pdftotext produces garbage output. Scanned PDF ground
    truth should be generated via AI visual extraction instead.

    Raises:
        RuntimeError: if pdftotext exits with a non-zero status.
        FileNotFoundError: if the pdftotext executable is not installed.
        subprocess.TimeoutExpired: if extraction takes longer than 60 seconds.
    """
    result = subprocess.run(
        ["pdftotext", "-layout", str(doc_path), "-"],
        capture_output=True,
        # pdftotext writes UTF-8 unless -enc says otherwise; decode explicitly so the
        # result does not depend on the locale of the machine running this script.
        encoding="utf-8",
        errors="replace",
        timeout=60,
    )
    if result.returncode != 0:
        raise RuntimeError(f"pdftotext failed: {result.stderr}")
    return result.stdout
def handle_pandoc(doc_path: Path, file_type: str) -> str:
    """Convert document to plain text using pandoc.

    Args:
        doc_path: Document to convert.
        file_type: File type used to pick pandoc's input format; when it is not in
            the map below, pandoc is left to choose the format itself.

    Raises:
        RuntimeError: if pandoc exits with a non-zero status.
        FileNotFoundError: if the pandoc executable is not installed.
        subprocess.TimeoutExpired: if conversion takes longer than 120 seconds.
    """
    # Map file types to pandoc input formats
    pandoc_format_map = {
        "tex": "latex",
        "latex": "latex",
        "typ": "typst",
        "epub": "epub",
        "fb2": "fb2",
        "docbook": "docbook",
        "odt": "odt",
        "rtf": "rtf",
        "opml": "opml",
        "doc": "doc",
        "ppt": "ppt",
    }
    input_format = pandoc_format_map.get(file_type)

    cmd = ["pandoc"]
    if input_format:
        cmd += ["-f", input_format]
    cmd += ["-t", "plain", "--wrap=none", str(doc_path)]

    # pandoc emits UTF-8; decode explicitly instead of using the locale encoding.
    result = subprocess.run(cmd, capture_output=True, encoding="utf-8", errors="replace", timeout=120)
    if result.returncode != 0:
        raise RuntimeError(f"pandoc failed: {result.stderr}")
    return result.stdout
def handle_python_docx(doc_path: Path) -> str:
    """Extract text from DOCX using python-docx.

    All body paragraphs come first, followed by one tab-separated line per
    table row.
    """
    import docx

    document = docx.Document(str(doc_path))
    lines = [paragraph.text for paragraph in document.paragraphs]
    # Also extract table text
    lines.extend(
        "\t".join(cell.text for cell in row.cells)
        for table in document.tables
        for row in table.rows
    )
    return "\n".join(lines)
def handle_python_pptx(doc_path: Path) -> str:
    """Extract text from PPTX/PPTM/PPSX using python-pptx.

    Collects the stripped, non-empty text of every paragraph in every shape
    that has a text frame, one paragraph per line, in slide order.
    """
    from pptx import Presentation

    deck = Presentation(str(doc_path))
    stripped_paragraphs = (
        paragraph.text.strip()
        for slide in deck.slides
        for shape in slide.shapes
        if shape.has_text_frame
        for paragraph in shape.text_frame.paragraphs
    )
    return "\n".join(text for text in stripped_paragraphs if text)
def handle_openpyxl(doc_path: Path) -> str:
    """Extract text from XLSX/XLSM using openpyxl.

    Each row that contains at least one non-empty cell becomes one
    tab-separated line; empty cells are rendered as empty strings. Sheets are
    emitted in workbook order. ``data_only=True`` makes openpyxl return cached
    formula results rather than the formulas themselves.
    """
    import openpyxl

    wb = openpyxl.load_workbook(str(doc_path), read_only=True, data_only=True)
    try:
        lines = []
        for sheet_name in wb.sheetnames:
            ws = wb[sheet_name]
            for row in ws.iter_rows(values_only=True):
                cells = [str(c) if c is not None else "" for c in row]
                if any(cells):
                    lines.append("\t".join(cells))
    finally:
        # Release the workbook even when reading a sheet raises.
        wb.close()
    return "\n".join(lines)
def handle_beautifulsoup(doc_path: Path) -> str:
    """Extract the visible text of an HTML file using BeautifulSoup.

    The file is decoded as UTF-8 with a Latin-1 fallback. ``<script>`` and
    ``<style>`` elements are removed before the text is collected, one
    stripped text fragment per line.
    """
    from bs4 import BeautifulSoup

    try:
        with doc_path.open(encoding="utf-8") as handle:
            markup = handle.read()
    except UnicodeDecodeError:
        with doc_path.open(encoding="latin-1") as handle:
            markup = handle.read()

    soup = BeautifulSoup(markup, "html.parser")
    # Remove script and style elements
    for unwanted in soup.find_all(["script", "style"]):
        unwanted.decompose()
    return soup.get_text(separator="\n", strip=True)
def handle_python_email(doc_path: Path) -> str:
    """Extract text from EML using Python email stdlib.

    The output starts with the From/To/Subject/Date headers that are present
    (followed by a blank line), then the decoded ``text/plain`` parts of a
    multipart message, or the decoded payload of a single-part message.
    RFC 2047 encoded words in headers (``=?charset?q?...?=``) are decoded to
    readable text; headers without encoded words are emitted unchanged.
    """
    from email.errors import HeaderParseError
    from email.header import decode_header, make_header

    def _readable_header(value) -> str:
        text = str(value)
        if "=?" not in text:
            return text
        try:
            return str(make_header(decode_header(text)))
        except (HeaderParseError, LookupError, UnicodeError, ValueError):
            # Malformed encoded word or unknown charset: keep the raw header text.
            return text

    def _decode_payload(part) -> str | None:
        payload = part.get_payload(decode=True)
        if not payload:
            return None
        charset = part.get_content_charset() or "utf-8"
        try:
            return payload.decode(charset, errors="replace")
        except (LookupError, UnicodeDecodeError):
            # Unknown or unusable declared charset: fall back to UTF-8.
            return payload.decode("utf-8", errors="replace")

    raw = doc_path.read_bytes()
    try:
        msg = email.message_from_bytes(raw)
    except Exception:
        # Best effort: retry through the string parser with undecodable bytes replaced.
        msg = email.message_from_string(raw.decode("utf-8", errors="replace"))

    parts = []
    # Add headers
    for header in ("From", "To", "Subject", "Date"):
        val = msg.get(header)
        if val:
            parts.append(f"{header}: {_readable_header(val)}")
    if parts:
        parts.append("")  # blank line after headers

    # Extract body
    if msg.is_multipart():
        body_parts = [part for part in msg.walk() if part.get_content_type() == "text/plain"]
    else:
        body_parts = [msg]
    for part in body_parts:
        text = _decode_payload(part)
        if text is not None:
            parts.append(text)
    return "\n".join(parts)
def handle_extract_msg(doc_path: Path) -> str:
    """Extract text from MSG using extract-msg.

    Emits the Subject/From/To/Date lines that are present, a blank line, and
    then the message body.
    """
    import extract_msg

    msg = extract_msg.openMsg(str(doc_path))
    try:
        parts = []
        if msg.subject:
            parts.append(f"Subject: {msg.subject}")
        if msg.sender:
            parts.append(f"From: {msg.sender}")
        if msg.to:
            parts.append(f"To: {msg.to}")
        if msg.date:
            parts.append(f"Date: {msg.date}")
        if parts:
            parts.append("")
        if msg.body:
            parts.append(msg.body)
    finally:
        # Close the message even when reading one of its properties raises.
        msg.close()
    return "\n".join(parts)
def handle_nbformat(doc_path: Path) -> str:
    """Extract text from Jupyter notebooks using nbformat.

    Returns the stripped, non-empty sources of all code, markdown and raw
    cells, separated by blank lines.
    """
    import nbformat

    notebook = nbformat.read(str(doc_path), as_version=4)
    wanted_cell_types = ("code", "markdown", "raw")
    sources = (cell.source.strip() for cell in notebook.cells if cell.cell_type in wanted_cell_types)
    return "\n\n".join(source for source in sources if source)
def handle_xml_parse(doc_path: Path) -> str:
    """Extract text content from XML using xml.etree.

    Returns every non-blank text fragment of the document, stripped, one per
    line, in document order. If the file is not well-formed XML its raw text
    is returned instead.
    """
    try:
        tree = ET.parse(str(doc_path))
    except ET.ParseError:
        # Fallback: read as raw text
        return handle_raw_source(doc_path)
    root = tree.getroot()
    # itertext() walks text and tail fragments in document order. Emitting an element's
    # tail right after its own text would place it before the text of its children.
    fragments = (fragment.strip() for fragment in root.itertext())
    return "\n".join(fragment for fragment in fragments if fragment)
def handle_xlrd(doc_path: Path) -> str:
    """Extract text from XLS using xlrd.

    Every row with at least one non-empty cell becomes one tab-separated
    line; cell values are converted with ``str``. Sheets are emitted in index
    order.
    """
    import xlrd

    workbook = xlrd.open_workbook(str(doc_path))
    rendered_rows = []
    for sheet in (workbook.sheet_by_index(index) for index in range(workbook.nsheets)):
        for row_number in range(sheet.nrows):
            values = [str(sheet.cell_value(row_number, column)) for column in range(sheet.ncols)]
            if any(values):
                rendered_rows.append("\t".join(values))
    return "\n".join(rendered_rows)
def handle_antiword(doc_path: Path) -> str:
    """Extract text from DOC using antiword, then catdoc, then textutil.

    The tools are tried in that order and the stdout of the first one that
    exits with status 0 is returned. A tool that is not installed, exits
    non-zero, or exceeds the 60 second timeout is skipped so the remaining
    fallbacks still get a chance.

    Args:
        doc_path: Path of the ``.doc`` file.

    Returns:
        The text printed by the first tool that succeeds.

    Raises:
        RuntimeError: If none of the tools produced output successfully.
    """
    commands = (
        ["antiword", str(doc_path)],
        ["catdoc", str(doc_path)],
        # textutil only exists on macOS
        ["textutil", "-convert", "txt", "-stdout", str(doc_path)],
    )
    for command in commands:
        try:
            result = subprocess.run(
                command,
                capture_output=True,
                text=True,
                timeout=60,
            )
        except (FileNotFoundError, subprocess.TimeoutExpired):
            # Tool missing or hung: move on to the next candidate.
            continue
        if result.returncode == 0:
            return result.stdout
    raise RuntimeError("No DOC extraction tool available (need antiword, catdoc, or textutil)")
def handle_ods(doc_path: Path) -> str:
    """Extract text from an ODS spreadsheet using odfpy.

    Every table row becomes one tab-separated line; rows in which no cell
    has non-whitespace text are omitted. A cell's text is its paragraphs
    joined by single spaces. A non-empty cell carrying a
    ``numbercolumnsrepeated`` attribute greater than 1 is emitted that many
    times, capped at 100; an empty repeated cell is emitted once.
    """
    from odf import text as odf_text
    from odf.opendocument import load as odf_load
    from odf.table import Table, TableCell, TableRow

    def paragraph_text(paragraph) -> str:
        # Nodes exposing ``data`` contribute it; anything else is stringified.
        return "".join(
            child.data if hasattr(child, "data") else str(child)
            for child in paragraph.childNodes
        )

    document = odf_load(str(doc_path))
    output_rows = []
    for sheet in document.spreadsheet.getElementsByType(Table):
        for table_row in sheet.getElementsByType(TableRow):
            row_values = []
            for table_cell in table_row.getElementsByType(TableCell):
                paragraphs = [
                    paragraph_text(p) for p in table_cell.getElementsByType(odf_text.P)
                ]
                repeat_attr = table_cell.getAttribute("numbercolumnsrepeated")
                value = " ".join(paragraphs)
                if repeat_attr and int(repeat_attr) > 1 and value:
                    copies = min(int(repeat_attr), 100)
                    row_values.extend([value] * copies)
                else:
                    row_values.append(value)
            if any(v.strip() for v in row_values):
                output_rows.append("\t".join(row_values))
    return "\n".join(output_rows)
def handle_libreoffice(doc_path: Path) -> str:
    """Extract text from PPT using the LibreOffice CLI, with a textutil fallback.

    LibreOffice converts the document to plain text inside a temporary
    directory and the first ``*.txt`` file it produced (in sorted order) is
    returned. If LibreOffice is missing, times out, fails, or produces no
    text file, macOS ``textutil`` is tried instead.

    Args:
        doc_path: Path of the presentation to convert.

    Returns:
        The extracted text.

    Raises:
        RuntimeError: If neither tool produced output successfully.
    """
    import tempfile

    try:
        with tempfile.TemporaryDirectory() as tmpdir:
            result = subprocess.run(
                ["libreoffice", "--headless", "--convert-to", "txt:Text", "--outdir", tmpdir, str(doc_path)],
                capture_output=True,
                text=True,
                timeout=120,
            )
            if result.returncode == 0:
                # Sorted so the pick is deterministic if several files appear.
                txt_files = sorted(Path(tmpdir).glob("*.txt"))
                if txt_files:
                    return txt_files[0].read_text(encoding="utf-8", errors="replace")
    except (FileNotFoundError, subprocess.TimeoutExpired):
        # LibreOffice missing or hung: fall through to textutil.
        pass

    # Fallback: try textutil (macOS)
    try:
        result = subprocess.run(
            ["textutil", "-convert", "txt", "-stdout", str(doc_path)],
            capture_output=True,
            text=True,
            timeout=60,
        )
        if result.returncode == 0:
            return result.stdout
    except (FileNotFoundError, subprocess.TimeoutExpired):
        pass
    raise RuntimeError("No PPT extraction tool available (need libreoffice or textutil)")
def extract_text(doc_path: Path, file_type: str) -> str:
    """Route a document to the extraction handler registered for its type.

    The type sets are consulted in a fixed order and the first set that
    contains ``file_type`` decides the handler.

    Raises:
        ValueError: If no type set contains ``file_type``.
    """
    # Order matters: the first matching set wins.
    routes = (
        (RAW_SOURCE_TYPES, handle_raw_source),
        (PDFTOTEXT_TYPES, handle_pdftotext),
        # pandoc is the only handler that also needs the file type
        (PANDOC_TYPES, lambda path: handle_pandoc(path, file_type)),
        (PYTHON_DOCX_TYPES, handle_python_docx),
        (PYTHON_PPTX_TYPES, handle_python_pptx),
        (OPENPYXL_TYPES, handle_openpyxl),
        (BEAUTIFULSOUP_TYPES, handle_beautifulsoup),
        (PYTHON_EMAIL_TYPES, handle_python_email),
        (EXTRACT_MSG_TYPES, handle_extract_msg),
        (NBFORMAT_TYPES, handle_nbformat),
        (XML_PARSE_TYPES, handle_xml_parse),
        (XLRD_TYPES, handle_xlrd),
        (ANTIWORD_TYPES, handle_antiword),
        (LIBREOFFICE_TYPES, handle_libreoffice),
        (ODS_TYPES, handle_ods),
    )
    for supported_types, handler in routes:
        if file_type in supported_types:
            return handler(doc_path)
    raise ValueError(f"No handler for file type: {file_type}")
# ---------------------------------------------------------------------------
# Core logic
# ---------------------------------------------------------------------------
def get_repo_root() -> Path:
    """Locate the repository root by walking up from this script's directory.

    A directory qualifies when it contains both ``Cargo.toml`` and
    ``test_documents``. The filesystem root itself is never tested.

    Raises:
        RuntimeError: If no ancestor directory qualifies.
    """
    script_dir = Path(__file__).resolve().parent
    # Drop the last entry (the filesystem root), which is never inspected.
    for candidate in (script_dir, *script_dir.parents)[:-1]:
        has_manifest = (candidate / "Cargo.toml").exists()
        if has_manifest and (candidate / "test_documents").exists():
            return candidate
    raise RuntimeError("Could not find repository root")
def collect_fixtures(fixtures_dir: Path) -> list[Path]:
    """Return every ``*.json`` file below ``fixtures_dir``, in sorted path order."""
    json_files = list(fixtures_dir.rglob("*.json"))
    json_files.sort()
    return json_files
def load_mapping(repo_root: Path) -> dict[str, str]:
    """Load the existing ground truth mapping.

    Reads ``test_documents/ground_truth/ground_truth_mapping.json`` below
    ``repo_root``. The file is decoded as UTF-8 explicitly so the result
    does not depend on the platform's default encoding.

    Returns:
        The parsed mapping, or an empty dict when the file does not exist.
    """
    mapping_file = repo_root / "test_documents" / "ground_truth" / "ground_truth_mapping.json"
    if not mapping_file.exists():
        return {}
    with open(mapping_file, encoding="utf-8") as f:
        return json.load(f)
def save_mapping(repo_root: Path, mapping: dict[str, str]) -> None:
    """Write the ground truth mapping back to disk with keys in sorted order.

    The target is ``test_documents/ground_truth/ground_truth_mapping.json``
    below ``repo_root``; the JSON is indented by two spaces and ends with a
    newline.
    """
    target = repo_root / "test_documents" / "ground_truth" / "ground_truth_mapping.json"
    ordered = {key: mapping[key] for key in sorted(mapping)}
    with open(target, "w") as out:
        out.write(json.dumps(ordered, indent=2))
        out.write("\n")
def make_mapping_key(fixture_path: Path, fixtures_dir: Path) -> str:
    """Build the mapping key for a fixture from its location under ``fixtures_dir``.

    A fixture directly inside ``fixtures_dir`` is keyed by its stem
    (e.g. ``commonmark_sample``). A fixture inside a subdirectory is keyed
    by ``<first subdirectory>/<stem>`` (e.g. ``md/duck.md`` for
    ``md/duck.md.json``); deeper directory levels are not part of the key.
    """
    stem = fixture_path.stem
    segments = fixture_path.relative_to(fixtures_dir).parts
    if len(segments) <= 1:
        return stem
    return f"{segments[0]}/{stem}"
def process_fixture(
    fixture_path: Path,
    repo_root: Path,
    fixtures_dir: Path,
    mapping: dict[str, str],
    dry_run: bool,
    force: bool,
    stats: dict[str, int],
) -> None:
    """Generate ground truth for one fixture and record it.

    The fixture is skipped, with the matching ``stats`` counter bumped, when
    its type is excluded or unhandled, when it already has ground truth and
    ``force`` is false, or when its document is missing. Otherwise the text
    is extracted, written to ``test_documents/ground_truth/<type>/<stem>.txt``,
    the fixture JSON gets a ``ground_truth`` entry, and ``mapping`` is
    updated in place. With ``dry_run`` the plan is only printed.
    """
    fixture = json.loads(fixture_path.read_text())
    file_type = fixture.get("file_type", "")

    # --- reasons to skip, checked in a fixed order ---
    if file_type in EXCLUDED_TYPES:
        stats["skipped_excluded"] += 1
        return
    if file_type not in ALL_HANDLED_TYPES:
        print(f"  SKIP (unhandled type): {fixture_path.name} ({file_type})")
        stats["skipped_unhandled"] += 1
        return
    if not force and fixture.get("ground_truth"):
        stats["skipped_existing"] += 1
        return

    document_ref = fixture.get("document", "")
    if not document_ref:
        print(f"  SKIP (no document): {fixture_path.name}")
        stats["skipped_no_doc"] += 1
        return
    # The document reference is relative to the fixture's own directory.
    doc_path = (fixture_path.parent / document_ref).resolve()
    if not doc_path.exists():
        print(f"  SKIP (doc not found): {fixture_path.name} -> {doc_path}")
        stats["skipped_missing_doc"] += 1
        return

    # --- where the ground truth goes and how it is referenced ---
    gt_dir = repo_root / "test_documents" / "ground_truth" / file_type
    gt_path = gt_dir / (fixture_path.stem + ".txt")
    gt_rel = os.path.relpath(gt_path, fixture_path.parent)
    mapping_key = make_mapping_key(fixture_path, fixtures_dir)

    if dry_run:
        print(f"  [DRY RUN] {fixture_path.name} ({file_type})")
        print(f"    doc: {doc_path}")
        print(f"    gt:  {gt_path}")
        print(f"    key: {mapping_key}")
        stats["would_generate"] += 1
        return

    try:
        text = extract_text(doc_path, file_type)
    except Exception as e:
        # One failing document must not stop the whole run.
        print(f"  ERROR extracting {fixture_path.name}: {e}")
        stats["errors"] += 1
        return

    gt_dir.mkdir(parents=True, exist_ok=True)
    gt_path.write_text(text, encoding="utf-8")

    fixture["ground_truth"] = {
        "text_file": gt_rel,
        "source": get_source_type(file_type),
    }
    with open(fixture_path, "w") as out:
        json.dump(fixture, out, indent=2)
        out.write("\n")

    mapping[mapping_key] = str(gt_path.relative_to(repo_root))
    stats["generated"] += 1
def main() -> int:
    """Command-line entry point: generate ground truth for benchmark fixtures.

    Returns:
        1 if any fixture could not be read or extracted, otherwise 0.
    """
    cli = argparse.ArgumentParser(description="Generate ground truth for benchmark fixtures")
    cli.add_argument("--dry-run", action="store_true", help="Print planned actions without writing")
    cli.add_argument("--format-filter", type=str, default="", help="Comma-separated file types to process")
    cli.add_argument("--force", action="store_true", help="Regenerate even if ground truth exists")
    cli.add_argument("--skip-types", type=str, default="", help="Comma-separated file types to skip")
    args = cli.parse_args()

    repo_root = get_repo_root()
    fixtures_dir = repo_root / "tools" / "benchmark-harness" / "fixtures"
    print(f"Repository root: {repo_root}")
    print(f"Fixtures dir: {fixtures_dir}")
    if args.dry_run:
        print("DRY RUN MODE - no files will be written\n")

    # None means "no filter"; an empty skip set skips nothing.
    format_filter = set(args.format_filter.split(",")) if args.format_filter else None
    skip_types = set(args.skip_types.split(",")) if args.skip_types else set()

    mapping = load_mapping(repo_root)
    mapping_size_before = len(mapping)

    fixture_paths = collect_fixtures(fixtures_dir)
    print(f"Found {len(fixture_paths)} fixture files\n")

    stats: dict[str, int] = dict.fromkeys(
        (
            "generated",
            "would_generate",
            "skipped_existing",
            "skipped_excluded",
            "skipped_unhandled",
            "skipped_no_doc",
            "skipped_missing_doc",
            "errors",
        ),
        0,
    )

    for fixture_path in fixture_paths:
        # Peek at the file type first so the CLI filters can apply.
        try:
            with open(fixture_path) as handle:
                peek = json.load(handle)
        except (json.JSONDecodeError, OSError) as e:
            print(f"  ERROR reading {fixture_path.name}: {e}")
            stats["errors"] += 1
            continue

        file_type = peek.get("file_type", "")
        filtered_out = bool(format_filter) and file_type not in format_filter
        if filtered_out or file_type in skip_types:
            continue
        process_fixture(fixture_path, repo_root, fixtures_dir, mapping, args.dry_run, args.force, stats)

    # The mapping is only rewritten when something was actually generated.
    if stats["generated"] > 0 and not args.dry_run:
        save_mapping(repo_root, mapping)
        new_entries = len(mapping) - mapping_size_before
        print(f"\nUpdated ground_truth_mapping.json: {new_entries} new entries (total: {len(mapping)})")

    print(f"\n{'=' * 50}")
    print("Summary:")
    print(f"  Generated:           {stats['generated']}")
    if args.dry_run:
        print(f"  Would generate:      {stats['would_generate']}")
    print(f"  Skipped (existing):  {stats['skipped_existing']}")
    print(f"  Skipped (excluded):  {stats['skipped_excluded']}")
    print(f"  Skipped (unhandled): {stats['skipped_unhandled']}")
    print(f"  Skipped (no doc):    {stats['skipped_no_doc']}")
    print(f"  Skipped (missing):   {stats['skipped_missing_doc']}")
    print(f"  Errors:              {stats['errors']}")

    if stats["errors"] > 0:
        return 1
    return 0
if __name__ == "__main__":
sys.exit(main())

View File

@@ -0,0 +1,93 @@
#!/usr/bin/env bash
# Generate markdown ground truth for formats requiring LibreOffice conversion.
# Workflow: soffice → intermediate format → pandoc -t gfm → sanitize
#
# Prerequisites:
# - soffice (LibreOffice) on PATH
# - pandoc on PATH
# - python3 on PATH
#
# Usage: bash tools/benchmark-harness/scripts/generate_libreoffice_gt.sh
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "$SCRIPT_DIR/../../.." && pwd)"
SANITIZE="$SCRIPT_DIR/sanitize_pandoc_gt.py"

# Use a fresh scratch directory for every run. With a fixed /tmp/gt_convert,
# an intermediate file left by an earlier run would make a failed conversion
# look successful and yield ground truth from stale input.
TMP_DIR="$(mktemp -d "${TMPDIR:-/tmp}/gt_convert.XXXXXX")"
trap 'rm -rf "$TMP_DIR"' EXIT

# convert_group LABEL SRC_EXT MID_EXT FILE...
#
# For each FILE (extension SRC_EXT): convert it to MID_EXT with LibreOffice,
# render that with pandoc as GFM, sanitize it, and write the result to
# test_documents/ground_truth/LABEL/<name>.md.
convert_group() {
  local label="$1" src_ext="$2" mid_ext="$3"
  shift 3

  local gt_root="$REPO_ROOT/test_documents/ground_truth/$label"
  mkdir -p "$gt_root"

  local f name gt_md converted size
  for f in "$@"; do
    if [ ! -f "$f" ]; then
      echo " SKIP (not found): $f"
      continue
    fi
    name=$(basename "$f" ".$src_ext")
    gt_md="$gt_root/${name}.md"
    converted="$TMP_DIR/${name}.${mid_ext}"

    # A failing soffice must not abort the script under `set -e`; the missing
    # output file is reported as a failed conversion just below.
    soffice --headless --convert-to "$mid_ext" --outdir "$TMP_DIR" "$f" 2>/dev/null || true

    if [ -f "$converted" ]; then
      pandoc -f "$mid_ext" -t gfm --wrap=none "$converted" 2>/dev/null |
        python3 "$SANITIZE" >"$gt_md"
      # `wc -c` pads its output on some platforms; strip the padding.
      size=$(wc -c <"$gt_md" | tr -d ' ')
      echo " $label: $name: $size bytes ($gt_md)"
    else
      echo " $label: $name FAILED conversion"
    fi
  done
}

# --- DOC → DOCX → GFM ---
echo "=== DOC ground truth generation ==="
doc_files=(
  "$REPO_ROOT/test_documents/vendored/unstructured/doc/simple.doc"
  "$REPO_ROOT/test_documents/vendored/unstructured/doc/fake.doc"
  "$REPO_ROOT/test_documents/vendored/unstructured/doc/duplicate-paragraphs.doc"
  "$REPO_ROOT/test_documents/vendored/unstructured/doc/fake-doc-emphasized-text.doc"
  "$REPO_ROOT/test_documents/doc/unit_test_lists.doc"
)
convert_group doc doc docx "${doc_files[@]}"

# --- PPT → PPTX → GFM ---
echo ""
echo "=== PPT ground truth generation ==="
ppt_files=(
  "$REPO_ROOT/test_documents/ppt/simple.ppt"
)
convert_group ppt ppt pptx "${ppt_files[@]}"

# --- ODS: no pandoc support for spreadsheet input ---
echo ""
echo "=== ODS: skipped (pandoc cannot read spreadsheet formats) ==="
echo " Existing text GT in test_documents/ground_truth/ods/ is sufficient."
echo ""
echo "Done. Validate with:"
echo " cargo run --release -p benchmark-harness -- validate-gt --fixtures tools/benchmark-harness/fixtures/doc/"
echo " cargo run --release -p benchmark-harness -- validate-gt --fixtures tools/benchmark-harness/fixtures/"

View File

@@ -0,0 +1,249 @@
#!/usr/bin/env -S uv run --no-project --script
# /// script
# requires-python = ">=3.10"
# dependencies = ["google-genai>=1.0"]
# ///
"""Generate proper markdown ground truth from PDF documents using Gemini.
Reads benchmark fixture JSON files to locate PDFs, sends each to a Gemini model
(default: gemini-2.0-flash, configurable with --model) via Vertex AI, and saves
the extracted markdown to the ground truth directory.
Usage:
uv run tools/benchmark-harness/scripts/generate_markdown_gt.py [OPTIONS]
Examples:
# Generate for all nougat + pdfa documents
uv run tools/benchmark-harness/scripts/generate_markdown_gt.py
# Generate for a specific document
uv run tools/benchmark-harness/scripts/generate_markdown_gt.py --filter nougat_001
# Dry run to see what would be processed
uv run tools/benchmark-harness/scripts/generate_markdown_gt.py --dry-run
# Force regeneration of existing files
uv run tools/benchmark-harness/scripts/generate_markdown_gt.py --force
"""
from __future__ import annotations
import argparse
import json
import signal
import sys
import time
from pathlib import Path
from google import genai
from google.genai.types import GenerateContentConfig, Part
# Instruction text sent to the model together with the PDF bytes (it is the
# second entry of ``contents`` in generate_markdown). Its wording directly
# shapes the generated ground truth.
EXTRACTION_PROMPT = """\
Extract the complete text content of this PDF document as clean Markdown.
Rules:
- Use proper heading hierarchy (# for document title, ## for major sections, ### for subsections)
- Render tables using markdown table syntax with | delimiters and --- separator row
- Use numbered lists (1. 2. 3.) and bullet lists (- item) where the document uses them
- Preserve emphasis: **bold** and *italic* where the original uses them
- Use ``` code blocks for code snippets, formulas, or monospace content
- Use <!-- image --> as a placeholder where figures or images appear
- Omit page numbers, running headers/footers, and watermarks
- Preserve the document's reading order
- Do NOT invent or hallucinate content — only extract what is actually in the document
- Do NOT wrap the output in a markdown code fence — return raw markdown directly
- For multi-column layouts, read left column first, then right column
- For forms with label-value pairs, use **Label:** Value format
"""
def get_repo_root() -> Path:
    """Return the nearest ancestor of this script holding ``Cargo.toml`` and ``test_documents``.

    The search starts at the script's own directory and never inspects the
    filesystem root.

    Raises:
        RuntimeError: If no such directory exists.
    """
    here = Path(__file__).resolve().parent
    # Script directory first, then each ancestor, excluding the filesystem root.
    search_path = (here, *here.parents)[:-1]
    for directory in search_path:
        if not (directory / "Cargo.toml").exists():
            continue
        if (directory / "test_documents").exists():
            return directory
    raise RuntimeError("Could not find repository root")
def discover_fixtures(fixtures_dir: Path, name_filter: str | None = None) -> list[dict]:
    """Collect the PDF fixtures below ``fixtures_dir`` whose document exists.

    Fixtures that cannot be read or parsed are silently ignored. When
    ``name_filter`` is given, only fixtures whose file stem contains it are
    kept.

    Returns:
        One dict per fixture with the keys ``name`` (file stem),
        ``fixture_path``, ``doc_path`` (resolved) and ``fixture`` (the
        parsed JSON), ordered by fixture path.
    """
    found = []
    for fixture_path in sorted(fixtures_dir.rglob("*.json")):
        try:
            with open(fixture_path) as handle:
                fixture = json.load(handle)
        except (json.JSONDecodeError, OSError):
            continue

        if fixture.get("file_type") != "pdf":
            continue

        name = fixture_path.stem
        if name_filter and name_filter not in name:
            continue

        doc_rel = fixture.get("document", "")
        if not doc_rel:
            continue

        # Document references are relative to the fixture's directory.
        doc_path = (fixture_path.parent / doc_rel).resolve()
        if doc_path.exists():
            found.append(
                dict(name=name, fixture_path=fixture_path, doc_path=doc_path, fixture=fixture)
            )
    return found
class _Timeout(Exception):
pass
def _timeout_handler(signum, frame):
raise _Timeout("API call timed out")
def _strip_code_fence(text: str) -> str:
    """Remove one wrapping ```markdown / ```md / ``` fence from ``text``, if present."""
    body = text.strip()
    for opener in ("```markdown\n", "```md\n", "```\n"):
        if body.startswith(opener):
            return body[len(opener) :].removesuffix("\n```")
    return body


def generate_markdown(
    client: genai.Client,
    pdf_path: Path,
    model: str,
    timeout: int = 120,
) -> str:
    """Send a PDF to Gemini and return the extracted markdown.

    The request is guarded by a ``SIGALRM`` alarm of ``timeout`` seconds;
    the previous handler is restored afterwards.

    Args:
        client: Client whose ``models.generate_content`` performs the request.
        pdf_path: PDF file to send.
        model: Model name passed through to the API.
        timeout: Seconds to wait before the alarm fires.

    Returns:
        The response text with a wrapping code fence removed, stripped of
        surrounding whitespace and terminated by a single newline. An empty
        response yields ``"\\n"``.

    Raises:
        _Timeout: If the alarm fires before the call returns.
    """
    pdf_bytes = pdf_path.read_bytes()

    old_handler = signal.signal(signal.SIGALRM, _timeout_handler)
    signal.alarm(timeout)
    try:
        response = client.models.generate_content(
            model=model,
            contents=[
                Part.from_bytes(data=pdf_bytes, mime_type="application/pdf"),
                EXTRACTION_PROMPT,
            ],
            config=GenerateContentConfig(
                temperature=0.1,
                max_output_tokens=8192,
            ),
        )
    finally:
        # Always cancel the alarm and restore the previous handler.
        signal.alarm(0)
        signal.signal(signal.SIGALRM, old_handler)

    # Surrounding whitespace is stripped BEFORE looking for the fence:
    # a response ending in "```\n" would otherwise keep its closing fence.
    text = _strip_code_fence(response.text or "")
    return text.strip() + "\n"
def main() -> int:
    """Command-line entry point: generate markdown ground truth for PDF fixtures.

    Fixtures whose ``.md`` file already exists are skipped unless ``--force``
    is given, as are PDFs larger than ``--max-size`` KB. In dry-run mode no
    client is created and nothing is written.

    Returns:
        0 when no fixture failed, otherwise 1.
    """
    cli = argparse.ArgumentParser(description="Generate markdown ground truth from PDFs using Gemini")
    cli.add_argument(
        "--filter", type=str, default=None, help="Only process fixtures whose name contains this string"
    )
    cli.add_argument("--dry-run", action="store_true", help="Show what would be processed without calling the API")
    cli.add_argument("--force", action="store_true", help="Regenerate even if .md file already exists")
    cli.add_argument(
        "--model", type=str, default="gemini-2.0-flash", help="Gemini model to use (default: gemini-2.0-flash)"
    )
    cli.add_argument("--project", type=str, default="boxwood-spirit-479620-r5", help="GCP project ID")
    cli.add_argument("--location", type=str, default="us-central1", help="Vertex AI location")
    cli.add_argument("--delay", type=float, default=1.0, help="Delay between API calls in seconds (rate limiting)")
    cli.add_argument("--timeout", type=int, default=120, help="Per-request timeout in seconds (default: 120)")
    cli.add_argument("--max-size", type=int, default=None, help="Skip PDFs larger than this many KB")
    opts = cli.parse_args()

    repo_root = get_repo_root()
    fixtures_dir = repo_root / "tools" / "benchmark-harness" / "fixtures"
    gt_dir = repo_root / "test_documents" / "ground_truth" / "pdf"

    print(f"Repository root: {repo_root}")
    print(f"Fixtures dir: {fixtures_dir}")
    print(f"Output dir: {gt_dir}")
    print(f"Model: {opts.model}")
    if opts.dry_run:
        print("DRY RUN MODE\n")

    fixtures = discover_fixtures(fixtures_dir, opts.filter)
    print(f"Found {len(fixtures)} PDF fixtures")

    # The API client is only needed (and only created) for real runs.
    client = None
    if not opts.dry_run:
        client = genai.Client(
            vertexai=True,
            project=opts.project,
            location=opts.location,
        )

    stats = {"generated": 0, "skipped": 0, "errors": 0}

    for entry in fixtures:
        name = entry["name"]
        pdf_path = entry["doc_path"]
        md_path = gt_dir / f"{name}.md"
        size_kb = pdf_path.stat().st_size / 1024

        if not opts.force and md_path.exists():
            stats["skipped"] += 1
            continue

        if opts.max_size and size_kb > opts.max_size:
            print(f"  Skipping {name} ({size_kb:.0f} KB > {opts.max_size} KB)")
            stats["skipped"] += 1
            continue

        if opts.dry_run:
            print(f"  [DRY] {name} ({size_kb:.0f} KB)")
            stats["generated"] += 1
            continue

        print(f"  Processing {name} ({size_kb:.0f} KB)...", end=" ", flush=True)
        try:
            started = time.time()
            markdown = generate_markdown(client, pdf_path, opts.model, timeout=opts.timeout)
            elapsed = time.time() - started

            gt_dir.mkdir(parents=True, exist_ok=True)
            md_path.write_text(markdown, encoding="utf-8")

            # Quick quality check
            lines = markdown.strip().split("\n")
            headings = len([line for line in lines if line.startswith("#")])
            tables = len([line for line in lines if "|" in line and "---" not in line])
            print(f"OK ({elapsed:.1f}s, {len(lines)} lines, {headings} headings, {tables} table rows)")
            stats["generated"] += 1

            # Rate limiting between successful API calls.
            time.sleep(opts.delay)
        except _Timeout:
            print(f"TIMEOUT ({opts.timeout}s)")
            stats["errors"] += 1
        except Exception as e:
            print(f"ERROR: {e}")
            stats["errors"] += 1

    print(f"\n{'=' * 50}")
    print(f"Generated: {stats['generated']}")
    print(f"Skipped: {stats['skipped']} (already exist)")
    print(f"Errors: {stats['errors']}")

    if stats["errors"] == 0:
        return 0
    return 1
if __name__ == "__main__":
sys.exit(main())

View File

@@ -0,0 +1,212 @@
#!/usr/bin/env bash
# Generate markdown and text ground truth for docbook, typst, and fictionbook formats
# using pandoc + sanitize_pandoc_gt.py, then create benchmark fixture JSON files.
set -euo pipefail

REPO_ROOT="$(cd "$(dirname "$0")/../../.." && pwd)"
SANITIZE="$REPO_ROOT/tools/benchmark-harness/scripts/sanitize_pandoc_gt.py"
FIXTURES_DIR="$REPO_ROOT/tools/benchmark-harness/fixtures"

# All relative paths below are resolved from the repository root.
cd "$REPO_ROOT"

# render_gfm FORMAT SRC DEST
# Convert SRC (pandoc input FORMAT) to sanitized GFM written to DEST.
render_gfm() {
  pandoc -f "$1" -t gfm --wrap=none "$2" 2>/dev/null | python3 "$SANITIZE" >"$3"
}

echo "=== Step 1: Generate MD ground truth via pandoc + sanitize ==="

# --- DocBook ---
echo "--- DocBook ---"
for ext in dbk docbook docbook4 docbook5; do
  for f in test_documents/docbook/*."$ext"; do
    [ -f "$f" ] || continue
    name=$(basename "$f" | sed 's/\.[^.]*$//')
    mkdir -p test_documents/ground_truth/docbook
    out="test_documents/ground_truth/docbook/${name}.md"
    render_gfm docbook "$f" "$out"
    echo "docbook: $name ($(wc -c <"$out") bytes)"
  done
done

# --- Typst ---
echo "--- Typst ---"
for f in test_documents/typst/*.typ; do
  [ -f "$f" ] || continue
  name=$(basename "$f" .typ)
  # Typst GT goes in both typ/ (matching existing convention) and typst/
  for gtdir in test_documents/ground_truth/typ test_documents/ground_truth/typst; do
    mkdir -p "$gtdir"
    render_gfm typst "$f" "${gtdir}/${name}.md"
  done
  echo "typst: $name ($(wc -c <"test_documents/ground_truth/typ/${name}.md") bytes)"
done

# --- FictionBook (fb2) ---
echo "--- FictionBook ---"
for f in test_documents/fictionbook/*.fb2; do
  [ -f "$f" ] || continue
  name=$(basename "$f" .fb2)
  mkdir -p test_documents/ground_truth/fb2
  existing="test_documents/ground_truth/fb2/${name}.md"
  # Existing FB2 markdown is kept; only missing files are generated.
  if [ -f "$existing" ]; then
    echo "fb2: $name (exists, $(wc -c <"$existing") bytes)"
  else
    render_gfm fb2 "$f" "$existing"
    echo "fb2: $name (new, $(wc -c <"$existing") bytes)"
  fi
done

echo ""
echo "=== Step 2: Generate text GT from MD GT ==="
# For each .md GT file, generate .txt if missing
for gt_kind in docbook typ fb2; do
  for md_file in test_documents/ground_truth/"$gt_kind"/*.md; do
    [ -f "$md_file" ] || continue
    txt_file="${md_file%.md}.txt"
    if [ ! -f "$txt_file" ]; then
      pandoc -f gfm -t plain --wrap=none "$md_file" >"$txt_file"
      echo "text: $(basename "$txt_file") (new, $(wc -c <"$txt_file") bytes)"
    fi
  done
done

echo ""
echo "=== Step 3: Create fixture JSON files ==="
# Helper to create fixture JSON
# create_fixture DOC_PATH FILE_TYPE GT_TEXT GT_MD FIXTURE_OUT DESCRIPTION CATEGORY
#
# Write a benchmark fixture JSON for DOC_PATH to FIXTURE_OUT. All paths are
# given relative to the repository root and are stored with a "../../../"
# prefix. The "markdown_file" entry is only included when GT_MD exists.
# When GT_TEXT does not exist no fixture is written: a warning is printed and
# the function returns successfully so the remaining documents are processed.
create_fixture() {
  local doc_path="$1"
  local file_type="$2"
  local gt_text="$3"
  local gt_md="$4"
  local fixture_out="$5"
  local description="$6"
  local category="$7"

  if [ ! -f "$gt_text" ]; then
    echo "fixture: $(basename "$fixture_out") SKIPPED (missing text GT: $gt_text)" >&2
    return 0
  fi

  # `wc -c` is portable. `stat -f %z` is BSD-only: GNU stat treats -f as
  # "file system status", prints that to stdout and then fails, which would
  # pollute the captured size.
  local file_size
  file_size=$(wc -c <"$doc_path" | tr -d ' ')

  # Compute relative paths from fixtures dir
  local rel_doc="../../../${doc_path}"
  local rel_text="../../../${gt_text}"
  local rel_md="../../../${gt_md}"

  # Optional "markdown_file" line, including its leading newline.
  local markdown_line=""
  if [ -f "$gt_md" ]; then
    markdown_line=$'\n'"    \"markdown_file\": \"${rel_md}\","
  fi

  cat >"$fixture_out" <<EOJSON
{
  "document": "${rel_doc}",
  "file_type": "${file_type}",
  "file_size": ${file_size},
  "expected_frameworks": ["kreuzberg"],
  "metadata": {
    "description": "${description}",
    "category": "${category}"
  },
  "ground_truth": {
    "text_file": "${rel_text}",${markdown_line}
    "source": "pandoc"
  }
}
EOJSON
  echo "fixture: $(basename "$fixture_out")"
}
# --- DocBook fixtures ---
echo "--- DocBook fixtures ---"
for f in test_documents/docbook/*.dbk test_documents/docbook/*.docbook test_documents/docbook/*.docbook4 test_documents/docbook/*.docbook5; do
  [ -f "$f" ] || continue
  name=$(basename "$f" | sed 's/\.[^.]*$//')
  ext=$(basename "$f" | sed 's/.*\.//')
  gt_md="test_documents/ground_truth/docbook/${name}.md"
  gt_txt="test_documents/ground_truth/docbook/${name}.txt"
  # Determine file_type based on extension
  case "$ext" in
  dbk) ft="dbk" ;;
  *) ft="docbook" ;;
  esac
  # Fixture file names use underscores where the document name has dashes.
  fixture_name="docbook_$(echo "$name" | tr '-' '_').json"
  create_fixture "$f" "$ft" "$gt_txt" "$gt_md" "${FIXTURES_DIR}/${fixture_name}" "DocBook document: ${name}" "docbook"
done

# --- Typst fixtures (update existing to add markdown_file) ---
echo "--- Typst fixtures ---"
for f in test_documents/typst/*.typ; do
  [ -f "$f" ] || continue
  name=$(basename "$f" .typ)
  gt_md="test_documents/ground_truth/typ/${name}.md"
  gt_txt="test_documents/ground_truth/typ/typst_${name}.txt"
  # Some txt files use name directly, some use typst_ prefix - check both
  if [ ! -f "$gt_txt" ]; then
    gt_txt="test_documents/ground_truth/typ/${name}.txt"
  fi
  fixture_name="typst_${name}.json"
  create_fixture "$f" "typ" "$gt_txt" "$gt_md" "${FIXTURES_DIR}/${fixture_name}" "Typst document: ${name}" "typst"
done

# --- FictionBook fixtures (update existing to add markdown_file) ---
echo "--- FictionBook fixtures ---"
for f in test_documents/fictionbook/*.fb2; do
  [ -f "$f" ] || continue
  name=$(basename "$f" .fb2)
  gt_md="test_documents/ground_truth/fb2/${name}.md"
  gt_txt="test_documents/ground_truth/fb2/${name}.txt"
  # Some txt files use fb2_ prefix
  if [ ! -f "$gt_txt" ]; then
    gt_txt="test_documents/ground_truth/fb2/fb2_${name}.txt"
  fi
  fixture_name="fb2_${name}.json"
  create_fixture "$f" "fb2" "$gt_txt" "$gt_md" "${FIXTURES_DIR}/${fixture_name}" "FictionBook document: ${name}" "fictionbook"
done

echo ""
echo "=== Step 4: Validate ==="
echo "--- Verifying GT files are non-empty ---"
empty_count=0
for f in test_documents/ground_truth/docbook/*.md test_documents/ground_truth/typ/*.md test_documents/ground_truth/fb2/*.md; do
  [ -f "$f" ] || continue
  size=$(wc -c <"$f" | tr -d ' ')
  # A file holding at most one byte (e.g. a lone newline) counts as empty.
  if [ "$size" -le 1 ]; then
    echo "WARNING: $f is empty/near-empty ($size bytes)"
    empty_count=$((empty_count + 1))
  fi
done
echo "Empty/near-empty GT files: $empty_count"

echo ""
echo "=== Summary ==="
echo "DocBook MD GT files: $(find test_documents/ground_truth/docbook/*.md -maxdepth 1 2>/dev/null | wc -l | tr -d ' ')"
echo "DocBook TXT GT files: $(find test_documents/ground_truth/docbook/*.txt -maxdepth 1 2>/dev/null | wc -l | tr -d ' ')"
echo "Typst MD GT files: $(find test_documents/ground_truth/typ/*.md -maxdepth 1 2>/dev/null | wc -l | tr -d ' ')"
echo "Typst TXT GT files: $(find test_documents/ground_truth/typ/*.txt -maxdepth 1 2>/dev/null | wc -l | tr -d ' ')"
echo "FB2 MD GT files: $(find test_documents/ground_truth/fb2/*.md -maxdepth 1 2>/dev/null | wc -l | tr -d ' ')"
echo "FB2 TXT GT files: $(find test_documents/ground_truth/fb2/*.txt -maxdepth 1 2>/dev/null | wc -l | tr -d ' ')"
echo ""
echo "Fixture files created/updated:"
# `ls` exits non-zero when any of the globs has no match (this script never
# writes dbk_*.json itself). As the last command that status would become the
# script's exit status, so it is deliberately ignored.
ls -1 "${FIXTURES_DIR}"/docbook_*.json "${FIXTURES_DIR}"/typst_*.json "${FIXTURES_DIR}"/fb2_*.json "${FIXTURES_DIR}"/dbk_*.json 2>/dev/null || true

View File

@@ -0,0 +1,212 @@
#!/usr/bin/env python3
"""Generate PDF markdown ground truth using Mistral's OCR API (model ``mistral-ocr-latest``).
Usage:
# Generate GT for all PDFs missing MD GT:
python generate_pdf_gt_mistral.py
# Generate GT for a specific fixture:
python generate_pdf_gt_mistral.py tools/benchmark-harness/fixtures/pdf/2203.01017v2.json
# Dry run (show what would be generated):
python generate_pdf_gt_mistral.py --dry-run
# Pilot batch (first N):
python generate_pdf_gt_mistral.py --limit 10
"""
import argparse
import base64
import json
import os
import sys
import time
from pathlib import Path
# API key taken from the environment; load_env() fills it in from a .env file
# when the environment variable is empty.
MISTRAL_API_KEY = os.environ.get("MISTRAL_API_KEY", "")
# Model name and endpoint used by call_mistral_ocr().
MISTRAL_MODEL = "mistral-ocr-latest"
MISTRAL_API_URL = "https://api.mistral.ai/v1/ocr"
# NOTE(review): PROMPT is not part of the request payload built in
# call_mistral_ocr() — confirm whether it is still needed.
PROMPT = (
"Convert this PDF to clean GFM (GitHub Flavored Markdown). "
"Preserve the document structure: headings, paragraphs, tables, lists, "
"code blocks, and formulas. Use proper heading hierarchy (# for title, ## for sections). "
"Render tables as GFM pipe tables. Do not add commentary or explanations."
)
def load_env():
    """Load MISTRAL_API_KEY from ../liter-llm/.env if not in environment.

    Does nothing when ``MISTRAL_API_KEY`` is already set. Otherwise the
    ``.env`` file is searched for a ``MISTRAL_API_KEY=`` line; surrounding
    whitespace and one pair of matching quotes around the value are removed.
    An empty value is treated as missing.

    Exits the process with status 1 (after printing an error to stderr)
    when no usable key is found.
    """
    global MISTRAL_API_KEY
    if MISTRAL_API_KEY:
        return

    env_path = Path(__file__).resolve().parents[3] / ".." / "liter-llm" / ".env"
    if env_path.exists():
        for line in env_path.read_text(encoding="utf-8").splitlines():
            if not line.startswith("MISTRAL_API_KEY="):
                continue
            value = line.split("=", 1)[1].strip()
            # .env files commonly quote values; the quotes are not part of the key.
            if len(value) >= 2 and value[0] == value[-1] and value[0] in "\"'":
                value = value[1:-1]
            if value:
                MISTRAL_API_KEY = value
                return

    print("ERROR: MISTRAL_API_KEY not found", file=sys.stderr)
    sys.exit(1)
def call_mistral_ocr(pdf_path: str) -> str:
    """Run a PDF through the Mistral OCR endpoint and return its markdown.

    The file is sent inline as a base64 ``data:`` URL. The ``markdown`` of
    every returned page is joined with blank lines; a response without
    pages yields an empty string.

    Raises:
        httpx.HTTPStatusError: If the API answers with an error status
            (via ``raise_for_status``).
    """
    import httpx

    encoded_pdf = base64.standard_b64encode(Path(pdf_path).read_bytes()).decode("ascii")
    request_body = {
        "model": MISTRAL_MODEL,
        "document": {
            "type": "document_url",
            "document_url": f"data:application/pdf;base64,{encoded_pdf}",
        },
    }
    request_headers = {
        "Authorization": f"Bearer {MISTRAL_API_KEY}",
        "Content-Type": "application/json",
    }

    response = httpx.post(
        MISTRAL_API_URL,
        json=request_body,
        headers=request_headers,
        timeout=120.0,
    )
    response.raise_for_status()

    # Extract markdown from pages
    pages = response.json().get("pages", [])
    if pages:
        return "\n\n".join(page.get("markdown", "") for page in pages)
    return ""
def find_fixtures_needing_gt() -> list[tuple[str, str, str]]:
    """List the PDF fixtures that have ground truth but no markdown file yet.

    Fixtures are read from ``tools/benchmark-harness/fixtures/pdf`` relative
    to the current working directory. A fixture is skipped when it has no
    ``ground_truth`` entry, already names a ``markdown_file``, or points at
    a document that does not exist.

    Returns:
        ``(fixture_path, pdf_path, gt_md_path)`` string triples in fixture
        file order; the PDF and markdown paths are resolved.
    """
    fixtures_dir = Path("tools/benchmark-harness/fixtures/pdf")
    pending = []
    for fixture_file in sorted(fixtures_dir.glob("*.json")):
        data = json.loads(fixture_file.read_text())
        gt = data.get("ground_truth")
        if gt is None or gt.get("markdown_file"):
            # No ground truth at all, or markdown GT already present.
            continue

        doc_rel = data.get("document", "")
        pdf_path = str((fixture_file.parent / doc_rel).resolve())
        if not Path(pdf_path).exists():
            continue

        # The markdown GT sits next to the text GT when there is one,
        # otherwise in the default PDF ground truth directory.
        text_file = gt.get("text_file", "")
        if text_file:
            md_rel = text_file.rsplit(".", 1)[0] + ".md"
        else:
            md_rel = f"../../../../test_documents/ground_truth/pdf/{Path(doc_rel).stem}.md"
        md_path = str((fixture_file.parent / md_rel).resolve())

        pending.append((str(fixture_file), pdf_path, md_path))
    return pending
def process_fixture(fixture_path: str, pdf_path: str, gt_md_path: str, dry_run: bool = False) -> bool:
    """Generate markdown ground truth for one fixture.

    Runs the PDF through ``call_mistral_ocr``, sanitizes the result, writes
    it to ``gt_md_path`` as UTF-8 and records the file in the fixture's
    ``ground_truth`` entry (created when missing) as ``markdown_file``,
    relative to the fixture's directory.

    Args:
        fixture_path: Fixture JSON file to update.
        pdf_path: PDF document to convert.
        gt_md_path: Destination of the markdown ground truth.
        dry_run: Only print what would be done.

    Returns:
        True on success or in dry-run mode; False when the OCR result is
        empty or any step raises (the error is printed, not propagated).
    """
    name = Path(pdf_path).stem
    size_mb = Path(pdf_path).stat().st_size / (1024 * 1024)

    if dry_run:
        print(f"  [dry-run] {name} ({size_mb:.1f}MB) → {gt_md_path}")
        return True

    print(f"  Processing {name} ({size_mb:.1f}MB)...", end=" ", flush=True)
    try:
        markdown = call_mistral_ocr(pdf_path)
        if not markdown.strip():
            print("EMPTY")
            return False

        # Sanitize
        from sanitize_pandoc_gt import sanitize

        markdown = sanitize(markdown)

        # Read the fixture BEFORE writing anything, so an unreadable fixture
        # does not leave an unreferenced ground truth file behind.
        fixture_file = Path(fixture_path)
        data = json.loads(fixture_file.read_text(encoding="utf-8"))
        gt = data.get("ground_truth") or {}
        data["ground_truth"] = gt

        # Write GT file; OCR output is routinely non-ASCII, so the encoding
        # must not depend on the platform default.
        gt_file = Path(gt_md_path)
        gt_file.parent.mkdir(parents=True, exist_ok=True)
        encoded = markdown.encode("utf-8")
        gt_file.write_bytes(encoded)

        # Update fixture JSON with the path relative to the fixture.
        gt["markdown_file"] = os.path.relpath(gt_md_path, fixture_file.parent)
        gt["source"] = "mistral-pixtral"
        fixture_file.write_text(json.dumps(data, indent=2) + "\n", encoding="utf-8")

        print(f"OK ({len(encoded)} bytes)")
        return True
    except Exception as e:
        print(f"ERROR: {e}")
        return False
def main():
    """Command-line entry point for Mistral-OCR ground-truth generation.

    With a positional fixture argument only that fixture is processed;
    otherwise every fixture reported by ``find_fixtures_needing_gt`` is
    processed (optionally capped by ``--limit``), sleeping ``--delay``
    seconds after each real (non dry-run) call.
    """
    cli = argparse.ArgumentParser(description="Generate PDF GT with Mistral OCR")
    cli.add_argument("fixture", nargs="?", help="Specific fixture JSON to process")
    cli.add_argument("--dry-run", action="store_true", help="Show what would be done")
    cli.add_argument("--limit", type=int, default=0, help="Process only first N fixtures")
    cli.add_argument("--delay", type=float, default=1.0, help="Delay between API calls (seconds)")
    args = cli.parse_args()
    load_env()

    if args.fixture:
        # Single-fixture mode: derive the PDF and GT paths from the fixture itself.
        fixture_dir = Path(args.fixture).parent
        data = json.loads(Path(args.fixture).read_text())
        doc_path = data.get("document", "")
        pdf_path = str((fixture_dir / doc_path).resolve())
        text_file = data.get("ground_truth", {}).get("text_file", "")
        if not text_file:
            gt_md = f"../../../../test_documents/ground_truth/pdf/{Path(doc_path).stem}.md"
        else:
            gt_md = text_file.rsplit(".", 1)[0] + ".md"
        gt_md_path = str((fixture_dir / gt_md).resolve())
        process_fixture(args.fixture, pdf_path, gt_md_path, dry_run=args.dry_run)
        return

    # Batch mode: every fixture that still lacks markdown GT.
    pending = find_fixtures_needing_gt()
    print(f"Found {len(pending)} PDF fixtures needing markdown GT")
    if args.limit > 0:
        pending = pending[: args.limit]
        print(f"Processing first {args.limit}")

    tally = {True: 0, False: 0}
    for fixture_path, pdf_path, gt_md_path in pending:
        outcome = process_fixture(fixture_path, pdf_path, gt_md_path, dry_run=args.dry_run)
        tally[bool(outcome)] += 1
        if not args.dry_run and args.delay > 0:
            time.sleep(args.delay)
    print(f"\nDone: {tally[True]} generated, {tally[False]} failed")
# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,172 @@
# /// script
# requires-python = ">=3.11"
# dependencies = [
# "paddleocr>=3.4.0",
# "paddlepaddle>=3.3.0",
# "rapidocr-onnxruntime>=1.4.0",
# "pymupdf>=1.24.0",
# "pillow>=10.0.0",
# "numpy>=1.24.0",
# ]
# ///
"""Generate vendored OCR baselines from PaddleOCR Python and RapidOCR.
Usage:
uv run tools/benchmark-harness/scripts/generate_vendored_baselines.py
uv run tools/benchmark-harness/scripts/generate_vendored_baselines.py rapidocr
uv run tools/benchmark-harness/scripts/generate_vendored_baselines.py --force
"""
import json
import os
import sys
import time
from pathlib import Path
import fitz
import numpy as np
# Both directories are resolved relative to this script: they are the
# "fixtures" and "vendored" folders next to the script's parent directory.
FIXTURES_DIR = Path(__file__).resolve().parent.parent / "fixtures"
VENDORED_DIR = Path(__file__).resolve().parent.parent / "vendored"
# Fixture names (without the ".json" extension) that main() looks up under
# FIXTURES_DIR and runs through each selected OCR pipeline.
OCR_FIXTURES = [
    "pdf_image_only_german",
    "pdf_non_searchable",
    "pdf_ocr_rotated_270",
    "pdf_ocr_rotated_90",
    "pdf_ocr_rotated",
    "pdf_ocr_test",
    "pdf_scanned_ocr",
]
def pdf_to_images(pdf_path: str, dpi: int = 300) -> list[np.ndarray]:
    """Convert PDF pages to numpy arrays (RGB, HWC).

    Args:
        pdf_path: Path of the PDF to rasterize.
        dpi: Target resolution; pages are scaled by ``dpi / 72``.

    Returns:
        One array per page, in page order.
    """
    import io

    from PIL import Image

    # The scale matrix does not depend on the page, so build it once.
    zoom = fitz.Matrix(dpi / 72, dpi / 72)
    images: list[np.ndarray] = []
    doc = fitz.open(pdf_path)
    try:
        for page in doc:
            pix = page.get_pixmap(matrix=zoom)
            # Round-trip through PNG so PIL normalizes the pixmap to plain RGB.
            img = Image.open(io.BytesIO(pix.tobytes("png"))).convert("RGB")
            images.append(np.array(img))
    finally:
        # Release the document even when rendering a page fails.
        doc.close()
    return images
def lines_to_markdown(lines: list[str]) -> str:
    """Render OCR text lines as markdown, one paragraph per non-blank line.

    Lines are stripped; blank ones are dropped. Paragraphs are separated by
    an empty line and the result ends with a single newline. An input with
    no usable lines yields the empty string.
    """
    paragraphs = []
    for raw in lines:
        cleaned = raw.strip()
        if cleaned:
            paragraphs.append(cleaned)
    if not paragraphs:
        return ""
    return "\n\n".join(paragraphs) + "\n"
def run_paddleocr_python(pdf_path: str) -> tuple[str, float]:
    """Run PaddleOCR Python v3.4+ using the predict() API.

    Returns:
        ``(markdown, elapsed_ms)``. The timer covers only the ``predict()``
        calls; model construction and PDF rasterization happen before it
        starts.
    """
    os.environ["PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK"] = "True"
    from paddleocr import PaddleOCR

    ocr = PaddleOCR(use_textline_orientation=True, lang="en")
    images = pdf_to_images(pdf_path)
    start = time.monotonic()
    all_lines: list[str] = []
    for img in images:
        # predict() returns list of OCRResult (dict-like) objects
        for result in ocr.predict(img):
            # The OCR pipeline result stores the recognized strings under
            # "rec_texts". The singular "rec_text" is kept as a fallback so
            # result objects that use that spelling still work.
            rec_texts = result.get("rec_texts") or result.get("rec_text") or []
            if isinstance(rec_texts, str):
                rec_texts = [rec_texts]
            if isinstance(rec_texts, (list, tuple)):
                for t in rec_texts:
                    text = str(t).strip()
                    if text:
                        all_lines.append(text)
    elapsed_ms = (time.monotonic() - start) * 1000
    return lines_to_markdown(all_lines), elapsed_ms
def run_rapidocr(pdf_path: str) -> tuple[str, float]:
    """Run RapidOCR over every page of the PDF.

    Returns:
        ``(markdown, elapsed_ms)``; the timer starts after the engine is
        constructed and the pages are rasterized.
    """
    from rapidocr_onnxruntime import RapidOCR

    engine = RapidOCR()
    pages = pdf_to_images(pdf_path)
    started = time.monotonic()
    collected: list[str] = []
    for page_img in pages:
        detections, _ = engine(page_img)
        if not detections:
            continue
        for entry in detections:
            # The recognized text is read from index 1 of each entry.
            if not entry or len(entry) < 2:
                continue
            recognized = str(entry[1]).strip()
            if recognized:
                collected.append(recognized)
    elapsed_ms = (time.monotonic() - started) * 1000
    return lines_to_markdown(collected), elapsed_ms
def save_vendored(pipeline_name: str, fixture_name: str, md: str, time_ms: float):
    """Persist one pipeline result under ``VENDORED_DIR/<pipeline_name>``.

    Writes ``md/<fixture_name>.md`` with the markdown and
    ``timing/<fixture_name>.ms`` with the duration (one decimal place),
    creating both directories when needed.
    """
    md_dir = VENDORED_DIR / pipeline_name / "md"
    timing_dir = VENDORED_DIR / pipeline_name / "timing"
    md_dir.mkdir(parents=True, exist_ok=True)
    timing_dir.mkdir(parents=True, exist_ok=True)
    # OCR output contains non-ASCII text (e.g. the German fixture); pin the
    # encoding so baselines are identical regardless of the host locale.
    (md_dir / f"{fixture_name}.md").write_text(md, encoding="utf-8")
    (timing_dir / f"{fixture_name}.ms").write_text(f"{time_ms:.1f}\n", encoding="utf-8")
def main():
    """Generate vendored baselines for every fixture in ``OCR_FIXTURES``.

    The first non-flag command-line argument, when present, restricts the run
    to that pipeline. Existing non-empty outputs are reused unless
    ``--force`` is given. Failures of a single pipeline/fixture pair are
    printed with a traceback and do not stop the run.
    """
    runners = {
        "paddleocr-python": run_paddleocr_python,
        "rapidocr": run_rapidocr,
    }
    force = "--force" in sys.argv
    positional = [token for token in sys.argv[1:] if not token.startswith("--")]
    if positional:
        choice = positional[0]
        if choice not in runners:
            print(f"Unknown: {choice}. Choose: {list(runners.keys())}")
            sys.exit(1)
        runners = {choice: runners[choice]}

    for fixture_name in OCR_FIXTURES:
        fixture_path = FIXTURES_DIR / f"{fixture_name}.json"
        if not fixture_path.exists():
            print(f" SKIP {fixture_name}: fixture not found")
            continue
        with open(fixture_path) as handle:
            fixture = json.load(handle)
        doc_path = str((FIXTURES_DIR / fixture["document"]).resolve())
        if not os.path.exists(doc_path):
            print(f" SKIP {fixture_name}: document not found")
            continue

        for pipeline_name, run_fn in runners.items():
            cached_md = VENDORED_DIR / pipeline_name / "md" / f"{fixture_name}.md"
            if force or not (cached_md.exists() and cached_md.stat().st_size > 0):
                print(f" RUN {pipeline_name}/{fixture_name} ...", end="", flush=True)
                try:
                    md, time_ms = run_fn(doc_path)
                    save_vendored(pipeline_name, fixture_name, md, time_ms)
                    print(f" {time_ms:.0f}ms, {len(md)} chars")
                except Exception as e:
                    print(f" ERROR: {e}")
                    import traceback

                    traceback.print_exc()
            else:
                print(f" CACHED {pipeline_name}/{fixture_name}")
# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,7 @@
module github.com/kreuzberg-dev/kreuzberg/tools/benchmark-harness/scripts
go 1.23
require github.com/kreuzberg-dev/kreuzberg/packages/go/v4 v4.9.5
// Resolve the kreuzberg Go package from the in-repo checkout rather than a published module version.
replace github.com/kreuzberg-dev/kreuzberg/packages/go/v4 => ../../../packages/go/v4

View File

@@ -0,0 +1,407 @@
"""Import OmniDocBench dataset into our benchmark fixture format.
Converts OmniDocBench's element-level JSON annotations into:
- Per-document fixture JSON files (tools/benchmark-harness/fixtures/pdf/omnidoc_<name>.json)
- Ground truth markdown files (test_documents/ground_truth/pdf/omnidoc_<name>.md)
- Ground truth text files (test_documents/ground_truth/pdf/omnidoc_<name>.txt)
Here <name> is the source PDF name with every character outside [a-zA-Z0-9_-] replaced by "_".
OmniDocBench groups pages by document. Each multi-page document produces one fixture.
Single-page documents produce one fixture per page.
Usage:
python import_omnidocbench.py <omnidocbench_dir> <repo_root>
Where:
omnidocbench_dir = tools/benchmark-harness/datasets/omnidocbench (contains OmniDocBench.json + ori_pdfs/)
repo_root = repository root (contains tools/ and test_documents/)
"""
from __future__ import annotations
import html
import json
import os
import re
import sys
from collections import defaultdict
from pathlib import Path
# OmniDocBench category types that map to content we want in ground truth.
# NOTE(review): this set is not referenced by the functions visible in this
# part of the file; filtering is done through SKIP_CATEGORIES only.
CONTENT_CATEGORIES = {
    "title",
    "text_block",
    "table",
    "equation_isolated",
    "code_txt",
    "figure_caption",
    "table_caption",
    "equation_caption",
    "code_txt_caption",
    "reference",
}
# Categories to skip (page furniture, figures without text, etc.).
# annotation_to_markdown() returns None for any annotation in this set.
SKIP_CATEGORIES = {
    "header",
    "footer",
    "page_number",
    "page_footnote",
    "abandon",
    "figure",
    "figure_footnote",
    "table_footnote",
}
def html_table_to_markdown(html_str: str) -> str:
    """Convert a simple HTML table to markdown table format.

    Entities are unescaped first, then each ``<tr>`` becomes a row and each
    ``<td>``/``<th>`` a cell with nested tags removed. Short rows are padded
    with empty cells and the first row is used as the header. When no row
    can be extracted the unescaped input is returned unchanged; an empty
    input yields the empty string.
    """
    if not html_str:
        return ""
    decoded = html.unescape(html_str)

    table: list[list[str]] = []
    for row_html in re.findall(r"<tr[^>]*>(.*?)</tr>", decoded, re.DOTALL):
        cells = [
            re.sub(r"<[^>]+>", "", inner).strip()
            for inner in re.findall(r"<t[dh][^>]*>(.*?)</t[dh]>", row_html, re.DOTALL)
        ]
        if cells:
            table.append(cells)
    if not table:
        return decoded  # fallback: return raw if parsing fails

    # Pad every row to the widest row so the table stays rectangular.
    width = max(map(len, table))
    padded = [cells + [""] * (width - len(cells)) for cells in table]

    header, *body = padded
    rendered = [
        "| " + " | ".join(header) + " |",
        "|" + "|".join(["---"] * width) + "|",
    ]
    rendered.extend("| " + " | ".join(cells) + " |" for cells in body)
    return "\n".join(rendered)
def annotation_to_markdown(ann: dict) -> str | None:
    """Render one OmniDocBench annotation as a markdown block.

    Returns ``None`` for skipped categories, annotations flagged ``ignore``
    and annotations that end up without content. Titles become H2 headings
    (the annotations carry no heading level), tables prefer their ``html``
    field, isolated equations prefer ``latex``, code is fenced, and every
    other category, including unknown ones, is emitted as its stripped text.
    """
    category = ann.get("category_type", "")
    if category in SKIP_CATEGORIES or ann.get("ignore", False):
        return None

    text = ann.get("text", "").strip()

    if category == "title":
        return f"## {text}" if text else None
    if category == "code_txt":
        return f"```\n{text}\n```" if text else None

    if category == "table":
        # Prefer the HTML representation; fall through to plain text otherwise.
        table_html = ann.get("html", "")
        if table_html:
            return html_table_to_markdown(table_html)
    elif category == "equation_isolated":
        latex = ann.get("latex", "")
        if latex:
            return f"$$\n{latex}\n$$"

    # text blocks, captions, references, unknown categories and the
    # table/equation fallbacks all reduce to the plain text.
    return text or None
def page_to_markdown(page: dict) -> str:
    """Convert a single OmniDocBench page to markdown.

    Annotations are emitted in reading order (``order``; entries without it
    sort last). Blocks that continue a truncated block are not emitted on
    their own: their text is appended to the block where the truncation
    chain starts, so A <- B <- C yields a single block "A B C".

    Returns the rendered blocks separated by blank lines.
    """
    annotations = page.get("layout_dets", [])
    # Sort by reading order
    sorted_anns = sorted(annotations, key=lambda a: a.get("order", 999))

    # Handle truncated blocks (merge them)
    relations = page.get("extra", {}).get("relation", [])
    merge_targets: dict[int, int] = {}  # target_id -> source_id
    for rel in relations:
        # NOTE(review): the relation kind is read from "relation"; the
        # dataset documentation also shows "relation_type" for truncated
        # links, so either key is accepted. Confirm against the dataset
        # version in use.
        if "truncated" in (rel.get("relation"), rel.get("relation_type")):
            merge_targets[rel["target_anno_id"]] = rel["source_anno_id"]

    def _chain_root(anno_id: int) -> int:
        """Follow source links until a block that is not itself a continuation."""
        visited = {anno_id}
        source_id = merge_targets[anno_id]
        # The visited set guards against cyclic relation data.
        while source_id in merge_targets and source_id not in visited:
            visited.add(source_id)
            source_id = merge_targets[source_id]
        return source_id

    # Build merged text for truncated blocks, keyed by the chain's root block.
    merged_text: dict[int, list[str]] = defaultdict(list)
    for ann in sorted_anns:
        anno_id = ann.get("anno_id", -1)
        if anno_id in merge_targets:
            text = ann.get("text", "").strip()
            if text:
                merged_text[_chain_root(anno_id)].append(text)

    blocks: list[str] = []
    skip_ids = set(merge_targets.keys())
    for ann in sorted_anns:
        anno_id = ann.get("anno_id", -1)
        if anno_id in skip_ids:
            continue
        # Append merged text from truncated continuations
        if anno_id in merged_text:
            original_text = ann.get("text", "").strip()
            continuation = " ".join(merged_text[anno_id])
            ann = dict(ann)  # shallow copy so the caller's annotation is untouched
            ann["text"] = f"{original_text} {continuation}".strip()
        md = annotation_to_markdown(ann)
        if md:
            blocks.append(md)
    return "\n\n".join(blocks)
def strip_markdown_to_text(md: str) -> str:
    """Strip markdown syntax to produce plain text.

    Fence lines (``` and $$) are dropped and the lines between them are kept
    verbatim. Elsewhere heading markers, table pipes and ``*`` emphasis
    markers are removed, table separator rows are skipped, and lines that
    end up empty are omitted.
    """
    heading_re = re.compile(r"^#{1,6}\s+")
    separator_re = re.compile(r"^\|[-|: ]+\|$")
    pipe_re = re.compile(r"\s*\|\s*")
    emphasis_re = re.compile(r"\*{1,3}([^*]+)\*{1,3}")

    kept: list[str] = []
    # Open/closed state per fence marker; the two are tracked independently.
    inside = {"```": False, "$$": False}
    for line in md.split("\n"):
        marker = next((m for m in inside if line.startswith(m)), None)
        if marker is not None:
            inside[marker] = not inside[marker]
            continue
        if inside["```"] or inside["$$"]:
            kept.append(line)
            continue

        plain = heading_re.sub("", line)
        if plain.startswith("|") and plain.endswith("|"):
            if separator_re.match(plain):
                continue  # header/body separator row carries no text
            plain = pipe_re.sub(" ", plain).strip()
        plain = emphasis_re.sub(r"\1", plain)
        if plain:
            kept.append(plain)
    return "\n".join(kept)
def group_pages_by_pdf(pages: list[dict]) -> dict[str, list[dict]]:
    """Group OmniDocBench pages by their source PDF.

    The document name is the image file's base name without extension and
    without a trailing ``_p<digits>`` page suffix (for example
    ``"academic_literature/scihub_12345_p0.jpg"`` -> ``"scihub_12345"``).
    Pages within each group are ordered by ``page_info.page_no``.
    """
    groups: dict[str, list[dict]] = defaultdict(list)
    for page in pages:
        image_path = page.get("page_info", {}).get("image_path", "")
        stem, _extension = os.path.splitext(os.path.basename(image_path))
        document = re.sub(r"_p\d+$", "", stem)
        groups[document].append(page)

    # Order the pages of every document by page number.
    for doc_pages in groups.values():
        doc_pages.sort(key=lambda p: p.get("page_info", {}).get("page_no", 0))
    return groups
def find_pdf_for_document(pdf_name: str, pages: list[dict], ori_pdfs_dir: Path) -> Path | None:
    """Find the original PDF file for a document group.

    Lookup order:
      1. ``<ori_pdfs_dir>/<pdf_name>.pdf`` or ``.PDF``;
      2. any ``*.pdf`` below ``ori_pdfs_dir`` whose stem equals ``pdf_name``;
      3. a ``*.pdf`` in the sub-directory named by the first component of the
         first page's image path, whose stem is a prefix of ``pdf_name`` or
         vice versa.

    Returns ``None`` when the directory is missing or nothing matches.
    """
    if not ori_pdfs_dir.exists():
        return None

    # 1. Direct name match
    direct_candidates = (ori_pdfs_dir / f"{pdf_name}{ext}" for ext in (".pdf", ".PDF"))
    direct = next((c for c in direct_candidates if c.exists()), None)
    if direct is not None:
        return direct

    # 2. Exact stem match anywhere below the root
    nested = next((p for p in ori_pdfs_dir.rglob("*.pdf") if p.stem == pdf_name), None)
    if nested is not None:
        return nested

    # 3. Prefix match inside the sub-directory taken from the image path
    if not pages:
        return None
    image_path = pages[0].get("page_info", {}).get("image_path", "")
    segments = image_path.split("/")
    if len(segments) < 2:
        return None
    subdir_path = ori_pdfs_dir / segments[0]
    if not subdir_path.exists():
        return None
    for pdf_file in subdir_path.glob("*.pdf"):
        stem = pdf_file.stem
        if pdf_name.startswith(stem) or stem.startswith(pdf_name):
            return pdf_file
    return None
def main() -> None:
    """Import OmniDocBench into benchmark fixtures and ground-truth files.

    Expects ``<omnidocbench_dir> <repo_root>`` on the command line. For every
    document group that has a source PDF and non-empty content, writes the
    markdown and plain-text ground truth plus a fixture JSON; fixtures that
    already exist are left untouched. Progress and a summary go to stderr.
    Exits with status 1 on bad usage or when ``OmniDocBench.json`` is missing.
    """
    if len(sys.argv) < 3:
        print(
            "Usage: import_omnidocbench.py <omnidocbench_dir> <repo_root>",
            file=sys.stderr,
        )
        sys.exit(1)
    omnidoc_dir = Path(sys.argv[1]).resolve()
    repo_root = Path(sys.argv[2]).resolve()
    json_path = omnidoc_dir / "OmniDocBench.json"
    ori_pdfs_dir = omnidoc_dir / "ori_pdfs"
    if not json_path.exists():
        print(f"ERROR: {json_path} not found. Run download_omnidocbench.sh first.", file=sys.stderr)
        sys.exit(1)
    fixtures_dir = repo_root / "tools" / "benchmark-harness" / "fixtures" / "pdf"
    gt_dir = repo_root / "test_documents" / "ground_truth" / "pdf"
    fixtures_dir.mkdir(parents=True, exist_ok=True)
    gt_dir.mkdir(parents=True, exist_ok=True)
    print(f"Loading {json_path}...", file=sys.stderr)
    # The annotations are not ASCII-only; read them as UTF-8 rather than
    # with whatever the locale encoding happens to be.
    with open(json_path, encoding="utf-8") as f:
        pages = json.load(f)
    print(f"Loaded {len(pages)} pages", file=sys.stderr)
    # Group pages by document
    doc_groups = group_pages_by_pdf(pages)
    print(f"Found {len(doc_groups)} documents", file=sys.stderr)
    created = 0
    skipped_no_pdf = 0
    skipped_exists = 0
    skipped_empty = 0
    for pdf_name, doc_pages in sorted(doc_groups.items()):
        # Generate fixture name
        fixture_name = f"omnidoc_{pdf_name}"
        # Sanitize: replace non-alphanumeric chars
        fixture_name = re.sub(r"[^a-zA-Z0-9_-]", "_", fixture_name)
        fixture_path = fixtures_dir / f"{fixture_name}.json"
        gt_md_path = gt_dir / f"{fixture_name}.md"
        gt_txt_path = gt_dir / f"{fixture_name}.txt"
        # Skip if already imported
        if fixture_path.exists():
            skipped_exists += 1
            continue
        # Find the PDF
        pdf_path = find_pdf_for_document(pdf_name, doc_pages, ori_pdfs_dir)
        if pdf_path is None:
            skipped_no_pdf += 1
            continue
        # Generate markdown from all pages
        page_markdowns = []
        for page in doc_pages:
            md = page_to_markdown(page)
            if md.strip():
                page_markdowns.append(md)
        if not page_markdowns:
            skipped_empty += 1
            continue
        full_markdown = "\n\n".join(page_markdowns)
        full_text = strip_markdown_to_text(full_markdown)
        # Write ground truth files (UTF-8, independent of the host locale)
        gt_md_path.write_text(full_markdown, encoding="utf-8")
        gt_txt_path.write_text(full_text, encoding="utf-8")
        # Compute relative paths from fixture to document and ground truth
        doc_rel = os.path.relpath(pdf_path, fixtures_dir)
        gt_md_rel = os.path.relpath(gt_md_path, fixtures_dir)
        gt_txt_rel = os.path.relpath(gt_txt_path, fixtures_dir)
        # Get page metadata for fixture (taken from the document's first page)
        first_page = doc_pages[0].get("page_info", {})
        page_attr = first_page.get("page_attribute", {})
        file_size = pdf_path.stat().st_size
        fixture = {
            "document": doc_rel,
            "file_type": "pdf",
            "file_size": file_size,
            "expected_frameworks": ["kreuzberg"],
            "metadata": {
                "description": f"OmniDocBench: {page_attr.get('data_source', 'unknown')}",
                "source": "omnidocbench",
                "size_category": "small" if file_size < 500_000 else "medium",
                "language": page_attr.get("language", "unknown"),
                "layout": page_attr.get("layout", "unknown"),
                "data_source": page_attr.get("data_source", "unknown"),
                "page_count": len(doc_pages),
            },
            "ground_truth": {
                "text_file": gt_txt_rel,
                "markdown_file": gt_md_rel,
                "source": "omnidocbench",
            },
        }
        fixture_path.write_text(json.dumps(fixture, indent=2) + "\n", encoding="utf-8")
        created += 1
        if created % 50 == 0:
            print(f" {created} fixtures created...", file=sys.stderr)
    print("\nDone:", file=sys.stderr)
    print(f" Created: {created}", file=sys.stderr)
    print(f" Skipped (already exists): {skipped_exists}", file=sys.stderr)
    print(f" Skipped (no PDF found): {skipped_no_pdf}", file=sys.stderr)
    print(f" Skipped (empty content): {skipped_empty}", file=sys.stderr)
    print(f" Fixtures: {fixtures_dir}", file=sys.stderr)
    print(f" Ground truth: {gt_dir}", file=sys.stderr)
# Run the importer only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,175 @@
"""MarkItDown extraction wrapper for benchmark harness."""
from __future__ import annotations
import json
import multiprocessing as _mp
import os
import platform
import resource
import sys
import time
from markitdown import MarkItDown
def _get_peak_memory_bytes() -> int:
"""Get peak memory usage in bytes using resource module."""
usage = resource.getrusage(resource.RUSAGE_SELF)
if platform.system() == "Linux":
return usage.ru_maxrss * 1024
return usage.ru_maxrss
def extract_sync(file_path: str) -> dict:
    """Convert one file with MarkItDown and return the harness payload.

    The reported time covers constructing the converter and the conversion
    itself. ``content`` falls back to the empty string when the converter
    returns no text.
    """
    started = time.perf_counter()
    converted = MarkItDown().convert(file_path)
    elapsed_ms = (time.perf_counter() - started) * 1000.0

    payload = {
        "content": converted.text_content or "",
        "metadata": {"framework": "markitdown"},
        "_extraction_time_ms": elapsed_ms,
    }
    payload["_peak_memory_bytes"] = _get_peak_memory_bytes()
    return payload
def _worker(fn, args, conn):
"""Run extraction in a forked child process.
Closes inherited stdin/stdout so the child cannot corrupt the
parent's line-based JSON protocol.
"""
try:
sys.stdin.close()
sys.stdout = open(os.devnull, "w")
except Exception:
pass
try:
result = fn(*args)
conn.send(result)
except Exception as e:
conn.send({"error": str(e), "_extraction_time_ms": 0})
finally:
conn.close()
def _run_with_timeout(fn, args, timeout):
"""Execute fn(*args) in a forked child with a timeout.
On timeout the child is killed but the parent stays alive —
no expensive process restart is needed.
"""
try:
ctx = _mp.get_context("fork")
parent_conn, child_conn = ctx.Pipe(duplex=False)
p = ctx.Process(target=_worker, args=(fn, args, child_conn))
p.start()
child_conn.close()
if parent_conn.poll(timeout=timeout):
try:
result = parent_conn.recv()
except Exception:
result = {"error": "worker process crashed", "_extraction_time_ms": 0}
else:
p.kill()
result = {
"error": f"extraction timed out after {timeout}s",
"_extraction_time_ms": timeout * 1000.0,
}
p.join(timeout=5)
if p.is_alive():
p.kill()
p.join()
parent_conn.close()
return result
except Exception:
# Fork not available — fall back to in-process extraction
try:
return fn(*args)
except Exception as e:
return {"error": str(e), "_extraction_time_ms": 0}
def _parse_path(line: str) -> str:
"""Parse a request line: JSON object with path field, or plain file path."""
stripped = line.strip()
if stripped.startswith("{"):
try:
return json.loads(stripped).get("path", "")
except (json.JSONDecodeError, ValueError):
pass
return stripped
def run_server(timeout=None) -> None:
    """Persistent server mode: read paths from stdin, write JSON to stdout.

    Prints ``READY`` once, then answers every non-empty request line with a
    single line of JSON. With a ``timeout`` the extraction is delegated to
    ``_run_with_timeout``; without one it runs in-process and exceptions are
    reported as an error payload.
    """
    print("READY", flush=True)
    for request_line in sys.stdin:
        target = _parse_path(request_line)
        if not target:
            continue
        if timeout is None:
            try:
                response = extract_sync(target)
            except Exception as exc:
                response = {"error": str(exc), "_extraction_time_ms": 0}
        else:
            response = _run_with_timeout(extract_sync, (target,), timeout)
        print(json.dumps(response), flush=True)
def main() -> None:
    """Command-line entry point for the MarkItDown wrapper.

    Recognized flags: ``--ocr``/``--no-ocr`` (accepted, not used by the
    extraction), ``--timeout=SECS`` and ``--format=`` (only ``markdown`` is
    accepted; anything else exits with status 64). The first positional
    argument selects the mode: ``server``, ``sync <file>``, or, as a legacy
    form, a bare file path.
    """
    ocr_enabled = False
    timeout = None
    positional = []
    for arg in sys.argv[1:]:
        if arg in ("--ocr", "--no-ocr"):
            ocr_enabled = arg == "--ocr"
        elif arg.startswith("--timeout="):
            timeout = int(arg.split("=", 1)[1])
        elif arg.startswith("--format="):
            _fmt = arg.split("=", 1)[1]
            if _fmt != "markdown":
                print(f"{sys.argv[0]} only supports markdown output; got --format {_fmt}", file=sys.stderr)
                sys.exit(64)
        else:
            positional.append(arg)

    if len(positional) < 1:
        print("Usage: markitdown_extract.py [--ocr|--no-ocr] [--timeout=SECS] <mode> <file_path>", file=sys.stderr)
        print("Modes: sync, server", file=sys.stderr)
        sys.exit(1)

    mode = positional[0]
    if mode == "server":
        run_server(timeout=timeout)
        return

    if mode == "sync":
        if len(positional) < 2:
            print("Error: sync mode requires a file path", file=sys.stderr)
            sys.exit(1)
        file_path = positional[1]
    else:
        # Legacy fallback for direct file path
        file_path = positional[0]

    try:
        payload = extract_sync(file_path)
        print(json.dumps(payload), end="")
    except Exception as e:
        print(f"Error extracting with MarkItDown: {e}", file=sys.stderr)
        sys.exit(1)
# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,338 @@
"""MinerU extraction wrapper for benchmark harness.
Supports three modes:
- sync: process single file
- batch: process multiple files
- server: persistent mode reading paths from stdin
Attempts to use MinerU's Python API directly for better performance.
Falls back to CLI subprocess if the Python API is not available.
"""
from __future__ import annotations
import os
# Default to CPU-only mode to avoid GPU discovery errors in CI.
# setdefault() only fills in variables that are not already set, so values
# exported by the caller take precedence over these defaults. This runs
# before the remaining imports below.
os.environ.setdefault("CUDA_VISIBLE_DEVICES", "")
os.environ.setdefault("ONNXRUNTIME_PROVIDERS", "CPUExecutionProvider")
os.environ.setdefault("MINERU_DEVICE_MODE", "cpu")
import json
import multiprocessing as _mp
import platform
import resource
import subprocess
import sys
import tempfile
import time
from pathlib import Path
from typing import Any
# Try importing MinerU's Python API to avoid subprocess overhead.
# The API surface has changed across versions; only the magic_pdf UNIPipe
# entry point is probed here. HAS_PYTHON_API records whether that import
# succeeded; the imported name itself is not used at module level.
try:
    from magic_pdf.pipe.UNIPipe import UNIPipe  # noqa: F401
    HAS_PYTHON_API = True
except ImportError:
    HAS_PYTHON_API = False
def _get_peak_memory_bytes() -> int:
"""Get peak memory usage in bytes using resource module."""
usage = resource.getrusage(resource.RUSAGE_SELF)
if platform.system() == "Linux":
return usage.ru_maxrss * 1024
return usage.ru_maxrss
def _extract_via_cli(file_path: str, ocr_enabled: bool) -> str:
    """Extract markdown by invoking the ``mineru`` command-line tool.

    The tool writes into a temporary directory; the first ``*.md`` file found
    there is returned as text. ``--method txt`` is passed when OCR is
    disabled.

    Raises:
        RuntimeError: when no markdown file was produced. The message carries
            the tool's stderr if it exited with a non-zero status.
    """
    base_cmd = ["mineru", "-p", file_path, "-b", "pipeline", "-d", "cpu"]
    method_flags = [] if ocr_enabled else ["--method", "txt"]

    with tempfile.TemporaryDirectory() as workdir:
        out_dir = Path(workdir) / "output"
        completed = subprocess.run(
            base_cmd + method_flags + ["-o", str(out_dir)],
            capture_output=True,
            text=True,
            check=False,
        )
        # Look for output before judging the exit status — ONNX Runtime may
        # emit warnings to stderr even when extraction succeeds.
        produced = list(out_dir.rglob("*.md"))
        if produced:
            return produced[0].read_text(encoding="utf-8")
        if completed.returncode != 0:
            raise RuntimeError(f"MinerU extraction failed: {completed.stderr}")
        raise RuntimeError("No markdown output found from MinerU")
def _extract_via_api(file_path: str, ocr_enabled: bool) -> str:
    """Extract markdown through MinerU's Python API (UNIPipe).

    NOTE: the MinerU Python API is not yet stable, so this is a best-effort
    call sequence; callers are expected to fall back to the CLI when it
    raises.
    """
    from magic_pdf.pipe.UNIPipe import UNIPipe
    from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter

    document_bytes = Path(file_path).read_bytes()
    parse_method = "ocr" if ocr_enabled else "txt"
    with tempfile.TemporaryDirectory() as workdir:
        disk_writer = DiskReaderWriter(workdir)
        pipeline = UNIPipe(
            document_bytes,
            {"_pdf_type": "", "model_list": []},
            disk_writer,
            method=parse_method,
        )
        pipeline.pipe_classify()
        pipeline.pipe_analyze()
        pipeline.pipe_parse()
        return pipeline.pipe_mk_markdown(str(Path(file_path).stem), workdir)
_MD_STRIP_RE = None
def _strip_markdown(text: str) -> str:
"""Best-effort markdown→plaintext pass. Drops syntax tokens; preserves text."""
import re
global _MD_STRIP_RE
if _MD_STRIP_RE is None:
_MD_STRIP_RE = [
(re.compile(r"^#{1,6}\s+", re.MULTILINE), ""), # ATX headings
(re.compile(r"^\s*[-*+]\s+", re.MULTILINE), ""), # bullet markers
(re.compile(r"^\s*\d+\.\s+", re.MULTILINE), ""), # ordered list markers
(re.compile(r"^>\s?", re.MULTILINE), ""), # blockquotes
(re.compile(r"```[a-zA-Z0-9_-]*\n?"), ""), # code fences
(re.compile(r"`([^`]+)`"), r"\1"), # inline code
(re.compile(r"\*\*([^*]+)\*\*"), r"\1"), # bold
(re.compile(r"\*([^*]+)\*"), r"\1"), # italic
(re.compile(r"!\[([^\]]*)\]\([^)]*\)"), r"\1"), # images
(re.compile(r"\[([^\]]+)\]\([^)]*\)"), r"\1"), # links
(re.compile(r"^\s*\|.*\|\s*$", re.MULTILINE), ""), # table rows (drop)
]
out = text
for pattern, repl in _MD_STRIP_RE:
out = pattern.sub(repl, out)
return out
def extract_sync(file_path: str, ocr_enabled: bool, output_format: str = "markdown") -> dict[str, Any]:
    """Extract a single file using the best available method.

    The Python API is tried first when it was importable; if it is missing or
    raises, the CLI is used. For ``output_format == "plaintext"`` the
    markdown is passed through ``_strip_markdown``. The reported time covers
    extraction and that conversion.
    """
    started = time.perf_counter()

    use_cli = not HAS_PYTHON_API
    if not use_cli:
        try:
            markdown = _extract_via_api(file_path, ocr_enabled)
        except Exception:
            # Fall back to CLI if Python API fails at runtime
            use_cli = True
    if use_cli:
        markdown = _extract_via_cli(file_path, ocr_enabled)

    if output_format == "plaintext":
        content = _strip_markdown(markdown)
    else:
        content = markdown
    elapsed_ms = (time.perf_counter() - started) * 1000.0

    payload = {
        "content": content,
        "metadata": {"framework": "mineru", "output_format": output_format},
        "_extraction_time_ms": elapsed_ms,
    }
    payload["_peak_memory_bytes"] = _get_peak_memory_bytes()
    return payload
def extract_batch(file_paths: list[str], ocr_enabled: bool, output_format: str = "markdown") -> list[dict[str, Any]]:
    """Extract multiple files in sequence.

    A failing file yields an entry with empty content and the error message
    in its metadata instead of aborting the batch. Every entry reports the
    batch's average time per file, the batch total and the process's peak
    memory.
    """
    batch_started = time.perf_counter()
    results = []
    for path in file_paths:
        try:
            entry = extract_sync(path, ocr_enabled, output_format)
            # Per-file timing is replaced by the batch average below.
            entry.pop("_extraction_time_ms", None)
        except Exception as exc:
            entry = {
                "content": "",
                "metadata": {
                    "framework": "mineru",
                    "error": str(exc),
                },
            }
        results.append(entry)

    total_duration_ms = (time.perf_counter() - batch_started) * 1000.0
    if file_paths:
        per_file_duration_ms = total_duration_ms / len(file_paths)
    else:
        per_file_duration_ms = 0
    peak_memory = _get_peak_memory_bytes()

    for entry in results:
        entry.update(
            {
                "_extraction_time_ms": per_file_duration_ms,
                "_batch_total_ms": total_duration_ms,
                "_peak_memory_bytes": peak_memory,
            }
        )
    return results
def _worker(fn, args, conn):
"""Run extraction in a forked child process.
Closes inherited stdin/stdout so the child cannot corrupt the
parent's line-based JSON protocol.
"""
try:
sys.stdin.close()
sys.stdout = open(os.devnull, "w")
except Exception:
pass
try:
result = fn(*args)
conn.send(result)
except Exception as e:
conn.send({"error": str(e), "_extraction_time_ms": 0})
finally:
conn.close()
def _run_with_timeout(fn, args, timeout):
"""Execute fn(*args) in a forked child with a timeout.
On timeout the child is killed but the parent stays alive —
no expensive process restart is needed.
"""
try:
ctx = _mp.get_context("fork")
parent_conn, child_conn = ctx.Pipe(duplex=False)
p = ctx.Process(target=_worker, args=(fn, args, child_conn))
p.start()
child_conn.close()
if parent_conn.poll(timeout=timeout):
try:
result = parent_conn.recv()
except Exception:
result = {"error": "worker process crashed", "_extraction_time_ms": 0}
else:
p.kill()
result = {
"error": f"extraction timed out after {timeout}s",
"_extraction_time_ms": timeout * 1000.0,
}
p.join(timeout=5)
if p.is_alive():
p.kill()
p.join()
parent_conn.close()
return result
except Exception:
# Fork not available — fall back to in-process extraction
try:
return fn(*args)
except Exception as e:
return {"error": str(e), "_extraction_time_ms": 0}
def _parse_path(line: str) -> str:
"""Parse a request line: JSON object with path field, or plain file path."""
stripped = line.strip()
if stripped.startswith("{"):
try:
return json.loads(stripped).get("path", "")
except (json.JSONDecodeError, ValueError):
pass
return stripped
def run_server(ocr_enabled: bool, output_format: str, timeout=None) -> None:
    """Persistent server mode: read paths from stdin, write JSON to stdout.

    Prints ``READY`` once, then answers every non-empty request line with a
    single line of JSON. With a ``timeout`` the extraction is delegated to
    ``_run_with_timeout``; without one it runs in-process and exceptions are
    reported as an error payload.
    """
    print("READY", flush=True)
    for request_line in sys.stdin:
        target = _parse_path(request_line)
        if not target:
            continue
        if timeout is None:
            try:
                response = extract_sync(target, ocr_enabled, output_format)
            except Exception as exc:
                response = {"error": str(exc), "_extraction_time_ms": 0}
        else:
            response = _run_with_timeout(extract_sync, (target, ocr_enabled, output_format), timeout)
        print(json.dumps(response), flush=True)
def main() -> None:
    """Command-line entry point for the MinerU wrapper.

    Flags: ``--ocr``/``--no-ocr``, ``--timeout=SECS`` and
    ``--format=markdown|plaintext`` (any other format exits with status 64).
    The first positional argument is the mode (``sync``, ``batch`` or
    ``server``); the remaining ones are file paths. Batch mode prints a
    single object for one file and a list for several. Extraction errors are
    reported on stderr with exit status 1.
    """
    ocr_enabled = False
    timeout = None
    output_format = "markdown"
    positional = []
    for arg in sys.argv[1:]:
        if arg in ("--ocr", "--no-ocr"):
            ocr_enabled = arg == "--ocr"
        elif arg.startswith("--timeout="):
            timeout = int(arg.split("=", 1)[1])
        elif arg.startswith("--format="):
            output_format = arg.split("=", 1)[1]
        else:
            positional.append(arg)

    if output_format not in ("markdown", "plaintext"):
        print(f"Error: --format must be 'markdown' or 'plaintext'; got '{output_format}'", file=sys.stderr)
        sys.exit(64)
    if len(positional) < 1:
        print(
            "Usage: mineru_extract.py [--ocr|--no-ocr] [--timeout=SECS] [--format=markdown|plaintext] <mode> <file_path> [additional_files...]",
            file=sys.stderr,
        )
        print("Modes: sync, batch, server", file=sys.stderr)
        sys.exit(1)

    mode, file_paths = positional[0], positional[1:]
    try:
        if mode == "server":
            run_server(ocr_enabled, output_format, timeout=timeout)
        elif mode == "sync":
            if len(file_paths) != 1:
                print("Error: sync mode requires exactly one file", file=sys.stderr)
                sys.exit(1)
            payload = extract_sync(file_paths[0], ocr_enabled, output_format)
            print(json.dumps(payload), end="")
        elif mode == "batch":
            if len(file_paths) < 1:
                print("Error: batch mode requires at least one file", file=sys.stderr)
                sys.exit(1)
            results = extract_batch(file_paths, ocr_enabled, output_format)
            # A single-file batch is reported as one object, not a list.
            output = results[0] if len(file_paths) == 1 else results
            print(json.dumps(output), end="")
        else:
            print(f"Error: Unknown mode '{mode}'. Use sync, batch, or server", file=sys.stderr)
            sys.exit(1)
    except Exception as e:
        print(f"Error extracting with MinerU: {e}", file=sys.stderr)
        sys.exit(1)
# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,67 @@
#!/usr/bin/env bash
# Pandoc extraction wrapper for the benchmark harness.
#
# Usage: pandoc_extract.sh [--format=markdown|plaintext] <file_path>
#
# Converts the file with pandoc (GFM for markdown, "plain" for plaintext) and
# prints one JSON object with the content, metadata and elapsed milliseconds.
# A pandoc failure or timeout yields empty content rather than a non-zero exit.
set -euo pipefail

FORMAT="markdown"
FILE_PATH=""
# The last non-flag argument wins as the file path.
for arg in "$@"; do
  case "$arg" in
    --format=*)
      FORMAT="${arg#--format=}"
      ;;
    *)
      FILE_PATH="$arg"
      ;;
  esac
done

if [ -z "$FILE_PATH" ]; then
  echo "Usage: pandoc_extract.sh [--format=markdown|plaintext] <file_path>" >&2
  exit 1
fi
if [ "$FORMAT" != "markdown" ] && [ "$FORMAT" != "plaintext" ]; then
  echo "Error: --format must be 'markdown' or 'plaintext'; got '$FORMAT'" >&2
  exit 64
fi
if [ ! -f "$FILE_PATH" ]; then
  echo "Error: File not found: $FILE_PATH" >&2
  exit 1
fi

if [ "$FORMAT" = "markdown" ]; then
  PANDOC_TO="gfm"
else
  PANDOC_TO="plain"
fi

# Print the current time in milliseconds.
# `date +%s%N` is a GNU extension; BSD/macOS date prints a literal "N", which
# would break the arithmetic below. Prefer bash's EPOCHREALTIME (bash >= 5),
# then GNU date, then whole seconds as a last resort.
now_ms() {
  if [ -n "${EPOCHREALTIME:-}" ]; then
    # "seconds.microseconds" (the separator may be "," in some locales)
    local micros="${EPOCHREALTIME/[.,]/}"
    echo "$((micros / 1000))"
    return
  fi
  local nanos
  nanos=$(date +%s%N 2>/dev/null || true)
  case "$nanos" in
    '' | *[!0-9]*) echo "$(($(date +%s) * 1000))" ;;
    *) echo "$((nanos / 1000000))" ;;
  esac
}

# Run pandoc under a 60s limit when timeout (or gtimeout on macOS) exists.
# Failures are deliberately swallowed: the harness receives empty content.
run_pandoc() {
  local -a cmd=(pandoc "$FILE_PATH" "--to=$PANDOC_TO" --wrap=none --strip-comments)
  if command -v timeout &>/dev/null; then
    cmd=(timeout 60s "${cmd[@]}")
  elif command -v gtimeout &>/dev/null; then
    cmd=(gtimeout 60s "${cmd[@]}")
  fi
  "${cmd[@]}" 2>/dev/null || echo ""
}

# Escape a string for embedding in a JSON string literal (used without jq).
json_escape() {
  local s=$1
  s=${s//\\/\\\\}
  s=${s//\"/\\\"}
  s=${s//$'\t'/\\t}
  s=${s//$'\r'/\\r}
  s=${s//$'\n'/\\n}
  printf '%s' "$s"
}

START=$(now_ms)
CONTENT=$(run_pandoc)
END=$(now_ms)
DURATION_MS=$((END - START))

if command -v jq &>/dev/null; then
  jq -n \
    --arg content "$CONTENT" \
    --arg fmt "$FORMAT" \
    --argjson duration "$DURATION_MS" \
    '{
      content: $content,
      metadata: {framework: "pandoc", output_format: $fmt},
      _extraction_time_ms: $duration
    }'
else
  ESCAPED_CONTENT=$(json_escape "$CONTENT")
  printf '{"content":"%s","metadata":{"framework":"pandoc","output_format":"%s"},"_extraction_time_ms":%s}\n' \
    "$ESCAPED_CONTENT" "$FORMAT" "$DURATION_MS"
fi

View File

@@ -0,0 +1,231 @@
"""pdfminer extraction wrapper for benchmark harness.
Supports three modes:
- sync: extract text from a single file
- batch: process multiple files (simulated batch using loop)
- server: persistent mode reading paths from stdin
"""
from __future__ import annotations
import json
import multiprocessing as _mp
import os
import platform
import resource
import sys
import time
from typing import Any
from pdfminer.high_level import extract_text
def _get_peak_memory_bytes() -> int:
"""Get peak memory usage in bytes using resource module."""
usage = resource.getrusage(resource.RUSAGE_SELF)
if platform.system() == "Linux":
return usage.ru_maxrss * 1024
return usage.ru_maxrss
def extract_sync(file_path: str) -> dict[str, Any]:
    """Extract the text of one file with pdfminer and time the call.

    Returns a dict with the extracted ``content``, a ``metadata`` dict naming
    the framework, the elapsed ``_extraction_time_ms`` and the process's
    ``_peak_memory_bytes``. Exceptions from pdfminer propagate to the caller.
    """
    started = time.perf_counter()
    text = extract_text(file_path)
    elapsed_ms = (time.perf_counter() - started) * 1000.0
    payload = {
        "content": text,
        "metadata": {"framework": "pdfminer"},
        "_extraction_time_ms": elapsed_ms,
    }
    payload["_peak_memory_bytes"] = _get_peak_memory_bytes()
    return payload
def extract_batch(file_paths: list[str]) -> list[dict[str, Any]]:
    """Extract several files one after another (pdfminer has no batch API).

    A file that fails to extract yields an entry with empty ``content`` and the
    error text under ``metadata["error"]`` instead of aborting the batch.
    Every entry carries the same ``_extraction_time_ms`` (batch total divided
    by the number of files), ``_batch_total_ms`` and ``_peak_memory_bytes``.
    """
    batch_started = time.perf_counter()

    def _extract_one(path: str) -> dict[str, Any]:
        # One result entry per file; failures are recorded, not raised.
        try:
            return {"content": extract_text(path), "metadata": {"framework": "pdfminer"}}
        except Exception as exc:
            return {"content": "", "metadata": {"framework": "pdfminer", "error": str(exc)}}

    results = [_extract_one(path) for path in file_paths]
    total_ms = (time.perf_counter() - batch_started) * 1000.0
    average_ms = total_ms / len(file_paths) if file_paths else 0
    peak = _get_peak_memory_bytes()
    for entry in results:
        entry.update(
            {
                "_extraction_time_ms": average_ms,
                "_batch_total_ms": total_ms,
                "_peak_memory_bytes": peak,
            }
        )
    return results
def _worker(fn, args, conn):
"""Run extraction in a forked child process.
Closes inherited stdin/stdout so the child cannot corrupt the
parent's line-based JSON protocol.
"""
try:
sys.stdin.close()
sys.stdout = open(os.devnull, "w")
except Exception:
pass
try:
result = fn(*args)
conn.send(result)
except Exception as e:
conn.send({"error": str(e), "_extraction_time_ms": 0})
finally:
conn.close()
def _run_with_timeout(fn, args, timeout):
"""Execute fn(*args) in a forked child with a timeout.
On timeout the child is killed but the parent stays alive —
no expensive process restart is needed.
"""
try:
ctx = _mp.get_context("fork")
parent_conn, child_conn = ctx.Pipe(duplex=False)
p = ctx.Process(target=_worker, args=(fn, args, child_conn))
p.start()
child_conn.close()
if parent_conn.poll(timeout=timeout):
try:
result = parent_conn.recv()
except Exception:
result = {"error": "worker process crashed", "_extraction_time_ms": 0}
else:
p.kill()
result = {
"error": f"extraction timed out after {timeout}s",
"_extraction_time_ms": timeout * 1000.0,
}
p.join(timeout=5)
if p.is_alive():
p.kill()
p.join()
parent_conn.close()
return result
except Exception:
# Fork not available — fall back to in-process extraction
try:
return fn(*args)
except Exception as e:
return {"error": str(e), "_extraction_time_ms": 0}
def _parse_path(line: str) -> str:
"""Parse a request line: JSON object with path field, or plain file path."""
stripped = line.strip()
if stripped.startswith("{"):
try:
return json.loads(stripped).get("path", "")
except (json.JSONDecodeError, ValueError):
pass
return stripped
def run_server(timeout=None) -> None:
    """Serve extraction requests: one path per stdin line, one JSON reply per line.

    Prints ``READY`` first. Lines that yield no path are skipped. With a
    ``timeout`` each extraction runs through ``_run_with_timeout``; without
    one it runs in-process and an exception becomes an error dict.
    """
    print("READY", flush=True)
    for raw_line in sys.stdin:
        target = _parse_path(raw_line)
        if not target:
            continue
        if timeout is None:
            try:
                reply = extract_sync(target)
            except Exception as exc:
                reply = {"error": str(exc), "_extraction_time_ms": 0}
        else:
            reply = _run_with_timeout(extract_sync, (target,), timeout)
        print(json.dumps(reply), flush=True)
def main() -> None:
    """Command-line entry point for the pdfminer benchmark wrapper.

    Flags are read from ``sys.argv``: ``--ocr``/``--no-ocr`` are accepted and
    ignored, ``--timeout=SECS`` sets the timeout passed to server mode, and
    ``--format=`` must be ``plaintext``. The first remaining argument is the
    mode (``sync``, ``batch`` or ``server``); the rest are file paths.

    Exits with status 64 for an unsupported ``--format`` or a non-integer
    ``--timeout``, and with status 1 for missing arguments, an unknown mode,
    a wrong number of files, or an extraction failure.
    """
    timeout = None
    args = []
    for arg in sys.argv[1:]:
        if arg in ("--ocr", "--no-ocr"):
            pass  # Accepted but ignored - pdfminer doesn't have OCR config
        elif arg.startswith("--timeout="):
            raw_timeout = arg.split("=", 1)[1]
            try:
                timeout = int(raw_timeout)
            except ValueError:
                # Report a bad value as a usage error instead of an uncaught traceback.
                print(
                    f"{sys.argv[0]}: --timeout expects an integer number of seconds; got {raw_timeout!r}",
                    file=sys.stderr,
                )
                sys.exit(64)
        elif arg.startswith("--format="):
            _fmt = arg.split("=", 1)[1]
            if _fmt != "plaintext":
                print(f"{sys.argv[0]} only supports plaintext output; got --format {_fmt}", file=sys.stderr)
                sys.exit(64)
        else:
            args.append(arg)
    if len(args) < 1:
        print(
            "Usage: pdfminer_extract.py [--ocr|--no-ocr] [--timeout=SECS] <mode> <file_path> [additional_files...]",
            file=sys.stderr,
        )
        print("Modes: sync, batch, server", file=sys.stderr)
        sys.exit(1)
    mode = args[0]
    file_paths = args[1:]
    try:
        if mode == "server":
            run_server(timeout=timeout)
        elif mode == "sync":
            if len(file_paths) != 1:
                print("Error: sync mode requires exactly one file", file=sys.stderr)
                sys.exit(1)
            payload = extract_sync(file_paths[0])
            print(json.dumps(payload), end="")
        elif mode == "batch":
            if len(file_paths) < 1:
                print("Error: batch mode requires at least one file", file=sys.stderr)
                sys.exit(1)
            results = extract_batch(file_paths)
            # A single-file batch is emitted as a bare object, not a one-element list.
            print(json.dumps(results[0] if len(file_paths) == 1 else results), end="")
        else:
            print(f"Error: Unknown mode '{mode}'. Use sync, batch, or server", file=sys.stderr)
            sys.exit(1)
    except Exception as e:
        print(f"Error extracting with pdfminer: {e}", file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,245 @@
"""pdfplumber extraction wrapper for benchmark harness.
Supports three modes:
- sync: extract text page-by-page (sequential)
- batch: process multiple files (simulated batch using loop)
- server: persistent mode reading paths from stdin
"""
from __future__ import annotations
import json
import multiprocessing as _mp
import os
import platform
import resource
import sys
import time
from typing import Any
import pdfplumber
def _get_peak_memory_bytes() -> int:
"""Get peak memory usage in bytes using resource module."""
usage = resource.getrusage(resource.RUSAGE_SELF)
if platform.system() == "Linux":
return usage.ru_maxrss * 1024
return usage.ru_maxrss
def extract_sync(file_path: str) -> dict[str, Any]:
    """Extract the text of one file with pdfplumber, page by page, and time it.

    Pages that yield no text are skipped; the rest are joined with a blank
    line. Returns ``content``, ``metadata``, ``_extraction_time_ms`` and
    ``_peak_memory_bytes``. Exceptions from pdfplumber propagate to the caller.
    """
    started = time.perf_counter()
    with pdfplumber.open(file_path) as pdf:
        page_texts = [page.extract_text(layout=False) for page in pdf.pages]
        text = "\n\n".join(chunk for chunk in page_texts if chunk)
    elapsed_ms = (time.perf_counter() - started) * 1000.0
    payload = {
        "content": text,
        "metadata": {"framework": "pdfplumber"},
        "_extraction_time_ms": elapsed_ms,
    }
    payload["_peak_memory_bytes"] = _get_peak_memory_bytes()
    return payload
def extract_batch(file_paths: list[str]) -> list[dict[str, Any]]:
    """Extract several files one after another (pdfplumber has no batch API).

    A file that fails to extract yields an entry with empty ``content`` and the
    error text under ``metadata["error"]`` instead of aborting the batch.
    Every entry carries the same ``_extraction_time_ms`` (batch total divided
    by the number of files), ``_batch_total_ms`` and ``_peak_memory_bytes``.
    """
    batch_started = time.perf_counter()

    def _extract_one(path: str) -> dict[str, Any]:
        # One result entry per file; failures are recorded, not raised.
        try:
            with pdfplumber.open(path) as pdf:
                page_texts = [page.extract_text(layout=False) for page in pdf.pages]
                text = "\n\n".join(chunk for chunk in page_texts if chunk)
            return {"content": text, "metadata": {"framework": "pdfplumber"}}
        except Exception as exc:
            return {"content": "", "metadata": {"framework": "pdfplumber", "error": str(exc)}}

    results = [_extract_one(path) for path in file_paths]
    total_ms = (time.perf_counter() - batch_started) * 1000.0
    average_ms = total_ms / len(file_paths) if file_paths else 0
    peak = _get_peak_memory_bytes()
    for entry in results:
        entry.update(
            {
                "_extraction_time_ms": average_ms,
                "_batch_total_ms": total_ms,
                "_peak_memory_bytes": peak,
            }
        )
    return results
def _worker(fn, args, conn):
"""Run extraction in a forked child process.
Closes inherited stdin/stdout so the child cannot corrupt the
parent's line-based JSON protocol.
"""
try:
sys.stdin.close()
sys.stdout = open(os.devnull, "w")
except Exception:
pass
try:
result = fn(*args)
conn.send(result)
except Exception as e:
conn.send({"error": str(e), "_extraction_time_ms": 0})
finally:
conn.close()
def _run_with_timeout(fn, args, timeout):
"""Execute fn(*args) in a forked child with a timeout.
On timeout the child is killed but the parent stays alive —
no expensive process restart is needed.
"""
try:
ctx = _mp.get_context("fork")
parent_conn, child_conn = ctx.Pipe(duplex=False)
p = ctx.Process(target=_worker, args=(fn, args, child_conn))
p.start()
child_conn.close()
if parent_conn.poll(timeout=timeout):
try:
result = parent_conn.recv()
except Exception:
result = {"error": "worker process crashed", "_extraction_time_ms": 0}
else:
p.kill()
result = {
"error": f"extraction timed out after {timeout}s",
"_extraction_time_ms": timeout * 1000.0,
}
p.join(timeout=5)
if p.is_alive():
p.kill()
p.join()
parent_conn.close()
return result
except Exception:
# Fork not available — fall back to in-process extraction
try:
return fn(*args)
except Exception as e:
return {"error": str(e), "_extraction_time_ms": 0}
def _parse_path(line: str) -> str:
"""Parse a request line: JSON object with path field, or plain file path."""
stripped = line.strip()
if stripped.startswith("{"):
try:
return json.loads(stripped).get("path", "")
except (json.JSONDecodeError, ValueError):
pass
return stripped
def run_server(timeout=None) -> None:
    """Serve extraction requests: one path per stdin line, one JSON reply per line.

    Prints ``READY`` first. Lines that yield no path are skipped. With a
    ``timeout`` each extraction runs through ``_run_with_timeout``; without
    one it runs in-process and an exception becomes an error dict.
    """
    print("READY", flush=True)
    for raw_line in sys.stdin:
        target = _parse_path(raw_line)
        if not target:
            continue
        if timeout is None:
            try:
                reply = extract_sync(target)
            except Exception as exc:
                reply = {"error": str(exc), "_extraction_time_ms": 0}
        else:
            reply = _run_with_timeout(extract_sync, (target,), timeout)
        print(json.dumps(reply), flush=True)
def main() -> None:
    """Command-line entry point for the pdfplumber benchmark wrapper.

    Flags are read from ``sys.argv``: ``--ocr``/``--no-ocr`` are accepted and
    ignored, ``--timeout=SECS`` sets the timeout passed to server mode, and
    ``--format=`` must be ``plaintext``. The first remaining argument is the
    mode (``sync``, ``batch`` or ``server``); the rest are file paths.

    Exits with status 64 for an unsupported ``--format`` or a non-integer
    ``--timeout``, and with status 1 for missing arguments, an unknown mode,
    a wrong number of files, or an extraction failure.
    """
    timeout = None
    args = []
    for arg in sys.argv[1:]:
        if arg in ("--ocr", "--no-ocr"):
            pass  # Accepted but ignored - pdfplumber doesn't have OCR config
        elif arg.startswith("--timeout="):
            raw_timeout = arg.split("=", 1)[1]
            try:
                timeout = int(raw_timeout)
            except ValueError:
                # Report a bad value as a usage error instead of an uncaught traceback.
                print(
                    f"{sys.argv[0]}: --timeout expects an integer number of seconds; got {raw_timeout!r}",
                    file=sys.stderr,
                )
                sys.exit(64)
        elif arg.startswith("--format="):
            _fmt = arg.split("=", 1)[1]
            if _fmt != "plaintext":
                print(f"{sys.argv[0]} only supports plaintext output; got --format {_fmt}", file=sys.stderr)
                sys.exit(64)
        else:
            args.append(arg)
    if len(args) < 1:
        print(
            "Usage: pdfplumber_extract.py [--ocr|--no-ocr] [--timeout=SECS] <mode> <file_path> [additional_files...]",
            file=sys.stderr,
        )
        print("Modes: sync, batch, server", file=sys.stderr)
        sys.exit(1)
    mode = args[0]
    file_paths = args[1:]
    try:
        if mode == "server":
            run_server(timeout=timeout)
        elif mode == "sync":
            if len(file_paths) != 1:
                print("Error: sync mode requires exactly one file", file=sys.stderr)
                sys.exit(1)
            payload = extract_sync(file_paths[0])
            print(json.dumps(payload), end="")
        elif mode == "batch":
            if len(file_paths) < 1:
                print("Error: batch mode requires at least one file", file=sys.stderr)
                sys.exit(1)
            results = extract_batch(file_paths)
            # A single-file batch is emitted as a bare object, not a one-element list.
            print(json.dumps(results[0] if len(file_paths) == 1 else results), end="")
        else:
            print(f"Error: Unknown mode '{mode}'. Use sync, batch, or server", file=sys.stderr)
            sys.exit(1)
    except Exception as e:
        print(f"Error extracting with pdfplumber: {e}", file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,237 @@
"""pdftotext extraction wrapper for benchmark harness.
Supports three modes:
- sync: extract text from entire PDF (sequential)
- batch: process multiple files (simulated batch using loop)
- server: persistent mode reading paths from stdin
"""
from __future__ import annotations
import json
import multiprocessing as _mp
import os
import platform
import resource
import sys
import time
from typing import Any
import pdftotext
def _get_peak_memory_bytes() -> int:
"""Get peak memory usage in bytes using resource module."""
usage = resource.getrusage(resource.RUSAGE_SELF)
if platform.system() == "Linux":
return usage.ru_maxrss * 1024
return usage.ru_maxrss
def extract_sync(file_path: str) -> dict[str, Any]:
    """Extract the text of one file with pdftotext and time the call.

    The file is opened in binary mode, handed to ``pdftotext.PDF`` and the
    resulting pages are joined with a blank line. Returns ``content``,
    ``metadata``, ``_extraction_time_ms`` and ``_peak_memory_bytes``.
    Exceptions from opening or parsing propagate to the caller.
    """
    started = time.perf_counter()
    with open(file_path, "rb") as handle:
        pages = pdftotext.PDF(handle)
        text = "\n\n".join(pages)
    elapsed_ms = (time.perf_counter() - started) * 1000.0
    payload = {
        "content": text,
        "metadata": {"framework": "pdftotext"},
        "_extraction_time_ms": elapsed_ms,
    }
    payload["_peak_memory_bytes"] = _get_peak_memory_bytes()
    return payload
def extract_batch(file_paths: list[str]) -> list[dict[str, Any]]:
    """Extract several files one after another (pdftotext has no batch API).

    A file that fails to extract yields an entry with empty ``content`` and the
    error text under ``metadata["error"]`` instead of aborting the batch.
    Every entry carries the same ``_extraction_time_ms`` (batch total divided
    by the number of files), ``_batch_total_ms`` and ``_peak_memory_bytes``.
    """
    batch_started = time.perf_counter()

    def _extract_one(path: str) -> dict[str, Any]:
        # One result entry per file; failures are recorded, not raised.
        try:
            with open(path, "rb") as handle:
                pages = pdftotext.PDF(handle)
                text = "\n\n".join(pages)
            return {"content": text, "metadata": {"framework": "pdftotext"}}
        except Exception as exc:
            return {"content": "", "metadata": {"framework": "pdftotext", "error": str(exc)}}

    results = [_extract_one(path) for path in file_paths]
    total_ms = (time.perf_counter() - batch_started) * 1000.0
    average_ms = total_ms / len(file_paths) if file_paths else 0
    peak = _get_peak_memory_bytes()
    for entry in results:
        entry.update(
            {
                "_extraction_time_ms": average_ms,
                "_batch_total_ms": total_ms,
                "_peak_memory_bytes": peak,
            }
        )
    return results
def _worker(fn, args, conn):
"""Run extraction in a forked child process.
Closes inherited stdin/stdout so the child cannot corrupt the
parent's line-based JSON protocol.
"""
try:
sys.stdin.close()
sys.stdout = open(os.devnull, "w")
except Exception:
pass
try:
result = fn(*args)
conn.send(result)
except Exception as e:
conn.send({"error": str(e), "_extraction_time_ms": 0})
finally:
conn.close()
def _run_with_timeout(fn, args, timeout):
"""Execute fn(*args) in a forked child with a timeout.
On timeout the child is killed but the parent stays alive —
no expensive process restart is needed.
"""
try:
ctx = _mp.get_context("fork")
parent_conn, child_conn = ctx.Pipe(duplex=False)
p = ctx.Process(target=_worker, args=(fn, args, child_conn))
p.start()
child_conn.close()
if parent_conn.poll(timeout=timeout):
try:
result = parent_conn.recv()
except Exception:
result = {"error": "worker process crashed", "_extraction_time_ms": 0}
else:
p.kill()
result = {
"error": f"extraction timed out after {timeout}s",
"_extraction_time_ms": timeout * 1000.0,
}
p.join(timeout=5)
if p.is_alive():
p.kill()
p.join()
parent_conn.close()
return result
except Exception:
# Fork not available — fall back to in-process extraction
try:
return fn(*args)
except Exception as e:
return {"error": str(e), "_extraction_time_ms": 0}
def _parse_path(line: str) -> str:
"""Parse a request line: JSON object with path field, or plain file path."""
stripped = line.strip()
if stripped.startswith("{"):
try:
return json.loads(stripped).get("path", "")
except (json.JSONDecodeError, ValueError):
pass
return stripped
def run_server(timeout=None) -> None:
    """Serve extraction requests: one path per stdin line, one JSON reply per line.

    Prints ``READY`` first. Lines that yield no path are skipped. With a
    ``timeout`` each extraction runs through ``_run_with_timeout``; without
    one it runs in-process and an exception becomes an error dict.
    """
    print("READY", flush=True)
    for raw_line in sys.stdin:
        target = _parse_path(raw_line)
        if not target:
            continue
        if timeout is None:
            try:
                reply = extract_sync(target)
            except Exception as exc:
                reply = {"error": str(exc), "_extraction_time_ms": 0}
        else:
            reply = _run_with_timeout(extract_sync, (target,), timeout)
        print(json.dumps(reply), flush=True)
def main() -> None:
    """Command-line entry point for the pdftotext benchmark wrapper.

    Flags are read from ``sys.argv``: ``--ocr``/``--no-ocr`` are accepted and
    ignored, ``--timeout=SECS`` sets the timeout passed to server mode, and
    ``--format=`` must be ``plaintext``. The first remaining argument is the
    mode (``sync``, ``batch`` or ``server``); the rest are file paths.

    Exits with status 64 for an unsupported ``--format`` or a non-integer
    ``--timeout``, and with status 1 for missing arguments, an unknown mode,
    a wrong number of files, or an extraction failure.
    """
    timeout = None
    args = []
    for arg in sys.argv[1:]:
        if arg in ("--ocr", "--no-ocr"):
            pass  # Accepted but ignored - pdftotext doesn't have OCR config
        elif arg.startswith("--timeout="):
            raw_timeout = arg.split("=", 1)[1]
            try:
                timeout = int(raw_timeout)
            except ValueError:
                # Report a bad value as a usage error instead of an uncaught traceback.
                print(
                    f"{sys.argv[0]}: --timeout expects an integer number of seconds; got {raw_timeout!r}",
                    file=sys.stderr,
                )
                sys.exit(64)
        elif arg.startswith("--format="):
            _fmt = arg.split("=", 1)[1]
            if _fmt != "plaintext":
                print(f"{sys.argv[0]} only supports plaintext output; got --format {_fmt}", file=sys.stderr)
                sys.exit(64)
        else:
            args.append(arg)
    if len(args) < 1:
        print(
            "Usage: pdftotext_extract.py [--ocr|--no-ocr] [--timeout=SECS] <mode> <file_path> [additional_files...]",
            file=sys.stderr,
        )
        print("Modes: sync, batch, server", file=sys.stderr)
        sys.exit(1)
    mode = args[0]
    file_paths = args[1:]
    try:
        if mode == "server":
            run_server(timeout=timeout)
        elif mode == "sync":
            if len(file_paths) != 1:
                print("Error: sync mode requires exactly one file", file=sys.stderr)
                sys.exit(1)
            payload = extract_sync(file_paths[0])
            print(json.dumps(payload), end="")
        elif mode == "batch":
            if len(file_paths) < 1:
                print("Error: batch mode requires at least one file", file=sys.stderr)
                sys.exit(1)
            results = extract_batch(file_paths)
            # A single-file batch is emitted as a bare object, not a one-element list.
            print(json.dumps(results[0] if len(file_paths) == 1 else results), end="")
        else:
            print(f"Error: Unknown mode '{mode}'. Use sync, batch, or server", file=sys.stderr)
            sys.exit(1)
    except Exception as e:
        print(f"Error extracting with pdftotext: {e}", file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,245 @@
"""playa-pdf extraction wrapper for benchmark harness.
Supports three modes:
- sync: extract text page-by-page (sequential)
- batch: process multiple files (simulated batch using loop)
- server: persistent mode reading paths from stdin
"""
from __future__ import annotations
import json
import multiprocessing as _mp
import os
import platform
import resource
import sys
import time
from typing import Any
import playa
def _get_peak_memory_bytes() -> int:
"""Get peak memory usage in bytes using resource module."""
usage = resource.getrusage(resource.RUSAGE_SELF)
if platform.system() == "Linux":
return usage.ru_maxrss * 1024
return usage.ru_maxrss
def extract_sync(file_path: str) -> dict[str, Any]:
    """Extract the text of one file with playa-pdf, page by page, and time it.

    Pages that yield no text are skipped; the rest are joined with a blank
    line. Returns ``content``, ``metadata``, ``_extraction_time_ms`` and
    ``_peak_memory_bytes``. Exceptions from playa propagate to the caller.
    """
    started = time.perf_counter()
    with playa.open(file_path) as doc:
        page_texts = [page.extract_text() for page in doc.pages]
        text = "\n\n".join(chunk for chunk in page_texts if chunk)
    elapsed_ms = (time.perf_counter() - started) * 1000.0
    payload = {
        "content": text,
        "metadata": {"framework": "playa-pdf"},
        "_extraction_time_ms": elapsed_ms,
    }
    payload["_peak_memory_bytes"] = _get_peak_memory_bytes()
    return payload
def extract_batch(file_paths: list[str]) -> list[dict[str, Any]]:
    """Extract several files one after another (playa-pdf has no batch API).

    A file that fails to extract yields an entry with empty ``content`` and the
    error text under ``metadata["error"]`` instead of aborting the batch.
    Every entry carries the same ``_extraction_time_ms`` (batch total divided
    by the number of files), ``_batch_total_ms`` and ``_peak_memory_bytes``.
    """
    batch_started = time.perf_counter()

    def _extract_one(path: str) -> dict[str, Any]:
        # One result entry per file; failures are recorded, not raised.
        try:
            with playa.open(path) as doc:
                page_texts = [page.extract_text() for page in doc.pages]
                text = "\n\n".join(chunk for chunk in page_texts if chunk)
            return {"content": text, "metadata": {"framework": "playa-pdf"}}
        except Exception as exc:
            return {"content": "", "metadata": {"framework": "playa-pdf", "error": str(exc)}}

    results = [_extract_one(path) for path in file_paths]
    total_ms = (time.perf_counter() - batch_started) * 1000.0
    average_ms = total_ms / len(file_paths) if file_paths else 0
    peak = _get_peak_memory_bytes()
    for entry in results:
        entry.update(
            {
                "_extraction_time_ms": average_ms,
                "_batch_total_ms": total_ms,
                "_peak_memory_bytes": peak,
            }
        )
    return results
def _worker(fn, args, conn):
"""Run extraction in a forked child process.
Closes inherited stdin/stdout so the child cannot corrupt the
parent's line-based JSON protocol.
"""
try:
sys.stdin.close()
sys.stdout = open(os.devnull, "w")
except Exception:
pass
try:
result = fn(*args)
conn.send(result)
except Exception as e:
conn.send({"error": str(e), "_extraction_time_ms": 0})
finally:
conn.close()
def _run_with_timeout(fn, args, timeout):
"""Execute fn(*args) in a forked child with a timeout.
On timeout the child is killed but the parent stays alive —
no expensive process restart is needed.
"""
try:
ctx = _mp.get_context("fork")
parent_conn, child_conn = ctx.Pipe(duplex=False)
p = ctx.Process(target=_worker, args=(fn, args, child_conn))
p.start()
child_conn.close()
if parent_conn.poll(timeout=timeout):
try:
result = parent_conn.recv()
except Exception:
result = {"error": "worker process crashed", "_extraction_time_ms": 0}
else:
p.kill()
result = {
"error": f"extraction timed out after {timeout}s",
"_extraction_time_ms": timeout * 1000.0,
}
p.join(timeout=5)
if p.is_alive():
p.kill()
p.join()
parent_conn.close()
return result
except Exception:
# Fork not available — fall back to in-process extraction
try:
return fn(*args)
except Exception as e:
return {"error": str(e), "_extraction_time_ms": 0}
def _parse_path(line: str) -> str:
"""Parse a request line: JSON object with path field, or plain file path."""
stripped = line.strip()
if stripped.startswith("{"):
try:
return json.loads(stripped).get("path", "")
except (json.JSONDecodeError, ValueError):
pass
return stripped
def run_server(timeout=None) -> None:
    """Serve extraction requests: one path per stdin line, one JSON reply per line.

    Prints ``READY`` first. Lines that yield no path are skipped. With a
    ``timeout`` each extraction runs through ``_run_with_timeout``; without
    one it runs in-process and an exception becomes an error dict.
    """
    print("READY", flush=True)
    for raw_line in sys.stdin:
        target = _parse_path(raw_line)
        if not target:
            continue
        if timeout is None:
            try:
                reply = extract_sync(target)
            except Exception as exc:
                reply = {"error": str(exc), "_extraction_time_ms": 0}
        else:
            reply = _run_with_timeout(extract_sync, (target,), timeout)
        print(json.dumps(reply), flush=True)
def main() -> None:
    """Command-line entry point for the playa-pdf benchmark wrapper.

    Flags are read from ``sys.argv``: ``--ocr``/``--no-ocr`` are accepted and
    ignored, ``--timeout=SECS`` sets the timeout passed to server mode, and
    ``--format=`` must be ``plaintext``. The first remaining argument is the
    mode (``sync``, ``batch`` or ``server``); the rest are file paths.

    Exits with status 64 for an unsupported ``--format`` or a non-integer
    ``--timeout``, and with status 1 for missing arguments, an unknown mode,
    a wrong number of files, or an extraction failure.
    """
    timeout = None
    args = []
    for arg in sys.argv[1:]:
        if arg in ("--ocr", "--no-ocr"):
            pass  # Accepted but ignored - playa-pdf doesn't have OCR capability
        elif arg.startswith("--timeout="):
            raw_timeout = arg.split("=", 1)[1]
            try:
                timeout = int(raw_timeout)
            except ValueError:
                # Report a bad value as a usage error instead of an uncaught traceback.
                print(
                    f"{sys.argv[0]}: --timeout expects an integer number of seconds; got {raw_timeout!r}",
                    file=sys.stderr,
                )
                sys.exit(64)
        elif arg.startswith("--format="):
            _fmt = arg.split("=", 1)[1]
            if _fmt != "plaintext":
                print(f"{sys.argv[0]} only supports plaintext output; got --format {_fmt}", file=sys.stderr)
                sys.exit(64)
        else:
            args.append(arg)
    if len(args) < 1:
        print(
            "Usage: playa_pdf_extract.py [--ocr|--no-ocr] [--timeout=SECS] <mode> <file_path> [additional_files...]",
            file=sys.stderr,
        )
        print("Modes: sync, batch, server", file=sys.stderr)
        sys.exit(1)
    mode = args[0]
    file_paths = args[1:]
    try:
        if mode == "server":
            run_server(timeout=timeout)
        elif mode == "sync":
            if len(file_paths) != 1:
                print("Error: sync mode requires exactly one file", file=sys.stderr)
                sys.exit(1)
            payload = extract_sync(file_paths[0])
            print(json.dumps(payload), end="")
        elif mode == "batch":
            if len(file_paths) < 1:
                print("Error: batch mode requires at least one file", file=sys.stderr)
                sys.exit(1)
            results = extract_batch(file_paths)
            # A single-file batch is emitted as a bare object, not a one-element list.
            print(json.dumps(results[0] if len(file_paths) == 1 else results), end="")
        else:
            print(f"Error: Unknown mode '{mode}'. Use sync, batch, or server", file=sys.stderr)
            sys.exit(1)
    except Exception as e:
        print(f"Error extracting with playa-pdf: {e}", file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,184 @@
"""PyMuPDF4LLM extraction wrapper for benchmark harness."""
from __future__ import annotations
import json
import multiprocessing as _mp
import os
import platform
import resource
import sys
import time
# Suppress MuPDF C-level error/warning messages that can corrupt the
# persistent server's line-based JSON protocol on stdout.
# See: https://github.com/pymupdf/PyMuPDF/issues/606
import pymupdf
# Import pymupdf.layout BEFORE pymupdf4llm to enable improved layout analysis
# and suppress the "Consider using the pymupdf_layout package" info message.
import pymupdf.layout
import pymupdf4llm
pymupdf.TOOLS.mupdf_display_errors(False)
def _get_peak_memory_bytes() -> int:
"""Get peak memory usage in bytes using resource module."""
usage = resource.getrusage(resource.RUSAGE_SELF)
if platform.system() == "Linux":
return usage.ru_maxrss * 1024
return usage.ru_maxrss
def extract_sync(file_path: str) -> dict:
    """Convert one file to markdown with PyMuPDF4LLM and time the call.

    ``to_markdown`` is called with progress output and image writing turned
    off. Returns ``content``, ``metadata``, ``_extraction_time_ms`` and
    ``_peak_memory_bytes``. Exceptions from the library propagate.
    """
    started = time.perf_counter()
    text = pymupdf4llm.to_markdown(file_path, show_progress=False, write_images=False)
    elapsed_ms = (time.perf_counter() - started) * 1000.0
    payload = {
        "content": text,
        "metadata": {"framework": "pymupdf4llm"},
        "_extraction_time_ms": elapsed_ms,
    }
    payload["_peak_memory_bytes"] = _get_peak_memory_bytes()
    return payload
def _worker(fn, args, conn):
"""Run extraction in a forked child process.
Closes inherited stdin/stdout so the child cannot corrupt the
parent's line-based JSON protocol.
"""
try:
sys.stdin.close()
sys.stdout = open(os.devnull, "w")
except Exception:
pass
try:
result = fn(*args)
conn.send(result)
except Exception as e:
conn.send({"error": str(e), "_extraction_time_ms": 0})
finally:
conn.close()
def _run_with_timeout(fn, args, timeout):
"""Execute fn(*args) in a forked child with a timeout.
On timeout the child is killed but the parent stays alive —
no expensive process restart is needed.
"""
try:
ctx = _mp.get_context("fork")
parent_conn, child_conn = ctx.Pipe(duplex=False)
p = ctx.Process(target=_worker, args=(fn, args, child_conn))
p.start()
child_conn.close()
if parent_conn.poll(timeout=timeout):
try:
result = parent_conn.recv()
except Exception:
result = {"error": "worker process crashed", "_extraction_time_ms": 0}
else:
p.kill()
result = {
"error": f"extraction timed out after {timeout}s",
"_extraction_time_ms": timeout * 1000.0,
}
p.join(timeout=5)
if p.is_alive():
p.kill()
p.join()
parent_conn.close()
return result
except Exception:
# Fork not available — fall back to in-process extraction
try:
return fn(*args)
except Exception as e:
return {"error": str(e), "_extraction_time_ms": 0}
def _parse_path(line: str) -> str:
"""Parse a request line: JSON object with path field, or plain file path."""
stripped = line.strip()
if stripped.startswith("{"):
try:
return json.loads(stripped).get("path", "")
except (json.JSONDecodeError, ValueError):
pass
return stripped
def run_server(timeout=None) -> None:
    """Serve extraction requests: one path per stdin line, one JSON reply per line.

    Prints ``READY`` first. Lines that yield no path are skipped. With a
    ``timeout`` each extraction runs through ``_run_with_timeout``; without
    one it runs in-process and an exception becomes an error dict.
    """
    print("READY", flush=True)
    for raw_line in sys.stdin:
        target = _parse_path(raw_line)
        if not target:
            continue
        if timeout is None:
            try:
                reply = extract_sync(target)
            except Exception as exc:
                reply = {"error": str(exc), "_extraction_time_ms": 0}
        else:
            reply = _run_with_timeout(extract_sync, (target,), timeout)
        print(json.dumps(reply), flush=True)
def main() -> None:
    """Command-line entry point for the PyMuPDF4LLM benchmark wrapper.

    Flags are read from ``sys.argv``: ``--ocr``/``--no-ocr`` are accepted but
    have no effect, ``--timeout=SECS`` sets the timeout passed to server mode,
    and ``--format=`` must be ``markdown``. The first remaining argument is
    the mode (``server`` or ``sync``); any other value is treated as a file
    path and extracted directly (legacy invocation).

    Exits with status 64 for an unsupported ``--format`` or a non-integer
    ``--timeout``, and with status 1 for missing arguments or an extraction
    failure.
    """
    timeout = None
    args = []
    for arg in sys.argv[1:]:
        if arg in ("--ocr", "--no-ocr"):
            pass  # Accepted for interface compatibility; the value was never used.
        elif arg.startswith("--timeout="):
            raw_timeout = arg.split("=", 1)[1]
            try:
                timeout = int(raw_timeout)
            except ValueError:
                # Report a bad value as a usage error instead of an uncaught traceback.
                print(
                    f"{sys.argv[0]}: --timeout expects an integer number of seconds; got {raw_timeout!r}",
                    file=sys.stderr,
                )
                sys.exit(64)
        elif arg.startswith("--format="):
            _fmt = arg.split("=", 1)[1]
            if _fmt != "markdown":
                print(f"{sys.argv[0]} only supports markdown output; got --format {_fmt}", file=sys.stderr)
                sys.exit(64)
        else:
            args.append(arg)
    if len(args) < 1:
        print("Usage: pymupdf4llm_extract.py [--ocr|--no-ocr] [--timeout=SECS] <mode> <file_path>", file=sys.stderr)
        print("Modes: sync, server", file=sys.stderr)
        sys.exit(1)
    mode = args[0]
    if mode == "server":
        run_server(timeout=timeout)
    elif mode == "sync":
        if len(args) < 2:
            print("Error: sync mode requires a file path", file=sys.stderr)
            sys.exit(1)
        file_path = args[1]
        try:
            payload = extract_sync(file_path)
            print(json.dumps(payload), end="")
        except Exception as e:
            print(f"Error extracting with PyMuPDF4LLM: {e}", file=sys.stderr)
            sys.exit(1)
    else:
        # Legacy fallback for direct file path
        try:
            payload = extract_sync(args[0])
            print(json.dumps(payload), end="")
        except Exception as e:
            print(f"Error extracting with PyMuPDF4LLM: {e}", file=sys.stderr)
            sys.exit(1)


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,245 @@
"""pypdf extraction wrapper for benchmark harness.
Supports three modes:
- sync: extract text page-by-page (sequential)
- batch: process multiple files (simulated batch using loop)
- server: persistent mode reading paths from stdin
"""
from __future__ import annotations
import json
import multiprocessing as _mp
import os
import platform
import resource
import sys
import time
from typing import Any
from pypdf import PdfReader
def _get_peak_memory_bytes() -> int:
"""Get peak memory usage in bytes using resource module."""
usage = resource.getrusage(resource.RUSAGE_SELF)
if platform.system() == "Linux":
return usage.ru_maxrss * 1024
return usage.ru_maxrss
def extract_sync(file_path: str) -> dict[str, Any]:
    """Extract the text of one file with pypdf, page by page, and time it.

    Pages that yield no text are skipped; the rest are joined with a blank
    line. Returns ``content``, ``metadata``, ``_extraction_time_ms`` and
    ``_peak_memory_bytes``. Exceptions from pypdf propagate to the caller.
    """
    started = time.perf_counter()
    reader = PdfReader(file_path)
    page_texts = [page.extract_text() for page in reader.pages]
    text = "\n\n".join(chunk for chunk in page_texts if chunk)
    elapsed_ms = (time.perf_counter() - started) * 1000.0
    payload = {
        "content": text,
        "metadata": {"framework": "pypdf"},
        "_extraction_time_ms": elapsed_ms,
    }
    payload["_peak_memory_bytes"] = _get_peak_memory_bytes()
    return payload
def extract_batch(file_paths: list[str]) -> list[dict[str, Any]]:
    """Extract several files one after another (pypdf has no batch API).

    A file that fails to extract yields an entry with empty ``content`` and the
    error text under ``metadata["error"]`` instead of aborting the batch.
    Every entry carries the same ``_extraction_time_ms`` (batch total divided
    by the number of files), ``_batch_total_ms`` and ``_peak_memory_bytes``.
    """
    batch_started = time.perf_counter()

    def _extract_one(path: str) -> dict[str, Any]:
        # One result entry per file; failures are recorded, not raised.
        try:
            reader = PdfReader(path)
            page_texts = [page.extract_text() for page in reader.pages]
            text = "\n\n".join(chunk for chunk in page_texts if chunk)
            return {"content": text, "metadata": {"framework": "pypdf"}}
        except Exception as exc:
            return {"content": "", "metadata": {"framework": "pypdf", "error": str(exc)}}

    results = [_extract_one(path) for path in file_paths]
    total_ms = (time.perf_counter() - batch_started) * 1000.0
    average_ms = total_ms / len(file_paths) if file_paths else 0
    peak = _get_peak_memory_bytes()
    for entry in results:
        entry.update(
            {
                "_extraction_time_ms": average_ms,
                "_batch_total_ms": total_ms,
                "_peak_memory_bytes": peak,
            }
        )
    return results
def _worker(fn, args, conn):
"""Run extraction in a forked child process.
Closes inherited stdin/stdout so the child cannot corrupt the
parent's line-based JSON protocol.
"""
try:
sys.stdin.close()
sys.stdout = open(os.devnull, "w")
except Exception:
pass
try:
result = fn(*args)
conn.send(result)
except Exception as e:
conn.send({"error": str(e), "_extraction_time_ms": 0})
finally:
conn.close()
def _run_with_timeout(fn, args, timeout):
"""Execute fn(*args) in a forked child with a timeout.
On timeout the child is killed but the parent stays alive —
no expensive process restart is needed.
"""
try:
ctx = _mp.get_context("fork")
parent_conn, child_conn = ctx.Pipe(duplex=False)
p = ctx.Process(target=_worker, args=(fn, args, child_conn))
p.start()
child_conn.close()
if parent_conn.poll(timeout=timeout):
try:
result = parent_conn.recv()
except Exception:
result = {"error": "worker process crashed", "_extraction_time_ms": 0}
else:
p.kill()
result = {
"error": f"extraction timed out after {timeout}s",
"_extraction_time_ms": timeout * 1000.0,
}
p.join(timeout=5)
if p.is_alive():
p.kill()
p.join()
parent_conn.close()
return result
except Exception:
# Fork not available — fall back to in-process extraction
try:
return fn(*args)
except Exception as e:
return {"error": str(e), "_extraction_time_ms": 0}
def _parse_path(line: str) -> str:
"""Parse a request line: JSON object with path field, or plain file path."""
stripped = line.strip()
if stripped.startswith("{"):
try:
return json.loads(stripped).get("path", "")
except (json.JSONDecodeError, ValueError):
pass
return stripped
def run_server(timeout=None) -> None:
    """Serve extraction requests: one path per stdin line, one JSON reply per line.

    Prints ``READY`` first. Lines that yield no path are skipped. With a
    ``timeout`` each extraction runs through ``_run_with_timeout``; without
    one it runs in-process and an exception becomes an error dict.
    """
    print("READY", flush=True)
    for raw_line in sys.stdin:
        target = _parse_path(raw_line)
        if not target:
            continue
        if timeout is None:
            try:
                reply = extract_sync(target)
            except Exception as exc:
                reply = {"error": str(exc), "_extraction_time_ms": 0}
        else:
            reply = _run_with_timeout(extract_sync, (target,), timeout)
        print(json.dumps(reply), flush=True)
def main() -> None:
    """Command-line entry point for the pypdf extraction wrapper.

    Recognised options: ``--ocr`` / ``--no-ocr`` (accepted and ignored),
    ``--timeout=SECS`` and ``--format=plaintext``. Any other ``--format``
    value exits with status 64. The first positional argument selects the
    mode (``sync``, ``batch`` or ``server``); the rest are file paths.
    Usage errors and extraction failures exit with status 1.
    """
    timeout = None
    positional = []
    for token in sys.argv[1:]:
        if token == "--ocr" or token == "--no-ocr":
            # Accepted but ignored - pypdf doesn't have OCR config
            continue
        if token.startswith("--timeout="):
            timeout = int(token.split("=", 1)[1])
        elif token.startswith("--format="):
            _fmt = token.split("=", 1)[1]
            if _fmt != "plaintext":
                print(f"{sys.argv[0]} only supports plaintext output; got --format {_fmt}", file=sys.stderr)
                sys.exit(64)
        else:
            positional.append(token)

    if not positional:
        print(
            "Usage: pypdf_extract.py [--ocr|--no-ocr] [--timeout=SECS] <mode> <file_path> [additional_files...]",
            file=sys.stderr,
        )
        print("Modes: sync, batch, server", file=sys.stderr)
        sys.exit(1)

    mode, file_paths = positional[0], positional[1:]
    try:
        if mode == "server":
            run_server(timeout=timeout)
        elif mode == "sync":
            if len(file_paths) != 1:
                print("Error: sync mode requires exactly one file", file=sys.stderr)
                sys.exit(1)
            print(json.dumps(extract_sync(file_paths[0])), end="")
        elif mode == "batch":
            if not file_paths:
                print("Error: batch mode requires at least one file", file=sys.stderr)
                sys.exit(1)
            results = extract_batch(file_paths)
            # A single input is emitted as a bare object, several as a list.
            output = results[0] if len(file_paths) == 1 else results
            print(json.dumps(output), end="")
        else:
            print(f"Error: Unknown mode '{mode}'. Use sync, batch, or server", file=sys.stderr)
            sys.exit(1)
    except Exception as e:
        print(f"Error extracting with pypdf: {e}", file=sys.stderr)
        sys.exit(1)
# Run the command-line interface only when this file is executed as a script.
if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,166 @@
#!/usr/bin/env python3
"""Sanitize pandoc-generated markdown ground truth files.
Removes common pandoc artifacts that don't represent actual document structure.
Usage:
# Single file (in-place):
python sanitize_pandoc_gt.py input.md
# Pipe mode:
pandoc -f docbook -t gfm --wrap=none input.xml | python sanitize_pandoc_gt.py > output.md
# Dry run (show diff without modifying):
python sanitize_pandoc_gt.py --dry-run input.md
# Batch all GT files (dry run):
python sanitize_pandoc_gt.py --dry-run --batch test_documents/ground_truth/
# Batch all GT files (apply):
python sanitize_pandoc_gt.py --batch test_documents/ground_truth/
"""
import argparse
import difflib
import os
import re
import sys
def sanitize(text: str) -> str:
    """Remove pandoc-specific artifacts from markdown text.

    Outside fenced code blocks this:
      * drops pandoc div wrapper lines (``:::`` optionally followed by
        ``{...}``),
      * strips a trailing ``{.class}`` / ``{#id}`` attribute block from
        ATX headings,
      * replaces ``<!-- end list -->`` with a single blank line (unless the
        previous kept line is already blank) and drops ``<!-- -->`` lines.

    On the opening line of a fenced code block, ``{.lang ...}`` is rewritten
    to a bare ``lang`` info string and any other ``{...}`` attribute block is
    removed. Lines inside a fenced block are copied through unchanged.

    A fenced block is closed only by a line made solely of the same fence
    character, at least as long as the opening fence. A different fence
    type, or a shorter fence, inside a block is therefore treated as code
    content. A backtick run followed by text that contains another backtick
    (inline code such as ```` ```x``` ````) does not open a block.

    Args:
        text: Markdown text to clean.

    Returns:
        The cleaned text with trailing blank lines removed and exactly one
        trailing newline, or ``""`` when nothing remains.
    """
    result: list[str] = []
    # Fence run that opened the current code block; "" while outside one.
    open_fence = ""

    for line in text.split("\n"):
        stripped = line.strip()

        if open_fence:
            # Only a matching fence (same character, at least as long, no
            # trailing text) ends the block; everything else is content.
            if len(stripped) >= len(open_fence) and stripped == open_fence[0] * len(stripped):
                open_fence = ""
            result.append(line)
            continue

        opener = re.match(r"^(`{3,}|~{3,})(.*)$", stripped)
        if opener and not (opener.group(1)[0] == "`" and "`" in opener.group(2)):
            open_fence = opener.group(1)
            # Convert ``` {.python} to ```python
            m = re.match(r"^(`{3,}|~{3,})\s*\{\s*\.(\w+)(?:\s+[^}]*)?\}\s*$", line)
            if m:
                line = f"{m.group(1)}{m.group(2)}"
            else:
                # Remove {.class} from code fences without extracting language
                line = re.sub(r"^(`{3,}|~{3,})\s*\{[^}]*\}\s*$", r"\1", line)
            result.append(line)
            continue

        # === Pandoc div wrappers ===
        if re.match(r"^:::\s*(\{.*\})?\s*$", stripped):
            continue

        # === Remove {.class} and {#id} attributes from headings ===
        if re.match(r"^#{1,6}\s", line):
            line = re.sub(r"\s*\{[.#][^}]*\}\s*$", "", line)

        # === Replace <!-- end list --> pandoc markers with blank line ===
        # Don't just remove; keep the structural separation it provides.
        if stripped == "<!-- end list -->":
            if not (result and result[-1].strip() == ""):
                result.append("")
            continue

        # === Remove empty pandoc HTML comments ===
        # Other comments (e.g. <!-- image -->) are kept.
        if stripped == "<!-- -->":
            continue

        # Blank lines are structural in markdown, so they are never collapsed.
        result.append(line)

    # Trim trailing blank lines, ensure single trailing newline
    while result and result[-1].strip() == "":
        result.pop()
    return "\n".join(result) + "\n" if result else ""
def process_file(path: str, dry_run: bool = False) -> tuple[bool, str]:
    """Sanitize one markdown file and report what changed.

    The file is read and written as UTF-8 explicitly, so the result does not
    depend on the platform's default text encoding.

    Args:
        path: Path of the file to clean.
        dry_run: When true, compute the diff but leave the file untouched.

    Returns:
        ``(changed, diff_text)``. ``changed`` is False and ``diff_text`` is
        ``""`` when sanitizing leaves the content identical; otherwise
        ``diff_text`` is a unified diff with 3 lines of context.
    """
    with open(path, encoding="utf-8") as f:
        original = f.read()

    cleaned = sanitize(original)
    if original == cleaned:
        return False, ""

    diff = "".join(
        difflib.unified_diff(
            original.splitlines(keepends=True),
            cleaned.splitlines(keepends=True),
            fromfile=f"a/{path}",
            tofile=f"b/{path}",
            n=3,
        )
    )

    if not dry_run:
        with open(path, "w", encoding="utf-8") as f:
            f.write(cleaned)
    return True, diff
def main():
    """Command-line entry point for the pandoc ground-truth sanitizer.

    With no path argument, sanitizes stdin to stdout when stdin is not a
    terminal, and prints help otherwise. With ``--batch`` or a directory
    path, walks the tree and processes every ``.md`` file, then prints a
    summary count. Otherwise processes the single given file. ``--dry-run``
    prints diffs instead of rewriting files.
    """
    cli = argparse.ArgumentParser(description="Sanitize pandoc GT markdown files")
    cli.add_argument("path", nargs="?", help="File or directory to process")
    cli.add_argument("--dry-run", action="store_true", help="Show diff without modifying files")
    cli.add_argument("--batch", action="store_true", help="Process all .md files in directory recursively")
    opts = cli.parse_args()

    if opts.path is None:
        if sys.stdin.isatty():
            # Nothing piped in and nothing named: show usage.
            cli.print_help()
        else:
            # Pipe mode (no path, stdin)
            sys.stdout.write(sanitize(sys.stdin.read()))
        return

    if not (opts.batch or os.path.isdir(opts.path)):
        # Single file mode
        changed, diff = process_file(opts.path, dry_run=opts.dry_run)
        if not changed:
            message = f"no changes: {opts.path}"
        elif opts.dry_run:
            message = diff
        else:
            message = f"cleaned: {opts.path}"
        print(message)
        return

    # Batch mode
    changed_count = 0
    total_count = 0
    for root, _dirs, files in os.walk(opts.path):
        for fname in sorted(name for name in files if name.endswith(".md")):
            fpath = os.path.join(root, fname)
            total_count += 1
            changed, diff = process_file(fpath, dry_run=opts.dry_run)
            if not changed:
                continue
            changed_count += 1
            print(diff if opts.dry_run else f"  cleaned: {fpath}")
    action = "would change" if opts.dry_run else "cleaned"
    print(f"\n{action} {changed_count}/{total_count} files")
# Run the sanitizer CLI only when this file is executed as a script.
if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,230 @@
"""Unstructured extraction wrapper for benchmark harness."""
from __future__ import annotations
import json
import multiprocessing as _mp
import os
import platform
import resource
import sys
import time
from unstructured.partition.auto import partition
def _get_peak_memory_bytes() -> int:
"""Get peak memory usage in bytes using resource module."""
usage = resource.getrusage(resource.RUSAGE_SELF)
if platform.system() == "Linux":
return usage.ru_maxrss * 1024
return usage.ru_maxrss
def _render_markdown(elements: list) -> str:
"""Render Unstructured Elements as GFM-ish markdown."""
import re
parts: list[str] = []
for el in elements:
cls = type(el).__name__
text = (el.text or "").strip() if hasattr(el, "text") else str(el).strip()
if not text and cls not in ("Image", "Figure"):
continue
if cls == "Title":
parts.append(f"# {text}")
elif cls == "Header":
parts.append(f"## {text}")
elif cls == "ListItem":
parts.append(f"- {text}")
elif cls in ("CodeSnippet", "Code"):
parts.append(f"```\n{text}\n```")
elif cls in ("Image", "Figure"):
parts.append(f"![{text or cls}]()")
elif cls == "Table":
html = ""
md = getattr(el, "metadata", None)
if md is not None:
html = getattr(md, "text_as_html", "") or ""
if html:
rows = re.findall(r"<tr[^>]*>(.*?)</tr>", html, flags=re.DOTALL | re.IGNORECASE)
rendered: list[str] = []
for i, row_html in enumerate(rows):
cells = re.findall(r"<t[dh][^>]*>(.*?)</t[dh]>", row_html, flags=re.DOTALL | re.IGNORECASE)
cells = [re.sub(r"<[^>]+>", "", c).strip() for c in cells]
if cells:
rendered.append("| " + " | ".join(cells) + " |")
if i == 0:
rendered.append("| " + " | ".join("---" for _ in cells) + " |")
if rendered:
parts.append("\n".join(rendered))
else:
parts.append(text)
else:
parts.append(text)
else:
parts.append(text)
return "\n\n".join(parts)
def extract_sync(file_path: str, ocr_enabled: bool, output_format: str = "markdown") -> dict:
    """Partition one file with Unstructured and package the result.

    Uses the ``hi_res`` strategy when ``ocr_enabled`` is true and ``fast``
    otherwise. Only the ``partition`` call is timed. For ``"markdown"``
    output the elements go through ``_render_markdown``; for any other
    format their string forms are joined with blank lines.

    Returns:
        A dict with ``content``, ``metadata``, ``_extraction_time_ms`` and
        ``_peak_memory_bytes``.
    """
    strategy = "fast"
    if ocr_enabled:
        strategy = "hi_res"

    started = time.perf_counter()
    elements = partition(filename=file_path, strategy=strategy, languages=["eng"])
    duration_ms = (time.perf_counter() - started) * 1000.0

    content = (
        _render_markdown(elements)
        if output_format == "markdown"
        else "\n\n".join(str(el) for el in elements)
    )
    metadata = {"framework": "unstructured", "strategy": strategy, "output_format": output_format}
    return {
        "content": content,
        "metadata": metadata,
        "_extraction_time_ms": duration_ms,
        "_peak_memory_bytes": _get_peak_memory_bytes(),
    }
def _worker(fn, args, conn):
"""Run extraction in a forked child process.
Closes inherited stdin/stdout so the child cannot corrupt the
parent's line-based JSON protocol.
"""
try:
sys.stdin.close()
sys.stdout = open(os.devnull, "w")
except Exception:
pass
try:
result = fn(*args)
conn.send(result)
except Exception as e:
conn.send({"error": str(e), "_extraction_time_ms": 0})
finally:
conn.close()
def _run_with_timeout(fn, args, timeout):
"""Execute fn(*args) in a forked child with a timeout.
On timeout the child is killed but the parent stays alive —
no expensive process restart is needed.
"""
try:
ctx = _mp.get_context("fork")
parent_conn, child_conn = ctx.Pipe(duplex=False)
p = ctx.Process(target=_worker, args=(fn, args, child_conn))
p.start()
child_conn.close()
if parent_conn.poll(timeout=timeout):
try:
result = parent_conn.recv()
except Exception:
result = {"error": "worker process crashed", "_extraction_time_ms": 0}
else:
p.kill()
result = {
"error": f"extraction timed out after {timeout}s",
"_extraction_time_ms": timeout * 1000.0,
}
p.join(timeout=5)
if p.is_alive():
p.kill()
p.join()
parent_conn.close()
return result
except Exception:
# Fork not available — fall back to in-process extraction
try:
return fn(*args)
except Exception as e:
return {"error": str(e), "_extraction_time_ms": 0}
def _parse_path(line: str) -> str:
"""Parse a request line: JSON object with path field, or plain file path."""
stripped = line.strip()
if stripped.startswith("{"):
try:
return json.loads(stripped).get("path", "")
except (json.JSONDecodeError, ValueError):
pass
return stripped
def run_server(ocr_enabled: bool, output_format: str, timeout=None) -> None:
    """Serve extraction requests: one path per stdin line, one JSON result per stdout line.

    Prints ``READY`` once before reading. Lines that yield an empty path
    are skipped without a response. With a ``timeout``, each extraction is
    delegated to ``_run_with_timeout``; without one it runs in-process and
    any exception is reported as an error payload.
    """
    print("READY", flush=True)
    for raw_line in sys.stdin:
        target = _parse_path(raw_line)
        if target:
            if timeout is None:
                try:
                    outcome = extract_sync(target, ocr_enabled, output_format)
                except Exception as e:
                    outcome = {"error": str(e), "_extraction_time_ms": 0}
            else:
                outcome = _run_with_timeout(extract_sync, (target, ocr_enabled, output_format), timeout)
            print(json.dumps(outcome), flush=True)
def main() -> None:
    """Command-line entry point for the Unstructured extraction wrapper.

    Recognised options: ``--ocr`` / ``--no-ocr``, ``--timeout=SECS`` and
    ``--format=markdown|plaintext`` (any other format exits with status 64).
    The first positional argument is the mode: ``server``, ``sync`` (needs a
    file path as the second positional), or, for backwards compatibility,
    a file path used directly. Usage errors and extraction failures exit
    with status 1.
    """
    ocr_enabled = False
    timeout = None
    output_format = "markdown"
    positional = []
    for token in sys.argv[1:]:
        if token in ("--ocr", "--no-ocr"):
            ocr_enabled = token == "--ocr"
        elif token.startswith("--timeout="):
            timeout = int(token.split("=", 1)[1])
        elif token.startswith("--format="):
            output_format = token.split("=", 1)[1]
        else:
            positional.append(token)

    if output_format not in ("markdown", "plaintext"):
        print(f"Error: --format must be 'markdown' or 'plaintext'; got '{output_format}'", file=sys.stderr)
        sys.exit(64)

    if not positional:
        print(
            "Usage: unstructured_extract.py [--ocr|--no-ocr] [--timeout=SECS] [--format=markdown|plaintext] <mode> <file_path>",
            file=sys.stderr,
        )
        print("Modes: sync, server", file=sys.stderr)
        sys.exit(1)

    mode = positional[0]
    if mode == "server":
        run_server(ocr_enabled, output_format, timeout=timeout)
        return

    if mode == "sync":
        if len(positional) < 2:
            print("Error: sync mode requires a file path", file=sys.stderr)
            sys.exit(1)
        target = positional[1]
    else:
        # Legacy mode: first arg is the file path directly
        target = positional[0]

    try:
        payload = extract_sync(target, ocr_enabled, output_format)
        print(json.dumps(payload), end="")
    except Exception as e:
        print(f"Error extracting with Unstructured: {e}", file=sys.stderr)
        sys.exit(1)
# Run the extraction CLI only when this file is executed as a script.
if __name__ == "__main__":
    main()