This commit is contained in:
394
tools/benchmark-harness/scripts/TikaExtract.java
Normal file
394
tools/benchmark-harness/scripts/TikaExtract.java
Normal file
@@ -0,0 +1,394 @@
|
||||
import org.apache.tika.parser.AutoDetectParser;
|
||||
import org.apache.tika.parser.ParseContext;
|
||||
import org.apache.tika.parser.ocr.TesseractOCRConfig;
|
||||
import org.apache.tika.sax.BodyContentHandler;
|
||||
import org.apache.tika.metadata.Metadata;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.InputStream;
|
||||
import java.io.InputStreamReader;
|
||||
import java.nio.file.Path;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
public final class TikaExtract {
|
||||
private static final double NANOS_IN_MILLISECOND = 1_000_000.0;
|
||||
/** Length of the JSON key {@code "path"} including surrounding quotes. */
|
||||
private static final int PATH_KEY_LENGTH = 6;
|
||||
private static final char LAST_CONTROL_CHAR = 0x1F;
|
||||
|
||||
private TikaExtract() {
|
||||
}
|
||||
|
||||
public static void main(String[] args) {
|
||||
boolean ocrEnabled = false;
|
||||
List<String> positionalArgs = new ArrayList<>();
|
||||
|
||||
for (String arg : args) {
|
||||
if ("--ocr".equals(arg)) {
|
||||
ocrEnabled = true;
|
||||
} else if ("--no-ocr".equals(arg)) {
|
||||
ocrEnabled = false;
|
||||
} else {
|
||||
positionalArgs.add(arg);
|
||||
}
|
||||
}
|
||||
|
||||
if (positionalArgs.isEmpty()) {
|
||||
System.err.println("Usage: TikaExtract [--ocr|--no-ocr] <mode> <file1> [file2] ...");
|
||||
System.err.println("Modes: sync, batch, server");
|
||||
System.exit(1);
|
||||
}
|
||||
|
||||
String mode = positionalArgs.get(0);
|
||||
if (!"sync".equals(mode) && !"batch".equals(mode) && !"server".equals(mode)) {
|
||||
System.err.printf("Unsupported mode '%s'%n", mode);
|
||||
System.exit(1);
|
||||
}
|
||||
|
||||
// Enable debug logging if TIKA_BENCHMARK_DEBUG is set
|
||||
boolean debug = "true".equalsIgnoreCase(System.getenv("TIKA_BENCHMARK_DEBUG"));
|
||||
|
||||
if (debug) {
|
||||
debugLog("java.version", System.getProperty("java.version"));
|
||||
debugLog("os.name", System.getProperty("os.name"));
|
||||
debugLog("os.arch", System.getProperty("os.arch"));
|
||||
debugLog("Mode", mode);
|
||||
debugLog("OCR enabled", String.valueOf(ocrEnabled));
|
||||
debugLog("Files to process", String.valueOf(positionalArgs.size() - 1));
|
||||
}
|
||||
|
||||
try {
|
||||
if ("sync".equals(mode)) {
|
||||
if (positionalArgs.size() < 2) {
|
||||
System.err.println("Sync mode requires exactly one file");
|
||||
System.exit(1);
|
||||
}
|
||||
processSyncMode(positionalArgs.get(1), ocrEnabled, debug);
|
||||
} else if ("batch".equals(mode)) {
|
||||
processBatchMode(positionalArgs, ocrEnabled, debug);
|
||||
} else {
|
||||
processServerMode(ocrEnabled, debug);
|
||||
}
|
||||
} catch (Exception e) {
|
||||
if (debug) {
|
||||
debugLog("Processing failed with exception", e.getClass().getName());
|
||||
e.printStackTrace(System.err);
|
||||
} else {
|
||||
e.printStackTrace(System.err);
|
||||
}
|
||||
System.exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
private static void processSyncMode(String filePath, boolean ocrEnabled, boolean debug) throws Exception {
|
||||
if (debug) {
|
||||
debugLog("Input file", filePath);
|
||||
}
|
||||
|
||||
Path path = Path.of(filePath);
|
||||
ExtractionData data;
|
||||
long start = System.nanoTime();
|
||||
|
||||
try {
|
||||
if (debug) {
|
||||
debugLog("Starting extraction", "");
|
||||
}
|
||||
data = extractFile(path.toFile(), ocrEnabled, debug);
|
||||
if (debug) {
|
||||
debugLog("Extraction completed", "");
|
||||
}
|
||||
} catch (Exception e) {
|
||||
if (debug) {
|
||||
debugLog("Extraction failed", e.getClass().getName());
|
||||
e.printStackTrace(System.err);
|
||||
}
|
||||
throw e;
|
||||
}
|
||||
|
||||
double elapsedMs = (System.nanoTime() - start) / NANOS_IN_MILLISECOND;
|
||||
String json = toJson(data, elapsedMs, ocrEnabled);
|
||||
System.out.print(json);
|
||||
}
|
||||
|
||||
private static void processBatchMode(
|
||||
List<String> positionalArgs, boolean ocrEnabled, boolean debug) throws Exception {
|
||||
List<String> filePaths = new ArrayList<>();
|
||||
for (int i = 1; i < positionalArgs.size(); i++) {
|
||||
filePaths.add(positionalArgs.get(i));
|
||||
}
|
||||
|
||||
long batchStart = System.nanoTime();
|
||||
StringBuilder jsonArray = new StringBuilder();
|
||||
jsonArray.append('[');
|
||||
|
||||
boolean first = true;
|
||||
for (String filePath : filePaths) {
|
||||
if (debug) {
|
||||
debugLog("Processing file", filePath);
|
||||
}
|
||||
|
||||
try {
|
||||
Path path = Path.of(filePath);
|
||||
long start = System.nanoTime();
|
||||
ExtractionData data = extractFile(path.toFile(), ocrEnabled, debug);
|
||||
double elapsedMs = (System.nanoTime() - start) / NANOS_IN_MILLISECOND;
|
||||
|
||||
if (!first) {
|
||||
jsonArray.append(',');
|
||||
}
|
||||
first = false;
|
||||
|
||||
double batchTotalMs = (System.nanoTime() - batchStart) / NANOS_IN_MILLISECOND;
|
||||
jsonArray.append(toJsonWithBatch(data, elapsedMs, batchTotalMs, ocrEnabled));
|
||||
|
||||
if (debug) {
|
||||
debugLog("File processed", filePath);
|
||||
}
|
||||
} catch (Exception e) {
|
||||
if (debug) {
|
||||
debugLog("Failed to process file", filePath);
|
||||
debugLog("Exception", e.getClass().getName());
|
||||
e.printStackTrace(System.err);
|
||||
} else {
|
||||
System.err.printf("Error processing %s: %s%n", filePath, e.getMessage());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
double totalBatchMs = (System.nanoTime() - batchStart) / NANOS_IN_MILLISECOND;
|
||||
jsonArray.append(']');
|
||||
|
||||
if (first) {
|
||||
System.err.println("No files were successfully processed");
|
||||
System.exit(1);
|
||||
return;
|
||||
}
|
||||
|
||||
System.out.print(jsonArray.toString());
|
||||
}
|
||||
|
||||
private static void processServerMode(boolean ocrEnabled, boolean debug) throws Exception {
|
||||
// Pre-create shared parser and OCR config to avoid per-file construction overhead.
|
||||
// AutoDetectParser is thread-safe and reusable. Only BodyContentHandler and Metadata
|
||||
// need to be recreated per extraction since they accumulate state.
|
||||
AutoDetectParser sharedParser = new AutoDetectParser();
|
||||
TesseractOCRConfig sharedOcrConfig = new TesseractOCRConfig();
|
||||
if (!ocrEnabled) {
|
||||
sharedOcrConfig.setSkipOcr(true);
|
||||
} else {
|
||||
sharedOcrConfig.setLanguage("eng");
|
||||
}
|
||||
|
||||
// Signal readiness after JVM + Tika parser initialization
|
||||
System.out.println("READY");
|
||||
System.out.flush();
|
||||
|
||||
BufferedReader reader = new BufferedReader(new InputStreamReader(System.in));
|
||||
String line;
|
||||
while ((line = reader.readLine()) != null) {
|
||||
String filePath = line.trim();
|
||||
if (filePath.isEmpty()) {
|
||||
continue;
|
||||
}
|
||||
// Parse JSON request if the harness sends {"path":"...", "force_ocr": ...}
|
||||
if (filePath.startsWith("{")) {
|
||||
filePath = parseJsonPath(filePath);
|
||||
}
|
||||
try {
|
||||
Path path = Path.of(filePath);
|
||||
long start = System.nanoTime();
|
||||
ExtractionData data = extractFileWithParser(path.toFile(), sharedParser, sharedOcrConfig, debug);
|
||||
double elapsedMs = (System.nanoTime() - start) / NANOS_IN_MILLISECOND;
|
||||
String json = toJson(data, elapsedMs, ocrEnabled);
|
||||
System.out.println(json);
|
||||
System.out.flush();
|
||||
} catch (Exception e) {
|
||||
String errorJson = String.format(
|
||||
"{\"error\":%s,\"_extraction_time_ms\":0,\"_ocr_used\":false}",
|
||||
quote(e.getMessage()));
|
||||
System.out.println(errorJson);
|
||||
System.out.flush();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private static ExtractionData extractFileWithParser(
|
||||
File file, AutoDetectParser parser, TesseractOCRConfig ocrConfig, boolean debug) throws Exception {
|
||||
if (!file.exists()) {
|
||||
throw new IllegalArgumentException("File does not exist: " + file.getAbsolutePath());
|
||||
}
|
||||
|
||||
BodyContentHandler handler = new BodyContentHandler(-1);
|
||||
Metadata metadata = new Metadata();
|
||||
ParseContext context = new ParseContext();
|
||||
context.set(TesseractOCRConfig.class, ocrConfig);
|
||||
|
||||
try (InputStream stream = new FileInputStream(file)) {
|
||||
parser.parse(stream, handler, metadata, context);
|
||||
}
|
||||
|
||||
String content = handler.toString();
|
||||
String mimeType = metadata.get(Metadata.CONTENT_TYPE);
|
||||
|
||||
if (mimeType == null) {
|
||||
mimeType = "application/octet-stream";
|
||||
}
|
||||
|
||||
return new ExtractionData(content, mimeType);
|
||||
}
|
||||
|
||||
private static ExtractionData extractFile(File file, boolean ocrEnabled, boolean debug) throws Exception {
|
||||
if (!file.exists()) {
|
||||
throw new IllegalArgumentException("File does not exist: " + file.getAbsolutePath());
|
||||
}
|
||||
|
||||
AutoDetectParser parser = new AutoDetectParser();
|
||||
BodyContentHandler handler = new BodyContentHandler(-1);
|
||||
Metadata metadata = new Metadata();
|
||||
ParseContext context = new ParseContext();
|
||||
|
||||
if (!ocrEnabled) {
|
||||
TesseractOCRConfig ocrConfig = new TesseractOCRConfig();
|
||||
ocrConfig.setSkipOcr(true);
|
||||
context.set(TesseractOCRConfig.class, ocrConfig);
|
||||
} else {
|
||||
TesseractOCRConfig ocrConfig = new TesseractOCRConfig();
|
||||
ocrConfig.setLanguage("eng");
|
||||
context.set(TesseractOCRConfig.class, ocrConfig);
|
||||
}
|
||||
|
||||
try (InputStream stream = new FileInputStream(file)) {
|
||||
parser.parse(stream, handler, metadata, context);
|
||||
}
|
||||
|
||||
String content = handler.toString();
|
||||
String mimeType = metadata.get(Metadata.CONTENT_TYPE);
|
||||
|
||||
if (mimeType == null) {
|
||||
mimeType = "application/octet-stream";
|
||||
}
|
||||
|
||||
return new ExtractionData(content, mimeType);
|
||||
}
|
||||
|
||||
/**
|
||||
* Determine if OCR was actually used based on MIME type and OCR config.
|
||||
* OCR is used by Tika when enabled and the file is an image type.
|
||||
*/
|
||||
private static boolean determineOcrUsed(String mimeType, boolean ocrEnabled) {
|
||||
if (!ocrEnabled) {
|
||||
return false;
|
||||
}
|
||||
return mimeType != null && mimeType.startsWith("image/");
|
||||
}
|
||||
|
||||
private static String toJson(ExtractionData data, double elapsedMs, boolean ocrEnabled) {
|
||||
StringBuilder builder = new StringBuilder();
|
||||
builder.append('{');
|
||||
builder.append("\"content\":").append(quote(data.getContent())).append(',');
|
||||
builder.append("\"metadata\":{");
|
||||
builder.append("\"mimeType\":").append(quote(data.getMimeType()));
|
||||
builder.append("},\"_extraction_time_ms\":").append(String.format("%.3f", elapsedMs));
|
||||
builder.append(",\"_ocr_used\":").append(determineOcrUsed(data.getMimeType(), ocrEnabled));
|
||||
builder.append('}');
|
||||
return builder.toString();
|
||||
}
|
||||
|
||||
private static String toJsonWithBatch(
|
||||
ExtractionData data, double elapsedMs, double batchTotalMs, boolean ocrEnabled) {
|
||||
StringBuilder builder = new StringBuilder();
|
||||
builder.append('{');
|
||||
builder.append("\"content\":").append(quote(data.getContent())).append(',');
|
||||
builder.append("\"metadata\":{");
|
||||
builder.append("\"mimeType\":").append(quote(data.getMimeType()));
|
||||
builder.append("},\"_extraction_time_ms\":").append(String.format("%.3f", elapsedMs));
|
||||
builder.append(",\"_batch_total_ms\":").append(String.format("%.3f", batchTotalMs));
|
||||
builder.append(",\"_ocr_used\":").append(determineOcrUsed(data.getMimeType(), ocrEnabled));
|
||||
builder.append('}');
|
||||
return builder.toString();
|
||||
}
|
||||
|
||||
/**
|
||||
* Parse a JSON request line to extract the "path" field.
|
||||
* Minimal JSON parsing to avoid adding a dependency.
|
||||
*/
|
||||
private static String parseJsonPath(String json) {
|
||||
int idx = json.indexOf("\"path\"");
|
||||
if (idx < 0) {
|
||||
return json;
|
||||
}
|
||||
// Skip past "path" key, colon, optional whitespace, and opening quote
|
||||
idx = json.indexOf(':', idx + PATH_KEY_LENGTH);
|
||||
if (idx < 0) {
|
||||
return json;
|
||||
}
|
||||
idx = json.indexOf('"', idx + 1);
|
||||
if (idx < 0) {
|
||||
return json;
|
||||
}
|
||||
int start = idx + 1;
|
||||
int end = json.indexOf('"', start);
|
||||
if (end < 0) {
|
||||
return json;
|
||||
}
|
||||
return json.substring(start, end);
|
||||
}
|
||||
|
||||
// CPD-OFF: quote() is intentionally duplicated in standalone benchmark scripts (no shared classpath)
|
||||
private static String quote(String value) {
|
||||
if (value == null) {
|
||||
return "null";
|
||||
}
|
||||
StringBuilder sb = new StringBuilder(value.length() + 2);
|
||||
sb.append('"');
|
||||
for (int i = 0; i < value.length(); i++) {
|
||||
char c = value.charAt(i);
|
||||
switch (c) {
|
||||
case '\\': sb.append("\\\\"); break;
|
||||
case '"': sb.append("\\\""); break;
|
||||
case '\n': sb.append("\\n"); break;
|
||||
case '\r': sb.append("\\r"); break;
|
||||
case '\t': sb.append("\\t"); break;
|
||||
case '\b': sb.append("\\b"); break;
|
||||
case '\f': sb.append("\\f"); break;
|
||||
default:
|
||||
if (c <= LAST_CONTROL_CHAR) {
|
||||
sb.append(String.format("\\u%04x", (int) c));
|
||||
} else {
|
||||
sb.append(c);
|
||||
}
|
||||
}
|
||||
}
|
||||
sb.append('"');
|
||||
return sb.toString();
|
||||
}
|
||||
// CPD-ON
|
||||
|
||||
private static void debugLog(String key, String value) {
|
||||
if (value == null) {
|
||||
value = "(null)";
|
||||
}
|
||||
System.err.printf("[BENCHMARK_DEBUG] %-30s = %s%n", key, value);
|
||||
}
|
||||
|
||||
private static class ExtractionData {
|
||||
private final String content;
|
||||
private final String mimeType;
|
||||
|
||||
ExtractionData(String content, String mimeType) {
|
||||
this.content = content;
|
||||
this.mimeType = mimeType;
|
||||
}
|
||||
|
||||
String getContent() {
|
||||
return content;
|
||||
}
|
||||
|
||||
String getMimeType() {
|
||||
return mimeType;
|
||||
}
|
||||
}
|
||||
}
|
||||
277
tools/benchmark-harness/scripts/docling_extract.py
Executable file
277
tools/benchmark-harness/scripts/docling_extract.py
Executable file
@@ -0,0 +1,277 @@
|
||||
"""Docling extraction wrapper for benchmark harness.
|
||||
|
||||
Supports three modes:
|
||||
- sync: convert() - synchronous single-file extraction
|
||||
- batch: convert_all() - batch extraction for multiple files
|
||||
- server: persistent mode reading paths from stdin
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import multiprocessing as _mp
|
||||
import os
|
||||
import platform
|
||||
import resource
|
||||
import sys
|
||||
import time
|
||||
from typing import Any
|
||||
|
||||
from docling.document_converter import DocumentConverter
|
||||
|
||||
|
||||
def _get_peak_memory_bytes() -> int:
|
||||
"""Get peak memory usage in bytes using resource module."""
|
||||
usage = resource.getrusage(resource.RUSAGE_SELF)
|
||||
if platform.system() == "Linux":
|
||||
return usage.ru_maxrss * 1024
|
||||
return usage.ru_maxrss
|
||||
|
||||
|
||||
def create_converter(ocr_enabled: bool) -> DocumentConverter:
    """Create a DocumentConverter, disabling OCR when ``ocr_enabled`` is false.

    Two ways of disabling OCR are attempted, newest first. If neither is accepted
    by the installed Docling, a default converter is returned and a warning is
    written to stderr, because the benchmark would otherwise silently run with
    OCR although ``--no-ocr`` was requested.
    """
    if ocr_enabled:
        return DocumentConverter()

    # NOTE(review): this follows the Docling 2.x documented pattern of passing
    # PdfPipelineOptions through format_options — confirm against the pinned version.
    # Only the PDF input format is configured here.
    try:
        from docling.datamodel.base_models import InputFormat
        from docling.datamodel.pipeline_options import PdfPipelineOptions
        from docling.document_converter import PdfFormatOption

        pdf_options = PdfPipelineOptions()
        pdf_options.do_ocr = False
        return DocumentConverter(
            format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=pdf_options)}
        )
    except (ImportError, TypeError, AttributeError):
        pass

    # Original code path, kept for Docling versions that accept it.
    try:
        from docling.datamodel.pipeline_options import PipelineOptions

        options = PipelineOptions(do_ocr=False)
        return DocumentConverter(pipeline_options=options)
    except (ImportError, TypeError):
        print(
            "Warning: could not disable OCR with the installed Docling API; using the default converter",
            file=sys.stderr,
        )
        return DocumentConverter()
|
||||
|
||||
|
||||
def _render(document: Any, output_format: str) -> str:
|
||||
if output_format == "plaintext":
|
||||
return document.export_to_text()
|
||||
return document.export_to_markdown()
|
||||
|
||||
|
||||
def extract_sync(file_path: str, converter: DocumentConverter, output_format: str = "markdown") -> dict[str, Any]:
    """Convert a single file and return its result record.

    The measured time covers both ``converter.convert()`` and rendering the
    document to ``output_format``. Peak memory is sampled after timing stops.
    """
    began = time.perf_counter()
    conversion = converter.convert(file_path)
    rendered = _render(conversion.document, output_format)
    elapsed_ms = (time.perf_counter() - began) * 1000.0

    record: dict[str, Any] = {
        "content": rendered,
        "metadata": {"framework": "docling", "output_format": output_format},
        "_extraction_time_ms": elapsed_ms,
    }
    record["_peak_memory_bytes"] = _get_peak_memory_bytes()
    return record
|
||||
|
||||
|
||||
def extract_batch(
    file_paths: list[str], converter: DocumentConverter, output_format: str = "markdown"
) -> list[dict[str, Any]]:
    """Convert several files with the batch API and return one record per result.

    Every record carries the batch wall-clock time in ``_batch_total_ms`` and that
    time divided by ``len(file_paths)`` in ``_extraction_time_ms``. Results whose
    status is not ``SUCCESS`` get empty content and the error details in metadata.
    """
    start = time.perf_counter()
    # Materialize the results inside the timed region. If convert_all() hands back a
    # lazy iterator, stopping the clock right after the call would measure almost
    # nothing because no conversion has run yet.
    # NOTE(review): Docling documents convert_all() as returning an iterator — confirm.
    results = list(converter.convert_all(file_paths, raises_on_error=False))
    total_duration_ms = (time.perf_counter() - start) * 1000.0

    per_file_duration_ms = total_duration_ms / len(file_paths) if file_paths else 0

    outputs = []
    for result in results:
        if result.status.name == "SUCCESS":
            content = _render(result.document, output_format)
            metadata = {"framework": "docling", "output_format": output_format}
        else:
            content = ""
            metadata = {
                "framework": "docling",
                "error": str(result.errors) if result.errors else "Unknown error",
                "status": result.status.name,
            }
        outputs.append(
            {
                "content": content,
                "metadata": metadata,
                "_extraction_time_ms": per_file_duration_ms,
                "_batch_total_ms": total_duration_ms,
                "_peak_memory_bytes": _get_peak_memory_bytes(),
            }
        )

    return outputs
|
||||
|
||||
|
||||
def _worker(fn, args, conn):
|
||||
"""Run extraction in a forked child process.
|
||||
|
||||
Closes inherited stdin/stdout so the child cannot corrupt the
|
||||
parent's line-based JSON protocol.
|
||||
"""
|
||||
try:
|
||||
sys.stdin.close()
|
||||
sys.stdout = open(os.devnull, "w")
|
||||
except Exception:
|
||||
pass
|
||||
try:
|
||||
result = fn(*args)
|
||||
conn.send(result)
|
||||
except Exception as e:
|
||||
conn.send({"error": str(e), "_extraction_time_ms": 0})
|
||||
finally:
|
||||
conn.close()
|
||||
|
||||
|
||||
def _run_with_timeout(fn, args, timeout):
|
||||
"""Execute fn(*args) in a forked child with a timeout.
|
||||
|
||||
On timeout the child is killed but the parent stays alive —
|
||||
no expensive process restart is needed.
|
||||
"""
|
||||
try:
|
||||
ctx = _mp.get_context("fork")
|
||||
parent_conn, child_conn = ctx.Pipe(duplex=False)
|
||||
p = ctx.Process(target=_worker, args=(fn, args, child_conn))
|
||||
p.start()
|
||||
child_conn.close()
|
||||
|
||||
if parent_conn.poll(timeout=timeout):
|
||||
try:
|
||||
result = parent_conn.recv()
|
||||
except Exception:
|
||||
result = {"error": "worker process crashed", "_extraction_time_ms": 0}
|
||||
else:
|
||||
p.kill()
|
||||
result = {
|
||||
"error": f"extraction timed out after {timeout}s",
|
||||
"_extraction_time_ms": timeout * 1000.0,
|
||||
}
|
||||
|
||||
p.join(timeout=5)
|
||||
if p.is_alive():
|
||||
p.kill()
|
||||
p.join()
|
||||
parent_conn.close()
|
||||
return result
|
||||
except Exception:
|
||||
# Fork not available — fall back to in-process extraction
|
||||
try:
|
||||
return fn(*args)
|
||||
except Exception as e:
|
||||
return {"error": str(e), "_extraction_time_ms": 0}
|
||||
|
||||
|
||||
def _parse_path(line: str) -> str:
|
||||
"""Parse a request line: JSON object with path field, or plain file path."""
|
||||
stripped = line.strip()
|
||||
if stripped.startswith("{"):
|
||||
try:
|
||||
return json.loads(stripped).get("path", "")
|
||||
except (json.JSONDecodeError, ValueError):
|
||||
pass
|
||||
return stripped
|
||||
|
||||
|
||||
def run_server(converter: DocumentConverter, output_format: str, timeout=None) -> None:
    """Persistent server loop: one request line in on stdin, one JSON line out on stdout.

    Prints ``READY`` first. Lines that yield an empty path are skipped without a
    response. With a ``timeout`` each extraction goes through ``_run_with_timeout``;
    without one it runs in-process and exceptions become error records.
    """
    print("READY", flush=True)
    for raw_line in sys.stdin:
        target = _parse_path(raw_line)
        if not target:
            continue

        if timeout is None:
            try:
                response = extract_sync(target, converter, output_format)
            except Exception as exc:
                response = {"error": str(exc), "_extraction_time_ms": 0}
        else:
            response = _run_with_timeout(extract_sync, (target, converter, output_format), timeout)

        print(json.dumps(response), flush=True)
|
||||
|
||||
|
||||
def main() -> None:
    """Parse the command line and run the requested mode.

    Usage: ``docling_extract.py [--ocr|--no-ocr] [--timeout=SECS]
    [--format markdown|plaintext] <mode> <file_path> [additional_files...]``

    All arguments, the mode name and the per-mode file count are validated
    before the converter is created, because creating it is expensive. Invalid
    option values exit with status 64; other usage errors and extraction failures
    exit with status 1.
    """
    ocr_enabled = False
    timeout = None
    output_format = "markdown"
    args: list[str] = []

    argv = sys.argv[1:]
    i = 0
    while i < len(argv):
        arg = argv[i]
        i += 1
        if arg == "--ocr":
            ocr_enabled = True
        elif arg == "--no-ocr":
            ocr_enabled = False
        elif arg.startswith("--timeout="):
            raw_timeout = arg.split("=", 1)[1]
            try:
                timeout = int(raw_timeout)
            except ValueError:
                print(f"Error: --timeout must be an integer number of seconds; got '{raw_timeout}'", file=sys.stderr)
                sys.exit(64)
        elif arg.startswith("--format="):
            output_format = arg.split("=", 1)[1]
        elif arg == "--format":
            # Space-separated form: the value is the next argument.
            if i >= len(argv):
                print("Error: --format requires a value ('markdown' or 'plaintext')", file=sys.stderr)
                sys.exit(64)
            output_format = argv[i]
            i += 1
        else:
            args.append(arg)

    if output_format not in ("markdown", "plaintext"):
        print(f"Error: --format must be 'markdown' or 'plaintext'; got '{output_format}'", file=sys.stderr)
        sys.exit(64)

    if len(args) < 1:
        print(
            "Usage: docling_extract.py [--ocr|--no-ocr] [--timeout=SECS] [--format markdown|plaintext] <mode> <file_path> [additional_files...]",
            file=sys.stderr,
        )
        print("Modes: sync, batch, server", file=sys.stderr)
        sys.exit(1)

    mode = args[0]
    file_paths = args[1:]

    # Reject bad invocations before paying for converter initialization.
    if mode not in ("sync", "batch", "server"):
        print(f"Error: Unknown mode '{mode}'. Use sync, batch, or server", file=sys.stderr)
        sys.exit(1)
    if mode == "sync" and len(file_paths) != 1:
        print("Error: sync mode requires exactly one file", file=sys.stderr)
        sys.exit(1)
    if mode == "batch" and len(file_paths) < 1:
        print("Error: batch mode requires at least one file", file=sys.stderr)
        sys.exit(1)

    # Create converter once (expensive initialization)
    converter = create_converter(ocr_enabled)

    try:
        if mode == "server":
            run_server(converter, output_format, timeout=timeout)
        elif mode == "sync":
            payload = extract_sync(file_paths[0], converter, output_format)
            print(json.dumps(payload), end="")
        else:
            results = extract_batch(file_paths, converter, output_format)
            # A single-file batch prints the bare object rather than a one-element list.
            payload = results[0] if len(file_paths) == 1 else results
            print(json.dumps(payload), end="")
    except Exception as e:
        print(f"Error extracting with Docling: {e}", file=sys.stderr)
        sys.exit(1)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
63
tools/benchmark-harness/scripts/download_omnidocbench.sh
Executable file
63
tools/benchmark-harness/scripts/download_omnidocbench.sh
Executable file
@@ -0,0 +1,63 @@
|
||||
#!/usr/bin/env bash
# Download the OmniDocBench dataset (opendatalab/OmniDocBench) from HuggingFace.
#
# Usage:
#   ./download_omnidocbench.sh [TARGET_DIR]
#
# Default target: tools/benchmark-harness/datasets/omnidocbench
#
# Requirements: curl, plus either huggingface-cli or git with git-lfs.
# No HuggingFace account or API key needed (public dataset).

set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
DEFAULT_DIR="${SCRIPT_DIR}/../datasets/omnidocbench"
TARGET_DIR="${1:-$DEFAULT_DIR}"

HF_BASE="https://huggingface.co/datasets/opendatalab/OmniDocBench/resolve/main"
HF_REPO_URL="https://huggingface.co/datasets/opendatalab/OmniDocBench"

mkdir -p "$TARGET_DIR"
# Resolve to an absolute path so a relative TARGET_DIR keeps pointing at the same place.
TARGET_DIR="$(cd "$TARGET_DIR" && pwd)"

# Print the number of regular files directly inside directory $1.
count_files() {
  find "$1" -maxdepth 1 -type f 2>/dev/null | wc -l | tr -d ' '
}

# Download the main annotation file (65 MB)
ANNOTATIONS="$TARGET_DIR/OmniDocBench.json"
if [ -s "$ANNOTATIONS" ]; then
  echo "OmniDocBench.json already exists, skipping"
else
  echo "Downloading OmniDocBench.json (65 MB)..."
  # -f makes curl fail on HTTP errors instead of saving the error page as the dataset.
  # Downloading to a temporary name keeps a partial file from being mistaken for a
  # finished one on the next run.
  curl -fL --retry 3 -o "$ANNOTATIONS.part" "$HF_BASE/OmniDocBench.json"
  mv "$ANNOTATIONS.part" "$ANNOTATIONS"
fi

# Download images directory via HF CLI if available, otherwise use git-lfs clone
if [ -d "$TARGET_DIR/images" ] && [ "$(count_files "$TARGET_DIR/images")" -gt 100 ]; then
  echo "images/ directory already populated ($(count_files "$TARGET_DIR/images") files), skipping"
elif command -v huggingface-cli >/dev/null 2>&1; then
  echo "Downloading full dataset via huggingface-cli..."
  huggingface-cli download opendatalab/OmniDocBench \
    --repo-type dataset \
    --local-dir "$TARGET_DIR" \
    --include "images/*" "ori_pdfs/*" "OmniDocBench.json"
elif command -v git >/dev/null 2>&1 && git lfs version >/dev/null 2>&1; then
  echo "Downloading via git-lfs clone..."
  TEMP_CLONE="$(mktemp -d)"
  # Remove the clone even if a later step fails.
  trap 'rm -rf "$TEMP_CLONE"' EXIT
  git clone --depth 1 "$HF_REPO_URL" "$TEMP_CLONE"
  # Use -C instead of cd so the working directory of this script never changes.
  git -C "$TEMP_CLONE" lfs pull
  for sub in images ori_pdfs; do
    if [ -d "$TEMP_CLONE/$sub" ]; then
      cp -R "$TEMP_CLONE/$sub" "$TARGET_DIR/"
    else
      echo "WARNING: $sub/ not found in the cloned repository" >&2
    fi
  done
  rm -rf "$TEMP_CLONE"
  trap - EXIT
else
  {
    echo "ERROR: Need either huggingface-cli or git-lfs to download images."
    echo ""
    echo "Install one of:"
    echo "  pip install huggingface-hub   # then: huggingface-cli"
    echo "  brew install git-lfs          # then: git lfs install"
  } >&2
  exit 1
fi

# Summary
echo ""
echo "OmniDocBench downloaded to: $TARGET_DIR"
echo "  Annotations: $(wc -c <"$ANNOTATIONS" | tr -d ' ') bytes"
# Plain if-statements: a trailing `[ -d ... ] && echo` would make the script exit
# with status 1 whenever the directory is absent.
if [ -d "$TARGET_DIR/images" ]; then
  echo "  Images: $(count_files "$TARGET_DIR/images") files"
fi
if [ -d "$TARGET_DIR/ori_pdfs" ]; then
  echo "  PDFs: $(count_files "$TARGET_DIR/ori_pdfs") files"
fi
|
||||
789
tools/benchmark-harness/scripts/generate_ground_truth.py
Executable file
789
tools/benchmark-harness/scripts/generate_ground_truth.py
Executable file
@@ -0,0 +1,789 @@
|
||||
#!/usr/bin/env -S uv run --script
|
||||
# /// script
|
||||
# requires-python = ">=3.10"
|
||||
# dependencies = [
|
||||
# "beautifulsoup4>=4.12",
|
||||
# "python-docx>=1.0",
|
||||
# "python-pptx>=1.0",
|
||||
# "openpyxl>=3.1",
|
||||
# "nbformat>=5.9",
|
||||
# "xlrd>=2.0",
|
||||
# "extract-msg>=0.48",
|
||||
# "lxml>=5.0",
|
||||
# "odfpy>=1.4",
|
||||
# ]
|
||||
# ///
|
||||
"""Generate ground truth text files for benchmark fixtures.
|
||||
|
||||
Walks all fixture JSONs, extracts text from source documents using independent
|
||||
tools (not benchmarked frameworks), writes ground truth .txt files, patches
|
||||
fixture JSONs with ground_truth field, and updates ground_truth_mapping.json.
|
||||
|
||||
PDF Ground Truth Methodology (updated Feb 2026):
|
||||
PDF ground truth was regenerated using AI visual extraction (Claude Haiku
|
||||
reading each PDF page as an image) for scanned/complex PDFs, and pdftotext
|
||||
for born-digital PDFs with reliable embedded text. The previous approach of
|
||||
using pdftotext for all PDFs produced incorrect ground truth for scanned
|
||||
documents since pdftotext cannot read image-based text.
|
||||
|
||||
The handle_pdftotext() function below is retained for regenerating GT from
|
||||
born-digital PDFs. For scanned PDFs, GT files were manually curated via AI
|
||||
extraction and should not be overwritten by running this script with --force.
|
||||
|
||||
Usage:
|
||||
uv run tools/benchmark-harness/scripts/generate_ground_truth.py [OPTIONS]
|
||||
|
||||
Options:
|
||||
--dry-run Print planned actions without writing
|
||||
--format-filter Comma-separated file types to process (e.g., md,txt,pdf)
|
||||
--force Regenerate even if ground truth already exists
|
||||
--skip-types Comma-separated file types to skip
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import email
|
||||
import json
|
||||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
import xml.etree.ElementTree as ET
|
||||
from pathlib import Path
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# File type → handler mapping
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Text-based formats whose file content is used directly as ground truth.
RAW_SOURCE_TYPES = frozenset("md txt rst org commonmark djot toml yaml json tsv bib csv svg".split())

# One set per extraction tool; each file type belongs to exactly one set.
PDFTOTEXT_TYPES = frozenset(["pdf"])
PANDOC_TYPES = frozenset("tex latex typ epub fb2 docbook odt rtf opml".split())
PYTHON_DOCX_TYPES = frozenset(["docx"])
PYTHON_PPTX_TYPES = frozenset(["pptx", "pptm", "ppsx"])
OPENPYXL_TYPES = frozenset(["xlsx", "xlsm"])
ODS_TYPES = frozenset(["ods"])
BEAUTIFULSOUP_TYPES = frozenset(["html"])
PYTHON_EMAIL_TYPES = frozenset(["eml"])
EXTRACT_MSG_TYPES = frozenset(["msg"])
NBFORMAT_TYPES = frozenset(["ipynb"])
XML_PARSE_TYPES = frozenset(["xml"])
XLRD_TYPES = frozenset(["xls"])
ANTIWORD_TYPES = frozenset(["doc"])
LIBREOFFICE_TYPES = frozenset(["ppt"])
DBF_TYPES = frozenset(["dbf"])
HWP_TYPES = frozenset(["hwp"])

# Archive and image types are excluded from ground truth generation
EXCLUDED_TYPES = frozenset(
    "7z gz tar tgz zip lz4 gif jpeg jpg jp2 png tiff webp bmp pbm pgm pnm ppm".split()
)

# Union of every per-tool set above.
ALL_HANDLED_TYPES = frozenset().union(
    RAW_SOURCE_TYPES,
    PDFTOTEXT_TYPES,
    PANDOC_TYPES,
    PYTHON_DOCX_TYPES,
    PYTHON_PPTX_TYPES,
    OPENPYXL_TYPES,
    BEAUTIFULSOUP_TYPES,
    PYTHON_EMAIL_TYPES,
    EXTRACT_MSG_TYPES,
    NBFORMAT_TYPES,
    XML_PARSE_TYPES,
    XLRD_TYPES,
    ANTIWORD_TYPES,
    LIBREOFFICE_TYPES,
    ODS_TYPES,
    DBF_TYPES,
    HWP_TYPES,
)
|
||||
|
||||
|
||||
def get_source_type(file_type: str) -> str:
    """Return the ground truth source label for ``file_type``.

    The type sets are checked in a fixed order and the label of the first set
    containing ``file_type`` is returned. Types in no set yield ``"manual"``.
    """
    labelled_groups = (
        (RAW_SOURCE_TYPES, "raw_source"),
        (PDFTOTEXT_TYPES, "pdftotext"),
        (PANDOC_TYPES, "pandoc"),
        (PYTHON_DOCX_TYPES, "python-docx"),
        (PYTHON_PPTX_TYPES, "python-pptx"),
        (OPENPYXL_TYPES, "openpyxl"),
        (BEAUTIFULSOUP_TYPES, "beautifulsoup"),
        (PYTHON_EMAIL_TYPES, "python_email"),
        (EXTRACT_MSG_TYPES, "extract_msg"),
        (NBFORMAT_TYPES, "nbformat"),
        (XML_PARSE_TYPES, "xml_parse"),
        (XLRD_TYPES, "xlrd"),
        (ANTIWORD_TYPES, "antiword"),
        (LIBREOFFICE_TYPES, "libreoffice"),
        (ODS_TYPES, "odfpy"),
        (DBF_TYPES, "manual"),
        (HWP_TYPES, "manual"),
    )
    for members, label in labelled_groups:
        if file_type in members:
            return label
    return "manual"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Text extraction handlers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def handle_raw_source(doc_path: Path) -> str:
    """Return the file's text unchanged (the source itself is the ground truth).

    The file is decoded as UTF-8; if that fails it is re-read as Latin-1.
    """
    try:
        content = doc_path.read_text(encoding="utf-8")
    except UnicodeDecodeError:
        # Not valid UTF-8: fall back to a single-byte decoding.
        content = doc_path.read_text(encoding="latin-1")
    return content
|
||||
|
||||
|
||||
def handle_pdftotext(doc_path: Path) -> str:
    """Run ``pdftotext -layout`` on a PDF and return what it writes to stdout.

    Note: This works well for born-digital PDFs with embedded text layers.
    For scanned PDFs, pdftotext produces garbage output. Scanned PDF ground
    truth should be generated via AI visual extraction instead.

    Raises:
        RuntimeError: if pdftotext exits with a non-zero status.
    """
    argv = ["pdftotext", "-layout", str(doc_path), "-"]  # "-" sends the text to stdout
    proc = subprocess.run(argv, capture_output=True, text=True, timeout=60)
    if proc.returncode == 0:
        return proc.stdout
    raise RuntimeError(f"pdftotext failed: {proc.stderr}")
|
||||
|
||||
|
||||
def handle_pandoc(doc_path: Path, file_type: str) -> str:
    """Render a document to plain text with pandoc and return its stdout.

    When ``file_type`` has an entry in the format map, the input format is
    passed explicitly with ``-f``; otherwise pandoc is left to detect it.

    Raises:
        RuntimeError: if pandoc exits with a non-zero status.
    """
    # File type -> pandoc input format name.
    # NOTE(review): "doc" and "ppt" do not appear to be pandoc reader names —
    # confirm these entries are ever reached.
    pandoc_format_map = {
        "tex": "latex",
        "latex": "latex",
        "typ": "typst",
        "epub": "epub",
        "fb2": "fb2",
        "docbook": "docbook",
        "odt": "odt",
        "rtf": "rtf",
        "opml": "opml",
        "doc": "doc",
        "ppt": "ppt",
    }
    reader = pandoc_format_map.get(file_type)
    reader_args = ["-f", reader] if reader else []
    argv = ["pandoc", *reader_args, "-t", "plain", "--wrap=none", str(doc_path)]
    proc = subprocess.run(argv, capture_output=True, text=True, timeout=120)
    if proc.returncode == 0:
        return proc.stdout
    raise RuntimeError(f"pandoc failed: {proc.stderr}")
|
||||
|
||||
|
||||
def handle_python_docx(doc_path: Path) -> str:
    """Return the text of a DOCX file read with python-docx.

    All body paragraphs come first, one per line, followed by every table row
    with its cell texts joined by tabs.
    """
    import docx

    document = docx.Document(str(doc_path))
    lines = [para.text for para in document.paragraphs]
    # Table rows are appended after the paragraphs, not interleaved with them.
    lines.extend(
        "\t".join(cell.text for cell in row.cells)
        for table in document.tables
        for row in table.rows
    )
    return "\n".join(lines)
|
||||
|
||||
|
||||
def handle_python_pptx(doc_path: Path) -> str:
    """Return the text of a PPTX/PPTM/PPSX deck read with python-pptx.

    Walks slides, then shapes, then paragraphs of each shape that has a text
    frame, keeping every non-empty (stripped) paragraph on its own line.
    """
    from pptx import Presentation

    deck = Presentation(str(doc_path))
    collected = [
        stripped
        for slide in deck.slides
        for shape in slide.shapes
        if shape.has_text_frame
        for para in shape.text_frame.paragraphs
        if (stripped := para.text.strip())
    ]
    return "\n".join(collected)
|
||||
|
||||
|
||||
def handle_openpyxl(doc_path: Path) -> str:
    """Extract text from XLSX/XLSM using openpyxl.

    Every sheet is read in workbook order. Each row becomes one line of
    tab-separated cell values (``None`` cells become empty strings); rows in
    which every cell is empty are dropped.
    """
    import openpyxl

    wb = openpyxl.load_workbook(str(doc_path), read_only=True, data_only=True)
    lines = []
    try:
        for sheet_name in wb.sheetnames:
            ws = wb[sheet_name]
            for row in ws.iter_rows(values_only=True):
                cells = [str(c) if c is not None else "" for c in row]
                if any(cells):
                    lines.append("\t".join(cells))
    finally:
        # Close the workbook even when reading a sheet raises, so the
        # underlying file handle is not leaked.
        wb.close()
    return "\n".join(lines)
|
||||
|
||||
|
||||
def handle_beautifulsoup(doc_path: Path) -> str:
    """Return the visible text of an HTML file using BeautifulSoup.

    The file is decoded as UTF-8 with a Latin-1 fallback, ``<script>`` and
    ``<style>`` elements are removed, and the remaining text fragments are
    stripped and joined with newlines.
    """
    from bs4 import BeautifulSoup

    try:
        markup = doc_path.read_text(encoding="utf-8")
    except UnicodeDecodeError:
        markup = doc_path.read_text(encoding="latin-1")

    tree = BeautifulSoup(markup, "html.parser")
    # Script and style bodies are not document text; drop them before extraction.
    for unwanted in tree(["script", "style"]):
        unwanted.decompose()
    return tree.get_text(separator="\n", strip=True)
|
||||
|
||||
|
||||
def _decode_email_payload(part):
    """Return the decoded payload of one message part, or None if it has none.

    The part's declared charset is used (UTF-8 when none is declared); an
    unknown charset name falls back to UTF-8. Undecodable bytes are replaced.
    """
    payload = part.get_payload(decode=True)
    if not payload:
        return None
    charset = part.get_content_charset() or "utf-8"
    try:
        return payload.decode(charset, errors="replace")
    except (LookupError, UnicodeDecodeError):
        return payload.decode("utf-8", errors="replace")


def handle_python_email(doc_path: Path) -> str:
    """Extract text from EML using Python email stdlib.

    The output is the From/To/Subject/Date headers that are present (one
    ``Name: value`` line each), a blank line if any header was emitted, then
    the decoded body. For multipart messages every ``text/plain`` part is
    included in walk order; otherwise the message's own payload is used.
    """
    try:
        raw = doc_path.read_bytes()
        msg = email.message_from_bytes(raw)
    except Exception:
        # Best-effort fallback: re-read as text and parse from a string.
        raw = doc_path.read_text(encoding="utf-8", errors="replace")
        msg = email.message_from_string(raw)

    parts = []
    for header in ("From", "To", "Subject", "Date"):
        val = msg.get(header)
        if val:
            parts.append(f"{header}: {val}")

    if parts:
        parts.append("")  # blank line after headers

    if msg.is_multipart():
        body_parts = [p for p in msg.walk() if p.get_content_type() == "text/plain"]
    else:
        body_parts = [msg]

    for part in body_parts:
        decoded = _decode_email_payload(part)
        if decoded is not None:
            parts.append(decoded)

    return "\n".join(parts)
|
||||
|
||||
|
||||
def handle_extract_msg(doc_path: Path) -> str:
    """Extract text from MSG using extract-msg.

    Emits Subject/From/To/Date lines for the fields that are set, a blank
    line if any of them was emitted, then the message body when present.
    """
    import extract_msg

    msg = extract_msg.openMsg(str(doc_path))
    try:
        parts = []
        if msg.subject:
            parts.append(f"Subject: {msg.subject}")
        if msg.sender:
            parts.append(f"From: {msg.sender}")
        if msg.to:
            parts.append(f"To: {msg.to}")
        if msg.date:
            parts.append(f"Date: {msg.date}")
        if parts:
            parts.append("")
        if msg.body:
            parts.append(msg.body)
    finally:
        # Release the message even if reading one of its fields raises.
        msg.close()
    return "\n".join(parts)
|
||||
|
||||
|
||||
def handle_nbformat(doc_path: Path) -> str:
    """Return the cell sources of a Jupyter notebook read with nbformat.

    The notebook is read as version 4. Code, markdown and raw cells contribute
    their stripped source; empty cells are dropped and the rest are separated
    by blank lines.
    """
    import nbformat

    notebook = nbformat.read(str(doc_path), as_version=4)
    wanted_kinds = ("code", "markdown", "raw")
    sources = [
        stripped
        for cell in notebook.cells
        if cell.cell_type in wanted_kinds
        if (stripped := cell.source.strip())
    ]
    return "\n\n".join(sources)
|
||||
|
||||
|
||||
def handle_xml_parse(doc_path: Path) -> str:
    """Return the text content of an XML file parsed with xml.etree.

    Elements are visited in document order; for each one its non-blank
    ``text`` and then its non-blank ``tail`` are collected (stripped) and the
    pieces are joined with newlines. A file that fails to parse is returned
    as raw text via ``handle_raw_source``.
    """
    try:
        document = ET.parse(str(doc_path))
    except ET.ParseError:
        # Not well-formed XML: hand back the raw file contents instead.
        return handle_raw_source(doc_path)

    pieces = []
    for node in document.getroot().iter():
        for chunk in (node.text, node.tail):
            if chunk and chunk.strip():
                pieces.append(chunk.strip())
    return "\n".join(pieces)
|
||||
|
||||
|
||||
def handle_xlrd(doc_path: Path) -> str:
    """Return the cell values of an XLS workbook read with xlrd.

    Sheets are read by index. Each row becomes one line of tab-separated
    ``str()`` cell values; rows whose cells are all empty strings are dropped.
    """
    import xlrd

    book = xlrd.open_workbook(str(doc_path))
    rows_out = []
    for sheet_no in range(book.nsheets):
        sheet = book.sheet_by_index(sheet_no)
        for r in range(sheet.nrows):
            values = [str(sheet.cell_value(r, c)) for c in range(sheet.ncols)]
            if any(values):
                rows_out.append("\t".join(values))
    return "\n".join(rows_out)
|
||||
|
||||
|
||||
def handle_antiword(doc_path: Path) -> str:
    """Extract text from DOC using antiword, then catdoc, then textutil (macOS).

    The tools are tried in that order. A tool that is not installed, or that
    exits with a non-zero status, is skipped and the next one is tried; the
    stdout of the first tool that succeeds is returned.

    Raises:
        RuntimeError: if no tool produced a successful result.
        subprocess.TimeoutExpired: if a tool runs longer than 60 seconds.
    """
    source = str(doc_path)
    candidate_commands = (
        ["antiword", source],
        ["catdoc", source],
        ["textutil", "-convert", "txt", "-stdout", source],
    )
    for command in candidate_commands:
        try:
            result = subprocess.run(command, capture_output=True, text=True, timeout=60)
        except FileNotFoundError:
            # Tool is not installed on this machine; try the next one.
            continue
        if result.returncode == 0:
            return result.stdout

    raise RuntimeError("No DOC extraction tool available (need antiword, catdoc, or textutil)")
|
||||
|
||||
|
||||
def handle_ods(doc_path: Path) -> str:
    """Return the cell text of an ODS spreadsheet read with odfpy.

    Each table row becomes one tab-separated line; rows whose cells are all
    blank are dropped. Within a cell, the text of its paragraphs is joined
    with single spaces. A non-empty cell carrying a column-repeat count above
    one is emitted that many times, capped at 100 copies.
    """
    from odf import text as odf_text
    from odf.opendocument import load as odf_load
    from odf.table import Table, TableCell, TableRow

    document = odf_load(str(doc_path))
    output_rows = []
    for sheet in document.spreadsheet.getElementsByType(Table):
        for sheet_row in sheet.getElementsByType(TableRow):
            row_values = []
            for sheet_cell in sheet_row.getElementsByType(TableCell):
                paragraph_texts = []
                for para in sheet_cell.getElementsByType(odf_text.P):
                    # Direct children only: use a node's .data when it has one,
                    # otherwise its str() form.
                    fragments = [
                        child.data if hasattr(child, "data") else str(child)
                        for child in para.childNodes
                    ]
                    paragraph_texts.append("".join(fragments))

                value = " ".join(paragraph_texts)
                repeat_attr = sheet_cell.getAttribute("numbercolumnsrepeated")
                repeat_count = int(repeat_attr) if repeat_attr else 1
                if repeat_count > 1 and value:
                    row_values.extend([value] * min(repeat_count, 100))
                else:
                    row_values.append(value)

            if any(v.strip() for v in row_values):
                output_rows.append("\t".join(row_values))
    return "\n".join(output_rows)
|
||||
|
||||
|
||||
def handle_libreoffice(doc_path: Path) -> str:
    """Extract text from PPT with the LibreOffice CLI, falling back to textutil (macOS).

    LibreOffice converts the document to a text file inside a temporary
    directory, and the first ``*.txt`` file found there is returned. If
    LibreOffice is not installed, fails, or produces no text file, textutil is
    tried and its stdout returned.

    Raises:
        RuntimeError: if neither tool produced a result.
    """
    import tempfile

    source = str(doc_path)

    try:
        with tempfile.TemporaryDirectory() as workdir:
            convert = subprocess.run(
                ["libreoffice", "--headless", "--convert-to", "txt:Text", "--outdir", workdir, source],
                capture_output=True,
                text=True,
                timeout=120,
            )
            produced = list(Path(workdir).glob("*.txt")) if convert.returncode == 0 else []
            if produced:
                return produced[0].read_text(encoding="utf-8", errors="replace")
    except FileNotFoundError:
        pass

    # Fallback: try textutil (macOS)
    try:
        fallback = subprocess.run(
            ["textutil", "-convert", "txt", "-stdout", source],
            capture_output=True,
            text=True,
            timeout=60,
        )
        if fallback.returncode == 0:
            return fallback.stdout
    except FileNotFoundError:
        pass

    raise RuntimeError("No PPT extraction tool available (need libreoffice or textutil)")
|
||||
|
||||
|
||||
def extract_text(doc_path: Path, file_type: str) -> str:
    """Run the extraction handler that matches ``file_type`` and return its text.

    The type sets are consulted in a fixed priority order and the first one
    containing ``file_type`` selects the handler.

    Raises:
        ValueError: if no type set contains ``file_type``.
    """
    dispatch = (
        (RAW_SOURCE_TYPES, handle_raw_source),
        (PDFTOTEXT_TYPES, handle_pdftotext),
        # pandoc is the only handler that also needs the file type.
        (PANDOC_TYPES, lambda path: handle_pandoc(path, file_type)),
        (PYTHON_DOCX_TYPES, handle_python_docx),
        (PYTHON_PPTX_TYPES, handle_python_pptx),
        (OPENPYXL_TYPES, handle_openpyxl),
        (BEAUTIFULSOUP_TYPES, handle_beautifulsoup),
        (PYTHON_EMAIL_TYPES, handle_python_email),
        (EXTRACT_MSG_TYPES, handle_extract_msg),
        (NBFORMAT_TYPES, handle_nbformat),
        (XML_PARSE_TYPES, handle_xml_parse),
        (XLRD_TYPES, handle_xlrd),
        (ANTIWORD_TYPES, handle_antiword),
        (LIBREOFFICE_TYPES, handle_libreoffice),
        (ODS_TYPES, handle_ods),
    )
    for type_set, handler in dispatch:
        if file_type in type_set:
            return handler(doc_path)
    raise ValueError(f"No handler for file type: {file_type}")
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Core logic
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def get_repo_root() -> Path:
    """Locate the repository root by walking up from this script's directory.

    The root is the nearest ancestor containing both ``Cargo.toml`` and
    ``test_documents``. The filesystem root itself is never checked.

    Raises:
        RuntimeError: if no ancestor qualifies.
    """
    start = Path(__file__).resolve().parent
    for candidate in (start, *start.parents):
        if candidate == candidate.parent:
            # Reached the filesystem root; stop without testing it.
            break
        has_manifest = (candidate / "Cargo.toml").exists()
        if has_manifest and (candidate / "test_documents").exists():
            return candidate
    raise RuntimeError("Could not find repository root")
|
||||
|
||||
|
||||
def collect_fixtures(fixtures_dir: Path) -> list[Path]:
    """Return every ``*.json`` file under ``fixtures_dir`` (recursively), sorted by path."""
    found = list(fixtures_dir.rglob("*.json"))
    found.sort()
    return found
|
||||
|
||||
|
||||
def load_mapping(repo_root: Path) -> dict[str, str]:
    """Load the existing ground truth mapping.

    Reads ``test_documents/ground_truth/ground_truth_mapping.json`` under
    ``repo_root``. Returns an empty dict when the file does not exist.
    """
    mapping_file = repo_root / "test_documents" / "ground_truth" / "ground_truth_mapping.json"
    if mapping_file.exists():
        # Decode as UTF-8 explicitly rather than with the platform default.
        with open(mapping_file, encoding="utf-8") as f:
            return json.load(f)
    return {}
|
||||
|
||||
|
||||
def save_mapping(repo_root: Path, mapping: dict[str, str]) -> None:
    """Save the ground truth mapping (sorted keys).

    Writes ``test_documents/ground_truth/ground_truth_mapping.json`` under
    ``repo_root`` as two-space-indented JSON followed by a trailing newline.
    The target directory must already exist.
    """
    mapping_file = repo_root / "test_documents" / "ground_truth" / "ground_truth_mapping.json"
    sorted_mapping = dict(sorted(mapping.items()))
    # Encode as UTF-8 explicitly so the file matches what load_mapping reads.
    with open(mapping_file, "w", encoding="utf-8") as f:
        json.dump(sorted_mapping, f, indent=2)
        f.write("\n")
|
||||
|
||||
|
||||
def make_mapping_key(fixture_path: Path, fixtures_dir: Path) -> str:
    """Build the mapping key for a fixture from its location under ``fixtures_dir``.

    A fixture directly inside ``fixtures_dir`` is keyed by its stem
    (e.g. 'commonmark_sample'). A fixture in a subdirectory is keyed by its
    first directory component plus its stem (e.g. 'md/duck.md' for
    md/duck.md.json); deeper intermediate directories are not part of the key.
    """
    stem = fixture_path.stem
    components = fixture_path.relative_to(fixtures_dir).parts
    if len(components) <= 1:
        return stem
    top_level_dir = components[0]
    return f"{top_level_dir}/{stem}"
|
||||
|
||||
|
||||
def process_fixture(
    fixture_path: Path,
    repo_root: Path,
    fixtures_dir: Path,
    mapping: dict[str, str],
    dry_run: bool,
    force: bool,
    stats: dict[str, int],
) -> None:
    """Generate ground truth for one fixture and record the outcome in ``stats``.

    A fixture is skipped (with the matching ``skipped_*`` counter bumped) when
    its type is excluded or unhandled, when it already has ground truth and
    ``force`` is false, or when its document is unset or missing on disk.
    In ``dry_run`` mode the planned paths are printed and nothing is written.
    Otherwise the extracted text is written under
    ``test_documents/ground_truth/<file_type>/``, the fixture JSON is rewritten
    with a ``ground_truth`` entry, and ``mapping`` is updated in place.
    Extraction failures are printed and counted under ``errors``.
    """

    def _skip(counter: str, note: str = "") -> None:
        # Optionally report why, then bump the named counter.
        if note:
            print(note)
        stats[counter] += 1

    with open(fixture_path) as handle:
        fixture = json.load(handle)

    file_type = fixture.get("file_type", "")

    # Skip excluded types
    if file_type in EXCLUDED_TYPES:
        return _skip("skipped_excluded")

    # Skip unhandled types
    if file_type not in ALL_HANDLED_TYPES:
        return _skip("skipped_unhandled", f" SKIP (unhandled type): {fixture_path.name} ({file_type})")

    # Skip if already has ground truth (unless --force)
    if fixture.get("ground_truth") and not force:
        return _skip("skipped_existing")

    # Resolve document path (stored relative to the fixture file)
    doc_rel = fixture.get("document", "")
    if not doc_rel:
        return _skip("skipped_no_doc", f" SKIP (no document): {fixture_path.name}")

    doc_path = (fixture_path.parent / doc_rel).resolve()
    if not doc_path.exists():
        return _skip("skipped_missing_doc", f" SKIP (doc not found): {fixture_path.name} -> {doc_path}")

    # Ground truth location, its path relative to the fixture, and the mapping key
    gt_dir = repo_root / "test_documents" / "ground_truth" / file_type
    gt_path = gt_dir / (fixture_path.stem + ".txt")
    gt_rel = os.path.relpath(gt_path, fixture_path.parent)
    mapping_key = make_mapping_key(fixture_path, fixtures_dir)

    if dry_run:
        print(f" [DRY RUN] {fixture_path.name} ({file_type})")
        print(f" doc: {doc_path}")
        print(f" gt: {gt_path}")
        print(f" key: {mapping_key}")
        stats["would_generate"] += 1
        return

    # Extract text
    try:
        text = extract_text(doc_path, file_type)
    except Exception as e:
        print(f" ERROR extracting {fixture_path.name}: {e}")
        stats["errors"] += 1
        return

    # Write ground truth file
    gt_dir.mkdir(parents=True, exist_ok=True)
    gt_path.write_text(text, encoding="utf-8")

    # Patch fixture JSON
    fixture["ground_truth"] = {
        "text_file": gt_rel,
        "source": get_source_type(file_type),
    }
    with open(fixture_path, "w") as handle:
        json.dump(fixture, handle, indent=2)
        handle.write("\n")

    # Update mapping (path stored relative to the repository root)
    mapping[mapping_key] = str(gt_path.relative_to(repo_root))

    stats["generated"] += 1
|
||||
|
||||
|
||||
def _parse_type_list(raw: str) -> set[str]:
    """Split a comma-separated CLI value into a set of trimmed, non-empty names."""
    return {token.strip() for token in raw.split(",") if token.strip()}


def main() -> int:
    """Command-line entry point: generate ground truth for benchmark fixtures.

    Parses the CLI options, walks every fixture JSON under
    ``tools/benchmark-harness/fixtures``, applies ``--format-filter`` and
    ``--skip-types``, processes each remaining fixture, saves the mapping
    when anything was generated, and prints a summary.

    Returns:
        1 if any error was counted, otherwise 0.
    """
    parser = argparse.ArgumentParser(description="Generate ground truth for benchmark fixtures")
    parser.add_argument("--dry-run", action="store_true", help="Print planned actions without writing")
    parser.add_argument("--format-filter", type=str, default="", help="Comma-separated file types to process")
    parser.add_argument("--force", action="store_true", help="Regenerate even if ground truth exists")
    parser.add_argument("--skip-types", type=str, default="", help="Comma-separated file types to skip")
    args = parser.parse_args()

    repo_root = get_repo_root()
    fixtures_dir = repo_root / "tools" / "benchmark-harness" / "fixtures"

    print(f"Repository root: {repo_root}")
    print(f"Fixtures dir: {fixtures_dir}")
    if args.dry_run:
        print("DRY RUN MODE - no files will be written\n")

    # Trim whitespace around each name so "pdf, docx" selects both types;
    # an empty filter means "no filtering".
    format_filter = _parse_type_list(args.format_filter) or None
    skip_types = _parse_type_list(args.skip_types)

    # Load existing mapping
    mapping = load_mapping(repo_root)
    initial_mapping_size = len(mapping)

    # Collect and process fixtures
    fixture_paths = collect_fixtures(fixtures_dir)
    print(f"Found {len(fixture_paths)} fixture files\n")

    stats: dict[str, int] = {
        "generated": 0,
        "would_generate": 0,
        "skipped_existing": 0,
        "skipped_excluded": 0,
        "skipped_unhandled": 0,
        "skipped_no_doc": 0,
        "skipped_missing_doc": 0,
        "errors": 0,
    }

    for fixture_path in fixture_paths:
        # Load to check file type for filtering; unreadable fixtures count as errors.
        try:
            with open(fixture_path) as f:
                fixture_data = json.load(f)
        except (json.JSONDecodeError, OSError) as e:
            print(f" ERROR reading {fixture_path.name}: {e}")
            stats["errors"] += 1
            continue

        file_type = fixture_data.get("file_type", "")
        if format_filter and file_type not in format_filter:
            continue
        if file_type in skip_types:
            continue

        process_fixture(fixture_path, repo_root, fixtures_dir, mapping, args.dry_run, args.force, stats)

    # Save mapping only when something was actually written
    if not args.dry_run and stats["generated"] > 0:
        save_mapping(repo_root, mapping)
        new_entries = len(mapping) - initial_mapping_size
        print(f"\nUpdated ground_truth_mapping.json: {new_entries} new entries (total: {len(mapping)})")

    # Print summary
    print(f"\n{'=' * 50}")
    print("Summary:")
    print(f" Generated: {stats['generated']}")
    if args.dry_run:
        print(f" Would generate: {stats['would_generate']}")
    print(f" Skipped (existing): {stats['skipped_existing']}")
    print(f" Skipped (excluded): {stats['skipped_excluded']}")
    print(f" Skipped (unhandled): {stats['skipped_unhandled']}")
    print(f" Skipped (no doc): {stats['skipped_no_doc']}")
    print(f" Skipped (missing): {stats['skipped_missing_doc']}")
    print(f" Errors: {stats['errors']}")

    return 1 if stats["errors"] > 0 else 0
|
||||
|
||||
|
||||
# Run the CLI when executed as a script; main()'s return value is the exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|
||||
93
tools/benchmark-harness/scripts/generate_libreoffice_gt.sh
Normal file
93
tools/benchmark-harness/scripts/generate_libreoffice_gt.sh
Normal file
@@ -0,0 +1,93 @@
|
||||
#!/usr/bin/env bash
# Generate markdown ground truth for formats requiring LibreOffice conversion.
# Workflow: soffice → intermediate format → pandoc -t gfm → sanitize
#
# Prerequisites:
# - soffice (LibreOffice) on PATH
# - pandoc on PATH
# - python3 on PATH
#
# Usage: bash tools/benchmark-harness/scripts/generate_libreoffice_gt.sh

set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "$SCRIPT_DIR/../../.." && pwd)"
SANITIZE="$SCRIPT_DIR/sanitize_pandoc_gt.py"

# Use a fresh private scratch directory for every run so that files left over
# from an earlier run can never be mistaken for a successful conversion, and
# remove it again on exit.
TMP_DIR="$(mktemp -d "${TMPDIR:-/tmp}/gt_convert.XXXXXX")"
trap 'rm -rf "$TMP_DIR"' EXIT

# convert_group EXT INTERMEDIATE FILE...
#   Converts each existing FILE (extension EXT) to INTERMEDIATE with soffice,
#   renders that to GFM with pandoc, sanitizes it, and writes the result to
#   test_documents/ground_truth/EXT/<name>.md.
convert_group() {
  local ext="$1" intermediate="$2"
  shift 2
  local gt_dir="$REPO_ROOT/test_documents/ground_truth/$ext"
  local f name gt_md converted size

  mkdir -p "$gt_dir"

  for f in "$@"; do
    if [ ! -f "$f" ]; then
      echo " SKIP (not found): $f"
      continue
    fi
    name=$(basename "$f" ".$ext")
    gt_md="$gt_dir/${name}.md"
    converted="$TMP_DIR/${name}.${intermediate}"

    # A failing soffice must not abort the whole script under `set -e`;
    # the missing output file is reported as a failed conversion below.
    soffice --headless --convert-to "$intermediate" --outdir "$TMP_DIR" "$f" 2>/dev/null || true

    if [ -f "$converted" ]; then
      pandoc -f "$intermediate" -t gfm --wrap=none "$converted" 2>/dev/null |
        python3 "$SANITIZE" >"$gt_md"
      size=$(wc -c <"$gt_md")
      echo " $ext: $name → $size bytes ($gt_md)"
    else
      echo " $ext: $name FAILED conversion"
    fi
  done
}

# --- DOC → DOCX → GFM ---
echo "=== DOC ground truth generation ==="

doc_files=(
  "$REPO_ROOT/test_documents/vendored/unstructured/doc/simple.doc"
  "$REPO_ROOT/test_documents/vendored/unstructured/doc/fake.doc"
  "$REPO_ROOT/test_documents/vendored/unstructured/doc/duplicate-paragraphs.doc"
  "$REPO_ROOT/test_documents/vendored/unstructured/doc/fake-doc-emphasized-text.doc"
  "$REPO_ROOT/test_documents/doc/unit_test_lists.doc"
)

convert_group doc docx "${doc_files[@]}"

# --- PPT → PPTX → GFM ---
echo ""
echo "=== PPT ground truth generation ==="

ppt_files=(
  "$REPO_ROOT/test_documents/ppt/simple.ppt"
)

convert_group ppt pptx "${ppt_files[@]}"

# --- ODS: no pandoc support for spreadsheet input ---
echo ""
echo "=== ODS: skipped (pandoc cannot read spreadsheet formats) ==="
echo " Existing text GT in test_documents/ground_truth/ods/ is sufficient."

echo ""
echo "Done. Validate with:"
echo " cargo run --release -p benchmark-harness -- validate-gt --fixtures tools/benchmark-harness/fixtures/doc/"
echo " cargo run --release -p benchmark-harness -- validate-gt --fixtures tools/benchmark-harness/fixtures/"
|
||||
249
tools/benchmark-harness/scripts/generate_markdown_gt.py
Normal file
249
tools/benchmark-harness/scripts/generate_markdown_gt.py
Normal file
@@ -0,0 +1,249 @@
|
||||
#!/usr/bin/env -S uv run --no-project --script
|
||||
# /// script
|
||||
# requires-python = ">=3.10"
|
||||
# dependencies = ["google-genai>=1.0"]
|
||||
# ///
|
||||
"""Generate proper markdown ground truth from PDF documents using Gemini.
|
||||
|
||||
Reads benchmark fixture JSON files to locate PDFs, sends each to a Gemini model (default: gemini-2.0-flash, see --model)
|
||||
via Vertex AI, and saves the extracted markdown to the ground truth directory.
|
||||
|
||||
Usage:
|
||||
uv run tools/benchmark-harness/scripts/generate_markdown_gt.py [OPTIONS]
|
||||
|
||||
Examples:
|
||||
# Generate for all nougat + pdfa documents
|
||||
uv run tools/benchmark-harness/scripts/generate_markdown_gt.py
|
||||
|
||||
# Generate for a specific document
|
||||
uv run tools/benchmark-harness/scripts/generate_markdown_gt.py --filter nougat_001
|
||||
|
||||
# Dry run to see what would be processed
|
||||
uv run tools/benchmark-harness/scripts/generate_markdown_gt.py --dry-run
|
||||
|
||||
# Force regeneration of existing files
|
||||
uv run tools/benchmark-harness/scripts/generate_markdown_gt.py --force
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import signal
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
from google import genai
|
||||
from google.genai.types import GenerateContentConfig, Part
|
||||
|
||||
EXTRACTION_PROMPT = """\
|
||||
Extract the complete text content of this PDF document as clean Markdown.
|
||||
|
||||
Rules:
|
||||
- Use proper heading hierarchy (# for document title, ## for major sections, ### for subsections)
|
||||
- Render tables using markdown table syntax with | delimiters and --- separator row
|
||||
- Use numbered lists (1. 2. 3.) and bullet lists (- item) where the document uses them
|
||||
- Preserve emphasis: **bold** and *italic* where the original uses them
|
||||
- Use ``` code blocks for code snippets, formulas, or monospace content
|
||||
- Use <!-- image --> as a placeholder where figures or images appear
|
||||
- Omit page numbers, running headers/footers, and watermarks
|
||||
- Preserve the document's reading order
|
||||
- Do NOT invent or hallucinate content — only extract what is actually in the document
|
||||
- Do NOT wrap the output in a markdown code fence — return raw markdown directly
|
||||
- For multi-column layouts, read left column first, then right column
|
||||
- For forms with label-value pairs, use **Label:** Value format
|
||||
"""
|
||||
|
||||
|
||||
def get_repo_root() -> Path:
    """Locate the repository root by walking up from this script's directory.

    The root is the nearest ancestor containing both ``Cargo.toml`` and
    ``test_documents``. The filesystem root itself is never checked.

    Raises:
        RuntimeError: if no ancestor qualifies.
    """
    start = Path(__file__).resolve().parent
    for candidate in (start, *start.parents):
        if candidate == candidate.parent:
            # Reached the filesystem root; stop without testing it.
            break
        has_manifest = (candidate / "Cargo.toml").exists()
        if has_manifest and (candidate / "test_documents").exists():
            return candidate
    raise RuntimeError("Could not find repository root")
|
||||
|
||||
|
||||
def discover_fixtures(fixtures_dir: Path, name_filter: str | None = None) -> list[dict]:
    """Collect the PDF fixtures under ``fixtures_dir`` whose document exists on disk.

    Fixture files are visited in sorted path order. A fixture is kept when its
    ``file_type`` is ``"pdf"``, its stem contains ``name_filter`` (if given),
    and its ``document`` path (relative to the fixture file) exists. Fixture
    files that cannot be read or parsed are skipped silently.

    Returns:
        One dict per kept fixture with the keys ``name``, ``fixture_path``,
        ``doc_path`` (resolved) and ``fixture`` (the parsed JSON).
    """
    found = []
    for json_path in sorted(fixtures_dir.rglob("*.json")):
        try:
            with open(json_path) as handle:
                data = json.load(handle)
        except (json.JSONDecodeError, OSError):
            continue

        stem = json_path.stem
        if data.get("file_type") != "pdf" or (name_filter and name_filter not in stem):
            continue

        relative_doc = data.get("document", "")
        pdf_path = (json_path.parent / relative_doc).resolve() if relative_doc else None
        if pdf_path is None or not pdf_path.exists():
            continue

        found.append(
            {
                "name": stem,
                "fixture_path": json_path,
                "doc_path": pdf_path,
                "fixture": data,
            }
        )

    return found
|
||||
|
||||
|
||||
class _Timeout(Exception):
    """Raised by the SIGALRM handler when an API call exceeds its time limit."""

    pass
|
||||
|
||||
|
||||
def _timeout_handler(signum, frame):
    """Signal handler that turns an alarm into a ``_Timeout`` exception.

    Both arguments are the standard signal-handler parameters and are unused.
    """
    message = "API call timed out"
    raise _Timeout(message)
|
||||
|
||||
|
||||
def _strip_code_fence(text: str) -> str:
    """Remove a single wrapping markdown code fence from ``text``, if present.

    Surrounding whitespace is trimmed first, so a fence followed by a trailing
    newline (or preceded by blank lines) is still recognised.
    """
    text = text.strip()
    for opener in ("```markdown\n", "```md\n", "```\n"):
        if text.startswith(opener):
            return text[len(opener) :].removesuffix("\n```")
    return text


def generate_markdown(
    client: genai.Client,
    pdf_path: Path,
    model: str,
    timeout: int = 120,
) -> str:
    """Send PDF to Gemini and get markdown extraction.

    The PDF bytes and EXTRACTION_PROMPT are sent in one request. The call is
    bounded by a SIGALRM-based timer, and the previous SIGALRM handler is
    restored afterwards.

    Args:
        client: Gemini client used to issue the request.
        pdf_path: PDF file to extract.
        model: Model name passed to the API.
        timeout: Seconds to wait before the request is aborted.

    Returns:
        The extracted markdown with any wrapping code fence removed, stripped
        and terminated by exactly one newline.

    Raises:
        _Timeout: if the request does not finish within ``timeout`` seconds.
    """
    pdf_bytes = pdf_path.read_bytes()

    # NOTE(review): SIGALRM is only available on Unix and handlers can only be
    # installed from the main thread — confirm this is never called elsewhere.
    old_handler = signal.signal(signal.SIGALRM, _timeout_handler)
    signal.alarm(timeout)
    try:
        response = client.models.generate_content(
            model=model,
            contents=[
                Part.from_bytes(data=pdf_bytes, mime_type="application/pdf"),
                EXTRACTION_PROMPT,
            ],
            config=GenerateContentConfig(
                temperature=0.1,
                max_output_tokens=8192,
            ),
        )
    finally:
        # Always cancel the pending alarm and restore the previous handler.
        signal.alarm(0)
        signal.signal(signal.SIGALRM, old_handler)

    # Strip markdown code fence wrapper if Gemini added one
    text = _strip_code_fence(response.text or "")

    return text.strip() + "\n"
|
||||
|
||||
|
||||
def main() -> int:
    """Command-line entry point: generate markdown ground truth for PDF fixtures.

    Discovers PDF fixtures, skips those that already have a ``.md`` file
    (unless ``--force``) or exceed ``--max-size``, and writes one markdown
    file per remaining fixture to ``test_documents/ground_truth/pdf``. With
    ``--dry-run`` the API is never called and nothing is written.

    Returns:
        0 if no error was counted, otherwise 1.
    """
    parser = argparse.ArgumentParser(description="Generate markdown ground truth from PDFs using Gemini")
    parser.add_argument(
        "--filter", type=str, default=None, help="Only process fixtures whose name contains this string"
    )
    parser.add_argument("--dry-run", action="store_true", help="Show what would be processed without calling the API")
    parser.add_argument("--force", action="store_true", help="Regenerate even if .md file already exists")
    parser.add_argument(
        "--model", type=str, default="gemini-2.0-flash", help="Gemini model to use (default: gemini-2.0-flash)"
    )
    parser.add_argument("--project", type=str, default="boxwood-spirit-479620-r5", help="GCP project ID")
    parser.add_argument("--location", type=str, default="us-central1", help="Vertex AI location")
    parser.add_argument("--delay", type=float, default=1.0, help="Delay between API calls in seconds (rate limiting)")
    parser.add_argument("--timeout", type=int, default=120, help="Per-request timeout in seconds (default: 120)")
    parser.add_argument("--max-size", type=int, default=None, help="Skip PDFs larger than this many KB")
    args = parser.parse_args()

    repo_root = get_repo_root()
    fixtures_dir = repo_root / "tools" / "benchmark-harness" / "fixtures"
    gt_dir = repo_root / "test_documents" / "ground_truth" / "pdf"

    print(f"Repository root: {repo_root}")
    print(f"Fixtures dir: {fixtures_dir}")
    print(f"Output dir: {gt_dir}")
    print(f"Model: {args.model}")
    if args.dry_run:
        print("DRY RUN MODE\n")

    fixtures = discover_fixtures(fixtures_dir, args.filter)
    print(f"Found {len(fixtures)} PDF fixtures")

    # The client is only needed (and only created) when the API is really called.
    if not args.dry_run:
        client = genai.Client(
            vertexai=True,
            project=args.project,
            location=args.location,
        )

    stats = {"generated": 0, "skipped": 0, "errors": 0}

    for item in fixtures:
        name = item["name"]
        md_path = gt_dir / f"{name}.md"
        file_size_kb = item["doc_path"].stat().st_size / 1024

        if md_path.exists() and not args.force:
            stats["skipped"] += 1
            continue

        if args.max_size and file_size_kb > args.max_size:
            print(f" Skipping {name} ({file_size_kb:.0f} KB > {args.max_size} KB)")
            stats["skipped"] += 1
            continue

        if args.dry_run:
            print(f" [DRY] {name} ({file_size_kb:.0f} KB)")
            stats["generated"] += 1
            continue

        print(f" Processing {name} ({file_size_kb:.0f} KB)...", end=" ", flush=True)
        try:
            start = time.time()
            markdown = generate_markdown(client, item["doc_path"], args.model, timeout=args.timeout)
            elapsed = time.time() - start

            gt_dir.mkdir(parents=True, exist_ok=True)
            md_path.write_text(markdown, encoding="utf-8")

            # Quick quality check
            lines = markdown.strip().split("\n")
            headings = sum(1 for line in lines if line.startswith("#"))
            tables = sum(1 for line in lines if "|" in line and "---" not in line)
            print(f"OK ({elapsed:.1f}s, {len(lines)} lines, {headings} headings, {tables} table rows)")
            stats["generated"] += 1

            # Rate limiting between successful API calls
            time.sleep(args.delay)

        except _Timeout:
            print(f"TIMEOUT ({args.timeout}s)")
            stats["errors"] += 1
        except Exception as e:
            print(f"ERROR: {e}")
            stats["errors"] += 1

    print(f"\n{'=' * 50}")
    # In a dry run nothing is written, so label the count accordingly.
    generated_label = "Would generate" if args.dry_run else "Generated"
    print(f"{generated_label}: {stats['generated']}")
    # The skipped counter covers both existing outputs and over-size PDFs.
    print(f"Skipped: {stats['skipped']} (already exist or over --max-size)")
    print(f"Errors: {stats['errors']}")

    return 0 if stats["errors"] == 0 else 1
|
||||
|
||||
|
||||
# Run the CLI when executed as a script; main()'s return value is the exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|
||||
212
tools/benchmark-harness/scripts/generate_md_gt.sh
Normal file
212
tools/benchmark-harness/scripts/generate_md_gt.sh
Normal file
@@ -0,0 +1,212 @@
|
||||
#!/usr/bin/env bash
|
||||
# Generate markdown and text ground truth for docbook, typst, and fictionbook formats
|
||||
# using pandoc + sanitize_pandoc_gt.py, then create benchmark fixture JSON files.
|
||||
set -euo pipefail
|
||||
|
||||
REPO_ROOT="$(cd "$(dirname "$0")/../../.." && pwd)"
|
||||
SANITIZE="$REPO_ROOT/tools/benchmark-harness/scripts/sanitize_pandoc_gt.py"
|
||||
FIXTURES_DIR="$REPO_ROOT/tools/benchmark-harness/fixtures"
|
||||
|
||||
cd "$REPO_ROOT"
|
||||
|
||||
echo "=== Step 1: Generate MD ground truth via pandoc + sanitize ==="

# --- DocBook ---
# Every DocBook source (any of the four extensions) is converted to GFM, piped
# through the sanitizer and written to ground_truth/docbook/<name>.md.
# An existing .md is always overwritten.
echo "--- DocBook ---"
for f in test_documents/docbook/*.dbk test_documents/docbook/*.docbook test_documents/docbook/*.docbook4 test_documents/docbook/*.docbook5; do
  # A glob that matches nothing is left as a literal pattern; skip it.
  [ -f "$f" ] || continue
  # Strip only the last extension, whichever one it is.
  name=$(basename "$f" | sed 's/\.[^.]*$//')
  mkdir -p test_documents/ground_truth/docbook
  # pandoc's stderr is discarded; with `set -o pipefail` a pandoc failure still aborts the script.
  pandoc -f docbook -t gfm --wrap=none "$f" 2>/dev/null | python3 "$SANITIZE" >"test_documents/ground_truth/docbook/${name}.md"
  echo "docbook: $name ($(wc -c <"test_documents/ground_truth/docbook/${name}.md") bytes)"
done

# --- Typst ---
echo "--- Typst ---"
for f in test_documents/typst/*.typ; do
  [ -f "$f" ] || continue
  name=$(basename "$f" .typ)
  # Typst GT goes in both typ/ (matching existing convention) and typst/
  for gtdir in test_documents/ground_truth/typ test_documents/ground_truth/typst; do
    mkdir -p "$gtdir"
    pandoc -f typst -t gfm --wrap=none "$f" 2>/dev/null | python3 "$SANITIZE" >"${gtdir}/${name}.md"
  done
  # Only the typ/ copy is measured for the progress line.
  echo "typst: $name ($(wc -c <"test_documents/ground_truth/typ/${name}.md") bytes)"
done

# --- FictionBook (fb2) ---
# Unlike the two loops above, an existing fb2 ground-truth file is kept as-is.
echo "--- FictionBook ---"
for f in test_documents/fictionbook/*.fb2; do
  [ -f "$f" ] || continue
  name=$(basename "$f" .fb2)
  mkdir -p test_documents/ground_truth/fb2
  existing="test_documents/ground_truth/fb2/${name}.md"
  if [ ! -f "$existing" ]; then
    pandoc -f fb2 -t gfm --wrap=none "$f" 2>/dev/null | python3 "$SANITIZE" >"$existing"
    echo "fb2: $name (new, $(wc -c <"$existing") bytes)"
  else
    echo "fb2: $name (exists, $(wc -c <"$existing") bytes)"
  fi
done
|
||||
|
||||
echo ""
|
||||
echo "=== Step 2: Generate text GT from MD GT ==="

# For each .md GT file, generate .txt if missing
# (only docbook/, typ/ and fb2/ are scanned; the duplicate typst/ directory is not).
for md_file in test_documents/ground_truth/docbook/*.md test_documents/ground_truth/typ/*.md test_documents/ground_truth/fb2/*.md; do
  # A glob that matches nothing is left as a literal pattern; skip it.
  [ -f "$md_file" ] || continue
  # Same path with the .md suffix swapped for .txt.
  txt_file="${md_file%.md}.txt"
  if [ ! -f "$txt_file" ]; then
    pandoc -f gfm -t plain --wrap=none "$md_file" >"$txt_file"
    echo "text: $(basename "$txt_file") (new, $(wc -c <"$txt_file") bytes)"
  fi
done
|
||||
|
||||
echo ""
|
||||
echo "=== Step 3: Create fixture JSON files ==="
|
||||
|
||||
# Helper to create fixture JSON
|
||||
# create_fixture DOC_PATH FILE_TYPE GT_TEXT GT_MD FIXTURE_OUT DESCRIPTION CATEGORY
#
# Write a benchmark fixture JSON file to FIXTURE_OUT describing DOC_PATH.
# All paths are repo-root-relative; inside the JSON they are prefixed with
# "../../../". The "markdown_file" entry is emitted only when GT_MD exists.
# When GT_TEXT does not exist no fixture is written: a warning goes to stderr
# and the function returns 0 so the calling loop continues.
create_fixture() {
  local doc_path="$1"
  local file_type="$2"
  local gt_text="$3"
  local gt_md="$4"
  local fixture_out="$5"
  local description="$6"
  local category="$7"

  # Without text ground truth there is nothing to describe. Previously this
  # case dereferenced an unset variable and aborted the script under `set -u`.
  if [ ! -f "$gt_text" ]; then
    echo "WARNING: no text ground truth for ${doc_path} (${gt_text}); skipping fixture" >&2
    return 0
  fi

  # `wc -c` is portable. `stat -f %z` is BSD-only: GNU stat treats -f as
  # "filesystem status", prints unrelated output and then fails, which used to
  # leave garbage in front of the size.
  local file_size
  file_size=$(wc -c <"$doc_path" | tr -d '[:space:]')

  # Compute relative paths from fixtures dir
  local rel_doc="../../../${doc_path}"
  local rel_text="../../../${gt_text}"
  local rel_md="../../../${gt_md}"

  # Optional extra line inside "ground_truth"; empty when there is no markdown GT.
  local markdown_entry=""
  if [ -f "$gt_md" ]; then
    markdown_entry=$'\n'"    \"markdown_file\": \"${rel_md}\","
  fi

  cat >"$fixture_out" <<EOJSON
{
  "document": "${rel_doc}",
  "file_type": "${file_type}",
  "file_size": ${file_size},
  "expected_frameworks": ["kreuzberg"],
  "metadata": {
    "description": "${description}",
    "category": "${category}"
  },
  "ground_truth": {
    "text_file": "${rel_text}",${markdown_entry}
    "source": "pandoc"
  }
}
EOJSON

  echo "fixture: $(basename "$fixture_out")"
}
|
||||
|
||||
# --- DocBook fixtures ---
# One fixture per DocBook source, named docbook_<name>.json with '-' mapped to '_'.
echo "--- DocBook fixtures ---"
for f in test_documents/docbook/*.dbk test_documents/docbook/*.docbook test_documents/docbook/*.docbook4 test_documents/docbook/*.docbook5; do
  # A glob that matches nothing is left as a literal pattern; skip it.
  [ -f "$f" ] || continue
  name=$(basename "$f" | sed 's/\.[^.]*$//')
  # Everything after the last dot.
  ext=$(basename "$f" | sed 's/.*\.//')
  gt_md="test_documents/ground_truth/docbook/${name}.md"
  gt_txt="test_documents/ground_truth/docbook/${name}.txt"

  # Determine file_type based on extension
  # (.dbk keeps its own type; every other extension is reported as "docbook")
  case "$ext" in
  dbk) ft="dbk" ;;
  docbook | docbook4 | docbook5) ft="docbook" ;;
  *) ft="docbook" ;;
  esac

  fixture_name="docbook_$(echo "$name" | tr '-' '_').json"
  create_fixture "$f" "$ft" "$gt_txt" "$gt_md" "${FIXTURES_DIR}/${fixture_name}" "DocBook document: ${name}" "docbook"
done

# --- Typst fixtures (update existing to add markdown_file) ---
echo "--- Typst fixtures ---"
for f in test_documents/typst/*.typ; do
  [ -f "$f" ] || continue
  name=$(basename "$f" .typ)
  gt_md="test_documents/ground_truth/typ/${name}.md"
  gt_txt="test_documents/ground_truth/typ/typst_${name}.txt"
  # Some txt files use name directly, some use typst_ prefix - check both
  if [ ! -f "$gt_txt" ]; then
    gt_txt="test_documents/ground_truth/typ/${name}.txt"
  fi

  fixture_name="typst_${name}.json"
  create_fixture "$f" "typ" "$gt_txt" "$gt_md" "${FIXTURES_DIR}/${fixture_name}" "Typst document: ${name}" "typst"
done

# --- FictionBook fixtures (update existing to add markdown_file) ---
echo "--- FictionBook fixtures ---"
for f in test_documents/fictionbook/*.fb2; do
  [ -f "$f" ] || continue
  name=$(basename "$f" .fb2)
  gt_md="test_documents/ground_truth/fb2/${name}.md"
  gt_txt="test_documents/ground_truth/fb2/${name}.txt"
  # Some txt files use fb2_ prefix
  if [ ! -f "$gt_txt" ]; then
    gt_txt="test_documents/ground_truth/fb2/fb2_${name}.txt"
  fi

  fixture_name="fb2_${name}.json"
  create_fixture "$f" "fb2" "$gt_txt" "$gt_md" "${FIXTURES_DIR}/${fixture_name}" "FictionBook document: ${name}" "fictionbook"
done
|
||||
|
||||
echo ""
|
||||
echo "=== Step 4: Validate ==="

# Flag markdown ground-truth files of at most one byte (e.g. a lone newline).
# This only warns; it never fails the run.
echo "--- Verifying GT files are non-empty ---"
empty_count=0
for f in test_documents/ground_truth/docbook/*.md test_documents/ground_truth/typ/*.md test_documents/ground_truth/fb2/*.md; do
  # A glob that matches nothing is left as a literal pattern; skip it.
  [ -f "$f" ] || continue
  size=$(wc -c <"$f" | tr -d ' ')
  if [ "$size" -le 1 ]; then
    echo "WARNING: $f is empty/near-empty ($size bytes)"
    empty_count=$((empty_count + 1))
  fi
done
echo "Empty/near-empty GT files: $empty_count"

echo ""
echo "=== Summary ==="
# Each count is a command substitution inside an echo argument, so a failing
# `find` (no matching files) yields 0 instead of aborting the script.
echo "DocBook MD GT files: $(find test_documents/ground_truth/docbook/*.md -maxdepth 1 2>/dev/null | wc -l | tr -d ' ')"
echo "DocBook TXT GT files: $(find test_documents/ground_truth/docbook/*.txt -maxdepth 1 2>/dev/null | wc -l | tr -d ' ')"
echo "Typst MD GT files: $(find test_documents/ground_truth/typ/*.md -maxdepth 1 2>/dev/null | wc -l | tr -d ' ')"
echo "Typst TXT GT files: $(find test_documents/ground_truth/typ/*.txt -maxdepth 1 2>/dev/null | wc -l | tr -d ' ')"
echo "FB2 MD GT files: $(find test_documents/ground_truth/fb2/*.md -maxdepth 1 2>/dev/null | wc -l | tr -d ' ')"
echo "FB2 TXT GT files: $(find test_documents/ground_truth/fb2/*.txt -maxdepth 1 2>/dev/null | wc -l | tr -d ' ')"
echo ""
echo "Fixture files created/updated:"
# ls exits non-zero as soon as one pattern matches nothing (e.g. dbk_*.json).
# As the last command under `set -e`, that would make a successful run exit
# with a failure status, so the listing is explicitly best-effort.
ls -1 "${FIXTURES_DIR}"/docbook_*.json "${FIXTURES_DIR}"/typst_*.json "${FIXTURES_DIR}"/fb2_*.json "${FIXTURES_DIR}"/dbk_*.json 2>/dev/null || true
|
||||
212
tools/benchmark-harness/scripts/generate_pdf_gt_mistral.py
Normal file
212
tools/benchmark-harness/scripts/generate_pdf_gt_mistral.py
Normal file
@@ -0,0 +1,212 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Generate PDF markdown ground truth using Mistral's pixtral vision model.
|
||||
|
||||
Usage:
|
||||
# Generate GT for all PDFs missing MD GT:
|
||||
python generate_pdf_gt_mistral.py
|
||||
|
||||
# Generate GT for a specific fixture:
|
||||
python generate_pdf_gt_mistral.py tools/benchmark-harness/fixtures/pdf/2203.01017v2.json
|
||||
|
||||
# Dry run (show what would be generated):
|
||||
python generate_pdf_gt_mistral.py --dry-run
|
||||
|
||||
# Pilot batch (first N):
|
||||
python generate_pdf_gt_mistral.py --limit 10
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import base64
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
MISTRAL_API_KEY = os.environ.get("MISTRAL_API_KEY", "")
|
||||
MISTRAL_MODEL = "mistral-ocr-latest"
|
||||
MISTRAL_API_URL = "https://api.mistral.ai/v1/ocr"
|
||||
|
||||
PROMPT = (
|
||||
"Convert this PDF to clean GFM (GitHub Flavored Markdown). "
|
||||
"Preserve the document structure: headings, paragraphs, tables, lists, "
|
||||
"code blocks, and formulas. Use proper heading hierarchy (# for title, ## for sections). "
|
||||
"Render tables as GFM pipe tables. Do not add commentary or explanations."
|
||||
)
|
||||
|
||||
|
||||
def load_env():
    """Make sure the module-level ``MISTRAL_API_KEY`` is set.

    If the key was already picked up from the environment nothing happens.
    Otherwise the file ``<parents[3] of this script>/../liter-llm/.env`` is
    scanned for a ``MISTRAL_API_KEY=...`` line. A leading ``export`` and one
    pair of surrounding single or double quotes are tolerated, and an empty
    value is treated as "not found" rather than accepted as a key.

    Exits the process with status 1 (after printing to stderr) when no
    non-empty key can be found.
    """
    global MISTRAL_API_KEY
    if MISTRAL_API_KEY:
        return

    env_path = Path(__file__).resolve().parents[3] / ".." / "liter-llm" / ".env"
    if env_path.exists():
        for raw_line in env_path.read_text(encoding="utf-8").splitlines():
            line = raw_line.strip()
            # Accept shell-style `export KEY=value` lines as well.
            if line.startswith("export "):
                line = line[len("export "):].lstrip()
            if not line.startswith("MISTRAL_API_KEY="):
                continue
            value = line.split("=", 1)[1].strip()
            # KEY="value" / KEY='value': the quotes are not part of the secret.
            if len(value) >= 2 and value[0] == value[-1] and value[0] in "\"'":
                value = value[1:-1]
            if value:
                MISTRAL_API_KEY = value
                return

    print("ERROR: MISTRAL_API_KEY not found", file=sys.stderr)
    sys.exit(1)
|
||||
|
||||
|
||||
def call_mistral_ocr(pdf_path: str) -> str:
    """Upload one PDF to the configured Mistral OCR endpoint and return its markdown.

    The file is embedded in the request as a base64 ``data:`` URL. The
    per-page ``markdown`` fields of the response are joined with blank lines.
    Returns an empty string when the response carries no pages.

    Raises whatever ``httpx`` raises for transport errors or, via
    ``raise_for_status()``, for non-success HTTP responses.
    """
    import httpx

    encoded_pdf = base64.standard_b64encode(Path(pdf_path).read_bytes()).decode("ascii")

    request_headers = {
        "Authorization": f"Bearer {MISTRAL_API_KEY}",
        "Content-Type": "application/json",
    }
    request_body = {
        "model": MISTRAL_MODEL,
        "document": {
            "type": "document_url",
            "document_url": f"data:application/pdf;base64,{encoded_pdf}",
        },
    }

    response = httpx.post(
        MISTRAL_API_URL,
        json=request_body,
        headers=request_headers,
        timeout=120.0,
    )
    response.raise_for_status()

    # Collect the markdown of every page, in response order.
    page_entries = response.json().get("pages", [])
    if not page_entries:
        return ""
    page_markdown = [entry.get("markdown", "") for entry in page_entries]
    return "\n\n".join(page_markdown)
|
||||
|
||||
|
||||
def find_fixtures_needing_gt() -> list[tuple[str, str, str]]:
    """Find PDF fixtures that don't have markdown GT.

    Scans ``tools/benchmark-harness/fixtures/pdf/*.json`` relative to the
    current working directory, in sorted order. A fixture is skipped when it
    has no ``ground_truth`` object, already names a ``markdown_file``, or its
    document does not exist on disk.

    The markdown target sits next to the fixture's ``text_file`` with the
    final suffix replaced by ``.md``. When there is no ``text_file`` it
    defaults to ``test_documents/ground_truth/pdf/<document stem>.md``, four
    levels above the fixture directory.

    Returns list of (fixture_path, pdf_path, gt_md_path); the last two are
    resolved absolute paths.
    """
    fixtures_dir = Path("tools/benchmark-harness/fixtures/pdf")
    results = []

    for f in sorted(fixtures_dir.glob("*.json")):
        data = json.loads(f.read_text(encoding="utf-8"))
        gt = data.get("ground_truth")
        if gt is None:
            continue
        if gt.get("markdown_file"):
            continue  # Already has MD GT

        doc_path = data.get("document", "")
        pdf_path = str((f.parent / doc_path).resolve())
        if not Path(pdf_path).exists():
            continue

        # Determine GT output path
        text_file = gt.get("text_file", "")
        if text_file:
            # with_suffix only touches the final path component, so an
            # extension-less text_file such as "../../gt/doc" is not split
            # at one of the dots of a "../" segment.
            gt_md = str(Path(text_file).with_suffix(".md"))
        else:
            name = Path(doc_path).stem
            gt_md = f"../../../../test_documents/ground_truth/pdf/{name}.md"

        gt_md_path = str((f.parent / gt_md).resolve())
        results.append((str(f), pdf_path, gt_md_path))

    return results
|
||||
|
||||
|
||||
def process_fixture(fixture_path: str, pdf_path: str, gt_md_path: str, dry_run: bool = False) -> bool:
    """Process a single fixture. Returns True if successful.

    With ``dry_run`` set, only a line describing the planned output is printed
    and nothing is written. Otherwise the PDF is sent to ``call_mistral_ocr``,
    the markdown is passed through ``sanitize_pandoc_gt.sanitize`` and written
    to ``gt_md_path``, and the fixture JSON gets ``ground_truth.markdown_file``
    (relative to the fixture's directory) and ``ground_truth.source`` set.

    Any exception raised during OCR or writing is reported on stdout and turned
    into a ``False`` return. An empty OCR result also returns ``False``.
    The initial ``stat()`` of ``pdf_path`` sits outside that handler, so a
    missing PDF raises.
    """
    name = Path(pdf_path).stem
    size_mb = Path(pdf_path).stat().st_size / (1024 * 1024)

    if dry_run:
        print(f" [dry-run] {name} ({size_mb:.1f}MB) → {gt_md_path}")
        return True

    print(f" Processing {name} ({size_mb:.1f}MB)...", end=" ", flush=True)

    try:
        markdown = call_mistral_ocr(pdf_path)
        if not markdown.strip():
            print("EMPTY")
            return False

        # Sanitize
        from sanitize_pandoc_gt import sanitize

        markdown = sanitize(markdown)

        # Write GT file. The encoding is explicit so OCR output with non-ASCII
        # characters is stored identically on every platform.
        gt_file = Path(gt_md_path)
        gt_file.parent.mkdir(parents=True, exist_ok=True)
        gt_file.write_text(markdown, encoding="utf-8")

        # Update fixture JSON
        fixture_file = Path(fixture_path)
        data = json.loads(fixture_file.read_text(encoding="utf-8"))
        gt = data["ground_truth"]
        # Compute relative path from fixture to GT
        rel_path = os.path.relpath(gt_md_path, fixture_file.parent)
        gt["markdown_file"] = rel_path
        gt["source"] = "mistral-pixtral"
        fixture_file.write_text(json.dumps(data, indent=2) + "\n", encoding="utf-8")

        # Report the size actually written, not the character count.
        print(f"OK ({len(markdown.encode('utf-8'))} bytes)")
        return True

    except Exception as e:
        print(f"ERROR: {e}")
        return False
|
||||
|
||||
|
||||
def main():
    """Command-line entry point.

    With a positional fixture path, only that fixture is processed. Otherwise
    every fixture returned by ``find_fixtures_needing_gt()`` is processed,
    optionally capped by ``--limit``, sleeping ``--delay`` seconds between
    real API calls. ``--dry-run`` only prints what would be generated.
    """
    parser = argparse.ArgumentParser(description="Generate PDF GT with Mistral OCR")
    parser.add_argument("fixture", nargs="?", help="Specific fixture JSON to process")
    parser.add_argument("--dry-run", action="store_true", help="Show what would be done")
    parser.add_argument("--limit", type=int, default=0, help="Process only first N fixtures")
    parser.add_argument("--delay", type=float, default=1.0, help="Delay between API calls (seconds)")
    args = parser.parse_args()

    # A dry run never contacts the API, so it must not fail for lack of a key.
    if not args.dry_run:
        load_env()

    if args.fixture:
        # Process single fixture
        fixture = Path(args.fixture)
        data = json.loads(fixture.read_text(encoding="utf-8"))
        doc_path = data.get("document", "")
        pdf_path = str((fixture.parent / doc_path).resolve())
        # `or {}` also covers an explicit `"ground_truth": null`.
        gt = data.get("ground_truth") or {}
        text_file = gt.get("text_file", "")
        if text_file:
            # Replace only the final component's suffix; never split inside "../".
            gt_md = str(Path(text_file).with_suffix(".md"))
        else:
            gt_md = f"../../../../test_documents/ground_truth/pdf/{Path(doc_path).stem}.md"
        gt_md_path = str((fixture.parent / gt_md).resolve())
        process_fixture(args.fixture, pdf_path, gt_md_path, dry_run=args.dry_run)
        return

    # Process all fixtures needing GT
    fixtures = find_fixtures_needing_gt()
    print(f"Found {len(fixtures)} PDF fixtures needing markdown GT")

    if args.limit > 0:
        fixtures = fixtures[: args.limit]
        print(f"Processing first {args.limit}")

    success = 0
    failed = 0
    for fixture_path, pdf_path, gt_md_path in fixtures:
        ok = process_fixture(fixture_path, pdf_path, gt_md_path, dry_run=args.dry_run)
        if ok:
            success += 1
        else:
            failed += 1
        # Throttle only real API traffic.
        if not args.dry_run and args.delay > 0:
            time.sleep(args.delay)

    print(f"\nDone: {success} generated, {failed} failed")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
172
tools/benchmark-harness/scripts/generate_vendored_baselines.py
Normal file
172
tools/benchmark-harness/scripts/generate_vendored_baselines.py
Normal file
@@ -0,0 +1,172 @@
|
||||
# /// script
|
||||
# requires-python = ">=3.11"
|
||||
# dependencies = [
|
||||
# "paddleocr>=3.4.0",
|
||||
# "paddlepaddle>=3.3.0",
|
||||
# "rapidocr-onnxruntime>=1.4.0",
|
||||
# "pymupdf>=1.24.0",
|
||||
# "pillow>=10.0.0",
|
||||
# "numpy>=1.24.0",
|
||||
# ]
|
||||
# ///
|
||||
"""Generate vendored OCR baselines from PaddleOCR Python and RapidOCR.
|
||||
|
||||
Usage:
|
||||
uv run tools/benchmark-harness/scripts/generate_vendored_baselines.py
|
||||
uv run tools/benchmark-harness/scripts/generate_vendored_baselines.py rapidocr
|
||||
uv run tools/benchmark-harness/scripts/generate_vendored_baselines.py --force
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
import fitz
|
||||
import numpy as np
|
||||
|
||||
FIXTURES_DIR = Path(__file__).resolve().parent.parent / "fixtures"
|
||||
VENDORED_DIR = Path(__file__).resolve().parent.parent / "vendored"
|
||||
|
||||
OCR_FIXTURES = [
|
||||
"pdf_image_only_german",
|
||||
"pdf_non_searchable",
|
||||
"pdf_ocr_rotated_270",
|
||||
"pdf_ocr_rotated_90",
|
||||
"pdf_ocr_rotated",
|
||||
"pdf_ocr_test",
|
||||
"pdf_scanned_ocr",
|
||||
]
|
||||
|
||||
|
||||
def pdf_to_images(pdf_path: str, dpi: int = 300) -> list[np.ndarray]:
    """Convert PDF pages to numpy arrays (RGB, HWC).

    Each page is rendered through PyMuPDF with a uniform scale of ``dpi / 72``,
    round-tripped through PNG into a PIL image, converted to RGB and returned
    as one array per page, in page order. The document is closed even when
    rendering a page raises.
    """
    import io

    from PIL import Image

    # The scale is the same for every page, so build the matrix once.
    zoom = dpi / 72
    mat = fitz.Matrix(zoom, zoom)

    images = []
    doc = fitz.open(pdf_path)
    try:
        for page in doc:
            pix = page.get_pixmap(matrix=mat)
            img = Image.open(io.BytesIO(pix.tobytes("png"))).convert("RGB")
            images.append(np.array(img))
    finally:
        doc.close()
    return images
|
||||
|
||||
|
||||
def lines_to_markdown(lines: list[str]) -> str:
    """Render OCR lines as markdown, one paragraph per non-blank line.

    Lines are stripped, blank ones are dropped, and the rest are separated by
    an empty line with a single trailing newline. No usable line at all yields
    the empty string.
    """
    cleaned: list[str] = []
    for raw in lines:
        text = raw.strip()
        if text:
            cleaned.append(text)

    if not cleaned:
        return ""
    return "\n\n".join(cleaned) + "\n"
|
||||
|
||||
|
||||
def run_paddleocr_python(pdf_path: str) -> tuple[str, float]:
    """Run PaddleOCR Python v3.4+ using the predict() API.

    Returns ``(markdown, elapsed_ms)``. The timer covers only the ``predict``
    calls and text collection; model construction and PDF rasterisation happen
    before it starts.
    """
    os.environ["PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK"] = "True"
    from paddleocr import PaddleOCR

    ocr = PaddleOCR(use_textline_orientation=True, lang="en")
    images = pdf_to_images(pdf_path)

    start = time.monotonic()
    all_lines: list[str] = []
    for img in images:
        # predict() returns list of OCRResult (dict-like) objects
        for result in ocr.predict(img):
            # PaddleOCR 3.x stores the recognized strings under "rec_texts".
            # The singular "rec_text" read previously is kept only as a
            # fallback so older result layouts keep working.
            rec_texts = result.get("rec_texts")
            if rec_texts is None:
                rec_texts = result.get("rec_text", [])
            # A bare string is treated as a single recognized line.
            if isinstance(rec_texts, str):
                rec_texts = [rec_texts]
            if isinstance(rec_texts, (list, tuple)):
                for t in rec_texts:
                    text = str(t).strip()
                    if text:
                        all_lines.append(text)
    elapsed_ms = (time.monotonic() - start) * 1000

    return lines_to_markdown(all_lines), elapsed_ms
|
||||
|
||||
|
||||
def run_rapidocr(pdf_path: str) -> tuple[str, float]:
    """Run RapidOCR over every page of a PDF.

    Returns ``(markdown, elapsed_ms)``. Only the recognition loop is timed;
    engine construction and PDF rasterisation happen before the timer starts.
    """
    from rapidocr_onnxruntime import RapidOCR

    engine = RapidOCR()
    page_images = pdf_to_images(pdf_path)

    started = time.monotonic()
    collected: list[str] = []
    for page_image in page_images:
        detections, _ = engine(page_image)
        # A falsy result means nothing was recognised on this page.
        if not detections:
            continue
        for entry in detections:
            # The recognised text is read from index 1 of each entry.
            if not entry or len(entry) < 2:
                continue
            recognised = str(entry[1]).strip()
            if recognised:
                collected.append(recognised)
    elapsed_ms = (time.monotonic() - started) * 1000

    return lines_to_markdown(collected), elapsed_ms
|
||||
|
||||
|
||||
def save_vendored(pipeline_name: str, fixture_name: str, md: str, time_ms: float):
    """Persist one pipeline's output for one fixture under ``VENDORED_DIR``.

    Writes ``<pipeline>/md/<fixture>.md`` with the markdown and
    ``<pipeline>/timing/<fixture>.ms`` with the elapsed time in milliseconds
    (one decimal place). Directories are created as needed. Both files are
    written as UTF-8 so non-ASCII OCR text is stored the same way on every
    platform.
    """
    md_dir = VENDORED_DIR / pipeline_name / "md"
    timing_dir = VENDORED_DIR / pipeline_name / "timing"
    md_dir.mkdir(parents=True, exist_ok=True)
    timing_dir.mkdir(parents=True, exist_ok=True)
    (md_dir / f"{fixture_name}.md").write_text(md, encoding="utf-8")
    (timing_dir / f"{fixture_name}.ms").write_text(f"{time_ms:.1f}\n", encoding="utf-8")
|
||||
|
||||
|
||||
def main():
    """Generate vendored baselines for every fixture in ``OCR_FIXTURES``.

    The first non-``--`` argument, if any, restricts the run to that pipeline;
    an unknown name exits with status 1. ``--force`` regenerates outputs that
    already exist. Missing fixtures or documents are skipped with a message.
    A failing pipeline is reported with a traceback, and the run continues.
    """
    runners = {
        "paddleocr-python": run_paddleocr_python,
        "rapidocr": run_rapidocr,
    }

    overwrite = "--force" in sys.argv
    positional = [arg for arg in sys.argv[1:] if not arg.startswith("--")]

    if positional:
        choice = positional[0]
        if choice not in runners:
            print(f"Unknown: {choice}. Choose: {list(runners.keys())}")
            sys.exit(1)
        runners = {choice: runners[choice]}

    for fixture_name in OCR_FIXTURES:
        fixture_file = FIXTURES_DIR / f"{fixture_name}.json"
        if not fixture_file.exists():
            print(f" SKIP {fixture_name}: fixture not found")
            continue

        with open(fixture_file) as handle:
            spec = json.load(handle)

        # The document path is joined onto FIXTURES_DIR before resolving.
        document = str((FIXTURES_DIR / spec["document"]).resolve())
        if not os.path.exists(document):
            print(f" SKIP {fixture_name}: document not found")
            continue

        for pipeline_name, runner in runners.items():
            cached = VENDORED_DIR / pipeline_name / "md" / f"{fixture_name}.md"
            # An empty cached file does not count as cached.
            if not overwrite and cached.exists() and cached.stat().st_size > 0:
                print(f" CACHED {pipeline_name}/{fixture_name}")
                continue

            print(f" RUN {pipeline_name}/{fixture_name} ...", end="", flush=True)
            try:
                markdown, duration_ms = runner(document)
                save_vendored(pipeline_name, fixture_name, markdown, duration_ms)
                print(f" {duration_ms:.0f}ms, {len(markdown)} chars")
            except Exception as exc:
                print(f" ERROR: {exc}")
                import traceback

                traceback.print_exc()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
7
tools/benchmark-harness/scripts/go.mod
Normal file
7
tools/benchmark-harness/scripts/go.mod
Normal file
@@ -0,0 +1,7 @@
|
||||
module github.com/kreuzberg-dev/kreuzberg/tools/benchmark-harness/scripts
|
||||
|
||||
go 1.23
|
||||
|
||||
require github.com/kreuzberg-dev/kreuzberg/packages/go/v4 v4.9.5
|
||||
|
||||
replace github.com/kreuzberg-dev/kreuzberg/packages/go/v4 => ../../../packages/go/v4
|
||||
407
tools/benchmark-harness/scripts/import_omnidocbench.py
Normal file
407
tools/benchmark-harness/scripts/import_omnidocbench.py
Normal file
@@ -0,0 +1,407 @@
|
||||
"""Import OmniDocBench dataset into our benchmark fixture format.
|
||||
|
||||
Converts OmniDocBench's element-level JSON annotations into:
|
||||
- Per-document fixture JSON files (tools/benchmark-harness/fixtures/pdf/omnidoc_NNN.json)
|
||||
- Ground truth markdown files (test_documents/ground_truth/pdf/omnidoc_NNN.md)
|
||||
- Ground truth text files (test_documents/ground_truth/pdf/omnidoc_NNN.txt)
|
||||
|
||||
OmniDocBench groups pages by document. Each multi-page document produces one fixture.
|
||||
Single-page documents produce one fixture per page.
|
||||
|
||||
Usage:
|
||||
python import_omnidocbench.py <omnidocbench_dir> <repo_root>
|
||||
|
||||
Where:
|
||||
omnidocbench_dir = tools/benchmark-harness/datasets/omnidocbench (contains OmniDocBench.json + ori_pdfs/)
|
||||
repo_root = repository root (contains tools/ and test_documents/)
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import html
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
from collections import defaultdict
|
||||
from pathlib import Path
|
||||
|
||||
# OmniDocBench category types that map to content we want in ground truth
|
||||
CONTENT_CATEGORIES = {
|
||||
"title",
|
||||
"text_block",
|
||||
"table",
|
||||
"equation_isolated",
|
||||
"code_txt",
|
||||
"figure_caption",
|
||||
"table_caption",
|
||||
"equation_caption",
|
||||
"code_txt_caption",
|
||||
"reference",
|
||||
}
|
||||
|
||||
# Categories to skip (page furniture, figures without text, etc.)
|
||||
SKIP_CATEGORIES = {
|
||||
"header",
|
||||
"footer",
|
||||
"page_number",
|
||||
"page_footnote",
|
||||
"abandon",
|
||||
"figure",
|
||||
"figure_footnote",
|
||||
"table_footnote",
|
||||
}
|
||||
|
||||
|
||||
def html_table_to_markdown(html_str: str) -> str:
    """Convert a simple HTML table to markdown table format.

    Rows are taken from ``<tr>`` elements and cells from ``<td>``/``<th>``.
    Nested tags inside a cell are dropped, entities are decoded, and line
    breaks inside a cell are collapsed to a single space so each table row
    stays on one line. Short rows are padded with empty cells, and the first
    row becomes the header.

    Returns ``""`` for empty input and the entity-decoded input unchanged when
    no row can be found.
    """
    if not html_str:
        return ""

    def parse_rows(markup: str, decode_cells: bool) -> list[list[str]]:
        """Extract cell texts per row; optionally decode entities per cell."""
        parsed: list[list[str]] = []
        for row_match in re.finditer(r"<tr[^>]*>(.*?)</tr>", markup, re.DOTALL):
            cells: list[str] = []
            for cell_match in re.finditer(r"<t[dh][^>]*>(.*?)</t[dh]>", row_match.group(1), re.DOTALL):
                cell_text = re.sub(r"<[^>]+>", "", cell_match.group(1))
                if decode_cells:
                    cell_text = html.unescape(cell_text)
                # A raw newline would split the markdown row in two.
                cells.append(re.sub(r"\s*\n\s*", " ", cell_text.strip()))
            if cells:
                parsed.append(cells)
        return parsed

    # Strip tags BEFORE decoding entities: decoding first turns text such as
    # "1 &lt; 2 &gt; 0" into "1 < 2 > 0", whose "< 2 >" was then removed as if
    # it were a tag.
    rows = parse_rows(html_str, decode_cells=True)

    unescaped = html.unescape(html_str)
    if not rows:
        # Input whose markup itself is entity-escaped: decode first, as before.
        rows = parse_rows(unescaped, decode_cells=False)
    if not rows:
        return unescaped  # fallback: return raw if parsing fails

    # Normalize column count
    max_cols = max(len(r) for r in rows)
    for row in rows:
        row.extend([""] * (max_cols - len(row)))

    # Build markdown table: header row, separator, data rows
    lines = ["| " + " | ".join(rows[0]) + " |", "|" + "|".join(["---"] * max_cols) + "|"]
    for row in rows[1:]:
        lines.append("| " + " | ".join(row) + " |")

    return "\n".join(lines)
|
||||
|
||||
|
||||
def annotation_to_markdown(ann: dict) -> str | None:
    """Render one OmniDocBench annotation as a markdown block.

    Returns ``None`` for categories in ``SKIP_CATEGORIES``, for annotations
    flagged ``ignore``, and whenever there is nothing to emit. Titles become
    H2 headings, tables prefer their ``html`` field, isolated equations prefer
    their ``latex`` field (wrapped in ``$$``), code is fenced, and every other
    category falls back to its stripped ``text``.
    """
    category = ann.get("category_type", "")
    if category in SKIP_CATEGORIES or ann.get("ignore", False):
        return None

    body = ann.get("text", "").strip()
    fallback = body or None

    if category == "title":
        # The dataset carries no heading level, so every title is rendered as H2.
        return f"## {body}" if body else None

    if category == "code_txt":
        return f"```\n{body}\n```" if body else None

    if category == "table":
        table_html = ann.get("html", "")
        return html_table_to_markdown(table_html) if table_html else fallback

    if category == "equation_isolated":
        formula = ann.get("latex", "")
        return f"$$\n{formula}\n$$" if formula else fallback

    # text_block, the caption categories, reference and any unknown category
    # all emit their plain text when present.
    return fallback
|
||||
|
||||
|
||||
def page_to_markdown(page: dict) -> str:
    """Convert a single OmniDocBench page to markdown.

    Annotations in ``layout_dets`` are emitted in ``order``; a missing or null
    ``order`` sorts as 999. ``truncated`` relations in ``extra.relation`` mark
    a target annotation as the continuation of a source. Continuation text is
    appended to the source block, separated by spaces, and the continuation
    itself is not emitted. Chains (A <- B <- C) are folded into the first
    block of the chain. Blocks are joined with blank lines.
    """
    annotations = page.get("layout_dets", [])

    def reading_order(ann: dict) -> int:
        # An explicit null cannot be compared with ints by sorted().
        order = ann.get("order")
        return 999 if order is None else order

    # Sort by reading order
    sorted_anns = sorted(annotations, key=reading_order)

    # Handle truncated blocks (merge them)
    relations = page.get("extra", {}).get("relation", [])
    merge_targets: dict[int, int] = {}  # target_id -> source_id
    for rel in relations:
        if rel.get("relation") == "truncated":
            merge_targets[rel["target_anno_id"]] = rel["source_anno_id"]

    def root_source(anno_id: int) -> int:
        """Follow target -> source links to the first block of the chain."""
        seen = {anno_id}
        current = merge_targets[anno_id]
        # `seen` guards against a malformed cyclic relation list.
        while current in merge_targets and current not in seen:
            seen.add(current)
            current = merge_targets[current]
        return current

    # Build merged text for truncated blocks, keyed by the chain's root so a
    # continuation of a continuation is not attached to a block that is skipped.
    merged_text: dict[int, list[str]] = defaultdict(list)
    for ann in sorted_anns:
        anno_id = ann.get("anno_id", -1)
        if anno_id in merge_targets:
            text = ann.get("text", "").strip()
            if text:
                merged_text[root_source(anno_id)].append(text)

    blocks: list[str] = []
    skip_ids = set(merge_targets.keys())

    for ann in sorted_anns:
        anno_id = ann.get("anno_id", -1)
        if anno_id in skip_ids:
            continue

        # Append merged text from truncated continuations
        if anno_id in merged_text:
            original_text = ann.get("text", "").strip()
            continuation = " ".join(merged_text[anno_id])
            ann = dict(ann)  # shallow copy so the caller's annotation is untouched
            ann["text"] = f"{original_text} {continuation}".strip()

        md = annotation_to_markdown(ann)
        if md:
            blocks.append(md)

    return "\n\n".join(blocks)
|
||||
|
||||
|
||||
def strip_markdown_to_text(md: str) -> str:
    """Reduce the markdown produced by this importer to plain text.

    Fence (```) and ``$$`` delimiter lines are dropped, and the lines between
    them are kept verbatim. Elsewhere heading markers are removed, table
    separator rows are dropped, table pipes are replaced by spaces, and
    ``*``-style emphasis markers are stripped. Lines that end up empty are
    omitted.
    """
    heading_prefix = re.compile(r"^#{1,6}\s+")
    separator_row = re.compile(r"^\|[-|: ]+\|$")
    padded_pipe = re.compile(r"\s*\|\s*")
    emphasis = re.compile(r"\*{1,3}([^*]+)\*{1,3}")

    plain: list[str] = []
    inside_fence = False
    inside_math = False

    for raw in md.split("\n"):
        if raw.startswith("```"):
            inside_fence = not inside_fence
        elif raw.startswith("$$"):
            inside_math = not inside_math
        elif inside_fence or inside_math:
            # Code and formula bodies pass through untouched (blank lines too).
            plain.append(raw)
        else:
            text = heading_prefix.sub("", raw)
            if text.startswith("|") and text.endswith("|"):
                if separator_row.match(text):
                    continue
                text = padded_pipe.sub(" ", text).strip()
            text = emphasis.sub(r"\1", text)
            if text:
                plain.append(text)

    return "\n".join(plain)
|
||||
|
||||
|
||||
def group_pages_by_pdf(pages: list[dict]) -> dict[str, list[dict]]:
    """Bucket OmniDocBench pages by the document they were rendered from.

    The document name is the base name of ``page_info.image_path`` without its
    extension and without a trailing ``_p<digits>`` page suffix. Pages inside
    each bucket are ordered by ``page_info.page_no`` (default 0).
    """
    page_suffix = re.compile(r"_p\d+$")
    by_document: dict[str, list[dict]] = defaultdict(list)

    for entry in pages:
        # e.g. "academic_literature/scihub_12345_p0.jpg" -> "scihub_12345"
        #      "PPT2PDF/PPT_sample.png"                  -> "PPT_sample"
        image_path = entry.get("page_info", {}).get("image_path", "")
        stem, _extension = os.path.splitext(os.path.basename(image_path))
        by_document[page_suffix.sub("", stem)].append(entry)

    def page_number(entry: dict) -> int:
        return entry.get("page_info", {}).get("page_no", 0)

    for members in by_document.values():
        members.sort(key=page_number)

    return by_document
|
||||
|
||||
|
||||
def find_pdf_for_document(pdf_name: str, pages: list[dict], ori_pdfs_dir: Path) -> Path | None:
    """Locate the source PDF of a document group, or return ``None``.

    Lookup order:
      1. ``<ori_pdfs_dir>/<pdf_name>.pdf`` or ``.PDF``;
      2. any ``*.pdf`` below ``ori_pdfs_dir`` whose stem equals ``pdf_name``;
      3. inside the subdirectory named by the first segment of the first
         page's ``image_path``, the first ``*.pdf`` whose stem is a prefix of
         ``pdf_name`` or vice versa.
    """
    if not ori_pdfs_dir.exists():
        return None

    # 1. Direct name match at the top level.
    for suffix in (".pdf", ".PDF"):
        direct = ori_pdfs_dir / f"{pdf_name}{suffix}"
        if direct.exists():
            return direct

    # 2. Exact stem match anywhere below the directory.
    for nested in ori_pdfs_dir.rglob("*.pdf"):
        if nested.stem == pdf_name:
            return nested

    # 3. Prefix match inside the category folder taken from the image path.
    if not pages:
        return None
    first_image = pages[0].get("page_info", {}).get("image_path", "")
    segments = first_image.split("/")
    if len(segments) < 2:
        return None
    category_dir = ori_pdfs_dir / segments[0]
    if not category_dir.exists():
        return None
    for sibling in category_dir.glob("*.pdf"):
        stem = sibling.stem
        if pdf_name.startswith(stem) or stem.startswith(pdf_name):
            return sibling

    return None
|
||||
|
||||
|
||||
def main() -> None:
    """Import OmniDocBench into benchmark fixtures and ground-truth files.

    Usage: ``import_omnidocbench.py <omnidocbench_dir> <repo_root>``

    Reads ``<omnidocbench_dir>/OmniDocBench.json``, groups its pages by source
    PDF and, for every document whose PDF is found under ``ori_pdfs``, writes
    ``omnidoc_<name>.md`` / ``.txt`` ground truth under
    ``test_documents/ground_truth/pdf`` and a ``omnidoc_<name>.json`` fixture
    under ``tools/benchmark-harness/fixtures/pdf``.

    Documents that already have a fixture, have no PDF, or yield no markdown
    are skipped and counted. Progress and the summary go to stderr. Exits with
    status 1 on bad usage or when the JSON file is missing.

    All text files are read and written as UTF-8 so the result does not
    depend on the platform's default encoding.
    """
    if len(sys.argv) < 3:
        print(
            "Usage: import_omnidocbench.py <omnidocbench_dir> <repo_root>",
            file=sys.stderr,
        )
        sys.exit(1)

    omnidoc_dir = Path(sys.argv[1]).resolve()
    repo_root = Path(sys.argv[2]).resolve()

    json_path = omnidoc_dir / "OmniDocBench.json"
    ori_pdfs_dir = omnidoc_dir / "ori_pdfs"

    if not json_path.exists():
        print(f"ERROR: {json_path} not found. Run download_omnidocbench.sh first.", file=sys.stderr)
        sys.exit(1)

    fixtures_dir = repo_root / "tools" / "benchmark-harness" / "fixtures" / "pdf"
    gt_dir = repo_root / "test_documents" / "ground_truth" / "pdf"
    fixtures_dir.mkdir(parents=True, exist_ok=True)
    gt_dir.mkdir(parents=True, exist_ok=True)

    print(f"Loading {json_path}...", file=sys.stderr)
    with open(json_path, encoding="utf-8") as f:
        pages = json.load(f)
    print(f"Loaded {len(pages)} pages", file=sys.stderr)

    # Group pages by document
    doc_groups = group_pages_by_pdf(pages)
    print(f"Found {len(doc_groups)} documents", file=sys.stderr)

    created = 0
    skipped_no_pdf = 0
    skipped_exists = 0
    skipped_empty = 0

    for pdf_name, doc_pages in sorted(doc_groups.items()):
        # Generate fixture name; anything outside [a-zA-Z0-9_-] becomes "_"
        fixture_name = re.sub(r"[^a-zA-Z0-9_-]", "_", f"omnidoc_{pdf_name}")

        fixture_path = fixtures_dir / f"{fixture_name}.json"
        gt_md_path = gt_dir / f"{fixture_name}.md"
        gt_txt_path = gt_dir / f"{fixture_name}.txt"

        # Skip if already imported
        if fixture_path.exists():
            skipped_exists += 1
            continue

        # Find the PDF
        pdf_path = find_pdf_for_document(pdf_name, doc_pages, ori_pdfs_dir)
        if pdf_path is None:
            skipped_no_pdf += 1
            continue

        # Generate markdown from all pages, dropping pages that render blank
        page_markdowns = []
        for page in doc_pages:
            md = page_to_markdown(page)
            if md.strip():
                page_markdowns.append(md)

        if not page_markdowns:
            skipped_empty += 1
            continue

        full_markdown = "\n\n".join(page_markdowns)
        full_text = strip_markdown_to_text(full_markdown)

        # Write ground truth files
        gt_md_path.write_text(full_markdown, encoding="utf-8")
        gt_txt_path.write_text(full_text, encoding="utf-8")

        # Compute relative paths from fixture to document and ground truth
        doc_rel = os.path.relpath(pdf_path, fixtures_dir)
        gt_md_rel = os.path.relpath(gt_md_path, fixtures_dir)
        gt_txt_rel = os.path.relpath(gt_txt_path, fixtures_dir)

        # Get page metadata for fixture (taken from the first page only)
        first_page = doc_pages[0].get("page_info", {})
        page_attr = first_page.get("page_attribute", {})
        pdf_size = pdf_path.stat().st_size

        fixture = {
            "document": doc_rel,
            "file_type": "pdf",
            "file_size": pdf_size,
            "expected_frameworks": ["kreuzberg"],
            "metadata": {
                "description": f"OmniDocBench: {page_attr.get('data_source', 'unknown')}",
                "source": "omnidocbench",
                "size_category": "small" if pdf_size < 500_000 else "medium",
                "language": page_attr.get("language", "unknown"),
                "layout": page_attr.get("layout", "unknown"),
                "data_source": page_attr.get("data_source", "unknown"),
                "page_count": len(doc_pages),
            },
            "ground_truth": {
                "text_file": gt_txt_rel,
                "markdown_file": gt_md_rel,
                "source": "omnidocbench",
            },
        }

        fixture_path.write_text(json.dumps(fixture, indent=2) + "\n", encoding="utf-8")
        created += 1

        if created % 50 == 0:
            print(f" {created} fixtures created...", file=sys.stderr)

    print("\nDone:", file=sys.stderr)
    print(f" Created: {created}", file=sys.stderr)
    print(f" Skipped (already exists): {skipped_exists}", file=sys.stderr)
    print(f" Skipped (no PDF found): {skipped_no_pdf}", file=sys.stderr)
    print(f" Skipped (empty content): {skipped_empty}", file=sys.stderr)
    print(f" Fixtures: {fixtures_dir}", file=sys.stderr)
    print(f" Ground truth: {gt_dir}", file=sys.stderr)


if __name__ == "__main__":
    main()
|
||||
175
tools/benchmark-harness/scripts/markitdown_extract.py
Executable file
175
tools/benchmark-harness/scripts/markitdown_extract.py
Executable file
@@ -0,0 +1,175 @@
|
||||
"""MarkItDown extraction wrapper for benchmark harness."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import multiprocessing as _mp
|
||||
import os
|
||||
import platform
|
||||
import resource
|
||||
import sys
|
||||
import time
|
||||
|
||||
from markitdown import MarkItDown
|
||||
|
||||
|
||||
def _get_peak_memory_bytes() -> int:
|
||||
"""Get peak memory usage in bytes using resource module."""
|
||||
usage = resource.getrusage(resource.RUSAGE_SELF)
|
||||
if platform.system() == "Linux":
|
||||
return usage.ru_maxrss * 1024
|
||||
return usage.ru_maxrss
|
||||
|
||||
|
||||
def extract_sync(file_path: str) -> dict:
    """Convert one file with MarkItDown and build the harness payload.

    A fresh ``MarkItDown`` instance is created per call and its construction
    is included in the measured time.

    Returns:
        Dict with ``content`` (empty string when MarkItDown yields no text),
        ``metadata``, ``_extraction_time_ms`` and ``_peak_memory_bytes``.
    """
    t0 = time.perf_counter()
    converted = MarkItDown().convert(file_path)
    elapsed_ms = (time.perf_counter() - t0) * 1000.0

    payload = {
        "content": converted.text_content or "",
        "metadata": {"framework": "markitdown"},
        "_extraction_time_ms": elapsed_ms,
    }
    payload["_peak_memory_bytes"] = _get_peak_memory_bytes()
    return payload
|
||||
|
||||
|
||||
def _worker(fn, args, conn):
|
||||
"""Run extraction in a forked child process.
|
||||
|
||||
Closes inherited stdin/stdout so the child cannot corrupt the
|
||||
parent's line-based JSON protocol.
|
||||
"""
|
||||
try:
|
||||
sys.stdin.close()
|
||||
sys.stdout = open(os.devnull, "w")
|
||||
except Exception:
|
||||
pass
|
||||
try:
|
||||
result = fn(*args)
|
||||
conn.send(result)
|
||||
except Exception as e:
|
||||
conn.send({"error": str(e), "_extraction_time_ms": 0})
|
||||
finally:
|
||||
conn.close()
|
||||
|
||||
|
||||
def _run_with_timeout(fn, args, timeout):
    """Run ``fn(*args)`` in a forked child and wait at most ``timeout`` seconds.

    Returns whatever the child sends back. If nothing arrives in time the
    child is killed and a timeout error dict is returned; if the pipe cannot
    be read, a "worker process crashed" error dict is returned. The parent
    process keeps running in all of these cases.

    If anything in the fork path raises, ``fn`` is executed in-process instead
    (without a timeout) and its exception, if any, is turned into an error dict.
    """
    try:
        fork_ctx = _mp.get_context("fork")
        reader, writer = fork_ctx.Pipe(duplex=False)
        child = fork_ctx.Process(target=_worker, args=(fn, args, writer))
        child.start()
        writer.close()  # the parent only reads

        if not reader.poll(timeout=timeout):
            child.kill()
            outcome = {
                "error": f"extraction timed out after {timeout}s",
                "_extraction_time_ms": timeout * 1000.0,
            }
        else:
            try:
                outcome = reader.recv()
            except Exception:
                outcome = {"error": "worker process crashed", "_extraction_time_ms": 0}

        # Reap the child; escalate to kill if it has not exited after 5 s.
        child.join(timeout=5)
        if child.is_alive():
            child.kill()
            child.join()
        reader.close()
        return outcome
    except Exception:
        # Fork not available — fall back to in-process extraction
        try:
            return fn(*args)
        except Exception as exc:
            return {"error": str(exc), "_extraction_time_ms": 0}
|
||||
|
||||
|
||||
def _parse_path(line: str) -> str:
|
||||
"""Parse a request line: JSON object with path field, or plain file path."""
|
||||
stripped = line.strip()
|
||||
if stripped.startswith("{"):
|
||||
try:
|
||||
return json.loads(stripped).get("path", "")
|
||||
except (json.JSONDecodeError, ValueError):
|
||||
pass
|
||||
return stripped
|
||||
|
||||
|
||||
def run_server(timeout=None) -> None:
    """Serve extraction requests from stdin until it is exhausted.

    Prints ``READY`` once, then handles one request per input line: the path
    is resolved with ``_parse_path`` and lines yielding an empty path are
    skipped without output. Every other request produces exactly one JSON
    line on stdout, either the extraction payload or an error dict.

    Args:
        timeout: Per-file limit in seconds. When given, each extraction goes
            through ``_run_with_timeout``; when None it runs in-process.
    """
    print("READY", flush=True)
    for raw_line in sys.stdin:
        target = _parse_path(raw_line)
        if not target:
            continue
        if timeout is None:
            try:
                response = extract_sync(target)
            except Exception as exc:
                response = {"error": str(exc), "_extraction_time_ms": 0}
        else:
            response = _run_with_timeout(extract_sync, (target,), timeout)
        print(json.dumps(response), flush=True)
|
||||
|
||||
|
||||
def main() -> None:
    """Command-line entry point for the MarkItDown wrapper.

    Usage: ``markitdown_extract.py [--ocr|--no-ocr] [--timeout=SECS] <mode> <file_path>``

    Modes:
        server: persistent mode, see ``run_server``.
        sync:   extract ``<file_path>`` and print the JSON payload.
        Any other first argument is treated as a file path (legacy form) and
        extracted directly.

    ``--ocr`` / ``--no-ocr`` are accepted and ignored; nothing in this script
    reads an OCR setting. ``--format`` must be ``markdown``, otherwise the
    script exits with status 64. Extraction failures are reported on stderr
    with exit status 1.
    """
    timeout = None
    args = []
    for arg in sys.argv[1:]:
        if arg in ("--ocr", "--no-ocr"):
            continue  # accepted for CLI compatibility, ignored
        if arg.startswith("--timeout="):
            timeout = int(arg.split("=", 1)[1])
        elif arg.startswith("--format="):
            _fmt = arg.split("=", 1)[1]
            if _fmt != "markdown":
                print(f"{sys.argv[0]} only supports markdown output; got --format {_fmt}", file=sys.stderr)
                sys.exit(64)
        else:
            args.append(arg)

    if len(args) < 1:
        print("Usage: markitdown_extract.py [--ocr|--no-ocr] [--timeout=SECS] <mode> <file_path>", file=sys.stderr)
        print("Modes: sync, server", file=sys.stderr)
        sys.exit(1)

    mode = args[0]
    if mode == "server":
        run_server(timeout=timeout)
        return

    if mode == "sync":
        if len(args) < 2:
            print("Error: sync mode requires a file path", file=sys.stderr)
            sys.exit(1)
        file_path = args[1]
    else:
        # Legacy fallback for direct file path
        file_path = args[0]

    try:
        payload = extract_sync(file_path)
        print(json.dumps(payload), end="")
    except Exception as e:
        print(f"Error extracting with MarkItDown: {e}", file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    main()
|
||||
338
tools/benchmark-harness/scripts/mineru_extract.py
Normal file
338
tools/benchmark-harness/scripts/mineru_extract.py
Normal file
@@ -0,0 +1,338 @@
|
||||
"""MinerU extraction wrapper for benchmark harness.
|
||||
|
||||
Supports three modes:
|
||||
- sync: process single file
|
||||
- batch: process multiple files
|
||||
- server: persistent mode reading paths from stdin
|
||||
|
||||
Attempts to use MinerU's Python API directly for better performance.
|
||||
Falls back to CLI subprocess if the Python API is not available.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
|
||||
# Force CPU-only mode to avoid GPU discovery errors in CI
|
||||
os.environ.setdefault("CUDA_VISIBLE_DEVICES", "")
|
||||
os.environ.setdefault("ONNXRUNTIME_PROVIDERS", "CPUExecutionProvider")
|
||||
os.environ.setdefault("MINERU_DEVICE_MODE", "cpu")
|
||||
|
||||
import json
|
||||
import multiprocessing as _mp
|
||||
import platform
|
||||
import resource
|
||||
import subprocess
|
||||
import sys
|
||||
import tempfile
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
# Try importing MinerU's Python API to avoid subprocess overhead.
|
||||
# The API surface has changed across versions, so we attempt several known entry points.
|
||||
try:
|
||||
from magic_pdf.pipe.UNIPipe import UNIPipe # noqa: F401
|
||||
|
||||
HAS_PYTHON_API = True
|
||||
except ImportError:
|
||||
HAS_PYTHON_API = False
|
||||
|
||||
|
||||
def _get_peak_memory_bytes() -> int:
|
||||
"""Get peak memory usage in bytes using resource module."""
|
||||
usage = resource.getrusage(resource.RUSAGE_SELF)
|
||||
if platform.system() == "Linux":
|
||||
return usage.ru_maxrss * 1024
|
||||
return usage.ru_maxrss
|
||||
|
||||
|
||||
def _extract_via_cli(file_path: str, ocr_enabled: bool) -> str:
    """Run the ``mineru`` CLI on one file and return the markdown it wrote.

    The CLI is invoked with the pipeline backend on CPU, writing into a
    temporary directory; ``--method txt`` is added when OCR is disabled.

    Returns:
        Contents of the first ``*.md`` file found under the output directory.

    Raises:
        RuntimeError: no markdown file was produced. The message carries the
            CLI's stderr when the exit status was non-zero.
    """
    with tempfile.TemporaryDirectory() as scratch:
        out_dir = Path(scratch) / "output"

        command = ["mineru", "-p", file_path, "-b", "pipeline", "-d", "cpu"]
        if not ocr_enabled:
            command += ["--method", "txt"]
        command += ["-o", str(out_dir)]

        proc = subprocess.run(command, capture_output=True, text=True, check=False)

        # Look for output before judging the exit status: ONNX Runtime may
        # emit warnings to stderr even when extraction succeeds.
        produced = list(out_dir.rglob("*.md"))
        if produced:
            return produced[0].read_text(encoding="utf-8")

        if proc.returncode != 0:
            raise RuntimeError(f"MinerU extraction failed: {proc.stderr}")
        raise RuntimeError("No markdown output found from MinerU")
|
||||
|
||||
|
||||
def _extract_via_api(file_path: str, ocr_enabled: bool) -> str:
    """Extract markdown through MinerU's in-process ``UNIPipe`` interface.

    Runs classify -> analyze -> parse and then renders markdown, using the
    ``"ocr"`` method when OCR is enabled and ``"txt"`` otherwise. Intermediate
    files go to a temporary directory.

    NOTE: the MinerU Python API is not yet stable, so this is best effort;
    callers are expected to fall back to the CLI when it raises.
    """
    from magic_pdf.pipe.UNIPipe import UNIPipe
    from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter

    source = Path(file_path)
    raw_pdf = source.read_bytes()
    parse_method = "ocr" if ocr_enabled else "txt"

    with tempfile.TemporaryDirectory() as workdir:
        pipeline = UNIPipe(
            raw_pdf,
            {"_pdf_type": "", "model_list": []},
            DiskReaderWriter(workdir),
            method=parse_method,
        )
        pipeline.pipe_classify()
        pipeline.pipe_analyze()
        pipeline.pipe_parse()
        return pipeline.pipe_mk_markdown(str(source.stem), workdir)
|
||||
|
||||
|
||||
_MD_STRIP_RE = None
|
||||
|
||||
|
||||
def _strip_markdown(text: str) -> str:
|
||||
"""Best-effort markdown→plaintext pass. Drops syntax tokens; preserves text."""
|
||||
import re
|
||||
|
||||
global _MD_STRIP_RE
|
||||
if _MD_STRIP_RE is None:
|
||||
_MD_STRIP_RE = [
|
||||
(re.compile(r"^#{1,6}\s+", re.MULTILINE), ""), # ATX headings
|
||||
(re.compile(r"^\s*[-*+]\s+", re.MULTILINE), ""), # bullet markers
|
||||
(re.compile(r"^\s*\d+\.\s+", re.MULTILINE), ""), # ordered list markers
|
||||
(re.compile(r"^>\s?", re.MULTILINE), ""), # blockquotes
|
||||
(re.compile(r"```[a-zA-Z0-9_-]*\n?"), ""), # code fences
|
||||
(re.compile(r"`([^`]+)`"), r"\1"), # inline code
|
||||
(re.compile(r"\*\*([^*]+)\*\*"), r"\1"), # bold
|
||||
(re.compile(r"\*([^*]+)\*"), r"\1"), # italic
|
||||
(re.compile(r"!\[([^\]]*)\]\([^)]*\)"), r"\1"), # images
|
||||
(re.compile(r"\[([^\]]+)\]\([^)]*\)"), r"\1"), # links
|
||||
(re.compile(r"^\s*\|.*\|\s*$", re.MULTILINE), ""), # table rows (drop)
|
||||
]
|
||||
out = text
|
||||
for pattern, repl in _MD_STRIP_RE:
|
||||
out = pattern.sub(repl, out)
|
||||
return out
|
||||
|
||||
|
||||
def extract_sync(file_path: str, ocr_enabled: bool, output_format: str = "markdown") -> dict[str, Any]:
    """Extract one file with MinerU and build the harness payload.

    The Python API is tried first when ``HAS_PYTHON_API`` is set; if it
    raises, or is not available at all, the CLI is used instead. With
    ``output_format == "plaintext"`` the markdown is passed through
    ``_strip_markdown``; any other value returns the markdown unchanged.

    Returns:
        Dict with ``content``, ``metadata``, ``_extraction_time_ms`` (covers
        extraction and the optional stripping) and ``_peak_memory_bytes``.
    """
    t0 = time.perf_counter()

    use_cli = not HAS_PYTHON_API
    if not use_cli:
        try:
            markdown = _extract_via_api(file_path, ocr_enabled)
        except Exception:
            # The Python API failed at runtime; retry through the CLI.
            use_cli = True
    if use_cli:
        markdown = _extract_via_cli(file_path, ocr_enabled)

    if output_format == "plaintext":
        content = _strip_markdown(markdown)
    else:
        content = markdown
    elapsed_ms = (time.perf_counter() - t0) * 1000.0

    payload = {
        "content": content,
        "metadata": {"framework": "mineru", "output_format": output_format},
        "_extraction_time_ms": elapsed_ms,
    }
    payload["_peak_memory_bytes"] = _get_peak_memory_bytes()
    return payload
|
||||
|
||||
|
||||
def extract_batch(file_paths: list[str], ocr_enabled: bool, output_format: str = "markdown") -> list[dict[str, Any]]:
    """Extract several files one after another.

    A file that fails yields an entry with empty ``content`` and the error
    text under ``metadata["error"]``; the remaining files are still processed.

    Returns:
        One dict per input path, in order. ``_extraction_time_ms`` is the
        batch wall time divided by the number of files, ``_batch_total_ms``
        the batch wall time, and ``_peak_memory_bytes`` is sampled once after
        the whole batch.
    """
    batch_start = time.perf_counter()

    results = []
    for path in file_paths:
        try:
            entry = extract_sync(path, ocr_enabled, output_format)
        except Exception as exc:
            entry = {
                "content": "",
                "metadata": {
                    "framework": "mineru",
                    "error": str(exc),
                },
            }
        else:
            # Per-file timing is replaced by the batch average below.
            entry.pop("_extraction_time_ms", None)
        results.append(entry)

    total_ms = (time.perf_counter() - batch_start) * 1000.0
    average_ms = total_ms / len(file_paths) if file_paths else 0
    peak = _get_peak_memory_bytes()

    for entry in results:
        entry.update(
            _extraction_time_ms=average_ms,
            _batch_total_ms=total_ms,
            _peak_memory_bytes=peak,
        )

    return results
|
||||
|
||||
|
||||
def _worker(fn, args, conn):
|
||||
"""Run extraction in a forked child process.
|
||||
|
||||
Closes inherited stdin/stdout so the child cannot corrupt the
|
||||
parent's line-based JSON protocol.
|
||||
"""
|
||||
try:
|
||||
sys.stdin.close()
|
||||
sys.stdout = open(os.devnull, "w")
|
||||
except Exception:
|
||||
pass
|
||||
try:
|
||||
result = fn(*args)
|
||||
conn.send(result)
|
||||
except Exception as e:
|
||||
conn.send({"error": str(e), "_extraction_time_ms": 0})
|
||||
finally:
|
||||
conn.close()
|
||||
|
||||
|
||||
def _run_with_timeout(fn, args, timeout):
    """Execute ``fn(*args)`` in a forked child, bounded by ``timeout`` seconds.

    The child's result is returned when it arrives in time. On timeout the
    child is killed and an error dict whose ``_extraction_time_ms`` equals the
    timeout is returned; an unreadable pipe yields a "worker process crashed"
    error dict. The calling process stays alive in every case.

    Any exception raised on the fork path triggers a plain in-process call of
    ``fn`` (no timeout), with its own exception converted to an error dict.
    """

    def _in_process():
        try:
            return fn(*args)
        except Exception as err:
            return {"error": str(err), "_extraction_time_ms": 0}

    try:
        ctx = _mp.get_context("fork")
        recv_end, send_end = ctx.Pipe(duplex=False)
        proc = ctx.Process(target=_worker, args=(fn, args, send_end))
        proc.start()
        send_end.close()

        answered = recv_end.poll(timeout=timeout)
        if answered:
            try:
                result = recv_end.recv()
            except Exception:
                result = {"error": "worker process crashed", "_extraction_time_ms": 0}
        else:
            proc.kill()
            result = {
                "error": f"extraction timed out after {timeout}s",
                "_extraction_time_ms": timeout * 1000.0,
            }

        proc.join(timeout=5)
        if proc.is_alive():
            # Still running after the grace period: force it down.
            proc.kill()
            proc.join()
        recv_end.close()
        return result
    except Exception:
        # Fork not available — fall back to in-process extraction
        return _in_process()
|
||||
|
||||
|
||||
def _parse_path(line: str) -> str:
|
||||
"""Parse a request line: JSON object with path field, or plain file path."""
|
||||
stripped = line.strip()
|
||||
if stripped.startswith("{"):
|
||||
try:
|
||||
return json.loads(stripped).get("path", "")
|
||||
except (json.JSONDecodeError, ValueError):
|
||||
pass
|
||||
return stripped
|
||||
|
||||
|
||||
def run_server(ocr_enabled: bool, output_format: str, timeout=None) -> None:
    """Persistent server loop: paths in on stdin, one JSON line out per path.

    ``READY`` is printed first. Each input line is resolved with
    ``_parse_path``; an empty result is skipped silently. Extraction runs
    through ``_run_with_timeout`` when ``timeout`` is set, otherwise directly,
    with any exception reported as an error dict.

    Args:
        ocr_enabled: Forwarded to ``extract_sync``.
        output_format: Forwarded to ``extract_sync``.
        timeout: Optional per-file limit in seconds.
    """
    print("READY", flush=True)
    for raw_line in sys.stdin:
        target = _parse_path(raw_line)
        if not target:
            continue

        call_args = (target, ocr_enabled, output_format)
        if timeout is None:
            try:
                response = extract_sync(*call_args)
            except Exception as exc:
                response = {"error": str(exc), "_extraction_time_ms": 0}
        else:
            response = _run_with_timeout(extract_sync, call_args, timeout)

        print(json.dumps(response), flush=True)
|
||||
|
||||
|
||||
def main() -> None:
    """Command-line entry point for the MinerU wrapper.

    Parses ``--ocr`` / ``--no-ocr``, ``--timeout=SECS`` and
    ``--format=markdown|plaintext``; the first remaining argument is the mode
    (``sync``, ``batch`` or ``server``) and the rest are file paths.

    Exit status is 64 for an unsupported format and 1 for usage errors, an
    unknown mode, or an extraction failure. In batch mode a single input
    prints one JSON object, several inputs print a JSON array.
    """
    ocr_enabled = False
    timeout = None
    output_format = "markdown"
    positional = []
    for arg in sys.argv[1:]:
        if arg in ("--ocr", "--no-ocr"):
            ocr_enabled = arg == "--ocr"
        elif arg.startswith("--timeout="):
            timeout = int(arg.split("=", 1)[1])
        elif arg.startswith("--format="):
            output_format = arg.split("=", 1)[1]
        else:
            positional.append(arg)

    if output_format not in ("markdown", "plaintext"):
        print(f"Error: --format must be 'markdown' or 'plaintext'; got '{output_format}'", file=sys.stderr)
        sys.exit(64)

    if not positional:
        print(
            "Usage: mineru_extract.py [--ocr|--no-ocr] [--timeout=SECS] [--format=markdown|plaintext] <mode> <file_path> [additional_files...]",
            file=sys.stderr,
        )
        print("Modes: sync, batch, server", file=sys.stderr)
        sys.exit(1)

    mode, file_paths = positional[0], positional[1:]

    try:
        if mode == "server":
            run_server(ocr_enabled, output_format, timeout=timeout)

        elif mode == "sync":
            if len(file_paths) != 1:
                print("Error: sync mode requires exactly one file", file=sys.stderr)
                sys.exit(1)
            payload = extract_sync(file_paths[0], ocr_enabled, output_format)
            print(json.dumps(payload), end="")

        elif mode == "batch":
            if not file_paths:
                print("Error: batch mode requires at least one file", file=sys.stderr)
                sys.exit(1)

            results = extract_batch(file_paths, ocr_enabled, output_format)
            # A single file is reported as an object rather than a 1-element array.
            output = results[0] if len(file_paths) == 1 else results
            print(json.dumps(output), end="")

        else:
            print(f"Error: Unknown mode '{mode}'. Use sync, batch, or server", file=sys.stderr)
            sys.exit(1)

    except Exception as e:
        print(f"Error extracting with MinerU: {e}", file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    main()
|
||||
67
tools/benchmark-harness/scripts/pandoc_extract.sh
Executable file
67
tools/benchmark-harness/scripts/pandoc_extract.sh
Executable file
@@ -0,0 +1,67 @@
|
||||
#!/usr/bin/env bash
#
# Pandoc extraction wrapper for the benchmark harness.
#
# Usage: pandoc_extract.sh [--format=markdown|plaintext] <file_path>
#
# Converts <file_path> with pandoc (GFM for "markdown", plain text for
# "plaintext") and prints one JSON object on stdout:
#   {"content": ..., "metadata": {"framework": "pandoc", "output_format": ...},
#    "_extraction_time_ms": ...}
#
# Exit status: 1 when the file argument is missing or not a regular file,
# 64 for an unsupported --format value. A pandoc failure or timeout is not
# treated as an error; it results in empty content.

set -euo pipefail

FORMAT="markdown"
FILE_PATH=""
for arg in "$@"; do
  case "$arg" in
    --format=*)
      FORMAT="${arg#--format=}"
      ;;
    *)
      # The last non-option argument wins.
      FILE_PATH="$arg"
      ;;
  esac
done

if [ -z "$FILE_PATH" ]; then
  echo "Usage: pandoc_extract.sh [--format=markdown|plaintext] <file_path>" >&2
  exit 1
fi

if [ "$FORMAT" != "markdown" ] && [ "$FORMAT" != "plaintext" ]; then
  echo "Error: --format must be 'markdown' or 'plaintext'; got '$FORMAT'" >&2
  exit 64
fi

if [ ! -f "$FILE_PATH" ]; then
  echo "Error: File not found: $FILE_PATH" >&2
  exit 1
fi

if [ "$FORMAT" = "markdown" ]; then
  PANDOC_TO="gfm"
else
  PANDOC_TO="plain"
fi

# Print the current time in nanoseconds. Where `date` does not support %N the
# output is not purely numeric, which would break the arithmetic below and
# abort the script under `set -e`; fall back to whole-second resolution then.
now_ns() {
  local stamp
  stamp=$(date +%s%N)
  case "$stamp" in
    *[!0-9]*) echo "$(($(date +%s) * 1000000000))" ;;
    *) echo "$stamp" ;;
  esac
}

START=$(now_ns)

# Bound pandoc to 60 s when a timeout utility exists (`gtimeout` is the name
# GNU coreutils installs under on some systems). Errors are swallowed on
# purpose: a failed conversion is reported as empty content.
if command -v timeout &>/dev/null; then
  CONTENT=$(timeout 60s pandoc "$FILE_PATH" "--to=$PANDOC_TO" --wrap=none --strip-comments 2>/dev/null || echo "")
elif command -v gtimeout &>/dev/null; then
  CONTENT=$(gtimeout 60s pandoc "$FILE_PATH" "--to=$PANDOC_TO" --wrap=none --strip-comments 2>/dev/null || echo "")
else
  CONTENT=$(pandoc "$FILE_PATH" "--to=$PANDOC_TO" --wrap=none --strip-comments 2>/dev/null || echo "")
fi

END=$(now_ns)
DURATION_MS=$(((END - START) / 1000000))

if command -v jq &>/dev/null; then
  jq -n \
    --arg content "$CONTENT" \
    --arg fmt "$FORMAT" \
    --argjson duration "$DURATION_MS" \
    '{
      content: $content,
      metadata: {framework: "pandoc", output_format: $fmt},
      _extraction_time_ms: $duration
    }'
else
  # Manual JSON string escaping: backslash, double quote, tab and carriage
  # return are escaped and line breaks become \n. Other control characters
  # are not handled here; install jq for fully robust output.
  TAB=$'\t'
  CR=$'\r'
  ESCAPED_CONTENT=$(printf '%s\n' "$CONTENT" \
    | sed -e 's/\\/\\\\/g' -e 's/"/\\"/g' -e "s/${TAB}/\\\\t/g" -e "s/${CR}/\\\\r/g" \
    | awk '{printf "%s\\n", $0}' \
    | sed '$ s/\\n$//')
  printf '{"content":"%s","metadata":{"framework":"pandoc","output_format":"%s"},"_extraction_time_ms":%s}\n' \
    "$ESCAPED_CONTENT" "$FORMAT" "$DURATION_MS"
fi
|
||||
231
tools/benchmark-harness/scripts/pdfminer_extract.py
Normal file
231
tools/benchmark-harness/scripts/pdfminer_extract.py
Normal file
@@ -0,0 +1,231 @@
|
||||
"""pdfminer extraction wrapper for benchmark harness.
|
||||
|
||||
Supports three modes:
|
||||
- sync: extract text from a single file
|
||||
- batch: process multiple files (simulated batch using loop)
|
||||
- server: persistent mode reading paths from stdin
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import multiprocessing as _mp
|
||||
import os
|
||||
import platform
|
||||
import resource
|
||||
import sys
|
||||
import time
|
||||
from typing import Any
|
||||
|
||||
from pdfminer.high_level import extract_text
|
||||
|
||||
|
||||
def _get_peak_memory_bytes() -> int:
|
||||
"""Get peak memory usage in bytes using resource module."""
|
||||
usage = resource.getrusage(resource.RUSAGE_SELF)
|
||||
if platform.system() == "Linux":
|
||||
return usage.ru_maxrss * 1024
|
||||
return usage.ru_maxrss
|
||||
|
||||
|
||||
def extract_sync(file_path: str) -> dict[str, Any]:
    """Extract the text of one file with pdfminer's ``extract_text``.

    Returns:
        Dict with ``content``, ``metadata``, ``_extraction_time_ms`` (time
        spent inside ``extract_text``) and ``_peak_memory_bytes``.
    """
    t0 = time.perf_counter()
    text = extract_text(file_path)
    elapsed_ms = (time.perf_counter() - t0) * 1000.0

    payload = {
        "content": text,
        "metadata": {"framework": "pdfminer"},
        "_extraction_time_ms": elapsed_ms,
    }
    payload["_peak_memory_bytes"] = _get_peak_memory_bytes()
    return payload
|
||||
|
||||
|
||||
def extract_batch(file_paths: list[str]) -> list[dict[str, Any]]:
    """Extract several files in a loop (pdfminer has no native batch API).

    A file that fails yields an entry with empty ``content`` and the error
    text under ``metadata["error"]``; processing continues with the next file.

    Returns:
        One dict per input path, in order. Every entry carries the batch
        average as ``_extraction_time_ms``, the batch wall time as
        ``_batch_total_ms`` and a single ``_peak_memory_bytes`` sample taken
        after the loop.
    """
    batch_start = time.perf_counter()

    results = []
    for path in file_paths:
        try:
            entry = {
                "content": extract_text(path),
                "metadata": {"framework": "pdfminer"},
            }
        except Exception as exc:
            entry = {
                "content": "",
                "metadata": {
                    "framework": "pdfminer",
                    "error": str(exc),
                },
            }
        results.append(entry)

    total_ms = (time.perf_counter() - batch_start) * 1000.0
    average_ms = total_ms / len(file_paths) if file_paths else 0

    peak = _get_peak_memory_bytes()
    for entry in results:
        entry.update(
            _extraction_time_ms=average_ms,
            _batch_total_ms=total_ms,
            _peak_memory_bytes=peak,
        )

    return results
|
||||
|
||||
|
||||
def _worker(fn, args, conn):
|
||||
"""Run extraction in a forked child process.
|
||||
|
||||
Closes inherited stdin/stdout so the child cannot corrupt the
|
||||
parent's line-based JSON protocol.
|
||||
"""
|
||||
try:
|
||||
sys.stdin.close()
|
||||
sys.stdout = open(os.devnull, "w")
|
||||
except Exception:
|
||||
pass
|
||||
try:
|
||||
result = fn(*args)
|
||||
conn.send(result)
|
||||
except Exception as e:
|
||||
conn.send({"error": str(e), "_extraction_time_ms": 0})
|
||||
finally:
|
||||
conn.close()
|
||||
|
||||
|
||||
def _run_with_timeout(fn, args, timeout):
    """Call ``fn(*args)`` in a forked child, giving up after ``timeout`` seconds.

    Outcomes:
      * the child answers in time: its message is returned;
      * the pipe cannot be read: a "worker process crashed" error dict;
      * no answer in time: the child is killed and a timeout error dict with
        ``_extraction_time_ms`` set to the timeout is returned.
    The parent survives all three, so no process restart is needed.

    If the fork path itself raises, ``fn`` is run in-process without a
    timeout and any exception it raises becomes an error dict.
    """
    try:
        mp_ctx = _mp.get_context("fork")
        rx, tx = mp_ctx.Pipe(duplex=False)
        worker_proc = mp_ctx.Process(target=_worker, args=(fn, args, tx))
        worker_proc.start()
        tx.close()

        if rx.poll(timeout=timeout):
            try:
                reply = rx.recv()
            except Exception:
                reply = {"error": "worker process crashed", "_extraction_time_ms": 0}
        else:
            worker_proc.kill()
            reply = {
                "error": f"extraction timed out after {timeout}s",
                "_extraction_time_ms": timeout * 1000.0,
            }

        # Give the child up to 5 s to exit before killing it outright.
        worker_proc.join(timeout=5)
        if worker_proc.is_alive():
            worker_proc.kill()
            worker_proc.join()
        rx.close()
    except Exception:
        # Fork not available — fall back to in-process extraction
        try:
            reply = fn(*args)
        except Exception as problem:
            reply = {"error": str(problem), "_extraction_time_ms": 0}
    return reply
|
||||
|
||||
|
||||
def _parse_path(line: str) -> str:
|
||||
"""Parse a request line: JSON object with path field, or plain file path."""
|
||||
stripped = line.strip()
|
||||
if stripped.startswith("{"):
|
||||
try:
|
||||
return json.loads(stripped).get("path", "")
|
||||
except (json.JSONDecodeError, ValueError):
|
||||
pass
|
||||
return stripped
|
||||
|
||||
|
||||
def run_server(timeout=None) -> None:
    """Persistent mode: read one request per stdin line, answer with one JSON line.

    Emits ``READY`` before reading. Requests whose path comes back empty from
    ``_parse_path`` are ignored and produce no output. With a ``timeout`` the
    extraction is delegated to ``_run_with_timeout``; without one it runs
    in-process and exceptions are converted into an error dict.
    """

    def _handle(path):
        if timeout is not None:
            return _run_with_timeout(extract_sync, (path,), timeout)
        try:
            return extract_sync(path)
        except Exception as exc:
            return {"error": str(exc), "_extraction_time_ms": 0}

    print("READY", flush=True)
    for raw_line in sys.stdin:
        path = _parse_path(raw_line)
        if path:
            print(json.dumps(_handle(path)), flush=True)
|
||||
|
||||
|
||||
def main() -> None:
    """Command-line entry point for the pdfminer wrapper.

    ``--ocr`` / ``--no-ocr`` are accepted and ignored. ``--timeout=SECS``
    applies to server mode. ``--format`` must be ``plaintext``; any other
    value exits with status 64. The first remaining argument selects the mode
    (``sync``, ``batch`` or ``server``) and the rest are file paths.

    Usage errors, an unknown mode and extraction failures exit with status 1.
    In batch mode one input prints a JSON object, several print a JSON array.
    """
    timeout = None
    positional = []
    for arg in sys.argv[1:]:
        if arg in ("--ocr", "--no-ocr"):
            continue  # Accepted but ignored - pdfminer doesn't have OCR config
        if arg.startswith("--timeout="):
            timeout = int(arg.split("=", 1)[1])
        elif arg.startswith("--format="):
            requested = arg.split("=", 1)[1]
            if requested != "plaintext":
                print(f"{sys.argv[0]} only supports plaintext output; got --format {requested}", file=sys.stderr)
                sys.exit(64)
        else:
            positional.append(arg)

    if not positional:
        print(
            "Usage: pdfminer_extract.py [--ocr|--no-ocr] [--timeout=SECS] <mode> <file_path> [additional_files...]",
            file=sys.stderr,
        )
        print("Modes: sync, batch, server", file=sys.stderr)
        sys.exit(1)

    mode, file_paths = positional[0], positional[1:]

    try:
        if mode == "server":
            run_server(timeout=timeout)

        elif mode == "sync":
            if len(file_paths) != 1:
                print("Error: sync mode requires exactly one file", file=sys.stderr)
                sys.exit(1)
            print(json.dumps(extract_sync(file_paths[0])), end="")

        elif mode == "batch":
            if not file_paths:
                print("Error: batch mode requires at least one file", file=sys.stderr)
                sys.exit(1)

            results = extract_batch(file_paths)
            # A single file is reported as an object rather than a 1-element array.
            output = results[0] if len(file_paths) == 1 else results
            print(json.dumps(output), end="")

        else:
            print(f"Error: Unknown mode '{mode}'. Use sync, batch, or server", file=sys.stderr)
            sys.exit(1)

    except Exception as e:
        print(f"Error extracting with pdfminer: {e}", file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    main()
|
||||
245
tools/benchmark-harness/scripts/pdfplumber_extract.py
Normal file
245
tools/benchmark-harness/scripts/pdfplumber_extract.py
Normal file
@@ -0,0 +1,245 @@
|
||||
"""pdfplumber extraction wrapper for benchmark harness.
|
||||
|
||||
Supports three modes:
|
||||
- sync: extract text page-by-page (sequential)
|
||||
- batch: process multiple files (simulated batch using loop)
|
||||
- server: persistent mode reading paths from stdin
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import multiprocessing as _mp
|
||||
import os
|
||||
import platform
|
||||
import resource
|
||||
import sys
|
||||
import time
|
||||
from typing import Any
|
||||
|
||||
import pdfplumber
|
||||
|
||||
|
||||
def _get_peak_memory_bytes() -> int:
|
||||
"""Get peak memory usage in bytes using resource module."""
|
||||
usage = resource.getrusage(resource.RUSAGE_SELF)
|
||||
if platform.system() == "Linux":
|
||||
return usage.ru_maxrss * 1024
|
||||
return usage.ru_maxrss
|
||||
|
||||
|
||||
def extract_sync(file_path: str) -> dict[str, Any]:
    """Extract the text of a single PDF with pdfplumber.

    Every page is run through ``extract_text(layout=False)``; pages that
    yield no text are dropped and the rest are joined with blank lines.

    Returns a dict with ``content``, ``metadata``, the elapsed wall-clock
    time in ``_extraction_time_ms`` and ``_peak_memory_bytes``.
    """
    started = time.perf_counter()

    with pdfplumber.open(file_path) as pdf:
        page_texts = [page.extract_text(layout=False) for page in pdf.pages]
        content = "\n\n".join(text for text in page_texts if text)

    elapsed_ms = (time.perf_counter() - started) * 1000.0

    payload = {"content": content, "metadata": {"framework": "pdfplumber"}}
    payload["_extraction_time_ms"] = elapsed_ms
    payload["_peak_memory_bytes"] = _get_peak_memory_bytes()
    return payload
|
||||
|
||||
|
||||
def extract_batch(file_paths: list[str]) -> list[dict[str, Any]]:
    """Extract several PDFs one after another (pdfplumber has no batch API).

    A file that raises is recorded as an entry with empty ``content`` and the
    exception text under ``metadata["error"]``; the remaining files are still
    processed.  Every entry receives the same timing figures: the batch total
    and the total divided evenly by the number of files.
    """
    batch_started = time.perf_counter()

    outcomes = []
    for path in file_paths:
        try:
            with pdfplumber.open(path) as pdf:
                page_texts = [page.extract_text(layout=False) for page in pdf.pages]
                content = "\n\n".join(text for text in page_texts if text)
            entry = {"content": content, "metadata": {"framework": "pdfplumber"}}
        except Exception as exc:
            entry = {
                "content": "",
                "metadata": {"framework": "pdfplumber", "error": str(exc)},
            }
        outcomes.append(entry)

    total_ms = (time.perf_counter() - batch_started) * 1000.0
    share_ms = total_ms / len(file_paths) if file_paths else 0

    peak = _get_peak_memory_bytes()
    for entry in outcomes:
        entry.update(
            _extraction_time_ms=share_ms,
            _batch_total_ms=total_ms,
            _peak_memory_bytes=peak,
        )

    return outcomes
|
||||
|
||||
|
||||
def _worker(fn, args, conn):
|
||||
"""Run extraction in a forked child process.
|
||||
|
||||
Closes inherited stdin/stdout so the child cannot corrupt the
|
||||
parent's line-based JSON protocol.
|
||||
"""
|
||||
try:
|
||||
sys.stdin.close()
|
||||
sys.stdout = open(os.devnull, "w")
|
||||
except Exception:
|
||||
pass
|
||||
try:
|
||||
result = fn(*args)
|
||||
conn.send(result)
|
||||
except Exception as e:
|
||||
conn.send({"error": str(e), "_extraction_time_ms": 0})
|
||||
finally:
|
||||
conn.close()
|
||||
|
||||
|
||||
def _run_with_timeout(fn, args, timeout):
|
||||
"""Execute fn(*args) in a forked child with a timeout.
|
||||
|
||||
On timeout the child is killed but the parent stays alive —
|
||||
no expensive process restart is needed.
|
||||
"""
|
||||
try:
|
||||
ctx = _mp.get_context("fork")
|
||||
parent_conn, child_conn = ctx.Pipe(duplex=False)
|
||||
p = ctx.Process(target=_worker, args=(fn, args, child_conn))
|
||||
p.start()
|
||||
child_conn.close()
|
||||
|
||||
if parent_conn.poll(timeout=timeout):
|
||||
try:
|
||||
result = parent_conn.recv()
|
||||
except Exception:
|
||||
result = {"error": "worker process crashed", "_extraction_time_ms": 0}
|
||||
else:
|
||||
p.kill()
|
||||
result = {
|
||||
"error": f"extraction timed out after {timeout}s",
|
||||
"_extraction_time_ms": timeout * 1000.0,
|
||||
}
|
||||
|
||||
p.join(timeout=5)
|
||||
if p.is_alive():
|
||||
p.kill()
|
||||
p.join()
|
||||
parent_conn.close()
|
||||
return result
|
||||
except Exception:
|
||||
# Fork not available — fall back to in-process extraction
|
||||
try:
|
||||
return fn(*args)
|
||||
except Exception as e:
|
||||
return {"error": str(e), "_extraction_time_ms": 0}
|
||||
|
||||
|
||||
def _parse_path(line: str) -> str:
|
||||
"""Parse a request line: JSON object with path field, or plain file path."""
|
||||
stripped = line.strip()
|
||||
if stripped.startswith("{"):
|
||||
try:
|
||||
return json.loads(stripped).get("path", "")
|
||||
except (json.JSONDecodeError, ValueError):
|
||||
pass
|
||||
return stripped
|
||||
|
||||
|
||||
def run_server(timeout=None) -> None:
    """Serve extraction requests from stdin until it is exhausted.

    Prints ``READY`` once, then for each non-empty request line prints one
    JSON result line (flushed).  When ``timeout`` is given the extraction
    runs through ``_run_with_timeout``; otherwise it runs in-process and
    exceptions are turned into an ``{"error": ...}`` result.
    """
    print("READY", flush=True)
    for raw_line in sys.stdin:
        target = _parse_path(raw_line)
        if not target:
            continue
        if timeout is None:
            try:
                reply = extract_sync(target)
            except Exception as exc:
                reply = {"error": str(exc), "_extraction_time_ms": 0}
        else:
            reply = _run_with_timeout(extract_sync, (target,), timeout)
        print(json.dumps(reply), flush=True)
|
||||
|
||||
|
||||
def main() -> None:
    """Command-line entry point for the pdfplumber wrapper.

    Flags: ``--ocr`` / ``--no-ocr`` are accepted and ignored,
    ``--timeout=SECS`` sets the per-file timeout passed to server mode, and
    ``--format=`` must be ``plaintext`` (anything else exits with status 64).
    The first remaining argument is the mode (``sync``, ``batch`` or
    ``server``); the rest are file paths.

    Results are written to stdout as JSON without a trailing newline.  Usage
    errors and extraction failures are reported on stderr with exit status 1.
    A non-integer ``--timeout`` value is reported the same way instead of
    surfacing as an uncaught ``ValueError`` traceback.
    """
    timeout = None
    args = []
    for arg in sys.argv[1:]:
        if arg in ("--ocr", "--no-ocr"):
            continue  # Accepted but ignored - pdfplumber doesn't have OCR config
        if arg.startswith("--timeout="):
            raw_timeout = arg.split("=", 1)[1]
            try:
                timeout = int(raw_timeout)
            except ValueError:
                print(
                    f"Error: --timeout expects an integer number of seconds; got '{raw_timeout}'",
                    file=sys.stderr,
                )
                sys.exit(1)
        elif arg.startswith("--format="):
            _fmt = arg.split("=", 1)[1]
            if _fmt != "plaintext":
                print(f"{sys.argv[0]} only supports plaintext output; got --format {_fmt}", file=sys.stderr)
                sys.exit(64)
        else:
            args.append(arg)

    if len(args) < 1:
        print(
            "Usage: pdfplumber_extract.py [--ocr|--no-ocr] [--timeout=SECS] <mode> <file_path> [additional_files...]",
            file=sys.stderr,
        )
        print("Modes: sync, batch, server", file=sys.stderr)
        sys.exit(1)

    mode = args[0]
    file_paths = args[1:]

    try:
        if mode == "server":
            run_server(timeout=timeout)

        elif mode == "sync":
            if len(file_paths) != 1:
                print("Error: sync mode requires exactly one file", file=sys.stderr)
                sys.exit(1)
            payload = extract_sync(file_paths[0])
            print(json.dumps(payload), end="")

        elif mode == "batch":
            if len(file_paths) < 1:
                print("Error: batch mode requires at least one file", file=sys.stderr)
                sys.exit(1)

            results = extract_batch(file_paths)
            # A single-file batch emits the bare object rather than a one-element list.
            output = results[0] if len(file_paths) == 1 else results
            print(json.dumps(output), end="")

        else:
            print(f"Error: Unknown mode '{mode}'. Use sync, batch, or server", file=sys.stderr)
            sys.exit(1)

    except Exception as e:
        print(f"Error extracting with pdfplumber: {e}", file=sys.stderr)
        sys.exit(1)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
237
tools/benchmark-harness/scripts/pdftotext_extract.py
Normal file
237
tools/benchmark-harness/scripts/pdftotext_extract.py
Normal file
@@ -0,0 +1,237 @@
|
||||
"""pdftotext extraction wrapper for benchmark harness.
|
||||
|
||||
Supports three modes:
|
||||
- sync: extract text from entire PDF (sequential)
|
||||
- batch: process multiple files (simulated batch using loop)
|
||||
- server: persistent mode reading paths from stdin
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import multiprocessing as _mp
|
||||
import os
|
||||
import platform
|
||||
import resource
|
||||
import sys
|
||||
import time
|
||||
from typing import Any
|
||||
|
||||
import pdftotext
|
||||
|
||||
|
||||
def _get_peak_memory_bytes() -> int:
|
||||
"""Get peak memory usage in bytes using resource module."""
|
||||
usage = resource.getrusage(resource.RUSAGE_SELF)
|
||||
if platform.system() == "Linux":
|
||||
return usage.ru_maxrss * 1024
|
||||
return usage.ru_maxrss
|
||||
|
||||
|
||||
def extract_sync(file_path: str) -> dict[str, Any]:
    """Extract the text of a single PDF with pdftotext.

    The file is opened in binary mode, handed to ``pdftotext.PDF`` and the
    items it yields are joined with blank lines.

    Returns a dict with ``content``, ``metadata``, the elapsed wall-clock
    time in ``_extraction_time_ms`` and ``_peak_memory_bytes``.
    """
    started = time.perf_counter()

    with open(file_path, "rb") as handle:
        document = pdftotext.PDF(handle)
        content = "\n\n".join(document)

    elapsed_ms = (time.perf_counter() - started) * 1000.0

    payload = {"content": content, "metadata": {"framework": "pdftotext"}}
    payload["_extraction_time_ms"] = elapsed_ms
    payload["_peak_memory_bytes"] = _get_peak_memory_bytes()
    return payload
|
||||
|
||||
|
||||
def extract_batch(file_paths: list[str]) -> list[dict[str, Any]]:
    """Extract several PDFs one after another (pdftotext has no batch API).

    A file that raises is recorded as an entry with empty ``content`` and the
    exception text under ``metadata["error"]``; the remaining files are still
    processed.  Every entry receives the same timing figures: the batch total
    and the total divided evenly by the number of files.
    """
    batch_started = time.perf_counter()

    outcomes = []
    for path in file_paths:
        try:
            with open(path, "rb") as handle:
                document = pdftotext.PDF(handle)
                content = "\n\n".join(document)
            entry = {"content": content, "metadata": {"framework": "pdftotext"}}
        except Exception as exc:
            entry = {
                "content": "",
                "metadata": {"framework": "pdftotext", "error": str(exc)},
            }
        outcomes.append(entry)

    total_ms = (time.perf_counter() - batch_started) * 1000.0
    share_ms = total_ms / len(file_paths) if file_paths else 0

    peak = _get_peak_memory_bytes()
    for entry in outcomes:
        entry.update(
            _extraction_time_ms=share_ms,
            _batch_total_ms=total_ms,
            _peak_memory_bytes=peak,
        )

    return outcomes
|
||||
|
||||
|
||||
def _worker(fn, args, conn):
|
||||
"""Run extraction in a forked child process.
|
||||
|
||||
Closes inherited stdin/stdout so the child cannot corrupt the
|
||||
parent's line-based JSON protocol.
|
||||
"""
|
||||
try:
|
||||
sys.stdin.close()
|
||||
sys.stdout = open(os.devnull, "w")
|
||||
except Exception:
|
||||
pass
|
||||
try:
|
||||
result = fn(*args)
|
||||
conn.send(result)
|
||||
except Exception as e:
|
||||
conn.send({"error": str(e), "_extraction_time_ms": 0})
|
||||
finally:
|
||||
conn.close()
|
||||
|
||||
|
||||
def _run_with_timeout(fn, args, timeout):
|
||||
"""Execute fn(*args) in a forked child with a timeout.
|
||||
|
||||
On timeout the child is killed but the parent stays alive —
|
||||
no expensive process restart is needed.
|
||||
"""
|
||||
try:
|
||||
ctx = _mp.get_context("fork")
|
||||
parent_conn, child_conn = ctx.Pipe(duplex=False)
|
||||
p = ctx.Process(target=_worker, args=(fn, args, child_conn))
|
||||
p.start()
|
||||
child_conn.close()
|
||||
|
||||
if parent_conn.poll(timeout=timeout):
|
||||
try:
|
||||
result = parent_conn.recv()
|
||||
except Exception:
|
||||
result = {"error": "worker process crashed", "_extraction_time_ms": 0}
|
||||
else:
|
||||
p.kill()
|
||||
result = {
|
||||
"error": f"extraction timed out after {timeout}s",
|
||||
"_extraction_time_ms": timeout * 1000.0,
|
||||
}
|
||||
|
||||
p.join(timeout=5)
|
||||
if p.is_alive():
|
||||
p.kill()
|
||||
p.join()
|
||||
parent_conn.close()
|
||||
return result
|
||||
except Exception:
|
||||
# Fork not available — fall back to in-process extraction
|
||||
try:
|
||||
return fn(*args)
|
||||
except Exception as e:
|
||||
return {"error": str(e), "_extraction_time_ms": 0}
|
||||
|
||||
|
||||
def _parse_path(line: str) -> str:
|
||||
"""Parse a request line: JSON object with path field, or plain file path."""
|
||||
stripped = line.strip()
|
||||
if stripped.startswith("{"):
|
||||
try:
|
||||
return json.loads(stripped).get("path", "")
|
||||
except (json.JSONDecodeError, ValueError):
|
||||
pass
|
||||
return stripped
|
||||
|
||||
|
||||
def run_server(timeout=None) -> None:
    """Serve extraction requests from stdin until it is exhausted.

    Prints ``READY`` once, then for each non-empty request line prints one
    JSON result line (flushed).  When ``timeout`` is given the extraction
    runs through ``_run_with_timeout``; otherwise it runs in-process and
    exceptions are turned into an ``{"error": ...}`` result.
    """
    print("READY", flush=True)
    for raw_line in sys.stdin:
        target = _parse_path(raw_line)
        if not target:
            continue
        if timeout is None:
            try:
                reply = extract_sync(target)
            except Exception as exc:
                reply = {"error": str(exc), "_extraction_time_ms": 0}
        else:
            reply = _run_with_timeout(extract_sync, (target,), timeout)
        print(json.dumps(reply), flush=True)
|
||||
|
||||
|
||||
def main() -> None:
    """Command-line entry point for the pdftotext wrapper.

    Flags: ``--ocr`` / ``--no-ocr`` are accepted and ignored,
    ``--timeout=SECS`` sets the per-file timeout passed to server mode, and
    ``--format=`` must be ``plaintext`` (anything else exits with status 64).
    The first remaining argument is the mode (``sync``, ``batch`` or
    ``server``); the rest are file paths.

    Results are written to stdout as JSON without a trailing newline.  Usage
    errors and extraction failures are reported on stderr with exit status 1.
    A non-integer ``--timeout`` value is reported the same way instead of
    surfacing as an uncaught ``ValueError`` traceback.
    """
    timeout = None
    args = []
    for arg in sys.argv[1:]:
        if arg in ("--ocr", "--no-ocr"):
            continue  # Accepted but ignored - pdftotext doesn't have OCR config
        if arg.startswith("--timeout="):
            raw_timeout = arg.split("=", 1)[1]
            try:
                timeout = int(raw_timeout)
            except ValueError:
                print(
                    f"Error: --timeout expects an integer number of seconds; got '{raw_timeout}'",
                    file=sys.stderr,
                )
                sys.exit(1)
        elif arg.startswith("--format="):
            _fmt = arg.split("=", 1)[1]
            if _fmt != "plaintext":
                print(f"{sys.argv[0]} only supports plaintext output; got --format {_fmt}", file=sys.stderr)
                sys.exit(64)
        else:
            args.append(arg)

    if len(args) < 1:
        print(
            "Usage: pdftotext_extract.py [--ocr|--no-ocr] [--timeout=SECS] <mode> <file_path> [additional_files...]",
            file=sys.stderr,
        )
        print("Modes: sync, batch, server", file=sys.stderr)
        sys.exit(1)

    mode = args[0]
    file_paths = args[1:]

    try:
        if mode == "server":
            run_server(timeout=timeout)

        elif mode == "sync":
            if len(file_paths) != 1:
                print("Error: sync mode requires exactly one file", file=sys.stderr)
                sys.exit(1)
            payload = extract_sync(file_paths[0])
            print(json.dumps(payload), end="")

        elif mode == "batch":
            if len(file_paths) < 1:
                print("Error: batch mode requires at least one file", file=sys.stderr)
                sys.exit(1)

            results = extract_batch(file_paths)
            # A single-file batch emits the bare object rather than a one-element list.
            output = results[0] if len(file_paths) == 1 else results
            print(json.dumps(output), end="")

        else:
            print(f"Error: Unknown mode '{mode}'. Use sync, batch, or server", file=sys.stderr)
            sys.exit(1)

    except Exception as e:
        print(f"Error extracting with pdftotext: {e}", file=sys.stderr)
        sys.exit(1)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
245
tools/benchmark-harness/scripts/playa_pdf_extract.py
Normal file
245
tools/benchmark-harness/scripts/playa_pdf_extract.py
Normal file
@@ -0,0 +1,245 @@
|
||||
"""playa-pdf extraction wrapper for benchmark harness.
|
||||
|
||||
Supports three modes:
|
||||
- sync: extract text page-by-page (sequential)
|
||||
- batch: process multiple files (simulated batch using loop)
|
||||
- server: persistent mode reading paths from stdin
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import multiprocessing as _mp
|
||||
import os
|
||||
import platform
|
||||
import resource
|
||||
import sys
|
||||
import time
|
||||
from typing import Any
|
||||
|
||||
import playa
|
||||
|
||||
|
||||
def _get_peak_memory_bytes() -> int:
|
||||
"""Get peak memory usage in bytes using resource module."""
|
||||
usage = resource.getrusage(resource.RUSAGE_SELF)
|
||||
if platform.system() == "Linux":
|
||||
return usage.ru_maxrss * 1024
|
||||
return usage.ru_maxrss
|
||||
|
||||
|
||||
def extract_sync(file_path: str) -> dict[str, Any]:
    """Extract the text of a single PDF with playa-pdf.

    Every page is run through ``extract_text()``; pages that yield no text
    are dropped and the rest are joined with blank lines.

    Returns a dict with ``content``, ``metadata``, the elapsed wall-clock
    time in ``_extraction_time_ms`` and ``_peak_memory_bytes``.
    """
    started = time.perf_counter()

    with playa.open(file_path) as doc:
        page_texts = [page.extract_text() for page in doc.pages]
        content = "\n\n".join(text for text in page_texts if text)

    elapsed_ms = (time.perf_counter() - started) * 1000.0

    payload = {"content": content, "metadata": {"framework": "playa-pdf"}}
    payload["_extraction_time_ms"] = elapsed_ms
    payload["_peak_memory_bytes"] = _get_peak_memory_bytes()
    return payload
|
||||
|
||||
|
||||
def extract_batch(file_paths: list[str]) -> list[dict[str, Any]]:
    """Extract several PDFs one after another (playa-pdf has no batch API).

    A file that raises is recorded as an entry with empty ``content`` and the
    exception text under ``metadata["error"]``; the remaining files are still
    processed.  Every entry receives the same timing figures: the batch total
    and the total divided evenly by the number of files.
    """
    batch_started = time.perf_counter()

    outcomes = []
    for path in file_paths:
        try:
            with playa.open(path) as doc:
                page_texts = [page.extract_text() for page in doc.pages]
                content = "\n\n".join(text for text in page_texts if text)
            entry = {"content": content, "metadata": {"framework": "playa-pdf"}}
        except Exception as exc:
            entry = {
                "content": "",
                "metadata": {"framework": "playa-pdf", "error": str(exc)},
            }
        outcomes.append(entry)

    total_ms = (time.perf_counter() - batch_started) * 1000.0
    share_ms = total_ms / len(file_paths) if file_paths else 0

    peak = _get_peak_memory_bytes()
    for entry in outcomes:
        entry.update(
            _extraction_time_ms=share_ms,
            _batch_total_ms=total_ms,
            _peak_memory_bytes=peak,
        )

    return outcomes
|
||||
|
||||
|
||||
def _worker(fn, args, conn):
|
||||
"""Run extraction in a forked child process.
|
||||
|
||||
Closes inherited stdin/stdout so the child cannot corrupt the
|
||||
parent's line-based JSON protocol.
|
||||
"""
|
||||
try:
|
||||
sys.stdin.close()
|
||||
sys.stdout = open(os.devnull, "w")
|
||||
except Exception:
|
||||
pass
|
||||
try:
|
||||
result = fn(*args)
|
||||
conn.send(result)
|
||||
except Exception as e:
|
||||
conn.send({"error": str(e), "_extraction_time_ms": 0})
|
||||
finally:
|
||||
conn.close()
|
||||
|
||||
|
||||
def _run_with_timeout(fn, args, timeout):
|
||||
"""Execute fn(*args) in a forked child with a timeout.
|
||||
|
||||
On timeout the child is killed but the parent stays alive —
|
||||
no expensive process restart is needed.
|
||||
"""
|
||||
try:
|
||||
ctx = _mp.get_context("fork")
|
||||
parent_conn, child_conn = ctx.Pipe(duplex=False)
|
||||
p = ctx.Process(target=_worker, args=(fn, args, child_conn))
|
||||
p.start()
|
||||
child_conn.close()
|
||||
|
||||
if parent_conn.poll(timeout=timeout):
|
||||
try:
|
||||
result = parent_conn.recv()
|
||||
except Exception:
|
||||
result = {"error": "worker process crashed", "_extraction_time_ms": 0}
|
||||
else:
|
||||
p.kill()
|
||||
result = {
|
||||
"error": f"extraction timed out after {timeout}s",
|
||||
"_extraction_time_ms": timeout * 1000.0,
|
||||
}
|
||||
|
||||
p.join(timeout=5)
|
||||
if p.is_alive():
|
||||
p.kill()
|
||||
p.join()
|
||||
parent_conn.close()
|
||||
return result
|
||||
except Exception:
|
||||
# Fork not available — fall back to in-process extraction
|
||||
try:
|
||||
return fn(*args)
|
||||
except Exception as e:
|
||||
return {"error": str(e), "_extraction_time_ms": 0}
|
||||
|
||||
|
||||
def _parse_path(line: str) -> str:
|
||||
"""Parse a request line: JSON object with path field, or plain file path."""
|
||||
stripped = line.strip()
|
||||
if stripped.startswith("{"):
|
||||
try:
|
||||
return json.loads(stripped).get("path", "")
|
||||
except (json.JSONDecodeError, ValueError):
|
||||
pass
|
||||
return stripped
|
||||
|
||||
|
||||
def run_server(timeout=None) -> None:
    """Serve extraction requests from stdin until it is exhausted.

    Prints ``READY`` once, then for each non-empty request line prints one
    JSON result line (flushed).  When ``timeout`` is given the extraction
    runs through ``_run_with_timeout``; otherwise it runs in-process and
    exceptions are turned into an ``{"error": ...}`` result.
    """
    print("READY", flush=True)
    for raw_line in sys.stdin:
        target = _parse_path(raw_line)
        if not target:
            continue
        if timeout is None:
            try:
                reply = extract_sync(target)
            except Exception as exc:
                reply = {"error": str(exc), "_extraction_time_ms": 0}
        else:
            reply = _run_with_timeout(extract_sync, (target,), timeout)
        print(json.dumps(reply), flush=True)
|
||||
|
||||
|
||||
def main() -> None:
    """Command-line entry point for the playa-pdf wrapper.

    Flags: ``--ocr`` / ``--no-ocr`` are accepted and ignored,
    ``--timeout=SECS`` sets the per-file timeout passed to server mode, and
    ``--format=`` must be ``plaintext`` (anything else exits with status 64).
    The first remaining argument is the mode (``sync``, ``batch`` or
    ``server``); the rest are file paths.

    Results are written to stdout as JSON without a trailing newline.  Usage
    errors and extraction failures are reported on stderr with exit status 1.
    A non-integer ``--timeout`` value is reported the same way instead of
    surfacing as an uncaught ``ValueError`` traceback.
    """
    timeout = None
    args = []
    for arg in sys.argv[1:]:
        if arg in ("--ocr", "--no-ocr"):
            continue  # Accepted but ignored - playa-pdf doesn't have OCR capability
        if arg.startswith("--timeout="):
            raw_timeout = arg.split("=", 1)[1]
            try:
                timeout = int(raw_timeout)
            except ValueError:
                print(
                    f"Error: --timeout expects an integer number of seconds; got '{raw_timeout}'",
                    file=sys.stderr,
                )
                sys.exit(1)
        elif arg.startswith("--format="):
            _fmt = arg.split("=", 1)[1]
            if _fmt != "plaintext":
                print(f"{sys.argv[0]} only supports plaintext output; got --format {_fmt}", file=sys.stderr)
                sys.exit(64)
        else:
            args.append(arg)

    if len(args) < 1:
        print(
            "Usage: playa_pdf_extract.py [--ocr|--no-ocr] [--timeout=SECS] <mode> <file_path> [additional_files...]",
            file=sys.stderr,
        )
        print("Modes: sync, batch, server", file=sys.stderr)
        sys.exit(1)

    mode = args[0]
    file_paths = args[1:]

    try:
        if mode == "server":
            run_server(timeout=timeout)

        elif mode == "sync":
            if len(file_paths) != 1:
                print("Error: sync mode requires exactly one file", file=sys.stderr)
                sys.exit(1)
            payload = extract_sync(file_paths[0])
            print(json.dumps(payload), end="")

        elif mode == "batch":
            if len(file_paths) < 1:
                print("Error: batch mode requires at least one file", file=sys.stderr)
                sys.exit(1)

            results = extract_batch(file_paths)
            # A single-file batch emits the bare object rather than a one-element list.
            output = results[0] if len(file_paths) == 1 else results
            print(json.dumps(output), end="")

        else:
            print(f"Error: Unknown mode '{mode}'. Use sync, batch, or server", file=sys.stderr)
            sys.exit(1)

    except Exception as e:
        print(f"Error extracting with playa-pdf: {e}", file=sys.stderr)
        sys.exit(1)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
184
tools/benchmark-harness/scripts/pymupdf4llm_extract.py
Normal file
184
tools/benchmark-harness/scripts/pymupdf4llm_extract.py
Normal file
@@ -0,0 +1,184 @@
|
||||
"""PyMuPDF4LLM extraction wrapper for benchmark harness."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import multiprocessing as _mp
|
||||
import os
|
||||
import platform
|
||||
import resource
|
||||
import sys
|
||||
import time
|
||||
|
||||
# Suppress MuPDF C-level error/warning messages that can corrupt the
|
||||
# persistent server's line-based JSON protocol on stdout.
|
||||
# See: https://github.com/pymupdf/PyMuPDF/issues/606
|
||||
import pymupdf
|
||||
|
||||
# Import pymupdf.layout BEFORE pymupdf4llm to enable improved layout analysis
|
||||
# and suppress the "Consider using the pymupdf_layout package" info message.
|
||||
import pymupdf.layout
|
||||
import pymupdf4llm
|
||||
|
||||
pymupdf.TOOLS.mupdf_display_errors(False)
|
||||
|
||||
|
||||
def _get_peak_memory_bytes() -> int:
|
||||
"""Get peak memory usage in bytes using resource module."""
|
||||
usage = resource.getrusage(resource.RUSAGE_SELF)
|
||||
if platform.system() == "Linux":
|
||||
return usage.ru_maxrss * 1024
|
||||
return usage.ru_maxrss
|
||||
|
||||
|
||||
def extract_sync(file_path: str) -> dict:
    """Convert one document to Markdown with PyMuPDF4LLM.

    Calls ``pymupdf4llm.to_markdown`` with progress display and image
    writing turned off, timing only that call.

    Returns a dict with ``content``, ``metadata``, the elapsed wall-clock
    time in ``_extraction_time_ms`` and ``_peak_memory_bytes``.
    """
    started = time.perf_counter()
    content = pymupdf4llm.to_markdown(file_path, show_progress=False, write_images=False)
    elapsed_ms = (time.perf_counter() - started) * 1000.0

    payload = {"content": content, "metadata": {"framework": "pymupdf4llm"}}
    payload["_extraction_time_ms"] = elapsed_ms
    payload["_peak_memory_bytes"] = _get_peak_memory_bytes()
    return payload
|
||||
|
||||
|
||||
def _worker(fn, args, conn):
|
||||
"""Run extraction in a forked child process.
|
||||
|
||||
Closes inherited stdin/stdout so the child cannot corrupt the
|
||||
parent's line-based JSON protocol.
|
||||
"""
|
||||
try:
|
||||
sys.stdin.close()
|
||||
sys.stdout = open(os.devnull, "w")
|
||||
except Exception:
|
||||
pass
|
||||
try:
|
||||
result = fn(*args)
|
||||
conn.send(result)
|
||||
except Exception as e:
|
||||
conn.send({"error": str(e), "_extraction_time_ms": 0})
|
||||
finally:
|
||||
conn.close()
|
||||
|
||||
|
||||
def _run_with_timeout(fn, args, timeout):
|
||||
"""Execute fn(*args) in a forked child with a timeout.
|
||||
|
||||
On timeout the child is killed but the parent stays alive —
|
||||
no expensive process restart is needed.
|
||||
"""
|
||||
try:
|
||||
ctx = _mp.get_context("fork")
|
||||
parent_conn, child_conn = ctx.Pipe(duplex=False)
|
||||
p = ctx.Process(target=_worker, args=(fn, args, child_conn))
|
||||
p.start()
|
||||
child_conn.close()
|
||||
|
||||
if parent_conn.poll(timeout=timeout):
|
||||
try:
|
||||
result = parent_conn.recv()
|
||||
except Exception:
|
||||
result = {"error": "worker process crashed", "_extraction_time_ms": 0}
|
||||
else:
|
||||
p.kill()
|
||||
result = {
|
||||
"error": f"extraction timed out after {timeout}s",
|
||||
"_extraction_time_ms": timeout * 1000.0,
|
||||
}
|
||||
|
||||
p.join(timeout=5)
|
||||
if p.is_alive():
|
||||
p.kill()
|
||||
p.join()
|
||||
parent_conn.close()
|
||||
return result
|
||||
except Exception:
|
||||
# Fork not available — fall back to in-process extraction
|
||||
try:
|
||||
return fn(*args)
|
||||
except Exception as e:
|
||||
return {"error": str(e), "_extraction_time_ms": 0}
|
||||
|
||||
|
||||
def _parse_path(line: str) -> str:
|
||||
"""Parse a request line: JSON object with path field, or plain file path."""
|
||||
stripped = line.strip()
|
||||
if stripped.startswith("{"):
|
||||
try:
|
||||
return json.loads(stripped).get("path", "")
|
||||
except (json.JSONDecodeError, ValueError):
|
||||
pass
|
||||
return stripped
|
||||
|
||||
|
||||
def run_server(timeout=None) -> None:
    """Serve extraction requests from stdin until it is exhausted.

    Prints ``READY`` once, then for each non-empty request line prints one
    JSON result line (flushed).  When ``timeout`` is given the extraction
    runs through ``_run_with_timeout``; otherwise it runs in-process and
    exceptions are turned into an ``{"error": ...}`` result.
    """
    print("READY", flush=True)
    for raw_line in sys.stdin:
        target = _parse_path(raw_line)
        if not target:
            continue
        if timeout is None:
            try:
                reply = extract_sync(target)
            except Exception as exc:
                reply = {"error": str(exc), "_extraction_time_ms": 0}
        else:
            reply = _run_with_timeout(extract_sync, (target,), timeout)
        print(json.dumps(reply), flush=True)
|
||||
|
||||
|
||||
def main() -> None:
    """Command-line entry point for the PyMuPDF4LLM wrapper.

    Flags: ``--ocr`` / ``--no-ocr`` are accepted but have no effect (the
    value was previously stored in a local that nothing read),
    ``--timeout=SECS`` sets the per-file timeout passed to server mode, and
    ``--format=`` must be ``markdown`` (anything else exits with status 64).

    The first remaining argument selects the mode: ``server`` runs the
    persistent loop, ``sync`` extracts the file named by the next argument,
    and any other value is treated as a file path itself (legacy form).
    The result is written to stdout as JSON without a trailing newline;
    usage errors and extraction failures go to stderr with exit status 1.
    A non-integer ``--timeout`` value is reported the same way instead of
    surfacing as an uncaught ``ValueError`` traceback.
    """
    timeout = None
    args = []
    for arg in sys.argv[1:]:
        if arg in ("--ocr", "--no-ocr"):
            continue  # Accepted for CLI compatibility; never consulted.
        if arg.startswith("--timeout="):
            raw_timeout = arg.split("=", 1)[1]
            try:
                timeout = int(raw_timeout)
            except ValueError:
                print(
                    f"Error: --timeout expects an integer number of seconds; got '{raw_timeout}'",
                    file=sys.stderr,
                )
                sys.exit(1)
        elif arg.startswith("--format="):
            _fmt = arg.split("=", 1)[1]
            if _fmt != "markdown":
                print(f"{sys.argv[0]} only supports markdown output; got --format {_fmt}", file=sys.stderr)
                sys.exit(64)
        else:
            args.append(arg)

    if len(args) < 1:
        print("Usage: pymupdf4llm_extract.py [--ocr|--no-ocr] [--timeout=SECS] <mode> <file_path>", file=sys.stderr)
        print("Modes: sync, server", file=sys.stderr)
        sys.exit(1)

    mode = args[0]
    if mode == "server":
        run_server(timeout=timeout)
        return

    if mode == "sync":
        if len(args) < 2:
            print("Error: sync mode requires a file path", file=sys.stderr)
            sys.exit(1)
        file_path = args[1]
    else:
        # Legacy fallback for direct file path
        file_path = args[0]

    try:
        payload = extract_sync(file_path)
        print(json.dumps(payload), end="")
    except Exception as e:
        print(f"Error extracting with PyMuPDF4LLM: {e}", file=sys.stderr)
        sys.exit(1)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
245
tools/benchmark-harness/scripts/pypdf_extract.py
Normal file
245
tools/benchmark-harness/scripts/pypdf_extract.py
Normal file
@@ -0,0 +1,245 @@
|
||||
"""pypdf extraction wrapper for benchmark harness.
|
||||
|
||||
Supports three modes:
|
||||
- sync: extract text page-by-page (sequential)
|
||||
- batch: process multiple files (simulated batch using loop)
|
||||
- server: persistent mode reading paths from stdin
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import multiprocessing as _mp
|
||||
import os
|
||||
import platform
|
||||
import resource
|
||||
import sys
|
||||
import time
|
||||
from typing import Any
|
||||
|
||||
from pypdf import PdfReader
|
||||
|
||||
|
||||
def _get_peak_memory_bytes() -> int:
|
||||
"""Get peak memory usage in bytes using resource module."""
|
||||
usage = resource.getrusage(resource.RUSAGE_SELF)
|
||||
if platform.system() == "Linux":
|
||||
return usage.ru_maxrss * 1024
|
||||
return usage.ru_maxrss
|
||||
|
||||
|
||||
def extract_sync(file_path: str) -> dict[str, Any]:
    """Extract the text of a single PDF with pypdf.

    Every page of a ``PdfReader`` is run through ``extract_text()``; pages
    that yield no text are dropped and the rest are joined with blank lines.

    Returns a dict with ``content``, ``metadata``, the elapsed wall-clock
    time in ``_extraction_time_ms`` and ``_peak_memory_bytes``.
    """
    started = time.perf_counter()

    reader = PdfReader(file_path)
    page_texts = [page.extract_text() for page in reader.pages]
    content = "\n\n".join(text for text in page_texts if text)

    elapsed_ms = (time.perf_counter() - started) * 1000.0

    payload = {"content": content, "metadata": {"framework": "pypdf"}}
    payload["_extraction_time_ms"] = elapsed_ms
    payload["_peak_memory_bytes"] = _get_peak_memory_bytes()
    return payload
|
||||
|
||||
|
||||
def extract_batch(file_paths: list[str]) -> list[dict[str, Any]]:
    """Extract several PDFs one after another (pypdf has no batch API).

    A file that raises is recorded as an entry with empty ``content`` and the
    exception text under ``metadata["error"]``; the remaining files are still
    processed.  Every entry receives the same timing figures: the batch total
    and the total divided evenly by the number of files.
    """
    batch_started = time.perf_counter()

    outcomes = []
    for path in file_paths:
        try:
            reader = PdfReader(path)
            page_texts = [page.extract_text() for page in reader.pages]
            content = "\n\n".join(text for text in page_texts if text)
            entry = {"content": content, "metadata": {"framework": "pypdf"}}
        except Exception as exc:
            entry = {
                "content": "",
                "metadata": {"framework": "pypdf", "error": str(exc)},
            }
        outcomes.append(entry)

    total_ms = (time.perf_counter() - batch_started) * 1000.0
    share_ms = total_ms / len(file_paths) if file_paths else 0

    peak = _get_peak_memory_bytes()
    for entry in outcomes:
        entry.update(
            _extraction_time_ms=share_ms,
            _batch_total_ms=total_ms,
            _peak_memory_bytes=peak,
        )

    return outcomes
|
||||
|
||||
|
||||
def _worker(fn, args, conn):
|
||||
"""Run extraction in a forked child process.
|
||||
|
||||
Closes inherited stdin/stdout so the child cannot corrupt the
|
||||
parent's line-based JSON protocol.
|
||||
"""
|
||||
try:
|
||||
sys.stdin.close()
|
||||
sys.stdout = open(os.devnull, "w")
|
||||
except Exception:
|
||||
pass
|
||||
try:
|
||||
result = fn(*args)
|
||||
conn.send(result)
|
||||
except Exception as e:
|
||||
conn.send({"error": str(e), "_extraction_time_ms": 0})
|
||||
finally:
|
||||
conn.close()
|
||||
|
||||
|
||||
def _run_with_timeout(fn, args, timeout):
    """Execute ``fn(*args)`` in a forked child process with a timeout.

    On timeout the child is killed but the parent stays alive, so no
    expensive process restart is needed.

    The in-process fallback is used only when the child cannot be created or
    started (for example when the ``fork`` start method is unavailable).  Once
    the child is running, any failure while supervising it is reported as an
    error result; the extraction is never re-run in-process, because that
    second run would have no timeout protection.

    Args:
        fn: Callable to execute.
        args: Tuple of positional arguments for ``fn``.
        timeout: Maximum number of seconds to wait for a result.

    Returns:
        Whatever the child sent back (the return value of ``fn`` or an error
        dict), or a dict with ``error`` and ``_extraction_time_ms`` keys
        describing a timeout, a crashed worker or a supervision failure.
    """
    parent_conn = child_conn = None
    try:
        ctx = _mp.get_context("fork")
        parent_conn, child_conn = ctx.Pipe(duplex=False)
        p = ctx.Process(target=_worker, args=(fn, args, child_conn))
        p.start()
    except Exception:
        # Fork not available — release the pipe (if any) and fall back to
        # in-process extraction.
        for end in (parent_conn, child_conn):
            if end is not None:
                end.close()
        try:
            return fn(*args)
        except Exception as e:
            return {"error": str(e), "_extraction_time_ms": 0}

    try:
        # The parent keeps only the read end; closing the write end here lets
        # recv() see EOF if the child dies without sending anything.
        child_conn.close()

        if parent_conn.poll(timeout=timeout):
            try:
                result = parent_conn.recv()
            except Exception:
                result = {"error": "worker process crashed", "_extraction_time_ms": 0}
        else:
            p.kill()
            result = {
                "error": f"extraction timed out after {timeout}s",
                "_extraction_time_ms": timeout * 1000.0,
            }
    except Exception as e:
        # Supervision itself failed; stop the child instead of re-running fn.
        p.kill()
        result = {"error": f"worker supervision failed: {e}", "_extraction_time_ms": 0}
    finally:
        # Reap the child; escalate to kill if it does not exit promptly.
        p.join(timeout=5)
        if p.is_alive():
            p.kill()
            p.join()
        parent_conn.close()
    return result
|
||||
|
||||
|
||||
def _parse_path(line: str) -> str:
    """Extract the file path from one request line.

    A line whose stripped form starts with ``{`` is tried as JSON and the
    value of its ``path`` field is returned (``""`` when the field is
    missing).  Any other line, or one that fails to parse as JSON, is
    returned stripped of surrounding whitespace.
    """
    candidate = line.strip()
    if not candidate.startswith("{"):
        return candidate
    try:
        request = json.loads(candidate)
    except (json.JSONDecodeError, ValueError):
        # Not valid JSON after all: treat the text as a plain path.
        return candidate
    return request.get("path", "")
|
||||
|
||||
|
||||
def run_server(timeout=None) -> None:
    """Serve extraction requests from stdin until it is exhausted.

    Prints ``READY`` once, then handles one request per input line: the path
    is taken from the line via ``_parse_path`` and the extraction result is
    written to stdout as a single JSON line.  Lines that yield an empty path
    are skipped without any output.

    Args:
        timeout: When not ``None``, each extraction goes through
            ``_run_with_timeout`` with this limit; otherwise it runs
            in-process and an exception becomes an error dict.
    """
    print("READY", flush=True)
    for request_line in sys.stdin:
        target = _parse_path(request_line)
        if not target:
            continue

        if timeout is None:
            try:
                outcome = extract_sync(target)
            except Exception as exc:
                outcome = {"error": str(exc), "_extraction_time_ms": 0}
        else:
            outcome = _run_with_timeout(extract_sync, (target,), timeout)

        print(json.dumps(outcome), flush=True)
|
||||
|
||||
|
||||
def main() -> None:
    """Command-line entry point for the pypdf extraction wrapper.

    Flags are read from ``sys.argv``: ``--ocr`` / ``--no-ocr`` are accepted
    and ignored, ``--timeout=SECS`` sets the timeout passed to server mode,
    and ``--format=`` must be ``plaintext`` (anything else exits with status
    64).  The first remaining argument is the mode and the rest are file
    paths:

    * ``server`` runs ``run_server``;
    * ``sync`` needs exactly one file and prints its result as JSON;
    * ``batch`` needs at least one file and prints a JSON object for a single
      file or a JSON array for several.

    Missing arguments, an unknown mode, or an exception during extraction
    print a message to stderr and exit with status 1.
    """
    timeout = None
    positional = []
    for token in sys.argv[1:]:
        if token == "--ocr" or token == "--no-ocr":
            continue  # Accepted but ignored - pypdf doesn't have OCR config
        if token.startswith("--timeout="):
            timeout = int(token[len("--timeout="):])
            continue
        if token.startswith("--format="):
            requested = token[len("--format="):]
            if requested != "plaintext":
                print(f"{sys.argv[0]} only supports plaintext output; got --format {requested}", file=sys.stderr)
                sys.exit(64)
            continue
        positional.append(token)

    if not positional:
        print(
            "Usage: pypdf_extract.py [--ocr|--no-ocr] [--timeout=SECS] <mode> <file_path> [additional_files...]",
            file=sys.stderr,
        )
        print("Modes: sync, batch, server", file=sys.stderr)
        sys.exit(1)

    mode, file_paths = positional[0], positional[1:]

    try:
        if mode == "server":
            run_server(timeout=timeout)

        elif mode == "sync":
            if len(file_paths) != 1:
                print("Error: sync mode requires exactly one file", file=sys.stderr)
                sys.exit(1)
            print(json.dumps(extract_sync(file_paths[0])), end="")

        elif mode == "batch":
            if not file_paths:
                print("Error: batch mode requires at least one file", file=sys.stderr)
                sys.exit(1)

            results = extract_batch(file_paths)
            # A single file is emitted as a bare object rather than a list.
            output = results[0] if len(file_paths) == 1 else results
            print(json.dumps(output), end="")

        else:
            print(f"Error: Unknown mode '{mode}'. Use sync, batch, or server", file=sys.stderr)
            sys.exit(1)

    except Exception as exc:
        print(f"Error extracting with pypdf: {exc}", file=sys.stderr)
        sys.exit(1)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
166
tools/benchmark-harness/scripts/sanitize_pandoc_gt.py
Normal file
166
tools/benchmark-harness/scripts/sanitize_pandoc_gt.py
Normal file
@@ -0,0 +1,166 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Sanitize pandoc-generated markdown ground truth files.
|
||||
|
||||
Removes common pandoc artifacts that don't represent actual document structure.
|
||||
|
||||
Usage:
|
||||
# Single file (in-place):
|
||||
python sanitize_pandoc_gt.py input.md
|
||||
|
||||
# Pipe mode:
|
||||
pandoc -f docbook -t gfm --wrap=none input.xml | python sanitize_pandoc_gt.py > output.md
|
||||
|
||||
# Dry run (show diff without modifying):
|
||||
python sanitize_pandoc_gt.py --dry-run input.md
|
||||
|
||||
# Batch all GT files (dry run):
|
||||
python sanitize_pandoc_gt.py --dry-run --batch test_documents/ground_truth/
|
||||
|
||||
# Batch all GT files (apply):
|
||||
python sanitize_pandoc_gt.py --batch test_documents/ground_truth/
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import difflib
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
|
||||
|
||||
def sanitize(text: str) -> str:
    """Remove pandoc-specific artifacts from markdown ground-truth text.

    Outside fenced code blocks:

    * lines consisting only of a ``:::`` div marker (optionally followed by
      ``{...}`` attributes) are dropped;
    * a trailing ``{.class}`` / ``{#id}`` attribute block is stripped from
      ATX headings;
    * ``<!-- end list -->`` is replaced by one blank line (none is added when
      the previous output line is already blank), keeping the structural
      separation it provides;
    * ``<!-- -->`` lines are dropped.

    Opening code fences have ``{.lang ...}`` attributes reduced to a plain
    ``lang`` info string and other ``{...}`` attributes removed; the fence's
    indentation is preserved.  Lines inside a fenced block are copied
    verbatim.  A block is closed only by a bare fence that uses the same
    character as the opening fence and is at least as long, so a ``~~~`` line
    inside a backtick block, or a shorter backtick run inside a longer fence,
    does not end the block early.  A line that starts with backticks and
    contains further backticks after them is inline code, not a fence.

    Blank lines are never collapsed, since they are structural in markdown.

    Args:
        text: Newline-separated markdown text.

    Returns:
        The cleaned text with trailing blank lines removed and exactly one
        trailing newline, or ``""`` when nothing but blank lines remains.
    """
    fence_char = ""  # "`" or "~" while inside a fenced block, "" outside
    fence_len = 0  # marker length of the fence that opened the current block
    result = []

    for line in text.split("\n"):
        stripped = line.strip()
        fence = re.match(r"^(`{3,}|~{3,})(.*)$", stripped)

        if fence_char:
            # Inside a code block: copy verbatim; only a matching bare fence closes it.
            if (
                fence
                and fence.group(1)[0] == fence_char
                and len(fence.group(1)) >= fence_len
                and not fence.group(2).strip()
            ):
                fence_char, fence_len = "", 0
            result.append(line)
            continue

        # The info string of a backtick fence cannot contain backticks.
        if fence and not (fence.group(1)[0] == "`" and "`" in fence.group(2)):
            fence_char, fence_len = fence.group(1)[0], len(fence.group(1))
            # Convert ``` {.python} to ```python
            attrs = re.match(r"^(\s*)(`{3,}|~{3,})\s*\{\s*\.(\w+)(?:\s+[^}]*)?\}\s*$", line)
            if attrs:
                line = "".join(attrs.groups())
            else:
                # Remove {.class} from code fences without extracting language
                line = re.sub(r"^(\s*)(`{3,}|~{3,})\s*\{[^}]*\}\s*$", r"\1\2", line)
            result.append(line)
            continue

        # === Pandoc div wrappers ===
        if re.match(r"^:::\s*(\{.*\})?\s*$", stripped):
            continue

        # === Remove {.class} and {#id} attributes from headings ===
        if re.match(r"^#{1,6}\s", line):
            line = re.sub(r"\s*\{[.#][^}]*\}\s*$", "", line)

        # === Replace <!-- end list --> pandoc markers with blank line ===
        if stripped == "<!-- end list -->":
            if not (result and result[-1].strip() == ""):
                result.append("")
            continue

        # === Remove pandoc-specific empty HTML comments only ===
        # Keep <!-- image --> and other semantic comments
        if stripped == "<!-- -->":
            continue

        result.append(line)

    # Trim trailing blank lines, ensure single trailing newline
    while result and result[-1].strip() == "":
        result.pop()

    return "\n".join(result) + "\n" if result else ""
|
||||
|
||||
|
||||
def process_file(path: str, dry_run: bool = False) -> tuple[bool, str]:
    """Sanitize a single markdown file.

    The file is read and written as UTF-8 explicitly rather than with the
    locale's default encoding, so non-ASCII text is handled the same way on
    every platform.

    Args:
        path: File to process.
        dry_run: When true, compute the diff but leave the file untouched.

    Returns:
        ``(changed, diff_text)``: ``changed`` is ``False`` with an empty diff
        when sanitizing leaves the content unchanged; otherwise ``True`` with
        a unified diff (3 lines of context) from the original to the cleaned
        content.
    """
    with open(path, encoding="utf-8") as f:
        original = f.read()

    cleaned = sanitize(original)

    if original == cleaned:
        return False, ""

    diff = "".join(
        difflib.unified_diff(
            original.splitlines(keepends=True),
            cleaned.splitlines(keepends=True),
            fromfile=f"a/{path}",
            tofile=f"b/{path}",
            n=3,
        )
    )

    if not dry_run:
        with open(path, "w", encoding="utf-8") as f:
            f.write(cleaned)

    return True, diff
|
||||
|
||||
|
||||
def main():
    """Command-line entry point for the sanitizer.

    Behaviour depends on the arguments:

    * no path and stdin is not a terminal: sanitize stdin to stdout;
    * no path and stdin is a terminal: print the help text;
    * ``--batch`` or a directory path: walk the tree, process every ``.md``
      file and print a summary line;
    * otherwise: process the single file and report whether it changed.

    With ``--dry-run`` the diffs are printed and no file is modified.
    """
    cli = argparse.ArgumentParser(description="Sanitize pandoc GT markdown files")
    cli.add_argument("path", nargs="?", help="File or directory to process")
    cli.add_argument("--dry-run", action="store_true", help="Show diff without modifying files")
    cli.add_argument("--batch", action="store_true", help="Process all .md files in directory recursively")
    opts = cli.parse_args()

    target = opts.path
    if target is None:
        if sys.stdin.isatty():
            cli.print_help()
        else:
            # Pipe mode (no path, stdin)
            sys.stdout.write(sanitize(sys.stdin.read()))
        return

    # Batch mode
    if opts.batch or os.path.isdir(target):
        seen = 0
        modified = 0
        for folder, _subdirs, names in os.walk(target):
            for md_name in (name for name in sorted(names) if name.endswith(".md")):
                md_path = os.path.join(folder, md_name)
                seen += 1
                changed, diff = process_file(md_path, dry_run=opts.dry_run)
                if not changed:
                    continue
                modified += 1
                print(diff if opts.dry_run else f"  cleaned: {md_path}")

        action = "would change" if opts.dry_run else "cleaned"
        print(f"\n{action} {modified}/{seen} files")
        return

    # Single file mode
    changed, diff = process_file(target, dry_run=opts.dry_run)
    if not changed:
        print(f"no changes: {target}")
    elif opts.dry_run:
        print(diff)
    else:
        print(f"cleaned: {target}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
230
tools/benchmark-harness/scripts/unstructured_extract.py
Executable file
230
tools/benchmark-harness/scripts/unstructured_extract.py
Executable file
@@ -0,0 +1,230 @@
|
||||
"""Unstructured extraction wrapper for benchmark harness."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import multiprocessing as _mp
|
||||
import os
|
||||
import platform
|
||||
import resource
|
||||
import sys
|
||||
import time
|
||||
|
||||
from unstructured.partition.auto import partition
|
||||
|
||||
|
||||
def _get_peak_memory_bytes() -> int:
    """Return this process's peak resident set size in bytes.

    The value comes from ``ru_maxrss`` of ``resource.getrusage``.  On Linux
    it is multiplied by 1024 (Linux reports kilobytes); on every other
    platform it is returned unchanged.
    """
    peak = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
    scale = 1024 if platform.system() == "Linux" else 1
    return peak * scale
|
||||
|
||||
|
||||
def _render_markdown(elements: list) -> str:
    """Render Unstructured elements as GFM-ish markdown.

    Elements are dispatched on their class name:

    * ``Title`` -> ``# text``; ``Header`` -> ``## text``;
    * ``ListItem`` -> ``- text``;
    * ``CodeSnippet`` / ``Code`` -> fenced code block;
    * ``Image`` / ``Figure`` -> ``![text or class name]()``;
    * ``Table`` -> pipe table built from ``metadata.text_as_html`` when that
      yields at least one row with cells, otherwise the element's text;
    * anything else -> the text as a paragraph.

    Elements with empty text are skipped, except images and figures.

    Table cells have their inner tags removed, HTML entities decoded, line
    breaks folded into single spaces and ``|`` escaped so that every rendered
    row stays a valid single-line GFM table row.  The header separator is
    emitted after the first row that actually has cells.

    Args:
        elements: Elements as returned by the partition call.

    Returns:
        The rendered blocks joined by blank lines.
    """
    import re
    from html import unescape

    def _table_to_markdown(table_html: str) -> str:
        """Convert an HTML table to a GFM pipe table; "" when no cells are found."""
        lines = []
        for row_html in re.findall(r"<tr\b[^>]*>(.*?)</tr>", table_html, flags=re.DOTALL | re.IGNORECASE):
            raw_cells = re.findall(r"<t[dh]\b[^>]*>(.*?)</t[dh]>", row_html, flags=re.DOTALL | re.IGNORECASE)
            cells = []
            for raw in raw_cells:
                plain = unescape(re.sub(r"<[^>]+>", "", raw)).strip()
                # A literal newline or pipe would break the table row.
                plain = re.sub(r"\s*\n\s*", " ", plain)
                cells.append(plain.replace("|", "\\|"))
            if not cells:
                continue
            lines.append("| " + " | ".join(cells) + " |")
            if len(lines) == 1:
                # Separator goes right after the first rendered row.
                lines.append("| " + " | ".join("---" for _ in cells) + " |")
        return "\n".join(lines)

    parts = []
    for el in elements:
        cls = type(el).__name__
        text = (el.text or "").strip() if hasattr(el, "text") else str(el).strip()
        if not text and cls not in ("Image", "Figure"):
            continue
        if cls == "Title":
            parts.append(f"# {text}")
        elif cls == "Header":
            parts.append(f"## {text}")
        elif cls == "ListItem":
            parts.append(f"- {text}")
        elif cls in ("CodeSnippet", "Code"):
            parts.append(f"```\n{text}\n```")
        elif cls in ("Image", "Figure"):
            parts.append(f"![{text or cls}]()")
        elif cls == "Table":
            md = getattr(el, "metadata", None)
            table_html = (getattr(md, "text_as_html", "") or "") if md is not None else ""
            # Fall back to the plain text when no table could be rendered.
            parts.append(_table_to_markdown(table_html) or text)
        else:
            parts.append(text)
    return "\n\n".join(parts)
|
||||
|
||||
|
||||
def extract_sync(file_path: str, ocr_enabled: bool, output_format: str = "markdown") -> dict:
    """Extract one file with the Unstructured ``partition`` API.

    Args:
        file_path: Path of the document to extract.
        ocr_enabled: Selects the ``hi_res`` strategy when true, ``fast``
            otherwise.
        output_format: ``"markdown"`` renders the elements through
            ``_render_markdown``; any other value joins ``str(element)`` with
            blank lines.

    Returns:
        Dict with ``content``, ``metadata`` (framework, strategy and output
        format), ``_extraction_time_ms`` (time spent in ``partition`` only,
        rendering is not included) and ``_peak_memory_bytes``.
    """
    if ocr_enabled:
        strategy = "hi_res"
    else:
        strategy = "fast"

    started = time.perf_counter()
    elements = partition(filename=file_path, strategy=strategy, languages=["eng"])
    elapsed_ms = (time.perf_counter() - started) * 1000.0

    if output_format != "markdown":
        rendered = "\n\n".join(str(element) for element in elements)
    else:
        rendered = _render_markdown(elements)

    metadata = {"framework": "unstructured", "strategy": strategy, "output_format": output_format}
    return {
        "content": rendered,
        "metadata": metadata,
        "_extraction_time_ms": elapsed_ms,
        "_peak_memory_bytes": _get_peak_memory_bytes(),
    }
|
||||
|
||||
|
||||
def _worker(fn, args, conn):
    """Child-process entry point: run ``fn(*args)`` and send the outcome over ``conn``.

    Before running, ``sys.stdin`` is closed and ``sys.stdout`` is rebound to
    the null device so that prints from extraction code cannot end up in the
    parent's line-based JSON protocol.  Failures of that step are ignored.

    Args:
        fn: Callable to execute.
        args: Positional arguments for ``fn``.
        conn: Connection object; receives either the return value of ``fn``
            or, if running or sending raised, an error dict of the form
            ``{"error": <message>, "_extraction_time_ms": 0}``.  It is always
            closed before returning.
    """
    try:
        sys.stdin.close()
        sys.stdout = open(os.devnull, "w")
    except Exception:
        # Best effort only: the extraction should still run if this fails.
        pass

    try:
        try:
            outcome = fn(*args)
            conn.send(outcome)
        except Exception as exc:
            conn.send({"error": str(exc), "_extraction_time_ms": 0})
    finally:
        conn.close()
|
||||
|
||||
|
||||
def _run_with_timeout(fn, args, timeout):
    """Execute ``fn(*args)`` in a forked child process with a timeout.

    On timeout the child is killed but the parent stays alive, so no
    expensive process restart is needed.

    The in-process fallback is used only when the child cannot be created or
    started (for example when the ``fork`` start method is unavailable).  Once
    the child is running, any failure while supervising it is reported as an
    error result; the extraction is never re-run in-process, because that
    second run would have no timeout protection.

    Args:
        fn: Callable to execute.
        args: Tuple of positional arguments for ``fn``.
        timeout: Maximum number of seconds to wait for a result.

    Returns:
        Whatever the child sent back (the return value of ``fn`` or an error
        dict), or a dict with ``error`` and ``_extraction_time_ms`` keys
        describing a timeout, a crashed worker or a supervision failure.
    """
    parent_conn = child_conn = None
    try:
        ctx = _mp.get_context("fork")
        parent_conn, child_conn = ctx.Pipe(duplex=False)
        p = ctx.Process(target=_worker, args=(fn, args, child_conn))
        p.start()
    except Exception:
        # Fork not available — release the pipe (if any) and fall back to
        # in-process extraction.
        for end in (parent_conn, child_conn):
            if end is not None:
                end.close()
        try:
            return fn(*args)
        except Exception as e:
            return {"error": str(e), "_extraction_time_ms": 0}

    try:
        # The parent keeps only the read end; closing the write end here lets
        # recv() see EOF if the child dies without sending anything.
        child_conn.close()

        if parent_conn.poll(timeout=timeout):
            try:
                result = parent_conn.recv()
            except Exception:
                result = {"error": "worker process crashed", "_extraction_time_ms": 0}
        else:
            p.kill()
            result = {
                "error": f"extraction timed out after {timeout}s",
                "_extraction_time_ms": timeout * 1000.0,
            }
    except Exception as e:
        # Supervision itself failed; stop the child instead of re-running fn.
        p.kill()
        result = {"error": f"worker supervision failed: {e}", "_extraction_time_ms": 0}
    finally:
        # Reap the child; escalate to kill if it does not exit promptly.
        p.join(timeout=5)
        if p.is_alive():
            p.kill()
            p.join()
        parent_conn.close()
    return result
|
||||
|
||||
|
||||
def _parse_path(line: str) -> str:
    """Extract the file path from one request line.

    A line whose stripped form starts with ``{`` is tried as JSON and the
    value of its ``path`` field is returned (``""`` when the field is
    missing).  Any other line, or one that fails to parse as JSON, is
    returned stripped of surrounding whitespace.
    """
    candidate = line.strip()
    if not candidate.startswith("{"):
        return candidate
    try:
        request = json.loads(candidate)
    except (json.JSONDecodeError, ValueError):
        # Not valid JSON after all: treat the text as a plain path.
        return candidate
    return request.get("path", "")
|
||||
|
||||
|
||||
def run_server(ocr_enabled: bool, output_format: str, timeout=None) -> None:
    """Serve extraction requests from stdin until it is exhausted.

    Prints ``READY`` once, then handles one request per input line: the path
    is taken from the line via ``_parse_path`` and the extraction result is
    written to stdout as a single JSON line.  Lines that yield an empty path
    are skipped without any output.

    Args:
        ocr_enabled: Forwarded to ``extract_sync``.
        output_format: Forwarded to ``extract_sync``.
        timeout: When not ``None``, each extraction goes through
            ``_run_with_timeout`` with this limit; otherwise it runs
            in-process and an exception becomes an error dict.
    """
    print("READY", flush=True)
    for request_line in sys.stdin:
        target = _parse_path(request_line)
        if not target:
            continue

        call_args = (target, ocr_enabled, output_format)
        if timeout is None:
            try:
                outcome = extract_sync(*call_args)
            except Exception as exc:
                outcome = {"error": str(exc), "_extraction_time_ms": 0}
        else:
            outcome = _run_with_timeout(extract_sync, call_args, timeout)

        print(json.dumps(outcome), flush=True)
|
||||
|
||||
|
||||
def main() -> None:
    """Command-line entry point for the Unstructured extraction wrapper.

    Flags are read from ``sys.argv``: ``--ocr`` / ``--no-ocr`` toggle OCR
    (the last one given wins), ``--timeout=SECS`` sets the timeout passed to
    server mode, and ``--format=`` must be ``markdown`` or ``plaintext``
    (anything else exits with status 64).  The first remaining argument
    selects the mode:

    * ``server`` runs ``run_server``;
    * ``sync`` extracts the file named by the next argument;
    * anything else is treated as a file path (legacy invocation).

    The extraction result is printed as JSON without a trailing newline.
    Missing arguments or an exception during extraction print a message to
    stderr and exit with status 1.
    """
    ocr_enabled = False
    timeout = None
    output_format = "markdown"
    positional = []
    for token in sys.argv[1:]:
        if token == "--ocr":
            ocr_enabled = True
        elif token == "--no-ocr":
            ocr_enabled = False
        elif token.startswith("--timeout="):
            timeout = int(token[len("--timeout="):])
        elif token.startswith("--format="):
            output_format = token[len("--format="):]
        else:
            positional.append(token)

    if output_format not in ("markdown", "plaintext"):
        print(f"Error: --format must be 'markdown' or 'plaintext'; got '{output_format}'", file=sys.stderr)
        sys.exit(64)

    if not positional:
        print(
            "Usage: unstructured_extract.py [--ocr|--no-ocr] [--timeout=SECS] [--format=markdown|plaintext] <mode> <file_path>",
            file=sys.stderr,
        )
        print("Modes: sync, server", file=sys.stderr)
        sys.exit(1)

    mode = positional[0]

    if mode == "server":
        run_server(ocr_enabled, output_format, timeout=timeout)
        return

    if mode == "sync":
        if len(positional) < 2:
            print("Error: sync mode requires a file path", file=sys.stderr)
            sys.exit(1)
        target = positional[1]
    else:
        # Legacy mode: first arg is the file path directly
        target = mode

    try:
        payload = extract_sync(target, ocr_enabled, output_format)
        print(json.dumps(payload), end="")
    except Exception as exc:
        print(f"Error extracting with Unstructured: {exc}", file=sys.stderr)
        sys.exit(1)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
Reference in New Issue
Block a user