Nomad changes

2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions
--- a/tools/benchmark-harness/scripts/TikaExtract.java
+++ b/tools/benchmark-harness/scripts/TikaExtract.java
@@ -0,0 +1,394 @@
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.ocr.TesseractOCRConfig;
+import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.metadata.Metadata;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.nio.file.Path;
+import java.util.ArrayList;
+import java.util.List;
+
+public final class TikaExtract {
+    private static final double NANOS_IN_MILLISECOND = 1_000_000.0;
+    /** Length of the JSON key {@code "path"} including surrounding quotes. */
+    private static final int PATH_KEY_LENGTH = 6;
+    private static final char LAST_CONTROL_CHAR = 0x1F;
+
+    /** Private constructor: this class only exposes static methods and is never instantiated here. */
+    private TikaExtract() {
+    }
+
+    /**
+     * Command-line entry point.
+     *
+     * <p>Arguments: {@code [--ocr|--no-ocr] <mode> <file1> [file2] ...}, where mode is one of
+     * {@code sync}, {@code batch} or {@code server}. The last OCR flag on the command line wins and
+     * OCR is off when neither flag is given. Setting the environment variable
+     * {@code TIKA_BENCHMARK_DEBUG} to {@code true} (case-insensitive) enables diagnostics on stderr.
+     *
+     * <p>Exits with status 1 on usage errors or when the selected mode throws.
+     *
+     * @param args raw command-line arguments
+     */
+    public static void main(String[] args) {
+        List<String> positionalArgs = new ArrayList<>();
+        boolean ocrEnabled = false;
+
+        for (String arg : args) {
+            boolean enableFlag = "--ocr".equals(arg);
+            boolean disableFlag = "--no-ocr".equals(arg);
+            if (enableFlag || disableFlag) {
+                ocrEnabled = enableFlag;
+            } else {
+                positionalArgs.add(arg);
+            }
+        }
+
+        if (positionalArgs.isEmpty()) {
+            System.err.println("Usage: TikaExtract [--ocr|--no-ocr] <mode> <file1> [file2] ...");
+            System.err.println("Modes: sync, batch, server");
+            System.exit(1);
+        }
+
+        final String mode = positionalArgs.get(0);
+        final boolean syncMode = "sync".equals(mode);
+        final boolean batchMode = "batch".equals(mode);
+        if (!(syncMode || batchMode || "server".equals(mode))) {
+            System.err.printf("Unsupported mode '%s'%n", mode);
+            System.exit(1);
+        }
+
+        // Debug logging is opt-in via the TIKA_BENCHMARK_DEBUG environment variable.
+        final boolean debug = "true".equalsIgnoreCase(System.getenv("TIKA_BENCHMARK_DEBUG"));
+        if (debug) {
+            debugLog("java.version", System.getProperty("java.version"));
+            debugLog("os.name", System.getProperty("os.name"));
+            debugLog("os.arch", System.getProperty("os.arch"));
+            debugLog("Mode", mode);
+            debugLog("OCR enabled", String.valueOf(ocrEnabled));
+            debugLog("Files to process", String.valueOf(positionalArgs.size() - 1));
+        }
+
+        try {
+            if (syncMode) {
+                if (positionalArgs.size() < 2) {
+                    System.err.println("Sync mode requires exactly one file");
+                    System.exit(1);
+                }
+                // Only the first file argument is used; any further ones are ignored.
+                processSyncMode(positionalArgs.get(1), ocrEnabled, debug);
+            } else if (batchMode) {
+                processBatchMode(positionalArgs, ocrEnabled, debug);
+            } else {
+                processServerMode(ocrEnabled, debug);
+            }
+        } catch (Exception e) {
+            if (debug) {
+                debugLog("Processing failed with exception", e.getClass().getName());
+            }
+            // The stack trace is printed in both debug and non-debug runs.
+            e.printStackTrace(System.err);
+            System.exit(1);
+        }
+    }
+
+    /**
+     * Extracts one file and prints the result to stdout as a single JSON object
+     * (written with {@code print}, so no trailing newline).
+     *
+     * @param filePath   path of the document to extract
+     * @param ocrEnabled whether OCR should be configured for the extraction
+     * @param debug      whether to emit diagnostics on stderr
+     * @throws Exception any failure from the extraction, rethrown after optional debug logging
+     */
+    private static void processSyncMode(String filePath, boolean ocrEnabled, boolean debug) throws Exception {
+        if (debug) {
+            debugLog("Input file", filePath);
+        }
+
+        Path inputPath = Path.of(filePath);
+        ExtractionData extracted;
+        long startedAt = System.nanoTime();
+
+        try {
+            if (debug) {
+                debugLog("Starting extraction", "");
+            }
+            extracted = extractFile(inputPath.toFile(), ocrEnabled, debug);
+            if (debug) {
+                debugLog("Extraction completed", "");
+            }
+        } catch (Exception failure) {
+            if (debug) {
+                debugLog("Extraction failed", failure.getClass().getName());
+                failure.printStackTrace(System.err);
+            }
+            throw failure;
+        }
+
+        // The timer covers parser construction and parsing, but not JSON serialization.
+        double elapsedMs = (System.nanoTime() - startedAt) / NANOS_IN_MILLISECOND;
+        System.out.print(toJson(extracted, elapsedMs, ocrEnabled));
+    }
+
+    private static void processBatchMode(
+            List<String> positionalArgs, boolean ocrEnabled, boolean debug) throws Exception {
+        List<String> filePaths = new ArrayList<>();
+        for (int i = 1; i < positionalArgs.size(); i++) {
+            filePaths.add(positionalArgs.get(i));
+        }
+
+        long batchStart = System.nanoTime();
+        StringBuilder jsonArray = new StringBuilder();
+        jsonArray.append('[');
+
+        boolean first = true;
+        for (String filePath : filePaths) {
+            if (debug) {
+                debugLog("Processing file", filePath);
+            }
+
+            try {
+                Path path = Path.of(filePath);
+                long start = System.nanoTime();
+                ExtractionData data = extractFile(path.toFile(), ocrEnabled, debug);
+                double elapsedMs = (System.nanoTime() - start) / NANOS_IN_MILLISECOND;
+
+                if (!first) {
+                    jsonArray.append(',');
+                }
+                first = false;
+
+                double batchTotalMs = (System.nanoTime() - batchStart) / NANOS_IN_MILLISECOND;
+                jsonArray.append(toJsonWithBatch(data, elapsedMs, batchTotalMs, ocrEnabled));
+
+                if (debug) {
+                    debugLog("File processed", filePath);
+                }
+            } catch (Exception e) {
+                if (debug) {
+                    debugLog("Failed to process file", filePath);
+                    debugLog("Exception", e.getClass().getName());
+                    e.printStackTrace(System.err);
+                } else {
+                    System.err.printf("Error processing %s: %s%n", filePath, e.getMessage());
+                }
+            }
+        }
+
+        double totalBatchMs = (System.nanoTime() - batchStart) / NANOS_IN_MILLISECOND;
+        jsonArray.append(']');
+
+        if (first) {
+            System.err.println("No files were successfully processed");
+            System.exit(1);
+            return;
+        }
+
+        System.out.print(jsonArray.toString());
+    }
+
+    /**
+     * Persistent mode: prints {@code READY}, then reads one request per line from stdin and
+     * answers each with one JSON line on stdout until stdin reaches end-of-file.
+     *
+     * <p>A request line is either a plain file path or a JSON object containing a {@code "path"}
+     * field. Blank lines are skipped. A failed extraction produces an
+     * {@code {"error": ...}} line instead of terminating the loop.
+     *
+     * @param ocrEnabled whether OCR should be configured for the extractions
+     * @param debug      forwarded to the extraction helper
+     * @throws Exception if reading stdin fails or the shared parser cannot be created
+     */
+    private static void processServerMode(boolean ocrEnabled, boolean debug) throws Exception {
+        // Pre-create shared parser and OCR config to avoid per-file construction overhead.
+        // AutoDetectParser is thread-safe and reusable. Only BodyContentHandler and Metadata
+        // need to be recreated per extraction since they accumulate state.
+        AutoDetectParser sharedParser = new AutoDetectParser();
+        TesseractOCRConfig sharedOcrConfig = new TesseractOCRConfig();
+        if (!ocrEnabled) {
+            sharedOcrConfig.setSkipOcr(true);
+        } else {
+            sharedOcrConfig.setLanguage("eng");
+        }
+
+        // Signal readiness after JVM + Tika parser initialization
+        System.out.println("READY");
+        System.out.flush();
+
+        // Decode requests as UTF-8 explicitly: before Java 18 the no-charset constructor uses the
+        // platform default, which mangles non-ASCII file paths on non-UTF-8 locales.
+        BufferedReader reader = new BufferedReader(
+                new InputStreamReader(System.in, java.nio.charset.StandardCharsets.UTF_8));
+        String line;
+        while ((line = reader.readLine()) != null) {
+            String filePath = line.trim();
+            if (filePath.isEmpty()) {
+                continue;
+            }
+            // Parse JSON request if the harness sends {"path":"...", "force_ocr": ...}
+            if (filePath.startsWith("{")) {
+                filePath = parseJsonPath(filePath);
+            }
+            try {
+                Path path = Path.of(filePath);
+                long start = System.nanoTime();
+                ExtractionData data = extractFileWithParser(path.toFile(), sharedParser, sharedOcrConfig, debug);
+                double elapsedMs = (System.nanoTime() - start) / NANOS_IN_MILLISECOND;
+                String json = toJson(data, elapsedMs, ocrEnabled);
+                System.out.println(json);
+                System.out.flush();
+            } catch (Exception e) {
+                // quote(null) renders JSON null, which would make the "error" field look absent;
+                // fall back to the exception class name so every failure is reported as a string.
+                String message = e.getMessage() != null ? e.getMessage() : e.getClass().getName();
+                String errorJson = String.format(
+                        "{\"error\":%s,\"_extraction_time_ms\":0,\"_ocr_used\":false}",
+                        quote(message));
+                System.out.println(errorJson);
+                System.out.flush();
+            }
+        }
+    }
+
+    /**
+     * Parses {@code file} with a caller-supplied parser and OCR configuration.
+     *
+     * @param file      document to parse; must exist
+     * @param parser    parser instance to use (may be shared between calls)
+     * @param ocrConfig OCR configuration registered in the parse context
+     * @param debug     currently unused by this method
+     * @return the extracted text and the content type reported in the metadata, or
+     *         {@code application/octet-stream} when none was reported
+     * @throws IllegalArgumentException if the file does not exist
+     * @throws Exception                any failure raised while opening or parsing the file
+     */
+    private static ExtractionData extractFileWithParser(
+            File file, AutoDetectParser parser, TesseractOCRConfig ocrConfig, boolean debug) throws Exception {
+        if (!file.exists()) {
+            throw new IllegalArgumentException("File does not exist: " + file.getAbsolutePath());
+        }
+
+        ParseContext parseContext = new ParseContext();
+        parseContext.set(TesseractOCRConfig.class, ocrConfig);
+        Metadata docMetadata = new Metadata();
+        // NOTE(review): -1 is understood to disable Tika's write limit - confirm against the Tika docs.
+        BodyContentHandler textHandler = new BodyContentHandler(-1);
+
+        try (InputStream in = new FileInputStream(file)) {
+            parser.parse(in, textHandler, docMetadata, parseContext);
+        }
+
+        String detectedType = docMetadata.get(Metadata.CONTENT_TYPE);
+        String mimeType = detectedType != null ? detectedType : "application/octet-stream";
+        return new ExtractionData(textHandler.toString(), mimeType);
+    }
+
+    /**
+     * Parses {@code file} with a freshly constructed parser and OCR configuration.
+     *
+     * <p>The parsing itself is delegated to {@link #extractFileWithParser}, so the one-shot and
+     * server code paths cannot drift apart.
+     *
+     * @param file       document to parse; must exist
+     * @param ocrEnabled when true the OCR language is set to {@code eng}; otherwise OCR is skipped
+     * @param debug      forwarded to {@link #extractFileWithParser}
+     * @return the extracted text and detected content type
+     * @throws IllegalArgumentException if the file does not exist
+     * @throws Exception                any failure raised while opening or parsing the file
+     */
+    private static ExtractionData extractFile(File file, boolean ocrEnabled, boolean debug) throws Exception {
+        // Check up front so a missing file fails before the parser is constructed.
+        if (!file.exists()) {
+            throw new IllegalArgumentException("File does not exist: " + file.getAbsolutePath());
+        }
+
+        TesseractOCRConfig ocrConfig = new TesseractOCRConfig();
+        if (ocrEnabled) {
+            ocrConfig.setLanguage("eng");
+        } else {
+            ocrConfig.setSkipOcr(true);
+        }
+
+        return extractFileWithParser(file, new AutoDetectParser(), ocrConfig, debug);
+    }
+
+    /**
+     * Reports whether OCR is considered to have been used for a result.
+     *
+     * <p>This is a heuristic based on the inputs only: it returns true exactly when OCR was
+     * enabled and the MIME type is non-null and starts with {@code image/}.
+     *
+     * @param mimeType   detected MIME type, may be null
+     * @param ocrEnabled whether OCR was enabled for the run
+     * @return true when OCR is enabled and the MIME type is an image type
+     */
+    private static boolean determineOcrUsed(String mimeType, boolean ocrEnabled) {
+        return ocrEnabled && mimeType != null && mimeType.startsWith("image/");
+    }
+
+    /**
+     * Serializes one extraction result as a JSON object with the fields {@code content},
+     * {@code metadata.mimeType}, {@code _extraction_time_ms} and {@code _ocr_used}.
+     *
+     * @param data       extracted content and MIME type
+     * @param elapsedMs  extraction time in milliseconds, written with three decimals
+     * @param ocrEnabled whether OCR was enabled for the run
+     * @return the JSON text
+     */
+    private static String toJson(ExtractionData data, double elapsedMs, boolean ocrEnabled) {
+        StringBuilder builder = new StringBuilder();
+        builder.append('{');
+        builder.append("\"content\":").append(quote(data.getContent())).append(',');
+        builder.append("\"metadata\":{");
+        builder.append("\"mimeType\":").append(quote(data.getMimeType()));
+        // Locale.ROOT pins the decimal separator to '.'; the default locale may use ',' and
+        // would then produce invalid JSON numbers.
+        builder.append("},\"_extraction_time_ms\":")
+                .append(String.format(java.util.Locale.ROOT, "%.3f", elapsedMs));
+        builder.append(",\"_ocr_used\":").append(determineOcrUsed(data.getMimeType(), ocrEnabled));
+        builder.append('}');
+        return builder.toString();
+    }
+
+    /**
+     * Serializes one batch-mode extraction result as a JSON object. Same fields as the
+     * single-file form plus {@code _batch_total_ms}.
+     *
+     * @param data         extracted content and MIME type
+     * @param elapsedMs    extraction time of this file in milliseconds, written with three decimals
+     * @param batchTotalMs batch time in milliseconds supplied by the caller, written with three decimals
+     * @param ocrEnabled   whether OCR was enabled for the run
+     * @return the JSON text
+     */
+    private static String toJsonWithBatch(
+            ExtractionData data, double elapsedMs, double batchTotalMs, boolean ocrEnabled) {
+        // Locale.ROOT pins the decimal separator to '.'; the default locale may use ',' and
+        // would then produce invalid JSON numbers.
+        java.util.Locale jsonLocale = java.util.Locale.ROOT;
+        StringBuilder builder = new StringBuilder();
+        builder.append('{');
+        builder.append("\"content\":").append(quote(data.getContent())).append(',');
+        builder.append("\"metadata\":{");
+        builder.append("\"mimeType\":").append(quote(data.getMimeType()));
+        builder.append("},\"_extraction_time_ms\":").append(String.format(jsonLocale, "%.3f", elapsedMs));
+        builder.append(",\"_batch_total_ms\":").append(String.format(jsonLocale, "%.3f", batchTotalMs));
+        builder.append(",\"_ocr_used\":").append(determineOcrUsed(data.getMimeType(), ocrEnabled));
+        builder.append('}');
+        return builder.toString();
+    }
+
+    /**
+     * Parse a JSON request line to extract the "path" field.
+     * Minimal JSON parsing to avoid adding a dependency.
+     *
+     * <p>The string value is decoded, so JSON escapes such as {@code \\}, {@code \"},
+     * {@code \/}, {@code \n} and {@code \}{@code uXXXX} yield the characters they stand for
+     * (for example a Windows path sent as {@code C:\\dir\\f.pdf} comes back with single
+     * backslashes). The first occurrence of {@code "path"} in the line is taken to be the key.
+     *
+     * @param json one request line
+     * @return the decoded path, or {@code json} unchanged when no well-formed
+     *         {@code "path"} string value can be found
+     */
+    private static String parseJsonPath(String json) {
+        int idx = json.indexOf("\"path\"");
+        if (idx < 0) {
+            return json;
+        }
+        // Skip past "path" key, colon, optional whitespace, and opening quote
+        idx = json.indexOf(':', idx + PATH_KEY_LENGTH);
+        if (idx < 0) {
+            return json;
+        }
+        idx = json.indexOf('"', idx + 1);
+        if (idx < 0) {
+            return json;
+        }
+
+        StringBuilder path = new StringBuilder();
+        int pos = idx + 1;
+        while (pos < json.length()) {
+            char c = json.charAt(pos++);
+            if (c == '"') {
+                // Unescaped quote: end of the string value.
+                return path.toString();
+            }
+            if (c != '\\') {
+                path.append(c);
+                continue;
+            }
+            if (pos >= json.length()) {
+                break;
+            }
+            char escape = json.charAt(pos++);
+            switch (escape) {
+                case 'n': path.append('\n'); break;
+                case 'r': path.append('\r'); break;
+                case 't': path.append('\t'); break;
+                case 'b': path.append('\b'); break;
+                case 'f': path.append('\f'); break;
+                case 'u':
+                    if (pos + 4 > json.length()) {
+                        return json;
+                    }
+                    try {
+                        path.append((char) Integer.parseInt(json.substring(pos, pos + 4), 16));
+                    } catch (NumberFormatException e) {
+                        // Malformed escape: treat the whole line as unparseable.
+                        return json;
+                    }
+                    pos += 4;
+                    break;
+                default:
+                    // Covers \" \\ and \/ which all decode to the escaped character itself.
+                    path.append(escape);
+            }
+        }
+        // Ran off the end without a closing quote.
+        return json;
+    }
+
+    // CPD-OFF: quote() is intentionally duplicated in standalone benchmark scripts (no shared classpath)
+    private static String quote(String value) {
+        if (value == null) {
+            return "null";
+        }
+        StringBuilder sb = new StringBuilder(value.length() + 2);
+        sb.append('"');
+        for (int i = 0; i < value.length(); i++) {
+            char c = value.charAt(i);
+            switch (c) {
+                case '\\': sb.append("\\\\"); break;
+                case '"':  sb.append("\\\""); break;
+                case '\n': sb.append("\\n");  break;
+                case '\r': sb.append("\\r");  break;
+                case '\t': sb.append("\\t");  break;
+                case '\b': sb.append("\\b");  break;
+                case '\f': sb.append("\\f");  break;
+                default:
+                    if (c <= LAST_CONTROL_CHAR) {
+                        sb.append(String.format("\\u%04x", (int) c));
+                    } else {
+                        sb.append(c);
+                    }
+            }
+        }
+        sb.append('"');
+        return sb.toString();
+    }
+    // CPD-ON
+
+    /**
+     * Writes one {@code [BENCHMARK_DEBUG] key = value} line to stderr, with the key
+     * left-aligned in a 30-character column.
+     *
+     * @param key   label for the entry
+     * @param value value to print; null is shown as {@code (null)}
+     */
+    private static void debugLog(String key, String value) {
+        String shown = (value == null) ? "(null)" : value;
+        System.err.printf("[BENCHMARK_DEBUG] %-30s = %s%n", key, shown);
+    }
+
+    /** Immutable pair of extracted text and the MIME type reported for the document. */
+    private static class ExtractionData {
+        /** Text produced by the content handler. */
+        private final String content;
+        /** Content type of the source document. */
+        private final String mimeType;
+
+        /**
+         * @param extractedContent text produced by the content handler
+         * @param detectedMimeType content type of the source document
+         */
+        ExtractionData(String extractedContent, String detectedMimeType) {
+            content = extractedContent;
+            mimeType = detectedMimeType;
+        }
+
+        /** @return the extracted text */
+        String getContent() {
+            return this.content;
+        }
+
+        /** @return the MIME type */
+        String getMimeType() {
+            return this.mimeType;
+        }
+    }
+}
--- a/tools/benchmark-harness/scripts/docling_extract.py
+++ b/tools/benchmark-harness/scripts/docling_extract.py
@@ -0,0 +1,277 @@
+"""Docling extraction wrapper for benchmark harness.
+
+Supports three modes:
+- sync: convert() - synchronous single-file extraction
+- batch: convert_all() - batch extraction for multiple files
+- server: persistent mode reading paths from stdin
+"""
+
+from __future__ import annotations
+
+import json
+import multiprocessing as _mp
+import os
+import platform
+import resource
+import sys
+import time
+from typing import Any
+
+from docling.document_converter import DocumentConverter
+
+
+def _get_peak_memory_bytes() -> int:
+    """Get peak memory usage in bytes using resource module."""
+    usage = resource.getrusage(resource.RUSAGE_SELF)
+    if platform.system() == "Linux":
+        return usage.ru_maxrss * 1024
+    return usage.ru_maxrss
+
+
+def create_converter(ocr_enabled: bool) -> DocumentConverter:
+    """Create a DocumentConverter with appropriate settings."""
+    if not ocr_enabled:
+        try:
+            from docling.datamodel.pipeline_options import PipelineOptions
+
+            options = PipelineOptions(do_ocr=False)
+            return DocumentConverter(pipeline_options=options)
+        except (ImportError, TypeError):
+            # Fallback if PipelineOptions API not available
+            return DocumentConverter()
+    return DocumentConverter()
+
+
+def _render(document: Any, output_format: str) -> str:
+    if output_format == "plaintext":
+        return document.export_to_text()
+    return document.export_to_markdown()
+
+
+def extract_sync(file_path: str, converter: DocumentConverter, output_format: str = "markdown") -> dict[str, Any]:
+    """Extract using synchronous single-file API."""
+    start = time.perf_counter()
+    result = converter.convert(file_path)
+    content = _render(result.document, output_format)
+    duration_ms = (time.perf_counter() - start) * 1000.0
+
+    return {
+        "content": content,
+        "metadata": {"framework": "docling", "output_format": output_format},
+        "_extraction_time_ms": duration_ms,
+        "_peak_memory_bytes": _get_peak_memory_bytes(),
+    }
+
+
+def extract_batch(
+    file_paths: list[str], converter: DocumentConverter, output_format: str = "markdown"
+) -> list[dict[str, Any]]:
+    """Extract multiple files using batch API."""
+    start = time.perf_counter()
+    results = converter.convert_all(file_paths, raises_on_error=False)
+    total_duration_ms = (time.perf_counter() - start) * 1000.0
+
+    per_file_duration_ms = total_duration_ms / len(file_paths) if file_paths else 0
+
+    outputs = []
+    for result in results:
+        if result.status.name == "SUCCESS":
+            content = _render(result.document, output_format)
+            outputs.append(
+                {
+                    "content": content,
+                    "metadata": {"framework": "docling", "output_format": output_format},
+                    "_extraction_time_ms": per_file_duration_ms,
+                    "_batch_total_ms": total_duration_ms,
+                    "_peak_memory_bytes": _get_peak_memory_bytes(),
+                }
+            )
+        else:
+            outputs.append(
+                {
+                    "content": "",
+                    "metadata": {
+                        "framework": "docling",
+                        "error": str(result.errors) if result.errors else "Unknown error",
+                        "status": result.status.name,
+                    },
+                    "_extraction_time_ms": per_file_duration_ms,
+                    "_batch_total_ms": total_duration_ms,
+                    "_peak_memory_bytes": _get_peak_memory_bytes(),
+                }
+            )
+
+    return outputs
+
+
+def _worker(fn, args, conn):
+    """Run ``fn(*args)`` in a child process and send the outcome over ``conn``.
+
+    Closes the inherited stdin and rebinds ``sys.stdout`` to the null device
+    so that Python-level prints in the child cannot corrupt the parent's
+    line-based JSON protocol. On success the return value of ``fn`` is sent;
+    on any exception an error payload is sent instead. ``conn`` is always
+    closed before returning.
+    """
+    # Best-effort isolation: failing to detach stdin/stdout must not prevent the extraction.
+    try:
+        sys.stdin.close()
+        sys.stdout = open(os.devnull, "w")
+    except Exception:
+        pass
+    try:
+        result = fn(*args)
+        conn.send(result)
+    except Exception as e:
+        # Also reached when conn.send(result) itself raises.
+        conn.send({"error": str(e), "_extraction_time_ms": 0})
+    finally:
+        conn.close()
+
+
+def _run_with_timeout(fn, args, timeout):
+    """Execute fn(*args) in a forked child with a timeout.
+
+    On timeout the child is killed but the parent stays alive —
+    no expensive process restart is needed.
+
+    Returns the value sent back by the child, or an error payload when the
+    child times out, cannot be read from, or (in the in-process fallback)
+    ``fn`` raises.
+    """
+    try:
+        ctx = _mp.get_context("fork")
+        parent_conn, child_conn = ctx.Pipe(duplex=False)
+        p = ctx.Process(target=_worker, args=(fn, args, child_conn))
+        p.start()
+        # Drop the parent's copy of the write end so recv() sees EOF if the child dies.
+        child_conn.close()
+
+        if parent_conn.poll(timeout=timeout):
+            try:
+                result = parent_conn.recv()
+            except Exception:
+                result = {"error": "worker process crashed", "_extraction_time_ms": 0}
+        else:
+            p.kill()
+            result = {
+                "error": f"extraction timed out after {timeout}s",
+                "_extraction_time_ms": timeout * 1000.0,
+            }
+
+        # Reap the child; escalate to kill if it has not exited within 5 seconds.
+        p.join(timeout=5)
+        if p.is_alive():
+            p.kill()
+            p.join()
+        parent_conn.close()
+        return result
+    except Exception:
+        # Fork not available — fall back to in-process extraction
+        # NOTE(review): this handler wraps the whole block above, so any failure after
+        # p.start() also lands here and re-runs fn in-process without a timeout - confirm
+        # that this is intended.
+        try:
+            return fn(*args)
+        except Exception as e:
+            return {"error": str(e), "_extraction_time_ms": 0}
+
+
+def _parse_path(line: str) -> str:
+    """Parse a request line: JSON object with path field, or plain file path."""
+    stripped = line.strip()
+    if stripped.startswith("{"):
+        try:
+            return json.loads(stripped).get("path", "")
+        except (json.JSONDecodeError, ValueError):
+            pass
+    return stripped
+
+
+def run_server(converter: DocumentConverter, output_format: str, timeout=None) -> None:
+    """Persistent server mode: read paths from stdin, write JSON to stdout."""
+    print("READY", flush=True)
+    for line in sys.stdin:
+        file_path = _parse_path(line)
+        if not file_path:
+            continue
+        if timeout is not None:
+            result = _run_with_timeout(extract_sync, (file_path, converter, output_format), timeout)
+        else:
+            try:
+                result = extract_sync(file_path, converter, output_format)
+            except Exception as e:
+                result = {"error": str(e), "_extraction_time_ms": 0}
+        print(json.dumps(result), flush=True)
+
+
+def main() -> None:
+    ocr_enabled = False
+    timeout = None
+    output_format = "markdown"
+    args = []
+    for arg in sys.argv[1:]:
+        if arg == "--ocr":
+            ocr_enabled = True
+        elif arg == "--no-ocr":
+            ocr_enabled = False
+        elif arg.startswith("--timeout="):
+            timeout = int(arg.split("=", 1)[1])
+        elif arg.startswith("--format="):
+            output_format = arg.split("=", 1)[1]
+        elif arg == "--format":
+            # Next-arg style handled below by appending
+            args.append(arg)
+        else:
+            args.append(arg)
+
+    # Support `--format <value>` (space-separated)
+    cleaned: list[str] = []
+    i = 0
+    while i < len(args):
+        if args[i] == "--format" and i + 1 < len(args):
+            output_format = args[i + 1]
+            i += 2
+            continue
+        cleaned.append(args[i])
+        i += 1
+    args = cleaned
+
+    if output_format not in ("markdown", "plaintext"):
+        print(f"Error: --format must be 'markdown' or 'plaintext'; got '{output_format}'", file=sys.stderr)
+        sys.exit(64)
+
+    if len(args) < 1:
+        print(
+            "Usage: docling_extract.py [--ocr|--no-ocr] [--timeout=SECS] [--format markdown|plaintext] <mode> <file_path> [additional_files...]",
+            file=sys.stderr,
+        )
+        print("Modes: sync, batch, server", file=sys.stderr)
+        sys.exit(1)
+
+    mode = args[0]
+    file_paths = args[1:]
+
+    # Create converter once (expensive initialization)
+    converter = create_converter(ocr_enabled)
+
+    try:
+        if mode == "server":
+            run_server(converter, output_format, timeout=timeout)
+
+        elif mode == "sync":
+            if len(file_paths) != 1:
+                print("Error: sync mode requires exactly one file", file=sys.stderr)
+                sys.exit(1)
+            payload = extract_sync(file_paths[0], converter, output_format)
+            print(json.dumps(payload), end="")
+
+        elif mode == "batch":
+            if len(file_paths) < 1:
+                print("Error: batch mode requires at least one file", file=sys.stderr)
+                sys.exit(1)
+
+            if len(file_paths) == 1:
+                results = extract_batch(file_paths, converter, output_format)
+                print(json.dumps(results[0]), end="")
+            else:
+                results = extract_batch(file_paths, converter, output_format)
+                print(json.dumps(results), end="")
+
+        else:
+            print(f"Error: Unknown mode '{mode}'. Use sync, batch, or server", file=sys.stderr)
+            sys.exit(1)
+
+    except Exception as e:
+        print(f"Error extracting with Docling: {e}", file=sys.stderr)
+        sys.exit(1)
+
+
+if __name__ == "__main__":
+    main()
--- a/tools/benchmark-harness/scripts/download_omnidocbench.sh
+++ b/tools/benchmark-harness/scripts/download_omnidocbench.sh
@@ -0,0 +1,63 @@
+#!/usr/bin/env bash
+# Download the OmniDocBench dataset (opendatalab/OmniDocBench) from HuggingFace.
+#
+# Usage:
+#   ./download_omnidocbench.sh [TARGET_DIR]
+#
+# Default target: tools/benchmark-harness/datasets/omnidocbench
+#
+# Requirements: curl, plus either huggingface-cli or git with git-lfs.
+# No HuggingFace account or API key needed (public dataset).
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+DEFAULT_DIR="${SCRIPT_DIR}/../datasets/omnidocbench"
+TARGET_DIR="${1:-$DEFAULT_DIR}"
+
+HF_BASE="https://huggingface.co/datasets/opendatalab/OmniDocBench/resolve/main"
+
+# Remove the temporary clone and any partial annotation download on every exit path.
+TEMP_CLONE=""
+cleanup() {
+  if [ -n "$TEMP_CLONE" ]; then
+    rm -rf "$TEMP_CLONE"
+  fi
+  rm -f "$TARGET_DIR/OmniDocBench.json.part"
+}
+trap cleanup EXIT
+
+# Print the number of regular files directly inside directory $1.
+count_files() {
+  find "$1" -maxdepth 1 -type f 2>/dev/null | wc -l | tr -d ' '
+}
+
+mkdir -p "$TARGET_DIR"
+
+# Download the main annotation file (65 MB)
+if [ -s "$TARGET_DIR/OmniDocBench.json" ]; then
+  echo "OmniDocBench.json already exists, skipping"
+else
+  echo "Downloading OmniDocBench.json (65 MB)..."
+  # --fail turns HTTP errors into a non-zero exit instead of saving the error page;
+  # the .part file keeps an interrupted download from being mistaken for a complete one.
+  curl -fL -o "$TARGET_DIR/OmniDocBench.json.part" "$HF_BASE/OmniDocBench.json"
+  mv "$TARGET_DIR/OmniDocBench.json.part" "$TARGET_DIR/OmniDocBench.json"
+fi
+
+# Download images directory via HF CLI if available, otherwise use git-lfs clone
+if [ -d "$TARGET_DIR/images" ] && [ "$(count_files "$TARGET_DIR/images")" -gt 100 ]; then
+  echo "images/ directory already populated ($(count_files "$TARGET_DIR/images") files), skipping"
+elif command -v huggingface-cli &>/dev/null; then
+  echo "Downloading full dataset via huggingface-cli..."
+  huggingface-cli download opendatalab/OmniDocBench \
+    --repo-type dataset \
+    --local-dir "$TARGET_DIR" \
+    --include "images/*" "ori_pdfs/*" "OmniDocBench.json"
+elif command -v git-lfs &>/dev/null || git lfs version &>/dev/null; then
+  echo "Downloading via git-lfs clone..."
+  TEMP_CLONE="$(mktemp -d)"
+  git clone --depth 1 "https://huggingface.co/datasets/opendatalab/OmniDocBench" "$TEMP_CLONE"
+  # Use -C instead of cd so a relative TARGET_DIR still resolves against the caller's directory.
+  git -C "$TEMP_CLONE" lfs pull
+  for subdir in images ori_pdfs; do
+    if [ -d "$TEMP_CLONE/$subdir" ]; then
+      cp -r "$TEMP_CLONE/$subdir" "$TARGET_DIR/"
+    else
+      echo "WARNING: $subdir/ not found in the cloned dataset, skipping" >&2
+    fi
+  done
+else
+  {
+    echo "ERROR: Need either huggingface-cli or git-lfs to download images."
+    echo ""
+    echo "Install one of:"
+    echo "  pip install huggingface-hub   # then: huggingface-cli"
+    echo "  brew install git-lfs          # then: git lfs install"
+  } >&2
+  exit 1
+fi
+
+# Summary
+echo ""
+echo "OmniDocBench downloaded to: $TARGET_DIR"
+echo "  Annotations: $(wc -c <"$TARGET_DIR/OmniDocBench.json" | tr -d ' ') bytes"
+# Use if-blocks rather than "[ ... ] && echo": a false test on the last line would
+# otherwise become the script's exit status.
+if [ -d "$TARGET_DIR/images" ]; then
+  echo "  Images: $(count_files "$TARGET_DIR/images") files"
+fi
+if [ -d "$TARGET_DIR/ori_pdfs" ]; then
+  echo "  PDFs: $(count_files "$TARGET_DIR/ori_pdfs") files"
+fi
--- a/tools/benchmark-harness/scripts/generate_ground_truth.py
+++ b/tools/benchmark-harness/scripts/generate_ground_truth.py
@@ -0,0 +1,789 @@
+#!/usr/bin/env -S uv run --script
+# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+#     "beautifulsoup4>=4.12",
+#     "python-docx>=1.0",
+#     "python-pptx>=1.0",
+#     "openpyxl>=3.1",
+#     "nbformat>=5.9",
+#     "xlrd>=2.0",
+#     "extract-msg>=0.48",
+#     "lxml>=5.0",
+#     "odfpy>=1.4",
+# ]
+# ///
+"""Generate ground truth text files for benchmark fixtures.
+
+Walks all fixture JSONs, extracts text from source documents using independent
+tools (not benchmarked frameworks), writes ground truth .txt files, patches
+fixture JSONs with ground_truth field, and updates ground_truth_mapping.json.
+
+PDF Ground Truth Methodology (updated Feb 2026):
+    PDF ground truth was regenerated using AI visual extraction (Claude Haiku
+    reading each PDF page as an image) for scanned/complex PDFs, and pdftotext
+    for born-digital PDFs with reliable embedded text. The previous approach of
+    using pdftotext for all PDFs produced incorrect ground truth for scanned
+    documents since pdftotext cannot read image-based text.
+
+    The handle_pdftotext() function below is retained for regenerating GT from
+    born-digital PDFs. For scanned PDFs, GT files were manually curated via AI
+    extraction and should not be overwritten by running this script with --force.
+
+Usage:
+    uv run tools/benchmark-harness/scripts/generate_ground_truth.py [OPTIONS]
+
+Options:
+    --dry-run           Print planned actions without writing
+    --format-filter     Comma-separated file types to process (e.g., md,txt,pdf)
+    --force             Regenerate even if ground truth already exists
+    --skip-types        Comma-separated file types to skip
+"""
+
+from __future__ import annotations
+
+import argparse
+import email
+import json
+import os
+import subprocess
+import sys
+import xml.etree.ElementTree as ET
+from pathlib import Path
+
+# ---------------------------------------------------------------------------
+# File type → handler mapping
+# ---------------------------------------------------------------------------
+
+RAW_SOURCE_TYPES = frozenset(
+    {
+        "md",
+        "txt",
+        "rst",
+        "org",
+        "commonmark",
+        "djot",
+        "toml",
+        "yaml",
+        "json",
+        "tsv",
+        "bib",
+        "csv",
+        "svg",
+    }
+)
+
+PDFTOTEXT_TYPES = frozenset({"pdf"})
+PANDOC_TYPES = frozenset(
+    {
+        "tex",
+        "latex",
+        "typ",
+        "epub",
+        "fb2",
+        "docbook",
+        "odt",
+        "rtf",
+        "opml",
+    }
+)
+PYTHON_DOCX_TYPES = frozenset({"docx"})
+PYTHON_PPTX_TYPES = frozenset({"pptx", "pptm", "ppsx"})
+OPENPYXL_TYPES = frozenset({"xlsx", "xlsm"})
+ODS_TYPES = frozenset({"ods"})
+BEAUTIFULSOUP_TYPES = frozenset({"html"})
+PYTHON_EMAIL_TYPES = frozenset({"eml"})
+EXTRACT_MSG_TYPES = frozenset({"msg"})
+NBFORMAT_TYPES = frozenset({"ipynb"})
+XML_PARSE_TYPES = frozenset({"xml"})
+XLRD_TYPES = frozenset({"xls"})
+ANTIWORD_TYPES = frozenset({"doc"})
+LIBREOFFICE_TYPES = frozenset({"ppt"})
+DBF_TYPES = frozenset({"dbf"})
+HWP_TYPES = frozenset({"hwp"})
+
+# Archive and image types are excluded from ground truth generation
+EXCLUDED_TYPES = frozenset(
+    {
+        "7z",
+        "gz",
+        "tar",
+        "tgz",
+        "zip",
+        "lz4",
+        "gif",
+        "jpeg",
+        "jpg",
+        "jp2",
+        "png",
+        "tiff",
+        "webp",
+        "bmp",
+        "pbm",
+        "pgm",
+        "pnm",
+        "ppm",
+    }
+)
+
+ALL_HANDLED_TYPES = (
+    RAW_SOURCE_TYPES
+    | PDFTOTEXT_TYPES
+    | PANDOC_TYPES
+    | PYTHON_DOCX_TYPES
+    | PYTHON_PPTX_TYPES
+    | OPENPYXL_TYPES
+    | BEAUTIFULSOUP_TYPES
+    | PYTHON_EMAIL_TYPES
+    | EXTRACT_MSG_TYPES
+    | NBFORMAT_TYPES
+    | XML_PARSE_TYPES
+    | XLRD_TYPES
+    | ANTIWORD_TYPES
+    | LIBREOFFICE_TYPES
+    | ODS_TYPES
+    | DBF_TYPES
+    | HWP_TYPES
+)
+
+
+def get_source_type(file_type: str) -> str:
+    """Return the ground truth source type string for a given file type."""
+    if file_type in RAW_SOURCE_TYPES:
+        return "raw_source"
+    if file_type in PDFTOTEXT_TYPES:
+        return "pdftotext"
+    if file_type in PANDOC_TYPES:
+        return "pandoc"
+    if file_type in PYTHON_DOCX_TYPES:
+        return "python-docx"
+    if file_type in PYTHON_PPTX_TYPES:
+        return "python-pptx"
+    if file_type in OPENPYXL_TYPES:
+        return "openpyxl"
+    if file_type in BEAUTIFULSOUP_TYPES:
+        return "beautifulsoup"
+    if file_type in PYTHON_EMAIL_TYPES:
+        return "python_email"
+    if file_type in EXTRACT_MSG_TYPES:
+        return "extract_msg"
+    if file_type in NBFORMAT_TYPES:
+        return "nbformat"
+    if file_type in XML_PARSE_TYPES:
+        return "xml_parse"
+    if file_type in XLRD_TYPES:
+        return "xlrd"
+    if file_type in ANTIWORD_TYPES:
+        return "antiword"
+    if file_type in LIBREOFFICE_TYPES:
+        return "libreoffice"
+    if file_type in ODS_TYPES:
+        return "odfpy"
+    if file_type in DBF_TYPES:
+        return "manual"
+    if file_type in HWP_TYPES:
+        return "manual"
+    return "manual"
+
+
+# ---------------------------------------------------------------------------
+# Text extraction handlers
+# ---------------------------------------------------------------------------
+
+
+def handle_raw_source(doc_path: Path) -> str:
+    """Read the file as-is. For text-based formats, source content IS ground truth."""
+    try:
+        return doc_path.read_text(encoding="utf-8")
+    except UnicodeDecodeError:
+        return doc_path.read_text(encoding="latin-1")
+
+
+def handle_pdftotext(doc_path: Path) -> str:
+    """Extract text from PDF using pdftotext (poppler-utils).
+
+    Note: This works well for born-digital PDFs with embedded text layers.
+    For scanned PDFs, pdftotext produces garbage output. Scanned PDF ground
+    truth should be generated via AI visual extraction instead.
+    """
+    result = subprocess.run(
+        ["pdftotext", "-layout", str(doc_path), "-"],
+        capture_output=True,
+        text=True,
+        timeout=60,
+    )
+    if result.returncode != 0:
+        raise RuntimeError(f"pdftotext failed: {result.stderr}")
+    return result.stdout
+
+
+def handle_pandoc(doc_path: Path, file_type: str) -> str:
+    """Convert document to plain text using pandoc."""
+    # Map file types to pandoc input formats
+    pandoc_format_map = {
+        "tex": "latex",
+        "latex": "latex",
+        "typ": "typst",
+        "epub": "epub",
+        "fb2": "fb2",
+        "docbook": "docbook",
+        "odt": "odt",
+        "rtf": "rtf",
+        "opml": "opml",
+        "doc": "doc",
+        "ppt": "ppt",
+    }
+    input_format = pandoc_format_map.get(file_type)
+    cmd = ["pandoc", "-t", "plain", "--wrap=none", str(doc_path)]
+    if input_format:
+        cmd.insert(1, "-f")
+        cmd.insert(2, input_format)
+    result = subprocess.run(cmd, capture_output=True, text=True, timeout=120)
+    if result.returncode != 0:
+        raise RuntimeError(f"pandoc failed: {result.stderr}")
+    return result.stdout
+
+
+def handle_python_docx(doc_path: Path) -> str:
+    """Extract text from DOCX using python-docx."""
+    import docx
+
+    doc = docx.Document(str(doc_path))
+    paragraphs = [p.text for p in doc.paragraphs]
+    # Also extract table text
+    for table in doc.tables:
+        for row in table.rows:
+            cells = [cell.text for cell in row.cells]
+            paragraphs.append("\t".join(cells))
+    return "\n".join(paragraphs)
+
+
+def handle_python_pptx(doc_path: Path) -> str:
+    """Extract text from PPTX/PPTM/PPSX using python-pptx."""
+    from pptx import Presentation
+
+    prs = Presentation(str(doc_path))
+    texts = []
+    for slide in prs.slides:
+        for shape in slide.shapes:
+            if shape.has_text_frame:
+                for paragraph in shape.text_frame.paragraphs:
+                    text = paragraph.text.strip()
+                    if text:
+                        texts.append(text)
+    return "\n".join(texts)
+
+
+def handle_openpyxl(doc_path: Path) -> str:
+    """Extract text from XLSX/XLSM using openpyxl."""
+    import openpyxl
+
+    wb = openpyxl.load_workbook(str(doc_path), read_only=True, data_only=True)
+    lines = []
+    for sheet_name in wb.sheetnames:
+        ws = wb[sheet_name]
+        for row in ws.iter_rows(values_only=True):
+            cells = [str(c) if c is not None else "" for c in row]
+            if any(cells):
+                lines.append("\t".join(cells))
+    wb.close()
+    return "\n".join(lines)
+
+
+def handle_beautifulsoup(doc_path: Path) -> str:
+    """Extract text from HTML using BeautifulSoup."""
+    from bs4 import BeautifulSoup
+
+    try:
+        html_content = doc_path.read_text(encoding="utf-8")
+    except UnicodeDecodeError:
+        html_content = doc_path.read_text(encoding="latin-1")
+    soup = BeautifulSoup(html_content, "html.parser")
+    # Remove script and style elements
+    for tag in soup(["script", "style"]):
+        tag.decompose()
+    return soup.get_text(separator="\n", strip=True)
+
+
+def handle_python_email(doc_path: Path) -> str:
+    """Extract text from EML using Python email stdlib."""
+    try:
+        raw = doc_path.read_bytes()
+        msg = email.message_from_bytes(raw)
+    except Exception:
+        raw = doc_path.read_text(encoding="utf-8", errors="replace")
+        msg = email.message_from_string(raw)
+
+    parts = []
+    # Add headers
+    for header in ("From", "To", "Subject", "Date"):
+        val = msg.get(header)
+        if val:
+            parts.append(f"{header}: {val}")
+
+    if parts:
+        parts.append("")  # blank line after headers
+
+    # Extract body
+    if msg.is_multipart():
+        for part in msg.walk():
+            content_type = part.get_content_type()
+            if content_type == "text/plain":
+                payload = part.get_payload(decode=True)
+                if payload:
+                    charset = part.get_content_charset() or "utf-8"
+                    try:
+                        parts.append(payload.decode(charset, errors="replace"))
+                    except (LookupError, UnicodeDecodeError):
+                        parts.append(payload.decode("utf-8", errors="replace"))
+    else:
+        payload = msg.get_payload(decode=True)
+        if payload:
+            charset = msg.get_content_charset() or "utf-8"
+            try:
+                parts.append(payload.decode(charset, errors="replace"))
+            except (LookupError, UnicodeDecodeError):
+                parts.append(payload.decode("utf-8", errors="replace"))
+
+    return "\n".join(parts)
+
+
+def handle_extract_msg(doc_path: Path) -> str:
+    """Extract text from MSG using extract-msg."""
+    import extract_msg
+
+    msg = extract_msg.openMsg(str(doc_path))
+    parts = []
+    if msg.subject:
+        parts.append(f"Subject: {msg.subject}")
+    if msg.sender:
+        parts.append(f"From: {msg.sender}")
+    if msg.to:
+        parts.append(f"To: {msg.to}")
+    if msg.date:
+        parts.append(f"Date: {msg.date}")
+    if parts:
+        parts.append("")
+    if msg.body:
+        parts.append(msg.body)
+    msg.close()
+    return "\n".join(parts)
+
+
+def handle_nbformat(doc_path: Path) -> str:
+    """Extract text from Jupyter notebooks using nbformat."""
+    import nbformat
+
+    nb = nbformat.read(str(doc_path), as_version=4)
+    parts = []
+    for cell in nb.cells:
+        if cell.cell_type in ("code", "markdown", "raw"):
+            source = cell.source.strip()
+            if source:
+                parts.append(source)
+    return "\n\n".join(parts)
+
+
+def handle_xml_parse(doc_path: Path) -> str:
+    """Extract text content from XML using xml.etree."""
+    try:
+        tree = ET.parse(str(doc_path))
+    except ET.ParseError:
+        # Fallback: read as raw text
+        return handle_raw_source(doc_path)
+    root = tree.getroot()
+    texts = []
+    for elem in root.iter():
+        if elem.text and elem.text.strip():
+            texts.append(elem.text.strip())
+        if elem.tail and elem.tail.strip():
+            texts.append(elem.tail.strip())
+    return "\n".join(texts)
+
+
+def handle_xlrd(doc_path: Path) -> str:
+    """Extract text from XLS using xlrd."""
+    import xlrd
+
+    wb = xlrd.open_workbook(str(doc_path))
+    lines = []
+    for sheet_idx in range(wb.nsheets):
+        ws = wb.sheet_by_index(sheet_idx)
+        for row_idx in range(ws.nrows):
+            cells = [str(ws.cell_value(row_idx, col_idx)) for col_idx in range(ws.ncols)]
+            if any(c for c in cells):
+                lines.append("\t".join(cells))
+    return "\n".join(lines)
+
+
+def handle_antiword(doc_path: Path) -> str:
+    """Extract text from DOC using antiword, catdoc, or pandoc as fallbacks."""
+    # Try antiword first
+    try:
+        result = subprocess.run(
+            ["antiword", str(doc_path)],
+            capture_output=True,
+            text=True,
+            timeout=60,
+        )
+        if result.returncode == 0:
+            return result.stdout
+    except FileNotFoundError:
+        pass
+
+    # Fallback to catdoc
+    try:
+        result = subprocess.run(
+            ["catdoc", str(doc_path)],
+            capture_output=True,
+            text=True,
+            timeout=60,
+        )
+        if result.returncode == 0:
+            return result.stdout
+    except FileNotFoundError:
+        pass
+
+    # Fallback to textutil (macOS)
+    try:
+        result = subprocess.run(
+            ["textutil", "-convert", "txt", "-stdout", str(doc_path)],
+            capture_output=True,
+            text=True,
+            timeout=60,
+        )
+        if result.returncode == 0:
+            return result.stdout
+    except FileNotFoundError:
+        pass
+
+    raise RuntimeError("No DOC extraction tool available (need antiword, catdoc, or textutil)")
+
+
+def handle_ods(doc_path: Path) -> str:
+    """Extract text from ODS using odfpy."""
+    from odf import text as odf_text
+    from odf.opendocument import load as odf_load
+    from odf.table import Table, TableCell, TableRow
+
+    doc = odf_load(str(doc_path))
+    lines = []
+    for table in doc.spreadsheet.getElementsByType(Table):
+        for row in table.getElementsByType(TableRow):
+            cells = []
+            for cell in row.getElementsByType(TableCell):
+                # Get text content from cell
+                cell_texts = []
+                for p in cell.getElementsByType(odf_text.P):
+                    # Recursively get all text
+                    text_parts = []
+                    for node in p.childNodes:
+                        if hasattr(node, "data"):
+                            text_parts.append(node.data)
+                        elif hasattr(node, "__str__"):
+                            text_parts.append(str(node))
+                    cell_texts.append("".join(text_parts))
+                # Handle repeated cells
+                repeat = cell.getAttribute("numbercolumnsrepeated")
+                cell_text = " ".join(cell_texts)
+                if repeat and int(repeat) > 1 and cell_text:
+                    cells.extend([cell_text] * min(int(repeat), 100))
+                else:
+                    cells.append(cell_text)
+            if any(c.strip() for c in cells):
+                lines.append("\t".join(cells))
+    return "\n".join(lines)
+
+
+def handle_libreoffice(doc_path: Path) -> str:
+    """Extract text from PPT using LibreOffice CLI, with pandoc fallback."""
+    import tempfile
+
+    try:
+        with tempfile.TemporaryDirectory() as tmpdir:
+            result = subprocess.run(
+                ["libreoffice", "--headless", "--convert-to", "txt:Text", "--outdir", tmpdir, str(doc_path)],
+                capture_output=True,
+                text=True,
+                timeout=120,
+            )
+            if result.returncode == 0:
+                txt_files = list(Path(tmpdir).glob("*.txt"))
+                if txt_files:
+                    return txt_files[0].read_text(encoding="utf-8", errors="replace")
+    except FileNotFoundError:
+        pass
+
+    # Fallback: try textutil (macOS)
+    try:
+        result = subprocess.run(
+            ["textutil", "-convert", "txt", "-stdout", str(doc_path)],
+            capture_output=True,
+            text=True,
+            timeout=60,
+        )
+        if result.returncode == 0:
+            return result.stdout
+    except FileNotFoundError:
+        pass
+
+    raise RuntimeError("No PPT extraction tool available (need libreoffice or textutil)")
+
+
+def extract_text(doc_path: Path, file_type: str) -> str:
+    """Dispatch to the appropriate handler for the given file type."""
+    if file_type in RAW_SOURCE_TYPES:
+        return handle_raw_source(doc_path)
+    if file_type in PDFTOTEXT_TYPES:
+        return handle_pdftotext(doc_path)
+    if file_type in PANDOC_TYPES:
+        return handle_pandoc(doc_path, file_type)
+    if file_type in PYTHON_DOCX_TYPES:
+        return handle_python_docx(doc_path)
+    if file_type in PYTHON_PPTX_TYPES:
+        return handle_python_pptx(doc_path)
+    if file_type in OPENPYXL_TYPES:
+        return handle_openpyxl(doc_path)
+    if file_type in BEAUTIFULSOUP_TYPES:
+        return handle_beautifulsoup(doc_path)
+    if file_type in PYTHON_EMAIL_TYPES:
+        return handle_python_email(doc_path)
+    if file_type in EXTRACT_MSG_TYPES:
+        return handle_extract_msg(doc_path)
+    if file_type in NBFORMAT_TYPES:
+        return handle_nbformat(doc_path)
+    if file_type in XML_PARSE_TYPES:
+        return handle_xml_parse(doc_path)
+    if file_type in XLRD_TYPES:
+        return handle_xlrd(doc_path)
+    if file_type in ANTIWORD_TYPES:
+        return handle_antiword(doc_path)
+    if file_type in LIBREOFFICE_TYPES:
+        return handle_libreoffice(doc_path)
+    if file_type in ODS_TYPES:
+        return handle_ods(doc_path)
+    raise ValueError(f"No handler for file type: {file_type}")
+
+
+# ---------------------------------------------------------------------------
+# Core logic
+# ---------------------------------------------------------------------------
+
+
+def get_repo_root() -> Path:
+    """Find the repository root directory."""
+    current = Path(__file__).resolve().parent
+    while current != current.parent:
+        if (current / "Cargo.toml").exists() and (current / "test_documents").exists():
+            return current
+        current = current.parent
+    raise RuntimeError("Could not find repository root")
+
+
+def collect_fixtures(fixtures_dir: Path) -> list[Path]:
+    """Recursively collect all fixture JSON files."""
+    return sorted(fixtures_dir.rglob("*.json"))
+
+
+def load_mapping(repo_root: Path) -> dict[str, str]:
+    """Load the existing ground truth mapping."""
+    mapping_file = repo_root / "test_documents" / "ground_truth" / "ground_truth_mapping.json"
+    if mapping_file.exists():
+        with open(mapping_file) as f:
+            return json.load(f)
+    return {}
+
+
+def save_mapping(repo_root: Path, mapping: dict[str, str]) -> None:
+    """Save the ground truth mapping (sorted keys)."""
+    mapping_file = repo_root / "test_documents" / "ground_truth" / "ground_truth_mapping.json"
+    sorted_mapping = dict(sorted(mapping.items()))
+    with open(mapping_file, "w") as f:
+        json.dump(sorted_mapping, f, indent=2)
+        f.write("\n")
+
+
+def make_mapping_key(fixture_path: Path, fixtures_dir: Path) -> str:
+    """Generate a unique mapping key from the fixture path.
+
+    For top-level fixtures: stem (e.g., 'commonmark_sample')
+    For subdir fixtures: subdir/stem (e.g., 'md/duck.md' from md/duck.md.json)
+    """
+    rel = fixture_path.relative_to(fixtures_dir)
+    parts = rel.parts
+    if len(parts) > 1:
+        return f"{parts[0]}/{fixture_path.stem}"
+    return fixture_path.stem
+
+
+def process_fixture(
+    fixture_path: Path,
+    repo_root: Path,
+    fixtures_dir: Path,
+    mapping: dict[str, str],
+    dry_run: bool,
+    force: bool,
+    stats: dict[str, int],
+) -> None:
+    """Process a single fixture: generate ground truth, patch fixture, update mapping."""
+    with open(fixture_path) as f:
+        fixture = json.load(f)
+
+    file_type = fixture.get("file_type", "")
+
+    # Skip excluded types
+    if file_type in EXCLUDED_TYPES:
+        stats["skipped_excluded"] += 1
+        return
+
+    # Skip unhandled types
+    if file_type not in ALL_HANDLED_TYPES:
+        print(f"  SKIP (unhandled type): {fixture_path.name} ({file_type})")
+        stats["skipped_unhandled"] += 1
+        return
+
+    # Skip if already has ground truth (unless --force)
+    if fixture.get("ground_truth") and not force:
+        stats["skipped_existing"] += 1
+        return
+
+    # Resolve document path
+    doc_rel = fixture.get("document", "")
+    if not doc_rel:
+        print(f"  SKIP (no document): {fixture_path.name}")
+        stats["skipped_no_doc"] += 1
+        return
+
+    doc_path = (fixture_path.parent / doc_rel).resolve()
+    if not doc_path.exists():
+        print(f"  SKIP (doc not found): {fixture_path.name} -> {doc_path}")
+        stats["skipped_missing_doc"] += 1
+        return
+
+    # Determine ground truth output path
+    gt_dir = repo_root / "test_documents" / "ground_truth" / file_type
+    gt_filename = fixture_path.stem + ".txt"
+    gt_path = gt_dir / gt_filename
+
+    # Compute relative path from fixture to ground truth
+    gt_rel = os.path.relpath(gt_path, fixture_path.parent)
+
+    # Mapping key
+    mapping_key = make_mapping_key(fixture_path, fixtures_dir)
+
+    if dry_run:
+        print(f"  [DRY RUN] {fixture_path.name} ({file_type})")
+        print(f"    doc: {doc_path}")
+        print(f"    gt:  {gt_path}")
+        print(f"    key: {mapping_key}")
+        stats["would_generate"] += 1
+        return
+
+    # Extract text
+    try:
+        text = extract_text(doc_path, file_type)
+    except Exception as e:
+        print(f"  ERROR extracting {fixture_path.name}: {e}")
+        stats["errors"] += 1
+        return
+
+    # Write ground truth file
+    gt_dir.mkdir(parents=True, exist_ok=True)
+    gt_path.write_text(text, encoding="utf-8")
+
+    # Patch fixture JSON
+    fixture["ground_truth"] = {
+        "text_file": gt_rel,
+        "source": get_source_type(file_type),
+    }
+    with open(fixture_path, "w") as f:
+        json.dump(fixture, f, indent=2)
+        f.write("\n")
+
+    # Update mapping
+    gt_mapping_path = str(gt_path.relative_to(repo_root))
+    mapping[mapping_key] = gt_mapping_path
+
+    stats["generated"] += 1
+
+
+def main() -> int:
+    parser = argparse.ArgumentParser(description="Generate ground truth for benchmark fixtures")
+    parser.add_argument("--dry-run", action="store_true", help="Print planned actions without writing")
+    parser.add_argument("--format-filter", type=str, default="", help="Comma-separated file types to process")
+    parser.add_argument("--force", action="store_true", help="Regenerate even if ground truth exists")
+    parser.add_argument("--skip-types", type=str, default="", help="Comma-separated file types to skip")
+    args = parser.parse_args()
+
+    repo_root = get_repo_root()
+    fixtures_dir = repo_root / "tools" / "benchmark-harness" / "fixtures"
+
+    print(f"Repository root: {repo_root}")
+    print(f"Fixtures dir: {fixtures_dir}")
+    if args.dry_run:
+        print("DRY RUN MODE - no files will be written\n")
+
+    format_filter = set(args.format_filter.split(",")) if args.format_filter else None
+    skip_types = set(args.skip_types.split(",")) if args.skip_types else set()
+
+    # Load existing mapping
+    mapping = load_mapping(repo_root)
+    initial_mapping_size = len(mapping)
+
+    # Collect and process fixtures
+    fixture_paths = collect_fixtures(fixtures_dir)
+    print(f"Found {len(fixture_paths)} fixture files\n")
+
+    stats: dict[str, int] = {
+        "generated": 0,
+        "would_generate": 0,
+        "skipped_existing": 0,
+        "skipped_excluded": 0,
+        "skipped_unhandled": 0,
+        "skipped_no_doc": 0,
+        "skipped_missing_doc": 0,
+        "errors": 0,
+    }
+
+    for fixture_path in fixture_paths:
+        # Load to check file type for filtering
+        try:
+            with open(fixture_path) as f:
+                fixture_data = json.load(f)
+        except (json.JSONDecodeError, OSError) as e:
+            print(f"  ERROR reading {fixture_path.name}: {e}")
+            stats["errors"] += 1
+            continue
+
+        file_type = fixture_data.get("file_type", "")
+        if format_filter and file_type not in format_filter:
+            continue
+        if file_type in skip_types:
+            continue
+
+        process_fixture(fixture_path, repo_root, fixtures_dir, mapping, args.dry_run, args.force, stats)
+
+    # Save mapping
+    if not args.dry_run and stats["generated"] > 0:
+        save_mapping(repo_root, mapping)
+        new_entries = len(mapping) - initial_mapping_size
+        print(f"\nUpdated ground_truth_mapping.json: {new_entries} new entries (total: {len(mapping)})")
+
+    # Print summary
+    print(f"\n{'=' * 50}")
+    print("Summary:")
+    print(f"  Generated:         {stats['generated']}")
+    if args.dry_run:
+        print(f"  Would generate:    {stats['would_generate']}")
+    print(f"  Skipped (existing): {stats['skipped_existing']}")
+    print(f"  Skipped (excluded): {stats['skipped_excluded']}")
+    print(f"  Skipped (unhandled): {stats['skipped_unhandled']}")
+    print(f"  Skipped (no doc):   {stats['skipped_no_doc']}")
+    print(f"  Skipped (missing):  {stats['skipped_missing_doc']}")
+    print(f"  Errors:            {stats['errors']}")
+
+    return 1 if stats["errors"] > 0 else 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
--- a/tools/benchmark-harness/scripts/generate_libreoffice_gt.sh
+++ b/tools/benchmark-harness/scripts/generate_libreoffice_gt.sh
@@ -0,0 +1,93 @@
+#!/usr/bin/env bash
+# Generate markdown ground truth for formats requiring LibreOffice conversion.
+# Workflow: soffice → intermediate format → pandoc -t gfm → sanitize
+#
+# Prerequisites:
+#   - soffice (LibreOffice) on PATH
+#   - pandoc on PATH
+#   - python3 on PATH
+#
+# Usage: bash tools/benchmark-harness/scripts/generate_libreoffice_gt.sh
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+REPO_ROOT="$(cd "$SCRIPT_DIR/../../.." && pwd)"
+SANITIZE="$SCRIPT_DIR/sanitize_pandoc_gt.py"
+GT_ROOT="$REPO_ROOT/test_documents/ground_truth"
+
+# Use a private scratch directory that is removed on exit. A fixed, shared
+# path would let a converted file left over from an earlier run pass the
+# "was it converted?" check after soffice failed.
+TMP_DIR="$(mktemp -d "${TMPDIR:-/tmp}/gt_convert.XXXXXX")"
+trap 'rm -rf "$TMP_DIR"' EXIT
+
+# convert_group <source_ext> <intermediate_ext> <file>...
+# Converts each file to <intermediate_ext> with soffice, renders it to GFM
+# with pandoc, sanitizes it and writes ground_truth/<source_ext>/<name>.md.
+# A failing file is reported and skipped; the remaining files still run.
+convert_group() {
+  local src_ext="$1" mid_ext="$2"
+  shift 2
+  local out_dir="$GT_ROOT/$src_ext"
+  local f name gt_md converted size
+
+  mkdir -p "$out_dir"
+
+  for f in "$@"; do
+    if [ ! -f "$f" ]; then
+      echo "  SKIP (not found): $f"
+      continue
+    fi
+    name=$(basename "$f" ".$src_ext")
+    gt_md="$out_dir/${name}.md"
+    converted="$TMP_DIR/${name}.${mid_ext}"
+
+    # Convert via LibreOffice. A failure must not abort the script under
+    # `set -e`; it is detected below through the missing output file.
+    rm -f "$converted"
+    soffice --headless --convert-to "$mid_ext" --outdir "$TMP_DIR" "$f" 2>/dev/null || true
+
+    # Render into a temporary file first so a pandoc or sanitizer failure
+    # never leaves a truncated ground truth file behind.
+    if [ -f "$converted" ] &&
+      pandoc -f "$mid_ext" -t gfm --wrap=none "$converted" 2>/dev/null |
+      python3 "$SANITIZE" >"$gt_md.tmp"; then
+      mv "$gt_md.tmp" "$gt_md"
+      size=$(wc -c <"$gt_md" | tr -d ' ')
+      echo "  $src_ext: $name → $size bytes  ($gt_md)"
+    else
+      rm -f "$gt_md.tmp"
+      echo "  $src_ext: $name FAILED conversion"
+    fi
+  done
+}
+
+# --- DOC → DOCX → GFM ---
+echo "=== DOC ground truth generation ==="
+
+doc_files=(
+  "$REPO_ROOT/test_documents/vendored/unstructured/doc/simple.doc"
+  "$REPO_ROOT/test_documents/vendored/unstructured/doc/fake.doc"
+  "$REPO_ROOT/test_documents/vendored/unstructured/doc/duplicate-paragraphs.doc"
+  "$REPO_ROOT/test_documents/vendored/unstructured/doc/fake-doc-emphasized-text.doc"
+  "$REPO_ROOT/test_documents/doc/unit_test_lists.doc"
+)
+
+convert_group doc docx "${doc_files[@]}"
+
+# --- PPT → PPTX → GFM ---
+echo ""
+echo "=== PPT ground truth generation ==="
+
+ppt_files=(
+  "$REPO_ROOT/test_documents/ppt/simple.ppt"
+)
+
+convert_group ppt pptx "${ppt_files[@]}"
+
+# --- ODS: no pandoc support for spreadsheet input ---
+echo ""
+echo "=== ODS: skipped (pandoc cannot read spreadsheet formats) ==="
+echo "  Existing text GT in test_documents/ground_truth/ods/ is sufficient."
+
+echo ""
+echo "Done. Validate with:"
+echo "  cargo run --release -p benchmark-harness -- validate-gt --fixtures tools/benchmark-harness/fixtures/doc/"
+echo "  cargo run --release -p benchmark-harness -- validate-gt --fixtures tools/benchmark-harness/fixtures/"
--- a/tools/benchmark-harness/scripts/generate_markdown_gt.py
+++ b/tools/benchmark-harness/scripts/generate_markdown_gt.py
@@ -0,0 +1,249 @@
+#!/usr/bin/env -S uv run --no-project --script
+# /// script
+# requires-python = ">=3.10"
+# dependencies = ["google-genai>=1.0"]
+# ///
+"""Generate proper markdown ground truth from PDF documents using Gemini.
+
+Reads benchmark fixture JSON files to locate PDFs, sends each to a Gemini model
+(default: gemini-2.0-flash, override with --model) via Vertex AI, and saves the
+extracted markdown to the ground truth directory.
+
+Usage:
+    uv run tools/benchmark-harness/scripts/generate_markdown_gt.py [OPTIONS]
+
+Examples:
+    # Generate for all nougat + pdfa documents
+    uv run tools/benchmark-harness/scripts/generate_markdown_gt.py
+
+    # Generate for a specific document
+    uv run tools/benchmark-harness/scripts/generate_markdown_gt.py --filter nougat_001
+
+    # Dry run to see what would be processed
+    uv run tools/benchmark-harness/scripts/generate_markdown_gt.py --dry-run
+
+    # Force regeneration of existing files
+    uv run tools/benchmark-harness/scripts/generate_markdown_gt.py --force
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import signal
+import sys
+import time
+from pathlib import Path
+
+from google import genai
+from google.genai.types import GenerateContentConfig, Part
+
+EXTRACTION_PROMPT = """\
+Extract the complete text content of this PDF document as clean Markdown.
+
+Rules:
+- Use proper heading hierarchy (# for document title, ## for major sections, ### for subsections)
+- Render tables using markdown table syntax with | delimiters and --- separator row
+- Use numbered lists (1. 2. 3.) and bullet lists (- item) where the document uses them
+- Preserve emphasis: **bold** and *italic* where the original uses them
+- Use ``` code blocks for code snippets, formulas, or monospace content
+- Use <!-- image --> as a placeholder where figures or images appear
+- Omit page numbers, running headers/footers, and watermarks
+- Preserve the document's reading order
+- Do NOT invent or hallucinate content — only extract what is actually in the document
+- Do NOT wrap the output in a markdown code fence — return raw markdown directly
+- For multi-column layouts, read left column first, then right column
+- For forms with label-value pairs, use **Label:** Value format
+"""
+
+
+def get_repo_root() -> Path:
+    current = Path(__file__).resolve().parent
+    while current != current.parent:
+        if (current / "Cargo.toml").exists() and (current / "test_documents").exists():
+            return current
+        current = current.parent
+    raise RuntimeError("Could not find repository root")
+
+
+def discover_fixtures(fixtures_dir: Path, name_filter: str | None = None) -> list[dict]:
+    """Find PDF fixtures that need markdown ground truth."""
+    results = []
+    for fixture_path in sorted(fixtures_dir.rglob("*.json")):
+        try:
+            with open(fixture_path) as f:
+                fixture = json.load(f)
+        except (json.JSONDecodeError, OSError):
+            continue
+
+        if fixture.get("file_type") != "pdf":
+            continue
+
+        name = fixture_path.stem
+        if name_filter and name_filter not in name:
+            continue
+
+        doc_rel = fixture.get("document", "")
+        if not doc_rel:
+            continue
+
+        doc_path = (fixture_path.parent / doc_rel).resolve()
+        if not doc_path.exists():
+            continue
+
+        results.append(
+            {
+                "name": name,
+                "fixture_path": fixture_path,
+                "doc_path": doc_path,
+                "fixture": fixture,
+            }
+        )
+
+    return results
+
+
+class _Timeout(Exception):
+    """Raised by ``_timeout_handler`` when an alarm signal fires."""
+
+    pass
+
+
+def _timeout_handler(signum, frame):
+    """Signal handler that turns the delivered signal into a ``_Timeout`` exception.
+
+    Both arguments are the standard signal-handler parameters and are unused.
+    """
+    raise _Timeout("API call timed out")
+
+
+def generate_markdown(
+    client: genai.Client,
+    pdf_path: Path,
+    model: str,
+    timeout: int = 120,
+) -> str:
+    """Send PDF to Gemini and get markdown extraction."""
+    pdf_bytes = pdf_path.read_bytes()
+
+    old_handler = signal.signal(signal.SIGALRM, _timeout_handler)
+    signal.alarm(timeout)
+    try:
+        response = client.models.generate_content(
+            model=model,
+            contents=[
+                Part.from_bytes(data=pdf_bytes, mime_type="application/pdf"),
+                EXTRACTION_PROMPT,
+            ],
+            config=GenerateContentConfig(
+                temperature=0.1,
+                max_output_tokens=8192,
+            ),
+        )
+    finally:
+        signal.alarm(0)
+        signal.signal(signal.SIGALRM, old_handler)
+
+    text = response.text or ""
+
+    # Strip markdown code fence wrapper if Gemini added one
+    if text.startswith("```markdown\n"):
+        text = text[len("```markdown\n") :]
+        text = text.removesuffix("\n```")
+    elif text.startswith("```md\n"):
+        text = text[len("```md\n") :]
+        text = text.removesuffix("\n```")
+    elif text.startswith("```\n"):
+        text = text[len("```\n") :]
+        text = text.removesuffix("\n```")
+
+    return text.strip() + "\n"
+
+
+def main() -> int:
+    parser = argparse.ArgumentParser(description="Generate markdown ground truth from PDFs using Gemini")
+    parser.add_argument(
+        "--filter", type=str, default=None, help="Only process fixtures whose name contains this string"
+    )
+    parser.add_argument("--dry-run", action="store_true", help="Show what would be processed without calling the API")
+    parser.add_argument("--force", action="store_true", help="Regenerate even if .md file already exists")
+    parser.add_argument(
+        "--model", type=str, default="gemini-2.0-flash", help="Gemini model to use (default: gemini-2.0-flash)"
+    )
+    parser.add_argument("--project", type=str, default="boxwood-spirit-479620-r5", help="GCP project ID")
+    parser.add_argument("--location", type=str, default="us-central1", help="Vertex AI location")
+    parser.add_argument("--delay", type=float, default=1.0, help="Delay between API calls in seconds (rate limiting)")
+    parser.add_argument("--timeout", type=int, default=120, help="Per-request timeout in seconds (default: 120)")
+    parser.add_argument("--max-size", type=int, default=None, help="Skip PDFs larger than this many KB")
+    args = parser.parse_args()
+
+    repo_root = get_repo_root()
+    fixtures_dir = repo_root / "tools" / "benchmark-harness" / "fixtures"
+    gt_dir = repo_root / "test_documents" / "ground_truth" / "pdf"
+
+    print(f"Repository root: {repo_root}")
+    print(f"Fixtures dir:    {fixtures_dir}")
+    print(f"Output dir:      {gt_dir}")
+    print(f"Model:           {args.model}")
+    if args.dry_run:
+        print("DRY RUN MODE\n")
+
+    fixtures = discover_fixtures(fixtures_dir, args.filter)
+    print(f"Found {len(fixtures)} PDF fixtures")
+
+    if not args.dry_run:
+        client = genai.Client(
+            vertexai=True,
+            project=args.project,
+            location=args.location,
+        )
+
+    stats = {"generated": 0, "skipped": 0, "errors": 0}
+
+    for item in fixtures:
+        name = item["name"]
+        md_path = gt_dir / f"{name}.md"
+        file_size_kb = item["doc_path"].stat().st_size / 1024
+
+        if md_path.exists() and not args.force:
+            stats["skipped"] += 1
+            continue
+
+        if args.max_size and file_size_kb > args.max_size:
+            print(f"  Skipping {name} ({file_size_kb:.0f} KB > {args.max_size} KB)")
+            stats["skipped"] += 1
+            continue
+
+        if args.dry_run:
+            print(f"  [DRY] {name} ({file_size_kb:.0f} KB)")
+            stats["generated"] += 1
+            continue
+
+        print(f"  Processing {name} ({file_size_kb:.0f} KB)...", end=" ", flush=True)
+        try:
+            start = time.time()
+            markdown = generate_markdown(client, item["doc_path"], args.model, timeout=args.timeout)
+            elapsed = time.time() - start
+
+            gt_dir.mkdir(parents=True, exist_ok=True)
+            md_path.write_text(markdown, encoding="utf-8")
+
+            # Quick quality check
+            lines = markdown.strip().split("\n")
+            headings = sum(1 for l in lines if l.startswith("#"))
+            tables = sum(1 for l in lines if "|" in l and "---" not in l)
+            print(f"OK ({elapsed:.1f}s, {len(lines)} lines, {headings} headings, {tables} table rows)")
+            stats["generated"] += 1
+
+            time.sleep(args.delay)
+
+        except _Timeout:
+            print(f"TIMEOUT ({args.timeout}s)")
+            stats["errors"] += 1
+        except Exception as e:
+            print(f"ERROR: {e}")
+            stats["errors"] += 1
+
+    print(f"\n{'=' * 50}")
+    print(f"Generated: {stats['generated']}")
+    print(f"Skipped:   {stats['skipped']} (already exist)")
+    print(f"Errors:    {stats['errors']}")
+
+    return 0 if stats["errors"] == 0 else 1
+
+
+if __name__ == "__main__":
+    sys.exit(main())
--- a/tools/benchmark-harness/scripts/generate_md_gt.sh
+++ b/tools/benchmark-harness/scripts/generate_md_gt.sh
@@ -0,0 +1,212 @@
+#!/usr/bin/env bash
+# Generate markdown and text ground truth for docbook, typst, and fictionbook formats
+# using pandoc + sanitize_pandoc_gt.py, then create benchmark fixture JSON files.
+set -euo pipefail
+
+REPO_ROOT="$(cd "$(dirname "$0")/../../.." && pwd)"
+SANITIZE="$REPO_ROOT/tools/benchmark-harness/scripts/sanitize_pandoc_gt.py"
+FIXTURES_DIR="$REPO_ROOT/tools/benchmark-harness/fixtures"
+
+cd "$REPO_ROOT"
+
+echo "=== Step 1: Generate MD ground truth via pandoc + sanitize ==="
+
+# --- DocBook ---
+# Always regenerated from source; the output name is the input name minus its extension.
+echo "--- DocBook ---"
+for f in test_documents/docbook/*.dbk test_documents/docbook/*.docbook test_documents/docbook/*.docbook4 test_documents/docbook/*.docbook5; do
+  [ -f "$f" ] || continue
+  name=$(basename "$f" | sed 's/\.[^.]*$//')
+  mkdir -p test_documents/ground_truth/docbook
+  pandoc -f docbook -t gfm --wrap=none "$f" 2>/dev/null | python3 "$SANITIZE" >"test_documents/ground_truth/docbook/${name}.md"
+  echo "docbook: $name ($(wc -c <"test_documents/ground_truth/docbook/${name}.md") bytes)"
+done
+
+# --- Typst ---
+echo "--- Typst ---"
+for f in test_documents/typst/*.typ; do
+  [ -f "$f" ] || continue
+  name=$(basename "$f" .typ)
+  # Typst GT goes in both typ/ (matching existing convention) and typst/
+  for gtdir in test_documents/ground_truth/typ test_documents/ground_truth/typst; do
+    mkdir -p "$gtdir"
+    pandoc -f typst -t gfm --wrap=none "$f" 2>/dev/null | python3 "$SANITIZE" >"${gtdir}/${name}.md"
+  done
+  echo "typst: $name ($(wc -c <"test_documents/ground_truth/typ/${name}.md") bytes)"
+done
+
+# --- FictionBook (fb2) ---
+# Existing fb2 ground truth is kept; only missing files are generated.
+echo "--- FictionBook ---"
+for f in test_documents/fictionbook/*.fb2; do
+  [ -f "$f" ] || continue
+  name=$(basename "$f" .fb2)
+  mkdir -p test_documents/ground_truth/fb2
+  existing="test_documents/ground_truth/fb2/${name}.md"
+  if [ ! -f "$existing" ]; then
+    # Write to a temp file and rename on success. With `set -euo pipefail` a
+    # pandoc failure aborts the script here; redirecting straight to
+    # "$existing" would leave an empty file that every later run would treat
+    # as valid, already-existing ground truth.
+    pandoc -f fb2 -t gfm --wrap=none "$f" 2>/dev/null | python3 "$SANITIZE" >"${existing}.tmp"
+    mv "${existing}.tmp" "$existing"
+    echo "fb2: $name (new, $(wc -c <"$existing") bytes)"
+  else
+    echo "fb2: $name (exists, $(wc -c <"$existing") bytes)"
+  fi
+done
+
+echo ""
+echo "=== Step 2: Generate text GT from MD GT ==="
+
+# For each .md GT file, generate .txt if missing
+for md_file in test_documents/ground_truth/docbook/*.md test_documents/ground_truth/typ/*.md test_documents/ground_truth/fb2/*.md; do
+  [ -f "$md_file" ] || continue
+  txt_file="${md_file%.md}.txt"
+  if [ ! -f "$txt_file" ]; then
+    # Temp file + rename: if pandoc fails, `set -e` aborts before the rename,
+    # so no empty .txt is left behind to be mistaken for existing ground truth.
+    pandoc -f gfm -t plain --wrap=none "$md_file" >"${txt_file}.tmp"
+    mv "${txt_file}.tmp" "$txt_file"
+    echo "text: $(basename "$txt_file") (new, $(wc -c <"$txt_file") bytes)"
+  fi
+done
+
+echo ""
+echo "=== Step 3: Create fixture JSON files ==="
+
+# Helper to create fixture JSON
+create_fixture() {
+  local doc_path="$1"
+  local file_type="$2"
+  local gt_text="$3"
+  local gt_md="$4"
+  local fixture_out="$5"
+  local description="$6"
+  local category="$7"
+
+  local file_size
+  file_size=$(stat -f %z "$doc_path" 2>/dev/null || wc -c <"$doc_path" | tr -d ' ')
+
+  local name
+  name=$(basename "$doc_path" | sed 's/\.[^.]*$//')
+
+  # Compute relative paths from fixtures dir
+  local rel_doc="../../../${doc_path}"
+  local rel_text="../../../${gt_text}"
+  local rel_md="../../../${gt_md}"
+
+  local json
+  if [ -f "$gt_md" ] && [ -f "$gt_text" ]; then
+    json=$(
+      cat <<EOJSON
+{
+	"document": "${rel_doc}",
+	"file_type": "${file_type}",
+	"file_size": ${file_size},
+	"expected_frameworks": ["kreuzberg"],
+	"metadata": {
+		"description": "${description}",
+		"category": "${category}"
+	},
+	"ground_truth": {
+		"text_file": "${rel_text}",
+		"markdown_file": "${rel_md}",
+		"source": "pandoc"
+	}
+}
+EOJSON
+    )
+  elif [ -f "$gt_text" ]; then
+    json=$(
+      cat <<EOJSON
+{
+	"document": "${rel_doc}",
+	"file_type": "${file_type}",
+	"file_size": ${file_size},
+	"expected_frameworks": ["kreuzberg"],
+	"metadata": {
+		"description": "${description}",
+		"category": "${category}"
+	},
+	"ground_truth": {
+		"text_file": "${rel_text}",
+		"source": "pandoc"
+	}
+}
+EOJSON
+    )
+  fi
+
+  echo "$json" >"$fixture_out"
+  echo "fixture: $(basename "$fixture_out")"
+}
+
+# --- DocBook fixtures ---
+echo "--- DocBook fixtures ---"
+for f in test_documents/docbook/*.dbk test_documents/docbook/*.docbook test_documents/docbook/*.docbook4 test_documents/docbook/*.docbook5; do
+  if [ ! -f "$f" ]; then
+    continue
+  fi
+  # Split "<dir>/<name>.<ext>" with parameter expansion instead of basename/sed
+  base=${f##*/}
+  name=${base%.*}
+  ext=${base##*.}
+  gt_md="test_documents/ground_truth/docbook/${name}.md"
+  gt_txt="test_documents/ground_truth/docbook/${name}.txt"
+
+  # Only .dbk keeps its own file_type; every other extension maps to "docbook"
+  if [ "$ext" = "dbk" ]; then
+    ft="dbk"
+  else
+    ft="docbook"
+  fi
+
+  # Fixture names use underscores where the document name has dashes
+  fixture_name="docbook_${name//-/_}.json"
+  create_fixture "$f" "$ft" "$gt_txt" "$gt_md" "${FIXTURES_DIR}/${fixture_name}" "DocBook document: ${name}" "docbook"
+done
+
+# --- Typst fixtures (update existing to add markdown_file) ---
+echo "--- Typst fixtures ---"
+for f in test_documents/typst/*.typ; do
+  if [ ! -f "$f" ]; then
+    continue
+  fi
+  name=${f##*/}
+  name=${name%.typ}
+  gt_md="test_documents/ground_truth/typ/${name}.md"
+  # Prefer the typst_-prefixed text file; fall back to the plain name
+  gt_txt="test_documents/ground_truth/typ/typst_${name}.txt"
+  [ -f "$gt_txt" ] || gt_txt="test_documents/ground_truth/typ/${name}.txt"
+
+  fixture_name="typst_${name}.json"
+  create_fixture "$f" "typ" "$gt_txt" "$gt_md" "${FIXTURES_DIR}/${fixture_name}" "Typst document: ${name}" "typst"
+done
+
+# --- FictionBook fixtures (update existing to add markdown_file) ---
+echo "--- FictionBook fixtures ---"
+for f in test_documents/fictionbook/*.fb2; do
+  if [ ! -f "$f" ]; then
+    continue
+  fi
+  name=${f##*/}
+  name=${name%.fb2}
+  gt_md="test_documents/ground_truth/fb2/${name}.md"
+  # Prefer the plain text file name; fall back to the fb2_-prefixed one
+  gt_txt="test_documents/ground_truth/fb2/${name}.txt"
+  [ -f "$gt_txt" ] || gt_txt="test_documents/ground_truth/fb2/fb2_${name}.txt"
+
+  fixture_name="fb2_${name}.json"
+  create_fixture "$f" "fb2" "$gt_txt" "$gt_md" "${FIXTURES_DIR}/${fixture_name}" "FictionBook document: ${name}" "fictionbook"
+done
+
+echo ""
+echo "=== Step 4: Validate ==="
+
+echo "--- Verifying GT files are non-empty ---"
+empty_count=0
+for f in test_documents/ground_truth/docbook/*.md test_documents/ground_truth/typ/*.md test_documents/ground_truth/fb2/*.md; do
+  [ -f "$f" ] || continue
+  size=$(wc -c <"$f" | tr -d ' ')
+  if [ "$size" -le 1 ]; then
+    echo "WARNING: $f is empty/near-empty ($size bytes)"
+    empty_count=$((empty_count + 1))
+  fi
+done
+echo "Empty/near-empty GT files: $empty_count"
+
+echo ""
+echo "=== Summary ==="
+echo "DocBook MD GT files: $(find test_documents/ground_truth/docbook/*.md -maxdepth 1 2>/dev/null | wc -l | tr -d ' ')"
+echo "DocBook TXT GT files: $(find test_documents/ground_truth/docbook/*.txt -maxdepth 1 2>/dev/null | wc -l | tr -d ' ')"
+echo "Typst MD GT files: $(find test_documents/ground_truth/typ/*.md -maxdepth 1 2>/dev/null | wc -l | tr -d ' ')"
+echo "Typst TXT GT files: $(find test_documents/ground_truth/typ/*.txt -maxdepth 1 2>/dev/null | wc -l | tr -d ' ')"
+echo "FB2 MD GT files: $(find test_documents/ground_truth/fb2/*.md -maxdepth 1 2>/dev/null | wc -l | tr -d ' ')"
+echo "FB2 TXT GT files: $(find test_documents/ground_truth/fb2/*.txt -maxdepth 1 2>/dev/null | wc -l | tr -d ' ')"
+echo ""
+echo "Fixture files created/updated:"
+ls -1 "${FIXTURES_DIR}"/docbook_*.json "${FIXTURES_DIR}"/typst_*.json "${FIXTURES_DIR}"/fb2_*.json "${FIXTURES_DIR}"/dbk_*.json 2>/dev/null
--- a/tools/benchmark-harness/scripts/generate_pdf_gt_mistral.py
+++ b/tools/benchmark-harness/scripts/generate_pdf_gt_mistral.py
@@ -0,0 +1,212 @@
+#!/usr/bin/env python3
+"""Generate PDF markdown ground truth using Mistral's OCR API (mistral-ocr-latest).
+
+Usage:
+    # Generate GT for all PDFs missing MD GT:
+    python generate_pdf_gt_mistral.py
+
+    # Generate GT for a specific fixture:
+    python generate_pdf_gt_mistral.py tools/benchmark-harness/fixtures/pdf/2203.01017v2.json
+
+    # Dry run (show what would be generated):
+    python generate_pdf_gt_mistral.py --dry-run
+
+    # Pilot batch (first N):
+    python generate_pdf_gt_mistral.py --limit 10
+"""
+
+import argparse
+import base64
+import json
+import os
+import sys
+import time
+from pathlib import Path
+
+MISTRAL_API_KEY = os.environ.get("MISTRAL_API_KEY", "")
+MISTRAL_MODEL = "mistral-ocr-latest"
+MISTRAL_API_URL = "https://api.mistral.ai/v1/ocr"
+
+PROMPT = (
+    "Convert this PDF to clean GFM (GitHub Flavored Markdown). "
+    "Preserve the document structure: headings, paragraphs, tables, lists, "
+    "code blocks, and formulas. Use proper heading hierarchy (# for title, ## for sections). "
+    "Render tables as GFM pipe tables. Do not add commentary or explanations."
+)
+
+
+def load_env():
+    """Load MISTRAL_API_KEY from ../liter-llm/.env if not in environment."""
+    global MISTRAL_API_KEY
+    if MISTRAL_API_KEY:
+        return
+    env_path = Path(__file__).resolve().parents[3] / ".." / "liter-llm" / ".env"
+    if env_path.exists():
+        for line in env_path.read_text().splitlines():
+            if line.startswith("MISTRAL_API_KEY="):
+                MISTRAL_API_KEY = line.split("=", 1)[1].strip()
+                return
+    print("ERROR: MISTRAL_API_KEY not found", file=sys.stderr)
+    sys.exit(1)
+
+
+def call_mistral_ocr(pdf_path: str) -> str:
+    """Send a PDF to Mistral OCR and return markdown."""
+    import httpx
+
+    pdf_data = Path(pdf_path).read_bytes()
+    b64 = base64.standard_b64encode(pdf_data).decode("ascii")
+
+    payload = {
+        "model": MISTRAL_MODEL,
+        "document": {
+            "type": "document_url",
+            "document_url": f"data:application/pdf;base64,{b64}",
+        },
+    }
+
+    resp = httpx.post(
+        MISTRAL_API_URL,
+        json=payload,
+        headers={
+            "Authorization": f"Bearer {MISTRAL_API_KEY}",
+            "Content-Type": "application/json",
+        },
+        timeout=120.0,
+    )
+    resp.raise_for_status()
+    data = resp.json()
+
+    # Extract markdown from pages
+    pages = data.get("pages", [])
+    if not pages:
+        return ""
+    return "\n\n".join(p.get("markdown", "") for p in pages)
+
+
+def find_fixtures_needing_gt() -> list[tuple[str, str, str]]:
+    """Find PDF fixtures that don't have markdown GT.
+    Returns list of (fixture_path, pdf_path, gt_md_path).
+    """
+    fixtures_dir = Path("tools/benchmark-harness/fixtures/pdf")
+    results = []
+
+    for f in sorted(fixtures_dir.glob("*.json")):
+        data = json.loads(f.read_text())
+        gt = data.get("ground_truth")
+        if gt is None:
+            continue
+        if gt.get("markdown_file"):
+            continue  # Already has MD GT
+
+        doc_path = data.get("document", "")
+        pdf_path = str((f.parent / doc_path).resolve())
+        if not Path(pdf_path).exists():
+            continue
+
+        # Determine GT output path
+        text_file = gt.get("text_file", "")
+        if text_file:
+            gt_md = text_file.rsplit(".", 1)[0] + ".md"
+        else:
+            name = Path(doc_path).stem
+            gt_md = f"../../../../test_documents/ground_truth/pdf/{name}.md"
+
+        gt_md_path = str((f.parent / gt_md).resolve())
+        results.append((str(f), pdf_path, gt_md_path))
+
+    return results
+
+
+def process_fixture(fixture_path: str, pdf_path: str, gt_md_path: str, dry_run: bool = False) -> bool:
+    """Process a single fixture. Returns True if successful."""
+    name = Path(pdf_path).stem
+    size_mb = Path(pdf_path).stat().st_size / (1024 * 1024)
+
+    if dry_run:
+        print(f"  [dry-run] {name} ({size_mb:.1f}MB) → {gt_md_path}")
+        return True
+
+    print(f"  Processing {name} ({size_mb:.1f}MB)...", end=" ", flush=True)
+
+    try:
+        markdown = call_mistral_ocr(pdf_path)
+        if not markdown.strip():
+            print("EMPTY")
+            return False
+
+        # Sanitize
+        from sanitize_pandoc_gt import sanitize
+
+        markdown = sanitize(markdown)
+
+        # Write GT file
+        Path(gt_md_path).parent.mkdir(parents=True, exist_ok=True)
+        Path(gt_md_path).write_text(markdown)
+
+        # Update fixture JSON
+        data = json.loads(Path(fixture_path).read_text())
+        gt = data["ground_truth"]
+        # Compute relative path from fixture to GT
+        rel_path = os.path.relpath(gt_md_path, Path(fixture_path).parent)
+        gt["markdown_file"] = rel_path
+        gt["source"] = "mistral-pixtral"
+        Path(fixture_path).write_text(json.dumps(data, indent=2) + "\n")
+
+        print(f"OK ({len(markdown)} bytes)")
+        return True
+
+    except Exception as e:
+        print(f"ERROR: {e}")
+        return False
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Generate PDF GT with Mistral OCR")
+    parser.add_argument("fixture", nargs="?", help="Specific fixture JSON to process")
+    parser.add_argument("--dry-run", action="store_true", help="Show what would be done")
+    parser.add_argument("--limit", type=int, default=0, help="Process only first N fixtures")
+    parser.add_argument("--delay", type=float, default=1.0, help="Delay between API calls (seconds)")
+    args = parser.parse_args()
+
+    load_env()
+
+    if args.fixture:
+        # Process single fixture
+        data = json.loads(Path(args.fixture).read_text())
+        doc_path = data.get("document", "")
+        pdf_path = str((Path(args.fixture).parent / doc_path).resolve())
+        gt = data.get("ground_truth", {})
+        text_file = gt.get("text_file", "")
+        if text_file:
+            gt_md = text_file.rsplit(".", 1)[0] + ".md"
+        else:
+            gt_md = f"../../../../test_documents/ground_truth/pdf/{Path(doc_path).stem}.md"
+        gt_md_path = str((Path(args.fixture).parent / gt_md).resolve())
+        process_fixture(args.fixture, pdf_path, gt_md_path, dry_run=args.dry_run)
+        return
+
+    # Process all fixtures needing GT
+    fixtures = find_fixtures_needing_gt()
+    print(f"Found {len(fixtures)} PDF fixtures needing markdown GT")
+
+    if args.limit > 0:
+        fixtures = fixtures[: args.limit]
+        print(f"Processing first {args.limit}")
+
+    success = 0
+    failed = 0
+    for fixture_path, pdf_path, gt_md_path in fixtures:
+        ok = process_fixture(fixture_path, pdf_path, gt_md_path, dry_run=args.dry_run)
+        if ok:
+            success += 1
+        else:
+            failed += 1
+        if not args.dry_run and args.delay > 0:
+            time.sleep(args.delay)
+
+    print(f"\nDone: {success} generated, {failed} failed")
+
+
+if __name__ == "__main__":
+    main()
--- a/tools/benchmark-harness/scripts/generate_vendored_baselines.py
+++ b/tools/benchmark-harness/scripts/generate_vendored_baselines.py
@@ -0,0 +1,172 @@
+# /// script
+# requires-python = ">=3.11"
+# dependencies = [
+#     "paddleocr>=3.4.0",
+#     "paddlepaddle>=3.3.0",
+#     "rapidocr-onnxruntime>=1.4.0",
+#     "pymupdf>=1.24.0",
+#     "pillow>=10.0.0",
+#     "numpy>=1.24.0",
+# ]
+# ///
+"""Generate vendored OCR baselines from PaddleOCR Python and RapidOCR.
+
+Usage:
+    uv run tools/benchmark-harness/scripts/generate_vendored_baselines.py
+    uv run tools/benchmark-harness/scripts/generate_vendored_baselines.py rapidocr
+    uv run tools/benchmark-harness/scripts/generate_vendored_baselines.py --force
+"""
+
+import json
+import os
+import sys
+import time
+from pathlib import Path
+
+import fitz
+import numpy as np
+
+FIXTURES_DIR = Path(__file__).resolve().parent.parent / "fixtures"
+VENDORED_DIR = Path(__file__).resolve().parent.parent / "vendored"
+
+OCR_FIXTURES = [
+    "pdf_image_only_german",
+    "pdf_non_searchable",
+    "pdf_ocr_rotated_270",
+    "pdf_ocr_rotated_90",
+    "pdf_ocr_rotated",
+    "pdf_ocr_test",
+    "pdf_scanned_ocr",
+]
+
+
+def pdf_to_images(pdf_path: str, dpi: int = 300) -> list[np.ndarray]:
+    """Convert PDF pages to numpy arrays (RGB, HWC)."""
+    import io
+
+    from PIL import Image
+
+    doc = fitz.open(pdf_path)
+    images = []
+    for page in doc:
+        mat = fitz.Matrix(dpi / 72, dpi / 72)
+        pix = page.get_pixmap(matrix=mat)
+        img = Image.open(io.BytesIO(pix.tobytes("png"))).convert("RGB")
+        images.append(np.array(img))
+    doc.close()
+    return images
+
+
+def lines_to_markdown(lines: list[str]) -> str:
+    """Each OCR text line becomes a markdown paragraph."""
+    paragraphs = [line.strip() for line in lines if line.strip()]
+    return "\n\n".join(paragraphs) + "\n" if paragraphs else ""
+
+
+def run_paddleocr_python(pdf_path: str) -> tuple[str, float]:
+    """Run PaddleOCR Python v3.4+ using the predict() API."""
+    os.environ["PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK"] = "True"
+    from paddleocr import PaddleOCR
+
+    ocr = PaddleOCR(use_textline_orientation=True, lang="en")
+    images = pdf_to_images(pdf_path)
+
+    start = time.monotonic()
+    all_lines: list[str] = []
+    for img in images:
+        # predict() returns list of OCRResult (dict-like) objects
+        for result in ocr.predict(img):
+            # OCRResult has 'rec_text' key with list of recognized texts
+            rec_texts = result.get("rec_text", [])
+            if isinstance(rec_texts, (list, tuple)):
+                for t in rec_texts:
+                    text = str(t).strip()
+                    if text:
+                        all_lines.append(text)
+    elapsed_ms = (time.monotonic() - start) * 1000
+
+    return lines_to_markdown(all_lines), elapsed_ms
+
+
+def run_rapidocr(pdf_path: str) -> tuple[str, float]:
+    """Run RapidOCR."""
+    from rapidocr_onnxruntime import RapidOCR
+
+    ocr = RapidOCR()
+    images = pdf_to_images(pdf_path)
+
+    start = time.monotonic()
+    all_lines: list[str] = []
+    for img in images:
+        result, _ = ocr(img)
+        if not result:
+            continue
+        for line in result:
+            if line and len(line) >= 2:
+                text = str(line[1]).strip()
+                if text:
+                    all_lines.append(text)
+    elapsed_ms = (time.monotonic() - start) * 1000
+
+    return lines_to_markdown(all_lines), elapsed_ms
+
+
+def save_vendored(pipeline_name: str, fixture_name: str, md: str, time_ms: float):
+    md_dir = VENDORED_DIR / pipeline_name / "md"
+    timing_dir = VENDORED_DIR / pipeline_name / "timing"
+    md_dir.mkdir(parents=True, exist_ok=True)
+    timing_dir.mkdir(parents=True, exist_ok=True)
+    (md_dir / f"{fixture_name}.md").write_text(md)
+    (timing_dir / f"{fixture_name}.ms").write_text(f"{time_ms:.1f}\n")
+
+
+def main():
+    pipelines = {
+        "paddleocr-python": run_paddleocr_python,
+        "rapidocr": run_rapidocr,
+    }
+
+    force = "--force" in sys.argv
+    args = [a for a in sys.argv[1:] if not a.startswith("--")]
+
+    if args:
+        selected = args[0]
+        if selected not in pipelines:
+            print(f"Unknown: {selected}. Choose: {list(pipelines.keys())}")
+            sys.exit(1)
+        pipelines = {selected: pipelines[selected]}
+
+    for fixture_name in OCR_FIXTURES:
+        fixture_path = FIXTURES_DIR / f"{fixture_name}.json"
+        if not fixture_path.exists():
+            print(f"  SKIP {fixture_name}: fixture not found")
+            continue
+
+        with open(fixture_path) as f:
+            fixture = json.load(f)
+
+        doc_path = str((FIXTURES_DIR / fixture["document"]).resolve())
+        if not os.path.exists(doc_path):
+            print(f"  SKIP {fixture_name}: document not found")
+            continue
+
+        for pipeline_name, run_fn in pipelines.items():
+            existing = VENDORED_DIR / pipeline_name / "md" / f"{fixture_name}.md"
+            if not force and existing.exists() and existing.stat().st_size > 0:
+                print(f"  CACHED {pipeline_name}/{fixture_name}")
+                continue
+
+            print(f"  RUN {pipeline_name}/{fixture_name} ...", end="", flush=True)
+            try:
+                md, time_ms = run_fn(doc_path)
+                save_vendored(pipeline_name, fixture_name, md, time_ms)
+                print(f" {time_ms:.0f}ms, {len(md)} chars")
+            except Exception as e:
+                print(f" ERROR: {e}")
+                import traceback
+
+                traceback.print_exc()
+
+
+if __name__ == "__main__":
+    main()
--- a/tools/benchmark-harness/scripts/go.mod
+++ b/tools/benchmark-harness/scripts/go.mod
@@ -0,0 +1,7 @@
+module github.com/kreuzberg-dev/kreuzberg/tools/benchmark-harness/scripts
+
+go 1.23
+
+require github.com/kreuzberg-dev/kreuzberg/packages/go/v4 v4.9.5
+
+replace github.com/kreuzberg-dev/kreuzberg/packages/go/v4 => ../../../packages/go/v4
--- a/tools/benchmark-harness/scripts/import_omnidocbench.py
+++ b/tools/benchmark-harness/scripts/import_omnidocbench.py
@@ -0,0 +1,407 @@
+"""Import OmniDocBench dataset into our benchmark fixture format.
+
+Converts OmniDocBench's element-level JSON annotations into:
+  - Per-document fixture JSON files (tools/benchmark-harness/fixtures/pdf/omnidoc_<pdf_name>.json)
+  - Ground truth markdown files (test_documents/ground_truth/pdf/omnidoc_<pdf_name>.md)
+  - Ground truth text files (test_documents/ground_truth/pdf/omnidoc_<pdf_name>.txt)
+
+OmniDocBench groups pages by document. Each multi-page document produces one fixture.
+Single-page documents produce one fixture per page.
+
+Usage:
+    python import_omnidocbench.py <omnidocbench_dir> <repo_root>
+
+Where:
+    omnidocbench_dir = tools/benchmark-harness/datasets/omnidocbench (contains OmniDocBench.json + ori_pdfs/)
+    repo_root = repository root (contains tools/ and test_documents/)
+"""
+
+from __future__ import annotations
+
+import html
+import json
+import os
+import re
+import sys
+from collections import defaultdict
+from pathlib import Path
+
+# OmniDocBench category types that map to content we want in ground truth.
+# NOTE: nothing in this script reads this set; annotation_to_markdown() keeps
+# every category that is not listed in SKIP_CATEGORIES, so this is a reference
+# list only.
+CONTENT_CATEGORIES = {
+    "title",
+    "text_block",
+    "table",
+    "equation_isolated",
+    "code_txt",
+    "figure_caption",
+    "table_caption",
+    "equation_caption",
+    "code_txt_caption",
+    "reference",
+}
+
+# Categories to skip (page furniture, figures without text, etc.)
+# annotation_to_markdown() returns None for any annotation whose
+# "category_type" is in this set.
+SKIP_CATEGORIES = {
+    "header",
+    "footer",
+    "page_number",
+    "page_footnote",
+    "abandon",
+    "figure",
+    "figure_footnote",
+    "table_footnote",
+}
+
+
+def html_table_to_markdown(html_str: str) -> str:
+    """Convert a simple HTML table to markdown table format."""
+    if not html_str:
+        return ""
+
+    # Unescape HTML entities
+    html_str = html.unescape(html_str)
+
+    rows: list[list[str]] = []
+    # Extract rows
+    for row_match in re.finditer(r"<tr[^>]*>(.*?)</tr>", html_str, re.DOTALL):
+        row_html = row_match.group(1)
+        cells: list[str] = []
+        for cell_match in re.finditer(r"<t[dh][^>]*>(.*?)</t[dh]>", row_html, re.DOTALL):
+            cell_text = re.sub(r"<[^>]+>", "", cell_match.group(1)).strip()
+            cells.append(cell_text)
+        if cells:
+            rows.append(cells)
+
+    if not rows:
+        return html_str  # fallback: return raw if parsing fails
+
+    # Normalize column count
+    max_cols = max(len(r) for r in rows)
+    for row in rows:
+        while len(row) < max_cols:
+            row.append("")
+
+    # Build markdown table
+    lines = []
+    # Header row
+    lines.append("| " + " | ".join(rows[0]) + " |")
+    lines.append("|" + "|".join(["---"] * max_cols) + "|")
+    # Data rows
+    for row in rows[1:]:
+        lines.append("| " + " | ".join(row) + " |")
+
+    return "\n".join(lines)
+
+
+def annotation_to_markdown(ann: dict) -> str | None:
+    """Convert a single OmniDocBench annotation to markdown text."""
+    cat = ann.get("category_type", "")
+
+    if cat in SKIP_CATEGORIES:
+        return None
+
+    if ann.get("ignore", False):
+        return None
+
+    text = ann.get("text", "").strip()
+
+    if cat == "title":
+        # OmniDocBench doesn't distinguish heading levels.
+        # Use H2 as default (most titles are section-level, not document-level).
+        if text:
+            return f"## {text}"
+        return None
+
+    if cat == "text_block":
+        return text or None
+
+    if cat == "table":
+        # Prefer HTML representation for tables
+        html_str = ann.get("html", "")
+        if html_str:
+            return html_table_to_markdown(html_str)
+        # Fallback to text
+        return text or None
+
+    if cat == "equation_isolated":
+        latex = ann.get("latex", "")
+        if latex:
+            return f"$$\n{latex}\n$$"
+        return text or None
+
+    if cat == "code_txt":
+        if text:
+            return f"```\n{text}\n```"
+        return None
+
+    if cat in ("figure_caption", "table_caption", "equation_caption", "code_txt_caption"):
+        return text or None
+
+    if cat == "reference":
+        return text or None
+
+    # Unknown category — include text if present
+    return text or None
+
+
+def page_to_markdown(page: dict) -> str:
+    """Convert a single OmniDocBench page to markdown."""
+    annotations = page.get("layout_dets", [])
+
+    # Sort by reading order
+    sorted_anns = sorted(annotations, key=lambda a: a.get("order", 999))
+
+    # Handle truncated blocks (merge them)
+    relations = page.get("extra", {}).get("relation", [])
+    merge_targets: dict[int, int] = {}  # target_id -> source_id
+    for rel in relations:
+        if rel.get("relation") == "truncated":
+            merge_targets[rel["target_anno_id"]] = rel["source_anno_id"]
+
+    # Build merged text for truncated blocks
+    merged_text: dict[int, list[str]] = defaultdict(list)
+    ann_by_id = {a.get("anno_id", i): a for i, a in enumerate(sorted_anns)}
+
+    for ann in sorted_anns:
+        anno_id = ann.get("anno_id", -1)
+        if anno_id in merge_targets:
+            source_id = merge_targets[anno_id]
+            text = ann.get("text", "").strip()
+            if text:
+                merged_text[source_id].append(text)
+
+    blocks: list[str] = []
+    skip_ids = set(merge_targets.keys())
+
+    for ann in sorted_anns:
+        anno_id = ann.get("anno_id", -1)
+        if anno_id in skip_ids:
+            continue
+
+        # Append merged text from truncated continuations
+        if anno_id in merged_text:
+            original_text = ann.get("text", "").strip()
+            continuation = " ".join(merged_text[anno_id])
+            ann = dict(ann)  # shallow copy
+            ann["text"] = f"{original_text} {continuation}".strip()
+
+        md = annotation_to_markdown(ann)
+        if md:
+            blocks.append(md)
+
+    return "\n\n".join(blocks)
+
+
+def strip_markdown_to_text(md: str) -> str:
+    """Strip markdown syntax to produce plain text."""
+    lines = []
+    in_code = False
+    in_formula = False
+
+    for line in md.split("\n"):
+        if line.startswith("```"):
+            in_code = not in_code
+            continue
+        if line.startswith("$$"):
+            in_formula = not in_formula
+            continue
+        if in_code or in_formula:
+            lines.append(line)
+            continue
+
+        # Strip heading markers
+        stripped = re.sub(r"^#{1,6}\s+", "", line)
+        # Strip table pipes (keep cell content)
+        if stripped.startswith("|") and stripped.endswith("|"):
+            # Skip separator rows
+            if re.match(r"^\|[-|: ]+\|$", stripped):
+                continue
+            stripped = re.sub(r"\s*\|\s*", " ", stripped).strip()
+        # Strip bold/italic
+        stripped = re.sub(r"\*{1,3}([^*]+)\*{1,3}", r"\1", stripped)
+
+        if stripped:
+            lines.append(stripped)
+
+    return "\n".join(lines)
+
+
+def group_pages_by_pdf(pages: list[dict]) -> dict[str, list[dict]]:
+    """Group OmniDocBench pages by their source PDF."""
+    groups: dict[str, list[dict]] = defaultdict(list)
+
+    for page in pages:
+        page_info = page.get("page_info", {})
+        image_path = page_info.get("image_path", "")
+
+        # Try to extract PDF name from image path
+        # Image paths look like: "academic_literature/scihub_12345_p0.jpg"
+        # or "PPT2PDF/PPT_sample.png"
+        basename = os.path.splitext(os.path.basename(image_path))[0]
+
+        # Strip page suffix like _p0, _p1, etc.
+        pdf_name = re.sub(r"_p\d+$", "", basename)
+
+        groups[pdf_name].append(page)
+
+    # Sort pages within each group by page number
+    for pdf_name in groups:
+        groups[pdf_name].sort(key=lambda p: p.get("page_info", {}).get("page_no", 0))
+
+    return groups
+
+
+def find_pdf_for_document(pdf_name: str, pages: list[dict], ori_pdfs_dir: Path) -> Path | None:
+    """Find the original PDF file for a document group."""
+    if not ori_pdfs_dir.exists():
+        return None
+
+    # Try direct name match
+    for ext in (".pdf", ".PDF"):
+        candidate = ori_pdfs_dir / f"{pdf_name}{ext}"
+        if candidate.exists():
+            return candidate
+
+    # Try searching in subdirectories
+    for pdf_file in ori_pdfs_dir.rglob("*.pdf"):
+        if pdf_file.stem == pdf_name:
+            return pdf_file
+
+    # Try matching from image path
+    if pages:
+        image_path = pages[0].get("page_info", {}).get("image_path", "")
+        parts = image_path.split("/")
+        if len(parts) >= 2:
+            subdir = parts[0]
+            subdir_path = ori_pdfs_dir / subdir
+            if subdir_path.exists():
+                for pdf_file in subdir_path.glob("*.pdf"):
+                    if pdf_name.startswith(pdf_file.stem) or pdf_file.stem.startswith(pdf_name):
+                        return pdf_file
+
+    return None
+
+
+def main() -> None:
+    if len(sys.argv) < 3:
+        print(
+            "Usage: import_omnidocbench.py <omnidocbench_dir> <repo_root>",
+            file=sys.stderr,
+        )
+        sys.exit(1)
+
+    omnidoc_dir = Path(sys.argv[1]).resolve()
+    repo_root = Path(sys.argv[2]).resolve()
+
+    json_path = omnidoc_dir / "OmniDocBench.json"
+    ori_pdfs_dir = omnidoc_dir / "ori_pdfs"
+
+    if not json_path.exists():
+        print(f"ERROR: {json_path} not found. Run download_omnidocbench.sh first.", file=sys.stderr)
+        sys.exit(1)
+
+    fixtures_dir = repo_root / "tools" / "benchmark-harness" / "fixtures" / "pdf"
+    gt_dir = repo_root / "test_documents" / "ground_truth" / "pdf"
+    fixtures_dir.mkdir(parents=True, exist_ok=True)
+    gt_dir.mkdir(parents=True, exist_ok=True)
+
+    print(f"Loading {json_path}...", file=sys.stderr)
+    with open(json_path) as f:
+        pages = json.load(f)
+    print(f"Loaded {len(pages)} pages", file=sys.stderr)
+
+    # Group pages by document
+    doc_groups = group_pages_by_pdf(pages)
+    print(f"Found {len(doc_groups)} documents", file=sys.stderr)
+
+    created = 0
+    skipped_no_pdf = 0
+    skipped_exists = 0
+    skipped_empty = 0
+
+    for pdf_name, doc_pages in sorted(doc_groups.items()):
+        # Generate fixture name
+        fixture_name = f"omnidoc_{pdf_name}"
+        # Sanitize: replace non-alphanumeric chars
+        fixture_name = re.sub(r"[^a-zA-Z0-9_-]", "_", fixture_name)
+
+        fixture_path = fixtures_dir / f"{fixture_name}.json"
+        gt_md_path = gt_dir / f"{fixture_name}.md"
+        gt_txt_path = gt_dir / f"{fixture_name}.txt"
+
+        # Skip if already imported
+        if fixture_path.exists():
+            skipped_exists += 1
+            continue
+
+        # Find the PDF
+        pdf_path = find_pdf_for_document(pdf_name, doc_pages, ori_pdfs_dir)
+        if pdf_path is None:
+            skipped_no_pdf += 1
+            continue
+
+        # Generate markdown from all pages
+        page_markdowns = []
+        for page in doc_pages:
+            md = page_to_markdown(page)
+            if md.strip():
+                page_markdowns.append(md)
+
+        if not page_markdowns:
+            skipped_empty += 1
+            continue
+
+        full_markdown = "\n\n".join(page_markdowns)
+        full_text = strip_markdown_to_text(full_markdown)
+
+        # Write ground truth files
+        gt_md_path.write_text(full_markdown)
+        gt_txt_path.write_text(full_text)
+
+        # Compute relative paths from fixture to document and ground truth
+        doc_rel = os.path.relpath(pdf_path, fixtures_dir)
+        gt_md_rel = os.path.relpath(gt_md_path, fixtures_dir)
+        gt_txt_rel = os.path.relpath(gt_txt_path, fixtures_dir)
+
+        # Get page metadata for fixture
+        first_page = doc_pages[0].get("page_info", {})
+        page_attr = first_page.get("page_attribute", {})
+
+        fixture = {
+            "document": doc_rel,
+            "file_type": "pdf",
+            "file_size": pdf_path.stat().st_size,
+            "expected_frameworks": ["kreuzberg"],
+            "metadata": {
+                "description": f"OmniDocBench: {page_attr.get('data_source', 'unknown')}",
+                "source": "omnidocbench",
+                "size_category": "small" if pdf_path.stat().st_size < 500_000 else "medium",
+                "language": page_attr.get("language", "unknown"),
+                "layout": page_attr.get("layout", "unknown"),
+                "data_source": page_attr.get("data_source", "unknown"),
+                "page_count": len(doc_pages),
+            },
+            "ground_truth": {
+                "text_file": gt_txt_rel,
+                "markdown_file": gt_md_rel,
+                "source": "omnidocbench",
+            },
+        }
+
+        fixture_path.write_text(json.dumps(fixture, indent=2) + "\n")
+        created += 1
+
+        if created % 50 == 0:
+            print(f"  {created} fixtures created...", file=sys.stderr)
+
+    print("\nDone:", file=sys.stderr)
+    print(f"  Created: {created}", file=sys.stderr)
+    print(f"  Skipped (already exists): {skipped_exists}", file=sys.stderr)
+    print(f"  Skipped (no PDF found): {skipped_no_pdf}", file=sys.stderr)
+    print(f"  Skipped (empty content): {skipped_empty}", file=sys.stderr)
+    print(f"  Fixtures: {fixtures_dir}", file=sys.stderr)
+    print(f"  Ground truth: {gt_dir}", file=sys.stderr)
+
+
+if __name__ == "__main__":
+    main()
--- a/tools/benchmark-harness/scripts/markitdown_extract.py
+++ b/tools/benchmark-harness/scripts/markitdown_extract.py
@@ -0,0 +1,175 @@
+"""MarkItDown extraction wrapper for benchmark harness."""
+
+from __future__ import annotations
+
+import json
+import multiprocessing as _mp
+import os
+import platform
+import resource
+import sys
+import time
+
+from markitdown import MarkItDown
+
+
+def _get_peak_memory_bytes() -> int:
+    """Get peak memory usage in bytes using resource module."""
+    usage = resource.getrusage(resource.RUSAGE_SELF)
+    if platform.system() == "Linux":
+        return usage.ru_maxrss * 1024
+    return usage.ru_maxrss
+
+
+def extract_sync(file_path: str) -> dict:
+    """Extract using MarkItDown."""
+    start = time.perf_counter()
+    md = MarkItDown()
+    result = md.convert(file_path)
+    duration_ms = (time.perf_counter() - start) * 1000.0
+
+    return {
+        "content": result.text_content or "",
+        "metadata": {"framework": "markitdown"},
+        "_extraction_time_ms": duration_ms,
+        "_peak_memory_bytes": _get_peak_memory_bytes(),
+    }
+
+
+def _worker(fn, args, conn):
+    """Run extraction in a forked child process.
+
+    Closes the inherited stdin and rebinds ``sys.stdout`` to ``os.devnull``
+    so Python-level prints in the child cannot corrupt the parent's
+    line-based JSON protocol.
+
+    Args:
+        fn: Callable to execute.
+        args: Positional arguments passed to ``fn``.
+        conn: Connection used to send back either the result of ``fn`` or an
+            error dict; it is always closed before returning.
+    """
+    try:
+        sys.stdin.close()
+        # NOTE(review): only the Python-level sys.stdout object is replaced;
+        # file descriptor 1 is not touched here, so output written directly
+        # to the descriptor would presumably still reach the parent — confirm.
+        sys.stdout = open(os.devnull, "w")
+    except Exception:
+        # Best effort: failing to detach the streams must not block extraction.
+        pass
+    try:
+        result = fn(*args)
+        conn.send(result)
+    except Exception as e:
+        # Also reached when sending the result itself fails.
+        conn.send({"error": str(e), "_extraction_time_ms": 0})
+    finally:
+        conn.close()
+
+
+def _run_with_timeout(fn, args, timeout):
+    """Execute fn(*args) in a forked child with a timeout.
+
+    On timeout the child is killed but the parent stays alive —
+    no expensive process restart is needed.
+
+    Args:
+        fn: Callable to run in the child via ``_worker``.
+        args: Tuple of positional arguments for ``fn``.
+        timeout: Seconds to wait for the child's result.
+
+    Returns:
+        Whatever the child sent back (the result of ``fn`` or the error dict
+        built by ``_worker``), or an error dict with ``"error"`` and
+        ``"_extraction_time_ms"`` keys when the wait timed out or the result
+        could not be received.
+    """
+    try:
+        ctx = _mp.get_context("fork")
+        parent_conn, child_conn = ctx.Pipe(duplex=False)
+        p = ctx.Process(target=_worker, args=(fn, args, child_conn))
+        p.start()
+        # The parent never sends; drop its handle on the child's end.
+        child_conn.close()
+
+        if parent_conn.poll(timeout=timeout):
+            try:
+                result = parent_conn.recv()
+            except Exception:
+                result = {"error": "worker process crashed", "_extraction_time_ms": 0}
+        else:
+            p.kill()
+            # The timeout itself is reported as the extraction time.
+            result = {
+                "error": f"extraction timed out after {timeout}s",
+                "_extraction_time_ms": timeout * 1000.0,
+            }
+
+        # Reap the child; kill it if it has not exited after 5 seconds.
+        p.join(timeout=5)
+        if p.is_alive():
+            p.kill()
+            p.join()
+        parent_conn.close()
+        return result
+    except Exception:
+        # Fork not available — fall back to in-process extraction
+        # NOTE: this handler catches any Exception raised in the block above,
+        # not only a missing "fork" start method, and the call below runs
+        # without any timeout.
+        try:
+            return fn(*args)
+        except Exception as e:
+            return {"error": str(e), "_extraction_time_ms": 0}
+
+
+def _parse_path(line: str) -> str:
+    """Parse a request line: JSON object with path field, or plain file path."""
+    stripped = line.strip()
+    if stripped.startswith("{"):
+        try:
+            return json.loads(stripped).get("path", "")
+        except (json.JSONDecodeError, ValueError):
+            pass
+    return stripped
+
+
+def run_server(timeout=None) -> None:
+    """Persistent server mode: read paths from stdin, write JSON to stdout."""
+    print("READY", flush=True)
+    for line in sys.stdin:
+        file_path = _parse_path(line)
+        if not file_path:
+            continue
+        if timeout is not None:
+            result = _run_with_timeout(extract_sync, (file_path,), timeout)
+        else:
+            try:
+                result = extract_sync(file_path)
+            except Exception as e:
+                result = {"error": str(e), "_extraction_time_ms": 0}
+        print(json.dumps(result), flush=True)
+
+
+def main() -> None:
+    ocr_enabled = False
+    timeout = None
+    args = []
+    for arg in sys.argv[1:]:
+        if arg == "--ocr":
+            ocr_enabled = True
+        elif arg == "--no-ocr":
+            ocr_enabled = False
+        elif arg.startswith("--timeout="):
+            timeout = int(arg.split("=", 1)[1])
+        elif arg.startswith("--format="):
+            _fmt = arg.split("=", 1)[1]
+            if _fmt != "markdown":
+                print(f"{sys.argv[0]} only supports markdown output; got --format {_fmt}", file=sys.stderr)
+                sys.exit(64)
+        else:
+            args.append(arg)
+
+    if len(args) < 1:
+        print("Usage: markitdown_extract.py [--ocr|--no-ocr] [--timeout=SECS] <mode> <file_path>", file=sys.stderr)
+        print("Modes: sync, server", file=sys.stderr)
+        sys.exit(1)
+
+    mode = args[0]
+    if mode == "server":
+        run_server(timeout=timeout)
+    elif mode == "sync":
+        if len(args) < 2:
+            print("Error: sync mode requires a file path", file=sys.stderr)
+            sys.exit(1)
+        file_path = args[1]
+        try:
+            payload = extract_sync(file_path)
+            print(json.dumps(payload), end="")
+        except Exception as e:
+            print(f"Error extracting with MarkItDown: {e}", file=sys.stderr)
+            sys.exit(1)
+    else:
+        # Legacy fallback for direct file path
+        try:
+            payload = extract_sync(args[0])
+            print(json.dumps(payload), end="")
+        except Exception as e:
+            print(f"Error extracting with MarkItDown: {e}", file=sys.stderr)
+            sys.exit(1)
+
+
+if __name__ == "__main__":
+    main()
--- a/tools/benchmark-harness/scripts/mineru_extract.py
+++ b/tools/benchmark-harness/scripts/mineru_extract.py
@@ -0,0 +1,338 @@
+"""MinerU extraction wrapper for benchmark harness.
+
+Supports three modes:
+- sync: process single file
+- batch: process multiple files
+- server: persistent mode reading paths from stdin
+
+Attempts to use MinerU's Python API directly for better performance.
+Falls back to CLI subprocess if the Python API is not available.
+"""
+
+from __future__ import annotations
+
+import os
+
+# Default to CPU-only mode to avoid GPU discovery errors in CI.
+# setdefault() leaves any value already present in the environment untouched,
+# so a caller can still override these. The assignments sit above the
+# remaining imports — presumably so they are in place before MinerU is
+# imported; confirm.
+os.environ.setdefault("CUDA_VISIBLE_DEVICES", "")
+os.environ.setdefault("ONNXRUNTIME_PROVIDERS", "CPUExecutionProvider")
+os.environ.setdefault("MINERU_DEVICE_MODE", "cpu")
+
+import json
+import multiprocessing as _mp
+import platform
+import resource
+import subprocess
+import sys
+import tempfile
+import time
+from pathlib import Path
+from typing import Any
+
+# Try importing MinerU's Python API to avoid subprocess overhead.
+# The API surface has changed across versions; only the UNIPipe entry point is
+# probed here. _extract_via_api() additionally imports DiskReaderWriter, which
+# is not checked at this point — extract_sync() falls back to the CLI when the
+# API path raises at runtime.
+try:
+    from magic_pdf.pipe.UNIPipe import UNIPipe  # noqa: F401
+
+    HAS_PYTHON_API = True
+except ImportError:
+    HAS_PYTHON_API = False
+
+
+def _get_peak_memory_bytes() -> int:
+    """Get peak memory usage in bytes using resource module."""
+    usage = resource.getrusage(resource.RUSAGE_SELF)
+    if platform.system() == "Linux":
+        return usage.ru_maxrss * 1024
+    return usage.ru_maxrss
+
+
+def _extract_via_cli(file_path: str, ocr_enabled: bool) -> str:
+    """Extract using MinerU CLI (fallback)."""
+    cmd = ["mineru", "-p", file_path, "-b", "pipeline", "-d", "cpu"]
+    if not ocr_enabled:
+        cmd.extend(["--method", "txt"])
+
+    with tempfile.TemporaryDirectory() as tmpdir:
+        output_dir = Path(tmpdir) / "output"
+        cmd.extend(["-o", str(output_dir)])
+
+        result = subprocess.run(
+            cmd,
+            capture_output=True,
+            text=True,
+            check=False,
+        )
+
+        # Check for output files first — ONNX Runtime may emit warnings to
+        # stderr even when extraction succeeds.
+        md_files = list(output_dir.rglob("*.md"))
+        if md_files:
+            return md_files[0].read_text(encoding="utf-8")
+
+        if result.returncode != 0:
+            raise RuntimeError(f"MinerU extraction failed: {result.stderr}")
+
+        raise RuntimeError("No markdown output found from MinerU")
+
+
+def _extract_via_api(file_path: str, ocr_enabled: bool) -> str:
+    """Extract using MinerU Python API (preferred, avoids subprocess overhead)."""
+    # NOTE: The MinerU Python API is not yet stable. This is a best-effort attempt
+    # using the UNIPipe interface. If this fails at runtime, the caller should
+    # fall back to CLI extraction.
+    from magic_pdf.pipe.UNIPipe import UNIPipe
+    from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
+
+    pdf_bytes = Path(file_path).read_bytes()
+
+    with tempfile.TemporaryDirectory() as tmpdir:
+        writer = DiskReaderWriter(tmpdir)
+        method = "ocr" if ocr_enabled else "txt"
+        pipe = UNIPipe(pdf_bytes, {"_pdf_type": "", "model_list": []}, writer, method=method)
+        pipe.pipe_classify()
+        pipe.pipe_analyze()
+        pipe.pipe_parse()
+        md_content = pipe.pipe_mk_markdown(str(Path(file_path).stem), tmpdir)
+        return md_content
+
+
+_MD_STRIP_RE = None
+
+
+def _strip_markdown(text: str) -> str:
+    """Best-effort markdown→plaintext pass. Drops syntax tokens; preserves text."""
+    import re
+
+    global _MD_STRIP_RE
+    if _MD_STRIP_RE is None:
+        _MD_STRIP_RE = [
+            (re.compile(r"^#{1,6}\s+", re.MULTILINE), ""),  # ATX headings
+            (re.compile(r"^\s*[-*+]\s+", re.MULTILINE), ""),  # bullet markers
+            (re.compile(r"^\s*\d+\.\s+", re.MULTILINE), ""),  # ordered list markers
+            (re.compile(r"^>\s?", re.MULTILINE), ""),  # blockquotes
+            (re.compile(r"```[a-zA-Z0-9_-]*\n?"), ""),  # code fences
+            (re.compile(r"`([^`]+)`"), r"\1"),  # inline code
+            (re.compile(r"\*\*([^*]+)\*\*"), r"\1"),  # bold
+            (re.compile(r"\*([^*]+)\*"), r"\1"),  # italic
+            (re.compile(r"!\[([^\]]*)\]\([^)]*\)"), r"\1"),  # images
+            (re.compile(r"\[([^\]]+)\]\([^)]*\)"), r"\1"),  # links
+            (re.compile(r"^\s*\|.*\|\s*$", re.MULTILINE), ""),  # table rows (drop)
+        ]
+    out = text
+    for pattern, repl in _MD_STRIP_RE:
+        out = pattern.sub(repl, out)
+    return out
+
+
+def extract_sync(file_path: str, ocr_enabled: bool, output_format: str = "markdown") -> dict[str, Any]:
+    """Extract a single file using the best available method."""
+    start = time.perf_counter()
+
+    if HAS_PYTHON_API:
+        try:
+            markdown = _extract_via_api(file_path, ocr_enabled)
+        except Exception:
+            # Fall back to CLI if Python API fails at runtime
+            markdown = _extract_via_cli(file_path, ocr_enabled)
+    else:
+        markdown = _extract_via_cli(file_path, ocr_enabled)
+
+    content = _strip_markdown(markdown) if output_format == "plaintext" else markdown
+    duration_ms = (time.perf_counter() - start) * 1000.0
+
+    return {
+        "content": content,
+        "metadata": {"framework": "mineru", "output_format": output_format},
+        "_extraction_time_ms": duration_ms,
+        "_peak_memory_bytes": _get_peak_memory_bytes(),
+    }
+
+
+def extract_batch(file_paths: list[str], ocr_enabled: bool, output_format: str = "markdown") -> list[dict[str, Any]]:
+    """Extract multiple files in sequence."""
+    start = time.perf_counter()
+
+    results = []
+    for file_path in file_paths:
+        try:
+            payload = extract_sync(file_path, ocr_enabled, output_format)
+            # Remove per-file timing; we'll replace with batch timing below
+            payload.pop("_extraction_time_ms", None)
+            results.append(payload)
+        except Exception as e:
+            results.append(
+                {
+                    "content": "",
+                    "metadata": {
+                        "framework": "mineru",
+                        "error": str(e),
+                    },
+                }
+            )
+
+    total_duration_ms = (time.perf_counter() - start) * 1000.0
+    per_file_duration_ms = total_duration_ms / len(file_paths) if file_paths else 0
+    peak_memory = _get_peak_memory_bytes()
+
+    for result in results:
+        result["_extraction_time_ms"] = per_file_duration_ms
+        result["_batch_total_ms"] = total_duration_ms
+        result["_peak_memory_bytes"] = peak_memory
+
+    return results
+
+
+def _worker(fn, args, conn):
+    """Run extraction in a forked child process.
+
+    Closes the inherited stdin and rebinds ``sys.stdout`` to ``os.devnull``
+    so Python-level prints in the child cannot corrupt the parent's
+    line-based JSON protocol.
+
+    Args:
+        fn: Callable to execute.
+        args: Positional arguments passed to ``fn``.
+        conn: Connection used to send back either the result of ``fn`` or an
+            error dict; it is always closed before returning.
+    """
+    try:
+        sys.stdin.close()
+        # NOTE(review): only the Python-level sys.stdout object is replaced;
+        # file descriptor 1 is not touched here, so output written directly
+        # to the descriptor would presumably still reach the parent — confirm.
+        sys.stdout = open(os.devnull, "w")
+    except Exception:
+        # Best effort: failing to detach the streams must not block extraction.
+        pass
+    try:
+        result = fn(*args)
+        conn.send(result)
+    except Exception as e:
+        # Also reached when sending the result itself fails.
+        conn.send({"error": str(e), "_extraction_time_ms": 0})
+    finally:
+        conn.close()
+
+
+def _run_with_timeout(fn, args, timeout):
+    """Execute fn(*args) in a forked child with a timeout.
+
+    On timeout the child is killed but the parent stays alive —
+    no expensive process restart is needed.
+
+    Args:
+        fn: Callable to run in the child via ``_worker``.
+        args: Tuple of positional arguments for ``fn``.
+        timeout: Seconds to wait for the child's result.
+
+    Returns:
+        Whatever the child sent back (the result of ``fn`` or the error dict
+        built by ``_worker``), or an error dict with ``"error"`` and
+        ``"_extraction_time_ms"`` keys when the wait timed out or the result
+        could not be received.
+    """
+    try:
+        ctx = _mp.get_context("fork")
+        parent_conn, child_conn = ctx.Pipe(duplex=False)
+        p = ctx.Process(target=_worker, args=(fn, args, child_conn))
+        p.start()
+        # The parent never sends; drop its handle on the child's end.
+        child_conn.close()
+
+        if parent_conn.poll(timeout=timeout):
+            try:
+                result = parent_conn.recv()
+            except Exception:
+                result = {"error": "worker process crashed", "_extraction_time_ms": 0}
+        else:
+            p.kill()
+            # The timeout itself is reported as the extraction time.
+            result = {
+                "error": f"extraction timed out after {timeout}s",
+                "_extraction_time_ms": timeout * 1000.0,
+            }
+
+        # Reap the child; kill it if it has not exited after 5 seconds.
+        p.join(timeout=5)
+        if p.is_alive():
+            p.kill()
+            p.join()
+        parent_conn.close()
+        return result
+    except Exception:
+        # Fork not available — fall back to in-process extraction
+        # NOTE: this handler catches any Exception raised in the block above,
+        # not only a missing "fork" start method, and the call below runs
+        # without any timeout.
+        try:
+            return fn(*args)
+        except Exception as e:
+            return {"error": str(e), "_extraction_time_ms": 0}
+
+
+def _parse_path(line: str) -> str:
+    """Parse a request line: JSON object with path field, or plain file path."""
+    stripped = line.strip()
+    if stripped.startswith("{"):
+        try:
+            return json.loads(stripped).get("path", "")
+        except (json.JSONDecodeError, ValueError):
+            pass
+    return stripped
+
+
+def run_server(ocr_enabled: bool, output_format: str, timeout=None) -> None:
+    """Persistent server mode: read paths from stdin, write JSON to stdout."""
+    print("READY", flush=True)
+    for line in sys.stdin:
+        file_path = _parse_path(line)
+        if not file_path:
+            continue
+        if timeout is not None:
+            result = _run_with_timeout(extract_sync, (file_path, ocr_enabled, output_format), timeout)
+        else:
+            try:
+                result = extract_sync(file_path, ocr_enabled, output_format)
+            except Exception as e:
+                result = {"error": str(e), "_extraction_time_ms": 0}
+        print(json.dumps(result), flush=True)
+
+
+def main() -> None:
+    ocr_enabled = False
+    timeout = None
+    output_format = "markdown"
+    args = []
+    for arg in sys.argv[1:]:
+        if arg == "--ocr":
+            ocr_enabled = True
+        elif arg == "--no-ocr":
+            ocr_enabled = False
+        elif arg.startswith("--timeout="):
+            timeout = int(arg.split("=", 1)[1])
+        elif arg.startswith("--format="):
+            output_format = arg.split("=", 1)[1]
+        else:
+            args.append(arg)
+
+    if output_format not in ("markdown", "plaintext"):
+        print(f"Error: --format must be 'markdown' or 'plaintext'; got '{output_format}'", file=sys.stderr)
+        sys.exit(64)
+
+    if len(args) < 1:
+        print(
+            "Usage: mineru_extract.py [--ocr|--no-ocr] [--timeout=SECS] [--format=markdown|plaintext] <mode> <file_path> [additional_files...]",
+            file=sys.stderr,
+        )
+        print("Modes: sync, batch, server", file=sys.stderr)
+        sys.exit(1)
+
+    mode = args[0]
+    file_paths = args[1:]
+
+    try:
+        if mode == "server":
+            run_server(ocr_enabled, output_format, timeout=timeout)
+
+        elif mode == "sync":
+            if len(file_paths) != 1:
+                print("Error: sync mode requires exactly one file", file=sys.stderr)
+                sys.exit(1)
+            payload = extract_sync(file_paths[0], ocr_enabled, output_format)
+            print(json.dumps(payload), end="")
+
+        elif mode == "batch":
+            if len(file_paths) < 1:
+                print("Error: batch mode requires at least one file", file=sys.stderr)
+                sys.exit(1)
+
+            if len(file_paths) == 1:
+                results = extract_batch(file_paths, ocr_enabled, output_format)
+                print(json.dumps(results[0]), end="")
+            else:
+                results = extract_batch(file_paths, ocr_enabled, output_format)
+                print(json.dumps(results), end="")
+
+        else:
+            print(f"Error: Unknown mode '{mode}'. Use sync, batch, or server", file=sys.stderr)
+            sys.exit(1)
+
+    except Exception as e:
+        print(f"Error extracting with MinerU: {e}", file=sys.stderr)
+        sys.exit(1)
+
+
+if __name__ == "__main__":
+    main()
--- a/tools/benchmark-harness/scripts/pandoc_extract.sh
+++ b/tools/benchmark-harness/scripts/pandoc_extract.sh
@@ -0,0 +1,67 @@
+#!/usr/bin/env bash
+
+set -euo pipefail
+
+FORMAT="markdown"
+FILE_PATH=""
+for arg in "$@"; do
+  case "$arg" in
+  --format=*)
+    FORMAT="${arg#--format=}"
+    ;;
+  *)
+    FILE_PATH="$arg"
+    ;;
+  esac
+done
+
+if [ -z "$FILE_PATH" ]; then
+  echo "Usage: pandoc_extract.sh [--format=markdown|plaintext] <file_path>" >&2
+  exit 1
+fi
+
+if [ "$FORMAT" != "markdown" ] && [ "$FORMAT" != "plaintext" ]; then
+  echo "Error: --format must be 'markdown' or 'plaintext'; got '$FORMAT'" >&2
+  exit 64
+fi
+
+if [ ! -f "$FILE_PATH" ]; then
+  echo "Error: File not found: $FILE_PATH" >&2
+  exit 1
+fi
+
+if [ "$FORMAT" = "markdown" ]; then
+  PANDOC_TO="gfm"
+else
+  PANDOC_TO="plain"
+fi
+
+START=$(date +%s%N)
+
+if command -v timeout &>/dev/null; then
+  CONTENT=$(timeout 60s pandoc "$FILE_PATH" "--to=$PANDOC_TO" --wrap=none --strip-comments 2>/dev/null || echo "")
+elif command -v gtimeout &>/dev/null; then
+  CONTENT=$(gtimeout 60s pandoc "$FILE_PATH" "--to=$PANDOC_TO" --wrap=none --strip-comments 2>/dev/null || echo "")
+else
+  CONTENT=$(pandoc "$FILE_PATH" "--to=$PANDOC_TO" --wrap=none --strip-comments 2>/dev/null || echo "")
+fi
+
+END=$(date +%s%N)
+DURATION_MS=$(((END - START) / 1000000))
+
+if command -v jq &>/dev/null; then
+  jq -n \
+    --arg content "$CONTENT" \
+    --arg fmt "$FORMAT" \
+    --argjson duration "$DURATION_MS" \
+    '{
+            content: $content,
+            metadata: {framework: "pandoc", output_format: $fmt},
+            _extraction_time_ms: $duration
+        }'
+else
+  ESCAPED_CONTENT=$(echo "$CONTENT" | sed 's/\\/\\\\/g' | sed 's/"/\\"/g' | awk '{printf "%s\\n", $0}' | sed '$ s/\\n$//')
+  cat <<EOF
+{"content":"$ESCAPED_CONTENT","metadata":{"framework":"pandoc","output_format":"$FORMAT"},"_extraction_time_ms":$DURATION_MS}
+EOF
+fi
--- a/tools/benchmark-harness/scripts/pdfminer_extract.py
+++ b/tools/benchmark-harness/scripts/pdfminer_extract.py
@@ -0,0 +1,231 @@
+"""pdfminer extraction wrapper for benchmark harness.
+
+Supports three modes:
+- sync: extract text from a single file
+- batch: process multiple files (simulated batch using loop)
+- server: persistent mode reading paths from stdin
+"""
+
+from __future__ import annotations
+
+import json
+import multiprocessing as _mp
+import os
+import platform
+import resource
+import sys
+import time
+from typing import Any
+
+from pdfminer.high_level import extract_text
+
+
+def _get_peak_memory_bytes() -> int:
+    """Return peak resident set size of this process, scaled by 1024 on Linux."""
+    peak_rss = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
+    # Linux reports ru_maxrss in kilobytes; other platforms are returned as-is.
+    scale = 1024 if platform.system() == "Linux" else 1
+    return peak_rss * scale
+
+
+def extract_sync(file_path: str) -> dict[str, Any]:
+    """Run pdfminer's extract_text on one file and return a timed result dict."""
+    t0 = time.perf_counter()
+    text = extract_text(file_path)
+    elapsed_ms = (time.perf_counter() - t0) * 1000.0
+
+    # Peak memory is sampled after the timed region, so it is not part of the timing.
+    result: dict[str, Any] = {
+        "content": text,
+        "metadata": {"framework": "pdfminer"},
+        "_extraction_time_ms": elapsed_ms,
+        "_peak_memory_bytes": _get_peak_memory_bytes(),
+    }
+    return result
+
+
+def extract_batch(file_paths: list[str]) -> list[dict[str, Any]]:
+    """Extract each file in turn (pdfminer has no native batch API).
+
+    A file that raises yields empty content with the error text in its metadata.
+    Every entry carries the batch-average time, the batch total and peak memory.
+    """
+    batch_start = time.perf_counter()
+
+    results: list[dict[str, Any]] = []
+    for path in file_paths:
+        # Start from the failure shape; content is filled in on success.
+        entry: dict[str, Any] = {"content": "", "metadata": {"framework": "pdfminer"}}
+        try:
+            entry["content"] = extract_text(path)
+        except Exception as exc:
+            entry["metadata"]["error"] = str(exc)
+        results.append(entry)
+
+    total_duration_ms = (time.perf_counter() - batch_start) * 1000.0
+    # Files are not timed individually; each one reports the batch average.
+    if file_paths:
+        per_file_duration_ms = total_duration_ms / len(file_paths)
+    else:
+        per_file_duration_ms = 0
+
+    # Measured once, after the whole batch, and shared by every entry.
+    shared_fields = {
+        "_extraction_time_ms": per_file_duration_ms,
+        "_batch_total_ms": total_duration_ms,
+        "_peak_memory_bytes": _get_peak_memory_bytes(),
+    }
+    for entry in results:
+        entry.update(shared_fields)
+
+    return results
+
+
+def _worker(fn, args, conn):
+    """Child-process entry point: run fn(*args) and send the outcome over conn.
+
+    Python-level stdin is closed and sys.stdout is pointed at os.devnull first,
+    so prints in the child do not reach the parent's line-based JSON stdout.
+    """
+    try:
+        sys.stdin.close()
+        sys.stdout = open(os.devnull, "w")
+    except Exception:
+        pass  # best effort: extraction still runs if the redirect fails
+    try:
+        conn.send(fn(*args))
+    except Exception as exc:
+        failure = {"error": str(exc), "_extraction_time_ms": 0}
+        conn.send(failure)
+    finally:
+        conn.close()
+
+
+def _run_with_timeout(fn, args, timeout):
+    """Run fn(*args) in a forked child and wait at most `timeout` seconds for it.
+
+    Returns the child's result, or an error dict if it timed out (the child is
+    killed) or nothing could be received from it. The parent keeps running.
+    """
+    try:
+        ctx = _mp.get_context("fork")
+        parent_conn, child_conn = ctx.Pipe(duplex=False)  # parent receives, child sends
+        p = ctx.Process(target=_worker, args=(fn, args, child_conn))
+        p.start()
+        child_conn.close()  # the parent only needs the receiving end
+
+        if parent_conn.poll(timeout=timeout):
+            try:
+                result = parent_conn.recv()
+            except Exception:  # presumably EOF: the child exited before sending
+                result = {"error": "worker process crashed", "_extraction_time_ms": 0}
+        else:
+            p.kill()
+            result = {
+                "error": f"extraction timed out after {timeout}s",
+                "_extraction_time_ms": timeout * 1000.0,  # the full timeout, in ms
+            }
+
+        p.join(timeout=5)
+        if p.is_alive():  # still running after the 5s grace period
+            p.kill()
+            p.join()
+        parent_conn.close()
+        return result
+    except Exception:
+        # Any failure above (e.g. no fork start method): run in-process, without a timeout
+        try:
+            return fn(*args)
+        except Exception as e:
+            return {"error": str(e), "_extraction_time_ms": 0}
+
+
+def _parse_path(line: str) -> str:
+    """Parse a request line: JSON object with a string "path" field, or plain file path."""
+    stripped = line.strip()
+    try:
+        path = json.loads(stripped).get("path", "") if stripped.startswith("{") else stripped
+    except (json.JSONDecodeError, ValueError):
+        path = stripped  # not valid JSON after all: treat the whole line as a path
+    # A null/numeric "path" must not reach the extractor; report "no path" instead.
+    return path if isinstance(path, str) else ""
+
+
+def run_server(timeout=None) -> None:
+    """Serve requests line by line: print READY, then one JSON result per stdin path."""
+    print("READY", flush=True)
+    for request_line in sys.stdin:
+        path = _parse_path(request_line)
+        if not path:
+            continue
+        if timeout is None:
+            try:
+                response = extract_sync(path)
+            except Exception as exc:
+                response = {"error": str(exc), "_extraction_time_ms": 0}
+        else:
+            response = _run_with_timeout(extract_sync, (path,), timeout)
+        print(json.dumps(response), flush=True)
+
+
+def main() -> None:
+    """Command-line entry point: parse flags, then run sync, batch or server mode.
+
+    --timeout is only forwarded to server mode. Exits 64 for an unsupported
+    --format and 1 for usage errors or a failed extraction.
+    """
+    timeout = None
+    positional = []
+    for token in sys.argv[1:]:
+        if token == "--ocr" or token == "--no-ocr":
+            continue  # Accepted but ignored - pdfminer doesn't have OCR config
+        if token.startswith("--timeout="):
+            timeout = int(token.split("=", 1)[1])
+            continue
+        if not token.startswith("--format="):
+            positional.append(token)
+            continue
+        requested = token.split("=", 1)[1]
+        if requested != "plaintext":
+            print(f"{sys.argv[0]} only supports plaintext output; got --format {requested}", file=sys.stderr)
+            sys.exit(64)
+
+    if not positional:
+        print(
+            "Usage: pdfminer_extract.py [--ocr|--no-ocr] [--timeout=SECS] <mode> <file_path> [additional_files...]",
+            file=sys.stderr,
+        )
+        print("Modes: sync, batch, server", file=sys.stderr)
+        sys.exit(1)
+
+    mode, file_paths = positional[0], positional[1:]
+
+    try:
+        if mode == "server":
+            run_server(timeout=timeout)
+
+        elif mode == "sync":
+            if len(file_paths) != 1:
+                print("Error: sync mode requires exactly one file", file=sys.stderr)
+                sys.exit(1)
+            print(json.dumps(extract_sync(file_paths[0])), end="")
+
+        elif mode == "batch":
+            if not file_paths:
+                print("Error: batch mode requires at least one file", file=sys.stderr)
+                sys.exit(1)
+            results = extract_batch(file_paths)
+            # A single-file batch prints the bare object, not a one-element list.
+            output = results[0] if len(file_paths) == 1 else results
+            print(json.dumps(output), end="")
+
+        else:
+            print(f"Error: Unknown mode '{mode}'. Use sync, batch, or server", file=sys.stderr)
+            sys.exit(1)
+    except Exception as exc:
+        print(f"Error extracting with pdfminer: {exc}", file=sys.stderr)
+        sys.exit(1)
+
+
+if __name__ == "__main__":
+    main()
--- a/tools/benchmark-harness/scripts/pdfplumber_extract.py
+++ b/tools/benchmark-harness/scripts/pdfplumber_extract.py
@@ -0,0 +1,245 @@
+"""pdfplumber extraction wrapper for benchmark harness.
+
+Supports three modes:
+- sync: extract text page-by-page (sequential)
+- batch: process multiple files (simulated batch using loop)
+- server: persistent mode reading paths from stdin
+"""
+
+from __future__ import annotations
+
+import json
+import multiprocessing as _mp
+import os
+import platform
+import resource
+import sys
+import time
+from typing import Any
+
+import pdfplumber
+
+
+def _get_peak_memory_bytes() -> int:
+    """Return peak resident set size of this process, scaled by 1024 on Linux."""
+    peak_rss = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
+    # Linux reports ru_maxrss in kilobytes; other platforms are returned as-is.
+    scale = 1024 if platform.system() == "Linux" else 1
+    return peak_rss * scale
+
+
+def extract_sync(file_path: str) -> dict[str, Any]:
+    """Extract one PDF with pdfplumber, page by page, and return a timed result dict.
+
+    Pages whose extract_text() result is empty or None are skipped; the
+    remaining page texts are joined with blank lines.
+    """
+    t0 = time.perf_counter()
+
+    with pdfplumber.open(file_path) as pdf:
+        page_texts = (page.extract_text(layout=False) for page in pdf.pages)
+        # Consumed inside the with-block, while the document is still open.
+        content = "\n\n".join(text for text in page_texts if text)
+
+    elapsed_ms = (time.perf_counter() - t0) * 1000.0
+
+    return {
+        "content": content,
+        "metadata": {"framework": "pdfplumber"},
+        "_extraction_time_ms": elapsed_ms,
+        "_peak_memory_bytes": _get_peak_memory_bytes(),
+    }
+
+
+def extract_batch(file_paths: list[str]) -> list[dict[str, Any]]:
+    """Extract each file in turn (pdfplumber has no native batch API).
+
+    Returns exactly one entry per input path, in order. A file that raises at
+    any point (open, page extraction or close) yields empty content with the
+    error text in its metadata. Every entry carries the batch-average time,
+    the batch total and the peak memory reading.
+    """
+    batch_start = time.perf_counter()
+
+    results: list[dict[str, Any]] = []
+    for path in file_paths:
+        try:
+            with pdfplumber.open(path) as pdf:
+                page_texts = (page.extract_text(layout=False) for page in pdf.pages)
+                content = "\n\n".join(text for text in page_texts if text)
+            entry = {"content": content, "metadata": {"framework": "pdfplumber"}}
+        except Exception as exc:
+            entry = {
+                "content": "",
+                "metadata": {"framework": "pdfplumber", "error": str(exc)},
+            }
+        # Appended once, after the with-block has exited: an error raised while
+        # closing the PDF can no longer add a second entry for the same file.
+        results.append(entry)
+
+    total_duration_ms = (time.perf_counter() - batch_start) * 1000.0
+    # Files are not timed individually; each one reports the batch average.
+    if file_paths:
+        per_file_duration_ms = total_duration_ms / len(file_paths)
+    else:
+        per_file_duration_ms = 0
+
+    shared_fields = {
+        "_extraction_time_ms": per_file_duration_ms,
+        "_batch_total_ms": total_duration_ms,
+        "_peak_memory_bytes": _get_peak_memory_bytes(),
+    }
+    for entry in results:
+        entry.update(shared_fields)
+
+    return results
+
+
+def _worker(fn, args, conn):
+    """Child-process entry point: run fn(*args) and send the outcome over conn.
+
+    Python-level stdin is closed and sys.stdout is pointed at os.devnull first,
+    so prints in the child do not reach the parent's line-based JSON stdout.
+    """
+    try:
+        sys.stdin.close()
+        sys.stdout = open(os.devnull, "w")
+    except Exception:
+        pass  # best effort: extraction still runs if the redirect fails
+    try:
+        conn.send(fn(*args))
+    except Exception as exc:
+        failure = {"error": str(exc), "_extraction_time_ms": 0}
+        conn.send(failure)
+    finally:
+        conn.close()
+
+
+def _run_with_timeout(fn, args, timeout):
+    """Run fn(*args) in a forked child and wait at most `timeout` seconds for it.
+
+    Returns the child's result, or an error dict if it timed out (the child is
+    killed) or nothing could be received from it. The parent keeps running.
+    """
+    try:
+        ctx = _mp.get_context("fork")
+        parent_conn, child_conn = ctx.Pipe(duplex=False)  # parent receives, child sends
+        p = ctx.Process(target=_worker, args=(fn, args, child_conn))
+        p.start()
+        child_conn.close()  # the parent only needs the receiving end
+
+        if parent_conn.poll(timeout=timeout):
+            try:
+                result = parent_conn.recv()
+            except Exception:  # presumably EOF: the child exited before sending
+                result = {"error": "worker process crashed", "_extraction_time_ms": 0}
+        else:
+            p.kill()
+            result = {
+                "error": f"extraction timed out after {timeout}s",
+                "_extraction_time_ms": timeout * 1000.0,  # the full timeout, in ms
+            }
+
+        p.join(timeout=5)
+        if p.is_alive():  # still running after the 5s grace period
+            p.kill()
+            p.join()
+        parent_conn.close()
+        return result
+    except Exception:
+        # Any failure above (e.g. no fork start method): run in-process, without a timeout
+        try:
+            return fn(*args)
+        except Exception as e:
+            return {"error": str(e), "_extraction_time_ms": 0}
+
+
+def _parse_path(line: str) -> str:
+    """Parse a request line: JSON object with a string "path" field, or plain file path."""
+    stripped = line.strip()
+    try:
+        path = json.loads(stripped).get("path", "") if stripped.startswith("{") else stripped
+    except (json.JSONDecodeError, ValueError):
+        path = stripped  # not valid JSON after all: treat the whole line as a path
+    # A null/numeric "path" must not reach the extractor; report "no path" instead.
+    return path if isinstance(path, str) else ""
+
+
+def run_server(timeout=None) -> None:
+    """Serve requests line by line: print READY, then one JSON result per stdin path."""
+    print("READY", flush=True)
+    for request_line in sys.stdin:
+        path = _parse_path(request_line)
+        if not path:
+            continue
+        if timeout is None:
+            try:
+                response = extract_sync(path)
+            except Exception as exc:
+                response = {"error": str(exc), "_extraction_time_ms": 0}
+        else:
+            response = _run_with_timeout(extract_sync, (path,), timeout)
+        print(json.dumps(response), flush=True)
+
+
+def main() -> None:
+    """Command-line entry point: parse flags, then run sync, batch or server mode.
+
+    --timeout is only forwarded to server mode. Exits 64 for an unsupported
+    --format and 1 for usage errors or a failed extraction.
+    """
+    timeout = None
+    positional = []
+    for token in sys.argv[1:]:
+        if token == "--ocr" or token == "--no-ocr":
+            continue  # Accepted but ignored - pdfplumber doesn't have OCR config
+        if token.startswith("--timeout="):
+            timeout = int(token.split("=", 1)[1])
+            continue
+        if not token.startswith("--format="):
+            positional.append(token)
+            continue
+        requested = token.split("=", 1)[1]
+        if requested != "plaintext":
+            print(f"{sys.argv[0]} only supports plaintext output; got --format {requested}", file=sys.stderr)
+            sys.exit(64)
+
+    if not positional:
+        print(
+            "Usage: pdfplumber_extract.py [--ocr|--no-ocr] [--timeout=SECS] <mode> <file_path> [additional_files...]",
+            file=sys.stderr,
+        )
+        print("Modes: sync, batch, server", file=sys.stderr)
+        sys.exit(1)
+
+    mode, file_paths = positional[0], positional[1:]
+
+    try:
+        if mode == "server":
+            run_server(timeout=timeout)
+
+        elif mode == "sync":
+            if len(file_paths) != 1:
+                print("Error: sync mode requires exactly one file", file=sys.stderr)
+                sys.exit(1)
+            print(json.dumps(extract_sync(file_paths[0])), end="")
+
+        elif mode == "batch":
+            if not file_paths:
+                print("Error: batch mode requires at least one file", file=sys.stderr)
+                sys.exit(1)
+            results = extract_batch(file_paths)
+            # A single-file batch prints the bare object, not a one-element list.
+            output = results[0] if len(file_paths) == 1 else results
+            print(json.dumps(output), end="")
+
+        else:
+            print(f"Error: Unknown mode '{mode}'. Use sync, batch, or server", file=sys.stderr)
+            sys.exit(1)
+    except Exception as exc:
+        print(f"Error extracting with pdfplumber: {exc}", file=sys.stderr)
+        sys.exit(1)
+
+
+if __name__ == "__main__":
+    main()
--- a/tools/benchmark-harness/scripts/pdftotext_extract.py
+++ b/tools/benchmark-harness/scripts/pdftotext_extract.py
@@ -0,0 +1,237 @@
+"""pdftotext extraction wrapper for benchmark harness.
+
+Supports three modes:
+- sync: extract text from entire PDF (sequential)
+- batch: process multiple files (simulated batch using loop)
+- server: persistent mode reading paths from stdin
+"""
+
+from __future__ import annotations
+
+import json
+import multiprocessing as _mp
+import os
+import platform
+import resource
+import sys
+import time
+from typing import Any
+
+import pdftotext
+
+
+def _get_peak_memory_bytes() -> int:
+    """Return peak resident set size of this process, scaled by 1024 on Linux."""
+    peak_rss = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
+    # Linux reports ru_maxrss in kilobytes; other platforms are returned as-is.
+    scale = 1024 if platform.system() == "Linux" else 1
+    return peak_rss * scale
+
+
+def extract_sync(file_path: str) -> dict[str, Any]:
+    """Extract one PDF with pdftotext and return a timed result dict."""
+    t0 = time.perf_counter()
+
+    with open(file_path, "rb") as handle:
+        pages = pdftotext.PDF(handle)
+    # Page texts are joined with blank lines after the file has been closed.
+    text = "\n\n".join(pages)
+
+    elapsed_ms = (time.perf_counter() - t0) * 1000.0
+
+    return {
+        "content": text,
+        "metadata": {"framework": "pdftotext"},
+        "_extraction_time_ms": elapsed_ms,
+        "_peak_memory_bytes": _get_peak_memory_bytes(),
+    }
+
+
+def extract_batch(file_paths: list[str]) -> list[dict[str, Any]]:
+    """Extract each file in turn (pdftotext has no native batch API).
+
+    A file that raises yields empty content with the error text in its metadata.
+    Every entry carries the batch-average time, the batch total and the peak
+    memory reading taken after the last file.
+    """
+    batch_start = time.perf_counter()
+
+    results: list[dict[str, Any]] = []
+    for path in file_paths:
+        # Start from the failure shape; content is filled in on success.
+        entry: dict[str, Any] = {"content": "", "metadata": {"framework": "pdftotext"}}
+        try:
+            with open(path, "rb") as handle:
+                pages = pdftotext.PDF(handle)
+            entry["content"] = "\n\n".join(pages)
+        except Exception as exc:
+            entry["metadata"]["error"] = str(exc)
+        results.append(entry)
+
+    total_duration_ms = (time.perf_counter() - batch_start) * 1000.0
+    # Files are not timed individually; each one reports the batch average.
+    if file_paths:
+        per_file_duration_ms = total_duration_ms / len(file_paths)
+    else:
+        per_file_duration_ms = 0
+
+    # Measured once, after the whole batch, and shared by every entry.
+    shared_fields = {
+        "_extraction_time_ms": per_file_duration_ms,
+        "_batch_total_ms": total_duration_ms,
+        "_peak_memory_bytes": _get_peak_memory_bytes(),
+    }
+    for entry in results:
+        entry.update(shared_fields)
+
+    return results
+
+
+def _worker(fn, args, conn):
+    """Child-process entry point: run fn(*args) and send the outcome over conn.
+
+    Python-level stdin is closed and sys.stdout is pointed at os.devnull first,
+    so prints in the child do not reach the parent's line-based JSON stdout.
+    """
+    try:
+        sys.stdin.close()
+        sys.stdout = open(os.devnull, "w")
+    except Exception:
+        pass  # best effort: extraction still runs if the redirect fails
+    try:
+        conn.send(fn(*args))
+    except Exception as exc:
+        failure = {"error": str(exc), "_extraction_time_ms": 0}
+        conn.send(failure)
+    finally:
+        conn.close()
+
+
+def _run_with_timeout(fn, args, timeout):
+    """Run fn(*args) in a forked child and wait at most `timeout` seconds for it.
+
+    Returns the child's result, or an error dict if it timed out (the child is
+    killed) or nothing could be received from it. The parent keeps running.
+    """
+    try:
+        ctx = _mp.get_context("fork")
+        parent_conn, child_conn = ctx.Pipe(duplex=False)  # parent receives, child sends
+        p = ctx.Process(target=_worker, args=(fn, args, child_conn))
+        p.start()
+        child_conn.close()  # the parent only needs the receiving end
+
+        if parent_conn.poll(timeout=timeout):
+            try:
+                result = parent_conn.recv()
+            except Exception:  # presumably EOF: the child exited before sending
+                result = {"error": "worker process crashed", "_extraction_time_ms": 0}
+        else:
+            p.kill()
+            result = {
+                "error": f"extraction timed out after {timeout}s",
+                "_extraction_time_ms": timeout * 1000.0,  # the full timeout, in ms
+            }
+
+        p.join(timeout=5)
+        if p.is_alive():  # still running after the 5s grace period
+            p.kill()
+            p.join()
+        parent_conn.close()
+        return result
+    except Exception:
+        # Any failure above (e.g. no fork start method): run in-process, without a timeout
+        try:
+            return fn(*args)
+        except Exception as e:
+            return {"error": str(e), "_extraction_time_ms": 0}
+
+
+def _parse_path(line: str) -> str:
+    """Parse a request line: JSON object with a string "path" field, or plain file path."""
+    stripped = line.strip()
+    try:
+        path = json.loads(stripped).get("path", "") if stripped.startswith("{") else stripped
+    except (json.JSONDecodeError, ValueError):
+        path = stripped  # not valid JSON after all: treat the whole line as a path
+    # A numeric "path" would be opened as a file descriptor by open(); reject non-strings.
+    return path if isinstance(path, str) else ""
+
+
+def run_server(timeout=None) -> None:
+    """Serve requests line by line: print READY, then one JSON result per stdin path."""
+    print("READY", flush=True)
+    for request_line in sys.stdin:
+        path = _parse_path(request_line)
+        if not path:
+            continue
+        if timeout is None:
+            try:
+                response = extract_sync(path)
+            except Exception as exc:
+                response = {"error": str(exc), "_extraction_time_ms": 0}
+        else:
+            response = _run_with_timeout(extract_sync, (path,), timeout)
+        print(json.dumps(response), flush=True)
+
+
+def main() -> None:
+    """Command-line entry point: parse flags, then run sync, batch or server mode.
+
+    --timeout is only forwarded to server mode. Exits 64 for an unsupported
+    --format and 1 for usage errors or a failed extraction.
+    """
+    timeout = None
+    positional = []
+    for token in sys.argv[1:]:
+        if token == "--ocr" or token == "--no-ocr":
+            continue  # Accepted but ignored - pdftotext doesn't have OCR config
+        if token.startswith("--timeout="):
+            timeout = int(token.split("=", 1)[1])
+            continue
+        if not token.startswith("--format="):
+            positional.append(token)
+            continue
+        requested = token.split("=", 1)[1]
+        if requested != "plaintext":
+            print(f"{sys.argv[0]} only supports plaintext output; got --format {requested}", file=sys.stderr)
+            sys.exit(64)
+
+    if not positional:
+        print(
+            "Usage: pdftotext_extract.py [--ocr|--no-ocr] [--timeout=SECS] <mode> <file_path> [additional_files...]",
+            file=sys.stderr,
+        )
+        print("Modes: sync, batch, server", file=sys.stderr)
+        sys.exit(1)
+
+    mode, file_paths = positional[0], positional[1:]
+
+    try:
+        if mode == "server":
+            run_server(timeout=timeout)
+
+        elif mode == "sync":
+            if len(file_paths) != 1:
+                print("Error: sync mode requires exactly one file", file=sys.stderr)
+                sys.exit(1)
+            print(json.dumps(extract_sync(file_paths[0])), end="")
+
+        elif mode == "batch":
+            if not file_paths:
+                print("Error: batch mode requires at least one file", file=sys.stderr)
+                sys.exit(1)
+            results = extract_batch(file_paths)
+            # A single-file batch prints the bare object, not a one-element list.
+            output = results[0] if len(file_paths) == 1 else results
+            print(json.dumps(output), end="")
+
+        else:
+            print(f"Error: Unknown mode '{mode}'. Use sync, batch, or server", file=sys.stderr)
+            sys.exit(1)
+    except Exception as exc:
+        print(f"Error extracting with pdftotext: {exc}", file=sys.stderr)
+        sys.exit(1)
+
+
+if __name__ == "__main__":
+    main()
--- a/tools/benchmark-harness/scripts/playa_pdf_extract.py
+++ b/tools/benchmark-harness/scripts/playa_pdf_extract.py
@@ -0,0 +1,245 @@
+"""playa-pdf extraction wrapper for benchmark harness.
+
+Supports three modes:
+- sync: extract text page-by-page (sequential)
+- batch: process multiple files (simulated batch using loop)
+- server: persistent mode reading paths from stdin
+"""
+
+from __future__ import annotations
+
+import json
+import multiprocessing as _mp
+import os
+import platform
+import resource
+import sys
+import time
+from typing import Any
+
+import playa
+
+
+def _get_peak_memory_bytes() -> int:
+    """Return peak resident set size of this process, scaled by 1024 on Linux."""
+    peak_rss = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
+    # Linux reports ru_maxrss in kilobytes; other platforms are returned as-is.
+    scale = 1024 if platform.system() == "Linux" else 1
+    return peak_rss * scale
+
+
+def extract_sync(file_path: str) -> dict[str, Any]:
+    """Extract one PDF with playa, page by page, and return a timed result dict.
+
+    Pages whose extract_text() result is empty or None are skipped; the
+    remaining page texts are joined with blank lines.
+    """
+    t0 = time.perf_counter()
+
+    with playa.open(file_path) as doc:
+        page_texts = [page.extract_text() for page in doc.pages]
+    # Joined after the document has been closed, from the collected strings.
+    content = "\n\n".join(text for text in page_texts if text)
+
+    elapsed_ms = (time.perf_counter() - t0) * 1000.0
+
+    return {
+        "content": content,
+        "metadata": {"framework": "playa-pdf"},
+        "_extraction_time_ms": elapsed_ms,
+        "_peak_memory_bytes": _get_peak_memory_bytes(),
+    }
+
+
+def extract_batch(file_paths: list[str]) -> list[dict[str, Any]]:
+    """Extract each file in turn (playa-pdf has no native batch API).
+
+    A file that raises yields empty content with the error text in its metadata.
+    Every entry carries the batch-average time, the batch total and the peak
+    memory reading taken after the last file.
+    """
+    batch_start = time.perf_counter()
+
+    results: list[dict[str, Any]] = []
+    for path in file_paths:
+        # Start from the failure shape; content is filled in on success.
+        entry: dict[str, Any] = {"content": "", "metadata": {"framework": "playa-pdf"}}
+        try:
+            kept = []
+            with playa.open(path) as doc:
+                for page in doc.pages:
+                    text = page.extract_text()
+                    if text:
+                        kept.append(text)
+            entry["content"] = "\n\n".join(kept)
+        except Exception as exc:
+            entry["metadata"]["error"] = str(exc)
+        results.append(entry)
+
+    total_duration_ms = (time.perf_counter() - batch_start) * 1000.0
+    # Files are not timed individually; each one reports the batch average.
+    if file_paths:
+        per_file_duration_ms = total_duration_ms / len(file_paths)
+    else:
+        per_file_duration_ms = 0
+
+    # Measured once, after the whole batch, and shared by every entry.
+    shared_fields = {
+        "_extraction_time_ms": per_file_duration_ms,
+        "_batch_total_ms": total_duration_ms,
+        "_peak_memory_bytes": _get_peak_memory_bytes(),
+    }
+    for entry in results:
+        entry.update(shared_fields)
+
+    return results
+
+
+def _worker(fn, args, conn):
+    """Child-process entry point: run fn(*args) and send the outcome over conn.
+
+    Python-level stdin is closed and sys.stdout is pointed at os.devnull first,
+    so prints in the child do not reach the parent's line-based JSON stdout.
+    """
+    try:
+        sys.stdin.close()
+        sys.stdout = open(os.devnull, "w")
+    except Exception:
+        pass  # best effort: extraction still runs if the redirect fails
+    try:
+        conn.send(fn(*args))
+    except Exception as exc:
+        failure = {"error": str(exc), "_extraction_time_ms": 0}
+        conn.send(failure)
+    finally:
+        conn.close()
+
+
+def _run_with_timeout(fn, args, timeout):
+    """Run fn(*args) in a forked child and wait at most `timeout` seconds for it.
+
+    Returns the child's result, or an error dict if it timed out (the child is
+    killed) or nothing could be received from it. The parent keeps running.
+    """
+    try:
+        ctx = _mp.get_context("fork")
+        parent_conn, child_conn = ctx.Pipe(duplex=False)  # parent receives, child sends
+        p = ctx.Process(target=_worker, args=(fn, args, child_conn))
+        p.start()
+        child_conn.close()  # the parent only needs the receiving end
+
+        if parent_conn.poll(timeout=timeout):
+            try:
+                result = parent_conn.recv()
+            except Exception:  # presumably EOF: the child exited before sending
+                result = {"error": "worker process crashed", "_extraction_time_ms": 0}
+        else:
+            p.kill()
+            result = {
+                "error": f"extraction timed out after {timeout}s",
+                "_extraction_time_ms": timeout * 1000.0,  # the full timeout, in ms
+            }
+
+        p.join(timeout=5)
+        if p.is_alive():  # still running after the 5s grace period
+            p.kill()
+            p.join()
+        parent_conn.close()
+        return result
+    except Exception:
+        # Any failure above (e.g. no fork start method): run in-process, without a timeout
+        try:
+            return fn(*args)
+        except Exception as e:
+            return {"error": str(e), "_extraction_time_ms": 0}
+
+
+def _parse_path(line: str) -> str:
+    """Parse a request line: JSON object with a string "path" field, or plain file path."""
+    stripped = line.strip()
+    try:
+        path = json.loads(stripped).get("path", "") if stripped.startswith("{") else stripped
+    except (json.JSONDecodeError, ValueError):
+        path = stripped  # not valid JSON after all: treat the whole line as a path
+    # A null/numeric "path" must not reach the extractor; report "no path" instead.
+    return path if isinstance(path, str) else ""
+
+
+def run_server(timeout=None) -> None:
+    """Serve requests line by line: print READY, then one JSON result per stdin path."""
+    print("READY", flush=True)
+    for request_line in sys.stdin:
+        path = _parse_path(request_line)
+        if not path:
+            continue
+        if timeout is None:
+            try:
+                response = extract_sync(path)
+            except Exception as exc:
+                response = {"error": str(exc), "_extraction_time_ms": 0}
+        else:
+            response = _run_with_timeout(extract_sync, (path,), timeout)
+        print(json.dumps(response), flush=True)
+
+
+def main() -> None:
+    """Command-line entry point: parse flags, then run sync, batch or server mode.
+
+    --timeout is only forwarded to server mode. Exits 64 for an unsupported
+    --format and 1 for usage errors or a failed extraction.
+    """
+    timeout = None
+    positional = []
+    for token in sys.argv[1:]:
+        if token == "--ocr" or token == "--no-ocr":
+            continue  # Accepted but ignored - playa-pdf doesn't have OCR capability
+        if token.startswith("--timeout="):
+            timeout = int(token.split("=", 1)[1])
+            continue
+        if not token.startswith("--format="):
+            positional.append(token)
+            continue
+        requested = token.split("=", 1)[1]
+        if requested != "plaintext":
+            print(f"{sys.argv[0]} only supports plaintext output; got --format {requested}", file=sys.stderr)
+            sys.exit(64)
+
+    if not positional:
+        print(
+            "Usage: playa_pdf_extract.py [--ocr|--no-ocr] [--timeout=SECS] <mode> <file_path> [additional_files...]",
+            file=sys.stderr,
+        )
+        print("Modes: sync, batch, server", file=sys.stderr)
+        sys.exit(1)
+
+    mode, file_paths = positional[0], positional[1:]
+
+    try:
+        if mode == "server":
+            run_server(timeout=timeout)
+
+        elif mode == "sync":
+            if len(file_paths) != 1:
+                print("Error: sync mode requires exactly one file", file=sys.stderr)
+                sys.exit(1)
+            print(json.dumps(extract_sync(file_paths[0])), end="")
+
+        elif mode == "batch":
+            if not file_paths:
+                print("Error: batch mode requires at least one file", file=sys.stderr)
+                sys.exit(1)
+            results = extract_batch(file_paths)
+            # A single-file batch prints the bare object, not a one-element list.
+            output = results[0] if len(file_paths) == 1 else results
+            print(json.dumps(output), end="")
+
+        else:
+            print(f"Error: Unknown mode '{mode}'. Use sync, batch, or server", file=sys.stderr)
+            sys.exit(1)
+    except Exception as exc:
+        print(f"Error extracting with playa-pdf: {exc}", file=sys.stderr)
+        sys.exit(1)
+
+
+if __name__ == "__main__":
+    main()
--- a/tools/benchmark-harness/scripts/pymupdf4llm_extract.py
+++ b/tools/benchmark-harness/scripts/pymupdf4llm_extract.py
@@ -0,0 +1,184 @@
+"""PyMuPDF4LLM extraction wrapper for benchmark harness."""
+
+from __future__ import annotations
+
+import json
+import multiprocessing as _mp
+import os
+import platform
+import resource
+import sys
+import time
+
+# Suppress MuPDF C-level error/warning messages that can corrupt the
+# persistent server's line-based JSON protocol on stdout.
+# See: https://github.com/pymupdf/PyMuPDF/issues/606
+import pymupdf
+
+# Import pymupdf.layout BEFORE pymupdf4llm to enable improved layout analysis
+# and suppress the "Consider using the pymupdf_layout package" info message.
+import pymupdf.layout
+import pymupdf4llm
+
+pymupdf.TOOLS.mupdf_display_errors(False)
+
+
+def _get_peak_memory_bytes() -> int:
+    """Get peak memory usage in bytes using resource module."""
+    usage = resource.getrusage(resource.RUSAGE_SELF)
+    if platform.system() == "Linux":
+        return usage.ru_maxrss * 1024
+    return usage.ru_maxrss
+
+
+def extract_sync(file_path: str) -> dict:
+    """Extract using PyMuPDF4LLM."""
+    start = time.perf_counter()
+    markdown = pymupdf4llm.to_markdown(file_path, show_progress=False, write_images=False)
+    duration_ms = (time.perf_counter() - start) * 1000.0
+
+    return {
+        "content": markdown,
+        "metadata": {"framework": "pymupdf4llm"},
+        "_extraction_time_ms": duration_ms,
+        "_peak_memory_bytes": _get_peak_memory_bytes(),
+    }
+
+
+def _worker(fn, args, conn):
+    """Run extraction in a forked child process.
+
+    Closes inherited stdin/stdout so the child cannot corrupt the
+    parent's line-based JSON protocol.
+    """
+    try:
+        sys.stdin.close()
+        sys.stdout = open(os.devnull, "w")
+    except Exception:
+        pass
+    try:
+        result = fn(*args)
+        conn.send(result)
+    except Exception as e:
+        conn.send({"error": str(e), "_extraction_time_ms": 0})
+    finally:
+        conn.close()
+
+
+def _run_with_timeout(fn, args, timeout):
+    """Execute fn(*args) in a forked child with a timeout.
+
+    On timeout the child is killed but the parent stays alive —
+    no expensive process restart is needed.
+    """
+    try:
+        ctx = _mp.get_context("fork")
+        parent_conn, child_conn = ctx.Pipe(duplex=False)
+        p = ctx.Process(target=_worker, args=(fn, args, child_conn))
+        p.start()
+        child_conn.close()
+
+        if parent_conn.poll(timeout=timeout):
+            try:
+                result = parent_conn.recv()
+            except Exception:
+                result = {"error": "worker process crashed", "_extraction_time_ms": 0}
+        else:
+            p.kill()
+            result = {
+                "error": f"extraction timed out after {timeout}s",
+                "_extraction_time_ms": timeout * 1000.0,
+            }
+
+        p.join(timeout=5)
+        if p.is_alive():
+            p.kill()
+            p.join()
+        parent_conn.close()
+        return result
+    except Exception:
+        # Fork not available — fall back to in-process extraction
+        try:
+            return fn(*args)
+        except Exception as e:
+            return {"error": str(e), "_extraction_time_ms": 0}
+
+
+def _parse_path(line: str) -> str:
+    """Parse a request line: JSON object with path field, or plain file path."""
+    stripped = line.strip()
+    if stripped.startswith("{"):
+        try:
+            return json.loads(stripped).get("path", "")
+        except (json.JSONDecodeError, ValueError):
+            pass
+    return stripped
+
+
+def run_server(timeout=None) -> None:
+    """Persistent server mode."""
+    print("READY", flush=True)
+    for line in sys.stdin:
+        file_path = _parse_path(line)
+        if not file_path:
+            continue
+        if timeout is not None:
+            result = _run_with_timeout(extract_sync, (file_path,), timeout)
+        else:
+            try:
+                result = extract_sync(file_path)
+            except Exception as e:
+                result = {"error": str(e), "_extraction_time_ms": 0}
+        print(json.dumps(result), flush=True)
+
+
+def main() -> None:
+    ocr_enabled = False
+    timeout = None
+    args = []
+    for arg in sys.argv[1:]:
+        if arg == "--ocr":
+            ocr_enabled = True
+        elif arg == "--no-ocr":
+            ocr_enabled = False
+        elif arg.startswith("--timeout="):
+            timeout = int(arg.split("=", 1)[1])
+        elif arg.startswith("--format="):
+            _fmt = arg.split("=", 1)[1]
+            if _fmt != "markdown":
+                print(f"{sys.argv[0]} only supports markdown output; got --format {_fmt}", file=sys.stderr)
+                sys.exit(64)
+        else:
+            args.append(arg)
+
+    if len(args) < 1:
+        print("Usage: pymupdf4llm_extract.py [--ocr|--no-ocr] [--timeout=SECS] <mode> <file_path>", file=sys.stderr)
+        print("Modes: sync, server", file=sys.stderr)
+        sys.exit(1)
+
+    mode = args[0]
+    if mode == "server":
+        run_server(timeout=timeout)
+    elif mode == "sync":
+        if len(args) < 2:
+            print("Error: sync mode requires a file path", file=sys.stderr)
+            sys.exit(1)
+        file_path = args[1]
+        try:
+            payload = extract_sync(file_path)
+            print(json.dumps(payload), end="")
+        except Exception as e:
+            print(f"Error extracting with PyMuPDF4LLM: {e}", file=sys.stderr)
+            sys.exit(1)
+    else:
+        # Legacy fallback for direct file path
+        try:
+            payload = extract_sync(args[0])
+            print(json.dumps(payload), end="")
+        except Exception as e:
+            print(f"Error extracting with PyMuPDF4LLM: {e}", file=sys.stderr)
+            sys.exit(1)
+
+
+# Script entry point: run the CLI only when this file is executed directly.
+if __name__ == "__main__":
+    main()
--- a/tools/benchmark-harness/scripts/pypdf_extract.py
+++ b/tools/benchmark-harness/scripts/pypdf_extract.py
@@ -0,0 +1,245 @@
+"""pypdf extraction wrapper for benchmark harness.
+
+Supports three modes:
+- sync: extract text page-by-page (sequential)
+- batch: process multiple files (simulated batch using loop)
+- server: persistent mode reading paths from stdin
+"""
+
+from __future__ import annotations
+
+import json
+import multiprocessing as _mp
+import os
+import platform
+import resource
+import sys
+import time
+from typing import Any
+
+from pypdf import PdfReader
+
+
+def _get_peak_memory_bytes() -> int:
+    """Get peak memory usage in bytes using resource module."""
+    usage = resource.getrusage(resource.RUSAGE_SELF)
+    if platform.system() == "Linux":
+        return usage.ru_maxrss * 1024
+    return usage.ru_maxrss
+
+
+def extract_sync(file_path: str) -> dict[str, Any]:
+    """Extract using synchronous single-file API."""
+    start = time.perf_counter()
+
+    reader = PdfReader(file_path)
+    text_parts = []
+    for page in reader.pages:
+        page_text = page.extract_text()
+        if page_text:
+            text_parts.append(page_text)
+
+    markdown = "\n\n".join(text_parts)
+
+    duration_ms = (time.perf_counter() - start) * 1000.0
+
+    return {
+        "content": markdown,
+        "metadata": {"framework": "pypdf"},
+        "_extraction_time_ms": duration_ms,
+        "_peak_memory_bytes": _get_peak_memory_bytes(),
+    }
+
+
+def extract_batch(file_paths: list[str]) -> list[dict[str, Any]]:
+    """Extract multiple files (simulated batch - pypdf has no native batch API)."""
+    start = time.perf_counter()
+
+    results = []
+    for file_path in file_paths:
+        try:
+            reader = PdfReader(file_path)
+            text_parts = []
+            for page in reader.pages:
+                page_text = page.extract_text()
+                if page_text:
+                    text_parts.append(page_text)
+
+            markdown = "\n\n".join(text_parts)
+            results.append(
+                {
+                    "content": markdown,
+                    "metadata": {"framework": "pypdf"},
+                }
+            )
+        except Exception as e:
+            results.append(
+                {
+                    "content": "",
+                    "metadata": {
+                        "framework": "pypdf",
+                        "error": str(e),
+                    },
+                }
+            )
+
+    total_duration_ms = (time.perf_counter() - start) * 1000.0
+    per_file_duration_ms = total_duration_ms / len(file_paths) if file_paths else 0
+
+    peak_memory = _get_peak_memory_bytes()
+    for result in results:
+        result["_extraction_time_ms"] = per_file_duration_ms
+        result["_batch_total_ms"] = total_duration_ms
+        result["_peak_memory_bytes"] = peak_memory
+
+    return results
+
+
+def _worker(fn, args, conn):
+    """Run extraction in a forked child process.
+
+    Closes inherited stdin/stdout so the child cannot corrupt the
+    parent's line-based JSON protocol.
+    """
+    try:
+        sys.stdin.close()
+        sys.stdout = open(os.devnull, "w")
+    except Exception:
+        pass
+    try:
+        result = fn(*args)
+        conn.send(result)
+    except Exception as e:
+        conn.send({"error": str(e), "_extraction_time_ms": 0})
+    finally:
+        conn.close()
+
+
+def _run_with_timeout(fn, args, timeout):
+    """Execute fn(*args) in a forked child with a timeout.
+
+    On timeout the child is killed but the parent stays alive —
+    no expensive process restart is needed.
+    """
+    try:
+        ctx = _mp.get_context("fork")
+        parent_conn, child_conn = ctx.Pipe(duplex=False)
+        p = ctx.Process(target=_worker, args=(fn, args, child_conn))
+        p.start()
+        child_conn.close()
+
+        if parent_conn.poll(timeout=timeout):
+            try:
+                result = parent_conn.recv()
+            except Exception:
+                result = {"error": "worker process crashed", "_extraction_time_ms": 0}
+        else:
+            p.kill()
+            result = {
+                "error": f"extraction timed out after {timeout}s",
+                "_extraction_time_ms": timeout * 1000.0,
+            }
+
+        p.join(timeout=5)
+        if p.is_alive():
+            p.kill()
+            p.join()
+        parent_conn.close()
+        return result
+    except Exception:
+        # Fork not available — fall back to in-process extraction
+        try:
+            return fn(*args)
+        except Exception as e:
+            return {"error": str(e), "_extraction_time_ms": 0}
+
+
+def _parse_path(line: str) -> str:
+    """Parse a request line: JSON object with path field, or plain file path."""
+    stripped = line.strip()
+    if stripped.startswith("{"):
+        try:
+            return json.loads(stripped).get("path", "")
+        except (json.JSONDecodeError, ValueError):
+            pass
+    return stripped
+
+
+def run_server(timeout=None) -> None:
+    """Persistent server mode: read paths from stdin, write JSON to stdout."""
+    print("READY", flush=True)
+    for line in sys.stdin:
+        file_path = _parse_path(line)
+        if not file_path:
+            continue
+        if timeout is not None:
+            result = _run_with_timeout(extract_sync, (file_path,), timeout)
+        else:
+            try:
+                result = extract_sync(file_path)
+            except Exception as e:
+                result = {"error": str(e), "_extraction_time_ms": 0}
+        print(json.dumps(result), flush=True)
+
+
+def main() -> None:
+    timeout = None
+    args = []
+    for arg in sys.argv[1:]:
+        if arg in ("--ocr", "--no-ocr"):
+            pass  # Accepted but ignored - pypdf doesn't have OCR config
+        elif arg.startswith("--timeout="):
+            timeout = int(arg.split("=", 1)[1])
+        elif arg.startswith("--format="):
+            _fmt = arg.split("=", 1)[1]
+            if _fmt != "plaintext":
+                print(f"{sys.argv[0]} only supports plaintext output; got --format {_fmt}", file=sys.stderr)
+                sys.exit(64)
+        else:
+            args.append(arg)
+
+    if len(args) < 1:
+        print(
+            "Usage: pypdf_extract.py [--ocr|--no-ocr] [--timeout=SECS] <mode> <file_path> [additional_files...]",
+            file=sys.stderr,
+        )
+        print("Modes: sync, batch, server", file=sys.stderr)
+        sys.exit(1)
+
+    mode = args[0]
+    file_paths = args[1:]
+
+    try:
+        if mode == "server":
+            run_server(timeout=timeout)
+
+        elif mode == "sync":
+            if len(file_paths) != 1:
+                print("Error: sync mode requires exactly one file", file=sys.stderr)
+                sys.exit(1)
+            payload = extract_sync(file_paths[0])
+            print(json.dumps(payload), end="")
+
+        elif mode == "batch":
+            if len(file_paths) < 1:
+                print("Error: batch mode requires at least one file", file=sys.stderr)
+                sys.exit(1)
+
+            if len(file_paths) == 1:
+                results = extract_batch(file_paths)
+                print(json.dumps(results[0]), end="")
+            else:
+                results = extract_batch(file_paths)
+                print(json.dumps(results), end="")
+
+        else:
+            print(f"Error: Unknown mode '{mode}'. Use sync, batch, or server", file=sys.stderr)
+            sys.exit(1)
+
+    except Exception as e:
+        print(f"Error extracting with pypdf: {e}", file=sys.stderr)
+        sys.exit(1)
+
+
+# Script entry point: run the CLI only when this file is executed directly.
+if __name__ == "__main__":
+    main()
--- a/tools/benchmark-harness/scripts/sanitize_pandoc_gt.py
+++ b/tools/benchmark-harness/scripts/sanitize_pandoc_gt.py
@@ -0,0 +1,166 @@
+#!/usr/bin/env python3
+"""Sanitize pandoc-generated markdown ground truth files.
+
+Removes common pandoc artifacts that don't represent actual document structure.
+
+Usage:
+    # Single file (in-place):
+    python sanitize_pandoc_gt.py input.md
+
+    # Pipe mode:
+    pandoc -f docbook -t gfm --wrap=none input.xml | python sanitize_pandoc_gt.py > output.md
+
+    # Dry run (show diff without modifying):
+    python sanitize_pandoc_gt.py --dry-run input.md
+
+    # Batch all GT files (dry run):
+    python sanitize_pandoc_gt.py --dry-run --batch test_documents/ground_truth/
+
+    # Batch all GT files (apply):
+    python sanitize_pandoc_gt.py --batch test_documents/ground_truth/
+"""
+
+import argparse
+import difflib
+import os
+import re
+import sys
+
+
+def sanitize(text: str) -> str:
+    # Track whether we're inside a fenced code block
+    in_code = False
+    lines = text.split("\n")
+    result = []
+
+    for line in lines:
+        # Track fenced code blocks — don't modify content inside them
+        stripped = line.strip()
+        if stripped.startswith("```") or stripped.startswith("~~~"):
+            in_code = not in_code
+            # Clean code fence attributes even when toggling
+            if not in_code or stripped.startswith("```") or stripped.startswith("~~~"):
+                # Convert ``` {.python} to ```python
+                m = re.match(r"^(`{3,}|~{3,})\s*\{\s*\.(\w+)(?:\s+[^}]*)?\}\s*$", line)
+                if m:
+                    line = f"{m.group(1)}{m.group(2)}"
+                else:
+                    # Remove {.class} from code fences without extracting language
+                    line = re.sub(r"^(`{3,}|~{3,})\s*\{[^}]*\}\s*$", r"\1", line)
+            result.append(line)
+            continue
+
+        if in_code:
+            result.append(line)
+            continue
+
+        # === Pandoc div wrappers ===
+        if re.match(r"^:::\s*(\{.*\})?\s*$", stripped):
+            continue
+
+        # === Remove {.class} and {#id} attributes from headings ===
+        if re.match(r"^#{1,6}\s", line):
+            line = re.sub(r"\s*\{[.#][^}]*\}\s*$", "", line)
+
+        # === Replace <!-- end list --> pandoc markers with blank line ===
+        # Don't just remove — keep the structural separation it provides
+        if stripped == "<!-- end list -->":
+            if not (result and result[-1].strip() == ""):
+                result.append("")
+            continue
+
+        # === Remove pandoc-specific HTML comments only ===
+        # Keep <!-- image --> and other semantic comments
+        if stripped == "<!-- end list -->" or stripped == "<!-- -->":
+            continue
+
+        # Do NOT collapse blank lines — they are structural in markdown.
+        # Blank lines separate paragraphs, tables, lists, etc.
+
+        result.append(line)
+
+    # Trim trailing blank lines, ensure single trailing newline
+    while result and result[-1].strip() == "":
+        result.pop()
+
+    return "\n".join(result) + "\n" if result else ""
+
+
+def process_file(path: str, dry_run: bool = False) -> tuple[bool, str]:
+    """Process a single file. Returns (changed, diff_text)."""
+    with open(path) as f:
+        original = f.read()
+
+    cleaned = sanitize(original)
+
+    if original == cleaned:
+        return False, ""
+
+    diff = "".join(
+        difflib.unified_diff(
+            original.splitlines(keepends=True),
+            cleaned.splitlines(keepends=True),
+            fromfile=f"a/{path}",
+            tofile=f"b/{path}",
+            n=3,
+        )
+    )
+
+    if not dry_run:
+        with open(path, "w") as f:
+            f.write(cleaned)
+
+    return True, diff
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Sanitize pandoc GT markdown files")
+    parser.add_argument("path", nargs="?", help="File or directory to process")
+    parser.add_argument("--dry-run", action="store_true", help="Show diff without modifying files")
+    parser.add_argument("--batch", action="store_true", help="Process all .md files in directory recursively")
+    args = parser.parse_args()
+
+    # Pipe mode (no path, stdin)
+    if args.path is None and not sys.stdin.isatty():
+        sys.stdout.write(sanitize(sys.stdin.read()))
+        return
+
+    if args.path is None:
+        parser.print_help()
+        return
+
+    # Batch mode
+    if args.batch or os.path.isdir(args.path):
+        changed_count = 0
+        total_count = 0
+        for root, _dirs, files in os.walk(args.path):
+            for fname in sorted(files):
+                if not fname.endswith(".md"):
+                    continue
+                fpath = os.path.join(root, fname)
+                total_count += 1
+                changed, diff = process_file(fpath, dry_run=args.dry_run)
+                if changed:
+                    changed_count += 1
+                    if args.dry_run:
+                        print(diff)
+                    else:
+                        print(f"  cleaned: {fpath}")
+
+        action = "would change" if args.dry_run else "cleaned"
+        print(f"\n{action} {changed_count}/{total_count} files")
+        return
+
+    # Single file mode
+    changed, diff = process_file(args.path, dry_run=args.dry_run)
+    if changed:
+        if args.dry_run:
+            print(diff)
+        else:
+            print(f"cleaned: {args.path}")
+    else:
+        print(f"no changes: {args.path}")
+
+
+# Script entry point: run the CLI only when this file is executed directly.
+if __name__ == "__main__":
+    main()
--- a/tools/benchmark-harness/scripts/unstructured_extract.py
+++ b/tools/benchmark-harness/scripts/unstructured_extract.py
@@ -0,0 +1,230 @@
+"""Unstructured extraction wrapper for benchmark harness."""
+
+from __future__ import annotations
+
+import json
+import multiprocessing as _mp
+import os
+import platform
+import resource
+import sys
+import time
+
+from unstructured.partition.auto import partition
+
+
+def _get_peak_memory_bytes() -> int:
+    """Get peak memory usage in bytes using resource module."""
+    usage = resource.getrusage(resource.RUSAGE_SELF)
+    if platform.system() == "Linux":
+        return usage.ru_maxrss * 1024
+    return usage.ru_maxrss
+
+
+def _render_markdown(elements: list) -> str:
+    """Render Unstructured Elements as GFM-ish markdown."""
+    import re
+
+    parts: list[str] = []
+    for el in elements:
+        cls = type(el).__name__
+        text = (el.text or "").strip() if hasattr(el, "text") else str(el).strip()
+        if not text and cls not in ("Image", "Figure"):
+            continue
+        if cls == "Title":
+            parts.append(f"# {text}")
+        elif cls == "Header":
+            parts.append(f"## {text}")
+        elif cls == "ListItem":
+            parts.append(f"- {text}")
+        elif cls in ("CodeSnippet", "Code"):
+            parts.append(f"```\n{text}\n```")
+        elif cls in ("Image", "Figure"):
+            parts.append(f"![{text or cls}]()")
+        elif cls == "Table":
+            html = ""
+            md = getattr(el, "metadata", None)
+            if md is not None:
+                html = getattr(md, "text_as_html", "") or ""
+            if html:
+                rows = re.findall(r"<tr[^>]*>(.*?)</tr>", html, flags=re.DOTALL | re.IGNORECASE)
+                rendered: list[str] = []
+                for i, row_html in enumerate(rows):
+                    cells = re.findall(r"<t[dh][^>]*>(.*?)</t[dh]>", row_html, flags=re.DOTALL | re.IGNORECASE)
+                    cells = [re.sub(r"<[^>]+>", "", c).strip() for c in cells]
+                    if cells:
+                        rendered.append("| " + " | ".join(cells) + " |")
+                        if i == 0:
+                            rendered.append("| " + " | ".join("---" for _ in cells) + " |")
+                if rendered:
+                    parts.append("\n".join(rendered))
+                else:
+                    parts.append(text)
+            else:
+                parts.append(text)
+        else:
+            parts.append(text)
+    return "\n\n".join(parts)
+
+
+def extract_sync(file_path: str, ocr_enabled: bool, output_format: str = "markdown") -> dict:
+    """Extract using Unstructured partition API."""
+    strategy = "hi_res" if ocr_enabled else "fast"
+    start = time.perf_counter()
+    elements = partition(filename=file_path, strategy=strategy, languages=["eng"])
+    duration_ms = (time.perf_counter() - start) * 1000.0
+
+    if output_format == "markdown":
+        content = _render_markdown(elements)
+    else:
+        content = "\n\n".join(str(el) for el in elements)
+    return {
+        "content": content,
+        "metadata": {"framework": "unstructured", "strategy": strategy, "output_format": output_format},
+        "_extraction_time_ms": duration_ms,
+        "_peak_memory_bytes": _get_peak_memory_bytes(),
+    }
+
+
+def _worker(fn, args, conn):
+    """Run extraction in a forked child process.
+
+    Closes inherited stdin/stdout so the child cannot corrupt the
+    parent's line-based JSON protocol.
+    """
+    try:
+        sys.stdin.close()
+        sys.stdout = open(os.devnull, "w")
+    except Exception:
+        pass
+    try:
+        result = fn(*args)
+        conn.send(result)
+    except Exception as e:
+        conn.send({"error": str(e), "_extraction_time_ms": 0})
+    finally:
+        conn.close()
+
+
+def _run_with_timeout(fn, args, timeout):
+    """Execute fn(*args) in a forked child with a timeout.
+
+    On timeout the child is killed but the parent stays alive —
+    no expensive process restart is needed.
+    """
+    try:
+        ctx = _mp.get_context("fork")
+        parent_conn, child_conn = ctx.Pipe(duplex=False)
+        p = ctx.Process(target=_worker, args=(fn, args, child_conn))
+        p.start()
+        child_conn.close()
+
+        if parent_conn.poll(timeout=timeout):
+            try:
+                result = parent_conn.recv()
+            except Exception:
+                result = {"error": "worker process crashed", "_extraction_time_ms": 0}
+        else:
+            p.kill()
+            result = {
+                "error": f"extraction timed out after {timeout}s",
+                "_extraction_time_ms": timeout * 1000.0,
+            }
+
+        p.join(timeout=5)
+        if p.is_alive():
+            p.kill()
+            p.join()
+        parent_conn.close()
+        return result
+    except Exception:
+        # Fork not available — fall back to in-process extraction
+        try:
+            return fn(*args)
+        except Exception as e:
+            return {"error": str(e), "_extraction_time_ms": 0}
+
+
+def _parse_path(line: str) -> str:
+    """Parse a request line: JSON object with path field, or plain file path."""
+    stripped = line.strip()
+    if stripped.startswith("{"):
+        try:
+            return json.loads(stripped).get("path", "")
+        except (json.JSONDecodeError, ValueError):
+            pass
+    return stripped
+
+
+def run_server(ocr_enabled: bool, output_format: str, timeout=None) -> None:
+    """Persistent server mode: read paths from stdin, write JSON to stdout."""
+    print("READY", flush=True)
+    for line in sys.stdin:
+        file_path = _parse_path(line)
+        if not file_path:
+            continue
+        if timeout is not None:
+            result = _run_with_timeout(extract_sync, (file_path, ocr_enabled, output_format), timeout)
+        else:
+            try:
+                result = extract_sync(file_path, ocr_enabled, output_format)
+            except Exception as e:
+                result = {"error": str(e), "_extraction_time_ms": 0}
+        print(json.dumps(result), flush=True)
+
+
+def main() -> None:
+    ocr_enabled = False
+    timeout = None
+    output_format = "markdown"
+    args = []
+    for arg in sys.argv[1:]:
+        if arg == "--ocr":
+            ocr_enabled = True
+        elif arg == "--no-ocr":
+            ocr_enabled = False
+        elif arg.startswith("--timeout="):
+            timeout = int(arg.split("=", 1)[1])
+        elif arg.startswith("--format="):
+            output_format = arg.split("=", 1)[1]
+        else:
+            args.append(arg)
+
+    if output_format not in ("markdown", "plaintext"):
+        print(f"Error: --format must be 'markdown' or 'plaintext'; got '{output_format}'", file=sys.stderr)
+        sys.exit(64)
+
+    if len(args) < 1:
+        print(
+            "Usage: unstructured_extract.py [--ocr|--no-ocr] [--timeout=SECS] [--format=markdown|plaintext] <mode> <file_path>",
+            file=sys.stderr,
+        )
+        print("Modes: sync, server", file=sys.stderr)
+        sys.exit(1)
+
+    mode = args[0]
+
+    if mode == "server":
+        run_server(ocr_enabled, output_format, timeout=timeout)
+    elif mode == "sync":
+        if len(args) < 2:
+            print("Error: sync mode requires a file path", file=sys.stderr)
+            sys.exit(1)
+        try:
+            payload = extract_sync(args[1], ocr_enabled, output_format)
+            print(json.dumps(payload), end="")
+        except Exception as e:
+            print(f"Error extracting with Unstructured: {e}", file=sys.stderr)
+            sys.exit(1)
+    else:
+        # Legacy mode: first arg is the file path directly
+        try:
+            payload = extract_sync(args[0], ocr_enabled, output_format)
+            print(json.dumps(payload), end="")
+        except Exception as e:
+            print(f"Error extracting with Unstructured: {e}", file=sys.stderr)
+            sys.exit(1)
+
+
+# Script entry point: run the CLI only when this file is executed directly.
+if __name__ == "__main__":
+    main()