tools/benchmark-harness/scripts/mineru_extract.py

"""MinerU extraction wrapper for benchmark harness.

Supports three modes:
- sync: process single file
- batch: process multiple files
- server: persistent mode reading paths from stdin

Attempts to use MinerU's Python API directly for better performance.
Falls back to CLI subprocess if the Python API is not available.
"""

from __future__ import annotations

import os

# Force CPU-only mode to avoid GPU discovery errors in CI
os.environ.setdefault("CUDA_VISIBLE_DEVICES", "")
os.environ.setdefault("ONNXRUNTIME_PROVIDERS", "CPUExecutionProvider")
os.environ.setdefault("MINERU_DEVICE_MODE", "cpu")

import json
import multiprocessing as _mp
import platform
import resource
import subprocess
import sys
import tempfile
import time
from pathlib import Path
from typing import Any

# Try importing MinerU's Python API to avoid subprocess overhead.
# The API surface has changed across versions, so we attempt several known entry points.
try:
    from magic_pdf.pipe.UNIPipe import UNIPipe  # noqa: F401

    HAS_PYTHON_API = True
except ImportError:
    HAS_PYTHON_API = False


def _get_peak_memory_bytes() -> int:
    """Get peak memory usage in bytes using resource module."""
    usage = resource.getrusage(resource.RUSAGE_SELF)
    if platform.system() == "Linux":
        return usage.ru_maxrss * 1024
    return usage.ru_maxrss


def _extract_via_cli(file_path: str, ocr_enabled: bool) -> str:
    """Extract using MinerU CLI (fallback)."""
    cmd = ["mineru", "-p", file_path, "-b", "pipeline", "-d", "cpu"]
    if not ocr_enabled:
        cmd.extend(["--method", "txt"])

    with tempfile.TemporaryDirectory() as tmpdir:
        output_dir = Path(tmpdir) / "output"
        cmd.extend(["-o", str(output_dir)])

        result = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            check=False,
        )

        # Check for output files first — ONNX Runtime may emit warnings to
        # stderr even when extraction succeeds.
        md_files = list(output_dir.rglob("*.md"))
        if md_files:
            return md_files[0].read_text(encoding="utf-8")

        if result.returncode != 0:
            raise RuntimeError(f"MinerU extraction failed: {result.stderr}")

        raise RuntimeError("No markdown output found from MinerU")


def _extract_via_api(file_path: str, ocr_enabled: bool) -> str:
    """Extract using MinerU Python API (preferred, avoids subprocess overhead)."""
    # NOTE: The MinerU Python API is not yet stable. This is a best-effort attempt
    # using the UNIPipe interface. If this fails at runtime, the caller should
    # fall back to CLI extraction.
    from magic_pdf.pipe.UNIPipe import UNIPipe
    from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter

    pdf_bytes = Path(file_path).read_bytes()

    with tempfile.TemporaryDirectory() as tmpdir:
        writer = DiskReaderWriter(tmpdir)
        method = "ocr" if ocr_enabled else "txt"
        pipe = UNIPipe(pdf_bytes, {"_pdf_type": "", "model_list": []}, writer, method=method)
        pipe.pipe_classify()
        pipe.pipe_analyze()
        pipe.pipe_parse()
        md_content = pipe.pipe_mk_markdown(str(Path(file_path).stem), tmpdir)
        return md_content


_MD_STRIP_RE = None


def _strip_markdown(text: str) -> str:
    """Best-effort markdown→plaintext pass. Drops syntax tokens; preserves text."""
    import re

    global _MD_STRIP_RE
    if _MD_STRIP_RE is None:
        _MD_STRIP_RE = [
            (re.compile(r"^#{1,6}\s+", re.MULTILINE), ""),  # ATX headings
            (re.compile(r"^\s*[-*+]\s+", re.MULTILINE), ""),  # bullet markers
            (re.compile(r"^\s*\d+\.\s+", re.MULTILINE), ""),  # ordered list markers
            (re.compile(r"^>\s?", re.MULTILINE), ""),  # blockquotes
            (re.compile(r"```[a-zA-Z0-9_-]*\n?"), ""),  # code fences
            (re.compile(r"`([^`]+)`"), r"\1"),  # inline code
            (re.compile(r"\*\*([^*]+)\*\*"), r"\1"),  # bold
            (re.compile(r"\*([^*]+)\*"), r"\1"),  # italic
            (re.compile(r"!\[([^\]]*)\]\([^)]*\)"), r"\1"),  # images
            (re.compile(r"\[([^\]]+)\]\([^)]*\)"), r"\1"),  # links
            (re.compile(r"^\s*\|.*\|\s*$", re.MULTILINE), ""),  # table rows (drop)
        ]
    out = text
    for pattern, repl in _MD_STRIP_RE:
        out = pattern.sub(repl, out)
    return out


def extract_sync(file_path: str, ocr_enabled: bool, output_format: str = "markdown") -> dict[str, Any]:
    """Extract a single file using the best available method."""
    start = time.perf_counter()

    if HAS_PYTHON_API:
        try:
            markdown = _extract_via_api(file_path, ocr_enabled)
        except Exception:
            # Fall back to CLI if Python API fails at runtime
            markdown = _extract_via_cli(file_path, ocr_enabled)
    else:
        markdown = _extract_via_cli(file_path, ocr_enabled)

    content = _strip_markdown(markdown) if output_format == "plaintext" else markdown
    duration_ms = (time.perf_counter() - start) * 1000.0

    return {
        "content": content,
        "metadata": {"framework": "mineru", "output_format": output_format},
        "_extraction_time_ms": duration_ms,
        "_peak_memory_bytes": _get_peak_memory_bytes(),
    }


def extract_batch(file_paths: list[str], ocr_enabled: bool, output_format: str = "markdown") -> list[dict[str, Any]]:
    """Extract multiple files in sequence."""
    start = time.perf_counter()

    results = []
    for file_path in file_paths:
        try:
            payload = extract_sync(file_path, ocr_enabled, output_format)
            # Remove per-file timing; we'll replace with batch timing below
            payload.pop("_extraction_time_ms", None)
            results.append(payload)
        except Exception as e:
            results.append(
                {
                    "content": "",
                    "metadata": {
                        "framework": "mineru",
                        "error": str(e),
                    },
                }
            )

    total_duration_ms = (time.perf_counter() - start) * 1000.0
    per_file_duration_ms = total_duration_ms / len(file_paths) if file_paths else 0
    peak_memory = _get_peak_memory_bytes()

    for result in results:
        result["_extraction_time_ms"] = per_file_duration_ms
        result["_batch_total_ms"] = total_duration_ms
        result["_peak_memory_bytes"] = peak_memory

    return results


def _worker(fn, args, conn):
    """Run extraction in a forked child process.

    Closes inherited stdin/stdout so the child cannot corrupt the
    parent's line-based JSON protocol.
    """
    try:
        sys.stdin.close()
        sys.stdout = open(os.devnull, "w")
    except Exception:
        pass
    try:
        result = fn(*args)
        conn.send(result)
    except Exception as e:
        conn.send({"error": str(e), "_extraction_time_ms": 0})
    finally:
        conn.close()


def _run_with_timeout(fn, args, timeout):
    """Execute fn(*args) in a forked child with a timeout.

    On timeout the child is killed but the parent stays alive —
    no expensive process restart is needed.
    """
    try:
        ctx = _mp.get_context("fork")
        parent_conn, child_conn = ctx.Pipe(duplex=False)
        p = ctx.Process(target=_worker, args=(fn, args, child_conn))
        p.start()
        child_conn.close()

        if parent_conn.poll(timeout=timeout):
            try:
                result = parent_conn.recv()
            except Exception:
                result = {"error": "worker process crashed", "_extraction_time_ms": 0}
        else:
            p.kill()
            result = {
                "error": f"extraction timed out after {timeout}s",
                "_extraction_time_ms": timeout * 1000.0,
            }

        p.join(timeout=5)
        if p.is_alive():
            p.kill()
            p.join()
        parent_conn.close()
        return result
    except Exception:
        # Fork not available — fall back to in-process extraction
        try:
            return fn(*args)
        except Exception as e:
            return {"error": str(e), "_extraction_time_ms": 0}


def _parse_path(line: str) -> str:
    """Parse a request line: JSON object with path field, or plain file path."""
    stripped = line.strip()
    if stripped.startswith("{"):
        try:
            return json.loads(stripped).get("path", "")
        except (json.JSONDecodeError, ValueError):
            pass
    return stripped


def run_server(ocr_enabled: bool, output_format: str, timeout=None) -> None:
    """Persistent server mode: read paths from stdin, write JSON to stdout."""
    print("READY", flush=True)
    for line in sys.stdin:
        file_path = _parse_path(line)
        if not file_path:
            continue
        if timeout is not None:
            result = _run_with_timeout(extract_sync, (file_path, ocr_enabled, output_format), timeout)
        else:
            try:
                result = extract_sync(file_path, ocr_enabled, output_format)
            except Exception as e:
                result = {"error": str(e), "_extraction_time_ms": 0}
        print(json.dumps(result), flush=True)


def main() -> None:
    ocr_enabled = False
    timeout = None
    output_format = "markdown"
    args = []
    for arg in sys.argv[1:]:
        if arg == "--ocr":
            ocr_enabled = True
        elif arg == "--no-ocr":
            ocr_enabled = False
        elif arg.startswith("--timeout="):
            timeout = int(arg.split("=", 1)[1])
        elif arg.startswith("--format="):
            output_format = arg.split("=", 1)[1]
        else:
            args.append(arg)

    if output_format not in ("markdown", "plaintext"):
        print(f"Error: --format must be 'markdown' or 'plaintext'; got '{output_format}'", file=sys.stderr)
        sys.exit(64)

    if len(args) < 1:
        print(
            "Usage: mineru_extract.py [--ocr|--no-ocr] [--timeout=SECS] [--format=markdown|plaintext] <mode> <file_path> [additional_files...]",
            file=sys.stderr,
        )
        print("Modes: sync, batch, server", file=sys.stderr)
        sys.exit(1)

    mode = args[0]
    file_paths = args[1:]

    try:
        if mode == "server":
            run_server(ocr_enabled, output_format, timeout=timeout)

        elif mode == "sync":
            if len(file_paths) != 1:
                print("Error: sync mode requires exactly one file", file=sys.stderr)
                sys.exit(1)
            payload = extract_sync(file_paths[0], ocr_enabled, output_format)
            print(json.dumps(payload), end="")

        elif mode == "batch":
            if len(file_paths) < 1:
                print("Error: batch mode requires at least one file", file=sys.stderr)
                sys.exit(1)

            if len(file_paths) == 1:
                results = extract_batch(file_paths, ocr_enabled, output_format)
                print(json.dumps(results[0]), end="")
            else:
                results = extract_batch(file_paths, ocr_enabled, output_format)
                print(json.dumps(results), end="")

        else:
            print(f"Error: Unknown mode '{mode}'. Use sync, batch, or server", file=sys.stderr)
            sys.exit(1)

    except Exception as e:
        print(f"Error extracting with MinerU: {e}", file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    main()
Nomad changes 2026-06-01 23:40:55 +02:00			`"""MinerU extraction wrapper for benchmark harness.`

			`Supports three modes:`
			`- sync: process single file`
			`- batch: process multiple files`
			`- server: persistent mode reading paths from stdin`

			`Attempts to use MinerU's Python API directly for better performance.`
			`Falls back to CLI subprocess if the Python API is not available.`
			`"""`

			`from __future__ import annotations`

			`import os`

			`# Force CPU-only mode to avoid GPU discovery errors in CI`
			`os.environ.setdefault("CUDA_VISIBLE_DEVICES", "")`
			`os.environ.setdefault("ONNXRUNTIME_PROVIDERS", "CPUExecutionProvider")`
			`os.environ.setdefault("MINERU_DEVICE_MODE", "cpu")`

			`import json`
			`import multiprocessing as _mp`
			`import platform`
			`import resource`
			`import subprocess`
			`import sys`
			`import tempfile`
			`import time`
			`from pathlib import Path`
			`from typing import Any`

			`# Try importing MinerU's Python API to avoid subprocess overhead.`
			`# The API surface has changed across versions, so we attempt several known entry points.`
			`try:`
			`from magic_pdf.pipe.UNIPipe import UNIPipe # noqa: F401`

			`HAS_PYTHON_API = True`
			`except ImportError:`
			`HAS_PYTHON_API = False`


			`def _get_peak_memory_bytes() -> int:`
			`"""Get peak memory usage in bytes using resource module."""`
			`usage = resource.getrusage(resource.RUSAGE_SELF)`
			`if platform.system() == "Linux":`
			`return usage.ru_maxrss * 1024`
			`return usage.ru_maxrss`


			`def _extract_via_cli(file_path: str, ocr_enabled: bool) -> str:`
			`"""Extract using MinerU CLI (fallback)."""`
			`cmd = ["mineru", "-p", file_path, "-b", "pipeline", "-d", "cpu"]`
			`if not ocr_enabled:`
			`cmd.extend(["--method", "txt"])`

			`with tempfile.TemporaryDirectory() as tmpdir:`
			`output_dir = Path(tmpdir) / "output"`
			`cmd.extend(["-o", str(output_dir)])`

			`result = subprocess.run(`
			`cmd,`
			`capture_output=True,`
			`text=True,`
			`check=False,`
			`)`

			`# Check for output files first — ONNX Runtime may emit warnings to`
			`# stderr even when extraction succeeds.`
			`md_files = list(output_dir.rglob("*.md"))`
			`if md_files:`
			`return md_files[0].read_text(encoding="utf-8")`

			`if result.returncode != 0:`
			`raise RuntimeError(f"MinerU extraction failed: {result.stderr}")`

			`raise RuntimeError("No markdown output found from MinerU")`


			`def _extract_via_api(file_path: str, ocr_enabled: bool) -> str:`
			`"""Extract using MinerU Python API (preferred, avoids subprocess overhead)."""`
			`# NOTE: The MinerU Python API is not yet stable. This is a best-effort attempt`
			`# using the UNIPipe interface. If this fails at runtime, the caller should`
			`# fall back to CLI extraction.`
			`from magic_pdf.pipe.UNIPipe import UNIPipe`
			`from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter`

			`pdf_bytes = Path(file_path).read_bytes()`

			`with tempfile.TemporaryDirectory() as tmpdir:`
			`writer = DiskReaderWriter(tmpdir)`
			`method = "ocr" if ocr_enabled else "txt"`
			`pipe = UNIPipe(pdf_bytes, {"_pdf_type": "", "model_list": []}, writer, method=method)`
			`pipe.pipe_classify()`
			`pipe.pipe_analyze()`
			`pipe.pipe_parse()`
			`md_content = pipe.pipe_mk_markdown(str(Path(file_path).stem), tmpdir)`
			`return md_content`


			`_MD_STRIP_RE = None`


			`def _strip_markdown(text: str) -> str:`
			`"""Best-effort markdown→plaintext pass. Drops syntax tokens; preserves text."""`
			`import re`

			`global _MD_STRIP_RE`
			`if _MD_STRIP_RE is None:`
			`_MD_STRIP_RE = [`
			`(re.compile(r"^#{1,6}\s+", re.MULTILINE), ""), # ATX headings`
			`(re.compile(r"^\s[-+]\s+", re.MULTILINE), ""), # bullet markers`
			`(re.compile(r"^\s*\d+\.\s+", re.MULTILINE), ""), # ordered list markers`
			`(re.compile(r"^>\s?", re.MULTILINE), ""), # blockquotes`
			(re.compile(r"```[a-zA-Z0-9_-]*\n?"), ""), # code fences
			(re.compile(r"`([^`]+)`"), r"\1"), # inline code
			`(re.compile(r"\\([^]+)\\*"), r"\1"), # bold`
			`(re.compile(r"\([^]+)\*"), r"\1"), # italic`
			`(re.compile(r"!\[([^\]])\]\([^)]\)"), r"\1"), # images`
			`(re.compile(r"\[([^\]]+)\]\([^)]*\)"), r"\1"), # links`
			`(re.compile(r"^\s\\|.\\|\s*$", re.MULTILINE), ""), # table rows (drop)`
			`]`
			`out = text`
			`for pattern, repl in _MD_STRIP_RE:`
			`out = pattern.sub(repl, out)`
			`return out`


			`def extract_sync(file_path: str, ocr_enabled: bool, output_format: str = "markdown") -> dict[str, Any]:`
			`"""Extract a single file using the best available method."""`
			`start = time.perf_counter()`

			`if HAS_PYTHON_API:`
			`try:`
			`markdown = _extract_via_api(file_path, ocr_enabled)`
			`except Exception:`
			`# Fall back to CLI if Python API fails at runtime`
			`markdown = _extract_via_cli(file_path, ocr_enabled)`
			`else:`
			`markdown = _extract_via_cli(file_path, ocr_enabled)`

			`content = _strip_markdown(markdown) if output_format == "plaintext" else markdown`
			`duration_ms = (time.perf_counter() - start) * 1000.0`

			`return {`
			`"content": content,`
			`"metadata": {"framework": "mineru", "output_format": output_format},`
			`"_extraction_time_ms": duration_ms,`
			`"_peak_memory_bytes": _get_peak_memory_bytes(),`
			`}`


			`def extract_batch(file_paths: list[str], ocr_enabled: bool, output_format: str = "markdown") -> list[dict[str, Any]]:`
			`"""Extract multiple files in sequence."""`
			`start = time.perf_counter()`

			`results = []`
			`for file_path in file_paths:`
			`try:`
			`payload = extract_sync(file_path, ocr_enabled, output_format)`
			`# Remove per-file timing; we'll replace with batch timing below`
			`payload.pop("_extraction_time_ms", None)`
			`results.append(payload)`
			`except Exception as e:`
			`results.append(`
			`{`
			`"content": "",`
			`"metadata": {`
			`"framework": "mineru",`
			`"error": str(e),`
			`},`
			`}`
			`)`

			`total_duration_ms = (time.perf_counter() - start) * 1000.0`
			`per_file_duration_ms = total_duration_ms / len(file_paths) if file_paths else 0`
			`peak_memory = _get_peak_memory_bytes()`

			`for result in results:`
			`result["_extraction_time_ms"] = per_file_duration_ms`
			`result["_batch_total_ms"] = total_duration_ms`
			`result["_peak_memory_bytes"] = peak_memory`

			`return results`


			`def _worker(fn, args, conn):`
			`"""Run extraction in a forked child process.`

			`Closes inherited stdin/stdout so the child cannot corrupt the`
			`parent's line-based JSON protocol.`
			`"""`
			`try:`
			`sys.stdin.close()`
			`sys.stdout = open(os.devnull, "w")`
			`except Exception:`
			`pass`
			`try:`
			`result = fn(*args)`
			`conn.send(result)`
			`except Exception as e:`
			`conn.send({"error": str(e), "_extraction_time_ms": 0})`
			`finally:`
			`conn.close()`


			`def _run_with_timeout(fn, args, timeout):`
			`"""Execute fn(*args) in a forked child with a timeout.`

			`On timeout the child is killed but the parent stays alive —`
			`no expensive process restart is needed.`
			`"""`
			`try:`
			`ctx = _mp.get_context("fork")`
			`parent_conn, child_conn = ctx.Pipe(duplex=False)`
			`p = ctx.Process(target=_worker, args=(fn, args, child_conn))`
			`p.start()`
			`child_conn.close()`

			`if parent_conn.poll(timeout=timeout):`
			`try:`
			`result = parent_conn.recv()`
			`except Exception:`
			`result = {"error": "worker process crashed", "_extraction_time_ms": 0}`
			`else:`
			`p.kill()`
			`result = {`
			`"error": f"extraction timed out after {timeout}s",`
			`"_extraction_time_ms": timeout * 1000.0,`
			`}`

			`p.join(timeout=5)`
			`if p.is_alive():`
			`p.kill()`
			`p.join()`
			`parent_conn.close()`
			`return result`
			`except Exception:`
			`# Fork not available — fall back to in-process extraction`
			`try:`
			`return fn(*args)`
			`except Exception as e:`
			`return {"error": str(e), "_extraction_time_ms": 0}`


			`def _parse_path(line: str) -> str:`
			`"""Parse a request line: JSON object with path field, or plain file path."""`
			`stripped = line.strip()`
			`if stripped.startswith("{"):`
			`try:`
			`return json.loads(stripped).get("path", "")`
			`except (json.JSONDecodeError, ValueError):`
			`pass`
			`return stripped`


			`def run_server(ocr_enabled: bool, output_format: str, timeout=None) -> None:`
			`"""Persistent server mode: read paths from stdin, write JSON to stdout."""`
			`print("READY", flush=True)`
			`for line in sys.stdin:`
			`file_path = _parse_path(line)`
			`if not file_path:`
			`continue`
			`if timeout is not None:`
			`result = _run_with_timeout(extract_sync, (file_path, ocr_enabled, output_format), timeout)`
			`else:`
			`try:`
			`result = extract_sync(file_path, ocr_enabled, output_format)`
			`except Exception as e:`
			`result = {"error": str(e), "_extraction_time_ms": 0}`
			`print(json.dumps(result), flush=True)`


			`def main() -> None:`
			`ocr_enabled = False`
			`timeout = None`
			`output_format = "markdown"`
			`args = []`
			`for arg in sys.argv[1:]:`
			`if arg == "--ocr":`
			`ocr_enabled = True`
			`elif arg == "--no-ocr":`
			`ocr_enabled = False`
			`elif arg.startswith("--timeout="):`
			`timeout = int(arg.split("=", 1)[1])`
			`elif arg.startswith("--format="):`
			`output_format = arg.split("=", 1)[1]`
			`else:`
			`args.append(arg)`

			`if output_format not in ("markdown", "plaintext"):`
			`print(f"Error: --format must be 'markdown' or 'plaintext'; got '{output_format}'", file=sys.stderr)`
			`sys.exit(64)`

			`if len(args) < 1:`
			`print(`
			`"Usage: mineru_extract.py [--ocr\|--no-ocr] [--timeout=SECS] [--format=markdown\|plaintext] <mode> <file_path> [additional_files...]",`
			`file=sys.stderr,`
			`)`
			`print("Modes: sync, batch, server", file=sys.stderr)`
			`sys.exit(1)`

			`mode = args[0]`
			`file_paths = args[1:]`

			`try:`
			`if mode == "server":`
			`run_server(ocr_enabled, output_format, timeout=timeout)`

			`elif mode == "sync":`
			`if len(file_paths) != 1:`
			`print("Error: sync mode requires exactly one file", file=sys.stderr)`
			`sys.exit(1)`
			`payload = extract_sync(file_paths[0], ocr_enabled, output_format)`
			`print(json.dumps(payload), end="")`

			`elif mode == "batch":`
			`if len(file_paths) < 1:`
			`print("Error: batch mode requires at least one file", file=sys.stderr)`
			`sys.exit(1)`

			`if len(file_paths) == 1:`
			`results = extract_batch(file_paths, ocr_enabled, output_format)`
			`print(json.dumps(results[0]), end="")`
			`else:`
			`results = extract_batch(file_paths, ocr_enabled, output_format)`
			`print(json.dumps(results), end="")`

			`else:`
			`print(f"Error: Unknown mode '{mode}'. Use sync, batch, or server", file=sys.stderr)`
			`sys.exit(1)`

			`except Exception as e:`
			`print(f"Error extracting with MinerU: {e}", file=sys.stderr)`
			`sys.exit(1)`


			`if __name__ == "__main__":`
			`main()`