# Benchmarks workflow (manually dispatched).
#
# Job graph:
#   setup                  Builds the benchmark harness and kreuzberg-cli once and
#                          uploads the binaries as artifacts.
#   bench-rust             Runs kreuzberg across pipeline x output format x mode.
#   bench-<tool>           Runs each third-party framework (sharded where
#                          configured) once bench-rust has finished.
#   aggregate-and-publish  Downloads every result artifact, consolidates and
#                          validates the data, then runs the release step.
name: Benchmarks

on:
  workflow_dispatch:
    inputs:
      branch:
        description: "Git branch to benchmark"
        required: false
        default: "main"
        type: string
      timeout:
        description: "Timeout per document in seconds"
        required: false
        default: "900"
        type: string

env:
  ITERATIONS: "3"
  TIKA_VERSION: "3.2.3"
  ORT_VERSION: "1.24.2"
  CARGO_TERM_COLOR: always
  CARGO_INCREMENTAL: "0"
  CARGO_PROFILE_DEV_DEBUG: "0"
  RUST_BACKTRACE: short
  RUST_MIN_STACK: "16777216"
  RUSTFLAGS: "-C strip=symbols"
  MEASURE_QUALITY: "true"
  OCR_ENABLED: "true"
  RUN_OCR_BENCHMARKS: "true"
  FORCE_JAVASCRIPT_ACTIONS_TO_NODE24: "true"
  GROUND_TRUTH_DIR: "test_documents/ground_truth"

# A newer run on the same ref cancels the one still in progress.
concurrency:
  group: benchmarks-${{ github.ref }}
  cancel-in-progress: true

permissions:
  contents: read

defaults:
  run:
    shell: bash

jobs:
  # Builds the harness and CLI in release mode and publishes two artifacts:
  # "benchmarks-target" (harness + kreuzberg, used by bench-rust) and
  # "benchmark-harness-binary" (harness only, used by the third-party jobs).
  setup:
    name: Build harness + native libs
    runs-on: runner-medium-arm64
    timeout-minutes: 360
    permissions:
      contents: read
    outputs:
      artifact-name: benchmarks-target
    steps:
      - uses: actions/checkout@v6
        with:
          ref: ${{ github.event.inputs.branch || github.ref }}
          submodules: recursive

      - name: Free disk space
        uses: kreuzberg-dev/actions/free-disk-space-linux@v1

      - name: Ensure benchmark harness exists
        run: scripts/benchmarks/ensure-benchmark-harness-exists.sh

      - name: Install system dependencies
        uses: ./.github/actions/install-system-deps

      - name: Setup OpenSSL
        uses: kreuzberg-dev/actions/setup-openssl@v1

      - name: Setup Rust
        uses: kreuzberg-dev/actions/setup-rust@v1
        with:
          cache-key-prefix: benchmarks-setup
          use-sccache: "true"
          report-stats: "true"

      - name: Setup ONNX Runtime
        uses: ./.github/actions/setup-onnx-runtime
        with:
          ort-version: ${{ env.ORT_VERSION }}

      - name: Setup Layout Models
        uses: ./.github/actions/setup-layout-models

      - name: Install Task
        uses: kreuzberg-dev/actions/install-task@v1

      - name: Cache benchmark harness
        uses: ./.github/actions/cache-benchmark-harness
        with:
          build-profile: release

      - name: Build kreuzberg-cli (release, all features)
        run: cargo build --release -p kreuzberg-cli --features all

      - name: Validate ground truth
        run: cargo run --release -p benchmark-harness -- validate-gt --fixtures tools/benchmark-harness/fixtures/

      - name: Log disk space before artifact upload
        run: scripts/ci/validate/show-disk-space.sh "Disk space before artifact upload"

      - name: Upload build artifacts (harness binary + kreuzberg-cli)
        uses: actions/upload-artifact@v7
        with:
          name: benchmarks-target
          path: |
            target/release/benchmark-harness
            target/release/kreuzberg
          retention-days: 7
          if-no-files-found: warn

      - name: Upload benchmark harness binary (for third-party jobs)
        uses: actions/upload-artifact@v7
        with:
          name: benchmark-harness-binary
          path: |
            target/release/benchmark-harness
          retention-days: 7

      - name: Log disk space after artifact upload
        run: scripts/ci/validate/show-disk-space.sh "Disk space after artifact upload"

  # kreuzberg itself: 3 pipelines x 2 output formats x 2 modes.
  bench-rust:
    name: bench-rust (${{ matrix.pipeline }}, ${{ matrix.output_format }}, ${{ matrix.mode }})
    needs: setup
    runs-on: runner-medium-arm64
    timeout-minutes: 360
    permissions:
      contents: read
    strategy:
      fail-fast: false
      matrix:
        pipeline: [baseline, layout, paddle-ocr]
        output_format: [markdown, plaintext]
        mode: [single-file, batch]
    steps:
      - uses: actions/checkout@v6
        with:
          ref: ${{ github.event.inputs.branch || github.ref }}
          submodules: recursive

      - name: Free disk space
        uses: kreuzberg-dev/actions/free-disk-space-linux@v1

      - name: Install system dependencies
        uses: ./.github/actions/install-system-deps

      - name: Setup OpenSSL
        uses: kreuzberg-dev/actions/setup-openssl@v1

      - name: Setup Rust toolchain
        uses: kreuzberg-dev/actions/setup-rust@v1
        with:
          cache-key-prefix: benchmarks-rust-${{ matrix.pipeline }}
          use-sccache: "true"
          report-stats: "true"

      - name: Download build artifacts
        uses: actions/download-artifact@v8
        with:
          name: ${{ needs.setup.outputs.artifact-name }}
          path: target/release

      - name: Restore benchmark binary permissions
        run: scripts/benchmarks/restore-binary-permissions.sh

      - name: Setup ONNX Runtime
        uses: ./.github/actions/setup-onnx-runtime
        with:
          ort-version: ${{ env.ORT_VERSION }}

      - name: Setup Layout Models
        if: matrix.pipeline == 'layout'
        uses: ./.github/actions/setup-layout-models

      - name: Setup PaddleOCR models
        if: matrix.pipeline == 'paddle-ocr'
        uses: ./.github/actions/setup-paddle-ocr-models

      - name: Run benchmark
        env:
          FRAMEWORK: kreuzberg-${{ matrix.output_format }}-${{ matrix.pipeline }}
          MODE: ${{ matrix.mode }}
          OUTPUT_FORMAT: ${{ matrix.output_format }}
          ITERATIONS: ${{ env.ITERATIONS }}
          TIMEOUT: ${{ github.event.inputs.timeout }}
        run: scripts/benchmarks/run-benchmark.sh

      - name: Upload artifacts
        if: always()
        uses: actions/upload-artifact@v7
        with:
          name: benchmarks-rust-${{ matrix.pipeline }}-${{ matrix.output_format }}-${{ matrix.mode }}-${{ github.run_id }}
          path: benchmark-results/
          retention-days: 30

  # ---------------------------------------------------------------------------
  # Third-party frameworks.
  #
  # Each job waits for bench-rust but still runs if bench-rust failed
  # (`!cancelled()`). Because a status-check function in `if:` replaces the
  # implicit `success()` check, the jobs also require `setup` to have succeeded
  # explicitly: they download the harness binary that `setup` uploads, so
  # starting them without it would only burn runner time before failing.
  # ---------------------------------------------------------------------------
  bench-docling:
    name: docling (${{ matrix.output_format }}, ${{ matrix.mode }}, shard ${{ matrix.shard }}/3)
    needs: [setup, bench-rust]
    if: ${{ !cancelled() && needs.setup.result == 'success' }}
    runs-on: runner-medium-arm64
    timeout-minutes: 360
    permissions:
      contents: read
    strategy:
      fail-fast: true
      matrix:
        mode: [single-file]
        shard: [1, 2, 3]
        output_format: [markdown, plaintext]
    steps:
      - uses: actions/checkout@v6
        with:
          ref: ${{ github.event.inputs.branch || github.ref }}
          submodules: recursive

      - name: Free disk space
        uses: kreuzberg-dev/actions/free-disk-space-linux@v1

      - name: Download benchmark harness binary
        uses: actions/download-artifact@v8
        with:
          name: benchmark-harness-binary
          path: target/release

      - name: Restore benchmark binary permissions
        run: scripts/benchmarks/restore-binary-permissions.sh

      - name: Setup ONNX Runtime
        uses: ./.github/actions/setup-onnx-runtime
        with:
          ort-version: ${{ env.ORT_VERSION }}

      - name: Setup Python
        uses: kreuzberg-dev/actions/setup-python-env@v1
        with:
          python-version: "3.11"
          cache-prefix: benchmark-docling
          install-command: "uv sync --no-install-project --no-install-workspace --group dev --group bench-docling"

      - name: Install system dependencies
        run: |
          sudo apt-get update
          sudo apt-get install -y libgl1 libglib2.0-0 tesseract-ocr tesseract-ocr-eng

      - name: Run benchmark
        env:
          FRAMEWORK: docling
          MODE: ${{ matrix.mode }}
          OUTPUT_FORMAT: ${{ matrix.output_format }}
          ITERATIONS: ${{ env.ITERATIONS }}
          TIMEOUT: ${{ github.event.inputs.timeout }}
          SHARD: ${{ matrix.shard }}/3
          UV_NO_SYNC: "1"
        run: scripts/benchmarks/run-benchmark.sh

      - name: Upload artifacts
        if: always()
        uses: actions/upload-artifact@v7
        with:
          name: benchmarks-docling-${{ matrix.output_format }}-${{ matrix.mode }}-shard${{ matrix.shard }}-${{ github.run_id }}
          path: benchmark-results/
          retention-days: 30

  bench-markitdown:
    name: markitdown (${{ matrix.output_format }}, ${{ matrix.mode }}, shard ${{ matrix.shard }}/3)
    needs: [setup, bench-rust]
    if: ${{ !cancelled() && needs.setup.result == 'success' }}
    runs-on: runner-medium-arm64
    timeout-minutes: 360
    permissions:
      contents: read
    strategy:
      fail-fast: true
      matrix:
        mode: [single-file]
        shard: [1, 2, 3]
        output_format: [markdown, plaintext]
        # Only the markdown combinations remain after this exclusion.
        exclude:
          - output_format: plaintext
    steps:
      - uses: actions/checkout@v6
        with:
          ref: ${{ github.event.inputs.branch || github.ref }}
          submodules: recursive

      - name: Free disk space
        uses: kreuzberg-dev/actions/free-disk-space-linux@v1

      - name: Download benchmark harness binary
        uses: actions/download-artifact@v8
        with:
          name: benchmark-harness-binary
          path: target/release

      - name: Restore benchmark binary permissions
        run: scripts/benchmarks/restore-binary-permissions.sh

      - name: Setup ONNX Runtime
        uses: ./.github/actions/setup-onnx-runtime
        with:
          ort-version: ${{ env.ORT_VERSION }}

      - name: Setup Python
        uses: kreuzberg-dev/actions/setup-python-env@v1
        with:
          python-version: "3.11"
          cache-prefix: benchmark-markitdown
          install-command: "uv sync --no-install-project --no-install-workspace --group dev --group bench-markitdown"

      - name: Run benchmark
        env:
          FRAMEWORK: markitdown
          MODE: ${{ matrix.mode }}
          OUTPUT_FORMAT: ${{ matrix.output_format }}
          ITERATIONS: ${{ env.ITERATIONS }}
          TIMEOUT: ${{ github.event.inputs.timeout }}
          OCR_ENABLED: "false"
          SHARD: ${{ matrix.shard }}/3
          UV_NO_SYNC: "1"
        run: scripts/benchmarks/run-benchmark.sh

      - name: Upload artifacts
        if: always()
        uses: actions/upload-artifact@v7
        with:
          name: benchmarks-markitdown-${{ matrix.output_format }}-${{ matrix.mode }}-shard${{ matrix.shard }}-${{ github.run_id }}
          path: benchmark-results/
          retention-days: 30

  bench-pandoc:
    name: pandoc (${{ matrix.output_format }}, ${{ matrix.mode }})
    needs: [setup, bench-rust]
    if: ${{ !cancelled() && needs.setup.result == 'success' }}
    runs-on: runner-medium-arm64
    timeout-minutes: 360
    permissions:
      contents: read
    strategy:
      fail-fast: true
      matrix:
        mode: [single-file]
        output_format: [markdown, plaintext]
    steps:
      - uses: actions/checkout@v6
        with:
          ref: ${{ github.event.inputs.branch || github.ref }}
          submodules: recursive

      - name: Free disk space
        uses: kreuzberg-dev/actions/free-disk-space-linux@v1

      - name: Download benchmark harness binary
        uses: actions/download-artifact@v8
        with:
          name: benchmark-harness-binary
          path: target/release

      - name: Restore benchmark binary permissions
        run: scripts/benchmarks/restore-binary-permissions.sh

      - name: Setup ONNX Runtime
        uses: ./.github/actions/setup-onnx-runtime
        with:
          ort-version: ${{ env.ORT_VERSION }}

      - name: Setup Python
        uses: kreuzberg-dev/actions/setup-python-env@v1
        with:
          python-version: "3.11"
          cache-prefix: benchmark-pandoc
          install-command: "uv sync --no-install-project --no-install-workspace --group dev"

      - name: Install pandoc
        run: |
          sudo apt-get update
          sudo apt-get install -y pandoc
          pandoc --version

      - name: Run benchmark
        env:
          FRAMEWORK: pandoc
          MODE: ${{ matrix.mode }}
          OUTPUT_FORMAT: ${{ matrix.output_format }}
          ITERATIONS: ${{ env.ITERATIONS }}
          TIMEOUT: ${{ github.event.inputs.timeout }}
          OCR_ENABLED: "false"
          UV_NO_SYNC: "1"
        run: scripts/benchmarks/run-benchmark.sh

      - name: Upload artifacts
        if: always()
        uses: actions/upload-artifact@v7
        with:
          name: benchmarks-pandoc-${{ matrix.output_format }}-${{ matrix.mode }}-${{ github.run_id }}
          path: benchmark-results/
          retention-days: 30

  bench-unstructured:
    name: unstructured (${{ matrix.output_format }}, ${{ matrix.mode }}, shard ${{ matrix.shard }}/4)
    needs: [setup, bench-rust]
    if: ${{ !cancelled() && needs.setup.result == 'success' }}
    runs-on: runner-medium-arm64
    timeout-minutes: 360
    permissions:
      contents: read
    strategy:
      fail-fast: true
      matrix:
        mode: [single-file]
        shard: [1, 2, 3, 4]
        output_format: [markdown, plaintext]
    steps:
      - uses: actions/checkout@v6
        with:
          ref: ${{ github.event.inputs.branch || github.ref }}
          submodules: recursive

      - name: Free disk space
        uses: kreuzberg-dev/actions/free-disk-space-linux@v1

      - name: Download benchmark harness binary
        uses: actions/download-artifact@v8
        with:
          name: benchmark-harness-binary
          path: target/release

      - name: Restore benchmark binary permissions
        run: scripts/benchmarks/restore-binary-permissions.sh

      - name: Setup ONNX Runtime
        uses: ./.github/actions/setup-onnx-runtime
        with:
          ort-version: ${{ env.ORT_VERSION }}

      - name: Install system dependencies
        run: |
          sudo apt-get update
          sudo apt-get install -y tesseract-ocr tesseract-ocr-eng libmagic-dev poppler-utils libreoffice pandoc

      - name: Setup Python
        uses: kreuzberg-dev/actions/setup-python-env@v1
        with:
          python-version: "3.11"
          cache-prefix: benchmark-unstructured
          install-command: "uv sync --no-install-project --no-install-workspace --group dev --group bench-unstructured"

      - name: Run benchmark
        env:
          FRAMEWORK: unstructured
          MODE: ${{ matrix.mode }}
          OUTPUT_FORMAT: ${{ matrix.output_format }}
          ITERATIONS: ${{ env.ITERATIONS }}
          TIMEOUT: ${{ github.event.inputs.timeout }}
          SHARD: ${{ matrix.shard }}/4
          UV_NO_SYNC: "1"
        run: scripts/benchmarks/run-benchmark.sh

      - name: Upload artifacts
        if: always()
        uses: actions/upload-artifact@v7
        with:
          name: benchmarks-unstructured-${{ matrix.output_format }}-${{ matrix.mode }}-shard${{ matrix.shard }}-${{ github.run_id }}
          path: benchmark-results/
          retention-days: 30

  bench-tika:
    name: tika (${{ matrix.output_format }}, ${{ matrix.mode }}, shard ${{ matrix.shard }}/3)
    needs: [setup, bench-rust]
    if: ${{ !cancelled() && needs.setup.result == 'success' }}
    runs-on: runner-medium-arm64
    timeout-minutes: 360
    permissions:
      contents: read
    strategy:
      fail-fast: true
      matrix:
        mode: [single-file]
        shard: [1, 2, 3]
        output_format: [markdown, plaintext]
        # Only the plaintext combinations remain after this exclusion.
        exclude:
          - output_format: markdown
    steps:
      - uses: actions/checkout@v6
        with:
          ref: ${{ github.event.inputs.branch || github.ref }}
          submodules: recursive

      - name: Free disk space
        uses: kreuzberg-dev/actions/free-disk-space-linux@v1

      - name: Download benchmark harness binary
        uses: actions/download-artifact@v8
        with:
          name: benchmark-harness-binary
          path: target/release

      - name: Restore benchmark binary permissions
        run: scripts/benchmarks/restore-binary-permissions.sh

      - name: Setup ONNX Runtime
        uses: ./.github/actions/setup-onnx-runtime
        with:
          ort-version: ${{ env.ORT_VERSION }}

      - name: Setup Java
        uses: actions/setup-java@v5
        id: setup-java
        with:
          distribution: "temurin"
          java-version: "25"

      - name: Download Apache Tika
        # TIKA_VERSION comes from the workflow-level env, so the script reads it
        # as a shell variable instead of interpolating an expression into code.
        run: |
          mkdir -p tools/benchmark-harness/libs
          curl -fsSL --retry 5 --retry-delay 5 -o "tools/benchmark-harness/libs/tika-app-${TIKA_VERSION}.jar" \
            "https://repo1.maven.org/maven2/org/apache/tika/tika-app/${TIKA_VERSION}/tika-app-${TIKA_VERSION}.jar"

      - name: Run benchmark
        env:
          FRAMEWORK: tika
          MODE: ${{ matrix.mode }}
          OUTPUT_FORMAT: ${{ matrix.output_format }}
          ITERATIONS: ${{ env.ITERATIONS }}
          TIMEOUT: ${{ github.event.inputs.timeout }}
          SHARD: ${{ matrix.shard }}/3
        run: scripts/benchmarks/run-benchmark.sh

      - name: Upload artifacts
        if: always()
        uses: actions/upload-artifact@v7
        with:
          name: benchmarks-tika-${{ matrix.output_format }}-${{ matrix.mode }}-shard${{ matrix.shard }}-${{ github.run_id }}
          path: benchmark-results/
          retention-days: 30

  bench-pymupdf4llm:
    name: pymupdf4llm (${{ matrix.output_format }}, ${{ matrix.mode }}, shard ${{ matrix.shard }}/3)
    needs: [setup, bench-rust]
    if: ${{ !cancelled() && needs.setup.result == 'success' }}
    runs-on: runner-medium-arm64
    timeout-minutes: 360
    permissions:
      contents: read
    strategy:
      fail-fast: true
      matrix:
        mode: [single-file]
        shard: [1, 2, 3]
        output_format: [markdown, plaintext]
        # Only the markdown combinations remain after this exclusion.
        exclude:
          - output_format: plaintext
    steps:
      - uses: actions/checkout@v6
        with:
          ref: ${{ github.event.inputs.branch || github.ref }}
          submodules: recursive

      - name: Free disk space
        uses: kreuzberg-dev/actions/free-disk-space-linux@v1

      - name: Download benchmark harness binary
        uses: actions/download-artifact@v8
        with:
          name: benchmark-harness-binary
          path: target/release

      - name: Restore benchmark binary permissions
        run: scripts/benchmarks/restore-binary-permissions.sh

      - name: Setup ONNX Runtime
        uses: ./.github/actions/setup-onnx-runtime
        with:
          ort-version: ${{ env.ORT_VERSION }}

      - name: Install system dependencies
        run: |
          sudo apt-get update
          sudo apt-get install -y tesseract-ocr tesseract-ocr-eng libjpeg-dev libpng-dev libtiff-dev

      - name: Setup Python
        uses: kreuzberg-dev/actions/setup-python-env@v1
        with:
          python-version: "3.11"
          cache-prefix: benchmark-pymupdf4llm
          install-command: "uv sync --no-install-project --no-install-workspace --group dev --group bench-pymupdf4llm"

      - name: Run benchmark
        env:
          FRAMEWORK: pymupdf4llm
          MODE: ${{ matrix.mode }}
          OUTPUT_FORMAT: ${{ matrix.output_format }}
          ITERATIONS: ${{ env.ITERATIONS }}
          TIMEOUT: ${{ github.event.inputs.timeout }}
          TESSDATA_PREFIX: /usr/share/tesseract-ocr/5/tessdata
          SHARD: ${{ matrix.shard }}/3
          UV_NO_SYNC: "1"
        run: scripts/benchmarks/run-benchmark.sh

      - name: Upload artifacts
        if: always()
        uses: actions/upload-artifact@v7
        with:
          name: benchmarks-pymupdf4llm-${{ matrix.output_format }}-${{ matrix.mode }}-shard${{ matrix.shard }}-${{ github.run_id }}
          path: benchmark-results/
          retention-days: 30

  bench-pdfplumber:
    name: pdfplumber (${{ matrix.output_format }}, ${{ matrix.mode }})
    needs: [setup, bench-rust]
    if: ${{ !cancelled() && needs.setup.result == 'success' }}
    runs-on: runner-medium-arm64
    timeout-minutes: 360
    permissions:
      contents: read
    strategy:
      fail-fast: true
      matrix:
        mode: [single-file]
        output_format: [markdown, plaintext]
        # Only the plaintext combination remains after this exclusion.
        exclude:
          - output_format: markdown
    steps:
      - uses: actions/checkout@v6
        with:
          ref: ${{ github.event.inputs.branch || github.ref }}
          submodules: recursive

      - name: Free disk space
        uses: kreuzberg-dev/actions/free-disk-space-linux@v1

      - name: Download benchmark harness binary
        uses: actions/download-artifact@v8
        with:
          name: benchmark-harness-binary
          path: target/release

      - name: Restore benchmark binary permissions
        run: scripts/benchmarks/restore-binary-permissions.sh

      - name: Setup ONNX Runtime
        uses: ./.github/actions/setup-onnx-runtime
        with:
          ort-version: ${{ env.ORT_VERSION }}

      - name: Setup Python
        uses: kreuzberg-dev/actions/setup-python-env@v1
        with:
          python-version: "3.11"
          cache-prefix: benchmark-pdfplumber
          install-command: "uv sync --no-install-project --no-install-workspace --group dev --group bench-pdfplumber"

      - name: Run benchmark
        env:
          FRAMEWORK: pdfplumber
          MODE: ${{ matrix.mode }}
          OUTPUT_FORMAT: ${{ matrix.output_format }}
          ITERATIONS: ${{ env.ITERATIONS }}
          TIMEOUT: ${{ github.event.inputs.timeout }}
          OCR_ENABLED: "false"
          UV_NO_SYNC: "1"
        run: scripts/benchmarks/run-benchmark.sh

      - name: Upload artifacts
        if: always()
        uses: actions/upload-artifact@v7
        with:
          name: benchmarks-pdfplumber-${{ matrix.output_format }}-${{ matrix.mode }}-${{ github.run_id }}
          path: benchmark-results/
          retention-days: 30

  bench-mineru:
    name: mineru (${{ matrix.output_format }}, ${{ matrix.mode }}, shard ${{ matrix.shard }}/3)
    needs: [setup, bench-rust]
    if: ${{ !cancelled() && needs.setup.result == 'success' }}
    runs-on: runner-medium-arm64
    timeout-minutes: 360
    permissions:
      contents: read
    strategy:
      fail-fast: true
      matrix:
        mode: [single-file]
        shard: [1, 2, 3]
        output_format: [markdown, plaintext]
    steps:
      - uses: actions/checkout@v6
        with:
          ref: ${{ github.event.inputs.branch || github.ref }}
          submodules: recursive

      - name: Free disk space
        uses: kreuzberg-dev/actions/free-disk-space-linux@v1

      - name: Download benchmark harness binary
        uses: actions/download-artifact@v8
        with:
          name: benchmark-harness-binary
          path: target/release

      - name: Restore benchmark binary permissions
        run: scripts/benchmarks/restore-binary-permissions.sh

      - name: Setup ONNX Runtime
        uses: ./.github/actions/setup-onnx-runtime
        with:
          ort-version: ${{ env.ORT_VERSION }}

      - name: Setup Python
        uses: kreuzberg-dev/actions/setup-python-env@v1
        with:
          python-version: "3.11"
          cache-prefix: benchmark-mineru
          install-command: "uv sync --no-install-project --no-install-workspace --group dev --group bench-mineru"

      - name: Install system dependencies
        run: |
          sudo apt-get update
          sudo apt-get install -y libgl1 libglib2.0-0

      - name: Run benchmark
        env:
          FRAMEWORK: mineru
          MODE: ${{ matrix.mode }}
          OUTPUT_FORMAT: ${{ matrix.output_format }}
          ITERATIONS: ${{ env.ITERATIONS }}
          TIMEOUT: ${{ github.event.inputs.timeout }}
          SHARD: ${{ matrix.shard }}/3
          UV_NO_SYNC: "1"
        run: scripts/benchmarks/run-benchmark.sh

      - name: Upload artifacts
        if: always()
        uses: actions/upload-artifact@v7
        with:
          name: benchmarks-mineru-${{ matrix.output_format }}-${{ matrix.mode }}-shard${{ matrix.shard }}-${{ github.run_id }}
          path: benchmark-results/
          retention-days: 30

  bench-pypdf:
    name: pypdf (${{ matrix.output_format }}, ${{ matrix.mode }})
    needs: [setup, bench-rust]
    if: ${{ !cancelled() && needs.setup.result == 'success' }}
    runs-on: runner-medium-arm64
    timeout-minutes: 360
    permissions:
      contents: read
    strategy:
      fail-fast: true
      matrix:
        mode: [single-file]
        output_format: [markdown, plaintext]
        # Only the plaintext combination remains after this exclusion.
        exclude:
          - output_format: markdown
    steps:
      - uses: actions/checkout@v6
        with:
          ref: ${{ github.event.inputs.branch || github.ref }}
          submodules: recursive

      - name: Free disk space
        uses: kreuzberg-dev/actions/free-disk-space-linux@v1

      - name: Download benchmark harness binary
        uses: actions/download-artifact@v8
        with:
          name: benchmark-harness-binary
          path: target/release

      - name: Restore benchmark binary permissions
        run: scripts/benchmarks/restore-binary-permissions.sh

      - name: Setup ONNX Runtime
        uses: ./.github/actions/setup-onnx-runtime
        with:
          ort-version: ${{ env.ORT_VERSION }}

      - name: Setup Python
        uses: kreuzberg-dev/actions/setup-python-env@v1
        with:
          python-version: "3.11"
          cache-prefix: benchmark-pypdf
          install-command: "uv sync --no-install-project --no-install-workspace --group dev --group bench-pypdf"

      - name: Run benchmark
        env:
          FRAMEWORK: pypdf
          MODE: ${{ matrix.mode }}
          OUTPUT_FORMAT: ${{ matrix.output_format }}
          ITERATIONS: ${{ env.ITERATIONS }}
          TIMEOUT: ${{ github.event.inputs.timeout }}
          OCR_ENABLED: "false"
          UV_NO_SYNC: "1"
        run: scripts/benchmarks/run-benchmark.sh

      - name: Upload artifacts
        if: always()
        uses: actions/upload-artifact@v7
        with:
          name: benchmarks-pypdf-${{ matrix.output_format }}-${{ matrix.mode }}-${{ github.run_id }}
          path: benchmark-results/
          retention-days: 30

  bench-pdfminer:
    name: pdfminer (${{ matrix.output_format }}, ${{ matrix.mode }})
    needs: [setup, bench-rust]
    if: ${{ !cancelled() && needs.setup.result == 'success' }}
    runs-on: runner-medium-arm64
    timeout-minutes: 360
    permissions:
      contents: read
    strategy:
      fail-fast: true
      matrix:
        mode: [single-file]
        output_format: [markdown, plaintext]
        # Only the plaintext combination remains after this exclusion.
        exclude:
          - output_format: markdown
    steps:
      - uses: actions/checkout@v6
        with:
          ref: ${{ github.event.inputs.branch || github.ref }}
          submodules: recursive

      - name: Free disk space
        uses: kreuzberg-dev/actions/free-disk-space-linux@v1

      - name: Download benchmark harness binary
        uses: actions/download-artifact@v8
        with:
          name: benchmark-harness-binary
          path: target/release

      - name: Restore benchmark binary permissions
        run: scripts/benchmarks/restore-binary-permissions.sh

      - name: Setup ONNX Runtime
        uses: ./.github/actions/setup-onnx-runtime
        with:
          ort-version: ${{ env.ORT_VERSION }}

      - name: Setup Python
        uses: kreuzberg-dev/actions/setup-python-env@v1
        with:
          python-version: "3.11"
          cache-prefix: benchmark-pdfminer
          install-command: "uv sync --no-install-project --no-install-workspace --group dev --group bench-pdfminer"

      - name: Run benchmark
        env:
          FRAMEWORK: pdfminer
          MODE: ${{ matrix.mode }}
          OUTPUT_FORMAT: ${{ matrix.output_format }}
          ITERATIONS: ${{ env.ITERATIONS }}
          TIMEOUT: ${{ github.event.inputs.timeout }}
          OCR_ENABLED: "false"
          UV_NO_SYNC: "1"
        run: scripts/benchmarks/run-benchmark.sh

      - name: Upload artifacts
        if: always()
        uses: actions/upload-artifact@v7
        with:
          name: benchmarks-pdfminer-${{ matrix.output_format }}-${{ matrix.mode }}-${{ github.run_id }}
          path: benchmark-results/
          retention-days: 30

  bench-pdftotext:
    name: pdftotext (${{ matrix.output_format }}, ${{ matrix.mode }})
    needs: [setup, bench-rust]
    if: ${{ !cancelled() && needs.setup.result == 'success' }}
    runs-on: runner-medium-arm64
    timeout-minutes: 360
    permissions:
      contents: read
    strategy:
      fail-fast: true
      matrix:
        mode: [single-file]
        output_format: [markdown, plaintext]
        # Only the plaintext combination remains after this exclusion.
        exclude:
          - output_format: markdown
    steps:
      - uses: actions/checkout@v6
        with:
          ref: ${{ github.event.inputs.branch || github.ref }}
          submodules: recursive

      - name: Free disk space
        uses: kreuzberg-dev/actions/free-disk-space-linux@v1

      - name: Download benchmark harness binary
        uses: actions/download-artifact@v8
        with:
          name: benchmark-harness-binary
          path: target/release

      - name: Restore benchmark binary permissions
        run: scripts/benchmarks/restore-binary-permissions.sh

      - name: Setup ONNX Runtime
        uses: ./.github/actions/setup-onnx-runtime
        with:
          ort-version: ${{ env.ORT_VERSION }}

      - name: Install poppler
        run: |
          sudo apt-get update
          sudo apt-get install -y libpoppler-cpp-dev

      - name: Setup Python
        uses: kreuzberg-dev/actions/setup-python-env@v1
        with:
          python-version: "3.11"
          cache-prefix: benchmark-pdftotext
          install-command: "uv sync --no-install-project --no-install-workspace --group dev --group bench-pdftotext"

      - name: Run benchmark
        env:
          FRAMEWORK: pdftotext
          MODE: ${{ matrix.mode }}
          OUTPUT_FORMAT: ${{ matrix.output_format }}
          ITERATIONS: ${{ env.ITERATIONS }}
          TIMEOUT: ${{ github.event.inputs.timeout }}
          OCR_ENABLED: "false"
          UV_NO_SYNC: "1"
        run: scripts/benchmarks/run-benchmark.sh

      - name: Upload artifacts
        if: always()
        uses: actions/upload-artifact@v7
        with:
          name: benchmarks-pdftotext-${{ matrix.output_format }}-${{ matrix.mode }}-${{ github.run_id }}
          path: benchmark-results/
          retention-days: 30

  bench-playa-pdf:
    name: playa-pdf (${{ matrix.output_format }}, ${{ matrix.mode }})
    needs: [setup, bench-rust]
    if: ${{ !cancelled() && needs.setup.result == 'success' }}
    runs-on: runner-medium-arm64
    timeout-minutes: 360
    permissions:
      contents: read
    strategy:
      fail-fast: true
      matrix:
        mode: [single-file]
        output_format: [markdown, plaintext]
        # Only the plaintext combination remains after this exclusion.
        exclude:
          - output_format: markdown
    steps:
      - uses: actions/checkout@v6
        with:
          ref: ${{ github.event.inputs.branch || github.ref }}
          submodules: recursive

      - name: Free disk space
        uses: kreuzberg-dev/actions/free-disk-space-linux@v1

      - name: Download benchmark harness binary
        uses: actions/download-artifact@v8
        with:
          name: benchmark-harness-binary
          path: target/release

      - name: Restore benchmark binary permissions
        run: scripts/benchmarks/restore-binary-permissions.sh

      - name: Setup ONNX Runtime
        uses: ./.github/actions/setup-onnx-runtime
        with:
          ort-version: ${{ env.ORT_VERSION }}

      - name: Setup Python
        uses: kreuzberg-dev/actions/setup-python-env@v1
        with:
          python-version: "3.11"
          cache-prefix: benchmark-playa-pdf
          install-command: "uv sync --no-install-project --no-install-workspace --group dev --group bench-playa-pdf"

      - name: Run benchmark
        env:
          FRAMEWORK: playa-pdf
          MODE: ${{ matrix.mode }}
          OUTPUT_FORMAT: ${{ matrix.output_format }}
          ITERATIONS: ${{ env.ITERATIONS }}
          TIMEOUT: ${{ github.event.inputs.timeout }}
          OCR_ENABLED: "false"
          UV_NO_SYNC: "1"
        run: scripts/benchmarks/run-benchmark.sh

      - name: Upload artifacts
        if: always()
        uses: actions/upload-artifact@v7
        with:
          name: benchmarks-playa-pdf-${{ matrix.output_format }}-${{ matrix.mode }}-${{ github.run_id }}
          path: benchmark-results/
          retention-days: 30

  # Collects every "benchmarks-*" artifact, consolidates the results with the
  # harness, validates the aggregated JSON and then runs the release step.
  # Runs even when some benchmark jobs failed (`!cancelled()`), publishing
  # whatever data is available.
  aggregate-and-publish:
    name: Aggregate & Release Results
    needs:
      - bench-rust
      - bench-docling
      - bench-markitdown
      - bench-pandoc
      - bench-unstructured
      - bench-tika
      - bench-pymupdf4llm
      - bench-pdfplumber
      - bench-mineru
      - bench-pypdf
      - bench-pdfminer
      - bench-pdftotext
      - bench-playa-pdf
    runs-on: ubuntu-24.04-arm
    if: ${{ !cancelled() && (github.ref == 'refs/heads/main' || github.event_name == 'workflow_dispatch') }}
    permissions:
      contents: write
    steps:
      - uses: actions/checkout@v6
        with:
          submodules: recursive

      - name: Setup Rust
        uses: kreuzberg-dev/actions/setup-rust@v1
        with:
          cache-key-prefix: aggregate

      - name: Download all benchmark artifacts
        uses: actions/download-artifact@v8
        with:
          pattern: "benchmarks-*"
          path: benchmark-artifacts/
          merge-multiple: false

      - name: Validate artifacts before consolidation
        id: validate-artifacts
        run: |
          set -euo pipefail

          echo "=== Validating benchmark artifacts ==="

          # Check if benchmark-artifacts directory exists and is not empty
          if [[ ! -d "benchmark-artifacts" ]]; then
            echo "WARNING: benchmark-artifacts directory does not exist"
            echo "This may indicate that no benchmarks completed successfully (e.g., setup job was cancelled)"
            echo "has-artifacts=false" >> "$GITHUB_OUTPUT"
            exit 0
          fi

          # The setup job's build artifact ("benchmarks-target") also matches the
          # "benchmarks-*" download pattern. It contains binaries, not results, so
          # drop it: otherwise it is counted as a result directory and handed to
          # the consolidation step even when no benchmark produced any output.
          rm -rf benchmark-artifacts/benchmarks-target

          ARTIFACT_COUNT=$(find benchmark-artifacts -mindepth 1 -maxdepth 1 -type d | wc -l)
          if [[ $ARTIFACT_COUNT -eq 0 ]]; then
            echo "WARNING: No artifact directories found in benchmark-artifacts"
            echo "This may indicate that no benchmarks completed successfully"
            echo "has-artifacts=false" >> "$GITHUB_OUTPUT"
            exit 0
          fi

          echo "Found $ARTIFACT_COUNT artifact directories"
          find benchmark-artifacts -mindepth 1 -maxdepth 1 -type d -exec basename {} \;
          echo "has-artifacts=true" >> "$GITHUB_OUTPUT"

      - name: Consolidate results
        if: steps.validate-artifacts.outputs.has-artifacts == 'true'
        run: |
          set -euo pipefail

          # Check if we should skip (from validation step)
          if [[ ! -d "benchmark-artifacts" ]]; then
            echo "Skipping consolidation - no artifacts available"
            exit 0
          fi

          ARTIFACT_COUNT=$(find benchmark-artifacts -mindepth 1 -maxdepth 1 -type d | wc -l)
          if [[ $ARTIFACT_COUNT -eq 0 ]]; then
            echo "Skipping consolidation - no artifact directories found"
            exit 0
          fi

          echo "=== Consolidating benchmark results ==="

          # Find all artifact subdirectories
          ARTIFACT_DIRS=$(find benchmark-artifacts -mindepth 1 -maxdepth 1 -type d | tr '\n' ',' | sed 's/,$//')

          if [[ -z "$ARTIFACT_DIRS" ]]; then
            echo "ERROR: ARTIFACT_DIRS is empty after globbing"
            exit 1
          fi

          echo "Artifact directories: $ARTIFACT_DIRS"

          # Run consolidation
          cargo run --release --package benchmark-harness --bin benchmark-harness -- consolidate \
            --inputs "$ARTIFACT_DIRS" \
            --output consolidated-output/

          echo "Consolidation complete"
          ls -lh consolidated-output/

      - name: Validate aggregated data
        if: steps.validate-artifacts.outputs.has-artifacts == 'true'
        run: |
          set -euo pipefail

          echo "=== Validating aggregated benchmark data ==="

          AGGREGATED_FILE="consolidated-output/aggregated.json"

          # Check if aggregated.json exists
          if [[ ! -f "$AGGREGATED_FILE" ]]; then
            echo "ERROR: aggregated.json not found at $AGGREGATED_FILE"
            exit 1
          fi

          echo "Found aggregated.json ($(wc -c < "$AGGREGATED_FILE") bytes)"

          # Validate JSON structure using jq
          if ! jq empty "$AGGREGATED_FILE" 2>/dev/null; then
            echo "ERROR: aggregated.json is not valid JSON"
            exit 1
          fi

          echo "JSON validation passed"

          # Check for required fields (by_framework_mode)
          if ! jq -e '.by_framework_mode' "$AGGREGATED_FILE" > /dev/null 2>&1; then
            echo "ERROR: Required field 'by_framework_mode' not found in aggregated.json"
            exit 1
          fi

          echo "Required fields validated successfully"

          # Display data structure summary
          echo "Data structure summary:"
          jq 'keys' "$AGGREGATED_FILE" | head -20

      - name: Validate framework completeness
        if: steps.validate-artifacts.outputs.has-artifacts == 'true'
        run: |
          set -euo pipefail

          AGGREGATED_FILE="consolidated-output/aggregated.json"

          echo "=== Validating framework completeness (schema v2.4.0) ==="

          # Keys follow the v2.4.0 aggregate-key convention:
          #   kreuzberg-*  → "{framework_name}:{mode}"  (format is encoded in the name)
          #   competitors  → "{framework}:{output_format}:{mode}"
          #
          # kreuzberg: 3 pipelines × 2 output formats × 2 modes = 12
          # competitors: 16 entries (format varies per tool — see SCHEMA.md for derivation)
          # total expected: 28
          EXPECTED_FRAMEWORKS=(
            # kreuzberg-* (slim keys — format encoded in name)
            "kreuzberg-markdown-baseline:single"
            "kreuzberg-markdown-baseline:batch"
            "kreuzberg-markdown-layout:single"
            "kreuzberg-markdown-layout:batch"
            "kreuzberg-markdown-paddle-ocr:single"
            "kreuzberg-markdown-paddle-ocr:batch"
            "kreuzberg-plaintext-baseline:single"
            "kreuzberg-plaintext-baseline:batch"
            "kreuzberg-plaintext-layout:single"
            "kreuzberg-plaintext-layout:batch"
            "kreuzberg-plaintext-paddle-ocr:single"
            "kreuzberg-plaintext-paddle-ocr:batch"
            # competitors (format in key; single-file only — no batch API)
            "docling:markdown:single"
            "docling:plaintext:single"
            "markitdown:markdown:single"
            "pandoc:markdown:single"
            "pandoc:plaintext:single"
            "unstructured:markdown:single"
            "unstructured:plaintext:single"
            "tika:plaintext:single"
            "pymupdf4llm:markdown:single"
            "pdfplumber:plaintext:single"
            "mineru:markdown:single"
            "mineru:plaintext:single"
            "pypdf:plaintext:single"
            "pdfminer:plaintext:single"
            "pdftotext:plaintext:single"
            "playa-pdf:plaintext:single"
          )

          # Get actual frameworks from aggregated.json. The count comes from jq
          # directly: piping an empty string through `wc -l` would report 1.
          ACTUAL_FRAMEWORKS=$(jq -r '.by_framework_mode | keys[]' "$AGGREGATED_FILE")
          ACTUAL_COUNT=$(jq -r '.by_framework_mode | length' "$AGGREGATED_FILE")

          MISSING_COUNT=0
          MISSING_LIST=""

          for expected in "${EXPECTED_FRAMEWORKS[@]}"; do
            # -F: compare the key literally rather than as a regex.
            # Here-string instead of `echo | grep -q`: under `pipefail` an early
            # grep exit can SIGPIPE the writer and be misread as "missing".
            if ! grep -qxF -- "$expected" <<< "$ACTUAL_FRAMEWORKS"; then
              MISSING_COUNT=$((MISSING_COUNT + 1))
              MISSING_LIST="${MISSING_LIST}  - $expected"$'\n'
            fi
          done

          echo "Present frameworks (${ACTUAL_COUNT}):"
          if [[ -n "$ACTUAL_FRAMEWORKS" ]]; then
            while IFS= read -r framework; do
              echo "  - $framework"
            done <<< "$ACTUAL_FRAMEWORKS"
          fi

          TOTAL_EXPECTED=${#EXPECTED_FRAMEWORKS[@]}

          if [[ $MISSING_COUNT -gt 0 ]]; then
            echo ""
            echo "::warning::Missing $MISSING_COUNT of $TOTAL_EXPECTED expected framework:mode combinations (${ACTUAL_COUNT} present):"
            echo "$MISSING_LIST"
            echo "This is expected when some jobs fail — results will be published with available data."
          else
            echo ""
            echo "All ${TOTAL_EXPECTED} expected framework:mode combinations present"
          fi

      - name: Create GitHub Release with benchmark results
        if: steps.validate-artifacts.outputs.has-artifacts == 'true'
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
          set -euo pipefail

          echo "=== Creating GitHub Release with benchmark results ==="

          SHORT_SHA="${{ github.sha }}"
          SHORT_SHA="${SHORT_SHA:0:7}"
          TAG="benchmark-run-${{ github.run_id }}"
          DATE=$(date -u +"%Y-%m-%d")

          # Create metadata file alongside aggregated data
          cat > consolidated-output/metadata.json <