Files
fil/.github/workflows/benchmarks.yaml
Henrik Jess Nielsen b4c07d3693
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s
Nomad changes
2026-06-01 23:40:55 +02:00

1245 lines
40 KiB
YAML
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
name: Benchmarks

# Manually triggered only. Both inputs are optional and fall back to their defaults.
on:
  workflow_dispatch:
    inputs:
      branch:
        description: "Git branch to benchmark"
        required: false
        default: "main"
        type: string
      timeout:
        description: "Timeout per document in seconds"
        required: false
        default: "900"
        type: string

# Workflow-wide environment, inherited by every job and step.
# Every value that could be read as a number or boolean is quoted so it is
# passed on as exactly the string written here.
env:
  ITERATIONS: "3"
  TIKA_VERSION: "3.2.3"
  ORT_VERSION: "1.24.2"
  CARGO_TERM_COLOR: always
  CARGO_INCREMENTAL: "0"
  CARGO_PROFILE_DEV_DEBUG: "0"
  RUST_BACKTRACE: short
  RUST_MIN_STACK: "16777216"
  RUSTFLAGS: "-C strip=symbols"
  MEASURE_QUALITY: "true"
  OCR_ENABLED: "true"
  RUN_OCR_BENCHMARKS: "true"
  FORCE_JAVASCRIPT_ACTIONS_TO_NODE24: "true"
  GROUND_TRUTH_DIR: "test_documents/ground_truth"

# One run per benchmarked branch. The group uses the same expression the jobs
# pass to actions/checkout, so dispatching a run for a different branch from
# the same ref no longer cancels a run that is still in progress.
concurrency:
  group: benchmarks-${{ github.event.inputs.branch || github.ref }}
  cancel-in-progress: true

# Read-only by default; jobs that need more declare it themselves.
permissions:
  contents: read

defaults:
  run:
    shell: bash
jobs:
setup:
  # Builds the benchmark harness and kreuzberg-cli once and publishes the
  # binaries as artifacts that every bench-* job downloads.
  name: Build harness + native libs
  runs-on: runner-medium-arm64
  timeout-minutes: 360
  permissions:
    contents: read
  outputs:
    # Name of the artifact that bench-rust downloads.
    artifact-name: benchmarks-target
  steps:
    - uses: actions/checkout@v6
      with:
        # Benchmark the requested branch; fall back to the triggering ref.
        ref: ${{ github.event.inputs.branch || github.ref }}
        submodules: recursive
    - name: Free disk space
      uses: kreuzberg-dev/actions/free-disk-space-linux@v1
    - name: Ensure benchmark harness exists
      run: scripts/benchmarks/ensure-benchmark-harness-exists.sh
    - name: Install system dependencies
      uses: ./.github/actions/install-system-deps
    - name: Setup OpenSSL
      uses: kreuzberg-dev/actions/setup-openssl@v1
    - name: Setup Rust
      uses: kreuzberg-dev/actions/setup-rust@v1
      with:
        cache-key-prefix: benchmarks-setup
        use-sccache: "true"
        report-stats: "true"
    - name: Setup ONNX Runtime
      uses: ./.github/actions/setup-onnx-runtime
      with:
        ort-version: ${{ env.ORT_VERSION }}
    - name: Setup Layout Models
      uses: ./.github/actions/setup-layout-models
    - name: Install Task
      uses: kreuzberg-dev/actions/install-task@v1
    - name: Cache benchmark harness
      uses: ./.github/actions/cache-benchmark-harness
      with:
        build-profile: release
    - name: Build kreuzberg-cli (release, all features)
      run: cargo build --release -p kreuzberg-cli --features all
    - name: Validate ground truth
      run: cargo run --release -p benchmark-harness -- validate-gt --fixtures tools/benchmark-harness/fixtures/
    - name: Log disk space before artifact upload
      run: scripts/ci/validate/show-disk-space.sh "Disk space before artifact upload"
    - name: Upload build artifacts (harness binary + kreuzberg-cli)
      uses: actions/upload-artifact@v7
      with:
        name: benchmarks-target
        path: |
          target/release/benchmark-harness
          target/release/kreuzberg
        retention-days: 7
        # Every downstream job depends on these binaries, so fail here with a
        # clear message instead of letting each bench job fail at download time.
        if-no-files-found: error
    - name: Upload benchmark harness binary (for third-party jobs)
      uses: actions/upload-artifact@v7
      with:
        name: benchmark-harness-binary
        path: |
          target/release/benchmark-harness
        retention-days: 7
        # Same reasoning as above: a missing harness must stop the run here.
        if-no-files-found: error
    - name: Log disk space after artifact upload
      run: scripts/ci/validate/show-disk-space.sh "Disk space after artifact upload"
bench-rust:
  # Benchmarks the project's own binary for every combination of pipeline,
  # output format and mode (3 x 2 x 2 = 12 matrix legs).
  name: bench-rust (${{ matrix.pipeline }}, ${{ matrix.output_format }}, ${{ matrix.mode }})
  runs-on: runner-medium-arm64
  timeout-minutes: 360
  needs: setup
  permissions:
    contents: read
  strategy:
    # A failing leg does not cancel the remaining legs.
    fail-fast: false
    matrix:
      pipeline:
        - baseline
        - layout
        - paddle-ocr
      output_format:
        - markdown
        - plaintext
      mode:
        - single-file
        - batch
  steps:
    - uses: actions/checkout@v6
      with:
        submodules: recursive
        ref: ${{ github.event.inputs.branch || github.ref }}
    - name: Free disk space
      uses: kreuzberg-dev/actions/free-disk-space-linux@v1
    - name: Install system dependencies
      uses: ./.github/actions/install-system-deps
    - name: Setup OpenSSL
      uses: kreuzberg-dev/actions/setup-openssl@v1
    - name: Setup Rust toolchain
      uses: kreuzberg-dev/actions/setup-rust@v1
      with:
        cache-key-prefix: benchmarks-rust-${{ matrix.pipeline }}
        use-sccache: "true"
        report-stats: "true"
    # Binaries produced by the setup job land where a local build would put them.
    - name: Download build artifacts
      uses: actions/download-artifact@v8
      with:
        name: ${{ needs.setup.outputs.artifact-name }}
        path: target/release
    - name: Restore benchmark binary permissions
      run: scripts/benchmarks/restore-binary-permissions.sh
    - name: Setup ONNX Runtime
      uses: ./.github/actions/setup-onnx-runtime
      with:
        ort-version: ${{ env.ORT_VERSION }}
    # Model downloads are limited to the pipeline that uses them.
    - name: Setup Layout Models
      if: matrix.pipeline == 'layout'
      uses: ./.github/actions/setup-layout-models
    - name: Setup PaddleOCR models
      if: matrix.pipeline == 'paddle-ocr'
      uses: ./.github/actions/setup-paddle-ocr-models
    - name: Run benchmark
      run: scripts/benchmarks/run-benchmark.sh
      env:
        # Framework id is assembled from the matrix, e.g. kreuzberg-markdown-layout.
        FRAMEWORK: kreuzberg-${{ matrix.output_format }}-${{ matrix.pipeline }}
        MODE: ${{ matrix.mode }}
        OUTPUT_FORMAT: ${{ matrix.output_format }}
        ITERATIONS: ${{ env.ITERATIONS }}
        TIMEOUT: ${{ github.event.inputs.timeout }}
    # Uploaded even when the benchmark step fails.
    - name: Upload artifacts
      if: always()
      uses: actions/upload-artifact@v7
      with:
        name: benchmarks-rust-${{ matrix.pipeline }}-${{ matrix.output_format }}-${{ matrix.mode }}-${{ github.run_id }}
        path: benchmark-results/
        retention-days: 30
bench-docling:
  # Competitor benchmark: docling, both output formats, split across 3 shards.
  name: docling (${{ matrix.output_format }}, ${{ matrix.mode }}, shard ${{ matrix.shard }}/3)
  runs-on: runner-medium-arm64
  timeout-minutes: 360
  # Ordered after bench-rust; `!cancelled()` lets this job start even if
  # bench-rust failed or was skipped.
  needs: bench-rust
  if: ${{ !cancelled() }}
  permissions:
    contents: read
  strategy:
    fail-fast: true
    matrix:
      mode:
        - single-file
      shard:
        - 1
        - 2
        - 3
      output_format:
        - markdown
        - plaintext
  steps:
    - uses: actions/checkout@v6
      with:
        submodules: recursive
        ref: ${{ github.event.inputs.branch || github.ref }}
    - name: Free disk space
      uses: kreuzberg-dev/actions/free-disk-space-linux@v1
    - name: Download benchmark harness binary
      uses: actions/download-artifact@v8
      with:
        name: benchmark-harness-binary
        path: target/release
    - name: Restore benchmark binary permissions
      run: scripts/benchmarks/restore-binary-permissions.sh
    - name: Setup ONNX Runtime
      uses: ./.github/actions/setup-onnx-runtime
      with:
        ort-version: ${{ env.ORT_VERSION }}
    - name: Setup Python
      uses: kreuzberg-dev/actions/setup-python-env@v1
      with:
        python-version: "3.11"
        cache-prefix: benchmark-docling
        install-command: "uv sync --no-install-project --no-install-workspace --group dev --group bench-docling"
    - name: Install system dependencies
      run: |
        sudo apt-get update
        sudo apt-get install -y libgl1 libglib2.0-0 tesseract-ocr tesseract-ocr-eng
    - name: Run benchmark
      run: scripts/benchmarks/run-benchmark.sh
      env:
        FRAMEWORK: docling
        MODE: ${{ matrix.mode }}
        OUTPUT_FORMAT: ${{ matrix.output_format }}
        ITERATIONS: ${{ env.ITERATIONS }}
        TIMEOUT: ${{ github.event.inputs.timeout }}
        # "<index>/<total>" shard selector handed to the benchmark script.
        SHARD: ${{ matrix.shard }}/3
        # NOTE(review): presumably stops uv from re-syncing the environment
        # prepared in "Setup Python" - confirm against run-benchmark.sh.
        UV_NO_SYNC: "1"
    - name: Upload artifacts
      if: always()
      uses: actions/upload-artifact@v7
      with:
        name: benchmarks-docling-${{ matrix.output_format }}-${{ matrix.mode }}-shard${{ matrix.shard }}-${{ github.run_id }}
        path: benchmark-results/
        retention-days: 30
bench-markitdown:
  # Competitor benchmark: markitdown, markdown output only, 3 shards, OCR off.
  name: markitdown (${{ matrix.output_format }}, ${{ matrix.mode }}, shard ${{ matrix.shard }}/3)
  runs-on: runner-medium-arm64
  timeout-minutes: 360
  # Ordered after bench-rust; `!cancelled()` lets this job start even if
  # bench-rust failed or was skipped.
  needs: bench-rust
  if: ${{ !cancelled() }}
  permissions:
    contents: read
  strategy:
    fail-fast: true
    matrix:
      mode:
        - single-file
      shard:
        - 1
        - 2
        - 3
      output_format:
        - markdown
        - plaintext
      # Drops every plaintext leg, leaving markdown only.
      exclude:
        - output_format: plaintext
  steps:
    - uses: actions/checkout@v6
      with:
        submodules: recursive
        ref: ${{ github.event.inputs.branch || github.ref }}
    - name: Free disk space
      uses: kreuzberg-dev/actions/free-disk-space-linux@v1
    - name: Download benchmark harness binary
      uses: actions/download-artifact@v8
      with:
        name: benchmark-harness-binary
        path: target/release
    - name: Restore benchmark binary permissions
      run: scripts/benchmarks/restore-binary-permissions.sh
    - name: Setup ONNX Runtime
      uses: ./.github/actions/setup-onnx-runtime
      with:
        ort-version: ${{ env.ORT_VERSION }}
    - name: Setup Python
      uses: kreuzberg-dev/actions/setup-python-env@v1
      with:
        python-version: "3.11"
        cache-prefix: benchmark-markitdown
        install-command: "uv sync --no-install-project --no-install-workspace --group dev --group bench-markitdown"
    - name: Run benchmark
      run: scripts/benchmarks/run-benchmark.sh
      env:
        FRAMEWORK: markitdown
        MODE: ${{ matrix.mode }}
        OUTPUT_FORMAT: ${{ matrix.output_format }}
        ITERATIONS: ${{ env.ITERATIONS }}
        TIMEOUT: ${{ github.event.inputs.timeout }}
        # Overrides the workflow-level OCR_ENABLED: "true" for this step.
        OCR_ENABLED: "false"
        SHARD: ${{ matrix.shard }}/3
        UV_NO_SYNC: "1"
    - name: Upload artifacts
      if: always()
      uses: actions/upload-artifact@v7
      with:
        name: benchmarks-markitdown-${{ matrix.output_format }}-${{ matrix.mode }}-shard${{ matrix.shard }}-${{ github.run_id }}
        path: benchmark-results/
        retention-days: 30
bench-pandoc:
  # Competitor benchmark: pandoc, both output formats, unsharded, OCR off.
  name: pandoc (${{ matrix.output_format }}, ${{ matrix.mode }})
  runs-on: runner-medium-arm64
  timeout-minutes: 360
  # Ordered after bench-rust; `!cancelled()` lets this job start even if
  # bench-rust failed or was skipped.
  needs: bench-rust
  if: ${{ !cancelled() }}
  permissions:
    contents: read
  strategy:
    fail-fast: true
    matrix:
      mode:
        - single-file
      output_format:
        - markdown
        - plaintext
  steps:
    - uses: actions/checkout@v6
      with:
        submodules: recursive
        ref: ${{ github.event.inputs.branch || github.ref }}
    - name: Free disk space
      uses: kreuzberg-dev/actions/free-disk-space-linux@v1
    - name: Download benchmark harness binary
      uses: actions/download-artifact@v8
      with:
        name: benchmark-harness-binary
        path: target/release
    - name: Restore benchmark binary permissions
      run: scripts/benchmarks/restore-binary-permissions.sh
    - name: Setup ONNX Runtime
      uses: ./.github/actions/setup-onnx-runtime
      with:
        ort-version: ${{ env.ORT_VERSION }}
    # Only the dev dependency group is synced; pandoc itself comes from apt below.
    - name: Setup Python
      uses: kreuzberg-dev/actions/setup-python-env@v1
      with:
        python-version: "3.11"
        cache-prefix: benchmark-pandoc
        install-command: "uv sync --no-install-project --no-install-workspace --group dev"
    - name: Install pandoc
      run: |
        sudo apt-get update
        sudo apt-get install -y pandoc
        pandoc --version
    - name: Run benchmark
      run: scripts/benchmarks/run-benchmark.sh
      env:
        FRAMEWORK: pandoc
        MODE: ${{ matrix.mode }}
        OUTPUT_FORMAT: ${{ matrix.output_format }}
        ITERATIONS: ${{ env.ITERATIONS }}
        TIMEOUT: ${{ github.event.inputs.timeout }}
        # Overrides the workflow-level OCR_ENABLED: "true" for this step.
        OCR_ENABLED: "false"
        UV_NO_SYNC: "1"
    - name: Upload artifacts
      if: always()
      uses: actions/upload-artifact@v7
      with:
        name: benchmarks-pandoc-${{ matrix.output_format }}-${{ matrix.mode }}-${{ github.run_id }}
        path: benchmark-results/
        retention-days: 30
bench-unstructured:
  # Competitor benchmark: unstructured, both output formats, split across 4 shards.
  name: unstructured (${{ matrix.output_format }}, ${{ matrix.mode }}, shard ${{ matrix.shard }}/4)
  runs-on: runner-medium-arm64
  timeout-minutes: 360
  # Ordered after bench-rust; `!cancelled()` lets this job start even if
  # bench-rust failed or was skipped.
  needs: bench-rust
  if: ${{ !cancelled() }}
  permissions:
    contents: read
  strategy:
    fail-fast: true
    matrix:
      mode:
        - single-file
      shard:
        - 1
        - 2
        - 3
        - 4
      output_format:
        - markdown
        - plaintext
  steps:
    - uses: actions/checkout@v6
      with:
        submodules: recursive
        ref: ${{ github.event.inputs.branch || github.ref }}
    - name: Free disk space
      uses: kreuzberg-dev/actions/free-disk-space-linux@v1
    - name: Download benchmark harness binary
      uses: actions/download-artifact@v8
      with:
        name: benchmark-harness-binary
        path: target/release
    - name: Restore benchmark binary permissions
      run: scripts/benchmarks/restore-binary-permissions.sh
    - name: Setup ONNX Runtime
      uses: ./.github/actions/setup-onnx-runtime
      with:
        ort-version: ${{ env.ORT_VERSION }}
    # System packages are installed before the Python environment is created.
    - name: Install system dependencies
      run: |
        sudo apt-get update
        sudo apt-get install -y tesseract-ocr tesseract-ocr-eng libmagic-dev poppler-utils libreoffice pandoc
    - name: Setup Python
      uses: kreuzberg-dev/actions/setup-python-env@v1
      with:
        python-version: "3.11"
        cache-prefix: benchmark-unstructured
        install-command: "uv sync --no-install-project --no-install-workspace --group dev --group bench-unstructured"
    - name: Run benchmark
      run: scripts/benchmarks/run-benchmark.sh
      env:
        FRAMEWORK: unstructured
        MODE: ${{ matrix.mode }}
        OUTPUT_FORMAT: ${{ matrix.output_format }}
        ITERATIONS: ${{ env.ITERATIONS }}
        TIMEOUT: ${{ github.event.inputs.timeout }}
        # "<index>/<total>" shard selector; the total matches the 4-entry shard matrix.
        SHARD: ${{ matrix.shard }}/4
        UV_NO_SYNC: "1"
    - name: Upload artifacts
      if: always()
      uses: actions/upload-artifact@v7
      with:
        name: benchmarks-unstructured-${{ matrix.output_format }}-${{ matrix.mode }}-shard${{ matrix.shard }}-${{ github.run_id }}
        path: benchmark-results/
        retention-days: 30
bench-tika:
  # Competitor benchmark: Apache Tika, plaintext output only, 3 shards.
  name: tika (${{ matrix.output_format }}, ${{ matrix.mode }}, shard ${{ matrix.shard }}/3)
  runs-on: runner-medium-arm64
  timeout-minutes: 360
  # Ordered after bench-rust; `!cancelled()` lets this job start even if
  # bench-rust failed or was skipped.
  needs: bench-rust
  if: ${{ !cancelled() }}
  permissions:
    contents: read
  strategy:
    fail-fast: true
    matrix:
      mode:
        - single-file
      shard:
        - 1
        - 2
        - 3
      output_format:
        - markdown
        - plaintext
      # Drops every markdown leg, leaving plaintext only.
      exclude:
        - output_format: markdown
  steps:
    - uses: actions/checkout@v6
      with:
        submodules: recursive
        ref: ${{ github.event.inputs.branch || github.ref }}
    - name: Free disk space
      uses: kreuzberg-dev/actions/free-disk-space-linux@v1
    - name: Download benchmark harness binary
      uses: actions/download-artifact@v8
      with:
        name: benchmark-harness-binary
        path: target/release
    - name: Restore benchmark binary permissions
      run: scripts/benchmarks/restore-binary-permissions.sh
    - name: Setup ONNX Runtime
      uses: ./.github/actions/setup-onnx-runtime
      with:
        ort-version: ${{ env.ORT_VERSION }}
    - name: Setup Java
      id: setup-java
      uses: actions/setup-java@v5
      with:
        distribution: "temurin"
        java-version: "25"
    # Fetches the tika-app jar for the pinned TIKA_VERSION from Maven Central
    # into the harness libs directory; curl retries up to 5 times.
    - name: Download Apache Tika
      run: |
        mkdir -p tools/benchmark-harness/libs
        curl -fsSL --retry 5 --retry-delay 5 -o "tools/benchmark-harness/libs/tika-app-${{ env.TIKA_VERSION }}.jar" \
          "https://repo1.maven.org/maven2/org/apache/tika/tika-app/${{ env.TIKA_VERSION }}/tika-app-${{ env.TIKA_VERSION }}.jar"
    - name: Run benchmark
      run: scripts/benchmarks/run-benchmark.sh
      env:
        FRAMEWORK: tika
        MODE: ${{ matrix.mode }}
        OUTPUT_FORMAT: ${{ matrix.output_format }}
        ITERATIONS: ${{ env.ITERATIONS }}
        TIMEOUT: ${{ github.event.inputs.timeout }}
        SHARD: ${{ matrix.shard }}/3
    - name: Upload artifacts
      if: always()
      uses: actions/upload-artifact@v7
      with:
        name: benchmarks-tika-${{ matrix.output_format }}-${{ matrix.mode }}-shard${{ matrix.shard }}-${{ github.run_id }}
        path: benchmark-results/
        retention-days: 30
bench-pymupdf4llm:
  # Competitor benchmark: pymupdf4llm, markdown output only, 3 shards.
  name: pymupdf4llm (${{ matrix.output_format }}, ${{ matrix.mode }}, shard ${{ matrix.shard }}/3)
  runs-on: runner-medium-arm64
  timeout-minutes: 360
  # Ordered after bench-rust; `!cancelled()` lets this job start even if
  # bench-rust failed or was skipped.
  needs: bench-rust
  if: ${{ !cancelled() }}
  permissions:
    contents: read
  strategy:
    fail-fast: true
    matrix:
      mode:
        - single-file
      shard:
        - 1
        - 2
        - 3
      output_format:
        - markdown
        - plaintext
      # Drops every plaintext leg, leaving markdown only.
      exclude:
        - output_format: plaintext
  steps:
    - uses: actions/checkout@v6
      with:
        submodules: recursive
        ref: ${{ github.event.inputs.branch || github.ref }}
    - name: Free disk space
      uses: kreuzberg-dev/actions/free-disk-space-linux@v1
    - name: Download benchmark harness binary
      uses: actions/download-artifact@v8
      with:
        name: benchmark-harness-binary
        path: target/release
    - name: Restore benchmark binary permissions
      run: scripts/benchmarks/restore-binary-permissions.sh
    - name: Setup ONNX Runtime
      uses: ./.github/actions/setup-onnx-runtime
      with:
        ort-version: ${{ env.ORT_VERSION }}
    # System packages are installed before the Python environment is created.
    - name: Install system dependencies
      run: |
        sudo apt-get update
        sudo apt-get install -y tesseract-ocr tesseract-ocr-eng libjpeg-dev libpng-dev libtiff-dev
    - name: Setup Python
      uses: kreuzberg-dev/actions/setup-python-env@v1
      with:
        python-version: "3.11"
        cache-prefix: benchmark-pymupdf4llm
        install-command: "uv sync --no-install-project --no-install-workspace --group dev --group bench-pymupdf4llm"
    - name: Run benchmark
      run: scripts/benchmarks/run-benchmark.sh
      env:
        FRAMEWORK: pymupdf4llm
        MODE: ${{ matrix.mode }}
        OUTPUT_FORMAT: ${{ matrix.output_format }}
        ITERATIONS: ${{ env.ITERATIONS }}
        TIMEOUT: ${{ github.event.inputs.timeout }}
        # NOTE(review): hard-coded tessdata location; assumes the apt packages
        # above install Tesseract 5 data at this path on the runner image.
        TESSDATA_PREFIX: /usr/share/tesseract-ocr/5/tessdata
        SHARD: ${{ matrix.shard }}/3
        UV_NO_SYNC: "1"
    - name: Upload artifacts
      if: always()
      uses: actions/upload-artifact@v7
      with:
        name: benchmarks-pymupdf4llm-${{ matrix.output_format }}-${{ matrix.mode }}-shard${{ matrix.shard }}-${{ github.run_id }}
        path: benchmark-results/
        retention-days: 30
bench-pdfplumber:
  # Competitor benchmark: pdfplumber, plaintext output only, unsharded, OCR off.
  name: pdfplumber (${{ matrix.output_format }}, ${{ matrix.mode }})
  runs-on: runner-medium-arm64
  timeout-minutes: 360
  # Ordered after bench-rust; `!cancelled()` lets this job start even if
  # bench-rust failed or was skipped.
  needs: bench-rust
  if: ${{ !cancelled() }}
  permissions:
    contents: read
  strategy:
    fail-fast: true
    matrix:
      mode:
        - single-file
      output_format:
        - markdown
        - plaintext
      # Drops every markdown leg, leaving plaintext only.
      exclude:
        - output_format: markdown
  steps:
    - uses: actions/checkout@v6
      with:
        submodules: recursive
        ref: ${{ github.event.inputs.branch || github.ref }}
    - name: Free disk space
      uses: kreuzberg-dev/actions/free-disk-space-linux@v1
    - name: Download benchmark harness binary
      uses: actions/download-artifact@v8
      with:
        name: benchmark-harness-binary
        path: target/release
    - name: Restore benchmark binary permissions
      run: scripts/benchmarks/restore-binary-permissions.sh
    - name: Setup ONNX Runtime
      uses: ./.github/actions/setup-onnx-runtime
      with:
        ort-version: ${{ env.ORT_VERSION }}
    - name: Setup Python
      uses: kreuzberg-dev/actions/setup-python-env@v1
      with:
        python-version: "3.11"
        cache-prefix: benchmark-pdfplumber
        install-command: "uv sync --no-install-project --no-install-workspace --group dev --group bench-pdfplumber"
    - name: Run benchmark
      run: scripts/benchmarks/run-benchmark.sh
      env:
        FRAMEWORK: pdfplumber
        MODE: ${{ matrix.mode }}
        OUTPUT_FORMAT: ${{ matrix.output_format }}
        ITERATIONS: ${{ env.ITERATIONS }}
        TIMEOUT: ${{ github.event.inputs.timeout }}
        # Overrides the workflow-level OCR_ENABLED: "true" for this step.
        OCR_ENABLED: "false"
        UV_NO_SYNC: "1"
    - name: Upload artifacts
      if: always()
      uses: actions/upload-artifact@v7
      with:
        name: benchmarks-pdfplumber-${{ matrix.output_format }}-${{ matrix.mode }}-${{ github.run_id }}
        path: benchmark-results/
        retention-days: 30
bench-mineru:
  # Competitor benchmark: mineru, both output formats, split across 3 shards.
  name: mineru (${{ matrix.output_format }}, ${{ matrix.mode }}, shard ${{ matrix.shard }}/3)
  runs-on: runner-medium-arm64
  timeout-minutes: 360
  # Ordered after bench-rust; `!cancelled()` lets this job start even if
  # bench-rust failed or was skipped.
  needs: bench-rust
  if: ${{ !cancelled() }}
  permissions:
    contents: read
  strategy:
    fail-fast: true
    matrix:
      mode:
        - single-file
      shard:
        - 1
        - 2
        - 3
      output_format:
        - markdown
        - plaintext
  steps:
    - uses: actions/checkout@v6
      with:
        submodules: recursive
        ref: ${{ github.event.inputs.branch || github.ref }}
    - name: Free disk space
      uses: kreuzberg-dev/actions/free-disk-space-linux@v1
    - name: Download benchmark harness binary
      uses: actions/download-artifact@v8
      with:
        name: benchmark-harness-binary
        path: target/release
    - name: Restore benchmark binary permissions
      run: scripts/benchmarks/restore-binary-permissions.sh
    - name: Setup ONNX Runtime
      uses: ./.github/actions/setup-onnx-runtime
      with:
        ort-version: ${{ env.ORT_VERSION }}
    - name: Setup Python
      uses: kreuzberg-dev/actions/setup-python-env@v1
      with:
        python-version: "3.11"
        cache-prefix: benchmark-mineru
        install-command: "uv sync --no-install-project --no-install-workspace --group dev --group bench-mineru"
    - name: Install system dependencies
      run: |
        sudo apt-get update
        sudo apt-get install -y libgl1 libglib2.0-0
    - name: Run benchmark
      run: scripts/benchmarks/run-benchmark.sh
      env:
        FRAMEWORK: mineru
        MODE: ${{ matrix.mode }}
        OUTPUT_FORMAT: ${{ matrix.output_format }}
        ITERATIONS: ${{ env.ITERATIONS }}
        TIMEOUT: ${{ github.event.inputs.timeout }}
        # "<index>/<total>" shard selector handed to the benchmark script.
        SHARD: ${{ matrix.shard }}/3
        UV_NO_SYNC: "1"
    - name: Upload artifacts
      if: always()
      uses: actions/upload-artifact@v7
      with:
        name: benchmarks-mineru-${{ matrix.output_format }}-${{ matrix.mode }}-shard${{ matrix.shard }}-${{ github.run_id }}
        path: benchmark-results/
        retention-days: 30
bench-pypdf:
  # Competitor benchmark: pypdf, plaintext output only, unsharded, OCR off.
  name: pypdf (${{ matrix.output_format }}, ${{ matrix.mode }})
  runs-on: runner-medium-arm64
  timeout-minutes: 360
  # Ordered after bench-rust; `!cancelled()` lets this job start even if
  # bench-rust failed or was skipped.
  needs: bench-rust
  if: ${{ !cancelled() }}
  permissions:
    contents: read
  strategy:
    fail-fast: true
    matrix:
      mode:
        - single-file
      output_format:
        - markdown
        - plaintext
      # Drops every markdown leg, leaving plaintext only.
      exclude:
        - output_format: markdown
  steps:
    - uses: actions/checkout@v6
      with:
        submodules: recursive
        ref: ${{ github.event.inputs.branch || github.ref }}
    - name: Free disk space
      uses: kreuzberg-dev/actions/free-disk-space-linux@v1
    - name: Download benchmark harness binary
      uses: actions/download-artifact@v8
      with:
        name: benchmark-harness-binary
        path: target/release
    - name: Restore benchmark binary permissions
      run: scripts/benchmarks/restore-binary-permissions.sh
    - name: Setup ONNX Runtime
      uses: ./.github/actions/setup-onnx-runtime
      with:
        ort-version: ${{ env.ORT_VERSION }}
    - name: Setup Python
      uses: kreuzberg-dev/actions/setup-python-env@v1
      with:
        python-version: "3.11"
        cache-prefix: benchmark-pypdf
        install-command: "uv sync --no-install-project --no-install-workspace --group dev --group bench-pypdf"
    - name: Run benchmark
      run: scripts/benchmarks/run-benchmark.sh
      env:
        FRAMEWORK: pypdf
        MODE: ${{ matrix.mode }}
        OUTPUT_FORMAT: ${{ matrix.output_format }}
        ITERATIONS: ${{ env.ITERATIONS }}
        TIMEOUT: ${{ github.event.inputs.timeout }}
        # Overrides the workflow-level OCR_ENABLED: "true" for this step.
        OCR_ENABLED: "false"
        UV_NO_SYNC: "1"
    - name: Upload artifacts
      if: always()
      uses: actions/upload-artifact@v7
      with:
        name: benchmarks-pypdf-${{ matrix.output_format }}-${{ matrix.mode }}-${{ github.run_id }}
        path: benchmark-results/
        retention-days: 30
bench-pdfminer:
  # Competitor benchmark: pdfminer, plaintext output only, unsharded, OCR off.
  name: pdfminer (${{ matrix.output_format }}, ${{ matrix.mode }})
  runs-on: runner-medium-arm64
  timeout-minutes: 360
  # Ordered after bench-rust; `!cancelled()` lets this job start even if
  # bench-rust failed or was skipped.
  needs: bench-rust
  if: ${{ !cancelled() }}
  permissions:
    contents: read
  strategy:
    fail-fast: true
    matrix:
      mode:
        - single-file
      output_format:
        - markdown
        - plaintext
      # Drops every markdown leg, leaving plaintext only.
      exclude:
        - output_format: markdown
  steps:
    - uses: actions/checkout@v6
      with:
        submodules: recursive
        ref: ${{ github.event.inputs.branch || github.ref }}
    - name: Free disk space
      uses: kreuzberg-dev/actions/free-disk-space-linux@v1
    - name: Download benchmark harness binary
      uses: actions/download-artifact@v8
      with:
        name: benchmark-harness-binary
        path: target/release
    - name: Restore benchmark binary permissions
      run: scripts/benchmarks/restore-binary-permissions.sh
    - name: Setup ONNX Runtime
      uses: ./.github/actions/setup-onnx-runtime
      with:
        ort-version: ${{ env.ORT_VERSION }}
    - name: Setup Python
      uses: kreuzberg-dev/actions/setup-python-env@v1
      with:
        python-version: "3.11"
        cache-prefix: benchmark-pdfminer
        install-command: "uv sync --no-install-project --no-install-workspace --group dev --group bench-pdfminer"
    - name: Run benchmark
      run: scripts/benchmarks/run-benchmark.sh
      env:
        FRAMEWORK: pdfminer
        MODE: ${{ matrix.mode }}
        OUTPUT_FORMAT: ${{ matrix.output_format }}
        ITERATIONS: ${{ env.ITERATIONS }}
        TIMEOUT: ${{ github.event.inputs.timeout }}
        # Overrides the workflow-level OCR_ENABLED: "true" for this step.
        OCR_ENABLED: "false"
        UV_NO_SYNC: "1"
    - name: Upload artifacts
      if: always()
      uses: actions/upload-artifact@v7
      with:
        name: benchmarks-pdfminer-${{ matrix.output_format }}-${{ matrix.mode }}-${{ github.run_id }}
        path: benchmark-results/
        retention-days: 30
bench-pdftotext:
  # Competitor benchmark: pdftotext, plaintext output only, unsharded, OCR off.
  name: pdftotext (${{ matrix.output_format }}, ${{ matrix.mode }})
  runs-on: runner-medium-arm64
  timeout-minutes: 360
  # Ordered after bench-rust; `!cancelled()` lets this job start even if
  # bench-rust failed or was skipped.
  needs: bench-rust
  if: ${{ !cancelled() }}
  permissions:
    contents: read
  strategy:
    fail-fast: true
    matrix:
      mode:
        - single-file
      output_format:
        - markdown
        - plaintext
      # Drops every markdown leg, leaving plaintext only.
      exclude:
        - output_format: markdown
  steps:
    - uses: actions/checkout@v6
      with:
        submodules: recursive
        ref: ${{ github.event.inputs.branch || github.ref }}
    - name: Free disk space
      uses: kreuzberg-dev/actions/free-disk-space-linux@v1
    - name: Download benchmark harness binary
      uses: actions/download-artifact@v8
      with:
        name: benchmark-harness-binary
        path: target/release
    - name: Restore benchmark binary permissions
      run: scripts/benchmarks/restore-binary-permissions.sh
    - name: Setup ONNX Runtime
      uses: ./.github/actions/setup-onnx-runtime
      with:
        ort-version: ${{ env.ORT_VERSION }}
    # The poppler development package is installed before the Python
    # environment is synced.
    - name: Install poppler
      run: |
        sudo apt-get update
        sudo apt-get install -y libpoppler-cpp-dev
    - name: Setup Python
      uses: kreuzberg-dev/actions/setup-python-env@v1
      with:
        python-version: "3.11"
        cache-prefix: benchmark-pdftotext
        install-command: "uv sync --no-install-project --no-install-workspace --group dev --group bench-pdftotext"
    - name: Run benchmark
      run: scripts/benchmarks/run-benchmark.sh
      env:
        FRAMEWORK: pdftotext
        MODE: ${{ matrix.mode }}
        OUTPUT_FORMAT: ${{ matrix.output_format }}
        ITERATIONS: ${{ env.ITERATIONS }}
        TIMEOUT: ${{ github.event.inputs.timeout }}
        # Overrides the workflow-level OCR_ENABLED: "true" for this step.
        OCR_ENABLED: "false"
        UV_NO_SYNC: "1"
    - name: Upload artifacts
      if: always()
      uses: actions/upload-artifact@v7
      with:
        name: benchmarks-pdftotext-${{ matrix.output_format }}-${{ matrix.mode }}-${{ github.run_id }}
        path: benchmark-results/
        retention-days: 30
bench-playa-pdf:
  # Competitor benchmark: playa-pdf, plaintext output only, unsharded, OCR off.
  name: playa-pdf (${{ matrix.output_format }}, ${{ matrix.mode }})
  runs-on: runner-medium-arm64
  timeout-minutes: 360
  # Ordered after bench-rust; `!cancelled()` lets this job start even if
  # bench-rust failed or was skipped.
  needs: bench-rust
  if: ${{ !cancelled() }}
  permissions:
    contents: read
  strategy:
    fail-fast: true
    matrix:
      mode:
        - single-file
      output_format:
        - markdown
        - plaintext
      # Drops every markdown leg, leaving plaintext only.
      exclude:
        - output_format: markdown
  steps:
    - uses: actions/checkout@v6
      with:
        submodules: recursive
        ref: ${{ github.event.inputs.branch || github.ref }}
    - name: Free disk space
      uses: kreuzberg-dev/actions/free-disk-space-linux@v1
    - name: Download benchmark harness binary
      uses: actions/download-artifact@v8
      with:
        name: benchmark-harness-binary
        path: target/release
    - name: Restore benchmark binary permissions
      run: scripts/benchmarks/restore-binary-permissions.sh
    - name: Setup ONNX Runtime
      uses: ./.github/actions/setup-onnx-runtime
      with:
        ort-version: ${{ env.ORT_VERSION }}
    - name: Setup Python
      uses: kreuzberg-dev/actions/setup-python-env@v1
      with:
        python-version: "3.11"
        cache-prefix: benchmark-playa-pdf
        install-command: "uv sync --no-install-project --no-install-workspace --group dev --group bench-playa-pdf"
    - name: Run benchmark
      run: scripts/benchmarks/run-benchmark.sh
      env:
        FRAMEWORK: playa-pdf
        MODE: ${{ matrix.mode }}
        OUTPUT_FORMAT: ${{ matrix.output_format }}
        ITERATIONS: ${{ env.ITERATIONS }}
        TIMEOUT: ${{ github.event.inputs.timeout }}
        # Overrides the workflow-level OCR_ENABLED: "true" for this step.
        OCR_ENABLED: "false"
        UV_NO_SYNC: "1"
    - name: Upload artifacts
      if: always()
      uses: actions/upload-artifact@v7
      with:
        name: benchmarks-playa-pdf-${{ matrix.output_format }}-${{ matrix.mode }}-${{ github.run_id }}
        path: benchmark-results/
        retention-days: 30
aggregate-and-publish:
  # Collects the result artifacts of every bench-* job, consolidates them with
  # the benchmark harness, sanity-checks the output and publishes it as a
  # prerelease. Runs unless the workflow was cancelled, so partial results are
  # still published when some benchmark jobs failed.
  name: Aggregate & Release Results
  needs:
    - bench-rust
    - bench-docling
    - bench-markitdown
    - bench-pandoc
    - bench-unstructured
    - bench-tika
    - bench-pymupdf4llm
    - bench-pdfplumber
    - bench-mineru
    - bench-pypdf
    - bench-pdfminer
    - bench-pdftotext
    - bench-playa-pdf
  runs-on: ubuntu-24.04-arm
  if: ${{ !cancelled() && (github.ref == 'refs/heads/main' || github.event_name == 'workflow_dispatch') }}
  permissions:
    # Needed to create the release and its tag.
    contents: write
  steps:
    - uses: actions/checkout@v6
      with:
        # Check out the same ref the benchmark jobs used, so the harness that
        # consolidates the results and the commit recorded in the release
        # both correspond to the benchmarked branch rather than the ref the
        # workflow happened to be dispatched from.
        ref: ${{ github.event.inputs.branch || github.ref }}
        submodules: recursive
    - name: Setup Rust
      uses: kreuzberg-dev/actions/setup-rust@v1
      with:
        cache-key-prefix: aggregate
    # One sub-directory per artifact under benchmark-artifacts/.
    - name: Download all benchmark artifacts
      uses: actions/download-artifact@v8
      with:
        pattern: "benchmarks-*"
        path: benchmark-artifacts/
        merge-multiple: false
    # Sets `has-artifacts`; every later step is skipped when it is not 'true'.
    - name: Validate artifacts before consolidation
      id: validate-artifacts
      run: |
        set -euo pipefail
        echo "=== Validating benchmark artifacts ==="
        # Check if benchmark-artifacts directory exists and is not empty
        if [[ ! -d "benchmark-artifacts" ]]; then
          echo "WARNING: benchmark-artifacts directory does not exist"
          echo "This may indicate that no benchmarks completed successfully (e.g., setup job was cancelled)"
          echo "has-artifacts=false" >> "$GITHUB_OUTPUT"
          exit 0
        fi
        # "benchmarks-target" (the setup job's build artifact) also matches the
        # download pattern but contains binaries, not results, so it must not
        # count as evidence that a benchmark completed.
        ARTIFACT_COUNT=$(find benchmark-artifacts -mindepth 1 -maxdepth 1 -type d ! -name benchmarks-target | wc -l)
        if [[ $ARTIFACT_COUNT -eq 0 ]]; then
          echo "WARNING: No artifact directories found in benchmark-artifacts"
          echo "This may indicate that no benchmarks completed successfully"
          echo "has-artifacts=false" >> "$GITHUB_OUTPUT"
          exit 0
        fi
        echo "Found $ARTIFACT_COUNT artifact directories"
        find benchmark-artifacts -mindepth 1 -maxdepth 1 -type d ! -name benchmarks-target -exec basename {} \;
        echo "has-artifacts=true" >> "$GITHUB_OUTPUT"
    - name: Consolidate results
      if: steps.validate-artifacts.outputs.has-artifacts == 'true'
      run: |
        set -euo pipefail
        echo "=== Consolidating benchmark results ==="
        # Comma-separated list of result directories (build artifact excluded,
        # see the validation step).
        ARTIFACT_DIRS=$(find benchmark-artifacts -mindepth 1 -maxdepth 1 -type d ! -name benchmarks-target | paste -sd, -)
        if [[ -z "$ARTIFACT_DIRS" ]]; then
          echo "ERROR: ARTIFACT_DIRS is empty after globbing"
          exit 1
        fi
        echo "Artifact directories: $ARTIFACT_DIRS"
        # Run consolidation
        cargo run --release --package benchmark-harness --bin benchmark-harness -- consolidate \
          --inputs "$ARTIFACT_DIRS" \
          --output consolidated-output/
        echo "Consolidation complete"
        ls -lh consolidated-output/
    - name: Validate aggregated data
      if: steps.validate-artifacts.outputs.has-artifacts == 'true'
      run: |
        set -euo pipefail
        echo "=== Validating aggregated benchmark data ==="
        AGGREGATED_FILE="consolidated-output/aggregated.json"
        # Check if aggregated.json exists
        if [[ ! -f "$AGGREGATED_FILE" ]]; then
          echo "ERROR: aggregated.json not found at $AGGREGATED_FILE"
          exit 1
        fi
        echo "Found aggregated.json ($(wc -c < "$AGGREGATED_FILE") bytes)"
        # Validate JSON structure using jq
        if ! jq empty "$AGGREGATED_FILE" 2>/dev/null; then
          echo "ERROR: aggregated.json is not valid JSON"
          exit 1
        fi
        echo "JSON validation passed"
        # Check for required fields (by_framework_mode)
        if ! jq -e '.by_framework_mode' "$AGGREGATED_FILE" > /dev/null 2>&1; then
          echo "ERROR: Required field 'by_framework_mode' not found in aggregated.json"
          exit 1
        fi
        echo "Required fields validated successfully"
        # Display data structure summary
        echo "Data structure summary:"
        jq 'keys' "$AGGREGATED_FILE" | head -20
    # Informational only: missing combinations produce a warning, never a failure.
    - name: Validate framework completeness
      if: steps.validate-artifacts.outputs.has-artifacts == 'true'
      run: |
        set -euo pipefail
        AGGREGATED_FILE="consolidated-output/aggregated.json"
        echo "=== Validating framework completeness (schema v2.4.0) ==="
        # Keys follow the v2.4.0 aggregate-key convention:
        #   kreuzberg-*  → "{framework_name}:{mode}"  (format is encoded in the name)
        #   competitors  → "{framework}:{output_format}:{mode}"
        #
        # kreuzberg:   3 pipelines × 2 output formats × 2 modes = 12
        # competitors: 16 entries (format varies per tool — see SCHEMA.md for derivation)
        # total expected: 28
        EXPECTED_FRAMEWORKS=(
          # kreuzberg-* (slim keys — format encoded in name)
          "kreuzberg-markdown-baseline:single" "kreuzberg-markdown-baseline:batch"
          "kreuzberg-markdown-layout:single" "kreuzberg-markdown-layout:batch"
          "kreuzberg-markdown-paddle-ocr:single" "kreuzberg-markdown-paddle-ocr:batch"
          "kreuzberg-plaintext-baseline:single" "kreuzberg-plaintext-baseline:batch"
          "kreuzberg-plaintext-layout:single" "kreuzberg-plaintext-layout:batch"
          "kreuzberg-plaintext-paddle-ocr:single" "kreuzberg-plaintext-paddle-ocr:batch"
          # competitors (format in key; single-file only — no batch API)
          "docling:markdown:single" "docling:plaintext:single"
          "markitdown:markdown:single"
          "pandoc:markdown:single" "pandoc:plaintext:single"
          "unstructured:markdown:single" "unstructured:plaintext:single"
          "tika:plaintext:single"
          "pymupdf4llm:markdown:single"
          "pdfplumber:plaintext:single"
          "mineru:markdown:single" "mineru:plaintext:single"
          "pypdf:plaintext:single"
          "pdfminer:plaintext:single"
          "pdftotext:plaintext:single"
          "playa-pdf:plaintext:single"
        )
        # Get actual frameworks from aggregated.json
        ACTUAL_FRAMEWORKS=$(jq -r '.by_framework_mode | keys[]' "$AGGREGATED_FILE")
        # Count with jq: `echo "" | wc -l` would report 1 for an empty key set.
        ACTUAL_COUNT=$(jq -r '.by_framework_mode | length' "$AGGREGATED_FILE")
        MISSING_COUNT=0
        MISSING_LIST=""
        for expected in "${EXPECTED_FRAMEWORKS[@]}"; do
          # Fixed-string, whole-line match fed through a here-string: no regex
          # interpretation of the key, and no `echo | grep -q` pipeline that
          # could trip `pipefail` when grep exits early.
          if ! grep -Fqx -- "$expected" <<< "$ACTUAL_FRAMEWORKS"; then
            MISSING_COUNT=$((MISSING_COUNT + 1))
            MISSING_LIST="${MISSING_LIST}  - $expected"$'\n'
          fi
        done
        echo "Present frameworks (${ACTUAL_COUNT}):"
        if [[ -n "$ACTUAL_FRAMEWORKS" ]]; then
          while IFS= read -r framework; do
            echo "  - $framework"
          done <<< "$ACTUAL_FRAMEWORKS"
        fi
        TOTAL_EXPECTED=${#EXPECTED_FRAMEWORKS[@]}
        if [[ $MISSING_COUNT -gt 0 ]]; then
          echo ""
          echo "::warning::Missing $MISSING_COUNT of $TOTAL_EXPECTED expected framework:mode combinations (${ACTUAL_COUNT} present):"
          echo "$MISSING_LIST"
          echo "This is expected when some jobs fail — results will be published with available data."
        else
          echo ""
          echo "All ${TOTAL_EXPECTED} expected framework:mode combinations present"
        fi
    - name: Create GitHub Release with benchmark results
      if: steps.validate-artifacts.outputs.has-artifacts == 'true'
      env:
        GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
      run: |
        set -euo pipefail
        echo "=== Creating GitHub Release with benchmark results ==="
        # Commit of the checked-out (benchmarked) ref. github.sha would name
        # the ref the workflow was dispatched from, which differs whenever the
        # `branch` input points elsewhere.
        # NOTE(review): this is the branch tip at aggregation time; if the
        # branch moved while the benchmarks ran it can differ from the commit
        # the bench jobs checked out.
        COMMIT_SHA="$(git rev-parse HEAD)"
        SHORT_SHA="${COMMIT_SHA:0:7}"
        TAG="benchmark-run-${{ github.run_id }}"
        DATE=$(date -u +"%Y-%m-%d")
        # Create metadata file alongside aggregated data
        cat > consolidated-output/metadata.json <<EOF
        {
          "updated_at": "$(date -u +"%Y-%m-%dT%H:%M:%SZ")",
          "commit": "${COMMIT_SHA}",
          "run_id": "${{ github.run_id }}",
          "run_url": "https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}"
        }
        EOF
        # --target pins the new tag to the benchmarked commit instead of the
        # default branch head.
        gh release create "$TAG" \
          --prerelease \
          --target "$COMMIT_SHA" \
          --title "Benchmark Results ${DATE} (${SHORT_SHA})" \
          --notes "Comparative benchmark results from workflow run [${{ github.run_id }}](https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}).

        **Commit:** ${COMMIT_SHA}

        **Date:** ${DATE}" \
          consolidated-output/aggregated.json \
          consolidated-output/metadata.json