# .task/workflows/benchmark.yml

version: '3'

# Shared locations referenced by the tasks in this file.
vars:
  # Fixture corpus handed to the harness through --fixtures.
  FIXTURES_DIR: 'tools/benchmark-harness/fixtures'
  # Harness binary invoked by the `run` task.
  HARNESS_PATH: './target/release/benchmark-harness'
  # Parent directory of the per-framework/mode output written by `run`;
  # removed by `clean-results`.
  BENCHMARK_RESULTS_DIR: 'benchmark-results'
  # Parent directory of the per-commit output written by `profile`;
  # removed by `clean-results`.
  FLAMEGRAPH_DIR: 'flamegraphs'

tasks:
  run:
    desc: 'Run benchmark harness with profiling support'
    # FRAMEWORK and MODE have no defaults and must be supplied by the caller.
    requires:
      vars:
        - FRAMEWORK
        - MODE
    env:
      RUST_BACKTRACE: short
    vars:
      # Optional overrides; the defaults apply when the caller passes nothing.
      ITERATIONS: '{{ .ITERATIONS | default "1" }}'
      TIMEOUT: '{{ .TIMEOUT | default "900" }}'
      # "single-file" mode gets 1; every other mode gets 4.
      MAX_CONCURRENT: '{{ if eq .MODE "single-file" }}1{{ else }}4{{ end }}'
      # One output directory per framework/mode pair, shared by both commands.
      RUN_OUTPUT_DIR: '{{.BENCHMARK_RESULTS_DIR}}/{{.FRAMEWORK}}-{{.MODE}}'
    cmds:
      # Create the output directory before the harness is started.
      - 'mkdir -p "{{.RUN_OUTPUT_DIR}}"'
      - |
        {{.HARNESS_PATH}} \
          run \
          --fixtures "{{.FIXTURES_DIR}}" \
          --frameworks "{{.FRAMEWORK}}" \
          --output "{{.RUN_OUTPUT_DIR}}" \
          --iterations "{{.ITERATIONS}}" \
          --timeout "{{.TIMEOUT}}" \
          --mode "{{.MODE}}" \
          --max-concurrent "{{.MAX_CONCURRENT}}"

  profile:
    desc: >-
      Run pipeline-benchmark with flamegraph profiling.
      Builds with --profile profiling so Rust symbols are resolved.
    env:
      RUST_BACKTRACE: short
    vars:
      # Optional overrides for the --paths and --doc arguments.
      PIPELINE: '{{ .PIPELINE | default "baseline" }}'
      DOC_FILTER: '{{ .DOC_FILTER | default "pdf" }}'
      # Short hash of the current commit; names the per-commit output directory.
      SHA:
        sh: git rev-parse --short HEAD
    cmds:
      # Both crates are built with the `profiling` profile, which inherits
      # release and retains debug info. A plain --release build strips Rust
      # symbols, so the flamegraph fills up with __mh_execute_header / raw
      # addresses and cannot surface kreuzberg::* hotspots.
      # See docs/perf/profiling.md.
      - 'cargo build --profile profiling -p kreuzberg-cli --features all'
      - 'cargo build --profile profiling -p benchmark-harness --features profiling'
      - 'mkdir -p "{{.FLAMEGRAPH_DIR}}/{{.SHA}}"'
      # Run the profiling-profile binary built above, not the release one.
      - |
        target/profiling/benchmark-harness pipeline-benchmark \
          --fixtures "{{.FIXTURES_DIR}}" \
          --paths "{{.PIPELINE}}" \
          --doc "{{.DOC_FILTER}}" \
          --profile-dir "{{.FLAMEGRAPH_DIR}}/{{.SHA}}"
      # Print where the output went and the follow-up command to run.
      - >-
        echo "Flamegraph SVGs in {{.FLAMEGRAPH_DIR}}/{{.SHA}}/. Run:
        python3 tools/perf/extract_top_symbols.py
        {{.FLAMEGRAPH_DIR}}/{{.SHA}}/{{.PIPELINE}}.svg"

  compare:
    desc: 'Framework comparison with quality guardrails (baseline vs layout)'
    cmds:
      # Runs the harness `compare` subcommand through cargo, with --guardrails.
      - >-
        cargo run -p benchmark-harness -- compare
        --fixtures "{{.FIXTURES_DIR}}" --guardrails

  pipeline:quick:
    desc: 'Pipeline benchmark — native paths only (P1+P2)'
    cmds:
      # Restricts the benchmark to the `baseline` and `layout` paths.
      - >-
        cargo run -p benchmark-harness -- pipeline-benchmark
        --fixtures "{{.FIXTURES_DIR}}" --paths baseline,layout

  pipeline:all:
    desc: 'Pipeline benchmark — all 6 extraction paths'
    cmds:
      # No --paths argument is passed here, unlike `pipeline:quick`.
      - >-
        cargo run -p benchmark-harness -- pipeline-benchmark
        --fixtures "{{.FIXTURES_DIR}}"

  survey:
    desc: 'Corpus-wide extraction stats for all PDFs'
    cmds:
      # Runs the harness `survey` subcommand with --types pdf.
      - >-
        cargo run -p benchmark-harness -- survey
        --fixtures "{{.FIXTURES_DIR}}" --types pdf

  models:
    desc: 'Layout model A/B comparison (fast vs accurate)'
    cmds:
      # Runs the harness `model-benchmark` subcommand over the fixture corpus.
      - >-
        cargo run -p benchmark-harness -- model-benchmark
        --fixtures "{{.FIXTURES_DIR}}"

  generate-gt:
    desc: 'Generate markdown ground truth from PDFs using Gemini'
    cmds:
      # Runs the generator script via `uv run` with --no-sync.
      - >-
        uv run --no-sync
        tools/benchmark-harness/scripts/generate_markdown_gt.py

  download:omnidocbench:
    desc: 'Download OmniDocBench dataset from HuggingFace (~1.3 GB)'
    # The task counts as up to date once OmniDocBench.json exists, so the
    # download script is skipped on later runs.
    status:
      - 'test -f tools/benchmark-harness/datasets/omnidocbench/OmniDocBench.json'
    cmds:
      - 'bash tools/benchmark-harness/scripts/download_omnidocbench.sh'

  import:omnidocbench:
    desc: "Import OmniDocBench into benchmark fixtures (run download:omnidocbench first)"
    # The download task is declared as a dependency, so it runs before the
    # import commands.
    deps: ["download:omnidocbench"]
    cmds:
      - python3 tools/benchmark-harness/scripts/import_omnidocbench.py tools/benchmark-harness/datasets/omnidocbench .
    status:
      # Up to date once at least one imported fixture exists. `test -f` accepts
      # a single operand, so passing it the glob directly fails as soon as the
      # pattern matches more than one file and the import would re-run every
      # time. Instead the glob is expanded into the positional parameters and
      # only the first match is tested; with no match "$1" is the literal
      # pattern, the test fails, and the import runs.
      - |
        set -- tools/benchmark-harness/fixtures/pdf/omnidoc_*.json
        test -f "$1"

  clean-results:
    desc: 'Clean up benchmark results and profiles'
    cmds:
      # Deletes the whole results tree, then the whole flamegraph tree.
      - 'rm -rf "{{.BENCHMARK_RESULTS_DIR}}"'
      - 'rm -rf "{{.FLAMEGRAPH_DIR}}"'
# Nomad changes 2026-06-01 23:40:55 +02:00
# (The snapshot recorded with this entry repeated the version, vars and tasks
# definitions of this file.)