# File listing metadata: 110 lines, 4.0 KiB, YAML
---
# Taskfile (schema version 3) for the benchmark harness: running benchmarks,
# flamegraph profiling, dataset download/import, and cleanup of outputs.
version: "3"

# Shared relative paths referenced by the tasks below.
vars:
  FIXTURES_DIR: "tools/benchmark-harness/fixtures"
  HARNESS_PATH: "./target/release/benchmark-harness"
  BENCHMARK_RESULTS_DIR: "benchmark-results"
  FLAMEGRAPH_DIR: "flamegraphs"

tasks:
  # Invokes the binary at HARNESS_PATH; this task does not build it.
  # FRAMEWORK and MODE have no defaults and must be supplied by the caller.
  run:
    desc: "Run benchmark harness with profiling support"
    requires:
      vars:
        - FRAMEWORK
        - MODE
    vars:
      # Caller-supplied values win; otherwise the defaults below apply.
      ITERATIONS: '{{ .ITERATIONS | default "1" }}'
      TIMEOUT: '{{ .TIMEOUT | default "900" }}'
      # 1 when MODE is "single-file", 4 for every other MODE.
      MAX_CONCURRENT: '{{ if eq .MODE "single-file" }}1{{ else }}4{{ end }}'
    env:
      RUST_BACKTRACE: short
    cmds:
      # Results are written to a per-framework, per-mode directory.
      - mkdir -p "{{.BENCHMARK_RESULTS_DIR}}/{{.FRAMEWORK}}-{{.MODE}}"
      - |
        {{.HARNESS_PATH}} \
          run \
          --fixtures "{{.FIXTURES_DIR}}" \
          --frameworks "{{.FRAMEWORK}}" \
          --output "{{.BENCHMARK_RESULTS_DIR}}/{{.FRAMEWORK}}-{{.MODE}}" \
          --iterations "{{.ITERATIONS}}" \
          --timeout "{{.TIMEOUT}}" \
          --mode "{{.MODE}}" \
          --max-concurrent "{{.MAX_CONCURRENT}}"

  # Builds both crates with the `profiling` cargo profile, then runs
  # pipeline-benchmark with output under FLAMEGRAPH_DIR/<short commit hash>.
  profile:
    desc: "Run pipeline-benchmark with flamegraph profiling. Builds with --profile profiling so Rust symbols are resolved."
    vars:
      PIPELINE: '{{ .PIPELINE | default "baseline" }}'
      DOC_FILTER: '{{ .DOC_FILTER | default "pdf" }}'
      # Short hash of HEAD; names the per-commit output directory.
      SHA:
        sh: git rev-parse --short HEAD
    env:
      RUST_BACKTRACE: short
    cmds:
      # Build with the `profiling` profile (inherits release, retains debug
      # info). A plain --release build strips Rust symbols, leaving the
      # flamegraph full of __mh_execute_header / raw addresses and unable
      # to surface kreuzberg::* hotspots. See docs/perf/profiling.md.
      - cargo build --profile profiling -p kreuzberg-cli --features all
      - cargo build --profile profiling -p benchmark-harness --features profiling
      - mkdir -p "{{.FLAMEGRAPH_DIR}}/{{.SHA}}"
      - |
        target/profiling/benchmark-harness pipeline-benchmark \
          --fixtures "{{.FIXTURES_DIR}}" \
          --paths "{{.PIPELINE}}" \
          --doc "{{.DOC_FILTER}}" \
          --profile-dir "{{.FLAMEGRAPH_DIR}}/{{.SHA}}"
      - 'echo "Flamegraph SVGs in {{.FLAMEGRAPH_DIR}}/{{.SHA}}/. Run: python3 tools/perf/extract_top_symbols.py {{.FLAMEGRAPH_DIR}}/{{.SHA}}/{{.PIPELINE}}.svg"'

  compare:
    desc: "Framework comparison with quality guardrails (baseline vs layout)"
    cmds:
      - cargo run -p benchmark-harness -- compare --fixtures "{{.FIXTURES_DIR}}" --guardrails

  # Same subcommand as pipeline:all, restricted via --paths.
  pipeline:quick:
    desc: "Pipeline benchmark — native paths only (P1+P2)"
    cmds:
      - cargo run -p benchmark-harness -- pipeline-benchmark --fixtures "{{.FIXTURES_DIR}}" --paths baseline,layout

  # No --paths flag is passed here.
  pipeline:all:
    desc: "Pipeline benchmark — all 6 extraction paths"
    cmds:
      - cargo run -p benchmark-harness -- pipeline-benchmark --fixtures "{{.FIXTURES_DIR}}"

  survey:
    desc: "Corpus-wide extraction stats for all PDFs"
    cmds:
      - cargo run -p benchmark-harness -- survey --fixtures "{{.FIXTURES_DIR}}" --types pdf

  models:
    desc: "Layout model A/B comparison (fast vs accurate)"
    cmds:
      - cargo run -p benchmark-harness -- model-benchmark --fixtures "{{.FIXTURES_DIR}}"

  generate-gt:
    desc: "Generate markdown ground truth from PDFs using Gemini"
    cmds:
      - uv run --no-sync tools/benchmark-harness/scripts/generate_markdown_gt.py

  # Skipped when the dataset index file already exists (see `status`).
  download:omnidocbench:
    desc: "Download OmniDocBench dataset from HuggingFace (~1.3 GB)"
    cmds:
      - bash tools/benchmark-harness/scripts/download_omnidocbench.sh
    status:
      - test -f tools/benchmark-harness/datasets/omnidocbench/OmniDocBench.json

  # `deps` runs download:omnidocbench before this task's commands.
  import:omnidocbench:
    desc: "Import OmniDocBench into benchmark fixtures (run download:omnidocbench first)"
    deps: ["download:omnidocbench"]
    cmds:
      - python3 tools/benchmark-harness/scripts/import_omnidocbench.py tools/benchmark-harness/datasets/omnidocbench .
    status:
      # Up to date once at least one imported fixture exists. A bare
      # `test -f <glob>` errors when the glob expands to several files, so the
      # matches are loaded into the positional parameters and only the first is
      # tested. With no match the pattern stays literal and the test fails.
      - 'set -- tools/benchmark-harness/fixtures/pdf/omnidoc_*.json && test -f "$1"'

  # Removes both output directories created by `run` and `profile`.
  clean-results:
    desc: "Clean up benchmark results and profiles"
    cmds:
      - rm -rf "{{.BENCHMARK_RESULTS_DIR}}"
      - rm -rf "{{.FLAMEGRAPH_DIR}}"