# Taskfile schema version.
version: '3'

# Global variables shared by the tasks below.
vars:
  # Fixture directory passed to the harness via --fixtures.
  FIXTURES_DIR: 'tools/benchmark-harness/fixtures'
  # Harness binary invoked by the `run` task.
  HARNESS_PATH: './target/release/benchmark-harness'
  # Root directory for per-framework/per-mode results written by `run`.
  BENCHMARK_RESULTS_DIR: 'benchmark-results'
  # Root directory for flamegraph output written by `profile`.
  FLAMEGRAPH_DIR: 'flamegraphs'

tasks:
  # Runs the harness binary at HARNESS_PATH for one framework in one mode and
  # writes results under BENCHMARK_RESULTS_DIR/<FRAMEWORK>-<MODE>.
  run:
    desc: 'Run benchmark harness with profiling support'
    # Both variables must be supplied by the caller.
    requires:
      vars: [FRAMEWORK, MODE]
    vars:
      # Caller-overridable knobs with their fallback values.
      ITERATIONS: '{{ .ITERATIONS | default "1" }}'
      TIMEOUT: '{{ .TIMEOUT | default "900" }}'
      # "single-file" mode runs with concurrency 1; every other mode uses 4.
      MAX_CONCURRENT: '{{ if eq .MODE "single-file" }}1{{ else }}4{{ end }}'
    env:
      RUST_BACKTRACE: 'short'
    cmds:
      # Make sure the output directory exists before the harness writes to it.
      - 'mkdir -p "{{.BENCHMARK_RESULTS_DIR}}/{{.FRAMEWORK}}-{{.MODE}}"'
      - |
        {{.HARNESS_PATH}} \
          run \
          --fixtures "{{.FIXTURES_DIR}}" \
          --frameworks "{{.FRAMEWORK}}" \
          --output "{{.BENCHMARK_RESULTS_DIR}}/{{.FRAMEWORK}}-{{.MODE}}" \
          --iterations "{{.ITERATIONS}}" \
          --timeout "{{.TIMEOUT}}" \
          --mode "{{.MODE}}" \
          --max-concurrent "{{.MAX_CONCURRENT}}"

  # Builds the CLI and the harness with the `profiling` cargo profile, then
  # runs pipeline-benchmark with --profile-dir pointing at a directory named
  # after the current short git SHA.
  profile:
    desc: 'Run pipeline-benchmark with flamegraph profiling. Builds with --profile profiling so Rust symbols are resolved.'
    vars:
      # Caller-overridable; fall back to the "baseline" path and "pdf" filter.
      PIPELINE: '{{ .PIPELINE | default "baseline" }}'
      DOC_FILTER: '{{ .DOC_FILTER | default "pdf" }}'
      # Short commit hash, used as the per-run output sub-directory name.
      SHA: {sh: 'git rev-parse --short HEAD'}
    env:
      RUST_BACKTRACE: 'short'
    cmds:
      # Build with the `profiling` profile (inherits release, retains debug
      # info). A plain --release build strips Rust symbols, leaving the
      # flamegraph full of __mh_execute_header / raw addresses and unable
      # to surface kreuzberg::* hotspots. See docs/perf/profiling.md.
      - >-
        cargo build --profile profiling
        -p kreuzberg-cli
        --features all
      - >-
        cargo build --profile profiling
        -p benchmark-harness
        --features profiling
      - 'mkdir -p "{{.FLAMEGRAPH_DIR}}/{{.SHA}}"'
      - |
        target/profiling/benchmark-harness pipeline-benchmark \
          --fixtures "{{.FIXTURES_DIR}}" \
          --paths "{{.PIPELINE}}" \
          --doc "{{.DOC_FILTER}}" \
          --profile-dir "{{.FLAMEGRAPH_DIR}}/{{.SHA}}"
      # Tell the user where the output landed and how to post-process it.
      - >-
        echo "Flamegraph SVGs in {{.FLAMEGRAPH_DIR}}/{{.SHA}}/.
        Run: python3 tools/perf/extract_top_symbols.py
        {{.FLAMEGRAPH_DIR}}/{{.SHA}}/{{.PIPELINE}}.svg"

  # Runs the harness `compare` subcommand over the fixtures with --guardrails.
  compare:
    desc: 'Framework comparison with quality guardrails (baseline vs layout)'
    cmds:
      - >-
        cargo run -p benchmark-harness --
        compare
        --fixtures "{{.FIXTURES_DIR}}"
        --guardrails

  # pipeline-benchmark restricted to the `baseline` and `layout` paths.
  pipeline:quick:
    desc: 'Pipeline benchmark — native paths only (P1+P2)'
    cmds:
      - >-
        cargo run -p benchmark-harness --
        pipeline-benchmark
        --fixtures "{{.FIXTURES_DIR}}"
        --paths baseline,layout

  # pipeline-benchmark without a --paths filter.
  # NOTE(review): presumably the harness then defaults to every path — confirm.
  pipeline:all:
    desc: 'Pipeline benchmark — all 6 extraction paths'
    cmds:
      - >-
        cargo run -p benchmark-harness --
        pipeline-benchmark
        --fixtures "{{.FIXTURES_DIR}}"

  # Runs the harness `survey` subcommand over the fixtures with --types pdf.
  survey:
    desc: 'Corpus-wide extraction stats for all PDFs'
    cmds:
      - >-
        cargo run -p benchmark-harness --
        survey
        --fixtures "{{.FIXTURES_DIR}}"
        --types pdf

  # Runs the harness `model-benchmark` subcommand over the fixtures.
  models:
    desc: 'Layout model A/B comparison (fast vs accurate)'
    cmds:
      - >-
        cargo run -p benchmark-harness --
        model-benchmark
        --fixtures "{{.FIXTURES_DIR}}"

  # Runs the ground-truth generation script through `uv run --no-sync`.
  generate-gt:
    desc: 'Generate markdown ground truth from PDFs using Gemini'
    cmds:
      - 'uv run --no-sync tools/benchmark-harness/scripts/generate_markdown_gt.py'

  # Runs the OmniDocBench download script via bash.
  download:omnidocbench:
    desc: 'Download OmniDocBench dataset from HuggingFace (~1.3 GB)'
    cmds:
      - 'bash tools/benchmark-harness/scripts/download_omnidocbench.sh'
    # Considered up to date once the dataset index file exists, so the
    # download is not repeated.
    status:
      - 'test -f tools/benchmark-harness/datasets/omnidocbench/OmniDocBench.json'

  # Converts the downloaded OmniDocBench dataset into benchmark fixtures by
  # running the import script with the dataset directory and `.` as arguments.
  import:omnidocbench:
    desc: "Import OmniDocBench into benchmark fixtures (run download:omnidocbench first)"
    # The download task is a declared dependency, so it is evaluated before
    # the import command runs.
    deps: ["download:omnidocbench"]
    cmds:
      - python3 tools/benchmark-harness/scripts/import_omnidocbench.py tools/benchmark-harness/datasets/omnidocbench .
    status:
      # Up to date once at least one imported fixture exists. `ls` is used
      # rather than `test -f` because the glob can expand to many files:
      # `test -f a b c` is a usage error and exits non-zero, which made this
      # task re-run on every invocation after a successful import. With no
      # match the pattern stays literal and `ls` fails, so the task runs.
      - ls tools/benchmark-harness/fixtures/pdf/omnidoc_*.json >/dev/null 2>&1

  # Deletes the benchmark results directory and the flamegraph directory.
  clean-results:
    desc: 'Clean up benchmark results and profiles'
    cmds:
      - 'rm -rf "{{.BENCHMARK_RESULTS_DIR}}"'
      - 'rm -rf "{{.FLAMEGRAPH_DIR}}"'