Nomad changes

2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions
--- a/.github/actions/cache-benchmark-harness/action.yml
+++ b/.github/actions/cache-benchmark-harness/action.yml
@@ -0,0 +1,313 @@
+# Composite action that restores the benchmark-harness binary from cache,
+# builds it on a cache miss, validates it, and saves it back to the cache.
+name: Cache Benchmark Harness Binary
+description: >
+  Build and cache the benchmark-harness binary with intelligent caching based on source hashes.
+  Generates cache keys based on harness source + kreuzberg dependency + Cargo files,
+  restores from cache if available, builds if needed, and saves to cache.
+  Validates artifacts after restore or build to ensure integrity.
+
+inputs:
+  # Used verbatim as the final segment of the generated cache key, so changing
+  # it produces a different key.
+  cache-version:
+    description: "Manual version for cache invalidation"
+    required: false
+    default: "v1"
+
+  # Only "release" and "debug" are accepted; the first step rejects anything else.
+  build-profile:
+    description: "Build profile (release, debug)"
+    required: false
+    default: "release"
+
+outputs:
+  # Forwarded unchanged from the cache restore step.
+  cache-hit:
+    description: "Boolean indicating exact cache hit"
+    value: ${{ steps.cache-restore.outputs.cache-hit }}
+
+  cache-key:
+    description: "The cache key used"
+    value: ${{ steps.generate-cache-key.outputs.cache-key }}
+
+  # Written by the validation step only after the file was found to exist and
+  # to be executable.
+  binary-path:
+    description: "Path to the built/cached benchmark-harness binary"
+    value: ${{ steps.validate-binary.outputs.binary-path }}
+
+runs:
+  using: composite
+  steps:
+    # Validate inputs
+    #
+    # Both inputs are handed to the script through `env:` instead of being
+    # expanded into the script text, so bash treats their values as data and
+    # they cannot inject commands.
+    - name: Validate inputs
+      shell: bash
+      env:
+        BUILD_PROFILE: ${{ inputs.build-profile }}
+        CACHE_VERSION: ${{ inputs.cache-version }}
+      run: |
+        set -euo pipefail
+
+        # Validate build profile. `case` matches the whole value, so only the
+        # exact strings "release" and "debug" pass.
+        case "$BUILD_PROFILE" in
+          release | debug) ;;
+          *)
+            echo "❌ Error: build-profile must be one of: release debug"
+            exit 1
+            ;;
+        esac
+
+        echo "✓ Validation passed"
+        echo "  Build profile: $BUILD_PROFILE"
+        echo "  Cache version: $CACHE_VERSION"
+
+    # Compute hash for benchmark-harness sources
+    #
+    # Publishes the `harness-hash` step output, which becomes part of the cache key.
+    - name: Compute benchmark-harness source hash
+      id: harness-hash
+      shell: bash
+      run: |
+        set -euo pipefail
+
+        echo "=== Computing Benchmark Harness Source Hash ==="
+
+        # Run the helper on the harness sources and Cargo.toml, keeping its
+        # combined stdout/stderr so a failing helper is reported with its output
+        # instead of ending the step without any message.
+        if ! HASH_OUTPUT=$(scripts/ci/cache/compute-hash.sh \
+          "tools/benchmark-harness/src/**" \
+          "tools/benchmark-harness/Cargo.toml" \
+          2>&1); then
+          echo "❌ Failed to compute harness source hash"
+          echo "$HASH_OUTPUT"
+          exit 1
+        fi
+
+        # Keep the last line consisting solely of lowercase hex digits. `+`
+        # (not `*`) rejects blank lines, and `|| true` stops `pipefail` from
+        # ending the script before the check below can print its error.
+        HARNESS_HASH=$(grep -E '^[a-f0-9]+$' <<<"$HASH_OUTPUT" | tail -n 1 || true)
+
+        if [[ -z "$HARNESS_HASH" ]]; then
+          echo "❌ Failed to compute harness source hash"
+          echo "$HASH_OUTPUT"
+          exit 1
+        fi
+
+        echo "harness-hash=$HARNESS_HASH" >> "$GITHUB_OUTPUT"
+        echo "✓ Harness source hash: $HARNESS_HASH"
+
+    # Compute hash for kreuzberg dependency
+    #
+    # Publishes the `kreuzberg-hash` step output, which becomes part of the cache key.
+    - name: Compute kreuzberg dependency hash
+      id: kreuzberg-hash
+      shell: bash
+      run: |
+        set -euo pipefail
+
+        echo "=== Computing Kreuzberg Dependency Hash ==="
+
+        # Run the helper on the kreuzberg crate directory, keeping its combined
+        # stdout/stderr so a failing helper is reported with its output instead
+        # of ending the step without any message.
+        if ! HASH_OUTPUT=$(scripts/ci/cache/compute-hash.sh --dirs \
+          "crates/kreuzberg" \
+          2>&1); then
+          echo "❌ Failed to compute kreuzberg dependency hash"
+          echo "$HASH_OUTPUT"
+          exit 1
+        fi
+
+        # Keep the last line consisting solely of lowercase hex digits. `+`
+        # (not `*`) rejects blank lines, and `|| true` stops `pipefail` from
+        # ending the script before the check below can print its error.
+        KREUZBERG_HASH=$(grep -E '^[a-f0-9]+$' <<<"$HASH_OUTPUT" | tail -n 1 || true)
+
+        if [[ -z "$KREUZBERG_HASH" ]]; then
+          echo "❌ Failed to compute kreuzberg dependency hash"
+          echo "$HASH_OUTPUT"
+          exit 1
+        fi
+
+        echo "kreuzberg-hash=$KREUZBERG_HASH" >> "$GITHUB_OUTPUT"
+        echo "✓ Kreuzberg dependency hash: $KREUZBERG_HASH"
+
+    # Compute hash for Cargo files
+    #
+    # Publishes the `cargo-hash` step output, which becomes part of the cache key.
+    - name: Compute Cargo files hash
+      id: cargo-hash
+      shell: bash
+      run: |
+        set -euo pipefail
+
+        echo "=== Computing Cargo Files Hash ==="
+
+        # Run the helper on Cargo.lock, keeping its combined stdout/stderr so a
+        # failing helper is reported with its output instead of ending the step
+        # without any message.
+        if ! HASH_OUTPUT=$(scripts/ci/cache/compute-hash.sh --files Cargo.lock 2>&1); then
+          echo "❌ Failed to compute Cargo files hash"
+          echo "$HASH_OUTPUT"
+          exit 1
+        fi
+
+        # Keep the last line consisting solely of lowercase hex digits. `+`
+        # (not `*`) rejects blank lines, and `|| true` stops `pipefail` from
+        # ending the script before the check below can print its error.
+        CARGO_HASH=$(grep -E '^[a-f0-9]+$' <<<"$HASH_OUTPUT" | tail -n 1 || true)
+
+        if [[ -z "$CARGO_HASH" ]]; then
+          echo "❌ Failed to compute Cargo files hash"
+          echo "$HASH_OUTPUT"
+          exit 1
+        fi
+
+        echo "cargo-hash=$CARGO_HASH" >> "$GITHUB_OUTPUT"
+        echo "✓ Cargo files hash: $CARGO_HASH"
+
+    # Generate cache key
+    #
+    # The key carries the runner OS in addition to the CPU architecture:
+    # `uname -m` reports the same value (for example x86_64) on different
+    # operating systems, so a key without the OS could exactly match a binary
+    # that was built for another OS.
+    - name: Generate cache key
+      id: generate-cache-key
+      shell: bash
+      env:
+        BUILD_PROFILE: ${{ inputs.build-profile }}
+        OS_NAME: ${{ runner.os }}
+        HARNESS_HASH: ${{ steps.harness-hash.outputs.harness-hash }}
+        KREUZBERG_HASH: ${{ steps.kreuzberg-hash.outputs.kreuzberg-hash }}
+        CARGO_HASH: ${{ steps.cargo-hash.outputs.cargo-hash }}
+        CACHE_VERSION: ${{ inputs.cache-version }}
+      run: |
+        set -euo pipefail
+
+        echo "=== Cache Key Generated ==="
+
+        ARCH=$(uname -m)
+
+        # Build cache key following format:
+        # harness-{profile}-{os}-{arch}-src-{harness-hash}-kreuzberg-{kreuzberg-hash}-cargo-{cargo-hash}-{version}
+        CACHE_KEY="harness-${BUILD_PROFILE}-${OS_NAME}-${ARCH}-src-${HARNESS_HASH}-kreuzberg-${KREUZBERG_HASH}-cargo-${CARGO_HASH}-${CACHE_VERSION}"
+
+        echo "cache-key=$CACHE_KEY" >> "$GITHUB_OUTPUT"
+
+        echo "Full key: $CACHE_KEY"
+        echo ""
+        echo "Key components:"
+        echo "  Profile:         $BUILD_PROFILE"
+        echo "  Platform:        ${OS_NAME}-${ARCH}"
+        echo "  Harness hash:    $HARNESS_HASH"
+        echo "  Kreuzberg hash:  $KREUZBERG_HASH"
+        echo "  Cargo hash:      $CARGO_HASH"
+        echo "  Cache version:   $CACHE_VERSION"
+
+    # Determine target path based on profile
+    #
+    # Publishes `target-dir`, the directory the binary is expected in.
+    - name: Determine target paths
+      id: target-paths
+      shell: bash
+      env:
+        BUILD_PROFILE: ${{ inputs.build-profile }}
+      run: |
+        set -euo pipefail
+
+        echo "=== Determining Target Paths ==="
+
+        case "$BUILD_PROFILE" in
+          release)
+            TARGET_DIR="target/release"
+            ;;
+          debug)
+            TARGET_DIR="target/debug"
+            ;;
+          *)
+            echo "❌ Invalid build profile: $BUILD_PROFILE"
+            exit 1
+            ;;
+        esac
+
+        echo "target-dir=$TARGET_DIR" >> "$GITHUB_OUTPUT"
+        echo "✓ Target directory: $TARGET_DIR"
+
+    # Detect architecture for cache keys (shell expansion doesn't work in YAML with: context)
+    - name: Detect architecture
+      id: detect-arch
+      shell: bash
+      run: echo "arch=$(uname -m)" >> "$GITHUB_OUTPUT"
+
+    # Restore from cache
+    #
+    # The fallback prefixes mirror the key layout above and stay within the
+    # same OS and architecture, so a fallback can never bring in a binary
+    # built for a different platform.
+    - name: Restore benchmark-harness binary from cache
+      id: cache-restore
+      uses: kreuzberg-dev/actions/cache-binding-artifact@v1
+      with:
+        binding-name: benchmark-harness
+        cache-key: ${{ steps.generate-cache-key.outputs.cache-key }}
+        cache-restore-keys: |
+          harness-${{ inputs.build-profile }}-${{ runner.os }}-${{ steps.detect-arch.outputs.arch }}-src-
+          harness-${{ inputs.build-profile }}-${{ runner.os }}-${{ steps.detect-arch.outputs.arch }}-
+        cache-paths: |
+          ${{ steps.target-paths.outputs.target-dir }}/benchmark-harness
+        operation: restore
+
+    # Report whether the restore step found an exact cache match
+    - name: Log cache hit status
+      shell: bash
+      env:
+        CACHE_HIT: ${{ steps.cache-restore.outputs.cache-hit }}
+      run: |
+        set -euo pipefail
+
+        # Anything other than the literal string "true" counts as a miss
+        if [[ "$CACHE_HIT" != "true" ]]; then
+          echo "✗ Cache MISS - Building benchmark-harness from source"
+        else
+          echo "✓ Cache HIT - benchmark-harness binary found in cache"
+        fi
+
+    # Compile the harness when the restore step did not report an exact hit
+    - name: Build benchmark-harness
+      id: build
+      if: steps.cache-restore.outputs.cache-hit != 'true'
+      shell: bash
+      env:
+        BUILD_PROFILE: ${{ inputs.build-profile }}
+      run: |
+        set -euo pipefail
+
+        echo "=== Building Benchmark Harness ==="
+        echo "Profile: $BUILD_PROFILE"
+
+        # Translate the profile into cargo's flag; debug is cargo's default
+        # profile and therefore needs no flag at all.
+        if [[ "$BUILD_PROFILE" == "release" ]]; then
+          BUILD_ARG="--release"
+        elif [[ "$BUILD_PROFILE" == "debug" ]]; then
+          BUILD_ARG=""
+        else
+          echo "❌ Invalid build profile: $BUILD_PROFILE"
+          exit 1
+        fi
+
+        # $BUILD_ARG is left unquoted on purpose: an empty value must expand to
+        # no argument rather than to an empty-string argument.
+        echo "Running: cargo build --manifest-path tools/benchmark-harness/Cargo.toml $BUILD_ARG"
+        cargo build --manifest-path tools/benchmark-harness/Cargo.toml $BUILD_ARG || {
+          echo "❌ Build failed for benchmark-harness"
+          exit 1
+        }
+
+        echo "✓ Build succeeded"
+
+    # Confirm the binary is present and executable, then publish its path
+    - name: Validate benchmark-harness binary
+      id: validate-binary
+      shell: bash
+      env:
+        BUILD_PROFILE: ${{ inputs.build-profile }}
+        TARGET_DIR: ${{ steps.target-paths.outputs.target-dir }}
+      run: |
+        set -euo pipefail
+
+        echo "=== Validating Benchmark Harness Binary ==="
+
+        harness_bin="${TARGET_DIR}/benchmark-harness"
+
+        # Guard: the path must be a regular file
+        [[ -f "$harness_bin" ]] || {
+          echo "❌ Binary not found at: $harness_bin"
+          exit 1
+        }
+
+        # Guard: the file must carry the executable bit for this user
+        [[ -x "$harness_bin" ]] || {
+          echo "❌ Binary is not executable: $harness_bin"
+          exit 1
+        }
+
+        # Collect the mode string (column 1) and human-readable size (column 5)
+        # from `ls` for the log output below
+        harness_perms=$(ls -l "$harness_bin" | awk '{print $1}')
+        harness_size=$(ls -lh "$harness_bin" | awk '{print $5}')
+
+        echo "binary-path=$harness_bin" >> "$GITHUB_OUTPUT"
+
+        echo "✓ Binary validation passed"
+        echo "  Path:        $harness_bin"
+        echo "  Size:        $harness_size"
+        echo "  Permissions: $harness_perms"
+
+    # Save to cache if build occurred
+    #
+    # Guarded by the same condition as the build step, and stored under the
+    # exact key produced by the generate-cache-key step with the same path that
+    # the restore step uses.
+    - name: Save benchmark-harness binary to cache
+      if: steps.cache-restore.outputs.cache-hit != 'true'
+      uses: kreuzberg-dev/actions/cache-binding-artifact@v1
+      with:
+        binding-name: benchmark-harness
+        cache-key: ${{ steps.generate-cache-key.outputs.cache-key }}
+        cache-paths: |
+          ${{ steps.target-paths.outputs.target-dir }}/benchmark-harness
+        operation: save
+
+    # Summary
+    #
+    # This step runs with `if: always()`, i.e. also after input validation has
+    # failed. Every value is therefore passed through `env:` rather than being
+    # expanded into the script text, so a rejected `build-profile` value is
+    # only ever printed and never executed.
+    - name: Summary
+      if: always()
+      shell: bash
+      env:
+        BUILD_PROFILE: ${{ inputs.build-profile }}
+        CACHE_HIT_LABEL: ${{ steps.cache-restore.outputs.cache-hit == 'true' && 'Yes' || 'No' }}
+        CACHE_KEY: ${{ steps.generate-cache-key.outputs.cache-key }}
+        BINARY_PATH: ${{ steps.validate-binary.outputs.binary-path }}
+        HARNESS_HASH: ${{ steps.harness-hash.outputs.harness-hash }}
+        KREUZBERG_HASH: ${{ steps.kreuzberg-hash.outputs.kreuzberg-hash }}
+        CARGO_HASH: ${{ steps.cargo-hash.outputs.cargo-hash }}
+      run: |
+        set -euo pipefail
+
+        # Outputs of steps that were skipped or failed arrive as empty strings
+        echo ""
+        echo "=== Build and Cache Summary ==="
+        echo "Build Profile:   $BUILD_PROFILE"
+        echo "Platform:        $(uname -m)"
+        echo "Cache Hit:       $CACHE_HIT_LABEL"
+        echo "Cache Key:       $CACHE_KEY"
+        echo "Binary Path:     $BINARY_PATH"
+        echo ""
+        echo "Hashes:"
+        echo "  Harness:       $HARNESS_HASH"
+        echo "  Kreuzberg:     $KREUZBERG_HASH"
+        echo "  Cargo:         $CARGO_HASH"
--- a/.github/actions/install-system-deps/action.yml
+++ b/.github/actions/install-system-deps/action.yml
@@ -0,0 +1,105 @@
+# Composite action that caches and installs system dependencies per runner OS.
+name: Install System Dependencies
+description: |
+  Install and cache platform-specific dependencies required for document conversion.
+  Includes: Tesseract OCR, fonts, and build tools.
+  Features robust caching with architecture/version awareness, timeout handling, and retry logic.
+
+inputs:
+  # NOTE(review): no step in this action references `inputs.enable-retry`, so
+  # the value never reaches the install scripts from here — confirm whether it
+  # should be forwarded (e.g. via `env:`) or removed.
+  enable-retry:
+    description: Enable retry logic with exponential backoff
+    required: false
+    default: "true"
+
+runs:
+  using: composite
+  steps:
+    # --- macOS: detect version, restore cache, install ---
+    # The detect script is expected to publish a `version` output, which the
+    # cache key below reads.
+    - name: Detect Tesseract version (macOS)
+      if: runner.os == 'macOS'
+      id: detect-tesseract-macos
+      shell: bash
+      run: scripts/ci/install-system-deps/detect-tesseract-macos.sh
+
+    # Lists both the /usr/local and the /opt/homebrew locations.
+    - name: Cache Tesseract & tessdata (macOS)
+      if: runner.os == 'macOS'
+      id: cache-tesseract-macos
+      uses: actions/cache@v5
+      with:
+        path: |
+          /usr/local/opt/tesseract/
+          /usr/local/Cellar/tesseract/
+          /opt/homebrew/opt/tesseract/
+          /opt/homebrew/Cellar/tesseract/
+        key: tesseract-macos-${{ runner.arch }}-v5-${{ steps.detect-tesseract-macos.outputs.version }}
+        restore-keys: |
+          tesseract-macos-${{ runner.arch }}-v5-
+          tesseract-macos-${{ runner.arch }}-
+
+    # Runs regardless of the cache result; the cache-hit output is not passed on.
+    - name: Install dependencies (macOS)
+      if: runner.os == 'macOS'
+      shell: bash
+      run: scripts/ci/install-system-deps/install-macos.sh
+
+    # --- Linux: detect version, restore cache, install ---
+    # The detect script is expected to publish a `version` output, which the
+    # cache key below reads.
+    - name: Detect Tesseract version (Linux)
+      if: runner.os == 'Linux'
+      id: detect-tesseract-linux
+      shell: bash
+      run: scripts/ci/install-system-deps/detect-tesseract-linux.sh
+
+    # Only the tessdata directories are cached on Linux.
+    - name: Cache Tesseract data (Linux)
+      if: runner.os == 'Linux'
+      id: cache-tesseract-linux
+      uses: actions/cache@v5
+      with:
+        path: |
+          /usr/share/tesseract-ocr/5/tessdata/
+          /usr/share/tesseract-ocr/tessdata/
+        key: tesseract-linux-${{ runner.arch }}-v5-${{ steps.detect-tesseract-linux.outputs.version }}
+        restore-keys: |
+          tesseract-linux-${{ runner.arch }}-v5-
+          tesseract-linux-${{ runner.arch }}-
+
+    # Runs regardless of the cache result; the cache-hit output is not passed on.
+    - name: Install dependencies (Linux)
+      if: runner.os == 'Linux'
+      shell: bash
+      run: scripts/ci/install-system-deps/install-linux.sh
+
+    # --- Windows: three independent caches, then install ---
+    # The Windows keys are static (no detected version), so they change only
+    # when the literal suffix in the key is edited.
+    - name: Cache Tesseract (Windows)
+      if: runner.os == 'Windows'
+      id: cache-tesseract-windows
+      uses: actions/cache@v5
+      with:
+        path: |
+          C:\Program Files\Tesseract-OCR
+          C:\ProgramData\chocolatey\lib\tesseract
+        key: tesseract-windows-${{ runner.arch }}-v5-data
+        restore-keys: |
+          tesseract-windows-${{ runner.arch }}-
+
+    - name: Cache LLVM (Windows)
+      if: runner.os == 'Windows'
+      id: cache-llvm-windows
+      uses: actions/cache@v5
+      with:
+        path: |
+          C:\Program Files\LLVM
+          C:\ProgramData\chocolatey\lib\llvm
+        key: llvm-windows-${{ runner.arch }}-v1
+
+    - name: Cache CMake (Windows)
+      if: runner.os == 'Windows'
+      id: cache-cmake-windows
+      uses: actions/cache@v5
+      with:
+        path: |
+          C:\Program Files\CMake
+          C:\ProgramData\chocolatey\lib\cmake
+        key: cmake-windows-${{ runner.arch }}-v1
+
+    # The three cache-hit results are handed to the install script as
+    # environment variables.
+    - name: Install dependencies (Windows)
+      if: runner.os == 'Windows'
+      shell: pwsh
+      env:
+        TESSERACT_CACHE_HIT: ${{ steps.cache-tesseract-windows.outputs.cache-hit }}
+        LLVM_CACHE_HIT: ${{ steps.cache-llvm-windows.outputs.cache-hit }}
+        CMAKE_CACHE_HIT: ${{ steps.cache-cmake-windows.outputs.cache-hit }}
+      run: pwsh -File scripts/ci/install-system-deps/install-windows.ps1
--- a/.github/actions/setup-layout-models/action.yml
+++ b/.github/actions/setup-layout-models/action.yml
@@ -0,0 +1,197 @@
+# Composite action that restores or downloads the layout detection models and
+# reports which of them are present.
+name: Setup Layout Detection Models Cache
+description: Download and cache layout detection ONNX models (RT-DETR + TATR) for CI testing
+
+inputs:
+  # Compared against the string 'true'; any other value skips the cache step.
+  cache-enabled:
+    description: Enable model caching (set to false for cross-arch builds)
+    required: false
+    default: "true"
+  # Tested with contains(), i.e. as a substring match on the whole value.
+  models:
+    description: Comma-separated list of models to setup (rtdetr,tatr)
+    required: false
+    default: "rtdetr,tatr"
+  # Despite the name, this value is the leading segment of the cache key.
+  cache-key-suffix:
+    description: Suffix for cache key to differentiate model sets
+    required: false
+    default: "layout-models-v2"
+
+outputs:
+  # Empty when the cache step was skipped because cache-enabled was not 'true'.
+  cache-hit:
+    description: Whether models were restored from cache (true/false)
+    value: ${{ steps.cache-models.outputs.cache-hit }}
+  cache-dir:
+    description: Path to the layout model cache directory
+    value: ${{ steps.set-outputs.outputs.cache-dir }}
+  models-available:
+    description: Comma-separated list of available models
+    value: ${{ steps.verify-models.outputs.available-models }}
+
+runs:
+  using: composite
+  steps:
+    # Create the directory up front so it exists whether or not the cache is used.
+    - name: Setup cache directory
+      shell: bash
+      run: |
+        mkdir -p ~/.cache/kreuzberg/layout
+        echo "Cache directory: $HOME/.cache/kreuzberg/layout"
+
+    # The key ends with the first eight hex digits of the two SHA256 values
+    # checked by the verification steps of this action, so those fragments need
+    # to change together with the expected checksums.
+    # When this step is skipped its cache-hit output is empty, which the later
+    # `!= 'true'` conditions treat as a miss.
+    - name: Restore layout models from cache
+      if: inputs.cache-enabled == 'true'
+      uses: actions/cache@v5
+      id: cache-models
+      with:
+        path: ~/.cache/kreuzberg/layout
+        key: ${{ inputs.cache-key-suffix }}-${{ runner.os }}-${{ runner.arch }}-rtdetr_3bf2fb0e+tatr_c11f4033
+        restore-keys: |
+          ${{ inputs.cache-key-suffix }}-${{ runner.os }}-${{ runner.arch }}-
+          ${{ inputs.cache-key-suffix }}-${{ runner.os }}-
+          ${{ inputs.cache-key-suffix }}-
+
+    # Fetch the RT-DETR model unless the cache step reported an exact hit.
+    - name: Download RT-DETR model (rtdetr)
+      if: contains(inputs.models, 'rtdetr') && steps.cache-models.outputs.cache-hit != 'true'
+      shell: bash
+      run: |
+        MODEL_URL="https://huggingface.co/Kreuzberg/layout-models/resolve/main/rtdetr/model.onnx"
+        CACHE_DIR="$HOME/.cache/kreuzberg/layout"
+        MODEL_DIR="$CACHE_DIR/rtdetr"
+        MODEL_FILE="$MODEL_DIR/model.onnx"
+        # Download into a scratch file first: an interrupted transfer must not
+        # leave a truncated model.onnx behind, nor overwrite a file that was
+        # already restored into the cache directory.
+        PARTIAL_FILE="$MODEL_FILE.partial"
+        DOWNLOADED=false
+
+        echo "Downloading RT-DETR layout detection model from $MODEL_URL"
+        mkdir -p "$MODEL_DIR"
+
+        for attempt in 1 2 3; do
+          if [ "$attempt" -gt 1 ]; then
+            # Backoff of 5s before attempt 2 and 15s before attempt 3
+            backoff=$((5 * 3 ** (attempt - 2)))
+            echo "Retry attempt $attempt/3 after ${backoff}s backoff..."
+            sleep "$backoff"
+          fi
+
+          if curl -f -L --progress-bar --connect-timeout 30 --max-time 600 \
+            -o "$PARTIAL_FILE" "$MODEL_URL"; then
+            # Only a completed transfer is promoted to the final path
+            mv -f "$PARTIAL_FILE" "$MODEL_FILE"
+            echo "RT-DETR model downloaded successfully ($(du -h "$MODEL_FILE" | cut -f1))"
+            DOWNLOADED=true
+            break
+          fi
+
+          rm -f "$PARTIAL_FILE"
+        done
+
+        if [ "$DOWNLOADED" != "true" ]; then
+          if [ ! -f "$MODEL_FILE" ]; then
+            echo "ERROR: Failed to download RT-DETR model after 3 attempts"
+            exit 1
+          fi
+          # A file that was present before the download is left untouched
+          echo "WARNING: RT-DETR download failed after 3 attempts; keeping the existing $MODEL_FILE"
+        fi
+
+    # Compare the RT-DETR file against its pinned checksum; a mismatching file
+    # is deleted before the step fails.
+    - name: Verify RT-DETR SHA256
+      if: contains(inputs.models, 'rtdetr') && steps.cache-models.outputs.cache-hit != 'true'
+      shell: bash
+      run: |
+        MODEL_FILE="$HOME/.cache/kreuzberg/layout/rtdetr/model.onnx"
+        EXPECTED="3bf2fb0ee6df87435b7ae47f0f3930ec3dc97ec56fd824acc6d57bc7a6b89ef2"
+
+        # Use sha256sum when it is on PATH, otherwise fall back to shasum
+        if ! command -v sha256sum &>/dev/null; then
+          ACTUAL=$(shasum -a 256 "$MODEL_FILE" | awk '{print $1}')
+        else
+          ACTUAL=$(sha256sum "$MODEL_FILE" | awk '{print $1}')
+        fi
+
+        # Success path first; everything below handles the mismatch
+        if [ "$ACTUAL" = "$EXPECTED" ]; then
+          echo "RT-DETR SHA256 verified"
+          exit 0
+        fi
+
+        echo "ERROR: RT-DETR SHA256 mismatch"
+        echo "  Expected: $EXPECTED"
+        echo "  Actual:   $ACTUAL"
+        rm -f "$MODEL_FILE"
+        exit 1
+
+    # Fetch the TATR model unless the cache step reported an exact hit.
+    - name: Download TATR model (tatr)
+      if: contains(inputs.models, 'tatr') && steps.cache-models.outputs.cache-hit != 'true'
+      shell: bash
+      run: |
+        MODEL_URL="https://huggingface.co/Kreuzberg/layout-models/resolve/main/tatr/model.onnx"
+        CACHE_DIR="$HOME/.cache/kreuzberg/layout"
+        MODEL_DIR="$CACHE_DIR/tatr"
+        MODEL_FILE="$MODEL_DIR/tatr.onnx"
+        # Download into a scratch file first: an interrupted transfer must not
+        # leave a truncated tatr.onnx behind, nor overwrite a file that was
+        # already restored into the cache directory.
+        PARTIAL_FILE="$MODEL_FILE.partial"
+        DOWNLOADED=false
+
+        echo "Downloading TATR table recognition model from $MODEL_URL"
+        mkdir -p "$MODEL_DIR"
+
+        for attempt in 1 2 3; do
+          if [ "$attempt" -gt 1 ]; then
+            # Backoff of 5s before attempt 2 and 15s before attempt 3
+            backoff=$((5 * 3 ** (attempt - 2)))
+            echo "Retry attempt $attempt/3 after ${backoff}s backoff..."
+            sleep "$backoff"
+          fi
+
+          if curl -f -L --progress-bar --connect-timeout 30 --max-time 600 \
+            -o "$PARTIAL_FILE" "$MODEL_URL"; then
+            # Only a completed transfer is promoted to the final path
+            mv -f "$PARTIAL_FILE" "$MODEL_FILE"
+            echo "TATR model downloaded successfully ($(du -h "$MODEL_FILE" | cut -f1))"
+            DOWNLOADED=true
+            break
+          fi
+
+          rm -f "$PARTIAL_FILE"
+        done
+
+        if [ "$DOWNLOADED" != "true" ]; then
+          if [ ! -f "$MODEL_FILE" ]; then
+            echo "ERROR: Failed to download TATR model after 3 attempts"
+            exit 1
+          fi
+          # A file that was present before the download is left untouched
+          echo "WARNING: TATR download failed after 3 attempts; keeping the existing $MODEL_FILE"
+        fi
+
+    # Compare the TATR file against its pinned checksum; a mismatching file is
+    # deleted before the step fails.
+    - name: Verify TATR SHA256
+      if: contains(inputs.models, 'tatr') && steps.cache-models.outputs.cache-hit != 'true'
+      shell: bash
+      run: |
+        MODEL_FILE="$HOME/.cache/kreuzberg/layout/tatr/tatr.onnx"
+        EXPECTED="c11f4033da75e9c4d41c403ef356e89caa0a37a7d111b55461e7d5ba856bb6b6"
+
+        # Use sha256sum when it is on PATH, otherwise fall back to shasum
+        if ! command -v sha256sum &>/dev/null; then
+          ACTUAL=$(shasum -a 256 "$MODEL_FILE" | awk '{print $1}')
+        else
+          ACTUAL=$(sha256sum "$MODEL_FILE" | awk '{print $1}')
+        fi
+
+        # Success path first; everything below handles the mismatch
+        if [ "$ACTUAL" = "$EXPECTED" ]; then
+          echo "TATR SHA256 verified"
+          exit 0
+        fi
+
+        echo "ERROR: TATR SHA256 mismatch"
+        echo "  Expected: $EXPECTED"
+        echo "  Actual:   $ACTUAL"
+        rm -f "$MODEL_FILE"
+        exit 1
+
+    # List the model files present in the cache directory and publish them as
+    # the comma-separated `available-models` output. Fails when none is found.
+    - name: Verify downloaded models
+      id: verify-models
+      shell: bash
+      run: |
+        CACHE_DIR="$HOME/.cache/kreuzberg/layout"
+        AVAILABLE_MODELS=()
+        TOTAL_SIZE=0
+
+        # Print a byte count in IEC units via numfmt, or as "<n> bytes" when
+        # numfmt is missing or fails.
+        human_size() {
+          numfmt --to=iec-i --suffix=B "$1" 2>/dev/null || echo "$1 bytes"
+        }
+
+        echo "Checking for layout models in $CACHE_DIR"
+
+        if [ -f "$CACHE_DIR/rtdetr/model.onnx" ]; then
+          # tr strips the padding some wc implementations put before the count
+          SIZE=$(wc -c < "$CACHE_DIR/rtdetr/model.onnx" | tr -d ' ')
+          AVAILABLE_MODELS+=("rtdetr")
+          TOTAL_SIZE=$((TOTAL_SIZE + SIZE))
+          echo "  ✓ RT-DETR model: $(human_size "$SIZE")"
+        fi
+
+        if [ -f "$CACHE_DIR/tatr/tatr.onnx" ]; then
+          SIZE=$(wc -c < "$CACHE_DIR/tatr/tatr.onnx" | tr -d ' ')
+          AVAILABLE_MODELS+=("tatr")
+          TOTAL_SIZE=$((TOTAL_SIZE + SIZE))
+          echo "  ✓ TATR model: $(human_size "$SIZE")"
+        fi
+
+        if [ ${#AVAILABLE_MODELS[@]} -eq 0 ]; then
+          echo "ERROR: No layout models found in cache directory after download"
+          echo "available-models=" >> "$GITHUB_OUTPUT"
+          exit 1
+        fi
+
+        # Join the array with commas; the IFS change stays inside the subshell
+        AVAILABLE_MODELS_STR=$(IFS=, ; echo "${AVAILABLE_MODELS[*]}")
+        echo "✓ Total cached layout models: ${#AVAILABLE_MODELS[@]} ($(human_size "$TOTAL_SIZE"))"
+        echo "available-models=$AVAILABLE_MODELS_STR" >> "$GITHUB_OUTPUT"
+
+    # Publish the layout cache directory as the `cache-dir` output and export
+    # its parent as KREUZBERG_CACHE_DIR for the steps that follow in the job.
+    - name: Set cache directory output
+      id: set-outputs
+      shell: bash
+      run: |
+        CACHE_DIR="$HOME/.cache/kreuzberg/layout"
+        # The file paths are quoted so they stay a single word for the redirect
+        echo "cache-dir=$CACHE_DIR" >> "$GITHUB_OUTPUT"
+        echo "KREUZBERG_CACHE_DIR=$HOME/.cache/kreuzberg" >> "$GITHUB_ENV"
+        echo "Layout model cache configured at: $CACHE_DIR"
--- a/.github/actions/setup-onnx-runtime/action.yml
+++ b/.github/actions/setup-onnx-runtime/action.yml
@@ -0,0 +1,46 @@
+# Composite action that caches an ONNX Runtime directory and runs the
+# per-OS preparation script.
+name: Setup ONNX Runtime
+description: Download and stage ONNX Runtime libraries for bindings
+inputs:
+  # The only required input; it is also the last segment of the cache key.
+  ort-version:
+    description: ONNX Runtime version to download
+    required: true
+  dest-dir:
+    description: Directory (relative to workspace) where libraries should be copied
+    required: false
+    default: crates/kreuzberg-node
+  # An empty value makes the cache key fall back to runner.arch.
+  arch-id:
+    description: Override architecture (x64|arm64). Defaults to runner architecture.
+    required: false
+    default: ""
+  strategy:
+    description: "ORT linking strategy: 'system' (dynamic link, default) or 'bundled' (static link via ort-bundled cargo feature)"
+    required: false
+    default: system
+runs:
+  using: composite
+  steps:
+    # The key is made of OS, architecture (the arch-id input when set,
+    # otherwise runner.arch) and the requested ORT version.
+    # NOTE(review): the restore-keys can bring back a directory cached for a
+    # different ORT version, and the last one for a different architecture —
+    # presumably the prepare scripts check what they find; confirm.
+    - name: Cache ONNX Runtime
+      id: cache-onnx
+      uses: actions/cache@v5
+      with:
+        path: |
+          ${{ runner.temp }}/onnxruntime
+        key: onnx-v2-${{ runner.os }}-${{ inputs.arch-id != '' && inputs.arch-id || runner.arch }}-${{ inputs.ort-version }}
+        restore-keys: |
+          onnx-v2-${{ runner.os }}-${{ inputs.arch-id != '' && inputs.arch-id || runner.arch }}-
+          onnx-v2-${{ runner.os }}-
+
+    # The four inputs reach each script as positional arguments, in the order
+    # version, destination directory, architecture override, linking strategy.
+    # They are passed through `env:` instead of being expanded into the command
+    # line, so their values are treated as data by the shell and cannot inject
+    # commands. The INPUT_ prefix keeps the names from colliding with variables
+    # the scripts or build tools may read on their own.
+    - name: Prepare ONNX Runtime (Linux)
+      if: runner.os == 'Linux'
+      shell: bash
+      env:
+        INPUT_ORT_VERSION: ${{ inputs.ort-version }}
+        INPUT_DEST_DIR: ${{ inputs.dest-dir }}
+        INPUT_ARCH_ID: ${{ inputs.arch-id }}
+        INPUT_STRATEGY: ${{ inputs.strategy }}
+      run: scripts/ci/actions/setup-onnx-runtime/linux.sh "$INPUT_ORT_VERSION" "$INPUT_DEST_DIR" "$INPUT_ARCH_ID" "$INPUT_STRATEGY"
+
+    - name: Prepare ONNX Runtime (macOS)
+      if: runner.os == 'macOS'
+      shell: bash
+      env:
+        INPUT_ORT_VERSION: ${{ inputs.ort-version }}
+        INPUT_DEST_DIR: ${{ inputs.dest-dir }}
+        INPUT_ARCH_ID: ${{ inputs.arch-id }}
+        INPUT_STRATEGY: ${{ inputs.strategy }}
+      run: scripts/ci/actions/setup-onnx-runtime/macos.sh "$INPUT_ORT_VERSION" "$INPUT_DEST_DIR" "$INPUT_ARCH_ID" "$INPUT_STRATEGY"
+
+    # PowerShell reads environment variables through the `$env:` drive
+    - name: Prepare ONNX Runtime (Windows)
+      if: runner.os == 'Windows'
+      shell: pwsh
+      env:
+        INPUT_ORT_VERSION: ${{ inputs.ort-version }}
+        INPUT_DEST_DIR: ${{ inputs.dest-dir }}
+        INPUT_ARCH_ID: ${{ inputs.arch-id }}
+        INPUT_STRATEGY: ${{ inputs.strategy }}
+      run: scripts/ci/actions/setup-onnx-runtime/windows.ps1 "$env:INPUT_ORT_VERSION" "$env:INPUT_DEST_DIR" "$env:INPUT_ARCH_ID" "$env:INPUT_STRATEGY"
--- a/.github/actions/setup-paddle-ocr-models/README.md
+++ b/.github/actions/setup-paddle-ocr-models/README.md
@@ -0,0 +1,202 @@
+# Setup PaddleOCR Models Cache
+
+GitHub Action to download and cache PaddleOCR ONNX models for CI testing and development.
+
+## Overview
+
+This action manages the setup of PaddleOCR PP-OCRv5 ONNX models used by the `kreuzberg-paddle-ocr` crate for optical character recognition testing. It:
+
+- Downloads three model types (detection, classification, recognition) from Hugging Face
+- Caches models per OS and CPU architecture (Linux x86_64, Linux ARM64, macOS, Windows)
+- Provides environment variables for downstream use
+- Outputs cache hit status and available model information
+- Gracefully handles download failures (continues with available models)
+
+## Models
+
+The action downloads pre-converted ONNX format models from the `Kreuzberg/paddleocr-onnx-models` Hugging Face repository:
+
+| Model Type           | File                                  | Size    | Purpose                                   |
+| -------------------- | ------------------------------------- | ------- | ----------------------------------------- |
+| Detection (det)      | `PP-OCRv5_server_det_infer.onnx`      | ~84 MB  | Text location detection (PP-OCRv5 server) |
+| Classification (cls) | `ch_ppocr_mobile_v2.0_cls_infer.onnx` | ~0.6 MB | Text orientation classification           |
+| Recognition (rec)    | `rec/english/model.onnx`              | ~8 MB   | Text character recognition (PP-OCRv5)     |
+
+**Total cache size: ~93 MB per OS/architecture combination**
+
+## Usage
+
+### Basic Usage
+
+```yaml
+- uses: ./.github/actions/setup-paddle-ocr-models
+```
+
+### With Custom Cache Suffix
+
+```yaml
+- uses: ./.github/actions/setup-paddle-ocr-models
+  with:
+    cache-key-suffix: my-paddle-ocr-v5
+```
+
+### Disable Caching
+
+For cross-architecture builds where caching doesn't help:
+
+```yaml
+- uses: ./.github/actions/setup-paddle-ocr-models
+  with:
+    cache-enabled: false
+```
+
+### Download Specific Models Only
+
+```yaml
+- uses: ./.github/actions/setup-paddle-ocr-models
+  with:
+    models: "det,rec" # Skip classification model
+```
+
+## Inputs
+
+| Name               | Description                                                     | Required | Default              |
+| ------------------ | --------------------------------------------------------------- | -------- | -------------------- |
+| `cache-enabled`    | Enable model caching (set false for cross-arch builds)          | No       | `true`               |
+| `models`           | Comma-separated list of models to setup (det,cls,rec or subset) | No       | `det,cls,rec`        |
+| `cache-key-suffix` | Suffix for cache key to differentiate model sets                | No       | `paddle-ocr-v5-onnx` |
+
+## Outputs
+
+| Name               | Description                                          |
+| ------------------ | ---------------------------------------------------- |
+| `cache-hit`        | Whether models were restored from cache (true/false) |
+| `cache-dir`        | Path to the PaddleOCR model cache directory          |
+| `models-available` | Comma-separated list of available models after setup |
+
+## Outputs as Environment Variables
+
+The action automatically exports:
+
+- `PADDLE_OCR_MODEL_CACHE`: Absolute path to model cache directory
+
+## Cache Strategy
+
+Models are cached using GitHub Actions cache with the following key structure:
+
+```text
+paddle-ocr-v5-onnx-{OS}-{ARCHITECTURE}-v4
+```
+
+Cache lookup order (the exact key first, then the `restore-keys` prefixes):
+
+1. Exact match: `paddle-ocr-v5-onnx-{OS}-{ARCHITECTURE}-v4`
+2. OS-Architecture: `paddle-ocr-v5-onnx-{OS}-{ARCHITECTURE}-`
+3. OS only: `paddle-ocr-v5-onnx-{OS}-`
+4. Any: `paddle-ocr-v5-onnx-`
+
+## Example: CI Rust Workflow Integration
+
+```yaml
+jobs:
+  paddle-ocr-tests:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+
+      - uses: ./.github/actions/setup-paddle-ocr-models
+        id: paddle-models
+
+      - name: Run PaddleOCR tests
+        run: cargo test --package kreuzberg-paddle-ocr
+        env:
+          PADDLE_OCR_MODEL_CACHE: ${{ steps.paddle-models.outputs.cache-dir }}
+
+      - name: Report cache status
+        if: always()
+        run: |
+          echo "Cache hit: ${{ steps.paddle-models.outputs.cache-hit }}"
+          echo "Available models: ${{ steps.paddle-models.outputs.models-available }}"
+```
+
+## Error Handling
+
+The action downloads models sequentially and will fail if a required model download fails. After downloading:
+
+- The verify step reports which models are actually available in the output
+- Downstream tests can check `models-available` to know what's available
+- If all models fail, tests can fall back to alternative behavior
+
+## Download Sources
+
+Models are downloaded from:
+
+```text
+https://huggingface.co/Kreuzberg/paddleocr-onnx-models/resolve/main/
+```
+
+If this repository becomes unavailable, the action will fail gracefully. Alternative sources can be configured by modifying the `MODEL_URL` shell variables set in the action's download steps.
+
+## Troubleshooting
+
+### Models not being cached
+
+1. Check that `cache-enabled` is not set to `false`
+2. Verify GitHub Actions cache is not full (max 10 GB per repository)
+3. Check runner OS and architecture match cache keys
+4. View cache in repository settings (Settings → Actions → Caches)
+
+### Download timeouts
+
+If downloads timeout:
+
+- Increase the 300-second timeout in the action steps
+- Check Hugging Face API availability
+- Try reducing the number of models (`models: "det,rec"`)
+
+### Verifying models are present
+
+Check that all expected models exist in the correct directory structure:
+
+```bash
+ls -lh ~/.cache/kreuzberg/paddle-ocr/
+```
+
+Expected output:
+
+```text
+drwxr-xr-x det/
+drwxr-xr-x cls/
+drwxr-xr-x rec/
+
+ls -lh ~/.cache/kreuzberg/paddle-ocr/det/
+-rw-r--r-- model.onnx (~84 MB)
+
+ls -lh ~/.cache/kreuzberg/paddle-ocr/cls/
+-rw-r--r-- model.onnx (~0.6 MB)
+
+ls -lh ~/.cache/kreuzberg/paddle-ocr/rec/english/
+-rw-r--r-- model.onnx (~8 MB)
+-rw-r--r-- dict.txt
+```
+
+The directory structure must match what `ModelManager` expects in `model_manager.rs`.
+
+## Performance Impact
+
+- **First run (no cache)**: ~30-60 seconds (download time depends on network)
+- **Cached run**: <1 second (cache restore)
+- **Cache size**: ~93 MB per OS/architecture
+- **Network bandwidth**: ~93 MB download on cache miss
+
+## Related Actions
+
+- `.github/actions/setup-tesseract-cache` - Similar caching for Tesseract models
+- `.github/actions/cache-hf-fastembed` - Hugging Face model caching for fastembed
+- `.github/actions/setup-onnx-runtime` - ONNX Runtime setup for inference
+
+## See Also
+
+- [PaddleOCR Documentation](https://github.com/PaddlePaddle/PaddleOCR)
+- [kreuzberg-paddle-ocr crate](../../../crates/kreuzberg-paddle-ocr)
+- [ModelManager source](../../../crates/kreuzberg/src/paddle_ocr/model_manager.rs)
--- a/.github/actions/setup-paddle-ocr-models/action.yml
+++ b/.github/actions/setup-paddle-ocr-models/action.yml
@@ -0,0 +1,231 @@
+name: Setup PaddleOCR Models Cache
+description: Download and cache PaddleOCR ONNX models for CI testing
+
+inputs:
+  # Compared against the string 'true' by the cache restore step; any other
+  # value skips the cache, so the requested models are downloaded on every run.
+  cache-enabled:
+    description: Enable model caching (set to false for cross-arch builds)
+    required: false
+    default: "true"
+  # Each download step tests this value with contains(), i.e. a substring
+  # match on 'det', 'cls' and 'rec'.
+  models:
+    description: Comma-separated list of models to setup (det,cls,rec or specific subset)
+    required: false
+    default: "det,cls,rec"
+  # Despite the name, this is the leading component of the cache key and of
+  # every restore key.
+  cache-key-suffix:
+    description: Suffix for cache key to differentiate model sets
+    required: false
+    default: "paddle-ocr-v5-onnx"
+
+outputs:
+  # Forwarded from the actions/cache step; empty when that step is skipped
+  # because cache-enabled is not 'true'.
+  cache-hit:
+    description: Whether models were restored from cache (true/false)
+    value: ${{ steps.cache-models.outputs.cache-hit }}
+  # Written by the "Set cache directory output" step.
+  cache-dir:
+    description: Path to the PaddleOCR model cache directory
+    value: ${{ steps.set-outputs.outputs.cache-dir }}
+  # Written by the "Verify downloaded models" step from the files found on disk.
+  models-available:
+    description: Comma-separated list of available models
+    value: ${{ steps.verify-models.outputs.available-models }}
+
+runs:
+  using: composite
+  steps:
+    - name: Setup cache directory
+      shell: bash
+      run: |
+        # Ensure the model cache root exists before later steps use it.
+        CACHE_DIR="$HOME/.cache/kreuzberg/paddle-ocr"
+        mkdir -p "$CACHE_DIR"
+        echo "Cache directory: $CACHE_DIR"
+
+    - name: Restore PaddleOCR models from cache
+      # Skipped unless `cache-enabled` is exactly 'true'; the step's `cache-hit`
+      # output is then empty, so the download steps below still run.
+      if: inputs.cache-enabled == 'true'
+      uses: actions/cache@v5
+      id: cache-models
+      with:
+        path: ~/.cache/kreuzberg/paddle-ocr
+        # The trailing `-v4` is a hard-coded component of the key.
+        # NOTE(review): the key does not include `inputs.models`, so an exact
+        # hit can restore a cache that was created for a different model subset.
+        key: ${{ inputs.cache-key-suffix }}-${{ runner.os }}-${{ runner.arch }}-v4
+        # Fallback prefixes, from most to least specific: same OS and
+        # architecture, same OS, then any cache sharing the suffix.
+        restore-keys: |
+          ${{ inputs.cache-key-suffix }}-${{ runner.os }}-${{ runner.arch }}-
+          ${{ inputs.cache-key-suffix }}-${{ runner.os }}-
+          ${{ inputs.cache-key-suffix }}-
+
+    - name: Download detection model (det)
+      # Runs whenever `det` is requested. The cache key does not encode
+      # `inputs.models`, so an exact cache hit may come from a run that cached a
+      # different subset; the script therefore skips the download only when the
+      # hit is exact AND the model file is actually present.
+      if: contains(inputs.models, 'det')
+      shell: bash
+      env:
+        CACHE_HIT: ${{ steps.cache-models.outputs.cache-hit }}
+      run: |
+        MODEL_URL="https://huggingface.co/Kreuzberg/paddleocr-onnx-models/resolve/main/PP-OCRv5_server_det_infer.onnx"
+        CACHE_DIR="$HOME/.cache/kreuzberg/paddle-ocr"
+        MODEL_DIR="$CACHE_DIR/det"
+        MODEL_FILE="$MODEL_DIR/model.onnx"
+
+        # Exact cache hit with the file on disk: nothing to do.
+        if [ "$CACHE_HIT" = "true" ] && [ -s "$MODEL_FILE" ]; then
+          echo "Detection model restored from cache ($(du -h "$MODEL_FILE" | cut -f1)); skipping download"
+          exit 0
+        fi
+
+        echo "Downloading detection model from $MODEL_URL"
+        mkdir -p "$MODEL_DIR"
+
+        for attempt in 1 2 3; do
+          if [ "$attempt" -gt 1 ]; then
+            # Backoff: 5s before attempt 2, 15s before attempt 3.
+            backoff=$((5 * 3 ** (attempt - 2)))
+            echo "Retry attempt $attempt/3 after ${backoff}s backoff..."
+            sleep "$backoff"
+          fi
+
+          # Treat an empty response body as a failed attempt.
+          if curl -f -L --progress-bar --connect-timeout 30 --max-time 600 \
+            -o "$MODEL_FILE" "$MODEL_URL" && [ -s "$MODEL_FILE" ]; then
+            echo "Detection model downloaded successfully ($(du -h "$MODEL_FILE" | cut -f1))"
+            exit 0
+          fi
+        done
+
+        echo "ERROR: Failed to download detection model after 3 attempts"
+        # Do not leave a partial file behind.
+        rm -f "$MODEL_FILE"
+        exit 1
+
+    - name: Download classification model (cls)
+      # Runs whenever `cls` is requested. The cache key does not encode
+      # `inputs.models`, so an exact cache hit may come from a run that cached a
+      # different subset; the script therefore skips the download only when the
+      # hit is exact AND the model file is actually present.
+      if: contains(inputs.models, 'cls')
+      shell: bash
+      env:
+        CACHE_HIT: ${{ steps.cache-models.outputs.cache-hit }}
+      run: |
+        MODEL_URL="https://huggingface.co/Kreuzberg/paddleocr-onnx-models/resolve/main/ch_ppocr_mobile_v2.0_cls_infer.onnx"
+        CACHE_DIR="$HOME/.cache/kreuzberg/paddle-ocr"
+        MODEL_DIR="$CACHE_DIR/cls"
+        MODEL_FILE="$MODEL_DIR/model.onnx"
+
+        # Exact cache hit with the file on disk: nothing to do.
+        if [ "$CACHE_HIT" = "true" ] && [ -s "$MODEL_FILE" ]; then
+          echo "Classification model restored from cache ($(du -h "$MODEL_FILE" | cut -f1)); skipping download"
+          exit 0
+        fi
+
+        echo "Downloading classification model from $MODEL_URL"
+        mkdir -p "$MODEL_DIR"
+
+        for attempt in 1 2 3; do
+          if [ "$attempt" -gt 1 ]; then
+            # Backoff: 5s before attempt 2, 15s before attempt 3.
+            backoff=$((5 * 3 ** (attempt - 2)))
+            echo "Retry attempt $attempt/3 after ${backoff}s backoff..."
+            sleep "$backoff"
+          fi
+
+          # Treat an empty response body as a failed attempt.
+          if curl -f -L --progress-bar --connect-timeout 30 --max-time 600 \
+            -o "$MODEL_FILE" "$MODEL_URL" && [ -s "$MODEL_FILE" ]; then
+            echo "Classification model downloaded successfully ($(du -h "$MODEL_FILE" | cut -f1))"
+            exit 0
+          fi
+        done
+
+        echo "ERROR: Failed to download classification model after 3 attempts"
+        # Do not leave a partial file behind.
+        rm -f "$MODEL_FILE"
+        exit 1
+
+    - name: Download recognition model (rec/english)
+      # Runs whenever `rec` is requested. The cache key does not encode
+      # `inputs.models`, so an exact cache hit may come from a run that cached a
+      # different subset; the script therefore skips the download only when the
+      # hit is exact AND the model file is actually present.
+      if: contains(inputs.models, 'rec')
+      shell: bash
+      env:
+        CACHE_HIT: ${{ steps.cache-models.outputs.cache-hit }}
+      run: |
+        MODEL_URL="https://huggingface.co/Kreuzberg/paddleocr-onnx-models/resolve/main/rec/english/model.onnx"
+        CACHE_DIR="$HOME/.cache/kreuzberg/paddle-ocr"
+        MODEL_DIR="$CACHE_DIR/rec/english"
+        MODEL_FILE="$MODEL_DIR/model.onnx"
+
+        # Exact cache hit with the file on disk: nothing to do.
+        if [ "$CACHE_HIT" = "true" ] && [ -s "$MODEL_FILE" ]; then
+          echo "Recognition model restored from cache ($(du -h "$MODEL_FILE" | cut -f1)); skipping download"
+          exit 0
+        fi
+
+        echo "Downloading English recognition model from $MODEL_URL"
+        mkdir -p "$MODEL_DIR"
+
+        for attempt in 1 2 3; do
+          if [ "$attempt" -gt 1 ]; then
+            # Backoff: 5s before attempt 2, 15s before attempt 3.
+            backoff=$((5 * 3 ** (attempt - 2)))
+            echo "Retry attempt $attempt/3 after ${backoff}s backoff..."
+            sleep "$backoff"
+          fi
+
+          # Treat an empty response body as a failed attempt.
+          if curl -f -L --progress-bar --connect-timeout 30 --max-time 600 \
+            -o "$MODEL_FILE" "$MODEL_URL" && [ -s "$MODEL_FILE" ]; then
+            echo "Recognition model downloaded successfully ($(du -h "$MODEL_FILE" | cut -f1))"
+            exit 0
+          fi
+        done
+
+        echo "ERROR: Failed to download recognition model after 3 attempts"
+        # Do not leave a partial file behind.
+        rm -f "$MODEL_FILE"
+        exit 1
+
+    - name: Download recognition dictionary (rec/english/dict.txt)
+      # Runs whenever `rec` is requested. The cache key does not encode
+      # `inputs.models`, so an exact cache hit may come from a run that cached a
+      # different subset; the script therefore skips the download only when the
+      # hit is exact AND the dictionary file is actually present.
+      if: contains(inputs.models, 'rec')
+      shell: bash
+      env:
+        CACHE_HIT: ${{ steps.cache-models.outputs.cache-hit }}
+      run: |
+        DICT_URL="https://huggingface.co/Kreuzberg/paddleocr-onnx-models/resolve/main/rec/english/dict.txt"
+        CACHE_DIR="$HOME/.cache/kreuzberg/paddle-ocr"
+        MODEL_DIR="$CACHE_DIR/rec/english"
+        DICT_FILE="$MODEL_DIR/dict.txt"
+
+        # Exact cache hit with the file on disk: nothing to do.
+        if [ "$CACHE_HIT" = "true" ] && [ -s "$DICT_FILE" ]; then
+          echo "Dictionary restored from cache ($(du -h "$DICT_FILE" | cut -f1)); skipping download"
+          exit 0
+        fi
+
+        echo "Downloading English recognition dictionary from $DICT_URL"
+        mkdir -p "$MODEL_DIR"
+
+        for attempt in 1 2 3; do
+          if [ "$attempt" -gt 1 ]; then
+            # Backoff: 5s before attempt 2, 15s before attempt 3.
+            backoff=$((5 * 3 ** (attempt - 2)))
+            echo "Retry attempt $attempt/3 after ${backoff}s backoff..."
+            sleep "$backoff"
+          fi
+
+          # Treat an empty response body as a failed attempt.
+          if curl -f -L --progress-bar --connect-timeout 30 --max-time 600 \
+            -o "$DICT_FILE" "$DICT_URL" && [ -s "$DICT_FILE" ]; then
+            echo "Dictionary downloaded successfully ($(du -h "$DICT_FILE" | cut -f1))"
+            exit 0
+          fi
+        done
+
+        echo "ERROR: Failed to download dictionary after 3 attempts"
+        # Do not leave a partial file behind.
+        rm -f "$DICT_FILE"
+        exit 1
+
+    - name: Verify downloaded models
+      id: verify-models
+      shell: bash
+      env:
+        REQUESTED_MODELS: ${{ inputs.models }}
+      run: |
+        # Inventory the model files on disk, publish the list as the
+        # `available-models` output, and fail when nothing usable is present or
+        # when a file belonging to a requested model is missing or empty.
+        CACHE_DIR="$HOME/.cache/kreuzberg/paddle-ocr"
+        AVAILABLE_MODELS=()
+        MISSING_FILES=()
+        TOTAL_SIZE=0
+
+        # Format a byte count for the log; falls back to the raw number when
+        # numfmt is unavailable or fails.
+        human_size() {
+          numfmt --to=iec-i --suffix=B "$1" 2>/dev/null || echo "$1 bytes"
+        }
+
+        # check_file <model-id> <label> <path>
+        # Succeeds when <path> is a non-empty regular file and adds its size to
+        # TOTAL_SIZE. Otherwise records the path as missing if <model-id> was
+        # requested (same substring match as the contains() checks used by the
+        # download steps) and returns non-zero.
+        check_file() {
+          local model_id="$1" label="$2" path="$3" size
+          if [ -f "$path" ] && [ -s "$path" ]; then
+            size=$(wc -c < "$path" | tr -d ' ')
+            TOTAL_SIZE=$((TOTAL_SIZE + size))
+            echo "  ✓ $label: $(human_size "$size")"
+            return 0
+          fi
+          if [[ "$REQUESTED_MODELS" == *"$model_id"* ]]; then
+            echo "  ✗ $label: missing or empty ($path)"
+            MISSING_FILES+=("$path")
+          fi
+          return 1
+        }
+
+        echo "Checking for PaddleOCR models in $CACHE_DIR"
+
+        if check_file det "Detection model" "$CACHE_DIR/det/model.onnx"; then
+          AVAILABLE_MODELS+=("det")
+        fi
+
+        if check_file cls "Classification model" "$CACHE_DIR/cls/model.onnx"; then
+          AVAILABLE_MODELS+=("cls")
+        fi
+
+        if check_file rec "Recognition model (English)" "$CACHE_DIR/rec/english/model.onnx"; then
+          AVAILABLE_MODELS+=("rec")
+        fi
+
+        # The dictionary contributes to the size total and to the missing-file
+        # check, but does not add an entry of its own to the model list.
+        check_file rec "Recognition dictionary (English)" "$CACHE_DIR/rec/english/dict.txt" || true
+
+        if [ ${#AVAILABLE_MODELS[@]} -eq 0 ]; then
+          echo "ERROR: No models found in cache directory after download"
+          echo "available-models=" >> "$GITHUB_OUTPUT"
+          exit 1
+        fi
+
+        AVAILABLE_MODELS_STR=$(IFS=, ; echo "${AVAILABLE_MODELS[*]}")
+        echo "✓ Total cached models: ${#AVAILABLE_MODELS[@]} ($(human_size "$TOTAL_SIZE"))"
+        echo "available-models=$AVAILABLE_MODELS_STR" >> "$GITHUB_OUTPUT"
+
+        # Publish the output first, then fail loudly if a requested file is absent.
+        if [ ${#MISSING_FILES[@]} -gt 0 ]; then
+          echo "ERROR: Requested model files are missing or empty:"
+          printf '  - %s\n' "${MISSING_FILES[@]}"
+          exit 1
+        fi
+
+    - name: Set cache directory output
+      id: set-outputs
+      shell: bash
+      run: |
+        # Publish the cache location as the `cache-dir` step output, which backs
+        # the action-level `cache-dir` output. PADDLE_OCR_MODEL_CACHE and
+        # KREUZBERG_CACHE_DIR are exported by the "Export cache environment"
+        # step, so they are not written a second time here.
+        CACHE_DIR="$HOME/.cache/kreuzberg/paddle-ocr"
+        echo "cache-dir=$CACHE_DIR" >> "$GITHUB_OUTPUT"
+
+    - name: Export cache environment
+      shell: bash
+      run: |
+        # Append both cache locations to the job environment file in one write.
+        KREUZBERG_ROOT="$HOME/.cache/kreuzberg"
+        {
+          echo "PADDLE_OCR_MODEL_CACHE=$KREUZBERG_ROOT/paddle-ocr"
+          echo "KREUZBERG_CACHE_DIR=$KREUZBERG_ROOT"
+        } >> "$GITHUB_ENV"
+        echo "PaddleOCR model cache configured at: $KREUZBERG_ROOT/paddle-ocr"
--- a/.github/actions/setup-tesseract-cache/action.yml
+++ b/.github/actions/setup-tesseract-cache/action.yml
@@ -0,0 +1,60 @@
+name: Setup Tesseract Cache
+description: Manages kreuzberg-tesseract build cache per architecture
+
+inputs:
+  # Passed to the helper scripts, and used by the cache step as the
+  # per-platform subdirectory name and as a cache-key component.
+  label:
+    description: Platform label (e.g. linux-x86_64, linux-aarch64)
+    required: true
+  # Compared against the string 'true' to choose between the setup/cache
+  # steps and the clean-dirs step.
+  enable-cache:
+    description: Enable tesseract caching (disable for cross-arch builds)
+    required: false
+    default: "true"
+  # When non-empty, clean-target-cache.sh is run with this value.
+  rust-target:
+    description: Rust target triple for per-target cache cleanup
+    required: false
+    default: ""
+
+# All three outputs are forwarded from the `set-outputs` step, i.e. they are
+# whatever set-outputs.sh writes (that script is not part of this file).
+outputs:
+  cache-dir:
+    description: Tesseract cache directory path
+    value: ${{ steps.set-outputs.outputs.cache-dir }}
+  cache-enabled:
+    description: Whether caching is enabled (true/false)
+    value: ${{ steps.set-outputs.outputs.cache-enabled }}
+  docker-options:
+    description: Docker options for passing cache env vars
+    value: ${{ steps.set-outputs.outputs.docker-options }}
+
+runs:
+  using: composite
+  steps:
+    - name: Clean cache directories (cache disabled)
+      if: inputs.enable-cache != 'true'
+      shell: bash
+      # The input is handed over through the environment rather than being
+      # interpolated into the command line, so its value is never parsed as
+      # shell syntax.
+      env:
+        ACTION_INPUT_LABEL: ${{ inputs.label }}
+      run: scripts/ci/actions/setup-tesseract-cache/clean-dirs.sh "$ACTION_INPUT_LABEL"
+
+    - name: Setup cache directories
+      if: inputs.enable-cache == 'true'
+      shell: bash
+      # The input is handed over through the environment rather than being
+      # interpolated into the command line, so its value is never parsed as
+      # shell syntax.
+      env:
+        ACTION_INPUT_LABEL: ${{ inputs.label }}
+      run: scripts/ci/actions/setup-tesseract-cache/setup-dirs.sh "$ACTION_INPUT_LABEL"
+
+    - name: Cache kreuzberg-tesseract build cache
+      if: inputs.enable-cache == 'true'
+      uses: actions/cache@v5
+      with:
+        # Both cached directories are namespaced by the platform label.
+        path: |
+          .tesseract-cache/${{ inputs.label }}
+          .xdg-cache/${{ inputs.label }}
+        # The key changes whenever the crate manifest or its build script changes.
+        key: kreuzberg-tesseract-cache-v2-${{ runner.os }}-${{ inputs.label }}-${{ hashFiles('crates/kreuzberg-tesseract/Cargo.toml', 'crates/kreuzberg-tesseract/build.rs') }}
+        # Fall back to an earlier cache for the same OS and label.
+        restore-keys: |
+          kreuzberg-tesseract-cache-v2-${{ runner.os }}-${{ inputs.label }}-
+
+    - name: Clean per-target Tesseract cache
+      if: inputs.rust-target != ''
+      shell: bash
+      # The input is handed over through the environment rather than being
+      # interpolated into the command line, so its value is never parsed as
+      # shell syntax.
+      env:
+        ACTION_INPUT_RUST_TARGET: ${{ inputs.rust-target }}
+      run: scripts/ci/actions/setup-tesseract-cache/clean-target-cache.sh "$ACTION_INPUT_RUST_TARGET"
+
+    - name: Set outputs and environment
+      id: set-outputs
+      shell: bash
+      # Inputs are handed over through the environment rather than being
+      # interpolated into the command line, so their values are never parsed as
+      # shell syntax. The script's positional arguments are unchanged:
+      # label first, enable-cache second.
+      env:
+        ACTION_INPUT_LABEL: ${{ inputs.label }}
+        ACTION_INPUT_ENABLE_CACHE: ${{ inputs.enable-cache }}
+      run: scripts/ci/actions/setup-tesseract-cache/set-outputs.sh "$ACTION_INPUT_LABEL" "$ACTION_INPUT_ENABLE_CACHE"