Nomad changes

2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -0,0 +1,10 @@
+# Default owner — everything
+* @Goldziher
+
+# Zensical config and documentation
+/zensical.toml @Goldziher @pratik-mahalle @v-tan
+/docs/ @Goldziher @pratik-mahalle @v-tan
+*.md @Goldziher @pratik-mahalle @v-tan
+
+# Rust crates
+/crates/ @Goldziher @kh3rld
--- a/.github/ISSUE_TEMPLATE/bug_report.yml
+++ b/.github/ISSUE_TEMPLATE/bug_report.yml
@@ -0,0 +1,28 @@
+name: Bug Report
+description: Report a bug or unexpected behavior
+title: "bug: "
+labels: ["bug"]
+projects: ["kreuzberg-dev/1"]
+body:
+  - type: textarea
+    id: description
+    attributes:
+      label: Description
+      description: What happened? What did you expect to happen?
+    validations:
+      required: true
+  - type: textarea
+    id: steps-to-reproduce
+    attributes:
+      label: Steps to reproduce
+      description: Minimal steps to reproduce the issue.
+    validations:
+      required: true
+  - type: textarea
+    id: reproduction-files
+    attributes:
+      label: Relevant files and configuration
+      description: >-
+        Any configuration files, input files, or code snippets needed to
+        reproduce the issue.
+      render: text
--- a/.github/ISSUE_TEMPLATE/config.yml
+++ b/.github/ISSUE_TEMPLATE/config.yml
@@ -0,0 +1 @@
+blank_issues_enabled: true
--- a/.github/ISSUE_TEMPLATE/documentation.yml
+++ b/.github/ISSUE_TEMPLATE/documentation.yml
@@ -0,0 +1,20 @@
+name: Documentation Issue
+description: Report missing, unclear, or incorrect documentation
+title: "docs: "
+labels: ["documentation"]
+projects: ["kreuzberg-dev/1"]
+body:
+  - type: textarea
+    id: what
+    attributes:
+      label: What
+      description: What documentation is missing, unclear, or incorrect?
+    validations:
+      required: true
+  - type: textarea
+    id: why
+    attributes:
+      label: Why
+      description: Why does this need to change?
+    validations:
+      required: true
--- a/.github/ISSUE_TEMPLATE/feature_request.yml
+++ b/.github/ISSUE_TEMPLATE/feature_request.yml
@@ -0,0 +1,18 @@
+name: Feature Request
+description: Suggest a new feature or improvement
+title: "feat: "
+labels: ["enhancement"]
+projects: ["kreuzberg-dev/1"]
+body:
+  - type: textarea
+    id: what
+    attributes:
+      label: What is the proposed feature?
+    validations:
+      required: true
+  - type: textarea
+    id: why
+    attributes:
+      label: Why would this be a good addition?
+    validations:
+      required: true
--- a/.github/PULL_REQUEST_TEMPLATE.md
+++ b/.github/PULL_REQUEST_TEMPLATE.md
@@ -0,0 +1,12 @@
+## Related
+
+<!-- Link issues or discussions if applicable -->
+
+## Description
+
+<!-- What does this PR do? -->
+
+## Checklist
+
+- [ ] CI passing
+- [ ] Tests added where applicable
--- a/.github/actionlint.yaml
+++ b/.github/actionlint.yaml
@@ -0,0 +1,9 @@
+self-hosted-runner:
+  labels:
+    - runner-small
+    - runner-medium
+    - runner-medium-arm64
+    - runner-large
+    - runner-large-spot
+    - runner-medium-arm64-spot
+    - runner-gpu-l4
--- a/.github/actions/cache-benchmark-harness/action.yml
+++ b/.github/actions/cache-benchmark-harness/action.yml
@@ -0,0 +1,313 @@
+name: Cache Benchmark Harness Binary
+description: >
+  Build and cache the benchmark-harness binary with intelligent caching based on source hashes.
+  Generates cache keys based on harness source + kreuzberg dependency + Cargo files,
+  restores from cache if available, builds if needed, and saves to cache.
+  Validates artifacts after restore or build to ensure integrity.
+
+inputs:
+  cache-version:
+    description: "Manual version for cache invalidation"
+    required: false
+    default: "v1"
+
+  build-profile:
+    description: "Build profile (release, debug)"
+    required: false
+    default: "release"
+
+outputs:
+  cache-hit:
+    description: "Boolean indicating exact cache hit"
+    value: ${{ steps.cache-restore.outputs.cache-hit }}
+
+  cache-key:
+    description: "The cache key used"
+    value: ${{ steps.generate-cache-key.outputs.cache-key }}
+
+  binary-path:
+    description: "Path to the built/cached benchmark-harness binary"
+    value: ${{ steps.validate-binary.outputs.binary-path }}
+
+runs:
+  using: composite
+  steps:
+    # Fail fast on an unsupported build profile before any hashing or cache work.
+    - name: Validate inputs
+      shell: bash
+      env:
+        # Inputs are handed over through the environment rather than interpolated
+        # into the script, so their contents can never be parsed as shell code.
+        BUILD_PROFILE: ${{ inputs.build-profile }}
+        CACHE_VERSION: ${{ inputs.cache-version }}
+      run: |
+        set -euo pipefail
+
+        # Exact-match check: a substring/regex test against the joined list would
+        # also accept values such as "release debug".
+        case "$BUILD_PROFILE" in
+          release | debug) ;;
+          *)
+            echo "❌ Error: build-profile must be one of: release debug"
+            exit 1
+            ;;
+        esac
+
+        echo "✓ Validation passed"
+        echo "  Build profile: $BUILD_PROFILE"
+        echo "  Cache version: $CACHE_VERSION"
+
+    # Hash the benchmark-harness sources and manifest for use in the cache key.
+    - name: Compute benchmark-harness source hash
+      id: harness-hash
+      shell: bash
+      run: |
+        set -euo pipefail
+
+        echo "=== Computing Benchmark Harness Source Hash ==="
+
+        # Capture stdout+stderr first so a failing script is reported together
+        # with its own output instead of being hidden behind the grep filter.
+        if ! HASH_OUTPUT=$(scripts/ci/cache/compute-hash.sh \
+          "tools/benchmark-harness/src/**" \
+          "tools/benchmark-harness/Cargo.toml" \
+          2>&1); then
+          echo "❌ Failed to compute harness source hash"
+          echo "$HASH_OUTPUT"
+          exit 1
+        fi
+
+        # Keep only lines made up entirely of hex digits. "+" (not "*") so blank
+        # lines are never captured; "|| true" so a no-match result reaches the
+        # explicit error below instead of aborting through pipefail.
+        HARNESS_HASH=$(grep -E '^[a-f0-9]+$' <<<"$HASH_OUTPUT" || true)
+
+        if [[ -z "$HARNESS_HASH" ]]; then
+          echo "❌ Failed to compute harness source hash"
+          exit 1
+        fi
+
+        echo "harness-hash=$HARNESS_HASH" >> "$GITHUB_OUTPUT"
+        echo "✓ Harness source hash: $HARNESS_HASH"
+
+    # Hash the kreuzberg crate (a dependency of the harness) for the cache key.
+    - name: Compute kreuzberg dependency hash
+      id: kreuzberg-hash
+      shell: bash
+      run: |
+        set -euo pipefail
+
+        echo "=== Computing Kreuzberg Dependency Hash ==="
+
+        # Capture stdout+stderr first so a failing script is reported together
+        # with its own output instead of being hidden behind the grep filter.
+        if ! HASH_OUTPUT=$(scripts/ci/cache/compute-hash.sh --dirs \
+          "crates/kreuzberg" \
+          2>&1); then
+          echo "❌ Failed to compute kreuzberg dependency hash"
+          echo "$HASH_OUTPUT"
+          exit 1
+        fi
+
+        # Keep only lines made up entirely of hex digits. "+" (not "*") so blank
+        # lines are never captured; "|| true" so a no-match result reaches the
+        # explicit error below instead of aborting through pipefail.
+        KREUZBERG_HASH=$(grep -E '^[a-f0-9]+$' <<<"$HASH_OUTPUT" || true)
+
+        if [[ -z "$KREUZBERG_HASH" ]]; then
+          echo "❌ Failed to compute kreuzberg dependency hash"
+          exit 1
+        fi
+
+        echo "kreuzberg-hash=$KREUZBERG_HASH" >> "$GITHUB_OUTPUT"
+        echo "✓ Kreuzberg dependency hash: $KREUZBERG_HASH"
+
+    # Hash Cargo.lock so dependency version changes invalidate the cache key.
+    - name: Compute Cargo files hash
+      id: cargo-hash
+      shell: bash
+      run: |
+        set -euo pipefail
+
+        echo "=== Computing Cargo Files Hash ==="
+
+        # Capture stdout+stderr first so a failing script is reported together
+        # with its own output instead of being hidden behind the grep filter.
+        if ! HASH_OUTPUT=$(scripts/ci/cache/compute-hash.sh --files Cargo.lock 2>&1); then
+          echo "❌ Failed to compute Cargo files hash"
+          echo "$HASH_OUTPUT"
+          exit 1
+        fi
+
+        # Keep only lines made up entirely of hex digits. "+" (not "*") so blank
+        # lines are never captured; "|| true" so a no-match result reaches the
+        # explicit error below instead of aborting through pipefail.
+        CARGO_HASH=$(grep -E '^[a-f0-9]+$' <<<"$HASH_OUTPUT" || true)
+
+        if [[ -z "$CARGO_HASH" ]]; then
+          echo "❌ Failed to compute Cargo files hash"
+          exit 1
+        fi
+
+        echo "cargo-hash=$CARGO_HASH" >> "$GITHUB_OUTPUT"
+        echo "✓ Cargo files hash: $CARGO_HASH"
+
+    # Assemble the primary cache key from profile, architecture, the three
+    # hashes computed above, and the manual cache version.
+    - name: Generate cache key
+      id: generate-cache-key
+      shell: bash
+      env:
+        BUILD_PROFILE: ${{ inputs.build-profile }}
+        HARNESS_HASH: ${{ steps.harness-hash.outputs.harness-hash }}
+        KREUZBERG_HASH: ${{ steps.kreuzberg-hash.outputs.kreuzberg-hash }}
+        CARGO_HASH: ${{ steps.cargo-hash.outputs.cargo-hash }}
+        CACHE_VERSION: ${{ inputs.cache-version }}
+      run: |
+        set -euo pipefail
+
+        echo "=== Cache Key Generated ==="
+
+        # Resolve the machine architecture once so the key and the log agree.
+        PLATFORM="$(uname -m)"
+
+        # Key format:
+        # harness-{profile}-{platform}-src-{harness-hash}-kreuzberg-{kreuzberg-hash}-cargo-{cargo-hash}-{cache-version}
+        # cache-version is appended verbatim (default "v1"); no "v" is added here.
+        CACHE_KEY="harness-${BUILD_PROFILE}-${PLATFORM}-src-${HARNESS_HASH}-kreuzberg-${KREUZBERG_HASH}-cargo-${CARGO_HASH}-${CACHE_VERSION}"
+
+        echo "cache-key=$CACHE_KEY" >> "$GITHUB_OUTPUT"
+
+        echo "Full key: $CACHE_KEY"
+        echo ""
+        echo "Key components:"
+        echo "  Profile:         $BUILD_PROFILE"
+        echo "  Platform:        $PLATFORM"
+        echo "  Harness hash:    $HARNESS_HASH"
+        echo "  Kreuzberg hash:  $KREUZBERG_HASH"
+        echo "  Cargo hash:      $CARGO_HASH"
+        echo "  Cache version:   $CACHE_VERSION"
+
+    # Map the build profile onto cargo's output directory
+    - name: Determine target paths
+      id: target-paths
+      shell: bash
+      env:
+        BUILD_PROFILE: ${{ inputs.build-profile }}
+      run: |
+        set -euo pipefail
+
+        echo "=== Determining Target Paths ==="
+
+        if [[ "$BUILD_PROFILE" == "release" ]]; then
+          TARGET_DIR="target/release"
+        elif [[ "$BUILD_PROFILE" == "debug" ]]; then
+          TARGET_DIR="target/debug"
+        else
+          echo "❌ Invalid build profile: $BUILD_PROFILE"
+          exit 1
+        fi
+
+        printf 'target-dir=%s\n' "$TARGET_DIR" >> "$GITHUB_OUTPUT"
+        echo "✓ Target directory: $TARGET_DIR"
+
+    # Publish `uname -m` as a step output: `with:` values are plain expressions
+    # and cannot perform shell command substitution themselves.
+    - name: Detect architecture
+      id: detect-arch
+      shell: bash
+      run: printf 'arch=%s\n' "$(uname -m)" >> "$GITHUB_OUTPUT"
+
+    # Restore a previously built binary. The build step further down runs
+    # whenever this step's cache-hit output is not 'true'.
+    - name: Restore benchmark-harness binary from cache
+      id: cache-restore
+      uses: kreuzberg-dev/actions/cache-binding-artifact@v1
+      with:
+        binding-name: benchmark-harness
+        cache-key: ${{ steps.generate-cache-key.outputs.cache-key }}
+        # Fallback prefixes stay within the same profile AND architecture: a
+        # binary compiled for a different CPU architecture is of no use here,
+        # so there is deliberately no architecture-agnostic prefix.
+        cache-restore-keys: |
+          harness-${{ inputs.build-profile }}-${{ steps.detect-arch.outputs.arch }}-src-
+          harness-${{ inputs.build-profile }}-${{ steps.detect-arch.outputs.arch }}-
+        cache-paths: |
+          ${{ steps.target-paths.outputs.target-dir }}/benchmark-harness
+        operation: restore
+
+    # Report whether the restore step found the exact cache key
+    - name: Log cache hit status
+      shell: bash
+      env:
+        CACHE_HIT: ${{ steps.cache-restore.outputs.cache-hit }}
+      run: |
+        set -euo pipefail
+
+        if [[ "${CACHE_HIT:-}" != "true" ]]; then
+          echo "✗ Cache MISS - Building benchmark-harness from source"
+        else
+          echo "✓ Cache HIT - benchmark-harness binary found in cache"
+        fi
+
+    # Compile the harness unless the restore step reported an exact cache hit
+    - name: Build benchmark-harness
+      id: build
+      if: steps.cache-restore.outputs.cache-hit != 'true'
+      shell: bash
+      env:
+        BUILD_PROFILE: ${{ inputs.build-profile }}
+      run: |
+        set -euo pipefail
+
+        echo "=== Building Benchmark Harness ==="
+        echo "Profile: $BUILD_PROFILE"
+
+        # cargo builds the debug profile by default, so only release needs a flag
+        if [[ "$BUILD_PROFILE" == "release" ]]; then
+          BUILD_ARG="--release"
+        elif [[ "$BUILD_PROFILE" == "debug" ]]; then
+          BUILD_ARG=""
+        else
+          echo "❌ Invalid build profile: $BUILD_PROFILE"
+          exit 1
+        fi
+
+        echo "Running: cargo build --manifest-path tools/benchmark-harness/Cargo.toml $BUILD_ARG"
+        # BUILD_ARG stays unquoted on purpose: when empty it must expand to no
+        # argument at all rather than to an empty-string argument.
+        cargo build --manifest-path tools/benchmark-harness/Cargo.toml $BUILD_ARG || {
+          echo "❌ Build failed for benchmark-harness"
+          exit 1
+        }
+
+        echo "✓ Build succeeded"
+
+    # Confirm the restored or freshly built binary is present and runnable,
+    # then publish its path as the binary-path output
+    - name: Validate benchmark-harness binary
+      id: validate-binary
+      shell: bash
+      env:
+        BUILD_PROFILE: ${{ inputs.build-profile }}
+        TARGET_DIR: ${{ steps.target-paths.outputs.target-dir }}
+      run: |
+        set -euo pipefail
+
+        echo "=== Validating Benchmark Harness Binary ==="
+
+        BINARY_PATH="${TARGET_DIR}/benchmark-harness"
+
+        # A regular file has to exist at the expected location...
+        [[ -f "$BINARY_PATH" ]] || {
+          echo "❌ Binary not found at: $BINARY_PATH"
+          exit 1
+        }
+
+        # ...and it has to carry the executable bit
+        [[ -x "$BINARY_PATH" ]] || {
+          echo "❌ Binary is not executable: $BINARY_PATH"
+          exit 1
+        }
+
+        # One long listing supplies both the size (field 5) and the mode (field 1)
+        LISTING=$(ls -lh "$BINARY_PATH")
+        BINARY_SIZE=$(awk '{print $5}' <<<"$LISTING")
+        BINARY_PERMS=$(awk '{print $1}' <<<"$LISTING")
+
+        echo "binary-path=$BINARY_PATH" >> "$GITHUB_OUTPUT"
+
+        echo "✓ Binary validation passed"
+        echo "  Path:        $BINARY_PATH"
+        echo "  Size:        $BINARY_SIZE"
+        echo "  Permissions: $BINARY_PERMS"
+
+    # Save the binary under the primary cache key. Uses the same condition as
+    # the build step, so it is skipped when the restore step reported an exact
+    # hit. It only runs after the validation step above has succeeded.
+    - name: Save benchmark-harness binary to cache
+      if: steps.cache-restore.outputs.cache-hit != 'true'
+      uses: kreuzberg-dev/actions/cache-binding-artifact@v1
+      with:
+        binding-name: benchmark-harness
+        cache-key: ${{ steps.generate-cache-key.outputs.cache-key }}
+        cache-paths: |
+          ${{ steps.target-paths.outputs.target-dir }}/benchmark-harness
+        operation: save
+
+    # Always print a recap of the run, even when an earlier step failed
+    - name: Summary
+      if: always()
+      shell: bash
+      env:
+        SUMMARY_PROFILE: ${{ inputs.build-profile }}
+        SUMMARY_CACHE_HIT: ${{ steps.cache-restore.outputs.cache-hit == 'true' && 'Yes' || 'No' }}
+        SUMMARY_CACHE_KEY: ${{ steps.generate-cache-key.outputs.cache-key }}
+        SUMMARY_BINARY_PATH: ${{ steps.validate-binary.outputs.binary-path }}
+        SUMMARY_HARNESS_HASH: ${{ steps.harness-hash.outputs.harness-hash }}
+        SUMMARY_KREUZBERG_HASH: ${{ steps.kreuzberg-hash.outputs.kreuzberg-hash }}
+        SUMMARY_CARGO_HASH: ${{ steps.cargo-hash.outputs.cargo-hash }}
+      run: |
+        set -euo pipefail
+
+        # Values may be empty when the step that produces them did not run
+        echo ""
+        echo "=== Build and Cache Summary ==="
+        echo "Build Profile:   ${SUMMARY_PROFILE:-}"
+        echo "Platform:        $(uname -m)"
+        echo "Cache Hit:       ${SUMMARY_CACHE_HIT:-}"
+        echo "Cache Key:       ${SUMMARY_CACHE_KEY:-}"
+        echo "Binary Path:     ${SUMMARY_BINARY_PATH:-}"
+        echo ""
+        echo "Hashes:"
+        echo "  Harness:       ${SUMMARY_HARNESS_HASH:-}"
+        echo "  Kreuzberg:     ${SUMMARY_KREUZBERG_HASH:-}"
+        echo "  Cargo:         ${SUMMARY_CARGO_HASH:-}"
--- a/.github/actions/install-system-deps/action.yml
+++ b/.github/actions/install-system-deps/action.yml
@@ -0,0 +1,105 @@
+# Composite action: per-OS caching of Tesseract (plus LLVM and CMake on
+# Windows), followed by a platform install script from scripts/ci/install-system-deps/.
+name: Install System Dependencies
+description: |
+  Install and cache platform-specific dependencies required for document conversion.
+  Includes: Tesseract OCR, fonts, and build tools.
+  Features robust caching with architecture/version awareness, timeout handling, and retry logic.
+
+inputs:
+  # NOTE(review): no step in this action references inputs.enable-retry, and the
+  # install steps do not pass it on via `env:` or arguments. Confirm the install
+  # scripts can actually observe this setting, or wire it through explicitly.
+  enable-retry:
+    description: Enable retry logic with exponential backoff
+    required: false
+    default: "true"
+
+runs:
+  using: composite
+  steps:
+    # --- macOS ---------------------------------------------------------------
+    # The detect script is expected to publish a `version` output, which the
+    # cache key below consumes.
+    - name: Detect Tesseract version (macOS)
+      if: runner.os == 'macOS'
+      id: detect-tesseract-macos
+      shell: bash
+      run: scripts/ci/install-system-deps/detect-tesseract-macos.sh
+
+    # Caches the Tesseract directories under both the /usr/local and
+    # /opt/homebrew prefixes; the key is specific to architecture and version.
+    - name: Cache Tesseract & tessdata (macOS)
+      if: runner.os == 'macOS'
+      id: cache-tesseract-macos
+      uses: actions/cache@v5
+      with:
+        path: |
+          /usr/local/opt/tesseract/
+          /usr/local/Cellar/tesseract/
+          /opt/homebrew/opt/tesseract/
+          /opt/homebrew/Cellar/tesseract/
+        key: tesseract-macos-${{ runner.arch }}-v5-${{ steps.detect-tesseract-macos.outputs.version }}
+        restore-keys: |
+          tesseract-macos-${{ runner.arch }}-v5-
+          tesseract-macos-${{ runner.arch }}-
+
+    # Runs unconditionally on macOS; the cache-hit result is not passed to it.
+    - name: Install dependencies (macOS)
+      if: runner.os == 'macOS'
+      shell: bash
+      run: scripts/ci/install-system-deps/install-macos.sh
+
+    # --- Linux ---------------------------------------------------------------
+    # The detect script is expected to publish a `version` output, which the
+    # cache key below consumes.
+    - name: Detect Tesseract version (Linux)
+      if: runner.os == 'Linux'
+      id: detect-tesseract-linux
+      shell: bash
+      run: scripts/ci/install-system-deps/detect-tesseract-linux.sh
+
+    # Only the tessdata directories are cached on Linux, not the Tesseract
+    # installation itself.
+    - name: Cache Tesseract data (Linux)
+      if: runner.os == 'Linux'
+      id: cache-tesseract-linux
+      uses: actions/cache@v5
+      with:
+        path: |
+          /usr/share/tesseract-ocr/5/tessdata/
+          /usr/share/tesseract-ocr/tessdata/
+        key: tesseract-linux-${{ runner.arch }}-v5-${{ steps.detect-tesseract-linux.outputs.version }}
+        restore-keys: |
+          tesseract-linux-${{ runner.arch }}-v5-
+          tesseract-linux-${{ runner.arch }}-
+
+    # Runs unconditionally on Linux; the cache-hit result is not passed to it.
+    - name: Install dependencies (Linux)
+      if: runner.os == 'Linux'
+      shell: bash
+      run: scripts/ci/install-system-deps/install-linux.sh
+
+    # --- Windows -------------------------------------------------------------
+    # Unlike macOS/Linux there is no version-detection step, so this key is
+    # fixed per architecture ("v5-data") rather than per detected version.
+    - name: Cache Tesseract (Windows)
+      if: runner.os == 'Windows'
+      id: cache-tesseract-windows
+      uses: actions/cache@v5
+      with:
+        path: |
+          C:\Program Files\Tesseract-OCR
+          C:\ProgramData\chocolatey\lib\tesseract
+        key: tesseract-windows-${{ runner.arch }}-v5-data
+        restore-keys: |
+          tesseract-windows-${{ runner.arch }}-
+
+    # Fixed key with no restore-keys: only an exact match restores LLVM.
+    - name: Cache LLVM (Windows)
+      if: runner.os == 'Windows'
+      id: cache-llvm-windows
+      uses: actions/cache@v5
+      with:
+        path: |
+          C:\Program Files\LLVM
+          C:\ProgramData\chocolatey\lib\llvm
+        key: llvm-windows-${{ runner.arch }}-v1
+
+    # Fixed key with no restore-keys: only an exact match restores CMake.
+    - name: Cache CMake (Windows)
+      if: runner.os == 'Windows'
+      id: cache-cmake-windows
+      uses: actions/cache@v5
+      with:
+        path: |
+          C:\Program Files\CMake
+          C:\ProgramData\chocolatey\lib\cmake
+        key: cmake-windows-${{ runner.arch }}-v1
+
+    # The three cache-hit results are handed to the install script through the
+    # environment; presumably it skips installers for tools already restored
+    # (verify against install-windows.ps1).
+    - name: Install dependencies (Windows)
+      if: runner.os == 'Windows'
+      shell: pwsh
+      env:
+        TESSERACT_CACHE_HIT: ${{ steps.cache-tesseract-windows.outputs.cache-hit }}
+        LLVM_CACHE_HIT: ${{ steps.cache-llvm-windows.outputs.cache-hit }}
+        CMAKE_CACHE_HIT: ${{ steps.cache-cmake-windows.outputs.cache-hit }}
+      run: pwsh -File scripts/ci/install-system-deps/install-windows.ps1
--- a/.github/actions/setup-layout-models/action.yml
+++ b/.github/actions/setup-layout-models/action.yml
@@ -0,0 +1,197 @@
+name: Setup Layout Detection Models Cache
+description: Download and cache layout detection ONNX models (RT-DETR + TATR) for CI testing
+
+inputs:
+  cache-enabled:
+    description: Enable model caching (set to false for cross-arch builds)
+    required: false
+    default: "true"
+  models:
+    description: Comma-separated list of models to setup (rtdetr,tatr)
+    required: false
+    default: "rtdetr,tatr"
+  cache-key-suffix:
+    description: Suffix for cache key to differentiate model sets
+    required: false
+    default: "layout-models-v2"
+
+outputs:
+  cache-hit:
+    description: Whether models were restored from cache (true/false)
+    value: ${{ steps.cache-models.outputs.cache-hit }}
+  cache-dir:
+    description: Path to the layout model cache directory
+    value: ${{ steps.set-outputs.outputs.cache-dir }}
+  models-available:
+    description: Comma-separated list of available models
+    value: ${{ steps.verify-models.outputs.available-models }}
+
+runs:
+  using: composite
+  steps:
+    # Make sure the layout model directory exists before cache restore/download
+    - name: Setup cache directory
+      shell: bash
+      run: |
+        LAYOUT_CACHE_DIR="$HOME/.cache/kreuzberg/layout"
+        mkdir -p "$LAYOUT_CACHE_DIR"
+        echo "Cache directory: $LAYOUT_CACHE_DIR"
+
+    # Restore ~/.cache/kreuzberg/layout. The key ends with the leading 8 hex
+    # digits of the expected SHA256 of each model (see the verify steps below),
+    # so changing either expected hash calls for updating this key as well.
+    # The download and SHA256 steps run whenever cache-hit is not 'true'.
+    # NOTE(review): the key does not vary with inputs.models. A cache saved by
+    # a run that requested only a subset would be an exact hit for a later run
+    # requesting all models, which would then skip the missing download —
+    # confirm whether callers ever pass a subset.
+    - name: Restore layout models from cache
+      if: inputs.cache-enabled == 'true'
+      uses: actions/cache@v5
+      id: cache-models
+      with:
+        path: ~/.cache/kreuzberg/layout
+        key: ${{ inputs.cache-key-suffix }}-${{ runner.os }}-${{ runner.arch }}-rtdetr_3bf2fb0e+tatr_c11f4033
+        restore-keys: |
+          ${{ inputs.cache-key-suffix }}-${{ runner.os }}-${{ runner.arch }}-
+          ${{ inputs.cache-key-suffix }}-${{ runner.os }}-
+          ${{ inputs.cache-key-suffix }}-
+
+    # Fetch the RT-DETR model with up to three attempts. Skipped on an exact
+    # cache hit or when 'rtdetr' is not among the requested models.
+    - name: Download RT-DETR model (rtdetr)
+      if: contains(inputs.models, 'rtdetr') && steps.cache-models.outputs.cache-hit != 'true'
+      shell: bash
+      run: |
+        MODEL_URL="https://huggingface.co/Kreuzberg/layout-models/resolve/main/rtdetr/model.onnx"
+        CACHE_DIR="$HOME/.cache/kreuzberg/layout"
+        MODEL_DIR="$CACHE_DIR/rtdetr"
+        MODEL_FILE="$MODEL_DIR/model.onnx"
+        # Download into a scratch file so an interrupted transfer can never be
+        # mistaken for a complete model by the existence check below.
+        PARTIAL_FILE="$MODEL_FILE.part"
+
+        echo "Downloading RT-DETR layout detection model from $MODEL_URL"
+        mkdir -p "$MODEL_DIR"
+
+        for attempt in 1 2 3; do
+          if [ "$attempt" -gt 1 ]; then
+            # Backoff grows by a factor of 3: 5s before attempt 2, 15s before attempt 3
+            backoff=$((5 * 3 ** (attempt - 2)))
+            echo "Retry attempt $attempt/3 after ${backoff}s backoff..."
+            sleep "$backoff"
+          fi
+
+          if curl -f -L --progress-bar --connect-timeout 30 --max-time 600 \
+            -o "$PARTIAL_FILE" "$MODEL_URL"; then
+            # Only a fully successful transfer is promoted to the final name
+            mv -f "$PARTIAL_FILE" "$MODEL_FILE"
+            echo "RT-DETR model downloaded successfully ($(du -h "$MODEL_FILE" | cut -f1))"
+            break
+          fi
+        done
+
+        # Never leave a partial download behind in the cached directory
+        rm -f "$PARTIAL_FILE"
+
+        if [ ! -f "$MODEL_FILE" ]; then
+          echo "ERROR: Failed to download RT-DETR model after 3 attempts"
+          exit 1
+        fi
+
+    # Compare the model file against its pinned SHA256; delete it on mismatch
+    - name: Verify RT-DETR SHA256
+      if: contains(inputs.models, 'rtdetr') && steps.cache-models.outputs.cache-hit != 'true'
+      shell: bash
+      run: |
+        MODEL_FILE="$HOME/.cache/kreuzberg/layout/rtdetr/model.onnx"
+        EXPECTED="3bf2fb0ee6df87435b7ae47f0f3930ec3dc97ec56fd824acc6d57bc7a6b89ef2"
+
+        # Use sha256sum when it is installed, otherwise fall back to shasum
+        if ! command -v sha256sum &>/dev/null; then
+          ACTUAL=$(shasum -a 256 "$MODEL_FILE" | awk '{print $1}')
+        else
+          ACTUAL=$(sha256sum "$MODEL_FILE" | awk '{print $1}')
+        fi
+
+        if [ "$ACTUAL" = "$EXPECTED" ]; then
+          echo "RT-DETR SHA256 verified"
+          exit 0
+        fi
+
+        echo "ERROR: RT-DETR SHA256 mismatch"
+        echo "  Expected: $EXPECTED"
+        echo "  Actual:   $ACTUAL"
+        rm -f "$MODEL_FILE"
+        exit 1
+
+    # Fetch the TATR model with up to three attempts. Skipped on an exact cache
+    # hit or when 'tatr' is not among the requested models.
+    - name: Download TATR model (tatr)
+      if: contains(inputs.models, 'tatr') && steps.cache-models.outputs.cache-hit != 'true'
+      shell: bash
+      run: |
+        MODEL_URL="https://huggingface.co/Kreuzberg/layout-models/resolve/main/tatr/model.onnx"
+        CACHE_DIR="$HOME/.cache/kreuzberg/layout"
+        MODEL_DIR="$CACHE_DIR/tatr"
+        MODEL_FILE="$MODEL_DIR/tatr.onnx"
+        # Download into a scratch file so an interrupted transfer can never be
+        # mistaken for a complete model by the existence check below.
+        PARTIAL_FILE="$MODEL_FILE.part"
+
+        echo "Downloading TATR table recognition model from $MODEL_URL"
+        mkdir -p "$MODEL_DIR"
+
+        for attempt in 1 2 3; do
+          if [ "$attempt" -gt 1 ]; then
+            # Backoff grows by a factor of 3: 5s before attempt 2, 15s before attempt 3
+            backoff=$((5 * 3 ** (attempt - 2)))
+            echo "Retry attempt $attempt/3 after ${backoff}s backoff..."
+            sleep "$backoff"
+          fi
+
+          if curl -f -L --progress-bar --connect-timeout 30 --max-time 600 \
+            -o "$PARTIAL_FILE" "$MODEL_URL"; then
+            # Only a fully successful transfer is promoted to the final name
+            mv -f "$PARTIAL_FILE" "$MODEL_FILE"
+            echo "TATR model downloaded successfully ($(du -h "$MODEL_FILE" | cut -f1))"
+            break
+          fi
+        done
+
+        # Never leave a partial download behind in the cached directory
+        rm -f "$PARTIAL_FILE"
+
+        if [ ! -f "$MODEL_FILE" ]; then
+          echo "ERROR: Failed to download TATR model after 3 attempts"
+          exit 1
+        fi
+
+    # Compare the model file against its pinned SHA256; delete it on mismatch
+    - name: Verify TATR SHA256
+      if: contains(inputs.models, 'tatr') && steps.cache-models.outputs.cache-hit != 'true'
+      shell: bash
+      run: |
+        MODEL_FILE="$HOME/.cache/kreuzberg/layout/tatr/tatr.onnx"
+        EXPECTED="c11f4033da75e9c4d41c403ef356e89caa0a37a7d111b55461e7d5ba856bb6b6"
+
+        # Use sha256sum when it is installed, otherwise fall back to shasum
+        if ! command -v sha256sum &>/dev/null; then
+          ACTUAL=$(shasum -a 256 "$MODEL_FILE" | awk '{print $1}')
+        else
+          ACTUAL=$(sha256sum "$MODEL_FILE" | awk '{print $1}')
+        fi
+
+        if [ "$ACTUAL" = "$EXPECTED" ]; then
+          echo "TATR SHA256 verified"
+          exit 0
+        fi
+
+        echo "ERROR: TATR SHA256 mismatch"
+        echo "  Expected: $EXPECTED"
+        echo "  Actual:   $ACTUAL"
+        rm -f "$MODEL_FILE"
+        exit 1
+
+    # List which model files are present (restored or downloaded) and publish
+    # them as the comma-separated `available-models` output. Fails only when
+    # no model file is present at all.
+    - name: Verify downloaded models
+      id: verify-models
+      shell: bash
+      run: |
+        CACHE_DIR="$HOME/.cache/kreuzberg/layout"
+        AVAILABLE_MODELS=()
+        TOTAL_SIZE=0
+
+        # Human-readable size via numfmt; falls back to the raw byte count
+        # when numfmt is missing or fails.
+        human_size() {
+          numfmt --to=iec-i --suffix=B "$1" 2>/dev/null || echo "$1 bytes"
+        }
+
+        echo "Checking for layout models in $CACHE_DIR"
+
+        if [ -f "$CACHE_DIR/rtdetr/model.onnx" ]; then
+          # tr strips any padding that wc puts around the count
+          SIZE=$(wc -c < "$CACHE_DIR/rtdetr/model.onnx" | tr -d ' ')
+          AVAILABLE_MODELS+=("rtdetr")
+          TOTAL_SIZE=$((TOTAL_SIZE + SIZE))
+          echo "  ✓ RT-DETR model: $(human_size "$SIZE")"
+        fi
+
+        if [ -f "$CACHE_DIR/tatr/tatr.onnx" ]; then
+          SIZE=$(wc -c < "$CACHE_DIR/tatr/tatr.onnx" | tr -d ' ')
+          AVAILABLE_MODELS+=("tatr")
+          TOTAL_SIZE=$((TOTAL_SIZE + SIZE))
+          echo "  ✓ TATR model: $(human_size "$SIZE")"
+        fi
+
+        if [ ${#AVAILABLE_MODELS[@]} -eq 0 ]; then
+          echo "ERROR: No layout models found in cache directory after download"
+          echo "available-models=" >> "$GITHUB_OUTPUT"
+          exit 1
+        fi
+
+        # Join the array with commas inside a subshell so IFS is not changed globally
+        AVAILABLE_MODELS_STR=$(IFS=, ; echo "${AVAILABLE_MODELS[*]}")
+        echo "✓ Total cached layout models: ${#AVAILABLE_MODELS[@]} ($(human_size "$TOTAL_SIZE"))"
+        echo "available-models=$AVAILABLE_MODELS_STR" >> "$GITHUB_OUTPUT"
+
+    # Publish the layout cache path as a step output and export
+    # KREUZBERG_CACHE_DIR (the parent directory) for later steps in the job.
+    - name: Set cache directory output
+      id: set-outputs
+      shell: bash
+      run: |
+        CACHE_DIR="$HOME/.cache/kreuzberg/layout"
+        # Quote the runner-provided file paths so they are never word-split
+        echo "cache-dir=$CACHE_DIR" >> "$GITHUB_OUTPUT"
+        echo "KREUZBERG_CACHE_DIR=$HOME/.cache/kreuzberg" >> "$GITHUB_ENV"
+        echo "Layout model cache configured at: $CACHE_DIR"
--- a/.github/actions/setup-onnx-runtime/action.yml
+++ b/.github/actions/setup-onnx-runtime/action.yml
@@ -0,0 +1,46 @@
+name: Setup ONNX Runtime
+description: Download and stage ONNX Runtime libraries for bindings
+inputs:
+  ort-version:
+    description: ONNX Runtime version to download
+    required: true
+  dest-dir:
+    description: Directory (relative to workspace) where libraries should be copied
+    required: false
+    default: crates/kreuzberg-node
+  arch-id:
+    description: Override architecture (x64|arm64). Defaults to runner architecture.
+    required: false
+    default: ""
+  strategy:
+    description: "ORT linking strategy: 'system' (dynamic link, default) or 'bundled' (static link via ort-bundled cargo feature)"
+    required: false
+    default: system
+runs:
+  using: composite
+  steps:
+    # Cache the onnxruntime directory under the runner temp dir. The key uses
+    # the arch-id input when it is non-empty and runner.arch otherwise, plus
+    # the requested ort-version.
+    # NOTE(review): the restore-keys also accept a cache saved for a different
+    # ort-version, or (last prefix) a different architecture. Presumably the
+    # platform scripts check what was restored — confirm.
+    - name: Cache ONNX Runtime
+      id: cache-onnx
+      uses: actions/cache@v5
+      with:
+        path: |
+          ${{ runner.temp }}/onnxruntime
+        key: onnx-v2-${{ runner.os }}-${{ inputs.arch-id != '' && inputs.arch-id || runner.arch }}-${{ inputs.ort-version }}
+        restore-keys: |
+          onnx-v2-${{ runner.os }}-${{ inputs.arch-id != '' && inputs.arch-id || runner.arch }}-
+          onnx-v2-${{ runner.os }}-
+
+    # Run the Linux setup script with (version, dest-dir, arch-id, strategy).
+    - name: Prepare ONNX Runtime (Linux)
+      if: runner.os == 'Linux'
+      shell: bash
+      env:
+        # Inputs travel through the environment so their text is never spliced
+        # into the shell command line (avoids script injection via inputs).
+        INPUT_ORT_VERSION: ${{ inputs.ort-version }}
+        INPUT_DEST_DIR: ${{ inputs.dest-dir }}
+        INPUT_ARCH_ID: ${{ inputs.arch-id }}
+        INPUT_STRATEGY: ${{ inputs.strategy }}
+      run: scripts/ci/actions/setup-onnx-runtime/linux.sh "$INPUT_ORT_VERSION" "$INPUT_DEST_DIR" "$INPUT_ARCH_ID" "$INPUT_STRATEGY"
+
+    # Run the macOS setup script with (version, dest-dir, arch-id, strategy).
+    - name: Prepare ONNX Runtime (macOS)
+      if: runner.os == 'macOS'
+      shell: bash
+      env:
+        # Inputs travel through the environment so their text is never spliced
+        # into the shell command line (avoids script injection via inputs).
+        INPUT_ORT_VERSION: ${{ inputs.ort-version }}
+        INPUT_DEST_DIR: ${{ inputs.dest-dir }}
+        INPUT_ARCH_ID: ${{ inputs.arch-id }}
+        INPUT_STRATEGY: ${{ inputs.strategy }}
+      run: scripts/ci/actions/setup-onnx-runtime/macos.sh "$INPUT_ORT_VERSION" "$INPUT_DEST_DIR" "$INPUT_ARCH_ID" "$INPUT_STRATEGY"
+
+    # Run the Windows setup script with (version, dest-dir, arch-id, strategy).
+    - name: Prepare ONNX Runtime (Windows)
+      if: runner.os == 'Windows'
+      shell: pwsh
+      env:
+        # Inputs travel through the environment so their text is never spliced
+        # into the PowerShell command line (avoids script injection via inputs).
+        INPUT_ORT_VERSION: ${{ inputs.ort-version }}
+        INPUT_DEST_DIR: ${{ inputs.dest-dir }}
+        INPUT_ARCH_ID: ${{ inputs.arch-id }}
+        INPUT_STRATEGY: ${{ inputs.strategy }}
+      run: scripts/ci/actions/setup-onnx-runtime/windows.ps1 "$env:INPUT_ORT_VERSION" "$env:INPUT_DEST_DIR" "$env:INPUT_ARCH_ID" "$env:INPUT_STRATEGY"
--- a/.github/actions/setup-paddle-ocr-models/README.md
+++ b/.github/actions/setup-paddle-ocr-models/README.md
@@ -0,0 +1,202 @@
+# Setup PaddleOCR Models Cache
+
+GitHub Action to download and cache PaddleOCR ONNX models for CI testing and development.
+
+## Overview
+
+This action manages the setup of PaddleOCR PP-OCRv5 ONNX models used by the `kreuzberg-paddle-ocr` crate for optical character recognition testing. It:
+
+- Downloads three model types (detection, classification, recognition) from Hugging Face
+- Caches models per OS and CPU architecture (Linux x86_64, Linux ARM64, macOS, Windows)
+- Provides environment variables for downstream use
+- Outputs cache hit status and available model information
+- Gracefully handles download failures (continues with available models)
+
+## Models
+
+The action downloads pre-converted ONNX format models from the `Kreuzberg/paddleocr-onnx-models` Hugging Face repository:
+
+| Model Type           | File                                  | Size    | Purpose                                   |
+| -------------------- | ------------------------------------- | ------- | ----------------------------------------- |
+| Detection (det)      | `PP-OCRv5_server_det_infer.onnx`      | ~84 MB  | Text location detection (PP-OCRv5 server) |
+| Classification (cls) | `ch_ppocr_mobile_v2.0_cls_infer.onnx` | ~0.6 MB | Text orientation classification           |
+| Recognition (rec)    | `rec/english/model.onnx`              | ~8 MB   | Text character recognition (PP-OCRv5)     |
+
+**Total cache size: ~93 MB per OS/architecture combination**
+
+## Usage
+
+### Basic Usage
+
+```yaml
+- uses: ./.github/actions/setup-paddle-ocr-models
+```
+
+### With Custom Cache Suffix
+
+```yaml
+- uses: ./.github/actions/setup-paddle-ocr-models
+  with:
+    cache-key-suffix: my-paddle-ocr-v5
+```
+
+### Disable Caching
+
+For cross-architecture builds where caching doesn't help:
+
+```yaml
+- uses: ./.github/actions/setup-paddle-ocr-models
+  with:
+    cache-enabled: false
+```
+
+### Download Specific Models Only
+
+```yaml
+- uses: ./.github/actions/setup-paddle-ocr-models
+  with:
+    models: "det,rec" # Skip classification model
+```
+
+## Inputs
+
+| Name               | Description                                                     | Required | Default              |
+| ------------------ | --------------------------------------------------------------- | -------- | -------------------- |
+| `cache-enabled`    | Enable model caching (set false for cross-arch builds)          | No       | `true`               |
+| `models`           | Comma-separated list of models to setup (det,cls,rec or subset) | No       | `det,cls,rec`        |
+| `cache-key-suffix` | Suffix for cache key to differentiate model sets                | No       | `paddle-ocr-v5-onnx` |
+
+## Outputs
+
+| Name               | Description                                          |
+| ------------------ | ---------------------------------------------------- |
+| `cache-hit`        | Whether models were restored from cache (true/false) |
+| `cache-dir`        | Path to the PaddleOCR model cache directory          |
+| `models-available` | Comma-separated list of available models after setup |
+
+## Outputs as Environment Variables
+
+The action automatically exports:
+
+- `PADDLE_OCR_MODEL_CACHE`: Absolute path to model cache directory
+
+## Cache Strategy
+
+Models are cached using GitHub Actions cache with the following key structure:
+
+```text
+paddle-ocr-v5-onnx-{OS}-{ARCHITECTURE}-v4
+```
+
+Cache lookup order (the exact primary key first, then the `restore-keys` prefixes):
+
+1. Exact match: `paddle-ocr-v5-onnx-{OS}-{ARCHITECTURE}-v4`
+2. OS-Architecture: `paddle-ocr-v5-onnx-{OS}-{ARCHITECTURE}-`
+3. OS only: `paddle-ocr-v5-onnx-{OS}-`
+4. Any: `paddle-ocr-v5-onnx-`
+
+## Example: CI Rust Workflow Integration
+
+```yaml
+jobs:
+  paddle-ocr-tests:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+
+      - uses: ./.github/actions/setup-paddle-ocr-models
+        id: paddle-models
+
+      - name: Run PaddleOCR tests
+        run: cargo test --package kreuzberg-paddle-ocr
+        env:
+          PADDLE_OCR_MODEL_CACHE: ${{ steps.paddle-models.outputs.cache-dir }}
+
+      - name: Report cache status
+        if: always()
+        run: |
+          echo "Cache hit: ${{ steps.paddle-models.outputs.cache-hit }}"
+          echo "Available models: ${{ steps.paddle-models.outputs.models-available }}"
+```
+
+## Error Handling
+
+The action downloads models sequentially and will fail if a required model download fails. After downloading:
+
+- The verify step reports which models are actually available in the output
+- Downstream tests can check `models-available` to know what's available
+- If all models fail, tests can fall back to alternative behavior
+
+## Download Sources
+
+Models are downloaded from:
+
+```text
+https://huggingface.co/Kreuzberg/paddleocr-onnx-models/resolve/main/
+```
+
+If this repository becomes unavailable, the download steps will fail. Alternative sources can be configured by changing the `MODEL_URL` shell variables set inside the action's download steps.
+
+## Troubleshooting
+
+### Models not being cached
+
+1. Check that `cache-enabled` is not set to `false`
+2. Verify GitHub Actions cache is not full (max 10 GB per repository)
+3. Check runner OS and architecture match cache keys
+4. View cache in repository settings (Settings → Actions → Caches)
+
+### Download timeouts
+
+If downloads timeout:
+
+- Increase the 300-second timeout in the action steps
+- Check Hugging Face API availability
+- Try reducing the number of models (`models: "det,rec"`)
+
+### Verifying models are present
+
+Check that all expected models exist in the correct directory structure:
+
+```bash
+ls -lh ~/.cache/kreuzberg/paddle-ocr/
+```
+
+Expected output:
+
+```text
+drwxr-xr-x det/
+drwxr-xr-x cls/
+drwxr-xr-x rec/
+
+ls -lh ~/.cache/kreuzberg/paddle-ocr/det/
+-rw-r--r-- model.onnx (~84 MB)
+
+ls -lh ~/.cache/kreuzberg/paddle-ocr/cls/
+-rw-r--r-- model.onnx (~0.6 MB)
+
+ls -lh ~/.cache/kreuzberg/paddle-ocr/rec/english/
+-rw-r--r-- model.onnx (~8 MB)
+-rw-r--r-- dict.txt
+```
+
+The directory structure must match what `ModelManager` expects in `model_manager.rs`.
+
+## Performance Impact
+
+- **First run (no cache)**: ~30-60 seconds (download time depends on network)
+- **Cached run**: <1 second (cache restore)
+- **Cache size**: ~93 MB per OS/architecture
+- **Network bandwidth**: ~93 MB download on cache miss
+
+## Related Actions
+
+- `.github/actions/setup-tesseract-cache` - Similar caching for Tesseract models
+- `.github/actions/cache-hf-fastembed` - Hugging Face model caching for fastembed
+- `.github/actions/setup-onnx-runtime` - ONNX Runtime setup for inference
+
+## See Also
+
+- [PaddleOCR Documentation](https://github.com/PaddlePaddle/PaddleOCR)
+- [kreuzberg-paddle-ocr crate](../../../crates/kreuzberg-paddle-ocr)
+- [ModelManager source](../../../crates/kreuzberg/src/paddle_ocr/model_manager.rs)
--- a/.github/actions/setup-paddle-ocr-models/action.yml
+++ b/.github/actions/setup-paddle-ocr-models/action.yml
@@ -0,0 +1,231 @@
+# Composite action that places the PaddleOCR ONNX models under
+# ~/.cache/kreuzberg/paddle-ocr and can persist them with actions/cache.
+name: Setup PaddleOCR Models Cache
+description: Download and cache PaddleOCR ONNX models for CI testing
+
+inputs:
+  # Compared against the string 'true' by the cache restore step.
+  cache-enabled:
+    description: Enable model caching (set to false for cross-arch builds)
+    required: false
+    default: "true"
+  # Matched by substring: a download step runs when its short name
+  # (det, cls, rec) occurs anywhere in this value.
+  models:
+    description: Comma-separated list of models to setup (det,cls,rec or specific subset)
+    required: false
+    default: "det,cls,rec"
+  # Leading component of the cache key and of every restore-key prefix.
+  cache-key-suffix:
+    description: Suffix for cache key to differentiate model sets
+    required: false
+    default: "paddle-ocr-v5-onnx"
+
+outputs:
+  # NOTE(review): forwarded verbatim from the cache step, so this is empty
+  # (not "false") when that step is skipped via cache-enabled — confirm that
+  # callers only compare against 'true'.
+  cache-hit:
+    description: Whether models were restored from cache (true/false)
+    value: ${{ steps.cache-models.outputs.cache-hit }}
+  cache-dir:
+    description: Path to the PaddleOCR model cache directory
+    value: ${{ steps.set-outputs.outputs.cache-dir }}
+  # Built by the verify step from the model files actually found on disk.
+  models-available:
+    description: Comma-separated list of available models
+    value: ${{ steps.verify-models.outputs.available-models }}
+
+runs:
+  using: composite
+  steps:
+    # Create the model directory up front so it exists even when the cache
+    # restore step below is skipped.
+    - name: Setup cache directory
+      run: |
+        cache_root="$HOME/.cache/kreuzberg/paddle-ocr"
+        mkdir -p "$cache_root"
+        echo "Cache directory: $cache_root"
+      shell: bash
+
+    # The key is built from the suffix input, runner OS and runner arch plus a
+    # fixed "-v4" marker; the restore-keys fall back to progressively shorter
+    # prefixes of that key.
+    - name: Restore PaddleOCR models from cache
+      id: cache-models
+      if: inputs.cache-enabled == 'true'
+      uses: actions/cache@v5
+      with:
+        key: ${{ inputs.cache-key-suffix }}-${{ runner.os }}-${{ runner.arch }}-v4
+        restore-keys: |
+          ${{ inputs.cache-key-suffix }}-${{ runner.os }}-${{ runner.arch }}-
+          ${{ inputs.cache-key-suffix }}-${{ runner.os }}-
+          ${{ inputs.cache-key-suffix }}-
+        path: ~/.cache/kreuzberg/paddle-ocr
+
+    # Fetch the detection model into det/model.onnx.
+    # The cache key does not depend on `inputs.models`, so an exact cache hit
+    # may have been saved by a run that requested fewer models. The download is
+    # therefore skipped only when the hit is exact AND the file is on disk.
+    - name: Download detection model (det)
+      if: contains(inputs.models, 'det')
+      shell: bash
+      env:
+        CACHE_HIT: ${{ steps.cache-models.outputs.cache-hit }}
+      run: |
+        MODEL_URL="https://huggingface.co/Kreuzberg/paddleocr-onnx-models/resolve/main/PP-OCRv5_server_det_infer.onnx"
+        CACHE_DIR="$HOME/.cache/kreuzberg/paddle-ocr"
+        MODEL_DIR="$CACHE_DIR/det"
+        MODEL_FILE="$MODEL_DIR/model.onnx"
+        # Download next to the target and rename on success, so an interrupted
+        # transfer never leaves a truncated model.onnx behind.
+        PARTIAL_FILE="$MODEL_FILE.part"
+
+        if [ "$CACHE_HIT" = "true" ] && [ -s "$MODEL_FILE" ]; then
+          echo "Detection model restored from cache ($(du -h "$MODEL_FILE" | cut -f1)), skipping download"
+          exit 0
+        fi
+
+        echo "Downloading detection model from $MODEL_URL"
+        mkdir -p "$MODEL_DIR"
+
+        for attempt in 1 2 3; do
+          if [ "$attempt" -gt 1 ]; then
+            # Backoff grows 5s -> 15s (5 * 3^(attempt-2)).
+            backoff=$((5 * 3 ** (attempt - 2)))
+            echo "Retry attempt $attempt/3 after ${backoff}s backoff..."
+            sleep "$backoff"
+          fi
+
+          if curl -f -L --progress-bar --connect-timeout 30 --max-time 600 \
+            -o "$PARTIAL_FILE" "$MODEL_URL"; then
+            mv -f "$PARTIAL_FILE" "$MODEL_FILE"
+            echo "Detection model downloaded successfully ($(du -h "$MODEL_FILE" | cut -f1))"
+            exit 0
+          fi
+        done
+
+        echo "ERROR: Failed to download detection model after 3 attempts"
+        rm -f "$PARTIAL_FILE"
+        exit 1
+
+    # Fetch the classification model into cls/model.onnx.
+    # The cache key does not depend on `inputs.models`, so an exact cache hit
+    # may have been saved by a run that requested fewer models. The download is
+    # therefore skipped only when the hit is exact AND the file is on disk.
+    - name: Download classification model (cls)
+      if: contains(inputs.models, 'cls')
+      shell: bash
+      env:
+        CACHE_HIT: ${{ steps.cache-models.outputs.cache-hit }}
+      run: |
+        MODEL_URL="https://huggingface.co/Kreuzberg/paddleocr-onnx-models/resolve/main/ch_ppocr_mobile_v2.0_cls_infer.onnx"
+        CACHE_DIR="$HOME/.cache/kreuzberg/paddle-ocr"
+        MODEL_DIR="$CACHE_DIR/cls"
+        MODEL_FILE="$MODEL_DIR/model.onnx"
+        # Download next to the target and rename on success, so an interrupted
+        # transfer never leaves a truncated model.onnx behind.
+        PARTIAL_FILE="$MODEL_FILE.part"
+
+        if [ "$CACHE_HIT" = "true" ] && [ -s "$MODEL_FILE" ]; then
+          echo "Classification model restored from cache ($(du -h "$MODEL_FILE" | cut -f1)), skipping download"
+          exit 0
+        fi
+
+        echo "Downloading classification model from $MODEL_URL"
+        mkdir -p "$MODEL_DIR"
+
+        for attempt in 1 2 3; do
+          if [ "$attempt" -gt 1 ]; then
+            # Backoff grows 5s -> 15s (5 * 3^(attempt-2)).
+            backoff=$((5 * 3 ** (attempt - 2)))
+            echo "Retry attempt $attempt/3 after ${backoff}s backoff..."
+            sleep "$backoff"
+          fi
+
+          if curl -f -L --progress-bar --connect-timeout 30 --max-time 600 \
+            -o "$PARTIAL_FILE" "$MODEL_URL"; then
+            mv -f "$PARTIAL_FILE" "$MODEL_FILE"
+            echo "Classification model downloaded successfully ($(du -h "$MODEL_FILE" | cut -f1))"
+            exit 0
+          fi
+        done
+
+        echo "ERROR: Failed to download classification model after 3 attempts"
+        rm -f "$PARTIAL_FILE"
+        exit 1
+
+    # Fetch the English recognition model into rec/english/model.onnx.
+    # The cache key does not depend on `inputs.models`, so an exact cache hit
+    # may have been saved by a run that requested fewer models. The download is
+    # therefore skipped only when the hit is exact AND the file is on disk.
+    - name: Download recognition model (rec/english)
+      if: contains(inputs.models, 'rec')
+      shell: bash
+      env:
+        CACHE_HIT: ${{ steps.cache-models.outputs.cache-hit }}
+      run: |
+        MODEL_URL="https://huggingface.co/Kreuzberg/paddleocr-onnx-models/resolve/main/rec/english/model.onnx"
+        CACHE_DIR="$HOME/.cache/kreuzberg/paddle-ocr"
+        MODEL_DIR="$CACHE_DIR/rec/english"
+        MODEL_FILE="$MODEL_DIR/model.onnx"
+        # Download next to the target and rename on success, so an interrupted
+        # transfer never leaves a truncated model.onnx behind.
+        PARTIAL_FILE="$MODEL_FILE.part"
+
+        if [ "$CACHE_HIT" = "true" ] && [ -s "$MODEL_FILE" ]; then
+          echo "Recognition model restored from cache ($(du -h "$MODEL_FILE" | cut -f1)), skipping download"
+          exit 0
+        fi
+
+        echo "Downloading English recognition model from $MODEL_URL"
+        mkdir -p "$MODEL_DIR"
+
+        for attempt in 1 2 3; do
+          if [ "$attempt" -gt 1 ]; then
+            # Backoff grows 5s -> 15s (5 * 3^(attempt-2)).
+            backoff=$((5 * 3 ** (attempt - 2)))
+            echo "Retry attempt $attempt/3 after ${backoff}s backoff..."
+            sleep "$backoff"
+          fi
+
+          if curl -f -L --progress-bar --connect-timeout 30 --max-time 600 \
+            -o "$PARTIAL_FILE" "$MODEL_URL"; then
+            mv -f "$PARTIAL_FILE" "$MODEL_FILE"
+            echo "Recognition model downloaded successfully ($(du -h "$MODEL_FILE" | cut -f1))"
+            exit 0
+          fi
+        done
+
+        echo "ERROR: Failed to download recognition model after 3 attempts"
+        rm -f "$PARTIAL_FILE"
+        exit 1
+
+    # Fetch the English recognition dictionary into rec/english/dict.txt.
+    # The cache key does not depend on `inputs.models`, so an exact cache hit
+    # may have been saved by a run that requested fewer models. The download is
+    # therefore skipped only when the hit is exact AND the file is on disk.
+    - name: Download recognition dictionary (rec/english/dict.txt)
+      if: contains(inputs.models, 'rec')
+      shell: bash
+      env:
+        CACHE_HIT: ${{ steps.cache-models.outputs.cache-hit }}
+      run: |
+        DICT_URL="https://huggingface.co/Kreuzberg/paddleocr-onnx-models/resolve/main/rec/english/dict.txt"
+        CACHE_DIR="$HOME/.cache/kreuzberg/paddle-ocr"
+        MODEL_DIR="$CACHE_DIR/rec/english"
+        DICT_FILE="$MODEL_DIR/dict.txt"
+        # Download next to the target and rename on success, so an interrupted
+        # transfer never leaves a truncated dict.txt behind.
+        PARTIAL_FILE="$DICT_FILE.part"
+
+        if [ "$CACHE_HIT" = "true" ] && [ -s "$DICT_FILE" ]; then
+          echo "Dictionary restored from cache ($(du -h "$DICT_FILE" | cut -f1)), skipping download"
+          exit 0
+        fi
+
+        echo "Downloading English recognition dictionary from $DICT_URL"
+        mkdir -p "$MODEL_DIR"
+
+        for attempt in 1 2 3; do
+          if [ "$attempt" -gt 1 ]; then
+            # Backoff grows 5s -> 15s (5 * 3^(attempt-2)).
+            backoff=$((5 * 3 ** (attempt - 2)))
+            echo "Retry attempt $attempt/3 after ${backoff}s backoff..."
+            sleep "$backoff"
+          fi
+
+          if curl -f -L --progress-bar --connect-timeout 30 --max-time 600 \
+            -o "$PARTIAL_FILE" "$DICT_URL"; then
+            mv -f "$PARTIAL_FILE" "$DICT_FILE"
+            echo "Dictionary downloaded successfully ($(du -h "$DICT_FILE" | cut -f1))"
+            exit 0
+          fi
+        done
+
+        echo "ERROR: Failed to download dictionary after 3 attempts"
+        rm -f "$PARTIAL_FILE"
+        exit 1
+
+    # Report which model files are on disk and publish them as the
+    # `available-models` step output. Fails only when no model at all is
+    # present; a requested model that is missing produces a warning annotation.
+    - name: Verify downloaded models
+      id: verify-models
+      shell: bash
+      env:
+        REQUESTED_MODELS: ${{ inputs.models }}
+      run: |
+        CACHE_DIR="$HOME/.cache/kreuzberg/paddle-ocr"
+        AVAILABLE_MODELS=()
+        TOTAL_SIZE=0
+
+        # Print a byte count in IEC units, or as raw bytes where numfmt is
+        # unavailable or fails.
+        human_size() {
+          numfmt --to=iec-i --suffix=B "$1" 2>/dev/null || echo "$1 bytes"
+        }
+
+        # record_file <path> <label> [<model-name>]
+        # Adds the file's size to TOTAL_SIZE and, when a model name is given,
+        # appends it to AVAILABLE_MODELS. Missing files are ignored.
+        record_file() {
+          local path="$1" label="$2" model="${3:-}" size
+          if [ ! -f "$path" ]; then
+            return 0
+          fi
+          size=$(wc -c < "$path" | tr -d ' ')
+          TOTAL_SIZE=$((TOTAL_SIZE + size))
+          if [ -n "$model" ]; then
+            AVAILABLE_MODELS+=("$model")
+          fi
+          echo "  ✓ $label: $(human_size "$size")"
+        }
+
+        echo "Checking for PaddleOCR models in $CACHE_DIR"
+
+        record_file "$CACHE_DIR/det/model.onnx" "Detection model" det
+        record_file "$CACHE_DIR/cls/model.onnx" "Classification model" cls
+        record_file "$CACHE_DIR/rec/english/model.onnx" "Recognition model (English)" rec
+        # The dictionary counts towards the total size but is not a model name.
+        record_file "$CACHE_DIR/rec/english/dict.txt" "Recognition dictionary (English)"
+
+        if [ ${#AVAILABLE_MODELS[@]} -eq 0 ]; then
+          echo "ERROR: No models found in cache directory after download"
+          echo "available-models=" >> "$GITHUB_OUTPUT"
+          exit 1
+        fi
+
+        AVAILABLE_MODELS_STR=$(IFS=, ; echo "${AVAILABLE_MODELS[*]}")
+
+        # Same substring match the download steps use for `inputs.models`.
+        for model in det cls rec; do
+          case "$REQUESTED_MODELS" in
+            *"$model"*)
+              case ",$AVAILABLE_MODELS_STR," in
+                *",$model,"*) ;;
+                *) echo "::warning::Requested PaddleOCR model '$model' is missing from $CACHE_DIR" ;;
+              esac
+              ;;
+          esac
+        done
+
+        echo "✓ Total cached models: ${#AVAILABLE_MODELS[@]} ($(human_size "$TOTAL_SIZE"))"
+        echo "available-models=$AVAILABLE_MODELS_STR" >> "$GITHUB_OUTPUT"
+
+    # Publish the cache location as the `cache-dir` step output.
+    - name: Set cache directory output
+      id: set-outputs
+      shell: bash
+      run: |
+        echo "cache-dir=$HOME/.cache/kreuzberg/paddle-ocr" >> "$GITHUB_OUTPUT"
+
+    # Export the cache locations to later steps of the calling job. This is the
+    # only step that writes them, so each variable is appended exactly once.
+    - name: Export cache environment
+      shell: bash
+      run: |
+        CACHE_DIR="$HOME/.cache/kreuzberg/paddle-ocr"
+        echo "PADDLE_OCR_MODEL_CACHE=$CACHE_DIR" >> "$GITHUB_ENV"
+        echo "KREUZBERG_CACHE_DIR=$HOME/.cache/kreuzberg" >> "$GITHUB_ENV"
+        echo "PaddleOCR model cache configured at: $CACHE_DIR"
--- a/.github/actions/setup-tesseract-cache/action.yml
+++ b/.github/actions/setup-tesseract-cache/action.yml
@@ -0,0 +1,60 @@
+# Composite action that prepares (or cleans) the per-label Tesseract cache
+# directories, restores them with actions/cache and delegates the rest to the
+# helper scripts under scripts/ci/actions/setup-tesseract-cache/.
+name: Setup Tesseract Cache
+description: Manages kreuzberg-tesseract build cache per architecture
+
+inputs:
+  # Used as a path component of the cached directories and in the cache key.
+  label:
+    description: Platform label (e.g. linux-x86_64, linux-aarch64)
+    required: true
+  enable-cache:
+    description: Enable tesseract caching (disable for cross-arch builds)
+    required: false
+    default: "true"
+  # When non-empty, triggers the per-target cleanup step.
+  rust-target:
+    description: Rust target triple for per-target cache cleanup
+    required: false
+    default: ""
+
+outputs:
+  # All three outputs are produced by the final "set-outputs" step.
+  cache-dir:
+    description: Tesseract cache directory path
+    value: ${{ steps.set-outputs.outputs.cache-dir }}
+  cache-enabled:
+    description: Whether caching is enabled (true/false)
+    value: ${{ steps.set-outputs.outputs.cache-enabled }}
+  docker-options:
+    description: Docker options for passing cache env vars
+    value: ${{ steps.set-outputs.outputs.docker-options }}
+
+runs:
+  using: composite
+  steps:
+    # Inputs reach the helper scripts through environment variables rather than
+    # being interpolated into the command line, so a value containing quotes or
+    # `$(...)` is passed as data and cannot change the shell command.
+    - name: Clean cache directories (cache disabled)
+      if: inputs.enable-cache != 'true'
+      shell: bash
+      env:
+        ACTION_INPUT_LABEL: ${{ inputs.label }}
+      run: scripts/ci/actions/setup-tesseract-cache/clean-dirs.sh "$ACTION_INPUT_LABEL"
+
+    - name: Setup cache directories
+      if: inputs.enable-cache == 'true'
+      shell: bash
+      env:
+        ACTION_INPUT_LABEL: ${{ inputs.label }}
+      run: scripts/ci/actions/setup-tesseract-cache/setup-dirs.sh "$ACTION_INPUT_LABEL"
+
+    # The key changes whenever the kreuzberg-tesseract Cargo.toml or build.rs
+    # changes; the restore key falls back to any cache for the same OS + label.
+    - name: Cache kreuzberg-tesseract build cache
+      if: inputs.enable-cache == 'true'
+      uses: actions/cache@v5
+      with:
+        path: |
+          .tesseract-cache/${{ inputs.label }}
+          .xdg-cache/${{ inputs.label }}
+        key: kreuzberg-tesseract-cache-v2-${{ runner.os }}-${{ inputs.label }}-${{ hashFiles('crates/kreuzberg-tesseract/Cargo.toml', 'crates/kreuzberg-tesseract/build.rs') }}
+        restore-keys: |
+          kreuzberg-tesseract-cache-v2-${{ runner.os }}-${{ inputs.label }}-
+
+    - name: Clean per-target Tesseract cache
+      if: inputs.rust-target != ''
+      shell: bash
+      env:
+        ACTION_INPUT_RUST_TARGET: ${{ inputs.rust-target }}
+      run: scripts/ci/actions/setup-tesseract-cache/clean-target-cache.sh "$ACTION_INPUT_RUST_TARGET"
+
+    # Runs unconditionally so the action outputs are set in both modes.
+    - name: Set outputs and environment
+      id: set-outputs
+      shell: bash
+      env:
+        ACTION_INPUT_LABEL: ${{ inputs.label }}
+        ACTION_INPUT_ENABLE_CACHE: ${{ inputs.enable-cache }}
+      run: scripts/ci/actions/setup-tesseract-cache/set-outputs.sh "$ACTION_INPUT_LABEL" "$ACTION_INPUT_ENABLE_CACHE"
--- a/.github/dependabot.yaml
+++ b/.github/dependabot.yaml
@@ -0,0 +1,67 @@
+version: 2
+
+# One shared weekly schedule; every update entry below joins this group.
+multi-ecosystem-groups:
+  dependencies:
+    schedule:
+      interval: 'weekly'
+
+updates:
+  - package-ecosystem: 'cargo'
+    # Explicitly list root only — packages/ruby/ext and packages/r/src have
+    # standalone workspaces with path deps to vendored crates that only exist
+    # at build time. Dependabot cannot resolve these paths.
+    directories:
+      - '/'
+    ignore:
+      - dependency-name: 'kreuzberg'
+      - dependency-name: 'kreuzberg-ffi'
+    patterns:
+      - '*'
+    multi-ecosystem-group: 'dependencies'
+
+  - package-ecosystem: 'pip'
+    directories:
+      - '/'
+      - '/packages/python'
+    patterns:
+      - '*'
+    multi-ecosystem-group: 'dependencies'
+
+  - package-ecosystem: 'npm'
+    directories:
+      - '/'
+      - '/crates/kreuzberg-node'
+      - '/crates/kreuzberg-wasm'
+      - '/packages/typescript/core'
+    patterns:
+      - '*'
+    multi-ecosystem-group: 'dependencies'
+
+  - package-ecosystem: 'bundler'
+    directory: '/packages/ruby'
+    patterns:
+      - '*'
+    multi-ecosystem-group: 'dependencies'
+
+  - package-ecosystem: 'composer'
+    directories:
+      - '/'
+      - '/packages/php'
+    patterns:
+      - '*'
+    multi-ecosystem-group: 'dependencies'
+
+  - package-ecosystem: 'gomod'
+    directory: '/packages/go/v5'
+    patterns:
+      - '*'
+    multi-ecosystem-group: 'dependencies'
+
+  - package-ecosystem: 'maven'
+    directory: '/packages/java'
+    patterns:
+      - '*'
+    multi-ecosystem-group: 'dependencies'
+
+  - package-ecosystem: 'nuget'
+    directory: '/packages/csharp'
+    patterns:
+      - '*'
+    multi-ecosystem-group: 'dependencies'
+
+  - package-ecosystem: 'mix'
+    directory: '/packages/elixir'
+    patterns:
+      - '*'
+    multi-ecosystem-group: 'dependencies'
--- a/.github/documentation/runners.md
+++ b/.github/documentation/runners.md
@@ -0,0 +1,39 @@
+# Custom GitHub Actions Runners
+
+## Available Runners
+
+| Runner Label               | Architecture | Size   | Ephemeral | Notes                                                      |
+| -------------------------- | ------------ | ------ | --------- | ---------------------------------------------------------- |
+| `runner-small`             | x86_64       | Small  | No        | Light tasks: linting, formatting, validation               |
+| `runner-medium`            | x86_64       | Medium | No        | Standard CI: tests, builds                                 |
+| `runner-medium-arm64`      | arm64        | Medium | No        | ARM64 builds and tests                                     |
+| `runner-large`             | x86_64       | Large  | No        | Heavy workloads: benchmarks, coverage, release builds      |
+| `runner-large-spot`        | x86_64       | Large  | Yes       | Cost-optimized large jobs where interruption is acceptable |
+| `runner-medium-arm64-spot` | arm64        | Medium | Yes       | Cost-optimized ARM64 jobs where interruption is acceptable |
+
+## Spot Runners
+
+Spot runners (`*-spot`) use ephemeral cloud instances provisioned on a best-effort basis. They are significantly cheaper but can be preempted at any time if the cloud provider reclaims capacity.
+
+**Use spot runners for:**
+
+- Jobs that can be retried without consequence (test suites, linting)
+- Non-time-critical workloads
+- PR validation where re-runs are acceptable
+
+**Do not use spot runners for:**
+
+- Benchmarks (preemption and noisy-neighbor effects skew results)
+- Release builds and publishing
+- Jobs requiring consistent, reproducible timing
+
+## Choosing a Runner
+
+| Workload                        | Recommended Runner         |
+| ------------------------------- | -------------------------- |
+| Linting, formatting, validation | `runner-small`             |
+| Unit tests, standard builds     | `runner-medium`            |
+| ARM64 cross-compilation / tests | `runner-medium-arm64`      |
+| Benchmarks, coverage reports    | `runner-large`             |
+| Non-critical large builds       | `runner-large-spot`        |
+| Non-critical ARM64 builds       | `runner-medium-arm64-spot` |
--- a/.github/workflows/benchmarks.yaml
+++ b/.github/workflows/benchmarks.yaml
--- a/.github/workflows/build-node-native.yml
+++ b/.github/workflows/build-node-native.yml
@@ -0,0 +1,74 @@
+# Builds the kreuzberg-node NAPI binding for every supported target triple.
+name: Build Node Native
+
+# Push and pull-request triggers share the same path filter: the Node crate,
+# the core crate, the Cargo/toolchain manifests and this workflow file.
+on:
+  push:
+    branches: [main]
+    paths:
+      - "crates/kreuzberg-node/**"
+      - "crates/kreuzberg/**"
+      - "Cargo.toml"
+      - "Cargo.lock"
+      - "rust-toolchain.toml"
+      - ".github/workflows/build-node-native.yml"
+  pull_request:
+    branches: [main]
+    paths:
+      - "crates/kreuzberg-node/**"
+      - "crates/kreuzberg/**"
+      - "Cargo.toml"
+      - "Cargo.lock"
+      - "rust-toolchain.toml"
+      - ".github/workflows/build-node-native.yml"
+  workflow_dispatch:
+
+# A newer run for the same ref cancels the one still in progress.
+concurrency:
+  group: build-node-native-${{ github.ref }}
+  cancel-in-progress: true
+
+env:
+  CARGO_TERM_COLOR: always
+  CARGO_INCREMENTAL: 0
+  MACOSX_DEPLOYMENT_TARGET: "14.0"
+  FORCE_JAVASCRIPT_ACTIONS_TO_NODE24: "true"
+
+permissions:
+  contents: read
+
+jobs:
+  build:
+    name: Build ${{ matrix.target }}
+    runs-on: ${{ matrix.os }}
+    timeout-minutes: 60
+    strategy:
+      # Let the remaining targets finish when one of them fails.
+      fail-fast: false
+      matrix:
+        include:
+          - os: ubuntu-latest
+            target: x86_64-unknown-linux-gnu
+          - os: ubuntu-24.04-arm
+            target: aarch64-unknown-linux-gnu
+          # Intel macOS: the macos-13 hosted image has been retired, so the
+          # x86_64 build uses the macos-15-intel label instead.
+          - os: macos-15-intel
+            target: x86_64-apple-darwin
+          - os: macos-latest
+            target: aarch64-apple-darwin
+          - os: windows-latest
+            target: x86_64-pc-windows-msvc
+
+    steps:
+      - uses: actions/checkout@v6.0.2
+        with:
+          submodules: recursive
+
+      - uses: kreuzberg-dev/actions/setup-rust@v1
+        with:
+          target: ${{ matrix.target }}
+
+      - uses: kreuzberg-dev/actions/setup-node-workspace@v1
+        with:
+          node-version: "24"
+
+      - name: Build NAPI binding
+        uses: kreuzberg-dev/actions/build-node-napi@v1
+        with:
+          crate-dir: crates/kreuzberg-node
+          build-command: pnpm exec napi build --release --target ${{ matrix.target }} --platform
--- a/.github/workflows/ci-docker.yaml
+++ b/.github/workflows/ci-docker.yaml
@@ -0,0 +1,79 @@
+# Builds each Docker image variant locally and runs the variant's test suites
+# against it. Nothing is pushed to a registry.
+name: CI Docker
+
+# Manual trigger only.
+on:
+  workflow_dispatch:
+
+concurrency:
+  group: ci-docker-${{ github.ref }}
+  cancel-in-progress: true
+
+env:
+  # Passed to the image build as the ONNXRUNTIME_VERSION build argument.
+  ORT_VERSION: "1.24.2"
+  FORCE_JAVASCRIPT_ACTIONS_TO_NODE24: "true"
+
+permissions:
+  contents: read
+
+jobs:
+  docker:
+    name: Docker (${{ matrix.variant }})
+    # Only runs in the kreuzberg-dev/kreuzberg repository and never for
+    # Dependabot-triggered runs.
+    if: github.repository == 'kreuzberg-dev/kreuzberg' && github.actor != 'dependabot[bot]'
+    runs-on: ubuntu-latest
+    timeout-minutes: 60
+    strategy:
+      fail-fast: false
+      matrix:
+        # Each variant maps to docker/Dockerfile.<variant>.
+        variant: [core, full, cli]
+    steps:
+      - uses: actions/checkout@v6
+
+      - name: Free disk space
+        uses: kreuzberg-dev/actions/free-disk-space-linux@v1
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v4
+
+      # push: false keeps the image off any registry; load: true makes it
+      # available to the local Docker daemon under the tag used by later steps.
+      # Layer cache is kept per variant via the GitHub Actions cache backend.
+      - name: Build Docker image
+        uses: docker/build-push-action@v7
+        with:
+          context: .
+          file: docker/Dockerfile.${{ matrix.variant }}
+          push: false
+          load: true
+          tags: kreuzberg:${{ matrix.variant }}
+          build-args: ONNXRUNTIME_VERSION=${{ env.ORT_VERSION }}
+          cache-from: type=gha,scope=ci-docker-${{ matrix.variant }}
+          cache-to: type=gha,mode=max,scope=ci-docker-${{ matrix.variant }}
+
+      # Writes a gzip-compressed image archive to /tmp and lists it.
+      # NOTE(review): no step in this workflow reads the archive back —
+      # confirm whether the test scripts rely on it.
+      - name: Save Docker image
+        shell: bash
+        run: |
+          mkdir -p /tmp
+          docker save kreuzberg:${{ matrix.variant }} | gzip > /tmp/kreuzberg-${{ matrix.variant }}.tar.gz
+          ls -lh /tmp/kreuzberg-${{ matrix.variant }}.tar.gz
+
+      # A warning threshold (200) is supplied for the cli variant only; the
+      # other variants pass an empty value.
+      - name: Check image size
+        uses: kreuzberg-dev/actions/check-docker-image-size@v1
+        with:
+          image: kreuzberg:${{ matrix.variant }}
+          warn-mb: ${{ matrix.variant == 'cli' && '200' || '' }}
+          label: "${{ matrix.variant }} image"
+
+      # core/full variants: feature, configuration and API contract tests.
+      - name: Run feature tests
+        if: matrix.variant != 'cli'
+        run: scripts/ci/docker/run-feature-tests.sh "${{ matrix.variant }}"
+
+      - name: Run configuration tests
+        if: matrix.variant != 'cli'
+        run: scripts/ci/docker/run-config-tests.sh "${{ matrix.variant }}"
+
+      - name: Run API contract tests with schemathesis
+        if: matrix.variant != 'cli'
+        uses: kreuzberg-dev/actions/run-api-contract-tests@v1
+        with:
+          image: kreuzberg:${{ matrix.variant }}
+          port: "8000"
+
+      # cli variant: CLI tests only.
+      - name: Run CLI tests
+        if: matrix.variant == 'cli'
+        run: scripts/ci/docker/run-cli-tests.sh
--- a/.github/workflows/ci-docs.yaml
+++ b/.github/workflows/ci-docs.yaml
@@ -0,0 +1,102 @@
+# Lints and builds the documentation; pushes to main additionally deploy the
+# built site to GitHub Pages.
+name: CI Docs
+
+on:
+  # Pull requests: docs, READMEs, the per-language package manifests and the
+  # docs tooling configuration.
+  pull_request:
+    paths:
+      - "docs/**"
+      - "packages/**/README.md"
+      - "crates/*/README.md"
+      - "packages/python/pyproject.toml"
+      - "packages/typescript/package.json"
+      - "packages/ruby/kreuzberg.gemspec"
+      - "packages/php/composer.json"
+      - "packages/go/v5/go.mod"
+      - "packages/java/pom.xml"
+      - "packages/csharp/**/Kreuzberg.csproj"
+      - "packages/elixir/mix.exs"
+      - "packages/r/DESCRIPTION"
+      - "packages/dart/pubspec.yaml"
+      - "zensical.toml"
+      - "mkdocs.yml"
+      - "alef.toml"
+      - ".github/workflows/ci-docs.yaml"
+  # Pushes to main use a shorter path list than pull requests.
+  push:
+    branches: [main]
+    paths:
+      - "docs/**"
+      - "packages/**/README.md"
+      - "crates/*/README.md"
+      - "zensical.toml"
+      - "pyproject.toml"
+      - "alef.toml"
+      - "CHANGELOG.md"
+      - ".github/workflows/ci-docs.yaml"
+  workflow_dispatch:
+
+# Applies to every job in this workflow.
+# NOTE(review): pages/id-token write are presumably only needed by the deploy
+# job — confirm before narrowing.
+permissions:
+  contents: read
+  pages: write
+  id-token: write
+
+# Grouped per pull request (or per ref otherwise). An in-progress run is left
+# to finish rather than being cancelled by a newer one.
+concurrency:
+  group: ci-docs-${{ github.event.pull_request.number || github.ref }}
+  cancel-in-progress: false
+
+jobs:
+  # Lints the documentation and validates embedded snippets via the shared
+  # lint-docs action, run from the repository root with a pinned alef ref.
+  lint:
+    name: Lint
+    runs-on: ubuntu-latest
+    timeout-minutes: 30
+    steps:
+      - uses: actions/checkout@v6
+
+      - name: Lint documentation + validate snippets
+        uses: kreuzberg-dev/actions/lint-docs@v1
+        with:
+          working-directory: .
+          strict: "true"
+          validate-snippets: "true"
+          alef-ref: v0.19.5
+
+  # Builds the site and uploads it as the "docs-site" artifact, which the
+  # deploy job downloads.
+  build:
+    name: Build
+    runs-on: ubuntu-latest
+    timeout-minutes: 10
+    steps:
+      - uses: actions/checkout@v6
+        with:
+          # Full history instead of a shallow clone.
+          # NOTE(review): presumably required by the docs build — confirm.
+          fetch-depth: 0
+
+      - name: Build documentation
+        uses: kreuzberg-dev/actions/build-docs@v1
+        with:
+          working-directory: .
+          strict: "true"
+
+      # Only needed long enough for the deploy job of the same run.
+      - name: Upload site artifact
+        uses: actions/upload-artifact@v7
+        with:
+          name: docs-site
+          path: site/
+          retention-days: 1
+
+  # Publishes the site built by the build job to GitHub Pages. Runs only for
+  # pushes to main, after both build and lint have succeeded.
+  deploy:
+    name: Deploy
+    needs: [build, lint]
+    if: github.ref == 'refs/heads/main' && github.event_name == 'push'
+    runs-on: ubuntu-latest
+    # actions/deploy-pages deploys into the github-pages environment; the job
+    # must declare it, and the URL is taken from the deployment step's output.
+    environment:
+      name: github-pages
+      url: ${{ steps.deployment.outputs.page_url }}
+    steps:
+      - name: Download site artifact
+        uses: actions/download-artifact@v8
+        with:
+          name: docs-site
+          path: site/
+
+      # Repackages the downloaded site in the format the Pages deployment
+      # step consumes.
+      - name: Upload Pages artifact
+        uses: actions/upload-pages-artifact@v5
+        with:
+          path: site
+
+      - name: Deploy to GitHub Pages
+        id: deployment
+        uses: actions/deploy-pages@v5
--- a/.github/workflows/ci-e2e.yaml
+++ b/.github/workflows/ci-e2e.yaml
@@ -0,0 +1,345 @@
+name: CI E2E
+
+on:
+  push:
+    branches: [main]
+    paths:
+      - "crates/**"
+      - "packages/**"
+      - "e2e/**"
+      - "fixtures/**"
+      - "alef.toml"
+      - "Cargo.toml"
+      - "Cargo.lock"
+      - "Taskfile.yml"
+      - ".github/workflows/ci-e2e.yaml"
+  pull_request:
+    branches: [main]
+    paths:
+      - "crates/**"
+      - "packages/**"
+      - "e2e/**"
+      - "fixtures/**"
+      - "alef.toml"
+      - "Cargo.toml"
+      - "Cargo.lock"
+      - "Taskfile.yml"
+      - ".github/workflows/ci-e2e.yaml"
+  workflow_dispatch:
+
+concurrency:
+  group: ci-e2e-${{ github.ref }}
+  cancel-in-progress: true
+
+env:
+  CARGO_TERM_COLOR: always
+  CARGO_INCREMENTAL: 0
+  CARGO_PROFILE_DEV_DEBUG: 0
+  RUST_BACKTRACE: short
+  RUST_MIN_STACK: 16777216
+  ORT_VERSION: "1.24.2"
+  MACOSX_DEPLOYMENT_TARGET: "14.0"
+  BUILD_PROFILE: "ci"
+  FORCE_JAVASCRIPT_ACTIONS_TO_NODE24: "true"
+
+permissions:
+  contents: read
+
+jobs:
+  # Builds the FFI library and the CLI for each target and uploads them as the
+  # artifact "ffi-<target>".
+  build-ffi:
+    name: Build FFI (${{ matrix.target }})
+    # Only runs in the kreuzberg-dev/kreuzberg repository and never for
+    # Dependabot-triggered runs.
+    if: github.repository == 'kreuzberg-dev/kreuzberg' && github.actor != 'dependabot[bot]'
+    runs-on: ${{ matrix.os }}
+    # Windows gets 120 minutes, every other runner 60.
+    timeout-minutes: ${{ matrix.os == 'windows-latest' && 120 || 60 }}
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - os: ubuntu-24.04-arm
+            target: aarch64-unknown-linux-gnu
+          - os: ubuntu-latest
+            target: x86_64-unknown-linux-gnu
+          - os: macos-latest
+            target: aarch64-apple-darwin
+          - os: windows-latest
+            target: x86_64-pc-windows-msvc
+    steps:
+      - uses: actions/checkout@v6
+        with:
+          submodules: recursive
+
+      - name: Setup Rust
+        uses: kreuzberg-dev/actions/setup-rust@v1
+        with:
+          cache-key-prefix: build-ffi-${{ matrix.target }}
+          target: ${{ matrix.target }}
+
+      - name: Install system dependencies
+        uses: ./.github/actions/install-system-deps
+
+      - name: Setup OpenSSL
+        uses: kreuzberg-dev/actions/setup-openssl@v1
+
+      - name: Build FFI library
+        uses: kreuzberg-dev/actions/build-rust-ffi@v1
+        with:
+          crate-name: kreuzberg-ffi
+
+      - name: Build CLI
+        uses: kreuzberg-dev/actions/build-rust-cli@v1
+        with:
+          package-name: kreuzberg-cli
+          binary-name: kreuzberg
+          extra-cargo-args: --features all
+
+      # The path list covers library and binary names for all platforms
+      # (lib-prefixed and unprefixed, with and without .exe), plus the header,
+      # pkg-config file and CMake directory. The step errors if nothing matches.
+      - name: Upload FFI artifacts
+        uses: actions/upload-artifact@v7
+        with:
+          name: ffi-${{ matrix.target }}
+          path: |
+            target/release/libkreuzberg_ffi.*
+            target/release/kreuzberg_ffi.*
+            crates/kreuzberg-ffi/include/kreuzberg.h
+            crates/kreuzberg-ffi/kreuzberg-ffi.pc
+            crates/kreuzberg-ffi/cmake/
+            target/release/kreuzberg
+            target/release/kreuzberg.exe
+          retention-days: 7
+          if-no-files-found: error
+
+  e2e-tests:
+    name: E2E (${{ matrix.lang }})
+    if: github.repository == 'kreuzberg-dev/kreuzberg' && github.actor != 'dependabot[bot]'
+    needs: [build-ffi]
+    runs-on: ubuntu-24.04-arm
+    timeout-minutes: 60
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - lang: python
+            python-version: "3.13"
+            test-cmd: "pip install maturin && cd packages/python && maturin develop --release && cd ../../e2e/python && python3 -m pytest tests/ -q"
+          - lang: node
+            node-version: "24"
+            test-cmd: "cd crates/kreuzberg-node && npm run build && cd ../../e2e/node && npx vitest run"
+          - lang: go
+            go-version: "1.26"
+            test-cmd: "cd e2e/go && go test ./... -count=1 -v"
+          - lang: ruby
+            ruby-version: "3.4"
+            test-cmd: "cd e2e/ruby && bundle exec rspec"
+          - lang: java
+            java-version: "25"
+            test-cmd: "cd packages/java && mvn -q package -DskipTests && cd ../../e2e/java && mvn test -q"
+          - lang: csharp
+            dotnet-version: "10.0.x"
+            test-cmd: "cd e2e/csharp && dotnet test"
+          - lang: php
+            php-version: "8.4"
+            test-cmd: 'cd crates/kreuzberg-php && cargo build --release && echo "extension=$(pwd)/../../target/release/libkreuzberg_php.so" | sudo tee -a "$(php -r ''echo php_ini_loaded_file();'')" >/dev/null && cd ../../e2e/php && composer install -q && vendor/bin/phpunit'
+          - lang: elixir
+            elixir-version: "1.19"
+            otp-version: "28"
+            test-cmd: "cd e2e/elixir && KREUZBERG_BUILD=true mix deps.get && KREUZBERG_BUILD=true mix test"
+          - lang: wasm
+            node-version: "24"
+            test-cmd: 'curl https://rustwasm.github.io/wasm-pack/installer/init.sh -sSf | sh && export PATH="$HOME/.cargo/bin:$PATH" && export RUSTFLAGS=''--cfg getrandom_backend="wasm_js"'' && cd crates/kreuzberg-wasm && wasm-pack build --release --target web --out-dir ../../packages/wasm/pkg && cd ../../e2e/wasm && npm install && npm test'
+          - lang: rust
+            test-cmd: "cd e2e/rust && cargo test"
+          - lang: r
+            r-version: "4.3"
+            test-cmd: "cd e2e/r && Rscript run_tests.R"
+          - lang: dart
+            dart-version: "3.11"
+            # The braces limit "|| true" to the best-effort library copy, so a
+            # failing `cargo build` still fails the whole command.
+            test-cmd: "cargo build --release -p kreuzberg-dart && mkdir -p packages/dart/rust/target/release && { cp target/release/libkreuzberg_dart.* packages/dart/rust/target/release/ 2>/dev/null || true; } && cd packages/dart && dart pub get && cd ../../e2e/dart && dart pub get && dart test"
+          - lang: kotlin_android
+            java-version: "25"
+            test-cmd: "cd e2e/kotlin_android && gradle test --no-daemon"
+          - lang: swift
+            swift-version: "6.0"
+            test-cmd: "cd e2e/swift_e2e && swift test"
+          - lang: zig
+            zig-version: "0.16.0"
+            test-cmd: 'FFI_ABS="$PWD/target/release" && cd e2e/zig && zig build test -Dffi_path="$FFI_ABS"'
+    steps:
+      - uses: actions/checkout@v6
+        with:
+          submodules: recursive
+
+      - name: Setup Rust
+        uses: kreuzberg-dev/actions/setup-rust@v1
+        with:
+          cache-key-prefix: e2e-${{ matrix.lang }}
+
+      - name: Download FFI artifacts
+        uses: actions/download-artifact@v8
+        with:
+          name: ffi-aarch64-unknown-linux-gnu
+          path: ffi-artifacts
+
+      # Copies the downloaded artifact tree back to the repository paths it was
+      # uploaded from, then installs the shared library system-wide.
+      - name: Stage FFI artifacts
+        shell: bash
+        run: |
+          mkdir -p target/release crates/kreuzberg-ffi/include crates/kreuzberg-ffi/cmake
+          # Each piece is optional: copy only what the artifact contains.
+          if [ -d ffi-artifacts/target/release ]; then
+            cp -r ffi-artifacts/target/release/. target/release/
+          fi
+          if [ -d ffi-artifacts/crates/kreuzberg-ffi/include ]; then
+            cp -r ffi-artifacts/crates/kreuzberg-ffi/include/. crates/kreuzberg-ffi/include/
+          fi
+          if [ -d ffi-artifacts/crates/kreuzberg-ffi/cmake ]; then
+            cp -r ffi-artifacts/crates/kreuzberg-ffi/cmake/. crates/kreuzberg-ffi/cmake/
+          fi
+          if [ -f ffi-artifacts/crates/kreuzberg-ffi/kreuzberg-ffi.pc ]; then
+            cp ffi-artifacts/crates/kreuzberg-ffi/kreuzberg-ffi.pc crates/kreuzberg-ffi/
+          fi
+          # Best effort: the library may be absent, so a chmod failure is ignored.
+          chmod +x target/release/libkreuzberg_ffi.so 2>/dev/null || true
+          ls -la target/release/
+          # Install into /usr/local/lib and refresh the linker cache so the
+          # library can be found without extra search paths.
+          if [ -f target/release/libkreuzberg_ffi.so ]; then
+            sudo cp target/release/libkreuzberg_ffi.so /usr/local/lib/
+            sudo ldconfig
+          fi
+
+      - name: Install system dependencies
+        uses: ./.github/actions/install-system-deps
+
+      - name: Setup OpenSSL
+        uses: kreuzberg-dev/actions/setup-openssl@v1
+
+      - name: Setup ONNX Runtime
+        uses: ./.github/actions/setup-onnx-runtime
+        with:
+          ort-version: ${{ env.ORT_VERSION }}
+
+      - name: Setup Tesseract cache
+        uses: ./.github/actions/setup-tesseract-cache
+        with:
+          label: e2e-${{ matrix.lang }}
+
+      - name: Install WASI SDK
+        if: matrix.lang == 'wasm'
+        uses: kreuzberg-dev/actions/install-wasi-sdk@v1
+
+      - name: Setup Python
+        if: matrix.python-version
+        uses: kreuzberg-dev/actions/setup-python-env@v1
+        with:
+          python-version: ${{ matrix.python-version }}
+          cache-prefix: e2e-py-${{ matrix.python-version }}
+
+      # Runs for matrix entries that declare a node-version and installs exactly
+      # that version instead of relying on the action's default.
+      - name: Setup Node
+        if: matrix.node-version
+        uses: kreuzberg-dev/actions/setup-node-workspace@v1
+        with:
+          node-version: ${{ matrix.node-version }}
+
+      - name: Setup Go
+        if: matrix.go-version
+        uses: actions/setup-go@v6
+        with:
+          go-version: ${{ matrix.go-version }}
+
+      - name: Setup Ruby
+        if: matrix.ruby-version
+        uses: ruby/setup-ruby@v1
+        with:
+          ruby-version: ${{ matrix.ruby-version }}
+          bundler-cache: true
+          working-directory: e2e/ruby
+
+      - name: Setup Java
+        if: matrix.java-version
+        uses: actions/setup-java@v5
+        with:
+          distribution: temurin
+          java-version: ${{ matrix.java-version }}
+
+      - name: Setup Android SDK
+        if: matrix.lang == 'kotlin_android'
+        uses: android-actions/setup-android@v3
+        with:
+          api-level: 35
+          build-tools-version: "35.0.0"
+
+      - name: Setup Gradle
+        if: matrix.lang == 'kotlin_android'
+        uses: kreuzberg-dev/actions/setup-gradle@v1
+        with:
+          gradle-version: "9.1.0"
+
+      - name: Setup .NET
+        if: matrix.dotnet-version
+        uses: actions/setup-dotnet@v5
+        with:
+          dotnet-version: ${{ matrix.dotnet-version }}
+
+      - name: Setup PHP
+        if: matrix.php-version
+        uses: kreuzberg-dev/actions/setup-php@v1
+        with:
+          php-version: ${{ matrix.php-version }}
+          tools: composer
+
+      - name: Setup Elixir
+        if: matrix.elixir-version
+        uses: kreuzberg-dev/actions/setup-elixir@v1
+        with:
+          elixir-version: ${{ matrix.elixir-version }}
+          otp-version: ${{ matrix.otp-version }}
+
+      - name: Setup R
+        if: matrix.r-version
+        uses: kreuzberg-dev/actions/setup-r@v1
+        with:
+          r-version: ${{ matrix.r-version }}
+
+      - name: Install R test packages
+        if: matrix.lang == 'r'
+        run: R -e 'install.packages(c("testthat","jsonlite","devtools"), repos="https://cloud.r-project.org")'
+
+      - name: Setup Dart
+        if: matrix.dart-version
+        uses: dart-lang/setup-dart@v1
+        with:
+          sdk: ${{ matrix.dart-version }}
+
+      - name: Setup Swift
+        if: matrix.swift-version
+        uses: kreuzberg-dev/actions/setup-swift@v1
+        with:
+          swift-version: ${{ matrix.swift-version }}
+
+      - name: Setup Zig
+        if: matrix.zig-version
+        uses: kreuzberg-dev/actions/setup-zig@v1
+        with:
+          version: ${{ matrix.zig-version }}
+
+      # Prepends the staged FFI locations to the pkg-config and dynamic-linker
+      # search paths and exports them to the following steps.
+      - name: Setup library paths for FFI bindings
+        if: |
+          matrix.lang == 'go' || matrix.lang == 'java' ||
+          matrix.lang == 'csharp' || matrix.lang == 'elixir' ||
+          matrix.lang == 'r' || matrix.lang == 'kotlin_android' ||
+          matrix.lang == 'swift' || matrix.lang == 'zig'
+        shell: bash
+        run: |
+          # Append the previous value only when it is non-empty. A trailing ":"
+          # would add an empty entry, which the dynamic linker treats as the
+          # current working directory.
+          export PKG_CONFIG_PATH="${PWD}/crates/kreuzberg-ffi${PKG_CONFIG_PATH:+:${PKG_CONFIG_PATH}}"
+          export LD_LIBRARY_PATH="${PWD}/target/release${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
+          echo "PKG_CONFIG_PATH=${PKG_CONFIG_PATH}" >> "$GITHUB_ENV"
+          echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}" >> "$GITHUB_ENV"
+
+      - name: Install Task
+        uses: kreuzberg-dev/actions/install-task@v1
+
+      - name: Compile Ruby native extension
+        if: matrix.lang == 'ruby'
+        working-directory: packages/ruby
+        run: bundle install && bundle exec rake compile
+
+      # Executes the per-language command defined in the matrix entry.
+      - name: Run tests
+        run: ${{ matrix.test-cmd }}
+        shell: bash
+        env:
+          # Re-exported from the job environment.
+          # NOTE(review): presumably empty for languages where the library-path
+          # step was skipped — confirm that is intended.
+          PKG_CONFIG_PATH: ${{ env.PKG_CONFIG_PATH }}
+          LD_LIBRARY_PATH: ${{ env.LD_LIBRARY_PATH }}
+          DYLD_LIBRARY_PATH: ${{ env.DYLD_LIBRARY_PATH || '' }}
+          # Fixed tessdata location.
+          # NOTE(review): assumes the system-deps step installs Tesseract 5
+          # data at this path — confirm.
+          TESSDATA_PREFIX: "/usr/share/tesseract-ocr/5/tessdata"
--- a/.github/workflows/ci-gpu.yaml
+++ b/.github/workflows/ci-gpu.yaml
@@ -0,0 +1,112 @@
+# Manually dispatched pipeline: the test binary is built on a hosted runner
+# and then executed by the `gpu-tests` job on the GPU runner.
+name: CI GPU
+
+on:
+  workflow_dispatch:
+
+permissions:
+  contents: read
+
+# One active run per ref; a newer dispatch cancels the one in progress.
+concurrency:
+  cancel-in-progress: true
+  group: ci-gpu-${{ github.ref }}
+
+env:
+  ORT_VERSION: "1.24.2"
+  RUST_BACKTRACE: short
+  RUST_MIN_STACK: 16777216
+  CARGO_TERM_COLOR: always
+  CARGO_INCREMENTAL: 0
+  CARGO_PROFILE_DEV_DEBUG: 0
+
+jobs:
+  # Builds the `gpu_acceleration` test target as `gpu-acceleration-test` and
+  # uploads it as the `gpu-test-binary` artifact consumed by `gpu-tests`.
+  build:
+    name: "Build test binary"
+    timeout-minutes: 30
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v6
+        with:
+          submodules: recursive
+
+      - name: Install system dependencies
+        uses: ./.github/actions/install-system-deps
+
+      - name: Setup Rust
+        uses: dtolnay/rust-toolchain@stable
+        with:
+          toolchain: "1.95"
+
+      # The cache key follows Cargo.lock; the per-OS prefix is the fallback
+      # when the lockfile changed.
+      - name: Cache Cargo
+        uses: actions/cache@v5
+        with:
+          key: gpu-build-${{ runner.os }}-${{ hashFiles('Cargo.lock') }}
+          restore-keys: |
+            gpu-build-${{ runner.os }}-
+          path: |
+            ~/.cargo/registry
+            ~/.cargo/git
+            target
+
+      - name: Build GPU test binary
+        uses: kreuzberg-dev/actions/build-gpu-test-binary@v1
+        with:
+          package: kreuzberg
+          test-name: gpu_acceleration
+          output-name: gpu-acceleration-test
+          features: "paddle-ocr,layout-detection,embeddings,pdf,ocr,ort-dynamic"
+
+      # Kept for one day only.
+      - name: Upload test binary
+        uses: actions/upload-artifact@v7
+        with:
+          name: gpu-test-binary
+          retention-days: 1
+          path: gpu-acceleration-test
+
+  # Downloads the binary produced by `build` and runs its ignored tests on
+  # the GPU runner.
+  gpu-tests:
+    name: "GPU Tests (CUDA)"
+    needs: build
+    timeout-minutes: 15
+    runs-on: runner-gpu-l4
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v6
+        with:
+          submodules: recursive
+
+      # Abort early when nvidia-smi cannot talk to a GPU.
+      - name: Verify GPU
+        run: |
+          if ! nvidia-smi; then
+            echo "ERROR: nvidia-smi failed — no GPU detected"
+            exit 1
+          fi
+          echo "GPU detected:"
+          nvidia-smi --query-gpu=name,driver_version,memory.total --format=csv,noheader
+
+      - name: Download test binary
+        uses: actions/download-artifact@v8.0.1
+        with:
+          name: gpu-test-binary
+
+      - name: Download ONNX Runtime (GPU/CUDA)
+        uses: kreuzberg-dev/actions/setup-onnx-runtime-gpu@v1
+        with:
+          version: ${{ env.ORT_VERSION }}
+
+      - name: Setup PaddleOCR models
+        uses: ./.github/actions/setup-paddle-ocr-models
+
+      - name: Clear stale layout model cache (self-hosted runner persistence)
+        run: |
+          rm -rf "$HOME/.cache/kreuzberg/layout"
+          echo "Cleared layout model cache"
+
+      # The step sets the executable bit itself before running the
+      # downloaded binary.
+      - name: Run GPU tests
+        env:
+          TEST_DOCUMENTS_DIR: ${{ github.workspace }}/test_documents
+          RUST_LOG: "kreuzberg=debug"
+        run: |
+          chmod +x gpu-acceleration-test
+          ./gpu-acceleration-test --ignored --nocapture
--- a/.github/workflows/ci-lint.yaml
+++ b/.github/workflows/ci-lint.yaml
@@ -0,0 +1,107 @@
+name: CI Lint
+
+# Runs for pushes and pull requests targeting main, or on manual dispatch.
+on:
+  workflow_dispatch:
+  push:
+    branches:
+      - main
+  pull_request:
+    branches:
+      - main
+
+permissions:
+  contents: read
+
+# One active run per ref; newer runs cancel the one in progress.
+concurrency:
+  cancel-in-progress: true
+  group: ci-lint-${{ github.ref }}
+
+jobs:
+  # Installs every language toolchain used by the repository, runs all prek
+  # hooks against the whole tree, then checks that the C header exists.
+  lint:
+    name: Lint
+    if: ${{ github.repository == 'kreuzberg-dev/kreuzberg' && github.actor != 'dependabot[bot]' }}
+    timeout-minutes: 60
+    runs-on: ubuntu-24.04-arm
+    env:
+      FORCE_JAVASCRIPT_ACTIONS_TO_NODE24: "true"
+      CARGO_TERM_COLOR: always
+    steps:
+      - uses: actions/checkout@v6
+        with:
+          submodules: recursive
+
+      - name: Setup Rust
+        uses: kreuzberg-dev/actions/setup-rust@v1
+        with:
+          cache-key-prefix: lint
+
+      - name: Setup Python
+        uses: kreuzberg-dev/actions/setup-python-env@v1
+        with:
+          python-version: "3.13"
+          install-command: "uv sync --group dev --no-install-project --no-install-workspace --frozen"
+          cache-prefix: lint-py
+
+      - name: Setup Node
+        uses: kreuzberg-dev/actions/setup-node-workspace@v1
+
+      - name: Setup Go
+        uses: actions/setup-go@v6
+        with:
+          go-version: "1.26"
+
+      - name: Setup Java
+        uses: actions/setup-java@v5
+        with:
+          java-version: "25"
+          distribution: temurin
+
+      - name: Setup Elixir
+        uses: kreuzberg-dev/actions/setup-elixir@v1
+
+      - name: Setup Ruby
+        uses: ruby/setup-ruby@v1
+        with:
+          working-directory: packages/ruby
+          ruby-version: "3.4"
+          bundler-cache: true
+
+      - name: Setup PHP
+        uses: kreuzberg-dev/actions/setup-php@v1
+
+      - name: Setup .NET
+        uses: actions/setup-dotnet@v5
+        with:
+          dotnet-version: "10.0.x"
+
+      - name: Setup R
+        uses: kreuzberg-dev/actions/setup-r@v1
+        with:
+          install-deps: "false"
+          r-version: "release"
+
+      - name: Install Task
+        uses: kreuzberg-dev/actions/install-task@v1
+
+      - name: Setup Helm
+        uses: azure/setup-helm@v5
+
+      - name: Setup kubeconform
+        uses: bmuschko/setup-kubeconform@v1
+
+      - name: Install alef CLI
+        uses: kreuzberg-dev/actions/install-alef@v1
+
+      - name: Run all prek hooks
+        uses: j178/prek-action@v2
+        with:
+          extra-args: --all-files
+          cache: false
+
+      # Only checks that the header file is present; the error message
+      # names the task that regenerates it.
+      - name: Validate C header
+        shell: bash
+        run: |
+          header_path="crates/kreuzberg-ffi/include/kreuzberg.h"
+          if [[ ! -f "${header_path}" ]]; then
+            echo "::error::C header not found at ${header_path} — run 'task alef:generate'"
+            exit 1
+          fi
+          echo "C header verified at ${header_path}"
--- a/.github/workflows/ci-mobile.yaml
+++ b/.github/workflows/ci-mobile.yaml
@@ -0,0 +1,79 @@
+name: CI Mobile
+
+# Only changes to the Rust crates, the mobile packages or this workflow
+# trigger a run; manual dispatch is always available.
+on:
+  workflow_dispatch:
+  push:
+    branches:
+      - main
+    paths:
+      - "crates/**"
+      - "packages/dart/**"
+      - "packages/swift/**"
+      - "packages/kotlin-android/**"
+      - ".github/workflows/ci-mobile.yaml"
+  pull_request:
+    branches:
+      - main
+    paths:
+      - "crates/**"
+      - "packages/dart/**"
+      - "packages/swift/**"
+      - "packages/kotlin-android/**"
+      - ".github/workflows/ci-mobile.yaml"
+
+permissions:
+  contents: read
+
+concurrency:
+  cancel-in-progress: true
+  group: ci-mobile-${{ github.ref }}
+
+env:
+  CARGO_TERM_COLOR: always
+  CARGO_INCREMENTAL: 0
+  CARGO_PROFILE_DEV_DEBUG: 0
+  RUST_BACKTRACE: short
+  # Mobile feature subsets (Android drops ORT-requiring features) leave some
+  # functions only used in the full-feature graph; -A dead_code keeps the
+  # cross-compile check honest about other classes of warnings without choking
+  # on these.
+  RUSTFLAGS: "-D warnings -A dead_code -A unpredictable-function-pointer-comparisons -A mismatched-lifetime-syntaxes"
+
+jobs:
+  # `cargo ndk ... check` of the Dart and FFI crates, once per Android ABI.
+  android-check:
+    name: Android cargo check (${{ matrix.abi }})
+    if: ${{ github.repository == 'kreuzberg-dev/kreuzberg' && github.actor != 'dependabot[bot]' }}
+    timeout-minutes: 30
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        abi:
+          - arm64-v8a
+          - x86_64
+    steps:
+      - uses: actions/checkout@v6
+      - uses: kreuzberg-dev/actions/setup-rust@v1
+        with:
+          cache-key-prefix: ci-mobile-android-${{ matrix.abi }}
+      - uses: kreuzberg-dev/actions/setup-android-ndk@v1
+      - name: cargo ndk check kreuzberg-dart
+        run: cargo ndk --target ${{ matrix.abi }} --platform 21 -- check -p kreuzberg-dart
+      - name: cargo ndk check kreuzberg-ffi
+        run: cargo ndk --target ${{ matrix.abi }} --platform 21 -- check -p kreuzberg-ffi
+
+  # `cargo check` of the Dart and Swift crates for the iOS device and
+  # simulator targets.
+  ios-check:
+    name: iOS cargo check (${{ matrix.target }})
+    if: ${{ github.repository == 'kreuzberg-dev/kreuzberg' && github.actor != 'dependabot[bot]' }}
+    timeout-minutes: 30
+    runs-on: macos-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        target:
+          - aarch64-apple-ios
+          - aarch64-apple-ios-sim
+    steps:
+      - uses: actions/checkout@v6
+      - uses: kreuzberg-dev/actions/setup-rust@v1
+        with:
+          cache-key-prefix: ci-mobile-ios-${{ matrix.target }}
+          target: ${{ matrix.target }}
+      - name: cargo check kreuzberg-dart
+        run: cargo check -p kreuzberg-dart --target ${{ matrix.target }}
+      - name: cargo check kreuzberg-swift
+        run: cargo check -p kreuzberg-swift --target ${{ matrix.target }}
--- a/.github/workflows/ci-rust.yaml
+++ b/.github/workflows/ci-rust.yaml
@@ -0,0 +1,103 @@
+name: CI Rust
+
+# Only changes to the crates, the Cargo manifests, the toolchain file or this
+# workflow trigger a run; manual dispatch is always available.
+on:
+  workflow_dispatch:
+  push:
+    branches:
+      - main
+    paths:
+      - "crates/**"
+      - "Cargo.toml"
+      - "Cargo.lock"
+      - "rust-toolchain.toml"
+      - ".github/workflows/ci-rust.yaml"
+  pull_request:
+    branches:
+      - main
+    paths:
+      - "crates/**"
+      - "Cargo.toml"
+      - "Cargo.lock"
+      - "rust-toolchain.toml"
+      - ".github/workflows/ci-rust.yaml"
+
+permissions:
+  contents: read
+
+concurrency:
+  cancel-in-progress: true
+  group: ci-rust-${{ github.ref }}
+
+env:
+  BUILD_PROFILE: "ci"
+  ORT_VERSION: "1.24.2"
+  MACOSX_DEPLOYMENT_TARGET: "14.0"
+  FORCE_JAVASCRIPT_ACTIONS_TO_NODE24: "true"
+  RUST_BACKTRACE: short
+  RUST_MIN_STACK: 16777216
+  CARGO_TERM_COLOR: always
+  CARGO_INCREMENTAL: 0
+  CARGO_PROFILE_DEV_DEBUG: 0
+
+jobs:
+  # Clippy, the `rust:test:ci` task and a no-default-features check, once
+  # per operating system in the matrix.
+  rust:
+    name: Rust (${{ matrix.os }})
+    if: ${{ github.repository == 'kreuzberg-dev/kreuzberg' && github.actor != 'dependabot[bot]' }}
+    timeout-minutes: 60
+    runs-on: ${{ matrix.os }}
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - os: ubuntu-24.04-arm
+          - os: macos-latest
+    steps:
+      - uses: actions/checkout@v6
+        with:
+          submodules: recursive
+
+      - name: Free disk space
+        if: ${{ runner.os == 'Linux' }}
+        uses: kreuzberg-dev/actions/free-disk-space-linux@v1
+        with:
+          show-final: "true"
+          show-initial: "false"
+
+      - name: Setup Rust
+        uses: kreuzberg-dev/actions/setup-rust@v1
+        with:
+          use-sccache: "true"
+          cache-key-prefix: rust-${{ matrix.os }}
+
+      - name: Install system dependencies
+        uses: ./.github/actions/install-system-deps
+
+      - name: Setup OpenSSL
+        uses: kreuzberg-dev/actions/setup-openssl@v1
+
+      - name: Setup ONNX Runtime
+        uses: ./.github/actions/setup-onnx-runtime
+        with:
+          ort-version: ${{ env.ORT_VERSION }}
+
+      - name: Setup Tesseract cache
+        uses: ./.github/actions/setup-tesseract-cache
+        with:
+          label: ${{ matrix.os }}
+
+      - name: Install Task
+        uses: kreuzberg-dev/actions/install-task@v1
+
+      # The binding crates listed below are excluded from this workspace-wide
+      # clippy run. The folded scalar joins the lines with single spaces.
+      - name: Run clippy
+        shell: bash
+        run: >-
+          cargo clippy --workspace
+          --exclude kreuzberg-ffi
+          --exclude kreuzberg-py
+          --exclude kreuzberg-php
+          --exclude kreuzberg-node
+          --exclude kreuzberg-wasm
+          --exclude kreuzberg-dart
+          --exclude kreuzberg-swift
+          --exclude kreuzberg_nif
+          -- -D warnings
+
+      # Each library path falls back to an empty string when unset.
+      - name: Run tests
+        shell: bash
+        env:
+          LD_LIBRARY_PATH: ${{ env.LD_LIBRARY_PATH || '' }}
+          DYLD_LIBRARY_PATH: ${{ env.DYLD_LIBRARY_PATH || '' }}
+          DYLD_FALLBACK_LIBRARY_PATH: ${{ env.DYLD_FALLBACK_LIBRARY_PATH || '' }}
+        run: task rust:test:ci
+
+      - name: Check no-default-features
+        shell: bash
+        run: cargo check -p kreuzberg --no-default-features
--- a/.github/workflows/profiling.yaml
+++ b/.github/workflows/profiling.yaml
--- a/.github/workflows/publish-docker.yaml
+++ b/.github/workflows/publish-docker.yaml
@@ -0,0 +1,262 @@
+name: Publish Docker Images
+
+# Triggered by a published release, a `publish-docker` repository dispatch,
+# or manually with an explicit tag.
+on:
+  release:
+    types:
+      - published
+  repository_dispatch:
+    types:
+      - publish-docker
+  workflow_dispatch:
+    inputs:
+      tag:
+        description: "Release tag to build (e.g., v4.3.6)"
+        type: string
+        required: true
+      ref:
+        description: "Git ref (branch, tag, or commit) to build; defaults to the tag"
+        type: string
+        required: false
+      dry_run:
+        description: "Prepare artifacts without publishing"
+        type: boolean
+        required: false
+        default: false
+      force_republish:
+        description: "Force re-publish even if artifacts already exist"
+        type: boolean
+        required: false
+        default: false
+
+# Runs for the same ref/tag are queued rather than cancelled.
+concurrency:
+  cancel-in-progress: false
+  group: ${{ github.workflow }}-${{ (github.event_name == 'workflow_dispatch' && (github.event.inputs.ref || github.event.inputs.tag)) || github.ref || github.run_id }}
+
+permissions:
+  contents: read
+
+env:
+  ORT_VERSION: "1.24.2"
+  MACOSX_DEPLOYMENT_TARGET: "14.0"
+  FORCE_JAVASCRIPT_ACTIONS_TO_NODE24: "true"
+  CARGO_TERM_COLOR: always
+
+jobs:
+  # Resolves the release metadata once and re-exports it as job outputs for
+  # the downstream jobs. Skipped for prerelease `release` events.
+  prepare:
+    name: Prepare metadata
+    if: ${{ github.event_name != 'release' || !github.event.release.prerelease }}
+    runs-on: ubuntu-latest
+    permissions:
+      contents: read
+    outputs:
+      tag: ${{ steps.meta.outputs.tag }}
+      version: ${{ steps.meta.outputs.version }}
+      ref: ${{ steps.meta.outputs.ref }}
+      checkout_ref: ${{ steps.meta.outputs.checkout_ref }}
+      target_sha: ${{ steps.meta.outputs.target_sha }}
+      is_tag: ${{ steps.meta.outputs.is_tag }}
+      dry_run: ${{ steps.meta.outputs.dry_run }}
+      force_republish: ${{ steps.meta.outputs.force_republish }}
+      release_docker: ${{ steps.meta.outputs.release_docker }}
+    steps:
+      - name: Checkout code (default)
+        uses: actions/checkout@v6
+
+      - name: Resolve release metadata
+        id: meta
+        uses: kreuzberg-dev/actions/prepare-release-metadata@v1
+        with:
+          targets: docker
+          tag: ${{ inputs.tag }}
+          ref: ${{ inputs.ref }}
+          dry-run: ${{ inputs.dry_run }}
+          force-republish: ${{ inputs.force_republish }}
+
+      # Only runs when the metadata step resolved a ref to check out.
+      - name: Re-checkout at target ref
+        if: ${{ steps.meta.outputs.checkout_ref != '' }}
+        uses: actions/checkout@v6
+        with:
+          ref: ${{ steps.meta.outputs.checkout_ref }}
+          submodules: recursive
+          fetch-depth: 0
+
+      # Values reach the script as environment variables and are written to
+      # the step summary as a Markdown list.
+      - name: Show metadata
+        env:
+          META_TAG: ${{ steps.meta.outputs.tag }}
+          META_VERSION: ${{ steps.meta.outputs.version }}
+          META_REF: ${{ steps.meta.outputs.ref }}
+          META_DRY_RUN: ${{ steps.meta.outputs.dry_run }}
+          META_FORCE_REPUBLISH: ${{ steps.meta.outputs.force_republish }}
+          META_CHECKOUT_REF: ${{ steps.meta.outputs.checkout_ref }}
+          META_TARGET_SHA: ${{ steps.meta.outputs.target_sha }}
+          META_IS_TAG: ${{ steps.meta.outputs.is_tag }}
+          META_RELEASE_DOCKER: ${{ steps.meta.outputs.release_docker }}
+        run: |
+          {
+            echo "## Release Metadata"
+            echo "- **Tag**: \`${META_TAG}\`"
+            echo "- **Version**: \`${META_VERSION}\`"
+            echo "- **Ref**: \`${META_REF}\`"
+            echo "- **Dry Run**: \`${META_DRY_RUN}\`"
+            echo "- **Force Republish**: \`${META_FORCE_REPUBLISH}\`"
+            echo "- **Checkout Ref**: \`${META_CHECKOUT_REF}\`"
+            echo "- **Target SHA**: \`${META_TARGET_SHA}\`"
+            echo "- **Is Tag**: \`${META_IS_TAG}\`"
+            echo "- **Release Docker**: \`${META_RELEASE_DOCKER}\`"
+          } >> "$GITHUB_STEP_SUMMARY"
+
+  # Runs scripts/publish/check-docker-tag.sh once per image variant and
+  # exposes each step's `exists` output to the publish job.
+  check-docker:
+    name: Check if Docker image tag exists
+    needs: prepare
+    if: ${{ needs.prepare.outputs.release_docker == 'true' }}
+    runs-on: ubuntu-latest
+    permissions:
+      contents: read
+      packages: read
+    outputs:
+      core_exists: ${{ steps.core.outputs.exists }}
+      full_exists: ${{ steps.full.outputs.exists }}
+      cli_exists: ${{ steps.cli.outputs.exists }}
+    steps:
+      # Use the same resolved ref as the publish job. Checking out the bare
+      # tag fails when the workflow is dispatched with a `ref` input for a tag
+      # that does not exist yet. Falls back to the tag when no checkout ref
+      # was resolved.
+      # NOTE(review): assumes `checkout_ref` from prepare-release-metadata is
+      # a ref that actions/checkout accepts — confirm against the action.
+      - name: Checkout code
+        uses: actions/checkout@v6
+        with:
+          ref: ${{ needs.prepare.outputs.checkout_ref || needs.prepare.outputs.tag }}
+
+      - name: Log in to GitHub Container Registry
+        uses: docker/login-action@v4
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+
+      # The three checks below differ only in the image reference and the
+      # summary label handed to the script.
+      - name: Check core image tag
+        id: core
+        env:
+          DOCKER_TAG: ghcr.io/kreuzberg-dev/kreuzberg:${{ needs.prepare.outputs.version }}-core
+          SUMMARY_LABEL: core
+        run: scripts/publish/check-docker-tag.sh
+
+      - name: Check full image tag
+        id: full
+        env:
+          DOCKER_TAG: ghcr.io/kreuzberg-dev/kreuzberg:${{ needs.prepare.outputs.version }}
+          SUMMARY_LABEL: full
+        run: scripts/publish/check-docker-tag.sh
+
+      - name: Check CLI image tag
+        id: cli
+        env:
+          DOCKER_TAG: ghcr.io/kreuzberg-dev/kreuzberg-cli:${{ needs.prepare.outputs.version }}
+          SUMMARY_LABEL: cli
+        run: scripts/publish/check-docker-tag.sh
+
+  # Builds and tests one image per matrix variant and, unless this is a dry
+  # run, publishes it. Build, test and publish steps are skipped for a variant
+  # whose version tag already exists, unless force_republish is 'true'.
+  publish-docker:
+    name: Publish Docker image (${{ matrix.variant }})
+    needs:
+      - prepare
+      - check-docker
+    if: ${{ needs.prepare.outputs.release_docker == 'true' }}
+    runs-on: ubuntu-latest
+    timeout-minutes: 360
+    permissions:
+      contents: read
+      packages: write
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - variant: core
+            dockerfile: docker/Dockerfile.core
+            image: ghcr.io/kreuzberg-dev/kreuzberg
+            tag_suffix: "-core"
+            extra_tag: "core"
+          - variant: full
+            dockerfile: docker/Dockerfile.full
+            image: ghcr.io/kreuzberg-dev/kreuzberg
+            tag_suffix: ""
+            extra_tag: "latest"
+          - variant: cli
+            dockerfile: docker/Dockerfile.cli
+            image: ghcr.io/kreuzberg-dev/kreuzberg-cli
+            tag_suffix: ""
+            extra_tag: "latest"
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v6
+        with:
+          ref: ${{ needs.prepare.outputs.checkout_ref }}
+          fetch-depth: 0
+          submodules: recursive
+
+      - name: Free up disk space
+        uses: kreuzberg-dev/actions/free-disk-space-linux@v1
+
+      # The SHA is handed over through the environment and quoted instead of
+      # being expanded into the command text, so its value cannot alter the
+      # command line.
+      - name: Ensure target commit
+        if: ${{ needs.prepare.outputs.target_sha != '' }}
+        env:
+          TARGET_SHA: ${{ needs.prepare.outputs.target_sha }}
+        run: git checkout --progress --force "$TARGET_SHA"
+
+      - name: Set up QEMU
+        uses: docker/setup-qemu-action@v4
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v4
+
+      # True when this variant's tag already exists and no republish was
+      # forced; the steps below use the inverse condition.
+      - name: Skip because tag already exists
+        if: >-
+          needs.prepare.outputs.force_republish != 'true'
+          && (
+          (matrix.variant == 'core' && needs.check-docker.outputs.core_exists == 'true')
+          || (matrix.variant == 'full' && needs.check-docker.outputs.full_exists == 'true')
+          || (matrix.variant == 'cli' && needs.check-docker.outputs.cli_exists == 'true')
+          )
+        run: echo "Docker tag already exists for variant ${{ matrix.variant }}; skipping publish." >> "$GITHUB_STEP_SUMMARY"
+
+      - name: Build AMD64 test image
+        if: >-
+          needs.prepare.outputs.force_republish == 'true'
+          || (matrix.variant == 'core' && needs.check-docker.outputs.core_exists != 'true')
+          || (matrix.variant == 'full' && needs.check-docker.outputs.full_exists != 'true')
+          || (matrix.variant == 'cli' && needs.check-docker.outputs.cli_exists != 'true')
+        run: docker build -f ${{ matrix.dockerfile }} --build-arg ONNXRUNTIME_VERSION=${{ env.ORT_VERSION }} -t kreuzberg-publish:${{ matrix.variant }}-test .
+
+      - name: Run Docker tests
+        if: >-
+          needs.prepare.outputs.force_republish == 'true'
+          || (matrix.variant == 'core' && needs.check-docker.outputs.core_exists != 'true')
+          || (matrix.variant == 'full' && needs.check-docker.outputs.full_exists != 'true')
+          || (matrix.variant == 'cli' && needs.check-docker.outputs.cli_exists != 'true')
+        run: python3 scripts/ci/docker/test_docker.py --image kreuzberg-publish:${{ matrix.variant }}-test --variant ${{ matrix.variant }} --verbose
+
+      # From here on the steps additionally require that this is not a dry
+      # run.
+      - name: Log in to GitHub Container Registry
+        if: >-
+          needs.prepare.outputs.dry_run != 'true'
+          && (
+          needs.prepare.outputs.force_republish == 'true'
+          || (matrix.variant == 'core' && needs.check-docker.outputs.core_exists != 'true')
+          || (matrix.variant == 'full' && needs.check-docker.outputs.full_exists != 'true')
+          || (matrix.variant == 'cli' && needs.check-docker.outputs.cli_exists != 'true')
+          )
+        uses: docker/login-action@v4
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+
+      # Two raw tags per image: "<version><tag_suffix>" and the variant's
+      # extra tag from the matrix.
+      - name: Extract Docker metadata
+        if: >-
+          needs.prepare.outputs.dry_run != 'true'
+          && (
+          needs.prepare.outputs.force_republish == 'true'
+          || (matrix.variant == 'core' && needs.check-docker.outputs.core_exists != 'true')
+          || (matrix.variant == 'full' && needs.check-docker.outputs.full_exists != 'true')
+          || (matrix.variant == 'cli' && needs.check-docker.outputs.cli_exists != 'true')
+          )
+        id: docker_meta
+        uses: docker/metadata-action@v6
+        with:
+          images: ${{ matrix.image }}
+          tags: |
+            type=raw,value=${{ needs.prepare.outputs.version }}${{ matrix.tag_suffix }}
+            type=raw,value=${{ matrix.extra_tag }}
+
+      # Multi-platform build (linux/amd64 and linux/arm64) that is pushed
+      # directly. Runs only when this is not a dry run and the variant still
+      # needs publishing (or a republish was forced).
+      - name: Build and push image
+        if: ${{ needs.prepare.outputs.dry_run != 'true' && (needs.prepare.outputs.force_republish == 'true' || (matrix.variant == 'core' && needs.check-docker.outputs.core_exists != 'true') || (matrix.variant == 'full' && needs.check-docker.outputs.full_exists != 'true') || (matrix.variant == 'cli' && needs.check-docker.outputs.cli_exists != 'true')) }}
+        uses: docker/build-push-action@v7
+        with:
+          context: .
+          file: ${{ matrix.dockerfile }}
+          push: true
+          build-args: |
+            ONNXRUNTIME_VERSION=${{ env.ORT_VERSION }}
+          tags: ${{ steps.docker_meta.outputs.tags }}
+          labels: |
+            ${{ steps.docker_meta.outputs.labels }}
+            org.opencontainers.image.source=${{ github.server_url }}/${{ github.repository }}
+            org.opencontainers.image.description=Kreuzberg document intelligence - ${{ matrix.variant }} variant
+            org.opencontainers.image.licenses=MIT
+          platforms: linux/amd64,linux/arm64
+          # Import from the same per-variant scope that cache-to exports to.
+          # Without an explicit scope the import uses the default scope and
+          # never finds the layers written below.
+          cache-from: type=gha,scope=publish-docker-${{ matrix.variant }}
+          cache-to: type=gha,mode=max,scope=publish-docker-${{ matrix.variant }}
+
+      # Dry runs only: the script receives image, version and tag suffix
+      # through the environment.
+      - name: Docker dry-run summary
+        if: ${{ needs.prepare.outputs.dry_run == 'true' }}
+        run: scripts/publish/docker/dry-run-summary.sh
+        env:
+          IMAGE: ${{ matrix.image }}
+          VERSION: ${{ needs.prepare.outputs.version }}
+          TAG_SUFFIX: ${{ matrix.tag_suffix }}
+
+      # Always runs; `|| true` keeps a missing test image from failing the
+      # job.
+      - name: Clean up local Docker images
+        if: always()
+        run: docker rmi kreuzberg-publish:${{ matrix.variant }}-test || true
--- a/.github/workflows/publish-helm.yaml
+++ b/.github/workflows/publish-helm.yaml
@@ -0,0 +1,108 @@
+name: Publish Helm Chart
+
+# Triggered by a published release, a `publish-helm` repository dispatch,
+# or manually with an explicit tag.
+on:
+  release:
+    types:
+      - published
+  repository_dispatch:
+    types:
+      - publish-helm
+  workflow_dispatch:
+    inputs:
+      tag:
+        description: "Release tag to build (e.g., v4.3.6)"
+        type: string
+        required: true
+      dry_run:
+        description: "Prepare artifacts without publishing"
+        type: boolean
+        required: false
+        default: false
+
+# Runs for the same tag/ref are queued rather than cancelled.
+concurrency:
+  cancel-in-progress: false
+  group: ${{ github.workflow }}-${{ (github.event_name == 'workflow_dispatch' && github.event.inputs.tag) || github.ref || github.run_id }}
+
+permissions:
+  contents: read
+
+jobs:
+  # Stamps charts/kreuzberg with the release version, packages it and pushes
+  # the package to GHCR as an OCI artifact. On a dry run the push is replaced
+  # by a summary of what would have been pushed. Skipped for prerelease
+  # `release` events.
+  publish-helm:
+    name: Publish Helm chart to GHCR
+    if: ${{ github.event_name != 'release' || !github.event.release.prerelease }}
+    runs-on: ubuntu-latest
+    permissions:
+      contents: read
+      packages: write
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v6
+
+      # Event data is handed to the script through environment variables
+      # instead of being expanded into the script text, so a crafted tag
+      # (for example in a repository_dispatch payload) cannot inject shell
+      # commands.
+      - name: Resolve version
+        id: meta
+        env:
+          EVENT_NAME: ${{ github.event_name }}
+          INPUT_TAG: ${{ inputs.tag }}
+          RELEASE_TAG: ${{ github.event.release.tag_name }}
+          DISPATCH_TAG: ${{ github.event.client_payload.tag }}
+          DRY_RUN: ${{ inputs.dry_run || 'false' }}
+        run: |
+          case "$EVENT_NAME" in
+            workflow_dispatch) TAG="$INPUT_TAG" ;;
+            release) TAG="$RELEASE_TAG" ;;
+            repository_dispatch) TAG="$DISPATCH_TAG" ;;
+            *) TAG="" ;;
+          esac
+
+          # Fail here rather than later in `helm package` with an empty or
+          # malformed chart version.
+          if [[ -z "$TAG" ]]; then
+            echo "::error::No release tag was supplied by the ${EVENT_NAME} event"
+            exit 1
+          fi
+
+          # Strip a leading "v": v4.3.6 -> 4.3.6.
+          VERSION="${TAG#v}"
+
+          # The version is later written into Chart.yaml via sed and used in a
+          # file name, so only SemVer characters are accepted.
+          if [[ ! "$VERSION" =~ ^[0-9A-Za-z.+-]+$ ]]; then
+            echo "::error::Tag '${TAG}' does not yield a valid chart version"
+            exit 1
+          fi
+
+          {
+            echo "tag=${TAG}"
+            echo "version=${VERSION}"
+            echo "dry_run=${DRY_RUN}"
+          } >> "$GITHUB_OUTPUT"
+
+          {
+            echo "## Helm Publish Metadata"
+            echo "- **Tag**: \`${TAG}\`"
+            echo "- **Version**: \`${VERSION}\`"
+            echo "- **Dry Run**: \`${DRY_RUN}\`"
+          } >> "$GITHUB_STEP_SUMMARY"
+
+      - name: Setup Helm
+        uses: azure/setup-helm@v5
+
+      - name: Lint chart
+        run: helm lint --strict charts/kreuzberg/
+
+      # Rewrites the `version` and `appVersion` lines in place and echoes the
+      # resulting file into the step summary.
+      - name: Update Chart.yaml version
+        env:
+          VERSION: ${{ steps.meta.outputs.version }}
+        run: |
+          sed -i "s/^version:.*/version: ${VERSION}/" charts/kreuzberg/Chart.yaml
+          sed -i "s/^appVersion:.*/appVersion: \"${VERSION}\"/" charts/kreuzberg/Chart.yaml
+          {
+            echo "### Chart.yaml"
+            echo '```yaml'
+            cat charts/kreuzberg/Chart.yaml
+            echo '```'
+          } >> "$GITHUB_STEP_SUMMARY"
+
+      - name: Package chart
+        run: |
+          helm package charts/kreuzberg/ --destination .helm-packages/
+          echo "### Packaged" >> "$GITHUB_STEP_SUMMARY"
+          ls -lh .helm-packages/ >> "$GITHUB_STEP_SUMMARY"
+
+      - name: Log in to GitHub Container Registry
+        if: ${{ steps.meta.outputs.dry_run != 'true' }}
+        uses: docker/login-action@v4
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Push chart to GHCR
+        if: ${{ steps.meta.outputs.dry_run != 'true' }}
+        env:
+          VERSION: ${{ steps.meta.outputs.version }}
+        run: |
+          helm push ".helm-packages/kreuzberg-${VERSION}.tgz" oci://ghcr.io/kreuzberg-dev/charts
+          echo "### Published" >> "$GITHUB_STEP_SUMMARY"
+          echo "Chart pushed to \`oci://ghcr.io/kreuzberg-dev/charts/kreuzberg:${VERSION}\`" >> "$GITHUB_STEP_SUMMARY"
+
+      - name: Dry-run summary
+        if: ${{ steps.meta.outputs.dry_run == 'true' }}
+        env:
+          VERSION: ${{ steps.meta.outputs.version }}
+        run: |
+          echo "### Dry Run" >> "$GITHUB_STEP_SUMMARY"
+          echo "Would have pushed \`kreuzberg-${VERSION}.tgz\` to \`oci://ghcr.io/kreuzberg-dev/charts\`" >> "$GITHUB_STEP_SUMMARY"
--- a/.github/workflows/publish-pubdev.yaml
+++ b/.github/workflows/publish-pubdev.yaml
@@ -0,0 +1,46 @@
+name: Publish pub.dev
+
+# pub.dev OIDC trusted publishing rejects tokens originating from `release`
+# events; only `push` and `workflow_dispatch` are accepted.
+#
+# Because the kreuzberg Dart package embeds platform-specific native binaries
+# (Android JNI, iOS XCFramework, server libs for linux/macos/windows), we
+# cannot just rebuild here — those artifacts are produced by the main
+# `publish.yaml` workflow. Instead, the main workflow's `trigger-pubdev` job
+# dispatches this workflow with the run_id of the main workflow, and this
+# workflow downloads the `dart-package-assembled` artifact from that run.
+#
+# One-time setup: on pub.dev → kreuzberg package → Admin → Automated publishing,
+# set the workflow path to `.github/workflows/publish-pubdev.yaml`.
+
+on:
+  workflow_dispatch:
+    inputs:
+      run_id:
+        description: "GitHub Actions run ID of publish.yaml that produced the dart-package-assembled artifact"
+        type: string
+        required: true
+
+# id-token: write is requested for the OIDC publishing described above;
+# actions: read accompanies the artifact download from another run.
+permissions:
+  id-token: write
+  actions: read
+  contents: read
+
+env:
+  FORCE_JAVASCRIPT_ACTIONS_TO_NODE24: "true"
+
+jobs:
+  # Fetches the assembled Dart package from the run given by `run_id` into
+  # packages/dart and publishes it from there.
+  publish-pub:
+    name: Publish pub.dev
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/download-artifact@v8.0.1
+        with:
+          run-id: ${{ inputs.run_id }}
+          github-token: ${{ secrets.GITHUB_TOKEN }}
+          name: dart-package-assembled
+          path: packages/dart
+
+      - uses: kreuzberg-dev/actions/publish-pub@v1
+        with:
+          package-dir: packages/dart
--- a/.github/workflows/publish.yaml
+++ b/.github/workflows/publish.yaml
--- a/.github/workflows/validate-issues.yml
+++ b/.github/workflows/validate-issues.yml
@@ -0,0 +1,10 @@
+name: Validate Issues
+
+# Runs whenever an issue is opened or edited.
+on:
+  issues:
+    types:
+      - opened
+      - edited
+
+jobs:
+  # Delegates to the shared reusable workflow and passes all secrets on.
+  validate:
+    secrets: inherit
+    uses: kreuzberg-dev/actions/.github/workflows/reusable-validate-issues.yml@v1
--- a/.github/workflows/validate-pr.yml
+++ b/.github/workflows/validate-pr.yml
@@ -0,0 +1,10 @@
+name: Validate PR
+
+# Runs when a pull request is opened, edited, or receives new commits.
+on:
+  pull_request:
+    types:
+      - opened
+      - edited
+      - synchronize
+
+jobs:
+  # Delegates to the shared reusable workflow and passes all secrets on.
+  validate:
+    secrets: inherit
+    uses: kreuzberg-dev/actions/.github/workflows/reusable-validate-pr.yml@v1