Nomad changes

2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions
--- a/scripts/benchmarks/ensure-benchmark-harness-exists.sh
+++ b/scripts/benchmarks/ensure-benchmark-harness-exists.sh
@@ -0,0 +1,16 @@
+#!/usr/bin/env bash
+# Verify that the benchmark harness directory exists in this checkout.
+#
+# Exits 1 with a GitHub Actions `::error::` annotation when
+# tools/benchmark-harness is missing under the repository root.
+#
+# Environment:
+#   GITHUB_REF - optional; only used to name the ref in the error message.
+
+set -euo pipefail
+
+REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
+
+source "${REPO_ROOT}/scripts/lib/common.sh"
+
+validate_repo_root "$REPO_ROOT" || exit 1
+
+HARNESS_DIR="$REPO_ROOT/tools/benchmark-harness"
+
+if [ ! -d "$HARNESS_DIR" ]; then
+  # GITHUB_REF is unset outside GitHub Actions; default it so `set -u` does not
+  # abort with "unbound variable" before the real error is reported.
+  echo "::error::tools/benchmark-harness not found on branch ${GITHUB_REF:-<unknown>}." >&2
+  exit 1
+fi
+
+echo "✓ Benchmark harness directory verified at: $HARNESS_DIR"
--- a/scripts/benchmarks/restore-binary-permissions.sh
+++ b/scripts/benchmarks/restore-binary-permissions.sh
@@ -0,0 +1,26 @@
+#!/usr/bin/env bash
+
+set -euo pipefail
+
+REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
+
+source "${REPO_ROOT}/scripts/lib/common.sh"
+
+validate_repo_root "$REPO_ROOT" || exit 1
+
+BINARY_PATH="${BINARY_PATH:-$REPO_ROOT/target/release/benchmark-harness}"
+
+if [ ! -f "$BINARY_PATH" ]; then
+  echo "::error::Binary not found at $BINARY_PATH" >&2
+  exit 1
+fi
+
+chmod +x "$BINARY_PATH"
+echo "✓ Restored executable permissions on: $BINARY_PATH"
+
+# Also restore kreuzberg-cli if present (used by all kreuzberg adapter pipelines)
+CLI_BINARY="$REPO_ROOT/target/release/kreuzberg"
+if [ -f "$CLI_BINARY" ]; then
+  chmod +x "$CLI_BINARY"
+  echo "✓ Restored executable permissions on: $CLI_BINARY"
+fi
--- a/scripts/benchmarks/run-benchmark.sh
+++ b/scripts/benchmarks/run-benchmark.sh
@@ -0,0 +1,57 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+FRAMEWORK="${FRAMEWORK:-}"
+MODE="${MODE:-}"
+ITERATIONS="${ITERATIONS:-3}"
+TIMEOUT="${TIMEOUT:-900}"
+FIXTURES_DIR="${FIXTURES_DIR:-tools/benchmark-harness/fixtures}"
+HARNESS_PATH="${HARNESS_PATH:-./target/release/benchmark-harness}"
+MEASURE_QUALITY="${MEASURE_QUALITY:-false}"
+OCR_ENABLED="${OCR_ENABLED:-false}"
+OUTPUT_FORMAT="${OUTPUT_FORMAT:-markdown}"
+
+if [ -z "$FRAMEWORK" ] || [ -z "$MODE" ]; then
+  echo "::error::FRAMEWORK and MODE environment variables are required" >&2
+  exit 1
+fi
+
+REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
+
+source "${REPO_ROOT}/scripts/lib/common.sh"
+source "${REPO_ROOT}/scripts/lib/library-paths.sh"
+
+validate_repo_root "$REPO_ROOT" || exit 1
+
+setup_go_paths "$REPO_ROOT"
+setup_onnx_paths
+
+OUTPUT_DIR="benchmark-results/${FRAMEWORK}-${OUTPUT_FORMAT}-${MODE}"
+rm -rf "${OUTPUT_DIR}"
+
+MAX_CONCURRENT=$([[ "$MODE" == "single-file" ]] && echo 1 || echo 4)
+
+SHARD="${SHARD:-}"
+
+EXTRA_ARGS=()
+if [ "$MEASURE_QUALITY" = "true" ]; then
+  EXTRA_ARGS+=("--measure-quality")
+fi
+if [ "$OCR_ENABLED" = "true" ]; then
+  EXTRA_ARGS+=("--ocr")
+fi
+if [ -n "$SHARD" ]; then
+  EXTRA_ARGS+=("--shard" "${SHARD}")
+fi
+
+BENCHMARK_DEBUG=1 "${HARNESS_PATH}" \
+  run \
+  --fixtures "${FIXTURES_DIR}" \
+  --frameworks "${FRAMEWORK}" \
+  --output "${OUTPUT_DIR}" \
+  --iterations "${ITERATIONS}" \
+  --timeout "${TIMEOUT}" \
+  --mode "${MODE}" \
+  --max-concurrent "${MAX_CONCURRENT}" \
+  --output-format "${OUTPUT_FORMAT}" \
+  "${EXTRA_ARGS[@]+"${EXTRA_ARGS[@]}"}"
--- a/scripts/ci/README.md
+++ b/scripts/ci/README.md
@@ -0,0 +1,242 @@
+# CI Workflow Scripts
+
+This directory contains scripts extracted from the GitHub Actions CI workflows, organized by workflow type.
+
+## Overview
+
+- **Total Scripts**: 41 (27 Bash + 14 PowerShell)
+- **Documentation**: See `SCRIPT_MAPPING.md` for detailed workflow-to-script mapping
+- **All Scripts**: Production-ready with proper error handling and documentation
+
+## Directory Structure
+
+```text
+scripts/ci/
+├── README.md               ← This file
+├── SCRIPT_MAPPING.md       ← Detailed workflow-to-script mapping guide
+├── docker/                 ← Docker image build and test scripts
+├── go/                     ← Go bindings scripts
+├── java/                   ← Java bindings scripts
+├── node/                   ← Node/TypeScript NAPI scripts
+├── python/                 ← Python wheel build scripts
+├── ruby/                   ← Ruby gem build scripts
+├── rust/                   ← Rust core and CLI scripts
+├── csharp/                 ← C# bindings scripts
+└── validate/               ← Validation and linting scripts
+```
+
+## Quick Start
+
+### Running a Script
+
+**Bash scripts:**
+
+```bash
+./scripts/ci/docker/build-image.sh core
+./scripts/ci/python/run-tests.sh true
+```
+
+**PowerShell scripts:**
+
+```powershell
+& ./scripts/ci/go/build-ffi.ps1
+& ./scripts/ci/rust/package-cli-windows.ps1 -Target "x86_64-pc-windows-msvc"
+```
+
+### Sourcing Scripts
+
+For library path setup scripts:
+
+```bash
+source ./scripts/lib/library-paths.sh
+setup_all_library_paths
+./scripts/ci/python/run-tests.sh true
+```
+
+## Scripts by Workflow
+
+### Docker (`docker/`)
+
+- `free-disk-space.sh` - Clean up CI disk space
+- `build-image.sh` - Build Docker image variant
+- `check-image-size.sh` - Validate image size constraints
+- `save-image.sh` - Save Docker image as tar.gz artifact
+- `collect-logs.sh` - Collect container logs on failure
+- `cleanup.sh` - Clean up Docker resources
+- `summary.sh` - Print test summary
+
+### Go (`go/`)
+
+- `build-ffi.sh` - Build FFI library (Unix)
+- `build-ffi.ps1` - Build FFI library (Windows)
+- `build-bindings.sh` - Build Go bindings with CGO (Unix)
+- `build-bindings.ps1` - Build Go bindings with CGO (Windows)
+- `reorganize-libraries.ps1` - Reorganize FFI libraries for Windows
+- `run-tests.sh` - Run Go tests with library paths
+
+### Java (`java/`)
+
+- `build-java.sh` - Build Java bindings with Maven
+- `run-tests.sh` - Run Java tests with Maven
+
+### Node/TypeScript (`node/`)
+
+- `build-napi.sh` - Build NAPI bindings with artifact collection
+- `unpack-bindings.sh` - Unpack and install bindings from tarball
+
+### Python (`python/`)
+
+- `clean-artifacts.sh` - Clean previous wheel artifacts
+- `smoke-test-wheel.sh` - Test wheel installation
+- `install-wheel.sh` - Install platform-specific wheel
+- `run-tests.sh` - Run tests with optional coverage
+
+### Ruby (`ruby/`)
+
+- `install-ruby-deps.sh` - Install bundle dependencies (Unix)
+- `install-ruby-deps.ps1` - Install bundle dependencies (Windows)
+- `vendor-kreuzberg-core.py` - Vendor core crate for packaging
+- `configure-bindgen-windows.ps1` - Configure bindgen headers (Windows)
+- `configure-tesseract-windows.ps1` - Configure Tesseract (Windows)
+- `build-gem.sh` - Build Ruby gem
+- `install-gem.sh` - Install built gem
+- `compile-extension.sh` - Compile native extension
+- `run-tests.sh` - Run RSpec tests
+
+### Rust (`rust/`)
+
+- `configure-bindgen-windows.ps1` - Configure bindgen headers (Windows)
+- `run-unit-tests.sh` - Run Rust unit tests
+- `package-cli-unix.sh` - Package CLI as tar.gz (Unix)
+- `package-cli-windows.ps1` - Package CLI as zip (Windows)
+- `test-cli-unix.sh` - Test CLI binary (Unix)
+- `test-cli-windows.ps1` - Test CLI binary (Windows)
+
+### C# (`csharp/`)
+
+- `build-csharp.sh` - Build C# bindings with dotnet
+- `run-tests.sh` - Run C# tests with dotnet
+
+### Validate (`validate/`)
+
+- `run-lint.sh` - Run all linting and validation checks via Task
+
+## Features
+
+### Error Handling
+
+- All Bash scripts use `set -euo pipefail`
+- All PowerShell scripts use `Set-StrictMode` and error action preferences
+- Proper exit codes and error messages
+- Usage information for incorrect arguments
+
+### Documentation
+
+- Every script has a descriptive header
+- Each header states the script's purpose and usage
+- Each header names the CI workflow step that uses the script
+- Each header documents the script's arguments
+
+### Platform Support
+
+- Windows-specific operations via PowerShell (.ps1)
+- Unix operations via Bash (.sh)
+- Cross-platform scripts detect the OS and adjust their behavior
+- Library path setup scripts handle Windows/Linux/macOS
+
+### Reusability
+
+- `library-paths.sh` (`scripts/lib/`) - Shared by all workflows for native library configuration
+- `configure-bindgen-windows.ps1` used by Ruby and Rust
+- Common patterns consolidated into single scripts
+
+## Detailed Documentation
+
+For comprehensive workflow-to-script mapping and usage examples, see `SCRIPT_MAPPING.md`.
+
+## Usage in Workflows
+
+### Example: ci-docker.yaml
+
+**Before (inline commands):**
+
+```yaml
+- name: Free up disk space
+  run: |
+    echo "=== Initial disk space ==="
+    df -h /
+    echo "=== Removing unnecessary packages ==="
+    sudo rm -rf /usr/share/dotnet
+    # ... 30+ lines of commands ...
+```
+
+**After (using script):**
+
+```yaml
+- name: Free up disk space
+  run: ./scripts/ci/docker/free-disk-space.sh
+```
+
+### Example: ci-python.yaml
+
+**Before (inline commands):**
+
+```yaml
+- name: Run Python tests
+  run: |
+    cd packages/python
+    if [ "${{ matrix.coverage }}" = "true" ]; then
+      uv run pytest -vv --cov=kreuzberg --cov-report=lcov:coverage.lcov ...
+    else
+      uv run pytest -vv --reruns 1 --reruns-delay 1
+    fi
+```
+
+**After (using script):**
+
+```yaml
+- name: Run Python tests
+  run: ./scripts/ci/python/run-tests.sh ${{ matrix.coverage }}
+```
+
+## Testing Scripts Locally
+
+You can test scripts locally before running them in CI:
+
+```bash
+# Test Docker scripts
+./scripts/ci/docker/free-disk-space.sh
+
+# Test Python scripts
+./scripts/ci/python/clean-artifacts.sh
+./scripts/ci/python/run-tests.sh false
+
+# Test Rust scripts
+./scripts/ci/rust/run-unit-tests.sh
+```
+
+## Shell Compatibility
+
+- **Bash scripts**: Compatible with bash 3.2+ (macOS) and bash 4.0+ (Linux)
+- **PowerShell scripts**: Compatible with PowerShell 5.1+ (Windows) and PowerShell Core 7+ (cross-platform)
+
+## Contributing
+
+When adding new CI steps or modifying existing ones:
+
+1. Extract the inline script into a separate file in the appropriate directory
+2. Add proper error handling (`set -euo pipefail` for bash)
+3. Include descriptive header comments
+4. Update `SCRIPT_MAPPING.md` with the new mapping
+5. Test the script locally before committing
+
+## Maintenance
+
+Scripts should be reviewed and updated when:
+
+- Updating CI workflow logic
+- Changing build tools or versions
+- Improving error handling
+- Adding new platform support
+
+See each script's header for detailed documentation on its purpose and usage.
--- a/scripts/ci/actions/setup-onnx-runtime/linux.sh
+++ b/scripts/ci/actions/setup-onnx-runtime/linux.sh
@@ -0,0 +1,90 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+ort_version="${1:?ort-version required}"
+dest_dir="${2:-crates/kreuzberg-node}"
+arch_id="${3:-}"
+strategy="${4:-system}"
+
+extract_dir="$RUNNER_TEMP/onnxruntime"
+
+if [ -z "$arch_id" ]; then
+  case "$(uname -m)" in
+  x86_64 | amd64) arch_id="x64" ;;
+  arm64 | aarch64) arch_id="arm64" ;;
+  *)
+    echo "Unsupported Linux architecture: $(uname -m)" >&2
+    exit 1
+    ;;
+  esac
+fi
+
+case "$arch_id" in
+x64)
+  ort_dir_name="onnxruntime-linux-x64-${ort_version}"
+  archive="onnxruntime-linux-x64-${ort_version}.tgz"
+  ;;
+arm64)
+  ort_dir_name="onnxruntime-linux-aarch64-${ort_version}"
+  archive="onnxruntime-linux-aarch64-${ort_version}.tgz"
+  ;;
+*)
+  echo "Unsupported Linux arch-id: $arch_id" >&2
+  exit 1
+  ;;
+esac
+
+if [ ! -d "$extract_dir/$ort_dir_name" ]; then
+  echo "Cache miss: Downloading ONNX Runtime ${ort_version}"
+  curl -fsSL --retry 5 --retry-delay 5 --retry-all-errors -o "$RUNNER_TEMP/$archive" "https://github.com/microsoft/onnxruntime/releases/download/v${ort_version}/$archive"
+  mkdir -p "$extract_dir"
+  tar -xzf "$RUNNER_TEMP/$archive" -C "$extract_dir"
+else
+  echo "Cache hit: Using cached ONNX Runtime ${ort_version}"
+fi
+
+ort_root="$extract_dir/$ort_dir_name"
+
+if [ ! -d "$ort_root/lib" ]; then
+  echo "ERROR: ONNX Runtime lib directory missing at $ort_root/lib" >&2
+  echo "Available directories:" >&2
+  ls -la "$extract_dir" >&2 || true
+  exit 1
+fi
+
+if ! ls "$ort_root/lib"/*.so* 1>/dev/null 2>&1; then
+  echo "ERROR: No ONNX Runtime libraries found in $ort_root/lib" >&2
+  echo "Directory contents:" >&2
+  ls -la "$ort_root/lib" >&2 || true
+  exit 1
+fi
+
+dest="$GITHUB_WORKSPACE/$dest_dir"
+mkdir -p "$dest"
+cp -f "$ort_root/lib/"*.so* "$dest/"
+
+if [ -n "${RUSTFLAGS:-}" ]; then
+  rustflags="$RUSTFLAGS -L $ort_root/lib"
+else
+  rustflags="-L $ort_root/lib"
+fi
+
+if [ "$strategy" = "bundled" ]; then
+  echo "Using bundled ORT strategy — letting ort-sys download-binaries handle static linking"
+  {
+    echo "LD_LIBRARY_PATH=$ort_root/lib:$dest:${LD_LIBRARY_PATH:-}"
+    echo "LIBRARY_PATH=$ort_root/lib:$dest:${LIBRARY_PATH:-}"
+  } >>"$GITHUB_ENV"
+else
+  {
+    ort_lib=$(find "$ort_root/lib" -name "libonnxruntime*.so*" -print -quit)
+    echo "ORT_LIB_LOCATION=$ort_root/lib"
+    echo "ORT_PREFER_DYNAMIC_LINK=1"
+    echo "ORT_SKIP_DOWNLOAD=1"
+    echo "ORT_STRATEGY=system"
+    echo "ORT_DYLIB_PATH=$ort_root/lib/${ort_lib##*/}"
+    echo "LD_LIBRARY_PATH=$ort_root/lib:$dest:${LD_LIBRARY_PATH:-}"
+    echo "LIBRARY_PATH=$ort_root/lib:$dest:${LIBRARY_PATH:-}"
+    echo "RUSTFLAGS=$rustflags"
+  } >>"$GITHUB_ENV"
+fi
--- a/scripts/ci/actions/setup-onnx-runtime/macos.sh
+++ b/scripts/ci/actions/setup-onnx-runtime/macos.sh
@@ -0,0 +1,86 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+ort_version="${1:?ort-version required}"
+dest_dir="${2:-crates/kreuzberg-node}"
+arch_id="${3:-}"
+strategy="${4:-system}"
+
+extract_dir="$RUNNER_TEMP/onnxruntime"
+
+if [ -z "$arch_id" ]; then
+  arch="$(uname -m)"
+  if [ "$arch" = "arm64" ]; then
+    arch_id="arm64"
+  else
+    arch_id="x64"
+  fi
+fi
+
+case "$arch_id" in
+arm64) ort_arch="arm64" ;;
+x64) ort_arch="x86_64" ;;
+*)
+  echo "Unsupported macOS arch-id: $arch_id" >&2
+  exit 1
+  ;;
+esac
+echo "Using macOS ONNX Runtime arch: $ort_arch"
+
+if [ ! -d "$extract_dir/onnxruntime-osx-${ort_arch}-${ort_version}" ]; then
+  echo "Cache miss: Downloading ONNX Runtime ${ort_version} for macOS ${ort_arch}"
+  archive="onnxruntime-osx-${ort_arch}-${ort_version}.tgz"
+  curl -fsSL --retry 5 --retry-delay 5 --retry-all-errors -o "$RUNNER_TEMP/$archive" "https://github.com/microsoft/onnxruntime/releases/download/v${ort_version}/$archive"
+  mkdir -p "$extract_dir"
+  tar -xzf "$RUNNER_TEMP/$archive" -C "$extract_dir"
+else
+  echo "Cache hit: Using cached ONNX Runtime ${ort_version}"
+fi
+
+ort_root="$extract_dir/onnxruntime-osx-${ort_arch}-${ort_version}"
+
+if [ ! -d "$ort_root/lib" ]; then
+  echo "ERROR: ONNX Runtime lib directory missing at $ort_root/lib" >&2
+  echo "Available directories:" >&2
+  ls -la "$extract_dir" >&2 || true
+  exit 1
+fi
+
+if ! ls "$ort_root/lib"/libonnxruntime*.dylib 1>/dev/null 2>&1; then
+  echo "ERROR: No ONNX Runtime libraries found in $ort_root/lib" >&2
+  echo "Directory contents:" >&2
+  ls -la "$ort_root/lib" >&2 || true
+  exit 1
+fi
+
+dest="$GITHUB_WORKSPACE/$dest_dir"
+mkdir -p "$dest"
+cp -f "$ort_root/lib/"libonnxruntime*.dylib "$dest/"
+
+if [ -n "${RUSTFLAGS:-}" ]; then
+  rustflags="$RUSTFLAGS -L $ort_root/lib"
+else
+  rustflags="-L $ort_root/lib"
+fi
+
+if [ "$strategy" = "bundled" ]; then
+  echo "Using bundled ORT strategy — letting ort-sys download-binaries handle static linking"
+  {
+    echo "DYLD_LIBRARY_PATH=$ort_root/lib:$dest:${DYLD_LIBRARY_PATH:-}"
+    echo "DYLD_FALLBACK_LIBRARY_PATH=$ort_root/lib:$dest:${DYLD_FALLBACK_LIBRARY_PATH:-}"
+    echo "LIBRARY_PATH=$ort_root/lib:$dest:${LIBRARY_PATH:-}"
+  } >>"$GITHUB_ENV"
+else
+  {
+    ort_lib=$(find "$ort_root/lib" -name "libonnxruntime*.dylib" -print -quit)
+    echo "ORT_LIB_LOCATION=$ort_root/lib"
+    echo "ORT_PREFER_DYNAMIC_LINK=1"
+    echo "ORT_SKIP_DOWNLOAD=1"
+    echo "ORT_STRATEGY=system"
+    echo "ORT_DYLIB_PATH=$ort_root/lib/${ort_lib##*/}"
+    echo "DYLD_LIBRARY_PATH=$ort_root/lib:$dest:${DYLD_LIBRARY_PATH:-}"
+    echo "DYLD_FALLBACK_LIBRARY_PATH=$ort_root/lib:$dest:${DYLD_FALLBACK_LIBRARY_PATH:-}"
+    echo "LIBRARY_PATH=$ort_root/lib:$dest:${LIBRARY_PATH:-}"
+    echo "RUSTFLAGS=$rustflags"
+  } >>"$GITHUB_ENV"
+fi
--- a/scripts/ci/actions/setup-onnx-runtime/windows.ps1
+++ b/scripts/ci/actions/setup-onnx-runtime/windows.ps1
@@ -0,0 +1,100 @@
+# Download (or reuse) an ONNX Runtime release for Windows, copy its libraries
+# into the workspace and export the related variables through $env:GITHUB_ENV.
+#
+# Usage: windows.ps1 <ortVersion> [destDir] [archId] [strategy]
+#   ortVersion  ONNX Runtime release version, without the leading "v"
+#   destDir     directory under $env:GITHUB_WORKSPACE that receives the
+#               libraries (default: crates/kreuzberg-node)
+#   archId      "arm64" selects the arm64 build; any other value selects x64
+#               (default: $env:RUNNER_ARCH)
+#   strategy    "system" (default) or "bundled"
+$OrtVersion = $args[0]
+if ([string]::IsNullOrWhiteSpace($OrtVersion)) { throw "Usage: windows.ps1 <ortVersion> [destDir] [archId] [strategy]" }
+
+$DestDir = if ($args.Count -ge 2 -and -not [string]::IsNullOrWhiteSpace($args[1])) { $args[1] } else { "crates/kreuzberg-node" }
+$ArchId = if ($args.Count -ge 3) { $args[2] } else { "" }
+$Strategy = if ($args.Count -ge 4 -and -not [string]::IsNullOrWhiteSpace($args[3])) { $args[3] } else { "system" }
+
+$ExtractRoot = Join-Path $env:TEMP "onnxruntime"
+if ([string]::IsNullOrWhiteSpace($ArchId)) {
+  $ArchId = $env:RUNNER_ARCH
+}
+# Interpolate before calling the method: $env:RUNNER_ARCH is $null outside
+# GitHub Actions, and calling a method on $null throws. An empty value falls
+# through to the x64 default below.
+$ArchId = "$ArchId".ToLowerInvariant()
+if ($ArchId -eq "arm64") { $ArchId = "arm64" } else { $ArchId = "x64" }
+
+$OrtRoot = Join-Path $ExtractRoot "onnxruntime-win-$ArchId-$OrtVersion"
+$OrtBin = Join-Path $OrtRoot 'bin'
+$OrtLib = Join-Path $OrtRoot 'lib'
+
+# An already-extracted directory is treated as a cache hit.
+if (-Not (Test-Path $OrtRoot)) {
+  Write-Host "Cache miss: Downloading ONNX Runtime $OrtVersion"
+  $Archive = "onnxruntime-win-$ArchId-$OrtVersion.zip"
+  $DownloadPath = Join-Path $env:TEMP $Archive
+  # NOTE(review): -MaximumRetryCount / -RetryIntervalSec are not available in
+  # Windows PowerShell 5.1; this script presumably runs under pwsh - confirm.
+  Invoke-WebRequest -Uri "https://github.com/microsoft/onnxruntime/releases/download/v$OrtVersion/$Archive" -OutFile $DownloadPath -UseBasicParsing -MaximumRetryCount 5 -RetryIntervalSec 5
+  New-Item -ItemType Directory -Path $ExtractRoot -Force | Out-Null
+  Expand-Archive -Path $DownloadPath -DestinationPath $ExtractRoot -Force
+} else {
+  Write-Host "Cache hit: Using cached ONNX Runtime $OrtVersion"
+}
+
+# In the failure branches below the directory listing is printed BEFORE
+# Write-Error: when $ErrorActionPreference is 'Stop', Write-Error terminates
+# the script and anything after it would never run.
+if (!(Test-Path $OrtLib)) {
+  Get-ChildItem -Path $ExtractRoot -Recurse | Write-Host
+  Write-Error "ERROR: ONNX Runtime lib directory missing at $OrtLib"
+  exit 1
+}
+
+$LibFiles = @(Get-ChildItem -Path $OrtLib -Filter "*.lib" -ErrorAction SilentlyContinue)
+if ($LibFiles.Count -eq 0) {
+  Get-ChildItem -Path $OrtLib | Write-Host
+  Write-Error "ERROR: No ONNX Runtime library files found in $OrtLib"
+  exit 1
+}
+
+# Collect every directory that holds runtime DLLs: lib/ and bin/ first, then a
+# recursive search for onnxruntime.dll, then for any DLL at all.
+$DllDirs = @()
+foreach ($Candidate in @($OrtLib, $OrtBin)) {
+  if (Test-Path $Candidate) {
+    $CandidateDlls = @(Get-ChildItem -Path $Candidate -Filter "*.dll" -File -ErrorAction SilentlyContinue)
+    if ($CandidateDlls.Count -gt 0) {
+      $DllDirs += $Candidate
+    }
+  }
+}
+if ($DllDirs.Count -eq 0) {
+  $OrtDll = Get-ChildItem -Path $OrtRoot -Recurse -Filter "onnxruntime.dll" -File -ErrorAction SilentlyContinue | Select-Object -First 1
+  if ($OrtDll) { $DllDirs += $OrtDll.DirectoryName }
+}
+if ($DllDirs.Count -eq 0) {
+  $AnyDll = Get-ChildItem -Path $OrtRoot -Recurse -Filter "*.dll" -File -ErrorAction SilentlyContinue | Select-Object -First 1
+  if ($AnyDll) { $DllDirs += $AnyDll.DirectoryName }
+}
+# @() keeps the result an array even when the pipeline yields one item or none.
+$DllDirs = @($DllDirs | Select-Object -Unique)
+if ($DllDirs.Count -eq 0) {
+  Get-ChildItem -Path $OrtRoot -Recurse | Write-Host
+  Write-Error "ERROR: No ONNX Runtime runtime DLLs found under $OrtRoot"
+  exit 1
+}
+
+$Dest = Join-Path $env:GITHUB_WORKSPACE $DestDir
+New-Item -ItemType Directory -Path $Dest -Force | Out-Null
+Copy-Item -Path (Join-Path $OrtLib '*') -Destination $Dest -Force
+foreach ($Dir in $DllDirs) {
+  Copy-Item -Path (Join-Path $Dir '*.dll') -Destination $Dest -Force
+}
+
+# Append the ORT lib directory to any RUSTFLAGS already present.
+$RustFlags = if ($env:RUSTFLAGS) { "$env:RUSTFLAGS -L $OrtLib" } else { "-L $OrtLib" }
+
+if ($Strategy -eq "bundled") {
+  # ort-sys has no prebuilt static binaries for x86_64-pc-windows-gnu (MSYS2/MinGW).
+  # Use the pre-downloaded Microsoft ORT with dynamic linking for Windows GNU targets.
+  Write-Host "Using bundled ORT strategy (Windows) - dynamic linking against pre-downloaded ORT (no static binaries for windows-gnu)"
+  @(
+    "ORT_LIB_LOCATION=$OrtLib"
+    "ORT_PREFER_DYNAMIC_LINK=1"
+    "RUSTFLAGS=$RustFlags"
+    "LIB=$OrtLib;$env:LIB"
+    "LIBRARY_PATH=$OrtLib;$env:LIBRARY_PATH"
+    "PATH=$Dest;$env:PATH"
+  ) | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append
+} else {
+  @(
+    "ORT_LIB_LOCATION=$OrtLib"
+    "ORT_PREFER_DYNAMIC_LINK=1"
+    "ORT_SKIP_DOWNLOAD=1"
+    "ORT_STRATEGY=system"
+    "ORT_DYLIB_PATH=$Dest\onnxruntime.dll"
+    "RUSTFLAGS=$RustFlags"
+    "LIB=$OrtLib;$env:LIB"
+    "LIBRARY_PATH=$OrtLib;$env:LIBRARY_PATH"
+    "PATH=$Dest;$env:PATH"
+  ) | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append
+}
--- a/scripts/ci/actions/setup-prebuilt-onnx/prepare.sh
+++ b/scripts/ci/actions/setup-prebuilt-onnx/prepare.sh
@@ -0,0 +1,48 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+target="${1:?target required}"
+
+case "$target" in
+aarch64-apple-darwin)
+  ort_url="https://cdn.pyke.io/0/pyke:ort-rs/ms@1.24.1/aarch64-apple-darwin.tgz"
+  ;;
+x86_64-apple-darwin)
+  ort_url="https://cdn.pyke.io/0/pyke:ort-rs/ms@1.24.1/x86_64-apple-darwin.tgz"
+  ;;
+*)
+  echo "setup-prebuilt-onnx does not support target $target" >&2
+  exit 1
+  ;;
+esac
+
+ort_dir="${GITHUB_WORKSPACE}/target/onnxruntime/${target}"
+ort_root="${ort_dir}/onnxruntime"
+ort_lib="${ort_root}/lib"
+
+write_env() {
+  {
+    echo "ORT_STRATEGY=system"
+    echo "ORT_LIB_LOCATION=${ort_lib}"
+    echo "ORT_SKIP_DOWNLOAD=1"
+    echo "ORT_PREFER_DYNAMIC_LINK=1"
+  } >>"${GITHUB_ENV}"
+}
+
+if [ ! -f "${ort_lib}/libonnxruntime.a" ]; then
+  rm -rf "${ort_dir}"
+  mkdir -p "${ort_lib}"
+
+  echo "Attempting to download prebuilt ONNX Runtime for ${target}..." >&2
+  if curl -fsSL --max-time 30 -o /tmp/ort.tgz "${ort_url}" 2>/dev/null; then
+    tar -xz -C "${ort_lib}" -f /tmp/ort.tgz
+    rm -f /tmp/ort.tgz
+    write_env
+  else
+    echo "Warning: Prebuilt ONNX Runtime not available for ${target}" >&2
+    echo "Will download and build ONNX Runtime during compilation" >&2
+  fi
+else
+  echo "Using existing ONNX Runtime at ${ort_lib}" >&2
+  write_env
+fi
--- a/scripts/ci/actions/setup-rust/build-with-sccache-fallback.sh
+++ b/scripts/ci/actions/setup-rust/build-with-sccache-fallback.sh
@@ -0,0 +1,29 @@
+#!/usr/bin/env bash
+# Run a build command and, if it fails with an sccache-looking error, run it
+# once more with sccache disabled.
+#
+# Usage: build-with-sccache-fallback.sh <cargo command...>
+#
+# Exit status: 0 when either attempt succeeds, 1 when the build fails,
+# 2 when no command is given.
+set -euo pipefail
+
+# Without a command the pipeline below would run an empty command, which
+# succeeds, and the script would report a successful build.
+if [ "$#" -eq 0 ]; then
+  echo "Usage: ${0##*/} <cargo command...>" >&2
+  exit 2
+fi
+
+log_file=$(mktemp)
+trap 'rm -f "$log_file"' EXIT
+
+echo "Building with sccache (fallback on errors)..."
+
+# Attempt with sccache; output is shown live and also captured for inspection.
+if "$@" 2>&1 | tee "$log_file"; then
+  echo "✓ Build succeeded with sccache"
+  exit 0
+fi
+
+# Retry only when the captured output looks like a cache/network failure.
+if grep -Eq "sccache.*(error|failed)|cache storage failed|dns error|connection (refused|timed out)" "$log_file"; then
+  echo "⚠️  sccache failure detected, retrying without cache..."
+  export RUSTC_WRAPPER=""
+  export SCCACHE_GHA_ENABLED=false
+
+  if "$@"; then
+    echo "✓ Build succeeded without sccache (fallback)"
+    exit 0
+  fi
+fi
+
+echo "✗ Build failed"
+exit 1
--- a/scripts/ci/actions/setup-tesseract-cache/clean-dirs.sh
+++ b/scripts/ci/actions/setup-tesseract-cache/clean-dirs.sh
@@ -0,0 +1,7 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+label="${1:?label required}"
+
+rm -rf ".tesseract-cache/${label}"
+rm -rf ".xdg-cache/${label}"
--- a/scripts/ci/actions/setup-tesseract-cache/clean-target-cache.sh
+++ b/scripts/ci/actions/setup-tesseract-cache/clean-target-cache.sh
@@ -0,0 +1,5 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+rust_target="${1:?rust target required}"
+rm -rf "target/${rust_target}/kreuzberg-tesseract-cache"
--- a/scripts/ci/actions/setup-tesseract-cache/set-outputs.sh
+++ b/scripts/ci/actions/setup-tesseract-cache/set-outputs.sh
@@ -0,0 +1,44 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+label="${1:?label required}"
+enable_cache="${2:?enable-cache required (true/false)}"
+
+if [ "$enable_cache" = "true" ]; then
+  cache_dir="${GITHUB_WORKSPACE}/.tesseract-cache/${label}"
+
+  echo "TESSERACT_RS_CACHE_DIR=${cache_dir}" >>"$GITHUB_ENV"
+  echo "XDG_CACHE_HOME=${GITHUB_WORKSPACE}/.xdg-cache/${label}" >>"$GITHUB_ENV"
+
+  echo "cache-dir=${cache_dir}" >>"$GITHUB_OUTPUT"
+  echo "cache-enabled=true" >>"$GITHUB_OUTPUT"
+
+  docker_opts="--env TESSERACT_RS_CACHE_DIR=/io/.tesseract-cache/${label}"
+  docker_opts="${docker_opts} --env XDG_CACHE_HOME=/io/.xdg-cache/${label}"
+  multiarch=""
+  if command -v dpkg-architecture >/dev/null 2>&1; then
+    multiarch="$(dpkg-architecture -qDEB_HOST_MULTIARCH 2>/dev/null || true)"
+  fi
+  if [ -z "$multiarch" ]; then
+    case "$(uname -m)" in
+    x86_64) multiarch="x86_64-linux-gnu" ;;
+    aarch64 | arm64) multiarch="aarch64-linux-gnu" ;;
+    esac
+  fi
+  openssl_lib_dir="/usr/lib"
+  if [ -n "$multiarch" ]; then
+    openssl_lib_dir="/usr/lib/${multiarch}"
+  fi
+  docker_opts="${docker_opts} --env OPENSSL_LIB_DIR=${openssl_lib_dir}"
+  docker_opts="${docker_opts} --env OPENSSL_INCLUDE_DIR=/usr/include"
+  echo "docker-options=${docker_opts}" >>"$GITHUB_OUTPUT"
+else
+  {
+    echo "TESSERACT_RS_CACHE_DIR="
+  } >>"$GITHUB_ENV"
+  {
+    echo "cache-dir="
+    echo "cache-enabled=false"
+    echo "docker-options="
+  } >>"$GITHUB_OUTPUT"
+fi
--- a/scripts/ci/actions/setup-tesseract-cache/setup-dirs.sh
+++ b/scripts/ci/actions/setup-tesseract-cache/setup-dirs.sh
@@ -0,0 +1,7 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+label="${1:?label required}"
+
+mkdir -p ".tesseract-cache/${label}"
+mkdir -p ".xdg-cache/${label}"
--- a/scripts/ci/benchmarks/verify-node-setup.sh
+++ b/scripts/ci/benchmarks/verify-node-setup.sh
@@ -0,0 +1,11 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+label="${1:-Node setup}"
+
+echo "=== ${label} ==="
+echo "Node version: $(node --version)"
+echo "pnpm version: $(pnpm --version)"
+echo "tsx availability: $(command -v tsx || echo 'NOT FOUND')"
+echo "pnpm workspace structure:"
+pnpm list --depth=0 || true
--- a/scripts/ci/cache/compute-hash.sh
+++ b/scripts/ci/cache/compute-hash.sh
@@ -0,0 +1,158 @@
+#!/usr/bin/env bash
+# Compute deterministic hash for cache key generation
+#
+# Usage:
+#   compute-hash.sh <glob-pattern> [glob-pattern...]
+#   compute-hash.sh --files <file1> <file2> ...
+#   compute-hash.sh --dirs <dir1> <dir2> ...
+#
+# Examples:
+#   compute-hash.sh "crates/kreuzberg/**/*.rs" "crates/kreuzberg-ffi/**/*.rs"
+#   compute-hash.sh --files Cargo.lock uv.lock
+#   compute-hash.sh --dirs crates/kreuzberg/ crates/kreuzberg-ffi/
+#
+# Output:
+#   stdout - the first 12 hex characters of the combined SHA-256
+#   stderr - warnings and a one-line summary
+#
+# Exit status: 1 when no hashing tool is available, no input is given, or no
+# file matched.
+
+set -euo pipefail
+
+# Color output
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+NC='\033[0m' # No Color
+
+# Print an error message to stderr and exit with status 1.
+error() {
+  echo -e "${RED}Error: $*${NC}" >&2
+  exit 1
+}
+
+# Print an informational message to stderr.
+info() {
+  echo -e "${GREEN}$*${NC}" >&2
+}
+
+# Print a warning message to stderr.
+warn() {
+  echo -e "${YELLOW}$*${NC}" >&2
+}
+
+# Check if sha256sum or shasum is available.
+# HASH_CMD is an array so that the multi-word fallback ("shasum -a 256") is
+# passed to `find -exec` as separate words; a single quoted string would be
+# executed as one program name and fail.
+if command -v sha256sum &>/dev/null; then
+  HASH_CMD=(sha256sum)
+elif command -v shasum &>/dev/null; then
+  HASH_CMD=(shasum -a 256)
+else
+  error "Neither sha256sum nor shasum found in PATH"
+fi
+
+# Mode detection
+MODE="glob"
+if [[ "${1:-}" == "--files" ]]; then
+  MODE="files"
+  shift
+elif [[ "${1:-}" == "--dirs" ]]; then
+  MODE="dirs"
+  shift
+fi
+
+if [[ $# -eq 0 ]]; then
+  error "No input provided. Usage: $0 <pattern...> or $0 --files <file...> or $0 --dirs <dir...>"
+fi
+
+# Temporary file for collecting hashes
+TEMP_HASHES=$(mktemp)
+trap 'rm -f "$TEMP_HASHES"' EXIT
+
+case "$MODE" in
+files)
+  # Hash specific files directly
+  for file in "$@"; do
+    if [[ -f "$file" ]]; then
+      "${HASH_CMD[@]}" "$file" >>"$TEMP_HASHES" 2>/dev/null || warn "Failed to hash: $file"
+    else
+      warn "File not found: $file"
+    fi
+  done
+  ;;
+
+dirs)
+  # Hash all files in directories recursively
+  for dir in "$@"; do
+    if [[ -d "$dir" ]]; then
+      # Find all files (excluding hidden files and directories). Failures are
+      # reported as a warning rather than silently discarded.
+      find "$dir" -type f \
+        ! -path "*/.*" \
+        ! -path "*/target/*" \
+        ! -path "*/node_modules/*" \
+        ! -path "*/.venv/*" \
+        ! -path "*/dist/*" \
+        ! -path "*/build/*" \
+        -exec "${HASH_CMD[@]}" {} + >>"$TEMP_HASHES" || warn "Failed to hash some files under: $dir"
+    else
+      warn "Directory not found: $dir"
+    fi
+  done
+  ;;
+
+glob)
+  # Hash files matching glob patterns
+  for pattern in "$@"; do
+    if [[ "$pattern" == *"**"* ]]; then
+      # Handle ** recursive glob (e.g., "crates/kreuzberg/**/*.rs")
+      # Base directory: everything before the first "*", minus a trailing "/".
+      base_dir=$(echo "$pattern" | cut -d'*' -f1 | sed 's|/$||')
+
+      # Get the suffix after the ** (e.g., "*.rs" from "crates/kreuzberg/**/*.rs")
+      # Remove everything up to and including **/
+      suffix="${pattern#*\*\*/}"
+
+      # Extract filename pattern; remove a leading / if present
+      if [[ "$suffix" == /* ]]; then
+        name_pattern="${suffix#/}"
+      else
+        name_pattern="$suffix"
+      fi
+
+      if [[ -d "$base_dir" ]]; then
+        # Find all files recursively using -name for filename matching
+        find "$base_dir" -type f \
+          ! -path "*/.*" \
+          ! -path "*/target/*" \
+          ! -path "*/node_modules/*" \
+          ! -path "*/.venv/*" \
+          -name "$name_pattern" \
+          -exec "${HASH_CMD[@]}" {} + >>"$TEMP_HASHES" || warn "Failed to hash some files under: $base_dir"
+      else
+        warn "Directory not found: $base_dir"
+      fi
+    else
+      # Simple glob (no **); the expansion is intentionally unquoted so the
+      # shell expands the pattern.
+      # shellcheck disable=SC2086
+      for file in $pattern; do
+        if [[ -f "$file" ]]; then
+          "${HASH_CMD[@]}" "$file" >>"$TEMP_HASHES" 2>/dev/null || warn "Failed to hash: $file"
+        fi
+      done
+    fi
+  done
+  ;;
+esac
+
+# Check if we found any files to hash
+if [[ ! -s "$TEMP_HASHES" ]]; then
+  error "No files found matching the provided patterns"
+fi
+
+# Sort hashes (for determinism across different find orders), then hash the
+# combined hashes to get the final hash. LC_ALL=C makes the order byte-wise,
+# so it does not depend on the runner's locale.
+FINAL_HASH=$(LC_ALL=C sort "$TEMP_HASHES" | "${HASH_CMD[@]}" | cut -d' ' -f1)
+
+# Truncate to 12 characters for cache key (still 48 bits of entropy)
+SHORT_HASH="${FINAL_HASH:0:12}"
+
+# Output the hash
+echo "$SHORT_HASH"
+
+# Debug info (to stderr); some wc implementations pad the count with spaces.
+FILE_COUNT=$(wc -l <"$TEMP_HASHES")
+FILE_COUNT="${FILE_COUNT//[[:space:]]/}"
+info "Hashed $FILE_COUNT files → $SHORT_HASH"
--- a/scripts/ci/docker/run-cli-tests.sh
+++ b/scripts/ci/docker/run-cli-tests.sh
@@ -0,0 +1,5 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+echo "=== Running Docker CLI feature tests ==="
+python3 scripts/ci/docker/test_docker.py --image "kreuzberg:cli" --variant cli --verbose
--- a/scripts/ci/docker/run-config-tests.sh
+++ b/scripts/ci/docker/run-config-tests.sh
@@ -0,0 +1,13 @@
+#!/usr/bin/env bash
+# CI wrapper for Docker configuration testing
+# Tests volume mounts, config formats, and environment variable overrides
+
+set -euo pipefail
+
+variant="${1:?missing variant}"
+
+echo "=== Running Docker configuration tests (${variant}) ==="
+
+# Run the comprehensive config test script
+# The script expects the image to already be built and tagged
+exec ./scripts/test/test-docker-config-local.sh --image "kreuzberg:${variant}" --variant "${variant}"
--- a/scripts/ci/docker/run-feature-tests.sh
+++ b/scripts/ci/docker/run-feature-tests.sh
@@ -0,0 +1,7 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+variant="${1:?missing variant}"
+
+echo "=== Running Docker feature tests (${variant}) ==="
+python3 scripts/ci/docker/test_docker.py --image "kreuzberg:${variant}" --variant "${variant}" --verbose
--- a/scripts/ci/docker/test_docker.py
+++ b/scripts/ci/docker/test_docker.py
@@ -0,0 +1,750 @@
+#!/usr/bin/env python3
+"""Unified Docker image test script for all variants (core, full, cli)."""
+
+from __future__ import annotations
+
+import argparse
+import json
+import os
+import random
+import subprocess
+import sys
+import tempfile
+import time
+from dataclasses import dataclass, field
+from pathlib import Path
+
+BLUE = "\033[0;34m"
+GREEN = "\033[0;32m"
+RED = "\033[0;31m"
+YELLOW = "\033[1;33m"
+NC = "\033[0m"
+
+REPO_ROOT = Path(__file__).resolve().parents[3]
+TEST_DOCS_DIR = REPO_ROOT / "test_documents"
+RESULTS_FILE = Path("/tmp/kreuzberg-docker-test-results.json")
+
+
+@dataclass
+class TestRunner:
+    image: str
+    variant: str
+    verbose: bool = False
+    total: int = 0
+    passed: int = 0
+    failed: int = 0
+    failed_names: list[str] = field(default_factory=list)
+    containers: list[str] = field(default_factory=list)
+
+    def log(self, level: str, color: str, msg: str) -> None:
+        print(f"{color}[{level}]{NC} {msg}", flush=True)
+
+    def info(self, msg: str) -> None:
+        self.log("INFO", BLUE, msg)
+
+    def ok(self, msg: str = "PASS") -> None:
+        self.log("SUCCESS", GREEN, msg)
+
+    def error(self, msg: str) -> None:
+        self.log("ERROR", RED, msg)
+
+    def warn(self, msg: str) -> None:
+        self.log("WARNING", YELLOW, msg)
+
+    def debug(self, msg: str) -> None:
+        if self.verbose:
+            self.log("VERBOSE", YELLOW, msg)
+
+    def start(self, name: str) -> None:
+        self.total += 1
+        self.info(f"Test {self.total}: {name}")
+
+    def pass_test(self) -> None:
+        self.passed += 1
+        self.ok()
+
+    def fail_test(self, name: str, details: str = "") -> None:
+        self.failed += 1
+        self.failed_names.append(name)
+        msg = f"FAIL: {name}"
+        if details:
+            msg += f"\n  Details: {details}"
+        self.error(msg)
+
+    def container_name(self) -> str:
+        name = f"kreuzberg-test-{int(time.time())}-{random.randint(0, 99999)}"
+        self.containers.append(name)
+        return name
+
+    def docker_run(self, *args: str, capture: bool = True) -> subprocess.CompletedProcess[str]:
+        cmd = ["docker", "run", "--rm", *args]
+        return subprocess.run(cmd, capture_output=capture, text=True, timeout=120)
+
+    def docker_run_detached(self, *args: str) -> str:
+        name = self.container_name()
+        cmd = ["docker", "run", "-d", "--name", name, *args]
+        subprocess.run(cmd, capture_output=True, text=True, check=True, timeout=60)
+        return name
+
+    def docker_rm(self, name: str) -> None:
+        subprocess.run(["docker", "rm", "-f", name], capture_output=True, timeout=30)
+
+    def cleanup(self) -> None:
+        for c in self.containers:
+            self.docker_rm(c)
+
+    def run_cli_output(self, *extra_args: str, volumes: bool = False) -> str:
+        """Run a CLI command against the image and return combined stdout+stderr."""
+        args: list[str] = ["--name", self.container_name()]
+        if volumes:
+            args += ["-v", f"{TEST_DOCS_DIR}:/data:ro"]
+        args.append(self.image)
+        args.extend(extra_args)
+        r = self.docker_run(*args)
+        return (r.stdout + r.stderr).strip()
+
+    def write_results(self) -> None:
+        rate = (self.passed * 100 // self.total) if self.total else 0
+        data = {
+            "image": self.image,
+            "variant": self.variant,
+            "total_tests": self.total,
+            "passed": self.passed,
+            "failed": self.failed,
+            "success_rate": rate,
+            "failed_tests": self.failed_names,
+        }
+        RESULTS_FILE.write_text(json.dumps(data, indent=2))
+        self.info(f"Results written to {RESULTS_FILE}")
+
+
+# ---------------------------------------------------------------------------
+# Shared tests (all variants)
+# ---------------------------------------------------------------------------
+
+def test_image_exists(t: TestRunner) -> None:
+    t.start("Docker image exists")
+    r = subprocess.run(["docker", "inspect", t.image], capture_output=True, timeout=30)
+    if r.returncode == 0:
+        t.pass_test()
+    else:
+        t.fail_test("Image does not exist", t.image)
+
+
+def test_version(t: TestRunner) -> None:
+    t.start("CLI --version command")
+    out = t.run_cli_output("--version")
+    t.debug(f"Version output: {out}")
+    if "kreuzberg" in out.lower():
+        t.pass_test()
+    else:
+        t.fail_test("CLI version", f"Expected 'kreuzberg' in output, got: {out}")
+
+
+def test_help(t: TestRunner) -> None:
+    t.start("CLI --help command")
+    out = t.run_cli_output("--help")
+    if "extract" in out.lower():
+        t.pass_test()
+    else:
+        t.fail_test("CLI help", "Expected 'extract' in help output")
+
+
+def test_mime_detection(t: TestRunner) -> None:
+    t.start("MIME type detection (detect command)")
+    out = t.run_cli_output("detect", "/data/pdf/searchable.pdf", volumes=True)
+    t.debug(f"MIME detection output: {out}")
+    if "application/pdf" in out.lower():
+        t.pass_test()
+    else:
+        t.fail_test("MIME detection", f"Expected 'application/pdf', got: {out}")
+
+
+def test_extract_text(t: TestRunner) -> None:
+    t.start("Extract plain text file")
+    out = t.run_cli_output("extract", "/data/text/contract.txt", volumes=True)
+    t.debug(f"Text extraction output (first 100 chars): {out[:100]}")
+    if len(out) > 15 and "contract" in out.lower():
+        t.pass_test()
+    else:
+        t.fail_test("Text extraction", f"Output too short ({len(out)} chars) or missing expected keywords")
+
+
+def test_extract_pdf(t: TestRunner) -> None:
+    t.start("Extract searchable PDF")
+    name = t.container_name()
+    r = subprocess.run(
+        ["docker", "run", "--rm", "--name", name,
+         "-v", f"{TEST_DOCS_DIR}:/data:ro",
+         t.image, "extract", "/data/pdf/searchable.pdf"],
+        capture_output=True, text=True, timeout=120,
+    )
+    out = (r.stdout + r.stderr).strip()
+    t.debug(f"PDF extraction output (first 200 chars): {out[:200]}")
+    if r.returncode != 0:
+        t.fail_test("Searchable PDF extraction", f"Exit code {r.returncode}: {out[:300]}")
+    elif len(out) > 50:
+        t.pass_test()
+    else:
+        t.fail_test("Searchable PDF extraction", f"Output too short: {len(out)} chars")
+
+
+def test_extract_html(t: TestRunner) -> None:
+    t.start("Extract HTML file")
+    out = t.run_cli_output("extract", "/data/html/simple_table.html", volumes=True)
+    t.debug(f"HTML extraction output (first 100 chars): {out[:100]}")
+    if len(out) > 10:
+        t.pass_test()
+    else:
+        t.fail_test("HTML extraction", f"Output too short: {len(out)} chars")
+
+
+def test_extract_docx(t: TestRunner) -> None:
+    t.start("Extract DOCX file")
+    out = t.run_cli_output("extract", "/data/docx/extraction_test.docx", volumes=True)
+    t.debug(f"DOCX extraction output (first 100 chars): {out[:100]}")
+    if len(out) > 100:
+        t.pass_test()
+    else:
+        t.fail_test("DOCX extraction", f"Output too short ({len(out)} chars)")
+
+
+def test_batch_cli(t: TestRunner) -> None:
+    t.start("CLI batch extraction (multiple files)")
+    out = t.run_cli_output(
+        "batch", "/data/text/contract.txt", "/data/html/simple_table.html",
+        volumes=True,
+    )
+    t.debug(f"Batch output (first 200 chars): {out[:200]}")
+    if len(out) > 20:
+        t.pass_test()
+    else:
+        t.fail_test("Batch extraction", f"Output too short: {len(out)} chars")
+
+
+def test_nonexistent_file(t: TestRunner) -> None:
+    t.start("Non-existent file returns error")
+    r = subprocess.run(
+        ["docker", "run", "--rm", t.image, "extract", "/nonexistent/file.pdf"],
+        capture_output=True, text=True, timeout=60,
+    )
+    if r.returncode != 0:
+        t.pass_test()
+    else:
+        t.fail_test("Error on missing file", "Expected non-zero exit code for missing file")
+
+
+def test_readonly_mount(t: TestRunner) -> None:
+    t.start("Read-only volume mount works")
+    name = t.container_name()
+    r = subprocess.run(
+        ["docker", "run", "--rm", "--name", name,
+         "-v", f"{TEST_DOCS_DIR}:/data:ro",
+         "--read-only", "--tmpfs", "/tmp",
+         t.image, "extract", "/data/text/simple.txt"],
+        capture_output=True, text=True, timeout=60,
+    )
+    out = (r.stdout + r.stderr).strip()
+    if len(out) > 5:
+        t.pass_test()
+    else:
+        t.fail_test("Read-only mount", "Failed to extract with read-only filesystem")
+
+
+# ---------------------------------------------------------------------------
+# Core/Full-only tests (API server tests)
+# ---------------------------------------------------------------------------
+
+def _wait_for_api(port: int, retries: int = 10) -> bool:
+    import urllib.request
+    for _ in range(retries):
+        try:
+            urllib.request.urlopen(f"http://localhost:{port}/health", timeout=3)
+            return True
+        except Exception:
+            time.sleep(2)
+    return False
+
+
+def _api_get(port: int, path: str) -> str | None:
+    import urllib.request
+    try:
+        with urllib.request.urlopen(f"http://localhost:{port}{path}", timeout=10) as resp:
+            return resp.read().decode()
+    except Exception:
+        return None
+
+
+def _api_post_file(port: int, path: str, filepath: str) -> str | None:
+    """POST a file using curl (simplest multipart approach)."""
+    r = subprocess.run(
+        ["curl", "-f", "-s", "-X", "POST", f"http://localhost:{port}{path}",
+         "-F", f"files=@{filepath}"],
+        capture_output=True, text=True, timeout=30,
+    )
+    return r.stdout if r.returncode == 0 else None
+
+
+def test_ocr_extraction(t: TestRunner) -> None:
+    t.start("OCR extraction with Tesseract")
+    name = t.container_name()
+    r = subprocess.run(
+        ["docker", "run", "--rm", "--name", name, "--memory", "1g",
+         "-v", f"{TEST_DOCS_DIR}:/data:ro",
+         t.image, "extract", "/data/images/ocr_image.jpg", "--ocr", "true"],
+        capture_output=True, text=True, timeout=120,
+    )
+    out = (r.stdout + r.stderr).strip()
+    t.debug(f"OCR extraction output (first 100 chars): {out[:100]}")
+    if len(out) > 10:
+        t.pass_test()
+    else:
+        t.fail_test("OCR extraction", "Output too short or OCR failed")
+
+
+def test_paddle_ocr_extraction(t: TestRunner) -> None:
+    t.start("PaddleOCR extraction (pre-loaded models)")
+    name = t.container_name()
+    r = subprocess.run(
+        ["docker", "run", "--rm", "--name", name, "--memory", "2g",
+         "-v", f"{TEST_DOCS_DIR}:/data:ro",
+         t.image, "extract", "/data/images/ocr_image.jpg",
+         "--ocr", "true", "--ocr-backend", "paddle-ocr"],
+        capture_output=True, text=True, timeout=120,
+    )
+    out = (r.stdout + r.stderr).strip()
+    t.debug(f"PaddleOCR extraction output (first 200 chars): {out[:200]}")
+    if r.returncode == 0 and len(out) > 10:
+        t.pass_test()
+    else:
+        t.fail_test("PaddleOCR extraction", f"Exit code: {r.returncode}, output length: {len(out)}")
+
+
+def test_doc_extraction(t: TestRunner) -> None:
+    t.start("Legacy DOC extraction (native OLE/CFB)")
+    name = t.container_name()
+    r = subprocess.run(
+        ["docker", "run", "--rm", "--name", name, "--memory", "1g",
+         "-v", f"{TEST_DOCS_DIR}:/data:ro",
+         t.image, "extract", "/data/doc/unit_test_lists.doc"],
+        capture_output=True, text=True, timeout=120,
+    )
+    out = (r.stdout + r.stderr).strip()
+    t.debug(f"DOC extraction output (first 100 chars): {out[:100]}")
+    if len(out) > 20:
+        t.pass_test()
+    else:
+        t.fail_test("DOC extraction", f"Output too short: {len(out)} chars")
+
+
+def test_api_health(t: TestRunner) -> None:
+    t.start("API server startup and health check")
+    port = 9000 + random.randint(0, 999)
+    name = t.docker_run_detached(
+        "--memory", "2g", "--cpus", "2",
+        "-p", f"{port}:8000", t.image,
+    )
+    if not _wait_for_api(port):
+        t.fail_test("API health check", f"Health endpoint not responding on port {port}")
+        t.docker_rm(name)
+        return
+
+    health = _api_get(port, "/health")
+    t.debug(f"Health response: {health}")
+    if health:
+        t.pass_test()
+    else:
+        t.fail_test("API health check", "No response from /health")
+
+    # Plugin initialization validation
+    t.start("Plugin initialization validation")
+    if health and "plugins" in health:
+        import re
+        ocr_m = re.search(r'"ocr_backends_count":(\d+)', health)
+        ext_m = re.search(r'"extractors_count":(\d+)', health)
+        ocr_count = int(ocr_m.group(1)) if ocr_m else 0
+        ext_count = int(ext_m.group(1)) if ext_m else 0
+        t.debug(f"OCR backends: {ocr_count}, Extractors: {ext_count}")
+
+        if t.variant == "full":
+            if ocr_count > 0:
+                t.info(f"Full variant: {ocr_count} OCR backend(s) registered")
+                t.pass_test()
+            else:
+                t.fail_test("Plugin initialization", "Full variant: No OCR backends registered")
+                t.docker_rm(name)
+                return
+        else:
+            t.pass_test()
+
+        if ext_count == 0:
+            t.fail_test("Plugin initialization", "No document extractors registered")
+            t.docker_rm(name)
+            return
+    else:
+        t.warn("Health response missing 'plugins' field")
+        t.pass_test()
+
+    t.docker_rm(name)
+
+
+def test_api_extract(t: TestRunner) -> None:
+    t.start("API extraction endpoint")
+    port = 9000 + random.randint(0, 999)
+    name = t.docker_run_detached(
+        "--memory", "2g", "--cpus", "2",
+        "-p", f"{port}:8000", t.image,
+    )
+    if not _wait_for_api(port):
+        t.fail_test("API extraction", "Server not ready")
+        t.docker_rm(name)
+        return
+
+    with tempfile.NamedTemporaryFile(mode="w", suffix=".txt", delete=False) as f:
+        f.write("Test content for API extraction")
+        tmp = f.name
+
+    resp = _api_post_file(port, "/extract", tmp)
+    os.unlink(tmp)
+    t.debug(f"API response: {resp}")
+
+    if resp and "Test content for API extraction" in resp:
+        t.pass_test()
+    else:
+        t.fail_test("API extraction", "Response missing expected content")
+    t.docker_rm(name)
+
+
+def test_api_info(t: TestRunner) -> None:
+    t.start("API /info endpoint")
+    port = 9000 + random.randint(0, 999)
+    name = t.docker_run_detached(
+        "--memory", "2g", "--cpus", "2",
+        "-p", f"{port}:8000", t.image,
+    )
+    if not _wait_for_api(port):
+        t.fail_test("API /info", "Server not ready")
+        t.docker_rm(name)
+        return
+
+    resp = _api_get(port, "/info")
+    t.debug(f"/info response: {resp}")
+    if resp and "version" in resp and "rust_backend" in resp:
+        t.pass_test()
+    else:
+        t.fail_test("API /info endpoint", "Response missing expected fields")
+    t.docker_rm(name)
+
+
+def test_api_openapi(t: TestRunner) -> None:
+    t.start("API /openapi.json endpoint")
+    port = 9000 + random.randint(0, 999)
+    name = t.docker_run_detached(
+        "--memory", "2g", "--cpus", "2",
+        "-p", f"{port}:8000", t.image,
+    )
+    if not _wait_for_api(port):
+        t.fail_test("API /openapi.json", "Server not ready")
+        t.docker_rm(name)
+        return
+
+    resp = _api_get(port, "/openapi.json")
+    t.debug(f"/openapi.json response (first 200 chars): {(resp or '')[:200]}")
+    if resp and '"openapi"' in resp and '"paths"' in resp:
+        t.pass_test()
+    else:
+        t.fail_test("API /openapi.json endpoint", "Response missing OpenAPI schema fields")
+    t.docker_rm(name)
+
+
+def test_api_cache(t: TestRunner) -> None:
+    t.start("API /cache/stats endpoint")
+    port = 9000 + random.randint(0, 999)
+    name = t.docker_run_detached(
+        "--memory", "2g", "--cpus", "2",
+        "-p", f"{port}:8000", t.image,
+    )
+    if not _wait_for_api(port):
+        t.fail_test("API /cache/stats", "Server not ready")
+        t.docker_rm(name)
+        return
+
+    resp = _api_get(port, "/cache/stats")
+    t.debug(f"/cache/stats response: {resp}")
+    if resp and "total_files" in resp:
+        t.pass_test()
+    else:
+        t.fail_test("API /cache/stats endpoint", "Response missing expected fields")
+
+    t.start("API /cache/clear endpoint")
+    r = subprocess.run(
+        ["curl", "-f", "-s", "-X", "DELETE", f"http://localhost:{port}/cache/clear"],
+        capture_output=True, text=True, timeout=10,
+    )
+    if r.returncode == 0 and "removed_files" in r.stdout:
+        t.pass_test()
+    else:
+        t.fail_test("API /cache/clear endpoint", "Response missing expected fields")
+    t.docker_rm(name)
+
+
+def test_api_batch(t: TestRunner) -> None:
+    t.start("API batch extraction (multiple files)")
+    port = 9000 + random.randint(0, 999)
+    name = t.docker_run_detached(
+        "--memory", "2g", "--cpus", "2",
+        "-p", f"{port}:8000", t.image,
+    )
+    if not _wait_for_api(port):
+        t.fail_test("API batch extraction", "Server not ready")
+        t.docker_rm(name)
+        return
+
+    tmp1 = tempfile.NamedTemporaryFile(mode="w", suffix=".txt", delete=False)
+    tmp2 = tempfile.NamedTemporaryFile(mode="w", suffix=".txt", delete=False)
+    tmp1.write("File one content"); tmp1.close()
+    tmp2.write("File two content"); tmp2.close()
+
+    r = subprocess.run(
+        ["curl", "-f", "-s", "-X", "POST", f"http://localhost:{port}/extract",
+         "-F", f"files=@{tmp1.name}", "-F", f"files=@{tmp2.name}"],
+        capture_output=True, text=True, timeout=30,
+    )
+    os.unlink(tmp1.name)
+    os.unlink(tmp2.name)
+
+    t.debug(f"Batch extraction response (first 200 chars): {r.stdout[:200]}")
+    if "File one content" in r.stdout and "File two content" in r.stdout:
+        t.pass_test()
+    else:
+        t.fail_test("API batch extraction", "Response missing expected content")
+    t.docker_rm(name)
+
+
+def test_cli_batch_json(t: TestRunner) -> None:
+    t.start("CLI batch extraction with JSON format")
+    name = t.container_name()
+    r = subprocess.run(
+        ["docker", "run", "--rm", "--name", name,
+         "-v", f"{TEST_DOCS_DIR}:/data:ro",
+         t.image, "batch", "/data/text/contract.txt", "/data/pdf/searchable.pdf",
+         "--format", "json"],
+        capture_output=True, text=True, timeout=120,
+    )
+    out = (r.stdout + r.stderr).strip()
+    t.debug(f"Batch command output (first 200 chars): {out[:200]}")
+    if len(out) > 100 and "content" in out:
+        t.pass_test()
+    else:
+        t.fail_test("CLI batch command", "Output too short or malformed")
+
+
+def test_mcp_server(t: TestRunner) -> None:
+    t.start("MCP server startup and persistence")
+    name = t.docker_run_detached(
+        "-i", "--memory", "1g", t.image, "mcp",
+    )
+    time.sleep(3)
+    r = subprocess.run(
+        ["docker", "ps", "--filter", f"name={name}", "--format", "{{.Names}}"],
+        capture_output=True, text=True, timeout=10,
+    )
+    if name in r.stdout:
+        t.debug("MCP server is running")
+        t.pass_test()
+    else:
+        t.fail_test("MCP server persistence", "MCP server exited immediately")
+    t.docker_rm(name)
+
+
+def test_cli_cache(t: TestRunner) -> None:
+    t.start("CLI cache stats command")
+    name = t.container_name()
+    r = subprocess.run(
+        ["docker", "run", "--rm", "--name", name, t.image, "cache", "stats", "--format", "json"],
+        capture_output=True, text=True, timeout=60,
+    )
+    out = (r.stdout + r.stderr).strip()
+    t.debug(f"Cache stats output: {out}")
+    if "total_files" in out:
+        t.pass_test()
+    else:
+        t.fail_test("CLI cache stats", "Output missing expected fields")
+
+    t.start("CLI cache clear command")
+    name = t.container_name()
+    r = subprocess.run(
+        ["docker", "run", "--rm", "--name", name, t.image, "cache", "clear", "--format", "json"],
+        capture_output=True, text=True, timeout=60,
+    )
+    out = (r.stdout + r.stderr).strip()
+    t.debug(f"Cache clear output: {out}")
+    if "removed_files" in out:
+        t.pass_test()
+    else:
+        t.fail_test("CLI cache clear", "Output missing expected fields")
+
+
+def test_security_nonroot(t: TestRunner) -> None:
+    t.start("Security: Container runs as non-root user")
+    name = t.container_name()
+    r = subprocess.run(
+        ["docker", "run", "--rm", "--name", name, "--entrypoint", "/bin/sh",
+         t.image, "-c", "whoami"],
+        capture_output=True, text=True, timeout=30,
+    )
+    user = r.stdout.strip()
+    if user == "kreuzberg":
+        t.pass_test()
+    else:
+        t.fail_test("Non-root user", f"Container running as: {user} (expected: kreuzberg)")
+
+
+def test_security_readonly(t: TestRunner) -> None:
+    t.start("Security: Read-only volume enforcement")
+    with tempfile.TemporaryDirectory() as tmpdir:
+        (Path(tmpdir) / "test.txt").write_text("test")
+        name = t.container_name()
+        r = subprocess.run(
+            ["docker", "run", "--rm", "--name", name,
+             "-v", f"{tmpdir}:/data:ro",
+             "--entrypoint", "/bin/sh", t.image,
+             "-c", "echo 'attempt' > /data/test2.txt 2>&1 || echo 'READ_ONLY'"],
+            capture_output=True, text=True, timeout=30,
+        )
+        out = r.stdout + r.stderr
+        if any(s in out for s in ("READ_ONLY", "read-only", "Read-only")):
+            t.pass_test()
+        else:
+            t.fail_test("Read-only volume", "Was able to write to read-only volume")
+
+
+def test_security_memlimit(t: TestRunner) -> None:
+    t.start("Security: Memory limit enforcement")
+    name = t.container_name()
+    r = subprocess.run(
+        ["docker", "run", "--rm", "--name", name,
+         "--memory", "128m", "--memory-swap", "128m",
+         "--entrypoint", "/bin/sh", t.image,
+         "-c", "echo 'Memory limit test passed'"],
+        capture_output=True, text=True, timeout=30,
+    )
+    if "Memory limit test passed" in r.stdout:
+        t.pass_test()
+    else:
+        t.fail_test("Memory limit", "Container failed with memory limit")
+
+
+# ---------------------------------------------------------------------------
+# CLI-only tests
+# ---------------------------------------------------------------------------
+
+def test_cli_image_size(t: TestRunner) -> None:
+    t.start("Image size is reasonable (< 200MB)")
+    r = subprocess.run(
+        ["docker", "inspect", t.image, "--format", "{{.Size}}"],
+        capture_output=True, text=True, timeout=10,
+    )
+    try:
+        size_mb = int(r.stdout.strip()) // (1024 * 1024)
+    except ValueError:
+        size_mb = 0
+    t.debug(f"Image size: {size_mb}MB")
+    if 0 < size_mb < 200:
+        t.pass_test()
+    else:
+        t.fail_test("Image size", f"Expected < 200MB, got {size_mb}MB")
+
+
+# ---------------------------------------------------------------------------
+# Test suites per variant
+# ---------------------------------------------------------------------------
+
+def run_cli_tests(t: TestRunner) -> None:
+    """Tests for the minimal CLI Docker image."""
+    test_image_exists(t)
+    test_cli_image_size(t)
+    test_version(t)
+    test_help(t)
+    test_mime_detection(t)
+    test_extract_text(t)
+    test_extract_pdf(t)
+    test_extract_html(t)
+    test_extract_docx(t)
+    test_batch_cli(t)
+    test_readonly_mount(t)
+    test_nonexistent_file(t)
+
+
+def run_core_full_tests(t: TestRunner) -> None:
+    """Tests for core and full Docker images."""
+    test_image_exists(t)
+    test_version(t)
+    test_help(t)
+    test_mime_detection(t)
+    test_extract_text(t)
+    test_extract_pdf(t)
+    test_extract_docx(t)
+    test_extract_html(t)
+    test_ocr_extraction(t)
+
+    if t.variant == "full":
+        test_doc_extraction(t)
+        test_paddle_ocr_extraction(t)
+
+    test_api_health(t)
+    test_api_extract(t)
+    test_api_info(t)
+    test_api_openapi(t)
+    test_api_cache(t)
+    test_api_batch(t)
+    test_cli_batch_json(t)
+    test_mcp_server(t)
+    test_cli_cache(t)
+    test_security_nonroot(t)
+    test_security_readonly(t)
+    test_security_memlimit(t)
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(description="Docker image tests")
+    parser.add_argument("--image", required=True, help="Docker image name")
+    parser.add_argument("--variant", required=True, choices=["core", "full", "cli"])
+    parser.add_argument("--verbose", action="store_true")
+    parser.add_argument("--skip-build", action="store_true", help="(ignored, kept for compat)")
+    args = parser.parse_args()
+
+    t = TestRunner(image=args.image, variant=args.variant, verbose=args.verbose)
+
+    print("=" * 72)
+    t.info(f"Starting Docker tests for: {args.image} (variant: {args.variant})")
+    print("=" * 72)
+
+    try:
+        if args.variant == "cli":
+            run_cli_tests(t)
+        else:
+            run_core_full_tests(t)
+    finally:
+        t.cleanup()
+
+    # Summary
+    print()
+    print("=" * 72)
+    t.info(f"Test Results: {t.passed}/{t.total} passed, {t.failed} failed")
+    print("=" * 72)
+
+    if t.failed > 0:
+        t.error("Failed tests:")
+        for name in t.failed_names:
+            print(f"  - {name}")
+
+    t.write_results()
+
+    if t.failed > 0:
+        sys.exit(1)
+    t.ok("All tests passed!")
+
+
+if __name__ == "__main__":
+    main()
--- a/scripts/ci/docs/build.sh
+++ b/scripts/ci/docs/build.sh
@@ -0,0 +1,61 @@
+#!/usr/bin/env bash
+# Build the documentation site (Zensical, doc dependency group).
+#
+# Usage:
+#   scripts/ci/docs/build.sh
+#   scripts/ci/docs/build.sh --strict --log-file /tmp/build-log.txt
+#
+# Options:
+#   --strict          pass --strict to `zensical build`
+#   --log-file PATH   truncate PATH, then append the output of both steps to it
+#
+# Caching: use astral-sh/setup-uv with enable-cache in CI; this script only runs uv.
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+REPO_ROOT="$(cd "$SCRIPT_DIR/../../.." && pwd)"
+cd "$REPO_ROOT"
+
+strict=false
+log_file=""
+
+while (($# > 0)); do
+  case "$1" in
+    --strict)
+      strict=true
+      shift
+      ;;
+    --log-file)
+      if (($# < 2)); then
+        echo "error: --log-file requires a path" >&2
+        exit 2
+      fi
+      log_file="$2"
+      shift 2
+      ;;
+    *)
+      echo "usage: $0 [--strict] [--log-file PATH]" >&2
+      exit 2
+      ;;
+  esac
+done
+
+# Install the doc dependency group only (no workspace/project install).
+uv_sync() {
+  uv sync --group doc --no-editable --no-install-workspace --no-install-project
+}
+
+# Run a clean Zensical build, adding --strict when requested.
+zensical_build() {
+  local -a build_args=(--clean)
+  if [[ "$strict" == true ]]; then
+    build_args+=(--strict)
+  fi
+  uv run --no-sync zensical build "${build_args[@]}"
+}
+
+if [[ -z "$log_file" ]]; then
+  uv_sync
+  zensical_build
+else
+  # pipefail (set above) makes a failing step fail its `| tee` pipeline too.
+  mkdir -p "$(dirname "$log_file")"
+  : >"$log_file"
+  uv_sync 2>&1 | tee -a "$log_file"
+  zensical_build 2>&1 | tee -a "$log_file"
+fi
--- a/scripts/ci/docs/textlint.sh
+++ b/scripts/ci/docs/textlint.sh
@@ -0,0 +1,13 @@
+#!/usr/bin/env bash
+# Run textlint prose linting against docs/**/*.md.
+#
+# Usage:
+#   scripts/ci/docs/textlint.sh
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+REPO_ROOT="$(cd "${SCRIPT_DIR}/../../.." && pwd)"
+cd "${REPO_ROOT}"
+
+# The glob is quoted so it reaches textlint unexpanded by the shell.
+docs_glob="docs/**/*.md"
+npx textlint "${docs_glob}"
--- a/scripts/ci/install-system-deps/detect-tesseract-linux.sh
+++ b/scripts/ci/install-system-deps/detect-tesseract-linux.sh
@@ -0,0 +1,17 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+version="$(
+  apt-cache policy tesseract-ocr 2>/dev/null |
+    grep 'Candidate:' |
+    grep -Eo '[0-9]+\.[0-9]+' |
+    head -1 ||
+    true
+)"
+
+if [[ -z "${version}" ]]; then
+  version="unknown"
+fi
+
+echo "version=${version}" >>"${GITHUB_OUTPUT}"
+echo "::notice title=Tesseract Version::Detected version: ${version}"
--- a/scripts/ci/install-system-deps/detect-tesseract-macos.sh
+++ b/scripts/ci/install-system-deps/detect-tesseract-macos.sh
@@ -0,0 +1,25 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+version=""
+
+json="$(brew info --json=v2 tesseract 2>/dev/null || true)"
+if [[ -n "${json}" ]]; then
+  version="$(
+    python3 -c 'import json, re, sys; data = json.loads(sys.argv[1]); stable = (((data.get("formulae") or [{}])[0].get("versions") or {}).get("stable") or ""); m = re.match(r"^(\d+\.\d+)", stable); print(m.group(1) if m else "")' "${json}" || true
+  )"
+fi
+
+if [[ -z "${version}" ]]; then
+  first_line="$(brew info tesseract 2>/dev/null | head -1 || true)"
+  if [[ "${first_line}" =~ ([0-9]+\.[0-9]+) ]]; then
+    version="${BASH_REMATCH[1]}"
+  fi
+fi
+
+if [[ -z "${version}" ]]; then
+  version="unknown"
+fi
+
+echo "version=${version}" >>"${GITHUB_OUTPUT}"
+echo "::notice title=Tesseract Version::Detected version: ${version}"
--- a/scripts/ci/install-system-deps/install-linux.sh
+++ b/scripts/ci/install-system-deps/install-linux.sh
@@ -0,0 +1,136 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+REPO_ROOT="${REPO_ROOT:-$(cd "$SCRIPT_DIR/../../.." && pwd)}"
+
+source "$REPO_ROOT/scripts/lib/retry.sh"
+
+echo "::group::Installing Linux dependencies"
+
+echo "Updating package index..."
+# A failed index update is only a warning; the install below is still attempted.
+if ! retry_with_backoff sudo apt-get update; then
+  echo "::warning::apt-get update failed after retries, continuing anyway..."
+fi
+
+packages=(
+  tesseract-ocr
+  tesseract-ocr-eng
+  tesseract-ocr-tur
+  tesseract-ocr-deu
+  fonts-liberation
+  fonts-dejavu-core
+  fonts-noto-core
+  libssl-dev
+  pkg-config
+  build-essential
+  cmake
+  libmagic-dev
+  libuv1-dev
+  php-cli
+  php-dev
+)
+
+# Retried one by one when the bulk install fails. php-cli is included because
+# the verification phase of this script exits 1 when `php` is missing.
+fallback_packages=(
+  tesseract-ocr
+  libssl-dev
+  pkg-config
+  cmake
+  php-cli
+)
+
+echo "Installing dependencies..."
+if retry_with_backoff_timeout 900 sudo apt-get install -y "${packages[@]}"; then
+  echo "✓ All packages installed successfully"
+else
+  exit_code=$?
+  # 124 is treated as "timed out" — presumably the status propagated by
+  # retry_with_backoff_timeout; confirm against scripts/lib/retry.sh.
+  if [[ "$exit_code" -eq 124 ]]; then
+    echo "::error::Package installation timed out after 15 minutes"
+  else
+    echo "::warning::Some packages failed to install, attempting individual installs..."
+    for pkg in "${fallback_packages[@]}"; do
+      echo "Installing $pkg..."
+      if retry_with_backoff_timeout 300 sudo apt-get install -y "$pkg" 2>&1; then
+        echo "  ✓ $pkg installed"
+      else
+        echo "  ⚠ Failed to install $pkg"
+      fi
+    done
+  fi
+fi
+
+echo "::endgroup::"
+
+echo "::group::Verifying Linux installations"
+
+echo "CMake:"
+if command -v cmake >/dev/null 2>&1; then
+  cmake --version | head -1
+  echo "✓ CMake available"
+  # Export CMAKE environment variable for immediate availability in build scripts
+  CMAKE_FULL_PATH="$(command -v cmake)"
+  # GITHUB_ENV / GITHUB_PATH may be unset outside GitHub Actions. The `:-`
+  # default lets the -n test skip the export instead of tripping `set -u`.
+  if [[ -n "${GITHUB_ENV:-}" ]]; then
+    echo "CMAKE=$CMAKE_FULL_PATH" >>"$GITHUB_ENV"
+    echo "✓ Set CMAKE=$CMAKE_FULL_PATH in GITHUB_ENV"
+  fi
+  # Also add cmake binary directory to GITHUB_PATH for subsequent steps
+  CMAKE_BIN="$(dirname "$CMAKE_FULL_PATH")"
+  if [[ -n "${GITHUB_PATH:-}" && -d "$CMAKE_BIN" ]]; then
+    echo "$CMAKE_BIN" >>"$GITHUB_PATH"
+    echo "✓ Added cmake directory to GITHUB_PATH: $CMAKE_BIN"
+  fi
+else
+  echo "::error::CMake not found after installation"
+  exit 1
+fi
+
+echo ""
+echo "Tesseract:"
+# A missing or broken Tesseract CLI is only a warning here.
+if ! command -v tesseract &>/dev/null; then
+  echo "::warning::Tesseract CLI not found; continuing (OCR will rely on bundled Tesseract)"
+elif tesseract --version 2>/dev/null | head -1; then
+  echo "✓ Tesseract CLI available"
+else
+  echo "::warning::Tesseract CLI present but failed to run"
+fi
+
+echo ""
+echo "Available Tesseract languages:"
+if ! command -v tesseract &>/dev/null; then
+  echo "(tesseract CLI not available)"
+else
+  tesseract --list-langs | head -10 || true
+fi
+
+echo ""
+echo "PHP:"
+# PHP is mandatory: abort when it is not on PATH.
+if ! command -v php &>/dev/null; then
+  echo "::error::PHP not found after installation"
+  exit 1
+fi
+php --version | head -1
+echo "✓ PHP available"
+
+echo ""
+echo "Checking Tesseract data path..."
+
+# Pick the first candidate directory that exists.
+tessdata_dir=""
+for candidate in "/usr/share/tesseract-ocr/5/tessdata" "/usr/share/tesseract-ocr/tessdata"; do
+  [[ -d "$candidate" ]] || continue
+  tessdata_dir="$candidate"
+  break
+done
+
+if [[ -z "$tessdata_dir" ]]; then
+  echo "::error::Tessdata directory not found in standard locations"
+  exit 1
+fi
+
+echo "Found tessdata at: $tessdata_dir"
+
+# Missing language files are reported but do not fail the script.
+echo "Required language files:"
+for lang in eng tur deu; do
+  traineddata="$tessdata_dir/${lang}.traineddata"
+  if [[ -f "$traineddata" ]]; then
+    size=$(stat -c%s "$traineddata" 2>/dev/null || echo "unknown")
+    echo "  ✓ ${lang}.traineddata ($size bytes)"
+  else
+    echo "  ⚠ ${lang}.traineddata (missing)"
+  fi
+done
+
+echo "::endgroup::"
--- a/scripts/ci/install-system-deps/install-macos.sh
+++ b/scripts/ci/install-system-deps/install-macos.sh
@@ -0,0 +1,136 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+REPO_ROOT="${REPO_ROOT:-$(cd "$SCRIPT_DIR/../../.." && pwd)}"
+
+source "$REPO_ROOT/scripts/lib/retry.sh"
+
+echo "::group::Installing macOS dependencies"
+
+if [[ -d "/opt/homebrew/bin" ]]; then
+  export PATH="/opt/homebrew/bin:/opt/homebrew/sbin:${PATH}"
+  echo "/opt/homebrew/bin" >>"$GITHUB_PATH"
+  echo "/opt/homebrew/sbin" >>"$GITHUB_PATH"
+fi
+if [[ -d "/usr/local/bin" ]]; then
+  export PATH="/usr/local/bin:/usr/local/sbin:${PATH}"
+  echo "/usr/local/bin" >>"$GITHUB_PATH"
+  echo "/usr/local/sbin" >>"$GITHUB_PATH"
+fi
+
+if ! brew list cmake &>/dev/null; then
+  echo "Installing CMake..."
+  retry_with_backoff brew install cmake || {
+    echo "::error::Failed to install CMake after retries"
+    exit 1
+  }
+else
+  echo "✓ CMake already installed"
+fi
+
+if ! command -v cmake >/dev/null 2>&1; then
+  echo "CMake not on PATH after install; attempting brew link..."
+  brew link --overwrite cmake >/dev/null 2>&1 || true
+fi
+
+if ! brew list tesseract &>/dev/null; then
+  echo "Installing Tesseract..."
+  retry_with_backoff brew install tesseract || {
+    echo "::error::Failed to install Tesseract after retries"
+    exit 1
+  }
+else
+  echo "✓ Tesseract already installed"
+fi
+
+if ! command -v tesseract >/dev/null 2>&1; then
+  echo "Tesseract not on PATH after install; attempting brew link..."
+  brew link --overwrite tesseract >/dev/null 2>&1 || true
+fi
+
+if ! brew list tesseract-lang &>/dev/null; then
+  echo "Installing Tesseract language packs..."
+  retry_with_backoff brew install tesseract-lang || {
+    echo "::warning::Failed to install tesseract-lang, some languages may be unavailable"
+  }
+else
+  echo "✓ Tesseract language packs already installed"
+fi
+
+if ! brew list libmagic &>/dev/null; then
+  echo "Installing libmagic..."
+  retry_with_backoff brew install libmagic || {
+    echo "::warning::Failed to install libmagic after retries"
+  }
+else
+  echo "✓ libmagic already installed"
+fi
+
+if ! brew list php &>/dev/null; then
+  echo "Installing PHP..."
+  retry_with_backoff brew install php || {
+    echo "::error::Failed to install PHP after retries"
+    exit 1
+  }
+else
+  echo "✓ PHP already installed"
+fi
+
+if ! command -v php >/dev/null 2>&1; then
+  echo "PHP not on PATH after install; attempting brew link..."
+  brew link --overwrite php >/dev/null 2>&1 || true
+fi
+
+echo "::endgroup::"
+
+echo "::group::Verifying macOS installations"
+
+# CMake is required: export its location for later steps, or fail the step.
+echo "CMake:"
+if command -v cmake >/dev/null 2>&1; then
+  cmake --version | head -1
+  # Export CMAKE environment variable for immediate availability in build scripts
+  CMAKE_FULL_PATH="$(command -v cmake)"
+  # GITHUB_ENV / GITHUB_PATH are expanded with a default: under `set -u` a bare
+  # "$GITHUB_ENV" aborts the script when the variable is unset, which would
+  # defeat the purpose of this guard outside GitHub Actions.
+  if [[ -n "${GITHUB_ENV:-}" ]]; then
+    echo "CMAKE=$CMAKE_FULL_PATH" >>"$GITHUB_ENV"
+    echo "✓ Set CMAKE=$CMAKE_FULL_PATH in GITHUB_ENV"
+  fi
+  # Also add cmake binary directory to GITHUB_PATH for subsequent steps
+  CMAKE_BIN="$(dirname "$CMAKE_FULL_PATH")"
+  if [[ -n "${GITHUB_PATH:-}" && -d "$CMAKE_BIN" ]]; then
+    echo "$CMAKE_BIN" >>"$GITHUB_PATH"
+    echo "✓ Added cmake directory to GITHUB_PATH: $CMAKE_BIN"
+  fi
+else
+  echo "::error::CMake not found on PATH after installation"
+  echo "PATH=$PATH"
+  brew --prefix cmake 2>/dev/null || true
+  exit 1
+fi
+
+# Tesseract is required on macOS: print diagnostics and fail when it is missing.
+echo ""
+echo "Tesseract:"
+if command -v tesseract >/dev/null 2>&1; then
+  tesseract --version | head -1
+else
+  echo "::error::Tesseract not found on PATH after installation"
+  echo "PATH=$PATH"
+  brew --prefix tesseract 2>/dev/null || true
+  exit 1
+fi
+
+echo ""
+echo "Available languages:"
+# Informational only. `head` may close the pipe early, and with pipefail that
+# would abort the script, so the listing is not allowed to fail the step
+# (same treatment as the Linux installer).
+tesseract --list-langs | head -5 || true
+
+# PHP is required.
+echo ""
+echo "PHP:"
+if command -v php >/dev/null 2>&1; then
+  php --version | head -1
+else
+  echo "::error::PHP not found on PATH after installation"
+  echo "PATH=$PATH"
+  exit 1
+fi
+
+echo "::endgroup::"
--- a/scripts/ci/install-system-deps/install-windows.ps1
+++ b/scripts/ci/install-system-deps/install-windows.ps1
@@ -0,0 +1,301 @@
+#!/usr/bin/env pwsh
+
+# Installs the Windows CI dependencies with Chocolatey (Tesseract, LLVM/Clang,
+# PHP, CMake), publishes their locations through GITHUB_PATH / GITHUB_ENV and
+# then verifies what is available.
+# Inputs (environment): TESSERACT_CACHE_HIT, LLVM_CACHE_HIT, CMAKE_CACHE_HIT -
+# the string "true" skips the corresponding install.
+
+# Strict mode plus 'Stop' turn uninitialised variables and non-terminating
+# errors into terminating errors for the rest of the script.
+Set-StrictMode -Version Latest
+$ErrorActionPreference = 'Stop'
+
+Write-Host "::group::Installing Windows dependencies"
+
+function Retry-Command {
+  <#
+  .SYNOPSIS
+  Runs a script block up to MaxAttempts times with exponential backoff.
+
+  .DESCRIPTION
+  An attempt fails when the script block throws, or when a native command it
+  ran leaves an exit code that is not listed in SuccessExitCodes.
+  The script block's output is sent to the host instead of the pipeline, so the
+  function returns exactly one boolean. Without that, output from the command
+  becomes part of the return value and `-not (Retry-Command ...)` can no
+  longer detect a failure.
+
+  .PARAMETER Command
+  Script block to execute.
+
+  .PARAMETER MaxAttempts
+  Maximum number of attempts (default 3).
+
+  .PARAMETER DelaySeconds
+  Base delay; the wait after failed attempt N is DelaySeconds * 2^N seconds.
+
+  .PARAMETER SuccessExitCodes
+  Native exit codes treated as success. Defaults to 0 plus the installer codes
+  1641 (reboot initiated) and 3010 (reboot required).
+
+  .OUTPUTS
+  System.Boolean. $true on success, $false once every attempt has failed.
+  #>
+  param(
+    [scriptblock]$Command,
+    [int]$MaxAttempts = 3,
+    [int]$DelaySeconds = 5,
+    [int[]]$SuccessExitCodes = @(0, 1641, 3010)
+  )
+
+  for ($attempt = 1; $attempt -le $MaxAttempts; $attempt++) {
+    Write-Host "Attempt $attempt of $MaxAttempts..."
+    $failure = $null
+    try {
+      # Reset first so a stale exit code from an earlier command is not
+      # mistaken for the result of this attempt.
+      $global:LASTEXITCODE = 0
+      & $Command | Out-Host
+      # Native commands (choco, ...) do not throw on failure; check explicitly.
+      if ($SuccessExitCodes -notcontains $global:LASTEXITCODE) {
+        $failure = "exit code $($global:LASTEXITCODE)"
+      }
+    }
+    catch {
+      $failure = $_.Exception.Message
+    }
+
+    if ($null -eq $failure) {
+      return $true
+    }
+
+    if ($attempt -lt $MaxAttempts) {
+      $backoffDelay = $DelaySeconds * [Math]::Pow(2, $attempt)
+      Write-Host "⚠ Attempt failed ($failure), retrying in ${backoffDelay}s..." -ForegroundColor Yellow
+      Start-Sleep -Seconds $backoffDelay
+    }
+  }
+
+  # Every attempt failed (or MaxAttempts was less than 1).
+  return $false
+}
+
+# Cache flags arrive as strings from the environment; only the exact value
+# "true" counts as a hit.
+$tesseractCacheHit = $env:TESSERACT_CACHE_HIT -eq "true"
+$llvmCacheHit = $env:LLVM_CACHE_HIT -eq "true"
+$cmakeCacheHit = $env:CMAKE_CACHE_HIT -eq "true"
+# NOTE(review): $cmakeInstalled is assigned in this section but not read
+# anywhere else in this script; the CMake install below keys off
+# $cmakeCacheHit only.
+$cmakeInstalled = $false
+
+Write-Host "Cache status:"
+Write-Host "  TESSERACT_CACHE_HIT: $env:TESSERACT_CACHE_HIT (evaluated: $tesseractCacheHit)"
+Write-Host "  LLVM_CACHE_HIT: $env:LLVM_CACHE_HIT (evaluated: $llvmCacheHit)"
+Write-Host "  CMAKE_CACHE_HIT: $env:CMAKE_CACHE_HIT (evaluated: $cmakeCacheHit)"
+Write-Host ""
+# Probe for an existing cmake; the catch branch reports that it was not found.
+try {
+  & cmake --version 2>$null
+  Write-Host "✓ CMake already installed"
+  $cmakeInstalled = $true
+}
+catch {
+  Write-Host "CMake not found, will attempt to install"
+}
+
+# Tesseract: installed only on a cache miss. Every failure in this section is
+# reported as a warning and never stops the script.
+if (-not $tesseractCacheHit) {
+  Write-Host "Tesseract cache miss, installing (optional for build - needed for tests only)..."
+  if (-not (Retry-Command { choco install -y tesseract --no-progress } -MaxAttempts 3)) {
+    Write-Host "::warning::Failed to install Tesseract (optional dependency - gem build does not require it)"
+  }
+  else {
+    Write-Host "✓ Tesseract installed"
+    # Ensure tessdata directory exists and is accessible
+    $tesseractPath = "C:\Program Files\Tesseract-OCR"
+    if (Test-Path $tesseractPath) {
+      Write-Host "  Configuring Tesseract data paths..."
+
+      # Create tessdata directory if it doesn't exist
+      $tessdataPath = "$tesseractPath\tessdata"
+      if (-not (Test-Path $tessdataPath)) {
+        Write-Host "  Creating tessdata directory at: $tessdataPath"
+        New-Item -ItemType Directory -Path $tessdataPath -Force | Out-Null
+      }
+
+      # Download English language data if not present
+      if (-not (Test-Path "$tessdataPath\eng.traineddata")) {
+        Write-Host "  Downloading English language data..."
+        try {
+          $engUrl = "https://github.com/tesseract-ocr/tessdata_fast/raw/main/eng.traineddata"
+          Invoke-WebRequest -Uri $engUrl -OutFile "$tessdataPath\eng.traineddata" -ErrorAction Stop
+          Write-Host "  ✓ Downloaded eng.traineddata"
+        }
+        catch {
+          # Download problems are downgraded to a warning.
+          Write-Host "  ::warning::Failed to download eng.traineddata: $($_.Exception.Message)"
+        }
+      }
+
+      # Download OSD data if not present (needed for orientation detection)
+      if (-not (Test-Path "$tessdataPath\osd.traineddata")) {
+        Write-Host "  Downloading OSD data..."
+        try {
+          $osdUrl = "https://github.com/tesseract-ocr/tessdata_fast/raw/main/osd.traineddata"
+          Invoke-WebRequest -Uri $osdUrl -OutFile "$tessdataPath\osd.traineddata" -ErrorAction Stop
+          Write-Host "  ✓ Downloaded osd.traineddata"
+        }
+        catch {
+          # Download problems are downgraded to a warning.
+          Write-Host "  ::warning::Failed to download osd.traineddata: $($_.Exception.Message)"
+        }
+      }
+    }
+  }
+}
+else {
+  Write-Host "✓ Tesseract found in cache"
+}
+
+# LLVM/Clang: installed on a cache miss; a failed install is only a warning.
+if (-not $llvmCacheHit) {
+  Write-Host "LLVM cache miss, installing LLVM/Clang (required for bindgen)..."
+  if (-not (Retry-Command { choco install -y llvm --no-progress } -MaxAttempts 3)) {
+    Write-Host "::warning::Failed to install LLVM/Clang via Chocolatey"
+  }
+  else {
+    Write-Host "✓ LLVM/Clang installed"
+  }
+}
+else {
+  Write-Host "✓ LLVM/Clang found in cache"
+}
+
+# PHP: probe first, install through Chocolatey only when the probe lands in
+# the catch branch. A failed install is only a warning.
+# NOTE(review): $phpInstalled is assigned here but not read anywhere else in
+# this script.
+Write-Host "Installing PHP..."
+$phpInstalled = $false
+try {
+  & php --version 2>$null
+  Write-Host "✓ PHP already installed"
+  $phpInstalled = $true
+}
+catch {
+  Write-Host "PHP not found, installing via Chocolatey..."
+  if (-not (Retry-Command { choco install -y php --no-progress } -MaxAttempts 3)) {
+    Write-Host "::warning::Failed to install PHP via Chocolatey, will rely on shivammathur/setup-php action"
+  }
+  else {
+    Write-Host "✓ PHP installed via Chocolatey"
+    $phpInstalled = $true
+  }
+}
+
+# CMake: the only dependency whose failed install throws and stops the script.
+Write-Host "Installing CMake..."
+if (-not $cmakeCacheHit) {
+  Write-Host "CMake cache miss, installing..."
+  if (-not (Retry-Command { choco install -y cmake --no-progress } -MaxAttempts 3)) {
+    throw "Failed to install CMake after 3 attempts"
+  }
+  Write-Host "✓ CMake installed"
+}
+else {
+  Write-Host "✓ CMake found in cache"
+}
+
+# Publish the known install directories: each existing directory is appended
+# to the GITHUB_PATH file and also prepended to PATH of the current process.
+Write-Host "Configuring PATH and environment variables..."
+$paths = @(
+  "C:\Program Files\CMake\bin",
+  "C:\Program Files\Tesseract-OCR",
+  "C:\Program Files\LLVM\bin",
+  "C:\tools\php",
+  "C:\Program Files\PHP"
+)
+
+foreach ($path in $paths) {
+  if (Test-Path $path) {
+    Write-Host "  Adding to PATH: $path"
+    Write-Output $path | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
+    $env:PATH = "$path;$env:PATH"
+  }
+  else {
+    Write-Host "  Path not found (skipping): $path"
+  }
+}
+
+# Ensure TESSDATA_PREFIX is set for Windows OCR tests
+# (written to the GITHUB_ENV file and set in the current process; only done
+# when the tessdata directory exists)
+$tesseractPath = "C:\Program Files\Tesseract-OCR"
+if (Test-Path $tesseractPath) {
+  $tessdataPath = "$tesseractPath\tessdata"
+  if (Test-Path $tessdataPath) {
+    Write-Host "  Setting TESSDATA_PREFIX for tests: $tessdataPath"
+    Add-Content -Path $env:GITHUB_ENV -Value "TESSDATA_PREFIX=$tessdataPath"
+    $env:TESSDATA_PREFIX = $tessdataPath
+  }
+}
+
+Write-Host "::endgroup::"
+
+Write-Host "::group::Verifying Windows installations"
+
+# Tesseract verification never fails the script: when the command cannot be
+# resolved, the outer catch falls back to searching well-known install paths.
+Write-Host "Tesseract (optional for build):"
+try {
+  $tesseractCmd = Get-Command tesseract -ErrorAction Stop
+  $tesseractPath = $tesseractCmd.Path
+  Write-Host "  Found at: $tesseractPath"
+  Write-Host "  Command type: $($tesseractCmd.CommandType)"
+
+  # Get installation directory
+  $tesseractDir = Split-Path -Parent $tesseractPath
+  Write-Host "  Installation directory: $tesseractDir"
+
+  # Check for tessdata
+  $tessdataPath = Join-Path $tesseractDir "tessdata"
+  if (Test-Path $tessdataPath) {
+    Write-Host "  tessdata directory: $tessdataPath"
+    Write-Host "  Available language files:"
+    Get-ChildItem "$tessdataPath\*.traineddata" -ErrorAction SilentlyContinue | ForEach-Object {
+      Write-Host "    - $($_.Name)"
+    }
+  }
+  else {
+    Write-Host "  tessdata directory not found at: $tessdataPath"
+  }
+
+  # Running the binary is wrapped separately so that a run failure is reported
+  # as a warning without skipping the TESSDATA_PREFIX export below.
+  try {
+    $version = & tesseract --version 2>&1
+    Write-Host "  Version output: $version"
+    Write-Host "✓ Tesseract available and working"
+
+    Write-Host ""
+    Write-Host "Available Tesseract languages:"
+    & tesseract --list-langs 2>&1 | ForEach-Object { Write-Host "  $_" }
+  }
+  catch {
+    Write-Host "⚠ Warning: Tesseract found but failed to run: $($_.Exception.Message)"
+  }
+
+  # Set TESSDATA_PREFIX environment variable for tests
+  if (Test-Path $tessdataPath) {
+    Write-Host ""
+    Write-Host "Setting TESSDATA_PREFIX environment variable..."
+    Add-Content -Path $env:GITHUB_ENV -Value "TESSDATA_PREFIX=$tessdataPath"
+    Write-Host "✓ Set TESSDATA_PREFIX=$tessdataPath in GITHUB_ENV"
+    $env:TESSDATA_PREFIX = $tessdataPath
+  }
+}
+catch {
+  Write-Host "⚠ Tesseract not found on PATH (not required for build)"
+  Write-Host "  Error details: $($_.Exception.Message)"
+  Write-Host "  Searching common installation locations..."
+
+  # Candidate executables, checked in order; the first existing one wins.
+  $commonPaths = @(
+    "C:\Program Files\Tesseract-OCR\tesseract.exe",
+    "C:\Program Files (x86)\Tesseract-OCR\tesseract.exe",
+    "${env:ProgramFiles}\Tesseract-OCR\tesseract.exe",
+    "${env:ProgramFiles(x86)}\Tesseract-OCR\tesseract.exe"
+  )
+
+  $found = $false
+  foreach ($path in $commonPaths) {
+    if (Test-Path $path) {
+      Write-Host "  Found Tesseract at: $path (not on PATH)"
+      $tesseractDir = Split-Path -Parent $path
+      $tessdataPath = Join-Path $tesseractDir "tessdata"
+      if (Test-Path $tessdataPath) {
+        Write-Host "  Found tessdata at: $tessdataPath"
+        Add-Content -Path $env:GITHUB_ENV -Value "TESSDATA_PREFIX=$tessdataPath"
+        Write-Host "✓ Set TESSDATA_PREFIX=$tessdataPath in GITHUB_ENV"
+        $env:TESSDATA_PREFIX = $tessdataPath
+      }
+      $found = $true
+      break
+    }
+  }
+
+  if (-not $found) {
+    Write-Host "  Tesseract not found in common locations"
+  }
+}
+
+# CMake is mandatory: a failed check rethrows and fails the script.
+Write-Host ""
+Write-Host "CMake:"
+try {
+  & cmake --version
+  Write-Host "✓ CMake available"
+  # Export CMAKE environment variable for immediate availability in build scripts
+  $cmakePath = (Get-Command cmake -ErrorAction Stop).Source
+  if ($cmakePath) {
+    Add-Content -Path $env:GITHUB_ENV -Value "CMAKE=$cmakePath"
+    Write-Host "✓ Set CMAKE=$cmakePath in GITHUB_ENV"
+  }
+}
+catch {
+  Write-Host "::error::CMake not found after installation"
+  throw "CMake verification failed"
+}
+
+# Clang is checked for information only; its absence is a warning.
+Write-Host ""
+Write-Host "Clang:"
+try {
+  & clang --version
+  Write-Host "✓ Clang available"
+}
+catch {
+  Write-Host "⚠ Warning: Clang not currently available on PATH"
+}
+
+# PHP is checked for information only; its absence is a warning.
+Write-Host ""
+Write-Host "PHP:"
+try {
+  & php --version
+  Write-Host "✓ PHP available"
+}
+catch {
+  Write-Host "⚠ Warning: PHP not currently available on PATH (will be set up by shivammathur/setup-php action)"
+}
+
+Write-Host "::endgroup::"
--- a/scripts/ci/r/vendor-kreuzberg-core.py
+++ b/scripts/ci/r/vendor-kreuzberg-core.py
@@ -0,0 +1,433 @@
+#!/usr/bin/env python3
+"""
+Vendor kreuzberg core crate into R package
+Used by: ci-r.yaml - Vendor kreuzberg core crate step
+
+This script:
+1. Reads workspace.dependencies from root Cargo.toml
+2. Copies core crates to packages/r/vendor/
+3. Replaces workspace = true with explicit versions
+4. Generates vendor/Cargo.toml with proper workspace setup
+"""
+
+import os
+import sys
+import shutil
+import re
+from pathlib import Path
+
+try:
+    import tomllib
+except ImportError:
+    import tomli as tomllib  # type: ignore
+
+
+def get_repo_root() -> Path:
+    """Get repository root directory."""
+    repo_root_env = os.environ.get("REPO_ROOT")
+    if repo_root_env:
+        return Path(repo_root_env)
+
+    script_dir = Path(__file__).parent.absolute()
+    return (script_dir / ".." / ".." / "..").resolve()
+
+
+def read_toml(path: Path) -> dict[str, object]:
+    """Read TOML file."""
+    with open(path, "rb") as f:
+        return tomllib.load(f)
+
+
+def get_workspace_deps(repo_root: Path) -> dict[str, object]:
+    """Extract workspace.dependencies from root Cargo.toml."""
+    cargo_toml_path = repo_root / "Cargo.toml"
+    data = read_toml(cargo_toml_path)
+    return data.get("workspace", {}).get("dependencies", {})
+
+
+def get_workspace_version(repo_root: Path) -> str:
+    """Extract version from workspace.package."""
+    cargo_toml_path = repo_root / "Cargo.toml"
+    data = read_toml(cargo_toml_path)
+    return data.get("workspace", {}).get("package", {}).get("version", "4.0.0")
+
+
+def format_dependency(name: str, dep_spec: object) -> str:
+    """Format a dependency spec for Cargo.toml."""
+    if isinstance(dep_spec, str):
+        return f'{name} = "{dep_spec}"'
+    elif isinstance(dep_spec, dict):
+        version: str = dep_spec.get("version", "")
+        package: str | None = dep_spec.get("package")
+        features: list[str] = dep_spec.get("features", [])
+        default_features: bool | None = dep_spec.get("default-features")
+        optional: bool | None = dep_spec.get("optional")
+
+        path: str | None = dep_spec.get("path")
+        git: str | None = dep_spec.get("git")
+        branch: str | None = dep_spec.get("branch")
+        tag: str | None = dep_spec.get("tag")
+        rev: str | None = dep_spec.get("rev")
+
+        parts: list[str] = []
+
+        if package:
+            parts.append(f'package = "{package}"')
+
+        if git:
+            parts.append(f'git = "{git}"')
+
+        if branch:
+            parts.append(f'branch = "{branch}"')
+
+        if tag:
+            parts.append(f'tag = "{tag}"')
+
+        if rev:
+            parts.append(f'rev = "{rev}"')
+
+        if path:
+            parts.append(f'path = "{path}"')
+
+        if version:
+            parts.append(f'version = "{version}"')
+
+        if features:
+            features_str = ', '.join(f'"{f}"' for f in features)
+            parts.append(f'features = [{features_str}]')
+
+        if default_features is False:
+            parts.append('default-features = false')
+        elif default_features is True:
+            parts.append('default-features = true')
+
+        if optional is True:
+            parts.append('optional = true')
+        elif optional is False:
+            parts.append('optional = false')
+
+        spec_str = ", ".join(parts)
+        return f"{name} = {{ {spec_str} }}"
+
+    return f'{name} = "{dep_spec}"'
+
+
+def replace_workspace_deps_in_toml(toml_path: Path, workspace_deps: dict[str, object]) -> None:
+    """Replace workspace = true with explicit versions in a Cargo.toml file."""
+    with open(toml_path, "r") as f:
+        content = f.read()
+
+    for name, dep_spec in workspace_deps.items():
+        pattern1 = rf'^{re.escape(name)} = \{{ workspace = true \}}$'
+        content = re.sub(pattern1, format_dependency(name, dep_spec), content, flags=re.MULTILINE)
+
+        def replace_with_fields(match: re.Match[str]) -> str:
+            other_fields_str = match.group(1).strip()
+            base_spec = format_dependency(name, dep_spec)
+            if " = { " not in base_spec:
+                # Simple string dep like `ctor = "0.6"` - wrap it
+                version_val = base_spec.split(" = ", 1)[1].strip('"')
+                spec_part = f'version = "{version_val}"'
+            else:
+                spec_part = base_spec.split(" = { ", 1)[1].rstrip("} ").rstrip("}")
+
+            # Extract existing keys and values from workspace spec, handling nested brackets
+            workspace_fields: dict[str, str] = {}
+            bracket_depth = 0
+            current_field = ""
+            for char in spec_part:
+                if char == '[':
+                    bracket_depth += 1
+                    current_field += char
+                elif char == ']':
+                    bracket_depth -= 1
+                    current_field += char
+                elif char == ',' and bracket_depth == 0:
+                    # End of field
+                    field = current_field.strip()
+                    if field and "=" in field:
+                        key, val = field.split("=", 1)
+                        workspace_fields[key.strip()] = val.strip()
+                    current_field = ""
+                else:
+                    current_field += char
+
+            # Don't forget the last field
+            if current_field.strip():
+                field = current_field.strip()
+                if field and "=" in field:
+                    key, val = field.split("=", 1)
+                    workspace_fields[key.strip()] = val.strip()
+
+            # Extract crate-specific keys using bracket-aware parsing
+            crate_fields: dict[str, str] = {}
+            bracket_depth = 0
+            current_field = ""
+            for char in other_fields_str:
+                if char == '[':
+                    bracket_depth += 1
+                    current_field += char
+                elif char == ']':
+                    bracket_depth -= 1
+                    current_field += char
+                elif char == ',' and bracket_depth == 0:
+                    # End of field
+                    field = current_field.strip()
+                    if field and "=" in field:
+                        key, val = field.split("=", 1)
+                        crate_fields[key.strip()] = val.strip()
+                    current_field = ""
+                else:
+                    current_field += char
+
+            # Don't forget the last field
+            if current_field.strip():
+                field = current_field.strip()
+                if field and "=" in field:
+                    key, val = field.split("=", 1)
+                    crate_fields[key.strip()] = val.strip()
+
+            # Merge: crate-specific fields override workspace fields
+            merged_fields = {**workspace_fields, **crate_fields}
+
+            # Build result from merged fields
+            merged_parts = [f"{k} = {v}" for k, v in merged_fields.items()]
+            merged_spec = ", ".join(merged_parts)
+
+            return f"{name} = {{ {merged_spec} }}"
+
+        pattern2 = rf'^{re.escape(name)} = \{{ workspace = true, (.+?) \}}$'
+        content = re.sub(pattern2, replace_with_fields, content, flags=re.MULTILINE | re.DOTALL)
+
+    with open(toml_path, "w") as f:
+        f.write(content)
+
+
+def generate_vendor_cargo_toml(repo_root: Path, workspace_deps: dict[str, object], core_version: str, copied_crates: list[str]) -> None:
+    """Generate vendor/Cargo.toml with workspace setup.
+
+    Args:
+        repo_root: Repository root directory
+        workspace_deps: Workspace dependencies from Cargo.toml
+        core_version: Core version string
+        copied_crates: List of crates that were successfully copied
+    """
+
+    deps_lines: list[str] = []
+    for name, dep_spec in sorted(workspace_deps.items()):
+        deps_lines.append(format_dependency(name, dep_spec))
+
+    deps_str = "\n".join(deps_lines)
+
+    # Build members list based on actually copied crates
+    members = [name for name in ["kreuzberg", "kreuzberg-ffi", "kreuzberg-tesseract", "kreuzberg-paddle-ocr"]
+               if name in copied_crates]
+    members_str = ', '.join(f'"{m}"' for m in members)
+
+    vendor_toml = f'''[workspace]
+members = [{members_str}]
+
+[workspace.package]
+version = "{core_version}"
+edition = "2024"
+rust-version = "1.91"
+authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
+license = "MIT"
+repository = "https://github.com/kreuzberg-dev/kreuzberg"
+homepage = "https://kreuzberg.dev"
+
+[workspace.dependencies]
+{deps_str}
+'''
+
+    vendor_dir = repo_root / "packages" / "r" / "vendor"
+    vendor_dir.mkdir(parents=True, exist_ok=True)
+
+    toml_path = vendor_dir / "Cargo.toml"
+    with open(toml_path, "w") as f:
+        f.write(vendor_toml)
+
+
+def main() -> None:
+    """Main vendoring function."""
+    repo_root: Path = get_repo_root()
+
+    print("=== Vendoring kreuzberg core crate ===")
+
+    workspace_deps: dict[str, object] = get_workspace_deps(repo_root)
+    core_version: str = get_workspace_version(repo_root)
+
+    print(f"Core version: {core_version}")
+    print(f"Workspace dependencies: {len(workspace_deps)}")
+
+    vendor_base: Path = repo_root / "packages" / "r" / "vendor"
+
+    # Clean only crate directories
+    crate_names = ["kreuzberg", "kreuzberg-ffi", "kreuzberg-tesseract",
+                   "kreuzberg-paddle-ocr"]
+    for name in crate_names:
+        crate_path = vendor_base / name
+        if crate_path.exists():
+            shutil.rmtree(crate_path)
+    # Also clean the vendor Cargo.toml (will be regenerated)
+    vendor_cargo = vendor_base / "Cargo.toml"
+    if vendor_cargo.exists():
+        vendor_cargo.unlink()
+    print("Cleaned vendor crate directories")
+
+    vendor_base.mkdir(parents=True, exist_ok=True)
+
+    crates_to_copy: list[tuple[str, str]] = [
+        ("crates/kreuzberg", "kreuzberg"),
+        ("crates/kreuzberg-ffi", "kreuzberg-ffi"),
+        ("crates/kreuzberg-tesseract", "kreuzberg-tesseract"),
+        ("crates/kreuzberg-paddle-ocr", "kreuzberg-paddle-ocr"),
+    ]
+
+    copied_crates: list[str] = []
+    for src_rel, dest_name in crates_to_copy:
+        src: Path = repo_root / src_rel
+        dest: Path = vendor_base / dest_name
+        if src.exists():
+            try:
+                shutil.copytree(src, dest)
+                copied_crates.append(dest_name)
+                print(f"Copied {dest_name}")
+            except Exception as e:
+                print(f"Warning: Failed to copy {dest_name}: {e}", file=sys.stderr)
+        else:
+            print(f"Warning: Source directory not found: {src_rel}")
+
+    artifact_dirs: list[str] = [".fastembed_cache", "target"]
+    temp_patterns: list[str] = ["*.swp", "*.bak", "*.tmp", "*~"]
+
+    for crate_dir in copied_crates:
+        crate_path: Path = vendor_base / crate_dir
+        if crate_path.exists():
+            for artifact_dir in artifact_dirs:
+                artifact: Path = crate_path / artifact_dir
+                if artifact.exists():
+                    shutil.rmtree(artifact)
+
+            for pattern in temp_patterns:
+                for f in crate_path.rglob(pattern):
+                    f.unlink()
+
+    print("Cleaned build artifacts")
+
+    # Update workspace inheritance in Cargo.toml files
+    for crate_dir in copied_crates:
+        crate_toml = vendor_base / crate_dir / "Cargo.toml"
+        if crate_toml.exists():
+            with open(crate_toml, "r") as f:
+                content = f.read()
+
+            content = re.sub(r'^version\.workspace = true$', f'version = "{core_version}"', content, flags=re.MULTILINE)
+            content = re.sub(r'^edition\.workspace = true$', 'edition = "2024"', content, flags=re.MULTILINE)
+            content = re.sub(r'^rust-version\.workspace = true$', 'rust-version = "1.91"', content, flags=re.MULTILINE)
+            content = re.sub(r'^authors\.workspace = true$', 'authors = ["Na\'aman Hirschfeld <naaman@kreuzberg.dev>"]', content, flags=re.MULTILINE)
+            content = re.sub(r'^license\.workspace = true$', 'license = "MIT"', content, flags=re.MULTILINE)
+
+            with open(crate_toml, "w") as f:
+                f.write(content)
+
+            replace_workspace_deps_in_toml(crate_toml, workspace_deps)
+            print(f"Updated {crate_dir}/Cargo.toml")
+
+    # Update path dependencies in all crates that depend on other vendored crates
+    # First handle kreuzberg-ffi's dependency on kreuzberg
+    if "kreuzberg-ffi" in copied_crates:
+        ffi_toml = vendor_base / "kreuzberg-ffi" / "Cargo.toml"
+        if ffi_toml.exists():
+            with open(ffi_toml, "r") as f:
+                content = f.read()
+
+            if "kreuzberg" in copied_crates:
+                # Replace kreuzberg workspace references with path dependency
+                # Handle cases with path, version, or neither
+                content = re.sub(
+                    r'(kreuzberg = \{) (?:(?:path|version) = "[^"]*", )?',
+                    r'\1 path = "../kreuzberg", ',
+                    content
+                )
+
+            with open(ffi_toml, "w") as f:
+                f.write(content)
+
+    # Update path dependencies in kreuzberg crate if tesseract was copied
+    if "kreuzberg" in copied_crates:
+        kreuzberg_toml = vendor_base / "kreuzberg" / "Cargo.toml"
+        if kreuzberg_toml.exists():
+            with open(kreuzberg_toml, "r") as f:
+                content = f.read()
+
+            # Only update tesseract path if it was actually copied
+            if "kreuzberg-tesseract" in copied_crates:
+                content = re.sub(
+                    r'kreuzberg-tesseract = \{ version = "[^"]*", optional = true \}',
+                    'kreuzberg-tesseract = { path = "../kreuzberg-tesseract", optional = true }',
+                    content
+                )
+            # Only update paddle-ocr path if it was actually copied
+            if "kreuzberg-paddle-ocr" in copied_crates:
+                content = re.sub(
+                    r'kreuzberg-paddle-ocr = \{ version = "[^"]*", optional = true \}',
+                    'kreuzberg-paddle-ocr = { path = "../kreuzberg-paddle-ocr", optional = true }',
+                    content
+                )
+
+            with open(kreuzberg_toml, "w") as f:
+                f.write(content)
+
+    generate_vendor_cargo_toml(repo_root, workspace_deps, core_version, copied_crates)
+    print("Generated vendor/Cargo.toml")
+
+    # Copy root Cargo.lock so vendor workspace uses identical dependency versions
+    root_lock = repo_root / "Cargo.lock"
+    vendor_lock = vendor_base / "Cargo.lock"
+    if root_lock.exists():
+        shutil.copy2(root_lock, vendor_lock)
+        print("Copied Cargo.lock to vendor directory")
+
+    # Update R package Cargo.toml to use vendored crates
+    r_toml = repo_root / "packages" / "r" / "src" / "rust" / "Cargo.toml"
+    if r_toml.exists():
+        with open(r_toml, "r") as f:
+            content = f.read()
+
+        # Replace path dependencies to point to vendored crates
+        # From: path = "../../../../crates/kreuzberg"
+        # To: path = "../../vendor/kreuzberg"
+        content = re.sub(
+            r'path = "\.\./\.\./\.\./\.\./crates/kreuzberg"',
+            'path = "../../vendor/kreuzberg"',
+            content
+        )
+        content = re.sub(
+            r'path = "\.\./\.\./\.\./\.\./crates/kreuzberg-ffi"',
+            'path = "../../vendor/kreuzberg-ffi"',
+            content
+        )
+
+        with open(r_toml, "w") as f:
+            f.write(content)
+
+        print("Updated R package Cargo.toml to use vendored crates")
+
+    print(f"\nVendoring complete (core version: {core_version})")
+    print(f"Copied crates: {', '.join(sorted(copied_crates))}")
+
+    if "kreuzberg" in copied_crates and "kreuzberg-ffi" in copied_crates:
+        print("R package Cargo.toml uses:")
+        print("  - path '../../vendor/kreuzberg' for kreuzberg crate")
+        print("  - path '../../vendor/kreuzberg-ffi' for kreuzberg-ffi crate")
+    else:
+        print("Warning: Some required crates were not copied. Check for missing source directories.")
+
+
+# Script entry point: any exception raised by main() is reported on stderr as
+# a one-line message and turned into exit status 1.
+if __name__ == "__main__":
+    try:
+        main()
+    except Exception as e:
+        print(f"Error: {e}", file=sys.stderr)
+        sys.exit(1)
--- a/scripts/ci/ruby/compile-extension.sh
+++ b/scripts/ci/ruby/compile-extension.sh
@@ -0,0 +1,95 @@
+#!/usr/bin/env bash
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+REPO_ROOT="${REPO_ROOT:-$(cd "$SCRIPT_DIR/../../.." && pwd)}"
+
+source "$REPO_ROOT/scripts/lib/common.sh"
+source "$REPO_ROOT/scripts/lib/library-paths.sh"
+
+validate_repo_root "$REPO_ROOT" || exit 1
+setup_rust_ffi_paths "$REPO_ROOT"
+
+echo "=== Compiling Ruby native extension (Verbose Debug) ==="
+cd "$REPO_ROOT/packages/ruby"
+
+export CARGO_BUILD_JOBS=1
+export RUST_BACKTRACE=1
+export RB_SYS_VERBOSE=1
+
+echo ""
+echo "=== Pre-compilation environment ==="
+echo "Ruby version: $(ruby --version)"
+echo "Ruby platform: $(ruby -e 'puts RUBY_PLATFORM')"
+echo "Rustc version: $(rustc --version)"
+echo "Cargo version: $(cargo --version)"
+echo "Working directory: $(pwd)"
+echo ""
+
+echo "=== Build configuration variables ==="
+echo "CARGO_BUILD_JOBS: ${CARGO_BUILD_JOBS}"
+echo "RUST_BACKTRACE: ${RUST_BACKTRACE}"
+echo "RB_SYS_VERBOSE: ${RB_SYS_VERBOSE}"
+echo "LD_LIBRARY_PATH: ${LD_LIBRARY_PATH:-<not set>}"
+echo "DYLD_LIBRARY_PATH: ${DYLD_LIBRARY_PATH:-<not set>}"
+echo ""
+
+echo "=== Pre-vendor directory state ==="
+echo "packages/ruby directory contents:"
+find . -maxdepth 1 -type f -o -maxdepth 1 -type d | head -20
+echo ""
+
+echo "=== Vendoring kreuzberg core ==="
+python3 "$REPO_ROOT/scripts/ci/ruby/vendor-kreuzberg-core.py"
+
+echo ""
+echo "=== Post-vendor directory state ==="
+if [ -d "ext/kreuzberg_rb/vendor" ]; then
+  echo "Vendor directory contents:"
+  find ext/kreuzberg_rb/vendor -maxdepth 2 -type f | head -10
+else
+  echo "WARNING: No vendor directory found in ext/kreuzberg_rb"
+fi
+echo ""
+
+echo "=== Running rake compile with verbose output ==="
+# On failure, dump whatever diagnostics are available and fail the step.
+if ! bundle exec rake compile --verbose --trace 2>&1; then
+  echo ""
+  echo "ERROR: rake compile failed"
+  echo "=== Attempting to capture compilation error details ==="
+
+  # mkmf.log is shown only when it exists in the current directory.
+  if [ -f "mkmf.log" ]; then
+    echo "=== mkmf.log (last 150 lines) ==="
+    tail -150 mkmf.log
+  fi
+
+  echo ""
+  echo "=== Looking for compiled artifacts ==="
+  find . -name "*.so" -o -name "*.dll" -o -name "*.dylib" 2>/dev/null | head -20
+
+  echo ""
+  echo "=== Checking gem installation ==="
+  gem list kreuzberg || echo "Gem not found"
+
+  exit 1
+fi
+
+echo ""
+echo "=== Post-compilation directory state ==="
+echo "lib/ contents:"
+if [[ -d "lib" ]]; then
+  # The name tests are grouped so that -type f applies to every pattern.
+  # find exits 0 even when nothing matches, so the "nothing found" message has
+  # to be decided from its output rather than its exit status.
+  # *.bundle is the extension suffix Ruby uses on macOS.
+  compiled_extensions="$(
+    find lib -type f \
+      \( -name "*.so" -o -name "*.dll" -o -name "*.dylib" -o -name "*.bundle" \) \
+      2>/dev/null || true
+  )"
+  if [[ -n "$compiled_extensions" ]]; then
+    printf '%s\n' "$compiled_extensions"
+  else
+    echo "No compiled extension found"
+  fi
+else
+  echo "ERROR: lib directory not found"
+fi
+echo ""
+
+echo "=== Verifying extension can be loaded ==="
+# A load failure is reported but does not fail the step.
+if ! ruby -e "require_relative 'lib/kreuzberg'; puts 'Extension loaded successfully'" 2>&1; then
+  echo "WARNING: Could not load extension directly"
+  echo "This might be expected if gem installation is required"
+fi
+
+echo ""
+echo "=== Compilation complete ==="
--- a/scripts/ci/ruby/install-bundler.sh
+++ b/scripts/ci/ruby/install-bundler.sh
@@ -0,0 +1,5 @@
+#!/usr/bin/env bash
+set -euo pipefail
+# Try the pinned Bundler 4.0.3 first; if that install fails, fall back to the version `gem` picks by default.
+gem install bundler -v 4.0.3 --no-document || gem install bundler --no-document
+bundler --version
--- a/scripts/ci/ruby/install-ruby-deps.sh
+++ b/scripts/ci/ruby/install-ruby-deps.sh
@@ -0,0 +1,30 @@
+#!/usr/bin/env bash
+# Install the gems of packages/ruby with Bundler (deployment mode off).
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+REPO_ROOT="${REPO_ROOT:-$(cd "$SCRIPT_DIR/../../.." && pwd)}"
+
+source "$REPO_ROOT/scripts/lib/common.sh"
+validate_repo_root "$REPO_ROOT" || exit 1
+
+ruby_pkg_dir="$REPO_ROOT/packages/ruby"
+echo "=== Installing Ruby dependencies ==="
+cd "$ruby_pkg_dir"
+
+# Gems go to $BUNDLE_PATH when it is set, else into the package's .bundle directory.
+bundle_path="${BUNDLE_PATH:-$ruby_pkg_dir/.bundle/bundle}"
+
+# When $GITHUB_ENV is set, append a default for each Bundler variable that is
+# currently unset or empty to the file it names.
+if [[ -n "${GITHUB_ENV:-}" ]]; then
+  [[ -n "${BUNDLE_GEMFILE:-}" ]] || echo "BUNDLE_GEMFILE=$ruby_pkg_dir/Gemfile" >>"$GITHUB_ENV"
+  [[ -n "${BUNDLE_PATH:-}" ]] || echo "BUNDLE_PATH=$bundle_path" >>"$GITHUB_ENV"
+fi
+
+# Configure Bundler, then install with four parallel jobs.
+bundle config set deployment false
+bundle config set path "$bundle_path"
+bundle install --jobs 4
+
+echo "Ruby dependencies installed"
--- a/scripts/ci/ruby/vendor-kreuzberg-core.py
+++ b/scripts/ci/ruby/vendor-kreuzberg-core.py
@@ -0,0 +1,430 @@
+#!/usr/bin/env python3
+"""
+Vendor kreuzberg core crate into Ruby package
+Used by: ci-ruby.yaml - Vendor kreuzberg core crate step
+
+This script:
+1. Reads workspace.dependencies from root Cargo.toml
+2. Copies core crates to packages/ruby/vendor/
+3. Replaces workspace = true with explicit versions
+4. Generates vendor/Cargo.toml with proper workspace setup
+"""
+
+import os
+import sys
+import shutil
+import re
+from pathlib import Path
+
+try:
+    import tomllib
+except ImportError:
+    import tomli as tomllib  # type: ignore[import-not-found]
+
+
+def get_repo_root() -> Path:
+    """Return $REPO_ROOT when set and non-empty, else the directory three levels above this script."""
+    override = os.environ.get("REPO_ROOT")
+    if not override:
+        # Fallback: resolve "../../.." relative to the directory holding this file.
+        here = Path(__file__).parent.absolute()
+        return (here / ".." / ".." / "..").resolve()
+    return Path(override)
+
+
+def read_toml(path: Path) -> dict[str, object]:
+    """Parse the TOML file at ``path`` and return its top-level table."""
+    with path.open("rb") as handle:
+        return tomllib.load(handle)
+
+
+def get_workspace_deps(repo_root: Path) -> dict[str, object]:
+    """Return the ``[workspace.dependencies]`` table of the root Cargo.toml ({} when absent)."""
+    manifest = read_toml(repo_root / "Cargo.toml")
+    workspace = manifest.get("workspace", {})
+    return workspace.get("dependencies", {})
+
+
+def get_workspace_version(repo_root: Path) -> str:
+    """Return ``workspace.package.version`` from the root Cargo.toml, defaulting to "4.0.0"."""
+    manifest = read_toml(repo_root / "Cargo.toml")
+    package = manifest.get("workspace", {}).get("package", {})
+    return package.get("version", "4.0.0")
+
+
+def format_dependency(name: str, dep_spec: object) -> str:
+    """Render one dependency as a single Cargo.toml line.
+
+    Args:
+        name: Dependency key as it appears in the manifest.
+        dep_spec: Parsed TOML value: a version string or a table (dict).
+
+    Returns:
+        ``name = "<spec>"`` when ``dep_spec`` is not a dict (version strings
+        and, as a fallback, any other value via ``str()``). For a dict, an
+        inline table ``name = { ... }`` holding only the recognised keys, in
+        this fixed order: package, git, branch, tag, rev, path, version,
+        features, default-features, optional.
+
+    Notes:
+        Keys outside that list are dropped, and values are interpolated
+        verbatim (no TOML escaping is applied).
+
+    Example:
+        ``format_dependency("serde", {"version": "1", "features": ["derive"]})``
+        returns ``serde = { version = "1", features = ["derive"] }``.
+    """
+    if not isinstance(dep_spec, dict):
+        return f'{name} = "{dep_spec}"'
+
+    rendered: list[str] = []
+
+    # String-valued keys: emitted only when present and non-empty.
+    string_keys = (
+        "package",
+        "git",
+        "branch",
+        "tag",
+        "rev",
+        "path",
+        "version",
+    )
+    for key in string_keys:
+        value = dep_spec.get(key)
+        if value:
+            rendered.append(f'{key} = "{value}"')
+
+    # Feature list: each entry quoted and joined with ", "; skipped when empty.
+    feature_names = dep_spec.get("features", [])
+    if feature_names:
+        quoted = ", ".join(f'"{feature}"' for feature in feature_names)
+        rendered.append(f"features = [{quoted}]")
+
+    # Boolean keys: emitted only for a literal True or False, so an absent
+    # key (None) emits nothing.
+    for key in ("default-features", "optional"):
+        flag = dep_spec.get(key)
+        if flag is True:
+            rendered.append(f"{key} = true")
+        elif flag is False:
+            rendered.append(f"{key} = false")
+
+    body = ", ".join(rendered)
+    return f"{name} = {{ {body} }}"
+
+
+def replace_workspace_deps_in_toml(toml_path: Path, workspace_deps: dict[str, object]) -> None:
+    """Replace ``workspace = true`` dependency entries with explicit specs, in place.
+
+    Two entry shapes are rewritten for every name in ``workspace_deps``; both
+    must start at the beginning of a line and use exactly this spacing:
+
+    * ``name = { workspace = true }`` becomes ``format_dependency(name, spec)``.
+    * ``name = { workspace = true, <fields> }`` becomes the workspace spec
+      merged with ``<fields>``; on a key clash the crate's own field wins.
+
+    Args:
+        toml_path: Cargo.toml to rewrite; read and written as UTF-8.
+        workspace_deps: ``[workspace.dependencies]`` table of the root manifest.
+
+    Notes:
+        The field splitter is not a TOML parser. It only tracks ``[...]``
+        nesting, so a comma inside a quoted string or a nested inline table
+        would be treated as a field separator.
+
+        Matching is textual: entries written with different spacing are left
+        untouched.
+    """
+
+    def split_fields(fields: str) -> dict[str, str]:
+        """Parse a comma-separated ``key = value`` list into a dict.
+
+        Commas nested inside ``[...]`` do not split; chunks without ``=`` are
+        ignored; keys and values are stripped of surrounding whitespace.
+        """
+        chunks = [""]
+        depth = 0
+        for char in fields:
+            if char == "," and depth == 0:
+                chunks.append("")
+                continue
+            if char == "[":
+                depth += 1
+            elif char == "]":
+                depth -= 1
+            chunks[-1] += char
+
+        parsed: dict[str, str] = {}
+        for chunk in chunks:
+            key, separator, value = chunk.partition("=")
+            if separator:
+                parsed[key.strip()] = value.strip()
+        return parsed
+
+    def merged_entry(name: str, dep_spec: object, crate_fields_str: str) -> str:
+        """Return ``name = { ... }`` combining the workspace spec with crate fields.
+
+        ``crate_fields_str`` is the text that followed ``workspace = true,``.
+        """
+        base_spec = format_dependency(name, dep_spec)
+        if " = { " not in base_spec:
+            # Simple string dep like `ctor = "0.6"` - wrap it
+            version_val = base_spec.split(" = ", 1)[1].strip('"')
+            spec_part = f'version = "{version_val}"'
+        else:
+            spec_part = base_spec.split(" = { ", 1)[1].rstrip("} ").rstrip("}")
+
+        # Later keys win, so crate-specific fields override workspace fields.
+        merged_fields = {**split_fields(spec_part), **split_fields(crate_fields_str)}
+        merged_spec = ", ".join(f"{key} = {value}" for key, value in merged_fields.items())
+        return f"{name} = {{ {merged_spec} }}"
+
+    with open(toml_path, "r", encoding="utf-8") as f:
+        content = f.read()
+
+    for name, dep_spec in workspace_deps.items():
+        escaped_name = re.escape(name)
+
+        # Replacements are passed as callables: a replacement *string* would have
+        # its backslashes interpreted as escapes/group references by re.sub.
+        plain_entry = format_dependency(name, dep_spec)
+        pattern1 = rf'^{escaped_name} = \{{ workspace = true \}}$'
+        content = re.sub(pattern1, lambda _match: plain_entry, content, flags=re.MULTILINE)
+
+        # DOTALL makes `.` match newlines, so the extra fields may span several lines.
+        pattern2 = rf'^{escaped_name} = \{{ workspace = true, (.+?) \}}$'
+        content = re.sub(
+            pattern2,
+            lambda match: merged_entry(name, dep_spec, match.group(1).strip()),
+            content,
+            flags=re.MULTILINE | re.DOTALL,
+        )
+
+    with open(toml_path, "w", encoding="utf-8") as f:
+        f.write(content)
+
+
+def generate_vendor_cargo_toml(repo_root: Path, workspace_deps: dict[str, object], core_version: str, copied_crates: list[str]) -> None:
+    """Write ``packages/ruby/vendor/Cargo.toml``, the manifest of the vendored workspace.
+
+    Args:
+        repo_root: Repository root directory.
+        workspace_deps: Root ``[workspace.dependencies]`` table; every entry is
+            re-emitted, sorted by name, via ``format_dependency``.
+        core_version: Version written to ``[workspace.package]``.
+        copied_crates: Crates actually vendored; only these become members.
+
+    The remaining ``[workspace.package]`` values are hard-coded in the template.
+    """
+    known_crates = (
+        "kreuzberg",
+        "kreuzberg-ffi",
+        "kreuzberg-tesseract",
+        "kreuzberg-paddle-ocr",
+        "rb-sys",
+    )
+    # Members keep the fixed order above, restricted to what was copied.
+    members_str = ", ".join(f'"{crate}"' for crate in known_crates if crate in copied_crates)
+    deps_str = "\n".join(format_dependency(dep, workspace_deps[dep]) for dep in sorted(workspace_deps))
+
+    vendor_toml = f'''[workspace]
+members = [{members_str}]
+
+[workspace.package]
+version = "{core_version}"
+edition = "2024"
+rust-version = "1.91"
+authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
+license = "MIT"
+repository = "https://github.com/kreuzberg-dev/kreuzberg"
+homepage = "https://kreuzberg.dev"
+
+[workspace.dependencies]
+{deps_str}
+'''
+
+    manifest_path = repo_root / "packages" / "ruby" / "vendor" / "Cargo.toml"
+    manifest_path.parent.mkdir(parents=True, exist_ok=True)
+    with open(manifest_path, "w") as f:
+        f.write(vendor_toml)
+
+
+def main() -> None:
+    """Copy the core crates into packages/ruby/vendor and rewrite their manifests to drop workspace inheritance."""
+    repo_root: Path = get_repo_root()
+
+    print("=== Vendoring kreuzberg core crate ===")
+
+    workspace_deps: dict[str, object] = get_workspace_deps(repo_root)
+    core_version: str = get_workspace_version(repo_root)
+
+    print(f"Core version: {core_version}")
+    print(f"Workspace dependencies: {len(workspace_deps)}")
+
+    vendor_base: Path = repo_root / "packages" / "ruby" / "vendor"
+
+    # Clean only crate directories, preserving vendor/bundle/ (Bundler gems)
+    crate_names = ["kreuzberg", "kreuzberg-ffi", "kreuzberg-tesseract",
+                   "kreuzberg-paddle-ocr", "rb-sys"]
+    for name in crate_names:
+        crate_path = vendor_base / name
+        if crate_path.exists():
+            shutil.rmtree(crate_path)
+    # Also clean the vendor Cargo.toml (will be regenerated)
+    vendor_cargo = vendor_base / "Cargo.toml"
+    if vendor_cargo.exists():
+        vendor_cargo.unlink()
+    print("Cleaned vendor crate directories")
+
+    vendor_base.mkdir(parents=True, exist_ok=True)
+
+    crates_to_copy: list[tuple[str, str]] = [
+        ("crates/kreuzberg", "kreuzberg"),
+        ("crates/kreuzberg-ffi", "kreuzberg-ffi"),
+        ("crates/kreuzberg-tesseract", "kreuzberg-tesseract"),
+        ("crates/kreuzberg-paddle-ocr", "kreuzberg-paddle-ocr"),
+        ("vendor/rb-sys", "rb-sys"),
+    ]
+    # A missing source or a failed copy only warns, so copied_crates may end up a subset of the list above.
+    copied_crates: list[str] = []
+    for src_rel, dest_name in crates_to_copy:
+        src: Path = repo_root / src_rel
+        dest: Path = vendor_base / dest_name
+        if src.exists():
+            try:
+                shutil.copytree(src, dest)
+                copied_crates.append(dest_name)
+                print(f"Copied {dest_name}")
+            except Exception as e:
+                print(f"Warning: Failed to copy {dest_name}: {e}", file=sys.stderr)
+        else:
+            print(f"Warning: Source directory not found: {src_rel}")
+    # Strip top-level build/cache directories and editor/backup files from the copied trees.
+    artifact_dirs: list[str] = [".fastembed_cache", "target"]
+    temp_patterns: list[str] = ["*.swp", "*.bak", "*.tmp", "*~"]
+
+    for crate_dir in copied_crates:
+        crate_path: Path = vendor_base / crate_dir
+        if crate_path.exists():
+            for artifact_dir in artifact_dirs:
+                artifact: Path = crate_path / artifact_dir
+                if artifact.exists():
+                    shutil.rmtree(artifact)
+
+            for pattern in temp_patterns:
+                for f in crate_path.rglob(pattern):
+                    f.unlink()
+
+    print("Cleaned build artifacts")
+
+    # Update workspace inheritance in Cargo.toml files
+    for crate_dir in copied_crates:
+        crate_toml = vendor_base / crate_dir / "Cargo.toml"
+        if crate_toml.exists():
+            with open(crate_toml, "r") as f:
+                content = f.read()
+            # NOTE(review): edition, rust-version, authors and license are hard-coded below, not read from the root manifest.
+            content = re.sub(r'^version\.workspace = true$', f'version = "{core_version}"', content, flags=re.MULTILINE)
+            content = re.sub(r'^edition\.workspace = true$', 'edition = "2024"', content, flags=re.MULTILINE)
+            content = re.sub(r'^rust-version\.workspace = true$', 'rust-version = "1.91"', content, flags=re.MULTILINE)
+            content = re.sub(r'^authors\.workspace = true$', 'authors = ["Na\'aman Hirschfeld <naaman@kreuzberg.dev>"]', content, flags=re.MULTILINE)
+            content = re.sub(r'^license\.workspace = true$', 'license = "MIT"', content, flags=re.MULTILINE)
+
+            with open(crate_toml, "w") as f:
+                f.write(content)
+
+            replace_workspace_deps_in_toml(crate_toml, workspace_deps)
+            print(f"Updated {crate_dir}/Cargo.toml")
+
+    # Update path dependencies in kreuzberg-ffi crate
+    if "kreuzberg-ffi" in copied_crates and "kreuzberg" in copied_crates:
+        ffi_toml = vendor_base / "kreuzberg-ffi" / "Cargo.toml"
+        if ffi_toml.exists():
+            with open(ffi_toml, "r") as f:
+                content = f.read()
+
+            # Replace kreuzberg workspace references with path dependency
+            # Handle cases with path, version, or neither
+            content = re.sub(
+                r'(kreuzberg = \{) (?:(?:path|version) = "[^"]*", )?',
+                r'\1 path = "../kreuzberg", ',
+                content
+            )
+
+            with open(ffi_toml, "w") as f:
+                f.write(content)
+
+    # Update path dependencies in kreuzberg crate if tesseract was copied
+    if "kreuzberg" in copied_crates:
+        kreuzberg_toml = vendor_base / "kreuzberg" / "Cargo.toml"
+        if kreuzberg_toml.exists():
+            with open(kreuzberg_toml, "r") as f:
+                content = f.read()
+
+            # Only update tesseract path if it was actually copied
+            if "kreuzberg-tesseract" in copied_crates:
+                content = re.sub(
+                    r'kreuzberg-tesseract = \{ (?:path = "[^"]*", )?version = "[^"]*", optional = true \}',
+                    'kreuzberg-tesseract = { path = "../kreuzberg-tesseract", optional = true }',
+                    content
+                )
+            # Only update paddle-ocr path if it was actually copied
+            if "kreuzberg-paddle-ocr" in copied_crates:
+                content = re.sub(
+                    r'kreuzberg-paddle-ocr = \{ (?:path = "[^"]*", )?version = "[^"]*", optional = true \}',
+                    'kreuzberg-paddle-ocr = { path = "../kreuzberg-paddle-ocr", optional = true }',
+                    content
+                )
+
+            with open(kreuzberg_toml, "w") as f:
+                f.write(content)
+
+    generate_vendor_cargo_toml(repo_root, workspace_deps, core_version, copied_crates)
+    print("Generated vendor/Cargo.toml")
+
+    # Update native extension Cargo.toml to use vendored crates
+    native_toml = repo_root / "packages" / "ruby" / "ext" / "kreuzberg_rb" / "native" / "Cargo.toml"
+    if native_toml.exists():
+        with open(native_toml, "r") as f:
+            content = f.read()
+
+        # Replace path dependencies to point to vendored crates
+        # From: path = "../../../../../crates/kreuzberg"
+        # To: path = "../../../vendor/kreuzberg"
+        content = re.sub(
+            r'path = "\.\./\.\./\.\./\.\./\.\./crates/kreuzberg"',
+            'path = "../../../vendor/kreuzberg"',
+            content
+        )
+        content = re.sub(
+            r'path = "\.\./\.\./\.\./\.\./\.\./crates/kreuzberg-ffi"',
+            'path = "../../../vendor/kreuzberg-ffi"',
+            content
+        )
+
+        with open(native_toml, "w") as f:
+            f.write(content)
+
+        print("Updated native extension Cargo.toml to use vendored crates")
+
+    print(f"\nVendoring complete (core version: {core_version})")
+    print(f"Copied crates: {', '.join(sorted(copied_crates))}")
+    # The summary below only prints fixed text; it does not re-read the native manifest.
+    if "kreuzberg" in copied_crates and "kreuzberg-ffi" in copied_crates:
+        print("Native extension Cargo.toml uses:")
+        print("  - path '../../../vendor/kreuzberg' for kreuzberg crate")
+        print("  - path '../../../vendor/kreuzberg-ffi' for kreuzberg-ffi crate")
+        if "rb-sys" in copied_crates:
+            print("  - path '../../../vendor/rb-sys' for rb-sys crate")
+        else:
+            print("  - rb-sys from crates.io")
+    else:
+        print("Warning: Some required crates were not copied. Check for missing source directories.")
+
+
+if __name__ == "__main__":
+    try:
+        main()
+    except Exception as e:  # any failure: one-line message on stderr, exit status 1
+        print(f"Error: {e}", file=sys.stderr)
+        sys.exit(1)
--- a/scripts/ci/rust/package-cli-windows.ps1
+++ b/scripts/ci/rust/package-cli-windows.ps1
@@ -0,0 +1,19 @@
+#!/usr/bin/env pwsh
+# Zip the release CLI binary for one Rust target (Windows)
+# Used by: ci-rust.yaml - Package CLI (Windows) step
+# Arguments: TARGET (e.g., x86_64-pc-windows-msvc)
+
+param(
+    [Parameter(Mandatory = $true)]
+    [string]$Target
+)
+
+Set-StrictMode -Version Latest
+$ErrorActionPreference = 'Stop'
+
+$archiveName = "kreuzberg-cli-$Target.zip"
+Write-Host "=== Packaging CLI binary for $Target ==="
+# The archive is written three levels above the release directory entered below.
+Set-Location -Path "target/$Target/release"
+Compress-Archive -Path 'kreuzberg.exe' -DestinationPath "../../../$archiveName"
+Write-Host "Packaging complete: $archiveName"
--- a/scripts/ci/rust/run-unit-tests.sh
+++ b/scripts/ci/rust/run-unit-tests.sh
@@ -0,0 +1,103 @@
+#!/usr/bin/env bash
+# Run the Rust unit tests: the kreuzberg crate first, then the rest of the workspace.
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+REPO_ROOT="${REPO_ROOT:-$(cd "$SCRIPT_DIR/../../.." && pwd)}"
+
+source "$REPO_ROOT/scripts/lib/common.sh"
+source "$REPO_ROOT/scripts/lib/tessdata.sh"
+
+validate_repo_root "$REPO_ROOT" || exit 1
+
+cd "$REPO_ROOT"
+
+echo "=== Running Rust unit tests ==="
+# setup_tessdata comes from scripts/lib/tessdata.sh (not visible here); presumably it sets TESSDATA_PREFIX - TODO confirm.
+setup_tessdata
+
+echo "Test environment configuration:"
+echo "  TESSDATA_PREFIX: ${TESSDATA_PREFIX:-not set}"
+echo "  RUST_BACKTRACE: ${RUST_BACKTRACE:-not set}"
+echo "  CARGO_TERM_COLOR: ${CARGO_TERM_COLOR:-not set}"
+
+echo "Workspace information:"
+echo "  Repository: $REPO_ROOT"
+echo "  Excluded packages: kreuzberg-e2e-generator, kreuzberg-py, kreuzberg-node (+ benchmark-harness on Windows)"
+# NOTE(review): TESSDATA_PREFIX is expanded unguarded below; under `set -u` the script aborts if it is still unset here.
+if [ ! -d "$TESSDATA_PREFIX" ]; then
+  echo "WARNING: TESSDATA_PREFIX directory not found: $TESSDATA_PREFIX"
+  echo "Attempting to create it..."
+  mkdir -p "$TESSDATA_PREFIX"
+  ensure_tessdata "$TESSDATA_PREFIX"
+fi
+# Report the size of each expected file (`stat -f%z` is tried first, then `stat -c%s`); a missing file only warns.
+echo "Verifying Tesseract data files..."
+for lang in eng osd; do
+  langfile="$TESSDATA_PREFIX/${lang}.traineddata"
+  if [ -f "$langfile" ]; then
+    size=$(stat -f%z "$langfile" 2>/dev/null || stat -c%s "$langfile" 2>/dev/null || echo "unknown")
+    echo "  ✓ ${lang}.traineddata (${size} bytes)"
+  else
+    echo "  WARNING: Missing ${lang}.traineddata"
+  fi
+done
+# When KREUZBERG_PDFIUM_PREBUILT is set, prepend its lib/ directory to the library search path variables.
+if [ -n "${KREUZBERG_PDFIUM_PREBUILT:-}" ]; then
+  export LD_LIBRARY_PATH="${KREUZBERG_PDFIUM_PREBUILT}/lib:${LD_LIBRARY_PATH:-}"
+  export DYLD_LIBRARY_PATH="${KREUZBERG_PDFIUM_PREBUILT}/lib:${DYLD_LIBRARY_PATH:-}"
+  export DYLD_FALLBACK_LIBRARY_PATH="${KREUZBERG_PDFIUM_PREBUILT}/lib:${DYLD_FALLBACK_LIBRARY_PATH:-}"
+  echo "Library path configuration:"
+  echo "  LD_LIBRARY_PATH: $LD_LIBRARY_PATH"
+  echo "  DYLD_LIBRARY_PATH: $DYLD_LIBRARY_PATH"
+  echo "  DYLD_FALLBACK_LIBRARY_PATH: $DYLD_FALLBACK_LIBRARY_PATH"
+fi
+
+echo "=== Starting cargo test ==="
+
+# NOTE: `kreuzberg` is tested with `--features full`; `--all-features` is intentionally avoided for that crate.
+TEST_LOG="$(mktemp "${TMPDIR:-/tmp}/cargo-test.XXXXXX")"
+# `set -e` is ignored inside an `if` condition, so the first cargo run must fail
+# this function explicitly; otherwise a passing workspace run would mask it.
+run_cargo_tests() {
+  # `--all-targets` runs --lib --bins --tests --examples --benches but excludes
+  # `--doc`. 22 rustdoc examples in the kreuzberg crate currently reference
+  # private items (extraction::capacity::estimate_content_capacity et al.) and
+  # fail to compile. Tracking the cleanup separately; doc-test coverage is not
+  # on the v5.0.0 publish path. TODO: re-enable doc tests once the failing
+  # examples are rewritten against the public API.
+  echo "=== cargo test -p kreuzberg --features full ==="
+  RUST_BACKTRACE=full cargo test -p kreuzberg --features full --all-targets --verbose || return 1
+  echo "=== cargo test --workspace (all features, excluding kreuzberg) ==="
+  local extra_excludes=()
+  if [[ "$OSTYPE" == "msys" || "$OSTYPE" == "cygwin" || "$OSTYPE" == "win32" ]]; then
+    extra_excludes+=(--exclude benchmark-harness)
+  fi
+  RUST_BACKTRACE=full cargo test \
+    --workspace \
+    --exclude kreuzberg \
+    --exclude kreuzberg-e2e-generator \
+    --exclude kreuzberg-py \
+    --exclude kreuzberg-node \
+    ${extra_excludes[@]+"${extra_excludes[@]}"} \
+    --all-features \
+    --all-targets \
+    --verbose
+}
+if ! run_cargo_tests 2>&1 | tee "$TEST_LOG"; then
+  echo "=== Test execution failed ==="
+  echo "Last 50 lines of test output:"
+  tail -n 50 "$TEST_LOG"
+  echo ""
+  echo "Collecting diagnostic information..."
+  echo "Disk space:"
+  df -h . || du -h . 2>/dev/null | head -1
+  echo "Cargo environment:"
+  cargo --version
+  rustc --version
+  rm -f "$TEST_LOG"
+  exit 1
+fi
+rm -f "$TEST_LOG"
+
+echo "=== Tests complete ==="
--- a/scripts/ci/validate/show-disk-space.sh
+++ b/scripts/ci/validate/show-disk-space.sh
@@ -0,0 +1,9 @@
+#!/usr/bin/env bash
+set -euo pipefail
+label="${1:-Disk space}" # optional heading; all output of this script goes to stderr
+{
+  echo "=== ${label} ==="
+  df -h /
+  echo "Disk info:"
+} >&2
+df -B1 / | tail -1 >&2 || true
--- a/scripts/install-php-ext.sh
+++ b/scripts/install-php-ext.sh
@@ -0,0 +1,32 @@
+#!/bin/bash
+set -e
+
+# Install the kreuzberg PHP extension to the system PHP extension directory
+# Called from the before hook in alef.toml for PHP e2e tests
+
+EXTENSION_DIR=$(php -r 'echo ini_get("extension_dir");')
+EXT_PATH="" # start empty so a value inherited from the environment is never used
+# Find the built extension (first existing candidate wins)
+for path in target/release/libkreuzberg_php.dylib target/release/libkreuzberg_php.so target/release/kreuzberg_php.dll; do
+  if [ -f "$path" ]; then
+    EXT_PATH="$path"
+    break
+  fi
+done
+if [ -z "$EXT_PATH" ]; then
+  echo "Error: PHP extension not found in target/release/" >&2
+  exit 1
+fi
+# Copy to extension directory
+EXT_FILENAME=$(basename "$EXT_PATH")
+cp "$EXT_PATH" "$EXTENSION_DIR/$EXT_FILENAME"
+# Add to php.ini if not already present; php_ini_loaded_file() prints nothing when no php.ini is loaded
+PHP_INI=$(php -r 'echo php_ini_loaded_file();')
+if [ -z "$PHP_INI" ]; then
+  echo "Error: no php.ini is loaded; cannot register $EXT_FILENAME" >&2
+  exit 1
+fi
+if ! grep -qF "extension=$EXT_FILENAME" "$PHP_INI"; then # -F: literal match, dots are not wildcards
+  echo "extension=$EXT_FILENAME" >>"$PHP_INI"
+fi
+echo "Installed PHP extension: $EXT_FILENAME to $EXTENSION_DIR"
--- a/scripts/install.sh
+++ b/scripts/install.sh
@@ -0,0 +1,178 @@
+#!/usr/bin/env bash
+# Kreuzberg CLI installer
+# Usage: curl -fsSL https://kreuzberg.dev/install.sh | bash
+#
+# Environment variables:
+#   KREUZBERG_VERSION  - Specific version to install (default: latest)
+#   KREUZBERG_INSTALL  - Installation directory (default: ~/.kreuzberg/bin or /usr/local/bin)
+
+set -euo pipefail
+
+REPO="kreuzberg-dev/kreuzberg"
+BINARY_NAME="kreuzberg"
+
+# --- Helpers ---
+
+info() { printf '%b%s%b\n' '\033[1;34m' "$*" '\033[0m'; }     # stdout
+warn() { printf '%b%s%b\n' '\033[1;33m' "$*" '\033[0m' >&2; } # stderr
+error() { # "error: <message>" on stderr, then exit with status 1
+  printf '%berror: %s%b\n' '\033[1;31m' "$*" '\033[0m' >&2
+  exit 1
+}
+
+need_cmd() {
+  # Abort via error() unless the command named by $1 can be resolved.
+  command -v "$1" >/dev/null 2>&1 && return 0
+  error "need '$1' (command not found)"
+}
+
+# --- Detect platform ---
+
+detect_os() {
+  # Print "linux" or "darwin" for the running kernel; abort on anything else.
+  local kernel
+  kernel="$(uname -s)"
+
+  if [[ "$kernel" == Linux* ]]; then echo "linux"; return 0; fi
+  if [[ "$kernel" == Darwin* ]]; then echo "darwin"; return 0; fi
+  error "unsupported OS: $kernel"
+}
+
+detect_arch() {
+  # Normalise `uname -m` to x86_64 or aarch64; abort on anything else.
+  local machine
+  machine="$(uname -m)"
+
+  if [[ "$machine" == "x86_64" || "$machine" == "amd64" ]]; then echo "x86_64"; return 0; fi
+  if [[ "$machine" == "aarch64" || "$machine" == "arm64" ]]; then echo "aarch64"; return 0; fi
+  error "unsupported architecture: $machine"
+}
+
+detect_target() {
+  # Print the release target triple for this host, from detect_os and detect_arch.
+  # NOTE(review): x86_64 macOS is mapped to the arm64 build - confirm this is intended for Intel Macs.
+  local os_name cpu
+  os_name="$(detect_os)"
+  cpu="$(detect_arch)"
+  case "${os_name}-${cpu}" in
+  linux-aarch64) echo "aarch64-unknown-linux-musl" ;;
+  linux-x86_64) echo "x86_64-unknown-linux-musl" ;;
+  darwin-aarch64 | darwin-x86_64) echo "aarch64-apple-darwin" ;;
+  *) error "unsupported platform: ${os_name}-${cpu}" ;;
+  esac
+}
+
+# --- Version resolution ---
+
+get_latest_version() {
+  # Print the first tag starting with "v" in the listing of up to 20 releases
+  # (other tags, e.g. benchmark runs, are skipped); abort when there is none.
+  need_cmd curl
+  local releases_api="https://api.github.com/repos/${REPO}/releases?per_page=20"
+  local first_tag
+  first_tag="$(curl -fsSL "$releases_api" |
+    grep '"tag_name"' |
+    sed 's/.*"tag_name":[[:space:]]*"\([^"]*\)".*/\1/' |
+    grep '^v' | head -1 || true)"
+  [ -n "$first_tag" ] || error "failed to fetch latest release tag from GitHub"
+  echo "$first_tag"
+}
+
+# --- Download and install ---
+
+install() {
+  need_cmd curl
+  need_cmd tar
+  # Download the release archive for this host and copy its contents into the install directory.
+  local os arch target version install_dir
+  # NOTE(review): os and arch are assigned but not read below; detect_target calls both helpers itself.
+  os="$(detect_os)"
+  arch="$(detect_arch)"
+  target="$(detect_target)"
+
+  if [ -n "${KREUZBERG_VERSION:-}" ]; then
+    version="${KREUZBERG_VERSION}"
+    # Ensure 'v' prefix
+    case "$version" in
+    v*) ;;
+    *) version="v${version}" ;;
+    esac
+  else
+    info "Fetching latest release..."
+    version="$(get_latest_version)"
+  fi
+
+  info "Installing kreuzberg ${version} for ${target}"
+
+  # Determine install directory
+  if [ -n "${KREUZBERG_INSTALL:-}" ]; then
+    install_dir="${KREUZBERG_INSTALL}"
+  elif [ "$(id -u)" -eq 0 ]; then
+    install_dir="/usr/local/bin"
+  else
+    install_dir="${HOME}/.kreuzberg/bin"
+  fi
+
+  mkdir -p "$install_dir"
+
+  # Download
+  local artifact="kreuzberg-cli-${target}.tar.gz"
+  local url="https://github.com/${REPO}/releases/download/${version}/${artifact}"
+
+  info "Downloading ${url}"
+  # tmpdir is global (not in the `local` list above), so it is still set when the EXIT trap runs at script exit.
+  tmpdir="$(mktemp -d)"
+  trap 'rm -rf "$tmpdir"' EXIT
+
+  curl -fsSL "$url" -o "${tmpdir}/${artifact}"
+
+  # Extract
+  tar -xzf "${tmpdir}/${artifact}" -C "$tmpdir"
+
+  # Install binary
+  local stage_dir="${tmpdir}/kreuzberg-cli-${target}"
+  local binary_path="${stage_dir}/${BINARY_NAME}"
+  if [ ! -f "$binary_path" ]; then
+    error "binary not found in archive at ${binary_path}"
+  fi
+
+  cp "$binary_path" "${install_dir}/${BINARY_NAME}"
+  chmod +x "${install_dir}/${BINARY_NAME}"
+
+  # Install the actual binary (musl builds use wrapper + .bin)
+  if [ -f "${stage_dir}/${BINARY_NAME}.bin" ]; then
+    cp "${stage_dir}/${BINARY_NAME}.bin" "${install_dir}/${BINARY_NAME}.bin"
+    chmod +x "${install_dir}/${BINARY_NAME}.bin"
+  fi
+
+  # Install bundled runtime libraries (musl builds only)
+  if [ -d "${stage_dir}/lib" ] && [ "$(ls -A "${stage_dir}/lib" 2>/dev/null)" ]; then
+    mkdir -p "${install_dir}/lib"
+    cp "${stage_dir}/lib/"* "${install_dir}/lib/"
+    info "Installed runtime libraries to ${install_dir}/lib/"
+  fi
+
+  info "Installed ${BINARY_NAME} to ${install_dir}/${BINARY_NAME}"
+
+  # Verify
+  if "${install_dir}/${BINARY_NAME}" --version >/dev/null 2>&1; then
+    info "Verified: $("${install_dir}/${BINARY_NAME}" --version)"
+  else
+    warn "Binary installed but --version check failed"
+  fi
+
+  # PATH hint
+  case ":${PATH}:" in
+  *":${install_dir}:"*) ;;
+  *)
+    warn ""
+    warn "Add ${install_dir} to your PATH:"
+    warn ""
+    warn "  export PATH=\"${install_dir}:\$PATH\""
+    warn ""
+    warn "Add this to your shell profile (~/.bashrc, ~/.zshrc, etc.) to make it permanent."
+    ;;
+  esac
+}
+
+install
--- a/scripts/lib/common.sh
+++ b/scripts/lib/common.sh
@@ -0,0 +1,70 @@
+#!/usr/bin/env bash
+
+set -euo pipefail
+
+get_repo_root() {
+  # Print the nearest ancestor of $PWD (inclusive) that holds a Cargo.toml.
+  # "/" itself is never tested. Returns 1 with a message on stderr if none.
+  local origin probe
+  origin="$(pwd)"
+  probe="$origin"
+  until [ "$probe" = "/" ]; do
+    if [ -f "$probe/Cargo.toml" ]; then
+      echo "$probe"
+      return 0
+    fi
+    probe="$(dirname "$probe")"
+  done
+  echo "Error: Could not find repository root (Cargo.toml) from: $origin" >&2
+  return 1
+}
+
+validate_repo_root() {
+  # Succeed only when the candidate root ($1, else $REPO_ROOT) is non-empty
+  # and contains a Cargo.toml; otherwise explain on stderr and return 1.
+  local candidate="${1:-${REPO_ROOT:-}}"
+  if [ -n "$candidate" ]; then
+    if [ -f "$candidate/Cargo.toml" ]; then
+      return 0
+    fi
+    echo "Error: REPO_ROOT validation failed. Expected Cargo.toml at: $candidate/Cargo.toml" >&2
+    echo "REPO_ROOT resolved to: $candidate" >&2
+    return 1
+  fi
+
+  echo "Error: REPO_ROOT not provided and env var not set" >&2
+  return 1
+}
+
+error_exit() {
+  # Print "Error: <$1>" on stderr (default "Unknown error") and exit with
+  # status $2 (default 1).
+  printf 'Error: %s\n' "${1:-Unknown error}" >&2
+  exit "${2:-1}"
+}
+
+get_platform() {
+  # Print the platform name: $RUNNER_OS verbatim when it is set and
+  # non-empty, otherwise Linux, macOS, Windows or unknown, derived from
+  # `uname -s`.
+  if [ -n "${RUNNER_OS:-}" ]; then
+    echo "$RUNNER_OS"
+    return 0
+  fi
+
+  local platform_name
+  # Prefix matches on the kernel name; anything unrecognised is "unknown".
+  case "$(uname -s)" in
+  Linux*) platform_name="Linux" ;;
+  Darwin*) platform_name="macOS" ;;
+  MINGW* | MSYS* | CYGWIN*) platform_name="Windows" ;;
+  *) platform_name="unknown" ;;
+  esac
+
+  echo "$platform_name"
+}
+
+# Export the helper functions so that child bash processes started by the
+# sourcing script inherit them.
+export -f get_repo_root validate_repo_root \
+  error_exit get_platform
--- a/scripts/lib/library-paths.sh
+++ b/scripts/lib/library-paths.sh
@@ -0,0 +1,197 @@
+#!/usr/bin/env bash
+
+set -euo pipefail
+
+_get_path_separator() {
+  local platform="${1:-$(uname -s)}"
+  case "$platform" in
+  MINGW* | MSYS* | CYGWIN* | Windows)
+    echo ";"
+    ;;
+  *)
+    echo ":"
+    ;;
+  esac
+}
+
+setup_onnx_paths() {
+  local ort_lib="${ORT_LIB_LOCATION:-}"
+  [ -z "$ort_lib" ] && return 0
+
+  local platform="${RUNNER_OS:-$(uname -s)}"
+  case "$platform" in
+  Linux)
+    export LD_LIBRARY_PATH="${ort_lib}:${LD_LIBRARY_PATH:-}"
+    echo "✓ Set LD_LIBRARY_PATH for ONNX Runtime"
+    ;;
+  macOS | Darwin)
+    export DYLD_LIBRARY_PATH="${ort_lib}:${DYLD_LIBRARY_PATH:-}"
+    export DYLD_FALLBACK_LIBRARY_PATH="${ort_lib}:${DYLD_FALLBACK_LIBRARY_PATH:-}"
+    echo "✓ Set DYLD_LIBRARY_PATH for ONNX Runtime on macOS"
+    ;;
+  Windows | MINGW* | MSYS* | CYGWIN*)
+    export PATH="${ort_lib};${PATH:-}"
+    echo "✓ Set PATH for ONNX Runtime on Windows"
+    ;;
+  esac
+}
+
+setup_rust_ffi_paths() {
+  local repo_root="${1:-${REPO_ROOT:-}}"
+  [ -z "$repo_root" ] && return 0
+
+  local ffi_lib="$repo_root/target/release"
+  local ffi_lib_gnu="$repo_root/target/x86_64-pc-windows-gnu/release"
+
+  local platform="${RUNNER_OS:-$(uname -s)}"
+  case "$platform" in
+  Linux)
+    [ ! -d "$ffi_lib" ] && return 0
+    export LD_LIBRARY_PATH="${ffi_lib}:${LD_LIBRARY_PATH:-}"
+    echo "✓ Set LD_LIBRARY_PATH for Rust FFI"
+    ;;
+  macOS | Darwin)
+    [ ! -d "$ffi_lib" ] && return 0
+    export DYLD_LIBRARY_PATH="${ffi_lib}:${DYLD_LIBRARY_PATH:-}"
+    export DYLD_FALLBACK_LIBRARY_PATH="${ffi_lib}:${DYLD_FALLBACK_LIBRARY_PATH:-}"
+    echo "✓ Set DYLD_LIBRARY_PATH for Rust FFI on macOS"
+    ;;
+  Windows | MINGW* | MSYS* | CYGWIN*)
+    # Check for short path CI directories first
+    local cargo_target="${CARGO_TARGET_DIR:-}"
+    if [ -n "$cargo_target" ] && [ -d "$cargo_target/release" ]; then
+      export PATH="${cargo_target}/release;${PATH:-}"
+      echo "✓ Set PATH for Rust FFI (using CARGO_TARGET_DIR=$cargo_target)"
+    fi
+    # Add GNU target path if it exists
+    if [ -d "$ffi_lib_gnu" ]; then
+      export PATH="${ffi_lib_gnu};${PATH:-}"
+      echo "✓ Set PATH for Rust FFI GNU target"
+    fi
+    # Add standard target path if it exists
+    if [ -d "$ffi_lib" ]; then
+      export PATH="${ffi_lib};${PATH:-}"
+      echo "✓ Set PATH for Rust FFI on Windows"
+    fi
+    ;;
+  esac
+}
+
+verify_pkg_config() {
+  if pkg-config --exists kreuzberg-ffi 2>/dev/null; then
+    return 0
+  else
+    {
+      echo "Error: pkg-config cannot find kreuzberg-ffi"
+      echo "PKG_CONFIG_PATH=${PKG_CONFIG_PATH:-<not set>}"
+      echo "Run 'pkg-config --list-all' to see available packages"
+    } >&2
+    return 1
+  fi
+}
+
+setup_go_paths_windows() {
+  local repo_root="${1:-${REPO_ROOT:-}}"
+  [ -z "$repo_root" ] && return 0
+
+  local gnu_target="${repo_root}/target/x86_64-pc-windows-gnu/release"
+  local release_target="${repo_root}/target/release"
+
+  export PKG_CONFIG_PATH="${repo_root}/crates/kreuzberg-ffi:${PKG_CONFIG_PATH:-}"
+
+  export PATH="${gnu_target};${release_target};${PATH:-}"
+
+  export CGO_ENABLED=1
+  export CGO_CFLAGS="-I${repo_root}/crates/kreuzberg-ffi/include"
+  export CGO_LDFLAGS="-L${gnu_target} -L${release_target} -lkreuzberg_ffi -static-libgcc -static-libstdc++"
+
+  echo "✓ Configured Go cgo environment for Windows"
+}
+
+# NOTE: CGO_LDFLAGS is set by setup-go-cgo-env action on Windows in CI, or by this script on Unix
+setup_go_paths() {
+  local repo_root="${1:-${REPO_ROOT:-}}"
+  [ -z "$repo_root" ] && return 0
+
+  local pc_path="${repo_root}/crates/kreuzberg-ffi/kreuzberg-ffi.pc"
+  if [ ! -f "$pc_path" ]; then
+    local version=""
+    version="$(sed -n 's/^version = \"\\(.*\\)\"/\\1/p' "${repo_root}/Cargo.toml" | head -n 1 || true)"
+    [ -z "$version" ] && version="unknown"
+
+    local platform="${RUNNER_OS:-$(uname -s)}"
+    local libs_private=""
+    case "$platform" in
+    Linux)
+      libs_private="-lpthread -ldl -lm"
+      ;;
+    macOS | Darwin)
+      libs_private="-framework CoreFoundation -framework Security -lpthread"
+      ;;
+    Windows | MINGW* | MSYS* | CYGWIN*)
+      libs_private="-lws2_32 -luserenv -lbcrypt"
+      ;;
+    esac
+
+    mkdir -p "$(dirname "$pc_path")"
+    cat >"$pc_path" <<EOF
+prefix=${repo_root}
+exec_prefix=\${prefix}
+libdir=${repo_root}/target/release
+includedir=${repo_root}/crates/kreuzberg-ffi
+
+Name: kreuzberg-ffi
+Description: C FFI bindings for Kreuzberg document intelligence library
+Version: ${version}
+URL: https://kreuzberg.dev
+Libs: -L\${libdir} -lkreuzberg_ffi
+Libs.private: ${libs_private}
+Cflags: -I\${includedir}
+EOF
+  fi
+
+  export PKG_CONFIG_PATH="${repo_root}/crates/kreuzberg-ffi:${PKG_CONFIG_PATH:-}"
+
+  export CGO_ENABLED=1
+  export CGO_CFLAGS="-I${repo_root}/crates/kreuzberg-ffi/include"
+
+  local platform="${RUNNER_OS:-$(uname -s)}"
+  case "$platform" in
+  Linux)
+    export LD_LIBRARY_PATH="${repo_root}/target/release:${LD_LIBRARY_PATH:-}"
+    export CGO_LDFLAGS="-L${repo_root}/target/release -lkreuzberg_ffi -Wl,-rpath,${repo_root}/target/release"
+    ;;
+  macOS | Darwin)
+    export DYLD_LIBRARY_PATH="${repo_root}/target/release:${DYLD_LIBRARY_PATH:-}"
+    export DYLD_FALLBACK_LIBRARY_PATH="${repo_root}/target/release:${DYLD_FALLBACK_LIBRARY_PATH:-}"
+    export CGO_LDFLAGS="-L${repo_root}/target/release -lkreuzberg_ffi -Wl,-rpath,${repo_root}/target/release"
+    ;;
+  Windows | MINGW* | MSYS* | CYGWIN*)
+    if [ -z "${CGO_LDFLAGS:-}" ] && [ -z "${GITHUB_ENV:-}" ]; then
+      # Only set library search path; ffi.go CGO directives handle -l flags
+      # This matches the approach in setup-go-cgo-env/windows.ps1
+      export CGO_LDFLAGS="-L${repo_root}/target/x86_64-pc-windows-gnu/release -L${repo_root}/target/release"
+    fi
+    ;;
+  esac
+
+  echo "✓ Configured Go cgo environment"
+}
+
+setup_all_library_paths() {
+  local repo_root="${1:-${REPO_ROOT:-}}"
+
+  echo "Setting up library paths..."
+  setup_onnx_paths
+  setup_rust_ffi_paths "$repo_root"
+  setup_go_paths "$repo_root"
+  echo "✓ All library paths configured"
+}
+
+export -f setup_onnx_paths
+export -f setup_rust_ffi_paths
+export -f verify_pkg_config
+export -f setup_go_paths_windows
+export -f setup_go_paths
+export -f setup_all_library_paths
+export -f _get_path_separator
--- a/scripts/lib/retry.sh
+++ b/scripts/lib/retry.sh
@@ -0,0 +1,85 @@
+#!/usr/bin/env bash
+
+set -euo pipefail
+
+run_with_timeout() {
+  local seconds="$1"
+  shift
+
+  if command -v timeout >/dev/null 2>&1; then
+    timeout "${seconds}" "$@"
+    return $?
+  fi
+  if command -v gtimeout >/dev/null 2>&1; then
+    gtimeout "${seconds}" "$@"
+    return $?
+  fi
+
+  if command -v python3 >/dev/null 2>&1; then
+    python3 - "$seconds" "$@" <<'PY'
+import subprocess
+import sys
+
+timeout_s = int(sys.argv[1])
+cmd = sys.argv[2:]
+try:
+    completed = subprocess.run(cmd, timeout=timeout_s)
+    sys.exit(completed.returncode)
+except subprocess.TimeoutExpired:
+    sys.exit(124)
+PY
+    return $?
+  fi
+
+  "$@"
+}
+
+retry_with_backoff() {
+  local max_attempts=3
+  local attempt=1
+  local delay=5
+
+  while [ $attempt -le $max_attempts ]; do
+    if "$@"; then
+      return 0
+    fi
+
+    if [ $attempt -lt $max_attempts ]; then
+      echo "⚠ Attempt $attempt failed, retrying in ${delay}s..." >&2
+      sleep $delay
+      delay=$((delay * 2))
+    fi
+    attempt=$((attempt + 1))
+  done
+
+  return 1
+}
+
+retry_with_backoff_timeout() {
+  local seconds="$1"
+  shift
+  local max_attempts=3
+  local attempt=1
+  local delay=5
+  local exit_code=1
+
+  while [ $attempt -le $max_attempts ]; do
+    if run_with_timeout "$seconds" "$@"; then
+      return 0
+    else
+      exit_code=$?
+    fi
+    if [ $attempt -lt $max_attempts ]; then
+      echo "⚠ Attempt $attempt failed (exit $exit_code), retrying in ${delay}s..." >&2
+      sleep $delay
+      delay=$((delay * 2))
+    fi
+    attempt=$((attempt + 1))
+  done
+
+  return $exit_code
+}
+
+export -f run_with_timeout
+export -f retry_with_backoff
+export -f retry_with_backoff_timeout
--- a/scripts/lib/tessdata.sh
+++ b/scripts/lib/tessdata.sh
@@ -0,0 +1,157 @@
+#!/usr/bin/env bash
+
+set -euo pipefail
+
+# Print the size of a file in bytes.
+# Arguments: $1 - path
+# Outputs:   the byte count, or 0 when the path is not a regular file
+file_size_bytes() {
+  local target="$1"
+
+  if [ -f "$target" ]; then
+    # `stat -c%s` is tried first; when that invocation fails, `stat -f%z` is
+    # used instead.
+    stat -c%s "$target" 2>/dev/null || stat -f%z "$target"
+  else
+    echo 0
+  fi
+}
+
+# Print the smallest size, in bytes, at which a <lang>.traineddata file is
+# considered valid.
+# Arguments: $1 - language code
+# Outputs:   1000000 for eng and deu; 100000 for osd and any other language
+min_traineddata_size_bytes() {
+  case "$1" in
+  eng | deu) echo 1000000 ;;
+  *) echo 100000 ;;
+  esac
+}
+
+download_traineddata() {
+  local lang="$1"
+  local dest="$2"
+  local url="$3"
+  local tmp="${dest}.tmp"
+  local min_size
+  min_size="$(min_traineddata_size_bytes "$lang")"
+
+  rm -f "$tmp"
+
+  for attempt in 1 2 3 4 5; do
+    if curl -fsSL --retry 5 --retry-delay 5 --retry-all-errors "$url" -o "$tmp"; then
+      local size
+      size="$(file_size_bytes "$tmp")"
+      if [ "$size" -ge "$min_size" ]; then
+        mv -f "$tmp" "$dest"
+        return 0
+      fi
+      echo "Downloaded ${lang}.traineddata too small (${size} bytes < ${min_size}), retrying..." >&2
+    else
+      echo "Failed to download ${lang}.traineddata (attempt ${attempt}), retrying..." >&2
+    fi
+    rm -f "$tmp"
+    sleep "$attempt"
+  done
+
+  echo "ERROR: Failed to download valid ${lang}.traineddata after retries" >&2
+  return 1
+}
+
+# Make sure <dest_dir>/<lang>.traineddata exists and meets the minimum size,
+# downloading it when it is missing or too small.
+# Arguments: $1 - destination directory
+#            $2 - language code
+#            $3 - download URL
+# Outputs:   a notice on stderr when an undersized file is discarded
+# Returns:   0 when the file is already valid, otherwise the status of
+#            download_traineddata
+ensure_valid_traineddata() {
+  local dest_dir="$1"
+  local lang="$2"
+  local url="$3"
+  local target="${dest_dir}/${lang}.traineddata"
+  local floor current
+
+  floor="$(min_traineddata_size_bytes "$lang")"
+  current="$(file_size_bytes "$target")"
+
+  if [ "$current" -ge "$floor" ]; then
+    return 0
+  fi
+
+  # A file that exists but is too small is removed before re-downloading.
+  if [ -f "$target" ]; then
+    echo "Invalid ${lang}.traineddata at ${target} (${current} bytes < ${floor}); re-downloading..." >&2
+    rm -f "$target"
+  fi
+
+  download_traineddata "$lang" "$target" "$url"
+}
+
+# Populate a tessdata directory with eng/osd (and deu when available locally).
+# The first candidate system directory containing eng.traineddata is used as a
+# copy source; afterwards eng and osd are validated and downloaded if needed.
+# Arguments: $1 - destination directory (created if missing)
+# Globals:   PROGRAMFILES (read, only when cygpath is available)
+# Returns:   non-zero when eng or osd cannot be validated or downloaded
+ensure_tessdata() {
+  local dest="$1"
+  # Loop variables are local so this sourced, exported function does not
+  # overwrite `dir` or `lang` in the calling shell.
+  local dir lang dir_real
+  mkdir -p "$dest"
+  local dest_real
+  dest_real="$(cd "$dest" && pwd -P)"
+
+  local candidates=(
+    "/opt/homebrew/share/tessdata"
+    "/usr/local/opt/tesseract/share/tessdata"
+    "/usr/share/tesseract-ocr/5/tessdata"
+  )
+
+  if [ -n "${PROGRAMFILES:-}" ] && command -v cygpath >/dev/null 2>&1; then
+    candidates+=("$(cygpath -u "$PROGRAMFILES")/Tesseract-OCR/tessdata")
+  fi
+  if [ -d "/c/Program Files/Tesseract-OCR/tessdata" ]; then
+    candidates+=("/c/Program Files/Tesseract-OCR/tessdata")
+  fi
+
+  for dir in "${candidates[@]}"; do
+    if [ -f "$dir/eng.traineddata" ]; then
+      dir_real="$(cd "$dir" && pwd -P)"
+
+      # The destination is itself this candidate: nothing to copy.
+      if [ "$dir_real" = "$dest_real" ]; then
+        break
+      fi
+
+      for lang in eng osd deu; do
+        if [ -f "$dir/$lang.traineddata" ]; then
+          # Skip when source and destination are the same file (-ef).
+          if [ -f "$dest/$lang.traineddata" ] &&
+            [ "$dir_real/$lang.traineddata" -ef "$dest/$lang.traineddata" ]; then
+            continue
+          fi
+          cp -f "$dir/$lang.traineddata" "$dest/"
+        fi
+      done
+      # Only the first candidate that has eng.traineddata is used.
+      break
+    fi
+  done
+
+  # deu is copied when found above but is not validated or downloaded here.
+  ensure_valid_traineddata "$dest" "eng" "https://github.com/tesseract-ocr/tessdata_fast/raw/main/eng.traineddata"
+  ensure_valid_traineddata "$dest" "osd" "https://github.com/tesseract-ocr/tessdata_fast/raw/main/osd.traineddata"
+}
+
+# Choose a platform-specific TESSDATA_PREFIX, export it, and make sure the
+# directory contains valid traineddata via ensure_tessdata.
+# Globals: RUNNER_OS (read, falls back to `uname -s`); HOME, APPDATA,
+#          USERPROFILE, REPO_ROOT (read, depending on platform);
+#          TESSDATA_PREFIX (exported)
+# Outputs: status lines on stdout
+setup_tessdata() {
+  local platform="${RUNNER_OS:-$(uname -s)}"
+
+  case "$platform" in
+  Linux)
+    # Fixed system location; ensure_tessdata runs `mkdir -p` on it.
+    export TESSDATA_PREFIX="/usr/share/tesseract-ocr/5/tessdata"
+    ;;
+  macOS | Darwin)
+    # Use the first of two tesseract share directories that exists, else a
+    # per-user directory under ~/Library.
+    if [ -d "/opt/homebrew/opt/tesseract/share/tessdata" ]; then
+      export TESSDATA_PREFIX="/opt/homebrew/opt/tesseract/share/tessdata"
+    elif [ -d "/usr/local/opt/tesseract/share/tessdata" ]; then
+      export TESSDATA_PREFIX="/usr/local/opt/tesseract/share/tessdata"
+    else
+      export TESSDATA_PREFIX="$HOME/Library/Application Support/kreuzberg-tesseract/tessdata"
+    fi
+    ;;
+  Windows | MINGW* | MSYS* | CYGWIN*)
+    # APPDATA is preferred, USERPROFILE is the fallback; when neither is set
+    # the prefix starts with "/".
+    export TESSDATA_PREFIX="${APPDATA:-${USERPROFILE:-}}/kreuzberg-tesseract/tessdata"
+    ;;
+  *)
+    # Unknown platform: keep the data under the repository (or the current
+    # directory when REPO_ROOT is unset).
+    export TESSDATA_PREFIX="${REPO_ROOT:-$(pwd)}/target/tessdata"
+    ;;
+  esac
+
+  ensure_tessdata "$TESSDATA_PREFIX"
+
+  echo "✓ TESSDATA_PREFIX set to: $TESSDATA_PREFIX"
+  # The last test is the function's final command, so its result is the
+  # function's return status.
+  [ -f "$TESSDATA_PREFIX/eng.traineddata" ] && echo "✓ eng.traineddata available"
+  [ -f "$TESSDATA_PREFIX/osd.traineddata" ] && echo "✓ osd.traineddata available"
+}
+
+export -f ensure_tessdata
+export -f setup_tessdata
--- a/scripts/publish/check-docker-tag.sh
+++ b/scripts/publish/check-docker-tag.sh
@@ -0,0 +1,17 @@
+#!/usr/bin/env bash
+
+set -euo pipefail
+
+tag="${DOCKER_TAG:?DOCKER_TAG not set}"
+label="${SUMMARY_LABEL:-image}"
+
+exists=false
+if docker manifest inspect "$tag" >/dev/null 2>&1; then
+  exists=true
+fi
+
+echo "exists=$exists" >>"${GITHUB_OUTPUT:?GITHUB_OUTPUT not set}"
+
+if [ "$exists" = "true" ] && [ -n "${GITHUB_STEP_SUMMARY:-}" ]; then
+  echo "Docker tag $tag already exists; ${label} publish will be skipped." >>"$GITHUB_STEP_SUMMARY"
+fi
--- a/scripts/publish/docker/dry-run-summary.sh
+++ b/scripts/publish/docker/dry-run-summary.sh
@@ -0,0 +1,13 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+image="${IMAGE:-}"
+version="${VERSION:-}"
+tag_suffix="${TAG_SUFFIX:-}"
+
+if [ -z "$image" ] || [ -z "$version" ]; then
+  echo "Usage: set IMAGE and VERSION (optional TAG_SUFFIX) env vars" >&2
+  exit 2
+fi
+
+echo "Dry run requested; Docker image ${image}:${version}${tag_suffix} tested but not pushed." >>"$GITHUB_STEP_SUMMARY"
--- a/scripts/publish/update-homebrew-formula.sh
+++ b/scripts/publish/update-homebrew-formula.sh
@@ -0,0 +1,66 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# Update Formula/kreuzberg.rb in the homebrew-tap with the new tag's URL and
+# source-tarball SHA256. The bottle DSL is updated separately by the
+# `homebrew-merge-bottles@v1` action after bottles are built.
+#
+# Usage (env vars):
+#   TAG=v5.0.0-rc.2 VERSION=5.0.0-rc.2 \
+#   TAP_DIR=/path/to/homebrew-tap \
+#   ./update-homebrew-formula.sh
+#
+# Optional: DRY_RUN=true prints what would happen and exits without changes.
+# Exits non-zero when the formula is missing, the download or hashing fails, or
+# the formula header has no url/sha256 line to rewrite.
+
+tag="${TAG:?TAG is required (e.g. v5.0.0-rc.2)}"
+version="${VERSION:?VERSION is required (e.g. 5.0.0-rc.2)}"
+tap_dir="${TAP_DIR:?TAP_DIR is required (path to homebrew-tap checkout)}"
+dry_run="${DRY_RUN:-false}"
+
+formula="${tap_dir}/Formula/kreuzberg.rb"
+
+[[ -f "$formula" ]] || {
+  echo "Missing $formula" >&2
+  exit 1
+}
+
+tarball_url="https://github.com/kreuzberg-dev/kreuzberg/archive/${tag}.tar.gz"
+
+echo "Updating Homebrew formula for kreuzberg ${version} (tag ${tag})"
+
+if [[ "$dry_run" == "true" ]]; then
+  echo "[dry-run] target formula: $formula"
+  echo "[dry-run] would set url to: $tarball_url"
+  echo "[dry-run] would compute sha256 of source tarball and rewrite the formula"
+  echo "[dry-run] would leave bottle DSL untouched (handled by homebrew-merge-bottles)"
+  exit 0
+fi
+
+echo "Fetching source tarball SHA256 for ${tag}..."
+# With pipefail, a failed download fails the whole pipeline and aborts here.
+sha256=$(curl -fsSL "$tarball_url" | shasum -a 256 | awk '{print $1}')
+if [[ ! "$sha256" =~ ^[0-9a-f]{64}$ ]]; then
+  echo "Unexpected sha256 value for $tarball_url: '$sha256'" >&2
+  exit 1
+fi
+echo "  url:    $tarball_url"
+echo "  sha256: $sha256"
+
+# Update the top-level url + sha256 lines (the ones outside `bottle do ... end`).
+# Only the first `url "..."` and the first `sha256 "..."` line that appear
+# before the `bottle do` block are rewritten.
+python3 - "$formula" "$tarball_url" "$sha256" <<'PY'
+import re
+import sys
+
+formula_path, new_url, new_sha = sys.argv[1], sys.argv[2], sys.argv[3]
+with open(formula_path) as f:
+    text = f.read()
+
+# Split off the bottle block so the regex only touches the formula header.
+bottle_start = text.find("bottle do")
+if bottle_start == -1:
+    head, tail = text, ""
+else:
+    head, tail = text[:bottle_start], text[bottle_start:]
+
+
+def set_field(source, field, value):
+    """Replace the quoted value of the first `field "..."` line in source.
+
+    A callable replacement keeps `value` literal (no backslash or group
+    interpretation). Exits with an error when no such line exists, so a
+    formula that was not changed is never reported as updated.
+    """
+    pattern = rf'^(\s*{field}\s+)"[^"]*"'
+    updated, hits = re.subn(
+        pattern,
+        lambda m: f'{m.group(1)}"{value}"',
+        source,
+        count=1,
+        flags=re.MULTILINE,
+    )
+    if hits != 1:
+        sys.exit(f"No top-level {field} line found in {formula_path}")
+    return updated
+
+
+head = set_field(head, "url", new_url)
+head = set_field(head, "sha256", new_sha)
+
+with open(formula_path, "w") as f:
+    f.write(head + tail)
+PY
+
+echo "Updated $formula"
--- a/scripts/setup-php-ext-ini.sh
+++ b/scripts/setup-php-ext-ini.sh
@@ -0,0 +1,43 @@
+#!/bin/bash
+set -e
+
+# Setup temporary php.ini for e2e/php that loads the kreuzberg extension from target/release
+# Called from alef.toml before hook for PHP e2e tests
+# Must be run from e2e/php directory
+#
+# Exits non-zero when no built extension is found, or when the extension is not
+# present in PHP's extension directory after the copy step.
+
+EXT_DIR=$(php -r 'echo ini_get("extension_dir");')
+
+# Start from an empty value so a BUILT_EXT inherited from the environment is
+# never mistaken for a discovered build artifact.
+BUILT_EXT=""
+
+# Look for built extension (relative to e2e/php/)
+for path in ../../target/release/libkreuzberg_php.dylib ../../target/release/libkreuzberg_php.so ../../target/release/kreuzberg_php.dll; do
+  if [ -f "$path" ]; then
+    BUILT_EXT="$path"
+    break
+  fi
+done
+
+if [ -z "$BUILT_EXT" ]; then
+  echo "Error: kreuzberg PHP extension not found in target/release/" >&2
+  exit 1
+fi
+
+# Resolve to absolute path
+BUILT_EXT=$(cd "$(dirname "$BUILT_EXT")" && pwd)/$(basename "$BUILT_EXT")
+
+# Copy extension to extension directory
+BASENAME=$(basename "$BUILT_EXT")
+TARGET="$EXT_DIR/$BASENAME"
+cp "$BUILT_EXT" "$TARGET" 2>/dev/null || true # May fail if already exists, that's OK
+# A failed copy is tolerated only when a copy is already in place; otherwise the
+# php.ini written below would reference a file that does not exist.
+if [ ! -f "$TARGET" ]; then
+  echo "Error: could not copy extension to $TARGET and no existing copy was found" >&2
+  exit 1
+fi
+echo "Extension copied/verified: $TARGET"
+
+# Create php.ini in current directory (e2e/php) that loads the extension.
+# extension_dir is set explicitly so the ini works even when invoked with
+# PHP_INI_SCAN_DIR= (which is recommended in the e2e runner to skip stale
+# conf.d/*.ini entries left behind by sibling projects).
+cat >php.ini <<EOF
+; Temporary PHP INI for e2e tests — loads kreuzberg PHP extension from system extension directory
+[PHP]
+extension_dir=$EXT_DIR
+extension=$BASENAME
+EOF
+
+echo "Created php.ini that loads: $BASENAME"
--- a/scripts/setup-swift-bridge.sh
+++ b/scripts/setup-swift-bridge.sh
@@ -0,0 +1,40 @@
+#!/bin/bash
+# Setup Swift bridge files after cargo build
+
+set -e
+
+# Find the most recently built output directory
+OUT=$(find target/release/build -maxdepth 2 -type d -name out -path '*kreuzberg-swift-*' \
+  -exec stat -f '%m %N' {} + 2>/dev/null | sort -rn | head -1 | cut -d' ' -f2-)
+if [ -z "$OUT" ]; then
+  echo "ERROR: Could not find swift-bridge build output in target/release/build/"
+  exit 1
+fi
+
+echo "Using swift-bridge output from: $OUT"
+
+# Fix swift-bridge visibility: make 'var ptr' and 'var isOwned' properties public for internal type conversion
+fixVisibility() {
+    sed -e 's/^    var ptr: UnsafeMutableRawPointer$/    public var ptr: UnsafeMutableRawPointer/g' \
+        -e 's/^    var isOwned: Bool = true$/    public var isOwned: Bool = true/g'
+}
+
+# Ensure target directories exist
+mkdir -p packages/swift/Sources/RustBridgeC
+mkdir -p packages/swift/Sources/RustBridge
+
+# Copy C headers
+cat "$OUT/SwiftBridgeCore.h" "$OUT/kreuzberg-swift/kreuzberg-swift.h" \
+  >packages/swift/Sources/RustBridgeC/RustBridgeC.h
+
+# Copy Swift bridge files with import statement prepended
+{
+  printf 'import RustBridgeC\n'
+  cat "$OUT/SwiftBridgeCore.swift" | fixVisibility
+} >packages/swift/Sources/RustBridge/SwiftBridgeCore.swift
+{
+  printf 'import RustBridgeC\n'
+  cat "$OUT/kreuzberg-swift/kreuzberg-swift.swift" | fixVisibility
+} >packages/swift/Sources/RustBridge/kreuzberg-swift.swift
+
+echo "Swift-bridge files setup complete"
--- a/scripts/stage_csharp_native_local.sh
+++ b/scripts/stage_csharp_native_local.sh
@@ -0,0 +1,51 @@
+#!/usr/bin/env bash
+# Stage libkreuzberg_ffi into packages/csharp/Kreuzberg/runtimes/<rid>/native/
+# so dotnet test can locate it via runtime asset resolution.
+#
+# Auto-detects host RID. Idempotent.
+
+set -euo pipefail
+
+repo_root="$(cd "$(dirname "$0")/.." && pwd)"
+cd "$repo_root"
+
+case "$(uname -s)" in
+Darwin)
+  ext=dylib
+  case "$(uname -m)" in
+  arm64 | aarch64) rid=osx-arm64 ;;
+  *) rid=osx-x64 ;;
+  esac
+  ;;
+Linux)
+  ext=so
+  case "$(uname -m)" in
+  aarch64 | arm64) rid=linux-arm64 ;;
+  *) rid=linux-x64 ;;
+  esac
+  ;;
+MINGW* | MSYS* | CYGWIN*)
+  ext=dll
+  rid=win-x64
+  ;;
+*)
+  echo "Unsupported platform: $(uname -s)" >&2
+  exit 1
+  ;;
+esac
+
+src="target/release/libkreuzberg_ffi.${ext}"
+if [ "$ext" = "dll" ]; then
+  src="target/release/kreuzberg_ffi.${ext}"
+fi
+
+if [ ! -f "$src" ]; then
+  echo "ERROR: $src not found. Run: cargo build --release -p kreuzberg-ffi" >&2
+  exit 1
+fi
+
+dst_dir="packages/csharp/Kreuzberg/runtimes/${rid}/native"
+mkdir -p "$dst_dir"
+cp -f "$src" "$dst_dir/"
+
+echo "Staged $(basename "$src") -> $dst_dir/"
--- a/scripts/task/patch-demo-dev.mjs
+++ b/scripts/task/patch-demo-dev.mjs
@@ -0,0 +1,92 @@
+#!/usr/bin/env node
+// Generates docs/demo-dev.html from docs/demo.html with CDN URLs replaced
+// by the local asset server so no manual editing of demo.html is ever needed.
+//
+// CDN pattern replaced:
+//   https://cdn.jsdelivr.net/npm/@kreuzberg/wasm@*/...
+//   → http://localhost:9000/...
+//
+// Also patches pkg/web/kreuzberg_wasm.js (gitignored, wasm-pack generated) to
+// replace bare specifier imports ("env", "wasi_snapshot_preview1") with inline
+// browser shims.  The local 5.x WASM binary is compiled with WASI syscalls via
+// tesseract's C layer; the importmap approach does not propagate into Workers
+// loading cross-origin modules, so we shim the generated JS directly.
+//
+// The output file is gitignored and regenerated on every `task demo:dev`.
+
+import { readFileSync, writeFileSync, existsSync } from "node:fs";
+import { join, dirname } from "node:path";
+import { fileURLToPath } from "node:url";
+
+// Repository root: two directory levels above the directory holding this script.
+const root = join(dirname(fileURLToPath(import.meta.url)), "..", "..");
+const src = join(root, "docs", "demo.html");
+const dest = join(root, "docs", "demo-dev.html");
+// Port of the local asset server; overridable through the ASSET_PORT env var.
+const ASSET_PORT = process.env.ASSET_PORT ?? "9000";
+
+// Matches the CDN base URL including the version segment; the match stops at
+// the next "/" or quote, so the path after the version is left untouched.
+const cdnRe = /https:\/\/cdn\.jsdelivr\.net\/npm\/@kreuzberg\/wasm@[^/'"]+/g;
+
+// Three rewrites: CDN base -> local asset server, "[local dev]" appended to the
+// first <title>, and a fixed-position badge inserted before the first </body>.
+const patched = readFileSync(src, "utf8")
+	.replace(cdnRe, `http://localhost:${ASSET_PORT}`)
+	.replace(/<title>(.*?)<\/title>/, "<title>$1 [local dev]</title>")
+	.replace(
+		"</body>",
+		`  <div style="position:fixed;bottom:12px;right:12px;background:#1a172a;border:1px solid #58FBDA55;color:#58FBDA;font-family:monospace;font-size:11px;padding:6px 10px;border-radius:6px;z-index:9999">
+    local dev · assets: localhost:${ASSET_PORT}
+  </div>\n</body>`,
+	);
+
+writeFileSync(dest, patched, "utf8");
+// NOTE(review): the page URL below is hard-coded to port 8001 — presumably the
+// port of the page server started by `task demo:dev`; confirm.
+console.log(`patch-demo-dev: docs/demo-dev.html → http://localhost:8001/demo-dev.html`);
+console.log(`  assets served from http://localhost:${ASSET_PORT}`);
+
+// Patch pkg/web/kreuzberg_wasm.js — strip bare "env" / "wasi_snapshot_preview1"
+// import lines and replace with inline browser shims so the module loads in a
+// Worker without an importmap (importmap inheritance in Workers is unreliable
+// for bare specifiers in transitive cross-origin dynamic imports).
+const wasmJs = join(root, "crates", "kreuzberg-wasm", "pkg", "web", "kreuzberg_wasm.js");
+if (!existsSync(wasmJs)) {
+	console.warn(`patch-demo-dev: ${wasmJs} not found — skipping WASI shim patch`);
+} else {
+	const bareImportRe = /^import \* as (import\d+) from "(env|wasi_snapshot_preview1)"\s*$/gm;
+	const original = readFileSync(wasmJs, "utf8");
+
+	const envAliases = [];
+	const wasiAliases = [];
+	let m;
+	while ((m = bareImportRe.exec(original)) !== null) {
+		if (m[2] === "env") envAliases.push(m[1]);
+		else wasiAliases.push(m[1]);
+	}
+
+	if (envAliases.length === 0 && wasiAliases.length === 0) {
+		console.log("patch-demo-dev: kreuzberg_wasm.js already patched, skipping");
+	} else {
+		const stripped = original.replace(/^import \* as import\d+ from "(env|wasi_snapshot_preview1)"\s*\n/gm, "");
+
+		const envShim = `const __env_shim = { system: () => -1, mkstemp: () => -1 };`;
+		const envConsts = envAliases.map((a) => `const ${a} = __env_shim;`).join("\n");
+
+		const wasiShim = [
+			`const __wasi_shim = {`,
+			`  environ_sizes_get: () => 0, environ_get: () => 0,`,
+			`  clock_time_get: () => 52,`,
+			`  fd_close: () => 8, fd_fdstat_get: () => 8, fd_fdstat_set_flags: () => 8,`,
+			`  fd_prestat_get: () => 8, fd_prestat_dir_name: () => 8,`,
+			`  fd_read: () => 8, fd_seek: () => 8, fd_write: () => 8,`,
+			`  path_create_directory: () => 52, path_filestat_get: () => 52,`,
+			`  path_open: () => 52, path_remove_directory: () => 52, path_unlink_file: () => 52,`,
+			`  proc_exit: (code) => { throw new Error("WASI: proc_exit(" + code + ")"); },`,
+			`};`,
+		].join("\n");
+		const wasiConsts = wasiAliases.map((a) => `const ${a} = __wasi_shim;`).join("\n");
+
+		const shims = [envShim, envConsts, wasiShim, wasiConsts].filter(Boolean).join("\n") + "\n";
+		const patchedWasmJs = stripped.replace(/^(\/\* @ts-self-types[^\n]*\n)/m, `$1${shims}`);
+
+		writeFileSync(wasmJs, patchedWasmJs, "utf8");
+		console.log(
+			`patch-demo-dev: patched kreuzberg_wasm.js` +
+				` (${envAliases.length} env alias(es), ${wasiAliases.length} wasi alias(es))`,
+		);
+	}
+}
--- a/scripts/test/README.md
+++ b/scripts/test/README.md
@@ -0,0 +1,264 @@
+# Docker Configuration Testing Scripts
+
+This directory contains comprehensive testing scripts for validating Docker configuration scenarios.
+
+## Scripts
+
+### test-docker-config-local.sh
+
+A comprehensive local Docker testing script that validates all configuration volume mount scenarios.
+
+#### Purpose
+
+Tests Docker configuration in various scenarios:
+
+- Volume mounts to `/etc/kreuzberg/kreuzberg.toml` (recommended system path)
+- Volume mounts to `/app/.config/kreuzberg/config.toml` (user path)
+- Custom paths with `--config` flag
+- Environment variable overrides with config files
+- All config formats (TOML, YAML, JSON)
+- Read-only mounts (`:ro` flag)
+
+#### Requirements
+
+- Docker installed and running
+- Docker images pre-built (`kreuzberg:core` and/or `kreuzberg:full`)
+- Port range 18100-18199 available for testing
+
+#### Usage
+
+```bash
+./test-docker-config-local.sh [OPTIONS]
+```
+
+#### Options
+
+| Option              | Description                                     | Default  |
+| ------------------- | ----------------------------------------------- | -------- |
+| `--variant VARIANT` | Test specific variant: `core`, `full`, or `all` | `all`    |
+| `--verbose`         | Enable verbose debugging output                 | Disabled |
+| `--keep-containers` | Preserve containers after tests for inspection  | Clean up |
+| `--help`            | Display help message                            | -        |
+
+#### Examples
+
+Test both core and full variants:
+
+```bash
+./test-docker-config-local.sh
+```
+
+Test only the full variant with verbose output:
+
+```bash
+./test-docker-config-local.sh --variant full --verbose
+```
+
+Test core variant and keep containers for inspection:
+
+```bash
+./test-docker-config-local.sh --variant core --keep-containers
+```
+
+#### Test Cases
+
+The script runs 8 test cases for each variant:
+
+1. **Volume mount to /etc/kreuzberg/kreuzberg.toml**
+   - Tests the recommended system-wide configuration path
+   - Validates read-only mount functionality
+
+2. **Volume mount to /app/.config/kreuzberg/config.toml**
+   - Tests the user-level configuration path
+   - Validates alternative mount location
+
+3. **Custom path with --config flag**
+   - Tests custom configuration file paths
+   - Validates explicit path specification via CLI flag
+
+4. **Environment variable overrides with config file**
+   - Tests that environment variables can override config file settings
+   - Validates configuration precedence
+
+5. **TOML config format**
+   - Tests TOML configuration file format support
+   - Validates parsing of TOML syntax
+
+6. **YAML config format**
+   - Tests YAML configuration file format support
+   - Validates parsing of YAML syntax
+
+7. **JSON config format**
+   - Tests JSON configuration file format support
+   - Validates parsing of JSON syntax
+
+8. **Read-only mount**
+   - Tests that containers work correctly with read-only mounts
+   - Validates security of mounted volumes
+
+#### Validation Method
+
+For each test, the script:
+
+1. Creates a temporary configuration file in the specified format
+2. Starts a Docker container with the configuration mounted
+3. Waits for the service to become healthy (up to 30 seconds)
+4. Verifies the health endpoint responds successfully
+5. Stops and removes the container
+6. Reports pass/fail status
+
+#### Output
+
+The script provides clear, color-coded output:
+
+- `[PASS]` - Test passed (green)
+- `[FAIL]` - Test failed (red)
+- `[INFO]` - Informational messages (blue)
+- `[WARN]` - Warnings (yellow)
+- `[DEBUG]` - Debug information (yellow, with `--verbose`)
+
+Example output:
+
+```text
+╔════════════════════════════════════════════════════════╗
+║ Docker Configuration Volume Mount Test Suite           ║
+╚════════════════════════════════════════════════════════╝
+
+[INFO] Configuration:
+[INFO]   Variant:         all
+[INFO]   Verbose:         false
+[INFO]   Keep Containers: false
+[INFO]   Port Range:      18100-18199
+
+[INFO] Docker is available
+
+Test 01: Volume mount to /etc/kreuzberg/kreuzberg.toml (variant: core)
+[PASS] Test passed
+
+Test 02: Volume mount to /app/.config/kreuzberg/config.toml (variant: core)
+[PASS] Test passed
+
+...
+
+╔════════════════════════════════════════════════════════╗
+║ Test Summary                                           ║
+╚════════════════════════════════════════════════════════╝
+
+Total Tests:   16
+Passed Tests:  16
+Failed Tests:  0
+Pass Rate:     100%
+
+Tested Variants:
+  - kreuzberg:core
+  - kreuzberg:full
+```
+
+#### Troubleshooting
+
+**Error: Docker is not installed or not in PATH**
+
+- Install Docker from <https://www.docker.com/products/docker-desktop>
+- Ensure Docker is in your system PATH
+
+**Error: Docker daemon is not running**
+
+- Start Docker Desktop or the Docker daemon
+- On Linux: `sudo systemctl start docker`
+
+**Error: Docker image does not exist**
+
+- Build the required image(s):
+
+  ```bash
+  cd /path/to/kreuzberg
+  docker build -f docker/Dockerfile.core -t kreuzberg:core .
+  docker build -f docker/Dockerfile.full -t kreuzberg:full .
+  ```
+
+**Tests timing out**
+
+- Check system resources (CPU, memory)
+- Increase timeout: Modify `TIMEOUT_SECONDS=30` in the script
+- Check Docker logs: `docker logs <container-name>`
+
+**Port conflicts**
+
+- Ensure ports 18100-18199 are available
+- Check for existing containers: `docker ps -a`
+- Kill conflicting containers: `docker kill <container-name>`
+
+#### Environment Variables
+
+The script respects these environment variables:
+
+| Variable          | Description                           | Default |
+| ----------------- | ------------------------------------- | ------- |
+| `TEST_VARIANT`    | Override variant via environment      | Unset   |
+| `VERBOSE`         | Enable verbose output via environment | `false` |
+| `KEEP_CONTAINERS` | Keep containers via environment       | `false` |
+
+Example:
+
+```bash
+VERBOSE=true ./test-docker-config-local.sh --variant core
+```
+
+#### Temporary Files
+
+The script creates temporary configuration files in `/tmp/kreuzberg-config-test-$PID/`:
+
+- `kreuzberg.toml` - TOML format test config
+- `config.yaml` - YAML format test config
+- `config.json` - JSON format test config
+
+These are automatically cleaned up after tests complete (unless `--keep-containers` is used).
+
+#### Exit Codes
+
+- `0` - All tests passed
+- `1` - One or more tests failed, or Docker is not available
+
+#### Performance Notes
+
+- Each test takes approximately 2-5 seconds
+- Total test suite runtime: 1-2 minutes for all variants
+- Network latency may affect health check timing
+- Container startup time depends on system resources
+
+#### CI/CD Integration
+
+The script can be integrated into CI/CD pipelines:
+
+```bash
+#!/bin/bash
+set -e
+
+# Build images
+docker build -f docker/Dockerfile.core -t kreuzberg:core .
+docker build -f docker/Dockerfile.full -t kreuzberg:full .
+
+# Run tests
+./scripts/test/test-docker-config-local.sh --variant all
+
+echo "Configuration tests passed!"
+```
+
+#### Limitations
+
+- Requires Docker to be installed and running
+- Tests only configuration volume mounts (not other volume types)
+- Tests only health endpoint (basic connectivity validation)
+- Assumes `kreuzberg:*` image naming convention
+- Tests run sequentially (not parallelized)
+
+#### Future Enhancements
+
+Potential improvements:
+
+- Parallel test execution for faster results
+- Additional validation endpoints (beyond `/health`)
+- Configuration value verification (test that config was actually loaded)
+- Performance benchmarking
+- Multi-architecture testing (arm64, amd64)
+- Docker Compose integration tests
--- a/scripts/test/USAGE.md
+++ b/scripts/test/USAGE.md
@@ -0,0 +1,528 @@
+# Docker Configuration Testing - Quick Start Guide
+
+## Overview
+
+The `test-docker-config-local.sh` script provides comprehensive testing for Docker configuration volume mounts and environment variable overrides.
+
+## Prerequisites
+
+1. **Docker**: Installed and running
+2. **Images**: Pre-built Docker images for testing
+3. **Ports**: 18100-18199 available for test containers
+4. **Utilities**: `bash`, `curl`, `docker` command-line tools
+
+## Building Test Images
+
+Before running tests, build the Docker images:
+
+```bash
+cd .
+
+# Build core variant
+docker build -f docker/Dockerfile.core -t kreuzberg:core .
+
+# Build full variant
+docker build -f docker/Dockerfile.full -t kreuzberg:full .
+
+# Or build both
+docker build -f docker/Dockerfile.core -t kreuzberg:core . && \
+docker build -f docker/Dockerfile.full -t kreuzberg:full .
+```
+
+## Running Tests
+
+### Basic Usage
+
+Test all variants with default settings:
+
+```bash
+./scripts/test/test-docker-config-local.sh
+```
+
+### Common Commands
+
+**Test only core variant:**
+
+```bash
+./scripts/test/test-docker-config-local.sh --variant core
+```
+
+**Test only full variant:**
+
+```bash
+./scripts/test/test-docker-config-local.sh --variant full
+```
+
+**Enable verbose output:**
+
+```bash
+./scripts/test/test-docker-config-local.sh --verbose
+```
+
+**Keep containers after testing:**
+
+```bash
+./scripts/test/test-docker-config-local.sh --keep-containers
+```
+
+**Combine multiple options:**
+
+```bash
+./scripts/test/test-docker-config-local.sh --variant full --verbose --keep-containers
+```
+
+## Test Cases Explained
+
+### 1. Volume Mount to /etc/kreuzberg/kreuzberg.toml
+
+**What it tests**: System-wide configuration path (recommended)
+
+**Docker command**:
+
+```bash
+docker run -v /local/config.toml:/etc/kreuzberg/kreuzberg.toml:ro kreuzberg:full
+```
+
+**Expected**: Container reads config from standard system location
+
+---
+
+### 2. Volume Mount to /app/.config/kreuzberg/config.toml
+
+**What it tests**: User-level configuration path (alternative location)
+
+**Docker command**:
+
+```bash
+docker run -v /local/config.toml:/app/.config/kreuzberg/config.toml:ro kreuzberg:full
+```
+
+**Expected**: Container reads config from user application directory
+
+---
+
+### 3. Custom Path with --config Flag
+
+**What it tests**: Explicit configuration path specification
+
+**Docker command**:
+
+```bash
+docker run \
+  -v /local/config.toml:/app/custom-config.toml:ro \
+  --entrypoint "/usr/local/bin/kreuzberg" \
+  kreuzberg:full \
+  serve --config /app/custom-config.toml --host 0.0.0.0
+```
+
+**Expected**: Container uses specified custom path
+
+---
+
+### 4. Environment Variable Overrides
+
+**What it tests**: Environment variables override config file settings
+
+**Docker command**:
+
+```bash
+docker run \
+  -v /local/config.toml:/etc/kreuzberg/kreuzberg.toml:ro \
+  -e KREUZBERG_SERVER_PORT=8000 \
+  kreuzberg:full
+```
+
+**Expected**: Environment variable takes precedence over config file
+
+---
+
+### 5. TOML Format Support
+
+**What it tests**: Configuration in TOML format
+
+**Config file**:
+
+```toml
+use_cache = true
+enable_quality_processing = true
+
+[ocr]
+backend = "tesseract"
+language = "eng"
+```
+
+**Expected**: Container parses TOML correctly
+
+---
+
+### 6. YAML Format Support
+
+**What it tests**: Configuration in YAML format
+
+**Config file**:
+
+```yaml
+use_cache: true
+enable_quality_processing: true
+
+ocr:
+  backend: "tesseract"
+  language: "eng"
+```
+
+**Expected**: Container parses YAML correctly
+
+---
+
+### 7. JSON Format Support
+
+**What it tests**: Configuration in JSON format
+
+**Config file**:
+
+```json
+{
+  "use_cache": true,
+  "enable_quality_processing": true,
+  "ocr": {
+    "backend": "tesseract",
+    "language": "eng"
+  }
+}
+```
+
+**Expected**: Container parses JSON correctly
+
+---
+
+### 8. Read-Only Mount
+
+**What it tests**: Security of read-only mounted volumes
+
+**Docker command**:
+
+```bash
+docker run -v /local/config.toml:/etc/kreuzberg/kreuzberg.toml:ro kreuzberg:full
+```
+
+**Expected**: Container works with read-only volumes, application doesn't attempt to modify config
+
+---
+
+## Understanding Output
+
+### Success Output
+
+```text
+╔════════════════════════════════════════════════════════╗
+║ Docker Configuration Volume Mount Test Suite           ║
+╚════════════════════════════════════════════════════════╝
+
+[INFO] Configuration:
+[INFO]   Variant:         all
+[INFO]   Verbose:         false
+[INFO]   Keep Containers: false
+[INFO]   Port Range:      18100-18199
+
+[INFO] Docker is available
+
+Test 01: Volume mount to /etc/kreuzberg/kreuzberg.toml (variant: core)
+[PASS] Test passed
+```
+
+### Failure Output
+
+```text
+Test 03: Custom path with --config flag (variant: core)
+[FAIL] Test failed: Failed to start container with custom --config flag
+[FAIL]   Details: Container logs:
+          /app/kreuzberg: line 123: syntax error: unexpected token
+```
+
+### Summary
+
+```text
+╔════════════════════════════════════════════════════════╗
+║ Test Summary                                           ║
+╚════════════════════════════════════════════════════════╝
+
+Total Tests:   16
+Passed Tests:  16
+Failed Tests:  0
+Pass Rate:     100%
+
+Tested Variants:
+  - kreuzberg:core
+  - kreuzberg:full
+```
+
+## Debugging Failed Tests
+
+### Enable Verbose Output
+
+```bash
+./scripts/test/test-docker-config-local.sh --variant core --verbose
+```
+
+Verbose output shows:
+
+- Container IDs
+- Docker arguments
+- Service startup timing
+- Health check attempts
+
+### Keep Containers for Inspection
+
+```bash
+./scripts/test/test-docker-config-local.sh --keep-containers
+```
+
+Then inspect containers manually:
+
+```bash
+# List test containers
+docker ps -a | grep kreuzberg-config-test
+
+# View specific container logs
+docker logs kreuzberg-config-test-etc-core-12345
+
+# Execute command in running container
+docker exec kreuzberg-config-test-etc-core-12345 cat /etc/kreuzberg/kreuzberg.toml
+
+# Stop container manually
+docker stop kreuzberg-config-test-etc-core-12345
+docker rm kreuzberg-config-test-etc-core-12345
+```
+
+### Check Health Endpoint Manually
+
+```bash
+# Start container manually
+docker run -d \
+  --name test-container \
+  -p 8000:8000 \
+  -v /path/to/config.toml:/etc/kreuzberg/kreuzberg.toml:ro \
+  kreuzberg:full
+
+# Wait for startup
+sleep 3
+
+# Test health endpoint
+curl -v http://localhost:8000/health
+
+# View logs
+docker logs test-container
+
+# Cleanup
+docker stop test-container
+docker rm test-container
+```
+
+## Troubleshooting
+
+### Docker Not Found
+
+```text
+[FAIL] Docker is not installed or not in PATH
+```
+
+**Solution**: Install Docker or ensure it's in your PATH
+
+```bash
+which docker
+export PATH=$PATH:/usr/local/bin  # or wherever docker is installed
+```
+
+### Docker Daemon Not Running
+
+```text
+[FAIL] Docker daemon is not running or you don't have permissions
+```
+
+**Solution**: Start Docker
+
+```bash
+# macOS
+open -a Docker
+
+# Linux
+sudo systemctl start docker
+
+# Check status
+docker ps
+```
+
+### Image Not Found
+
+```text
+[WARN] Skipping tests for variant: full (image not found)
+```
+
+**Solution**: Build the image
+
+```bash
+docker build -f docker/Dockerfile.full -t kreuzberg:full .
+```
+
+### Port Already in Use
+
+```text
+[FAIL] Test failed: Failed to start container
+[FAIL]   Details: port is already allocated
+```
+
+**Solution**: Free the ports or wait for existing tests to finish
+
+```bash
+# Find what's using the ports
+lsof -i :18100-18199
+
+# Or just stop all test containers
+docker ps -a --filter "name=kreuzberg-config-test" --format "{{.Names}}" | \
+  xargs -r docker stop
+```
+
+### Health Check Timeout
+
+```text
+[FAIL] Test failed: Service failed to start (health check timeout)
+```
+
+**Debugging**:
+
+1. Check container is still running:
+
+```bash
+docker ps | grep kreuzberg-config-test
+```
+
+2. View container logs:
+
+```bash
+docker logs <container-name>
+```
+
+3. Check if service is binding to port:
+
+```bash
+docker exec <container-name> netstat -tuln | grep 8000
+```
+
+4. Increase timeout (edit script):
+
+```bash
+TIMEOUT_SECONDS=60  # Change from 30
+```
+
+## CI/CD Integration
+
+### GitHub Actions
+
+```yaml
+name: Docker Config Tests
+
+on: [push, pull_request]
+
+jobs:
+  test:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v3
+
+      - name: Build Docker images
+        run: |
+          docker build -f docker/Dockerfile.core -t kreuzberg:core .
+          docker build -f docker/Dockerfile.full -t kreuzberg:full .
+
+      - name: Run configuration tests
+        run: ./scripts/test/test-docker-config-local.sh --variant all
+```
+
+### GitLab CI
+
+```yaml
+docker-config-tests:
+  stage: test
+  image: docker:latest
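+  services:
+    - docker:dind
+  before_script:
+    # docker:latest is Alpine-based and ships neither bash nor curl, which the test script needs
+    - apk add --no-cache bash curl
+  script: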
+    - docker build -f docker/Dockerfile.core -t kreuzberg:core .
+    - docker build -f docker/Dockerfile.full -t kreuzberg:full .
+    - ./scripts/test/test-docker-config-local.sh --variant all
+```
+
+## Performance Expectations
+
+| Metric                    | Time           |
+| ------------------------- | -------------- |
+| Single test               | 2-5 seconds    |
+| All 8 tests (1 variant)   | 30-45 seconds  |
+| All 16 tests (2 variants) | 60-90 seconds  |
+| With verbose output       | +10-20 seconds |
+
+## Exit Codes
+
+| Code | Meaning                                        |
+| ---- | ---------------------------------------------- |
+| 0    | All tests passed                               |
+| 1    | One or more tests failed OR Docker unavailable |
+
+## Advanced Usage
+
+### Custom Environment Variables
+
+```bash
+# Override variant via environment
+TEST_VARIANT=core ./scripts/test/test-docker-config-local.sh
+
+# Override verbose via environment
+VERBOSE=true ./scripts/test/test-docker-config-local.sh
+```
+
+### Modify Timeout
+
+Edit the script to change timeout:
+
+```bash
+TIMEOUT_SECONDS=60  # Line ~43, change from 30
+```
+
+### Test Specific Scenarios
+
+To test only one specific scenario, comment out the unwanted test calls inside `run_test_suite()`:
+
+```bash
+# Comment out unwanted tests
+# test_etc_kreuzberg_mount "$variant"
+test_app_config_mount "$variant"
+# test_custom_path_with_flag "$variant"
+# ... etc
+```
+
+## Getting Help
+
+```bash
+./scripts/test/test-docker-config-local.sh --help
+```
+
+For detailed documentation:
+
+```bash
+cat ./scripts/test/README.md
+```
+
+## Related Files
+
+- **Script**: `./scripts/test/test-docker-config-local.sh`
+- **Documentation**: `./scripts/test/README.md`
+- **This Guide**: `./scripts/test/USAGE.md`
+- **Docker Files**: `./docker/Dockerfile.core`
+- **Docker Files**: `./docker/Dockerfile.full`
--- a/scripts/test/test-docker-config-local.sh
+++ b/scripts/test/test-docker-config-local.sh
@@ -0,0 +1,800 @@
+#!/bin/bash
+
+################################################################################
+# Docker Configuration Volume Mount Testing Script
+#
+# This script validates all Docker configuration scenarios locally:
+# - Volume mounts to /etc/kreuzberg/kreuzberg.toml (recommended)
+# - Volume mounts to /app/.config/kreuzberg/config.toml (user path)
+# - Custom paths with --config flag
+# - Environment variable overrides with config files
+# - All config formats (TOML, YAML, JSON)
+# - Read-only mounts
+#
+# Usage: ./test-docker-config-local.sh [OPTIONS]
+# Options:
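+#   --variant core|full|all   Test specific variant (default: all)
+#   --image IMAGE             Test this image instead of kreuzberg:<variant>
+#   --verbose                 Enable verbose output
+#   --keep-containers         Don't cleanup containers after tests
+#   --help                    Show usage information and exit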
+################################################################################
+
+# Propagate a failure from any stage of a pipeline, not only the last one.
+set -o pipefail
+
+# Absolute path of this script's directory and of the docker/ directory two
+# levels above it.
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+DOCKER_DIR="$(cd "$SCRIPT_DIR/../../docker" && pwd)"
+
+# Color codes (escape sequences; the log helpers print them with `echo -e`)
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+BLUE='\033[0;34m'
+CYAN='\033[0;36m'
+NC='\033[0m'
+
+# Test configuration
+# TEST_VARIANT, IMAGE_NAME, VERBOSE and KEEP_CONTAINERS may be preset through
+# the environment; main() overwrites them from the command-line flags.
+TEST_VARIANT="${TEST_VARIANT:-all}"
+IMAGE_NAME="${IMAGE_NAME:-}" # Empty means use kreuzberg:<variant> (see get_image_name)
+VERBOSE="${VERBOSE:-false}"
+KEEP_CONTAINERS="${KEEP_CONTAINERS:-false}"
+TIMEOUT_SECONDS=30 # Default number of seconds wait_for_health polls before giving up
+PORT_BASE=18100 # Each test publishes on host port PORT_BASE + its test number
+TEST_TEMP_DIR="/tmp/kreuzberg-config-test-$$" # Scratch directory, suffixed with this shell's PID
+
+# Test tracking
+# Counters are updated by start_test / pass_test / fail_test; the arrays feed
+# print_summary.
+TOTAL_TESTS=0
+PASSED_TESTS=0
+FAILED_TESTS=0
+declare -a FAILED_TEST_NAMES=()
+declare -a TESTED_VARIANTS=()
+
+################################################################################
+# Helper Functions
+################################################################################
+
+log_header() {
+  echo -e "\n${CYAN}╔════════════════════════════════════════════════════════╗${NC}"
+  echo -e "${CYAN}║ $1${NC}"
+  echo -e "${CYAN}╚════════════════════════════════════════════════════════╝${NC}\n"
+}
+
+# Print all arguments on one line behind a blue [INFO] tag.
+log_info() {
+  printf '%b\n' "${BLUE}[INFO]${NC} $*"
+}
+
+# Print all arguments on one line behind a green [PASS] tag.
+log_success() {
+  printf '%b\n' "${GREEN}[PASS]${NC} $*"
+}
+
+# Print all arguments on one line behind a yellow [WARN] tag.
+log_warning() {
+  printf '%b\n' "${YELLOW}[WARN]${NC} $*"
+}
+
+log_error() {
+  echo -e "${RED}[FAIL]${NC} $*"
+}
+
+log_debug() {
+  if [ "$VERBOSE" = "true" ]; then
+    echo -e "${YELLOW}[DEBUG]${NC} $*"
+  fi
+}
+
+# Register a new test: bump TOTAL_TESTS and print a blank line followed by
+# "Test NN: <description>", where NN is the zero-padded running test number.
+start_test() {
+  TOTAL_TESTS=$((TOTAL_TESTS + 1))
+  local label
+  printf -v label '%02d' "$TOTAL_TESTS"
+  printf '\n%b\n' "${CYAN}Test ${label}:${NC} $*"
+}
+
+# Record the current test as passed and report it.
+pass_test() {
+  : "$((PASSED_TESTS += 1))"
+  log_success "Test passed"
+}
+
+# Record the current test as failed.
+# Arguments: $1 - failure summary (also stored for the final report)
+#            $2 - optional extra details, printed on a second line
+fail_test() {
+  local summary="$1"
+  local details="${2:-}"
+
+  FAILED_TESTS=$((FAILED_TESTS + 1))
+  FAILED_TEST_NAMES+=("$summary")
+  log_error "Test failed: $summary"
+
+  [ -z "$details" ] || log_error "  Details: $details"
+}
+
+# shellcheck disable=SC2317,SC2329  # Function is invoked via trap EXIT
+cleanup() {
+  log_info "Cleaning up test environment..."
+
+  if [ "$KEEP_CONTAINERS" != "true" ]; then
+    # Stop and remove test containers
+    docker ps -a --filter "name=kreuzberg-config-test-" --format "{{.Names}}" | while read -r container; do
+      log_debug "Stopping container: $container"
+      docker stop "$container" 2>/dev/null || true
+      docker rm "$container" 2>/dev/null || true
+    done
+  else
+    log_warning "Keeping containers for inspection (use 'docker ps -a' to view)"
+  fi
+
+  # Remove temporary test files
+  if [ -d "$TEST_TEMP_DIR" ]; then
+    log_debug "Removing temporary directory: $TEST_TEMP_DIR"
+    rm -rf "$TEST_TEMP_DIR"
+  fi
+}
+
+trap cleanup EXIT
+
+################################################################################
+# Setup Functions
+################################################################################
+
+setup_test_environment() {
+  log_info "Setting up test environment..."
+
+  if ! mkdir -p "$TEST_TEMP_DIR"; then
+    log_error "Failed to create temporary directory"
+    exit 1
+  fi
+
+  log_debug "Test temp directory: $TEST_TEMP_DIR"
+}
+
+verify_docker_available() {
+  if ! command -v docker &>/dev/null; then
+    log_error "Docker is not installed or not in PATH"
+    exit 1
+  fi
+
+  if ! docker ps &>/dev/null; then
+    log_error "Docker daemon is not running or you don't have permissions"
+    exit 1
+  fi
+
+  log_info "Docker is available"
+}
+
+check_image_exists() {
+  local image="$1"
+
+  if ! docker image inspect "$image" &>/dev/null; then
+    log_error "Docker image does not exist: $image"
+    log_error "Please build the image first with: docker build -f $DOCKER_DIR/Dockerfile.${image##*:} -t $image ."
+    return 1
+  fi
+
+  return 0
+}
+
+get_image_name() {
+  local variant="$1"
+
+  if [ -n "$IMAGE_NAME" ]; then
+    # Use provided image name (CI mode)
+    echo "$IMAGE_NAME"
+  else
+    # Use default naming convention (local mode)
+    echo "kreuzberg:$variant"
+  fi
+}
+
+################################################################################
+# Config File Creation Functions
+################################################################################
+
+#######################################
+# Write the TOML test configuration.
+# Arguments: $1 - destination file path
+#            $2 - accepted for call compatibility but unused: the generated
+#                 config contains no server settings
+#######################################
+create_toml_config() {
+  local file_path="$1"
+
+  # Config must be valid ExtractionConfig (deny_unknown_fields).
+  # Server settings use defaults; ports are mapped via docker -p flag.
+  # The quoted delimiter keeps the heredoc literal (no expansion).
+  cat >"$file_path" <<'EOF'
+use_cache = true
+enable_quality_processing = true
+
+[ocr]
+backend = "tesseract"
+language = "eng"
+EOF
+
+  log_debug "Created TOML config: $file_path"
+}
+
+#######################################
+# Write the YAML test configuration.
+# Arguments: $1 - destination file path
+#            $2 - accepted for call compatibility but unused: the generated
+#                 config contains no server settings
+#######################################
+create_yaml_config() {
+  local file_path="$1"
+
+  # Config must be valid ExtractionConfig (deny_unknown_fields).
+  # Server settings use defaults; ports are mapped via docker -p flag.
+  # The quoted delimiter keeps the heredoc literal (no expansion).
+  cat >"$file_path" <<'EOF'
+use_cache: true
+enable_quality_processing: true
+
+ocr:
+  backend: "tesseract"
+  language: "eng"
+EOF
+
+  log_debug "Created YAML config: $file_path"
+}
+
+#######################################
+# Write the JSON test configuration.
+# Arguments: $1 - destination file path
+#            $2 - accepted for call compatibility but unused: the generated
+#                 config contains no server settings
+#######################################
+create_json_config() {
+  local file_path="$1"
+
+  # Config must be valid ExtractionConfig (deny_unknown_fields).
+  # Server settings use defaults; ports are mapped via docker -p flag.
+  # The quoted delimiter keeps the heredoc literal (no expansion).
+  cat >"$file_path" <<'EOF'
+{
+  "use_cache": true,
+  "enable_quality_processing": true,
+  "ocr": {
+    "backend": "tesseract",
+    "language": "eng"
+  }
+}
+EOF
+
+  log_debug "Created JSON config: $file_path"
+}
+
+################################################################################
+# Container Testing Functions
+################################################################################
+
+run_container() {
+  local container_name="$1"
+  local image="$2"
+  local port="$3"
+  shift 3
+
+  # Separate docker options from command arguments
+  local docker_opts=()
+  local cmd_args=()
+  local after_separator=false
+
+  while [ $# -gt 0 ]; do
+    if [ "$1" = "--" ]; then
+      after_separator=true
+      shift
+      continue
+    fi
+
+    if [ "$after_separator" = true ]; then
+      cmd_args+=("$1")
+    else
+      docker_opts+=("$1")
+    fi
+    shift
+  done
+
+  log_debug "Running container: $container_name"
+  log_debug "Docker opts: ${docker_opts[*]}"
+  log_debug "Command args: ${cmd_args[*]}"
+
+  if ! docker run -d \
+    --name "$container_name" \
+    -p "$port:8000" \
+    "${docker_opts[@]}" \
+    "$image" \
+    "${cmd_args[@]}" >/dev/null 2>&1; then
+    return 1
+  fi
+
+  return 0
+}
+
+wait_for_health() {
+  local port="$1"
+  local max_wait="${2:-$TIMEOUT_SECONDS}"
+  local elapsed=0
+  local interval=1
+
+  log_debug "Waiting for service on port $port (timeout: ${max_wait}s)"
+
+  while [ "$elapsed" -lt "$max_wait" ]; do
+    if curl -sf "http://localhost:$port/health" &>/dev/null; then
+      log_debug "Service became healthy after ${elapsed}s"
+      return 0
+    fi
+
+    sleep $interval
+    elapsed=$((elapsed + interval))
+  done
+
+  log_debug "Service did not become healthy within ${max_wait}s"
+  return 1
+}
+
+check_container_running() {
+  local container_name="$1"
+
+  if docker inspect "$container_name" --format='{{.State.Running}}' 2>/dev/null | grep -q "true"; then
+    return 0
+  fi
+
+  return 1
+}
+
+# Print the last 20 lines of the combined stdout/stderr log of container $1.
+get_container_logs() {
+  local target="$1"
+  docker logs "$target" 2>&1 | tail -n 20
+}
+
+################################################################################
+# Test Cases
+################################################################################
+
+test_etc_kreuzberg_mount() {
+  local variant="$1"
+  start_test "Volume mount to /etc/kreuzberg/kreuzberg.toml (variant: $variant)"
+
+  local image
+  image="$(get_image_name "$variant")"
+  local port=$((PORT_BASE + TOTAL_TESTS))
+  local container_name="kreuzberg-config-test-etc-${variant}-$$"
+  local config_file="$TEST_TEMP_DIR/kreuzberg.toml"
+
+  # Create config file
+  create_toml_config "$config_file" "$port"
+
+  # Run container with mount
+  if ! run_container "$container_name" "$image" "$port" \
+    --volume "$config_file:/etc/kreuzberg/kreuzberg.toml:ro"; then
+    fail_test "Failed to start container with /etc/kreuzberg mount"
+    log_error "  Container logs:\n$(get_container_logs "$container_name" 2>/dev/null || echo 'N/A')"
+    return 1
+  fi
+
+  sleep 2
+
+  # Check if container is still running
+  if ! check_container_running "$container_name"; then
+    fail_test "Container exited unexpectedly"
+    log_error "  Container logs:\n$(get_container_logs "$container_name")"
+    return 1
+  fi
+
+  # Wait for service to be healthy
+  if ! wait_for_health "$port"; then
+    fail_test "Service failed to start (health check timeout)"
+    log_error "  Container logs:\n$(get_container_logs "$container_name")"
+    docker stop "$container_name" 2>/dev/null || true
+    return 1
+  fi
+
+  # Test the health endpoint
+  if ! curl -sf "http://localhost:$port/health" >/dev/null; then
+    fail_test "Health endpoint returned non-success status"
+    docker stop "$container_name" 2>/dev/null || true
+    return 1
+  fi
+
+  log_success "Service is running and healthy"
+  docker stop "$container_name" 2>/dev/null || true
+  pass_test
+}
+
+test_app_config_mount() {
+  local variant="$1"
+  start_test "Volume mount to /app/.config/kreuzberg/config.toml (variant: $variant)"
+
+  local image
+  image="$(get_image_name "$variant")"
+  local port=$((PORT_BASE + TOTAL_TESTS))
+  local container_name="kreuzberg-config-test-app-config-${variant}-$$"
+  local config_file="$TEST_TEMP_DIR/config.toml"
+
+  # Create config file
+  create_toml_config "$config_file" "$port"
+
+  # Run container with mount
+  if ! run_container "$container_name" "$image" "$port" \
+    --volume "$config_file:/app/.config/kreuzberg/config.toml:ro"; then
+    fail_test "Failed to start container with /app/.config mount"
+    log_error "  Container logs:\n$(get_container_logs "$container_name" 2>/dev/null || echo 'N/A')"
+    return 1
+  fi
+
+  sleep 2
+
+  if ! check_container_running "$container_name"; then
+    fail_test "Container exited unexpectedly"
+    log_error "  Container logs:\n$(get_container_logs "$container_name")"
+    return 1
+  fi
+
+  if ! wait_for_health "$port"; then
+    fail_test "Service failed to start (health check timeout)"
+    log_error "  Container logs:\n$(get_container_logs "$container_name")"
+    docker stop "$container_name" 2>/dev/null || true
+    return 1
+  fi
+
+  if ! curl -sf "http://localhost:$port/health" >/dev/null; then
+    fail_test "Health endpoint returned non-success status"
+    docker stop "$container_name" 2>/dev/null || true
+    return 1
+  fi
+
+  log_success "Service is running and healthy"
+  docker stop "$container_name" 2>/dev/null || true
+  pass_test
+}
+
+test_custom_path_with_flag() {
+  local variant="$1"
+  start_test "Custom path with --config flag (variant: $variant)"
+
+  local image
+  image="$(get_image_name "$variant")"
+  local port=$((PORT_BASE + TOTAL_TESTS))
+  local container_name="kreuzberg-config-test-custom-${variant}-$$"
+  local config_file="$TEST_TEMP_DIR/custom-config.toml"
+  local container_path="/app/custom-config.toml"
+
+  # Create config file
+  create_toml_config "$config_file" "$port"
+
+  # Run container with custom config path
+  if ! run_container "$container_name" "$image" "$port" \
+    --volume "$config_file:$container_path:ro" \
+    --entrypoint "/usr/local/bin/kreuzberg" \
+    -- "serve" "--config" "$container_path" "--host" "0.0.0.0"; then
+    fail_test "Failed to start container with custom --config flag"
+    log_error "  Container logs:\n$(get_container_logs "$container_name" 2>/dev/null || echo 'N/A')"
+    return 1
+  fi
+
+  sleep 2
+
+  if ! check_container_running "$container_name"; then
+    fail_test "Container exited unexpectedly"
+    log_error "  Container logs:\n$(get_container_logs "$container_name")"
+    return 1
+  fi
+
+  if ! wait_for_health "$port"; then
+    fail_test "Service failed to start (health check timeout)"
+    log_error "  Container logs:\n$(get_container_logs "$container_name")"
+    docker stop "$container_name" 2>/dev/null || true
+    return 1
+  fi
+
+  if ! curl -sf "http://localhost:$port/health" >/dev/null; then
+    fail_test "Health endpoint returned non-success status"
+    docker stop "$container_name" 2>/dev/null || true
+    return 1
+  fi
+
+  log_success "Service is running and healthy with custom config path"
+  docker stop "$container_name" 2>/dev/null || true
+  pass_test
+}
+
+test_env_var_overrides() {
+  local variant="$1"
+  start_test "Environment variable overrides with config file (variant: $variant)"
+
+  local image
+  image="$(get_image_name "$variant")"
+  local port=$((PORT_BASE + TOTAL_TESTS))
+  local container_name="kreuzberg-config-test-env-${variant}-$$"
+  local config_file="$TEST_TEMP_DIR/env-config.toml"
+
+  # Create config file with port 8000
+  create_toml_config "$config_file" "8000"
+
+  # Run container with config mount and environment variable override
+  if ! run_container "$container_name" "$image" "$port" \
+    --volume "$config_file:/etc/kreuzberg/kreuzberg.toml:ro" \
+    --env "KREUZBERG_SERVER_PORT=$port"; then
+    fail_test "Failed to start container with env var override"
+    log_error "  Container logs:\n$(get_container_logs "$container_name" 2>/dev/null || echo 'N/A')"
+    return 1
+  fi
+
+  sleep 2
+
+  if ! check_container_running "$container_name"; then
+    fail_test "Container exited unexpectedly"
+    log_error "  Container logs:\n$(get_container_logs "$container_name")"
+    return 1
+  fi
+
+  if ! wait_for_health "$port"; then
+    fail_test "Service failed to start (health check timeout)"
+    log_error "  Container logs:\n$(get_container_logs "$container_name")"
+    docker stop "$container_name" 2>/dev/null || true
+    return 1
+  fi
+
+  if ! curl -sf "http://localhost:$port/health" >/dev/null; then
+    fail_test "Health endpoint returned non-success status"
+    docker stop "$container_name" 2>/dev/null || true
+    return 1
+  fi
+
+  log_success "Service is running with environment variable overrides"
+  docker stop "$container_name" 2>/dev/null || true
+  pass_test
+}
+
+test_toml_format() {
+  local variant="$1"
+  start_test "TOML config format (variant: $variant)"
+
+  local image
+  image="$(get_image_name "$variant")"
+  local port=$((PORT_BASE + TOTAL_TESTS))
+  local container_name="kreuzberg-config-test-toml-${variant}-$$"
+  local config_file="$TEST_TEMP_DIR/config.toml"
+
+  create_toml_config "$config_file" "$port"
+
+  if ! run_container "$container_name" "$image" "$port" \
+    --volume "$config_file:/etc/kreuzberg/kreuzberg.toml:ro"; then
+    fail_test "Failed to start container with TOML config"
+    return 1
+  fi
+
+  sleep 2
+
+  if ! wait_for_health "$port"; then
+    fail_test "Service failed to start with TOML config"
+    docker stop "$container_name" 2>/dev/null || true
+    return 1
+  fi
+
+  log_success "TOML config format works correctly"
+  docker stop "$container_name" 2>/dev/null || true
+  pass_test
+}
+
+test_yaml_format() {
+  local variant="$1"
+  start_test "YAML config format (variant: $variant)"
+
+  local image
+  image="$(get_image_name "$variant")"
+  local port=$((PORT_BASE + TOTAL_TESTS))
+  local container_name="kreuzberg-config-test-yaml-${variant}-$$"
+  local config_file="$TEST_TEMP_DIR/config.yaml"
+
+  create_yaml_config "$config_file" "$port"
+
+  if ! run_container "$container_name" "$image" "$port" \
+    --volume "$config_file:/etc/kreuzberg/kreuzberg.yaml:ro"; then
+    fail_test "Failed to start container with YAML config"
+    return 1
+  fi
+
+  sleep 2
+
+  if ! wait_for_health "$port"; then
+    fail_test "Service failed to start with YAML config"
+    docker stop "$container_name" 2>/dev/null || true
+    return 1
+  fi
+
+  log_success "YAML config format works correctly"
+  docker stop "$container_name" 2>/dev/null || true
+  pass_test
+}
+
+test_json_format() {
+  local variant="$1"
+  start_test "JSON config format (variant: $variant)"
+
+  local image
+  image="$(get_image_name "$variant")"
+  local port=$((PORT_BASE + TOTAL_TESTS))
+  local container_name="kreuzberg-config-test-json-${variant}-$$"
+  local config_file="$TEST_TEMP_DIR/config.json"
+
+  create_json_config "$config_file" "$port"
+
+  if ! run_container "$container_name" "$image" "$port" \
+    --volume "$config_file:/etc/kreuzberg/kreuzberg.json:ro"; then
+    fail_test "Failed to start container with JSON config"
+    return 1
+  fi
+
+  sleep 2
+
+  if ! wait_for_health "$port"; then
+    fail_test "Service failed to start with JSON config"
+    docker stop "$container_name" 2>/dev/null || true
+    return 1
+  fi
+
+  log_success "JSON config format works correctly"
+  docker stop "$container_name" 2>/dev/null || true
+  pass_test
+}
+
+test_readonly_mount() {
+  local variant="$1"
+  start_test "Read-only mount (variant: $variant)"
+
+  local image
+  image="$(get_image_name "$variant")"
+  local port=$((PORT_BASE + TOTAL_TESTS))
+  local container_name="kreuzberg-config-test-readonly-${variant}-$$"
+  local config_file="$TEST_TEMP_DIR/readonly-config.toml"
+
+  create_toml_config "$config_file" "$port"
+
+  # Run with read-only mount (explicitly :ro)
+  if ! run_container "$container_name" "$image" "$port" \
+    --volume "$config_file:/etc/kreuzberg/kreuzberg.toml:ro"; then
+    fail_test "Failed to start container with read-only mount"
+    return 1
+  fi
+
+  sleep 2
+
+  if ! check_container_running "$container_name"; then
+    fail_test "Container exited unexpectedly with read-only mount"
+    return 1
+  fi
+
+  if ! wait_for_health "$port"; then
+    fail_test "Service failed to start with read-only mount"
+    docker stop "$container_name" 2>/dev/null || true
+    return 1
+  fi
+
+  log_success "Read-only mount works correctly"
+  docker stop "$container_name" 2>/dev/null || true
+  pass_test
+}
+
+################################################################################
+# Test Execution
+################################################################################
+
+run_test_suite() {
+  local variant="$1"
+
+  log_header "Testing variant: $(get_image_name "$variant")"
+
+  # Check if image exists
+  if ! check_image_exists "$(get_image_name "$variant")"; then
+    log_warning "Skipping tests for variant: $variant (image not found)"
+    return
+  fi
+
+  TESTED_VARIANTS+=("$variant")
+
+  # Run all test cases
+  test_etc_kreuzberg_mount "$variant"
+  test_app_config_mount "$variant"
+  test_custom_path_with_flag "$variant"
+  test_env_var_overrides "$variant"
+  test_toml_format "$variant"
+  test_yaml_format "$variant"
+  test_json_format "$variant"
+  test_readonly_mount "$variant"
+}
+
+print_summary() {
+  log_header "Test Summary"
+
+  local pass_rate=0
+  if [ $TOTAL_TESTS -gt 0 ]; then
+    pass_rate=$((PASSED_TESTS * 100 / TOTAL_TESTS))
+  fi
+
+  echo -e "Total Tests:   ${CYAN}$TOTAL_TESTS${NC}"
+  echo -e "Passed Tests:  ${GREEN}$PASSED_TESTS${NC}"
+  echo -e "Failed Tests:  ${RED}$FAILED_TESTS${NC}"
+  echo -e "Pass Rate:     ${BLUE}${pass_rate}%${NC}"
+  echo ""
+
+  if [ $FAILED_TESTS -gt 0 ]; then
+    echo -e "${RED}Failed Tests:${NC}"
+    for test_name in "${FAILED_TEST_NAMES[@]}"; do
+      echo "  - $test_name"
+    done
+    echo ""
+  fi
+
+  if [ ${#TESTED_VARIANTS[@]} -gt 0 ]; then
+    echo -e "${CYAN}Tested Variants:${NC}"
+    for variant in "${TESTED_VARIANTS[@]}"; do
+      echo "  - $(get_image_name "$variant")"
+    done
+    echo ""
+  fi
+}
+
+################################################################################
+# Main Entry Point
+################################################################################
+
+main() {
+  # Parse command line arguments
+  while [[ $# -gt 0 ]]; do
+    case $1 in
+    --variant)
+      TEST_VARIANT="$2"
+      shift 2
+      ;;
+    --image)
+      IMAGE_NAME="$2"
+      shift 2
+      ;;
+    --verbose)
+      VERBOSE=true
+      shift
+      ;;
+    --keep-containers)
+      KEEP_CONTAINERS=true
+      shift
+      ;;
+    --help)
+      echo "Usage: $0 [OPTIONS]"
+      echo ""
+      echo "Options:"
+      echo "  --variant VARIANT       Test specific variant (core, full, or all) [default: all]"
+      echo "  --image IMAGE          Use pre-built image instead of building [default: build from Dockerfile]"
+      echo "  --verbose              Enable verbose output"
+      echo "  --keep-containers      Don't cleanup containers after tests"
+      echo "  --help                 Show this help message"
+      exit 0
+      ;;
+    *)
+      log_error "Unknown option: $1"
+      exit 1
+      ;;
+    esac
+  done
+
+  log_header "Docker Configuration Volume Mount Test Suite"
+
+  log_info "Configuration:"
+  log_info "  Variant:         $TEST_VARIANT"
+  log_info "  Verbose:         $VERBOSE"
+  log_info "  Keep Containers: $KEEP_CONTAINERS"
+  log_info "  Port Range:      $PORT_BASE-$((PORT_BASE + 99))"
+  log_info ""
+
+  # Verify Docker is available
+  verify_docker_available
+
+  # Setup test environment
+  setup_test_environment
+
+  # Run tests based on variant selection
+  case "$TEST_VARIANT" in
+  core)
+    run_test_suite "core"
+    ;;
+  full)
+    run_test_suite "full"
+    ;;
+  all)
+    run_test_suite "core"
+    run_test_suite "full"
+    ;;
+  *)
+    log_error "Invalid variant: $TEST_VARIANT (must be 'core', 'full', or 'all')"
+    exit 1
+    ;;
+  esac
+
+  # Print summary
+  print_summary
+
+  # Exit with appropriate code
+  if [ $FAILED_TESTS -eq 0 ]; then
+    log_success "All tests passed!"
+    exit 0
+  else
+    log_error "Some tests failed"
+    exit 1
+  fi
+}
+
+main "$@"