Nomad changes
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s

This commit is contained in:
Henrik Jess Nielsen
2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions

View File

@@ -0,0 +1,16 @@
#!/usr/bin/env bash
# Verify that the benchmark harness directory exists in the checked-out tree.
#
# Exits 1 with a GitHub Actions `::error::` annotation on stderr when
# tools/benchmark-harness is missing under the repository root.
#
# Environment:
#   GITHUB_REF  (optional) ref reported in the error message.
set -euo pipefail

# The repository root is two directories above this script.
REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
source "${REPO_ROOT}/scripts/lib/common.sh"
# validate_repo_root comes from common.sh (not visible here).
validate_repo_root "$REPO_ROOT" || exit 1

harness_dir="$REPO_ROOT/tools/benchmark-harness"
if [ ! -d "$harness_dir" ]; then
  # GITHUB_REF is only defined inside GitHub Actions. Default it so that
  # `set -u` does not replace this diagnostic with "unbound variable".
  echo "::error::tools/benchmark-harness not found on branch ${GITHUB_REF:-<unknown>}." >&2
  exit 1
fi
echo "✓ Benchmark harness directory verified at: $harness_dir"

View File

@@ -0,0 +1,26 @@
#!/usr/bin/env bash
# Re-apply the executable bit on the benchmark binaries.
#
# Environment:
#   BINARY_PATH  (optional) harness binary to fix; defaults to
#                <repo>/target/release/benchmark-harness. Must exist.
set -euo pipefail

REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
source "${REPO_ROOT}/scripts/lib/common.sh"
validate_repo_root "$REPO_ROOT" || exit 1

# Mark one file executable and report it on stdout.
make_executable() {
  chmod +x "$1"
  echo "✓ Restored executable permissions on: $1"
}

BINARY_PATH="${BINARY_PATH:-$REPO_ROOT/target/release/benchmark-harness}"
[ -f "$BINARY_PATH" ] || {
  echo "::error::Binary not found at $BINARY_PATH" >&2
  exit 1
}
make_executable "$BINARY_PATH"

# The kreuzberg CLI is optional: fix it only when it is present
# (used by all kreuzberg adapter pipelines).
CLI_BINARY="$REPO_ROOT/target/release/kreuzberg"
if [ -f "$CLI_BINARY" ]; then
  make_executable "$CLI_BINARY"
fi

View File

@@ -0,0 +1,57 @@
#!/usr/bin/env bash
# Run the benchmark harness for a single framework/mode combination.
#
# Required environment: FRAMEWORK, MODE
# Optional environment (default):
#   ITERATIONS (3), TIMEOUT (900),
#   FIXTURES_DIR (tools/benchmark-harness/fixtures),
#   HARNESS_PATH (./target/release/benchmark-harness),
#   MEASURE_QUALITY (false), OCR_ENABLED (false),
#   OUTPUT_FORMAT (markdown), SHARD (unset -> no --shard flag)
#
# Results go to benchmark-results/<FRAMEWORK>-<OUTPUT_FORMAT>-<MODE>, which is
# wiped before each run.
set -euo pipefail

: "${FRAMEWORK:=}" "${MODE:=}"
: "${ITERATIONS:=3}" "${TIMEOUT:=900}"
: "${FIXTURES_DIR:=tools/benchmark-harness/fixtures}"
: "${HARNESS_PATH:=./target/release/benchmark-harness}"
: "${MEASURE_QUALITY:=false}" "${OCR_ENABLED:=false}"
: "${OUTPUT_FORMAT:=markdown}"

if [[ -z "$FRAMEWORK" || -z "$MODE" ]]; then
  echo "::error::FRAMEWORK and MODE environment variables are required" >&2
  exit 1
fi

REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
source "${REPO_ROOT}/scripts/lib/common.sh"
source "${REPO_ROOT}/scripts/lib/library-paths.sh"
validate_repo_root "$REPO_ROOT" || exit 1
# Helpers from library-paths.sh (not visible here).
setup_go_paths "$REPO_ROOT"
setup_onnx_paths

OUTPUT_DIR="benchmark-results/${FRAMEWORK}-${OUTPUT_FORMAT}-${MODE}"
rm -rf "${OUTPUT_DIR}"

# single-file mode runs one extraction at a time; every other mode uses four.
MAX_CONCURRENT=4
if [[ "$MODE" == "single-file" ]]; then
  MAX_CONCURRENT=1
fi

: "${SHARD:=}"

# Optional flags, collected in an array so empty values never become arguments.
EXTRA_ARGS=()
[[ "$MEASURE_QUALITY" != "true" ]] || EXTRA_ARGS+=("--measure-quality")
[[ "$OCR_ENABLED" != "true" ]] || EXTRA_ARGS+=("--ocr")
[[ -z "$SHARD" ]] || EXTRA_ARGS+=("--shard" "${SHARD}")

harness_args=(
  run
  --fixtures "${FIXTURES_DIR}"
  --frameworks "${FRAMEWORK}"
  --output "${OUTPUT_DIR}"
  --iterations "${ITERATIONS}"
  --timeout "${TIMEOUT}"
  --mode "${MODE}"
  --max-concurrent "${MAX_CONCURRENT}"
  --output-format "${OUTPUT_FORMAT}"
)

# The ${arr[@]+...} form keeps an empty EXTRA_ARGS safe under `set -u` on
# older bash releases.
BENCHMARK_DEBUG=1 "${HARNESS_PATH}" "${harness_args[@]}" \
  "${EXTRA_ARGS[@]+"${EXTRA_ARGS[@]}"}"

242
scripts/ci/README.md Normal file
View File

@@ -0,0 +1,242 @@
# CI Workflow Scripts
This directory contains scripts extracted from the GitHub Actions CI workflows, organized by workflow type.
## Overview
- **Total Scripts**: 41 (27 Bash + 14 PowerShell)
- **Documentation**: See `SCRIPT_MAPPING.md` for detailed workflow-to-script mapping
- **All Scripts**: Production-ready with proper error handling and documentation
## Directory Structure
```text
scripts/ci/
├── README.md ← This file
├── SCRIPT_MAPPING.md ← Detailed workflow-to-script mapping guide
├── docker/ ← Docker image build and test scripts
├── go/ ← Go bindings scripts
├── java/ ← Java bindings scripts
├── node/ ← Node/TypeScript NAPI scripts
├── python/ ← Python wheel build scripts
├── ruby/ ← Ruby gem build scripts
├── rust/ ← Rust core and CLI scripts
├── csharp/ ← C# bindings scripts
└── validate/ ← Validation and linting scripts
```
## Quick Start
### Running a Script
**Bash scripts:**
```bash
./scripts/ci/docker/build-image.sh core
./scripts/ci/python/run-tests.sh true
```
**PowerShell scripts:**
```powershell
& ./scripts/ci/go/build-ffi.ps1
& ./scripts/ci/rust/package-cli-windows.ps1 -Target "x86_64-pc-windows-msvc"
```
### Sourcing Scripts
Library path setup scripts define shell functions, so they must be sourced (not executed) before those functions are called:
```bash
source ./scripts/lib/library-paths.sh
setup_all_library_paths
./scripts/ci/python/run-tests.sh true
```
## Scripts by Workflow
### Docker (`docker/`)
- `free-disk-space.sh` - Clean up CI disk space
- `build-image.sh` - Build Docker image variant
- `check-image-size.sh` - Validate image size constraints
- `save-image.sh` - Save Docker image as tar.gz artifact
- `collect-logs.sh` - Collect container logs on failure
- `cleanup.sh` - Clean up Docker resources
- `summary.sh` - Print test summary
### Go (`go/`)
- `build-ffi.sh` - Build FFI library (Unix)
- `build-ffi.ps1` - Build FFI library (Windows)
- `build-bindings.sh` - Build Go bindings with CGO (Unix)
- `build-bindings.ps1` - Build Go bindings with CGO (Windows)
- `reorganize-libraries.ps1` - Reorganize FFI libraries for Windows
- `run-tests.sh` - Run Go tests with library paths
### Java (`java/`)
- `build-java.sh` - Build Java bindings with Maven
- `run-tests.sh` - Run Java tests with Maven
### Node/TypeScript (`node/`)
- `build-napi.sh` - Build NAPI bindings with artifact collection
- `unpack-bindings.sh` - Unpack and install bindings from tarball
### Python (`python/`)
- `clean-artifacts.sh` - Clean previous wheel artifacts
- `smoke-test-wheel.sh` - Test wheel installation
- `install-wheel.sh` - Install platform-specific wheel
- `run-tests.sh` - Run tests with optional coverage
### Ruby (`ruby/`)
- `install-ruby-deps.sh` - Install bundle dependencies (Unix)
- `install-ruby-deps.ps1` - Install bundle dependencies (Windows)
- `vendor-kreuzberg-core.py` - Vendor core crate for packaging
- `configure-bindgen-windows.ps1` - Configure bindgen headers (Windows)
- `configure-tesseract-windows.ps1` - Configure Tesseract (Windows)
- `build-gem.sh` - Build Ruby gem
- `install-gem.sh` - Install built gem
- `compile-extension.sh` - Compile native extension
- `run-tests.sh` - Run RSpec tests
### Rust (`rust/`)
- `configure-bindgen-windows.ps1` - Configure bindgen headers (Windows)
- `run-unit-tests.sh` - Run Rust unit tests
- `package-cli-unix.sh` - Package CLI as tar.gz (Unix)
- `package-cli-windows.ps1` - Package CLI as zip (Windows)
- `test-cli-unix.sh` - Test CLI binary (Unix)
- `test-cli-windows.ps1` - Test CLI binary (Windows)
### C# (`csharp/`)
- `build-csharp.sh` - Build C# bindings with dotnet
- `run-tests.sh` - Run C# tests with dotnet
### Validate (`validate/`)
- `run-lint.sh` - Run all linting and validation checks via Task
## Features
### Error Handling
- All Bash scripts use `set -euo pipefail`
- All PowerShell scripts use `Set-StrictMode` and error action preferences
- Proper exit codes and error messages
- Usage information for incorrect arguments
### Documentation
- Every script has a descriptive header
- Purpose and usage clearly stated
- Which CI workflow step uses it
- Argument documentation
### Platform Support
- Windows-specific operations via PowerShell (.ps1)
- Unix operations via Bash (.sh)
- Cross-platform scripts detect the OS and adjust their behavior
- Library path setup scripts handle Windows/Linux/macOS
### Reusability
- `library-paths.sh` (`scripts/lib/`) - Shared by all workflows for native library configuration
- `configure-bindgen-windows.ps1` used by Ruby and Rust
- Common patterns consolidated into single scripts
## Detailed Documentation
For comprehensive workflow-to-script mapping and usage examples, see `SCRIPT_MAPPING.md`.
## Usage in Workflows
### Example: ci-docker.yaml
**Before (inline commands):**
```yaml
- name: Free up disk space
run: |
echo "=== Initial disk space ==="
df -h /
echo "=== Removing unnecessary packages ==="
sudo rm -rf /usr/share/dotnet
# ... 30+ lines of commands ...
```
**After (using script):**
```yaml
- name: Free up disk space
run: ./scripts/ci/docker/free-disk-space.sh
```
### Example: ci-python.yaml
**Before (inline commands):**
```yaml
- name: Run Python tests
run: |
cd packages/python
if [ "${{ matrix.coverage }}" = "true" ]; then
uv run pytest -vv --cov=kreuzberg --cov-report=lcov:coverage.lcov ...
else
uv run pytest -vv --reruns 1 --reruns-delay 1
fi
```
**After (using script):**
```yaml
- name: Run Python tests
run: ./scripts/ci/python/run-tests.sh ${{ matrix.coverage }}
```
## Testing Scripts Locally
You can test scripts locally before running in CI:
```bash
# Test Docker scripts
./scripts/ci/docker/free-disk-space.sh
# Test Python scripts
./scripts/ci/python/clean-artifacts.sh
./scripts/ci/python/run-tests.sh false
# Test Rust scripts
./scripts/ci/rust/run-unit-tests.sh
```
## Shell Compatibility
- **Bash scripts**: Compatible with bash 3.2+ (macOS) and bash 4.0+ (Linux)
- **PowerShell scripts**: Compatible with PowerShell 5.1+ (Windows) and PowerShell Core 7+ (cross-platform)
## Contributing
When adding new CI steps or modifying existing ones:
1. Extract the inline script into a separate file in the appropriate directory
2. Add proper error handling (`set -euo pipefail` for bash)
3. Include descriptive header comments
4. Update `SCRIPT_MAPPING.md` with the new mapping
5. Test the script locally before committing
## Maintenance
Scripts should be reviewed and updated when:
- Updating CI workflow logic
- Changing build tools or versions
- Improving error handling
- Adding new platform support
See each script's header for detailed documentation on its purpose and usage.

View File

@@ -0,0 +1,90 @@
#!/usr/bin/env bash
# Download (or reuse) a Microsoft ONNX Runtime release for Linux and export the
# environment later workflow steps need to link against it.
#
# Usage: <script> <ort-version> [dest-dir] [arch-id] [strategy]
#   ort-version  release version without the leading "v" (required)
#   dest-dir     workspace-relative directory that receives the shared
#                libraries (default: crates/kreuzberg-node)
#   arch-id      x64 | arm64 (default: derived from `uname -m`)
#   strategy     "bundled" exports only library search paths; any other value
#                exports the ORT_* variables and RUSTFLAGS (default: system)
#
# Requires RUNNER_TEMP, GITHUB_WORKSPACE and GITHUB_ENV to be set.
set -euo pipefail

ort_version="${1:?ort-version required}"
dest_dir="${2:-crates/kreuzberg-node}"
arch_id="${3:-}"
strategy="${4:-system}"
extract_dir="$RUNNER_TEMP/onnxruntime"

if [ -z "$arch_id" ]; then
  case "$(uname -m)" in
  x86_64 | amd64) arch_id="x64" ;;
  arm64 | aarch64) arch_id="arm64" ;;
  *)
    echo "Unsupported Linux architecture: $(uname -m)" >&2
    exit 1
    ;;
  esac
fi

case "$arch_id" in
x64)
  ort_dir_name="onnxruntime-linux-x64-${ort_version}"
  archive="onnxruntime-linux-x64-${ort_version}.tgz"
  ;;
arm64)
  ort_dir_name="onnxruntime-linux-aarch64-${ort_version}"
  archive="onnxruntime-linux-aarch64-${ort_version}.tgz"
  ;;
*)
  echo "Unsupported Linux arch-id: $arch_id" >&2
  exit 1
  ;;
esac

# An existing extraction directory counts as a cache hit.
if [ ! -d "$extract_dir/$ort_dir_name" ]; then
  echo "Cache miss: Downloading ONNX Runtime ${ort_version}"
  curl -fsSL --retry 5 --retry-delay 5 --retry-all-errors -o "$RUNNER_TEMP/$archive" "https://github.com/microsoft/onnxruntime/releases/download/v${ort_version}/$archive"
  mkdir -p "$extract_dir"
  tar -xzf "$RUNNER_TEMP/$archive" -C "$extract_dir"
else
  echo "Cache hit: Using cached ONNX Runtime ${ort_version}"
fi

ort_root="$extract_dir/$ort_dir_name"
if [ ! -d "$ort_root/lib" ]; then
  echo "ERROR: ONNX Runtime lib directory missing at $ort_root/lib" >&2
  echo "Available directories:" >&2
  ls -la "$extract_dir" >&2 || true
  exit 1
fi
if ! ls "$ort_root/lib"/*.so* 1>/dev/null 2>&1; then
  echo "ERROR: No ONNX Runtime libraries found in $ort_root/lib" >&2
  echo "Directory contents:" >&2
  ls -la "$ort_root/lib" >&2 || true
  exit 1
fi

dest="$GITHUB_WORKSPACE/$dest_dir"
mkdir -p "$dest"
cp -f "$ort_root/lib/"*.so* "$dest/"

if [ -n "${RUSTFLAGS:-}" ]; then
  rustflags="$RUSTFLAGS -L $ort_root/lib"
else
  rustflags="-L $ort_root/lib"
fi

# Append the previous value only when it is non-empty. A dangling ":" would
# add an empty search-path element, which means the current directory.
ld_library_path="$ort_root/lib:$dest${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
library_path="$ort_root/lib:$dest${LIBRARY_PATH:+:$LIBRARY_PATH}"

if [ "$strategy" = "bundled" ]; then
  echo "Using bundled ORT strategy — letting ort-sys download-binaries handle static linking"
  {
    echo "LD_LIBRARY_PATH=$ld_library_path"
    echo "LIBRARY_PATH=$library_path"
  } >>"$GITHUB_ENV"
else
  # Prefer the canonical library name. A bare "libonnxruntime*" match could
  # pick a provider plugin (e.g. libonnxruntime_providers_shared.so), which
  # is not the runtime itself.
  ort_lib="$ort_root/lib/libonnxruntime.so"
  if [ ! -e "$ort_lib" ]; then
    ort_lib=$(find "$ort_root/lib" -name "libonnxruntime.so*" -print -quit)
  fi
  if [ -z "$ort_lib" ]; then
    echo "ERROR: libonnxruntime shared library not found in $ort_root/lib" >&2
    ls -la "$ort_root/lib" >&2 || true
    exit 1
  fi
  {
    echo "ORT_LIB_LOCATION=$ort_root/lib"
    echo "ORT_PREFER_DYNAMIC_LINK=1"
    echo "ORT_SKIP_DOWNLOAD=1"
    echo "ORT_STRATEGY=system"
    echo "ORT_DYLIB_PATH=$ort_root/lib/${ort_lib##*/}"
    echo "LD_LIBRARY_PATH=$ld_library_path"
    echo "LIBRARY_PATH=$library_path"
    echo "RUSTFLAGS=$rustflags"
  } >>"$GITHUB_ENV"
fi

View File

@@ -0,0 +1,86 @@
#!/usr/bin/env bash
# Download (or reuse) a Microsoft ONNX Runtime release for macOS and export the
# environment later workflow steps need to link against it.
#
# Usage: <script> <ort-version> [dest-dir] [arch-id] [strategy]
#   ort-version  release version without the leading "v" (required)
#   dest-dir     workspace-relative directory that receives the dylibs
#                (default: crates/kreuzberg-node)
#   arch-id      x64 | arm64 (default: arm64 when `uname -m` is arm64, else x64)
#   strategy     "bundled" exports only library search paths; any other value
#                exports the ORT_* variables and RUSTFLAGS (default: system)
#
# Requires RUNNER_TEMP, GITHUB_WORKSPACE and GITHUB_ENV to be set.
set -euo pipefail

ort_version="${1:?ort-version required}"
dest_dir="${2:-crates/kreuzberg-node}"
arch_id="${3:-}"
strategy="${4:-system}"
extract_dir="$RUNNER_TEMP/onnxruntime"

if [ -z "$arch_id" ]; then
  arch="$(uname -m)"
  if [ "$arch" = "arm64" ]; then
    arch_id="arm64"
  else
    arch_id="x64"
  fi
fi

# Map the workflow arch id onto the name used in the release archive.
case "$arch_id" in
arm64) ort_arch="arm64" ;;
x64) ort_arch="x86_64" ;;
*)
  echo "Unsupported macOS arch-id: $arch_id" >&2
  exit 1
  ;;
esac
echo "Using macOS ONNX Runtime arch: $ort_arch"

ort_root="$extract_dir/onnxruntime-osx-${ort_arch}-${ort_version}"

# An existing extraction directory counts as a cache hit.
if [ ! -d "$ort_root" ]; then
  echo "Cache miss: Downloading ONNX Runtime ${ort_version} for macOS ${ort_arch}"
  archive="onnxruntime-osx-${ort_arch}-${ort_version}.tgz"
  curl -fsSL --retry 5 --retry-delay 5 --retry-all-errors -o "$RUNNER_TEMP/$archive" "https://github.com/microsoft/onnxruntime/releases/download/v${ort_version}/$archive"
  mkdir -p "$extract_dir"
  tar -xzf "$RUNNER_TEMP/$archive" -C "$extract_dir"
else
  echo "Cache hit: Using cached ONNX Runtime ${ort_version}"
fi

if [ ! -d "$ort_root/lib" ]; then
  echo "ERROR: ONNX Runtime lib directory missing at $ort_root/lib" >&2
  echo "Available directories:" >&2
  ls -la "$extract_dir" >&2 || true
  exit 1
fi
if ! ls "$ort_root/lib"/libonnxruntime*.dylib 1>/dev/null 2>&1; then
  echo "ERROR: No ONNX Runtime libraries found in $ort_root/lib" >&2
  echo "Directory contents:" >&2
  ls -la "$ort_root/lib" >&2 || true
  exit 1
fi

dest="$GITHUB_WORKSPACE/$dest_dir"
mkdir -p "$dest"
cp -f "$ort_root/lib/"libonnxruntime*.dylib "$dest/"

if [ -n "${RUSTFLAGS:-}" ]; then
  rustflags="$RUSTFLAGS -L $ort_root/lib"
else
  rustflags="-L $ort_root/lib"
fi

# Append previous values only when non-empty so no dangling ":" (an empty
# search-path element) is exported.
dyld_library_path="$ort_root/lib:$dest${DYLD_LIBRARY_PATH:+:$DYLD_LIBRARY_PATH}"
dyld_fallback_path="$ort_root/lib:$dest${DYLD_FALLBACK_LIBRARY_PATH:+:$DYLD_FALLBACK_LIBRARY_PATH}"
library_path="$ort_root/lib:$dest${LIBRARY_PATH:+:$LIBRARY_PATH}"

if [ "$strategy" = "bundled" ]; then
  echo "Using bundled ORT strategy — letting ort-sys download-binaries handle static linking"
  {
    echo "DYLD_LIBRARY_PATH=$dyld_library_path"
    echo "DYLD_FALLBACK_LIBRARY_PATH=$dyld_fallback_path"
    echo "LIBRARY_PATH=$library_path"
  } >>"$GITHUB_ENV"
else
  # Prefer the unversioned name so the choice does not depend on directory
  # order. The `ls` check above guarantees the fallback finds something.
  ort_lib="$ort_root/lib/libonnxruntime.dylib"
  if [ ! -e "$ort_lib" ]; then
    ort_lib=$(find "$ort_root/lib" -name "libonnxruntime*.dylib" -print -quit)
  fi
  {
    echo "ORT_LIB_LOCATION=$ort_root/lib"
    echo "ORT_PREFER_DYNAMIC_LINK=1"
    echo "ORT_SKIP_DOWNLOAD=1"
    echo "ORT_STRATEGY=system"
    echo "ORT_DYLIB_PATH=$ort_root/lib/${ort_lib##*/}"
    echo "DYLD_LIBRARY_PATH=$dyld_library_path"
    echo "DYLD_FALLBACK_LIBRARY_PATH=$dyld_fallback_path"
    echo "LIBRARY_PATH=$library_path"
    echo "RUSTFLAGS=$rustflags"
  } >>"$GITHUB_ENV"
fi

View File

@@ -0,0 +1,100 @@
<#
.SYNOPSIS
Download (or reuse) a Microsoft ONNX Runtime release for Windows and export the
environment later workflow steps need to link against it.

.DESCRIPTION
Usage: windows.ps1 <ortVersion> [destDir] [archId] [strategy]
  ortVersion  release version without the leading "v" (required)
  destDir     workspace-relative directory that receives the libraries
              (default: crates/kreuzberg-node)
  archId      "arm64" selects arm64; any other value selects x64
              (default: $env:RUNNER_ARCH)
  strategy    "bundled" or "system" (default: system)

Requires $env:TEMP, $env:GITHUB_WORKSPACE and $env:GITHUB_ENV.
#>
$OrtVersion = $args[0]
if ([string]::IsNullOrWhiteSpace($OrtVersion)) { throw "Usage: windows.ps1 <ortVersion> [destDir] [archId] [strategy]" }
$DestDir = if ($args.Count -ge 2 -and -not [string]::IsNullOrWhiteSpace($args[1])) { $args[1] } else { "crates/kreuzberg-node" }
$ArchId = if ($args.Count -ge 3) { $args[2] } else { "" }
$Strategy = if ($args.Count -ge 4 -and -not [string]::IsNullOrWhiteSpace($args[3])) { $args[3] } else { "system" }
$ExtractRoot = Join-Path $env:TEMP "onnxruntime"

if ([string]::IsNullOrWhiteSpace($ArchId)) {
    $ArchId = $env:RUNNER_ARCH
}
# RUNNER_ARCH may be unset outside GitHub Actions. Coerce to a string so the
# method call below cannot hit $null; the empty string falls back to x64.
$ArchId = ([string]$ArchId).ToLowerInvariant()
if ($ArchId -eq "arm64") { $ArchId = "arm64" } else { $ArchId = "x64" }

$OrtRoot = Join-Path $ExtractRoot "onnxruntime-win-$ArchId-$OrtVersion"
$OrtBin = Join-Path $OrtRoot 'bin'
$OrtLib = Join-Path $OrtRoot 'lib'

# An existing extraction directory counts as a cache hit.
if (-Not (Test-Path $OrtRoot)) {
    Write-Host "Cache miss: Downloading ONNX Runtime $OrtVersion"
    $Archive = "onnxruntime-win-$ArchId-$OrtVersion.zip"
    $DownloadPath = Join-Path $env:TEMP $Archive
    # NOTE(review): the retry parameters are presumably unavailable in Windows
    # PowerShell 5.1 - confirm this script only runs under pwsh.
    Invoke-WebRequest -Uri "https://github.com/microsoft/onnxruntime/releases/download/v$OrtVersion/$Archive" -OutFile $DownloadPath -UseBasicParsing -MaximumRetryCount 5 -RetryIntervalSec 5
    New-Item -ItemType Directory -Path $ExtractRoot -Force | Out-Null
    Expand-Archive -Path $DownloadPath -DestinationPath $ExtractRoot -Force
} else {
    Write-Host "Cache hit: Using cached ONNX Runtime $OrtVersion"
}

if (!(Test-Path $OrtLib)) {
    Write-Error "ERROR: ONNX Runtime lib directory missing at $OrtLib"
    Get-ChildItem -Path $ExtractRoot -Recurse | Write-Host
    exit 1
}

$LibFiles = @(Get-ChildItem -Path $OrtLib -Filter "*.lib" -ErrorAction SilentlyContinue)
if ($LibFiles.Count -eq 0) {
    Write-Error "ERROR: No ONNX Runtime library files found in $OrtLib"
    Get-ChildItem -Path $OrtLib | Write-Host
    exit 1
}

# Locate the directories that hold runtime DLLs: lib/ and bin/ first, then a
# recursive search for onnxruntime.dll, then any DLL at all.
$DllDirs = @()
foreach ($Candidate in @($OrtLib, $OrtBin)) {
    if (Test-Path $Candidate) {
        $CandidateDlls = @(Get-ChildItem -Path $Candidate -Filter "*.dll" -File -ErrorAction SilentlyContinue)
        if ($CandidateDlls.Count -gt 0) {
            $DllDirs += $Candidate
        }
    }
}
if ($DllDirs.Count -eq 0) {
    $OrtDll = Get-ChildItem -Path $OrtRoot -Recurse -Filter "onnxruntime.dll" -File -ErrorAction SilentlyContinue | Select-Object -First 1
    if ($OrtDll) { $DllDirs += $OrtDll.DirectoryName }
}
if ($DllDirs.Count -eq 0) {
    $AnyDll = Get-ChildItem -Path $OrtRoot -Recurse -Filter "*.dll" -File -ErrorAction SilentlyContinue | Select-Object -First 1
    if ($AnyDll) { $DllDirs += $AnyDll.DirectoryName }
}
# Re-wrap in @() so the result stays an array when the pipeline yields zero
# or one item and .Count behaves predictably.
$DllDirs = @($DllDirs | Select-Object -Unique)
if ($DllDirs.Count -eq 0) {
    Write-Error "ERROR: No ONNX Runtime runtime DLLs found under $OrtRoot"
    Get-ChildItem -Path $OrtRoot -Recurse | Write-Host
    exit 1
}

$Dest = Join-Path $env:GITHUB_WORKSPACE $DestDir
New-Item -ItemType Directory -Path $Dest -Force | Out-Null
Copy-Item -Path (Join-Path $OrtLib '*') -Destination $Dest -Force
foreach ($Dir in $DllDirs) {
    Copy-Item -Path (Join-Path $Dir '*.dll') -Destination $Dest -Force
}

$RustFlags = if ($env:RUSTFLAGS) { "$env:RUSTFLAGS -L $OrtLib" } else { "-L $OrtLib" }

if ($Strategy -eq "bundled") {
    # ort-sys has no prebuilt static binaries for x86_64-pc-windows-gnu (MSYS2/MinGW).
    # Use the pre-downloaded Microsoft ORT with dynamic linking for Windows GNU targets.
    Write-Host "Using bundled ORT strategy (Windows) - dynamic linking against pre-downloaded ORT (no static binaries for windows-gnu)"
    @(
        "ORT_LIB_LOCATION=$OrtLib"
        "ORT_PREFER_DYNAMIC_LINK=1"
        "RUSTFLAGS=$RustFlags"
        "LIB=$OrtLib;$env:LIB"
        "LIBRARY_PATH=$OrtLib;$env:LIBRARY_PATH"
        "PATH=$Dest;$env:PATH"
    ) | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append
} else {
    @(
        "ORT_LIB_LOCATION=$OrtLib"
        "ORT_PREFER_DYNAMIC_LINK=1"
        "ORT_SKIP_DOWNLOAD=1"
        "ORT_STRATEGY=system"
        "ORT_DYLIB_PATH=$Dest\onnxruntime.dll"
        "RUSTFLAGS=$RustFlags"
        "LIB=$OrtLib;$env:LIB"
        "LIBRARY_PATH=$OrtLib;$env:LIBRARY_PATH"
        "PATH=$Dest;$env:PATH"
    ) | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append
}

View File

@@ -0,0 +1,48 @@
#!/usr/bin/env bash
# Fetch a prebuilt ONNX Runtime archive for a macOS Rust target and, when it is
# available, export the ORT_* variables that point the build at it.
#
# Usage: <script> <target>
#   target  aarch64-apple-darwin | x86_64-apple-darwin
#
# Requires GITHUB_WORKSPACE and GITHUB_ENV. A failed download is not fatal:
# the script only prints a warning and exports nothing.
set -euo pipefail

target="${1:?target required}"

case "$target" in
aarch64-apple-darwin)
  ort_url="https://cdn.pyke.io/0/pyke:ort-rs/ms@1.24.1/aarch64-apple-darwin.tgz"
  ;;
x86_64-apple-darwin)
  ort_url="https://cdn.pyke.io/0/pyke:ort-rs/ms@1.24.1/x86_64-apple-darwin.tgz"
  ;;
*)
  echo "setup-prebuilt-onnx does not support target $target" >&2
  exit 1
  ;;
esac

ort_dir="${GITHUB_WORKSPACE}/target/onnxruntime/${target}"
ort_root="${ort_dir}/onnxruntime"
ort_lib="${ort_root}/lib"

# Append the ORT_* settings for later workflow steps to GITHUB_ENV.
write_env() {
  {
    echo "ORT_STRATEGY=system"
    echo "ORT_LIB_LOCATION=${ort_lib}"
    echo "ORT_SKIP_DOWNLOAD=1"
    echo "ORT_PREFER_DYNAMIC_LINK=1"
  } >>"${GITHUB_ENV}"
}

# libonnxruntime.a is the marker for a complete earlier extraction.
if [ ! -f "${ort_lib}/libonnxruntime.a" ]; then
  rm -rf "${ort_dir}"
  mkdir -p "${ort_lib}"

  # Use a unique temp file instead of a fixed /tmp name, and remove it on
  # every exit path, including a failed or partial download.
  archive="$(mktemp "${RUNNER_TEMP:-${TMPDIR:-/tmp}}/ort-prebuilt.XXXXXX")"
  trap 'rm -f "$archive"' EXIT

  echo "Attempting to download prebuilt ONNX Runtime for ${target}..." >&2
  if curl -fsSL --max-time 30 -o "$archive" "${ort_url}" 2>/dev/null; then
    tar -xz -C "${ort_lib}" -f "$archive"
    write_env
  else
    echo "Warning: Prebuilt ONNX Runtime not available for ${target}" >&2
    echo "Will download and build ONNX Runtime during compilation" >&2
  fi
else
  echo "Using existing ONNX Runtime at ${ort_lib}" >&2
  write_env
fi

View File

@@ -0,0 +1,29 @@
#!/usr/bin/env bash
# Run a build command, retrying once without sccache if the first attempt
# failed with an sccache-looking error.
#
# Usage: build-with-sccache-fallback.sh <cargo command...>
#
# Exit status: 0 when either attempt succeeds, 1 when the build fails,
# 2 when no command was given.
set -euo pipefail

# Without this guard an empty "$@" expands to an empty command, which
# "succeeds" and would be reported as a successful build.
if [ "$#" -eq 0 ]; then
  echo "Usage: ${0##*/} <cargo command...>" >&2
  exit 2
fi

log_file=$(mktemp)
trap 'rm -f "$log_file"' EXIT

echo "Building with sccache (fallback on errors)..."

# First attempt. Output is mirrored to a log so it can be scanned afterwards;
# `pipefail` makes the pipeline report the build's status, not tee's.
if "$@" 2>&1 | tee "$log_file"; then
  echo "✓ Build succeeded with sccache"
  exit 0
fi

# Retry only when the failure looks cache-related.
if grep -Eq "sccache.*(error|failed)|cache storage failed|dns error|connection (refused|timed out)" "$log_file"; then
  echo "⚠️ sccache failure detected, retrying without cache..."
  export RUSTC_WRAPPER=""
  export SCCACHE_GHA_ENABLED=false
  if "$@"; then
    echo "✓ Build succeeded without sccache (fallback)"
    exit 0
  fi
fi

echo "✗ Build failed"
exit 1

View File

@@ -0,0 +1,7 @@
#!/usr/bin/env bash
# Delete the per-label Tesseract and XDG cache directories, relative to the
# current working directory.
#
# Usage: <script> <label>
set -euo pipefail

label="${1:?label required}"

for cache_root in .tesseract-cache .xdg-cache; do
  rm -rf "${cache_root}/${label}"
done

View File

@@ -0,0 +1,5 @@
#!/usr/bin/env bash
# Delete the Tesseract cache kept inside a Rust target directory, relative to
# the current working directory.
#
# Usage: <script> <rust-target>
set -euo pipefail

rust_target="${1:?rust target required}"
cache_path="target/${rust_target}/kreuzberg-tesseract-cache"

rm -rf "$cache_path"

View File

@@ -0,0 +1,44 @@
#!/usr/bin/env bash
# Publish Tesseract cache settings to later workflow steps.
#
# Usage: <script> <label> <enable-cache>
#   label         cache namespace; becomes a sub-directory name
#   enable-cache  "true" enables caching; any other value disables it
#
# Writes TESSERACT_RS_CACHE_DIR (and XDG_CACHE_HOME when enabled) to GITHUB_ENV,
# and cache-dir / cache-enabled / docker-options to GITHUB_OUTPUT.
set -euo pipefail

label="${1:?label required}"
enable_cache="${2:?enable-cache required (true/false)}"

# Print the Debian multiarch triplet, or nothing when it cannot be determined.
detect_multiarch() {
  local triplet=""
  if command -v dpkg-architecture >/dev/null 2>&1; then
    triplet="$(dpkg-architecture -qDEB_HOST_MULTIARCH 2>/dev/null || true)"
  fi
  if [ -z "$triplet" ]; then
    case "$(uname -m)" in
    x86_64) triplet="x86_64-linux-gnu" ;;
    aarch64 | arm64) triplet="aarch64-linux-gnu" ;;
    esac
  fi
  printf '%s' "$triplet"
}

case "$enable_cache" in
true)
  cache_dir="${GITHUB_WORKSPACE}/.tesseract-cache/${label}"
  {
    echo "TESSERACT_RS_CACHE_DIR=${cache_dir}"
    echo "XDG_CACHE_HOME=${GITHUB_WORKSPACE}/.xdg-cache/${label}"
  } >>"$GITHUB_ENV"
  {
    echo "cache-dir=${cache_dir}"
    echo "cache-enabled=true"
  } >>"$GITHUB_OUTPUT"

  multiarch="$(detect_multiarch)"
  openssl_lib_dir="/usr/lib"
  if [ -n "$multiarch" ]; then
    openssl_lib_dir="/usr/lib/${multiarch}"
  fi

  # Container-side settings; the cache directories are addressed under /io.
  docker_env=(
    "--env TESSERACT_RS_CACHE_DIR=/io/.tesseract-cache/${label}"
    "--env XDG_CACHE_HOME=/io/.xdg-cache/${label}"
    "--env OPENSSL_LIB_DIR=${openssl_lib_dir}"
    "--env OPENSSL_INCLUDE_DIR=/usr/include"
  )
  # "${array[*]}" joins the entries with single spaces (default IFS).
  echo "docker-options=${docker_env[*]}" >>"$GITHUB_OUTPUT"
  ;;
*)
  echo "TESSERACT_RS_CACHE_DIR=" >>"$GITHUB_ENV"
  printf '%s\n' "cache-dir=" "cache-enabled=false" "docker-options=" >>"$GITHUB_OUTPUT"
  ;;
esac

View File

@@ -0,0 +1,7 @@
#!/usr/bin/env bash
# Create the per-label Tesseract and XDG cache directories, relative to the
# current working directory.
#
# Usage: <script> <label>
set -euo pipefail

label="${1:?label required}"

for cache_root in .tesseract-cache .xdg-cache; do
  mkdir -p "${cache_root}/${label}"
done

View File

@@ -0,0 +1,11 @@
#!/usr/bin/env bash
# Print a short diagnostic summary of the Node toolchain.
#
# Usage: <script> [label]   (label defaults to "Node setup")
set -euo pipefail

label="${1:-Node setup}"

# Command substitutions stay inline on purpose: a missing tool then yields an
# empty value instead of aborting the script under `set -e`.
printf '=== %s ===\n' "$label"
printf 'Node version: %s\n' "$(node --version)"
printf 'pnpm version: %s\n' "$(pnpm --version)"
printf 'tsx availability: %s\n' "$(command -v tsx || echo 'NOT FOUND')"
printf '%s\n' "pnpm workspace structure:"
pnpm list --depth=0 || true

158
scripts/ci/cache/compute-hash.sh vendored Executable file
View File

@@ -0,0 +1,158 @@
#!/usr/bin/env bash
# Compute deterministic hash for cache key generation
#
# Usage:
# compute-hash.sh <glob-pattern> [glob-pattern...]
# compute-hash.sh --files <file1> <file2> ...
# compute-hash.sh --dirs <dir1> <dir2> ...
#
# Examples:
# compute-hash.sh "crates/kreuzberg/**/*.rs" "crates/kreuzberg-ffi/**/*.rs"
# compute-hash.sh --files Cargo.lock uv.lock
# compute-hash.sh --dirs crates/kreuzberg/ crates/kreuzberg-ffi/
set -euo pipefail

# ANSI color escapes, stored as literal backslash sequences and expanded when
# the logging helpers print them.
readonly RED='\033[0;31m'
readonly GREEN='\033[0;32m'
readonly YELLOW='\033[1;33m'
readonly NC='\033[0m' # reset ("no color")
# Print "Error: <message>" in red on stderr, then exit the script with status 1.
error() {
  printf '%b\n' "${RED}Error: $*${NC}" >&2
  exit 1
}
# Print an informational message in green on stderr (stdout stays clean).
info() {
  printf '%b\n' "${GREEN}$*${NC}" >&2
}
# Print a warning in yellow on stderr; does not stop the script.
warn() {
  printf '%b\n' "${YELLOW}$*${NC}" >&2
}
# Pick the SHA-256 tool: sha256sum when available, otherwise shasum in
# 256-bit mode. HASH_CMD holds the full command line as one string.
HASH_CMD=""
for candidate in "sha256sum" "shasum -a 256"; do
  # Probe only the program name (the text before the first space).
  if command -v "${candidate%% *}" >/dev/null 2>&1; then
    HASH_CMD="$candidate"
    break
  fi
done
[[ -n "$HASH_CMD" ]] || error "Neither sha256sum nor shasum found in PATH"
# Input mode: an optional leading --files / --dirs flag selects it and is
# consumed; without a flag, the arguments are treated as glob patterns.
MODE="glob"
case "${1:-}" in
--files)
  MODE="files"
  shift
  ;;
--dirs)
  MODE="dirs"
  shift
  ;;
esac

# At least one pattern, file or directory must remain after the flag.
(($# > 0)) || error "No input provided. Usage: $0 <pattern...> or $0 --files <file...> or $0 --dirs <dir...>"
# Scratch file that collects the hash command's output for every input file.
# It is removed on every exit path.
cleanup_temp_hashes() {
  rm -f "$TEMP_HASHES"
}
TEMP_HASHES="$(mktemp)"
trap cleanup_temp_hashes EXIT
# Collect one hash line per input file into $TEMP_HASHES, according to $MODE.
#
# HASH_CMD may be a multi-word string ("shasum -a 256"). Split it into an
# array once so it can be passed to `find -exec` as separate words; quoting
# the whole string there would look for a program literally named
# "shasum -a 256" and hash nothing.
read -r -a hash_cmd <<<"$HASH_CMD"

case "$MODE" in
files)
  # Hash specific files directly
  for file in "$@"; do
    if [[ -f "$file" ]]; then
      "${hash_cmd[@]}" "$file" >>"$TEMP_HASHES" 2>/dev/null || warn "Failed to hash: $file"
    else
      warn "File not found: $file"
    fi
  done
  ;;
dirs)
  # Hash all files in directories recursively
  for dir in "$@"; do
    if [[ -d "$dir" ]]; then
      # Skip hidden paths and common build/dependency output directories.
      # `{} +` batches files into as few hash invocations as possible;
      # failures on individual files are deliberately ignored.
      find "$dir" -type f \
        ! -path "*/.*" \
        ! -path "*/target/*" \
        ! -path "*/node_modules/*" \
        ! -path "*/.venv/*" \
        ! -path "*/dist/*" \
        ! -path "*/build/*" \
        -exec "${hash_cmd[@]}" {} + >>"$TEMP_HASHES" 2>/dev/null || true
    else
      warn "Directory not found: $dir"
    fi
  done
  ;;
glob)
  # Hash files matching glob patterns
  for pattern in "$@"; do
    if [[ "$pattern" == *"**"* ]]; then
      # Recursive glob such as "crates/kreuzberg/**/*.rs": split it into a
      # base directory (text before the first "*", minus one trailing "/")
      # and a file-name pattern (text after the first "**/").
      base_dir="${pattern%%\**}"
      base_dir="${base_dir%/}"
      # A pattern that starts with "**" has no base; search from ".".
      base_dir="${base_dir:-.}"

      suffix="${pattern#*\*\*/}"
      name_pattern="${suffix#/}"

      if [[ -d "$base_dir" ]]; then
        # -name matches the file name only; hidden paths and build or
        # dependency directories are skipped as in --dirs mode.
        find "$base_dir" -type f \
          ! -path "*/.*" \
          ! -path "*/target/*" \
          ! -path "*/node_modules/*" \
          ! -path "*/.venv/*" \
          -name "$name_pattern" \
          -exec "${hash_cmd[@]}" {} + 2>/dev/null >>"$TEMP_HASHES" || true
      else
        warn "Directory not found: $base_dir"
      fi
    else
      # Simple glob (no **): the pattern is left unquoted on purpose so the
      # shell expands it.
      # shellcheck disable=SC2086
      for file in $pattern; do
        if [[ -f "$file" ]]; then
          "${hash_cmd[@]}" "$file" >>"$TEMP_HASHES" 2>/dev/null || warn "Failed to hash: $file"
        fi
      done
    fi
  done
  ;;
esac
# Fail when nothing was hashed: an empty input must not yield a cache key.
if [[ ! -s "$TEMP_HASHES" ]]; then
  error "No files found matching the provided patterns"
fi

# Sort the per-file hash lines so the result does not depend on traversal
# order, then hash the sorted list. LC_ALL=C forces byte-wise ordering so the
# runner's locale cannot change the key.
FINAL_HASH=$(LC_ALL=C sort "$TEMP_HASHES" | $HASH_CMD | cut -d' ' -f1)

# Truncate to 12 characters for cache key (still 48 bits of entropy)
SHORT_HASH="${FINAL_HASH:0:12}"

# The hash is the only thing written to stdout.
echo "$SHORT_HASH"

# Debug info (info already writes to stderr). Some `wc` implementations pad
# the count with spaces; strip them.
FILE_COUNT=$(wc -l <"$TEMP_HASHES")
FILE_COUNT="${FILE_COUNT//[[:space:]]/}"
info "Hashed $FILE_COUNT files → $SHORT_HASH"

View File

@@ -0,0 +1,5 @@
#!/usr/bin/env bash
# CI wrapper: run the Docker feature tests against the "cli" image variant.
# Expects an image tagged kreuzberg:cli and must run from the repository root
# (the test script path is relative).
set -euo pipefail

variant="cli"

echo "=== Running Docker CLI feature tests ==="
python3 scripts/ci/docker/test_docker.py \
  --image "kreuzberg:${variant}" \
  --variant "${variant}" \
  --verbose

View File

@@ -0,0 +1,13 @@
#!/usr/bin/env bash
# CI wrapper for Docker configuration testing.
# Tests volume mounts, config formats, and environment variable overrides.
#
# Usage: <script> <variant>
set -euo pipefail

variant="${1:?missing variant}"
image="kreuzberg:${variant}"

printf '=== Running Docker configuration tests (%s) ===\n' "$variant"

# Hand over to the comprehensive config test script. The image must already
# be built and tagged.
exec ./scripts/test/test-docker-config-local.sh --image "$image" --variant "$variant"

View File

@@ -0,0 +1,7 @@
#!/usr/bin/env bash
# CI wrapper: run the Docker feature tests for one image variant.
# Expects an image tagged kreuzberg:<variant> and must run from the
# repository root (the test script path is relative).
#
# Usage: <script> <variant>
set -euo pipefail

variant="${1:?missing variant}"
image="kreuzberg:${variant}"

printf '=== Running Docker feature tests (%s) ===\n' "$variant"
python3 scripts/ci/docker/test_docker.py --image "$image" --variant "$variant" --verbose

750
scripts/ci/docker/test_docker.py Executable file
View File

@@ -0,0 +1,750 @@
#!/usr/bin/env python3
"""Unified Docker image test script for all variants (core, full, cli)."""
from __future__ import annotations
import argparse
import json
import os
import random
import subprocess
import sys
import tempfile
import time
from dataclasses import dataclass, field
from pathlib import Path
# ANSI escape sequences used to colorize the log-level tag printed by TestRunner.log.
BLUE = "\033[0;34m"
GREEN = "\033[0;32m"
RED = "\033[0;31m"
YELLOW = "\033[1;33m"
NC = "\033[0m"  # reset ("no color"), emitted right after the tag
# Repository root: this file lives in scripts/ci/docker/, so the root is the
# fourth ancestor (parents[3]) of the resolved file path.
REPO_ROOT = Path(__file__).resolve().parents[3]
# Sample documents; mounted read-only at /data inside the test containers.
TEST_DOCS_DIR = REPO_ROOT / "test_documents"
# JSON summary written by TestRunner.write_results().
RESULTS_FILE = Path("/tmp/kreuzberg-docker-test-results.json")
@dataclass
class TestRunner:
    """Shared state and Docker helpers for the image test functions.

    Keeps pass/fail counters, remembers every container name it hands out so
    ``cleanup`` can remove them, and prints colorized log lines.
    """

    image: str  # image reference under test
    variant: str  # image variant name, recorded in the results file
    verbose: bool = False  # enables debug() output
    total: int = 0  # tests started
    passed: int = 0
    failed: int = 0
    failed_names: list[str] = field(default_factory=list)
    containers: list[str] = field(default_factory=list)  # names handed out so far

    # -- logging ----------------------------------------------------------

    def log(self, level: str, color: str, msg: str) -> None:
        """Print ``msg`` prefixed with a colorized ``[level]`` tag."""
        tag = f"{color}[{level}]{NC}"
        print(f"{tag} {msg}", flush=True)

    def info(self, msg: str) -> None:
        self.log("INFO", BLUE, msg)

    def ok(self, msg: str = "PASS") -> None:
        self.log("SUCCESS", GREEN, msg)

    def error(self, msg: str) -> None:
        self.log("ERROR", RED, msg)

    def warn(self, msg: str) -> None:
        self.log("WARNING", YELLOW, msg)

    def debug(self, msg: str) -> None:
        """Log ``msg`` only when verbose mode is on."""
        if not self.verbose:
            return
        self.log("VERBOSE", YELLOW, msg)

    # -- test bookkeeping -------------------------------------------------

    def start(self, name: str) -> None:
        """Count a new test and announce it."""
        self.total = self.total + 1
        self.info(f"Test {self.total}: {name}")

    def pass_test(self) -> None:
        """Record the current test as passed."""
        self.passed = self.passed + 1
        self.ok()

    def fail_test(self, name: str, details: str = "") -> None:
        """Record a failure under ``name``, optionally logging ``details``."""
        self.failed = self.failed + 1
        self.failed_names.append(name)
        lines = [f"FAIL: {name}"]
        if details:
            lines.append(f" Details: {details}")
        self.error("\n".join(lines))

    # -- docker helpers ---------------------------------------------------

    def container_name(self) -> str:
        """Return a fresh container name and remember it for cleanup."""
        stamp = int(time.time())
        suffix = random.randint(0, 99999)
        name = f"kreuzberg-test-{stamp}-{suffix}"
        self.containers.append(name)
        return name

    def docker_run(self, *args: str, capture: bool = True) -> subprocess.CompletedProcess[str]:
        """Run ``docker run --rm <args>`` with a 120 second timeout."""
        command = ["docker", "run", "--rm"]
        command.extend(args)
        return subprocess.run(command, capture_output=capture, text=True, timeout=120)

    def docker_run_detached(self, *args: str) -> str:
        """Start a detached, named container and return its name.

        Raises ``subprocess.CalledProcessError`` when ``docker run`` fails.
        """
        name = self.container_name()
        command = ["docker", "run", "-d", "--name", name]
        command.extend(args)
        subprocess.run(command, capture_output=True, text=True, check=True, timeout=60)
        return name

    def docker_rm(self, name: str) -> None:
        """Force-remove a container; the command's exit status is ignored."""
        subprocess.run(["docker", "rm", "-f", name], capture_output=True, timeout=30)

    def cleanup(self) -> None:
        """Force-remove every container name handed out so far."""
        for container in self.containers:
            self.docker_rm(container)

    def run_cli_output(self, *extra_args: str, volumes: bool = False) -> str:
        """Run a CLI command against the image and return combined stdout+stderr."""
        container = self.container_name()
        mount = ["-v", f"{TEST_DOCS_DIR}:/data:ro"] if volumes else []
        result = self.docker_run("--name", container, *mount, self.image, *extra_args)
        return (result.stdout + result.stderr).strip()

    # -- reporting --------------------------------------------------------

    def write_results(self) -> None:
        """Write the run summary as JSON to RESULTS_FILE."""
        # Integer percentage; 0 when no test ran, which avoids division by zero.
        rate = self.passed * 100 // self.total if self.total else 0
        summary = {
            "image": self.image,
            "variant": self.variant,
            "total_tests": self.total,
            "passed": self.passed,
            "failed": self.failed,
            "success_rate": rate,
            "failed_tests": self.failed_names,
        }
        RESULTS_FILE.write_text(json.dumps(summary, indent=2))
        self.info(f"Results written to {RESULTS_FILE}")
# ---------------------------------------------------------------------------
# Shared tests (all variants)
# ---------------------------------------------------------------------------
def test_image_exists(t: TestRunner) -> None:
t.start("Docker image exists")
r = subprocess.run(["docker", "inspect", t.image], capture_output=True, timeout=30)
if r.returncode == 0:
t.pass_test()
else:
t.fail_test("Image does not exist", t.image)
def test_version(t: TestRunner) -> None:
t.start("CLI --version command")
out = t.run_cli_output("--version")
t.debug(f"Version output: {out}")
if "kreuzberg" in out.lower():
t.pass_test()
else:
t.fail_test("CLI version", f"Expected 'kreuzberg' in output, got: {out}")
def test_help(t: TestRunner) -> None:
t.start("CLI --help command")
out = t.run_cli_output("--help")
if "extract" in out.lower():
t.pass_test()
else:
t.fail_test("CLI help", "Expected 'extract' in help output")
def test_mime_detection(t: TestRunner) -> None:
t.start("MIME type detection (detect command)")
out = t.run_cli_output("detect", "/data/pdf/searchable.pdf", volumes=True)
t.debug(f"MIME detection output: {out}")
if "application/pdf" in out.lower():
t.pass_test()
else:
t.fail_test("MIME detection", f"Expected 'application/pdf', got: {out}")
def test_extract_text(t: TestRunner) -> None:
t.start("Extract plain text file")
out = t.run_cli_output("extract", "/data/text/contract.txt", volumes=True)
t.debug(f"Text extraction output (first 100 chars): {out[:100]}")
if len(out) > 15 and "contract" in out.lower():
t.pass_test()
else:
t.fail_test("Text extraction", f"Output too short ({len(out)} chars) or missing expected keywords")
def test_extract_pdf(t: TestRunner) -> None:
    """Extract the searchable sample PDF; require exit status 0 and more than 50 characters of output."""
    t.start("Extract searchable PDF")
    container = t.container_name()
    command = [
        "docker", "run", "--rm", "--name", container,
        "-v", f"{TEST_DOCS_DIR}:/data:ro",
        t.image, "extract", "/data/pdf/searchable.pdf",
    ]
    proc = subprocess.run(command, capture_output=True, text=True, timeout=120)
    text = (proc.stdout + proc.stderr).strip()
    t.debug(f"PDF extraction output (first 200 chars): {text[:200]}")
    if proc.returncode != 0:
        t.fail_test("Searchable PDF extraction", f"Exit code {proc.returncode}: {text[:300]}")
        return
    if len(text) <= 50:
        t.fail_test("Searchable PDF extraction", f"Output too short: {len(text)} chars")
        return
    t.pass_test()
def test_extract_html(t: TestRunner) -> None:
t.start("Extract HTML file")
out = t.run_cli_output("extract", "/data/html/simple_table.html", volumes=True)
t.debug(f"HTML extraction output (first 100 chars): {out[:100]}")
if len(out) > 10:
t.pass_test()
else:
t.fail_test("HTML extraction", f"Output too short: {len(out)} chars")
def test_extract_docx(t: TestRunner) -> None:
t.start("Extract DOCX file")
out = t.run_cli_output("extract", "/data/docx/extraction_test.docx", volumes=True)
t.debug(f"DOCX extraction output (first 100 chars): {out[:100]}")
if len(out) > 100:
t.pass_test()
else:
t.fail_test("DOCX extraction", f"Output too short ({len(out)} chars)")
def test_batch_cli(t: TestRunner) -> None:
t.start("CLI batch extraction (multiple files)")
out = t.run_cli_output(
"batch", "/data/text/contract.txt", "/data/html/simple_table.html",
volumes=True,
)
t.debug(f"Batch output (first 200 chars): {out[:200]}")
if len(out) > 20:
t.pass_test()
else:
t.fail_test("Batch extraction", f"Output too short: {len(out)} chars")
def test_nonexistent_file(t: TestRunner) -> None:
t.start("Non-existent file returns error")
r = subprocess.run(
["docker", "run", "--rm", t.image, "extract", "/nonexistent/file.pdf"],
capture_output=True, text=True, timeout=60,
)
if r.returncode != 0:
t.pass_test()
else:
t.fail_test("Error on missing file", "Expected non-zero exit code for missing file")
def test_readonly_mount(t: TestRunner) -> None:
t.start("Read-only volume mount works")
name = t.container_name()
r = subprocess.run(
["docker", "run", "--rm", "--name", name,
"-v", f"{TEST_DOCS_DIR}:/data:ro",
"--read-only", "--tmpfs", "/tmp",
t.image, "extract", "/data/text/simple.txt"],
capture_output=True, text=True, timeout=60,
)
out = (r.stdout + r.stderr).strip()
if len(out) > 5:
t.pass_test()
else:
t.fail_test("Read-only mount", "Failed to extract with read-only filesystem")
# ---------------------------------------------------------------------------
# Core/Full-only tests (API server tests)
# ---------------------------------------------------------------------------
def _wait_for_api(port: int, retries: int = 10) -> bool:
import urllib.request
for _ in range(retries):
try:
urllib.request.urlopen(f"http://localhost:{port}/health", timeout=3)
return True
except Exception:
time.sleep(2)
return False
def _api_get(port: int, path: str) -> str | None:
import urllib.request
try:
with urllib.request.urlopen(f"http://localhost:{port}{path}", timeout=10) as resp:
return resp.read().decode()
except Exception:
return None
def _api_post_file(port: int, path: str, filepath: str) -> str | None:
"""POST a file using curl (simplest multipart approach)."""
r = subprocess.run(
["curl", "-f", "-s", "-X", "POST", f"http://localhost:{port}{path}",
"-F", f"files=@{filepath}"],
capture_output=True, text=True, timeout=30,
)
return r.stdout if r.returncode == 0 else None
def test_ocr_extraction(t: TestRunner) -> None:
t.start("OCR extraction with Tesseract")
name = t.container_name()
r = subprocess.run(
["docker", "run", "--rm", "--name", name, "--memory", "1g",
"-v", f"{TEST_DOCS_DIR}:/data:ro",
t.image, "extract", "/data/images/ocr_image.jpg", "--ocr", "true"],
capture_output=True, text=True, timeout=120,
)
out = (r.stdout + r.stderr).strip()
t.debug(f"OCR extraction output (first 100 chars): {out[:100]}")
if len(out) > 10:
t.pass_test()
else:
t.fail_test("OCR extraction", "Output too short or OCR failed")
def test_paddle_ocr_extraction(t: TestRunner) -> None:
    """Run extraction with the ``paddle-ocr`` backend under a 2 GiB limit; require exit 0 and more than 10 characters."""
    t.start("PaddleOCR extraction (pre-loaded models)")
    container = t.container_name()
    command = [
        "docker", "run", "--rm", "--name", container, "--memory", "2g",
        "-v", f"{TEST_DOCS_DIR}:/data:ro",
        t.image, "extract", "/data/images/ocr_image.jpg",
        "--ocr", "true", "--ocr-backend", "paddle-ocr",
    ]
    proc = subprocess.run(command, capture_output=True, text=True, timeout=120)
    text = (proc.stdout + proc.stderr).strip()
    t.debug(f"PaddleOCR extraction output (first 200 chars): {text[:200]}")
    succeeded = proc.returncode == 0 and len(text) > 10
    if not succeeded:
        t.fail_test("PaddleOCR extraction", f"Exit code: {proc.returncode}, output length: {len(text)}")
        return
    t.pass_test()
def test_doc_extraction(t: TestRunner) -> None:
t.start("Legacy DOC extraction (native OLE/CFB)")
name = t.container_name()
r = subprocess.run(
["docker", "run", "--rm", "--name", name, "--memory", "1g",
"-v", f"{TEST_DOCS_DIR}:/data:ro",
t.image, "extract", "/data/doc/unit_test_lists.doc"],
capture_output=True, text=True, timeout=120,
)
out = (r.stdout + r.stderr).strip()
t.debug(f"DOC extraction output (first 100 chars): {out[:100]}")
if len(out) > 20:
t.pass_test()
else:
t.fail_test("DOC extraction", f"Output too short: {len(out)} chars")
def test_api_health(t: TestRunner) -> None:
t.start("API server startup and health check")
port = 9000 + random.randint(0, 999)
name = t.docker_run_detached(
"--memory", "2g", "--cpus", "2",
"-p", f"{port}:8000", t.image,
)
if not _wait_for_api(port):
t.fail_test("API health check", f"Health endpoint not responding on port {port}")
t.docker_rm(name)
return
health = _api_get(port, "/health")
t.debug(f"Health response: {health}")
if health:
t.pass_test()
else:
t.fail_test("API health check", "No response from /health")
# Plugin initialization validation
t.start("Plugin initialization validation")
if health and "plugins" in health:
import re
ocr_m = re.search(r'"ocr_backends_count":(\d+)', health)
ext_m = re.search(r'"extractors_count":(\d+)', health)
ocr_count = int(ocr_m.group(1)) if ocr_m else 0
ext_count = int(ext_m.group(1)) if ext_m else 0
t.debug(f"OCR backends: {ocr_count}, Extractors: {ext_count}")
if t.variant == "full":
if ocr_count > 0:
t.info(f"Full variant: {ocr_count} OCR backend(s) registered")
t.pass_test()
else:
t.fail_test("Plugin initialization", "Full variant: No OCR backends registered")
t.docker_rm(name)
return
else:
t.pass_test()
if ext_count == 0:
t.fail_test("Plugin initialization", "No document extractors registered")
t.docker_rm(name)
return
else:
t.warn("Health response missing 'plugins' field")
t.pass_test()
t.docker_rm(name)
def test_api_extract(t: TestRunner) -> None:
t.start("API extraction endpoint")
port = 9000 + random.randint(0, 999)
name = t.docker_run_detached(
"--memory", "2g", "--cpus", "2",
"-p", f"{port}:8000", t.image,
)
if not _wait_for_api(port):
t.fail_test("API extraction", "Server not ready")
t.docker_rm(name)
return
with tempfile.NamedTemporaryFile(mode="w", suffix=".txt", delete=False) as f:
f.write("Test content for API extraction")
tmp = f.name
resp = _api_post_file(port, "/extract", tmp)
os.unlink(tmp)
t.debug(f"API response: {resp}")
if resp and "Test content for API extraction" in resp:
t.pass_test()
else:
t.fail_test("API extraction", "Response missing expected content")
t.docker_rm(name)
def test_api_info(t: TestRunner) -> None:
    """Start the API server and require ``/info`` to mention both "version" and "rust_backend"."""
    t.start("API /info endpoint")
    port = 9000 + random.randint(0, 999)
    container = t.docker_run_detached(
        "--memory", "2g", "--cpus", "2",
        "-p", f"{port}:8000", t.image,
    )
    if _wait_for_api(port):
        body = _api_get(port, "/info")
        t.debug(f"/info response: {body}")
        has_fields = bool(body) and "version" in body and "rust_backend" in body
        if has_fields:
            t.pass_test()
        else:
            t.fail_test("API /info endpoint", "Response missing expected fields")
    else:
        t.fail_test("API /info", "Server not ready")
    t.docker_rm(container)
def test_api_openapi(t: TestRunner) -> None:
    """Start the API server and require ``/openapi.json`` to contain the ``"openapi"`` and ``"paths"`` keys."""
    t.start("API /openapi.json endpoint")
    port = 9000 + random.randint(0, 999)
    container = t.docker_run_detached(
        "--memory", "2g", "--cpus", "2",
        "-p", f"{port}:8000", t.image,
    )
    if _wait_for_api(port):
        body = _api_get(port, "/openapi.json")
        t.debug(f"/openapi.json response (first 200 chars): {(body or '')[:200]}")
        looks_like_schema = bool(body) and '"openapi"' in body and '"paths"' in body
        if looks_like_schema:
            t.pass_test()
        else:
            t.fail_test("API /openapi.json endpoint", "Response missing OpenAPI schema fields")
    else:
        t.fail_test("API /openapi.json", "Server not ready")
    t.docker_rm(container)
def test_api_cache(t: TestRunner) -> None:
    """Start the API server and run two checks: ``GET /cache/stats`` then ``DELETE /cache/clear``.

    The stats body must mention "total_files"; the clear call must exit 0
    (curl ``-f``) and mention "removed_files".
    """
    t.start("API /cache/stats endpoint")
    port = 9000 + random.randint(0, 999)
    container = t.docker_run_detached(
        "--memory", "2g", "--cpus", "2",
        "-p", f"{port}:8000", t.image,
    )
    if _wait_for_api(port):
        stats = _api_get(port, "/cache/stats")
        t.debug(f"/cache/stats response: {stats}")
        if stats and "total_files" in stats:
            t.pass_test()
        else:
            t.fail_test("API /cache/stats endpoint", "Response missing expected fields")

        t.start("API /cache/clear endpoint")
        clear_cmd = ["curl", "-f", "-s", "-X", "DELETE", f"http://localhost:{port}/cache/clear"]
        cleared = subprocess.run(clear_cmd, capture_output=True, text=True, timeout=10)
        if cleared.returncode == 0 and "removed_files" in cleared.stdout:
            t.pass_test()
        else:
            t.fail_test("API /cache/clear endpoint", "Response missing expected fields")
    else:
        t.fail_test("API /cache/stats", "Server not ready")
    t.docker_rm(container)
def test_api_batch(t: TestRunner) -> None:
t.start("API batch extraction (multiple files)")
port = 9000 + random.randint(0, 999)
name = t.docker_run_detached(
"--memory", "2g", "--cpus", "2",
"-p", f"{port}:8000", t.image,
)
if not _wait_for_api(port):
t.fail_test("API batch extraction", "Server not ready")
t.docker_rm(name)
return
tmp1 = tempfile.NamedTemporaryFile(mode="w", suffix=".txt", delete=False)
tmp2 = tempfile.NamedTemporaryFile(mode="w", suffix=".txt", delete=False)
tmp1.write("File one content"); tmp1.close()
tmp2.write("File two content"); tmp2.close()
r = subprocess.run(
["curl", "-f", "-s", "-X", "POST", f"http://localhost:{port}/extract",
"-F", f"files=@{tmp1.name}", "-F", f"files=@{tmp2.name}"],
capture_output=True, text=True, timeout=30,
)
os.unlink(tmp1.name)
os.unlink(tmp2.name)
t.debug(f"Batch extraction response (first 200 chars): {r.stdout[:200]}")
if "File one content" in r.stdout and "File two content" in r.stdout:
t.pass_test()
else:
t.fail_test("API batch extraction", "Response missing expected content")
t.docker_rm(name)
def test_cli_batch_json(t: TestRunner) -> None:
    """Run ``batch ... --format json`` on a text file and a PDF; require more than 100 characters containing "content"."""
    t.start("CLI batch extraction with JSON format")
    container = t.container_name()
    command = [
        "docker", "run", "--rm", "--name", container,
        "-v", f"{TEST_DOCS_DIR}:/data:ro",
        t.image, "batch", "/data/text/contract.txt", "/data/pdf/searchable.pdf",
        "--format", "json",
    ]
    proc = subprocess.run(command, capture_output=True, text=True, timeout=120)
    text = (proc.stdout + proc.stderr).strip()
    t.debug(f"Batch command output (first 200 chars): {text[:200]}")
    if not (len(text) > 100 and "content" in text):
        t.fail_test("CLI batch command", "Output too short or malformed")
        return
    t.pass_test()
def test_mcp_server(t: TestRunner) -> None:
t.start("MCP server startup and persistence")
name = t.docker_run_detached(
"-i", "--memory", "1g", t.image, "mcp",
)
time.sleep(3)
r = subprocess.run(
["docker", "ps", "--filter", f"name={name}", "--format", "{{.Names}}"],
capture_output=True, text=True, timeout=10,
)
if name in r.stdout:
t.debug("MCP server is running")
t.pass_test()
else:
t.fail_test("MCP server persistence", "MCP server exited immediately")
t.docker_rm(name)
def test_cli_cache(t: TestRunner) -> None:
t.start("CLI cache stats command")
name = t.container_name()
r = subprocess.run(
["docker", "run", "--rm", "--name", name, t.image, "cache", "stats", "--format", "json"],
capture_output=True, text=True, timeout=60,
)
out = (r.stdout + r.stderr).strip()
t.debug(f"Cache stats output: {out}")
if "total_files" in out:
t.pass_test()
else:
t.fail_test("CLI cache stats", "Output missing expected fields")
t.start("CLI cache clear command")
name = t.container_name()
r = subprocess.run(
["docker", "run", "--rm", "--name", name, t.image, "cache", "clear", "--format", "json"],
capture_output=True, text=True, timeout=60,
)
out = (r.stdout + r.stderr).strip()
t.debug(f"Cache clear output: {out}")
if "removed_files" in out:
t.pass_test()
else:
t.fail_test("CLI cache clear", "Output missing expected fields")
def test_security_nonroot(t: TestRunner) -> None:
t.start("Security: Container runs as non-root user")
name = t.container_name()
r = subprocess.run(
["docker", "run", "--rm", "--name", name, "--entrypoint", "/bin/sh",
t.image, "-c", "whoami"],
capture_output=True, text=True, timeout=30,
)
user = r.stdout.strip()
if user == "kreuzberg":
t.pass_test()
else:
t.fail_test("Non-root user", f"Container running as: {user} (expected: kreuzberg)")
def test_security_readonly(t: TestRunner) -> None:
t.start("Security: Read-only volume enforcement")
with tempfile.TemporaryDirectory() as tmpdir:
(Path(tmpdir) / "test.txt").write_text("test")
name = t.container_name()
r = subprocess.run(
["docker", "run", "--rm", "--name", name,
"-v", f"{tmpdir}:/data:ro",
"--entrypoint", "/bin/sh", t.image,
"-c", "echo 'attempt' > /data/test2.txt 2>&1 || echo 'READ_ONLY'"],
capture_output=True, text=True, timeout=30,
)
out = r.stdout + r.stderr
if any(s in out for s in ("READ_ONLY", "read-only", "Read-only")):
t.pass_test()
else:
t.fail_test("Read-only volume", "Was able to write to read-only volume")
def test_security_memlimit(t: TestRunner) -> None:
t.start("Security: Memory limit enforcement")
name = t.container_name()
r = subprocess.run(
["docker", "run", "--rm", "--name", name,
"--memory", "128m", "--memory-swap", "128m",
"--entrypoint", "/bin/sh", t.image,
"-c", "echo 'Memory limit test passed'"],
capture_output=True, text=True, timeout=30,
)
if "Memory limit test passed" in r.stdout:
t.pass_test()
else:
t.fail_test("Memory limit", "Container failed with memory limit")
# ---------------------------------------------------------------------------
# CLI-only tests
# ---------------------------------------------------------------------------
def test_cli_image_size(t: TestRunner) -> None:
t.start("Image size is reasonable (< 200MB)")
r = subprocess.run(
["docker", "inspect", t.image, "--format", "{{.Size}}"],
capture_output=True, text=True, timeout=10,
)
try:
size_mb = int(r.stdout.strip()) // (1024 * 1024)
except ValueError:
size_mb = 0
t.debug(f"Image size: {size_mb}MB")
if 0 < size_mb < 200:
t.pass_test()
else:
t.fail_test("Image size", f"Expected < 200MB, got {size_mb}MB")
# ---------------------------------------------------------------------------
# Test suites per variant
# ---------------------------------------------------------------------------
def run_cli_tests(t: TestRunner) -> None:
    """Tests for the minimal CLI Docker image, run in a fixed order."""
    suite = (
        test_image_exists,
        test_cli_image_size,
        test_version,
        test_help,
        test_mime_detection,
        test_extract_text,
        test_extract_pdf,
        test_extract_html,
        test_extract_docx,
        test_batch_cli,
        test_readonly_mount,
        test_nonexistent_file,
    )
    for check in suite:
        check(t)
def run_core_full_tests(t: TestRunner) -> None:
"""Tests for core and full Docker images."""
# Shared CLI checks (also part of the CLI-image suite).
test_image_exists(t)
test_version(t)
test_help(t)
test_mime_detection(t)
test_extract_text(t)
test_extract_pdf(t)
test_extract_docx(t)
test_extract_html(t)
test_ocr_extraction(t)
# NOTE(review): indentation was lost in this view, so it is unclear how many
# of the following calls belong to the `full`-only branch — confirm against
# the original file before restructuring.
if t.variant == "full":
test_doc_extraction(t)
test_paddle_ocr_extraction(t)
# API server checks: each one starts its own detached container.
test_api_health(t)
test_api_extract(t)
test_api_info(t)
test_api_openapi(t)
test_api_cache(t)
test_api_batch(t)
# Remaining CLI / MCP checks.
test_cli_batch_json(t)
test_mcp_server(t)
test_cli_cache(t)
# Container hardening checks.
test_security_nonroot(t)
test_security_readonly(t)
test_security_memlimit(t)
def main() -> None:
    """Parse arguments, run the suite for the chosen variant, print a summary and write results.

    Exits with status 1 when at least one test failed. ``t.cleanup()`` runs
    even if a test function raises.
    """
    parser = argparse.ArgumentParser(description="Docker image tests")
    parser.add_argument("--image", required=True, help="Docker image name")
    parser.add_argument("--variant", required=True, choices=["core", "full", "cli"])
    parser.add_argument("--verbose", action="store_true")
    parser.add_argument("--skip-build", action="store_true", help="(ignored, kept for compat)")
    args = parser.parse_args()

    t = TestRunner(image=args.image, variant=args.variant, verbose=args.verbose)
    rule = "=" * 72

    print(rule)
    t.info(f"Starting Docker tests for: {args.image} (variant: {args.variant})")
    print(rule)

    suite = run_cli_tests if args.variant == "cli" else run_core_full_tests
    try:
        suite(t)
    finally:
        t.cleanup()

    # Summary
    print()
    print(rule)
    t.info(f"Test Results: {t.passed}/{t.total} passed, {t.failed} failed")
    print(rule)

    any_failed = t.failed > 0
    if any_failed:
        t.error("Failed tests:")
        for failed_name in t.failed_names:
            print(f" - {failed_name}")

    t.write_results()

    if any_failed:
        sys.exit(1)
    t.ok("All tests passed!")


if __name__ == "__main__":
    main()

61
scripts/ci/docs/build.sh Executable file
View File

@@ -0,0 +1,61 @@
#!/usr/bin/env bash
# Build the documentation site (Zensical, doc dependency group).
#
# Usage:
#   scripts/ci/docs/build.sh
#   scripts/ci/docs/build.sh --strict --log-file /tmp/build-log.txt
#
# Options:
#   --strict          pass --strict to `zensical build`
#   --log-file PATH   also append all sync/build output to PATH (truncated first)
#
# Caching: use astral-sh/setup-uv with enable-cache in CI; this script only runs uv.
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "$SCRIPT_DIR/../../.." && pwd)"
cd "$REPO_ROOT"

strict=false
log_file=""

while (($# > 0)); do
  case "$1" in
  --strict)
    strict=true
    shift
    ;;
  --log-file)
    if (($# < 2)); then
      echo "error: --log-file requires a path" >&2
      exit 2
    fi
    log_file="$2"
    shift 2
    ;;
  *)
    echo "usage: $0 [--strict] [--log-file PATH]" >&2
    exit 2
    ;;
  esac
done

# Install only the `doc` dependency group; the project and workspace members are not installed.
uv_sync() {
  uv sync --group doc --no-editable --no-install-workspace --no-install-project
}

# Run a clean Zensical build, adding --strict when requested on the command line.
zensical_build() {
  local -a build_args=(build --clean)
  if [[ "$strict" == true ]]; then
    build_args+=(--strict)
  fi
  uv run --no-sync zensical "${build_args[@]}"
}

if [[ -z "$log_file" ]]; then
  uv_sync
  zensical_build
else
  # pipefail keeps a failing uv/zensical from being masked by tee's exit status.
  set -o pipefail
  mkdir -p "$(dirname "$log_file")"
  : >"$log_file"
  uv_sync 2>&1 | tee -a "$log_file"
  zensical_build 2>&1 | tee -a "$log_file"
fi

13
scripts/ci/docs/textlint.sh Executable file
View File

@@ -0,0 +1,13 @@
#!/usr/bin/env bash
# Run textlint prose linting against docs/**/*.md.
#
# Usage:
#   scripts/ci/docs/textlint.sh
#
# The glob is passed quoted so textlint, not the shell, expands it.
set -euo pipefail

SCRIPT_DIR="$(
  cd "$(dirname "${BASH_SOURCE[0]}")"
  pwd
)"
REPO_ROOT="$(
  cd "$SCRIPT_DIR/../../.."
  pwd
)"

cd "$REPO_ROOT"
npx textlint "docs/**/*.md"

View File

@@ -0,0 +1,17 @@
#!/usr/bin/env bash
# Report the major.minor version of the apt candidate for tesseract-ocr as the
# `version` step output (written to $GITHUB_OUTPUT); "unknown" when it cannot
# be determined.
set -euo pipefail

# `|| true`: a missing package or an absent apt-cache must not abort the
# script under `set -e` / pipefail; the fallback below handles it.
version="$(
  apt-cache policy tesseract-ocr 2>/dev/null |
    grep 'Candidate:' |
    grep -Eo '[0-9]+\.[0-9]+' |
    head -1 ||
    true
)"

version="${version:-unknown}"

echo "version=${version}" >>"${GITHUB_OUTPUT}"
echo "::notice title=Tesseract Version::Detected version: ${version}"

View File

@@ -0,0 +1,25 @@
#!/usr/bin/env bash
# Report the major.minor version of Homebrew's tesseract formula as the
# `version` step output (written to $GITHUB_OUTPUT); "unknown" when it cannot
# be determined.
set -euo pipefail

version=""

# First choice: the stable version from `brew info --json=v2`.
brew_json="$(brew info --json=v2 tesseract 2>/dev/null || true)"
if [[ -n "${brew_json}" ]]; then
  version="$(
    python3 -c 'import json, re, sys; data = json.loads(sys.argv[1]); stable = (((data.get("formulae") or [{}])[0].get("versions") or {}).get("stable") or ""); m = re.match(r"^(\d+\.\d+)", stable); print(m.group(1) if m else "")' "${brew_json}" || true
  )"
fi

# Fallback: first major.minor number on the first line of plain `brew info`.
if [[ -z "${version}" ]]; then
  headline="$(brew info tesseract 2>/dev/null | head -1 || true)"
  if [[ "${headline}" =~ ([0-9]+\.[0-9]+) ]]; then
    version="${BASH_REMATCH[1]}"
  fi
fi

version="${version:-unknown}"

echo "version=${version}" >>"${GITHUB_OUTPUT}"
echo "::notice title=Tesseract Version::Detected version: ${version}"

View File

@@ -0,0 +1,136 @@
#!/usr/bin/env bash
# Install the apt packages needed on Linux CI runners (Tesseract + language
# data, fonts, build toolchain, libmagic, libuv, PHP), then verify them.
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="${REPO_ROOT:-$(cd "$SCRIPT_DIR/../../.." && pwd)}"

# retry_with_backoff / retry_with_backoff_timeout are expected to come from this file.
source "$REPO_ROOT/scripts/lib/retry.sh"

echo "::group::Installing Linux dependencies"

echo "Updating package index..."
retry_with_backoff sudo apt-get update ||
  echo "::warning::apt-get update failed after retries, continuing anyway..."

apt_packages=(
  tesseract-ocr
  tesseract-ocr-eng
  tesseract-ocr-tur
  tesseract-ocr-deu
  fonts-liberation
  fonts-dejavu-core
  fonts-noto-core
  libssl-dev
  pkg-config
  build-essential
  cmake
  libmagic-dev
  libuv1-dev
  php-cli
  php-dev
)

# Packages retried one at a time when the bulk install fails.
fallback_packages=(tesseract-ocr libssl-dev pkg-config cmake)

echo "Installing dependencies..."
install_status=0
retry_with_backoff_timeout 900 sudo apt-get install -y "${apt_packages[@]}" || install_status=$?

case "$install_status" in
0)
  echo "✓ All packages installed successfully"
  ;;
124)
  # 124 is reported as a timeout; the verification step decides whether to fail.
  echo "::error::Package installation timed out after 15 minutes"
  ;;
*)
  echo "::warning::Some packages failed to install, attempting individual installs..."
  for pkg in "${fallback_packages[@]}"; do
    echo "Installing $pkg..."
    if retry_with_backoff_timeout 300 sudo apt-get install -y "$pkg" 2>&1; then
      echo "$pkg installed"
    else
      echo " ⚠ Failed to install $pkg"
    fi
  done
  ;;
esac

echo "::endgroup::"
# --- Verification -----------------------------------------------------------
# Hard requirements: cmake, php and a tessdata directory (exit 1 when missing).
# The tesseract CLI itself is optional and only produces warnings.
echo "::group::Verifying Linux installations"

echo "CMake:"
if command -v cmake >/dev/null 2>&1; then
  cmake --version | head -1
  echo "✓ CMake available"

  # Export CMAKE environment variable for immediate availability in build scripts.
  # `${VAR:-}` keeps `set -u` from aborting when the script runs outside
  # GitHub Actions, where GITHUB_ENV / GITHUB_PATH are unset.
  CMAKE_FULL_PATH="$(command -v cmake)"
  if [[ -n "${GITHUB_ENV:-}" ]]; then
    echo "CMAKE=$CMAKE_FULL_PATH" >>"$GITHUB_ENV"
    echo "✓ Set CMAKE=$CMAKE_FULL_PATH in GITHUB_ENV"
  fi

  # Also add cmake binary directory to GITHUB_PATH for subsequent steps
  CMAKE_BIN="$(dirname "$CMAKE_FULL_PATH")"
  if [[ -n "${GITHUB_PATH:-}" && -d "$CMAKE_BIN" ]]; then
    echo "$CMAKE_BIN" >>"$GITHUB_PATH"
    echo "✓ Added cmake directory to GITHUB_PATH: $CMAKE_BIN"
  fi
else
  echo "::error::CMake not found after installation"
  exit 1
fi

echo ""
echo "Tesseract:"
if command -v tesseract >/dev/null 2>&1; then
  if tesseract --version 2>/dev/null | head -1; then
    echo "✓ Tesseract CLI available"
  else
    echo "::warning::Tesseract CLI present but failed to run"
  fi
else
  echo "::warning::Tesseract CLI not found; continuing (OCR will rely on bundled Tesseract)"
fi

echo ""
echo "Available Tesseract languages:"
if command -v tesseract >/dev/null 2>&1; then
  # `|| true`: head may close the pipe early, which must not fail the script under pipefail.
  tesseract --list-langs | head -10 || true
else
  echo "(tesseract CLI not available)"
fi

echo ""
echo "PHP:"
if command -v php >/dev/null 2>&1; then
  php --version | head -1
  echo "✓ PHP available"
else
  echo "::error::PHP not found after installation"
  exit 1
fi

echo ""
echo "Checking Tesseract data path..."
tessdata_found=0
# The first existing directory wins; missing language files are reported but not fatal.
for tessdata_path in "/usr/share/tesseract-ocr/5/tessdata" "/usr/share/tesseract-ocr/tessdata"; do
  if [[ -d "$tessdata_path" ]]; then
    echo "Found tessdata at: $tessdata_path"
    echo "Required language files:"
    for lang in eng tur deu; do
      if [[ -f "$tessdata_path/${lang}.traineddata" ]]; then
        size=$(stat -c%s "$tessdata_path/${lang}.traineddata" 2>/dev/null || echo "unknown")
        echo "${lang}.traineddata ($size bytes)"
      else
        echo "${lang}.traineddata (missing)"
      fi
    done
    tessdata_found=1
    break
  fi
done

if [[ "$tessdata_found" -eq 0 ]]; then
  echo "::error::Tessdata directory not found in standard locations"
  exit 1
fi

echo "::endgroup::"

View File

@@ -0,0 +1,136 @@
#!/usr/bin/env bash
# Install the Homebrew packages needed on macOS CI runners (cmake, tesseract,
# tesseract-lang, libmagic, php), then verify them.
# cmake, tesseract and php are required; tesseract-lang and libmagic only warn.
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="${REPO_ROOT:-$(cd "$SCRIPT_DIR/../../.." && pwd)}"

source "$REPO_ROOT/scripts/lib/retry.sh"

echo "::group::Installing macOS dependencies"

# Append one directory to the GitHub Actions PATH file, when one is configured.
# GITHUB_PATH is unset outside GitHub Actions; `${GITHUB_PATH:-}` keeps
# `set -u` from aborting the script there.
add_to_github_path() {
  if [[ -n "${GITHUB_PATH:-}" ]]; then
    echo "$1" >>"$GITHUB_PATH"
  fi
}

# Make both Homebrew prefixes (Apple Silicon and Intel) visible to this
# script and to later workflow steps.
if [[ -d "/opt/homebrew/bin" ]]; then
  export PATH="/opt/homebrew/bin:/opt/homebrew/sbin:${PATH}"
  add_to_github_path "/opt/homebrew/bin"
  add_to_github_path "/opt/homebrew/sbin"
fi

if [[ -d "/usr/local/bin" ]]; then
  export PATH="/usr/local/bin:/usr/local/sbin:${PATH}"
  add_to_github_path "/usr/local/bin"
  add_to_github_path "/usr/local/sbin"
fi

if ! brew list cmake &>/dev/null; then
  echo "Installing CMake..."
  retry_with_backoff brew install cmake || {
    echo "::error::Failed to install CMake after retries"
    exit 1
  }
else
  echo "✓ CMake already installed"
fi

if ! command -v cmake >/dev/null 2>&1; then
  echo "CMake not on PATH after install; attempting brew link..."
  brew link --overwrite cmake >/dev/null 2>&1 || true
fi

if ! brew list tesseract &>/dev/null; then
  echo "Installing Tesseract..."
  retry_with_backoff brew install tesseract || {
    echo "::error::Failed to install Tesseract after retries"
    exit 1
  }
else
  echo "✓ Tesseract already installed"
fi

if ! command -v tesseract >/dev/null 2>&1; then
  echo "Tesseract not on PATH after install; attempting brew link..."
  brew link --overwrite tesseract >/dev/null 2>&1 || true
fi

if ! brew list tesseract-lang &>/dev/null; then
  echo "Installing Tesseract language packs..."
  retry_with_backoff brew install tesseract-lang || {
    echo "::warning::Failed to install tesseract-lang, some languages may be unavailable"
  }
else
  echo "✓ Tesseract language packs already installed"
fi

if ! brew list libmagic &>/dev/null; then
  echo "Installing libmagic..."
  retry_with_backoff brew install libmagic || {
    echo "::warning::Failed to install libmagic after retries"
  }
else
  echo "✓ libmagic already installed"
fi

if ! brew list php &>/dev/null; then
  echo "Installing PHP..."
  retry_with_backoff brew install php || {
    echo "::error::Failed to install PHP after retries"
    exit 1
  }
else
  echo "✓ PHP already installed"
fi

if ! command -v php >/dev/null 2>&1; then
  echo "PHP not on PATH after install; attempting brew link..."
  brew link --overwrite php >/dev/null 2>&1 || true
fi

echo "::endgroup::"
# --- Verification -----------------------------------------------------------
# cmake, tesseract and php must all be on PATH; each missing tool exits 1.
echo "::group::Verifying macOS installations"

echo "CMake:"
if command -v cmake >/dev/null 2>&1; then
  cmake --version | head -1

  # Export CMAKE environment variable for immediate availability in build scripts.
  # `${VAR:-}` keeps `set -u` from aborting when the script runs outside
  # GitHub Actions, where GITHUB_ENV / GITHUB_PATH are unset.
  CMAKE_FULL_PATH="$(command -v cmake)"
  if [[ -n "${GITHUB_ENV:-}" ]]; then
    echo "CMAKE=$CMAKE_FULL_PATH" >>"$GITHUB_ENV"
    echo "✓ Set CMAKE=$CMAKE_FULL_PATH in GITHUB_ENV"
  fi

  # Also add cmake binary directory to GITHUB_PATH for subsequent steps
  CMAKE_BIN="$(dirname "$CMAKE_FULL_PATH")"
  if [[ -n "${GITHUB_PATH:-}" && -d "$CMAKE_BIN" ]]; then
    echo "$CMAKE_BIN" >>"$GITHUB_PATH"
    echo "✓ Added cmake directory to GITHUB_PATH: $CMAKE_BIN"
  fi
else
  echo "::error::CMake not found on PATH after installation"
  echo "PATH=$PATH"
  brew --prefix cmake 2>/dev/null || true
  exit 1
fi

echo ""
echo "Tesseract:"
if command -v tesseract >/dev/null 2>&1; then
  tesseract --version | head -1
else
  echo "::error::Tesseract not found on PATH after installation"
  echo "PATH=$PATH"
  brew --prefix tesseract 2>/dev/null || true
  exit 1
fi

echo ""
echo "Available languages:"
# Informational only. `|| true`: head may close the pipe before tesseract has
# finished writing a long language list, and under pipefail that SIGPIPE would
# otherwise abort the whole script.
tesseract --list-langs | head -5 || true

echo ""
echo "PHP:"
if command -v php >/dev/null 2>&1; then
  php --version | head -1
else
  echo "::error::PHP not found on PATH after installation"
  echo "PATH=$PATH"
  exit 1
fi

echo "::endgroup::"

View File

@@ -0,0 +1,301 @@
#!/usr/bin/env pwsh
# Install and verify the Windows CI dependencies (Tesseract, LLVM/Clang, PHP, CMake).
Set-StrictMode -Version Latest
$ErrorActionPreference = 'Stop'

Write-Host "::group::Installing Windows dependencies"

function Retry-Command {
    <#
    .SYNOPSIS
    Run a script block up to MaxAttempts times with exponential backoff.

    .DESCRIPTION
    An attempt fails when the script block throws, or when a native command
    inside it leaves $LASTEXITCODE outside SuccessExitCodes. Native commands
    such as choco do not throw on a non-zero exit, so without the exit-code
    check a failed install would never be retried.

    The script block's output is sent to the host rather than the pipeline.
    Otherwise it would become part of this function's return value, and
    `-not (Retry-Command { ... })` would be false for any command that
    prints two or more lines, whether it succeeded or not.

    .PARAMETER Command
    Script block to execute.

    .PARAMETER MaxAttempts
    Maximum number of attempts.

    .PARAMETER DelaySeconds
    Base delay; the wait after failed attempt k is DelaySeconds * 2^k.

    .PARAMETER SuccessExitCodes
    Native exit codes treated as success. The defaults 1641 and 3010 are the
    installer "reboot initiated/required" codes that Chocolatey can pass
    through for a successful install.

    .OUTPUTS
    System.Boolean. $true on the first successful attempt, otherwise $false.
    #>
    param(
        [scriptblock]$Command,
        [int]$MaxAttempts = 3,
        [int]$DelaySeconds = 5,
        [int[]]$SuccessExitCodes = @(0, 1641, 3010)
    )

    for ($attempt = 1; $attempt -le $MaxAttempts; $attempt++) {
        try {
            Write-Host "Attempt $attempt of $MaxAttempts..."
            # Reset so a stale code from an earlier command is not misread, and
            # so the variable exists under Set-StrictMode.
            $global:LASTEXITCODE = 0
            & $Command | Out-Host
            if ($SuccessExitCodes -notcontains $LASTEXITCODE) {
                throw "Command exited with code $LASTEXITCODE"
            }
            return $true
        }
        catch {
            if ($attempt -ge $MaxAttempts) {
                return $false
            }
            $backoffDelay = $DelaySeconds * [Math]::Pow(2, $attempt)
            Write-Host "⚠ Attempt failed, retrying in ${backoffDelay}s..." -ForegroundColor Yellow
            Start-Sleep -Seconds $backoffDelay
        }
    }
    return $false
}
# --- Install / configure ----------------------------------------------------
# Cache flags come from the workflow as the strings "true"/"false"; anything
# else (including unset) is treated as a cache miss.
$tesseractCacheHit = $env:TESSERACT_CACHE_HIT -eq "true"
$llvmCacheHit = $env:LLVM_CACHE_HIT -eq "true"
$cmakeCacheHit = $env:CMAKE_CACHE_HIT -eq "true"
$cmakeInstalled = $false

Write-Host "Cache status:"
Write-Host " TESSERACT_CACHE_HIT: $env:TESSERACT_CACHE_HIT (evaluated: $tesseractCacheHit)"
Write-Host " LLVM_CACHE_HIT: $env:LLVM_CACHE_HIT (evaluated: $llvmCacheHit)"
Write-Host " CMAKE_CACHE_HIT: $env:CMAKE_CACHE_HIT (evaluated: $cmakeCacheHit)"
Write-Host ""

try {
    & cmake --version 2>$null
    Write-Host "✓ CMake already installed"
    $cmakeInstalled = $true
}
catch {
    Write-Host "CMake not found, will attempt to install"
}

# Tesseract (optional): install, then make sure eng/osd trained data exist.
if ($tesseractCacheHit) {
    Write-Host "✓ Tesseract found in cache"
}
else {
    Write-Host "Tesseract cache miss, installing (optional for build - needed for tests only)..."
    if (Retry-Command { choco install -y tesseract --no-progress } -MaxAttempts 3) {
        Write-Host "✓ Tesseract installed"

        # Ensure tessdata directory exists and is accessible
        $tesseractPath = "C:\Program Files\Tesseract-OCR"
        if (Test-Path $tesseractPath) {
            Write-Host " Configuring Tesseract data paths..."

            # Create tessdata directory if it doesn't exist
            $tessdataPath = "$tesseractPath\tessdata"
            if (-not (Test-Path $tessdataPath)) {
                Write-Host " Creating tessdata directory at: $tessdataPath"
                New-Item -ItemType Directory -Path $tessdataPath -Force | Out-Null
            }

            # English data first, then OSD (needed for orientation detection).
            # A failed download only warns.
            $trainedDataFiles = @(
                @{ File = 'eng.traineddata'; Notice = ' Downloading English language data...' },
                @{ File = 'osd.traineddata'; Notice = ' Downloading OSD data...' }
            )
            foreach ($entry in $trainedDataFiles) {
                $fileName = $entry['File']
                $destination = "$tessdataPath\$fileName"
                if (-not (Test-Path $destination)) {
                    Write-Host $entry['Notice']
                    try {
                        $downloadUrl = "https://github.com/tesseract-ocr/tessdata_fast/raw/main/$fileName"
                        Invoke-WebRequest -Uri $downloadUrl -OutFile $destination -ErrorAction Stop
                        Write-Host " ✓ Downloaded $fileName"
                    }
                    catch {
                        Write-Host " ::warning::Failed to download ${fileName}: $($_.Exception.Message)"
                    }
                }
            }
        }
    }
    else {
        Write-Host "::warning::Failed to install Tesseract (optional dependency - gem build does not require it)"
    }
}

# LLVM/Clang: a failed install only warns.
if ($llvmCacheHit) {
    Write-Host "✓ LLVM/Clang found in cache"
}
else {
    Write-Host "LLVM cache miss, installing LLVM/Clang (required for bindgen)..."
    if (Retry-Command { choco install -y llvm --no-progress } -MaxAttempts 3) {
        Write-Host "✓ LLVM/Clang installed"
    }
    else {
        Write-Host "::warning::Failed to install LLVM/Clang via Chocolatey"
    }
}

# PHP: use an existing install when `php --version` can be invoked, else try Chocolatey.
Write-Host "Installing PHP..."
$phpInstalled = $false
try {
    & php --version 2>$null
    Write-Host "✓ PHP already installed"
    $phpInstalled = $true
}
catch {
    Write-Host "PHP not found, installing via Chocolatey..."
    if (Retry-Command { choco install -y php --no-progress } -MaxAttempts 3) {
        Write-Host "✓ PHP installed via Chocolatey"
        $phpInstalled = $true
    }
    else {
        Write-Host "::warning::Failed to install PHP via Chocolatey, will rely on shivammathur/setup-php action"
    }
}

# CMake is the only hard requirement in this section: a failed install throws.
Write-Host "Installing CMake..."
if ($cmakeCacheHit) {
    Write-Host "✓ CMake found in cache"
}
else {
    Write-Host "CMake cache miss, installing..."
    if (-not (Retry-Command { choco install -y cmake --no-progress } -MaxAttempts 3)) {
        throw "Failed to install CMake after 3 attempts"
    }
    Write-Host "✓ CMake installed"
}

# Publish tool directories both to later steps (GITHUB_PATH) and to this process.
Write-Host "Configuring PATH and environment variables..."
$toolDirectories = @(
    "C:\Program Files\CMake\bin",
    "C:\Program Files\Tesseract-OCR",
    "C:\Program Files\LLVM\bin",
    "C:\tools\php",
    "C:\Program Files\PHP"
)

foreach ($toolDir in $toolDirectories) {
    if (-not (Test-Path $toolDir)) {
        Write-Host " Path not found (skipping): $toolDir"
        continue
    }
    Write-Host " Adding to PATH: $toolDir"
    Write-Output $toolDir | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
    $env:PATH = "$toolDir;$env:PATH"
}

# Ensure TESSDATA_PREFIX is set for Windows OCR tests
$tesseractPath = "C:\Program Files\Tesseract-OCR"
$tessdataPath = "$tesseractPath\tessdata"
if ((Test-Path $tesseractPath) -and (Test-Path $tessdataPath)) {
    Write-Host " Setting TESSDATA_PREFIX for tests: $tessdataPath"
    Add-Content -Path $env:GITHUB_ENV -Value "TESSDATA_PREFIX=$tessdataPath"
    $env:TESSDATA_PREFIX = $tessdataPath
}

Write-Host "::endgroup::"
# --- Verification -----------------------------------------------------------
# Only CMake is mandatory here; Tesseract, Clang and PHP produce warnings.
Write-Host "::group::Verifying Windows installations"

Write-Host "Tesseract (optional for build):"
try {
    $tesseractCommand = Get-Command tesseract -ErrorAction Stop
    $tesseractExe = $tesseractCommand.Path
    Write-Host " Found at: $tesseractExe"
    Write-Host " Command type: $($tesseractCommand.CommandType)"

    # Get installation directory
    $installDir = Split-Path -Parent $tesseractExe
    Write-Host " Installation directory: $installDir"

    # Check for tessdata
    $dataDir = Join-Path $installDir "tessdata"
    $hasDataDir = Test-Path $dataDir
    if ($hasDataDir) {
        Write-Host " tessdata directory: $dataDir"
        Write-Host " Available language files:"
        foreach ($trainedFile in @(Get-ChildItem "$dataDir\*.traineddata" -ErrorAction SilentlyContinue)) {
            Write-Host " - $($trainedFile.Name)"
        }
    }
    else {
        Write-Host " tessdata directory not found at: $dataDir"
    }

    try {
        $version = & tesseract --version 2>&1
        Write-Host " Version output: $version"
        Write-Host "✓ Tesseract available and working"
        Write-Host ""
        Write-Host "Available Tesseract languages:"
        & tesseract --list-langs 2>&1 | ForEach-Object { Write-Host " $_" }
    }
    catch {
        Write-Host "⚠ Warning: Tesseract found but failed to run: $($_.Exception.Message)"
    }

    # Set TESSDATA_PREFIX environment variable for tests
    if (Test-Path $dataDir) {
        Write-Host ""
        Write-Host "Setting TESSDATA_PREFIX environment variable..."
        Add-Content -Path $env:GITHUB_ENV -Value "TESSDATA_PREFIX=$dataDir"
        Write-Host "✓ Set TESSDATA_PREFIX=$dataDir in GITHUB_ENV"
        $env:TESSDATA_PREFIX = $dataDir
    }
}
catch {
    # Not on PATH (or the lookup above threw): probe the usual install
    # locations so TESSDATA_PREFIX can still be exported.
    Write-Host "⚠ Tesseract not found on PATH (not required for build)"
    Write-Host " Error details: $($_.Exception.Message)"
    Write-Host " Searching common installation locations..."

    $candidateExes = @(
        "C:\Program Files\Tesseract-OCR\tesseract.exe",
        "C:\Program Files (x86)\Tesseract-OCR\tesseract.exe",
        "${env:ProgramFiles}\Tesseract-OCR\tesseract.exe",
        "${env:ProgramFiles(x86)}\Tesseract-OCR\tesseract.exe"
    )

    $located = $false
    foreach ($candidate in $candidateExes) {
        if (-not (Test-Path $candidate)) {
            continue
        }
        Write-Host " Found Tesseract at: $candidate (not on PATH)"
        $candidateDir = Split-Path -Parent $candidate
        $candidateData = Join-Path $candidateDir "tessdata"
        if (Test-Path $candidateData) {
            Write-Host " Found tessdata at: $candidateData"
            Add-Content -Path $env:GITHUB_ENV -Value "TESSDATA_PREFIX=$candidateData"
            Write-Host "✓ Set TESSDATA_PREFIX=$candidateData in GITHUB_ENV"
            $env:TESSDATA_PREFIX = $candidateData
        }
        $located = $true
        break
    }

    if (-not $located) {
        Write-Host " Tesseract not found in common locations"
    }
}

Write-Host ""
Write-Host "CMake:"
try {
    & cmake --version
    Write-Host "✓ CMake available"

    # Export CMAKE environment variable for immediate availability in build scripts
    $cmakeExe = (Get-Command cmake -ErrorAction Stop).Source
    if ($cmakeExe) {
        Add-Content -Path $env:GITHUB_ENV -Value "CMAKE=$cmakeExe"
        Write-Host "✓ Set CMAKE=$cmakeExe in GITHUB_ENV"
    }
}
catch {
    Write-Host "::error::CMake not found after installation"
    throw "CMake verification failed"
}

Write-Host ""
Write-Host "Clang:"
try {
    & clang --version
    Write-Host "✓ Clang available"
}
catch {
    Write-Host "⚠ Warning: Clang not currently available on PATH"
}

Write-Host ""
Write-Host "PHP:"
try {
    & php --version
    Write-Host "✓ PHP available"
}
catch {
    Write-Host "⚠ Warning: PHP not currently available on PATH (will be set up by shivammathur/setup-php action)"
}

Write-Host "::endgroup::"

View File

@@ -0,0 +1,433 @@
#!/usr/bin/env python3
"""
Vendor kreuzberg core crate into R package
Used by: ci-r.yaml - Vendor kreuzberg core crate step
This script:
1. Reads workspace.dependencies from root Cargo.toml
2. Copies core crates to packages/r/vendor/
3. Replaces workspace = true with explicit versions
4. Generates vendor/Cargo.toml with proper workspace setup
"""
import os
import sys
import shutil
import re
from pathlib import Path
try:
import tomllib
except ImportError:
import tomli as tomllib # type: ignore
def get_repo_root() -> Path:
    """Locate the repository root.

    A non-empty ``REPO_ROOT`` environment variable takes precedence and is
    returned as given (it is not resolved). Otherwise the root is the
    directory three levels above the one containing this script.
    """
    override = os.environ.get("REPO_ROOT")
    if not override:
        here = Path(__file__).parent.absolute()
        return (here / ".." / ".." / "..").resolve()
    return Path(override)
def read_toml(path: Path) -> dict[str, object]:
"""Read TOML file."""
with open(path, "rb") as f:
return tomllib.load(f)
def get_workspace_deps(repo_root: Path) -> dict[str, object]:
    """Return the ``[workspace.dependencies]`` table of the root Cargo.toml.

    An empty dict is returned when the manifest has no ``workspace`` table or
    that table has no ``dependencies`` entry.
    """
    manifest = read_toml(repo_root / "Cargo.toml")
    workspace_table = manifest.get("workspace", {})
    return workspace_table.get("dependencies", {})
def get_workspace_version(repo_root: Path) -> str:
    """Return ``workspace.package.version`` from the root Cargo.toml.

    Falls back to ``"4.0.0"`` when any level of that lookup is missing.
    """
    manifest = read_toml(repo_root / "Cargo.toml")
    package_table = manifest.get("workspace", {}).get("package", {})
    return package_table.get("version", "4.0.0")
def format_dependency(name: str, dep_spec: object) -> str:
    """Render one dependency as a single Cargo.toml line.

    Args:
        name: Dependency key as it appears in the manifest.
        dep_spec: Either a plain version string or a table (dict) of fields.

    Returns:
        ``name = "<spec>"`` for non-dict specs, otherwise an inline table
        ``name = { ... }``. Fields are emitted in a fixed order: package, git,
        branch, tag, rev, path, version, features, default-features, optional.
        Keys outside that list are not emitted.
    """
    if not isinstance(dep_spec, dict):
        # Plain version strings (and any other scalar) are quoted verbatim.
        return f'{name} = "{dep_spec}"'

    fragments: list[str] = []

    # String-valued fields that are only written when non-empty.
    for key in ("package", "git", "branch", "tag", "rev", "path"):
        value = dep_spec.get(key)
        if value:
            fragments.append(f'{key} = "{value}"')

    version = dep_spec.get("version", "")
    if version:
        fragments.append(f'version = "{version}"')

    features = dep_spec.get("features", [])
    if features:
        quoted = ", ".join(f'"{feature}"' for feature in features)
        fragments.append(f"features = [{quoted}]")

    # Booleans are written only when explicitly True or False, never when absent.
    for flag in ("default-features", "optional"):
        setting = dep_spec.get(flag)
        if setting is True:
            fragments.append(f"{flag} = true")
        elif setting is False:
            fragments.append(f"{flag} = false")

    body = ", ".join(fragments)
    return f"{name} = {{ {body} }}"
def _split_inline_table_fields(spec: str) -> dict[str, str]:
    """Split the inside of a TOML inline table into ``{key: raw_value}``.

    A comma separates fields only when it sits outside ``[...]`` arrays and
    outside quoted strings, so ``features = ["a", "b"]`` and
    ``version = ">=1.2, <2"`` are kept whole. Pieces without ``=`` are ignored.
    """
    pieces: list[str] = []
    current: list[str] = []
    bracket_depth = 0
    quote = ""  # quote character currently open, "" when outside a string
    escaped = False
    for char in spec:
        if quote:
            current.append(char)
            if escaped:
                escaped = False
            elif char == "\\" and quote == '"':
                # Backslash escapes only exist in basic ("...") strings.
                escaped = True
            elif char == quote:
                quote = ""
            continue
        if char in ('"', "'"):
            quote = char
            current.append(char)
        elif char == "[":
            bracket_depth += 1
            current.append(char)
        elif char == "]":
            bracket_depth -= 1
            current.append(char)
        elif char == "," and bracket_depth == 0:
            pieces.append("".join(current))
            current = []
        else:
            current.append(char)
    pieces.append("".join(current))

    fields: dict[str, str] = {}
    for piece in pieces:
        piece = piece.strip()
        if piece and "=" in piece:
            key, value = piece.split("=", 1)
            fields[key.strip()] = value.strip()
    return fields


def replace_workspace_deps_in_toml(toml_path: Path, workspace_deps: dict[str, object]) -> None:
    """Replace ``workspace = true`` dependency lines with explicit specs, in place.

    Two line shapes are rewritten for every workspace dependency:

    * ``name = { workspace = true }`` becomes the spec from
      ``format_dependency``.
    * ``name = { workspace = true, <extra fields> }`` becomes the workspace spec
      merged with the crate's own fields; the crate's fields win on conflict.

    Args:
        toml_path: Cargo.toml to rewrite (read and written as UTF-8).
        workspace_deps: Mapping of dependency name to workspace spec.
    """
    with open(toml_path, "r", encoding="utf-8") as f:
        content = f.read()

    for name, dep_spec in workspace_deps.items():
        explicit = format_dependency(name, dep_spec)
        escaped_name = re.escape(name)

        # A callable replacement inserts the text literally; a plain string
        # would be treated as a template and backslashes in it interpreted.
        bare_pattern = rf'^{escaped_name} = \{{ workspace = true \}}$'
        content = re.sub(
            bare_pattern,
            lambda _match, text=explicit: text,
            content,
            flags=re.MULTILINE,
        )

        if " = { " not in explicit:
            # Simple string dep like `ctor = "0.6"` - wrap it
            version_val = explicit.split(" = ", 1)[1].strip('"')
            inherited = f'version = "{version_val}"'
        else:
            inherited = explicit.split(" = { ", 1)[1].rstrip("} ").rstrip("}")
        workspace_fields = _split_inline_table_fields(inherited)

        def merge_fields(
            match: re.Match[str],
            dep_name: str = name,
            base: dict[str, str] = workspace_fields,
        ) -> str:
            # Merge: crate-specific fields override workspace fields
            crate_fields = _split_inline_table_fields(match.group(1).strip())
            merged = {**base, **crate_fields}
            merged_spec = ", ".join(f"{k} = {v}" for k, v in merged.items())
            return f"{dep_name} = {{ {merged_spec} }}"

        extras_pattern = rf'^{escaped_name} = \{{ workspace = true, (.+?) \}}$'
        content = re.sub(extras_pattern, merge_fields, content, flags=re.MULTILINE | re.DOTALL)

    with open(toml_path, "w", encoding="utf-8") as f:
        f.write(content)
def generate_vendor_cargo_toml(repo_root: Path, workspace_deps: dict[str, object], core_version: str, copied_crates: list[str]) -> None:
    """Write ``packages/r/vendor/Cargo.toml`` describing the vendored workspace.

    Args:
        repo_root: Repository root directory.
        workspace_deps: Workspace dependencies from the root Cargo.toml; they
            are written sorted by name.
        core_version: Version string placed in ``[workspace.package]``.
        copied_crates: Crates that were actually vendored; only these become
            workspace members.
    """
    known_members = ("kreuzberg", "kreuzberg-ffi", "kreuzberg-tesseract", "kreuzberg-paddle-ocr")
    members_str = ', '.join(f'"{crate}"' for crate in known_members if crate in copied_crates)

    deps_str = "\n".join(
        format_dependency(dep_name, workspace_deps[dep_name])
        for dep_name in sorted(workspace_deps)
    )

    vendor_toml = f'''[workspace]
members = [{members_str}]
[workspace.package]
version = "{core_version}"
edition = "2024"
rust-version = "1.91"
authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
license = "MIT"
repository = "https://github.com/kreuzberg-dev/kreuzberg"
homepage = "https://kreuzberg.dev"
[workspace.dependencies]
{deps_str}
'''

    target_dir = repo_root / "packages" / "r" / "vendor"
    target_dir.mkdir(parents=True, exist_ok=True)
    with open(target_dir / "Cargo.toml", "w") as out:
        out.write(vendor_toml)
def main() -> None:
    """Vendor the core crates into ``packages/r/vendor`` and rewire manifests.

    Steps, in order:
      1. Remove previously vendored crate directories and vendor/Cargo.toml.
      2. Copy the core crates from ``crates/``; a missing or uncopyable crate
         is reported and left out of ``copied_crates``.
      3. Delete build artifacts and editor temp files from the copies.
      4. Replace ``*.workspace = true`` package fields and workspace
         dependency references with explicit values.
      5. Point dependencies between vendored crates at their sibling copies.
      6. Generate vendor/Cargo.toml and copy the root Cargo.lock next to it.
      7. Repoint ``packages/r/src/rust/Cargo.toml`` at the vendored crates.

    All manifests are read and written as UTF-8 so the result does not depend
    on the runner's locale encoding.
    """
    repo_root: Path = get_repo_root()
    print("=== Vendoring kreuzberg core crate ===")

    workspace_deps: dict[str, object] = get_workspace_deps(repo_root)
    core_version: str = get_workspace_version(repo_root)
    print(f"Core version: {core_version}")
    print(f"Workspace dependencies: {len(workspace_deps)}")

    vendor_base: Path = repo_root / "packages" / "r" / "vendor"

    # Clean only crate directories
    crate_names = ["kreuzberg", "kreuzberg-ffi", "kreuzberg-tesseract",
                   "kreuzberg-paddle-ocr"]
    for name in crate_names:
        crate_path = vendor_base / name
        if crate_path.exists():
            shutil.rmtree(crate_path)

    # Also clean the vendor Cargo.toml (will be regenerated)
    vendor_cargo = vendor_base / "Cargo.toml"
    if vendor_cargo.exists():
        vendor_cargo.unlink()
    print("Cleaned vendor crate directories")

    vendor_base.mkdir(parents=True, exist_ok=True)

    crates_to_copy: list[tuple[str, str]] = [
        ("crates/kreuzberg", "kreuzberg"),
        ("crates/kreuzberg-ffi", "kreuzberg-ffi"),
        ("crates/kreuzberg-tesseract", "kreuzberg-tesseract"),
        ("crates/kreuzberg-paddle-ocr", "kreuzberg-paddle-ocr"),
    ]
    copied_crates: list[str] = []
    for src_rel, dest_name in crates_to_copy:
        src: Path = repo_root / src_rel
        dest: Path = vendor_base / dest_name
        if src.exists():
            try:
                shutil.copytree(src, dest)
                copied_crates.append(dest_name)
                print(f"Copied {dest_name}")
            except Exception as e:
                # Best effort: a crate that cannot be copied is skipped.
                print(f"Warning: Failed to copy {dest_name}: {e}", file=sys.stderr)
        else:
            print(f"Warning: Source directory not found: {src_rel}")

    artifact_dirs: list[str] = [".fastembed_cache", "target"]
    temp_patterns: list[str] = ["*.swp", "*.bak", "*.tmp", "*~"]
    for crate_dir in copied_crates:
        crate_path: Path = vendor_base / crate_dir
        if crate_path.exists():
            for artifact_dir in artifact_dirs:
                artifact: Path = crate_path / artifact_dir
                if artifact.exists():
                    shutil.rmtree(artifact)
            for pattern in temp_patterns:
                for f in crate_path.rglob(pattern):
                    f.unlink()
    print("Cleaned build artifacts")

    # Update workspace inheritance in Cargo.toml files
    for crate_dir in copied_crates:
        crate_toml = vendor_base / crate_dir / "Cargo.toml"
        if crate_toml.exists():
            with open(crate_toml, "r", encoding="utf-8") as f:
                content = f.read()
            content = re.sub(r'^version\.workspace = true$', f'version = "{core_version}"', content, flags=re.MULTILINE)
            content = re.sub(r'^edition\.workspace = true$', 'edition = "2024"', content, flags=re.MULTILINE)
            content = re.sub(r'^rust-version\.workspace = true$', 'rust-version = "1.91"', content, flags=re.MULTILINE)
            content = re.sub(r'^authors\.workspace = true$', 'authors = ["Na\'aman Hirschfeld <naaman@kreuzberg.dev>"]', content, flags=re.MULTILINE)
            content = re.sub(r'^license\.workspace = true$', 'license = "MIT"', content, flags=re.MULTILINE)
            with open(crate_toml, "w", encoding="utf-8") as f:
                f.write(content)
            replace_workspace_deps_in_toml(crate_toml, workspace_deps)
            print(f"Updated {crate_dir}/Cargo.toml")

    # Update path dependencies in all crates that depend on other vendored crates
    # First handle kreuzberg-ffi's dependency on kreuzberg
    if "kreuzberg-ffi" in copied_crates:
        ffi_toml = vendor_base / "kreuzberg-ffi" / "Cargo.toml"
        if ffi_toml.exists():
            with open(ffi_toml, "r", encoding="utf-8") as f:
                content = f.read()
            if "kreuzberg" in copied_crates:
                # Replace kreuzberg workspace references with path dependency
                # Handle cases with path, version, or neither
                content = re.sub(
                    r'(kreuzberg = \{) (?:(?:path|version) = "[^"]*", )?',
                    r'\1 path = "../kreuzberg", ',
                    content
                )
            with open(ffi_toml, "w", encoding="utf-8") as f:
                f.write(content)

    # Update path dependencies in kreuzberg crate if tesseract was copied
    if "kreuzberg" in copied_crates:
        kreuzberg_toml = vendor_base / "kreuzberg" / "Cargo.toml"
        if kreuzberg_toml.exists():
            with open(kreuzberg_toml, "r", encoding="utf-8") as f:
                content = f.read()
            # The optional `path = "...", ` prefix is required: the expanded
            # workspace spec lists `path` before `version`, and without it the
            # pattern never matches a path-based workspace dependency, leaving
            # the original (now wrong) relative path in place.
            # Only update tesseract path if it was actually copied
            if "kreuzberg-tesseract" in copied_crates:
                content = re.sub(
                    r'kreuzberg-tesseract = \{ (?:path = "[^"]*", )?version = "[^"]*", optional = true \}',
                    'kreuzberg-tesseract = { path = "../kreuzberg-tesseract", optional = true }',
                    content
                )
            # Only update paddle-ocr path if it was actually copied
            if "kreuzberg-paddle-ocr" in copied_crates:
                content = re.sub(
                    r'kreuzberg-paddle-ocr = \{ (?:path = "[^"]*", )?version = "[^"]*", optional = true \}',
                    'kreuzberg-paddle-ocr = { path = "../kreuzberg-paddle-ocr", optional = true }',
                    content
                )
            with open(kreuzberg_toml, "w", encoding="utf-8") as f:
                f.write(content)

    generate_vendor_cargo_toml(repo_root, workspace_deps, core_version, copied_crates)
    print("Generated vendor/Cargo.toml")

    # Copy root Cargo.lock so vendor workspace uses identical dependency versions
    root_lock = repo_root / "Cargo.lock"
    vendor_lock = vendor_base / "Cargo.lock"
    if root_lock.exists():
        shutil.copy2(root_lock, vendor_lock)
        print("Copied Cargo.lock to vendor directory")

    # Update R package Cargo.toml to use vendored crates
    r_toml = repo_root / "packages" / "r" / "src" / "rust" / "Cargo.toml"
    if r_toml.exists():
        with open(r_toml, "r", encoding="utf-8") as f:
            content = f.read()
        # Replace path dependencies to point to vendored crates
        # From: path = "../../../../crates/kreuzberg"
        # To: path = "../../vendor/kreuzberg"
        content = re.sub(
            r'path = "\.\./\.\./\.\./\.\./crates/kreuzberg"',
            'path = "../../vendor/kreuzberg"',
            content
        )
        content = re.sub(
            r'path = "\.\./\.\./\.\./\.\./crates/kreuzberg-ffi"',
            'path = "../../vendor/kreuzberg-ffi"',
            content
        )
        with open(r_toml, "w", encoding="utf-8") as f:
            f.write(content)
        print("Updated R package Cargo.toml to use vendored crates")

    print(f"\nVendoring complete (core version: {core_version})")
    print(f"Copied crates: {', '.join(sorted(copied_crates))}")
    if "kreuzberg" in copied_crates and "kreuzberg-ffi" in copied_crates:
        print("R package Cargo.toml uses:")
        print(" - path '../../vendor/kreuzberg' for kreuzberg crate")
        print(" - path '../../vendor/kreuzberg-ffi' for kreuzberg-ffi crate")
    else:
        print("Warning: Some required crates were not copied. Check for missing source directories.")


if __name__ == "__main__":
    # Any failure becomes a one-line error on stderr and exit status 1.
    try:
        main()
    except Exception as e:
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)

View File

@@ -0,0 +1,95 @@
#!/usr/bin/env bash
# Compile the Ruby native extension with verbose diagnostics.
#
# Flow: print toolchain and environment details, vendor the core crates with
# scripts/ci/ruby/vendor-kreuzberg-core.py, run `rake compile`, then list the
# build outputs and try to load the extension. A failed compile dumps the tail
# of mkmf.log and exits 1; a failed load only prints a warning.
#
# Environment:
#   REPO_ROOT  optional; overrides the repository root that is otherwise
#              derived from this script's location.
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="${REPO_ROOT:-$(cd "$SCRIPT_DIR/../../.." && pwd)}"

source "$REPO_ROOT/scripts/lib/common.sh"
source "$REPO_ROOT/scripts/lib/library-paths.sh"

validate_repo_root "$REPO_ROOT" || exit 1
setup_rust_ffi_paths "$REPO_ROOT"

echo "=== Compiling Ruby native extension (Verbose Debug) ==="
cd "$REPO_ROOT/packages/ruby"

export CARGO_BUILD_JOBS=1
export RUST_BACKTRACE=1
export RB_SYS_VERBOSE=1

echo ""
echo "=== Pre-compilation environment ==="
echo "Ruby version: $(ruby --version)"
echo "Ruby platform: $(ruby -e 'puts RUBY_PLATFORM')"
echo "Rustc version: $(rustc --version)"
echo "Cargo version: $(cargo --version)"
echo "Working directory: $(pwd)"

echo ""
echo "=== Build configuration variables ==="
echo "CARGO_BUILD_JOBS: ${CARGO_BUILD_JOBS}"
echo "RUST_BACKTRACE: ${RUST_BACKTRACE}"
echo "RB_SYS_VERBOSE: ${RB_SYS_VERBOSE}"
echo "LD_LIBRARY_PATH: ${LD_LIBRARY_PATH:-<not set>}"
echo "DYLD_LIBRARY_PATH: ${DYLD_LIBRARY_PATH:-<not set>}"

echo ""
echo "=== Pre-vendor directory state ==="
echo "packages/ruby directory contents:"
# `|| true`: under pipefail, `head` closing the pipe early can make `find`
# fail with SIGPIPE; a truncated diagnostic listing must not abort the build.
find . -maxdepth 1 \( -type f -o -type d \) | head -20 || true

echo ""
echo "=== Vendoring kreuzberg core ==="
python3 "$REPO_ROOT/scripts/ci/ruby/vendor-kreuzberg-core.py"

echo ""
echo "=== Post-vendor directory state ==="
# The vendoring script writes to packages/ruby/vendor, i.e. ./vendor from the
# current directory, so that is the directory to inspect here.
if [ -d "vendor" ]; then
  echo "Vendor directory contents:"
  find vendor -maxdepth 2 -type f | head -10 || true
else
  echo "WARNING: No vendor directory found in packages/ruby"
fi

echo ""
echo "=== Running rake compile with verbose output ==="
bundle exec rake compile --verbose --trace 2>&1 || {
  echo ""
  echo "ERROR: rake compile failed"
  echo "=== Attempting to capture compilation error details ==="
  if [ -f "mkmf.log" ]; then
    echo "=== mkmf.log (last 150 lines) ==="
    tail -150 mkmf.log
  fi
  echo ""
  echo "=== Looking for compiled artifacts ==="
  find . -type f \( -name "*.so" -o -name "*.dll" -o -name "*.dylib" -o -name "*.bundle" \) 2>/dev/null | head -20 || true
  echo ""
  echo "=== Checking gem installation ==="
  gem list kreuzberg || echo "Gem not found"
  exit 1
}

echo ""
echo "=== Post-compilation directory state ==="
echo "lib/ contents:"
if [ -d "lib" ]; then
  # `find` exits 0 even when nothing matches, so the "not found" message has
  # to be driven by the output rather than by the exit status. The name tests
  # are grouped so `-type f` applies to all of them; `*.bundle` covers the
  # extension suffix used on macOS.
  compiled_extensions="$(find lib -type f \( -name "*.so" -o -name "*.dll" -o -name "*.dylib" -o -name "*.bundle" \) 2>/dev/null || true)"
  if [ -n "$compiled_extensions" ]; then
    printf '%s\n' "$compiled_extensions"
  else
    echo "No compiled extension found"
  fi
else
  echo "ERROR: lib directory not found"
fi

echo ""
echo "=== Verifying extension can be loaded ==="
ruby -e "require_relative 'lib/kreuzberg'; puts 'Extension loaded successfully'" 2>&1 || {
  echo "WARNING: Could not load extension directly"
  echo "This might be expected if gem installation is required"
}

echo ""
echo "=== Compilation complete ==="

View File

@@ -0,0 +1,5 @@
#!/usr/bin/env bash
# Install Bundler, preferring the pinned 4.0.3 release and falling back to the
# latest available version if that install fails, then print the version.
set -euo pipefail

if ! gem install bundler -v 4.0.3 --no-document; then
  gem install bundler --no-document
fi

bundler --version

View File

@@ -0,0 +1,30 @@
#!/usr/bin/env bash
# Install the Ruby package's gems with Bundler.
#
# Environment:
#   REPO_ROOT       optional; overrides the derived repository root.
#   BUNDLE_PATH     optional; install location (default: packages/ruby/.bundle/bundle).
#   BUNDLE_GEMFILE  optional; if unset, a default is appended to GITHUB_ENV.
#   GITHUB_ENV      optional; when set, missing BUNDLE_* defaults are appended to it.
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="${REPO_ROOT:-$(cd "$SCRIPT_DIR/../../.." && pwd)}"

source "$REPO_ROOT/scripts/lib/common.sh"
validate_repo_root "$REPO_ROOT" || exit 1

echo "=== Installing Ruby dependencies ==="
cd "$REPO_ROOT/packages/ruby"

bundle_path="${BUNDLE_PATH:-$REPO_ROOT/packages/ruby/.bundle/bundle}"

# Append defaults to GITHUB_ENV only for variables the caller did not set.
if [[ -n "${GITHUB_ENV:-}" && -z "${BUNDLE_GEMFILE:-}" ]]; then
  printf '%s\n' "BUNDLE_GEMFILE=$REPO_ROOT/packages/ruby/Gemfile" >>"$GITHUB_ENV"
fi
if [[ -n "${GITHUB_ENV:-}" && -z "${BUNDLE_PATH:-}" ]]; then
  printf '%s\n' "BUNDLE_PATH=$bundle_path" >>"$GITHUB_ENV"
fi

bundle config set deployment false
bundle config set path "$bundle_path"
bundle install --jobs 4

echo "Ruby dependencies installed"

View File

@@ -0,0 +1,430 @@
#!/usr/bin/env python3
"""
Vendor kreuzberg core crate into Ruby package
Used by: ci-ruby.yaml - Vendor kreuzberg core crate step
This script:
1. Reads workspace.dependencies from root Cargo.toml
2. Copies core crates to packages/ruby/vendor/
3. Replaces workspace = true with explicit versions
4. Generates vendor/Cargo.toml with proper workspace setup
"""
import os
import sys
import shutil
import re
from pathlib import Path
try:
import tomllib
except ImportError:
import tomli as tomllib # type: ignore[import-not-found]
def get_repo_root() -> Path:
    """Locate the repository root.

    A non-empty ``REPO_ROOT`` environment variable takes precedence and is
    returned as given (it is not resolved). Otherwise the root is the
    directory three levels above the one containing this script.
    """
    override = os.environ.get("REPO_ROOT")
    if not override:
        here = Path(__file__).parent.absolute()
        return (here / ".." / ".." / "..").resolve()
    return Path(override)
def read_toml(path: Path) -> dict[str, object]:
"""Read TOML file."""
with open(path, "rb") as f:
return tomllib.load(f)
def get_workspace_deps(repo_root: Path) -> dict[str, object]:
    """Return the ``[workspace.dependencies]`` table of the root Cargo.toml.

    An empty dict is returned when the manifest has no ``workspace`` table or
    that table has no ``dependencies`` entry.
    """
    manifest = read_toml(repo_root / "Cargo.toml")
    workspace_table = manifest.get("workspace", {})
    return workspace_table.get("dependencies", {})
def get_workspace_version(repo_root: Path) -> str:
    """Return ``workspace.package.version`` from the root Cargo.toml.

    Falls back to ``"4.0.0"`` when any level of that lookup is missing.
    """
    manifest = read_toml(repo_root / "Cargo.toml")
    package_table = manifest.get("workspace", {}).get("package", {})
    return package_table.get("version", "4.0.0")
def format_dependency(name: str, dep_spec: object) -> str:
    """Render one dependency as a single Cargo.toml line.

    Args:
        name: Dependency key as it appears in the manifest.
        dep_spec: Either a plain version string or a table (dict) of fields.

    Returns:
        ``name = "<spec>"`` for non-dict specs, otherwise an inline table
        ``name = { ... }``. Fields are emitted in a fixed order: package, git,
        branch, tag, rev, path, version, features, default-features, optional.
        Keys outside that list are not emitted.
    """
    if not isinstance(dep_spec, dict):
        # Plain version strings (and any other scalar) are quoted verbatim.
        return f'{name} = "{dep_spec}"'

    fragments: list[str] = []

    # String-valued fields that are only written when non-empty.
    for key in ("package", "git", "branch", "tag", "rev", "path"):
        value = dep_spec.get(key)
        if value:
            fragments.append(f'{key} = "{value}"')

    version = dep_spec.get("version", "")
    if version:
        fragments.append(f'version = "{version}"')

    features = dep_spec.get("features", [])
    if features:
        quoted = ", ".join(f'"{feature}"' for feature in features)
        fragments.append(f"features = [{quoted}]")

    # Booleans are written only when explicitly True or False, never when absent.
    for flag in ("default-features", "optional"):
        setting = dep_spec.get(flag)
        if setting is True:
            fragments.append(f"{flag} = true")
        elif setting is False:
            fragments.append(f"{flag} = false")

    body = ", ".join(fragments)
    return f"{name} = {{ {body} }}"
def _split_inline_table_fields(spec: str) -> dict[str, str]:
    """Split the inside of a TOML inline table into ``{key: raw_value}``.

    A comma separates fields only when it sits outside ``[...]`` arrays and
    outside quoted strings, so ``features = ["a", "b"]`` and
    ``version = ">=1.2, <2"`` are kept whole. Pieces without ``=`` are ignored.
    """
    pieces: list[str] = []
    current: list[str] = []
    bracket_depth = 0
    quote = ""  # quote character currently open, "" when outside a string
    escaped = False
    for char in spec:
        if quote:
            current.append(char)
            if escaped:
                escaped = False
            elif char == "\\" and quote == '"':
                # Backslash escapes only exist in basic ("...") strings.
                escaped = True
            elif char == quote:
                quote = ""
            continue
        if char in ('"', "'"):
            quote = char
            current.append(char)
        elif char == "[":
            bracket_depth += 1
            current.append(char)
        elif char == "]":
            bracket_depth -= 1
            current.append(char)
        elif char == "," and bracket_depth == 0:
            pieces.append("".join(current))
            current = []
        else:
            current.append(char)
    pieces.append("".join(current))

    fields: dict[str, str] = {}
    for piece in pieces:
        piece = piece.strip()
        if piece and "=" in piece:
            key, value = piece.split("=", 1)
            fields[key.strip()] = value.strip()
    return fields


def replace_workspace_deps_in_toml(toml_path: Path, workspace_deps: dict[str, object]) -> None:
    """Replace ``workspace = true`` dependency lines with explicit specs, in place.

    Two line shapes are rewritten for every workspace dependency:

    * ``name = { workspace = true }`` becomes the spec from
      ``format_dependency``.
    * ``name = { workspace = true, <extra fields> }`` becomes the workspace spec
      merged with the crate's own fields; the crate's fields win on conflict.

    Args:
        toml_path: Cargo.toml to rewrite (read and written as UTF-8).
        workspace_deps: Mapping of dependency name to workspace spec.
    """
    with open(toml_path, "r", encoding="utf-8") as f:
        content = f.read()

    for name, dep_spec in workspace_deps.items():
        explicit = format_dependency(name, dep_spec)
        escaped_name = re.escape(name)

        # A callable replacement inserts the text literally; a plain string
        # would be treated as a template and backslashes in it interpreted.
        bare_pattern = rf'^{escaped_name} = \{{ workspace = true \}}$'
        content = re.sub(
            bare_pattern,
            lambda _match, text=explicit: text,
            content,
            flags=re.MULTILINE,
        )

        if " = { " not in explicit:
            # Simple string dep like `ctor = "0.6"` - wrap it
            version_val = explicit.split(" = ", 1)[1].strip('"')
            inherited = f'version = "{version_val}"'
        else:
            inherited = explicit.split(" = { ", 1)[1].rstrip("} ").rstrip("}")
        workspace_fields = _split_inline_table_fields(inherited)

        def merge_fields(
            match: re.Match[str],
            dep_name: str = name,
            base: dict[str, str] = workspace_fields,
        ) -> str:
            # Merge: crate-specific fields override workspace fields
            crate_fields = _split_inline_table_fields(match.group(1).strip())
            merged = {**base, **crate_fields}
            merged_spec = ", ".join(f"{k} = {v}" for k, v in merged.items())
            return f"{dep_name} = {{ {merged_spec} }}"

        extras_pattern = rf'^{escaped_name} = \{{ workspace = true, (.+?) \}}$'
        content = re.sub(extras_pattern, merge_fields, content, flags=re.MULTILINE | re.DOTALL)

    with open(toml_path, "w", encoding="utf-8") as f:
        f.write(content)
def generate_vendor_cargo_toml(repo_root: Path, workspace_deps: dict[str, object], core_version: str, copied_crates: list[str]) -> None:
    """Write ``packages/ruby/vendor/Cargo.toml`` describing the vendored workspace.

    Args:
        repo_root: Repository root directory.
        workspace_deps: Workspace dependencies from the root Cargo.toml; they
            are written sorted by name.
        core_version: Version string placed in ``[workspace.package]``.
        copied_crates: Crates that were actually vendored; only these become
            workspace members.
    """
    known_members = ("kreuzberg", "kreuzberg-ffi", "kreuzberg-tesseract", "kreuzberg-paddle-ocr", "rb-sys")
    members_str = ', '.join(f'"{crate}"' for crate in known_members if crate in copied_crates)

    deps_str = "\n".join(
        format_dependency(dep_name, workspace_deps[dep_name])
        for dep_name in sorted(workspace_deps)
    )

    vendor_toml = f'''[workspace]
members = [{members_str}]
[workspace.package]
version = "{core_version}"
edition = "2024"
rust-version = "1.91"
authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
license = "MIT"
repository = "https://github.com/kreuzberg-dev/kreuzberg"
homepage = "https://kreuzberg.dev"
[workspace.dependencies]
{deps_str}
'''

    target_dir = repo_root / "packages" / "ruby" / "vendor"
    target_dir.mkdir(parents=True, exist_ok=True)
    with open(target_dir / "Cargo.toml", "w") as out:
        out.write(vendor_toml)
def main() -> None:
    """Vendor the core crates into ``packages/ruby/vendor`` and rewire manifests.

    Steps, in order:
      1. Remove previously vendored crate directories and vendor/Cargo.toml,
         leaving everything else under vendor/ untouched.
      2. Copy the core crates (and ``vendor/rb-sys`` when present); a missing
         or uncopyable crate is reported and left out of ``copied_crates``.
      3. Delete build artifacts and editor temp files from the copies.
      4. Replace ``*.workspace = true`` package fields and workspace
         dependency references with explicit values.
      5. Point dependencies between vendored crates at their sibling copies.
      6. Generate vendor/Cargo.toml.
      7. Repoint ``ext/kreuzberg_rb/native/Cargo.toml`` at the vendored crates.

    All manifests are read and written as UTF-8 so the result does not depend
    on the runner's locale encoding.
    """
    repo_root: Path = get_repo_root()
    print("=== Vendoring kreuzberg core crate ===")

    workspace_deps: dict[str, object] = get_workspace_deps(repo_root)
    core_version: str = get_workspace_version(repo_root)
    print(f"Core version: {core_version}")
    print(f"Workspace dependencies: {len(workspace_deps)}")

    vendor_base: Path = repo_root / "packages" / "ruby" / "vendor"

    # Clean only crate directories, preserving vendor/bundle/ (Bundler gems)
    crate_names = ["kreuzberg", "kreuzberg-ffi", "kreuzberg-tesseract",
                   "kreuzberg-paddle-ocr", "rb-sys"]
    for name in crate_names:
        crate_path = vendor_base / name
        if crate_path.exists():
            shutil.rmtree(crate_path)

    # Also clean the vendor Cargo.toml (will be regenerated)
    vendor_cargo = vendor_base / "Cargo.toml"
    if vendor_cargo.exists():
        vendor_cargo.unlink()
    print("Cleaned vendor crate directories")

    vendor_base.mkdir(parents=True, exist_ok=True)

    crates_to_copy: list[tuple[str, str]] = [
        ("crates/kreuzberg", "kreuzberg"),
        ("crates/kreuzberg-ffi", "kreuzberg-ffi"),
        ("crates/kreuzberg-tesseract", "kreuzberg-tesseract"),
        ("crates/kreuzberg-paddle-ocr", "kreuzberg-paddle-ocr"),
        ("vendor/rb-sys", "rb-sys"),
    ]
    copied_crates: list[str] = []
    for src_rel, dest_name in crates_to_copy:
        src: Path = repo_root / src_rel
        dest: Path = vendor_base / dest_name
        if src.exists():
            try:
                shutil.copytree(src, dest)
                copied_crates.append(dest_name)
                print(f"Copied {dest_name}")
            except Exception as e:
                # Best effort: a crate that cannot be copied is skipped.
                print(f"Warning: Failed to copy {dest_name}: {e}", file=sys.stderr)
        else:
            print(f"Warning: Source directory not found: {src_rel}")

    artifact_dirs: list[str] = [".fastembed_cache", "target"]
    temp_patterns: list[str] = ["*.swp", "*.bak", "*.tmp", "*~"]
    for crate_dir in copied_crates:
        crate_path: Path = vendor_base / crate_dir
        if crate_path.exists():
            for artifact_dir in artifact_dirs:
                artifact: Path = crate_path / artifact_dir
                if artifact.exists():
                    shutil.rmtree(artifact)
            for pattern in temp_patterns:
                for f in crate_path.rglob(pattern):
                    f.unlink()
    print("Cleaned build artifacts")

    # Update workspace inheritance in Cargo.toml files
    for crate_dir in copied_crates:
        crate_toml = vendor_base / crate_dir / "Cargo.toml"
        if crate_toml.exists():
            with open(crate_toml, "r", encoding="utf-8") as f:
                content = f.read()
            content = re.sub(r'^version\.workspace = true$', f'version = "{core_version}"', content, flags=re.MULTILINE)
            content = re.sub(r'^edition\.workspace = true$', 'edition = "2024"', content, flags=re.MULTILINE)
            content = re.sub(r'^rust-version\.workspace = true$', 'rust-version = "1.91"', content, flags=re.MULTILINE)
            content = re.sub(r'^authors\.workspace = true$', 'authors = ["Na\'aman Hirschfeld <naaman@kreuzberg.dev>"]', content, flags=re.MULTILINE)
            content = re.sub(r'^license\.workspace = true$', 'license = "MIT"', content, flags=re.MULTILINE)
            with open(crate_toml, "w", encoding="utf-8") as f:
                f.write(content)
            replace_workspace_deps_in_toml(crate_toml, workspace_deps)
            print(f"Updated {crate_dir}/Cargo.toml")

    # Update path dependencies in kreuzberg-ffi crate
    if "kreuzberg-ffi" in copied_crates and "kreuzberg" in copied_crates:
        ffi_toml = vendor_base / "kreuzberg-ffi" / "Cargo.toml"
        if ffi_toml.exists():
            with open(ffi_toml, "r", encoding="utf-8") as f:
                content = f.read()
            # Replace kreuzberg workspace references with path dependency
            # Handle cases with path, version, or neither
            content = re.sub(
                r'(kreuzberg = \{) (?:(?:path|version) = "[^"]*", )?',
                r'\1 path = "../kreuzberg", ',
                content
            )
            with open(ffi_toml, "w", encoding="utf-8") as f:
                f.write(content)

    # Update path dependencies in kreuzberg crate if tesseract was copied
    if "kreuzberg" in copied_crates:
        kreuzberg_toml = vendor_base / "kreuzberg" / "Cargo.toml"
        if kreuzberg_toml.exists():
            with open(kreuzberg_toml, "r", encoding="utf-8") as f:
                content = f.read()
            # Only update tesseract path if it was actually copied
            if "kreuzberg-tesseract" in copied_crates:
                content = re.sub(
                    r'kreuzberg-tesseract = \{ (?:path = "[^"]*", )?version = "[^"]*", optional = true \}',
                    'kreuzberg-tesseract = { path = "../kreuzberg-tesseract", optional = true }',
                    content
                )
            # Only update paddle-ocr path if it was actually copied
            if "kreuzberg-paddle-ocr" in copied_crates:
                content = re.sub(
                    r'kreuzberg-paddle-ocr = \{ (?:path = "[^"]*", )?version = "[^"]*", optional = true \}',
                    'kreuzberg-paddle-ocr = { path = "../kreuzberg-paddle-ocr", optional = true }',
                    content
                )
            with open(kreuzberg_toml, "w", encoding="utf-8") as f:
                f.write(content)

    generate_vendor_cargo_toml(repo_root, workspace_deps, core_version, copied_crates)
    print("Generated vendor/Cargo.toml")

    # Update native extension Cargo.toml to use vendored crates
    native_toml = repo_root / "packages" / "ruby" / "ext" / "kreuzberg_rb" / "native" / "Cargo.toml"
    if native_toml.exists():
        with open(native_toml, "r", encoding="utf-8") as f:
            content = f.read()
        # Replace path dependencies to point to vendored crates
        # From: path = "../../../../../crates/kreuzberg"
        # To: path = "../../../vendor/kreuzberg"
        content = re.sub(
            r'path = "\.\./\.\./\.\./\.\./\.\./crates/kreuzberg"',
            'path = "../../../vendor/kreuzberg"',
            content
        )
        content = re.sub(
            r'path = "\.\./\.\./\.\./\.\./\.\./crates/kreuzberg-ffi"',
            'path = "../../../vendor/kreuzberg-ffi"',
            content
        )
        with open(native_toml, "w", encoding="utf-8") as f:
            f.write(content)
        print("Updated native extension Cargo.toml to use vendored crates")

    print(f"\nVendoring complete (core version: {core_version})")
    print(f"Copied crates: {', '.join(sorted(copied_crates))}")
    if "kreuzberg" in copied_crates and "kreuzberg-ffi" in copied_crates:
        print("Native extension Cargo.toml uses:")
        print(" - path '../../../vendor/kreuzberg' for kreuzberg crate")
        print(" - path '../../../vendor/kreuzberg-ffi' for kreuzberg-ffi crate")
        if "rb-sys" in copied_crates:
            print(" - path '../../../vendor/rb-sys' for rb-sys crate")
        else:
            print(" - rb-sys from crates.io")
    else:
        print("Warning: Some required crates were not copied. Check for missing source directories.")


if __name__ == "__main__":
    # Any failure becomes a one-line error on stderr and exit status 1.
    try:
        main()
    except Exception as e:
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)

View File

@@ -0,0 +1,19 @@
#!/usr/bin/env pwsh
<#
.SYNOPSIS
    Package the CLI binary as a zip archive (Windows).
.DESCRIPTION
    Used by ci-rust.yaml, "Package CLI (Windows)" step. Changes into
    target/<Target>/release and zips kreuzberg.exe into
    kreuzberg-cli-<Target>.zip three directories up, which is the directory
    the script was started from.
.PARAMETER Target
    Rust target triple, e.g. x86_64-pc-windows-msvc.
#>
param(
    [Parameter(Mandatory = $true)]
    [string]$Target
)

Set-StrictMode -Version Latest
$ErrorActionPreference = 'Stop'

Write-Host "=== Packaging CLI binary for $Target ==="

$archiveName = "kreuzberg-cli-$Target.zip"

Set-Location -Path "target/$Target/release"
Compress-Archive -Path 'kreuzberg.exe' -DestinationPath "../../../$archiveName"

Write-Host "Packaging complete: $archiveName"

103
scripts/ci/rust/run-unit-tests.sh Executable file
View File

@@ -0,0 +1,103 @@
#!/usr/bin/env bash
# Prepare the environment for the Rust unit tests: locate the repository,
# set up Tesseract language data, report the configuration and, when a
# prebuilt PDFium is supplied, extend the dynamic-library search paths.
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="${REPO_ROOT:-$(cd "$SCRIPT_DIR/../../.." && pwd)}"

source "$REPO_ROOT/scripts/lib/common.sh"
source "$REPO_ROOT/scripts/lib/tessdata.sh"

validate_repo_root "$REPO_ROOT" || exit 1
cd "$REPO_ROOT"

echo "=== Running Rust unit tests ==="
setup_tessdata

cat <<EOF
Test environment configuration:
 TESSDATA_PREFIX: ${TESSDATA_PREFIX:-not set}
 RUST_BACKTRACE: ${RUST_BACKTRACE:-not set}
 CARGO_TERM_COLOR: ${CARGO_TERM_COLOR:-not set}
Workspace information:
 Repository: $REPO_ROOT
 Excluded packages: kreuzberg-e2e-generator, kreuzberg-py, kreuzberg-node (+ benchmark-harness on Windows)
EOF

# Create and populate the tessdata directory if it is still missing.
if [[ ! -d "$TESSDATA_PREFIX" ]]; then
  echo "WARNING: TESSDATA_PREFIX directory not found: $TESSDATA_PREFIX"
  echo "Attempting to create it..."
  mkdir -p "$TESSDATA_PREFIX"
  ensure_tessdata "$TESSDATA_PREFIX"
fi

echo "Verifying Tesseract data files..."
for lang in eng osd; do
  langfile="$TESSDATA_PREFIX/${lang}.traineddata"
  if [[ ! -f "$langfile" ]]; then
    echo " WARNING: Missing ${lang}.traineddata"
    continue
  fi
  # BSD stat syntax first, GNU syntax second, "unknown" as a last resort.
  size=$(stat -f%z "$langfile" 2>/dev/null || stat -c%s "$langfile" 2>/dev/null || echo "unknown")
  echo "${lang}.traineddata (${size} bytes)"
done

# Make a prebuilt PDFium visible to the dynamic loader on Linux and macOS.
if [[ -n "${KREUZBERG_PDFIUM_PREBUILT:-}" ]]; then
  pdfium_lib_dir="${KREUZBERG_PDFIUM_PREBUILT}/lib"
  export LD_LIBRARY_PATH="${pdfium_lib_dir}:${LD_LIBRARY_PATH:-}"
  export DYLD_LIBRARY_PATH="${pdfium_lib_dir}:${DYLD_LIBRARY_PATH:-}"
  export DYLD_FALLBACK_LIBRARY_PATH="${pdfium_lib_dir}:${DYLD_FALLBACK_LIBRARY_PATH:-}"
  cat <<EOF
Library path configuration:
 LD_LIBRARY_PATH: $LD_LIBRARY_PATH
 DYLD_LIBRARY_PATH: $DYLD_LIBRARY_PATH
 DYLD_FALLBACK_LIBRARY_PATH: $DYLD_FALLBACK_LIBRARY_PATH
EOF
fi

echo "=== Starting cargo test ==="
# NOTE: the `kreuzberg` crate is tested with `--features full` instead of
# `--all-features`; the remaining workspace crates below use `--all-features`.
#
# mktemp gives an unpredictable log path instead of a guessable /tmp/...-$$ name.
TEST_LOG="$(mktemp "${TMPDIR:-/tmp}/cargo-test.XXXXXX")"

# The brace group is the first stage of a pipeline (so it runs in a subshell)
# and is part of an `if` condition, where `set -e` is suspended. Without the
# explicit `|| exit` after each cargo invocation, a failure of the first run
# would be masked whenever the second run succeeds.
if ! {
  # `--all-targets` runs --lib --bins --tests --examples --benches but excludes
  # `--doc`. 22 rustdoc examples in the kreuzberg crate currently reference
  # private items (extraction::capacity::estimate_content_capacity et al.) and
  # fail to compile. Tracking the cleanup separately; doc-test coverage is not
  # on the v5.0.0 publish path. TODO: re-enable doc tests once the failing
  # examples are rewritten against the public API.
  echo "=== cargo test -p kreuzberg --features full ==="
  RUST_BACKTRACE=full cargo test -p kreuzberg --features full --all-targets --verbose || exit

  echo "=== cargo test --workspace (all features, excluding kreuzberg) ==="
  extra_excludes=()
  if [[ "$OSTYPE" == "msys" || "$OSTYPE" == "cygwin" || "$OSTYPE" == "win32" ]]; then
    extra_excludes+=(--exclude benchmark-harness)
  fi
  # The ${arr[@]+...} form keeps an empty array safe under `set -u` on old bash.
  RUST_BACKTRACE=full cargo test \
    --workspace \
    --exclude kreuzberg \
    --exclude kreuzberg-e2e-generator \
    --exclude kreuzberg-py \
    --exclude kreuzberg-node \
    ${extra_excludes[@]+"${extra_excludes[@]}"} \
    --all-features \
    --all-targets \
    --verbose || exit
} 2>&1 | tee "$TEST_LOG"; then
  echo "=== Test execution failed ==="
  echo "Last 50 lines of test output:"
  tail -n 50 "$TEST_LOG"
  echo ""
  echo "Collecting diagnostic information..."
  echo "Disk space:"
  df -h . || du -h . 2>/dev/null | head -1
  echo "Cargo environment:"
  cargo --version
  rustc --version
  rm -f "$TEST_LOG"
  exit 1
fi

rm -f "$TEST_LOG"
echo "=== Tests complete ==="

View File

@@ -0,0 +1,9 @@
#!/usr/bin/env bash
# Print root-filesystem disk usage to stderr under an optional heading ($1).
set -euo pipefail

label="${1:-Disk space}"

{
  echo "=== ${label} ==="
  df -h /
  echo "Disk info:"
} >&2

# Byte-exact figures are best-effort: `df -B1` is not available everywhere.
df -B1 / | tail -1 >&2 || true

32
scripts/install-php-ext.sh Executable file
View File

@@ -0,0 +1,32 @@
#!/bin/bash
set -e

# Install the kreuzberg PHP extension to the system PHP extension directory
# Called from the before hook in alef.toml for PHP e2e tests
#
# Steps: locate the built library under target/release, copy it into PHP's
# extension_dir, then register it in the loaded php.ini (once).

EXTENSION_DIR=$(php -r 'echo ini_get("extension_dir");')

# Find the built extension. EXT_PATH is reset first so a value inherited from
# the environment can never be mistaken for a build artifact.
EXT_PATH=""
for path in target/release/libkreuzberg_php.dylib target/release/libkreuzberg_php.so target/release/kreuzberg_php.dll; do
  if [ -f "$path" ]; then
    EXT_PATH="$path"
    break
  fi
done

if [ -z "$EXT_PATH" ]; then
  echo "Error: PHP extension not found in target/release/" >&2
  exit 1
fi

# Copy to extension directory
EXT_FILENAME=$(basename "$EXT_PATH")
cp "$EXT_PATH" "$EXTENSION_DIR/$EXT_FILENAME"

# Add to php.ini if not already present.
# php_ini_loaded_file() yields an empty string when PHP runs without a php.ini;
# report that explicitly instead of failing on an empty redirect target.
PHP_INI=$(php -r 'echo php_ini_loaded_file();')
if [ -z "$PHP_INI" ]; then
  echo "Error: PHP has no loaded php.ini; cannot register extension=$EXT_FILENAME" >&2
  exit 1
fi

# -F: the filename contains dots, which must not be treated as regex wildcards.
if ! grep -qF -- "extension=$EXT_FILENAME" "$PHP_INI"; then
  echo "extension=$EXT_FILENAME" >>"$PHP_INI"
fi

echo "Installed PHP extension: $EXT_FILENAME to $EXTENSION_DIR"

178
scripts/install.sh Executable file
View File

@@ -0,0 +1,178 @@
#!/usr/bin/env bash
# Kreuzberg CLI installer
# Usage: curl -fsSL https://kreuzberg.dev/install.sh | bash
#
# Environment variables:
# KREUZBERG_VERSION - Specific version to install (default: latest)
# KREUZBERG_INSTALL - Installation directory (default: ~/.kreuzberg/bin or /usr/local/bin)
set -euo pipefail
REPO="kreuzberg-dev/kreuzberg"
BINARY_NAME="kreuzberg"
# --- Helpers ---
# Print an informational message (all arguments joined) in bold blue on stdout.
info() {
  local text="$*"
  printf '\033[1;34m%s\033[0m\n' "$text"
}
# Print a warning (all arguments joined) in bold yellow on stderr.
warn() {
  local text="$*"
  printf '\033[1;33m%s\033[0m\n' "$text" >&2
}
# Print "error: <message>" in bold red on stderr and terminate with status 1.
error() {
  local text="$*"
  printf '\033[1;31merror: %s\033[0m\n' "$text" >&2
  exit 1
}
# Abort via error() unless the command named by $1 can be found.
need_cmd() {
  command -v "$1" >/dev/null 2>&1 && return 0
  error "need '$1' (command not found)"
}
# --- Detect platform ---
# Print the normalized operating system name ("linux" or "darwin") derived
# from `uname -s`; any other kernel is reported through error().
detect_os() {
  local kernel
  kernel="$(uname -s)"
  if [[ "$kernel" == Linux* ]]; then
    echo "linux"
  elif [[ "$kernel" == Darwin* ]]; then
    echo "darwin"
  else
    error "unsupported OS: $kernel"
  fi
}
# Print the normalized CPU architecture ("x86_64" or "aarch64") derived from
# `uname -m`; anything else is reported through error().
detect_arch() {
  local machine
  machine="$(uname -m)"
  if [[ "$machine" == "x86_64" || "$machine" == "amd64" ]]; then
    echo "x86_64"
  elif [[ "$machine" == "aarch64" || "$machine" == "arm64" ]]; then
    echo "aarch64"
  else
    error "unsupported architecture: $machine"
  fi
}
# Print the Rust target triple of the release artifact for this host, built
# from detect_os and detect_arch. Linux hosts get the musl builds.
detect_target() {
  local kernel cpu triple
  kernel="$(detect_os)"
  cpu="$(detect_arch)"
  case "${kernel}-${cpu}" in
    linux-x86_64 | linux-aarch64)
      triple="${cpu}-unknown-linux-musl"
      ;;
    darwin-x86_64 | darwin-aarch64)
      # NOTE(review): x86_64 macOS is mapped to the arm64 build ("Rosetta
      # compatible" in the original). Rosetta runs x86_64 code on Apple
      # Silicon, not the reverse, so this presumably only works when the
      # shell itself runs translated on an arm64 Mac; confirm for Intel Macs.
      triple="aarch64-apple-darwin"
      ;;
    *)
      error "unsupported platform: ${kernel}-${cpu}"
      ;;
  esac
  echo "$triple"
}
# --- Version resolution ---
# Print the newest release tag that starts with "v", taken from the 20 most
# recent GitHub releases of ${REPO}. Aborts through error() when none is found.
get_latest_version() {
  need_cmd curl

  # List recent releases and pick the first tag starting with "v" (skip benchmark runs etc.)
  local releases_api="https://api.github.com/repos/${REPO}/releases?per_page=20"
  local newest
  newest="$(
    curl -fsSL "$releases_api" |
      grep '"tag_name"' |
      sed 's/.*"tag_name":[[:space:]]*"\([^"]*\)".*/\1/' |
      grep '^v' |
      head -1 || true
  )"

  [ -n "$newest" ] || error "failed to fetch latest release tag from GitHub"
  echo "$newest"
}
# --- Download and install ---
# Download the release archive for this host and install the CLI.
#
# Reads:   KREUZBERG_VERSION (optional; "v" prefix added when missing),
#          KREUZBERG_INSTALL (optional install directory), HOME, PATH.
# Writes:  ${install_dir}/${BINARY_NAME}, optionally ${BINARY_NAME}.bin and
#          ${install_dir}/lib/*; sets the global `tmpdir` and an EXIT trap.
# Exits non-zero (via error() or `set -e`) when a step fails.
install() {
  need_cmd curl
  need_cmd tar

  local target version install_dir
  # detect_target validates OS and architecture itself, so no separate
  # detect_os / detect_arch calls are needed here.
  target="$(detect_target)"

  if [ -n "${KREUZBERG_VERSION:-}" ]; then
    version="${KREUZBERG_VERSION}"
    # Ensure 'v' prefix
    case "$version" in
      v*) ;;
      *) version="v${version}" ;;
    esac
  else
    info "Fetching latest release..."
    version="$(get_latest_version)"
  fi
  info "Installing kreuzberg ${version} for ${target}"

  # Determine install directory: explicit override, system-wide for root,
  # per-user otherwise.
  if [ -n "${KREUZBERG_INSTALL:-}" ]; then
    install_dir="${KREUZBERG_INSTALL}"
  elif [ "$(id -u)" -eq 0 ]; then
    install_dir="/usr/local/bin"
  else
    install_dir="${HOME}/.kreuzberg/bin"
  fi
  mkdir -p "$install_dir"

  # Download
  local artifact="kreuzberg-cli-${target}.tar.gz"
  local url="https://github.com/${REPO}/releases/download/${version}/${artifact}"
  info "Downloading ${url}"

  # tmpdir must stay global: the EXIT trap expands it after this function
  # (and its locals) are gone.
  tmpdir="$(mktemp -d)"
  trap 'rm -rf "$tmpdir"' EXIT
  curl -fsSL "$url" -o "${tmpdir}/${artifact}"

  # Extract
  tar -xzf "${tmpdir}/${artifact}" -C "$tmpdir"

  # Install binary
  local stage_dir="${tmpdir}/kreuzberg-cli-${target}"
  local binary_path="${stage_dir}/${BINARY_NAME}"
  if [ ! -f "$binary_path" ]; then
    error "binary not found in archive at ${binary_path}"
  fi
  cp "$binary_path" "${install_dir}/${BINARY_NAME}"
  chmod +x "${install_dir}/${BINARY_NAME}"

  # Install the actual binary (musl builds use wrapper + .bin)
  if [ -f "${stage_dir}/${BINARY_NAME}.bin" ]; then
    cp "${stage_dir}/${BINARY_NAME}.bin" "${install_dir}/${BINARY_NAME}.bin"
    chmod +x "${install_dir}/${BINARY_NAME}.bin"
  fi

  # Install bundled runtime libraries (musl builds only).
  # compgen -G tests the exact glob that cp uses below, so a lib/ directory
  # holding only dotfiles no longer makes cp fail on a literal "*".
  if [ -d "${stage_dir}/lib" ] && compgen -G "${stage_dir}/lib/*" >/dev/null; then
    mkdir -p "${install_dir}/lib"
    cp "${stage_dir}/lib/"* "${install_dir}/lib/"
    info "Installed runtime libraries to ${install_dir}/lib/"
  fi

  info "Installed ${BINARY_NAME} to ${install_dir}/${BINARY_NAME}"

  # Verify (a failing --version only warns; the files stay installed)
  if "${install_dir}/${BINARY_NAME}" --version >/dev/null 2>&1; then
    info "Verified: $("${install_dir}/${BINARY_NAME}" --version)"
  else
    warn "Binary installed but --version check failed"
  fi

  # PATH hint
  case ":${PATH}:" in
    *":${install_dir}:"*) ;;
    *)
      warn ""
      warn "Add ${install_dir} to your PATH:"
      warn ""
      warn " export PATH=\"${install_dir}:\$PATH\""
      warn ""
      warn "Add this to your shell profile (~/.bashrc, ~/.zshrc, etc.) to make it permanent."
      ;;
  esac
}
install

70
scripts/lib/common.sh Executable file
View File

@@ -0,0 +1,70 @@
#!/usr/bin/env bash
set -euo pipefail
# Walk upwards from the current directory and print the first directory that
# contains a Cargo.toml. The filesystem root itself is not inspected.
# Returns 1 (with a message on stderr) when nothing is found.
get_repo_root() {
  local origin probe
  origin="$(pwd)"
  probe="$origin"

  until [ "$probe" = "/" ]; do
    if [ -f "$probe/Cargo.toml" ]; then
      echo "$probe"
      return 0
    fi
    probe="$(dirname "$probe")"
  done

  echo "Error: Could not find repository root (Cargo.toml) from: $origin" >&2
  return 1
}
# Check that the repository root ($1, falling back to $REPO_ROOT) is set and
# contains a Cargo.toml. Returns 0 when valid, 1 with a diagnostic otherwise.
validate_repo_root() {
  local candidate="${1:-${REPO_ROOT:-}}"

  if [ -z "$candidate" ]; then
    echo "Error: REPO_ROOT not provided and env var not set" >&2
    return 1
  fi

  [ -f "$candidate/Cargo.toml" ] && return 0

  echo "Error: REPO_ROOT validation failed. Expected Cargo.toml at: $candidate/Cargo.toml" >&2
  echo "REPO_ROOT resolved to: $candidate" >&2
  return 1
}
# Print "Error: <message>" on stderr and exit.
#   $1 - message (default "Unknown error")
#   $2 - exit status (default 1)
error_exit() {
  local text="${1:-Unknown error}" status="${2:-1}"
  echo "Error: $text" >&2
  exit "$status"
}
# Print the platform name: $RUNNER_OS verbatim when it is set, otherwise
# "Linux", "macOS", "Windows" or "unknown" derived from `uname -s`.
get_platform() {
  if [ -n "${RUNNER_OS:-}" ]; then
    echo "$RUNNER_OS"
    return
  fi

  local kernel
  kernel="$(uname -s)" || true
  if [[ "$kernel" == Linux* ]]; then
    echo "Linux"
  elif [[ "$kernel" == Darwin* ]]; then
    echo "macOS"
  elif [[ "$kernel" == MINGW* || "$kernel" == MSYS* || "$kernel" == CYGWIN* ]]; then
    echo "Windows"
  else
    echo "unknown"
  fi
}
export -f get_repo_root
export -f validate_repo_root
export -f error_exit
export -f get_platform

197
scripts/lib/library-paths.sh Executable file
View File

@@ -0,0 +1,197 @@
#!/usr/bin/env bash
set -euo pipefail
# Print the PATH-list separator for a platform name ($1, default `uname -s`):
# ";" for Windows-like names, ":" for everything else.
_get_path_separator() {
  local os_name="${1:-$(uname -s)}"
  if [[ "$os_name" == MINGW* || "$os_name" == MSYS* || "$os_name" == CYGWIN* || "$os_name" == "Windows" ]]; then
    echo ";"
  else
    echo ":"
  fi
}
# Prepend $ORT_LIB_LOCATION to the platform's dynamic-library search variable
# (LD_LIBRARY_PATH, DYLD_*_LIBRARY_PATH or PATH). Does nothing when
# ORT_LIB_LOCATION is empty or the platform is not recognized.
setup_onnx_paths() {
  local onnx_dir="${ORT_LIB_LOCATION:-}"
  if [ -z "$onnx_dir" ]; then
    return 0
  fi

  local os_name="${RUNNER_OS:-$(uname -s)}"
  case "$os_name" in
    Linux)
      export LD_LIBRARY_PATH="${onnx_dir}:${LD_LIBRARY_PATH:-}"
      echo "✓ Set LD_LIBRARY_PATH for ONNX Runtime"
      ;;
    macOS | Darwin)
      export DYLD_LIBRARY_PATH="${onnx_dir}:${DYLD_LIBRARY_PATH:-}"
      export DYLD_FALLBACK_LIBRARY_PATH="${onnx_dir}:${DYLD_FALLBACK_LIBRARY_PATH:-}"
      echo "✓ Set DYLD_LIBRARY_PATH for ONNX Runtime on macOS"
      ;;
    Windows | MINGW* | MSYS* | CYGWIN*)
      export PATH="${onnx_dir};${PATH:-}"
      echo "✓ Set PATH for ONNX Runtime on Windows"
      ;;
  esac
}
# Prepend the Rust build output directories under the repository root
# ($1, falling back to $REPO_ROOT) to the platform's library search variable.
# Directories that do not exist are skipped; an empty root is a no-op.
setup_rust_ffi_paths() {
  local root="${1:-${REPO_ROOT:-}}"
  if [ -z "$root" ]; then
    return 0
  fi

  local release_dir="$root/target/release"
  local gnu_release_dir="$root/target/x86_64-pc-windows-gnu/release"
  local os_name="${RUNNER_OS:-$(uname -s)}"

  case "$os_name" in
    Linux)
      if [ ! -d "$release_dir" ]; then
        return 0
      fi
      export LD_LIBRARY_PATH="${release_dir}:${LD_LIBRARY_PATH:-}"
      echo "✓ Set LD_LIBRARY_PATH for Rust FFI"
      ;;
    macOS | Darwin)
      if [ ! -d "$release_dir" ]; then
        return 0
      fi
      export DYLD_LIBRARY_PATH="${release_dir}:${DYLD_LIBRARY_PATH:-}"
      export DYLD_FALLBACK_LIBRARY_PATH="${release_dir}:${DYLD_FALLBACK_LIBRARY_PATH:-}"
      echo "✓ Set DYLD_LIBRARY_PATH for Rust FFI on macOS"
      ;;
    Windows | MINGW* | MSYS* | CYGWIN*)
      # Check for short path CI directories first
      local custom_target="${CARGO_TARGET_DIR:-}"
      if [ -n "$custom_target" ] && [ -d "$custom_target/release" ]; then
        export PATH="${custom_target}/release;${PATH:-}"
        echo "✓ Set PATH for Rust FFI (using CARGO_TARGET_DIR=$custom_target)"
      fi
      # Add GNU target path if it exists
      if [ -d "$gnu_release_dir" ]; then
        export PATH="${gnu_release_dir};${PATH:-}"
        echo "✓ Set PATH for Rust FFI GNU target"
      fi
      # Add standard target path if it exists (prepended last, so it is searched first)
      if [ -d "$release_dir" ]; then
        export PATH="${release_dir};${PATH:-}"
        echo "✓ Set PATH for Rust FFI on Windows"
      fi
      ;;
  esac
}
# Return 0 when pkg-config can resolve kreuzberg-ffi; otherwise print a
# three-line diagnostic (including PKG_CONFIG_PATH) on stderr and return 1.
verify_pkg_config() {
  pkg-config --exists kreuzberg-ffi 2>/dev/null && return 0

  printf '%s\n' \
    "Error: pkg-config cannot find kreuzberg-ffi" \
    "PKG_CONFIG_PATH=${PKG_CONFIG_PATH:-<not set>}" \
    "Run 'pkg-config --list-all' to see available packages" >&2
  return 1
}
# Export the cgo environment for Windows builds rooted at $1 (falling back to
# $REPO_ROOT): PKG_CONFIG_PATH, PATH, CGO_ENABLED, CGO_CFLAGS and CGO_LDFLAGS.
# An empty root is a no-op.
setup_go_paths_windows() {
  local root="${1:-${REPO_ROOT:-}}"
  if [ -z "$root" ]; then
    return 0
  fi

  local ffi_crate="${root}/crates/kreuzberg-ffi"
  local gnu_dir="${root}/target/x86_64-pc-windows-gnu/release"
  local release_dir="${root}/target/release"

  export PKG_CONFIG_PATH="${ffi_crate}:${PKG_CONFIG_PATH:-}"
  export PATH="${gnu_dir};${release_dir};${PATH:-}"
  export CGO_ENABLED=1
  export CGO_CFLAGS="-I${ffi_crate}/include"
  export CGO_LDFLAGS="-L${gnu_dir} -L${release_dir} -lkreuzberg_ffi -static-libgcc -static-libstdc++"

  echo "✓ Configured Go cgo environment for Windows"
}
# NOTE: CGO_LDFLAGS is set by setup-go-cgo-env action on Windows in CI, or by this script on Unix
#
# Configure the Go cgo environment for the repository at $1 (falling back to
# $REPO_ROOT). An empty root is a no-op.
#
# - Generates crates/kreuzberg-ffi/kreuzberg-ffi.pc when it does not exist,
#   using the first `version = "..."` line of the root Cargo.toml
#   ("unknown" when none is found).
# - Exports PKG_CONFIG_PATH, CGO_ENABLED, CGO_CFLAGS and, per platform, the
#   library search path and CGO_LDFLAGS.
setup_go_paths() {
  local repo_root="${1:-${REPO_ROOT:-}}"
  [ -z "$repo_root" ] && return 0

  local platform="${RUNNER_OS:-$(uname -s)}"
  local pc_path="${repo_root}/crates/kreuzberg-ffi/kreuzberg-ffi.pc"

  if [ ! -f "$pc_path" ]; then
    local version=""
    # Inside single quotes sed needs single backslashes: \( \) form the capture
    # group and \1 references it. The previous doubled backslashes matched
    # literal "\(" text, so the version was never extracted.
    version="$(sed -n 's/^version = "\([^"]*\)".*/\1/p' "${repo_root}/Cargo.toml" | head -n 1 || true)"
    [ -z "$version" ] && version="unknown"

    local libs_private=""
    case "$platform" in
      Linux)
        libs_private="-lpthread -ldl -lm"
        ;;
      macOS | Darwin)
        libs_private="-framework CoreFoundation -framework Security -lpthread"
        ;;
      Windows | MINGW* | MSYS* | CYGWIN*)
        libs_private="-lws2_32 -luserenv -lbcrypt"
        ;;
    esac

    mkdir -p "$(dirname "$pc_path")"
    # \${...} stays literal so pkg-config expands it; ${...} is filled in now.
    cat >"$pc_path" <<EOF
prefix=${repo_root}
exec_prefix=\${prefix}
libdir=${repo_root}/target/release
includedir=${repo_root}/crates/kreuzberg-ffi
Name: kreuzberg-ffi
Description: C FFI bindings for Kreuzberg document intelligence library
Version: ${version}
URL: https://kreuzberg.dev
Libs: -L\${libdir} -lkreuzberg_ffi
Libs.private: ${libs_private}
Cflags: -I\${includedir}
EOF
  fi

  export PKG_CONFIG_PATH="${repo_root}/crates/kreuzberg-ffi:${PKG_CONFIG_PATH:-}"
  export CGO_ENABLED=1
  export CGO_CFLAGS="-I${repo_root}/crates/kreuzberg-ffi/include"

  case "$platform" in
    Linux)
      export LD_LIBRARY_PATH="${repo_root}/target/release:${LD_LIBRARY_PATH:-}"
      export CGO_LDFLAGS="-L${repo_root}/target/release -lkreuzberg_ffi -Wl,-rpath,${repo_root}/target/release"
      ;;
    macOS | Darwin)
      export DYLD_LIBRARY_PATH="${repo_root}/target/release:${DYLD_LIBRARY_PATH:-}"
      export DYLD_FALLBACK_LIBRARY_PATH="${repo_root}/target/release:${DYLD_FALLBACK_LIBRARY_PATH:-}"
      export CGO_LDFLAGS="-L${repo_root}/target/release -lkreuzberg_ffi -Wl,-rpath,${repo_root}/target/release"
      ;;
    Windows | MINGW* | MSYS* | CYGWIN*)
      # Leave CGO_LDFLAGS alone when it is already set or when GITHUB_ENV is present.
      if [ -z "${CGO_LDFLAGS:-}" ] && [ -z "${GITHUB_ENV:-}" ]; then
        # Only set library search path; ffi.go CGO directives handle -l flags
        # This matches the approach in setup-go-cgo-env/windows.ps1
        export CGO_LDFLAGS="-L${repo_root}/target/x86_64-pc-windows-gnu/release -L${repo_root}/target/release"
      fi
      ;;
  esac

  echo "✓ Configured Go cgo environment"
}
# Run every library-path setup step (ONNX Runtime, Rust FFI, Go cgo) for the
# repository root given as $1, falling back to $REPO_ROOT.
setup_all_library_paths() {
  local root="${1:-${REPO_ROOT:-}}"

  echo "Setting up library paths..."
  setup_onnx_paths
  setup_rust_ffi_paths "$root"
  setup_go_paths "$root"
  echo "✓ All library paths configured"
}
export -f setup_onnx_paths
export -f setup_rust_ffi_paths
export -f verify_pkg_config
export -f setup_go_paths_windows
export -f setup_go_paths
export -f setup_all_library_paths
export -f _get_path_separator

85
scripts/lib/retry.sh Executable file
View File

@@ -0,0 +1,85 @@
#!/usr/bin/env bash
set -euo pipefail
# Run a command with a time limit.
#   $1   - limit in seconds
#   $2.. - command and arguments
# Uses `timeout`, then `gtimeout`, then a python3 fallback (exit status 124 on
# expiry); when none is available the command runs without any limit.
run_with_timeout() {
  local limit="$1"
  shift

  local tool
  for tool in timeout gtimeout; do
    if command -v "$tool" >/dev/null 2>&1; then
      "$tool" "${limit}" "$@"
      return $?
    fi
  done

  if command -v python3 >/dev/null 2>&1; then
    python3 - "$limit" "$@" <<'PY'
import subprocess
import sys

timeout_s = int(sys.argv[1])
cmd = sys.argv[2:]
try:
    completed = subprocess.run(cmd, timeout=timeout_s)
    sys.exit(completed.returncode)
except subprocess.TimeoutExpired:
    sys.exit(124)
PY
    return $?
  fi

  # No timeout facility available: run the command unbounded.
  "$@"
}
# Run the given command up to three times, sleeping 5s and then 10s between
# attempts. Returns 0 on the first success, 1 when every attempt fails.
retry_with_backoff() {
  local -r retry_limit=3
  local retry_no retry_wait=5

  for ((retry_no = 1; retry_no <= retry_limit; retry_no++)); do
    "$@" && return 0

    # No pause after the final attempt.
    if ((retry_no < retry_limit)); then
      echo "⚠ Attempt $retry_no failed, retrying in ${retry_wait}s..." >&2
      sleep $retry_wait
      retry_wait=$((retry_wait * 2))
    fi
  done

  return 1
}
# Like retry_with_backoff, but each attempt is bounded by run_with_timeout.
#   $1   - per-attempt limit in seconds
#   $2.. - command and arguments
# Returns 0 on success, otherwise the exit status of the last failed attempt.
retry_with_backoff_timeout() {
  local seconds="$1"
  shift

  local -r retry_limit=3
  local retry_no retry_wait=5 last_status=1

  for ((retry_no = 1; retry_no <= retry_limit; retry_no++)); do
    last_status=0
    run_with_timeout "$seconds" "$@" || last_status=$?
    if [ "$last_status" -eq 0 ]; then
      return 0
    fi

    # No pause after the final attempt.
    if ((retry_no < retry_limit)); then
      echo "⚠ Attempt $retry_no failed (exit $last_status), retrying in ${retry_wait}s..." >&2
      sleep $retry_wait
      retry_wait=$((retry_wait * 2))
    fi
  done

  return $last_status
}
export -f run_with_timeout
export -f retry_with_backoff
export -f retry_with_backoff_timeout

157
scripts/lib/tessdata.sh Executable file
View File

@@ -0,0 +1,157 @@
#!/usr/bin/env bash
set -euo pipefail
# Print the size of the regular file $1 in bytes, or 0 when it does not exist.
# Tries GNU `stat -c%s` first and falls back to BSD `stat -f%z`.
file_size_bytes() {
  local target="$1" bytes

  if [ ! -f "$target" ]; then
    echo 0
    return
  fi

  if bytes="$(stat -c%s "$target" 2>/dev/null)"; then
    echo "$bytes"
    return
  fi

  stat -f%z "$target"
}
# Print the minimum plausible size in bytes of <lang>.traineddata:
# 1000000 for eng and deu, 100000 for osd and every other language.
min_traineddata_size_bytes() {
  local code="$1"
  if [ "$code" = "eng" ] || [ "$code" = "deu" ]; then
    echo 1000000
  else
    echo 100000
  fi
}
# Download <lang>.traineddata and move it into place once it is large enough.
#   $1 - language code (selects the minimum size via min_traineddata_size_bytes)
#   $2 - destination file; "<dest>.tmp" is used as a scratch file
#   $3 - download URL
# Makes up to five attempts, sleeping <attempt> seconds between them.
# Returns 0 on success, 1 when no attempt produced a large-enough file.
download_traineddata() {
  local lang="$1"
  local dest="$2"
  local url="$3"
  local tmp="${dest}.tmp"
  local -r max_attempts=5
  # `attempt` is local so it cannot clobber a variable of the sourcing script.
  local min_size size attempt
  min_size="$(min_traineddata_size_bytes "$lang")"

  rm -f "$tmp"
  for ((attempt = 1; attempt <= max_attempts; attempt++)); do
    if curl -fsSL --retry 5 --retry-delay 5 --retry-all-errors "$url" -o "$tmp"; then
      size="$(file_size_bytes "$tmp")"
      if [ "$size" -ge "$min_size" ]; then
        mv -f "$tmp" "$dest"
        return 0
      fi
      echo "Downloaded ${lang}.traineddata too small (${size} bytes < ${min_size}), retrying..." >&2
    else
      echo "Failed to download ${lang}.traineddata (attempt ${attempt}), retrying..." >&2
    fi
    rm -f "$tmp"
    # Back off only when another attempt follows.
    if [ "$attempt" -lt "$max_attempts" ]; then
      sleep "$attempt"
    fi
  done

  echo "ERROR: Failed to download valid ${lang}.traineddata after retries" >&2
  return 1
}
# Make sure <dest_dir>/<lang>.traineddata exists and is at least the minimum
# size for that language; otherwise remove any undersized file and download
# a fresh copy.
#   $1 - destination directory   $2 - language code   $3 - download URL
ensure_valid_traineddata() {
  local dest_dir="$1" lang="$2" url="$3"
  local target="${dest_dir}/${lang}.traineddata"
  local required actual
  required="$(min_traineddata_size_bytes "$lang")"
  actual="$(file_size_bytes "$target")"

  if ! [ "$actual" -ge "$required" ]; then
    if [ -f "$target" ]; then
      echo "Invalid ${lang}.traineddata at ${target} (${actual} bytes < ${required}); re-downloading..." >&2
      rm -f "$target"
    fi
    download_traineddata "$lang" "$target" "$url"
  fi
}
# Populate the tessdata directory $1 with language data.
#
# 1. Creates the directory.
# 2. Copies eng/osd/deu from the first known system tessdata directory that
#    contains eng.traineddata (skipped when that directory is $1 itself).
# 3. Validates eng and osd, downloading them from tessdata_fast when they are
#    missing or undersized. deu is copied opportunistically, never downloaded.
ensure_tessdata() {
  local dest="$1"
  mkdir -p "$dest"

  # Physical paths, so a symlinked destination is recognized as the same place.
  local dest_real
  dest_real="$(cd "$dest" && pwd -P)"

  local candidates=(
    "/opt/homebrew/share/tessdata"
    "/usr/local/opt/tesseract/share/tessdata"
    "/usr/share/tesseract-ocr/5/tessdata"
  )
  if [ -n "${PROGRAMFILES:-}" ] && command -v cygpath >/dev/null 2>&1; then
    candidates+=("$(cygpath -u "$PROGRAMFILES")/Tesseract-OCR/tessdata")
  fi
  if [ -d "/c/Program Files/Tesseract-OCR/tessdata" ]; then
    candidates+=("/c/Program Files/Tesseract-OCR/tessdata")
  fi

  # Loop variables are local: this file is sourced, and callers use their own
  # `lang` / `dir` variables.
  local dir lang dir_real
  for dir in "${candidates[@]}"; do
    [ -f "$dir/eng.traineddata" ] || continue

    dir_real="$(cd "$dir" && pwd -P)"
    if [ "$dir_real" = "$dest_real" ]; then
      break
    fi

    for lang in eng osd deu; do
      [ -f "$dir/$lang.traineddata" ] || continue
      # -ef: source and destination are the same file (hard link / symlink).
      if [ -f "$dest/$lang.traineddata" ] &&
        [ "$dir_real/$lang.traineddata" -ef "$dest/$lang.traineddata" ]; then
        continue
      fi
      cp -f "$dir/$lang.traineddata" "$dest/"
    done
    # Only the first directory that has eng.traineddata is used.
    break
  done

  ensure_valid_traineddata "$dest" "eng" "https://github.com/tesseract-ocr/tessdata_fast/raw/main/eng.traineddata"
  ensure_valid_traineddata "$dest" "osd" "https://github.com/tesseract-ocr/tessdata_fast/raw/main/osd.traineddata"
}
# Choose and export TESSDATA_PREFIX for the current platform ($RUNNER_OS or
# `uname -s`), populate it through ensure_tessdata and report what is there.
# Always returns 0 once ensure_tessdata has succeeded.
setup_tessdata() {
  local platform="${RUNNER_OS:-$(uname -s)}"

  case "$platform" in
    Linux)
      export TESSDATA_PREFIX="/usr/share/tesseract-ocr/5/tessdata"
      ;;
    macOS | Darwin)
      if [ -d "/opt/homebrew/opt/tesseract/share/tessdata" ]; then
        export TESSDATA_PREFIX="/opt/homebrew/opt/tesseract/share/tessdata"
      elif [ -d "/usr/local/opt/tesseract/share/tessdata" ]; then
        export TESSDATA_PREFIX="/usr/local/opt/tesseract/share/tessdata"
      else
        export TESSDATA_PREFIX="$HOME/Library/Application Support/kreuzberg-tesseract/tessdata"
      fi
      ;;
    Windows | MINGW* | MSYS* | CYGWIN*)
      export TESSDATA_PREFIX="${APPDATA:-${USERPROFILE:-}}/kreuzberg-tesseract/tessdata"
      ;;
    *)
      export TESSDATA_PREFIX="${REPO_ROOT:-$(pwd)}/target/tessdata"
      ;;
  esac

  ensure_tessdata "$TESSDATA_PREFIX"
  echo "✓ TESSDATA_PREFIX set to: $TESSDATA_PREFIX"

  # `if` instead of `[ -f ... ] && echo`: a missing file must not become the
  # function's exit status, which would abort `set -e` callers.
  local lang
  for lang in eng osd; do
    if [ -f "$TESSDATA_PREFIX/${lang}.traineddata" ]; then
      echo "✓ ${lang}.traineddata available"
    fi
  done
  return 0
}
export -f ensure_tessdata
export -f setup_tessdata

View File

@@ -0,0 +1,17 @@
#!/usr/bin/env bash
# Record in $GITHUB_OUTPUT whether the image tag $DOCKER_TAG already has a
# manifest ("exists=true|false") and, when it does, add a note to the step
# summary if $GITHUB_STEP_SUMMARY is set.
set -euo pipefail

tag="${DOCKER_TAG:?DOCKER_TAG not set}"
label="${SUMMARY_LABEL:-image}"

# Any failure of `docker manifest inspect` counts as "does not exist".
if docker manifest inspect "$tag" >/dev/null 2>&1; then
  exists=true
else
  exists=false
fi

echo "exists=$exists" >>"${GITHUB_OUTPUT:?GITHUB_OUTPUT not set}"

if [[ "$exists" == "true" && -n "${GITHUB_STEP_SUMMARY:-}" ]]; then
  echo "Docker tag $tag already exists; ${label} publish will be skipped." >>"$GITHUB_STEP_SUMMARY"
fi

View File

@@ -0,0 +1,13 @@
#!/usr/bin/env bash
# Note in the GitHub step summary that a Docker image was tested but not
# pushed (dry run).
#
# Env: IMAGE (required), VERSION (required), TAG_SUFFIX (optional),
#      GITHUB_STEP_SUMMARY (optional; the note goes to stdout when unset).
# Exit status 2 when IMAGE or VERSION is missing.
set -euo pipefail

image="${IMAGE:-}"
version="${VERSION:-}"
tag_suffix="${TAG_SUFFIX:-}"

if [ -z "$image" ] || [ -z "$version" ]; then
  echo "Usage: set IMAGE and VERSION (optional TAG_SUFFIX) env vars" >&2
  exit 2
fi

message="Dry run requested; Docker image ${image}:${version}${tag_suffix} tested but not pushed."

# Outside GitHub Actions GITHUB_STEP_SUMMARY is unset; under `set -u` the bare
# expansion would abort with "unbound variable", so fall back to stdout.
if [ -n "${GITHUB_STEP_SUMMARY:-}" ]; then
  echo "$message" >>"$GITHUB_STEP_SUMMARY"
else
  echo "$message"
fi

View File

@@ -0,0 +1,66 @@
#!/usr/bin/env bash
set -euo pipefail

# Update Formula/kreuzberg.rb in the homebrew-tap with the new tag's URL and
# source-tarball SHA256. The bottle DSL is updated separately by the
# `homebrew-merge-bottles@v1` action after bottles are built.
#
# Usage (env vars):
# TAG=v5.0.0-rc.2 VERSION=5.0.0-rc.2 \
# TAP_DIR=/path/to/homebrew-tap \
# ./update-homebrew-formula.sh
#
# Optional: DRY_RUN=true prints what would change and exits without touching
# the formula or the network.

tag="${TAG:?TAG is required (e.g. v5.0.0-rc.2)}"
version="${VERSION:?VERSION is required (e.g. 5.0.0-rc.2)}"
tap_dir="${TAP_DIR:?TAP_DIR is required (path to homebrew-tap checkout)}"
dry_run="${DRY_RUN:-false}"

formula="${tap_dir}/Formula/kreuzberg.rb"
[[ -f "$formula" ]] || {
  echo "Missing $formula" >&2
  exit 1
}

tarball_url="https://github.com/kreuzberg-dev/kreuzberg/archive/${tag}.tar.gz"
echo "Updating Homebrew formula for kreuzberg ${version} (tag ${tag})"

if [[ "$dry_run" == "true" ]]; then
  echo "[dry-run] target formula: $formula"
  echo "[dry-run] would set url to: $tarball_url"
  echo "[dry-run] would compute sha256 of source tarball and rewrite the formula"
  echo "[dry-run] would leave bottle DSL untouched (handled by homebrew-merge-bottles)"
  exit 0
fi

echo "Fetching source tarball SHA256 for ${tag}..."
# pipefail makes a failed download abort here instead of hashing empty input.
# shasum is preferred; sha256sum covers hosts that ship only coreutils.
if command -v shasum >/dev/null 2>&1; then
  sha256=$(curl -fsSL "$tarball_url" | shasum -a 256 | awk '{print $1}')
else
  sha256=$(curl -fsSL "$tarball_url" | sha256sum | awk '{print $1}')
fi
echo " url: $tarball_url"
echo " sha256: $sha256"

# Update the top-level url + sha256 lines (the ones outside `bottle do ... end`).
# Each field is rewritten at its first occurrence before the `bottle do` block;
# the helper exits non-zero when a field is missing, so the script can no
# longer report "Updated" for a formula it did not change.
python3 - "$formula" "$tarball_url" "$sha256" <<'PY'
import re
import sys

formula_path, new_url, new_sha = sys.argv[1], sys.argv[2], sys.argv[3]

with open(formula_path) as f:
    text = f.read()

# Split off the bottle block so the regex only touches the formula header.
bottle_start = text.find("bottle do")
if bottle_start == -1:
    head, tail = text, ""
else:
    head, tail = text[:bottle_start], text[bottle_start:]


def replace_field(name, value, body):
    """Replace the first `<name> "..."` line in body; abort if there is none."""
    pattern = rf'^(\s*{name}\s+)"[^"]*"'
    # A function replacement keeps backslashes in `value` literal.
    updated, count = re.subn(
        pattern,
        lambda m: f'{m.group(1)}"{value}"',
        body,
        count=1,
        flags=re.MULTILINE,
    )
    if count == 0:
        sys.exit(f"error: no top-level {name} line found in {formula_path}")
    return updated


head = replace_field("url", new_url, head)
head = replace_field("sha256", new_sha, head)

with open(formula_path, "w") as f:
    f.write(head + tail)
PY

echo "Updated $formula"

43
scripts/setup-php-ext-ini.sh Executable file
View File

@@ -0,0 +1,43 @@
#!/bin/bash
set -e

# Setup temporary php.ini for e2e/php that loads the kreuzberg extension from target/release
# Called from alef.toml before hook for PHP e2e tests
# Must be run from e2e/php directory

EXT_DIR=$(php -r 'echo ini_get("extension_dir");')

# Look for built extension (relative to e2e/php/).
# BUILT_EXT is reset first so a value inherited from the environment can never
# be mistaken for a build artifact.
BUILT_EXT=""
for path in ../../target/release/libkreuzberg_php.dylib ../../target/release/libkreuzberg_php.so ../../target/release/kreuzberg_php.dll; do
  if [ -f "$path" ]; then
    BUILT_EXT="$path"
    break
  fi
done

if [ -z "$BUILT_EXT" ]; then
  echo "Error: kreuzberg PHP extension not found in target/release/" >&2
  exit 1
fi

# Resolve to absolute path
BUILT_EXT=$(cd "$(dirname "$BUILT_EXT")" && pwd)/$(basename "$BUILT_EXT")

# Copy extension to extension directory. The copy stays best-effort (an
# already installed copy may be usable), but a failed copy with no file in
# place is now reported instead of being announced as "copied/verified".
BASENAME=$(basename "$BUILT_EXT")
TARGET="$EXT_DIR/$BASENAME"
if ! cp "$BUILT_EXT" "$TARGET" 2>/dev/null && [ ! -f "$TARGET" ]; then
  echo "Warning: could not copy extension to $TARGET and no existing copy was found" >&2
fi
echo "Extension copied/verified: $TARGET"

# Create php.ini in current directory (e2e/php) that loads the extension.
# extension_dir is set explicitly so the ini works even when invoked with
# PHP_INI_SCAN_DIR= (which is recommended in the e2e runner to skip stale
# conf.d/*.ini entries left behind by sibling projects).
cat >php.ini <<EOF
; Temporary PHP INI for e2e tests — loads kreuzberg PHP extension from system extension directory
[PHP]
extension_dir=$EXT_DIR
extension=$BASENAME
EOF

echo "Created php.ini that loads: $BASENAME"

View File

@@ -0,0 +1,40 @@
#!/bin/bash
# Setup Swift bridge files after cargo build
#
# Copies the swift-bridge output of the most recent kreuzberg-swift build into
# packages/swift/Sources, prepending `import RustBridgeC` and making the
# generated `ptr` / `isOwned` properties public.
set -e

# Find the most recently built output directory.
# `-nt` compares modification times inside bash, so the selection no longer
# depends on BSD-only `stat -f '%m %N'` (GNU stat reads -f as --file-system
# and prints unrelated text).
OUT=""
while IFS= read -r -d '' candidate; do
  if [ -z "$OUT" ] || [ "$candidate" -nt "$OUT" ]; then
    OUT="$candidate"
  fi
done < <(find target/release/build -maxdepth 2 -type d -name out -path '*kreuzberg-swift-*' -print0 2>/dev/null)

if [ -z "$OUT" ]; then
  echo "ERROR: Could not find swift-bridge build output in target/release/build/" >&2
  exit 1
fi
echo "Using swift-bridge output from: $OUT"

# Fix swift-bridge visibility: make 'var ptr' and 'var isOwned' properties public for internal type conversion
# Filters stdin to stdout.
fixVisibility() {
  sed -e 's/^ var ptr: UnsafeMutableRawPointer$/ public var ptr: UnsafeMutableRawPointer/g' \
    -e 's/^ var isOwned: Bool = true$/ public var isOwned: Bool = true/g'
}

# Ensure target directories exist
mkdir -p packages/swift/Sources/RustBridgeC
mkdir -p packages/swift/Sources/RustBridge

# Copy C headers (core header first, then the crate-specific one)
cat "$OUT/SwiftBridgeCore.h" "$OUT/kreuzberg-swift/kreuzberg-swift.h" \
  >packages/swift/Sources/RustBridgeC/RustBridgeC.h

# Copy Swift bridge files with import statement prepended
{
  printf 'import RustBridgeC\n'
  fixVisibility <"$OUT/SwiftBridgeCore.swift"
} >packages/swift/Sources/RustBridge/SwiftBridgeCore.swift

{
  printf 'import RustBridgeC\n'
  fixVisibility <"$OUT/kreuzberg-swift/kreuzberg-swift.swift"
} >packages/swift/Sources/RustBridge/kreuzberg-swift.swift

echo "Swift-bridge files setup complete"

View File

@@ -0,0 +1,51 @@
#!/usr/bin/env bash
# Stage libkreuzberg_ffi into packages/csharp/Kreuzberg/runtimes/<rid>/native/
# so dotnet test can locate it via runtime asset resolution.
#
# Auto-detects host RID. Idempotent.
set -euo pipefail

repo_root="$(cd "$(dirname "$0")/.." && pwd)"
cd "$repo_root"

kernel="$(uname -s)"
machine="$(uname -m)"

# Every non-ARM64 machine is treated as x64.
if [[ "$machine" == "arm64" || "$machine" == "aarch64" ]]; then
  cpu=arm64
else
  cpu=x64
fi

case "$kernel" in
  Darwin)
    ext=dylib
    rid="osx-${cpu}"
    ;;
  Linux)
    ext=so
    rid="linux-${cpu}"
    ;;
  MINGW* | MSYS* | CYGWIN*)
    # Windows is always staged as win-x64.
    ext=dll
    rid=win-x64
    ;;
  *)
    echo "Unsupported platform: $kernel" >&2
    exit 1
    ;;
esac

# The Windows DLL has no "lib" prefix.
if [[ "$ext" == "dll" ]]; then
  src="target/release/kreuzberg_ffi.${ext}"
else
  src="target/release/libkreuzberg_ffi.${ext}"
fi

if [[ ! -f "$src" ]]; then
  echo "ERROR: $src not found. Run: cargo build --release -p kreuzberg-ffi" >&2
  exit 1
fi

dst_dir="packages/csharp/Kreuzberg/runtimes/${rid}/native"
mkdir -p "$dst_dir"
cp -f "$src" "$dst_dir/"
echo "Staged ${src##*/} -> $dst_dir/"

View File

@@ -0,0 +1,92 @@
#!/usr/bin/env node
// Generates docs/demo-dev.html from docs/demo.html with CDN URLs replaced
// by the local asset server so no manual editing of demo.html is ever needed.
//
// CDN pattern replaced:
// https://cdn.jsdelivr.net/npm/@kreuzberg/wasm@*/...
// → http://localhost:9000/...
//
// Also patches pkg/web/kreuzberg_wasm.js (gitignored, wasm-pack generated) to
// replace bare specifier imports ("env", "wasi_snapshot_preview1") with inline
// browser shims. The local 5.x WASM binary is compiled with WASI syscalls via
// tesseract's C layer; the importmap approach does not propagate into Workers
// loading cross-origin modules, so we shim the generated JS directly.
//
// The output file is gitignored and regenerated on every `task demo:dev`.
import { readFileSync, writeFileSync, existsSync } from "node:fs";
import { join, dirname } from "node:path";
import { fileURLToPath } from "node:url";
// Repository root: two directories above the directory holding this script.
const root = join(dirname(fileURLToPath(import.meta.url)), "..", "..");
const src = join(root, "docs", "demo.html");
const dest = join(root, "docs", "demo-dev.html");
const ASSET_PORT = process.env.ASSET_PORT ?? "9000";

// Matches the versioned jsDelivr prefix of the published wasm package.
const cdnRe = /https:\/\/cdn\.jsdelivr\.net\/npm\/@kreuzberg\/wasm@[^/'"]+/g;

// Fixed-position badge appended before </body> to mark the page as a dev build.
const devBadge = ` <div style="position:fixed;bottom:12px;right:12px;background:#1a172a;border:1px solid #58FBDA55;color:#58FBDA;font-family:monospace;font-size:11px;padding:6px 10px;border-radius:6px;z-index:9999">
local dev · assets: localhost:${ASSET_PORT}
</div>`;

// Three rewrites in order: CDN → local asset server, title suffix, dev badge.
const demoHtml = readFileSync(src, "utf8");
const withLocalAssets = demoHtml.replace(cdnRe, `http://localhost:${ASSET_PORT}`);
const withDevTitle = withLocalAssets.replace(/<title>(.*?)<\/title>/, "<title>$1 [local dev]</title>");
const patched = withDevTitle.replace("</body>", `${devBadge}\n</body>`);

writeFileSync(dest, patched, "utf8");
console.log(`patch-demo-dev: docs/demo-dev.html → http://localhost:8001/demo-dev.html`);
console.log(` assets served from http://localhost:${ASSET_PORT}`);
// Patch pkg/web/kreuzberg_wasm.js — strip bare "env" / "wasi_snapshot_preview1"
// import lines and replace with inline browser shims so the module loads in a
// Worker without an importmap (importmap inheritance in Workers is unreliable
// for bare specifiers in transitive cross-origin dynamic imports).
//
// The patch is idempotent: once the bare imports are gone, nothing is rewritten.
const wasmJs = join(root, "crates", "kreuzberg-wasm", "pkg", "web", "kreuzberg_wasm.js");
if (!existsSync(wasmJs)) {
  console.warn(`patch-demo-dev: ${wasmJs} not found — skipping WASI shim patch`);
} else {
  // One pattern both detects and removes a bare import line, so the two steps
  // cannot disagree (optional semicolon, trailing blanks, missing final newline).
  const bareImportRe = /^import \* as (import\d+) from "(env|wasi_snapshot_preview1)";?[^\S\n]*(?:\n|$)/gm;
  const original = readFileSync(wasmJs, "utf8");

  const envAliases = [];
  const wasiAliases = [];
  const stripped = original.replace(bareImportRe, (_line, alias, specifier) => {
    (specifier === "env" ? envAliases : wasiAliases).push(alias);
    return "";
  });

  if (envAliases.length === 0 && wasiAliases.length === 0) {
    console.log("patch-demo-dev: kreuzberg_wasm.js already patched, skipping");
  } else {
    // Stub host functions: every call returns a constant, proc_exit throws.
    const envShim = `const __env_shim = { system: () => -1, mkstemp: () => -1 };`;
    const envConsts = envAliases.map((a) => `const ${a} = __env_shim;`).join("\n");
    const wasiShim = [
      `const __wasi_shim = {`,
      ` environ_sizes_get: () => 0, environ_get: () => 0,`,
      ` clock_time_get: () => 52,`,
      ` fd_close: () => 8, fd_fdstat_get: () => 8, fd_fdstat_set_flags: () => 8,`,
      ` fd_prestat_get: () => 8, fd_prestat_dir_name: () => 8,`,
      ` fd_read: () => 8, fd_seek: () => 8, fd_write: () => 8,`,
      ` path_create_directory: () => 52, path_filestat_get: () => 52,`,
      ` path_open: () => 52, path_remove_directory: () => 52, path_unlink_file: () => 52,`,
      ` proc_exit: (code) => { throw new Error("WASI: proc_exit(" + code + ")"); },`,
      `};`,
    ].join("\n");
    const wasiConsts = wasiAliases.map((a) => `const ${a} = __wasi_shim;`).join("\n");
    const shims = [envShim, envConsts, wasiShim, wasiConsts].filter(Boolean).join("\n") + "\n";

    // Insert the shims right after the `/* @ts-self-types ... */` header line.
    // If that header is missing, prepend them: otherwise the imports would be
    // stripped with nothing defining their aliases. A function replacement
    // keeps the shim text literal (no `$` pattern expansion).
    const headerRe = /^(\/\* @ts-self-types[^\n]*\n)/m;
    const patchedWasmJs = headerRe.test(stripped)
      ? stripped.replace(headerRe, (header) => header + shims)
      : shims + stripped;

    writeFileSync(wasmJs, patchedWasmJs, "utf8");
    console.log(
      `patch-demo-dev: patched kreuzberg_wasm.js` +
        ` (${envAliases.length} env alias(es), ${wasiAliases.length} wasi alias(es))`,
    );
  }
}

264
scripts/test/README.md Normal file
View File

@@ -0,0 +1,264 @@
# Docker Configuration Testing Scripts
This directory contains comprehensive testing scripts for validating Docker configuration scenarios.
## Scripts
### test-docker-config-local.sh
A comprehensive local Docker testing script that validates all configuration volume mount scenarios.
#### Purpose
Tests Docker configuration in various scenarios:
- Volume mounts to `/etc/kreuzberg/kreuzberg.toml` (recommended system path)
- Volume mounts to `/app/.config/kreuzberg/config.toml` (user path)
- Custom paths with `--config` flag
- Environment variable overrides with config files
- All config formats (TOML, YAML, JSON)
- Read-only mounts (`:ro` flag)
#### Requirements
- Docker installed and running
- Docker images pre-built (`kreuzberg:core` and/or `kreuzberg:full`)
- Port range 18100-18199 available for testing
#### Usage
```bash
./test-docker-config-local.sh [OPTIONS]
```
#### Options
| Option | Description | Default |
| ------------------- | ----------------------------------------------- | -------- |
| `--variant VARIANT` | Test specific variant: `core`, `full`, or `all` | `all` |
| `--verbose` | Enable verbose debugging output | Disabled |
| `--keep-containers` | Preserve containers after tests for inspection | Clean up |
| `--help` | Display help message | - |
#### Examples
Test both core and full variants:
```bash
./test-docker-config-local.sh
```
Test only the full variant with verbose output:
```bash
./test-docker-config-local.sh --variant full --verbose
```
Test core variant and keep containers for inspection:
```bash
./test-docker-config-local.sh --variant core --keep-containers
```
#### Test Cases
The script runs 8 test cases for each variant:
1. **Volume mount to /etc/kreuzberg/kreuzberg.toml**
- Tests the recommended system-wide configuration path
- Validates read-only mount functionality
2. **Volume mount to /app/.config/kreuzberg/config.toml**
- Tests the user-level configuration path
- Validates alternative mount location
3. **Custom path with --config flag**
- Tests custom configuration file paths
- Validates explicit path specification via CLI flag
4. **Environment variable overrides with config file**
- Tests that environment variables can override config file settings
- Validates configuration precedence
5. **TOML config format**
- Tests TOML configuration file format support
- Validates parsing of TOML syntax
6. **YAML config format**
- Tests YAML configuration file format support
- Validates parsing of YAML syntax
7. **JSON config format**
- Tests JSON configuration file format support
- Validates parsing of JSON syntax
8. **Read-only mount**
- Tests that containers work correctly with read-only mounts
- Validates security of mounted volumes
#### Validation Method
For each test, the script:
1. Creates a temporary configuration file in the specified format
2. Starts a Docker container with the configuration mounted
3. Waits for the service to become healthy (up to 30 seconds)
4. Verifies the health endpoint responds successfully
5. Stops and removes the container
6. Reports pass/fail status
#### Output
The script provides clear, color-coded output:
- `[PASS]` - Test passed (green)
- `[FAIL]` - Test failed (red)
- `[INFO]` - Informational messages (blue)
- `[WARN]` - Warnings (yellow)
- `[DEBUG]` - Debug information (yellow, with `--verbose`)
Example output:
```text
╔════════════════════════════════════════════════════════╗
║ Docker Configuration Volume Mount Test Suite ║
╚════════════════════════════════════════════════════════╝
[INFO] Configuration:
[INFO] Variant: all
[INFO] Verbose: false
[INFO] Keep Containers: false
[INFO] Port Range: 18100-18199
[INFO] Docker is available
Test 01: Volume mount to /etc/kreuzberg/kreuzberg.toml (variant: core)
[PASS] Test passed
Test 02: Volume mount to /app/.config/kreuzberg/config.toml (variant: core)
[PASS] Test passed
...
╔════════════════════════════════════════════════════════╗
║ Test Summary ║
╚════════════════════════════════════════════════════════╝
Total Tests: 16
Passed Tests: 16
Failed Tests: 0
Pass Rate: 100%
Tested Variants:
- kreuzberg:core
- kreuzberg:full
```
#### Troubleshooting
**Error: Docker is not installed or not in PATH**
- Install Docker from <https://www.docker.com/products/docker-desktop>
- Ensure Docker is in your system PATH
**Error: Docker daemon is not running**
- Start Docker Desktop or the Docker daemon
- On Linux: `sudo systemctl start docker`
**Error: Docker image does not exist**
- Build the required image(s):
```bash
cd /path/to/kreuzberg
docker build -f docker/Dockerfile.core -t kreuzberg:core .
docker build -f docker/Dockerfile.full -t kreuzberg:full .
```
**Tests timing out**
- Check system resources (CPU, memory)
- Increase timeout: Modify `TIMEOUT_SECONDS=30` in the script
- Check Docker logs: `docker logs <container-name>`
**Port conflicts**
- Ensure ports 18100-18199 are available
- Check for existing containers: `docker ps -a`
- Kill conflicting containers: `docker kill <container-name>`
#### Environment Variables
The script respects these environment variables:
| Variable | Description | Default |
| ----------------- | ------------------------------------- | ------- |
| `TEST_VARIANT` | Override variant via environment | `all` |
| `VERBOSE` | Enable verbose output via environment | `false` |
| `KEEP_CONTAINERS` | Keep containers via environment | `false` |
Example:
```bash
VERBOSE=true ./test-docker-config-local.sh --variant core
```
#### Temporary Files
The script creates temporary configuration files in `/tmp/kreuzberg-config-test-$PID/`:
- `kreuzberg.toml` - TOML format test config
- `config.yaml` - YAML format test config
- `config.json` - JSON format test config
These are automatically removed when the script exits. Note that `--keep-containers` preserves only the containers; the temporary directory is still deleted.
#### Exit Codes
- `0` - All tests passed
- `1` - One or more tests failed, or Docker is not available
#### Performance Notes
- Each test takes approximately 2-5 seconds
- Total test suite runtime: 1-2 minutes for all variants
- Network latency may affect health check timing
- Container startup time depends on system resources
#### CI/CD Integration
The script can be integrated into CI/CD pipelines:
```bash
#!/bin/bash
set -e
# Build images
docker build -f docker/Dockerfile.core -t kreuzberg:core .
docker build -f docker/Dockerfile.full -t kreuzberg:full .
# Run tests
./scripts/test/test-docker-config-local.sh --variant all
echo "Configuration tests passed!"
```
#### Limitations
- Requires Docker to be installed and running
- Tests only configuration volume mounts (not other volume types)
- Tests only health endpoint (basic connectivity validation)
- Assumes `kreuzberg:*` image naming convention
- Tests run sequentially (not parallelized)
#### Future Enhancements
Potential improvements:
- Parallel test execution for faster results
- Additional validation endpoints (beyond `/health`)
- Configuration value verification (test that config was actually loaded)
- Performance benchmarking
- Multi-architecture testing (arm64, amd64)
- Docker Compose integration tests

528
scripts/test/USAGE.md Normal file
View File

@@ -0,0 +1,528 @@
# Docker Configuration Testing - Quick Start Guide
## Overview
The `test-docker-config-local.sh` script provides comprehensive testing for Docker configuration volume mounts and environment variable overrides.
## Prerequisites
1. **Docker**: Installed and running
2. **Images**: Pre-built Docker images for testing
3. **Ports**: 18100-18199 available for test containers
4. **Utilities**: `bash`, `curl`, `docker` command-line tools
## Building Test Images
Before running tests, build the Docker images:
```bash
cd /path/to/kreuzberg
# Build core variant
docker build -f docker/Dockerfile.core -t kreuzberg:core .
# Build full variant
docker build -f docker/Dockerfile.full -t kreuzberg:full .
# Or build both
docker build -f docker/Dockerfile.core -t kreuzberg:core . && \
docker build -f docker/Dockerfile.full -t kreuzberg:full .
```
## Running Tests
### Basic Usage
Test all variants with default settings:
```bash
./scripts/test/test-docker-config-local.sh
```
### Common Commands
**Test only core variant:**
```bash
./scripts/test/test-docker-config-local.sh --variant core
```
**Test only full variant:**
```bash
./scripts/test/test-docker-config-local.sh --variant full
```
**Enable verbose output:**
```bash
./scripts/test/test-docker-config-local.sh --verbose
```
**Keep containers after testing:**
```bash
./scripts/test/test-docker-config-local.sh --keep-containers
```
**Combine multiple options:**
```bash
./scripts/test/test-docker-config-local.sh --variant full --verbose --keep-containers
```
## Test Cases Explained
### 1. Volume Mount to /etc/kreuzberg/kreuzberg.toml
**What it tests**: System-wide configuration path (recommended)
**Docker command**:
```bash
docker run -v /local/config.toml:/etc/kreuzberg/kreuzberg.toml:ro kreuzberg:full
```
**Expected**: Container reads config from standard system location
---
### 2. Volume Mount to /app/.config/kreuzberg/config.toml
**What it tests**: User-level configuration path (alternative location)
**Docker command**:
```bash
docker run -v /local/config.toml:/app/.config/kreuzberg/config.toml:ro kreuzberg:full
```
**Expected**: Container reads config from user application directory
---
### 3. Custom Path with --config Flag
**What it tests**: Explicit configuration path specification
**Docker command**:
```bash
docker run \
  -v /local/config.toml:/app/custom-config.toml:ro \
  --entrypoint "/usr/local/bin/kreuzberg" \
  kreuzberg:full \
  serve --config /app/custom-config.toml --host 0.0.0.0
```
**Expected**: Container uses specified custom path
---
### 4. Environment Variable Overrides
**What it tests**: Environment variables override config file settings
**Docker command**:
```bash
docker run \
-v /local/config.toml:/etc/kreuzberg/kreuzberg.toml:ro \
-e KREUZBERG_SERVER_PORT=8000 \
kreuzberg:full
```
**Expected**: Environment variable takes precedence over config file
---
### 5. TOML Format Support
**What it tests**: Configuration in TOML format
**Config file**:
```toml
use_cache = true
enable_quality_processing = true

[ocr]
backend = "tesseract"
language = "eng"
```
**Expected**: Container parses TOML correctly
---
### 6. YAML Format Support
**What it tests**: Configuration in YAML format
**Config file**:
```yaml
use_cache: true
enable_quality_processing: true
ocr:
  backend: "tesseract"
  language: "eng"
```
**Expected**: Container parses YAML correctly
---
### 7. JSON Format Support
**What it tests**: Configuration in JSON format
**Config file**:
```json
{
  "use_cache": true,
  "enable_quality_processing": true,
  "ocr": {
    "backend": "tesseract",
    "language": "eng"
  }
}
```
**Expected**: Container parses JSON correctly
---
### 8. Read-Only Mount
**What it tests**: Security of read-only mounted volumes
**Docker command**:
```bash
docker run -v /local/config.toml:/etc/kreuzberg/kreuzberg.toml:ro kreuzberg:full
```
**Expected**: Container works with read-only volumes, application doesn't attempt to modify config
---
## Understanding Output
### Success Output
```text
╔════════════════════════════════════════════════════════╗
║ Docker Configuration Volume Mount Test Suite ║
╚════════════════════════════════════════════════════════╝
[INFO] Configuration:
[INFO] Variant: all
[INFO] Verbose: false
[INFO] Keep Containers: false
[INFO] Port Range: 18100-18199
[INFO] Docker is available
Test 01: Volume mount to /etc/kreuzberg/kreuzberg.toml (variant: core)
[PASS] Test passed
```
### Failure Output
```text
Test 02: Custom path with --config flag (variant: core)
[FAIL] Test failed: Failed to start container with custom --config flag
[FAIL] Details: Container logs:
/app/kreuzberg: line 123: syntax error: unexpected token
```
### Summary
```text
╔════════════════════════════════════════════════════════╗
║ Test Summary ║
╚════════════════════════════════════════════════════════╝
Total Tests: 16
Passed Tests: 16
Failed Tests: 0
Pass Rate: 100%
Tested Variants:
- kreuzberg:core
- kreuzberg:full
```
## Debugging Failed Tests
### Enable Verbose Output
```bash
./scripts/test/test-docker-config-local.sh --variant core --verbose
```
Verbose output shows:
- Container IDs
- Docker arguments
- Service startup timing
- Health check attempts
### Keep Containers for Inspection
```bash
./scripts/test/test-docker-config-local.sh --keep-containers
```
Then inspect containers manually:
```bash
# List test containers
docker ps -a | grep kreuzberg-config-test
# View specific container logs
docker logs kreuzberg-config-test-etc-core-12345
# Execute command in running container
docker exec kreuzberg-config-test-etc-core-12345 cat /etc/kreuzberg/kreuzberg.toml
# Stop container manually
docker stop kreuzberg-config-test-etc-core-12345
docker rm kreuzberg-config-test-etc-core-12345
```
### Check Health Endpoint Manually
```bash
# Start container manually
docker run -d \
--name test-container \
-p 8000:8000 \
-v /path/to/config.toml:/etc/kreuzberg/kreuzberg.toml:ro \
kreuzberg:full
# Wait for startup
sleep 3
# Test health endpoint
curl -v http://localhost:8000/health
# View logs
docker logs test-container
# Cleanup
docker stop test-container
docker rm test-container
```
## Troubleshooting
### Docker Not Found
```text
[FAIL] Docker is not installed or not in PATH
```
**Solution**: Install Docker or ensure it's in your PATH
```bash
which docker
export PATH=$PATH:/usr/local/bin # or wherever docker is installed
```
### Docker Daemon Not Running
```text
[FAIL] Docker daemon is not running or you don't have permissions
```
**Solution**: Start Docker
```bash
# macOS
open -a Docker
# Linux
sudo systemctl start docker
# Check status
docker ps
```
### Image Not Found
```text
[WARN] Skipping tests for variant: full (image not found)
```
**Solution**: Build the image
```bash
docker build -f docker/Dockerfile.full -t kreuzberg:full .
```
### Port Already in Use
```text
[FAIL] Test failed: Failed to start container
[FAIL] Details: port is already allocated
```
**Solution**: Free the ports or wait for existing tests to finish
```bash
# Find what's using the ports
lsof -i :18100-18199
# Or just stop all test containers
docker ps -a --filter "name=kreuzberg-config-test" --format "{{.Names}}" | \
xargs -r docker stop
```
### Health Check Timeout
```text
[FAIL] Test failed: Service failed to start (health check timeout)
```
**Debugging**:
1. Check container is still running:
```bash
docker ps | grep kreuzberg-config-test
```
2. View container logs:
```bash
docker logs <container-name>
```
3. Check if service is binding to port:
```bash
docker exec <container-name> netstat -tuln | grep 8000
```
4. Increase timeout (edit script):
```bash
TIMEOUT_SECONDS=60 # Change from 30
```
## CI/CD Integration
### GitHub Actions
```yaml
name: Docker Config Tests
on: [push, pull_request]
jobs:
test:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- name: Build Docker images
run: |
docker build -f docker/Dockerfile.core -t kreuzberg:core .
docker build -f docker/Dockerfile.full -t kreuzberg:full .
- name: Run configuration tests
run: ./scripts/test/test-docker-config-local.sh --variant all
```
### GitLab CI
```yaml
docker-config-tests:
stage: test
image: docker:latest
services:
- docker:dind
script:
- docker build -f docker/Dockerfile.core -t kreuzberg:core .
- docker build -f docker/Dockerfile.full -t kreuzberg:full .
- ./scripts/test/test-docker-config-local.sh --variant all
```
## Performance Expectations
| Metric | Time |
| ------------------------- | -------------- |
| Single test | 2-5 seconds |
| All 8 tests (1 variant) | 30-45 seconds |
| All 16 tests (2 variants) | 60-90 seconds |
| With verbose output | +10-20 seconds |
## Exit Codes
| Code | Meaning |
| ---- | ---------------------------------------------- |
| 0 | All tests passed |
| 1 | One or more tests failed OR Docker unavailable |
## Advanced Usage
### Custom Environment Variables
```bash
# Override variant via environment
TEST_VARIANT=core ./scripts/test/test-docker-config-local.sh
# Override verbose via environment
VERBOSE=true ./scripts/test/test-docker-config-local.sh
```
### Modify Timeout
Edit the script to change timeout:
```bash
TIMEOUT_SECONDS=60 # Line ~43, change from 30
```
### Test Specific Scenarios
To test only one specific scenario, modify the `run_test_suite()` call in `main()`:
```bash
# Comment out unwanted tests
# test_etc_kreuzberg_mount "$variant"
test_app_config_mount "$variant"
# test_custom_path_with_flag "$variant"
# ... etc
```
## Getting Help
```bash
./scripts/test/test-docker-config-local.sh --help
```
For detailed documentation:
```bash
cat ./scripts/test/README.md
```
## Related Files
- **Script**: `./scripts/test/test-docker-config-local.sh`
- **Documentation**: `./scripts/test/README.md`
- **This Guide**: `./scripts/test/USAGE.md`
- **Docker Files**: `./docker/Dockerfile.core`
- **Docker Files**: `./docker/Dockerfile.full`

View File

@@ -0,0 +1,800 @@
#!/bin/bash
################################################################################
# Docker Configuration Volume Mount Testing Script
#
# This script validates all Docker configuration scenarios locally:
# - Volume mounts to /etc/kreuzberg/kreuzberg.toml (recommended)
# - Volume mounts to /app/.config/kreuzberg/config.toml (user path)
# - Custom paths with --config flag
# - Environment variable overrides with config files
# - All config formats (TOML, YAML, JSON)
# - Read-only mounts
#
# Usage: ./test-docker-config-local.sh [OPTIONS]
# Options:
# --variant core|full|all Test specific variant (default: all)
# --verbose Enable verbose output
# --keep-containers Don't cleanup containers after tests
################################################################################
# Fail a pipeline when any stage fails. Note that neither -e nor -u is enabled,
# so individual command failures must be checked explicitly.
set -o pipefail
# Absolute directory of this script, and of the sibling docker/ directory two
# levels up (used only to build the "please build the image" hint).
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
DOCKER_DIR="$(cd "$SCRIPT_DIR/../../docker" && pwd)"
# Color codes (backslash escapes; expanded by the log_* helpers at print time)
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
CYAN='\033[0;36m'
NC='\033[0m'
# Test configuration (each value may be pre-set through the environment)
TEST_VARIANT="${TEST_VARIANT:-all}"
IMAGE_NAME="${IMAGE_NAME:-}" # Empty means use the default "kreuzberg:<variant>" tag (see get_image_name)
VERBOSE="${VERBOSE:-false}"
KEEP_CONTAINERS="${KEEP_CONTAINERS:-false}"
# Default number of seconds wait_for_health polls before giving up.
TIMEOUT_SECONDS=30
# Host ports are allocated as PORT_BASE + <running test number>.
PORT_BASE=18100
# Per-run scratch directory, keyed by this shell's PID; removed by cleanup().
TEST_TEMP_DIR="/tmp/kreuzberg-config-test-$$"
# Test tracking (updated by start_test / pass_test / fail_test)
TOTAL_TESTS=0
PASSED_TESTS=0
FAILED_TESTS=0
declare -a FAILED_TEST_NAMES=()
declare -a TESTED_VARIANTS=()
################################################################################
# Helper Functions
################################################################################
# Print a cyan, box-framed section banner surrounded by blank lines.
# Arguments: $1 - banner text (backslash escapes are interpreted)
# Outputs:   blank line, top border, text, bottom border, blank line on stdout
log_header() {
  local top_border="╔════════════════════════════════════════════════════════╗"
  local bottom_border="╚════════════════════════════════════════════════════════╝"
  # printf reuses the format for every argument, emitting one line per argument.
  printf '%b\n' \
    "\n${CYAN}${top_border}${NC}" \
    "${CYAN}$1${NC}" \
    "${CYAN}${bottom_border}${NC}\n"
}
# Print an informational message with a blue [INFO] prefix.
# Arguments: $@ - message words (joined with spaces; escapes are interpreted)
log_info() {
  printf '%b\n' "${BLUE}[INFO]${NC} $*"
}
# Print a success message with a green [PASS] prefix.
# Arguments: $@ - message words (joined with spaces; escapes are interpreted)
log_success() {
  printf '%b\n' "${GREEN}[PASS]${NC} $*"
}
# Print a warning message with a yellow [WARN] prefix.
# Arguments: $@ - message words (joined with spaces; escapes are interpreted)
log_warning() {
  printf '%b\n' "${YELLOW}[WARN]${NC} $*"
}
# Print a failure message with a red [FAIL] prefix (written to stdout).
# Arguments: $@ - message words (joined with spaces; escapes such as \n are
#                 interpreted, which callers use for multi-line details)
log_error() {
  printf '%b\n' "${RED}[FAIL]${NC} $*"
}
# Print a debug message with a yellow [DEBUG] prefix, only in verbose mode.
# Globals:   VERBOSE (read) - output is produced only when it equals "true"
# Arguments: $@ - message words (joined with spaces; escapes are interpreted)
# Returns:   0 when the message is suppressed
log_debug() {
  [[ "$VERBOSE" == "true" ]] || return 0
  printf '%b\n' "${YELLOW}[DEBUG]${NC} $*"
}
# Announce the next test case and advance the global test counter.
# Globals:   TOTAL_TESTS (incremented)
# Arguments: $@ - human-readable test description
# Outputs:   a blank line followed by "Test NN: <description>" on stdout
start_test() {
  local label
  ((TOTAL_TESTS += 1))
  # Zero-pad the running number to two digits.
  printf -v label '%02d' "$TOTAL_TESTS"
  printf '\n%b\n' "${CYAN}Test ${label}:${NC} $*"
}
# Record the current test as passed and report it.
# Globals: PASSED_TESTS (incremented)
pass_test() {
  ((PASSED_TESTS += 1))
  log_success "Test passed"
}
# Record the current test as failed and report why.
# Globals:   FAILED_TESTS (incremented), FAILED_TEST_NAMES (appended to)
# Arguments: $1 - failure reason (also stored in FAILED_TEST_NAMES)
#            $2 - optional extra details, logged on a second line when non-empty
fail_test() {
  local reason="$1"
  local details="${2:-}"

  ((FAILED_TESTS += 1))
  FAILED_TEST_NAMES+=("$reason")
  log_error "Test failed: $reason"
  [[ -z "$details" ]] || log_error " Details: $details"
}
# Tear down everything the test run created; registered as the EXIT handler.
# Globals: KEEP_CONTAINERS (read) - "true" leaves the containers in place
#          TEST_TEMP_DIR   (read) - scratch directory, removed unconditionally
# Every container whose name contains "kreuzberg-config-test-" is stopped and
# removed, best-effort.
# shellcheck disable=SC2317,SC2329 # Function is invoked via trap EXIT
cleanup() {
  log_info "Cleaning up test environment..."

  if [[ "$KEEP_CONTAINERS" == "true" ]]; then
    log_warning "Keeping containers for inspection (use 'docker ps -a' to view)"
  else
    # Stop and remove test containers
    local leftover
    while read -r leftover; do
      log_debug "Stopping container: $leftover"
      docker stop "$leftover" 2>/dev/null || true
      docker rm "$leftover" 2>/dev/null || true
    done < <(docker ps -a --filter "name=kreuzberg-config-test-" --format "{{.Names}}")
  fi

  # Remove temporary test files
  if [[ -d "$TEST_TEMP_DIR" ]]; then
    log_debug "Removing temporary directory: $TEST_TEMP_DIR"
    rm -rf "$TEST_TEMP_DIR"
  fi
}

trap cleanup EXIT
################################################################################
# Setup Functions
################################################################################
# Create the scratch directory that holds the generated config files.
# Globals: TEST_TEMP_DIR (read)
# Exits the script with status 1 when the directory cannot be created.
setup_test_environment() {
  log_info "Setting up test environment..."

  mkdir -p "$TEST_TEMP_DIR" || {
    log_error "Failed to create temporary directory"
    exit 1
  }

  log_debug "Test temp directory: $TEST_TEMP_DIR"
}
# Ensure the docker CLI exists and can reach the daemon.
# Exits the script with status 1 (after logging the reason) when either check
# fails; logs a confirmation otherwise.
verify_docker_available() {
  command -v docker >/dev/null 2>&1 || {
    log_error "Docker is not installed or not in PATH"
    exit 1
  }

  # `docker ps` only succeeds when the daemon is reachable by this user.
  docker ps >/dev/null 2>&1 || {
    log_error "Docker daemon is not running or you don't have permissions"
    exit 1
  }

  log_info "Docker is available"
}
# Check that a Docker image is present locally.
# Globals:   DOCKER_DIR (read) - used only in the build hint
# Arguments: $1 - image reference, e.g. "kreuzberg:core"
# Returns:   0 if the image exists; 1 (after logging a build hint) otherwise
check_image_exists() {
  local image="$1"

  docker image inspect "$image" >/dev/null 2>&1 && return 0

  # The text after the last ':' (the tag) doubles as the Dockerfile suffix.
  local dockerfile_suffix="${image##*:}"
  log_error "Docker image does not exist: $image"
  log_error "Please build the image first with: docker build -f $DOCKER_DIR/Dockerfile.${dockerfile_suffix} -t $image ."
  return 1
}
# Resolve the image reference to test for a variant.
# Globals:   IMAGE_NAME (read) - when non-empty it wins over the default (CI mode)
# Arguments: $1 - variant name, e.g. "core" or "full"
# Outputs:   IMAGE_NAME, or "kreuzberg:<variant>" when IMAGE_NAME is empty/unset
get_image_name() {
  local variant="$1"
  echo "${IMAGE_NAME:-kreuzberg:$variant}"
}
################################################################################
# Config File Creation Functions
################################################################################
# Write the TOML test configuration.
# Arguments: $1 - destination file path
#            $2 - accepted for backward compatibility and ignored: the config
#                 carries no server settings (ports are mapped via docker -p)
# Outputs:   overwrites $1
create_toml_config() {
  local file_path="$1"

  # Config must be valid ExtractionConfig (deny_unknown_fields).
  # Server settings use defaults; ports are mapped via docker -p flag.
  # The delimiter is quoted so the body is written literally.
  cat >"$file_path" <<'EOF'
use_cache = true
enable_quality_processing = true
[ocr]
backend = "tesseract"
language = "eng"
EOF

  log_debug "Created TOML config: $file_path"
}
# Write the YAML test configuration.
# Arguments: $1 - destination file path
#            $2 - accepted for backward compatibility and ignored: the config
#                 carries no server settings (ports are mapped via docker -p)
# Outputs:   overwrites $1
create_yaml_config() {
  local file_path="$1"

  # Config must be valid ExtractionConfig (deny_unknown_fields).
  # Server settings use defaults; ports are mapped via docker -p flag.
  # `backend` and `language` must be indented beneath `ocr:`; at column 0 they
  # would parse as top-level keys and leave `ocr` null.
  cat >"$file_path" <<'EOF'
use_cache: true
enable_quality_processing: true
ocr:
  backend: "tesseract"
  language: "eng"
EOF

  log_debug "Created YAML config: $file_path"
}
# Write the JSON test configuration.
# Arguments: $1 - destination file path
#            $2 - accepted for backward compatibility and ignored: the config
#                 carries no server settings (ports are mapped via docker -p)
# Outputs:   overwrites $1
create_json_config() {
  local file_path="$1"

  # Config must be valid ExtractionConfig (deny_unknown_fields).
  # Server settings use defaults; ports are mapped via docker -p flag.
  # The delimiter is quoted so the body is written literally.
  cat >"$file_path" <<'EOF'
{
"use_cache": true,
"enable_quality_processing": true,
"ocr": {
"backend": "tesseract",
"language": "eng"
}
}
EOF

  log_debug "Created JSON config: $file_path"
}
################################################################################
# Container Testing Functions
################################################################################
# Start a detached container that publishes container port 8000 on a host port.
# Arguments: $1 - container name
#            $2 - image reference
#            $3 - host port to publish
#            $4.. - extra `docker run` options, optionally followed by `--` and
#                   the command arguments handed to the container
# Returns:   0 if `docker run` succeeded, 1 otherwise (its output is discarded)
run_container() {
  local name="$1"
  local img="$2"
  local host_port="$3"
  shift 3

  # Separate docker options from command arguments; every literal "--" acts as
  # the divider and is itself dropped.
  local -a docker_opts=()
  local -a cmd_args=()
  local past_separator=false
  local arg
  for arg in "$@"; do
    if [[ "$arg" == "--" ]]; then
      past_separator=true
    elif [[ "$past_separator" == true ]]; then
      cmd_args+=("$arg")
    else
      docker_opts+=("$arg")
    fi
  done

  log_debug "Running container: $name"
  log_debug "Docker opts: ${docker_opts[*]}"
  log_debug "Command args: ${cmd_args[*]}"

  docker run -d \
    --name "$name" \
    -p "$host_port:8000" \
    "${docker_opts[@]}" \
    "$img" \
    "${cmd_args[@]}" >/dev/null 2>&1 || return 1

  return 0
}
# Poll the /health endpoint on localhost until it answers successfully.
# Globals:   TIMEOUT_SECONDS (read) - default for $2
# Arguments: $1 - host port to probe
#            $2 - maximum seconds to wait (optional)
# Returns:   0 once the endpoint responds, 1 when the wait budget is exhausted
wait_for_health() {
  local port="$1"
  local max_wait="${2:-$TIMEOUT_SECONDS}"
  local poll_interval=1
  local waited

  log_debug "Waiting for service on port $port (timeout: ${max_wait}s)"

  # One probe per interval; `waited` counts sleep time only, not curl time.
  for ((waited = 0; waited < max_wait; waited += poll_interval)); do
    if curl -sf "http://localhost:$port/health" >/dev/null 2>&1; then
      log_debug "Service became healthy after ${waited}s"
      return 0
    fi
    sleep $poll_interval
  done

  log_debug "Service did not become healthy within ${max_wait}s"
  return 1
}
# Report whether a container is currently running.
# Arguments: $1 - container name
# Returns:   0 when `docker inspect` reports a running state containing "true",
#            1 otherwise (including when the container does not exist)
check_container_running() {
  local name="$1"

  docker inspect "$name" --format='{{.State.Running}}' 2>/dev/null |
    grep -q "true" || return 1

  return 0
}
# Print the last 20 lines of a container's combined stdout/stderr log.
# Arguments: $1 - container name
get_container_logs() {
  local name="$1"
  docker logs "$name" 2>&1 | tail -n 20
}
################################################################################
# Test Cases
################################################################################
# Test case: config mounted read-only at /etc/kreuzberg/kreuzberg.toml.
# Arguments: $1 - variant name (selects the image via get_image_name)
# Returns:   1 on the first failed step (after recording it via fail_test);
#            otherwise the status of pass_test
test_etc_kreuzberg_mount() {
  local variant="$1"
  start_test "Volume mount to /etc/kreuzberg/kreuzberg.toml (variant: $variant)"

  local image port container_name config_file
  image="$(get_image_name "$variant")"
  # start_test has already bumped TOTAL_TESTS, so every test gets its own port.
  port=$((PORT_BASE + TOTAL_TESTS))
  container_name="kreuzberg-config-test-etc-${variant}-$$"
  config_file="$TEST_TEMP_DIR/kreuzberg.toml"

  # Create config file
  create_toml_config "$config_file" "$port"

  # Run container with mount
  run_container "$container_name" "$image" "$port" \
    --volume "$config_file:/etc/kreuzberg/kreuzberg.toml:ro" || {
    fail_test "Failed to start container with /etc/kreuzberg mount"
    log_error " Container logs:\n$(get_container_logs "$container_name" 2>/dev/null || echo 'N/A')"
    return 1
  }

  sleep 2

  # Check if container is still running
  check_container_running "$container_name" || {
    fail_test "Container exited unexpectedly"
    log_error " Container logs:\n$(get_container_logs "$container_name")"
    return 1
  }

  # Wait for service to be healthy
  wait_for_health "$port" || {
    fail_test "Service failed to start (health check timeout)"
    log_error " Container logs:\n$(get_container_logs "$container_name")"
    docker stop "$container_name" 2>/dev/null || true
    return 1
  }

  # Test the health endpoint
  curl -sf "http://localhost:$port/health" >/dev/null || {
    fail_test "Health endpoint returned non-success status"
    docker stop "$container_name" 2>/dev/null || true
    return 1
  }

  log_success "Service is running and healthy"
  docker stop "$container_name" 2>/dev/null || true
  pass_test
}
# Test case: config mounted read-only at /app/.config/kreuzberg/config.toml.
# Arguments: $1 - variant name (selects the image via get_image_name)
# Returns:   1 on the first failed step (after recording it via fail_test);
#            otherwise the status of pass_test
test_app_config_mount() {
  local variant="$1"
  start_test "Volume mount to /app/.config/kreuzberg/config.toml (variant: $variant)"

  local image port container_name config_file
  image="$(get_image_name "$variant")"
  port=$((PORT_BASE + TOTAL_TESTS))
  container_name="kreuzberg-config-test-app-config-${variant}-$$"
  config_file="$TEST_TEMP_DIR/config.toml"

  # Create config file
  create_toml_config "$config_file" "$port"

  # Run container with mount
  run_container "$container_name" "$image" "$port" \
    --volume "$config_file:/app/.config/kreuzberg/config.toml:ro" || {
    fail_test "Failed to start container with /app/.config mount"
    log_error " Container logs:\n$(get_container_logs "$container_name" 2>/dev/null || echo 'N/A')"
    return 1
  }

  sleep 2

  check_container_running "$container_name" || {
    fail_test "Container exited unexpectedly"
    log_error " Container logs:\n$(get_container_logs "$container_name")"
    return 1
  }

  wait_for_health "$port" || {
    fail_test "Service failed to start (health check timeout)"
    log_error " Container logs:\n$(get_container_logs "$container_name")"
    docker stop "$container_name" 2>/dev/null || true
    return 1
  }

  curl -sf "http://localhost:$port/health" >/dev/null || {
    fail_test "Health endpoint returned non-success status"
    docker stop "$container_name" 2>/dev/null || true
    return 1
  }

  log_success "Service is running and healthy"
  docker stop "$container_name" 2>/dev/null || true
  pass_test
}
# Test case: config at a custom in-container path, passed via `--config`.
# The image entrypoint is replaced with /usr/local/bin/kreuzberg and the server
# is started explicitly with `serve --config <path> --host 0.0.0.0`.
# Arguments: $1 - variant name (selects the image via get_image_name)
# Returns:   1 on the first failed step (after recording it via fail_test);
#            otherwise the status of pass_test
test_custom_path_with_flag() {
  local variant="$1"
  start_test "Custom path with --config flag (variant: $variant)"

  local image port container_name config_file container_path
  image="$(get_image_name "$variant")"
  port=$((PORT_BASE + TOTAL_TESTS))
  container_name="kreuzberg-config-test-custom-${variant}-$$"
  config_file="$TEST_TEMP_DIR/custom-config.toml"
  container_path="/app/custom-config.toml"

  # Create config file
  create_toml_config "$config_file" "$port"

  # Run container with custom config path
  run_container "$container_name" "$image" "$port" \
    --volume "$config_file:$container_path:ro" \
    --entrypoint "/usr/local/bin/kreuzberg" \
    -- "serve" "--config" "$container_path" "--host" "0.0.0.0" || {
    fail_test "Failed to start container with custom --config flag"
    log_error " Container logs:\n$(get_container_logs "$container_name" 2>/dev/null || echo 'N/A')"
    return 1
  }

  sleep 2

  check_container_running "$container_name" || {
    fail_test "Container exited unexpectedly"
    log_error " Container logs:\n$(get_container_logs "$container_name")"
    return 1
  }

  wait_for_health "$port" || {
    fail_test "Service failed to start (health check timeout)"
    log_error " Container logs:\n$(get_container_logs "$container_name")"
    docker stop "$container_name" 2>/dev/null || true
    return 1
  }

  curl -sf "http://localhost:$port/health" >/dev/null || {
    fail_test "Health endpoint returned non-success status"
    docker stop "$container_name" 2>/dev/null || true
    return 1
  }

  log_success "Service is running and healthy with custom config path"
  docker stop "$container_name" 2>/dev/null || true
  pass_test
}
# Test case: a mounted config file combined with an environment variable
# override (KREUZBERG_SERVER_PORT).
# Arguments: $1 - variant name (selects the image via get_image_name)
# Returns:   1 on the first failed step (after recording it via fail_test);
#            otherwise the status of pass_test
test_env_var_overrides() {
  local variant="$1"
  start_test "Environment variable overrides with config file (variant: $variant)"

  local image port container_name config_file
  image="$(get_image_name "$variant")"
  port=$((PORT_BASE + TOTAL_TESTS))
  container_name="kreuzberg-config-test-env-${variant}-$$"
  config_file="$TEST_TEMP_DIR/env-config.toml"

  # run_container always publishes "<host port>:8000", so the override must
  # keep the server on container port 8000. Setting it to the host port would
  # move the listener off the published port whenever the variable is honoured,
  # and the health check could then never succeed.
  local -r container_port=8000

  # Create config file (the port argument is not written into the config)
  create_toml_config "$config_file" "$container_port"

  # Run container with config mount and environment variable override
  if ! run_container "$container_name" "$image" "$port" \
    --volume "$config_file:/etc/kreuzberg/kreuzberg.toml:ro" \
    --env "KREUZBERG_SERVER_PORT=$container_port"; then
    fail_test "Failed to start container with env var override"
    log_error " Container logs:\n$(get_container_logs "$container_name" 2>/dev/null || echo 'N/A')"
    return 1
  fi

  sleep 2

  if ! check_container_running "$container_name"; then
    fail_test "Container exited unexpectedly"
    log_error " Container logs:\n$(get_container_logs "$container_name")"
    return 1
  fi

  if ! wait_for_health "$port"; then
    fail_test "Service failed to start (health check timeout)"
    log_error " Container logs:\n$(get_container_logs "$container_name")"
    docker stop "$container_name" 2>/dev/null || true
    return 1
  fi

  if ! curl -sf "http://localhost:$port/health" >/dev/null; then
    fail_test "Health endpoint returned non-success status"
    docker stop "$container_name" 2>/dev/null || true
    return 1
  fi

  log_success "Service is running with environment variable overrides"
  docker stop "$container_name" 2>/dev/null || true
  pass_test
}
# Test case: the service starts with a TOML config at /etc/kreuzberg/kreuzberg.toml.
# Arguments: $1 - variant name (selects the image via get_image_name)
# Returns:   1 when the container fails to start or never becomes healthy;
#            otherwise the status of pass_test
test_toml_format() {
  local variant="$1"
  start_test "TOML config format (variant: $variant)"

  local image port container_name config_file
  image="$(get_image_name "$variant")"
  port=$((PORT_BASE + TOTAL_TESTS))
  container_name="kreuzberg-config-test-toml-${variant}-$$"
  config_file="$TEST_TEMP_DIR/config.toml"

  create_toml_config "$config_file" "$port"

  run_container "$container_name" "$image" "$port" \
    --volume "$config_file:/etc/kreuzberg/kreuzberg.toml:ro" || {
    fail_test "Failed to start container with TOML config"
    return 1
  }

  sleep 2

  wait_for_health "$port" || {
    fail_test "Service failed to start with TOML config"
    docker stop "$container_name" 2>/dev/null || true
    return 1
  }

  log_success "TOML config format works correctly"
  docker stop "$container_name" 2>/dev/null || true
  pass_test
}
# Test case: the service starts with a YAML config at /etc/kreuzberg/kreuzberg.yaml.
# Arguments: $1 - variant name (selects the image via get_image_name)
# Returns:   1 when the container fails to start or never becomes healthy;
#            otherwise the status of pass_test
test_yaml_format() {
  local variant="$1"
  start_test "YAML config format (variant: $variant)"

  local image port container_name config_file
  image="$(get_image_name "$variant")"
  port=$((PORT_BASE + TOTAL_TESTS))
  container_name="kreuzberg-config-test-yaml-${variant}-$$"
  config_file="$TEST_TEMP_DIR/config.yaml"

  create_yaml_config "$config_file" "$port"

  run_container "$container_name" "$image" "$port" \
    --volume "$config_file:/etc/kreuzberg/kreuzberg.yaml:ro" || {
    fail_test "Failed to start container with YAML config"
    return 1
  }

  sleep 2

  wait_for_health "$port" || {
    fail_test "Service failed to start with YAML config"
    docker stop "$container_name" 2>/dev/null || true
    return 1
  }

  log_success "YAML config format works correctly"
  docker stop "$container_name" 2>/dev/null || true
  pass_test
}
# Test case: the service starts with a JSON config at /etc/kreuzberg/kreuzberg.json.
# Arguments: $1 - variant name (selects the image via get_image_name)
# Returns:   1 when the container fails to start or never becomes healthy;
#            otherwise the status of pass_test
test_json_format() {
  local variant="$1"
  start_test "JSON config format (variant: $variant)"

  local image port container_name config_file
  image="$(get_image_name "$variant")"
  port=$((PORT_BASE + TOTAL_TESTS))
  container_name="kreuzberg-config-test-json-${variant}-$$"
  config_file="$TEST_TEMP_DIR/config.json"

  create_json_config "$config_file" "$port"

  run_container "$container_name" "$image" "$port" \
    --volume "$config_file:/etc/kreuzberg/kreuzberg.json:ro" || {
    fail_test "Failed to start container with JSON config"
    return 1
  }

  sleep 2

  wait_for_health "$port" || {
    fail_test "Service failed to start with JSON config"
    docker stop "$container_name" 2>/dev/null || true
    return 1
  }

  log_success "JSON config format works correctly"
  docker stop "$container_name" 2>/dev/null || true
  pass_test
}
# Verify that the service starts and stays up when its TOML configuration is
# bind-mounted read-only (explicit :ro) at /etc/kreuzberg/kreuzberg.toml.
# Globals:   PORT_BASE, TOTAL_TESTS, TEST_TEMP_DIR (read)
# Arguments: $1 - image variant passed to get_image_name
# Returns:   1 if the container cannot be started, is no longer running after
#            the grace period, or never reports healthy; otherwise the status
#            of pass_test
test_readonly_mount() {
  local variant="$1"
  start_test "Read-only mount (variant: $variant)"

  local image
  image="$(get_image_name "$variant")"
  # The host port is offset by the running test counter.
  local port=$((PORT_BASE + TOTAL_TESTS))
  local container_name="kreuzberg-config-test-readonly-${variant}-$$"
  local config_file="$TEST_TEMP_DIR/readonly-config.toml"
  create_toml_config "$config_file" "$port"

  # Run with read-only mount (explicitly :ro)
  if ! run_container "$container_name" "$image" "$port" \
    --volume "$config_file:/etc/kreuzberg/kreuzberg.toml:ro"; then
    fail_test "Failed to start container with read-only mount"
    return 1
  fi

  sleep 2

  if ! check_container_running "$container_name"; then
    fail_test "Container exited unexpectedly with read-only mount"
    # Stop here as well, matching every other failure path after a
    # successful start: if the check failed for a reason other than the
    # container having exited, it must not be left holding the port.
    # Stopping is best-effort; errors are deliberately ignored.
    docker stop "$container_name" 2>/dev/null || true
    return 1
  fi

  if ! wait_for_health "$port"; then
    fail_test "Service failed to start with read-only mount"
    docker stop "$container_name" 2>/dev/null || true
    return 1
  fi

  log_success "Read-only mount works correctly"
  docker stop "$container_name" 2>/dev/null || true
  pass_test
}
################################################################################
# Test Execution
################################################################################
# Run every config-mount test case against one image variant.
# Globals:   TESTED_VARIANTS (appended to when the image exists)
# Arguments: $1 - image variant passed to get_image_name
# Returns:   0 always. Individual test outcomes are not reflected in the
#            return status; the visible caller decides the exit code from
#            FAILED_TESTS instead.
run_test_suite() {
  local variant="$1"
  local image
  image="$(get_image_name "$variant")"

  log_header "Testing variant: $image"

  # Check if image exists
  if ! check_image_exists "$image"; then
    log_warning "Skipping tests for variant: $variant (image not found)"
    return 0
  fi

  TESTED_VARIANTS+=("$variant")

  # Run all test cases. Each test function returns non-zero on failure, so
  # the calls are guarded: a failing case must not abort the remaining cases
  # (or the summary) if the script is running with errexit enabled.
  local test_case
  for test_case in \
    test_etc_kreuzberg_mount \
    test_app_config_mount \
    test_custom_path_with_flag \
    test_env_var_overrides \
    test_toml_format \
    test_yaml_format \
    test_json_format \
    test_readonly_mount; do
    "$test_case" "$variant" || true
  done

  return 0
}
# Print the aggregated results: totals, integer pass rate, the names of the
# failed tests (if any) and the image names of the variants that were tested.
# Globals:   TOTAL_TESTS, PASSED_TESTS, FAILED_TESTS, FAILED_TEST_NAMES,
#            TESTED_VARIANTS, CYAN, GREEN, RED, BLUE, NC (all read)
# Outputs:   the summary on stdout
print_summary() {
  log_header "Test Summary"

  # Integer percentage; stays 0 when no tests ran to avoid dividing by zero.
  local pass_rate=0
  if [ "$TOTAL_TESTS" -gt 0 ]; then
    pass_rate=$((PASSED_TESTS * 100 / TOTAL_TESTS))
  fi

  echo -e "Total Tests: ${CYAN}$TOTAL_TESTS${NC}"
  echo -e "Passed Tests: ${GREEN}$PASSED_TESTS${NC}"
  echo -e "Failed Tests: ${RED}$FAILED_TESTS${NC}"
  echo -e "Pass Rate: ${BLUE}${pass_rate}%${NC}"
  echo ""

  # Loop variables are local so they do not leak into (or clobber) globals.
  local test_name variant

  if [ "$FAILED_TESTS" -gt 0 ]; then
    echo -e "${RED}Failed Tests:${NC}"
    for test_name in "${FAILED_TEST_NAMES[@]}"; do
      echo " - $test_name"
    done
    echo ""
  fi

  if [ "${#TESTED_VARIANTS[@]}" -gt 0 ]; then
    echo -e "${CYAN}Tested Variants:${NC}"
    for variant in "${TESTED_VARIANTS[@]}"; do
      echo " - $(get_image_name "$variant")"
    done
    echo ""
  fi
}
################################################################################
# Main Entry Point
################################################################################
# Entry point: parse options, run the selected variant suite(s), print the
# summary and exit.
# Globals:   TEST_VARIANT, IMAGE_NAME, VERBOSE, KEEP_CONTAINERS (written by
#            option parsing); PORT_BASE, FAILED_TESTS (read)
# Arguments: command-line options, see --help
# Exits:     0 when no test failed (or after --help); 1 on an unknown option,
#            an option missing its value, an invalid variant, or any failed
#            test
main() {
  # Parse command line arguments
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --variant)
        # Without this guard a trailing --variant reads an unset $2, and
        # `shift 2` fails without shifting, so the loop would never advance.
        if [[ $# -lt 2 ]]; then
          log_error "Option --variant requires an argument"
          exit 1
        fi
        TEST_VARIANT="$2"
        shift 2
        ;;
      --image)
        if [[ $# -lt 2 ]]; then
          log_error "Option --image requires an argument"
          exit 1
        fi
        IMAGE_NAME="$2"
        shift 2
        ;;
      --verbose)
        VERBOSE=true
        shift
        ;;
      --keep-containers)
        KEEP_CONTAINERS=true
        shift
        ;;
      --help)
        echo "Usage: $0 [OPTIONS]"
        echo ""
        echo "Options:"
        echo " --variant VARIANT Test specific variant (core, full, or all) [default: all]"
        echo " --image IMAGE Use pre-built image instead of building [default: build from Dockerfile]"
        echo " --verbose Enable verbose output"
        echo " --keep-containers Don't cleanup containers after tests"
        echo " --help Show this help message"
        exit 0
        ;;
      *)
        log_error "Unknown option: $1"
        exit 1
        ;;
    esac
  done

  log_header "Docker Configuration Volume Mount Test Suite"
  log_info "Configuration:"
  log_info " Variant: $TEST_VARIANT"
  log_info " Verbose: $VERBOSE"
  log_info " Keep Containers: $KEEP_CONTAINERS"
  log_info " Port Range: $PORT_BASE-$((PORT_BASE + 99))"
  log_info ""

  # Verify Docker is available
  verify_docker_available

  # Setup test environment
  setup_test_environment

  # Run tests based on variant selection
  case "$TEST_VARIANT" in
    core)
      run_test_suite "core"
      ;;
    full)
      run_test_suite "full"
      ;;
    all)
      run_test_suite "core"
      run_test_suite "full"
      ;;
    *)
      log_error "Invalid variant: $TEST_VARIANT (must be 'core', 'full', or 'all')"
      exit 1
      ;;
  esac

  # Print summary
  print_summary

  # Exit with appropriate code
  if [ "$FAILED_TESTS" -eq 0 ]; then
    log_success "All tests passed!"
    exit 0
  else
    log_error "Some tests failed"
    exit 1
  fi
}
# Script entry point: forward all command-line arguments to main unchanged.
main "$@"