This commit is contained in:
10
.github/CODEOWNERS
vendored
Normal file
10
.github/CODEOWNERS
vendored
Normal file
@@ -0,0 +1,10 @@
|
||||
# Default owner — everything
|
||||
* @Goldziher
|
||||
|
||||
# Zensical config and documentation
|
||||
/zensical.toml @Goldziher @pratik-mahalle @v-tan
|
||||
/docs/ @Goldziher @pratik-mahalle @v-tan
|
||||
*.md @Goldziher @pratik-mahalle @v-tan
|
||||
|
||||
# Rust crates
|
||||
/crates/ @Goldziher @kh3rld
|
||||
28
.github/ISSUE_TEMPLATE/bug_report.yml
vendored
Normal file
28
.github/ISSUE_TEMPLATE/bug_report.yml
vendored
Normal file
@@ -0,0 +1,28 @@
|
||||
name: Bug Report
|
||||
description: Report a bug or unexpected behavior
|
||||
title: "bug: "
|
||||
labels: ["bug"]
|
||||
projects: ["kreuzberg-dev/1"]
|
||||
body:
|
||||
- type: textarea
|
||||
id: description
|
||||
attributes:
|
||||
label: Description
|
||||
description: What happened? What did you expect to happen?
|
||||
validations:
|
||||
required: true
|
||||
- type: textarea
|
||||
id: steps-to-reproduce
|
||||
attributes:
|
||||
label: Steps to reproduce
|
||||
description: Minimal steps to reproduce the issue.
|
||||
validations:
|
||||
required: true
|
||||
- type: textarea
|
||||
id: reproduction-files
|
||||
attributes:
|
||||
label: Relevant files and configuration
|
||||
description: >-
|
||||
Any configuration files, input files, or code snippets needed to
|
||||
reproduce the issue.
|
||||
render: text
|
||||
1
.github/ISSUE_TEMPLATE/config.yml
vendored
Normal file
1
.github/ISSUE_TEMPLATE/config.yml
vendored
Normal file
@@ -0,0 +1 @@
|
||||
blank_issues_enabled: true
|
||||
20
.github/ISSUE_TEMPLATE/documentation.yml
vendored
Normal file
20
.github/ISSUE_TEMPLATE/documentation.yml
vendored
Normal file
@@ -0,0 +1,20 @@
|
||||
name: Documentation Issue
|
||||
description: Report missing, unclear, or incorrect documentation
|
||||
title: "docs: "
|
||||
labels: ["documentation"]
|
||||
projects: ["kreuzberg-dev/1"]
|
||||
body:
|
||||
- type: textarea
|
||||
id: what
|
||||
attributes:
|
||||
label: What
|
||||
description: What documentation is missing, unclear, or incorrect?
|
||||
validations:
|
||||
required: true
|
||||
- type: textarea
|
||||
id: why
|
||||
attributes:
|
||||
label: Why
|
||||
description: Why does this need to change?
|
||||
validations:
|
||||
required: true
|
||||
18
.github/ISSUE_TEMPLATE/feature_request.yml
vendored
Normal file
18
.github/ISSUE_TEMPLATE/feature_request.yml
vendored
Normal file
@@ -0,0 +1,18 @@
|
||||
name: Feature Request
|
||||
description: Suggest a new feature or improvement
|
||||
title: "feat: "
|
||||
labels: ["enhancement"]
|
||||
projects: ["kreuzberg-dev/1"]
|
||||
body:
|
||||
- type: textarea
|
||||
id: what
|
||||
attributes:
|
||||
label: What is the proposed feature?
|
||||
validations:
|
||||
required: true
|
||||
- type: textarea
|
||||
id: why
|
||||
attributes:
|
||||
label: Why would this be a good addition?
|
||||
validations:
|
||||
required: true
|
||||
12
.github/PULL_REQUEST_TEMPLATE.md
vendored
Normal file
12
.github/PULL_REQUEST_TEMPLATE.md
vendored
Normal file
@@ -0,0 +1,12 @@
|
||||
## Related
|
||||
|
||||
<!-- Link issues or discussions if applicable -->
|
||||
|
||||
## Description
|
||||
|
||||
<!-- What does this PR do? -->
|
||||
|
||||
## Checklist
|
||||
|
||||
- [ ] CI passing
|
||||
- [ ] Tests added where applicable
|
||||
9
.github/actionlint.yaml
vendored
Normal file
9
.github/actionlint.yaml
vendored
Normal file
@@ -0,0 +1,9 @@
|
||||
self-hosted-runner:
|
||||
labels:
|
||||
- runner-small
|
||||
- runner-medium
|
||||
- runner-medium-arm64
|
||||
- runner-large
|
||||
- runner-large-spot
|
||||
- runner-medium-arm64-spot
|
||||
- runner-gpu-l4
|
||||
313
.github/actions/cache-benchmark-harness/action.yml
vendored
Normal file
313
.github/actions/cache-benchmark-harness/action.yml
vendored
Normal file
@@ -0,0 +1,313 @@
|
||||
name: Cache Benchmark Harness Binary
|
||||
description: >
|
||||
Build and cache the benchmark-harness binary with intelligent caching based on source hashes.
|
||||
Generates cache keys based on harness source + kreuzberg dependency + Cargo files,
|
||||
restores from cache if available, builds if needed, and saves to cache.
|
||||
Validates artifacts after restore or build to ensure integrity.
|
||||
|
||||
inputs:
|
||||
cache-version:
|
||||
description: "Manual version for cache invalidation"
|
||||
required: false
|
||||
default: "v1"
|
||||
|
||||
build-profile:
|
||||
description: "Build profile (release, debug)"
|
||||
required: false
|
||||
default: "release"
|
||||
|
||||
outputs:
|
||||
cache-hit:
|
||||
description: "Boolean indicating exact cache hit"
|
||||
value: ${{ steps.cache-restore.outputs.cache-hit }}
|
||||
|
||||
cache-key:
|
||||
description: "The cache key used"
|
||||
value: ${{ steps.generate-cache-key.outputs.cache-key }}
|
||||
|
||||
binary-path:
|
||||
description: "Path to the built/cached benchmark-harness binary"
|
||||
value: ${{ steps.validate-binary.outputs.binary-path }}
|
||||
|
||||
runs:
|
||||
using: composite
|
||||
steps:
|
||||
# Validate inputs
|
||||
- name: Validate inputs
|
||||
shell: bash
|
||||
env:
|
||||
BUILD_PROFILE: ${{ inputs.build-profile }}
|
||||
run: |
|
||||
set -euo pipefail
|
||||
|
||||
# Validate build profile
|
||||
valid_profiles=("release" "debug")
|
||||
# Compare exactly: a substring match against the joined list would also accept values such as "release debug".
if [[ "$BUILD_PROFILE" != "release" && "$BUILD_PROFILE" != "debug" ]]; then
|
||||
echo "❌ Error: build-profile must be one of: ${valid_profiles[*]}"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo "✓ Validation passed"
|
||||
echo " Build profile: $BUILD_PROFILE"
|
||||
echo " Cache version: ${{ inputs.cache-version }}"
|
||||
|
||||
# Compute hash for benchmark-harness sources
|
||||
- name: Compute benchmark-harness source hash
|
||||
id: harness-hash
|
||||
shell: bash
|
||||
run: |
|
||||
set -euo pipefail
|
||||
|
||||
echo "=== Computing Benchmark Harness Source Hash ==="
|
||||
|
||||
# Compute hash for harness source files and Cargo.toml
|
||||
HARNESS_HASH=$(scripts/ci/cache/compute-hash.sh \
|
||||
"tools/benchmark-harness/src/**" \
|
||||
"tools/benchmark-harness/Cargo.toml" \
|
||||
2>&1 | grep -E '^[a-f0-9]+$')
|
||||
|
||||
if [[ -z "$HARNESS_HASH" ]]; then
|
||||
echo "❌ Failed to compute harness source hash"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo "harness-hash=$HARNESS_HASH" >> "$GITHUB_OUTPUT"
|
||||
echo "✓ Harness source hash: $HARNESS_HASH"
|
||||
|
||||
# Compute hash for kreuzberg dependency
|
||||
- name: Compute kreuzberg dependency hash
|
||||
id: kreuzberg-hash
|
||||
shell: bash
|
||||
run: |
|
||||
set -euo pipefail
|
||||
|
||||
echo "=== Computing Kreuzberg Dependency Hash ==="
|
||||
|
||||
# Compute hash for kreuzberg crate (dependency)
|
||||
KREUZBERG_HASH=$(scripts/ci/cache/compute-hash.sh --dirs \
|
||||
"crates/kreuzberg" \
|
||||
2>&1 | grep -E '^[a-f0-9]+$')
|
||||
|
||||
if [[ -z "$KREUZBERG_HASH" ]]; then
|
||||
echo "❌ Failed to compute kreuzberg dependency hash"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo "kreuzberg-hash=$KREUZBERG_HASH" >> "$GITHUB_OUTPUT"
|
||||
echo "✓ Kreuzberg dependency hash: $KREUZBERG_HASH"
|
||||
|
||||
# Compute hash for Cargo files
|
||||
- name: Compute Cargo files hash
|
||||
id: cargo-hash
|
||||
shell: bash
|
||||
run: |
|
||||
set -euo pipefail
|
||||
|
||||
echo "=== Computing Cargo Files Hash ==="
|
||||
|
||||
# Compute hash for Cargo.lock
|
||||
# Require at least one hex digit so blank lines in the script output are never captured as the hash.
CARGO_HASH=$(scripts/ci/cache/compute-hash.sh --files Cargo.lock 2>&1 | grep -E '^[a-f0-9]+$')
|
||||
|
||||
if [[ -z "$CARGO_HASH" ]]; then
|
||||
echo "❌ Failed to compute Cargo files hash"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo "cargo-hash=$CARGO_HASH" >> "$GITHUB_OUTPUT"
|
||||
echo "✓ Cargo files hash: $CARGO_HASH"
|
||||
|
||||
# Generate cache key
|
||||
- name: Generate cache key
|
||||
id: generate-cache-key
|
||||
shell: bash
|
||||
env:
|
||||
BUILD_PROFILE: ${{ inputs.build-profile }}
|
||||
HARNESS_HASH: ${{ steps.harness-hash.outputs.harness-hash }}
|
||||
KREUZBERG_HASH: ${{ steps.kreuzberg-hash.outputs.kreuzberg-hash }}
|
||||
CARGO_HASH: ${{ steps.cargo-hash.outputs.cargo-hash }}
|
||||
CACHE_VERSION: ${{ inputs.cache-version }}
|
||||
run: |
|
||||
set -euo pipefail
|
||||
|
||||
echo "=== Cache Key Generated ==="
|
||||
|
||||
# Build cache key following format:
|
||||
# harness-{profile}-{arch}-src-{harness-hash}-kreuzberg-{kreuzberg-hash}-cargo-{cargo-hash}-{cache-version} (cache-version already carries its "v" prefix, e.g. "v1")
|
||||
CACHE_KEY="harness-${BUILD_PROFILE}-$(uname -m)-src-${HARNESS_HASH}-kreuzberg-${KREUZBERG_HASH}-cargo-${CARGO_HASH}-${CACHE_VERSION}"
|
||||
|
||||
echo "cache-key=$CACHE_KEY" >> "$GITHUB_OUTPUT"
|
||||
|
||||
echo "Full key: $CACHE_KEY"
|
||||
echo ""
|
||||
echo "Key components:"
|
||||
echo " Profile: $BUILD_PROFILE"
|
||||
echo " Platform: $(uname -m)"
|
||||
echo " Harness hash: $HARNESS_HASH"
|
||||
echo " Kreuzberg hash: $KREUZBERG_HASH"
|
||||
echo " Cargo hash: $CARGO_HASH"
|
||||
echo " Cache version: $CACHE_VERSION"
|
||||
|
||||
# Determine target path based on profile
|
||||
- name: Determine target paths
|
||||
id: target-paths
|
||||
shell: bash
|
||||
env:
|
||||
BUILD_PROFILE: ${{ inputs.build-profile }}
|
||||
run: |
|
||||
set -euo pipefail
|
||||
|
||||
echo "=== Determining Target Paths ==="
|
||||
|
||||
case "$BUILD_PROFILE" in
|
||||
release)
|
||||
TARGET_DIR="target/release"
|
||||
;;
|
||||
debug)
|
||||
TARGET_DIR="target/debug"
|
||||
;;
|
||||
*)
|
||||
echo "❌ Invalid build profile: $BUILD_PROFILE"
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
|
||||
echo "target-dir=$TARGET_DIR" >> "$GITHUB_OUTPUT"
|
||||
echo "✓ Target directory: $TARGET_DIR"
|
||||
|
||||
# Detect architecture for cache keys (shell expansion doesn't work in YAML with: context)
|
||||
- name: Detect architecture
|
||||
id: detect-arch
|
||||
shell: bash
|
||||
run: echo "arch=$(uname -m)" >> "$GITHUB_OUTPUT"
|
||||
|
||||
# Restore from cache
|
||||
- name: Restore benchmark-harness binary from cache
|
||||
id: cache-restore
|
||||
uses: kreuzberg-dev/actions/cache-binding-artifact@v1
|
||||
with:
|
||||
binding-name: benchmark-harness
|
||||
cache-key: ${{ steps.generate-cache-key.outputs.cache-key }}
|
||||
cache-restore-keys: |
|
||||
harness-${{ inputs.build-profile }}-${{ steps.detect-arch.outputs.arch }}-src-
|
||||
harness-${{ inputs.build-profile }}-${{ steps.detect-arch.outputs.arch }}-
|
||||
harness-${{ inputs.build-profile }}-
|
||||
cache-paths: |
|
||||
${{ steps.target-paths.outputs.target-dir }}/benchmark-harness
|
||||
operation: restore
|
||||
|
||||
# Log cache hit status
|
||||
- name: Log cache hit status
|
||||
shell: bash
|
||||
run: |
|
||||
set -euo pipefail
|
||||
|
||||
if [[ "${{ steps.cache-restore.outputs.cache-hit }}" == "true" ]]; then
|
||||
echo "✓ Cache HIT - benchmark-harness binary found in cache"
|
||||
else
|
||||
echo "✗ Cache MISS - Building benchmark-harness from source"
|
||||
fi
|
||||
|
||||
# Build if cache miss
|
||||
- name: Build benchmark-harness
|
||||
id: build
|
||||
if: steps.cache-restore.outputs.cache-hit != 'true'
|
||||
shell: bash
|
||||
env:
|
||||
BUILD_PROFILE: ${{ inputs.build-profile }}
|
||||
run: |
|
||||
set -euo pipefail
|
||||
|
||||
echo "=== Building Benchmark Harness ==="
|
||||
echo "Profile: $BUILD_PROFILE"
|
||||
|
||||
# Determine cargo build profile argument
|
||||
case "$BUILD_PROFILE" in
|
||||
release)
|
||||
BUILD_ARG="--release"
|
||||
;;
|
||||
debug)
|
||||
# Debug is default, no flag needed
|
||||
BUILD_ARG=""
|
||||
;;
|
||||
*)
|
||||
echo "❌ Invalid build profile: $BUILD_PROFILE"
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
|
||||
# Build benchmark-harness
|
||||
echo "Running: cargo build --manifest-path tools/benchmark-harness/Cargo.toml $BUILD_ARG"
|
||||
if ! cargo build --manifest-path tools/benchmark-harness/Cargo.toml $BUILD_ARG; then
|
||||
echo "❌ Build failed for benchmark-harness"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo "✓ Build succeeded"
|
||||
|
||||
# Validate binary exists and is executable
|
||||
- name: Validate benchmark-harness binary
|
||||
id: validate-binary
|
||||
shell: bash
|
||||
env:
|
||||
BUILD_PROFILE: ${{ inputs.build-profile }}
|
||||
TARGET_DIR: ${{ steps.target-paths.outputs.target-dir }}
|
||||
run: |
|
||||
set -euo pipefail
|
||||
|
||||
echo "=== Validating Benchmark Harness Binary ==="
|
||||
|
||||
BINARY_PATH="${TARGET_DIR}/benchmark-harness"
|
||||
|
||||
# Check if binary exists
|
||||
if [[ ! -f "$BINARY_PATH" ]]; then
|
||||
echo "❌ Binary not found at: $BINARY_PATH"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Check if binary is executable
|
||||
if [[ ! -x "$BINARY_PATH" ]]; then
|
||||
echo "❌ Binary is not executable: $BINARY_PATH"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Get binary size and info
|
||||
BINARY_SIZE=$(ls -lh "$BINARY_PATH" | awk '{print $5}')
|
||||
BINARY_PERMS=$(ls -l "$BINARY_PATH" | awk '{print $1}')
|
||||
|
||||
echo "binary-path=$BINARY_PATH" >> "$GITHUB_OUTPUT"
|
||||
|
||||
echo "✓ Binary validation passed"
|
||||
echo " Path: $BINARY_PATH"
|
||||
echo " Size: $BINARY_SIZE"
|
||||
echo " Permissions: $BINARY_PERMS"
|
||||
|
||||
# Save to cache if build occurred
|
||||
- name: Save benchmark-harness binary to cache
|
||||
if: steps.cache-restore.outputs.cache-hit != 'true'
|
||||
uses: kreuzberg-dev/actions/cache-binding-artifact@v1
|
||||
with:
|
||||
binding-name: benchmark-harness
|
||||
cache-key: ${{ steps.generate-cache-key.outputs.cache-key }}
|
||||
cache-paths: |
|
||||
${{ steps.target-paths.outputs.target-dir }}/benchmark-harness
|
||||
operation: save
|
||||
|
||||
# Summary
|
||||
- name: Summary
|
||||
if: always()
|
||||
shell: bash
|
||||
run: |
|
||||
set -euo pipefail
|
||||
|
||||
echo ""
|
||||
echo "=== Build and Cache Summary ==="
|
||||
echo "Build Profile: ${{ inputs.build-profile }}"
|
||||
echo "Platform: $(uname -m)"
|
||||
echo "Cache Hit: ${{ steps.cache-restore.outputs.cache-hit == 'true' && 'Yes' || 'No' }}"
|
||||
echo "Cache Key: ${{ steps.generate-cache-key.outputs.cache-key }}"
|
||||
echo "Binary Path: ${{ steps.validate-binary.outputs.binary-path }}"
|
||||
echo ""
|
||||
echo "Hashes:"
|
||||
echo " Harness: ${{ steps.harness-hash.outputs.harness-hash }}"
|
||||
echo " Kreuzberg: ${{ steps.kreuzberg-hash.outputs.kreuzberg-hash }}"
|
||||
echo " Cargo: ${{ steps.cargo-hash.outputs.cargo-hash }}"
|
||||
105
.github/actions/install-system-deps/action.yml
vendored
Normal file
105
.github/actions/install-system-deps/action.yml
vendored
Normal file
@@ -0,0 +1,105 @@
|
||||
name: Install System Dependencies
|
||||
description: |
|
||||
Install and cache platform-specific dependencies required for document conversion.
|
||||
Includes: Tesseract OCR, fonts, and build tools.
|
||||
Features robust caching with architecture/version awareness, timeout handling, and retry logic.
|
||||
|
||||
inputs:
|
||||
enable-retry:
|
||||
description: Enable retry logic with exponential backoff
|
||||
required: false
|
||||
default: "true"
|
||||
|
||||
runs:
|
||||
using: composite
|
||||
steps:
|
||||
- name: Detect Tesseract version (macOS)
|
||||
if: runner.os == 'macOS'
|
||||
id: detect-tesseract-macos
|
||||
shell: bash
|
||||
run: scripts/ci/install-system-deps/detect-tesseract-macos.sh
|
||||
|
||||
- name: Cache Tesseract & tessdata (macOS)
|
||||
if: runner.os == 'macOS'
|
||||
id: cache-tesseract-macos
|
||||
uses: actions/cache@v5
|
||||
with:
|
||||
path: |
|
||||
/usr/local/opt/tesseract/
|
||||
/usr/local/Cellar/tesseract/
|
||||
/opt/homebrew/opt/tesseract/
|
||||
/opt/homebrew/Cellar/tesseract/
|
||||
key: tesseract-macos-${{ runner.arch }}-v5-${{ steps.detect-tesseract-macos.outputs.version }}
|
||||
restore-keys: |
|
||||
tesseract-macos-${{ runner.arch }}-v5-
|
||||
tesseract-macos-${{ runner.arch }}-
|
||||
|
||||
- name: Install dependencies (macOS)
|
||||
if: runner.os == 'macOS'
|
||||
shell: bash
|
||||
run: scripts/ci/install-system-deps/install-macos.sh
|
||||
|
||||
- name: Detect Tesseract version (Linux)
|
||||
if: runner.os == 'Linux'
|
||||
id: detect-tesseract-linux
|
||||
shell: bash
|
||||
run: scripts/ci/install-system-deps/detect-tesseract-linux.sh
|
||||
|
||||
- name: Cache Tesseract data (Linux)
|
||||
if: runner.os == 'Linux'
|
||||
id: cache-tesseract-linux
|
||||
uses: actions/cache@v5
|
||||
with:
|
||||
path: |
|
||||
/usr/share/tesseract-ocr/5/tessdata/
|
||||
/usr/share/tesseract-ocr/tessdata/
|
||||
key: tesseract-linux-${{ runner.arch }}-v5-${{ steps.detect-tesseract-linux.outputs.version }}
|
||||
restore-keys: |
|
||||
tesseract-linux-${{ runner.arch }}-v5-
|
||||
tesseract-linux-${{ runner.arch }}-
|
||||
|
||||
- name: Install dependencies (Linux)
|
||||
if: runner.os == 'Linux'
|
||||
shell: bash
|
||||
run: scripts/ci/install-system-deps/install-linux.sh
|
||||
|
||||
- name: Cache Tesseract (Windows)
|
||||
if: runner.os == 'Windows'
|
||||
id: cache-tesseract-windows
|
||||
uses: actions/cache@v5
|
||||
with:
|
||||
path: |
|
||||
C:\Program Files\Tesseract-OCR
|
||||
C:\ProgramData\chocolatey\lib\tesseract
|
||||
key: tesseract-windows-${{ runner.arch }}-v5-data
|
||||
restore-keys: |
|
||||
tesseract-windows-${{ runner.arch }}-
|
||||
|
||||
- name: Cache LLVM (Windows)
|
||||
if: runner.os == 'Windows'
|
||||
id: cache-llvm-windows
|
||||
uses: actions/cache@v5
|
||||
with:
|
||||
path: |
|
||||
C:\Program Files\LLVM
|
||||
C:\ProgramData\chocolatey\lib\llvm
|
||||
key: llvm-windows-${{ runner.arch }}-v1
|
||||
|
||||
- name: Cache CMake (Windows)
|
||||
if: runner.os == 'Windows'
|
||||
id: cache-cmake-windows
|
||||
uses: actions/cache@v5
|
||||
with:
|
||||
path: |
|
||||
C:\Program Files\CMake
|
||||
C:\ProgramData\chocolatey\lib\cmake
|
||||
key: cmake-windows-${{ runner.arch }}-v1
|
||||
|
||||
- name: Install dependencies (Windows)
|
||||
if: runner.os == 'Windows'
|
||||
shell: pwsh
|
||||
env:
|
||||
TESSERACT_CACHE_HIT: ${{ steps.cache-tesseract-windows.outputs.cache-hit }}
|
||||
LLVM_CACHE_HIT: ${{ steps.cache-llvm-windows.outputs.cache-hit }}
|
||||
CMAKE_CACHE_HIT: ${{ steps.cache-cmake-windows.outputs.cache-hit }}
|
||||
run: pwsh -File scripts/ci/install-system-deps/install-windows.ps1
|
||||
197
.github/actions/setup-layout-models/action.yml
vendored
Normal file
197
.github/actions/setup-layout-models/action.yml
vendored
Normal file
@@ -0,0 +1,197 @@
|
||||
name: Setup Layout Detection Models Cache
|
||||
description: Download and cache layout detection ONNX models (RT-DETR + TATR) for CI testing
|
||||
|
||||
inputs:
|
||||
cache-enabled:
|
||||
description: Enable model caching (set to false for cross-arch builds)
|
||||
required: false
|
||||
default: "true"
|
||||
models:
|
||||
description: Comma-separated list of models to setup (rtdetr,tatr)
|
||||
required: false
|
||||
default: "rtdetr,tatr"
|
||||
cache-key-suffix:
|
||||
description: Suffix for cache key to differentiate model sets
|
||||
required: false
|
||||
default: "layout-models-v2"
|
||||
|
||||
outputs:
|
||||
cache-hit:
|
||||
description: Whether models were restored from cache (true/false)
|
||||
value: ${{ steps.cache-models.outputs.cache-hit }}
|
||||
cache-dir:
|
||||
description: Path to the layout model cache directory
|
||||
value: ${{ steps.set-outputs.outputs.cache-dir }}
|
||||
models-available:
|
||||
description: Comma-separated list of available models
|
||||
value: ${{ steps.verify-models.outputs.available-models }}
|
||||
|
||||
runs:
|
||||
using: composite
|
||||
steps:
|
||||
- name: Setup cache directory
|
||||
shell: bash
|
||||
run: |
|
||||
mkdir -p ~/.cache/kreuzberg/layout
|
||||
echo "Cache directory: $HOME/.cache/kreuzberg/layout"
|
||||
|
||||
- name: Restore layout models from cache
|
||||
if: inputs.cache-enabled == 'true'
|
||||
uses: actions/cache@v5
|
||||
id: cache-models
|
||||
with:
|
||||
path: ~/.cache/kreuzberg/layout
|
||||
key: ${{ inputs.cache-key-suffix }}-${{ runner.os }}-${{ runner.arch }}-rtdetr_3bf2fb0e+tatr_c11f4033
|
||||
restore-keys: |
|
||||
${{ inputs.cache-key-suffix }}-${{ runner.os }}-${{ runner.arch }}-
|
||||
${{ inputs.cache-key-suffix }}-${{ runner.os }}-
|
||||
${{ inputs.cache-key-suffix }}-
|
||||
|
||||
- name: Download RT-DETR model (rtdetr)
|
||||
if: contains(inputs.models, 'rtdetr') && steps.cache-models.outputs.cache-hit != 'true'
|
||||
shell: bash
|
||||
run: |
|
||||
MODEL_URL="https://huggingface.co/Kreuzberg/layout-models/resolve/main/rtdetr/model.onnx"
|
||||
CACHE_DIR="$HOME/.cache/kreuzberg/layout"
|
||||
MODEL_DIR="$CACHE_DIR/rtdetr"
|
||||
MODEL_FILE="$MODEL_DIR/model.onnx"
|
||||
|
||||
echo "Downloading RT-DETR layout detection model from $MODEL_URL"
|
||||
mkdir -p "$MODEL_DIR"
|
||||
|
||||
for attempt in 1 2 3; do
|
||||
if [ $attempt -gt 1 ]; then
|
||||
backoff=$((5 * 3 ** (attempt - 2)))
|
||||
echo "Retry attempt $attempt/3 after ${backoff}s backoff..."
|
||||
sleep $backoff
|
||||
fi
|
||||
|
||||
if curl -f -L --progress-bar --connect-timeout 30 --max-time 600 \
|
||||
-o "$MODEL_FILE" "$MODEL_URL"; then
|
||||
echo "RT-DETR model downloaded successfully ($(du -h "$MODEL_FILE" | cut -f1))"
|
||||
break
|
||||
fi
|
||||
done
|
||||
|
||||
if [ ! -f "$MODEL_FILE" ]; then
|
||||
echo "ERROR: Failed to download RT-DETR model after 3 attempts"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
- name: Verify RT-DETR SHA256
|
||||
if: contains(inputs.models, 'rtdetr') && steps.cache-models.outputs.cache-hit != 'true'
|
||||
shell: bash
|
||||
run: |
|
||||
MODEL_FILE="$HOME/.cache/kreuzberg/layout/rtdetr/model.onnx"
|
||||
EXPECTED="3bf2fb0ee6df87435b7ae47f0f3930ec3dc97ec56fd824acc6d57bc7a6b89ef2"
|
||||
|
||||
if command -v sha256sum &>/dev/null; then
|
||||
ACTUAL=$(sha256sum "$MODEL_FILE" | awk '{print $1}')
|
||||
else
|
||||
ACTUAL=$(shasum -a 256 "$MODEL_FILE" | awk '{print $1}')
|
||||
fi
|
||||
|
||||
if [ "$ACTUAL" != "$EXPECTED" ]; then
|
||||
echo "ERROR: RT-DETR SHA256 mismatch"
|
||||
echo " Expected: $EXPECTED"
|
||||
echo " Actual: $ACTUAL"
|
||||
rm -f "$MODEL_FILE"
|
||||
exit 1
|
||||
fi
|
||||
echo "RT-DETR SHA256 verified"
|
||||
|
||||
- name: Download TATR model (tatr)
|
||||
if: contains(inputs.models, 'tatr') && steps.cache-models.outputs.cache-hit != 'true'
|
||||
shell: bash
|
||||
run: |
|
||||
MODEL_URL="https://huggingface.co/Kreuzberg/layout-models/resolve/main/tatr/model.onnx"
|
||||
CACHE_DIR="$HOME/.cache/kreuzberg/layout"
|
||||
MODEL_DIR="$CACHE_DIR/tatr"
|
||||
MODEL_FILE="$MODEL_DIR/tatr.onnx"
|
||||
|
||||
echo "Downloading TATR table recognition model from $MODEL_URL"
|
||||
mkdir -p "$MODEL_DIR"
|
||||
|
||||
for attempt in 1 2 3; do
|
||||
if [ $attempt -gt 1 ]; then
|
||||
backoff=$((5 * 3 ** (attempt - 2)))
|
||||
echo "Retry attempt $attempt/3 after ${backoff}s backoff..."
|
||||
sleep $backoff
|
||||
fi
|
||||
|
||||
if curl -f -L --progress-bar --connect-timeout 30 --max-time 600 \
|
||||
-o "$MODEL_FILE" "$MODEL_URL"; then
|
||||
echo "TATR model downloaded successfully ($(du -h "$MODEL_FILE" | cut -f1))"
|
||||
break
|
||||
fi
|
||||
done
|
||||
|
||||
if [ ! -f "$MODEL_FILE" ]; then
|
||||
echo "ERROR: Failed to download TATR model after 3 attempts"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
- name: Verify TATR SHA256
|
||||
if: contains(inputs.models, 'tatr') && steps.cache-models.outputs.cache-hit != 'true'
|
||||
shell: bash
|
||||
run: |
|
||||
MODEL_FILE="$HOME/.cache/kreuzberg/layout/tatr/tatr.onnx"
|
||||
EXPECTED="c11f4033da75e9c4d41c403ef356e89caa0a37a7d111b55461e7d5ba856bb6b6"
|
||||
|
||||
if command -v sha256sum &>/dev/null; then
|
||||
ACTUAL=$(sha256sum "$MODEL_FILE" | awk '{print $1}')
|
||||
else
|
||||
ACTUAL=$(shasum -a 256 "$MODEL_FILE" | awk '{print $1}')
|
||||
fi
|
||||
|
||||
if [ "$ACTUAL" != "$EXPECTED" ]; then
|
||||
echo "ERROR: TATR SHA256 mismatch"
|
||||
echo " Expected: $EXPECTED"
|
||||
echo " Actual: $ACTUAL"
|
||||
rm -f "$MODEL_FILE"
|
||||
exit 1
|
||||
fi
|
||||
echo "TATR SHA256 verified"
|
||||
|
||||
- name: Verify downloaded models
|
||||
id: verify-models
|
||||
shell: bash
|
||||
run: |
|
||||
CACHE_DIR="$HOME/.cache/kreuzberg/layout"
|
||||
AVAILABLE_MODELS=()
|
||||
TOTAL_SIZE=0
|
||||
|
||||
echo "Checking for layout models in $CACHE_DIR"
|
||||
|
||||
if [ -f "$CACHE_DIR/rtdetr/model.onnx" ]; then
|
||||
SIZE=$(wc -c < "$CACHE_DIR/rtdetr/model.onnx" | tr -d ' ')
|
||||
AVAILABLE_MODELS+=("rtdetr")
|
||||
TOTAL_SIZE=$((TOTAL_SIZE + SIZE))
|
||||
echo " ✓ RT-DETR model: $(numfmt --to=iec-i --suffix=B $SIZE 2>/dev/null || echo $SIZE bytes)"
|
||||
fi
|
||||
|
||||
if [ -f "$CACHE_DIR/tatr/tatr.onnx" ]; then
|
||||
SIZE=$(wc -c < "$CACHE_DIR/tatr/tatr.onnx" | tr -d ' ')
|
||||
AVAILABLE_MODELS+=("tatr")
|
||||
TOTAL_SIZE=$((TOTAL_SIZE + SIZE))
|
||||
echo " ✓ TATR model: $(numfmt --to=iec-i --suffix=B $SIZE 2>/dev/null || echo $SIZE bytes)"
|
||||
fi
|
||||
|
||||
if [ ${#AVAILABLE_MODELS[@]} -eq 0 ]; then
|
||||
echo "ERROR: No layout models found in cache directory after download"
|
||||
echo "available-models=" >> "$GITHUB_OUTPUT"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
AVAILABLE_MODELS_STR=$(IFS=, ; echo "${AVAILABLE_MODELS[*]}")
|
||||
echo "✓ Total cached layout models: ${#AVAILABLE_MODELS[@]} ($(numfmt --to=iec-i --suffix=B $TOTAL_SIZE 2>/dev/null || echo $TOTAL_SIZE bytes))"
|
||||
echo "available-models=$AVAILABLE_MODELS_STR" >> "$GITHUB_OUTPUT"
|
||||
|
||||
- name: Set cache directory output
|
||||
id: set-outputs
|
||||
shell: bash
|
||||
run: |
|
||||
CACHE_DIR="$HOME/.cache/kreuzberg/layout"
|
||||
echo "cache-dir=$CACHE_DIR" >> "$GITHUB_OUTPUT"
|
||||
echo "KREUZBERG_CACHE_DIR=$HOME/.cache/kreuzberg" >> "$GITHUB_ENV"
|
||||
echo "Layout model cache configured at: $CACHE_DIR"
|
||||
46
.github/actions/setup-onnx-runtime/action.yml
vendored
Normal file
46
.github/actions/setup-onnx-runtime/action.yml
vendored
Normal file
@@ -0,0 +1,46 @@
|
||||
name: Setup ONNX Runtime
|
||||
description: Download and stage ONNX Runtime libraries for bindings
|
||||
inputs:
|
||||
ort-version:
|
||||
description: ONNX Runtime version to download
|
||||
required: true
|
||||
dest-dir:
|
||||
description: Directory (relative to workspace) where libraries should be copied
|
||||
required: false
|
||||
default: crates/kreuzberg-node
|
||||
arch-id:
|
||||
description: Override architecture (x64|arm64). Defaults to runner architecture.
|
||||
required: false
|
||||
default: ""
|
||||
strategy:
|
||||
description: "ORT linking strategy: 'system' (dynamic link, default) or 'bundled' (static link via ort-bundled cargo feature)"
|
||||
required: false
|
||||
default: system
|
||||
runs:
|
||||
using: composite
|
||||
steps:
|
||||
- name: Cache ONNX Runtime
|
||||
id: cache-onnx
|
||||
uses: actions/cache@v5
|
||||
with:
|
||||
path: |
|
||||
${{ runner.temp }}/onnxruntime
|
||||
key: onnx-v2-${{ runner.os }}-${{ inputs.arch-id != '' && inputs.arch-id || runner.arch }}-${{ inputs.ort-version }}
|
||||
restore-keys: |
|
||||
onnx-v2-${{ runner.os }}-${{ inputs.arch-id != '' && inputs.arch-id || runner.arch }}-
|
||||
onnx-v2-${{ runner.os }}-
|
||||
|
||||
- name: Prepare ONNX Runtime (Linux)
|
||||
if: runner.os == 'Linux'
|
||||
shell: bash
|
||||
run: scripts/ci/actions/setup-onnx-runtime/linux.sh "${{ inputs.ort-version }}" "${{ inputs.dest-dir }}" "${{ inputs.arch-id }}" "${{ inputs.strategy }}"
|
||||
|
||||
- name: Prepare ONNX Runtime (macOS)
|
||||
if: runner.os == 'macOS'
|
||||
shell: bash
|
||||
run: scripts/ci/actions/setup-onnx-runtime/macos.sh "${{ inputs.ort-version }}" "${{ inputs.dest-dir }}" "${{ inputs.arch-id }}" "${{ inputs.strategy }}"
|
||||
|
||||
- name: Prepare ONNX Runtime (Windows)
|
||||
if: runner.os == 'Windows'
|
||||
shell: pwsh
|
||||
run: scripts/ci/actions/setup-onnx-runtime/windows.ps1 "${{ inputs.ort-version }}" "${{ inputs.dest-dir }}" "${{ inputs.arch-id }}" "${{ inputs.strategy }}"
|
||||
202
.github/actions/setup-paddle-ocr-models/README.md
vendored
Normal file
202
.github/actions/setup-paddle-ocr-models/README.md
vendored
Normal file
@@ -0,0 +1,202 @@
|
||||
# Setup PaddleOCR Models Cache
|
||||
|
||||
GitHub Action to download and cache PaddleOCR ONNX models for CI testing and development.
|
||||
|
||||
## Overview
|
||||
|
||||
This action manages the setup of PaddleOCR PP-OCRv5 ONNX models used by the `kreuzberg-paddle-ocr` crate for optical character recognition testing. It:
|
||||
|
||||
- Downloads three model types (detection, classification, recognition) from Hugging Face
|
||||
- Caches models per OS and CPU architecture (Linux x86_64, Linux ARM64, macOS, Windows)
|
||||
- Provides environment variables for downstream use
|
||||
- Outputs cache hit status and available model information
|
||||
- Fails the job with a clear error if a requested model cannot be downloaded, and reports which models are available after setup
|
||||
|
||||
## Models
|
||||
|
||||
The action downloads pre-converted ONNX format models from the `Kreuzberg/paddleocr-onnx-models` Hugging Face repository:
|
||||
|
||||
| Model Type | File | Size | Purpose |
|
||||
| -------------------- | ------------------------------------- | ------- | ----------------------------------------- |
|
||||
| Detection (det) | `PP-OCRv5_server_det_infer.onnx` | ~84 MB | Text location detection (PP-OCRv5 server) |
|
||||
| Classification (cls) | `ch_ppocr_mobile_v2.0_cls_infer.onnx` | ~0.6 MB | Text orientation classification |
|
||||
| Recognition (rec) | `rec/english/model.onnx` | ~8 MB | Text character recognition (PP-OCRv5) |
|
||||
|
||||
**Total cache size: ~93 MB per OS/architecture combination**
|
||||
|
||||
## Usage
|
||||
|
||||
### Basic Usage
|
||||
|
||||
```yaml
|
||||
- uses: ./.github/actions/setup-paddle-ocr-models
|
||||
```
|
||||
|
||||
### With Custom Cache Suffix
|
||||
|
||||
```yaml
|
||||
- uses: ./.github/actions/setup-paddle-ocr-models
|
||||
with:
|
||||
cache-key-suffix: my-paddle-ocr-v5
|
||||
```
|
||||
|
||||
### Disable Caching
|
||||
|
||||
For cross-architecture builds where caching doesn't help:
|
||||
|
||||
```yaml
|
||||
- uses: ./.github/actions/setup-paddle-ocr-models
|
||||
with:
|
||||
cache-enabled: false
|
||||
```
|
||||
|
||||
### Download Specific Models Only
|
||||
|
||||
```yaml
|
||||
- uses: ./.github/actions/setup-paddle-ocr-models
|
||||
with:
|
||||
models: "det,rec" # Skip classification model
|
||||
```
|
||||
|
||||
## Inputs
|
||||
|
||||
| Name | Description | Required | Default |
|
||||
| ------------------ | --------------------------------------------------------------- | -------- | -------------------- |
|
||||
| `cache-enabled` | Enable model caching (set false for cross-arch builds) | No | `true` |
|
||||
| `models` | Comma-separated list of models to setup (det,cls,rec or subset) | No | `det,cls,rec` |
|
||||
| `cache-key-suffix` | Suffix for cache key to differentiate model sets | No | `paddle-ocr-v5-onnx` |
|
||||
|
||||
## Outputs
|
||||
|
||||
| Name | Description |
|
||||
| ------------------ | ---------------------------------------------------- |
|
||||
| `cache-hit` | Whether models were restored from cache (true/false) |
|
||||
| `cache-dir` | Path to the PaddleOCR model cache directory |
|
||||
| `models-available` | Comma-separated list of available models after setup |
|
||||
|
||||
## Outputs as Environment Variables
|
||||
|
||||
The action automatically exports:
|
||||
|
||||
- `PADDLE_OCR_MODEL_CACHE`: Absolute path to model cache directory
|
||||
|
||||
## Cache Strategy
|
||||
|
||||
Models are cached using GitHub Actions cache with the following key structure:
|
||||
|
||||
```text
|
||||
paddle-ocr-v5-onnx-{OS}-{ARCHITECTURE}-v4
|
||||
```
|
||||
|
||||
Cache restoration order (the exact `key` is tried first, then each `restore-keys` prefix in turn):
|
||||
|
||||
1. Exact match: `paddle-ocr-v5-onnx-{OS}-{ARCHITECTURE}-v4`
|
||||
2. OS-Architecture: `paddle-ocr-v5-onnx-{OS}-{ARCHITECTURE}-`
|
||||
3. OS only: `paddle-ocr-v5-onnx-{OS}-`
|
||||
4. Any: `paddle-ocr-v5-onnx-`
|
||||
|
||||
## Example: CI Rust Workflow Integration
|
||||
|
||||
```yaml
|
||||
jobs:
|
||||
paddle-ocr-tests:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- uses: ./.github/actions/setup-paddle-ocr-models
|
||||
id: paddle-models
|
||||
|
||||
- name: Run PaddleOCR tests
|
||||
run: cargo test --package kreuzberg-paddle-ocr
|
||||
env:
|
||||
PADDLE_OCR_MODEL_CACHE: ${{ steps.paddle-models.outputs.cache-dir }}
|
||||
|
||||
- name: Report cache status
|
||||
if: always()
|
||||
run: |
|
||||
echo "Cache hit: ${{ steps.paddle-models.outputs.cache-hit }}"
|
||||
echo "Available models: ${{ steps.paddle-models.outputs.models-available }}"
|
||||
```
|
||||
|
||||
## Error Handling
|
||||
|
||||
The action downloads models sequentially and will fail if a required model download fails. After downloading:
|
||||
|
||||
- The verify step reports which models are actually available in the output
|
||||
- Downstream tests can check `models-available` to know what's available
|
||||
- If all models fail, tests can fall back to alternative behavior
|
||||
|
||||
## Download Sources
|
||||
|
||||
Models are downloaded from:
|
||||
|
||||
```text
|
||||
https://huggingface.co/Kreuzberg/paddleocr-onnx-models/resolve/main/
|
||||
```
|
||||
|
||||
If this repository becomes unavailable, the action will fail gracefully. Alternative sources can be configured by modifying the `MODEL_URL` environment variables in the action.
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### Models not being cached
|
||||
|
||||
1. Check that `cache-enabled` is not set to `false`
|
||||
2. Verify GitHub Actions cache is not full (max 10 GB per repository)
|
||||
3. Check runner OS and architecture match cache keys
|
||||
4. View cache in repository settings (Settings → Actions → Caches)
|
||||
|
||||
### Download timeouts
|
||||
|
||||
If downloads timeout:
|
||||
|
||||
- Increase the 300-second timeout in the action steps
|
||||
- Check Hugging Face API availability
|
||||
- Try reducing the number of models (`models: "det,rec"`)
|
||||
|
||||
### Verifying models are present
|
||||
|
||||
Check that all expected models exist in the correct directory structure:
|
||||
|
||||
```bash
|
||||
ls -lh ~/.cache/kreuzberg/paddle-ocr/
|
||||
```
|
||||
|
||||
Expected output:
|
||||
|
||||
```text
|
||||
drwxr-xr-x det/
|
||||
drwxr-xr-x cls/
|
||||
drwxr-xr-x rec/
|
||||
|
||||
ls -lh ~/.cache/kreuzberg/paddle-ocr/det/
|
||||
-rw-r--r-- model.onnx (~84 MB)
|
||||
|
||||
ls -lh ~/.cache/kreuzberg/paddle-ocr/cls/
|
||||
-rw-r--r-- model.onnx (~0.6 MB)
|
||||
|
||||
ls -lh ~/.cache/kreuzberg/paddle-ocr/rec/english/
|
||||
-rw-r--r-- model.onnx (~8 MB)
|
||||
-rw-r--r-- dict.txt
|
||||
```
|
||||
|
||||
The directory structure must match what `ModelManager` expects in `model_manager.rs`.
|
||||
|
||||
## Performance Impact
|
||||
|
||||
- **First run (no cache)**: ~30-60 seconds (download time depends on network)
|
||||
- **Cached run**: <1 second (cache restore)
|
||||
- **Cache size**: ~93 MB per OS/architecture
|
||||
- **Network bandwidth**: ~93 MB download on cache miss
|
||||
|
||||
## Related Actions
|
||||
|
||||
- `.github/actions/setup-tesseract-cache` - Similar caching for Tesseract models
|
||||
- `.github/actions/cache-hf-fastembed` - Hugging Face model caching for fastembed
|
||||
- `.github/actions/setup-onnx-runtime` - ONNX Runtime setup for inference
|
||||
|
||||
## See Also
|
||||
|
||||
- [PaddleOCR Documentation](https://github.com/PaddlePaddle/PaddleOCR)
|
||||
- [kreuzberg-paddle-ocr crate](../../../crates/kreuzberg-paddle-ocr)
|
||||
- [ModelManager source](../../../crates/kreuzberg/src/paddle_ocr/model_manager.rs)
|
||||
231
.github/actions/setup-paddle-ocr-models/action.yml
vendored
Normal file
231
.github/actions/setup-paddle-ocr-models/action.yml
vendored
Normal file
@@ -0,0 +1,231 @@
|
||||
name: Setup PaddleOCR Models Cache
|
||||
description: Download and cache PaddleOCR ONNX models for CI testing
|
||||
|
||||
inputs:
|
||||
cache-enabled:
|
||||
description: Enable model caching (set to false for cross-arch builds)
|
||||
required: false
|
||||
default: "true"
|
||||
models:
|
||||
description: Comma-separated list of models to setup (det,cls,rec or specific subset)
|
||||
required: false
|
||||
default: "det,cls,rec"
|
||||
cache-key-suffix:
|
||||
description: Suffix for cache key to differentiate model sets
|
||||
required: false
|
||||
default: "paddle-ocr-v5-onnx"
|
||||
|
||||
outputs:
|
||||
cache-hit:
|
||||
description: Whether models were restored from cache (true/false)
|
||||
value: ${{ steps.cache-models.outputs.cache-hit }}
|
||||
cache-dir:
|
||||
description: Path to the PaddleOCR model cache directory
|
||||
value: ${{ steps.set-outputs.outputs.cache-dir }}
|
||||
models-available:
|
||||
description: Comma-separated list of available models
|
||||
value: ${{ steps.verify-models.outputs.available-models }}
|
||||
|
||||
runs:
|
||||
using: composite
|
||||
steps:
|
||||
# Create the cache directory and derive a cache-key-safe token from the
# requested model list (cache keys must not contain commas).
- name: Setup cache directory
  id: cache-setup
  shell: bash
  env:
    REQUESTED_MODELS: ${{ inputs.models }}
  run: |
    mkdir -p ~/.cache/kreuzberg/paddle-ocr
    echo "Cache directory: $HOME/.cache/kreuzberg/paddle-ocr"
    # "det,cls,rec" -> "det-cls-rec"; any whitespace is dropped.
    models_key="$(printf '%s' "$REQUESTED_MODELS" | tr -d '[:space:]' | tr ',' '-')"
    echo "models-key=$models_key" >> "$GITHUB_OUTPUT"

# The primary key includes the requested model set. Without it, a cache
# saved for a subset (e.g. only "det") is reported as an exact hit for a job
# that asks for more models, and every download step is skipped because
# they all run only when cache-hit != 'true'. A restore-keys match still
# restores whatever is available but leaves cache-hit != 'true', so the
# requested models are downloaded again.
- name: Restore PaddleOCR models from cache
  if: inputs.cache-enabled == 'true'
  uses: actions/cache@v5
  id: cache-models
  with:
    path: ~/.cache/kreuzberg/paddle-ocr
    key: ${{ inputs.cache-key-suffix }}-${{ runner.os }}-${{ runner.arch }}-v4-${{ steps.cache-setup.outputs.models-key }}
    restore-keys: |
      ${{ inputs.cache-key-suffix }}-${{ runner.os }}-${{ runner.arch }}-
      ${{ inputs.cache-key-suffix }}-${{ runner.os }}-
      ${{ inputs.cache-key-suffix }}-
|
||||
|
||||
- name: Download detection model (det)
|
||||
if: contains(inputs.models, 'det') && steps.cache-models.outputs.cache-hit != 'true'
|
||||
shell: bash
|
||||
run: |
|
||||
MODEL_URL="https://huggingface.co/Kreuzberg/paddleocr-onnx-models/resolve/main/PP-OCRv5_server_det_infer.onnx"
|
||||
CACHE_DIR="$HOME/.cache/kreuzberg/paddle-ocr"
|
||||
MODEL_DIR="$CACHE_DIR/det"
|
||||
MODEL_FILE="$MODEL_DIR/model.onnx"
|
||||
|
||||
echo "Downloading detection model from $MODEL_URL"
|
||||
mkdir -p "$MODEL_DIR"
|
||||
|
||||
for attempt in 1 2 3; do
|
||||
if [ $attempt -gt 1 ]; then
|
||||
backoff=$((5 * 3 ** (attempt - 2)))
|
||||
echo "Retry attempt $attempt/3 after ${backoff}s backoff..."
|
||||
sleep $backoff
|
||||
fi
|
||||
|
||||
if curl -f -L --progress-bar --connect-timeout 30 --max-time 600 \
|
||||
-o "$MODEL_FILE" "$MODEL_URL"; then
|
||||
echo "Detection model downloaded successfully ($(du -h "$MODEL_FILE" | cut -f1))"
|
||||
exit 0
|
||||
fi
|
||||
done
|
||||
|
||||
echo "ERROR: Failed to download detection model after 3 attempts"
|
||||
rm -f "$MODEL_FILE"
|
||||
exit 1
|
||||
|
||||
- name: Download classification model (cls)
|
||||
if: contains(inputs.models, 'cls') && steps.cache-models.outputs.cache-hit != 'true'
|
||||
shell: bash
|
||||
run: |
|
||||
MODEL_URL="https://huggingface.co/Kreuzberg/paddleocr-onnx-models/resolve/main/ch_ppocr_mobile_v2.0_cls_infer.onnx"
|
||||
CACHE_DIR="$HOME/.cache/kreuzberg/paddle-ocr"
|
||||
MODEL_DIR="$CACHE_DIR/cls"
|
||||
MODEL_FILE="$MODEL_DIR/model.onnx"
|
||||
|
||||
echo "Downloading classification model from $MODEL_URL"
|
||||
mkdir -p "$MODEL_DIR"
|
||||
|
||||
for attempt in 1 2 3; do
|
||||
if [ $attempt -gt 1 ]; then
|
||||
backoff=$((5 * 3 ** (attempt - 2)))
|
||||
echo "Retry attempt $attempt/3 after ${backoff}s backoff..."
|
||||
sleep $backoff
|
||||
fi
|
||||
|
||||
if curl -f -L --progress-bar --connect-timeout 30 --max-time 600 \
|
||||
-o "$MODEL_FILE" "$MODEL_URL"; then
|
||||
echo "Classification model downloaded successfully ($(du -h "$MODEL_FILE" | cut -f1))"
|
||||
exit 0
|
||||
fi
|
||||
done
|
||||
|
||||
echo "ERROR: Failed to download classification model after 3 attempts"
|
||||
rm -f "$MODEL_FILE"
|
||||
exit 1
|
||||
|
||||
- name: Download recognition model (rec/english)
|
||||
if: contains(inputs.models, 'rec') && steps.cache-models.outputs.cache-hit != 'true'
|
||||
shell: bash
|
||||
run: |
|
||||
MODEL_URL="https://huggingface.co/Kreuzberg/paddleocr-onnx-models/resolve/main/rec/english/model.onnx"
|
||||
CACHE_DIR="$HOME/.cache/kreuzberg/paddle-ocr"
|
||||
MODEL_DIR="$CACHE_DIR/rec/english"
|
||||
MODEL_FILE="$MODEL_DIR/model.onnx"
|
||||
|
||||
echo "Downloading English recognition model from $MODEL_URL"
|
||||
mkdir -p "$MODEL_DIR"
|
||||
|
||||
for attempt in 1 2 3; do
|
||||
if [ $attempt -gt 1 ]; then
|
||||
backoff=$((5 * 3 ** (attempt - 2)))
|
||||
echo "Retry attempt $attempt/3 after ${backoff}s backoff..."
|
||||
sleep $backoff
|
||||
fi
|
||||
|
||||
if curl -f -L --progress-bar --connect-timeout 30 --max-time 600 \
|
||||
-o "$MODEL_FILE" "$MODEL_URL"; then
|
||||
echo "Recognition model downloaded successfully ($(du -h "$MODEL_FILE" | cut -f1))"
|
||||
exit 0
|
||||
fi
|
||||
done
|
||||
|
||||
echo "ERROR: Failed to download recognition model after 3 attempts"
|
||||
rm -f "$MODEL_FILE"
|
||||
exit 1
|
||||
|
||||
- name: Download recognition dictionary (rec/english/dict.txt)
|
||||
if: contains(inputs.models, 'rec') && steps.cache-models.outputs.cache-hit != 'true'
|
||||
shell: bash
|
||||
run: |
|
||||
DICT_URL="https://huggingface.co/Kreuzberg/paddleocr-onnx-models/resolve/main/rec/english/dict.txt"
|
||||
CACHE_DIR="$HOME/.cache/kreuzberg/paddle-ocr"
|
||||
MODEL_DIR="$CACHE_DIR/rec/english"
|
||||
DICT_FILE="$MODEL_DIR/dict.txt"
|
||||
|
||||
echo "Downloading English recognition dictionary from $DICT_URL"
|
||||
mkdir -p "$MODEL_DIR"
|
||||
|
||||
for attempt in 1 2 3; do
|
||||
if [ $attempt -gt 1 ]; then
|
||||
backoff=$((5 * 3 ** (attempt - 2)))
|
||||
echo "Retry attempt $attempt/3 after ${backoff}s backoff..."
|
||||
sleep $backoff
|
||||
fi
|
||||
|
||||
if curl -f -L --progress-bar --connect-timeout 30 --max-time 600 \
|
||||
-o "$DICT_FILE" "$DICT_URL"; then
|
||||
echo "Dictionary downloaded successfully ($(du -h "$DICT_FILE" | cut -f1))"
|
||||
exit 0
|
||||
fi
|
||||
done
|
||||
|
||||
echo "ERROR: Failed to download dictionary after 3 attempts"
|
||||
rm -f "$DICT_FILE"
|
||||
exit 1
|
||||
|
||||
# Inventory the model files on disk, print their sizes, and publish the
# comma-separated list of usable models as the `available-models` step
# output. Fails the step when no model file is present at all.
- name: Verify downloaded models
  id: verify-models
  shell: bash
  run: |
    CACHE_DIR="$HOME/.cache/kreuzberg/paddle-ocr"
    AVAILABLE_MODELS=()
    TOTAL_SIZE=0

    # Print a byte count in human-readable form; fall back to raw bytes on
    # systems where numfmt is unavailable or fails.
    human_size() {
      numfmt --to=iec-i --suffix=B "$1" 2>/dev/null || echo "$1 bytes"
    }

    # record_file <path> <label> [model-id]
    # Adds a non-empty file to the running size total and, when a model id
    # is given, to AVAILABLE_MODELS. Missing or zero-byte files are skipped
    # so that an empty file is never reported as an available model.
    record_file() {
      local path="$1" label="$2" model_id="${3:-}" size
      [ -s "$path" ] || return 0
      size=$(wc -c < "$path" | tr -d ' ')
      TOTAL_SIZE=$((TOTAL_SIZE + size))
      if [ -n "$model_id" ]; then
        AVAILABLE_MODELS+=("$model_id")
      fi
      echo " ✓ $label: $(human_size "$size")"
    }

    echo "Checking for PaddleOCR models in $CACHE_DIR"

    record_file "$CACHE_DIR/det/model.onnx" "Detection model" det
    record_file "$CACHE_DIR/cls/model.onnx" "Classification model" cls
    record_file "$CACHE_DIR/rec/english/model.onnx" "Recognition model (English)" rec
    # The dictionary contributes to the size total but is not a model id.
    record_file "$CACHE_DIR/rec/english/dict.txt" "Recognition dictionary (English)"

    if [ ${#AVAILABLE_MODELS[@]} -eq 0 ]; then
      echo "ERROR: No models found in cache directory after download"
      echo "available-models=" >> "$GITHUB_OUTPUT"
      exit 1
    fi

    # Join the array with commas (IFS is changed only inside the subshell).
    AVAILABLE_MODELS_STR=$(IFS=, ; echo "${AVAILABLE_MODELS[*]}")
    echo "✓ Total cached models: ${#AVAILABLE_MODELS[@]} ($(human_size "$TOTAL_SIZE"))"
    echo "available-models=$AVAILABLE_MODELS_STR" >> "$GITHUB_OUTPUT"
|
||||
|
||||
# Publish the cache location as the `cache-dir` step output (surfaced by the
# action's `cache-dir` output) and export it to later steps of the calling
# job through GITHUB_ENV.
- name: Set cache directory output
  id: set-outputs
  shell: bash
  run: |
    CACHE_DIR="$HOME/.cache/kreuzberg/paddle-ocr"
    echo "cache-dir=$CACHE_DIR" >> "$GITHUB_OUTPUT"
    echo "PADDLE_OCR_MODEL_CACHE=$CACHE_DIR" >> "$GITHUB_ENV"
    echo "KREUZBERG_CACHE_DIR=$HOME/.cache/kreuzberg" >> "$GITHUB_ENV"

# PADDLE_OCR_MODEL_CACHE and KREUZBERG_CACHE_DIR are already written to
# GITHUB_ENV by the step above; writing them a second time added nothing,
# so this step only logs the configured location.
- name: Export cache environment
  shell: bash
  run: |
    echo "PaddleOCR model cache configured at: $HOME/.cache/kreuzberg/paddle-ocr"
|
||||
60
.github/actions/setup-tesseract-cache/action.yml
vendored
Normal file
60
.github/actions/setup-tesseract-cache/action.yml
vendored
Normal file
@@ -0,0 +1,60 @@
|
||||
name: Setup Tesseract Cache
|
||||
description: Manages kreuzberg-tesseract build cache per architecture
|
||||
|
||||
inputs:
|
||||
label:
|
||||
description: Platform label (e.g. linux-x86_64, linux-aarch64)
|
||||
required: true
|
||||
enable-cache:
|
||||
description: Enable tesseract caching (disable for cross-arch builds)
|
||||
required: false
|
||||
default: "true"
|
||||
rust-target:
|
||||
description: Rust target triple for per-target cache cleanup
|
||||
required: false
|
||||
default: ""
|
||||
|
||||
outputs:
|
||||
cache-dir:
|
||||
description: Tesseract cache directory path
|
||||
value: ${{ steps.set-outputs.outputs.cache-dir }}
|
||||
cache-enabled:
|
||||
description: Whether caching is enabled (true/false)
|
||||
value: ${{ steps.set-outputs.outputs.cache-enabled }}
|
||||
docker-options:
|
||||
description: Docker options for passing cache env vars
|
||||
value: ${{ steps.set-outputs.outputs.docker-options }}
|
||||
|
||||
runs:
|
||||
using: composite
|
||||
steps:
|
||||
- name: Clean cache directories (cache disabled)
|
||||
if: inputs.enable-cache != 'true'
|
||||
shell: bash
|
||||
run: scripts/ci/actions/setup-tesseract-cache/clean-dirs.sh "${{ inputs.label }}"
|
||||
|
||||
- name: Setup cache directories
|
||||
if: inputs.enable-cache == 'true'
|
||||
shell: bash
|
||||
run: scripts/ci/actions/setup-tesseract-cache/setup-dirs.sh "${{ inputs.label }}"
|
||||
|
||||
- name: Cache kreuzberg-tesseract build cache
|
||||
if: inputs.enable-cache == 'true'
|
||||
uses: actions/cache@v5
|
||||
with:
|
||||
path: |
|
||||
.tesseract-cache/${{ inputs.label }}
|
||||
.xdg-cache/${{ inputs.label }}
|
||||
key: kreuzberg-tesseract-cache-v2-${{ runner.os }}-${{ inputs.label }}-${{ hashFiles('crates/kreuzberg-tesseract/Cargo.toml', 'crates/kreuzberg-tesseract/build.rs') }}
|
||||
restore-keys: |
|
||||
kreuzberg-tesseract-cache-v2-${{ runner.os }}-${{ inputs.label }}-
|
||||
|
||||
- name: Clean per-target Tesseract cache
|
||||
if: inputs.rust-target != ''
|
||||
shell: bash
|
||||
run: scripts/ci/actions/setup-tesseract-cache/clean-target-cache.sh "${{ inputs.rust-target }}"
|
||||
|
||||
- name: Set outputs and environment
|
||||
id: set-outputs
|
||||
shell: bash
|
||||
run: scripts/ci/actions/setup-tesseract-cache/set-outputs.sh "${{ inputs.label }}" "${{ inputs.enable-cache }}"
|
||||
67
.github/dependabot.yaml
vendored
Normal file
67
.github/dependabot.yaml
vendored
Normal file
@@ -0,0 +1,67 @@
|
||||
version: 2
|
||||
|
||||
multi-ecosystem-groups:
|
||||
dependencies:
|
||||
schedule:
|
||||
interval: "weekly"
|
||||
|
||||
updates:
|
||||
- package-ecosystem: "cargo"
|
||||
# Explicitly list root only — packages/ruby/ext and packages/r/src have
|
||||
# standalone workspaces with path deps to vendored crates that only exist
|
||||
# at build time. Dependabot cannot resolve these paths.
|
||||
directories:
|
||||
- "/"
|
||||
ignore:
|
||||
- dependency-name: "kreuzberg"
|
||||
- dependency-name: "kreuzberg-ffi"
|
||||
patterns: ["*"]
|
||||
multi-ecosystem-group: "dependencies"
|
||||
|
||||
- package-ecosystem: "pip"
|
||||
directories:
|
||||
- "/"
|
||||
- "/packages/python"
|
||||
patterns: ["*"]
|
||||
multi-ecosystem-group: "dependencies"
|
||||
|
||||
- package-ecosystem: "npm"
|
||||
directories:
|
||||
- "/"
|
||||
- "/crates/kreuzberg-node"
|
||||
- "/crates/kreuzberg-wasm"
|
||||
- "/packages/typescript/core"
|
||||
patterns: ["*"]
|
||||
multi-ecosystem-group: "dependencies"
|
||||
|
||||
- package-ecosystem: "bundler"
|
||||
directory: "/packages/ruby"
|
||||
patterns: ["*"]
|
||||
multi-ecosystem-group: "dependencies"
|
||||
|
||||
- package-ecosystem: "composer"
|
||||
directories:
|
||||
- "/"
|
||||
- "/packages/php"
|
||||
patterns: ["*"]
|
||||
multi-ecosystem-group: "dependencies"
|
||||
|
||||
- package-ecosystem: "gomod"
|
||||
directory: "/packages/go/v5"
|
||||
patterns: ["*"]
|
||||
multi-ecosystem-group: "dependencies"
|
||||
|
||||
- package-ecosystem: "maven"
|
||||
directory: "/packages/java"
|
||||
patterns: ["*"]
|
||||
multi-ecosystem-group: "dependencies"
|
||||
|
||||
- package-ecosystem: "nuget"
|
||||
directory: "/packages/csharp"
|
||||
patterns: ["*"]
|
||||
multi-ecosystem-group: "dependencies"
|
||||
|
||||
- package-ecosystem: "mix"
|
||||
directory: "/packages/elixir"
|
||||
patterns: ["*"]
|
||||
multi-ecosystem-group: "dependencies"
|
||||
39
.github/documentation/runners.md
vendored
Normal file
39
.github/documentation/runners.md
vendored
Normal file
@@ -0,0 +1,39 @@
|
||||
# Custom GitHub Actions Runners
|
||||
|
||||
## Available Runners
|
||||
|
||||
| Runner Label | Architecture | Size | Ephemeral | Notes |
|
||||
| -------------------------- | ------------ | ------ | --------- | ---------------------------------------------------------- |
|
||||
| `runner-small` | x86_64 | Small | No | Light tasks: linting, formatting, validation |
|
||||
| `runner-medium` | x86_64 | Medium | No | Standard CI: tests, builds |
|
||||
| `runner-medium-arm64` | arm64 | Medium | No | ARM64 builds and tests |
|
||||
| `runner-large` | x86_64 | Large | No | Heavy workloads: benchmarks, coverage, release builds |
|
||||
| `runner-large-spot` | x86_64 | Large | Yes | Cost-optimized large jobs where interruption is acceptable |
|
||||
| `runner-medium-arm64-spot` | arm64 | Medium | Yes | Cost-optimized ARM64 jobs where interruption is acceptable |
|
||||
|
||||
## Spot Runners
|
||||
|
||||
Spot runners (`*-spot`) use ephemeral cloud instances provisioned on a best-effort basis. They are significantly cheaper but can be preempted at any time if the cloud provider reclaims capacity.
|
||||
|
||||
**Use spot runners for:**
|
||||
|
||||
- Jobs that can be retried without consequence (test suites, linting)
|
||||
- Non-time-critical workloads
|
||||
- PR validation where re-runs are acceptable
|
||||
|
||||
**Do not use spot runners for:**
|
||||
|
||||
- Benchmarks (preemption and noisy-neighbor effects skew results)
|
||||
- Release builds and publishing
|
||||
- Jobs requiring consistent, reproducible timing
|
||||
|
||||
## Choosing a Runner
|
||||
|
||||
| Workload | Recommended Runner |
|
||||
| ------------------------------- | -------------------------- |
|
||||
| Linting, formatting, validation | `runner-small` |
|
||||
| Unit tests, standard builds | `runner-medium` |
|
||||
| ARM64 cross-compilation / tests | `runner-medium-arm64` |
|
||||
| Benchmarks, coverage reports | `runner-large` |
|
||||
| Non-critical large builds | `runner-large-spot` |
|
||||
| Non-critical ARM64 builds | `runner-medium-arm64-spot` |
|
||||
1244
.github/workflows/benchmarks.yaml
vendored
Normal file
1244
.github/workflows/benchmarks.yaml
vendored
Normal file
File diff suppressed because it is too large
Load Diff
74
.github/workflows/build-node-native.yml
vendored
Normal file
74
.github/workflows/build-node-native.yml
vendored
Normal file
@@ -0,0 +1,74 @@
|
||||
name: Build Node Native
|
||||
|
||||
on:
|
||||
push:
|
||||
branches: [main]
|
||||
paths:
|
||||
- "crates/kreuzberg-node/**"
|
||||
- "crates/kreuzberg/**"
|
||||
- "Cargo.toml"
|
||||
- "Cargo.lock"
|
||||
- "rust-toolchain.toml"
|
||||
- ".github/workflows/build-node-native.yml"
|
||||
pull_request:
|
||||
branches: [main]
|
||||
paths:
|
||||
- "crates/kreuzberg-node/**"
|
||||
- "crates/kreuzberg/**"
|
||||
- "Cargo.toml"
|
||||
- "Cargo.lock"
|
||||
- "rust-toolchain.toml"
|
||||
- ".github/workflows/build-node-native.yml"
|
||||
workflow_dispatch:
|
||||
|
||||
concurrency:
|
||||
group: build-node-native-${{ github.ref }}
|
||||
cancel-in-progress: true
|
||||
|
||||
env:
|
||||
CARGO_TERM_COLOR: always
|
||||
CARGO_INCREMENTAL: 0
|
||||
MACOSX_DEPLOYMENT_TARGET: "14.0"
|
||||
FORCE_JAVASCRIPT_ACTIONS_TO_NODE24: "true"
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
|
||||
jobs:
|
||||
# Builds the Node NAPI binding once per target triple on a runner of the
# matching OS/architecture.
build:
  name: Build ${{ matrix.target }}
  runs-on: ${{ matrix.os }}
  timeout-minutes: 60
  strategy:
    fail-fast: false
    matrix:
      include:
        - os: ubuntu-latest
          target: x86_64-unknown-linux-gnu
        - os: ubuntu-24.04-arm
          target: aarch64-unknown-linux-gnu
        # NOTE(review): the macos-13 image has been retired from GitHub-hosted
        # runners; macos-15-intel is the x86_64 macOS label. Confirm against
        # the current runner list before merging.
        - os: macos-15-intel
          target: x86_64-apple-darwin
        - os: macos-latest
          target: aarch64-apple-darwin
        - os: windows-latest
          target: x86_64-pc-windows-msvc

  steps:
    - uses: actions/checkout@v6.0.2
      with:
        submodules: recursive

    - uses: kreuzberg-dev/actions/setup-rust@v1
      with:
        target: ${{ matrix.target }}

    - uses: kreuzberg-dev/actions/setup-node-workspace@v1
      with:
        node-version: "24"

    - name: Build NAPI binding
      uses: kreuzberg-dev/actions/build-node-napi@v1
      with:
        crate-dir: crates/kreuzberg-node
        build-command: pnpm exec napi build --release --target ${{ matrix.target }} --platform
|
||||
79
.github/workflows/ci-docker.yaml
vendored
Normal file
79
.github/workflows/ci-docker.yaml
vendored
Normal file
@@ -0,0 +1,79 @@
|
||||
name: CI Docker
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
|
||||
concurrency:
|
||||
group: ci-docker-${{ github.ref }}
|
||||
cancel-in-progress: true
|
||||
|
||||
env:
|
||||
ORT_VERSION: "1.24.2"
|
||||
FORCE_JAVASCRIPT_ACTIONS_TO_NODE24: "true"
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
|
||||
jobs:
|
||||
docker:
|
||||
name: Docker (${{ matrix.variant }})
|
||||
if: github.repository == 'kreuzberg-dev/kreuzberg' && github.actor != 'dependabot[bot]'
|
||||
runs-on: ubuntu-latest
|
||||
timeout-minutes: 60
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
variant: [core, full, cli]
|
||||
steps:
|
||||
- uses: actions/checkout@v6
|
||||
|
||||
- name: Free disk space
|
||||
uses: kreuzberg-dev/actions/free-disk-space-linux@v1
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@v4
|
||||
|
||||
- name: Build Docker image
|
||||
uses: docker/build-push-action@v7
|
||||
with:
|
||||
context: .
|
||||
file: docker/Dockerfile.${{ matrix.variant }}
|
||||
push: false
|
||||
load: true
|
||||
tags: kreuzberg:${{ matrix.variant }}
|
||||
build-args: ONNXRUNTIME_VERSION=${{ env.ORT_VERSION }}
|
||||
cache-from: type=gha,scope=ci-docker-${{ matrix.variant }}
|
||||
cache-to: type=gha,mode=max,scope=ci-docker-${{ matrix.variant }}
|
||||
|
||||
- name: Save Docker image
|
||||
shell: bash
|
||||
run: |
|
||||
mkdir -p /tmp
|
||||
docker save kreuzberg:${{ matrix.variant }} | gzip > /tmp/kreuzberg-${{ matrix.variant }}.tar.gz
|
||||
ls -lh /tmp/kreuzberg-${{ matrix.variant }}.tar.gz
|
||||
|
||||
- name: Check image size
|
||||
uses: kreuzberg-dev/actions/check-docker-image-size@v1
|
||||
with:
|
||||
image: kreuzberg:${{ matrix.variant }}
|
||||
warn-mb: ${{ matrix.variant == 'cli' && '200' || '' }}
|
||||
label: "${{ matrix.variant }} image"
|
||||
|
||||
- name: Run feature tests
|
||||
if: matrix.variant != 'cli'
|
||||
run: scripts/ci/docker/run-feature-tests.sh "${{ matrix.variant }}"
|
||||
|
||||
- name: Run configuration tests
|
||||
if: matrix.variant != 'cli'
|
||||
run: scripts/ci/docker/run-config-tests.sh "${{ matrix.variant }}"
|
||||
|
||||
- name: Run API contract tests with schemathesis
|
||||
if: matrix.variant != 'cli'
|
||||
uses: kreuzberg-dev/actions/run-api-contract-tests@v1
|
||||
with:
|
||||
image: kreuzberg:${{ matrix.variant }}
|
||||
port: "8000"
|
||||
|
||||
- name: Run CLI tests
|
||||
if: matrix.variant == 'cli'
|
||||
run: scripts/ci/docker/run-cli-tests.sh
|
||||
102
.github/workflows/ci-docs.yaml
vendored
Normal file
102
.github/workflows/ci-docs.yaml
vendored
Normal file
@@ -0,0 +1,102 @@
|
||||
name: CI Docs
|
||||
|
||||
on:
|
||||
pull_request:
|
||||
paths:
|
||||
- "docs/**"
|
||||
- "packages/**/README.md"
|
||||
- "crates/*/README.md"
|
||||
- "packages/python/pyproject.toml"
|
||||
- "packages/typescript/package.json"
|
||||
- "packages/ruby/kreuzberg.gemspec"
|
||||
- "packages/php/composer.json"
|
||||
- "packages/go/v5/go.mod"
|
||||
- "packages/java/pom.xml"
|
||||
- "packages/csharp/**/Kreuzberg.csproj"
|
||||
- "packages/elixir/mix.exs"
|
||||
- "packages/r/DESCRIPTION"
|
||||
- "packages/dart/pubspec.yaml"
|
||||
- "zensical.toml"
|
||||
- "mkdocs.yml"
|
||||
- "alef.toml"
|
||||
- ".github/workflows/ci-docs.yaml"
|
||||
push:
|
||||
branches: [main]
|
||||
paths:
|
||||
- "docs/**"
|
||||
- "packages/**/README.md"
|
||||
- "crates/*/README.md"
|
||||
- "zensical.toml"
|
||||
- "pyproject.toml"
|
||||
- "alef.toml"
|
||||
- "CHANGELOG.md"
|
||||
- ".github/workflows/ci-docs.yaml"
|
||||
workflow_dispatch:
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
pages: write
|
||||
id-token: write
|
||||
|
||||
concurrency:
|
||||
group: ci-docs-${{ github.event.pull_request.number || github.ref }}
|
||||
cancel-in-progress: false
|
||||
|
||||
jobs:
|
||||
lint:
|
||||
name: Lint
|
||||
runs-on: ubuntu-latest
|
||||
timeout-minutes: 30
|
||||
steps:
|
||||
- uses: actions/checkout@v6
|
||||
|
||||
- name: Lint documentation + validate snippets
|
||||
uses: kreuzberg-dev/actions/lint-docs@v1
|
||||
with:
|
||||
working-directory: .
|
||||
strict: "true"
|
||||
validate-snippets: "true"
|
||||
alef-ref: v0.19.5
|
||||
|
||||
build:
|
||||
name: Build
|
||||
runs-on: ubuntu-latest
|
||||
timeout-minutes: 10
|
||||
steps:
|
||||
- uses: actions/checkout@v6
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Build documentation
|
||||
uses: kreuzberg-dev/actions/build-docs@v1
|
||||
with:
|
||||
working-directory: .
|
||||
strict: "true"
|
||||
|
||||
- name: Upload site artifact
|
||||
uses: actions/upload-artifact@v7
|
||||
with:
|
||||
name: docs-site
|
||||
path: site/
|
||||
retention-days: 1
|
||||
|
||||
# Publishes the site produced by the `build` job to GitHub Pages. Runs only
# for pushes to main, after both `build` and `lint` have succeeded.
deploy:
  name: Deploy
  needs: [build, lint]
  if: github.ref == 'refs/heads/main' && github.event_name == 'push'
  runs-on: ubuntu-latest
  # actions/deploy-pages deploys into an environment, so the job declares
  # the default `github-pages` one. The URL is taken from the deploy step,
  # which is what its `id: deployment` is for.
  environment:
    name: github-pages
    url: ${{ steps.deployment.outputs.page_url }}
  steps:
    - name: Download site artifact
      uses: actions/download-artifact@v8
      with:
        name: docs-site
        path: site/

    # Re-package the downloaded site in the format deploy-pages consumes.
    - name: Upload Pages artifact
      uses: actions/upload-pages-artifact@v5
      with:
        path: site

    - name: Deploy to GitHub Pages
      id: deployment
      uses: actions/deploy-pages@v5
|
||||
345
.github/workflows/ci-e2e.yaml
vendored
Normal file
345
.github/workflows/ci-e2e.yaml
vendored
Normal file
@@ -0,0 +1,345 @@
|
||||
name: CI E2E
|
||||
|
||||
on:
|
||||
push:
|
||||
branches: [main]
|
||||
paths:
|
||||
- "crates/**"
|
||||
- "packages/**"
|
||||
- "e2e/**"
|
||||
- "fixtures/**"
|
||||
- "alef.toml"
|
||||
- "Cargo.toml"
|
||||
- "Cargo.lock"
|
||||
- "Taskfile.yml"
|
||||
- ".github/workflows/ci-e2e.yaml"
|
||||
pull_request:
|
||||
branches: [main]
|
||||
paths:
|
||||
- "crates/**"
|
||||
- "packages/**"
|
||||
- "e2e/**"
|
||||
- "fixtures/**"
|
||||
- "alef.toml"
|
||||
- "Cargo.toml"
|
||||
- "Cargo.lock"
|
||||
- "Taskfile.yml"
|
||||
- ".github/workflows/ci-e2e.yaml"
|
||||
workflow_dispatch:
|
||||
|
||||
concurrency:
|
||||
group: ci-e2e-${{ github.ref }}
|
||||
cancel-in-progress: true
|
||||
|
||||
env:
|
||||
CARGO_TERM_COLOR: always
|
||||
CARGO_INCREMENTAL: 0
|
||||
CARGO_PROFILE_DEV_DEBUG: 0
|
||||
RUST_BACKTRACE: short
|
||||
RUST_MIN_STACK: 16777216
|
||||
ORT_VERSION: "1.24.2"
|
||||
MACOSX_DEPLOYMENT_TARGET: "14.0"
|
||||
BUILD_PROFILE: "ci"
|
||||
FORCE_JAVASCRIPT_ACTIONS_TO_NODE24: "true"
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
|
||||
jobs:
|
||||
build-ffi:
|
||||
name: Build FFI (${{ matrix.target }})
|
||||
if: github.repository == 'kreuzberg-dev/kreuzberg' && github.actor != 'dependabot[bot]'
|
||||
runs-on: ${{ matrix.os }}
|
||||
timeout-minutes: ${{ matrix.os == 'windows-latest' && 120 || 60 }}
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
include:
|
||||
- os: ubuntu-24.04-arm
|
||||
target: aarch64-unknown-linux-gnu
|
||||
- os: ubuntu-latest
|
||||
target: x86_64-unknown-linux-gnu
|
||||
- os: macos-latest
|
||||
target: aarch64-apple-darwin
|
||||
- os: windows-latest
|
||||
target: x86_64-pc-windows-msvc
|
||||
steps:
|
||||
- uses: actions/checkout@v6
|
||||
with:
|
||||
submodules: recursive
|
||||
|
||||
- name: Setup Rust
|
||||
uses: kreuzberg-dev/actions/setup-rust@v1
|
||||
with:
|
||||
cache-key-prefix: build-ffi-${{ matrix.target }}
|
||||
target: ${{ matrix.target }}
|
||||
|
||||
- name: Install system dependencies
|
||||
uses: ./.github/actions/install-system-deps
|
||||
|
||||
- name: Setup OpenSSL
|
||||
uses: kreuzberg-dev/actions/setup-openssl@v1
|
||||
|
||||
- name: Build FFI library
|
||||
uses: kreuzberg-dev/actions/build-rust-ffi@v1
|
||||
with:
|
||||
crate-name: kreuzberg-ffi
|
||||
|
||||
- name: Build CLI
|
||||
uses: kreuzberg-dev/actions/build-rust-cli@v1
|
||||
with:
|
||||
package-name: kreuzberg-cli
|
||||
binary-name: kreuzberg
|
||||
extra-cargo-args: --features all
|
||||
|
||||
- name: Upload FFI artifacts
|
||||
uses: actions/upload-artifact@v7
|
||||
with:
|
||||
name: ffi-${{ matrix.target }}
|
||||
path: |
|
||||
target/release/libkreuzberg_ffi.*
|
||||
target/release/kreuzberg_ffi.*
|
||||
crates/kreuzberg-ffi/include/kreuzberg.h
|
||||
crates/kreuzberg-ffi/kreuzberg-ffi.pc
|
||||
crates/kreuzberg-ffi/cmake/
|
||||
target/release/kreuzberg
|
||||
target/release/kreuzberg.exe
|
||||
retention-days: 7
|
||||
if-no-files-found: error
|
||||
|
||||
e2e-tests:
|
||||
name: E2E (${{ matrix.lang }})
|
||||
if: github.repository == 'kreuzberg-dev/kreuzberg' && github.actor != 'dependabot[bot]'
|
||||
needs: [build-ffi]
|
||||
runs-on: ubuntu-24.04-arm
|
||||
timeout-minutes: 60
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
include:
|
||||
- lang: python
|
||||
python-version: "3.13"
|
||||
test-cmd: "pip install maturin && cd packages/python && maturin develop --release && cd ../../e2e/python && python3 -m pytest tests/ -q"
|
||||
- lang: node
|
||||
node-version: "24"
|
||||
test-cmd: "cd crates/kreuzberg-node && npm run build && cd ../../e2e/node && npx vitest run"
|
||||
- lang: go
|
||||
go-version: "1.26"
|
||||
test-cmd: "cd e2e/go && go test ./... -count=1 -v"
|
||||
- lang: ruby
|
||||
ruby-version: "3.4"
|
||||
test-cmd: "cd e2e/ruby && bundle exec rspec"
|
||||
- lang: java
|
||||
java-version: "25"
|
||||
test-cmd: "cd packages/java && mvn -q package -DskipTests && cd ../../e2e/java && mvn test -q"
|
||||
- lang: csharp
|
||||
dotnet-version: "10.0.x"
|
||||
test-cmd: "cd e2e/csharp && dotnet test"
|
||||
- lang: php
|
||||
php-version: "8.4"
|
||||
test-cmd: 'cd crates/kreuzberg-php && cargo build --release && echo "extension=$(pwd)/../../target/release/libkreuzberg_php.so" | sudo tee -a "$(php -r ''echo php_ini_loaded_file();'')" >/dev/null && cd ../../e2e/php && composer install -q && vendor/bin/phpunit'
|
||||
- lang: elixir
|
||||
elixir-version: "1.19"
|
||||
otp-version: "28"
|
||||
test-cmd: "cd e2e/elixir && KREUZBERG_BUILD=true mix deps.get && KREUZBERG_BUILD=true mix test"
|
||||
- lang: wasm
|
||||
node-version: "24"
|
||||
test-cmd: 'curl https://rustwasm.github.io/wasm-pack/installer/init.sh -sSf | sh && export PATH="$HOME/.cargo/bin:$PATH" && export RUSTFLAGS=''--cfg getrandom_backend="wasm_js"'' && cd crates/kreuzberg-wasm && wasm-pack build --release --target web --out-dir ../../packages/wasm/pkg && cd ../../e2e/wasm && npm install && npm test'
|
||||
- lang: rust
|
||||
test-cmd: "cd e2e/rust && cargo test"
|
||||
- lang: r
|
||||
r-version: "4.3"
|
||||
test-cmd: "cd e2e/r && Rscript run_tests.R"
|
||||
- lang: dart
|
||||
dart-version: "3.11"
|
||||
test-cmd: "cargo build --release -p kreuzberg-dart && mkdir -p packages/dart/rust/target/release && cp target/release/libkreuzberg_dart.* packages/dart/rust/target/release/ 2>/dev/null || true && cd packages/dart && dart pub get && cd ../../e2e/dart && dart pub get && dart test"
|
||||
- lang: kotlin_android
|
||||
java-version: "25"
|
||||
test-cmd: "cd e2e/kotlin_android && gradle test --no-daemon"
|
||||
- lang: swift
|
||||
swift-version: "6.0"
|
||||
test-cmd: "cd e2e/swift_e2e && swift test"
|
||||
- lang: zig
|
||||
zig-version: "0.16.0"
|
||||
test-cmd: 'FFI_ABS="$PWD/target/release" && cd e2e/zig && zig build test -Dffi_path="$FFI_ABS"'
|
||||
steps:
|
||||
- uses: actions/checkout@v6
|
||||
with:
|
||||
submodules: recursive
|
||||
|
||||
- name: Setup Rust
|
||||
uses: kreuzberg-dev/actions/setup-rust@v1
|
||||
with:
|
||||
cache-key-prefix: e2e-${{ matrix.lang }}
|
||||
|
||||
- name: Download FFI artifacts
|
||||
uses: actions/download-artifact@v8
|
||||
with:
|
||||
name: ffi-aarch64-unknown-linux-gnu
|
||||
path: ffi-artifacts
|
||||
|
||||
- name: Stage FFI artifacts
|
||||
shell: bash
|
||||
run: |
|
||||
mkdir -p target/release crates/kreuzberg-ffi/include crates/kreuzberg-ffi/cmake
|
||||
if [ -d ffi-artifacts/target/release ]; then
|
||||
cp -r ffi-artifacts/target/release/. target/release/
|
||||
fi
|
||||
if [ -d ffi-artifacts/crates/kreuzberg-ffi/include ]; then
|
||||
cp -r ffi-artifacts/crates/kreuzberg-ffi/include/. crates/kreuzberg-ffi/include/
|
||||
fi
|
||||
if [ -d ffi-artifacts/crates/kreuzberg-ffi/cmake ]; then
|
||||
cp -r ffi-artifacts/crates/kreuzberg-ffi/cmake/. crates/kreuzberg-ffi/cmake/
|
||||
fi
|
||||
if [ -f ffi-artifacts/crates/kreuzberg-ffi/kreuzberg-ffi.pc ]; then
|
||||
cp ffi-artifacts/crates/kreuzberg-ffi/kreuzberg-ffi.pc crates/kreuzberg-ffi/
|
||||
fi
|
||||
chmod +x target/release/libkreuzberg_ffi.so 2>/dev/null || true
|
||||
ls -la target/release/
|
||||
if [ -f target/release/libkreuzberg_ffi.so ]; then
|
||||
sudo cp target/release/libkreuzberg_ffi.so /usr/local/lib/
|
||||
sudo ldconfig
|
||||
fi
|
||||
|
||||
- name: Install system dependencies
|
||||
uses: ./.github/actions/install-system-deps
|
||||
|
||||
- name: Setup OpenSSL
|
||||
uses: kreuzberg-dev/actions/setup-openssl@v1
|
||||
|
||||
- name: Setup ONNX Runtime
|
||||
uses: ./.github/actions/setup-onnx-runtime
|
||||
with:
|
||||
ort-version: ${{ env.ORT_VERSION }}
|
||||
|
||||
- name: Setup Tesseract cache
|
||||
uses: ./.github/actions/setup-tesseract-cache
|
||||
with:
|
||||
label: e2e-${{ matrix.lang }}
|
||||
|
||||
- name: Install WASI SDK
|
||||
if: matrix.lang == 'wasm'
|
||||
uses: kreuzberg-dev/actions/install-wasi-sdk@v1
|
||||
|
||||
- name: Setup Python
|
||||
if: matrix.python-version
|
||||
uses: kreuzberg-dev/actions/setup-python-env@v1
|
||||
with:
|
||||
python-version: ${{ matrix.python-version }}
|
||||
cache-prefix: e2e-py-${{ matrix.python-version }}
|
||||
|
||||
# Runs for the matrix entries that declare a node-version (node, wasm) and
# passes that version to the action; previously the matrix value only gated
# the step and was never passed to the action.
- name: Setup Node
  if: matrix.node-version
  uses: kreuzberg-dev/actions/setup-node-workspace@v1
  with:
    node-version: ${{ matrix.node-version }}
|
||||
|
||||
- name: Setup Go
|
||||
if: matrix.go-version
|
||||
uses: actions/setup-go@v6
|
||||
with:
|
||||
go-version: ${{ matrix.go-version }}
|
||||
|
||||
- name: Setup Ruby
|
||||
if: matrix.ruby-version
|
||||
uses: ruby/setup-ruby@v1
|
||||
with:
|
||||
ruby-version: ${{ matrix.ruby-version }}
|
||||
bundler-cache: true
|
||||
working-directory: e2e/ruby
|
||||
|
||||
- name: Setup Java
|
||||
if: matrix.java-version
|
||||
uses: actions/setup-java@v5
|
||||
with:
|
||||
distribution: temurin
|
||||
java-version: ${{ matrix.java-version }}
|
||||
|
||||
- name: Setup Android SDK
|
||||
if: matrix.lang == 'kotlin_android'
|
||||
uses: android-actions/setup-android@v3
|
||||
with:
|
||||
api-level: 35
|
||||
build-tools-version: "35.0.0"
|
||||
|
||||
- name: Setup Gradle
|
||||
if: matrix.lang == 'kotlin_android'
|
||||
uses: kreuzberg-dev/actions/setup-gradle@v1
|
||||
with:
|
||||
gradle-version: "9.1.0"
|
||||
|
||||
- name: Setup .NET
|
||||
if: matrix.dotnet-version
|
||||
uses: actions/setup-dotnet@v5
|
||||
with:
|
||||
dotnet-version: ${{ matrix.dotnet-version }}
|
||||
|
||||
- name: Setup PHP
|
||||
if: matrix.php-version
|
||||
uses: kreuzberg-dev/actions/setup-php@v1
|
||||
with:
|
||||
php-version: ${{ matrix.php-version }}
|
||||
tools: composer
|
||||
|
||||
- name: Setup Elixir
|
||||
if: matrix.elixir-version
|
||||
uses: kreuzberg-dev/actions/setup-elixir@v1
|
||||
with:
|
||||
elixir-version: ${{ matrix.elixir-version }}
|
||||
otp-version: ${{ matrix.otp-version }}
|
||||
|
||||
- name: Setup R
|
||||
if: matrix.r-version
|
||||
uses: kreuzberg-dev/actions/setup-r@v1
|
||||
with:
|
||||
r-version: ${{ matrix.r-version }}
|
||||
|
||||
- name: Install R test packages
|
||||
if: matrix.lang == 'r'
|
||||
run: R -e 'install.packages(c("testthat","jsonlite","devtools"), repos="https://cloud.r-project.org")'
|
||||
|
||||
- name: Setup Dart
|
||||
if: matrix.dart-version
|
||||
uses: dart-lang/setup-dart@v1
|
||||
with:
|
||||
sdk: ${{ matrix.dart-version }}
|
||||
|
||||
- name: Setup Swift
|
||||
if: matrix.swift-version
|
||||
uses: kreuzberg-dev/actions/setup-swift@v1
|
||||
with:
|
||||
swift-version: ${{ matrix.swift-version }}
|
||||
|
||||
- name: Setup Zig
|
||||
if: matrix.zig-version
|
||||
uses: kreuzberg-dev/actions/setup-zig@v1
|
||||
with:
|
||||
version: ${{ matrix.zig-version }}
|
||||
|
||||
- name: Setup library paths for FFI bindings
  if: |
    matrix.lang == 'go' || matrix.lang == 'java' ||
    matrix.lang == 'csharp' || matrix.lang == 'elixir' ||
    matrix.lang == 'r' || matrix.lang == 'kotlin_android' ||
    matrix.lang == 'swift' || matrix.lang == 'zig'
  shell: bash
  # Prepends the FFI crate directory and the release target directory to the
  # pkg-config / dynamic-loader search paths and persists both for later steps
  # through GITHUB_ENV.
  run: |
    # ${VAR:+:${VAR}} appends the previous value only when it is non-empty.
    # A bare ":${VAR}" would leave a trailing colon when the variable is
    # unset, and an empty LD_LIBRARY_PATH entry makes the loader search the
    # current working directory.
    pkg_config_path="${PWD}/crates/kreuzberg-ffi${PKG_CONFIG_PATH:+:${PKG_CONFIG_PATH}}"
    ld_library_path="${PWD}/target/release${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
    echo "PKG_CONFIG_PATH=${pkg_config_path}" >> "$GITHUB_ENV"
    echo "LD_LIBRARY_PATH=${ld_library_path}" >> "$GITHUB_ENV"
|
||||
|
||||
- name: Install Task
|
||||
uses: kreuzberg-dev/actions/install-task@v1
|
||||
|
||||
- name: Compile Ruby native extension
|
||||
if: matrix.lang == 'ruby'
|
||||
working-directory: packages/ruby
|
||||
run: bundle install && bundle exec rake compile
|
||||
|
||||
- name: Run tests
|
||||
run: ${{ matrix.test-cmd }}
|
||||
shell: bash
|
||||
env:
|
||||
PKG_CONFIG_PATH: ${{ env.PKG_CONFIG_PATH }}
|
||||
LD_LIBRARY_PATH: ${{ env.LD_LIBRARY_PATH }}
|
||||
DYLD_LIBRARY_PATH: ${{ env.DYLD_LIBRARY_PATH || '' }}
|
||||
TESSDATA_PREFIX: "/usr/share/tesseract-ocr/5/tessdata"
|
||||
112
.github/workflows/ci-gpu.yaml
vendored
Normal file
112
.github/workflows/ci-gpu.yaml
vendored
Normal file
@@ -0,0 +1,112 @@
|
||||
name: CI GPU
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
|
||||
concurrency:
|
||||
group: ci-gpu-${{ github.ref }}
|
||||
cancel-in-progress: true
|
||||
|
||||
env:
|
||||
CARGO_TERM_COLOR: always
|
||||
CARGO_INCREMENTAL: 0
|
||||
CARGO_PROFILE_DEV_DEBUG: 0
|
||||
RUST_BACKTRACE: short
|
||||
RUST_MIN_STACK: 16777216
|
||||
ORT_VERSION: "1.24.2"
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
|
||||
jobs:
|
||||
build:
|
||||
name: "Build test binary"
|
||||
runs-on: ubuntu-latest
|
||||
timeout-minutes: 30
|
||||
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v6
|
||||
with:
|
||||
submodules: recursive
|
||||
|
||||
- name: Install system dependencies
|
||||
uses: ./.github/actions/install-system-deps
|
||||
|
||||
- name: Setup Rust
|
||||
uses: dtolnay/rust-toolchain@stable
|
||||
with:
|
||||
toolchain: "1.95"
|
||||
|
||||
- name: Cache Cargo
|
||||
uses: actions/cache@v5
|
||||
with:
|
||||
path: |
|
||||
~/.cargo/registry
|
||||
~/.cargo/git
|
||||
target
|
||||
key: gpu-build-${{ runner.os }}-${{ hashFiles('Cargo.lock') }}
|
||||
restore-keys: |
|
||||
gpu-build-${{ runner.os }}-
|
||||
|
||||
- name: Build GPU test binary
|
||||
uses: kreuzberg-dev/actions/build-gpu-test-binary@v1
|
||||
with:
|
||||
package: kreuzberg
|
||||
test-name: gpu_acceleration
|
||||
features: "paddle-ocr,layout-detection,embeddings,pdf,ocr,ort-dynamic"
|
||||
output-name: gpu-acceleration-test
|
||||
|
||||
- name: Upload test binary
|
||||
uses: actions/upload-artifact@v7
|
||||
with:
|
||||
name: gpu-test-binary
|
||||
path: gpu-acceleration-test
|
||||
retention-days: 1
|
||||
|
||||
gpu-tests:
|
||||
name: "GPU Tests (CUDA)"
|
||||
needs: build
|
||||
runs-on: runner-gpu-l4
|
||||
timeout-minutes: 15
|
||||
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v6
|
||||
with:
|
||||
submodules: recursive
|
||||
|
||||
- name: Verify GPU
|
||||
run: |
|
||||
nvidia-smi || {
|
||||
echo "ERROR: nvidia-smi failed — no GPU detected"
|
||||
exit 1
|
||||
}
|
||||
echo "GPU detected:"
|
||||
nvidia-smi --query-gpu=name,driver_version,memory.total --format=csv,noheader
|
||||
|
||||
- name: Download test binary
|
||||
uses: actions/download-artifact@v8.0.1
|
||||
with:
|
||||
name: gpu-test-binary
|
||||
|
||||
- name: Download ONNX Runtime (GPU/CUDA)
|
||||
uses: kreuzberg-dev/actions/setup-onnx-runtime-gpu@v1
|
||||
with:
|
||||
version: ${{ env.ORT_VERSION }}
|
||||
|
||||
- name: Setup PaddleOCR models
|
||||
uses: ./.github/actions/setup-paddle-ocr-models
|
||||
|
||||
- name: Clear stale layout model cache (self-hosted runner persistence)
|
||||
run: |
|
||||
rm -rf "$HOME/.cache/kreuzberg/layout"
|
||||
echo "Cleared layout model cache"
|
||||
|
||||
- name: Run GPU tests
|
||||
run: |
|
||||
chmod +x gpu-acceleration-test
|
||||
./gpu-acceleration-test --ignored --nocapture
|
||||
env:
|
||||
RUST_LOG: "kreuzberg=debug"
|
||||
TEST_DOCUMENTS_DIR: ${{ github.workspace }}/test_documents
|
||||
107
.github/workflows/ci-lint.yaml
vendored
Normal file
107
.github/workflows/ci-lint.yaml
vendored
Normal file
@@ -0,0 +1,107 @@
|
||||
name: CI Lint
|
||||
|
||||
on:
|
||||
push:
|
||||
branches: [main]
|
||||
pull_request:
|
||||
branches: [main]
|
||||
workflow_dispatch:
|
||||
|
||||
concurrency:
|
||||
group: ci-lint-${{ github.ref }}
|
||||
cancel-in-progress: true
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
|
||||
jobs:
|
||||
lint:
|
||||
name: Lint
|
||||
if: github.repository == 'kreuzberg-dev/kreuzberg' && github.actor != 'dependabot[bot]'
|
||||
runs-on: ubuntu-24.04-arm
|
||||
timeout-minutes: 60
|
||||
env:
|
||||
CARGO_TERM_COLOR: always
|
||||
FORCE_JAVASCRIPT_ACTIONS_TO_NODE24: "true"
|
||||
steps:
|
||||
- uses: actions/checkout@v6
|
||||
with:
|
||||
submodules: recursive
|
||||
|
||||
- name: Setup Rust
|
||||
uses: kreuzberg-dev/actions/setup-rust@v1
|
||||
with:
|
||||
cache-key-prefix: lint
|
||||
|
||||
- name: Setup Python
|
||||
uses: kreuzberg-dev/actions/setup-python-env@v1
|
||||
with:
|
||||
python-version: "3.13"
|
||||
cache-prefix: lint-py
|
||||
install-command: "uv sync --group dev --no-install-project --no-install-workspace --frozen"
|
||||
|
||||
- name: Setup Node
|
||||
uses: kreuzberg-dev/actions/setup-node-workspace@v1
|
||||
|
||||
- name: Setup Go
|
||||
uses: actions/setup-go@v6
|
||||
with:
|
||||
go-version: "1.26"
|
||||
|
||||
- name: Setup Java
|
||||
uses: actions/setup-java@v5
|
||||
with:
|
||||
distribution: temurin
|
||||
java-version: "25"
|
||||
|
||||
- name: Setup Elixir
|
||||
uses: kreuzberg-dev/actions/setup-elixir@v1
|
||||
|
||||
- name: Setup Ruby
|
||||
uses: ruby/setup-ruby@v1
|
||||
with:
|
||||
ruby-version: "3.4"
|
||||
bundler-cache: true
|
||||
working-directory: packages/ruby
|
||||
|
||||
- name: Setup PHP
|
||||
uses: kreuzberg-dev/actions/setup-php@v1
|
||||
|
||||
- name: Setup .NET
|
||||
uses: actions/setup-dotnet@v5
|
||||
with:
|
||||
dotnet-version: "10.0.x"
|
||||
|
||||
- name: Setup R
|
||||
uses: kreuzberg-dev/actions/setup-r@v1
|
||||
with:
|
||||
r-version: "release"
|
||||
install-deps: "false"
|
||||
|
||||
- name: Install Task
|
||||
uses: kreuzberg-dev/actions/install-task@v1
|
||||
|
||||
- name: Setup Helm
|
||||
uses: azure/setup-helm@v5
|
||||
|
||||
- name: Setup kubeconform
|
||||
uses: bmuschko/setup-kubeconform@v1
|
||||
|
||||
- name: Install alef CLI
|
||||
uses: kreuzberg-dev/actions/install-alef@v1
|
||||
|
||||
- name: Run all prek hooks
|
||||
uses: j178/prek-action@v2
|
||||
with:
|
||||
cache: false
|
||||
extra-args: --all-files
|
||||
|
||||
- name: Validate C header
|
||||
shell: bash
|
||||
run: |
|
||||
HEADER="crates/kreuzberg-ffi/include/kreuzberg.h"
|
||||
if [ ! -f "$HEADER" ]; then
|
||||
echo "::error::C header not found at $HEADER — run 'task alef:generate'"
|
||||
exit 1
|
||||
fi
|
||||
echo "C header verified at $HEADER"
|
||||
79
.github/workflows/ci-mobile.yaml
vendored
Normal file
79
.github/workflows/ci-mobile.yaml
vendored
Normal file
@@ -0,0 +1,79 @@
|
||||
name: CI Mobile
|
||||
|
||||
on:
|
||||
push:
|
||||
branches: [main]
|
||||
paths:
|
||||
- "crates/**"
|
||||
- "packages/dart/**"
|
||||
- "packages/swift/**"
|
||||
- "packages/kotlin-android/**"
|
||||
- ".github/workflows/ci-mobile.yaml"
|
||||
pull_request:
|
||||
branches: [main]
|
||||
paths:
|
||||
- "crates/**"
|
||||
- "packages/dart/**"
|
||||
- "packages/swift/**"
|
||||
- "packages/kotlin-android/**"
|
||||
- ".github/workflows/ci-mobile.yaml"
|
||||
workflow_dispatch:
|
||||
|
||||
concurrency:
|
||||
group: ci-mobile-${{ github.ref }}
|
||||
cancel-in-progress: true
|
||||
|
||||
env:
|
||||
CARGO_TERM_COLOR: always
|
||||
CARGO_INCREMENTAL: 0
|
||||
CARGO_PROFILE_DEV_DEBUG: 0
|
||||
RUST_BACKTRACE: short
|
||||
# Mobile feature subsets (Android drops ORT-requiring features) leave some
|
||||
# functions only used in the full-feature graph; -A dead_code keeps the
|
||||
# cross-compile check honest about other classes of warnings without choking
|
||||
# on these.
|
||||
RUSTFLAGS: "-D warnings -A dead_code -A unpredictable-function-pointer-comparisons -A mismatched-lifetime-syntaxes"
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
|
||||
jobs:
|
||||
android-check:
|
||||
name: Android cargo check (${{ matrix.abi }})
|
||||
if: github.repository == 'kreuzberg-dev/kreuzberg' && github.actor != 'dependabot[bot]'
|
||||
runs-on: ubuntu-latest
|
||||
timeout-minutes: 30
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
abi: [arm64-v8a, x86_64]
|
||||
steps:
|
||||
- uses: actions/checkout@v6
|
||||
- uses: kreuzberg-dev/actions/setup-rust@v1
|
||||
with:
|
||||
cache-key-prefix: ci-mobile-android-${{ matrix.abi }}
|
||||
- uses: kreuzberg-dev/actions/setup-android-ndk@v1
|
||||
- name: cargo ndk check kreuzberg-dart
|
||||
run: cargo ndk --target ${{ matrix.abi }} --platform 21 -- check -p kreuzberg-dart
|
||||
- name: cargo ndk check kreuzberg-ffi
|
||||
run: cargo ndk --target ${{ matrix.abi }} --platform 21 -- check -p kreuzberg-ffi
|
||||
|
||||
ios-check:
|
||||
name: iOS cargo check (${{ matrix.target }})
|
||||
if: github.repository == 'kreuzberg-dev/kreuzberg' && github.actor != 'dependabot[bot]'
|
||||
runs-on: macos-latest
|
||||
timeout-minutes: 30
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
target: [aarch64-apple-ios, aarch64-apple-ios-sim]
|
||||
steps:
|
||||
- uses: actions/checkout@v6
|
||||
- uses: kreuzberg-dev/actions/setup-rust@v1
|
||||
with:
|
||||
target: ${{ matrix.target }}
|
||||
cache-key-prefix: ci-mobile-ios-${{ matrix.target }}
|
||||
- name: cargo check kreuzberg-dart
|
||||
run: cargo check -p kreuzberg-dart --target ${{ matrix.target }}
|
||||
- name: cargo check kreuzberg-swift
|
||||
run: cargo check -p kreuzberg-swift --target ${{ matrix.target }}
|
||||
103
.github/workflows/ci-rust.yaml
vendored
Normal file
103
.github/workflows/ci-rust.yaml
vendored
Normal file
@@ -0,0 +1,103 @@
|
||||
name: CI Rust
|
||||
|
||||
on:
|
||||
push:
|
||||
branches: [main]
|
||||
paths:
|
||||
- "crates/**"
|
||||
- "Cargo.toml"
|
||||
- "Cargo.lock"
|
||||
- "rust-toolchain.toml"
|
||||
- ".github/workflows/ci-rust.yaml"
|
||||
pull_request:
|
||||
branches: [main]
|
||||
paths:
|
||||
- "crates/**"
|
||||
- "Cargo.toml"
|
||||
- "Cargo.lock"
|
||||
- "rust-toolchain.toml"
|
||||
- ".github/workflows/ci-rust.yaml"
|
||||
workflow_dispatch:
|
||||
|
||||
concurrency:
|
||||
group: ci-rust-${{ github.ref }}
|
||||
cancel-in-progress: true
|
||||
|
||||
env:
|
||||
CARGO_TERM_COLOR: always
|
||||
CARGO_INCREMENTAL: 0
|
||||
CARGO_PROFILE_DEV_DEBUG: 0
|
||||
RUST_BACKTRACE: short
|
||||
RUST_MIN_STACK: 16777216
|
||||
ORT_VERSION: "1.24.2"
|
||||
MACOSX_DEPLOYMENT_TARGET: "14.0"
|
||||
BUILD_PROFILE: "ci"
|
||||
FORCE_JAVASCRIPT_ACTIONS_TO_NODE24: "true"
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
|
||||
jobs:
|
||||
rust:
|
||||
name: Rust (${{ matrix.os }})
|
||||
if: github.repository == 'kreuzberg-dev/kreuzberg' && github.actor != 'dependabot[bot]'
|
||||
runs-on: ${{ matrix.os }}
|
||||
timeout-minutes: 60
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
include:
|
||||
- os: ubuntu-24.04-arm
|
||||
- os: macos-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v6
|
||||
with:
|
||||
submodules: recursive
|
||||
|
||||
- name: Free disk space
|
||||
if: runner.os == 'Linux'
|
||||
uses: kreuzberg-dev/actions/free-disk-space-linux@v1
|
||||
with:
|
||||
show-initial: "false"
|
||||
show-final: "true"
|
||||
|
||||
- name: Setup Rust
|
||||
uses: kreuzberg-dev/actions/setup-rust@v1
|
||||
with:
|
||||
cache-key-prefix: rust-${{ matrix.os }}
|
||||
use-sccache: "true"
|
||||
|
||||
- name: Install system dependencies
|
||||
uses: ./.github/actions/install-system-deps
|
||||
|
||||
- name: Setup OpenSSL
|
||||
uses: kreuzberg-dev/actions/setup-openssl@v1
|
||||
|
||||
- name: Setup ONNX Runtime
|
||||
uses: ./.github/actions/setup-onnx-runtime
|
||||
with:
|
||||
ort-version: ${{ env.ORT_VERSION }}
|
||||
|
||||
- name: Setup Tesseract cache
|
||||
uses: ./.github/actions/setup-tesseract-cache
|
||||
with:
|
||||
label: ${{ matrix.os }}
|
||||
|
||||
- name: Install Task
|
||||
uses: kreuzberg-dev/actions/install-task@v1
|
||||
|
||||
- name: Run clippy
|
||||
run: cargo clippy --workspace --exclude kreuzberg-ffi --exclude kreuzberg-py --exclude kreuzberg-php --exclude kreuzberg-node --exclude kreuzberg-wasm --exclude kreuzberg-dart --exclude kreuzberg-swift --exclude kreuzberg_nif -- -D warnings
|
||||
shell: bash
|
||||
|
||||
- name: Run tests
|
||||
run: task rust:test:ci
|
||||
shell: bash
|
||||
env:
|
||||
LD_LIBRARY_PATH: ${{ env.LD_LIBRARY_PATH || '' }}
|
||||
DYLD_LIBRARY_PATH: ${{ env.DYLD_LIBRARY_PATH || '' }}
|
||||
DYLD_FALLBACK_LIBRARY_PATH: ${{ env.DYLD_FALLBACK_LIBRARY_PATH || '' }}
|
||||
|
||||
- name: Check no-default-features
|
||||
run: cargo check -p kreuzberg --no-default-features
|
||||
shell: bash
|
||||
1303
.github/workflows/profiling.yaml
vendored
Normal file
1303
.github/workflows/profiling.yaml
vendored
Normal file
File diff suppressed because it is too large
Load Diff
262
.github/workflows/publish-docker.yaml
vendored
Normal file
262
.github/workflows/publish-docker.yaml
vendored
Normal file
@@ -0,0 +1,262 @@
|
||||
name: Publish Docker Images
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
tag:
|
||||
description: "Release tag to build (e.g., v4.3.6)"
|
||||
required: true
|
||||
type: string
|
||||
dry_run:
|
||||
description: "Prepare artifacts without publishing"
|
||||
required: false
|
||||
type: boolean
|
||||
default: false
|
||||
ref:
|
||||
description: "Git ref (branch, tag, or commit) to build; defaults to the tag"
|
||||
required: false
|
||||
type: string
|
||||
force_republish:
|
||||
description: "Force re-publish even if artifacts already exist"
|
||||
required: false
|
||||
type: boolean
|
||||
default: false
|
||||
release:
|
||||
types: [published]
|
||||
repository_dispatch:
|
||||
types: [publish-docker]
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ (github.event_name == 'workflow_dispatch' && (github.event.inputs.ref || github.event.inputs.tag)) || github.ref || github.run_id }}
|
||||
cancel-in-progress: false
|
||||
|
||||
env:
|
||||
CARGO_TERM_COLOR: always
|
||||
ORT_VERSION: "1.24.2"
|
||||
MACOSX_DEPLOYMENT_TARGET: "14.0"
|
||||
FORCE_JAVASCRIPT_ACTIONS_TO_NODE24: "true"
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
|
||||
jobs:
|
||||
prepare:
|
||||
name: Prepare metadata
|
||||
if: ${{ github.event_name != 'release' || !github.event.release.prerelease }}
|
||||
runs-on: ubuntu-latest
|
||||
permissions:
|
||||
contents: read
|
||||
outputs:
|
||||
tag: ${{ steps.meta.outputs.tag }}
|
||||
version: ${{ steps.meta.outputs.version }}
|
||||
ref: ${{ steps.meta.outputs.ref }}
|
||||
dry_run: ${{ steps.meta.outputs.dry_run }}
|
||||
force_republish: ${{ steps.meta.outputs.force_republish }}
|
||||
checkout_ref: ${{ steps.meta.outputs.checkout_ref }}
|
||||
target_sha: ${{ steps.meta.outputs.target_sha }}
|
||||
is_tag: ${{ steps.meta.outputs.is_tag }}
|
||||
release_docker: ${{ steps.meta.outputs.release_docker }}
|
||||
steps:
|
||||
- name: Checkout code (default)
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: Resolve release metadata
|
||||
id: meta
|
||||
uses: kreuzberg-dev/actions/prepare-release-metadata@v1
|
||||
with:
|
||||
tag: ${{ inputs.tag }}
|
||||
ref: ${{ inputs.ref }}
|
||||
targets: docker
|
||||
dry-run: ${{ inputs.dry_run }}
|
||||
force-republish: ${{ inputs.force_republish }}
|
||||
|
||||
- name: Re-checkout at target ref
|
||||
if: ${{ steps.meta.outputs.checkout_ref != '' }}
|
||||
uses: actions/checkout@v6
|
||||
with:
|
||||
ref: ${{ steps.meta.outputs.checkout_ref }}
|
||||
fetch-depth: 0
|
||||
submodules: recursive
|
||||
|
||||
- name: Show metadata
|
||||
env:
|
||||
META_TAG: ${{ steps.meta.outputs.tag }}
|
||||
META_VERSION: ${{ steps.meta.outputs.version }}
|
||||
META_REF: ${{ steps.meta.outputs.ref }}
|
||||
META_DRY_RUN: ${{ steps.meta.outputs.dry_run }}
|
||||
META_FORCE_REPUBLISH: ${{ steps.meta.outputs.force_republish }}
|
||||
META_CHECKOUT_REF: ${{ steps.meta.outputs.checkout_ref }}
|
||||
META_TARGET_SHA: ${{ steps.meta.outputs.target_sha }}
|
||||
META_IS_TAG: ${{ steps.meta.outputs.is_tag }}
|
||||
META_RELEASE_DOCKER: ${{ steps.meta.outputs.release_docker }}
|
||||
run: |
|
||||
{
|
||||
echo "## Release Metadata"
|
||||
echo "- **Tag**: \`$META_TAG\`"
|
||||
echo "- **Version**: \`$META_VERSION\`"
|
||||
echo "- **Ref**: \`$META_REF\`"
|
||||
echo "- **Dry Run**: \`$META_DRY_RUN\`"
|
||||
echo "- **Force Republish**: \`$META_FORCE_REPUBLISH\`"
|
||||
echo "- **Checkout Ref**: \`$META_CHECKOUT_REF\`"
|
||||
echo "- **Target SHA**: \`$META_TARGET_SHA\`"
|
||||
echo "- **Is Tag**: \`$META_IS_TAG\`"
|
||||
echo "- **Release Docker**: \`$META_RELEASE_DOCKER\`"
|
||||
} >> "$GITHUB_STEP_SUMMARY"
|
||||
|
||||
check-docker:
|
||||
name: Check if Docker image tag exists
|
||||
needs: prepare
|
||||
if: ${{ needs.prepare.outputs.release_docker == 'true' }}
|
||||
runs-on: ubuntu-latest
|
||||
permissions:
|
||||
contents: read
|
||||
packages: read
|
||||
outputs:
|
||||
core_exists: ${{ steps.core.outputs.exists }}
|
||||
full_exists: ${{ steps.full.outputs.exists }}
|
||||
cli_exists: ${{ steps.cli.outputs.exists }}
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v6
|
||||
with:
|
||||
ref: ${{ needs.prepare.outputs.tag }}
|
||||
|
||||
- name: Log in to GitHub Container Registry
|
||||
uses: docker/login-action@v4
|
||||
with:
|
||||
registry: ghcr.io
|
||||
username: ${{ github.actor }}
|
||||
password: ${{ secrets.GITHUB_TOKEN }}
|
||||
|
||||
- name: Check core image tag
|
||||
id: core
|
||||
env:
|
||||
DOCKER_TAG: ghcr.io/kreuzberg-dev/kreuzberg:${{ needs.prepare.outputs.version }}-core
|
||||
SUMMARY_LABEL: core
|
||||
run: scripts/publish/check-docker-tag.sh
|
||||
|
||||
- name: Check full image tag
|
||||
id: full
|
||||
env:
|
||||
DOCKER_TAG: ghcr.io/kreuzberg-dev/kreuzberg:${{ needs.prepare.outputs.version }}
|
||||
SUMMARY_LABEL: full
|
||||
run: scripts/publish/check-docker-tag.sh
|
||||
|
||||
- name: Check CLI image tag
|
||||
id: cli
|
||||
env:
|
||||
DOCKER_TAG: ghcr.io/kreuzberg-dev/kreuzberg-cli:${{ needs.prepare.outputs.version }}
|
||||
SUMMARY_LABEL: cli
|
||||
run: scripts/publish/check-docker-tag.sh
|
||||
|
||||
publish-docker:
|
||||
name: Publish Docker image (${{ matrix.variant }})
|
||||
needs:
|
||||
- prepare
|
||||
- check-docker
|
||||
runs-on: ubuntu-latest
|
||||
timeout-minutes: 360
|
||||
permissions:
|
||||
contents: read
|
||||
packages: write
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
include:
|
||||
- variant: core
|
||||
dockerfile: docker/Dockerfile.core
|
||||
image: ghcr.io/kreuzberg-dev/kreuzberg
|
||||
tag_suffix: "-core"
|
||||
extra_tag: "core"
|
||||
- variant: full
|
||||
dockerfile: docker/Dockerfile.full
|
||||
image: ghcr.io/kreuzberg-dev/kreuzberg
|
||||
tag_suffix: ""
|
||||
extra_tag: "latest"
|
||||
- variant: cli
|
||||
dockerfile: docker/Dockerfile.cli
|
||||
image: ghcr.io/kreuzberg-dev/kreuzberg-cli
|
||||
tag_suffix: ""
|
||||
extra_tag: "latest"
|
||||
if: ${{ needs.prepare.outputs.release_docker == 'true' }}
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v6
|
||||
with:
|
||||
ref: ${{ needs.prepare.outputs.checkout_ref }}
|
||||
fetch-depth: 0
|
||||
submodules: recursive
|
||||
|
||||
- name: Free up disk space
|
||||
uses: kreuzberg-dev/actions/free-disk-space-linux@v1
|
||||
|
||||
- name: Ensure target commit
  if: ${{ needs.prepare.outputs.target_sha != '' }}
  # The SHA is handed to the shell through the environment instead of being
  # interpolated into the command line, so its content can never be parsed
  # as shell syntax.
  env:
    TARGET_SHA: ${{ needs.prepare.outputs.target_sha }}
  run: git checkout --progress --force "$TARGET_SHA"
|
||||
|
||||
- name: Set up QEMU
|
||||
uses: docker/setup-qemu-action@v4
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@v4
|
||||
|
||||
- name: Skip because tag already exists
|
||||
if: ${{ needs.prepare.outputs.force_republish != 'true' && ((matrix.variant == 'core' && needs.check-docker.outputs.core_exists == 'true') || (matrix.variant == 'full' && needs.check-docker.outputs.full_exists == 'true') || (matrix.variant == 'cli' && needs.check-docker.outputs.cli_exists == 'true')) }}
|
||||
run: echo "Docker tag already exists for variant ${{ matrix.variant }}; skipping publish." >> "$GITHUB_STEP_SUMMARY"
|
||||
|
||||
- name: Build AMD64 test image
|
||||
if: ${{ needs.prepare.outputs.force_republish == 'true' || (matrix.variant == 'core' && needs.check-docker.outputs.core_exists != 'true') || (matrix.variant == 'full' && needs.check-docker.outputs.full_exists != 'true') || (matrix.variant == 'cli' && needs.check-docker.outputs.cli_exists != 'true') }}
|
||||
run: docker build -f ${{ matrix.dockerfile }} --build-arg ONNXRUNTIME_VERSION=${{ env.ORT_VERSION }} -t kreuzberg-publish:${{ matrix.variant }}-test .
|
||||
|
||||
- name: Run Docker tests
|
||||
if: ${{ needs.prepare.outputs.force_republish == 'true' || (matrix.variant == 'core' && needs.check-docker.outputs.core_exists != 'true') || (matrix.variant == 'full' && needs.check-docker.outputs.full_exists != 'true') || (matrix.variant == 'cli' && needs.check-docker.outputs.cli_exists != 'true') }}
|
||||
run: python3 scripts/ci/docker/test_docker.py --image kreuzberg-publish:${{ matrix.variant }}-test --variant ${{ matrix.variant }} --verbose
|
||||
|
||||
- name: Log in to GitHub Container Registry
|
||||
if: ${{ needs.prepare.outputs.dry_run != 'true' && (needs.prepare.outputs.force_republish == 'true' || (matrix.variant == 'core' && needs.check-docker.outputs.core_exists != 'true') || (matrix.variant == 'full' && needs.check-docker.outputs.full_exists != 'true') || (matrix.variant == 'cli' && needs.check-docker.outputs.cli_exists != 'true')) }}
|
||||
uses: docker/login-action@v4
|
||||
with:
|
||||
registry: ghcr.io
|
||||
username: ${{ github.actor }}
|
||||
password: ${{ secrets.GITHUB_TOKEN }}
|
||||
|
||||
- name: Extract Docker metadata
|
||||
if: ${{ needs.prepare.outputs.dry_run != 'true' && (needs.prepare.outputs.force_republish == 'true' || (matrix.variant == 'core' && needs.check-docker.outputs.core_exists != 'true') || (matrix.variant == 'full' && needs.check-docker.outputs.full_exists != 'true') || (matrix.variant == 'cli' && needs.check-docker.outputs.cli_exists != 'true')) }}
|
||||
id: docker_meta
|
||||
uses: docker/metadata-action@v6
|
||||
with:
|
||||
images: ${{ matrix.image }}
|
||||
tags: |
|
||||
type=raw,value=${{ needs.prepare.outputs.version }}${{ matrix.tag_suffix }}
|
||||
type=raw,value=${{ matrix.extra_tag }}
|
||||
|
||||
# Builds the multi-arch image for the current matrix variant and pushes it to
# the registry. Skipped on dry runs and when the tag already exists (unless a
# forced republish was requested).
- name: Build and push image
  if: ${{ needs.prepare.outputs.dry_run != 'true' && (needs.prepare.outputs.force_republish == 'true' || (matrix.variant == 'core' && needs.check-docker.outputs.core_exists != 'true') || (matrix.variant == 'full' && needs.check-docker.outputs.full_exists != 'true') || (matrix.variant == 'cli' && needs.check-docker.outputs.cli_exists != 'true')) }}
  uses: docker/build-push-action@v7
  with:
    context: .
    file: ${{ matrix.dockerfile }}
    push: true
    build-args: |
      ONNXRUNTIME_VERSION=${{ env.ORT_VERSION }}
    tags: ${{ steps.docker_meta.outputs.tags }}
    labels: |
      ${{ steps.docker_meta.outputs.labels }}
      org.opencontainers.image.source=${{ github.server_url }}/${{ github.repository }}
      org.opencontainers.image.description=Kreuzberg document intelligence - ${{ matrix.variant }} variant
      org.opencontainers.image.licenses=MIT
    platforms: linux/amd64,linux/arm64
    # cache-from must name the same scope that cache-to writes; without it
    # the gha backend reads the default scope and the exported cache is
    # never restored.
    cache-from: type=gha,scope=publish-docker-${{ matrix.variant }}
    cache-to: type=gha,mode=max,scope=publish-docker-${{ matrix.variant }}
|
||||
|
||||
- name: Docker dry-run summary
|
||||
if: ${{ needs.prepare.outputs.dry_run == 'true' }}
|
||||
env:
|
||||
IMAGE: ${{ matrix.image }}
|
||||
VERSION: ${{ needs.prepare.outputs.version }}
|
||||
TAG_SUFFIX: ${{ matrix.tag_suffix }}
|
||||
run: scripts/publish/docker/dry-run-summary.sh
|
||||
|
||||
- name: Clean up local Docker images
|
||||
if: ${{ always() }}
|
||||
run: docker rmi kreuzberg-publish:${{ matrix.variant }}-test || true
|
||||
108
.github/workflows/publish-helm.yaml
vendored
Normal file
108
.github/workflows/publish-helm.yaml
vendored
Normal file
@@ -0,0 +1,108 @@
|
||||
name: Publish Helm Chart
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
tag:
|
||||
description: "Release tag to build (e.g., v4.3.6)"
|
||||
required: true
|
||||
type: string
|
||||
dry_run:
|
||||
description: "Prepare artifacts without publishing"
|
||||
required: false
|
||||
type: boolean
|
||||
default: false
|
||||
release:
|
||||
types: [published]
|
||||
repository_dispatch:
|
||||
types: [publish-helm]
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ (github.event_name == 'workflow_dispatch' && github.event.inputs.tag) || github.ref || github.run_id }}
|
||||
cancel-in-progress: false
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
|
||||
jobs:
|
||||
publish-helm:
|
||||
name: Publish Helm chart to GHCR
|
||||
if: ${{ github.event_name != 'release' || !github.event.release.prerelease }}
|
||||
runs-on: ubuntu-latest
|
||||
permissions:
|
||||
contents: read
|
||||
packages: write
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
# Derives the release tag from the triggering event, strips a leading "v" to
# obtain the chart version, and publishes tag / version / dry_run as step
# outputs plus a human-readable step summary.
- name: Resolve version
  id: meta
  # Event data reaches the script through the environment rather than being
  # interpolated into it, so a crafted tag cannot inject shell commands.
  env:
    EVENT_NAME: ${{ github.event_name }}
    TAG_FROM_INPUT: ${{ inputs.tag }}
    TAG_FROM_RELEASE: ${{ github.event.release.tag_name }}
    TAG_FROM_PAYLOAD: ${{ github.event.client_payload.tag }}
    DRY_RUN_INPUT: ${{ inputs.dry_run || 'false' }}
  run: |
    case "$EVENT_NAME" in
      workflow_dispatch) TAG="$TAG_FROM_INPUT" ;;
      release) TAG="$TAG_FROM_RELEASE" ;;
      repository_dispatch) TAG="$TAG_FROM_PAYLOAD" ;;
      *) TAG="" ;;
    esac

    # Fail early instead of packaging a chart with an empty version.
    if [[ -z "$TAG" ]]; then
      echo "::error::Could not resolve a release tag for event '${EVENT_NAME}'"
      exit 1
    fi

    VERSION="${TAG#v}"
    DRY_RUN="$DRY_RUN_INPUT"

    # Later steps splice the version into sed expressions and file names;
    # restrict it to the SemVer character set so it is safe to use there and
    # cannot smuggle extra lines into GITHUB_OUTPUT.
    if [[ ! "$VERSION" =~ ^[0-9A-Za-z._+-]+$ ]]; then
      echo "::error::Resolved version contains characters outside [0-9A-Za-z._+-]"
      exit 1
    fi

    {
      echo "tag=${TAG}"
      echo "version=${VERSION}"
      echo "dry_run=${DRY_RUN}"
    } >> "$GITHUB_OUTPUT"

    {
      echo "## Helm Publish Metadata"
      echo "- **Tag**: \`${TAG}\`"
      echo "- **Version**: \`${VERSION}\`"
      echo "- **Dry Run**: \`${DRY_RUN}\`"
    } >> "$GITHUB_STEP_SUMMARY"
|
||||
|
||||
- name: Setup Helm
|
||||
uses: azure/setup-helm@v5
|
||||
|
||||
- name: Lint chart
|
||||
run: helm lint --strict charts/kreuzberg/
|
||||
|
||||
- name: Update Chart.yaml version
|
||||
run: |
|
||||
sed -i "s/^version:.*/version: ${{ steps.meta.outputs.version }}/" charts/kreuzberg/Chart.yaml
|
||||
sed -i "s/^appVersion:.*/appVersion: \"${{ steps.meta.outputs.version }}\"/" charts/kreuzberg/Chart.yaml
|
||||
{
|
||||
echo "### Chart.yaml"
|
||||
echo '```yaml'
|
||||
cat charts/kreuzberg/Chart.yaml
|
||||
echo '```'
|
||||
} >> "$GITHUB_STEP_SUMMARY"
|
||||
|
||||
# Packages the chart into .helm-packages/ and records the resulting files in
# the step summary.
- name: Package chart
  run: |
    helm package charts/kreuzberg/ --destination .helm-packages/
    {
      echo "### Packaged"
      # Fence the listing so Markdown keeps the column layout instead of
      # collapsing the lines into a single paragraph.
      echo '```text'
      ls -lh .helm-packages/
      echo '```'
    } >> "$GITHUB_STEP_SUMMARY"
|
||||
|
||||
- name: Log in to GitHub Container Registry
|
||||
if: ${{ steps.meta.outputs.dry_run != 'true' }}
|
||||
uses: docker/login-action@v4
|
||||
with:
|
||||
registry: ghcr.io
|
||||
username: ${{ github.actor }}
|
||||
password: ${{ secrets.GITHUB_TOKEN }}
|
||||
|
||||
- name: Push chart to GHCR
|
||||
if: ${{ steps.meta.outputs.dry_run != 'true' }}
|
||||
run: |
|
||||
helm push .helm-packages/kreuzberg-${{ steps.meta.outputs.version }}.tgz oci://ghcr.io/kreuzberg-dev/charts
|
||||
echo "### Published" >> "$GITHUB_STEP_SUMMARY"
|
||||
echo "Chart pushed to \`oci://ghcr.io/kreuzberg-dev/charts/kreuzberg:${{ steps.meta.outputs.version }}\`" >> "$GITHUB_STEP_SUMMARY"
|
||||
|
||||
- name: Dry-run summary
|
||||
if: ${{ steps.meta.outputs.dry_run == 'true' }}
|
||||
run: |
|
||||
echo "### Dry Run" >> "$GITHUB_STEP_SUMMARY"
|
||||
echo "Would have pushed \`kreuzberg-${{ steps.meta.outputs.version }}.tgz\` to \`oci://ghcr.io/kreuzberg-dev/charts\`" >> "$GITHUB_STEP_SUMMARY"
|
||||
46
.github/workflows/publish-pubdev.yaml
vendored
Normal file
46
.github/workflows/publish-pubdev.yaml
vendored
Normal file
@@ -0,0 +1,46 @@
|
||||
name: Publish pub.dev
|
||||
|
||||
# pub.dev OIDC trusted publishing rejects tokens originating from `release`
|
||||
# events; only `push` and `workflow_dispatch` are accepted.
|
||||
#
|
||||
# Because the kreuzberg Dart package embeds platform-specific native binaries
|
||||
# (Android JNI, iOS XCFramework, server libs for linux/macos/windows), we
|
||||
# cannot just rebuild here — those artifacts are produced by the main
|
||||
# `publish.yaml` workflow. Instead, the main workflow's `trigger-pubdev` job
|
||||
# dispatches this workflow with the run_id of the main workflow, and this
|
||||
# workflow downloads the `dart-package-assembled` artifact from that run.
|
||||
#
|
||||
# One-time setup: on pub.dev → kreuzberg package → Admin → Automated publishing,
|
||||
# set the workflow path to `.github/workflows/publish-pubdev.yaml`.
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
run_id:
|
||||
description: "GitHub Actions run ID of publish.yaml that produced the dart-package-assembled artifact"
|
||||
required: true
|
||||
type: string
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
id-token: write
|
||||
actions: read
|
||||
|
||||
env:
|
||||
FORCE_JAVASCRIPT_ACTIONS_TO_NODE24: "true"
|
||||
|
||||
jobs:
|
||||
publish-pub:
|
||||
name: Publish pub.dev
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/download-artifact@v8.0.1
|
||||
with:
|
||||
name: dart-package-assembled
|
||||
path: packages/dart
|
||||
run-id: ${{ inputs.run_id }}
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
|
||||
- uses: kreuzberg-dev/actions/publish-pub@v1
|
||||
with:
|
||||
package-dir: packages/dart
|
||||
2345
.github/workflows/publish.yaml
vendored
Normal file
2345
.github/workflows/publish.yaml
vendored
Normal file
File diff suppressed because it is too large
Load Diff
10
.github/workflows/validate-issues.yml
vendored
Normal file
10
.github/workflows/validate-issues.yml
vendored
Normal file
@@ -0,0 +1,10 @@
|
||||
name: Validate Issues
|
||||
|
||||
on:
|
||||
issues:
|
||||
types: [opened, edited]
|
||||
|
||||
jobs:
|
||||
validate:
|
||||
uses: kreuzberg-dev/actions/.github/workflows/reusable-validate-issues.yml@v1
|
||||
secrets: inherit
|
||||
10
.github/workflows/validate-pr.yml
vendored
Normal file
10
.github/workflows/validate-pr.yml
vendored
Normal file
@@ -0,0 +1,10 @@
|
||||
name: Validate PR
|
||||
|
||||
on:
|
||||
pull_request:
|
||||
types: [opened, edited, synchronize]
|
||||
|
||||
jobs:
|
||||
validate:
|
||||
uses: kreuzberg-dev/actions/.github/workflows/reusable-validate-pr.yml@v1
|
||||
secrets: inherit
|
||||
Reference in New Issue
Block a user