Nomad changes
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s

This commit is contained in:
Henrik Jess Nielsen
2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions

70
scripts/lib/common.sh Executable file
View File

@@ -0,0 +1,70 @@
#!/usr/bin/env bash
# Shared helpers: locate/validate the repository root (marked by Cargo.toml),
# report fatal errors, and detect the host platform.
# NOTE(review): presumably this file is sourced by other scripts; if so, the
# strict-mode options below also take effect in the sourcing shell — confirm.
set -euo pipefail
# Print the closest directory, starting at the current working directory and
# walking upwards, that contains a Cargo.toml file.
# Outputs: the matching directory on stdout; an error message on stderr.
# Returns: 0 when a directory was found, 1 otherwise.
get_repo_root() {
  local origin probe
  origin="$(pwd)"
  probe="$origin"
  # The walk stops as soon as it reaches "/", so "/" itself is never probed.
  until [ "$probe" = "/" ]; do
    if [ -f "$probe/Cargo.toml" ]; then
      echo "$probe"
      return 0
    fi
    probe="$(dirname "$probe")"
  done
  echo "Error: Could not find repository root (Cargo.toml) from: $origin" >&2
  return 1
}
# Check that a repository root is known and contains a Cargo.toml.
# Arguments: $1 - repository root (optional; falls back to $REPO_ROOT).
# Outputs:   diagnostics on stderr when validation fails.
# Returns:   0 when <root>/Cargo.toml exists, 1 otherwise.
validate_repo_root() {
  local root="${1:-${REPO_ROOT:-}}"
  if [ -z "$root" ]; then
    echo "Error: REPO_ROOT not provided and env var not set" >&2
    return 1
  fi
  local manifest="$root/Cargo.toml"
  if [ -f "$manifest" ]; then
    return 0
  fi
  {
    echo "Error: REPO_ROOT validation failed. Expected Cargo.toml at: $manifest"
    echo "REPO_ROOT resolved to: $root"
  } >&2
  return 1
}
# Print "Error: <message>" to stderr and terminate the current shell.
# Arguments: $1 - message (default "Unknown error")
#            $2 - exit status (default 1)
error_exit() {
  local text="${1:-Unknown error}" status="${2:-1}"
  printf 'Error: %s\n' "$text" >&2
  exit "$status"
}
# Print the platform name.
# A non-empty $RUNNER_OS is echoed verbatim; otherwise `uname -s` is mapped to
# one of: Linux, macOS, Windows, unknown.
get_platform() {
  if [ -n "${RUNNER_OS:-}" ]; then
    echo "$RUNNER_OS"
    return
  fi
  local name
  case "$(uname -s)" in
    Linux*) name="Linux" ;;
    Darwin*) name="macOS" ;;
    MINGW* | MSYS* | CYGWIN*) name="Windows" ;;
    *) name="unknown" ;;
  esac
  echo "$name"
}
# Export the helpers so child bash processes inherit them.
export -f get_repo_root validate_repo_root error_exit get_platform

197
scripts/lib/library-paths.sh Executable file
View File

@@ -0,0 +1,197 @@
#!/usr/bin/env bash
# Helpers that export library search paths and cgo/pkg-config settings for
# ONNX Runtime, the Rust FFI build output, and the Go bindings.
# NOTE(review): presumably this file is sourced by other scripts; if so, the
# strict-mode options below also take effect in the sourcing shell — confirm.
set -euo pipefail
# Print the path-list separator for a platform.
# Arguments: $1 - platform name (optional; defaults to `uname -s`).
# Outputs:   ";" for Windows-like platforms, ":" for everything else.
_get_path_separator() {
  local os_name="${1:-$(uname -s)}"
  local sep=":"
  case "$os_name" in
    MINGW* | MSYS* | CYGWIN* | Windows) sep=";" ;;
  esac
  echo "$sep"
}
# Prepend $ORT_LIB_LOCATION to the platform's dynamic-library search path.
# Globals:   ORT_LIB_LOCATION (read), RUNNER_OS (read),
#            LD_LIBRARY_PATH / DYLD_LIBRARY_PATH / DYLD_FALLBACK_LIBRARY_PATH /
#            PATH (exported, depending on platform)
# Returns:   0; does nothing when ORT_LIB_LOCATION is empty or the platform is
#            not recognised.
setup_onnx_paths() {
  local ort_lib="${ORT_LIB_LOCATION:-}"
  if [ -z "$ort_lib" ]; then
    return 0
  fi
  local platform="${RUNNER_OS:-$(uname -s)}"
  case "$platform" in
    Linux)
      # Append the previous value only when it is non-empty: a trailing ":"
      # would add an empty entry, which the loader treats as the current
      # working directory.
      export LD_LIBRARY_PATH="${ort_lib}${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
      echo "✓ Set LD_LIBRARY_PATH for ONNX Runtime"
      ;;
    macOS | Darwin)
      export DYLD_LIBRARY_PATH="${ort_lib}${DYLD_LIBRARY_PATH:+:${DYLD_LIBRARY_PATH}}"
      export DYLD_FALLBACK_LIBRARY_PATH="${ort_lib}${DYLD_FALLBACK_LIBRARY_PATH:+:${DYLD_FALLBACK_LIBRARY_PATH}}"
      echo "✓ Set DYLD_LIBRARY_PATH for ONNX Runtime on macOS"
      ;;
    Windows | MINGW* | MSYS* | CYGWIN*)
      # NOTE(review): ";" is the Windows separator; confirm the consuming shell
      # expects it rather than ":".
      export PATH="${ort_lib};${PATH:-}"
      echo "✓ Set PATH for ONNX Runtime on Windows"
      ;;
  esac
}
# Prepend the Rust FFI build output directories to the platform's library
# search path.
# Arguments: $1 - repository root (optional; falls back to $REPO_ROOT).
# Globals:   RUNNER_OS, CARGO_TARGET_DIR (read); LD_LIBRARY_PATH /
#            DYLD_LIBRARY_PATH / DYLD_FALLBACK_LIBRARY_PATH / PATH (exported,
#            depending on platform)
# Returns:   0; does nothing when no root is known or, on Linux/macOS, when
#            <root>/target/release does not exist.
setup_rust_ffi_paths() {
  local repo_root="${1:-${REPO_ROOT:-}}"
  if [ -z "$repo_root" ]; then
    return 0
  fi
  local ffi_lib="$repo_root/target/release"
  local ffi_lib_gnu="$repo_root/target/x86_64-pc-windows-gnu/release"
  local platform="${RUNNER_OS:-$(uname -s)}"
  case "$platform" in
    Linux)
      if [ ! -d "$ffi_lib" ]; then
        return 0
      fi
      # Append the previous value only when non-empty; a trailing ":" would
      # add an empty entry, which the loader treats as the current directory.
      export LD_LIBRARY_PATH="${ffi_lib}${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
      echo "✓ Set LD_LIBRARY_PATH for Rust FFI"
      ;;
    macOS | Darwin)
      if [ ! -d "$ffi_lib" ]; then
        return 0
      fi
      export DYLD_LIBRARY_PATH="${ffi_lib}${DYLD_LIBRARY_PATH:+:${DYLD_LIBRARY_PATH}}"
      export DYLD_FALLBACK_LIBRARY_PATH="${ffi_lib}${DYLD_FALLBACK_LIBRARY_PATH:+:${DYLD_FALLBACK_LIBRARY_PATH}}"
      echo "✓ Set DYLD_LIBRARY_PATH for Rust FFI on macOS"
      ;;
    Windows | MINGW* | MSYS* | CYGWIN*)
      # Each existing directory is prepended in turn, so the directory added
      # last ends up first in PATH.
      # Check for short path CI directories first
      local cargo_target="${CARGO_TARGET_DIR:-}"
      if [ -n "$cargo_target" ] && [ -d "$cargo_target/release" ]; then
        export PATH="${cargo_target}/release;${PATH:-}"
        echo "✓ Set PATH for Rust FFI (using CARGO_TARGET_DIR=$cargo_target)"
      fi
      # Add GNU target path if it exists
      if [ -d "$ffi_lib_gnu" ]; then
        export PATH="${ffi_lib_gnu};${PATH:-}"
        echo "✓ Set PATH for Rust FFI GNU target"
      fi
      # Add standard target path if it exists
      if [ -d "$ffi_lib" ]; then
        export PATH="${ffi_lib};${PATH:-}"
        echo "✓ Set PATH for Rust FFI on Windows"
      fi
      ;;
  esac
}
# Check that pkg-config can resolve the kreuzberg-ffi package.
# Outputs: troubleshooting hints on stderr when the lookup fails.
# Returns: 0 when `pkg-config --exists kreuzberg-ffi` succeeds, 1 otherwise.
verify_pkg_config() {
  # Non-final command of an && list, so a failure here does not trip errexit.
  pkg-config --exists kreuzberg-ffi 2>/dev/null && return 0
  echo "Error: pkg-config cannot find kreuzberg-ffi" >&2
  echo "PKG_CONFIG_PATH=${PKG_CONFIG_PATH:-<not set>}" >&2
  echo "Run 'pkg-config --list-all' to see available packages" >&2
  return 1
}
# Export the cgo / pkg-config / PATH settings for building the Go bindings
# against the FFI library on Windows.
# Arguments: $1 - repository root (optional; falls back to $REPO_ROOT).
# Globals:   PKG_CONFIG_PATH, PATH, CGO_ENABLED, CGO_CFLAGS, CGO_LDFLAGS
#            (all exported).
# Returns:   0; does nothing when no repository root is known.
setup_go_paths_windows() {
  local root="${1:-${REPO_ROOT:-}}"
  if [ -z "$root" ]; then
    return 0
  fi
  local ffi_crate="${root}/crates/kreuzberg-ffi"
  local gnu_dir="${root}/target/x86_64-pc-windows-gnu/release"
  local release_dir="${root}/target/release"
  export CGO_ENABLED=1
  export CGO_CFLAGS="-I${ffi_crate}/include"
  export CGO_LDFLAGS="-L${gnu_dir} -L${release_dir} -lkreuzberg_ffi -static-libgcc -static-libstdc++"
  export PKG_CONFIG_PATH="${ffi_crate}:${PKG_CONFIG_PATH:-}"
  # PATH entries are joined with ";" here; the GNU target directory goes first.
  export PATH="${gnu_dir};${release_dir};${PATH:-}"
  echo "✓ Configured Go cgo environment for Windows"
}
# NOTE: CGO_LDFLAGS is set by setup-go-cgo-env action on Windows in CI, or by this script on Unix
#
# Configure the environment for the Go cgo bindings.
# When <root>/crates/kreuzberg-ffi/kreuzberg-ffi.pc is missing, a pkg-config
# file is generated there, with its version taken from the first
# `version = "..."` line of <root>/Cargo.toml ("unknown" if none is found).
# Arguments: $1 - repository root (optional; falls back to $REPO_ROOT).
# Globals:   RUNNER_OS, GITHUB_ENV (read); PKG_CONFIG_PATH, CGO_ENABLED,
#            CGO_CFLAGS (exported); CGO_LDFLAGS and the platform's library
#            search path (exported, depending on platform).
# Returns:   0; does nothing when no repository root is known.
setup_go_paths() {
  local repo_root="${1:-${REPO_ROOT:-}}"
  if [ -z "$repo_root" ]; then
    return 0
  fi
  local platform="${RUNNER_OS:-$(uname -s)}"
  local release_dir="${repo_root}/target/release"
  local ffi_crate="${repo_root}/crates/kreuzberg-ffi"
  local pc_path="${ffi_crate}/kreuzberg-ffi.pc"
  if [ ! -f "$pc_path" ]; then
    local version=""
    # Plain BRE group/back-reference. `|| true` keeps a missing Cargo.toml or
    # an early-closing `head` from aborting under errexit/pipefail.
    version="$(sed -n 's/^version = "\([^"]*\)".*/\1/p' "${repo_root}/Cargo.toml" | head -n 1 || true)"
    if [ -z "$version" ]; then
      version="unknown"
    fi
    local libs_private=""
    case "$platform" in
      Linux)
        libs_private="-lpthread -ldl -lm"
        ;;
      macOS | Darwin)
        libs_private="-framework CoreFoundation -framework Security -lpthread"
        ;;
      Windows | MINGW* | MSYS* | CYGWIN*)
        libs_private="-lws2_32 -luserenv -lbcrypt"
        ;;
    esac
    mkdir -p "$(dirname "$pc_path")"
    # Escaped \${...} stay literal so pkg-config expands them itself.
    cat >"$pc_path" <<EOF
prefix=${repo_root}
exec_prefix=\${prefix}
libdir=${repo_root}/target/release
includedir=${repo_root}/crates/kreuzberg-ffi
Name: kreuzberg-ffi
Description: C FFI bindings for Kreuzberg document intelligence library
Version: ${version}
URL: https://kreuzberg.dev
Libs: -L\${libdir} -lkreuzberg_ffi
Libs.private: ${libs_private}
Cflags: -I\${includedir}
EOF
  fi
  # Append previous values only when non-empty so no empty path entry is
  # created (an empty library-path entry means the current directory).
  export PKG_CONFIG_PATH="${ffi_crate}${PKG_CONFIG_PATH:+:${PKG_CONFIG_PATH}}"
  export CGO_ENABLED=1
  export CGO_CFLAGS="-I${ffi_crate}/include"
  case "$platform" in
    Linux)
      export LD_LIBRARY_PATH="${release_dir}${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
      export CGO_LDFLAGS="-L${release_dir} -lkreuzberg_ffi -Wl,-rpath,${release_dir}"
      ;;
    macOS | Darwin)
      export DYLD_LIBRARY_PATH="${release_dir}${DYLD_LIBRARY_PATH:+:${DYLD_LIBRARY_PATH}}"
      export DYLD_FALLBACK_LIBRARY_PATH="${release_dir}${DYLD_FALLBACK_LIBRARY_PATH:+:${DYLD_FALLBACK_LIBRARY_PATH}}"
      export CGO_LDFLAGS="-L${release_dir} -lkreuzberg_ffi -Wl,-rpath,${release_dir}"
      ;;
    Windows | MINGW* | MSYS* | CYGWIN*)
      if [ -z "${CGO_LDFLAGS:-}" ] && [ -z "${GITHUB_ENV:-}" ]; then
        # Only set library search path; ffi.go CGO directives handle -l flags
        # This matches the approach in setup-go-cgo-env/windows.ps1
        export CGO_LDFLAGS="-L${repo_root}/target/x86_64-pc-windows-gnu/release -L${release_dir}"
      fi
      ;;
  esac
  echo "✓ Configured Go cgo environment"
}
# Run every library-path setup step in order: ONNX Runtime, Rust FFI, Go cgo.
# Arguments: $1 - repository root (optional; falls back to $REPO_ROOT),
#                 forwarded to the Rust FFI and Go steps.
# Outputs:   progress messages on stdout.
setup_all_library_paths() {
  local root="${1:-${REPO_ROOT:-}}"
  printf '%s\n' "Setting up library paths..."
  setup_onnx_paths
  setup_rust_ffi_paths "$root"
  setup_go_paths "$root"
  printf '%s\n' "✓ All library paths configured"
}
# Export the helpers so child bash processes inherit them.
export -f setup_onnx_paths setup_rust_ffi_paths verify_pkg_config \
  setup_go_paths_windows setup_go_paths setup_all_library_paths \
  _get_path_separator

85
scripts/lib/retry.sh Executable file
View File

@@ -0,0 +1,85 @@
#!/usr/bin/env bash
# Helpers for running commands with a time limit and for retrying failed
# commands with a doubling delay between attempts.
# NOTE(review): presumably this file is sourced by other scripts; if so, the
# strict-mode options below also take effect in the sourcing shell — confirm.
set -euo pipefail
# Run a command with a time limit, using the first available mechanism:
# `timeout`, `gtimeout`, then a python3 fallback. If none is available the
# command runs without any limit.
# Arguments: $1 - time limit in seconds; $2... - command and its arguments.
# Returns:   the command's exit status; the python3 fallback exits 124 when
#            the limit is hit.
run_with_timeout() {
  local seconds="$1"
  shift
  local tool
  for tool in timeout gtimeout; do
    if command -v "$tool" >/dev/null 2>&1; then
      "$tool" "${seconds}" "$@"
      return $?
    fi
  done
  if command -v python3 >/dev/null 2>&1; then
    # The script is passed with -c rather than on stdin so the wrapped command
    # still inherits the caller's stdin. float() also accepts fractional
    # seconds such as "0.5".
    python3 -c '
import subprocess
import sys

timeout_s = float(sys.argv[1])
cmd = sys.argv[2:]
try:
    completed = subprocess.run(cmd, timeout=timeout_s)
    sys.exit(completed.returncode)
except subprocess.TimeoutExpired:
    sys.exit(124)
' "$seconds" "$@"
    return $?
  fi
  "$@"
}
# Run a command up to 3 times, sleeping 5s and then 10s between attempts.
# Arguments: $@ - command and its arguments.
# Outputs:   a warning on stderr after each failed attempt that will be
#            retried.
# Returns:   0 as soon as one attempt succeeds; otherwise the exit status of
#            the last failed attempt, matching retry_with_backoff_timeout.
retry_with_backoff() {
  local max_attempts=3
  local attempt=1
  local delay=5
  local exit_code=1
  while [ "$attempt" -le "$max_attempts" ]; do
    if "$@"; then
      return 0
    else
      # Captured in the else branch: after `fi`, $? would no longer hold the
      # command's status.
      exit_code=$?
    fi
    if [ "$attempt" -lt "$max_attempts" ]; then
      echo "⚠ Attempt $attempt failed, retrying in ${delay}s..." >&2
      sleep "$delay"
      delay=$((delay * 2))
    fi
    attempt=$((attempt + 1))
  done
  return "$exit_code"
}
# Run a command under run_with_timeout up to 3 times, sleeping 5s and then 10s
# between attempts.
# Arguments: $1 - per-attempt time limit, passed to run_with_timeout;
#            $2... - command and its arguments.
# Outputs:   a warning on stderr after each failed attempt that will be
#            retried.
# Returns:   0 as soon as one attempt succeeds; otherwise the exit status of
#            the last attempt.
retry_with_backoff_timeout() {
  local seconds="$1"
  shift
  local -r limit=3
  local try pause=5 status=1
  for ((try = 1; try <= limit; try++)); do
    status=0
    run_with_timeout "$seconds" "$@" || status=$?
    if [ "$status" -eq 0 ]; then
      return 0
    fi
    if ((try < limit)); then
      echo "⚠ Attempt $try failed (exit $status), retrying in ${pause}s..." >&2
      sleep $pause
      pause=$((pause * 2))
    fi
  done
  return $status
}
# Export the helpers so child bash processes inherit them.
export -f run_with_timeout retry_with_backoff retry_with_backoff_timeout

157
scripts/lib/tessdata.sh Executable file
View File

@@ -0,0 +1,157 @@
#!/usr/bin/env bash
# Helpers that locate, copy, download and validate Tesseract *.traineddata
# files and export TESSDATA_PREFIX.
# NOTE(review): presumably this file is sourced by other scripts; if so, the
# strict-mode options below also take effect in the sourcing shell — confirm.
set -euo pipefail
# Print the size of a file in bytes.
# Arguments: $1 - path to the file.
# Outputs:   the size on stdout; 0 when the path is not a regular file.
file_size_bytes() {
  local target="$1" bytes
  if [ ! -f "$target" ]; then
    echo 0
    return
  fi
  # Try the `-c%s` form first and fall back to `-f%z` when it is rejected.
  if bytes="$(stat -c%s "$target" 2>/dev/null)"; then
    echo "$bytes"
    return
  fi
  stat -f%z "$target"
}
# Print the minimum size in bytes for a language's traineddata file to be
# treated as valid.
# Arguments: $1 - language code.
# Outputs:   1000000 for eng and deu; 100000 for osd and any other code.
min_traineddata_size_bytes() {
  local code="$1"
  local floor=100000
  case "$code" in
    eng | deu) floor=1000000 ;;
  esac
  echo "$floor"
}
# Download a traineddata file, accepting it only when it reaches the minimum
# size for its language. Up to 5 attempts are made, sleeping N seconds after
# failed attempt N (no sleep after the final one).
# Arguments: $1 - language code
#            $2 - destination file path
#            $3 - download URL
# Outputs:   progress and error messages on stderr.
# Returns:   0 once a large-enough file has been moved to $2, 1 when every
#            attempt failed.
download_traineddata() {
  local lang="$1"
  local dest="$2"
  local url="$3"
  # Download to a sibling temp file so $dest never holds a partial download.
  local tmp="${dest}.tmp"
  local max_attempts=5
  # `attempt` is local so the loop does not overwrite a caller's variable.
  local attempt size min_size
  min_size="$(min_traineddata_size_bytes "$lang")"
  rm -f "$tmp"
  for ((attempt = 1; attempt <= max_attempts; attempt++)); do
    if curl -fsSL --retry 5 --retry-delay 5 --retry-all-errors "$url" -o "$tmp"; then
      size="$(file_size_bytes "$tmp")"
      if [ "$size" -ge "$min_size" ]; then
        mv -f "$tmp" "$dest"
        return 0
      fi
      echo "Downloaded ${lang}.traineddata too small (${size} bytes < ${min_size}), retrying..." >&2
    else
      echo "Failed to download ${lang}.traineddata (attempt ${attempt}), retrying..." >&2
    fi
    rm -f "$tmp"
    # Nothing follows the last attempt, so do not delay the error report.
    if [ "$attempt" -lt "$max_attempts" ]; then
      sleep "$attempt"
    fi
  done
  echo "ERROR: Failed to download valid ${lang}.traineddata after retries" >&2
  return 1
}
# Make sure <dir>/<lang>.traineddata exists and reaches the minimum size for
# its language, downloading it when it is missing or too small.
# Arguments: $1 - destination directory
#            $2 - language code
#            $3 - download URL
# Outputs:   a notice on stderr when an undersized file is discarded.
# Returns:   0 when the existing file is large enough; otherwise the status of
#            download_traineddata.
ensure_valid_traineddata() {
  local dest_dir="$1" lang="$2" url="$3"
  local target="${dest_dir}/${lang}.traineddata"
  local required actual
  required="$(min_traineddata_size_bytes "$lang")"
  actual="$(file_size_bytes "$target")"
  # Non-final command of an && list, so a false test does not trip errexit.
  [ "$actual" -ge "$required" ] && return 0
  if [ -f "$target" ]; then
    echo "Invalid ${lang}.traineddata at ${target} (${actual} bytes < ${required}); re-downloading..." >&2
    rm -f "$target"
  fi
  download_traineddata "$lang" "$target" "$url"
}
# Populate a tessdata directory.
# Creates the directory, copies eng/osd/deu traineddata from the first known
# system location that has eng.traineddata (unless that location is the
# destination itself), then makes sure valid eng and osd files are present,
# downloading them when needed.
# Arguments: $1 - destination tessdata directory.
# Globals:   PROGRAMFILES (read).
# Returns:   non-zero when a required file could not be obtained.
ensure_tessdata() {
  local dest="$1"
  mkdir -p "$dest"
  # Loop variables are local so they do not overwrite caller variables.
  local dest_real dir dir_real lang
  dest_real="$(cd "$dest" && pwd -P)"
  local candidates=(
    "/opt/homebrew/share/tessdata"
    "/usr/local/opt/tesseract/share/tessdata"
    "/usr/share/tesseract-ocr/5/tessdata"
  )
  if [ -n "${PROGRAMFILES:-}" ] && command -v cygpath >/dev/null 2>&1; then
    candidates+=("$(cygpath -u "$PROGRAMFILES")/Tesseract-OCR/tessdata")
  fi
  if [ -d "/c/Program Files/Tesseract-OCR/tessdata" ]; then
    candidates+=("/c/Program Files/Tesseract-OCR/tessdata")
  fi
  for dir in "${candidates[@]}"; do
    if [ -f "$dir/eng.traineddata" ]; then
      dir_real="$(cd "$dir" && pwd -P)"
      # The source is the destination itself: nothing to copy.
      if [ "$dir_real" = "$dest_real" ]; then
        break
      fi
      for lang in eng osd deu; do
        if [ -f "$dir/$lang.traineddata" ]; then
          # Skip files that are already the same file as the destination.
          if [ -f "$dest/$lang.traineddata" ] &&
            [ "$dir_real/$lang.traineddata" -ef "$dest/$lang.traineddata" ]; then
            continue
          fi
          cp -f "$dir/$lang.traineddata" "$dest/"
        fi
      done
      # Only the first location that provides eng.traineddata is used.
      break
    fi
  done
  ensure_valid_traineddata "$dest" "eng" "https://github.com/tesseract-ocr/tessdata_fast/raw/main/eng.traineddata"
  ensure_valid_traineddata "$dest" "osd" "https://github.com/tesseract-ocr/tessdata_fast/raw/main/osd.traineddata"
}
# Choose a platform-specific tessdata directory, export it as TESSDATA_PREFIX
# and populate it through ensure_tessdata.
# Globals:   RUNNER_OS, HOME, APPDATA, USERPROFILE, REPO_ROOT (read);
#            TESSDATA_PREFIX (exported).
# Outputs:   status lines on stdout.
# Returns:   0 on success; the status of ensure_tessdata when it fails.
setup_tessdata() {
  local platform="${RUNNER_OS:-$(uname -s)}"
  case "$platform" in
    Linux)
      export TESSDATA_PREFIX="/usr/share/tesseract-ocr/5/tessdata"
      ;;
    macOS | Darwin)
      if [ -d "/opt/homebrew/opt/tesseract/share/tessdata" ]; then
        export TESSDATA_PREFIX="/opt/homebrew/opt/tesseract/share/tessdata"
      elif [ -d "/usr/local/opt/tesseract/share/tessdata" ]; then
        export TESSDATA_PREFIX="/usr/local/opt/tesseract/share/tessdata"
      else
        export TESSDATA_PREFIX="$HOME/Library/Application Support/kreuzberg-tesseract/tessdata"
      fi
      ;;
    Windows | MINGW* | MSYS* | CYGWIN*)
      export TESSDATA_PREFIX="${APPDATA:-${USERPROFILE:-}}/kreuzberg-tesseract/tessdata"
      ;;
    *)
      export TESSDATA_PREFIX="${REPO_ROOT:-$(pwd)}/target/tessdata"
      ;;
  esac
  # Propagate failures explicitly: errexit is suppressed when this function is
  # called from a conditional context, so a failure would otherwise be ignored.
  ensure_tessdata "$TESSDATA_PREFIX" || return
  echo "✓ TESSDATA_PREFIX set to: $TESSDATA_PREFIX"
  # Informational only; these checks must not decide the return status.
  local lang
  for lang in eng osd; do
    if [ -f "$TESSDATA_PREFIX/$lang.traineddata" ]; then
      echo "✓ $lang.traineddata available"
    fi
  done
  return 0
}
# ensure_tessdata and setup_tessdata call the helper functions defined in this
# file, so the helpers are exported as well; otherwise the exported entry
# points fail with "command not found" when run in a child bash.
export -f file_size_bytes
export -f min_traineddata_size_bytes
export -f download_traineddata
export -f ensure_valid_traineddata
export -f ensure_tessdata
export -f setup_tessdata