This commit is contained in:
70
scripts/lib/common.sh
Executable file
70
scripts/lib/common.sh
Executable file
@@ -0,0 +1,70 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
#######################################
# Locate the repository root by walking upwards from the current working
# directory until a directory containing a Cargo.toml file is found.
# Outputs:
#   stdout - the first directory (closest to the cwd) holding Cargo.toml
#   stderr - an error message when no such directory exists
# Returns:
#   0 when a root was found, 1 otherwise
#######################################
get_repo_root() {
  local start_dir current_dir parent_dir
  start_dir="$(pwd)"
  current_dir="$start_dir"

  while :; do
    if [ -f "$current_dir/Cargo.toml" ]; then
      echo "$current_dir"
      return 0
    fi

    parent_dir="$(dirname "$current_dir")"
    # dirname reaches a fixed point at the top of the tree ("/" or "."). Stop
    # there: the top directory itself has already been checked above, and
    # continuing would loop forever.
    if [ "$parent_dir" = "$current_dir" ]; then
      break
    fi
    current_dir="$parent_dir"
  done

  echo "Error: Could not find repository root (Cargo.toml) from: $start_dir" >&2
  return 1
}
|
||||
|
||||
#######################################
# Check that a directory looks like the repository root (contains Cargo.toml).
# Globals:
#   REPO_ROOT (read) - used when no argument is given
# Arguments:
#   $1 - optional path to validate; defaults to $REPO_ROOT
# Outputs:
#   stderr - diagnostics when validation fails
# Returns:
#   0 when Cargo.toml exists under the path, 1 otherwise
#######################################
validate_repo_root() {
  local repo_root="${1:-${REPO_ROOT:-}}"

  if [ -z "$repo_root" ]; then
    echo "Error: REPO_ROOT not provided and env var not set" >&2
    return 1
  fi

  if [ ! -f "$repo_root/Cargo.toml" ]; then
    echo "Error: REPO_ROOT validation failed. Expected Cargo.toml at: $repo_root/Cargo.toml" >&2
    echo "REPO_ROOT resolved to: $repo_root" >&2
    return 1
  fi

  return 0
}
|
||||
|
||||
#######################################
# Print an error message to stderr and terminate the current shell.
# Arguments:
#   $1 - message text (default: "Unknown error")
#   $2 - exit status to terminate with (default: 1)
# Outputs:
#   stderr - "Error: <message>"
#######################################
error_exit() {
  local message="${1:-Unknown error}"
  local exit_code="${2:-1}"
  echo "Error: $message" >&2
  exit "$exit_code"
}
|
||||
|
||||
#######################################
# Report the current platform name.
# Globals:
#   RUNNER_OS (read) - when non-empty it is echoed verbatim
# Outputs:
#   stdout - $RUNNER_OS if set; otherwise "Linux", "macOS", "Windows" or
#            "unknown", derived from `uname -s`
#######################################
get_platform() {
  if [ -n "${RUNNER_OS:-}" ]; then
    echo "$RUNNER_OS"
    return
  fi

  case "$(uname -s)" in
  Linux*)
    echo "Linux"
    ;;
  Darwin*)
    echo "macOS"
    ;;
  MINGW* | MSYS* | CYGWIN*)
    echo "Windows"
    ;;
  *)
    echo "unknown"
    ;;
  esac
}
|
||||
|
||||
# Make the helpers available to child bash processes.
export -f get_repo_root
export -f validate_repo_root
export -f error_exit
export -f get_platform
|
||||
197
scripts/lib/library-paths.sh
Executable file
197
scripts/lib/library-paths.sh
Executable file
@@ -0,0 +1,197 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
#######################################
# Print the separator used between entries of path-list variables.
# Arguments:
#   $1 - platform name (default: output of `uname -s`)
# Outputs:
#   stdout - ";" for Windows-like platforms, ":" for everything else
#######################################
_get_path_separator() {
  local platform="${1:-$(uname -s)}"
  case "$platform" in
  MINGW* | MSYS* | CYGWIN* | Windows)
    echo ";"
    ;;
  *)
    echo ":"
    ;;
  esac
}
|
||||
|
||||
#######################################
# Prepend the ONNX Runtime library directory to the platform's dynamic
# library search variable.
# Globals:
#   ORT_LIB_LOCATION (read)  - directory to add; nothing happens when empty
#   RUNNER_OS (read)         - platform override; falls back to `uname -s`
#   LD_LIBRARY_PATH, DYLD_LIBRARY_PATH, DYLD_FALLBACK_LIBRARY_PATH, PATH
#                            - exported with the directory prepended
# Outputs:
#   stdout - a confirmation line for the platform that was configured
# Returns:
#   0 (unrecognised platforms are silently ignored)
#######################################
setup_onnx_paths() {
  local ort_lib="${ORT_LIB_LOCATION:-}"
  [ -z "$ort_lib" ] && return 0

  local platform="${RUNNER_OS:-$(uname -s)}"
  # The ${VAR:+sep${VAR}} form appends the separator only when the previous
  # value is non-empty: an empty entry in a library search path is interpreted
  # as the current working directory by the dynamic loader.
  case "$platform" in
  Linux)
    export LD_LIBRARY_PATH="${ort_lib}${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
    echo "✓ Set LD_LIBRARY_PATH for ONNX Runtime"
    ;;
  macOS | Darwin)
    export DYLD_LIBRARY_PATH="${ort_lib}${DYLD_LIBRARY_PATH:+:${DYLD_LIBRARY_PATH}}"
    export DYLD_FALLBACK_LIBRARY_PATH="${ort_lib}${DYLD_FALLBACK_LIBRARY_PATH:+:${DYLD_FALLBACK_LIBRARY_PATH}}"
    echo "✓ Set DYLD_LIBRARY_PATH for ONNX Runtime on macOS"
    ;;
  Windows | MINGW* | MSYS* | CYGWIN*)
    export PATH="${ort_lib}${PATH:+;${PATH}}"
    echo "✓ Set PATH for ONNX Runtime on Windows"
    ;;
  esac
}
|
||||
|
||||
#######################################
# Prepend the Rust FFI build output directories to the platform's dynamic
# library search variable.
# Globals:
#   REPO_ROOT (read)        - used when no argument is given
#   RUNNER_OS (read)        - platform override; falls back to `uname -s`
#   CARGO_TARGET_DIR (read) - Windows only: extra target dir to consider
#   LD_LIBRARY_PATH, DYLD_LIBRARY_PATH, DYLD_FALLBACK_LIBRARY_PATH, PATH
#                           - exported with existing directories prepended
# Arguments:
#   $1 - optional repository root; defaults to $REPO_ROOT
# Outputs:
#   stdout - one confirmation line per directory that was added
# Returns:
#   0; nothing is changed when the root is empty or the directories are absent
#######################################
setup_rust_ffi_paths() {
  local repo_root="${1:-${REPO_ROOT:-}}"
  [ -z "$repo_root" ] && return 0

  local ffi_lib="$repo_root/target/release"
  local ffi_lib_gnu="$repo_root/target/x86_64-pc-windows-gnu/release"

  local platform="${RUNNER_OS:-$(uname -s)}"
  # ${VAR:+sep${VAR}} avoids a trailing separator (an empty search-path entry
  # means "current directory" to the dynamic loader) when VAR was empty.
  case "$platform" in
  Linux)
    [ ! -d "$ffi_lib" ] && return 0
    export LD_LIBRARY_PATH="${ffi_lib}${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
    echo "✓ Set LD_LIBRARY_PATH for Rust FFI"
    ;;
  macOS | Darwin)
    [ ! -d "$ffi_lib" ] && return 0
    export DYLD_LIBRARY_PATH="${ffi_lib}${DYLD_LIBRARY_PATH:+:${DYLD_LIBRARY_PATH}}"
    export DYLD_FALLBACK_LIBRARY_PATH="${ffi_lib}${DYLD_FALLBACK_LIBRARY_PATH:+:${DYLD_FALLBACK_LIBRARY_PATH}}"
    echo "✓ Set DYLD_LIBRARY_PATH for Rust FFI on macOS"
    ;;
  Windows | MINGW* | MSYS* | CYGWIN*)
    # Check for short path CI directories first
    local cargo_target="${CARGO_TARGET_DIR:-}"
    if [ -n "$cargo_target" ] && [ -d "$cargo_target/release" ]; then
      export PATH="${cargo_target}/release${PATH:+;${PATH}}"
      echo "✓ Set PATH for Rust FFI (using CARGO_TARGET_DIR=$cargo_target)"
    fi
    # Add GNU target path if it exists
    if [ -d "$ffi_lib_gnu" ]; then
      export PATH="${ffi_lib_gnu}${PATH:+;${PATH}}"
      echo "✓ Set PATH for Rust FFI GNU target"
    fi
    # Add standard target path if it exists
    if [ -d "$ffi_lib" ]; then
      export PATH="${ffi_lib}${PATH:+;${PATH}}"
      echo "✓ Set PATH for Rust FFI on Windows"
    fi
    ;;
  esac
}
|
||||
|
||||
#######################################
# Check that pkg-config can resolve the kreuzberg-ffi package.
# Globals:
#   PKG_CONFIG_PATH (read) - echoed in the diagnostic output
# Outputs:
#   stderr - troubleshooting hints when the package cannot be found
# Returns:
#   0 when `pkg-config --exists kreuzberg-ffi` succeeds, 1 otherwise
#######################################
verify_pkg_config() {
  if pkg-config --exists kreuzberg-ffi 2>/dev/null; then
    return 0
  fi

  {
    echo "Error: pkg-config cannot find kreuzberg-ffi"
    echo "PKG_CONFIG_PATH=${PKG_CONFIG_PATH:-<not set>}"
    echo "Run 'pkg-config --list-all' to see available packages"
  } >&2
  return 1
}
|
||||
|
||||
#######################################
# Configure the Go cgo build environment for Windows targets.
# Globals:
#   REPO_ROOT (read) - used when no argument is given
#   PKG_CONFIG_PATH, PATH, CGO_ENABLED, CGO_CFLAGS, CGO_LDFLAGS - exported
# Arguments:
#   $1 - optional repository root; defaults to $REPO_ROOT
# Outputs:
#   stdout - a confirmation line
# Returns:
#   0; nothing is changed when the repository root is empty
#######################################
setup_go_paths_windows() {
  local repo_root="${1:-${REPO_ROOT:-}}"
  [ -z "$repo_root" ] && return 0

  local gnu_target="${repo_root}/target/x86_64-pc-windows-gnu/release"
  local release_target="${repo_root}/target/release"

  export PKG_CONFIG_PATH="${repo_root}/crates/kreuzberg-ffi:${PKG_CONFIG_PATH:-}"

  # Both build output directories go in front of PATH, GNU target first.
  export PATH="${gnu_target};${release_target};${PATH:-}"

  export CGO_ENABLED=1
  export CGO_CFLAGS="-I${repo_root}/crates/kreuzberg-ffi/include"
  export CGO_LDFLAGS="-L${gnu_target} -L${release_target} -lkreuzberg_ffi -static-libgcc -static-libstdc++"

  echo "✓ Configured Go cgo environment for Windows"
}
|
||||
|
||||
# NOTE: CGO_LDFLAGS is set by setup-go-cgo-env action on Windows in CI, or by this script on Unix
#######################################
# Configure the Go cgo build environment and, if it does not exist yet,
# generate crates/kreuzberg-ffi/kreuzberg-ffi.pc for pkg-config.
# Globals:
#   REPO_ROOT (read)  - used when no argument is given
#   RUNNER_OS (read)  - platform override; falls back to `uname -s`
#   GITHUB_ENV (read) - Windows only: when set, CGO_LDFLAGS is left untouched
#   PKG_CONFIG_PATH, CGO_ENABLED, CGO_CFLAGS, CGO_LDFLAGS, LD_LIBRARY_PATH,
#   DYLD_LIBRARY_PATH, DYLD_FALLBACK_LIBRARY_PATH - exported
# Arguments:
#   $1 - optional repository root; defaults to $REPO_ROOT
# Outputs:
#   stdout - a confirmation line
# Returns:
#   0; nothing is changed when the repository root is empty
#######################################
setup_go_paths() {
  local repo_root="${1:-${REPO_ROOT:-}}"
  [ -z "$repo_root" ] && return 0

  local platform="${RUNNER_OS:-$(uname -s)}"
  local release_dir="${repo_root}/target/release"
  local pc_path="${repo_root}/crates/kreuzberg-ffi/kreuzberg-ffi.pc"

  if [ ! -f "$pc_path" ]; then
    # Take the first `version = "..."` line of the root Cargo.toml. The sed
    # program is single-quoted, so the group delimiters must be written with a
    # single backslash; doubled backslashes would match literal backslashes
    # and the version would never be found.
    local version=""
    version="$(sed -n 's/^version = "\([^"]*\)".*/\1/p' "${repo_root}/Cargo.toml" | head -n 1 || true)"
    [ -z "$version" ] && version="unknown"

    local libs_private=""
    case "$platform" in
    Linux)
      libs_private="-lpthread -ldl -lm"
      ;;
    macOS | Darwin)
      libs_private="-framework CoreFoundation -framework Security -lpthread"
      ;;
    Windows | MINGW* | MSYS* | CYGWIN*)
      libs_private="-lws2_32 -luserenv -lbcrypt"
      ;;
    esac

    mkdir -p "$(dirname "$pc_path")"
    # \${...} is written literally so pkg-config expands it; ${...} is
    # expanded by the shell now.
    cat >"$pc_path" <<EOF
prefix=${repo_root}
exec_prefix=\${prefix}
libdir=${repo_root}/target/release
includedir=${repo_root}/crates/kreuzberg-ffi

Name: kreuzberg-ffi
Description: C FFI bindings for Kreuzberg document intelligence library
Version: ${version}
URL: https://kreuzberg.dev
Libs: -L\${libdir} -lkreuzberg_ffi
Libs.private: ${libs_private}
Cflags: -I\${includedir}
EOF
  fi

  export PKG_CONFIG_PATH="${repo_root}/crates/kreuzberg-ffi:${PKG_CONFIG_PATH:-}"

  export CGO_ENABLED=1
  export CGO_CFLAGS="-I${repo_root}/crates/kreuzberg-ffi/include"

  # ${VAR:+:${VAR}} keeps the library search paths free of an empty trailing
  # entry, which the dynamic loader would treat as the current directory.
  case "$platform" in
  Linux)
    export LD_LIBRARY_PATH="${release_dir}${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
    export CGO_LDFLAGS="-L${release_dir} -lkreuzberg_ffi -Wl,-rpath,${release_dir}"
    ;;
  macOS | Darwin)
    export DYLD_LIBRARY_PATH="${release_dir}${DYLD_LIBRARY_PATH:+:${DYLD_LIBRARY_PATH}}"
    export DYLD_FALLBACK_LIBRARY_PATH="${release_dir}${DYLD_FALLBACK_LIBRARY_PATH:+:${DYLD_FALLBACK_LIBRARY_PATH}}"
    export CGO_LDFLAGS="-L${release_dir} -lkreuzberg_ffi -Wl,-rpath,${release_dir}"
    ;;
  Windows | MINGW* | MSYS* | CYGWIN*)
    if [ -z "${CGO_LDFLAGS:-}" ] && [ -z "${GITHUB_ENV:-}" ]; then
      # Only set library search path; ffi.go CGO directives handle -l flags
      # This matches the approach in setup-go-cgo-env/windows.ps1
      export CGO_LDFLAGS="-L${repo_root}/target/x86_64-pc-windows-gnu/release -L${release_dir}"
    fi
    ;;
  esac

  echo "✓ Configured Go cgo environment"
}
|
||||
|
||||
#######################################
# Run every library-path setup step in order: ONNX Runtime, Rust FFI, Go cgo.
# Globals:
#   REPO_ROOT (read) - used when no argument is given
# Arguments:
#   $1 - optional repository root; defaults to $REPO_ROOT
# Outputs:
#   stdout - progress lines from this function and from each step
#######################################
setup_all_library_paths() {
  local repo_root="${1:-${REPO_ROOT:-}}"

  echo "Setting up library paths..."
  setup_onnx_paths
  setup_rust_ffi_paths "$repo_root"
  setup_go_paths "$repo_root"
  echo "✓ All library paths configured"
}
|
||||
|
||||
# Make the helpers available to child bash processes.
export -f setup_onnx_paths
export -f setup_rust_ffi_paths
export -f verify_pkg_config
export -f setup_go_paths_windows
export -f setup_go_paths
export -f setup_all_library_paths
export -f _get_path_separator
|
||||
85
scripts/lib/retry.sh
Executable file
85
scripts/lib/retry.sh
Executable file
@@ -0,0 +1,85 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
#######################################
# Run a command with a time limit, using whichever mechanism is available:
# `timeout`, then `gtimeout`, then a python3 fallback. When none exists the
# command runs without a limit.
# Arguments:
#   $1  - time limit in seconds (fractional values are accepted)
#   $2+ - command and its arguments
# Outputs:
#   stderr - a warning when no timeout mechanism is available
# Returns:
#   the command's exit status; the python3 fallback returns 124 when the limit
#   is exceeded
#######################################
run_with_timeout() {
  local seconds="$1"
  shift

  local tool
  for tool in timeout gtimeout; do
    if command -v "$tool" >/dev/null 2>&1; then
      "$tool" "${seconds}" "$@"
      return $?
    fi
  done

  if command -v python3 >/dev/null 2>&1; then
    # float() rather than int(): fractional durations such as "0.5" are valid
    # for `timeout` and must not crash the fallback.
    python3 - "$seconds" "$@" <<'PY'
import subprocess
import sys

timeout_s = float(sys.argv[1])
cmd = sys.argv[2:]
try:
    completed = subprocess.run(cmd, timeout=timeout_s)
    sys.exit(completed.returncode)
except subprocess.TimeoutExpired:
    sys.exit(124)
PY
    return $?
  fi

  echo "Warning: no timeout, gtimeout or python3 found; running without a time limit" >&2
  "$@"
}
|
||||
|
||||
#######################################
# Run a command up to three times, sleeping 5s and then 10s between attempts.
# Arguments:
#   $@ - command and its arguments
# Outputs:
#   stderr - a notice before each retry
# Returns:
#   0 as soon as an attempt succeeds, 1 when all three attempts fail
#######################################
retry_with_backoff() {
  local max_attempts=3
  local attempt=1
  local delay=5

  while [ "$attempt" -le "$max_attempts" ]; do
    if "$@"; then
      return 0
    fi

    # Back off only when another attempt will follow.
    if [ "$attempt" -lt "$max_attempts" ]; then
      echo "⚠ Attempt $attempt failed, retrying in ${delay}s..." >&2
      sleep "$delay"
      delay=$((delay * 2))
    fi
    attempt=$((attempt + 1))
  done

  return 1
}
|
||||
|
||||
#######################################
# Like retry_with_backoff, but each attempt is run through run_with_timeout.
# Arguments:
#   $1  - per-attempt time limit in seconds
#   $2+ - command and its arguments
# Outputs:
#   stderr - a notice (including the exit status) before each retry
# Returns:
#   0 as soon as an attempt succeeds, otherwise the exit status of the last
#   failed attempt
#######################################
retry_with_backoff_timeout() {
  local seconds="$1"
  shift
  local max_attempts=3
  local attempt=1
  local delay=5
  local exit_code=1

  while [ "$attempt" -le "$max_attempts" ]; do
    if run_with_timeout "$seconds" "$@"; then
      return 0
    else
      # $? is still the status of run_with_timeout in the else branch.
      exit_code=$?
    fi

    if [ "$attempt" -lt "$max_attempts" ]; then
      echo "⚠ Attempt $attempt failed (exit $exit_code), retrying in ${delay}s..." >&2
      sleep "$delay"
      delay=$((delay * 2))
    fi
    attempt=$((attempt + 1))
  done

  return "$exit_code"
}
|
||||
|
||||
# Make the helpers available to child bash processes.
export -f run_with_timeout
export -f retry_with_backoff
export -f retry_with_backoff_timeout
|
||||
157
scripts/lib/tessdata.sh
Executable file
157
scripts/lib/tessdata.sh
Executable file
@@ -0,0 +1,157 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
#######################################
# Print the size of a file in bytes.
# Arguments:
#   $1 - path to the file
# Outputs:
#   stdout - the size, or 0 when the path is not a regular file
#######################################
file_size_bytes() {
  local path="$1"
  if [ ! -f "$path" ]; then
    echo 0
    return
  fi

  # Probe for the `-c%s` form first; fall back to the `-f%z` form when the
  # local stat does not accept it.
  if stat -c%s "$path" >/dev/null 2>&1; then
    stat -c%s "$path"
    return
  fi
  stat -f%z "$path"
}
|
||||
|
||||
#######################################
# Print the minimum size, in bytes, that a downloaded traineddata file must
# reach to be accepted.
# Arguments:
#   $1 - language code (e.g. eng, osd, deu)
# Outputs:
#   stdout - 1000000 for eng and deu, 100000 for osd and any other language
#######################################
min_traineddata_size_bytes() {
  local lang="$1"
  case "$lang" in
  eng) echo 1000000 ;;
  osd) echo 100000 ;;
  deu) echo 1000000 ;;
  *) echo 100000 ;;
  esac
}
|
||||
|
||||
#######################################
# Download a traineddata file with curl, accepting it only when it reaches the
# minimum size for its language. Up to five attempts are made.
# Arguments:
#   $1 - language code (selects the minimum size and appears in messages)
#   $2 - destination file path; "<dest>.tmp" is used as scratch space
#   $3 - URL to download from
# Outputs:
#   stderr - a message for each failed or undersized download
# Returns:
#   0 once a large-enough file has been moved to the destination, 1 otherwise
#######################################
download_traineddata() {
  local lang="$1"
  local dest="$2"
  local url="$3"
  local tmp="${dest}.tmp"
  local max_attempts=5
  # `attempt` must be local: bash variables are dynamically scoped, so a bare
  # loop variable would overwrite a caller's `attempt` counter.
  local attempt size min_size
  min_size="$(min_traineddata_size_bytes "$lang")"

  rm -f "$tmp"

  for attempt in 1 2 3 4 5; do
    if curl -fsSL --retry 5 --retry-delay 5 --retry-all-errors "$url" -o "$tmp"; then
      size="$(file_size_bytes "$tmp")"
      if [ "$size" -ge "$min_size" ]; then
        mv -f "$tmp" "$dest"
        return 0
      fi
      echo "Downloaded ${lang}.traineddata too small (${size} bytes < ${min_size}), retrying..." >&2
    else
      echo "Failed to download ${lang}.traineddata (attempt ${attempt}), retrying..." >&2
    fi
    rm -f "$tmp"
    # Linear backoff, skipped after the final attempt since nothing follows it.
    if [ "$attempt" -lt "$max_attempts" ]; then
      sleep "$attempt"
    fi
  done

  echo "ERROR: Failed to download valid ${lang}.traineddata after retries" >&2
  return 1
}
|
||||
|
||||
#######################################
# Make sure <dest_dir>/<lang>.traineddata exists and reaches the minimum size
# for its language, downloading it when it is missing or too small.
# Arguments:
#   $1 - directory holding the traineddata files
#   $2 - language code
#   $3 - URL to download the file from when needed
# Outputs:
#   stderr - a notice when an existing undersized file is discarded
# Returns:
#   0 when the existing file is large enough; otherwise the status of
#   download_traineddata
#######################################
ensure_valid_traineddata() {
  local dest_dir="$1"
  local lang="$2"
  local url="$3"
  local dest_file="${dest_dir}/${lang}.traineddata"
  local min_size size
  min_size="$(min_traineddata_size_bytes "$lang")"
  size="$(file_size_bytes "$dest_file")"

  if [ "$size" -ge "$min_size" ]; then
    return 0
  fi

  if [ -f "$dest_file" ]; then
    echo "Invalid ${lang}.traineddata at ${dest_file} (${size} bytes < ${min_size}); re-downloading..." >&2
    rm -f "$dest_file"
  fi

  download_traineddata "$lang" "$dest_file" "$url"
}
|
||||
|
||||
#######################################
# Populate a tessdata directory. Files are first copied from the first known
# system location that contains eng.traineddata, then eng and osd are
# validated (and downloaded when missing or too small).
# Globals:
#   PROGRAMFILES (read) - with cygpath available, adds a Windows candidate
# Arguments:
#   $1 - destination directory (created when missing)
# Returns:
#   the status of the final ensure_valid_traineddata call
#######################################
ensure_tessdata() {
  local dest="$1"
  mkdir -p "$dest"
  # Loop variables are declared local so they cannot overwrite same-named
  # variables in a caller (bash variables are dynamically scoped).
  local dest_real dir dir_real lang
  dest_real="$(cd "$dest" && pwd -P)"

  local candidates=(
    "/opt/homebrew/share/tessdata"
    "/usr/local/opt/tesseract/share/tessdata"
    "/usr/share/tesseract-ocr/5/tessdata"
  )

  if [ -n "${PROGRAMFILES:-}" ] && command -v cygpath >/dev/null 2>&1; then
    candidates+=("$(cygpath -u "$PROGRAMFILES")/Tesseract-OCR/tessdata")
  fi
  if [ -d "/c/Program Files/Tesseract-OCR/tessdata" ]; then
    candidates+=("/c/Program Files/Tesseract-OCR/tessdata")
  fi

  for dir in "${candidates[@]}"; do
    if [ -f "$dir/eng.traineddata" ]; then
      dir_real="$(cd "$dir" && pwd -P)"

      # The destination is this candidate itself: nothing to copy.
      if [ "$dir_real" = "$dest_real" ]; then
        break
      fi

      for lang in eng osd deu; do
        if [ -f "$dir/$lang.traineddata" ]; then
          # Skip when source and destination are the same file.
          if [ -f "$dest/$lang.traineddata" ] &&
            [ "$dir_real/$lang.traineddata" -ef "$dest/$lang.traineddata" ]; then
            continue
          fi
          cp -f "$dir/$lang.traineddata" "$dest/"
        fi
      done
      # Only the first candidate that has eng.traineddata is used.
      break
    fi
  done

  ensure_valid_traineddata "$dest" "eng" "https://github.com/tesseract-ocr/tessdata_fast/raw/main/eng.traineddata"
  ensure_valid_traineddata "$dest" "osd" "https://github.com/tesseract-ocr/tessdata_fast/raw/main/osd.traineddata"
}
|
||||
|
||||
#######################################
# Choose a platform-specific tessdata directory, export it as TESSDATA_PREFIX
# and populate it through ensure_tessdata.
# Globals:
#   RUNNER_OS (read)            - platform override; falls back to `uname -s`
#   HOME, APPDATA, USERPROFILE, REPO_ROOT (read) - used to build the path
#   TESSDATA_PREFIX (exported)  - the chosen directory
# Outputs:
#   stdout - the chosen directory and which traineddata files are present
# Returns:
#   0 on success; the status of ensure_tessdata when it fails
#######################################
setup_tessdata() {
  local platform="${RUNNER_OS:-$(uname -s)}"

  case "$platform" in
  Linux)
    export TESSDATA_PREFIX="/usr/share/tesseract-ocr/5/tessdata"
    ;;
  macOS | Darwin)
    if [ -d "/opt/homebrew/opt/tesseract/share/tessdata" ]; then
      export TESSDATA_PREFIX="/opt/homebrew/opt/tesseract/share/tessdata"
    elif [ -d "/usr/local/opt/tesseract/share/tessdata" ]; then
      export TESSDATA_PREFIX="/usr/local/opt/tesseract/share/tessdata"
    else
      export TESSDATA_PREFIX="$HOME/Library/Application Support/kreuzberg-tesseract/tessdata"
    fi
    ;;
  Windows | MINGW* | MSYS* | CYGWIN*)
    export TESSDATA_PREFIX="${APPDATA:-${USERPROFILE:-}}/kreuzberg-tesseract/tessdata"
    ;;
  *)
    export TESSDATA_PREFIX="${REPO_ROOT:-$(pwd)}/target/tessdata"
    ;;
  esac

  # Propagate failures explicitly so the result does not depend on errexit
  # being active in the caller's context.
  ensure_tessdata "$TESSDATA_PREFIX" || return

  echo "✓ TESSDATA_PREFIX set to: $TESSDATA_PREFIX"
  # `if` rather than `[ ... ] && echo`: a missing file must not become the
  # function's exit status.
  if [ -f "$TESSDATA_PREFIX/eng.traineddata" ]; then
    echo "✓ eng.traineddata available"
  fi
  if [ -f "$TESSDATA_PREFIX/osd.traineddata" ]; then
    echo "✓ osd.traineddata available"
  fi
  return 0
}
|
||||
|
||||
# Make the helpers available to child bash processes. The internal helpers are
# exported as well: ensure_tessdata calls them, so exporting it alone would
# fail with "command not found" in a child shell.
export -f file_size_bytes
export -f min_traineddata_size_bytes
export -f download_traineddata
export -f ensure_valid_traineddata
export -f ensure_tessdata
export -f setup_tessdata
|
||||
Reference in New Issue
Block a user