Files
fil/scripts/lib/tessdata.sh
Henrik Jess Nielsen b4c07d3693
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s
Nomad changes
2026-06-01 23:40:55 +02:00

158 lines
4.1 KiB
Bash
Executable File

#!/usr/bin/env bash
# Helpers for locating, validating and downloading Tesseract ".traineddata"
# files and for exporting TESSDATA_PREFIX.
#
# Strict mode: exit on unhandled command failure (-e), treat unset variables
# as errors (-u) and fail a pipeline when any stage fails (pipefail).
# NOTE(review): if this file is sourced rather than executed, these options
# also take effect in the sourcing shell — confirm that is intended.
set -euo pipefail
# Print the size in bytes of a regular file.
# Arguments: $1 - path to inspect
# Outputs:   the byte count on stdout; "0" when $1 is not a regular file
# Returns:   0 on success, otherwise the status of the fallback stat call
file_size_bytes() {
  local target="$1"
  local bytes
  if [ -f "$target" ]; then
    # GNU stat understands -c%s; BSD/macOS stat rejects it and needs -f%z.
    if bytes="$(stat -c%s "$target" 2>/dev/null)"; then
      echo "$bytes"
    else
      stat -f%z "$target"
    fi
  else
    echo 0
  fi
}
# Print the minimum acceptable size, in bytes, of a language's traineddata file.
# Arguments: $1 - language code (e.g. eng, osd, deu)
# Outputs:   1000000 for eng and deu, 100000 for osd and any other code
min_traineddata_size_bytes() {
  local code="$1"
  local threshold=100000
  case "$code" in
    eng | deu) threshold=1000000 ;;
  esac
  echo "$threshold"
}
# Download a traineddata file and install it only if it is large enough.
# Arguments: $1 - language code (used for the size threshold and messages)
#            $2 - destination file path
#            $3 - URL to fetch
# Outputs:   progress and error messages on stderr
# Returns:   0 once a sufficiently large file has been moved to $2,
#            1 when every attempt failed or produced a too-small file
download_traineddata() {
  local lang="$1"
  local dest="$2"
  local url="$3"
  # Download next to the destination so the final mv stays on one filesystem.
  local tmp="${dest}.tmp"
  local max_attempts=5
  # Everything is local, including the loop counter, so nothing leaks into the
  # caller's scope.
  local min_size size attempt
  min_size="$(min_traineddata_size_bytes "$lang")"
  rm -f "$tmp"
  for ((attempt = 1; attempt <= max_attempts; attempt++)); do
    if curl -fsSL --retry 5 --retry-delay 5 --retry-all-errors "$url" -o "$tmp"; then
      size="$(file_size_bytes "$tmp")"
      if [ "$size" -ge "$min_size" ]; then
        mv -f "$tmp" "$dest"
        return 0
      fi
      echo "Downloaded ${lang}.traineddata too small (${size} bytes < ${min_size}), retrying..." >&2
    else
      echo "Failed to download ${lang}.traineddata (attempt ${attempt}), retrying..." >&2
    fi
    rm -f "$tmp"
    # Linear back-off between attempts; no point waiting after the last one.
    if [ "$attempt" -lt "$max_attempts" ]; then
      sleep "$attempt"
    fi
  done
  echo "ERROR: Failed to download valid ${lang}.traineddata after retries" >&2
  return 1
}
# Make sure <dir>/<lang>.traineddata exists and meets the minimum size.
# Arguments: $1 - directory holding traineddata files
#            $2 - language code
#            $3 - URL to download from when the file is missing or too small
# Outputs:   a notice on stderr when an undersized file is discarded
# Returns:   0 when the existing file is large enough, otherwise the status
#            of download_traineddata
ensure_valid_traineddata() {
  local dest_dir="$1" lang="$2" url="$3"
  local target="${dest_dir}/${lang}.traineddata"
  local required current
  required="$(min_traineddata_size_bytes "$lang")"
  current="$(file_size_bytes "$target")"
  if ! [ "$current" -ge "$required" ]; then
    # An existing but undersized file is removed before fetching a new copy.
    if [ -f "$target" ]; then
      echo "Invalid ${lang}.traineddata at ${target} (${current} bytes < ${required}); re-downloading..." >&2
      rm -f "$target"
    fi
    download_traineddata "$lang" "$target" "$url"
    return
  fi
  return 0
}
# Populate a tessdata directory with the eng and osd models.
#
# The directory is created if needed. The first known system tessdata
# directory that contains eng.traineddata is used as a seed: its eng, osd and
# deu files (whichever exist) are copied into the destination. Afterwards eng
# and osd are validated and downloaded from the tessdata_fast repository when
# missing or too small.
#
# Globals:   PROGRAMFILES (read, optional)
# Arguments: $1 - destination tessdata directory
# Returns:   0 on success; non-zero when validation/download of osd fails
#            (under `set -e` any earlier failure aborts the script)
ensure_tessdata() {
  local dest="$1"
  mkdir -p "$dest"
  local dest_real
  dest_real="$(cd "$dest" && pwd -P)"
  local candidates=(
    "/opt/homebrew/share/tessdata"
    "/usr/local/opt/tesseract/share/tessdata"
    "/usr/share/tesseract-ocr/5/tessdata"
  )
  # Windows installs, reachable through cygpath or the MSYS-style /c mount.
  if [ -n "${PROGRAMFILES:-}" ] && command -v cygpath >/dev/null 2>&1; then
    candidates+=("$(cygpath -u "$PROGRAMFILES")/Tesseract-OCR/tessdata")
  fi
  if [ -d "/c/Program Files/Tesseract-OCR/tessdata" ]; then
    candidates+=("/c/Program Files/Tesseract-OCR/tessdata")
  fi

  # Loop variables are local so that calling this function never creates or
  # overwrites `dir` / `lang` in the caller's scope.
  local dir dir_real lang
  for dir in "${candidates[@]}"; do
    [ -f "$dir/eng.traineddata" ] || continue
    dir_real="$(cd "$dir" && pwd -P)"
    # When the seed directory IS the destination there is nothing to copy.
    if [ "$dir_real" != "$dest_real" ]; then
      for lang in eng osd deu; do
        [ -f "$dir/$lang.traineddata" ] || continue
        # Skip files that already are the very same file (e.g. via symlink).
        if [ -f "$dest/$lang.traineddata" ] &&
          [ "$dir_real/$lang.traineddata" -ef "$dest/$lang.traineddata" ]; then
          continue
        fi
        cp -f "$dir/$lang.traineddata" "$dest/"
      done
    fi
    # Only the first directory that provides eng.traineddata is considered.
    break
  done

  ensure_valid_traineddata "$dest" "eng" "https://github.com/tesseract-ocr/tessdata_fast/raw/main/eng.traineddata"
  ensure_valid_traineddata "$dest" "osd" "https://github.com/tesseract-ocr/tessdata_fast/raw/main/osd.traineddata"
}
# Choose a platform-specific tessdata directory, export it as TESSDATA_PREFIX
# and make sure it holds the eng and osd models.
#
# Globals:   RUNNER_OS, HOME, APPDATA, USERPROFILE, REPO_ROOT (read)
#            TESSDATA_PREFIX (written, exported)
# Outputs:   status lines on stdout; an error per missing model on stderr
# Returns:   0 when both eng.traineddata and osd.traineddata are present,
#            1 otherwise
setup_tessdata() {
  # RUNNER_OS wins when set; otherwise fall back to the kernel name.
  local platform="${RUNNER_OS:-$(uname -s)}"
  case "$platform" in
    Linux)
      export TESSDATA_PREFIX="/usr/share/tesseract-ocr/5/tessdata"
      ;;
    macOS | Darwin)
      if [ -d "/opt/homebrew/opt/tesseract/share/tessdata" ]; then
        export TESSDATA_PREFIX="/opt/homebrew/opt/tesseract/share/tessdata"
      elif [ -d "/usr/local/opt/tesseract/share/tessdata" ]; then
        export TESSDATA_PREFIX="/usr/local/opt/tesseract/share/tessdata"
      else
        export TESSDATA_PREFIX="$HOME/Library/Application Support/kreuzberg-tesseract/tessdata"
      fi
      ;;
    Windows | MINGW* | MSYS* | CYGWIN*)
      export TESSDATA_PREFIX="${APPDATA:-${USERPROFILE:-}}/kreuzberg-tesseract/tessdata"
      ;;
    *)
      export TESSDATA_PREFIX="${REPO_ROOT:-$(pwd)}/target/tessdata"
      ;;
  esac
  ensure_tessdata "$TESSDATA_PREFIX"
  echo "✓ TESSDATA_PREFIX set to: $TESSDATA_PREFIX"

  # Check both required models explicitly. The status must not depend on
  # which check happens to run last, and a missing file has to be reported
  # even when the caller runs this function with errexit suppressed
  # (e.g. `if setup_tessdata; then ...`).
  local model status=0
  for model in eng osd; do
    if [ -f "$TESSDATA_PREFIX/$model.traineddata" ]; then
      echo "✓ $model.traineddata available"
    else
      echo "ERROR: $model.traineddata missing from $TESSDATA_PREFIX" >&2
      status=1
    fi
  done
  return "$status"
}
# Export the entry points together with every helper they call. Exporting only
# ensure_tessdata/setup_tessdata would leave them broken in a child bash
# process, where the un-exported helpers would be "command not found".
export -f file_size_bytes
export -f min_traineddata_size_bytes
export -f download_traineddata
export -f ensure_valid_traineddata
export -f ensure_tessdata
export -f setup_tessdata