158 lines
4.1 KiB
Bash
158 lines
4.1 KiB
Bash
|
|
#!/usr/bin/env bash
|
||
|
|
|
||
|
|
set -euo pipefail
|
||
|
|
|
||
|
|
file_size_bytes() {
|
||
|
|
local path="$1"
|
||
|
|
if [ ! -f "$path" ]; then
|
||
|
|
echo 0
|
||
|
|
return
|
||
|
|
fi
|
||
|
|
if stat -c%s "$path" >/dev/null 2>&1; then
|
||
|
|
stat -c%s "$path"
|
||
|
|
return
|
||
|
|
fi
|
||
|
|
stat -f%z "$path"
|
||
|
|
}
|
||
|
|
|
||
|
|
min_traineddata_size_bytes() {
|
||
|
|
local lang="$1"
|
||
|
|
case "$lang" in
|
||
|
|
eng) echo 1000000 ;;
|
||
|
|
osd) echo 100000 ;;
|
||
|
|
deu) echo 1000000 ;;
|
||
|
|
*) echo 100000 ;;
|
||
|
|
esac
|
||
|
|
}
|
||
|
|
|
||
|
|
download_traineddata() {
|
||
|
|
local lang="$1"
|
||
|
|
local dest="$2"
|
||
|
|
local url="$3"
|
||
|
|
local tmp="${dest}.tmp"
|
||
|
|
local min_size
|
||
|
|
min_size="$(min_traineddata_size_bytes "$lang")"
|
||
|
|
|
||
|
|
rm -f "$tmp"
|
||
|
|
|
||
|
|
for attempt in 1 2 3 4 5; do
|
||
|
|
if curl -fsSL --retry 5 --retry-delay 5 --retry-all-errors "$url" -o "$tmp"; then
|
||
|
|
local size
|
||
|
|
size="$(file_size_bytes "$tmp")"
|
||
|
|
if [ "$size" -ge "$min_size" ]; then
|
||
|
|
mv -f "$tmp" "$dest"
|
||
|
|
return 0
|
||
|
|
fi
|
||
|
|
echo "Downloaded ${lang}.traineddata too small (${size} bytes < ${min_size}), retrying..." >&2
|
||
|
|
else
|
||
|
|
echo "Failed to download ${lang}.traineddata (attempt ${attempt}), retrying..." >&2
|
||
|
|
fi
|
||
|
|
rm -f "$tmp"
|
||
|
|
sleep "$attempt"
|
||
|
|
done
|
||
|
|
|
||
|
|
echo "ERROR: Failed to download valid ${lang}.traineddata after retries" >&2
|
||
|
|
return 1
|
||
|
|
}
|
||
|
|
|
||
|
|
ensure_valid_traineddata() {
|
||
|
|
local dest_dir="$1"
|
||
|
|
local lang="$2"
|
||
|
|
local url="$3"
|
||
|
|
local dest_file="${dest_dir}/${lang}.traineddata"
|
||
|
|
local min_size
|
||
|
|
min_size="$(min_traineddata_size_bytes "$lang")"
|
||
|
|
|
||
|
|
local size
|
||
|
|
size="$(file_size_bytes "$dest_file")"
|
||
|
|
if [ "$size" -ge "$min_size" ]; then
|
||
|
|
return 0
|
||
|
|
fi
|
||
|
|
|
||
|
|
if [ -f "$dest_file" ]; then
|
||
|
|
echo "Invalid ${lang}.traineddata at ${dest_file} (${size} bytes < ${min_size}); re-downloading..." >&2
|
||
|
|
rm -f "$dest_file"
|
||
|
|
fi
|
||
|
|
|
||
|
|
download_traineddata "$lang" "$dest_file" "$url"
|
||
|
|
}
|
||
|
|
|
||
|
|
ensure_tessdata() {
|
||
|
|
local dest="$1"
|
||
|
|
mkdir -p "$dest"
|
||
|
|
local dest_real
|
||
|
|
dest_real="$(cd "$dest" && pwd -P)"
|
||
|
|
|
||
|
|
local candidates=(
|
||
|
|
"/opt/homebrew/share/tessdata"
|
||
|
|
"/usr/local/opt/tesseract/share/tessdata"
|
||
|
|
"/usr/share/tesseract-ocr/5/tessdata"
|
||
|
|
)
|
||
|
|
|
||
|
|
if [ -n "${PROGRAMFILES:-}" ] && command -v cygpath >/dev/null 2>&1; then
|
||
|
|
candidates+=("$(cygpath -u "$PROGRAMFILES")/Tesseract-OCR/tessdata")
|
||
|
|
fi
|
||
|
|
if [ -d "/c/Program Files/Tesseract-OCR/tessdata" ]; then
|
||
|
|
candidates+=("/c/Program Files/Tesseract-OCR/tessdata")
|
||
|
|
fi
|
||
|
|
|
||
|
|
for dir in "${candidates[@]}"; do
|
||
|
|
if [ -f "$dir/eng.traineddata" ]; then
|
||
|
|
local dir_real
|
||
|
|
dir_real="$(cd "$dir" && pwd -P)"
|
||
|
|
|
||
|
|
if [ "$dir_real" = "$dest_real" ]; then
|
||
|
|
break
|
||
|
|
fi
|
||
|
|
|
||
|
|
for lang in eng osd deu; do
|
||
|
|
if [ -f "$dir/$lang.traineddata" ]; then
|
||
|
|
if [ -f "$dest/$lang.traineddata" ] &&
|
||
|
|
[ "$dir_real/$lang.traineddata" -ef "$dest/$lang.traineddata" ]; then
|
||
|
|
continue
|
||
|
|
fi
|
||
|
|
cp -f "$dir/$lang.traineddata" "$dest/"
|
||
|
|
fi
|
||
|
|
done
|
||
|
|
break
|
||
|
|
fi
|
||
|
|
done
|
||
|
|
|
||
|
|
ensure_valid_traineddata "$dest" "eng" "https://github.com/tesseract-ocr/tessdata_fast/raw/main/eng.traineddata"
|
||
|
|
ensure_valid_traineddata "$dest" "osd" "https://github.com/tesseract-ocr/tessdata_fast/raw/main/osd.traineddata"
|
||
|
|
}
|
||
|
|
|
||
|
|
setup_tessdata() {
|
||
|
|
local platform="${RUNNER_OS:-$(uname -s)}"
|
||
|
|
|
||
|
|
case "$platform" in
|
||
|
|
Linux)
|
||
|
|
export TESSDATA_PREFIX="/usr/share/tesseract-ocr/5/tessdata"
|
||
|
|
;;
|
||
|
|
macOS | Darwin)
|
||
|
|
if [ -d "/opt/homebrew/opt/tesseract/share/tessdata" ]; then
|
||
|
|
export TESSDATA_PREFIX="/opt/homebrew/opt/tesseract/share/tessdata"
|
||
|
|
elif [ -d "/usr/local/opt/tesseract/share/tessdata" ]; then
|
||
|
|
export TESSDATA_PREFIX="/usr/local/opt/tesseract/share/tessdata"
|
||
|
|
else
|
||
|
|
export TESSDATA_PREFIX="$HOME/Library/Application Support/kreuzberg-tesseract/tessdata"
|
||
|
|
fi
|
||
|
|
;;
|
||
|
|
Windows | MINGW* | MSYS* | CYGWIN*)
|
||
|
|
export TESSDATA_PREFIX="${APPDATA:-${USERPROFILE:-}}/kreuzberg-tesseract/tessdata"
|
||
|
|
;;
|
||
|
|
*)
|
||
|
|
export TESSDATA_PREFIX="${REPO_ROOT:-$(pwd)}/target/tessdata"
|
||
|
|
;;
|
||
|
|
esac
|
||
|
|
|
||
|
|
ensure_tessdata "$TESSDATA_PREFIX"
|
||
|
|
|
||
|
|
echo "✓ TESSDATA_PREFIX set to: $TESSDATA_PREFIX"
|
||
|
|
[ -f "$TESSDATA_PREFIX/eng.traineddata" ] && echo "✓ eng.traineddata available"
|
||
|
|
[ -f "$TESSDATA_PREFIX/osd.traineddata" ] && echo "✓ osd.traineddata available"
|
||
|
|
}
|
||
|
|
|
||
|
|
export -f ensure_tessdata
|
||
|
|
export -f setup_tessdata
|