This commit is contained in:
157
scripts/lib/tessdata.sh
Executable file
157
scripts/lib/tessdata.sh
Executable file
@@ -0,0 +1,157 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
# Strict mode: abort on an unhandled command failure (-e), treat expansion of
# an unset variable as an error (-u), and make a pipeline fail when any of its
# stages fails (pipefail).
# NOTE(review): if this file is sourced rather than executed, these options
# also take effect in the sourcing shell - confirm callers expect that.
set -euo pipefail
|
||||
|
||||
#######################################
# Print the size in bytes of a regular file.
# Symbolic links are followed, so a link to a regular file reports the size
# of the file it points to (consistent with the `-f` existence test).
# Arguments: $1 - path to inspect
# Outputs:   the size on stdout; 0 when the path is not a regular file
# Returns:   0 on success; otherwise the status of the BSD-style stat call
#######################################
file_size_bytes() {
  local path="$1"
  local size

  if [ ! -f "$path" ]; then
    echo 0
    return
  fi

  # GNU stat takes the format via -c. Capture the output so stat runs only
  # once; -L dereferences symlinks instead of measuring the link itself.
  if size="$(stat -L -c%s "$path" 2>/dev/null)"; then
    printf '%s\n' "$size"
    return
  fi

  # BSD/macOS stat rejects -c and takes the format via -f instead.
  stat -L -f%z "$path"
}
|
||||
|
||||
#######################################
# Print the size threshold, in bytes, associated with a language code.
# Arguments: $1 - language code (e.g. eng, osd, deu)
# Outputs:   1000000 for eng and deu; 100000 for osd and any other code
#######################################
min_traineddata_size_bytes() {
  local language="$1"
  local threshold=100000

  case "$language" in
    eng | deu) threshold=1000000 ;;
  esac

  echo "$threshold"
}
|
||||
|
||||
#######################################
# Download a traineddata file, accepting it only if it is large enough.
# The payload is fetched to "<dest>.tmp" and moved onto <dest> only after its
# size reaches the threshold printed by min_traineddata_size_bytes, so a
# truncated download never replaces the destination.
# Arguments: $1 - language code (used for the threshold and in messages)
#            $2 - destination file path
#            $3 - URL to fetch
# Outputs:   progress and error messages on stderr
# Returns:   0 once a large-enough file is in place, 1 after 5 failed attempts
#######################################
download_traineddata() {
  local lang="$1"
  local dest="$2"
  local url="$3"
  local tmp="${dest}.tmp"
  local max_attempts=5
  # Declared local so the loop counter and size do not leak into (or clobber
  # variables of) the shell that sourced this file.
  local attempt size min_size
  min_size="$(min_traineddata_size_bytes "$lang")"

  # Discard leftovers from an earlier interrupted run.
  rm -f "$tmp"

  for attempt in 1 2 3 4 5; do
    if curl -fsSL --retry 5 --retry-delay 5 --retry-all-errors "$url" -o "$tmp"; then
      size="$(file_size_bytes "$tmp")"
      if [ "$size" -ge "$min_size" ]; then
        mv -f "$tmp" "$dest"
        return 0
      fi
      echo "Downloaded ${lang}.traineddata too small (${size} bytes < ${min_size}), retrying..." >&2
    else
      echo "Failed to download ${lang}.traineddata (attempt ${attempt}), retrying..." >&2
    fi

    rm -f "$tmp"

    # Linear back-off between attempts; waiting after the final attempt
    # would only delay the error report.
    if [ "$attempt" -lt "$max_attempts" ]; then
      sleep "$attempt"
    fi
  done

  echo "ERROR: Failed to download valid ${lang}.traineddata after retries" >&2
  return 1
}
|
||||
|
||||
#######################################
# Make sure "<dest_dir>/<lang>.traineddata" exists and is large enough.
# A file whose size already meets the threshold from
# min_traineddata_size_bytes is left untouched; otherwise any existing file
# is removed and download_traineddata is asked to fetch a replacement.
# Arguments: $1 - directory that holds the traineddata files
#            $2 - language code
#            $3 - URL to download from when the file is missing or too small
# Outputs:   a notice on stderr when an existing file is rejected
# Returns:   0 if the file was already valid, else download_traineddata's status
#######################################
ensure_valid_traineddata() {
  local dest_dir="$1" lang="$2" url="$3"
  local target="${dest_dir}/${lang}.traineddata"
  local required current

  required="$(min_traineddata_size_bytes "$lang")"
  current="$(file_size_bytes "$target")"

  if ! [ "$current" -ge "$required" ]; then
    # Present but undersized: report it and clear it out before fetching.
    if [ -f "$target" ]; then
      echo "Invalid ${lang}.traineddata at ${target} (${current} bytes < ${required}); re-downloading..." >&2
      rm -f "$target"
    fi

    download_traineddata "$lang" "$target" "$url"
  fi
}
|
||||
|
||||
#######################################
# Populate a tessdata directory with the eng and osd models.
# Creates the directory, then copies eng/osd/deu traineddata files from the
# first known system location that contains eng.traineddata (skipped when that
# location is the destination itself). Finally calls ensure_valid_traineddata
# for eng and osd; deu is only copied when available, never validated here.
# Arguments: $1 - destination tessdata directory
# Returns:   status of the last ensure_valid_traineddata call
#######################################
ensure_tessdata() {
  local dest="$1"
  # Loop variables are local so sourcing scripts that use their own
  # `dir` / `lang` variables are not clobbered by this function.
  local dest_real dir dir_real lang

  mkdir -p "$dest"
  dest_real="$(cd "$dest" && pwd -P)"

  local candidates=(
    "/opt/homebrew/share/tessdata"
    "/usr/local/opt/tesseract/share/tessdata"
    "/usr/share/tesseract-ocr/5/tessdata"
  )

  # Windows shells: add the Program Files install location when resolvable.
  if [ -n "${PROGRAMFILES:-}" ] && command -v cygpath >/dev/null 2>&1; then
    candidates+=("$(cygpath -u "$PROGRAMFILES")/Tesseract-OCR/tessdata")
  fi
  if [ -d "/c/Program Files/Tesseract-OCR/tessdata" ]; then
    candidates+=("/c/Program Files/Tesseract-OCR/tessdata")
  fi

  for dir in "${candidates[@]}"; do
    if [ -f "$dir/eng.traineddata" ]; then
      dir_real="$(cd "$dir" && pwd -P)"

      # The destination is this candidate itself: nothing to copy.
      if [ "$dir_real" = "$dest_real" ]; then
        break
      fi

      for lang in eng osd deu; do
        if [ -f "$dir/$lang.traineddata" ]; then
          # Skip when source and destination are the same underlying file.
          if [ -f "$dest/$lang.traineddata" ] &&
            [ "$dir_real/$lang.traineddata" -ef "$dest/$lang.traineddata" ]; then
            continue
          fi
          cp -f "$dir/$lang.traineddata" "$dest/"
        fi
      done

      # Only the first candidate that has eng.traineddata is used.
      break
    fi
  done

  ensure_valid_traineddata "$dest" "eng" "https://github.com/tesseract-ocr/tessdata_fast/raw/main/eng.traineddata"
  ensure_valid_traineddata "$dest" "osd" "https://github.com/tesseract-ocr/tessdata_fast/raw/main/osd.traineddata"
}
|
||||
|
||||
#######################################
# Choose a tessdata directory for the current platform, export it as
# TESSDATA_PREFIX, and populate it via ensure_tessdata.
# The platform is taken from RUNNER_OS when set, otherwise from `uname -s`.
# Globals:   RUNNER_OS, HOME, APPDATA, USERPROFILE, REPO_ROOT (read)
#            TESSDATA_PREFIX (exported)
# Outputs:   status lines on stdout
# Returns:   0 on success; non-zero only if ensure_tessdata fails
#######################################
setup_tessdata() {
  local platform="${RUNNER_OS:-$(uname -s)}"
  local lang

  case "$platform" in
    Linux)
      export TESSDATA_PREFIX="/usr/share/tesseract-ocr/5/tessdata"
      ;;
    macOS | Darwin)
      if [ -d "/opt/homebrew/opt/tesseract/share/tessdata" ]; then
        export TESSDATA_PREFIX="/opt/homebrew/opt/tesseract/share/tessdata"
      elif [ -d "/usr/local/opt/tesseract/share/tessdata" ]; then
        export TESSDATA_PREFIX="/usr/local/opt/tesseract/share/tessdata"
      else
        export TESSDATA_PREFIX="$HOME/Library/Application Support/kreuzberg-tesseract/tessdata"
      fi
      ;;
    Windows | MINGW* | MSYS* | CYGWIN*)
      export TESSDATA_PREFIX="${APPDATA:-${USERPROFILE:-}}/kreuzberg-tesseract/tessdata"
      ;;
    *)
      export TESSDATA_PREFIX="${REPO_ROOT:-$(pwd)}/target/tessdata"
      ;;
  esac

  ensure_tessdata "$TESSDATA_PREFIX"

  echo "✓ TESSDATA_PREFIX set to: $TESSDATA_PREFIX"

  # Informational only. Use `if` rather than `[ ... ] && echo` so that a
  # missing file cannot become the function's (non-zero) return status and
  # abort callers running under `set -e`.
  for lang in eng osd; do
    if [ -f "$TESSDATA_PREFIX/$lang.traineddata" ]; then
      echo "✓ $lang.traineddata available"
    fi
  done
}
|
||||
|
||||
# Export the helper functions together with the entry points: a child bash
# process inherits only exported functions, and ensure_tessdata /
# setup_tessdata call these helpers, so exporting the entry points alone
# leaves them failing with "command not found" in such a child.
export -f file_size_bytes
export -f min_traineddata_size_bytes
export -f download_traineddata
export -f ensure_valid_traineddata
export -f ensure_tessdata
export -f setup_tessdata
|
||||
Reference in New Issue
Block a user