Files
fil/.github/actions/setup-paddle-ocr-models/action.yml
Henrik Jess Nielsen b4c07d3693
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s
Nomad changes
2026-06-01 23:40:55 +02:00

232 lines
8.7 KiB
YAML

# Action metadata: display name and one-line summary.
name: Setup PaddleOCR Models Cache
description: >-
  Download and cache PaddleOCR ONNX models for CI testing
# All inputs are optional; each one carries a default.
inputs:
  # Compared against the string 'true' by the cache-restore step.
  cache-enabled:
    description: Enable model caching (set to false for cross-arch builds)
    required: false
    default: 'true'
  # Matched by substring (det / cls / rec) in the steps' `if:` conditions.
  models:
    description: Comma-separated list of models to setup (det,cls,rec or specific subset)
    required: false
    default: 'det,cls,rec'
  # Used as the leading component of the cache key and its restore-keys.
  cache-key-suffix:
    description: Suffix for cache key to differentiate model sets
    required: false
    default: 'paddle-ocr-v5-onnx'
# Values surfaced to the calling workflow. Each one is read from a step id
# defined under `runs.steps`.
outputs:
  cache-hit:
    description: Whether models were restored from cache (true/false)
    # The cache step is skipped when caching is disabled, and actions/cache
    # leaves `cache-hit` empty on a complete miss. Normalise the empty string
    # to 'false' so callers always receive the documented true/false value.
    value: ${{ steps.cache-models.outputs.cache-hit || 'false' }}
  cache-dir:
    description: Path to the PaddleOCR model cache directory
    value: ${{ steps.set-outputs.outputs.cache-dir }}
  models-available:
    description: Comma-separated list of available models
    value: ${{ steps.verify-models.outputs.available-models }}
# Composite action body.
#
# Flow: create the cache directory -> optionally restore it with
# actions/cache -> download every requested model that is not already
# usable -> verify what is on disk -> publish outputs and environment
# variables for later workflow steps.
runs:
  using: composite
  steps:
    # Create the directory up front so both the cache restore and the
    # downloads have an existing target.
    - name: Setup cache directory
      shell: bash
      run: |
        mkdir -p ~/.cache/kreuzberg/paddle-ocr
        echo "Cache directory: $HOME/.cache/kreuzberg/paddle-ocr"

    # Skipped when caching is disabled; `cache-hit` is then empty.
    - name: Restore PaddleOCR models from cache
      if: inputs.cache-enabled == 'true'
      uses: actions/cache@v5
      id: cache-models
      with:
        path: ~/.cache/kreuzberg/paddle-ocr
        key: ${{ inputs.cache-key-suffix }}-${{ runner.os }}-${{ runner.arch }}-v4
        restore-keys: |
          ${{ inputs.cache-key-suffix }}-${{ runner.os }}-${{ runner.arch }}-
          ${{ inputs.cache-key-suffix }}-${{ runner.os }}-
          ${{ inputs.cache-key-suffix }}-

    # Download steps.
    #
    # The cache key does not encode `inputs.models`, so an exact cache hit may
    # come from a run that requested a different subset of models. Each step
    # is therefore gated only on the model being requested, and skips the
    # download only when the hit was exact AND the file is actually present
    # and non-empty. On a partial (restore-keys) hit or a miss the file is
    # always downloaded again, as before.
    - name: Download detection model (det)
      if: contains(inputs.models, 'det')
      shell: bash
      env:
        CACHE_HIT: ${{ steps.cache-models.outputs.cache-hit }}
      run: |
        MODEL_URL="https://huggingface.co/Kreuzberg/paddleocr-onnx-models/resolve/main/PP-OCRv5_server_det_infer.onnx"
        CACHE_DIR="$HOME/.cache/kreuzberg/paddle-ocr"
        MODEL_DIR="$CACHE_DIR/det"
        MODEL_FILE="$MODEL_DIR/model.onnx"
        if [ "$CACHE_HIT" = "true" ] && [ -s "$MODEL_FILE" ]; then
          echo "Detection model restored from cache, skipping download"
          exit 0
        fi
        echo "Downloading detection model from $MODEL_URL"
        mkdir -p "$MODEL_DIR"
        for attempt in 1 2 3; do
          if [ "$attempt" -gt 1 ]; then
            # Backoff before retries: 5s, then 15s.
            backoff=$((5 * 3 ** (attempt - 2)))
            echo "Retry attempt $attempt/3 after ${backoff}s backoff..."
            sleep "$backoff"
          fi
          if curl -f -L --progress-bar --connect-timeout 30 --max-time 600 \
            -o "$MODEL_FILE" "$MODEL_URL"; then
            echo "Detection model downloaded successfully ($(du -h "$MODEL_FILE" | cut -f1))"
            exit 0
          fi
        done
        echo "ERROR: Failed to download detection model after 3 attempts"
        # Do not leave a partial file behind for later steps to pick up.
        rm -f "$MODEL_FILE"
        exit 1

    - name: Download classification model (cls)
      if: contains(inputs.models, 'cls')
      shell: bash
      env:
        CACHE_HIT: ${{ steps.cache-models.outputs.cache-hit }}
      run: |
        MODEL_URL="https://huggingface.co/Kreuzberg/paddleocr-onnx-models/resolve/main/ch_ppocr_mobile_v2.0_cls_infer.onnx"
        CACHE_DIR="$HOME/.cache/kreuzberg/paddle-ocr"
        MODEL_DIR="$CACHE_DIR/cls"
        MODEL_FILE="$MODEL_DIR/model.onnx"
        if [ "$CACHE_HIT" = "true" ] && [ -s "$MODEL_FILE" ]; then
          echo "Classification model restored from cache, skipping download"
          exit 0
        fi
        echo "Downloading classification model from $MODEL_URL"
        mkdir -p "$MODEL_DIR"
        for attempt in 1 2 3; do
          if [ "$attempt" -gt 1 ]; then
            # Backoff before retries: 5s, then 15s.
            backoff=$((5 * 3 ** (attempt - 2)))
            echo "Retry attempt $attempt/3 after ${backoff}s backoff..."
            sleep "$backoff"
          fi
          if curl -f -L --progress-bar --connect-timeout 30 --max-time 600 \
            -o "$MODEL_FILE" "$MODEL_URL"; then
            echo "Classification model downloaded successfully ($(du -h "$MODEL_FILE" | cut -f1))"
            exit 0
          fi
        done
        echo "ERROR: Failed to download classification model after 3 attempts"
        # Do not leave a partial file behind for later steps to pick up.
        rm -f "$MODEL_FILE"
        exit 1

    - name: Download recognition model (rec/english)
      if: contains(inputs.models, 'rec')
      shell: bash
      env:
        CACHE_HIT: ${{ steps.cache-models.outputs.cache-hit }}
      run: |
        MODEL_URL="https://huggingface.co/Kreuzberg/paddleocr-onnx-models/resolve/main/rec/english/model.onnx"
        CACHE_DIR="$HOME/.cache/kreuzberg/paddle-ocr"
        MODEL_DIR="$CACHE_DIR/rec/english"
        MODEL_FILE="$MODEL_DIR/model.onnx"
        if [ "$CACHE_HIT" = "true" ] && [ -s "$MODEL_FILE" ]; then
          echo "Recognition model restored from cache, skipping download"
          exit 0
        fi
        echo "Downloading English recognition model from $MODEL_URL"
        mkdir -p "$MODEL_DIR"
        for attempt in 1 2 3; do
          if [ "$attempt" -gt 1 ]; then
            # Backoff before retries: 5s, then 15s.
            backoff=$((5 * 3 ** (attempt - 2)))
            echo "Retry attempt $attempt/3 after ${backoff}s backoff..."
            sleep "$backoff"
          fi
          if curl -f -L --progress-bar --connect-timeout 30 --max-time 600 \
            -o "$MODEL_FILE" "$MODEL_URL"; then
            echo "Recognition model downloaded successfully ($(du -h "$MODEL_FILE" | cut -f1))"
            exit 0
          fi
        done
        echo "ERROR: Failed to download recognition model after 3 attempts"
        # Do not leave a partial file behind for later steps to pick up.
        rm -f "$MODEL_FILE"
        exit 1

    - name: Download recognition dictionary (rec/english/dict.txt)
      if: contains(inputs.models, 'rec')
      shell: bash
      env:
        CACHE_HIT: ${{ steps.cache-models.outputs.cache-hit }}
      run: |
        DICT_URL="https://huggingface.co/Kreuzberg/paddleocr-onnx-models/resolve/main/rec/english/dict.txt"
        CACHE_DIR="$HOME/.cache/kreuzberg/paddle-ocr"
        MODEL_DIR="$CACHE_DIR/rec/english"
        DICT_FILE="$MODEL_DIR/dict.txt"
        if [ "$CACHE_HIT" = "true" ] && [ -s "$DICT_FILE" ]; then
          echo "Dictionary restored from cache, skipping download"
          exit 0
        fi
        echo "Downloading English recognition dictionary from $DICT_URL"
        mkdir -p "$MODEL_DIR"
        for attempt in 1 2 3; do
          if [ "$attempt" -gt 1 ]; then
            # Backoff before retries: 5s, then 15s.
            backoff=$((5 * 3 ** (attempt - 2)))
            echo "Retry attempt $attempt/3 after ${backoff}s backoff..."
            sleep "$backoff"
          fi
          if curl -f -L --progress-bar --connect-timeout 30 --max-time 600 \
            -o "$DICT_FILE" "$DICT_URL"; then
            echo "Dictionary downloaded successfully ($(du -h "$DICT_FILE" | cut -f1))"
            exit 0
          fi
        done
        echo "ERROR: Failed to download dictionary after 3 attempts"
        # Do not leave a partial file behind for later steps to pick up.
        rm -f "$DICT_FILE"
        exit 1

    # Report what is on disk and publish it as `available-models`.
    # Fails when nothing is available, or when a requested model (or the
    # dictionary that accompanies `rec`) is missing or empty.
    - name: Verify downloaded models
      id: verify-models
      shell: bash
      env:
        # Passed through the environment rather than interpolated into the
        # script text.
        REQUESTED_MODELS: ${{ inputs.models }}
      run: |
        CACHE_DIR="$HOME/.cache/kreuzberg/paddle-ocr"
        AVAILABLE_MODELS=()
        MISSING_MODELS=()
        TOTAL_SIZE=0
        # Lower-case once so the substring checks below behave like the
        # case-insensitive contains() used in the steps' `if:` conditions.
        REQUESTED=$(printf '%s' "$REQUESTED_MODELS" | tr '[:upper:]' '[:lower:]')

        # human_size <bytes>: IEC-formatted size, or "<n> bytes" when numfmt
        # is unavailable or fails.
        human_size() {
          numfmt --to=iec-i --suffix=B "$1" 2>/dev/null || echo "$1 bytes"
        }

        # record <path> <label>: print the size line for a non-empty file and
        # add it to TOTAL_SIZE. Returns non-zero when the file is missing or
        # empty.
        record() {
          local path="$1" label="$2" size
          [ -s "$path" ] || return 1
          size=$(wc -c < "$path" | tr -d ' ')
          TOTAL_SIZE=$((TOTAL_SIZE + size))
          echo " ✓ $label: $(human_size "$size")"
        }

        # wanted <name>: succeeds when <name> occurs in the requested list.
        wanted() { [[ "$REQUESTED" == *"$1"* ]]; }

        echo "Checking for PaddleOCR models in $CACHE_DIR"

        if record "$CACHE_DIR/det/model.onnx" "Detection model"; then
          AVAILABLE_MODELS+=("det")
        elif wanted det; then
          MISSING_MODELS+=("det")
        fi

        if record "$CACHE_DIR/cls/model.onnx" "Classification model"; then
          AVAILABLE_MODELS+=("cls")
        elif wanted cls; then
          MISSING_MODELS+=("cls")
        fi

        if record "$CACHE_DIR/rec/english/model.onnx" "Recognition model (English)"; then
          AVAILABLE_MODELS+=("rec")
        elif wanted rec; then
          MISSING_MODELS+=("rec")
        fi

        # The dictionary is not a model of its own, so it never appears in
        # `available-models`; it is still required whenever rec is requested.
        if ! record "$CACHE_DIR/rec/english/dict.txt" "Recognition dictionary (English)" && wanted rec; then
          MISSING_MODELS+=("rec/english/dict.txt")
        fi

        if [ ${#AVAILABLE_MODELS[@]} -eq 0 ]; then
          echo "ERROR: No models found in cache directory after download"
          echo "available-models=" >> "$GITHUB_OUTPUT"
          exit 1
        fi

        AVAILABLE_MODELS_STR=$(IFS=, ; echo "${AVAILABLE_MODELS[*]}")
        echo "✓ Total cached models: ${#AVAILABLE_MODELS[@]} ($(human_size "$TOTAL_SIZE"))"
        echo "available-models=$AVAILABLE_MODELS_STR" >> "$GITHUB_OUTPUT"

        if [ ${#MISSING_MODELS[@]} -gt 0 ]; then
          echo "ERROR: Requested models missing from $CACHE_DIR: $(IFS=, ; echo "${MISSING_MODELS[*]}")"
          exit 1
        fi

    # Publishes only the `cache-dir` step output. The environment variables
    # are written once, by the next step, instead of by both steps.
    - name: Set cache directory output
      id: set-outputs
      shell: bash
      run: |
        echo "cache-dir=$HOME/.cache/kreuzberg/paddle-ocr" >> "$GITHUB_OUTPUT"

    # Makes the cache locations visible to later steps of the calling job.
    - name: Export cache environment
      shell: bash
      run: |
        echo "PADDLE_OCR_MODEL_CACHE=$HOME/.cache/kreuzberg/paddle-ocr" >> "$GITHUB_ENV"
        echo "KREUZBERG_CACHE_DIR=$HOME/.cache/kreuzberg" >> "$GITHUB_ENV"
        echo "PaddleOCR model cache configured at: $HOME/.cache/kreuzberg/paddle-ocr"