---
# Composite action: restores (or downloads) the PaddleOCR ONNX models used by
# CI into ~/.cache/kreuzberg/paddle-ocr and exposes the location through
# outputs and environment variables.
#
# Layout produced under the cache directory:
#   det/model.onnx          detection model
#   cls/model.onnx          classification model
#   rec/english/model.onnx  English recognition model
#   rec/english/dict.txt    English recognition dictionary
name: Setup PaddleOCR Models Cache
description: Download and cache PaddleOCR ONNX models for CI testing

inputs:
  cache-enabled:
    description: Enable model caching (set to false for cross-arch builds)
    required: false
    default: "true"
  models:
    description: Comma-separated list of models to setup (det,cls,rec or specific subset)
    required: false
    default: "det,cls,rec"
  cache-key-suffix:
    description: Suffix for cache key to differentiate model sets
    required: false
    default: "paddle-ocr-v5-onnx"

outputs:
  # Forwarded verbatim from actions/cache; it is empty when the restore step
  # is skipped (cache-enabled != 'true').
  cache-hit:
    description: Whether models were restored from cache (true/false)
    value: ${{ steps.cache-models.outputs.cache-hit }}
  cache-dir:
    description: Path to the PaddleOCR model cache directory
    value: ${{ steps.set-outputs.outputs.cache-dir }}
  models-available:
    description: Comma-separated list of available models
    value: ${{ steps.verify-models.outputs.available-models }}

runs:
  using: composite
  steps:
    - name: Setup cache directory
      shell: bash
      run: |
        mkdir -p ~/.cache/kreuzberg/paddle-ocr
        echo "Cache directory: $HOME/.cache/kreuzberg/paddle-ocr"

    - name: Restore PaddleOCR models from cache
      if: inputs.cache-enabled == 'true'
      uses: actions/cache@v5
      id: cache-models
      with:
        path: ~/.cache/kreuzberg/paddle-ocr
        key: ${{ inputs.cache-key-suffix }}-${{ runner.os }}-${{ runner.arch }}-v4
        restore-keys: |
          ${{ inputs.cache-key-suffix }}-${{ runner.os }}-${{ runner.arch }}-
          ${{ inputs.cache-key-suffix }}-${{ runner.os }}-
          ${{ inputs.cache-key-suffix }}-

    # The download steps below are gated on the presence of the target file,
    # not on the cache-hit output: the cache key does not encode `models`, so
    # a restored cache may contain only a subset of what this run requests,
    # and a restore-keys match may already contain everything.
    - name: Download detection model (det)
      if: contains(inputs.models, 'det')
      shell: bash
      run: |
        MODEL_URL="https://huggingface.co/Kreuzberg/paddleocr-onnx-models/resolve/main/PP-OCRv5_server_det_infer.onnx"
        CACHE_DIR="$HOME/.cache/kreuzberg/paddle-ocr"
        MODEL_DIR="$CACHE_DIR/det"
        MODEL_FILE="$MODEL_DIR/model.onnx"
        PARTIAL_FILE="$MODEL_FILE.part"

        if [ -s "$MODEL_FILE" ]; then
          echo "Detection model already present, skipping download"
          exit 0
        fi

        echo "Downloading detection model from $MODEL_URL"
        mkdir -p "$MODEL_DIR"

        for attempt in 1 2 3; do
          if [ "$attempt" -gt 1 ]; then
            # 5s before the second attempt, 15s before the third.
            backoff=$((5 * 3 ** (attempt - 2)))
            echo "Retry attempt $attempt/3 after ${backoff}s backoff..."
            sleep "$backoff"
          fi

          # Write to a temporary name so an interrupted transfer never leaves
          # a truncated file at the final path.
          if curl -f -L --progress-bar --connect-timeout 30 --max-time 600 \
            -o "$PARTIAL_FILE" "$MODEL_URL" && [ -s "$PARTIAL_FILE" ]; then
            mv -f "$PARTIAL_FILE" "$MODEL_FILE"
            echo "Detection model downloaded successfully ($(du -h "$MODEL_FILE" | cut -f1))"
            exit 0
          fi
        done

        echo "ERROR: Failed to download detection model after 3 attempts"
        rm -f "$PARTIAL_FILE"
        exit 1

    - name: Download classification model (cls)
      if: contains(inputs.models, 'cls')
      shell: bash
      run: |
        MODEL_URL="https://huggingface.co/Kreuzberg/paddleocr-onnx-models/resolve/main/ch_ppocr_mobile_v2.0_cls_infer.onnx"
        CACHE_DIR="$HOME/.cache/kreuzberg/paddle-ocr"
        MODEL_DIR="$CACHE_DIR/cls"
        MODEL_FILE="$MODEL_DIR/model.onnx"
        PARTIAL_FILE="$MODEL_FILE.part"

        if [ -s "$MODEL_FILE" ]; then
          echo "Classification model already present, skipping download"
          exit 0
        fi

        echo "Downloading classification model from $MODEL_URL"
        mkdir -p "$MODEL_DIR"

        for attempt in 1 2 3; do
          if [ "$attempt" -gt 1 ]; then
            # 5s before the second attempt, 15s before the third.
            backoff=$((5 * 3 ** (attempt - 2)))
            echo "Retry attempt $attempt/3 after ${backoff}s backoff..."
            sleep "$backoff"
          fi

          # Write to a temporary name so an interrupted transfer never leaves
          # a truncated file at the final path.
          if curl -f -L --progress-bar --connect-timeout 30 --max-time 600 \
            -o "$PARTIAL_FILE" "$MODEL_URL" && [ -s "$PARTIAL_FILE" ]; then
            mv -f "$PARTIAL_FILE" "$MODEL_FILE"
            echo "Classification model downloaded successfully ($(du -h "$MODEL_FILE" | cut -f1))"
            exit 0
          fi
        done

        echo "ERROR: Failed to download classification model after 3 attempts"
        rm -f "$PARTIAL_FILE"
        exit 1

    - name: Download recognition model (rec/english)
      if: contains(inputs.models, 'rec')
      shell: bash
      run: |
        MODEL_URL="https://huggingface.co/Kreuzberg/paddleocr-onnx-models/resolve/main/rec/english/model.onnx"
        CACHE_DIR="$HOME/.cache/kreuzberg/paddle-ocr"
        MODEL_DIR="$CACHE_DIR/rec/english"
        MODEL_FILE="$MODEL_DIR/model.onnx"
        PARTIAL_FILE="$MODEL_FILE.part"

        if [ -s "$MODEL_FILE" ]; then
          echo "Recognition model already present, skipping download"
          exit 0
        fi

        echo "Downloading English recognition model from $MODEL_URL"
        mkdir -p "$MODEL_DIR"

        for attempt in 1 2 3; do
          if [ "$attempt" -gt 1 ]; then
            # 5s before the second attempt, 15s before the third.
            backoff=$((5 * 3 ** (attempt - 2)))
            echo "Retry attempt $attempt/3 after ${backoff}s backoff..."
            sleep "$backoff"
          fi

          # Write to a temporary name so an interrupted transfer never leaves
          # a truncated file at the final path.
          if curl -f -L --progress-bar --connect-timeout 30 --max-time 600 \
            -o "$PARTIAL_FILE" "$MODEL_URL" && [ -s "$PARTIAL_FILE" ]; then
            mv -f "$PARTIAL_FILE" "$MODEL_FILE"
            echo "Recognition model downloaded successfully ($(du -h "$MODEL_FILE" | cut -f1))"
            exit 0
          fi
        done

        echo "ERROR: Failed to download recognition model after 3 attempts"
        rm -f "$PARTIAL_FILE"
        exit 1

    - name: Download recognition dictionary (rec/english/dict.txt)
      if: contains(inputs.models, 'rec')
      shell: bash
      run: |
        DICT_URL="https://huggingface.co/Kreuzberg/paddleocr-onnx-models/resolve/main/rec/english/dict.txt"
        CACHE_DIR="$HOME/.cache/kreuzberg/paddle-ocr"
        MODEL_DIR="$CACHE_DIR/rec/english"
        DICT_FILE="$MODEL_DIR/dict.txt"
        PARTIAL_FILE="$DICT_FILE.part"

        if [ -s "$DICT_FILE" ]; then
          echo "Dictionary already present, skipping download"
          exit 0
        fi

        echo "Downloading English recognition dictionary from $DICT_URL"
        mkdir -p "$MODEL_DIR"

        for attempt in 1 2 3; do
          if [ "$attempt" -gt 1 ]; then
            # 5s before the second attempt, 15s before the third.
            backoff=$((5 * 3 ** (attempt - 2)))
            echo "Retry attempt $attempt/3 after ${backoff}s backoff..."
            sleep "$backoff"
          fi

          # Write to a temporary name so an interrupted transfer never leaves
          # a truncated file at the final path.
          if curl -f -L --progress-bar --connect-timeout 30 --max-time 600 \
            -o "$PARTIAL_FILE" "$DICT_URL" && [ -s "$PARTIAL_FILE" ]; then
            mv -f "$PARTIAL_FILE" "$DICT_FILE"
            echo "Dictionary downloaded successfully ($(du -h "$DICT_FILE" | cut -f1))"
            exit 0
          fi
        done

        echo "ERROR: Failed to download dictionary after 3 attempts"
        rm -f "$PARTIAL_FILE"
        exit 1

    - name: Verify downloaded models
      id: verify-models
      shell: bash
      env:
        # Same expressions the download steps use, so "requested" means the
        # same thing here as it does there. Each evaluates to true or false.
        REQUIRE_DET: ${{ contains(inputs.models, 'det') }}
        REQUIRE_CLS: ${{ contains(inputs.models, 'cls') }}
        REQUIRE_REC: ${{ contains(inputs.models, 'rec') }}
      run: |
        CACHE_DIR="$HOME/.cache/kreuzberg/paddle-ocr"
        AVAILABLE_MODELS=()
        MISSING_MODELS=()
        TOTAL_SIZE=0

        # Print a byte count in IEC units, or as plain bytes where numfmt is
        # not installed.
        human_size() {
          numfmt --to=iec-i --suffix=B "$1" 2>/dev/null || echo "$1 bytes"
        }

        # check_file LABEL PATH
        # Reports the file and adds its size to TOTAL_SIZE.
        # Returns non-zero when PATH does not exist.
        check_file() {
          local label="$1" path="$2" size
          [ -f "$path" ] || return 1
          size=$(wc -c < "$path" | tr -d ' ')
          TOTAL_SIZE=$((TOTAL_SIZE + size))
          echo " ✓ $label: $(human_size "$size")"
        }

        echo "Checking for PaddleOCR models in $CACHE_DIR"

        if check_file "Detection model" "$CACHE_DIR/det/model.onnx"; then
          AVAILABLE_MODELS+=("det")
        elif [ "$REQUIRE_DET" = "true" ]; then
          MISSING_MODELS+=("det")
        fi

        if check_file "Classification model" "$CACHE_DIR/cls/model.onnx"; then
          AVAILABLE_MODELS+=("cls")
        elif [ "$REQUIRE_CLS" = "true" ]; then
          MISSING_MODELS+=("cls")
        fi

        if check_file "Recognition model (English)" "$CACHE_DIR/rec/english/model.onnx"; then
          AVAILABLE_MODELS+=("rec")
        elif [ "$REQUIRE_REC" = "true" ]; then
          MISSING_MODELS+=("rec")
        fi

        # The dictionary is not a model of its own, so it is never listed in
        # available-models, but a requested rec model needs it.
        if ! check_file "Recognition dictionary (English)" "$CACHE_DIR/rec/english/dict.txt"; then
          if [ "$REQUIRE_REC" = "true" ]; then
            MISSING_MODELS+=("rec/english/dict.txt")
          fi
        fi

        if [ ${#AVAILABLE_MODELS[@]} -eq 0 ]; then
          echo "ERROR: No models found in cache directory after download"
          echo "available-models=" >> "$GITHUB_OUTPUT"
          exit 1
        fi

        AVAILABLE_MODELS_STR=$(IFS=, ; echo "${AVAILABLE_MODELS[*]}")
        echo "✓ Total cached models: ${#AVAILABLE_MODELS[@]} ($(human_size "$TOTAL_SIZE"))"
        echo "available-models=$AVAILABLE_MODELS_STR" >> "$GITHUB_OUTPUT"

        if [ ${#MISSING_MODELS[@]} -gt 0 ]; then
          MISSING_MODELS_STR=$(IFS=, ; echo "${MISSING_MODELS[*]}")
          echo "ERROR: Requested models are missing from $CACHE_DIR: $MISSING_MODELS_STR"
          exit 1
        fi

    - name: Set cache directory output
      id: set-outputs
      shell: bash
      run: |
        echo "cache-dir=$HOME/.cache/kreuzberg/paddle-ocr" >> "$GITHUB_OUTPUT"

    # Makes the cache location visible to every later step of the calling job.
    - name: Export cache environment
      shell: bash
      run: |
        echo "PADDLE_OCR_MODEL_CACHE=$HOME/.cache/kreuzberg/paddle-ocr" >> "$GITHUB_ENV"
        echo "KREUZBERG_CACHE_DIR=$HOME/.cache/kreuzberg" >> "$GITHUB_ENV"
        echo "PaddleOCR model cache configured at: $HOME/.cache/kreuzberg/paddle-ocr"