198 lines
7.1 KiB
YAML
198 lines
7.1 KiB
YAML
# Composite action metadata: display name and summary of the action.
name: 'Setup Layout Detection Models Cache'
description: >-
  Download and cache layout detection ONNX models (RT-DETR + TATR)
  for CI testing
|
|
|
|
# Caller-supplied options. All are optional strings with defaults.
inputs:
  # Compared against the literal string 'true' by the cache restore step.
  cache-enabled:
    description: 'Enable model caching (set to false for cross-arch builds)'
    required: false
    default: 'true'

  # Tested with substring matching (contains) by the download steps.
  models:
    description: 'Comma-separated list of models to setup (rtdetr,tatr)'
    required: false
    default: 'rtdetr,tatr'

  # Leading component of the cache key and of every restore key.
  cache-key-suffix:
    description: 'Suffix for cache key to differentiate model sets'
    required: false
    default: 'layout-models-v2'
|
|
|
|
# Values exposed to the calling workflow; each forwards a step output.
outputs:
  # Forwarded from the `cache-models` step. That step only runs when
  # inputs.cache-enabled == 'true', so this is presumably empty otherwise.
  cache-hit:
    description: 'Whether models were restored from cache (true/false)'
    value: '${{ steps.cache-models.outputs.cache-hit }}'

  # Written by the `set-outputs` step.
  cache-dir:
    description: 'Path to the layout model cache directory'
    value: '${{ steps.set-outputs.outputs.cache-dir }}'

  # Written by the `verify-models` step from the files found on disk.
  models-available:
    description: 'Comma-separated list of available models'
    value: '${{ steps.verify-models.outputs.available-models }}'
|
|
|
|
runs:
  using: 'composite'
  steps:
    # Create the model directory up front and log where it lives.
    - name: Setup cache directory
      shell: bash
      run: |
        layout_dir="$HOME/.cache/kreuzberg/layout"
        mkdir -p "$layout_dir"
        echo "Cache directory: $layout_dir"
|
|
|
|
# Restore previously downloaded models. Skipped entirely when the caller
# disables caching; the download steps then always run because the
# `cache-hit` output of a skipped step is not 'true'.
- name: Restore layout models from cache
  if: inputs.cache-enabled == 'true'
  uses: actions/cache@v5
  id: cache-models
  with:
    path: ~/.cache/kreuzberg/layout
    # The trailing segment names each requested model together with the first
    # 8 hex digits of its pinned SHA256 (see the "Verify ... SHA256" steps).
    # It is derived from inputs.models so that a cache saved for a subset of
    # models can never be an exact hit for a run requesting more models, which
    # would skip the download steps and leave a model missing. For the default
    # models value the key is identical to the previous hard-coded key.
    # Cache keys must not contain commas, so inputs.models is not embedded
    # verbatim.
    key: ${{ inputs.cache-key-suffix }}-${{ runner.os }}-${{ runner.arch }}-${{ contains(inputs.models, 'rtdetr') && 'rtdetr_3bf2fb0e' || 'no_rtdetr' }}+${{ contains(inputs.models, 'tatr') && 'tatr_c11f4033' || 'no_tatr' }}
    # Prefix fallbacks, most specific first. A restore through one of these is
    # not reported as an exact hit, so the download steps still run.
    restore-keys: |
      ${{ inputs.cache-key-suffix }}-${{ runner.os }}-${{ runner.arch }}-
      ${{ inputs.cache-key-suffix }}-${{ runner.os }}-
      ${{ inputs.cache-key-suffix }}-
|
|
|
|
# Fetch the RT-DETR ONNX model unless the cache produced an exact hit.
# Up to three attempts, sleeping 5s and then 15s between them.
- name: Download RT-DETR model (rtdetr)
  if: contains(inputs.models, 'rtdetr') && steps.cache-models.outputs.cache-hit != 'true'
  shell: bash
  run: |
    MODEL_URL="https://huggingface.co/Kreuzberg/layout-models/resolve/main/rtdetr/model.onnx"
    CACHE_DIR="$HOME/.cache/kreuzberg/layout"
    MODEL_DIR="$CACHE_DIR/rtdetr"
    MODEL_FILE="$MODEL_DIR/model.onnx"
    # curl writes here first; an interrupted transfer (e.g. --max-time hit)
    # leaves a partial file behind, which must never be mistaken for the model.
    PARTIAL_FILE="$MODEL_FILE.part"

    echo "Downloading RT-DETR layout detection model from $MODEL_URL"
    mkdir -p "$MODEL_DIR"

    downloaded=false
    for attempt in 1 2 3; do
      if [ "$attempt" -gt 1 ]; then
        # 5s before attempt 2, 15s before attempt 3.
        backoff=$((5 * 3 ** (attempt - 2)))
        echo "Retry attempt $attempt/3 after ${backoff}s backoff..."
        sleep "$backoff"
      fi

      if curl -f -L --progress-bar --connect-timeout 30 --max-time 600 \
        -o "$PARTIAL_FILE" "$MODEL_URL"; then
        # Only a transfer curl reports as successful becomes the model file.
        mv -f "$PARTIAL_FILE" "$MODEL_FILE"
        downloaded=true
        echo "RT-DETR model downloaded successfully ($(du -h "$MODEL_FILE" | cut -f1))"
        break
      fi

      # Drop whatever the failed attempt wrote before retrying.
      rm -f "$PARTIAL_FILE"
    done

    # Judge success by curl's exit status, not by file existence: a failed
    # transfer can still leave a file on disk.
    if [ "$downloaded" != "true" ]; then
      echo "ERROR: Failed to download RT-DETR model after 3 attempts"
      exit 1
    fi
|
|
|
|
# Compare the freshly downloaded RT-DETR model against its pinned SHA256.
# On mismatch the file is deleted and the step fails.
- name: Verify RT-DETR SHA256
  if: contains(inputs.models, 'rtdetr') && steps.cache-models.outputs.cache-hit != 'true'
  shell: bash
  run: |
    model_path="$HOME/.cache/kreuzberg/layout/rtdetr/model.onnx"
    want="3bf2fb0ee6df87435b7ae47f0f3930ec3dc97ec56fd824acc6d57bc7a6b89ef2"

    # Use sha256sum when it is installed, otherwise fall back to shasum.
    if command -v sha256sum &>/dev/null; then
      hasher=(sha256sum)
    else
      hasher=(shasum -a 256)
    fi
    got=$("${hasher[@]}" "$model_path" | awk '{print $1}')

    if [ "$got" = "$want" ]; then
      echo "RT-DETR SHA256 verified"
      exit 0
    fi

    echo "ERROR: RT-DETR SHA256 mismatch"
    echo " Expected: $want"
    echo " Actual: $got"
    rm -f "$model_path"
    exit 1
|
|
|
|
# Fetch the TATR ONNX model unless the cache produced an exact hit.
# Up to three attempts, sleeping 5s and then 15s between them.
# The remote file is named model.onnx but is stored locally as tatr.onnx.
- name: Download TATR model (tatr)
  if: contains(inputs.models, 'tatr') && steps.cache-models.outputs.cache-hit != 'true'
  shell: bash
  run: |
    MODEL_URL="https://huggingface.co/Kreuzberg/layout-models/resolve/main/tatr/model.onnx"
    CACHE_DIR="$HOME/.cache/kreuzberg/layout"
    MODEL_DIR="$CACHE_DIR/tatr"
    MODEL_FILE="$MODEL_DIR/tatr.onnx"
    # curl writes here first; an interrupted transfer (e.g. --max-time hit)
    # leaves a partial file behind, which must never be mistaken for the model.
    PARTIAL_FILE="$MODEL_FILE.part"

    echo "Downloading TATR table recognition model from $MODEL_URL"
    mkdir -p "$MODEL_DIR"

    downloaded=false
    for attempt in 1 2 3; do
      if [ "$attempt" -gt 1 ]; then
        # 5s before attempt 2, 15s before attempt 3.
        backoff=$((5 * 3 ** (attempt - 2)))
        echo "Retry attempt $attempt/3 after ${backoff}s backoff..."
        sleep "$backoff"
      fi

      if curl -f -L --progress-bar --connect-timeout 30 --max-time 600 \
        -o "$PARTIAL_FILE" "$MODEL_URL"; then
        # Only a transfer curl reports as successful becomes the model file.
        mv -f "$PARTIAL_FILE" "$MODEL_FILE"
        downloaded=true
        echo "TATR model downloaded successfully ($(du -h "$MODEL_FILE" | cut -f1))"
        break
      fi

      # Drop whatever the failed attempt wrote before retrying.
      rm -f "$PARTIAL_FILE"
    done

    # Judge success by curl's exit status, not by file existence: a failed
    # transfer can still leave a file on disk.
    if [ "$downloaded" != "true" ]; then
      echo "ERROR: Failed to download TATR model after 3 attempts"
      exit 1
    fi
|
|
|
|
# Compare the freshly downloaded TATR model against its pinned SHA256.
# On mismatch the file is deleted and the step fails.
- name: Verify TATR SHA256
  if: contains(inputs.models, 'tatr') && steps.cache-models.outputs.cache-hit != 'true'
  shell: bash
  run: |
    model_path="$HOME/.cache/kreuzberg/layout/tatr/tatr.onnx"
    want="c11f4033da75e9c4d41c403ef356e89caa0a37a7d111b55461e7d5ba856bb6b6"

    # Use sha256sum when it is installed, otherwise fall back to shasum.
    if command -v sha256sum &>/dev/null; then
      hasher=(sha256sum)
    else
      hasher=(shasum -a 256)
    fi
    got=$("${hasher[@]}" "$model_path" | awk '{print $1}')

    if [ "$got" = "$want" ]; then
      echo "TATR SHA256 verified"
      exit 0
    fi

    echo "ERROR: TATR SHA256 mismatch"
    echo " Expected: $want"
    echo " Actual: $got"
    rm -f "$model_path"
    exit 1
|
|
|
|
# Inventory the model files on disk and publish them as `available-models`.
# Runs unconditionally (after a cache hit as well as after downloads).
# Fails only when no model at all is present; a requested model that is
# missing while another is present produces a warning annotation.
- name: Verify downloaded models
  id: verify-models
  shell: bash
  env:
    # Passed through the environment rather than interpolated into the script.
    REQUESTED_MODELS: ${{ inputs.models }}
  run: |
    CACHE_DIR="$HOME/.cache/kreuzberg/layout"
    AVAILABLE_MODELS=()
    TOTAL_SIZE=0

    # Print a byte count in IEC units, or as plain bytes where numfmt is
    # unavailable or fails.
    human_size() {
      numfmt --to=iec-i --suffix=B "$1" 2>/dev/null || echo "$1 bytes"
    }

    # check_model <id> <label> <file>: record the model if its file exists,
    # otherwise warn when the caller asked for it (same substring match the
    # download steps use).
    check_model() {
      local id="$1" label="$2" file="$3" size
      if [ -f "$file" ]; then
        size=$(wc -c < "$file" | tr -d ' ')
        AVAILABLE_MODELS+=("$id")
        TOTAL_SIZE=$((TOTAL_SIZE + size))
        echo " ✓ $label model: $(human_size "$size")"
      elif [[ "$REQUESTED_MODELS" == *"$id"* ]]; then
        echo "::warning::Requested layout model '$id' was not found at $file"
      fi
    }

    echo "Checking for layout models in $CACHE_DIR"

    check_model "rtdetr" "RT-DETR" "$CACHE_DIR/rtdetr/model.onnx"
    check_model "tatr" "TATR" "$CACHE_DIR/tatr/tatr.onnx"

    if [ ${#AVAILABLE_MODELS[@]} -eq 0 ]; then
      echo "ERROR: No layout models found in cache directory after download"
      echo "available-models=" >> "$GITHUB_OUTPUT"
      exit 1
    fi

    # Join the array with commas inside a subshell so IFS is not changed here.
    AVAILABLE_MODELS_STR=$(IFS=, ; echo "${AVAILABLE_MODELS[*]}")
    echo "✓ Total cached layout models: ${#AVAILABLE_MODELS[@]} ($(human_size "$TOTAL_SIZE"))"
    echo "available-models=$AVAILABLE_MODELS_STR" >> "$GITHUB_OUTPUT"
|
|
|
|
# Publish the model directory as the `cache-dir` step output and export
# KREUZBERG_CACHE_DIR (the parent directory) to the job environment.
- name: Set cache directory output
  id: set-outputs
  shell: bash
  run: |
    CACHE_DIR="$HOME/.cache/kreuzberg/layout"
    # Redirect targets are quoted: an unquoted path containing spaces makes
    # bash fail with "ambiguous redirect".
    echo "cache-dir=$CACHE_DIR" >> "$GITHUB_OUTPUT"
    echo "KREUZBERG_CACHE_DIR=$HOME/.cache/kreuzberg" >> "$GITHUB_ENV"
    echo "Layout model cache configured at: $CACHE_DIR"
|