# Composite action: restores (or downloads) the layout detection ONNX models
# into ~/.cache/kreuzberg/layout, verifies their SHA256 digests, and exports
# KREUZBERG_CACHE_DIR for later steps in the calling job.
#
# Layout produced on disk:
#   ~/.cache/kreuzberg/layout/rtdetr/model.onnx
#   ~/.cache/kreuzberg/layout/tatr/tatr.onnx
name: Setup Layout Detection Models Cache
description: Download and cache layout detection ONNX models (RT-DETR + TATR) for CI testing

inputs:
  cache-enabled:
    description: Enable model caching (set to false for cross-arch builds)
    required: false
    default: "true"
  models:
    description: Comma-separated list of models to setup (rtdetr,tatr)
    required: false
    default: "rtdetr,tatr"
  cache-key-suffix:
    description: Suffix for cache key to differentiate model sets
    required: false
    default: "layout-models-v2"

outputs:
  cache-hit:
    description: Whether models were restored from cache (true/false)
    # The cache step's output is empty when the step is skipped or nothing
    # matched; normalise it so the value is always "true" or "false".
    value: ${{ steps.cache-models.outputs.cache-hit || 'false' }}
  cache-dir:
    description: Path to the layout model cache directory
    value: ${{ steps.set-outputs.outputs.cache-dir }}
  models-available:
    description: Comma-separated list of available models
    value: ${{ steps.verify-models.outputs.available-models }}

runs:
  using: composite
  steps:
    - name: Setup cache directory
      shell: bash
      run: |
        mkdir -p ~/.cache/kreuzberg/layout
        echo "Cache directory: $HOME/.cache/kreuzberg/layout"

    - name: Restore layout models from cache
      if: inputs.cache-enabled == 'true'
      uses: actions/cache@v5
      id: cache-models
      with:
        path: ~/.cache/kreuzberg/layout
        # The key embeds the first 8 hex digits of each model's SHA256 so that
        # bumping a model invalidates the exact-match cache entry.
        # NOTE: inputs.models is not part of the key (cache keys may not
        # contain commas), so an exact hit does not guarantee that every
        # requested model is present; the download steps handle that case.
        key: ${{ inputs.cache-key-suffix }}-${{ runner.os }}-${{ runner.arch }}-rtdetr_3bf2fb0e+tatr_c11f4033
        restore-keys: |
          ${{ inputs.cache-key-suffix }}-${{ runner.os }}-${{ runner.arch }}-
          ${{ inputs.cache-key-suffix }}-${{ runner.os }}-
          ${{ inputs.cache-key-suffix }}-

    - name: Download RT-DETR model (rtdetr)
      if: contains(inputs.models, 'rtdetr')
      shell: bash
      env:
        # Empty when the cache step was skipped or nothing was restored.
        CACHE_HIT: ${{ steps.cache-models.outputs.cache-hit }}
      run: |
        MODEL_URL="https://huggingface.co/Kreuzberg/layout-models/resolve/main/rtdetr/model.onnx"
        CACHE_DIR="$HOME/.cache/kreuzberg/layout"
        MODEL_DIR="$CACHE_DIR/rtdetr"
        MODEL_FILE="$MODEL_DIR/model.onnx"
        PARTIAL_FILE="$MODEL_FILE.partial"

        # Skip only when the exact cache entry was restored AND it actually
        # contains this model; a cache saved for a different model set would
        # otherwise leave the file missing.
        if [ "$CACHE_HIT" = "true" ] && [ -f "$MODEL_FILE" ]; then
          echo "RT-DETR model restored from cache, skipping download"
          exit 0
        fi

        echo "Downloading RT-DETR layout detection model from $MODEL_URL"
        mkdir -p "$MODEL_DIR"

        downloaded=false
        for attempt in 1 2 3; do
          if [ "$attempt" -gt 1 ]; then
            # Exponential backoff: 5s before attempt 2, 15s before attempt 3.
            backoff=$((5 * 3 ** (attempt - 2)))
            echo "Retry attempt $attempt/3 after ${backoff}s backoff..."
            sleep "$backoff"
          fi

          # Download to a temporary name so that an interrupted transfer or a
          # stale file from a fallback cache is never mistaken for success.
          if curl -f -L --progress-bar --connect-timeout 30 --max-time 600 \
            -o "$PARTIAL_FILE" "$MODEL_URL"; then
            mv -f "$PARTIAL_FILE" "$MODEL_FILE"
            downloaded=true
            echo "RT-DETR model downloaded successfully ($(du -h "$MODEL_FILE" | cut -f1))"
            break
          fi
        done

        if [ "$downloaded" != "true" ]; then
          rm -f "$PARTIAL_FILE"
          echo "ERROR: Failed to download RT-DETR model after 3 attempts"
          exit 1
        fi

    - name: Verify RT-DETR SHA256
      if: contains(inputs.models, 'rtdetr')
      shell: bash
      run: |
        MODEL_FILE="$HOME/.cache/kreuzberg/layout/rtdetr/model.onnx"
        EXPECTED="3bf2fb0ee6df87435b7ae47f0f3930ec3dc97ec56fd824acc6d57bc7a6b89ef2"

        # sha256sum is not available on every runner image; fall back to
        # shasum when it is missing.
        if command -v sha256sum &>/dev/null; then
          ACTUAL=$(sha256sum "$MODEL_FILE" | awk '{print $1}')
        else
          ACTUAL=$(shasum -a 256 "$MODEL_FILE" | awk '{print $1}')
        fi

        if [ "$ACTUAL" != "$EXPECTED" ]; then
          echo "ERROR: RT-DETR SHA256 mismatch"
          echo "  Expected: $EXPECTED"
          echo "  Actual:   $ACTUAL"
          # Remove the bad file so it is not picked up by later steps.
          rm -f "$MODEL_FILE"
          exit 1
        fi
        echo "RT-DETR SHA256 verified"

    - name: Download TATR model (tatr)
      if: contains(inputs.models, 'tatr')
      shell: bash
      env:
        # Empty when the cache step was skipped or nothing was restored.
        CACHE_HIT: ${{ steps.cache-models.outputs.cache-hit }}
      run: |
        MODEL_URL="https://huggingface.co/Kreuzberg/layout-models/resolve/main/tatr/model.onnx"
        CACHE_DIR="$HOME/.cache/kreuzberg/layout"
        MODEL_DIR="$CACHE_DIR/tatr"
        MODEL_FILE="$MODEL_DIR/tatr.onnx"
        PARTIAL_FILE="$MODEL_FILE.partial"

        # Skip only when the exact cache entry was restored AND it actually
        # contains this model; a cache saved for a different model set would
        # otherwise leave the file missing.
        if [ "$CACHE_HIT" = "true" ] && [ -f "$MODEL_FILE" ]; then
          echo "TATR model restored from cache, skipping download"
          exit 0
        fi

        echo "Downloading TATR table recognition model from $MODEL_URL"
        mkdir -p "$MODEL_DIR"

        downloaded=false
        for attempt in 1 2 3; do
          if [ "$attempt" -gt 1 ]; then
            # Exponential backoff: 5s before attempt 2, 15s before attempt 3.
            backoff=$((5 * 3 ** (attempt - 2)))
            echo "Retry attempt $attempt/3 after ${backoff}s backoff..."
            sleep "$backoff"
          fi

          # Download to a temporary name so that an interrupted transfer or a
          # stale file from a fallback cache is never mistaken for success.
          if curl -f -L --progress-bar --connect-timeout 30 --max-time 600 \
            -o "$PARTIAL_FILE" "$MODEL_URL"; then
            mv -f "$PARTIAL_FILE" "$MODEL_FILE"
            downloaded=true
            echo "TATR model downloaded successfully ($(du -h "$MODEL_FILE" | cut -f1))"
            break
          fi
        done

        if [ "$downloaded" != "true" ]; then
          rm -f "$PARTIAL_FILE"
          echo "ERROR: Failed to download TATR model after 3 attempts"
          exit 1
        fi

    - name: Verify TATR SHA256
      if: contains(inputs.models, 'tatr')
      shell: bash
      run: |
        MODEL_FILE="$HOME/.cache/kreuzberg/layout/tatr/tatr.onnx"
        EXPECTED="c11f4033da75e9c4d41c403ef356e89caa0a37a7d111b55461e7d5ba856bb6b6"

        # sha256sum is not available on every runner image; fall back to
        # shasum when it is missing.
        if command -v sha256sum &>/dev/null; then
          ACTUAL=$(sha256sum "$MODEL_FILE" | awk '{print $1}')
        else
          ACTUAL=$(shasum -a 256 "$MODEL_FILE" | awk '{print $1}')
        fi

        if [ "$ACTUAL" != "$EXPECTED" ]; then
          echo "ERROR: TATR SHA256 mismatch"
          echo "  Expected: $EXPECTED"
          echo "  Actual:   $ACTUAL"
          # Remove the bad file so it is not picked up by later steps.
          rm -f "$MODEL_FILE"
          exit 1
        fi
        echo "TATR SHA256 verified"

    - name: Verify downloaded models
      id: verify-models
      shell: bash
      run: |
        CACHE_DIR="$HOME/.cache/kreuzberg/layout"
        AVAILABLE_MODELS=()
        TOTAL_SIZE=0

        echo "Checking for layout models in $CACHE_DIR"

        if [ -f "$CACHE_DIR/rtdetr/model.onnx" ]; then
          # tr strips the padding some wc implementations put around the count.
          SIZE=$(wc -c < "$CACHE_DIR/rtdetr/model.onnx" | tr -d ' ')
          AVAILABLE_MODELS+=("rtdetr")
          TOTAL_SIZE=$((TOTAL_SIZE + SIZE))
          # numfmt may be unavailable on the runner; fall back to raw bytes.
          echo "  ✓ RT-DETR model: $(numfmt --to=iec-i --suffix=B "$SIZE" 2>/dev/null || echo "$SIZE" bytes)"
        fi

        if [ -f "$CACHE_DIR/tatr/tatr.onnx" ]; then
          SIZE=$(wc -c < "$CACHE_DIR/tatr/tatr.onnx" | tr -d ' ')
          AVAILABLE_MODELS+=("tatr")
          TOTAL_SIZE=$((TOTAL_SIZE + SIZE))
          echo "  ✓ TATR model: $(numfmt --to=iec-i --suffix=B "$SIZE" 2>/dev/null || echo "$SIZE" bytes)"
        fi

        if [ ${#AVAILABLE_MODELS[@]} -eq 0 ]; then
          echo "ERROR: No layout models found in cache directory after download"
          echo "available-models=" >> "$GITHUB_OUTPUT"
          exit 1
        fi

        # Join the array with commas; IFS is changed only inside the subshell.
        AVAILABLE_MODELS_STR=$(IFS=, ; echo "${AVAILABLE_MODELS[*]}")
        echo "✓ Total cached layout models: ${#AVAILABLE_MODELS[@]} ($(numfmt --to=iec-i --suffix=B "$TOTAL_SIZE" 2>/dev/null || echo "$TOTAL_SIZE" bytes))"
        echo "available-models=$AVAILABLE_MODELS_STR" >> "$GITHUB_OUTPUT"

    - name: Set cache directory output
      id: set-outputs
      shell: bash
      run: |
        CACHE_DIR="$HOME/.cache/kreuzberg/layout"
        echo "cache-dir=$CACHE_DIR" >> "$GITHUB_OUTPUT"
        # Exported for subsequent steps of the calling job; note this is the
        # parent of the layout directory, not the layout directory itself.
        echo "KREUZBERG_CACHE_DIR=$HOME/.cache/kreuzberg" >> "$GITHUB_ENV"
        echo "Layout model cache configured at: $CACHE_DIR"