Files
fil/tools/benchmark-harness/scripts/download_omnidocbench.sh

64 lines
2.6 KiB
Bash
Raw Normal View History

2026-06-01 23:40:55 +02:00
#!/usr/bin/env bash
# Download the OmniDocBench dataset (opendatalab/OmniDocBench) from HuggingFace.
#
# Usage:
# ./download_omnidocbench.sh [TARGET_DIR]
#
# Default target: tools/benchmark-harness/datasets/omnidocbench
#
# Requirements: curl, unzip (standard on macOS/Linux)
# No HuggingFace account or API key needed (public dataset).
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
DEFAULT_DIR="${SCRIPT_DIR}/../datasets/omnidocbench"
TARGET_DIR="${1:-$DEFAULT_DIR}"
HF_BASE="https://huggingface.co/datasets/opendatalab/OmniDocBench/resolve/main"
mkdir -p "$TARGET_DIR"
# Download the main annotation file (65 MB)
if [ -f "$TARGET_DIR/OmniDocBench.json" ]; then
echo "OmniDocBench.json already exists, skipping"
else
echo "Downloading OmniDocBench.json (65 MB)..."
curl -L -o "$TARGET_DIR/OmniDocBench.json" "$HF_BASE/OmniDocBench.json"
fi
# Download images directory via HF CLI if available, otherwise use git-lfs clone
if [ -d "$TARGET_DIR/images" ] && [ "$(find "$TARGET_DIR/images" -maxdepth 1 -type f 2>/dev/null | wc -l)" -gt 100 ]; then
echo "images/ directory already populated ($(find "$TARGET_DIR/images" -maxdepth 1 -type f | wc -l) files), skipping"
else
if command -v huggingface-cli &>/dev/null; then
echo "Downloading full dataset via huggingface-cli..."
huggingface-cli download opendatalab/OmniDocBench \
--repo-type dataset \
--local-dir "$TARGET_DIR" \
--include "images/*" "ori_pdfs/*" "OmniDocBench.json"
elif command -v git-lfs &>/dev/null || git lfs version &>/dev/null 2>&1; then
echo "Downloading via git-lfs clone..."
TEMP_CLONE="$(mktemp -d)"
git clone --depth 1 "https://huggingface.co/datasets/opendatalab/OmniDocBench" "$TEMP_CLONE"
cd "$TEMP_CLONE" && git lfs pull
cp -r "$TEMP_CLONE/images" "$TARGET_DIR/" 2>/dev/null || true
cp -r "$TEMP_CLONE/ori_pdfs" "$TARGET_DIR/" 2>/dev/null || true
rm -rf "$TEMP_CLONE"
else
echo "ERROR: Need either huggingface-cli or git-lfs to download images."
echo ""
echo "Install one of:"
echo " pip install huggingface-hub # then: huggingface-cli"
echo " brew install git-lfs # then: git lfs install"
exit 1
fi
fi
# Summary
echo ""
echo "OmniDocBench downloaded to: $TARGET_DIR"
echo " Annotations: $(wc -c <"$TARGET_DIR/OmniDocBench.json" | tr -d ' ') bytes"
[ -d "$TARGET_DIR/images" ] && echo " Images: $(find "$TARGET_DIR/images" -maxdepth 1 -type f | wc -l | tr -d ' ') files"
[ -d "$TARGET_DIR/ori_pdfs" ] && echo " PDFs: $(find "$TARGET_DIR/ori_pdfs" -maxdepth 1 -type f | wc -l | tr -d ' ') files"