64 lines
2.6 KiB
Bash
Executable File
64 lines
2.6 KiB
Bash
Executable File
#!/usr/bin/env bash
|
|
# Download the OmniDocBench dataset (opendatalab/OmniDocBench) from HuggingFace.
|
|
#
|
|
# Usage:
|
|
# ./download_omnidocbench.sh [TARGET_DIR]
|
|
#
|
|
# Default target: tools/benchmark-harness/datasets/omnidocbench
|
|
#
|
|
# Requirements: curl, plus either huggingface-cli or git with git-lfs (for images/PDFs)
|
|
# No HuggingFace account or API key needed (public dataset).
|
|
|
|
# Strict mode: abort on command failure, on unset variables, and on a failure
# in any stage of a pipeline.
set -o errexit -o nounset -o pipefail

# Base URL for fetching individual files of the dataset repository.
HF_BASE="https://huggingface.co/datasets/opendatalab/OmniDocBench/resolve/main"

# Absolute path of the directory containing this script.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
DEFAULT_DIR="${SCRIPT_DIR}/../datasets/omnidocbench"

# The first positional argument, when given and non-empty, overrides the
# default destination.
TARGET_DIR="${1:-$DEFAULT_DIR}"

mkdir -p "$TARGET_DIR"
|
|
|
|
# Download the main annotation file (65 MB) unless it is already present.
# Reads TARGET_DIR (destination directory) and HF_BASE (repository base URL).
annotations_file="$TARGET_DIR/OmniDocBench.json"
if [ -f "$annotations_file" ]; then
  echo "OmniDocBench.json already exists, skipping"
else
  echo "Downloading OmniDocBench.json (65 MB)..."
  # --fail: make HTTP errors (404, 5xx) a non-zero exit instead of saving the
  # error page as if it were the dataset.
  # Download to a ".part" file and rename only on success, so an interrupted or
  # failed transfer never leaves a file that the existence check above would
  # accept on the next run.
  if ! curl --fail --location --output "$annotations_file.part" "$HF_BASE/OmniDocBench.json"; then
    rm -f -- "$annotations_file.part"
    echo "ERROR: failed to download $HF_BASE/OmniDocBench.json" >&2
    exit 1
  fi
  mv -- "$annotations_file.part" "$annotations_file"
fi
|
|
|
|
# Download the images/ and ori_pdfs/ directories into TARGET_DIR.
# Skipped when images/ already holds more than 100 top-level files. Otherwise
# huggingface-cli is used if installed, with a shallow git + git-lfs clone as
# the fallback. Exits 1 when neither tool is available.
image_count=0
if [ -d "$TARGET_DIR/images" ]; then
  # The count is only a "looks populated" heuristic, so a find error must not
  # abort the script under errexit/pipefail; hence the `|| true`.
  image_count="$({ find "$TARGET_DIR/images" -maxdepth 1 -type f 2>/dev/null || true; } | wc -l | tr -d ' ')"
fi

if [ "$image_count" -gt 100 ]; then
  echo "images/ directory already populated ($image_count files), skipping"
elif command -v huggingface-cli >/dev/null 2>&1; then
  echo "Downloading full dataset via huggingface-cli..."
  huggingface-cli download opendatalab/OmniDocBench \
    --repo-type dataset \
    --local-dir "$TARGET_DIR" \
    --include "images/*" "ori_pdfs/*" "OmniDocBench.json"
elif command -v git-lfs >/dev/null 2>&1 || git lfs version >/dev/null 2>&1; then
  echo "Downloading via git-lfs clone..."
  TEMP_CLONE="$(mktemp -d)"
  # Remove the scratch clone on every exit path, including a failed clone or
  # pull that terminates the script via errexit.
  trap 'rm -rf -- "$TEMP_CLONE"' EXIT
  git clone --depth 1 "https://huggingface.co/datasets/opendatalab/OmniDocBench" "$TEMP_CLONE"
  # `git -C` instead of `cd`: the script's working directory must not change,
  # otherwise a relative TARGET_DIR would resolve inside the scratch clone and
  # the copies below would land in the wrong place.
  git -C "$TEMP_CLONE" lfs pull
  # images/ is the whole point of this step, so its absence or a failed copy
  # is fatal rather than silently ignored.
  if [ ! -d "$TEMP_CLONE/images" ]; then
    echo "ERROR: cloned repository contains no images/ directory." >&2
    exit 1
  fi
  cp -r "$TEMP_CLONE/images" "$TARGET_DIR/"
  # ori_pdfs/ is copied only when the clone provides it.
  if [ -d "$TEMP_CLONE/ori_pdfs" ]; then
    cp -r "$TEMP_CLONE/ori_pdfs" "$TARGET_DIR/"
  fi
  rm -rf -- "$TEMP_CLONE"
  trap - EXIT
else
  # Diagnostics go to stderr so they are not mistaken for normal output.
  {
    echo "ERROR: Need either huggingface-cli or git-lfs to download images."
    echo ""
    echo "Install one of:"
    echo " pip install huggingface-hub # then: huggingface-cli"
    echo " brew install git-lfs # then: git lfs install"
  } >&2
  exit 1
fi
|
|
|
|
#######################################
# Print a summary of what is present in a dataset directory.
# Arguments: $1 - dataset directory to inspect
# Outputs:   summary lines on stdout; a warning on stderr when the
#            annotation file is missing
# Returns:   0, regardless of which optional directories exist
#######################################
print_summary() {
  local dir=$1
  local annotations="$dir/OmniDocBench.json"

  echo ""
  echo "OmniDocBench downloaded to: $dir"
  if [ -f "$annotations" ]; then
    echo " Annotations: $(wc -c <"$annotations" | tr -d ' ') bytes"
  else
    echo "WARNING: $annotations not found" >&2
  fi
  # Explicit `if` rather than `[ -d ... ] && echo ...`: as the last command of
  # the script, a false test in an && list would become the script's exit
  # status, reporting failure whenever an optional directory is absent.
  if [ -d "$dir/images" ]; then
    echo " Images: $(find "$dir/images" -maxdepth 1 -type f | wc -l | tr -d ' ') files"
  fi
  if [ -d "$dir/ori_pdfs" ]; then
    echo " PDFs: $(find "$dir/ori_pdfs" -maxdepth 1 -type f | wc -l | tr -d ' ') files"
  fi
  return 0
}

# Summary
print_summary "$TARGET_DIR"
|