This commit is contained in:
63
tools/benchmark-harness/scripts/download_omnidocbench.sh
Executable file
63
tools/benchmark-harness/scripts/download_omnidocbench.sh
Executable file
@@ -0,0 +1,63 @@
|
||||
#!/usr/bin/env bash
|
||||
# Download the OmniDocBench dataset (opendatalab/OmniDocBench) from HuggingFace.
|
||||
#
|
||||
# Usage:
|
||||
# ./download_omnidocbench.sh [TARGET_DIR]
|
||||
#
|
||||
# Default target: tools/benchmark-harness/datasets/omnidocbench
|
||||
#
|
||||
# Requirements: curl, plus either huggingface-cli or git-lfs (for images/ and ori_pdfs/)
|
||||
# No HuggingFace account or API key needed (public dataset).
|
||||
|
||||
# Strict mode: stop on the first failing command, on use of an unset variable,
# and when any stage of a pipeline fails.
set -euo pipefail

# Absolute path of the directory containing this script. CDPATH is cleared for
# the cd because, when CDPATH is exported and the dirname is relative, cd prints
# the directory it resolved and that extra line would end up in SCRIPT_DIR.
SCRIPT_DIR="$(CDPATH='' cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)"

# Destination directory: the first positional argument when it is given and
# non-empty, otherwise datasets/omnidocbench one level above the script's
# directory.
DEFAULT_DIR="${SCRIPT_DIR}/../datasets/omnidocbench"
TARGET_DIR="${1:-$DEFAULT_DIR}"

# Base URL from which individual files of the dataset repository are fetched.
HF_BASE="https://huggingface.co/datasets/opendatalab/OmniDocBench/resolve/main"

mkdir -p -- "$TARGET_DIR"

# Turn TARGET_DIR into an absolute path without ".." segments now that it
# exists, so every later step works regardless of the current working
# directory and the final summary prints a clean location.
TARGET_DIR="$(CDPATH='' cd -- "$TARGET_DIR" && pwd)"
|
||||
|
||||
# Download the main annotation file (65 MB).
#
# The file is fetched under a temporary ".part" name and renamed only after
# curl succeeds, so an interrupted or failed transfer is never mistaken for a
# complete download by the existence check on a later run.
ANNOTATIONS_FILE="$TARGET_DIR/OmniDocBench.json"

# -s (exists and is non-empty) rather than -f: a zero-byte file left behind by
# an earlier failed attempt must not count as "already downloaded".
if [[ -s "$ANNOTATIONS_FILE" ]]; then
  echo "OmniDocBench.json already exists, skipping"
else
  echo "Downloading OmniDocBench.json (65 MB)..."
  # --fail makes curl exit non-zero on an HTTP error response instead of
  # saving the server's error page as if it were the dataset.
  if ! curl --fail --location --output "${ANNOTATIONS_FILE}.part" \
    "$HF_BASE/OmniDocBench.json"; then
    rm -f -- "${ANNOTATIONS_FILE}.part"
    echo "ERROR: failed to download $HF_BASE/OmniDocBench.json" >&2
    exit 1
  fi
  mv -- "${ANNOTATIONS_FILE}.part" "$ANNOTATIONS_FILE"
fi
|
||||
|
||||
# Download the images/ and ori_pdfs/ directories.
#
# The step is skipped when images/ already holds more than 100 top-level files.
# Otherwise huggingface-cli is used when it is on PATH, falling back to a
# shallow git clone followed by `git lfs pull`. If neither tool is available
# the script prints install hints to stderr and exits with status 1.
IMAGE_COUNT=0
if [[ -d "$TARGET_DIR/images" ]]; then
  # `|| true` keeps a partial find failure from aborting the script under
  # pipefail; tr strips the left padding that BSD/macOS wc adds to its count.
  IMAGE_COUNT="$({ find "$TARGET_DIR/images" -maxdepth 1 -type f 2>/dev/null || true; } |
    wc -l | tr -d ' ')"
fi

if ((IMAGE_COUNT > 100)); then
  echo "images/ directory already populated (${IMAGE_COUNT} files), skipping"
elif command -v huggingface-cli >/dev/null 2>&1; then
  echo "Downloading full dataset via huggingface-cli..."
  huggingface-cli download opendatalab/OmniDocBench \
    --repo-type dataset \
    --local-dir "$TARGET_DIR" \
    --include "images/*" "ori_pdfs/*" "OmniDocBench.json"
elif command -v git-lfs >/dev/null 2>&1 || git lfs version >/dev/null 2>&1; then
  echo "Downloading via git-lfs clone..."
  # Run in a subshell so the EXIT trap is scoped to this step: the temporary
  # clone is removed whether the step succeeds or a command fails under set -e.
  (
    TEMP_CLONE="$(mktemp -d)"
    trap 'rm -rf -- "$TEMP_CLONE"' EXIT

    git clone --depth 1 "https://huggingface.co/datasets/opendatalab/OmniDocBench" "$TEMP_CLONE"
    # `git -C` instead of `cd`: the working directory must not change, or a
    # relative TARGET_DIR would resolve inside the temporary clone below.
    git -C "$TEMP_CLONE" lfs pull

    for SUBDIR in images ori_pdfs; do
      if [[ -d "$TEMP_CLONE/$SUBDIR" ]]; then
        # A copy error (full disk, permissions) is fatal rather than ignored.
        cp -r -- "$TEMP_CLONE/$SUBDIR" "$TARGET_DIR/"
      else
        echo "WARNING: $SUBDIR/ not found in the cloned repository, skipping" >&2
      fi
    done
  )
else
  {
    echo "ERROR: Need either huggingface-cli or git-lfs to download images."
    echo ""
    echo "Install one of:"
    echo " pip install huggingface-hub # then: huggingface-cli"
    echo " brew install git-lfs # then: git lfs install"
  } >&2
  exit 1
fi
|
||||
|
||||
# Summary: report where the dataset was written and how much of it is present.

#######################################
# Count the regular files directly inside a directory (non-recursive).
# Arguments: $1 - directory to inspect
# Outputs:   the count on stdout, without the padding BSD wc adds
#######################################
count_top_level_files() {
  find "$1" -maxdepth 1 -type f | wc -l | tr -d ' '
}

echo ""
echo "OmniDocBench downloaded to: $TARGET_DIR"
if [[ -f "$TARGET_DIR/OmniDocBench.json" ]]; then
  echo " Annotations: $(wc -c <"$TARGET_DIR/OmniDocBench.json" | tr -d ' ') bytes"
fi

# Plain `if` statements instead of `[ -d ... ] && echo ...`: as the last
# command of the script, a false test in that form would become the script's
# exit status and report failure after a successful download.
if [[ -d "$TARGET_DIR/images" ]]; then
  echo " Images: $(count_top_level_files "$TARGET_DIR/images") files"
fi
if [[ -d "$TARGET_DIR/ori_pdfs" ]]; then
  echo " PDFs: $(count_top_level_files "$TARGET_DIR/ori_pdfs") files"
fi
|
||||
Reference in New Issue
Block a user