213 lines
7.2 KiB
Bash
213 lines
7.2 KiB
Bash
#!/usr/bin/env bash
# Generate markdown and text ground truth for docbook, typst, and fictionbook formats
# using pandoc + sanitize_pandoc_gt.py, then create benchmark fixture JSON files.
#
# Requires `pandoc` and `python3` on PATH. All relative paths used below are
# resolved against the repository root, which is taken to be three directories
# above the directory containing this script.
set -euo pipefail

REPO_ROOT="$(cd "$(dirname "$0")/../../.." && pwd)"
SANITIZE="$REPO_ROOT/tools/benchmark-harness/scripts/sanitize_pandoc_gt.py"
FIXTURES_DIR="$REPO_ROOT/tools/benchmark-harness/fixtures"

# Fail fast with a readable message. The conversion pipelines below discard
# pandoc's stderr, so a missing tool would otherwise abort the run silently.
for required_tool in pandoc python3; do
  if ! command -v "$required_tool" >/dev/null 2>&1; then
    echo "ERROR: required tool '$required_tool' not found on PATH" >&2
    exit 1
  fi
done
if [ ! -f "$SANITIZE" ]; then
  echo "ERROR: sanitizer script not found: $SANITIZE" >&2
  exit 1
fi

# Fixture JSON files are written by shell redirection, which does not create
# missing parent directories.
mkdir -p "$FIXTURES_DIR"

cd "$REPO_ROOT"

echo "=== Step 1: Generate MD ground truth via pandoc + sanitize ==="

# --- DocBook ---
# Every DocBook source (four accepted extensions) is converted to GitHub-flavoured
# markdown, piped through the sanitizer, and stored under the file's stem.
echo "--- DocBook ---"
for src in test_documents/docbook/*.{dbk,docbook,docbook4,docbook5}; do
  # A pattern with no matches stays literal; skip anything that is not a file.
  [ -f "$src" ] || continue
  src_base=${src##*/}
  stem=${src_base%.*}
  mkdir -p test_documents/ground_truth/docbook
  md_out="test_documents/ground_truth/docbook/${stem}.md"
  pandoc -f docbook -t gfm --wrap=none "$src" 2>/dev/null | python3 "$SANITIZE" >"$md_out"
  echo "docbook: $stem ($(wc -c <"$md_out") bytes)"
done

# --- Typst ---
# Convert each Typst source to sanitized GitHub-flavoured markdown.
echo "--- Typst ---"
for f in test_documents/typst/*.typ; do
  # A pattern with no matches stays literal; skip anything that is not a file.
  [ -f "$f" ] || continue
  name=$(basename "$f" .typ)
  # Typst GT goes in both typ/ (matching existing convention) and typst/
  typ_md="test_documents/ground_truth/typ/${name}.md"
  typst_md="test_documents/ground_truth/typst/${name}.md"
  mkdir -p test_documents/ground_truth/typ test_documents/ground_truth/typst
  # Run the conversion once and copy the result, instead of invoking the same
  # pandoc + sanitize pipeline a second time for the mirror directory.
  pandoc -f typst -t gfm --wrap=none "$f" 2>/dev/null | python3 "$SANITIZE" >"$typ_md"
  cp "$typ_md" "$typst_md"
  echo "typst: $name ($(wc -c <"$typ_md") bytes)"
done

# --- FictionBook (fb2) ---
# Existing markdown ground truth is kept as-is; only missing files are generated.
echo "--- FictionBook ---"
for f in test_documents/fictionbook/*.fb2; do
  # A pattern with no matches stays literal; skip anything that is not a file.
  [ -f "$f" ] || continue
  name=$(basename "$f" .fb2)
  mkdir -p test_documents/ground_truth/fb2
  existing="test_documents/ground_truth/fb2/${name}.md"
  if [ ! -f "$existing" ]; then
    # Write to a temporary file and rename only on success. Redirecting straight
    # into "$existing" would leave an empty or partial file behind when the
    # pipeline fails, and every later run would then treat it as already
    # generated and never repair it.
    tmp_md="${existing}.tmp.$$"
    if pandoc -f fb2 -t gfm --wrap=none "$f" 2>/dev/null | python3 "$SANITIZE" >"$tmp_md"; then
      mv -f "$tmp_md" "$existing"
    else
      rm -f "$tmp_md"
      echo "ERROR: failed to generate $existing from $f" >&2
      exit 1
    fi
    echo "fb2: $name (new, $(wc -c <"$existing") bytes)"
  else
    echo "fb2: $name (exists, $(wc -c <"$existing") bytes)"
  fi
done

echo ""
echo "=== Step 2: Generate text GT from MD GT ==="

# For each .md GT file, generate .txt if missing
for md_file in test_documents/ground_truth/docbook/*.md test_documents/ground_truth/typ/*.md test_documents/ground_truth/fb2/*.md; do
  # A pattern with no matches stays literal; skip anything that is not a file.
  [ -f "$md_file" ] || continue
  txt_file="${md_file%.md}.txt"
  if [ ! -f "$txt_file" ]; then
    # Convert into a temporary file and rename only on success, so a failed
    # pandoc run cannot leave an empty .txt that later runs would skip as
    # "already present".
    tmp_txt="${txt_file}.tmp.$$"
    if pandoc -f gfm -t plain --wrap=none "$md_file" >"$tmp_txt"; then
      mv -f "$tmp_txt" "$txt_file"
    else
      rm -f "$tmp_txt"
      echo "ERROR: pandoc failed converting $md_file to plain text" >&2
      exit 1
    fi
    echo "text: $(basename "$txt_file") (new, $(wc -c <"$txt_file") bytes)"
  fi
done

echo ""
echo "=== Step 3: Create fixture JSON files ==="

# Helper to create fixture JSON

#######################################
# Escape a string for use inside a JSON string literal.
# Only backslashes and double quotes are escaped; control characters are not
# handled.
# Arguments: $1 - raw string
# Outputs:   escaped string on stdout
#######################################
_json_escape() {
  # Backslashes must be doubled first so the ones added for quotes survive.
  printf '%s' "$1" | sed -e 's/\\/\\\\/g' -e 's/"/\\"/g'
}

#######################################
# Write one benchmark fixture JSON file describing a test document.
# Arguments:
#   $1 - document path, relative to the repository root
#   $2 - file_type value
#   $3 - text ground-truth path, relative to the repository root
#   $4 - markdown ground-truth path, relative to the repository root
#   $5 - path of the fixture JSON file to write
#   $6 - metadata.description value
#   $7 - metadata.category value
# Outputs:
#   "fixture: <file name>" on stdout when a fixture is written; a warning on
#   stderr when it is skipped.
# Returns:
#   0 both when the fixture is written and when it is skipped because the
#   text ground truth does not exist.
#######################################
create_fixture() {
  local doc_path="$1"
  local file_type="$2"
  local gt_text="$3"
  local gt_md="$4"
  local fixture_out="$5"
  local description="$6"
  local category="$7"

  # Without text ground truth there is nothing to describe. Skip explicitly
  # rather than expanding a never-assigned variable, which aborts the whole
  # script under `set -u`.
  if [ ! -f "$gt_text" ]; then
    echo "WARNING: text ground truth not found for ${doc_path} (${gt_text}); fixture skipped" >&2
    return 0
  fi

  # `wc -c` is portable. `stat -f %z` only works on BSD/macOS; GNU stat treats
  # -f as "filesystem status" and prints a multi-line report to stdout before
  # failing, which would end up inside the JSON. Some wc implementations pad
  # the count with blanks, hence the tr.
  local file_size
  file_size=$(wc -c <"$doc_path" | tr -d '[:space:]')

  # Fixture paths are written with a fixed "../../../" prefix in front of the
  # repository-relative paths.
  local rel_doc rel_text
  rel_doc=$(_json_escape "../../../${doc_path}")
  rel_text=$(_json_escape "../../../${gt_text}")

  # The markdown entry is optional: emitted only when that file exists.
  local md_entry=""
  if [ -f "$gt_md" ]; then
    local rel_md
    rel_md=$(_json_escape "../../../${gt_md}")
    printf -v md_entry '\n    "markdown_file": "%s",' "$rel_md"
  fi

  local esc_type esc_description esc_category
  esc_type=$(_json_escape "$file_type")
  esc_description=$(_json_escape "$description")
  esc_category=$(_json_escape "$category")

  cat >"$fixture_out" <<EOJSON
{
  "document": "${rel_doc}",
  "file_type": "${esc_type}",
  "file_size": ${file_size},
  "expected_frameworks": ["kreuzberg"],
  "metadata": {
    "description": "${esc_description}",
    "category": "${esc_category}"
  },
  "ground_truth": {
    "text_file": "${rel_text}",${md_entry}
    "source": "pandoc"
  }
}
EOJSON
  echo "fixture: $(basename "$fixture_out")"
}

# --- DocBook fixtures ---
# One fixture per DocBook source. The fixture file name is the document stem
# with hyphens turned into underscores, prefixed with "docbook_".
echo "--- DocBook fixtures ---"
for f in test_documents/docbook/*.{dbk,docbook,docbook4,docbook5}; do
  # A pattern with no matches stays literal; skip anything that is not a file.
  [ -f "$f" ] || continue
  doc_base=${f##*/}
  name=${doc_base%.*}
  ext=${doc_base##*.}
  gt_md="test_documents/ground_truth/docbook/${name}.md"
  gt_txt="test_documents/ground_truth/docbook/${name}.txt"

  # Determine file_type based on extension: only .dbk keeps its own type,
  # every other extension is reported as "docbook".
  if [ "$ext" = "dbk" ]; then
    ft="dbk"
  else
    ft="docbook"
  fi

  fixture_name="docbook_${name//-/_}.json"
  create_fixture "$f" "$ft" "$gt_txt" "$gt_md" "${FIXTURES_DIR}/${fixture_name}" "DocBook document: ${name}" "docbook"
done

# --- Typst fixtures (update existing to add markdown_file) ---
echo "--- Typst fixtures ---"
for f in test_documents/typst/*.typ; do
  # A pattern with no matches stays literal; skip anything that is not a file.
  [ -f "$f" ] || continue
  doc_base=${f##*/}
  name=${doc_base%.typ}
  gt_md="test_documents/ground_truth/typ/${name}.md"
  # Some txt files use name directly, some use typst_ prefix - check both,
  # preferring the prefixed file when it exists.
  if [ -f "test_documents/ground_truth/typ/typst_${name}.txt" ]; then
    gt_txt="test_documents/ground_truth/typ/typst_${name}.txt"
  else
    gt_txt="test_documents/ground_truth/typ/${name}.txt"
  fi

  fixture_name="typst_${name}.json"
  create_fixture "$f" "typ" "$gt_txt" "$gt_md" "${FIXTURES_DIR}/${fixture_name}" "Typst document: ${name}" "typst"
done

# --- FictionBook fixtures (update existing to add markdown_file) ---
echo "--- FictionBook fixtures ---"
for f in test_documents/fictionbook/*.fb2; do
  # A pattern with no matches stays literal; skip anything that is not a file.
  [ -f "$f" ] || continue
  doc_base=${f##*/}
  name=${doc_base%.fb2}
  gt_md="test_documents/ground_truth/fb2/${name}.md"
  # Some txt files use fb2_ prefix: the unprefixed file wins when present,
  # otherwise fall back to the prefixed name.
  if [ -f "test_documents/ground_truth/fb2/${name}.txt" ]; then
    gt_txt="test_documents/ground_truth/fb2/${name}.txt"
  else
    gt_txt="test_documents/ground_truth/fb2/fb2_${name}.txt"
  fi

  fixture_name="fb2_${name}.json"
  create_fixture "$f" "fb2" "$gt_txt" "$gt_md" "${FIXTURES_DIR}/${fixture_name}" "FictionBook document: ${name}" "fictionbook"
done

echo ""
echo "=== Step 4: Validate ==="

# Report every markdown ground-truth file of at most one byte and print how
# many were found. This only warns; it does not change the exit status.
echo "--- Verifying GT files are non-empty ---"
empty_count=0
for f in test_documents/ground_truth/docbook/*.md test_documents/ground_truth/typ/*.md test_documents/ground_truth/fb2/*.md; do
  # A pattern with no matches stays literal; skip anything that is not a file.
  [ -f "$f" ] || continue
  # Arithmetic expansion drops any blank padding wc puts around the count.
  size=$(($(wc -c <"$f")))
  if ((size > 1)); then
    continue
  fi
  echo "WARNING: $f is empty/near-empty ($size bytes)"
  empty_count=$((empty_count + 1))
done
echo "Empty/near-empty GT files: $empty_count"

echo ""
echo "=== Summary ==="
# Count the ground-truth files per format. find's stderr is discarded so a
# pattern with no matches is reported as 0 instead of printing an error.
echo "DocBook MD GT files: $(find test_documents/ground_truth/docbook/*.md -maxdepth 1 2>/dev/null | wc -l | tr -d ' ')"
echo "DocBook TXT GT files: $(find test_documents/ground_truth/docbook/*.txt -maxdepth 1 2>/dev/null | wc -l | tr -d ' ')"
echo "Typst MD GT files: $(find test_documents/ground_truth/typ/*.md -maxdepth 1 2>/dev/null | wc -l | tr -d ' ')"
echo "Typst TXT GT files: $(find test_documents/ground_truth/typ/*.txt -maxdepth 1 2>/dev/null | wc -l | tr -d ' ')"
echo "FB2 MD GT files: $(find test_documents/ground_truth/fb2/*.md -maxdepth 1 2>/dev/null | wc -l | tr -d ' ')"
echo "FB2 TXT GT files: $(find test_documents/ground_truth/fb2/*.txt -maxdepth 1 2>/dev/null | wc -l | tr -d ' ')"
echo ""
echo "Fixture files created/updated:"
# ls exits non-zero when any of the patterns matches nothing, and nothing in
# this script writes dbk_*.json. As the final command, that status would become
# the script's exit status (and trip `set -e`), so a purely informational
# listing must not be allowed to fail the run.
ls -1 "${FIXTURES_DIR}"/docbook_*.json "${FIXTURES_DIR}"/typst_*.json "${FIXTURES_DIR}"/fb2_*.json "${FIXTURES_DIR}"/dbk_*.json 2>/dev/null || true