Files
fil/tools/benchmark-harness/scripts/pandoc_extract.sh

68 lines
1.7 KiB
Bash
Raw Normal View History

2026-06-01 23:40:55 +02:00
#!/usr/bin/env bash
# Argument handling for pandoc_extract.sh.
#
# Usage: pandoc_extract.sh [--format=markdown|plaintext] <file_path>
#
# Sets:
#   FORMAT     requested output format (defaults to "markdown")
#   FILE_PATH  path of the document to convert
# Exit codes:
#   1   no file path given, or the path is not a regular file
#   64  --format value is neither "markdown" nor "plaintext"
set -euo pipefail

FORMAT="markdown"
FILE_PATH=""

# Every argument that is not --format=<value> is taken as the input path;
# if several are supplied, the last one wins.
for arg; do
  if [[ "$arg" == --format=* ]]; then
    FORMAT="${arg#--format=}"
  else
    FILE_PATH="$arg"
  fi
done

# A file path is mandatory.
[[ -n "$FILE_PATH" ]] || {
  printf '%s\n' "Usage: pandoc_extract.sh [--format=markdown|plaintext] <file_path>" >&2
  exit 1
}

# Only the two known format names are accepted.
case "$FORMAT" in
  markdown | plaintext) ;;
  *)
    printf '%s\n' "Error: --format must be 'markdown' or 'plaintext'; got '$FORMAT'" >&2
    exit 64
    ;;
esac

# The input must exist and be a regular file.
[[ -f "$FILE_PATH" ]] || {
  printf '%s\n' "Error: File not found: $FILE_PATH" >&2
  exit 1
}
# Translate the user-facing format name into the pandoc writer name:
# "markdown" selects gfm, anything else selects plain.
case "$FORMAT" in
  markdown) PANDOC_TO="gfm" ;;
  *) PANDOC_TO="plain" ;;
esac
# Convert FILE_PATH with pandoc and time the conversion.
#
# Reads:  FILE_PATH, PANDOC_TO
# Writes: CONTENT      pandoc's stdout; empty (or partial) when pandoc fails
#                      or is killed by the 60s timeout
#         DURATION_MS  wall-clock time of the conversion in milliseconds

# Print the current time as integer nanoseconds since the epoch.
# `date +%s%N` is not portable: a date(1) without %N support emits a literal
# "N", which would break the arithmetic below, so the value is validated and
# coarser clocks are used as fallbacks.
now_ns() {
  local stamp
  stamp=$(date +%s%N 2>/dev/null) || stamp=""
  if [[ "$stamp" =~ ^[0-9]+$ ]]; then
    printf '%s\n' "$stamp"
  elif [[ -n "${EPOCHREALTIME:-}" ]]; then
    # bash >= 5: "<seconds><sep><6-digit microseconds>"; the separator depends
    # on the locale, so drop every non-digit and scale microseconds to ns.
    printf '%s000\n' "${EPOCHREALTIME//[!0-9]/}"
  else
    # Last resort: whole seconds only.
    printf '%s000000000\n' "$(date +%s)"
  fi
}

# A relative path starting with "-" would be parsed by pandoc as an option.
input_path="${FILE_PATH:-}"
if [[ "$input_path" == -* ]]; then
  input_path="./$input_path"
fi

# Build the command once; wrap it in whichever timeout tool is available
# (gtimeout is the GNU coreutils name on systems where it is prefixed).
pandoc_cmd=(pandoc "$input_path" "--to=${PANDOC_TO:-}" --wrap=none --strip-comments)
if command -v timeout >/dev/null 2>&1; then
  pandoc_cmd=(timeout 60s "${pandoc_cmd[@]}")
elif command -v gtimeout >/dev/null 2>&1; then
  pandoc_cmd=(gtimeout 60s "${pandoc_cmd[@]}")
fi

START=$(now_ns)
# Best effort by design: pandoc's stderr is discarded and a failure does not
# abort the script, so the caller always gets a result object.
CONTENT=$("${pandoc_cmd[@]}" 2>/dev/null || true)
END=$(now_ns)
DURATION_MS=$(((END - START) / 1000000))
# Print the extraction result on stdout as one JSON object with the keys
# "content", "metadata" (framework + output_format) and "_extraction_time_ms".
#
# Reads: CONTENT, FORMAT, DURATION_MS

# Print $1 escaped for use inside a JSON string literal (without the
# surrounding quotes). Escapes backslash, double quote and every control
# character below U+0020, which JSON does not allow unescaped.
json_escape() {
  # Byte-wise matching: only ASCII bytes are rewritten, so multi-byte UTF-8
  # sequences pass through untouched.
  local LC_ALL=C
  local text=$1 code oct raw esc
  text=${text//\\/\\\\} # backslashes first so later escapes are not doubled
  text=${text//\"/\\\"}
  text=${text//$'\n'/\\n}
  text=${text//$'\r'/\\r}
  text=${text//$'\t'/\\t}
  # Remaining control characters are rare; only scan for them when present.
  if [[ "$text" == *[[:cntrl:]]* ]]; then
    for code in {1..8} 11 12 {14..31}; do
      printf -v oct '%03o' "$code"
      printf -v raw '%b' "\\0$oct"
      printf -v esc '\\u%04x' "$code"
      text=${text//$raw/$esc}
    done
  fi
  printf '%s' "$text"
}

# Print the result object with jq (jq's default pretty-printed layout).
# Arguments: $1 content, $2 output format, $3 duration in ms (JSON number).
# The content is streamed on stdin rather than passed with --arg, because a
# single command-line argument is size-limited by the OS (e.g. 128 KiB on
# Linux) and extracted documents can be larger than that.
emit_json_jq() {
  printf '%s' "$1" | jq -Rs \
    --arg fmt "$2" \
    --argjson duration "$3" \
    '{
      content: .,
      metadata: {framework: "pandoc", output_format: $fmt},
      _extraction_time_ms: $duration
    }'
}

# Print the result object as one compact line without needing jq.
# Arguments: $1 content, $2 output format, $3 duration in ms.
emit_json_plain() {
  printf '{"content":"%s","metadata":{"framework":"pandoc","output_format":"%s"},"_extraction_time_ms":%s}\n' \
    "$(json_escape "$1")" "$2" "$3"
}

if command -v jq >/dev/null 2>&1; then
  emit_json_jq "${CONTENT:-}" "${FORMAT:-}" "${DURATION_MS:-}"
else
  emit_json_plain "${CONTENT:-}" "${FORMAT:-}" "${DURATION_MS:-}"
fi