Files
fil/tools/benchmark-harness/scripts/sanitize_pandoc_gt.py

167 lines
5.2 KiB
Python
Raw Permalink Normal View History

2026-06-01 23:40:55 +02:00
#!/usr/bin/env python3
"""Sanitize pandoc-generated markdown ground truth files.

Removes common pandoc artifacts that don't represent actual document structure.

Usage:
    # Single file (in-place):
    python sanitize_pandoc_gt.py input.md

    # Pipe mode:
    pandoc -f docbook -t gfm --wrap=none input.xml | python sanitize_pandoc_gt.py > output.md

    # Dry run (show diff without modifying):
    python sanitize_pandoc_gt.py --dry-run input.md

    # Batch all GT files (dry run):
    python sanitize_pandoc_gt.py --dry-run --batch test_documents/ground_truth/

    # Batch all GT files (apply):
    python sanitize_pandoc_gt.py --batch test_documents/ground_truth/
"""
import argparse
import difflib
import os
import re
import sys
def sanitize(text: str) -> str:
    """Remove pandoc-specific artifacts from markdown text.

    Outside fenced code blocks the following are cleaned:

    * ``:::`` div wrapper lines (with or without ``{...}`` attributes) are dropped.
    * A trailing ``{.class}`` / ``{#id}`` attribute block is stripped from ATX
      headings that start in column 0.
    * ``<!-- end list -->`` is replaced by a blank line (unless the previous
      output line is already blank) so the list separation survives.
    * ``<!-- -->`` lines are dropped.

    On a fence *opening* line, ``{.lang ...}`` is rewritten to ``lang`` directly
    after the fence, and any other ``{...}`` attribute block is removed. The
    fence's leading indentation is preserved. Everything between an opening
    fence and its matching closing fence is copied verbatim.

    Blank lines are never collapsed; only trailing blank lines are trimmed.

    Args:
        text: Markdown source, lines separated by ``"\\n"``.

    Returns:
        The cleaned markdown ending in exactly one newline, or ``""`` when
        nothing but blank lines remains.
    """
    fence_re = re.compile(r"^(`{3,}|~{3,})(.*)$")
    lang_attr_re = re.compile(r"^\{\s*\.(\w+)(?:\s+[^}]*)?\}$")
    any_attr_re = re.compile(r"^\{[^}]*\}$")
    div_re = re.compile(r"^:::\s*(\{.*\})?\s*$")
    heading_re = re.compile(r"^#{1,6}\s")
    heading_attr_re = re.compile(r"\s*\{[.#][^}]*\}\s*$")

    result: list[str] = []
    # Fence run (e.g. "```" or "~~~~") that opened the current code block;
    # empty string while we are outside any code block.
    open_fence = ""

    for line in text.split("\n"):
        stripped = line.strip()
        fence = fence_re.match(stripped)

        if open_fence:
            # Inside a code block: only a fence of the same character, at
            # least as long as the opener and with no info string, closes it.
            # Any other fence-looking line is code content.
            if (
                fence
                and fence.group(1)[0] == open_fence[0]
                and len(fence.group(1)) >= len(open_fence)
                and not fence.group(2).strip()
            ):
                open_fence = ""
            result.append(line)
            continue

        # A backtick fence's info string cannot contain backticks; such a line
        # is an inline code span that happens to start the line, not a fence.
        if fence and not (fence.group(1)[0] == "`" and "`" in fence.group(2)):
            open_fence = fence.group(1)
            info = fence.group(2).strip()
            indent = line[: len(line) - len(line.lstrip())]
            lang = lang_attr_re.match(info)
            if lang:
                # ``` {.python .numberLines}  ->  ```python
                line = f"{indent}{open_fence}{lang.group(1)}"
            elif any_attr_re.match(info):
                # Attribute block without a leading language class: drop it.
                line = f"{indent}{open_fence}"
            result.append(line)
            continue

        # === Pandoc div wrappers ===
        if div_re.match(stripped):
            continue

        # === Remove {.class} and {#id} attributes from headings ===
        if heading_re.match(line):
            line = heading_attr_re.sub("", line)

        # === Replace <!-- end list --> pandoc markers with blank line ===
        # Don't just remove — keep the structural separation it provides.
        if stripped == "<!-- end list -->":
            if not (result and result[-1].strip() == ""):
                result.append("")
            continue

        # === Remove empty pandoc separator comments only ===
        # Keep <!-- image --> and other semantic comments.
        if stripped == "<!-- -->":
            continue

        # Do NOT collapse blank lines — they are structural in markdown.
        result.append(line)

    # Trim trailing blank lines, ensure single trailing newline.
    while result and result[-1].strip() == "":
        result.pop()
    return "\n".join(result) + "\n" if result else ""
def process_file(path: str, dry_run: bool = False) -> tuple[bool, str]:
    """Sanitize one markdown file and report what changed.

    The file is read and written as UTF-8 explicitly, so the result does not
    depend on the platform's default locale encoding.

    Args:
        path: Path of the markdown file to process.
        dry_run: When true, compute the diff but leave the file untouched.

    Returns:
        ``(changed, diff_text)``. ``changed`` is False and ``diff_text`` is
        ``""`` when sanitizing produces identical text; otherwise
        ``diff_text`` is a unified diff (3 lines of context) from the original
        to the cleaned content.
    """
    with open(path, encoding="utf-8") as f:
        original = f.read()

    cleaned = sanitize(original)
    if original == cleaned:
        return False, ""

    diff = "".join(
        difflib.unified_diff(
            original.splitlines(keepends=True),
            cleaned.splitlines(keepends=True),
            fromfile=f"a/{path}",
            tofile=f"b/{path}",
            n=3,
        )
    )

    if not dry_run:
        with open(path, "w", encoding="utf-8") as f:
            f.write(cleaned)
    return True, diff
def main():
    """Command-line entry point.

    Modes, chosen from the parsed arguments:

    * No path and stdin is not a terminal: sanitize stdin to stdout.
    * No path and stdin is a terminal: print the help text.
    * ``--batch`` or a directory path: walk the tree and process every
      ``.md`` file, then print a summary count.
    * Otherwise: process the single file at ``path``.
    """
    cli = argparse.ArgumentParser(description="Sanitize pandoc GT markdown files")
    cli.add_argument("path", nargs="?", help="File or directory to process")
    cli.add_argument("--dry-run", action="store_true", help="Show diff without modifying files")
    cli.add_argument("--batch", action="store_true", help="Process all .md files in directory recursively")
    opts = cli.parse_args()

    target = opts.path
    preview = opts.dry_run

    # Without a path we either act as a filter (piped stdin) or show usage.
    if target is None:
        if sys.stdin.isatty():
            cli.print_help()
        else:
            sys.stdout.write(sanitize(sys.stdin.read()))
        return

    # Batch mode: every .md file below the target, in sorted order per directory.
    if opts.batch or os.path.isdir(target):
        seen = 0
        modified = 0
        for dirpath, _subdirs, filenames in os.walk(target):
            for md_name in sorted(n for n in filenames if n.endswith(".md")):
                md_path = os.path.join(dirpath, md_name)
                seen += 1
                was_changed, patch = process_file(md_path, dry_run=preview)
                if not was_changed:
                    continue
                modified += 1
                print(patch if preview else f" cleaned: {md_path}")
        verb = "would change" if preview else "cleaned"
        print(f"\n{verb} {modified}/{seen} files")
        return

    # Single file mode.
    was_changed, patch = process_file(target, dry_run=preview)
    if not was_changed:
        print(f"no changes: {target}")
    elif preview:
        print(patch)
    else:
        print(f"cleaned: {target}")


if __name__ == "__main__":
    main()