167 lines
5.2 KiB
Python
167 lines
5.2 KiB
Python
#!/usr/bin/env python3
"""Sanitize pandoc-generated markdown ground truth files.

Removes common pandoc artifacts that don't represent actual document structure.

Usage:
    # Single file (in-place):
    python sanitize_pandoc_gt.py input.md

    # Pipe mode:
    pandoc -f docbook -t gfm --wrap=none input.xml | python sanitize_pandoc_gt.py > output.md

    # Dry run (show diff without modifying):
    python sanitize_pandoc_gt.py --dry-run input.md

    # Batch all GT files (dry run):
    python sanitize_pandoc_gt.py --dry-run --batch test_documents/ground_truth/

    # Batch all GT files (apply):
    python sanitize_pandoc_gt.py --batch test_documents/ground_truth/
"""
|
|
|
|
import argparse
|
|
import difflib
|
|
import os
|
|
import re
|
|
import sys
|
|
|
|
|
|
# A fence marker (three or more backticks or tildes) followed by the rest of
# the line. Applied to the whitespace-stripped line.
_FENCE_RE = re.compile(r"^(`{3,}|~{3,})(.*)$")


def _clean_fence_attrs(line: str) -> str:
    """Rewrite pandoc attribute syntax on an opening code fence line.

    ``` {.python .numberLines} becomes ```python; an attribute block with no
    leading ``.class`` (e.g. ``` {#id}) is dropped entirely. Lines that carry
    no ``{...}`` attribute block are returned unchanged.
    """
    m = re.match(r"^(`{3,}|~{3,})\s*\{\s*\.(\w+)(?:\s+[^}]*)?\}\s*$", line)
    if m:
        return f"{m.group(1)}{m.group(2)}"
    return re.sub(r"^(`{3,}|~{3,})\s*\{[^}]*\}\s*$", r"\1", line)


def sanitize(text: str) -> str:
    """Strip pandoc-specific artifacts from markdown text.

    The following are rewritten outside fenced code blocks:

    * ``{...}`` attribute blocks on opening code fences (the first class, if
      any, is kept as the fence's language),
    * ``:::`` div wrapper lines (removed),
    * trailing ``{#id}`` / ``{.class}`` attributes on ATX headings (removed),
    * ``<!-- end list -->`` markers (replaced by a single blank line),
    * empty ``<!-- -->`` comments (removed).

    Content inside fenced code blocks is passed through untouched. A code
    block is closed only by a fence that uses the same character as the
    opening fence, is at least as long, and carries no trailing text, so
    fence-like lines *inside* a block (``~~~`` inside a backtick block, a
    shorter backtick run inside a four-backtick block) do not end it.

    Args:
        text: Markdown source, lines separated by ``"\\n"``.

    Returns:
        The sanitized markdown with trailing blank lines trimmed and exactly
        one trailing newline, or ``""`` if nothing but blank lines remains.
    """
    # (fence character, fence length) of the currently open code block,
    # or None when we are outside any fenced code block.
    open_fence = None
    result = []

    for line in text.split("\n"):
        stripped = line.strip()
        fence = _FENCE_RE.match(stripped)

        if open_fence is not None:
            # Inside a code block: never modify content; only a matching
            # closing fence ends the block.
            if fence is not None:
                marker, rest = fence.groups()
                fence_char, fence_len = open_fence
                if (
                    marker[0] == fence_char
                    and len(marker) >= fence_len
                    and not rest.strip()
                ):
                    open_fence = None
            result.append(line)
            continue

        if fence is not None:
            marker, info = fence.groups()
            # A backtick fence cannot have backticks in its info string;
            # such a line is inline code (```x```), not a fence opener.
            if not (marker[0] == "`" and "`" in info):
                open_fence = (marker[0], len(marker))
                result.append(_clean_fence_attrs(line))
                continue

        # === Pandoc div wrappers ===
        if re.match(r"^:::\s*(\{.*\})?\s*$", stripped):
            continue

        # === Remove {.class} and {#id} attributes from headings ===
        if re.match(r"^#{1,6}\s", line):
            line = re.sub(r"\s*\{[.#][^}]*\}\s*$", "", line)

        # === Replace <!-- end list --> pandoc markers with blank line ===
        # Don't just remove — keep the structural separation it provides,
        # but avoid stacking a second blank line on top of an existing one.
        if stripped == "<!-- end list -->":
            if not (result and result[-1].strip() == ""):
                result.append("")
            continue

        # === Remove pandoc-specific empty HTML comments only ===
        # Keep <!-- image --> and other semantic comments.
        if stripped == "<!-- -->":
            continue

        # Do NOT collapse blank lines — they are structural in markdown.
        # Blank lines separate paragraphs, tables, lists, etc.
        result.append(line)

    # Trim trailing blank lines, ensure single trailing newline
    while result and result[-1].strip() == "":
        result.pop()

    return "\n".join(result) + "\n" if result else ""
|
|
|
|
|
|
def process_file(path: str, dry_run: bool = False) -> tuple[bool, str]:
    """Sanitize a single markdown file and report what changed.

    The file is read and (unless ``dry_run``) rewritten as UTF-8 explicitly,
    so the result does not depend on the platform's locale encoding.

    Args:
        path: Path of the markdown file to process.
        dry_run: When true, compute the diff but leave the file untouched.

    Returns:
        ``(changed, diff_text)``. ``changed`` is ``False`` and ``diff_text``
        is ``""`` when sanitizing leaves the content identical; otherwise
        ``diff_text`` is a unified diff (3 lines of context) between the
        original and the sanitized content.
    """
    with open(path, encoding="utf-8") as f:
        original = f.read()

    cleaned = sanitize(original)

    if original == cleaned:
        return False, ""

    diff = "".join(
        difflib.unified_diff(
            original.splitlines(keepends=True),
            cleaned.splitlines(keepends=True),
            fromfile=f"a/{path}",
            tofile=f"b/{path}",
            n=3,
        )
    )

    if not dry_run:
        with open(path, "w", encoding="utf-8") as f:
            f.write(cleaned)

    return True, diff
|
|
|
|
|
|
def main():
    """Command-line entry point.

    Modes, selected from the arguments:

    * no path and stdin is not a terminal: sanitize stdin to stdout,
    * no path and stdin is a terminal: print the help text,
    * ``--batch`` or a directory path: walk the tree and process every
      ``.md`` file, then print a summary count,
    * otherwise: process the single file given by ``path``.
    """
    cli = argparse.ArgumentParser(description="Sanitize pandoc GT markdown files")
    cli.add_argument("path", nargs="?", help="File or directory to process")
    cli.add_argument("--dry-run", action="store_true", help="Show diff without modifying files")
    cli.add_argument("--batch", action="store_true", help="Process all .md files in directory recursively")
    opts = cli.parse_args()

    # No path given: either act as a filter (piped stdin) or show usage.
    if opts.path is None:
        if sys.stdin.isatty():
            cli.print_help()
        else:
            sys.stdout.write(sanitize(sys.stdin.read()))
        return

    # Single file mode
    if not (opts.batch or os.path.isdir(opts.path)):
        was_changed, diff_text = process_file(opts.path, dry_run=opts.dry_run)
        if not was_changed:
            print(f"no changes: {opts.path}")
        elif opts.dry_run:
            print(diff_text)
        else:
            print(f"cleaned: {opts.path}")
        return

    # Batch mode
    n_changed = 0
    n_seen = 0
    for dirpath, _subdirs, filenames in os.walk(opts.path):
        markdown_names = [name for name in sorted(filenames) if name.endswith(".md")]
        for name in markdown_names:
            target = os.path.join(dirpath, name)
            n_seen += 1
            was_changed, diff_text = process_file(target, dry_run=opts.dry_run)
            if not was_changed:
                continue
            n_changed += 1
            if opts.dry_run:
                print(diff_text)
            else:
                print(f"  cleaned: {target}")

    verb = "would change" if opts.dry_run else "cleaned"
    print(f"\n{verb} {n_changed}/{n_seen} files")
|
|
|
|
|
|
# Run the CLI only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()
|