Nomad changes

2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions
--- a/tools/benchmark-harness/scripts/sanitize_pandoc_gt.py
+++ b/tools/benchmark-harness/scripts/sanitize_pandoc_gt.py
@@ -0,0 +1,166 @@
+#!/usr/bin/env python3
+"""Sanitize pandoc-generated markdown ground truth files.
+
+Removes common pandoc artifacts that don't represent actual document structure.
+
+Usage:
+    # Single file (in-place):
+    python sanitize_pandoc_gt.py input.md
+
+    # Pipe mode:
+    pandoc -f docbook -t gfm --wrap=none input.xml | python sanitize_pandoc_gt.py > output.md
+
+    # Dry run (show diff without modifying):
+    python sanitize_pandoc_gt.py --dry-run input.md
+
+    # Batch all GT files (dry run):
+    python sanitize_pandoc_gt.py --dry-run --batch test_documents/ground_truth/
+
+    # Batch all GT files (apply):
+    python sanitize_pandoc_gt.py --batch test_documents/ground_truth/
+"""
+
+import argparse
+import difflib
+import os
+import re
+import sys
+
+
+def sanitize(text: str) -> str:
+    # Track whether we're inside a fenced code block
+    in_code = False
+    lines = text.split("\n")
+    result = []
+
+    for line in lines:
+        # Track fenced code blocks — don't modify content inside them
+        stripped = line.strip()
+        if stripped.startswith("```") or stripped.startswith("~~~"):
+            in_code = not in_code
+            # Clean code fence attributes even when toggling
+            if not in_code or stripped.startswith("```") or stripped.startswith("~~~"):
+                # Convert ``` {.python} to ```python
+                m = re.match(r"^(`{3,}|~{3,})\s*\{\s*\.(\w+)(?:\s+[^}]*)?\}\s*$", line)
+                if m:
+                    line = f"{m.group(1)}{m.group(2)}"
+                else:
+                    # Remove {.class} from code fences without extracting language
+                    line = re.sub(r"^(`{3,}|~{3,})\s*\{[^}]*\}\s*$", r"\1", line)
+            result.append(line)
+            continue
+
+        if in_code:
+            result.append(line)
+            continue
+
+        # === Pandoc div wrappers ===
+        if re.match(r"^:::\s*(\{.*\})?\s*$", stripped):
+            continue
+
+        # === Remove {.class} and {#id} attributes from headings ===
+        if re.match(r"^#{1,6}\s", line):
+            line = re.sub(r"\s*\{[.#][^}]*\}\s*$", "", line)
+
+        # === Replace <!-- end list --> pandoc markers with blank line ===
+        # Don't just remove — keep the structural separation it provides
+        if stripped == "<!-- end list -->":
+            if not (result and result[-1].strip() == ""):
+                result.append("")
+            continue
+
+        # === Remove pandoc-specific HTML comments only ===
+        # Keep <!-- image --> and other semantic comments
+        if stripped == "<!-- end list -->" or stripped == "<!-- -->":
+            continue
+
+        # Do NOT collapse blank lines — they are structural in markdown.
+        # Blank lines separate paragraphs, tables, lists, etc.
+
+        result.append(line)
+
+    # Trim trailing blank lines, ensure single trailing newline
+    while result and result[-1].strip() == "":
+        result.pop()
+
+    return "\n".join(result) + "\n" if result else ""
+
+
+def process_file(path: str, dry_run: bool = False) -> tuple[bool, str]:
+    """Process a single file. Returns (changed, diff_text)."""
+    with open(path) as f:
+        original = f.read()
+
+    cleaned = sanitize(original)
+
+    if original == cleaned:
+        return False, ""
+
+    diff = "".join(
+        difflib.unified_diff(
+            original.splitlines(keepends=True),
+            cleaned.splitlines(keepends=True),
+            fromfile=f"a/{path}",
+            tofile=f"b/{path}",
+            n=3,
+        )
+    )
+
+    if not dry_run:
+        with open(path, "w") as f:
+            f.write(cleaned)
+
+    return True, diff
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Sanitize pandoc GT markdown files")
+    parser.add_argument("path", nargs="?", help="File or directory to process")
+    parser.add_argument("--dry-run", action="store_true", help="Show diff without modifying files")
+    parser.add_argument("--batch", action="store_true", help="Process all .md files in directory recursively")
+    args = parser.parse_args()
+
+    # Pipe mode (no path, stdin)
+    if args.path is None and not sys.stdin.isatty():
+        sys.stdout.write(sanitize(sys.stdin.read()))
+        return
+
+    if args.path is None:
+        parser.print_help()
+        return
+
+    # Batch mode
+    if args.batch or os.path.isdir(args.path):
+        changed_count = 0
+        total_count = 0
+        for root, _dirs, files in os.walk(args.path):
+            for fname in sorted(files):
+                if not fname.endswith(".md"):
+                    continue
+                fpath = os.path.join(root, fname)
+                total_count += 1
+                changed, diff = process_file(fpath, dry_run=args.dry_run)
+                if changed:
+                    changed_count += 1
+                    if args.dry_run:
+                        print(diff)
+                    else:
+                        print(f"  cleaned: {fpath}")
+
+        action = "would change" if args.dry_run else "cleaned"
+        print(f"\n{action} {changed_count}/{total_count} files")
+        return
+
+    # Single file mode
+    changed, diff = process_file(args.path, dry_run=args.dry_run)
+    if changed:
+        if args.dry_run:
+            print(diff)
+        else:
+            print(f"cleaned: {args.path}")
+    else:
+        print(f"no changes: {args.path}")
+
+
+if __name__ == "__main__":  # run the CLI only when executed as a script, not on import
+    main()