1352 lines
45 KiB
Rust
1352 lines
45 KiB
Rust
|
|
//! Cross-format output parity tests.
|
||
|
|
//!
|
||
|
|
//! Verify that all output formats (Markdown, HTML, Djot, Plain) produce
|
||
|
|
//! equivalent text content for the same document. We extract each document
|
||
|
|
//! in every format, strip markup to plain text, tokenize, and compute
|
||
|
|
//! token-level F1 scores between format pairs.
|
||
|
|
//!
|
||
|
|
//! Usage:
|
||
|
|
//! cargo test -p kreuzberg --test cross_format_parity -- --nocapture
|
||
|
|
|
||
|
|
mod helpers;
|
||
|
|
|
||
|
|
use helpers::{get_test_file_path, test_documents_available};
|
||
|
|
use kreuzberg::core::config::{ExtractionConfig, OutputFormat};
|
||
|
|
use kreuzberg::extract_file_sync;
|
||
|
|
use std::collections::HashMap;
|
||
|
|
use std::path::Path;
|
||
|
|
|
||
|
|
// ============================================================================
|
||
|
|
// Text stripping helpers
|
||
|
|
// ============================================================================
|
||
|
|
|
||
|
|
/// Strip markdown markup to recover approximate plain text.
|
||
|
|
fn strip_markdown(text: &str) -> String {
|
||
|
|
let mut result = String::with_capacity(text.len());
|
||
|
|
|
||
|
|
for line in text.lines() {
|
||
|
|
let trimmed = line.trim();
|
||
|
|
|
||
|
|
// Skip code fence lines
|
||
|
|
if trimmed.starts_with("```") {
|
||
|
|
continue;
|
||
|
|
}
|
||
|
|
|
||
|
|
// Skip table separator lines (e.g., |---|---|)
|
||
|
|
if trimmed.starts_with('|') && trimmed.chars().all(|c| c == '|' || c == '-' || c == ':' || c == ' ') {
|
||
|
|
continue;
|
||
|
|
}
|
||
|
|
|
||
|
|
// Strip heading markers
|
||
|
|
let line = strip_leading_pattern(trimmed, '#');
|
||
|
|
|
||
|
|
// Strip blockquote markers
|
||
|
|
let line = strip_leading_pattern(&line, '>');
|
||
|
|
|
||
|
|
// Strip unordered list markers
|
||
|
|
let line = strip_list_marker(&line);
|
||
|
|
|
||
|
|
// Strip table pipes
|
||
|
|
let line = line.replace('|', " ");
|
||
|
|
|
||
|
|
// Strip link syntax: [text](url) -> text
|
||
|
|
let line = strip_links(&line);
|
||
|
|
|
||
|
|
// Strip image syntax:  -> alt
|
||
|
|
let line = strip_images(&line);
|
||
|
|
|
||
|
|
// Strip inline formatting markers
|
||
|
|
let line = line.replace("**", "");
|
||
|
|
let line = line.replace("__", "");
|
||
|
|
let line = line.replace('*', "");
|
||
|
|
let line = line.replace('_', " ");
|
||
|
|
let line = line.replace('~', "");
|
||
|
|
let line = line.replace('`', "");
|
||
|
|
|
||
|
|
result.push_str(&line);
|
||
|
|
result.push('\n');
|
||
|
|
}
|
||
|
|
|
||
|
|
result
|
||
|
|
}
|
||
|
|
|
||
|
|
/// Strip HTML tags and decode common entities.
|
||
|
|
fn strip_html(text: &str) -> String {
|
||
|
|
// Remove all HTML tags
|
||
|
|
let mut result = String::with_capacity(text.len());
|
||
|
|
let mut in_tag = false;
|
||
|
|
|
||
|
|
for ch in text.chars() {
|
||
|
|
if ch == '<' {
|
||
|
|
in_tag = true;
|
||
|
|
} else if ch == '>' {
|
||
|
|
in_tag = false;
|
||
|
|
// Add space after closing tags to prevent word merging
|
||
|
|
result.push(' ');
|
||
|
|
} else if !in_tag {
|
||
|
|
result.push(ch);
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
// Decode common HTML entities
|
||
|
|
let result = result.replace("&", "&");
|
||
|
|
let result = result.replace("<", "<");
|
||
|
|
let result = result.replace(">", ">");
|
||
|
|
let result = result.replace(""", "\"");
|
||
|
|
let result = result.replace("'", "'");
|
||
|
|
let result = result.replace("'", "'");
|
||
|
|
let result = result.replace(" ", " ");
|
||
|
|
|
||
|
|
// Decode numeric entities: &#NNN;
|
||
|
|
decode_numeric_entities(&result)
|
||
|
|
}
|
||
|
|
|
||
|
|
/// Strip djot markup (similar to markdown with minor differences).
|
||
|
|
fn strip_djot(text: &str) -> String {
|
||
|
|
// Djot is structurally similar to markdown for our purposes
|
||
|
|
strip_markdown(text)
|
||
|
|
}
|
||
|
|
|
||
|
|
// ============================================================================
|
||
|
|
// Tokenization and scoring
|
||
|
|
// ============================================================================
|
||
|
|
|
||
|
|
/// Split text into comparison tokens.
///
/// The text is lowercased and split on whitespace; ASCII punctuation is
/// trimmed from both ends of every word, and words that end up empty (i.e.
/// consisted only of punctuation) are discarded. Punctuation inside a word,
/// such as the apostrophe in `it's`, is kept.
fn tokenize(text: &str) -> Vec<String> {
    let lowered = text.to_lowercase();
    let mut tokens = Vec::new();

    for word in lowered.split_whitespace() {
        let core = word.trim_matches(|c: char| c.is_ascii_punctuation());
        if !core.is_empty() {
            tokens.push(core.to_string());
        }
    }

    tokens
}
|
||
|
|
|
||
|
|
/// Token-level F1 between two token sequences, treated as multisets.
///
/// The overlap is the size of the multiset intersection of `a` and `b`.
/// Precision is `overlap / b.len()`, recall is `overlap / a.len()`, and the
/// result is their harmonic mean.
///
/// Returns `1.0` when both inputs are empty and `0.0` when exactly one is
/// empty or when there is no overlap at all.
fn token_f1(a: &[String], b: &[String]) -> f64 {
    match (a.is_empty(), b.is_empty()) {
        (true, true) => return 1.0,
        (true, false) | (false, true) => return 0.0,
        (false, false) => {}
    }

    // Occurrences of each token in `a` that have not been matched yet.
    let mut unmatched: HashMap<&str, usize> = HashMap::new();
    for token in a {
        *unmatched.entry(token.as_str()).or_default() += 1;
    }

    // Each token of `b` consumes at most one unmatched occurrence from `a`,
    // which adds up to min(count_a, count_b) per distinct token.
    let mut overlap = 0usize;
    for token in b {
        if let Some(left) = unmatched.get_mut(token.as_str()) {
            if *left > 0 {
                *left -= 1;
                overlap += 1;
            }
        }
    }

    let precision = overlap as f64 / b.len() as f64;
    let recall = overlap as f64 / a.len() as f64;

    if precision + recall == 0.0 {
        return 0.0;
    }

    2.0 * precision * recall / (precision + recall)
}
|
||
|
|
|
||
|
|
// ============================================================================
|
||
|
|
// Internal helpers
|
||
|
|
// ============================================================================
|
||
|
|
|
||
|
|
/// Remove a run of `marker` characters from the very start of `line`.
///
/// When at least one marker is present, the whole run is removed together
/// with any whitespace that follows it (e.g. `## Title` -> `Title`). A line
/// that does not begin with `marker` is returned unchanged, including any
/// leading whitespace it has.
fn strip_leading_pattern(line: &str, marker: char) -> String {
    match line.strip_prefix(marker) {
        // First marker found: drop the rest of the run, then the padding.
        Some(after_first) => after_first.trim_start_matches(marker).trim_start().to_string(),
        None => line.to_string(),
    }
}
|
||
|
|
|
||
|
|
/// Strip a leading list marker (`- `, `* `, `+ `, `1. `, `12. `, ...).
///
/// Leading whitespace is preserved; only the marker and the single space
/// after it are removed. Ordered markers are one or more ASCII digits followed
/// by `". "`, so multi-digit items such as `10. item` are handled as well.
/// Lines without a recognised marker are returned unchanged.
fn strip_list_marker(line: &str) -> String {
    let trimmed = line.trim_start();
    let indent = &line[..line.len() - trimmed.len()];

    // Unordered: one of the bullet characters followed by a space.
    let bullet_rest = ["- ", "* ", "+ "]
        .iter()
        .find_map(|marker| trimmed.strip_prefix(*marker));

    // Ordered: at least one digit, then ". ".
    let ordered_rest = || {
        let after_digits = trimmed.trim_start_matches(|c: char| c.is_ascii_digit());
        if after_digits.len() == trimmed.len() {
            return None; // no leading digit at all
        }
        after_digits.strip_prefix(". ")
    };

    match bullet_rest.or_else(ordered_rest) {
        Some(rest) => format!("{}{}", indent, rest),
        None => line.to_string(),
    }
}
|
||
|
|
|
||
|
|
/// Replace inline markdown links with their link text: `[text](url)` -> `text`.
///
/// A link is recognised when a `[` is followed by the first `]` after it,
/// then immediately by `(`, and a closing `)` exists later in the string.
/// A `[` that does not start such a sequence is copied through unchanged.
/// Nested brackets and parentheses inside URLs are not given special treatment.
fn strip_links(text: &str) -> String {
    /// Given the text right after a `[`, return `(link_text, text_after_link)`
    /// if an inline link starts there.
    fn parse_link(after_open: &str) -> Option<(&str, &str)> {
        let (label, tail) = after_open.split_once(']')?;
        let target = tail.strip_prefix('(')?;
        let (_url, remainder) = target.split_once(')')?;
        Some((label, remainder))
    }

    let mut out = String::with_capacity(text.len());
    let mut rest = text;

    while let Some(open) = rest.find('[') {
        out.push_str(&rest[..open]);
        let after_open = &rest[open + 1..];

        match parse_link(after_open) {
            Some((label, remainder)) => {
                out.push_str(label);
                rest = remainder;
            }
            None => {
                // Not a link: keep the bracket and resume right after it.
                out.push('[');
                rest = after_open;
            }
        }
    }

    out.push_str(rest);
    out
}
|
||
|
|
|
||
|
|
/// Replace inline markdown images with their alt text: `![alt](url)` -> `alt`.
///
/// An image is recognised when `![` is followed by the first `]` after it,
/// then immediately by `(`, and a closing `)` exists later in the string.
/// A `![` that does not start such a sequence is copied through unchanged.
fn strip_images(text: &str) -> String {
    /// Given the text right after `![`, return `(alt_text, text_after_image)`
    /// if an inline image starts there.
    fn parse_image(after_open: &str) -> Option<(&str, &str)> {
        let (alt, tail) = after_open.split_once(']')?;
        let target = tail.strip_prefix('(')?;
        let (_url, remainder) = target.split_once(')')?;
        Some((alt, remainder))
    }

    let mut out = String::with_capacity(text.len());
    let mut rest = text;

    while let Some(bang) = rest.find("![") {
        out.push_str(&rest[..bang]);

        match parse_image(&rest[bang + 2..]) {
            Some((alt, remainder)) => {
                out.push_str(alt);
                rest = remainder;
            }
            None => {
                // Not an image: keep the `!` and resume at the `[` after it.
                out.push('!');
                rest = &rest[bang + 1..];
            }
        }
    }

    out.push_str(rest);
    out
}
|
||
|
|
|
||
|
|
/// Decode numeric HTML character references to characters.
///
/// Handles decimal (`&#NNN;`) and hexadecimal (`&#xHH;` / `&#XHH;`) forms.
/// The terminating `;` is consumed when present but is not required, so
/// `&#65` followed by other text still decodes to `A`.
///
/// A reference that cannot be decoded (no digits, numeric overflow, or a value
/// that is not a valid `char` such as a surrogate) is left in the output
/// exactly as written, including its `;`.
fn decode_numeric_entities(text: &str) -> String {
    let mut result = String::with_capacity(text.len());
    let mut rest = text;

    while let Some(start) = rest.find("&#") {
        result.push_str(&rest[..start]);
        let body = &rest[start + 2..];

        // An `x`/`X` right after `&#` selects hexadecimal digits.
        let (radix, digits_src) = match body.chars().next() {
            Some('x') | Some('X') => (16, &body[1..]),
            _ => (10, body),
        };

        let digit_len = digits_src
            .find(|c: char| !c.is_digit(radix))
            .unwrap_or(digits_src.len());
        let (digits, after_digits) = digits_src.split_at(digit_len);

        match u32::from_str_radix(digits, radix).ok().and_then(char::from_u32) {
            Some(decoded) => {
                result.push(decoded);
                rest = after_digits.strip_prefix(';').unwrap_or(after_digits);
            }
            None => {
                // Failed to decode: emit `&#` and let everything after it
                // (digits, `;`, ...) flow through verbatim.
                result.push_str("&#");
                rest = body;
            }
        }
    }

    result.push_str(rest);
    result
}
|
||
|
|
|
||
|
|
// ============================================================================
|
||
|
|
// GFM validation
|
||
|
|
// ============================================================================
|
||
|
|
|
||
|
|
/// Validate basic GFM (GitHub Flavored Markdown) lint rules.
|
||
|
|
///
|
||
|
|
/// Returns a list of violation descriptions. An empty list means the markdown
|
||
|
|
/// passes all checks. This is a lightweight inline replacement for shelling
|
||
|
|
/// out to `rumdl` which may not be installed.
|
||
|
|
fn validate_gfm_basics(markdown: &str) -> Vec<String> {
|
||
|
|
let mut violations = Vec::new();
|
||
|
|
let lines: Vec<&str> = markdown.lines().collect();
|
||
|
|
|
||
|
|
for (i, line) in lines.iter().enumerate() {
|
||
|
|
let line_num = i + 1;
|
||
|
|
|
||
|
|
// Rule: no trailing whitespace on lines
|
||
|
|
if line.ends_with(' ') || line.ends_with('\t') {
|
||
|
|
violations.push(format!("line {}: trailing whitespace", line_num));
|
||
|
|
}
|
||
|
|
|
||
|
|
// Rule: ATX-style headings only (not underline/setext style)
|
||
|
|
if i > 0 {
|
||
|
|
let prev = lines[i - 1].trim();
|
||
|
|
if !prev.is_empty() && (line.chars().all(|c| c == '=') && line.len() >= 2) {
|
||
|
|
violations.push(format!(
|
||
|
|
"line {}: setext heading (=== style), use ATX (# style)",
|
||
|
|
line_num
|
||
|
|
));
|
||
|
|
}
|
||
|
|
if !prev.is_empty() && (line.chars().all(|c| c == '-') && line.len() >= 2) && !prev.starts_with('|') {
|
||
|
|
// Exclude table separator rows (previous line starts with |)
|
||
|
|
violations.push(format!(
|
||
|
|
"line {}: setext heading (--- style), use ATX (# style)",
|
||
|
|
line_num
|
||
|
|
));
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
// Rule: blank line before headings (except at file start)
|
||
|
|
if line.starts_with('#') && i > 0 && !lines[i - 1].trim().is_empty() {
|
||
|
|
violations.push(format!("line {}: missing blank line before heading", line_num));
|
||
|
|
}
|
||
|
|
|
||
|
|
// Rule: fenced code blocks should not be indented
|
||
|
|
let trimmed = line.trim_start();
|
||
|
|
if trimmed.starts_with("```") && line.len() != trimmed.len() {
|
||
|
|
violations.push(format!("line {}: indented fenced code block", line_num));
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
// Rule: single trailing newline at end of file
|
||
|
|
if !markdown.is_empty() {
|
||
|
|
if !markdown.ends_with('\n') {
|
||
|
|
violations.push("file does not end with a newline".to_string());
|
||
|
|
} else if markdown.ends_with("\n\n") {
|
||
|
|
violations.push("file ends with multiple trailing newlines".to_string());
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
// Rule: no escaped brackets outside code blocks/spans
|
||
|
|
let mut in_fenced_block = false;
|
||
|
|
for (i, line) in lines.iter().enumerate() {
|
||
|
|
let trimmed = line.trim();
|
||
|
|
if trimmed.starts_with("```") {
|
||
|
|
in_fenced_block = !in_fenced_block;
|
||
|
|
continue;
|
||
|
|
}
|
||
|
|
if in_fenced_block {
|
||
|
|
continue;
|
||
|
|
}
|
||
|
|
|
||
|
|
// Strip inline code spans before checking for escaped brackets
|
||
|
|
let without_code = strip_inline_code(line);
|
||
|
|
if without_code.contains("\\[") || without_code.contains("\\]") {
|
||
|
|
violations.push(format!(
|
||
|
|
"line {}: escaped bracket (\\[ or \\]) outside code context",
|
||
|
|
i + 1
|
||
|
|
));
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
// Rule: valid pipe table format (header row must be followed by separator row)
|
||
|
|
let mut in_code = false;
|
||
|
|
for (i, line) in lines.iter().enumerate() {
|
||
|
|
if line.trim().starts_with("```") {
|
||
|
|
in_code = !in_code;
|
||
|
|
continue;
|
||
|
|
}
|
||
|
|
if in_code {
|
||
|
|
continue;
|
||
|
|
}
|
||
|
|
|
||
|
|
let trimmed = line.trim();
|
||
|
|
// Detect a pipe-table header row: starts and ends with | and contains text
|
||
|
|
if trimmed.starts_with('|') && trimmed.ends_with('|') && trimmed.len() > 2 && !is_table_separator(trimmed) {
|
||
|
|
// Check if this could be the first row of a table (preceded by blank or start)
|
||
|
|
let is_first_table_row = i == 0 || lines[i - 1].trim().is_empty() || !lines[i - 1].trim().starts_with('|');
|
||
|
|
if is_first_table_row {
|
||
|
|
// Next line should be a separator row
|
||
|
|
if i + 1 >= lines.len() || !is_table_separator(lines[i + 1].trim()) {
|
||
|
|
violations.push(format!(
|
||
|
|
"line {}: pipe table header row not followed by separator row",
|
||
|
|
i + 1
|
||
|
|
));
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
violations
|
||
|
|
}
|
||
|
|
|
||
|
|
/// Check if a line is a markdown table separator row (e.g., `|---|---|`).
///
/// After trimming, the line must start with `|`, contain at least one `-`,
/// and consist only of `|`, `-`, `:` and spaces. Requiring a dash keeps rows
/// of empty cells such as `| | |` from being mistaken for a separator.
fn is_table_separator(line: &str) -> bool {
    let trimmed = line.trim();
    trimmed.starts_with('|')
        && trimmed.contains('-')
        && trimmed.chars().all(|c| matches!(c, '|' | '-' | ':' | ' '))
}
|
||
|
|
|
||
|
|
/// Remove inline code spans from a line for bracket-escaping analysis.
///
/// Every backtick toggles between "text" and "code"; backticks and the code
/// between them are dropped and the remaining text is concatenated. An
/// unmatched backtick drops everything after it.
fn strip_inline_code(line: &str) -> String {
    // Splitting on backticks leaves the text outside code spans at the
    // even-numbered positions.
    line.split('`').step_by(2).collect()
}
|
||
|
|
|
||
|
|
// ============================================================================
|
||
|
|
// Structural block counting
|
||
|
|
// ============================================================================
|
||
|
|
|
||
|
|
/// Count structural block elements in markdown/djot content.
|
||
|
|
///
|
||
|
|
/// Returns a map of block type name to count.
|
||
|
|
fn count_blocks(content: &str) -> HashMap<String, usize> {
|
||
|
|
let mut counts: HashMap<String, usize> = HashMap::new();
|
||
|
|
let lines: Vec<&str> = content.lines().collect();
|
||
|
|
let mut in_code_block = false;
|
||
|
|
|
||
|
|
for (i, line) in lines.iter().enumerate() {
|
||
|
|
let trimmed = line.trim();
|
||
|
|
|
||
|
|
// Track fenced code blocks
|
||
|
|
if trimmed.starts_with("```") {
|
||
|
|
if !in_code_block {
|
||
|
|
*counts.entry("code_blocks".to_string()).or_insert(0) += 1;
|
||
|
|
}
|
||
|
|
in_code_block = !in_code_block;
|
||
|
|
continue;
|
||
|
|
}
|
||
|
|
|
||
|
|
if in_code_block {
|
||
|
|
continue;
|
||
|
|
}
|
||
|
|
|
||
|
|
// Headings
|
||
|
|
if trimmed.starts_with('#') {
|
||
|
|
*counts.entry("headings".to_string()).or_insert(0) += 1;
|
||
|
|
}
|
||
|
|
// List items (unordered or ordered)
|
||
|
|
else if trimmed.starts_with("- ")
|
||
|
|
|| trimmed.starts_with("* ")
|
||
|
|
|| trimmed.starts_with("+ ")
|
||
|
|
|| (trimmed.len() > 2
|
||
|
|
&& trimmed.chars().next().is_some_and(|c| c.is_ascii_digit())
|
||
|
|
&& trimmed.contains(". "))
|
||
|
|
{
|
||
|
|
*counts.entry("list_items".to_string()).or_insert(0) += 1;
|
||
|
|
}
|
||
|
|
// Table rows (non-separator pipe table rows)
|
||
|
|
else if trimmed.starts_with('|') && trimmed.ends_with('|') && !is_table_separator(trimmed) {
|
||
|
|
*counts.entry("table_rows".to_string()).or_insert(0) += 1;
|
||
|
|
}
|
||
|
|
// Paragraphs: non-empty line preceded by a blank line (or at start of file)
|
||
|
|
else if !trimmed.is_empty() && (i == 0 || lines[i - 1].trim().is_empty()) && !trimmed.starts_with('>') {
|
||
|
|
*counts.entry("paragraphs".to_string()).or_insert(0) += 1;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
counts
|
||
|
|
}
|
||
|
|
|
||
|
|
// ============================================================================
|
||
|
|
// Extraction helpers
|
||
|
|
// ============================================================================
|
||
|
|
|
||
|
|
/// Extract a document in the given output format.
|
||
|
|
fn extract_with_format(path: &Path, format: OutputFormat) -> Option<String> {
|
||
|
|
let config = ExtractionConfig {
|
||
|
|
output_format: format.clone(),
|
||
|
|
..Default::default()
|
||
|
|
};
|
||
|
|
|
||
|
|
match extract_file_sync(path, None, &config) {
|
||
|
|
Ok(result) => Some(result.content),
|
||
|
|
Err(err) => {
|
||
|
|
eprintln!(
|
||
|
|
" [WARN] extraction failed for {} with format {}: {}",
|
||
|
|
path.display(),
|
||
|
|
format,
|
||
|
|
err
|
||
|
|
);
|
||
|
|
None
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
/// Strip markup from content based on its format.
|
||
|
|
fn strip_markup(content: &str, format: &OutputFormat) -> String {
|
||
|
|
match format {
|
||
|
|
OutputFormat::Plain => content.to_string(),
|
||
|
|
OutputFormat::Markdown => strip_markdown(content),
|
||
|
|
OutputFormat::Html => strip_html(content),
|
||
|
|
OutputFormat::Djot => strip_djot(content),
|
||
|
|
_ => content.to_string(),
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
// ============================================================================
|
||
|
|
// Test document definitions
|
||
|
|
// ============================================================================
|
||
|
|
|
||
|
|
/// One document exercised by the parity tests, together with the minimum
/// token-level F1 ("TF1") scores its Markdown output must reach against each
/// of the other formats.
struct TestDoc {
    /// Short name used in log lines and failure messages.
    label: &'static str,
    /// Location of the file, relative to `test_documents/`.
    relative_path: &'static str,
    /// Cargo feature the extractor needs; `""` when none is required.
    required_feature: &'static str,
    /// Lowest acceptable TF1 between the Markdown and HTML outputs.
    md_html_threshold: f64,
    /// Lowest acceptable TF1 between the Markdown and Djot outputs.
    md_djot_threshold: f64,
    /// Lowest acceptable TF1 between the Markdown and Plain outputs.
    md_plain_threshold: f64,
    /// Marks documents whose source is itself HTML; those use relaxed
    /// thresholds because of round-trip divergence.
    _is_html_input: bool,
}
|
||
|
|
|
||
|
|
const TEST_DOCS: &[TestDoc] = &[
|
||
|
|
// Markdown extraction_test.md — has headings, tables, lists. No extra features needed.
|
||
|
|
TestDoc {
|
||
|
|
label: "markdown-extraction-test",
|
||
|
|
relative_path: "markdown/extraction_test.md",
|
||
|
|
required_feature: "",
|
||
|
|
md_html_threshold: 0.95,
|
||
|
|
md_djot_threshold: 0.85,
|
||
|
|
md_plain_threshold: 0.85,
|
||
|
|
_is_html_input: false,
|
||
|
|
},
|
||
|
|
// Markdown readme.md — headings, lists, code block. No extra features needed.
|
||
|
|
TestDoc {
|
||
|
|
label: "markdown-readme",
|
||
|
|
relative_path: "markdown/readme.md",
|
||
|
|
required_feature: "",
|
||
|
|
md_html_threshold: 0.95,
|
||
|
|
md_djot_threshold: 0.85,
|
||
|
|
md_plain_threshold: 0.85,
|
||
|
|
_is_html_input: false,
|
||
|
|
},
|
||
|
|
// RST document — requires office feature for the RST extractor.
|
||
|
|
TestDoc {
|
||
|
|
label: "rst-readme",
|
||
|
|
relative_path: "rst/readme.rst",
|
||
|
|
required_feature: "office",
|
||
|
|
md_html_threshold: 0.95,
|
||
|
|
md_djot_threshold: 0.85,
|
||
|
|
md_plain_threshold: 0.85,
|
||
|
|
_is_html_input: false,
|
||
|
|
},
|
||
|
|
// HTML page — requires html feature. The taylor_swift page is large (Wikipedia)
|
||
|
|
// and includes extensive navigation/sidebar elements. When extracting as Markdown,
|
||
|
|
// html-to-markdown-rs performs article extraction (producing ~43k tokens). When
|
||
|
|
// extracting as HTML or Plain, the full InternalDocument is rendered (~82k tokens),
|
||
|
|
// including navigation elements absent from the article extraction. This structural
|
||
|
|
// divergence yields a TF1 of ~0.55 between Markdown and HTML/Plain outputs, with
|
||
|
|
// ~1% variance across tokenizer/dependency-version drift. Threshold set conservatively
|
||
|
|
// below the observed lower bound to allow for that variation.
|
||
|
|
TestDoc {
|
||
|
|
label: "html-taylor-swift",
|
||
|
|
relative_path: "html/taylor_swift.html",
|
||
|
|
required_feature: "html",
|
||
|
|
md_html_threshold: 0.54,
|
||
|
|
md_djot_threshold: 0.85,
|
||
|
|
md_plain_threshold: 0.54,
|
||
|
|
_is_html_input: true,
|
||
|
|
},
|
||
|
|
// LaTeX document — requires office feature.
|
||
|
|
TestDoc {
|
||
|
|
label: "latex-basic-sections",
|
||
|
|
relative_path: "latex/basic_sections.tex",
|
||
|
|
required_feature: "office",
|
||
|
|
md_html_threshold: 0.95,
|
||
|
|
md_djot_threshold: 0.85,
|
||
|
|
md_plain_threshold: 0.85,
|
||
|
|
_is_html_input: false,
|
||
|
|
},
|
||
|
|
// EPUB — requires office feature.
|
||
|
|
TestDoc {
|
||
|
|
label: "epub-wasteland",
|
||
|
|
relative_path: "epub/wasteland.epub",
|
||
|
|
required_feature: "office",
|
||
|
|
md_html_threshold: 0.95,
|
||
|
|
md_djot_threshold: 0.85,
|
||
|
|
md_plain_threshold: 0.85,
|
||
|
|
_is_html_input: false,
|
||
|
|
},
|
||
|
|
// DOCX — requires office feature.
|
||
|
|
TestDoc {
|
||
|
|
label: "docx-sample-document",
|
||
|
|
relative_path: "docx/sample_document.docx",
|
||
|
|
required_feature: "office",
|
||
|
|
md_html_threshold: 0.95,
|
||
|
|
md_djot_threshold: 0.85,
|
||
|
|
md_plain_threshold: 0.85,
|
||
|
|
_is_html_input: false,
|
||
|
|
},
|
||
|
|
// HTML table document — requires html feature.
|
||
|
|
TestDoc {
|
||
|
|
label: "html-simple-table",
|
||
|
|
relative_path: "html/simple_table.html",
|
||
|
|
required_feature: "html",
|
||
|
|
md_html_threshold: 0.75,
|
||
|
|
md_djot_threshold: 0.85,
|
||
|
|
md_plain_threshold: 0.75,
|
||
|
|
_is_html_input: true,
|
||
|
|
},
|
||
|
|
// LaTeX tables — requires office feature.
|
||
|
|
TestDoc {
|
||
|
|
label: "latex-tables",
|
||
|
|
relative_path: "latex/tables.tex",
|
||
|
|
required_feature: "office",
|
||
|
|
md_html_threshold: 0.95,
|
||
|
|
md_djot_threshold: 0.80,
|
||
|
|
md_plain_threshold: 0.80,
|
||
|
|
_is_html_input: false,
|
||
|
|
},
|
||
|
|
];
|
||
|
|
|
||
|
|
// ============================================================================
|
||
|
|
// Tests
|
||
|
|
// ============================================================================
|
||
|
|
|
||
|
|
/// Report whether the named cargo feature was enabled when this test binary
/// was compiled.
///
/// This is a compile-time check (`cfg!`), not a runtime probe: no extraction
/// is attempted. The empty string means "no feature required" and is always
/// available. Only `html`, `office`, `pdf` and `excel` are known; any other
/// name is reported as unavailable.
fn feature_available(feature: &str) -> bool {
    // Each arm evaluates a different cfg! macro, so matches! is not appropriate.
    #[allow(clippy::match_like_matches_macro)]
    match feature {
        "" => true,
        "html" => cfg!(feature = "html"),
        "office" => cfg!(feature = "office"),
        "pdf" => cfg!(feature = "pdf"),
        "excel" => cfg!(feature = "excel"),
        _ => false,
    }
}
|
||
|
|
|
||
|
|
/// Extract every entry of `TEST_DOCS` as Markdown, HTML, Djot and Plain,
/// strip the markup, and require the token-level F1 between the Markdown
/// output and each other format to reach the document's threshold.
///
/// Documents are skipped (with a log line) when their feature is not compiled
/// in, the file is missing, or Markdown extraction fails or yields no tokens.
/// A non-Markdown format that fails to extract is skipped on its own, and the
/// failure summary counts only the comparisons that were actually performed.
#[test]
fn cross_format_parity_all_documents() {
    if !test_documents_available() {
        eprintln!("Skipping: test_documents not available");
        return;
    }

    let formats = [
        OutputFormat::Markdown,
        OutputFormat::Html,
        OutputFormat::Djot,
        OutputFormat::Plain,
    ];

    let mut failures: Vec<String> = Vec::new();
    let mut tested = 0usize;
    // Number of Markdown-vs-other comparisons that were really run.
    let mut checks = 0usize;

    for doc in TEST_DOCS {
        if !feature_available(doc.required_feature) {
            eprintln!(" [SKIP] {} — requires feature '{}'", doc.label, doc.required_feature);
            continue;
        }

        let path = get_test_file_path(doc.relative_path);
        if !path.exists() {
            eprintln!(" [SKIP] {} — file not found: {}", doc.label, path.display());
            continue;
        }

        eprintln!("\n--- {} ---", doc.label);

        // Extract in all formats, keyed by the format's display name.
        let mut outputs: HashMap<String, String> = HashMap::new();
        for format in &formats {
            if let Some(content) = extract_with_format(&path, format.clone()) {
                let stripped = strip_markup(&content, format);
                let format_name = format.to_string();
                eprintln!(
                    " {}: {} chars raw, {} chars stripped",
                    format_name,
                    content.len(),
                    stripped.len()
                );
                outputs.insert(format_name, stripped);
            }
        }

        // Markdown is the reference side of every comparison.
        let md_tokens = match outputs.get("markdown") {
            Some(text) => tokenize(text),
            None => {
                eprintln!(" [SKIP] {} — markdown extraction failed", doc.label);
                continue;
            }
        };

        if md_tokens.is_empty() {
            eprintln!(" [SKIP] {} — markdown produced no tokens", doc.label);
            continue;
        }

        tested += 1;

        // (name used in messages, key in `outputs`, minimum TF1)
        let comparisons = [
            ("HTML", "html", doc.md_html_threshold),
            ("Djot", "djot", doc.md_djot_threshold),
            ("Plain", "plain", doc.md_plain_threshold),
        ];

        for &(display, key, threshold) in &comparisons {
            let Some(other_text) = outputs.get(key) else {
                continue;
            };
            checks += 1;

            let other_tokens = tokenize(other_text);
            let f1 = token_f1(&md_tokens, &other_tokens);
            eprintln!(
                " MD vs {}: TF1 = {:.4} (md_tokens={}, {}_tokens={})",
                display,
                f1,
                md_tokens.len(),
                key,
                other_tokens.len()
            );
            if f1 < threshold {
                failures.push(format!(
                    "{}: MD vs {} TF1 = {:.4} < threshold {:.2}",
                    doc.label, display, f1, threshold
                ));
            }
        }
    }

    eprintln!("\n=== Summary: tested {} documents ===", tested);

    if !failures.is_empty() {
        panic!(
            "Cross-format parity failures ({}/{} checks failed):\n - {}",
            failures.len(),
            checks,
            failures.join("\n - ")
        );
    }

    assert!(tested > 0, "Expected at least one document to be tested");
}
|
||
|
|
|
||
|
|
/// Table-focused parity check.
///
/// For each document known to contain tables, every output format that
/// extracts successfully must produce non-empty content and must contain each
/// expected cell text (compared case-insensitively), however the table itself
/// is rendered (pipe tables, HTML tables, space-separated text). Formats whose
/// extraction fails are skipped.
#[test]
fn cross_format_table_content_parity() {
    if !test_documents_available() {
        eprintln!("Skipping: test_documents not available");
        return;
    }

    // (path relative to test_documents/, required feature, expected cell texts)
    let table_docs: &[(&str, &str, &[&str])] = &[
        #[cfg(feature = "html")]
        ("html/simple_table.html", "html", &["Product", "Category", "Price"]),
        #[cfg(feature = "office")]
        (
            "latex/tables.tex",
            "office",
            &[], // We don't know exact cell values; just check non-empty extraction
        ),
        #[cfg(feature = "office")]
        ("docx/docx_tables.docx", "office", &[]),
    ];

    let formats = [
        ("markdown", OutputFormat::Markdown),
        ("html", OutputFormat::Html),
        ("djot", OutputFormat::Djot),
        ("plain", OutputFormat::Plain),
    ];

    let mut failures: Vec<String> = Vec::new();
    let mut tested = 0usize;

    for &(relative_path, required_feature, expected_cells) in table_docs {
        if !feature_available(required_feature) {
            eprintln!(" [SKIP] {} — requires feature '{}'", relative_path, required_feature);
            continue;
        }

        let path = get_test_file_path(relative_path);
        if !path.exists() {
            eprintln!(" [SKIP] {} — file not found", relative_path);
            continue;
        }

        eprintln!("\n--- table test: {} ---", relative_path);
        tested += 1;

        for (format_name, format) in &formats {
            let Some(content) = extract_with_format(&path, format.clone()) else {
                continue;
            };
            let lower = content.to_lowercase();

            // Every expected cell value has to show up in this format's output.
            failures.extend(
                expected_cells
                    .iter()
                    .filter(|cell| !lower.contains(&cell.to_lowercase()))
                    .map(|cell| {
                        format!(
                            "{} [{}]: missing expected table cell '{}'",
                            relative_path, format_name, cell
                        )
                    }),
            );

            // Every format should produce non-empty content
            if content.trim().is_empty() {
                failures.push(format!("{} [{}]: produced empty content", relative_path, format_name));
            }
        }
    }

    eprintln!("\n=== Table parity: tested {} documents ===", tested);

    if !failures.is_empty() {
        panic!("Table content parity failures:\n - {}", failures.join("\n - "));
    }
}
|
||
|
|
|
||
|
|
/// Lint the Markdown output of every available test document against basic GFM rules.
///
/// Each entry in `TEST_DOCS` whose required feature is enabled and whose file
/// exists on disk is extracted as Markdown and handed to `validate_gfm_basics`.
/// Offending documents are collected (previewing at most five violations each)
/// and reported together in one panic, so a single run surfaces every failing
/// document. Documents for which extraction returns `None` are not counted.
///
/// The test also fails if no document at all could be checked.
#[test]
fn markdown_gfm_lint_validation() {
    // Upper bound on how many violations are quoted per document.
    const MAX_SHOWN: usize = 5;

    if !test_documents_available() {
        eprintln!("Skipping: test_documents not available");
        return;
    }

    let mut tested = 0usize;
    let mut failures: Vec<String> = Vec::new();

    for doc in TEST_DOCS {
        if !feature_available(doc.required_feature) {
            eprintln!(" [SKIP] {} — requires feature '{}'", doc.label, doc.required_feature);
            continue;
        }

        let path = get_test_file_path(doc.relative_path);
        if !path.exists() {
            eprintln!(" [SKIP] {} — file not found: {}", doc.label, path.display());
            continue;
        }

        let md_content = match extract_with_format(&path, OutputFormat::Markdown) {
            Some(content) => content,
            None => continue,
        };
        tested += 1;

        let violations = validate_gfm_basics(&md_content);
        let total = violations.len();
        if total == 0 {
            eprintln!(" [OK] {} — GFM lint clean", doc.label);
            continue;
        }

        // Quote only the first few violations to keep the report readable.
        let preview = violations
            .iter()
            .take(MAX_SHOWN)
            .map(|v| v.as_str())
            .collect::<Vec<_>>()
            .join(", ");
        let overflow_note = if total > MAX_SHOWN {
            format!(" ... and {} more", total - MAX_SHOWN)
        } else {
            String::new()
        };

        failures.push(format!(
            "{}: {} GFM violations: [{}]{}",
            doc.label, total, preview, overflow_note,
        ));
    }

    eprintln!("\n=== GFM lint: tested {} documents ===", tested);

    assert!(
        failures.is_empty(),
        "GFM lint failures ({} documents):\n - {}",
        failures.len(),
        failures.join("\n - ")
    );

    assert!(tested > 0, "Expected at least one document to be tested");
}
|
||
|
|
|
||
|
|
/// Check that Markdown and Djot renderings of a document have similar structure.
///
/// For every entry in `TEST_DOCS` that is available (feature enabled, file
/// present, both extractions succeed), `count_blocks` is run on the Markdown
/// and the Djot output. The per-category counts (headings, paragraphs, table
/// rows, list items, code blocks) must not differ by more than 2; larger gaps
/// are accumulated and reported in a single panic.
///
/// The test also fails if no document at all could be compared.
#[test]
fn structural_block_comparison() {
    if !test_documents_available() {
        eprintln!("Skipping: test_documents not available");
        return;
    }

    let block_types = ["headings", "paragraphs", "table_rows", "list_items", "code_blocks"];
    let tolerance = 2i64;
    let mut tested = 0usize;
    let mut failures: Vec<String> = Vec::new();

    for doc in TEST_DOCS {
        if !feature_available(doc.required_feature) {
            continue;
        }

        let path = get_test_file_path(doc.relative_path);
        if !path.exists() {
            continue;
        }

        // Both extractions are always attempted before either result is inspected.
        let md_content = extract_with_format(&path, OutputFormat::Markdown);
        let djot_content = extract_with_format(&path, OutputFormat::Djot);
        let (md, djot) = match (md_content, djot_content) {
            (Some(md), Some(djot)) => (md, djot),
            _ => continue,
        };

        tested += 1;
        let md_blocks = count_blocks(&md);
        let djot_blocks = count_blocks(&djot);

        eprintln!("\n--- structural blocks: {} ---", doc.label);

        for &block_type in block_types.iter() {
            // A category missing from the map counts as zero occurrences.
            let md_count = md_blocks.get(block_type).copied().unwrap_or(0) as i64;
            let djot_count = djot_blocks.get(block_type).copied().unwrap_or(0) as i64;
            let diff = (md_count - djot_count).abs();

            eprintln!(" {}: md={}, djot={}, diff={}", block_type, md_count, djot_count, diff);

            if diff > tolerance {
                failures.push(format!(
                    "{}: {} count differs by {} (md={}, djot={}, tolerance={})",
                    doc.label, block_type, diff, md_count, djot_count, tolerance
                ));
            }
        }
    }

    eprintln!("\n=== Structural blocks: tested {} documents ===", tested);

    assert!(
        failures.is_empty(),
        "Structural block comparison failures:\n - {}",
        failures.join("\n - ")
    );

    assert!(tested > 0, "Expected at least one document to be tested");
}
|
||
|
|
|
||
|
|
/// Verify that markdown output does not contain escaped brackets outside code.
///
/// Extracts a small set of documents known to contain links/brackets (see
/// `bracket_docs` in the body) as Markdown and checks that the output has no
/// `\[` or `\]` outside fenced code blocks and inline code spans. Escaped
/// brackets break rendering in most markdown viewers and are a sign of
/// incorrect link/bracket handling in the extraction pipeline.
///
/// Only the first offending line of each document is reported. All offending
/// documents are listed together in a single panic. The test also fails if no
/// document at all could be checked.
#[test]
fn no_escaped_brackets_in_markdown() {
    if !test_documents_available() {
        eprintln!("Skipping: test_documents not available");
        return;
    }

    // Documents known to have links/brackets in the source.
    // We use extraction_test.md and readme.md which are well-formed and
    // contain links. comprehensive.md is excluded because it contains
    // intentional edge cases that may not round-trip cleanly.
    let bracket_docs: &[(&str, &str)] = &[
        ("markdown/extraction_test.md", ""),
        ("markdown/readme.md", ""),
        #[cfg(feature = "office")]
        ("rst/readme.rst", "office"),
        #[cfg(feature = "office")]
        ("epub/wasteland.epub", "office"),
    ];

    let mut tested = 0usize;
    let mut failures: Vec<String> = Vec::new();

    for &(relative_path, required_feature) in bracket_docs {
        if !feature_available(required_feature) {
            eprintln!(" [SKIP] {} — requires feature '{}'", relative_path, required_feature);
            continue;
        }

        let path = get_test_file_path(relative_path);
        if !path.exists() {
            eprintln!(" [SKIP] {} — file not found", relative_path);
            continue;
        }

        if let Some(md_content) = extract_with_format(&path, OutputFormat::Markdown) {
            tested += 1;

            // First (1-based line number, trimmed text) containing an escaped
            // bracket outside of code, if any. Tracked per document so the
            // [OK] decision does not depend on the contents of `failures`.
            let mut first_offender: Option<(usize, &str)> = None;
            let mut in_fenced_block = false;

            for (i, line) in md_content.lines().enumerate() {
                let trimmed = line.trim();

                // A fence line toggles code-block state and is itself never checked.
                if trimmed.starts_with("```") {
                    in_fenced_block = !in_fenced_block;
                    continue;
                }
                if in_fenced_block {
                    continue;
                }

                // Inline code spans may legitimately contain backslash-escaped brackets.
                let without_code = strip_inline_code(line);
                if without_code.contains("\\[") || without_code.contains("\\]") {
                    first_offender = Some((i + 1, trimmed));
                    // Only report first occurrence per document
                    break;
                }
            }

            match first_offender {
                Some((line_no, text)) => failures.push(format!(
                    "{} line {}: escaped bracket found: '{}'",
                    relative_path, line_no, text
                )),
                None => eprintln!(" [OK] {} — no escaped brackets", relative_path),
            }
        }
    }

    eprintln!("\n=== Bracket escaping: tested {} documents ===", tested);

    if !failures.is_empty() {
        panic!("Escaped bracket violations:\n - {}", failures.join("\n - "));
    }

    assert!(tested > 0, "Expected at least one document to be tested");
}
|
||
|
|
|
||
|
|
// ============================================================================
|
||
|
|
// Unit tests for helper functions
|
||
|
|
// ============================================================================
|
||
|
|
|
||
|
|
#[cfg(test)]
mod helper_tests {
    use super::*;

    /// Turn a list of string literals into owned tokens.
    fn owned_tokens(words: &[&str]) -> Vec<String> {
        words.iter().map(|w| w.to_string()).collect()
    }

    /// Run the GFM validator on `md` and tell whether any reported violation
    /// message contains `needle`.
    fn reports_violation(md: &str, needle: &str) -> bool {
        validate_gfm_basics(md).iter().any(|msg| msg.contains(needle))
    }

    // ---- Markup stripping ----

    #[test]
    fn test_strip_markdown_headings() {
        let plain = strip_markdown("# Heading 1\n## Heading 2\nPlain text\n");
        for expected in ["Heading 1", "Heading 2", "Plain text"].iter() {
            assert!(plain.contains(*expected));
        }
        assert!(!plain.contains('#'));
    }

    #[test]
    fn test_strip_markdown_links() {
        let plain = strip_markdown("See [link text](https://example.com) for details.\n");
        // The link label survives; the target and the bracket syntax do not.
        assert!(plain.contains("link text"));
        assert!(!plain.contains("https://example.com"));
        for bracket in ['[', ']'].iter() {
            assert!(!plain.contains(*bracket));
        }
    }

    #[test]
    fn test_strip_markdown_bold_italic() {
        let plain = strip_markdown("This is **bold** and *italic* text.\n");
        for expected in ["bold", "italic"].iter() {
            assert!(plain.contains(*expected));
        }
    }

    #[test]
    fn test_strip_markdown_list() {
        let plain = strip_markdown("- item one\n* item two\n1. item three\n");
        for expected in ["item one", "item two", "item three"].iter() {
            assert!(plain.contains(*expected));
        }
    }

    #[test]
    fn test_strip_html_tags() {
        let plain = strip_html("<h1>Title</h1><p>Hello & goodbye</p>");
        for expected in ["Title", "Hello & goodbye"].iter() {
            assert!(plain.contains(*expected));
        }
        for angle in ['<', '>'].iter() {
            assert!(!plain.contains(*angle));
        }
    }

    #[test]
    fn test_strip_html_numeric_entity() {
        // NOTE(review): as written the input holds no numeric character
        // reference, so this only checks that plain text passes through.
        // Confirm whether the literal was meant to contain entities.
        let plain = strip_html("AAB");
        assert!(plain.contains("AAB"));
    }

    #[test]
    fn test_strip_images() {
        let plain = strip_images("Before ![alt text](image.png) after");
        // Alt text is kept, the image target is dropped.
        assert!(plain.contains("alt text"));
        assert!(!plain.contains("image.png"));
    }

    // ---- Tokenization and F1 ----

    #[test]
    fn test_tokenize() {
        let tokens = tokenize("Hello, World! This is a TEST.");
        for word in ["hello", "world", "test"].iter() {
            assert!(tokens.contains(&(*word).to_string()));
        }
    }

    #[test]
    fn test_token_f1_identical() {
        let tokens = owned_tokens(&["hello", "world"]);
        let score = token_f1(&tokens, &tokens);
        assert!((score - 1.0).abs() < f64::EPSILON);
    }

    #[test]
    fn test_token_f1_no_overlap() {
        let left = owned_tokens(&["hello"]);
        let right = owned_tokens(&["world"]);
        let score = token_f1(&left, &right);
        assert!(score.abs() < f64::EPSILON);
    }

    #[test]
    fn test_token_f1_partial_overlap() {
        let left = owned_tokens(&["the", "quick", "brown", "fox"]);
        let right = owned_tokens(&["the", "quick", "red", "fox"]);
        let score = token_f1(&left, &right);
        // 3 of 4 tokens shared on each side: precision = recall = F1 = 3/4.
        assert!((score - 0.75).abs() < 0.01);
    }

    #[test]
    fn test_token_f1_empty() {
        let empty: Vec<String> = Vec::new();
        let both_empty = token_f1(&empty, &empty);
        assert!((both_empty - 1.0).abs() < f64::EPSILON);
        let one_empty = token_f1(&empty, &["a".to_string()]);
        assert!(one_empty.abs() < f64::EPSILON);
    }

    // ---- GFM validation unit tests ----

    #[test]
    fn test_gfm_trailing_whitespace() {
        assert!(reports_violation("Hello world \nNext line\n", "trailing whitespace"));
    }

    #[test]
    fn test_gfm_no_trailing_newline() {
        assert!(reports_violation("Hello world", "does not end with a newline"));
    }

    #[test]
    fn test_gfm_multiple_trailing_newlines() {
        assert!(reports_violation("Hello world\n\n", "multiple trailing newlines"));
    }

    #[test]
    fn test_gfm_setext_heading() {
        assert!(reports_violation("Title\n=====\n", "setext heading"));
    }

    #[test]
    fn test_gfm_missing_blank_before_heading() {
        assert!(reports_violation(
            "Some text\n# Heading\n",
            "missing blank line before heading"
        ));
    }

    #[test]
    fn test_gfm_escaped_brackets() {
        assert!(reports_violation("Text with \\[escaped\\] brackets\n", "escaped bracket"));
    }

    #[test]
    fn test_gfm_escaped_brackets_in_code_ok() {
        // Escapes inside an inline code span must not be flagged.
        assert!(!reports_violation("Text with `\\[code\\]` is fine\n", "escaped bracket"));
    }

    #[test]
    fn test_gfm_indented_code_fence() {
        assert!(reports_violation(" ```rust\ncode\n```\n", "indented fenced code block"));
    }

    #[test]
    fn test_gfm_valid_markdown() {
        let violations =
            validate_gfm_basics("# Heading\n\nSome text here.\n\n## Sub heading\n\nMore text.\n");
        assert!(violations.is_empty(), "Expected no violations, got: {:?}", violations);
    }

    #[test]
    fn test_gfm_valid_table() {
        let violations =
            validate_gfm_basics("# Table\n\n| Header | Col |\n| --- | --- |\n| A | B |\n");
        assert!(
            violations.iter().all(|v| !v.contains("table header")),
            "Valid table flagged: {:?}",
            violations
        );
    }

    #[test]
    fn test_gfm_table_missing_separator() {
        assert!(reports_violation(
            "# Table\n\n| Header | Col |\n| A | B |\n",
            "separator row"
        ));
    }

    // ---- Block counting unit tests ----

    #[test]
    fn test_count_blocks_headings() {
        let counts = count_blocks("# H1\n\n## H2\n\n### H3\n\nSome text.\n");
        assert_eq!(counts.get("headings").copied().unwrap_or(0), 3);
    }

    #[test]
    fn test_count_blocks_list_items() {
        let counts = count_blocks("- one\n- two\n- three\n");
        assert_eq!(counts.get("list_items").copied().unwrap_or(0), 3);
    }

    #[test]
    fn test_count_blocks_code_blocks() {
        let counts = count_blocks("```rust\nfn main() {}\n```\n\n```\nplain\n```\n");
        assert_eq!(counts.get("code_blocks").copied().unwrap_or(0), 2);
    }

    #[test]
    fn test_count_blocks_table_rows() {
        let counts = count_blocks("| A | B |\n| --- | --- |\n| 1 | 2 |\n| 3 | 4 |\n");
        // Header row + 2 data rows = 3 (separator excluded)
        assert_eq!(counts.get("table_rows").copied().unwrap_or(0), 3);
    }

    // ---- is_table_separator / strip_inline_code unit tests ----

    #[test]
    fn test_is_table_separator() {
        for separator in ["| --- | --- |", "|---|---|", "| :---: | ---: |"].iter() {
            assert!(is_table_separator(*separator));
        }
        assert!(!is_table_separator("| data | here |"));
    }

    #[test]
    fn test_strip_inline_code() {
        let cases = [
            ("hello `world` foo", "hello foo"),
            ("no code here", "no code here"),
            ("`all code`", ""),
        ];
        for &(input, expected) in cases.iter() {
            assert_eq!(strip_inline_code(input), expected);
        }
    }
}
|