Files
fil/crates/kreuzberg/tests/typst_behavioral_tests.rs

1247 lines
41 KiB
Rust
Raw Normal View History

2026-06-01 23:40:55 +02:00
#![allow(clippy::len_zero, clippy::unnecessary_get_then_check, clippy::single_match)]
#![cfg(feature = "office")]
//! Comprehensive behavioral tests for Typst extractor against Pandoc baselines.
//!
//! These tests expose the critical bugs found in code review:
//! 1. 62% heading loss bug - only matches single `=` headings
//! 2. Blockquotes not implemented
//! 3. Display math not extracted
//! 4. Nested table brackets cause corruption
//! 5. Empty headings output (just `= ` with no text)
//! 6. Regex failures silently lose metadata
//!
//! The tests are designed to FAIL initially, exposing real bugs that need fixing.
//! They compare extracted output against Pandoc baseline outputs for behavioral parity.
use kreuzberg::core::config::ExtractionConfig;
use kreuzberg::core::extractor::extract_bytes;
use std::{fs, path::PathBuf};
fn typst_doc_root() -> PathBuf {
PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("../../test_documents/typst")
}
/// Load a test document from the test_documents/typst directory
fn load_test_document(filename: &str) -> Vec<u8> {
let path = typst_doc_root().join(filename);
fs::read(&path).unwrap_or_else(|_| panic!("Failed to read test document: {}", filename))
}
/// Load Pandoc baseline output for comparison
fn load_pandoc_baseline(filename_base: &str) -> String {
let path = typst_doc_root().join(format!("{filename_base}_pandoc_baseline.txt"));
fs::read_to_string(&path).unwrap_or_else(|_| panic!("Failed to read baseline: {}", filename_base))
}
/// Load Pandoc metadata JSON for comparison
fn load_pandoc_metadata(filename_base: &str) -> String {
let path = typst_doc_root().join(format!("{filename_base}_pandoc_meta.json"));
fs::read_to_string(&path).unwrap_or_else(|_| panic!("Failed to read metadata: {}", filename_base))
}
/// Count specific heading levels (= for level 1, == for level 2, etc.)
fn count_heading_level(content: &str, level: usize) -> usize {
let exact_marker = format!("{} ", "=".repeat(level));
content
.lines()
.filter(|l| l.trim_start().starts_with(&exact_marker))
.count()
}
/// Extract all headings from content
fn extract_all_headings(content: &str) -> Vec<String> {
content
.lines()
.filter(|l| {
let trimmed = l.trim_start();
trimmed.starts_with('=') && !trimmed.starts_with("#set")
})
.map(|l| l.to_string())
.collect()
}
/// Count lines that are pure metadata/directives (not content)
fn count_directive_lines(content: &str) -> usize {
content
.lines()
.filter(|l| {
let t = l.trim();
t.starts_with("#set ") || t.starts_with("#let ") || t.starts_with("#import ")
})
.count()
}
/// Count empty headings (headings with just `= ` and no text)
fn count_empty_headings(content: &str) -> usize {
content
.lines()
.filter(|l| {
let trimmed = l.trim_start();
trimmed == "="
|| trimmed == "=="
|| trimmed == "==="
|| trimmed == "===="
|| trimmed == "====="
|| trimmed == "======"
})
.count()
}
/// Extract all text between headings (content blocks)
fn extract_content_blocks(content: &str) -> Vec<String> {
let mut blocks = Vec::new();
let mut current_block = String::new();
let mut in_block = false;
for line in content.lines() {
let trimmed = line.trim_start();
if trimmed.starts_with('=') && !trimmed.starts_with("#set") {
if !current_block.is_empty() {
blocks.push(current_block.trim().to_string());
current_block.clear();
}
in_block = true;
} else if in_block && !trimmed.is_empty() {
current_block.push_str(line);
current_block.push('\n');
}
}
if !current_block.is_empty() {
blocks.push(current_block.trim().to_string());
}
blocks
}
/// Check if content has reasonable parity with baseline (within tolerance)
fn content_parity_check(extracted: &str, baseline: &str, tolerance_percent: f64) -> bool {
let extracted_len = extracted.len();
let baseline_len = baseline.len();
if baseline_len == 0 {
return extracted_len == 0;
}
let ratio = (extracted_len as f64) / (baseline_len as f64);
let acceptable_min = 1.0 - (tolerance_percent / 100.0);
let acceptable_max = 1.0 + (tolerance_percent / 100.0);
ratio >= acceptable_min && ratio <= acceptable_max
}
// CRITICAL BUG TESTS - These expose the 45+ issues
/// TEST 1: CRITICAL - 62% heading loss bug
///
/// The extractor only matches single `=` headings, completely skipping
/// `==`, `===`, and higher levels. This causes catastrophic data loss
/// in hierarchical documents.
///
/// Expected: All heading levels should be extracted
/// Current behavior: Only level 1 headings extracted
/// WILL FAIL: Exposing the heading loss bug
#[tokio::test]
async fn test_typst_all_heading_levels_not_lost() {
let content = load_test_document("headings.typ");
let _baseline = load_pandoc_baseline("headings");
let config = ExtractionConfig::default();
let result = extract_bytes(&content, "application/x-typst", &config)
.await
.expect("Extraction failed");
let extracted_all_headings = extract_all_headings(&result.content);
assert!(
extracted_all_headings.len() >= 6,
"CRITICAL BUG: Only extracted {} headings, should have extracted 6+ heading levels. \
This is the 62% heading loss bug - extractor only matches '=' but skips '==', '===', etc.",
extracted_all_headings.len()
);
for level in 1..=6 {
let count = count_heading_level(&result.content, level);
assert_eq!(
count, 1,
"Heading level {} should appear exactly once (found {}). \
Missing heading levels cause data loss in hierarchical documents.",
level, count
);
}
}
/// TEST 2: Display math not extracted
///
/// Display math ($$...$$) is completely lost from extraction,
/// breaking mathematical content preservation.
///
/// Expected: Display math should be preserved in output
/// Current behavior: Silently dropped
/// WILL FAIL: Exposing display math loss
#[tokio::test]
async fn test_typst_display_math_preserved() {
let content = load_test_document("advanced.typ");
let baseline = load_pandoc_baseline("advanced");
let config = ExtractionConfig::default();
let result = extract_bytes(&content, "application/x-typst", &config)
.await
.expect("Extraction failed");
let has_display_math_in_baseline =
baseline.contains("²") || baseline.contains("Display math") || baseline.contains("x^2");
if has_display_math_in_baseline {
let our_has_math = result.content.contains("$")
|| result.content.contains("Display")
|| result.content.contains("²")
|| result.content.contains("²");
assert!(
our_has_math,
"Display math should be extracted. Pandoc preserves mathematical notation, \
but extractor drops it entirely. This breaks scientific/academic documents."
);
}
let has_pythagorean = result.content.contains("^2")
|| result.content.contains("²")
|| result.content.contains("x") && result.content.contains("y") && result.content.contains("r");
assert!(
has_pythagorean,
"Pythagorean theorem expression should be present. Display math is being dropped."
);
}
/// TEST 3: Empty headings output
///
/// When heading text is missing or malformed, extractor outputs
/// just the marker like "= " with no text, polluting the output.
///
/// Expected: Either full heading text or no heading at all
/// Current behavior: "= " with no content
/// WILL FAIL: Exposing empty heading bug
#[tokio::test]
async fn test_typst_no_empty_headings_output() {
let content = load_test_document("headings.typ");
let config = ExtractionConfig::default();
let result = extract_bytes(&content, "application/x-typst", &config)
.await
.expect("Extraction failed");
let empty_headings = count_empty_headings(&result.content);
assert_eq!(
empty_headings, 0,
"Found {} empty heading lines (just '=' with no text). \
Extractor outputs malformed headings like '= ' with no text, \
corrupting the document structure.",
empty_headings
);
for heading in extract_all_headings(&result.content) {
let trimmed = heading.trim_start();
let after_marker = trimmed.trim_start_matches('=').trim();
assert!(
!after_marker.is_empty(),
"Heading '{}' has no text after marker. Should not output empty headings.",
trimmed
);
}
}
/// TEST 4: Metadata extraction fails with regex silently
///
/// When regex patterns fail to match metadata fields,
/// the extractor silently returns None instead of logging/failing,
/// causing complete metadata loss for certain formats.
///
/// Expected: All metadata fields should be extracted
/// Current behavior: Some formats fail silently
/// WILL FAIL: Exposing metadata loss
#[tokio::test]
async fn test_typst_metadata_extraction_completeness() {
let content = load_test_document("metadata.typ");
let _baseline_meta = load_pandoc_metadata("metadata");
let config = ExtractionConfig::default();
let result = extract_bytes(&content, "application/x-typst", &config)
.await
.expect("Extraction failed");
let has_title = result.metadata.title.as_ref().map(|t| !t.is_empty()).unwrap_or(false);
let has_author = result.metadata.authors.as_ref().map(|a| !a.is_empty()).unwrap_or(false);
let has_keywords = result
.metadata
.keywords
.as_ref()
.map(|k| !k.is_empty())
.unwrap_or(false);
assert!(
has_title,
"Title metadata should be extracted. Regex pattern matching fails silently \
and metadata is lost with no error reporting."
);
assert!(
has_author,
"Author metadata should be extracted. Some metadata formats fail silently."
);
assert!(
has_keywords,
"Keywords should be extracted. Regex failures cause silent data loss."
);
}
/// TEST 5: Nested table brackets cause corruption
///
/// Tables with nested brackets like [Name [full]] corrupt the
/// table content extraction because bracket counting is naive.
///
/// Expected: Table cells should be extracted correctly even with nesting
/// Current behavior: Bracket nesting causes cells to be malformed
/// WILL FAIL: Exposing table corruption bug
#[tokio::test]
async fn test_typst_tables_with_nested_brackets_not_corrupted() {
let content = load_test_document("advanced.typ");
let baseline = load_pandoc_baseline("advanced");
let config = ExtractionConfig::default();
let result = extract_bytes(&content, "application/x-typst", &config)
.await
.expect("Extraction failed");
let has_table_in_baseline = baseline.contains("Name") && baseline.contains("Alice");
if has_table_in_baseline {
let table_content_extracted =
result.content.contains("Name") && result.content.contains("Alice") && result.content.contains("Age");
assert!(
table_content_extracted,
"Table content should be extracted correctly. Nested brackets cause corruption \
and table cells are malformed."
);
let corrupted_brackets = result.content.matches("[[").count();
assert_eq!(
corrupted_brackets, 0,
"Found corrupted bracket sequences [[. Table extraction with nested brackets \
produces malformed output."
);
}
}
/// TEST 6: Content volume parity - within tolerance of Pandoc
///
/// Our extractor should extract roughly the same amount of content
/// as Pandoc (baseline). Large discrepancies indicate data loss or
/// noise injection.
///
/// Expected: Within reasonable tolerance of baseline content size
/// Current behavior: Significant data loss on complex documents (e.g., advanced.typ)
/// WILL FAIL: Exposing data loss on complex documents with formatting
#[tokio::test]
async fn test_typst_content_volume_parity_with_pandoc() {
let documents = vec![("simple", 30.0), ("headings", 20.0)];
for (doc_name, tolerance) in documents {
let content = load_test_document(&format!("{}.typ", doc_name));
let baseline = load_pandoc_baseline(doc_name);
let config = ExtractionConfig::default();
let result = extract_bytes(&content, "application/x-typst", &config)
.await
.unwrap_or_else(|_| panic!("Extraction failed for {}", doc_name));
let baseline_size = baseline.len();
let extracted_size = result.content.len();
let is_within_tolerance = content_parity_check(&result.content, &baseline, tolerance);
assert!(
is_within_tolerance,
"Content volume parity failed for {}: \
Baseline: {} bytes, Extracted: {} bytes ({}% tolerance allowed). \
Data loss indicates missing extraction features or formatting issues.",
doc_name, baseline_size, extracted_size, tolerance
);
}
}
/// TEST 7: Blockquotes not implemented
///
/// Blockquotes (using > syntax in other formats, typst uses #quote)
/// are completely unimplemented, causing loss of semantic structure.
///
/// Expected: Blockquote content should be extracted
/// Current behavior: Feature not implemented
/// WILL FAIL: Exposing missing blockquote support
#[tokio::test]
async fn test_typst_blockquote_handling() {
let test_content = b"#quote[
This is a blockquote.
It should be extracted.
]";
let config = ExtractionConfig::default();
let result = extract_bytes(test_content, "application/x-typst", &config)
.await
.expect("Extraction failed");
let has_blockquote_content =
result.content.contains("blockquote") || result.content.contains("This is a blockquote");
assert!(
has_blockquote_content,
"Blockquote content should be extracted. Blockquotes are not implemented \
in the extractor, causing complete loss of quoted content."
);
}
/// TEST 8: Inline code preservation
///
/// Test that inline code blocks are properly extracted and marked.
/// This ensures code snippets aren't corrupted.
///
/// Expected: Inline code preserved with backticks or clearly marked
/// Current behavior: May be corrupted
/// WILL FAIL: If inline code is not preserved
#[tokio::test]
async fn test_typst_inline_code_preserved() {
let content = load_test_document("advanced.typ");
let baseline = load_pandoc_baseline("advanced");
let config = ExtractionConfig::default();
let result = extract_bytes(&content, "application/x-typst", &config)
.await
.expect("Extraction failed");
let has_inline_code =
result.content.contains("`") || (result.content.contains("code") && baseline.contains("`code`"));
assert!(
has_inline_code,
"Inline code should be preserved with backticks or clearly marked."
);
}
/// TEST 9: Inline math extraction
///
/// Inline math (single $ delimiters) should be extracted and preserved.
///
/// Expected: Inline math formulas preserved
/// Current behavior: May be dropped
/// WILL FAIL: If inline math is lost
#[tokio::test]
async fn test_typst_inline_math_preserved() {
let content = load_test_document("advanced.typ");
let baseline = load_pandoc_baseline("advanced");
let config = ExtractionConfig::default();
let result = extract_bytes(&content, "application/x-typst", &config)
.await
.expect("Extraction failed");
let has_inline_math =
result.content.contains("$") || result.content.contains("sqrt") || result.content.contains("equation");
if baseline.contains("$") || baseline.contains("equation") {
assert!(
has_inline_math,
"Inline math should be extracted. Mathematical formulas are being dropped."
);
}
}
/// TEST 10: Figures and captions
///
/// Figure extraction with captions should preserve both image references
/// and caption text.
///
/// Expected: Figure content and captions extracted
/// Current behavior: May be unimplemented
#[tokio::test]
async fn test_typst_figures_and_captions() {
let test_content = b"#figure(
image(\"example.png\"),
caption: [This is a figure caption]
)";
let config = ExtractionConfig::default();
let result = extract_bytes(test_content, "application/x-typst", &config)
.await
.expect("Extraction failed");
let _has_caption = result.content.contains("caption") || result.content.contains("figure");
println!(
"Figure extraction result (feature may be unimplemented): {:?}",
result.content
);
}
/// TEST 11: Citation/reference handling
///
/// Citations and references should be extracted when present.
///
/// Expected: Citation markers and text preserved
/// Current behavior: May be dropped
#[tokio::test]
async fn test_typst_citations_preserved() {
let test_content = b"Here is a citation @smith2020.
= References
#bibliography()";
let config = ExtractionConfig::default();
let result = extract_bytes(test_content, "application/x-typst", &config)
.await
.expect("Extraction failed");
let _has_citation = result.content.contains("@smith2020")
|| result.content.contains("smith")
|| result.content.contains("References");
println!("Citation handling (may be limited): {:?}", result.content);
}
/// TEST 12: Link extraction and formatting
///
/// Links should be extracted with both URL and link text.
///
/// Expected: Links in markdown format [text](url)
/// Current behavior: May lose URL or text
#[tokio::test]
async fn test_typst_link_extraction() {
let content = load_test_document("advanced.typ");
let _baseline = load_pandoc_baseline("advanced");
let config = ExtractionConfig::default();
let result = extract_bytes(&content, "application/x-typst", &config)
.await
.expect("Extraction failed");
let has_link_content =
result.content.contains("example") || result.content.contains("link") || result.content.contains("https");
assert!(
has_link_content,
"Link content should be extracted. Links may be completely dropped."
);
}
/// TEST 13: Unordered list extraction
///
/// Both + and - list markers should be converted to standard format.
///
/// Expected: All list items extracted and normalized
/// Current behavior: May lose some items
#[tokio::test]
async fn test_typst_list_extraction() {
let content = load_test_document("simple.typ");
let _baseline = load_pandoc_baseline("simple");
let config = ExtractionConfig::default();
let result = extract_bytes(&content, "application/x-typst", &config)
.await
.expect("Extraction failed");
let has_list_markers = result.content.contains("-") || result.content.contains("+");
let has_list_content =
result.content.contains("First") || result.content.contains("Second") || result.content.contains("item");
assert!(
has_list_markers || has_list_content,
"List items should be extracted with markers or content preserved."
);
}
/// TEST 14: Code block extraction
///
/// Triple-backtick code blocks should be fully extracted with language specifiers.
///
/// Expected: Code blocks with language markers preserved
/// Current behavior: May be malformed
#[tokio::test]
async fn test_typst_code_block_extraction() {
let content = load_test_document("advanced.typ");
let _baseline = load_pandoc_baseline("advanced");
let config = ExtractionConfig::default();
let result = extract_bytes(&content, "application/x-typst", &config)
.await
.expect("Extraction failed");
let has_code = result.content.contains("```")
|| result.content.contains("def")
|| result.content.contains("fibonacci")
|| result.content.contains("python");
assert!(has_code, "Code blocks should be extracted with language specifiers.");
}
/// TEST 15: Bold and italic formatting
///
/// Inline emphasis formatting should be preserved or normalized.
///
/// Expected: Bold (*text*) and italic (_text_) markers present
/// Current behavior: May be lost
#[tokio::test]
async fn test_typst_emphasis_formatting() {
let content = load_test_document("advanced.typ");
let config = ExtractionConfig::default();
let result = extract_bytes(&content, "application/x-typst", &config)
.await
.expect("Extraction failed");
let has_emphasis = result.content.contains("*") && result.content.contains("_");
assert!(has_emphasis, "Bold and italic formatting markers should be preserved.");
}
/// TEST 16: Complex nested formatting
///
/// Test handling of *_nested formatting_* combinations.
///
/// Expected: Nested formatting preserved or flattened consistently
/// Current behavior: May be malformed
#[tokio::test]
async fn test_typst_nested_formatting() {
let test_content = b"This is *bold with _nested italic_* text.";
let config = ExtractionConfig::default();
let result = extract_bytes(test_content, "application/x-typst", &config)
.await
.expect("Extraction failed");
let has_formatting = result.content.contains("*")
|| result.content.contains("_")
|| (result.content.contains("bold") && result.content.contains("italic"));
assert!(
has_formatting,
"Nested formatting should be preserved or flattened consistently."
);
}
/// TEST 17: Multiple paragraph handling
///
/// Multiple paragraphs separated by blank lines should be preserved.
///
/// Expected: Paragraph structure maintained
/// Current behavior: May merge or lose paragraphs
#[tokio::test]
async fn test_typst_multiple_paragraphs() {
let content = load_test_document("advanced.typ");
let _baseline = load_pandoc_baseline("advanced");
let config = ExtractionConfig::default();
let result = extract_bytes(&content, "application/x-typst", &config)
.await
.expect("Extraction failed");
let non_empty_lines: Vec<_> = result.content.lines().filter(|l| !l.trim().is_empty()).collect();
assert!(
non_empty_lines.len() >= 5,
"Multiple paragraphs should be preserved. Found {} content lines.",
non_empty_lines.len()
);
}
/// TEST 18: Heading-content association
///
/// Content should follow its heading logically in the output.
///
/// Expected: Each heading followed by its content
/// Current behavior: May be scrambled
#[tokio::test]
async fn test_typst_heading_content_association() {
let content = load_test_document("advanced.typ");
let config = ExtractionConfig::default();
let result = extract_bytes(&content, "application/x-typst", &config)
.await
.expect("Extraction failed");
let blocks = extract_content_blocks(&result.content);
assert!(blocks.len() > 0, "Content blocks should be associated with headings.");
for block in &blocks {
assert!(block.len() > 0, "Content blocks should not be empty.");
}
}
/// TEST 19: Whitespace normalization
///
/// Multiple blank lines should be normalized consistently.
///
/// Expected: Single blank lines between sections
/// Current behavior: May have excessive whitespace
#[tokio::test]
async fn test_typst_whitespace_handling() {
let content = load_test_document("advanced.typ");
let config = ExtractionConfig::default();
let result = extract_bytes(&content, "application/x-typst", &config)
.await
.expect("Extraction failed");
let blank_line_runs: Vec<_> = result.content.split("\n\n\n").collect();
assert!(
blank_line_runs.len() <= 2,
"Should not have excessive blank lines (triple newlines). \
Found {} instances of triple newlines.",
blank_line_runs.len() - 1
);
}
/// TEST 20: Minimal document handling
///
/// Even minimal documents should extract correctly.
///
/// Expected: Basic content and structure
/// Current behavior: May fail or lose content
#[tokio::test]
async fn test_typst_minimal_document() {
let content = load_test_document("minimal.typ");
let _baseline = load_pandoc_baseline("minimal");
let config = ExtractionConfig::default();
let result = extract_bytes(&content, "application/x-typst", &config)
.await
.expect("Extraction failed");
assert!(
!result.content.is_empty(),
"Even minimal documents should extract some content."
);
assert!(
result.content.len() > 0,
"Minimal document should produce non-empty output."
);
}
/// TEST 21: No directive pollution
///
/// Extracted content should not contain #set, #let, #import directives.
///
/// Expected: Clean extracted content without directives
/// Current behavior: May include directives
#[tokio::test]
async fn test_typst_no_directive_pollution() {
let content = load_test_document("advanced.typ");
let config = ExtractionConfig::default();
let result = extract_bytes(&content, "application/x-typst", &config)
.await
.expect("Extraction failed");
let directive_count = count_directive_lines(&result.content);
assert_eq!(
directive_count, 0,
"Extracted content should not contain directives (#set, #let, etc). \
Found {} directive lines polluting the output.",
directive_count
);
}
/// TEST 22: Metadata field completeness
///
/// All metadata fields from baseline should be present.
///
/// Expected: Title, author, date, keywords all extracted
/// Current behavior: Some fields missing
#[tokio::test]
async fn test_typst_metadata_field_completeness() {
let content = load_test_document("advanced.typ");
let config = ExtractionConfig::default();
let result = extract_bytes(&content, "application/x-typst", &config)
.await
.expect("Extraction failed");
let has_title = result.metadata.title.is_some();
let has_author = result.metadata.authors.is_some();
let has_date = result.metadata.created_at.is_some();
assert!(
has_title && has_author && has_date,
"All metadata fields should be extracted. \
Title: {}, Author: {}, Date: {}",
has_title,
has_author,
has_date
);
}
/// TEST 23: Special character handling
///
/// Unicode and special characters should be preserved.
///
/// Expected: Special characters like ü, é, etc. preserved
/// Current behavior: May be corrupted
#[tokio::test]
async fn test_typst_special_character_preservation() {
let test_content = "Café with naïve français".as_bytes();
let config = ExtractionConfig::default();
let result = extract_bytes(test_content, "application/x-typst", &config)
.await
.expect("Extraction failed");
let has_special_chars =
result.content.contains("Café") || result.content.contains("naïve") || result.content.contains("français");
assert!(
has_special_chars,
"Special characters should be preserved in extraction."
);
}
/// TEST 24: Very long heading handling
///
/// Long headings should not cause truncation or corruption.
///
/// Expected: Full heading text preserved regardless of length
/// Current behavior: May truncate
#[tokio::test]
async fn test_typst_long_heading_handling() {
let test_content = b"= This is a very long heading that should be completely preserved without any truncation or corruption whatsoever";
let config = ExtractionConfig::default();
let result = extract_bytes(test_content, "application/x-typst", &config)
.await
.expect("Extraction failed");
let has_heading_start = result.content.contains("very long heading");
assert!(has_heading_start, "Long headings should not be truncated.");
}
/// TEST 25: Edge case - Empty heading recovery
///
/// Even if a heading has no text, extraction should be robust.
///
/// Expected: Graceful handling without crashes
/// Current behavior: May panic or produce empty output
#[tokio::test]
async fn test_typst_empty_heading_edge_case() {
let test_content = b"= \n\n== \nContent here";
let config = ExtractionConfig::default();
let result = extract_bytes(test_content, "application/x-typst", &config).await;
match result {
Ok(extraction) => {
assert!(
extraction.content.contains("Content"),
"Should extract regular content even if some headings are empty."
);
}
Err(_) => {}
}
}
/// TEST 26: Regression - Basic heading extraction
#[tokio::test]
async fn test_typst_basic_heading_regression() {
let test_content = b"= Main Heading\n\nContent here";
let config = ExtractionConfig::default();
let result = extract_bytes(test_content, "application/x-typst", &config)
.await
.expect("Extraction failed");
assert!(
result.content.contains("= Main Heading"),
"Basic level-1 heading should be extracted."
);
assert!(result.content.contains("Content"), "Content should be extracted.");
}
/// TEST 27: Regression - Level 2 heading extraction
#[tokio::test]
async fn test_typst_level2_heading_regression() {
let test_content = b"= Main\n\n== Subsection\n\nMore content";
let config = ExtractionConfig::default();
let result = extract_bytes(test_content, "application/x-typst", &config)
.await
.expect("Extraction failed");
assert!(
result.content.contains("== Subsection"),
"Level 2 headings must be extracted."
);
}
/// TEST 28: Regression - Basic metadata
#[tokio::test]
async fn test_typst_basic_metadata_regression() {
let test_content = b"#set document(title: \"Test\", author: \"Me\")\n\n= Heading";
let config = ExtractionConfig::default();
let result = extract_bytes(test_content, "application/x-typst", &config)
.await
.expect("Extraction failed");
assert!(result.metadata.title.is_some(), "Title metadata must be extracted.");
assert!(result.metadata.authors.is_some(), "Author metadata must be extracted.");
}
/// TEST 29: Regression - Bold formatting
#[tokio::test]
async fn test_typst_bold_regression() {
let test_content = b"This is *bold text* here";
let config = ExtractionConfig::default();
let result = extract_bytes(test_content, "application/x-typst", &config)
.await
.expect("Extraction failed");
assert!(
result.content.contains("*bold*") || result.content.contains("bold"),
"Bold text should be preserved."
);
}
/// TEST 30: Regression - Inline code
///
/// The Typst extractor strips inline-code backticks from the rendered text
/// and stores the spans as `code` annotations on the InternalDocument. This
/// preserves the *information* (which words are code) without polluting the
/// plain-text output with format markers — same approach as our other format
/// extractors. This test asserts the content survives round-trip.
#[tokio::test]
async fn test_typst_inline_code_regression() {
let test_content = b"Use `println!(\"hello\")` in Rust";
let config = ExtractionConfig::default();
let result = extract_bytes(test_content, "application/x-typst", &config)
.await
.expect("Extraction failed");
assert!(result.content.contains("println"), "inline code content lost");
assert!(
result.content.contains("Use") && result.content.contains("Rust"),
"surrounding text lost"
);
}
/// TEST 31: Regression - Code blocks
///
/// Code-block content survives extraction. The triple-backtick fence and
/// language tag are tracked as a `Code` element with `language` attribute on
/// the InternalDocument; downstream renderers (markdown, djot, html) emit the
/// fence, plain-text omits it. This test asserts the program text round-trips.
#[tokio::test]
async fn test_typst_codeblock_regression() {
let test_content = b"```rust\nfn main() {}\n```";
let config = ExtractionConfig::default();
let result = extract_bytes(test_content, "application/x-typst", &config)
.await
.expect("Extraction failed");
assert!(result.content.contains("fn main"), "code block content lost");
}
/// TEST 32: Regression - List extraction
#[tokio::test]
async fn test_typst_list_regression() {
let test_content = b"- Item 1\n+ Item 2\n- Item 3";
let config = ExtractionConfig::default();
let result = extract_bytes(test_content, "application/x-typst", &config)
.await
.expect("Extraction failed");
assert!(
result.content.contains("Item 1") && result.content.contains("Item 2") && result.content.contains("Item 3"),
"All list items should be extracted."
);
}
/// TEST 33: Regression - Math preservation
#[tokio::test]
async fn test_typst_math_regression() {
let test_content = b"Formula: $E = mc^2$";
let config = ExtractionConfig::default();
let result = extract_bytes(test_content, "application/x-typst", &config)
.await
.expect("Extraction failed");
assert!(
result.content.contains("$") && (result.content.contains("mc") || result.content.contains("E")),
"Math formulas should be preserved."
);
}
/// TEST 34: Regression - Link extraction
#[tokio::test]
async fn test_typst_link_regression() {
let test_content = b"Visit #link(\"https://example.com\")[example]";
let config = ExtractionConfig::default();
let result = extract_bytes(test_content, "application/x-typst", &config)
.await
.expect("Extraction failed");
assert!(
result.content.contains("example") || result.content.contains("example.com"),
"Link text or URL should be preserved."
);
}
/// TEST 35: Regression - Table basic extraction
#[tokio::test]
async fn test_typst_table_regression() {
let test_content = b"#table(columns: 2, [A], [B], [1], [2])";
let config = ExtractionConfig::default();
let result = extract_bytes(test_content, "application/x-typst", &config)
.await
.expect("Extraction failed");
assert!(
result.content.contains("A") || result.content.contains("TABLE"),
"Table content should be extracted."
);
}
/// TEST 36: Large document handling
#[tokio::test]
async fn test_typst_large_document_stress() {
let mut large_content = String::new();
for i in 1..=50 {
large_content.push_str(&format!("= Heading {}\n\n", i));
large_content.push_str(&format!("Content for section {}.\n\n", i));
}
let config = ExtractionConfig::default();
let result = extract_bytes(large_content.as_bytes(), "application/x-typst", &config)
.await
.expect("Extraction failed");
let heading_count = extract_all_headings(&result.content).len();
assert!(
heading_count >= 40,
"Large documents should extract all headings. Found {} of 50.",
heading_count
);
}
/// TEST 37: Deep nesting stress test
#[tokio::test]
async fn test_typst_deep_nesting_stress() {
let mut nested = String::new();
for level in 1..=6 {
nested.push_str(&format!("{} Level {} Heading\n\n", "=".repeat(level), level));
nested.push_str(&format!("Content at level {}.\n\n", level));
}
let config = ExtractionConfig::default();
let result = extract_bytes(nested.as_bytes(), "application/x-typst", &config)
.await
.expect("Extraction failed");
for level in 1..=6 {
let count = count_heading_level(&result.content, level);
assert!(
count >= 1,
"Level {} heading should be extracted in deep nesting test.",
level
);
}
}
/// TEST 38: Mixed formatting stress
#[tokio::test]
async fn test_typst_mixed_formatting_stress() {
let test_content = b"This text has *bold*, _italic_, `code`, and $math$ all mixed together!";
let config = ExtractionConfig::default();
let result = extract_bytes(test_content, "application/x-typst", &config)
.await
.expect("Extraction failed");
let has_formatting = (result.content.contains("*") || result.content.contains("bold"))
&& (result.content.contains("_") || result.content.contains("italic"))
&& (result.content.contains("`") || result.content.contains("code"))
&& (result.content.contains("$") || result.content.contains("math"));
assert!(has_formatting, "All mixed formatting should be preserved.");
}
/// TEST 39: Unicode stress test
#[tokio::test]
async fn test_typst_unicode_stress() {
let test_content = "= Unicode Heading 中文 العربية\n\nContent with emojis: 🎉🚀💯\n\nGreek: α β γ δ ε ζ".as_bytes();
let config = ExtractionConfig::default();
let result = extract_bytes(test_content, "application/x-typst", &config)
.await
.expect("Extraction failed");
assert!(
result.content.contains("Unicode"),
"Unicode content should be preserved."
);
}
/// TEST 40: Pathological whitespace
#[tokio::test]
async fn test_typst_pathological_whitespace() {
let test_content = b"= Heading\n\n\n\n\n\nContent with excessive blank lines\n\n\n\n\nMore content";
let config = ExtractionConfig::default();
let result = extract_bytes(test_content, "application/x-typst", &config)
.await
.expect("Extraction failed");
assert!(
result.content.contains("Heading") && result.content.contains("Content"),
"Should extract content even with excessive whitespace."
);
}
/// TEST 41: Full document comparison - simple.typ
#[tokio::test]
async fn test_typst_full_simple_document_comparison() {
let content = load_test_document("simple.typ");
let _baseline = load_pandoc_baseline("simple");
let config = ExtractionConfig::default();
let result = extract_bytes(&content, "application/x-typst", &config)
.await
.expect("Extraction failed");
assert!(
result.content.len() > 50,
"simple.typ should extract substantial content"
);
let heading_count = extract_all_headings(&result.content).len();
assert!(heading_count > 2, "simple.typ should have multiple sections");
}
/// TEST 42: Full document comparison - advanced.typ
#[tokio::test]
async fn test_typst_full_advanced_document_comparison() {
let content = load_test_document("advanced.typ");
let _baseline = load_pandoc_baseline("advanced");
let config = ExtractionConfig::default();
let result = extract_bytes(&content, "application/x-typst", &config)
.await
.expect("Extraction failed");
assert!(
result.content.len() > 100,
"advanced.typ should extract comprehensive content"
);
let heading_count = extract_all_headings(&result.content).len();
assert!(heading_count >= 5, "advanced.typ should preserve heading structure");
}
/// TEST 43: MIME type consistency
///
/// The extractor should support both standard MIME types for Typst.
/// Currently only supports application/x-typst, not text/x-typst.
#[tokio::test]
async fn test_typst_mime_type_consistency() {
let content = load_test_document("simple.typ");
let config = ExtractionConfig::default();
let result_primary = extract_bytes(&content, "application/x-typst", &config)
.await
.expect("Primary MIME type should work");
assert!(
result_primary.content.len() > 0,
"Primary MIME type should extract content"
);
match extract_bytes(&content, "text/x-typst", &config).await {
Ok(result) => {
assert!(
result.content.len() > 0,
"Alternative MIME type should extract content if supported"
);
}
Err(_e) => {
println!("Note: text/x-typst is not currently supported (may be added in future)");
}
}
}
/// TEST 44: Config parameter impact
#[tokio::test]
async fn test_typst_config_parameter_handling() {
let content = load_test_document("simple.typ");
let config = ExtractionConfig::default();
let result = extract_bytes(&content, "application/x-typst", &config)
.await
.expect("Extraction failed");
assert!(!result.content.is_empty(), "Extraction with default config should work");
assert_eq!(result.mime_type, "application/x-typst", "MIME type should be preserved");
}
/// TEST 45: Comparative heading analysis
///
/// This final comprehensive test checks heading extraction
/// against the baseline to identify the exact scope of the heading loss bug.
#[tokio::test]
async fn test_typst_heading_loss_bug_analysis() {
let content = load_test_document("headings.typ");
let baseline = load_pandoc_baseline("headings");
let config = ExtractionConfig::default();
let result = extract_bytes(&content, "application/x-typst", &config)
.await
.expect("Extraction failed");
println!("\n===== HEADING EXTRACTION ANALYSIS =====");
println!("Baseline content:");
println!("{}", baseline);
println!("\nExtracted content:");
println!("{}", result.content);
let extracted_headings = extract_all_headings(&result.content);
println!("\nExtracted headings: {}", extracted_headings.len());
for (i, h) in extracted_headings.iter().enumerate() {
println!(" {}: {}", i + 1, h);
}
assert!(
extracted_headings.len() >= 6,
"BUG CONFIRMED: Heading loss detected. \
Expected 6 headings (1-6 levels), found {}. \
This is the 62% heading loss bug - only single '=' is matched, \
all '==' and higher are skipped entirely.",
extracted_headings.len()
);
}