#![allow(clippy::len_zero, clippy::unnecessary_get_then_check, clippy::single_match)] #![cfg(feature = "office")] //! Comprehensive behavioral tests for Typst extractor against Pandoc baselines. //! //! These tests expose the critical bugs found in code review: //! 1. 62% heading loss bug - only matches single `=` headings //! 2. Blockquotes not implemented //! 3. Display math not extracted //! 4. Nested table brackets cause corruption //! 5. Empty headings output (just `= ` with no text) //! 6. Regex failures silently lose metadata //! //! The tests are designed to FAIL initially, exposing real bugs that need fixing. //! They compare extracted output against Pandoc baseline outputs for behavioral parity. use kreuzberg::core::config::ExtractionConfig; use kreuzberg::core::extractor::extract_bytes; use std::{fs, path::PathBuf}; fn typst_doc_root() -> PathBuf { PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("../../test_documents/typst") } /// Load a test document from the test_documents/typst directory fn load_test_document(filename: &str) -> Vec { let path = typst_doc_root().join(filename); fs::read(&path).unwrap_or_else(|_| panic!("Failed to read test document: {}", filename)) } /// Load Pandoc baseline output for comparison fn load_pandoc_baseline(filename_base: &str) -> String { let path = typst_doc_root().join(format!("{filename_base}_pandoc_baseline.txt")); fs::read_to_string(&path).unwrap_or_else(|_| panic!("Failed to read baseline: {}", filename_base)) } /// Load Pandoc metadata JSON for comparison fn load_pandoc_metadata(filename_base: &str) -> String { let path = typst_doc_root().join(format!("{filename_base}_pandoc_meta.json")); fs::read_to_string(&path).unwrap_or_else(|_| panic!("Failed to read metadata: {}", filename_base)) } /// Count specific heading levels (= for level 1, == for level 2, etc.) 
fn count_heading_level(content: &str, level: usize) -> usize { let exact_marker = format!("{} ", "=".repeat(level)); content .lines() .filter(|l| l.trim_start().starts_with(&exact_marker)) .count() } /// Extract all headings from content fn extract_all_headings(content: &str) -> Vec { content .lines() .filter(|l| { let trimmed = l.trim_start(); trimmed.starts_with('=') && !trimmed.starts_with("#set") }) .map(|l| l.to_string()) .collect() } /// Count lines that are pure metadata/directives (not content) fn count_directive_lines(content: &str) -> usize { content .lines() .filter(|l| { let t = l.trim(); t.starts_with("#set ") || t.starts_with("#let ") || t.starts_with("#import ") }) .count() } /// Count empty headings (headings with just `= ` and no text) fn count_empty_headings(content: &str) -> usize { content .lines() .filter(|l| { let trimmed = l.trim_start(); trimmed == "=" || trimmed == "==" || trimmed == "===" || trimmed == "====" || trimmed == "=====" || trimmed == "======" }) .count() } /// Extract all text between headings (content blocks) fn extract_content_blocks(content: &str) -> Vec { let mut blocks = Vec::new(); let mut current_block = String::new(); let mut in_block = false; for line in content.lines() { let trimmed = line.trim_start(); if trimmed.starts_with('=') && !trimmed.starts_with("#set") { if !current_block.is_empty() { blocks.push(current_block.trim().to_string()); current_block.clear(); } in_block = true; } else if in_block && !trimmed.is_empty() { current_block.push_str(line); current_block.push('\n'); } } if !current_block.is_empty() { blocks.push(current_block.trim().to_string()); } blocks } /// Check if content has reasonable parity with baseline (within tolerance) fn content_parity_check(extracted: &str, baseline: &str, tolerance_percent: f64) -> bool { let extracted_len = extracted.len(); let baseline_len = baseline.len(); if baseline_len == 0 { return extracted_len == 0; } let ratio = (extracted_len as f64) / (baseline_len as f64); let 
acceptable_min = 1.0 - (tolerance_percent / 100.0); let acceptable_max = 1.0 + (tolerance_percent / 100.0); ratio >= acceptable_min && ratio <= acceptable_max } // CRITICAL BUG TESTS - These expose the 45+ issues /// TEST 1: CRITICAL - 62% heading loss bug /// /// The extractor only matches single `=` headings, completely skipping /// `==`, `===`, and higher levels. This causes catastrophic data loss /// in hierarchical documents. /// /// Expected: All heading levels should be extracted /// Current behavior: Only level 1 headings extracted /// WILL FAIL: Exposing the heading loss bug #[tokio::test] async fn test_typst_all_heading_levels_not_lost() { let content = load_test_document("headings.typ"); let _baseline = load_pandoc_baseline("headings"); let config = ExtractionConfig::default(); let result = extract_bytes(&content, "application/x-typst", &config) .await .expect("Extraction failed"); let extracted_all_headings = extract_all_headings(&result.content); assert!( extracted_all_headings.len() >= 6, "CRITICAL BUG: Only extracted {} headings, should have extracted 6+ heading levels. \ This is the 62% heading loss bug - extractor only matches '=' but skips '==', '===', etc.", extracted_all_headings.len() ); for level in 1..=6 { let count = count_heading_level(&result.content, level); assert_eq!( count, 1, "Heading level {} should appear exactly once (found {}). \ Missing heading levels cause data loss in hierarchical documents.", level, count ); } } /// TEST 2: Display math not extracted /// /// Display math ($$...$$) is completely lost from extraction, /// breaking mathematical content preservation. 
/// /// Expected: Display math should be preserved in output /// Current behavior: Silently dropped /// WILL FAIL: Exposing display math loss #[tokio::test] async fn test_typst_display_math_preserved() { let content = load_test_document("advanced.typ"); let baseline = load_pandoc_baseline("advanced"); let config = ExtractionConfig::default(); let result = extract_bytes(&content, "application/x-typst", &config) .await .expect("Extraction failed"); let has_display_math_in_baseline = baseline.contains("²") || baseline.contains("Display math") || baseline.contains("x^2"); if has_display_math_in_baseline { let our_has_math = result.content.contains("$") || result.content.contains("Display") || result.content.contains("²") || result.content.contains("²"); assert!( our_has_math, "Display math should be extracted. Pandoc preserves mathematical notation, \ but extractor drops it entirely. This breaks scientific/academic documents." ); } let has_pythagorean = result.content.contains("^2") || result.content.contains("²") || result.content.contains("x") && result.content.contains("y") && result.content.contains("r"); assert!( has_pythagorean, "Pythagorean theorem expression should be present. Display math is being dropped." ); } /// TEST 3: Empty headings output /// /// When heading text is missing or malformed, extractor outputs /// just the marker like "= " with no text, polluting the output. /// /// Expected: Either full heading text or no heading at all /// Current behavior: "= " with no content /// WILL FAIL: Exposing empty heading bug #[tokio::test] async fn test_typst_no_empty_headings_output() { let content = load_test_document("headings.typ"); let config = ExtractionConfig::default(); let result = extract_bytes(&content, "application/x-typst", &config) .await .expect("Extraction failed"); let empty_headings = count_empty_headings(&result.content); assert_eq!( empty_headings, 0, "Found {} empty heading lines (just '=' with no text). 
\ Extractor outputs malformed headings like '= ' with no text, \ corrupting the document structure.", empty_headings ); for heading in extract_all_headings(&result.content) { let trimmed = heading.trim_start(); let after_marker = trimmed.trim_start_matches('=').trim(); assert!( !after_marker.is_empty(), "Heading '{}' has no text after marker. Should not output empty headings.", trimmed ); } } /// TEST 4: Metadata extraction fails with regex silently /// /// When regex patterns fail to match metadata fields, /// the extractor silently returns None instead of logging/failing, /// causing complete metadata loss for certain formats. /// /// Expected: All metadata fields should be extracted /// Current behavior: Some formats fail silently /// WILL FAIL: Exposing metadata loss #[tokio::test] async fn test_typst_metadata_extraction_completeness() { let content = load_test_document("metadata.typ"); let _baseline_meta = load_pandoc_metadata("metadata"); let config = ExtractionConfig::default(); let result = extract_bytes(&content, "application/x-typst", &config) .await .expect("Extraction failed"); let has_title = result.metadata.title.as_ref().map(|t| !t.is_empty()).unwrap_or(false); let has_author = result.metadata.authors.as_ref().map(|a| !a.is_empty()).unwrap_or(false); let has_keywords = result .metadata .keywords .as_ref() .map(|k| !k.is_empty()) .unwrap_or(false); assert!( has_title, "Title metadata should be extracted. Regex pattern matching fails silently \ and metadata is lost with no error reporting." ); assert!( has_author, "Author metadata should be extracted. Some metadata formats fail silently." ); assert!( has_keywords, "Keywords should be extracted. Regex failures cause silent data loss." ); } /// TEST 5: Nested table brackets cause corruption /// /// Tables with nested brackets like [Name [full]] corrupt the /// table content extraction because bracket counting is naive. 
///
/// Expected: Table cells should be extracted correctly even with nesting
/// Current behavior: Bracket nesting causes cells to be malformed
/// WILL FAIL: Exposing table corruption bug
#[tokio::test]
async fn test_typst_tables_with_nested_brackets_not_corrupted() {
    let content = load_test_document("advanced.typ");
    let baseline = load_pandoc_baseline("advanced");
    let config = ExtractionConfig::default();

    let result = extract_bytes(&content, "application/x-typst", &config)
        .await
        .expect("Extraction failed");

    // The checks below only apply when the baseline actually contains the table.
    let has_table_in_baseline = baseline.contains("Name") && baseline.contains("Alice");
    if has_table_in_baseline {
        let table_content_extracted = result.content.contains("Name")
            && result.content.contains("Alice")
            && result.content.contains("Age");
        assert!(
            table_content_extracted,
            "Table content should be extracted correctly. Nested brackets cause corruption \
             and table cells are malformed."
        );

        // A doubled opening bracket in the output is treated as a corruption marker.
        let corrupted_brackets = result.content.matches("[[").count();
        assert_eq!(
            corrupted_brackets, 0,
            "Found corrupted bracket sequences [[. Table extraction with nested brackets \
             produces malformed output."
        );
    }
}

/// TEST 6: Content volume parity - within tolerance of Pandoc
///
/// Our extractor should extract roughly the same amount of content
/// as Pandoc (baseline). Large discrepancies indicate data loss or
/// noise injection.
/// /// Expected: Within reasonable tolerance of baseline content size /// Current behavior: Significant data loss on complex documents (e.g., advanced.typ) /// WILL FAIL: Exposing data loss on complex documents with formatting #[tokio::test] async fn test_typst_content_volume_parity_with_pandoc() { let documents = vec![("simple", 30.0), ("headings", 20.0)]; for (doc_name, tolerance) in documents { let content = load_test_document(&format!("{}.typ", doc_name)); let baseline = load_pandoc_baseline(doc_name); let config = ExtractionConfig::default(); let result = extract_bytes(&content, "application/x-typst", &config) .await .unwrap_or_else(|_| panic!("Extraction failed for {}", doc_name)); let baseline_size = baseline.len(); let extracted_size = result.content.len(); let is_within_tolerance = content_parity_check(&result.content, &baseline, tolerance); assert!( is_within_tolerance, "Content volume parity failed for {}: \ Baseline: {} bytes, Extracted: {} bytes ({}% tolerance allowed). \ Data loss indicates missing extraction features or formatting issues.", doc_name, baseline_size, extracted_size, tolerance ); } } /// TEST 7: Blockquotes not implemented /// /// Blockquotes (using > syntax in other formats, typst uses #quote) /// are completely unimplemented, causing loss of semantic structure. /// /// Expected: Blockquote content should be extracted /// Current behavior: Feature not implemented /// WILL FAIL: Exposing missing blockquote support #[tokio::test] async fn test_typst_blockquote_handling() { let test_content = b"#quote[ This is a blockquote. It should be extracted. ]"; let config = ExtractionConfig::default(); let result = extract_bytes(test_content, "application/x-typst", &config) .await .expect("Extraction failed"); let has_blockquote_content = result.content.contains("blockquote") || result.content.contains("This is a blockquote"); assert!( has_blockquote_content, "Blockquote content should be extracted. 
Blockquotes are not implemented \ in the extractor, causing complete loss of quoted content." ); } /// TEST 8: Inline code preservation /// /// Test that inline code blocks are properly extracted and marked. /// This ensures code snippets aren't corrupted. /// /// Expected: Inline code preserved with backticks or clearly marked /// Current behavior: May be corrupted /// WILL FAIL: If inline code is not preserved #[tokio::test] async fn test_typst_inline_code_preserved() { let content = load_test_document("advanced.typ"); let baseline = load_pandoc_baseline("advanced"); let config = ExtractionConfig::default(); let result = extract_bytes(&content, "application/x-typst", &config) .await .expect("Extraction failed"); let has_inline_code = result.content.contains("`") || (result.content.contains("code") && baseline.contains("`code`")); assert!( has_inline_code, "Inline code should be preserved with backticks or clearly marked." ); } /// TEST 9: Inline math extraction /// /// Inline math (single $ delimiters) should be extracted and preserved. /// /// Expected: Inline math formulas preserved /// Current behavior: May be dropped /// WILL FAIL: If inline math is lost #[tokio::test] async fn test_typst_inline_math_preserved() { let content = load_test_document("advanced.typ"); let baseline = load_pandoc_baseline("advanced"); let config = ExtractionConfig::default(); let result = extract_bytes(&content, "application/x-typst", &config) .await .expect("Extraction failed"); let has_inline_math = result.content.contains("$") || result.content.contains("sqrt") || result.content.contains("equation"); if baseline.contains("$") || baseline.contains("equation") { assert!( has_inline_math, "Inline math should be extracted. Mathematical formulas are being dropped." ); } } /// TEST 10: Figures and captions /// /// Figure extraction with captions should preserve both image references /// and caption text. 
/// /// Expected: Figure content and captions extracted /// Current behavior: May be unimplemented #[tokio::test] async fn test_typst_figures_and_captions() { let test_content = b"#figure( image(\"example.png\"), caption: [This is a figure caption] )"; let config = ExtractionConfig::default(); let result = extract_bytes(test_content, "application/x-typst", &config) .await .expect("Extraction failed"); let _has_caption = result.content.contains("caption") || result.content.contains("figure"); println!( "Figure extraction result (feature may be unimplemented): {:?}", result.content ); } /// TEST 11: Citation/reference handling /// /// Citations and references should be extracted when present. /// /// Expected: Citation markers and text preserved /// Current behavior: May be dropped #[tokio::test] async fn test_typst_citations_preserved() { let test_content = b"Here is a citation @smith2020. = References #bibliography()"; let config = ExtractionConfig::default(); let result = extract_bytes(test_content, "application/x-typst", &config) .await .expect("Extraction failed"); let _has_citation = result.content.contains("@smith2020") || result.content.contains("smith") || result.content.contains("References"); println!("Citation handling (may be limited): {:?}", result.content); } /// TEST 12: Link extraction and formatting /// /// Links should be extracted with both URL and link text. /// /// Expected: Links in markdown format [text](url) /// Current behavior: May lose URL or text #[tokio::test] async fn test_typst_link_extraction() { let content = load_test_document("advanced.typ"); let _baseline = load_pandoc_baseline("advanced"); let config = ExtractionConfig::default(); let result = extract_bytes(&content, "application/x-typst", &config) .await .expect("Extraction failed"); let has_link_content = result.content.contains("example") || result.content.contains("link") || result.content.contains("https"); assert!( has_link_content, "Link content should be extracted. 
Links may be completely dropped." ); } /// TEST 13: Unordered list extraction /// /// Both + and - list markers should be converted to standard format. /// /// Expected: All list items extracted and normalized /// Current behavior: May lose some items #[tokio::test] async fn test_typst_list_extraction() { let content = load_test_document("simple.typ"); let _baseline = load_pandoc_baseline("simple"); let config = ExtractionConfig::default(); let result = extract_bytes(&content, "application/x-typst", &config) .await .expect("Extraction failed"); let has_list_markers = result.content.contains("-") || result.content.contains("+"); let has_list_content = result.content.contains("First") || result.content.contains("Second") || result.content.contains("item"); assert!( has_list_markers || has_list_content, "List items should be extracted with markers or content preserved." ); } /// TEST 14: Code block extraction /// /// Triple-backtick code blocks should be fully extracted with language specifiers. /// /// Expected: Code blocks with language markers preserved /// Current behavior: May be malformed #[tokio::test] async fn test_typst_code_block_extraction() { let content = load_test_document("advanced.typ"); let _baseline = load_pandoc_baseline("advanced"); let config = ExtractionConfig::default(); let result = extract_bytes(&content, "application/x-typst", &config) .await .expect("Extraction failed"); let has_code = result.content.contains("```") || result.content.contains("def") || result.content.contains("fibonacci") || result.content.contains("python"); assert!(has_code, "Code blocks should be extracted with language specifiers."); } /// TEST 15: Bold and italic formatting /// /// Inline emphasis formatting should be preserved or normalized. 
/// /// Expected: Bold (*text*) and italic (_text_) markers present /// Current behavior: May be lost #[tokio::test] async fn test_typst_emphasis_formatting() { let content = load_test_document("advanced.typ"); let config = ExtractionConfig::default(); let result = extract_bytes(&content, "application/x-typst", &config) .await .expect("Extraction failed"); let has_emphasis = result.content.contains("*") && result.content.contains("_"); assert!(has_emphasis, "Bold and italic formatting markers should be preserved."); } /// TEST 16: Complex nested formatting /// /// Test handling of *_nested formatting_* combinations. /// /// Expected: Nested formatting preserved or flattened consistently /// Current behavior: May be malformed #[tokio::test] async fn test_typst_nested_formatting() { let test_content = b"This is *bold with _nested italic_* text."; let config = ExtractionConfig::default(); let result = extract_bytes(test_content, "application/x-typst", &config) .await .expect("Extraction failed"); let has_formatting = result.content.contains("*") || result.content.contains("_") || (result.content.contains("bold") && result.content.contains("italic")); assert!( has_formatting, "Nested formatting should be preserved or flattened consistently." ); } /// TEST 17: Multiple paragraph handling /// /// Multiple paragraphs separated by blank lines should be preserved. /// /// Expected: Paragraph structure maintained /// Current behavior: May merge or lose paragraphs #[tokio::test] async fn test_typst_multiple_paragraphs() { let content = load_test_document("advanced.typ"); let _baseline = load_pandoc_baseline("advanced"); let config = ExtractionConfig::default(); let result = extract_bytes(&content, "application/x-typst", &config) .await .expect("Extraction failed"); let non_empty_lines: Vec<_> = result.content.lines().filter(|l| !l.trim().is_empty()).collect(); assert!( non_empty_lines.len() >= 5, "Multiple paragraphs should be preserved. 
Found {} content lines.", non_empty_lines.len() ); } /// TEST 18: Heading-content association /// /// Content should follow its heading logically in the output. /// /// Expected: Each heading followed by its content /// Current behavior: May be scrambled #[tokio::test] async fn test_typst_heading_content_association() { let content = load_test_document("advanced.typ"); let config = ExtractionConfig::default(); let result = extract_bytes(&content, "application/x-typst", &config) .await .expect("Extraction failed"); let blocks = extract_content_blocks(&result.content); assert!(blocks.len() > 0, "Content blocks should be associated with headings."); for block in &blocks { assert!(block.len() > 0, "Content blocks should not be empty."); } } /// TEST 19: Whitespace normalization /// /// Multiple blank lines should be normalized consistently. /// /// Expected: Single blank lines between sections /// Current behavior: May have excessive whitespace #[tokio::test] async fn test_typst_whitespace_handling() { let content = load_test_document("advanced.typ"); let config = ExtractionConfig::default(); let result = extract_bytes(&content, "application/x-typst", &config) .await .expect("Extraction failed"); let blank_line_runs: Vec<_> = result.content.split("\n\n\n").collect(); assert!( blank_line_runs.len() <= 2, "Should not have excessive blank lines (triple newlines). \ Found {} instances of triple newlines.", blank_line_runs.len() - 1 ); } /// TEST 20: Minimal document handling /// /// Even minimal documents should extract correctly. 
///
/// Expected: Basic content and structure
/// Current behavior: May fail or lose content
#[tokio::test]
async fn test_typst_minimal_document() {
    let content = load_test_document("minimal.typ");
    // Loaded only to make sure the baseline fixture exists.
    let _baseline = load_pandoc_baseline("minimal");
    let config = ExtractionConfig::default();

    let result = extract_bytes(&content, "application/x-typst", &config)
        .await
        .expect("Extraction failed");

    assert!(
        !result.content.is_empty(),
        "Even minimal documents should extract some content."
    );
}

/// TEST 21: No directive pollution
///
/// Extracted content should not contain #set, #let, #import directives.
///
/// Expected: Clean extracted content without directives
/// Current behavior: May include directives
#[tokio::test]
async fn test_typst_no_directive_pollution() {
    let content = load_test_document("advanced.typ");
    let config = ExtractionConfig::default();

    let result = extract_bytes(&content, "application/x-typst", &config)
        .await
        .expect("Extraction failed");

    let directive_count = count_directive_lines(&result.content);
    assert_eq!(
        directive_count, 0,
        "Extracted content should not contain directives (#set, #let, etc). \
         Found {} directive lines polluting the output.",
        directive_count
    );
}

/// TEST 22: Metadata field completeness
///
/// All metadata fields from baseline should be present.
///
/// Expected: Title, author, date, keywords all extracted
/// Current behavior: Some fields missing
#[tokio::test]
async fn test_typst_metadata_field_completeness() {
    let content = load_test_document("advanced.typ");
    let config = ExtractionConfig::default();

    let result = extract_bytes(&content, "application/x-typst", &config)
        .await
        .expect("Extraction failed");

    // Presence only: unlike TEST 4, empty values are accepted here.
    let has_title = result.metadata.title.is_some();
    let has_author = result.metadata.authors.is_some();
    let has_date = result.metadata.created_at.is_some();

    assert!(
        has_title && has_author && has_date,
        "All metadata fields should be extracted. \
         Title: {}, Author: {}, Date: {}",
        has_title,
        has_author,
        has_date
    );
}

/// TEST 23: Special character handling
///
/// Unicode and special characters should be preserved.
///
/// Expected: Special characters like ü, é, etc. preserved
/// Current behavior: May be corrupted
#[tokio::test]
async fn test_typst_special_character_preservation() {
    let test_content = "Café with naïve français".as_bytes();
    let config = ExtractionConfig::default();

    let result = extract_bytes(test_content, "application/x-typst", &config)
        .await
        .expect("Extraction failed");

    // Any one of the accented words surviving is enough for this check.
    let has_special_chars = result.content.contains("Café")
        || result.content.contains("naïve")
        || result.content.contains("français");
    assert!(
        has_special_chars,
        "Special characters should be preserved in extraction."
    );
}

/// TEST 24: Very long heading handling
///
/// Long headings should not cause truncation or corruption.
/// /// Expected: Full heading text preserved regardless of length /// Current behavior: May truncate #[tokio::test] async fn test_typst_long_heading_handling() { let test_content = b"= This is a very long heading that should be completely preserved without any truncation or corruption whatsoever"; let config = ExtractionConfig::default(); let result = extract_bytes(test_content, "application/x-typst", &config) .await .expect("Extraction failed"); let has_heading_start = result.content.contains("very long heading"); assert!(has_heading_start, "Long headings should not be truncated."); } /// TEST 25: Edge case - Empty heading recovery /// /// Even if a heading has no text, extraction should be robust. /// /// Expected: Graceful handling without crashes /// Current behavior: May panic or produce empty output #[tokio::test] async fn test_typst_empty_heading_edge_case() { let test_content = b"= \n\n== \nContent here"; let config = ExtractionConfig::default(); let result = extract_bytes(test_content, "application/x-typst", &config).await; match result { Ok(extraction) => { assert!( extraction.content.contains("Content"), "Should extract regular content even if some headings are empty." ); } Err(_) => {} } } /// TEST 26: Regression - Basic heading extraction #[tokio::test] async fn test_typst_basic_heading_regression() { let test_content = b"= Main Heading\n\nContent here"; let config = ExtractionConfig::default(); let result = extract_bytes(test_content, "application/x-typst", &config) .await .expect("Extraction failed"); assert!( result.content.contains("= Main Heading"), "Basic level-1 heading should be extracted." 
); assert!(result.content.contains("Content"), "Content should be extracted."); } /// TEST 27: Regression - Level 2 heading extraction #[tokio::test] async fn test_typst_level2_heading_regression() { let test_content = b"= Main\n\n== Subsection\n\nMore content"; let config = ExtractionConfig::default(); let result = extract_bytes(test_content, "application/x-typst", &config) .await .expect("Extraction failed"); assert!( result.content.contains("== Subsection"), "Level 2 headings must be extracted." ); } /// TEST 28: Regression - Basic metadata #[tokio::test] async fn test_typst_basic_metadata_regression() { let test_content = b"#set document(title: \"Test\", author: \"Me\")\n\n= Heading"; let config = ExtractionConfig::default(); let result = extract_bytes(test_content, "application/x-typst", &config) .await .expect("Extraction failed"); assert!(result.metadata.title.is_some(), "Title metadata must be extracted."); assert!(result.metadata.authors.is_some(), "Author metadata must be extracted."); } /// TEST 29: Regression - Bold formatting #[tokio::test] async fn test_typst_bold_regression() { let test_content = b"This is *bold text* here"; let config = ExtractionConfig::default(); let result = extract_bytes(test_content, "application/x-typst", &config) .await .expect("Extraction failed"); assert!( result.content.contains("*bold*") || result.content.contains("bold"), "Bold text should be preserved." ); } /// TEST 30: Regression - Inline code /// /// The Typst extractor strips inline-code backticks from the rendered text /// and stores the spans as `code` annotations on the InternalDocument. This /// preserves the *information* (which words are code) without polluting the /// plain-text output with format markers — same approach as our other format /// extractors. This test asserts the content survives round-trip. 
#[tokio::test] async fn test_typst_inline_code_regression() { let test_content = b"Use `println!(\"hello\")` in Rust"; let config = ExtractionConfig::default(); let result = extract_bytes(test_content, "application/x-typst", &config) .await .expect("Extraction failed"); assert!(result.content.contains("println"), "inline code content lost"); assert!( result.content.contains("Use") && result.content.contains("Rust"), "surrounding text lost" ); } /// TEST 31: Regression - Code blocks /// /// Code-block content survives extraction. The triple-backtick fence and /// language tag are tracked as a `Code` element with `language` attribute on /// the InternalDocument; downstream renderers (markdown, djot, html) emit the /// fence, plain-text omits it. This test asserts the program text round-trips. #[tokio::test] async fn test_typst_codeblock_regression() { let test_content = b"```rust\nfn main() {}\n```"; let config = ExtractionConfig::default(); let result = extract_bytes(test_content, "application/x-typst", &config) .await .expect("Extraction failed"); assert!(result.content.contains("fn main"), "code block content lost"); } /// TEST 32: Regression - List extraction #[tokio::test] async fn test_typst_list_regression() { let test_content = b"- Item 1\n+ Item 2\n- Item 3"; let config = ExtractionConfig::default(); let result = extract_bytes(test_content, "application/x-typst", &config) .await .expect("Extraction failed"); assert!( result.content.contains("Item 1") && result.content.contains("Item 2") && result.content.contains("Item 3"), "All list items should be extracted." 
); }

/// TEST 33: Regression - inline math survives extraction
///
/// Requires a `$` in the output together with at least one symbol
/// (`mc` or `E`) taken from the formula.
#[tokio::test]
async fn test_typst_math_regression() {
    let source = b"Formula: $E = mc^2$";
    let cfg = ExtractionConfig::default();

    let extraction = extract_bytes(source, "application/x-typst", &cfg)
        .await
        .expect("Extraction failed");
    let text = &extraction.content;

    let keeps_delimiter = text.contains("$");
    let keeps_symbol = text.contains("mc") || text.contains("E");
    assert!(keeps_delimiter && keeps_symbol, "Math formulas should be preserved.");
}

/// TEST 34: Regression - hyperlinks keep their label or their target
#[tokio::test]
async fn test_typst_link_regression() {
    let source = b"Visit #link(\"https://example.com\")[example]";
    let cfg = ExtractionConfig::default();

    let extraction = extract_bytes(source, "application/x-typst", &cfg)
        .await
        .expect("Extraction failed");

    // Either the visible label or the URL counts as evidence that the link survived.
    let link_survived = ["example", "example.com"]
        .iter()
        .any(|&needle| extraction.content.contains(needle));
    assert!(link_survived, "Link text or URL should be preserved.");
}

/// TEST 35: Regression - basic table content is extracted
#[tokio::test]
async fn test_typst_table_regression() {
    let source = b"#table(columns: 2, [A], [B], [1], [2])";
    let cfg = ExtractionConfig::default();

    let extraction = extract_bytes(source, "application/x-typst", &cfg)
        .await
        .expect("Extraction failed");
    let text = &extraction.content;

    // Accept either a cell value or a literal "TABLE" marker in the output.
    let has_cell = text.contains("A");
    let has_marker = text.contains("TABLE");
    assert!(has_cell || has_marker, "Table content should be extracted.");
}

/// TEST 36: Large document handling
///
/// Builds fifty level-1 sections in memory and requires that at least
/// forty of their headings appear in the extracted content.
#[tokio::test]
async fn test_typst_large_document_stress() {
    // Each section is one heading line followed by one paragraph.
    let large_content: String = (1..=50)
        .flat_map(|i| {
            [
                format!("= Heading {}\n\n", i),
                format!("Content for section {}.\n\n", i),
            ]
        })
        .collect();
    let cfg = ExtractionConfig::default();

    let extraction = extract_bytes(large_content.as_bytes(), "application/x-typst", &cfg)
        .await
        .expect("Extraction failed");

    let heading_count = extract_all_headings(&extraction.content).len();
    assert!(
        heading_count >= 40,
        "Large documents should extract all headings. \nFound {} of 50.",
        heading_count
    );
}

/// TEST 37: Deep nesting stress test
///
/// Emits one heading per level (1 through 6) and checks that every level
/// is found at least once in the extracted content.
#[tokio::test]
async fn test_typst_deep_nesting_stress() {
    let nested: String = (1..=6)
        .flat_map(|depth| {
            [
                format!("{} Level {} Heading\n\n", "=".repeat(depth), depth),
                format!("Content at level {}.\n\n", depth),
            ]
        })
        .collect();
    let cfg = ExtractionConfig::default();

    let extraction = extract_bytes(nested.as_bytes(), "application/x-typst", &cfg)
        .await
        .expect("Extraction failed");

    for level in 1..=6 {
        let level_found = count_heading_level(&extraction.content, level) >= 1;
        assert!(
            level_found,
            "Level {} heading should be extracted in deep nesting test.",
            level
        );
    }
}

/// TEST 38: Mixed formatting stress
///
/// For each inline construct, either its markup character or the word it
/// wraps must be present in the output.
#[tokio::test]
async fn test_typst_mixed_formatting_stress() {
    let source = b"This text has *bold*, _italic_, `code`, and $math$ all mixed together!";
    let cfg = ExtractionConfig::default();

    let extraction = extract_bytes(source, "application/x-typst", &cfg)
        .await
        .expect("Extraction failed");
    let text = &extraction.content;

    // (markup character, wrapped word) — one of the two has to survive per pair.
    let marker_or_word = [("*", "bold"), ("_", "italic"), ("`", "code"), ("$", "math")];
    let has_formatting = marker_or_word
        .iter()
        .all(|&(marker, word)| text.contains(marker) || text.contains(word));

    assert!(has_formatting, "All mixed formatting should be preserved.");
}

/// TEST 39: Unicode stress test
///
/// Feeds CJK, Arabic, emoji and Greek text through the extractor. Only the
/// ASCII word `Unicode` from the heading is asserted on.
#[tokio::test]
async fn test_typst_unicode_stress() {
    let source = "= Unicode Heading 中文 العربية\n\nContent with emojis: 🎉🚀💯\n\nGreek: α β γ δ ε ζ";
    let cfg = ExtractionConfig::default();

    let extraction = extract_bytes(source.as_bytes(), "application/x-typst", &cfg)
        .await
        .expect("Extraction failed");

    let heading_word_present = extraction.content.contains("Unicode");
    assert!(heading_word_present, "Unicode content should be preserved.");
}

/// TEST 40: Pathological whitespace
///
/// Long runs of blank lines must not prevent the heading or the body text
/// from being extracted.
#[tokio::test]
async fn test_typst_pathological_whitespace() {
    let source = b"= Heading\n\n\n\n\n\nContent with excessive blank lines\n\n\n\n\nMore content";
    let cfg = ExtractionConfig::default();

    let extraction = extract_bytes(source, "application/x-typst", &cfg)
        .await
        .expect("Extraction failed");

    let mentions = |needle: &str| extraction.content.contains(needle);
    assert!(
        mentions("Heading") && mentions("Content"),
        "Should extract content even with excessive whitespace."
    );
}

/// TEST 41: Full document comparison - simple.typ
///
/// Checks that extraction yields more than 50 bytes of content and more
/// than two heading lines.
#[tokio::test]
async fn test_typst_full_simple_document_comparison() {
    let raw = load_test_document("simple.typ");
    // Loaded so that a missing baseline file fails the test; it is not compared here.
    let _baseline = load_pandoc_baseline("simple");
    let cfg = ExtractionConfig::default();

    let extraction = extract_bytes(&raw, "application/x-typst", &cfg)
        .await
        .expect("Extraction failed");
    let text = &extraction.content;

    let extracted_len = text.len();
    assert!(extracted_len > 50, "simple.typ should extract substantial content");

    let headings = extract_all_headings(text);
    assert!(headings.len() > 2, "simple.typ should have multiple sections");
}

/// TEST 42: Full document comparison - advanced.typ
///
/// Checks that extraction yields more than 100 bytes of content and at
/// least five heading lines.
#[tokio::test]
async fn test_typst_full_advanced_document_comparison() {
    let raw = load_test_document("advanced.typ");
    // Loaded so that a missing baseline file fails the test; it is not compared here.
    let _baseline = load_pandoc_baseline("advanced");
    let cfg = ExtractionConfig::default();

    let extraction = extract_bytes(&raw, "application/x-typst", &cfg)
        .await
        .expect("Extraction failed");
    let text = &extraction.content;

    let extracted_len = text.len();
    assert!(extracted_len > 100, "advanced.typ should extract comprehensive content");

    let headings = extract_all_headings(text);
    assert!(headings.len() >= 5, "advanced.typ should preserve heading structure");
}

/// TEST 43: MIME type consistency
///
/// The extractor should support both standard MIME types for Typst.
/// Currently only supports application/x-typst, not text/x-typst.
#[tokio::test]
async fn test_typst_mime_type_consistency() {
    let raw = load_test_document("simple.typ");
    let cfg = ExtractionConfig::default();

    // The primary MIME type is mandatory: a failure here fails the test.
    let primary = extract_bytes(&raw, "application/x-typst", &cfg)
        .await
        .expect("Primary MIME type should work");
    assert!(
        !primary.content.is_empty(),
        "Primary MIME type should extract content"
    );

    // The alternative MIME type is optional: an error is reported on stdout
    // and the test still passes.
    if let Ok(alternative) = extract_bytes(&raw, "text/x-typst", &cfg).await {
        assert!(
            !alternative.content.is_empty(),
            "Alternative MIME type should extract content if supported"
        );
    } else {
        println!("Note: text/x-typst is not currently supported (may be added in future)");
    }
}

/// TEST 44: Extraction with the default configuration
///
/// Runs `simple.typ` through the extractor using `ExtractionConfig::default()`
/// and checks that content comes back and that the reported MIME type equals
/// the one that was requested.
#[tokio::test]
async fn test_typst_config_parameter_handling() {
    let raw = load_test_document("simple.typ");
    let cfg = ExtractionConfig::default();

    let extraction = extract_bytes(&raw, "application/x-typst", &cfg)
        .await
        .expect("Extraction failed");

    let produced_text = !extraction.content.is_empty();
    assert!(produced_text, "Extraction with default config should work");
    assert_eq!(
        extraction.mime_type, "application/x-typst",
        "MIME type should be preserved"
    );
}

/// TEST 45: Comparative heading analysis
///
/// This final comprehensive test checks heading extraction
/// against the baseline to identify the exact scope of the heading loss bug.
#[tokio::test] async fn test_typst_heading_loss_bug_analysis() { let content = load_test_document("headings.typ"); let baseline = load_pandoc_baseline("headings"); let config = ExtractionConfig::default(); let result = extract_bytes(&content, "application/x-typst", &config) .await .expect("Extraction failed"); println!("\n===== HEADING EXTRACTION ANALYSIS ====="); println!("Baseline content:"); println!("{}", baseline); println!("\nExtracted content:"); println!("{}", result.content); let extracted_headings = extract_all_headings(&result.content); println!("\nExtracted headings: {}", extracted_headings.len()); for (i, h) in extracted_headings.iter().enumerate() { println!(" {}: {}", i + 1, h); } assert!( extracted_headings.len() >= 6, "BUG CONFIRMED: Heading loss detected. \ Expected 6 headings (1-6 levels), found {}. \ This is the 62% heading loss bug - only single '=' is matched, \ all '==' and higher are skipped entirely.", extracted_headings.len() ); }