crates/kreuzberg/tests/typst_behavioral_tests.rs

#![allow(clippy::len_zero, clippy::unnecessary_get_then_check, clippy::single_match)]
#![cfg(feature = "office")]
//! Comprehensive behavioral tests for Typst extractor against Pandoc baselines.
//!
//! These tests expose the critical bugs found in code review:
//! 1. 62% heading loss bug - only matches single `=` headings
//! 2. Blockquotes not implemented
//! 3. Display math not extracted
//! 4. Nested table brackets cause corruption
//! 5. Empty headings output (just `= ` with no text)
//! 6. Regex failures silently lose metadata
//!
//! The tests are designed to FAIL initially, exposing real bugs that need fixing.
//! They compare extracted output against Pandoc baseline outputs for behavioral parity.

use kreuzberg::core::config::ExtractionConfig;
use kreuzberg::core::extractor::extract_bytes;
use std::{fs, path::PathBuf};

fn typst_doc_root() -> PathBuf {
    PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("../../test_documents/typst")
}

/// Load a test document from the test_documents/typst directory
fn load_test_document(filename: &str) -> Vec<u8> {
    let path = typst_doc_root().join(filename);
    fs::read(&path).unwrap_or_else(|_| panic!("Failed to read test document: {}", filename))
}

/// Load Pandoc baseline output for comparison
fn load_pandoc_baseline(filename_base: &str) -> String {
    let path = typst_doc_root().join(format!("{filename_base}_pandoc_baseline.txt"));
    fs::read_to_string(&path).unwrap_or_else(|_| panic!("Failed to read baseline: {}", filename_base))
}

/// Load Pandoc metadata JSON for comparison
fn load_pandoc_metadata(filename_base: &str) -> String {
    let path = typst_doc_root().join(format!("{filename_base}_pandoc_meta.json"));
    fs::read_to_string(&path).unwrap_or_else(|_| panic!("Failed to read metadata: {}", filename_base))
}

/// Count specific heading levels (= for level 1, == for level 2, etc.)
fn count_heading_level(content: &str, level: usize) -> usize {
    let exact_marker = format!("{} ", "=".repeat(level));
    content
        .lines()
        .filter(|l| l.trim_start().starts_with(&exact_marker))
        .count()
}

/// Extract all headings from content
fn extract_all_headings(content: &str) -> Vec<String> {
    content
        .lines()
        .filter(|l| {
            let trimmed = l.trim_start();
            trimmed.starts_with('=') && !trimmed.starts_with("#set")
        })
        .map(|l| l.to_string())
        .collect()
}

/// Count lines that are pure metadata/directives (not content)
fn count_directive_lines(content: &str) -> usize {
    content
        .lines()
        .filter(|l| {
            let t = l.trim();
            t.starts_with("#set ") || t.starts_with("#let ") || t.starts_with("#import ")
        })
        .count()
}

/// Count empty headings (headings with just `= ` and no text)
fn count_empty_headings(content: &str) -> usize {
    content
        .lines()
        .filter(|l| {
            let trimmed = l.trim_start();
            trimmed == "="
                || trimmed == "=="
                || trimmed == "==="
                || trimmed == "===="
                || trimmed == "====="
                || trimmed == "======"
        })
        .count()
}

/// Extract all text between headings (content blocks)
fn extract_content_blocks(content: &str) -> Vec<String> {
    let mut blocks = Vec::new();
    let mut current_block = String::new();
    let mut in_block = false;

    for line in content.lines() {
        let trimmed = line.trim_start();
        if trimmed.starts_with('=') && !trimmed.starts_with("#set") {
            if !current_block.is_empty() {
                blocks.push(current_block.trim().to_string());
                current_block.clear();
            }
            in_block = true;
        } else if in_block && !trimmed.is_empty() {
            current_block.push_str(line);
            current_block.push('\n');
        }
    }

    if !current_block.is_empty() {
        blocks.push(current_block.trim().to_string());
    }

    blocks
}

/// Check if content has reasonable parity with baseline (within tolerance)
fn content_parity_check(extracted: &str, baseline: &str, tolerance_percent: f64) -> bool {
    let extracted_len = extracted.len();
    let baseline_len = baseline.len();

    if baseline_len == 0 {
        return extracted_len == 0;
    }

    let ratio = (extracted_len as f64) / (baseline_len as f64);
    let acceptable_min = 1.0 - (tolerance_percent / 100.0);
    let acceptable_max = 1.0 + (tolerance_percent / 100.0);

    ratio >= acceptable_min && ratio <= acceptable_max
}

// CRITICAL BUG TESTS - These expose the 45+ issues

/// TEST 1: CRITICAL - 62% heading loss bug
///
/// The extractor only matches single `=` headings, completely skipping
/// `==`, `===`, and higher levels. This causes catastrophic data loss
/// in hierarchical documents.
///
/// Expected: All heading levels should be extracted
/// Current behavior: Only level 1 headings extracted
/// WILL FAIL: Exposing the heading loss bug
#[tokio::test]
async fn test_typst_all_heading_levels_not_lost() {
    let content = load_test_document("headings.typ");
    let _baseline = load_pandoc_baseline("headings");
    let config = ExtractionConfig::default();

    let result = extract_bytes(&content, "application/x-typst", &config)
        .await
        .expect("Extraction failed");

    let extracted_all_headings = extract_all_headings(&result.content);

    assert!(
        extracted_all_headings.len() >= 6,
        "CRITICAL BUG: Only extracted {} headings, should have extracted 6+ heading levels. \
         This is the 62% heading loss bug - extractor only matches '=' but skips '==', '===', etc.",
        extracted_all_headings.len()
    );

    for level in 1..=6 {
        let count = count_heading_level(&result.content, level);
        assert_eq!(
            count, 1,
            "Heading level {} should appear exactly once (found {}). \
             Missing heading levels cause data loss in hierarchical documents.",
            level, count
        );
    }
}

/// TEST 2: Display math not extracted
///
/// Display math ($$...$$) is completely lost from extraction,
/// breaking mathematical content preservation.
///
/// Expected: Display math should be preserved in output
/// Current behavior: Silently dropped
/// WILL FAIL: Exposing display math loss
#[tokio::test]
async fn test_typst_display_math_preserved() {
    let content = load_test_document("advanced.typ");
    let baseline = load_pandoc_baseline("advanced");
    let config = ExtractionConfig::default();

    let result = extract_bytes(&content, "application/x-typst", &config)
        .await
        .expect("Extraction failed");

    let has_display_math_in_baseline =
        baseline.contains("²") || baseline.contains("Display math") || baseline.contains("x^2");

    if has_display_math_in_baseline {
        let our_has_math = result.content.contains("$")
            || result.content.contains("Display")
            || result.content.contains("²")
            || result.content.contains("²");

        assert!(
            our_has_math,
            "Display math should be extracted. Pandoc preserves mathematical notation, \
             but extractor drops it entirely. This breaks scientific/academic documents."
        );
    }

    let has_pythagorean = result.content.contains("^2")
        || result.content.contains("²")
        || result.content.contains("x") && result.content.contains("y") && result.content.contains("r");

    assert!(
        has_pythagorean,
        "Pythagorean theorem expression should be present. Display math is being dropped."
    );
}

/// TEST 3: Empty headings output
///
/// When heading text is missing or malformed, extractor outputs
/// just the marker like "= " with no text, polluting the output.
///
/// Expected: Either full heading text or no heading at all
/// Current behavior: "= " with no content
/// WILL FAIL: Exposing empty heading bug
#[tokio::test]
async fn test_typst_no_empty_headings_output() {
    let content = load_test_document("headings.typ");
    let config = ExtractionConfig::default();

    let result = extract_bytes(&content, "application/x-typst", &config)
        .await
        .expect("Extraction failed");

    let empty_headings = count_empty_headings(&result.content);

    assert_eq!(
        empty_headings, 0,
        "Found {} empty heading lines (just '=' with no text). \
         Extractor outputs malformed headings like '= ' with no text, \
         corrupting the document structure.",
        empty_headings
    );

    for heading in extract_all_headings(&result.content) {
        let trimmed = heading.trim_start();
        let after_marker = trimmed.trim_start_matches('=').trim();
        assert!(
            !after_marker.is_empty(),
            "Heading '{}' has no text after marker. Should not output empty headings.",
            trimmed
        );
    }
}

/// TEST 4: Metadata extraction fails with regex silently
///
/// When regex patterns fail to match metadata fields,
/// the extractor silently returns None instead of logging/failing,
/// causing complete metadata loss for certain formats.
///
/// Expected: All metadata fields should be extracted
/// Current behavior: Some formats fail silently
/// WILL FAIL: Exposing metadata loss
#[tokio::test]
async fn test_typst_metadata_extraction_completeness() {
    let content = load_test_document("metadata.typ");
    let _baseline_meta = load_pandoc_metadata("metadata");
    let config = ExtractionConfig::default();

    let result = extract_bytes(&content, "application/x-typst", &config)
        .await
        .expect("Extraction failed");

    let has_title = result.metadata.title.as_ref().map(|t| !t.is_empty()).unwrap_or(false);
    let has_author = result.metadata.authors.as_ref().map(|a| !a.is_empty()).unwrap_or(false);
    let has_keywords = result
        .metadata
        .keywords
        .as_ref()
        .map(|k| !k.is_empty())
        .unwrap_or(false);

    assert!(
        has_title,
        "Title metadata should be extracted. Regex pattern matching fails silently \
         and metadata is lost with no error reporting."
    );

    assert!(
        has_author,
        "Author metadata should be extracted. Some metadata formats fail silently."
    );

    assert!(
        has_keywords,
        "Keywords should be extracted. Regex failures cause silent data loss."
    );
}

/// TEST 5: Nested table brackets cause corruption
///
/// Tables with nested brackets like [Name [full]] corrupt the
/// table content extraction because bracket counting is naive.
///
/// Expected: Table cells should be extracted correctly even with nesting
/// Current behavior: Bracket nesting causes cells to be malformed
/// WILL FAIL: Exposing table corruption bug
#[tokio::test]
async fn test_typst_tables_with_nested_brackets_not_corrupted() {
    let content = load_test_document("advanced.typ");
    let baseline = load_pandoc_baseline("advanced");
    let config = ExtractionConfig::default();

    let result = extract_bytes(&content, "application/x-typst", &config)
        .await
        .expect("Extraction failed");

    let has_table_in_baseline = baseline.contains("Name") && baseline.contains("Alice");

    if has_table_in_baseline {
        let table_content_extracted =
            result.content.contains("Name") && result.content.contains("Alice") && result.content.contains("Age");

        assert!(
            table_content_extracted,
            "Table content should be extracted correctly. Nested brackets cause corruption \
             and table cells are malformed."
        );

        let corrupted_brackets = result.content.matches("[[").count();
        assert_eq!(
            corrupted_brackets, 0,
            "Found corrupted bracket sequences [[. Table extraction with nested brackets \
             produces malformed output."
        );
    }
}

/// TEST 6: Content volume parity - within tolerance of Pandoc
///
/// Our extractor should extract roughly the same amount of content
/// as Pandoc (baseline). Large discrepancies indicate data loss or
/// noise injection.
///
/// Expected: Within reasonable tolerance of baseline content size
/// Current behavior: Significant data loss on complex documents (e.g., advanced.typ)
/// WILL FAIL: Exposing data loss on complex documents with formatting
#[tokio::test]
async fn test_typst_content_volume_parity_with_pandoc() {
    let documents = vec![("simple", 30.0), ("headings", 20.0)];

    for (doc_name, tolerance) in documents {
        let content = load_test_document(&format!("{}.typ", doc_name));
        let baseline = load_pandoc_baseline(doc_name);
        let config = ExtractionConfig::default();

        let result = extract_bytes(&content, "application/x-typst", &config)
            .await
            .unwrap_or_else(|_| panic!("Extraction failed for {}", doc_name));

        let baseline_size = baseline.len();
        let extracted_size = result.content.len();

        let is_within_tolerance = content_parity_check(&result.content, &baseline, tolerance);

        assert!(
            is_within_tolerance,
            "Content volume parity failed for {}: \
             Baseline: {} bytes, Extracted: {} bytes ({}% tolerance allowed). \
             Data loss indicates missing extraction features or formatting issues.",
            doc_name, baseline_size, extracted_size, tolerance
        );
    }
}

/// TEST 7: Blockquotes not implemented
///
/// Blockquotes (using > syntax in other formats, typst uses #quote)
/// are completely unimplemented, causing loss of semantic structure.
///
/// Expected: Blockquote content should be extracted
/// Current behavior: Feature not implemented
/// WILL FAIL: Exposing missing blockquote support
#[tokio::test]
async fn test_typst_blockquote_handling() {
    let test_content = b"#quote[
        This is a blockquote.
        It should be extracted.
    ]";

    let config = ExtractionConfig::default();
    let result = extract_bytes(test_content, "application/x-typst", &config)
        .await
        .expect("Extraction failed");

    let has_blockquote_content =
        result.content.contains("blockquote") || result.content.contains("This is a blockquote");

    assert!(
        has_blockquote_content,
        "Blockquote content should be extracted. Blockquotes are not implemented \
         in the extractor, causing complete loss of quoted content."
    );
}

/// TEST 8: Inline code preservation
///
/// Test that inline code blocks are properly extracted and marked.
/// This ensures code snippets aren't corrupted.
///
/// Expected: Inline code preserved with backticks or clearly marked
/// Current behavior: May be corrupted
/// WILL FAIL: If inline code is not preserved
#[tokio::test]
async fn test_typst_inline_code_preserved() {
    let content = load_test_document("advanced.typ");
    let baseline = load_pandoc_baseline("advanced");
    let config = ExtractionConfig::default();

    let result = extract_bytes(&content, "application/x-typst", &config)
        .await
        .expect("Extraction failed");

    let has_inline_code =
        result.content.contains("`") || (result.content.contains("code") && baseline.contains("`code`"));

    assert!(
        has_inline_code,
        "Inline code should be preserved with backticks or clearly marked."
    );
}

/// TEST 9: Inline math extraction
///
/// Inline math (single $ delimiters) should be extracted and preserved.
///
/// Expected: Inline math formulas preserved
/// Current behavior: May be dropped
/// WILL FAIL: If inline math is lost
#[tokio::test]
async fn test_typst_inline_math_preserved() {
    let content = load_test_document("advanced.typ");
    let baseline = load_pandoc_baseline("advanced");
    let config = ExtractionConfig::default();

    let result = extract_bytes(&content, "application/x-typst", &config)
        .await
        .expect("Extraction failed");

    let has_inline_math =
        result.content.contains("$") || result.content.contains("sqrt") || result.content.contains("equation");

    if baseline.contains("$") || baseline.contains("equation") {
        assert!(
            has_inline_math,
            "Inline math should be extracted. Mathematical formulas are being dropped."
        );
    }
}

/// TEST 10: Figures and captions
///
/// Figure extraction with captions should preserve both image references
/// and caption text.
///
/// Expected: Figure content and captions extracted
/// Current behavior: May be unimplemented
#[tokio::test]
async fn test_typst_figures_and_captions() {
    let test_content = b"#figure(
        image(\"example.png\"),
        caption: [This is a figure caption]
    )";

    let config = ExtractionConfig::default();
    let result = extract_bytes(test_content, "application/x-typst", &config)
        .await
        .expect("Extraction failed");

    let _has_caption = result.content.contains("caption") || result.content.contains("figure");

    println!(
        "Figure extraction result (feature may be unimplemented): {:?}",
        result.content
    );
}

/// TEST 11: Citation/reference handling
///
/// Citations and references should be extracted when present.
///
/// Expected: Citation markers and text preserved
/// Current behavior: May be dropped
#[tokio::test]
async fn test_typst_citations_preserved() {
    let test_content = b"Here is a citation @smith2020.

= References

#bibliography()";

    let config = ExtractionConfig::default();
    let result = extract_bytes(test_content, "application/x-typst", &config)
        .await
        .expect("Extraction failed");

    let _has_citation = result.content.contains("@smith2020")
        || result.content.contains("smith")
        || result.content.contains("References");

    println!("Citation handling (may be limited): {:?}", result.content);
}

/// TEST 12: Link extraction and formatting
///
/// Links should be extracted with both URL and link text.
///
/// Expected: Links in markdown format [text](url)
/// Current behavior: May lose URL or text
#[tokio::test]
async fn test_typst_link_extraction() {
    let content = load_test_document("advanced.typ");
    let _baseline = load_pandoc_baseline("advanced");
    let config = ExtractionConfig::default();

    let result = extract_bytes(&content, "application/x-typst", &config)
        .await
        .expect("Extraction failed");

    let has_link_content =
        result.content.contains("example") || result.content.contains("link") || result.content.contains("https");

    assert!(
        has_link_content,
        "Link content should be extracted. Links may be completely dropped."
    );
}

/// TEST 13: Unordered list extraction
///
/// Both + and - list markers should be converted to standard format.
///
/// Expected: All list items extracted and normalized
/// Current behavior: May lose some items
#[tokio::test]
async fn test_typst_list_extraction() {
    let content = load_test_document("simple.typ");
    let _baseline = load_pandoc_baseline("simple");
    let config = ExtractionConfig::default();

    let result = extract_bytes(&content, "application/x-typst", &config)
        .await
        .expect("Extraction failed");

    let has_list_markers = result.content.contains("-") || result.content.contains("+");
    let has_list_content =
        result.content.contains("First") || result.content.contains("Second") || result.content.contains("item");

    assert!(
        has_list_markers || has_list_content,
        "List items should be extracted with markers or content preserved."
    );
}

/// TEST 14: Code block extraction
///
/// Triple-backtick code blocks should be fully extracted with language specifiers.
///
/// Expected: Code blocks with language markers preserved
/// Current behavior: May be malformed
#[tokio::test]
async fn test_typst_code_block_extraction() {
    let content = load_test_document("advanced.typ");
    let _baseline = load_pandoc_baseline("advanced");
    let config = ExtractionConfig::default();

    let result = extract_bytes(&content, "application/x-typst", &config)
        .await
        .expect("Extraction failed");

    let has_code = result.content.contains("```")
        || result.content.contains("def")
        || result.content.contains("fibonacci")
        || result.content.contains("python");

    assert!(has_code, "Code blocks should be extracted with language specifiers.");
}

/// TEST 15: Bold and italic formatting
///
/// Inline emphasis formatting should be preserved or normalized.
///
/// Expected: Bold (*text*) and italic (_text_) markers present
/// Current behavior: May be lost
#[tokio::test]
async fn test_typst_emphasis_formatting() {
    let content = load_test_document("advanced.typ");
    let config = ExtractionConfig::default();

    let result = extract_bytes(&content, "application/x-typst", &config)
        .await
        .expect("Extraction failed");

    let has_emphasis = result.content.contains("*") && result.content.contains("_");

    assert!(has_emphasis, "Bold and italic formatting markers should be preserved.");
}

/// TEST 16: Complex nested formatting
///
/// Test handling of *_nested formatting_* combinations.
///
/// Expected: Nested formatting preserved or flattened consistently
/// Current behavior: May be malformed
#[tokio::test]
async fn test_typst_nested_formatting() {
    let test_content = b"This is *bold with _nested italic_* text.";

    let config = ExtractionConfig::default();
    let result = extract_bytes(test_content, "application/x-typst", &config)
        .await
        .expect("Extraction failed");

    let has_formatting = result.content.contains("*")
        || result.content.contains("_")
        || (result.content.contains("bold") && result.content.contains("italic"));

    assert!(
        has_formatting,
        "Nested formatting should be preserved or flattened consistently."
    );
}

/// TEST 17: Multiple paragraph handling
///
/// Multiple paragraphs separated by blank lines should be preserved.
///
/// Expected: Paragraph structure maintained
/// Current behavior: May merge or lose paragraphs
#[tokio::test]
async fn test_typst_multiple_paragraphs() {
    let content = load_test_document("advanced.typ");
    let _baseline = load_pandoc_baseline("advanced");
    let config = ExtractionConfig::default();

    let result = extract_bytes(&content, "application/x-typst", &config)
        .await
        .expect("Extraction failed");

    let non_empty_lines: Vec<_> = result.content.lines().filter(|l| !l.trim().is_empty()).collect();

    assert!(
        non_empty_lines.len() >= 5,
        "Multiple paragraphs should be preserved. Found {} content lines.",
        non_empty_lines.len()
    );
}

/// TEST 18: Heading-content association
///
/// Content should follow its heading logically in the output.
///
/// Expected: Each heading followed by its content
/// Current behavior: May be scrambled
#[tokio::test]
async fn test_typst_heading_content_association() {
    let content = load_test_document("advanced.typ");
    let config = ExtractionConfig::default();

    let result = extract_bytes(&content, "application/x-typst", &config)
        .await
        .expect("Extraction failed");

    let blocks = extract_content_blocks(&result.content);

    assert!(blocks.len() > 0, "Content blocks should be associated with headings.");

    for block in &blocks {
        assert!(block.len() > 0, "Content blocks should not be empty.");
    }
}

/// TEST 19: Whitespace normalization
///
/// Multiple blank lines should be normalized consistently.
///
/// Expected: Single blank lines between sections
/// Current behavior: May have excessive whitespace
#[tokio::test]
async fn test_typst_whitespace_handling() {
    let content = load_test_document("advanced.typ");
    let config = ExtractionConfig::default();

    let result = extract_bytes(&content, "application/x-typst", &config)
        .await
        .expect("Extraction failed");

    let blank_line_runs: Vec<_> = result.content.split("\n\n\n").collect();

    assert!(
        blank_line_runs.len() <= 2,
        "Should not have excessive blank lines (triple newlines). \
         Found {} instances of triple newlines.",
        blank_line_runs.len() - 1
    );
}

/// TEST 20: Minimal document handling
///
/// Even minimal documents should extract correctly.
///
/// Expected: Basic content and structure
/// Current behavior: May fail or lose content
#[tokio::test]
async fn test_typst_minimal_document() {
    let content = load_test_document("minimal.typ");
    let _baseline = load_pandoc_baseline("minimal");
    let config = ExtractionConfig::default();

    let result = extract_bytes(&content, "application/x-typst", &config)
        .await
        .expect("Extraction failed");

    assert!(
        !result.content.is_empty(),
        "Even minimal documents should extract some content."
    );

    assert!(
        result.content.len() > 0,
        "Minimal document should produce non-empty output."
    );
}

/// TEST 21: No directive pollution
///
/// Extracted content should not contain #set, #let, #import directives.
///
/// Expected: Clean extracted content without directives
/// Current behavior: May include directives
#[tokio::test]
async fn test_typst_no_directive_pollution() {
    let content = load_test_document("advanced.typ");
    let config = ExtractionConfig::default();

    let result = extract_bytes(&content, "application/x-typst", &config)
        .await
        .expect("Extraction failed");

    let directive_count = count_directive_lines(&result.content);

    assert_eq!(
        directive_count, 0,
        "Extracted content should not contain directives (#set, #let, etc). \
         Found {} directive lines polluting the output.",
        directive_count
    );
}

/// TEST 22: Metadata field completeness
///
/// All metadata fields from baseline should be present.
///
/// Expected: Title, author, date, keywords all extracted
/// Current behavior: Some fields missing
#[tokio::test]
async fn test_typst_metadata_field_completeness() {
    let content = load_test_document("advanced.typ");
    let config = ExtractionConfig::default();

    let result = extract_bytes(&content, "application/x-typst", &config)
        .await
        .expect("Extraction failed");

    let has_title = result.metadata.title.is_some();
    let has_author = result.metadata.authors.is_some();
    let has_date = result.metadata.created_at.is_some();

    assert!(
        has_title && has_author && has_date,
        "All metadata fields should be extracted. \
         Title: {}, Author: {}, Date: {}",
        has_title,
        has_author,
        has_date
    );
}

/// TEST 23: Special character handling
///
/// Unicode and special characters should be preserved.
///
/// Expected: Special characters like ü, é, etc. preserved
/// Current behavior: May be corrupted
#[tokio::test]
async fn test_typst_special_character_preservation() {
    let test_content = "Café with naïve français".as_bytes();

    let config = ExtractionConfig::default();
    let result = extract_bytes(test_content, "application/x-typst", &config)
        .await
        .expect("Extraction failed");

    let has_special_chars =
        result.content.contains("Café") || result.content.contains("naïve") || result.content.contains("français");

    assert!(
        has_special_chars,
        "Special characters should be preserved in extraction."
    );
}

/// TEST 24: Very long heading handling
///
/// Long headings should not cause truncation or corruption.
///
/// Expected: Full heading text preserved regardless of length
/// Current behavior: May truncate
#[tokio::test]
async fn test_typst_long_heading_handling() {
    let test_content = b"= This is a very long heading that should be completely preserved without any truncation or corruption whatsoever";

    let config = ExtractionConfig::default();
    let result = extract_bytes(test_content, "application/x-typst", &config)
        .await
        .expect("Extraction failed");

    let has_heading_start = result.content.contains("very long heading");

    assert!(has_heading_start, "Long headings should not be truncated.");
}

/// TEST 25: Edge case - Empty heading recovery
///
/// Even if a heading has no text, extraction should be robust.
///
/// Expected: Graceful handling without crashes
/// Current behavior: May panic or produce empty output
#[tokio::test]
async fn test_typst_empty_heading_edge_case() {
    let test_content = b"= \n\n== \nContent here";

    let config = ExtractionConfig::default();
    let result = extract_bytes(test_content, "application/x-typst", &config).await;

    match result {
        Ok(extraction) => {
            assert!(
                extraction.content.contains("Content"),
                "Should extract regular content even if some headings are empty."
            );
        }
        Err(_) => {}
    }
}

/// TEST 26: Regression - Basic heading extraction
#[tokio::test]
async fn test_typst_basic_heading_regression() {
    let test_content = b"= Main Heading\n\nContent here";

    let config = ExtractionConfig::default();
    let result = extract_bytes(test_content, "application/x-typst", &config)
        .await
        .expect("Extraction failed");

    assert!(
        result.content.contains("= Main Heading"),
        "Basic level-1 heading should be extracted."
    );

    assert!(result.content.contains("Content"), "Content should be extracted.");
}

/// TEST 27: Regression - Level 2 heading extraction
#[tokio::test]
async fn test_typst_level2_heading_regression() {
    let test_content = b"= Main\n\n== Subsection\n\nMore content";

    let config = ExtractionConfig::default();
    let result = extract_bytes(test_content, "application/x-typst", &config)
        .await
        .expect("Extraction failed");

    assert!(
        result.content.contains("== Subsection"),
        "Level 2 headings must be extracted."
    );
}

/// TEST 28: Regression - Basic metadata
#[tokio::test]
async fn test_typst_basic_metadata_regression() {
    let test_content = b"#set document(title: \"Test\", author: \"Me\")\n\n= Heading";

    let config = ExtractionConfig::default();
    let result = extract_bytes(test_content, "application/x-typst", &config)
        .await
        .expect("Extraction failed");

    assert!(result.metadata.title.is_some(), "Title metadata must be extracted.");

    assert!(result.metadata.authors.is_some(), "Author metadata must be extracted.");
}

/// TEST 29: Regression - Bold formatting
#[tokio::test]
async fn test_typst_bold_regression() {
    let test_content = b"This is *bold text* here";

    let config = ExtractionConfig::default();
    let result = extract_bytes(test_content, "application/x-typst", &config)
        .await
        .expect("Extraction failed");

    assert!(
        result.content.contains("*bold*") || result.content.contains("bold"),
        "Bold text should be preserved."
    );
}

/// TEST 30: Regression - Inline code
///
/// The Typst extractor strips inline-code backticks from the rendered text
/// and stores the spans as `code` annotations on the InternalDocument. This
/// preserves the *information* (which words are code) without polluting the
/// plain-text output with format markers — same approach as our other format
/// extractors. This test asserts the content survives round-trip.
#[tokio::test]
async fn test_typst_inline_code_regression() {
    let test_content = b"Use `println!(\"hello\")` in Rust";

    let config = ExtractionConfig::default();
    let result = extract_bytes(test_content, "application/x-typst", &config)
        .await
        .expect("Extraction failed");

    assert!(result.content.contains("println"), "inline code content lost");
    assert!(
        result.content.contains("Use") && result.content.contains("Rust"),
        "surrounding text lost"
    );
}

/// TEST 31: Regression - Code blocks
///
/// Code-block content survives extraction. The triple-backtick fence and
/// language tag are tracked as a `Code` element with `language` attribute on
/// the InternalDocument; downstream renderers (markdown, djot, html) emit the
/// fence, plain-text omits it. This test asserts the program text round-trips.
#[tokio::test]
async fn test_typst_codeblock_regression() {
    let test_content = b"```rust\nfn main() {}\n```";

    let config = ExtractionConfig::default();
    let result = extract_bytes(test_content, "application/x-typst", &config)
        .await
        .expect("Extraction failed");

    assert!(result.content.contains("fn main"), "code block content lost");
}

/// TEST 32: Regression - List extraction
#[tokio::test]
async fn test_typst_list_regression() {
    let test_content = b"- Item 1\n+ Item 2\n- Item 3";

    let config = ExtractionConfig::default();
    let result = extract_bytes(test_content, "application/x-typst", &config)
        .await
        .expect("Extraction failed");

    assert!(
        result.content.contains("Item 1") && result.content.contains("Item 2") && result.content.contains("Item 3"),
        "All list items should be extracted."
    );
}

/// TEST 33: Regression - Math preservation
#[tokio::test]
async fn test_typst_math_regression() {
    let test_content = b"Formula: $E = mc^2$";

    let config = ExtractionConfig::default();
    let result = extract_bytes(test_content, "application/x-typst", &config)
        .await
        .expect("Extraction failed");

    assert!(
        result.content.contains("$") && (result.content.contains("mc") || result.content.contains("E")),
        "Math formulas should be preserved."
    );
}

/// TEST 34: Regression - Link extraction
#[tokio::test]
async fn test_typst_link_regression() {
    let test_content = b"Visit #link(\"https://example.com\")[example]";

    let config = ExtractionConfig::default();
    let result = extract_bytes(test_content, "application/x-typst", &config)
        .await
        .expect("Extraction failed");

    assert!(
        result.content.contains("example") || result.content.contains("example.com"),
        "Link text or URL should be preserved."
    );
}

/// TEST 35: Regression - Table basic extraction
#[tokio::test]
async fn test_typst_table_regression() {
    let test_content = b"#table(columns: 2, [A], [B], [1], [2])";

    let config = ExtractionConfig::default();
    let result = extract_bytes(test_content, "application/x-typst", &config)
        .await
        .expect("Extraction failed");

    assert!(
        result.content.contains("A") || result.content.contains("TABLE"),
        "Table content should be extracted."
    );
}

/// TEST 36: Large document handling
#[tokio::test]
async fn test_typst_large_document_stress() {
    let mut large_content = String::new();

    for i in 1..=50 {
        large_content.push_str(&format!("= Heading {}\n\n", i));
        large_content.push_str(&format!("Content for section {}.\n\n", i));
    }

    let config = ExtractionConfig::default();
    let result = extract_bytes(large_content.as_bytes(), "application/x-typst", &config)
        .await
        .expect("Extraction failed");

    let heading_count = extract_all_headings(&result.content).len();
    assert!(
        heading_count >= 40,
        "Large documents should extract all headings. Found {} of 50.",
        heading_count
    );
}

/// TEST 37: Deep nesting stress test
#[tokio::test]
async fn test_typst_deep_nesting_stress() {
    let mut nested = String::new();

    for level in 1..=6 {
        nested.push_str(&format!("{} Level {} Heading\n\n", "=".repeat(level), level));
        nested.push_str(&format!("Content at level {}.\n\n", level));
    }

    let config = ExtractionConfig::default();
    let result = extract_bytes(nested.as_bytes(), "application/x-typst", &config)
        .await
        .expect("Extraction failed");

    for level in 1..=6 {
        let count = count_heading_level(&result.content, level);
        assert!(
            count >= 1,
            "Level {} heading should be extracted in deep nesting test.",
            level
        );
    }
}

/// TEST 38: Mixed formatting stress
#[tokio::test]
async fn test_typst_mixed_formatting_stress() {
    let test_content = b"This text has *bold*, _italic_, `code`, and $math$ all mixed together!";

    let config = ExtractionConfig::default();
    let result = extract_bytes(test_content, "application/x-typst", &config)
        .await
        .expect("Extraction failed");

    let has_formatting = (result.content.contains("*") || result.content.contains("bold"))
        && (result.content.contains("_") || result.content.contains("italic"))
        && (result.content.contains("`") || result.content.contains("code"))
        && (result.content.contains("$") || result.content.contains("math"));

    assert!(has_formatting, "All mixed formatting should be preserved.");
}

/// TEST 39: Unicode stress test
#[tokio::test]
async fn test_typst_unicode_stress() {
    let test_content = "= Unicode Heading 中文 العربية\n\nContent with emojis: 🎉🚀💯\n\nGreek: α β γ δ ε ζ".as_bytes();

    let config = ExtractionConfig::default();
    let result = extract_bytes(test_content, "application/x-typst", &config)
        .await
        .expect("Extraction failed");

    assert!(
        result.content.contains("Unicode"),
        "Unicode content should be preserved."
    );
}

/// TEST 40: Pathological whitespace
#[tokio::test]
async fn test_typst_pathological_whitespace() {
    let test_content = b"= Heading\n\n\n\n\n\nContent with excessive blank lines\n\n\n\n\nMore content";

    let config = ExtractionConfig::default();
    let result = extract_bytes(test_content, "application/x-typst", &config)
        .await
        .expect("Extraction failed");

    assert!(
        result.content.contains("Heading") && result.content.contains("Content"),
        "Should extract content even with excessive whitespace."
    );
}

/// TEST 41: Full document comparison - simple.typ
#[tokio::test]
async fn test_typst_full_simple_document_comparison() {
    let content = load_test_document("simple.typ");
    let _baseline = load_pandoc_baseline("simple");
    let config = ExtractionConfig::default();

    let result = extract_bytes(&content, "application/x-typst", &config)
        .await
        .expect("Extraction failed");

    assert!(
        result.content.len() > 50,
        "simple.typ should extract substantial content"
    );

    let heading_count = extract_all_headings(&result.content).len();
    assert!(heading_count > 2, "simple.typ should have multiple sections");
}

/// TEST 42: Full document comparison - advanced.typ
#[tokio::test]
async fn test_typst_full_advanced_document_comparison() {
    let content = load_test_document("advanced.typ");
    let _baseline = load_pandoc_baseline("advanced");
    let config = ExtractionConfig::default();

    let result = extract_bytes(&content, "application/x-typst", &config)
        .await
        .expect("Extraction failed");

    assert!(
        result.content.len() > 100,
        "advanced.typ should extract comprehensive content"
    );

    let heading_count = extract_all_headings(&result.content).len();
    assert!(heading_count >= 5, "advanced.typ should preserve heading structure");
}

/// TEST 43: MIME type consistency
///
/// The extractor should support both standard MIME types for Typst.
/// Currently only supports application/x-typst, not text/x-typst.
#[tokio::test]
async fn test_typst_mime_type_consistency() {
    let content = load_test_document("simple.typ");
    let config = ExtractionConfig::default();

    let result_primary = extract_bytes(&content, "application/x-typst", &config)
        .await
        .expect("Primary MIME type should work");

    assert!(
        result_primary.content.len() > 0,
        "Primary MIME type should extract content"
    );

    match extract_bytes(&content, "text/x-typst", &config).await {
        Ok(result) => {
            assert!(
                result.content.len() > 0,
                "Alternative MIME type should extract content if supported"
            );
        }
        Err(_e) => {
            println!("Note: text/x-typst is not currently supported (may be added in future)");
        }
    }
}

/// TEST 44: Config parameter impact
#[tokio::test]
async fn test_typst_config_parameter_handling() {
    let content = load_test_document("simple.typ");
    let config = ExtractionConfig::default();

    let result = extract_bytes(&content, "application/x-typst", &config)
        .await
        .expect("Extraction failed");

    assert!(!result.content.is_empty(), "Extraction with default config should work");

    assert_eq!(result.mime_type, "application/x-typst", "MIME type should be preserved");
}

/// TEST 45: Comparative heading analysis
///
/// This final comprehensive test checks heading extraction
/// against the baseline to identify the exact scope of the heading loss bug.
#[tokio::test]
async fn test_typst_heading_loss_bug_analysis() {
    let content = load_test_document("headings.typ");
    let baseline = load_pandoc_baseline("headings");
    let config = ExtractionConfig::default();

    let result = extract_bytes(&content, "application/x-typst", &config)
        .await
        .expect("Extraction failed");

    println!("\n===== HEADING EXTRACTION ANALYSIS =====");
    println!("Baseline content:");
    println!("{}", baseline);
    println!("\nExtracted content:");
    println!("{}", result.content);

    let extracted_headings = extract_all_headings(&result.content);
    println!("\nExtracted headings: {}", extracted_headings.len());
    for (i, h) in extracted_headings.iter().enumerate() {
        println!("  {}: {}", i + 1, h);
    }

    assert!(
        extracted_headings.len() >= 6,
        "BUG CONFIRMED: Heading loss detected. \
         Expected 6 headings (1-6 levels), found {}. \
         This is the 62% heading loss bug - only single '=' is matched, \
         all '==' and higher are skipped entirely.",
        extracted_headings.len()
    );
}