// File: crates/kreuzberg/tests/typst_extractor_tests.rs

//! Comprehensive TDD test suite for Typst document extraction.
//!
//! This test suite validates Typst document extraction against expected outputs.
//! The tests verify:
//! - Document metadata extraction (title, author, date, keywords)
//! - Heading hierarchy parsing (=, ==, ===, etc.)
//! - Inline formatting (bold, italic, code)
//! - Table extraction and parsing
//! - List handling (ordered and unordered)
//! - Link extraction
//! - Mathematical notation preservation
//!
//! Each test document is extracted and validated for correct content extraction.

#![cfg(feature = "office")]

use kreuzberg::core::config::ExtractionConfig;
use kreuzberg::core::extractor::extract_bytes;
use std::{fs, path::PathBuf};

/// Builds the path to a Typst fixture file.
///
/// The result is `<CARGO_MANIFEST_DIR>/../../test_documents/typst/<name>`; the path
/// is only assembled here, it is not checked for existence.
fn typst_fixture(name: &str) -> PathBuf {
    let mut fixture = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
    fixture.push("../../test_documents/typst");
    fixture.push(name);
    fixture
}

/// Extracts `simple.typ`, a basic document exercising fundamental Typst features.
///
/// The fixture is documented as containing:
/// - `#set document()` metadata (title, author, date)
/// - the headings "Introduction", "Subsection", "Features", "Lists", "Code",
///   "Tables", "Links" and "Conclusion"
/// - inline formatting: *bold*, _italic_, `inline code`
/// - an unordered list with 3 items, a code snippet, a 2x2 table with headers
///   and a link to the Typst website
///
/// Asserts that the MIME type is echoed back, that title and authors are
/// populated, and that each of the 8 heading markers occurs exactly once in the
/// extracted text.
///
/// The test returns early (without failing) when the fixture cannot be read or
/// when `extract_bytes` returns an error.
#[tokio::test]
async fn test_simple_typst_document_extraction() {
    let config = ExtractionConfig::default();

    let bytes = match fs::read(typst_fixture("simple.typ")) {
        Err(e) => {
            eprintln!("Warning: Could not read simple.typ: {}. Skipping test.", e);
            return;
        }
        Ok(bytes) => bytes,
    };

    let extraction = match extract_bytes(&bytes, "text/x-typst", &config).await {
        Ok(extraction) => extraction,
        Err(_) => {
            println!("Skipping test: Typst extractor may not be available");
            return;
        }
    };

    let text: &str = &extraction.content;
    let contains_any = |needles: &[&str]| needles.iter().any(|&needle| text.contains(needle));

    assert_eq!(extraction.mime_type, "text/x-typst", "MIME type should be preserved");

    assert!(!text.is_empty(), "Extracted content should not be empty");

    assert!(
        extraction.metadata.title.is_some(),
        "Document title should be extracted from #set document()"
    );

    assert!(
        extraction.metadata.authors.is_some(),
        "Document author should be extracted"
    );

    for heading in ["Introduction", "Features", "Conclusion"].iter() {
        assert!(text.contains(heading), "Should extract '{}' heading", heading);
    }

    // Substring search: a pattern such as "= Features" also matches inside
    // "== Features", so these counts do not by themselves pin the heading level.
    // NOTE(review): the previous doc comment listed "Features" and "Conclusion" as
    // level 2 while the messages below say level 1 - confirm against the fixture.
    let expected_markers = [
        ("= Introduction", "Should extract 'Introduction' (level 1)"),
        ("== Subsection", "Should extract 'Subsection' (level 2)"),
        ("= Features", "Should extract 'Features' (level 1)"),
        ("== Lists", "Should extract 'Lists' (level 2)"),
        ("== Code", "Should extract 'Code' (level 2)"),
        ("== Tables", "Should extract 'Tables' (level 2)"),
        ("== Links", "Should extract 'Links' (level 2)"),
        ("= Conclusion", "Should extract 'Conclusion' (level 1)"),
    ];
    for &(marker, message) in expected_markers.iter() {
        assert_eq!(text.matches(marker).count(), 1, "{}", message);
    }

    assert!(contains_any(&["*", "bold"]), "Should preserve bold formatting or text");

    assert!(contains_any(&["-", "First", "item"]), "Should extract list content");

    println!(
        "✓ simple.typ: Successfully extracted {} characters with all 8 headings",
        text.len()
    );
}

/// Extracts `minimal.typ`, the smallest fixture.
///
/// The fixture is documented as a single level 1 heading ("Hello World") followed
/// by simple text. The test asserts that the extracted content is non-empty and
/// mentions "Hello" or "World".
///
/// The test returns early (without failing) when the fixture cannot be read or
/// when `extract_bytes` returns an error.
#[tokio::test]
async fn test_minimal_typst_document_extraction() {
    let config = ExtractionConfig::default();

    let bytes = match fs::read(typst_fixture("minimal.typ")) {
        Err(e) => {
            eprintln!("Warning: Could not read minimal.typ: {}. Skipping test.", e);
            return;
        }
        Ok(bytes) => bytes,
    };

    let extraction = match extract_bytes(&bytes, "application/x-typst", &config).await {
        Ok(extraction) => extraction,
        Err(_) => {
            println!("Skipping test: Typst extractor may not be available");
            return;
        }
    };

    let text: &str = &extraction.content;

    assert!(!text.is_empty(), "Minimal document should extract content");

    let mentions_heading = ["Hello", "World"].iter().any(|&word| text.contains(word));
    assert!(mentions_heading, "Should extract heading content");

    println!("✓ minimal.typ: Successfully extracted {} characters", text.len());
}

/// Extracts `headings.typ` and checks the heading hierarchy.
///
/// The fixture is documented as having six heading levels (`=` through `======`)
/// with content under each. For every level N the test first asserts that either
/// the marker `<N equals signs> Level N` or the plain text `Level N Heading` is
/// present, and then that the marker occurs exactly once.
///
/// The test returns early (without failing) when the fixture cannot be read or
/// when `extract_bytes` returns an error.
#[tokio::test]
async fn test_heading_hierarchy_extraction() {
    let config = ExtractionConfig::default();

    let bytes = match fs::read(typst_fixture("headings.typ")) {
        Err(e) => {
            eprintln!("Warning: Could not read headings.typ: {}. Skipping test.", e);
            return;
        }
        Ok(bytes) => bytes,
    };

    let extraction = match extract_bytes(&bytes, "text/x-typst", &config).await {
        Ok(extraction) => extraction,
        Err(_) => {
            println!("Skipping test: Typst extractor may not be available");
            return;
        }
    };

    let text: &str = &extraction.content;

    assert!(!text.is_empty(), "Document should extract content");

    // Marker for a given level, e.g. level 3 -> "=== Level 3".
    let marker_for = |level: usize| format!("{} Level {}", "=".repeat(level), level);

    // Pass 1: each heading must be present, with or without its marker.
    for level in 1..=6usize {
        let marker = marker_for(level);
        let plain = format!("Level {} Heading", level);
        assert!(
            text.contains(marker.as_str()) || text.contains(plain.as_str()),
            "Should extract level {} heading",
            level
        );
    }

    // Pass 2: each marker must appear exactly once. Shorter markers are substrings
    // of longer ones only when followed by the same level number, so the counts
    // stay independent as long as each level number is used once.
    for level in 1..=6usize {
        let marker = marker_for(level);
        assert_eq!(
            text.matches(marker.as_str()).count(),
            1,
            "Should extract exactly one level {} heading",
            level
        );
    }

    println!(
        "✓ headings.typ: Successfully extracted {} characters with heading structure",
        text.len()
    );
}

/// Extracts `metadata.typ` and checks the document metadata it yields.
///
/// The fixture is documented as declaring title, author, subject and keywords via
/// `#set document()`, followed by content sections.
///
/// Title and author are checked in both places they may surface: the typed
/// `metadata.title` / `metadata.authors` fields (the ones the `simple.typ` test in
/// this file asserts on) and the free-form `metadata.additional` map. Looking only
/// at the map would leave those assertions unexercised whenever the extractor
/// reports the values through the typed fields. Every check is conditional on the
/// value being present, so a missing field does not fail the test.
///
/// The test returns early (without failing) when the fixture cannot be read or
/// when `extract_bytes` returns an error.
#[tokio::test]
async fn test_metadata_extraction() {
    let config = ExtractionConfig::default();

    let doc_path = typst_fixture("metadata.typ");
    let content = match fs::read(doc_path) {
        Ok(c) => c,
        Err(e) => {
            eprintln!("Warning: Could not read metadata.typ: {}. Skipping test.", e);
            return;
        }
    };

    let extraction = match extract_bytes(&content, "application/x-typst", &config).await {
        Ok(extraction) => extraction,
        Err(_) => {
            println!("Skipping test: Typst extractor may not be available");
            return;
        }
    };

    // Typed metadata fields.
    // NOTE(review): assumes `title` is a string and `authors` a list of strings -
    // confirm against the `Metadata` definition.
    if let Some(title) = &extraction.metadata.title {
        assert!(
            title.contains("Metadata") || title.contains("Example"),
            "Title should contain expected text"
        );
    }

    if let Some(authors) = &extraction.metadata.authors {
        assert!(
            authors
                .iter()
                .any(|author| author.contains("John") || author.contains("Doe")),
            "Author should contain expected text"
        );
    }

    // Free-form map entries, kept for extractors that report metadata there.
    if let Some(title) = extraction.metadata.additional.get("title") {
        let title = title.to_string();
        assert!(
            title.contains("Metadata") || title.contains("Example"),
            "Title should contain expected text"
        );
    }

    if let Some(author) = extraction.metadata.additional.get("author") {
        let author = author.to_string();
        assert!(
            author.contains("John") || author.contains("Doe"),
            "Author should contain expected text"
        );
    }

    if let Some(keywords) = &extraction.metadata.keywords {
        assert!(!keywords.is_empty(), "Keywords should be present");
    }

    assert!(!extraction.content.is_empty(), "Document should extract content");

    println!(
        "✓ metadata.typ: Successfully extracted metadata and {} characters of content",
        extraction.content.len()
    );
}

/// Extracts `advanced.typ`, a fixture combining many Typst features.
///
/// The fixture is documented as containing:
/// - metadata (title, author, keywords, date) and heading numbering configuration
/// - inline and display mathematical notation
/// - nested headings (levels 1 through 4)
/// - a Python code block
/// - a table with 3 columns and 4 rows
/// - multiple paragraphs, links with text, and mixed formatting
///
/// Each feature is checked loosely: the assertion passes when any one of a few
/// representative substrings shows up in the extracted text.
///
/// The test returns early (without failing) when the fixture cannot be read or
/// when `extract_bytes` returns an error.
#[tokio::test]
async fn test_advanced_typst_document_extraction() {
    let config = ExtractionConfig::default();

    let bytes = match fs::read(typst_fixture("advanced.typ")) {
        Err(e) => {
            eprintln!("Warning: Could not read advanced.typ: {}. Skipping test.", e);
            return;
        }
        Ok(bytes) => bytes,
    };

    let extraction = match extract_bytes(&bytes, "text/x-typst", &config).await {
        Ok(extraction) => extraction,
        Err(_) => {
            println!("Skipping test: Typst extractor may not be available");
            return;
        }
    };

    let text: &str = &extraction.content;
    let contains_any = |needles: &[&str]| needles.iter().any(|&needle| text.contains(needle));

    assert!(extraction.metadata.title.is_some(), "Title should be extracted");

    assert!(!text.is_empty(), "Advanced document should extract content");

    assert!(
        contains_any(&["$", "equation", "math"]),
        "Should extract or preserve mathematical notation"
    );

    assert!(
        contains_any(&["Mathematical", "Formatting", "Features"]),
        "Should extract section headings"
    );

    assert!(
        contains_any(&["python", "def", "fibonacci", "```"]),
        "Should extract code block content"
    );

    // Counts every '=' in the output, not only those in heading markers.
    let equals_signs = text.matches("=").count();
    assert!(equals_signs >= 3, "Should preserve nested heading hierarchy");

    assert!(contains_any(&["Name", "Alice", "Table"]), "Should extract table content");

    assert!(contains_any(&["example", "link", "http"]), "Should extract link content");

    println!(
        "✓ advanced.typ: Successfully extracted {} characters with complex formatting",
        text.len()
    );
}

/// Extracts `typst-reader.typ`, a fixture taken from the Pandoc test suite.
///
/// Asserts that the extracted content is non-empty and contains either an `=`
/// character or the word "Fibonacci".
///
/// The test returns early (without failing) when the fixture cannot be read or
/// when `extract_bytes` returns an error.
#[tokio::test]
async fn test_typst_reader_extraction() {
    let config = ExtractionConfig::default();

    let bytes = match fs::read(typst_fixture("typst-reader.typ")) {
        Err(e) => {
            eprintln!("Warning: Could not read typst-reader.typ: {}. Skipping test.", e);
            return;
        }
        Ok(bytes) => bytes,
    };

    let extraction = match extract_bytes(&bytes, "application/x-typst", &config).await {
        Ok(extraction) => extraction,
        Err(_) => {
            println!("Skipping test: Typst extractor may not be available");
            return;
        }
    };

    let text: &str = &extraction.content;

    assert!(!text.is_empty(), "Should extract content from Pandoc test file");

    let has_expected_text = ["=", "Fibonacci"].iter().any(|&needle| text.contains(needle));
    assert!(has_expected_text, "Should extract heading or content from test file");

    println!("✓ typst-reader.typ: Successfully extracted {} characters", text.len());
}

/// Extracts `undergradmath.typ`, a Pandoc test-suite fixture documented as having
/// extensive mathematical notation and complex formatting.
///
/// Asserts that content is non-empty, that an `additional["title"]` entry (when
/// present) renders to a non-empty string, and that the content contains `=`,
/// contains "Typst", or is longer than 100 bytes.
///
/// The test returns early (without failing) when the fixture cannot be read or
/// when `extract_bytes` returns an error.
#[tokio::test]
async fn test_undergradmath_extraction() {
    let config = ExtractionConfig::default();

    let bytes = match fs::read(typst_fixture("undergradmath.typ")) {
        Err(e) => {
            eprintln!("Warning: Could not read undergradmath.typ: {}. Skipping test.", e);
            return;
        }
        Ok(bytes) => bytes,
    };

    let extraction = match extract_bytes(&bytes, "text/x-typst", &config).await {
        Ok(extraction) => extraction,
        Err(_) => {
            println!("Skipping test: Typst extractor may not be available");
            return;
        }
    };

    let text: &str = &extraction.content;

    assert!(!text.is_empty(), "Should extract content from complex math document");

    // NOTE(review): other tests in this file read the title from `metadata.title`;
    // confirm that the extractor still populates `additional["title"]`.
    if let Some(title) = extraction.metadata.additional.get("title") {
        let rendered = title.to_string();
        assert!(!rendered.is_empty(), "Title should be extracted");
    }

    let looks_extracted = text.contains("=") || text.contains("Typst") || text.len() > 100;
    assert!(looks_extracted, "Should extract document structure or content");

    println!(
        "✓ undergradmath.typ: Successfully extracted {} characters from math document",
        text.len()
    );
}

/// Runs `simple.typ` through `extract_bytes` under several MIME type labels.
///
/// For `application/x-typst`, `text/x-typst` and `text/plain`, a successful
/// extraction must yield non-empty content. A MIME type for which extraction
/// returns an error is silently skipped.
///
/// The test returns early (without failing) when the fixture cannot be read.
#[tokio::test]
async fn test_typst_mime_type_variants() {
    let config = ExtractionConfig::default();

    let bytes = match fs::read(typst_fixture("simple.typ")) {
        Err(e) => {
            eprintln!("Warning: Could not read simple.typ: {}. Skipping test.", e);
            return;
        }
        Ok(bytes) => bytes,
    };

    for mime_type in ["application/x-typst", "text/x-typst", "text/plain"] {
        let extraction = match extract_bytes(&bytes, mime_type, &config).await {
            Ok(extraction) => extraction,
            // Errors are tolerated here; only successful extractions are inspected.
            Err(_) => continue,
        };

        assert!(
            !extraction.content.is_empty(),
            "Should extract content with MIME type: {}",
            mime_type
        );
        println!(
            "✓ MIME type '{}': Successfully extracted {} characters",
            mime_type,
            extraction.content.len()
        );
    }
}

/// Checks that inline formatting from `simple.typ` survives extraction.
///
/// For bold, italic and code, the assertion passes when the extracted text holds
/// either the marker character (`*`, `_`, `` ` ``) or the corresponding word
/// ("bold", "italic", "code").
///
/// The test returns early (without failing) when the fixture cannot be read or
/// when `extract_bytes` returns an error.
#[tokio::test]
async fn test_formatting_preservation() {
    let config = ExtractionConfig::default();

    let bytes = match fs::read(typst_fixture("simple.typ")) {
        Err(e) => {
            eprintln!("Warning: Could not read simple.typ: {}. Skipping test.", e);
            return;
        }
        Ok(bytes) => bytes,
    };

    let extraction = match extract_bytes(&bytes, "text/x-typst", &config).await {
        Ok(extraction) => extraction,
        Err(_) => {
            println!("Skipping test: Typst extractor may not be available");
            return;
        }
    };

    let text: &str = &extraction.content;

    // (marker, fallback word, failure message) for each kind of inline formatting.
    let expectations = [
        ("*", "bold", "Should preserve bold formatting or text"),
        ("_", "italic", "Should preserve italic formatting or text"),
        ("`", "code", "Should preserve code formatting or text"),
    ];
    for &(marker, word, message) in expectations.iter() {
        assert!(text.contains(marker) || text.contains(word), "{}", message);
    }

    println!("✓ Formatting preservation: All markers/content found in extracted text");
}

/// Extracts `undergradmath.typ` as a stand-in for a large input.
///
/// Only asserts that the extracted content is non-empty.
///
/// The test returns early (without failing) when the fixture cannot be read or
/// when `extract_bytes` returns an error.
#[tokio::test]
async fn test_large_document_extraction() {
    let config = ExtractionConfig::default();

    let bytes = match fs::read(typst_fixture("undergradmath.typ")) {
        Err(e) => {
            eprintln!("Warning: Could not read undergradmath.typ: {}. Skipping test.", e);
            return;
        }
        Ok(bytes) => bytes,
    };

    let extraction = match extract_bytes(&bytes, "text/x-typst", &config).await {
        Ok(extraction) => extraction,
        Err(_) => {
            println!("Skipping test: Typst extractor may not be available");
            return;
        }
    };

    let extracted_len = extraction.content.len();

    assert!(extracted_len != 0, "Should extract content from large document");

    println!(
        "✓ Large document: Extracted {} bytes of content from source file",
        extracted_len
    );
}

/// Feeds an empty byte slice to `extract_bytes` as `text/x-typst`.
///
/// Both outcomes are accepted: a successful extraction and an error are each
/// just reported on stdout. The test therefore only fails if the call panics.
#[tokio::test]
async fn test_empty_content_handling() {
    let config = ExtractionConfig::default();

    let empty_content = b"";

    match extract_bytes(empty_content, "text/x-typst", &config).await {
        Err(e) => println!("✓ Empty content: Resulted in expected error: {}", e),
        Ok(extraction) => println!(
            "✓ Empty content: Handled gracefully, extracted {} bytes",
            extraction.content.len()
        ),
    }
}

/// Checks the extractor's registration priority.
///
/// `TypstExtractor::priority()` is expected to report 50.
#[tokio::test]
async fn test_typst_extractor_priority() {
    use kreuzberg::{extractors::TypstExtractor, plugins::DocumentExtractor};

    let reported = TypstExtractor.priority();

    assert_eq!(reported, 50, "Typst extractor should have priority 50");
    println!("✓ Typst extractor priority: {}", reported);
}

/// Checks the MIME types the extractor advertises.
///
/// `TypstExtractor::supported_mime_types()` must include both
/// `application/x-typst` and `text/x-typst`.
#[tokio::test]
async fn test_supported_mime_types() {
    use kreuzberg::{extractors::TypstExtractor, plugins::DocumentExtractor};

    let advertised = TypstExtractor.supported_mime_types();

    for required in ["application/x-typst", "text/x-typst"] {
        assert!(advertised.contains(&required), "Should support {}", required);
    }

    println!("✓ Supported MIME types: {:?}", advertised);
}