//! Comprehensive TDD test suite for Typst document extraction.
//!
//! This test suite validates Typst document extraction against expected outputs.
//! The tests verify:
//! - Document metadata extraction (title, author, date, keywords)
//! - Heading hierarchy parsing (=, ==, ===, etc.)
//! - Inline formatting (bold, italic, code)
//! - Table extraction and parsing
//! - List handling (ordered and unordered)
//! - Link extraction
//! - Mathematical notation preservation
//!
//! Each test document is extracted and validated for correct content extraction.
//!
//! Tests that depend on a fixture skip (return early after printing a message)
//! instead of failing when the fixture cannot be read or when `extract_bytes`
//! returns an error.

#![cfg(feature = "office")]

use kreuzberg::core::config::ExtractionConfig;
use kreuzberg::core::extractor::extract_bytes;
use std::{fs, path::PathBuf};

/// Builds the path of the fixture `name` inside `../../test_documents/typst`,
/// resolved relative to this crate's `CARGO_MANIFEST_DIR`.
fn typst_fixture(name: &str) -> PathBuf {
    PathBuf::from(env!("CARGO_MANIFEST_DIR"))
        .join("../../test_documents/typst")
        .join(name)
}

/// Reads the fixture `name` into memory.
///
/// Returns `None` after printing a warning to stderr when the file cannot be
/// read, so the calling test can skip instead of failing.
fn read_fixture(name: &str) -> Option<Vec<u8>> {
    match fs::read(typst_fixture(name)) {
        Ok(bytes) => Some(bytes),
        Err(e) => {
            eprintln!("Warning: Could not read {}: {}. Skipping test.", name, e);
            None
        }
    }
}

/// Test simple.typ - Basic Typst document with fundamental formatting
///
/// Document contains:
/// - Document metadata: title, author, date
/// - Level 1 heading: "Introduction"
/// - Level 2 headings: "Subsection", "Features", "Lists", "Code", "Tables", "Links", "Conclusion"
/// - Inline formatting: *bold*, _italic_, `inline code`
/// - Unordered list with 3 items
/// - Code snippet
/// - 2x2 table with headers
/// - Link to Typst website
///
/// Expected: Document should extract text, preserve headings, metadata, and formatting markers
#[tokio::test]
async fn test_simple_typst_document_extraction() {
    let config = ExtractionConfig::default();

    let content = match read_fixture("simple.typ") {
        Some(bytes) => bytes,
        None => return,
    };

    let extraction = match extract_bytes(&content, "text/x-typst", &config).await {
        Ok(extraction) => extraction,
        Err(e) => {
            println!("Skipping test: Typst extractor may not be available ({})", e);
            return;
        }
    };

    assert_eq!(extraction.mime_type, "text/x-typst", "MIME type should be preserved");

    assert!(!extraction.content.is_empty(), "Extracted content should not be empty");

    assert!(
        extraction.metadata.title.is_some(),
        "Document title should be extracted from #set document()"
    );

    assert!(
        extraction.metadata.authors.is_some(),
        "Document author should be extracted"
    );

    assert!(
        extraction.content.contains("Introduction"),
        "Should extract 'Introduction' heading"
    );
    assert!(
        extraction.content.contains("Features"),
        "Should extract 'Features' heading"
    );
    assert!(
        extraction.content.contains("Conclusion"),
        "Should extract 'Conclusion' heading"
    );

    // NOTE(review): these are substring counts, so a pattern such as
    // "= Features" also matches inside "== Features"; the level named in each
    // message is therefore not strictly verified. The doc comment above lists
    // "Features" and "Conclusion" as level 2 while the messages say level 1 --
    // confirm against the fixture.
    let expected_headings = [
        ("= Introduction", "Should extract 'Introduction' (level 1)"),
        ("== Subsection", "Should extract 'Subsection' (level 2)"),
        ("= Features", "Should extract 'Features' (level 1)"),
        ("== Lists", "Should extract 'Lists' (level 2)"),
        ("== Code", "Should extract 'Code' (level 2)"),
        ("== Tables", "Should extract 'Tables' (level 2)"),
        ("== Links", "Should extract 'Links' (level 2)"),
        ("= Conclusion", "Should extract 'Conclusion' (level 1)"),
    ];
    for &(pattern, message) in &expected_headings {
        assert_eq!(extraction.content.matches(pattern).count(), 1, "{}", message);
    }

    assert!(
        extraction.content.contains("*") || extraction.content.contains("bold"),
        "Should preserve bold formatting or text"
    );

    assert!(
        extraction.content.contains("-") || extraction.content.contains("First") || extraction.content.contains("item"),
        "Should extract list content"
    );

    println!(
        "✓ simple.typ: Successfully extracted {} characters with all 8 headings",
        extraction.content.len()
    );
}

/// Test minimal.typ - Minimal Typst document
///
/// Document contains:
/// - Single level 1 heading: "Hello World"
/// - Simple text content
///
/// Expected: Basic heading and content extraction
#[tokio::test]
async fn test_minimal_typst_document_extraction() {
    let config = ExtractionConfig::default();

    let content = match read_fixture("minimal.typ") {
        Some(bytes) => bytes,
        None => return,
    };

    let extraction = match extract_bytes(&content, "application/x-typst", &config).await {
        Ok(extraction) => extraction,
        Err(e) => {
            println!("Skipping test: Typst extractor may not be available ({})", e);
            return;
        }
    };

    assert!(
        !extraction.content.is_empty(),
        "Minimal document should extract content"
    );

    assert!(
        extraction.content.contains("Hello") || extraction.content.contains("World"),
        "Should extract heading content"
    );

    println!(
        "✓ minimal.typ: Successfully extracted {} characters",
        extraction.content.len()
    );
}

/// Test headings.typ - Document focusing on heading hierarchy
///
/// Document contains:
/// - 6 heading levels (=, ==, ===, ====, =====, ======)
/// - Content under each heading level
///
/// Expected: Heading structure should be preserved with level information
#[tokio::test]
async fn test_heading_hierarchy_extraction() {
    let config = ExtractionConfig::default();

    let content = match read_fixture("headings.typ") {
        Some(bytes) => bytes,
        None => return,
    };

    let extraction = match extract_bytes(&content, "text/x-typst", &config).await {
        Ok(extraction) => extraction,
        Err(e) => {
            println!("Skipping test: Typst extractor may not be available ({})", e);
            return;
        }
    };

    assert!(!extraction.content.is_empty(), "Document should extract content");

    for level in 1..=6usize {
        // Pattern is the Typst heading marker for this level followed by the
        // title prefix, e.g. "=== Level 3".
        let marker = format!("{} Level {}", "=".repeat(level), level);
        let title = format!("Level {} Heading", level);

        assert!(
            extraction.content.contains(marker.as_str()) || extraction.content.contains(title.as_str()),
            "Should extract level {} heading",
            level
        );

        assert_eq!(
            extraction.content.matches(marker.as_str()).count(),
            1,
            "Should extract exactly one level {} heading",
            level
        );
    }

    println!(
        "✓ headings.typ: Successfully extracted {} characters with heading structure",
        extraction.content.len()
    );
}

/// Test metadata.typ - Document with comprehensive metadata
///
/// Document contains:
/// - #set document() with: title, author, subject, keywords
/// - Content sections
///
/// Expected: All metadata fields should be extracted correctly
#[tokio::test]
async fn test_metadata_extraction() {
    let config = ExtractionConfig::default();

    let content = match read_fixture("metadata.typ") {
        Some(bytes) => bytes,
        None => return,
    };

    let extraction = match extract_bytes(&content, "application/x-typst", &config).await {
        Ok(extraction) => extraction,
        Err(e) => {
            println!("Skipping test: Typst extractor may not be available ({})", e);
            return;
        }
    };

    // NOTE(review): other tests in this file read `metadata.title` and
    // `metadata.authors` directly; if the extractor never populates
    // `metadata.additional["title"]` / `["author"]`, the two checks below are
    // vacuous -- confirm which location is authoritative.
    if let Some(title) = extraction.metadata.additional.get("title") {
        let title = title.to_string();
        assert!(
            title.contains("Metadata") || title.contains("Example"),
            "Title should contain expected text"
        );
    }

    if let Some(author) = extraction.metadata.additional.get("author") {
        let author = author.to_string();
        assert!(
            author.contains("John") || author.contains("Doe"),
            "Author should contain expected text"
        );
    }

    if let Some(keywords) = &extraction.metadata.keywords {
        assert!(!keywords.is_empty(), "Keywords should be present");
    }

    assert!(!extraction.content.is_empty(), "Document should extract content");

    println!(
        "✓ metadata.typ: Successfully extracted metadata and {} characters of content",
        extraction.content.len()
    );
}

/// Test advanced.typ - Complex Typst document with multiple features
///
/// Document contains:
/// - Metadata: title, author, keywords, date
/// - Heading numbering configuration
/// - Mathematical notation (inline and display)
/// - Nested heading levels (level 1, 2, 3, 4)
/// - Code blocks (Python example)
/// - Complex tables with 3 columns and 4 rows
/// - Multiple paragraph sections
/// - Links with text
/// - Multiple formatting combinations
///
/// Expected: Comprehensive extraction of all document elements
#[tokio::test]
async fn test_advanced_typst_document_extraction() {
    let config = ExtractionConfig::default();

    let content = match read_fixture("advanced.typ") {
        Some(bytes) => bytes,
        None => return,
    };

    let extraction = match extract_bytes(&content, "text/x-typst", &config).await {
        Ok(extraction) => extraction,
        Err(e) => {
            println!("Skipping test: Typst extractor may not be available ({})", e);
            return;
        }
    };

    assert!(extraction.metadata.title.is_some(), "Title should be extracted");

    assert!(
        !extraction.content.is_empty(),
        "Advanced document should extract content"
    );

    assert!(
        extraction.content.contains("$") || extraction.content.contains("equation") || extraction.content.contains("math"),
        "Should extract or preserve mathematical notation"
    );

    assert!(
        extraction.content.contains("Mathematical")
            || extraction.content.contains("Formatting")
            || extraction.content.contains("Features"),
        "Should extract section headings"
    );

    assert!(
        extraction.content.contains("python")
            || extraction.content.contains("def")
            || extraction.content.contains("fibonacci")
            || extraction.content.contains("```"),
        "Should extract code block content"
    );

    // Counts every '=' character in the output, not just heading markers.
    let level_count = extraction.content.matches("=").count();
    assert!(level_count >= 3, "Should preserve nested heading hierarchy");

    assert!(
        extraction.content.contains("Name") || extraction.content.contains("Alice") || extraction.content.contains("Table"),
        "Should extract table content"
    );

    assert!(
        extraction.content.contains("example") || extraction.content.contains("link") || extraction.content.contains("http"),
        "Should extract link content"
    );

    println!(
        "✓ advanced.typ: Successfully extracted {} characters with complex formatting",
        extraction.content.len()
    );
}

/// Test typst-reader.typ - Pandoc test file
///
/// Document from Pandoc test suite demonstrating Typst reader functionality
///
/// Expected: Proper extraction of Typst-specific syntax
#[tokio::test]
async fn test_typst_reader_extraction() {
    let config = ExtractionConfig::default();

    let content = match read_fixture("typst-reader.typ") {
        Some(bytes) => bytes,
        None => return,
    };

    let extraction = match extract_bytes(&content, "application/x-typst", &config).await {
        Ok(extraction) => extraction,
        Err(e) => {
            println!("Skipping test: Typst extractor may not be available ({})", e);
            return;
        }
    };

    assert!(
        !extraction.content.is_empty(),
        "Should extract content from Pandoc test file"
    );

    assert!(
        extraction.content.contains("=") || extraction.content.contains("Fibonacci"),
        "Should extract heading or content from test file"
    );

    println!(
        "✓ typst-reader.typ: Successfully extracted {} characters",
        extraction.content.len()
    );
}

/// Test undergradmath.typ - Pandoc test file with complex math
///
/// Document from Pandoc test suite with extensive mathematical notation
/// and complex formatting
///
/// Expected: Handling of complex Typst syntax with metadata and content
#[tokio::test]
async fn test_undergradmath_extraction() {
    let config = ExtractionConfig::default();

    let content = match read_fixture("undergradmath.typ") {
        Some(bytes) => bytes,
        None => return,
    };

    let extraction = match extract_bytes(&content, "text/x-typst", &config).await {
        Ok(extraction) => extraction,
        Err(e) => {
            println!("Skipping test: Typst extractor may not be available ({})", e);
            return;
        }
    };

    assert!(
        !extraction.content.is_empty(),
        "Should extract content from complex math document"
    );

    if let Some(title) = extraction.metadata.additional.get("title") {
        assert!(!title.to_string().is_empty(), "Title should be extracted");
    }

    assert!(
        extraction.content.contains("=") || extraction.content.contains("Typst") || extraction.content.len() > 100,
        "Should extract document structure or content"
    );

    println!(
        "✓ undergradmath.typ: Successfully extracted {} characters from math document",
        extraction.content.len()
    );
}

/// Test MIME type detection and fallback
///
/// Verifies that Typst documents can be extracted with different MIME type specifications
#[tokio::test]
async fn test_typst_mime_type_variants() {
    let config = ExtractionConfig::default();

    let content = match read_fixture("simple.typ") {
        Some(bytes) => bytes,
        None => return,
    };

    let mime_types = ["application/x-typst", "text/x-typst", "text/plain"];

    for &mime_type in &mime_types {
        match extract_bytes(&content, mime_type, &config).await {
            Ok(extraction) => {
                assert!(
                    !extraction.content.is_empty(),
                    "Should extract content with MIME type: {}",
                    mime_type
                );
                println!(
                    "✓ MIME type '{}': Successfully extracted {} characters",
                    mime_type,
                    extraction.content.len()
                );
            }
            // A failing variant is tolerated (not every MIME type has to be
            // routable), but it is reported so the skip is visible in the log.
            Err(e) => {
                println!("Skipping MIME type '{}': extraction failed: {}", mime_type, e);
            }
        }
    }
}

/// Test formatting preservation
///
/// Validates that inline formatting markers are preserved in extracted content
#[tokio::test]
async fn test_formatting_preservation() {
    let config = ExtractionConfig::default();

    let content = match read_fixture("simple.typ") {
        Some(bytes) => bytes,
        None => return,
    };

    let extraction = match extract_bytes(&content, "text/x-typst", &config).await {
        Ok(extraction) => extraction,
        Err(e) => {
            println!("Skipping test: Typst extractor may not be available ({})", e);
            return;
        }
    };

    assert!(
        extraction.content.contains("*") || extraction.content.contains("bold"),
        "Should preserve bold formatting or text"
    );

    assert!(
        extraction.content.contains("_") || extraction.content.contains("italic"),
        "Should preserve italic formatting or text"
    );

    assert!(
        extraction.content.contains("`") || extraction.content.contains("code"),
        "Should preserve code formatting or text"
    );

    println!("✓ Formatting preservation: All markers/content found in extracted text");
}

/// Test large document handling
///
/// Validates extraction of the large undergradmath document
#[tokio::test]
async fn test_large_document_extraction() {
    let config = ExtractionConfig::default();

    let content = match read_fixture("undergradmath.typ") {
        Some(bytes) => bytes,
        None => return,
    };

    let extraction = match extract_bytes(&content, "text/x-typst", &config).await {
        Ok(extraction) => extraction,
        Err(e) => {
            println!("Skipping test: Typst extractor may not be available ({})", e);
            return;
        }
    };

    assert!(
        !extraction.content.is_empty(),
        "Should extract content from large document"
    );

    println!(
        "✓ Large document: Extracted {} bytes of content from source file",
        extraction.content.len()
    );
}

/// Test empty/whitespace handling
///
/// Validates graceful handling of edge cases
#[tokio::test]
async fn test_empty_content_handling() {
    let config = ExtractionConfig::default();

    let empty_content = b"";
    let result = extract_bytes(empty_content, "text/x-typst", &config).await;

    // Both outcomes are accepted; the test only requires that the call returns
    // rather than panicking.
    match result {
        Ok(extraction) => {
            println!(
                "✓ Empty content: Handled gracefully, extracted {} bytes",
                extraction.content.len()
            );
        }
        Err(e) => {
            println!("✓ Empty content: Resulted in expected error: {}", e);
        }
    }
}

/// Test MIME type priority
///
/// Validates that Typst extractor has correct priority (50)
#[tokio::test]
async fn test_typst_extractor_priority() {
    use kreuzberg::extractors::TypstExtractor;
    use kreuzberg::plugins::DocumentExtractor;

    let extractor = TypstExtractor;
    let priority = extractor.priority();

    assert_eq!(priority, 50, "Typst extractor should have priority 50");
    println!("✓ Typst extractor priority: {}", priority);
}

/// Test supported MIME types
///
/// Validates that extractor claims to support Typst MIME types
#[tokio::test]
async fn test_supported_mime_types() {
    use kreuzberg::extractors::TypstExtractor;
    use kreuzberg::plugins::DocumentExtractor;

    let extractor = TypstExtractor;
    let mime_types = extractor.supported_mime_types();

    assert!(
        mime_types.contains(&"application/x-typst"),
        "Should support application/x-typst"
    );
    assert!(mime_types.contains(&"text/x-typst"), "Should support text/x-typst");

    println!("✓ Supported MIME types: {:?}", mime_types);
}