//! Comprehensive LaTeX Extractor Tests
//!
//! This test suite defines the expected behavior for LaTeX extraction.
//!
//! Test Coverage:
//! - Basic content extraction (minimal.tex)
//! - Section hierarchy (basic_sections.tex)
//! - Text formatting (formatting.tex)
//! - Mathematical expressions (math.tex)
//! - Tables (tables.tex)
//! - Lists (lists.tex)
//! - Unicode handling (unicode.tex)
//!
//! Success Criteria:
//! - All tests passing (100%)
//! - No content loss (extract meaningful content)

#![cfg(feature = "office")]

use kreuzberg::core::config::ExtractionConfig;
use kreuzberg::extraction::derive::derive_extraction_result;
use kreuzberg::extractors::latex::LatexExtractor;
use kreuzberg::plugins::DocumentExtractor;
use std::fs;
use std::path::PathBuf;

/// Helper to get absolute path to test documents.
///
/// Resolves `<grandparent of CARGO_MANIFEST_DIR>/test_documents/latex/<filename>`.
///
/// # Panics
///
/// Panics if the crate's manifest directory has fewer than two ancestors.
fn test_file_path(filename: &str) -> PathBuf {
    let manifest_dir = env!("CARGO_MANIFEST_DIR");
    PathBuf::from(manifest_dir)
        .parent()
        .expect("CARGO_MANIFEST_DIR should have a parent directory")
        .parent()
        .expect("CARGO_MANIFEST_DIR should have a grandparent directory")
        .join("test_documents")
        .join("latex")
        .join(filename)
}

/// Reads a LaTeX fixture into memory.
///
/// # Panics
///
/// Panics with the fixture name, the resolved path and the I/O error when the
/// file cannot be read, so a missing fixture is easy to locate.
fn read_fixture(filename: &str) -> Vec<u8> {
    let path = test_file_path(filename);
    fs::read(&path).unwrap_or_else(|err| panic!("Failed to read {} ({}): {}", filename, path.display(), err))
}

/// `minimal.tex` must yield non-empty plain text containing the greeting line.
#[tokio::test]
async fn test_latex_minimal_extraction() {
    let content = read_fixture("minimal.tex");
    let extractor = LatexExtractor;
    let doc = extractor
        .extract_bytes(&content, "text/x-tex", &ExtractionConfig::default())
        .await
        .expect("Should extract minimal LaTeX");
    let result = derive_extraction_result(doc, false, kreuzberg::OutputFormat::Plain);

    assert!(
        !result.content.is_empty(),
        "FAIL: Extracted 0 bytes (current bug). Should extract content from minimal.tex"
    );
    assert!(
        result.content.contains("Hello World from LaTeX!"),
        "FAIL: Should extract 'Hello World from LaTeX!' but got: '{}'",
        result.content
    );
}

/// `basic_sections.tex` must expose title, author and date in `metadata.additional`.
#[tokio::test]
async fn test_latex_metadata_extraction() {
    let content = read_fixture("basic_sections.tex");
    let extractor = LatexExtractor;
    let doc = extractor
        .extract_bytes(&content, "text/x-tex", &ExtractionConfig::default())
        .await
        .expect("Should extract LaTeX with metadata");
    let result = derive_extraction_result(doc, false, kreuzberg::OutputFormat::Plain);

    assert_eq!(
        result.metadata.additional.get("title").and_then(|v| v.as_str()),
        Some("Test Document"),
        "FAIL: Should extract title 'Test Document' from \\title{{}} command"
    );
    assert_eq!(
        result.metadata.additional.get("author").and_then(|v| v.as_str()),
        Some("John Doe"),
        "FAIL: Should extract author 'John Doe' from \\author{{}} command"
    );
    assert_eq!(
        result.metadata.additional.get("date").and_then(|v| v.as_str()),
        Some("2025-12-07"),
        "FAIL: Should extract date '2025-12-07' from \\date{{}} command"
    );
}

/// Section, subsection and subsubsection titles plus body text must appear in the output.
#[tokio::test]
async fn test_latex_section_hierarchy() {
    let content = read_fixture("basic_sections.tex");
    let extractor = LatexExtractor;
    let doc = extractor
        .extract_bytes(&content, "text/x-tex", &ExtractionConfig::default())
        .await
        .expect("Should extract LaTeX sections");
    let result = derive_extraction_result(doc, false, kreuzberg::OutputFormat::Plain);

    assert!(
        result.content.contains("Introduction"),
        "FAIL: Should extract \\section{{Introduction}} as text"
    );
    assert!(
        result.content.contains("Methods"),
        "FAIL: Should extract \\section{{Methods}} as text"
    );
    assert!(
        result.content.contains("Results"),
        "FAIL: Should extract \\section{{Results}} as text"
    );
    assert!(
        result.content.contains("Background"),
        "FAIL: Should extract \\subsection{{Background}} as text"
    );
    assert!(
        result.content.contains("Historical Context"),
        "FAIL: Should extract \\subsubsection{{Historical Context}} as text"
    );
    assert!(
        result.content.contains("This is the introduction paragraph"),
        "FAIL: Should extract paragraph text from document body"
    );
}

/// Text wrapped in formatting commands must survive extraction as plain text.
#[tokio::test]
async fn test_latex_text_formatting() {
    let content = read_fixture("formatting.tex");
    let extractor = LatexExtractor;
    let doc = extractor
        .extract_bytes(&content, "text/x-tex", &ExtractionConfig::default())
        .await
        .expect("Should extract LaTeX formatting");
    let result = derive_extraction_result(doc, false, kreuzberg::OutputFormat::Plain);

    assert!(
        result.content.contains("Text Formatting"),
        "FAIL: Should extract \\section{{Text Formatting}}"
    );
    assert!(
        result.content.contains("This is normal text"),
        "FAIL: Should extract plain paragraph text"
    );
    assert!(
        result.content.contains("bold text"),
        "FAIL: Should extract text from \\textbf{{bold text}}"
    );
    assert!(
        result.content.contains("italic text"),
        "FAIL: Should extract text from \\textit{{italic text}}"
    );
    assert!(
        result.content.contains("underlined text"),
        "FAIL: Should extract text from \\underline{{underlined text}}"
    );
    assert!(
        result.content.contains("emphasized text"),
        "FAIL: Should extract text from \\emph{{emphasized text}}"
    );
    assert!(
        result.content.contains("monospace text"),
        "FAIL: Should extract text from \\texttt{{monospace text}}"
    );
    assert!(
        result.content.contains("bold and italic"),
        "FAIL: Should extract text from nested formatting commands"
    );
}

/// Headings, surrounding prose and some trace of inline/display math must be extracted.
#[tokio::test]
async fn test_latex_math_extraction() {
    let content = read_fixture("math.tex");
    let extractor = LatexExtractor;
    let doc = extractor
        .extract_bytes(&content, "text/x-tex", &ExtractionConfig::default())
        .await
        .expect("Should extract LaTeX math");
    let result = derive_extraction_result(doc, false, kreuzberg::OutputFormat::Plain);

    assert!(
        result.content.contains("Math Formulas"),
        "FAIL: Should extract \\section{{Math Formulas}}"
    );
    assert!(
        result.content.contains("Inline Math"),
        "FAIL: Should extract \\subsection{{Inline Math}}"
    );
    assert!(
        result.content.contains("Display Math"),
        "FAIL: Should extract \\subsection{{Display Math}}"
    );
    // Either the raw source form or a Unicode rendering of the formula is accepted.
    assert!(
        result.content.contains("mc") || result.content.contains("mc²"),
        "FAIL: Should extract inline math content from $E = mc^2$"
    );
    assert!(
        result.content.contains("The equation"),
        "FAIL: Should extract text before inline math"
    );
    assert!(
        result.content.contains("is famous"),
        "FAIL: Should extract text after inline math"
    );
    assert!(
        result.content.contains("int") || result.content.contains("∫"),
        "FAIL: Should extract display math environment content"
    );
}

/// Table headers, cells and the caption must all appear in the extracted text.
#[tokio::test]
async fn test_latex_table_extraction() {
    let content = read_fixture("tables.tex");
    let extractor = LatexExtractor;
    let doc = extractor
        .extract_bytes(&content, "text/x-tex", &ExtractionConfig::default())
        .await
        .expect("Should extract LaTeX tables");
    let result = derive_extraction_result(doc, false, kreuzberg::OutputFormat::Plain);

    assert!(
        result.content.contains("Tables"),
        "FAIL: Should extract \\section{{Tables}}"
    );
    assert!(
        result.content.contains("Name"),
        "FAIL: Should extract table header 'Name' from tabular"
    );
    assert!(
        result.content.contains("Age"),
        "FAIL: Should extract table header 'Age' from tabular"
    );
    assert!(
        result.content.contains("Score"),
        "FAIL: Should extract table header 'Score' from tabular"
    );
    assert!(
        result.content.contains("Alice"),
        "FAIL: Should extract table cell 'Alice'"
    );
    assert!(result.content.contains("30"), "FAIL: Should extract table cell '30'");
    assert!(result.content.contains("95"), "FAIL: Should extract table cell '95'");
    assert!(result.content.contains("Bob"), "FAIL: Should extract table cell 'Bob'");
    assert!(
        result.content.contains("Charlie"),
        "FAIL: Should extract table cell 'Charlie'"
    );
    assert!(
        result.content.contains("Column 1"),
        "FAIL: Should extract 'Column 1' from second table"
    );
    assert!(
        result.content.contains("Column 2"),
        "FAIL: Should extract 'Column 2' from second table"
    );
    assert!(
        result.content.contains("Sample table with caption"),
        "FAIL: Should extract table caption from \\caption{{}}"
    );
}

/// Every top-level `itemize` entry in `lists.tex` must be extracted.
#[tokio::test]
async fn test_latex_list_itemize() {
    let content = read_fixture("lists.tex");
    let extractor = LatexExtractor;
    let doc = extractor
        .extract_bytes(&content, "text/x-tex", &ExtractionConfig::default())
        .await
        .expect("Should extract LaTeX lists");
    let result = derive_extraction_result(doc, false, kreuzberg::OutputFormat::Plain);

    assert!(
        result.content.contains("First item"),
        "FAIL: Should extract \\item First item from itemize"
    );
    assert!(
        result.content.contains("Second item"),
        "FAIL: Should extract \\item Second item from itemize"
    );
    assert!(
        result.content.contains("Third item with nested list"),
        "FAIL: Should extract \\item Third item with nested list"
    );
    assert!(
        result.content.contains("Fourth item"),
        "FAIL: Should extract \\item Fourth item from itemize"
    );
}

/// Items of the nested list in `lists.tex` must be extracted.
#[tokio::test]
async fn test_latex_list_nested() {
    let content = read_fixture("lists.tex");
    let extractor = LatexExtractor;
    let doc = extractor
        .extract_bytes(&content, "text/x-tex", &ExtractionConfig::default())
        .await
        .expect("Should extract LaTeX nested lists");
    let result = derive_extraction_result(doc, false, kreuzberg::OutputFormat::Plain);

    assert!(
        result.content.contains("Nested item 1"),
        "FAIL: Should extract nested \\item Nested item 1"
    );
    assert!(
        result.content.contains("Nested item 2"),
        "FAIL: Should extract nested \\item Nested item 2"
    );
}

/// Every `enumerate` entry in `lists.tex` must be extracted.
#[tokio::test]
async fn test_latex_list_enumerate() {
    let content = read_fixture("lists.tex");
    let extractor = LatexExtractor;
    let doc = extractor
        .extract_bytes(&content, "text/x-tex", &ExtractionConfig::default())
        .await
        .expect("Should extract LaTeX enumerate");
    let result = derive_extraction_result(doc, false, kreuzberg::OutputFormat::Plain);

    assert!(
        result.content.contains("First numbered item"),
        "FAIL: Should extract \\item First numbered item from enumerate"
    );
    assert!(
        result.content.contains("Second numbered item"),
        "FAIL: Should extract \\item Second numbered item from enumerate"
    );
    assert!(
        result.content.contains("Third numbered item"),
        "FAIL: Should extract \\item Third numbered item from enumerate"
    );
}

/// Both the terms and the definitions of the `description` list must be extracted.
#[tokio::test]
async fn test_latex_list_description() {
    let content = read_fixture("lists.tex");
    let extractor = LatexExtractor;
    let doc = extractor
        .extract_bytes(&content, "text/x-tex", &ExtractionConfig::default())
        .await
        .expect("Should extract LaTeX description lists");
    let result = derive_extraction_result(doc, false, kreuzberg::OutputFormat::Plain);

    assert!(
        result.content.contains("Term 1"),
        "FAIL: Should extract \\item[Term 1] from description list"
    );
    assert!(
        result.content.contains("Definition of term 1"),
        "FAIL: Should extract definition text from description list"
    );
    assert!(
        result.content.contains("Term 2"),
        "FAIL: Should extract \\item[Term 2] from description list"
    );
    assert!(
        result.content.contains("Definition of term 2"),
        "FAIL: Should extract definition text from description list"
    );
}

/// Smoke test: extracting `lists.tex` must succeed.
///
/// NOTE(review): despite the name, no comparison against Pandoc output is made
/// here; the extraction result is discarded. Confirm whether parity assertions
/// were intended.
#[tokio::test]
async fn test_latex_lists_pandoc_parity() {
    let content = read_fixture("lists.tex");
    let extractor = LatexExtractor;
    let _result = extractor
        .extract_bytes(&content, "text/x-tex", &ExtractionConfig::default())
        .await
        .expect("Should extract LaTeX lists");
}

/// `unicode.tex` must produce non-empty output containing Hebrew text or the word "Hebrew".
#[tokio::test]
async fn test_latex_unicode_handling() {
    let content = read_fixture("unicode.tex");
    let extractor = LatexExtractor;
    let doc = extractor
        .extract_bytes(&content, "text/x-tex", &ExtractionConfig::default())
        .await
        .expect("Should extract LaTeX with Unicode");
    let result = derive_extraction_result(doc, false, kreuzberg::OutputFormat::Plain);

    // Checked first: a successful `contains` below already implies non-empty
    // content, so this assertion could never fire if it came afterwards.
    assert!(
        !result.content.is_empty(),
        "FAIL: Should extract non-zero content from unicode.tex"
    );
    assert!(
        result.content.contains("אֳרָנִים") || result.content.contains("Hebrew"),
        "FAIL: Should extract Hebrew characters or 'Hebrew' text"
    );
}

/// Regression guard: `minimal.tex` must yield at least 10 bytes of content.
#[tokio::test]
async fn test_latex_no_content_loss_bug() {
    let content = read_fixture("minimal.tex");
    let extractor = LatexExtractor;
    let doc = extractor
        .extract_bytes(&content, "text/x-tex", &ExtractionConfig::default())
        .await
        .expect("Should extract minimal LaTeX");
    let result = derive_extraction_result(doc, false, kreuzberg::OutputFormat::Plain);

    assert!(
        !result.content.is_empty(),
        "FAIL: CRITICAL BUG - Extracted 0 bytes from minimal.tex. Current LaTeX extractor is completely broken."
    );
    assert!(
        result.content.len() >= 10,
        "FAIL: Extracted only {} bytes, expected at least 10. Content: '{}'",
        result.content.len(),
        result.content
    );
}

/// Extracting the same bytes twice must give identical content and metadata.
#[tokio::test]
async fn test_latex_extraction_deterministic() {
    let content = read_fixture("minimal.tex");
    let extractor = LatexExtractor;

    let doc_result1 = extractor
        .extract_bytes(&content, "text/x-tex", &ExtractionConfig::default())
        .await
        .expect("Should extract LaTeX (first run)");
    let result1 = derive_extraction_result(doc_result1, false, kreuzberg::OutputFormat::Plain);

    let doc_result2 = extractor
        .extract_bytes(&content, "text/x-tex", &ExtractionConfig::default())
        .await
        .expect("Should extract LaTeX (second run)");
    let result2 = derive_extraction_result(doc_result2, false, kreuzberg::OutputFormat::Plain);

    assert_eq!(
        result1.content, result2.content,
        "FAIL: Extraction is not deterministic. Same input produced different outputs."
    );
    assert_eq!(
        result1.metadata.additional, result2.metadata.additional,
        "FAIL: Metadata extraction is not deterministic."
    );
}

/// A document with an empty body must extract without error and yield only whitespace.
#[tokio::test]
async fn test_latex_empty_document_handling() {
    let empty_latex = b"\\documentclass{article}\n\\begin{document}\n\\end{document}";
    let extractor = LatexExtractor;
    let doc = extractor
        .extract_bytes(empty_latex, "text/x-tex", &ExtractionConfig::default())
        .await
        .expect("Should handle empty LaTeX without panicking");
    let result = derive_extraction_result(doc, false, kreuzberg::OutputFormat::Plain);

    assert!(
        result.content.trim().is_empty(),
        "Empty document should produce empty content (got: '{}')",
        result.content
    );
}