Nomad changes

2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions
--- a/crates/kreuzberg/tests/latex_extractor_tests.rs
+++ b/crates/kreuzberg/tests/latex_extractor_tests.rs
@@ -0,0 +1,512 @@
+//! Comprehensive LaTeX Extractor Tests
+//!
+//! This test suite defines the expected behavior for LaTeX extraction.
+//!
+//! Test Coverage:
+//! - Basic content extraction (minimal.tex)
+//! - Section hierarchy (basic_sections.tex)
+//! - Text formatting (formatting.tex)
+//! - Mathematical expressions (math.tex)
+//! - Tables (tables.tex)
+//! - Lists (lists.tex)
+//! - Unicode handling (unicode.tex)
+//!
+//! Success Criteria:
+//! - All tests passing (100%)
+//! - No content loss (extract meaningful content)
+
+#![cfg(feature = "office")]
+
+use kreuzberg::core::config::ExtractionConfig;
+use kreuzberg::extraction::derive::derive_extraction_result;
+use kreuzberg::extractors::latex::LatexExtractor;
+use kreuzberg::plugins::DocumentExtractor;
+use std::fs;
+use std::path::PathBuf;
+
+/// Resolves `filename` inside the repository's `test_documents/latex` fixture directory.
+///
+/// The path is built from this crate's compile-time `CARGO_MANIFEST_DIR` by walking
+/// two directory levels up and then descending into `test_documents/latex`.
+///
+/// # Panics
+///
+/// Panics if the manifest directory has fewer than two parent directories.
+fn test_file_path(filename: &str) -> PathBuf {
+    let crate_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
+    // Two levels up from the crate manifest directory is where `test_documents` lives.
+    let fixtures_root = crate_dir
+        .parent()
+        .and_then(|crates_dir| crates_dir.parent())
+        .expect("CARGO_MANIFEST_DIR should have at least two parent directories");
+    fixtures_root.join("test_documents").join("latex").join(filename)
+}
+
+/// Extracts `minimal.tex` and checks that the plain-text output is non-empty and
+/// contains the sentence "Hello World from LaTeX!".
+#[tokio::test]
+async fn test_latex_minimal_extraction() {
+    let source = fs::read(test_file_path("minimal.tex")).expect("Failed to read minimal.tex");
+    let config = ExtractionConfig::default();
+
+    let latex = LatexExtractor;
+    let document = latex
+        .extract_bytes(&source, "text/x-tex", &config)
+        .await
+        .expect("Should extract minimal LaTeX");
+    let result = derive_extraction_result(document, false, kreuzberg::OutputFormat::Plain);
+    let text = &result.content;
+
+    // Guard against the extractor returning nothing at all.
+    assert!(
+        !text.is_empty(),
+        "FAIL: Extracted 0 bytes (current bug). Should extract content from minimal.tex"
+    );
+
+    // The body sentence of the fixture must survive extraction verbatim.
+    assert!(
+        text.contains("Hello World from LaTeX!"),
+        "FAIL: Should extract 'Hello World from LaTeX!' but got: '{}'",
+        text
+    );
+}
+
+/// Extracts `basic_sections.tex` and checks that the `title`, `author` and `date`
+/// entries of `metadata.additional` carry the expected string values.
+#[tokio::test]
+async fn test_latex_metadata_extraction() {
+    let source = fs::read(test_file_path("basic_sections.tex")).expect("Failed to read basic_sections.tex");
+    let config = ExtractionConfig::default();
+
+    let latex = LatexExtractor;
+    let document = latex
+        .extract_bytes(&source, "text/x-tex", &config)
+        .await
+        .expect("Should extract LaTeX with metadata");
+    let result = derive_extraction_result(document, false, kreuzberg::OutputFormat::Plain);
+    let extras = &result.metadata.additional;
+
+    let title = extras.get("title").and_then(|value| value.as_str());
+    assert_eq!(
+        title,
+        Some("Test Document"),
+        "FAIL: Should extract title 'Test Document' from \\title{{}} command"
+    );
+
+    let author = extras.get("author").and_then(|value| value.as_str());
+    assert_eq!(
+        author,
+        Some("John Doe"),
+        "FAIL: Should extract author 'John Doe' from \\author{{}} command"
+    );
+
+    let date = extras.get("date").and_then(|value| value.as_str());
+    assert_eq!(
+        date,
+        Some("2025-12-07"),
+        "FAIL: Should extract date '2025-12-07' from \\date{{}} command"
+    );
+}
+
+/// Extracts `basic_sections.tex` and checks that section, subsection and
+/// subsubsection titles, plus the introduction paragraph, appear in the plain text.
+#[tokio::test]
+async fn test_latex_section_hierarchy() {
+    let source = fs::read(test_file_path("basic_sections.tex")).expect("Failed to read basic_sections.tex");
+    let config = ExtractionConfig::default();
+
+    let latex = LatexExtractor;
+    let document = latex
+        .extract_bytes(&source, "text/x-tex", &config)
+        .await
+        .expect("Should extract LaTeX sections");
+    let result = derive_extraction_result(document, false, kreuzberg::OutputFormat::Plain);
+    let text = &result.content;
+
+    // Top-level section headings.
+    assert!(
+        text.contains("Introduction"),
+        "FAIL: Should extract \\section{{Introduction}} as text"
+    );
+
+    assert!(
+        text.contains("Methods"),
+        "FAIL: Should extract \\section{{Methods}} as text"
+    );
+
+    assert!(
+        text.contains("Results"),
+        "FAIL: Should extract \\section{{Results}} as text"
+    );
+
+    // Nested headings.
+    assert!(
+        text.contains("Background"),
+        "FAIL: Should extract \\subsection{{Background}} as text"
+    );
+
+    assert!(
+        text.contains("Historical Context"),
+        "FAIL: Should extract \\subsubsection{{Historical Context}} as text"
+    );
+
+    // Body text.
+    assert!(
+        text.contains("This is the introduction paragraph"),
+        "FAIL: Should extract paragraph text from document body"
+    );
+}
+
+/// Extracts `formatting.tex` and checks that the text wrapped in inline formatting
+/// commands (bold, italic, underline, emphasis, monospace, nested) is preserved.
+#[tokio::test]
+async fn test_latex_text_formatting() {
+    let source = fs::read(test_file_path("formatting.tex")).expect("Failed to read formatting.tex");
+    let config = ExtractionConfig::default();
+
+    let latex = LatexExtractor;
+    let document = latex
+        .extract_bytes(&source, "text/x-tex", &config)
+        .await
+        .expect("Should extract LaTeX formatting");
+    let result = derive_extraction_result(document, false, kreuzberg::OutputFormat::Plain);
+    let text = &result.content;
+
+    // Heading and unformatted paragraph text.
+    assert!(
+        text.contains("Text Formatting"),
+        "FAIL: Should extract \\section{{Text Formatting}}"
+    );
+
+    assert!(
+        text.contains("This is normal text"),
+        "FAIL: Should extract plain paragraph text"
+    );
+
+    // Arguments of the individual formatting commands.
+    assert!(
+        text.contains("bold text"),
+        "FAIL: Should extract text from \\textbf{{bold text}}"
+    );
+
+    assert!(
+        text.contains("italic text"),
+        "FAIL: Should extract text from \\textit{{italic text}}"
+    );
+
+    assert!(
+        text.contains("underlined text"),
+        "FAIL: Should extract text from \\underline{{underlined text}}"
+    );
+
+    assert!(
+        text.contains("emphasized text"),
+        "FAIL: Should extract text from \\emph{{emphasized text}}"
+    );
+
+    assert!(
+        text.contains("monospace text"),
+        "FAIL: Should extract text from \\texttt{{monospace text}}"
+    );
+
+    // Nested commands.
+    assert!(
+        text.contains("bold and italic"),
+        "FAIL: Should extract text from nested formatting commands"
+    );
+}
+
+/// Extracts `math.tex` and checks that headings, the prose surrounding inline math,
+/// and some trace of the inline and display math content appear in the plain text.
+#[tokio::test]
+async fn test_latex_math_extraction() {
+    let source = fs::read(test_file_path("math.tex")).expect("Failed to read math.tex");
+    let config = ExtractionConfig::default();
+
+    let latex = LatexExtractor;
+    let document = latex
+        .extract_bytes(&source, "text/x-tex", &config)
+        .await
+        .expect("Should extract LaTeX math");
+    let result = derive_extraction_result(document, false, kreuzberg::OutputFormat::Plain);
+    let text = &result.content;
+
+    // Headings.
+    assert!(
+        text.contains("Math Formulas"),
+        "FAIL: Should extract \\section{{Math Formulas}}"
+    );
+
+    assert!(
+        text.contains("Inline Math"),
+        "FAIL: Should extract \\subsection{{Inline Math}}"
+    );
+
+    assert!(
+        text.contains("Display Math"),
+        "FAIL: Should extract \\subsection{{Display Math}}"
+    );
+
+    // NOTE(review): any text containing "mc²" also contains "mc", so the second
+    // alternative never changes the outcome; kept to mirror the stated intent.
+    let has_inline_math = text.contains("mc") || text.contains("mc²");
+    assert!(
+        has_inline_math,
+        "FAIL: Should extract inline math content from $E = mc^2$"
+    );
+
+    // Prose on either side of the inline formula.
+    assert!(
+        text.contains("The equation"),
+        "FAIL: Should extract text before inline math"
+    );
+
+    assert!(
+        text.contains("is famous"),
+        "FAIL: Should extract text after inline math"
+    );
+
+    // Accept either the raw command name fragment or the rendered integral sign.
+    let has_display_math = text.contains("int") || text.contains("∫");
+    assert!(
+        has_display_math,
+        "FAIL: Should extract display math environment content"
+    );
+}
+
+/// Extracts `tables.tex` and checks that the heading, header cells, body cells and
+/// the table caption all appear in the plain-text output.
+#[tokio::test]
+async fn test_latex_table_extraction() {
+    let source = fs::read(test_file_path("tables.tex")).expect("Failed to read tables.tex");
+    let config = ExtractionConfig::default();
+
+    let latex = LatexExtractor;
+    let document = latex
+        .extract_bytes(&source, "text/x-tex", &config)
+        .await
+        .expect("Should extract LaTeX tables");
+    let result = derive_extraction_result(document, false, kreuzberg::OutputFormat::Plain);
+    let text = &result.content;
+
+    assert!(
+        text.contains("Tables"),
+        "FAIL: Should extract \\section{{Tables}}"
+    );
+
+    // Header row of the first table.
+    assert!(
+        text.contains("Name"),
+        "FAIL: Should extract table header 'Name' from tabular"
+    );
+
+    assert!(
+        text.contains("Age"),
+        "FAIL: Should extract table header 'Age' from tabular"
+    );
+
+    assert!(
+        text.contains("Score"),
+        "FAIL: Should extract table header 'Score' from tabular"
+    );
+
+    // Body cells of the first table.
+    assert!(
+        text.contains("Alice"),
+        "FAIL: Should extract table cell 'Alice'"
+    );
+
+    assert!(text.contains("30"), "FAIL: Should extract table cell '30'");
+
+    assert!(text.contains("95"), "FAIL: Should extract table cell '95'");
+
+    assert!(text.contains("Bob"), "FAIL: Should extract table cell 'Bob'");
+
+    assert!(
+        text.contains("Charlie"),
+        "FAIL: Should extract table cell 'Charlie'"
+    );
+
+    // Second table and its caption.
+    assert!(
+        text.contains("Column 1"),
+        "FAIL: Should extract 'Column 1' from second table"
+    );
+
+    assert!(
+        text.contains("Column 2"),
+        "FAIL: Should extract 'Column 2' from second table"
+    );
+
+    assert!(
+        text.contains("Sample table with caption"),
+        "FAIL: Should extract table caption from \\caption{{}}"
+    );
+}
+
+/// Extracts `lists.tex` and checks that the four itemize entries appear in the
+/// plain-text output.
+#[tokio::test]
+async fn test_latex_list_itemize() {
+    let source = fs::read(test_file_path("lists.tex")).expect("Failed to read lists.tex");
+    let config = ExtractionConfig::default();
+
+    let latex = LatexExtractor;
+    let document = latex
+        .extract_bytes(&source, "text/x-tex", &config)
+        .await
+        .expect("Should extract LaTeX lists");
+    let result = derive_extraction_result(document, false, kreuzberg::OutputFormat::Plain);
+    let text = &result.content;
+
+    assert!(
+        text.contains("First item"),
+        "FAIL: Should extract \\item First item from itemize"
+    );
+
+    assert!(
+        text.contains("Second item"),
+        "FAIL: Should extract \\item Second item from itemize"
+    );
+
+    assert!(
+        text.contains("Third item with nested list"),
+        "FAIL: Should extract \\item Third item with nested list"
+    );
+
+    assert!(
+        text.contains("Fourth item"),
+        "FAIL: Should extract \\item Fourth item from itemize"
+    );
+}
+
+/// Extracts `lists.tex` and checks that both entries of the nested list appear in
+/// the plain-text output.
+#[tokio::test]
+async fn test_latex_list_nested() {
+    let source = fs::read(test_file_path("lists.tex")).expect("Failed to read lists.tex");
+    let config = ExtractionConfig::default();
+
+    let latex = LatexExtractor;
+    let document = latex
+        .extract_bytes(&source, "text/x-tex", &config)
+        .await
+        .expect("Should extract LaTeX nested lists");
+    let result = derive_extraction_result(document, false, kreuzberg::OutputFormat::Plain);
+    let text = &result.content;
+
+    assert!(
+        text.contains("Nested item 1"),
+        "FAIL: Should extract nested \\item Nested item 1"
+    );
+
+    assert!(
+        text.contains("Nested item 2"),
+        "FAIL: Should extract nested \\item Nested item 2"
+    );
+}
+
+/// Extracts `lists.tex` and checks that the three enumerate entries appear in the
+/// plain-text output.
+#[tokio::test]
+async fn test_latex_list_enumerate() {
+    let source = fs::read(test_file_path("lists.tex")).expect("Failed to read lists.tex");
+    let config = ExtractionConfig::default();
+
+    let latex = LatexExtractor;
+    let document = latex
+        .extract_bytes(&source, "text/x-tex", &config)
+        .await
+        .expect("Should extract LaTeX enumerate");
+    let result = derive_extraction_result(document, false, kreuzberg::OutputFormat::Plain);
+    let text = &result.content;
+
+    assert!(
+        text.contains("First numbered item"),
+        "FAIL: Should extract \\item First numbered item from enumerate"
+    );
+
+    assert!(
+        text.contains("Second numbered item"),
+        "FAIL: Should extract \\item Second numbered item from enumerate"
+    );
+
+    assert!(
+        text.contains("Third numbered item"),
+        "FAIL: Should extract \\item Third numbered item from enumerate"
+    );
+}
+
+/// Extracts `lists.tex` and checks that both description-list terms and their
+/// definitions appear in the plain-text output.
+#[tokio::test]
+async fn test_latex_list_description() {
+    let source = fs::read(test_file_path("lists.tex")).expect("Failed to read lists.tex");
+    let config = ExtractionConfig::default();
+
+    let latex = LatexExtractor;
+    let document = latex
+        .extract_bytes(&source, "text/x-tex", &config)
+        .await
+        .expect("Should extract LaTeX description lists");
+    let result = derive_extraction_result(document, false, kreuzberg::OutputFormat::Plain);
+    let text = &result.content;
+
+    // First term and its definition.
+    assert!(
+        text.contains("Term 1"),
+        "FAIL: Should extract \\item[Term 1] from description list"
+    );
+
+    assert!(
+        text.contains("Definition of term 1"),
+        "FAIL: Should extract definition text from description list"
+    );
+
+    // Second term and its definition.
+    assert!(
+        text.contains("Term 2"),
+        "FAIL: Should extract \\item[Term 2] from description list"
+    );
+
+    assert!(
+        text.contains("Definition of term 2"),
+        "FAIL: Should extract definition text from description list"
+    );
+}
+
+/// Runs the extractor over `lists.tex` and only requires that extraction succeeds.
+///
+/// NOTE(review): despite the name, nothing here compares the output against a
+/// Pandoc baseline — confirm whether parity assertions are still planned.
+#[tokio::test]
+async fn test_latex_lists_pandoc_parity() {
+    let source = fs::read(test_file_path("lists.tex")).expect("Failed to read lists.tex");
+    let config = ExtractionConfig::default();
+
+    let latex = LatexExtractor;
+    // The extracted document is intentionally discarded; only success is checked.
+    let _document = latex
+        .extract_bytes(&source, "text/x-tex", &config)
+        .await
+        .expect("Should extract LaTeX lists");
+}
+
+/// Extracts `unicode.tex` and checks that the output contains either the Hebrew
+/// sample text or the word "Hebrew", and that it is not empty.
+#[tokio::test]
+async fn test_latex_unicode_handling() {
+    let source = fs::read(test_file_path("unicode.tex")).expect("Failed to read unicode.tex");
+    let config = ExtractionConfig::default();
+
+    let latex = LatexExtractor;
+    let document = latex
+        .extract_bytes(&source, "text/x-tex", &config)
+        .await
+        .expect("Should extract LaTeX with Unicode");
+    let result = derive_extraction_result(document, false, kreuzberg::OutputFormat::Plain);
+    let text = &result.content;
+
+    // Either the raw Hebrew characters or the Latin-script word is accepted.
+    let has_hebrew = text.contains("אֳרָנִים") || text.contains("Hebrew");
+    assert!(
+        has_hebrew,
+        "FAIL: Should extract Hebrew characters or 'Hebrew' text"
+    );
+
+    assert!(
+        !text.is_empty(),
+        "FAIL: Should extract non-zero content from unicode.tex"
+    );
+}
+
+/// Regression check on `minimal.tex`: the extracted content must be non-empty and
+/// at least 10 bytes long.
+#[tokio::test]
+async fn test_latex_no_content_loss_bug() {
+    let source = fs::read(test_file_path("minimal.tex")).expect("Failed to read minimal.tex");
+    let config = ExtractionConfig::default();
+
+    let latex = LatexExtractor;
+    let document = latex
+        .extract_bytes(&source, "text/x-tex", &config)
+        .await
+        .expect("Should extract minimal LaTeX");
+    let result = derive_extraction_result(document, false, kreuzberg::OutputFormat::Plain);
+    let text = &result.content;
+
+    assert!(
+        !text.is_empty(),
+        "FAIL: CRITICAL BUG - Extracted 0 bytes from minimal.tex. Current LaTeX extractor is completely broken."
+    );
+
+    // `len()` counts bytes, matching the wording of the failure message.
+    let extracted_bytes = text.len();
+    assert!(
+        extracted_bytes >= 10,
+        "FAIL: Extracted only {} bytes, expected at least 10. Content: '{}'",
+        extracted_bytes,
+        text
+    );
+}
+
+/// Extracts `minimal.tex` twice with the same extractor and checks that both runs
+/// yield identical content and identical `metadata.additional`.
+#[tokio::test]
+async fn test_latex_extraction_deterministic() {
+    let source = fs::read(test_file_path("minimal.tex")).expect("Failed to read minimal.tex");
+
+    let latex = LatexExtractor;
+
+    // First pass.
+    let first_config = ExtractionConfig::default();
+    let first_doc = latex
+        .extract_bytes(&source, "text/x-tex", &first_config)
+        .await
+        .expect("Should extract LaTeX (first run)");
+    let first = derive_extraction_result(first_doc, false, kreuzberg::OutputFormat::Plain);
+
+    // Second pass over the very same bytes.
+    let second_config = ExtractionConfig::default();
+    let second_doc = latex
+        .extract_bytes(&source, "text/x-tex", &second_config)
+        .await
+        .expect("Should extract LaTeX (second run)");
+    let second = derive_extraction_result(second_doc, false, kreuzberg::OutputFormat::Plain);
+
+    assert_eq!(
+        first.content, second.content,
+        "FAIL: Extraction is not deterministic. Same input produced different outputs."
+    );
+
+    assert_eq!(
+        first.metadata.additional, second.metadata.additional,
+        "FAIL: Metadata extraction is not deterministic."
+    );
+}
+
+/// Feeds an inline LaTeX document with an empty body to the extractor and checks
+/// that extraction succeeds and the content is empty after trimming whitespace.
+#[tokio::test]
+async fn test_latex_empty_document_handling() {
+    let empty_latex = b"\\documentclass{article}\n\\begin{document}\n\\end{document}";
+    let config = ExtractionConfig::default();
+
+    let latex = LatexExtractor;
+    let document = latex
+        .extract_bytes(empty_latex, "text/x-tex", &config)
+        .await
+        .expect("Should handle empty LaTeX without panicking");
+    let result = derive_extraction_result(document, false, kreuzberg::OutputFormat::Plain);
+    let text = &result.content;
+
+    // Whitespace-only output still counts as empty.
+    let only_whitespace = text.trim().is_empty();
+    assert!(
+        only_whitespace,
+        "Empty document should produce empty content (got: '{}')",
+        text
+    );
+}