Nomad changes
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s

This commit is contained in:
Henrik Jess Nielsen
2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions

View File

@@ -0,0 +1,512 @@
//! Comprehensive LaTeX Extractor Tests
//!
//! This test suite defines the expected behavior for LaTeX extraction.
//!
//! Test Coverage:
//! - Basic content extraction (minimal.tex)
//! - Section hierarchy (basic_sections.tex)
//! - Text formatting (formatting.tex)
//! - Mathematical expressions (math.tex)
//! - Tables (tables.tex)
//! - Lists (lists.tex)
//! - Unicode handling (unicode.tex)
//!
//! Success Criteria:
//! - All tests passing (100%)
//! - No content loss (extract meaningful content)
#![cfg(feature = "office")]
use kreuzberg::core::config::ExtractionConfig;
use kreuzberg::extraction::derive::derive_extraction_result;
use kreuzberg::extractors::latex::LatexExtractor;
use kreuzberg::plugins::DocumentExtractor;
use std::fs;
use std::path::PathBuf;
/// Builds the absolute path of the LaTeX fixture named `filename`.
///
/// The result is `<CARGO_MANIFEST_DIR>/../../test_documents/latex/<filename>`:
/// the fixture directory is looked up two levels above this crate's manifest
/// directory.
///
/// # Panics
///
/// Panics if the manifest directory has fewer than two ancestors.
fn test_file_path(filename: &str) -> PathBuf {
    let manifest_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
    // Walk up two levels; each step names the invariant it relies on so a
    // failure points at the broken directory layout instead of a generic message.
    let base = manifest_dir
        .parent()
        .expect("CARGO_MANIFEST_DIR should have a parent directory")
        .parent()
        .expect("CARGO_MANIFEST_DIR should have a grandparent directory containing test_documents");
    base.join("test_documents").join("latex").join(filename)
}
/// Extracts `minimal.tex` and checks that the plain-text output is non-empty
/// and contains the expected greeting sentence.
#[tokio::test]
async fn test_latex_minimal_extraction() {
    let source = fs::read(test_file_path("minimal.tex")).expect("Failed to read minimal.tex");
    let config = ExtractionConfig::default();
    let latex = LatexExtractor;

    let document = latex
        .extract_bytes(&source, "text/x-tex", &config)
        .await
        .expect("Should extract minimal LaTeX");
    let extracted = derive_extraction_result(document, false, kreuzberg::OutputFormat::Plain);
    let text = &extracted.content;

    // Guard against the "nothing extracted" regression before looking at the text itself.
    assert!(
        !text.is_empty(),
        "FAIL: Extracted 0 bytes (current bug). Should extract content from minimal.tex"
    );
    assert!(
        text.contains("Hello World from LaTeX!"),
        "FAIL: Should extract 'Hello World from LaTeX!' but got: '{}'",
        text
    );
}
/// Extracts `basic_sections.tex` and checks the `title`, `author` and `date`
/// entries of the additional metadata map.
#[tokio::test]
async fn test_latex_metadata_extraction() {
    let source = fs::read(test_file_path("basic_sections.tex")).expect("Failed to read basic_sections.tex");
    let config = ExtractionConfig::default();
    let latex = LatexExtractor;

    let document = latex
        .extract_bytes(&source, "text/x-tex", &config)
        .await
        .expect("Should extract LaTeX with metadata");
    let extracted = derive_extraction_result(document, false, kreuzberg::OutputFormat::Plain);
    let meta = &extracted.metadata.additional;

    // Each entry is compared as an optional string so a missing key and a
    // non-string value both show up as `None` in the failure output.
    let title = meta.get("title").and_then(|value| value.as_str());
    assert_eq!(
        title,
        Some("Test Document"),
        "FAIL: Should extract title 'Test Document' from \\title{{}} command"
    );

    let author = meta.get("author").and_then(|value| value.as_str());
    assert_eq!(
        author,
        Some("John Doe"),
        "FAIL: Should extract author 'John Doe' from \\author{{}} command"
    );

    let date = meta.get("date").and_then(|value| value.as_str());
    assert_eq!(
        date,
        Some("2025-12-07"),
        "FAIL: Should extract date '2025-12-07' from \\date{{}} command"
    );
}
/// Extracts `basic_sections.tex` and checks that section, subsection and
/// subsubsection titles, plus body paragraph text, appear in the output.
#[tokio::test]
async fn test_latex_section_hierarchy() {
    let source = fs::read(test_file_path("basic_sections.tex")).expect("Failed to read basic_sections.tex");
    let config = ExtractionConfig::default();
    let latex = LatexExtractor;

    let document = latex
        .extract_bytes(&source, "text/x-tex", &config)
        .await
        .expect("Should extract LaTeX sections");
    let extracted = derive_extraction_result(document, false, kreuzberg::OutputFormat::Plain);
    let text = &extracted.content;

    // Top-level sections.
    assert!(
        text.contains("Introduction"),
        "FAIL: Should extract \\section{{Introduction}} as text"
    );
    assert!(
        text.contains("Methods"),
        "FAIL: Should extract \\section{{Methods}} as text"
    );
    assert!(
        text.contains("Results"),
        "FAIL: Should extract \\section{{Results}} as text"
    );

    // Nested headings.
    assert!(
        text.contains("Background"),
        "FAIL: Should extract \\subsection{{Background}} as text"
    );
    assert!(
        text.contains("Historical Context"),
        "FAIL: Should extract \\subsubsection{{Historical Context}} as text"
    );

    // Body text.
    assert!(
        text.contains("This is the introduction paragraph"),
        "FAIL: Should extract paragraph text from document body"
    );
}
/// Extracts `formatting.tex` and checks that the text wrapped in formatting
/// commands (bold, italic, underline, emphasis, monospace, nested) is kept.
#[tokio::test]
async fn test_latex_text_formatting() {
    let source = fs::read(test_file_path("formatting.tex")).expect("Failed to read formatting.tex");
    let config = ExtractionConfig::default();
    let latex = LatexExtractor;

    let document = latex
        .extract_bytes(&source, "text/x-tex", &config)
        .await
        .expect("Should extract LaTeX formatting");
    let extracted = derive_extraction_result(document, false, kreuzberg::OutputFormat::Plain);
    let text = &extracted.content;

    // Heading and unformatted text.
    assert!(
        text.contains("Text Formatting"),
        "FAIL: Should extract \\section{{Text Formatting}}"
    );
    assert!(
        text.contains("This is normal text"),
        "FAIL: Should extract plain paragraph text"
    );

    // Single formatting commands.
    assert!(
        text.contains("bold text"),
        "FAIL: Should extract text from \\textbf{{bold text}}"
    );
    assert!(
        text.contains("italic text"),
        "FAIL: Should extract text from \\textit{{italic text}}"
    );
    assert!(
        text.contains("underlined text"),
        "FAIL: Should extract text from \\underline{{underlined text}}"
    );
    assert!(
        text.contains("emphasized text"),
        "FAIL: Should extract text from \\emph{{emphasized text}}"
    );
    assert!(
        text.contains("monospace text"),
        "FAIL: Should extract text from \\texttt{{monospace text}}"
    );

    // Nested formatting commands.
    assert!(
        text.contains("bold and italic"),
        "FAIL: Should extract text from nested formatting commands"
    );
}
/// Extracts `math.tex` and checks that headings, the text surrounding inline
/// math, and the math content itself appear in the plain-text output.
#[tokio::test]
async fn test_latex_math_extraction() {
    let content = fs::read(test_file_path("math.tex")).expect("Failed to read math.tex");
    let extractor = LatexExtractor;
    let doc = extractor
        .extract_bytes(&content, "text/x-tex", &ExtractionConfig::default())
        .await
        .expect("Should extract LaTeX math");
    let result = derive_extraction_result(doc, false, kreuzberg::OutputFormat::Plain);
    let text = &result.content;

    assert!(
        text.contains("Math Formulas"),
        "FAIL: Should extract \\section{{Math Formulas}}"
    );
    assert!(
        text.contains("Inline Math"),
        "FAIL: Should extract \\subsection{{Inline Math}}"
    );
    assert!(
        text.contains("Display Math"),
        "FAIL: Should extract \\subsection{{Display Math}}"
    );
    // "mc" is a substring of "mc²", so this single check accepts both the raw
    // (`mc^2`) and the rendered (`mc²`) form.
    assert!(
        text.contains("mc"),
        "FAIL: Should extract inline math content from $E = mc^2$"
    );
    assert!(
        text.contains("The equation"),
        "FAIL: Should extract text before inline math"
    );
    assert!(
        text.contains("is famous"),
        "FAIL: Should extract text after inline math"
    );
    // Accept either the command name (`int`) or the rendered integral sign.
    // The second alternative used to be an empty pattern, which matches every
    // string and made this assertion impossible to fail.
    assert!(
        text.contains("int") || text.contains("∫"),
        "FAIL: Should extract display math environment content"
    );
}
/// Extracts `tables.tex` and checks that headers, cells and the caption of the
/// tables appear in the plain-text output.
#[tokio::test]
async fn test_latex_table_extraction() {
    let source = fs::read(test_file_path("tables.tex")).expect("Failed to read tables.tex");
    let config = ExtractionConfig::default();
    let latex = LatexExtractor;

    let document = latex
        .extract_bytes(&source, "text/x-tex", &config)
        .await
        .expect("Should extract LaTeX tables");
    let extracted = derive_extraction_result(document, false, kreuzberg::OutputFormat::Plain);
    let text = &extracted.content;

    assert!(text.contains("Tables"), "FAIL: Should extract \\section{{Tables}}");

    // Header row of the first table.
    assert!(
        text.contains("Name"),
        "FAIL: Should extract table header 'Name' from tabular"
    );
    assert!(
        text.contains("Age"),
        "FAIL: Should extract table header 'Age' from tabular"
    );
    assert!(
        text.contains("Score"),
        "FAIL: Should extract table header 'Score' from tabular"
    );

    // Data cells of the first table.
    assert!(text.contains("Alice"), "FAIL: Should extract table cell 'Alice'");
    assert!(text.contains("30"), "FAIL: Should extract table cell '30'");
    assert!(text.contains("95"), "FAIL: Should extract table cell '95'");
    assert!(text.contains("Bob"), "FAIL: Should extract table cell 'Bob'");
    assert!(text.contains("Charlie"), "FAIL: Should extract table cell 'Charlie'");

    // Second table and its caption.
    assert!(
        text.contains("Column 1"),
        "FAIL: Should extract 'Column 1' from second table"
    );
    assert!(
        text.contains("Column 2"),
        "FAIL: Should extract 'Column 2' from second table"
    );
    assert!(
        text.contains("Sample table with caption"),
        "FAIL: Should extract table caption from \\caption{{}}"
    );
}
/// Extracts `lists.tex` and checks that the four top-level itemize entries
/// appear in the plain-text output.
#[tokio::test]
async fn test_latex_list_itemize() {
    let source = fs::read(test_file_path("lists.tex")).expect("Failed to read lists.tex");
    let config = ExtractionConfig::default();
    let latex = LatexExtractor;

    let document = latex
        .extract_bytes(&source, "text/x-tex", &config)
        .await
        .expect("Should extract LaTeX lists");
    let extracted = derive_extraction_result(document, false, kreuzberg::OutputFormat::Plain);
    let text = &extracted.content;

    assert!(
        text.contains("First item"),
        "FAIL: Should extract \\item First item from itemize"
    );
    assert!(
        text.contains("Second item"),
        "FAIL: Should extract \\item Second item from itemize"
    );
    assert!(
        text.contains("Third item with nested list"),
        "FAIL: Should extract \\item Third item with nested list"
    );
    assert!(
        text.contains("Fourth item"),
        "FAIL: Should extract \\item Fourth item from itemize"
    );
}
/// Extracts `lists.tex` and checks that both nested list entries appear in
/// the plain-text output.
#[tokio::test]
async fn test_latex_list_nested() {
    let source = fs::read(test_file_path("lists.tex")).expect("Failed to read lists.tex");
    let config = ExtractionConfig::default();
    let latex = LatexExtractor;

    let document = latex
        .extract_bytes(&source, "text/x-tex", &config)
        .await
        .expect("Should extract LaTeX nested lists");
    let extracted = derive_extraction_result(document, false, kreuzberg::OutputFormat::Plain);
    let text = &extracted.content;

    assert!(
        text.contains("Nested item 1"),
        "FAIL: Should extract nested \\item Nested item 1"
    );
    assert!(
        text.contains("Nested item 2"),
        "FAIL: Should extract nested \\item Nested item 2"
    );
}
/// Extracts `lists.tex` and checks that the three enumerate entries appear in
/// the plain-text output.
#[tokio::test]
async fn test_latex_list_enumerate() {
    let source = fs::read(test_file_path("lists.tex")).expect("Failed to read lists.tex");
    let config = ExtractionConfig::default();
    let latex = LatexExtractor;

    let document = latex
        .extract_bytes(&source, "text/x-tex", &config)
        .await
        .expect("Should extract LaTeX enumerate");
    let extracted = derive_extraction_result(document, false, kreuzberg::OutputFormat::Plain);
    let text = &extracted.content;

    assert!(
        text.contains("First numbered item"),
        "FAIL: Should extract \\item First numbered item from enumerate"
    );
    assert!(
        text.contains("Second numbered item"),
        "FAIL: Should extract \\item Second numbered item from enumerate"
    );
    assert!(
        text.contains("Third numbered item"),
        "FAIL: Should extract \\item Third numbered item from enumerate"
    );
}
/// Extracts `lists.tex` and checks that both description-list terms and their
/// definitions appear in the plain-text output.
#[tokio::test]
async fn test_latex_list_description() {
    let source = fs::read(test_file_path("lists.tex")).expect("Failed to read lists.tex");
    let config = ExtractionConfig::default();
    let latex = LatexExtractor;

    let document = latex
        .extract_bytes(&source, "text/x-tex", &config)
        .await
        .expect("Should extract LaTeX description lists");
    let extracted = derive_extraction_result(document, false, kreuzberg::OutputFormat::Plain);
    let text = &extracted.content;

    // First term and its definition.
    assert!(
        text.contains("Term 1"),
        "FAIL: Should extract \\item[Term 1] from description list"
    );
    assert!(
        text.contains("Definition of term 1"),
        "FAIL: Should extract definition text from description list"
    );

    // Second term and its definition.
    assert!(
        text.contains("Term 2"),
        "FAIL: Should extract \\item[Term 2] from description list"
    );
    assert!(
        text.contains("Definition of term 2"),
        "FAIL: Should extract definition text from description list"
    );
}
/// Smoke test: extracting `lists.tex` must succeed.
///
/// The extracted document is intentionally discarded; no assertion is made
/// about its content here.
#[tokio::test]
async fn test_latex_lists_pandoc_parity() {
    let source = fs::read(test_file_path("lists.tex")).expect("Failed to read lists.tex");
    let config = ExtractionConfig::default();
    let latex = LatexExtractor;

    let _document = latex
        .extract_bytes(&source, "text/x-tex", &config)
        .await
        .expect("Should extract LaTeX lists");
}
/// Extracts `unicode.tex` and checks that either the Hebrew sample word or the
/// literal text "Hebrew" survives, and that the output is not empty.
#[tokio::test]
async fn test_latex_unicode_handling() {
    let source = fs::read(test_file_path("unicode.tex")).expect("Failed to read unicode.tex");
    let config = ExtractionConfig::default();
    let latex = LatexExtractor;

    let document = latex
        .extract_bytes(&source, "text/x-tex", &config)
        .await
        .expect("Should extract LaTeX with Unicode");
    let extracted = derive_extraction_result(document, false, kreuzberg::OutputFormat::Plain);
    let text = &extracted.content;

    let has_hebrew_sample = text.contains("אֳרָנִים");
    let has_hebrew_label = text.contains("Hebrew");
    assert!(
        has_hebrew_sample || has_hebrew_label,
        "FAIL: Should extract Hebrew characters or 'Hebrew' text"
    );
    assert!(
        !text.is_empty(),
        "FAIL: Should extract non-zero content from unicode.tex"
    );
}
/// Regression guard: extracting `minimal.tex` must yield non-empty content of
/// at least 10 bytes.
#[tokio::test]
async fn test_latex_no_content_loss_bug() {
    let source = fs::read(test_file_path("minimal.tex")).expect("Failed to read minimal.tex");
    let config = ExtractionConfig::default();
    let latex = LatexExtractor;

    let document = latex
        .extract_bytes(&source, "text/x-tex", &config)
        .await
        .expect("Should extract minimal LaTeX");
    let extracted = derive_extraction_result(document, false, kreuzberg::OutputFormat::Plain);
    let text = &extracted.content;

    assert!(
        !text.is_empty(),
        "FAIL: CRITICAL BUG - Extracted 0 bytes from minimal.tex. Current LaTeX extractor is completely broken."
    );

    // Length is measured in bytes, matching the wording of the failure message.
    let byte_len = text.len();
    assert!(
        byte_len >= 10,
        "FAIL: Extracted only {} bytes, expected at least 10. Content: '{}'",
        byte_len,
        text
    );
}
/// Extracts `minimal.tex` twice with the same extractor and checks that both
/// runs produce identical content and identical additional metadata.
#[tokio::test]
async fn test_latex_extraction_deterministic() {
    let source = fs::read(test_file_path("minimal.tex")).expect("Failed to read minimal.tex");
    let config = ExtractionConfig::default();
    let latex = LatexExtractor;

    // First pass.
    let first_doc = latex
        .extract_bytes(&source, "text/x-tex", &config)
        .await
        .expect("Should extract LaTeX (first run)");
    let first = derive_extraction_result(first_doc, false, kreuzberg::OutputFormat::Plain);

    // Second pass over the very same bytes.
    let second_doc = latex
        .extract_bytes(&source, "text/x-tex", &config)
        .await
        .expect("Should extract LaTeX (second run)");
    let second = derive_extraction_result(second_doc, false, kreuzberg::OutputFormat::Plain);

    assert_eq!(
        first.content, second.content,
        "FAIL: Extraction is not deterministic. Same input produced different outputs."
    );
    assert_eq!(
        first.metadata.additional, second.metadata.additional,
        "FAIL: Metadata extraction is not deterministic."
    );
}
/// Feeds a document with an empty body to the extractor and checks that
/// extraction succeeds and the output is empty after trimming whitespace.
#[tokio::test]
async fn test_latex_empty_document_handling() {
    // Inline fixture: a preamble plus an empty `document` environment.
    let empty_latex = b"\\documentclass{article}\n\\begin{document}\n\\end{document}";
    let config = ExtractionConfig::default();
    let latex = LatexExtractor;

    let document = latex
        .extract_bytes(empty_latex, "text/x-tex", &config)
        .await
        .expect("Should handle empty LaTeX without panicking");
    let extracted = derive_extraction_result(document, false, kreuzberg::OutputFormat::Plain);
    let text = &extracted.content;

    assert!(
        text.trim().is_empty(),
        "Empty document should produce empty content (got: '{}')",
        text
    );
}