Files
fil/crates/kreuzberg/tests/orgmode_extractor_tests.rs

827 lines
28 KiB
Rust
Raw Normal View History

2026-06-01 23:40:55 +02:00
//! Comprehensive TDD test suite for Org Mode extraction
//!
//! This test suite validates Org Mode extraction capabilities.
//! Each test extracts an Org Mode file and validates:
//!
//! - Metadata extraction (title, author, date from #+TITLE, #+AUTHOR, #+DATE)
//! - Heading hierarchy (* ** ***)
//! - Table parsing with proper structure
//! - List extraction (ordered, unordered, nested)
//! - Inline formatting (*bold*, /italic/, =code=, ~strikethrough~)
//! - Properties drawer extraction (:PROPERTIES: ... :END:)
//! - Link syntax ([[url][description]])
//! - Code blocks (#+BEGIN_SRC ... #+END_SRC)
//! - Unicode and special character handling
//! - Content quality validation
#![cfg(feature = "office")]
use kreuzberg::core::config::ExtractionConfig;
use kreuzberg::core::extractor::extract_bytes;
use std::path::PathBuf;
/// Builds the path of an Org Mode fixture inside the workspace.
///
/// The workspace root is taken to be two directory levels above this crate's
/// `CARGO_MANIFEST_DIR`; `filename` is appended below `test_documents/org/`.
///
/// # Panics
///
/// Panics with "Operation failed" when the manifest directory has fewer than
/// two ancestors.
fn get_test_orgmode_path(filename: &str) -> PathBuf {
    let manifest_dir = std::path::Path::new(env!("CARGO_MANIFEST_DIR"));
    // `ancestors()` yields the path itself, then its parent, then its
    // grandparent; index 2 is therefore the same as `parent().parent()`.
    let workspace_root = manifest_dir.ancestors().nth(2).expect("Operation failed");
    let relative = format!("test_documents/org/{}", filename);
    workspace_root.join(relative)
}
/// Asserts that `content` contains `needle`, ignoring case.
///
/// `description` is included in the failure message together with a preview of
/// at most the first 200 bytes of `content`.
///
/// # Panics
///
/// Panics with a "Content should contain ..." message when `needle` is absent.
fn assert_contains_ci(content: &str, needle: &str, description: &str) {
    if content.to_lowercase().contains(&needle.to_lowercase()) {
        return;
    }

    // Slicing a `str` at an arbitrary byte offset panics when the offset falls
    // inside a multi-byte character, which would hide the real failure message.
    // Back off to the nearest char boundary at or below 200 bytes
    // (offset 0 is always a boundary, so the loop terminates).
    let mut preview_end = content.len().min(200);
    while !content.is_char_boundary(preview_end) {
        preview_end -= 1;
    }

    panic!(
        "Content should contain '{}' ({}). Content: {}",
        needle,
        description,
        &content[..preview_end]
    );
}
/// Asserts that `content` does NOT contain `needle`, ignoring case.
///
/// # Panics
///
/// Panics with a "Content should NOT contain ..." message (which includes
/// `needle` and `description`) when `needle` is present.
fn assert_not_contains_ci(content: &str, needle: &str, description: &str) {
    let haystack = content.to_lowercase();
    let lowered_needle = needle.to_lowercase();
    let found = haystack.contains(&lowered_needle);
    assert!(
        !found,
        "Content should NOT contain '{}' ({})",
        needle,
        description
    );
}
/// Test 1: Basic Org Mode extraction from the `tables.org` fixture
///
/// The test is skipped (with a message) when the fixture is missing.
///
/// Validates:
/// - Extraction of the `text/x-org` input succeeds
/// - Extracted content is non-empty and longer than 50 bytes
/// - No raw `#+TITLE` / `#+BEGIN_` markup remains (case-insensitive)
#[tokio::test]
async fn test_orgmode_basic_extraction() {
    let fixture = get_test_orgmode_path("tables.org");
    if !fixture.exists() {
        println!("Skipping test: Test file not found at {:?}", fixture);
        return;
    }

    let raw = std::fs::read(&fixture).expect("Should read Org Mode file");
    let config = ExtractionConfig::default();
    let extraction = extract_bytes(&raw, "text/x-org", &config)
        .await
        .expect("Should extract Org Mode successfully");
    let text = &extraction.content;

    assert!(!text.is_empty(), "Content should not be empty for Org Mode file");
    assert!(text.len() > 50, "Content should have substantial length");

    let forbidden_markup = [
        ("#+TITLE", "Should not contain raw #+TITLE"),
        ("#+BEGIN_", "Should not contain raw #+BEGIN_"),
    ];
    for (marker, reason) in forbidden_markup.iter() {
        assert_not_contains_ci(text, marker, reason);
    }

    println!("✅ Org Mode basic extraction test passed!");
    println!(" Content length: {} bytes", text.len());
}
/// Test 2: Document with `#+TITLE`, `#+AUTHOR` and `#+DATE` keywords
///
/// Validates:
/// - Extraction succeeds for a document that starts with metadata keywords
/// - The section heading and body text are present in the content
///
/// The number of extracted metadata fields is only printed for inspection;
/// it is not asserted here.
#[tokio::test]
async fn test_orgmode_metadata_extraction() {
    let document = r#"#+TITLE: Test Document
#+AUTHOR: John Doe
#+DATE: 2024-01-15
* First Section
Document content here.
"#;
    let config = ExtractionConfig::default();
    let extraction = extract_bytes(document.as_bytes(), "text/x-org", &config)
        .await
        .expect("Should extract metadata from Org Mode");
    let text = &extraction.content;

    assert!(
        !text.is_empty(),
        "Content should be extracted from Org Mode with metadata"
    );
    assert_contains_ci(text, "First Section", "Should contain section heading");
    assert_contains_ci(text, "content", "Should contain document content");

    println!("✅ Org Mode metadata extraction test passed!");
    println!(" Metadata fields: {}", extraction.metadata.additional.len());
    println!(" Content length: {} bytes", text.len());
}
/// Test 3: Heading hierarchy extraction
///
/// Validates that the text of headings at levels 1 through 4
/// (`*`, `**`, `***`, `****`) appears in the extracted content.
#[tokio::test]
async fn test_orgmode_headings() {
    let document = r#"* Top Level Heading
Text under top level.
** Second Level Heading
Text under second level.
*** Third Level Heading
Text under third level.
**** Fourth Level Heading
Deep nested content.
"#;
    let config = ExtractionConfig::default();
    let extraction = extract_bytes(document.as_bytes(), "text/x-org", &config)
        .await
        .expect("Should extract headings from Org Mode");

    // One (heading text, failure description) pair per heading level.
    let expected_headings = [
        ("Top Level Heading", "Should contain level 1 heading"),
        ("Second Level Heading", "Should contain level 2 heading"),
        ("Third Level Heading", "Should contain level 3 heading"),
        ("Fourth Level Heading", "Should contain level 4 heading"),
    ];
    for (heading, reason) in expected_headings.iter() {
        assert_contains_ci(&extraction.content, heading, reason);
    }

    println!("✅ Org Mode headings test passed!");
    println!(" All heading levels extracted successfully");
}
/// Test 4: Table extraction from the `tables.org` fixture
///
/// The test is skipped (with a message) when the fixture is missing.
///
/// Validates:
/// - A table header word ("Right" or "Left") is present
/// - Table data ("12" or "123") is present
/// - "Right" occurs at least once; its occurrence count is printed as a
///   rough estimate of the number of tables
#[tokio::test]
async fn test_orgmode_tables() {
    let fixture = get_test_orgmode_path("tables.org");
    if !fixture.exists() {
        println!("Skipping test: Test file not found at {:?}", fixture);
        return;
    }

    let raw = std::fs::read(&fixture).expect("Should read Org Mode file");
    let config = ExtractionConfig::default();
    let extraction = extract_bytes(&raw, "text/x-org", &config)
        .await
        .expect("Should extract tables from Org Mode");
    let text = &extraction.content;

    let has_header = ["Right", "Left"].iter().any(|word| text.contains(*word));
    assert!(has_header, "Should contain table headers");

    let has_data = ["12", "123"].iter().any(|cell| text.contains(*cell));
    assert!(has_data, "Should contain table data");

    let table_count = text.matches("Right").count();
    assert!(table_count >= 1, "Should extract at least one table from document");

    println!("✅ Org Mode tables test passed!");
    println!(" Found approximately {} table(s)", table_count);
}
/// Test 5: Tables with multiline headers and cells (`tables.org` fixture)
///
/// The test is skipped (with a message) when the fixture is missing.
///
/// Validates:
/// - At least one of the multiline-table header texts is present
/// - At least one of the multiline-table cell texts is present
#[tokio::test]
async fn test_orgmode_tables_complex() {
    let fixture = get_test_orgmode_path("tables.org");
    if !fixture.exists() {
        println!("Skipping test: Test file not found at {:?}", fixture);
        return;
    }

    let raw = std::fs::read(&fixture).expect("Should read Org Mode file");
    let config = ExtractionConfig::default();
    let extraction = extract_bytes(&raw, "text/x-org", &config)
        .await
        .expect("Should extract complex tables from Org Mode");
    let text = &extraction.content;

    let header_candidates = ["Centered Header", "Left Aligned", "Right Aligned"];
    let has_header = header_candidates.iter().any(|header| text.contains(*header));
    assert!(has_header, "Should contain multiline table headers");

    let cell_candidates = ["span multiple lines", "First", "Second"];
    let has_cell = cell_candidates.iter().any(|cell| text.contains(*cell));
    assert!(has_cell, "Should contain multiline table cell content");

    println!("✅ Org Mode complex tables test passed!");
}
/// Test 6: Ordered and unordered list extraction
///
/// Validates that item text from unordered (`-`), ordered (`1.`, `2.`, ...)
/// and mixed/nested lists appears in the extracted content.
#[tokio::test]
async fn test_orgmode_lists() {
    let document = r#"* Lists Section
** Unordered List
- First item
- Second item
- Third item
** Ordered List
1. One
2. Two
3. Three
** Mixed and Nested
- Item A
- Nested A1
- Nested A2
- Item B
1. Sub-ordered
2. Another sub
"#;
    let config = ExtractionConfig::default();
    let extraction = extract_bytes(document.as_bytes(), "text/x-org", &config)
        .await
        .expect("Should extract lists from Org Mode");

    // (expected item text, failure description), checked in this order.
    let expected_items = [
        ("First item", "Should contain unordered list items"),
        ("Second item", "Should contain unordered list items"),
        ("One", "Should contain ordered list items"),
        ("Two", "Should contain ordered list items"),
        ("Nested", "Should contain nested list items"),
        ("Item A", "Should contain parent list items"),
    ];
    for (item, reason) in expected_items.iter() {
        assert_contains_ci(&extraction.content, item, reason);
    }

    println!("✅ Org Mode lists test passed!");
}
/// Test 7: Inline formatting markers
///
/// Validates that the words wrapped in `*...*`, `/.../`, `=...=` and `~...~`
/// markers ("bold", "emphasis", "italic", "code", "strikethrough") survive
/// extraction. The `_underlined_` sample is part of the input but is not
/// asserted.
///
/// NOTE(review): in Org syntax `~...~` denotes code and `+...+` denotes
/// strike-through; the word "strikethrough" in the sample is only a label.
#[tokio::test]
async fn test_orgmode_inline_formatting() {
    let document = r#"* Formatting Test
This text has *bold emphasis* and /italic text/.
We also have =inline code= and ~strikethrough text~.
Some text with _underlined_ content.
Mixed formatting like *bold /italic/ text* is also supported.
"#;
    let config = ExtractionConfig::default();
    let extraction = extract_bytes(document.as_bytes(), "text/x-org", &config)
        .await
        .expect("Should extract inline formatting from Org Mode");

    let expected_words = [
        ("bold", "Should contain bold text"),
        ("italic", "Should contain italic text"),
        ("code", "Should contain code text"),
        ("emphasis", "Should preserve text content"),
        ("strikethrough", "Should preserve strikethrough text"),
    ];
    for (word, reason) in expected_words.iter() {
        assert_contains_ci(&extraction.content, word, reason);
    }

    println!("✅ Org Mode inline formatting test passed!");
}
/// Test 8: Heading followed by a `:PROPERTIES:` drawer
///
/// Validates that a document containing a properties drawer is extracted
/// and that both the heading text and the paragraph after the drawer are
/// present. The property key/value pairs themselves are not asserted.
#[tokio::test]
async fn test_orgmode_properties() {
    let document = r#"* Task with Properties
:PROPERTIES:
:ID: 12345-abcde-67890
:CUSTOM: custom-value
:STATUS: active
:END:
This is content after properties.
"#;
    let config = ExtractionConfig::default();
    let extraction = extract_bytes(document.as_bytes(), "text/x-org", &config)
        .await
        .expect("Should extract properties from Org Mode");
    let text = &extraction.content;

    assert_contains_ci(text, "Task with Properties", "Should contain heading");
    assert_contains_ci(text, "content", "Should contain main content");

    println!("✅ Org Mode properties test passed!");
}
/// Test 9: Link extraction from the `links.org` fixture
///
/// The test is skipped (with a message) when the fixture is missing.
///
/// Validates (case-insensitively) that the link descriptions "AT&T", "URL"
/// and "email", the word "ampersand" and the "Links" section header are
/// present in the extracted content.
#[tokio::test]
async fn test_orgmode_links() {
    let fixture = get_test_orgmode_path("links.org");
    if !fixture.exists() {
        println!("Skipping test: Test file not found at {:?}", fixture);
        return;
    }

    let raw = std::fs::read(&fixture).expect("Should read Org Mode file");
    let config = ExtractionConfig::default();
    let extraction = extract_bytes(&raw, "text/x-org", &config)
        .await
        .expect("Should extract links from Org Mode");

    let expected_text = [
        ("AT&T", "Should contain AT&T link description"),
        ("URL", "Should contain 'URL' link description"),
        ("email", "Should contain 'email' link description"),
        ("ampersand", "Should contain ampersand reference"),
        ("Links", "Should contain Links section header"),
    ];
    for (needle, reason) in expected_text.iter() {
        assert_contains_ci(&extraction.content, needle, reason);
    }

    println!("✅ Org Mode links test passed!");
}
/// Test 10: Code block extraction from `../misc/readme.org`
///
/// The test is skipped (with a message) when the fixture is missing.
///
/// Validates that the extracted content contains either code from a source
/// block ("curl") or the block's language name ("bash").
#[tokio::test]
async fn test_orgmode_code_blocks() {
    let test_file = get_test_orgmode_path("../misc/readme.org");
    if !test_file.exists() {
        println!("Skipping test: Test file not found at {:?}", test_file);
        return;
    }
    let content = std::fs::read(&test_file).expect("Should read Org Mode file");
    let result = extract_bytes(&content, "text/x-org", &ExtractionConfig::default())
        .await
        .expect("Should extract code blocks from Org Mode");
    // The original condition tested "bash" twice; the duplicate operand was
    // redundant and has been dropped without changing the outcome.
    assert!(
        result.content.contains("curl") || result.content.contains("bash"),
        "Should contain code block content or language specification"
    );
    println!("✅ Org Mode code blocks test passed!");
}
/// Test 11: Code blocks in several languages (`code-blocks.org` fixture)
///
/// The test is skipped (with a message) when the fixture is missing.
///
/// Validates (case-insensitively) that "Python", "Bash" and "JavaScript"
/// are all mentioned in the extracted content.
#[tokio::test]
async fn test_orgmode_code_blocks_multilang() {
    let fixture = get_test_orgmode_path("code-blocks.org");
    if !fixture.exists() {
        println!("Skipping test: Test file not found at {:?}", fixture);
        return;
    }

    let raw = std::fs::read(&fixture).expect("Should read Org Mode file");
    let config = ExtractionConfig::default();
    let extraction = extract_bytes(&raw, "text/x-org", &config)
        .await
        .expect("Should extract multi-language code blocks");

    let expected_languages = [
        ("Python", "Should contain Python code reference"),
        ("Bash", "Should contain Bash code reference"),
        ("JavaScript", "Should contain JavaScript code reference"),
    ];
    for (language, reason) in expected_languages.iter() {
        assert_contains_ci(&extraction.content, language, reason);
    }

    println!("✅ Org Mode multi-language code blocks test passed!");
}
/// Test 12: Unicode character handling
///
/// Feeds a document containing accented Latin text, Cyrillic, symbols
/// (©, ®, °) and emoji through the extractor.
///
/// Validates:
/// - Extraction succeeds on non-ASCII input
/// - The French sample ("Café", or at least its ASCII prefix "Caf") is present
/// - The degree and copyright lines are present (either the symbol or the
///   ASCII label in front of it)
///
/// The fallbacks make these checks deliberately lenient: they pass even if
/// the symbol itself was dropped, as long as the surrounding text survived.
#[tokio::test]
async fn test_orgmode_unicode() {
    let org_content = r#"* Unicode Test
French: Café, naïve, résumé
German: Äpfel, Zürich
Spanish: Niño, Español
Russian: Привет
Mathematical:
Copyright: © ®
Degrees: 25°C
Emoji: 🎉 📚 🌟
"#;
    let result = extract_bytes(org_content.as_bytes(), "text/x-org", &ExtractionConfig::default())
        .await
        .expect("Should extract unicode characters from Org Mode");
    assert!(
        result.content.contains("Café") || result.content.contains("Caf"),
        "Should contain French text"
    );
    assert!(
        result.content.contains("°") || result.content.contains("Degrees"),
        "Should contain degree symbol"
    );
    assert!(
        result.content.contains("©") || result.content.contains("Copyright"),
        "Should contain copyright symbol"
    );
    // A discarded `chars().count()` call used to sit here; its result was
    // never checked, so it has been removed as dead code.
    println!("✅ Org Mode unicode test passed!");
}
/// Test 13: Special characters in running text
///
/// Validates that text containing `&`, `<`, `>`, brackets, braces, `@`, `%`,
/// backslash, `|`, `~` and a backtick is extracted, and that "ampersand",
/// "AT&T" and "bracket" are present in the content (case-insensitive).
#[tokio::test]
async fn test_orgmode_special_characters() {
    let document = r#"* Special Characters
This contains & ampersand, < less than, > greater than.
We have [brackets] and {braces} in text.
AT&T has an ampersand. Check prices @ 50%.
Backslash: \ and other symbols: | ~ `
"#;
    let config = ExtractionConfig::default();
    let extraction = extract_bytes(document.as_bytes(), "text/x-org", &config)
        .await
        .expect("Should extract special characters from Org Mode");

    let expected_text = [
        ("ampersand", "Should contain ampersand text"),
        ("AT&T", "Should preserve ampersands in company names"),
        ("bracket", "Should contain bracket text"),
    ];
    for (needle, reason) in expected_text.iter() {
        assert_contains_ci(&extraction.content, needle, reason);
    }

    println!("✅ Org Mode special characters test passed!");
}
/// Test 14: Content extraction quality (`tables.org` fixture)
///
/// The test is skipped (with a message) when the fixture is missing.
///
/// Validates:
/// - Content is non-empty
/// - Fewer than 5 control characters other than `\n`, `\t` and `\r`
/// - No raw `#+TITLE:` directive
/// - `#+BEGIN_SRC` and `#+END_SRC` are not both present
#[tokio::test]
async fn test_orgmode_content_quality() {
    let fixture = get_test_orgmode_path("tables.org");
    if !fixture.exists() {
        println!("Skipping test: Test file not found at {:?}", fixture);
        return;
    }

    let raw = std::fs::read(&fixture).expect("Should read Org Mode file");
    let config = ExtractionConfig::default();
    let extraction = extract_bytes(&raw, "text/x-org", &config)
        .await
        .expect("Should extract Org Mode content successfully");
    let extracted = &extraction.content;

    assert!(!extracted.is_empty(), "Content should not be empty");
    let char_count = extracted.chars().count();
    assert!(char_count > 0, "Content should have valid UTF-8 characters");

    // Line breaks and tabs are expected whitespace; anything else that is a
    // control character counts against the budget.
    let control_chars = extracted
        .chars()
        .filter(|c| c.is_control() && !matches!(*c, '\n' | '\t' | '\r'))
        .count();
    assert!(
        control_chars < 5,
        "Should not have excessive control characters (found {})",
        control_chars
    );

    let has_raw_title = extracted.contains("#+TITLE:");
    assert!(!has_raw_title, "Should not contain raw #+TITLE directive");

    // Fails only when BOTH markers are present, i.e. a complete raw block.
    let has_complete_raw_block = extracted.contains("#+BEGIN_SRC") && extracted.contains("#+END_SRC");
    assert!(
        !has_complete_raw_block,
        "Should not contain unprocessed code block markers"
    );

    println!("✅ Org Mode content quality test passed!");
    println!(" Extracted {} bytes", extracted.len());
    println!(" Valid UTF-8: ✓");
    println!(" Control chars: ✓ (found {})", control_chars);
}
/// Test 15: MIME type handling
///
/// Validates that the result of extracting a document submitted as
/// `text/x-org` reports `text/x-org` as its MIME type.
#[tokio::test]
async fn test_orgmode_mime_type() {
    let document = r#"* Test Document
Content here.
"#;
    let config = ExtractionConfig::default();
    let extraction = extract_bytes(document.as_bytes(), "text/x-org", &config)
        .await
        .expect("Should extract with correct MIME type");

    assert_eq!(
        extraction.mime_type, "text/x-org",
        "MIME type should be preserved as text/x-org"
    );

    println!("✅ Org Mode MIME type test passed!");
}
/// Test 16: Content compliance validation (`tables.org` fixture)
///
/// The test is skipped (with a message) when the fixture is missing.
///
/// Validates:
/// - No raw `#+TITLE`, `#+AUTHOR` or `#+DATE` directives remain
/// - `#+BEGIN_` and `#+END_` are not both present
/// - More than 100 bytes of content were extracted
/// - The content contains a `#` character or the word "Table"
#[tokio::test]
async fn test_orgmode_content_compliance() {
    let fixture = get_test_orgmode_path("tables.org");
    if !fixture.exists() {
        println!("Skipping test: Test file not found at {:?}", fixture);
        return;
    }

    let raw = std::fs::read(&fixture).expect("Should read Org Mode file");
    let config = ExtractionConfig::default();
    let extraction = extract_bytes(&raw, "text/x-org", &config)
        .await
        .expect("Should extract Org Mode successfully for baseline comparison");
    let extracted = &extraction.content;

    let raw_directives = [
        ("#+TITLE", "Should not contain raw #+TITLE directive"),
        ("#+AUTHOR", "Should not contain raw #+AUTHOR directive"),
        ("#+DATE", "Should not contain raw #+DATE directive"),
    ];
    for (directive, message) in raw_directives.iter() {
        assert!(!extracted.contains(*directive), "{}", message);
    }

    // Fails only when BOTH an opening and a closing block marker remain.
    let has_complete_raw_block = extracted.contains("#+BEGIN_") && extracted.contains("#+END_");
    assert!(!has_complete_raw_block, "Should have processed BEGIN/END blocks");

    assert!(extracted.len() > 100, "Should have substantial content extracted");

    let has_structure = ["#", "Table"].iter().any(|marker| extracted.contains(*marker));
    assert!(has_structure, "Should have heading structure or document content");

    println!("✅ Org Mode content compliance test passed!");
    println!(" Raw markup: ✓ (not found)");
    println!(" UTF-8 encoding: ✓");
    println!(" Content structure: ✓");
}
/// Test 17: Empty document handling
///
/// Validates that extracting zero bytes of `text/x-org` input succeeds
/// (no error, no panic) and that the result still reports `text/x-org`
/// as its MIME type.
#[tokio::test]
async fn test_orgmode_empty_document() {
    let empty_input: &[u8] = "".as_bytes();
    let config = ExtractionConfig::default();
    let extraction = extract_bytes(empty_input, "text/x-org", &config)
        .await
        .expect("Should handle empty Org Mode document");

    assert_eq!(
        extraction.mime_type, "text/x-org",
        "MIME type should be set even for empty documents"
    );

    println!("✅ Org Mode empty document test passed!");
}
/// Test 18: Document with only metadata keywords
///
/// Validates that a document consisting solely of `#+TITLE`, `#+AUTHOR`
/// and `#+DATE` lines is extracted without error and reports `text/x-org`
/// as its MIME type. The metadata values themselves are not asserted.
#[tokio::test]
async fn test_orgmode_metadata_only() {
    let document = r#"#+TITLE: Document Title
#+AUTHOR: Author Name
#+DATE: 2024-01-01
"#;
    let config = ExtractionConfig::default();
    let extraction = extract_bytes(document.as_bytes(), "text/x-org", &config)
        .await
        .expect("Should handle metadata-only document");

    assert_eq!(extraction.mime_type, "text/x-org");

    println!("✅ Org Mode metadata-only document test passed!");
}
/// Test 19: Deeply nested document structure
///
/// Validates that a document with headings nested six levels deep is
/// extracted without error and that the first, second and sixth level
/// headings are present in the content.
#[tokio::test]
async fn test_orgmode_deep_nesting() {
    let document = r#"* Level 1
Text at level 1
** Level 2
Text at level 2
*** Level 3
Text at level 3
**** Level 4
Text at level 4
***** Level 5
Text at level 5
****** Level 6
Text at level 6
"#;
    let config = ExtractionConfig::default();
    let extraction = extract_bytes(document.as_bytes(), "text/x-org", &config)
        .await
        .expect("Should handle deeply nested structure");

    let expected_levels = [
        ("Level 1", "Should contain level 1"),
        ("Level 2", "Should contain level 2"),
        ("Level 6", "Should contain level 6"),
    ];
    for (heading, reason) in expected_levels.iter() {
        assert_contains_ci(&extraction.content, heading, reason);
    }

    println!("✅ Org Mode deep nesting test passed!");
}
/// Test 20: Comprehensive document with mixed features
///
/// The test is skipped (with a message) when the `comprehensive.org`
/// fixture is missing.
///
/// Validates (case-insensitively) that section names, a second-level
/// heading, formatted text, link descriptions and the special-characters
/// section are all present in the extracted content.
#[tokio::test]
async fn test_orgmode_comprehensive_document() {
    let fixture = get_test_orgmode_path("comprehensive.org");
    if !fixture.exists() {
        println!("Skipping test: Test file not found at {:?}", fixture);
        return;
    }

    let raw = std::fs::read(&fixture).expect("Should read Org Mode file");
    let config = ExtractionConfig::default();
    let extraction = extract_bytes(&raw, "text/x-org", &config)
        .await
        .expect("Should extract comprehensive document");
    let text = &extraction.content;

    let expected_text = [
        ("Headers", "Should contain Headers section"),
        ("Paragraphs", "Should contain Paragraphs section"),
        ("Block Quotes", "Should contain Block Quotes section"),
        ("Level 2", "Should contain Level 2 heading"),
        ("emphasis", "Should contain emphasis/formatted text"),
        ("embedded link", "Should contain 'embedded link' link description"),
        ("AT&T", "Should contain AT&T link description"),
        ("special", "Should contain special characters section"),
    ];
    for (needle, reason) in expected_text.iter() {
        assert_contains_ci(text, needle, reason);
    }

    println!("✅ Org Mode comprehensive document test passed!");
    println!(" Content extracted: {} bytes", text.len());
}
/// Test 21: Extraction statistics and summary
///
/// Prints a per-file report (content size, metadata field count and keys,
/// simple structure hints) for a fixed list of fixtures, followed by totals
/// and integer averages.
///
/// This test is purely informational: missing fixtures are skipped and read
/// or extraction errors are printed rather than failing the test.
#[tokio::test]
async fn test_orgmode_extraction_statistics() {
    // Fixed fixture list; a plain array avoids a needless heap allocation.
    let test_files = ["tables.org", "../misc/readme.org"];
    // The configuration does not change per file, so build it once.
    let config = ExtractionConfig::default();

    println!("\n╔════════════════════════════════════════════════════════════╗");
    println!("║ Org Mode Extraction Statistics Report ║");
    println!("╚════════════════════════════════════════════════════════════╝\n");

    let mut total_files = 0;
    let mut total_content_bytes = 0;
    let mut total_metadata_fields = 0;

    for orgmode_file in test_files {
        let test_file = get_test_orgmode_path(orgmode_file);
        if !test_file.exists() {
            println!("⚠ SKIP: {} (not found)", orgmode_file);
            continue;
        }

        let content = match std::fs::read(&test_file) {
            Ok(bytes) => bytes,
            Err(e) => {
                println!("{} - Read error: {:?}", orgmode_file, e);
                println!();
                continue;
            }
        };

        match extract_bytes(&content, "text/x-org", &config).await {
            Ok(result) => {
                total_files += 1;
                total_content_bytes += result.content.len();
                total_metadata_fields += result.metadata.additional.len();

                println!("{}", orgmode_file);
                println!(" Content: {} bytes", result.content.len());
                println!(" Metadata fields: {}", result.metadata.additional.len());
                if !result.metadata.additional.is_empty() {
                    let keys: Vec<String> = result.metadata.additional.keys().map(|k| k.to_string()).collect();
                    println!(" Keys: {}", keys.join(", "));
                }

                // Rough textual hints only; these are not assertions.
                if result.content.contains("#") {
                    println!(" Structure: ✓ (headings detected)");
                }
                if result.content.contains("|") {
                    println!(" Tables: ✓ (detected)");
                }
                if result.content.contains("-") || result.content.contains("1.") {
                    println!(" Lists: ✓ (detected)");
                }
                println!();
            }
            Err(e) => {
                println!("{} - Error: {:?}", orgmode_file, e);
                println!();
            }
        }
    }

    println!("╔════════════════════════════════════════════════════════════╗");
    println!("║ Summary Statistics ║");
    println!("╠════════════════════════════════════════════════════════════╣");
    // Every summary row now ends with the closing border, matching the
    // two "Average" rows below.
    println!("║ Total files processed: {:44} ║", total_files);
    println!("║ Total content bytes: {:44} ║", total_content_bytes);
    println!("║ Total metadata fields: {:44} ║", total_metadata_fields);
    // `checked_div` yields `None` when no file was processed; report 0 then.
    println!(
        "║ Average content size: {:44} ║",
        total_content_bytes.checked_div(total_files).unwrap_or(0)
    );
    println!(
        "║ Average metadata/file: {:44} ║",
        total_metadata_fields.checked_div(total_files).unwrap_or(0)
    );
    println!("╚════════════════════════════════════════════════════════════╝\n");
    println!("✅ Org Mode extraction statistics generated successfully!");
}