Nomad changes

2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions
--- a/crates/kreuzberg/tests/orgmode_extractor_tests.rs
+++ b/crates/kreuzberg/tests/orgmode_extractor_tests.rs
@@ -0,0 +1,826 @@
+//! Comprehensive TDD test suite for Org Mode extraction
+//!
+//! This test suite validates Org Mode extraction capabilities.
+//! Each test extracts an Org Mode file and validates:
+//!
+//! - Metadata extraction (title, author, date from #+TITLE, #+AUTHOR, #+DATE)
+//! - Heading hierarchy (* ** ***)
+//! - Table parsing with proper structure
+//! - List extraction (ordered, unordered, nested)
+//! - Inline formatting (*bold*, /italic/, _underline_, =verbatim=, ~code~, +strikethrough+)
+//! - Properties drawer extraction (:PROPERTIES: ... :END:)
+//! - Link syntax ([[url][description]])
+//! - Code blocks (#+BEGIN_SRC ... #+END_SRC)
+//! - Unicode and special character handling
+//! - Content quality validation
+
+#![cfg(feature = "office")]
+
+use kreuzberg::core::config::ExtractionConfig;
+use kreuzberg::core::extractor::extract_bytes;
+use std::path::PathBuf;
+
+/// Resolves a fixture file name to its path under `<workspace>/test_documents/org/`.
+///
+/// The workspace root is taken to be the grandparent of this crate's
+/// `CARGO_MANIFEST_DIR`. `filename` may contain relative components such as
+/// `../misc/readme.org`; they are appended unchanged.
+///
+/// # Panics
+///
+/// Panics if the manifest directory does not have a grandparent directory.
+fn get_test_orgmode_path(filename: &str) -> PathBuf {
+    let workspace_root = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
+        .ancestors()
+        .nth(2)
+        .expect("CARGO_MANIFEST_DIR should sit two levels below the workspace root");
+    // Join component-wise so the platform separator is used rather than a
+    // hard-coded '/' baked into a formatted string.
+    workspace_root.join("test_documents").join("org").join(filename)
+}
+
+/// Asserts that `content` contains `needle`, ignoring case.
+///
+/// On failure the panic message carries `needle`, `description` and a preview
+/// of at most the first 200 characters of `content`.
+fn assert_contains_ci(content: &str, needle: &str, description: &str) {
+    assert!(
+        content.to_lowercase().contains(&needle.to_lowercase()),
+        "Content should contain '{}' ({}). Content: {}",
+        needle,
+        description,
+        // Truncate by characters: slicing at byte 200 panics when that offset
+        // falls inside a multi-byte UTF-8 sequence, hiding the real failure.
+        content.chars().take(200).collect::<String>()
+    );
+}
+
+/// Asserts that `content` does NOT contain `needle`, ignoring case.
+///
+/// Both strings are lowercased before the comparison. On failure the panic
+/// message carries `needle` and `description`.
+fn assert_not_contains_ci(content: &str, needle: &str, description: &str) {
+    let haystack = content.to_lowercase();
+    let found = haystack.contains(&needle.to_lowercase());
+    assert!(!found, "Content should NOT contain '{}' ({})", needle, description);
+}
+
+/// Test 1: Basic Org Mode extraction from the `tables.org` fixture
+///
+/// Checks that:
+/// - extraction of the fixture succeeds
+/// - the extracted content is non-empty and longer than 50 bytes
+/// - no raw `#+TITLE` / `#+BEGIN_` markers remain (case-insensitive)
+///
+/// Returns early without failing when the fixture file is missing.
+#[tokio::test]
+async fn test_orgmode_basic_extraction() {
+    let fixture = get_test_orgmode_path("tables.org");
+    if !fixture.exists() {
+        println!("Skipping test: Test file not found at {:?}", fixture);
+        return;
+    }
+
+    let raw = std::fs::read(&fixture).expect("Should read Org Mode file");
+    let config = ExtractionConfig::default();
+    let extraction = extract_bytes(&raw, "text/x-org", &config)
+        .await
+        .expect("Should extract Org Mode successfully");
+    let text = &extraction.content;
+
+    assert!(!text.is_empty(), "Content should not be empty for Org Mode file");
+    assert!(text.len() > 50, "Content should have substantial length");
+
+    // Raw directives that must have been consumed by the extractor.
+    let forbidden = [
+        ("#+TITLE", "Should not contain raw #+TITLE"),
+        ("#+BEGIN_", "Should not contain raw #+BEGIN_"),
+    ];
+    for (marker, why) in forbidden {
+        assert_not_contains_ci(text, marker, why);
+    }
+
+    println!("✅ Org Mode basic extraction test passed!");
+    println!("   Content length: {} bytes", text.len());
+}
+
+/// Test 2: Document with `#+TITLE`, `#+AUTHOR` and `#+DATE` keywords
+///
+/// Checks that:
+/// - a document starting with metadata keywords extracts successfully
+/// - the heading and body text below the keywords appear in the content
+///
+/// The number of additional metadata fields is printed for inspection only;
+/// no metadata value is asserted here.
+#[tokio::test]
+async fn test_orgmode_metadata_extraction() {
+    let source = r#"#+TITLE: Test Document
+#+AUTHOR: John Doe
+#+DATE: 2024-01-15
+
+* First Section
+  Document content here.
+"#;
+
+    let config = ExtractionConfig::default();
+    let extraction = extract_bytes(source.as_bytes(), "text/x-org", &config)
+        .await
+        .expect("Should extract metadata from Org Mode");
+    let text = &extraction.content;
+
+    assert!(
+        !text.is_empty(),
+        "Content should be extracted from Org Mode with metadata"
+    );
+
+    assert_contains_ci(text, "First Section", "Should contain section heading");
+    assert_contains_ci(text, "content", "Should contain document content");
+
+    println!("✅ Org Mode metadata extraction test passed!");
+    println!("   Metadata fields: {}", extraction.metadata.additional.len());
+    println!("   Content length: {} bytes", text.len());
+}
+
+/// Test 3: Heading hierarchy extraction
+///
+/// Feeds a document with headings at levels 1 through 4 (`*` .. `****`) and
+/// checks, case-insensitively, that the text of every heading appears in the
+/// extracted content.
+#[tokio::test]
+async fn test_orgmode_headings() {
+    let source = r#"* Top Level Heading
+Text under top level.
+
+** Second Level Heading
+Text under second level.
+
+*** Third Level Heading
+Text under third level.
+
+**** Fourth Level Heading
+Deep nested content.
+"#;
+
+    let config = ExtractionConfig::default();
+    let extraction = extract_bytes(source.as_bytes(), "text/x-org", &config)
+        .await
+        .expect("Should extract headings from Org Mode");
+
+    // (heading text, failure description), checked in document order.
+    let expected = [
+        ("Top Level Heading", "Should contain level 1 heading"),
+        ("Second Level Heading", "Should contain level 2 heading"),
+        ("Third Level Heading", "Should contain level 3 heading"),
+        ("Fourth Level Heading", "Should contain level 4 heading"),
+    ];
+    for (heading, why) in expected {
+        assert_contains_ci(&extraction.content, heading, why);
+    }
+
+    println!("✅ Org Mode headings test passed!");
+    println!("   All heading levels extracted successfully");
+}
+
+/// Test 4: Table extraction from the `tables.org` fixture
+///
+/// Checks that:
+/// - the content contains a header cell text ("Right" or "Left")
+/// - the content contains a data cell text ("12" or "123")
+/// - "Right" occurs at least once; its occurrence count is printed as an
+///   approximate table count
+///
+/// Returns early without failing when the fixture file is missing.
+#[tokio::test]
+async fn test_orgmode_tables() {
+    let fixture = get_test_orgmode_path("tables.org");
+    if !fixture.exists() {
+        println!("Skipping test: Test file not found at {:?}", fixture);
+        return;
+    }
+
+    let raw = std::fs::read(&fixture).expect("Should read Org Mode file");
+    let config = ExtractionConfig::default();
+    let extraction = extract_bytes(&raw, "text/x-org", &config)
+        .await
+        .expect("Should extract tables from Org Mode");
+    let text = &extraction.content;
+
+    let has_header = ["Right", "Left"].iter().any(|cell| text.contains(*cell));
+    assert!(has_header, "Should contain table headers");
+
+    let has_data = ["12", "123"].iter().any(|cell| text.contains(*cell));
+    assert!(has_data, "Should contain table data");
+
+    // Counts occurrences of the "Right" header text, not parsed tables.
+    let table_count = text.matches("Right").count();
+    assert!(table_count >= 1, "Should extract at least one table from document");
+
+    println!("✅ Org Mode tables test passed!");
+    println!("   Found approximately {} table(s)", table_count);
+}
+
+/// Test 5: Tables with multiline cells in the `tables.org` fixture
+///
+/// Checks that:
+/// - at least one of the header texts "Centered Header", "Left Aligned" or
+///   "Right Aligned" is present
+/// - at least one of the cell texts "span multiple lines", "First" or
+///   "Second" is present
+///
+/// Returns early without failing when the fixture file is missing.
+#[tokio::test]
+async fn test_orgmode_tables_complex() {
+    let fixture = get_test_orgmode_path("tables.org");
+    if !fixture.exists() {
+        println!("Skipping test: Test file not found at {:?}", fixture);
+        return;
+    }
+
+    let raw = std::fs::read(&fixture).expect("Should read Org Mode file");
+    let config = ExtractionConfig::default();
+    let extraction = extract_bytes(&raw, "text/x-org", &config)
+        .await
+        .expect("Should extract complex tables from Org Mode");
+    let text = &extraction.content;
+
+    let header_candidates = ["Centered Header", "Left Aligned", "Right Aligned"];
+    assert!(
+        header_candidates.iter().any(|header| text.contains(*header)),
+        "Should contain multiline table headers"
+    );
+
+    let cell_candidates = ["span multiple lines", "First", "Second"];
+    assert!(
+        cell_candidates.iter().any(|cell| text.contains(*cell)),
+        "Should contain multiline table cell content"
+    );
+
+    println!("✅ Org Mode complex tables test passed!");
+}
+
+/// Test 6: Ordered, unordered and nested list extraction
+///
+/// Feeds a document with a `-` list, a numbered list and a mixed nested list,
+/// then checks (case-insensitively) that representative item texts from each
+/// list appear in the extracted content.
+#[tokio::test]
+async fn test_orgmode_lists() {
+    let source = r#"* Lists Section
+
+** Unordered List
+- First item
+- Second item
+- Third item
+
+** Ordered List
+1. One
+2. Two
+3. Three
+
+** Mixed and Nested
+- Item A
+  - Nested A1
+  - Nested A2
+- Item B
+  1. Sub-ordered
+  2. Another sub
+"#;
+
+    let config = ExtractionConfig::default();
+    let extraction = extract_bytes(source.as_bytes(), "text/x-org", &config)
+        .await
+        .expect("Should extract lists from Org Mode");
+
+    // (item text, failure description), in the original assertion order.
+    let expected = [
+        ("First item", "Should contain unordered list items"),
+        ("Second item", "Should contain unordered list items"),
+        ("One", "Should contain ordered list items"),
+        ("Two", "Should contain ordered list items"),
+        ("Nested", "Should contain nested list items"),
+        ("Item A", "Should contain parent list items"),
+    ];
+    for (item, why) in expected {
+        assert_contains_ci(&extraction.content, item, why);
+    }
+
+    println!("✅ Org Mode lists test passed!");
+}
+
+/// Test 7: Inline markup does not swallow the marked-up words
+///
+/// The input wraps words in `*...*`, `/.../`, `=...=`, `~...~` and `_..._`
+/// markers, plus one nested `*bold /italic/ text*` span. The test only checks
+/// that the words inside the markers ("bold", "italic", "code", "emphasis",
+/// "strikethrough") survive extraction; it does not check how the markup is
+/// rendered.
+///
+/// Note: in Org syntax `~...~` denotes code and `+...+` denotes
+/// strike-through, so the "strikethrough text" sample is actually wrapped in
+/// code markers.
+#[tokio::test]
+async fn test_orgmode_inline_formatting() {
+    let source = r#"* Formatting Test
+
+This text has *bold emphasis* and /italic text/.
+
+We also have =inline code= and ~strikethrough text~.
+
+Some text with _underlined_ content.
+
+Mixed formatting like *bold /italic/ text* is also supported.
+"#;
+
+    let config = ExtractionConfig::default();
+    let extraction = extract_bytes(source.as_bytes(), "text/x-org", &config)
+        .await
+        .expect("Should extract inline formatting from Org Mode");
+
+    let expected = [
+        ("bold", "Should contain bold text"),
+        ("italic", "Should contain italic text"),
+        ("code", "Should contain code text"),
+        ("emphasis", "Should preserve text content"),
+        ("strikethrough", "Should preserve strikethrough text"),
+    ];
+    for (word, why) in expected {
+        assert_contains_ci(&extraction.content, word, why);
+    }
+
+    println!("✅ Org Mode inline formatting test passed!");
+}
+
+/// Test 8: Heading followed by a `:PROPERTIES:` drawer
+///
+/// Checks that a document containing a properties drawer extracts
+/// successfully and that the heading text and the paragraph after the drawer
+/// both appear in the content. The property keys and values themselves are
+/// not asserted.
+#[tokio::test]
+async fn test_orgmode_properties() {
+    let source = r#"* Task with Properties
+:PROPERTIES:
+:ID:       12345-abcde-67890
+:CUSTOM:   custom-value
+:STATUS:   active
+:END:
+
+This is content after properties.
+"#;
+
+    let config = ExtractionConfig::default();
+    let extraction = extract_bytes(source.as_bytes(), "text/x-org", &config)
+        .await
+        .expect("Should extract properties from Org Mode");
+    let text = &extraction.content;
+
+    assert_contains_ci(text, "Task with Properties", "Should contain heading");
+    assert_contains_ci(text, "content", "Should contain main content");
+
+    println!("✅ Org Mode properties test passed!");
+}
+
+/// Test 9: Link extraction from the `links.org` fixture
+///
+/// Checks (case-insensitively) that the link descriptions and surrounding
+/// words "AT&T", "URL", "email", "ampersand" and "Links" appear in the
+/// extracted content.
+///
+/// Returns early without failing when the fixture file is missing.
+#[tokio::test]
+async fn test_orgmode_links() {
+    let fixture = get_test_orgmode_path("links.org");
+    if !fixture.exists() {
+        println!("Skipping test: Test file not found at {:?}", fixture);
+        return;
+    }
+
+    let raw = std::fs::read(&fixture).expect("Should read Org Mode file");
+    let config = ExtractionConfig::default();
+    let extraction = extract_bytes(&raw, "text/x-org", &config)
+        .await
+        .expect("Should extract links from Org Mode");
+
+    let expected = [
+        ("AT&T", "Should contain AT&T link description"),
+        ("URL", "Should contain 'URL' link description"),
+        ("email", "Should contain 'email' link description"),
+        ("ampersand", "Should contain ampersand reference"),
+        ("Links", "Should contain Links section header"),
+    ];
+    for (needle, why) in expected {
+        assert_contains_ci(&extraction.content, needle, why);
+    }
+
+    println!("✅ Org Mode links test passed!");
+}
+
+/// Test 10: Code block extraction
+///
+/// Reads `../misc/readme.org` relative to the Org fixture directory and
+/// checks that the extracted content mentions either "curl" (code block
+/// content) or "bash" (language specification).
+///
+/// Returns early without failing when the fixture file is missing.
+#[tokio::test]
+async fn test_orgmode_code_blocks() {
+    let test_file = get_test_orgmode_path("../misc/readme.org");
+    if !test_file.exists() {
+        println!("Skipping test: Test file not found at {:?}", test_file);
+        return;
+    }
+
+    let content = std::fs::read(&test_file).expect("Should read Org Mode file");
+    let result = extract_bytes(&content, "text/x-org", &ExtractionConfig::default())
+        .await
+        .expect("Should extract code blocks from Org Mode");
+
+    // The original condition tested "bash" twice; one check is sufficient.
+    assert!(
+        result.content.contains("curl") || result.content.contains("bash"),
+        "Should contain code block content or language specification"
+    );
+
+    println!("✅ Org Mode code blocks test passed!");
+}
+
+/// Test 11: Code blocks in several languages (`code-blocks.org` fixture)
+///
+/// Checks (case-insensitively) that the words "Python", "Bash" and
+/// "JavaScript" appear in the extracted content.
+///
+/// Returns early without failing when the fixture file is missing.
+#[tokio::test]
+async fn test_orgmode_code_blocks_multilang() {
+    let fixture = get_test_orgmode_path("code-blocks.org");
+    if !fixture.exists() {
+        println!("Skipping test: Test file not found at {:?}", fixture);
+        return;
+    }
+
+    let raw = std::fs::read(&fixture).expect("Should read Org Mode file");
+    let config = ExtractionConfig::default();
+    let extraction = extract_bytes(&raw, "text/x-org", &config)
+        .await
+        .expect("Should extract multi-language code blocks");
+
+    let expected = [
+        ("Python", "Should contain Python code reference"),
+        ("Bash", "Should contain Bash code reference"),
+        ("JavaScript", "Should contain JavaScript code reference"),
+    ];
+    for (language, why) in expected {
+        assert_contains_ci(&extraction.content, language, why);
+    }
+
+    println!("✅ Org Mode multi-language code blocks test passed!");
+}
+
+/// Test 12: Unicode character handling
+///
+/// Feeds a document containing accented Latin text, Cyrillic, mathematical
+/// symbols, copyright-style symbols, a degree sign and emoji, and checks that
+/// extraction succeeds.
+///
+/// The assertions are deliberately tolerant: each accepts either the
+/// non-ASCII text ("Café", "°", "©") or an ASCII fallback from the same line
+/// ("Caf", "Degrees", "Copyright").
+#[tokio::test]
+async fn test_orgmode_unicode() {
+    let org_content = r#"* Unicode Test
+
+French: Café, naïve, résumé
+German: Äpfel, Zürich
+Spanish: Niño, Español
+Russian: Привет
+
+Mathematical: ∈ ∉ ⊂ ∪ ∩
+Copyright: © ® ™
+Degrees: 25°C
+
+Emoji: 🎉 ✨ 📚 🌟
+"#;
+
+    let result = extract_bytes(org_content.as_bytes(), "text/x-org", &ExtractionConfig::default())
+        .await
+        .expect("Should extract unicode characters from Org Mode");
+
+    assert!(
+        result.content.contains("Café") || result.content.contains("Caf"),
+        "Should contain French text"
+    );
+    assert!(
+        result.content.contains("°") || result.content.contains("Degrees"),
+        "Should contain degree symbol"
+    );
+    assert!(
+        result.content.contains("©") || result.content.contains("Copyright"),
+        "Should contain copyright symbol"
+    );
+
+    println!("✅ Org Mode unicode test passed!");
+}
+
+/// Test 13: Special characters in running text
+///
+/// Feeds text containing `&`, `<`, `>`, brackets, braces, `@`, `%`, a
+/// backslash, `|`, `~` and a backtick, then checks (case-insensitively) that
+/// "ampersand", "AT&T" and "bracket" appear in the extracted content.
+#[tokio::test]
+async fn test_orgmode_special_characters() {
+    let source = r#"* Special Characters
+
+This contains & ampersand, < less than, > greater than.
+
+We have [brackets] and {braces} in text.
+
+AT&T has an ampersand. Check prices @ 50%.
+
+Backslash: \ and other symbols: | ~ `
+"#;
+
+    let config = ExtractionConfig::default();
+    let extraction = extract_bytes(source.as_bytes(), "text/x-org", &config)
+        .await
+        .expect("Should extract special characters from Org Mode");
+
+    let expected = [
+        ("ampersand", "Should contain ampersand text"),
+        ("AT&T", "Should preserve ampersands in company names"),
+        ("bracket", "Should contain bracket text"),
+    ];
+    for (needle, why) in expected {
+        assert_contains_ci(&extraction.content, needle, why);
+    }
+
+    println!("✅ Org Mode special characters test passed!");
+}
+
+/// Test 14: Content extraction quality (`tables.org` fixture)
+///
+/// Checks that:
+/// - the content is non-empty and has at least one character
+/// - fewer than 5 control characters other than `\n`, `\t`, `\r` are present
+/// - the literal `#+TITLE:` does not appear
+/// - `#+BEGIN_SRC` and `#+END_SRC` do not BOTH appear (one alone is tolerated)
+///
+/// Returns early without failing when the fixture file is missing.
+#[tokio::test]
+async fn test_orgmode_content_quality() {
+    let fixture = get_test_orgmode_path("tables.org");
+    if !fixture.exists() {
+        println!("Skipping test: Test file not found at {:?}", fixture);
+        return;
+    }
+
+    let raw = std::fs::read(&fixture).expect("Should read Org Mode file");
+    let config = ExtractionConfig::default();
+    let outcome = extract_bytes(&raw, "text/x-org", &config)
+        .await
+        .expect("Should extract Org Mode content successfully");
+
+    let extracted = &outcome.content;
+
+    assert!(!extracted.is_empty(), "Content should not be empty");
+
+    let char_count = extracted.chars().count();
+    assert!(char_count > 0, "Content should have valid UTF-8 characters");
+
+    // Ordinary whitespace controls are expected in text output and are ignored.
+    let control_chars = extracted
+        .chars()
+        .filter(|c| c.is_control() && !matches!(*c, '\n' | '\t' | '\r'))
+        .count();
+    assert!(
+        control_chars < 5,
+        "Should not have excessive control characters (found {})",
+        control_chars
+    );
+
+    let has_raw_title = extracted.contains("#+TITLE:");
+    assert!(!has_raw_title, "Should not contain raw #+TITLE directive");
+
+    let has_begin = extracted.contains("#+BEGIN_SRC");
+    let has_end = extracted.contains("#+END_SRC");
+    assert!(
+        !(has_begin && has_end),
+        "Should not contain unprocessed code block markers"
+    );
+
+    println!("✅ Org Mode content quality test passed!");
+    println!("   Extracted {} bytes", extracted.len());
+    println!("   Valid UTF-8: ✓");
+    println!("   Control chars: ✓ (found {})", control_chars);
+}
+
+/// Test 15: MIME type is reported back unchanged
+///
+/// Extracts a minimal document with the MIME type hint `text/x-org` and
+/// checks that the result's `mime_type` equals `text/x-org`.
+#[tokio::test]
+async fn test_orgmode_mime_type() {
+    let source = r#"* Test Document
+Content here.
+"#;
+
+    let config = ExtractionConfig::default();
+    let extraction = extract_bytes(source.as_bytes(), "text/x-org", &config)
+        .await
+        .expect("Should extract with correct MIME type");
+
+    assert_eq!(
+        extraction.mime_type, "text/x-org",
+        "MIME type should be preserved as text/x-org"
+    );
+
+    println!("✅ Org Mode MIME type test passed!");
+}
+
+/// Test 16: Content compliance validation (`tables.org` fixture)
+///
+/// Checks that:
+/// - none of the raw directives `#+TITLE`, `#+AUTHOR`, `#+DATE` remain
+/// - `#+BEGIN_` and `#+END_` do not BOTH appear (one alone is tolerated)
+/// - the content is longer than 100 bytes
+/// - the content contains either a `#` character or the word "Table"
+///
+/// Returns early without failing when the fixture file is missing.
+#[tokio::test]
+async fn test_orgmode_content_compliance() {
+    let fixture = get_test_orgmode_path("tables.org");
+    if !fixture.exists() {
+        println!("Skipping test: Test file not found at {:?}", fixture);
+        return;
+    }
+
+    let raw = std::fs::read(&fixture).expect("Should read Org Mode file");
+    let config = ExtractionConfig::default();
+    let outcome = extract_bytes(&raw, "text/x-org", &config)
+        .await
+        .expect("Should extract Org Mode successfully for baseline comparison");
+
+    let extracted = &outcome.content;
+
+    for directive in ["#+TITLE", "#+AUTHOR", "#+DATE"] {
+        assert!(
+            !extracted.contains(directive),
+            "Should not contain raw {} directive",
+            directive
+        );
+    }
+
+    let has_begin = extracted.contains("#+BEGIN_");
+    let has_end = extracted.contains("#+END_");
+    assert!(!(has_begin && has_end), "Should have processed BEGIN/END blocks");
+
+    assert!(extracted.len() > 100, "Should have substantial content extracted");
+
+    let has_structure = extracted.contains("#") || extracted.contains("Table");
+    assert!(has_structure, "Should have heading structure or document content");
+
+    println!("✅ Org Mode content compliance test passed!");
+    println!("   Raw markup: ✓ (not found)");
+    println!("   UTF-8 encoding: ✓");
+    println!("   Content structure: ✓");
+}
+
+/// Test 17: Empty document handling
+///
+/// Extracts a zero-length input and checks that extraction returns `Ok` and
+/// that the result's `mime_type` is still `text/x-org`. The content itself is
+/// not inspected.
+#[tokio::test]
+async fn test_orgmode_empty_document() {
+    let source = "";
+
+    let config = ExtractionConfig::default();
+    let extraction = extract_bytes(source.as_bytes(), "text/x-org", &config)
+        .await
+        .expect("Should handle empty Org Mode document");
+
+    assert_eq!(
+        extraction.mime_type, "text/x-org",
+        "MIME type should be set even for empty documents"
+    );
+
+    println!("✅ Org Mode empty document test passed!");
+}
+
+/// Test 18: Document consisting only of metadata keywords
+///
+/// Extracts a document that has `#+TITLE`, `#+AUTHOR` and `#+DATE` lines and
+/// no body, and checks that extraction returns `Ok` with `mime_type` equal to
+/// `text/x-org`. Neither the metadata values nor the content are asserted.
+#[tokio::test]
+async fn test_orgmode_metadata_only() {
+    let source = r#"#+TITLE: Document Title
+#+AUTHOR: Author Name
+#+DATE: 2024-01-01
+"#;
+
+    let config = ExtractionConfig::default();
+    let extraction = extract_bytes(source.as_bytes(), "text/x-org", &config)
+        .await
+        .expect("Should handle metadata-only document");
+
+    assert_eq!(extraction.mime_type, "text/x-org");
+
+    println!("✅ Org Mode metadata-only document test passed!");
+}
+
+/// Test 19: Deeply nested heading structure
+///
+/// Feeds six nested heading levels (`*` through `******`), each followed by a
+/// line of text, and checks (case-insensitively) that "Level 1", "Level 2"
+/// and "Level 6" appear in the extracted content.
+#[tokio::test]
+async fn test_orgmode_deep_nesting() {
+    let source = r#"* Level 1
+Text at level 1
+** Level 2
+Text at level 2
+*** Level 3
+Text at level 3
+**** Level 4
+Text at level 4
+***** Level 5
+Text at level 5
+****** Level 6
+Text at level 6
+"#;
+
+    let config = ExtractionConfig::default();
+    let extraction = extract_bytes(source.as_bytes(), "text/x-org", &config)
+        .await
+        .expect("Should handle deeply nested structure");
+
+    let expected = [
+        ("Level 1", "Should contain level 1"),
+        ("Level 2", "Should contain level 2"),
+        ("Level 6", "Should contain level 6"),
+    ];
+    for (label, why) in expected {
+        assert_contains_ci(&extraction.content, label, why);
+    }
+
+    println!("✅ Org Mode deep nesting test passed!");
+}
+
+/// Test 20: Comprehensive document with mixed features (`comprehensive.org`)
+///
+/// Checks (case-insensitively) that section names, a heading, formatted text
+/// and link descriptions from the fixture all appear in the extracted
+/// content; see the `expected` table below for the exact strings.
+///
+/// Returns early without failing when the fixture file is missing.
+#[tokio::test]
+async fn test_orgmode_comprehensive_document() {
+    let fixture = get_test_orgmode_path("comprehensive.org");
+    if !fixture.exists() {
+        println!("Skipping test: Test file not found at {:?}", fixture);
+        return;
+    }
+
+    let raw = std::fs::read(&fixture).expect("Should read Org Mode file");
+    let config = ExtractionConfig::default();
+    let extraction = extract_bytes(&raw, "text/x-org", &config)
+        .await
+        .expect("Should extract comprehensive document");
+    let text = &extraction.content;
+
+    let expected = [
+        ("Headers", "Should contain Headers section"),
+        ("Paragraphs", "Should contain Paragraphs section"),
+        ("Block Quotes", "Should contain Block Quotes section"),
+        ("Level 2", "Should contain Level 2 heading"),
+        ("emphasis", "Should contain emphasis/formatted text"),
+        ("embedded link", "Should contain 'embedded link' link description"),
+        ("AT&T", "Should contain AT&T link description"),
+        ("special", "Should contain special characters section"),
+    ];
+    for (needle, why) in expected {
+        assert_contains_ci(text, needle, why);
+    }
+
+    println!("✅ Org Mode comprehensive document test passed!");
+    println!("   Content extracted: {} bytes", text.len());
+}
+
+/// Test 21: Extraction statistics and summary
+///
+/// Extracts each fixture in a fixed list and prints a per-file report
+/// (content size, additional metadata field count and keys, rough structure
+/// hints) followed by totals and integer averages.
+///
+/// This test is diagnostic only: missing files, read errors and extraction
+/// errors are printed and skipped, and nothing is asserted.
+#[tokio::test]
+async fn test_orgmode_extraction_statistics() {
+    // A fixed-size array suffices; no heap-allocated Vec is needed.
+    let test_files = ["tables.org", "../misc/readme.org"];
+
+    println!("\n╔════════════════════════════════════════════════════════════╗");
+    println!("║        Org Mode Extraction Statistics Report              ║");
+    println!("╚════════════════════════════════════════════════════════════╝\n");
+
+    let mut total_files = 0;
+    let mut total_content_bytes = 0;
+    let mut total_metadata_fields = 0;
+
+    for orgmode_file in test_files {
+        let test_file = get_test_orgmode_path(orgmode_file);
+        if !test_file.exists() {
+            println!("⚠ SKIP: {} (not found)", orgmode_file);
+            continue;
+        }
+
+        let content = match std::fs::read(&test_file) {
+            Ok(bytes) => bytes,
+            Err(e) => {
+                println!("✗ {} - Read error: {:?}", orgmode_file, e);
+                println!();
+                continue;
+            }
+        };
+
+        let result = match extract_bytes(&content, "text/x-org", &ExtractionConfig::default()).await {
+            Ok(result) => result,
+            Err(e) => {
+                println!("✗ {} - Error: {:?}", orgmode_file, e);
+                println!();
+                continue;
+            }
+        };
+
+        total_files += 1;
+        total_content_bytes += result.content.len();
+        total_metadata_fields += result.metadata.additional.len();
+
+        println!("✓ {}", orgmode_file);
+        println!("  Content: {} bytes", result.content.len());
+        println!("  Metadata fields: {}", result.metadata.additional.len());
+
+        if !result.metadata.additional.is_empty() {
+            let keys: Vec<String> = result.metadata.additional.keys().map(|k| k.to_string()).collect();
+            println!("  Keys: {}", keys.join(", "));
+        }
+
+        // Rough hints based on single characters in the output, not real parsing.
+        if result.content.contains("#") {
+            println!("  Structure: ✓ (headings detected)");
+        }
+        if result.content.contains("|") {
+            println!("  Tables: ✓ (detected)");
+        }
+        if result.content.contains("-") || result.content.contains("1.") {
+            println!("  Lists: ✓ (detected)");
+        }
+
+        println!();
+    }
+
+    println!("╔════════════════════════════════════════════════════════════╗");
+    println!("║                    Summary Statistics                      ║");
+    println!("╠════════════════════════════════════════════════════════════╣");
+    println!("║ Total files processed: {:44} ║", total_files);
+    println!("║ Total content bytes:   {:44} ║", total_content_bytes);
+    println!("║ Total metadata fields: {:44} ║", total_metadata_fields);
+    // `checked_div` yields `None` when no file was processed; report 0 then.
+    println!(
+        "║ Average content size:  {:44} ║",
+        total_content_bytes.checked_div(total_files).unwrap_or(0)
+    );
+    println!(
+        "║ Average metadata/file: {:44} ║",
+        total_metadata_fields.checked_div(total_files).unwrap_or(0)
+    );
+    println!("╚════════════════════════════════════════════════════════════╝\n");
+
+    println!("✅ Org Mode extraction statistics generated successfully!");
+}