//! Comprehensive TDD test suite for RTF extraction.
//!
//! This test suite validates RTF extraction capabilities.
//! Tests cover:
//! - Accent and Unicode handling
//! - Bookmarks and internal links
//! - Footnotes and references
//! - Text formatting (bold, italic, underline, strikeout, superscript, subscript, small caps)
//! - Headings and structure
//! - Image extraction
//! - External hyperlinks
//! - List extraction (simple and complex nested lists)
//! - Table extraction (simple and complex with special formatting)
//! - Unicode characters and special symbols
//!
//! Test Organization:
//! - Basic Content Extraction (unicode, accent)
//! - Structure Preservation (heading, list_simple, list_complex)
//! - Table Extraction (table_simple, table_error_codes)
//! - Formatting Detection (formatting)
//! - Special Features (footnote, bookmark, link)
//! - Integration Tests (deterministic extraction, no content loss)
//!
//! Success Criteria:
//! - All tests passing (100%)
//! - No content loss (should extract meaningful text from all files)
//! - Deterministic extraction (same input = same output)
//!
//! Note: These tests require the `office` feature to be enabled.
#![cfg(feature = "office")]
#![allow(clippy::doc_suspicious_footnotes)]

use kreuzberg::core::config::ExtractionConfig;
use kreuzberg::core::extractor::extract_file;
use std::path::PathBuf;

mod helpers;

/// Builds the path to an RTF fixture: `<workspace root>/test_documents/rtf/<filename>`.
///
/// `filename` is joined as given; this function does not check that the file exists.
fn get_rtf_path(filename: &str) -> PathBuf {
    // Share the root lookup with `workspace_root` instead of repeating the parent walk.
    workspace_root()
        .join("test_documents")
        .join("rtf")
        .join(filename)
}

/// Helper for reaching the workspace root from the kreuzberg crate.
///
/// Returns the directory two levels above `CARGO_MANIFEST_DIR`.
///
/// # Panics
///
/// Panics if the manifest directory has fewer than two parent directories.
fn workspace_root() -> PathBuf {
    PathBuf::from(env!("CARGO_MANIFEST_DIR"))
        .parent()
        .expect("kreuzberg crate should have a parent")
        .parent()
        .expect("workspace root exists")
        .to_path_buf()
}

/// Test extraction of RTF file with accent characters (accented vowels).
///
/// File: accent.rtf
/// Content: "le café où on ne fume pas"
/// Expected: Correctly extracts French text with accented characters (é, ù)
/// Pandoc baseline: le café où on ne fume pas
#[tokio::test]
async fn test_rtf_accent_extraction() {
    let config = ExtractionConfig::default();
    let path = get_rtf_path("accent.rtf");

    // `expect` keeps the underlying error in the panic output, which a bare
    // `assert!(result.is_ok())` would discard.
    let extraction = extract_file(&path, Some("application/rtf"), &config)
        .await
        .expect("RTF extraction should succeed for accent.rtf");

    assert_eq!(extraction.mime_type, "application/rtf");
    assert!(!extraction.content.is_empty(), "Content should not be empty");

    // Accented forms are matched on the raw text, ASCII fallbacks on the lowercased text.
    let content = extraction.content.to_lowercase();

    assert!(
        extraction.content.contains("café") || content.contains("cafe"),
        "Should extract French word 'café' or 'cafe'"
    );
    assert!(
        extraction.content.contains("où") || content.contains("ou"),
        "Should extract French word 'où' or 'ou'"
    );
    assert!(
        content.contains("fume") || content.contains("smoking"),
        "Should extract content about smoking"
    );
}

/// Test extraction of RTF file with bookmarks (internal anchors/references).
/// /// File: bookmark.rtf /// Content: Bookmark anchor labeled "Bookmark_1" and link text "click me" /// Expected: Extracts bookmark definition and link text /// Pandoc baseline: [Bookmark_1]{#bookmark_1} and [click me](#bookmark_1) #[tokio::test] async fn test_rtf_bookmark_extraction() { let config = ExtractionConfig::default(); let path = get_rtf_path("bookmark.rtf"); let result = extract_file(&path, Some("application/rtf"), &config).await; assert!(result.is_ok(), "RTF extraction should succeed for bookmark.rtf"); let extraction = result.expect("Operation failed"); let content = extraction.content.to_lowercase(); assert!( content.contains("bookmark") || content.contains("click") || content.contains("me"), "Should extract bookmark or link text (found: {})", extraction.content ); } /// Test extraction of RTF file with footnotes. /// /// File: footnote.rtf /// Content: Academic text with footnote references and their content /// Expected: Extracts both main text and footnote content /// Pandoc baseline: Uses [^1] and [^2] syntax for footnotes #[tokio::test] async fn test_rtf_footnote_extraction() { let config = ExtractionConfig::default(); let path = get_rtf_path("footnote.rtf"); let result = extract_file(&path, Some("application/rtf"), &config).await; assert!(result.is_ok(), "RTF extraction should succeed for footnote.rtf"); let extraction = result.expect("Operation failed"); assert!(!extraction.content.is_empty(), "Content should not be empty"); let content = extraction.content.to_lowercase(); assert!( content.contains("mead") || content.contains("landmark"), "Should extract main text about Mead's study" ); assert!( content.contains("note") || content.contains("annotated") || content.contains("bibliography") || content.contains("sahlins"), "Should extract footnote content or references" ); assert!( content.contains("footnote") || extraction.content.contains("[^") || content.contains("annotated"), "Should contain footnote indicators" ); } /// Test extraction of 
RTF file with various text formatting. /// /// File: formatting.rtf /// Content: Text with bold, italic, underline, strikeout, superscript, subscript, small caps /// Expected: Preserves or indicates all formatting types /// Pandoc baseline: Detailed formatting in markdown syntax #[tokio::test] async fn test_rtf_formatting_extraction() { let config = ExtractionConfig::default(); let path = get_rtf_path("formatting.rtf"); let result = extract_file(&path, Some("application/rtf"), &config).await; assert!(result.is_ok(), "RTF extraction should succeed for formatting.rtf"); let extraction = result.expect("Operation failed"); assert!(!extraction.content.is_empty(), "Content should not be empty"); let content = extraction.content.to_lowercase(); assert!( content.contains("formatting") || content.contains("test") || content.contains("bold"), "Should extract formatting-related content" ); assert!( extraction.content.contains("**bold**") || content.contains("bold"), "Should preserve or indicate bold text" ); assert!( extraction.content.contains("*italic") || content.contains("italic"), "Should preserve or indicate italic text" ); let has_formatting = extraction.content.contains("**") || extraction.content.contains("*") || extraction.content.contains("__") || extraction.content.contains("_") || extraction.content.contains("~~") || extraction.content.contains("^") || extraction.content.contains("~") || content.contains("bold"); assert!(has_formatting, "Should preserve or indicate text formatting"); } /// Test extraction of RTF file with heading hierarchy. 
///
/// File: heading.rtf
/// Content: Three levels of headings (H1, H2, H3) followed by paragraph
/// Expected: Extracts all headings and paragraph text
/// Pandoc baseline: Markdown heading syntax (# ## ###)
#[tokio::test]
async fn test_rtf_heading_extraction() {
    let fixture = get_rtf_path("heading.rtf");
    let settings = ExtractionConfig::default();

    let outcome = extract_file(&fixture, Some("application/rtf"), &settings).await;
    assert!(outcome.is_ok(), "RTF extraction should succeed for heading.rtf");
    let doc = outcome.expect("Operation failed");

    assert!(!doc.content.is_empty(), "Content should not be empty");

    // Case-sensitive checks, in document order, each with its own failure message.
    for (needle, message) in [
        ("Heading 1", "Should extract Heading 1 text"),
        ("Heading 2", "Should extract Heading 2 text"),
        ("Heading 3", "Should extract Heading 3 text"),
        ("Paragraph", "Should extract paragraph text"),
    ] {
        assert!(doc.content.contains(needle), "{}", message);
    }

    // Either a `#` appears in the raw text, or the first two headings are both
    // present (case-insensitively).
    let lowered = doc.content.to_lowercase();
    let has_marker = doc.content.contains("#");
    let has_first_two = lowered.contains("heading 1") && lowered.contains("heading 2");
    assert!(has_marker || has_first_two, "Should preserve heading hierarchy");
}

/// Test extraction of RTF file with embedded or referenced image.
///
/// File: image.rtf
/// Content: Image reference with dimensions (2.0in x 2.0in)
/// Expected: Extracts image reference and/or dimensions
/// Pandoc baseline: Markdown image syntax with dimensions
#[tokio::test]
async fn test_rtf_image_extraction() {
    let fixture = get_rtf_path("image.rtf");
    let settings = ExtractionConfig::default();

    let outcome = extract_file(&fixture, Some("application/rtf"), &settings).await;
    assert!(outcome.is_ok(), "RTF extraction should succeed for image.rtf");
    let doc = outcome.expect("Operation failed");

    assert!(!doc.content.is_empty(), "Content should not be empty");

    let raw = &doc.content;
    let lowered = raw.to_lowercase();

    // `!` and `.jpg` are matched on the raw text, the remaining hints on the lowercased text.
    let raw_hit = ["!", ".jpg"].iter().any(|&marker| raw.contains(marker));
    let lowered_hit = ["image", "2.0", "width", "height"]
        .iter()
        .any(|&hint| lowered.contains(hint));

    assert!(
        raw_hit || lowered_hit,
        "Should contain image reference or dimension information (found: {})",
        doc.content
    );
}

/// Test extraction of RTF file with external hyperlink.
///
/// File: link.rtf
/// Content: Link to pandoc.org website
/// Expected: Extracts link text and/or URL
/// Pandoc baseline: Markdown link syntax [pandoc](http://pandoc.org)
#[tokio::test]
async fn test_rtf_link_extraction() {
    let fixture = get_rtf_path("link.rtf");
    let settings = ExtractionConfig::default();

    let outcome = extract_file(&fixture, Some("application/rtf"), &settings).await;
    assert!(outcome.is_ok(), "RTF extraction should succeed for link.rtf");
    let doc = outcome.expect("Operation failed");

    assert!(!doc.content.is_empty(), "Content should not be empty");

    // Either the link text or a URL scheme fragment is enough.
    let lowered = doc.content.to_lowercase();
    let found = ["pandoc", "http"]
        .iter()
        .any(|&needle| lowered.contains(needle));

    assert!(
        found,
        "Should extract link-related content (found: {})",
        doc.content
    );
}

/// Test extraction of RTF file with complex nested list structure.
///
/// File: list_complex.rtf
/// Content: Multi-level nested list with various numbering (numeric, alphabetic, roman)
/// Expected: Extracts all list items preserving or indicating hierarchy
/// Pandoc baseline: Markdown nested list with mixed numbering schemes
#[tokio::test]
async fn test_rtf_list_complex_extraction() {
    let config = ExtractionConfig::default();
    let path = get_rtf_path("list_complex.rtf");

    // `expect` keeps the underlying error in the panic output if extraction fails.
    let extraction = extract_file(&path, Some("application/rtf"), &config)
        .await
        .expect("RTF extraction should succeed for list_complex.rtf");

    assert!(!extraction.content.is_empty(), "Content should not be empty");

    // Word checks run against the lowercased text, so one lookup per word
    // covers every capitalisation.
    let content = extraction.content.to_lowercase();

    assert!(content.contains("one"), "Should extract list item 'One'");
    assert!(content.contains("two"), "Should extract list item 'Two'");
    assert!(content.contains("three"), "Should extract nested list item 'Three'");

    assert!(
        content.contains("five") || content.contains("six"),
        "Should extract deeply nested list items"
    );

    assert!(
        extraction.content.contains("1")
            || extraction.content.contains("-")
            || extraction.content.contains("•")
            || content.contains("one"),
        "Should preserve list structure indicators"
    );

    assert!(
        content.contains("out of list") || content.contains("out"),
        "Should extract separator text 'Out of list'"
    );

    assert!(
        content.contains("seven") || content.contains("eight") || content.contains("7") || content.contains("8"),
        "Should extract restarted list numbering (7, 8)"
    );
}

/// Test extraction of RTF file with simple bulleted list.
/// /// File: list_simple.rtf /// Content: Simple bullet list with one nested item and list break /// Expected: Extracts all list items and indicates nesting /// Pandoc baseline: Simple markdown bullet list with nesting #[tokio::test] async fn test_rtf_list_simple_extraction() { let config = ExtractionConfig::default(); let path = get_rtf_path("list_simple.rtf"); let result = extract_file(&path, Some("application/rtf"), &config).await; assert!(result.is_ok(), "RTF extraction should succeed for list_simple.rtf"); let extraction = result.expect("Operation failed"); assert!(!extraction.content.is_empty(), "Content should not be empty"); let content = extraction.content.to_lowercase(); assert!(content.contains("one"), "Should extract list item 'one'"); assert!(content.contains("two"), "Should extract list item 'two'"); assert!(content.contains("sub"), "Should extract nested list item 'sub'"); assert!(content.contains("new"), "Should extract 'new list' text"); // Verify list items are extracted (list structure preserved as separate text) assert!( content.contains("one") && content.contains("two") && content.contains("sub"), "Should extract all list items" ); } /// Test extraction of RTF file with table containing error codes. /// /// File: table_error_codes.rtf /// Content: Table with Code and Error columns, 23 rows of Pandoc error codes /// Expected: Extracts table structure and all data cells /// Pandoc baseline: Markdown table format with 2 columns and 23 rows /// /// Note: RTF table extraction via Pandoc markdown output may result in empty content /// due to limitations in Pandoc's markdown table rendering. Tables are present /// in Pandoc's internal JSON representation but may not render in text format. 
#[tokio::test]
async fn test_rtf_table_error_codes_extraction() {
    let config = ExtractionConfig::default();
    let path = get_rtf_path("table_error_codes.rtf");

    // `expect` keeps the underlying error in the panic output if extraction fails.
    let extraction = extract_file(&path, Some("application/rtf"), &config)
        .await
        .expect("RTF extraction should succeed for table_error_codes.rtf");

    // Only the MIME type is checked; table content is not asserted in this test.
    // `assert_eq!` prints both values on mismatch.
    assert_eq!(
        extraction.mime_type, "application/rtf",
        "MIME type should be preserved"
    );
}

/// Test extraction of RTF file with simple 4-column, 2-row table.
///
/// File: table_simple.rtf
/// Content: Table with headers A, B, C, D and data row E, F, G, H
/// Expected: Extracts all cells in correct table structure
/// Pandoc baseline: Markdown table format
///
/// Note: RTF table extraction via Pandoc markdown output may result in empty content
/// due to limitations in Pandoc's markdown table rendering. Tables are present
/// in Pandoc's internal JSON representation but may not render in text format.
#[tokio::test]
async fn test_rtf_table_simple_extraction() {
    let config = ExtractionConfig::default();
    let path = get_rtf_path("table_simple.rtf");

    // `expect` keeps the underlying error in the panic output if extraction fails.
    let extraction = extract_file(&path, Some("application/rtf"), &config)
        .await
        .expect("RTF extraction should succeed for table_simple.rtf");

    // Only the MIME type is checked; table content is not asserted in this test.
    // `assert_eq!` prints both values on mismatch.
    assert_eq!(
        extraction.mime_type, "application/rtf",
        "MIME type should be preserved"
    );
}

/// Test extraction of RTF file with various Unicode characters.
/// /// File: unicode.rtf /// Content: Smart quotes, Greek letters (α, ä) /// Expected: Correctly extracts and preserves Unicode characters /// Pandoc baseline: "hi"'hi'αä #[tokio::test] async fn test_rtf_unicode_extraction() { let config = ExtractionConfig::default(); let path = get_rtf_path("unicode.rtf"); let result = extract_file(&path, Some("application/rtf"), &config).await; assert!(result.is_ok(), "RTF extraction should succeed for unicode.rtf"); let extraction = result.expect("Operation failed"); assert!(!extraction.content.is_empty(), "Content should not be empty"); assert!( extraction.content.contains("hi") || extraction.content.contains("α") || extraction.content.contains("ä"), "Should extract unicode content (found: {})", extraction.content ); } /// Test that RTF extraction is deterministic /// Same input should produce identical output #[tokio::test] async fn test_rtf_extraction_deterministic_unicode() { let config = ExtractionConfig::default(); let path = get_rtf_path("unicode.rtf"); let result1 = extract_file(&path, Some("application/rtf"), &config).await; let result2 = extract_file(&path, Some("application/rtf"), &config).await; assert!(result1.is_ok() && result2.is_ok(), "Both extractions should succeed"); let extraction1 = result1.expect("Operation failed"); let extraction2 = result2.expect("Operation failed"); assert_eq!( extraction1.content, extraction2.content, "FAIL: Extraction is not deterministic. Same input produced different outputs." 
); } /// Test that RTF extraction is deterministic for complex files /// Same input should produce identical output #[tokio::test] async fn test_rtf_extraction_deterministic_list_complex() { let config = ExtractionConfig::default(); let path = get_rtf_path("list_complex.rtf"); let result1 = extract_file(&path, Some("application/rtf"), &config).await; let result2 = extract_file(&path, Some("application/rtf"), &config).await; assert!(result1.is_ok() && result2.is_ok(), "Both extractions should succeed"); let extraction1 = result1.expect("Operation failed"); let extraction2 = result2.expect("Operation failed"); assert_eq!( extraction1.content, extraction2.content, "FAIL: Extraction is not deterministic. Same input produced different outputs." ); } /// Test no critical content loss /// All RTF files should extract non-empty content (except possibly image-only files) #[tokio::test] async fn test_rtf_no_critical_content_loss() { let config = ExtractionConfig::default(); let must_extract = vec![ "unicode.rtf", "accent.rtf", "heading.rtf", "list_simple.rtf", "list_complex.rtf", "formatting.rtf", "footnote.rtf", "bookmark.rtf", "link.rtf", ]; for filename in must_extract { let path = get_rtf_path(filename); let result = extract_file(&path, Some("application/rtf"), &config).await; assert!( result.is_ok(), "FAIL: Extraction failed for {} (critical file)", filename ); let extraction = result.expect("Operation failed"); assert!( !extraction.content.is_empty(), "FAIL: CRITICAL - Extracted 0 bytes from {}. RTF extractor lost all content.", filename ); assert!( extraction.content.len() >= 5, "FAIL: Extracted only {} bytes from {} (expected at least 5 characters). 
Content: '{}'", extraction.content.len(), filename, extraction.content ); } } /// Test MIME type preservation /// All RTF extractions should preserve the application/rtf MIME type #[tokio::test] async fn test_rtf_mime_type_preservation() { let config = ExtractionConfig::default(); let test_files = vec!["unicode.rtf", "accent.rtf", "heading.rtf", "list_simple.rtf"]; for filename in test_files { let path = get_rtf_path(filename); let result = extract_file(&path, Some("application/rtf"), &config).await; assert!(result.is_ok(), "Extraction should succeed for {}", filename); let extraction = result.expect("Operation failed"); assert_eq!( extraction.mime_type, "application/rtf", "FAIL: MIME type not preserved for {}", filename ); } } /// Parity check: RTF extracted from the DOCX `word_sample.docx` should /// carry the same content signals and metadata as the DOCX extractor. #[tokio::test] async fn test_rtf_word_sample_matches_docx_metadata_and_content() { let root = workspace_root(); let rtf_path = root.join("test_documents/rtf/word_sample.rtf"); let docx_path = root.join("test_documents/docx/word_sample.docx"); if !rtf_path.exists() || !docx_path.exists() { println!("Skipping word_sample parity test: fixtures missing"); return; } let config = ExtractionConfig::default(); let rtf_result = extract_file(&rtf_path, Some("application/rtf"), &config) .await .expect("RTF extraction should succeed for word_sample"); let docx_result = extract_file(&docx_path, None, &config) .await .expect("DOCX extraction should succeed for word_sample"); let rtf_content_lower = rtf_result.content.to_lowercase(); assert!( rtf_content_lower.contains("swim"), "RTF content should include the same body text as DOCX" ); // Compare typed metadata fields (both DOCX and RTF now use typed fields) assert_eq!( rtf_result.metadata.created_by.as_deref(), docx_result.metadata.created_by.as_deref(), "Metadata field created_by should align with DOCX" ); assert_eq!( rtf_result.metadata.modified_by.as_deref(), 
docx_result.metadata.modified_by.as_deref(), "Metadata field modified_by should align with DOCX" ); assert_eq!( rtf_result.metadata.created_at.as_deref(), docx_result.metadata.created_at.as_deref(), "Metadata field created_at should align with DOCX" ); assert_eq!( rtf_result.metadata.additional.get("revision").and_then(|v| v.as_str()), docx_result.metadata.additional.get("revision").and_then(|v| v.as_str()), "Metadata field revision should align with DOCX" ); for (key, expected) in [ ("page_count", 2), ("word_count", 108), ("character_count", 620), ("line_count", 5), ("paragraph_count", 1), ] { assert_eq!( rtf_result.metadata.additional.get(key).and_then(|v| v.as_i64()), Some(expected), "Metadata field {} should match DOCX values", key ); } } /// RTF generated from lorem_ipsum.docx should expose the same document statistics /// we validate for the DOCX extractor. #[tokio::test] async fn test_rtf_lorem_ipsum_metadata_alignment() { let root = workspace_root(); let rtf_path = root.join("test_documents/rtf/lorem_ipsum.rtf"); if !rtf_path.exists() { println!("Skipping lorem_ipsum metadata test: fixture missing"); return; } let config = ExtractionConfig::default(); let result = extract_file(&rtf_path, Some("application/rtf"), &config) .await .expect("RTF extraction should succeed for lorem_ipsum"); assert!( result.content.to_lowercase().contains("lorem ipsum"), "Content should contain lorem ipsum text" ); for (key, expected) in [ ("page_count", 1), ("word_count", 520), ("character_count", 2967), ("line_count", 24), ("paragraph_count", 6), ] { assert_eq!( result.metadata.additional.get(key).and_then(|v| v.as_i64()), Some(expected), "Metadata field {} should match DOCX values", key ); } } /// The comprehensive extraction fixture should mirror the coverage of the ODT/DOCX variants: /// headings, section text, table content, and metadata fields should all be present. 
#[tokio::test] async fn test_rtf_comprehensive_extraction_alignment() { let root = workspace_root(); let rtf_path = root.join("test_documents/rtf/extraction_test.rtf"); let docx_path = root.join("test_documents/docx/extraction_test.docx"); let odt_path = root.join("test_documents/odt/extraction_test.odt"); if !rtf_path.exists() { println!("⚠️ Test document not found at {:?}, skipping", rtf_path); return; } if !docx_path.exists() || !odt_path.exists() { println!( "⚠️ Companion DOCX/ODT documents missing (docx: {}, odt: {}), skipping", docx_path.exists(), odt_path.exists() ); return; } let config = ExtractionConfig::default(); let rtf_result = extract_file(&rtf_path, Some("application/rtf"), &config) .await .expect("RTF extraction should succeed for extraction_test.rtf"); let docx_result = extract_file(&docx_path, None, &config) .await .expect("DOCX extraction should succeed for extraction_test.docx"); let odt_result = extract_file(&odt_path, None, &config) .await .expect("ODT extraction should succeed for extraction_test.odt"); assert!( rtf_result.content.contains("Comprehensive Extraction Test Document"), "Should include document heading" ); assert!( rtf_result.content.contains("First Section"), "Should include first section heading" ); assert!( rtf_result.content.contains("Second Section"), "Should include second section heading" ); assert!( rtf_result.content.contains("Third Section"), "Should include third section heading" ); for expected in ["Header 1", "Cell 1A", "Product", "Apple"] { assert!( rtf_result.content.contains(expected), "Should include table content '{}'", expected ); } // Verify table structure is recognized (tables are stored as structured nodes) assert!( !rtf_result.tables.is_empty(), "Should extract structured tables from RTF" ); assert!( rtf_result .tables .iter() .any(|t| t.markdown.contains("Header 1") || t.markdown.contains("Cell 1A")), "Table markdown should include header/data cells" ); assert!( rtf_result.tables.len() >= 
docx_result.tables.len() && rtf_result.tables.len() >= odt_result.tables.len(), "RTF should capture at least as many tables as DOCX/ODT" ); for (key, expected) in [ ("page_count", 1), ("word_count", 83), ("character_count", 475), ("line_count", 12), ("paragraph_count", 8), ] { assert_eq!( rtf_result.metadata.additional.get(key).and_then(|v| v.as_i64()), Some(expected), "Metadata field {} should be populated", key ); } }