//! Comprehensive TDD test suite for RST (reStructuredText) extraction //! //! Tests RST extraction using Pandoc as the baseline for quality validation. //! The test documents are derived from the Pandoc test suite and provide //! comprehensive coverage of RST-specific features including: //! - Metadata extraction from field lists (:Author:, :Date:, etc.) //! - Directive handling (.. code-block::, .. image::, .. math::, etc.) //! - Section structure and heading levels //! - Table extraction (simple and grid tables) //! - Reference links and images #![cfg(feature = "office")] //! - Comments and special blocks //! - Content quality validation use kreuzberg::core::config::ExtractionConfig; use kreuzberg::core::extractor::extract_bytes; mod helpers; const RST_FIXTURE: &str = include_str!("../../../test_documents/rst/rst-reader.rst"); fn rst_fixture_bytes() -> Vec { RST_FIXTURE.as_bytes().to_vec() } /// Test extraction of document title from RST file structure #[tokio::test] async fn test_rst_title_extraction() { let content = rst_fixture_bytes(); let result = extract_bytes(&content, "text/x-rst", &ExtractionConfig::default()) .await .expect("Should extract RST successfully"); assert!( result.content.to_lowercase().contains("pandoc test suite"), "Should contain document title 'Pandoc Test Suite'" ); assert!( result.content.contains("Level one header") || result.content.contains("header"), "Should contain document headers" ); println!("✅ RST title extraction test passed!"); } /// Test field list metadata extraction (:Authors:, :Date:, :Revision:) #[tokio::test] async fn test_rst_field_list_metadata_extraction() { let content = rst_fixture_bytes(); let result = extract_bytes(&content, "text/x-rst", &ExtractionConfig::default()) .await .expect("Should extract RST successfully"); println!( "Content excerpt (first 500 chars): {}", &result.content[..std::cmp::min(500, result.content.len())] ); assert!( result.content.contains("John MacFarlane") || result.content.contains("July 17") || result.content.contains("Pandoc Test Suite"), "Should contain metadata information or title" ); println!("✅ RST field list metadata extraction test passed!"); } /// Test extraction of multiple heading levels #[tokio::test] async fn test_rst_section_hierarchy() { let content = rst_fixture_bytes(); let result = extract_bytes(&content, "text/x-rst", &ExtractionConfig::default()) .await .expect("Should extract RST successfully"); let headings = vec![ "Level one header", "Level two header", "Level three", "Paragraphs", "Block Quotes", "Code Blocks", "Lists", "Field Lists", "HTML Blocks", "LaTeX Block", "Images", "Tables", ]; for heading in headings { assert!( result.content.contains(heading), "Should contain heading: '{}'", heading ); } println!("✅ RST section hierarchy test passed!"); } /// Test that emphasis in headings is preserved #[tokio::test] async fn test_rst_heading_with_inline_markup() { let content = rst_fixture_bytes(); let result = extract_bytes(&content, "text/x-rst", &ExtractionConfig::default()) .await .expect("Should extract RST successfully"); assert!( result.content.contains("emphasis") || result.content.contains("Level four"), "Should contain heading with emphasis" ); println!("✅ RST heading with inline markup test passed!"); } /// Test code block extraction with language specification #[tokio::test] async fn test_rst_code_block_extraction() { let content = rst_fixture_bytes(); let result = extract_bytes(&content, "text/x-rst", &ExtractionConfig::default()) .await .expect("Should extract RST successfully"); assert!( result.content.contains("def my_function") || result.content.contains("python"), "Should contain Python code block or language specification" ); assert!( result.content.contains("return x + 1") || result.content.contains("my_function"), "Should contain Python function code" ); println!("✅ RST code block extraction test passed!"); } /// Test Haskell code blocks with highlight directive #[tokio::test] async fn test_rst_highlight_directive_code_blocks() { let content = rst_fixture_bytes(); let result = extract_bytes(&content, "text/x-rst", &ExtractionConfig::default()) .await .expect("Should extract RST successfully"); assert!( result.content.contains("haskell") || result.content.contains("Tree") || result.content.contains("data Tree"), "Should contain Haskell code blocks" ); assert!( result.content.contains("Leaf") || result.content.contains("Node"), "Should contain Haskell data constructors" ); println!("✅ RST highlight directive code blocks test passed!"); } /// Test JavaScript code blocks #[tokio::test] async fn test_rst_javascript_code_blocks() { let content = rst_fixture_bytes(); let result = extract_bytes(&content, "text/x-rst", &ExtractionConfig::default()) .await .expect("Should extract RST successfully"); assert!( result.content.contains("javascript") || result.content.contains("=>") || result.content.contains("let f"), "Should contain JavaScript code" ); println!("✅ RST JavaScript code blocks test passed!"); } /// Test unordered list extraction #[tokio::test] async fn test_rst_unordered_lists() { let content = rst_fixture_bytes(); let result = extract_bytes(&content, "text/x-rst", &ExtractionConfig::default()) .await .expect("Should extract RST successfully"); let list_items = vec![ "asterisk 1", "asterisk 2", "asterisk 3", "Plus 1", "Plus 2", "Plus 3", "Minus 1", "Minus 2", "Minus 3", ]; for item in list_items { assert!(result.content.contains(item), "Should contain list item: '{}'", item); } println!("✅ RST unordered lists test passed!"); } /// Test ordered list extraction #[tokio::test] async fn test_rst_ordered_lists() { let content = rst_fixture_bytes(); let result = extract_bytes(&content, "text/x-rst", &ExtractionConfig::default()) .await .expect("Should extract RST successfully"); let list_items = vec!["First", "Second", "Third"]; for item in list_items { assert!( result.content.contains(item), "Should contain ordered list item: '{}'", item ); } println!("✅ RST ordered lists test passed!"); } /// Test nested lists extraction #[tokio::test] async fn test_rst_nested_lists() { let content = rst_fixture_bytes(); let result = extract_bytes(&content, "text/x-rst", &ExtractionConfig::default()) .await .expect("Should extract RST successfully"); assert!( result.content.contains("First") || result.content.contains("Second") || result.content.contains("Third") || result.content.contains("Definition"), "Should contain nested or definition list content" ); println!("✅ RST nested lists test passed!"); } /// Test simple table extraction #[tokio::test] async fn test_rst_simple_table_extraction() { let content = rst_fixture_bytes(); let result = extract_bytes(&content, "text/x-rst", &ExtractionConfig::default()) .await .expect("Should extract RST successfully"); assert!( result.content.contains("Simple Tables") || result.content.contains("col") || (result.content.contains("r1") && result.content.contains("r2")), "Should contain simple table content" ); println!("✅ RST simple table extraction test passed!"); } /// Test grid table extraction #[tokio::test] async fn test_rst_grid_table_extraction() { let content = rst_fixture_bytes(); let result = extract_bytes(&content, "text/x-rst", &ExtractionConfig::default()) .await .expect("Should extract RST successfully"); assert!( result.content.contains("Grid Tables") || result.content.contains("r1 a") || (result.content.contains("r1") && result.content.contains("r2")), "Should contain grid table content" ); println!("✅ RST grid table extraction test passed!"); } /// Test table with complex structure (multiple rows/columns spanning) #[tokio::test] async fn test_rst_complex_table_with_spanning() { let content = rst_fixture_bytes(); let result = extract_bytes(&content, "text/x-rst", &ExtractionConfig::default()) .await .expect("Should extract RST successfully"); assert!( result.content.contains("Table with cells") || result.content.contains("Property") || result.content.contains("min") || result.content.contains("°C"), "Should contain complex table content" ); println!("✅ RST complex table with spanning test passed!"); } /// Test emphasis and strong markup #[tokio::test] async fn test_rst_emphasis_and_strong() { let content = rst_fixture_bytes(); let result = extract_bytes(&content, "text/x-rst", &ExtractionConfig::default()) .await .expect("Should extract RST successfully"); assert!( result.content.contains("emphasized") || result.content.contains("strong"), "Should contain emphasis markers or converted text" ); println!("✅ RST emphasis and strong test passed!"); } /// Test inline code extraction #[tokio::test] async fn test_rst_inline_code() { let content = rst_fixture_bytes(); let result = extract_bytes(&content, "text/x-rst", &ExtractionConfig::default()) .await .expect("Should extract RST successfully"); assert!( result.content.contains(">") || result.content.contains("code"), "Should contain inline code or code markers" ); println!("✅ RST inline code test passed!"); } /// Test subscript and superscript #[tokio::test] async fn test_rst_subscript_superscript() { let content = rst_fixture_bytes(); let result = extract_bytes(&content, "text/x-rst", &ExtractionConfig::default()) .await .expect("Should extract RST successfully"); assert!( result.content.contains("subscript") || result.content.contains("superscript"), "Should contain subscript/superscript text" ); println!("✅ RST subscript/superscript test passed!"); } /// Test explicit links extraction #[tokio::test] async fn test_rst_explicit_links() { let content = rst_fixture_bytes(); let result = extract_bytes(&content, "text/x-rst", &ExtractionConfig::default()) .await .expect("Should extract RST successfully"); assert!( result.content.contains("/url") || result.content.contains("URL"), "Should contain link URLs" ); assert!( result.content.contains("link"), "Should contain link references or text" ); println!("✅ RST explicit links test passed!"); } /// Test reference links #[tokio::test] async fn test_rst_reference_links() { let content = rst_fixture_bytes(); let result = extract_bytes(&content, "text/x-rst", &ExtractionConfig::default()) .await .expect("Should extract RST successfully"); assert!( result.content.contains("link1") || result.content.contains("link2") || result.content.contains("link"), "Should contain resolved reference links" ); println!("✅ RST reference links test passed!"); } /// Test autolinks (bare URLs and email addresses) #[tokio::test] async fn test_rst_autolinks() { let content = rst_fixture_bytes(); let result = extract_bytes(&content, "text/x-rst", &ExtractionConfig::default()) .await .expect("Should extract RST successfully"); assert!( result.content.contains("example.com") || result.content.contains("http"), "Should contain URLs from autolinks" ); assert!( result.content.contains("nowhere") || result.content.contains("@"), "Should contain email references" ); println!("✅ RST autolinks test passed!"); } /// Test image directive extraction #[tokio::test] async fn test_rst_image_directive() { let content = rst_fixture_bytes(); let result = extract_bytes(&content, "text/x-rst", &ExtractionConfig::default()) .await .expect("Should extract RST successfully"); assert!( result.content.contains("image") || result.content.contains("lalune") || result.content.contains("movie"), "Should contain image directives or references" ); assert!( result.content.contains("Voyage") || result.content.contains("Melies"), "Should contain image descriptions" ); println!("✅ RST image directive test passed!"); } /// Test raw HTML block extraction #[tokio::test] async fn test_rst_raw_html_blocks() { let content = rst_fixture_bytes(); let result = extract_bytes(&content, "text/x-rst", &ExtractionConfig::default()) .await .expect("Should extract RST successfully"); assert!( result.content.contains("div") || result.content.contains("foo"), "Should contain HTML block content" ); println!("✅ RST raw HTML blocks test passed!"); } /// Test LaTeX block extraction #[tokio::test] async fn test_rst_latex_blocks() { let content = rst_fixture_bytes(); let result = extract_bytes(&content, "text/x-rst", &ExtractionConfig::default()) .await .expect("Should extract RST successfully"); assert!( result.content.contains("LaTeX Block") || result.content.contains("begin{tabular}") || result.content.contains("Animal") || result.content.contains("Dog"), "Should contain LaTeX block or content" ); println!("✅ RST LaTeX blocks test passed!"); } /// Test math directive extraction #[tokio::test] async fn test_rst_math_directive() { let content = rst_fixture_bytes(); let result = extract_bytes(&content, "text/x-rst", &ExtractionConfig::default()) .await .expect("Should extract RST successfully"); assert!( result.content.contains("E=mc^2") || result.content.contains("E = mc") || result.content.contains("alpha") || result.content.contains("Math"), "Should contain math formulas" ); println!("✅ RST math directive test passed!"); } /// Test comment blocks are excluded from output #[tokio::test] async fn test_rst_comment_blocks_excluded() { let content = rst_fixture_bytes(); let result = extract_bytes(&content, "text/x-rst", &ExtractionConfig::default()) .await .expect("Should extract RST successfully"); assert!( !result.content.contains("should not appear"), "Comments should be excluded from output" ); assert!( result.content.contains("First paragraph") || result.content.contains("paragraph"), "Non-comment content should be present" ); println!("✅ RST comment blocks excluded test passed!"); } /// Test line blocks extraction #[tokio::test] async fn test_rst_line_blocks() { let content = rst_fixture_bytes(); let result = extract_bytes(&content, "text/x-rst", &ExtractionConfig::default()) .await .expect("Should extract RST successfully"); assert!( result.content.contains("Line blocks") || result.content.contains("bee") || result.content.contains("entire bee"), "Should contain line block content or heading" ); println!("✅ RST line blocks test passed!"); } /// Test unicode character preservation #[tokio::test] async fn test_rst_unicode_characters() { let content = rst_fixture_bytes(); let result = extract_bytes(&content, "text/x-rst", &ExtractionConfig::default()) .await .expect("Should extract RST successfully"); assert!( result.content.contains("©") || result.content.contains("copyright") || result.content.contains("umlaut") || result.content.contains("unicode"), "Should contain unicode characters or references" ); println!("✅ RST unicode characters test passed!"); } /// Test escaped characters #[tokio::test] async fn test_rst_escaped_characters() { let content = rst_fixture_bytes(); let result = extract_bytes(&content, "text/x-rst", &ExtractionConfig::default()) .await .expect("Should extract RST successfully"); assert!( result.content.contains("Backslash") || result.content.contains("Backtick") || result.content.contains("Asterisk"), "Should contain escaped special character sections" ); println!("✅ RST escaped characters test passed!"); } // SECTION 12: FOOTNOTES AND REFERENCES /// Test footnote extraction #[tokio::test] async fn test_rst_footnotes() { let content = rst_fixture_bytes(); let result = extract_bytes(&content, "text/x-rst", &ExtractionConfig::default()) .await .expect("Should extract RST successfully"); assert!( result.content.contains("Note") || result.content.contains("continuation"), "Should contain footnote content" ); println!("✅ RST footnotes test passed!"); } /// Test block quote extraction #[tokio::test] async fn test_rst_block_quotes() { let content = rst_fixture_bytes(); let result = extract_bytes(&content, "text/x-rst", &ExtractionConfig::default()) .await .expect("Should extract RST successfully"); assert!( result.content.contains("block quote") || result.content.contains("pretty short"), "Should contain block quote content" ); println!("✅ RST block quotes test passed!"); } /// Test overall content extraction volume #[tokio::test] async fn test_rst_content_extraction_volume() { let content = rst_fixture_bytes(); let result = extract_bytes(&content, "text/x-rst", &ExtractionConfig::default()) .await .expect("Should extract RST successfully"); let content_length = result.content.len(); println!("Extracted content length: {} bytes", content_length); assert!( content_length > 1000, "Extracted content should be substantial (> 1000 bytes), got {} bytes", content_length ); assert_eq!(result.mime_type, "text/x-rst", "MIME type should be preserved"); println!("✅ RST content extraction volume test passed!"); println!(" Extracted {} bytes from RST file", content_length); } /// Test extracted content contains all major sections #[tokio::test] async fn test_rst_all_major_sections_present() { let content = rst_fixture_bytes(); let result = extract_bytes(&content, "text/x-rst", &ExtractionConfig::default()) .await .expect("Should extract RST successfully"); let major_sections = vec![ "Paragraphs", "Block Quotes", "Code Blocks", "Lists", "Field Lists", "HTML Blocks", "LaTeX Block", "Inline Markup", "Special Characters", "Links", "Images", "Comments", "Tables", "Math", ]; let content_lower = result.content.to_lowercase(); let mut found_count = 0; for section in major_sections { if content_lower.contains(§ion.to_lowercase()) { found_count += 1; println!("✓ Found section: {}", section); } else { println!("✗ Missing section: {}", section); } } assert!( found_count >= 10, "Should find at least 10 major sections, found {}", found_count ); println!("✅ RST all major sections present test passed!"); println!(" Found {}/14 major sections", found_count); } /// Test MIME type detection #[tokio::test] async fn test_rst_mime_type_detection() { let content = rst_fixture_bytes(); let result = extract_bytes(&content, "text/x-rst", &ExtractionConfig::default()) .await .expect("Should extract with text/x-rst MIME type"); assert_eq!(result.mime_type, "text/x-rst"); println!("✅ RST MIME type detection test passed!"); } /// Test that no extraction errors occur on valid RST file #[tokio::test] async fn test_rst_extraction_no_errors() { let content = rst_fixture_bytes(); let result = extract_bytes(&content, "text/x-rst", &ExtractionConfig::default()).await; assert!( result.is_ok(), "RST extraction should succeed without errors: {:?}", result.err() ); let extraction = result.expect("Operation failed"); assert!(!extraction.content.is_empty(), "Extracted content should not be empty"); println!("✅ RST extraction no errors test passed!"); }