//! Comprehensive TDD test suite for OPML (Outline Processor Markup Language) extraction
//!
//! This test suite validates OPML extraction capabilities.
//! Each test extracts an OPML file and validates:
//!
//! - Metadata extraction (title, dateCreated, dateModified, ownerName, ownerEmail)
//! - Outline hierarchy extraction with proper indentation
//! - RSS feed attribute handling (xmlUrl, htmlUrl)
//! - Content structure preservation
//! - Special character handling
//! - Edge cases (empty bodies, nested structures, etc.)

#![cfg(feature = "office")]

use kreuzberg::core::config::ExtractionConfig;
use kreuzberg::core::extractor::extract_bytes;
use std::path::PathBuf;

mod helpers;

/// Maximum number of bytes of extracted content echoed in assertion failure messages.
const PREVIEW_MAX_BYTES: usize = 300;

/// Helper to resolve workspace root and construct test file paths
///
/// The workspace root is taken to be two directories above `CARGO_MANIFEST_DIR`;
/// fixtures live under `test_documents/opml/` below it.
fn get_test_opml_path(filename: &str) -> PathBuf {
    let workspace_root = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
        .parent()
        .expect("Operation failed")
        .parent()
        .expect("Operation failed");
    workspace_root.join("test_documents").join("opml").join(filename)
}

/// Helper to load an OPML fixture as raw bytes.
///
/// Returns `None` (after printing a skip notice) when the fixture does not exist,
/// so callers can return early instead of failing on checkouts without test documents.
///
/// # Panics
///
/// Panics if the fixture exists but cannot be read.
fn read_opml_fixture(filename: &str) -> Option<Vec<u8>> {
    let test_file = get_test_opml_path(filename);
    if !test_file.exists() {
        println!("Skipping test: Test file not found at {:?}", test_file);
        return None;
    }
    Some(std::fs::read(&test_file).expect("Should read OPML file"))
}

/// Helper returning at most `max_bytes` bytes from the start of `content`.
///
/// The cut is moved back to the nearest UTF-8 character boundary so that slicing
/// never panics in the middle of a multi-byte character.
fn content_preview(content: &str, max_bytes: usize) -> &str {
    if content.len() <= max_bytes {
        return content;
    }
    let mut end = max_bytes;
    // Index 0 is always a char boundary, so this loop terminates.
    while !content.is_char_boundary(end) {
        end -= 1;
    }
    &content[..end]
}

/// Helper to validate that content contains expected text (case-insensitive)
///
/// On failure the message includes a short, boundary-safe preview of `content`.
fn assert_contains_ci(content: &str, needle: &str, description: &str) {
    assert!(
        content.to_lowercase().contains(&needle.to_lowercase()),
        "Content should contain '{}' ({}). Content: {}",
        needle,
        description,
        content_preview(content, PREVIEW_MAX_BYTES)
    );
}

/// Helper to validate content doesn't contain undesired text
// Not called by any test in this file at present; kept alongside `assert_contains_ci`.
#[allow(dead_code)]
fn assert_not_contains_ci(content: &str, needle: &str, description: &str) {
    assert!(
        !content.to_lowercase().contains(&needle.to_lowercase()),
        "Content should NOT contain '{}' ({})",
        needle,
        description
    );
}

/// Test 1: Extract RSS feed subscription list with categories
///
/// Validates:
/// - Successfully extracts feeds.opml with RSS feed structure
/// - Extracts head metadata (title, dateCreated, dateModified, ownerName, ownerEmail)
/// - Content includes all feed categories and feed names
/// - Feed URLs are captured in output
/// - Hierarchy structure is preserved with proper nesting
#[tokio::test]
async fn test_opml_rss_feeds_extraction() {
    let content = match read_opml_fixture("feeds.opml") {
        Some(bytes) => bytes,
        None => return,
    };

    let result = extract_bytes(&content, "text/x-opml", &ExtractionConfig::default())
        .await
        .expect("Should extract RSS feeds OPML successfully");

    assert!(
        !result.content.is_empty(),
        "Content should not be empty for RSS feeds OPML"
    );

    assert_contains_ci(&result.content, "Technology", "Should contain Technology category");
    assert_contains_ci(&result.content, "Programming", "Should contain Programming category");
    assert_contains_ci(
        &result.content,
        "Uncategorized",
        "Should contain Uncategorized category",
    );

    assert_contains_ci(&result.content, "Hacker News", "Should contain Hacker News feed");
    assert_contains_ci(&result.content, "TechCrunch", "Should contain TechCrunch feed");
    assert_contains_ci(&result.content, "Rust Blog", "Should contain Rust Blog feed");

    assert!(result.metadata.title.is_some(), "Should extract title metadata");
    assert_eq!(
        result.metadata.title.as_deref(),
        Some("Tech News Feeds"),
        "Should have correct title"
    );

    // Owner information may surface either as the creator or as a raw `ownerEmail` entry.
    let has_owner = result.metadata.created_by.is_some() || result.metadata.additional.contains_key("ownerEmail");
    assert!(has_owner, "Should extract owner information");

    println!("✅ RSS feeds extraction test passed!");
    println!(" Found {} metadata fields", result.metadata.additional.len());
    println!(" Content length: {} bytes", result.content.len());
}

/// Test 2: Extract podcast directory with multiple categories
///
/// Validates:
/// - Successfully extracts podcasts.opml with podcast structure
/// - Extracts title and metadata fields
/// - Content includes all podcast categories
/// - Podcast feed names are properly extracted
/// - Handles HTML entity encoding (&amp;)
/// - Complex hierarchy is preserved
#[tokio::test]
async fn test_opml_podcast_directory_extraction() {
    let content = match read_opml_fixture("podcasts.opml") {
        Some(bytes) => bytes,
        None => return,
    };

    let result = extract_bytes(&content, "text/x-opml", &ExtractionConfig::default())
        .await
        .expect("Should extract podcast directory OPML successfully");

    assert!(
        !result.content.is_empty(),
        "Content should not be empty for podcast OPML"
    );

    assert_contains_ci(
        &result.content,
        "Technology Podcasts",
        "Should contain Technology Podcasts category",
    );
    assert_contains_ci(&result.content, "Business", "Should contain Business category");
    assert_contains_ci(&result.content, "Science", "Should contain Science category");

    assert_contains_ci(&result.content, "Syntax", "Should contain Syntax podcast");
    assert_contains_ci(&result.content, "Acquired", "Should contain Acquired podcast");

    assert_eq!(
        result.metadata.title.as_deref(),
        Some("Podcast Directory"),
        "Should have correct title"
    );
    assert_eq!(
        result.metadata.created_by.as_deref(),
        Some("Jane Doe"),
        "Should extract owner name correctly"
    );

    println!("✅ Podcast directory extraction test passed!");
    println!(" Found {} metadata fields", result.metadata.additional.len());
    println!(" Content length: {} bytes", result.content.len());
}

/// Test 3: Extract general outline structure with deep nesting
///
/// Validates:
/// - Successfully extracts outline.opml with project structure
/// - Preserves hierarchy with proper indentation
/// - Handles multi-level nesting (4 levels deep)
/// - Extracts all task items in correct order
/// - Metadata is properly extracted
/// - Content structure matches expected outline format
#[tokio::test]
async fn test_opml_outline_hierarchy_extraction() {
    let content = match read_opml_fixture("outline.opml") {
        Some(bytes) => bytes,
        None => return,
    };

    let result = extract_bytes(&content, "text/x-opml", &ExtractionConfig::default())
        .await
        .expect("Should extract outline OPML successfully");

    assert!(
        !result.content.is_empty(),
        "Content should not be empty for outline OPML"
    );

    assert_contains_ci(&result.content, "Project Alpha", "Should contain main project");
    assert_contains_ci(&result.content, "Phase 1", "Should contain Phase 1");
    assert_contains_ci(&result.content, "Phase 2", "Should contain Phase 2");
    assert_contains_ci(&result.content, "Phase 3", "Should contain Phase 3");
    assert_contains_ci(&result.content, "Phase 4", "Should contain Phase 4");

    assert_contains_ci(
        &result.content,
        "Requirements gathering",
        "Should contain Phase 1 tasks",
    );
    assert_contains_ci(&result.content, "Resource allocation", "Should contain Phase 1 tasks");
    assert_contains_ci(
        &result.content,
        "Backend implementation",
        "Should contain Phase 2 backend task",
    );
    assert_contains_ci(
        &result.content,
        "Frontend implementation",
        "Should contain Phase 2 frontend task",
    );
    assert_contains_ci(&result.content, "Unit testing", "Should contain Phase 3 testing task");
    assert_contains_ci(
        &result.content,
        "Production setup",
        "Should contain Phase 4 deployment task",
    );

    assert_eq!(
        result.metadata.title.as_deref(),
        Some("Project Outline"),
        "Should have correct title"
    );

    assert!(
        !result.content.is_empty(),
        "Content should have nested items rendered (as headings)"
    );

    println!("✅ Outline hierarchy extraction test passed!");
    println!(" Content length: {} bytes", result.content.len());
    println!(" Hierarchy levels preserved with indentation");
}

/// Test 4: Comprehensive metadata extraction from head section
///
/// Validates:
/// - All head metadata fields are extracted (title, dateCreated, dateModified, ownerName, ownerEmail)
/// - Metadata values are correctly typed and encoded
/// - Date formats are preserved as-is
/// - Owner information is properly extracted
/// - Missing optional fields are handled gracefully
#[tokio::test]
async fn test_opml_metadata_extraction_complete() {
    let content = match read_opml_fixture("feeds.opml") {
        Some(bytes) => bytes,
        None => return,
    };

    let result = extract_bytes(&content, "text/x-opml", &ExtractionConfig::default())
        .await
        .expect("Should extract metadata successfully");

    let additional = &result.metadata.additional;

    assert!(result.metadata.title.is_some(), "Should have title metadata");
    assert!(
        result.metadata.created_at.is_some() || result.metadata.modified_at.is_some(),
        "Should have at least one date field"
    );
    assert!(
        result.metadata.created_by.is_some() || additional.contains_key("ownerEmail"),
        "Should have owner information"
    );

    assert_eq!(
        result.metadata.title.as_deref(),
        Some("Tech News Feeds"),
        "Title should match exactly"
    );

    // The creation date is optional; only check its shape when it was extracted.
    if let Some(date_created) = result.metadata.created_at.as_deref() {
        assert!(
            date_created.contains("Nov") || date_created.contains("2023"),
            "Date should be preserved in original format"
        );
    }

    assert_eq!(
        result.metadata.created_by.as_deref(),
        Some("John Smith"),
        "Owner name should be extracted"
    );

    println!("✅ Metadata extraction test passed!");
    println!(" Metadata fields: {:?}", additional.keys().collect::<Vec<_>>());
}

/// Test 5: Verify RSS feed names are extracted from OPML attributes
///
/// Validates:
/// - Feed names from text attribute are properly extracted
/// - Feed categories are preserved in the hierarchy
/// - All feed names are present in output
/// - Extraction matches Pandoc baseline (no URLs in main content)
#[tokio::test]
async fn test_opml_feed_url_extraction() {
    let content = match read_opml_fixture("feeds.opml") {
        Some(bytes) => bytes,
        None => return,
    };

    let result = extract_bytes(&content, "text/x-opml", &ExtractionConfig::default())
        .await
        .expect("Should extract feed names successfully");

    assert_contains_ci(&result.content, "Hacker News", "Should contain Hacker News feed name");
    assert_contains_ci(&result.content, "TechCrunch", "Should contain TechCrunch feed name");
    assert_contains_ci(&result.content, "Rust Blog", "Should contain Rust Blog feed name");
    assert_contains_ci(&result.content, "Dev.to", "Should contain Dev.to feed name");

    assert_contains_ci(&result.content, "Technology", "Should contain Technology category");
    assert_contains_ci(&result.content, "Programming", "Should contain Programming category");

    println!("✅ Feed extraction test passed!");
    println!(" Found {} bytes of content", result.content.len());
}

/// Test 6: Verify correct MIME type handling and format detection
///
/// Validates:
/// - MIME type is correctly preserved in result
/// - Extractor handles text/x-opml MIME type
/// - Content format is appropriate for OPML outline structure
/// - Result structure is valid
#[tokio::test]
async fn test_opml_mime_type_handling() {
    let content = match read_opml_fixture("feeds.opml") {
        Some(bytes) => bytes,
        None => return,
    };

    let result = extract_bytes(&content, "text/x-opml", &ExtractionConfig::default())
        .await
        .expect("Should extract with text/x-opml MIME type");

    assert_eq!(result.mime_type, "text/x-opml", "MIME type should be preserved");

    let result2 = extract_bytes(&content, "application/xml+opml", &ExtractionConfig::default())
        .await
        .expect("Should extract with application/xml+opml MIME type");

    assert_eq!(
        result2.mime_type, "application/xml+opml",
        "Alternative MIME type should work"
    );

    assert_eq!(
        result.content, result2.content,
        "Content should be same regardless of MIME type"
    );

    println!("✅ MIME type handling test passed!");
}

/// Test 7: Handle special characters and HTML entities in OPML
///
/// Validates:
/// - HTML entities are properly decoded (&amp;, &lt;, &gt;, etc.)
/// - Special characters in feed names are handled correctly
/// - Quotes and apostrophes are properly processed
/// - UTF-8 content is valid
/// - Content is human-readable after extraction
#[tokio::test]
async fn test_opml_special_characters_handling() {
    let content = match read_opml_fixture("podcasts.opml") {
        Some(bytes) => bytes,
        None => return,
    };

    let result = extract_bytes(&content, "text/x-opml", &ExtractionConfig::default())
        .await
        .expect("Should extract with special characters");

    assert_contains_ci(
        &result.content,
        "Business",
        "Should properly decode Business & Startups",
    );

    // Walks every character of the content; the count itself is not asserted on.
    let _ = result.content.chars().count();

    println!("✅ Special characters handling test passed!");
    println!(" Verified UTF-8 integrity and entity decoding");
}

/// Test 9: Validate deep nesting and hierarchy preservation in outline.opml
///
/// Validates:
/// - Multi-level nesting (4 levels) is properly preserved
/// - Indentation increases with nesting depth
/// - All tasks are extracted in correct nesting context
/// - Task ordering is preserved
/// - Notes & Resources section is captured
#[tokio::test]
async fn test_opml_deep_nesting_hierarchy() {
    let content = match read_opml_fixture("outline.opml") {
        Some(bytes) => bytes,
        None => return,
    };

    let result = extract_bytes(&content, "text/x-opml", &ExtractionConfig::default())
        .await
        .expect("Should extract deep nesting successfully");

    let extracted = &result.content;

    // A missing heading must fail loudly: defaulting its position to 0 would let a
    // missing "Project Alpha" satisfy the ordering check below.
    let position_of = |needle: &str| -> usize {
        extracted
            .find(needle)
            .unwrap_or_else(|| panic!("'{}' should be present in extracted content", needle))
    };

    let project_pos = position_of("Project Alpha");
    let phase1_pos = position_of("Phase 1");
    let phase2_pos = position_of("Phase 2");
    let phase3_pos = position_of("Phase 3");
    let phase4_pos = position_of("Phase 4");

    assert!(
        project_pos < phase1_pos && phase1_pos < phase2_pos && phase2_pos < phase3_pos && phase3_pos < phase4_pos,
        "Phases should appear in order in output"
    );

    assert_contains_ci(extracted, "Phase 1", "Phase 1 should be present");
    assert_contains_ci(extracted, "Phase 2", "Phase 2 should be present");
    assert_contains_ci(extracted, "Phase 3", "Phase 3 should be present");
    assert_contains_ci(extracted, "Phase 4", "Phase 4 should be present");

    assert_contains_ci(extracted, "Notes & Resources", "Notes section should be present");

    println!("✅ Deep nesting hierarchy test passed!");
    println!(" All phases and tasks extracted in correct order");
}

/// Test 10: Validate content extraction quality and consistency across all OPML files
///
/// Validates:
/// - All OPML files produce non-empty content
/// - Content is valid UTF-8 (no corruption)
/// - Content doesn't have excessive whitespace
/// - Minimum content quality standards
/// - Consistent extraction behavior
#[tokio::test]
async fn test_opml_content_quality_all_files() {
    let opml_files = ["feeds.opml", "podcasts.opml", "outline.opml"];

    for opml_file in opml_files {
        let test_file = get_test_opml_path(opml_file);
        if !test_file.exists() {
            println!("Skipping file: {:?}", test_file);
            continue;
        }

        let content = std::fs::read(&test_file).expect("Should read OPML file");
        let result = extract_bytes(&content, "text/x-opml", &ExtractionConfig::default())
            .await
            .unwrap_or_else(|_| panic!("Should extract {}", opml_file));

        assert!(
            !result.content.is_empty(),
            "Content should not be empty for {}",
            opml_file
        );

        assert!(
            result.content.len() > 20,
            "Content should have meaningful length for {}",
            opml_file
        );

        // Both sides of the ratio are measured in characters; dividing a char count
        // by the byte length would understate the ratio for non-ASCII content.
        // `total_chars` is non-zero because the content was asserted non-empty above.
        let total_chars = result.content.chars().count();
        let whitespace_chars = result.content.chars().filter(|c| c.is_whitespace()).count();
        let whitespace_ratio = whitespace_chars as f64 / total_chars as f64;
        assert!(
            whitespace_ratio < 0.5,
            "Content should not be mostly whitespace for {}",
            opml_file
        );

        println!(" ✓ {} ({} bytes) quality validated", opml_file, result.content.len());
    }

    println!("✅ Content quality validation test passed!");
}

/// Test 11: Verify OPML extractor is properly registered
///
/// Validates:
/// - Extractor is available in the registry
#[tokio::test]
async fn test_opml_extractor_registration() {
    use kreuzberg::plugins::registry::get_document_extractor_registry;

    // Trigger initialization via a real extraction call so the registry is populated.
    // The outcome of extracting empty input is irrelevant here, so it is discarded.
    let _ = kreuzberg::extract_bytes(
        b"",
        "text/x-opml",
        &kreuzberg::core::config::ExtractionConfig::default(),
    )
    .await;

    let registry = get_document_extractor_registry();
    let registry_guard = registry.read();
    let extractor_names = registry_guard.list();

    println!("Available extractors: {:?}", extractor_names);

    assert!(
        extractor_names.contains(&"opml-extractor".to_string()),
        "OPML extractor should be registered. Available: {:?}",
        extractor_names
    );

    println!("✅ OPML extractor registration test passed!");
}

/// Test 12: Extract all OPML files and generate summary statistics
///
/// This test runs all OPML extractions and provides comprehensive statistics
/// for validation and debugging purposes. It's not a strict pass/fail test
/// but provides useful information about extraction behavior.
#[tokio::test]
async fn test_opml_extraction_statistics() {
    let opml_files = ["feeds.opml", "podcasts.opml", "outline.opml"];

    println!("\n╔════════════════════════════════════════════════════════════╗");
    println!("║ OPML Extraction Statistics Report ║");
    println!("╚════════════════════════════════════════════════════════════╝\n");

    let mut total_files = 0;
    let mut total_content_bytes = 0;
    let mut total_metadata_fields = 0;

    for opml_file in opml_files {
        let test_file = get_test_opml_path(opml_file);
        if !test_file.exists() {
            println!("⚠ SKIP: {} (not found)", opml_file);
            continue;
        }

        // Read and extraction failures are reported, not asserted: this test only
        // gathers statistics.
        match std::fs::read(&test_file) {
            Ok(content) => match extract_bytes(&content, "text/x-opml", &ExtractionConfig::default()).await {
                Ok(result) => {
                    total_files += 1;
                    total_content_bytes += result.content.len();
                    total_metadata_fields += result.metadata.additional.len();

                    println!("✓ {} ", opml_file);
                    println!(" Content: {} bytes", result.content.len());
                    println!(" Metadata fields: {}", result.metadata.additional.len());

                    if !result.metadata.additional.is_empty() {
                        let keys: Vec<String> = result.metadata.additional.keys().map(|k| k.to_string()).collect();
                        println!(" Keys: {}", keys.join(", "));
                    }

                    // One output line is treated as roughly one outline item.
                    let outline_count = result.content.lines().count();
                    println!(" Outline items: ~{}", outline_count);

                    let indented_lines = result.content.lines().filter(|l| l.starts_with(" ")).count();
                    println!(" Nested items: {}", indented_lines);

                    println!();
                }
                Err(e) => {
                    println!("✗ {} - Error: {:?}", opml_file, e);
                    println!();
                }
            },
            Err(e) => {
                println!("✗ {} - Read Error: {:?}", opml_file, e);
                println!();
            }
        }
    }

    println!("╔════════════════════════════════════════════════════════════╗");
    println!("║ Summary Statistics ║");
    println!("╠════════════════════════════════════════════════════════════╣");
    println!("║ Total files processed: {:44} ║", total_files);
    println!("║ Total content bytes: {:44} ║", total_content_bytes);
    println!("║ Total metadata fields: {:44} ║", total_metadata_fields);
    // `checked_div` yields `None` when no file was processed, so the averages fall back to 0.
    println!(
        "║ Average content size: {:44} ║",
        total_content_bytes.checked_div(total_files).unwrap_or(0)
    );
    println!(
        "║ Average metadata/file: {:44} ║",
        total_metadata_fields.checked_div(total_files).unwrap_or(0)
    );
    println!("╚════════════════════════════════════════════════════════════╝\n");

    println!("✅ OPML extraction statistics generated successfully!");
}