604 lines
22 KiB
Rust
604 lines
22 KiB
Rust
|
|
//! Comprehensive TDD test suite for OPML (Outline Processor Markup Language) extraction
//!
//! This test suite validates OPML extraction capabilities.
//! Each test extracts an OPML file and validates:
//!
//! - Metadata extraction (title, dateCreated, dateModified, ownerName, ownerEmail)
//! - Outline hierarchy extraction with proper indentation
//! - RSS feed attribute handling (xmlUrl, htmlUrl)
//! - Content structure preservation
//! - Special character handling
//! - Edge cases (empty bodies, nested structures, etc.)
|
||
|
|
|
||
|
|
#![cfg(feature = "office")]
|
||
|
|
|
||
|
|
use kreuzberg::core::config::ExtractionConfig;
|
||
|
|
use kreuzberg::core::extractor::extract_bytes;
|
||
|
|
use std::path::PathBuf;
|
||
|
|
|
||
|
|
mod helpers;
|
||
|
|
|
||
|
|
/// Helper to resolve workspace root and construct test file paths
|
||
|
|
fn get_test_opml_path(filename: &str) -> PathBuf {
|
||
|
|
let workspace_root = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
|
||
|
|
.parent()
|
||
|
|
.expect("Operation failed")
|
||
|
|
.parent()
|
||
|
|
.expect("Operation failed");
|
||
|
|
workspace_root.join(format!("test_documents/opml/{}", filename))
|
||
|
|
}
|
||
|
|
|
||
|
|
/// Asserts that `content` contains `needle`, ignoring case.
///
/// Both strings are lowercased before the substring check. On failure the
/// panic message includes `needle`, `description`, and a preview of at most
/// the first 300 bytes of `content`.
///
/// # Panics
///
/// Panics when the lowercased `needle` does not occur in the lowercased
/// `content`.
fn assert_contains_ci(content: &str, needle: &str, description: &str) {
    // Back off to the nearest character boundary: slicing at a fixed byte
    // offset panics when that offset lands inside a multi-byte UTF-8 sequence,
    // which would replace the real assertion message with a slicing error.
    let mut preview_end = content.len().min(300);
    while !content.is_char_boundary(preview_end) {
        preview_end -= 1;
    }

    assert!(
        content.to_lowercase().contains(&needle.to_lowercase()),
        "Content should contain '{}' ({}). Content: {}",
        needle,
        description,
        &content[..preview_end]
    );
}
|
||
|
|
|
||
|
|
/// Asserts that `needle` does not occur in `content`, ignoring case.
///
/// Both strings are lowercased before the substring check.
///
/// # Panics
///
/// Panics when the lowercased `needle` is found in the lowercased `content`;
/// the message includes `needle` and `description`.
// NOTE(review): no caller is visible in this file, hence the lint allowance.
#[allow(dead_code)]
fn assert_not_contains_ci(content: &str, needle: &str, description: &str) {
    let haystack = content.to_lowercase();
    let unwanted = needle.to_lowercase();
    let is_present = haystack.contains(&unwanted);

    assert!(
        !is_present,
        "Content should NOT contain '{}' ({})",
        needle, description
    );
}
|
||
|
|
|
||
|
|
/// Test 1: Extract RSS feed subscription list with categories
|
||
|
|
///
|
||
|
|
/// Validates:
|
||
|
|
/// - Successfully extracts feeds.opml with RSS feed structure
|
||
|
|
/// - Extracts Dublin Core metadata (title, dateCreated, dateModified, ownerName, ownerEmail)
|
||
|
|
/// - Content includes all feed categories and feed names
|
||
|
|
/// - Feed URLs are captured in output
|
||
|
|
/// - Hierarchy structure is preserved with proper nesting
|
||
|
|
#[tokio::test]
|
||
|
|
async fn test_opml_rss_feeds_extraction() {
|
||
|
|
let test_file = get_test_opml_path("feeds.opml");
|
||
|
|
if !test_file.exists() {
|
||
|
|
println!("Skipping test: Test file not found at {:?}", test_file);
|
||
|
|
return;
|
||
|
|
}
|
||
|
|
|
||
|
|
let content = std::fs::read(&test_file).expect("Should read OPML file");
|
||
|
|
let result = extract_bytes(&content, "text/x-opml", &ExtractionConfig::default())
|
||
|
|
.await
|
||
|
|
.expect("Should extract RSS feeds OPML successfully");
|
||
|
|
|
||
|
|
assert!(
|
||
|
|
!result.content.is_empty(),
|
||
|
|
"Content should not be empty for RSS feeds OPML"
|
||
|
|
);
|
||
|
|
|
||
|
|
assert_contains_ci(&result.content, "Technology", "Should contain Technology category");
|
||
|
|
assert_contains_ci(&result.content, "Programming", "Should contain Programming category");
|
||
|
|
assert_contains_ci(
|
||
|
|
&result.content,
|
||
|
|
"Uncategorized",
|
||
|
|
"Should contain Uncategorized category",
|
||
|
|
);
|
||
|
|
|
||
|
|
assert_contains_ci(&result.content, "Hacker News", "Should contain Hacker News feed");
|
||
|
|
assert_contains_ci(&result.content, "TechCrunch", "Should contain TechCrunch feed");
|
||
|
|
assert_contains_ci(&result.content, "Rust Blog", "Should contain Rust Blog feed");
|
||
|
|
|
||
|
|
assert!(result.metadata.title.is_some(), "Should extract title metadata");
|
||
|
|
assert_eq!(
|
||
|
|
result.metadata.title.as_deref(),
|
||
|
|
Some("Tech News Feeds"),
|
||
|
|
"Should have correct title"
|
||
|
|
);
|
||
|
|
|
||
|
|
let has_owner = result.metadata.created_by.is_some() || result.metadata.additional.contains_key("ownerEmail");
|
||
|
|
assert!(has_owner, "Should extract owner information");
|
||
|
|
|
||
|
|
println!("✅ RSS feeds extraction test passed!");
|
||
|
|
println!(" Found {} metadata fields", result.metadata.additional.len());
|
||
|
|
println!(" Content length: {} bytes", result.content.len());
|
||
|
|
}
|
||
|
|
|
||
|
|
/// Test 2: Extract podcast directory with multiple categories
|
||
|
|
///
|
||
|
|
/// Validates:
|
||
|
|
/// - Successfully extracts podcasts.opml with podcast structure
|
||
|
|
/// - Extracts title and metadata fields
|
||
|
|
/// - Content includes all podcast categories
|
||
|
|
/// - Podcast feed names are properly extracted
|
||
|
|
/// - Handles HTML entity encoding (&)
|
||
|
|
/// - Complex hierarchy is preserved
|
||
|
|
#[tokio::test]
|
||
|
|
async fn test_opml_podcast_directory_extraction() {
|
||
|
|
let test_file = get_test_opml_path("podcasts.opml");
|
||
|
|
if !test_file.exists() {
|
||
|
|
println!("Skipping test: Test file not found at {:?}", test_file);
|
||
|
|
return;
|
||
|
|
}
|
||
|
|
|
||
|
|
let content = std::fs::read(&test_file).expect("Should read OPML file");
|
||
|
|
let result = extract_bytes(&content, "text/x-opml", &ExtractionConfig::default())
|
||
|
|
.await
|
||
|
|
.expect("Should extract podcast directory OPML successfully");
|
||
|
|
|
||
|
|
assert!(
|
||
|
|
!result.content.is_empty(),
|
||
|
|
"Content should not be empty for podcast OPML"
|
||
|
|
);
|
||
|
|
|
||
|
|
assert_contains_ci(
|
||
|
|
&result.content,
|
||
|
|
"Technology Podcasts",
|
||
|
|
"Should contain Technology Podcasts category",
|
||
|
|
);
|
||
|
|
assert_contains_ci(&result.content, "Business", "Should contain Business category");
|
||
|
|
assert_contains_ci(&result.content, "Science", "Should contain Science category");
|
||
|
|
|
||
|
|
assert_contains_ci(&result.content, "Syntax", "Should contain Syntax podcast");
|
||
|
|
assert_contains_ci(&result.content, "Acquired", "Should contain Acquired podcast");
|
||
|
|
|
||
|
|
assert_eq!(
|
||
|
|
result.metadata.title.as_deref(),
|
||
|
|
Some("Podcast Directory"),
|
||
|
|
"Should have correct title"
|
||
|
|
);
|
||
|
|
|
||
|
|
assert_eq!(
|
||
|
|
result.metadata.created_by.as_deref(),
|
||
|
|
Some("Jane Doe"),
|
||
|
|
"Should extract owner name correctly"
|
||
|
|
);
|
||
|
|
|
||
|
|
println!("✅ Podcast directory extraction test passed!");
|
||
|
|
println!(" Found {} metadata fields", result.metadata.additional.len());
|
||
|
|
println!(" Content length: {} bytes", result.content.len());
|
||
|
|
}
|
||
|
|
|
||
|
|
/// Test 3: Extract general outline structure with deep nesting
|
||
|
|
///
|
||
|
|
/// Validates:
|
||
|
|
/// - Successfully extracts outline.opml with project structure
|
||
|
|
/// - Preserves hierarchy with proper indentation
|
||
|
|
/// - Handles multi-level nesting (4 levels deep)
|
||
|
|
/// - Extracts all task items in correct order
|
||
|
|
/// - Metadata is properly extracted
|
||
|
|
/// - Content structure matches expected outline format
|
||
|
|
#[tokio::test]
|
||
|
|
async fn test_opml_outline_hierarchy_extraction() {
|
||
|
|
let test_file = get_test_opml_path("outline.opml");
|
||
|
|
if !test_file.exists() {
|
||
|
|
println!("Skipping test: Test file not found at {:?}", test_file);
|
||
|
|
return;
|
||
|
|
}
|
||
|
|
|
||
|
|
let content = std::fs::read(&test_file).expect("Should read OPML file");
|
||
|
|
let result = extract_bytes(&content, "text/x-opml", &ExtractionConfig::default())
|
||
|
|
.await
|
||
|
|
.expect("Should extract outline OPML successfully");
|
||
|
|
|
||
|
|
assert!(
|
||
|
|
!result.content.is_empty(),
|
||
|
|
"Content should not be empty for outline OPML"
|
||
|
|
);
|
||
|
|
|
||
|
|
assert_contains_ci(&result.content, "Project Alpha", "Should contain main project");
|
||
|
|
|
||
|
|
assert_contains_ci(&result.content, "Phase 1", "Should contain Phase 1");
|
||
|
|
assert_contains_ci(&result.content, "Phase 2", "Should contain Phase 2");
|
||
|
|
assert_contains_ci(&result.content, "Phase 3", "Should contain Phase 3");
|
||
|
|
assert_contains_ci(&result.content, "Phase 4", "Should contain Phase 4");
|
||
|
|
|
||
|
|
assert_contains_ci(
|
||
|
|
&result.content,
|
||
|
|
"Requirements gathering",
|
||
|
|
"Should contain Phase 1 tasks",
|
||
|
|
);
|
||
|
|
assert_contains_ci(&result.content, "Resource allocation", "Should contain Phase 1 tasks");
|
||
|
|
|
||
|
|
assert_contains_ci(
|
||
|
|
&result.content,
|
||
|
|
"Backend implementation",
|
||
|
|
"Should contain Phase 2 backend task",
|
||
|
|
);
|
||
|
|
assert_contains_ci(
|
||
|
|
&result.content,
|
||
|
|
"Frontend implementation",
|
||
|
|
"Should contain Phase 2 frontend task",
|
||
|
|
);
|
||
|
|
|
||
|
|
assert_contains_ci(&result.content, "Unit testing", "Should contain Phase 3 testing task");
|
||
|
|
assert_contains_ci(
|
||
|
|
&result.content,
|
||
|
|
"Production setup",
|
||
|
|
"Should contain Phase 4 deployment task",
|
||
|
|
);
|
||
|
|
|
||
|
|
assert_eq!(
|
||
|
|
result.metadata.title.as_deref(),
|
||
|
|
Some("Project Outline"),
|
||
|
|
"Should have correct title"
|
||
|
|
);
|
||
|
|
|
||
|
|
assert!(
|
||
|
|
!result.content.is_empty(),
|
||
|
|
"Content should have nested items rendered (as headings)"
|
||
|
|
);
|
||
|
|
|
||
|
|
println!("✅ Outline hierarchy extraction test passed!");
|
||
|
|
println!(" Content length: {} bytes", result.content.len());
|
||
|
|
println!(" Hierarchy levels preserved with indentation");
|
||
|
|
}
|
||
|
|
|
||
|
|
/// Test 4: Comprehensive metadata extraction from head section
|
||
|
|
///
|
||
|
|
/// Validates:
|
||
|
|
/// - All head metadata fields are extracted (title, dateCreated, dateModified, ownerName, ownerEmail)
|
||
|
|
/// - Metadata values are correctly typed and encoded
|
||
|
|
/// - Date formats are preserved as-is
|
||
|
|
/// - Owner information is properly extracted
|
||
|
|
/// - Missing optional fields are handled gracefully
|
||
|
|
#[tokio::test]
|
||
|
|
async fn test_opml_metadata_extraction_complete() {
|
||
|
|
let test_file = get_test_opml_path("feeds.opml");
|
||
|
|
if !test_file.exists() {
|
||
|
|
println!("Skipping test: Test file not found at {:?}", test_file);
|
||
|
|
return;
|
||
|
|
}
|
||
|
|
|
||
|
|
let content = std::fs::read(&test_file).expect("Should read OPML file");
|
||
|
|
let result = extract_bytes(&content, "text/x-opml", &ExtractionConfig::default())
|
||
|
|
.await
|
||
|
|
.expect("Should extract metadata successfully");
|
||
|
|
|
||
|
|
let additional = &result.metadata.additional;
|
||
|
|
|
||
|
|
assert!(result.metadata.title.is_some(), "Should have title metadata");
|
||
|
|
assert!(
|
||
|
|
result.metadata.created_at.is_some() || result.metadata.modified_at.is_some(),
|
||
|
|
"Should have at least one date field"
|
||
|
|
);
|
||
|
|
assert!(
|
||
|
|
result.metadata.created_by.is_some() || additional.contains_key("ownerEmail"),
|
||
|
|
"Should have owner information"
|
||
|
|
);
|
||
|
|
|
||
|
|
assert_eq!(
|
||
|
|
result.metadata.title.as_deref(),
|
||
|
|
Some("Tech News Feeds"),
|
||
|
|
"Title should match exactly"
|
||
|
|
);
|
||
|
|
|
||
|
|
if let Some(date_created) = result.metadata.created_at.as_deref() {
|
||
|
|
assert!(
|
||
|
|
date_created.contains("Nov") || date_created.contains("2023"),
|
||
|
|
"Date should be preserved in original format"
|
||
|
|
);
|
||
|
|
}
|
||
|
|
|
||
|
|
assert_eq!(
|
||
|
|
result.metadata.created_by.as_deref(),
|
||
|
|
Some("John Smith"),
|
||
|
|
"Owner name should be extracted"
|
||
|
|
);
|
||
|
|
|
||
|
|
println!("✅ Metadata extraction test passed!");
|
||
|
|
println!(" Metadata fields: {:?}", additional.keys().collect::<Vec<_>>());
|
||
|
|
}
|
||
|
|
|
||
|
|
/// Test 5: Verify RSS feed names are extracted from OPML attributes
|
||
|
|
///
|
||
|
|
/// Validates:
|
||
|
|
/// - Feed names from text attribute are properly extracted
|
||
|
|
/// - Feed categories are preserved in the hierarchy
|
||
|
|
/// - All feed names are present in output
|
||
|
|
/// - Extraction matches Pandoc baseline (no URLs in main content)
|
||
|
|
#[tokio::test]
|
||
|
|
async fn test_opml_feed_url_extraction() {
|
||
|
|
let test_file = get_test_opml_path("feeds.opml");
|
||
|
|
if !test_file.exists() {
|
||
|
|
println!("Skipping test: Test file not found at {:?}", test_file);
|
||
|
|
return;
|
||
|
|
}
|
||
|
|
|
||
|
|
let content = std::fs::read(&test_file).expect("Should read OPML file");
|
||
|
|
let result = extract_bytes(&content, "text/x-opml", &ExtractionConfig::default())
|
||
|
|
.await
|
||
|
|
.expect("Should extract feed names successfully");
|
||
|
|
|
||
|
|
assert_contains_ci(&result.content, "Hacker News", "Should contain Hacker News feed name");
|
||
|
|
assert_contains_ci(&result.content, "TechCrunch", "Should contain TechCrunch feed name");
|
||
|
|
assert_contains_ci(&result.content, "Rust Blog", "Should contain Rust Blog feed name");
|
||
|
|
assert_contains_ci(&result.content, "Dev.to", "Should contain Dev.to feed name");
|
||
|
|
|
||
|
|
assert_contains_ci(&result.content, "Technology", "Should contain Technology category");
|
||
|
|
assert_contains_ci(&result.content, "Programming", "Should contain Programming category");
|
||
|
|
|
||
|
|
println!("✅ Feed extraction test passed!");
|
||
|
|
println!(" Found {} bytes of content", result.content.len());
|
||
|
|
}
|
||
|
|
|
||
|
|
/// Test 6: Verify correct MIME type handling and format detection
|
||
|
|
///
|
||
|
|
/// Validates:
|
||
|
|
/// - MIME type is correctly preserved in result
|
||
|
|
/// - Extractor handles text/x-opml MIME type
|
||
|
|
/// - Content format is appropriate for OPML outline structure
|
||
|
|
/// - Result structure is valid
|
||
|
|
#[tokio::test]
|
||
|
|
async fn test_opml_mime_type_handling() {
|
||
|
|
let test_file = get_test_opml_path("feeds.opml");
|
||
|
|
if !test_file.exists() {
|
||
|
|
println!("Skipping test: Test file not found at {:?}", test_file);
|
||
|
|
return;
|
||
|
|
}
|
||
|
|
|
||
|
|
let content = std::fs::read(&test_file).expect("Should read OPML file");
|
||
|
|
|
||
|
|
let result = extract_bytes(&content, "text/x-opml", &ExtractionConfig::default())
|
||
|
|
.await
|
||
|
|
.expect("Should extract with text/x-opml MIME type");
|
||
|
|
|
||
|
|
assert_eq!(result.mime_type, "text/x-opml", "MIME type should be preserved");
|
||
|
|
|
||
|
|
let result2 = extract_bytes(&content, "application/xml+opml", &ExtractionConfig::default())
|
||
|
|
.await
|
||
|
|
.expect("Should extract with application/xml+opml MIME type");
|
||
|
|
|
||
|
|
assert_eq!(
|
||
|
|
result2.mime_type, "application/xml+opml",
|
||
|
|
"Alternative MIME type should work"
|
||
|
|
);
|
||
|
|
|
||
|
|
assert_eq!(
|
||
|
|
result.content, result2.content,
|
||
|
|
"Content should be same regardless of MIME type"
|
||
|
|
);
|
||
|
|
|
||
|
|
println!("✅ MIME type handling test passed!");
|
||
|
|
}
|
||
|
|
|
||
|
|
/// Test 7: Handle special characters and HTML entities in OPML
|
||
|
|
///
|
||
|
|
/// Validates:
|
||
|
|
/// - HTML entities are properly decoded (&, <, >, etc.)
|
||
|
|
/// - Special characters in feed names are handled correctly
|
||
|
|
/// - Quotes and apostrophes are properly processed
|
||
|
|
/// - UTF-8 content is valid
|
||
|
|
/// - Content is human-readable after extraction
|
||
|
|
#[tokio::test]
|
||
|
|
async fn test_opml_special_characters_handling() {
|
||
|
|
let test_file = get_test_opml_path("podcasts.opml");
|
||
|
|
if !test_file.exists() {
|
||
|
|
println!("Skipping test: Test file not found at {:?}", test_file);
|
||
|
|
return;
|
||
|
|
}
|
||
|
|
|
||
|
|
let content = std::fs::read(&test_file).expect("Should read OPML file");
|
||
|
|
let result = extract_bytes(&content, "text/x-opml", &ExtractionConfig::default())
|
||
|
|
.await
|
||
|
|
.expect("Should extract with special characters");
|
||
|
|
|
||
|
|
assert_contains_ci(
|
||
|
|
&result.content,
|
||
|
|
"Business",
|
||
|
|
"Should properly decode Business & Startups",
|
||
|
|
);
|
||
|
|
|
||
|
|
let _ = result.content.chars().count();
|
||
|
|
|
||
|
|
println!("✅ Special characters handling test passed!");
|
||
|
|
println!(" Verified UTF-8 integrity and entity decoding");
|
||
|
|
}
|
||
|
|
|
||
|
|
/// Test 9: Validate deep nesting and hierarchy preservation in outline.opml
|
||
|
|
///
|
||
|
|
/// Validates:
|
||
|
|
/// - Multi-level nesting (4 levels) is properly preserved
|
||
|
|
/// - Indentation increases with nesting depth
|
||
|
|
/// - All tasks are extracted in correct nesting context
|
||
|
|
/// - Task ordering is preserved
|
||
|
|
/// - Notes & Resources section is captured
|
||
|
|
#[tokio::test]
|
||
|
|
async fn test_opml_deep_nesting_hierarchy() {
|
||
|
|
let test_file = get_test_opml_path("outline.opml");
|
||
|
|
if !test_file.exists() {
|
||
|
|
println!("Skipping test: Test file not found at {:?}", test_file);
|
||
|
|
return;
|
||
|
|
}
|
||
|
|
|
||
|
|
let content = std::fs::read(&test_file).expect("Should read OPML file");
|
||
|
|
let result = extract_bytes(&content, "text/x-opml", &ExtractionConfig::default())
|
||
|
|
.await
|
||
|
|
.expect("Should extract deep nesting successfully");
|
||
|
|
|
||
|
|
let extracted = &result.content;
|
||
|
|
|
||
|
|
let project_pos = extracted.find("Project Alpha").unwrap_or(0);
|
||
|
|
let phase1_pos = extracted.find("Phase 1").unwrap_or(0);
|
||
|
|
let phase2_pos = extracted.find("Phase 2").unwrap_or(0);
|
||
|
|
let phase3_pos = extracted.find("Phase 3").unwrap_or(0);
|
||
|
|
let phase4_pos = extracted.find("Phase 4").unwrap_or(0);
|
||
|
|
|
||
|
|
assert!(
|
||
|
|
project_pos < phase1_pos && phase1_pos < phase2_pos && phase2_pos < phase3_pos && phase3_pos < phase4_pos,
|
||
|
|
"Phases should appear in order in output"
|
||
|
|
);
|
||
|
|
|
||
|
|
assert_contains_ci(extracted, "Phase 1", "Phase 1 should be present");
|
||
|
|
assert_contains_ci(extracted, "Phase 2", "Phase 2 should be present");
|
||
|
|
assert_contains_ci(extracted, "Phase 3", "Phase 3 should be present");
|
||
|
|
assert_contains_ci(extracted, "Phase 4", "Phase 4 should be present");
|
||
|
|
|
||
|
|
assert_contains_ci(extracted, "Notes & Resources", "Notes section should be present");
|
||
|
|
|
||
|
|
println!("✅ Deep nesting hierarchy test passed!");
|
||
|
|
println!(" All phases and tasks extracted in correct order");
|
||
|
|
}
|
||
|
|
|
||
|
|
/// Test 10: Validate content extraction quality and consistency across all OPML files
|
||
|
|
///
|
||
|
|
/// Validates:
|
||
|
|
/// - All OPML files produce non-empty content
|
||
|
|
/// - Content is valid UTF-8 (no corruption)
|
||
|
|
/// - Content doesn't have excessive whitespace
|
||
|
|
/// - Minimum content quality standards
|
||
|
|
/// - Consistent extraction behavior
|
||
|
|
#[tokio::test]
|
||
|
|
async fn test_opml_content_quality_all_files() {
|
||
|
|
let opml_files = vec!["feeds.opml", "podcasts.opml", "outline.opml"];
|
||
|
|
|
||
|
|
for opml_file in opml_files {
|
||
|
|
let test_file = get_test_opml_path(opml_file);
|
||
|
|
if !test_file.exists() {
|
||
|
|
println!("Skipping file: {:?}", test_file);
|
||
|
|
continue;
|
||
|
|
}
|
||
|
|
|
||
|
|
let content = std::fs::read(&test_file).expect("Should read OPML file");
|
||
|
|
let result = extract_bytes(&content, "text/x-opml", &ExtractionConfig::default())
|
||
|
|
.await
|
||
|
|
.unwrap_or_else(|_| panic!("Should extract {}", opml_file));
|
||
|
|
|
||
|
|
assert!(
|
||
|
|
!result.content.is_empty(),
|
||
|
|
"Content should not be empty for {}",
|
||
|
|
opml_file
|
||
|
|
);
|
||
|
|
|
||
|
|
let _ = result.content.chars().count();
|
||
|
|
|
||
|
|
assert!(
|
||
|
|
result.content.len() > 20,
|
||
|
|
"Content should have meaningful length for {}",
|
||
|
|
opml_file
|
||
|
|
);
|
||
|
|
|
||
|
|
let whitespace_ratio =
|
||
|
|
result.content.chars().filter(|c| c.is_whitespace()).count() as f64 / result.content.len() as f64;
|
||
|
|
assert!(
|
||
|
|
whitespace_ratio < 0.5,
|
||
|
|
"Content should not be mostly whitespace for {}",
|
||
|
|
opml_file
|
||
|
|
);
|
||
|
|
|
||
|
|
println!(" ✓ {} ({} bytes) quality validated", opml_file, result.content.len());
|
||
|
|
}
|
||
|
|
|
||
|
|
println!("✅ Content quality validation test passed!");
|
||
|
|
}
|
||
|
|
|
||
|
|
/// Test 11: Verify OPML extractor is properly registered
|
||
|
|
///
|
||
|
|
/// Validates:
|
||
|
|
/// - Extractor is available in the registry
|
||
|
|
#[tokio::test]
|
||
|
|
async fn test_opml_extractor_registration() {
|
||
|
|
use kreuzberg::plugins::registry::get_document_extractor_registry;
|
||
|
|
|
||
|
|
// Trigger initialization via a real extraction call so the registry is populated.
|
||
|
|
let _ = kreuzberg::extract_bytes(
|
||
|
|
b"<opml/>",
|
||
|
|
"text/x-opml",
|
||
|
|
&kreuzberg::core::config::ExtractionConfig::default(),
|
||
|
|
)
|
||
|
|
.await;
|
||
|
|
|
||
|
|
let registry = get_document_extractor_registry();
|
||
|
|
let registry_guard = registry.read();
|
||
|
|
|
||
|
|
let extractor_names = registry_guard.list();
|
||
|
|
|
||
|
|
println!("Available extractors: {:?}", extractor_names);
|
||
|
|
|
||
|
|
assert!(
|
||
|
|
extractor_names.contains(&"opml-extractor".to_string()),
|
||
|
|
"OPML extractor should be registered. Available: {:?}",
|
||
|
|
extractor_names
|
||
|
|
);
|
||
|
|
|
||
|
|
println!("✅ OPML extractor registration test passed!");
|
||
|
|
}
|
||
|
|
|
||
|
|
/// Test 12: Extract all OPML files and generate summary statistics
|
||
|
|
///
|
||
|
|
/// This test runs all OPML extractions and provides comprehensive statistics
|
||
|
|
/// for validation and debugging purposes. It's not a strict pass/fail test
|
||
|
|
/// but provides useful information about extraction behavior.
|
||
|
|
#[tokio::test]
|
||
|
|
async fn test_opml_extraction_statistics() {
|
||
|
|
let opml_files = vec!["feeds.opml", "podcasts.opml", "outline.opml"];
|
||
|
|
|
||
|
|
println!("\n╔════════════════════════════════════════════════════════════╗");
|
||
|
|
println!("║ OPML Extraction Statistics Report ║");
|
||
|
|
println!("╚════════════════════════════════════════════════════════════╝\n");
|
||
|
|
|
||
|
|
let mut total_files = 0;
|
||
|
|
let mut total_content_bytes = 0;
|
||
|
|
let mut total_metadata_fields = 0;
|
||
|
|
|
||
|
|
for opml_file in opml_files {
|
||
|
|
let test_file = get_test_opml_path(opml_file);
|
||
|
|
if !test_file.exists() {
|
||
|
|
println!("⚠ SKIP: {} (not found)", opml_file);
|
||
|
|
continue;
|
||
|
|
}
|
||
|
|
|
||
|
|
match std::fs::read(&test_file) {
|
||
|
|
Ok(content) => match extract_bytes(&content, "text/x-opml", &ExtractionConfig::default()).await {
|
||
|
|
Ok(result) => {
|
||
|
|
total_files += 1;
|
||
|
|
total_content_bytes += result.content.len();
|
||
|
|
total_metadata_fields += result.metadata.additional.len();
|
||
|
|
|
||
|
|
println!("✓ {} ", opml_file);
|
||
|
|
println!(" Content: {} bytes", result.content.len());
|
||
|
|
println!(" Metadata fields: {}", result.metadata.additional.len());
|
||
|
|
|
||
|
|
if !result.metadata.additional.is_empty() {
|
||
|
|
let keys: Vec<String> = result.metadata.additional.keys().map(|k| k.to_string()).collect();
|
||
|
|
println!(" Keys: {}", keys.join(", "));
|
||
|
|
}
|
||
|
|
|
||
|
|
let outline_count = result.content.lines().count();
|
||
|
|
println!(" Outline items: ~{}", outline_count);
|
||
|
|
|
||
|
|
let indented_lines = result.content.lines().filter(|l| l.starts_with(" ")).count();
|
||
|
|
println!(" Nested items: {}", indented_lines);
|
||
|
|
|
||
|
|
println!();
|
||
|
|
}
|
||
|
|
Err(e) => {
|
||
|
|
println!("✗ {} - Error: {:?}", opml_file, e);
|
||
|
|
println!();
|
||
|
|
}
|
||
|
|
},
|
||
|
|
Err(e) => {
|
||
|
|
println!("✗ {} - Read Error: {:?}", opml_file, e);
|
||
|
|
println!();
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
println!("╔════════════════════════════════════════════════════════════╗");
|
||
|
|
println!("║ Summary Statistics ║");
|
||
|
|
println!("╠════════════════════════════════════════════════════════════╣");
|
||
|
|
println!("║ Total files processed: {:44} ║", total_files);
|
||
|
|
println!("║ Total content bytes: {:44} ║", total_content_bytes);
|
||
|
|
println!("║ Total metadata fields: {:44} ║", total_metadata_fields);
|
||
|
|
println!(
|
||
|
|
"║ Average content size: {:44} ║",
|
||
|
|
total_content_bytes.checked_div(total_files).unwrap_or(0)
|
||
|
|
);
|
||
|
|
println!(
|
||
|
|
"║ Average metadata/file: {:44} ║",
|
||
|
|
total_metadata_fields.checked_div(total_files).unwrap_or(0)
|
||
|
|
);
|
||
|
|
println!("╚════════════════════════════════════════════════════════════╝\n");
|
||
|
|
|
||
|
|
println!("✅ OPML extraction statistics generated successfully!");
|
||
|
|
}
|