This commit is contained in:
275
crates/kreuzberg/tests/epub_native_extractor_tests.rs
Normal file
275
crates/kreuzberg/tests/epub_native_extractor_tests.rs
Normal file
@@ -0,0 +1,275 @@
|
||||
//! Integration tests for the native EPUB extractor
//!
//! These tests validate the native Rust EPUB extractor (EpubExtractor),
//! which uses zip + roxmltree + html-to-markdown-rs (permissive licenses).
//!
//! This test suite verifies the two-pass OPF parsing fix for a bug in which
//! single-pass manifest/spine resolution caused 99.84% content loss.
|
||||
|
||||
#![cfg(feature = "office")]
|
||||
|
||||
use kreuzberg::core::config::ExtractionConfig;
|
||||
use kreuzberg::extraction::derive::derive_extraction_result;
|
||||
use kreuzberg::extractors::EpubExtractor;
|
||||
use kreuzberg::plugins::DocumentExtractor;
|
||||
use std::path::PathBuf;
|
||||
|
||||
/// Helper to resolve workspace root and construct test file paths
|
||||
fn get_test_epub_path(filename: &str) -> PathBuf {
|
||||
let workspace_root = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
|
||||
.parent()
|
||||
.expect("Operation failed")
|
||||
.parent()
|
||||
.expect("Operation failed");
|
||||
workspace_root.join(format!("test_documents/epub/{}", filename))
|
||||
}
|
||||
|
||||
/// Test 1: Basic EPUB extraction - wasteland.epub
|
||||
///
|
||||
/// Validates:
|
||||
/// - Two-pass OPF parsing works correctly
|
||||
/// - Manifest is fully populated before spine resolution
|
||||
/// - Content is extracted successfully (>2000 bytes expected)
|
||||
/// - Metadata is extracted correctly
|
||||
#[tokio::test]
|
||||
async fn test_native_epub_wasteland_extraction() {
|
||||
let test_file = get_test_epub_path("wasteland.epub");
|
||||
if !test_file.exists() {
|
||||
println!("Skipping test: Test file not found at {:?}", test_file);
|
||||
return;
|
||||
}
|
||||
|
||||
let bytes = std::fs::read(&test_file).expect("Failed to read wasteland.epub");
|
||||
let extractor = EpubExtractor;
|
||||
let config = ExtractionConfig::default();
|
||||
|
||||
let doc = extractor
|
||||
.extract_bytes(&bytes, "application/epub+zip", &config)
|
||||
.await
|
||||
.expect("Should extract wasteland.epub successfully");
|
||||
let result = derive_extraction_result(doc, false, kreuzberg::OutputFormat::Plain);
|
||||
|
||||
assert!(
|
||||
result.content.len() > 2000,
|
||||
"Should extract substantial content from Wasteland, got {} bytes",
|
||||
result.content.len()
|
||||
);
|
||||
|
||||
assert!(result.metadata.title.is_some(), "Should extract title metadata");
|
||||
assert_eq!(
|
||||
result.metadata.title.as_deref(),
|
||||
Some("The Waste Land"),
|
||||
"Should have correct title"
|
||||
);
|
||||
|
||||
assert!(result.metadata.authors.is_some(), "Should extract creator metadata");
|
||||
|
||||
assert!(
|
||||
result.content.contains("April") || result.content.contains("cruellest"),
|
||||
"Should contain key phrases from The Waste Land"
|
||||
);
|
||||
|
||||
println!("✅ Wasteland extraction test passed ({} bytes)", result.content.len());
|
||||
}
|
||||
|
||||
/// Test 2: EPUB with images - img.epub
|
||||
///
|
||||
/// Validates:
|
||||
/// - EPUB with embedded images extracts successfully
|
||||
/// - Text content is extracted (images are in manifest but not in content)
|
||||
/// - Metadata is extracted
|
||||
#[tokio::test]
|
||||
async fn test_native_epub_images_extraction() {
|
||||
let test_file = get_test_epub_path("img.epub");
|
||||
if !test_file.exists() {
|
||||
println!("Skipping test: Test file not found at {:?}", test_file);
|
||||
return;
|
||||
}
|
||||
|
||||
let bytes = std::fs::read(&test_file).expect("Failed to read img.epub");
|
||||
let extractor = EpubExtractor;
|
||||
let config = ExtractionConfig::default();
|
||||
|
||||
let doc = extractor
|
||||
.extract_bytes(&bytes, "application/epub+zip", &config)
|
||||
.await
|
||||
.expect("Should extract img.epub successfully");
|
||||
let result = derive_extraction_result(doc, false, kreuzberg::OutputFormat::Plain);
|
||||
|
||||
assert!(
|
||||
result.content.len() > 50,
|
||||
"Should extract text content from EPUB with images, got {} bytes",
|
||||
result.content.len()
|
||||
);
|
||||
|
||||
assert!(result.metadata.title.is_some(), "Should extract title metadata");
|
||||
|
||||
println!("✅ Images EPUB extraction test passed ({} bytes)", result.content.len());
|
||||
}
|
||||
|
||||
/// Test 3: Features EPUB - features.epub
|
||||
///
|
||||
/// Validates:
|
||||
/// - Complex EPUB3 features document extracts successfully
|
||||
/// - Multiple chapters/sections are extracted (not just first)
|
||||
/// - Substantial content is present (>1000 bytes)
|
||||
#[tokio::test]
|
||||
async fn test_native_epub_features_extraction() {
|
||||
let test_file = get_test_epub_path("features.epub");
|
||||
if !test_file.exists() {
|
||||
println!("Skipping test: Test file not found at {:?}", test_file);
|
||||
return;
|
||||
}
|
||||
|
||||
let bytes = std::fs::read(&test_file).expect("Failed to read features.epub");
|
||||
let extractor = EpubExtractor;
|
||||
let config = ExtractionConfig::default();
|
||||
|
||||
let doc = extractor
|
||||
.extract_bytes(&bytes, "application/epub+zip", &config)
|
||||
.await
|
||||
.expect("Should extract features.epub successfully");
|
||||
let result = derive_extraction_result(doc, false, kreuzberg::OutputFormat::Plain);
|
||||
|
||||
assert!(
|
||||
result.content.len() > 1000,
|
||||
"CRITICAL: Should extract from ALL chapters, got only {} bytes. \
|
||||
This indicates the two-pass bug is not fixed!",
|
||||
result.content.len()
|
||||
);
|
||||
|
||||
println!(
|
||||
"✅ Features EPUB extraction test passed ({} bytes)",
|
||||
result.content.len()
|
||||
);
|
||||
}
|
||||
|
||||
/// Test 4: EPUB2 with cover - epub2_cover.epub
|
||||
///
|
||||
/// Validates:
|
||||
/// - EPUB2 format is supported
|
||||
/// - Cover handling works correctly
|
||||
/// - Content and metadata extracted
|
||||
#[tokio::test]
|
||||
async fn test_native_epub2_cover_extraction() {
|
||||
let test_file = get_test_epub_path("epub2_cover.epub");
|
||||
if !test_file.exists() {
|
||||
println!("Skipping test: Test file not found at {:?}", test_file);
|
||||
return;
|
||||
}
|
||||
|
||||
let bytes = std::fs::read(&test_file).expect("Failed to read epub2_cover.epub");
|
||||
let extractor = EpubExtractor;
|
||||
let config = ExtractionConfig::default();
|
||||
|
||||
let doc = extractor
|
||||
.extract_bytes(&bytes, "application/epub+zip", &config)
|
||||
.await
|
||||
.expect("Should extract epub2_cover.epub successfully");
|
||||
let result = derive_extraction_result(doc, false, kreuzberg::OutputFormat::Plain);
|
||||
|
||||
assert!(
|
||||
result.content.len() > 10,
|
||||
"Should extract content from EPUB2 with cover, got {} bytes",
|
||||
result.content.len()
|
||||
);
|
||||
|
||||
assert_eq!(
|
||||
result.metadata.title.as_deref(),
|
||||
Some("Pandoc EPUB Test"),
|
||||
"Should have correct title"
|
||||
);
|
||||
|
||||
println!("✅ EPUB2 cover extraction test passed ({} bytes)", result.content.len());
|
||||
}
|
||||
|
||||
/// Test 5: Deterministic extraction
|
||||
///
|
||||
/// Validates:
|
||||
/// - Same input produces same output (no randomness)
|
||||
/// - Extraction is stable and reproducible
|
||||
#[tokio::test]
|
||||
async fn test_native_epub_deterministic_extraction() {
|
||||
let test_file = get_test_epub_path("features.epub");
|
||||
if !test_file.exists() {
|
||||
println!("Skipping test: Test file not found at {:?}", test_file);
|
||||
return;
|
||||
}
|
||||
|
||||
let bytes = std::fs::read(&test_file).expect("Failed to read features.epub");
|
||||
let extractor = EpubExtractor;
|
||||
let config = ExtractionConfig::default();
|
||||
|
||||
let doc_result1 = extractor
|
||||
.extract_bytes(&bytes, "application/epub+zip", &config)
|
||||
.await
|
||||
.expect("First extraction should succeed");
|
||||
let result1 = derive_extraction_result(doc_result1, false, kreuzberg::OutputFormat::Plain);
|
||||
|
||||
let doc_result2 = extractor
|
||||
.extract_bytes(&bytes, "application/epub+zip", &config)
|
||||
.await
|
||||
.expect("Second extraction should succeed");
|
||||
let result2 = derive_extraction_result(doc_result2, false, kreuzberg::OutputFormat::Plain);
|
||||
|
||||
assert_eq!(
|
||||
result1.content, result2.content,
|
||||
"Extraction should be deterministic - same input should produce same output"
|
||||
);
|
||||
|
||||
assert_eq!(
|
||||
result1.metadata.additional, result2.metadata.additional,
|
||||
"Metadata extraction should be deterministic"
|
||||
);
|
||||
|
||||
println!("✅ Deterministic extraction test passed");
|
||||
}
|
||||
|
||||
/// Test 6: No content loss across multiple EPUBs
|
||||
///
|
||||
/// Validates:
|
||||
/// - All test EPUB files extract successfully
|
||||
/// - No file has empty or nearly-empty content
|
||||
/// - Bug causing 99.84% content loss is fixed
|
||||
#[tokio::test]
|
||||
async fn test_native_epub_no_content_loss() {
|
||||
let epub_files = vec![
|
||||
("epub2_cover.epub", 10),
|
||||
// This fixture contains a single title line in one XHTML body document.
|
||||
("epub2_no_cover.epub", 10),
|
||||
("img.epub", 50),
|
||||
("features.epub", 1000),
|
||||
("wasteland.epub", 2000),
|
||||
];
|
||||
|
||||
let extractor = EpubExtractor;
|
||||
let config = ExtractionConfig::default();
|
||||
|
||||
for (epub_file, min_bytes) in epub_files {
|
||||
let test_file = get_test_epub_path(epub_file);
|
||||
if !test_file.exists() {
|
||||
println!("⚠ Skipping {}: not found", epub_file);
|
||||
continue;
|
||||
}
|
||||
|
||||
let bytes = std::fs::read(&test_file).unwrap_or_else(|_| panic!("Failed to read {}", epub_file));
|
||||
|
||||
let doc = extractor
|
||||
.extract_bytes(&bytes, "application/epub+zip", &config)
|
||||
.await
|
||||
.unwrap_or_else(|_| panic!("Should extract {}", epub_file));
|
||||
let result = derive_extraction_result(doc, false, kreuzberg::OutputFormat::Plain);
|
||||
|
||||
assert!(
|
||||
result.content.len() >= min_bytes,
|
||||
"CRITICAL: {} extracted only {} bytes (expected >= {}). Content loss bug?",
|
||||
epub_file,
|
||||
result.content.len(),
|
||||
min_bytes
|
||||
);
|
||||
|
||||
println!("✓ {} - {} bytes extracted", epub_file, result.content.len());
|
||||
}
|
||||
|
||||
println!("✅ All EPUBs extracted successfully - no content loss!");
|
||||
}
|
||||
Reference in New Issue
Block a user