Nomad changes

2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions
--- a/crates/kreuzberg/tests/epub_native_extractor_tests.rs
+++ b/crates/kreuzberg/tests/epub_native_extractor_tests.rs
@@ -0,0 +1,275 @@
+//! Integration tests for the native EPUB extractor
+//!
+//! These tests validate the native Rust EPUB extractor (EpubExtractor)
+//! which uses zip + roxmltree + html-to-markdown-rs (permissive licenses).
+//!
+//! This test suite verifies the fix for the two-pass OPF parsing bug that
+//! caused 99.84% content loss due to single-pass manifest/spine resolution.
+
+#![cfg(feature = "office")]
+
+use kreuzberg::core::config::ExtractionConfig;
+use kreuzberg::extraction::derive::derive_extraction_result;
+use kreuzberg::extractors::EpubExtractor;
+use kreuzberg::plugins::DocumentExtractor;
+use std::path::PathBuf;
+
+/// Returns `<workspace root>/test_documents/epub/<filename>` (root = manifest dir's grandparent).
+fn get_test_epub_path(filename: &str) -> PathBuf {
+    let workspace_root = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
+        .parent()
+        .expect("CARGO_MANIFEST_DIR should have a parent directory")
+        .parent()
+        .expect("parent of CARGO_MANIFEST_DIR should itself have a parent (workspace root)");
+    workspace_root.join("test_documents").join("epub").join(filename)
+}
+
+/// Test 1: wasteland.epub — baseline extraction
+///
+/// Checks that:
+/// - extraction succeeds and the plain-text output exceeds 2000 bytes
+/// - the title metadata is exactly "The Waste Land"
+/// - author metadata is present
+/// - the text contains "April" or "cruellest"
+#[tokio::test]
+async fn test_native_epub_wasteland_extraction() {
+    let epub_path = get_test_epub_path("wasteland.epub");
+    if !epub_path.exists() {
+        println!("Skipping test: Test file not found at {:?}", epub_path);
+        return;
+    }
+
+    let raw = std::fs::read(&epub_path).expect("Failed to read wasteland.epub");
+    let cfg = ExtractionConfig::default();
+    let epub_extractor = EpubExtractor;
+
+    let parsed = epub_extractor
+        .extract_bytes(&raw, "application/epub+zip", &cfg)
+        .await
+        .expect("Should extract wasteland.epub successfully");
+    let extracted = derive_extraction_result(parsed, false, kreuzberg::OutputFormat::Plain);
+    let text = &extracted.content;
+    let meta = &extracted.metadata;
+
+    // Size check: the output must be larger than 2000 bytes.
+    assert!(
+        text.len() > 2000,
+        "Should extract substantial content from Wasteland, got {} bytes",
+        text.len()
+    );
+
+    // Metadata: title must be present and exact; authors only need to exist.
+    assert!(meta.title.is_some(), "Should extract title metadata");
+    let title = meta.title.as_deref();
+    assert_eq!(title, Some("The Waste Land"), "Should have correct title");
+
+    assert!(meta.authors.is_some(), "Should extract creator metadata");
+
+    // Spot-check the text for either of two expected words.
+    let has_key_phrase = ["April", "cruellest"].iter().any(|p| text.contains(*p));
+    assert!(has_key_phrase, "Should contain key phrases from The Waste Land");
+
+    println!("✅ Wasteland extraction test passed ({} bytes)", text.len());
+}
+
+/// Test 2: img.epub — EPUB containing images
+///
+/// Checks that:
+/// - extraction of img.epub succeeds
+/// - more than 50 bytes of text come out
+/// - a title is present in the metadata
+#[tokio::test]
+async fn test_native_epub_images_extraction() {
+    let epub_path = get_test_epub_path("img.epub");
+    if !epub_path.exists() {
+        println!("Skipping test: Test file not found at {:?}", epub_path);
+        return;
+    }
+
+    let raw = std::fs::read(&epub_path).expect("Failed to read img.epub");
+    let cfg = ExtractionConfig::default();
+    let epub_extractor = EpubExtractor;
+
+    let parsed = epub_extractor
+        .extract_bytes(&raw, "application/epub+zip", &cfg)
+        .await
+        .expect("Should extract img.epub successfully");
+    let extracted = derive_extraction_result(parsed, false, kreuzberg::OutputFormat::Plain);
+    let extracted_len = extracted.content.len();
+
+    assert!(
+        extracted_len > 50,
+        "Should extract text content from EPUB with images, got {} bytes",
+        extracted_len
+    );
+    assert!(extracted.metadata.title.is_some(), "Should extract title metadata");
+
+    println!("✅ Images EPUB extraction test passed ({} bytes)", extracted_len);
+}
+
+/// Test 3: features.epub — EPUB3 features document
+///
+/// Checks that:
+/// - extraction of features.epub succeeds
+/// - the output exceeds 1000 bytes; the failure message treats a smaller
+///   result as a sign that only part of the book was extracted
+#[tokio::test]
+async fn test_native_epub_features_extraction() {
+    let epub_path = get_test_epub_path("features.epub");
+    if !epub_path.exists() {
+        println!("Skipping test: Test file not found at {:?}", epub_path);
+        return;
+    }
+
+    let raw = std::fs::read(&epub_path).expect("Failed to read features.epub");
+    let cfg = ExtractionConfig::default();
+    let epub_extractor = EpubExtractor;
+
+    // Run the extractor on the in-memory bytes.
+    let parsed = epub_extractor
+        .extract_bytes(&raw, "application/epub+zip", &cfg)
+        .await
+        .expect("Should extract features.epub successfully");
+    let extracted = derive_extraction_result(parsed, false, kreuzberg::OutputFormat::Plain);
+    let extracted_len = extracted.content.len();
+
+    // A short result here is the symptom this suite was written to catch.
+    assert!(
+        extracted_len > 1000,
+        "CRITICAL: Should extract from ALL chapters, got only {} bytes. \
+         This indicates the two-pass bug is not fixed!",
+        extracted_len
+    );
+
+    println!("✅ Features EPUB extraction test passed ({} bytes)", extracted_len);
+}
+
+/// Test 4: epub2_cover.epub — EPUB2 file with a cover
+///
+/// Checks that:
+/// - extraction of epub2_cover.epub succeeds
+/// - more than 10 bytes of text come out
+/// - the title metadata is exactly "Pandoc EPUB Test"
+#[tokio::test]
+async fn test_native_epub2_cover_extraction() {
+    let epub_path = get_test_epub_path("epub2_cover.epub");
+    if !epub_path.exists() {
+        println!("Skipping test: Test file not found at {:?}", epub_path);
+        return;
+    }
+
+    let raw = std::fs::read(&epub_path).expect("Failed to read epub2_cover.epub");
+    let cfg = ExtractionConfig::default();
+    let epub_extractor = EpubExtractor;
+
+    let parsed = epub_extractor
+        .extract_bytes(&raw, "application/epub+zip", &cfg)
+        .await
+        .expect("Should extract epub2_cover.epub successfully");
+    let extracted = derive_extraction_result(parsed, false, kreuzberg::OutputFormat::Plain);
+    let extracted_len = extracted.content.len();
+
+    // Only a minimal amount of text (more than 10 bytes) is required here.
+    assert!(
+        extracted_len > 10,
+        "Should extract content from EPUB2 with cover, got {} bytes",
+        extracted_len
+    );
+
+    // The title must match the expected string exactly.
+    let title = extracted.metadata.title.as_deref();
+    assert_eq!(title, Some("Pandoc EPUB Test"), "Should have correct title");
+
+    println!("✅ EPUB2 cover extraction test passed ({} bytes)", extracted_len);
+}
+
+/// Test 5: repeated extraction is stable
+///
+/// Runs the extractor twice over the same features.epub bytes and checks:
+/// - both runs produce equal content
+/// - both runs produce equal `metadata.additional`
+#[tokio::test]
+async fn test_native_epub_deterministic_extraction() {
+    let epub_path = get_test_epub_path("features.epub");
+    if !epub_path.exists() {
+        println!("Skipping test: Test file not found at {:?}", epub_path);
+        return;
+    }
+
+    let raw = std::fs::read(&epub_path).expect("Failed to read features.epub");
+    let cfg = ExtractionConfig::default();
+    let epub_extractor = EpubExtractor;
+
+    let first_doc = epub_extractor
+        .extract_bytes(&raw, "application/epub+zip", &cfg)
+        .await
+        .expect("First extraction should succeed");
+    let first = derive_extraction_result(first_doc, false, kreuzberg::OutputFormat::Plain);
+
+    // Second pass over the very same bytes, extractor, and config.
+    let second_doc = epub_extractor
+        .extract_bytes(&raw, "application/epub+zip", &cfg)
+        .await
+        .expect("Second extraction should succeed");
+    let second = derive_extraction_result(second_doc, false, kreuzberg::OutputFormat::Plain);
+
+    assert_eq!(
+        first.content, second.content,
+        "Extraction should be deterministic - same input should produce same output"
+    );
+    assert_eq!(
+        first.metadata.additional, second.metadata.additional,
+        "Metadata extraction should be deterministic"
+    );
+
+    println!("✅ Deterministic extraction test passed");
+}
+
+/// Test 6: No content loss across multiple EPUBs
+///
+/// For every listed fixture that exists on disk, extraction must succeed and
+/// yield at least the paired minimum number of bytes; missing fixtures are
+/// skipped. Read and extraction failures panic with the fixture name and the
+/// underlying error so the cause is visible in the test output.
+#[tokio::test]
+async fn test_native_epub_no_content_loss() {
+    let epub_files = [
+        ("epub2_cover.epub", 10),
+        // This fixture contains a single title line in one XHTML body document.
+        ("epub2_no_cover.epub", 10),
+        ("img.epub", 50),
+        ("features.epub", 1000),
+        ("wasteland.epub", 2000),
+    ];
+
+    let extractor = EpubExtractor;
+    let config = ExtractionConfig::default();
+
+    for (epub_file, min_bytes) in epub_files {
+        let test_file = get_test_epub_path(epub_file);
+        if !test_file.exists() {
+            println!("⚠ Skipping {}: not found", epub_file);
+            continue;
+        }
+
+        let bytes = std::fs::read(&test_file)
+            .unwrap_or_else(|err| panic!("Failed to read {}: {}", epub_file, err));
+        let doc = extractor
+            .extract_bytes(&bytes, "application/epub+zip", &config)
+            .await
+            .unwrap_or_else(|err| panic!("Should extract {}: {:?}", epub_file, err));
+        let result = derive_extraction_result(doc, false, kreuzberg::OutputFormat::Plain);
+
+        assert!(
+            result.content.len() >= min_bytes,
+            "CRITICAL: {} extracted only {} bytes (expected >= {}). Content loss bug?",
+            epub_file,
+            result.content.len(),
+            min_bytes
+        );
+
+        println!("✓ {} - {} bytes extracted", epub_file, result.content.len());
+    }
+
+    println!("✅ All EPUBs extracted successfully - no content loss!");
+}