Nomad changes

2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions
--- a/crates/kreuzberg/tests/format_integration.rs
+++ b/crates/kreuzberg/tests/format_integration.rs
@@ -0,0 +1,165 @@
+//! Binding-specific format integration tests.
+//!
+//! Positive-path scenarios are now covered by the shared fixture-based E2E
+//! suites. The tests here focus on behaviour that is specific to the Rust
+//! asynchronous APIs or to graceful handling when optional system
+//! dependencies are missing.
+
+#![cfg(any(feature = "pdf", feature = "office", feature = "ocr"))]
+
+mod helpers;
+
+use helpers::{assert_mime_type, get_test_file_path, test_documents_available};
+
+#[cfg(any(feature = "office", feature = "ocr"))]
+use helpers::assert_non_empty_content;
+use kreuzberg::core::config::ExtractionConfig;
+use kreuzberg::core::extractor::extract_file;
+
+#[cfg(feature = "ocr")]
+use kreuzberg::core::config::OcrConfig;
+
+/// Runs the async extractor against `pdf/copy_protected.pdf` with the
+/// default configuration.
+///
+/// The test returns early when the shared test documents or the fixture
+/// itself are missing. An extraction error is logged and accepted; a
+/// successful extraction must report the PDF MIME type and carry neither
+/// chunks nor detected languages.
+#[cfg(feature = "pdf")]
+#[tokio::test]
+async fn test_pdf_password_protected_async() {
+    if !test_documents_available() {
+        return;
+    }
+
+    let pdf_path = get_test_file_path("pdf/copy_protected.pdf");
+    if !pdf_path.exists() {
+        tracing::debug!("Skipping test: protected PDF not available");
+        return;
+    }
+
+    let default_config = ExtractionConfig::default();
+
+    // An error is an accepted outcome here, so log it and stop.
+    let extraction = match extract_file(&pdf_path, None, &default_config).await {
+        Ok(extraction) => extraction,
+        Err(err) => {
+            tracing::debug!("Password protection detected (expected): {}", err);
+            return;
+        }
+    };
+
+    tracing::debug!("Protected PDF extracted; some files allow fallback");
+    assert_mime_type(&extraction, "application/pdf");
+    assert!(
+        extraction.chunks.is_none(),
+        "Chunks should be None without chunking config"
+    );
+    assert!(
+        extraction.detected_languages.is_none(),
+        "Language detection not enabled"
+    );
+}
+
+/// Extracts the legacy Word fixture `doc/simple.doc` through the async API
+/// using the default configuration.
+///
+/// The test returns early when the shared test documents or the fixture are
+/// missing. An extraction error is only logged; on success the result must
+/// have the `application/msword` MIME type, non-empty content, and neither
+/// chunks nor detected languages.
+#[cfg(feature = "office")]
+#[tokio::test]
+#[cfg_attr(target_os = "windows", ignore = "Legacy office tests timeout on Windows CI")]
+async fn test_legacy_doc_extraction_async() {
+    if !test_documents_available() {
+        return;
+    }
+
+    let doc_path = get_test_file_path("doc/simple.doc");
+    if !doc_path.exists() {
+        tracing::debug!("Skipping test: legacy .doc file not available");
+        return;
+    }
+
+    let default_config = ExtractionConfig::default();
+
+    // A failed extraction is tolerated: log it and leave the test.
+    let extracted = match extract_file(&doc_path, None, &default_config).await {
+        Err(err) => {
+            tracing::debug!(
+                "Legacy Office extraction failed (office feature may not be enabled): {}",
+                err
+            );
+            return;
+        }
+        Ok(value) => value,
+    };
+
+    assert_mime_type(&extracted, "application/msword");
+    assert_non_empty_content(&extracted);
+
+    let has_chunks = extracted.chunks.is_some();
+    assert!(!has_chunks, "Chunks should be None without chunking config");
+
+    let has_languages = extracted.detected_languages.is_some();
+    assert!(!has_languages, "Language detection not enabled");
+}
+
+/// Forces OCR (backend `tesseract`, language `eng`) on
+/// `images/test_hello_world.png` through the async API.
+///
+/// The test returns early when the shared test documents or the image are
+/// missing. An extraction error is only logged; on success the result must
+/// have the `image/png` MIME type, non-empty content, and contain "hello"
+/// or "world" when compared case-insensitively.
+#[cfg(feature = "ocr")]
+#[tokio::test]
+async fn test_ocr_simple_english_image_async() {
+    if !test_documents_available() {
+        return;
+    }
+
+    let image_path = get_test_file_path("images/test_hello_world.png");
+    if !image_path.exists() {
+        tracing::debug!("Skipping test: OCR sample image not available");
+        return;
+    }
+
+    let ocr_settings = OcrConfig {
+        backend: "tesseract".to_string(),
+        language: "eng".to_string(),
+        ..Default::default()
+    };
+    let config = ExtractionConfig {
+        ocr: Some(ocr_settings),
+        force_ocr: true,
+        ..Default::default()
+    };
+
+    // A failed extraction is tolerated: log it and leave the test.
+    let extraction = match extract_file(&image_path, None, &config).await {
+        Ok(extraction) => extraction,
+        Err(err) => {
+            tracing::debug!("OCR test failed (Tesseract may not be installed): {}", err);
+            return;
+        }
+    };
+
+    assert_mime_type(&extraction, "image/png");
+    assert_non_empty_content(&extraction);
+
+    // Compare in lowercase so the check ignores the casing of the OCR output.
+    let lowered = extraction.content.to_lowercase();
+    let found_keyword = ["hello", "world"]
+        .iter()
+        .any(|keyword| lowered.contains(*keyword));
+    assert!(
+        found_keyword,
+        "OCR output {:?} should contain HELLO or WORLD",
+        extraction.content
+    );
+}
+
+/// Forces OCR (backend `tesseract`, language `eng`) on
+/// `images/flower_no_text.jpg` through the async API.
+///
+/// The test returns early when the shared test documents or the image are
+/// missing. An extraction error is only logged; on success the result must
+/// have the `image/jpeg` MIME type and content shorter than 200 bytes.
+#[cfg(feature = "ocr")]
+#[tokio::test]
+async fn test_ocr_image_without_text_async() {
+    if !test_documents_available() {
+        return;
+    }
+
+    let image_path = get_test_file_path("images/flower_no_text.jpg");
+    if !image_path.exists() {
+        tracing::debug!("Skipping test: OCR flower image not available");
+        return;
+    }
+
+    let ocr_settings = OcrConfig {
+        backend: "tesseract".to_string(),
+        language: "eng".to_string(),
+        ..Default::default()
+    };
+    let config = ExtractionConfig {
+        ocr: Some(ocr_settings),
+        force_ocr: true,
+        ..Default::default()
+    };
+
+    // A failed extraction is tolerated: log it and leave the test.
+    let extraction = match extract_file(&image_path, None, &config).await {
+        Err(err) => {
+            tracing::debug!("OCR fallback test failed (Tesseract may not be installed): {}", err);
+            return;
+        }
+        Ok(extraction) => extraction,
+    };
+
+    assert_mime_type(&extraction, "image/jpeg");
+
+    // `len()` on the content counts bytes, matching the unit in the message.
+    let byte_count = extraction.content.len();
+    assert!(
+        byte_count < 200,
+        "Expected minimal OCR output, got {} bytes",
+        byte_count
+    );
+}