crates/kreuzberg/tests/format_integration.rs

//! Binding-specific format integration tests.
//!
//! Positive-path scenarios are now covered by the shared fixture-based E2E
//! suites. The tests here focus on behaviour that is specific to the Rust
//! asynchronous APIs or to graceful handling when optional system
//! dependencies are missing.

#![cfg(any(feature = "pdf", feature = "office", feature = "ocr"))]

mod helpers;

use helpers::{assert_mime_type, get_test_file_path, test_documents_available};

#[cfg(any(feature = "office", feature = "ocr"))]
use helpers::assert_non_empty_content;
use kreuzberg::core::config::ExtractionConfig;
use kreuzberg::core::extractor::extract_file;

#[cfg(feature = "ocr")]
use kreuzberg::core::config::OcrConfig;

#[cfg(feature = "pdf")]
#[tokio::test]
async fn test_pdf_password_protected_async() {
    if !test_documents_available() {
        return;
    }

    let path = get_test_file_path("pdf/copy_protected.pdf");
    if !path.exists() {
        tracing::debug!("Skipping test: protected PDF not available");
        return;
    }

    let result = extract_file(&path, None, &ExtractionConfig::default()).await;

    match result {
        Err(err) => {
            tracing::debug!("Password protection detected (expected): {}", err);
        }
        Ok(res) => {
            tracing::debug!("Protected PDF extracted; some files allow fallback");
            assert_mime_type(&res, "application/pdf");
            assert!(res.chunks.is_none(), "Chunks should be None without chunking config");
            assert!(res.detected_languages.is_none(), "Language detection not enabled");
        }
    }
}

#[cfg(feature = "office")]
#[tokio::test]
#[cfg_attr(target_os = "windows", ignore = "Legacy office tests timeout on Windows CI")]
async fn test_legacy_doc_extraction_async() {
    if !test_documents_available() {
        return;
    }

    let path = get_test_file_path("doc/simple.doc");
    if !path.exists() {
        tracing::debug!("Skipping test: legacy .doc file not available");
        return;
    }

    let result = extract_file(&path, None, &ExtractionConfig::default()).await;

    match result {
        Ok(extracted) => {
            assert_mime_type(&extracted, "application/msword");
            assert_non_empty_content(&extracted);
            assert!(
                extracted.chunks.is_none(),
                "Chunks should be None without chunking config"
            );
            assert!(extracted.detected_languages.is_none(), "Language detection not enabled");
        }
        Err(err) => {
            tracing::debug!(
                "Legacy Office extraction failed (office feature may not be enabled): {}",
                err
            );
        }
    }
}

#[cfg(feature = "ocr")]
#[tokio::test]
async fn test_ocr_simple_english_image_async() {
    if !test_documents_available() {
        return;
    }

    let path = get_test_file_path("images/test_hello_world.png");
    if !path.exists() {
        tracing::debug!("Skipping test: OCR sample image not available");
        return;
    }

    let config = ExtractionConfig {
        ocr: Some(OcrConfig {
            backend: "tesseract".to_string(),
            language: "eng".to_string(),
            ..Default::default()
        }),
        force_ocr: true,
        ..Default::default()
    };

    let result = extract_file(&path, None, &config).await;

    match result {
        Ok(res) => {
            assert_mime_type(&res, "image/png");
            assert_non_empty_content(&res);
            let content_lower = res.content.to_lowercase();
            assert!(
                content_lower.contains("hello") || content_lower.contains("world"),
                "OCR output {:?} should contain HELLO or WORLD",
                res.content
            );
        }
        Err(err) => {
            tracing::debug!("OCR test failed (Tesseract may not be installed): {}", err);
        }
    }
}

#[cfg(feature = "ocr")]
#[tokio::test]
async fn test_ocr_image_without_text_async() {
    if !test_documents_available() {
        return;
    }

    let path = get_test_file_path("images/flower_no_text.jpg");
    if !path.exists() {
        tracing::debug!("Skipping test: OCR flower image not available");
        return;
    }

    let config = ExtractionConfig {
        ocr: Some(OcrConfig {
            backend: "tesseract".to_string(),
            language: "eng".to_string(),
            ..Default::default()
        }),
        force_ocr: true,
        ..Default::default()
    };

    let result = extract_file(&path, None, &config).await;

    match result {
        Ok(res) => {
            assert_mime_type(&res, "image/jpeg");
            assert!(
                res.content.len() < 200,
                "Expected minimal OCR output, got {} bytes",
                res.content.len()
            );
        }
        Err(err) => {
            tracing::debug!("OCR fallback test failed (Tesseract may not be installed): {}", err);
        }
    }
}
Nomad changes 2026-06-01 23:40:55 +02:00			`//! Binding-specific format integration tests.`
			`//!`
			`//! Positive-path scenarios are now covered by the shared fixture-based E2E`
			`//! suites. The tests here focus on behaviour that is specific to the Rust`
			`//! asynchronous APIs or to graceful handling when optional system`
			`//! dependencies are missing.`

			`#![cfg(any(feature = "pdf", feature = "office", feature = "ocr"))]`

			`mod helpers;`

			`use helpers::{assert_mime_type, get_test_file_path, test_documents_available};`

			`#[cfg(any(feature = "office", feature = "ocr"))]`
			`use helpers::assert_non_empty_content;`
			`use kreuzberg::core::config::ExtractionConfig;`
			`use kreuzberg::core::extractor::extract_file;`

			`#[cfg(feature = "ocr")]`
			`use kreuzberg::core::config::OcrConfig;`

			`#[cfg(feature = "pdf")]`
			`#[tokio::test]`
			`async fn test_pdf_password_protected_async() {`
			`if !test_documents_available() {`
			`return;`
			`}`

			`let path = get_test_file_path("pdf/copy_protected.pdf");`
			`if !path.exists() {`
			`tracing::debug!("Skipping test: protected PDF not available");`
			`return;`
			`}`

			`let result = extract_file(&path, None, &ExtractionConfig::default()).await;`

			`match result {`
			`Err(err) => {`
			`tracing::debug!("Password protection detected (expected): {}", err);`
			`}`
			`Ok(res) => {`
			`tracing::debug!("Protected PDF extracted; some files allow fallback");`
			`assert_mime_type(&res, "application/pdf");`
			`assert!(res.chunks.is_none(), "Chunks should be None without chunking config");`
			`assert!(res.detected_languages.is_none(), "Language detection not enabled");`
			`}`
			`}`
			`}`

			`#[cfg(feature = "office")]`
			`#[tokio::test]`
			`#[cfg_attr(target_os = "windows", ignore = "Legacy office tests timeout on Windows CI")]`
			`async fn test_legacy_doc_extraction_async() {`
			`if !test_documents_available() {`
			`return;`
			`}`

			`let path = get_test_file_path("doc/simple.doc");`
			`if !path.exists() {`
			`tracing::debug!("Skipping test: legacy .doc file not available");`
			`return;`
			`}`

			`let result = extract_file(&path, None, &ExtractionConfig::default()).await;`

			`match result {`
			`Ok(extracted) => {`
			`assert_mime_type(&extracted, "application/msword");`
			`assert_non_empty_content(&extracted);`
			`assert!(`
			`extracted.chunks.is_none(),`
			`"Chunks should be None without chunking config"`
			`);`
			`assert!(extracted.detected_languages.is_none(), "Language detection not enabled");`
			`}`
			`Err(err) => {`
			`tracing::debug!(`
			`"Legacy Office extraction failed (office feature may not be enabled): {}",`
			`err`
			`);`
			`}`
			`}`
			`}`

			`#[cfg(feature = "ocr")]`
			`#[tokio::test]`
			`async fn test_ocr_simple_english_image_async() {`
			`if !test_documents_available() {`
			`return;`
			`}`

			`let path = get_test_file_path("images/test_hello_world.png");`
			`if !path.exists() {`
			`tracing::debug!("Skipping test: OCR sample image not available");`
			`return;`
			`}`

			`let config = ExtractionConfig {`
			`ocr: Some(OcrConfig {`
			`backend: "tesseract".to_string(),`
			`language: "eng".to_string(),`
			`..Default::default()`
			`}),`
			`force_ocr: true,`
			`..Default::default()`
			`};`

			`let result = extract_file(&path, None, &config).await;`

			`match result {`
			`Ok(res) => {`
			`assert_mime_type(&res, "image/png");`
			`assert_non_empty_content(&res);`
			`let content_lower = res.content.to_lowercase();`
			`assert!(`
			`content_lower.contains("hello") \|\| content_lower.contains("world"),`
			`"OCR output {:?} should contain HELLO or WORLD",`
			`res.content`
			`);`
			`}`
			`Err(err) => {`
			`tracing::debug!("OCR test failed (Tesseract may not be installed): {}", err);`
			`}`
			`}`
			`}`

			`#[cfg(feature = "ocr")]`
			`#[tokio::test]`
			`async fn test_ocr_image_without_text_async() {`
			`if !test_documents_available() {`
			`return;`
			`}`

			`let path = get_test_file_path("images/flower_no_text.jpg");`
			`if !path.exists() {`
			`tracing::debug!("Skipping test: OCR flower image not available");`
			`return;`
			`}`

			`let config = ExtractionConfig {`
			`ocr: Some(OcrConfig {`
			`backend: "tesseract".to_string(),`
			`language: "eng".to_string(),`
			`..Default::default()`
			`}),`
			`force_ocr: true,`
			`..Default::default()`
			`};`

			`let result = extract_file(&path, None, &config).await;`

			`match result {`
			`Ok(res) => {`
			`assert_mime_type(&res, "image/jpeg");`
			`assert!(`
			`res.content.len() < 200,`
			`"Expected minimal OCR output, got {} bytes",`
			`res.content.len()`
			`);`
			`}`
			`Err(err) => {`
			`tracing::debug!("OCR fallback test failed (Tesseract may not be installed): {}", err);`
			`}`
			`}`
			`}`