//! Binding-specific format integration tests.
//!
//! Positive-path scenarios are now covered by the shared fixture-based E2E
//! suites. The tests here focus on behaviour that is specific to the Rust
//! asynchronous APIs or to graceful handling when optional system
//! dependencies are missing.

#![cfg(any(feature = "pdf", feature = "office", feature = "ocr"))]

mod helpers;

use helpers::{assert_mime_type, get_test_file_path, test_documents_available};

#[cfg(any(feature = "office", feature = "ocr"))]
use helpers::assert_non_empty_content;

use kreuzberg::core::config::ExtractionConfig;
use kreuzberg::core::extractor::extract_file;

#[cfg(feature = "ocr")]
use kreuzberg::core::config::OcrConfig;

/// Builds the extraction configuration shared by the OCR tests: OCR is forced
/// and routed to the `tesseract` backend with the `eng` language; every other
/// field keeps its `Default` value.
#[cfg(feature = "ocr")]
fn forced_tesseract_english_config() -> ExtractionConfig {
    ExtractionConfig {
        ocr: Some(OcrConfig {
            backend: "tesseract".to_string(),
            language: "eng".to_string(),
            ..Default::default()
        }),
        force_ocr: true,
        ..Default::default()
    }
}

/// Extracts a copy-protected PDF through the async API.
///
/// The test returns early when the test documents or the specific fixture are
/// not available. Both outcomes of the extraction are accepted: an error is
/// only logged, while a successful result must report the PDF MIME type and
/// must not carry chunks or detected languages (the default config is used).
#[cfg(feature = "pdf")]
#[tokio::test]
async fn test_pdf_password_protected_async() {
    if !test_documents_available() {
        return;
    }

    let path = get_test_file_path("pdf/copy_protected.pdf");
    if !path.exists() {
        tracing::debug!("Skipping test: protected PDF not available");
        return;
    }

    let result = extract_file(&path, None, &ExtractionConfig::default()).await;

    match result {
        // A failure is an acceptable outcome for this fixture, so it is logged
        // rather than turned into a test failure.
        Err(err) => {
            tracing::debug!("Password protection detected (expected): {}", err);
        }
        Ok(res) => {
            tracing::debug!("Protected PDF extracted; some files allow fallback");
            assert_mime_type(&res, "application/pdf");
            assert!(res.chunks.is_none(), "Chunks should be None without chunking config");
            assert!(res.detected_languages.is_none(), "Language detection not enabled");
        }
    }
}

/// Extracts a legacy `.doc` file through the async API.
///
/// Ignored on Windows (see the `ignore` reason below). The test returns early
/// when the test documents or the fixture are missing. On success the result
/// must have the `application/msword` MIME type, non-empty content, and no
/// chunks or detected languages; an extraction error is only logged.
#[cfg(feature = "office")]
#[tokio::test]
#[cfg_attr(target_os = "windows", ignore = "Legacy office tests timeout on Windows CI")]
async fn test_legacy_doc_extraction_async() {
    if !test_documents_available() {
        return;
    }

    let path = get_test_file_path("doc/simple.doc");
    if !path.exists() {
        tracing::debug!("Skipping test: legacy .doc file not available");
        return;
    }

    let result = extract_file(&path, None, &ExtractionConfig::default()).await;

    match result {
        Ok(extracted) => {
            assert_mime_type(&extracted, "application/msword");
            assert_non_empty_content(&extracted);
            assert!(
                extracted.chunks.is_none(),
                "Chunks should be None without chunking config"
            );
            assert!(extracted.detected_languages.is_none(), "Language detection not enabled");
        }
        Err(err) => {
            // NOTE(review): this test only compiles with the `office` feature
            // enabled, so the hint in the message looks stale — presumably the
            // real cause is a missing system dependency; confirm before rewording.
            tracing::debug!(
                "Legacy Office extraction failed (office feature may not be enabled): {}",
                err
            );
        }
    }
}

/// Runs forced OCR on a PNG fixture and checks the recognised text.
///
/// The test returns early when the test documents or the fixture are missing.
/// On success the result must have the `image/png` MIME type, non-empty
/// content, and contain "hello" or "world" (compared case-insensitively).
/// An extraction error is only logged.
#[cfg(feature = "ocr")]
#[tokio::test]
async fn test_ocr_simple_english_image_async() {
    if !test_documents_available() {
        return;
    }

    let path = get_test_file_path("images/test_hello_world.png");
    if !path.exists() {
        tracing::debug!("Skipping test: OCR sample image not available");
        return;
    }

    let config = forced_tesseract_english_config();
    let result = extract_file(&path, None, &config).await;

    match result {
        Ok(res) => {
            assert_mime_type(&res, "image/png");
            assert_non_empty_content(&res);

            // Lowercase once so the check does not depend on OCR letter casing.
            let content_lower = res.content.to_lowercase();
            assert!(
                content_lower.contains("hello") || content_lower.contains("world"),
                "OCR output {:?} should contain HELLO or WORLD",
                res.content
            );
        }
        Err(err) => {
            tracing::debug!("OCR test failed (Tesseract may not be installed): {}", err);
        }
    }
}

/// Runs forced OCR on a JPEG fixture that is expected to yield little text.
///
/// The test returns early when the test documents or the fixture are missing.
/// On success the result must have the `image/jpeg` MIME type and its content
/// must be shorter than 200 bytes. An extraction error is only logged.
#[cfg(feature = "ocr")]
#[tokio::test]
async fn test_ocr_image_without_text_async() {
    if !test_documents_available() {
        return;
    }

    let path = get_test_file_path("images/flower_no_text.jpg");
    if !path.exists() {
        tracing::debug!("Skipping test: OCR flower image not available");
        return;
    }

    let config = forced_tesseract_english_config();
    let result = extract_file(&path, None, &config).await;

    match result {
        Ok(res) => {
            assert_mime_type(&res, "image/jpeg");
            assert!(
                res.content.len() < 200,
                "Expected minimal OCR output, got {} bytes",
                res.content.len()
            );
        }
        Err(err) => {
            tracing::debug!("OCR fallback test failed (Tesseract may not be installed): {}", err);
        }
    }
}