//! PDF integration tests that remain specific to the Rust core. //! //! Positive-path scenarios live in the shared fixtures that back the //! multi-language E2E generator. This module keeps only the cases that //! exercise Rust-specific failure handling or error propagation. #![cfg(feature = "pdf")] mod helpers; use helpers::*; use kreuzberg::core::config::ExtractionConfig; use kreuzberg::{PdfConfig, extract_bytes_sync, extract_file_sync}; /// Corrupted / garbage bytes passed as PDF must return a handled error, not panic. /// /// This is a regression guard for issue #544: previously, malformed PDFs could /// trigger a Rust panic via `.unwrap()` / `.expect()` calls in the extraction /// path, crashing the host process when called through FFI (Python, Node, etc.). #[test] fn test_corrupted_pdf_returns_error_not_panic() { let config = ExtractionConfig::default(); // Pure garbage — not even a PDF header. let result = extract_bytes_sync(b"not a pdf", "application/pdf", &config); assert!(result.is_err(), "Garbage bytes should return Err, not Ok"); // Truncated PDF header with no content. let result = extract_bytes_sync(b"%PDF-1.4\n%%EOF", "application/pdf", &config); assert!(result.is_err(), "Truncated PDF should return Err, not Ok"); // Binary noise with a valid-looking PDF header. let mut noisy = b"%PDF-1.7\n".to_vec(); noisy.extend(std::iter::repeat_n(0xEFu8, 256)); let result = extract_bytes_sync(&noisy, "application/pdf", &config); assert!(result.is_err(), "Corrupt PDF body should return Err, not Ok"); } #[test] fn test_pdf_password_protected_fails_gracefully() { if skip_if_missing("pdfs/copy_protected.pdf") { return; } let file_path = get_test_file_path("pdfs/copy_protected.pdf"); let result = extract_file_sync(&file_path, None, &ExtractionConfig::default()); match result { Ok(extraction_result) => { assert_mime_type(&extraction_result, "application/pdf"); assert!( extraction_result.chunks.is_none(), "Chunks should be None without chunking config" ); assert!( extraction_result.detected_languages.is_none(), "Language detection not enabled" ); } Err(e) => { let error_msg = e.to_string().to_lowercase(); assert!( error_msg.contains("password") || error_msg.contains("protected") || error_msg.contains("encrypted"), "Error message should indicate password/protection issue, got: {}", e ); } } } #[test] fn test_pdf_password_protected_succeeds_with_correct_password() { if skip_if_missing("pdfs/copy_protected.pdf") { return; } let file_path = get_test_file_path("pdfs/copy_protected.pdf"); let config = ExtractionConfig { pdf_options: Some(PdfConfig { passwords: Some(vec!["wrong-password".into(), "".into()]), ..Default::default() }), ..Default::default() }; let result = extract_file_sync(&file_path, None, &config); match result { Ok(extraction_result) => { assert_mime_type(&extraction_result, "application/pdf"); assert!( extraction_result.chunks.is_none(), "Chunks should be None without chunking config" ); assert!( extraction_result.detected_languages.is_none(), "Language detection not enabled" ); } Err(e) => { let error_msg = e.to_string().to_lowercase(); assert!( !error_msg.contains("password") && !error_msg.contains("protected") && !error_msg.contains("encrypted"), "Error message should not indicate password/protection issue, got: {e}", ); } } }