This commit is contained in:
109
crates/kreuzberg/tests/pdf_integration.rs
Normal file
109
crates/kreuzberg/tests/pdf_integration.rs
Normal file
@@ -0,0 +1,109 @@
|
||||
//! PDF integration tests that remain specific to the Rust core.
|
||||
//!
|
||||
//! Positive-path scenarios live in the shared fixtures that back the
|
||||
//! multi-language E2E generator. This module keeps only the cases that
|
||||
//! exercise Rust-specific failure handling or error propagation.
|
||||
|
||||
#![cfg(feature = "pdf")]
|
||||
|
||||
mod helpers;
|
||||
|
||||
use helpers::*;
|
||||
use kreuzberg::core::config::ExtractionConfig;
|
||||
use kreuzberg::{PdfConfig, extract_bytes_sync, extract_file_sync};
|
||||
|
||||
/// Malformed input labelled as `application/pdf` must surface as an `Err`
/// rather than a panic.
///
/// Regression guard for issue #544: malformed PDFs could previously hit
/// `.unwrap()` / `.expect()` calls in the extraction path and panic, which
/// crashes the host process when the library is driven through FFI
/// (Python, Node, etc.).
///
/// Three payloads are checked, in order:
/// 1. bytes with no PDF header at all,
/// 2. a PDF header followed directly by `%%EOF`,
/// 3. a PDF header followed by 256 bytes of `0xEF`.
#[test]
fn test_corrupted_pdf_returns_error_not_panic() {
    let default_config = ExtractionConfig::default();

    // Binary noise with a valid-looking PDF header.
    let header_then_noise: Vec<u8> = b"%PDF-1.7\n"
        .iter()
        .copied()
        .chain(std::iter::repeat_n(0xEFu8, 256))
        .collect();

    // Each entry pairs a payload with the message reported if extraction
    // unexpectedly succeeds.
    let cases: [(&[u8], &str); 3] = [
        // Pure garbage — not even a PDF header.
        (b"not a pdf".as_slice(), "Garbage bytes should return Err, not Ok"),
        // Truncated PDF header with no content.
        (
            b"%PDF-1.4\n%%EOF".as_slice(),
            "Truncated PDF should return Err, not Ok",
        ),
        (
            header_then_noise.as_slice(),
            "Corrupt PDF body should return Err, not Ok",
        ),
    ];

    for (payload, failure_message) in cases {
        let outcome = extract_bytes_sync(payload, "application/pdf", &default_config);
        assert!(outcome.is_err(), "{failure_message}");
    }
}
|
||||
|
||||
/// Extracting the copy-protected fixture with the default configuration
/// (no passwords supplied) must end in one of two acceptable outcomes.
///
/// * `Ok`: the result reports the `application/pdf` MIME type and carries
///   neither chunks nor detected languages, since neither feature is enabled.
/// * `Err`: the lowercased error text mentions "password", "protected" or
///   "encrypted".
///
/// The test returns early when `skip_if_missing` reports `true` for the
/// fixture path (presumably the file is absent — confirm in `helpers`).
#[test]
fn test_pdf_password_protected_fails_gracefully() {
    const FIXTURE: &str = "pdfs/copy_protected.pdf";

    if skip_if_missing(FIXTURE) {
        return;
    }

    let pdf_path = get_test_file_path(FIXTURE);
    let default_config = ExtractionConfig::default();

    match extract_file_sync(&pdf_path, None, &default_config) {
        Ok(extracted) => {
            assert_mime_type(&extracted, "application/pdf");
            assert!(
                extracted.chunks.is_none(),
                "Chunks should be None without chunking config"
            );
            assert!(
                extracted.detected_languages.is_none(),
                "Language detection not enabled"
            );
        }
        Err(err) => {
            // Compare case-insensitively against the protection-related keywords.
            let lowered = err.to_string().to_lowercase();
            let mentions_protection = ["password", "protected", "encrypted"]
                .iter()
                .any(|keyword| lowered.contains(*keyword));

            assert!(
                mentions_protection,
                "Error message should indicate password/protection issue, got: {}",
                err
            );
        }
    }
}
|
||||
|
||||
/// Extracting the copy-protected fixture with a list of candidate passwords
/// must not fail for password-related reasons.
///
/// The configuration supplies two candidates, a wrong one followed by the
/// intended one.
///
/// * `Ok`: the result reports the `application/pdf` MIME type and carries
///   neither chunks nor detected languages, since neither feature is enabled.
/// * `Err`: tolerated only when the lowercased error text mentions none of
///   "password", "protected" or "encrypted".
///
/// NOTE(review): despite the test name, an unrelated extraction error does
/// not fail this test — confirm that leniency is intended.
///
/// The test returns early when `skip_if_missing` reports `true` for the
/// fixture path (presumably the file is absent — confirm in `helpers`).
#[test]
fn test_pdf_password_protected_succeeds_with_correct_password() {
    const FIXTURE: &str = "pdfs/copy_protected.pdf";

    if skip_if_missing(FIXTURE) {
        return;
    }

    let pdf_path = get_test_file_path(FIXTURE);

    // Build the PDF-specific options first, then wrap them in the overall config.
    let pdf_options = PdfConfig {
        passwords: Some(vec!["wrong-password".into(), "<correct password>".into()]),
        ..Default::default()
    };
    let config = ExtractionConfig {
        pdf_options: Some(pdf_options),
        ..Default::default()
    };

    match extract_file_sync(&pdf_path, None, &config) {
        Ok(extracted) => {
            assert_mime_type(&extracted, "application/pdf");
            assert!(
                extracted.chunks.is_none(),
                "Chunks should be None without chunking config"
            );
            assert!(
                extracted.detected_languages.is_none(),
                "Language detection not enabled"
            );
        }
        Err(e) => {
            // Compare case-insensitively against the protection-related keywords.
            let lowered = e.to_string().to_lowercase();
            let mentions_protection = ["password", "protected", "encrypted"]
                .iter()
                .any(|keyword| lowered.contains(*keyword));

            assert!(
                !mentions_protection,
                "Error message should not indicate password/protection issue, got: {e}",
            );
        }
    }
}
|
||||
Reference in New Issue
Block a user