110 lines
3.9 KiB
Rust
110 lines
3.9 KiB
Rust
|
|
//! PDF integration tests that remain specific to the Rust core.
|
||
|
|
//!
|
||
|
|
//! Positive-path scenarios live in the shared fixtures that back the
|
||
|
|
//! multi-language E2E generator. This module keeps only the cases that
|
||
|
|
//! exercise Rust-specific failure handling or error propagation.
|
||
|
|
|
||
|
|
#![cfg(feature = "pdf")]
|
||
|
|
|
||
|
|
mod helpers;
|
||
|
|
|
||
|
|
use helpers::*;
|
||
|
|
use kreuzberg::core::config::ExtractionConfig;
|
||
|
|
use kreuzberg::{PdfConfig, extract_bytes_sync, extract_file_sync};
|
||
|
|
|
||
|
|
/// Corrupted / garbage bytes passed as PDF must return a handled error, not panic.
|
||
|
|
///
|
||
|
|
/// This is a regression guard for issue #544: previously, malformed PDFs could
|
||
|
|
/// trigger a Rust panic via `.unwrap()` / `.expect()` calls in the extraction
|
||
|
|
/// path, crashing the host process when called through FFI (Python, Node, etc.).
|
||
|
|
#[test]
|
||
|
|
fn test_corrupted_pdf_returns_error_not_panic() {
|
||
|
|
let config = ExtractionConfig::default();
|
||
|
|
|
||
|
|
// Pure garbage — not even a PDF header.
|
||
|
|
let result = extract_bytes_sync(b"not a pdf", "application/pdf", &config);
|
||
|
|
assert!(result.is_err(), "Garbage bytes should return Err, not Ok");
|
||
|
|
|
||
|
|
// Truncated PDF header with no content.
|
||
|
|
let result = extract_bytes_sync(b"%PDF-1.4\n%%EOF", "application/pdf", &config);
|
||
|
|
assert!(result.is_err(), "Truncated PDF should return Err, not Ok");
|
||
|
|
|
||
|
|
// Binary noise with a valid-looking PDF header.
|
||
|
|
let mut noisy = b"%PDF-1.7\n".to_vec();
|
||
|
|
noisy.extend(std::iter::repeat_n(0xEFu8, 256));
|
||
|
|
let result = extract_bytes_sync(&noisy, "application/pdf", &config);
|
||
|
|
assert!(result.is_err(), "Corrupt PDF body should return Err, not Ok");
|
||
|
|
}
|
||
|
|
|
||
|
|
#[test]
|
||
|
|
fn test_pdf_password_protected_fails_gracefully() {
|
||
|
|
if skip_if_missing("pdfs/copy_protected.pdf") {
|
||
|
|
return;
|
||
|
|
}
|
||
|
|
|
||
|
|
let file_path = get_test_file_path("pdfs/copy_protected.pdf");
|
||
|
|
let result = extract_file_sync(&file_path, None, &ExtractionConfig::default());
|
||
|
|
|
||
|
|
match result {
|
||
|
|
Ok(extraction_result) => {
|
||
|
|
assert_mime_type(&extraction_result, "application/pdf");
|
||
|
|
assert!(
|
||
|
|
extraction_result.chunks.is_none(),
|
||
|
|
"Chunks should be None without chunking config"
|
||
|
|
);
|
||
|
|
assert!(
|
||
|
|
extraction_result.detected_languages.is_none(),
|
||
|
|
"Language detection not enabled"
|
||
|
|
);
|
||
|
|
}
|
||
|
|
Err(e) => {
|
||
|
|
let error_msg = e.to_string().to_lowercase();
|
||
|
|
assert!(
|
||
|
|
error_msg.contains("password") || error_msg.contains("protected") || error_msg.contains("encrypted"),
|
||
|
|
"Error message should indicate password/protection issue, got: {}",
|
||
|
|
e
|
||
|
|
);
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
#[test]
|
||
|
|
fn test_pdf_password_protected_succeeds_with_correct_password() {
|
||
|
|
if skip_if_missing("pdfs/copy_protected.pdf") {
|
||
|
|
return;
|
||
|
|
}
|
||
|
|
|
||
|
|
let file_path = get_test_file_path("pdfs/copy_protected.pdf");
|
||
|
|
|
||
|
|
let config = ExtractionConfig {
|
||
|
|
pdf_options: Some(PdfConfig {
|
||
|
|
passwords: Some(vec!["wrong-password".into(), "<correct password>".into()]),
|
||
|
|
..Default::default()
|
||
|
|
}),
|
||
|
|
..Default::default()
|
||
|
|
};
|
||
|
|
|
||
|
|
let result = extract_file_sync(&file_path, None, &config);
|
||
|
|
|
||
|
|
match result {
|
||
|
|
Ok(extraction_result) => {
|
||
|
|
assert_mime_type(&extraction_result, "application/pdf");
|
||
|
|
assert!(
|
||
|
|
extraction_result.chunks.is_none(),
|
||
|
|
"Chunks should be None without chunking config"
|
||
|
|
);
|
||
|
|
assert!(
|
||
|
|
extraction_result.detected_languages.is_none(),
|
||
|
|
"Language detection not enabled"
|
||
|
|
);
|
||
|
|
}
|
||
|
|
Err(e) => {
|
||
|
|
let error_msg = e.to_string().to_lowercase();
|
||
|
|
assert!(
|
||
|
|
!error_msg.contains("password") && !error_msg.contains("protected") && !error_msg.contains("encrypted"),
|
||
|
|
"Error message should not indicate password/protection issue, got: {e}",
|
||
|
|
);
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|