Files
fil/crates/kreuzberg/tests/pdf_integration.rs
Henrik Jess Nielsen b4c07d3693
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s
Nomad changes
2026-06-01 23:40:55 +02:00

110 lines
3.9 KiB
Rust

//! PDF integration tests that remain specific to the Rust core.
//!
//! Positive-path scenarios live in the shared fixtures that back the
//! multi-language E2E generator. This module keeps only the cases that
//! exercise Rust-specific failure handling or error propagation.
#![cfg(feature = "pdf")]
mod helpers;
use helpers::*;
use kreuzberg::core::config::ExtractionConfig;
use kreuzberg::{PdfConfig, extract_bytes_sync, extract_file_sync};
/// Malformed input labelled as `application/pdf` must surface as an `Err`
/// instead of panicking.
///
/// Regression guard for issue #544: malformed PDFs used to be able to reach
/// `.unwrap()` / `.expect()` calls in the extraction path, and the resulting
/// panic crashed the host process when invoked through FFI (Python, Node, etc.).
#[test]
fn test_corrupted_pdf_returns_error_not_panic() {
    let config = ExtractionConfig::default();

    // A valid-looking PDF header followed by 256 bytes of binary noise.
    let header_then_noise: Vec<u8> = b"%PDF-1.7\n"
        .iter()
        .copied()
        .chain(std::iter::repeat_n(0xEFu8, 256))
        .collect();

    // Each entry pairs a malformed payload with the message reported if the
    // extractor unexpectedly accepts it.
    let cases: [(&[u8], &str); 3] = [
        // Pure garbage: not even a PDF header.
        (b"not a pdf".as_slice(), "Garbage bytes should return Err, not Ok"),
        // PDF header and EOF marker with nothing in between.
        (
            b"%PDF-1.4\n%%EOF".as_slice(),
            "Truncated PDF should return Err, not Ok",
        ),
        (
            header_then_noise.as_slice(),
            "Corrupt PDF body should return Err, not Ok",
        ),
    ];

    for (payload, failure_message) in cases {
        let outcome = extract_bytes_sync(payload, "application/pdf", &config);
        assert!(outcome.is_err(), "{failure_message}");
    }
}
/// Extracting the `copy_protected.pdf` fixture with a default config must end
/// in one of two acceptable outcomes:
///
/// * `Ok`: the result reports the PDF MIME type and carries neither chunks
///   nor detected languages, since the default config is used; or
/// * `Err`: the error text mentions "password", "protected" or "encrypted"
///   (case-insensitive).
///
/// The test returns early when `skip_if_missing` reports the fixture as
/// unavailable.
#[test]
fn test_pdf_password_protected_fails_gracefully() {
    const FIXTURE: &str = "pdfs/copy_protected.pdf";

    if skip_if_missing(FIXTURE) {
        return;
    }

    let fixture_path = get_test_file_path(FIXTURE);
    let config = ExtractionConfig::default();

    let extracted = match extract_file_sync(&fixture_path, None, &config) {
        Ok(extracted) => extracted,
        Err(err) => {
            // Failure is acceptable only if the message points at the protection.
            let lowered = err.to_string().to_lowercase();
            let mentions_protection = ["password", "protected", "encrypted"]
                .iter()
                .any(|keyword| lowered.contains(*keyword));
            assert!(
                mentions_protection,
                "Error message should indicate password/protection issue, got: {}",
                err
            );
            return;
        }
    };

    assert_mime_type(&extracted, "application/pdf");
    assert!(
        extracted.chunks.is_none(),
        "Chunks should be None without chunking config"
    );
    assert!(
        extracted.detected_languages.is_none(),
        "Language detection not enabled"
    );
}
/// Extracting the `copy_protected.pdf` fixture with a list of candidate
/// passwords must not fail for password-related reasons.
///
/// On `Ok`, the result must report the PDF MIME type and carry neither chunks
/// nor detected languages. On `Err`, the error text must NOT mention
/// "password", "protected" or "encrypted" (case-insensitive).
///
/// The test returns early when `skip_if_missing` reports the fixture as
/// unavailable.
// NOTE(review): despite the test name, an `Err` unrelated to passwords still
// lets this test pass; confirm that leniency is intended.
#[test]
fn test_pdf_password_protected_succeeds_with_correct_password() {
    const FIXTURE: &str = "pdfs/copy_protected.pdf";

    if skip_if_missing(FIXTURE) {
        return;
    }

    let fixture_path = get_test_file_path(FIXTURE);

    // A wrong candidate is listed before the second entry.
    // NOTE(review): "<correct password>" looks like a placeholder; confirm it
    // is the fixture's real password.
    let pdf_options = PdfConfig {
        passwords: Some(vec!["wrong-password".into(), "<correct password>".into()]),
        ..Default::default()
    };
    let config = ExtractionConfig {
        pdf_options: Some(pdf_options),
        ..Default::default()
    };

    let extracted = match extract_file_sync(&fixture_path, None, &config) {
        Ok(extracted) => extracted,
        Err(e) => {
            // Any failure must be unrelated to password handling.
            let lowered = e.to_string().to_lowercase();
            let mentions_protection = ["password", "protected", "encrypted"]
                .iter()
                .any(|keyword| lowered.contains(*keyword));
            assert!(
                !mentions_protection,
                "Error message should not indicate password/protection issue, got: {e}",
            );
            return;
        }
    };

    assert_mime_type(&extracted, "application/pdf");
    assert!(
        extracted.chunks.is_none(),
        "Chunks should be None without chunking config"
    );
    assert!(
        extracted.detected_languages.is_none(),
        "Language detection not enabled"
    );
}