Files
fil/crates/kreuzberg/tests/error_handling.rs

405 lines
13 KiB
Rust
Raw Permalink Normal View History

2026-06-01 23:40:55 +02:00
//! Error handling and edge case integration tests.
//!
//! Tests for corrupted files, edge cases, and invalid inputs.
//! Validates that the system handles errors gracefully without panics.
use kreuzberg::core::config::ExtractionConfig;
use kreuzberg::core::extractor::{extract_bytes, extract_file};
use std::io::Write;
use tempfile::NamedTempFile;
mod helpers;
/// A PDF that stops in the middle of its first object dictionary must be
/// rejected with a `Parsing` error rather than succeeding.
#[tokio::test]
#[cfg(feature = "pdf")]
async fn test_truncated_pdf() {
    let settings = ExtractionConfig::default();
    // Header plus the opening of object 1, cut off right after `<<`.
    let incomplete_document = b"%PDF-1.4\n1 0 obj\n<<";

    match extract_bytes(incomplete_document, "application/pdf", &settings).await {
        Ok(_) => panic!("Truncated PDF should fail gracefully"),
        Err(failure) => assert!(
            matches!(failure, kreuzberg::KreuzbergError::Parsing { .. }),
            "Truncated PDF should produce Parsing error, got: {:?}",
            failure
        ),
    }
}
/// A buffer that starts with the ZIP local-file-header signature but carries
/// garbage afterwards must be rejected with a `Parsing` error.
#[tokio::test]
#[cfg(feature = "archives")]
async fn test_corrupted_zip() {
    // `PK\x03\x04` signature followed by four 0xFF bytes and four zero bytes.
    let broken_archive = vec![0x50, 0x4B, 0x03, 0x04, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00];
    let settings = ExtractionConfig::default();

    let outcome = extract_bytes(&broken_archive, "application/zip", &settings).await;
    let Err(failure) = outcome else {
        panic!("Corrupted ZIP should fail gracefully");
    };

    let is_parsing_failure = matches!(failure, kreuzberg::KreuzbergError::Parsing { .. });
    assert!(
        is_parsing_failure,
        "Corrupted ZIP should produce Parsing error, got: {:?}",
        failure
    );
}
/// Malformed XML (unclosed element, mismatched closing tag, truncated root
/// end tag) may either be extracted leniently or rejected.
///
/// On failure the error must be `Parsing`; on success no chunks may be
/// present, since the default config is used.
#[tokio::test]
#[cfg(feature = "xml")]
async fn test_invalid_xml() {
    let settings = ExtractionConfig::default();
    let malformed_document = b"<?xml version=\"1.0\"?>\n\
        <root>\n\
        <unclosed>\n\
        <another>text</wrong_tag>\n\
        </root";

    let extraction = match extract_bytes(malformed_document, "application/xml", &settings).await {
        Ok(extraction) => extraction,
        Err(failure) => {
            // Rejecting the document is acceptable, but only as a parse failure.
            assert!(
                matches!(failure, kreuzberg::KreuzbergError::Parsing { .. }),
                "Invalid XML error should be Parsing type, got: {:?}",
                failure
            );
            return;
        }
    };

    assert!(
        extraction.chunks.is_none(),
        "Chunks should be None without chunking config"
    );
}
/// A PNG signature followed by junk bytes may either be tolerated or rejected.
///
/// On failure the error must be `Parsing` or `Ocr`; on success no chunks may
/// be present, since the default config is used.
#[tokio::test]
#[cfg(feature = "ocr")]
async fn test_corrupted_image() {
    // The eight-byte PNG signature, then four 0xFF bytes instead of a chunk.
    let broken_png = vec![0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A, 0xFF, 0xFF, 0xFF, 0xFF];
    let settings = ExtractionConfig::default();

    let extraction = match extract_bytes(&broken_png, "image/png", &settings).await {
        Ok(extraction) => extraction,
        Err(failure) => {
            let is_expected_kind = matches!(
                failure,
                kreuzberg::KreuzbergError::Parsing { .. } | kreuzberg::KreuzbergError::Ocr { .. }
            );
            assert!(
                is_expected_kind,
                "Corrupted image error should be Parsing or OCR type, got: {:?}",
                failure
            );
            return;
        }
    };

    assert!(
        extraction.chunks.is_none(),
        "Chunks should be None without chunking config"
    );
}
/// Zero-byte input across several MIME types.
///
/// * `text/plain` must succeed with empty content.
/// * `application/pdf` (feature `pdf`) may succeed with empty content or fail
///   with a `Parsing`/`Validation` error.
/// * `application/xml` (feature `xml`) may succeed with empty content or fail
///   with a `Parsing` error.
#[tokio::test]
async fn test_empty_file() {
    let settings = ExtractionConfig::default();
    let no_bytes = b"";

    // The plain-text extraction runs first; its outcome is checked after the PDF case.
    let text_outcome = extract_bytes(no_bytes, "text/plain", &settings).await;

    #[cfg(feature = "pdf")]
    {
        match extract_bytes(no_bytes, "application/pdf", &settings).await {
            Err(failure) => {
                let is_expected_kind = matches!(
                    failure,
                    kreuzberg::KreuzbergError::Parsing { .. } | kreuzberg::KreuzbergError::Validation { .. }
                );
                assert!(
                    is_expected_kind,
                    "Empty PDF should produce Parsing or Validation error, got: {:?}",
                    failure
                );
            }
            Ok(pdf_extraction) => {
                assert!(
                    pdf_extraction.content.is_empty(),
                    "Empty PDF should have empty content if it succeeds"
                );
                assert!(pdf_extraction.chunks.is_none(), "Chunks should be None");
            }
        }
    }

    let text_extraction = text_outcome
        .unwrap_or_else(|failure| panic!("Empty text file should not fail, got error: {:?}", failure));
    assert!(
        text_extraction.content.is_empty(),
        "Empty text file should have empty content"
    );
    assert!(text_extraction.chunks.is_none(), "Chunks should be None");

    #[cfg(feature = "xml")]
    {
        match extract_bytes(no_bytes, "application/xml", &settings).await {
            Err(failure) => {
                assert!(
                    matches!(failure, kreuzberg::KreuzbergError::Parsing { .. }),
                    "Empty XML error should be Parsing type, got: {:?}",
                    failure
                );
            }
            Ok(xml_extraction) => {
                assert!(
                    xml_extraction.content.is_empty(),
                    "Empty XML should have empty content if it succeeds"
                );
                assert!(xml_extraction.chunks.is_none(), "Chunks should be None");
            }
        }
    }
}
/// Stress test: a plain-text payload built from 200,000 repetitions of one
/// line must be extracted successfully, with its text preserved and no
/// chunks, detected languages or tables under the default config.
#[tokio::test]
async fn test_very_large_file() {
    let settings = ExtractionConfig::default();

    let single_line = "This is a line of text that will be repeated many times.\n";
    let payload = single_line.repeat(200_000);

    let outcome = extract_bytes(payload.as_bytes(), "text/plain", &settings).await;
    let extraction = match outcome {
        Ok(extraction) => extraction,
        Err(_) => panic!("Large file should be processed successfully"),
    };

    let text = &extraction.content;
    assert!(!text.is_empty(), "Large file content should not be empty");
    assert!(text.len() > 1_000_000, "Content should be large");
    assert!(
        extraction.chunks.is_none(),
        "Chunks should be None without chunking config"
    );
    assert!(
        extraction.detected_languages.is_none(),
        "Language detection not enabled"
    );
    assert!(extraction.tables.is_empty(), "Text file should not have tables");
    assert!(
        text.contains("This is a line of text"),
        "Content should preserve original text"
    );
}
/// Test unicode filenames - non-ASCII paths.
#[tokio::test]
async fn test_unicode_filenames() {
let config = ExtractionConfig::default();
let mut temp_file = NamedTempFile::new().expect("Should create temp file");
temp_file
.write_all(b"Test content with Unicode filename.")
.expect("Operation failed");
let result = extract_file(temp_file.path(), Some("text/plain"), &config).await;
assert!(result.is_ok(), "Unicode filename should be handled");
let extraction = result.expect("Operation failed");
assert!(
extraction.content.contains("Test content"),
"Content should be extracted"
);
assert!(
extraction.chunks.is_none(),
"Chunks should be None without chunking config"
);
assert!(
extraction.detected_languages.is_none(),
"Language detection not enabled"
);
}
/// Test special characters in content - emojis, RTL text, CJK, symbols.
///
/// The sample mixes emoji, right-to-left Arabic, Chinese, Japanese, currency
/// and legal symbols, and mathematical operators. Every labelled line carries
/// an actual sample of its script so that multi-byte text is really fed
/// through the extractor (previously the Chinese, Japanese and math lines
/// were empty after their labels).
#[tokio::test]
async fn test_special_characters_content() {
    let config = ExtractionConfig::default();
    let special_text = "Emojis: 🎉 🚀 ✅ 🌍\n\
        Arabic (RTL): مرحبا بالعالم\n\
        Chinese: 你好世界\n\
        Japanese: こんにちは世界\n\
        Special chars: © ® £ ¥\n\
        Math symbols: ∑ ∫ √ ≈ ∞";
    let result = extract_bytes(special_text.as_bytes(), "text/plain", &config).await;
    assert!(result.is_ok(), "Special characters should be handled");
    let extraction = result.expect("Operation failed");
    assert!(!extraction.content.is_empty(), "Content should not be empty");
    assert!(extraction.content.len() > 10, "Should have substantial content");
    assert!(
        extraction.chunks.is_none(),
        "Chunks should be None without chunking config"
    );
    assert!(
        extraction.detected_languages.is_none(),
        "Language detection not enabled"
    );
    // Only the ASCII labels are checked so the test stays tolerant of any
    // normalization the extractor may apply to the non-ASCII samples.
    assert!(
        extraction.content.contains("Emojis")
            || extraction.content.contains("Arabic")
            || extraction.content.contains("Chinese"),
        "Should preserve at least some special character text"
    );
}
/// Extracting from a path that does not exist must fail with either an `Io`
/// or a `Validation` error.
#[tokio::test]
async fn test_nonexistent_file() {
    let missing_path = "/nonexistent/path/to/file.pdf";
    let settings = ExtractionConfig::default();

    let outcome = extract_file(missing_path, Some("application/pdf"), &settings).await;
    let Err(failure) = outcome else {
        panic!("Nonexistent file should return error");
    };

    let is_expected_kind = matches!(
        failure,
        kreuzberg::KreuzbergError::Io(_) | kreuzberg::KreuzbergError::Validation { .. }
    );
    assert!(
        is_expected_kind,
        "Should be IO or Validation error for nonexistent file, got: {:?}",
        failure
    );
}
/// An unrecognised MIME type must be rejected with `UnsupportedFormat`.
#[tokio::test]
async fn test_unsupported_format() {
    let settings = ExtractionConfig::default();
    let payload = b"Some random data";

    match extract_bytes(payload, "application/x-unknown-format", &settings).await {
        Ok(_) => panic!("Unsupported format should return error"),
        Err(failure) => assert!(
            matches!(failure, kreuzberg::KreuzbergError::UnsupportedFormat(_)),
            "Should be UnsupportedFormat error, got: {:?}",
            failure
        ),
    }
}
/// Test permission denied - no read access (Unix only).
///
/// The temp file's mode is set to `0o000` before extraction and restored to
/// `0o644` afterwards, before any assertion can panic.
///
/// Privileged processes (root, or anything holding `CAP_DAC_OVERRIDE`) are not
/// subject to file mode bits, so in that environment the file remains
/// readable and the scenario cannot be reproduced. The test probes for this
/// with a direct `File::open` and skips itself instead of failing spuriously.
#[tokio::test]
#[cfg(unix)]
async fn test_permission_denied() {
    use std::fs;
    use std::os::unix::fs::PermissionsExt;

    let config = ExtractionConfig::default();
    let mut temp_file = NamedTempFile::new().expect("Should create temp file");
    temp_file
        .write_all(b"Test content")
        .expect("Should write temp file contents");
    let path = temp_file.path();

    fs::set_permissions(path, fs::Permissions::from_mode(0o000)).expect("Should remove all permissions");

    // If a plain open still succeeds, the mode bits are not being enforced
    // for this process and the extraction would (correctly) succeed too.
    let permissions_enforced = fs::File::open(path).is_err();
    let result = if permissions_enforced {
        Some(extract_file(path, Some("text/plain"), &config).await)
    } else {
        None
    };

    // Restore access before asserting so a failure never leaves an unreadable file behind.
    fs::set_permissions(path, fs::Permissions::from_mode(0o644)).expect("Should restore permissions");

    let Some(result) = result else {
        eprintln!("skipping test_permission_denied: file mode bits are not enforced for this user");
        return;
    };
    assert!(result.is_err(), "Permission denied should return error");
}
/// Content/MIME mismatch: bytes beginning with the ZIP signature (the
/// container format DOCX uses) are submitted as `application/pdf` and must
/// be rejected.
#[tokio::test]
async fn test_file_extension_mismatch() {
    // `PK\x03\x04` followed by the first four bytes of a local file header.
    let zip_container_prefix = vec![0x50, 0x4B, 0x03, 0x04, 0x14, 0x00, 0x00, 0x00];
    let settings = ExtractionConfig::default();

    let outcome = extract_bytes(&zip_container_prefix, "application/pdf", &settings).await;

    assert!(outcome.is_err(), "MIME type mismatch should fail");
}
/// Plain text containing embedded NUL bytes must still be extracted, with at
/// least the leading or trailing text surviving.
#[tokio::test]
async fn test_null_bytes_in_content() {
    let settings = ExtractionConfig::default();
    let payload = b"Text before\x00null\x00bytes\x00after";

    let outcome = extract_bytes(payload, "text/plain", &settings).await;
    let extraction = match outcome {
        Ok(extraction) => extraction,
        Err(_) => panic!("Null bytes should be handled"),
    };

    let text = &extraction.content;
    assert!(!text.is_empty(), "Content should not be empty");
    assert!(
        extraction.chunks.is_none(),
        "Chunks should be None without chunking config"
    );

    let keeps_some_text = text.contains("Text before") || text.contains("after");
    assert!(keeps_some_text, "Should preserve at least some of the text content");
}
/// Ten extractions of the same payload are spawned as separate tokio tasks;
/// every task must complete and return the expected content, with no chunks
/// or detected languages under the default config.
#[tokio::test]
async fn test_concurrent_extractions() {
    let settings = ExtractionConfig::default();
    let payload = b"Concurrent extraction test content.";

    // Spawn every task up front so they are all in flight before any is awaited.
    let mut tasks = Vec::with_capacity(10);
    for _ in 0..10 {
        let task_settings = settings.clone();
        tasks.push(tokio::spawn(async move {
            extract_bytes(payload, "text/plain", &task_settings).await
        }));
    }

    for task in tasks {
        let outcome = task.await.expect("Task should complete");
        let extraction = match outcome {
            Ok(extraction) => extraction,
            Err(_) => panic!("Concurrent extraction should succeed"),
        };

        assert!(
            extraction.content.contains("Concurrent extraction"),
            "Content should be extracted correctly"
        );
        assert!(extraction.chunks.is_none(), "Chunks should be None");
        assert!(
            extraction.detected_languages.is_none(),
            "Language detection not enabled"
        );
    }
}