//! Error handling and edge case integration tests. //! //! Tests for corrupted files, edge cases, and invalid inputs. //! Validates that the system handles errors gracefully without panics. use kreuzberg::core::config::ExtractionConfig; use kreuzberg::core::extractor::{extract_bytes, extract_file}; use std::io::Write; use tempfile::NamedTempFile; mod helpers; /// Test truncated PDF - incomplete PDF file. #[tokio::test] #[cfg(feature = "pdf")] async fn test_truncated_pdf() { let config = ExtractionConfig::default(); let truncated_pdf = b"%PDF-1.4\n1 0 obj\n<<"; let result = extract_bytes(truncated_pdf, "application/pdf", &config).await; assert!(result.is_err(), "Truncated PDF should fail gracefully"); let error = result.unwrap_err(); assert!( matches!(error, kreuzberg::KreuzbergError::Parsing { .. }), "Truncated PDF should produce Parsing error, got: {:?}", error ); } /// Test corrupted ZIP - malformed archive. #[tokio::test] #[cfg(feature = "archives")] async fn test_corrupted_zip() { let config = ExtractionConfig::default(); let corrupted_zip = vec![0x50, 0x4B, 0x03, 0x04, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00]; let result = extract_bytes(&corrupted_zip, "application/zip", &config).await; assert!(result.is_err(), "Corrupted ZIP should fail gracefully"); let error = result.unwrap_err(); assert!( matches!(error, kreuzberg::KreuzbergError::Parsing { .. }), "Corrupted ZIP should produce Parsing error, got: {:?}", error ); } /// Test invalid XML - bad XML syntax. #[tokio::test] #[cfg(feature = "xml")] async fn test_invalid_xml() { let config = ExtractionConfig::default(); let invalid_xml = b"\n\ \n\ \n\ text\n\ { assert!( extraction.chunks.is_none(), "Chunks should be None without chunking config" ); } Err(error) => { assert!( matches!(error, kreuzberg::KreuzbergError::Parsing { .. }), "Invalid XML error should be Parsing type, got: {:?}", error ); } } } /// Test corrupted image - invalid image data. #[tokio::test] #[cfg(feature = "ocr")] async fn test_corrupted_image() { let config = ExtractionConfig::default(); let corrupted_png = vec![0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A, 0xFF, 0xFF, 0xFF, 0xFF]; let result = extract_bytes(&corrupted_png, "image/png", &config).await; match result { Ok(extraction) => { assert!( extraction.chunks.is_none(), "Chunks should be None without chunking config" ); } Err(error) => { assert!( matches!(error, kreuzberg::KreuzbergError::Parsing { .. }) || matches!(error, kreuzberg::KreuzbergError::Ocr { .. }), "Corrupted image error should be Parsing or OCR type, got: {:?}", error ); } } } /// Test empty file - 0 bytes. #[tokio::test] async fn test_empty_file() { let config = ExtractionConfig::default(); let empty_data = b""; let result_text = extract_bytes(empty_data, "text/plain", &config).await; #[cfg(feature = "pdf")] { let result_pdf = extract_bytes(empty_data, "application/pdf", &config).await; match result_pdf { Ok(extraction) => { assert!( extraction.content.is_empty(), "Empty PDF should have empty content if it succeeds" ); assert!(extraction.chunks.is_none(), "Chunks should be None"); } Err(error) => { assert!( matches!( error, kreuzberg::KreuzbergError::Parsing { .. } | kreuzberg::KreuzbergError::Validation { .. } ), "Empty PDF should produce Parsing or Validation error, got: {:?}", error ); } } } match result_text { Ok(extraction) => { assert!( extraction.content.is_empty(), "Empty text file should have empty content" ); assert!(extraction.chunks.is_none(), "Chunks should be None"); } Err(error) => { panic!("Empty text file should not fail, got error: {:?}", error); } } #[cfg(feature = "xml")] { let result_xml = extract_bytes(empty_data, "application/xml", &config).await; match result_xml { Ok(extraction) => { assert!( extraction.content.is_empty(), "Empty XML should have empty content if it succeeds" ); assert!(extraction.chunks.is_none(), "Chunks should be None"); } Err(error) => { assert!( matches!(error, kreuzberg::KreuzbergError::Parsing { .. }), "Empty XML error should be Parsing type, got: {:?}", error ); } } } } /// Test very large file - stress test with large content. #[tokio::test] async fn test_very_large_file() { let config = ExtractionConfig::default(); let large_text = "This is a line of text that will be repeated many times.\n".repeat(200_000); let large_bytes = large_text.as_bytes(); let result = extract_bytes(large_bytes, "text/plain", &config).await; assert!(result.is_ok(), "Large file should be processed successfully"); let extraction = result.expect("Operation failed"); assert!(!extraction.content.is_empty(), "Large file content should not be empty"); assert!(extraction.content.len() > 1_000_000, "Content should be large"); assert!( extraction.chunks.is_none(), "Chunks should be None without chunking config" ); assert!( extraction.detected_languages.is_none(), "Language detection not enabled" ); assert!(extraction.tables.is_empty(), "Text file should not have tables"); assert!( extraction.content.contains("This is a line of text"), "Content should preserve original text" ); } /// Test unicode filenames - non-ASCII paths. #[tokio::test] async fn test_unicode_filenames() { let config = ExtractionConfig::default(); let mut temp_file = NamedTempFile::new().expect("Should create temp file"); temp_file .write_all(b"Test content with Unicode filename.") .expect("Operation failed"); let result = extract_file(temp_file.path(), Some("text/plain"), &config).await; assert!(result.is_ok(), "Unicode filename should be handled"); let extraction = result.expect("Operation failed"); assert!( extraction.content.contains("Test content"), "Content should be extracted" ); assert!( extraction.chunks.is_none(), "Chunks should be None without chunking config" ); assert!( extraction.detected_languages.is_none(), "Language detection not enabled" ); } /// Test special characters in content - emojis, RTL text. #[tokio::test] async fn test_special_characters_content() { let config = ExtractionConfig::default(); let special_text = "Emojis: 🎉 🚀 ✅ 🌍\n\ Arabic (RTL): مرحبا بالعالم\n\ Chinese: 你好世界\n\ Japanese: こんにちは世界\n\ Special chars: © ® ™ € £ ¥\n\ Math symbols: ∑ ∫ √ ≈ ∞"; let result = extract_bytes(special_text.as_bytes(), "text/plain", &config).await; assert!(result.is_ok(), "Special characters should be handled"); let extraction = result.expect("Operation failed"); assert!(!extraction.content.is_empty(), "Content should not be empty"); assert!(extraction.content.len() > 10, "Should have substantial content"); assert!( extraction.chunks.is_none(), "Chunks should be None without chunking config" ); assert!( extraction.detected_languages.is_none(), "Language detection not enabled" ); assert!( extraction.content.contains("Emojis") || extraction.content.contains("Arabic") || extraction.content.contains("Chinese"), "Should preserve at least some special character text" ); } /// Test nonexistent file - file not found. #[tokio::test] async fn test_nonexistent_file() { let config = ExtractionConfig::default(); let nonexistent_path = "/nonexistent/path/to/file.pdf"; let result = extract_file(nonexistent_path, Some("application/pdf"), &config).await; assert!(result.is_err(), "Nonexistent file should return error"); let error = result.unwrap_err(); assert!( matches!(error, kreuzberg::KreuzbergError::Io(_)) || matches!(error, kreuzberg::KreuzbergError::Validation { .. }), "Should be IO or Validation error for nonexistent file, got: {:?}", error ); } /// Test unsupported format - unknown file type. #[tokio::test] async fn test_unsupported_format() { let config = ExtractionConfig::default(); let data = b"Some random data"; let result = extract_bytes(data, "application/x-unknown-format", &config).await; assert!(result.is_err(), "Unsupported format should return error"); let error = result.unwrap_err(); assert!( matches!(error, kreuzberg::KreuzbergError::UnsupportedFormat(_)), "Should be UnsupportedFormat error, got: {:?}", error ); } /// Test permission denied - no read access (platform-specific). #[tokio::test] #[cfg(unix)] async fn test_permission_denied() { use std::fs; use std::os::unix::fs::PermissionsExt; let config = ExtractionConfig::default(); let mut temp_file = NamedTempFile::new().expect("Should create temp file"); temp_file.write_all(b"Test content").expect("Operation failed"); let mut perms = fs::metadata(temp_file.path()).expect("Operation failed").permissions(); perms.set_mode(0o000); fs::set_permissions(temp_file.path(), perms).expect("Operation failed"); let result = extract_file(temp_file.path(), Some("text/plain"), &config).await; let mut perms = fs::metadata(temp_file.path()).expect("Operation failed").permissions(); perms.set_mode(0o644); fs::set_permissions(temp_file.path(), perms).expect("Operation failed"); assert!(result.is_err(), "Permission denied should return error"); } /// Test file extension mismatch - .pdf extension with DOCX content. #[tokio::test] async fn test_file_extension_mismatch() { let config = ExtractionConfig::default(); let docx_magic = vec![0x50, 0x4B, 0x03, 0x04, 0x14, 0x00, 0x00, 0x00]; let result = extract_bytes(&docx_magic, "application/pdf", &config).await; assert!(result.is_err(), "MIME type mismatch should fail"); } /// Test extraction with null bytes in content. #[tokio::test] async fn test_null_bytes_in_content() { let config = ExtractionConfig::default(); let data_with_nulls = b"Text before\x00null\x00bytes\x00after"; let result = extract_bytes(data_with_nulls, "text/plain", &config).await; assert!(result.is_ok(), "Null bytes should be handled"); let extraction = result.expect("Operation failed"); assert!(!extraction.content.is_empty(), "Content should not be empty"); assert!( extraction.chunks.is_none(), "Chunks should be None without chunking config" ); assert!( extraction.content.contains("Text before") || extraction.content.contains("after"), "Should preserve at least some of the text content" ); } /// Test concurrent extractions of same file. #[tokio::test] async fn test_concurrent_extractions() { let config = ExtractionConfig::default(); let text_data = b"Concurrent extraction test content."; let handles: Vec<_> = (0..10) .map(|_| { let config = config.clone(); tokio::spawn(async move { extract_bytes(text_data, "text/plain", &config).await }) }) .collect(); for handle in handles { let result = handle.await.expect("Task should complete"); assert!(result.is_ok(), "Concurrent extraction should succeed"); let extraction = result.expect("Operation failed"); assert!( extraction.content.contains("Concurrent extraction"), "Content should be extracted correctly" ); assert!(extraction.chunks.is_none(), "Chunks should be None"); assert!( extraction.detected_languages.is_none(), "Language detection not enabled" ); } }