//! Security validation tests. //! //! Tests the system's resilience against malicious inputs including: //! - Archive attacks (zip bombs, path traversal) //! - XML attacks (billion laughs, XXE) //! - Resource exhaustion (large files, memory limits) //! - Malformed inputs (invalid MIME, encoding) //! - PDF-specific attacks (malicious JS, weak encryption) use kreuzberg::core::config::ExtractionConfig; use kreuzberg::core::extractor::{extract_bytes_sync, extract_file_sync}; use std::io::Write; use tempfile::NamedTempFile; fn trim_trailing_newlines(value: &str) -> &str { value.trim_end_matches(['\n', '\r']) } fn assert_text_content(actual: &str, expected: &str) { assert_eq!( trim_trailing_newlines(actual), expected, "Content mismatch after trimming trailing newlines" ); } #[test] fn test_archive_zip_bomb_detection() { let mut cursor = std::io::Cursor::new(Vec::new()); { use zip::write::{FileOptions, ZipWriter}; let mut zip = ZipWriter::new(&mut cursor); let options = FileOptions::<'_, ()>::default(); zip.start_file("large.txt", options).expect("Operation failed"); let zeros = vec![0u8; 10 * 1024 * 1024]; zip.write_all(&zeros).expect("Operation failed"); zip.finish().expect("Operation failed"); } let bytes = cursor.into_inner(); let config = ExtractionConfig::default(); let result = extract_bytes_sync(&bytes, "application/zip", &config); assert!(result.is_ok() || result.is_err()); if let Ok(extracted) = result { assert!(extracted.metadata.format.is_some()); } } #[test] fn test_archive_path_traversal_zip() { let mut cursor = std::io::Cursor::new(Vec::new()); { use zip::write::{FileOptions, ZipWriter}; let mut zip = ZipWriter::new(&mut cursor); let options = FileOptions::<'_, ()>::default(); zip.start_file("../../etc/passwd", options).expect("Operation failed"); zip.write_all(b"malicious content").expect("Operation failed"); zip.finish().expect("Operation failed"); } let bytes = cursor.into_inner(); let config = ExtractionConfig::default(); let result = 
extract_bytes_sync(&bytes, "application/zip", &config); if let Ok(extracted) = result && let Some(archive_meta) = &extracted.metadata.format.as_ref().and_then(|f| match f { kreuzberg::FormatMetadata::Archive(m) => Some(m), _ => None, }) { for file_path in &archive_meta.file_list { assert!(!file_path.starts_with('/'), "Absolute paths should be rejected"); } } } #[test] fn test_archive_path_traversal_tar() { let mut header = tar::Header::new_gnu(); let result = header.set_path("../../etc/shadow"); assert!(result.is_err(), "TAR library should reject path traversal attempts"); } #[test] fn test_archive_absolute_paths_rejected() { let mut cursor = std::io::Cursor::new(Vec::new()); { use zip::write::{FileOptions, ZipWriter}; let mut zip = ZipWriter::new(&mut cursor); let options = FileOptions::<'_, ()>::default(); zip.start_file("/tmp/malicious.txt", options).expect("Operation failed"); zip.write_all(b"malicious content").expect("Operation failed"); zip.finish().expect("Operation failed"); } let bytes = cursor.into_inner(); let config = ExtractionConfig::default(); let result = extract_bytes_sync(&bytes, "application/zip", &config); assert!( result.is_ok() || result.is_err(), "Should handle absolute paths gracefully" ); } #[test] fn test_archive_deeply_nested_directories() { let mut cursor = std::io::Cursor::new(Vec::new()); { use zip::write::{FileOptions, ZipWriter}; let mut zip = ZipWriter::new(&mut cursor); let options = FileOptions::<'_, ()>::default(); let deep_path = (0..100).map(|i| format!("dir{}", i)).collect::>().join("/"); let file_path = format!("{}/file.txt", deep_path); zip.start_file(&file_path, options).expect("Operation failed"); zip.write_all(b"deep content").expect("Operation failed"); zip.finish().expect("Operation failed"); } let bytes = cursor.into_inner(); let config = ExtractionConfig::default(); let result = extract_bytes_sync(&bytes, "application/zip", &config); assert!(result.is_ok() || result.is_err()); } #[test] #[cfg(feature = "archives")] 
fn test_archive_many_small_files() { let mut cursor = std::io::Cursor::new(Vec::new()); { use zip::write::{FileOptions, ZipWriter}; let mut zip = ZipWriter::new(&mut cursor); let options = FileOptions::<'_, ()>::default(); for i in 0..1000 { zip.start_file(format!("file{}.txt", i), options) .expect("Operation failed"); zip.write_all(b"small content").expect("Operation failed"); } zip.finish().expect("Operation failed"); } let bytes = cursor.into_inner(); let config = ExtractionConfig::default(); let result = extract_bytes_sync(&bytes, "application/zip", &config); assert!(result.is_ok()); if let Ok(extracted) = result { assert!(extracted.metadata.format.is_some()); } } #[test] fn test_xml_billion_laughs_attack() { let xml = r#" ]> &lol3;"#; let config = ExtractionConfig::default(); let result = extract_bytes_sync(xml.as_bytes(), "application/xml", &config); assert!(result.is_ok() || result.is_err()); } #[test] fn test_xml_quadratic_blowup() { let xml = r#" ]> &a;&a;&a;&a;&a;&a;&a;&a;&a;&a;&a;&a;&a;&a;&a;&a;&a;&a;&a;&a;"#; let config = ExtractionConfig::default(); let result = extract_bytes_sync(xml.as_bytes(), "application/xml", &config); assert!(result.is_ok() || result.is_err()); } #[test] fn test_xml_external_entity_injection() { let xml = r#" ]> &xxe;"#; let config = ExtractionConfig::default(); let result = extract_bytes_sync(xml.as_bytes(), "application/xml", &config); if let Ok(extracted) = result { assert!(!extracted.content.contains("root:")); assert!(!extracted.content.contains("/bin/bash")); } } #[test] fn test_xml_dtd_entity_expansion() { let xml = r#" ]> &large;&large;&large;&large;&large;&large;&large;&large;"#; let config = ExtractionConfig::default(); let result = extract_bytes_sync(xml.as_bytes(), "application/xml", &config); assert!(result.is_ok() || result.is_err()); } #[test] fn test_resource_large_text_file() { let large_text = "This is a line of text that will be repeated many times.\n".repeat(200_000); let config = ExtractionConfig::default(); 
let result = extract_bytes_sync(large_text.as_bytes(), "text/plain", &config); assert!(result.is_ok()); if let Ok(extracted) = result { assert!(!extracted.content.is_empty()); } } #[test] fn test_resource_large_xml_streaming() { let mut xml = String::from(r#""#); for i in 0..10000 { xml.push_str(&format!("{}", i, "x".repeat(100))); } xml.push_str(""); let config = ExtractionConfig::default(); let result = extract_bytes_sync(xml.as_bytes(), "application/xml", &config); assert!(result.is_ok() || result.is_err()); } #[test] fn test_resource_empty_file() { let empty = b""; let config = ExtractionConfig::default(); let result = extract_bytes_sync(empty, "text/plain", &config); assert!(result.is_ok()); if let Ok(extracted) = result { assert!(extracted.content.is_empty()); } } #[test] fn test_resource_single_byte_file() { let single_byte = b"a"; let config = ExtractionConfig::default(); let result = extract_bytes_sync(single_byte, "text/plain", &config); assert!(result.is_ok()); if let Ok(extracted) = result { assert_text_content(&extracted.content, "a"); } } #[test] fn test_resource_null_bytes() { let null_bytes = b"Hello\x00World\x00Test\x00"; let config = ExtractionConfig::default(); let result = extract_bytes_sync(null_bytes, "text/plain", &config); assert!(result.is_ok()); } #[test] fn test_malformed_invalid_mime_type() { let content = b"Some content"; let config = ExtractionConfig::default(); let result = extract_bytes_sync(content, "invalid/mime/type", &config); assert!(result.is_err()); } #[test] fn test_malformed_xml_structure() { let malformed_xml = r#"test"#; let config = ExtractionConfig::default(); let result = extract_bytes_sync(malformed_xml.as_bytes(), "application/xml", &config); assert!(result.is_ok() || result.is_err()); } #[test] fn test_malformed_zip_structure() { let corrupt_zip = b"PK\x03\x04CORRUPTED_DATA"; let config = ExtractionConfig::default(); let result = extract_bytes_sync(corrupt_zip, "application/zip", &config); assert!(result.is_err()); } 
/// Feeds the extractor plain text containing the invalid UTF-8 bytes `0xFF 0xFE`.
#[test]
fn test_malformed_invalid_utf8() {
    let invalid_utf8 = b"Hello \xFF\xFE World";

    let config = ExtractionConfig::default();
    let result = extract_bytes_sync(invalid_utf8, "text/plain", &config);

    // NOTE(review): this assertion is always true; the test effectively only
    // checks that extraction returns instead of panicking.
    assert!(result.is_ok() || result.is_err());
}

/// Extracts plain text mixing `\r\n`, `\n` and `\r` terminators and requires
/// all four lines to appear in the content.
#[test]
fn test_malformed_mixed_line_endings() {
    let mixed_endings = b"Line 1\r\nLine 2\nLine 3\rLine 4";

    let config = ExtractionConfig::default();
    let result = extract_bytes_sync(mixed_endings, "text/plain", &config);

    assert!(result.is_ok());
    if let Ok(extracted) = result {
        assert!(extracted.content.contains("Line 1"));
        assert!(extracted.content.contains("Line 2"));
        assert!(extracted.content.contains("Line 3"));
        assert!(extracted.content.contains("Line 4"));
    }
}

/// Feeds the extractor a byte string that has a PDF header and `%%EOF` marker
/// but no object structure.
#[test]
fn test_pdf_minimal_valid() {
    let minimal_pdf = b"%PDF-1.4 This is a very minimal PDF structure for security testing. %%EOF";

    let config = ExtractionConfig::default();
    let result = extract_bytes_sync(minimal_pdf, "application/pdf", &config);

    // NOTE(review): always true; only the absence of a panic is verified.
    assert!(result.is_ok() || result.is_err());
}

/// Requires extraction to fail for input whose PDF header is `%PDF-INVALID`.
#[test]
fn test_pdf_malformed_header() {
    let malformed_pdf = b"%PDF-INVALID This is not a valid PDF structure";

    let config = ExtractionConfig::default();
    let result = extract_bytes_sync(malformed_pdf, "application/pdf", &config);

    assert!(result.is_err());
}

/// Feeds the extractor a PDF that ends right after its first object.
#[test]
fn test_pdf_truncated() {
    let truncated_pdf = b"%PDF-1.4 1 0 obj << /Type /Catalog >> endobj";

    let config = ExtractionConfig::default();
    let result = extract_bytes_sync(truncated_pdf, "application/pdf", &config);

    // NOTE(review): always true; only the absence of a panic is verified.
    assert!(result.is_err() || result.is_ok());
}

/// Requires file extraction to fail for a path that does not exist.
#[test]
fn test_security_nonexistent_file() {
    let config = ExtractionConfig::default();
    let result = extract_file_sync("/nonexistent/path/to/file.txt", None, &config);

    assert!(result.is_err());
}

/// Requires file extraction to fail when the path points at a directory.
#[test]
fn test_security_directory_instead_of_file() {
    // A freshly created temporary directory is guaranteed to exist and to be a
    // directory on every platform, so the failure really is the
    // "directory, not file" case rather than a missing path.
    let dir = tempfile::tempdir().expect("failed to create temporary directory");
    let dir_path = dir
        .path()
        .to_str()
        .expect("temporary directory path should be valid UTF-8");

    let config = ExtractionConfig::default();
    let result = extract_file_sync(dir_path, None, &config);

    assert!(result.is_err());
}

/// Writes a small temporary file and runs file extraction on its path.
#[test]
fn test_security_special_file_handling() {
    let mut tmpfile = NamedTempFile::new().expect("Operation failed");
    tmpfile.write_all(b"test content").expect("Operation failed");
    // Flush so the bytes are on disk before the extractor opens the path.
    tmpfile.flush().expect("Operation failed");
    let path = tmpfile.path();

    let config = ExtractionConfig::default();
    let result = extract_file_sync(path.to_str().expect("Operation failed"), None, &config);

    // NOTE(review): always true; only the absence of a panic is verified.
    assert!(result.is_ok() || result.is_err());
}