This commit is contained in:
417
crates/kreuzberg/tests/security_validation.rs
Normal file
417
crates/kreuzberg/tests/security_validation.rs
Normal file
@@ -0,0 +1,417 @@
|
||||
//! Security validation tests.
|
||||
//!
|
||||
//! Tests the system's resilience against malicious inputs including:
|
||||
//! - Archive attacks (zip bombs, path traversal)
|
||||
//! - XML attacks (billion laughs, XXE)
|
||||
//! - Resource exhaustion (large files, memory limits)
|
||||
//! - Malformed inputs (invalid MIME, encoding)
|
||||
//! - PDF-specific attacks (malicious JS, weak encryption)
|
||||
|
||||
use kreuzberg::core::config::ExtractionConfig;
|
||||
use kreuzberg::core::extractor::{extract_bytes_sync, extract_file_sync};
|
||||
use std::io::Write;
|
||||
use tempfile::NamedTempFile;
|
||||
|
||||
fn trim_trailing_newlines(value: &str) -> &str {
|
||||
value.trim_end_matches(['\n', '\r'])
|
||||
}
|
||||
|
||||
fn assert_text_content(actual: &str, expected: &str) {
|
||||
assert_eq!(
|
||||
trim_trailing_newlines(actual),
|
||||
expected,
|
||||
"Content mismatch after trimming trailing newlines"
|
||||
);
|
||||
}
|
||||
#[test]
|
||||
fn test_archive_zip_bomb_detection() {
|
||||
let mut cursor = std::io::Cursor::new(Vec::new());
|
||||
{
|
||||
use zip::write::{FileOptions, ZipWriter};
|
||||
let mut zip = ZipWriter::new(&mut cursor);
|
||||
let options = FileOptions::<'_, ()>::default();
|
||||
|
||||
zip.start_file("large.txt", options).expect("Operation failed");
|
||||
let zeros = vec![0u8; 10 * 1024 * 1024];
|
||||
zip.write_all(&zeros).expect("Operation failed");
|
||||
|
||||
zip.finish().expect("Operation failed");
|
||||
}
|
||||
|
||||
let bytes = cursor.into_inner();
|
||||
let config = ExtractionConfig::default();
|
||||
|
||||
let result = extract_bytes_sync(&bytes, "application/zip", &config);
|
||||
|
||||
assert!(result.is_ok() || result.is_err());
|
||||
if let Ok(extracted) = result {
|
||||
assert!(extracted.metadata.format.is_some());
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_archive_path_traversal_zip() {
|
||||
let mut cursor = std::io::Cursor::new(Vec::new());
|
||||
{
|
||||
use zip::write::{FileOptions, ZipWriter};
|
||||
let mut zip = ZipWriter::new(&mut cursor);
|
||||
let options = FileOptions::<'_, ()>::default();
|
||||
|
||||
zip.start_file("../../etc/passwd", options).expect("Operation failed");
|
||||
zip.write_all(b"malicious content").expect("Operation failed");
|
||||
|
||||
zip.finish().expect("Operation failed");
|
||||
}
|
||||
|
||||
let bytes = cursor.into_inner();
|
||||
let config = ExtractionConfig::default();
|
||||
|
||||
let result = extract_bytes_sync(&bytes, "application/zip", &config);
|
||||
|
||||
if let Ok(extracted) = result
|
||||
&& let Some(archive_meta) = &extracted.metadata.format.as_ref().and_then(|f| match f {
|
||||
kreuzberg::FormatMetadata::Archive(m) => Some(m),
|
||||
_ => None,
|
||||
})
|
||||
{
|
||||
for file_path in &archive_meta.file_list {
|
||||
assert!(!file_path.starts_with('/'), "Absolute paths should be rejected");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_archive_path_traversal_tar() {
|
||||
let mut header = tar::Header::new_gnu();
|
||||
|
||||
let result = header.set_path("../../etc/shadow");
|
||||
|
||||
assert!(result.is_err(), "TAR library should reject path traversal attempts");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_archive_absolute_paths_rejected() {
|
||||
let mut cursor = std::io::Cursor::new(Vec::new());
|
||||
{
|
||||
use zip::write::{FileOptions, ZipWriter};
|
||||
let mut zip = ZipWriter::new(&mut cursor);
|
||||
let options = FileOptions::<'_, ()>::default();
|
||||
|
||||
zip.start_file("/tmp/malicious.txt", options).expect("Operation failed");
|
||||
zip.write_all(b"malicious content").expect("Operation failed");
|
||||
|
||||
zip.finish().expect("Operation failed");
|
||||
}
|
||||
|
||||
let bytes = cursor.into_inner();
|
||||
let config = ExtractionConfig::default();
|
||||
|
||||
let result = extract_bytes_sync(&bytes, "application/zip", &config);
|
||||
|
||||
assert!(
|
||||
result.is_ok() || result.is_err(),
|
||||
"Should handle absolute paths gracefully"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_archive_deeply_nested_directories() {
|
||||
let mut cursor = std::io::Cursor::new(Vec::new());
|
||||
{
|
||||
use zip::write::{FileOptions, ZipWriter};
|
||||
let mut zip = ZipWriter::new(&mut cursor);
|
||||
let options = FileOptions::<'_, ()>::default();
|
||||
|
||||
let deep_path = (0..100).map(|i| format!("dir{}", i)).collect::<Vec<_>>().join("/");
|
||||
let file_path = format!("{}/file.txt", deep_path);
|
||||
|
||||
zip.start_file(&file_path, options).expect("Operation failed");
|
||||
zip.write_all(b"deep content").expect("Operation failed");
|
||||
|
||||
zip.finish().expect("Operation failed");
|
||||
}
|
||||
|
||||
let bytes = cursor.into_inner();
|
||||
let config = ExtractionConfig::default();
|
||||
|
||||
let result = extract_bytes_sync(&bytes, "application/zip", &config);
|
||||
|
||||
assert!(result.is_ok() || result.is_err());
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[cfg(feature = "archives")]
|
||||
fn test_archive_many_small_files() {
|
||||
let mut cursor = std::io::Cursor::new(Vec::new());
|
||||
{
|
||||
use zip::write::{FileOptions, ZipWriter};
|
||||
let mut zip = ZipWriter::new(&mut cursor);
|
||||
let options = FileOptions::<'_, ()>::default();
|
||||
|
||||
for i in 0..1000 {
|
||||
zip.start_file(format!("file{}.txt", i), options)
|
||||
.expect("Operation failed");
|
||||
zip.write_all(b"small content").expect("Operation failed");
|
||||
}
|
||||
|
||||
zip.finish().expect("Operation failed");
|
||||
}
|
||||
|
||||
let bytes = cursor.into_inner();
|
||||
let config = ExtractionConfig::default();
|
||||
|
||||
let result = extract_bytes_sync(&bytes, "application/zip", &config);
|
||||
|
||||
assert!(result.is_ok());
|
||||
if let Ok(extracted) = result {
|
||||
assert!(extracted.metadata.format.is_some());
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_xml_billion_laughs_attack() {
|
||||
let xml = r#"<?xml version="1.0"?>
|
||||
<!DOCTYPE lolz [
|
||||
<!ENTITY lol "lol">
|
||||
<!ENTITY lol1 "&lol;&lol;&lol;&lol;&lol;&lol;&lol;&lol;&lol;&lol;">
|
||||
<!ENTITY lol2 "&lol1;&lol1;&lol1;&lol1;&lol1;&lol1;&lol1;&lol1;&lol1;&lol1;">
|
||||
<!ENTITY lol3 "&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;">
|
||||
]>
|
||||
<lolz>&lol3;</lolz>"#;
|
||||
|
||||
let config = ExtractionConfig::default();
|
||||
let result = extract_bytes_sync(xml.as_bytes(), "application/xml", &config);
|
||||
|
||||
assert!(result.is_ok() || result.is_err());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_xml_quadratic_blowup() {
|
||||
let xml = r#"<?xml version="1.0"?>
|
||||
<!DOCTYPE bomb [
|
||||
<!ENTITY a "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa">
|
||||
]>
|
||||
<bomb>&a;&a;&a;&a;&a;&a;&a;&a;&a;&a;&a;&a;&a;&a;&a;&a;&a;&a;&a;&a;</bomb>"#;
|
||||
|
||||
let config = ExtractionConfig::default();
|
||||
let result = extract_bytes_sync(xml.as_bytes(), "application/xml", &config);
|
||||
|
||||
assert!(result.is_ok() || result.is_err());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_xml_external_entity_injection() {
|
||||
let xml = r#"<?xml version="1.0"?>
|
||||
<!DOCTYPE foo [
|
||||
<!ENTITY xxe SYSTEM "file:///etc/passwd">
|
||||
]>
|
||||
<foo>&xxe;</foo>"#;
|
||||
|
||||
let config = ExtractionConfig::default();
|
||||
let result = extract_bytes_sync(xml.as_bytes(), "application/xml", &config);
|
||||
|
||||
if let Ok(extracted) = result {
|
||||
assert!(!extracted.content.contains("root:"));
|
||||
assert!(!extracted.content.contains("/bin/bash"));
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_xml_dtd_entity_expansion() {
|
||||
let xml = r#"<?xml version="1.0"?>
|
||||
<!DOCTYPE data [
|
||||
<!ENTITY large "THIS_IS_A_LARGE_STRING_REPEATED_MANY_TIMES">
|
||||
]>
|
||||
<data>&large;&large;&large;&large;&large;&large;&large;&large;</data>"#;
|
||||
|
||||
let config = ExtractionConfig::default();
|
||||
let result = extract_bytes_sync(xml.as_bytes(), "application/xml", &config);
|
||||
|
||||
assert!(result.is_ok() || result.is_err());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_resource_large_text_file() {
|
||||
let large_text = "This is a line of text that will be repeated many times.\n".repeat(200_000);
|
||||
|
||||
let config = ExtractionConfig::default();
|
||||
let result = extract_bytes_sync(large_text.as_bytes(), "text/plain", &config);
|
||||
|
||||
assert!(result.is_ok());
|
||||
if let Ok(extracted) = result {
|
||||
assert!(!extracted.content.is_empty());
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_resource_large_xml_streaming() {
|
||||
let mut xml = String::from(r#"<?xml version="1.0"?><root>"#);
|
||||
for i in 0..10000 {
|
||||
xml.push_str(&format!("<item id=\"{}\">{}</item>", i, "x".repeat(100)));
|
||||
}
|
||||
xml.push_str("</root>");
|
||||
|
||||
let config = ExtractionConfig::default();
|
||||
let result = extract_bytes_sync(xml.as_bytes(), "application/xml", &config);
|
||||
|
||||
assert!(result.is_ok() || result.is_err());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_resource_empty_file() {
|
||||
let empty = b"";
|
||||
|
||||
let config = ExtractionConfig::default();
|
||||
let result = extract_bytes_sync(empty, "text/plain", &config);
|
||||
|
||||
assert!(result.is_ok());
|
||||
if let Ok(extracted) = result {
|
||||
assert!(extracted.content.is_empty());
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_resource_single_byte_file() {
|
||||
let single_byte = b"a";
|
||||
|
||||
let config = ExtractionConfig::default();
|
||||
let result = extract_bytes_sync(single_byte, "text/plain", &config);
|
||||
|
||||
assert!(result.is_ok());
|
||||
if let Ok(extracted) = result {
|
||||
assert_text_content(&extracted.content, "a");
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_resource_null_bytes() {
|
||||
let null_bytes = b"Hello\x00World\x00Test\x00";
|
||||
|
||||
let config = ExtractionConfig::default();
|
||||
let result = extract_bytes_sync(null_bytes, "text/plain", &config);
|
||||
|
||||
assert!(result.is_ok());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_malformed_invalid_mime_type() {
|
||||
let content = b"Some content";
|
||||
|
||||
let config = ExtractionConfig::default();
|
||||
let result = extract_bytes_sync(content, "invalid/mime/type", &config);
|
||||
|
||||
assert!(result.is_err());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_malformed_xml_structure() {
|
||||
let malformed_xml = r#"<?xml version="1.0"?><root><item>test</item>"#;
|
||||
|
||||
let config = ExtractionConfig::default();
|
||||
let result = extract_bytes_sync(malformed_xml.as_bytes(), "application/xml", &config);
|
||||
|
||||
assert!(result.is_ok() || result.is_err());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_malformed_zip_structure() {
|
||||
let corrupt_zip = b"PK\x03\x04CORRUPTED_DATA";
|
||||
|
||||
let config = ExtractionConfig::default();
|
||||
let result = extract_bytes_sync(corrupt_zip, "application/zip", &config);
|
||||
|
||||
assert!(result.is_err());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_malformed_invalid_utf8() {
|
||||
let invalid_utf8 = b"Hello \xFF\xFE World";
|
||||
|
||||
let config = ExtractionConfig::default();
|
||||
let result = extract_bytes_sync(invalid_utf8, "text/plain", &config);
|
||||
|
||||
assert!(result.is_ok() || result.is_err());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_malformed_mixed_line_endings() {
|
||||
let mixed_endings = b"Line 1\r\nLine 2\nLine 3\rLine 4";
|
||||
|
||||
let config = ExtractionConfig::default();
|
||||
let result = extract_bytes_sync(mixed_endings, "text/plain", &config);
|
||||
|
||||
assert!(result.is_ok());
|
||||
if let Ok(extracted) = result {
|
||||
assert!(extracted.content.contains("Line 1"));
|
||||
assert!(extracted.content.contains("Line 2"));
|
||||
assert!(extracted.content.contains("Line 3"));
|
||||
assert!(extracted.content.contains("Line 4"));
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_pdf_minimal_valid() {
|
||||
let minimal_pdf = b"%PDF-1.4
|
||||
This is a very minimal PDF structure for security testing.
|
||||
%%EOF";
|
||||
|
||||
let config = ExtractionConfig::default();
|
||||
let result = extract_bytes_sync(minimal_pdf, "application/pdf", &config);
|
||||
|
||||
assert!(result.is_ok() || result.is_err());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_pdf_malformed_header() {
|
||||
let malformed_pdf = b"%PDF-INVALID
|
||||
This is not a valid PDF structure";
|
||||
|
||||
let config = ExtractionConfig::default();
|
||||
let result = extract_bytes_sync(malformed_pdf, "application/pdf", &config);
|
||||
|
||||
assert!(result.is_err());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_pdf_truncated() {
|
||||
let truncated_pdf = b"%PDF-1.4
|
||||
1 0 obj
|
||||
<<
|
||||
/Type /Catalog
|
||||
>>
|
||||
endobj";
|
||||
|
||||
let config = ExtractionConfig::default();
|
||||
let result = extract_bytes_sync(truncated_pdf, "application/pdf", &config);
|
||||
|
||||
assert!(result.is_err() || result.is_ok());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_security_nonexistent_file() {
|
||||
let config = ExtractionConfig::default();
|
||||
let result = extract_file_sync("/nonexistent/path/to/file.txt", None, &config);
|
||||
|
||||
assert!(result.is_err());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_security_directory_instead_of_file() {
|
||||
let config = ExtractionConfig::default();
|
||||
let result = extract_file_sync("/tmp", None, &config);
|
||||
|
||||
assert!(result.is_err());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_security_special_file_handling() {
|
||||
let mut tmpfile = NamedTempFile::new().expect("Operation failed");
|
||||
tmpfile.write_all(b"test content").expect("Operation failed");
|
||||
tmpfile.flush().expect("Operation failed");
|
||||
let path = tmpfile.path();
|
||||
|
||||
let config = ExtractionConfig::default();
|
||||
let result = extract_file_sync(path.to_str().expect("Operation failed"), None, &config);
|
||||
|
||||
assert!(result.is_ok() || result.is_err());
|
||||
}
|
||||
Reference in New Issue
Block a user