Files
fil/crates/kreuzberg/tests/archive_integration.rs

553 lines
21 KiB
Rust
Raw Normal View History

2026-06-01 23:40:55 +02:00
//! Archive extraction integration tests.
//!
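//! Tests for ZIP and TAR archive extraction. TAR.GZ is covered only through the
//! underlying uncompressed TAR path, and the 7z test is currently a placeholder.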
//! Validates metadata extraction, content extraction, nested archives, and error handling.
#![cfg(feature = "archives")]
use kreuzberg::core::config::ExtractionConfig;
use kreuzberg::core::extractor::{extract_bytes, extract_bytes_sync};
use std::io::{Cursor, Write};
use tar::Builder as TarBuilder;
use zip::write::{FileOptions, ZipWriter};
mod helpers;
/// Extracts a single-entry ZIP and verifies the rendered content plus the archive metadata.
#[tokio::test]
async fn test_zip_basic_extraction() {
    let cfg = ExtractionConfig::default();
    let archive = create_simple_zip();

    let extracted = extract_bytes(&archive, "application/zip", &cfg)
        .await
        .expect("Should extract ZIP successfully");

    // Top-level result fields.
    assert_eq!(extracted.mime_type, "application/zip");
    assert!(extracted.chunks.is_none(), "Chunks should be None without chunking config");
    assert!(extracted.detected_languages.is_none(), "Language detection not enabled");
    assert!(extracted.tables.is_empty(), "Archive should not have tables");

    // The text output must mention the archive kind, the entry name and the entry body.
    let text = &extracted.content;
    assert!(text.contains("ZIP Archive"));
    assert!(text.contains("test.txt"));
    assert!(text.contains("Hello from ZIP!"));

    // Format metadata must be present and be the Archive variant.
    assert!(extracted.metadata.format.is_some());
    let format = extracted.metadata.format.as_ref().expect("Operation failed");
    let kreuzberg::FormatMetadata::Archive(meta) = format else {
        panic!("Expected Archive metadata");
    };
    assert_eq!(meta.format, "ZIP");
    assert_eq!(meta.file_count, 1);
    assert_eq!(meta.file_list.len(), 1);
    assert_eq!(meta.file_list[0], "test.txt");
}
/// Packs three small files of different types into a ZIP and checks that every entry's
/// name and body appear in the output and that the metadata lists all three entries.
#[tokio::test]
async fn test_zip_multiple_files() {
    let cfg = ExtractionConfig::default();

    // Build the archive in memory; the writer borrows the buffer only inside this scope.
    let mut buffer = Cursor::new(Vec::new());
    {
        let mut writer = ZipWriter::new(&mut buffer);
        let opts = FileOptions::<'_, ()>::default();

        writer.start_file("file1.txt", opts).expect("Operation failed");
        writer.write_all(b"Content 1").expect("Operation failed");

        writer.start_file("file2.md", opts).expect("Operation failed");
        writer.write_all(b"# Content 2").expect("Operation failed");

        writer.start_file("file3.json", opts).expect("Operation failed");
        writer.write_all(b"{\"key\": \"value\"}").expect("Operation failed");

        writer.finish().expect("Operation failed");
    }
    let archive = buffer.into_inner();

    let extracted = extract_bytes(&archive, "application/zip", &cfg)
        .await
        .expect("Should extract multi-file ZIP");

    assert!(extracted.chunks.is_none(), "Chunks should be None without chunking config");
    assert!(extracted.detected_languages.is_none(), "Language detection not enabled");
    assert!(extracted.tables.is_empty(), "Archive should not have tables");

    // Entry names first, then a fragment of each entry's body.
    let text = &extracted.content;
    assert!(text.contains("file1.txt"));
    assert!(text.contains("file2.md"));
    assert!(text.contains("file3.json"));
    assert!(text.contains("Content 1"));
    assert!(text.contains("Content 2"));
    assert!(text.contains("value"));

    assert!(extracted.metadata.format.is_some());
    let format = extracted.metadata.format.as_ref().expect("Operation failed");
    let kreuzberg::FormatMetadata::Archive(meta) = format else {
        panic!("Expected Archive metadata");
    };
    assert_eq!(meta.file_count, 3, "Should have 3 files");
    assert_eq!(meta.file_list.len(), 3, "file_list should contain 3 entries");

    let listed = &meta.file_list;
    assert!(listed.iter().any(|name| name == "file1.txt"));
    assert!(listed.iter().any(|name| name == "file2.md"));
    assert!(listed.iter().any(|name| name == "file3.json"));
}
/// Builds a ZIP with explicit directory entries (`dir1/`, `dir1/subdir/`) and one file at
/// each level, then checks that paths and file bodies survive extraction.
#[tokio::test]
async fn test_zip_nested_directories() {
    let cfg = ExtractionConfig::default();

    let mut buffer = Cursor::new(Vec::new());
    {
        let mut writer = ZipWriter::new(&mut buffer);
        let opts = FileOptions::<'_, ()>::default();

        // Directory entries are added explicitly before the files that live in them.
        writer.add_directory("dir1/", opts).expect("Operation failed");
        writer.add_directory("dir1/subdir/", opts).expect("Operation failed");

        writer.start_file("dir1/file.txt", opts).expect("Operation failed");
        writer.write_all(b"File in dir1").expect("Operation failed");

        writer
            .start_file("dir1/subdir/nested.txt", opts)
            .expect("Operation failed");
        writer.write_all(b"Nested file").expect("Operation failed");

        writer.finish().expect("Operation failed");
    }
    let archive = buffer.into_inner();

    let extracted = extract_bytes(&archive, "application/zip", &cfg)
        .await
        .expect("Should extract nested ZIP");

    assert!(extracted.chunks.is_none(), "Chunks should be None without chunking config");
    assert!(extracted.detected_languages.is_none(), "Language detection not enabled");
    assert!(extracted.tables.is_empty(), "Archive should not have tables");

    let text = &extracted.content;
    assert!(text.contains("dir1/"));
    assert!(text.contains("dir1/file.txt"));
    assert!(text.contains("dir1/subdir/nested.txt"));
    assert!(text.contains("File in dir1"));
    assert!(text.contains("Nested file"));

    assert!(extracted.metadata.format.is_some());
    let format = extracted.metadata.format.as_ref().expect("Operation failed");
    let kreuzberg::FormatMetadata::Archive(meta) = format else {
        panic!("Expected Archive metadata");
    };

    // Only a lower bound is asserted: whether directory entries are counted is left open.
    assert!(
        meta.file_count >= 2,
        "Should have at least 2 files (excluding empty dirs)"
    );

    let has_entry = |needle: &str| meta.file_list.iter().any(|path| path.contains(needle));
    assert!(has_entry("dir1/file.txt"));
    assert!(has_entry("dir1/subdir/nested.txt"));
}
/// Extracts a single-entry TAR (MIME `application/x-tar`) and checks content and metadata.
#[tokio::test]
async fn test_tar_extraction() {
    let cfg = ExtractionConfig::default();
    let archive = create_simple_tar();

    let extracted = extract_bytes(&archive, "application/x-tar", &cfg)
        .await
        .expect("Should extract TAR successfully");

    assert_eq!(extracted.mime_type, "application/x-tar");
    assert!(extracted.chunks.is_none(), "Chunks should be None without chunking config");
    assert!(extracted.detected_languages.is_none(), "Language detection not enabled");
    assert!(extracted.tables.is_empty(), "Archive should not have tables");

    let text = &extracted.content;
    assert!(text.contains("TAR Archive"));
    assert!(text.contains("test.txt"));
    assert!(text.contains("Hello from TAR!"));

    assert!(extracted.metadata.format.is_some());
    let format = extracted.metadata.format.as_ref().expect("Operation failed");
    let kreuzberg::FormatMetadata::Archive(meta) = format else {
        panic!("Expected Archive metadata");
    };
    assert_eq!(meta.format, "TAR");
    assert_eq!(meta.file_count, 1);
}
/// Covers the TAR path that TAR.GZ extraction builds on.
///
/// Despite the name, no gzip layer is involved: the input is the same uncompressed TAR
/// used elsewhere in this file. The test extracts it under `application/x-tar` and then
/// again under the alternative MIME type `application/tar`.
#[tokio::test]
async fn test_tar_gz_extraction() {
    let cfg = ExtractionConfig::default();
    let archive = create_simple_tar();

    // First pass: canonical TAR MIME type.
    let primary = extract_bytes(&archive, "application/x-tar", &cfg)
        .await
        .expect("Should extract TAR");

    assert!(primary.chunks.is_none(), "Chunks should be None without chunking config");
    assert!(primary.detected_languages.is_none(), "Language detection not enabled");
    assert!(primary.tables.is_empty(), "Archive should not have tables");
    assert!(primary.content.contains("TAR Archive"));
    assert!(primary.content.contains("test.txt"));

    assert!(primary.metadata.format.is_some());
    let format = primary.metadata.format.as_ref().expect("Operation failed");
    let kreuzberg::FormatMetadata::Archive(meta) = format else {
        panic!("Expected Archive metadata");
    };
    assert_eq!(meta.format, "TAR");
    assert_eq!(meta.file_count, 1);

    // Second pass: same bytes, alternative MIME type.
    let alternate = extract_bytes(&archive, "application/tar", &cfg)
        .await
        .expect("Should extract with alternative MIME type");

    assert!(
        alternate.chunks.is_none(),
        "Chunks should be None without chunking config"
    );
    assert!(alternate.detected_languages.is_none(), "Language detection not enabled");
    assert!(alternate.tables.is_empty(), "Archive should not have tables");
    assert!(alternate.content.contains("TAR Archive"));
    assert!(alternate.metadata.format.is_some());
}
/// Placeholder for 7z extraction coverage.
///
/// No 7z fixture is created here, so the body only prints a notice and exercises no
/// extraction code. The test is marked `#[ignore]` so the harness reports it as ignored
/// instead of counting it as a passing 7z test.
#[tokio::test]
#[ignore = "requires a real 7z fixture; no extraction is exercised yet"]
async fn test_7z_extraction() {
    println!("7z test requires real 7z file - skipping programmatic creation");
}
/// Stores a ZIP as an entry inside another ZIP (next to a plain text file) and checks
/// that the outer archive's two entries are reported.
#[tokio::test]
async fn test_nested_archive() {
    let cfg = ExtractionConfig::default();
    let inner = create_simple_zip();

    let mut buffer = Cursor::new(Vec::new());
    {
        let mut writer = ZipWriter::new(&mut buffer);
        let opts = FileOptions::<'_, ()>::default();

        // The inner archive is written as an ordinary entry of the outer one.
        writer.start_file("inner.zip", opts).expect("Operation failed");
        writer.write_all(&inner).expect("Operation failed");

        writer.start_file("readme.txt", opts).expect("Operation failed");
        writer
            .write_all(b"This archive contains another archive")
            .expect("Operation failed");

        writer.finish().expect("Operation failed");
    }
    let outer = buffer.into_inner();

    let extracted = extract_bytes(&outer, "application/zip", &cfg)
        .await
        .expect("Should extract nested ZIP");

    assert!(extracted.chunks.is_none(), "Chunks should be None without chunking config");
    assert!(extracted.detected_languages.is_none(), "Language detection not enabled");
    assert!(extracted.tables.is_empty(), "Archive should not have tables");

    let text = &extracted.content;
    assert!(text.contains("inner.zip"));
    assert!(text.contains("readme.txt"));
    assert!(text.contains("This archive contains another archive"));

    assert!(extracted.metadata.format.is_some());
    let format = extracted.metadata.format.as_ref().expect("Operation failed");
    let kreuzberg::FormatMetadata::Archive(meta) = format else {
        panic!("Expected Archive metadata");
    };
    assert_eq!(meta.file_count, 2, "Should have 2 files in outer archive");
    assert!(meta.file_list.iter().any(|name| name == "inner.zip"));
    assert!(meta.file_list.iter().any(|name| name == "readme.txt"));
}
/// Extracts a ZIP whose entries carry different extensions (`.txt`, `.md`, `.png`, `.pdf`).
///
/// The PNG and PDF entries contain only a few leading signature bytes, so for those two
/// only the entry names are asserted; the text and markdown bodies are asserted as well.
#[tokio::test]
async fn test_archive_mixed_formats() {
    let cfg = ExtractionConfig::default();

    let mut buffer = Cursor::new(Vec::new());
    {
        let mut writer = ZipWriter::new(&mut buffer);
        let opts = FileOptions::<'_, ()>::default();

        writer.start_file("document.txt", opts).expect("Operation failed");
        writer.write_all(b"Text document").expect("Operation failed");

        writer.start_file("readme.md", opts).expect("Operation failed");
        writer.write_all(b"# README").expect("Operation failed");

        // First four bytes of the PNG signature: 0x89 'P' 'N' 'G'.
        writer.start_file("image.png", opts).expect("Operation failed");
        writer.write_all(b"\x89PNG").expect("Operation failed");

        writer.start_file("document.pdf", opts).expect("Operation failed");
        writer.write_all(b"%PDF-1.4").expect("Operation failed");

        writer.finish().expect("Operation failed");
    }
    let archive = buffer.into_inner();

    let extracted = extract_bytes(&archive, "application/zip", &cfg)
        .await
        .expect("Should extract mixed-format ZIP");

    assert!(extracted.chunks.is_none(), "Chunks should be None without chunking config");
    assert!(extracted.detected_languages.is_none(), "Language detection not enabled");
    assert!(extracted.tables.is_empty(), "Archive should not have tables");

    let text = &extracted.content;
    assert!(text.contains("document.txt"));
    assert!(text.contains("readme.md"));
    assert!(text.contains("image.png"));
    assert!(text.contains("document.pdf"));
    assert!(text.contains("Text document"));
    assert!(text.contains("# README"));

    assert!(extracted.metadata.format.is_some());
    let format = extracted.metadata.format.as_ref().expect("Operation failed");
    let kreuzberg::FormatMetadata::Archive(meta) = format else {
        panic!("Expected Archive metadata");
    };
    assert_eq!(meta.file_count, 4, "Should have 4 files");
    assert_eq!(meta.file_list.len(), 4, "file_list should contain 4 entries");

    let listed = &meta.file_list;
    assert!(listed.iter().any(|name| name == "document.txt"));
    assert!(listed.iter().any(|name| name == "readme.md"));
    assert!(listed.iter().any(|name| name == "image.png"));
    assert!(listed.iter().any(|name| name == "document.pdf"));
}
/// Checks that an unreadable ZIP yields an error instead of a successful result.
///
/// Despite the name, the input is not an encrypted archive: it is only the four bytes
/// `PK\x03\x04` with nothing after them.
#[tokio::test]
async fn test_password_protected_archive() {
    let cfg = ExtractionConfig::default();
    let signature_only = b"PK\x03\x04".to_vec();

    let outcome = extract_bytes(&signature_only, "application/zip", &cfg).await;

    assert!(outcome.is_err(), "Should fail on invalid/encrypted ZIP");
}
/// Feeds corrupted ZIP and TAR bytes to the extractor.
///
/// * ZIP: the bytes `PK\x03\x04` followed by four `0xFF` bytes must produce an error.
/// * TAR: a single 512-byte block of `0xFF` with `file\0` at the start may be accepted
///   or rejected; the only requirement is that extraction returns instead of panicking.
#[tokio::test]
async fn test_corrupted_archive() {
    let config = ExtractionConfig::default();

    let corrupted_zip = vec![0x50, 0x4B, 0x03, 0x04, 0xFF, 0xFF, 0xFF, 0xFF];
    let zip_result = extract_bytes(&corrupted_zip, "application/zip", &config).await;
    assert!(zip_result.is_err(), "Should fail on corrupted ZIP");

    let mut corrupted_tar = vec![0xFF; 512];
    corrupted_tar[0..5].copy_from_slice(b"file\0");

    // Both `Ok` and `Err` are acceptable here, so there is nothing meaningful to assert on
    // the value (the previous `is_ok() || is_err()` check was always true). Reaching the
    // end of the test without a panic is the actual check.
    let _tar_result = extract_bytes(&corrupted_tar, "application/x-tar", &config).await;
}
/// Extracts a ZIP containing 100 small text files (`file_0.txt` .. `file_99.txt`) and
/// checks the reported file count, the file list and a sample of names in the content.
#[tokio::test]
async fn test_large_archive() {
    let config = ExtractionConfig::default();

    let mut cursor = Cursor::new(Vec::new());
    {
        let mut zip = ZipWriter::new(&mut cursor);
        let options = FileOptions::<'_, ()>::default();
        for i in 0..100 {
            zip.start_file(format!("file_{i}.txt"), options)
                .expect("Operation failed");
            // This is a write into the archive, so the failure message describes the write.
            zip.write_all(format!("Content {i}").as_bytes())
                .expect("Failed to write ZIP entry content");
        }
        zip.finish().expect("Operation failed");
    }
    let zip_bytes = cursor.into_inner();

    let result = extract_bytes(&zip_bytes, "application/zip", &config)
        .await
        .expect("Should extract large ZIP");

    assert!(result.chunks.is_none(), "Chunks should be None without chunking config");
    assert!(result.detected_languages.is_none(), "Language detection not enabled");
    assert!(result.tables.is_empty(), "Archive should not have tables");

    assert!(result.metadata.format.is_some());
    let archive_meta = match result.metadata.format.as_ref().expect("Operation failed") {
        kreuzberg::FormatMetadata::Archive(meta) => meta,
        _ => panic!("Expected Archive metadata"),
    };
    assert_eq!(archive_meta.file_count, 100, "Should have 100 files");
    assert_eq!(
        archive_meta.file_list.len(),
        100,
        "file_list should contain 100 entries"
    );

    // Spot-check the first and last entries in the content, plus a middle one in the list.
    assert!(result.content.contains("file_0.txt"));
    assert!(result.content.contains("file_99.txt"));
    assert!(archive_meta.file_list.contains(&"file_0.txt".to_string()));
    assert!(archive_meta.file_list.contains(&"file_50.txt".to_string()));
    assert!(archive_meta.file_list.contains(&"file_99.txt".to_string()));
}
/// Extracts a ZIP whose entry names contain non-ASCII characters, spaces and dashes.
///
/// The names with spaces and dashes are asserted exactly. The non-ASCII name is asserted
/// only loosely (see the review notes in the body).
#[tokio::test]
async fn test_archive_with_special_characters() {
    let config = ExtractionConfig::default();

    let mut cursor = Cursor::new(Vec::new());
    {
        let mut zip = ZipWriter::new(&mut cursor);
        let options = FileOptions::<'_, ()>::default();

        zip.start_file("测试文件.txt", options).expect("Operation failed");
        // This is a write into the archive, so the failure message describes the write.
        zip.write_all(b"Unicode content")
            .expect("Failed to write ZIP entry content");

        zip.start_file("file with spaces.txt", options)
            .expect("Operation failed");
        zip.write_all(b"Spaces in filename").expect("Operation failed");

        zip.start_file("file-with-dashes.txt", options)
            .expect("Operation failed");
        zip.write_all(b"Dashes").expect("Operation failed");

        zip.finish().expect("Operation failed");
    }
    let zip_bytes = cursor.into_inner();

    let result = extract_bytes(&zip_bytes, "application/zip", &config)
        .await
        .expect("Should extract ZIP with special characters");

    assert!(result.chunks.is_none(), "Chunks should be None without chunking config");
    assert!(result.detected_languages.is_none(), "Language detection not enabled");
    assert!(result.tables.is_empty(), "Archive should not have tables");

    // NOTE(review): the `"txt"` fallback is also satisfied by the two ASCII names asserted
    // below, so this line does not prove the non-ASCII name survived. Tighten it once the
    // extractor's handling of such names is confirmed.
    assert!(result.content.contains("测试文件.txt") || result.content.contains("txt"));
    assert!(result.content.contains("file with spaces.txt"));
    assert!(result.content.contains("file-with-dashes.txt"));

    assert!(result.metadata.format.is_some());
    let archive_meta = match result.metadata.format.as_ref().expect("Operation failed") {
        kreuzberg::FormatMetadata::Archive(meta) => meta,
        _ => panic!("Expected Archive metadata"),
    };
    assert_eq!(archive_meta.file_count, 3, "Should have 3 files");
    assert_eq!(archive_meta.file_list.len(), 3, "file_list should contain 3 entries");
    // NOTE(review): same caveat as above; any of the three names matches `"txt"`.
    assert!(archive_meta.file_list.iter().any(|f| f.contains("txt")));
    assert!(archive_meta.file_list.contains(&"file with spaces.txt".to_string()));
    assert!(archive_meta.file_list.contains(&"file-with-dashes.txt".to_string()));
}
/// Extracts a ZIP that was finished without adding any entry and expects a successful
/// result with zero files, zero total size and an empty file list.
#[tokio::test]
async fn test_empty_archive() {
    let cfg = ExtractionConfig::default();

    let mut buffer = Cursor::new(Vec::new());
    {
        // Finish immediately: the archive contains no entries at all.
        let writer = ZipWriter::new(&mut buffer);
        writer.finish().expect("Operation failed");
    }
    let archive = buffer.into_inner();

    let extracted = extract_bytes(&archive, "application/zip", &cfg)
        .await
        .expect("Should extract empty ZIP");

    assert!(extracted.chunks.is_none(), "Chunks should be None without chunking config");
    assert!(extracted.detected_languages.is_none(), "Language detection not enabled");
    assert!(extracted.tables.is_empty(), "Archive should not have tables");
    assert!(extracted.content.contains("ZIP Archive"));

    assert!(extracted.metadata.format.is_some());
    let format = extracted.metadata.format.as_ref().expect("Operation failed");
    let kreuzberg::FormatMetadata::Archive(meta) = format else {
        panic!("Expected Archive metadata");
    };
    assert_eq!(meta.file_count, 0, "Empty archive should have 0 files");
    assert_eq!(meta.total_size, 0, "Empty archive should have 0 total size");
    assert!(meta.file_list.is_empty(), "file_list should be empty");
}
/// Runs the same single-entry ZIP checks as the async basic test, but through the
/// blocking `extract_bytes_sync` entry point.
#[test]
fn test_archive_extraction_sync() {
    let cfg = ExtractionConfig::default();
    let archive = create_simple_zip();

    let extracted = extract_bytes_sync(&archive, "application/zip", &cfg)
        .expect("Should extract ZIP synchronously");

    assert!(extracted.chunks.is_none(), "Chunks should be None without chunking config");
    assert!(extracted.detected_languages.is_none(), "Language detection not enabled");
    assert!(extracted.tables.is_empty(), "Archive should not have tables");

    let text = &extracted.content;
    assert!(text.contains("ZIP Archive"));
    assert!(text.contains("test.txt"));
    assert!(text.contains("Hello from ZIP!"));

    assert!(extracted.metadata.format.is_some(), "Should have archive metadata");
    let format = extracted.metadata.format.as_ref().expect("Operation failed");
    let kreuzberg::FormatMetadata::Archive(meta) = format else {
        panic!("Expected Archive metadata");
    };
    assert_eq!(meta.format, "ZIP");
    assert_eq!(meta.file_count, 1);
    assert_eq!(meta.file_list.len(), 1);
    assert_eq!(meta.file_list[0], "test.txt");
}
/// Builds an in-memory ZIP holding one entry, `test.txt`, with the body `Hello from ZIP!`.
///
/// Returns the raw archive bytes. Panics if any write to the in-memory buffer fails.
fn create_simple_zip() -> Vec<u8> {
    let mut buffer = Cursor::new(Vec::new());
    {
        // The writer borrows `buffer`; the scope ends that borrow before `into_inner`.
        let mut writer = ZipWriter::new(&mut buffer);
        let opts = FileOptions::<'_, ()>::default();

        writer.start_file("test.txt", opts).expect("Operation failed");
        writer.write_all(b"Hello from ZIP!").expect("Operation failed");
        writer.finish().expect("Operation failed");
    }
    buffer.into_inner()
}
/// Builds an in-memory TAR holding one entry, `test.txt`, with the body `Hello from TAR!`.
///
/// Returns the raw archive bytes. Panics if building the header or writing the archive
/// fails.
fn create_simple_tar() -> Vec<u8> {
    let payload: &[u8] = b"Hello from TAR!";

    let mut buffer = Cursor::new(Vec::new());
    {
        // The builder borrows `buffer`; the scope ends that borrow before `into_inner`.
        let mut builder = TarBuilder::new(&mut buffer);

        let mut entry_header = tar::Header::new_gnu();
        entry_header.set_path("test.txt").expect("Operation failed");
        entry_header.set_size(payload.len() as u64);
        // The checksum is set last, after the path and size fields are filled in.
        entry_header.set_cksum();

        builder.append(&entry_header, payload).expect("Operation failed");
        builder.finish().expect("Operation failed");
    }
    buffer.into_inner()
}