141 lines
5.4 KiB
Rust
141 lines
5.4 KiB
Rust
|
|
//! Test that DOCX files are correctly detected as DOCX, not ZIP.
|
||
|
|
//!
|
||
|
|
//! This tests the fix for https://github.com/kreuzberg-dev/kreuzberg/issues/350
|
||
|
|
|
||
|
|
use std::io::Write;
|
||
|
|
use tempfile::NamedTempFile;
|
||
|
|
|
||
|
|
#[test]
fn test_docx_detected_from_bytes_not_zip() {
    // Assemble a DOCX-like payload: one ZIP local file header (30 bytes) whose
    // entry name is `word/document.xml`. No entry data follows the name.
    let mut payload: Vec<u8> = Vec::new();
    payload.extend_from_slice(&[0x50, 0x4b, 0x03, 0x04]); // signature
    payload.extend_from_slice(&[0x14, 0x00]); // version needed
    // flags, compression method, mod time, mod date (2 bytes each),
    // then crc, compressed size, uncompressed size (4 bytes each): all zero
    payload.extend_from_slice(&[0u8; 20]);
    payload.extend_from_slice(&[0x11, 0x00]); // file name length (17)
    payload.extend_from_slice(&[0x00, 0x00]); // extra field length
    payload.extend_from_slice(b"word/document.xml");

    // Round-trip the bytes through a temporary file before running detection,
    // so the detector sees exactly what was written to disk.
    let mut tmp = NamedTempFile::new().unwrap();
    tmp.write_all(&payload).unwrap();
    tmp.flush().unwrap();
    let on_disk = std::fs::read(tmp.path()).unwrap();

    let expected = "application/vnd.openxmlformats-officedocument.wordprocessingml.document";
    let mime = kreuzberg::core::mime::detect_mime_type_from_bytes(&on_disk).unwrap();
    assert_eq!(
        mime, expected,
        "DOCX file should be detected as DOCX MIME type, not ZIP"
    );
}
|
||
|
|
|
||
|
|
#[test]
fn test_xlsx_detected_from_bytes_not_zip() {
    // Fields of a single ZIP local file header, in on-disk order, followed by
    // the entry name `xl/workbook.xml` that marks the payload as XLSX-like.
    let fields: [&[u8]; 7] = [
        &[0x50, 0x4b, 0x03, 0x04], // signature
        &[0x14, 0x00],             // version needed
        &[0u8; 8],                 // flags, compression method, mod time, mod date
        &[0u8; 12],                // crc, compressed size, uncompressed size
        &[0x0f, 0x00],             // file name length (15)
        &[0x00, 0x00],             // extra field length
        b"xl/workbook.xml",        // file name
    ];
    let xlsx_bytes: Vec<u8> = fields.concat();

    let expected = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet";
    let mime = kreuzberg::core::mime::detect_mime_type_from_bytes(xlsx_bytes.as_slice()).unwrap();
    assert_eq!(
        mime, expected,
        "XLSX file should be detected as XLSX MIME type, not ZIP"
    );
}
|
||
|
|
|
||
|
|
#[test]
fn test_pptx_detected_from_bytes_not_zip() {
    // Entry name that marks the payload as PPTX-like (20 bytes long).
    const ENTRY_NAME: &[u8] = b"ppt/presentation.xml";

    // Start from an all-zero 30-byte ZIP local file header and fill in only
    // the fields that are non-zero in this fixture.
    let mut local_header = [0u8; 30];
    local_header[..4].copy_from_slice(&[0x50, 0x4b, 0x03, 0x04]); // signature
    local_header[4] = 0x14; // version needed, low byte (offset 4)
    local_header[26] = 0x14; // file name length (20), low byte (offset 26)

    // Header immediately followed by the entry name; no entry data follows.
    let pptx_bytes: Vec<u8> = [&local_header[..], ENTRY_NAME].concat();

    let expected = "application/vnd.openxmlformats-officedocument.presentationml.presentation";
    let mime = kreuzberg::core::mime::detect_mime_type_from_bytes(pptx_bytes.as_slice()).unwrap();
    assert_eq!(
        mime, expected,
        "PPTX file should be detected as PPTX MIME type, not ZIP"
    );
}
|
||
|
|
|
||
|
|
#[test]
fn test_plain_zip_still_detected_as_zip() {
    // ZIP local file header for an entry named `test.txt`. The name carries no
    // Office marker, so detection is expected to stay at plain ZIP.
    let archive_bytes: &[u8] = &[
        0x50, 0x4b, 0x03, 0x04, // signature
        0x14, 0x00, // version needed
        0x00, 0x00, // flags
        0x00, 0x00, // compression method
        0x00, 0x00, // mod time
        0x00, 0x00, // mod date
        0x00, 0x00, 0x00, 0x00, // crc
        0x00, 0x00, 0x00, 0x00, // compressed size
        0x00, 0x00, 0x00, 0x00, // uncompressed size
        0x08, 0x00, // file name length (8)
        0x00, 0x00, // extra field length
        // "test.txt"
        b't', b'e', b's', b't', b'.', b't', b'x', b't',
    ];

    let expected = "application/zip";
    let mime = kreuzberg::core::mime::detect_mime_type_from_bytes(archive_bytes).unwrap();
    assert_eq!(mime, expected, "Plain ZIP should remain as application/zip");
}
|
||
|
|
|
||
|
|
#[test]
fn test_legacy_docx_mime_validates_ok() {
    // Non-standard MIME value that kreuzberg-cloud sees in real traffic;
    // validation must accept it rather than reject the request.
    let legacy_alias = "application/docx";

    let validation = kreuzberg::core::mime::validate_mime_type(legacy_alias);

    assert!(
        validation.is_ok(),
        "validate_mime_type should accept legacy application/docx alias, got: {:?}",
        validation
    );
}
|
||
|
|
|
||
|
|
#[test]
fn test_ext_to_mime_docx_still_canonical() {
    // The legacy alias exists for validation only: resolving a `.docx` path
    // must keep returning the RFC MIME type.
    let canonical = "application/vnd.openxmlformats-officedocument.wordprocessingml.document";

    let mime =
        kreuzberg::core::mime::detect_mime_type(std::path::Path::new("document.docx"), false)
            .unwrap();

    assert_eq!(
        mime, canonical,
        ".docx extension must resolve to the RFC canonical MIME, not the legacy alias"
    );
}
|
||
|
|
|
||
|
|
#[cfg(feature = "office")]
#[test]
fn test_both_docx_mimes_resolve_to_same_extractor() {
    use kreuzberg::extractors::DocxExtractor;
    use kreuzberg::plugins::DocumentExtractor;

    // DocxExtractor has to declare both the RFC MIME and the legacy alias so
    // that the registry maps the two values to the same plugin.
    let rfc_mime = "application/vnd.openxmlformats-officedocument.wordprocessingml.document";
    let legacy_alias = "application/docx";

    let docx_extractor = DocxExtractor;
    let declared = docx_extractor.supported_mime_types();

    assert!(
        declared.contains(&rfc_mime),
        "DocxExtractor must declare the RFC canonical MIME"
    );
    assert!(
        declared.contains(&legacy_alias),
        "DocxExtractor must declare the legacy alias so the registry routes both to the same plugin"
    );
}
|