//! End-to-end integration test for DOCX metadata extraction

#![cfg(feature = "office")]

use kreuzberg::{ExtractionConfig, extract_file};

/// Resolves `relative` against the workspace root, which is taken to be two
/// directory levels above this crate's `CARGO_MANIFEST_DIR`.
///
/// # Panics
///
/// Panics if the manifest directory has fewer than two ancestors.
fn workspace_file(relative: &str) -> std::path::PathBuf {
    std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
        .parent()
        .expect("CARGO_MANIFEST_DIR should have a parent directory")
        .parent()
        .expect("crate directory should sit two levels below the workspace root")
        .join(relative)
}

/// Extracts `word_sample.docx` and checks the text content, the typed
/// author/date fields, and the document statistics stored in
/// `metadata.additional`.
///
/// The test returns early (without failing) when the fixture is not present.
#[tokio::test]
async fn test_docx_full_metadata_extraction() {
    let test_file = workspace_file("test_documents/docx/word_sample.docx");

    if !test_file.exists() {
        println!("Skipping test: Test file not found at {:?}", test_file);
        return;
    }

    let result = extract_file(&test_file, None, &ExtractionConfig::default())
        .await
        .expect("Should extract DOCX successfully");

    assert!(!result.content.is_empty(), "Content should not be empty");
    assert!(
        result.content.to_lowercase().contains("swim"),
        "Content should contain 'swim'"
    );

    // Typed metadata fields.
    assert_eq!(
        result.metadata.created_by.as_deref(),
        Some("Christoph Auer"),
        "Should have correct creator"
    );
    assert_eq!(
        result.metadata.modified_by.as_deref(),
        Some("Maxim Lysak"),
        "Should have correct last modified by"
    );
    assert_eq!(
        result.metadata.created_at.as_deref(),
        Some("2024-10-09T12:43:00Z"),
        "Should have correct creation date"
    );

    // Untyped statistics kept in the `additional` map.
    assert_eq!(
        result.metadata.additional.get("revision").and_then(|v| v.as_str()),
        Some("7"),
        "Should have revision number"
    );
    assert_eq!(
        result.metadata.additional.get("page_count").and_then(|v| v.as_i64()),
        Some(2),
        "Should have 2 pages"
    );
    assert_eq!(
        result.metadata.additional.get("word_count").and_then(|v| v.as_i64()),
        Some(108),
        "Should have 108 words"
    );
    assert_eq!(
        result
            .metadata
            .additional
            .get("character_count")
            .and_then(|v| v.as_i64()),
        Some(620),
        "Should have 620 characters"
    );
    assert_eq!(
        result.metadata.additional.get("line_count").and_then(|v| v.as_i64()),
        Some(5),
        "Should have 5 lines"
    );
    assert_eq!(
        result
            .metadata
            .additional
            .get("paragraph_count")
            .and_then(|v| v.as_i64()),
        Some(1),
        "Should have 1 paragraph"
    );

    println!("✅ DOCX metadata extraction test passed!");
    println!(" Found {} metadata fields", result.metadata.additional.len());
}

/// Extracts `lorem_ipsum.docx` and checks that content plus the page and word
/// counts are reported.
///
/// The test returns early (without failing) when the fixture is not present.
#[tokio::test]
async fn test_docx_minimal_metadata_extraction() {
    let test_file = workspace_file("test_documents/docx/lorem_ipsum.docx");

    if !test_file.exists() {
        println!("Skipping test: Test file not found at {:?}", test_file);
        return;
    }

    let result = extract_file(&test_file, None, &ExtractionConfig::default())
        .await
        .expect("Should extract DOCX successfully");

    assert!(!result.content.is_empty(), "Content should not be empty");

    assert_eq!(
        result.metadata.additional.get("page_count").and_then(|v| v.as_i64()),
        Some(1),
        "Should have 1 page"
    );
    assert_eq!(
        result.metadata.additional.get("word_count").and_then(|v| v.as_i64()),
        Some(520),
        "Should have 520 words"
    );

    println!("✅ DOCX minimal metadata extraction test passed!");
}

/// Builds a minimal DOCX in a temporary file and checks that its keywords are
/// returned as separate entries in `Metadata.keywords`.
#[tokio::test]
async fn test_docx_keywords_extraction() {
    // This test verifies that DOCX keywords metadata is properly parsed
    // from comma-separated strings into a `Vec` of individual keywords in
    // Metadata.keywords
    //
    // Addresses GitHub issue #309: DOCX keyword extraction was returning
    // strings instead of parsed keyword lists, causing FunctionClauseError
    // in the Elixir binding.
    use std::io::Write;
    use tempfile::NamedTempFile;
    use zip::CompressionMethod;
    use zip::write::{FileOptions, ZipWriter};

    // NOTE(review): the XML payloads below are standard minimal OOXML parts
    // whose text values match the assertions further down; confirm they match
    // what the extractor expects.
    let parts: [(&str, &[u8]); 4] = [
        // Content-type declarations for every part in the package.
        (
            "[Content_Types].xml",
            br#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
  <Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>
  <Default Extension="xml" ContentType="application/xml"/>
  <Override PartName="/word/document.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml"/>
  <Override PartName="/docProps/core.xml" ContentType="application/vnd.openxmlformats-package.core-properties+xml"/>
</Types>"#,
        ),
        // Package relationships pointing at the main document and core properties.
        (
            "_rels/.rels",
            br#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
  <Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="word/document.xml"/>
  <Relationship Id="rId2" Type="http://schemas.openxmlformats.org/package/2006/relationships/metadata/core-properties" Target="docProps/core.xml"/>
</Relationships>"#,
        ),
        // Main document with simple content.
        (
            "word/document.xml",
            br#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
  <w:body>
    <w:p><w:r><w:t>Test document for keyword extraction</w:t></w:r></w:p>
  </w:body>
</w:document>"#,
        ),
        // Core properties with keywords (comma-separated string).
        (
            "docProps/core.xml",
            br#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<cp:coreProperties xmlns:cp="http://schemas.openxmlformats.org/package/2006/metadata/core-properties" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:dcterms="http://purl.org/dc/terms/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
  <dc:title>Test Document</dc:title>
  <dc:creator>Test Author</dc:creator>
  <cp:keywords>rust, docx, extraction, metadata, test</cp:keywords>
  <dc:subject>Testing keyword extraction</dc:subject>
</cp:coreProperties>"#,
        ),
    ];

    // Create a minimal DOCX with keywords metadata
    let mut temp_file = NamedTempFile::new().expect("Failed to create temp file");
    {
        // Scoped so the writer's mutable borrow of `temp_file` ends before
        // the file path is handed to the extractor.
        let mut zip = ZipWriter::new(&mut temp_file);
        let options: FileOptions<()> = FileOptions::default().compression_method(CompressionMethod::Stored);

        for (name, body) in parts {
            zip.start_file(name, options)
                .unwrap_or_else(|e| panic!("Failed to start ZIP entry {}: {:?}", name, e));
            zip.write_all(body)
                .unwrap_or_else(|e| panic!("Failed to write ZIP entry {}: {:?}", name, e));
        }

        zip.finish().expect("Failed to finalize DOCX archive");
    }

    // Extract the DOCX file
    let result = extract_file(
        temp_file.path(),
        Some("application/vnd.openxmlformats-officedocument.wordprocessingml.document"),
        &ExtractionConfig::default(),
    )
    .await
    .expect("Should extract DOCX with keywords successfully");

    // Verify content was extracted
    assert!(!result.content.is_empty(), "Content should not be empty");
    assert!(
        result.content.contains("Test document for keyword extraction"),
        "Content should match document text"
    );

    // Verify keywords were parsed into separate entries in Metadata.keywords
    let keywords = result
        .metadata
        .keywords
        .as_ref()
        .expect("Keywords should be present in metadata.keywords");

    assert_eq!(
        keywords.len(),
        5,
        "Should have 5 keywords parsed from comma-separated string"
    );

    // Verify individual keywords were trimmed and parsed correctly
    assert_eq!(keywords[0], "rust", "First keyword should be 'rust'");
    assert_eq!(keywords[1], "docx", "Second keyword should be 'docx'");
    assert_eq!(keywords[2], "extraction", "Third keyword should be 'extraction'");
    assert_eq!(keywords[3], "metadata", "Fourth keyword should be 'metadata'");
    assert_eq!(keywords[4], "test", "Fifth keyword should be 'test'");

    // Verify other metadata was also extracted via typed fields
    assert_eq!(
        result.metadata.created_by.as_deref(),
        Some("Test Author"),
        "Should have correct creator"
    );
    assert_eq!(
        result.metadata.title.as_deref(),
        Some("Test Document"),
        "Should have correct title"
    );
    assert_eq!(
        result.metadata.subject.as_deref(),
        Some("Testing keyword extraction"),
        "Should have correct subject"
    );

    println!("✅ DOCX keywords extraction test passed!");
    println!(" Extracted keywords: {:?}", keywords);
}