Nomad changes

2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions
--- a/crates/kreuzberg/tests/docx_metadata_extraction_test.rs
+++ b/crates/kreuzberg/tests/docx_metadata_extraction_test.rs
@@ -0,0 +1,253 @@
+//! End-to-end integration test for DOCX metadata extraction
+
+#![cfg(feature = "office")]
+
+use kreuzberg::{ExtractionConfig, extract_file};
+
+#[tokio::test]
+async fn test_docx_full_metadata_extraction() {
+    let workspace_root = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
+        .parent()
+        .expect("Operation failed")
+        .parent()
+        .expect("Operation failed");
+    let test_file = workspace_root.join("test_documents/docx/word_sample.docx");
+
+    if !test_file.exists() {
+        println!("Skipping test: Test file not found at {:?}", test_file);
+        return;
+    }
+
+    let result = extract_file(&test_file, None, &ExtractionConfig::default())
+        .await
+        .expect("Should extract DOCX successfully");
+
+    assert!(!result.content.is_empty(), "Content should not be empty");
+    assert!(
+        result.content.to_lowercase().contains("swim"),
+        "Content should contain 'swim'"
+    );
+
+    assert_eq!(
+        result.metadata.created_by.as_deref(),
+        Some("Christoph Auer"),
+        "Should have correct creator"
+    );
+    assert_eq!(
+        result.metadata.modified_by.as_deref(),
+        Some("Maxim Lysak"),
+        "Should have correct last modified by"
+    );
+    assert_eq!(
+        result.metadata.created_at.as_deref(),
+        Some("2024-10-09T12:43:00Z"),
+        "Should have correct creation date"
+    );
+    assert_eq!(
+        result.metadata.additional.get("revision").and_then(|v| v.as_str()),
+        Some("7"),
+        "Should have revision number"
+    );
+
+    assert_eq!(
+        result.metadata.additional.get("page_count").and_then(|v| v.as_i64()),
+        Some(2),
+        "Should have 2 pages"
+    );
+    assert_eq!(
+        result.metadata.additional.get("word_count").and_then(|v| v.as_i64()),
+        Some(108),
+        "Should have 108 words"
+    );
+    assert_eq!(
+        result
+            .metadata
+            .additional
+            .get("character_count")
+            .and_then(|v| v.as_i64()),
+        Some(620),
+        "Should have 620 characters"
+    );
+    assert_eq!(
+        result.metadata.additional.get("line_count").and_then(|v| v.as_i64()),
+        Some(5),
+        "Should have 5 lines"
+    );
+    assert_eq!(
+        result
+            .metadata
+            .additional
+            .get("paragraph_count")
+            .and_then(|v| v.as_i64()),
+        Some(1),
+        "Should have 1 paragraph"
+    );
+
+    println!("✅ DOCX metadata extraction test passed!");
+    println!("   Found {} metadata fields", result.metadata.additional.len());
+}
+
+#[tokio::test]
+async fn test_docx_minimal_metadata_extraction() {
+    let workspace_root = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
+        .parent()
+        .expect("Operation failed")
+        .parent()
+        .expect("Operation failed");
+    let test_file = workspace_root.join("test_documents/docx/lorem_ipsum.docx");
+
+    if !test_file.exists() {
+        println!("Skipping test: Test file not found at {:?}", test_file);
+        return;
+    }
+
+    let result = extract_file(&test_file, None, &ExtractionConfig::default())
+        .await
+        .expect("Should extract DOCX successfully");
+
+    assert!(!result.content.is_empty(), "Content should not be empty");
+
+    assert_eq!(
+        result.metadata.additional.get("page_count").and_then(|v| v.as_i64()),
+        Some(1),
+        "Should have 1 page"
+    );
+    assert_eq!(
+        result.metadata.additional.get("word_count").and_then(|v| v.as_i64()),
+        Some(520),
+        "Should have 520 words"
+    );
+
+    println!("✅ DOCX minimal metadata extraction test passed!");
+}
+
+#[tokio::test]
+async fn test_docx_keywords_extraction() {
+    // This test verifies that DOCX keywords metadata is properly parsed
+    // from comma-separated strings into Vec<String> in Metadata.keywords
+    //
+    // Addresses GitHub issue #309: DOCX keyword extraction was returning
+    // strings instead of parsed keyword lists, causing FunctionClauseError
+    // in the Elixir binding.
+
+    use std::io::Write;
+    use tempfile::NamedTempFile;
+    use zip::CompressionMethod;
+    use zip::write::{FileOptions, ZipWriter};
+
+    // Create a minimal DOCX with keywords metadata
+    let mut temp_file = NamedTempFile::new().expect("Failed to create temp file");
+
+    {
+        let mut zip = ZipWriter::new(&mut temp_file);
+        let options: FileOptions<()> = FileOptions::default().compression_method(CompressionMethod::Stored);
+
+        // Add [Content_Types].xml
+        zip.start_file("[Content_Types].xml", options)
+            .expect("Operation failed");
+        zip.write_all(br#"<?xml version="1.0" encoding="UTF-8"?>
+<Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
+  <Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>
+  <Default Extension="xml" ContentType="application/xml"/>
+  <Override PartName="/word/document.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml"/>
+  <Override PartName="/docProps/core.xml" ContentType="application/vnd.openxmlformats-package.core-properties+xml"/>
+</Types>"#).expect("Operation failed");
+
+        // Add _rels/.rels
+        zip.start_file("_rels/.rels", options).expect("Operation failed");
+        zip.write_all(br#"<?xml version="1.0" encoding="UTF-8"?>
+<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
+  <Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="word/document.xml"/>
+  <Relationship Id="rId2" Type="http://schemas.openxmlformats.org/package/2006/relationships/metadata/core-properties" Target="docProps/core.xml"/>
+</Relationships>"#).expect("Operation failed");
+
+        // Add word/document.xml with simple content
+        zip.start_file("word/document.xml", options).expect("Operation failed");
+        zip.write_all(
+            br#"<?xml version="1.0" encoding="UTF-8"?>
+<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
+  <w:body>
+    <w:p>
+      <w:r>
+        <w:t>Test document for keyword extraction</w:t>
+      </w:r>
+    </w:p>
+  </w:body>
+</w:document>"#,
+        )
+        .expect("Operation failed");
+
+        // Add docProps/core.xml with keywords (comma-separated string)
+        zip.start_file("docProps/core.xml", options).expect("Operation failed");
+        zip.write_all(
+            br#"<?xml version="1.0" encoding="UTF-8"?>
+<cp:coreProperties xmlns:cp="http://schemas.openxmlformats.org/package/2006/metadata/core-properties"
+                   xmlns:dc="http://purl.org/dc/elements/1.1/"
+                   xmlns:dcterms="http://purl.org/dc/terms/">
+  <dc:title>Test Document</dc:title>
+  <dc:creator>Test Author</dc:creator>
+  <cp:keywords>rust, docx, extraction, metadata, test</cp:keywords>
+  <dc:subject>Testing keyword extraction</dc:subject>
+</cp:coreProperties>"#,
+        )
+        .expect("Operation failed");
+
+        zip.finish().expect("Operation failed");
+    }
+
+    // Extract the DOCX file
+    let result = extract_file(
+        temp_file.path(),
+        Some("application/vnd.openxmlformats-officedocument.wordprocessingml.document"),
+        &ExtractionConfig::default(),
+    )
+    .await
+    .expect("Should extract DOCX with keywords successfully");
+
+    // Verify content was extracted
+    assert!(!result.content.is_empty(), "Content should not be empty");
+    assert!(
+        result.content.contains("Test document for keyword extraction"),
+        "Content should match document text"
+    );
+
+    // Verify keywords were parsed into Vec<String> in Metadata.keywords
+    assert!(
+        result.metadata.keywords.is_some(),
+        "Keywords should be present in metadata.keywords"
+    );
+
+    let keywords = result.metadata.keywords.as_ref().expect("Operation failed");
+    assert_eq!(
+        keywords.len(),
+        5,
+        "Should have 5 keywords parsed from comma-separated string"
+    );
+
+    // Verify individual keywords were trimmed and parsed correctly
+    assert_eq!(keywords[0], "rust", "First keyword should be 'rust'");
+    assert_eq!(keywords[1], "docx", "Second keyword should be 'docx'");
+    assert_eq!(keywords[2], "extraction", "Third keyword should be 'extraction'");
+    assert_eq!(keywords[3], "metadata", "Fourth keyword should be 'metadata'");
+    assert_eq!(keywords[4], "test", "Fifth keyword should be 'test'");
+
+    // Verify other metadata was also extracted via typed fields
+    assert_eq!(
+        result.metadata.created_by.as_deref(),
+        Some("Test Author"),
+        "Should have correct creator"
+    );
+    assert_eq!(
+        result.metadata.title.as_deref(),
+        Some("Test Document"),
+        "Should have correct title"
+    );
+    assert_eq!(
+        result.metadata.subject.as_deref(),
+        Some("Testing keyword extraction"),
+        "Should have correct subject"
+    );
+
+    println!("✅ DOCX keywords extraction test passed!");
+    println!("   Extracted keywords: {:?}", keywords);
+}