Nomad changes

2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions
--- a/crates/kreuzberg/tests/image_integration.rs
+++ b/crates/kreuzberg/tests/image_integration.rs
@@ -0,0 +1,255 @@
+//! Image and OCR integration tests using real image files.
+//!
+//! This module tests image extraction and OCR processing end-to-end with real
+//! image files from the test_documents/ directory. Tests verify that both
+//! image metadata extraction and OCR text extraction work correctly.
+//!
+//! Test philosophy:
+//! - Use real images from test_documents/
+//! - Assert on behavior, not implementation
+//! - Test different image formats (PNG, JPG, BMP, etc.)
+//! - Test OCR with various languages and layouts
+//! - Verify graceful handling of images without text
+
+#![cfg(feature = "ocr")]
+
+mod helpers;
+
+use helpers::*;
+use kreuzberg::core::config::ExtractionConfig;
+use kreuzberg::extract_file_sync;
+
+#[test]
+fn test_jpg_image_metadata() {
+    if skip_if_missing("images/example.jpg") {
+        return;
+    }
+
+    let file_path = get_test_file_path("images/example.jpg");
+    let result = extract_file_sync(&file_path, None, &ExtractionConfig::default())
+        .expect("Should extract JPG image successfully");
+
+    assert_mime_type(&result, "image/jpeg");
+
+    assert!(result.chunks.is_none(), "Chunks should be None without chunking config");
+    assert!(result.detected_languages.is_none(), "Language detection not enabled");
+}
+
+#[test]
+fn test_png_image_metadata() {
+    if skip_if_missing("images/test_hello_world.png") {
+        return;
+    }
+
+    let file_path = get_test_file_path("images/test_hello_world.png");
+    let result = extract_file_sync(&file_path, None, &ExtractionConfig::default())
+        .expect("Should extract PNG image successfully");
+
+    assert_mime_type(&result, "image/png");
+
+    assert!(result.chunks.is_none(), "Chunks should be None without chunking config");
+    assert!(result.detected_languages.is_none(), "Language detection not enabled");
+}
+
+#[test]
+fn test_bmp_image_format() {
+    if skip_if_missing("images/bmp_24.bmp") {
+        return;
+    }
+
+    let file_path = get_test_file_path("images/bmp_24.bmp");
+    let result = extract_file_sync(&file_path, None, &ExtractionConfig::default())
+        .expect("Should extract BMP image successfully");
+
+    assert_mime_type(&result, "image/bmp");
+
+    assert!(result.chunks.is_none(), "Chunks should be None without chunking config");
+    assert!(result.detected_languages.is_none(), "Language detection not enabled");
+}
+
+#[test]
+fn test_ocr_simple_text() {
+    if skip_if_missing("images/test_hello_world.png") {
+        return;
+    }
+
+    let file_path = get_test_file_path("images/test_hello_world.png");
+    let config = test_config_with_ocr();
+
+    let result = extract_file_sync(&file_path, None, &config).expect("Should extract text from image with OCR");
+
+    assert_mime_type(&result, "image/png");
+
+    assert!(result.chunks.is_none(), "Chunks should be None without chunking config");
+    assert!(result.detected_languages.is_none(), "Language detection not enabled");
+
+    if !result.content.trim().is_empty() {
+        assert_min_content_length(&result, 5);
+    }
+}
+
+#[test]
+fn test_ocr_document_image() {
+    if skip_if_missing("images/ocr_image.jpg") {
+        return;
+    }
+
+    let file_path = get_test_file_path("images/ocr_image.jpg");
+    let config = test_config_with_ocr();
+
+    let result = extract_file_sync(&file_path, None, &config).expect("Should extract text from document image");
+
+    assert_mime_type(&result, "image/jpeg");
+
+    assert!(result.chunks.is_none(), "Chunks should be None without chunking config");
+    assert!(result.detected_languages.is_none(), "Language detection not enabled");
+
+    if !result.content.trim().is_empty() {
+        assert_min_content_length(&result, 10);
+    }
+}
+
+#[test]
+fn test_ocr_layout_parser() {
+    if skip_if_missing("images/layout_parser_ocr.jpg") {
+        return;
+    }
+
+    let file_path = get_test_file_path("images/layout_parser_ocr.jpg");
+    let config = test_config_with_ocr();
+
+    let result = extract_file_sync(&file_path, None, &config).expect("Should extract text from layout parser image");
+
+    assert_mime_type(&result, "image/jpeg");
+
+    assert!(result.chunks.is_none(), "Chunks should be None without chunking config");
+    assert!(result.detected_languages.is_none(), "Language detection not enabled");
+
+    if !result.content.trim().is_empty() {
+        assert_min_content_length(&result, 20);
+    }
+}
+
+#[test]
+fn test_ocr_invoice_image() {
+    if skip_if_missing("images/invoice_image.png") {
+        return;
+    }
+
+    let file_path = get_test_file_path("images/invoice_image.png");
+    let config = test_config_with_ocr();
+
+    let result = extract_file_sync(&file_path, None, &config).expect("Should extract text from invoice image");
+
+    assert_mime_type(&result, "image/png");
+
+    assert!(result.chunks.is_none(), "Chunks should be None without chunking config");
+    assert!(result.detected_languages.is_none(), "Language detection not enabled");
+
+    if !result.content.trim().is_empty() {
+        assert_min_content_length(&result, 10);
+    }
+}
+
+#[test]
+fn test_table_image_simple() {
+    if skip_if_missing("images/simple_table.png") {
+        return;
+    }
+
+    let file_path = get_test_file_path("images/simple_table.png");
+    let config = test_config_with_ocr();
+
+    let result = extract_file_sync(&file_path, None, &config).expect("Should extract table image successfully");
+
+    assert_mime_type(&result, "image/png");
+
+    assert!(result.chunks.is_none(), "Chunks should be None without chunking config");
+    assert!(result.detected_languages.is_none(), "Language detection not enabled");
+}
+
+#[test]
+fn test_table_image_complex() {
+    if skip_if_missing("images/complex_document.png") {
+        return;
+    }
+
+    let file_path = get_test_file_path("images/complex_document.png");
+    let config = test_config_with_ocr();
+
+    let result =
+        extract_file_sync(&file_path, None, &config).expect("Should extract complex document image successfully");
+
+    assert_mime_type(&result, "image/png");
+
+    assert!(result.chunks.is_none(), "Chunks should be None without chunking config");
+    assert!(result.detected_languages.is_none(), "Language detection not enabled");
+}
+
+#[test]
+fn test_ocr_multilang_english_korean() {
+    if skip_if_missing("images/english_and_korean.png") {
+        return;
+    }
+
+    let file_path = get_test_file_path("images/english_and_korean.png");
+    let config = test_config_with_ocr();
+
+    let result = extract_file_sync(&file_path, None, &config).expect("Should extract mixed language image");
+
+    assert_mime_type(&result, "image/png");
+
+    assert!(result.chunks.is_none(), "Chunks should be None without chunking config");
+    assert!(result.detected_languages.is_none(), "Language detection not enabled");
+}
+
+#[test]
+fn test_ocr_chinese_simplified() {
+    if skip_if_missing("images/chi_sim_image.jpeg") {
+        return;
+    }
+
+    let file_path = get_test_file_path("images/chi_sim_image.jpeg");
+    let config = test_config_with_ocr();
+
+    let result = extract_file_sync(&file_path, None, &config).expect("Should process Chinese image");
+
+    assert_mime_type(&result, "image/jpeg");
+
+    assert!(result.chunks.is_none(), "Chunks should be None without chunking config");
+    assert!(result.detected_languages.is_none(), "Language detection not enabled");
+}
+
+#[test]
+fn test_ocr_japanese_vertical() {
+    if skip_if_missing("images/jpn_vert.jpeg") {
+        return;
+    }
+
+    let file_path = get_test_file_path("images/jpn_vert.jpeg");
+    let config = test_config_with_ocr();
+
+    let result = extract_file_sync(&file_path, None, &config).expect("Should process Japanese vertical text image");
+
+    assert_mime_type(&result, "image/jpeg");
+
+    assert!(result.chunks.is_none(), "Chunks should be None without chunking config");
+    assert!(result.detected_languages.is_none(), "Language detection not enabled");
+}
+
+#[test]
+fn test_image_no_text() {
+    if skip_if_missing("images/flower_no_text.jpg") {
+        return;
+    }
+
+    let file_path = get_test_file_path("images/flower_no_text.jpg");
+    let config = test_config_with_ocr();
+
+    let result = extract_file_sync(&file_path, None, &config).expect("Should process image without text");
+
+    assert_mime_type(&result, "image/jpeg");
+
+    assert!(result.chunks.is_none(), "Chunks should be None without chunking config");
+    assert!(result.detected_languages.is_none(), "Language detection not enabled");
+}