Nomad changes

2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions
--- a/crates/kreuzberg-paddle-ocr/tests/diagnostic.rs
+++ b/crates/kreuzberg-paddle-ocr/tests/diagnostic.rs
@@ -0,0 +1,436 @@
+//! Diagnostic test to trace PaddleOCR detection pipeline.
+//!
+//! This test isolates each step to determine where empty results originate.
+//! Since this crate doesn't have PNG/image decoder features, we create test
+//! images programmatically.
+
+use std::path::PathBuf;
+
+/// Resolve the workspace root: the directory two levels above this crate's manifest
+/// directory (`CARGO_MANIFEST_DIR`, captured at compile time).
+///
+/// # Panics
+/// Panics if the manifest directory has fewer than two parent directories.
+fn get_workspace_root() -> PathBuf {
+    let crate_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
+    // `ancestors()` yields the path itself first, so index 2 is the grandparent.
+    crate_dir.ancestors().nth(2).unwrap().to_path_buf()
+}
+
+/// Directory under the workspace root where the tests in this file look for the
+/// PaddleOCR ONNX models (`.kreuzberg/paddle-ocr`).
+fn get_model_dir() -> PathBuf {
+    let mut dir = get_workspace_root();
+    dir.push(".kreuzberg/paddle-ocr");
+    dir
+}
+
+/// Create a synthetic 200x100 RGB test image: black shapes on a white background.
+///
+/// The shapes are an "H" built from 3-pixel-wide strokes and a solid rectangle, so the
+/// detection model has something obvious to find. The image is generated in code,
+/// which avoids needing PNG decoder features.
+fn create_test_image() -> image::RgbImage {
+    let white = image::Rgb([255, 255, 255]);
+    let black = image::Rgb([0, 0, 0]);
+
+    let mut canvas = image::RgbImage::from_pixel(200u32, 100u32, white);
+
+    // Paint every pixel of the half-open rectangle `xs` x `ys` black.
+    let mut fill = |xs: std::ops::Range<u32>, ys: std::ops::Range<u32>| {
+        for y in ys {
+            for x in xs.clone() {
+                canvas.put_pixel(x, y, black);
+            }
+        }
+    };
+
+    // "H" shape: two vertical stems joined by a crossbar.
+    fill(20..23, 20..80); // left stem
+    fill(55..58, 20..80); // right stem
+    fill(20..58, 48..51); // crossbar
+
+    // Thick solid block to be very obvious.
+    fill(80..180, 30..70);
+
+    canvas
+}
+
+/// End-to-end diagnostic: initializes `OcrLite` with the det/cls/rec models and runs
+/// `detect` on the synthetic test image under several parameter sets, printing every
+/// text block found.
+///
+/// Returns early (without failing) when `det/model.onnx` is missing from the model
+/// directory. Panics if model initialization fails. Detection results are only
+/// reported on stderr; an empty or failed detection does not fail the test.
+#[test]
+fn diagnostic_detection_pipeline() {
+    let model_dir = get_model_dir();
+    let det_path = model_dir.join("det/model.onnx");
+
+    if !det_path.exists() {
+        eprintln!("SKIP: Models not downloaded at {:?}", model_dir);
+        return;
+    }
+
+    // Discover ORT library
+    discover_ort();
+
+    eprintln!("=== PaddleOCR Diagnostic Test ===");
+    eprintln!("Model dir: {:?}", model_dir);
+
+    // Step 1: Create test image
+    let img = create_test_image();
+    eprintln!("Step 1 - Test image created: {}x{}", img.width(), img.height());
+
+    // Step 2: Initialize OcrLite
+    let mut ocr_lite = kreuzberg_paddle_ocr::OcrLite::new();
+    let cls_path = model_dir.join("cls/model.onnx");
+    let rec_path = model_dir.join("rec/model.onnx");
+
+    let init_result = ocr_lite.init_models(
+        det_path.to_str().unwrap(),
+        cls_path.to_str().unwrap(),
+        rec_path.to_str().unwrap(),
+        1,
+    );
+
+    if let Err(e) = &init_result {
+        eprintln!("Step 2 - FAILED to init models: {:?}", e);
+        panic!("Model initialization failed: {:?}", e);
+    }
+    eprintln!("Step 2 - Models initialized successfully");
+
+    // Step 3: Run detection with various parameter sets.
+    // Tuple layout: (name, padding, max_side, box_score, box_thresh, unclip, do_angle, most_angle)
+    let scenarios = [
+        ("A: Default params", 50u32, 960u32, 0.3f32, 0.5f32, 1.6f32, true, false),
+        ("B: Very low thresholds", 50, 960, 0.01, 0.01, 1.6, false, false),
+        ("C: No padding + low", 0, 960, 0.01, 0.01, 1.6, false, false),
+        ("D: Higher unclip ratio", 50, 960, 0.1, 0.1, 3.0, false, false),
+        ("E: No padding + medium", 0, 960, 0.1, 0.3, 2.0, false, false),
+    ];
+
+    let mut detected_any = false;
+
+    for (label, padding, max_side, box_score, box_thresh, unclip, do_angle, most_angle) in scenarios {
+        eprintln!("\n--- Test {} ---", label);
+        eprintln!(
+            "  padding={}, max_side={}, box_score={}, box_thresh={}, unclip={}",
+            padding, max_side, box_score, box_thresh, unclip
+        );
+
+        let outcome = ocr_lite.detect(
+            &img, padding, max_side, box_score, box_thresh, unclip, do_angle, most_angle,
+        );
+
+        // A failed run is reported and skipped; it never aborts the remaining scenarios.
+        let ocr_result = match &outcome {
+            Ok(found) => found,
+            Err(e) => {
+                eprintln!("  FAILED: {:?}", e);
+                continue;
+            }
+        };
+
+        eprintln!("  Result: {} text blocks", ocr_result.text_blocks.len());
+        for (i, block) in ocr_result.text_blocks.iter().enumerate() {
+            eprintln!(
+                "    Block {}: text='{}', text_score={:.3}, box_score={:.3}",
+                i, block.text, block.text_score, block.box_score
+            );
+        }
+        detected_any |= !ocr_result.text_blocks.is_empty();
+    }
+
+    eprintln!("\n=== Diagnosis ===");
+    if detected_any {
+        eprintln!("RESULT: Detection works. Issue may be threshold-related or image-specific.");
+    } else {
+        eprintln!("RESULT: Detection model produces NO output regardless of thresholds.");
+        eprintln!("This strongly suggests an ORT version compatibility issue.");
+        eprintln!("  ort crate version: check Cargo.lock for current version");
+        eprintln!("  ORT_DYLIB_PATH: {:?}", std::env::var("ORT_DYLIB_PATH"));
+    }
+}
+
+/// Also test with raw ONNX inference to check if ORT works at all.
+///
+/// Loads the detection model directly through `ort`, feeds it a constant 1x3x32x32
+/// tensor and prints statistics of the first output. Returns early when the model file
+/// is missing; panics if loading the model or running inference fails.
+#[test]
+fn diagnostic_raw_ort_inference() {
+    // Load model directly via ort
+    use ort::session::Session;
+
+    let det_model = get_model_dir().join("det/model.onnx");
+    if !det_model.exists() {
+        eprintln!("SKIP: Detection model not found at {:?}", det_model);
+        return;
+    }
+
+    discover_ort();
+
+    eprintln!("=== Raw ORT Inference Test ===");
+
+    let mut session = Session::builder().unwrap().commit_from_file(&det_model).unwrap();
+    eprintln!("Model loaded successfully");
+
+    eprintln!("Inputs:");
+    for input in session.inputs() {
+        eprintln!("  name='{}'", input.name());
+    }
+    eprintln!("Outputs:");
+    for output in session.outputs() {
+        eprintln!("  name='{}'", output.name());
+    }
+
+    // Create a small 32x32 test tensor (NCHW format: batch=1, channels=3, h=32, w=32)
+    let gray: Vec<f32> = vec![0.5; 3 * 32 * 32];
+    let gray_array = ndarray::Array::from_shape_vec((1, 3, 32, 32), gray).unwrap();
+    let tensor = ort::value::Tensor::from_array(gray_array).unwrap();
+
+    let input_name = session.inputs()[0].name().to_string();
+    eprintln!("\nRunning inference with 32x32 gray image...");
+
+    let outputs = session.run(ort::inputs![input_name => tensor]).unwrap();
+
+    // Only the first output is inspected.
+    let (output_name, output_value) = outputs.iter().next().unwrap();
+    eprintln!("Output name: {}", output_name);
+
+    let (output_shape, output_data) = output_value.try_extract_tensor::<f32>().unwrap();
+    eprintln!("Output shape: {:?}", output_shape);
+    eprintln!("Output len: {}", output_data.len());
+
+    // Nothing to summarize for an empty output.
+    if output_data.is_empty() {
+        return;
+    }
+
+    let count = output_data.len();
+    let min = output_data.iter().copied().fold(f32::INFINITY, f32::min);
+    let max = output_data.iter().copied().fold(f32::NEG_INFINITY, f32::max);
+    let mean = output_data.iter().sum::<f32>() / count as f32;
+    // Counts values above the positive threshold only; negative values are not counted.
+    let non_zero = output_data.iter().filter(|v| **v > 0.001).count();
+
+    eprintln!("Output stats: min={:.6}, max={:.6}, mean={:.6}", min, max, mean);
+    eprintln!("Non-zero values (>0.001): {} / {}", non_zero, count);
+
+    if max < 0.001 {
+        eprintln!("\nDIAGNOSIS: Model outputs are essentially all zeros.");
+        eprintln!("This confirms an ORT compatibility issue - model isn't executing correctly.");
+    } else {
+        eprintln!("\nDIAGNOSIS: Model produces non-zero output. ORT is working.");
+    }
+}
+
+/// Diagnostic: test the CRNN recognition model directly.
+///
+/// Loads `rec/model.onnx` through `ort`, prints its input/output names and selected
+/// metadata (including the `character` entry), then runs inference on two synthetic
+/// inputs: a striped pattern and a uniform white image. All findings are printed to
+/// stderr. Returns early when the model file is missing; panics if loading the model
+/// or running inference fails.
+#[test]
+fn diagnostic_crnn_model_output() {
+    let model_dir = get_model_dir();
+    let rec_model = model_dir.join("rec/model.onnx");
+
+    if !rec_model.exists() {
+        eprintln!("SKIP: Recognition model not found");
+        return;
+    }
+
+    discover_ort();
+
+    eprintln!("=== CRNN Recognition Model Diagnostic ===");
+
+    use ort::session::Session;
+
+    let mut session = Session::builder().unwrap().commit_from_file(&rec_model).unwrap();
+
+    eprintln!("Model loaded successfully");
+    eprintln!("Inputs:");
+    for input in session.inputs() {
+        eprintln!("  name='{}'", input.name());
+    }
+    eprintln!("Outputs:");
+    for output in session.outputs() {
+        eprintln!("  name='{}'", output.name());
+    }
+
+    // Check metadata for character list
+    {
+        let metadata = session.metadata().unwrap();
+
+        eprintln!("Model metadata:");
+        eprintln!("  description: {:?}", metadata.description());
+        eprintln!("  producer: {:?}", metadata.producer());
+
+        // Try to get the character key
+        match metadata.custom("character") {
+            Some(chars) => {
+                let bytes = chars.as_bytes();
+                let char_count = chars.split('\n').count();
+                eprintln!(
+                    "  custom('character'): len={}, bytes={}, split_count={}",
+                    chars.len(),
+                    bytes.len(),
+                    char_count
+                );
+                if chars.len() < 500 {
+                    eprintln!("  value: {:?}", chars);
+                } else {
+                    let preview: String = chars.chars().take(100).collect();
+                    eprintln!("  preview (first 100 chars): {:?}", preview);
+                }
+
+                // Check for null bytes or other encoding issues
+                let null_count = bytes.iter().filter(|&&b| b == 0).count();
+                if null_count > 0 {
+                    eprintln!("  WARNING: {} null bytes found in character string!", null_count);
+                }
+            }
+            None => {
+                eprintln!("  ERROR: No 'character' key in model metadata!");
+            }
+        }
+
+        // Try other possible metadata keys
+        for key in [
+            "character",
+            "characters",
+            "dict",
+            "dictionary",
+            "labels",
+            "vocab",
+            "alphabet",
+        ] {
+            if let Some(val) = metadata.custom(key) {
+                // Truncate by characters, not bytes: slicing a string at a fixed byte
+                // offset panics when the offset falls inside a multi-byte character.
+                let preview: String = val.chars().take(80).collect();
+                eprintln!(
+                    "  custom('{}'): len={}, preview={:?}",
+                    key,
+                    val.len(),
+                    preview
+                );
+            }
+        }
+    } // metadata dropped here
+
+    // Test 1: Run inference with a simple input (height=48, width=200)
+    // CRNN expects NCHW: [1, 3, 48, width]
+    let h = 48usize;
+    let w = 200usize;
+
+    // Create a pattern that looks like text (alternating black/white vertical stripes)
+    let mut input_data: Vec<f32> = vec![0.0; 3 * h * w];
+    for c in 0..3 {
+        for y in 10..38 {
+            for x in (20..180).step_by(2) {
+                input_data[c * h * w + y * w + x] = -1.0; // normalized black
+            }
+        }
+    }
+
+    let tensor =
+        ort::value::Tensor::from_array(ndarray::Array::from_shape_vec((1, 3, h, w), input_data).unwrap()).unwrap();
+
+    let input_name = session.inputs()[0].name().to_string();
+    eprintln!("\nRunning CRNN with striped pattern (48x200)...");
+
+    let outputs = session.run(ort::inputs![input_name => tensor]).unwrap();
+
+    let (_, output_value) = outputs.iter().next().unwrap();
+    let (shape, data) = output_value.try_extract_tensor::<f32>().unwrap();
+
+    eprintln!("Output shape: {:?}", shape);
+    eprintln!("Output total values: {}", data.len());
+
+    if shape.len() >= 3 {
+        // A dimension that does not fit in `usize` (e.g. a negative value) is treated
+        // as 0 instead of being wrapped to a huge loop bound by an `as` cast.
+        let time_steps = usize::try_from(shape[1]).unwrap_or(0);
+        let vocab_size = usize::try_from(shape[2]).unwrap_or(0);
+        eprintln!("Time steps: {}, Vocabulary size: {}", time_steps, vocab_size);
+
+        // Check if outputs are meaningful
+        let min = data.iter().copied().fold(f32::INFINITY, f32::min);
+        let max = data.iter().copied().fold(f32::NEG_INFINITY, f32::max);
+        let mean: f32 = data.iter().sum::<f32>() / data.len() as f32;
+        eprintln!("Overall stats: min={:.6}, max={:.6}, mean={:.6}", min, max, mean);
+
+        // Check argmax distribution
+        let mut argmax_zero_count = 0;
+        let mut argmax_nonzero_count = 0;
+        for t in 0..time_steps {
+            let start = t * vocab_size;
+            let end = (start + vocab_size).min(data.len());
+            // Checked slicing: when the data is shorter than the shape implies and
+            // `start` lies past its end, fall back to an empty slice instead of panicking.
+            let step_scores = data.get(start..end).unwrap_or(&[]);
+
+            let (max_idx, max_val) = step_scores
+                .iter()
+                .enumerate()
+                .fold((0, f32::MIN), |(mi, mv), (i, &v)| if v > mv { (i, v) } else { (mi, mv) });
+
+            if max_idx == 0 {
+                argmax_zero_count += 1;
+            } else {
+                argmax_nonzero_count += 1;
+            }
+
+            // Print the first five and the last two steps. Written as an addition so it
+            // cannot underflow when the model reports fewer than three time steps.
+            if t < 5 || t + 3 > time_steps {
+                eprintln!("  Step {}: argmax={}, max_val={:.4}", t, max_idx, max_val);
+            } else if t == 5 {
+                eprintln!("  ... (skipping middle steps)");
+            }
+        }
+
+        eprintln!(
+            "\nArgmax distribution: {} blank (idx=0), {} non-blank",
+            argmax_zero_count, argmax_nonzero_count
+        );
+
+        if argmax_nonzero_count == 0 {
+            eprintln!("\nDIAGNOSIS: CRNN model outputs all blanks.");
+            eprintln!("Possible causes:");
+            eprintln!("  1. ORT version incompatibility with CRNN model");
+            eprintln!("  2. Model is not executing graph correctly");
+            eprintln!("  3. Input normalization mismatch");
+        } else {
+            eprintln!("\nDIAGNOSIS: CRNN model produces non-blank output. Recognition works.");
+        }
+    }
+
+    // Drop outputs before reusing session
+    drop(outputs);
+
+    // Test 2: Run with a uniform white image (should produce all blanks - valid baseline)
+    let white_data: Vec<f32> = vec![1.0; 3 * h * w];
+    let white_tensor =
+        ort::value::Tensor::from_array(ndarray::Array::from_shape_vec((1, 3, h, w), white_data).unwrap()).unwrap();
+
+    let input_name2 = session.inputs()[0].name().to_string();
+    eprintln!("\nRunning CRNN with uniform white (48x200)...");
+    let white_outputs = session.run(ort::inputs![input_name2 => white_tensor]).unwrap();
+    let (_, white_val) = white_outputs.iter().next().unwrap();
+    let (_, white_data_out) = white_val.try_extract_tensor::<f32>().unwrap();
+    let white_max = white_data_out.iter().copied().fold(f32::NEG_INFINITY, f32::max);
+    let white_min = white_data_out.iter().copied().fold(f32::INFINITY, f32::min);
+    eprintln!("White image output: min={:.6}, max={:.6}", white_min, white_max);
+}
+
+/// Make sure `ORT_DYLIB_PATH` points at an ONNX Runtime library when one can be found.
+///
+/// If the variable is already set to an existing path it is left untouched. Otherwise
+/// a short list of well-known `.dylib` locations is probed and the first one that
+/// exists is exported as `ORT_DYLIB_PATH`. A warning is printed when nothing is found.
+///
+/// Discovery runs at most once per process: several tests call this function and may
+/// run on different threads, so the environment is written a single time and
+/// concurrent callers wait until that write has finished.
+fn discover_ort() {
+    static DISCOVERY: std::sync::Once = std::sync::Once::new();
+
+    DISCOVERY.call_once(|| {
+        if let Ok(path) = std::env::var("ORT_DYLIB_PATH")
+            && std::path::Path::new(&path).exists()
+        {
+            eprintln!("ORT found via ORT_DYLIB_PATH: {}", path);
+            return;
+        }
+
+        let candidates = [
+            "/opt/homebrew/lib/libonnxruntime.dylib",
+            "/usr/local/lib/libonnxruntime.dylib",
+        ];
+
+        let found = candidates
+            .iter()
+            .find(|candidate| std::path::Path::new(candidate).exists());
+
+        match found {
+            Some(candidate) => {
+                eprintln!("Setting ORT_DYLIB_PATH={}", candidate);
+                // SAFETY: `set_var` is only sound while no other thread accesses the
+                // process environment. The surrounding `Once` limits this to a single
+                // write that completes before any caller of `discover_ort` goes on to
+                // load ORT or read `ORT_DYLIB_PATH`.
+                // NOTE(review): this does not rule out unrelated environment access by
+                // other threads in the test process - confirm that is acceptable.
+                unsafe { std::env::set_var("ORT_DYLIB_PATH", candidate) };
+            }
+            None => eprintln!("WARNING: Could not find ORT library!"),
+        }
+    });
+}