//! Diagnostic test to trace the PaddleOCR detection pipeline.
//!
//! This test isolates each step to determine where empty results originate.
//! Since this crate doesn't have PNG/image decoder features, we create test
//! images programmatically.

use std::path::PathBuf;
|
||
|
|
|
||
|
|
fn get_workspace_root() -> PathBuf {
|
||
|
|
let manifest_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
|
||
|
|
manifest_dir.parent().unwrap().parent().unwrap().to_path_buf()
|
||
|
|
}
|
||
|
|
|
||
|
|
fn get_model_dir() -> PathBuf {
|
||
|
|
get_workspace_root().join(".kreuzberg/paddle-ocr")
|
||
|
|
}
|
||
|
|
|
||
|
|
/// Create a simple test image with black text "HELLO" on white background.
|
||
|
|
/// This avoids needing PNG decoder features.
|
||
|
|
fn create_test_image() -> image::RgbImage {
|
||
|
|
let width = 200u32;
|
||
|
|
let height = 100u32;
|
||
|
|
let mut img = image::RgbImage::from_pixel(width, height, image::Rgb([255, 255, 255]));
|
||
|
|
|
||
|
|
// Draw a thick black rectangle to simulate text (a simple "block" pattern)
|
||
|
|
// This ensures the detection model has SOMETHING to detect
|
||
|
|
let black = image::Rgb([0, 0, 0]);
|
||
|
|
|
||
|
|
// Draw "H" shape (x: 20-60, y: 20-80)
|
||
|
|
for y in 20..80 {
|
||
|
|
img.put_pixel(20, y, black);
|
||
|
|
img.put_pixel(21, y, black);
|
||
|
|
img.put_pixel(22, y, black);
|
||
|
|
}
|
||
|
|
for y in 20..80 {
|
||
|
|
img.put_pixel(55, y, black);
|
||
|
|
img.put_pixel(56, y, black);
|
||
|
|
img.put_pixel(57, y, black);
|
||
|
|
}
|
||
|
|
for x in 20..58 {
|
||
|
|
img.put_pixel(x, 48, black);
|
||
|
|
img.put_pixel(x, 49, black);
|
||
|
|
img.put_pixel(x, 50, black);
|
||
|
|
}
|
||
|
|
|
||
|
|
// Draw thick solid block to be very obvious (x: 80-180, y: 30-70)
|
||
|
|
for y in 30..70 {
|
||
|
|
for x in 80..180 {
|
||
|
|
img.put_pixel(x, y, black);
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
img
|
||
|
|
}
|
||
|
|
|
||
|
|
/// Runs `OcrLite::detect` on the synthetic test image with several parameter
/// sets and prints how many text blocks each set yields.
///
/// Returns early (skip) when `det/model.onnx` is missing from the model
/// directory, and panics if model initialization fails. Detection errors are
/// only logged so that the remaining parameter sets still run.
#[test]
fn diagnostic_detection_pipeline() {
    let model_dir = get_model_dir();
    let det_path = model_dir.join("det/model.onnx");

    if !det_path.exists() {
        eprintln!("SKIP: Models not downloaded at {:?}", model_dir);
        return;
    }

    // Make sure ORT_DYLIB_PATH points at a runtime library before using ORT.
    discover_ort();

    eprintln!("=== PaddleOCR Diagnostic Test ===");
    eprintln!("Model dir: {:?}", model_dir);

    // Step 1: build the synthetic input image.
    let img = create_test_image();
    eprintln!("Step 1 - Test image created: {}x{}", img.width(), img.height());

    // Step 2: load the detection, classification and recognition models.
    let mut ocr_lite = kreuzberg_paddle_ocr::OcrLite::new();
    let cls_path = model_dir.join("cls/model.onnx");
    let rec_path = model_dir.join("rec/model.onnx");

    let init_result = ocr_lite.init_models(
        det_path.to_str().unwrap(),
        cls_path.to_str().unwrap(),
        rec_path.to_str().unwrap(),
        1,
    );

    if let Err(e) = &init_result {
        eprintln!("Step 2 - FAILED to init models: {:?}", e);
        panic!("Model initialization failed: {:?}", e);
    }
    eprintln!("Step 2 - Models initialized successfully");

    // Step 3: run detection with various parameter sets.
    // Tuple layout: (label, padding, max_side, box_score, box_thresh, unclip,
    // do_angle, most_angle), passed to `detect` in that order.
    let test_cases = [
        ("A: Default params", 50u32, 960u32, 0.3f32, 0.5f32, 1.6f32, true, false),
        ("B: Very low thresholds", 50, 960, 0.01, 0.01, 1.6, false, false),
        ("C: No padding + low", 0, 960, 0.01, 0.01, 1.6, false, false),
        ("D: Higher unclip ratio", 50, 960, 0.1, 0.1, 3.0, false, false),
        ("E: No padding + medium", 0, 960, 0.1, 0.3, 2.0, false, false),
    ];

    let mut any_detected = false;

    for &(label, padding, max_side, box_score, box_thresh, unclip, do_angle, most_angle) in &test_cases {
        eprintln!("\n--- Test {} ---", label);
        eprintln!(
            " padding={}, max_side={}, box_score={}, box_thresh={}, unclip={}",
            padding, max_side, box_score, box_thresh, unclip
        );

        let outcome = ocr_lite.detect(
            &img,
            padding,
            max_side,
            box_score,
            box_thresh,
            unclip,
            do_angle,
            most_angle,
        );

        match outcome {
            Ok(ocr_result) => {
                let block_count = ocr_result.text_blocks.len();
                eprintln!(" Result: {} text blocks", block_count);
                for (idx, block) in ocr_result.text_blocks.iter().enumerate() {
                    eprintln!(
                        " Block {}: text='{}', text_score={:.3}, box_score={:.3}",
                        idx, block.text, block.text_score, block.box_score
                    );
                }
                // A single block from any parameter set counts as "detection works".
                any_detected |= block_count > 0;
            }
            Err(e) => eprintln!(" FAILED: {:?}", e),
        }
    }

    eprintln!("\n=== Diagnosis ===");
    if any_detected {
        eprintln!("RESULT: Detection works. Issue may be threshold-related or image-specific.");
    } else {
        eprintln!("RESULT: Detection model produces NO output regardless of thresholds.");
        eprintln!("This strongly suggests an ORT version compatibility issue.");
        eprintln!(" ort crate version: check Cargo.lock for current version");
        eprintln!(" ORT_DYLIB_PATH: {:?}", std::env::var("ORT_DYLIB_PATH"));
    }
}
|
||
|
|
|
||
|
|
/// Loads the detection model directly through `ort` and runs one inference on
/// a constant-valued 1x3x32x32 tensor to check whether ORT executes at all.
///
/// Returns early (skip) when `det/model.onnx` is missing. Prints the model's
/// input/output names and basic statistics of the first output tensor.
#[test]
fn diagnostic_raw_ort_inference() {
    use ort::session::Session;

    let det_model = get_model_dir().join("det/model.onnx");
    if !det_model.exists() {
        eprintln!("SKIP: Detection model not found at {:?}", det_model);
        return;
    }

    discover_ort();

    eprintln!("=== Raw ORT Inference Test ===");

    // Load the model directly via ort, bypassing OcrLite.
    let builder = Session::builder().unwrap();
    let mut session = builder.commit_from_file(&det_model).unwrap();

    eprintln!("Model loaded successfully");
    eprintln!("Inputs:");
    for model_input in session.inputs() {
        eprintln!(" name='{}'", model_input.name());
    }
    eprintln!("Outputs:");
    for model_output in session.outputs() {
        eprintln!(" name='{}'", model_output.name());
    }

    // Small test tensor in NCHW layout (batch=1, channels=3, h=32, w=32),
    // every element set to 0.5.
    let pixels = vec![0.5f32; 3 * 32 * 32];
    let array = ndarray::Array::from_shape_vec((1, 3, 32, 32), pixels).unwrap();
    let tensor = ort::value::Tensor::from_array(array).unwrap();

    let input_name = session.inputs()[0].name().to_string();
    eprintln!("\nRunning inference with 32x32 gray image...");

    let outputs = session.run(ort::inputs![input_name => tensor]).unwrap();

    // Only the first output is inspected.
    let (output_name, output_value) = outputs.iter().next().unwrap();
    eprintln!("Output name: {}", output_name);

    let (output_shape, output_data) = output_value.try_extract_tensor::<f32>().unwrap();

    eprintln!("Output shape: {:?}", output_shape);
    eprintln!("Output len: {}", output_data.len());

    if output_data.is_empty() {
        return;
    }

    // Minimum and maximum gathered in a single pass over the output.
    let (min, max) = output_data
        .iter()
        .fold((f32::INFINITY, f32::NEG_INFINITY), |(lo, hi), &v| (lo.min(v), hi.max(v)));
    let mean = output_data.iter().sum::<f32>() / output_data.len() as f32;
    let non_zero = output_data.iter().filter(|v| **v > 0.001).count();

    eprintln!("Output stats: min={:.6}, max={:.6}, mean={:.6}", min, max, mean);
    eprintln!("Non-zero values (>0.001): {} / {}", non_zero, output_data.len());

    if max < 0.001 {
        eprintln!("\nDIAGNOSIS: Model outputs are essentially all zeros.");
        eprintln!("This confirms an ORT compatibility issue - model isn't executing correctly.");
    } else {
        eprintln!("\nDIAGNOSIS: Model produces non-zero output. ORT is working.");
    }
}
|
||
|
|
|
||
|
|
/// Returns the longest prefix of `text` that is at most `max_bytes` bytes long
/// and ends on a UTF-8 character boundary, so slicing can never panic.
fn utf8_prefix(text: &str, max_bytes: usize) -> &str {
    let mut end = text.len().min(max_bytes);
    // Index 0 is always a character boundary, so this loop terminates.
    while !text.is_char_boundary(end) {
        end -= 1;
    }
    &text[..end]
}

/// Diagnostic: test the CRNN recognition model directly.
///
/// Returns early (skip) when `rec/model.onnx` is missing. Otherwise it:
/// 1. prints the model's input/output names and selected metadata entries,
///    including the `character` custom key,
/// 2. runs inference on a striped 1x3x48x200 tensor and reports the per-step
///    argmax distribution of the first output,
/// 3. runs inference on a uniform 1x3x48x200 tensor filled with `1.0` and
///    prints the output range as a baseline.
#[test]
fn diagnostic_crnn_model_output() {
    let model_dir = get_model_dir();
    let rec_model = model_dir.join("rec/model.onnx");

    if !rec_model.exists() {
        eprintln!("SKIP: Recognition model not found");
        return;
    }

    discover_ort();

    eprintln!("=== CRNN Recognition Model Diagnostic ===");

    use ort::session::Session;

    let mut session = Session::builder().unwrap().commit_from_file(&rec_model).unwrap();

    eprintln!("Model loaded successfully");
    eprintln!("Inputs:");
    for input in session.inputs() {
        eprintln!(" name='{}'", input.name());
    }
    eprintln!("Outputs:");
    for output in session.outputs() {
        eprintln!(" name='{}'", output.name());
    }

    // Check metadata for character list. The inner scope ends the metadata
    // borrow before the session is used for inference below.
    {
        let metadata = session.metadata().unwrap();

        // Check all metadata custom keys
        eprintln!("Model metadata:");
        eprintln!(" description: {:?}", metadata.description());
        eprintln!(" producer: {:?}", metadata.producer());

        // Try to get the character key
        match metadata.custom("character") {
            Some(chars) => {
                let bytes = chars.as_bytes();
                let char_count = chars.split('\n').count();
                eprintln!(
                    " custom('character'): len={}, bytes={}, split_count={}",
                    chars.len(),
                    bytes.len(),
                    char_count
                );
                if chars.len() < 500 {
                    eprintln!(" value: {:?}", chars);
                } else {
                    let preview: String = chars.chars().take(100).collect();
                    eprintln!(" preview (first 100 chars): {:?}", preview);
                }

                // Check for null bytes or other encoding issues
                let null_count = bytes.iter().filter(|&&b| b == 0).count();
                if null_count > 0 {
                    eprintln!(" WARNING: {} null bytes found in character string!", null_count);
                }
            }
            None => {
                eprintln!(" ERROR: No 'character' key in model metadata!");
            }
        }

        // Try other possible metadata keys
        for key in [
            "character",
            "characters",
            "dict",
            "dictionary",
            "labels",
            "vocab",
            "alphabet",
        ] {
            if let Some(val) = metadata.custom(key) {
                // The value may contain multi-byte characters, so the preview
                // must be cut on a character boundary: slicing at a fixed
                // byte offset would panic in the middle of a character.
                eprintln!(
                    " custom('{}'): len={}, preview={:?}",
                    key,
                    val.len(),
                    utf8_prefix(&val, 80)
                );
            }
        }
    } // metadata dropped here

    // Test 1: Run inference with a simple input (height=48, width=200)
    // CRNN expects NCHW: [1, 3, 48, width]
    let h = 48usize;
    let w = 200usize;

    // Create a pattern that looks like text (alternating black/white vertical stripes)
    let mut input_data: Vec<f32> = vec![0.0; 3 * h * w];
    for c in 0..3 {
        for y in 10..38 {
            for x in (20..180).step_by(2) {
                input_data[c * h * w + y * w + x] = -1.0; // normalized black
            }
        }
    }

    let tensor =
        ort::value::Tensor::from_array(ndarray::Array::from_shape_vec((1, 3, h, w), input_data).unwrap()).unwrap();

    let input_name = session.inputs()[0].name().to_string();
    eprintln!("\nRunning CRNN with striped pattern (48x200)...");

    let outputs = session.run(ort::inputs![input_name => tensor]).unwrap();

    let (_, output_value) = outputs.iter().next().unwrap();
    let (shape, data) = output_value.try_extract_tensor::<f32>().unwrap();

    eprintln!("Output shape: {:?}", shape);
    eprintln!("Output total values: {}", data.len());

    if shape.len() >= 3 {
        // NOTE(review): treats the output as [batch, time_steps, vocab] and
        // only walks the first batch entry - confirm against the model.
        let time_steps = shape[1] as usize;
        let vocab_size = shape[2] as usize;
        eprintln!("Time steps: {}, Vocabulary size: {}", time_steps, vocab_size);

        // Check if outputs are meaningful
        let data_vec: Vec<f32> = data.to_vec();
        let min = data_vec.iter().cloned().fold(f32::INFINITY, f32::min);
        let max = data_vec.iter().cloned().fold(f32::NEG_INFINITY, f32::max);
        let mean: f32 = data_vec.iter().sum::<f32>() / data_vec.len() as f32;
        eprintln!("Overall stats: min={:.6}, max={:.6}, mean={:.6}", min, max, mean);

        // Check argmax distribution
        let mut argmax_zero_count = 0;
        let mut argmax_nonzero_count = 0;
        for t in 0..time_steps {
            let start = t * vocab_size;
            let end = start + vocab_size;
            let slice = &data_vec[start..end.min(data_vec.len())];

            // First index holding the largest value of this time step.
            let (max_idx, max_val) =
                slice.iter().enumerate().fold(
                    (0, f32::MIN),
                    |(mi, mv), (i, &v)| if v > mv { (i, v) } else { (mi, mv) },
                );

            if max_idx == 0 {
                argmax_zero_count += 1;
            } else {
                argmax_nonzero_count += 1;
            }

            // Print the first five and the last two steps. Written as an
            // addition so the check involves no unsigned subtraction.
            if t < 5 || t + 3 > time_steps {
                eprintln!(" Step {}: argmax={}, max_val={:.4}", t, max_idx, max_val);
            } else if t == 5 {
                eprintln!(" ... (skipping middle steps)");
            }
        }

        eprintln!(
            "\nArgmax distribution: {} blank (idx=0), {} non-blank",
            argmax_zero_count, argmax_nonzero_count
        );

        if argmax_nonzero_count == 0 {
            eprintln!("\nDIAGNOSIS: CRNN model outputs all blanks.");
            eprintln!("Possible causes:");
            eprintln!(" 1. ORT version incompatibility with CRNN model");
            eprintln!(" 2. Model is not executing graph correctly");
            eprintln!(" 3. Input normalization mismatch");
        } else {
            eprintln!("\nDIAGNOSIS: CRNN model produces non-blank output. Recognition works.");
        }
    }

    // Drop outputs before reusing session
    drop(outputs);

    // Test 2: Run with a uniform white image (should produce all blanks - valid baseline)
    let white_data: Vec<f32> = vec![1.0; 3 * h * w];
    let white_tensor =
        ort::value::Tensor::from_array(ndarray::Array::from_shape_vec((1, 3, h, w), white_data).unwrap()).unwrap();

    let input_name2 = session.inputs()[0].name().to_string();
    eprintln!("\nRunning CRNN with uniform white (48x200)...");
    let white_outputs = session.run(ort::inputs![input_name2 => white_tensor]).unwrap();
    let (_, white_val) = white_outputs.iter().next().unwrap();
    let (_, white_data_out) = white_val.try_extract_tensor::<f32>().unwrap();
    let white_vec: Vec<f32> = white_data_out.to_vec();
    let white_max = white_vec.iter().cloned().fold(f32::NEG_INFINITY, f32::max);
    let white_min = white_vec.iter().cloned().fold(f32::INFINITY, f32::min);
    eprintln!("White image output: min={:.6}, max={:.6}", white_min, white_max);
}
|
||
|
|
|
||
|
|
/// Makes sure `ORT_DYLIB_PATH` refers to an existing ONNX Runtime library.
///
/// If the variable is already set to a path that exists, it is left alone.
/// Otherwise two well-known install locations are probed and the first one
/// that exists is exported through `ORT_DYLIB_PATH`. When none is found, a
/// warning is printed and the environment stays unchanged.
///
/// The whole check-then-set sequence runs under a process-wide lock because
/// the test harness runs tests on several threads and each test calls this
/// function before touching ORT.
fn discover_ort() {
    static DISCOVERY_LOCK: std::sync::Mutex<()> = std::sync::Mutex::new(());
    // The guarded data is `()`, so a poisoned lock carries no broken state.
    let _guard = DISCOVERY_LOCK
        .lock()
        .unwrap_or_else(std::sync::PoisonError::into_inner);

    let configured = std::env::var("ORT_DYLIB_PATH")
        .ok()
        .filter(|path| std::path::Path::new(path).exists());
    if let Some(path) = configured {
        eprintln!("ORT found via ORT_DYLIB_PATH: {}", path);
        return;
    }

    const CANDIDATES: [&str; 2] = [
        "/opt/homebrew/lib/libonnxruntime.dylib",
        "/usr/local/lib/libonnxruntime.dylib",
    ];

    let found = CANDIDATES
        .iter()
        .copied()
        .find(|candidate| std::path::Path::new(candidate).exists());

    match found {
        Some(candidate) => {
            eprintln!("Setting ORT_DYLIB_PATH={}", candidate);
            // SAFETY: the write happens while DISCOVERY_LOCK is held, and
            // once it has succeeded later callers take the early return
            // above, so callers of this function never write concurrently or
            // after another caller has moved on. Code that reads the
            // environment without going through this function is not covered
            // by the lock.
            unsafe { std::env::set_var("ORT_DYLIB_PATH", candidate) };
        }
        None => eprintln!("WARNING: Could not find ORT library!"),
    }
}
|