Files
fil/crates/kreuzberg/tests/paddle_ocr_integration.rs
Henrik Jess Nielsen b4c07d3693
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s
Nomad changes
2026-06-01 23:40:55 +02:00

850 lines
30 KiB
Rust

//! TODO: Restored from 245539484 alef-migration cleanup. Currently exercises
//! pub(crate) APIs that the migration deliberately narrowed; gated until
//! either (a) these APIs are re-exposed publicly, or (b) the test is
//! rewritten against the public extraction surface.
#![cfg(any())]
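// Original content is preserved below inside a block comment. It is disabled twice:
// by the file-level `#![cfg(any())]` above and by the block comment that wraps it.
// Both must be removed before these tests are compiled again.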
/*
//! Integration tests for PaddleOCR functionality.
//!
//! These tests require:
//! - Network access to download models from HuggingFace
//! - ONNX Runtime installed on the system
//!
//! Run with: `cargo test -p kreuzberg --features paddle-ocr --test paddle_ocr_integration -- --ignored`
#![cfg(feature = "paddle-ocr")]
use std::path::PathBuf;
use kreuzberg::core::config::OcrConfig;
use kreuzberg::paddle_ocr::{ModelManager, PaddleOcrBackend, PaddleOcrConfig};
use kreuzberg::plugins::OcrBackend;
use kreuzberg::types::ExtractionResult;
/// Resolves the `test_documents` directory used by these tests.
///
/// Starts at `CARGO_MANIFEST_DIR`, climbs two directory levels and appends
/// `test_documents`. Panics if the manifest directory has fewer than two ancestors.
fn test_documents_dir() -> PathBuf {
    let manifest_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
    // `ancestors()` yields the path itself first, so index 2 is the grandparent.
    let two_levels_up = manifest_dir.ancestors().nth(2).unwrap();
    two_levels_up.join("test_documents")
}
/// Returns the cache directory shared by these tests: `kreuzberg_paddle_test`
/// inside the system temporary directory.
fn test_cache_dir() -> PathBuf {
    let mut cache_dir = std::env::temp_dir();
    cache_dir.push("kreuzberg_paddle_test");
    cache_dir
}
/// Verifies that `ModelManager` can download the models into an empty cache.
///
/// The cache directory is wiped first, the download is triggered through
/// `ensure_models_exist`, and the returned paths, the cached flag and the cache
/// statistics are then checked. Ignored by default because it needs network
/// access and a ~100MB download.
#[tokio::test]
#[ignore = "requires network access and ~100MB download"]
async fn test_model_download_from_huggingface() {
    let cache_root = test_cache_dir();

    // Best-effort removal of a previous run's cache; the result is deliberately ignored.
    let _ = std::fs::remove_dir_all(&cache_root);

    let manager = ModelManager::new(cache_root);

    // Nothing may be reported as cached before the download.
    assert!(!manager.are_models_cached());

    // `ensure_models_exist` is called synchronously (no `.await`).
    let result = manager.ensure_models_exist();
    assert!(result.is_ok(), "Model download failed: {:?}", result.err());
    let paths: kreuzberg::paddle_ocr::ModelPaths = result.unwrap();

    // Every model directory must exist ...
    assert!(paths.det_model.exists(), "Detection model dir not found");
    assert!(paths.cls_model.exists(), "Classification model dir not found");
    assert!(paths.rec_model.exists(), "Recognition model dir not found");

    // ... and each one must contain its ONNX file.
    let det_onnx = paths.det_model.join("model.onnx");
    assert!(det_onnx.exists(), "Detection ONNX file not found");
    let cls_onnx = paths.cls_model.join("model.onnx");
    assert!(cls_onnx.exists(), "Classification ONNX file not found");
    let rec_onnx = paths.rec_model.join("model.onnx");
    assert!(rec_onnx.exists(), "Recognition ONNX file not found");

    // The dictionary file is reported separately from the model directories.
    assert!(paths.dict_file.exists(), "Dictionary file not found");

    // After the download the manager must report the models as cached.
    assert!(manager.are_models_cached());

    // Cache statistics: at least three cached items and more than 1MB in total.
    let stats = manager.cache_stats().unwrap();
    assert!(
        stats.model_count >= 3,
        "Expected at least 3 cached items, got {}",
        stats.model_count
    );
    assert!(stats.total_size_bytes > 1_000_000);

    println!("Cache stats: {:?}", stats);
    println!("Detection model: {:?}", paths.det_model);
    println!("Classification model: {:?}", paths.cls_model);
    println!("Recognition model: {:?}", paths.rec_model);
    println!("Dictionary file: {:?}", paths.dict_file);
}
/// Runs OCR on the "Hello World" sample image with the English configuration.
///
/// Passes when the lower-cased output contains "hello" (or the misread "helo")
/// and "world" (or the misread "worid"). Requires ONNX Runtime and downloaded models.
#[tokio::test]
#[ignore = "requires ONNX Runtime and downloaded models"]
async fn test_ocr_hello_world_english() {
    let sample = test_documents_dir().join("images/test_hello_world.png");
    assert!(sample.exists(), "Test image not found: {:?}", sample);
    let sample_bytes = std::fs::read(&sample).expect("Failed to read image");

    let paddle_config = PaddleOcrConfig::new("en").with_cache_dir(test_cache_dir());
    let backend = PaddleOcrBackend::with_config(paddle_config).expect("Failed to create backend");

    let ocr_config = OcrConfig {
        backend: "paddle-ocr".to_string(),
        language: "en".to_string(),
        ..Default::default()
    };

    let result: kreuzberg::Result<ExtractionResult> = backend.process_image(&sample_bytes, &ocr_config).await;
    assert!(result.is_ok(), "OCR failed: {:?}", result.err());
    let extraction: ExtractionResult = result.unwrap();

    println!("OCR result: {}", extraction.content);

    // Compare case-insensitively and tolerate one common misreading per word.
    let lowered = extraction.content.to_lowercase();
    let has_hello = lowered.contains("hello") || lowered.contains("helo");
    assert!(has_hello, "Expected 'hello' in OCR result: {}", lowered);
    let has_world = lowered.contains("world") || lowered.contains("worid");
    assert!(has_world, "Expected 'world' in OCR result: {}", lowered);
}
/// Runs OCR on the newspaper sample image and checks for header keywords.
///
/// The upper-cased output must contain "NASDAQ" (or the misread "NASOAQ") and
/// either "AMEX" or "STOCK". Requires ONNX Runtime and downloaded models.
#[tokio::test]
#[ignore = "requires ONNX Runtime and downloaded models"]
async fn test_ocr_newspaper_english() {
    let image_path = test_documents_dir().join("images/ocr_image.jpg");
    assert!(image_path.exists(), "Test image not found: {:?}", image_path);
    let image_bytes = std::fs::read(&image_path).expect("Failed to read image");

    let config = PaddleOcrConfig::new("en").with_cache_dir(test_cache_dir());
    let backend = PaddleOcrBackend::with_config(config).expect("Failed to create backend");

    let ocr_config = OcrConfig {
        backend: "paddle-ocr".to_string(),
        language: "en".to_string(),
        ..Default::default()
    };

    let result: kreuzberg::Result<ExtractionResult> = backend.process_image(&image_bytes, &ocr_config).await;
    assert!(result.is_ok(), "OCR failed: {:?}", result.err());
    let extraction: ExtractionResult = result.unwrap();

    // Build the preview from whole characters: slicing the string at a fixed byte
    // offset panics when that offset falls inside a multi-byte UTF-8 character,
    // which OCR output can easily contain.
    let preview: String = extraction.content.chars().take(500).collect();
    println!("OCR result (first 500 chars): {}", preview);

    // Keywords are matched case-insensitively by upper-casing the whole output.
    let text = extraction.content.to_uppercase();
    assert!(
        text.contains("NASDAQ") || text.contains("NASOAQ"),
        "Expected 'NASDAQ' in OCR result"
    );
    assert!(
        text.contains("AMEX") || text.contains("STOCK"),
        "Expected 'AMEX' or 'STOCK' in OCR result"
    );
}
/// Runs OCR on the Chinese sample image with language code `ch`.
///
/// Only a non-empty result is required; the recognized text itself is not checked.
/// Requires ONNX Runtime and downloaded models.
///
/// NOTE(review): the earlier comments on this test disagreed about which recognition
/// model is in play (per-family PP-OCRv5 vs. an English-only model) — confirm.
#[tokio::test]
#[ignore = "requires ONNX Runtime and downloaded models"]
async fn test_ocr_chinese_text() {
    let sample = test_documents_dir().join("images/chi_sim_image.jpeg");
    assert!(sample.exists(), "Test image not found: {:?}", sample);
    let sample_bytes = std::fs::read(&sample).expect("Failed to read image");

    // Both the backend config and the per-call config request Chinese.
    let paddle_config = PaddleOcrConfig::new("ch").with_cache_dir(test_cache_dir());
    let backend = PaddleOcrBackend::with_config(paddle_config).expect("Failed to create backend");
    let ocr_config = OcrConfig {
        backend: "paddle-ocr".to_string(),
        language: "ch".to_string(),
        ..Default::default()
    };

    let result: kreuzberg::Result<ExtractionResult> = backend.process_image(&sample_bytes, &ocr_config).await;
    assert!(result.is_ok(), "OCR failed: {:?}", result.err());
    let extraction: ExtractionResult = result.unwrap();

    println!("OCR result: {}", extraction.content);

    // The pipeline must yield some text; accuracy is out of scope for this test.
    let produced_text = !extraction.content.is_empty();
    assert!(produced_text, "Expected non-empty OCR result for Chinese image");
}
/// Checks which language codes `PaddleOcrBackend::supports_language` accepts.
///
/// Covers codes described as native PaddleOCR codes, codes described as mapped
/// Tesseract/ISO codes, and two codes that must be rejected.
#[test]
fn test_supported_languages() {
    let paddle = PaddleOcrBackend::new().expect("Failed to create backend");

    // Direct PaddleOCR codes.
    assert!(paddle.supports_language("ch"));
    assert!(paddle.supports_language("en"));
    assert!(paddle.supports_language("japan"));
    assert!(paddle.supports_language("korean"));

    // Mapped Tesseract/ISO codes.
    assert!(paddle.supports_language("chi_sim"));
    assert!(paddle.supports_language("eng"));
    assert!(paddle.supports_language("jpn"));
    assert!(paddle.supports_language("fra"));
    assert!(paddle.supports_language("deu"));

    // Codes that must not be accepted.
    assert!(!paddle.supports_language("xyz"));
    assert!(!paddle.supports_language("klingon"));
}
/// Passing an empty byte slice to `process_image` must yield an error.
#[tokio::test]
async fn test_empty_image_error() {
    let paddle = PaddleOcrBackend::new().expect("Failed to create backend");

    let english = OcrConfig {
        backend: "paddle-ocr".to_string(),
        language: "en".to_string(),
        ..Default::default()
    };

    // No image data at all.
    let outcome: kreuzberg::Result<ExtractionResult> = paddle.process_image(&[], &english).await;
    assert!(outcome.is_err(), "Expected error for empty image");
}
/// Bytes that do not form a valid image must make `process_image` fail.
///
/// Requires ONNX Runtime and downloaded models.
#[tokio::test]
#[ignore = "requires ONNX Runtime and downloaded models"]
async fn test_invalid_image_error() {
    let paddle_config = PaddleOcrConfig::new("en").with_cache_dir(test_cache_dir());
    let paddle = PaddleOcrBackend::with_config(paddle_config).expect("Failed to create backend");

    let english = OcrConfig {
        backend: "paddle-ocr".to_string(),
        language: "en".to_string(),
        ..Default::default()
    };

    // The ten bytes 0, 1, ..., 9 are not a valid image.
    let garbage: Vec<u8> = (0..10).collect();

    let outcome: kreuzberg::Result<ExtractionResult> = paddle.process_image(&garbage, &english).await;
    assert!(outcome.is_err(), "Expected error for invalid image data");
}
/// Runs OCR through `process_image_file`, handing over a path instead of bytes.
///
/// The lower-cased output must contain "hello" (or the misread "helo").
/// Requires ONNX Runtime and downloaded models.
#[tokio::test]
#[ignore = "requires ONNX Runtime and downloaded models"]
async fn test_process_image_file() {
    let sample = test_documents_dir().join("images/test_hello_world.png");
    assert!(sample.exists(), "Test image not found: {:?}", sample);

    let paddle_config = PaddleOcrConfig::new("en").with_cache_dir(test_cache_dir());
    let paddle = PaddleOcrBackend::with_config(paddle_config).expect("Failed to create backend");

    let english = OcrConfig {
        backend: "paddle-ocr".to_string(),
        language: "en".to_string(),
        ..Default::default()
    };

    let result: kreuzberg::Result<ExtractionResult> = paddle.process_image_file(&sample, &english).await;
    assert!(result.is_ok(), "OCR failed: {:?}", result.err());
    let extraction: ExtractionResult = result.unwrap();

    let lowered = extraction.content.to_lowercase();
    let has_hello = lowered.contains("hello") || lowered.contains("helo");
    assert!(has_hello, "Expected 'hello' in OCR result");
}
/// A cache directory set through `with_cache_dir` must be what
/// `resolve_cache_dir` returns.
#[test]
fn test_cache_dir_explicit_config() {
    let explicit = PathBuf::from("/explicit/path");

    // The explicit setting is expected to win regardless of environment variables.
    let paddle_config = PaddleOcrConfig::new("en").with_cache_dir(explicit.clone());

    assert_eq!(paddle_config.resolve_cache_dir(), explicit);
}
/// Checks the geometry attached to every OCR element of the "Hello World" image.
///
/// Quadrilaterals must carry four points; rectangles must have a positive width
/// and height. Requires ONNX Runtime and downloaded models.
#[tokio::test]
#[ignore = "requires ONNX Runtime and downloaded models"]
async fn test_paddle_ocr_elements_geometry() {
    use kreuzberg::types::OcrBoundingGeometry;

    let sample = test_documents_dir().join("images/test_hello_world.png");
    assert!(sample.exists(), "Test image not found: {:?}", sample);
    let sample_bytes = std::fs::read(&sample).expect("Failed to read image");

    let paddle_config = PaddleOcrConfig::new("en").with_cache_dir(test_cache_dir());
    let paddle = PaddleOcrBackend::with_config(paddle_config).expect("Failed to create backend");
    let english = OcrConfig {
        backend: "paddle-ocr".to_string(),
        language: "en".to_string(),
        ..Default::default()
    };

    let result: kreuzberg::Result<ExtractionResult> = paddle.process_image(&sample_bytes, &english).await;
    assert!(result.is_ok(), "OCR failed: {:?}", result.err());
    let extraction: ExtractionResult = result.unwrap();

    // `Option::expect` panics with exactly this message when no elements came back.
    let elements = extraction
        .ocr_elements
        .as_ref()
        .expect("Expected ocr_elements to be populated");
    assert!(!elements.is_empty(), "Expected at least one OCR element");

    for element in elements.iter() {
        // The checks depend on which geometry variant the element carries.
        match &element.geometry {
            OcrBoundingGeometry::Quadrilateral { points } => {
                assert_eq!(points.len(), 4, "Quadrilateral should have 4 points");
                println!("Quadrilateral with 4 points");
            }
            OcrBoundingGeometry::Rectangle {
                left,
                top,
                width,
                height,
            } => {
                assert!(*width > 0, "Width should be positive");
                assert!(*height > 0, "Height should be positive");
                println!("Rectangle at ({}, {}) size {}x{}", left, top, width, height);
            }
        }
    }

    println!("Found {} OCR elements with valid geometry", elements.len());
}
/// Checks the confidence scores attached to every OCR element.
///
/// The recognition confidence, and the detection confidence when present, must
/// lie in the closed range 0..=1. Requires ONNX Runtime and downloaded models.
#[tokio::test]
#[ignore = "requires ONNX Runtime and downloaded models"]
async fn test_paddle_ocr_elements_confidence() {
    let sample = test_documents_dir().join("images/test_hello_world.png");
    assert!(sample.exists(), "Test image not found: {:?}", sample);
    let sample_bytes = std::fs::read(&sample).expect("Failed to read image");

    let paddle_config = PaddleOcrConfig::new("en").with_cache_dir(test_cache_dir());
    let paddle = PaddleOcrBackend::with_config(paddle_config).expect("Failed to create backend");
    let english = OcrConfig {
        backend: "paddle-ocr".to_string(),
        language: "en".to_string(),
        ..Default::default()
    };

    let result: kreuzberg::Result<ExtractionResult> = paddle.process_image(&sample_bytes, &english).await;
    assert!(result.is_ok(), "OCR failed: {:?}", result.err());
    let extraction: ExtractionResult = result.unwrap();

    // `Option::expect` panics with exactly this message when no elements came back.
    let elements = extraction
        .ocr_elements
        .as_ref()
        .expect("Expected ocr_elements to be populated");
    assert!(!elements.is_empty(), "Expected at least one OCR element");

    for element in elements.iter() {
        let scores = &element.confidence;

        // The range check is false for NaN, exactly like the pair of comparisons it replaces.
        assert!(
            (0.0..=1.0).contains(&scores.recognition),
            "Recognition confidence should be between 0 and 1, got {}",
            scores.recognition
        );

        // Detection confidence is optional; validate it only when present.
        if let Some(det_conf) = scores.detection {
            assert!(
                det_conf >= 0.0 && det_conf <= 1.0,
                "Detection confidence should be between 0 and 1, got {}",
                det_conf
            );
        }

        println!(
            "Element '{}' has recognition confidence: {:.2}%",
            element.text,
            scores.recognition * 100.0
        );
    }
}
/// Checks the rotation information reported on OCR elements of the newspaper image.
///
/// Every element that carries rotation data must report an angle in `[0, 360)`.
/// Elements without rotation data are only counted. Requires ONNX Runtime and
/// downloaded models.
///
/// NOTE(review): no option is set explicitly here; presumably angle classification
/// is active by default — confirm.
#[tokio::test]
#[ignore = "requires ONNX Runtime and downloaded models"]
async fn test_paddle_ocr_rotation_detection() {
    let sample = test_documents_dir().join("images/ocr_image.jpg");
    assert!(sample.exists(), "Test image not found: {:?}", sample);
    let sample_bytes = std::fs::read(&sample).expect("Failed to read image");

    let paddle_config = PaddleOcrConfig::new("en").with_cache_dir(test_cache_dir());
    let paddle = PaddleOcrBackend::with_config(paddle_config).expect("Failed to create backend");
    let english = OcrConfig {
        backend: "paddle-ocr".to_string(),
        language: "en".to_string(),
        ..Default::default()
    };

    let result: kreuzberg::Result<ExtractionResult> = paddle.process_image(&sample_bytes, &english).await;
    assert!(result.is_ok(), "OCR failed: {:?}", result.err());
    let extraction: ExtractionResult = result.unwrap();

    // `Option::expect` panics with exactly this message when no elements came back.
    let elements = extraction
        .ocr_elements
        .as_ref()
        .expect("Expected ocr_elements to be populated");

    let rotated_count = elements.iter().filter(|e| e.rotation.is_some()).count();
    println!(
        "Found {} elements total, {} with rotation info",
        elements.len(),
        rotated_count
    );

    // Visit only the elements that actually carry rotation data.
    for rotation in elements.iter().filter_map(|e| e.rotation.as_ref()) {
        assert!(
            (0.0..360.0).contains(&rotation.angle_degrees),
            "Rotation angle should be between 0 and 360, got {}",
            rotation.angle_degrees
        );
    }
}
/// Runs OCR with table detection enabled on the simple-table sample image.
///
/// Detected tables are only reported, not required. If OCR elements are present,
/// at least one of them must have non-empty text. Requires ONNX Runtime and
/// downloaded models.
#[tokio::test]
#[ignore = "requires ONNX Runtime and downloaded models"]
async fn test_paddle_ocr_table_reconstruction() {
    let image_path = test_documents_dir().join("images/simple_table.png");
    assert!(image_path.exists(), "Test image not found: {:?}", image_path);
    let image_bytes = std::fs::read(&image_path).expect("Failed to read image");

    // Table detection is switched on explicitly in the backend config.
    let config = PaddleOcrConfig::new("en")
        .with_cache_dir(test_cache_dir())
        .with_table_detection(true);
    let backend = PaddleOcrBackend::with_config(config).expect("Failed to create backend");

    let ocr_config = OcrConfig {
        backend: "paddle-ocr".to_string(),
        language: "en".to_string(),
        ..Default::default()
    };

    let result: kreuzberg::Result<ExtractionResult> = backend.process_image(&image_bytes, &ocr_config).await;
    assert!(result.is_ok(), "OCR failed: {:?}", result.err());
    let extraction: ExtractionResult = result.unwrap();

    // Build the preview from whole characters: slicing the string at a fixed byte
    // offset panics when that offset falls inside a multi-byte UTF-8 character.
    let preview: String = extraction.content.chars().take(500).collect();
    println!("OCR result (first 500 chars): {}", preview);

    // Report the detected tables, if any; the column count comes from the first row.
    if !extraction.tables.is_empty() {
        println!("Found {} tables", extraction.tables.len());
        for (i, table) in extraction.tables.iter().enumerate() {
            println!(
                "Table {}: {} rows x {} cols",
                i,
                table.cells.len(),
                table.cells.first().map(|r| r.len()).unwrap_or(0)
            );
        }
    }

    // When elements are reported, at least one must carry text.
    if let Some(elements) = &extraction.ocr_elements {
        println!("Found {} OCR elements", elements.len());
        let non_empty_elements = elements.iter().filter(|e| !e.text.is_empty()).count();
        assert!(non_empty_elements > 0, "Expected at least one element with text");
    }
}
// ============================================================================
// Mobile tier integration tests with quality measurement (TF1)
// ============================================================================
/// Computes a Text F1 score between predicted and reference text.
///
/// Both inputs are split on whitespace and reduced to sets of unique tokens, so
/// repeated tokens count once. Precision and recall are the shared-token count
/// divided by the size of the predicted and reference set respectively.
///
/// Returns `1.0` when both inputs have no tokens, `0.0` when exactly one of them
/// has none or when no token is shared, and the harmonic mean of precision and
/// recall otherwise.
fn compute_tf1(predicted: &str, reference: &str) -> f64 {
    use std::collections::HashSet;

    let predicted_vocab: HashSet<&str> = predicted.split_whitespace().collect();
    let reference_vocab: HashSet<&str> = reference.split_whitespace().collect();

    match (predicted_vocab.is_empty(), reference_vocab.is_empty()) {
        // Two empty texts agree perfectly.
        (true, true) => 1.0,
        // One side has no tokens at all: nothing can match.
        (true, false) | (false, true) => 0.0,
        (false, false) => {
            let shared = predicted_vocab.intersection(&reference_vocab).count() as f64;
            let precision = shared / predicted_vocab.len() as f64;
            let recall = shared / reference_vocab.len() as f64;
            // Guard the division below when no token is shared.
            if precision + recall == 0.0 {
                0.0
            } else {
                2.0 * precision * recall / (precision + recall)
            }
        }
    }
}
/// Ground truth for the complex_document test image.
///
/// Used as the reference text when scoring OCR output with `compute_tf1`.
///
/// NOTE(review): several amounts look truncated (`,500`, `,000`, `,200`, `,400`, `,600`),
/// as if a leading currency symbol and digits were lost — TODO confirm against the image.
const COMPLEX_DOC_GT: &str = "Sales Report 2024 This report contains quarterly sales data for our products. Q1 Sales: Product Units Revenue Widget A 150 ,500 Widget B 200 ,000 Widget C 100 ,000 Q2 Sales: Product Units Revenue Widget A 180 ,000 Widget B 220 ,200 Widget C 130 ,400 Summary: Total Q1 Revenue: ,500 Total Q2 Revenue: ,600 Prepared by: John Doe Date: 2024-03-15 Department: Finance";
/// Runs the mobile model tier on the complex-document image and scores the output.
///
/// The text is compared with `COMPLEX_DOC_GT` through `compute_tf1`; the score must
/// exceed 0.5. The elapsed time is printed but not asserted. Requires ONNX Runtime
/// and downloaded models.
#[tokio::test]
#[ignore = "requires ONNX Runtime and downloaded models"]
async fn test_mobile_tier_ocr_quality() {
    let image_path = test_documents_dir().join("images/complex_document.png");
    assert!(image_path.exists(), "Test image not found: {:?}", image_path);
    let image_bytes = std::fs::read(&image_path).expect("Failed to read image");

    // The mobile tier is requested both on the backend and in the per-call config.
    let config = PaddleOcrConfig::new("en")
        .with_cache_dir(test_cache_dir())
        .with_model_tier("mobile");
    let backend = PaddleOcrBackend::with_config(config).expect("Failed to create backend");

    let ocr_config = OcrConfig {
        backend: "paddle-ocr".to_string(),
        language: "en".to_string(),
        paddle_ocr_config: Some(serde_json::json!({"model_tier": "mobile"})),
        ..Default::default()
    };

    let start = std::time::Instant::now();
    let result = backend.process_image(&image_bytes, &ocr_config).await;
    let elapsed_ms = start.elapsed().as_millis();

    assert!(result.is_ok(), "Mobile tier OCR failed: {:?}", result.err());
    let extraction = result.unwrap();

    let tf1 = compute_tf1(&extraction.content, COMPLEX_DOC_GT);
    println!("Mobile tier TF1: {:.1}% ({} ms)", tf1 * 100.0, elapsed_ms);

    // Build the preview from whole characters: slicing the string at a fixed byte
    // offset panics when that offset falls inside a multi-byte UTF-8 character.
    let preview: String = extraction.content.chars().take(200).collect();
    println!("Extracted text: {}", preview);

    // The mobile tier must score above 50% TF1 on this document.
    assert!(
        tf1 > 0.5,
        "Mobile tier TF1 too low: {:.1}% (expected >50%)",
        tf1 * 100.0
    );
}
/// Runs the server model tier on the complex-document image and scores the output.
///
/// The text is compared with `COMPLEX_DOC_GT` through `compute_tf1`; the score must
/// exceed 0.6. The elapsed time is printed but not asserted. Requires ONNX Runtime
/// and downloaded models.
#[tokio::test]
#[ignore = "requires ONNX Runtime and downloaded models"]
async fn test_server_tier_ocr_quality() {
    let sample = test_documents_dir().join("images/complex_document.png");
    assert!(sample.exists(), "Test image not found: {:?}", sample);
    let sample_bytes = std::fs::read(&sample).expect("Failed to read image");

    // Select the server tier explicitly on the backend config.
    let paddle_config = PaddleOcrConfig::new("en")
        .with_cache_dir(test_cache_dir())
        .with_model_tier("server");
    let paddle = PaddleOcrBackend::with_config(paddle_config).expect("Failed to create backend");

    let english = OcrConfig {
        backend: "paddle-ocr".to_string(),
        language: "en".to_string(),
        ..Default::default()
    };

    // Time only the OCR call itself.
    let timer = std::time::Instant::now();
    let result = paddle.process_image(&sample_bytes, &english).await;
    let elapsed_ms = timer.elapsed().as_millis();

    assert!(result.is_ok(), "Server tier OCR failed: {:?}", result.err());
    let extraction = result.unwrap();

    let score = compute_tf1(&extraction.content, COMPLEX_DOC_GT);
    println!("Server tier TF1: {:.1}% ({} ms)", score * 100.0, elapsed_ms);

    // The server tier must score above 60% TF1 on this document.
    assert!(
        score > 0.6,
        "Server tier TF1 too low: {:.1}% (expected >60%)",
        score * 100.0
    );
}
/// Runs the mobile tier with `auto_rotate` on four orientations of one document.
///
/// Each image is scored against `COMPLEX_DOC_GT` with `compute_tf1`. The worst
/// score must exceed 0.4 and the gap between the best and worst score must stay
/// below 0.3. Requires ONNX Runtime and downloaded models.
#[tokio::test]
#[ignore = "requires ONNX Runtime and downloaded models"]
async fn test_mobile_tier_auto_rotate() {
    let images_dir = test_documents_dir().join("images");

    // File name plus a human-readable label for each orientation.
    let orientations = [
        ("complex_document.png", "original (0°)"),
        ("complex_document_rotated_90.png", "rotated 90°"),
        ("complex_document_rotated_180.png", "rotated 180°"),
        ("complex_document_rotated_270.png", "rotated 270°"),
    ];

    let paddle_config = PaddleOcrConfig::new("en")
        .with_cache_dir(test_cache_dir())
        .with_model_tier("mobile");
    let paddle = PaddleOcrBackend::with_config(paddle_config).expect("Failed to create backend");

    let mut scores = Vec::new();
    for (filename, label) in orientations.iter() {
        let sample = images_dir.join(filename);
        assert!(sample.exists(), "Test image not found: {:?}", sample);
        let sample_bytes = std::fs::read(&sample).expect("Failed to read image");

        // A fresh per-call config with auto-rotation switched on.
        let ocr_config = OcrConfig {
            backend: "paddle-ocr".to_string(),
            language: "en".to_string(),
            auto_rotate: true,
            paddle_ocr_config: Some(serde_json::json!({"model_tier": "mobile"})),
            ..Default::default()
        };

        let timer = std::time::Instant::now();
        let result = paddle.process_image(&sample_bytes, &ocr_config).await;
        let elapsed_ms = timer.elapsed().as_millis();

        assert!(result.is_ok(), "OCR failed on {}: {:?}", label, result.err());
        let extraction = result.unwrap();

        let score = compute_tf1(&extraction.content, COMPLEX_DOC_GT);
        scores.push(score);
        println!("{}: TF1={:.1}% ({} ms)", label, score * 100.0, elapsed_ms);
    }

    // Best and worst score across all orientations.
    let worst = scores.iter().copied().fold(f64::INFINITY, f64::min);
    let best = scores.iter().copied().fold(f64::NEG_INFINITY, f64::max);
    println!(
        "TF1 range: {:.1}% - {:.1}% (spread: {:.1}%)",
        worst * 100.0,
        best * 100.0,
        (best - worst) * 100.0
    );

    // Every orientation must score above 40% TF1 ...
    assert!(
        worst > 0.4,
        "Worst orientation TF1 too low: {:.1}% (expected >40% with auto_rotate)",
        worst * 100.0
    );
    // ... and the orientations must stay within 30% of each other.
    assert!(
        best - worst < 0.3,
        "TF1 spread too large: {:.1}% (expected <30% with auto_rotate)",
        (best - worst) * 100.0
    );
}
/// Downloads the mobile-tier detection and English recognition models and checks
/// that they land in the cache.
///
/// Each `model.onnx` must exist and stay under a size cap: 10MB for detection,
/// 20MB for recognition. The recognition dictionary must exist as well.
/// Requires network access.
#[tokio::test]
#[ignore = "requires network access"]
async fn test_mobile_tier_model_cache() {
    let manager = ModelManager::new(test_cache_dir());

    // --- Detection model (mobile tier) ---
    let det_result = manager.ensure_v2_det_model("mobile");
    assert!(det_result.is_ok(), "Mobile det download failed: {:?}", det_result.err());
    let det_onnx = det_result.unwrap().join("model.onnx");
    assert!(det_onnx.exists(), "Mobile det model not cached");

    // The mobile detection model is expected to be small; the cap is 10MB.
    let det_size = std::fs::metadata(&det_onnx).unwrap().len();
    assert!(
        det_size < 10_000_000,
        "Mobile det model too large: {} bytes (expected <10MB)",
        det_size
    );
    println!(
        "Mobile det model size: {} bytes ({:.1} MB)",
        det_size,
        det_size as f64 / 1_048_576.0
    );

    // --- Recognition model (English, mobile tier) ---
    let rec_result = manager.resolve_rec_model("english", "mobile");
    assert!(rec_result.is_ok(), "Mobile rec download failed: {:?}", rec_result.err());
    let rec = rec_result.unwrap();
    let rec_onnx = rec.model_dir.join("model.onnx");
    assert!(rec_onnx.exists(), "Mobile rec model not cached");
    assert!(rec.dict_file.exists(), "Mobile rec dict not cached");

    // The cap for the recognition model is 20MB.
    let rec_size = std::fs::metadata(&rec_onnx).unwrap().len();
    assert!(
        rec_size < 20_000_000,
        "Mobile rec model too large: {} bytes (expected <20MB)",
        rec_size
    );
    println!(
        "Mobile rec model size: {} bytes ({:.1} MB)",
        rec_size,
        rec_size as f64 / 1_048_576.0
    );
    println!("Mobile rec model key: {}", rec.model_key);
}
/// The server and mobile tiers must resolve to different models.
///
/// Compares the detection model paths and the recognition model keys of both
/// tiers. Requires network access.
#[tokio::test]
#[ignore = "requires network access"]
async fn test_tier_model_differentiation() {
    let manager = ModelManager::new(test_cache_dir());

    // Detection: the two tiers must resolve to different paths.
    let server_det = manager.ensure_v2_det_model("server").unwrap();
    let mobile_det = manager.ensure_v2_det_model("mobile").unwrap();
    assert_ne!(server_det, mobile_det, "Server and mobile det paths should differ");

    // Recognition: the two tiers must carry different model keys.
    let server_rec = manager.resolve_rec_model("english", "server").unwrap();
    let mobile_rec = manager.resolve_rec_model("english", "mobile").unwrap();
    assert_ne!(
        server_rec.model_key, mobile_rec.model_key,
        "Server and mobile rec model keys should differ"
    );

    println!("Server det: {:?}", server_det);
    println!("Mobile det: {:?}", mobile_det);
    println!("Server rec key: {}", server_rec.model_key);
    println!("Mobile rec key: {}", mobile_rec.model_key);
}
/// Checks the default cache directory when no explicit `cache_dir` is configured
/// and `KREUZBERG_CACHE_DIR` is unset.
///
/// The variable is cleared for the duration of the test and put back by a drop
/// guard, so a previously set value is restored even when an assertion fails.
#[test]
fn test_cache_dir_default() {
    const CACHE_DIR_VAR: &str = "KREUZBERG_CACHE_DIR";

    /// Restores the saved value of `CACHE_DIR_VAR` (if there was one) when dropped.
    struct RestoreCacheDirVar(Option<std::ffi::OsString>);

    impl Drop for RestoreCacheDirVar {
        fn drop(&mut self) {
            if let Some(saved) = self.0.take() {
                // SAFETY: mutating the process environment is only sound while no
                // other thread reads or writes it; see the note on `remove_var` in
                // the test body.
                unsafe {
                    std::env::set_var(CACHE_DIR_VAR, saved);
                }
            }
        }
    }

    // `var_os` also preserves values that are not valid Unicode, which `var`
    // would report as an error and the restore step would then silently drop.
    let _restore = RestoreCacheDirVar(std::env::var_os(CACHE_DIR_VAR));

    // SAFETY: this test manipulates environment variables, which is only sound
    // while no other thread touches the environment. Run with --test-threads=1
    // if this interferes with other tests.
    unsafe {
        std::env::remove_var(CACHE_DIR_VAR);
    }

    let config = PaddleOcrConfig::new("en");
    let resolved = config.resolve_cache_dir();
    let resolved_text = resolved.to_string_lossy();

    // Default should use .kreuzberg/paddle-ocr/
    assert!(resolved_text.contains(".kreuzberg"));
    assert!(resolved_text.contains("paddle-ocr"));
}
*/