//! TODO: Restored from 245539484 alef-migration cleanup. Currently exercises //! pub(crate) APIs that the migration deliberately narrowed; gated until //! either (a) these APIs are re-exposed publicly, or (b) the test is //! rewritten against the public extraction surface. #![cfg(any())] // Original content preserved below; recompiled once gating cfg drops. // Disabled by the file-level cfg(any()) above. /* //! Integration tests for PaddleOCR functionality. //! //! These tests require: //! - Network access to download models from HuggingFace //! - ONNX Runtime installed on the system //! //! Run with: `cargo test -p kreuzberg --features paddle-ocr --test paddle_ocr_integration -- --ignored` #![cfg(feature = "paddle-ocr")] use std::path::PathBuf; use kreuzberg::core::config::OcrConfig; use kreuzberg::paddle_ocr::{ModelManager, PaddleOcrBackend, PaddleOcrConfig}; use kreuzberg::plugins::OcrBackend; use kreuzberg::types::ExtractionResult; /// Helper to get the test documents directory fn test_documents_dir() -> PathBuf { PathBuf::from(env!("CARGO_MANIFEST_DIR")) .parent() .unwrap() .parent() .unwrap() .join("test_documents") } /// Helper to get a temporary cache directory for tests fn test_cache_dir() -> PathBuf { std::env::temp_dir().join("kreuzberg_paddle_test") } /// Test that model manager can download models from HuggingFace. /// /// This test downloads actual models and verifies they are cached correctly. /// It's ignored by default since it requires network access and ~100MB download. 
#[tokio::test] #[ignore = "requires network access and ~100MB download"] async fn test_model_download_from_huggingface() { let cache_dir = test_cache_dir(); // Clean up any existing cache let _ = std::fs::remove_dir_all(&cache_dir); let manager = ModelManager::new(cache_dir.clone()); // Verify cache is empty assert!(!manager.are_models_cached()); // Download models (synchronous now) let result = manager.ensure_models_exist(); assert!(result.is_ok(), "Model download failed: {:?}", result.err()); let paths: kreuzberg::paddle_ocr::ModelPaths = result.unwrap(); // Verify all model directories exist assert!(paths.det_model.exists(), "Detection model dir not found"); assert!(paths.cls_model.exists(), "Classification model dir not found"); assert!(paths.rec_model.exists(), "Recognition model dir not found"); // Verify ONNX model files exist within directories assert!( paths.det_model.join("model.onnx").exists(), "Detection ONNX file not found" ); assert!( paths.cls_model.join("model.onnx").exists(), "Classification ONNX file not found" ); assert!( paths.rec_model.join("model.onnx").exists(), "Recognition ONNX file not found" ); // Verify dictionary file exists assert!(paths.dict_file.exists(), "Dictionary file not found"); // Verify cache reports correctly assert!(manager.are_models_cached()); // Check cache stats let stats = manager.cache_stats().unwrap(); // 3 model dirs, each containing model.onnx (rec/ also has dict.txt) assert!( stats.model_count >= 3, "Expected at least 3 cached items, got {}", stats.model_count ); // Models should be > 1MB each assert!(stats.total_size_bytes > 1_000_000); println!("Cache stats: {:?}", stats); println!("Detection model: {:?}", paths.det_model); println!("Classification model: {:?}", paths.cls_model); println!("Recognition model: {:?}", paths.rec_model); println!("Dictionary file: {:?}", paths.dict_file); } /// Test OCR on a simple English "Hello World" image. /// /// This test requires ONNX Runtime and downloaded models. 
#[tokio::test] #[ignore = "requires ONNX Runtime and downloaded models"] async fn test_ocr_hello_world_english() { let image_path = test_documents_dir().join("images/test_hello_world.png"); assert!(image_path.exists(), "Test image not found: {:?}", image_path); let image_bytes = std::fs::read(&image_path).expect("Failed to read image"); let config = PaddleOcrConfig::new("en").with_cache_dir(test_cache_dir()); let backend = PaddleOcrBackend::with_config(config).expect("Failed to create backend"); let ocr_config = OcrConfig { backend: "paddle-ocr".to_string(), language: "en".to_string(), ..Default::default() }; let result: kreuzberg::Result = backend.process_image(&image_bytes, &ocr_config).await; assert!(result.is_ok(), "OCR failed: {:?}", result.err()); let extraction: ExtractionResult = result.unwrap(); let text = extraction.content.to_lowercase(); println!("OCR result: {}", extraction.content); // Should contain "hello" and "world" assert!( text.contains("hello") || text.contains("helo"), "Expected 'hello' in OCR result: {}", text ); assert!( text.contains("world") || text.contains("worid"), "Expected 'world' in OCR result: {}", text ); } /// Test OCR on a complex English document (newspaper). 
#[tokio::test] #[ignore = "requires ONNX Runtime and downloaded models"] async fn test_ocr_newspaper_english() { let image_path = test_documents_dir().join("images/ocr_image.jpg"); assert!(image_path.exists(), "Test image not found: {:?}", image_path); let image_bytes = std::fs::read(&image_path).expect("Failed to read image"); let config = PaddleOcrConfig::new("en").with_cache_dir(test_cache_dir()); let backend = PaddleOcrBackend::with_config(config).expect("Failed to create backend"); let ocr_config = OcrConfig { backend: "paddle-ocr".to_string(), language: "en".to_string(), ..Default::default() }; let result: kreuzberg::Result = backend.process_image(&image_bytes, &ocr_config).await; assert!(result.is_ok(), "OCR failed: {:?}", result.err()); let extraction: ExtractionResult = result.unwrap(); let text = extraction.content.to_uppercase(); println!( "OCR result (first 500 chars): {}", &extraction.content[..extraction.content.len().min(500)] ); // Should contain "NASDAQ" and "AMEX" from the header assert!( text.contains("NASDAQ") || text.contains("NASOAQ"), "Expected 'NASDAQ' in OCR result" ); assert!( text.contains("AMEX") || text.contains("STOCK"), "Expected 'AMEX' or 'STOCK' in OCR result" ); } /// Test OCR on Chinese text image. /// /// Note: Uses per-family PP-OCRv5 recognition models. /// This test verifies the pipeline handles non-English images without crashing, /// but requires the Chinese recognition model to be cached for accurate results. 
#[tokio::test] #[ignore = "requires ONNX Runtime and downloaded models"] async fn test_ocr_chinese_text() { let image_path = test_documents_dir().join("images/chi_sim_image.jpeg"); assert!(image_path.exists(), "Test image not found: {:?}", image_path); let image_bytes = std::fs::read(&image_path).expect("Failed to read image"); // Use Chinese language setting let config = PaddleOcrConfig::new("ch").with_cache_dir(test_cache_dir()); let backend = PaddleOcrBackend::with_config(config).expect("Failed to create backend"); let ocr_config = OcrConfig { backend: "paddle-ocr".to_string(), language: "ch".to_string(), ..Default::default() }; let result: kreuzberg::Result = backend.process_image(&image_bytes, &ocr_config).await; assert!(result.is_ok(), "OCR failed: {:?}", result.err()); let extraction: ExtractionResult = result.unwrap(); println!("OCR result: {}", extraction.content); // The pipeline should produce some output without crashing. // With the English-only model, Chinese characters are not recognized, // but the detection and recognition pipeline should still function. assert!( !extraction.content.is_empty(), "Expected non-empty OCR result for Chinese image" ); } /// Test that the backend correctly reports supported languages. #[test] fn test_supported_languages() { let backend = PaddleOcrBackend::new().expect("Failed to create backend"); // Direct PaddleOCR codes assert!(backend.supports_language("ch")); assert!(backend.supports_language("en")); assert!(backend.supports_language("japan")); assert!(backend.supports_language("korean")); // Mapped Tesseract/ISO codes assert!(backend.supports_language("chi_sim")); assert!(backend.supports_language("eng")); assert!(backend.supports_language("jpn")); assert!(backend.supports_language("fra")); assert!(backend.supports_language("deu")); // Unsupported assert!(!backend.supports_language("xyz")); assert!(!backend.supports_language("klingon")); } /// Test that empty image returns an error. 
#[tokio::test]
async fn test_empty_image_error() {
    let backend = PaddleOcrBackend::new().expect("Failed to create backend");
    let ocr_config = OcrConfig {
        backend: String::from("paddle-ocr"),
        language: String::from("en"),
        ..Default::default()
    };

    // A zero-length byte slice must be rejected.
    let outcome = backend.process_image(&[], &ocr_config).await;
    assert!(outcome.is_err(), "Expected error for empty image");
}

/// Test that invalid image data returns an error (requires ONNX Runtime).
#[tokio::test]
#[ignore = "requires ONNX Runtime and downloaded models"]
async fn test_invalid_image_error() {
    let paddle_config = PaddleOcrConfig::new("en").with_cache_dir(test_cache_dir());
    let backend = PaddleOcrBackend::with_config(paddle_config).expect("Failed to create backend");
    let ocr_config = OcrConfig {
        backend: String::from("paddle-ocr"),
        language: String::from("en"),
        ..Default::default()
    };

    // Ten arbitrary bytes that do not form a valid image.
    let garbage = [0u8, 1, 2, 3, 4, 5, 6, 7, 8, 9];

    let outcome = backend.process_image(&garbage, &ocr_config).await;
    assert!(outcome.is_err(), "Expected error for invalid image data");
}

/// Test processing an image file directly.
#[tokio::test]
#[ignore = "requires ONNX Runtime and downloaded models"]
async fn test_process_image_file() {
    let image_path = test_documents_dir().join("images/test_hello_world.png");
    assert!(image_path.exists(), "Test image not found: {:?}", image_path);

    let paddle_config = PaddleOcrConfig::new("en").with_cache_dir(test_cache_dir());
    let backend = PaddleOcrBackend::with_config(paddle_config).expect("Failed to create backend");
    let ocr_config = OcrConfig {
        backend: String::from("paddle-ocr"),
        language: String::from("en"),
        ..Default::default()
    };

    // Hand the backend a path instead of pre-read bytes.
    let result = backend.process_image_file(&image_path, &ocr_config).await;
    assert!(result.is_ok(), "OCR failed: {:?}", result.err());
    let extraction: ExtractionResult = result.unwrap();

    let text = extraction.content.to_lowercase();
    let saw_hello = ["hello", "helo"].iter().any(|needle| text.contains(*needle));
    assert!(saw_hello, "Expected 'hello' in OCR result");
}

/// Test that explicit cache_dir in config overrides default.
#[test]
fn test_cache_dir_explicit_config() {
    // A cache directory set on the config must be returned unchanged by
    // `resolve_cache_dir`.
    let explicit = PathBuf::from("/explicit/path");
    let config = PaddleOcrConfig::new("en").with_cache_dir(explicit.clone());

    assert_eq!(config.resolve_cache_dir(), explicit);
}

/// Test that OCR elements have proper geometry (quadrilateral bounding boxes).
#[tokio::test] #[ignore = "requires ONNX Runtime and downloaded models"] async fn test_paddle_ocr_elements_geometry() { use kreuzberg::types::OcrBoundingGeometry; let image_path = test_documents_dir().join("images/test_hello_world.png"); assert!(image_path.exists(), "Test image not found: {:?}", image_path); let image_bytes = std::fs::read(&image_path).expect("Failed to read image"); let config = PaddleOcrConfig::new("en").with_cache_dir(test_cache_dir()); let backend = PaddleOcrBackend::with_config(config).expect("Failed to create backend"); let ocr_config = OcrConfig { backend: "paddle-ocr".to_string(), language: "en".to_string(), ..Default::default() }; let result: kreuzberg::Result = backend.process_image(&image_bytes, &ocr_config).await; assert!(result.is_ok(), "OCR failed: {:?}", result.err()); let extraction: ExtractionResult = result.unwrap(); // Check that OCR elements are present assert!( extraction.ocr_elements.is_some(), "Expected ocr_elements to be populated" ); let elements = extraction.ocr_elements.as_ref().unwrap(); assert!(!elements.is_empty(), "Expected at least one OCR element"); // Verify each element has geometry for element in elements { // Check geometry based on variant match &element.geometry { OcrBoundingGeometry::Quadrilateral { points } => { // Quadrilateral should have 4 points assert_eq!(points.len(), 4, "Quadrilateral should have 4 points"); println!("Quadrilateral with 4 points"); } OcrBoundingGeometry::Rectangle { left, top, width, height, } => { assert!(*width > 0, "Width should be positive"); assert!(*height > 0, "Height should be positive"); println!("Rectangle at ({}, {}) size {}x{}", left, top, width, height); } } } println!("Found {} OCR elements with valid geometry", elements.len()); } /// Test that OCR elements have confidence scores. 
#[tokio::test] #[ignore = "requires ONNX Runtime and downloaded models"] async fn test_paddle_ocr_elements_confidence() { let image_path = test_documents_dir().join("images/test_hello_world.png"); assert!(image_path.exists(), "Test image not found: {:?}", image_path); let image_bytes = std::fs::read(&image_path).expect("Failed to read image"); let config = PaddleOcrConfig::new("en").with_cache_dir(test_cache_dir()); let backend = PaddleOcrBackend::with_config(config).expect("Failed to create backend"); let ocr_config = OcrConfig { backend: "paddle-ocr".to_string(), language: "en".to_string(), ..Default::default() }; let result: kreuzberg::Result = backend.process_image(&image_bytes, &ocr_config).await; assert!(result.is_ok(), "OCR failed: {:?}", result.err()); let extraction: ExtractionResult = result.unwrap(); assert!( extraction.ocr_elements.is_some(), "Expected ocr_elements to be populated" ); let elements = extraction.ocr_elements.as_ref().unwrap(); assert!(!elements.is_empty(), "Expected at least one OCR element"); // Verify each element has confidence score for element in elements { // Recognition confidence should be between 0 and 1 assert!( element.confidence.recognition >= 0.0 && element.confidence.recognition <= 1.0, "Recognition confidence should be between 0 and 1, got {}", element.confidence.recognition ); // PaddleOCR also provides detection confidence if let Some(det_conf) = element.confidence.detection { assert!( (0.0..=1.0).contains(&det_conf), "Detection confidence should be between 0 and 1, got {}", det_conf ); } println!( "Element '{}' has recognition confidence: {:.2}%", element.text, element.confidence.recognition * 100.0 ); } } /// Test rotation detection via angle classification. 
#[tokio::test] #[ignore = "requires ONNX Runtime and downloaded models"] async fn test_paddle_ocr_rotation_detection() { // Use an image that might have rotated text let image_path = test_documents_dir().join("images/ocr_image.jpg"); assert!(image_path.exists(), "Test image not found: {:?}", image_path); let image_bytes = std::fs::read(&image_path).expect("Failed to read image"); // Enable angle classification let config = PaddleOcrConfig::new("en").with_cache_dir(test_cache_dir()); let backend = PaddleOcrBackend::with_config(config).expect("Failed to create backend"); let ocr_config = OcrConfig { backend: "paddle-ocr".to_string(), language: "en".to_string(), ..Default::default() }; let result: kreuzberg::Result = backend.process_image(&image_bytes, &ocr_config).await; assert!(result.is_ok(), "OCR failed: {:?}", result.err()); let extraction: ExtractionResult = result.unwrap(); assert!( extraction.ocr_elements.is_some(), "Expected ocr_elements to be populated" ); let elements = extraction.ocr_elements.as_ref().unwrap(); // Check that rotation info is populated when available let elements_with_rotation = elements.iter().filter(|e| e.rotation.is_some()).count(); println!( "Found {} elements total, {} with rotation info", elements.len(), elements_with_rotation ); // For elements with rotation, verify the angle is valid for element in elements.iter().filter(|e| e.rotation.is_some()) { let rotation = element.rotation.as_ref().unwrap(); // Rotation should be in degrees (typically 0, 90, 180, 270) assert!( rotation.angle_degrees >= 0.0 && rotation.angle_degrees < 360.0, "Rotation angle should be between 0 and 360, got {}", rotation.angle_degrees ); } } /// Test table reconstruction from OCR elements. 
#[tokio::test] #[ignore = "requires ONNX Runtime and downloaded models"] async fn test_paddle_ocr_table_reconstruction() { let image_path = test_documents_dir().join("images/simple_table.png"); assert!(image_path.exists(), "Test image not found: {:?}", image_path); let image_bytes = std::fs::read(&image_path).expect("Failed to read image"); // Enable table detection let config = PaddleOcrConfig::new("en") .with_cache_dir(test_cache_dir()) .with_table_detection(true); let backend = PaddleOcrBackend::with_config(config).expect("Failed to create backend"); let ocr_config = OcrConfig { backend: "paddle-ocr".to_string(), language: "en".to_string(), ..Default::default() }; let result: kreuzberg::Result = backend.process_image(&image_bytes, &ocr_config).await; assert!(result.is_ok(), "OCR failed: {:?}", result.err()); let extraction: ExtractionResult = result.unwrap(); println!( "OCR result (first 500 chars): {}", &extraction.content[..extraction.content.len().min(500)] ); // Check if tables were detected if !extraction.tables.is_empty() { println!("Found {} tables", extraction.tables.len()); for (i, table) in extraction.tables.iter().enumerate() { println!( "Table {}: {} rows x {} cols", i, table.cells.len(), table.cells.first().map(|r| r.len()).unwrap_or(0) ); } } // OCR elements should also be populated if let Some(elements) = &extraction.ocr_elements { println!("Found {} OCR elements", elements.len()); // Elements should have text content let non_empty_elements = elements.iter().filter(|e| !e.text.is_empty()).count(); assert!(non_empty_elements > 0, "Expected at least one element with text"); } } // ============================================================================ // Mobile tier integration tests with quality measurement (TF1) // ============================================================================ /// Compute Text F1 score: token-level precision/recall between predicted and reference text. 
fn compute_tf1(predicted: &str, reference: &str) -> f64 {
    use std::collections::HashSet;

    // Duplicates collapse: each distinct token counts once per side.
    let predicted_vocab: HashSet<&str> = predicted.split_whitespace().collect();
    let reference_vocab: HashSet<&str> = reference.split_whitespace().collect();

    // Two token-less inputs agree perfectly; exactly one token-less input
    // scores zero.
    match (predicted_vocab.is_empty(), reference_vocab.is_empty()) {
        (true, true) => return 1.0,
        (true, false) | (false, true) => return 0.0,
        (false, false) => {}
    }

    let shared = predicted_vocab.intersection(&reference_vocab).count() as f64;
    let precision = shared / predicted_vocab.len() as f64;
    let recall = shared / reference_vocab.len() as f64;

    // No overlap at all: avoid dividing by zero.
    let denominator = precision + recall;
    if denominator == 0.0 {
        0.0
    } else {
        2.0 * precision * recall / denominator
    }
}

/// Ground truth for the complex_document test image.
// NOTE(review): amounts such as ` ,500` look truncated (a leading `$N` may
// have been lost) — confirm against the fixture image before editing.
const COMPLEX_DOC_GT: &str = "Sales Report 2024 This report contains quarterly sales data for our products. Q1 Sales: Product Units Revenue Widget A 150 ,500 Widget B 200 ,000 Widget C 100 ,000 Q2 Sales: Product Units Revenue Widget A 180 ,000 Widget B 220 ,200 Widget C 130 ,400 Summary: Total Q1 Revenue: ,500 Total Q2 Revenue: ,600 Prepared by: John Doe Date: 2024-03-15 Department: Finance";

/// Test mobile tier OCR on a document image, measuring TF1.
#[tokio::test] #[ignore = "requires ONNX Runtime and downloaded models"] async fn test_mobile_tier_ocr_quality() { let image_path = test_documents_dir().join("images/complex_document.png"); assert!(image_path.exists(), "Test image not found: {:?}", image_path); let image_bytes = std::fs::read(&image_path).expect("Failed to read image"); // Mobile tier config let config = PaddleOcrConfig::new("en") .with_cache_dir(test_cache_dir()) .with_model_tier("mobile"); let backend = PaddleOcrBackend::with_config(config).expect("Failed to create backend"); let ocr_config = OcrConfig { backend: "paddle-ocr".to_string(), language: "en".to_string(), paddle_ocr_config: Some(serde_json::json!({"model_tier": "mobile"})), ..Default::default() }; let start = std::time::Instant::now(); let result = backend.process_image(&image_bytes, &ocr_config).await; let elapsed_ms = start.elapsed().as_millis(); assert!(result.is_ok(), "Mobile tier OCR failed: {:?}", result.err()); let extraction = result.unwrap(); let tf1 = compute_tf1(&extraction.content, COMPLEX_DOC_GT); println!("Mobile tier TF1: {:.1}% ({} ms)", tf1 * 100.0, elapsed_ms); println!( "Extracted text: {}", &extraction.content[..extraction.content.len().min(200)] ); // Mobile tier should achieve at least 50% TF1 on this document assert!( tf1 > 0.5, "Mobile tier TF1 too low: {:.1}% (expected >50%)", tf1 * 100.0 ); } /// Test server tier OCR on the same document for comparison. 
#[tokio::test] #[ignore = "requires ONNX Runtime and downloaded models"] async fn test_server_tier_ocr_quality() { let image_path = test_documents_dir().join("images/complex_document.png"); assert!(image_path.exists(), "Test image not found: {:?}", image_path); let image_bytes = std::fs::read(&image_path).expect("Failed to read image"); // Server tier config (default) let config = PaddleOcrConfig::new("en") .with_cache_dir(test_cache_dir()) .with_model_tier("server"); let backend = PaddleOcrBackend::with_config(config).expect("Failed to create backend"); let ocr_config = OcrConfig { backend: "paddle-ocr".to_string(), language: "en".to_string(), ..Default::default() }; let start = std::time::Instant::now(); let result = backend.process_image(&image_bytes, &ocr_config).await; let elapsed_ms = start.elapsed().as_millis(); assert!(result.is_ok(), "Server tier OCR failed: {:?}", result.err()); let extraction = result.unwrap(); let tf1 = compute_tf1(&extraction.content, COMPLEX_DOC_GT); println!("Server tier TF1: {:.1}% ({} ms)", tf1 * 100.0, elapsed_ms); // Server tier should achieve at least 60% TF1 assert!( tf1 > 0.6, "Server tier TF1 too low: {:.1}% (expected >60%)", tf1 * 100.0 ); } /// Test mobile tier with auto_rotate on rotated images. /// Verifies that doc_ori detection + rotation correction produces consistent TF1. 
#[tokio::test] #[ignore = "requires ONNX Runtime and downloaded models"] async fn test_mobile_tier_auto_rotate() { let base_dir = test_documents_dir().join("images"); let test_cases = vec![ ("complex_document.png", "original (0°)"), ("complex_document_rotated_90.png", "rotated 90°"), ("complex_document_rotated_180.png", "rotated 180°"), ("complex_document_rotated_270.png", "rotated 270°"), ]; let config = PaddleOcrConfig::new("en") .with_cache_dir(test_cache_dir()) .with_model_tier("mobile"); let backend = PaddleOcrBackend::with_config(config).expect("Failed to create backend"); let mut tf1_scores = Vec::new(); for (filename, label) in &test_cases { let image_path = base_dir.join(filename); assert!(image_path.exists(), "Test image not found: {:?}", image_path); let image_bytes = std::fs::read(&image_path).expect("Failed to read image"); let ocr_config = OcrConfig { backend: "paddle-ocr".to_string(), language: "en".to_string(), auto_rotate: true, paddle_ocr_config: Some(serde_json::json!({"model_tier": "mobile"})), ..Default::default() }; let start = std::time::Instant::now(); let result = backend.process_image(&image_bytes, &ocr_config).await; let elapsed_ms = start.elapsed().as_millis(); assert!(result.is_ok(), "OCR failed on {}: {:?}", label, result.err()); let extraction = result.unwrap(); let tf1 = compute_tf1(&extraction.content, COMPLEX_DOC_GT); tf1_scores.push(tf1); println!("{}: TF1={:.1}% ({} ms)", label, tf1 * 100.0, elapsed_ms); } // All orientations should produce consistent quality (within 20% of each other) let min_tf1 = tf1_scores.iter().cloned().fold(f64::INFINITY, f64::min); let max_tf1 = tf1_scores.iter().cloned().fold(f64::NEG_INFINITY, f64::max); println!( "TF1 range: {:.1}% - {:.1}% (spread: {:.1}%)", min_tf1 * 100.0, max_tf1 * 100.0, (max_tf1 - min_tf1) * 100.0 ); // Auto-rotate should make all orientations achieve at least 40% TF1 assert!( min_tf1 > 0.4, "Worst orientation TF1 too low: {:.1}% (expected >40% with auto_rotate)", min_tf1 * 100.0 
); // Spread should be <30% — auto_rotate should normalize quality across orientations assert!( max_tf1 - min_tf1 < 0.3, "TF1 spread too large: {:.1}% (expected <30% with auto_rotate)", (max_tf1 - min_tf1) * 100.0 ); } /// Test that mobile tier model download caches correctly. #[tokio::test] #[ignore = "requires network access"] async fn test_mobile_tier_model_cache() { let cache_dir = test_cache_dir(); let manager = ModelManager::new(cache_dir.clone()); // Download mobile det model let det_result = manager.ensure_v2_det_model("mobile"); assert!(det_result.is_ok(), "Mobile det download failed: {:?}", det_result.err()); let det_dir = det_result.unwrap(); assert!(det_dir.join("model.onnx").exists(), "Mobile det model not cached"); // Mobile det should be ~4.7MB (much smaller than server ~88MB) let det_size = std::fs::metadata(det_dir.join("model.onnx")).unwrap().len(); assert!( det_size < 10_000_000, "Mobile det model too large: {} bytes (expected <10MB)", det_size ); println!( "Mobile det model size: {} bytes ({:.1} MB)", det_size, det_size as f64 / 1_048_576.0 ); // Download en_mobile rec model let rec_result = manager.resolve_rec_model("english", "mobile"); assert!(rec_result.is_ok(), "Mobile rec download failed: {:?}", rec_result.err()); let rec = rec_result.unwrap(); assert!(rec.model_dir.join("model.onnx").exists(), "Mobile rec model not cached"); assert!(rec.dict_file.exists(), "Mobile rec dict not cached"); let rec_size = std::fs::metadata(rec.model_dir.join("model.onnx")).unwrap().len(); assert!( rec_size < 20_000_000, "Mobile rec model too large: {} bytes (expected <20MB)", rec_size ); println!( "Mobile rec model size: {} bytes ({:.1} MB)", rec_size, rec_size as f64 / 1_048_576.0 ); println!("Mobile rec model key: {}", rec.model_key); } /// Test that server and mobile tiers produce different model paths. 
#[tokio::test] #[ignore = "requires network access"] async fn test_tier_model_differentiation() { let cache_dir = test_cache_dir(); let manager = ModelManager::new(cache_dir); let server_det = manager.ensure_v2_det_model("server").unwrap(); let mobile_det = manager.ensure_v2_det_model("mobile").unwrap(); assert_ne!(server_det, mobile_det, "Server and mobile det paths should differ"); let server_rec = manager.resolve_rec_model("english", "server").unwrap(); let mobile_rec = manager.resolve_rec_model("english", "mobile").unwrap(); assert_ne!( server_rec.model_key, mobile_rec.model_key, "Server and mobile rec model keys should differ" ); println!("Server det: {:?}", server_det); println!("Mobile det: {:?}", mobile_det); println!("Server rec key: {}", server_rec.model_key); println!("Mobile rec key: {}", mobile_rec.model_key); } /// Test default cache directory when no explicit config is set. #[test] fn test_cache_dir_default() { // Save and clear env var to test default behavior let original = std::env::var("KREUZBERG_CACHE_DIR").ok(); // SAFETY: This is a test that manipulates environment variables. // Tests should be run with --test-threads=1 if this causes issues. unsafe { std::env::remove_var("KREUZBERG_CACHE_DIR"); } let config = PaddleOcrConfig::new("en"); let resolved = config.resolve_cache_dir(); // Default should use .kreuzberg/paddle-ocr/ assert!(resolved.to_string_lossy().contains(".kreuzberg")); assert!(resolved.to_string_lossy().contains("paddle-ocr")); // Restore unsafe { if let Some(val) = original { std::env::set_var("KREUZBERG_CACHE_DIR", val); } } } */