Files
fil/crates/kreuzberg/tests/paddle_ocr_multilang.rs

625 lines
25 KiB
Rust
Raw Permalink Normal View History

2026-06-01 23:40:55 +02:00
//! TODO: Restored from 245539484 alef-migration cleanup. Currently exercises
//! pub(crate) APIs that the migration deliberately narrowed; gated until
//! either (a) these APIs are re-exposed publicly, or (b) the test is
//! rewritten against the public extraction surface.
#![cfg(any())]
// Original content preserved below inside a block comment. Re-enabling the suite
// requires removing both the file-level `cfg(any())` gate above and the
// block-comment delimiters that wrap the original content.
/*
//! Comprehensive tests for PaddleOCR multi-language support.
//!
//! This test suite verifies the multi-language model infrastructure WITHOUT requiring
//! network access or ONNX runtime. Tests focus on language mapping, model management,
//! configuration, and backend initialization.
//!
//! Run with: `cargo test -p kreuzberg --features paddle-ocr --test paddle_ocr_multilang`
#![cfg(feature = "paddle-ocr")]
use kreuzberg::core::config::OcrConfig;
use kreuzberg::paddle_ocr::{
ModelManager, PaddleLanguage, PaddleOcrBackend, PaddleOcrConfig, SUPPORTED_LANGUAGES, language_to_script_family,
map_language_code,
};
use kreuzberg::plugins::{OcrBackend, Plugin};
use std::path::PathBuf;
use tempfile::TempDir;
// ============================================================================
// Language Mapping Tests (non-ignored, no network needed)
// ============================================================================
/// Checks that every native PaddleOCR language code resolves to the expected
/// recognition script family.
#[test]
fn test_language_to_script_family_all_paddle_codes() {
    // (PaddleOCR language code, expected PP-OCRv5 script family)
    let expectations = [
        ("en", "english"),
        ("ch", "chinese"),
        ("japan", "chinese"),
        ("chinese_cht", "chinese"),
        ("korean", "korean"),
        ("latin", "latin"),
        ("french", "latin"),
        ("german", "latin"),
        ("cyrillic", "eslav"),
        ("thai", "thai"),
        ("greek", "greek"),
        ("arabic", "arabic"),
        ("devanagari", "devanagari"),
        ("tamil", "tamil"),
        ("telugu", "telugu"),
    ];

    for (code, family) in expectations {
        assert_eq!(
            language_to_script_family(code),
            family,
            "unexpected script family for PaddleOCR code {:?}",
            code
        );
    }
}
/// Verifies the two-step resolution for Tesseract-style codes:
/// `map_language_code` yields the PaddleOCR code, and that PaddleOCR code in
/// turn yields the expected script family.
#[test]
fn test_language_to_script_family_tesseract_codes() {
    // (Tesseract code, PaddleOCR code, script family)
    let chain = [
        ("eng", "en", "english"),
        ("fra", "french", "latin"),
        ("deu", "german", "latin"),
        ("chi_sim", "ch", "chinese"),
        ("jpn", "japan", "chinese"),
        ("kor", "korean", "korean"),
        ("tha", "thai", "thai"),
        ("ell", "greek", "greek"),
        ("rus", "cyrillic", "eslav"),
        ("ara", "arabic", "arabic"),
        ("hin", "devanagari", "devanagari"),
        ("tam", "tamil", "tamil"),
        ("tel", "telugu", "telugu"),
    ];

    for (tesseract, paddle, family) in chain {
        // Step 1: Tesseract code -> PaddleOCR code.
        assert_eq!(
            map_language_code(tesseract),
            Some(paddle),
            "unexpected PaddleOCR code for Tesseract code {:?}",
            tesseract
        );
        // Step 2: PaddleOCR code -> script family.
        assert_eq!(
            language_to_script_family(paddle),
            family,
            "unexpected script family for PaddleOCR code {:?}",
            paddle
        );
    }
}
/// Verifies that two-letter ISO 639-1 codes are mapped to PaddleOCR codes, and
/// that those PaddleOCR codes resolve to the expected script families.
#[test]
fn test_language_to_script_family_iso639_codes() {
    // (ISO 639-1 code, PaddleOCR code, script family)
    let table = [
        ("en", "en", "english"),
        ("fr", "french", "latin"),
        ("de", "german", "latin"),
        ("zh", "ch", "chinese"),
        ("ja", "japan", "chinese"),
        ("ko", "korean", "korean"),
        ("th", "thai", "thai"),
        ("el", "greek", "greek"),
        ("ru", "cyrillic", "eslav"),
        ("ar", "arabic", "arabic"),
        ("hi", "devanagari", "devanagari"),
        ("ta", "tamil", "tamil"),
        ("te", "telugu", "telugu"),
    ];

    // First pass: every ISO code maps to its PaddleOCR code.
    for &(iso, paddle, _) in &table {
        assert_eq!(
            map_language_code(iso),
            Some(paddle),
            "unexpected PaddleOCR code for ISO 639-1 code {:?}",
            iso
        );
    }

    // Second pass: every mapped PaddleOCR code lands in the right family.
    for &(_, paddle, family) in &table {
        assert_eq!(
            language_to_script_family(paddle),
            family,
            "unexpected script family for PaddleOCR code {:?}",
            paddle
        );
    }
}
/// Verifies that unrecognised codes (including the empty string) resolve to the
/// "english" script family.
#[test]
fn test_language_to_script_family_unknown_fallback() {
    let unrecognised = ["xyz", "unknown", "invalid", "", "klingon"];

    for code in unrecognised {
        assert_eq!(
            language_to_script_family(code),
            "english",
            "expected the english fallback for unrecognised code {:?}",
            code
        );
    }
}
/// Verifies that `map_language_code` collapses the accepted spellings of each
/// language onto a single PaddleOCR code, and returns `None` for codes it does
/// not recognise.
#[test]
fn test_map_language_code_normalization() {
    // (canonical PaddleOCR code, accepted input spellings)
    let groups: &[(&str, &[&str])] = &[
        // English variants
        ("en", &["en", "eng", "english"]),
        // Chinese variants
        ("ch", &["ch", "chi_sim", "zho", "zh", "chinese"]),
        // Traditional Chinese
        ("chinese_cht", &["chi_tra", "zh_tw", "zh_hant"]),
        // Japanese
        ("japan", &["ja", "jpn", "japanese"]),
        // Korean
        ("korean", &["ko", "kor", "korean"]),
        // French
        ("french", &["fr", "fra", "french"]),
        // German
        ("german", &["de", "deu", "german"]),
        // Thai
        ("thai", &["th", "tha", "thai"]),
        // Greek
        ("greek", &["el", "ell", "greek"]),
        // Russian and other Cyrillic
        ("cyrillic", &["ru", "rus", "russian", "uk", "ukr", "ukrainian"]),
        // Latin script languages (should map to "latin")
        (
            "latin",
            &[
                "es",
                "spa",
                "spanish",
                "it",
                "ita",
                "italian",
                "pt",
                "por",
                "portuguese",
            ],
        ),
        // Arabic variants
        ("arabic", &["ar", "ara", "arabic", "fa", "persian", "ur", "urdu"]),
        // Devanagari variants
        (
            "devanagari",
            &[
                "hi", "hin", "hindi", "mr", "marathi", "sa", "sanskrit", "ne", "nepali",
            ],
        ),
        // Tamil variants
        ("tamil", &["ta", "tam", "tamil"]),
        // Telugu variants
        ("telugu", &["te", "tel", "telugu"]),
    ];

    for &(canonical, spellings) in groups {
        for &spelling in spellings {
            assert_eq!(
                map_language_code(spelling),
                Some(canonical),
                "unexpected PaddleOCR code for input {:?}",
                spelling
            );
        }
    }

    // Unknown codes should return None
    for unknown in ["xyz", "unknown", "invalid"] {
        assert_eq!(
            map_language_code(unknown),
            None,
            "expected no mapping for input {:?}",
            unknown
        );
    }
}
/// Verifies that `SUPPORTED_LANGUAGES` has exactly 15 entries and that each of
/// the expected PaddleOCR language codes is among them.
#[test]
fn test_supported_languages_list() {
    let expected_codes = [
        "ch",
        "en",
        "french",
        "german",
        "korean",
        "japan",
        "chinese_cht",
        "latin",
        "cyrillic",
        "thai",
        "greek",
        "arabic",
        "devanagari",
        "tamil",
        "telugu",
    ];

    // The length check runs first so a count mismatch is reported before any
    // individual membership failure.
    assert_eq!(SUPPORTED_LANGUAGES.len(), 15);

    for code in expected_codes {
        assert!(
            SUPPORTED_LANGUAGES.contains(&code),
            "SUPPORTED_LANGUAGES is missing {:?}",
            code
        );
    }
}
// ============================================================================
// Model Manager Tests (non-ignored, no network needed)
// ============================================================================
/// Verifies that a `ModelManager` reports back the cache directory it was
/// constructed with.
#[test]
fn test_model_manager_cache_dir_creation() {
    let scratch = TempDir::new().unwrap();
    let expected_dir = scratch.path().join("paddle-cache");

    let manager = ModelManager::new(expected_dir.clone());

    assert_eq!(manager.cache_dir(), &expected_dir);
}
/// Checks the expected on-disk layout `<cache>/rec/<family>` for each script
/// family.
///
/// Note: this test never calls into the `ModelManager` it constructs; it only
/// creates the directories itself and checks the resulting paths.
#[test]
fn test_model_manager_rec_family_path() {
    let scratch = TempDir::new().unwrap();
    let _manager = ModelManager::new(scratch.path().to_path_buf());

    // The 11 PP-OCRv5 script families.
    let script_families = [
        "english",
        "chinese",
        "latin",
        "korean",
        "eslav",
        "thai",
        "greek",
        "arabic",
        "devanagari",
        "tamil",
        "telugu",
    ];

    for family in script_families {
        let family_dir = scratch.path().join("rec").join(family);

        // Nothing has been cached yet, so the directory must be absent.
        assert!(!family_dir.exists(), "Path should not exist yet for {}", family);

        // Create it by hand and confirm it now exists with the expected suffix.
        std::fs::create_dir_all(&family_dir).unwrap();
        assert!(family_dir.exists(), "Path should exist for {}", family);
        assert!(family_dir.ends_with(format!("rec/{}", family)));
    }
}
/// Verifies that a `ModelManager` pointed at an empty directory reports no
/// cached models: neither the shared models, nor any of the checked
/// recognition families, nor the overall set.
#[test]
fn test_model_manager_empty_cache_not_cached() {
    let scratch = TempDir::new().unwrap();
    let manager = ModelManager::new(scratch.path().to_path_buf());

    assert!(!manager.are_shared_models_cached());

    for family in ["english", "chinese", "latin", "korean", "thai", "greek"] {
        assert!(
            !manager.is_rec_model_cached(family),
            "rec model for {:?} should not be cached in an empty directory",
            family
        );
    }

    assert!(!manager.are_models_cached());
}
// ============================================================================
// Config Tests (non-ignored)
// ============================================================================
/// Verifies that each expected `PaddleLanguage` variant exists (the table below
/// fails to compile otherwise) and that `code()` returns the expected string.
#[test]
fn test_paddle_language_enum_variants() {
    // (variant, expected value of `code()`)
    let variants = [
        (PaddleLanguage::English, "en"),
        (PaddleLanguage::Chinese, "ch"),
        (PaddleLanguage::Japanese, "jpn"),
        (PaddleLanguage::Korean, "kor"),
        (PaddleLanguage::German, "deu"),
        (PaddleLanguage::French, "fra"),
        (PaddleLanguage::Latin, "latin"),
        (PaddleLanguage::Cyrillic, "cyrillic"),
        (PaddleLanguage::TraditionalChinese, "chinese_cht"),
        (PaddleLanguage::Thai, "thai"),
        (PaddleLanguage::Greek, "greek"),
        (PaddleLanguage::EastSlavic, "eslav"),
    ];

    for (variant, expected_code) in variants {
        assert_eq!(
            variant.code(),
            expected_code,
            "unexpected code for the variant expected to report {:?}",
            expected_code
        );
    }
}
/// Verifies that `PaddleOcrConfig::new` stores the language it is given,
/// unchanged, in the `language` field.
#[test]
fn test_paddle_ocr_config_new_with_language() {
    for language in ["en", "ch", "thai", "greek", "cyrillic"] {
        let config = PaddleOcrConfig::new(language);
        assert_eq!(config.language, language);
    }
}
/// Verifies that `with_cache_dir` records the given directory in `cache_dir`
/// and that `resolve_cache_dir` then returns that same directory.
#[test]
fn test_paddle_ocr_config_with_cache_dir() {
    let custom_dir = PathBuf::from("/custom/cache/dir");

    let config = PaddleOcrConfig::new("en").with_cache_dir(custom_dir.clone());

    assert_eq!(config.cache_dir.as_ref(), Some(&custom_dir));
    assert_eq!(config.resolve_cache_dir(), custom_dir);
}
// ============================================================================
// Backend Tests (non-ignored)
// ============================================================================
/// Verifies that a default `PaddleOcrBackend` can be constructed and that it
/// reports the plugin name "paddle-ocr".
#[test]
fn test_paddle_backend_creation() {
    let backend = PaddleOcrBackend::new().expect("Backend creation should succeed");

    assert_eq!(backend.name(), "paddle-ocr");
}
/// Verifies that the backend reports support for each native PaddleOCR
/// language code listed below.
#[test]
fn test_paddle_backend_supports_language_expanded() {
    let backend = PaddleOcrBackend::new().unwrap();

    // Direct PaddleOCR codes
    let paddle_codes = [
        "en",
        "ch",
        "japan",
        "korean",
        "french",
        "german",
        "latin",
        "cyrillic",
        "thai",
        "greek",
        "chinese_cht",
    ];

    for code in paddle_codes {
        assert!(
            backend.supports_language(code),
            "backend should support PaddleOCR code {:?}",
            code
        );
    }
}
/// Verifies that the backend reports support for Tesseract-style three-letter
/// codes and ISO 639-1 two-letter codes, not just native PaddleOCR codes.
#[test]
fn test_paddle_backend_supports_tesseract_mapped_codes() {
    let backend = PaddleOcrBackend::new().unwrap();

    // Tesseract codes, in order mapping to:
    // en, ch, japan, korean, french, german, thai, greek, cyrillic
    let tesseract_codes = [
        "eng", "chi_sim", "jpn", "kor", "fra", "deu", "tha", "ell", "rus",
    ];
    for code in tesseract_codes {
        assert!(
            backend.supports_language(code),
            "backend should support Tesseract code {:?}",
            code
        );
    }

    // ISO 639-1 codes
    let iso_codes = ["en", "zh", "ja", "ko", "fr", "de", "th", "el", "ru"];
    for code in iso_codes {
        assert!(
            backend.supports_language(code),
            "backend should support ISO 639-1 code {:?}",
            code
        );
    }
}
// ============================================================================
// Integration Tests (ignored, require network + ONNX)
// ============================================================================
/// Runs OCR on a Chinese sample image with the backend configured for "ch" and
/// checks that the extracted text is non-empty and contains CJK ideographs.
///
/// Ignored by default: requires network access (to download the Chinese
/// recognition model) and ONNX Runtime. Returns early without failing when the
/// sample image is not present.
#[tokio::test]
#[ignore = "requires network access and ONNX Runtime"]
async fn test_ocr_chinese_with_chinese_model() {
    /// True for characters in the CJK Unified Ideographs, Extension A, or
    /// Compatibility Ideographs blocks.
    fn is_cjk_ideograph(c: char) -> bool {
        ('\u{4E00}'..='\u{9FFF}').contains(&c) // CJK Unified Ideographs
            || ('\u{3400}'..='\u{4DBF}').contains(&c) // CJK Extension A
            || ('\u{F900}'..='\u{FAFF}').contains(&c) // CJK Compatibility Ideographs
    }

    // `test_documents` lives two levels above this crate's manifest directory.
    let manifest_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
    let image_path = manifest_dir
        .ancestors()
        .nth(2)
        .unwrap()
        .join("test_documents")
        .join("images/chi_sim_image.jpeg");

    if !image_path.exists() {
        eprintln!("Skipping test: Chinese test image not found at {:?}", image_path);
        return;
    }
    let image_bytes = std::fs::read(&image_path).expect("Failed to read Chinese test image");

    let backend = PaddleOcrBackend::with_config(
        PaddleOcrConfig::new("ch")
            .with_cache_dir(std::env::temp_dir().join("kreuzberg_paddle_multilang_test")),
    )
    .expect("Failed to create backend");

    let ocr_config = OcrConfig {
        backend: "paddle-ocr".to_string(),
        language: "ch".to_string(),
        ..Default::default()
    };

    let result = backend.process_image(&image_bytes, &ocr_config).await;
    assert!(result.is_ok(), "Chinese OCR failed: {:?}", result.err());
    let extraction = result.unwrap();
    println!("Chinese OCR result: {}", extraction.content);

    // The Chinese model should produce some text at all...
    assert!(
        !extraction.content.is_empty(),
        "Expected non-empty result from Chinese OCR"
    );

    // ...and that text should include at least one CJK ideograph.
    let has_chinese_chars = extraction.content.chars().any(is_cjk_ideograph);
    assert!(
        has_chinese_chars,
        "Expected Chinese characters in OCR result, got: {}",
        extraction.content
    );
}
/// Runs English and Chinese OCR concurrently against one shared backend and
/// checks that every spawned task succeeds with non-empty text.
///
/// One task is spawned per sample image that exists on disk; the final
/// assertion still requires at least two completed tasks, so the test fails if
/// either sample image is missing.
///
/// Ignored by default: requires network access and ONNX Runtime.
/// NOTE(review): the original doc says each language uses its own engine from
/// a pool; that is backend behaviour not visible from this test — confirm.
#[tokio::test]
#[ignore = "requires network access and ONNX Runtime"]
async fn test_ocr_concurrent_different_languages() {
    use tokio::task::JoinSet;

    // Upper bound on how many characters of each result are echoed to stdout.
    const PREVIEW_CHARS: usize = 100;

    let test_documents_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
        .parent()
        .unwrap()
        .parent()
        .unwrap()
        .join("test_documents");
    let cache_dir = std::env::temp_dir().join("kreuzberg_paddle_multilang_concurrent");
    let backend = std::sync::Arc::new(
        PaddleOcrBackend::with_config(PaddleOcrConfig::new("en").with_cache_dir(cache_dir))
            .expect("Failed to create backend"),
    );

    // (language code, image path relative to `test_documents`, read-failure message)
    let jobs = [
        ("en", "images/test_hello_world.png", "Failed to read English image"),
        ("ch", "images/chi_sim_image.jpeg", "Failed to read Chinese image"),
    ];

    let mut tasks = JoinSet::new();
    for (lang, relative_image, read_failure) in jobs {
        let image_path = test_documents_dir.join(relative_image);
        if !image_path.exists() {
            // No task is spawned for a missing image; the count check below
            // will then fail the test.
            continue;
        }
        let image_bytes = std::fs::read(&image_path).expect(read_failure);
        let backend_clone = backend.clone();
        tasks.spawn(async move {
            let config = OcrConfig {
                backend: "paddle-ocr".to_string(),
                language: lang.to_string(),
                ..Default::default()
            };
            let result = backend_clone.process_image(&image_bytes, &config).await;
            (lang, result)
        });
    }

    // Wait for all tasks and verify results
    let mut results_count = 0;
    while let Some(joined) = tasks.join_next().await {
        let (lang, ocr_result) = joined.expect("Task panicked");
        assert!(
            ocr_result.is_ok(),
            "OCR failed for language {}: {:?}",
            lang,
            ocr_result.err()
        );
        let extraction = ocr_result.unwrap();

        // Build the preview by characters, not bytes: slicing a `str` at a fixed
        // byte offset panics when the offset falls inside a multi-byte UTF-8
        // sequence, which is the common case for the Chinese result.
        let preview: String = extraction.content.chars().take(PREVIEW_CHARS).collect();
        println!("OCR result for {}: {}", lang, preview);

        assert!(
            !extraction.content.is_empty(),
            "Expected non-empty result for language {}",
            lang
        );
        results_count += 1;
    }

    // Should have completed both tasks
    assert!(results_count >= 2, "Expected at least 2 OCR tasks to complete");
    println!("Successfully completed {} concurrent OCR tasks", results_count);
}
*/