625 lines
25 KiB
Rust
625 lines
25 KiB
Rust
|
|
//! TODO: Restored from 245539484 alef-migration cleanup. Currently exercises
//! pub(crate) APIs that the migration deliberately narrowed; gated until
//! either (a) these APIs are re-exposed publicly, or (b) the test is
//! rewritten against the public extraction surface.
|
||
|
|
|
||
|
|
#![cfg(any())]
|
||
|
|
|
||
|
|
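// The original test suite is preserved verbatim inside the block comment below.
// It is disabled twice over: by the file-level `#![cfg(any())]` above and by the
// block comment itself. Both must be removed before the tests compile again.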
|
||
|
|
|
||
|
|
/*
|
||
|
|
//! Comprehensive tests for PaddleOCR multi-language support.
//!
//! This test suite verifies the multi-language model infrastructure WITHOUT requiring
//! network access or ONNX runtime. Tests focus on language mapping, model management,
//! configuration, and backend initialization.
//!
//! Run with: `cargo test -p kreuzberg --features paddle-ocr --test paddle_ocr_multilang`
|
||
|
|
|
||
|
|
#![cfg(feature = "paddle-ocr")]
|
||
|
|
|
||
|
|
use kreuzberg::core::config::OcrConfig;
|
||
|
|
use kreuzberg::paddle_ocr::{
|
||
|
|
ModelManager, PaddleLanguage, PaddleOcrBackend, PaddleOcrConfig, SUPPORTED_LANGUAGES, language_to_script_family,
|
||
|
|
map_language_code,
|
||
|
|
};
|
||
|
|
use kreuzberg::plugins::{OcrBackend, Plugin};
|
||
|
|
use std::path::PathBuf;
|
||
|
|
use tempfile::TempDir;
|
||
|
|
|
||
|
|
// ============================================================================
|
||
|
|
// Language Mapping Tests (non-ignored, no network needed)
|
||
|
|
// ============================================================================
|
||
|
|
|
||
|
|
/// Checks the script family reported for each native PaddleOCR language code.
///
/// The 15 codes listed here resolve to the 11 PP-OCRv5 script families.
#[test]
fn test_language_to_script_family_all_paddle_codes() {
    // (PaddleOCR language code, expected script family)
    let expectations = [
        ("en", "english"),
        ("ch", "chinese"),
        ("japan", "chinese"),
        ("chinese_cht", "chinese"),
        ("korean", "korean"),
        ("latin", "latin"),
        ("french", "latin"),
        ("german", "latin"),
        ("cyrillic", "eslav"),
        ("thai", "thai"),
        ("greek", "greek"),
        ("arabic", "arabic"),
        ("devanagari", "devanagari"),
        ("tamil", "tamil"),
        ("telugu", "telugu"),
    ];

    for (code, family) in expectations {
        assert_eq!(
            language_to_script_family(code),
            family,
            "script family for {:?}",
            code
        );
    }
}
|
||
|
|
|
||
|
|
/// Checks the two-step resolution of Tesseract-style codes.
///
/// Each Tesseract code is first normalized with `map_language_code`, and the
/// resulting PaddleOCR code is then resolved with `language_to_script_family`.
#[test]
fn test_language_to_script_family_tesseract_codes() {
    // (Tesseract code, PaddleOCR code, script family)
    let chain = [
        ("eng", "en", "english"),
        ("fra", "french", "latin"),
        ("deu", "german", "latin"),
        ("chi_sim", "ch", "chinese"),
        ("jpn", "japan", "chinese"),
        ("kor", "korean", "korean"),
        ("tha", "thai", "thai"),
        ("ell", "greek", "greek"),
        ("rus", "cyrillic", "eslav"),
        ("ara", "arabic", "arabic"),
        ("hin", "devanagari", "devanagari"),
        ("tam", "tamil", "tamil"),
        ("tel", "telugu", "telugu"),
    ];

    for (tesseract, paddle, family) in chain {
        assert_eq!(
            map_language_code(tesseract),
            Some(paddle),
            "PaddleOCR code for {:?}",
            tesseract
        );
        assert_eq!(
            language_to_script_family(paddle),
            family,
            "script family for {:?}",
            paddle
        );
    }
}
|
||
|
|
|
||
|
|
/// Checks that two-letter ISO 639-1 codes normalize to the expected PaddleOCR
/// codes, and that those PaddleOCR codes resolve to the expected script families.
#[test]
fn test_language_to_script_family_iso639_codes() {
    // (ISO 639-1 code, PaddleOCR code, script family)
    let chain = [
        ("en", "en", "english"),
        ("fr", "french", "latin"),
        ("de", "german", "latin"),
        ("zh", "ch", "chinese"),
        ("ja", "japan", "chinese"),
        ("ko", "korean", "korean"),
        ("th", "thai", "thai"),
        ("el", "greek", "greek"),
        ("ru", "cyrillic", "eslav"),
        ("ar", "arabic", "arabic"),
        ("hi", "devanagari", "devanagari"),
        ("ta", "tamil", "tamil"),
        ("te", "telugu", "telugu"),
    ];

    // First pass: ISO 639-1 code -> PaddleOCR code.
    for (iso, paddle, _) in chain {
        assert_eq!(
            map_language_code(iso),
            Some(paddle),
            "PaddleOCR code for {:?}",
            iso
        );
    }

    // Second pass: PaddleOCR code -> script family.
    for (_, paddle, family) in chain {
        assert_eq!(
            language_to_script_family(paddle),
            family,
            "script family for {:?}",
            paddle
        );
    }
}
|
||
|
|
|
||
|
|
/// Checks that unrecognized language codes, including the empty string,
/// fall back to the "english" script family.
#[test]
fn test_language_to_script_family_unknown_fallback() {
    let unrecognized = ["xyz", "unknown", "invalid", "", "klingon"];

    for code in unrecognized {
        assert_eq!(
            language_to_script_family(code),
            "english",
            "fallback family for {:?}",
            code
        );
    }
}
|
||
|
|
|
||
|
|
/// Checks that `map_language_code` collapses the known spellings of a language
/// (PaddleOCR code, ISO 639 codes, Tesseract codes, English names) onto a single
/// canonical PaddleOCR code, and returns `None` for unrecognized input.
#[test]
fn test_map_language_code_normalization() {
    // (canonical PaddleOCR code, every spelling expected to normalize to it)
    let groups: [(&str, &[&str]); 15] = [
        // English variants
        ("en", &["en", "eng", "english"]),
        // Chinese variants
        ("ch", &["ch", "chi_sim", "zho", "zh", "chinese"]),
        // Traditional Chinese
        ("chinese_cht", &["chi_tra", "zh_tw", "zh_hant"]),
        // Japanese
        ("japan", &["ja", "jpn", "japanese"]),
        // Korean
        ("korean", &["ko", "kor", "korean"]),
        // French
        ("french", &["fr", "fra", "french"]),
        // German
        ("german", &["de", "deu", "german"]),
        // Thai
        ("thai", &["th", "tha", "thai"]),
        // Greek
        ("greek", &["el", "ell", "greek"]),
        // Russian and other Cyrillic
        ("cyrillic", &["ru", "rus", "russian", "uk", "ukr", "ukrainian"]),
        // Latin script languages (should map to "latin")
        (
            "latin",
            &["es", "spa", "spanish", "it", "ita", "italian", "pt", "por", "portuguese"],
        ),
        // Arabic variants
        ("arabic", &["ar", "ara", "arabic", "fa", "persian", "ur", "urdu"]),
        // Devanagari variants
        (
            "devanagari",
            &["hi", "hin", "hindi", "mr", "marathi", "sa", "sanskrit", "ne", "nepali"],
        ),
        // Tamil variants
        ("tamil", &["ta", "tam", "tamil"]),
        // Telugu variants
        ("telugu", &["te", "tel", "telugu"]),
    ];

    for (canonical, spellings) in groups {
        for &spelling in spellings {
            assert_eq!(
                map_language_code(spelling),
                Some(canonical),
                "canonical code for {:?}",
                spelling
            );
        }
    }

    // Unknown codes should return None
    for unrecognized in ["xyz", "unknown", "invalid"] {
        assert_eq!(
            map_language_code(unrecognized),
            None,
            "expected no mapping for {:?}",
            unrecognized
        );
    }
}
|
||
|
|
|
||
|
|
/// Checks the size of `SUPPORTED_LANGUAGES` and that every expected PaddleOCR
/// language code is a member.
#[test]
fn test_supported_languages_list() {
    // 15 language codes in total (covering 11 script families).
    assert_eq!(SUPPORTED_LANGUAGES.len(), 15);

    let required = [
        "ch",
        "en",
        "french",
        "german",
        "korean",
        "japan",
        "chinese_cht",
        "latin",
        "cyrillic",
        "thai",
        "greek",
        "arabic",
        "devanagari",
        "tamil",
        "telugu",
    ];

    for code in required {
        assert!(
            SUPPORTED_LANGUAGES.contains(&code),
            "SUPPORTED_LANGUAGES is missing {:?}",
            code
        );
    }
}
|
||
|
|
|
||
|
|
// ============================================================================
|
||
|
|
// Model Manager Tests (non-ignored, no network needed)
|
||
|
|
// ============================================================================
|
||
|
|
|
||
|
|
/// Checks that `ModelManager::cache_dir` reports the directory the manager
/// was constructed with.
#[test]
fn test_model_manager_cache_dir_creation() {
    let scratch = TempDir::new().unwrap();
    let expected_dir = scratch.path().join("paddle-cache");

    let manager = ModelManager::new(expected_dir.clone());

    assert_eq!(manager.cache_dir(), &expected_dir);
}
|
||
|
|
|
||
|
|
/// Exercises the `rec/<family>` cache directory layout for every script family.
///
/// `rec_family_path` is private, so it is never called here. The test builds
/// each expected directory by hand under a temporary cache root and checks its shape.
#[test]
fn test_model_manager_rec_family_path() {
    let cache_root = TempDir::new().unwrap();
    // Constructed on the temporary root but not queried by this test.
    let _manager = ModelManager::new(cache_root.path().to_path_buf());

    // All 11 PP-OCRv5 script families.
    for family in [
        "english",
        "chinese",
        "latin",
        "korean",
        "eslav",
        "thai",
        "greek",
        "arabic",
        "devanagari",
        "tamil",
        "telugu",
    ] {
        let family_dir = cache_root.path().join("rec").join(family);

        // No models are cached yet, so the directory must be absent at first.
        assert!(!family_dir.exists(), "Path should not exist yet for {}", family);

        // Create the directory manually and confirm it has the expected shape.
        std::fs::create_dir_all(&family_dir).unwrap();
        assert!(family_dir.exists(), "Path should exist for {}", family);
        assert!(family_dir.ends_with(format!("rec/{}", family)));
    }
}
|
||
|
|
|
||
|
|
/// Checks that a `ModelManager` pointed at an empty directory reports
/// nothing as cached.
#[test]
fn test_model_manager_empty_cache_not_cached() {
    let empty_root = TempDir::new().unwrap();
    let manager = ModelManager::new(empty_root.path().to_path_buf());

    assert!(!manager.are_shared_models_cached());

    for family in ["english", "chinese", "latin", "korean", "thai", "greek"] {
        assert!(
            !manager.is_rec_model_cached(family),
            "rec model for {:?} should not be cached",
            family
        );
    }

    assert!(!manager.are_models_cached());
}
|
||
|
|
|
||
|
|
// ============================================================================
|
||
|
|
// Config Tests (non-ignored)
|
||
|
|
// ============================================================================
|
||
|
|
|
||
|
|
/// Checks that every documented `PaddleLanguage` variant exists and that
/// `code()` returns the expected string for it.
#[test]
fn test_paddle_language_enum_variants() {
    // Naming each variant here doubles as the existence check: the table fails
    // to compile if a variant is removed or renamed.
    let variants = [
        (PaddleLanguage::English, "en"),
        (PaddleLanguage::Chinese, "ch"),
        (PaddleLanguage::Japanese, "jpn"),
        (PaddleLanguage::Korean, "kor"),
        (PaddleLanguage::German, "deu"),
        (PaddleLanguage::French, "fra"),
        (PaddleLanguage::Latin, "latin"),
        (PaddleLanguage::Cyrillic, "cyrillic"),
        (PaddleLanguage::TraditionalChinese, "chinese_cht"),
        (PaddleLanguage::Thai, "thai"),
        (PaddleLanguage::Greek, "greek"),
        (PaddleLanguage::EastSlavic, "eslav"),
    ];

    for (language, expected_code) in variants {
        assert_eq!(language.code(), expected_code);
    }
}
|
||
|
|
|
||
|
|
/// Checks that `PaddleOcrConfig::new` stores the language code it is given
/// in the `language` field.
#[test]
fn test_paddle_ocr_config_new_with_language() {
    for code in ["en", "ch", "thai", "greek", "cyrillic"] {
        let config = PaddleOcrConfig::new(code);
        assert_eq!(config.language, code);
    }
}
|
||
|
|
|
||
|
|
/// Checks that `with_cache_dir` records the given directory and that
/// `resolve_cache_dir` then returns that same directory.
#[test]
fn test_paddle_ocr_config_with_cache_dir() {
    let expected_dir = PathBuf::from("/custom/cache/dir");
    let config = PaddleOcrConfig::new("en").with_cache_dir(expected_dir.clone());

    assert_eq!(config.cache_dir.as_deref(), Some(expected_dir.as_path()));
    assert_eq!(config.resolve_cache_dir(), expected_dir);
}
|
||
|
|
|
||
|
|
// ============================================================================
|
||
|
|
// Backend Tests (non-ignored)
|
||
|
|
// ============================================================================
|
||
|
|
|
||
|
|
/// Checks that `PaddleOcrBackend::new()` succeeds and that the backend
/// identifies itself as "paddle-ocr".
#[test]
fn test_paddle_backend_creation() {
    let backend = match PaddleOcrBackend::new() {
        Ok(backend) => backend,
        Err(_) => panic!("Backend creation should succeed"),
    };

    assert_eq!(backend.name(), "paddle-ocr");
}
|
||
|
|
|
||
|
|
/// Checks that `supports_language` accepts each native PaddleOCR language code.
#[test]
fn test_paddle_backend_supports_language_expanded() {
    let backend = PaddleOcrBackend::new().unwrap();

    // Direct PaddleOCR codes
    let paddle_codes = [
        "en",
        "ch",
        "japan",
        "korean",
        "french",
        "german",
        "latin",
        "cyrillic",
        "thai",
        "greek",
        "chinese_cht",
    ];

    for code in paddle_codes {
        assert!(
            backend.supports_language(code),
            "backend should support {:?}",
            code
        );
    }
}
|
||
|
|
|
||
|
|
/// Checks that `supports_language` accepts codes that are not native PaddleOCR
/// codes: Tesseract-style three-letter codes and two-letter ISO 639-1 codes.
#[test]
fn test_paddle_backend_supports_tesseract_mapped_codes() {
    let backend = PaddleOcrBackend::new().unwrap();

    // Tesseract codes that map to PaddleOCR codes
    let tesseract_codes = [
        "eng",     // → en
        "chi_sim", // → ch
        "jpn",     // → japan
        "kor",     // → korean
        "fra",     // → french
        "deu",     // → german
        "tha",     // → thai
        "ell",     // → greek
        "rus",     // → cyrillic
    ];
    for code in tesseract_codes {
        assert!(
            backend.supports_language(code),
            "backend should support {:?}",
            code
        );
    }

    // ISO 639-1 codes
    let iso_codes = ["en", "zh", "ja", "ko", "fr", "de", "th", "el", "ru"];
    for code in iso_codes {
        assert!(
            backend.supports_language(code),
            "backend should support {:?}",
            code
        );
    }
}
|
||
|
|
|
||
|
|
// ============================================================================
|
||
|
|
// Integration Tests (ignored, require network + ONNX)
|
||
|
|
// ============================================================================
|
||
|
|
|
||
|
|
/// Test OCR with Chinese language and Chinese model.
|
||
|
|
///
|
||
|
|
/// This test verifies that the Chinese recognition model can process Chinese text.
|
||
|
|
/// Requires network access to download the Chinese rec model and ONNX Runtime.
|
||
|
|
#[tokio::test]
|
||
|
|
#[ignore = "requires network access and ONNX Runtime"]
|
||
|
|
async fn test_ocr_chinese_with_chinese_model() {
|
||
|
|
let test_documents_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
|
||
|
|
.parent()
|
||
|
|
.unwrap()
|
||
|
|
.parent()
|
||
|
|
.unwrap()
|
||
|
|
.join("test_documents");
|
||
|
|
|
||
|
|
let image_path = test_documents_dir.join("images/chi_sim_image.jpeg");
|
||
|
|
if !image_path.exists() {
|
||
|
|
eprintln!("Skipping test: Chinese test image not found at {:?}", image_path);
|
||
|
|
return;
|
||
|
|
}
|
||
|
|
|
||
|
|
let image_bytes = std::fs::read(&image_path).expect("Failed to read Chinese test image");
|
||
|
|
|
||
|
|
let cache_dir = std::env::temp_dir().join("kreuzberg_paddle_multilang_test");
|
||
|
|
let config = PaddleOcrConfig::new("ch").with_cache_dir(cache_dir);
|
||
|
|
let backend = PaddleOcrBackend::with_config(config).expect("Failed to create backend");
|
||
|
|
|
||
|
|
let ocr_config = OcrConfig {
|
||
|
|
backend: "paddle-ocr".to_string(),
|
||
|
|
language: "ch".to_string(),
|
||
|
|
..Default::default()
|
||
|
|
};
|
||
|
|
|
||
|
|
let result = backend.process_image(&image_bytes, &ocr_config).await;
|
||
|
|
assert!(result.is_ok(), "Chinese OCR failed: {:?}", result.err());
|
||
|
|
|
||
|
|
let extraction = result.unwrap();
|
||
|
|
println!("Chinese OCR result: {}", extraction.content);
|
||
|
|
|
||
|
|
// The result should contain Chinese characters (not empty after using Chinese model)
|
||
|
|
assert!(
|
||
|
|
!extraction.content.is_empty(),
|
||
|
|
"Expected non-empty result from Chinese OCR"
|
||
|
|
);
|
||
|
|
|
||
|
|
// Verify Chinese characters are present (Unicode range for CJK)
|
||
|
|
let has_chinese_chars = extraction.content.chars().any(|c| {
|
||
|
|
matches!(c,
|
||
|
|
'\u{4E00}'..='\u{9FFF}' | // CJK Unified Ideographs
|
||
|
|
'\u{3400}'..='\u{4DBF}' | // CJK Extension A
|
||
|
|
'\u{F900}'..='\u{FAFF}' // CJK Compatibility Ideographs
|
||
|
|
)
|
||
|
|
});
|
||
|
|
|
||
|
|
assert!(
|
||
|
|
has_chinese_chars,
|
||
|
|
"Expected Chinese characters in OCR result, got: {}",
|
||
|
|
extraction.content
|
||
|
|
);
|
||
|
|
}
|
||
|
|
|
||
|
|
/// Test concurrent OCR on different languages using the engine pool.
|
||
|
|
///
|
||
|
|
/// This test verifies that the backend can handle multiple concurrent OCR requests
|
||
|
|
/// with different languages without blocking. Each language should use its own
|
||
|
|
/// engine from the pool.
|
||
|
|
#[tokio::test]
|
||
|
|
#[ignore = "requires network access and ONNX Runtime"]
|
||
|
|
async fn test_ocr_concurrent_different_languages() {
|
||
|
|
use tokio::task::JoinSet;
|
||
|
|
|
||
|
|
let test_documents_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
|
||
|
|
.parent()
|
||
|
|
.unwrap()
|
||
|
|
.parent()
|
||
|
|
.unwrap()
|
||
|
|
.join("test_documents");
|
||
|
|
|
||
|
|
let cache_dir = std::env::temp_dir().join("kreuzberg_paddle_multilang_concurrent");
|
||
|
|
let backend = std::sync::Arc::new(
|
||
|
|
PaddleOcrBackend::with_config(PaddleOcrConfig::new("en").with_cache_dir(cache_dir))
|
||
|
|
.expect("Failed to create backend"),
|
||
|
|
);
|
||
|
|
|
||
|
|
let mut tasks = JoinSet::new();
|
||
|
|
|
||
|
|
// English OCR task
|
||
|
|
{
|
||
|
|
let backend_clone = backend.clone();
|
||
|
|
let image_path = test_documents_dir.join("images/test_hello_world.png");
|
||
|
|
if image_path.exists() {
|
||
|
|
let image_bytes = std::fs::read(&image_path).expect("Failed to read English image");
|
||
|
|
tasks.spawn(async move {
|
||
|
|
let config = OcrConfig {
|
||
|
|
backend: "paddle-ocr".to_string(),
|
||
|
|
language: "en".to_string(),
|
||
|
|
..Default::default()
|
||
|
|
};
|
||
|
|
let result = backend_clone.process_image(&image_bytes, &config).await;
|
||
|
|
("en", result)
|
||
|
|
});
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
// Chinese OCR task
|
||
|
|
{
|
||
|
|
let backend_clone = backend.clone();
|
||
|
|
let image_path = test_documents_dir.join("images/chi_sim_image.jpeg");
|
||
|
|
if image_path.exists() {
|
||
|
|
let image_bytes = std::fs::read(&image_path).expect("Failed to read Chinese image");
|
||
|
|
tasks.spawn(async move {
|
||
|
|
let config = OcrConfig {
|
||
|
|
backend: "paddle-ocr".to_string(),
|
||
|
|
language: "ch".to_string(),
|
||
|
|
..Default::default()
|
||
|
|
};
|
||
|
|
let result = backend_clone.process_image(&image_bytes, &config).await;
|
||
|
|
("ch", result)
|
||
|
|
});
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
// Wait for all tasks and verify results
|
||
|
|
let mut results_count = 0;
|
||
|
|
while let Some(result) = tasks.join_next().await {
|
||
|
|
let (lang, ocr_result) = result.expect("Task panicked");
|
||
|
|
assert!(
|
||
|
|
ocr_result.is_ok(),
|
||
|
|
"OCR failed for language {}: {:?}",
|
||
|
|
lang,
|
||
|
|
ocr_result.err()
|
||
|
|
);
|
||
|
|
|
||
|
|
let extraction = ocr_result.unwrap();
|
||
|
|
println!(
|
||
|
|
"OCR result for {}: {}",
|
||
|
|
lang,
|
||
|
|
&extraction.content[..extraction.content.len().min(100)]
|
||
|
|
);
|
||
|
|
|
||
|
|
assert!(
|
||
|
|
!extraction.content.is_empty(),
|
||
|
|
"Expected non-empty result for language {}",
|
||
|
|
lang
|
||
|
|
);
|
||
|
|
results_count += 1;
|
||
|
|
}
|
||
|
|
|
||
|
|
// Should have completed both tasks
|
||
|
|
assert!(results_count >= 2, "Expected at least 2 OCR tasks to complete");
|
||
|
|
println!("Successfully completed {} concurrent OCR tasks", results_count);
|
||
|
|
}
|
||
|
|
|
||
|
|
*/
|