205 lines
6.1 KiB
Rust
205 lines
6.1 KiB
Rust
//! TODO: Restored from 245539484 alef-migration cleanup. Currently exercises
|
|
//! pub(crate) APIs that the migration deliberately narrowed; gated until
|
|
//! either (a) these APIs are re-exposed publicly, or (b) the test is
|
|
//! rewritten against the public extraction surface.
|
|
|
|
#![cfg(any())]
|
|
|
|
// Original content preserved below; recompiled once gating cfg drops.
|
|
// Disabled by the file-level cfg(any()) above.
|
|
|
|
/*
|
|
#![cfg(feature = "ocr")]
|
|
//! Integration tests for OCR language registry
|
|
//!
|
|
//! Tests the language registry functionality across all OCR backends.
|
|
|
|
use kreuzberg::ocr::LanguageRegistry;
|
|
|
|
#[test]
|
|
fn test_registry_provides_easyocr_languages() {
|
|
let registry = LanguageRegistry::new();
|
|
let languages = registry.get_supported_languages("easyocr").expect("EasyOCR not found");
|
|
|
|
assert_eq!(languages.len(), 83);
|
|
|
|
let expected = vec!["en", "fr", "de", "es", "it", "pt", "ja", "ko", "ch_sim", "ch_tra", "ru"];
|
|
for lang in expected {
|
|
assert!(
|
|
languages.contains(&lang.to_string()),
|
|
"Expected language '{}' not found in EasyOCR",
|
|
lang
|
|
);
|
|
}
|
|
}
|
|
|
|
#[test]
|
|
fn test_registry_provides_paddleocr_languages() {
|
|
let registry = LanguageRegistry::new();
|
|
let languages = registry
|
|
.get_supported_languages("paddleocr")
|
|
.expect("PaddleOCR not found");
|
|
|
|
assert_eq!(languages.len(), 14);
|
|
|
|
let expected = vec!["en", "ch", "french", "german", "korean", "japan", "arabic"];
|
|
for lang in expected {
|
|
assert!(
|
|
languages.contains(&lang.to_string()),
|
|
"Expected language '{}' not found in PaddleOCR",
|
|
lang
|
|
);
|
|
}
|
|
}
|
|
|
|
#[test]
|
|
fn test_registry_provides_tesseract_languages() {
|
|
let registry = LanguageRegistry::new();
|
|
let languages = registry
|
|
.get_supported_languages("tesseract")
|
|
.expect("Tesseract not found");
|
|
|
|
assert!(languages.len() >= 100, "Tesseract should support 100+ languages");
|
|
|
|
let expected = vec![
|
|
"eng", "fra", "deu", "spa", "ita", "por", "jpn", "kor", "chi_sim", "chi_tra", "rus",
|
|
];
|
|
for lang in expected {
|
|
assert!(
|
|
languages.contains(&lang.to_string()),
|
|
"Expected language '{}' not found in Tesseract",
|
|
lang
|
|
);
|
|
}
|
|
}
|
|
|
|
#[test]
|
|
fn test_language_support_checking() {
|
|
let registry = LanguageRegistry::new();
|
|
|
|
assert!(registry.is_language_supported("easyocr", "en"));
|
|
assert!(registry.is_language_supported("easyocr", "fr"));
|
|
assert!(!registry.is_language_supported("easyocr", "invalid_lang"));
|
|
|
|
assert!(registry.is_language_supported("paddleocr", "en"));
|
|
assert!(registry.is_language_supported("paddleocr", "ch"));
|
|
assert!(!registry.is_language_supported("paddleocr", "en_US"));
|
|
|
|
assert!(registry.is_language_supported("tesseract", "eng"));
|
|
assert!(registry.is_language_supported("tesseract", "fra"));
|
|
assert!(!registry.is_language_supported("tesseract", "en"));
|
|
|
|
assert!(!registry.is_language_supported("invalid_backend", "en"));
|
|
}
|
|
|
|
#[test]
|
|
fn test_backend_enumeration() {
|
|
let registry = LanguageRegistry::new();
|
|
let backends = registry.get_backends();
|
|
|
|
assert_eq!(backends.len(), 3);
|
|
assert!(backends.contains(&"easyocr".to_string()));
|
|
assert!(backends.contains(&"paddleocr".to_string()));
|
|
assert!(backends.contains(&"tesseract".to_string()));
|
|
}
|
|
|
|
#[test]
|
|
fn test_language_count_per_backend() {
|
|
let registry = LanguageRegistry::new();
|
|
|
|
assert_eq!(registry.get_language_count("easyocr"), 83);
|
|
assert_eq!(registry.get_language_count("paddleocr"), 14);
|
|
assert!(registry.get_language_count("tesseract") >= 100);
|
|
assert_eq!(registry.get_language_count("nonexistent"), 0);
|
|
}
|
|
|
|
#[test]
|
|
fn test_registry_singleton_behavior() {
|
|
let global1 = LanguageRegistry::global();
|
|
let global2 = LanguageRegistry::global();
|
|
|
|
assert_eq!(
|
|
global1.get_language_count("easyocr"),
|
|
global2.get_language_count("easyocr")
|
|
);
|
|
assert_eq!(
|
|
global1.get_language_count("paddleocr"),
|
|
global2.get_language_count("paddleocr")
|
|
);
|
|
}
|
|
|
|
#[test]
|
|
fn test_easyocr_special_languages() {
|
|
let registry = LanguageRegistry::new();
|
|
let languages = registry.get_supported_languages("easyocr").expect("Operation failed");
|
|
|
|
let special_langs = vec!["ch_sim", "ch_tra", "rs_cyrillic", "rs_latin"];
|
|
|
|
for lang in special_langs {
|
|
assert!(
|
|
languages.contains(&lang.to_string()),
|
|
"EasyOCR should support special language '{}'",
|
|
lang
|
|
);
|
|
}
|
|
}
|
|
|
|
#[test]
|
|
fn test_registry_clone() {
|
|
let registry1 = LanguageRegistry::new();
|
|
let registry2 = registry1.clone();
|
|
|
|
assert_eq!(
|
|
registry1.get_language_count("easyocr"),
|
|
registry2.get_language_count("easyocr")
|
|
);
|
|
assert_eq!(registry1.get_backends(), registry2.get_backends());
|
|
}
|
|
|
|
#[test]
|
|
fn test_registry_default() {
|
|
let registry_default = LanguageRegistry::default();
|
|
let registry_new = LanguageRegistry::new();
|
|
|
|
assert_eq!(registry_default.get_backends().len(), registry_new.get_backends().len());
|
|
}
|
|
|
|
#[test]
|
|
fn test_registry_consistency() {
|
|
let registries: Vec<_> = (0..5).map(|_| LanguageRegistry::new()).collect();
|
|
|
|
let expected_backends = vec!["easyocr", "paddleocr", "tesseract"];
|
|
let expected_counts = vec![("easyocr", 83), ("paddleocr", 14), ("tesseract", 100_usize)];
|
|
|
|
for registry in ®istries {
|
|
let backends = registry.get_backends();
|
|
assert_eq!(backends.len(), 3);
|
|
|
|
for expected_backend in &expected_backends {
|
|
assert!(backends.contains(&expected_backend.to_string()));
|
|
}
|
|
|
|
for (backend, min_count) in &expected_counts {
|
|
let count = registry.get_language_count(backend);
|
|
if backend == &"tesseract" {
|
|
assert!(count >= *min_count);
|
|
} else {
|
|
assert_eq!(count, *min_count);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
#[test]
|
|
fn test_language_case_sensitivity() {
|
|
let registry = LanguageRegistry::new();
|
|
|
|
assert!(registry.is_language_supported("easyocr", "en"));
|
|
assert!(!registry.is_language_supported("easyocr", "EN"));
|
|
|
|
assert!(registry.is_language_supported("easyocr", "en"));
|
|
assert!(!registry.is_language_supported("EASYOCR", "en"));
|
|
}
|
|
|
|
*/
|