Files
fil/crates/kreuzberg/tests/ocr_language_registry.rs

205 lines
6.1 KiB
Rust
Raw Permalink Normal View History

2026-06-01 23:40:55 +02:00
//! TODO: Restored from 245539484 alef-migration cleanup. Currently exercises
//! pub(crate) APIs that the migration deliberately narrowed; gated until
//! either (a) these APIs are re-exposed publicly, or (b) the test is
//! rewritten against the public extraction surface.
#![cfg(any())]
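// Original content is preserved below inside a block comment. Re-enabling it
// requires removing BOTH the file-level `#![cfg(any())]` gate above and the
// surrounding block-comment delimiters; dropping the cfg alone is not enough.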
/*
#![cfg(feature = "ocr")]
//! Integration tests for OCR language registry
//!
//! Tests the language registry functionality across all OCR backends.
use kreuzberg::ocr::LanguageRegistry;
/// Checks the EasyOCR entry of the registry: exactly 83 languages are listed
/// and a sample of common language codes is among them.
#[test]
fn test_registry_provides_easyocr_languages() {
    let registry = LanguageRegistry::new();
    let supported = registry
        .get_supported_languages("easyocr")
        .expect("EasyOCR not found");

    assert_eq!(supported.len(), 83);

    // Spot-check a handful of codes rather than pinning the full list.
    let sample = [
        "en", "fr", "de", "es", "it", "pt", "ja", "ko", "ch_sim", "ch_tra", "ru",
    ];
    for code in sample {
        let present = supported.contains(&code.to_string());
        assert!(present, "Expected language '{}' not found in EasyOCR", code);
    }
}
/// Checks the PaddleOCR entry of the registry: exactly 14 languages are listed
/// and a sample of PaddleOCR-style language names is among them.
#[test]
fn test_registry_provides_paddleocr_languages() {
    let registry = LanguageRegistry::new();
    let supported = registry
        .get_supported_languages("paddleocr")
        .expect("PaddleOCR not found");

    assert_eq!(supported.len(), 14);

    // Spot-check a handful of names rather than pinning the full list.
    let sample = ["en", "ch", "french", "german", "korean", "japan", "arabic"];
    for name in sample {
        let present = supported.contains(&name.to_string());
        assert!(present, "Expected language '{}' not found in PaddleOCR", name);
    }
}
/// Checks the Tesseract entry of the registry: at least 100 languages are
/// listed and a sample of three-letter codes is among them.
#[test]
fn test_registry_provides_tesseract_languages() {
    let registry = LanguageRegistry::new();
    let supported = registry
        .get_supported_languages("tesseract")
        .expect("Tesseract not found");

    // Only a lower bound is asserted for Tesseract, not an exact count.
    let total = supported.len();
    assert!(total >= 100, "Tesseract should support 100+ languages");

    let sample = [
        "eng", "fra", "deu", "spa", "ita", "por", "jpn", "kor", "chi_sim", "chi_tra", "rus",
    ];
    for code in sample {
        let present = supported.contains(&code.to_string());
        assert!(present, "Expected language '{}' not found in Tesseract", code);
    }
}
/// Exercises `is_language_supported` with a table of
/// (backend, language, expected answer) triples covering each backend, a
/// rejected code per backend, and an unknown backend.
#[test]
fn test_language_support_checking() {
    let registry = LanguageRegistry::new();

    // Rows are evaluated top to bottom, in the same order as the checks
    // they replace.
    let cases = [
        ("easyocr", "en", true),
        ("easyocr", "fr", true),
        ("easyocr", "invalid_lang", false),
        ("paddleocr", "en", true),
        ("paddleocr", "ch", true),
        ("paddleocr", "en_US", false),
        ("tesseract", "eng", true),
        ("tesseract", "fra", true),
        ("tesseract", "en", false),
        ("invalid_backend", "en", false),
    ];

    for (backend, language, expected) in cases {
        let actual = registry.is_language_supported(backend, language);
        assert_eq!(
            actual, expected,
            "is_language_supported({:?}, {:?})",
            backend, language
        );
    }
}
/// Checks that `get_backends` reports exactly three backends and that each of
/// the expected names is present.
#[test]
fn test_backend_enumeration() {
    let registry = LanguageRegistry::new();
    let backends = registry.get_backends();

    assert_eq!(backends.len(), 3);

    for name in ["easyocr", "paddleocr", "tesseract"] {
        assert!(backends.contains(&name.to_string()));
    }
}
/// Checks `get_language_count` for each backend, plus the expectation that an
/// unknown backend yields a count of zero.
#[test]
fn test_language_count_per_backend() {
    let registry = LanguageRegistry::new();
    let count_of = |backend: &str| registry.get_language_count(backend);

    assert_eq!(count_of("easyocr"), 83);
    assert_eq!(count_of("paddleocr"), 14);
    // Tesseract is only checked against a lower bound.
    assert!(count_of("tesseract") >= 100);
    assert_eq!(count_of("nonexistent"), 0);
}
/// Obtains the global registry twice and checks that both handles report the
/// same language counts for EasyOCR and PaddleOCR.
#[test]
fn test_registry_singleton_behavior() {
    let first = LanguageRegistry::global();
    let second = LanguageRegistry::global();

    for backend in ["easyocr", "paddleocr"] {
        assert_eq!(
            first.get_language_count(backend),
            second.get_language_count(backend)
        );
    }
}
/// Checks that EasyOCR lists the script-specific codes for Chinese
/// (`ch_sim`, `ch_tra`) and the `rs_cyrillic` / `rs_latin` variants.
#[test]
fn test_easyocr_special_languages() {
    let registry = LanguageRegistry::new();
    let supported = registry
        .get_supported_languages("easyocr")
        .expect("Operation failed");

    for code in ["ch_sim", "ch_tra", "rs_cyrillic", "rs_latin"] {
        let present = supported.contains(&code.to_string());
        assert!(present, "EasyOCR should support special language '{}'", code);
    }
}
/// Checks that a cloned registry reports the same EasyOCR language count and
/// the same backend list as the registry it was cloned from.
#[test]
fn test_registry_clone() {
    let original = LanguageRegistry::new();
    let duplicate = original.clone();

    let original_count = original.get_language_count("easyocr");
    let duplicate_count = duplicate.get_language_count("easyocr");
    assert_eq!(original_count, duplicate_count);

    assert_eq!(original.get_backends(), duplicate.get_backends());
}
/// Checks that `LanguageRegistry::default()` and `LanguageRegistry::new()`
/// report the same number of backends.
#[test]
fn test_registry_default() {
    let default_backend_count = LanguageRegistry::default().get_backends().len();
    let new_backend_count = LanguageRegistry::new().get_backends().len();

    assert_eq!(default_backend_count, new_backend_count);
}
/// Builds five independent registries and checks that each one reports the
/// same three backends and the same language counts.
#[test]
fn test_registry_consistency() {
    // (backend, expected count, true when the count is only a lower bound)
    let expectations: [(&str, usize, bool); 3] = [
        ("easyocr", 83, false),
        ("paddleocr", 14, false),
        ("tesseract", 100, true),
    ];

    // All registries are constructed up front, before any of them is checked.
    let registries: Vec<_> = std::iter::repeat_with(LanguageRegistry::new)
        .take(5)
        .collect();

    for registry in &registries {
        let backends = registry.get_backends();
        assert_eq!(backends.len(), 3);

        for (name, _, _) in &expectations {
            assert!(backends.contains(&name.to_string()));
        }

        for &(name, expected, is_lower_bound) in &expectations {
            let count = registry.get_language_count(name);
            if is_lower_bound {
                assert!(count >= expected);
            } else {
                assert_eq!(count, expected);
            }
        }
    }
}
/// Checks that lookups are expected to be case-sensitive for both the
/// language code and the backend name.
#[test]
fn test_language_case_sensitivity() {
    let registry = LanguageRegistry::new();

    // Language code: lower-case is accepted, upper-case is rejected.
    let lower_code = registry.is_language_supported("easyocr", "en");
    assert!(lower_code);
    let upper_code = registry.is_language_supported("easyocr", "EN");
    assert!(!upper_code);

    // Backend name: the lower-case baseline is repeated, then upper-case is
    // rejected.
    let lower_backend = registry.is_language_supported("easyocr", "en");
    assert!(lower_backend);
    let upper_backend = registry.is_language_supported("EASYOCR", "en");
    assert!(!upper_backend);
}
*/