This commit is contained in:
427
crates/kreuzberg/tests/config_loading_tests.rs
Normal file
427
crates/kreuzberg/tests/config_loading_tests.rs
Normal file
@@ -0,0 +1,427 @@
|
||||
//! Configuration loading integration tests.
|
||||
//!
|
||||
//! Tests the config loading APIs:
|
||||
//! - from_file() with TOML/YAML/JSON
|
||||
//! - discover() for searching parent directories
|
||||
//! - Error handling for invalid configs
|
||||
|
||||
use kreuzberg::KreuzbergError;
|
||||
use kreuzberg::core::config::ExtractionConfig;
|
||||
use std::fs;
|
||||
use tempfile::TempDir;
|
||||
|
||||
/// Verifies that `ExtractionConfig::from_file` accepts a `.toml` file.
///
/// A document with `[ocr]` and `[chunking]` tables is written into a temporary
/// directory and loaded. Both sections must be present afterwards, and the
/// `max_chars` / `max_overlap` keys from the file must be reflected in the
/// `max_characters` / `overlap` fields of the chunking section.
#[test]
fn test_from_file_toml_succeeds() {
    let document = r#"
[ocr]
enabled = true
backend = "tesseract"

[chunking]
max_chars = 1000
max_overlap = 100
"#;

    let workspace = TempDir::new().expect("Operation failed");
    let toml_path = workspace.path().join("config.toml");
    fs::write(&toml_path, document).expect("Operation failed");

    let outcome = ExtractionConfig::from_file(&toml_path);
    assert!(outcome.is_ok(), "Should load TOML config successfully");

    let loaded = outcome.expect("Operation failed");
    assert!(loaded.ocr.is_some(), "Should have OCR config");
    assert!(loaded.chunking.is_some(), "Should have chunking config");

    let chunk_settings = loaded.chunking.expect("Operation failed");
    assert_eq!(chunk_settings.max_characters, 1000);
    assert_eq!(chunk_settings.overlap, 100);
}
|
||||
|
||||
/// Test loading config from YAML file.
///
/// Writes a YAML document with nested `ocr` and `chunking` mappings into a
/// temporary directory, loads it through `ExtractionConfig::from_file`, and
/// checks that both sections are present and that the chunking values match
/// what was written.
#[test]
fn test_from_file_yaml_succeeds() {
    let temp_dir = TempDir::new().expect("Operation failed");
    let config_path = temp_dir.path().join("config.yaml");

    // Indentation is significant in YAML: the settings must be nested under
    // `ocr` / `chunking`, otherwise they become unrelated top-level keys and
    // both sections deserialize as empty.
    let yaml_content = r#"
ocr:
  enabled: true
  backend: tesseract
chunking:
  max_characters: 1000
  overlap: 100
"#;

    fs::write(&config_path, yaml_content).expect("Operation failed");

    let config = ExtractionConfig::from_file(&config_path);
    assert!(config.is_ok(), "Should load YAML config successfully");

    let config = config.expect("Operation failed");
    assert!(config.ocr.is_some(), "Should have OCR config");
    assert!(config.chunking.is_some(), "Should have chunking config");

    let chunking = config.chunking.expect("Operation failed");
    assert_eq!(chunking.max_characters, 1000);
    assert_eq!(chunking.overlap, 100);
}
|
||||
|
||||
/// Verifies that `ExtractionConfig::from_file` accepts a `.json` file.
///
/// The JSON object carries `ocr` and `chunking` members; after loading, both
/// sections must be present and the `max_chars` / `max_overlap` members must
/// be reflected in `max_characters` / `overlap`.
#[test]
fn test_from_file_json_succeeds() {
    let document = r#"
{
"ocr": {
"enabled": true,
"backend": "tesseract"
},
"chunking": {
"max_chars": 1000,
"max_overlap": 100
}
}
"#;

    let workspace = TempDir::new().expect("Operation failed");
    let json_path = workspace.path().join("config.json");
    fs::write(&json_path, document).expect("Operation failed");

    let outcome = ExtractionConfig::from_file(&json_path);
    assert!(outcome.is_ok(), "Should load JSON config successfully");

    let loaded = outcome.expect("Operation failed");
    assert!(loaded.ocr.is_some(), "Should have OCR config");
    assert!(loaded.chunking.is_some(), "Should have chunking config");

    let chunk_settings = loaded.chunking.expect("Operation failed");
    assert_eq!(chunk_settings.max_characters, 1000);
    assert_eq!(chunk_settings.overlap, 100);
}
|
||||
|
||||
/// Test loading config from .yml extension.
///
/// Writes a minimal YAML document to a file named `config.yml` and asserts
/// that `ExtractionConfig::from_file` returns `Ok` for it.
#[test]
fn test_from_file_yml_extension_succeeds() {
    let temp_dir = TempDir::new().expect("Operation failed");
    let config_path = temp_dir.path().join("config.yml");

    // `enabled` must be indented so that it is a child of `ocr` rather than a
    // separate top-level key.
    let yml_content = r#"
ocr:
  enabled: true
"#;

    fs::write(&config_path, yml_content).expect("Operation failed");

    let config = ExtractionConfig::from_file(&config_path);
    assert!(config.is_ok(), "Should load .yml config successfully");
}
|
||||
|
||||
/// Ensures `ExtractionConfig::from_file` returns an error for a path that
/// does not exist on disk.
#[test]
fn test_from_file_nonexistent_path_fails() {
    let outcome = ExtractionConfig::from_file("/nonexistent/path/config.toml");

    assert!(outcome.is_err(), "Should fail for nonexistent path: {:?}", outcome);
}
|
||||
|
||||
/// Ensures a syntactically broken TOML file is rejected.
///
/// The document opens a table header (`[ocr`) without closing it, and
/// `ExtractionConfig::from_file` is expected to return an error.
#[test]
fn test_from_file_malformed_toml_fails() {
    // Missing `]` on the table header.
    let broken_document = r#"
[ocr
enabled = true
"#;

    let workspace = TempDir::new().expect("Operation failed");
    let toml_path = workspace.path().join("config.toml");
    fs::write(&toml_path, broken_document).expect("Operation failed");

    let outcome = ExtractionConfig::from_file(&toml_path);
    assert!(outcome.is_err(), "Should fail for malformed TOML: {:?}", outcome);
}
|
||||
|
||||
/// Ensures a syntactically broken JSON file is rejected.
///
/// The object omits the comma between its `ocr` and `chunking` members, and
/// `ExtractionConfig::from_file` is expected to return an error.
#[test]
fn test_from_file_malformed_json_fails() {
    // No `,` after the closing brace of the "ocr" member.
    let broken_document = r#"
{
"ocr": {
"enabled": true
}
"chunking": {}
}
"#;

    let workspace = TempDir::new().expect("Operation failed");
    let json_path = workspace.path().join("config.json");
    fs::write(&json_path, broken_document).expect("Operation failed");

    let outcome = ExtractionConfig::from_file(&json_path);
    assert!(outcome.is_err(), "Should fail for malformed JSON: {:?}", outcome);
}
|
||||
|
||||
/// Ensures a structurally invalid YAML file is rejected.
///
/// The document mixes mapping entries with a stray list item, and
/// `ExtractionConfig::from_file` is expected to return an error.
#[test]
fn test_from_file_malformed_yaml_fails() {
    // Deliberately invalid: a list item appears where a mapping key is expected.
    let broken_document = r#"
ocr:
enabled: true
- invalid_list
"#;

    let workspace = TempDir::new().expect("Operation failed");
    let yaml_path = workspace.path().join("config.yaml");
    fs::write(&yaml_path, broken_document).expect("Operation failed");

    let outcome = ExtractionConfig::from_file(&yaml_path);
    assert!(outcome.is_err(), "Should fail for malformed YAML: {:?}", outcome);
}
|
||||
|
||||
/// Checks that an empty `.toml` file loads successfully.
///
/// After loading, neither the `ocr` nor the `chunking` section may be set.
#[test]
fn test_from_file_empty_file_uses_defaults() {
    let workspace = TempDir::new().expect("Operation failed");
    let empty_path = workspace.path().join("config.toml");
    fs::write(&empty_path, "").expect("Operation failed");

    let outcome = ExtractionConfig::from_file(&empty_path);
    assert!(outcome.is_ok(), "Should load empty file successfully");

    let defaults = outcome.expect("Operation failed");
    assert!(defaults.ocr.is_none(), "Default config should have no OCR");
    assert!(defaults.chunking.is_none(), "Default config should have no chunking");
}
|
||||
|
||||
/// Ensures a file with an unrecognised extension (`.txt`) is rejected.
///
/// When the error is a `KreuzbergError::Validation`, its message must mention
/// "format", "extension" or "Unsupported". Errors of any other variant are
/// accepted without inspecting their message.
#[test]
fn test_from_file_unsupported_extension_fails() {
    let workspace = TempDir::new().expect("Operation failed");
    let txt_path = workspace.path().join("config.txt");
    fs::write(&txt_path, "ocr:\n enabled: true").expect("Operation failed");

    let outcome = ExtractionConfig::from_file(&txt_path);
    assert!(outcome.is_err(), "Should fail for unsupported extension: {:?}", outcome);

    if let Err(KreuzbergError::Validation { message, .. }) = outcome {
        let mentions_cause = ["format", "extension", "Unsupported"]
            .iter()
            .any(|needle| message.contains(*needle));
        assert!(mentions_cause, "Error should mention format/extension: {}", message);
    }
}
|
||||
|
||||
/// Checks that `ExtractionConfig::discover` picks up a `kreuzberg.toml`
/// located in the process's current working directory.
///
/// The test switches the working directory to a temporary directory holding
/// the config and switches back before any assertion runs. It is marked
/// `serial` alongside the other tests that change the working directory.
#[test]
#[serial_test::serial]
fn test_discover_finds_config_in_current_dir() {
    let workspace = TempDir::new().expect("Operation failed");
    fs::write(
        workspace.path().join("kreuzberg.toml"),
        "\n[ocr]\nenabled = true\n",
    )
    .expect("Operation failed");

    let previous_cwd = std::env::current_dir().expect("Operation failed");
    std::env::set_current_dir(workspace.path()).expect("Operation failed");
    let discovered = ExtractionConfig::discover();
    std::env::set_current_dir(previous_cwd).expect("Operation failed");

    assert!(discovered.is_ok(), "Discover should succeed");
    let found = discovered.expect("Operation failed");
    assert!(found.is_some(), "Should find config in current directory");
    assert!(
        found.expect("Operation failed").ocr.is_some(),
        "Should have OCR config"
    );
}
|
||||
|
||||
/// Checks that `ExtractionConfig::discover` finds a `kreuzberg.toml` that
/// lives one level above the current working directory.
///
/// The config is written to the temporary directory's root while the working
/// directory is set to its `subdir` child. The working directory is switched
/// back before any assertion runs.
#[test]
#[serial_test::serial]
fn test_discover_finds_config_in_parent_dir() {
    let workspace = TempDir::new().expect("Operation failed");
    fs::write(
        workspace.path().join("kreuzberg.toml"),
        "\n[ocr]\nenabled = true\n",
    )
    .expect("Operation failed");

    let child_dir = workspace.path().join("subdir");
    fs::create_dir(&child_dir).expect("Operation failed");

    let previous_cwd = std::env::current_dir().expect("Operation failed");
    std::env::set_current_dir(&child_dir).expect("Operation failed");
    let discovered = ExtractionConfig::discover();
    std::env::set_current_dir(previous_cwd).expect("Operation failed");

    assert!(discovered.is_ok(), "Discover should succeed");
    let found = discovered.expect("Operation failed");
    assert!(found.is_some(), "Should find config in parent directory");
    assert!(
        found.expect("Operation failed").ocr.is_some(),
        "Should have OCR config"
    );
}
|
||||
|
||||
/// Runs `ExtractionConfig::discover` from a fresh subdirectory into which the
/// test has written no config file.
///
/// Only the `Ok` outcome is asserted; the discovered value is unwrapped but
/// not inspected.
///
/// NOTE(review): the test name implies the value should be `None`, but that is
/// never asserted — presumably because an ancestor of the temporary directory
/// could contain a config file; confirm whether an `is_none()` check is wanted.
#[test]
#[serial_test::serial]
fn test_discover_returns_none_when_not_found() {
    let workspace = TempDir::new().expect("Operation failed");
    let child_dir = workspace.path().join("subdir");
    fs::create_dir(&child_dir).expect("Operation failed");

    let previous_cwd = std::env::current_dir().expect("Operation failed");
    std::env::set_current_dir(&child_dir).expect("Operation failed");
    let discovered = ExtractionConfig::discover();
    std::env::set_current_dir(previous_cwd).expect("Operation failed");

    assert!(discovered.is_ok(), "Discover should succeed even when no config found");
    let _found = discovered.expect("Operation failed");
}
|
||||
|
||||
/// Test discover() when several candidate file names coexist.
///
/// Both `kreuzberg.toml` and `.kreuzberg.toml` are written to the same
/// directory, and `ExtractionConfig::discover` is run from there. The test
/// asserts that discovery succeeds and yields *a* config; it does not check
/// which of the two files was chosen.
///
/// Failing to enter or leave the temporary directory fails the test, matching
/// the other discover tests in this file, instead of passing vacuously.
#[test]
#[serial_test::serial]
fn test_discover_file_name_preference() {
    let temp_dir = TempDir::new().expect("Operation failed");

    fs::write(temp_dir.path().join("kreuzberg.toml"), "[ocr]\nenabled = true").expect("Operation failed");
    fs::write(temp_dir.path().join(".kreuzberg.toml"), "[ocr]\nenabled = false").expect("Operation failed");

    let original_dir = std::env::current_dir().expect("Operation failed");
    // A silent `return` here would report success without exercising discover().
    std::env::set_current_dir(temp_dir.path()).expect("Failed to enter temp directory");

    let result = ExtractionConfig::discover();

    // The temp directory is deleted when `temp_dir` drops, so a failed restore
    // must not go unnoticed: later tests would run from a removed directory.
    std::env::set_current_dir(original_dir).expect("Failed to restore working directory");

    assert!(result.is_ok(), "Discover should succeed");
    let config = result.expect("Operation failed");
    assert!(config.is_some(), "Should find a config file");
}
|
||||
|
||||
/// Test discover() with nested directories.
///
/// A `kreuzberg.toml` is written to the temporary directory's root and
/// `ExtractionConfig::discover` is run from `level1/level2/level3` beneath it.
/// The config must be found and must contain an OCR section.
///
/// Failing to enter or leave the nested directory fails the test, matching the
/// other discover tests in this file, instead of passing vacuously.
#[test]
#[serial_test::serial]
fn test_discover_with_nested_directories() {
    let temp_dir = TempDir::new().expect("Operation failed");
    let config_path = temp_dir.path().join("kreuzberg.toml");

    let toml_content = r#"
[ocr]
enabled = true
"#;

    fs::write(&config_path, toml_content).expect("Operation failed");

    let level1 = temp_dir.path().join("level1");
    let level2 = level1.join("level2");
    let level3 = level2.join("level3");
    fs::create_dir_all(&level3).expect("Operation failed");

    let original_dir = std::env::current_dir().expect("Operation failed");
    // A silent `return` here would report success without exercising discover().
    std::env::set_current_dir(&level3).expect("Failed to enter nested directory");

    let result = ExtractionConfig::discover();

    // The temp directory is deleted when `temp_dir` drops, so a failed restore
    // must not go unnoticed: later tests would run from a removed directory.
    std::env::set_current_dir(&original_dir).expect("Failed to restore working directory");

    assert!(result.is_ok(), "Discover should succeed");
    let config = result.expect("Operation failed");
    assert!(config.is_some(), "Should find config in ancestor directory");
    assert!(
        config.expect("Operation failed").ocr.is_some(),
        "Should have OCR config"
    );
}
|
||||
|
||||
/// Test config loading with all supported features.
///
/// Loads a TOML document containing `[ocr]`, `[chunking]`,
/// `[language_detection]` and `[images]` tables and asserts that each
/// corresponding section is present. When the `pdf` feature is enabled, a
/// `[pdf_options]` table is appended to the document and `pdf_options` is
/// asserted as well.
#[test]
fn test_from_file_comprehensive_config() {
    let temp_dir = TempDir::new().expect("Operation failed");
    let config_path = temp_dir.path().join("config.toml");

    let base_content = r#"
[ocr]
enabled = true
backend = "tesseract"

[chunking]
max_chars = 2000
max_overlap = 200

[language_detection]
enabled = true

[images]
extract_images = true
"#;

    // Pick the document per feature with immutable bindings: a `let mut`
    // that is only mutated under `pdf` trips `unused_mut` when the feature
    // is disabled.
    #[cfg(feature = "pdf")]
    let toml_content = format!("{base_content}\n[pdf_options]\nextract_images = true\n");
    #[cfg(not(feature = "pdf"))]
    let toml_content = base_content;

    fs::write(&config_path, toml_content).expect("Operation failed");

    let config = ExtractionConfig::from_file(&config_path);
    assert!(config.is_ok(), "Should load comprehensive config successfully");

    let config = config.expect("Operation failed");
    assert!(config.ocr.is_some(), "Should have OCR config");
    assert!(config.chunking.is_some(), "Should have chunking config");
    assert!(
        config.language_detection.is_some(),
        "Should have language detection config"
    );
    assert!(config.images.is_some(), "Should have image extraction config");
    #[cfg(feature = "pdf")]
    assert!(config.pdf_options.is_some(), "Should have PDF config");
}
|
||||
|
||||
/// Exercises loading a config whose chunking values are negative.
///
/// The test tolerates two outcomes: loading may fail, or it may succeed
/// without a chunking section, in which case nothing is asserted. Only when a
/// chunking section is actually produced must its `max_characters` be greater
/// than zero.
#[test]
fn test_from_file_with_invalid_values() {
    let negative_chunking = r#"
[chunking]
max_chars = -1000
max_overlap = -100
"#;

    let workspace = TempDir::new().expect("Operation failed");
    let toml_path = workspace.path().join("config.toml");
    fs::write(&toml_path, negative_chunking).expect("Operation failed");

    let loaded_chunking = ExtractionConfig::from_file(&toml_path)
        .ok()
        .and_then(|config| config.chunking);

    if let Some(chunking) = loaded_chunking {
        assert!(chunking.max_characters > 0, "max_characters should be positive");
    }
}
|
||||
Reference in New Issue
Block a user