//! Configuration loading integration tests.
//!
//! Tests the config loading APIs:
//! - `from_file()` with TOML/YAML/JSON
//! - `discover()` for searching parent directories
//! - Error handling for invalid configs
|
||
|
|
|
||
|
|
use kreuzberg::KreuzbergError;
|
||
|
|
use kreuzberg::core::config::ExtractionConfig;
|
||
|
|
use std::fs;
|
||
|
|
use tempfile::TempDir;
|
||
|
|
|
||
|
|
/// Loads a TOML config with `[ocr]` and `[chunking]` tables through
/// `ExtractionConfig::from_file` and checks the parsed chunking values.
///
/// The file uses the `max_chars` / `max_overlap` keys while the assertions read the
/// `max_characters` / `overlap` fields, so the test relies on those keys populating
/// those fields.
#[test]
fn test_from_file_toml_succeeds() {
    let temp_dir = TempDir::new().expect("failed to create temporary directory");
    let config_path = temp_dir.path().join("config.toml");

    let toml_content = r#"
[ocr]
enabled = true
backend = "tesseract"

[chunking]
max_chars = 1000
max_overlap = 100
"#;

    fs::write(&config_path, toml_content).expect("failed to write config.toml");

    // `expect` prints the underlying error on failure, which `assert!(is_ok())` would hide.
    let config = ExtractionConfig::from_file(&config_path).expect("Should load TOML config successfully");
    assert!(config.ocr.is_some(), "Should have OCR config");

    let chunking = config.chunking.expect("Should have chunking config");
    assert_eq!(chunking.max_characters, 1000);
    assert_eq!(chunking.overlap, 100);
}
|
||
|
|
|
||
|
|
/// Loads a YAML config with `ocr` and `chunking` mappings through
/// `ExtractionConfig::from_file` and checks the parsed chunking values.
#[test]
fn test_from_file_yaml_succeeds() {
    let temp_dir = TempDir::new().expect("failed to create temporary directory");
    let config_path = temp_dir.path().join("config.yaml");

    let yaml_content = r#"
ocr:
  enabled: true
  backend: tesseract
chunking:
  max_characters: 1000
  overlap: 100
"#;

    fs::write(&config_path, yaml_content).expect("failed to write config.yaml");

    // `expect` prints the underlying error on failure, which `assert!(is_ok())` would hide.
    let config = ExtractionConfig::from_file(&config_path).expect("Should load YAML config successfully");
    assert!(config.ocr.is_some(), "Should have OCR config");

    let chunking = config.chunking.expect("Should have chunking config");
    assert_eq!(chunking.max_characters, 1000);
    assert_eq!(chunking.overlap, 100);
}
|
||
|
|
|
||
|
|
/// Loads a JSON config with `ocr` and `chunking` objects through
/// `ExtractionConfig::from_file` and checks the parsed chunking values.
///
/// The file uses the `max_chars` / `max_overlap` keys while the assertions read the
/// `max_characters` / `overlap` fields, so the test relies on those keys populating
/// those fields.
#[test]
fn test_from_file_json_succeeds() {
    let temp_dir = TempDir::new().expect("failed to create temporary directory");
    let config_path = temp_dir.path().join("config.json");

    let json_content = r#"
{
  "ocr": {
    "enabled": true,
    "backend": "tesseract"
  },
  "chunking": {
    "max_chars": 1000,
    "max_overlap": 100
  }
}
"#;

    fs::write(&config_path, json_content).expect("failed to write config.json");

    // `expect` prints the underlying error on failure, which `assert!(is_ok())` would hide.
    let config = ExtractionConfig::from_file(&config_path).expect("Should load JSON config successfully");
    assert!(config.ocr.is_some(), "Should have OCR config");

    let chunking = config.chunking.expect("Should have chunking config");
    assert_eq!(chunking.max_characters, 1000);
    assert_eq!(chunking.overlap, 100);
}
|
||
|
|
|
||
|
|
/// Checks that `ExtractionConfig::from_file` accepts a file with the `.yml` extension.
///
/// Only success is asserted; the parsed contents are not inspected.
#[test]
fn test_from_file_yml_extension_succeeds() {
    let temp_dir = TempDir::new().expect("failed to create temporary directory");
    let config_path = temp_dir.path().join("config.yml");

    let yml_content = r#"
ocr:
  enabled: true
"#;

    fs::write(&config_path, yml_content).expect("failed to write config.yml");

    let config = ExtractionConfig::from_file(&config_path);
    // Include the error so a failure shows why loading was rejected.
    assert!(
        config.is_ok(),
        "Should load .yml config successfully: {:?}",
        config.as_ref().err()
    );
}
|
||
|
|
|
||
|
|
/// A path that does not exist must make `ExtractionConfig::from_file` return an error.
#[test]
fn test_from_file_nonexistent_path_fails() {
    let missing_path = "/nonexistent/path/config.toml";
    let outcome = ExtractionConfig::from_file(missing_path);

    assert!(outcome.is_err(), "Should fail for nonexistent path: {:?}", outcome);
}
|
||
|
|
|
||
|
|
/// A TOML file whose table header is missing its closing bracket must be rejected
/// by `ExtractionConfig::from_file`.
#[test]
fn test_from_file_malformed_toml_fails() {
    let scratch = TempDir::new().expect("Operation failed");
    let toml_path = scratch.path().join("config.toml");

    // `[ocr` is intentionally left unterminated.
    let broken_toml = r#"
[ocr
enabled = true
"#;
    fs::write(&toml_path, broken_toml).expect("Operation failed");

    let outcome = ExtractionConfig::from_file(&toml_path);
    assert!(outcome.is_err(), "Should fail for malformed TOML: {:?}", outcome);
}
|
||
|
|
|
||
|
|
/// A JSON document with a missing comma between two members must be rejected
/// by `ExtractionConfig::from_file`.
#[test]
fn test_from_file_malformed_json_fails() {
    let scratch = TempDir::new().expect("Operation failed");
    let json_path = scratch.path().join("config.json");

    // No comma separates the "ocr" and "chunking" members.
    let broken_json = r#"
{
  "ocr": {
    "enabled": true
  }
  "chunking": {}
}
"#;
    fs::write(&json_path, broken_json).expect("Operation failed");

    let outcome = ExtractionConfig::from_file(&json_path);
    assert!(outcome.is_err(), "Should fail for malformed JSON: {:?}", outcome);
}
|
||
|
|
|
||
|
|
/// A YAML document that mixes a mapping entry and a sequence entry under the same key
/// must be rejected by `ExtractionConfig::from_file`.
#[test]
fn test_from_file_malformed_yaml_fails() {
    let scratch = TempDir::new().expect("Operation failed");
    let yaml_path = scratch.path().join("config.yaml");

    // A list item follows a key/value pair inside the same block.
    let broken_yaml = r#"
ocr:
  enabled: true
  - invalid_list
"#;
    fs::write(&yaml_path, broken_yaml).expect("Operation failed");

    let outcome = ExtractionConfig::from_file(&yaml_path);
    assert!(outcome.is_err(), "Should fail for malformed YAML: {:?}", outcome);
}
|
||
|
|
|
||
|
|
/// Loads a zero-byte `config.toml` and checks that the resulting config has neither
/// an OCR section nor a chunking section.
#[test]
fn test_from_file_empty_file_uses_defaults() {
    let temp_dir = TempDir::new().expect("failed to create temporary directory");
    let config_path = temp_dir.path().join("config.toml");

    fs::write(&config_path, "").expect("failed to write empty config.toml");

    // `expect` prints the underlying error on failure, which `assert!(is_ok())` would hide.
    let config = ExtractionConfig::from_file(&config_path).expect("Should load empty file successfully");
    assert!(config.ocr.is_none(), "Default config should have no OCR");
    assert!(config.chunking.is_none(), "Default config should have no chunking");
}
|
||
|
|
|
||
|
|
/// A `.txt` file must be rejected by `ExtractionConfig::from_file`.
///
/// When the error is the `KreuzbergError::Validation` variant, its message must mention
/// "format", "extension" or "Unsupported"; other error variants are accepted as-is.
#[test]
fn test_from_file_unsupported_extension_fails() {
    let scratch = TempDir::new().expect("Operation failed");
    let txt_path = scratch.path().join("config.txt");
    fs::write(&txt_path, "ocr:\n enabled: true").expect("Operation failed");

    let outcome = ExtractionConfig::from_file(&txt_path);
    assert!(outcome.is_err(), "Should fail for unsupported extension: {:?}", outcome);

    if let Err(KreuzbergError::Validation { message, .. }) = outcome {
        let mentions_format = ["format", "extension", "Unsupported"]
            .iter()
            .any(|&keyword| message.contains(keyword));
        assert!(mentions_format, "Error should mention format/extension: {}", message);
    }
}
|
||
|
|
|
||
|
|
/// Checks that `ExtractionConfig::discover()` finds a `kreuzberg.toml` located in the
/// current working directory.
///
/// Runs under `serial_test::serial`; the test changes the process's current directory
/// and restores it afterwards.
#[test]
#[serial_test::serial]
fn test_discover_finds_config_in_current_dir() {
    let temp_dir = TempDir::new().expect("failed to create temporary directory");
    let config_path = temp_dir.path().join("kreuzberg.toml");

    let toml_content = r#"
[ocr]
enabled = true
"#;

    fs::write(&config_path, toml_content).expect("failed to write kreuzberg.toml");

    let original_dir = std::env::current_dir().expect("failed to read current directory");
    std::env::set_current_dir(temp_dir.path()).expect("failed to enter temporary directory");

    let result = ExtractionConfig::discover();

    // Restore the working directory before any assertion on the result can panic.
    std::env::set_current_dir(original_dir).expect("failed to restore original directory");

    // Each `expect` reports the failing step (and the error value, for the `Result`).
    let config = result
        .expect("Discover should succeed")
        .expect("Should find config in current directory");
    assert!(config.ocr.is_some(), "Should have OCR config");
}
|
||
|
|
|
||
|
|
/// Checks that `ExtractionConfig::discover()`, run from a subdirectory, finds the
/// `kreuzberg.toml` stored in that subdirectory's parent.
///
/// Runs under `serial_test::serial`; the test changes the process's current directory
/// and restores it afterwards.
#[test]
#[serial_test::serial]
fn test_discover_finds_config_in_parent_dir() {
    let temp_dir = TempDir::new().expect("failed to create temporary directory");
    let config_path = temp_dir.path().join("kreuzberg.toml");

    let toml_content = r#"
[ocr]
enabled = true
"#;

    fs::write(&config_path, toml_content).expect("failed to write kreuzberg.toml");

    let sub_dir = temp_dir.path().join("subdir");
    fs::create_dir(&sub_dir).expect("failed to create subdirectory");

    let original_dir = std::env::current_dir().expect("failed to read current directory");
    std::env::set_current_dir(&sub_dir).expect("failed to enter subdirectory");

    let result = ExtractionConfig::discover();

    // Restore the working directory before any assertion on the result can panic.
    std::env::set_current_dir(original_dir).expect("failed to restore original directory");

    // Each `expect` reports the failing step (and the error value, for the `Result`).
    let config = result
        .expect("Discover should succeed")
        .expect("Should find config in parent directory");
    assert!(config.ocr.is_some(), "Should have OCR config");
}
|
||
|
|
|
||
|
|
/// Runs `ExtractionConfig::discover()` from a fresh temporary subdirectory into which
/// no config file was written.
///
/// Only the `Ok` outcome is asserted; the returned `Option` is bound but not inspected.
#[test]
#[serial_test::serial]
fn test_discover_returns_none_when_not_found() {
    let scratch = TempDir::new().expect("Operation failed");
    let empty_subdir = scratch.path().join("subdir");
    fs::create_dir(&empty_subdir).expect("Operation failed");

    // Swap the working directory only for the duration of the `discover()` call.
    let previous_cwd = std::env::current_dir().expect("Operation failed");
    std::env::set_current_dir(&empty_subdir).expect("Operation failed");
    let outcome = ExtractionConfig::discover();
    std::env::set_current_dir(previous_cwd).expect("Operation failed");

    assert!(outcome.is_ok(), "Discover should succeed even when no config found");
    let _discovered = outcome.expect("Operation failed");
}
|
||
|
|
|
||
|
|
/// Runs `ExtractionConfig::discover()` in a directory containing both `kreuzberg.toml`
/// and `.kreuzberg.toml`.
///
/// The test asserts only that some config is found; it does not check which of the two
/// files was chosen. If the working directory cannot be changed the test returns early
/// without asserting anything.
#[test]
#[serial_test::serial]
fn test_discover_file_name_preference() {
    let temp_dir = TempDir::new().expect("failed to create temporary directory");

    fs::write(temp_dir.path().join("kreuzberg.toml"), "[ocr]\nenabled = true")
        .expect("failed to write kreuzberg.toml");
    fs::write(temp_dir.path().join(".kreuzberg.toml"), "[ocr]\nenabled = false")
        .expect("failed to write .kreuzberg.toml");

    let original_dir = std::env::current_dir().expect("failed to read current directory");
    if std::env::set_current_dir(temp_dir.path()).is_err() {
        return;
    }

    let result = ExtractionConfig::discover();

    // Best-effort restore, performed before any assertion on the result can panic.
    let _ = std::env::set_current_dir(original_dir);

    // `expect` prints the underlying error on failure, which `assert!(is_ok())` would hide.
    let config = result.expect("Discover should succeed");
    assert!(config.is_some(), "Should find a config file");
}
|
||
|
|
|
||
|
|
/// Checks that `ExtractionConfig::discover()`, run from `level1/level2/level3`, finds the
/// `kreuzberg.toml` stored three directories above.
///
/// If the working directory cannot be changed the test returns early without asserting
/// anything.
#[test]
#[serial_test::serial]
fn test_discover_with_nested_directories() {
    let temp_dir = TempDir::new().expect("failed to create temporary directory");
    let config_path = temp_dir.path().join("kreuzberg.toml");

    let toml_content = r#"
[ocr]
enabled = true
"#;

    fs::write(&config_path, toml_content).expect("failed to write kreuzberg.toml");

    let level1 = temp_dir.path().join("level1");
    let level2 = level1.join("level2");
    let level3 = level2.join("level3");
    fs::create_dir_all(&level3).expect("failed to create nested directories");

    let original_dir = std::env::current_dir().expect("failed to read current directory");
    if std::env::set_current_dir(&level3).is_err() {
        return;
    }

    let result = ExtractionConfig::discover();

    // Best-effort restore, performed before any assertion on the result can panic.
    let _ = std::env::set_current_dir(&original_dir);

    // Each `expect` reports the failing step (and the error value, for the `Result`).
    let config = result
        .expect("Discover should succeed")
        .expect("Should find config in ancestor directory");
    assert!(config.ocr.is_some(), "Should have OCR config");
}
|
||
|
|
|
||
|
|
/// Loads a TOML config that sets the `ocr`, `chunking`, `language_detection` and `images`
/// tables (plus `pdf_options` when the `pdf` feature is enabled) and checks that each
/// corresponding section is present on the loaded `ExtractionConfig`.
#[test]
fn test_from_file_comprehensive_config() {
    let temp_dir = TempDir::new().expect("failed to create temporary directory");
    let config_path = temp_dir.path().join("config.toml");

    let base_toml = r#"
[ocr]
enabled = true
backend = "tesseract"

[chunking]
max_chars = 2000
max_overlap = 200

[language_detection]
enabled = true

[images]
extract_images = true
"#;

    // Selecting the optional section as a value avoids a `mut` binding that is only
    // mutated when the `pdf` feature is enabled.
    let pdf_section = if cfg!(feature = "pdf") {
        "\n[pdf_options]\nextract_images = true\n"
    } else {
        ""
    };
    let toml_content = [base_toml, pdf_section].concat();

    fs::write(&config_path, toml_content).expect("failed to write config.toml");

    // `expect` prints the underlying error on failure, which `assert!(is_ok())` would hide.
    let config =
        ExtractionConfig::from_file(&config_path).expect("Should load comprehensive config successfully");
    assert!(config.ocr.is_some(), "Should have OCR config");
    assert!(config.chunking.is_some(), "Should have chunking config");
    assert!(
        config.language_detection.is_some(),
        "Should have language detection config"
    );
    assert!(config.images.is_some(), "Should have image extraction config");
    #[cfg(feature = "pdf")]
    assert!(config.pdf_options.is_some(), "Should have PDF config");
}
|
||
|
|
|
||
|
|
/// Feeds negative `max_chars` / `max_overlap` values through `ExtractionConfig::from_file`.
///
/// The test passes when loading fails or yields no chunking section; when a chunking
/// section is produced, its `max_characters` must be greater than zero.
#[test]
fn test_from_file_with_invalid_values() {
    let scratch = TempDir::new().expect("Operation failed");
    let toml_path = scratch.path().join("config.toml");

    let negative_values = r#"
[chunking]
max_chars = -1000
max_overlap = -100
"#;
    fs::write(&toml_path, negative_values).expect("Operation failed");

    // Collapse "loaded successfully" and "has a chunking section" into a single Option.
    let parsed_chunking = ExtractionConfig::from_file(&toml_path)
        .ok()
        .and_then(|config| config.chunking);

    if let Some(chunking) = parsed_chunking {
        assert!(chunking.max_characters > 0, "max_characters should be positive");
    }
}
|