Files
fil/crates/kreuzberg/tests/config_loading_tests.rs
Henrik Jess Nielsen b4c07d3693
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s
Nomad changes
2026-06-01 23:40:55 +02:00

428 lines
13 KiB
Rust

//! Configuration loading integration tests.
//!
//! Tests the config loading APIs:
//! - from_file() with TOML/YAML/JSON
//! - discover() for searching parent directories
//! - Error handling for invalid configs
use kreuzberg::KreuzbergError;
use kreuzberg::core::config::ExtractionConfig;
use std::fs;
use tempfile::TempDir;
/// Loads a TOML config through `ExtractionConfig::from_file` and checks the parsed values.
///
/// The fixture declares `[ocr]` and `[chunking]` tables. The chunking keys are spelled
/// `max_chars` / `max_overlap` in the file while the assertions read the
/// `max_characters` / `overlap` fields; presumably the config type accepts both
/// spellings — confirm against its definition.
#[test]
fn test_from_file_toml_succeeds() {
    let workspace = TempDir::new().expect("Operation failed");
    let toml_path = workspace.path().join("config.toml");
    fs::write(
        &toml_path,
        r#"
[ocr]
enabled = true
backend = "tesseract"
[chunking]
max_chars = 1000
max_overlap = 100
"#,
    )
    .expect("Operation failed");

    let outcome = ExtractionConfig::from_file(&toml_path);
    assert!(outcome.is_ok(), "Should load TOML config successfully");

    let loaded = outcome.expect("Operation failed");
    assert!(loaded.ocr.is_some(), "Should have OCR config");
    assert!(loaded.chunking.is_some(), "Should have chunking config");

    let chunk_settings = loaded.chunking.expect("Operation failed");
    assert_eq!(chunk_settings.max_characters, 1000);
    assert_eq!(chunk_settings.overlap, 100);
}
/// Loads a YAML config through `ExtractionConfig::from_file` and checks the parsed values.
///
/// The fixture nests `enabled`/`backend` under `ocr` and `max_characters`/`overlap`
/// under `chunking`. YAML expresses nesting through indentation, so the child keys
/// must be indented; written flush-left they would become unrelated top-level keys
/// and leave `ocr`/`chunking` empty.
#[test]
fn test_from_file_yaml_succeeds() {
    let temp_dir = TempDir::new().expect("Operation failed");
    let config_path = temp_dir.path().join("config.yaml");
    // Indentation inside this literal is significant: it defines the mapping structure.
    let yaml_content = r#"
ocr:
  enabled: true
  backend: tesseract
chunking:
  max_characters: 1000
  overlap: 100
"#;
    fs::write(&config_path, yaml_content).expect("Operation failed");

    let config = ExtractionConfig::from_file(&config_path);
    assert!(config.is_ok(), "Should load YAML config successfully");

    let config = config.expect("Operation failed");
    assert!(config.ocr.is_some(), "Should have OCR config");
    assert!(config.chunking.is_some(), "Should have chunking config");

    let chunking = config.chunking.expect("Operation failed");
    assert_eq!(chunking.max_characters, 1000);
    assert_eq!(chunking.overlap, 100);
}
/// Loads a JSON config through `ExtractionConfig::from_file` and checks the parsed values.
///
/// The document contains `ocr` and `chunking` objects. The chunking keys are spelled
/// `max_chars` / `max_overlap` in the file while the assertions read the
/// `max_characters` / `overlap` fields; presumably the config type accepts both
/// spellings — confirm against its definition.
#[test]
fn test_from_file_json_succeeds() {
    let scratch = TempDir::new().expect("Operation failed");
    let json_path = scratch.path().join("config.json");
    let payload = r#"
{
"ocr": {
"enabled": true,
"backend": "tesseract"
},
"chunking": {
"max_chars": 1000,
"max_overlap": 100
}
}
"#;
    fs::write(&json_path, payload).expect("Operation failed");

    let outcome = ExtractionConfig::from_file(&json_path);
    assert!(outcome.is_ok(), "Should load JSON config successfully");

    let parsed = outcome.expect("Operation failed");
    assert!(parsed.ocr.is_some(), "Should have OCR config");
    assert!(parsed.chunking.is_some(), "Should have chunking config");

    let chunk_settings = parsed.chunking.expect("Operation failed");
    assert_eq!(chunk_settings.max_characters, 1000);
    assert_eq!(chunk_settings.overlap, 100);
}
/// Checks that `ExtractionConfig::from_file` accepts a file with the `.yml` extension.
///
/// Only success of the load is asserted; the parsed contents are not inspected.
/// The fixture nests `enabled` under `ocr`, which in YAML requires the child key to
/// be indented.
#[test]
fn test_from_file_yml_extension_succeeds() {
    let temp_dir = TempDir::new().expect("Operation failed");
    let config_path = temp_dir.path().join("config.yml");
    // Indentation inside this literal is significant: it makes `enabled` a child of `ocr`.
    let yml_content = r#"
ocr:
  enabled: true
"#;
    fs::write(&config_path, yml_content).expect("Operation failed");

    let config = ExtractionConfig::from_file(&config_path);
    assert!(config.is_ok(), "Should load .yml config successfully");
}
/// Checks that `ExtractionConfig::from_file` returns an error for a path that is
/// expected not to exist on the test machine.
#[test]
fn test_from_file_nonexistent_path_fails() {
    let missing_path = "/nonexistent/path/config.toml";
    let outcome = ExtractionConfig::from_file(missing_path);
    assert!(outcome.is_err(), "Should fail for nonexistent path: {:?}", outcome);
}
/// Checks that `ExtractionConfig::from_file` rejects syntactically broken TOML.
///
/// The fixture opens a table header (`[ocr`) without the closing bracket.
#[test]
fn test_from_file_malformed_toml_fails() {
    let scratch = TempDir::new().expect("Operation failed");
    let broken_path = scratch.path().join("config.toml");
    fs::write(
        &broken_path,
        r#"
[ocr
enabled = true
"#,
    )
    .expect("Operation failed");

    let outcome = ExtractionConfig::from_file(&broken_path);
    assert!(outcome.is_err(), "Should fail for malformed TOML: {:?}", outcome);
}
/// Checks that `ExtractionConfig::from_file` rejects syntactically broken JSON.
///
/// The fixture omits the comma between the `"ocr"` and `"chunking"` members.
#[test]
fn test_from_file_malformed_json_fails() {
    let scratch = TempDir::new().expect("Operation failed");
    let broken_path = scratch.path().join("config.json");
    fs::write(
        &broken_path,
        r#"
{
"ocr": {
"enabled": true
}
"chunking": {}
}
"#,
    )
    .expect("Operation failed");

    let outcome = ExtractionConfig::from_file(&broken_path);
    assert!(outcome.is_err(), "Should fail for malformed JSON: {:?}", outcome);
}
/// Checks that `ExtractionConfig::from_file` rejects syntactically broken YAML.
///
/// The fixture mixes mapping entries with a stray sequence item (`- invalid_list`)
/// at the same level.
#[test]
fn test_from_file_malformed_yaml_fails() {
    let scratch = TempDir::new().expect("Operation failed");
    let broken_path = scratch.path().join("config.yaml");
    fs::write(
        &broken_path,
        r#"
ocr:
enabled: true
- invalid_list
"#,
    )
    .expect("Operation failed");

    let outcome = ExtractionConfig::from_file(&broken_path);
    assert!(outcome.is_err(), "Should fail for malformed YAML: {:?}", outcome);
}
/// Checks that a zero-byte `.toml` file loads successfully and yields a config
/// whose `ocr` and `chunking` sections are both `None`.
#[test]
fn test_from_file_empty_file_uses_defaults() {
    let scratch = TempDir::new().expect("Operation failed");
    let blank_path = scratch.path().join("config.toml");
    fs::write(&blank_path, "").expect("Operation failed");

    let outcome = ExtractionConfig::from_file(&blank_path);
    assert!(outcome.is_ok(), "Should load empty file successfully");

    let defaults = outcome.expect("Operation failed");
    assert!(defaults.ocr.is_none(), "Default config should have no OCR");
    assert!(defaults.chunking.is_none(), "Default config should have no chunking");
}
/// Checks that `ExtractionConfig::from_file` fails for a `.txt` file.
///
/// When the error is the `KreuzbergError::Validation` variant, its message must
/// mention "format", "extension" or "Unsupported". Any other error variant is
/// accepted without inspecting its message.
#[test]
fn test_from_file_unsupported_extension_fails() {
    let scratch = TempDir::new().expect("Operation failed");
    let txt_path = scratch.path().join("config.txt");
    fs::write(&txt_path, "ocr:\n enabled: true").expect("Operation failed");

    let outcome = ExtractionConfig::from_file(&txt_path);
    assert!(outcome.is_err(), "Should fail for unsupported extension: {:?}", outcome);

    if let Err(KreuzbergError::Validation { message, .. }) = outcome {
        let mentions_format =
            message.contains("format") || message.contains("extension") || message.contains("Unsupported");
        assert!(mentions_format, "Error should mention format/extension: {}", message);
    }
}
/// Checks that `ExtractionConfig::discover` finds a `kreuzberg.toml` located in the
/// working directory.
///
/// The test switches the process working directory to a temp dir for the duration
/// of the `discover()` call and restores it before asserting. It is marked
/// `#[serial_test::serial]`, presumably because the working directory is shared
/// process state.
#[test]
#[serial_test::serial]
fn test_discover_finds_config_in_current_dir() {
    let project_root = TempDir::new().expect("Operation failed");
    fs::write(
        project_root.path().join("kreuzberg.toml"),
        r#"
[ocr]
enabled = true
"#,
    )
    .expect("Operation failed");

    let previous_cwd = std::env::current_dir().expect("Operation failed");
    std::env::set_current_dir(project_root.path()).expect("Operation failed");
    let outcome = ExtractionConfig::discover();
    // Restore the working directory before any assertion can panic.
    std::env::set_current_dir(previous_cwd).expect("Operation failed");

    assert!(outcome.is_ok(), "Discover should succeed");
    let discovered = outcome.expect("Operation failed");
    assert!(discovered.is_some(), "Should find config in current directory");
    let found = discovered.expect("Operation failed");
    assert!(found.ocr.is_some(), "Should have OCR config");
}
/// Checks that `ExtractionConfig::discover` finds a `kreuzberg.toml` located one
/// level above the working directory.
///
/// The config is written to the temp dir root and the working directory is set to
/// its `subdir` child for the `discover()` call, then restored before asserting.
#[test]
#[serial_test::serial]
fn test_discover_finds_config_in_parent_dir() {
    let project_root = TempDir::new().expect("Operation failed");
    fs::write(
        project_root.path().join("kreuzberg.toml"),
        r#"
[ocr]
enabled = true
"#,
    )
    .expect("Operation failed");

    let child_dir = project_root.path().join("subdir");
    fs::create_dir(&child_dir).expect("Operation failed");

    let previous_cwd = std::env::current_dir().expect("Operation failed");
    std::env::set_current_dir(&child_dir).expect("Operation failed");
    let outcome = ExtractionConfig::discover();
    // Restore the working directory before any assertion can panic.
    std::env::set_current_dir(previous_cwd).expect("Operation failed");

    assert!(outcome.is_ok(), "Discover should succeed");
    let discovered = outcome.expect("Operation failed");
    assert!(discovered.is_some(), "Should find config in parent directory");
    let found = discovered.expect("Operation failed");
    assert!(found.ocr.is_some(), "Should have OCR config");
}
/// Runs `ExtractionConfig::discover` from a directory tree that contains no config
/// file written by this test.
///
/// Only the `Ok` outcome is asserted. The discovered value is deliberately not
/// inspected, so a config file that happens to exist further up the filesystem
/// does not fail the test.
#[test]
#[serial_test::serial]
fn test_discover_returns_none_when_not_found() {
    let project_root = TempDir::new().expect("Operation failed");
    let child_dir = project_root.path().join("subdir");
    fs::create_dir(&child_dir).expect("Operation failed");

    let previous_cwd = std::env::current_dir().expect("Operation failed");
    std::env::set_current_dir(&child_dir).expect("Operation failed");
    let outcome = ExtractionConfig::discover();
    // Restore the working directory before any assertion can panic.
    std::env::set_current_dir(previous_cwd).expect("Operation failed");

    assert!(outcome.is_ok(), "Discover should succeed even when no config found");
    let _discovered = outcome.expect("Operation failed");
}
/// Runs `ExtractionConfig::discover` in a directory holding both `kreuzberg.toml`
/// and `.kreuzberg.toml`.
///
/// The test asserts only that some config is found; which of the two files wins is
/// not checked. If the working directory cannot be changed the test returns early
/// without asserting anything, and a failure to restore it is ignored.
#[test]
#[serial_test::serial]
fn test_discover_file_name_preference() {
    let project_root = TempDir::new().expect("Operation failed");
    for (file_name, body) in [
        ("kreuzberg.toml", "[ocr]\nenabled = true"),
        (".kreuzberg.toml", "[ocr]\nenabled = false"),
    ] {
        fs::write(project_root.path().join(file_name), body).expect("Operation failed");
    }

    let previous_cwd = std::env::current_dir().expect("Operation failed");
    if std::env::set_current_dir(project_root.path()).is_err() {
        return;
    }
    let outcome = ExtractionConfig::discover();
    let _ = std::env::set_current_dir(previous_cwd);

    assert!(outcome.is_ok(), "Discover should succeed");
    let discovered = outcome.expect("Operation failed");
    assert!(discovered.is_some(), "Should find a config file");
}
/// Checks that `ExtractionConfig::discover` finds a `kreuzberg.toml` three levels
/// above the working directory.
///
/// The config lives in the temp dir root and the working directory is set to
/// `level1/level2/level3` beneath it. If the working directory cannot be changed
/// the test returns early without asserting anything, and a failure to restore it
/// is ignored.
#[test]
#[serial_test::serial]
fn test_discover_with_nested_directories() {
    let project_root = TempDir::new().expect("Operation failed");
    fs::write(
        project_root.path().join("kreuzberg.toml"),
        r#"
[ocr]
enabled = true
"#,
    )
    .expect("Operation failed");

    let deepest_dir = project_root.path().join("level1").join("level2").join("level3");
    fs::create_dir_all(&deepest_dir).expect("Operation failed");

    let previous_cwd = std::env::current_dir().expect("Operation failed");
    if std::env::set_current_dir(&deepest_dir).is_err() {
        return;
    }
    let outcome = ExtractionConfig::discover();
    let _ = std::env::set_current_dir(&previous_cwd);

    assert!(outcome.is_ok(), "Discover should succeed");
    let discovered = outcome.expect("Operation failed");
    assert!(discovered.is_some(), "Should find config in ancestor directory");
    let found = discovered.expect("Operation failed");
    assert!(found.ocr.is_some(), "Should have OCR config");
}
/// Loads a TOML config that populates several sections at once and checks that
/// each one is present after parsing.
///
/// The base document sets `[ocr]`, `[chunking]`, `[language_detection]` and
/// `[images]`. When the `pdf` feature is enabled a `[pdf_options]` table is appended
/// and its presence is asserted as well.
#[test]
fn test_from_file_comprehensive_config() {
    let scratch = TempDir::new().expect("Operation failed");
    let toml_path = scratch.path().join("config.toml");

    let mut document = String::from(
        r#"
[ocr]
enabled = true
backend = "tesseract"
[chunking]
max_chars = 2000
max_overlap = 200
[language_detection]
enabled = true
[images]
extract_images = true
"#,
    );
    #[cfg(feature = "pdf")]
    document.push_str("\n[pdf_options]\nextract_images = true\n");
    fs::write(&toml_path, document).expect("Operation failed");

    let outcome = ExtractionConfig::from_file(&toml_path);
    assert!(outcome.is_ok(), "Should load comprehensive config successfully");

    let loaded = outcome.expect("Operation failed");
    assert!(loaded.ocr.is_some(), "Should have OCR config");
    assert!(loaded.chunking.is_some(), "Should have chunking config");
    assert!(
        loaded.language_detection.is_some(),
        "Should have language detection config"
    );
    assert!(loaded.images.is_some(), "Should have image extraction config");
    #[cfg(feature = "pdf")]
    assert!(loaded.pdf_options.is_some(), "Should have PDF config");
}
/// Feeds negative chunking values to `ExtractionConfig::from_file`.
///
/// The test is permissive: if loading fails, or succeeds without a chunking
/// section, nothing is asserted. Only when a chunking section is produced does it
/// require `max_characters` to be greater than zero.
#[test]
fn test_from_file_with_invalid_values() {
    let scratch = TempDir::new().expect("Operation failed");
    let toml_path = scratch.path().join("config.toml");
    fs::write(
        &toml_path,
        r#"
[chunking]
max_chars = -1000
max_overlap = -100
"#,
    )
    .expect("Operation failed");

    // Collapse "load failed" and "no chunking section" into a single `None`.
    let chunk_settings = ExtractionConfig::from_file(&toml_path)
        .ok()
        .and_then(|loaded| loaded.chunking);
    if let Some(chunk_settings) = chunk_settings {
        assert!(chunk_settings.max_characters > 0, "max_characters should be positive");
    }
}