//! Configuration loading integration tests. //! //! Tests the config loading APIs: //! - from_file() with TOML/YAML/JSON //! - discover() for searching parent directories //! - Error handling for invalid configs use kreuzberg::KreuzbergError; use kreuzberg::core::config::ExtractionConfig; use std::fs; use tempfile::TempDir; /// Test loading config from TOML file. #[test] fn test_from_file_toml_succeeds() { let temp_dir = TempDir::new().expect("Operation failed"); let config_path = temp_dir.path().join("config.toml"); let toml_content = r#" [ocr] enabled = true backend = "tesseract" [chunking] max_chars = 1000 max_overlap = 100 "#; fs::write(&config_path, toml_content).expect("Operation failed"); let config = ExtractionConfig::from_file(&config_path); assert!(config.is_ok(), "Should load TOML config successfully"); let config = config.expect("Operation failed"); assert!(config.ocr.is_some(), "Should have OCR config"); assert!(config.chunking.is_some(), "Should have chunking config"); let chunking = config.chunking.expect("Operation failed"); assert_eq!(chunking.max_characters, 1000); assert_eq!(chunking.overlap, 100); } /// Test loading config from YAML file. #[test] fn test_from_file_yaml_succeeds() { let temp_dir = TempDir::new().expect("Operation failed"); let config_path = temp_dir.path().join("config.yaml"); let yaml_content = r#" ocr: enabled: true backend: tesseract chunking: max_characters: 1000 overlap: 100 "#; fs::write(&config_path, yaml_content).expect("Operation failed"); let config = ExtractionConfig::from_file(&config_path); assert!(config.is_ok(), "Should load YAML config successfully"); let config = config.expect("Operation failed"); assert!(config.ocr.is_some(), "Should have OCR config"); assert!(config.chunking.is_some(), "Should have chunking config"); let chunking = config.chunking.expect("Operation failed"); assert_eq!(chunking.max_characters, 1000); assert_eq!(chunking.overlap, 100); } /// Test loading config from JSON file. #[test] fn test_from_file_json_succeeds() { let temp_dir = TempDir::new().expect("Operation failed"); let config_path = temp_dir.path().join("config.json"); let json_content = r#" { "ocr": { "enabled": true, "backend": "tesseract" }, "chunking": { "max_chars": 1000, "max_overlap": 100 } } "#; fs::write(&config_path, json_content).expect("Operation failed"); let config = ExtractionConfig::from_file(&config_path); assert!(config.is_ok(), "Should load JSON config successfully"); let config = config.expect("Operation failed"); assert!(config.ocr.is_some(), "Should have OCR config"); assert!(config.chunking.is_some(), "Should have chunking config"); let chunking = config.chunking.expect("Operation failed"); assert_eq!(chunking.max_characters, 1000); assert_eq!(chunking.overlap, 100); } /// Test loading config from .yml extension. #[test] fn test_from_file_yml_extension_succeeds() { let temp_dir = TempDir::new().expect("Operation failed"); let config_path = temp_dir.path().join("config.yml"); let yml_content = r#" ocr: enabled: true "#; fs::write(&config_path, yml_content).expect("Operation failed"); let config = ExtractionConfig::from_file(&config_path); assert!(config.is_ok(), "Should load .yml config successfully"); } /// Test from_file with nonexistent path fails. #[test] fn test_from_file_nonexistent_path_fails() { let result = ExtractionConfig::from_file("/nonexistent/path/config.toml"); assert!(result.is_err(), "Should fail for nonexistent path: {:?}", result); } /// Test from_file with malformed TOML fails. #[test] fn test_from_file_malformed_toml_fails() { let temp_dir = TempDir::new().expect("Operation failed"); let config_path = temp_dir.path().join("config.toml"); let malformed_toml = r#" [ocr enabled = true "#; fs::write(&config_path, malformed_toml).expect("Operation failed"); let result = ExtractionConfig::from_file(&config_path); assert!(result.is_err(), "Should fail for malformed TOML: {:?}", result); } /// Test from_file with malformed JSON fails. #[test] fn test_from_file_malformed_json_fails() { let temp_dir = TempDir::new().expect("Operation failed"); let config_path = temp_dir.path().join("config.json"); let malformed_json = r#" { "ocr": { "enabled": true } "chunking": {} } "#; fs::write(&config_path, malformed_json).expect("Operation failed"); let result = ExtractionConfig::from_file(&config_path); assert!(result.is_err(), "Should fail for malformed JSON: {:?}", result); } /// Test from_file with malformed YAML fails. #[test] fn test_from_file_malformed_yaml_fails() { let temp_dir = TempDir::new().expect("Operation failed"); let config_path = temp_dir.path().join("config.yaml"); let malformed_yaml = r#" ocr: enabled: true - invalid_list "#; fs::write(&config_path, malformed_yaml).expect("Operation failed"); let result = ExtractionConfig::from_file(&config_path); assert!(result.is_err(), "Should fail for malformed YAML: {:?}", result); } /// Test from_file with empty file uses defaults. #[test] fn test_from_file_empty_file_uses_defaults() { let temp_dir = TempDir::new().expect("Operation failed"); let config_path = temp_dir.path().join("config.toml"); fs::write(&config_path, "").expect("Operation failed"); let config = ExtractionConfig::from_file(&config_path); assert!(config.is_ok(), "Should load empty file successfully"); let config = config.expect("Operation failed"); assert!(config.ocr.is_none(), "Default config should have no OCR"); assert!(config.chunking.is_none(), "Default config should have no chunking"); } /// Test from_file with unsupported extension fails. #[test] fn test_from_file_unsupported_extension_fails() { let temp_dir = TempDir::new().expect("Operation failed"); let config_path = temp_dir.path().join("config.txt"); fs::write(&config_path, "ocr:\n enabled: true").expect("Operation failed"); let result = ExtractionConfig::from_file(&config_path); assert!(result.is_err(), "Should fail for unsupported extension: {:?}", result); if let Err(KreuzbergError::Validation { message, .. }) = result { assert!( message.contains("format") || message.contains("extension") || message.contains("Unsupported"), "Error should mention format/extension: {}", message ); } } /// Test discover() finds config in current directory. #[test] #[serial_test::serial] fn test_discover_finds_config_in_current_dir() { let temp_dir = TempDir::new().expect("Operation failed"); let config_path = temp_dir.path().join("kreuzberg.toml"); let toml_content = r#" [ocr] enabled = true "#; fs::write(&config_path, toml_content).expect("Operation failed"); let original_dir = std::env::current_dir().expect("Operation failed"); std::env::set_current_dir(temp_dir.path()).expect("Operation failed"); let result = ExtractionConfig::discover(); std::env::set_current_dir(original_dir).expect("Operation failed"); assert!(result.is_ok(), "Discover should succeed"); let config = result.expect("Operation failed"); assert!(config.is_some(), "Should find config in current directory"); assert!( config.expect("Operation failed").ocr.is_some(), "Should have OCR config" ); } /// Test discover() finds config in parent directory. #[test] #[serial_test::serial] fn test_discover_finds_config_in_parent_dir() { let temp_dir = TempDir::new().expect("Operation failed"); let config_path = temp_dir.path().join("kreuzberg.toml"); let toml_content = r#" [ocr] enabled = true "#; fs::write(&config_path, toml_content).expect("Operation failed"); let sub_dir = temp_dir.path().join("subdir"); fs::create_dir(&sub_dir).expect("Operation failed"); let original_dir = std::env::current_dir().expect("Operation failed"); std::env::set_current_dir(&sub_dir).expect("Operation failed"); let result = ExtractionConfig::discover(); std::env::set_current_dir(original_dir).expect("Operation failed"); assert!(result.is_ok(), "Discover should succeed"); let config = result.expect("Operation failed"); assert!(config.is_some(), "Should find config in parent directory"); assert!( config.expect("Operation failed").ocr.is_some(), "Should have OCR config" ); } /// Test discover() returns None when no config found. #[test] #[serial_test::serial] fn test_discover_returns_none_when_not_found() { let temp_dir = TempDir::new().expect("Operation failed"); let sub_dir = temp_dir.path().join("subdir"); fs::create_dir(&sub_dir).expect("Operation failed"); let original_dir = std::env::current_dir().expect("Operation failed"); std::env::set_current_dir(&sub_dir).expect("Operation failed"); let result = ExtractionConfig::discover(); std::env::set_current_dir(original_dir).expect("Operation failed"); assert!(result.is_ok(), "Discover should succeed even when no config found"); let _config = result.expect("Operation failed"); } /// Test discover() prefers certain file names. #[test] #[serial_test::serial] fn test_discover_file_name_preference() { let temp_dir = TempDir::new().expect("Operation failed"); fs::write(temp_dir.path().join("kreuzberg.toml"), "[ocr]\nenabled = true").expect("Operation failed"); fs::write(temp_dir.path().join(".kreuzberg.toml"), "[ocr]\nenabled = false").expect("Operation failed"); let original_dir = std::env::current_dir().expect("Operation failed"); if std::env::set_current_dir(temp_dir.path()).is_err() { return; } let result = ExtractionConfig::discover(); let _ = std::env::set_current_dir(original_dir); assert!(result.is_ok(), "Discover should succeed"); let config = result.expect("Operation failed"); assert!(config.is_some(), "Should find a config file"); } /// Test discover() with nested directories. #[test] #[serial_test::serial] fn test_discover_with_nested_directories() { let temp_dir = TempDir::new().expect("Operation failed"); let config_path = temp_dir.path().join("kreuzberg.toml"); let toml_content = r#" [ocr] enabled = true "#; fs::write(&config_path, toml_content).expect("Operation failed"); let level1 = temp_dir.path().join("level1"); let level2 = level1.join("level2"); let level3 = level2.join("level3"); fs::create_dir_all(&level3).expect("Operation failed"); let original_dir = std::env::current_dir().expect("Operation failed"); if std::env::set_current_dir(&level3).is_err() { return; } let result = ExtractionConfig::discover(); let _ = std::env::set_current_dir(&original_dir); assert!(result.is_ok(), "Discover should succeed"); let config = result.expect("Operation failed"); assert!(config.is_some(), "Should find config in ancestor directory"); assert!( config.expect("Operation failed").ocr.is_some(), "Should have OCR config" ); } /// Test config loading with all supported features. #[test] fn test_from_file_comprehensive_config() { let temp_dir = TempDir::new().expect("Operation failed"); let config_path = temp_dir.path().join("config.toml"); let mut toml_content = String::from( r#" [ocr] enabled = true backend = "tesseract" [chunking] max_chars = 2000 max_overlap = 200 [language_detection] enabled = true [images] extract_images = true "#, ); #[cfg(feature = "pdf")] toml_content.push_str("\n[pdf_options]\nextract_images = true\n"); fs::write(&config_path, toml_content).expect("Operation failed"); let config = ExtractionConfig::from_file(&config_path); assert!(config.is_ok(), "Should load comprehensive config successfully"); let config = config.expect("Operation failed"); assert!(config.ocr.is_some(), "Should have OCR config"); assert!(config.chunking.is_some(), "Should have chunking config"); assert!( config.language_detection.is_some(), "Should have language detection config" ); assert!(config.images.is_some(), "Should have image extraction config"); #[cfg(feature = "pdf")] assert!(config.pdf_options.is_some(), "Should have PDF config"); } /// Test config validation with invalid values. #[test] fn test_from_file_with_invalid_values() { let temp_dir = TempDir::new().expect("Operation failed"); let config_path = temp_dir.path().join("config.toml"); let toml_content = r#" [chunking] max_chars = -1000 max_overlap = -100 "#; fs::write(&config_path, toml_content).expect("Operation failed"); let result = ExtractionConfig::from_file(&config_path); if let Ok(config) = result && let Some(chunking) = config.chunking { assert!(chunking.max_characters > 0, "max_characters should be positive"); } }