This commit is contained in:
355
crates/kreuzberg-cli/tests/contract_cli.rs
Normal file
355
crates/kreuzberg-cli/tests/contract_cli.rs
Normal file
@@ -0,0 +1,355 @@
|
||||
//! CLI contract tests - verify CLI config parsing matches Rust core
|
||||
//!
|
||||
//! This test suite validates that the CLI's configuration parsing produces
|
||||
//! identical results to the Rust core library. It ensures that users get
|
||||
//! consistent behavior whether using the CLI, SDK, or MCP interfaces.
|
||||
|
||||
use kreuzberg::core::config::ExtractionConfig;
|
||||
use kreuzberg::core::config::OutputFormat;
|
||||
use serde_json::json;
|
||||
|
||||
#[test]
|
||||
fn test_cli_config_json_flag_basic_parsing() {
|
||||
let config_str = r#"{"use_cache": true, "output_format": "plain"}"#;
|
||||
|
||||
// Parse as Rust core would
|
||||
let rust_config: ExtractionConfig = serde_json::from_str(config_str).expect("Failed to deserialize config string");
|
||||
|
||||
// Simulate CLI --config-json parsing (same as Rust core)
|
||||
let cli_json: serde_json::Value = serde_json::from_str(config_str).expect("Failed to parse JSON string");
|
||||
let cli_config: ExtractionConfig = serde_json::from_value(cli_json).expect("Failed to deserialize from JSON value");
|
||||
|
||||
// Verify identical behavior
|
||||
assert_eq!(
|
||||
rust_config.use_cache, cli_config.use_cache,
|
||||
"use_cache should be identical"
|
||||
);
|
||||
assert_eq!(
|
||||
rust_config.output_format, cli_config.output_format,
|
||||
"output_format should be identical"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_cli_nested_config_deserialization() {
|
||||
let config_str = r#"{
|
||||
"chunking": {
|
||||
"max_characters": 1000,
|
||||
"overlap": 200
|
||||
},
|
||||
"ocr": {
|
||||
"backend": "tesseract"
|
||||
}
|
||||
}"#;
|
||||
|
||||
let config: ExtractionConfig = serde_json::from_str(config_str).expect("Failed to deserialize nested config");
|
||||
|
||||
assert!(config.chunking.is_some(), "Chunking config should be present");
|
||||
assert!(config.ocr.is_some(), "OCR config should be present");
|
||||
|
||||
let chunking = config.chunking.unwrap();
|
||||
assert_eq!(chunking.max_characters, 1000, "max_chars should be 1000");
|
||||
assert_eq!(chunking.overlap, 200, "max_overlap should be 200");
|
||||
|
||||
let ocr = config.ocr.unwrap();
|
||||
assert_eq!(ocr.backend, "tesseract", "backend should be tesseract");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_cli_force_ocr_flag_parsing() {
|
||||
let config_str = r#"{"force_ocr": true}"#;
|
||||
|
||||
let config: ExtractionConfig = serde_json::from_str(config_str).expect("Failed to deserialize force_ocr config");
|
||||
|
||||
assert!(config.force_ocr, "force_ocr should be true");
|
||||
// Verify other fields retain defaults
|
||||
assert!(config.use_cache, "use_cache should still be true by default");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_cli_max_concurrent_extractions_parsing() {
|
||||
let config_str = r#"{"max_concurrent_extractions": 8}"#;
|
||||
|
||||
let config: ExtractionConfig =
|
||||
serde_json::from_str(config_str).expect("Failed to deserialize concurrent extractions");
|
||||
|
||||
assert_eq!(
|
||||
config.max_concurrent_extractions,
|
||||
Some(8),
|
||||
"max_concurrent_extractions should be 8"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_cli_complex_config_deserialization() {
|
||||
let config_str = r#"{
|
||||
"use_cache": false,
|
||||
"enable_quality_processing": true,
|
||||
"force_ocr": true,
|
||||
"output_format": "markdown",
|
||||
"result_format": "unified",
|
||||
"max_concurrent_extractions": 16,
|
||||
"ocr": {
|
||||
"backend": "tesseract",
|
||||
"language": "eng"
|
||||
},
|
||||
"chunking": {
|
||||
"max_characters": 2000,
|
||||
"overlap": 400,
|
||||
"strategy": "sliding_window"
|
||||
}
|
||||
}"#;
|
||||
|
||||
let config: ExtractionConfig = serde_json::from_str(config_str).expect("Failed to deserialize complex config");
|
||||
|
||||
// Verify all top-level fields
|
||||
assert!(!config.use_cache);
|
||||
assert!(config.enable_quality_processing);
|
||||
assert!(config.force_ocr);
|
||||
assert_eq!(config.max_concurrent_extractions, Some(16));
|
||||
|
||||
// Verify nested configs
|
||||
assert!(config.ocr.is_some());
|
||||
assert!(config.chunking.is_some());
|
||||
|
||||
let ocr = config.ocr.unwrap();
|
||||
assert_eq!(ocr.backend, "tesseract");
|
||||
assert_eq!(ocr.language, "eng");
|
||||
|
||||
let chunking = config.chunking.unwrap();
|
||||
assert_eq!(chunking.max_characters, 2000);
|
||||
assert_eq!(chunking.overlap, 400);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_cli_empty_config_uses_defaults() {
|
||||
let config_str = r#"{}"#;
|
||||
|
||||
let config: ExtractionConfig = serde_json::from_str(config_str).expect("Failed to deserialize empty config");
|
||||
|
||||
// All defaults should apply
|
||||
assert!(config.use_cache, "Default use_cache should be true");
|
||||
assert!(
|
||||
config.enable_quality_processing,
|
||||
"Default enable_quality_processing should be true"
|
||||
);
|
||||
assert!(!config.force_ocr, "Default force_ocr should be false");
|
||||
assert_eq!(
|
||||
config.max_concurrent_extractions, None,
|
||||
"Default max_concurrent_extractions should be None"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_cli_roundtrip_preserves_all_fields() {
|
||||
let original_str = r#"{
|
||||
"use_cache": false,
|
||||
"force_ocr": true,
|
||||
"max_concurrent_extractions": 12
|
||||
}"#;
|
||||
|
||||
// Parse
|
||||
let config: ExtractionConfig = serde_json::from_str(original_str).expect("Failed to deserialize");
|
||||
|
||||
// Serialize back
|
||||
let serialized = serde_json::to_value(&config).expect("Failed to serialize");
|
||||
|
||||
// Re-parse the serialized version
|
||||
let reparsed: ExtractionConfig =
|
||||
serde_json::from_value(serialized).expect("Failed to deserialize roundtripped config");
|
||||
|
||||
// Verify fields preserved
|
||||
assert!(!reparsed.use_cache);
|
||||
assert!(reparsed.force_ocr);
|
||||
assert_eq!(reparsed.max_concurrent_extractions, Some(12));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_cli_output_format_enum_parsing() {
|
||||
let test_cases = vec![
|
||||
(r#"{"output_format": "plain"}"#, OutputFormat::Plain),
|
||||
(r#"{"output_format": "markdown"}"#, OutputFormat::Markdown),
|
||||
(r#"{"output_format": "html"}"#, OutputFormat::Html),
|
||||
];
|
||||
|
||||
for (config_str, expected_format) in test_cases {
|
||||
let config: ExtractionConfig =
|
||||
serde_json::from_str(config_str).unwrap_or_else(|_| panic!("Failed to deserialize {}", config_str));
|
||||
|
||||
assert_eq!(
|
||||
config.output_format, expected_format,
|
||||
"output_format should match expected value"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_cli_result_format_enum_parsing() {
|
||||
let test_cases = vec![
|
||||
r#"{"result_format": "unified"}"#,
|
||||
r#"{"result_format": "element_based"}"#,
|
||||
];
|
||||
|
||||
for config_str in test_cases {
|
||||
let result = serde_json::from_str::<ExtractionConfig>(config_str);
|
||||
assert!(result.is_ok(), "Should deserialize result_format from {}", config_str);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_cli_base64_encoded_config_simulation() {
|
||||
// Simulate --config-json-base64 flag handling
|
||||
let original_json = json!({
|
||||
"force_ocr": true,
|
||||
"output_format": "markdown"
|
||||
});
|
||||
|
||||
let json_string = original_json.to_string();
|
||||
|
||||
// Simulate base64 encoding
|
||||
let encoded = base64::engine::general_purpose::STANDARD.encode(&json_string);
|
||||
|
||||
// Simulate base64 decoding (as CLI would do)
|
||||
use base64::Engine;
|
||||
let decoded = String::from_utf8(
|
||||
base64::engine::general_purpose::STANDARD
|
||||
.decode(&encoded)
|
||||
.expect("Failed to decode base64"),
|
||||
)
|
||||
.expect("Failed to convert bytes to string");
|
||||
|
||||
// Parse the decoded JSON
|
||||
let config: ExtractionConfig = serde_json::from_str(&decoded).expect("Failed to deserialize base64-decoded config");
|
||||
|
||||
assert!(config.force_ocr);
|
||||
assert_eq!(config.output_format, OutputFormat::Markdown);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_cli_partial_override_merging() {
|
||||
// Test that partial configs can override defaults
|
||||
let base_config = ExtractionConfig::default();
|
||||
let override_json = json!({"force_ocr": true, "use_cache": false});
|
||||
|
||||
// Simulate CLI merge: convert base to JSON, merge overrides, deserialize
|
||||
let mut base_json = serde_json::to_value(&base_config).expect("Failed to serialize base config");
|
||||
|
||||
if let (serde_json::Value::Object(base_obj), serde_json::Value::Object(override_obj)) =
|
||||
(&mut base_json, override_json)
|
||||
{
|
||||
for (key, value) in override_obj {
|
||||
base_obj.insert(key, value);
|
||||
}
|
||||
}
|
||||
|
||||
let merged: ExtractionConfig = serde_json::from_value(base_json).expect("Failed to deserialize merged config");
|
||||
|
||||
assert!(merged.force_ocr, "Override should apply force_ocr");
|
||||
assert!(!merged.use_cache, "Override should apply use_cache");
|
||||
assert!(
|
||||
merged.enable_quality_processing,
|
||||
"Unoverridden field should retain default"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_cli_invalid_json_error_handling() {
|
||||
let invalid_json_str = r#"{"force_ocr": true, "invalid_field": "value"}"#;
|
||||
|
||||
// Note: serde with deny_unknown_fields would reject this
|
||||
// Without that, it should deserialize successfully and ignore unknown fields
|
||||
let result = serde_json::from_str::<ExtractionConfig>(invalid_json_str);
|
||||
|
||||
// Document the current behavior - unknown fields are typically ignored
|
||||
if let Ok(config) = result {
|
||||
assert!(config.force_ocr);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_cli_whitespace_handling_in_json() {
|
||||
let config_strs = vec![
|
||||
r#"{"force_ocr":true}"#, // No spaces
|
||||
r#"{ "force_ocr" : true }"#, // Extra spaces
|
||||
r#"{
|
||||
"force_ocr": true
|
||||
}"#, // Newlines and indentation
|
||||
];
|
||||
|
||||
for config_str in config_strs {
|
||||
let config: ExtractionConfig =
|
||||
serde_json::from_str(config_str).unwrap_or_else(|_| panic!("Failed to parse: {}", config_str));
|
||||
|
||||
assert!(config.force_ocr);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_cli_numeric_boundary_values() {
|
||||
// Test minimum and maximum reasonable values for numeric fields
|
||||
let test_cases = vec![
|
||||
(r#"{"max_concurrent_extractions": 1}"#, Some(1)),
|
||||
(r#"{"max_concurrent_extractions": 256}"#, Some(256)),
|
||||
(r#"{"max_concurrent_extractions": 0}"#, Some(0)), // Edge case: 0 extractions
|
||||
];
|
||||
|
||||
for (config_str, expected_value) in test_cases {
|
||||
let config: ExtractionConfig =
|
||||
serde_json::from_str(config_str).unwrap_or_else(|_| panic!("Failed to parse: {}", config_str));
|
||||
|
||||
assert_eq!(
|
||||
config.max_concurrent_extractions, expected_value,
|
||||
"Numeric values should be parsed correctly"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_cli_boolean_values_strict_parsing() {
|
||||
// Test that boolean values are strictly true/false, not truthy/falsy
|
||||
let test_cases = vec![(r#"{"use_cache": true}"#, true), (r#"{"use_cache": false}"#, false)];
|
||||
|
||||
for (config_str, expected_value) in test_cases {
|
||||
let config: ExtractionConfig =
|
||||
serde_json::from_str(config_str).unwrap_or_else(|_| panic!("Failed to parse: {}", config_str));
|
||||
|
||||
assert_eq!(config.use_cache, expected_value);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_cli_config_consistency_across_formats() {
|
||||
// Create a config programmatically
|
||||
let programmatic_config = ExtractionConfig {
|
||||
use_cache: false,
|
||||
enable_quality_processing: true,
|
||||
force_ocr: true,
|
||||
output_format: OutputFormat::Markdown,
|
||||
max_concurrent_extractions: Some(4),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
// Serialize it
|
||||
let serialized_json = serde_json::to_value(&programmatic_config).expect("Failed to serialize");
|
||||
|
||||
// Deserialize back from JSON string (simulating CLI parsing)
|
||||
let json_string = serialized_json.to_string();
|
||||
let deserialized: ExtractionConfig = serde_json::from_str(&json_string).expect("Failed to deserialize from string");
|
||||
|
||||
// Verify complete roundtrip
|
||||
assert_eq!(deserialized.use_cache, programmatic_config.use_cache);
|
||||
assert_eq!(
|
||||
deserialized.enable_quality_processing,
|
||||
programmatic_config.enable_quality_processing
|
||||
);
|
||||
assert_eq!(deserialized.force_ocr, programmatic_config.force_ocr);
|
||||
assert_eq!(deserialized.output_format, programmatic_config.output_format);
|
||||
assert_eq!(
|
||||
deserialized.max_concurrent_extractions,
|
||||
programmatic_config.max_concurrent_extractions
|
||||
);
|
||||
}
|
||||
|
||||
// Re-export needed for base64 test (moved to end of file)
|
||||
|
||||
// Re-export needed for base64 test (imported at top of file)
|
||||
Reference in New Issue
Block a user