//! CLI contract tests - verify CLI config parsing matches Rust core //! //! This test suite validates that the CLI's configuration parsing produces //! identical results to the Rust core library. It ensures that users get //! consistent behavior whether using the CLI, SDK, or MCP interfaces. use kreuzberg::core::config::ExtractionConfig; use kreuzberg::core::config::OutputFormat; use serde_json::json; #[test] fn test_cli_config_json_flag_basic_parsing() { let config_str = r#"{"use_cache": true, "output_format": "plain"}"#; // Parse as Rust core would let rust_config: ExtractionConfig = serde_json::from_str(config_str).expect("Failed to deserialize config string"); // Simulate CLI --config-json parsing (same as Rust core) let cli_json: serde_json::Value = serde_json::from_str(config_str).expect("Failed to parse JSON string"); let cli_config: ExtractionConfig = serde_json::from_value(cli_json).expect("Failed to deserialize from JSON value"); // Verify identical behavior assert_eq!( rust_config.use_cache, cli_config.use_cache, "use_cache should be identical" ); assert_eq!( rust_config.output_format, cli_config.output_format, "output_format should be identical" ); } #[test] fn test_cli_nested_config_deserialization() { let config_str = r#"{ "chunking": { "max_characters": 1000, "overlap": 200 }, "ocr": { "backend": "tesseract" } }"#; let config: ExtractionConfig = serde_json::from_str(config_str).expect("Failed to deserialize nested config"); assert!(config.chunking.is_some(), "Chunking config should be present"); assert!(config.ocr.is_some(), "OCR config should be present"); let chunking = config.chunking.unwrap(); assert_eq!(chunking.max_characters, 1000, "max_chars should be 1000"); assert_eq!(chunking.overlap, 200, "max_overlap should be 200"); let ocr = config.ocr.unwrap(); assert_eq!(ocr.backend, "tesseract", "backend should be tesseract"); } #[test] fn test_cli_force_ocr_flag_parsing() { let config_str = r#"{"force_ocr": true}"#; let config: ExtractionConfig = serde_json::from_str(config_str).expect("Failed to deserialize force_ocr config"); assert!(config.force_ocr, "force_ocr should be true"); // Verify other fields retain defaults assert!(config.use_cache, "use_cache should still be true by default"); } #[test] fn test_cli_max_concurrent_extractions_parsing() { let config_str = r#"{"max_concurrent_extractions": 8}"#; let config: ExtractionConfig = serde_json::from_str(config_str).expect("Failed to deserialize concurrent extractions"); assert_eq!( config.max_concurrent_extractions, Some(8), "max_concurrent_extractions should be 8" ); } #[test] fn test_cli_complex_config_deserialization() { let config_str = r#"{ "use_cache": false, "enable_quality_processing": true, "force_ocr": true, "output_format": "markdown", "result_format": "unified", "max_concurrent_extractions": 16, "ocr": { "backend": "tesseract", "language": "eng" }, "chunking": { "max_characters": 2000, "overlap": 400, "strategy": "sliding_window" } }"#; let config: ExtractionConfig = serde_json::from_str(config_str).expect("Failed to deserialize complex config"); // Verify all top-level fields assert!(!config.use_cache); assert!(config.enable_quality_processing); assert!(config.force_ocr); assert_eq!(config.max_concurrent_extractions, Some(16)); // Verify nested configs assert!(config.ocr.is_some()); assert!(config.chunking.is_some()); let ocr = config.ocr.unwrap(); assert_eq!(ocr.backend, "tesseract"); assert_eq!(ocr.language, "eng"); let chunking = config.chunking.unwrap(); assert_eq!(chunking.max_characters, 2000); assert_eq!(chunking.overlap, 400); } #[test] fn test_cli_empty_config_uses_defaults() { let config_str = r#"{}"#; let config: ExtractionConfig = serde_json::from_str(config_str).expect("Failed to deserialize empty config"); // All defaults should apply assert!(config.use_cache, "Default use_cache should be true"); assert!( config.enable_quality_processing, "Default enable_quality_processing should be true" ); assert!(!config.force_ocr, "Default force_ocr should be false"); assert_eq!( config.max_concurrent_extractions, None, "Default max_concurrent_extractions should be None" ); } #[test] fn test_cli_roundtrip_preserves_all_fields() { let original_str = r#"{ "use_cache": false, "force_ocr": true, "max_concurrent_extractions": 12 }"#; // Parse let config: ExtractionConfig = serde_json::from_str(original_str).expect("Failed to deserialize"); // Serialize back let serialized = serde_json::to_value(&config).expect("Failed to serialize"); // Re-parse the serialized version let reparsed: ExtractionConfig = serde_json::from_value(serialized).expect("Failed to deserialize roundtripped config"); // Verify fields preserved assert!(!reparsed.use_cache); assert!(reparsed.force_ocr); assert_eq!(reparsed.max_concurrent_extractions, Some(12)); } #[test] fn test_cli_output_format_enum_parsing() { let test_cases = vec![ (r#"{"output_format": "plain"}"#, OutputFormat::Plain), (r#"{"output_format": "markdown"}"#, OutputFormat::Markdown), (r#"{"output_format": "html"}"#, OutputFormat::Html), ]; for (config_str, expected_format) in test_cases { let config: ExtractionConfig = serde_json::from_str(config_str).unwrap_or_else(|_| panic!("Failed to deserialize {}", config_str)); assert_eq!( config.output_format, expected_format, "output_format should match expected value" ); } } #[test] fn test_cli_result_format_enum_parsing() { let test_cases = vec![ r#"{"result_format": "unified"}"#, r#"{"result_format": "element_based"}"#, ]; for config_str in test_cases { let result = serde_json::from_str::(config_str); assert!(result.is_ok(), "Should deserialize result_format from {}", config_str); } } #[test] fn test_cli_base64_encoded_config_simulation() { // Simulate --config-json-base64 flag handling let original_json = json!({ "force_ocr": true, "output_format": "markdown" }); let json_string = original_json.to_string(); // Simulate base64 encoding let encoded = base64::engine::general_purpose::STANDARD.encode(&json_string); // Simulate base64 decoding (as CLI would do) use base64::Engine; let decoded = String::from_utf8( base64::engine::general_purpose::STANDARD .decode(&encoded) .expect("Failed to decode base64"), ) .expect("Failed to convert bytes to string"); // Parse the decoded JSON let config: ExtractionConfig = serde_json::from_str(&decoded).expect("Failed to deserialize base64-decoded config"); assert!(config.force_ocr); assert_eq!(config.output_format, OutputFormat::Markdown); } #[test] fn test_cli_partial_override_merging() { // Test that partial configs can override defaults let base_config = ExtractionConfig::default(); let override_json = json!({"force_ocr": true, "use_cache": false}); // Simulate CLI merge: convert base to JSON, merge overrides, deserialize let mut base_json = serde_json::to_value(&base_config).expect("Failed to serialize base config"); if let (serde_json::Value::Object(base_obj), serde_json::Value::Object(override_obj)) = (&mut base_json, override_json) { for (key, value) in override_obj { base_obj.insert(key, value); } } let merged: ExtractionConfig = serde_json::from_value(base_json).expect("Failed to deserialize merged config"); assert!(merged.force_ocr, "Override should apply force_ocr"); assert!(!merged.use_cache, "Override should apply use_cache"); assert!( merged.enable_quality_processing, "Unoverridden field should retain default" ); } #[test] fn test_cli_invalid_json_error_handling() { let invalid_json_str = r#"{"force_ocr": true, "invalid_field": "value"}"#; // Note: serde with deny_unknown_fields would reject this // Without that, it should deserialize successfully and ignore unknown fields let result = serde_json::from_str::(invalid_json_str); // Document the current behavior - unknown fields are typically ignored if let Ok(config) = result { assert!(config.force_ocr); } } #[test] fn test_cli_whitespace_handling_in_json() { let config_strs = vec![ r#"{"force_ocr":true}"#, // No spaces r#"{ "force_ocr" : true }"#, // Extra spaces r#"{ "force_ocr": true }"#, // Newlines and indentation ]; for config_str in config_strs { let config: ExtractionConfig = serde_json::from_str(config_str).unwrap_or_else(|_| panic!("Failed to parse: {}", config_str)); assert!(config.force_ocr); } } #[test] fn test_cli_numeric_boundary_values() { // Test minimum and maximum reasonable values for numeric fields let test_cases = vec![ (r#"{"max_concurrent_extractions": 1}"#, Some(1)), (r#"{"max_concurrent_extractions": 256}"#, Some(256)), (r#"{"max_concurrent_extractions": 0}"#, Some(0)), // Edge case: 0 extractions ]; for (config_str, expected_value) in test_cases { let config: ExtractionConfig = serde_json::from_str(config_str).unwrap_or_else(|_| panic!("Failed to parse: {}", config_str)); assert_eq!( config.max_concurrent_extractions, expected_value, "Numeric values should be parsed correctly" ); } } #[test] fn test_cli_boolean_values_strict_parsing() { // Test that boolean values are strictly true/false, not truthy/falsy let test_cases = vec![(r#"{"use_cache": true}"#, true), (r#"{"use_cache": false}"#, false)]; for (config_str, expected_value) in test_cases { let config: ExtractionConfig = serde_json::from_str(config_str).unwrap_or_else(|_| panic!("Failed to parse: {}", config_str)); assert_eq!(config.use_cache, expected_value); } } #[test] fn test_cli_config_consistency_across_formats() { // Create a config programmatically let programmatic_config = ExtractionConfig { use_cache: false, enable_quality_processing: true, force_ocr: true, output_format: OutputFormat::Markdown, max_concurrent_extractions: Some(4), ..Default::default() }; // Serialize it let serialized_json = serde_json::to_value(&programmatic_config).expect("Failed to serialize"); // Deserialize back from JSON string (simulating CLI parsing) let json_string = serialized_json.to_string(); let deserialized: ExtractionConfig = serde_json::from_str(&json_string).expect("Failed to deserialize from string"); // Verify complete roundtrip assert_eq!(deserialized.use_cache, programmatic_config.use_cache); assert_eq!( deserialized.enable_quality_processing, programmatic_config.enable_quality_processing ); assert_eq!(deserialized.force_ocr, programmatic_config.force_ocr); assert_eq!(deserialized.output_format, programmatic_config.output_format); assert_eq!( deserialized.max_concurrent_extractions, programmatic_config.max_concurrent_extractions ); } // Re-export needed for base64 test (moved to end of file) // Re-export needed for base64 test (imported at top of file)