Files
fil/crates/kreuzberg/tests/contract_mcp.rs

315 lines
10 KiB
Rust
Raw Normal View History

2026-06-01 23:40:55 +02:00
//! MCP contract tests - verify MCP config matches Rust core
//!
//! This test suite validates that MCP (Model Context Protocol) configuration
//! produces identical JSON to the Rust core library when parsing configuration.
//! This ensures that MCP users get the same configuration behavior as CLI and SDK users.
use kreuzberg::core::config::ExtractionConfig;
use kreuzberg::core::config::OutputFormat;
use serde_json::json;
#[test]
fn test_mcp_basic_config_json_matches_rust_core() {
// Create config via Rust core
let rust_config = ExtractionConfig {
use_cache: true,
enable_quality_processing: true,
force_ocr: false,
output_format: OutputFormat::Plain,
result_format: kreuzberg::types::ResultFormat::Unified,
..Default::default()
};
let rust_json = serde_json::to_value(&rust_config).expect("Failed to serialize rust config");
// Simulate MCP config parameter deserialization
let mcp_json = json!({
"use_cache": true,
"enable_quality_processing": true,
"force_ocr": false,
"output_format": "plain",
"result_format": "unified"
});
let mcp_config: ExtractionConfig =
serde_json::from_value(mcp_json.clone()).expect("Failed to deserialize MCP config");
let mcp_serialized = serde_json::to_value(&mcp_config).expect("Failed to serialize MCP config");
// Verify they produce identical JSON for the relevant fields
assert_eq!(
rust_json.get("use_cache"),
mcp_serialized.get("use_cache"),
"MCP use_cache must match Rust core"
);
assert_eq!(
rust_json.get("enable_quality_processing"),
mcp_serialized.get("enable_quality_processing"),
"MCP enable_quality_processing must match Rust core"
);
assert_eq!(
rust_json.get("force_ocr"),
mcp_serialized.get("force_ocr"),
"MCP force_ocr must match Rust core"
);
assert_eq!(
rust_json.get("output_format"),
mcp_serialized.get("output_format"),
"MCP output_format must match Rust core"
);
}
#[test]
fn test_mcp_ocr_config_nested_matches_rust_core() {
let mcp_json = json!({
"ocr": {
"backend": "tesseract"
},
"force_ocr": true
});
let config: ExtractionConfig = serde_json::from_value(mcp_json).expect("Failed to deserialize OCR config");
// Verify OCR config deserialized correctly
assert!(config.ocr.is_some(), "OCR config should be present");
assert!(config.force_ocr, "force_ocr should be true");
if let Some(ocr) = &config.ocr {
assert_eq!(ocr.backend, "tesseract", "OCR backend should be tesseract");
}
// Verify roundtrip
let serialized = serde_json::to_value(&config).expect("Failed to serialize");
assert!(serialized.get("ocr").is_some(), "Serialized config should include ocr");
}
#[test]
fn test_mcp_chunking_config_nested_matches_rust_core() {
let mcp_json = json!({
"chunking": {
"max_chars": 500,
"max_overlap": 50,
"strategy": "sliding_window"
}
});
let config: ExtractionConfig = serde_json::from_value(mcp_json).expect("Failed to deserialize chunking config");
// Verify chunking config deserialized correctly
assert!(config.chunking.is_some(), "Chunking config should be present");
if let Some(chunking) = &config.chunking {
assert_eq!(chunking.max_characters, 500, "max_chars should be 500");
assert_eq!(chunking.overlap, 50, "max_overlap should be 50");
}
// Verify roundtrip
let serialized = serde_json::to_value(&config).expect("Failed to serialize");
assert!(
serialized.get("chunking").is_some(),
"Serialized config should include chunking"
);
}
#[test]
fn test_mcp_full_config_preserves_all_fields() {
let full_config_json = json!({
"use_cache": false,
"enable_quality_processing": true,
"force_ocr": true,
"output_format": "markdown",
"result_format": "unified",
"max_concurrent_extractions": 8,
"ocr": {
"backend": "tesseract"
},
"chunking": {
"max_chars": 1000,
"max_overlap": 200
}
});
let config: ExtractionConfig =
serde_json::from_value(full_config_json.clone()).expect("Failed to deserialize full config");
let roundtrip_json = serde_json::to_value(&config).expect("Failed to serialize");
// Verify all top-level fields preserved
assert!(!config.use_cache, "use_cache should be false");
assert!(
config.enable_quality_processing,
"enable_quality_processing should be true"
);
assert!(config.force_ocr, "force_ocr should be true");
assert_eq!(
config.max_concurrent_extractions,
Some(8),
"max_concurrent_extractions should be 8"
);
// Verify nested fields preserved
assert!(config.ocr.is_some(), "OCR config should be present");
assert!(config.chunking.is_some(), "Chunking config should be present");
// Verify roundtrip integrity
assert_eq!(
roundtrip_json.get("use_cache"),
full_config_json.get("use_cache"),
"use_cache should survive roundtrip"
);
assert_eq!(
roundtrip_json.get("force_ocr"),
full_config_json.get("force_ocr"),
"force_ocr should survive roundtrip"
);
assert_eq!(
roundtrip_json.get("max_concurrent_extractions"),
full_config_json.get("max_concurrent_extractions"),
"max_concurrent_extractions should survive roundtrip"
);
}
#[test]
fn test_mcp_default_config_matches_rust_core_defaults() {
// Create Rust core default
let rust_default = ExtractionConfig::default();
let rust_json = serde_json::to_value(&rust_default).expect("Failed to serialize default");
// Create empty JSON (simulates MCP with no overrides)
let mcp_json = json!({});
let mcp_config: ExtractionConfig = serde_json::from_value(mcp_json).expect("Failed to deserialize empty config");
let mcp_json_serialized = serde_json::to_value(&mcp_config).expect("Failed to serialize MCP default");
// Verify defaults match
assert_eq!(
mcp_json_serialized.get("use_cache"),
rust_json.get("use_cache"),
"use_cache default should match"
);
assert_eq!(
mcp_json_serialized.get("enable_quality_processing"),
rust_json.get("enable_quality_processing"),
"enable_quality_processing default should match"
);
assert_eq!(
mcp_json_serialized.get("force_ocr"),
rust_json.get("force_ocr"),
"force_ocr default should match"
);
assert_eq!(
mcp_json_serialized.get("result_format"),
rust_json.get("result_format"),
"result_format default should match"
);
assert_eq!(
mcp_json_serialized.get("output_format"),
rust_json.get("output_format"),
"output_format default should match"
);
}
#[test]
fn test_mcp_output_format_values_are_valid() {
// Test all valid output format values (lowercase, as per serde rename_all)
let valid_formats = vec!["plain", "markdown", "html"];
for format in valid_formats {
let mcp_json = json!({
"output_format": format
});
let result = serde_json::from_value::<ExtractionConfig>(mcp_json);
assert!(result.is_ok(), "Format '{}' should deserialize successfully", format);
let config = result.unwrap();
assert!(
!config.output_format.to_string().is_empty(),
"Deserialized format should have valid string representation"
);
}
}
#[test]
fn test_mcp_result_format_values_are_valid() {
// Test valid result format values (lowercase, as per serde rename_all)
let valid_formats = vec!["unified", "element_based"];
for format in valid_formats {
let mcp_json = json!({
"result_format": format
});
let result = serde_json::from_value::<ExtractionConfig>(mcp_json);
assert!(
result.is_ok(),
"Result format '{}' should deserialize successfully",
format
);
}
}
#[test]
fn test_mcp_partial_override_preserves_defaults() {
// Create a partial config that overrides only one field
let partial_json = json!({
"force_ocr": true
});
let config: ExtractionConfig = serde_json::from_value(partial_json).expect("Failed to deserialize partial config");
// Verify override applied
assert!(config.force_ocr, "force_ocr override should be applied");
// Verify defaults preserved for other fields
assert!(config.use_cache, "use_cache should retain default when not overridden");
assert!(
config.enable_quality_processing,
"enable_quality_processing should retain default when not overridden"
);
}
#[test]
fn test_mcp_error_handling_for_invalid_json() {
// Test that invalid format values produce errors (or are handled gracefully)
let invalid_json = json!({
"output_format": "InvalidFormat"
});
let result = serde_json::from_value::<ExtractionConfig>(invalid_json);
// The deserialization should either fail or parse to a valid state
// depending on how OutputFormat handles unknown values
if let Ok(config) = result {
let _ = config.output_format.to_string();
}
}
#[test]
fn test_mcp_concurrent_extractions_override() {
let mcp_json = json!({
"max_concurrent_extractions": 16
});
let config: ExtractionConfig =
serde_json::from_value(mcp_json).expect("Failed to deserialize config with concurrent extractions");
assert_eq!(
config.max_concurrent_extractions,
Some(16),
"max_concurrent_extractions should be overridden to 16"
);
}
#[test]
fn test_mcp_config_json_keys_case_sensitive() {
// Verify that config JSON keys are case-sensitive
let lowercase_json = json!({
"use_cache": true,
"force_ocr": false
});
let config: ExtractionConfig =
serde_json::from_value(lowercase_json).expect("Failed to deserialize lowercase config");
assert!(config.use_cache, "use_cache should be true");
assert!(!config.force_ocr, "force_ocr should be false");
// Note: serde by default fails on unknown fields, so camelCase would fail
// This test documents the expected behavior
}