858 lines
29 KiB
Rust
858 lines
29 KiB
Rust
|
|
//! MCP integration tests for API consistency and breaking changes.
|
||
|
|
//!
|
||
|
|
//! This test suite validates that:
|
||
|
|
//! 1. MCP parameters properly handle extraction configuration
|
||
|
|
//! 2. MCP parameter deserialization is consistent
|
||
|
|
//! 3. Various config combinations work correctly
|
||
|
|
//! 4. End-to-end MCP tool invocations work with real data
|
||
|
|
//! 5. Error handling is consistent across MCP tools
|
||
|
|
//!
|
||
|
|
//! Note: These tests verify the parameter structures used by MCP.
|
||
|
|
//! The build_config function in the MCP server should accept
|
||
|
|
//! a config JSON field instead of separate enable_ocr/force_ocr flags
|
||
|
|
//! to align with the new API consistency approach.
|
||
|
|
|
||
|
|
#![allow(clippy::bool_assert_comparison)]
|
||
|
|
#![allow(clippy::field_reassign_with_default)]
|
||
|
|
|
||
|
|
use serde_json::json;
|
||
|
|
|
||
|
|
/// Verifies that a flat JSON object deserializes directly into `ExtractionConfig`.
#[test]
fn test_extraction_config_parameter_structure() {
    use kreuzberg::core::config::{ExtractionConfig, OutputFormat};

    // The new approach: a single config JSON object rather than separate flags.
    let raw = json!({
        "use_cache": true,
        "force_ocr": true,
        "output_format": "markdown",
    });

    let parsed = serde_json::from_value::<ExtractionConfig>(raw).expect("Failed to parse config");

    assert_eq!(parsed.use_cache, true);
    assert_eq!(parsed.force_ocr, true);
    assert_eq!(parsed.output_format, OutputFormat::Markdown);
}
|
||
|
|
|
||
|
|
/// Checks that the `config` object nested inside an MCP-style request parses
/// as an `ExtractionConfig`.
#[test]
fn test_mcp_style_params_with_config() {
    use kreuzberg::core::config::ExtractionConfig;

    // MCP params carry the full config JSON under a single `config` key.
    let request = json!({
        "path": "/test.pdf",
        "mime_type": "application/pdf",
        "config": {
            "use_cache": false,
            "force_ocr": true,
            "output_format": "markdown",
        }
    });

    // Pull out the nested object and deserialize it on its own.
    let nested = request.get("config").expect("Should have config field").clone();
    let parsed: ExtractionConfig = serde_json::from_value(nested).expect("Failed to parse config");

    assert_eq!(parsed.force_ocr, true);
    assert_eq!(parsed.use_cache, false);
}
|
||
|
|
|
||
|
|
/// Confirms that a request carrying only `path` exposes that field as the expected string.
#[test]
fn test_mcp_params_backward_compatibility_minimal() {
    // The smallest params object a caller may send.
    let minimal = json!({ "path": "/test.pdf" });

    // The field must be present and hold the string that was supplied.
    let path_value = minimal.get("path").expect("Should have path");
    assert_eq!(path_value.as_str(), Some("/test.pdf"));
}
|
||
|
|
|
||
|
|
/// Parses a complete MCP params object (path, MIME type and config) and checks
/// every config field that the request sets.
///
/// The `config` key is required here: a missing key fails the test instead of
/// silently skipping all assertions.
#[test]
fn test_mcp_params_with_all_fields() {
    use kreuzberg::core::config::{ExtractionConfig, OutputFormat};

    // Complete MCP params with config
    let params = json!({
        "path": "/test.pdf",
        "mime_type": "application/pdf",
        "config": {
            "use_cache": true,
            "enable_quality_processing": true,
            "force_ocr": false,
            "output_format": "plain",
        }
    });

    // Extract and validate config; `expect` guarantees the assertions below run.
    let config_obj = params.get("config").expect("Should have config field");
    let config: ExtractionConfig = serde_json::from_value(config_obj.clone()).expect("Failed to parse");

    assert_eq!(config.use_cache, true);
    assert_eq!(config.enable_quality_processing, true);
    assert_eq!(config.force_ocr, false);
    assert_eq!(config.output_format, OutputFormat::Plain);
}
|
||
|
|
|
||
|
|
/// Checks the shape of batch params: a `paths` array plus a shared `config` object.
///
/// Both keys are required; a missing `config` fails the test rather than
/// skipping the config assertions.
#[test]
fn test_batch_extraction_params_structure() {
    use kreuzberg::core::config::ExtractionConfig;

    // Batch extraction params with paths and config
    let batch_params = json!({
        "paths": ["/file1.pdf", "/file2.pdf", "/file3.pdf"],
        "config": {
            "force_ocr": true,
            "max_concurrent_extractions": 4,
        }
    });

    let paths = batch_params.get("paths").expect("Should have paths");
    assert!(paths.is_array(), "paths field should be an array");
    let path_array = paths.as_array().expect("paths should be deserializable as array");
    assert_eq!(path_array.len(), 3, "paths array should contain exactly 3 elements");

    let config_obj = batch_params.get("config").expect("Should have config");
    let config: ExtractionConfig = serde_json::from_value(config_obj.clone()).expect("Failed to parse");
    assert_eq!(config.force_ocr, true);
    assert_eq!(config.max_concurrent_extractions, Some(4));
}
|
||
|
|
|
||
|
|
/// Walks through how request-supplied JSON interacts with `ExtractionConfig`
/// defaults: the default baseline, a single override, several overrides, and a
/// fully specified config.
#[test]
fn test_config_merge_in_mcp_context() {
    use kreuzberg::core::config::{ExtractionConfig, OutputFormat};

    // Local helper so each scenario reads as "JSON in, config out".
    let parse = |value: serde_json::Value, failure: &str| -> ExtractionConfig {
        serde_json::from_value(value).expect(failure)
    };

    // Scenario 1: baseline defaults.
    let baseline = ExtractionConfig::default();
    assert_eq!(baseline.use_cache, true, "Default cache should be enabled");
    assert_eq!(baseline.force_ocr, false, "Default force_ocr should be false");
    assert_eq!(
        baseline.output_format,
        OutputFormat::Plain,
        "Default output format should be Plain"
    );

    // Scenario 2: the request overrides exactly one field.
    let single = parse(json!({ "force_ocr": true }), "Failed to parse request config");
    assert_eq!(single.force_ocr, true, "Request force_ocr should be true");
    // Everything the request left out falls back to the defaults.
    assert_eq!(single.use_cache, true, "Unspecified use_cache should default to true");
    assert_eq!(
        single.output_format,
        OutputFormat::Plain,
        "Unspecified output_format should default to Plain"
    );

    // Scenario 3: the request overrides several fields at once.
    let several = parse(
        json!({
            "use_cache": false,
            "force_ocr": true,
            "output_format": "markdown",
        }),
        "Failed to parse multi-field config",
    );
    assert_eq!(several.use_cache, false, "Override use_cache should be false");
    assert_eq!(several.force_ocr, true, "Override force_ocr should be true");
    assert_eq!(
        several.output_format,
        OutputFormat::Markdown,
        "Override output_format should be Markdown"
    );
    // A numeric option that was not supplied must stay unset.
    match several.max_concurrent_extractions {
        None => {}
        Some(max_conc) => panic!(
            "max_concurrent_extractions should not be specified when not in request, got: {}",
            max_conc
        ),
    }

    // Scenario 4: every field used by this suite is supplied explicitly.
    let complete = parse(
        json!({
            "use_cache": false,
            "enable_quality_processing": true,
            "force_ocr": true,
            "output_format": "html",
            "max_concurrent_extractions": 8,
        }),
        "Failed to parse full config",
    );
    assert_eq!(complete.use_cache, false, "Full config use_cache should be false");
    assert_eq!(
        complete.enable_quality_processing, true,
        "Full config quality processing should be true"
    );
    assert_eq!(complete.force_ocr, true, "Full config force_ocr should be true");
    assert_eq!(
        complete.output_format,
        OutputFormat::Html,
        "Full config output_format should be Html"
    );
    assert_eq!(
        complete.max_concurrent_extractions,
        Some(8),
        "Full config max_concurrent should be 8"
    );
}
|
||
|
|
|
||
|
|
/// Confirms that config JSON with any subset of fields deserializes successfully.
#[test]
fn test_config_json_flexibility() {
    use kreuzberg::core::config::ExtractionConfig;

    let candidates = [
        // No fields at all: everything comes from defaults.
        json!({}),
        // Exactly one field.
        json!({"force_ocr": true}),
        // Several fields.
        json!({"force_ocr": true, "use_cache": false}),
        // A mix of value types (string and number).
        json!({"output_format": "markdown", "max_concurrent_extractions": 8}),
    ];

    for candidate in candidates {
        let outcome = serde_json::from_value::<ExtractionConfig>(candidate);
        assert!(outcome.is_ok(), "Config should deserialize successfully");
    }
}
|
||
|
|
|
||
|
|
/// Serializes a customized `ExtractionConfig` to JSON and checks that the
/// customized fields survive deserialization.
#[test]
fn test_extraction_config_serialization_for_mcp() {
    use kreuzberg::core::config::{ExtractionConfig, OutputFormat};

    // Start from the defaults and change two fields.
    let source = ExtractionConfig {
        force_ocr: true,
        output_format: OutputFormat::Markdown,
        ..Default::default()
    };

    // Config -> JSON -> config.
    let as_json = serde_json::to_value(&source).expect("Failed to serialize");
    let round_tripped = serde_json::from_value::<ExtractionConfig>(as_json).expect("Failed to deserialize");

    assert_eq!(source.force_ocr, round_tripped.force_ocr);
    assert_eq!(source.output_format, round_tripped.output_format);
}
|
||
|
|
|
||
|
|
// ============================================================================
|
||
|
|
// E2E TEST CASES
|
||
|
|
// ============================================================================
|
||
|
|
|
||
|
|
/// Parses a config that sets every option used by this suite and checks each
/// parsed value.
#[test]
fn test_mcp_config_full_extraction() {
    use kreuzberg::core::config::{ExtractionConfig, OutputFormat};

    let raw = json!({
        "use_cache": false,
        "enable_quality_processing": true,
        "force_ocr": false,
        "output_format": "markdown",
        "max_concurrent_extractions": 4,
    });

    let parsed = serde_json::from_value::<ExtractionConfig>(raw).expect("Failed to parse full config");

    // Each field must hold the value supplied in the JSON above.
    assert_eq!(parsed.use_cache, false);
    assert_eq!(parsed.enable_quality_processing, true);
    assert_eq!(parsed.force_ocr, false);
    assert_eq!(parsed.output_format, OutputFormat::Markdown);
    assert_eq!(parsed.max_concurrent_extractions, Some(4));
}
|
||
|
|
|
||
|
|
/// Checks that `"output_format": "markdown"` maps to `OutputFormat::Markdown`.
#[test]
fn test_mcp_config_output_format_markdown() {
    use kreuzberg::core::config::{ExtractionConfig, OutputFormat};

    let parsed = serde_json::from_value::<ExtractionConfig>(json!({ "output_format": "markdown" }))
        .expect("Failed to parse markdown config");

    assert_eq!(parsed.output_format, OutputFormat::Markdown);
}
|
||
|
|
|
||
|
|
/// Parses a config combining markdown output, caching and quality processing,
/// and checks all three parsed values.
#[test]
fn test_mcp_config_result_format_element_based() {
    use kreuzberg::core::config::{ExtractionConfig, OutputFormat};

    let raw = json!({
        "output_format": "markdown",
        "use_cache": true,
        "enable_quality_processing": true,
    });

    let parsed: ExtractionConfig = serde_json::from_value(raw).expect("Failed to parse element format");

    assert_eq!(parsed.output_format, OutputFormat::Markdown);
    assert_eq!(parsed.use_cache, true);
    assert_eq!(parsed.enable_quality_processing, true);
}
|
||
|
|
|
||
|
|
/// Checks a batch request: three paths plus one shared config object.
#[test]
fn test_mcp_batch_with_config() {
    use kreuzberg::core::config::{ExtractionConfig, OutputFormat};

    let request = json!({
        "paths": ["/file1.txt", "/file2.txt", "/file3.txt"],
        "config": {
            "force_ocr": true,
            "output_format": "plain",
            "max_concurrent_extractions": 2,
        }
    });

    // `paths` must be a JSON array with the three entries supplied above.
    let paths_value = request.get("paths").expect("Should have paths");
    assert!(paths_value.is_array(), "paths field should be an array");
    let entries = paths_value.as_array().expect("paths should be deserializable as array");
    assert_eq!(entries.len(), 3, "paths array should contain exactly 3 elements");

    // The shared config must parse and carry the requested values.
    let shared = request.get("config").expect("Should have config").clone();
    let parsed = serde_json::from_value::<ExtractionConfig>(shared).expect("Failed to parse batch config");

    assert_eq!(parsed.force_ocr, true);
    assert_eq!(parsed.output_format, OutputFormat::Plain);
    assert_eq!(parsed.max_concurrent_extractions, Some(2));
}
|
||
|
|
|
||
|
|
/// Checks that text which is not JSON produces an error instead of a config.
#[test]
fn test_mcp_invalid_config_json_error() {
    use kreuzberg::core::config::ExtractionConfig;

    let malformed = "not a valid json object";

    let outcome = serde_json::from_str::<ExtractionConfig>(malformed);

    assert!(outcome.is_err(), "Invalid JSON should produce error");
}
|
||
|
|
|
||
|
|
/// Checks that values supplied in a request's inline `config` object take
/// precedence over the `ExtractionConfig` defaults.
///
/// The `config` key is required; a missing key fails the test instead of
/// skipping every assertion.
#[test]
fn test_mcp_config_overrides() {
    use kreuzberg::core::config::{ExtractionConfig, OutputFormat};

    // Simulate MCP request with inline config
    let mcp_params = json!({
        "path": "/document.pdf",
        "mime_type": "application/pdf",
        "config": {
            "force_ocr": true,
            "use_cache": false,
            "output_format": "markdown",
        }
    });

    let config_obj = mcp_params.get("config").expect("Should have config");
    let parsed_config: ExtractionConfig = serde_json::from_value(config_obj.clone()).expect("Failed to parse");

    // Verify request config overrides defaults
    assert_eq!(parsed_config.force_ocr, true);
    assert_eq!(parsed_config.use_cache, false);
    assert_eq!(parsed_config.output_format, OutputFormat::Markdown);
}
|
||
|
|
|
||
|
|
/// Contrasts the deprecated flat-flag request shape with the supported
/// `config`-object shape.
///
/// This test only inspects JSON literals and parses the supported shape. It
/// does not call the MCP server's parameter handling, so on its own it cannot
/// prove that the deprecated flags are rejected.
#[test]
fn test_mcp_no_deprecated_params() {
    use kreuzberg::core::config::ExtractionConfig;

    // This simulates MCP params that incorrectly use separate flags
    let deprecated_params = json!({
        "path": "/document.pdf",
        "enable_ocr": true,  // deprecated!
        "force_ocr": true,   // should be in config
    });

    // The correct approach: config field contains all options
    let correct_params = json!({
        "path": "/document.pdf",
        "config": {
            "force_ocr": true,
        }
    });

    // Extract and verify correct params; a missing `config` key is a test failure.
    let config_obj = correct_params.get("config").expect("Should have config");
    let config: ExtractionConfig = serde_json::from_value(config_obj.clone()).expect("Failed to parse");
    assert_eq!(config.force_ocr, true);

    // The deprecated shape has no `config` object at all.
    assert!(
        deprecated_params.get("config").is_none(),
        "Deprecated params should not be in config"
    );
}
|
||
|
|
|
||
|
|
/// End-to-end test with real text extraction
|
||
|
|
#[tokio::test]
|
||
|
|
async fn test_mcp_real_pdf_extraction() {
|
||
|
|
// Create a simple test document in bytes
|
||
|
|
let test_content = b"Hello, MCP!";
|
||
|
|
|
||
|
|
// Create MCP request structure
|
||
|
|
let mcp_request = json!({
|
||
|
|
"mime_type": "text/plain",
|
||
|
|
"config": {
|
||
|
|
"output_format": "plain",
|
||
|
|
"use_cache": false,
|
||
|
|
}
|
||
|
|
});
|
||
|
|
|
||
|
|
// Extract config from request
|
||
|
|
if let Some(config_obj) = mcp_request.get("config") {
|
||
|
|
let config: kreuzberg::core::config::ExtractionConfig =
|
||
|
|
serde_json::from_value(config_obj.clone()).expect("Failed to parse config");
|
||
|
|
|
||
|
|
// Use async extract_bytes to process content
|
||
|
|
let result = kreuzberg::extract_bytes(test_content, "text/plain", &config)
|
||
|
|
.await
|
||
|
|
.expect("Extraction should succeed");
|
||
|
|
|
||
|
|
// Verify result has content
|
||
|
|
assert!(!result.content.is_empty());
|
||
|
|
assert!(result.content.contains("MCP") || result.content.contains("Hello"));
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
/// Checks a batch request that lists files of different MIME types alongside
/// one shared config object.
///
/// The `config` key is required; a missing key fails the test instead of
/// skipping the config assertions.
#[test]
fn test_mcp_batch_mixed_formats() {
    use kreuzberg::core::config::{ExtractionConfig, OutputFormat};

    let batch_config = json!({
        "files": [
            {
                "path": "/document.pdf",
                "mime_type": "application/pdf",
            },
            {
                "path": "/document.docx",
                "mime_type": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
            },
            {
                "path": "/document.txt",
                "mime_type": "text/plain",
            }
        ],
        "config": {
            "output_format": "markdown",
            "force_ocr": false,
        }
    });

    let files = batch_config.get("files").expect("Should have files");
    assert!(files.is_array(), "files field should be an array");
    let file_array = files.as_array().expect("files should be deserializable as array");
    assert_eq!(file_array.len(), 3, "files array should contain exactly 3 elements");

    let config_obj = batch_config.get("config").expect("Should have config");
    let config: ExtractionConfig =
        serde_json::from_value(config_obj.clone()).expect("Failed to parse batch config");
    assert_eq!(config.output_format, OutputFormat::Markdown);
    assert_eq!(config.force_ocr, false);
}
|
||
|
|
|
||
|
|
/// Checks that a request without a `config` object falls back to the default
/// `ExtractionConfig`.
#[test]
fn test_mcp_minimal_config() {
    use kreuzberg::core::config::{ExtractionConfig, OutputFormat};

    let request = json!({ "path": "/document.pdf" });

    // The path must be present and unchanged.
    let expected_path = serde_json::Value::String("/document.pdf".to_string());
    assert_eq!(
        request.get("path"),
        Some(&expected_path),
        "Path field should be present and set to /document.pdf"
    );

    // Parse the config if one was sent; otherwise use the defaults.
    let effective: ExtractionConfig = request
        .get("config")
        .map(|value| serde_json::from_value(value.clone()).expect("Failed to parse config from minimal request"))
        .unwrap_or_default();

    // No config was sent, so the default values are expected.
    assert_eq!(effective.use_cache, true);
    assert_eq!(effective.output_format, OutputFormat::Plain);
}
|
||
|
|
|
||
|
|
/// Checks that each built-in output format name parses and displays as the
/// same string it was parsed from.
#[test]
fn test_mcp_all_output_formats() {
    use kreuzberg::core::config::ExtractionConfig;

    for name in ["plain", "markdown", "html"] {
        let parsed = serde_json::from_value::<ExtractionConfig>(json!({ "output_format": name }))
            .expect("Failed to parse output format config");

        // The `Display` output of the parsed format must equal the input name.
        let shown = parsed.output_format.to_string();
        assert_eq!(shown, name);
    }
}
|
||
|
|
|
||
|
|
/// Checks that several `max_concurrent_extractions` values parse to the same
/// number wrapped in `Some`.
#[test]
fn test_mcp_concurrent_extraction_config() {
    use kreuzberg::core::config::ExtractionConfig;

    for limit in [1, 2, 4, 8, 16] {
        let raw = json!({ "max_concurrent_extractions": limit });

        let parsed: ExtractionConfig = serde_json::from_value(raw).expect("Failed to parse concurrent config");

        assert_eq!(parsed.max_concurrent_extractions, Some(limit));
    }
}
|
||
|
|
|
||
|
|
/// Checks that a config with caching disabled and OCR forced parses with both values intact.
#[test]
fn test_mcp_cache_disabled_config() {
    use kreuzberg::core::config::ExtractionConfig;

    let raw = json!({
        "use_cache": false,
        "force_ocr": true,
    });

    let parsed = serde_json::from_value::<ExtractionConfig>(raw).expect("Failed to parse cache config");

    assert_eq!(parsed.use_cache, false);
    assert_eq!(parsed.force_ocr, true);
}
|
||
|
|
|
||
|
|
/// Builds a config in code, sends it through JSON and back, and compares the
/// five explicitly set fields.
#[test]
fn test_mcp_config_round_trip_serialization() {
    use kreuzberg::core::config::{ExtractionConfig, OutputFormat};

    let before = ExtractionConfig {
        use_cache: false,
        enable_quality_processing: true,
        force_ocr: true,
        output_format: OutputFormat::Markdown,
        max_concurrent_extractions: Some(4),
        ..Default::default()
    };

    // Config -> JSON value.
    let encoded = serde_json::to_value(&before).expect("Failed to serialize");

    // JSON value -> config.
    let after = serde_json::from_value::<ExtractionConfig>(encoded).expect("Failed to deserialize");

    // Every explicitly set field must come back unchanged.
    assert_eq!(before.use_cache, after.use_cache);
    assert_eq!(before.enable_quality_processing, after.enable_quality_processing);
    assert_eq!(before.force_ocr, after.force_ocr);
    assert_eq!(before.output_format, after.output_format);
    assert_eq!(before.max_concurrent_extractions, after.max_concurrent_extractions);
}
|
||
|
|
|
||
|
|
/// Test MCP tool invocation with extract_bytes semantics
|
||
|
|
#[tokio::test]
|
||
|
|
async fn test_mcp_tool_extract_bytes_semantics() {
|
||
|
|
let test_bytes = b"Test content for MCP extraction";
|
||
|
|
let mime_type = "text/plain";
|
||
|
|
|
||
|
|
let config_json = json!({
|
||
|
|
"output_format": "plain",
|
||
|
|
});
|
||
|
|
|
||
|
|
let config: kreuzberg::core::config::ExtractionConfig =
|
||
|
|
serde_json::from_value(config_json).expect("Failed to parse config");
|
||
|
|
|
||
|
|
// Simulate MCP tool: extract_bytes
|
||
|
|
let result = kreuzberg::extract_bytes(test_bytes, mime_type, &config)
|
||
|
|
.await
|
||
|
|
.expect("Extraction should succeed");
|
||
|
|
|
||
|
|
assert!(!result.content.is_empty());
|
||
|
|
assert!(result.mime_type.contains("text"));
|
||
|
|
}
|
||
|
|
|
||
|
|
/// Mirrors the MCP `extract_file` tool: writes a temporary text file, then
/// extracts it with `kreuzberg::extract_file_sync`.
///
/// The extraction always runs. The file was just written with a checked
/// `fs::write`, so a missing file is a test failure rather than a reason to
/// skip the assertions.
#[test]
fn test_mcp_tool_extract_file_semantics() {
    use kreuzberg::core::config::ExtractionConfig;

    // Create temporary test file; `test_dir` must stay alive until the end of the test.
    let test_dir = tempfile::tempdir().expect("Failed to create temp dir");
    let test_file = test_dir.path().join("test.txt");
    std::fs::write(&test_file, b"Test content").expect("Failed to write test file");
    assert!(test_file.exists(), "test file should exist after being written");

    let config_json = json!({
        "output_format": "plain",
    });

    let config: ExtractionConfig = serde_json::from_value(config_json).expect("Failed to parse config");

    // Simulate MCP tool: extract_file (sync)
    let file_path = test_file.to_str().expect("test_file path should be valid UTF-8");
    let result = kreuzberg::extract_file_sync(file_path, None, &config).expect("Extraction should succeed");

    assert!(!result.content.is_empty());
}
|
||
|
|
|
||
|
|
/// Test MCP batch extraction semantics
|
||
|
|
#[tokio::test]
|
||
|
|
async fn test_mcp_batch_extraction_semantics() {
|
||
|
|
let test_bytes_1 = b"Content 1";
|
||
|
|
let test_bytes_2 = b"Content 2";
|
||
|
|
let mime_type = "text/plain";
|
||
|
|
|
||
|
|
let config_json = json!({
|
||
|
|
"output_format": "plain",
|
||
|
|
});
|
||
|
|
|
||
|
|
let config: kreuzberg::core::config::ExtractionConfig =
|
||
|
|
serde_json::from_value(config_json).expect("Failed to parse config");
|
||
|
|
|
||
|
|
// Simulate MCP batch tool: batch_extract_bytes
|
||
|
|
let test_data = vec![
|
||
|
|
(test_bytes_1.to_vec(), mime_type.to_string()),
|
||
|
|
(test_bytes_2.to_vec(), mime_type.to_string()),
|
||
|
|
];
|
||
|
|
|
||
|
|
// Extract each item
|
||
|
|
for (bytes, mime) in test_data {
|
||
|
|
let result = kreuzberg::extract_bytes(&bytes, &mime, &config)
|
||
|
|
.await
|
||
|
|
.expect("Batch extraction should succeed");
|
||
|
|
assert!(!result.content.is_empty());
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
/// Test MCP config deserialization with unknown format string.
///
/// `OutputFormat` has a `Custom(String)` catch-all variant, so an unknown format
/// string deserializes successfully rather than erroring. This allows registering
/// custom renderers by name.
///
/// Despite the function name, the expected outcome here is success, not an error.
#[test]
fn test_mcp_error_invalid_format_field() {
    // Same `OutputFormat` path as the other tests in this file, for consistency.
    use kreuzberg::core::config::{ExtractionConfig, OutputFormat};

    let config_json = json!({
        "output_format": "invalid_format_that_does_not_exist",
    });

    // Custom formats are accepted at deserialization time; unknown names produce Custom(...).
    // `expect` surfaces the underlying serde error if this ever stops being true.
    let config: ExtractionConfig =
        serde_json::from_value(config_json).expect("Unknown output_format names should deserialize as Custom");

    assert_eq!(
        config.output_format,
        OutputFormat::Custom("invalid_format_that_does_not_exist".to_string())
    );
}
|
||
|
|
|
||
|
|
/// Checks that a zero `max_concurrent_extractions` is accepted by
/// deserialization and kept as `Some(0)`.
#[test]
fn test_mcp_validate_zero_concurrent() {
    use kreuzberg::core::config::ExtractionConfig;

    // serde accepts zero; rejecting it semantically is left to the MCP server.
    let raw = json!({ "max_concurrent_extractions": 0 });

    let parsed = serde_json::from_value::<ExtractionConfig>(raw).expect("Failed to parse");

    // The value is kept exactly as supplied.
    assert_eq!(parsed.max_concurrent_extractions, Some(0));
}
|
||
|
|
|
||
|
|
/// Checks that a batch request with an empty `paths` list still exposes
/// `paths` as an (empty) array.
#[test]
fn test_mcp_empty_batch_handling() {
    let request = json!({
        "paths": [],
        "config": {
            "output_format": "plain",
        }
    });

    let paths_value = request.get("paths").expect("Should have paths");
    assert!(paths_value.is_array(), "paths field should be an array");

    let entries = paths_value.as_array().expect("paths should be deserializable as array");
    assert_eq!(entries.len(), 0, "paths array should be empty");
}
|
||
|
|
|
||
|
|
/// Checks that a config nested under `parameters.config` in a tool-call
/// envelope can be extracted and parsed.
///
/// Both nesting levels are required; if either key is missing the test fails
/// instead of skipping the assertions.
#[test]
fn test_mcp_nested_config_extraction() {
    use kreuzberg::core::config::{ExtractionConfig, OutputFormat};

    let nested_request = json!({
        "tool": "extract_file",
        "parameters": {
            "path": "/document.pdf",
            "config": {
                "output_format": "markdown",
                "force_ocr": true,
            }
        }
    });

    // Walk `parameters` -> `config`; a missing level yields `None` and fails the `expect`.
    let config_obj = nested_request
        .get("parameters")
        .and_then(|params| params.get("config"))
        .expect("Nested request should contain parameters.config");

    let config: ExtractionConfig =
        serde_json::from_value(config_obj.clone()).expect("Failed to parse nested config");

    assert_eq!(config.output_format, OutputFormat::Markdown);
    assert_eq!(config.force_ocr, true);
}
|
||
|
|
|
||
|
|
/// Checks that `"output_format": "html"` maps to `OutputFormat::Html`.
#[test]
fn test_mcp_html_output_format() {
    use kreuzberg::core::config::{ExtractionConfig, OutputFormat};

    let parsed = serde_json::from_value::<ExtractionConfig>(json!({ "output_format": "html" }))
        .expect("Failed to parse HTML config");

    assert_eq!(parsed.output_format, OutputFormat::Html);
}
|
||
|
|
|
||
|
|
/// Checks every combination of the `use_cache` and
/// `enable_quality_processing` flags.
#[test]
fn test_mcp_boolean_combinations() {
    use kreuzberg::core::config::ExtractionConfig;

    // Nested loops visit (true, true), (true, false), (false, true), (false, false).
    for cache_flag in [true, false] {
        for quality_flag in [true, false] {
            let raw = json!({
                "use_cache": cache_flag,
                "enable_quality_processing": quality_flag,
            });

            let parsed = serde_json::from_value::<ExtractionConfig>(raw).expect("Failed to parse config");

            assert_eq!(parsed.use_cache, cache_flag);
            assert_eq!(parsed.enable_quality_processing, quality_flag);
        }
    }
}
|
||
|
|
|
||
|
|
/// Checks the top-level shape of a sample MCP response: a `status` string and
/// a `data` object.
#[test]
fn test_mcp_response_structure_validation() {
    let response = json!({
        "status": "success",
        "data": {
            "content": "Extracted text",
            "mime_type": "text/plain",
            "metadata": {
                "source": "test",
                "extracted_at": "2024-01-25",
            }
        }
    });

    let status = response.get("status").expect("status field should exist");
    assert_eq!(status, "success");

    let payload = response.get("data");
    assert!(payload.is_some(), "data field should be present in MCP response");
}
|
||
|
|
|
||
|
|
/// Parses a request config, serializes it again, and checks that selected
/// fields match the original JSON.
#[test]
fn test_mcp_request_response_roundtrip() {
    use kreuzberg::core::config::ExtractionConfig;

    let sent = json!({
        "use_cache": false,
        "force_ocr": true,
        "output_format": "markdown",
        "max_concurrent_extractions": 4,
    });

    // Inbound: JSON -> typed config.
    let typed = serde_json::from_value::<ExtractionConfig>(sent.clone()).expect("Failed to parse");

    // Outbound: typed config -> JSON.
    let returned = serde_json::to_value(&typed).expect("Failed to serialize");

    // Compare the fields one key at a time, in the same order as before.
    for key in ["use_cache", "force_ocr", "output_format"] {
        assert_eq!(sent.get(key), returned.get(key));
    }
}
|
||
|
|
|
||
|
|
/// Checks that a partial update *replaces* the config rather than merging
/// into a previously held one.
///
/// The base config is given a non-default `use_cache`. If the update were
/// merged into it, `use_cache` would keep that value; because the update is
/// parsed on its own, `use_cache` comes back as the default instead.
#[test]
fn test_mcp_config_partial_updates() {
    use kreuzberg::core::config::ExtractionConfig;

    let defaults = ExtractionConfig::default();

    // Use the opposite of the default so "merged" and "replaced" give different results.
    let base_config = ExtractionConfig {
        use_cache: !defaults.use_cache,
        force_ocr: false,
        ..Default::default()
    };

    // Partial update
    let update_json = json!({
        "force_ocr": true,
    });

    // In MCP, updates replace config completely
    let updated: ExtractionConfig = serde_json::from_value(update_json).expect("Failed to parse update");

    // New config has update applied
    assert_eq!(updated.force_ocr, true);
    // But other fields revert to defaults (not merged)
    assert_eq!(updated.use_cache, defaults.use_cache);
    assert_ne!(
        updated.use_cache, base_config.use_cache,
        "Update must not inherit use_cache from the previous config"
    );
}
|
||
|
|
|
||
|
|
/// For each built-in format name, checks that parse -> serialize -> parse
/// yields the same displayed output format.
#[test]
fn test_mcp_api_consistency_all_formats() {
    use kreuzberg::core::config::ExtractionConfig;

    for name in ["plain", "markdown", "html"] {
        // First pass: name -> config.
        let first_pass = serde_json::from_value::<ExtractionConfig>(json!({ "output_format": name }))
            .expect("Failed to parse");

        // Second pass: config -> JSON -> config.
        let encoded = serde_json::to_value(&first_pass).expect("Failed to serialize");
        let second_pass = serde_json::from_value::<ExtractionConfig>(encoded).expect("Failed to deserialize");

        // Both passes must display the same format string.
        let shown_first = first_pass.output_format.to_string();
        let shown_second = second_pass.output_format.to_string();
        assert_eq!(shown_first, shown_second);
    }
}
|