This commit is contained in:
603
crates/kreuzberg-cli/tests/e2e_config_test.rs
Normal file
603
crates/kreuzberg-cli/tests/e2e_config_test.rs
Normal file
@@ -0,0 +1,603 @@
|
||||
//! Comprehensive CLI end-to-end integration tests for configuration flags.
|
||||
//!
|
||||
//! This test suite validates the new configuration features including:
|
||||
//! - `--config-json` for inline JSON configuration
|
||||
//! - `--config-json-base64` for base64-encoded JSON configuration
|
||||
//! - `--output-format` flag with all variants (plain, markdown, djot, html)
|
||||
//! - Flag precedence (CLI args > JSON config > file > defaults)
|
||||
//! - Config merge scenarios and conflict detection
|
||||
//! - Error handling for invalid inputs
|
||||
//! - Real extraction with new formats
|
||||
|
||||
#![allow(clippy::bool_assert_comparison)]
|
||||
|
||||
use std::path::PathBuf;
|
||||
use std::process::Command;
|
||||
use tempfile::TempDir;
|
||||
|
||||
/// Get the path to the kreuzberg binary.
|
||||
fn get_binary_path() -> String {
|
||||
let manifest_dir = env!("CARGO_MANIFEST_DIR");
|
||||
format!("{}/../../target/debug/kreuzberg", manifest_dir)
|
||||
}
|
||||
|
||||
/// Get the test_documents directory path.
|
||||
fn get_test_documents_dir() -> PathBuf {
|
||||
let manifest_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
|
||||
manifest_dir.parent().unwrap().parent().unwrap().join("test_documents")
|
||||
}
|
||||
|
||||
/// Get a test file path relative to test_documents/.
|
||||
fn get_test_file(relative_path: &str) -> String {
|
||||
get_test_documents_dir()
|
||||
.join(relative_path)
|
||||
.to_string_lossy()
|
||||
.to_string()
|
||||
}
|
||||
|
||||
/// Build the binary before running tests (runs once per test).
|
||||
fn build_binary() {
|
||||
let status = Command::new("cargo")
|
||||
.args(["build", "--bin", "kreuzberg"])
|
||||
.status()
|
||||
.expect("Failed to build kreuzberg binary");
|
||||
|
||||
assert!(status.success(), "Failed to build kreuzberg binary");
|
||||
}
|
||||
|
||||
/// Helper to create a temporary config file with specified content.
|
||||
fn create_test_config(dir: &TempDir, name: &str, content: &str) -> PathBuf {
|
||||
let config_path = dir.path().join(name);
|
||||
std::fs::write(&config_path, content).expect("Failed to write config file");
|
||||
config_path
|
||||
}
|
||||
|
||||
/// Helper to encode string as base64.
|
||||
fn to_base64(input: &str) -> String {
|
||||
// Manual base64 encoding
|
||||
const CHARSET: &[u8] = b"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
|
||||
let bytes = input.as_bytes();
|
||||
let mut result = String::new();
|
||||
let mut i = 0;
|
||||
|
||||
while i < bytes.len() {
|
||||
let b1 = bytes[i];
|
||||
let b2 = if i + 1 < bytes.len() { bytes[i + 1] } else { 0 };
|
||||
let b3 = if i + 2 < bytes.len() { bytes[i + 2] } else { 0 };
|
||||
|
||||
let n = ((b1 as u32) << 16) | ((b2 as u32) << 8) | (b3 as u32);
|
||||
|
||||
result.push(CHARSET[((n >> 18) & 0x3F) as usize] as char);
|
||||
result.push(CHARSET[((n >> 12) & 0x3F) as usize] as char);
|
||||
|
||||
if i + 1 < bytes.len() {
|
||||
result.push(CHARSET[((n >> 6) & 0x3F) as usize] as char);
|
||||
} else {
|
||||
result.push('=');
|
||||
}
|
||||
|
||||
if i + 2 < bytes.len() {
|
||||
result.push(CHARSET[(n & 0x3F) as usize] as char);
|
||||
} else {
|
||||
result.push('=');
|
||||
}
|
||||
|
||||
i += 3;
|
||||
}
|
||||
|
||||
result
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// Test 1: --config-json inline flag with complex configuration
|
||||
// ============================================================================
|
||||
|
||||
#[test]
|
||||
fn test_cli_config_json_inline() {
|
||||
build_binary();
|
||||
|
||||
let test_file = get_test_file("text/simple.txt");
|
||||
if !PathBuf::from(&test_file).exists() {
|
||||
eprintln!("Skipping test: {} not found", test_file);
|
||||
return;
|
||||
}
|
||||
|
||||
let output = Command::new(get_binary_path())
|
||||
.args([
|
||||
"extract",
|
||||
test_file.as_str(),
|
||||
"--config-json",
|
||||
r#"{"use_cache": false, "chunking": {"max_chars": 512}}"#,
|
||||
])
|
||||
.output()
|
||||
.expect("Failed to execute extract command with --config-json");
|
||||
|
||||
assert!(
|
||||
output.status.success(),
|
||||
"Extract command with --config-json failed: {}",
|
||||
String::from_utf8_lossy(&output.stderr)
|
||||
);
|
||||
|
||||
let stdout = String::from_utf8_lossy(&output.stdout);
|
||||
assert!(!stdout.is_empty(), "Output should not be empty");
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// Test 2: --config-json-base64 flag for base64-encoded configuration
|
||||
// ============================================================================
|
||||
|
||||
#[test]
|
||||
fn test_cli_config_json_base64() {
|
||||
build_binary();
|
||||
|
||||
let test_file = get_test_file("text/simple.txt");
|
||||
if !PathBuf::from(&test_file).exists() {
|
||||
eprintln!("Skipping test: {} not found", test_file);
|
||||
return;
|
||||
}
|
||||
|
||||
// Encode JSON config as base64
|
||||
let json_config = r#"{"use_cache": false}"#;
|
||||
let base64_config = to_base64(json_config);
|
||||
|
||||
let output = Command::new(get_binary_path())
|
||||
.args([
|
||||
"extract",
|
||||
test_file.as_str(),
|
||||
"--config-json-base64",
|
||||
base64_config.as_str(),
|
||||
])
|
||||
.output()
|
||||
.expect("Failed to execute extract command with --config-json-base64");
|
||||
|
||||
assert!(
|
||||
output.status.success(),
|
||||
"Extract command with --config-json-base64 failed: {}",
|
||||
String::from_utf8_lossy(&output.stderr)
|
||||
);
|
||||
|
||||
let stdout = String::from_utf8_lossy(&output.stdout);
|
||||
assert!(!stdout.is_empty(), "Output should not be empty");
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// Test 3: Flag precedence verification (CLI flags > JSON > file > defaults)
|
||||
// ============================================================================
|
||||
|
||||
#[test]
|
||||
fn test_cli_flag_precedence() {
|
||||
build_binary();
|
||||
|
||||
let test_file = get_test_file("text/simple.txt");
|
||||
if !PathBuf::from(&test_file).exists() {
|
||||
eprintln!("Skipping test: {} not found", test_file);
|
||||
return;
|
||||
}
|
||||
|
||||
let temp_dir = TempDir::new().expect("Failed to create temp directory");
|
||||
|
||||
// Create a config file with specific settings
|
||||
let config_content = r#"
|
||||
use_cache = true
|
||||
|
||||
[chunking]
|
||||
max_chars = 1024
|
||||
"#;
|
||||
let config_path = create_test_config(&temp_dir, "config.toml", config_content);
|
||||
|
||||
// CLI flag should override config file setting
|
||||
let output = Command::new(get_binary_path())
|
||||
.args([
|
||||
"extract",
|
||||
test_file.as_str(),
|
||||
"--config",
|
||||
config_path.to_string_lossy().as_ref(),
|
||||
"--config-json",
|
||||
r#"{"use_cache": false}"#,
|
||||
])
|
||||
.output()
|
||||
.expect("Failed to execute command with precedence test");
|
||||
|
||||
assert!(
|
||||
output.status.success(),
|
||||
"Precedence test command failed: {}",
|
||||
String::from_utf8_lossy(&output.stderr)
|
||||
);
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// Test 4: --output-format flag with all variants (plain, markdown, djot, html)
|
||||
// ============================================================================
|
||||
|
||||
#[test]
|
||||
fn test_cli_output_format_all_variants() {
|
||||
build_binary();
|
||||
|
||||
let test_file = get_test_file("text/simple.txt");
|
||||
if !PathBuf::from(&test_file).exists() {
|
||||
eprintln!("Skipping test: {} not found", test_file);
|
||||
return;
|
||||
}
|
||||
|
||||
let formats = vec!["plain", "markdown", "djot", "html"];
|
||||
|
||||
for format in formats {
|
||||
let output = Command::new(get_binary_path())
|
||||
.args(["extract", test_file.as_str(), "--output-format", format])
|
||||
.output()
|
||||
.unwrap_or_else(|_| panic!("Failed to execute extract with --output-format {}", format));
|
||||
|
||||
assert!(
|
||||
output.status.success(),
|
||||
"Extract command with --output-format {} failed: {}",
|
||||
format,
|
||||
String::from_utf8_lossy(&output.stderr)
|
||||
);
|
||||
|
||||
let stdout = String::from_utf8_lossy(&output.stdout);
|
||||
assert!(!stdout.is_empty(), "Output for format {} should not be empty", format);
|
||||
}
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// Test 5: Output formats (text vs json) for extraction result
|
||||
// ============================================================================
|
||||
|
||||
#[test]
|
||||
fn test_cli_result_format() {
|
||||
build_binary();
|
||||
|
||||
let test_file = get_test_file("text/simple.txt");
|
||||
if !PathBuf::from(&test_file).exists() {
|
||||
eprintln!("Skipping test: {} not found", test_file);
|
||||
return;
|
||||
}
|
||||
|
||||
// Test text output format
|
||||
let output_text = Command::new(get_binary_path())
|
||||
.args(["extract", test_file.as_str(), "--format", "text"])
|
||||
.output()
|
||||
.expect("Failed to execute extract with --format text");
|
||||
|
||||
assert!(
|
||||
output_text.status.success(),
|
||||
"Text format output failed: {}",
|
||||
String::from_utf8_lossy(&output_text.stderr)
|
||||
);
|
||||
|
||||
let text_content = String::from_utf8_lossy(&output_text.stdout);
|
||||
assert!(!text_content.is_empty(), "Text output should not be empty");
|
||||
|
||||
// Test JSON output format
|
||||
let output_json = Command::new(get_binary_path())
|
||||
.args(["extract", test_file.as_str(), "--format", "json"])
|
||||
.output()
|
||||
.expect("Failed to execute extract with --format json");
|
||||
|
||||
assert!(
|
||||
output_json.status.success(),
|
||||
"JSON format output failed: {}",
|
||||
String::from_utf8_lossy(&output_json.stderr)
|
||||
);
|
||||
|
||||
let json_content = String::from_utf8_lossy(&output_json.stdout);
|
||||
let parsed: Result<serde_json::Value, _> = serde_json::from_str(&json_content);
|
||||
assert!(
|
||||
parsed.is_ok(),
|
||||
"JSON output should be valid JSON, got: {}",
|
||||
json_content
|
||||
);
|
||||
|
||||
// Verify JSON has expected envelope+result structure
|
||||
if let Ok(value) = parsed {
|
||||
assert!(
|
||||
value.get("result").is_some(),
|
||||
"JSON envelope should have 'result' field"
|
||||
);
|
||||
assert!(
|
||||
value.get("extraction_time_ms").is_some(),
|
||||
"JSON envelope should have 'extraction_time_ms' field"
|
||||
);
|
||||
assert!(
|
||||
value["result"].get("content").is_some(),
|
||||
"result should have 'content' field"
|
||||
);
|
||||
assert!(
|
||||
value["result"].get("mime_type").is_some(),
|
||||
"result should have 'mime_type' field"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// Test 6: Deprecated --content-format flag warning
|
||||
// ============================================================================
|
||||
|
||||
#[test]
|
||||
fn test_cli_content_format_deprecated_warning() {
|
||||
build_binary();
|
||||
|
||||
let test_file = get_test_file("text/simple.txt");
|
||||
if !PathBuf::from(&test_file).exists() {
|
||||
eprintln!("Skipping test: {} not found", test_file);
|
||||
return;
|
||||
}
|
||||
|
||||
// The deprecated --content-format should still work but may show warning
|
||||
let output = Command::new(get_binary_path())
|
||||
.args(["extract", test_file.as_str(), "--content-format", "plain"])
|
||||
.output()
|
||||
.expect("Failed to execute extract with --content-format");
|
||||
|
||||
// Command should either succeed or show expected deprecation behavior
|
||||
let stdout = String::from_utf8_lossy(&output.stdout);
|
||||
|
||||
// Note: We're checking that the command doesn't crash; deprecation warning behavior
|
||||
// depends on implementation details
|
||||
assert!(
|
||||
output.status.success() || !stdout.is_empty(),
|
||||
"Command should succeed or produce output"
|
||||
);
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// Test 7: Config merge scenarios - multiple configuration sources
|
||||
// ============================================================================
|
||||
|
||||
#[test]
|
||||
fn test_cli_config_merge_scenarios() {
|
||||
build_binary();
|
||||
|
||||
let test_file = get_test_file("text/simple.txt");
|
||||
if !PathBuf::from(&test_file).exists() {
|
||||
eprintln!("Skipping test: {} not found", test_file);
|
||||
return;
|
||||
}
|
||||
|
||||
let temp_dir = TempDir::new().expect("Failed to create temp directory");
|
||||
|
||||
// Create a base config file
|
||||
let config_content = r#"
|
||||
use_cache = true
|
||||
|
||||
[chunking]
|
||||
max_chars = 1024
|
||||
"#;
|
||||
let config_path = create_test_config(&temp_dir, "base.toml", config_content);
|
||||
|
||||
// Merge: config file + inline JSON (JSON should override matching keys)
|
||||
let output = Command::new(get_binary_path())
|
||||
.args([
|
||||
"extract",
|
||||
test_file.as_str(),
|
||||
"--config",
|
||||
config_path.to_string_lossy().as_ref(),
|
||||
"--config-json",
|
||||
r#"{"use_cache": false}"#,
|
||||
])
|
||||
.output()
|
||||
.expect("Failed to merge configs");
|
||||
|
||||
assert!(
|
||||
output.status.success(),
|
||||
"Config merge failed: {}",
|
||||
String::from_utf8_lossy(&output.stderr)
|
||||
);
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// Test 8: Invalid JSON error handling
|
||||
// ============================================================================
|
||||
|
||||
#[test]
|
||||
fn test_cli_invalid_json_error() {
|
||||
build_binary();
|
||||
|
||||
let test_file = get_test_file("text/simple.txt");
|
||||
if !PathBuf::from(&test_file).exists() {
|
||||
eprintln!("Skipping test: {} not found", test_file);
|
||||
return;
|
||||
}
|
||||
|
||||
let output = Command::new(get_binary_path())
|
||||
.args([
|
||||
"extract",
|
||||
test_file.as_str(),
|
||||
"--config-json",
|
||||
r#"{"invalid json without closing"#, // Malformed JSON
|
||||
])
|
||||
.output()
|
||||
.expect("Failed to execute command");
|
||||
|
||||
// Should fail gracefully with error message
|
||||
assert!(!output.status.success(), "Command should fail with invalid JSON");
|
||||
|
||||
let stderr = String::from_utf8_lossy(&output.stderr);
|
||||
// Should contain some error indication
|
||||
assert!(
|
||||
!stderr.is_empty() || !String::from_utf8_lossy(&output.stdout).is_empty(),
|
||||
"Should provide feedback about invalid JSON"
|
||||
);
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// Test 9: Config flag conflicts
|
||||
// ============================================================================
|
||||
|
||||
#[test]
|
||||
fn test_cli_conflicts() {
|
||||
build_binary();
|
||||
|
||||
let test_file = get_test_file("text/simple.txt");
|
||||
if !PathBuf::from(&test_file).exists() {
|
||||
eprintln!("Skipping test: {} not found", test_file);
|
||||
return;
|
||||
}
|
||||
|
||||
let temp_dir = TempDir::new().expect("Failed to create temp directory");
|
||||
let config_content = "use_cache = true\n";
|
||||
let config_path = create_test_config(&temp_dir, "config.toml", config_content);
|
||||
|
||||
// Using both --config-json and --config-json-base64 might conflict
|
||||
let json_config = r#"{"use_cache": false}"#;
|
||||
let base64_config = to_base64(json_config);
|
||||
|
||||
let output = Command::new(get_binary_path())
|
||||
.args([
|
||||
"extract",
|
||||
test_file.as_str(),
|
||||
"--config",
|
||||
config_path.to_string_lossy().as_ref(),
|
||||
"--config-json",
|
||||
r#"{"chunking": {"max_chars": 512}}"#,
|
||||
"--config-json-base64",
|
||||
base64_config.as_str(),
|
||||
])
|
||||
.output()
|
||||
.expect("Failed to execute command with potential conflicts");
|
||||
|
||||
// The behavior here depends on implementation:
|
||||
// Either it should succeed (last flag wins) or show an error (mutually exclusive)
|
||||
// We verify that the command completes without crashing
|
||||
let _ = output.status.success();
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// Test 10: Real end-to-end extraction with new config formats
|
||||
// ============================================================================
|
||||
|
||||
#[test]
|
||||
fn test_cli_real_extraction() {
|
||||
build_binary();
|
||||
|
||||
let test_file = get_test_file("text/simple.txt");
|
||||
if !PathBuf::from(&test_file).exists() {
|
||||
eprintln!("Skipping test: {} not found", test_file);
|
||||
return;
|
||||
}
|
||||
|
||||
// Full E2E test: extract with multiple new flags
|
||||
let output = Command::new(get_binary_path())
|
||||
.args([
|
||||
"extract",
|
||||
test_file.as_str(),
|
||||
"--format",
|
||||
"json",
|
||||
"--output-format",
|
||||
"markdown",
|
||||
"--config-json",
|
||||
r#"{"use_cache": false, "disable_ocr": true}"#,
|
||||
])
|
||||
.output()
|
||||
.expect("Failed to execute full E2E extraction");
|
||||
|
||||
assert!(
|
||||
output.status.success(),
|
||||
"E2E extraction failed: {}",
|
||||
String::from_utf8_lossy(&output.stderr)
|
||||
);
|
||||
|
||||
let stdout = String::from_utf8_lossy(&output.stdout);
|
||||
|
||||
// Should be valid JSON output
|
||||
let parsed: Result<serde_json::Value, _> = serde_json::from_str(&stdout);
|
||||
assert!(parsed.is_ok(), "E2E output should be valid JSON, got: {}", stdout);
|
||||
|
||||
// Verify envelope+result structure
|
||||
if let Ok(value) = parsed {
|
||||
assert!(value.get("result").is_some(), "Missing 'result' envelope field");
|
||||
assert!(
|
||||
value.get("extraction_time_ms").is_some(),
|
||||
"Missing 'extraction_time_ms' field"
|
||||
);
|
||||
assert!(
|
||||
value["result"].get("content").is_some(),
|
||||
"Missing content field in result"
|
||||
);
|
||||
assert!(
|
||||
value["result"].get("mime_type").is_some(),
|
||||
"Missing mime_type field in result"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// Additional Edge Cases and Robustness Tests
|
||||
// ============================================================================
|
||||
|
||||
#[test]
|
||||
fn test_cli_empty_config_json() {
|
||||
build_binary();
|
||||
|
||||
let test_file = get_test_file("text/simple.txt");
|
||||
if !PathBuf::from(&test_file).exists() {
|
||||
eprintln!("Skipping test: {} not found", test_file);
|
||||
return;
|
||||
}
|
||||
|
||||
// Empty JSON object should use defaults
|
||||
let output = Command::new(get_binary_path())
|
||||
.args(["extract", test_file.as_str(), "--config-json", "{}"])
|
||||
.output()
|
||||
.expect("Failed to execute with empty JSON config");
|
||||
|
||||
assert!(output.status.success(), "Command with empty JSON config should succeed");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_cli_multiple_output_format_variants() {
|
||||
build_binary();
|
||||
|
||||
let test_file = get_test_file("text/simple.txt");
|
||||
if !PathBuf::from(&test_file).exists() {
|
||||
eprintln!("Skipping test: {} not found", test_file);
|
||||
return;
|
||||
}
|
||||
|
||||
// Test case-insensitive format argument
|
||||
let output = Command::new(get_binary_path())
|
||||
.args([
|
||||
"extract",
|
||||
test_file.as_str(),
|
||||
"--output-format",
|
||||
"MARKDOWN", // uppercase should work or fail predictably
|
||||
])
|
||||
.output()
|
||||
.expect("Failed to execute");
|
||||
|
||||
// Either succeeds with case-insensitive parsing or fails gracefully
|
||||
let _ = output.status.success();
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_cli_config_json_with_nested_objects() {
|
||||
build_binary();
|
||||
|
||||
let test_file = get_test_file("text/simple.txt");
|
||||
if !PathBuf::from(&test_file).exists() {
|
||||
eprintln!("Skipping test: {} not found", test_file);
|
||||
return;
|
||||
}
|
||||
|
||||
// Complex nested JSON configuration
|
||||
let complex_config = r#"
|
||||
{
|
||||
"use_cache": false,
|
||||
"chunking": {"max_chars": 512},
|
||||
"language_detection": {
|
||||
"enabled": true,
|
||||
"confidence_threshold": 0.8
|
||||
}
|
||||
}
|
||||
"#;
|
||||
|
||||
let output = Command::new(get_binary_path())
|
||||
.args(["extract", test_file.as_str(), "--config-json", complex_config])
|
||||
.output()
|
||||
.expect("Failed to execute with nested JSON config");
|
||||
|
||||
assert!(
|
||||
output.status.success() || !String::from_utf8_lossy(&output.stderr).is_empty(),
|
||||
"Complex config should either work or provide error"
|
||||
);
|
||||
}
|
||||
Reference in New Issue
Block a user