Files
fil/crates/kreuzberg/tests/api_consistency.rs

615 lines
21 KiB
Rust
Raw Permalink Normal View History

2026-06-01 23:40:55 +02:00
//! API consistency tests for ExtractionConfig and related types.
//!
//! This test suite validates that:
//! 1. ExtractionConfig serialization is complete with all fields
//! 2. All required configuration fields are present
//! 3. Configuration types maintain consistency across different formats
//! 4. No configuration fields are accidentally hidden or lost
use kreuzberg::core::config::ExtractionConfig;
use kreuzberg::core::config::OutputFormat;
#[cfg(feature = "tree-sitter")]
use kreuzberg::core::config::{TreeSitterConfig, TreeSitterProcessConfig};
use serde_json::json;
/// Serializes the default `ExtractionConfig` to a JSON value and checks that
/// each core top-level field can be looked up in the result.
#[test]
fn test_extraction_config_serialization_includes_all_fields() {
    let defaults = ExtractionConfig::default();
    let serialized = serde_json::to_value(&defaults).expect("Failed to serialize config");

    // (JSON key, failure message) pairs, checked in declaration order.
    let required = [
        ("use_cache", "Missing 'use_cache' field in serialized config"),
        (
            "enable_quality_processing",
            "Missing 'enable_quality_processing' field",
        ),
        ("force_ocr", "Missing 'force_ocr' field in serialized config"),
        (
            "max_concurrent_extractions",
            "Missing 'max_concurrent_extractions' field",
        ),
        (
            "result_format",
            "Missing 'result_format' field in serialized config",
        ),
        (
            "output_format",
            "Missing 'output_format' field in serialized config",
        ),
    ];

    for (key, message) in required {
        assert!(serialized.get(key).is_some(), "{}", message);
    }
}
/// Checks the values `ExtractionConfig::default()` assigns to the core flags.
#[test]
fn test_extraction_config_defaults_are_correct() {
    let defaults = ExtractionConfig::default();

    // Boolean switches.
    assert!(defaults.use_cache, "Default use_cache should be true");
    assert!(
        defaults.enable_quality_processing,
        "Default enable_quality_processing should be true"
    );
    assert!(!defaults.force_ocr, "Default force_ocr should be false");

    // The optional `max_concurrent_extractions` value starts out unset.
    assert_eq!(
        defaults.max_concurrent_extractions, None,
        "Default max_concurrent_extractions should be None"
    );
}
/// Round-trips the default `ExtractionConfig` through a JSON string and
/// compares selected fields of the decoded value against the original.
#[test]
fn test_extraction_config_serialization_roundtrip() {
    let original = ExtractionConfig::default();

    // Encode to text, then decode that same text again.
    let encoded = serde_json::to_string(&original).expect("Failed to serialize");
    let decoded = serde_json::from_str::<ExtractionConfig>(&encoded)
        .expect("Failed to deserialize config from JSON");

    // Boolean flags.
    assert_eq!(
        original.use_cache, decoded.use_cache,
        "use_cache should survive roundtrip"
    );
    assert_eq!(
        original.enable_quality_processing, decoded.enable_quality_processing,
        "enable_quality_processing should survive roundtrip"
    );
    assert_eq!(
        original.force_ocr, decoded.force_ocr,
        "force_ocr should survive roundtrip"
    );

    // Format selectors.
    assert_eq!(
        original.result_format, decoded.result_format,
        "result_format should survive roundtrip"
    );
    assert_eq!(
        original.output_format, decoded.output_format,
        "output_format should survive roundtrip"
    );
}
/// Verifies that the default `ExtractionConfig` serializes to a JSON object
/// whose keys include the core field names.
#[test]
fn test_extraction_config_json_structure() {
    let defaults = ExtractionConfig::default();
    let serialized = serde_json::to_value(&defaults).expect("Failed to serialize config");
    let top_level = serialized
        .as_object()
        .expect("Config should serialize as object");

    // Each of these names must appear as a key of the serialized object.
    for name in [
        "use_cache",
        "enable_quality_processing",
        "force_ocr",
        "max_concurrent_extractions",
        "result_format",
        "output_format",
    ] {
        assert!(
            top_level.contains_key(name),
            "Missing field in JSON: {}",
            name
        );
    }
}
/// Checks the JSON value kind (boolean or string) of selected fields in the
/// serialized default `ExtractionConfig`.
#[test]
fn test_extraction_config_values_are_correct_types() {
    let defaults = ExtractionConfig::default();
    let serialized = serde_json::to_value(&defaults).expect("Failed to serialize config");

    // Flags are expected to be JSON booleans.
    let use_cache = serialized.get("use_cache").expect("Value not found");
    assert!(use_cache.is_boolean(), "use_cache should be boolean");

    let quality = serialized
        .get("enable_quality_processing")
        .expect("Value not found");
    assert!(
        quality.is_boolean(),
        "enable_quality_processing should be boolean"
    );

    let force_ocr = serialized.get("force_ocr").expect("Value not found");
    assert!(force_ocr.is_boolean(), "force_ocr should be boolean");

    // Format selectors are expected to be JSON strings.
    let result_format = serialized.get("result_format").expect("Value not found");
    assert!(result_format.is_string(), "result_format should be string");

    let output_format = serialized.get("output_format").expect("Value not found");
    assert!(output_format.is_string(), "output_format should be string");
}
/// Overrides three fields of the default config and checks that the
/// serialized JSON carries exactly those overridden values.
#[test]
fn test_extraction_config_with_custom_values() {
    let customized = ExtractionConfig {
        use_cache: false,
        force_ocr: true,
        max_concurrent_extractions: Some(8),
        ..ExtractionConfig::default()
    };
    let serialized = serde_json::to_value(&customized).expect("Failed to serialize");

    let use_cache = serialized.get("use_cache").expect("Value not found");
    assert_eq!(use_cache, &json!(false));

    let force_ocr = serialized.get("force_ocr").expect("Value not found");
    assert_eq!(force_ocr, &json!(true));

    let max_concurrent = serialized
        .get("max_concurrent_extractions")
        .expect("Value not found");
    assert_eq!(max_concurrent, &json!(8));
}
/// Parses a JSON object that sets only `use_cache` and checks that the
/// explicit value is kept while two omitted flags hold the values asserted
/// for the default config.
#[test]
fn test_extraction_config_partial_json_parsing() {
    let input = json!({
        "use_cache": false,
    });
    let parsed =
        serde_json::from_value::<ExtractionConfig>(input).expect("Failed to parse partial config");

    // The one field present in the input.
    assert!(!parsed.use_cache, "Explicit use_cache should be respected");

    // Fields absent from the input.
    assert!(
        parsed.enable_quality_processing,
        "Omitted enable_quality_processing should use default"
    );
    assert!(!parsed.force_ocr, "Omitted force_ocr should use default");
}
/// Parses `{}` into an `ExtractionConfig` and compares selected fields with
/// those of `ExtractionConfig::default()`.
#[test]
fn test_extraction_config_empty_json_uses_defaults() {
    let parsed =
        serde_json::from_value::<ExtractionConfig>(json!({})).expect("Failed to parse empty config");
    let reference = ExtractionConfig::default();

    // Boolean flags.
    assert_eq!(parsed.use_cache, reference.use_cache);
    assert_eq!(
        parsed.enable_quality_processing,
        reference.enable_quality_processing
    );
    assert_eq!(parsed.force_ocr, reference.force_ocr);

    // Format selectors.
    assert_eq!(parsed.result_format, reference.result_format);
    assert_eq!(parsed.output_format, reference.output_format);
}
/// Parses the lowercase strings "plain", "markdown" and "html" as
/// `output_format` and checks the resulting `OutputFormat` variant.
///
/// Only lowercase spellings are exercised here.
#[test]
fn test_extraction_config_output_format_valid_values() {
    // "plain" -> OutputFormat::Plain
    let plain = serde_json::from_value::<ExtractionConfig>(json!({"output_format": "plain"}))
        .expect("Failed to parse plain output_format");
    assert_eq!(plain.output_format, OutputFormat::Plain);

    // "markdown" -> OutputFormat::Markdown
    let markdown = serde_json::from_value::<ExtractionConfig>(json!({"output_format": "markdown"}))
        .expect("Failed to parse markdown output_format");
    assert_eq!(markdown.output_format, OutputFormat::Markdown);

    // "html" -> OutputFormat::Html
    let html = serde_json::from_value::<ExtractionConfig>(json!({"output_format": "html"}))
        .expect("Failed to parse html output_format");
    assert_eq!(html.output_format, OutputFormat::Html);
}
/// Checks that a config whose `result_format` is "unified" parses
/// successfully.
#[test]
fn test_extraction_config_result_format_valid_values() {
    let parsed = serde_json::from_value::<ExtractionConfig>(json!({"result_format": "unified"}))
        .expect("Failed to parse unified result_format");

    // Only parse success is verified; the field is read and then discarded.
    // NOTE(review): the previous comment stated that result_format is a type
    // from `types` rather than `core::config::OutputFormat` — confirm, and
    // consider asserting the parsed value once the type is known.
    let _ = parsed.result_format;
}
/// Guards against unexpected keys: every top-level key of the serialized
/// default `ExtractionConfig` must appear in the allow-list below.
///
/// The check is one-directional: names in the allow-list that are absent
/// from the serialized output do not cause a failure.
#[test]
fn test_extraction_config_no_unknown_fields_in_default() {
    // Allow-list of top-level keys.
    const KNOWN_FIELDS: &[&str] = &[
        "use_cache",
        "enable_quality_processing",
        "ocr",
        "force_ocr",
        "disable_ocr",
        "chunking",
        "content_filter",
        "images",
        "pdf_options",
        "token_reduction",
        "language_detection",
        "pages",
        "keywords",
        "postprocessor",
        "html_options",
        "html_output",
        "max_concurrent_extractions",
        "result_format",
        "output_format",
        "include_document_structure",
        "security_limits",
        "acceleration",
        "cache_namespace",
        "cache_ttl_secs",
        "concurrency",
        "email",
        "layout",
        "max_archive_depth",
        "max_embedded_file_bytes",
        "extraction_timeout_secs",
        "tree_sitter",
        "use_layout_for_markdown",
    ];

    let defaults = ExtractionConfig::default();
    let serialized = serde_json::to_value(&defaults).expect("Failed to serialize");
    let top_level = serialized.as_object().expect("Should be object");

    for name in top_level.keys() {
        assert!(
            KNOWN_FIELDS.contains(&name.as_str()),
            "Unexpected field in config: {}",
            name
        );
    }
}
/// Exercises `ExtractionConfig::needs_image_processing` in three states:
/// the default config, a config with `ocr` set, and a config with `images`
/// set (and `ocr` cleared again).
#[test]
fn test_extraction_config_needs_image_processing() {
    let mut cfg = ExtractionConfig::default();

    // State 1: untouched defaults.
    assert!(
        !cfg.needs_image_processing(),
        "Default config should not need image processing"
    );

    // State 2: an OCR configuration is attached.
    let ocr_settings = kreuzberg::OcrConfig {
        backend: "tesseract".to_string(),
        language: "eng".to_string(),
        ..Default::default()
    };
    cfg.ocr = Some(ocr_settings);
    assert!(
        cfg.needs_image_processing(),
        "Config with OCR should need image processing"
    );

    // State 3: OCR is removed so that only the image settings are in play.
    cfg.ocr = None;
    let image_settings = kreuzberg::ImageExtractionConfig {
        extract_images: true,
        target_dpi: 150,
        max_image_dimension: 2000,
        inject_placeholders: true,
        auto_adjust_dpi: true,
        min_dpi: 72,
        max_dpi: 600,
        max_images_per_page: None,
        classify: true,
        include_page_rasters: false,
        run_ocr_on_images: true,
        ocr_text_only: false,
        append_ocr_text: false,
    };
    cfg.images = Some(image_settings);
    assert!(
        cfg.needs_image_processing(),
        "Config with image extraction should need image processing"
    );
}
/// Parses a config with `output_format` set to "markdown", serializes it
/// again, and checks that the emitted value is still the lowercase string.
#[test]
fn test_output_format_serialization_lowercase() {
    let input = serde_json::json!({"output_format": "markdown"});
    let parsed = serde_json::from_value::<ExtractionConfig>(input).expect("Failed to parse");

    // Back to JSON: the field must come out as "markdown".
    let emitted = serde_json::to_value(&parsed).expect("Failed to reserialize");
    assert_eq!(emitted["output_format"], "markdown");
}
/// Checks that changing a field value does not change which top-level keys
/// are serialized: the default config and a config with `force_ocr: true`
/// must produce exactly the same set of keys.
///
/// The key sets themselves are compared, not just their sizes, so a key that
/// is swapped for a different one is detected as well.
#[test]
fn test_extraction_config_field_presence_consistency() {
    use std::collections::BTreeSet;

    let default_config = ExtractionConfig::default();
    let default_json = serde_json::to_value(&default_config).expect("Failed to serialize");

    let modified_config = ExtractionConfig {
        force_ocr: true,
        ..ExtractionConfig::default()
    };
    let modified_json = serde_json::to_value(&modified_config).expect("Failed to serialize");

    // Collect into ordered sets so the comparison does not depend on the
    // iteration order of the JSON map implementation.
    let default_keys: BTreeSet<_> = default_json
        .as_object()
        .expect("Expected object value")
        .keys()
        .collect();
    let modified_keys: BTreeSet<_> = modified_json
        .as_object()
        .expect("Expected object value")
        .keys()
        .collect();

    assert_eq!(
        default_keys, modified_keys,
        "Configs should have the same top-level keys"
    );
}
/// Round-trips each listed `OutputFormat` variant through `serde_json` and
/// checks that the decoded value equals the original.
#[test]
fn test_output_format_all_variants() {
    // A fixed-size array is sufficient; no heap-allocated `Vec` is needed.
    let formats = [
        OutputFormat::Plain,
        OutputFormat::Markdown,
        OutputFormat::Html,
        OutputFormat::Djot,
        OutputFormat::Structured,
    ];

    for fmt in &formats {
        // A shared reference is itself serializable, so the value does not
        // have to be cloned just to be passed to `to_value`.
        let serialized = serde_json::to_value(fmt).expect("Failed to serialize");
        let deserialized: OutputFormat =
            serde_json::from_value(serialized).expect("Failed to deserialize");
        assert_eq!(*fmt, deserialized, "Format should survive roundtrip");
    }
}
/// Checks that `include_document_structure` is `false` on a default config.
#[test]
fn test_include_document_structure_default_is_false() {
    let defaults = ExtractionConfig::default();
    let flag = defaults.include_document_structure;
    assert!(!flag, "Default include_document_structure should be false");
}
/// Round-trips `include_document_structure` through a JSON string twice:
/// once set to `true` and once explicitly set to `false`.
#[test]
fn test_include_document_structure_serialization_roundtrip() {
    // Case 1: the flag is enabled.
    let enabled = ExtractionConfig {
        include_document_structure: true,
        ..ExtractionConfig::default()
    };
    let enabled_text = serde_json::to_string(&enabled).expect("Failed to serialize");
    let enabled_back = serde_json::from_str::<ExtractionConfig>(&enabled_text)
        .expect("Failed to deserialize config from JSON");

    assert_eq!(
        enabled.include_document_structure, enabled_back.include_document_structure,
        "include_document_structure should survive roundtrip"
    );
    assert!(
        enabled_back.include_document_structure,
        "Deserialized include_document_structure should be true"
    );

    // Case 2: the flag is written out as an explicit `false`.
    let disabled = ExtractionConfig {
        include_document_structure: false,
        ..ExtractionConfig::default()
    };
    let disabled_text = serde_json::to_string(&disabled).expect("Failed to serialize");
    let disabled_back = serde_json::from_str::<ExtractionConfig>(&disabled_text)
        .expect("Failed to deserialize config from JSON");

    assert!(
        !disabled_back.include_document_structure,
        "Explicitly false include_document_structure should survive roundtrip"
    );
}
// ── Tree-sitter API parity tests ───────────────────────────────────────
/// Checks every default asserted for `TreeSitterProcessConfig`: three flags
/// on, four flags off, and no `chunk_max_size`.
#[cfg(feature = "tree-sitter")]
#[test]
fn test_tree_sitter_process_config_defaults() {
    let defaults = TreeSitterProcessConfig::default();

    // Flags that start enabled.
    assert!(defaults.structure, "Default structure should be true");
    assert!(defaults.imports, "Default imports should be true");
    assert!(defaults.exports, "Default exports should be true");

    // Flags that start disabled.
    assert!(!defaults.comments, "Default comments should be false");
    assert!(!defaults.docstrings, "Default docstrings should be false");
    assert!(!defaults.symbols, "Default symbols should be false");
    assert!(!defaults.diagnostics, "Default diagnostics should be false");

    // Optional size setting starts unset.
    assert!(
        defaults.chunk_max_size.is_none(),
        "Default chunk_max_size should be None"
    );
}
/// Checks that a default `TreeSitterConfig` has no `cache_dir`, `languages`
/// or `groups`, and that its nested `process.structure` flag is on.
#[cfg(feature = "tree-sitter")]
#[test]
fn test_tree_sitter_config_defaults() {
    let defaults = TreeSitterConfig::default();

    // All three optional top-level settings start unset.
    assert!(
        defaults.cache_dir.is_none(),
        "Default cache_dir should be None"
    );
    assert!(
        defaults.languages.is_none(),
        "Default languages should be None"
    );
    assert!(defaults.groups.is_none(), "Default groups should be None");

    // The nested process configuration carries its own default.
    assert!(
        defaults.process.structure,
        "Default process.structure should be true"
    );
}
/// Round-trips a fully populated `TreeSitterConfig` through a JSON string and
/// checks that the explicitly set fields come back unchanged.
///
/// Every boolean and optional field that is set explicitly below is compared
/// after decoding, including `enabled`, `process.imports`, `process.symbols`
/// and `process.diagnostics`. `content_mode` is left at its default and is
/// not compared.
#[cfg(feature = "tree-sitter")]
#[test]
fn test_tree_sitter_config_serialization_roundtrip() {
    let config = TreeSitterConfig {
        enabled: true,
        cache_dir: Some("/tmp/grammars".into()),
        languages: Some(vec!["python".to_string(), "rust".to_string()]),
        groups: Some(vec!["web".to_string()]),
        process: TreeSitterProcessConfig {
            structure: true,
            imports: true,
            exports: false,
            comments: true,
            docstrings: true,
            symbols: false,
            diagnostics: false,
            chunk_max_size: Some(4000),
            content_mode: Default::default(),
        },
    };

    let json_string = serde_json::to_string(&config).expect("Failed to serialize");
    let deserialized: TreeSitterConfig =
        serde_json::from_str(&json_string).expect("Failed to deserialize");

    // Top-level fields.
    assert_eq!(config.enabled, deserialized.enabled);
    assert_eq!(config.cache_dir, deserialized.cache_dir);
    assert_eq!(config.languages, deserialized.languages);
    assert_eq!(config.groups, deserialized.groups);

    // Nested process configuration.
    assert_eq!(config.process.structure, deserialized.process.structure);
    assert_eq!(config.process.imports, deserialized.process.imports);
    assert_eq!(config.process.exports, deserialized.process.exports);
    assert_eq!(config.process.comments, deserialized.process.comments);
    assert_eq!(config.process.docstrings, deserialized.process.docstrings);
    assert_eq!(config.process.symbols, deserialized.process.symbols);
    assert_eq!(config.process.diagnostics, deserialized.process.diagnostics);
    assert_eq!(
        config.process.chunk_max_size,
        deserialized.process.chunk_max_size
    );
}
/// Embeds a `TreeSitterConfig` in an `ExtractionConfig`, round-trips the
/// outer config through a JSON string, and inspects the nested config.
#[cfg(feature = "tree-sitter")]
#[test]
fn test_tree_sitter_config_in_extraction_config_roundtrip() {
    let nested = TreeSitterConfig {
        languages: Some(vec!["python".to_string()]),
        ..TreeSitterConfig::default()
    };
    let original = ExtractionConfig {
        tree_sitter: Some(nested),
        ..ExtractionConfig::default()
    };

    let encoded = serde_json::to_string(&original).expect("Failed to serialize");
    let decoded =
        serde_json::from_str::<ExtractionConfig>(&encoded).expect("Failed to deserialize");

    // The nested section must still be there after decoding.
    let tree_sitter = decoded
        .tree_sitter
        .expect("tree_sitter should be present after roundtrip");
    assert_eq!(tree_sitter.languages, Some(vec!["python".to_string()]));
    assert!(
        tree_sitter.process.structure,
        "process.structure should default to true"
    );
}
/// Parses an `ExtractionConfig` whose `tree_sitter` section sets only
/// `languages` and `process.comments`, then checks both the explicit values
/// and the values of fields that were left out.
#[cfg(feature = "tree-sitter")]
#[test]
fn test_tree_sitter_partial_json_parsing() {
    let input = json!({
        "tree_sitter": {
            "languages": ["rust"],
            "process": {
                "comments": true
            }
        }
    });
    let parsed = serde_json::from_value::<ExtractionConfig>(input).expect("Failed to parse");
    let tree_sitter = parsed.tree_sitter.expect("tree_sitter should be present");

    // Values given in the input (plus `groups`, which was left out).
    assert_eq!(tree_sitter.languages, Some(vec!["rust".to_string()]));
    assert!(tree_sitter.groups.is_none(), "Omitted groups should be None");
    assert!(
        tree_sitter.process.comments,
        "Explicit comments=true should be respected"
    );

    // Process flags that were not mentioned in the input.
    assert!(
        tree_sitter.process.structure,
        "Omitted structure should default to true"
    );
    assert!(
        !tree_sitter.process.symbols,
        "Omitted symbols should default to false"
    );
}
/// Serializes a `TreeSitterProcessConfig` with every flag flipped away from
/// the value the defaults test expects, and checks that the listed field
/// names are present as keys of the resulting JSON object.
#[cfg(feature = "tree-sitter")]
#[test]
fn test_tree_sitter_process_config_all_fields_serialized() {
    let populated = TreeSitterProcessConfig {
        structure: false,
        imports: false,
        exports: false,
        comments: true,
        docstrings: true,
        symbols: true,
        diagnostics: true,
        chunk_max_size: Some(2000),
        content_mode: Default::default(),
    };
    let serialized = serde_json::to_value(&populated).expect("Failed to serialize");
    let keys = serialized.as_object().expect("Should be object");

    // NOTE(review): `content_mode` is not in this list — confirm whether it
    // is expected to be serialized and should be checked here as well.
    let expected_fields = [
        "structure",
        "imports",
        "exports",
        "comments",
        "docstrings",
        "symbols",
        "diagnostics",
        "chunk_max_size",
    ];
    for field in expected_fields.iter().copied() {
        assert!(keys.contains_key(field), "Missing field: {field}");
    }
}
/// Serializes `FormatMetadata::Code` built from a mostly-default
/// `ProcessResult` and checks the shape of the JSON: the `format_type` tag,
/// the `language` value, and a numeric `metrics.total_lines`.
#[cfg(feature = "tree-sitter")]
#[test]
fn test_format_metadata_code_variant_serialization() {
    use kreuzberg::types::metadata::FormatMetadata;

    // Only `language` is set; everything else comes from `Default`.
    let metadata = FormatMetadata::Code(kreuzberg::ProcessResult {
        language: "python".to_string(),
        ..kreuzberg::ProcessResult::default()
    });
    let serialized =
        serde_json::to_value(&metadata).expect("Failed to serialize FormatMetadata::Code");

    // Tag and payload fields live side by side in the same object.
    assert_eq!(
        serialized["format_type"], "code",
        "format_type tag should be 'code'"
    );
    assert_eq!(
        serialized["language"], "python",
        "language should be 'python'"
    );

    let metrics = serialized.get("metrics");
    assert!(metrics.is_some(), "metrics should be present");
    assert!(
        serialized["metrics"]["total_lines"].is_number(),
        "metrics.total_lines should be a number"
    );
}
/// Compile-time check that a set of types is reachable from the `kreuzberg`
/// crate root, plus one runtime check on `FileMetrics::default()`.
#[cfg(feature = "tree-sitter")]
#[test]
fn test_tslp_types_reexported() {
    // Constructor is reachable and returns the named type.
    let _: kreuzberg::ProcessConfig = kreuzberg::ProcessConfig::new("rust");

    // Enum variants, grouped per enum; naming them is the whole test.
    let _structure_kinds = [
        kreuzberg::StructureKind::Function,
        kreuzberg::StructureKind::Class,
        kreuzberg::StructureKind::Method,
    ];
    let _export_kinds = [kreuzberg::ExportKind::Named, kreuzberg::ExportKind::Default];
    let _comment_kinds = [kreuzberg::CommentKind::Line, kreuzberg::CommentKind::Block];
    let _severities = [
        kreuzberg::DiagnosticSeverity::Error,
        kreuzberg::DiagnosticSeverity::Warning,
    ];

    // A default `FileMetrics` reports zero total lines.
    let default_metrics = kreuzberg::FileMetrics::default();
    assert_eq!(default_metrics.total_lines, 0);
}