//! API consistency tests for ExtractionConfig and related types. //! //! This test suite validates that: //! 1. ExtractionConfig serialization is complete with all fields //! 2. All required configuration fields are present //! 3. Configuration types maintain consistency across different formats //! 4. No configuration fields are accidentally hidden or lost use kreuzberg::core::config::ExtractionConfig; use kreuzberg::core::config::OutputFormat; #[cfg(feature = "tree-sitter")] use kreuzberg::core::config::{TreeSitterConfig, TreeSitterProcessConfig}; use serde_json::json; #[test] fn test_extraction_config_serialization_includes_all_fields() { let config = ExtractionConfig::default(); let json = serde_json::to_value(&config).expect("Failed to serialize config"); // Verify core fields exist and are accessible assert!( json.get("use_cache").is_some(), "Missing 'use_cache' field in serialized config" ); assert!( json.get("enable_quality_processing").is_some(), "Missing 'enable_quality_processing' field" ); assert!( json.get("force_ocr").is_some(), "Missing 'force_ocr' field in serialized config" ); assert!( json.get("max_concurrent_extractions").is_some(), "Missing 'max_concurrent_extractions' field" ); assert!( json.get("result_format").is_some(), "Missing 'result_format' field in serialized config" ); assert!( json.get("output_format").is_some(), "Missing 'output_format' field in serialized config" ); } #[test] fn test_extraction_config_defaults_are_correct() { let config = ExtractionConfig::default(); assert!(config.use_cache, "Default use_cache should be true"); assert!( config.enable_quality_processing, "Default enable_quality_processing should be true" ); assert!(!config.force_ocr, "Default force_ocr should be false"); assert_eq!( config.max_concurrent_extractions, None, "Default max_concurrent_extractions should be None" ); } #[test] fn test_extraction_config_serialization_roundtrip() { let config = ExtractionConfig::default(); // Serialize to JSON let json_string = serde_json::to_string(&config).expect("Failed to serialize"); // Deserialize back let deserialized: ExtractionConfig = serde_json::from_str(&json_string).expect("Failed to deserialize config from JSON"); // Verify roundtrip integrity assert_eq!( config.use_cache, deserialized.use_cache, "use_cache should survive roundtrip" ); assert_eq!( config.enable_quality_processing, deserialized.enable_quality_processing, "enable_quality_processing should survive roundtrip" ); assert_eq!( config.force_ocr, deserialized.force_ocr, "force_ocr should survive roundtrip" ); assert_eq!( config.result_format, deserialized.result_format, "result_format should survive roundtrip" ); assert_eq!( config.output_format, deserialized.output_format, "output_format should survive roundtrip" ); } #[test] fn test_extraction_config_json_structure() { let config = ExtractionConfig::default(); let json = serde_json::to_value(&config).expect("Failed to serialize config"); let obj = json.as_object().expect("Config should serialize as object"); // Verify all expected fields are present as keys let expected_fields = vec![ "use_cache", "enable_quality_processing", "force_ocr", "max_concurrent_extractions", "result_format", "output_format", ]; for field in expected_fields { assert!(obj.contains_key(field), "Missing field in JSON: {}", field); } } #[test] fn test_extraction_config_values_are_correct_types() { let config = ExtractionConfig::default(); let json = serde_json::to_value(&config).expect("Failed to serialize config"); // Verify field types assert!( json.get("use_cache").expect("Value not found").is_boolean(), "use_cache should be boolean" ); assert!( json.get("enable_quality_processing") .expect("Value not found") .is_boolean(), "enable_quality_processing should be boolean" ); assert!( json.get("force_ocr").expect("Value not found").is_boolean(), "force_ocr should be boolean" ); assert!( json.get("result_format").expect("Value not found").is_string(), "result_format should be string" ); assert!( json.get("output_format").expect("Value not found").is_string(), "output_format should be string" ); } #[test] fn test_extraction_config_with_custom_values() { let config = ExtractionConfig { use_cache: false, force_ocr: true, max_concurrent_extractions: Some(8), ..ExtractionConfig::default() }; let json = serde_json::to_value(&config).expect("Failed to serialize"); assert_eq!(json.get("use_cache").expect("Value not found"), &json!(false)); assert_eq!(json.get("force_ocr").expect("Value not found"), &json!(true)); assert_eq!( json.get("max_concurrent_extractions").expect("Value not found"), &json!(8) ); } #[test] fn test_extraction_config_partial_json_parsing() { // Test that we can parse partial JSON and fields get defaults let partial_json = json!({ "use_cache": false, }); let config: ExtractionConfig = serde_json::from_value(partial_json).expect("Failed to parse partial config"); assert!(!config.use_cache, "Explicit use_cache should be respected"); assert!( config.enable_quality_processing, "Omitted enable_quality_processing should use default" ); assert!(!config.force_ocr, "Omitted force_ocr should use default"); } #[test] fn test_extraction_config_empty_json_uses_defaults() { // Empty object should use all defaults let empty_json = json!({}); let config: ExtractionConfig = serde_json::from_value(empty_json).expect("Failed to parse empty config"); let default_config = ExtractionConfig::default(); assert_eq!(config.use_cache, default_config.use_cache); assert_eq!( config.enable_quality_processing, default_config.enable_quality_processing ); assert_eq!(config.force_ocr, default_config.force_ocr); assert_eq!(config.result_format, default_config.result_format); assert_eq!(config.output_format, default_config.output_format); } #[test] fn test_extraction_config_output_format_valid_values() { // Test that output_format accepts valid values (case-insensitive) let json_plain = json!({"output_format": "plain"}); let config_plain: ExtractionConfig = serde_json::from_value(json_plain).expect("Failed to parse plain output_format"); assert_eq!(config_plain.output_format, OutputFormat::Plain); let json_markdown = json!({"output_format": "markdown"}); let config_markdown: ExtractionConfig = serde_json::from_value(json_markdown).expect("Failed to parse markdown output_format"); assert_eq!(config_markdown.output_format, OutputFormat::Markdown); let json_html = json!({"output_format": "html"}); let config_html: ExtractionConfig = serde_json::from_value(json_html).expect("Failed to parse html output_format"); assert_eq!(config_html.output_format, OutputFormat::Html); } #[test] fn test_extraction_config_result_format_valid_values() { // Test that result_format accepts valid values let json_unified = json!({"result_format": "unified"}); let config_unified: ExtractionConfig = serde_json::from_value(json_unified).expect("Failed to parse unified result_format"); // result_format uses types::ExtractionMode, not core::config::OutputFormat let _ = config_unified.result_format; } #[test] fn test_extraction_config_no_unknown_fields_in_default() { // Verify that the default config only has expected fields when serialized let config = ExtractionConfig::default(); let json = serde_json::to_value(&config).expect("Failed to serialize"); let obj = json.as_object().expect("Should be object"); // These are the fields we expect (some may be null based on feature flags) let expected_fields = vec![ "use_cache", "enable_quality_processing", "ocr", "force_ocr", "disable_ocr", "chunking", "content_filter", "images", "pdf_options", "token_reduction", "language_detection", "pages", "keywords", "postprocessor", "html_options", "html_output", "max_concurrent_extractions", "result_format", "output_format", "include_document_structure", "security_limits", "acceleration", "cache_namespace", "cache_ttl_secs", "concurrency", "email", "layout", "max_archive_depth", "max_embedded_file_bytes", "extraction_timeout_secs", "tree_sitter", "use_layout_for_markdown", ]; for key in obj.keys() { assert!( expected_fields.contains(&key.as_str()), "Unexpected field in config: {}", key ); } } #[test] fn test_extraction_config_needs_image_processing() { // Test the needs_image_processing helper method let mut config = ExtractionConfig::default(); // By default, should not need image processing assert!( !config.needs_image_processing(), "Default config should not need image processing" ); // With OCR enabled, should need image processing config.ocr = Some(kreuzberg::OcrConfig { backend: "tesseract".to_string(), language: "eng".to_string(), ..Default::default() }); assert!( config.needs_image_processing(), "Config with OCR should need image processing" ); // Reset for next test config.ocr = None; config.images = Some(kreuzberg::ImageExtractionConfig { extract_images: true, target_dpi: 150, max_image_dimension: 2000, inject_placeholders: true, auto_adjust_dpi: true, min_dpi: 72, max_dpi: 600, max_images_per_page: None, classify: true, include_page_rasters: false, run_ocr_on_images: true, ocr_text_only: false, append_ocr_text: false, }); assert!( config.needs_image_processing(), "Config with image extraction should need image processing" ); } #[test] fn test_output_format_serialization_lowercase() { // Verify that OutputFormat serializes to lowercase values let json = serde_json::json!({"output_format": "markdown"}); let config: ExtractionConfig = serde_json::from_value(json).expect("Failed to parse"); let reserialized = serde_json::to_value(&config).expect("Failed to reserialize"); // Should serialize back to lowercase assert_eq!(reserialized["output_format"], "markdown"); } #[test] fn test_extraction_config_field_presence_consistency() { // Test that all serialized configs have the expected top-level fields let config = ExtractionConfig::default(); let json1 = serde_json::to_value(&config).expect("Failed to serialize"); let config2 = ExtractionConfig { force_ocr: true, ..ExtractionConfig::default() }; let json2 = serde_json::to_value(&config2).expect("Failed to serialize"); // Both should have the same top-level keys let keys1: Vec<_> = json1.as_object().expect("Expected object value").keys().collect(); let keys2: Vec<_> = json2.as_object().expect("Expected object value").keys().collect(); assert_eq!(keys1.len(), keys2.len(), "Configs should have same number of keys"); } #[test] fn test_output_format_all_variants() { // Test all output format variants can be serialized and deserialized let formats = vec![ OutputFormat::Plain, OutputFormat::Markdown, OutputFormat::Html, OutputFormat::Djot, OutputFormat::Structured, ]; for fmt in &formats { let serialized = serde_json::to_value(fmt.clone()).expect("Failed to serialize"); let deserialized: OutputFormat = serde_json::from_value(serialized).expect("Failed to deserialize"); assert_eq!(*fmt, deserialized, "Format should survive roundtrip"); } } #[test] fn test_include_document_structure_default_is_false() { let config = ExtractionConfig::default(); assert!( !config.include_document_structure, "Default include_document_structure should be false" ); } #[test] fn test_include_document_structure_serialization_roundtrip() { // Test with include_document_structure explicitly set to true let config = ExtractionConfig { include_document_structure: true, ..ExtractionConfig::default() }; // Serialize to JSON let json_string = serde_json::to_string(&config).expect("Failed to serialize"); // Deserialize back let deserialized: ExtractionConfig = serde_json::from_str(&json_string).expect("Failed to deserialize config from JSON"); // Verify the field survived the roundtrip assert_eq!( config.include_document_structure, deserialized.include_document_structure, "include_document_structure should survive roundtrip" ); assert!( deserialized.include_document_structure, "Deserialized include_document_structure should be true" ); // Also test with false to ensure explicit false values are preserved let config_false = ExtractionConfig { include_document_structure: false, ..ExtractionConfig::default() }; let json_string_false = serde_json::to_string(&config_false).expect("Failed to serialize"); let deserialized_false: ExtractionConfig = serde_json::from_str(&json_string_false).expect("Failed to deserialize config from JSON"); assert!( !deserialized_false.include_document_structure, "Explicitly false include_document_structure should survive roundtrip" ); } // ── Tree-sitter API parity tests ─────────────────────────────────────── #[cfg(feature = "tree-sitter")] #[test] fn test_tree_sitter_process_config_defaults() { let config = TreeSitterProcessConfig::default(); assert!(config.structure, "Default structure should be true"); assert!(config.imports, "Default imports should be true"); assert!(config.exports, "Default exports should be true"); assert!(!config.comments, "Default comments should be false"); assert!(!config.docstrings, "Default docstrings should be false"); assert!(!config.symbols, "Default symbols should be false"); assert!(!config.diagnostics, "Default diagnostics should be false"); assert!(config.chunk_max_size.is_none(), "Default chunk_max_size should be None"); } #[cfg(feature = "tree-sitter")] #[test] fn test_tree_sitter_config_defaults() { let config = TreeSitterConfig::default(); assert!(config.cache_dir.is_none(), "Default cache_dir should be None"); assert!(config.languages.is_none(), "Default languages should be None"); assert!(config.groups.is_none(), "Default groups should be None"); // process sub-config should use its own defaults assert!(config.process.structure, "Default process.structure should be true"); } #[cfg(feature = "tree-sitter")] #[test] fn test_tree_sitter_config_serialization_roundtrip() { let config = TreeSitterConfig { enabled: true, cache_dir: Some("/tmp/grammars".into()), languages: Some(vec!["python".to_string(), "rust".to_string()]), groups: Some(vec!["web".to_string()]), process: TreeSitterProcessConfig { structure: true, imports: true, exports: false, comments: true, docstrings: true, symbols: false, diagnostics: false, chunk_max_size: Some(4000), content_mode: Default::default(), }, }; let json_string = serde_json::to_string(&config).expect("Failed to serialize"); let deserialized: TreeSitterConfig = serde_json::from_str(&json_string).expect("Failed to deserialize"); assert_eq!(config.cache_dir, deserialized.cache_dir); assert_eq!(config.languages, deserialized.languages); assert_eq!(config.groups, deserialized.groups); assert_eq!(config.process.structure, deserialized.process.structure); assert_eq!(config.process.exports, deserialized.process.exports); assert_eq!(config.process.comments, deserialized.process.comments); assert_eq!(config.process.docstrings, deserialized.process.docstrings); assert_eq!(config.process.chunk_max_size, deserialized.process.chunk_max_size); } #[cfg(feature = "tree-sitter")] #[test] fn test_tree_sitter_config_in_extraction_config_roundtrip() { let config = ExtractionConfig { tree_sitter: Some(TreeSitterConfig { languages: Some(vec!["python".to_string()]), ..TreeSitterConfig::default() }), ..ExtractionConfig::default() }; let json_string = serde_json::to_string(&config).expect("Failed to serialize"); let deserialized: ExtractionConfig = serde_json::from_str(&json_string).expect("Failed to deserialize"); let ts = deserialized .tree_sitter .expect("tree_sitter should be present after roundtrip"); assert_eq!(ts.languages, Some(vec!["python".to_string()])); assert!(ts.process.structure, "process.structure should default to true"); } #[cfg(feature = "tree-sitter")] #[test] fn test_tree_sitter_partial_json_parsing() { // Partial tree_sitter config with only some fields let json = json!({ "tree_sitter": { "languages": ["rust"], "process": { "comments": true } } }); let config: ExtractionConfig = serde_json::from_value(json).expect("Failed to parse"); let ts = config.tree_sitter.expect("tree_sitter should be present"); assert_eq!(ts.languages, Some(vec!["rust".to_string()])); assert!(ts.groups.is_none(), "Omitted groups should be None"); assert!(ts.process.comments, "Explicit comments=true should be respected"); // Omitted fields should use defaults assert!(ts.process.structure, "Omitted structure should default to true"); assert!(!ts.process.symbols, "Omitted symbols should default to false"); } #[cfg(feature = "tree-sitter")] #[test] fn test_tree_sitter_process_config_all_fields_serialized() { let config = TreeSitterProcessConfig { structure: false, imports: false, exports: false, comments: true, docstrings: true, symbols: true, diagnostics: true, chunk_max_size: Some(2000), content_mode: Default::default(), }; let json = serde_json::to_value(&config).expect("Failed to serialize"); let obj = json.as_object().expect("Should be object"); let expected_fields = [ "structure", "imports", "exports", "comments", "docstrings", "symbols", "diagnostics", "chunk_max_size", ]; for field in expected_fields { assert!(obj.contains_key(field), "Missing field: {field}"); } } #[cfg(feature = "tree-sitter")] #[test] fn test_format_metadata_code_variant_serialization() { use kreuzberg::types::metadata::FormatMetadata; // Use a default ProcessResult to test serialization shape without needing a grammar let result = kreuzberg::ProcessResult { language: "python".to_string(), ..kreuzberg::ProcessResult::default() }; let metadata = FormatMetadata::Code(result); let json = serde_json::to_value(&metadata).expect("Failed to serialize FormatMetadata::Code"); // Verify the tagged union format assert_eq!(json["format_type"], "code", "format_type tag should be 'code'"); assert_eq!(json["language"], "python", "language should be 'python'"); assert!(json.get("metrics").is_some(), "metrics should be present"); assert!( json["metrics"]["total_lines"].is_number(), "metrics.total_lines should be a number" ); } #[cfg(feature = "tree-sitter")] #[test] fn test_tslp_types_reexported() { // Verify that TSLP types are accessible through kreuzberg's public API let _: kreuzberg::ProcessConfig = kreuzberg::ProcessConfig::new("rust"); // StructureKind enum variants let _kind = kreuzberg::StructureKind::Function; let _kind = kreuzberg::StructureKind::Class; let _kind = kreuzberg::StructureKind::Method; // ExportKind enum variants let _kind = kreuzberg::ExportKind::Named; let _kind = kreuzberg::ExportKind::Default; // CommentKind enum variants let _kind = kreuzberg::CommentKind::Line; let _kind = kreuzberg::CommentKind::Block; // DiagnosticSeverity enum variants let _sev = kreuzberg::DiagnosticSeverity::Error; let _sev = kreuzberg::DiagnosticSeverity::Warning; // FileMetrics default let metrics = kreuzberg::FileMetrics::default(); assert_eq!(metrics.total_lines, 0); }