//! Hostile-input integration tests for the security validators wired into //! extractors. Each test feeds a synthesised attack payload through the //! public `extract_bytes_sync` entry point and asserts the extraction fails //! with `KreuzbergError::Security`. //! //! The validators (`StringGrowthValidator`, `IterationValidator`, //! `DepthValidator`, `EntityValidator`, `TableValidator`) are crate-private //! helpers — the assertion target is the unified `Security` error variant //! that bindings observe. All tests use `ExtractionConfig::security_limits` //! to dial limits down so we can verify a bounded payload fires the cap //! deterministically (rather than waiting for OOM at the production default). //! //! Coverage is bounded by what's actually wired today. This file grows as //! more extractors get the `SecurityBudget` parameter threaded through their //! parser loops. use kreuzberg::core::config::ExtractionConfig; use kreuzberg::core::extractor::extract_bytes_sync; use kreuzberg::extractors::security::SecurityLimits; /// Build a `SecurityLimits` with everything dialled to a small cap so a /// bounded hostile fixture can deterministically trip exactly one validator. fn tight_limits() -> SecurityLimits { SecurityLimits { max_archive_size: 500 * 1024 * 1024, max_compression_ratio: 100, max_files_in_archive: 10_000, max_nesting_depth: 16, max_entity_length: 32, max_content_size: 4 * 1024, max_iterations: 10_000, max_xml_depth: 16, max_table_cells: 64, } } fn config_with_tight_limits() -> ExtractionConfig { ExtractionConfig { security_limits: Some(tight_limits()), ..ExtractionConfig::default() } } /// Plain XML body: 1000 nested `` elements should trip `max_xml_depth = 16`. #[test] fn xml_depth_bomb_fires_security_error() { let mut payload = String::from(""); for _ in 0..1000 { payload.push_str(""); } payload.push_str("leaf"); for _ in 0..1000 { payload.push_str(""); } payload.push_str(""); let cfg = config_with_tight_limits(); let err = extract_bytes_sync(payload.as_bytes(), "application/xml", &cfg) .expect_err("hostile XML must not extract successfully"); assert!( matches!(err, kreuzberg::KreuzbergError::Security { .. }), "expected Security error, got {:?}", err ); assert!( err.to_string().to_lowercase().contains("nesting"), "expected nesting-too-deep message, got {}", err ); } /// XML body with one element whose name expands beyond `max_entity_length = 32` /// once read as text content. Tests that text-content entities are checked. #[test] fn xml_oversize_text_fires_security_error() { let huge_text = "x".repeat(64); let payload = format!("{}", huge_text); let cfg = config_with_tight_limits(); let err = extract_bytes_sync(payload.as_bytes(), "application/xml", &cfg) .expect_err("oversize text must not extract successfully"); assert!( matches!(err, kreuzberg::KreuzbergError::Security { .. }), "expected Security error, got {:?}", err ); assert!( err.to_string().to_lowercase().contains("entity"), "expected entity-too-long message, got {}", err ); } /// XML body with an attribute value longer than `max_entity_length = 32`. #[test] fn xml_oversize_attribute_fires_security_error() { let huge_attr = "v".repeat(128); let payload = format!("ok", huge_attr); let cfg = config_with_tight_limits(); let err = extract_bytes_sync(payload.as_bytes(), "application/xml", &cfg) .expect_err("oversize attribute must not extract successfully"); assert!( matches!(err, kreuzberg::KreuzbergError::Security { .. }), "expected Security error, got {:?}", err ); assert!( err.to_string().to_lowercase().contains("entity"), "expected entity-too-long message, got {}", err ); } /// XML body whose accumulated text emit exceeds `max_content_size = 4 KiB`. /// Each text node alone fits under `max_entity_length`, but the running total /// trips `StringGrowthValidator`. #[test] fn xml_string_growth_fires_security_error() { // 256 text nodes × 30 bytes each = 7680 bytes total content (> 4 KiB cap) // each individual node is 30 bytes (< max_entity_length = 32). let mut payload = String::from(""); for _ in 0..256 { payload.push_str(""); payload.push_str(&"x".repeat(30)); payload.push_str(""); } payload.push_str(""); let cfg = config_with_tight_limits(); let err = extract_bytes_sync(payload.as_bytes(), "application/xml", &cfg) .expect_err("oversized cumulative content must not extract successfully"); assert!( matches!(err, kreuzberg::KreuzbergError::Security { .. }), "expected Security error, got {:?}", err ); assert!( err.to_string().to_lowercase().contains("content"), "expected content-too-large message, got {}", err ); } /// XML body with more events than `max_iterations = 10_000`. Each empty element /// is one Start + one End event, so 6 000 empties = 12 000 events. #[test] fn xml_iteration_bomb_fires_security_error() { // Use empty self-closing elements to maximise event count per byte. // Each `` is 4 bytes and produces exactly one Empty event. let mut payload = String::from(""); for _ in 0..15_000 { payload.push_str(""); } payload.push_str(""); let cfg = config_with_tight_limits(); let err = extract_bytes_sync(payload.as_bytes(), "application/xml", &cfg) .expect_err("oversized iteration count must not extract successfully"); assert!( matches!(err, kreuzberg::KreuzbergError::Security { .. }), "expected Security error, got {:?}", err ); let msg = err.to_string().to_lowercase(); assert!( msg.contains("iteration") || msg.contains("content"), "expected iteration-or-content error, got {}", err ); } /// Sanity check: a benign small XML document under all caps extracts /// successfully with the same `tight_limits()` configuration. #[test] fn xml_benign_input_extracts_successfully() { let payload = "hello world"; let cfg = config_with_tight_limits(); let result = extract_bytes_sync(payload.as_bytes(), "application/xml", &cfg) .expect("benign XML must extract under the same tight limits"); assert!( result.content.contains("hello world"), "expected greeting in content, got: {}", result.content ); }