Files
fil/crates/kreuzberg/tests/security_validators.rs
Henrik Jess Nielsen b4c07d3693
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s
Nomad changes
2026-06-01 23:40:55 +02:00

192 lines
6.7 KiB
Rust
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
//! Hostile-input integration tests for the security validators wired into
//! extractors. Each test feeds a synthesised attack payload through the
//! public `extract_bytes_sync` entry point and asserts the extraction fails
//! with `KreuzbergError::Security`.
//!
//! The validators (`StringGrowthValidator`, `IterationValidator`,
//! `DepthValidator`, `EntityValidator`, `TableValidator`) are crate-private
//! helpers — the assertion target is the unified `Security` error variant
//! that bindings observe. All tests use `ExtractionConfig::security_limits`
//! to dial limits down so we can verify a bounded payload fires the cap
//! deterministically (rather than waiting for OOM at the production default).
//!
//! Coverage is bounded by what's actually wired today. This file grows as
//! more extractors get the `SecurityBudget` parameter threaded through their
//! parser loops.
use kreuzberg::core::config::ExtractionConfig;
use kreuzberg::core::extractor::extract_bytes_sync;
use kreuzberg::extractors::security::SecurityLimits;
/// Construct the reduced `SecurityLimits` shared by the tests in this file.
///
/// The depth, entity-length, content-size, iteration and table-cell caps are
/// set low enough that a small, bounded fixture can exceed them; the
/// archive-oriented fields are given much larger values.
fn tight_limits() -> SecurityLimits {
    SecurityLimits {
        // Low caps that the hostile XML fixtures are sized against.
        max_xml_depth: 16,
        max_nesting_depth: 16,
        max_entity_length: 32,
        max_content_size: 4 * 1024,
        max_iterations: 10_000,
        max_table_cells: 64,
        // Archive-oriented limits (500 MiB, ratio 100, 10 000 files).
        max_archive_size: 500 * 1024 * 1024,
        max_compression_ratio: 100,
        max_files_in_archive: 10_000,
    }
}
/// Return a default `ExtractionConfig` whose `security_limits` field is
/// overridden with the reduced caps from `tight_limits()`.
fn config_with_tight_limits() -> ExtractionConfig {
    let security_limits = Some(tight_limits());
    ExtractionConfig {
        security_limits,
        ..Default::default()
    }
}
/// Depth bomb: `<root>` wrapping 1000 nested `<a>` elements around a single
/// `leaf` text node. With `max_xml_depth = 16` the extraction must fail with
/// a `Security` error whose message mentions "nesting".
#[test]
fn xml_depth_bomb_fires_security_error() {
    const NESTED_LEVELS: usize = 1000;

    // Same document the nested open/close sequence spells out, built in one go.
    let payload = format!(
        "<root>{}leaf{}</root>",
        "<a>".repeat(NESTED_LEVELS),
        "</a>".repeat(NESTED_LEVELS)
    );

    let config = config_with_tight_limits();
    let err = extract_bytes_sync(payload.as_bytes(), "application/xml", &config)
        .expect_err("hostile XML must not extract successfully");

    // First the error variant, then the human-readable reason.
    assert!(
        matches!(err, kreuzberg::KreuzbergError::Security { .. }),
        "expected Security error, got {:?}",
        err
    );
    let message = err.to_string().to_lowercase();
    assert!(
        message.contains("nesting"),
        "expected nesting-too-deep message, got {}",
        err
    );
}
/// XML body with a single `<root>` element whose *text content* (64 bytes of
/// `x`) is longer than `max_entity_length = 32`. The element name itself is
/// short; this test checks that text nodes are subject to the entity-length
/// cap. The extraction must fail with a `Security` error whose message
/// mentions "entity".
#[test]
fn xml_oversize_text_fires_security_error() {
    // Twice the configured `max_entity_length`, yet far below the 4 KiB
    // `max_content_size`, so only the per-entity cap can be exceeded.
    const TEXT_LEN: usize = 64;

    let payload = format!("<root>{}</root>", "x".repeat(TEXT_LEN));

    let config = config_with_tight_limits();
    let err = extract_bytes_sync(payload.as_bytes(), "application/xml", &config)
        .expect_err("oversize text must not extract successfully");

    assert!(
        matches!(err, kreuzberg::KreuzbergError::Security { .. }),
        "expected Security error, got {:?}",
        err
    );
    let message = err.to_string().to_lowercase();
    assert!(
        message.contains("entity"),
        "expected entity-too-long message, got {}",
        err
    );
}
/// A 128-byte attribute value on `<root>` must be rejected under
/// `max_entity_length = 32`: the extraction has to fail with a `Security`
/// error whose message mentions "entity".
#[test]
fn xml_oversize_attribute_fires_security_error() {
    const ATTR_LEN: usize = 128;

    let payload = format!("<root attr=\"{}\">ok</root>", "v".repeat(ATTR_LEN));

    let config = config_with_tight_limits();
    let err = extract_bytes_sync(payload.as_bytes(), "application/xml", &config)
        .expect_err("oversize attribute must not extract successfully");

    assert!(
        matches!(err, kreuzberg::KreuzbergError::Security { .. }),
        "expected Security error, got {:?}",
        err
    );
    let message = err.to_string().to_lowercase();
    assert!(
        message.contains("entity"),
        "expected entity-too-long message, got {}",
        err
    );
}
/// Cumulative-growth case: no single text node is oversized, but the sum of
/// all text nodes exceeds `max_content_size = 4 KiB`. The extraction must
/// fail with a `Security` error whose message mentions "content".
#[test]
fn xml_string_growth_fires_security_error() {
    // 256 nodes x 30 bytes = 7680 bytes of text in total (> 4096), while each
    // node on its own stays under `max_entity_length = 32`.
    const NODE_COUNT: usize = 256;
    const NODE_TEXT_LEN: usize = 30;

    let node = format!("<n>{}</n>", "x".repeat(NODE_TEXT_LEN));
    let payload = format!("<root>{}</root>", node.repeat(NODE_COUNT));

    let config = config_with_tight_limits();
    let err = extract_bytes_sync(payload.as_bytes(), "application/xml", &config)
        .expect_err("oversized cumulative content must not extract successfully");

    assert!(
        matches!(err, kreuzberg::KreuzbergError::Security { .. }),
        "expected Security error, got {:?}",
        err
    );
    let message = err.to_string().to_lowercase();
    assert!(
        message.contains("content"),
        "expected content-too-large message, got {}",
        err
    );
}
/// XML body with more parser events than `max_iterations = 10_000`.
///
/// The fixture is `<root>` containing 15 000 self-closing `<n/>` elements.
/// Each `<n/>` is expected to surface as a single Empty event, so the
/// document yields roughly 15 000 events, well above the cap. The extraction
/// must fail with a `Security` error.
#[test]
fn xml_iteration_bomb_fires_security_error() {
    // Self-closing elements maximise event count per byte: 4 bytes each.
    const EMPTY_ELEMENTS: usize = 15_000;

    let payload = format!("<root>{}</root>", "<n/>".repeat(EMPTY_ELEMENTS));

    let config = config_with_tight_limits();
    let err = extract_bytes_sync(payload.as_bytes(), "application/xml", &config)
        .expect_err("oversized iteration count must not extract successfully");

    assert!(
        matches!(err, kreuzberg::KreuzbergError::Security { .. }),
        "expected Security error, got {:?}",
        err
    );

    // A content-size message is accepted as well as an iteration message.
    // NOTE(review): presumably because the ~60 KB payload may hit the 4 KiB
    // `max_content_size` cap before the iteration cap — confirm against the
    // extractor.
    let lowered = err.to_string().to_lowercase();
    assert!(
        lowered.contains("iteration") || lowered.contains("content"),
        "expected iteration-or-content error, got {}",
        err
    );
}
/// Control case: a small, well-formed document that stays under every cap in
/// `tight_limits()` must still extract, and its text must appear in the
/// result's `content`.
#[test]
fn xml_benign_input_extracts_successfully() {
    const BENIGN_XML: &str = "<root><greeting>hello world</greeting></root>";

    let config = config_with_tight_limits();
    let extraction = extract_bytes_sync(BENIGN_XML.as_bytes(), "application/xml", &config)
        .expect("benign XML must extract under the same tight limits");

    let content = &extraction.content;
    assert!(
        content.contains("hello world"),
        "expected greeting in content, got: {}",
        content
    );
}