Files
fil/crates/kreuzberg/tests/config_behavioral.rs

436 lines
14 KiB
Rust
Raw Permalink Normal View History

2026-06-01 23:40:55 +02:00
//! Config behavioral verification tests
//!
//! These tests verify that configuration options actually affect extraction behavior,
//! not just that they serialize correctly.
//!
//! Unlike serialization tests that only check if configs deserialize, these tests verify
//! that the configuration options actually influence the extraction process and produce
//! observable differences in the output.
#[cfg(feature = "chunking")]
use kreuzberg::core::config::ChunkingConfig;
use kreuzberg::core::config::ExtractionConfig;
use kreuzberg::core::config::OutputFormat;
use kreuzberg::core::extractor::extract_bytes;
use kreuzberg::types::ResultFormat;
mod helpers;
/// Verifies that `OutputFormat::Plain` yields text free of markdown/HTML heading markup.
///
/// The input is deliberately plain text rather than HTML: HTML extractors often convert
/// to markdown internally, which would obscure whether `output_format` is being honoured.
#[tokio::test]
async fn test_output_format_plain_produces_plain() {
    let input: &[u8] = b"Title\n\nParagraph with bold text.";
    let config = ExtractionConfig {
        output_format: OutputFormat::Plain,
        ..Default::default()
    };

    let extracted = extract_bytes(input, "text/plain", &config)
        .await
        .expect("Should extract successfully");
    let body = &extracted.content;

    // Neither a markdown header marker nor an HTML heading tag may appear.
    let has_markup = body.contains("# ") || body.contains("<h1>");
    assert!(
        !has_markup,
        "Plain format should not contain markdown headers or HTML tags, got: {}",
        body
    );

    // At least one word from the source text must survive extraction.
    let has_source_text = ["Title", "Paragraph"]
        .iter()
        .any(|needle| body.contains(*needle));
    assert!(has_source_text, "Should still contain extracted text content");
}
/// Verifies that `OutputFormat::Markdown` applied to HTML input yields markdown markers.
///
/// Only compiled when the `html` feature is enabled.
#[tokio::test]
#[cfg(feature = "html")]
async fn test_output_format_markdown_produces_markdown() {
    let markup = b"<h1>Title</h1><p>Paragraph with <strong>bold</strong> text.</p>";
    let config = ExtractionConfig {
        output_format: OutputFormat::Markdown,
        ..Default::default()
    };

    let extracted = extract_bytes(markup, "text/html", &config)
        .await
        .expect("Should extract successfully");
    let rendered = &extracted.content;

    // Accepted evidence of markdown: a header marker, strong emphasis, or a lone asterisk.
    // NOTE(review): the "*" probe subsumes the "**" probe, so any single asterisk passes.
    let markers = ["# ", "**", "*"];
    let has_markdown = markers.iter().any(|marker| rendered.contains(*marker));
    assert!(
        has_markdown,
        "Markdown format should contain # headers or ** bold, got: {}",
        rendered
    );
}
/// Verifies that `OutputFormat::Html` produces non-empty output without a `<script>` tag.
#[tokio::test]
async fn test_output_format_html_produces_html() {
    let input = "Title\n\nParagraph with bold text.".as_bytes();
    let config = ExtractionConfig {
        output_format: OutputFormat::Html,
        ..Default::default()
    };

    let extracted = extract_bytes(input, "text/plain", &config)
        .await
        .expect("Should extract successfully");
    let rendered = &extracted.content;

    // The rendered output must not introduce a script tag.
    let has_script_tag = rendered.contains("<script>");
    assert!(!has_script_tag, "HTML format should be safe from injection");

    let produced_nothing = rendered.is_empty();
    assert!(!produced_nothing, "Should produce content in HTML format");
}
/// Verifies that `ResultFormat::Unified` puts the text in `content` and leaves
/// `elements` absent or empty.
#[tokio::test]
async fn test_result_format_unified_structure() {
    let input = "Sample content".as_bytes();
    let config = ExtractionConfig {
        result_format: ResultFormat::Unified,
        ..Default::default()
    };

    let extracted = extract_bytes(input, "text/plain", &config)
        .await
        .expect("Should extract successfully");

    // The single content field carries the extracted text.
    assert!(!extracted.content.is_empty(), "Unified format should have content");

    // Either no element list at all, or one with nothing in it.
    let without_elements = extracted
        .elements
        .as_ref()
        .is_none_or(|elements| elements.is_empty());
    assert!(without_elements, "Unified format should not have elements");
}
/// Verifies that `ResultFormat::ElementBased` populates `elements` with non-empty entries.
///
/// The test fails if `elements` is `None`, if the list is empty, or if any element has
/// empty text. A missing `elements` field is treated as a failure rather than skipped,
/// so the test cannot pass without exercising the element checks.
#[tokio::test]
async fn test_result_format_element_based_structure() {
    let text = "First paragraph here.\n\nSecond paragraph with more content.";
    let config = ExtractionConfig {
        result_format: ResultFormat::ElementBased,
        ..Default::default()
    };

    let result = extract_bytes(text.as_bytes(), "text/plain", &config)
        .await
        .expect("Should extract successfully");

    // Element-based format must produce an elements array.
    let elements = result
        .elements
        .as_ref()
        .expect("Element-based format should populate the elements field");
    assert!(!elements.is_empty(), "Element-based format should have elements");

    // Every element must carry some text.
    for element in elements {
        assert!(!element.text.is_empty(), "Elements should have non-empty text");
    }
}
/// Verifies that `max_characters` bounds the size of every produced chunk.
///
/// A ~2500 character input is chunked with `max_characters = 100` and `overlap = 20`.
/// The test requires chunks to be present, requires more than one chunk, and requires
/// each chunk to be no longer than `max_characters + overlap`.
///
/// Only compiled when the `chunking` feature is enabled.
#[tokio::test]
#[cfg(feature = "chunking")]
async fn test_chunking_max_chars_limits_chunk_size() {
    let long_text = "word ".repeat(500); // ~2500 characters
    let config = ExtractionConfig {
        chunking: Some(ChunkingConfig {
            max_characters: 100,
            overlap: 20,
            ..Default::default()
        }),
        ..Default::default()
    };

    let result = extract_bytes(long_text.as_bytes(), "text/plain", &config)
        .await
        .expect("Should extract successfully");

    let chunks = result.chunks.expect("Chunking should produce chunks");
    assert!(chunks.len() > 1, "Long text should produce multiple chunks");

    // Upper bound used by this test: the configured maximum plus the configured overlap.
    let size_limit = 100 + 20;
    for (i, chunk) in chunks.iter().enumerate() {
        // Measure in characters rather than bytes, matching the `max_characters` name.
        // The input here is ASCII, so both measures agree.
        let length = chunk.content.chars().count();
        assert!(
            length <= size_limit,
            "Chunk {} exceeds max_chars + overlap: length = {}",
            i,
            length
        );
    }
}
/// Verifies that a non-zero `overlap` makes adjacent chunks share text.
///
/// A ~480 character input is chunked with `max_characters = 50` and `overlap = 15`.
/// The test requires at least two chunks. It then requires that the end of the first
/// chunk and the start of the second chunk share a run of text containing at least one
/// non-whitespace character.
///
/// Only compiled when the `chunking` feature is enabled.
#[tokio::test]
#[cfg(feature = "chunking")]
async fn test_chunking_overlap_creates_overlap() {
    /// Returns the longest prefix of `later` that is also a suffix of `earlier`
    /// (empty when there is none). Candidate ends are taken from `char_indices`,
    /// so slicing always lands on a character boundary.
    fn shared_boundary<'a>(earlier: &str, later: &'a str) -> &'a str {
        later
            .char_indices()
            .map(|(start, ch)| start + ch.len_utf8())
            .take_while(|&end| end <= earlier.len())
            .filter(|&end| earlier.ends_with(&later[..end]))
            .last()
            .map_or("", |end| &later[..end])
    }

    let text = "First sentence. ".repeat(30); // ~480 characters
    let config = ExtractionConfig {
        chunking: Some(ChunkingConfig {
            max_characters: 50,
            overlap: 15,
            ..Default::default()
        }),
        ..Default::default()
    };

    let result = extract_bytes(text.as_bytes(), "text/plain", &config)
        .await
        .expect("Should extract successfully");

    // Missing or single chunks are failures, not a reason to skip the overlap check.
    let chunks = result.chunks.expect("Chunking with overlap should produce chunks");
    assert!(
        chunks.len() >= 2,
        "Text longer than max_characters should produce at least two chunks, got {}",
        chunks.len()
    );

    // The tail of the first chunk must reappear at the head of the second.
    let shared = shared_boundary(&chunks[0].content, &chunks[1].content);
    let overlap_found = shared.chars().any(|c| !c.is_whitespace());
    assert!(
        overlap_found,
        "Adjacent chunks should have overlapping non-whitespace text, got {:?} followed by {:?}",
        chunks[0].content,
        chunks[1].content
    );
}
/// Verifies that setting `chunking` to `None` yields a result with no chunks.
#[tokio::test]
async fn test_chunking_disabled_produces_no_chunks() {
    let input = "word ".repeat(500);
    let config = ExtractionConfig {
        chunking: None,
        ..Default::default()
    };

    let extracted = extract_bytes(input.as_bytes(), "text/plain", &config)
        .await
        .expect("Should extract successfully");

    let chunks = extracted.chunks;
    assert!(chunks.is_none(), "Chunking disabled should produce no chunks");
}
/// Verifies that two extractions of the same input with `use_cache: true` return
/// identical content.
#[tokio::test]
async fn test_cache_enabled_allows_caching() {
    let input = "Test content for caching".as_bytes();
    let config = ExtractionConfig {
        use_cache: true,
        ..Default::default()
    };

    // Run the identical extraction twice, one after the other.
    let first = extract_bytes(input, "text/plain", &config)
        .await
        .expect("First extraction should succeed");
    let second = extract_bytes(input, "text/plain", &config)
        .await
        .expect("Second extraction should succeed");

    // Both runs must yield the same content.
    assert_eq!(
        first.content, second.content,
        "Cache enabled should produce consistent results"
    );
}
/// Verifies that extraction with `use_cache: false` succeeds and returns content.
#[tokio::test]
async fn test_cache_disabled_does_not_crash() {
    let input = "Test content without caching".as_bytes();
    let config = ExtractionConfig {
        use_cache: false,
        ..Default::default()
    };

    let extracted = extract_bytes(input, "text/plain", &config)
        .await
        .expect("Extraction with cache disabled should succeed");

    let produced_nothing = extracted.content.is_empty();
    assert!(!produced_nothing, "Should still extract content");
}
/// Verifies that `enable_quality_processing: true` populates `quality_score`.
///
/// Only compiled when the `quality` feature is enabled.
#[tokio::test]
#[cfg(feature = "quality")]
async fn test_quality_processing_enabled_produces_score() {
    let input =
        "This is a well-structured document. It has proper sentences. And good formatting.".as_bytes();
    let config = ExtractionConfig {
        enable_quality_processing: true,
        ..Default::default()
    };

    let extracted = extract_bytes(input, "text/plain", &config)
        .await
        .expect("Should extract successfully");

    // The score is expected in the dedicated `quality_score` field.
    let score = &extracted.quality_score;
    assert!(
        score.is_some(),
        "Quality processing enabled should produce quality_score"
    );
}
/// Verifies that `enable_quality_processing: false` leaves `quality_score` unset.
///
/// Only compiled when the `quality` feature is enabled.
#[tokio::test]
#[cfg(feature = "quality")]
async fn test_quality_processing_disabled_no_score() {
    let input = "This is a document.".as_bytes();
    let config = ExtractionConfig {
        enable_quality_processing: false,
        ..Default::default()
    };

    let extracted = extract_bytes(input, "text/plain", &config)
        .await
        .expect("Should extract successfully");

    let score = &extracted.quality_score;
    assert!(
        score.is_none(),
        "Quality processing disabled should not produce quality_score"
    );
}
/// Verifies that `OutputFormat::Markdown` and `ResultFormat::ElementBased` work together:
/// elements are produced and the content carries no `<p>` tags.
///
/// Only compiled when the `html` feature is enabled.
#[tokio::test]
#[cfg(feature = "html")]
async fn test_output_format_with_element_based() {
    let markup = b"<p>First paragraph</p><p>Second paragraph</p>";
    let config = ExtractionConfig {
        output_format: OutputFormat::Markdown,
        result_format: ResultFormat::ElementBased,
        ..Default::default()
    };

    let extracted = extract_bytes(markup, "text/html", &config)
        .await
        .expect("Should extract successfully");

    // The element list must be present.
    let has_elements = extracted.elements.is_some();
    assert!(has_elements, "ElementBased format should produce elements");

    // The paragraph tags from the input must not leak into the content.
    let leaked_paragraph_tag = extracted.content.contains("<p>");
    assert!(!leaked_paragraph_tag, "Output format should not contain HTML tags");
}
/// Verifies that no chunk grows beyond `max_characters + overlap`.
///
/// A 200-character run of `x` is chunked with `max_characters = 60` and `overlap = 10`.
/// The test requires chunks to be present and each chunk to be at most 70 long.
/// Absent chunks are a failure, so the size check cannot be skipped silently.
///
/// Only compiled when the `chunking` feature is enabled.
#[tokio::test]
#[cfg(feature = "chunking")]
async fn test_chunking_overlap_maximum() {
    let text = "x".repeat(200); // Simple repeated character
    let config = ExtractionConfig {
        chunking: Some(ChunkingConfig {
            max_characters: 60,
            overlap: 10,
            ..Default::default()
        }),
        ..Default::default()
    };

    let result = extract_bytes(text.as_bytes(), "text/plain", &config)
        .await
        .expect("Should extract successfully");

    let chunks = result
        .chunks
        .expect("Chunking enabled should produce chunks");

    // Upper bound used by this test: the configured maximum plus the configured overlap.
    let size_limit = 60 + 10;
    for (i, chunk) in chunks.iter().enumerate() {
        let size = chunk.content.len();
        assert!(
            size <= size_limit,
            "Chunk {} size {} exceeds max_chars (60) + max_overlap (10)",
            i,
            size
        );
    }
}
/// Exercises several options at once on a ~2000 character input: plain output,
/// chunking, caching and quality processing.
///
/// Asserts that chunks are present, that content is non-empty, and (when the `quality`
/// feature is compiled in) that a quality score is present.
///
/// Only compiled when the `chunking` feature is enabled.
#[tokio::test]
#[cfg(feature = "chunking")]
async fn test_large_document_with_combined_config() {
    let input = "This is a paragraph. ".repeat(100); // ~2000 characters
    let chunking = ChunkingConfig {
        max_characters: 200,
        overlap: 30,
        ..Default::default()
    };
    let config = ExtractionConfig {
        output_format: OutputFormat::Plain,
        chunking: Some(chunking),
        use_cache: true,
        enable_quality_processing: true,
        ..Default::default()
    };

    let extracted = extract_bytes(input.as_bytes(), "text/plain", &config)
        .await
        .expect("Should extract successfully");

    // The input is far larger than one chunk, so chunks are expected.
    let has_chunks = extracted.chunks.is_some();
    assert!(has_chunks, "Should produce chunks for large text");

    // The score is only checked when the quality feature is built.
    #[cfg(feature = "quality")]
    {
        let has_score = extracted.quality_score.is_some();
        assert!(has_score, "Should have quality score");
    }

    // Plain-format content must still be produced.
    let produced_nothing = extracted.content.is_empty();
    assert!(!produced_nothing, "Should have content");
}
/// Verifies that `ResultFormat::ElementBased` still yields elements when
/// `extract_pages` is explicitly set to `false` (#908).
///
/// The configuration is intentionally contradictory: the normalization logic is expected
/// to override `extract_pages` to `true` because element-based output needs page numbers.
/// This test only asserts that extraction succeeds and that `elements` is present; it
/// does not inspect the page numbers themselves.
#[tokio::test]
async fn test_element_based_auto_normalizes_page_extraction() {
    let input = "Page 1 content.\n\nPage 2 content.".as_bytes();

    // Explicitly switch page extraction off while asking for element-based output.
    let pages = kreuzberg::core::config::PageConfig {
        extract_pages: false,
        ..Default::default()
    };
    let config = ExtractionConfig {
        result_format: ResultFormat::ElementBased,
        pages: Some(pages),
        ..Default::default()
    };

    let extracted = extract_bytes(input, "text/plain", &config)
        .await
        .expect("Should extract successfully");

    // For plain text, page numbers are often 1 unless explicit markers are found, so the
    // observable signal here is a successful extraction with elements present.
    let has_elements = extracted.elements.is_some();
    assert!(has_elements, "Should have elements");
}