//! Comprehensive concurrency and parallelism stress tests.
//!
//! Validates that the Kreuzberg core handles concurrent operations correctly:
//! - Parallel extractions don't interfere with each other
//! - OCR processing is thread-safe and efficient
//! - Pipeline processing works correctly under concurrent load
//! - Cache access is safe with multiple readers/writers
//! - Registry access is thread-safe
//!
//! These tests ensure production workloads with high concurrency work correctly.
|
|
|
|
use async_trait::async_trait;
|
|
use kreuzberg::Result;
|
|
use kreuzberg::core::config::{ExtractionConfig, PostProcessorConfig};
|
|
use kreuzberg::core::extractor::{batch_extract_bytes, extract_bytes};
|
|
use kreuzberg::core::pipeline::run_pipeline;
|
|
use kreuzberg::internal::{ElementKind, InternalDocument, InternalElement};
|
|
use kreuzberg::plugins::registry::{get_document_extractor_registry, get_post_processor_registry};
|
|
use kreuzberg::plugins::{Plugin, PostProcessor, ProcessingStage};
|
|
use kreuzberg::types::ExtractionResult;
|
|
use std::sync::Arc;
|
|
|
|
#[cfg(feature = "ocr")]
|
|
use kreuzberg::core::config::OcrConfig;
|
|
|
|
#[cfg(feature = "ocr")]
|
|
use kreuzberg::core::extractor::extract_file_sync;
|
|
use std::time::Duration;
|
|
use tokio::time::timeout;
|
|
|
|
mod helpers;
|
|
|
|
/// Returns `value` with every trailing `\n` and `\r` character removed.
///
/// Leading and interior line breaks are left untouched; the returned slice
/// borrows from `value`.
fn trim_trailing_newlines(value: &str) -> &str {
    value.trim_end_matches(|ch: char| matches!(ch, '\n' | '\r'))
}
|
|
|
|
fn assert_text_content(actual: &str, expected: &str) {
|
|
assert_eq!(
|
|
trim_trailing_newlines(actual),
|
|
expected,
|
|
"Content mismatch after trimming trailing newlines"
|
|
);
|
|
}
|
|
|
|
/// Test many concurrent extractions of different MIME types.
|
|
///
|
|
/// Validates that:
|
|
/// - Registry lookups don't block each other unnecessarily
|
|
/// - Different extractors can run in parallel
|
|
/// - No data races or corruption
|
|
#[tokio::test]
|
|
async fn test_concurrent_extractions_mixed_formats() {
|
|
let config = ExtractionConfig::default();
|
|
|
|
#[allow(unused_mut)]
|
|
let mut test_cases = vec![
|
|
(b"Plain text content" as &[u8], "text/plain"),
|
|
(b"{\"key\": \"value\"}", "application/json"),
|
|
(b"# Markdown\n\nContent here", "text/markdown"),
|
|
];
|
|
|
|
#[cfg(feature = "xml")]
|
|
test_cases.push((b"<root><item>XML content</item></root>" as &[u8], "application/xml"));
|
|
|
|
let mut handles = vec![];
|
|
for _ in 0..10 {
|
|
for (data, mime_type) in &test_cases {
|
|
let config = config.clone();
|
|
let data = data.to_vec();
|
|
let mime_type = mime_type.to_string();
|
|
|
|
handles.push(tokio::spawn(
|
|
async move { extract_bytes(&data, &mime_type, &config).await },
|
|
));
|
|
}
|
|
}
|
|
|
|
let results = timeout(Duration::from_secs(30), async {
|
|
let mut results = vec![];
|
|
for handle in handles {
|
|
results.push(handle.await.expect("Task should not panic"));
|
|
}
|
|
results
|
|
})
|
|
.await
|
|
.expect("All extractions should complete within 30s");
|
|
|
|
for result in results {
|
|
assert!(
|
|
result.is_ok(),
|
|
"Concurrent extraction should succeed: {:?}",
|
|
result.err()
|
|
);
|
|
}
|
|
}
|
|
|
|
/// Test concurrent batch extractions.
|
|
///
|
|
/// Validates that batch processing correctly handles parallelism internally.
|
|
#[tokio::test]
|
|
async fn test_concurrent_batch_extractions() {
|
|
let config = ExtractionConfig::default();
|
|
|
|
let contents: Vec<Vec<u8>> = (0..20).map(|i| format!("Content {}", i).into_bytes()).collect();
|
|
|
|
let mut handles = vec![];
|
|
for _ in 0..5 {
|
|
let config = config.clone();
|
|
let contents_clone = contents.clone();
|
|
|
|
handles.push(tokio::spawn(async move {
|
|
let owned_data: Vec<kreuzberg::BatchBytesItem> = contents_clone
|
|
.iter()
|
|
.map(|c| kreuzberg::BatchBytesItem {
|
|
content: c.to_vec(),
|
|
mime_type: "text/plain".to_string(),
|
|
config: None,
|
|
})
|
|
.collect();
|
|
batch_extract_bytes(owned_data, &config).await
|
|
}));
|
|
}
|
|
|
|
for handle in handles {
|
|
let results = handle.await.expect("Task should not panic");
|
|
assert!(results.is_ok(), "Batch extraction should succeed");
|
|
let results = results.expect("Operation failed");
|
|
assert_eq!(results.len(), 20, "Should return all results");
|
|
}
|
|
}
|
|
|
|
/// Test concurrent extractions with caching enabled.
|
|
///
|
|
/// Validates that:
|
|
/// - Cache reads/writes are thread-safe
|
|
/// - No cache corruption under concurrent access
|
|
/// - Cache hits work correctly across threads
|
|
#[tokio::test]
|
|
async fn test_concurrent_extractions_with_cache() {
|
|
let config = ExtractionConfig {
|
|
use_cache: true,
|
|
postprocessor: Some(PostProcessorConfig {
|
|
enabled: false,
|
|
enabled_processors: None,
|
|
disabled_processors: None,
|
|
enabled_set: None,
|
|
disabled_set: None,
|
|
}),
|
|
..Default::default()
|
|
};
|
|
|
|
let test_data = b"Cached content for concurrent access test";
|
|
|
|
let _ = extract_bytes(test_data, "text/plain", &config)
|
|
.await
|
|
.expect("Async operation failed");
|
|
|
|
let mut handles = vec![];
|
|
for _ in 0..100 {
|
|
let config = config.clone();
|
|
let data = test_data.to_vec();
|
|
|
|
handles.push(tokio::spawn(async move {
|
|
extract_bytes(&data, "text/plain", &config).await
|
|
}));
|
|
}
|
|
|
|
let expected_content = "Cached content for concurrent access test";
|
|
for handle in handles {
|
|
let result = handle.await.expect("Task should not panic");
|
|
assert!(result.is_ok(), "Cache read should succeed");
|
|
let extraction = result.expect("Operation failed");
|
|
assert_text_content(&extraction.content, expected_content);
|
|
}
|
|
}
|
|
|
|
/// Runs OCR on the same image from many blocking tasks at once.
///
/// Twenty `spawn_blocking` tasks call `extract_file_sync` on
/// `images/ocr_image.jpg` with the tesseract backend and caching enabled.
/// The test requires that:
/// - all tasks are joined within 60 seconds without panicking,
/// - every extraction succeeds with non-empty text, and
/// - every task produces exactly the same text as the first one.
///
/// The test returns early on Windows and when the image fixture is missing.
#[cfg(feature = "ocr")]
#[tokio::test]
async fn test_concurrent_ocr_processing() {
    use helpers::{get_test_file_path, skip_if_missing};

    const IMAGE: &str = "images/ocr_image.jpg";

    if cfg!(windows) {
        return;
    }

    if skip_if_missing(IMAGE) {
        tracing::debug!("Skipping concurrent OCR test: test file not available");
        return;
    }

    let ocr_settings = OcrConfig {
        backend: "tesseract".to_string(),
        language: "eng".to_string(),
        ..Default::default()
    };
    let config = ExtractionConfig {
        ocr: Some(ocr_settings),
        force_ocr: false,
        use_cache: true,
        ..Default::default()
    };

    let image_path = get_test_file_path(IMAGE);

    // `extract_file_sync` is synchronous, so each call runs on tokio's
    // blocking pool.
    let tasks: Vec<_> = (0..20)
        .map(|_| {
            let path = image_path.clone();
            let task_config = config.clone();

            tokio::task::spawn_blocking(move || extract_file_sync(&path, None, &task_config))
        })
        .collect();

    let join_all = async {
        let mut outcomes = Vec::with_capacity(tasks.len());
        for task in tasks {
            outcomes.push(task.await.expect("Task should not panic"));
        }
        outcomes
    };
    let outcomes = timeout(Duration::from_secs(60), join_all)
        .await
        .expect("All OCR operations should complete within 60s");

    let texts: Vec<_> = outcomes
        .into_iter()
        .map(|outcome| {
            assert!(outcome.is_ok(), "OCR should succeed: {:?}", outcome.err());
            let extraction = outcome.expect("Operation failed");
            assert!(!extraction.content.is_empty(), "OCR should extract text");
            extraction.content
        })
        .collect();

    // Every run must agree with the first one.
    let reference = &texts[0];
    for text in texts.iter().skip(1) {
        assert_eq!(text, reference, "Concurrent OCR should produce identical results");
    }
}
|
|
|
|
/// Stresses OCR with caching enabled from 50 OS threads.
///
/// One extraction of `images/ocr_image.jpg` runs first; then 50 threads repeat
/// it and each call that finishes in under 500 ms is counted as a cache hit.
/// All calls must succeed and at least 20 of the 50 must count as hits.
///
/// Plain `std::thread` workers and a synchronous `#[test]` are used rather
/// than an async runtime, to avoid runtime nesting issues.
///
/// WARNING: the "<500ms = cache hit" rule is a timing heuristic and is
/// unreliable in CI, where even cached operations may exceed the threshold on
/// slow runners. The test is ignored by default to prevent flaky failures.
#[cfg(feature = "ocr")]
#[ignore = "flaky timing-based cache heuristic - cache hit rates vary significantly across platforms"]
#[test]
fn test_concurrent_ocr_cache_stress() {
    use helpers::{get_test_file_path, skip_if_missing};
    use std::sync::atomic::{AtomicUsize, Ordering};

    const IMAGE: &str = "images/ocr_image.jpg";

    if skip_if_missing(IMAGE) {
        tracing::debug!("Skipping OCR cache stress test: test file not available");
        return;
    }

    let ocr_settings = OcrConfig {
        backend: "tesseract".to_string(),
        language: "eng".to_string(),
        ..Default::default()
    };
    let config = ExtractionConfig {
        ocr: Some(ocr_settings),
        force_ocr: false,
        use_cache: true,
        ..Default::default()
    };

    let image_path = get_test_file_path(IMAGE);

    // Initial run before the threads start.
    let warmup = extract_file_sync(&image_path, None, &config);
    assert!(warmup.is_ok(), "Initial OCR should succeed");

    let fast_calls = Arc::new(AtomicUsize::new(0));

    let workers: Vec<_> = (0..50)
        .map(|_| {
            let path = image_path.clone();
            let worker_config = config.clone();
            let counter = Arc::clone(&fast_calls);

            std::thread::spawn(move || {
                let started = std::time::Instant::now();
                let outcome = extract_file_sync(&path, None, &worker_config);

                // Heuristic: a sub-500ms call is treated as a cache hit.
                if started.elapsed() < Duration::from_millis(500) {
                    counter.fetch_add(1, Ordering::Relaxed);
                }

                outcome
            })
        })
        .collect();

    for worker in workers {
        let outcome = worker.join().expect("Thread should not panic");
        assert!(outcome.is_ok(), "Cached OCR should succeed");
    }

    let hits = fast_calls.load(Ordering::Relaxed);
    assert!(
        hits >= 20,
        "At least 20/50 requests should hit cache, got {} hits",
        hits
    );
}
|
|
|
|
/// Test concurrent pipeline processing.
|
|
///
|
|
/// Validates that:
|
|
/// - Pipeline can process multiple results in parallel
|
|
/// - Processors don't interfere with each other
|
|
/// - Registry reads are thread-safe
|
|
///
|
|
/// Note: This test is flaky due to timing-dependent concurrent operations.
|
|
#[tokio::test]
|
|
#[ignore = "flaky concurrency test - timing dependent on system load"]
|
|
async fn test_concurrent_pipeline_processing() {
|
|
struct ConcurrentTestProcessor;
|
|
|
|
impl Plugin for ConcurrentTestProcessor {
|
|
fn name(&self) -> &str {
|
|
"concurrent-test"
|
|
}
|
|
fn version(&self) -> String {
|
|
"1.0.0".to_string()
|
|
}
|
|
fn initialize(&self) -> Result<()> {
|
|
Ok(())
|
|
}
|
|
fn shutdown(&self) -> Result<()> {
|
|
Ok(())
|
|
}
|
|
}
|
|
|
|
#[async_trait]
|
|
impl PostProcessor for ConcurrentTestProcessor {
|
|
async fn process(&self, result: &mut ExtractionResult, _: &ExtractionConfig) -> Result<()> {
|
|
tokio::time::sleep(Duration::from_millis(10)).await;
|
|
result.content.push_str("[processed]");
|
|
Ok(())
|
|
}
|
|
|
|
fn processing_stage(&self) -> ProcessingStage {
|
|
ProcessingStage::Early
|
|
}
|
|
}
|
|
|
|
let registry = get_post_processor_registry();
|
|
{
|
|
let mut reg = registry.write();
|
|
let processor = Arc::new(ConcurrentTestProcessor);
|
|
let _ = reg.remove("concurrent-test");
|
|
reg.register(processor).expect("Should register processor");
|
|
}
|
|
|
|
let config = ExtractionConfig {
|
|
postprocessor: Some(PostProcessorConfig {
|
|
enabled: true,
|
|
enabled_processors: Some(vec!["concurrent-test".to_string()]),
|
|
disabled_processors: None,
|
|
enabled_set: None,
|
|
disabled_set: None,
|
|
}),
|
|
..Default::default()
|
|
};
|
|
|
|
let mut handles = vec![];
|
|
for i in 0..50 {
|
|
let config = config.clone();
|
|
|
|
handles.push(tokio::spawn(async move {
|
|
let mut doc = InternalDocument::new("text");
|
|
doc.mime_type = "text/plain".to_string();
|
|
doc.elements.push(InternalElement::text(
|
|
ElementKind::Paragraph,
|
|
format!("Content {}", i),
|
|
0,
|
|
));
|
|
|
|
run_pipeline(doc, &config).await
|
|
}));
|
|
}
|
|
|
|
for handle in handles {
|
|
let result = handle.await.expect("Task should not panic");
|
|
assert!(result.is_ok(), "Pipeline should succeed");
|
|
let processed = result.expect("Operation failed");
|
|
assert!(processed.content.contains("[processed]"), "Processor should run");
|
|
}
|
|
|
|
{
|
|
let mut reg = registry.write();
|
|
let _ = reg.remove("concurrent-test");
|
|
}
|
|
}
|
|
|
|
/// Test concurrent registry reads don't block unnecessarily.
|
|
///
|
|
/// Validates that:
|
|
/// - Multiple readers can access registry simultaneously
|
|
/// - Registry lookups are fast under concurrent load
|
|
#[tokio::test]
|
|
async fn test_concurrent_registry_reads() {
|
|
let registry = get_document_extractor_registry();
|
|
|
|
let mut handles = vec![];
|
|
for _ in 0..200 {
|
|
let registry_clone = Arc::clone(®istry);
|
|
handles.push(tokio::spawn(async move {
|
|
let start = std::time::Instant::now();
|
|
|
|
let reg = registry_clone.read();
|
|
let _extractor = reg.get("text/plain");
|
|
|
|
start.elapsed()
|
|
}));
|
|
}
|
|
|
|
let mut max_duration = Duration::from_secs(0);
|
|
for handle in handles {
|
|
let duration = handle.await.expect("Task should not panic");
|
|
if duration > max_duration {
|
|
max_duration = duration;
|
|
}
|
|
}
|
|
|
|
assert!(
|
|
max_duration < Duration::from_millis(10),
|
|
"Registry reads should be fast, max duration: {:?}",
|
|
max_duration
|
|
);
|
|
}
|
|
|
|
/// Test that extraction throughput scales with concurrency.
|
|
///
|
|
/// Validates that:
|
|
/// - Parallel extractions are actually running in parallel
|
|
/// - No global bottlenecks limiting throughput
|
|
///
|
|
/// Note: This is a performance benchmark that can be flaky based on system load,
|
|
/// CPU availability, and other factors. Marked as #[ignore] to run only on demand.
|
|
#[tokio::test]
|
|
#[ignore]
|
|
async fn test_extraction_throughput_scales() {
|
|
let config = ExtractionConfig::default();
|
|
let test_data = b"Throughput test content";
|
|
|
|
let sequential_start = std::time::Instant::now();
|
|
for _ in 0..20 {
|
|
let _ = extract_bytes(test_data, "text/plain", &config)
|
|
.await
|
|
.expect("Async operation failed");
|
|
}
|
|
let sequential_duration = sequential_start.elapsed();
|
|
|
|
let parallel_start = std::time::Instant::now();
|
|
let mut handles = vec![];
|
|
for _ in 0..20 {
|
|
let config = config.clone();
|
|
let data = test_data.to_vec();
|
|
|
|
handles.push(tokio::spawn(async move {
|
|
extract_bytes(&data, "text/plain", &config).await
|
|
}));
|
|
}
|
|
|
|
for handle in handles {
|
|
let _ = handle.await.expect("Task should not panic");
|
|
}
|
|
let parallel_duration = parallel_start.elapsed();
|
|
|
|
println!(
|
|
"Sequential: {:?}, Parallel: {:?}, Speedup: {:.2}x",
|
|
sequential_duration,
|
|
parallel_duration,
|
|
sequential_duration.as_secs_f64() / parallel_duration.as_secs_f64()
|
|
);
|
|
|
|
let speedup = sequential_duration.as_secs_f64() / parallel_duration.as_secs_f64();
|
|
|
|
assert!(
|
|
speedup > 0.5,
|
|
"Parallel execution should not be significantly slower than sequential. Sequential: {:?}, Parallel: {:?}, Speedup: {:.2}x",
|
|
sequential_duration,
|
|
parallel_duration,
|
|
speedup
|
|
);
|
|
}
|
|
|
|
/// High-load stress test with many concurrent operations.
|
|
///
|
|
/// Validates system stability under sustained concurrent load.
|
|
#[tokio::test]
|
|
async fn test_high_concurrency_stress() {
|
|
let config = ExtractionConfig {
|
|
use_cache: true,
|
|
..Default::default()
|
|
};
|
|
|
|
#[allow(unused_mut)]
|
|
let mut formats = vec![
|
|
(b"Text content" as &[u8], "text/plain"),
|
|
(b"{\"json\": true}", "application/json"),
|
|
(b"# Markdown\n\nContent", "text/markdown"),
|
|
];
|
|
|
|
#[cfg(feature = "xml")]
|
|
formats.push((b"<xml><item>content</item></xml>" as &[u8], "application/xml"));
|
|
|
|
let mut handles = vec![];
|
|
for _ in 0..100 {
|
|
for (data, mime_type) in &formats {
|
|
let config = config.clone();
|
|
let data = data.to_vec();
|
|
let mime_type = mime_type.to_string();
|
|
|
|
handles.push(tokio::spawn(
|
|
async move { extract_bytes(&data, &mime_type, &config).await },
|
|
));
|
|
}
|
|
}
|
|
|
|
let results = timeout(Duration::from_secs(60), async {
|
|
let mut results = vec![];
|
|
for handle in handles {
|
|
results.push(handle.await.expect("Task should not panic"));
|
|
}
|
|
results
|
|
})
|
|
.await
|
|
.expect("High-load stress test should complete within 60s");
|
|
|
|
let expected_successes = 100 * formats.len();
|
|
let success_count = results.iter().filter(|r| r.is_ok()).count();
|
|
assert_eq!(
|
|
success_count, expected_successes,
|
|
"All extractions should succeed under stress, got {} successes",
|
|
success_count
|
|
);
|
|
}
|