// fil/crates/kreuzberg/tests/concurrency_stress.rs

//! Comprehensive concurrency and parallelism stress tests.
//!
//! Validates that the Kreuzberg core handles concurrent operations correctly:
//! - Parallel extractions don't interfere with each other
//! - OCR processing is thread-safe and efficient
//! - Pipeline processing works correctly under concurrent load
//! - Cache access is safe with multiple readers/writers
//! - Registry access is thread-safe
//!
//! These tests ensure production workloads with high concurrency work correctly.

use async_trait::async_trait;
use kreuzberg::Result;
use kreuzberg::core::config::{ExtractionConfig, PostProcessorConfig};
use kreuzberg::core::extractor::{batch_extract_bytes, extract_bytes};
use kreuzberg::core::pipeline::run_pipeline;
use kreuzberg::internal::{ElementKind, InternalDocument, InternalElement};
use kreuzberg::plugins::registry::{get_document_extractor_registry, get_post_processor_registry};
use kreuzberg::plugins::{Plugin, PostProcessor, ProcessingStage};
use kreuzberg::types::ExtractionResult;
use std::sync::Arc;

#[cfg(feature = "ocr")]
use kreuzberg::core::config::OcrConfig;

#[cfg(feature = "ocr")]
use kreuzberg::core::extractor::extract_file_sync;
use std::time::Duration;
use tokio::time::timeout;

mod helpers;

/// Strips every trailing `\n` and `\r` character from `value`.
///
/// Leading and interior line breaks are left untouched, as is any other
/// trailing whitespace. The returned slice borrows from `value`.
fn trim_trailing_newlines(value: &str) -> &str {
    let is_line_break = |ch: char| ch == '\n' || ch == '\r';
    value.trim_end_matches(is_line_break)
}

/// Asserts that `actual` matches `expected` once trailing line breaks have
/// been removed from `actual`.
///
/// Only `actual` is normalized; `expected` is compared exactly as given.
///
/// # Panics
///
/// Panics when the trimmed `actual` differs from `expected`.
fn assert_text_content(actual: &str, expected: &str) {
    let normalized = trim_trailing_newlines(actual);
    assert_eq!(
        normalized, expected,
        "Content mismatch after trimming trailing newlines"
    );
}

/// Runs many extractions across several MIME types at the same time.
///
/// Ten rounds of every sample document are spawned as independent Tokio
/// tasks. The test passes when:
/// - no task panics,
/// - all tasks finish within 30 seconds, and
/// - every extraction returns `Ok`.
#[tokio::test]
async fn test_concurrent_extractions_mixed_formats() {
    const ROUNDS: usize = 10;

    let config = ExtractionConfig::default();

    // `mut` is only exercised when the `xml` feature contributes an extra sample.
    #[allow(unused_mut)]
    let mut test_cases = vec![
        (b"Plain text content" as &[u8], "text/plain"),
        (b"{\"key\": \"value\"}", "application/json"),
        (b"# Markdown\n\nContent here", "text/markdown"),
    ];

    #[cfg(feature = "xml")]
    test_cases.push((b"<root><item>XML content</item></root>" as &[u8], "application/xml"));

    let mut handles = Vec::with_capacity(ROUNDS * test_cases.len());
    for _ in 0..ROUNDS {
        for &(data, mime_type) in &test_cases {
            // Every task owns its inputs so it can outlive this loop iteration.
            let config = config.clone();
            let data = data.to_vec();
            let mime_type = mime_type.to_string();

            handles.push(tokio::spawn(async move {
                extract_bytes(&data, &mime_type, &config).await
            }));
        }
    }

    // Join all tasks under a single deadline so a hang fails the test instead of stalling it.
    let join_all = async {
        let mut outcomes = Vec::with_capacity(handles.len());
        for handle in handles {
            outcomes.push(handle.await.expect("Task should not panic"));
        }
        outcomes
    };
    let outcomes = timeout(Duration::from_secs(30), join_all)
        .await
        .expect("All extractions should complete within 30s");

    for outcome in outcomes {
        assert!(
            outcome.is_ok(),
            "Concurrent extraction should succeed: {:?}",
            outcome.err()
        );
    }
}

/// Test concurrent batch extractions.
///
/// Five tasks each submit the same 20 plain-text documents to
/// `batch_extract_bytes` at the same time. Every batch must succeed and
/// return exactly one result per submitted document.
#[tokio::test]
async fn test_concurrent_batch_extractions() {
    const BATCH_SIZE: usize = 20;
    const CONCURRENT_BATCHES: usize = 5;

    let config = ExtractionConfig::default();

    let contents: Vec<Vec<u8>> = (0..BATCH_SIZE)
        .map(|i| format!("Content {}", i).into_bytes())
        .collect();

    let mut handles = Vec::with_capacity(CONCURRENT_BATCHES);
    for _ in 0..CONCURRENT_BATCHES {
        let config = config.clone();
        // One copy per task; the buffers are then moved (not re-copied) into the batch items.
        let contents = contents.clone();

        handles.push(tokio::spawn(async move {
            let items: Vec<kreuzberg::BatchBytesItem> = contents
                .into_iter()
                .map(|content| kreuzberg::BatchBytesItem {
                    content,
                    mime_type: "text/plain".to_string(),
                    config: None,
                })
                .collect();
            batch_extract_bytes(items, &config).await
        }));
    }

    for handle in handles {
        // `expect` on the extraction result keeps the underlying error in the failure output.
        let results = handle
            .await
            .expect("Task should not panic")
            .expect("Batch extraction should succeed");
        assert_eq!(results.len(), BATCH_SIZE, "Should return all results");
    }
}

/// Exercises concurrent extractions of identical input with `use_cache` enabled.
///
/// One extraction runs up front, then 100 tasks extract the same bytes
/// concurrently. Each task must succeed and return the original text
/// (ignoring trailing line breaks). Post-processing is switched off in the
/// configuration used here.
#[tokio::test]
async fn test_concurrent_extractions_with_cache() {
    let postprocessor_off = PostProcessorConfig {
        enabled: false,
        enabled_processors: None,
        disabled_processors: None,
        enabled_set: None,
        disabled_set: None,
    };
    let config = ExtractionConfig {
        use_cache: true,
        postprocessor: Some(postprocessor_off),
        ..Default::default()
    };

    let test_data = b"Cached content for concurrent access test";
    let expected_content = "Cached content for concurrent access test";

    // Initial extraction before the concurrent phase starts.
    let _ = extract_bytes(test_data, "text/plain", &config)
        .await
        .expect("Async operation failed");

    let readers: Vec<_> = (0..100)
        .map(|_| {
            let config = config.clone();
            let data = test_data.to_vec();
            tokio::spawn(async move { extract_bytes(&data, "text/plain", &config).await })
        })
        .collect();

    for reader in readers {
        let outcome = reader.await.expect("Task should not panic");
        assert!(outcome.is_ok(), "Cache read should succeed");
        let extraction = outcome.expect("Operation failed");
        assert_text_content(&extraction.content, expected_content);
    }
}

/// Runs the same OCR extraction from 20 blocking tasks at once.
///
/// The test passes when:
/// - all tasks finish within 60 seconds without panicking,
/// - every extraction succeeds with non-empty content, and
/// - every task produced exactly the same text as the first one.
///
/// Returns early on Windows and when the sample image is not available.
#[cfg(feature = "ocr")]
#[tokio::test]
async fn test_concurrent_ocr_processing() {
    use helpers::{get_test_file_path, skip_if_missing};

    const IMAGE: &str = "images/ocr_image.jpg";

    if cfg!(windows) {
        return;
    }

    if skip_if_missing(IMAGE) {
        tracing::debug!("Skipping concurrent OCR test: test file not available");
        return;
    }

    let ocr = OcrConfig {
        backend: "tesseract".to_string(),
        language: "eng".to_string(),
        ..Default::default()
    };
    let config = ExtractionConfig {
        ocr: Some(ocr),
        force_ocr: false,
        use_cache: true,
        ..Default::default()
    };

    let file_path = get_test_file_path(IMAGE);

    // The synchronous extractor is driven from the blocking pool.
    let handles: Vec<_> = (0..20)
        .map(|_| {
            let file_path = file_path.clone();
            let config = config.clone();
            tokio::task::spawn_blocking(move || extract_file_sync(&file_path, None, &config))
        })
        .collect();

    let join_all = async {
        let mut outcomes = Vec::with_capacity(handles.len());
        for handle in handles {
            outcomes.push(handle.await.expect("Task should not panic"));
        }
        outcomes
    };
    let outcomes = timeout(Duration::from_secs(60), join_all)
        .await
        .expect("All OCR operations should complete within 60s");

    let extracted_texts: Vec<_> = outcomes
        .into_iter()
        .map(|outcome| {
            assert!(outcome.is_ok(), "OCR should succeed: {:?}", outcome.err());
            let extraction = outcome.expect("Operation failed");
            assert!(!extraction.content.is_empty(), "OCR should extract text");
            extraction.content
        })
        .collect();

    // Every run is compared against the first one.
    let first_text = &extracted_texts[0];
    for text in extracted_texts.iter().skip(1) {
        assert_eq!(text, first_text, "Concurrent OCR should produce identical results");
    }
}

/// Stress-tests repeated OCR extraction of one image from 50 OS threads.
///
/// A single extraction runs first; the 50 threads then repeat it concurrently
/// with `use_cache` enabled. Every extraction must succeed, and at least 20
/// of them must finish in under 500ms, which this test treats as a cache hit.
///
/// Note: plain threads and the synchronous extractor are used here to avoid
/// runtime nesting issues.
///
/// WARNING: The <500ms heuristic is unreliable in CI, where even cached
/// operations may exceed the threshold on slow runners. The test is ignored
/// to prevent flaky failures; cache hit rates vary significantly across
/// platforms.
#[cfg(feature = "ocr")]
#[ignore = "flaky timing-based cache heuristic - cache hit rates vary significantly across platforms"]
#[test]
fn test_concurrent_ocr_cache_stress() {
    use helpers::{get_test_file_path, skip_if_missing};
    use std::sync::atomic::{AtomicUsize, Ordering};

    const IMAGE: &str = "images/ocr_image.jpg";

    if skip_if_missing(IMAGE) {
        tracing::debug!("Skipping OCR cache stress test: test file not available");
        return;
    }

    let ocr = OcrConfig {
        backend: "tesseract".to_string(),
        language: "eng".to_string(),
        ..Default::default()
    };
    let config = ExtractionConfig {
        ocr: Some(ocr),
        force_ocr: false,
        use_cache: true,
        ..Default::default()
    };

    let file_path = get_test_file_path(IMAGE);

    // Run once up front, before the concurrent phase.
    let first_result = extract_file_sync(&file_path, None, &config);
    assert!(first_result.is_ok(), "Initial OCR should succeed");

    let cache_hit_count = Arc::new(AtomicUsize::new(0));

    let workers: Vec<_> = (0..50)
        .map(|_| {
            let file_path = file_path.clone();
            let config = config.clone();
            let hit_count = Arc::clone(&cache_hit_count);

            std::thread::spawn(move || {
                let started = std::time::Instant::now();
                let outcome = extract_file_sync(&file_path, None, &config);
                let elapsed = started.elapsed();

                // Fast completions are counted as cache hits.
                if elapsed < Duration::from_millis(500) {
                    hit_count.fetch_add(1, Ordering::Relaxed);
                }

                outcome
            })
        })
        .collect();

    for worker in workers {
        let outcome = worker.join().expect("Thread should not panic");
        assert!(outcome.is_ok(), "Cached OCR should succeed");
    }

    let hits = cache_hit_count.load(Ordering::Relaxed);
    assert!(
        hits >= 20,
        "At least 20/50 requests should hit cache, got {} hits",
        hits
    );
}

/// Test concurrent pipeline processing.
///
/// Registers a post-processor that sleeps for 10ms and appends `[processed]`
/// to the content, then runs 50 single-paragraph documents through
/// `run_pipeline` concurrently. Every run must succeed and contain the marker.
///
/// The processor lives in the global post-processor registry. It is removed
/// again by a drop guard, so a failing assertion cannot leave it registered.
///
/// Note: This test is flaky due to timing-dependent concurrent operations.
#[tokio::test]
#[ignore = "flaky concurrency test - timing dependent on system load"]
async fn test_concurrent_pipeline_processing() {
    const PROCESSOR_NAME: &str = "concurrent-test";

    struct ConcurrentTestProcessor;

    impl Plugin for ConcurrentTestProcessor {
        fn name(&self) -> &str {
            PROCESSOR_NAME
        }
        fn version(&self) -> String {
            "1.0.0".to_string()
        }
        fn initialize(&self) -> Result<()> {
            Ok(())
        }
        fn shutdown(&self) -> Result<()> {
            Ok(())
        }
    }

    #[async_trait]
    impl PostProcessor for ConcurrentTestProcessor {
        async fn process(&self, result: &mut ExtractionResult, _: &ExtractionConfig) -> Result<()> {
            // Yield for a while so concurrently running pipelines overlap.
            tokio::time::sleep(Duration::from_millis(10)).await;
            result.content.push_str("[processed]");
            Ok(())
        }

        fn processing_stage(&self) -> ProcessingStage {
            ProcessingStage::Early
        }
    }

    /// Unregisters the test processor when dropped, including during unwinding
    /// after a failed assertion.
    struct UnregisterOnDrop;

    impl Drop for UnregisterOnDrop {
        fn drop(&mut self) {
            let registry = get_post_processor_registry();
            let mut reg = registry.write();
            let _ = reg.remove(PROCESSOR_NAME);
        }
    }

    {
        let registry = get_post_processor_registry();
        let mut reg = registry.write();
        let processor = Arc::new(ConcurrentTestProcessor);
        // Clear any leftover registration from an earlier run before registering.
        let _ = reg.remove(PROCESSOR_NAME);
        reg.register(processor).expect("Should register processor");
    }
    // Armed only after registration succeeded; runs on every exit path from here on.
    let _cleanup = UnregisterOnDrop;

    let config = ExtractionConfig {
        postprocessor: Some(PostProcessorConfig {
            enabled: true,
            enabled_processors: Some(vec![PROCESSOR_NAME.to_string()]),
            disabled_processors: None,
            enabled_set: None,
            disabled_set: None,
        }),
        ..Default::default()
    };

    let mut handles = Vec::with_capacity(50);
    for i in 0..50 {
        let config = config.clone();

        handles.push(tokio::spawn(async move {
            let mut doc = InternalDocument::new("text");
            doc.mime_type = "text/plain".to_string();
            doc.elements.push(InternalElement::text(
                ElementKind::Paragraph,
                format!("Content {}", i),
                0,
            ));

            run_pipeline(doc, &config).await
        }));
    }

    for handle in handles {
        // `expect` on the pipeline result keeps the underlying error in the failure output.
        let processed = handle
            .await
            .expect("Task should not panic")
            .expect("Pipeline should succeed");
        assert!(processed.content.contains("[processed]"), "Processor should run");
    }
}

/// Checks that concurrent read access to the document extractor registry stays fast.
///
/// 200 tasks each take a read guard on the registry and look up `text/plain`,
/// timing the operation. The slowest observed lookup must finish in under 10ms.
#[tokio::test]
async fn test_concurrent_registry_reads() {
    let registry = get_document_extractor_registry();

    let readers: Vec<_> = (0..200)
        .map(|_| {
            let registry_clone = Arc::clone(&registry);
            tokio::spawn(async move {
                let started = std::time::Instant::now();

                // The guard is kept in a named binding so it outlives the lookup.
                let guard = registry_clone.read();
                let _extractor = guard.get("text/plain");

                started.elapsed()
            })
        })
        .collect();

    let mut max_duration = Duration::ZERO;
    for reader in readers {
        let elapsed = reader.await.expect("Task should not panic");
        max_duration = max_duration.max(elapsed);
    }

    assert!(
        max_duration < Duration::from_millis(10),
        "Registry reads should be fast, max duration: {:?}",
        max_duration
    );
}

/// Test that extraction throughput scales with concurrency.
///
/// Times 20 sequential extractions, then 20 extractions spawned as concurrent
/// tasks, and requires the concurrent run to be no worse than half the
/// sequential speed (speedup > 0.5). All extractions in both phases must
/// succeed, so a failing extraction cannot be counted as a fast one.
///
/// Note: This is a performance benchmark that can be flaky based on system load,
/// CPU availability, and other factors. Marked as #[ignore] to run only on demand.
#[tokio::test]
#[ignore = "performance benchmark - flaky under system load, run on demand"]
async fn test_extraction_throughput_scales() {
    const ITERATIONS: usize = 20;

    let config = ExtractionConfig::default();
    let test_data = b"Throughput test content";

    let sequential_start = std::time::Instant::now();
    for _ in 0..ITERATIONS {
        let _ = extract_bytes(test_data, "text/plain", &config)
            .await
            .expect("Async operation failed");
    }
    let sequential_duration = sequential_start.elapsed();

    let parallel_start = std::time::Instant::now();
    let mut handles = Vec::with_capacity(ITERATIONS);
    for _ in 0..ITERATIONS {
        let config = config.clone();
        let data = test_data.to_vec();

        handles.push(tokio::spawn(async move {
            extract_bytes(&data, "text/plain", &config).await
        }));
    }

    let mut parallel_results = Vec::with_capacity(ITERATIONS);
    for handle in handles {
        parallel_results.push(handle.await.expect("Task should not panic"));
    }
    let parallel_duration = parallel_start.elapsed();

    // Validated after the clock stops so the checks are not part of the measurement.
    for result in parallel_results {
        assert!(
            result.is_ok(),
            "Parallel extraction should succeed: {:?}",
            result.err()
        );
    }

    let speedup = sequential_duration.as_secs_f64() / parallel_duration.as_secs_f64();

    println!(
        "Sequential: {:?}, Parallel: {:?}, Speedup: {:.2}x",
        sequential_duration, parallel_duration, speedup
    );

    assert!(
        speedup > 0.5,
        "Parallel execution should not be significantly slower than sequential. Sequential: {:?}, Parallel: {:?}, Speedup: {:.2}x",
        sequential_duration,
        parallel_duration,
        speedup
    );
}

/// High-load stress test with many concurrent operations.
///
/// Spawns 100 rounds of every sample format as independent tasks with
/// `use_cache` enabled. All tasks must finish within 60 seconds without
/// panicking, and every single extraction must return `Ok`.
#[tokio::test]
async fn test_high_concurrency_stress() {
    const ROUNDS: usize = 100;

    let config = ExtractionConfig {
        use_cache: true,
        ..Default::default()
    };

    // `mut` is only exercised when the `xml` feature contributes an extra sample.
    #[allow(unused_mut)]
    let mut formats = vec![
        (b"Text content" as &[u8], "text/plain"),
        (b"{\"json\": true}", "application/json"),
        (b"# Markdown\n\nContent", "text/markdown"),
    ];

    #[cfg(feature = "xml")]
    formats.push((b"<xml><item>content</item></xml>" as &[u8], "application/xml"));

    let expected_successes = ROUNDS * formats.len();

    let mut handles = Vec::with_capacity(expected_successes);
    for _ in 0..ROUNDS {
        for &(data, mime_type) in &formats {
            let config = config.clone();
            let data = data.to_vec();
            let mime_type = mime_type.to_string();

            handles.push(tokio::spawn(async move {
                extract_bytes(&data, &mime_type, &config).await
            }));
        }
    }

    // Join all tasks under a single deadline so a hang fails the test instead of stalling it.
    let join_all = async {
        let mut outcomes = Vec::with_capacity(handles.len());
        for handle in handles {
            outcomes.push(handle.await.expect("Task should not panic"));
        }
        outcomes
    };
    let outcomes = timeout(Duration::from_secs(60), join_all)
        .await
        .expect("High-load stress test should complete within 60s");

    let success_count = outcomes.iter().filter(|outcome| outcome.is_ok()).count();
    assert_eq!(
        success_count, expected_successes,
        "All extractions should succeed under stress, got {} successes",
        success_count
    );
}