//! Configuration features integration tests. //! //! Tests for chunking, language detection, caching, token reduction, and quality processing. //! Validates that configuration options work correctly end-to-end. #[cfg(feature = "chunking")] use kreuzberg::core::config::ChunkingConfig; use kreuzberg::core::config::ExtractionConfig; #[cfg(feature = "language-detection")] use kreuzberg::core::config::LanguageDetectionConfig; use kreuzberg::core::config::TokenReductionOptions; use kreuzberg::core::extractor::extract_bytes; mod helpers; /// Test chunking enabled - text split into chunks. #[tokio::test] #[cfg(feature = "chunking")] async fn test_chunking_enabled() { let config = ExtractionConfig { chunking: Some(ChunkingConfig { max_characters: 50, overlap: 10, ..Default::default() }), ..Default::default() }; let text = "This is a long text that should be split into multiple chunks. ".repeat(10); let text_bytes = text.as_bytes(); let result = extract_bytes(text_bytes, "text/plain", &config) .await .expect("Should extract successfully"); assert!(result.chunks.is_some(), "Chunks should be present"); let chunks = result.chunks.expect("Operation failed"); assert!(chunks.len() > 1, "Should have multiple chunks"); for chunk in &chunks { assert!(!chunk.content.is_empty(), "Chunk should not be empty"); assert!( chunk.content.len() <= 50 + 10, "Chunk length {} exceeds max_chars + overlap", chunk.content.len() ); } } /// Test chunking with overlap - overlap preserved between chunks. 
#[tokio::test] #[cfg(feature = "chunking")] async fn test_chunking_with_overlap() { let config = ExtractionConfig { chunking: Some(ChunkingConfig { max_characters: 100, overlap: 20, ..Default::default() }), ..Default::default() }; let text = "a".repeat(250); let text_bytes = text.as_bytes(); let result = extract_bytes(text_bytes, "text/plain", &config) .await .expect("Should extract successfully"); assert!(result.chunks.is_some(), "Chunks should be present"); let chunks = result.chunks.expect("Operation failed"); assert!(chunks.len() >= 2, "Should have at least 2 chunks"); if chunks.len() >= 2 { let chunk1 = &chunks[0]; let chunk2 = &chunks[1]; let chunk1_end = &chunk1.content[chunk1.content.len().saturating_sub(20)..]; assert!( chunk2.content.starts_with(chunk1_end) || chunk1_end.starts_with(&chunk2.content[..chunk1_end.len().min(chunk2.content.len())]), "Chunks should have overlap" ); } } /// Test chunking with custom sizes - custom chunk size and overlap. #[tokio::test] #[cfg(feature = "chunking")] async fn test_chunking_custom_sizes() { let config = ExtractionConfig { chunking: Some(ChunkingConfig { max_characters: 200, overlap: 50, ..Default::default() }), ..Default::default() }; let text = "Custom chunk test. ".repeat(50); let text_bytes = text.as_bytes(); let result = extract_bytes(text_bytes, "text/plain", &config) .await .expect("Should extract successfully"); assert!(result.chunks.is_some(), "Chunks should be present"); let chunks = result.chunks.expect("Operation failed"); assert!(!chunks.is_empty(), "Should have at least 1 chunk"); for chunk in &chunks { assert!( chunk.content.len() <= 200 + 50, "Chunk length {} exceeds custom max_chars + overlap", chunk.content.len() ); } } /// Test chunking disabled - no chunking when disabled. #[tokio::test] async fn test_chunking_disabled() { let config = ExtractionConfig { chunking: None, ..Default::default() }; let text = "This is a long text that should NOT be split into chunks. 
".repeat(10); let text_bytes = text.as_bytes(); let result = extract_bytes(text_bytes, "text/plain", &config) .await .expect("Should extract successfully"); assert!(result.chunks.is_none(), "Should not have chunks when chunking disabled"); assert!(!result.content.is_empty(), "Content should be extracted"); assert!(result.content.contains("long text"), "Should contain original text"); } /// Test language detection for single language document. #[tokio::test] #[cfg(feature = "language-detection")] async fn test_language_detection_single() { let config = ExtractionConfig { language_detection: Some(LanguageDetectionConfig { enabled: true, min_confidence: 0.8, detect_multiple: false, }), ..Default::default() }; let text = "Hello world! This is English text. It should be detected as English language."; let text_bytes = text.as_bytes(); let result = extract_bytes(text_bytes, "text/plain", &config) .await .expect("Should extract successfully"); assert!(result.detected_languages.is_some(), "Should detect language"); let languages = result.detected_languages.expect("Operation failed"); assert!(!languages.is_empty(), "Should detect at least one language"); assert_eq!(languages[0], "eng", "Should detect English"); } /// Test language detection for multi-language document. #[cfg_attr(coverage, ignore = "coverage instrumentation affects multi-language heuristics")] #[tokio::test] #[cfg(feature = "language-detection")] async fn test_language_detection_multiple() { let config = ExtractionConfig { language_detection: Some(LanguageDetectionConfig { enabled: true, min_confidence: 0.7, detect_multiple: true, }), ..Default::default() }; let text = "Hello world! This is English. ".repeat(10) + "Hola mundo! Este es espaƱol. 
".repeat(10).as_str(); let text_bytes = text.as_bytes(); let result = extract_bytes(text_bytes, "text/plain", &config) .await .expect("Should extract successfully"); assert!(result.detected_languages.is_some(), "Should detect languages"); let languages = result.detected_languages.expect("Operation failed"); assert!(!languages.is_empty(), "Should detect at least one language"); } /// Test language detection with confidence threshold. #[tokio::test] #[cfg(feature = "language-detection")] async fn test_language_detection_confidence() { let config = ExtractionConfig { language_detection: Some(LanguageDetectionConfig { enabled: true, min_confidence: 0.9, detect_multiple: false, }), ..Default::default() }; let text = "This is clear English text that should have high confidence."; let text_bytes = text.as_bytes(); let result = extract_bytes(text_bytes, "text/plain", &config) .await .expect("Should extract successfully"); if let Some(languages) = result.detected_languages { assert!(!languages.is_empty()); } } /// Test language detection disabled. #[tokio::test] #[cfg(feature = "language-detection")] async fn test_language_detection_disabled() { let config = ExtractionConfig { language_detection: Some(LanguageDetectionConfig { enabled: false, min_confidence: 0.8, detect_multiple: false, }), ..Default::default() }; let text = "Hello world! This is English text."; let text_bytes = text.as_bytes(); let result = extract_bytes(text_bytes, "text/plain", &config) .await .expect("Should extract successfully"); assert!( result.detected_languages.is_none(), "Should not detect language when disabled" ); } /// Test cache hit behavior - second extraction from cache. 
#[tokio::test]
async fn test_cache_hit_behavior() {
    // Extract the same bytes twice with caching on; both runs must agree on content.
    let cached_config = ExtractionConfig {
        use_cache: true,
        ..Default::default()
    };
    let payload = "Test text for caching behavior.".as_bytes();

    let first = extract_bytes(payload, "text/plain", &cached_config)
        .await
        .expect("First extraction should succeed");
    let second = extract_bytes(payload, "text/plain", &cached_config)
        .await
        .expect("Second extraction should succeed");

    assert_eq!(first.content, second.content);
}

/// Cache miss: two different inputs extracted with caching on must yield different content.
#[tokio::test]
async fn test_cache_miss_invalidation() {
    let cached_config = ExtractionConfig {
        use_cache: true,
        ..Default::default()
    };
    let first_input = "First text for cache test.".as_bytes();
    let second_input = "Second different text.".as_bytes();

    let first = extract_bytes(first_input, "text/plain", &cached_config)
        .await
        .expect("First extraction should succeed");
    let second = extract_bytes(second_input, "text/plain", &cached_config)
        .await
        .expect("Second extraction should succeed");

    assert_ne!(first.content, second.content);
}

/// Custom cache directory (Note: OCR cache uses hardcoded directory).
///
/// Only `use_cache` is set here; the test checks that extraction still returns content.
#[tokio::test]
async fn test_custom_cache_directory() {
    let cached_config = ExtractionConfig {
        use_cache: true,
        ..Default::default()
    };
    let payload = "Test text for cache directory test.".as_bytes();

    let extraction = extract_bytes(payload, "text/plain", &cached_config)
        .await
        .expect("Should extract successfully");

    assert!(!extraction.content.is_empty());
}

/// Test cache disabled - bypass cache.
#[tokio::test]
async fn test_cache_disabled() {
    // With caching off, extracting the same bytes twice must still agree on content.
    let uncached_config = ExtractionConfig {
        use_cache: false,
        ..Default::default()
    };
    let payload = "Test text without caching.".as_bytes();

    let first = extract_bytes(payload, "text/plain", &uncached_config)
        .await
        .expect("First extraction should succeed");
    let second = extract_bytes(payload, "text/plain", &uncached_config)
        .await
        .expect("Second extraction should succeed");

    assert_eq!(first.content, second.content);
}

/// Token reduction with mode `"aggressive"`: extraction must still return non-empty content.
#[tokio::test]
async fn test_token_reduction_aggressive() {
    let reduction = TokenReductionOptions {
        mode: String::from("aggressive"),
        preserve_important_words: true,
    };
    let config = ExtractionConfig {
        token_reduction: Some(reduction),
        ..Default::default()
    };

    let input = "This is a very long sentence with many unnecessary words that could be reduced. ".repeat(5);

    let extraction = extract_bytes(input.as_bytes(), "text/plain", &config)
        .await
        .expect("Should extract successfully");

    assert!(!extraction.content.is_empty());
}

/// Token reduction in conservative mode (configured here as mode `"light"`): extraction
/// must still return non-empty content.
#[tokio::test]
async fn test_token_reduction_conservative() {
    let reduction = TokenReductionOptions {
        mode: String::from("light"),
        preserve_important_words: true,
    };
    let config = ExtractionConfig {
        token_reduction: Some(reduction),
        ..Default::default()
    };

    let payload = "Conservative token reduction test with moderate text length.".as_bytes();

    let extraction = extract_bytes(payload, "text/plain", &config)
        .await
        .expect("Should extract successfully");

    assert!(!extraction.content.is_empty());
}

/// Test token reduction disabled.
#[tokio::test] async fn test_token_reduction_disabled() { let config = ExtractionConfig { token_reduction: Some(TokenReductionOptions { mode: "off".to_string(), preserve_important_words: false, }), ..Default::default() }; let text = "Text without token reduction applied."; let text_bytes = text.as_bytes(); let result = extract_bytes(text_bytes, "text/plain", &config) .await .expect("Should extract successfully"); assert!(result.content.contains("without token reduction")); } /// Test quality processing enabled - quality scoring applied. #[tokio::test] #[cfg(feature = "quality")] async fn test_quality_processing_enabled() { let config = ExtractionConfig { enable_quality_processing: true, ..Default::default() }; let text = "This is well-structured text. It has multiple sentences. And proper punctuation."; let text_bytes = text.as_bytes(); let result = extract_bytes(text_bytes, "text/plain", &config) .await .expect("Should extract successfully"); if let Some(score_value) = result.quality_score { assert!((0.0..=1.0).contains(&score_value)); } assert!(!result.content.is_empty()); } /// Test quality processing calculates score for different text quality. #[tokio::test] #[cfg(feature = "quality")] async fn test_quality_threshold_filtering() { let config = ExtractionConfig { enable_quality_processing: true, ..Default::default() }; let high_quality = "This is a well-structured document. It has proper sentences. And good formatting."; let result_high = extract_bytes(high_quality.as_bytes(), "text/plain", &config) .await .expect("Should extract successfully"); let low_quality = "a b c d ....... word123mixed . . 
"; let result_low = extract_bytes(low_quality.as_bytes(), "text/plain", &config) .await .expect("Should extract successfully"); assert!(result_high.quality_score.is_some(), "High quality should have score"); assert!(result_low.quality_score.is_some(), "Low quality should have score"); let score_high = result_high.quality_score.expect("High quality should have score"); let score_low = result_low.quality_score.expect("Low quality should have score"); assert!((0.0..=1.0).contains(&score_high)); assert!((0.0..=1.0).contains(&score_low)); } /// Test quality processing disabled. #[tokio::test] async fn test_quality_processing_disabled() { let config = ExtractionConfig { enable_quality_processing: false, ..Default::default() }; let text = "Text without quality processing."; let text_bytes = text.as_bytes(); let result = extract_bytes(text_bytes, "text/plain", &config) .await .expect("Should extract successfully"); assert!(result.quality_score.is_none()); assert!(!result.content.is_empty()); } /// Test markdown chunker populates heading context. #[tokio::test] #[cfg(feature = "chunking")] async fn test_markdown_chunker_heading_context() { let markdown = r#"# Title Some intro text. ## Section One Content in section one with enough text to create a chunk. ## Section Two Content in section two with enough text to create another chunk. ### Subsection More detailed content here in the subsection. 
"#; let config = ExtractionConfig { chunking: Some(ChunkingConfig { max_characters: 80, overlap: 10, chunker_type: kreuzberg::ChunkerType::Markdown, ..Default::default() }), ..Default::default() }; let result = extract_bytes(markdown.as_bytes(), "text/markdown", &config) .await .expect("Should extract successfully"); assert!(result.chunks.is_some(), "Chunks should be present"); let chunks = result.chunks.expect("Should have chunks"); assert!(chunks.len() >= 2, "Should have at least 2 chunks"); // At least one chunk should have heading_context populated let has_heading = chunks.iter().any(|c| c.metadata.heading_context.is_some()); assert!(has_heading, "At least one chunk should have heading_context"); // Verify heading context structure for chunk in &chunks { if let Some(ref ctx) = chunk.metadata.heading_context { for heading in &ctx.headings { assert!(heading.level >= 1 && heading.level <= 6, "Heading level should be 1-6"); assert!(!heading.text.is_empty(), "Heading text should not be empty"); } } } } /// Test that chunk_type is populated for markdown chunks. #[tokio::test] #[cfg(feature = "chunking")] async fn test_chunk_type_populated() { let markdown = r#"# Introduction This section introduces the document with some content. ## Code Example ```rust fn hello() { println!("Hello, world!"); } ``` ## Summary A brief summary of the document. 
"#; let config = ExtractionConfig { chunking: Some(ChunkingConfig { max_characters: 200, overlap: 0, chunker_type: kreuzberg::ChunkerType::Markdown, ..Default::default() }), ..Default::default() }; let result = extract_bytes(markdown.as_bytes(), "text/markdown", &config) .await .expect("Should extract successfully"); assert!(result.chunks.is_some(), "Chunks should be present"); let chunks = result.chunks.expect("Should have chunks"); assert!(!chunks.is_empty(), "Should have at least one chunk"); // Verify chunk metadata is populated for chunk in &chunks { assert!(!chunk.content.is_empty(), "Chunk should not be empty"); // chunk_type must always be set (never uninitialized) let _ = &chunk.chunk_type; } // At least one chunk should have heading context (from markdown structure) let has_heading_context = chunks.iter().any(|c| c.metadata.heading_context.is_some()); assert!( has_heading_context, "At least one chunk should have heading context from markdown structure" ); } /// Test chunking with embeddings using balanced preset. /// /// This test requires ONNX Runtime to be installed as a system dependency. /// On macOS with Homebrew: `brew install onnxruntime` /// On Linux: Install via your package manager or download from https://github.com/microsoft/onnxruntime/releases /// On Windows: Download from https://github.com/microsoft/onnxruntime/releases #[tokio::test] #[cfg(feature = "embeddings")] #[cfg_attr(target_os = "macos", ignore = "ONNX models not cached on macOS")] #[cfg_attr(target_os = "windows", ignore = "ONNX models not cached on Windows")] async fn test_chunking_with_embeddings() { use kreuzberg::core::config::EmbeddingConfig; let config = ExtractionConfig { chunking: Some(ChunkingConfig { max_characters: 100, overlap: 20, embedding: Some(EmbeddingConfig::default()), ..Default::default() }), ..Default::default() }; let text = "This is a test document for embedding generation. 
".repeat(10); let text_bytes = text.as_bytes(); let result = extract_bytes(text_bytes, "text/plain", &config) .await .expect("Should extract successfully"); assert!(result.chunks.is_some(), "Chunks should be present"); let chunks = result.chunks.expect("Operation failed"); assert!(chunks.len() > 1, "Should have multiple chunks"); if let Some(error) = result.metadata.additional.get("embedding_error") { panic!("Embedding generation failed: {}", error); } for chunk in &chunks { assert!(chunk.embedding.is_some(), "Each chunk should have an embedding"); let embedding = chunk.embedding.as_ref().expect("Operation failed"); assert_eq!( embedding.len(), 768, "Embedding should have 768 dimensions for balanced preset" ); let magnitude: f32 = embedding.iter().map(|x| x * x).sum::().sqrt(); assert!( (magnitude - 1.0).abs() < 0.01, "Embedding should be normalized (magnitude ~= 1.0)" ); } } /// Test chunking with fast embedding preset. /// /// This test requires ONNX Runtime to be installed as a system dependency. /// On macOS with Homebrew: `brew install onnxruntime` /// On Linux: Install via your package manager or download from https://github.com/microsoft/onnxruntime/releases /// On Windows: Download from https://github.com/microsoft/onnxruntime/releases #[tokio::test] #[cfg(feature = "embeddings")] #[cfg_attr(target_os = "macos", ignore = "ONNX models not cached on macOS")] #[cfg_attr(target_os = "windows", ignore = "ONNX models not cached on Windows")] async fn test_chunking_with_fast_embeddings() { use kreuzberg::core::config::{EmbeddingConfig, EmbeddingModelType}; let config = ExtractionConfig { chunking: Some(ChunkingConfig { max_characters: 100, overlap: 20, embedding: Some(EmbeddingConfig { model: EmbeddingModelType::Preset { name: "fast".to_string(), }, ..Default::default() }), ..Default::default() }), ..Default::default() }; let text = "Fast embedding test. 
".repeat(10); let text_bytes = text.as_bytes(); let result = extract_bytes(text_bytes, "text/plain", &config) .await .expect("Should extract successfully"); let chunks = result.chunks.expect("Should have chunks"); assert!(!chunks.is_empty(), "Should have at least one chunk"); if let Some(error) = result.metadata.additional.get("embedding_error") { panic!("Embedding generation failed: {}", error); } for chunk in &chunks { let embedding = chunk.embedding.as_ref().expect("Should have embedding"); assert_eq!(embedding.len(), 384, "Fast preset should produce 384-dim embeddings"); } }