crates/kreuzberg/tests/config_features.rs

//! Configuration features integration tests.
//!
//! Tests for chunking, language detection, caching, token reduction, and quality processing.
//! Validates that configuration options work correctly end-to-end.

#[cfg(feature = "chunking")]
use kreuzberg::core::config::ChunkingConfig;
use kreuzberg::core::config::ExtractionConfig;
#[cfg(feature = "language-detection")]
use kreuzberg::core::config::LanguageDetectionConfig;
use kreuzberg::core::config::TokenReductionOptions;
use kreuzberg::core::extractor::extract_bytes;

mod helpers;

/// Test chunking enabled - text split into chunks.
#[tokio::test]
#[cfg(feature = "chunking")]
async fn test_chunking_enabled() {
    let config = ExtractionConfig {
        chunking: Some(ChunkingConfig {
            max_characters: 50,
            overlap: 10,
            ..Default::default()
        }),
        ..Default::default()
    };

    let text = "This is a long text that should be split into multiple chunks. ".repeat(10);
    let text_bytes = text.as_bytes();

    let result = extract_bytes(text_bytes, "text/plain", &config)
        .await
        .expect("Should extract successfully");

    assert!(result.chunks.is_some(), "Chunks should be present");
    let chunks = result.chunks.expect("Operation failed");
    assert!(chunks.len() > 1, "Should have multiple chunks");

    for chunk in &chunks {
        assert!(!chunk.content.is_empty(), "Chunk should not be empty");
        assert!(
            chunk.content.len() <= 50 + 10,
            "Chunk length {} exceeds max_chars + overlap",
            chunk.content.len()
        );
    }
}

/// Test chunking with overlap - overlap preserved between chunks.
#[tokio::test]
#[cfg(feature = "chunking")]
async fn test_chunking_with_overlap() {
    let config = ExtractionConfig {
        chunking: Some(ChunkingConfig {
            max_characters: 100,
            overlap: 20,
            ..Default::default()
        }),
        ..Default::default()
    };

    let text = "a".repeat(250);
    let text_bytes = text.as_bytes();

    let result = extract_bytes(text_bytes, "text/plain", &config)
        .await
        .expect("Should extract successfully");

    assert!(result.chunks.is_some(), "Chunks should be present");
    let chunks = result.chunks.expect("Operation failed");
    assert!(chunks.len() >= 2, "Should have at least 2 chunks");

    if chunks.len() >= 2 {
        let chunk1 = &chunks[0];
        let chunk2 = &chunks[1];

        let chunk1_end = &chunk1.content[chunk1.content.len().saturating_sub(20)..];
        assert!(
            chunk2.content.starts_with(chunk1_end)
                || chunk1_end.starts_with(&chunk2.content[..chunk1_end.len().min(chunk2.content.len())]),
            "Chunks should have overlap"
        );
    }
}

/// Test chunking with custom sizes - custom chunk size and overlap.
#[tokio::test]
#[cfg(feature = "chunking")]
async fn test_chunking_custom_sizes() {
    let config = ExtractionConfig {
        chunking: Some(ChunkingConfig {
            max_characters: 200,
            overlap: 50,
            ..Default::default()
        }),
        ..Default::default()
    };

    let text = "Custom chunk test. ".repeat(50);
    let text_bytes = text.as_bytes();

    let result = extract_bytes(text_bytes, "text/plain", &config)
        .await
        .expect("Should extract successfully");

    assert!(result.chunks.is_some(), "Chunks should be present");
    let chunks = result.chunks.expect("Operation failed");
    assert!(!chunks.is_empty(), "Should have at least 1 chunk");

    for chunk in &chunks {
        assert!(
            chunk.content.len() <= 200 + 50,
            "Chunk length {} exceeds custom max_chars + overlap",
            chunk.content.len()
        );
    }
}

/// Test chunking disabled - no chunking when disabled.
#[tokio::test]
async fn test_chunking_disabled() {
    let config = ExtractionConfig {
        chunking: None,
        ..Default::default()
    };

    let text = "This is a long text that should NOT be split into chunks. ".repeat(10);
    let text_bytes = text.as_bytes();

    let result = extract_bytes(text_bytes, "text/plain", &config)
        .await
        .expect("Should extract successfully");

    assert!(result.chunks.is_none(), "Should not have chunks when chunking disabled");

    assert!(!result.content.is_empty(), "Content should be extracted");
    assert!(result.content.contains("long text"), "Should contain original text");
}

/// Test language detection for single language document.
#[tokio::test]
#[cfg(feature = "language-detection")]
async fn test_language_detection_single() {
    let config = ExtractionConfig {
        language_detection: Some(LanguageDetectionConfig {
            enabled: true,
            min_confidence: 0.8,
            detect_multiple: false,
        }),
        ..Default::default()
    };

    let text = "Hello world! This is English text. It should be detected as English language.";
    let text_bytes = text.as_bytes();

    let result = extract_bytes(text_bytes, "text/plain", &config)
        .await
        .expect("Should extract successfully");

    assert!(result.detected_languages.is_some(), "Should detect language");
    let languages = result.detected_languages.expect("Operation failed");
    assert!(!languages.is_empty(), "Should detect at least one language");
    assert_eq!(languages[0], "eng", "Should detect English");
}

/// Test language detection for multi-language document.
#[cfg_attr(coverage, ignore = "coverage instrumentation affects multi-language heuristics")]
#[tokio::test]
#[cfg(feature = "language-detection")]
async fn test_language_detection_multiple() {
    let config = ExtractionConfig {
        language_detection: Some(LanguageDetectionConfig {
            enabled: true,
            min_confidence: 0.7,
            detect_multiple: true,
        }),
        ..Default::default()
    };

    let text = "Hello world! This is English. ".repeat(10) + "Hola mundo! Este es español. ".repeat(10).as_str();
    let text_bytes = text.as_bytes();

    let result = extract_bytes(text_bytes, "text/plain", &config)
        .await
        .expect("Should extract successfully");

    assert!(result.detected_languages.is_some(), "Should detect languages");
    let languages = result.detected_languages.expect("Operation failed");
    assert!(!languages.is_empty(), "Should detect at least one language");
}

/// Test language detection with confidence threshold.
#[tokio::test]
#[cfg(feature = "language-detection")]
async fn test_language_detection_confidence() {
    let config = ExtractionConfig {
        language_detection: Some(LanguageDetectionConfig {
            enabled: true,
            min_confidence: 0.9,
            detect_multiple: false,
        }),
        ..Default::default()
    };

    let text = "This is clear English text that should have high confidence.";
    let text_bytes = text.as_bytes();

    let result = extract_bytes(text_bytes, "text/plain", &config)
        .await
        .expect("Should extract successfully");

    if let Some(languages) = result.detected_languages {
        assert!(!languages.is_empty());
    }
}

/// Test language detection disabled.
#[tokio::test]
#[cfg(feature = "language-detection")]
async fn test_language_detection_disabled() {
    let config = ExtractionConfig {
        language_detection: Some(LanguageDetectionConfig {
            enabled: false,
            min_confidence: 0.8,
            detect_multiple: false,
        }),
        ..Default::default()
    };

    let text = "Hello world! This is English text.";
    let text_bytes = text.as_bytes();

    let result = extract_bytes(text_bytes, "text/plain", &config)
        .await
        .expect("Should extract successfully");

    assert!(
        result.detected_languages.is_none(),
        "Should not detect language when disabled"
    );
}

/// Test cache hit behavior - second extraction from cache.
#[tokio::test]
async fn test_cache_hit_behavior() {
    let config = ExtractionConfig {
        use_cache: true,
        ..Default::default()
    };

    let text = "Test text for caching behavior.";
    let text_bytes = text.as_bytes();

    let result1 = extract_bytes(text_bytes, "text/plain", &config)
        .await
        .expect("First extraction should succeed");

    let result2 = extract_bytes(text_bytes, "text/plain", &config)
        .await
        .expect("Second extraction should succeed");

    assert_eq!(result1.content, result2.content);
}

/// Test cache miss and invalidation.
#[tokio::test]
async fn test_cache_miss_invalidation() {
    let config = ExtractionConfig {
        use_cache: true,
        ..Default::default()
    };

    let text1 = "First text for cache test.";
    let text2 = "Second different text.";

    let result1 = extract_bytes(text1.as_bytes(), "text/plain", &config)
        .await
        .expect("First extraction should succeed");

    let result2 = extract_bytes(text2.as_bytes(), "text/plain", &config)
        .await
        .expect("Second extraction should succeed");

    assert_ne!(result1.content, result2.content);
}

/// Test custom cache directory (Note: OCR cache uses hardcoded directory).
#[tokio::test]
async fn test_custom_cache_directory() {
    let config = ExtractionConfig {
        use_cache: true,
        ..Default::default()
    };

    let text = "Test text for cache directory test.";
    let text_bytes = text.as_bytes();

    let result = extract_bytes(text_bytes, "text/plain", &config)
        .await
        .expect("Should extract successfully");

    assert!(!result.content.is_empty());
}

/// Test cache disabled - bypass cache.
#[tokio::test]
async fn test_cache_disabled() {
    let config = ExtractionConfig {
        use_cache: false,
        ..Default::default()
    };

    let text = "Test text without caching.";
    let text_bytes = text.as_bytes();

    let result1 = extract_bytes(text_bytes, "text/plain", &config)
        .await
        .expect("First extraction should succeed");

    let result2 = extract_bytes(text_bytes, "text/plain", &config)
        .await
        .expect("Second extraction should succeed");

    assert_eq!(result1.content, result2.content);
}

/// Test token reduction in aggressive mode.
#[tokio::test]
async fn test_token_reduction_aggressive() {
    let config = ExtractionConfig {
        token_reduction: Some(TokenReductionOptions {
            mode: "aggressive".to_string(),
            preserve_important_words: true,
        }),
        ..Default::default()
    };

    let text = "This is a very long sentence with many unnecessary words that could be reduced. ".repeat(5);
    let text_bytes = text.as_bytes();

    let result = extract_bytes(text_bytes, "text/plain", &config)
        .await
        .expect("Should extract successfully");

    assert!(!result.content.is_empty());
}

/// Test token reduction in conservative mode.
#[tokio::test]
async fn test_token_reduction_conservative() {
    let config = ExtractionConfig {
        token_reduction: Some(TokenReductionOptions {
            mode: "light".to_string(),
            preserve_important_words: true,
        }),
        ..Default::default()
    };

    let text = "Conservative token reduction test with moderate text length.";
    let text_bytes = text.as_bytes();

    let result = extract_bytes(text_bytes, "text/plain", &config)
        .await
        .expect("Should extract successfully");

    assert!(!result.content.is_empty());
}

/// Test token reduction disabled.
#[tokio::test]
async fn test_token_reduction_disabled() {
    let config = ExtractionConfig {
        token_reduction: Some(TokenReductionOptions {
            mode: "off".to_string(),
            preserve_important_words: false,
        }),
        ..Default::default()
    };

    let text = "Text without token reduction applied.";
    let text_bytes = text.as_bytes();

    let result = extract_bytes(text_bytes, "text/plain", &config)
        .await
        .expect("Should extract successfully");

    assert!(result.content.contains("without token reduction"));
}

/// Test quality processing enabled - quality scoring applied.
#[tokio::test]
#[cfg(feature = "quality")]
async fn test_quality_processing_enabled() {
    let config = ExtractionConfig {
        enable_quality_processing: true,
        ..Default::default()
    };

    let text = "This is well-structured text. It has multiple sentences. And proper punctuation.";
    let text_bytes = text.as_bytes();

    let result = extract_bytes(text_bytes, "text/plain", &config)
        .await
        .expect("Should extract successfully");

    if let Some(score_value) = result.quality_score {
        assert!((0.0..=1.0).contains(&score_value));
    }

    assert!(!result.content.is_empty());
}

/// Test quality processing calculates score for different text quality.
#[tokio::test]
#[cfg(feature = "quality")]
async fn test_quality_threshold_filtering() {
    let config = ExtractionConfig {
        enable_quality_processing: true,
        ..Default::default()
    };

    let high_quality = "This is a well-structured document. It has proper sentences. And good formatting.";
    let result_high = extract_bytes(high_quality.as_bytes(), "text/plain", &config)
        .await
        .expect("Should extract successfully");

    let low_quality = "a  b  c  d  ....... word123mixed .  . ";
    let result_low = extract_bytes(low_quality.as_bytes(), "text/plain", &config)
        .await
        .expect("Should extract successfully");

    assert!(result_high.quality_score.is_some(), "High quality should have score");
    assert!(result_low.quality_score.is_some(), "Low quality should have score");

    let score_high = result_high.quality_score.expect("High quality should have score");
    let score_low = result_low.quality_score.expect("Low quality should have score");

    assert!((0.0..=1.0).contains(&score_high));
    assert!((0.0..=1.0).contains(&score_low));
}

/// Test quality processing disabled.
#[tokio::test]
async fn test_quality_processing_disabled() {
    let config = ExtractionConfig {
        enable_quality_processing: false,
        ..Default::default()
    };

    let text = "Text without quality processing.";
    let text_bytes = text.as_bytes();

    let result = extract_bytes(text_bytes, "text/plain", &config)
        .await
        .expect("Should extract successfully");

    assert!(result.quality_score.is_none());
    assert!(!result.content.is_empty());
}

/// Test markdown chunker populates heading context.
#[tokio::test]
#[cfg(feature = "chunking")]
async fn test_markdown_chunker_heading_context() {
    let markdown = r#"# Title

Some intro text.

## Section One

Content in section one with enough text to create a chunk.

## Section Two

Content in section two with enough text to create another chunk.

### Subsection

More detailed content here in the subsection.
"#;

    let config = ExtractionConfig {
        chunking: Some(ChunkingConfig {
            max_characters: 80,
            overlap: 10,
            chunker_type: kreuzberg::ChunkerType::Markdown,
            ..Default::default()
        }),
        ..Default::default()
    };

    let result = extract_bytes(markdown.as_bytes(), "text/markdown", &config)
        .await
        .expect("Should extract successfully");

    assert!(result.chunks.is_some(), "Chunks should be present");
    let chunks = result.chunks.expect("Should have chunks");
    assert!(chunks.len() >= 2, "Should have at least 2 chunks");

    // At least one chunk should have heading_context populated
    let has_heading = chunks.iter().any(|c| c.metadata.heading_context.is_some());
    assert!(has_heading, "At least one chunk should have heading_context");

    // Verify heading context structure
    for chunk in &chunks {
        if let Some(ref ctx) = chunk.metadata.heading_context {
            for heading in &ctx.headings {
                assert!(heading.level >= 1 && heading.level <= 6, "Heading level should be 1-6");
                assert!(!heading.text.is_empty(), "Heading text should not be empty");
            }
        }
    }
}

/// Test that chunk_type is populated for markdown chunks.
#[tokio::test]
#[cfg(feature = "chunking")]
async fn test_chunk_type_populated() {
    let markdown = r#"# Introduction

This section introduces the document with some content.

## Code Example

```rust
fn hello() {
    println!("Hello, world!");
}
```

## Summary

A brief summary of the document.
"#;

    let config = ExtractionConfig {
        chunking: Some(ChunkingConfig {
            max_characters: 200,
            overlap: 0,
            chunker_type: kreuzberg::ChunkerType::Markdown,
            ..Default::default()
        }),
        ..Default::default()
    };

    let result = extract_bytes(markdown.as_bytes(), "text/markdown", &config)
        .await
        .expect("Should extract successfully");

    assert!(result.chunks.is_some(), "Chunks should be present");
    let chunks = result.chunks.expect("Should have chunks");
    assert!(!chunks.is_empty(), "Should have at least one chunk");

    // Verify chunk metadata is populated
    for chunk in &chunks {
        assert!(!chunk.content.is_empty(), "Chunk should not be empty");
        // chunk_type must always be set (never uninitialized)
        let _ = &chunk.chunk_type;
    }

    // At least one chunk should have heading context (from markdown structure)
    let has_heading_context = chunks.iter().any(|c| c.metadata.heading_context.is_some());
    assert!(
        has_heading_context,
        "At least one chunk should have heading context from markdown structure"
    );
}

/// Test chunking with embeddings using balanced preset.
///
/// This test requires ONNX Runtime to be installed as a system dependency.
/// On macOS with Homebrew: `brew install onnxruntime`
/// On Linux: Install via your package manager or download from https://github.com/microsoft/onnxruntime/releases
/// On Windows: Download from https://github.com/microsoft/onnxruntime/releases
#[tokio::test]
#[cfg(feature = "embeddings")]
#[cfg_attr(target_os = "macos", ignore = "ONNX models not cached on macOS")]
#[cfg_attr(target_os = "windows", ignore = "ONNX models not cached on Windows")]
async fn test_chunking_with_embeddings() {
    use kreuzberg::core::config::EmbeddingConfig;

    let config = ExtractionConfig {
        chunking: Some(ChunkingConfig {
            max_characters: 100,
            overlap: 20,
            embedding: Some(EmbeddingConfig::default()),
            ..Default::default()
        }),
        ..Default::default()
    };

    let text = "This is a test document for embedding generation. ".repeat(10);
    let text_bytes = text.as_bytes();

    let result = extract_bytes(text_bytes, "text/plain", &config)
        .await
        .expect("Should extract successfully");

    assert!(result.chunks.is_some(), "Chunks should be present");
    let chunks = result.chunks.expect("Operation failed");
    assert!(chunks.len() > 1, "Should have multiple chunks");

    if let Some(error) = result.metadata.additional.get("embedding_error") {
        panic!("Embedding generation failed: {}", error);
    }

    for chunk in &chunks {
        assert!(chunk.embedding.is_some(), "Each chunk should have an embedding");
        let embedding = chunk.embedding.as_ref().expect("Operation failed");
        assert_eq!(
            embedding.len(),
            768,
            "Embedding should have 768 dimensions for balanced preset"
        );

        let magnitude: f32 = embedding.iter().map(|x| x * x).sum::<f32>().sqrt();
        assert!(
            (magnitude - 1.0).abs() < 0.01,
            "Embedding should be normalized (magnitude ~= 1.0)"
        );
    }
}

/// Test chunking with fast embedding preset.
///
/// This test requires ONNX Runtime to be installed as a system dependency.
/// On macOS with Homebrew: `brew install onnxruntime`
/// On Linux: Install via your package manager or download from https://github.com/microsoft/onnxruntime/releases
/// On Windows: Download from https://github.com/microsoft/onnxruntime/releases
#[tokio::test]
#[cfg(feature = "embeddings")]
#[cfg_attr(target_os = "macos", ignore = "ONNX models not cached on macOS")]
#[cfg_attr(target_os = "windows", ignore = "ONNX models not cached on Windows")]
async fn test_chunking_with_fast_embeddings() {
    use kreuzberg::core::config::{EmbeddingConfig, EmbeddingModelType};

    let config = ExtractionConfig {
        chunking: Some(ChunkingConfig {
            max_characters: 100,
            overlap: 20,
            embedding: Some(EmbeddingConfig {
                model: EmbeddingModelType::Preset {
                    name: "fast".to_string(),
                },
                ..Default::default()
            }),
            ..Default::default()
        }),
        ..Default::default()
    };

    let text = "Fast embedding test. ".repeat(10);
    let text_bytes = text.as_bytes();

    let result = extract_bytes(text_bytes, "text/plain", &config)
        .await
        .expect("Should extract successfully");

    let chunks = result.chunks.expect("Should have chunks");
    assert!(!chunks.is_empty(), "Should have at least one chunk");

    if let Some(error) = result.metadata.additional.get("embedding_error") {
        panic!("Embedding generation failed: {}", error);
    }

    for chunk in &chunks {
        let embedding = chunk.embedding.as_ref().expect("Should have embedding");
        assert_eq!(embedding.len(), 384, "Fast preset should produce 384-dim embeddings");
    }
}
Nomad changes 2026-06-01 23:40:55 +02:00			`//! Configuration features integration tests.`
			`//!`
			`//! Tests for chunking, language detection, caching, token reduction, and quality processing.`
			`//! Validates that configuration options work correctly end-to-end.`

			`#[cfg(feature = "chunking")]`
			`use kreuzberg::core::config::ChunkingConfig;`
			`use kreuzberg::core::config::ExtractionConfig;`
			`#[cfg(feature = "language-detection")]`
			`use kreuzberg::core::config::LanguageDetectionConfig;`
			`use kreuzberg::core::config::TokenReductionOptions;`
			`use kreuzberg::core::extractor::extract_bytes;`

			`mod helpers;`

			`/// Test chunking enabled - text split into chunks.`
			`#[tokio::test]`
			`#[cfg(feature = "chunking")]`
			`async fn test_chunking_enabled() {`
			`let config = ExtractionConfig {`
			`chunking: Some(ChunkingConfig {`
			`max_characters: 50,`
			`overlap: 10,`
			`..Default::default()`
			`}),`
			`..Default::default()`
			`};`

			`let text = "This is a long text that should be split into multiple chunks. ".repeat(10);`
			`let text_bytes = text.as_bytes();`

			`let result = extract_bytes(text_bytes, "text/plain", &config)`
			`.await`
			`.expect("Should extract successfully");`

			`assert!(result.chunks.is_some(), "Chunks should be present");`
			`let chunks = result.chunks.expect("Operation failed");`
			`assert!(chunks.len() > 1, "Should have multiple chunks");`

			`for chunk in &chunks {`
			`assert!(!chunk.content.is_empty(), "Chunk should not be empty");`
			`assert!(`
			`chunk.content.len() <= 50 + 10,`
			`"Chunk length {} exceeds max_chars + overlap",`
			`chunk.content.len()`
			`);`
			`}`
			`}`

			`/// Test chunking with overlap - overlap preserved between chunks.`
			`#[tokio::test]`
			`#[cfg(feature = "chunking")]`
			`async fn test_chunking_with_overlap() {`
			`let config = ExtractionConfig {`
			`chunking: Some(ChunkingConfig {`
			`max_characters: 100,`
			`overlap: 20,`
			`..Default::default()`
			`}),`
			`..Default::default()`
			`};`

			`let text = "a".repeat(250);`
			`let text_bytes = text.as_bytes();`

			`let result = extract_bytes(text_bytes, "text/plain", &config)`
			`.await`
			`.expect("Should extract successfully");`

			`assert!(result.chunks.is_some(), "Chunks should be present");`
			`let chunks = result.chunks.expect("Operation failed");`
			`assert!(chunks.len() >= 2, "Should have at least 2 chunks");`

			`if chunks.len() >= 2 {`
			`let chunk1 = &chunks[0];`
			`let chunk2 = &chunks[1];`

			`let chunk1_end = &chunk1.content[chunk1.content.len().saturating_sub(20)..];`
			`assert!(`
			`chunk2.content.starts_with(chunk1_end)`
			`\|\| chunk1_end.starts_with(&chunk2.content[..chunk1_end.len().min(chunk2.content.len())]),`
			`"Chunks should have overlap"`
			`);`
			`}`
			`}`

			`/// Test chunking with custom sizes - custom chunk size and overlap.`
			`#[tokio::test]`
			`#[cfg(feature = "chunking")]`
			`async fn test_chunking_custom_sizes() {`
			`let config = ExtractionConfig {`
			`chunking: Some(ChunkingConfig {`
			`max_characters: 200,`
			`overlap: 50,`
			`..Default::default()`
			`}),`
			`..Default::default()`
			`};`

			`let text = "Custom chunk test. ".repeat(50);`
			`let text_bytes = text.as_bytes();`

			`let result = extract_bytes(text_bytes, "text/plain", &config)`
			`.await`
			`.expect("Should extract successfully");`

			`assert!(result.chunks.is_some(), "Chunks should be present");`
			`let chunks = result.chunks.expect("Operation failed");`
			`assert!(!chunks.is_empty(), "Should have at least 1 chunk");`

			`for chunk in &chunks {`
			`assert!(`
			`chunk.content.len() <= 200 + 50,`
			`"Chunk length {} exceeds custom max_chars + overlap",`
			`chunk.content.len()`
			`);`
			`}`
			`}`

			`/// Test chunking disabled - no chunking when disabled.`
			`#[tokio::test]`
			`async fn test_chunking_disabled() {`
			`let config = ExtractionConfig {`
			`chunking: None,`
			`..Default::default()`
			`};`

			`let text = "This is a long text that should NOT be split into chunks. ".repeat(10);`
			`let text_bytes = text.as_bytes();`

			`let result = extract_bytes(text_bytes, "text/plain", &config)`
			`.await`
			`.expect("Should extract successfully");`

			`assert!(result.chunks.is_none(), "Should not have chunks when chunking disabled");`

			`assert!(!result.content.is_empty(), "Content should be extracted");`
			`assert!(result.content.contains("long text"), "Should contain original text");`
			`}`

			`/// Test language detection for single language document.`
			`#[tokio::test]`
			`#[cfg(feature = "language-detection")]`
			`async fn test_language_detection_single() {`
			`let config = ExtractionConfig {`
			`language_detection: Some(LanguageDetectionConfig {`
			`enabled: true,`
			`min_confidence: 0.8,`
			`detect_multiple: false,`
			`}),`
			`..Default::default()`
			`};`

			`let text = "Hello world! This is English text. It should be detected as English language.";`
			`let text_bytes = text.as_bytes();`

			`let result = extract_bytes(text_bytes, "text/plain", &config)`
			`.await`
			`.expect("Should extract successfully");`

			`assert!(result.detected_languages.is_some(), "Should detect language");`
			`let languages = result.detected_languages.expect("Operation failed");`
			`assert!(!languages.is_empty(), "Should detect at least one language");`
			`assert_eq!(languages[0], "eng", "Should detect English");`
			`}`

			`/// Test language detection for multi-language document.`
			`#[cfg_attr(coverage, ignore = "coverage instrumentation affects multi-language heuristics")]`
			`#[tokio::test]`
			`#[cfg(feature = "language-detection")]`
			`async fn test_language_detection_multiple() {`
			`let config = ExtractionConfig {`
			`language_detection: Some(LanguageDetectionConfig {`
			`enabled: true,`
			`min_confidence: 0.7,`
			`detect_multiple: true,`
			`}),`
			`..Default::default()`
			`};`

			`let text = "Hello world! This is English. ".repeat(10) + "Hola mundo! Este es español. ".repeat(10).as_str();`
			`let text_bytes = text.as_bytes();`

			`let result = extract_bytes(text_bytes, "text/plain", &config)`
			`.await`
			`.expect("Should extract successfully");`

			`assert!(result.detected_languages.is_some(), "Should detect languages");`
			`let languages = result.detected_languages.expect("Operation failed");`
			`assert!(!languages.is_empty(), "Should detect at least one language");`
			`}`

			`/// Test language detection with confidence threshold.`
			`#[tokio::test]`
			`#[cfg(feature = "language-detection")]`
			`async fn test_language_detection_confidence() {`
			`let config = ExtractionConfig {`
			`language_detection: Some(LanguageDetectionConfig {`
			`enabled: true,`
			`min_confidence: 0.9,`
			`detect_multiple: false,`
			`}),`
			`..Default::default()`
			`};`

			`let text = "This is clear English text that should have high confidence.";`
			`let text_bytes = text.as_bytes();`

			`let result = extract_bytes(text_bytes, "text/plain", &config)`
			`.await`
			`.expect("Should extract successfully");`

			`if let Some(languages) = result.detected_languages {`
			`assert!(!languages.is_empty());`
			`}`
			`}`

			`/// Test language detection disabled.`
			`#[tokio::test]`
			`#[cfg(feature = "language-detection")]`
			`async fn test_language_detection_disabled() {`
			`let config = ExtractionConfig {`
			`language_detection: Some(LanguageDetectionConfig {`
			`enabled: false,`
			`min_confidence: 0.8,`
			`detect_multiple: false,`
			`}),`
			`..Default::default()`
			`};`

			`let text = "Hello world! This is English text.";`
			`let text_bytes = text.as_bytes();`

			`let result = extract_bytes(text_bytes, "text/plain", &config)`
			`.await`
			`.expect("Should extract successfully");`

			`assert!(`
			`result.detected_languages.is_none(),`
			`"Should not detect language when disabled"`
			`);`
			`}`

			`/// Test cache hit behavior - second extraction from cache.`
			`#[tokio::test]`
			`async fn test_cache_hit_behavior() {`
			`let config = ExtractionConfig {`
			`use_cache: true,`
			`..Default::default()`
			`};`

			`let text = "Test text for caching behavior.";`
			`let text_bytes = text.as_bytes();`

			`let result1 = extract_bytes(text_bytes, "text/plain", &config)`
			`.await`
			`.expect("First extraction should succeed");`

			`let result2 = extract_bytes(text_bytes, "text/plain", &config)`
			`.await`
			`.expect("Second extraction should succeed");`

			`assert_eq!(result1.content, result2.content);`
			`}`

			`/// Test cache miss and invalidation.`
			`#[tokio::test]`
			`async fn test_cache_miss_invalidation() {`
			`let config = ExtractionConfig {`
			`use_cache: true,`
			`..Default::default()`
			`};`

			`let text1 = "First text for cache test.";`
			`let text2 = "Second different text.";`

			`let result1 = extract_bytes(text1.as_bytes(), "text/plain", &config)`
			`.await`
			`.expect("First extraction should succeed");`

			`let result2 = extract_bytes(text2.as_bytes(), "text/plain", &config)`
			`.await`
			`.expect("Second extraction should succeed");`

			`assert_ne!(result1.content, result2.content);`
			`}`

			`/// Test custom cache directory (Note: OCR cache uses hardcoded directory).`
			`#[tokio::test]`
			`async fn test_custom_cache_directory() {`
			`let config = ExtractionConfig {`
			`use_cache: true,`
			`..Default::default()`
			`};`

			`let text = "Test text for cache directory test.";`
			`let text_bytes = text.as_bytes();`

			`let result = extract_bytes(text_bytes, "text/plain", &config)`
			`.await`
			`.expect("Should extract successfully");`

			`assert!(!result.content.is_empty());`
			`}`

			`/// Test cache disabled - bypass cache.`
			`#[tokio::test]`
			`async fn test_cache_disabled() {`
			`let config = ExtractionConfig {`
			`use_cache: false,`
			`..Default::default()`
			`};`

			`let text = "Test text without caching.";`
			`let text_bytes = text.as_bytes();`

			`let result1 = extract_bytes(text_bytes, "text/plain", &config)`
			`.await`
			`.expect("First extraction should succeed");`

			`let result2 = extract_bytes(text_bytes, "text/plain", &config)`
			`.await`
			`.expect("Second extraction should succeed");`

			`assert_eq!(result1.content, result2.content);`
			`}`

			`/// Test token reduction in aggressive mode.`
			`#[tokio::test]`
			`async fn test_token_reduction_aggressive() {`
			`let config = ExtractionConfig {`
			`token_reduction: Some(TokenReductionOptions {`
			`mode: "aggressive".to_string(),`
			`preserve_important_words: true,`
			`}),`
			`..Default::default()`
			`};`

			`let text = "This is a very long sentence with many unnecessary words that could be reduced. ".repeat(5);`
			`let text_bytes = text.as_bytes();`

			`let result = extract_bytes(text_bytes, "text/plain", &config)`
			`.await`
			`.expect("Should extract successfully");`

			`assert!(!result.content.is_empty());`
			`}`

			`/// Test token reduction in conservative mode.`
			`#[tokio::test]`
			`async fn test_token_reduction_conservative() {`
			`let config = ExtractionConfig {`
			`token_reduction: Some(TokenReductionOptions {`
			`mode: "light".to_string(),`
			`preserve_important_words: true,`
			`}),`
			`..Default::default()`
			`};`

			`let text = "Conservative token reduction test with moderate text length.";`
			`let text_bytes = text.as_bytes();`

			`let result = extract_bytes(text_bytes, "text/plain", &config)`
			`.await`
			`.expect("Should extract successfully");`

			`assert!(!result.content.is_empty());`
			`}`

			`/// Test token reduction disabled.`
			`#[tokio::test]`
			`async fn test_token_reduction_disabled() {`
			`let config = ExtractionConfig {`
			`token_reduction: Some(TokenReductionOptions {`
			`mode: "off".to_string(),`
			`preserve_important_words: false,`
			`}),`
			`..Default::default()`
			`};`

			`let text = "Text without token reduction applied.";`
			`let text_bytes = text.as_bytes();`

			`let result = extract_bytes(text_bytes, "text/plain", &config)`
			`.await`
			`.expect("Should extract successfully");`

			`assert!(result.content.contains("without token reduction"));`
			`}`

			`/// Test quality processing enabled - quality scoring applied.`
			`#[tokio::test]`
			`#[cfg(feature = "quality")]`
			`async fn test_quality_processing_enabled() {`
			`let config = ExtractionConfig {`
			`enable_quality_processing: true,`
			`..Default::default()`
			`};`

			`let text = "This is well-structured text. It has multiple sentences. And proper punctuation.";`
			`let text_bytes = text.as_bytes();`

			`let result = extract_bytes(text_bytes, "text/plain", &config)`
			`.await`
			`.expect("Should extract successfully");`

			`if let Some(score_value) = result.quality_score {`
			`assert!((0.0..=1.0).contains(&score_value));`
			`}`

			`assert!(!result.content.is_empty());`
			`}`

			`/// Test quality processing calculates score for different text quality.`
			`#[tokio::test]`
			`#[cfg(feature = "quality")]`
			`async fn test_quality_threshold_filtering() {`
			`let config = ExtractionConfig {`
			`enable_quality_processing: true,`
			`..Default::default()`
			`};`

			`let high_quality = "This is a well-structured document. It has proper sentences. And good formatting.";`
			`let result_high = extract_bytes(high_quality.as_bytes(), "text/plain", &config)`
			`.await`
			`.expect("Should extract successfully");`

			`let low_quality = "a b c d ....... word123mixed . . ";`
			`let result_low = extract_bytes(low_quality.as_bytes(), "text/plain", &config)`
			`.await`
			`.expect("Should extract successfully");`

			`assert!(result_high.quality_score.is_some(), "High quality should have score");`
			`assert!(result_low.quality_score.is_some(), "Low quality should have score");`

			`let score_high = result_high.quality_score.expect("High quality should have score");`
			`let score_low = result_low.quality_score.expect("Low quality should have score");`

			`assert!((0.0..=1.0).contains(&score_high));`
			`assert!((0.0..=1.0).contains(&score_low));`
			`}`

			`/// Test quality processing disabled.`
			`#[tokio::test]`
			`async fn test_quality_processing_disabled() {`
			`let config = ExtractionConfig {`
			`enable_quality_processing: false,`
			`..Default::default()`
			`};`

			`let text = "Text without quality processing.";`
			`let text_bytes = text.as_bytes();`

			`let result = extract_bytes(text_bytes, "text/plain", &config)`
			`.await`
			`.expect("Should extract successfully");`

			`assert!(result.quality_score.is_none());`
			`assert!(!result.content.is_empty());`
			`}`

			`/// Test markdown chunker populates heading context.`
			`#[tokio::test]`
			`#[cfg(feature = "chunking")]`
			`async fn test_markdown_chunker_heading_context() {`
			`let markdown = r#"# Title`

			`Some intro text.`

			`## Section One`

			`Content in section one with enough text to create a chunk.`

			`## Section Two`

			`Content in section two with enough text to create another chunk.`

			`### Subsection`

			`More detailed content here in the subsection.`
			`"#;`

			`let config = ExtractionConfig {`
			`chunking: Some(ChunkingConfig {`
			`max_characters: 80,`
			`overlap: 10,`
			`chunker_type: kreuzberg::ChunkerType::Markdown,`
			`..Default::default()`
			`}),`
			`..Default::default()`
			`};`

			`let result = extract_bytes(markdown.as_bytes(), "text/markdown", &config)`
			`.await`
			`.expect("Should extract successfully");`

			`assert!(result.chunks.is_some(), "Chunks should be present");`
			`let chunks = result.chunks.expect("Should have chunks");`
			`assert!(chunks.len() >= 2, "Should have at least 2 chunks");`

			`// At least one chunk should have heading_context populated`
			`let has_heading = chunks.iter().any(\|c\| c.metadata.heading_context.is_some());`
			`assert!(has_heading, "At least one chunk should have heading_context");`

			`// Verify heading context structure`
			`for chunk in &chunks {`
			`if let Some(ref ctx) = chunk.metadata.heading_context {`
			`for heading in &ctx.headings {`
			`assert!(heading.level >= 1 && heading.level <= 6, "Heading level should be 1-6");`
			`assert!(!heading.text.is_empty(), "Heading text should not be empty");`
			`}`
			`}`
			`}`
			`}`

			`/// Test that chunk_type is populated for markdown chunks.`
			`#[tokio::test]`
			`#[cfg(feature = "chunking")]`
			`async fn test_chunk_type_populated() {`
			`let markdown = r#"# Introduction`

			`This section introduces the document with some content.`

			`## Code Example`

			```rust
			`fn hello() {`
			`println!("Hello, world!");`
			`}`
			```

			`## Summary`

			`A brief summary of the document.`
			`"#;`

			`let config = ExtractionConfig {`
			`chunking: Some(ChunkingConfig {`
			`max_characters: 200,`
			`overlap: 0,`
			`chunker_type: kreuzberg::ChunkerType::Markdown,`
			`..Default::default()`
			`}),`
			`..Default::default()`
			`};`

			`let result = extract_bytes(markdown.as_bytes(), "text/markdown", &config)`
			`.await`
			`.expect("Should extract successfully");`

			`assert!(result.chunks.is_some(), "Chunks should be present");`
			`let chunks = result.chunks.expect("Should have chunks");`
			`assert!(!chunks.is_empty(), "Should have at least one chunk");`

			`// Verify chunk metadata is populated`
			`for chunk in &chunks {`
			`assert!(!chunk.content.is_empty(), "Chunk should not be empty");`
			`// chunk_type must always be set (never uninitialized)`
			`let _ = &chunk.chunk_type;`
			`}`

			`// At least one chunk should have heading context (from markdown structure)`
			`let has_heading_context = chunks.iter().any(\|c\| c.metadata.heading_context.is_some());`
			`assert!(`
			`has_heading_context,`
			`"At least one chunk should have heading context from markdown structure"`
			`);`
			`}`

			`/// Test chunking with embeddings using balanced preset.`
			`///`
			`/// This test requires ONNX Runtime to be installed as a system dependency.`
			/// On macOS with Homebrew: `brew install onnxruntime`
			`/// On Linux: Install via your package manager or download from https://github.com/microsoft/onnxruntime/releases`
			`/// On Windows: Download from https://github.com/microsoft/onnxruntime/releases`
			`#[tokio::test]`
			`#[cfg(feature = "embeddings")]`
			`#[cfg_attr(target_os = "macos", ignore = "ONNX models not cached on macOS")]`
			`#[cfg_attr(target_os = "windows", ignore = "ONNX models not cached on Windows")]`
			`async fn test_chunking_with_embeddings() {`
			`use kreuzberg::core::config::EmbeddingConfig;`

			`let config = ExtractionConfig {`
			`chunking: Some(ChunkingConfig {`
			`max_characters: 100,`
			`overlap: 20,`
			`embedding: Some(EmbeddingConfig::default()),`
			`..Default::default()`
			`}),`
			`..Default::default()`
			`};`

			`let text = "This is a test document for embedding generation. ".repeat(10);`
			`let text_bytes = text.as_bytes();`

			`let result = extract_bytes(text_bytes, "text/plain", &config)`
			`.await`
			`.expect("Should extract successfully");`

			`assert!(result.chunks.is_some(), "Chunks should be present");`
			`let chunks = result.chunks.expect("Operation failed");`
			`assert!(chunks.len() > 1, "Should have multiple chunks");`

			`if let Some(error) = result.metadata.additional.get("embedding_error") {`
			`panic!("Embedding generation failed: {}", error);`
			`}`

			`for chunk in &chunks {`
			`assert!(chunk.embedding.is_some(), "Each chunk should have an embedding");`
			`let embedding = chunk.embedding.as_ref().expect("Operation failed");`
			`assert_eq!(`
			`embedding.len(),`
			`768,`
			`"Embedding should have 768 dimensions for balanced preset"`
			`);`

			`let magnitude: f32 = embedding.iter().map(\|x\| x * x).sum::<f32>().sqrt();`
			`assert!(`
			`(magnitude - 1.0).abs() < 0.01,`
			`"Embedding should be normalized (magnitude ~= 1.0)"`
			`);`
			`}`
			`}`

			`/// Test chunking with fast embedding preset.`
			`///`
			`/// This test requires ONNX Runtime to be installed as a system dependency.`
			/// On macOS with Homebrew: `brew install onnxruntime`
			`/// On Linux: Install via your package manager or download from https://github.com/microsoft/onnxruntime/releases`
			`/// On Windows: Download from https://github.com/microsoft/onnxruntime/releases`
			`#[tokio::test]`
			`#[cfg(feature = "embeddings")]`
			`#[cfg_attr(target_os = "macos", ignore = "ONNX models not cached on macOS")]`
			`#[cfg_attr(target_os = "windows", ignore = "ONNX models not cached on Windows")]`
			`async fn test_chunking_with_fast_embeddings() {`
			`use kreuzberg::core::config::{EmbeddingConfig, EmbeddingModelType};`

			`let config = ExtractionConfig {`
			`chunking: Some(ChunkingConfig {`
			`max_characters: 100,`
			`overlap: 20,`
			`embedding: Some(EmbeddingConfig {`
			`model: EmbeddingModelType::Preset {`
			`name: "fast".to_string(),`
			`},`
			`..Default::default()`
			`}),`
			`..Default::default()`
			`}),`
			`..Default::default()`
			`};`

			`let text = "Fast embedding test. ".repeat(10);`
			`let text_bytes = text.as_bytes();`

			`let result = extract_bytes(text_bytes, "text/plain", &config)`
			`.await`
			`.expect("Should extract successfully");`

			`let chunks = result.chunks.expect("Should have chunks");`
			`assert!(!chunks.is_empty(), "Should have at least one chunk");`

			`if let Some(error) = result.metadata.additional.get("embedding_error") {`
			`panic!("Embedding generation failed: {}", error);`
			`}`

			`for chunk in &chunks {`
			`let embedding = chunk.embedding.as_ref().expect("Should have embedding");`
			`assert_eq!(embedding.len(), 384, "Fast preset should produce 384-dim embeddings");`
			`}`
			`}`