crates/kreuzberg/tests/config_behavioral.rs

//! Config behavioral verification tests
//!
//! These tests verify that configuration options actually affect extraction behavior,
//! not just that they serialize correctly.
//!
//! Unlike serialization tests that only check if configs deserialize, these tests verify
//! that the configuration options actually influence the extraction process and produce
//! observable differences in the output.

#[cfg(feature = "chunking")]
use kreuzberg::core::config::ChunkingConfig;
use kreuzberg::core::config::ExtractionConfig;
use kreuzberg::core::config::OutputFormat;
use kreuzberg::core::extractor::extract_bytes;
use kreuzberg::types::ResultFormat;

mod helpers;

/// Verifies that `OutputFormat::Plain` yields text free of heading markup.
///
/// Plain-text input is used on purpose: HTML extractors often convert to markdown
/// internally, so `text/plain` input keeps the check focused on the `output_format`
/// setting itself.
#[tokio::test]
async fn test_output_format_plain_produces_plain() {
    let config = ExtractionConfig {
        output_format: OutputFormat::Plain,
        ..Default::default()
    };
    let input = b"Title\n\nParagraph with bold text.";

    let result = extract_bytes(input, "text/plain", &config)
        .await
        .expect("Should extract successfully");
    let content = &result.content;

    // Neither a markdown heading marker nor an HTML heading tag may show up.
    let has_markup = content.contains("# ") || content.contains("<h1>");
    assert!(
        !has_markup,
        "Plain format should not contain markdown headers or HTML tags, got: {}",
        result.content
    );

    // At least one of the words from the input has to survive extraction.
    let keeps_text = ["Title", "Paragraph"].iter().any(|&word| content.contains(word));
    assert!(keeps_text, "Should still contain extracted text content");
}

/// Verifies that `OutputFormat::Markdown` yields markdown syntax for HTML input.
///
/// Only compiled when the `html` feature is enabled.
#[tokio::test]
#[cfg(feature = "html")]
async fn test_output_format_markdown_produces_markdown() {
    let config = ExtractionConfig {
        output_format: OutputFormat::Markdown,
        ..Default::default()
    };
    let source = b"<h1>Title</h1><p>Paragraph with <strong>bold</strong> text.</p>";

    let result = extract_bytes(source, "text/html", &config)
        .await
        .expect("Should extract successfully");

    // Any one marker is enough: "# " (heading), "**" (strong) or "*" (emphasis).
    // Note that content containing "**" necessarily contains "*" as well.
    let markers = ["# ", "**", "*"];
    let has_markdown = markers.iter().any(|&marker| result.content.contains(marker));

    assert!(
        has_markdown,
        "Markdown format should contain # headers or ** bold, got: {}",
        result.content
    );
}

/// Verifies that extracting plain text with `OutputFormat::Html` succeeds and yields
/// non-empty output that contains no `<script>` tag.
#[tokio::test]
async fn test_output_format_html_produces_html() {
    let config = ExtractionConfig {
        output_format: OutputFormat::Html,
        ..Default::default()
    };
    let input = "Title\n\nParagraph with bold text.";

    let result = extract_bytes(input.as_bytes(), "text/plain", &config)
        .await
        .expect("Should extract successfully");
    let content = &result.content;

    // The input carries no script tag, so none may appear in the output either.
    let has_script_tag = content.contains("<script>");
    assert!(!has_script_tag, "HTML format should be safe from injection");

    let produced_output = !content.is_empty();
    assert!(produced_output, "Should produce content in HTML format");
}

/// Verifies that `ResultFormat::Unified` places the text in `content` and leaves
/// `elements` either absent or empty.
#[tokio::test]
async fn test_result_format_unified_structure() {
    let config = ExtractionConfig {
        result_format: ResultFormat::Unified,
        ..Default::default()
    };
    let input = "Sample content";

    let result = extract_bytes(input.as_bytes(), "text/plain", &config)
        .await
        .expect("Should extract successfully");

    // The extracted text lives in the main content field.
    let has_content = !result.content.is_empty();
    assert!(has_content, "Unified format should have content");

    // Both `None` and an empty list count as "no elements".
    let no_elements = result.elements.as_ref().is_none_or(|elements| elements.is_empty());
    assert!(no_elements, "Unified format should not have elements");
}

/// Verifies that `ResultFormat::ElementBased` populates `elements` with entries that
/// all carry non-empty text.
///
/// The test fails when `elements` is `None`; a missing element list is never treated
/// as a pass.
#[tokio::test]
async fn test_result_format_element_based_structure() {
    let text = "First paragraph here.\n\nSecond paragraph with more content.";

    let config = ExtractionConfig {
        result_format: ResultFormat::ElementBased,
        ..Default::default()
    };

    let result = extract_bytes(text.as_bytes(), "text/plain", &config)
        .await
        .expect("Should extract successfully");

    // Require the element list up front: skipping the checks below when it is absent
    // would let the test succeed without verifying anything.
    let elements = result
        .elements
        .as_ref()
        .expect("Element-based format should produce elements");
    assert!(!elements.is_empty(), "Element-based format should have elements");

    // Every element must carry some text.
    for element in elements {
        assert!(!element.text.is_empty(), "Elements should have non-empty text");
    }
}

/// Verifies that `max_characters` bounds the size of every produced chunk.
///
/// Each chunk may be at most `max_characters + overlap` (100 + 20) long, and the long
/// input must be split into more than one chunk.
#[tokio::test]
#[cfg(feature = "chunking")]
async fn test_chunking_max_chars_limits_chunk_size() {
    // Upper bound checked per chunk: max_characters (100) plus overlap (20).
    const SIZE_LIMIT: usize = 100 + 20;

    let config = ExtractionConfig {
        chunking: Some(ChunkingConfig {
            max_characters: 100,
            overlap: 20,
            ..Default::default()
        }),
        ..Default::default()
    };
    let input = "word ".repeat(500); // ~2500 characters

    let result = extract_bytes(input.as_bytes(), "text/plain", &config)
        .await
        .expect("Should extract successfully");

    let chunks = result.chunks.expect("Chunking should produce chunks");
    assert!(chunks.len() > 1, "Long text should produce multiple chunks");

    for (index, chunk) in chunks.iter().enumerate() {
        // `len()` counts bytes, which equals the character count for this ASCII input.
        let length = chunk.content.len();
        assert!(
            length <= SIZE_LIMIT,
            "Chunk {} exceeds max_chars + overlap: length = {}",
            index,
            length
        );
    }
}

/// Verifies that a non-zero `overlap` makes adjacent chunks share text.
///
/// The first two chunks overlap when some suffix of the first chunk (containing at
/// least one non-whitespace character) is also a prefix of the second chunk. The test
/// fails if chunking yields no chunks or fewer than two of them, so it can never pass
/// without performing the comparison.
#[tokio::test]
#[cfg(feature = "chunking")]
async fn test_chunking_overlap_creates_overlap() {
    let text = "First sentence. ".repeat(30); // ~480 characters

    let config = ExtractionConfig {
        chunking: Some(ChunkingConfig {
            max_characters: 50,
            overlap: 15,
            ..Default::default()
        }),
        ..Default::default()
    };

    let result = extract_bytes(text.as_bytes(), "text/plain", &config)
        .await
        .expect("Should extract successfully");

    let chunks = result.chunks.expect("Chunking should produce chunks");
    assert!(
        chunks.len() >= 2,
        "Text longer than max_characters should produce at least two chunks, got {}",
        chunks.len()
    );

    let first: &str = &chunks[0].content;
    let second: &str = &chunks[1].content;

    // Walk every suffix of the first chunk (split on char boundaries, so multi-byte
    // text cannot cause a slicing panic) and look for one that the second chunk
    // starts with. Whitespace-only suffixes do not count as shared text.
    // NOTE(review): the input is a repeated sentence, so a match could also arise
    // from the repetition alone; consider non-repeating input for stronger evidence.
    let overlap_found = first
        .char_indices()
        .map(|(start, _)| &first[start..])
        .any(|tail| !tail.trim().is_empty() && second.starts_with(tail));

    assert!(
        overlap_found,
        "Adjacent chunks should have overlapping non-whitespace text"
    );
}

/// Verifies that leaving `chunking` unset yields no chunks, even for long input.
#[tokio::test]
async fn test_chunking_disabled_produces_no_chunks() {
    let config = ExtractionConfig {
        chunking: None,
        ..Default::default()
    };
    let input = "word ".repeat(500);

    let result = extract_bytes(input.as_bytes(), "text/plain", &config)
        .await
        .expect("Should extract successfully");

    let chunks_absent = result.chunks.is_none();
    assert!(chunks_absent, "Chunking disabled should produce no chunks");
}

/// Verifies that two extractions of the same input with `use_cache: true` return
/// identical content.
///
/// Only the equality of the two results is observed here; whether the second call was
/// actually served from a cache is not checked.
#[tokio::test]
async fn test_cache_enabled_allows_caching() {
    let config = ExtractionConfig {
        use_cache: true,
        ..Default::default()
    };
    let input = "Test content for caching";
    let bytes = input.as_bytes();

    // Run the same extraction twice, back to back.
    let first = extract_bytes(bytes, "text/plain", &config)
        .await
        .expect("First extraction should succeed");
    let second = extract_bytes(bytes, "text/plain", &config)
        .await
        .expect("Second extraction should succeed");

    assert_eq!(
        first.content, second.content,
        "Cache enabled should produce consistent results"
    );
}

/// Verifies that extraction with `use_cache: false` succeeds and still yields content.
#[tokio::test]
async fn test_cache_disabled_does_not_crash() {
    let config = ExtractionConfig {
        use_cache: false,
        ..Default::default()
    };
    let input = "Test content without caching";

    let result = extract_bytes(input.as_bytes(), "text/plain", &config)
        .await
        .expect("Extraction with cache disabled should succeed");

    let has_content = !result.content.is_empty();
    assert!(has_content, "Should still extract content");
}

/// Verifies that `enable_quality_processing: true` populates `quality_score`.
///
/// Only compiled when the `quality` feature is enabled.
#[tokio::test]
#[cfg(feature = "quality")]
async fn test_quality_processing_enabled_produces_score() {
    let config = ExtractionConfig {
        enable_quality_processing: true,
        ..Default::default()
    };
    let input = "This is a well-structured document. It has proper sentences. And good formatting.";

    let result = extract_bytes(input.as_bytes(), "text/plain", &config)
        .await
        .expect("Should extract successfully");

    // The score is expected in the dedicated `quality_score` field.
    let has_score = result.quality_score.is_some();
    assert!(has_score, "Quality processing enabled should produce quality_score");
}

/// Verifies that `enable_quality_processing: false` leaves `quality_score` unset.
///
/// Only compiled when the `quality` feature is enabled.
#[tokio::test]
#[cfg(feature = "quality")]
async fn test_quality_processing_disabled_no_score() {
    let config = ExtractionConfig {
        enable_quality_processing: false,
        ..Default::default()
    };
    let input = "This is a document.";

    let result = extract_bytes(input.as_bytes(), "text/plain", &config)
        .await
        .expect("Should extract successfully");

    let score_absent = result.quality_score.is_none();
    assert!(
        score_absent,
        "Quality processing disabled should not produce quality_score"
    );
}

/// Verifies that `OutputFormat::Markdown` and `ResultFormat::ElementBased` can be
/// combined: elements are produced and `content` carries no `<p>` tag.
///
/// Only compiled when the `html` feature is enabled.
#[tokio::test]
#[cfg(feature = "html")]
async fn test_output_format_with_element_based() {
    let config = ExtractionConfig {
        output_format: OutputFormat::Markdown,
        result_format: ResultFormat::ElementBased,
        ..Default::default()
    };
    let source = b"<p>First paragraph</p><p>Second paragraph</p>";

    let result = extract_bytes(source, "text/html", &config)
        .await
        .expect("Should extract successfully");

    // The element list must be present for the element-based result format.
    let has_elements = result.elements.is_some();
    assert!(has_elements, "ElementBased format should produce elements");

    // The paragraph tags from the input must not leak into the content.
    let has_paragraph_tag = result.content.contains("<p>");
    assert!(!has_paragraph_tag, "Output format should not contain HTML tags");
}

/// Verifies that no chunk grows beyond `max_characters + overlap` (60 + 10).
///
/// The test fails if chunking yields no chunks at all, so the size check can never be
/// skipped silently.
#[tokio::test]
#[cfg(feature = "chunking")]
async fn test_chunking_overlap_maximum() {
    let text = "x".repeat(200); // Simple repeated character

    let config = ExtractionConfig {
        chunking: Some(ChunkingConfig {
            max_characters: 60,
            overlap: 10,
            ..Default::default()
        }),
        ..Default::default()
    };

    let result = extract_bytes(text.as_bytes(), "text/plain", &config)
        .await
        .expect("Should extract successfully");

    // Require actual chunks: an absent or empty list would make the loop below a no-op.
    let chunks = result.chunks.expect("Chunking should produce chunks");
    assert!(!chunks.is_empty(), "Non-empty text should produce at least one chunk");

    for (i, chunk) in chunks.iter().enumerate() {
        // `len()` counts bytes, which equals the character count for this ASCII input.
        assert!(
            chunk.content.len() <= 60 + 10,
            "Chunk {} size {} exceeds max_chars (60) + max_overlap (10)",
            i,
            chunk.content.len()
        );
    }
}

/// Verifies that several options work together on a larger input: plain output,
/// chunking, caching and quality processing.
///
/// Only compiled when the `chunking` feature is enabled; the quality-score check is
/// additionally gated on the `quality` feature.
#[tokio::test]
#[cfg(feature = "chunking")]
async fn test_large_document_with_combined_config() {
    let chunking = ChunkingConfig {
        max_characters: 200,
        overlap: 30,
        ..Default::default()
    };
    let config = ExtractionConfig {
        output_format: OutputFormat::Plain,
        chunking: Some(chunking),
        use_cache: true,
        enable_quality_processing: true,
        ..Default::default()
    };
    let input = "This is a paragraph. ".repeat(100); // ~2000 characters

    let result = extract_bytes(input.as_bytes(), "text/plain", &config)
        .await
        .expect("Should extract successfully");

    // The input is far longer than `max_characters`, so chunks must be present.
    let has_chunks = result.chunks.is_some();
    assert!(has_chunks, "Should produce chunks for large text");

    // The score is only checked when the `quality` feature is compiled in.
    #[cfg(feature = "quality")]
    {
        let has_score = result.quality_score.is_some();
        assert!(has_score, "Should have quality score");
    }

    let has_content = !result.content.is_empty();
    assert!(has_content, "Should have content");
}

/// Verifies that `ResultFormat::ElementBased` still yields elements when
/// `extract_pages` is explicitly set to `false` (#908).
///
/// The intent is that config normalization re-enables page extraction for the
/// element-based format. This test observes only that extraction succeeds and that
/// `elements` is populated; page numbers themselves are not inspected.
#[tokio::test]
async fn test_element_based_auto_normalizes_page_extraction() {
    // Deliberately contradictory setup: element-based output with page extraction off.
    let page_config = kreuzberg::core::config::PageConfig {
        extract_pages: false,
        ..Default::default()
    };
    let config = ExtractionConfig {
        result_format: ResultFormat::ElementBased,
        pages: Some(page_config),
        ..Default::default()
    };
    let input = "Page 1 content.\n\nPage 2 content.";

    let result = extract_bytes(input.as_bytes(), "text/plain", &config)
        .await
        .expect("Should extract successfully");

    let has_elements = result.elements.is_some();
    assert!(has_elements, "Should have elements");
}
Nomad changes 2026-06-01 23:40:55 +02:00			`//! Config behavioral verification tests`
			`//!`
			`//! These tests verify that configuration options actually affect extraction behavior,`
			`//! not just that they serialize correctly.`
			`//!`
			`//! Unlike serialization tests that only check if configs deserialize, these tests verify`
			`//! that the configuration options actually influence the extraction process and produce`
			`//! observable differences in the output.`

			`#[cfg(feature = "chunking")]`
			`use kreuzberg::core::config::ChunkingConfig;`
			`use kreuzberg::core::config::ExtractionConfig;`
			`use kreuzberg::core::config::OutputFormat;`
			`use kreuzberg::core::extractor::extract_bytes;`
			`use kreuzberg::types::ResultFormat;`

			`mod helpers;`

			`/// Test output_format Plain produces text without formatting`
			`///`
			`/// Note: HTML extractors often convert to markdown internally, so this test`
			`/// uses plain text input to verify the output_format configuration is respected.`
			`#[tokio::test]`
			`async fn test_output_format_plain_produces_plain() {`
			`let plain_text = b"Title\n\nParagraph with bold text.";`

			`let config = ExtractionConfig {`
			`output_format: OutputFormat::Plain,`
			`..Default::default()`
			`};`

			`let result = extract_bytes(plain_text, "text/plain", &config)`
			`.await`
			`.expect("Should extract successfully");`

			`// Plain text should not have markdown or HTML formatting`
			`assert!(`
			`!result.content.contains("# ") && !result.content.contains("<h1>"),`
			`"Plain format should not contain markdown headers or HTML tags, got: {}",`
			`result.content`
			`);`
			`assert!(`
			`result.content.contains("Title") \|\| result.content.contains("Paragraph"),`
			`"Should still contain extracted text content"`
			`);`
			`}`

			`/// Test output_format Markdown produces markdown formatting`
			`#[tokio::test]`
			`#[cfg(feature = "html")]`
			`async fn test_output_format_markdown_produces_markdown() {`
			`let html = b"<h1>Title</h1><p>Paragraph with <strong>bold</strong> text.</p>";`

			`let config = ExtractionConfig {`
			`output_format: OutputFormat::Markdown,`
			`..Default::default()`
			`};`

			`let result = extract_bytes(html, "text/html", &config)`
			`.await`
			`.expect("Should extract successfully");`

			`// Verify markdown formatting is present (# for headers or ** for bold)`
			`let has_markdown = result.content.contains("# ") \|\| result.content.contains("**") \|\| result.content.contains("*");`

			`assert!(`
			`has_markdown,`
			`"Markdown format should contain # headers or ** bold, got: {}",`
			`result.content`
			`);`
			`}`

			`/// Test output_format HTML produces valid HTML content`
			`#[tokio::test]`
			`async fn test_output_format_html_produces_html() {`
			`let text = "Title\n\nParagraph with bold text.";`

			`let config = ExtractionConfig {`
			`output_format: OutputFormat::Html,`
			`..Default::default()`
			`};`

			`let result = extract_bytes(text.as_bytes(), "text/plain", &config)`
			`.await`
			`.expect("Should extract successfully");`

			`// HTML format should be safe and not contain injection vectors`
			`assert!(`
			`!result.content.contains("<script>"),`
			`"HTML format should be safe from injection"`
			`);`
			`assert!(!result.content.is_empty(), "Should produce content in HTML format");`
			`}`

			`/// Test result_format Unified produces content in single field`
			`#[tokio::test]`
			`async fn test_result_format_unified_structure() {`
			`let text = "Sample content";`

			`let config = ExtractionConfig {`
			`result_format: ResultFormat::Unified,`
			`..Default::default()`
			`};`

			`let result = extract_bytes(text.as_bytes(), "text/plain", &config)`
			`.await`
			`.expect("Should extract successfully");`

			`// Unified format should have content in main content field`
			`assert!(!result.content.is_empty(), "Unified format should have content");`

			`// Elements should be None or empty for unified format`
			`assert!(`
			`result.elements.is_none() \|\| result.elements.as_ref().unwrap().is_empty(),`
			`"Unified format should not have elements"`
			`);`
			`}`

			`/// Test result_format ElementBased produces element structure`
			`#[tokio::test]`
			`async fn test_result_format_element_based_structure() {`
			`let text = "First paragraph here.\n\nSecond paragraph with more content.";`

			`let config = ExtractionConfig {`
			`result_format: ResultFormat::ElementBased,`
			`..Default::default()`
			`};`

			`let result = extract_bytes(text.as_bytes(), "text/plain", &config)`
			`.await`
			`.expect("Should extract successfully");`

			`// Element-based format should produce elements array`
			`if let Some(elements) = &result.elements {`
			`assert!(!elements.is_empty(), "Element-based format should have elements");`
			`// Verify elements have expected structure`
			`for element in elements {`
			`assert!(!element.text.is_empty(), "Elements should have non-empty text");`
			`}`
			`}`
			`}`

			`/// Test chunking max_chars actually limits chunk size`
			`#[tokio::test]`
			`#[cfg(feature = "chunking")]`
			`async fn test_chunking_max_chars_limits_chunk_size() {`
			`let long_text = "word ".repeat(500); // ~2500 characters`

			`let config = ExtractionConfig {`
			`chunking: Some(ChunkingConfig {`
			`max_characters: 100,`
			`overlap: 20,`
			`..Default::default()`
			`}),`
			`..Default::default()`
			`};`

			`let result = extract_bytes(long_text.as_bytes(), "text/plain", &config)`
			`.await`
			`.expect("Should extract successfully");`

			`assert!(result.chunks.is_some(), "Chunking should produce chunks");`

			`if let Some(chunks) = result.chunks {`
			`assert!(chunks.len() > 1, "Long text should produce multiple chunks");`

			`// Verify chunk size constraint: each chunk should respect max_chars`
			`for (i, chunk) in chunks.iter().enumerate() {`
			`assert!(`
			`chunk.content.len() <= 100 + 20,`
			`"Chunk {} exceeds max_chars + overlap: length = {}",`
			`i,`
			`chunk.content.len()`
			`);`
			`}`
			`}`
			`}`

			`/// Test chunking with overlap creates overlapping chunks`
			`#[tokio::test]`
			`#[cfg(feature = "chunking")]`
			`async fn test_chunking_overlap_creates_overlap() {`
			`let text = "First sentence. ".repeat(30); // ~480 characters`

			`let config = ExtractionConfig {`
			`chunking: Some(ChunkingConfig {`
			`max_characters: 50,`
			`overlap: 15,`
			`..Default::default()`
			`}),`
			`..Default::default()`
			`};`

			`let result = extract_bytes(text.as_bytes(), "text/plain", &config)`
			`.await`
			`.expect("Should extract successfully");`

			`if let Some(chunks) = result.chunks`
			`&& chunks.len() >= 2`
			`{`
			`// Check if adjacent chunks have overlapping text`
			`let chunk1_end = &chunks[0].content[chunks[0].content.len().saturating_sub(15)..];`
			`let chunk2_start = &chunks[1].content[..chunks[1].content.len().min(15)];`

			`// There should be some overlap in the text`
			`let overlap_found = chunk1_end.chars().any(\|c\| c != ' ') && chunk2_start.chars().any(\|c\| c != ' ');`

			`assert!(`
			`overlap_found,`
			`"Adjacent chunks should have overlapping non-whitespace text"`
			`);`
			`}`
			`}`

			`/// Test chunking disabled produces no chunks`
			`#[tokio::test]`
			`async fn test_chunking_disabled_produces_no_chunks() {`
			`let long_text = "word ".repeat(500);`

			`let config = ExtractionConfig {`
			`chunking: None,`
			`..Default::default()`
			`};`

			`let result = extract_bytes(long_text.as_bytes(), "text/plain", &config)`
			`.await`
			`.expect("Should extract successfully");`

			`assert!(result.chunks.is_none(), "Chunking disabled should produce no chunks");`
			`}`

			`/// Test use_cache true allows results to be cached`
			`#[tokio::test]`
			`async fn test_cache_enabled_allows_caching() {`
			`let text = "Test content for caching";`

			`let config = ExtractionConfig {`
			`use_cache: true,`
			`..Default::default()`
			`};`

			`// Extract twice with same content`
			`let result1 = extract_bytes(text.as_bytes(), "text/plain", &config)`
			`.await`
			`.expect("First extraction should succeed");`

			`let result2 = extract_bytes(text.as_bytes(), "text/plain", &config)`
			`.await`
			`.expect("Second extraction should succeed");`

			`// Results should be identical`
			`assert_eq!(`
			`result1.content, result2.content,`
			`"Cache enabled should produce consistent results"`
			`);`
			`}`

			`/// Test use_cache false disables caching without crashing`
			`#[tokio::test]`
			`async fn test_cache_disabled_does_not_crash() {`
			`let text = "Test content without caching";`

			`let config = ExtractionConfig {`
			`use_cache: false,`
			`..Default::default()`
			`};`

			`let result = extract_bytes(text.as_bytes(), "text/plain", &config)`
			`.await`
			`.expect("Extraction with cache disabled should succeed");`

			`assert!(!result.content.is_empty(), "Should still extract content");`
			`}`

			`/// Test quality_processing enabled produces quality score`
			`#[tokio::test]`
			`#[cfg(feature = "quality")]`
			`async fn test_quality_processing_enabled_produces_score() {`
			`let text = "This is a well-structured document. It has proper sentences. And good formatting.";`

			`let config = ExtractionConfig {`
			`enable_quality_processing: true,`
			`..Default::default()`
			`};`

			`let result = extract_bytes(text.as_bytes(), "text/plain", &config)`
			`.await`
			`.expect("Should extract successfully");`

			`// Quality processing should populate the dedicated quality_score field`
			`assert!(`
			`result.quality_score.is_some(),`
			`"Quality processing enabled should produce quality_score"`
			`);`
			`}`

			`/// Test quality_processing disabled does not produce score`
			`#[tokio::test]`
			`#[cfg(feature = "quality")]`
			`async fn test_quality_processing_disabled_no_score() {`
			`let text = "This is a document.";`

			`let config = ExtractionConfig {`
			`enable_quality_processing: false,`
			`..Default::default()`
			`};`

			`let result = extract_bytes(text.as_bytes(), "text/plain", &config)`
			`.await`
			`.expect("Should extract successfully");`

			`assert!(`
			`result.quality_score.is_none(),`
			`"Quality processing disabled should not produce quality_score"`
			`);`
			`}`

			`/// Test output_format combinations with result_format`
			`#[tokio::test]`
			`#[cfg(feature = "html")]`
			`async fn test_output_format_with_element_based() {`
			`let html = b"<p>First paragraph</p><p>Second paragraph</p>";`

			`let config = ExtractionConfig {`
			`output_format: OutputFormat::Markdown,`
			`result_format: ResultFormat::ElementBased,`
			`..Default::default()`
			`};`

			`let result = extract_bytes(html, "text/html", &config)`
			`.await`
			`.expect("Should extract successfully");`

			`// Should have elements`
			`assert!(result.elements.is_some(), "ElementBased format should produce elements");`

			`// Content should still be markdown formatted`
			`assert!(`
			`!result.content.contains("<p>"),`
			`"Output format should not contain HTML tags"`
			`);`
			`}`

			`/// Test chunking respects overlap maximum`
			`#[tokio::test]`
			`#[cfg(feature = "chunking")]`
			`async fn test_chunking_overlap_maximum() {`
			`let text = "x".repeat(200); // Simple repeated character`

			`let config = ExtractionConfig {`
			`chunking: Some(ChunkingConfig {`
			`max_characters: 60,`
			`overlap: 10,`
			`..Default::default()`
			`}),`
			`..Default::default()`
			`};`

			`let result = extract_bytes(text.as_bytes(), "text/plain", &config)`
			`.await`
			`.expect("Should extract successfully");`

			`if let Some(chunks) = result.chunks {`
			`// Verify max_overlap is not exceeded`
			`for (i, chunk) in chunks.iter().enumerate() {`
			`assert!(`
			`chunk.content.len() <= 60 + 10,`
			`"Chunk {} size {} exceeds max_chars (60) + max_overlap (10)",`
			`i,`
			`chunk.content.len()`
			`);`
			`}`
			`}`
			`}`

			`/// Test large document extraction with multiple config options`
			`#[tokio::test]`
			`#[cfg(feature = "chunking")]`
			`async fn test_large_document_with_combined_config() {`
			`let large_text = "This is a paragraph. ".repeat(100); // ~2000 characters`

			`let config = ExtractionConfig {`
			`output_format: OutputFormat::Plain,`
			`chunking: Some(ChunkingConfig {`
			`max_characters: 200,`
			`overlap: 30,`
			`..Default::default()`
			`}),`
			`use_cache: true,`
			`enable_quality_processing: true,`
			`..Default::default()`
			`};`

			`let result = extract_bytes(large_text.as_bytes(), "text/plain", &config)`
			`.await`
			`.expect("Should extract successfully");`

			`// Should have chunks due to size`
			`assert!(result.chunks.is_some(), "Should produce chunks for large text");`

			`// Should have quality score`
			`#[cfg(feature = "quality")]`
			`{`
			`assert!(result.quality_score.is_some(), "Should have quality score");`
			`}`

			`// Should have content in plain format`
			`assert!(!result.content.is_empty(), "Should have content");`
			`}`

			`/// Test that result_format ElementBased auto-enables page extraction via normalization (#908)`
			`#[tokio::test]`
			`async fn test_element_based_auto_normalizes_page_extraction() {`
			`let text = "Page 1 content.\n\nPage 2 content.";`

			`// Configure ElementBased but explicitly set extract_pages to false.`
			`// The normalization logic should override this to true because ElementBased needs page numbers.`
			`let config = ExtractionConfig {`
			`result_format: ResultFormat::ElementBased,`
			`pages: Some(kreuzberg::core::config::PageConfig {`
			`extract_pages: false,`
			`..Default::default()`
			`}),`
			`..Default::default()`
			`};`

			`let result = extract_bytes(text.as_bytes(), "text/plain", &config)`
			`.await`
			`.expect("Should extract successfully");`

			`// If normalization worked, elements should have page numbers correctly assigned (not just all 1).`
			`// Note: For plain text, page numbers are often 1 unless explicit markers are found,`
			`// but the key is that the extraction SUCCEEDS and the config was normalized.`
			`assert!(result.elements.is_some(), "Should have elements");`
			`}`