//! Config behavioral verification tests //! //! These tests verify that configuration options actually affect extraction behavior, //! not just that they serialize correctly. //! //! Unlike serialization tests that only check if configs deserialize, these tests verify //! that the configuration options actually influence the extraction process and produce //! observable differences in the output. #[cfg(feature = "chunking")] use kreuzberg::core::config::ChunkingConfig; use kreuzberg::core::config::ExtractionConfig; use kreuzberg::core::config::OutputFormat; use kreuzberg::core::extractor::extract_bytes; use kreuzberg::types::ResultFormat; mod helpers; /// Test output_format Plain produces text without formatting /// /// Note: HTML extractors often convert to markdown internally, so this test /// uses plain text input to verify the output_format configuration is respected. #[tokio::test] async fn test_output_format_plain_produces_plain() { let plain_text = b"Title\n\nParagraph with bold text."; let config = ExtractionConfig { output_format: OutputFormat::Plain, ..Default::default() }; let result = extract_bytes(plain_text, "text/plain", &config) .await .expect("Should extract successfully"); // Plain text should not have markdown or HTML formatting assert!( !result.content.contains("# ") && !result.content.contains("

"), "Plain format should not contain markdown headers or HTML tags, got: {}", result.content ); assert!( result.content.contains("Title") || result.content.contains("Paragraph"), "Should still contain extracted text content" ); } /// Test output_format Markdown produces markdown formatting #[tokio::test] #[cfg(feature = "html")] async fn test_output_format_markdown_produces_markdown() { let html = b"

Title

Paragraph with bold text.

"; let config = ExtractionConfig { output_format: OutputFormat::Markdown, ..Default::default() }; let result = extract_bytes(html, "text/html", &config) .await .expect("Should extract successfully"); // Verify markdown formatting is present (# for headers or ** for bold) let has_markdown = result.content.contains("# ") || result.content.contains("**") || result.content.contains("*"); assert!( has_markdown, "Markdown format should contain # headers or ** bold, got: {}", result.content ); } /// Test output_format HTML produces valid HTML content #[tokio::test] async fn test_output_format_html_produces_html() { let text = "Title\n\nParagraph with bold text."; let config = ExtractionConfig { output_format: OutputFormat::Html, ..Default::default() }; let result = extract_bytes(text.as_bytes(), "text/plain", &config) .await .expect("Should extract successfully"); // HTML format should be safe and not contain injection vectors assert!( !result.content.contains("