//! Integration tests for the `use_layout_for_markdown` flag. //! //! These tests verify that: //! 1. `use_layout_for_markdown = true` feeds layout regions into the non-OCR //! markdown pipeline, producing richer structural output compared to the //! baseline (font-clustering only). //! 2. `use_layout_for_markdown = false` (default) leaves the pipeline unchanged //! and produces the same output as a config without the field. //! //! Tests are feature-gated on `pdf` and `layout-detection` and are marked //! `#[ignore]` when the layout engine model files are not available on CI. #![cfg(all(feature = "pdf", feature = "layout-detection"))] mod helpers; use helpers::{get_test_file_path, test_documents_available}; use kreuzberg::core::config::{ExtractionConfig, OutputFormat, layout::LayoutDetectionConfig}; use kreuzberg::extract_file_sync; /// Extract `relative_path` (from `test_documents/`) with the given config. fn extract_md(relative_path: &str, config: &ExtractionConfig) -> String { let path = get_test_file_path(relative_path); extract_file_sync(&path, None, config) .expect("extraction should succeed") .content } /// Config: output_format=Markdown, no layout at all (pure baseline). fn baseline_config() -> ExtractionConfig { ExtractionConfig { output_format: OutputFormat::Markdown, ..Default::default() } } /// Config: layout=Some(default), use_layout_for_markdown=false. /// Layout model is loaded but NOT injected into the native path. fn layout_config_not_injected() -> ExtractionConfig { ExtractionConfig { output_format: OutputFormat::Markdown, layout: Some(LayoutDetectionConfig::default()), use_layout_for_markdown: false, ..Default::default() } } /// Config: layout=Some(default), use_layout_for_markdown=true. /// Layout regions ARE injected into the native markdown pipeline. fn layout_for_markdown_config() -> ExtractionConfig { ExtractionConfig { output_format: OutputFormat::Markdown, layout: Some(LayoutDetectionConfig::default()), use_layout_for_markdown: true, ..Default::default() } } // ── Default: no behavior change ───────────────────────────────────────────── /// With `use_layout_for_markdown = false` (the default), the pipeline must /// produce output that is indistinguishable from the baseline (no layout). /// This guards against accidental regressions introduced by the new field. #[test] fn test_use_layout_for_markdown_false_matches_baseline() { if !test_documents_available() { return; } let pdf = "pdf/google_doc_document.pdf"; let baseline = extract_md(pdf, &baseline_config()); let layout_not_injected = extract_md(pdf, &layout_config_not_injected()); assert_eq!( baseline, layout_not_injected, "use_layout_for_markdown=false must not change extraction output compared to no-layout config" ); } // ── Layout injection: structural improvement ───────────────────────────────── /// With `use_layout_for_markdown = true` and a PDF that has headings, the /// markdown output must contain at least one ATX heading line (`# ...`). /// /// The test uses `google_doc_document.pdf`, which is a structured Google Docs /// export with clear title and section headings detectable by the RT-DETR model. /// /// This test requires the layout model to be available (ORT + model files). /// It is marked `#[ignore]` on CI where model weights are not pre-downloaded. #[test] #[ignore = "requires layout model files (ORT inference)"] fn test_use_layout_for_markdown_produces_headings() { if !test_documents_available() { return; } let pdf = "pdf/google_doc_document.pdf"; let output = extract_md(pdf, &layout_for_markdown_config()); let has_heading = output.lines().any(|line| line.starts_with('#')); assert!( has_heading, "use_layout_for_markdown=true should produce at least one ATX heading line; got:\n{}", &output[..output.len().min(500)] ); } /// **Strict regression guard** — `use_layout_for_markdown=true` must produce /// strictly more ATX headings than the baseline (font-clustering only). /// /// This is the test that catches the catastrophic bug where RT-DETR runs but /// its detections never reach `apply_layout_overrides`, making the layout /// pipeline a 70× slower no-op (identical SF1 to baseline). Presence-only /// tests (see `test_use_layout_for_markdown_produces_headings`) pass even /// when the layout path is broken, because font-clustering finds some /// headings on its own. Only an *inequality* against the baseline reveals /// whether layout hints actually changed classification. #[test] #[ignore = "requires layout model files (ORT inference)"] fn test_use_layout_for_markdown_adds_headings_vs_baseline() { if !test_documents_available() { return; } let pdf = "pdf/google_doc_document.pdf"; let baseline = extract_md(pdf, &baseline_config()); let layout = extract_md(pdf, &layout_for_markdown_config()); fn count_atx_headings(content: &str) -> usize { content .lines() .filter(|line| { let trimmed = line.trim_start(); trimmed.starts_with("# ") || trimmed.starts_with("## ") || trimmed.starts_with("### ") || trimmed.starts_with("#### ") || trimmed.starts_with("##### ") || trimmed.starts_with("###### ") }) .count() } let baseline_h = count_atx_headings(&baseline); let layout_h = count_atx_headings(&layout); assert!( layout_h > baseline_h, "use_layout_for_markdown=true must add at least one heading vs baseline.\n\ baseline_headings = {}, layout_headings = {}\n\ If these are equal, layout detections are not flowing into \ apply_layout_overrides. Check pdf/mod.rs:169 (`layout_hints` should \ not be hardcoded `None`) and the pixel→PDF coord-space conversion in \ extractors/pdf/layout_hints.rs.", baseline_h, layout_h ); } /// Verify that `use_layout_for_markdown = true` with `layout = None` silently /// produces the same output as the baseline (no-op when layout config is absent). #[test] fn test_use_layout_for_markdown_without_layout_config_is_noop() { if !test_documents_available() { return; } let pdf = "pdf/google_doc_document.pdf"; let baseline = extract_md(pdf, &baseline_config()); // use_layout_for_markdown=true but layout=None → runner must skip silently. let noop_config = ExtractionConfig { output_format: OutputFormat::Markdown, layout: None, use_layout_for_markdown: true, ..Default::default() }; let noop_output = extract_md(pdf, &noop_config); assert_eq!( baseline, noop_output, "use_layout_for_markdown=true with layout=None must produce the same output as baseline" ); } /// Verify that `force_ocr=true` bypasses the layout-for-markdown path. /// The field must be a no-op when the entire document is OCR'd. #[test] fn test_use_layout_for_markdown_skipped_when_force_ocr() { // We can't easily run OCR in unit tests without a backend registered, // but we CAN verify the config combination doesn't panic or error. // The actual gate is tested via the `maybe_run_layout_for_markdown` guard. let config = ExtractionConfig { output_format: OutputFormat::Markdown, layout: Some(LayoutDetectionConfig::default()), use_layout_for_markdown: true, force_ocr: true, ..Default::default() }; // Config construction must succeed and the field values must be set correctly. assert!(config.use_layout_for_markdown); assert!(config.force_ocr); assert!(config.layout.is_some()); }