Files
fil/crates/kreuzberg/tests/layout_for_markdown.rs

205 lines
7.9 KiB
Rust
Raw Permalink Normal View History

2026-06-01 23:40:55 +02:00
//! Integration tests for the `use_layout_for_markdown` flag.
//!
//! These tests verify that:
//! 1. `use_layout_for_markdown = true` feeds layout regions into the non-OCR
//! markdown pipeline, producing richer structural output compared to the
//! baseline (font-clustering only).
//! 2. `use_layout_for_markdown = false` (default) leaves the pipeline unchanged
//! and produces the same output as a config without the field.
//!
//! Tests are feature-gated on `pdf` and `layout-detection`. Those that need the
//! layout model files (ORT inference) are marked `#[ignore]` because the model
//! weights are not pre-downloaded on CI.
#![cfg(all(feature = "pdf", feature = "layout-detection"))]
mod helpers;
use helpers::{get_test_file_path, test_documents_available};
use kreuzberg::core::config::{ExtractionConfig, OutputFormat, layout::LayoutDetectionConfig};
use kreuzberg::extract_file_sync;
/// Runs a synchronous extraction on a fixture and returns its `content`.
///
/// `relative_path` is resolved through `get_test_file_path` (i.e. relative to
/// `test_documents/`); `config` is passed to `extract_file_sync` unchanged.
///
/// # Panics
///
/// Panics with "extraction should succeed" if `extract_file_sync` returns an
/// error.
fn extract_md(relative_path: &str, config: &ExtractionConfig) -> String {
    let document = get_test_file_path(relative_path);
    let result = extract_file_sync(&document, None, config).expect("extraction should succeed");
    result.content
}
/// Config: output_format=Markdown, no layout at all (pure baseline).
fn baseline_config() -> ExtractionConfig {
ExtractionConfig {
output_format: OutputFormat::Markdown,
..Default::default()
}
}
/// Config: layout=Some(default), use_layout_for_markdown=false.
/// Layout model is loaded but NOT injected into the native path.
fn layout_config_not_injected() -> ExtractionConfig {
ExtractionConfig {
output_format: OutputFormat::Markdown,
layout: Some(LayoutDetectionConfig::default()),
use_layout_for_markdown: false,
..Default::default()
}
}
/// Config: layout=Some(default), use_layout_for_markdown=true.
/// Layout regions ARE injected into the native markdown pipeline.
fn layout_for_markdown_config() -> ExtractionConfig {
ExtractionConfig {
output_format: OutputFormat::Markdown,
layout: Some(LayoutDetectionConfig::default()),
use_layout_for_markdown: true,
..Default::default()
}
}
// ── Default: no behavior change ─────────────────────────────────────────────
/// Regression guard for the default setting: a config that carries a layout
/// section but keeps `use_layout_for_markdown = false` must yield exactly the
/// same markdown as the baseline config without any layout section.
///
/// Returns early (passing) when the test documents are not available.
#[test]
fn test_use_layout_for_markdown_false_matches_baseline() {
    if !test_documents_available() {
        return;
    }

    const PDF: &str = "pdf/google_doc_document.pdf";

    let without_layout = extract_md(PDF, &baseline_config());
    let with_layout_disabled = extract_md(PDF, &layout_config_not_injected());

    assert_eq!(
        without_layout, with_layout_disabled,
        "use_layout_for_markdown=false must not change extraction output compared to no-layout config"
    );
}
// ── Layout injection: structural improvement ─────────────────────────────────
/// With `use_layout_for_markdown = true` and a PDF that has headings, the
/// markdown output must contain at least one line beginning with `#`
/// (an ATX heading marker).
///
/// The test uses `google_doc_document.pdf`, which is a structured Google Docs
/// export with clear title and section headings detectable by the RT-DETR model.
///
/// This test requires the layout model to be available (ORT + model files).
/// It is marked `#[ignore]` on CI where model weights are not pre-downloaded.
/// It also returns early (passing) when the test documents are not available.
#[test]
#[ignore = "requires layout model files (ORT inference)"]
fn test_use_layout_for_markdown_produces_headings() {
    if !test_documents_available() {
        return;
    }

    let pdf = "pdf/google_doc_document.pdf";
    let output = extract_md(pdf, &layout_for_markdown_config());

    let has_heading = output.lines().any(|line| line.starts_with('#'));

    assert!(
        has_heading,
        "use_layout_for_markdown=true should produce at least one ATX heading line; got:\n{}",
        // Build the preview from whole characters. Slicing by a fixed byte
        // offset (`&output[..500]`) panics when that offset lands inside a
        // multi-byte UTF-8 sequence, which would replace this diagnostic with
        // an unrelated "not a char boundary" panic.
        output.chars().take(500).collect::<String>()
    );
}
/// **Strict regression guard** — the layout-enabled run must yield strictly
/// more ATX headings than the baseline (font-clustering only) run.
///
/// A presence-only check (see `test_use_layout_for_markdown_produces_headings`)
/// still passes when the layout path is broken, because font-clustering finds
/// some headings on its own. Comparing the heading *count* against the
/// baseline is what reveals whether layout hints actually changed
/// classification — e.g. the failure mode where RT-DETR runs but its
/// detections never reach `apply_layout_overrides`, leaving the layout
/// pipeline a 70× slower no-op.
///
/// Returns early (passing) when the test documents are not available.
#[test]
#[ignore = "requires layout model files (ORT inference)"]
fn test_use_layout_for_markdown_adds_headings_vs_baseline() {
    /// Counts lines that, after stripping leading whitespace, start with a run
    /// of one to six `#` characters immediately followed by a space.
    fn count_atx_headings(content: &str) -> usize {
        content
            .lines()
            .map(str::trim_start)
            .filter(|line| {
                // `#` is a single ASCII byte, so the run length is also a
                // valid byte offset into `line`.
                let level = line.bytes().take_while(|&b| b == b'#').count();
                (1..=6).contains(&level) && line[level..].starts_with(' ')
            })
            .count()
    }

    if !test_documents_available() {
        return;
    }

    let pdf = "pdf/google_doc_document.pdf";
    let baseline_md = extract_md(pdf, &baseline_config());
    let layout_md = extract_md(pdf, &layout_for_markdown_config());

    let baseline_count = count_atx_headings(&baseline_md);
    let layout_count = count_atx_headings(&layout_md);

    assert!(
        layout_count > baseline_count,
        "use_layout_for_markdown=true must add at least one heading vs baseline.\n\
         baseline_headings = {}, layout_headings = {}\n\
         If these are equal, layout detections are not flowing into \
         apply_layout_overrides. Check pdf/mod.rs:169 (`layout_hints` should \
         not be hardcoded `None`) and the pixelPDF coord-space conversion in \
         extractors/pdf/layout_hints.rs.",
        baseline_count,
        layout_count
    );
}
/// Setting `use_layout_for_markdown = true` while `layout` is `None` must be a
/// silent no-op: the markdown has to match the baseline output exactly.
///
/// Returns early (passing) when the test documents are not available.
#[test]
fn test_use_layout_for_markdown_without_layout_config_is_noop() {
    if !test_documents_available() {
        return;
    }

    const PDF: &str = "pdf/google_doc_document.pdf";

    // Flag is on, but there is no layout section for it to act on.
    let flag_without_layout = ExtractionConfig {
        output_format: OutputFormat::Markdown,
        layout: None,
        use_layout_for_markdown: true,
        ..ExtractionConfig::default()
    };

    let expected = extract_md(PDF, &baseline_config());
    let actual = extract_md(PDF, &flag_without_layout);

    assert_eq!(
        expected, actual,
        "use_layout_for_markdown=true with layout=None must produce the same output as baseline"
    );
}
/// Smoke test for combining `force_ocr = true` with the layout-for-markdown
/// flag.
///
/// This test does not run an extraction: it only builds the configuration and
/// checks that the three relevant fields hold the values they were given.
/// The intended behaviour — the flag being a no-op when the whole document is
/// OCR'd — is, per the original author's note, covered by the
/// `maybe_run_layout_for_markdown` guard rather than exercised here.
#[test]
fn test_use_layout_for_markdown_skipped_when_force_ocr() {
    // Running OCR needs a registered backend, which these tests do not set
    // up, so only the config combination itself is checked.
    let ocr_with_layout = ExtractionConfig {
        output_format: OutputFormat::Markdown,
        layout: Some(LayoutDetectionConfig::default()),
        use_layout_for_markdown: true,
        force_ocr: true,
        ..ExtractionConfig::default()
    };

    // The struct must build and keep every explicitly set field.
    assert!(ocr_with_layout.use_layout_for_markdown);
    assert!(ocr_with_layout.force_ocr);
    assert!(ocr_with_layout.layout.is_some());
}