205 lines
7.9 KiB
Rust
205 lines
7.9 KiB
Rust
//! Integration tests for the `use_layout_for_markdown` flag.
|
||
//!
|
||
//! These tests verify that:
|
||
//! 1. `use_layout_for_markdown = true` feeds layout regions into the non-OCR
|
||
//! markdown pipeline, producing richer structural output compared to the
|
||
//! baseline (font-clustering only).
|
||
//! 2. `use_layout_for_markdown = false` (default) leaves the pipeline unchanged
|
||
//! and produces the same output as a config without the field.
|
||
//!
|
||
//! The whole file is feature-gated on `pdf` and `layout-detection`. Tests that
//! need the layout model files are additionally marked `#[ignore]`, because
//! those files are not available on CI.
|
||
|
||
#![cfg(all(feature = "pdf", feature = "layout-detection"))]
|
||
|
||
mod helpers;
|
||
|
||
use helpers::{get_test_file_path, test_documents_available};
|
||
use kreuzberg::core::config::{ExtractionConfig, OutputFormat, layout::LayoutDetectionConfig};
|
||
use kreuzberg::extract_file_sync;
|
||
|
||
/// Extract `relative_path` (from `test_documents/`) with the given config.
|
||
fn extract_md(relative_path: &str, config: &ExtractionConfig) -> String {
|
||
let path = get_test_file_path(relative_path);
|
||
extract_file_sync(&path, None, config)
|
||
.expect("extraction should succeed")
|
||
.content
|
||
}
|
||
|
||
/// Config: output_format=Markdown, no layout at all (pure baseline).
|
||
fn baseline_config() -> ExtractionConfig {
|
||
ExtractionConfig {
|
||
output_format: OutputFormat::Markdown,
|
||
..Default::default()
|
||
}
|
||
}
|
||
|
||
/// Config: layout=Some(default), use_layout_for_markdown=false.
|
||
/// Layout model is loaded but NOT injected into the native path.
|
||
fn layout_config_not_injected() -> ExtractionConfig {
|
||
ExtractionConfig {
|
||
output_format: OutputFormat::Markdown,
|
||
layout: Some(LayoutDetectionConfig::default()),
|
||
use_layout_for_markdown: false,
|
||
..Default::default()
|
||
}
|
||
}
|
||
|
||
/// Config: layout=Some(default), use_layout_for_markdown=true.
|
||
/// Layout regions ARE injected into the native markdown pipeline.
|
||
fn layout_for_markdown_config() -> ExtractionConfig {
|
||
ExtractionConfig {
|
||
output_format: OutputFormat::Markdown,
|
||
layout: Some(LayoutDetectionConfig::default()),
|
||
use_layout_for_markdown: true,
|
||
..Default::default()
|
||
}
|
||
}
|
||
|
||
// ── Default: no behavior change ─────────────────────────────────────────────
|
||
|
||
/// Regression guard for the flag's off state.
///
/// Extracts the same PDF twice: once with the plain baseline config and once
/// with a layout config present but `use_layout_for_markdown = false`. The two
/// markdown strings must be exactly equal, i.e. merely configuring layout
/// without enabling the flag must not alter the output.
///
/// The test returns early without asserting anything when the test documents
/// are not available.
#[test]
fn test_use_layout_for_markdown_false_matches_baseline() {
    if test_documents_available() {
        let fixture = "pdf/google_doc_document.pdf";

        let without_layout = extract_md(fixture, &baseline_config());
        let with_flag_off = extract_md(fixture, &layout_config_not_injected());

        assert_eq!(
            without_layout, with_flag_off,
            "use_layout_for_markdown=false must not change extraction output compared to no-layout config"
        );
    }
}
|
||
|
||
// ── Layout injection: structural improvement ─────────────────────────────────
|
||
|
||
/// With `use_layout_for_markdown = true` and a PDF that has headings, the
/// markdown output must contain at least one line that starts with `#`
/// (an ATX heading line such as `# ...`).
///
/// The test uses `google_doc_document.pdf`, which is a structured Google Docs
/// export with clear title and section headings detectable by the RT-DETR model.
///
/// This test requires the layout model to be available (ORT + model files).
/// It is marked `#[ignore]` because model weights are not pre-downloaded on CI.
/// It also returns early without asserting when the test documents are missing.
#[test]
#[ignore = "requires layout model files (ORT inference)"]
fn test_use_layout_for_markdown_produces_headings() {
    if !test_documents_available() {
        return;
    }

    let pdf = "pdf/google_doc_document.pdf";
    let output = extract_md(pdf, &layout_for_markdown_config());

    let has_heading = output.lines().any(|line| line.starts_with('#'));
    assert!(
        has_heading,
        "use_layout_for_markdown=true should produce at least one ATX heading line; got:\n{}",
        // Truncate the preview by characters rather than bytes: slicing the
        // string at a fixed byte offset panics when that offset falls inside a
        // multi-byte UTF-8 character, which would mask this failure message.
        output.chars().take(500).collect::<String>()
    );
}
|
||
|
||
/// **Strict regression guard**: the layout-enabled run must yield strictly
/// more ATX headings than the baseline run on the same PDF.
///
/// A presence-only check (see `test_use_layout_for_markdown_produces_headings`)
/// can pass even when layout detections never reach `apply_layout_overrides`,
/// because the baseline pipeline can find some headings on its own. Comparing
/// heading counts against the baseline is what reveals whether layout hints
/// actually changed the classification.
///
/// Requires the layout model files, hence `#[ignore]`. Returns early without
/// asserting when the test documents are not available.
#[test]
#[ignore = "requires layout model files (ORT inference)"]
fn test_use_layout_for_markdown_adds_headings_vs_baseline() {
    /// Counts lines that, after leading whitespace is stripped, begin with
    /// one to six `#` characters immediately followed by a space.
    fn atx_heading_count(markdown: &str) -> usize {
        let mut total = 0;
        for line in markdown.lines() {
            let body = line.trim_start();
            let hashes = body.bytes().take_while(|&b| b == b'#').count();
            let followed_by_space = body.as_bytes().get(hashes) == Some(&b' ');
            if (1..=6).contains(&hashes) && followed_by_space {
                total += 1;
            }
        }
        total
    }

    if !test_documents_available() {
        return;
    }

    let fixture = "pdf/google_doc_document.pdf";
    let baseline_count = atx_heading_count(&extract_md(fixture, &baseline_config()));
    let layout_count = atx_heading_count(&extract_md(fixture, &layout_for_markdown_config()));

    assert!(
        layout_count > baseline_count,
        "use_layout_for_markdown=true must add at least one heading vs baseline.\n\
         baseline_headings = {}, layout_headings = {}\n\
         If these are equal, layout detections are not flowing into \
         apply_layout_overrides. Check pdf/mod.rs:169 (`layout_hints` should \
         not be hardcoded `None`) and the pixel→PDF coord-space conversion in \
         extractors/pdf/layout_hints.rs.",
        baseline_count,
        layout_count
    );
}
|
||
|
||
/// Enabling `use_layout_for_markdown` while `layout` is `None` must be a
/// silent no-op: the extracted markdown has to equal the baseline output.
///
/// Returns early without asserting when the test documents are not available.
#[test]
fn test_use_layout_for_markdown_without_layout_config_is_noop() {
    if test_documents_available() {
        let fixture = "pdf/google_doc_document.pdf";
        let expected = extract_md(fixture, &baseline_config());

        // Flag is on, but there is no layout configuration to act on.
        let flag_without_layout = ExtractionConfig {
            output_format: OutputFormat::Markdown,
            layout: None,
            use_layout_for_markdown: true,
            ..ExtractionConfig::default()
        };
        let actual = extract_md(fixture, &flag_without_layout);

        assert_eq!(
            expected, actual,
            "use_layout_for_markdown=true with layout=None must produce the same output as baseline"
        );
    }
}
|
||
|
||
/// Smoke check that `force_ocr = true` can be combined with
/// `use_layout_for_markdown = true` and a layout config in one
/// `ExtractionConfig`.
///
/// NOTE(review): this test only builds the config and asserts that the fields
/// hold the values just assigned. It performs no extraction, so it does not by
/// itself demonstrate that the layout-for-markdown path is skipped under
/// forced OCR. The original author's note says OCR needs a registered backend
/// and that the real gate is covered via the `maybe_run_layout_for_markdown`
/// guard, which is not visible in this file — confirm that coverage exists.
#[test]
fn test_use_layout_for_markdown_skipped_when_force_ocr() {
    let layout = Some(LayoutDetectionConfig::default());
    let combined = ExtractionConfig {
        output_format: OutputFormat::Markdown,
        layout,
        use_layout_for_markdown: true,
        force_ocr: true,
        ..ExtractionConfig::default()
    };

    // Constructing the combination must work and keep each value as set.
    assert!(combined.use_layout_for_markdown);
    assert!(combined.force_ocr);
    assert!(combined.layout.is_some());
}
|