Files
fil/crates/kreuzberg/tests/layout_for_markdown.rs
Henrik Jess Nielsen b4c07d3693
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s
Nomad changes
2026-06-01 23:40:55 +02:00

205 lines
7.9 KiB
Rust
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
//! Integration tests for the `use_layout_for_markdown` flag.
//!
//! These tests verify that:
//! 1. `use_layout_for_markdown = true` feeds layout regions into the non-OCR
//! markdown pipeline, producing richer structural output compared to the
//! baseline (font-clustering only).
//! 2. `use_layout_for_markdown = false` (default) leaves the pipeline unchanged
//! and produces the same output as a config without the field.
//!
//! Tests are feature-gated on `pdf` and `layout-detection`. Tests that run
//! layout-model inference are additionally marked `#[ignore]`, because the
//! layout engine model files are not available on CI.
#![cfg(all(feature = "pdf", feature = "layout-detection"))]
mod helpers;
use helpers::{get_test_file_path, test_documents_available};
use kreuzberg::core::config::{ExtractionConfig, OutputFormat, layout::LayoutDetectionConfig};
use kreuzberg::extract_file_sync;
/// Run extraction on the test document at `relative_path` (resolved under
/// `test_documents/` by `get_test_file_path`) using `config`, and return the
/// extracted content.
///
/// # Panics
///
/// Panics with "extraction should succeed" if `extract_file_sync` returns an error.
fn extract_md(relative_path: &str, config: &ExtractionConfig) -> String {
    let document = get_test_file_path(relative_path);
    let extraction = extract_file_sync(&document, None, config).expect("extraction should succeed");
    extraction.content
}
/// Config: output_format=Markdown, no layout at all (pure baseline).
fn baseline_config() -> ExtractionConfig {
ExtractionConfig {
output_format: OutputFormat::Markdown,
..Default::default()
}
}
/// Config: layout=Some(default), use_layout_for_markdown=false.
/// Layout model is loaded but NOT injected into the native path.
fn layout_config_not_injected() -> ExtractionConfig {
ExtractionConfig {
output_format: OutputFormat::Markdown,
layout: Some(LayoutDetectionConfig::default()),
use_layout_for_markdown: false,
..Default::default()
}
}
/// Config: layout=Some(default), use_layout_for_markdown=true.
/// Layout regions ARE injected into the native markdown pipeline.
fn layout_for_markdown_config() -> ExtractionConfig {
ExtractionConfig {
output_format: OutputFormat::Markdown,
layout: Some(LayoutDetectionConfig::default()),
use_layout_for_markdown: true,
..Default::default()
}
}
// ── Default: no behavior change ─────────────────────────────────────────────
/// Regression guard for the default setting: when `use_layout_for_markdown`
/// is `false`, adding a layout config must not alter the markdown output
/// relative to the no-layout baseline.
///
/// Returns early (passing) when the test documents are not available.
#[test]
fn test_use_layout_for_markdown_false_matches_baseline() {
    const PDF: &str = "pdf/google_doc_document.pdf";

    if !test_documents_available() {
        return;
    }

    // Extract the same document under both configs, baseline first.
    let [baseline, layout_not_injected] =
        [baseline_config(), layout_config_not_injected()].map(|config| extract_md(PDF, &config));

    assert_eq!(
        baseline, layout_not_injected,
        "use_layout_for_markdown=false must not change extraction output compared to no-layout config"
    );
}
// ── Layout injection: structural improvement ─────────────────────────────────
/// With `use_layout_for_markdown = true` and a PDF that has headings, the
/// markdown output must contain at least one ATX heading line (`# ...`).
///
/// The test uses `google_doc_document.pdf`, which is a structured Google Docs
/// export with clear title and section headings detectable by the RT-DETR model.
///
/// This test requires the layout model to be available (ORT + model files).
/// It is marked `#[ignore]` because model weights are not pre-downloaded on CI.
///
/// Returns early (passing) when the test documents are not available.
#[test]
#[ignore = "requires layout model files (ORT inference)"]
fn test_use_layout_for_markdown_produces_headings() {
    if !test_documents_available() {
        return;
    }
    let pdf = "pdf/google_doc_document.pdf";
    let output = extract_md(pdf, &layout_for_markdown_config());
    let has_heading = output.lines().any(|line| line.starts_with('#'));
    assert!(
        has_heading,
        "use_layout_for_markdown=true should produce at least one ATX heading line; got:\n{}",
        // Build the preview from characters rather than a byte-range slice:
        // `&output[..500]` panics when byte 500 falls inside a multi-byte
        // UTF-8 character, which would hide this assertion message behind an
        // unrelated slicing panic. The preview is now at most 500 characters.
        output.chars().take(500).collect::<String>()
    );
}
/// **Strict regression guard**: with `use_layout_for_markdown=true` the output
/// must contain strictly more ATX headings than the baseline (font-clustering
/// only).
///
/// This catches the failure mode where RT-DETR runs but its detections never
/// reach `apply_layout_overrides`, which turns the layout pipeline into a much
/// slower no-op whose output matches the baseline. A presence-only check (see
/// `test_use_layout_for_markdown_produces_headings`) still passes in that
/// situation because font-clustering finds some headings by itself; only an
/// *inequality* against the baseline shows that layout hints changed the
/// classification.
///
/// Returns early (passing) when the test documents are not available.
#[test]
#[ignore = "requires layout model files (ORT inference)"]
fn test_use_layout_for_markdown_adds_headings_vs_baseline() {
    /// Count the lines that, after leading whitespace, start with a run of
    /// one to six `#` characters immediately followed by a space.
    fn count_atx_headings(content: &str) -> usize {
        content
            .lines()
            .filter(|line| {
                let candidate = line.trim_start();
                let after_hashes = candidate.trim_start_matches('#');
                // Length of the leading `#` run, i.e. the heading level.
                let level = candidate.len() - after_hashes.len();
                (1..=6).contains(&level) && after_hashes.starts_with(' ')
            })
            .count()
    }

    if !test_documents_available() {
        return;
    }

    let pdf = "pdf/google_doc_document.pdf";
    let baseline_md = extract_md(pdf, &baseline_config());
    let layout_md = extract_md(pdf, &layout_for_markdown_config());

    let headings_before = count_atx_headings(&baseline_md);
    let headings_after = count_atx_headings(&layout_md);

    assert!(
        headings_after > headings_before,
        "use_layout_for_markdown=true must add at least one heading vs baseline.\n\
         baseline_headings = {}, layout_headings = {}\n\
         If these are equal, layout detections are not flowing into \
         apply_layout_overrides. Check pdf/mod.rs:169 (`layout_hints` should \
         not be hardcoded `None`) and the pixel→PDF coord-space conversion in \
         extractors/pdf/layout_hints.rs.",
        headings_before,
        headings_after
    );
}
/// Setting `use_layout_for_markdown = true` while `layout` is `None` must be a
/// silent no-op: the output has to equal the baseline output.
///
/// Returns early (passing) when the test documents are not available.
#[test]
fn test_use_layout_for_markdown_without_layout_config_is_noop() {
    if !test_documents_available() {
        return;
    }

    // Flag enabled, but no layout config to act on, so the runner must skip silently.
    let flag_without_layout = ExtractionConfig {
        use_layout_for_markdown: true,
        layout: None,
        output_format: OutputFormat::Markdown,
        ..ExtractionConfig::default()
    };

    let pdf = "pdf/google_doc_document.pdf";
    let expected = extract_md(pdf, &baseline_config());
    let actual = extract_md(pdf, &flag_without_layout);

    assert_eq!(
        expected, actual,
        "use_layout_for_markdown=true with layout=None must produce the same output as baseline"
    );
}
/// Smoke check for combining `force_ocr = true` with `use_layout_for_markdown = true`.
///
/// The intent is that the layout-for-markdown field is a no-op when the whole
/// document is OCR'd. This test does not run an extraction: it only builds the
/// config and checks that the field values are stored as given.
#[test]
fn test_use_layout_for_markdown_skipped_when_force_ocr() {
    // Running OCR here is not practical without a registered backend, so this
    // only confirms that the config combination can be constructed without
    // panicking. The actual gate is tested via the
    // `maybe_run_layout_for_markdown` guard.
    let config = ExtractionConfig {
        force_ocr: true,
        use_layout_for_markdown: true,
        layout: Some(LayoutDetectionConfig::default()),
        output_format: OutputFormat::Markdown,
        ..ExtractionConfig::default()
    };

    // Each field must read back exactly as it was set.
    assert!(config.use_layout_for_markdown);
    assert!(config.force_ocr);
    assert!(config.layout.is_some());
}