Files
fil/crates/kreuzberg/tests/pdf_markdown_quality.rs

158 lines
4.5 KiB
Rust
Raw Permalink Normal View History

2026-06-01 23:40:55 +02:00
//! PDF markdown quality smoke tests: verify extraction produces structural elements.
//!
//! These are lightweight assertions — detailed quality scoring and A/B comparisons
//! live in `tools/benchmark-harness` (subcommands: `compare`, `pipeline-benchmark`).
//!
//! Usage:
//! cargo test -p kreuzberg --features "pdf" \
//! --test pdf_markdown_quality -- --nocapture
#![cfg(feature = "pdf")]
mod helpers;
use helpers::*;
use kreuzberg::core::config::{ExtractionConfig, OutputFormat};
use kreuzberg::extract_file_sync;
/// Documents with markdown ground truth, as `(name, relative path)` pairs.
///
/// The name labels skip notices and assertion messages; the relative path is
/// resolved through `get_test_file_path` by the tests in this file.
const MARKDOWN_GT_DOCS: &[(&str, &str)] = &[("docling", "pdf/docling.pdf")];
fn extract_markdown(pdf_path: &std::path::Path) -> String {
let config = ExtractionConfig {
output_format: OutputFormat::Markdown,
..Default::default()
};
extract_file_sync(pdf_path, None, &config)
.expect("extraction should succeed")
.content
}
/// Extract `pdf_path` as markdown with a default `LayoutDetectionConfig`
/// enabled, and return the extracted content.
///
/// Only compiled when the `layout-detection` feature is on.
///
/// # Panics
///
/// Panics if `extract_file_sync` returns an error.
#[cfg(feature = "layout-detection")]
fn extract_markdown_with_layout(pdf_path: &std::path::Path) -> String {
    use kreuzberg::core::config::layout::LayoutDetectionConfig;

    let layout_config = ExtractionConfig {
        output_format: OutputFormat::Markdown,
        layout: Some(LayoutDetectionConfig::default()),
        ..ExtractionConfig::default()
    };
    let extraction = extract_file_sync(pdf_path, None, &layout_config)
        .expect("layout extraction should succeed");
    extraction.content
}
/// Count ATX headings (`#` through `######`) in markdown content.
///
/// A line counts as a heading when, after at most three leading spaces, it
/// starts with one to six `#` characters followed by a space, a tab, or the
/// end of the line. This rejects look-alikes such as `#hashtag`, `#include`
/// or `#[test]`, which a bare "starts with `#`" check would miscount.
///
/// Fenced code blocks are not tracked, so a `# comment` line inside a fence is
/// still counted. Setext (underlined) headings are not counted.
fn count_headings(md: &str) -> usize {
    fn is_atx_heading(line: &str) -> bool {
        let body = line.trim_start_matches(' ');
        // Four or more leading spaces make the line indented code, not a heading.
        if line.len() - body.len() > 3 {
            return false;
        }
        let level = body.bytes().take_while(|&b| b == b'#').count();
        // The marker must be followed by whitespace or nothing at all.
        (1..=6).contains(&level)
            && matches!(
                body.as_bytes().get(level).copied(),
                None | Some(b' ') | Some(b'\t')
            )
    }

    md.lines().filter(|line| is_atx_heading(line)).count()
}
/// Count pipe-table rows in markdown content, excluding delimiter rows.
///
/// A candidate row is a line that both starts and ends with `|`. A delimiter
/// row is one whose every cell, after trimming whitespace, is one or more `-`
/// with an optional leading and/or trailing `:` (for example `|---|:-:|--:|`).
///
/// Checking cell shape rather than searching for `"---"` keeps data rows whose
/// text merely contains `---`, and still drops delimiter rows written with
/// fewer than three dashes.
fn count_table_rows(md: &str) -> usize {
    fn is_delimiter_row(row: &str) -> bool {
        let inner = row.trim_matches('|');
        !inner.is_empty()
            && inner.split('|').all(|cell| {
                let cell = cell.trim();
                let cell = cell.strip_prefix(':').unwrap_or(cell);
                let dashes = cell.strip_suffix(':').unwrap_or(cell);
                !dashes.is_empty() && dashes.bytes().all(|b| b == b'-')
            })
    }

    md.lines()
        .filter(|line| line.starts_with('|') && line.ends_with('|') && !is_delimiter_row(line))
        .count()
}
/// Count lines that, once leading whitespace is removed, begin with an
/// unordered-list marker (`-`, `*` or `+`) followed by a space.
fn count_list_items(md: &str) -> usize {
    const BULLETS: [&str; 3] = ["- ", "* ", "+ "];

    let mut items = 0;
    for line in md.lines() {
        let stripped = line.trim_start();
        if BULLETS.iter().any(|marker| stripped.starts_with(*marker)) {
            items += 1;
        }
    }
    items
}
/// Report whether the markdown contains a triple-backtick fence anywhere.
fn has_code_blocks(md: &str) -> bool {
    const FENCE: &str = "```";
    md.find(FENCE).is_some()
}
/// Smoke test: default markdown extraction of every ground-truth document is
/// non-empty, longer than 500 bytes, and contains at least one heading.
///
/// Returns early (passing) when the test document set is unavailable, and
/// skips individual documents whose file is missing.
#[test]
fn test_baseline_produces_structural_markdown() {
    if !test_documents_available() {
        println!("Skipping: test_documents not available");
        return;
    }

    for (label, relative_path) in MARKDOWN_GT_DOCS.iter().copied() {
        let document = get_test_file_path(relative_path);
        if !document.exists() {
            println!("Skipping {}: file not found", label);
            continue;
        }

        let markdown = extract_markdown(&document);

        // Basic structural assertions.
        assert!(
            !markdown.trim().is_empty(),
            "{}: extraction produced empty content",
            label
        );

        // `len()` is a byte length, even though the message says "chars".
        let size = markdown.len();
        assert!(size > 500, "{}: content too short ({} chars)", label, size);

        let headings = count_headings(&markdown);
        assert!(headings > 0, "{}: expected at least one heading", label);

        // Informational summary, visible with `--nocapture`.
        println!(
            "{}: {} chars, {} headings, {} table rows, {} list items, code={}",
            label,
            size,
            headings,
            count_table_rows(&markdown),
            count_list_items(&markdown),
            has_code_blocks(&markdown),
        );
    }
}
/// Smoke test: enabling layout detection must keep at least 80% of the
/// baseline output length and must still produce at least one heading.
///
/// Returns early (passing) when the test document set is unavailable, and
/// skips individual documents whose file is missing.
#[cfg(feature = "layout-detection")]
#[test]
fn test_layout_does_not_regress_text_content() {
    if !test_documents_available() {
        println!("Skipping: test_documents not available");
        return;
    }

    for (label, relative_path) in MARKDOWN_GT_DOCS.iter().copied() {
        let document = get_test_file_path(relative_path);
        if !document.exists() {
            println!("Skipping {}: file not found", label);
            continue;
        }

        let baseline_len = extract_markdown(&document).len();
        let layout = extract_markdown_with_layout(&document);
        let layout_len = layout.len();

        // Layout may restructure the output, so tolerate up to 20% shrinkage.
        let minimum_len = baseline_len as f64 * 0.8;
        assert!(
            layout_len as f64 >= minimum_len,
            "{}: layout content ({} chars) is significantly shorter than baseline ({} chars)",
            label,
            layout_len,
            baseline_len,
        );

        // Layout output should still have headings.
        let layout_headings = count_headings(&layout);
        assert!(
            layout_headings > 0,
            "{}: layout extraction lost all headings",
            label
        );

        println!(
            "{}: baseline={} chars, layout={} chars, layout headings={}",
            label, baseline_len, layout_len, layout_headings,
        );
    }
}