Files
fil/crates/kreuzberg/tests/pdf_output_quality.rs
Henrik Jess Nielsen b4c07d3693
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s
Nomad changes
2026-06-01 23:40:55 +02:00

202 lines
6.4 KiB
Rust

//! PDF output quality integration tests.
//!
//! Regression tests verifying that extraction output is clean and free of
//! common noise patterns (figure-internal text, arXiv watermarks, reference
//! entries misclassified as headings, repeating conference headers).
//!
//! Benchmark documents:
//! - `docling.pdf` — academic paper with figures, tables, arXiv sidebar
//! - `multi_page.pdf` — clean multi-page document (no noise expected)
#![cfg(feature = "pdf")]
mod helpers;
use helpers::*;
use kreuzberg::core::config::{ExtractionConfig, OutputFormat};
use kreuzberg::extract_file_sync;
/// Extracts the test document at `relative_path` and returns its content as Markdown.
///
/// The path is resolved through `get_test_file_path`, and extraction uses the
/// default configuration with only the output format switched to Markdown.
///
/// # Panics
///
/// Panics if the resolved file does not exist or if extraction returns an error.
fn extract_markdown(relative_path: &str) -> String {
    let document = get_test_file_path(relative_path);
    assert!(
        document.exists(),
        "Test document not found: {}",
        relative_path
    );

    let markdown_config = ExtractionConfig {
        output_format: OutputFormat::Markdown,
        ..Default::default()
    };
    let extraction =
        extract_file_sync(&document, None, &markdown_config).expect("extraction should succeed");
    extraction.content
}
/// Extracts the test document at `relative_path` as Markdown with layout
/// detection enabled (using `LayoutDetectionConfig::default()`).
///
/// Only compiled when the `layout-detection` feature is active.
///
/// # Panics
///
/// Panics if the resolved file does not exist or if extraction returns an error.
#[cfg(feature = "layout-detection")]
fn extract_markdown_with_layout(relative_path: &str) -> String {
    use kreuzberg::core::config::layout::LayoutDetectionConfig;

    let document = get_test_file_path(relative_path);
    assert!(
        document.exists(),
        "Test document not found: {}",
        relative_path
    );

    let layout_config = ExtractionConfig {
        output_format: OutputFormat::Markdown,
        layout: Some(LayoutDetectionConfig::default()),
        ..Default::default()
    };
    let extraction = extract_file_sync(&document, None, &layout_config)
        .expect("layout extraction should succeed");
    extraction.content
}
// ── Noise filtering: figure-internal text ────────────────────────────
/// Checks that text located inside figures of `pdf/docling.pdf` does not show
/// up in the layout-aware Markdown output.
///
/// Ignored for now; see the upstream issue linked in the `#[ignore]` reason.
/// Returns without asserting when `test_documents_available()` is false.
#[cfg(feature = "layout-detection")]
#[ignore = "TODO: pdf_oxide upstream — https://github.com/yfedoseev/pdf_oxide/issues/484"]
#[test]
fn test_docling_no_figure_internal_text() {
    if !test_documents_available() {
        return;
    }
    let markdown = extract_markdown_with_layout("pdf/docling.pdf");

    // (forbidden text, failure message) pairs, checked in order.
    let leaks = [
        // Heading from inside an appendix figure — should be suppressed.
        (
            "Circling Minimums",
            "Figure-internal heading 'Circling Minimums' leaked into output",
        ),
        // Diagram label from Figure 1 — should not appear as body text.
        (
            "{;} Parse PDF pages",
            "Figure 1 diagram text leaked into output",
        ),
    ];
    for &(needle, message) in leaks.iter() {
        assert!(!markdown.contains(needle), "{}", message);
    }
}
/// Checks that no Markdown heading line (any line starting with `#`) in the
/// `pdf/docling.pdf` output carries text from the Figure 1 pipeline diagram.
///
/// Returns without asserting when `test_documents_available()` is false.
#[cfg(feature = "layout-detection")]
#[test]
fn test_docling_no_figure_text_as_headings() {
    if !test_documents_available() {
        return;
    }
    let markdown = extract_markdown_with_layout("pdf/docling.pdf");

    // Fragments of the pipeline diagram (Figure 1), e.g. "{;} Parse PDF pages".
    let diagram_fragments = ["{;}", "Parse PDF pages Table Structure OCR"];

    for heading in markdown.lines().filter(|l| l.starts_with('#')) {
        for fragment in diagram_fragments.iter() {
            assert!(
                !heading.contains(*fragment),
                "Figure diagram text promoted to heading: {}",
                heading
            );
        }
    }
}
// ── Noise filtering: arXiv watermark ─────────────────────────────────
/// Checks that the arXiv sidebar watermark of `pdf/docling.pdf` is stripped
/// from the layout-aware Markdown output.
///
/// Returns without asserting when `test_documents_available()` is false.
#[cfg(feature = "layout-detection")]
#[test]
fn test_docling_no_arxiv_watermark() {
    if !test_documents_available() {
        return;
    }
    let markdown = extract_markdown_with_layout("pdf/docling.pdf");

    // The sidebar watermark reads "arXiv:2408.09869v5". Plain mentions of arXiv
    // in body text are fine because they do not include the identifier.
    let watermark_present = markdown.contains("arXiv:2408.09869");
    assert!(
        !watermark_present,
        "arXiv watermark identifier not stripped from output"
    );
}
// ── Noise filtering: references as headings ──────────────────────────
/// Checks that individual bibliography entries of `pdf/docling.pdf` are not
/// emitted as level-2 (`## `) Markdown headings.
///
/// Returns without asserting when `test_documents_available()` is false.
#[cfg(feature = "layout-detection")]
#[test]
fn test_docling_references_not_headings() {
    if !test_documents_available() {
        return;
    }
    let markdown = extract_markdown_with_layout("pdf/docling.pdf");

    // Names that identify reference entries.
    // NOTE(review): "PyttiuPDF" looks like a garbled "PyMuPDF"; it may be
    // intentional (matching garbled extractor output) — confirm before changing.
    let reference_markers = ["PyPDFium2", "LlamaIndex", "PyttiuPDF"];

    for heading in markdown.lines().filter(|l| l.starts_with("## ")) {
        for marker in reference_markers.iter() {
            assert!(
                !heading.contains(*marker),
                "Reference entry misclassified as heading: {}",
                heading
            );
        }
    }
}
// ── Content preservation ─────────────────────────────────────────────
/// Checks that noise filtering does not remove key content from the
/// `pdf/docling.pdf` output: the title, two section names and a key term.
///
/// Returns without asserting when `test_documents_available()` is false.
#[cfg(feature = "layout-detection")]
#[test]
fn test_docling_key_content_preserved() {
    if !test_documents_available() {
        return;
    }
    let markdown = extract_markdown_with_layout("pdf/docling.pdf");

    let has_title = markdown.contains("Docling Technical Report");
    assert!(has_title, "Title not found in output");

    // Accept either capitalisation of the section name.
    let has_pipeline = ["Processing pipeline", "processing pipeline"]
        .iter()
        .any(|needle| markdown.contains(*needle));
    assert!(has_pipeline, "Section 'Processing pipeline' not found");

    let has_tableformer = markdown.contains("TableFormer");
    assert!(has_tableformer, "'TableFormer' not found");

    // "PDF backend" is a substring of "PDF backends", so the singular alone
    // already matches both spellings.
    let has_backends = ["PDF backend", "PDF backends"]
        .iter()
        .any(|needle| markdown.contains(*needle));
    assert!(has_backends, "'PDF backends' section not found");
}
/// Checks that the Markdown output for `pdf/multi_page.pdf` contains the
/// expected title, a section name and a key term.
///
/// Returns without asserting when `test_documents_available()` is false.
#[test]
fn test_multipage_clean_output() {
    if !test_documents_available() {
        return;
    }
    let markdown = extract_markdown("pdf/multi_page.pdf");

    // (required text, failure message) pairs, checked in order.
    let expectations = [
        ("Evolution of the Word Processor", "Title not found"),
        ("Pre-Digital Era", "Section 'Pre-Digital Era' not found"),
        ("IBM MT/ST", "'IBM MT/ST' not found"),
    ];
    for &(needle, message) in expectations.iter() {
        assert!(markdown.contains(needle), "{}", message);
    }
}
/// Checks that the Markdown output for `pdf/multi_page.pdf` contains no
/// arXiv identifier (`arXiv:`), since that document is expected to be clean.
///
/// Returns without asserting when `test_documents_available()` is false.
#[test]
fn test_multipage_no_noise() {
    if !test_documents_available() {
        return;
    }
    let content = extract_markdown("pdf/multi_page.pdf");
    // multi_page.pdf is a clean document — should have no arXiv noise
    assert!(
        !content.contains("arXiv:"),
        // Name the fixture exactly as it exists on disk so a failure is easy to trace.
        "multi_page.pdf should have no arXiv identifiers"
    );
}