202 lines
6.4 KiB
Rust
202 lines
6.4 KiB
Rust
//! PDF output quality integration tests.
//!
//! Regression tests verifying that extraction output is clean and free of
//! common noise patterns (figure-internal text, arXiv watermarks, reference
//! entries misclassified as headings, repeating conference headers).
//!
//! Benchmark documents:
//! - `docling.pdf` — academic paper with figures, tables, arXiv sidebar
//! - `multi_page.pdf` — clean multi-page document (no noise expected)
|
|
|
|
#![cfg(feature = "pdf")]
|
|
|
|
mod helpers;
|
|
|
|
use helpers::*;
|
|
use kreuzberg::core::config::{ExtractionConfig, OutputFormat};
|
|
use kreuzberg::extract_file_sync;
|
|
|
|
fn extract_markdown(relative_path: &str) -> String {
|
|
let pdf_path = get_test_file_path(relative_path);
|
|
if !pdf_path.exists() {
|
|
panic!("Test document not found: {}", relative_path);
|
|
}
|
|
let config = ExtractionConfig {
|
|
output_format: OutputFormat::Markdown,
|
|
..Default::default()
|
|
};
|
|
extract_file_sync(&pdf_path, None, &config)
|
|
.expect("extraction should succeed")
|
|
.content
|
|
}
|
|
|
|
/// Same as a plain Markdown extraction of `relative_path`, but with the
/// `layout` option set to `Some(LayoutDetectionConfig::default())`.
///
/// Only compiled when the `layout-detection` feature is enabled.
///
/// # Panics
///
/// Panics if the resolved path does not exist, or if `extract_file_sync`
/// returns an error.
#[cfg(feature = "layout-detection")]
fn extract_markdown_with_layout(relative_path: &str) -> String {
    use kreuzberg::core::config::layout::LayoutDetectionConfig;

    let document = get_test_file_path(relative_path);
    assert!(document.exists(), "Test document not found: {}", relative_path);

    // Markdown output plus default layout detection; everything else stays default.
    let layout_config = ExtractionConfig {
        output_format: OutputFormat::Markdown,
        layout: Some(LayoutDetectionConfig::default()),
        ..Default::default()
    };

    let extracted = extract_file_sync(&document, None, &layout_config)
        .expect("layout extraction should succeed");
    extracted.content
}
|
|
|
|
// ── Noise filtering: figure-internal text ────────────────────────────
|
|
|
|
/// Text that lives inside figures of `pdf/docling.pdf` must not appear
/// anywhere in the layout-aware Markdown output.
///
/// Currently ignored pending the upstream issue named in the `#[ignore]` reason.
/// Returns early (passing vacuously) when `test_documents_available()` is false.
#[cfg(feature = "layout-detection")]
#[ignore = "TODO: pdf_oxide upstream — https://github.com/yfedoseev/pdf_oxide/issues/484"]
#[test]
fn test_docling_no_figure_internal_text() {
    if !test_documents_available() {
        return;
    }
    let content = extract_markdown_with_layout("pdf/docling.pdf");

    // (forbidden substring, failure message) pairs, checked in order.
    let forbidden = [
        // "Circling Minimums" is a heading from inside an appendix figure — should be suppressed.
        (
            "Circling Minimums",
            "Figure-internal heading 'Circling Minimums' leaked into output",
        ),
        // Figure diagram labels from Figure 1 should not appear as body text.
        (
            "{;} Parse PDF pages",
            "Figure 1 diagram text leaked into output",
        ),
    ];

    for (needle, message) in forbidden {
        assert!(!content.contains(needle), "{}", message);
    }
}
|
|
|
|
/// No Markdown heading line (a line starting with `#`) in the layout-aware
/// output of `pdf/docling.pdf` may contain text from the pipeline diagram.
///
/// Returns early (passing vacuously) when `test_documents_available()` is false.
#[cfg(feature = "layout-detection")]
#[test]
fn test_docling_no_figure_text_as_headings() {
    if !test_documents_available() {
        return;
    }
    let content = extract_markdown_with_layout("pdf/docling.pdf");

    // "{;} Parse PDF pages" is from the pipeline diagram (Figure 1).
    let headings = content.lines().filter(|line| line.starts_with('#'));
    for heading in headings {
        assert!(
            !heading.contains("{;}"),
            "Figure diagram text promoted to heading: {}",
            heading
        );
        assert!(
            !heading.contains("Parse PDF pages Table Structure OCR"),
            "Figure diagram text promoted to heading: {}",
            heading
        );
    }
}
|
|
|
|
// ── Noise filtering: arXiv watermark ─────────────────────────────────
|
|
|
|
/// The arXiv identifier from the sidebar watermark of `pdf/docling.pdf` must
/// be absent from the layout-aware Markdown output.
///
/// Returns early (passing vacuously) when `test_documents_available()` is false.
#[cfg(feature = "layout-detection")]
#[test]
fn test_docling_no_arxiv_watermark() {
    if !test_documents_available() {
        return;
    }
    let content = extract_markdown_with_layout("pdf/docling.pdf");

    // The sidebar watermark reads "arXiv:2408.09869v5" and should be stripped.
    // Body-text mentions of arXiv are fine: they do not carry this identifier.
    let watermark_present = content.contains("arXiv:2408.09869");
    assert!(
        !watermark_present,
        "arXiv watermark identifier not stripped from output"
    );
}
|
|
|
|
// ── Noise filtering: references as headings ──────────────────────────
|
|
|
|
/// Bibliography entries of `pdf/docling.pdf` must not be promoted to level-2
/// (`## `) headings in the layout-aware Markdown output.
///
/// Returns early (passing vacuously) when `test_documents_available()` is false.
#[cfg(feature = "layout-detection")]
#[test]
fn test_docling_references_not_headings() {
    if !test_documents_available() {
        return;
    }
    let content = extract_markdown_with_layout("pdf/docling.pdf");

    // Substrings expected only inside reference entries, checked in this order
    // for every `## ` heading.
    // NOTE(review): "PyttiuPDF" looks like a garbled spelling (possibly of
    // "PyMuPDF") — confirm it is the intended needle.
    let reference_markers = ["PyPDFium2", "LlamaIndex", "PyttiuPDF"];

    // Individual reference entries should not be promoted to ## headings.
    for heading in content.lines().filter(|line| line.starts_with("## ")) {
        for marker in reference_markers {
            assert!(
                !heading.contains(marker),
                "Reference entry misclassified as heading: {}",
                heading
            );
        }
    }
}
|
|
|
|
// ── Content preservation ─────────────────────────────────────────────
|
|
|
|
/// Noise filtering must not remove real content: the title and a handful of
/// key terms of `pdf/docling.pdf` must still be present in the layout-aware
/// Markdown output.
///
/// Returns early (passing vacuously) when `test_documents_available()` is false.
#[cfg(feature = "layout-detection")]
#[test]
fn test_docling_key_content_preserved() {
    if !test_documents_available() {
        return;
    }
    let content = extract_markdown_with_layout("pdf/docling.pdf");

    let has_title = content.contains("Docling Technical Report");
    assert!(has_title, "Title not found in output");

    // Accept either capitalisation of the section name.
    let has_pipeline_section = ["Processing pipeline", "processing pipeline"]
        .iter()
        .any(|variant| content.contains(*variant));
    assert!(has_pipeline_section, "Section 'Processing pipeline' not found");

    let has_tableformer = content.contains("TableFormer");
    assert!(has_tableformer, "'TableFormer' not found");

    // "PDF backend" is a substring of "PDF backends", so this single check
    // accepts both the singular and the plural form.
    let has_backend_section = content.contains("PDF backend");
    assert!(has_backend_section, "'PDF backends' section not found");
}
|
|
|
|
/// The plain Markdown extraction of `pdf/multi_page.pdf` must contain its
/// title, one section name and one term from the body.
///
/// Returns early (passing vacuously) when `test_documents_available()` is false.
#[test]
fn test_multipage_clean_output() {
    if !test_documents_available() {
        return;
    }
    let content = extract_markdown("pdf/multi_page.pdf");

    // (required substring, failure message) pairs, checked in order.
    let required = [
        ("Evolution of the Word Processor", "Title not found"),
        ("Pre-Digital Era", "Section 'Pre-Digital Era' not found"),
        ("IBM MT/ST", "'IBM MT/ST' not found"),
    ];

    for (needle, message) in required {
        assert!(content.contains(needle), "{}", message);
    }
}
|
|
|
|
/// The plain Markdown extraction of `pdf/multi_page.pdf` must not contain any
/// `arXiv:` identifier.
///
/// Returns early (passing vacuously) when `test_documents_available()` is false.
#[test]
fn test_multipage_no_noise() {
    if !test_documents_available() {
        return;
    }
    let content = extract_markdown("pdf/multi_page.pdf");

    // multi_page.pdf is a clean document — it should have no arXiv noise.
    // NOTE(review): the failure message spells the fixture "multipage.pdf",
    // while the file requested above is "pdf/multi_page.pdf".
    let has_arxiv_id = content.contains("arXiv:");
    assert!(
        !has_arxiv_id,
        "multipage.pdf should have no arXiv identifiers"
    );
}
|