fil/crates/kreuzberg/tests/pdf_output_quality.rs

//! PDF output quality integration tests.
//!
//! Regression tests verifying that extraction output is clean and free of
//! common noise patterns (figure-internal text, arXiv watermarks, reference
//! entries misclassified as headings, repeating conference headers).
//!
//! Benchmark documents:
//! - `docling.pdf` — academic paper with figures, tables, arXiv sidebar
//! - `multi_page.pdf` — clean multi-page document (no noise expected)

#![cfg(feature = "pdf")]

mod helpers;

use helpers::*;
use kreuzberg::core::config::{ExtractionConfig, OutputFormat};
use kreuzberg::extract_file_sync;

fn extract_markdown(relative_path: &str) -> String {
    let pdf_path = get_test_file_path(relative_path);
    if !pdf_path.exists() {
        panic!("Test document not found: {}", relative_path);
    }
    let config = ExtractionConfig {
        output_format: OutputFormat::Markdown,
        ..Default::default()
    };
    extract_file_sync(&pdf_path, None, &config)
        .expect("extraction should succeed")
        .content
}

#[cfg(feature = "layout-detection")]
fn extract_markdown_with_layout(relative_path: &str) -> String {
    use kreuzberg::core::config::layout::LayoutDetectionConfig;

    let pdf_path = get_test_file_path(relative_path);
    if !pdf_path.exists() {
        panic!("Test document not found: {}", relative_path);
    }
    let config = ExtractionConfig {
        output_format: OutputFormat::Markdown,
        layout: Some(LayoutDetectionConfig::default()),
        ..Default::default()
    };
    extract_file_sync(&pdf_path, None, &config)
        .expect("layout extraction should succeed")
        .content
}

// ── Noise filtering: figure-internal text ────────────────────────────

#[cfg(feature = "layout-detection")]
#[ignore = "TODO: pdf_oxide upstream — https://github.com/yfedoseev/pdf_oxide/issues/484"]
#[test]
fn test_docling_no_figure_internal_text() {
    if !test_documents_available() {
        return;
    }
    let content = extract_markdown_with_layout("pdf/docling.pdf");

    // "Circling Minimums" is a heading from inside an appendix figure — should be suppressed
    assert!(
        !content.contains("Circling Minimums"),
        "Figure-internal heading 'Circling Minimums' leaked into output"
    );

    // Figure diagram labels from Figure 1 should not appear as body text
    assert!(
        !content.contains("{;} Parse PDF pages"),
        "Figure 1 diagram text leaked into output"
    );
}

#[cfg(feature = "layout-detection")]
#[test]
fn test_docling_no_figure_text_as_headings() {
    if !test_documents_available() {
        return;
    }
    let content = extract_markdown_with_layout("pdf/docling.pdf");

    // "{;} Parse PDF pages" is from the pipeline diagram (Figure 1)
    for line in content.lines() {
        if line.starts_with('#') {
            assert!(
                !line.contains("{;}"),
                "Figure diagram text promoted to heading: {}",
                line
            );
            assert!(
                !line.contains("Parse PDF pages Table Structure OCR"),
                "Figure diagram text promoted to heading: {}",
                line
            );
        }
    }
}

// ── Noise filtering: arXiv watermark ─────────────────────────────────

#[cfg(feature = "layout-detection")]
#[test]
fn test_docling_no_arxiv_watermark() {
    if !test_documents_available() {
        return;
    }
    let content = extract_markdown_with_layout("pdf/docling.pdf");

    // The arXiv sidebar watermark "arXiv:2408.09869v5" should be stripped.
    // Legitimate references to arXiv in body text are fine (they don't include the ID).
    assert!(
        !content.contains("arXiv:2408.09869"),
        "arXiv watermark identifier not stripped from output"
    );
}

// ── Noise filtering: references as headings ──────────────────────────

#[cfg(feature = "layout-detection")]
#[test]
fn test_docling_references_not_headings() {
    if !test_documents_available() {
        return;
    }
    let content = extract_markdown_with_layout("pdf/docling.pdf");

    // Individual reference entries should not be promoted to ## headings
    let heading_lines: Vec<&str> = content.lines().filter(|l| l.starts_with("## ")).collect();
    for h in &heading_lines {
        assert!(
            !h.contains("PyPDFium2"),
            "Reference entry misclassified as heading: {}",
            h
        );
        assert!(
            !h.contains("LlamaIndex"),
            "Reference entry misclassified as heading: {}",
            h
        );
        assert!(
            !h.contains("PyttiuPDF"),
            "Reference entry misclassified as heading: {}",
            h
        );
    }
}

// ── Content preservation ─────────────────────────────────────────────

#[cfg(feature = "layout-detection")]
#[test]
fn test_docling_key_content_preserved() {
    if !test_documents_available() {
        return;
    }
    let content = extract_markdown_with_layout("pdf/docling.pdf");

    assert!(
        content.contains("Docling Technical Report"),
        "Title not found in output"
    );
    assert!(
        content.contains("Processing pipeline") || content.contains("processing pipeline"),
        "Section 'Processing pipeline' not found"
    );
    assert!(content.contains("TableFormer"), "'TableFormer' not found");
    assert!(
        content.contains("PDF backend") || content.contains("PDF backends"),
        "'PDF backends' section not found"
    );
}

#[test]
fn test_multipage_clean_output() {
    if !test_documents_available() {
        return;
    }
    let content = extract_markdown("pdf/multi_page.pdf");

    assert!(content.contains("Evolution of the Word Processor"), "Title not found");
    assert!(
        content.contains("Pre-Digital Era"),
        "Section 'Pre-Digital Era' not found"
    );
    assert!(content.contains("IBM MT/ST"), "'IBM MT/ST' not found");
}

#[test]
fn test_multipage_no_noise() {
    if !test_documents_available() {
        return;
    }
    let content = extract_markdown("pdf/multi_page.pdf");

    // multipage.pdf is a clean document — should have no arXiv noise
    assert!(
        !content.contains("arXiv:"),
        "multipage.pdf should have no arXiv identifiers"
    );
}