fil/crates/kreuzberg/tests/pdf_markdown_extraction.rs

//! PDF markdown extraction integration tests.
//!
//! Tests that the new markdown rendering pipeline produces structured output
//! with headings, proper paragraph breaks, and no mid-sentence line breaks.

#![cfg(feature = "pdf")]

mod helpers;

use helpers::*;
use kreuzberg::core::config::{ExtractionConfig, OutputFormat};
use kreuzberg::extract_file_sync;

/// Extracts the `pdf/fake_memo.pdf` fixture with `OutputFormat::Markdown` and
/// checks the basic shape of the result.
///
/// Asserts that:
/// - the extracted content is not blank,
/// - the reported mime type is still `application/pdf`,
/// - the content contains at least one blank-line paragraph break.
///
/// Returns early without asserting anything when `skip_if_missing` reports
/// `true` for the fixture.
#[ignore = "TODO: pdf_oxide upstream — https://github.com/yfedoseev/pdf_oxide/issues/484"]
#[test]
fn test_pdf_markdown_extraction_produces_structured_output() {
    if skip_if_missing("pdf/fake_memo.pdf") {
        return;
    }

    let path = get_test_file_path("pdf/fake_memo.pdf");

    let config = ExtractionConfig {
        output_format: OutputFormat::Markdown,
        ..Default::default()
    };

    let result = extract_file_sync(&path, None, &config).expect("Should extract PDF as markdown");

    assert!(
        !result.content.trim().is_empty(),
        "Markdown content should not be empty"
    );
    assert_eq!(
        &*result.mime_type, "application/pdf",
        "Mime type should preserve original document type; output format is tracked in metadata"
    );

    // Verify paragraph structure: should have paragraph breaks (blank lines).
    // PDFs may use \r\n or \n line endings; normalize before counting.
    let normalized = result.content.replace("\r\n", "\n");
    let para_breaks = normalized.matches("\n\n").count();

    // Build the preview from characters rather than a byte range: slicing a
    // string at a byte offset that falls inside a multi-byte UTF-8 character
    // panics, and extracted PDF text is not guaranteed to be ASCII.
    let preview: String = result.content.chars().take(1500).collect();

    println!("=== Markdown output (first 1500 chars) ===");
    println!("{}", preview);
    println!("\n=== Analysis ===");
    println!("Has heading markers: {}", result.content.contains("# "));
    println!("Paragraph breaks: {}", para_breaks);
    // Note: `len()` reports the UTF-8 byte length of the content.
    println!("Total chars: {}", result.content.len());
    println!("Mime type: {}", result.mime_type);

    assert!(
        para_breaks >= 1,
        "Should have at least 1 paragraph break, got {}",
        para_breaks
    );
}

/// Extracts the `pdf/fake_memo.pdf` fixture with the default configuration
/// and checks that the content is not blank and the mime type is
/// `application/pdf`.
///
/// Returns early without asserting anything when `skip_if_missing` reports
/// `true` for the fixture.
#[test]
fn test_pdf_plain_extraction_unchanged() {
    const FIXTURE: &str = "pdf/fake_memo.pdf";

    if skip_if_missing(FIXTURE) {
        return;
    }

    let fixture_path = get_test_file_path(FIXTURE);

    // The default configuration is expected to select plain-text output.
    let plain_config = ExtractionConfig::default();
    let extracted =
        extract_file_sync(&fixture_path, None, &plain_config).expect("Should extract PDF as plain text");

    let body = extracted.content.trim();
    assert!(!body.is_empty(), "Plain content should not be empty");

    assert_eq!(
        &*extracted.mime_type, "application/pdf",
        "Mime type should remain application/pdf for plain extraction"
    );
}

/// Extracts `pdf/google_doc_document.pdf` twice — once with the default
/// configuration and once with `OutputFormat::Markdown` — and checks that
/// both results are non-blank and that they differ from each other.
///
/// Returns early without asserting anything when `skip_if_missing` reports
/// `true` for the fixture.
#[test]
fn test_pdf_markdown_vs_plain_has_more_structure() {
    if skip_if_missing("pdf/google_doc_document.pdf") {
        return;
    }

    let path = get_test_file_path("pdf/google_doc_document.pdf");

    // Extract as plain
    let plain_config = ExtractionConfig::default();
    let plain_result = extract_file_sync(&path, None, &plain_config).expect("Plain extraction failed");

    // Extract as markdown
    let md_config = ExtractionConfig {
        output_format: OutputFormat::Markdown,
        ..Default::default()
    };
    let md_result = extract_file_sync(&path, None, &md_config).expect("Markdown extraction failed");

    // Build previews from characters rather than byte ranges: slicing a
    // string at a byte offset that falls inside a multi-byte UTF-8 character
    // panics, and extracted PDF text is not guaranteed to be ASCII.
    let plain_preview: String = plain_result.content.chars().take(500).collect();
    let md_preview: String = md_result.content.chars().take(500).collect();

    println!("=== Plain (first 500 chars) ===");
    println!("{}", plain_preview);
    println!("\n=== Markdown (first 500 chars) ===");
    println!("{}", md_preview);

    // Both should have content
    assert!(!plain_result.content.trim().is_empty());
    assert!(!md_result.content.trim().is_empty());

    // Markdown should be different from plain (has structure added)
    // This is a weak check but validates the pipeline ran
    assert_ne!(
        plain_result.content, md_result.content,
        "Markdown output should differ from plain text output"
    );
}

/// Regression test for GitHub discussion #391.
///
/// A PDF whose structure tree is flat (only paragraphs, no heading tags) is
/// still expected to yield markdown headings, using font-size clustering as a
/// fallback when the document has varying font sizes. The test only checks
/// that at least one line of the markdown output starts with `#` after
/// trimming leading whitespace.
///
/// Returns early without asserting anything when `skip_if_missing` reports
/// `true` for the fixture.
#[test]
fn test_pdf_markdown_produces_headings_via_font_size_clustering() {
    const FIXTURE: &str = "pdf/google_doc_document.pdf";

    if skip_if_missing(FIXTURE) {
        return;
    }

    let fixture_path = get_test_file_path(FIXTURE);

    let markdown_config = ExtractionConfig {
        output_format: OutputFormat::Markdown,
        ..Default::default()
    };

    let extracted =
        extract_file_sync(&fixture_path, None, &markdown_config).expect("Should extract PDF as markdown");

    // Gather every line that looks like a markdown heading.
    let mut headings: Vec<&str> = Vec::new();
    for line in extracted.content.lines() {
        if line.trim_start().starts_with('#') {
            headings.push(line);
        }
    }

    println!("=== Heading detection test ===");
    println!("Total headings found: {}", headings.len());
    headings.iter().for_each(|heading| println!("  {}", heading));

    // Headings may originate from the structure tree or from the font-size
    // clustering fallback; either source satisfies this check.
    let found_heading = !headings.is_empty();
    assert!(
        found_heading,
        "Markdown extraction should produce at least one heading via structure tree or font-size clustering"
    );
}

/// Tests that markdown extraction preserves paragraph breaks even when the
/// pipeline falls back from the structure tree to heuristics.
///
/// Extracts `pdf/google_doc_document.pdf` with `OutputFormat::Markdown`,
/// normalizes `\r\n` to `\n`, and asserts that at least two blank-line
/// paragraph breaks are present. The heading count is printed for diagnostics
/// only and is not asserted.
///
/// Returns early without asserting anything when `skip_if_missing` reports
/// `true` for the fixture.
#[test]
fn test_pdf_markdown_heuristic_fallback_preserves_paragraphs() {
    if skip_if_missing("pdf/google_doc_document.pdf") {
        return;
    }

    let path = get_test_file_path("pdf/google_doc_document.pdf");

    let config = ExtractionConfig {
        output_format: OutputFormat::Markdown,
        ..Default::default()
    };

    let result = extract_file_sync(&path, None, &config).expect("Should extract PDF");

    let normalized = result.content.replace("\r\n", "\n");
    let para_breaks = normalized.matches("\n\n").count();
    let heading_count = result.content.lines().filter(|l| l.trim().starts_with('#')).count();

    // Build the preview from characters rather than a byte range: slicing a
    // string at a byte offset that falls inside a multi-byte UTF-8 character
    // panics, and extracted PDF text is not guaranteed to be ASCII.
    let preview: String = result.content.chars().take(500).collect();

    println!("=== Heuristic fallback test ===");
    println!("Paragraphs: {}, Headings: {}", para_breaks, heading_count);
    println!("First 500 chars: {}", preview);

    assert!(
        para_breaks >= 2,
        "Should preserve paragraph breaks after heuristic fallback, got {}",
        para_breaks
    );
}