Files
fil/crates/kreuzberg/tests/pdf_markdown_extraction.rs
Henrik Jess Nielsen b4c07d3693
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s
Nomad changes
2026-06-01 23:40:55 +02:00

179 lines
6.1 KiB
Rust

//! PDF markdown extraction integration tests.
//!
//! Tests that the new markdown rendering pipeline produces structured output
//! with headings, proper paragraph breaks, and no mid-sentence line breaks.
#![cfg(feature = "pdf")]
mod helpers;
use helpers::*;
use kreuzberg::core::config::{ExtractionConfig, OutputFormat};
use kreuzberg::extract_file_sync;
/// Extracts `pdf/fake_memo.pdf` with `OutputFormat::Markdown` and checks that
/// the content is non-empty, the mime type is still `application/pdf`, and the
/// output contains at least one paragraph break (a blank line).
///
/// Returns early without asserting when `skip_if_missing` reports `true`
/// (presumably because the fixture is not available in this checkout).
#[ignore = "TODO: pdf_oxide upstream — https://github.com/yfedoseev/pdf_oxide/issues/484"]
#[test]
fn test_pdf_markdown_extraction_produces_structured_output() {
    if skip_if_missing("pdf/fake_memo.pdf") {
        return;
    }
    let path = get_test_file_path("pdf/fake_memo.pdf");
    let config = ExtractionConfig {
        output_format: OutputFormat::Markdown,
        ..Default::default()
    };
    let result = extract_file_sync(&path, None, &config).expect("Should extract PDF as markdown");

    assert!(
        !result.content.trim().is_empty(),
        "Markdown content should not be empty"
    );
    assert_eq!(
        &*result.mime_type, "application/pdf",
        "Mime type should preserve original document type; output format is tracked in metadata"
    );

    // Verify paragraph structure: should have paragraph breaks (blank lines).
    // PDFs may use \r\n or \n line endings; normalize before counting.
    let normalized = result.content.replace("\r\n", "\n");
    let para_breaks = normalized.matches("\n\n").count();

    // Build the preview from characters rather than a byte range: slicing
    // `&content[..1500]` panics if byte 1500 lands inside a multi-byte UTF-8
    // sequence, which would fail the test for reasons unrelated to extraction.
    let preview: String = result.content.chars().take(1500).collect();

    println!("=== Markdown output (first 1500 chars) ===");
    println!("{}", preview);
    println!("\n=== Analysis ===");
    println!("Has heading markers: {}", result.content.contains("# "));
    println!("Paragraph breaks: {}", para_breaks);
    println!("Total chars: {}", result.content.len());
    println!("Mime type: {}", result.mime_type);

    assert!(
        para_breaks >= 1,
        "Should have at least 1 paragraph break, got {}",
        para_breaks
    );
}
/// Extracts `pdf/fake_memo.pdf` with the default `ExtractionConfig` and checks
/// that the content is non-empty and the mime type is `application/pdf`.
///
/// Returns early without asserting when `skip_if_missing` reports `true`.
#[test]
fn test_pdf_plain_extraction_unchanged() {
    const FIXTURE: &str = "pdf/fake_memo.pdf";

    if skip_if_missing(FIXTURE) {
        return;
    }

    let fixture_path = get_test_file_path(FIXTURE);
    // The default configuration is expected to select the plain output format.
    let plain_config = ExtractionConfig::default();
    let extraction =
        extract_file_sync(&fixture_path, None, &plain_config).expect("Should extract PDF as plain text");

    let trimmed_body = extraction.content.trim();
    assert!(!trimmed_body.is_empty(), "Plain content should not be empty");

    assert_eq!(
        &*extraction.mime_type, "application/pdf",
        "Mime type should remain application/pdf for plain extraction"
    );
}
/// Extracts `pdf/google_doc_document.pdf` once with the default config and once
/// with `OutputFormat::Markdown`, then checks that both outputs are non-empty
/// and that they differ from each other.
///
/// Returns early without asserting when `skip_if_missing` reports `true`.
#[test]
fn test_pdf_markdown_vs_plain_has_more_structure() {
    if skip_if_missing("pdf/google_doc_document.pdf") {
        return;
    }
    let path = get_test_file_path("pdf/google_doc_document.pdf");

    // Extract as plain
    let plain_config = ExtractionConfig::default();
    let plain_result = extract_file_sync(&path, None, &plain_config).expect("Plain extraction failed");

    // Extract as markdown
    let md_config = ExtractionConfig {
        output_format: OutputFormat::Markdown,
        ..Default::default()
    };
    let md_result = extract_file_sync(&path, None, &md_config).expect("Markdown extraction failed");

    // Previews are taken by character, not by byte range: `&content[..500]`
    // panics when byte 500 falls inside a multi-byte UTF-8 sequence.
    let plain_preview: String = plain_result.content.chars().take(500).collect();
    let md_preview: String = md_result.content.chars().take(500).collect();

    println!("=== Plain (first 500 chars) ===");
    println!("{}", plain_preview);
    println!("\n=== Markdown (first 500 chars) ===");
    println!("{}", md_preview);

    // Both should have content
    assert!(
        !plain_result.content.trim().is_empty(),
        "Plain content should not be empty"
    );
    assert!(
        !md_result.content.trim().is_empty(),
        "Markdown content should not be empty"
    );

    // Markdown should be different from plain (has structure added)
    // This is a weak check but validates the pipeline ran
    assert_ne!(
        plain_result.content, md_result.content,
        "Markdown output should differ from plain text output"
    );
}
/// Regression test for GitHub discussion #391.
///
/// PDFs whose structure tree is flat (only paragraphs, no heading tags) are
/// still expected to yield headings, via the font-size clustering fallback,
/// when the document uses varying font sizes. The test extracts
/// `pdf/google_doc_document.pdf` as Markdown and requires at least one line
/// whose trimmed text starts with `#`.
#[test]
fn test_pdf_markdown_produces_headings_via_font_size_clustering() {
    const FIXTURE: &str = "pdf/google_doc_document.pdf";

    if skip_if_missing(FIXTURE) {
        return;
    }

    let fixture_path = get_test_file_path(FIXTURE);
    let markdown_config = ExtractionConfig {
        output_format: OutputFormat::Markdown,
        ..Default::default()
    };
    let extraction =
        extract_file_sync(&fixture_path, None, &markdown_config).expect("Should extract PDF as markdown");

    // Gather every line that looks like a Markdown heading once surrounding
    // whitespace is ignored.
    let mut heading_lines: Vec<&str> = Vec::new();
    for line in extraction.content.lines() {
        if line.trim().starts_with('#') {
            heading_lines.push(line);
        }
    }

    println!("=== Heading detection test ===");
    println!("Total headings found: {}", heading_lines.len());
    heading_lines.iter().for_each(|heading| println!(" {}", heading));

    // Headings may come from the structure tree or from the font-size
    // clustering fallback; either source satisfies this check.
    let found_any_heading = !heading_lines.is_empty();
    assert!(
        found_any_heading,
        "Markdown extraction should produce at least one heading via structure tree or font-size clustering"
    );
}
/// Tests that Markdown extraction preserves paragraph breaks even when
/// structure-tree extraction falls back to the heuristic path.
///
/// Extracts `pdf/google_doc_document.pdf` as Markdown, normalizes `\r\n` to
/// `\n`, and requires at least two blank-line paragraph breaks. Returns early
/// without asserting when `skip_if_missing` reports `true`.
#[test]
fn test_pdf_markdown_heuristic_fallback_preserves_paragraphs() {
    if skip_if_missing("pdf/google_doc_document.pdf") {
        return;
    }
    let path = get_test_file_path("pdf/google_doc_document.pdf");
    let config = ExtractionConfig {
        output_format: OutputFormat::Markdown,
        ..Default::default()
    };
    let result = extract_file_sync(&path, None, &config).expect("Should extract PDF");

    // Normalize line endings so "\n\n" matches blank lines in CRLF output too.
    let normalized = result.content.replace("\r\n", "\n");
    let para_breaks = normalized.matches("\n\n").count();
    let heading_count = result.content.lines().filter(|l| l.trim().starts_with('#')).count();

    // Preview by character, not by byte range: `&content[..500]` panics when
    // byte 500 falls inside a multi-byte UTF-8 sequence.
    let preview: String = result.content.chars().take(500).collect();

    println!("=== Heuristic fallback test ===");
    println!("Paragraphs: {}, Headings: {}", para_breaks, heading_count);
    println!("First 500 chars: {}", preview);

    assert!(
        para_breaks >= 2,
        "Should preserve paragraph breaks after heuristic fallback, got {}",
        para_breaks
    );
}