//! PDF markdown extraction integration tests. //! //! Tests that the new markdown rendering pipeline produces structured output //! with headings, proper paragraph breaks, and no mid-sentence line breaks. #![cfg(feature = "pdf")] mod helpers; use helpers::*; use kreuzberg::core::config::{ExtractionConfig, OutputFormat}; use kreuzberg::extract_file_sync; #[ignore = "TODO: pdf_oxide upstream — https://github.com/yfedoseev/pdf_oxide/issues/484"] #[test] fn test_pdf_markdown_extraction_produces_structured_output() { if skip_if_missing("pdf/fake_memo.pdf") { return; } let path = get_test_file_path("pdf/fake_memo.pdf"); let config = ExtractionConfig { output_format: OutputFormat::Markdown, ..Default::default() }; let result = extract_file_sync(&path, None, &config).expect("Should extract PDF as markdown"); assert!( !result.content.trim().is_empty(), "Markdown content should not be empty" ); assert_eq!( &*result.mime_type, "application/pdf", "Mime type should preserve original document type; output format is tracked in metadata" ); // Verify paragraph structure: should have paragraph breaks (blank lines). // PDFs may use \r\n or \n line endings; normalize before counting. 
let normalized = result.content.replace("\r\n", "\n"); let para_breaks = normalized.matches("\n\n").count(); println!("=== Markdown output (first 1500 chars) ==="); println!("{}", &result.content[..result.content.len().min(1500)]); println!("\n=== Analysis ==="); println!("Has heading markers: {}", result.content.contains("# ")); println!("Paragraph breaks: {}", para_breaks); println!("Total chars: {}", result.content.len()); println!("Mime type: {}", result.mime_type); assert!( para_breaks >= 1, "Should have at least 1 paragraph break, got {}", para_breaks ); } #[test] fn test_pdf_plain_extraction_unchanged() { if skip_if_missing("pdf/fake_memo.pdf") { return; } let path = get_test_file_path("pdf/fake_memo.pdf"); // Default config = Plain output format let config = ExtractionConfig::default(); let result = extract_file_sync(&path, None, &config).expect("Should extract PDF as plain text"); assert!(!result.content.trim().is_empty(), "Plain content should not be empty"); assert_eq!( &*result.mime_type, "application/pdf", "Mime type should remain application/pdf for plain extraction" ); } #[test] fn test_pdf_markdown_vs_plain_has_more_structure() { if skip_if_missing("pdf/google_doc_document.pdf") { return; } let path = get_test_file_path("pdf/google_doc_document.pdf"); // Extract as plain let plain_config = ExtractionConfig::default(); let plain_result = extract_file_sync(&path, None, &plain_config).expect("Plain extraction failed"); // Extract as markdown let md_config = ExtractionConfig { output_format: OutputFormat::Markdown, ..Default::default() }; let md_result = extract_file_sync(&path, None, &md_config).expect("Markdown extraction failed"); println!("=== Plain (first 500 chars) ==="); println!("{}", &plain_result.content[..plain_result.content.len().min(500)]); println!("\n=== Markdown (first 500 chars) ==="); println!("{}", &md_result.content[..md_result.content.len().min(500)]); // Both should have content assert!(!plain_result.content.trim().is_empty()); 
assert!(!md_result.content.trim().is_empty()); // Markdown should be different from plain (has structure added) // This is a weak check but validates the pipeline ran assert_ne!( plain_result.content, md_result.content, "Markdown output should differ from plain text output" ); } /// Regression test for GitHub discussion #391: PDFs with flat structure trees /// (all paragraphs, no heading tags) should still produce headings via /// font-size clustering fallback when the document has varying font sizes. #[test] fn test_pdf_markdown_produces_headings_via_font_size_clustering() { if skip_if_missing("pdf/google_doc_document.pdf") { return; } let path = get_test_file_path("pdf/google_doc_document.pdf"); let config = ExtractionConfig { output_format: OutputFormat::Markdown, ..Default::default() }; let result = extract_file_sync(&path, None, &config).expect("Should extract PDF as markdown"); let heading_lines: Vec<&str> = result.content.lines().filter(|l| l.trim().starts_with('#')).collect(); println!("=== Heading detection test ==="); println!("Total headings found: {}", heading_lines.len()); for h in &heading_lines { println!(" {}", h); } // The markdown pipeline should detect headings either from the structure // tree or via font-size clustering fallback. assert!( !heading_lines.is_empty(), "Markdown extraction should produce at least one heading via structure tree or font-size clustering" ); } /// Test that markdown extraction preserves paragraph breaks even when /// the structure tree fallback to heuristic is triggered. 
#[test]
fn test_pdf_markdown_heuristic_fallback_preserves_paragraphs() {
    // Skip when the fixture is not available.
    if skip_if_missing("pdf/google_doc_document.pdf") {
        return;
    }

    let path = get_test_file_path("pdf/google_doc_document.pdf");
    let config = ExtractionConfig {
        output_format: OutputFormat::Markdown,
        ..Default::default()
    };

    let result = extract_file_sync(&path, None, &config).expect("Should extract PDF");

    // Normalize CRLF to LF so a blank line is always "\n\n" before counting.
    let normalized = result.content.replace("\r\n", "\n");
    let para_breaks = normalized.matches("\n\n").count();
    // A line counts as a heading when its trimmed form starts with '#'.
    let heading_count = result
        .content
        .lines()
        .filter(|l| l.trim().starts_with('#'))
        .count();

    // Truncate by character rather than by byte: `content[..500]` panics when
    // byte 500 falls inside a multi-byte UTF-8 character.
    let preview: String = result.content.chars().take(500).collect();

    println!("=== Heuristic fallback test ===");
    println!("Paragraphs: {}, Headings: {}", para_breaks, heading_count);
    println!("First 500 chars: {}", preview);

    assert!(
        para_breaks >= 2,
        "Should preserve paragraph breaks after heuristic fallback, got {}",
        para_breaks
    );
}