//! Integration tests for PDF text hierarchy detection. //! //! Tests the extraction and detection of document hierarchy levels (H1-H6) //! from PDF text using font size clustering and semantic analysis. #![cfg(feature = "pdf")] use kreuzberg::core::config::{ExtractionConfig, HierarchyConfig, PageConfig, PdfConfig}; use kreuzberg::extract_bytes; use std::path::Path; // Note: All tests must run serially because Pdfium can only be initialized once. // Using tokio::test with single_threaded doesn't work well, so we use the serial_test crate. // For now, we'll just accept that tests run in parallel but handle the Pdfium initialization error. /// Test full hierarchy extraction from a real PDF. /// /// Loads a PDF from test data directory, extracts with hierarchy detection enabled, /// and verifies that PageContent.hierarchy is properly populated with expected /// blocks and hierarchy levels. #[tokio::test] async fn test_full_hierarchy_extraction() { // Use the embedded_images_tables.pdf which has clear text structure // Path is relative to workspace root, not crate root let pdf_path = "../../test_documents/pdf/embedded_images_tables.pdf"; if !Path::new(pdf_path).exists() { eprintln!("Test PDF not found at: {}", pdf_path); // Skip the test if PDF doesn't exist return; } let pdf_bytes = std::fs::read(pdf_path).expect("Failed to read PDF file"); // Create extraction config with hierarchy detection enabled let config = ExtractionConfig { pages: Some(PageConfig { extract_pages: true, ..Default::default() }), pdf_options: Some(PdfConfig { extract_images: false, extract_tables: true, passwords: None, extract_metadata: true, hierarchy: Some(HierarchyConfig { enabled: true, k_clusters: 6, include_bbox: true, ocr_coverage_threshold: None, }), extract_annotations: false, ..Default::default() }), ..Default::default() }; // Extract the PDF let result = extract_bytes(&pdf_bytes, "application/pdf", &config) .await .expect("PDF extraction failed"); // Verify that pages were extracted assert!( result.pages.is_some(), "Pages should be extracted when extract_pages is enabled" ); let pages = result.pages.as_ref().expect("Operation failed"); assert!(!pages.is_empty(), "At least one page should be extracted"); // Check that the first page has hierarchy information let first_page = &pages[0]; assert!( first_page.hierarchy.is_some(), "First page should have hierarchy information when hierarchy extraction is enabled" ); let hierarchy = first_page.hierarchy.as_ref().expect("Operation failed"); // Verify hierarchy structure assert!(hierarchy.block_count > 0, "Hierarchy should contain at least one block"); assert!(!hierarchy.blocks.is_empty(), "Hierarchy blocks should not be empty"); eprintln!("Extracted {} hierarchy blocks from page 1", hierarchy.block_count); // Verify that we have multiple hierarchy levels let levels: std::collections::HashSet = hierarchy.blocks.iter().map(|b| b.level.clone()).collect(); eprintln!("Found hierarchy levels: {:?}", levels); // Should have at least 1 level assert!(!levels.is_empty(), "Should have at least one hierarchy level"); // Verify block structure for block in &hierarchy.blocks { assert!(!block.text.is_empty(), "Block text should not be empty"); assert!(block.font_size > 0.0, "Font size should be positive"); // Check that level is a valid heading level or body let is_valid_level = matches!(block.level.as_str(), "h1" | "h2" | "h3" | "h4" | "h5" | "h6" | "body"); assert!(is_valid_level, "Invalid hierarchy level: {}", block.level); // Verify bounding box if present if let Some((left, top, right, bottom)) = block.bbox { assert!(left < right, "Bounding box left should be less than right"); assert!(top < bottom, "Bounding box top should be less than bottom"); assert!( left >= 0.0 && top >= 0.0, "Bounding box coordinates should be non-negative" ); eprintln!( "Block '{}' (level: {}, font_size: {}) bbox: ({}, {}, {}, {})", block.text.chars().take(30).collect::(), block.level, block.font_size, left, top, right, bottom ); } else { eprintln!( "Block '{}' (level: {}, font_size: {}) no bbox", block.text.chars().take(30).collect::(), block.level, block.font_size ); } } eprintln!("Hierarchy extraction test passed!"); } /// Test that hierarchy extraction respects the enabled flag. /// Note: This test is combined with the full_hierarchy_extraction test due to Pdfium initialization constraints. #[tokio::test] #[ignore] async fn test_hierarchy_disabled() { let pdf_path = "../../test_documents/pdf/embedded_images_tables.pdf"; if !Path::new(pdf_path).exists() { eprintln!("Test PDF not found at: {}", pdf_path); return; } let pdf_bytes = std::fs::read(pdf_path).expect("Failed to read PDF file"); // Create extraction config with hierarchy detection disabled let config = ExtractionConfig { pages: Some(PageConfig { extract_pages: true, ..Default::default() }), pdf_options: Some(PdfConfig { extract_images: false, extract_tables: true, passwords: None, extract_metadata: true, hierarchy: Some(HierarchyConfig { enabled: false, k_clusters: 6, include_bbox: true, ocr_coverage_threshold: None, }), extract_annotations: false, ..Default::default() }), ..Default::default() }; let result = extract_bytes(&pdf_bytes, "application/pdf", &config) .await .expect("PDF extraction failed"); // Verify that pages were extracted assert!(result.pages.is_some(), "Pages should be extracted"); let pages = result.pages.as_ref().expect("Operation failed"); assert!(!pages.is_empty(), "At least one page should be extracted"); // Check that the first page does NOT have hierarchy information when disabled let first_page = &pages[0]; assert!( first_page.hierarchy.is_none(), "First page should not have hierarchy when hierarchy extraction is disabled" ); eprintln!("Hierarchy disabled test passed!"); } /// Test different hierarchy configurations /// Note: This test is ignored due to Pdfium initialization constraints (can only initialize once). #[tokio::test] #[ignore] async fn test_hierarchy_with_explicit_disabled() { let pdf_path = "../../test_documents/pdf/embedded_images_tables.pdf"; if !Path::new(pdf_path).exists() { eprintln!("Test PDF not found at: {}", pdf_path); return; } let pdf_bytes = std::fs::read(pdf_path).expect("Failed to read PDF file"); // Create extraction config with hierarchy extraction explicitly disabled let config = ExtractionConfig { pages: Some(PageConfig { extract_pages: true, ..Default::default() }), pdf_options: Some(PdfConfig { extract_images: false, extract_tables: true, passwords: None, extract_metadata: true, hierarchy: Some(HierarchyConfig { enabled: false, k_clusters: 6, include_bbox: true, ocr_coverage_threshold: None, }), extract_annotations: false, ..Default::default() }), ..Default::default() }; let result = extract_bytes(&pdf_bytes, "application/pdf", &config) .await .expect("PDF extraction failed"); // Verify that pages were extracted assert!(result.pages.is_some(), "Pages should be extracted"); let pages = result.pages.as_ref().expect("Operation failed"); assert!(!pages.is_empty(), "At least one page should be extracted"); // Check that the first page does NOT have hierarchy information when disabled let first_page = &pages[0]; assert!( first_page.hierarchy.is_none(), "First page should not have hierarchy when hierarchy extraction is disabled" ); eprintln!("Hierarchy with explicit disabled test passed!"); } /// Test hierarchy extraction with different cluster configurations. /// Note: This test is ignored due to Pdfium initialization constraints (can only initialize once). #[tokio::test] #[ignore] async fn test_hierarchy_different_k_clusters() { let pdf_path = "../../test_documents/pdf/embedded_images_tables.pdf"; if !Path::new(pdf_path).exists() { eprintln!("Test PDF not found at: {}", pdf_path); return; } let pdf_bytes = std::fs::read(pdf_path).expect("Failed to read PDF file"); // Test with different k values for k in &[2, 4, 6] { let config = ExtractionConfig { pages: Some(PageConfig { extract_pages: true, ..Default::default() }), pdf_options: Some(PdfConfig { extract_images: false, extract_tables: true, passwords: None, extract_metadata: true, hierarchy: Some(HierarchyConfig { enabled: true, k_clusters: *k, include_bbox: true, ocr_coverage_threshold: None, }), extract_annotations: false, ..Default::default() }), ..Default::default() }; let result = extract_bytes(&pdf_bytes, "application/pdf", &config) .await .expect("PDF extraction failed"); assert!(result.pages.is_some(), "Pages should be extracted"); let pages = result.pages.as_ref().expect("Operation failed"); assert!(!pages.is_empty(), "At least one page should be extracted"); let first_page = &pages[0]; assert!( first_page.hierarchy.is_some(), "Hierarchy should be present with k={}", k ); let hierarchy = first_page.hierarchy.as_ref().expect("Operation failed"); eprintln!("K={}: {} hierarchy blocks extracted", k, hierarchy.block_count); assert!(hierarchy.block_count > 0, "Should have blocks with k={}", k); } eprintln!("Different k_clusters test passed!"); }