This commit is contained in:
309
crates/kreuzberg/tests/pdf_hierarchy_detection.rs
Normal file
309
crates/kreuzberg/tests/pdf_hierarchy_detection.rs
Normal file
@@ -0,0 +1,309 @@
|
||||
//! Integration tests for PDF text hierarchy detection.
|
||||
//!
|
||||
//! Tests the extraction and detection of document hierarchy levels (H1-H6)
|
||||
//! from PDF text using font size clustering and semantic analysis.
|
||||
|
||||
#![cfg(feature = "pdf")]
|
||||
|
||||
use kreuzberg::core::config::{ExtractionConfig, HierarchyConfig, PageConfig, PdfConfig};
|
||||
use kreuzberg::extract_bytes;
|
||||
use std::path::Path;
|
||||
|
||||
// Note: All tests must run serially because Pdfium can only be initialized once.
|
||||
// Using tokio::test with single_threaded doesn't work well, so we use the serial_test crate.
|
||||
// For now, we'll just accept that tests run in parallel but handle the Pdfium initialization error.
|
||||
|
||||
/// Test full hierarchy extraction from a real PDF.
|
||||
///
|
||||
/// Loads a PDF from test data directory, extracts with hierarchy detection enabled,
|
||||
/// and verifies that PageContent.hierarchy is properly populated with expected
|
||||
/// blocks and hierarchy levels.
|
||||
#[tokio::test]
|
||||
async fn test_full_hierarchy_extraction() {
|
||||
// Use the embedded_images_tables.pdf which has clear text structure
|
||||
// Path is relative to workspace root, not crate root
|
||||
let pdf_path = "../../test_documents/pdf/embedded_images_tables.pdf";
|
||||
|
||||
if !Path::new(pdf_path).exists() {
|
||||
eprintln!("Test PDF not found at: {}", pdf_path);
|
||||
// Skip the test if PDF doesn't exist
|
||||
return;
|
||||
}
|
||||
|
||||
let pdf_bytes = std::fs::read(pdf_path).expect("Failed to read PDF file");
|
||||
|
||||
// Create extraction config with hierarchy detection enabled
|
||||
let config = ExtractionConfig {
|
||||
pages: Some(PageConfig {
|
||||
extract_pages: true,
|
||||
..Default::default()
|
||||
}),
|
||||
pdf_options: Some(PdfConfig {
|
||||
extract_images: false,
|
||||
extract_tables: true,
|
||||
passwords: None,
|
||||
extract_metadata: true,
|
||||
hierarchy: Some(HierarchyConfig {
|
||||
enabled: true,
|
||||
k_clusters: 6,
|
||||
include_bbox: true,
|
||||
ocr_coverage_threshold: None,
|
||||
}),
|
||||
extract_annotations: false,
|
||||
..Default::default()
|
||||
}),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
// Extract the PDF
|
||||
let result = extract_bytes(&pdf_bytes, "application/pdf", &config)
|
||||
.await
|
||||
.expect("PDF extraction failed");
|
||||
|
||||
// Verify that pages were extracted
|
||||
assert!(
|
||||
result.pages.is_some(),
|
||||
"Pages should be extracted when extract_pages is enabled"
|
||||
);
|
||||
|
||||
let pages = result.pages.as_ref().expect("Operation failed");
|
||||
assert!(!pages.is_empty(), "At least one page should be extracted");
|
||||
|
||||
// Check that the first page has hierarchy information
|
||||
let first_page = &pages[0];
|
||||
assert!(
|
||||
first_page.hierarchy.is_some(),
|
||||
"First page should have hierarchy information when hierarchy extraction is enabled"
|
||||
);
|
||||
|
||||
let hierarchy = first_page.hierarchy.as_ref().expect("Operation failed");
|
||||
|
||||
// Verify hierarchy structure
|
||||
assert!(hierarchy.block_count > 0, "Hierarchy should contain at least one block");
|
||||
assert!(!hierarchy.blocks.is_empty(), "Hierarchy blocks should not be empty");
|
||||
|
||||
eprintln!("Extracted {} hierarchy blocks from page 1", hierarchy.block_count);
|
||||
|
||||
// Verify that we have multiple hierarchy levels
|
||||
let levels: std::collections::HashSet<String> = hierarchy.blocks.iter().map(|b| b.level.clone()).collect();
|
||||
|
||||
eprintln!("Found hierarchy levels: {:?}", levels);
|
||||
|
||||
// Should have at least 1 level
|
||||
assert!(!levels.is_empty(), "Should have at least one hierarchy level");
|
||||
|
||||
// Verify block structure
|
||||
for block in &hierarchy.blocks {
|
||||
assert!(!block.text.is_empty(), "Block text should not be empty");
|
||||
assert!(block.font_size > 0.0, "Font size should be positive");
|
||||
|
||||
// Check that level is a valid heading level or body
|
||||
let is_valid_level = matches!(block.level.as_str(), "h1" | "h2" | "h3" | "h4" | "h5" | "h6" | "body");
|
||||
assert!(is_valid_level, "Invalid hierarchy level: {}", block.level);
|
||||
|
||||
// Verify bounding box if present
|
||||
if let Some((left, top, right, bottom)) = block.bbox {
|
||||
assert!(left < right, "Bounding box left should be less than right");
|
||||
assert!(top < bottom, "Bounding box top should be less than bottom");
|
||||
assert!(
|
||||
left >= 0.0 && top >= 0.0,
|
||||
"Bounding box coordinates should be non-negative"
|
||||
);
|
||||
eprintln!(
|
||||
"Block '{}' (level: {}, font_size: {}) bbox: ({}, {}, {}, {})",
|
||||
block.text.chars().take(30).collect::<String>(),
|
||||
block.level,
|
||||
block.font_size,
|
||||
left,
|
||||
top,
|
||||
right,
|
||||
bottom
|
||||
);
|
||||
} else {
|
||||
eprintln!(
|
||||
"Block '{}' (level: {}, font_size: {}) no bbox",
|
||||
block.text.chars().take(30).collect::<String>(),
|
||||
block.level,
|
||||
block.font_size
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
eprintln!("Hierarchy extraction test passed!");
|
||||
}
|
||||
|
||||
/// Test that hierarchy extraction respects the enabled flag.
|
||||
/// Note: This test is combined with the full_hierarchy_extraction test due to Pdfium initialization constraints.
|
||||
#[tokio::test]
|
||||
#[ignore]
|
||||
async fn test_hierarchy_disabled() {
|
||||
let pdf_path = "../../test_documents/pdf/embedded_images_tables.pdf";
|
||||
|
||||
if !Path::new(pdf_path).exists() {
|
||||
eprintln!("Test PDF not found at: {}", pdf_path);
|
||||
return;
|
||||
}
|
||||
|
||||
let pdf_bytes = std::fs::read(pdf_path).expect("Failed to read PDF file");
|
||||
|
||||
// Create extraction config with hierarchy detection disabled
|
||||
let config = ExtractionConfig {
|
||||
pages: Some(PageConfig {
|
||||
extract_pages: true,
|
||||
..Default::default()
|
||||
}),
|
||||
pdf_options: Some(PdfConfig {
|
||||
extract_images: false,
|
||||
extract_tables: true,
|
||||
passwords: None,
|
||||
extract_metadata: true,
|
||||
hierarchy: Some(HierarchyConfig {
|
||||
enabled: false,
|
||||
k_clusters: 6,
|
||||
include_bbox: true,
|
||||
ocr_coverage_threshold: None,
|
||||
}),
|
||||
extract_annotations: false,
|
||||
..Default::default()
|
||||
}),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let result = extract_bytes(&pdf_bytes, "application/pdf", &config)
|
||||
.await
|
||||
.expect("PDF extraction failed");
|
||||
|
||||
// Verify that pages were extracted
|
||||
assert!(result.pages.is_some(), "Pages should be extracted");
|
||||
|
||||
let pages = result.pages.as_ref().expect("Operation failed");
|
||||
assert!(!pages.is_empty(), "At least one page should be extracted");
|
||||
|
||||
// Check that the first page does NOT have hierarchy information when disabled
|
||||
let first_page = &pages[0];
|
||||
assert!(
|
||||
first_page.hierarchy.is_none(),
|
||||
"First page should not have hierarchy when hierarchy extraction is disabled"
|
||||
);
|
||||
|
||||
eprintln!("Hierarchy disabled test passed!");
|
||||
}
|
||||
|
||||
/// Test different hierarchy configurations
|
||||
/// Note: This test is ignored due to Pdfium initialization constraints (can only initialize once).
|
||||
#[tokio::test]
|
||||
#[ignore]
|
||||
async fn test_hierarchy_with_explicit_disabled() {
|
||||
let pdf_path = "../../test_documents/pdf/embedded_images_tables.pdf";
|
||||
|
||||
if !Path::new(pdf_path).exists() {
|
||||
eprintln!("Test PDF not found at: {}", pdf_path);
|
||||
return;
|
||||
}
|
||||
|
||||
let pdf_bytes = std::fs::read(pdf_path).expect("Failed to read PDF file");
|
||||
|
||||
// Create extraction config with hierarchy extraction explicitly disabled
|
||||
let config = ExtractionConfig {
|
||||
pages: Some(PageConfig {
|
||||
extract_pages: true,
|
||||
..Default::default()
|
||||
}),
|
||||
pdf_options: Some(PdfConfig {
|
||||
extract_images: false,
|
||||
extract_tables: true,
|
||||
passwords: None,
|
||||
extract_metadata: true,
|
||||
hierarchy: Some(HierarchyConfig {
|
||||
enabled: false,
|
||||
k_clusters: 6,
|
||||
include_bbox: true,
|
||||
ocr_coverage_threshold: None,
|
||||
}),
|
||||
extract_annotations: false,
|
||||
..Default::default()
|
||||
}),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let result = extract_bytes(&pdf_bytes, "application/pdf", &config)
|
||||
.await
|
||||
.expect("PDF extraction failed");
|
||||
|
||||
// Verify that pages were extracted
|
||||
assert!(result.pages.is_some(), "Pages should be extracted");
|
||||
|
||||
let pages = result.pages.as_ref().expect("Operation failed");
|
||||
assert!(!pages.is_empty(), "At least one page should be extracted");
|
||||
|
||||
// Check that the first page does NOT have hierarchy information when disabled
|
||||
let first_page = &pages[0];
|
||||
assert!(
|
||||
first_page.hierarchy.is_none(),
|
||||
"First page should not have hierarchy when hierarchy extraction is disabled"
|
||||
);
|
||||
|
||||
eprintln!("Hierarchy with explicit disabled test passed!");
|
||||
}
|
||||
|
||||
/// Test hierarchy extraction with different cluster configurations.
|
||||
/// Note: This test is ignored due to Pdfium initialization constraints (can only initialize once).
|
||||
#[tokio::test]
|
||||
#[ignore]
|
||||
async fn test_hierarchy_different_k_clusters() {
|
||||
let pdf_path = "../../test_documents/pdf/embedded_images_tables.pdf";
|
||||
|
||||
if !Path::new(pdf_path).exists() {
|
||||
eprintln!("Test PDF not found at: {}", pdf_path);
|
||||
return;
|
||||
}
|
||||
|
||||
let pdf_bytes = std::fs::read(pdf_path).expect("Failed to read PDF file");
|
||||
|
||||
// Test with different k values
|
||||
for k in &[2, 4, 6] {
|
||||
let config = ExtractionConfig {
|
||||
pages: Some(PageConfig {
|
||||
extract_pages: true,
|
||||
..Default::default()
|
||||
}),
|
||||
pdf_options: Some(PdfConfig {
|
||||
extract_images: false,
|
||||
extract_tables: true,
|
||||
passwords: None,
|
||||
extract_metadata: true,
|
||||
hierarchy: Some(HierarchyConfig {
|
||||
enabled: true,
|
||||
k_clusters: *k,
|
||||
include_bbox: true,
|
||||
ocr_coverage_threshold: None,
|
||||
}),
|
||||
extract_annotations: false,
|
||||
..Default::default()
|
||||
}),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let result = extract_bytes(&pdf_bytes, "application/pdf", &config)
|
||||
.await
|
||||
.expect("PDF extraction failed");
|
||||
|
||||
assert!(result.pages.is_some(), "Pages should be extracted");
|
||||
|
||||
let pages = result.pages.as_ref().expect("Operation failed");
|
||||
assert!(!pages.is_empty(), "At least one page should be extracted");
|
||||
|
||||
let first_page = &pages[0];
|
||||
assert!(
|
||||
first_page.hierarchy.is_some(),
|
||||
"Hierarchy should be present with k={}",
|
||||
k
|
||||
);
|
||||
|
||||
let hierarchy = first_page.hierarchy.as_ref().expect("Operation failed");
|
||||
eprintln!("K={}: {} hierarchy blocks extracted", k, hierarchy.block_count);
|
||||
assert!(hierarchy.block_count > 0, "Should have blocks with k={}", k);
|
||||
}
|
||||
|
||||
eprintln!("Different k_clusters test passed!");
|
||||
}
|
||||
Reference in New Issue
Block a user