Files
fil/crates/kreuzberg/tests/pdf_hierarchy_detection.rs
Henrik Jess Nielsen b4c07d3693
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s
Nomad changes
2026-06-01 23:40:55 +02:00

310 lines
11 KiB
Rust

//! Integration tests for PDF text hierarchy detection.
//!
//! Tests the extraction and detection of document hierarchy levels (H1-H6)
//! from PDF text using font size clustering and semantic analysis.
#![cfg(feature = "pdf")]
use kreuzberg::core::config::{ExtractionConfig, HierarchyConfig, PageConfig, PdfConfig};
use kreuzberg::extract_bytes;
use std::path::Path;
// Note: Pdfium can only be initialized once, so these tests should not run concurrently.
// Using tokio::test with a single-threaded runtime doesn't work well for this, and the
// serial_test crate (which could serialize them) is not used in this file.
// For now, only `test_full_hierarchy_extraction` runs by default; the remaining tests are
// marked `#[ignore]` to avoid the Pdfium initialization error.
/// Verifies end-to-end hierarchy extraction on a real PDF fixture.
///
/// Reads `embedded_images_tables.pdf`, runs `extract_bytes` with page extraction and
/// hierarchy detection enabled (`k_clusters = 6`, bounding boxes requested), and checks
/// that the first page's `hierarchy` is populated with well-formed blocks:
/// non-empty text, positive font size, a level in `h1`..`h6` or `body`, and — when a
/// bounding box is present — non-negative, properly ordered coordinates.
///
/// If the fixture file is missing the test logs a message and returns early, which the
/// harness reports as a pass.
///
/// # Panics
///
/// Panics if the fixture cannot be read, extraction fails, or any check fails.
#[tokio::test]
async fn test_full_hierarchy_extraction() {
    // Relative path, resolved against the test process's working directory.
    // NOTE(review): presumably the tests run from the crate directory, so `../../`
    // lands on the workspace root — confirm against the build setup.
    let pdf_path = "../../test_documents/pdf/embedded_images_tables.pdf";
    if !Path::new(pdf_path).exists() {
        // Missing fixture: skip instead of failing.
        eprintln!("Test PDF not found at: {}", pdf_path);
        return;
    }
    let pdf_bytes = std::fs::read(pdf_path).expect("Failed to read PDF file");

    // Page extraction plus hierarchy detection with six clusters and bounding boxes.
    let config = ExtractionConfig {
        pages: Some(PageConfig {
            extract_pages: true,
            ..Default::default()
        }),
        pdf_options: Some(PdfConfig {
            extract_images: false,
            extract_tables: true,
            passwords: None,
            extract_metadata: true,
            hierarchy: Some(HierarchyConfig {
                enabled: true,
                k_clusters: 6,
                include_bbox: true,
                ocr_coverage_threshold: None,
            }),
            extract_annotations: false,
            ..Default::default()
        }),
        ..Default::default()
    };

    let result = extract_bytes(&pdf_bytes, "application/pdf", &config)
        .await
        .expect("PDF extraction failed");

    // Each `expect` carries the descriptive failure message directly, so a missing
    // value is reported with the reason rather than a generic message.
    let pages = result
        .pages
        .as_ref()
        .expect("Pages should be extracted when extract_pages is enabled");
    let first_page = pages.first().expect("At least one page should be extracted");
    let hierarchy = first_page
        .hierarchy
        .as_ref()
        .expect("First page should have hierarchy information when hierarchy extraction is enabled");

    // The hierarchy must actually contain blocks.
    assert!(hierarchy.block_count > 0, "Hierarchy should contain at least one block");
    assert!(!hierarchy.blocks.is_empty(), "Hierarchy blocks should not be empty");
    eprintln!("Extracted {} hierarchy blocks from page 1", hierarchy.block_count);

    // Distinct levels seen on the page; borrowed, so no per-block String clone.
    let levels: std::collections::HashSet<&str> = hierarchy.blocks.iter().map(|b| b.level.as_str()).collect();
    eprintln!("Found hierarchy levels: {:?}", levels);
    assert!(!levels.is_empty(), "Should have at least one hierarchy level");

    for block in &hierarchy.blocks {
        assert!(!block.text.is_empty(), "Block text should not be empty");
        assert!(block.font_size > 0.0, "Font size should be positive");

        // Only the six heading levels and "body" are accepted.
        let is_valid_level = matches!(block.level.as_str(), "h1" | "h2" | "h3" | "h4" | "h5" | "h6" | "body");
        assert!(is_valid_level, "Invalid hierarchy level: {}", block.level);

        // First 30 characters of the block text, used only for log output.
        let preview: String = block.text.chars().take(30).collect();

        match block.bbox {
            Some((left, top, right, bottom)) => {
                assert!(left < right, "Bounding box left should be less than right");
                assert!(top < bottom, "Bounding box top should be less than bottom");
                assert!(
                    left >= 0.0 && top >= 0.0,
                    "Bounding box coordinates should be non-negative"
                );
                eprintln!(
                    "Block '{}' (level: {}, font_size: {}) bbox: ({}, {}, {}, {})",
                    preview, block.level, block.font_size, left, top, right, bottom
                );
            }
            None => {
                eprintln!(
                    "Block '{}' (level: {}, font_size: {}) no bbox",
                    preview, block.level, block.font_size
                );
            }
        }
    }
    eprintln!("Hierarchy extraction test passed!");
}
/// Verifies that hierarchy extraction respects the `enabled` flag.
///
/// Runs `extract_bytes` on `embedded_images_tables.pdf` with page extraction on and
/// `HierarchyConfig::enabled` set to `false`, then checks that the first page carries
/// no `hierarchy`.
///
/// The test is ignored by default because of Pdfium initialization constraints.
/// If the fixture file is missing the test logs a message and returns early.
///
/// # Panics
///
/// Panics if the fixture cannot be read, extraction fails, or any check fails.
#[tokio::test]
#[ignore = "Pdfium initialization constraints (can only initialize once)"]
async fn test_hierarchy_disabled() {
    let pdf_path = "../../test_documents/pdf/embedded_images_tables.pdf";
    if !Path::new(pdf_path).exists() {
        // Missing fixture: skip instead of failing.
        eprintln!("Test PDF not found at: {}", pdf_path);
        return;
    }
    let pdf_bytes = std::fs::read(pdf_path).expect("Failed to read PDF file");

    // Same settings as the enabled case, except hierarchy detection is switched off.
    let config = ExtractionConfig {
        pages: Some(PageConfig {
            extract_pages: true,
            ..Default::default()
        }),
        pdf_options: Some(PdfConfig {
            extract_images: false,
            extract_tables: true,
            passwords: None,
            extract_metadata: true,
            hierarchy: Some(HierarchyConfig {
                enabled: false,
                k_clusters: 6,
                include_bbox: true,
                ocr_coverage_threshold: None,
            }),
            extract_annotations: false,
            ..Default::default()
        }),
        ..Default::default()
    };

    let result = extract_bytes(&pdf_bytes, "application/pdf", &config)
        .await
        .expect("PDF extraction failed");

    // Pages must still be produced even though hierarchy detection is off.
    let pages = result.pages.as_ref().expect("Pages should be extracted");
    let first_page = pages.first().expect("At least one page should be extracted");

    // With detection disabled, the page must not carry hierarchy data.
    assert!(
        first_page.hierarchy.is_none(),
        "First page should not have hierarchy when hierarchy extraction is disabled"
    );
    eprintln!("Hierarchy disabled test passed!");
}
/// Verifies that an explicitly disabled hierarchy configuration yields no hierarchy.
///
/// Supplies a full `HierarchyConfig` with `enabled: false` (while still setting
/// `k_clusters` and `include_bbox`) and checks that the first extracted page has no
/// `hierarchy`. As written, this uses the same configuration and assertions as
/// `test_hierarchy_disabled`; only the final log message differs.
///
/// The test is ignored by default because of Pdfium initialization constraints.
/// If the fixture file is missing the test logs a message and returns early.
///
/// # Panics
///
/// Panics if the fixture cannot be read, extraction fails, or any check fails.
#[tokio::test]
#[ignore = "Pdfium initialization constraints (can only initialize once)"]
async fn test_hierarchy_with_explicit_disabled() {
    let pdf_path = "../../test_documents/pdf/embedded_images_tables.pdf";
    if !Path::new(pdf_path).exists() {
        // Missing fixture: skip instead of failing.
        eprintln!("Test PDF not found at: {}", pdf_path);
        return;
    }
    let pdf_bytes = std::fs::read(pdf_path).expect("Failed to read PDF file");

    // Hierarchy options are fully specified, but the feature itself is turned off.
    let config = ExtractionConfig {
        pages: Some(PageConfig {
            extract_pages: true,
            ..Default::default()
        }),
        pdf_options: Some(PdfConfig {
            extract_images: false,
            extract_tables: true,
            passwords: None,
            extract_metadata: true,
            hierarchy: Some(HierarchyConfig {
                enabled: false,
                k_clusters: 6,
                include_bbox: true,
                ocr_coverage_threshold: None,
            }),
            extract_annotations: false,
            ..Default::default()
        }),
        ..Default::default()
    };

    let result = extract_bytes(&pdf_bytes, "application/pdf", &config)
        .await
        .expect("PDF extraction failed");

    // Pages must still be produced even though hierarchy detection is off.
    let pages = result.pages.as_ref().expect("Pages should be extracted");
    let first_page = pages.first().expect("At least one page should be extracted");

    // The disabled flag must win over the other hierarchy options.
    assert!(
        first_page.hierarchy.is_none(),
        "First page should not have hierarchy when hierarchy extraction is disabled"
    );
    eprintln!("Hierarchy with explicit disabled test passed!");
}
/// Verifies hierarchy extraction across several cluster counts.
///
/// For each `k_clusters` value in `[2, 4, 6]`, runs `extract_bytes` on
/// `embedded_images_tables.pdf` with hierarchy detection enabled and checks that the
/// first page has a `hierarchy` containing at least one block.
///
/// The test is ignored by default because of Pdfium initialization constraints.
/// If the fixture file is missing the test logs a message and returns early.
///
/// # Panics
///
/// Panics if the fixture cannot be read, extraction fails, or any check fails.
#[tokio::test]
#[ignore = "Pdfium initialization constraints (can only initialize once)"]
async fn test_hierarchy_different_k_clusters() {
    let pdf_path = "../../test_documents/pdf/embedded_images_tables.pdf";
    if !Path::new(pdf_path).exists() {
        // Missing fixture: skip instead of failing.
        eprintln!("Test PDF not found at: {}", pdf_path);
        return;
    }
    let pdf_bytes = std::fs::read(pdf_path).expect("Failed to read PDF file");

    // Iterate the cluster counts by value; the same PDF bytes are reused for every run.
    for k in [2, 4, 6] {
        let config = ExtractionConfig {
            pages: Some(PageConfig {
                extract_pages: true,
                ..Default::default()
            }),
            pdf_options: Some(PdfConfig {
                extract_images: false,
                extract_tables: true,
                passwords: None,
                extract_metadata: true,
                hierarchy: Some(HierarchyConfig {
                    enabled: true,
                    k_clusters: k,
                    include_bbox: true,
                    ocr_coverage_threshold: None,
                }),
                extract_annotations: false,
                ..Default::default()
            }),
            ..Default::default()
        };

        let result = extract_bytes(&pdf_bytes, "application/pdf", &config)
            .await
            .expect("PDF extraction failed");

        let pages = result.pages.as_ref().expect("Pages should be extracted");
        let first_page = pages.first().expect("At least one page should be extracted");

        // The failure message includes `k`, so it is only formatted on the failure path.
        let hierarchy = first_page
            .hierarchy
            .as_ref()
            .unwrap_or_else(|| panic!("Hierarchy should be present with k={}", k));

        eprintln!("K={}: {} hierarchy blocks extracted", k, hierarchy.block_count);
        assert!(hierarchy.block_count > 0, "Should have blocks with k={}", k);
    }
    eprintln!("Different k_clusters test passed!");
}