310 lines
11 KiB
Rust
310 lines
11 KiB
Rust
//! Integration tests for PDF text hierarchy detection.
|
|
//!
|
|
//! Tests the extraction and detection of document hierarchy levels (H1-H6)
|
|
//! from PDF text using font size clustering and semantic analysis.
|
|
|
|
#![cfg(feature = "pdf")]
|
|
|
|
use kreuzberg::core::config::{ExtractionConfig, HierarchyConfig, PageConfig, PdfConfig};
|
|
use kreuzberg::extract_bytes;
|
|
use std::path::Path;
|
|
|
|
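// Note: Pdfium can only be initialized once, so ideally all tests in this file would run serially.
// Using tokio::test with a single-threaded runtime doesn't work well for that; the serial_test
// crate would be the proper fix, but it is not used in this file.
// For now, we accept that tests may run in parallel and handle the Pdfium initialization error
// (the additional tests below are marked #[ignore] because of the same constraint).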
|
|
|
|
/// Test full hierarchy extraction from a real PDF.
|
|
///
|
|
/// Loads a PDF from test data directory, extracts with hierarchy detection enabled,
|
|
/// and verifies that PageContent.hierarchy is properly populated with expected
|
|
/// blocks and hierarchy levels.
|
|
#[tokio::test]
|
|
async fn test_full_hierarchy_extraction() {
|
|
// Use the embedded_images_tables.pdf which has clear text structure
|
|
// Path is relative to workspace root, not crate root
|
|
let pdf_path = "../../test_documents/pdf/embedded_images_tables.pdf";
|
|
|
|
if !Path::new(pdf_path).exists() {
|
|
eprintln!("Test PDF not found at: {}", pdf_path);
|
|
// Skip the test if PDF doesn't exist
|
|
return;
|
|
}
|
|
|
|
let pdf_bytes = std::fs::read(pdf_path).expect("Failed to read PDF file");
|
|
|
|
// Create extraction config with hierarchy detection enabled
|
|
let config = ExtractionConfig {
|
|
pages: Some(PageConfig {
|
|
extract_pages: true,
|
|
..Default::default()
|
|
}),
|
|
pdf_options: Some(PdfConfig {
|
|
extract_images: false,
|
|
extract_tables: true,
|
|
passwords: None,
|
|
extract_metadata: true,
|
|
hierarchy: Some(HierarchyConfig {
|
|
enabled: true,
|
|
k_clusters: 6,
|
|
include_bbox: true,
|
|
ocr_coverage_threshold: None,
|
|
}),
|
|
extract_annotations: false,
|
|
..Default::default()
|
|
}),
|
|
..Default::default()
|
|
};
|
|
|
|
// Extract the PDF
|
|
let result = extract_bytes(&pdf_bytes, "application/pdf", &config)
|
|
.await
|
|
.expect("PDF extraction failed");
|
|
|
|
// Verify that pages were extracted
|
|
assert!(
|
|
result.pages.is_some(),
|
|
"Pages should be extracted when extract_pages is enabled"
|
|
);
|
|
|
|
let pages = result.pages.as_ref().expect("Operation failed");
|
|
assert!(!pages.is_empty(), "At least one page should be extracted");
|
|
|
|
// Check that the first page has hierarchy information
|
|
let first_page = &pages[0];
|
|
assert!(
|
|
first_page.hierarchy.is_some(),
|
|
"First page should have hierarchy information when hierarchy extraction is enabled"
|
|
);
|
|
|
|
let hierarchy = first_page.hierarchy.as_ref().expect("Operation failed");
|
|
|
|
// Verify hierarchy structure
|
|
assert!(hierarchy.block_count > 0, "Hierarchy should contain at least one block");
|
|
assert!(!hierarchy.blocks.is_empty(), "Hierarchy blocks should not be empty");
|
|
|
|
eprintln!("Extracted {} hierarchy blocks from page 1", hierarchy.block_count);
|
|
|
|
// Verify that we have multiple hierarchy levels
|
|
let levels: std::collections::HashSet<String> = hierarchy.blocks.iter().map(|b| b.level.clone()).collect();
|
|
|
|
eprintln!("Found hierarchy levels: {:?}", levels);
|
|
|
|
// Should have at least 1 level
|
|
assert!(!levels.is_empty(), "Should have at least one hierarchy level");
|
|
|
|
// Verify block structure
|
|
for block in &hierarchy.blocks {
|
|
assert!(!block.text.is_empty(), "Block text should not be empty");
|
|
assert!(block.font_size > 0.0, "Font size should be positive");
|
|
|
|
// Check that level is a valid heading level or body
|
|
let is_valid_level = matches!(block.level.as_str(), "h1" | "h2" | "h3" | "h4" | "h5" | "h6" | "body");
|
|
assert!(is_valid_level, "Invalid hierarchy level: {}", block.level);
|
|
|
|
// Verify bounding box if present
|
|
if let Some((left, top, right, bottom)) = block.bbox {
|
|
assert!(left < right, "Bounding box left should be less than right");
|
|
assert!(top < bottom, "Bounding box top should be less than bottom");
|
|
assert!(
|
|
left >= 0.0 && top >= 0.0,
|
|
"Bounding box coordinates should be non-negative"
|
|
);
|
|
eprintln!(
|
|
"Block '{}' (level: {}, font_size: {}) bbox: ({}, {}, {}, {})",
|
|
block.text.chars().take(30).collect::<String>(),
|
|
block.level,
|
|
block.font_size,
|
|
left,
|
|
top,
|
|
right,
|
|
bottom
|
|
);
|
|
} else {
|
|
eprintln!(
|
|
"Block '{}' (level: {}, font_size: {}) no bbox",
|
|
block.text.chars().take(30).collect::<String>(),
|
|
block.level,
|
|
block.font_size
|
|
);
|
|
}
|
|
}
|
|
|
|
eprintln!("Hierarchy extraction test passed!");
|
|
}
|
|
|
|
/// Test that hierarchy extraction respects the enabled flag.
|
|
/// Note: This test is combined with the full_hierarchy_extraction test due to Pdfium initialization constraints.
|
|
#[tokio::test]
|
|
#[ignore]
|
|
async fn test_hierarchy_disabled() {
|
|
let pdf_path = "../../test_documents/pdf/embedded_images_tables.pdf";
|
|
|
|
if !Path::new(pdf_path).exists() {
|
|
eprintln!("Test PDF not found at: {}", pdf_path);
|
|
return;
|
|
}
|
|
|
|
let pdf_bytes = std::fs::read(pdf_path).expect("Failed to read PDF file");
|
|
|
|
// Create extraction config with hierarchy detection disabled
|
|
let config = ExtractionConfig {
|
|
pages: Some(PageConfig {
|
|
extract_pages: true,
|
|
..Default::default()
|
|
}),
|
|
pdf_options: Some(PdfConfig {
|
|
extract_images: false,
|
|
extract_tables: true,
|
|
passwords: None,
|
|
extract_metadata: true,
|
|
hierarchy: Some(HierarchyConfig {
|
|
enabled: false,
|
|
k_clusters: 6,
|
|
include_bbox: true,
|
|
ocr_coverage_threshold: None,
|
|
}),
|
|
extract_annotations: false,
|
|
..Default::default()
|
|
}),
|
|
..Default::default()
|
|
};
|
|
|
|
let result = extract_bytes(&pdf_bytes, "application/pdf", &config)
|
|
.await
|
|
.expect("PDF extraction failed");
|
|
|
|
// Verify that pages were extracted
|
|
assert!(result.pages.is_some(), "Pages should be extracted");
|
|
|
|
let pages = result.pages.as_ref().expect("Operation failed");
|
|
assert!(!pages.is_empty(), "At least one page should be extracted");
|
|
|
|
// Check that the first page does NOT have hierarchy information when disabled
|
|
let first_page = &pages[0];
|
|
assert!(
|
|
first_page.hierarchy.is_none(),
|
|
"First page should not have hierarchy when hierarchy extraction is disabled"
|
|
);
|
|
|
|
eprintln!("Hierarchy disabled test passed!");
|
|
}
|
|
|
|
/// Test different hierarchy configurations
|
|
/// Note: This test is ignored due to Pdfium initialization constraints (can only initialize once).
|
|
#[tokio::test]
|
|
#[ignore]
|
|
async fn test_hierarchy_with_explicit_disabled() {
|
|
let pdf_path = "../../test_documents/pdf/embedded_images_tables.pdf";
|
|
|
|
if !Path::new(pdf_path).exists() {
|
|
eprintln!("Test PDF not found at: {}", pdf_path);
|
|
return;
|
|
}
|
|
|
|
let pdf_bytes = std::fs::read(pdf_path).expect("Failed to read PDF file");
|
|
|
|
// Create extraction config with hierarchy extraction explicitly disabled
|
|
let config = ExtractionConfig {
|
|
pages: Some(PageConfig {
|
|
extract_pages: true,
|
|
..Default::default()
|
|
}),
|
|
pdf_options: Some(PdfConfig {
|
|
extract_images: false,
|
|
extract_tables: true,
|
|
passwords: None,
|
|
extract_metadata: true,
|
|
hierarchy: Some(HierarchyConfig {
|
|
enabled: false,
|
|
k_clusters: 6,
|
|
include_bbox: true,
|
|
ocr_coverage_threshold: None,
|
|
}),
|
|
extract_annotations: false,
|
|
..Default::default()
|
|
}),
|
|
..Default::default()
|
|
};
|
|
|
|
let result = extract_bytes(&pdf_bytes, "application/pdf", &config)
|
|
.await
|
|
.expect("PDF extraction failed");
|
|
|
|
// Verify that pages were extracted
|
|
assert!(result.pages.is_some(), "Pages should be extracted");
|
|
|
|
let pages = result.pages.as_ref().expect("Operation failed");
|
|
assert!(!pages.is_empty(), "At least one page should be extracted");
|
|
|
|
// Check that the first page does NOT have hierarchy information when disabled
|
|
let first_page = &pages[0];
|
|
assert!(
|
|
first_page.hierarchy.is_none(),
|
|
"First page should not have hierarchy when hierarchy extraction is disabled"
|
|
);
|
|
|
|
eprintln!("Hierarchy with explicit disabled test passed!");
|
|
}
|
|
|
|
/// Test hierarchy extraction with different cluster configurations.
|
|
/// Note: This test is ignored due to Pdfium initialization constraints (can only initialize once).
|
|
#[tokio::test]
|
|
#[ignore]
|
|
async fn test_hierarchy_different_k_clusters() {
|
|
let pdf_path = "../../test_documents/pdf/embedded_images_tables.pdf";
|
|
|
|
if !Path::new(pdf_path).exists() {
|
|
eprintln!("Test PDF not found at: {}", pdf_path);
|
|
return;
|
|
}
|
|
|
|
let pdf_bytes = std::fs::read(pdf_path).expect("Failed to read PDF file");
|
|
|
|
// Test with different k values
|
|
for k in &[2, 4, 6] {
|
|
let config = ExtractionConfig {
|
|
pages: Some(PageConfig {
|
|
extract_pages: true,
|
|
..Default::default()
|
|
}),
|
|
pdf_options: Some(PdfConfig {
|
|
extract_images: false,
|
|
extract_tables: true,
|
|
passwords: None,
|
|
extract_metadata: true,
|
|
hierarchy: Some(HierarchyConfig {
|
|
enabled: true,
|
|
k_clusters: *k,
|
|
include_bbox: true,
|
|
ocr_coverage_threshold: None,
|
|
}),
|
|
extract_annotations: false,
|
|
..Default::default()
|
|
}),
|
|
..Default::default()
|
|
};
|
|
|
|
let result = extract_bytes(&pdf_bytes, "application/pdf", &config)
|
|
.await
|
|
.expect("PDF extraction failed");
|
|
|
|
assert!(result.pages.is_some(), "Pages should be extracted");
|
|
|
|
let pages = result.pages.as_ref().expect("Operation failed");
|
|
assert!(!pages.is_empty(), "At least one page should be extracted");
|
|
|
|
let first_page = &pages[0];
|
|
assert!(
|
|
first_page.hierarchy.is_some(),
|
|
"Hierarchy should be present with k={}",
|
|
k
|
|
);
|
|
|
|
let hierarchy = first_page.hierarchy.as_ref().expect("Operation failed");
|
|
eprintln!("K={}: {} hierarchy blocks extracted", k, hierarchy.block_count);
|
|
assert!(hierarchy.block_count > 0, "Should have blocks with k={}", k);
|
|
}
|
|
|
|
eprintln!("Different k_clusters test passed!");
|
|
}
|