Files
fil/crates/kreuzberg/tests/pdf_hierarchy_quality.rs

603 lines
19 KiB
Rust
Raw Normal View History

2026-06-01 23:40:55 +02:00
//! TODO: Restored from 245539484 alef-migration cleanup. Currently exercises
//! pub(crate) APIs that the migration deliberately narrowed; gated until
//! either (a) these APIs are re-exposed publicly, or (b) the test is
//! rewritten against the public extraction surface.
#![cfg(any())]
// Original content preserved below; recompiled once gating cfg drops.
// Disabled by the file-level cfg(any()) above.
/*
//! PDF hierarchy quality assessment tests.
//!
//! This module tests PDF text hierarchy extraction quality by comparing against ground truth annotations.
//! Measures precision, recall, F1 score, and level accuracy to ensure the hierarchy detection
//! algorithm works well on real document structures.
//!
//! Test philosophy:
//! - Define ground truth hierarchies for representative PDF documents
//! - Measure how well extracted hierarchies match ground truth
//! - Assert minimum quality thresholds for precision/recall/F1
//! - Verify correct hierarchy level assignments
#![cfg(feature = "pdf")]
use kreuzberg::pdf::hierarchy::{
BoundingBox, HierarchyLevel, KMeansResult, TextBlock, assign_hierarchy_levels,
assign_hierarchy_levels_from_clusters, cluster_font_sizes,
};
use serde::{Deserialize, Serialize};
use std::fs;
use std::path::Path;
/// A bounding box annotation from ground truth.
///
/// Holds the same four `f32` edges that `to_bbox` copies into kreuzberg's `BoundingBox`.
/// NOTE(review): the coordinate origin and units are not established in this file;
/// confirm against the annotation data.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct GroundTruthBBox {
/// Left edge of the box.
left: f32,
/// Top edge of the box.
top: f32,
/// Right edge of the box.
right: f32,
/// Bottom edge of the box.
bottom: f32,
}
impl GroundTruthBBox {
/// Convert to kreuzberg BoundingBox
fn to_bbox(&self) -> BoundingBox {
BoundingBox {
left: self.left,
top: self.top,
right: self.right,
bottom: self.bottom,
}
}
}
/// A ground truth text block with hierarchy level annotation.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct GroundTruthBlock {
/// Text content of the annotated block.
text: String,
/// Hierarchy label as a string. `parse_level` maps "H1" through "H6" to the matching
/// heading level and treats every other value as body text.
level: String,
/// Location of the block; compared against extracted boxes via `to_bbox`.
bbox: GroundTruthBBox,
}
/// A page of ground truth annotations.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct GroundTruthPage {
/// Page number as recorded in the JSON; only used for log output in this file.
/// NOTE(review): whether numbering is 0- or 1-based is not established here.
page_number: u32,
/// Annotated blocks on this page; metrics are computed per page over this list.
blocks: Vec<GroundTruthBlock>,
}
/// A document with ground truth hierarchy annotations.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct GroundTruthDocument {
/// Name of the annotated PDF as recorded in the JSON. This file only prints it;
/// the PDF itself is never opened here.
pdf_file: String,
/// Annotated pages of the document.
pages: Vec<GroundTruthPage>,
}
/// Root structure for ground truth JSON file.
///
/// This is the type `load_ground_truth` deserializes the whole file into.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct GroundTruthFile {
/// Every annotated document contained in the file.
documents: Vec<GroundTruthDocument>,
}
/// Quality metrics for hierarchy extraction.
///
/// All ratio fields are fractions in the range 0.0 to 1.0 (not percentages) and are
/// reported as 0.0 when their denominator is zero; see `QualityMetrics::new`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct QualityMetrics {
/// Precision: (correctly identified hierarchies) / (total extracted hierarchies),
/// i.e. `true_positives / (true_positives + false_positives)`.
pub precision: f64,
/// Recall: (correctly identified hierarchies) / (total ground truth hierarchies),
/// i.e. `true_positives / (true_positives + false_negatives)`.
pub recall: f64,
/// F1 Score: harmonic mean of precision and recall
pub f1_score: f64,
/// Level accuracy: fraction of blocks assigned to the correct hierarchy level,
/// i.e. `correct_levels / total_blocks`.
pub level_accuracy: f64,
/// Number of correctly identified hierarchy blocks
pub true_positives: usize,
/// Number of incorrectly extracted hierarchy blocks
pub false_positives: usize,
/// Number of missed ground truth hierarchy blocks
pub false_negatives: usize,
/// Number of blocks with correct hierarchy level
pub correct_levels: usize,
/// Total number of blocks evaluated (the denominator of `level_accuracy`)
pub total_blocks: usize,
}
impl QualityMetrics {
/// Create new quality metrics from test results.
fn new(
true_positives: usize,
false_positives: usize,
false_negatives: usize,
correct_levels: usize,
total_blocks: usize,
) -> Self {
let precision = if true_positives + false_positives > 0 {
true_positives as f64 / (true_positives + false_positives) as f64
} else {
0.0
};
let recall = if true_positives + false_negatives > 0 {
true_positives as f64 / (true_positives + false_negatives) as f64
} else {
0.0
};
let f1_score = if precision + recall > 0.0 {
2.0 * precision * recall / (precision + recall)
} else {
0.0
};
let level_accuracy = if total_blocks > 0 {
correct_levels as f64 / total_blocks as f64
} else {
0.0
};
Self {
precision,
recall,
f1_score,
level_accuracy,
true_positives,
false_positives,
false_negatives,
correct_levels,
total_blocks,
}
}
}
/// Translate a ground-truth level label into a `HierarchyLevel`.
///
/// Only the exact, case-sensitive labels "H1" through "H6" map to heading levels;
/// any other label is treated as body text.
fn parse_level(level: &str) -> HierarchyLevel {
    // Peel off the leading 'H' and dispatch on what remains, so the heading depth
    // is the only thing the match has to look at.
    match level.strip_prefix('H') {
        Some("1") => HierarchyLevel::H1,
        Some("2") => HierarchyLevel::H2,
        Some("3") => HierarchyLevel::H3,
        Some("4") => HierarchyLevel::H4,
        Some("5") => HierarchyLevel::H5,
        Some("6") => HierarchyLevel::H6,
        Some(_) | None => HierarchyLevel::Body,
    }
}
/// Read and parse the ground truth annotation file.
///
/// The file is read completely as text and then deserialized as JSON into a
/// `GroundTruthFile`.
///
/// # Arguments
///
/// * `path` - Path to the ground truth JSON file
///
/// # Returns
///
/// The parsed `GroundTruthFile`, or a message describing whether reading the file
/// or parsing its JSON failed.
fn load_ground_truth<P: AsRef<Path>>(path: P) -> Result<GroundTruthFile, String> {
    let raw_json = match fs::read_to_string(path) {
        Ok(text) => text,
        Err(e) => return Err(format!("Failed to read file: {}", e)),
    };

    match serde_json::from_str(&raw_json) {
        Ok(parsed) => Ok(parsed),
        Err(e) => Err(format!("Failed to parse JSON: {}", e)),
    }
}
/// Score extracted hierarchy blocks against ground truth annotations.
///
/// Blocks are paired purely by bounding-box overlap (IoU); block text is not compared.
/// Extracted blocks are processed in order, and each one claims the still-unclaimed
/// ground truth block with the highest IoU, provided that IoU is strictly above 0.3.
/// A ground truth block can be claimed at most once.
///
/// Counts fed into the metrics:
/// - true positive: an extracted block that claimed a ground truth block
/// - false positive: an extracted block that found no partner
/// - false negative: a ground truth block nobody claimed
/// - correct level: a true positive whose hierarchy level equals the parsed ground truth level
///
/// Level accuracy is measured against the larger of the two input lengths.
///
/// # Arguments
///
/// * `extracted_blocks` - Extracted HierarchyBlock objects
/// * `ground_truth_blocks` - Ground truth blocks for the same page
///
/// # Returns
///
/// QualityMetrics struct with calculated precision, recall, F1, and level accuracy
fn calculate_quality_metrics(
    extracted_blocks: &[kreuzberg::pdf::hierarchy::HierarchyBlock],
    ground_truth_blocks: &[GroundTruthBlock],
) -> QualityMetrics {
    // claimed[i] is true once ground truth block i has been paired.
    let mut claimed = vec![false; ground_truth_blocks.len()];
    let mut paired = 0;
    let mut unpaired_extracted = 0;
    let mut level_hits = 0;

    for candidate in extracted_blocks {
        // Seeding the running best with the acceptance threshold folds the
        // "above 0.3" and "better than the current best" checks into one comparison.
        let mut best_iou = 0.3;
        let mut winner = None;

        let unclaimed = ground_truth_blocks
            .iter()
            .enumerate()
            .filter(|(gt_idx, _)| !claimed[*gt_idx]);
        for (gt_idx, truth) in unclaimed {
            let iou = candidate.bbox.iou(&truth.bbox.to_bbox());
            if iou > best_iou {
                best_iou = iou;
                winner = Some(gt_idx);
            }
        }

        match winner {
            Some(gt_idx) => {
                claimed[gt_idx] = true;
                paired += 1;
                // A pairing only counts towards level accuracy when the levels agree.
                if candidate.hierarchy_level == parse_level(&ground_truth_blocks[gt_idx].level) {
                    level_hits += 1;
                }
            }
            None => unpaired_extracted += 1,
        }
    }

    // Whatever ground truth is still unclaimed was missed by the extraction.
    let missed = claimed.iter().filter(|was_claimed| !**was_claimed).count();
    let evaluated = usize::max(extracted_blocks.len(), ground_truth_blocks.len());

    QualityMetrics::new(paired, unpaired_extracted, missed, level_hits, evaluated)
}
/// Create synthetic test text blocks from ground truth annotations.
///
/// One `TextBlock` is produced per annotation, in the same order:
/// - `bbox` is the annotation's bounding box converted with `to_bbox`.
/// - `font_size` is a fixed value chosen from the annotated level label
///   ("H1" = 28 down to "H6" = 12, anything else = 10). It is NOT measured from
///   the bounding box, so the resulting sizes encode the ground truth labels.
/// - `text` is the annotation text. Text longer than 50 characters is cut to its
///   first 50 characters and suffixed with " (Block <index>)".
fn create_text_blocks_from_ground_truth(gt_blocks: &[GroundTruthBlock]) -> Vec<TextBlock> {
    /// Longest text, in characters, that is kept verbatim.
    const MAX_TEXT_CHARS: usize = 50;

    gt_blocks
        .iter()
        .enumerate()
        .map(|(idx, gt_block)| {
            // Synthetic font size derived from the annotated level.
            let font_size = match gt_block.level.as_str() {
                "H1" => 28.0,
                "H2" => 24.0,
                "H3" => 20.0,
                "H4" => 16.0,
                "H5" => 14.0,
                "H6" => 12.0,
                _ => 10.0, // Body
            };

            // Measure the text in characters, the same unit the truncation uses.
            // `str::len` counts bytes, which would flag short multi-byte text as
            // too long and append the block suffix without shortening anything.
            let text = if gt_block.text.chars().count() > MAX_TEXT_CHARS {
                let head: String = gt_block.text.chars().take(MAX_TEXT_CHARS).collect();
                format!("{} (Block {})", head, idx)
            } else {
                gt_block.text.clone()
            };

            TextBlock {
                text,
                bbox: gt_block.bbox.to_bbox(),
                font_size,
            }
        })
        .collect()
}
/// End-to-end quality check over every page in the ground truth file.
///
/// For each annotated page the test builds synthetic text blocks from the annotations
/// (`create_text_blocks_from_ground_truth`), clusters them by font size, assigns
/// hierarchy levels from the clusters and scores the result against the same
/// annotations. The per-page metrics are averaged and the average F1 must exceed 0.85.
///
/// The test fails, rather than passing vacuously, when the ground truth file
/// contains no pages at all.
#[test]
fn test_hierarchy_quality_on_ground_truth() {
    // Load ground truth data
    let ground_truth_path = "tests/data/hierarchy_ground_truth.json";
    let ground_truth_file = load_ground_truth(ground_truth_path).expect("Failed to load ground truth file");
    println!(
        "\nLoaded {} documents from ground truth",
        ground_truth_file.documents.len()
    );

    let mut all_metrics: Vec<QualityMetrics> = Vec::new();

    // Process each document
    for doc in &ground_truth_file.documents {
        println!("\nProcessing document: {}", doc.pdf_file);
        for page in &doc.pages {
            println!("  Page {}: {} blocks", page.page_number, page.blocks.len());

            // Create text blocks from ground truth
            let text_blocks = create_text_blocks_from_ground_truth(&page.blocks);

            // Cluster by font size: roughly one cluster per three blocks, between 1 and 6
            let k = (text_blocks.len() / 3).clamp(1, 6);
            let clusters = cluster_font_sizes(&text_blocks, k).expect("Failed to cluster font sizes");
            println!(
                "    Created {} clusters from {} blocks",
                clusters.len(),
                text_blocks.len()
            );

            // Assign hierarchy levels from clusters
            let hierarchy_assignments = assign_hierarchy_levels_from_clusters(&text_blocks, &clusters);

            // Convert to HierarchyBlock format
            let extracted_blocks: Vec<kreuzberg::pdf::hierarchy::HierarchyBlock> = hierarchy_assignments
                .iter()
                .map(|(block, level)| kreuzberg::pdf::hierarchy::HierarchyBlock {
                    text: block.text.clone(),
                    bbox: block.bbox,
                    font_size: block.font_size,
                    hierarchy_level: *level,
                })
                .collect();

            // Calculate quality metrics; report them before handing ownership to the list
            let metrics = calculate_quality_metrics(&extracted_blocks, &page.blocks);
            println!("    Precision: {:.4}", metrics.precision);
            println!("    Recall: {:.4}", metrics.recall);
            println!("    F1 Score: {:.4}", metrics.f1_score);
            println!("    Level Accuracy: {:.4}", metrics.level_accuracy);
            all_metrics.push(metrics);
        }
    }

    // Without any evaluated page the threshold below would never be checked,
    // so an empty or truncated fixture must fail loudly instead of passing.
    assert!(
        !all_metrics.is_empty(),
        "Ground truth file {} contains no pages to evaluate",
        ground_truth_path
    );

    // Calculate average metrics
    let page_count = all_metrics.len() as f64;
    let mean = |field: fn(&QualityMetrics) -> f64| all_metrics.iter().map(field).sum::<f64>() / page_count;
    let avg_precision = mean(|m| m.precision);
    let avg_recall = mean(|m| m.recall);
    let avg_f1 = mean(|m| m.f1_score);
    let avg_level_acc = mean(|m| m.level_accuracy);

    println!("\n=== AVERAGE METRICS ACROSS ALL PAGES ===");
    println!("Average Precision: {:.4}", avg_precision);
    println!("Average Recall: {:.4}", avg_recall);
    println!("Average F1 Score: {:.4}", avg_f1);
    println!("Average Level Accuracy: {:.4}", avg_level_acc);

    // Assert minimum F1 threshold
    assert!(
        avg_f1 > 0.85,
        "F1 score ({:.4}) must be greater than 0.85. Metrics: precision={:.4}, recall={:.4}, level_accuracy={:.4}",
        avg_f1,
        avg_precision,
        avg_recall,
        avg_level_acc
    );
}
/// Four stacked lines with clearly separated font sizes must cluster into
/// H1, H2, H3 and Body, and must score well against matching ground truth.
#[test]
fn test_hierarchy_clustering_consistency() {
    // Arrange: a simple document with a clear hierarchy. Every line spans
    // x = 0..100 and differs only in text, vertical extent and font size.
    let line = |text: &str, top: f32, bottom: f32, font_size| TextBlock {
        text: text.to_string(),
        bbox: BoundingBox {
            left: 0.0,
            top,
            right: 100.0,
            bottom,
        },
        font_size,
    };
    let blocks = vec![
        line("Title", 0.0, 28.0, 28.0),
        line("Subtitle", 30.0, 54.0, 24.0),
        line("Section", 60.0, 80.0, 20.0),
        line("Body paragraph", 90.0, 102.0, 10.0),
    ];

    // Act: cluster and assign hierarchies
    let clusters = cluster_font_sizes(&blocks, 4).expect("Clustering failed");
    let assignments = assign_hierarchy_levels_from_clusters(&blocks, &clusters);

    // Assert: each block received the expected level, in input order
    let expected_levels = [
        (HierarchyLevel::H1, "Largest text should be H1"),
        (HierarchyLevel::H2, "Second largest should be H2"),
        (HierarchyLevel::H3, "Third largest should be H3"),
        (HierarchyLevel::Body, "Smallest text should be Body"),
    ];
    assert_eq!(assignments.len(), 4);
    for (position, (level, reason)) in expected_levels.iter().enumerate() {
        assert_eq!(assignments[position].1, *level, "{}", reason);
    }

    // Assert: the metrics agree when the ground truth mirrors the input blocks
    let extracted: Vec<kreuzberg::pdf::hierarchy::HierarchyBlock> = assignments
        .iter()
        .map(|(b, l)| kreuzberg::pdf::hierarchy::HierarchyBlock {
            text: b.text.clone(),
            bbox: b.bbox,
            font_size: b.font_size,
            hierarchy_level: *l,
        })
        .collect();

    // Ground truth repeats each block's text and box and adds the expected label.
    let labels = ["H1", "H2", "H3", "Body"];
    let ground_truth: Vec<GroundTruthBlock> = blocks
        .iter()
        .zip(labels.iter())
        .map(|(block, label)| GroundTruthBlock {
            text: block.text.clone(),
            level: label.to_string(),
            bbox: GroundTruthBBox {
                left: block.bbox.left,
                top: block.bbox.top,
                right: block.bbox.right,
                bottom: block.bbox.bottom,
            },
        })
        .collect();

    let quality_metrics = calculate_quality_metrics(&extracted, &ground_truth);
    println!("Consistency Test - F1 Score: {:.4}", quality_metrics.f1_score);
    assert!(
        quality_metrics.f1_score >= 0.8,
        "F1 score for simple hierarchy should be >= 0.8"
    );
}
/// Three blocks, each in its own k-means cluster, must come back as H1, H2 and H3
/// in input order.
#[test]
fn test_hierarchy_level_assignment() {
    // Arrange: blocks with decreasing font sizes; edges are (left, top, right, bottom)
    let make_block = |text: &str, edges: (f32, f32, f32, f32), font_size| TextBlock {
        text: text.to_string(),
        bbox: BoundingBox {
            left: edges.0,
            top: edges.1,
            right: edges.2,
            bottom: edges.3,
        },
        font_size,
    };
    let blocks = vec![
        make_block("Main Title", (50.0, 50.0, 150.0, 100.0), 28.0),
        make_block("Section Title", (50.0, 120.0, 150.0, 160.0), 20.0),
        make_block("Regular body text", (50.0, 180.0, 200.0, 200.0), 12.0),
    ];
    // One distinct cluster label per block
    let kmeans_result = KMeansResult { labels: vec![0, 1, 2] };

    // Act: assign hierarchy levels using the KMeans result
    let result = assign_hierarchy_levels(&blocks, &kmeans_result);

    // Assert: one result per block, with levels in descending heading order
    let expected_levels = [HierarchyLevel::H1, HierarchyLevel::H2, HierarchyLevel::H3];
    assert_eq!(result.len(), 3);
    for (assigned, expected) in result.iter().zip(expected_levels.iter()) {
        assert_eq!(assigned.hierarchy_level, *expected);
    }
}
/// Extracted blocks that coincide exactly with the ground truth, levels included,
/// must yield perfect counts and near-perfect ratios.
#[test]
fn test_quality_metrics_calculation() {
    // Arrange: every row describes one block that appears identically on both sides.
    // Columns: text, ground truth label, extracted level, top, bottom, font size.
    let rows = [
        ("Title", "H1", HierarchyLevel::H1, 0.0, 20.0, 28.0),
        ("Body", "Body", HierarchyLevel::Body, 30.0, 50.0, 12.0),
    ];

    let mut extracted = Vec::new();
    let mut ground_truth = Vec::new();
    for &(text, label, level, top, bottom, font_size) in rows.iter() {
        extracted.push(kreuzberg::pdf::hierarchy::HierarchyBlock {
            text: text.to_string(),
            bbox: BoundingBox {
                left: 0.0,
                top,
                right: 100.0,
                bottom,
            },
            font_size,
            hierarchy_level: level,
        });
        ground_truth.push(GroundTruthBlock {
            text: text.to_string(),
            level: label.to_string(),
            bbox: GroundTruthBBox {
                left: 0.0,
                top,
                right: 100.0,
                bottom,
            },
        });
    }

    // Act: calculate metrics
    let metrics = calculate_quality_metrics(&extracted, &ground_truth);

    // Assert: both blocks matched with the right level and nothing was left over
    assert_eq!(metrics.true_positives, 2);
    assert_eq!(metrics.false_positives, 0);
    assert_eq!(metrics.false_negatives, 0);
    assert_eq!(metrics.correct_levels, 2);
    assert!(metrics.precision > 0.99);
    assert!(metrics.recall > 0.99);
    assert!(metrics.f1_score > 0.99);
}
*/