//! TODO: Restored from 245539484 alef-migration cleanup. Currently exercises
//! pub(crate) APIs that the migration deliberately narrowed; gated until
//! either (a) these APIs are re-exposed publicly, or (b) the test is
//! rewritten against the public extraction surface.
#![cfg(any())]

// Original content preserved below; recompiled once gating cfg drops.
// Disabled by the file-level cfg(any()) above.
/*
//! PDF hierarchy quality assessment tests.
//!
//! This module tests PDF text hierarchy extraction quality by comparing against ground truth
//! annotations. Measures precision, recall, F1 score, and level accuracy to ensure the hierarchy
//! detection algorithm works well on real document structures.
//!
//! Test philosophy:
//! - Define ground truth hierarchies for representative PDF documents
//! - Measure how well extracted hierarchies match ground truth
//! - Assert minimum quality thresholds for precision/recall/F1
//! - Verify correct hierarchy level assignments

#![cfg(feature = "pdf")]

use kreuzberg::pdf::hierarchy::{
    BoundingBox, HierarchyBlock, HierarchyLevel, KMeansResult, TextBlock, assign_hierarchy_levels,
    assign_hierarchy_levels_from_clusters, cluster_font_sizes,
};
use serde::{Deserialize, Serialize};
use std::fs;
use std::path::Path;

/// A bounding box annotation from ground truth.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct GroundTruthBBox {
    left: f32,
    top: f32,
    right: f32,
    bottom: f32,
}

impl GroundTruthBBox {
    /// Convert to a kreuzberg `BoundingBox`, copying the four edges unchanged.
    fn to_bbox(&self) -> BoundingBox {
        BoundingBox {
            left: self.left,
            top: self.top,
            right: self.right,
            bottom: self.bottom,
        }
    }
}

/// A ground truth text block with hierarchy level annotation.
///
/// `level` is the textual label from the JSON file ("H1" .. "H6"); see `parse_level`
/// for how labels are mapped to `HierarchyLevel`.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct GroundTruthBlock {
    text: String,
    level: String,
    bbox: GroundTruthBBox,
}

/// A page of ground truth annotations.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct GroundTruthPage {
    page_number: u32,
    blocks: Vec<GroundTruthBlock>,
}

/// A document with ground truth hierarchy annotations.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct GroundTruthDocument {
    pdf_file: String,
    pages: Vec<GroundTruthPage>,
}

/// Root structure for ground truth JSON file.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct GroundTruthFile {
    documents: Vec<GroundTruthDocument>,
}

/// Quality metrics for hierarchy extraction.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct QualityMetrics {
    /// Precision: (correctly identified hierarchies) / (total extracted hierarchies)
    pub precision: f64,
    /// Recall: (correctly identified hierarchies) / (total ground truth hierarchies)
    pub recall: f64,
    /// F1 Score: harmonic mean of precision and recall
    pub f1_score: f64,
    /// Level accuracy: fraction of blocks assigned to the correct hierarchy level
    pub level_accuracy: f64,
    /// Number of correctly identified hierarchy blocks
    pub true_positives: usize,
    /// Number of incorrectly extracted hierarchy blocks
    pub false_positives: usize,
    /// Number of missed ground truth hierarchy blocks
    pub false_negatives: usize,
    /// Number of blocks with correct hierarchy level
    pub correct_levels: usize,
    /// Total number of blocks evaluated
    pub total_blocks: usize,
}

impl QualityMetrics {
    /// Create new quality metrics from raw counts.
    ///
    /// Every ratio falls back to `0.0` when its denominator is zero, so the result never
    /// contains NaN.
    fn new(
        true_positives: usize,
        false_positives: usize,
        false_negatives: usize,
        correct_levels: usize,
        total_blocks: usize,
    ) -> Self {
        let precision = if true_positives + false_positives > 0 {
            true_positives as f64 / (true_positives + false_positives) as f64
        } else {
            0.0
        };

        let recall = if true_positives + false_negatives > 0 {
            true_positives as f64 / (true_positives + false_negatives) as f64
        } else {
            0.0
        };

        let f1_score = if precision + recall > 0.0 {
            2.0 * precision * recall / (precision + recall)
        } else {
            0.0
        };

        let level_accuracy = if total_blocks > 0 {
            correct_levels as f64 / total_blocks as f64
        } else {
            0.0
        };

        Self {
            precision,
            recall,
            f1_score,
            level_accuracy,
            true_positives,
            false_positives,
            false_negatives,
            correct_levels,
            total_blocks,
        }
    }
}

/// Convert hierarchy level string to `HierarchyLevel` enum.
///
/// Any label other than "H1" .. "H6" (including "Body") maps to `HierarchyLevel::Body`.
fn parse_level(level: &str) -> HierarchyLevel {
    match level {
        "H1" => HierarchyLevel::H1,
        "H2" => HierarchyLevel::H2,
        "H3" => HierarchyLevel::H3,
        "H4" => HierarchyLevel::H4,
        "H5" => HierarchyLevel::H5,
        "H6" => HierarchyLevel::H6,
        _ => HierarchyLevel::Body,
    }
}

/// Load ground truth annotations from JSON file.
///
/// Reads the file at `path` and deserializes it into a `GroundTruthFile`.
///
/// # Arguments
///
/// * `path` - Path to the ground truth JSON file
///
/// # Errors
///
/// Returns a human-readable message when the file cannot be read or when its contents
/// are not valid JSON for `GroundTruthFile`.
fn load_ground_truth<P: AsRef<Path>>(path: P) -> Result<GroundTruthFile, String> {
    let content = fs::read_to_string(path).map_err(|e| format!("Failed to read file: {}", e))?;
    serde_json::from_str(&content).map_err(|e| format!("Failed to parse JSON: {}", e))
}

/// Calculate quality metrics by comparing extracted hierarchies to ground truth.
///
/// Each extracted block is greedily paired with the not-yet-matched ground truth block whose
/// bounding box has the highest `BoundingBox::iou` score, provided that score exceeds 0.3.
/// Matching is by bounding box only; block text is not compared.
///
/// Measures:
/// - Precision: matched extracted blocks / total extracted
/// - Recall: matched ground truth blocks / total ground truth
/// - F1 Score: harmonic mean of precision and recall
/// - Level Accuracy: matched blocks with the correct level / max(extracted, ground truth)
///
/// # Arguments
///
/// * `extracted_blocks` - Extracted `HierarchyBlock` values
/// * `ground_truth_blocks` - Ground truth blocks to compare against
///
/// # Returns
///
/// `QualityMetrics` with calculated precision, recall, F1, and level accuracy
fn calculate_quality_metrics(
    extracted_blocks: &[HierarchyBlock],
    ground_truth_blocks: &[GroundTruthBlock],
) -> QualityMetrics {
    let mut true_positives = 0;
    let mut false_positives = 0;
    let mut correct_levels = 0;

    // Tracks which ground truth blocks have already been claimed by an extracted block.
    let mut matched_gt_indices = vec![false; ground_truth_blocks.len()];

    for extracted in extracted_blocks {
        let mut best_match_idx: Option<usize> = None;
        // Starting at the acceptance threshold means only overlaps above 0.3 can win.
        let mut best_overlap = 0.3;

        // Find the best matching ground truth block by bounding box overlap.
        for (gt_idx, gt_block) in ground_truth_blocks.iter().enumerate() {
            if matched_gt_indices[gt_idx] {
                continue; // Already matched
            }

            let gt_bbox = gt_block.bbox.to_bbox();
            let overlap = extracted.bbox.iou(&gt_bbox);

            if overlap > best_overlap {
                best_overlap = overlap;
                best_match_idx = Some(gt_idx);
            }
        }

        if let Some(gt_idx) = best_match_idx {
            matched_gt_indices[gt_idx] = true;
            true_positives += 1;

            // Check if the hierarchy level matches.
            let gt_level = parse_level(&ground_truth_blocks[gt_idx].level);
            if extracted.hierarchy_level == gt_level {
                correct_levels += 1;
            }
        } else {
            false_positives += 1;
        }
    }

    // Count unmatched ground truth blocks as false negatives.
    let false_negatives = matched_gt_indices.iter().filter(|&&m| !m).count();
    let total_blocks = extracted_blocks.len().max(ground_truth_blocks.len());

    QualityMetrics::new(
        true_positives,
        false_positives,
        false_negatives,
        correct_levels,
        total_blocks,
    )
}

/// Create test text blocks from ground truth.
///
/// The font size is synthesized from the annotated level (H1 = 28.0 down to Body = 10.0), and
/// the bounding box is copied from the annotation. Text longer than 50 characters is cut to
/// its first 50 characters and suffixed with the block index.
fn create_text_blocks_from_ground_truth(gt_blocks: &[GroundTruthBlock]) -> Vec<TextBlock> {
    gt_blocks
        .iter()
        .enumerate()
        .map(|(idx, gt_block)| {
            // Synthetic font size derived from the annotated level, not from the bbox.
            let font_size = match gt_block.level.as_str() {
                "H1" => 28.0,
                "H2" => 24.0,
                "H3" => 20.0,
                "H4" => 16.0,
                "H5" => 14.0,
                "H6" => 12.0,
                _ => 10.0, // Body
            };

            // Count characters, not bytes, so the guard agrees with `chars().take(50)`.
            let text = if gt_block.text.chars().count() > 50 {
                format!(
                    "{} (Block {})",
                    gt_block.text.chars().take(50).collect::<String>(),
                    idx
                )
            } else {
                gt_block.text.clone()
            };

            TextBlock {
                text,
                bbox: gt_block.bbox.to_bbox(),
                font_size,
            }
        })
        .collect()
}

#[test]
fn test_hierarchy_quality_on_ground_truth() {
    // Load ground truth data
    let ground_truth_path = "tests/data/hierarchy_ground_truth.json";
    let ground_truth_file = load_ground_truth(ground_truth_path).expect("Failed to load ground truth file");

    println!(
        "\nLoaded {} documents from ground truth",
        ground_truth_file.documents.len()
    );

    let mut all_metrics: Vec<QualityMetrics> = Vec::new();

    // Process each document
    for doc in &ground_truth_file.documents {
        println!("\nProcessing document: {}", doc.pdf_file);

        for page in &doc.pages {
            println!(" Page {}: {} blocks", page.page_number, page.blocks.len());

            // Create text blocks from ground truth
            let text_blocks = create_text_blocks_from_ground_truth(&page.blocks);

            // Cluster by font size; k is a rough estimate bounded to 1..=6.
            let k = (text_blocks.len() / 3).clamp(1, 6);
            let clusters = cluster_font_sizes(&text_blocks, k).expect("Failed to cluster font sizes");

            println!(
                " Created {} clusters from {} blocks",
                clusters.len(),
                text_blocks.len()
            );

            // Assign hierarchy levels from clusters
            let hierarchy_assignments = assign_hierarchy_levels_from_clusters(&text_blocks, &clusters);

            // Convert to HierarchyBlock format
            let extracted_blocks: Vec<HierarchyBlock> = hierarchy_assignments
                .iter()
                .map(|(block, level)| HierarchyBlock {
                    text: block.text.clone(),
                    bbox: block.bbox,
                    font_size: block.font_size,
                    hierarchy_level: *level,
                })
                .collect();

            // Calculate quality metrics
            let metrics = calculate_quality_metrics(&extracted_blocks, &page.blocks);

            println!(" Precision: {:.4}", metrics.precision);
            println!(" Recall: {:.4}", metrics.recall);
            println!(" F1 Score: {:.4}", metrics.f1_score);
            println!(" Level Accuracy: {:.4}", metrics.level_accuracy);

            all_metrics.push(metrics);
        }
    }

    // A run that evaluated nothing must not count as a pass.
    assert!(
        !all_metrics.is_empty(),
        "Ground truth file contains no annotated pages; no metrics were computed"
    );

    // Calculate average metrics
    let page_count = all_metrics.len() as f64;
    let mean = |field: fn(&QualityMetrics) -> f64| all_metrics.iter().map(field).sum::<f64>() / page_count;

    let avg_precision = mean(|m| m.precision);
    let avg_recall = mean(|m| m.recall);
    let avg_f1 = mean(|m| m.f1_score);
    let avg_level_acc = mean(|m| m.level_accuracy);

    println!("\n=== AVERAGE METRICS ACROSS ALL PAGES ===");
    println!("Average Precision: {:.4}", avg_precision);
    println!("Average Recall: {:.4}", avg_recall);
    println!("Average F1 Score: {:.4}", avg_f1);
    println!("Average Level Accuracy: {:.4}", avg_level_acc);

    // Assert minimum F1 threshold
    assert!(
        avg_f1 > 0.85,
        "F1 score ({:.4}) must be greater than 0.85. Metrics: precision={:.4}, recall={:.4}, level_accuracy={:.4}",
        avg_f1,
        avg_precision,
        avg_recall,
        avg_level_acc
    );
}

#[test]
fn test_hierarchy_clustering_consistency() {
    // Arrange: Create a simple document with clear hierarchy
    let blocks = vec![
        TextBlock {
            text: "Title".to_string(),
            bbox: BoundingBox {
                left: 0.0,
                top: 0.0,
                right: 100.0,
                bottom: 28.0,
            },
            font_size: 28.0,
        },
        TextBlock {
            text: "Subtitle".to_string(),
            bbox: BoundingBox {
                left: 0.0,
                top: 30.0,
                right: 100.0,
                bottom: 54.0,
            },
            font_size: 24.0,
        },
        TextBlock {
            text: "Section".to_string(),
            bbox: BoundingBox {
                left: 0.0,
                top: 60.0,
                right: 100.0,
                bottom: 80.0,
            },
            font_size: 20.0,
        },
        TextBlock {
            text: "Body paragraph".to_string(),
            bbox: BoundingBox {
                left: 0.0,
                top: 90.0,
                right: 100.0,
                bottom: 102.0,
            },
            font_size: 10.0,
        },
    ];

    // Act: Cluster and assign hierarchies
    let clusters = cluster_font_sizes(&blocks, 4).expect("Clustering failed");
    let assignments = assign_hierarchy_levels_from_clusters(&blocks, &clusters);

    // Assert: Verify hierarchy levels are correct
    assert_eq!(assignments.len(), 4);
    assert_eq!(assignments[0].1, HierarchyLevel::H1, "Largest text should be H1");
    assert_eq!(assignments[1].1, HierarchyLevel::H2, "Second largest should be H2");
    assert_eq!(assignments[2].1, HierarchyLevel::H3, "Third largest should be H3");
    assert_eq!(assignments[3].1, HierarchyLevel::Body, "Smallest text should be Body");

    // Assert: F1 score should be high for this simple case
    let extracted: Vec<HierarchyBlock> = assignments
        .iter()
        .map(|(b, l)| HierarchyBlock {
            text: b.text.clone(),
            bbox: b.bbox,
            font_size: b.font_size,
            hierarchy_level: *l,
        })
        .collect::<Vec<_>>();

    // Ground truth mirrors the blocks above: same boxes, expected levels.
    let ground_truth = [
        GroundTruthBlock {
            text: "Title".to_string(),
            level: "H1".to_string(),
            bbox: GroundTruthBBox {
                left: 0.0,
                top: 0.0,
                right: 100.0,
                bottom: 28.0,
            },
        },
        GroundTruthBlock {
            text: "Subtitle".to_string(),
            level: "H2".to_string(),
            bbox: GroundTruthBBox {
                left: 0.0,
                top: 30.0,
                right: 100.0,
                bottom: 54.0,
            },
        },
        GroundTruthBlock {
            text: "Section".to_string(),
            level: "H3".to_string(),
            bbox: GroundTruthBBox {
                left: 0.0,
                top: 60.0,
                right: 100.0,
                bottom: 80.0,
            },
        },
        GroundTruthBlock {
            text: "Body paragraph".to_string(),
            level: "Body".to_string(),
            bbox: GroundTruthBBox {
                left: 0.0,
                top: 90.0,
                right: 100.0,
                bottom: 102.0,
            },
        },
    ];

    let quality_metrics = calculate_quality_metrics(&extracted, &ground_truth);

    println!("Consistency Test - F1 Score: {:.4}", quality_metrics.f1_score);
    assert!(
        quality_metrics.f1_score >= 0.8,
        "F1 score for simple hierarchy should be >= 0.8"
    );
}

#[test]
fn test_hierarchy_level_assignment() {
    // Arrange: Create blocks and KMeans result
    let blocks = vec![
        TextBlock {
            text: "Main Title".to_string(),
            bbox: BoundingBox {
                left: 50.0,
                top: 50.0,
                right: 150.0,
                bottom: 100.0,
            },
            font_size: 28.0,
        },
        TextBlock {
            text: "Section Title".to_string(),
            bbox: BoundingBox {
                left: 50.0,
                top: 120.0,
                right: 150.0,
                bottom: 160.0,
            },
            font_size: 20.0,
        },
        TextBlock {
            text: "Regular body text".to_string(),
            bbox: BoundingBox {
                left: 50.0,
                top: 180.0,
                right: 200.0,
                bottom: 200.0,
            },
            font_size: 12.0,
        },
    ];

    // One cluster label per block, in block order.
    let kmeans_result = KMeansResult { labels: vec![0, 1, 2] };

    // Act: Assign hierarchy levels using KMeans result
    let result = assign_hierarchy_levels(&blocks, &kmeans_result);

    // Assert: Verify correct level assignments
    assert_eq!(result.len(), 3);
    assert_eq!(result[0].hierarchy_level, HierarchyLevel::H1);
    assert_eq!(result[1].hierarchy_level, HierarchyLevel::H2);
    assert_eq!(result[2].hierarchy_level, HierarchyLevel::H3);
}

#[test]
fn test_quality_metrics_calculation() {
    // Arrange: Create extracted blocks and ground truth with identical boxes and levels
    let extracted = vec![
        HierarchyBlock {
            text: "Title".to_string(),
            bbox: BoundingBox {
                left: 0.0,
                top: 0.0,
                right: 100.0,
                bottom: 20.0,
            },
            font_size: 28.0,
            hierarchy_level: HierarchyLevel::H1,
        },
        HierarchyBlock {
            text: "Body".to_string(),
            bbox: BoundingBox {
                left: 0.0,
                top: 30.0,
                right: 100.0,
                bottom: 50.0,
            },
            font_size: 12.0,
            hierarchy_level: HierarchyLevel::Body,
        },
    ];

    let ground_truth = vec![
        GroundTruthBlock {
            text: "Title".to_string(),
            level: "H1".to_string(),
            bbox: GroundTruthBBox {
                left: 0.0,
                top: 0.0,
                right: 100.0,
                bottom: 20.0,
            },
        },
        GroundTruthBlock {
            text: "Body".to_string(),
            level: "Body".to_string(),
            bbox: GroundTruthBBox {
                left: 0.0,
                top: 30.0,
                right: 100.0,
                bottom: 50.0,
            },
        },
    ];

    // Act: Calculate metrics
    let metrics = calculate_quality_metrics(&extracted, &ground_truth);

    // Assert: Verify metrics
    assert_eq!(metrics.true_positives, 2);
    assert_eq!(metrics.false_positives, 0);
    assert_eq!(metrics.false_negatives, 0);
    assert_eq!(metrics.correct_levels, 2);
    assert!(metrics.precision > 0.99);
    assert!(metrics.recall > 0.99);
    assert!(metrics.f1_score > 0.99);
}
*/