use benchmark_harness::aggregate::aggregate_new_format; use benchmark_harness::types::{ BenchmarkResult, ErrorKind, FrameworkCapabilities, OcrStatus, OutputFormat, PerformanceMetrics, QualityMetrics, }; use std::path::PathBuf; use std::time::Duration; fn make_benchmark_result( framework: &str, output_format: OutputFormat, file_name: &str, ocr: bool, success: bool, quality: Option, ) -> BenchmarkResult { BenchmarkResult { framework: framework.to_string(), output_format, file_path: PathBuf::from(file_name), file_size: 10240, success, error_message: if success { None } else { Some("test error".to_string()) }, error_kind: if success { ErrorKind::None } else { ErrorKind::FrameworkError }, duration: Duration::from_millis(100), extraction_duration: Some(Duration::from_millis(80)), subprocess_overhead: Some(Duration::from_millis(20)), metrics: PerformanceMetrics { peak_memory_bytes: 100_000_000, avg_cpu_percent: 50.0, throughput_bytes_per_sec: 102_400.0, p50_memory_bytes: 90_000_000, p95_memory_bytes: 95_000_000, p99_memory_bytes: 99_000_000, }, quality, iterations: vec![], statistics: None, cold_start_duration: Some(Duration::from_millis(500)), file_extension: "pdf".to_string(), framework_capabilities: FrameworkCapabilities::default(), pdf_metadata: None, ocr_status: if ocr { OcrStatus::Used } else { OcrStatus::NotUsed }, extracted_text: None, } } #[test] fn test_schema_version_2_4_0() { let results = vec![make_benchmark_result( "kreuzberg-markdown-baseline", OutputFormat::Markdown, "test.pdf", false, true, Some(QualityMetrics { f1_score_text: 0.95, f1_score_numeric: 0.90, f1_score_layout: Some(0.88), quality_score: 0.91, missing_tokens: vec![], extra_tokens: vec![], correct: true, }), )]; let aggregated = aggregate_new_format(&results); assert_eq!(aggregated.schema_version, "2.4.0"); } #[test] fn test_per_fixture_results_populated() { let results = vec![ make_benchmark_result( "kreuzberg-markdown-baseline", OutputFormat::Markdown, "fixture_1.pdf", false, true, Some(QualityMetrics { f1_score_text: 0.95, f1_score_numeric: 0.90, f1_score_layout: Some(0.88), quality_score: 0.91, missing_tokens: vec![], extra_tokens: vec![], correct: true, }), ), make_benchmark_result( "kreuzberg-markdown-baseline", OutputFormat::Markdown, "fixture_2.pdf", false, true, Some(QualityMetrics { f1_score_text: 0.92, f1_score_numeric: 0.88, f1_score_layout: Some(0.85), quality_score: 0.88, missing_tokens: vec![], extra_tokens: vec![], correct: true, }), ), ]; let aggregated = aggregate_new_format(&results); assert!(!aggregated.per_fixture_results.is_empty()); assert_eq!(aggregated.per_fixture_results.len(), 2); // Check that fixture_id is correctly extracted from file path let fixture_ids: Vec = aggregated .per_fixture_results .iter() .map(|r| r.fixture_id.clone()) .collect(); assert!(fixture_ids.contains(&"fixture_1".to_string())); assert!(fixture_ids.contains(&"fixture_2".to_string())); // Check that output_format is preserved for row in &aggregated.per_fixture_results { assert_eq!(row.output_format, OutputFormat::Markdown); } } #[test] fn test_plaintext_has_no_layout_percentiles() { let results = vec![ make_benchmark_result( "pdfplumber", OutputFormat::Plaintext, "fixture_1.pdf", false, true, Some(QualityMetrics { f1_score_text: 0.90, f1_score_numeric: 0.85, f1_score_layout: None, // Plaintext mode has no layout quality_score: 0.88, missing_tokens: vec![], extra_tokens: vec![], correct: true, }), ), make_benchmark_result( "pdfplumber", OutputFormat::Plaintext, "fixture_2.pdf", false, true, Some(QualityMetrics { f1_score_text: 0.91, f1_score_numeric: 0.86, f1_score_layout: None, quality_score: 0.89, missing_tokens: vec![], extra_tokens: vec![], correct: true, }), ), ]; let aggregated = aggregate_new_format(&results); // Find the plaintext aggregation let plaintext_key = aggregated .by_framework_mode .keys() .find(|k| k.contains("plaintext")) .cloned(); assert!(plaintext_key.is_some(), "Expected to find plaintext aggregation key"); if let Some(key) = plaintext_key && let Some(agg) = aggregated.by_framework_mode.get(&key) && let Some(pdf_ft) = agg.by_file_type.get("pdf") && let Some(perf) = &pdf_ft.no_ocr && let Some(quality) = &perf.quality { assert_eq!(quality.f1_layout_p50, None); assert_eq!(quality.f1_layout_p95, None); assert_eq!(quality.f1_layout_p99, None); } } #[test] fn test_output_format_in_aggregation_key() { let results = vec![ make_benchmark_result( "kreuzberg", OutputFormat::Markdown, "test.pdf", false, true, Some(QualityMetrics { f1_score_text: 0.95, f1_score_numeric: 0.90, f1_score_layout: Some(0.88), quality_score: 0.91, missing_tokens: vec![], extra_tokens: vec![], correct: true, }), ), make_benchmark_result( "kreuzberg", OutputFormat::Plaintext, "test.pdf", false, true, Some(QualityMetrics { f1_score_text: 0.92, f1_score_numeric: 0.88, f1_score_layout: None, quality_score: 0.90, missing_tokens: vec![], extra_tokens: vec![], correct: true, }), ), ]; let aggregated = aggregate_new_format(&results); // Should have two separate aggregations: one for markdown, one for plaintext let markdown_key = aggregated.by_framework_mode.keys().find(|k| k.contains("markdown")); let plaintext_key = aggregated.by_framework_mode.keys().find(|k| k.contains("plaintext")); assert!(markdown_key.is_some(), "Expected markdown aggregation"); assert!(plaintext_key.is_some(), "Expected plaintext aggregation"); } #[test] fn test_plaintext_frameworks_excluded_from_sf1_ranking() { let results = vec![ // Markdown framework for PDF make_benchmark_result( "kreuzberg-markdown", OutputFormat::Markdown, "test.pdf", false, true, Some(QualityMetrics { f1_score_text: 0.95, f1_score_numeric: 0.90, f1_score_layout: Some(0.88), quality_score: 0.91, missing_tokens: vec![], extra_tokens: vec![], correct: true, }), ), // Plaintext-only framework make_benchmark_result( "pdfplumber", OutputFormat::Plaintext, "test.pdf", false, true, Some(QualityMetrics { f1_score_text: 0.92, f1_score_numeric: 0.88, f1_score_layout: None, quality_score: 0.90, missing_tokens: vec![], extra_tokens: vec![], correct: true, }), ), ]; let aggregated = aggregate_new_format(&results); // plaintext frameworks should NOT appear in pdf_sf1_ranking_markdown for ranked in &aggregated.comparison.pdf_sf1_ranking_markdown { assert!(!ranked.framework_mode.contains("pdfplumber")); } // markdown frameworks SHOULD appear in pdf_sf1_ranking_markdown let has_markdown = aggregated .comparison .pdf_sf1_ranking_markdown .iter() .any(|r| r.framework_mode.contains("kreuzberg-markdown")); assert!(has_markdown, "Expected markdown framework in SF1 ranking"); } #[test] fn test_quality_percentiles_all_three() { let results = vec![ make_benchmark_result( "test-framework", OutputFormat::Markdown, "fixture_1.pdf", false, true, Some(QualityMetrics { f1_score_text: 0.95, f1_score_numeric: 0.90, f1_score_layout: Some(0.88), quality_score: 0.91, missing_tokens: vec![], extra_tokens: vec![], correct: true, }), ), make_benchmark_result( "test-framework", OutputFormat::Markdown, "fixture_2.pdf", false, true, Some(QualityMetrics { f1_score_text: 0.80, f1_score_numeric: 0.75, f1_score_layout: Some(0.70), quality_score: 0.75, missing_tokens: vec![], extra_tokens: vec![], correct: false, }), ), make_benchmark_result( "test-framework", OutputFormat::Markdown, "fixture_3.pdf", false, true, Some(QualityMetrics { f1_score_text: 0.92, f1_score_numeric: 0.87, f1_score_layout: Some(0.85), quality_score: 0.88, missing_tokens: vec![], extra_tokens: vec![], correct: true, }), ), ]; let aggregated = aggregate_new_format(&results); // Find the aggregation with quality metrics let has_quality_percentiles = aggregated.by_framework_mode.values().any(|agg| { agg.by_file_type.values().any(|ft| { [ft.no_ocr.as_ref(), ft.with_ocr.as_ref()] .into_iter() .flatten() .any(|perf| { if let Some(q) = &perf.quality { // Check that all three percentiles are present q.f1_text_p50 > 0.0 && q.f1_text_p95 > 0.0 && q.f1_text_p99 >= 0.0 && q.quality_score_p50 > 0.0 && q.quality_score_p95 > 0.0 && q.quality_score_p99 >= 0.0 } else { false } }) }) }); assert!( has_quality_percentiles, "Expected quality percentiles with p50, p95, p99" ); } #[test] fn test_ocr_flag_in_per_fixture() { let results = vec![ make_benchmark_result( "test-framework", OutputFormat::Markdown, "no_ocr.pdf", false, true, None, ), make_benchmark_result( "test-framework", OutputFormat::Markdown, "with_ocr.png", true, true, None, ), ]; let aggregated = aggregate_new_format(&results); let no_ocr_row = aggregated.per_fixture_results.iter().find(|r| r.fixture_id == "no_ocr"); let with_ocr_row = aggregated .per_fixture_results .iter() .find(|r| r.fixture_id == "with_ocr"); assert!(no_ocr_row.is_some()); assert!(with_ocr_row.is_some()); assert!(!no_ocr_row.unwrap().ocr); assert!(with_ocr_row.unwrap().ocr); } #[test] fn test_empty_results() { let results = vec![]; let aggregated = aggregate_new_format(&results); assert_eq!(aggregated.schema_version, "2.4.0"); assert!(aggregated.by_framework_mode.is_empty()); assert!(aggregated.per_fixture_results.is_empty()); assert_eq!(aggregated.metadata.total_results, 0); }