Files
fil/tools/benchmark-harness/tests/aggregate_schema.rs
Henrik Jess Nielsen b4c07d3693
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s
Nomad changes
2026-06-01 23:40:55 +02:00

421 lines
13 KiB
Rust

use benchmark_harness::aggregate::aggregate_new_format;
use benchmark_harness::types::{
BenchmarkResult, ErrorKind, FrameworkCapabilities, OcrStatus, OutputFormat, PerformanceMetrics, QualityMetrics,
};
use std::path::PathBuf;
use std::time::Duration;
fn make_benchmark_result(
framework: &str,
output_format: OutputFormat,
file_name: &str,
ocr: bool,
success: bool,
quality: Option<QualityMetrics>,
) -> BenchmarkResult {
BenchmarkResult {
framework: framework.to_string(),
output_format,
file_path: PathBuf::from(file_name),
file_size: 10240,
success,
error_message: if success { None } else { Some("test error".to_string()) },
error_kind: if success {
ErrorKind::None
} else {
ErrorKind::FrameworkError
},
duration: Duration::from_millis(100),
extraction_duration: Some(Duration::from_millis(80)),
subprocess_overhead: Some(Duration::from_millis(20)),
metrics: PerformanceMetrics {
peak_memory_bytes: 100_000_000,
avg_cpu_percent: 50.0,
throughput_bytes_per_sec: 102_400.0,
p50_memory_bytes: 90_000_000,
p95_memory_bytes: 95_000_000,
p99_memory_bytes: 99_000_000,
},
quality,
iterations: vec![],
statistics: None,
cold_start_duration: Some(Duration::from_millis(500)),
file_extension: "pdf".to_string(),
framework_capabilities: FrameworkCapabilities::default(),
pdf_metadata: None,
ocr_status: if ocr { OcrStatus::Used } else { OcrStatus::NotUsed },
extracted_text: None,
}
}
/// Aggregating a single successful markdown result must report schema
/// version "2.4.0".
#[test]
fn test_schema_version_2_4_0() {
    let quality = QualityMetrics {
        f1_score_text: 0.95,
        f1_score_numeric: 0.90,
        f1_score_layout: Some(0.88),
        quality_score: 0.91,
        missing_tokens: vec![],
        extra_tokens: vec![],
        correct: true,
    };
    let single = make_benchmark_result(
        "kreuzberg-markdown-baseline",
        OutputFormat::Markdown,
        "test.pdf",
        false,
        true,
        Some(quality),
    );
    let results = vec![single];

    let aggregated = aggregate_new_format(&results);

    assert_eq!(aggregated.schema_version, "2.4.0");
}
/// Two markdown results for different fixtures must produce two per-fixture
/// rows with ids `fixture_1` / `fixture_2` and the markdown output format.
#[test]
fn test_per_fixture_results_populated() {
    // (file name, text F1, numeric F1, layout F1, quality score)
    let specs = [
        ("fixture_1.pdf", 0.95, 0.90, 0.88, 0.91),
        ("fixture_2.pdf", 0.92, 0.88, 0.85, 0.88),
    ];
    let results: Vec<BenchmarkResult> = specs
        .into_iter()
        .map(|(file_name, text, numeric, layout, score)| {
            make_benchmark_result(
                "kreuzberg-markdown-baseline",
                OutputFormat::Markdown,
                file_name,
                false,
                true,
                Some(QualityMetrics {
                    f1_score_text: text,
                    f1_score_numeric: numeric,
                    f1_score_layout: Some(layout),
                    quality_score: score,
                    missing_tokens: vec![],
                    extra_tokens: vec![],
                    correct: true,
                }),
            )
        })
        .collect();

    let aggregated = aggregate_new_format(&results);
    let rows = &aggregated.per_fixture_results;

    assert!(!rows.is_empty());
    assert_eq!(rows.len(), 2);

    // The expected ids are the input file names without their extension.
    for expected_id in ["fixture_1", "fixture_2"] {
        assert!(rows.iter().any(|row| row.fixture_id == expected_id));
    }

    // Every row must keep the output format of the result it came from.
    rows.iter()
        .for_each(|row| assert_eq!(row.output_format, OutputFormat::Markdown));
}
/// Plaintext results carry no layout F1 score, so the aggregated layout
/// percentiles (p50/p95/p99) must all be `None`.
///
/// Every lookup on the way to the quality aggregate is asserted explicitly so
/// that a missing intermediate entry fails the test instead of silently
/// skipping the layout assertions.
#[test]
fn test_plaintext_has_no_layout_percentiles() {
    let results = vec![
        make_benchmark_result(
            "pdfplumber",
            OutputFormat::Plaintext,
            "fixture_1.pdf",
            false,
            true,
            Some(QualityMetrics {
                f1_score_text: 0.90,
                f1_score_numeric: 0.85,
                f1_score_layout: None, // Plaintext mode has no layout
                quality_score: 0.88,
                missing_tokens: vec![],
                extra_tokens: vec![],
                correct: true,
            }),
        ),
        make_benchmark_result(
            "pdfplumber",
            OutputFormat::Plaintext,
            "fixture_2.pdf",
            false,
            true,
            Some(QualityMetrics {
                f1_score_text: 0.91,
                f1_score_numeric: 0.86,
                f1_score_layout: None,
                quality_score: 0.89,
                missing_tokens: vec![],
                extra_tokens: vec![],
                correct: true,
            }),
        ),
    ];
    let aggregated = aggregate_new_format(&results);

    // Find the plaintext aggregation
    let plaintext_key = aggregated
        .by_framework_mode
        .keys()
        .find(|k| k.contains("plaintext"))
        .cloned()
        .expect("Expected to find plaintext aggregation key");
    let agg = aggregated
        .by_framework_mode
        .get(&plaintext_key)
        .expect("Expected an aggregation entry for the plaintext key");
    let pdf_ft = agg
        .by_file_type
        .get("pdf")
        .expect("Expected a `pdf` file-type entry in the plaintext aggregation");
    // Both inputs were built with `ocr = false`, so the no-OCR bucket is the
    // one that must be populated.
    let perf = pdf_ft
        .no_ocr
        .as_ref()
        .expect("Expected no-OCR performance data for pdf");
    let quality = perf
        .quality
        .as_ref()
        .expect("Expected aggregated quality metrics for plaintext pdf results");

    assert_eq!(quality.f1_layout_p50, None);
    assert_eq!(quality.f1_layout_p95, None);
    assert_eq!(quality.f1_layout_p99, None);
}
/// The same framework run in markdown and plaintext mode must end up under
/// two aggregation keys, one containing "markdown" and one "plaintext".
#[test]
fn test_output_format_in_aggregation_key() {
    let markdown_result = make_benchmark_result(
        "kreuzberg",
        OutputFormat::Markdown,
        "test.pdf",
        false,
        true,
        Some(QualityMetrics {
            f1_score_text: 0.95,
            f1_score_numeric: 0.90,
            f1_score_layout: Some(0.88),
            quality_score: 0.91,
            missing_tokens: vec![],
            extra_tokens: vec![],
            correct: true,
        }),
    );
    let plaintext_result = make_benchmark_result(
        "kreuzberg",
        OutputFormat::Plaintext,
        "test.pdf",
        false,
        true,
        Some(QualityMetrics {
            f1_score_text: 0.92,
            f1_score_numeric: 0.88,
            f1_score_layout: None,
            quality_score: 0.90,
            missing_tokens: vec![],
            extra_tokens: vec![],
            correct: true,
        }),
    );
    let results = vec![markdown_result, plaintext_result];

    let aggregated = aggregate_new_format(&results);

    // One aggregation per output format is expected, so a key mentioning
    // each format has to exist.
    let has_markdown_key = aggregated
        .by_framework_mode
        .keys()
        .any(|key| key.contains("markdown"));
    let has_plaintext_key = aggregated
        .by_framework_mode
        .keys()
        .any(|key| key.contains("plaintext"));

    assert!(has_markdown_key, "Expected markdown aggregation");
    assert!(has_plaintext_key, "Expected plaintext aggregation");
}
/// The markdown SF1 ranking for PDFs must list the markdown framework and
/// must not list the plaintext-only `pdfplumber` framework.
#[test]
fn test_plaintext_frameworks_excluded_from_sf1_ranking() {
    // Markdown framework for PDF
    let markdown_result = make_benchmark_result(
        "kreuzberg-markdown",
        OutputFormat::Markdown,
        "test.pdf",
        false,
        true,
        Some(QualityMetrics {
            f1_score_text: 0.95,
            f1_score_numeric: 0.90,
            f1_score_layout: Some(0.88),
            quality_score: 0.91,
            missing_tokens: vec![],
            extra_tokens: vec![],
            correct: true,
        }),
    );
    // Plaintext-only framework
    let plaintext_result = make_benchmark_result(
        "pdfplumber",
        OutputFormat::Plaintext,
        "test.pdf",
        false,
        true,
        Some(QualityMetrics {
            f1_score_text: 0.92,
            f1_score_numeric: 0.88,
            f1_score_layout: None,
            quality_score: 0.90,
            missing_tokens: vec![],
            extra_tokens: vec![],
            correct: true,
        }),
    );
    let results = vec![markdown_result, plaintext_result];

    let aggregated = aggregate_new_format(&results);
    let ranking = &aggregated.comparison.pdf_sf1_ranking_markdown;

    // No ranking entry may belong to the plaintext framework.
    assert!(ranking
        .iter()
        .all(|entry| !entry.framework_mode.contains("pdfplumber")));

    // The markdown framework has to be ranked.
    let has_markdown = ranking
        .iter()
        .any(|entry| entry.framework_mode.contains("kreuzberg-markdown"));
    assert!(has_markdown, "Expected markdown framework in SF1 ranking");
}
/// With three markdown results for one framework, at least one aggregated
/// quality entry must expose text-F1 and quality-score percentiles at p50,
/// p95 and p99.
#[test]
fn test_quality_percentiles_all_three() {
    // (file name, text F1, numeric F1, layout F1, quality score, correct)
    let specs = [
        ("fixture_1.pdf", 0.95, 0.90, 0.88, 0.91, true),
        ("fixture_2.pdf", 0.80, 0.75, 0.70, 0.75, false),
        ("fixture_3.pdf", 0.92, 0.87, 0.85, 0.88, true),
    ];
    let results: Vec<BenchmarkResult> = specs
        .into_iter()
        .map(|(file_name, text, numeric, layout, score, correct)| {
            make_benchmark_result(
                "test-framework",
                OutputFormat::Markdown,
                file_name,
                false,
                true,
                Some(QualityMetrics {
                    f1_score_text: text,
                    f1_score_numeric: numeric,
                    f1_score_layout: Some(layout),
                    quality_score: score,
                    missing_tokens: vec![],
                    extra_tokens: vec![],
                    correct,
                }),
            )
        })
        .collect();

    let aggregated = aggregate_new_format(&results);

    // Walk every framework -> file type -> (no-OCR, with-OCR) bucket and look
    // for one quality aggregate that satisfies all percentile checks.
    let has_quality_percentiles = aggregated
        .by_framework_mode
        .values()
        .flat_map(|agg| agg.by_file_type.values())
        .flat_map(|ft| ft.no_ocr.iter().chain(ft.with_ocr.iter()))
        .filter_map(|perf| perf.quality.as_ref())
        .any(|q| {
            // p50/p95 must be strictly positive; p99 only non-negative.
            q.f1_text_p50 > 0.0
                && q.f1_text_p95 > 0.0
                && q.f1_text_p99 >= 0.0
                && q.quality_score_p50 > 0.0
                && q.quality_score_p95 > 0.0
                && q.quality_score_p99 >= 0.0
        });

    assert!(
        has_quality_percentiles,
        "Expected quality percentiles with p50, p95, p99"
    );
}
/// Per-fixture rows must carry the OCR flag of the result they came from:
/// `false` for the `no_ocr` fixture and `true` for the `with_ocr` fixture.
#[test]
fn test_ocr_flag_in_per_fixture() {
    let results = vec![
        make_benchmark_result(
            "test-framework",
            OutputFormat::Markdown,
            "no_ocr.pdf",
            false,
            true,
            None,
        ),
        make_benchmark_result(
            "test-framework",
            OutputFormat::Markdown,
            "with_ocr.png",
            true,
            true,
            None,
        ),
    ];
    let aggregated = aggregate_new_format(&results);
    let rows = &aggregated.per_fixture_results;

    // Resolve both rows up front; `expect` names the missing fixture instead
    // of a bare `is_some()` assertion followed by `unwrap()`.
    let no_ocr_row = rows
        .iter()
        .find(|r| r.fixture_id == "no_ocr")
        .expect("Expected a per-fixture row for `no_ocr`");
    let with_ocr_row = rows
        .iter()
        .find(|r| r.fixture_id == "with_ocr")
        .expect("Expected a per-fixture row for `with_ocr`");

    assert!(!no_ocr_row.ocr, "`no_ocr` row must not be flagged as OCR");
    assert!(with_ocr_row.ocr, "`with_ocr` row must be flagged as OCR");
}
/// Aggregating no results must still report schema version "2.4.0" and
/// leave the framework map, per-fixture rows and result count empty/zero.
#[test]
fn test_empty_results() {
    let results: Vec<BenchmarkResult> = Vec::new();

    let aggregated = aggregate_new_format(&results);

    assert_eq!(aggregated.schema_version, "2.4.0");
    assert!(aggregated.by_framework_mode.is_empty());
    assert!(aggregated.per_fixture_results.is_empty());
    assert_eq!(aggregated.metadata.total_results, 0);
}