1838 lines
72 KiB
Rust
1838 lines
72 KiB
Rust
|
|
//! Aggregation module for benchmark results (v2.4.0 output schema).
|
||
|
|
//!
|
||
|
|
//! Groups [`BenchmarkResult`] records by framework-and-mode, output format, file type, and
|
||
|
|
//! OCR usage (yes/no), then computes percentile-based statistics for each
|
||
|
|
//! group. The output schema (`schema_version: "2.4.0"`) surfaces TF1 and SF1 separately
|
||
|
|
//! with per-fixture rows preserved and split rankings by output format.
|
||
|
|
//!
|
||
|
|
//! # Percentile methodology
|
||
|
|
//!
|
||
|
|
//! All percentiles use the **R-7 interpolation** method (the default in R and
|
||
|
|
//! NumPy) via [`crate::stats::percentile_r7`]. Three percentiles are reported
|
||
|
|
//! per metric: **p50** (median), **p95**, and **p99**. Values that are `NaN`
|
||
|
|
//! or `Inf` after interpolation are sanitized to `0.0` by
|
||
|
|
//! [`crate::stats::sanitize_f64`] so that downstream JSON consumers never
|
||
|
|
//! encounter non-finite floats.
|
||
|
|
//!
|
||
|
|
//! Failed results (`success == false`) are excluded from percentile
//! calculations but still counted in `total_sample_count` to preserve the
//! `success_rate_percent` metric.
|
||
|
|
//!
|
||
|
|
//! # Output format support
|
||
|
|
//!
|
||
|
|
//! Plaintext-only frameworks must NEVER appear in SF1 rankings or quality metrics
|
||
|
|
//! that require layout information. Markdown frameworks appear in all rankings.
|
||
|
|
//!
|
||
|
|
//! # Aggregate key format
|
||
|
|
//!
|
||
|
|
//! Keys in `by_framework_mode` differ by framework family:
|
||
|
|
//!
|
||
|
|
//! - **kreuzberg** (`kreuzberg-*`): `{framework_name}:{mode}` — the output format is already
|
||
|
|
//! encoded in the framework name (e.g. `kreuzberg-markdown-baseline`), so repeating it in
|
||
|
|
//! the key would be redundant.
|
||
|
|
//! - **competitors** (all other frameworks): `{framework}:{output_format}:{mode}` — format is
|
||
|
|
//! not encoded in the name, so the key must carry it explicitly.
|
||
|
|
|
||
|
|
use crate::stats::{percentile_r7, sanitize_f64};
|
||
|
|
use crate::types::{BenchmarkResult, DiskSizeInfo, ErrorKind, OutputFormat};
|
||
|
|
use serde::{Deserialize, Serialize};
|
||
|
|
use std::collections::HashMap;
|
||
|
|
|
||
|
|
/// Version string the aggregator writes into the `schema_version` field of its output.
pub const SCHEMA_VERSION: &str = "2.4.0";
|
||
|
|
|
||
|
|
/// Consolidated results using new aggregation format (v2.4.0)
|
||
|
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||
|
|
pub struct NewConsolidatedResults {
|
||
|
|
/// Schema version for this output format
|
||
|
|
pub schema_version: String,
|
||
|
|
/// Aggregated results grouped by framework:output_format:mode combination
|
||
|
|
pub by_framework_mode: HashMap<String, FrameworkModeAggregation>,
|
||
|
|
/// Disk sizes for each framework
|
||
|
|
pub disk_sizes: HashMap<String, DiskSizeInfo>,
|
||
|
|
/// Cross-framework comparison rankings
|
||
|
|
pub comparison: ComparisonData,
|
||
|
|
/// Per-fixture results (one row per framework:output_format:execution_mode:fixture_id:ocr)
|
||
|
|
pub per_fixture_results: Vec<PerFixtureRow>,
|
||
|
|
/// Metadata about the consolidation
|
||
|
|
pub metadata: ConsolidationMetadata,
|
||
|
|
}
|
||
|
|
|
||
|
|
/// Per-fixture benchmark result row
|
||
|
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||
|
|
pub struct PerFixtureRow {
|
||
|
|
/// Framework name
|
||
|
|
pub framework: String,
|
||
|
|
/// Output format (markdown or plaintext)
|
||
|
|
pub output_format: OutputFormat,
|
||
|
|
/// Execution mode (single, batch, etc.)
|
||
|
|
pub execution_mode: String,
|
||
|
|
/// Whether OCR was used
|
||
|
|
pub ocr: bool,
|
||
|
|
/// Fixture ID (e.g., from file path)
|
||
|
|
pub fixture_id: String,
|
||
|
|
/// File type/extension
|
||
|
|
pub file_type: String,
|
||
|
|
/// Total duration in milliseconds
|
||
|
|
pub duration_ms: f64,
|
||
|
|
/// Peak memory usage in MB
|
||
|
|
pub peak_memory_mb: f64,
|
||
|
|
/// Text F1 score (optional)
|
||
|
|
pub f1_text: Option<f64>,
|
||
|
|
/// Layout F1 score (optional, only for markdown mode)
|
||
|
|
pub f1_layout: Option<f64>,
|
||
|
|
/// Numeric F1 score (optional)
|
||
|
|
pub f1_numeric: Option<f64>,
|
||
|
|
/// Overall quality score (optional)
|
||
|
|
pub quality_score: Option<f64>,
|
||
|
|
/// Whether extraction was correct (optional)
|
||
|
|
pub correct: Option<bool>,
|
||
|
|
/// Whether extraction succeeded
|
||
|
|
pub success: bool,
|
||
|
|
/// Error kind if failed (optional)
|
||
|
|
pub error_kind: Option<String>,
|
||
|
|
}
|
||
|
|
|
||
|
|
/// Cross-framework comparison rankings and deltas
|
||
|
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||
|
|
pub struct ComparisonData {
|
||
|
|
/// Frameworks ranked by median duration (fastest first)
|
||
|
|
pub performance_ranking: Vec<RankedFramework>,
|
||
|
|
/// Frameworks ranked by median throughput (highest first)
|
||
|
|
pub throughput_ranking: Vec<RankedFramework>,
|
||
|
|
/// Frameworks ranked by median memory usage (lowest first)
|
||
|
|
pub memory_ranking: Vec<RankedFramework>,
|
||
|
|
/// Frameworks ranked by median CPU usage (lowest first = most efficient)
|
||
|
|
#[serde(default)]
|
||
|
|
pub cpu_ranking: Vec<RankedFramework>,
|
||
|
|
/// Frameworks ranked by quality score (highest first)
|
||
|
|
pub quality_ranking: Vec<RankedFramework>,
|
||
|
|
/// PDF-only: frameworks ranked by overall quality score (highest first)
|
||
|
|
#[serde(default, skip_serializing_if = "Vec::is_empty")]
|
||
|
|
pub pdf_quality_ranking: Vec<RankedFramework>,
|
||
|
|
/// PDF-only: frameworks ranked by text F1 / TF1 (highest first) — markdown only
|
||
|
|
#[serde(default, skip_serializing_if = "Vec::is_empty")]
|
||
|
|
pub pdf_tf1_ranking_markdown: Vec<RankedFramework>,
|
||
|
|
/// PDF-only: frameworks ranked by text F1 / TF1 (highest first) — plaintext only
|
||
|
|
#[serde(default, skip_serializing_if = "Vec::is_empty")]
|
||
|
|
pub pdf_tf1_ranking_plaintext: Vec<RankedFramework>,
|
||
|
|
/// PDF-only: frameworks ranked by structural F1 / SF1 (highest first) — markdown only
|
||
|
|
#[serde(default, skip_serializing_if = "Vec::is_empty")]
|
||
|
|
pub pdf_sf1_ranking_markdown: Vec<RankedFramework>,
|
||
|
|
/// Performance deltas relative to the fastest framework
|
||
|
|
pub deltas_vs_baseline: HashMap<String, DeltaMetrics>,
|
||
|
|
}
|
||
|
|
|
||
|
|
/// A framework entry in a ranking
|
||
|
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||
|
|
pub struct RankedFramework {
|
||
|
|
/// Framework:mode key (e.g., "kreuzberg-markdown-baseline:single" or "docling:markdown:single")
|
||
|
|
pub framework_mode: String,
|
||
|
|
/// Rank (1-based)
|
||
|
|
pub rank: usize,
|
||
|
|
/// The metric value used for ranking
|
||
|
|
pub value: f64,
|
||
|
|
/// Ratio relative to the best in this ranking (1.0 = best)
|
||
|
|
pub relative: f64,
|
||
|
|
}
|
||
|
|
|
||
|
|
/// Performance deltas relative to baseline (fastest framework)
|
||
|
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||
|
|
pub struct DeltaMetrics {
|
||
|
|
/// Duration delta in ms (positive = slower)
|
||
|
|
pub duration_delta_ms: f64,
|
||
|
|
/// Duration delta as percentage
|
||
|
|
pub duration_delta_percent: f64,
|
||
|
|
/// Throughput delta in MB/s (negative = slower)
|
||
|
|
pub throughput_delta_mbs: f64,
|
||
|
|
/// Throughput delta as percentage
|
||
|
|
pub throughput_delta_percent: f64,
|
||
|
|
/// Memory delta in MB (positive = more)
|
||
|
|
pub memory_delta_mb: f64,
|
||
|
|
/// Memory delta as percentage
|
||
|
|
pub memory_delta_percent: f64,
|
||
|
|
/// CPU delta in percentage points (positive = higher CPU usage)
|
||
|
|
#[serde(default)]
|
||
|
|
pub cpu_delta_pp: f64,
|
||
|
|
/// CPU delta as percentage relative to baseline
|
||
|
|
#[serde(default)]
|
||
|
|
pub cpu_delta_percent: f64,
|
||
|
|
}
|
||
|
|
|
||
|
|
/// Metadata about the consolidation process
|
||
|
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||
|
|
pub struct ConsolidationMetadata {
|
||
|
|
/// Number of benchmark results included
|
||
|
|
pub total_results: usize,
|
||
|
|
/// Number of unique frameworks
|
||
|
|
pub framework_count: usize,
|
||
|
|
/// Number of unique file types
|
||
|
|
pub file_type_count: usize,
|
||
|
|
/// Timestamp of consolidation
|
||
|
|
pub timestamp: String,
|
||
|
|
}
|
||
|
|
|
||
|
|
/// Aggregated results for a specific framework, output format, and mode combination
|
||
|
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||
|
|
pub struct FrameworkModeAggregation {
|
||
|
|
/// Framework name (base name without mode suffix)
|
||
|
|
pub framework: String,
|
||
|
|
/// Output format (markdown or plaintext)
|
||
|
|
pub output_format: OutputFormat,
|
||
|
|
/// Mode: "single", "batch", "sync", "async"
|
||
|
|
pub mode: String,
|
||
|
|
/// Cold start duration statistics (if available)
|
||
|
|
pub cold_start: Option<DurationPercentiles>,
|
||
|
|
/// Results grouped by file type
|
||
|
|
pub by_file_type: HashMap<String, FileTypeAggregation>,
|
||
|
|
}
|
||
|
|
|
||
|
|
/// Aggregated results for a specific file type
|
||
|
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||
|
|
pub struct FileTypeAggregation {
|
||
|
|
/// File type (extension)
|
||
|
|
pub file_type: String,
|
||
|
|
/// Results without OCR
|
||
|
|
pub no_ocr: Option<PerformancePercentiles>,
|
||
|
|
/// Results with OCR
|
||
|
|
pub with_ocr: Option<PerformancePercentiles>,
|
||
|
|
}
|
||
|
|
|
||
|
|
/// Performance percentiles for a group of results
|
||
|
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||
|
|
pub struct PerformancePercentiles {
|
||
|
|
/// Number of successful samples used for percentile calculations
|
||
|
|
pub successful_sample_count: usize,
|
||
|
|
/// Total number of samples in this group (including failed)
|
||
|
|
pub total_sample_count: usize,
|
||
|
|
/// Number of framework-side extraction errors (not our fault)
|
||
|
|
pub framework_errors: usize,
|
||
|
|
/// Number of harness-side errors (potentially our fault)
|
||
|
|
pub harness_errors: usize,
|
||
|
|
/// Number of extractions that timed out
|
||
|
|
pub timeouts: usize,
|
||
|
|
/// Number of extractions that returned empty content
|
||
|
|
pub empty_content: usize,
|
||
|
|
/// Unique error messages with occurrence counts
|
||
|
|
#[serde(default, skip_serializing_if = "HashMap::is_empty")]
|
||
|
|
pub error_details: HashMap<String, usize>,
|
||
|
|
/// Throughput percentiles (p50, p95, p99) in MB/s
|
||
|
|
pub throughput: Percentiles,
|
||
|
|
/// Memory percentiles (p50, p95, p99) in MB
|
||
|
|
pub memory: Percentiles,
|
||
|
|
/// Duration percentiles (p50, p95, p99) in ms
|
||
|
|
pub duration: Percentiles,
|
||
|
|
/// CPU usage percentiles (p50, p95, p99) as percentage (0-100, normalized across cores)
|
||
|
|
#[serde(skip_serializing_if = "Option::is_none")]
|
||
|
|
pub cpu: Option<Percentiles>,
|
||
|
|
/// Success rate as percentage (0-100)
|
||
|
|
pub success_rate_percent: f64,
|
||
|
|
/// Extraction duration percentiles (p50, p95, p99) in ms
|
||
|
|
#[serde(skip_serializing_if = "Option::is_none")]
|
||
|
|
pub extraction_duration: Option<Percentiles>,
|
||
|
|
/// Quality score percentiles (p50, p95, p99) — 0.0 to 1.0
|
||
|
|
#[serde(skip_serializing_if = "Option::is_none")]
|
||
|
|
pub quality: Option<QualityPercentiles>,
|
||
|
|
}
|
||
|
|
|
||
|
|
/// Quality percentile values (p50, p95, p99) for all F1 metrics
|
||
|
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||
|
|
pub struct QualityPercentiles {
|
||
|
|
/// Text F1 50th percentile (TF1 median)
|
||
|
|
pub f1_text_p50: f64,
|
||
|
|
/// Text F1 95th percentile
|
||
|
|
pub f1_text_p95: f64,
|
||
|
|
/// Text F1 99th percentile
|
||
|
|
pub f1_text_p99: f64,
|
||
|
|
/// Numeric F1 50th percentile
|
||
|
|
pub f1_numeric_p50: f64,
|
||
|
|
/// Numeric F1 95th percentile
|
||
|
|
pub f1_numeric_p95: f64,
|
||
|
|
/// Numeric F1 99th percentile
|
||
|
|
pub f1_numeric_p99: f64,
|
||
|
|
/// Layout/structural F1 50th percentile (SF1 median) — None for plaintext-only frameworks
|
||
|
|
pub f1_layout_p50: Option<f64>,
|
||
|
|
/// Layout/structural F1 95th percentile — None for plaintext-only frameworks
|
||
|
|
pub f1_layout_p95: Option<f64>,
|
||
|
|
/// Layout/structural F1 99th percentile — None for plaintext-only frameworks
|
||
|
|
pub f1_layout_p99: Option<f64>,
|
||
|
|
/// Overall quality score 50th percentile
|
||
|
|
pub quality_score_p50: f64,
|
||
|
|
/// Overall quality score 95th percentile
|
||
|
|
pub quality_score_p95: f64,
|
||
|
|
/// Overall quality score 99th percentile
|
||
|
|
pub quality_score_p99: f64,
|
||
|
|
}
|
||
|
|
|
||
|
|
/// Percentile values for a metric
|
||
|
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||
|
|
pub struct Percentiles {
|
||
|
|
/// 50th percentile (median)
|
||
|
|
pub p50: f64,
|
||
|
|
/// 95th percentile
|
||
|
|
pub p95: f64,
|
||
|
|
/// 99th percentile
|
||
|
|
pub p99: f64,
|
||
|
|
}
|
||
|
|
|
||
|
|
/// Duration percentiles in milliseconds
|
||
|
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||
|
|
pub struct DurationPercentiles {
|
||
|
|
/// Number of samples with cold start data
|
||
|
|
pub sample_count: usize,
|
||
|
|
/// 50th percentile (median) in ms
|
||
|
|
pub p50_ms: f64,
|
||
|
|
/// 95th percentile in ms
|
||
|
|
pub p95_ms: f64,
|
||
|
|
/// 99th percentile in ms
|
||
|
|
pub p99_ms: f64,
|
||
|
|
}
|
||
|
|
|
||
|
|
/// Main aggregation function for new format
|
||
|
|
///
|
||
|
|
/// Groups results by:
|
||
|
|
/// 1. Framework and mode (extracted from framework name)
|
||
|
|
/// 2. File type (extension)
|
||
|
|
/// 3. OCR usage (yes/no)
|
||
|
|
///
|
||
|
|
/// Calculates p50/p95/p99 percentiles for each group.
|
||
|
|
pub fn aggregate_new_format(results: &[BenchmarkResult]) -> NewConsolidatedResults {
|
||
|
|
// Validate input
|
||
|
|
if results.is_empty() {
|
||
|
|
return NewConsolidatedResults {
|
||
|
|
schema_version: SCHEMA_VERSION.to_string(),
|
||
|
|
by_framework_mode: HashMap::new(),
|
||
|
|
disk_sizes: HashMap::new(),
|
||
|
|
comparison: ComparisonData {
|
||
|
|
performance_ranking: Vec::new(),
|
||
|
|
throughput_ranking: Vec::new(),
|
||
|
|
memory_ranking: Vec::new(),
|
||
|
|
cpu_ranking: Vec::new(),
|
||
|
|
quality_ranking: Vec::new(),
|
||
|
|
pdf_quality_ranking: Vec::new(),
|
||
|
|
pdf_tf1_ranking_markdown: Vec::new(),
|
||
|
|
pdf_tf1_ranking_plaintext: Vec::new(),
|
||
|
|
pdf_sf1_ranking_markdown: Vec::new(),
|
||
|
|
deltas_vs_baseline: HashMap::new(),
|
||
|
|
},
|
||
|
|
per_fixture_results: Vec::new(),
|
||
|
|
metadata: ConsolidationMetadata {
|
||
|
|
total_results: 0,
|
||
|
|
framework_count: 0,
|
||
|
|
file_type_count: 0,
|
||
|
|
timestamp: chrono::Utc::now().to_rfc3339(),
|
||
|
|
},
|
||
|
|
};
|
||
|
|
}
|
||
|
|
|
||
|
|
// Group by aggregate key (see make_aggregate_key) and file type
|
||
|
|
let mut by_framework_mode_format: HashMap<String, HashMap<String, Vec<&BenchmarkResult>>> = HashMap::new();
|
||
|
|
let mut disk_sizes: HashMap<String, DiskSizeInfo> = HashMap::new();
|
||
|
|
let mut file_types = std::collections::HashSet::new();
|
||
|
|
|
||
|
|
// Group results by their aggregate key and file type.
|
||
|
|
//
|
||
|
|
// Key format differs by family (see module-level doc):
|
||
|
|
// kreuzberg-* → "{framework_name}:{mode}"
|
||
|
|
// competitors → "{framework}:{output_format}:{mode}"
|
||
|
|
for result in results {
|
||
|
|
let (framework, mode) = extract_framework_and_mode(&result.framework);
|
||
|
|
let key = make_aggregate_key(framework, result.output_format, mode);
|
||
|
|
|
||
|
|
by_framework_mode_format
|
||
|
|
.entry(key)
|
||
|
|
.or_default()
|
||
|
|
.entry(result.file_extension.clone())
|
||
|
|
.or_default()
|
||
|
|
.push(result);
|
||
|
|
|
||
|
|
file_types.insert(result.file_extension.clone());
|
||
|
|
|
||
|
|
// Collect disk sizes
|
||
|
|
if let Some(disk_size) = &result.framework_capabilities.installation_size {
|
||
|
|
disk_sizes.insert(framework.to_string(), disk_size.clone());
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
// Aggregate each key combination.
|
||
|
|
//
|
||
|
|
// Key shapes (see make_aggregate_key):
|
||
|
|
// kreuzberg-* → "framework_name:mode" (2 colon-separated parts)
|
||
|
|
// competitors → "framework:output_format:mode" (3 colon-separated parts)
|
||
|
|
let mut aggregated_by_framework_mode = HashMap::new();
|
||
|
|
|
||
|
|
for (framework_mode_format_key, file_type_results) in by_framework_mode_format {
|
||
|
|
// Retrieve output_format from the first result in this group; it is the same for all
|
||
|
|
// entries in the group because the key was built from result.output_format.
|
||
|
|
let output_format = file_type_results
|
||
|
|
.values()
|
||
|
|
.flatten()
|
||
|
|
.next()
|
||
|
|
.map(|r| r.output_format)
|
||
|
|
.unwrap_or(OutputFormat::Markdown);
|
||
|
|
|
||
|
|
let (framework, mode) = parse_aggregate_key(&framework_mode_format_key);
|
||
|
|
|
||
|
|
// Collect all results for this aggregate key group for cold start calculation
|
||
|
|
let all_results: Vec<&BenchmarkResult> = file_type_results.values().flat_map(|v| v.iter().copied()).collect();
|
||
|
|
let cold_start = aggregate_cold_starts(&all_results);
|
||
|
|
|
||
|
|
// Aggregate by file type
|
||
|
|
let mut by_file_type = HashMap::new();
|
||
|
|
for (file_type, results_for_type) in file_type_results {
|
||
|
|
let aggregation = aggregate_by_ocr_status(&results_for_type);
|
||
|
|
by_file_type.insert(
|
||
|
|
file_type.clone(),
|
||
|
|
FileTypeAggregation {
|
||
|
|
file_type: file_type.clone(),
|
||
|
|
no_ocr: aggregation.0,
|
||
|
|
with_ocr: aggregation.1,
|
||
|
|
},
|
||
|
|
);
|
||
|
|
}
|
||
|
|
|
||
|
|
aggregated_by_framework_mode.insert(
|
||
|
|
framework_mode_format_key.clone(),
|
||
|
|
FrameworkModeAggregation {
|
||
|
|
framework: framework.to_string(),
|
||
|
|
output_format,
|
||
|
|
mode: mode.to_string(),
|
||
|
|
cold_start,
|
||
|
|
by_file_type,
|
||
|
|
},
|
||
|
|
);
|
||
|
|
}
|
||
|
|
|
||
|
|
// Build per-fixture results
|
||
|
|
let per_fixture_results = build_per_fixture_results(results);
|
||
|
|
|
||
|
|
let metadata = ConsolidationMetadata {
|
||
|
|
total_results: results.len(),
|
||
|
|
framework_count: aggregated_by_framework_mode.len(),
|
||
|
|
file_type_count: file_types.len(),
|
||
|
|
timestamp: chrono::Utc::now().to_rfc3339(),
|
||
|
|
};
|
||
|
|
|
||
|
|
let comparison = build_comparison(&aggregated_by_framework_mode);
|
||
|
|
|
||
|
|
NewConsolidatedResults {
|
||
|
|
schema_version: SCHEMA_VERSION.to_string(),
|
||
|
|
by_framework_mode: aggregated_by_framework_mode,
|
||
|
|
disk_sizes,
|
||
|
|
comparison,
|
||
|
|
per_fixture_results,
|
||
|
|
metadata,
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
/// Build per-fixture result rows from raw benchmark results
|
||
|
|
///
|
||
|
|
/// Extracts one row per (framework, output_format, execution_mode, fixture_id, ocr) group.
|
||
|
|
/// Fixture ID is derived from the file path (filename without extension).
|
||
|
|
fn build_per_fixture_results(results: &[BenchmarkResult]) -> Vec<PerFixtureRow> {
|
||
|
|
let mut fixture_rows = Vec::new();
|
||
|
|
|
||
|
|
for result in results {
|
||
|
|
let (framework, mode) = extract_framework_and_mode(&result.framework);
|
||
|
|
let fixture_id = result
|
||
|
|
.file_path
|
||
|
|
.file_stem()
|
||
|
|
.and_then(|stem| stem.to_str())
|
||
|
|
.unwrap_or("unknown")
|
||
|
|
.to_string();
|
||
|
|
|
||
|
|
let ocr = matches!(result.ocr_status, crate::types::OcrStatus::Used);
|
||
|
|
let error_kind = if !result.success {
|
||
|
|
Some(format!("{:?}", result.error_kind))
|
||
|
|
} else {
|
||
|
|
None
|
||
|
|
};
|
||
|
|
|
||
|
|
let (f1_text, f1_layout, f1_numeric, quality_score, correct) = if let Some(q) = &result.quality {
|
||
|
|
(
|
||
|
|
Some(q.f1_score_text),
|
||
|
|
q.f1_score_layout,
|
||
|
|
Some(q.f1_score_numeric),
|
||
|
|
Some(q.quality_score),
|
||
|
|
Some(q.correct),
|
||
|
|
)
|
||
|
|
} else {
|
||
|
|
(None, None, None, None, None)
|
||
|
|
};
|
||
|
|
|
||
|
|
fixture_rows.push(PerFixtureRow {
|
||
|
|
framework: framework.to_string(),
|
||
|
|
output_format: result.output_format,
|
||
|
|
execution_mode: mode.to_string(),
|
||
|
|
ocr,
|
||
|
|
fixture_id,
|
||
|
|
file_type: result.file_extension.clone(),
|
||
|
|
duration_ms: result.duration.as_secs_f64() * 1000.0,
|
||
|
|
peak_memory_mb: result.metrics.peak_memory_bytes as f64 / 1_000_000.0,
|
||
|
|
f1_text,
|
||
|
|
f1_layout,
|
||
|
|
f1_numeric,
|
||
|
|
quality_score,
|
||
|
|
correct,
|
||
|
|
success: result.success,
|
||
|
|
error_kind,
|
||
|
|
});
|
||
|
|
}
|
||
|
|
|
||
|
|
fixture_rows
|
||
|
|
}
|
||
|
|
|
||
|
|
/// Aggregate results by OCR status
|
||
|
|
///
|
||
|
|
/// Returns (no_ocr, with_ocr) tuple of PerformancePercentiles
|
||
|
|
fn aggregate_by_ocr_status(
|
||
|
|
results: &[&BenchmarkResult],
|
||
|
|
) -> (Option<PerformancePercentiles>, Option<PerformancePercentiles>) {
|
||
|
|
use crate::types::OcrStatus;
|
||
|
|
|
||
|
|
// OCR status grouping:
|
||
|
|
// - OcrStatus::Used → "with_ocr" group
|
||
|
|
// - OcrStatus::NotUsed → "no_ocr" group
|
||
|
|
// - OcrStatus::Unknown → infer from file type: image formats → "with_ocr", others → "no_ocr"
|
||
|
|
let is_ocr_result = |r: &&BenchmarkResult| -> bool {
|
||
|
|
match r.ocr_status {
|
||
|
|
OcrStatus::Used => true,
|
||
|
|
OcrStatus::NotUsed => false,
|
||
|
|
OcrStatus::Unknown => matches!(
|
||
|
|
r.file_extension.to_lowercase().as_str(),
|
||
|
|
"jpg" | "jpeg" | "png" | "gif" | "bmp" | "tiff" | "tif" | "webp" | "jp2" | "jpx" | "jpm" | "mj2"
|
||
|
|
),
|
||
|
|
}
|
||
|
|
};
|
||
|
|
|
||
|
|
let no_ocr: Vec<&BenchmarkResult> = results.iter().filter(|r| !is_ocr_result(r)).copied().collect();
|
||
|
|
|
||
|
|
let with_ocr: Vec<&BenchmarkResult> = results.iter().filter(|r| is_ocr_result(r)).copied().collect();
|
||
|
|
|
||
|
|
let no_ocr_stats = if !no_ocr.is_empty() {
|
||
|
|
Some(calculate_percentiles(&no_ocr))
|
||
|
|
} else {
|
||
|
|
None
|
||
|
|
};
|
||
|
|
|
||
|
|
let with_ocr_stats = if !with_ocr.is_empty() {
|
||
|
|
Some(calculate_percentiles(&with_ocr))
|
||
|
|
} else {
|
||
|
|
None
|
||
|
|
};
|
||
|
|
|
||
|
|
(no_ocr_stats, with_ocr_stats)
|
||
|
|
}
|
||
|
|
|
||
|
|
/// Calculate percentiles for a group of results
|
||
|
|
///
|
||
|
|
/// Only uses successful results for metric calculations.
|
||
|
|
/// Success rate is calculated from all results.
|
||
|
|
fn calculate_percentiles(results: &[&BenchmarkResult]) -> PerformancePercentiles {
|
||
|
|
let successful: Vec<&BenchmarkResult> = results.iter().filter(|r| r.success).copied().collect();
|
||
|
|
|
||
|
|
// Extract values for percentile calculation with NaN filtering - HIGH PRIORITY FIX
|
||
|
|
let mut durations: Vec<f64> = successful
|
||
|
|
.iter()
|
||
|
|
.map(|r| r.duration.as_secs_f64() * 1000.0)
|
||
|
|
.filter(|&v| !v.is_nan() && v.is_finite())
|
||
|
|
.collect();
|
||
|
|
|
||
|
|
let mut throughputs: Vec<f64> = successful
|
||
|
|
.iter()
|
||
|
|
.map(|r| r.metrics.throughput_bytes_per_sec / 1_000_000.0) // Convert to MB/s
|
||
|
|
.filter(|&v| v > 0.0 && v.is_finite()) // Filter zero values (invalid measurements)
|
||
|
|
.collect();
|
||
|
|
|
||
|
|
let mut memories: Vec<f64> = successful
|
||
|
|
.iter()
|
||
|
|
.map(|r| r.metrics.peak_memory_bytes as f64 / 1_000_000.0) // Convert to MB
|
||
|
|
.filter(|&v| !v.is_nan() && v.is_finite())
|
||
|
|
.collect();
|
||
|
|
|
||
|
|
let mut extraction_durations: Vec<f64> = successful
|
||
|
|
.iter()
|
||
|
|
.filter_map(|r| r.extraction_duration.map(|d| d.as_secs_f64() * 1000.0))
|
||
|
|
.filter(|&v| !v.is_nan() && v.is_finite())
|
||
|
|
.collect();
|
||
|
|
|
||
|
|
let mut cpus: Vec<f64> = successful
|
||
|
|
.iter()
|
||
|
|
.map(|r| r.metrics.avg_cpu_percent)
|
||
|
|
.filter(|&v| v > 0.0 && v.is_finite())
|
||
|
|
.collect();
|
||
|
|
|
||
|
|
// Sort for percentile calculation (NaN-safe)
|
||
|
|
durations.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
|
||
|
|
throughputs.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
|
||
|
|
memories.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
|
||
|
|
extraction_durations.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
|
||
|
|
cpus.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
|
||
|
|
|
||
|
|
// Build percentiles with NaN/Inf validation
|
||
|
|
let duration = Percentiles {
|
||
|
|
p50: sanitize_f64(percentile_r7(&durations, 0.50)),
|
||
|
|
p95: sanitize_f64(percentile_r7(&durations, 0.95)),
|
||
|
|
p99: sanitize_f64(percentile_r7(&durations, 0.99)),
|
||
|
|
};
|
||
|
|
|
||
|
|
let throughput = Percentiles {
|
||
|
|
p50: sanitize_f64(percentile_r7(&throughputs, 0.50)),
|
||
|
|
p95: sanitize_f64(percentile_r7(&throughputs, 0.95)),
|
||
|
|
p99: sanitize_f64(percentile_r7(&throughputs, 0.99)),
|
||
|
|
};
|
||
|
|
|
||
|
|
let memory = Percentiles {
|
||
|
|
p50: sanitize_f64(percentile_r7(&memories, 0.50)),
|
||
|
|
p95: sanitize_f64(percentile_r7(&memories, 0.95)),
|
||
|
|
p99: sanitize_f64(percentile_r7(&memories, 0.99)),
|
||
|
|
};
|
||
|
|
|
||
|
|
let extraction_duration = if !extraction_durations.is_empty() {
|
||
|
|
Some(Percentiles {
|
||
|
|
p50: sanitize_f64(percentile_r7(&extraction_durations, 0.50)),
|
||
|
|
p95: sanitize_f64(percentile_r7(&extraction_durations, 0.95)),
|
||
|
|
p99: sanitize_f64(percentile_r7(&extraction_durations, 0.99)),
|
||
|
|
})
|
||
|
|
} else {
|
||
|
|
None
|
||
|
|
};
|
||
|
|
|
||
|
|
let cpu = if !cpus.is_empty() {
|
||
|
|
Some(Percentiles {
|
||
|
|
p50: sanitize_f64(percentile_r7(&cpus, 0.50)),
|
||
|
|
p95: sanitize_f64(percentile_r7(&cpus, 0.95)),
|
||
|
|
p99: sanitize_f64(percentile_r7(&cpus, 0.99)),
|
||
|
|
})
|
||
|
|
} else {
|
||
|
|
None
|
||
|
|
};
|
||
|
|
|
||
|
|
let success_rate_percent = if !results.is_empty() {
|
||
|
|
(successful.len() as f64 / results.len() as f64) * 100.0
|
||
|
|
} else {
|
||
|
|
0.0
|
||
|
|
};
|
||
|
|
|
||
|
|
let framework_errors = results
|
||
|
|
.iter()
|
||
|
|
.filter(|r| r.error_kind == ErrorKind::FrameworkError)
|
||
|
|
.count();
|
||
|
|
let harness_errors = results
|
||
|
|
.iter()
|
||
|
|
.filter(|r| r.error_kind == ErrorKind::HarnessError)
|
||
|
|
.count();
|
||
|
|
let timeouts = results.iter().filter(|r| r.error_kind == ErrorKind::Timeout).count();
|
||
|
|
let empty_content = results
|
||
|
|
.iter()
|
||
|
|
.filter(|r| r.error_kind == ErrorKind::EmptyContent)
|
||
|
|
.count();
|
||
|
|
|
||
|
|
let mut error_details: HashMap<String, usize> = HashMap::new();
|
||
|
|
for result in results.iter().filter(|r| !r.success) {
|
||
|
|
if let Some(msg) = &result.error_message {
|
||
|
|
*error_details.entry(msg.clone()).or_insert(0) += 1;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
// Quality percentiles
|
||
|
|
let quality = {
|
||
|
|
let mut f1_texts: Vec<f64> = successful
|
||
|
|
.iter()
|
||
|
|
.filter_map(|r| r.quality.as_ref().map(|q| q.f1_score_text))
|
||
|
|
.filter(|v| !v.is_nan() && v.is_finite())
|
||
|
|
.collect();
|
||
|
|
let mut f1_numerics: Vec<f64> = successful
|
||
|
|
.iter()
|
||
|
|
.filter_map(|r| r.quality.as_ref().map(|q| q.f1_score_numeric))
|
||
|
|
.filter(|v| !v.is_nan() && v.is_finite())
|
||
|
|
.collect();
|
||
|
|
let mut f1_layouts: Vec<f64> = successful
|
||
|
|
.iter()
|
||
|
|
.filter_map(|r| r.quality.as_ref().and_then(|q| q.f1_score_layout))
|
||
|
|
.filter(|v| !v.is_nan() && v.is_finite())
|
||
|
|
.collect();
|
||
|
|
let mut quality_scores: Vec<f64> = successful
|
||
|
|
.iter()
|
||
|
|
.filter_map(|r| r.quality.as_ref().map(|q| q.quality_score))
|
||
|
|
.filter(|v| !v.is_nan() && v.is_finite())
|
||
|
|
.collect();
|
||
|
|
|
||
|
|
if !quality_scores.is_empty() {
|
||
|
|
f1_texts.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
|
||
|
|
f1_numerics.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
|
||
|
|
f1_layouts.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
|
||
|
|
quality_scores.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
|
||
|
|
|
||
|
|
// f1_layout is None if all results have f1_score_layout = None (plaintext mode)
|
||
|
|
let f1_layout_p50 = if !f1_layouts.is_empty() {
|
||
|
|
Some(sanitize_f64(percentile_r7(&f1_layouts, 0.50)))
|
||
|
|
} else {
|
||
|
|
None
|
||
|
|
};
|
||
|
|
let f1_layout_p95 = if !f1_layouts.is_empty() {
|
||
|
|
Some(sanitize_f64(percentile_r7(&f1_layouts, 0.95)))
|
||
|
|
} else {
|
||
|
|
None
|
||
|
|
};
|
||
|
|
let f1_layout_p99 = if !f1_layouts.is_empty() {
|
||
|
|
Some(sanitize_f64(percentile_r7(&f1_layouts, 0.99)))
|
||
|
|
} else {
|
||
|
|
None
|
||
|
|
};
|
||
|
|
|
||
|
|
Some(QualityPercentiles {
|
||
|
|
f1_text_p50: sanitize_f64(percentile_r7(&f1_texts, 0.50)),
|
||
|
|
f1_text_p95: sanitize_f64(percentile_r7(&f1_texts, 0.95)),
|
||
|
|
f1_text_p99: sanitize_f64(percentile_r7(&f1_texts, 0.99)),
|
||
|
|
f1_numeric_p50: sanitize_f64(percentile_r7(&f1_numerics, 0.50)),
|
||
|
|
f1_numeric_p95: sanitize_f64(percentile_r7(&f1_numerics, 0.95)),
|
||
|
|
f1_numeric_p99: sanitize_f64(percentile_r7(&f1_numerics, 0.99)),
|
||
|
|
f1_layout_p50,
|
||
|
|
f1_layout_p95,
|
||
|
|
f1_layout_p99,
|
||
|
|
quality_score_p50: sanitize_f64(percentile_r7(&quality_scores, 0.50)),
|
||
|
|
quality_score_p95: sanitize_f64(percentile_r7(&quality_scores, 0.95)),
|
||
|
|
quality_score_p99: sanitize_f64(percentile_r7(&quality_scores, 0.99)),
|
||
|
|
})
|
||
|
|
} else {
|
||
|
|
None
|
||
|
|
}
|
||
|
|
};
|
||
|
|
|
||
|
|
PerformancePercentiles {
|
||
|
|
successful_sample_count: successful.len(),
|
||
|
|
total_sample_count: results.len(),
|
||
|
|
framework_errors,
|
||
|
|
harness_errors,
|
||
|
|
timeouts,
|
||
|
|
empty_content,
|
||
|
|
error_details,
|
||
|
|
throughput,
|
||
|
|
memory,
|
||
|
|
duration,
|
||
|
|
cpu,
|
||
|
|
success_rate_percent,
|
||
|
|
extraction_duration,
|
||
|
|
quality,
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
/// Aggregate cold start durations
|
||
|
|
///
|
||
|
|
/// Returns percentiles of cold start durations if any results have cold start data.
|
||
|
|
fn aggregate_cold_starts(results: &[&BenchmarkResult]) -> Option<DurationPercentiles> {
|
||
|
|
let cold_starts: Vec<f64> = results
|
||
|
|
.iter()
|
||
|
|
.filter_map(|r| r.cold_start_duration.map(|d| d.as_secs_f64() * 1000.0))
|
||
|
|
.filter(|&v| !v.is_nan() && v.is_finite()) // HIGH PRIORITY FIX: NaN filtering
|
||
|
|
.collect();
|
||
|
|
|
||
|
|
if cold_starts.is_empty() {
|
||
|
|
return None;
|
||
|
|
}
|
||
|
|
|
||
|
|
let mut sorted = cold_starts.clone();
|
||
|
|
sorted.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
|
||
|
|
|
||
|
|
Some(DurationPercentiles {
|
||
|
|
sample_count: cold_starts.len(),
|
||
|
|
p50_ms: sanitize_f64(percentile_r7(&sorted, 0.50)),
|
||
|
|
p95_ms: sanitize_f64(percentile_r7(&sorted, 0.95)),
|
||
|
|
p99_ms: sanitize_f64(percentile_r7(&sorted, 0.99)),
|
||
|
|
})
|
||
|
|
}
|
||
|
|
|
||
|
|
/// Split a raw framework string into its base name and execution mode.
///
/// A trailing `-batch` selects mode `"batch"`; every other name is mode `"single"`.
/// After the mode has been determined, one legacy `-sync` or `-async` suffix is removed
/// from the remaining name. Current adapters no longer emit these suffixes, but
/// historical result files still contain them.
///
/// Returns `(framework_name, mode)` where `mode` is `"batch"` or `"single"`.
fn extract_framework_and_mode(framework_name: &str) -> (&str, &str) {
    // Step 1: the mode is decided solely by the `-batch` suffix.
    let (base, mode) = match framework_name.strip_suffix("-batch") {
        Some(stripped) => (stripped, "batch"),
        None => (framework_name, "single"),
    };

    // Step 2: drop a legacy -sync / -async suffix, if present.
    let normalized = match base.strip_suffix("-sync") {
        Some(stripped) => stripped,
        None => base.strip_suffix("-async").unwrap_or(base),
    };

    (normalized, mode)
}
|
||
|
|
|
||
|
|
/// Build the `by_framework_mode` map key for a result.
|
||
|
|
///
|
||
|
|
/// - `kreuzberg-*` frameworks already encode the output format in their name, so the key is
|
||
|
|
/// `"{framework}:{mode}"` — no redundant format component.
|
||
|
|
/// - All other (competitor) frameworks use `"{framework}:{output_format}:{mode}"`.
|
||
|
|
fn make_aggregate_key(framework: &str, output_format: OutputFormat, mode: &str) -> String {
|
||
|
|
if framework.starts_with("kreuzberg-") {
|
||
|
|
format!("{framework}:{mode}")
|
||
|
|
} else {
|
||
|
|
format!("{framework}:{output_format}:{mode}")
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
/// Parse an aggregate key back into `(framework, mode)`.
///
/// Handles both key shapes produced by [`make_aggregate_key`]:
/// - `"framework:mode"` (kreuzberg family, 2 parts)
/// - `"framework:output_format:mode"` (competitors, 3 parts)
///
/// The mode is whatever follows the last colon; the framework is whatever precedes
/// the first colon. A key that contains no colon at all is treated as a bare
/// framework name and reported with the default mode `"single"`.
fn parse_aggregate_key(key: &str) -> (&str, &str) {
    match key.rsplit_once(':') {
        Some((prefix, mode)) => {
            // For kreuzberg keys `prefix` is just the framework name. For competitor
            // keys it is "framework:output_format", of which only the part before
            // the first colon is wanted.
            let framework = prefix.split_once(':').map_or(prefix, |(name, _)| name);
            (framework, mode)
        }
        // No separator: there is no mode component to read, so fall back to "single"
        // instead of echoing the whole key back as the mode.
        None => (key, "single"),
    }
}
|
||
|
|
|
||
|
|
/// Build cross-framework comparison rankings from aggregated data
|
||
|
|
///
|
||
|
|
/// Metrics are weighted by successful_sample_count so that file types with more
|
||
|
|
/// samples (e.g., 93 PDFs) dominate the ranking over file types with fewer samples
|
||
|
|
/// (e.g., 1 BMP). This prevents frameworks that handle more file types or do OCR
|
||
|
|
/// from being unfairly penalized in the overall ranking.
|
||
|
|
fn build_comparison(by_framework_mode: &HashMap<String, FrameworkModeAggregation>) -> ComparisonData {
|
||
|
|
// Collect weighted median metrics per framework:mode
|
||
|
|
// (key, duration_p50, throughput_p50, memory_p50, quality_p50, cpu_p50)
|
||
|
|
let mut metrics: Vec<(String, f64, f64, f64, f64, f64)> = Vec::new();
|
||
|
|
|
||
|
|
for (key, agg) in by_framework_mode {
|
||
|
|
// (value, weight) pairs for weighted averaging
|
||
|
|
let mut durations: Vec<(f64, usize)> = Vec::new();
|
||
|
|
let mut throughputs: Vec<(f64, usize)> = Vec::new();
|
||
|
|
let mut memories: Vec<(f64, usize)> = Vec::new();
|
||
|
|
let mut qualities: Vec<(f64, usize)> = Vec::new();
|
||
|
|
let mut cpus: Vec<(f64, usize)> = Vec::new();
|
||
|
|
|
||
|
|
for ft in agg.by_file_type.values() {
|
||
|
|
for perf in [&ft.no_ocr, &ft.with_ocr].into_iter().flatten() {
|
||
|
|
// Skip groups where all samples failed — their 0.0 values would
|
||
|
|
// pollute rankings (e.g., docling showing 0.0ms when libGL is missing).
|
||
|
|
if perf.successful_sample_count == 0 {
|
||
|
|
continue;
|
||
|
|
}
|
||
|
|
let weight = perf.successful_sample_count;
|
||
|
|
durations.push((perf.duration.p50, weight));
|
||
|
|
throughputs.push((perf.throughput.p50, weight));
|
||
|
|
memories.push((perf.memory.p50, weight));
|
||
|
|
if let Some(q) = &perf.quality {
|
||
|
|
qualities.push((q.quality_score_p50, weight));
|
||
|
|
}
|
||
|
|
if let Some(c) = &perf.cpu {
|
||
|
|
cpus.push((c.p50, weight));
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
if durations.is_empty() {
|
||
|
|
continue;
|
||
|
|
}
|
||
|
|
|
||
|
|
let weighted_avg = |items: &[(f64, usize)]| -> f64 {
|
||
|
|
let finite: Vec<(f64, usize)> = items.iter().copied().filter(|(v, _)| v.is_finite()).collect();
|
||
|
|
let total_weight: usize = finite.iter().map(|(_, w)| w).sum();
|
||
|
|
if total_weight == 0 {
|
||
|
|
f64::NAN
|
||
|
|
} else {
|
||
|
|
finite.iter().map(|(v, w)| v * (*w as f64)).sum::<f64>() / total_weight as f64
|
||
|
|
}
|
||
|
|
};
|
||
|
|
|
||
|
|
metrics.push((
|
||
|
|
key.clone(),
|
||
|
|
weighted_avg(&durations),
|
||
|
|
weighted_avg(&throughputs),
|
||
|
|
weighted_avg(&memories),
|
||
|
|
weighted_avg(&qualities),
|
||
|
|
weighted_avg(&cpus),
|
||
|
|
));
|
||
|
|
}
|
||
|
|
|
||
|
|
// Performance ranking (lower duration = better, rank 1)
|
||
|
|
let mut perf = metrics.clone();
|
||
|
|
perf.retain(|m| m.1.is_finite());
|
||
|
|
perf.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap_or(std::cmp::Ordering::Equal));
|
||
|
|
let baseline_dur = perf.first().map(|r| r.1).unwrap_or(1.0);
|
||
|
|
let performance_ranking: Vec<RankedFramework> = perf
|
||
|
|
.iter()
|
||
|
|
.enumerate()
|
||
|
|
.map(|(i, (k, v, ..))| RankedFramework {
|
||
|
|
framework_mode: k.clone(),
|
||
|
|
rank: i + 1,
|
||
|
|
value: *v,
|
||
|
|
relative: if baseline_dur > 0.0 { *v / baseline_dur } else { 1.0 },
|
||
|
|
})
|
||
|
|
.collect();
|
||
|
|
|
||
|
|
// Throughput ranking (higher = better)
|
||
|
|
let mut thr = metrics.clone();
|
||
|
|
thr.retain(|m| m.2.is_finite());
|
||
|
|
thr.sort_by(|a, b| b.2.partial_cmp(&a.2).unwrap_or(std::cmp::Ordering::Equal));
|
||
|
|
let baseline_thr = thr.first().map(|r| r.2).unwrap_or(1.0);
|
||
|
|
let throughput_ranking: Vec<RankedFramework> = thr
|
||
|
|
.iter()
|
||
|
|
.enumerate()
|
||
|
|
.map(|(i, (k, _, v, ..))| RankedFramework {
|
||
|
|
framework_mode: k.clone(),
|
||
|
|
rank: i + 1,
|
||
|
|
value: *v,
|
||
|
|
relative: if baseline_thr > 0.0 { *v / baseline_thr } else { 1.0 },
|
||
|
|
})
|
||
|
|
.collect();
|
||
|
|
|
||
|
|
// Memory ranking (lower = better)
|
||
|
|
let mut mem = metrics.clone();
|
||
|
|
mem.retain(|m| m.3.is_finite());
|
||
|
|
mem.sort_by(|a, b| a.3.partial_cmp(&b.3).unwrap_or(std::cmp::Ordering::Equal));
|
||
|
|
let baseline_mem = mem.first().map(|r| r.3).unwrap_or(1.0);
|
||
|
|
let memory_ranking: Vec<RankedFramework> = mem
|
||
|
|
.iter()
|
||
|
|
.enumerate()
|
||
|
|
.map(|(i, (k, _, _, v, ..))| RankedFramework {
|
||
|
|
framework_mode: k.clone(),
|
||
|
|
rank: i + 1,
|
||
|
|
value: *v,
|
||
|
|
relative: if baseline_mem > 0.0 { *v / baseline_mem } else { 1.0 },
|
||
|
|
})
|
||
|
|
.collect();
|
||
|
|
|
||
|
|
// CPU ranking (lower = more efficient, rank 1)
|
||
|
|
let mut cpu = metrics.clone();
|
||
|
|
cpu.retain(|m| m.5.is_finite());
|
||
|
|
cpu.sort_by(|a, b| a.5.partial_cmp(&b.5).unwrap_or(std::cmp::Ordering::Equal));
|
||
|
|
let baseline_cpu = cpu.first().map(|r| r.5).unwrap_or(1.0);
|
||
|
|
let cpu_ranking: Vec<RankedFramework> = cpu
|
||
|
|
.iter()
|
||
|
|
.enumerate()
|
||
|
|
.map(|(i, (k, _, _, _, _, v))| RankedFramework {
|
||
|
|
framework_mode: k.clone(),
|
||
|
|
rank: i + 1,
|
||
|
|
value: *v,
|
||
|
|
relative: if baseline_cpu > 0.0 { *v / baseline_cpu } else { 1.0 },
|
||
|
|
})
|
||
|
|
.collect();
|
||
|
|
|
||
|
|
// Quality ranking (higher = better)
|
||
|
|
let mut qual = metrics.clone();
|
||
|
|
qual.retain(|m| m.4.is_finite());
|
||
|
|
qual.sort_by(|a, b| b.4.partial_cmp(&a.4).unwrap_or(std::cmp::Ordering::Equal));
|
||
|
|
let baseline_qual = qual.first().map(|r| r.4).unwrap_or(1.0);
|
||
|
|
let quality_ranking: Vec<RankedFramework> = qual
|
||
|
|
.iter()
|
||
|
|
.enumerate()
|
||
|
|
.map(|(i, (k, _, _, _, v, _))| RankedFramework {
|
||
|
|
framework_mode: k.clone(),
|
||
|
|
rank: i + 1,
|
||
|
|
value: *v,
|
||
|
|
relative: if baseline_qual > 0.0 { *v / baseline_qual } else { 1.0 },
|
||
|
|
})
|
||
|
|
.collect();
|
||
|
|
|
||
|
|
// Deltas vs baseline (fastest framework)
|
||
|
|
let mut deltas_vs_baseline = HashMap::new();
|
||
|
|
if let Some(baseline) = metrics
|
||
|
|
.iter()
|
||
|
|
.filter(|(_, dur, _, _, _, _)| dur.is_finite())
|
||
|
|
.min_by(|a, b| a.1.partial_cmp(&b.1).unwrap_or(std::cmp::Ordering::Equal))
|
||
|
|
{
|
||
|
|
for (k, dur, thr, mem_val, _, cpu_val) in &metrics {
|
||
|
|
if k != &baseline.0 {
|
||
|
|
deltas_vs_baseline.insert(
|
||
|
|
k.clone(),
|
||
|
|
DeltaMetrics {
|
||
|
|
duration_delta_ms: dur - baseline.1,
|
||
|
|
duration_delta_percent: if baseline.1 > 0.0 {
|
||
|
|
((dur - baseline.1) / baseline.1) * 100.0
|
||
|
|
} else {
|
||
|
|
0.0
|
||
|
|
},
|
||
|
|
throughput_delta_mbs: thr - baseline.2,
|
||
|
|
throughput_delta_percent: if baseline.2 > 0.0 {
|
||
|
|
((thr - baseline.2) / baseline.2) * 100.0
|
||
|
|
} else {
|
||
|
|
0.0
|
||
|
|
},
|
||
|
|
memory_delta_mb: mem_val - baseline.3,
|
||
|
|
memory_delta_percent: if baseline.3 > 0.0 {
|
||
|
|
((mem_val - baseline.3) / baseline.3) * 100.0
|
||
|
|
} else {
|
||
|
|
0.0
|
||
|
|
},
|
||
|
|
cpu_delta_pp: cpu_val - baseline.5,
|
||
|
|
cpu_delta_percent: if baseline.5 > 0.0 {
|
||
|
|
((cpu_val - baseline.5) / baseline.5) * 100.0
|
||
|
|
} else {
|
||
|
|
0.0
|
||
|
|
},
|
||
|
|
},
|
||
|
|
);
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
// PDF-specific quality rankings (quality, TF1, SF1)
|
||
|
|
// Collect PDF quality metrics per framework:output_format:mode
|
||
|
|
// (key, quality, tf1, sf1, output_format)
|
||
|
|
let mut pdf_metrics: Vec<(String, f64, f64, f64, OutputFormat)> = Vec::new();
|
||
|
|
for (key, agg) in by_framework_mode {
|
||
|
|
if let Some(pdf_ft) = agg.by_file_type.get("pdf") {
|
||
|
|
let mut qualities: Vec<(f64, usize)> = Vec::new();
|
||
|
|
let mut tf1s: Vec<(f64, usize)> = Vec::new();
|
||
|
|
let mut sf1s: Vec<(f64, usize)> = Vec::new();
|
||
|
|
for perf in [&pdf_ft.no_ocr, &pdf_ft.with_ocr].into_iter().flatten() {
|
||
|
|
if perf.successful_sample_count == 0 {
|
||
|
|
continue;
|
||
|
|
}
|
||
|
|
if let Some(q) = &perf.quality {
|
||
|
|
let w = perf.successful_sample_count;
|
||
|
|
qualities.push((q.quality_score_p50, w));
|
||
|
|
tf1s.push((q.f1_text_p50, w));
|
||
|
|
// Only include f1_layout if present (markdown mode)
|
||
|
|
if let Some(layout) = q.f1_layout_p50 {
|
||
|
|
sf1s.push((layout, w));
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
let weighted_avg = |items: &[(f64, usize)]| -> f64 {
|
||
|
|
let finite: Vec<(f64, usize)> = items.iter().copied().filter(|(v, _)| v.is_finite()).collect();
|
||
|
|
let total_weight: usize = finite.iter().map(|(_, w)| w).sum();
|
||
|
|
if total_weight == 0 {
|
||
|
|
f64::NAN
|
||
|
|
} else {
|
||
|
|
finite.iter().map(|(v, w)| v * (*w as f64)).sum::<f64>() / total_weight as f64
|
||
|
|
}
|
||
|
|
};
|
||
|
|
let q = weighted_avg(&qualities);
|
||
|
|
let t = weighted_avg(&tf1s);
|
||
|
|
let s = weighted_avg(&sf1s);
|
||
|
|
if q.is_finite() {
|
||
|
|
pdf_metrics.push((key.clone(), q, t, s, agg.output_format));
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
let build_ranking = |items: &mut Vec<(String, f64)>| -> Vec<RankedFramework> {
|
||
|
|
items.retain(|(_, v)| v.is_finite());
|
||
|
|
items.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
|
||
|
|
let best = items.first().map(|r| r.1).unwrap_or(1.0);
|
||
|
|
items
|
||
|
|
.iter()
|
||
|
|
.enumerate()
|
||
|
|
.map(|(i, (k, v))| RankedFramework {
|
||
|
|
framework_mode: k.clone(),
|
||
|
|
rank: i + 1,
|
||
|
|
value: *v,
|
||
|
|
relative: if best > 0.0 { *v / best } else { 1.0 },
|
||
|
|
})
|
||
|
|
.collect()
|
||
|
|
};
|
||
|
|
|
||
|
|
let mut pdf_qual_items: Vec<(String, f64)> = pdf_metrics.iter().map(|(k, q, _, _, _)| (k.clone(), *q)).collect();
|
||
|
|
let mut pdf_tf1_markdown: Vec<(String, f64)> = pdf_metrics
|
||
|
|
.iter()
|
||
|
|
.filter(|(_, _, _, _, fmt)| *fmt == OutputFormat::Markdown)
|
||
|
|
.map(|(k, _, t, _, _)| (k.clone(), *t))
|
||
|
|
.collect();
|
||
|
|
let mut pdf_tf1_plaintext: Vec<(String, f64)> = pdf_metrics
|
||
|
|
.iter()
|
||
|
|
.filter(|(_, _, _, _, fmt)| *fmt == OutputFormat::Plaintext)
|
||
|
|
.map(|(k, _, t, _, _)| (k.clone(), *t))
|
||
|
|
.collect();
|
||
|
|
let mut pdf_sf1_markdown: Vec<(String, f64)> = pdf_metrics
|
||
|
|
.iter()
|
||
|
|
.filter(|(_, _, _, _, fmt)| *fmt == OutputFormat::Markdown)
|
||
|
|
.map(|(k, _, _, s, _)| (k.clone(), *s))
|
||
|
|
.collect();
|
||
|
|
|
||
|
|
let pdf_quality_ranking = build_ranking(&mut pdf_qual_items);
|
||
|
|
let pdf_tf1_ranking_markdown = build_ranking(&mut pdf_tf1_markdown);
|
||
|
|
let pdf_tf1_ranking_plaintext = build_ranking(&mut pdf_tf1_plaintext);
|
||
|
|
let pdf_sf1_ranking_markdown = build_ranking(&mut pdf_sf1_markdown);
|
||
|
|
|
||
|
|
ComparisonData {
|
||
|
|
performance_ranking,
|
||
|
|
throughput_ranking,
|
||
|
|
memory_ranking,
|
||
|
|
cpu_ranking,
|
||
|
|
quality_ranking,
|
||
|
|
pdf_quality_ranking,
|
||
|
|
pdf_tf1_ranking_markdown,
|
||
|
|
pdf_tf1_ranking_plaintext,
|
||
|
|
pdf_sf1_ranking_markdown,
|
||
|
|
deltas_vs_baseline,
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
#[cfg(test)]
|
||
|
|
mod tests {
|
||
|
|
use super::*;
|
||
|
|
use crate::types::{ErrorKind, FrameworkCapabilities, OcrStatus, PerformanceMetrics};
|
||
|
|
use std::path::PathBuf;
|
||
|
|
use std::time::Duration;
|
||
|
|
|
||
|
|
fn create_test_result(
|
||
|
|
framework: &str,
|
||
|
|
file_ext: &str,
|
||
|
|
ocr_status: OcrStatus,
|
||
|
|
duration_ms: u64,
|
||
|
|
throughput_bps: f64,
|
||
|
|
memory_bytes: u64,
|
||
|
|
) -> BenchmarkResult {
|
||
|
|
BenchmarkResult {
|
||
|
|
framework: framework.to_string(),
|
||
|
|
file_path: PathBuf::from(format!("test.{}", file_ext)),
|
||
|
|
file_size: 1024,
|
||
|
|
success: true,
|
||
|
|
error_message: None,
|
||
|
|
error_kind: ErrorKind::None,
|
||
|
|
duration: Duration::from_millis(duration_ms),
|
||
|
|
extraction_duration: None,
|
||
|
|
subprocess_overhead: None,
|
||
|
|
metrics: PerformanceMetrics {
|
||
|
|
peak_memory_bytes: memory_bytes,
|
||
|
|
avg_cpu_percent: 50.0,
|
||
|
|
throughput_bytes_per_sec: throughput_bps,
|
||
|
|
p50_memory_bytes: memory_bytes,
|
||
|
|
p95_memory_bytes: memory_bytes,
|
||
|
|
p99_memory_bytes: memory_bytes,
|
||
|
|
},
|
||
|
|
quality: None,
|
||
|
|
iterations: vec![],
|
||
|
|
statistics: None,
|
||
|
|
cold_start_duration: Some(Duration::from_millis(500)),
|
||
|
|
file_extension: file_ext.to_string(),
|
||
|
|
framework_capabilities: FrameworkCapabilities::default(),
|
||
|
|
pdf_metadata: None,
|
||
|
|
ocr_status,
|
||
|
|
output_format: OutputFormat::Markdown,
|
||
|
|
extracted_text: None,
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
#[test]
|
||
|
|
fn test_extract_framework_and_mode() {
|
||
|
|
// Current kreuzberg pipeline naming (format encoded in name)
|
||
|
|
assert_eq!(
|
||
|
|
extract_framework_and_mode("kreuzberg-markdown-baseline"),
|
||
|
|
("kreuzberg-markdown-baseline", "single")
|
||
|
|
);
|
||
|
|
assert_eq!(
|
||
|
|
extract_framework_and_mode("kreuzberg-plaintext-paddle-ocr"),
|
||
|
|
("kreuzberg-plaintext-paddle-ocr", "single")
|
||
|
|
);
|
||
|
|
assert_eq!(
|
||
|
|
extract_framework_and_mode("kreuzberg-markdown-baseline-batch"),
|
||
|
|
("kreuzberg-markdown-baseline", "batch")
|
||
|
|
);
|
||
|
|
|
||
|
|
// Legacy -sync/-async suffixes are still stripped for backward compatibility
|
||
|
|
assert_eq!(extract_framework_and_mode("kreuzberg-sync"), ("kreuzberg", "single"));
|
||
|
|
assert_eq!(extract_framework_and_mode("kreuzberg-async"), ("kreuzberg", "single"));
|
||
|
|
|
||
|
|
// Batch mode is preserved
|
||
|
|
assert_eq!(extract_framework_and_mode("kreuzberg-batch"), ("kreuzberg", "batch"));
|
||
|
|
assert_eq!(extract_framework_and_mode("python-batch"), ("python", "batch"));
|
||
|
|
|
||
|
|
// No suffix defaults to single mode
|
||
|
|
assert_eq!(extract_framework_and_mode("kreuzberg"), ("kreuzberg", "single"));
|
||
|
|
assert_eq!(extract_framework_and_mode("docling"), ("docling", "single"));
|
||
|
|
}
|
||
|
|
|
||
|
|
#[test]
|
||
|
|
fn test_make_aggregate_key_kreuzberg_family() {
|
||
|
|
// kreuzberg-* frameworks get slim keys (no redundant format component)
|
||
|
|
assert_eq!(
|
||
|
|
make_aggregate_key("kreuzberg-markdown-baseline", OutputFormat::Markdown, "single"),
|
||
|
|
"kreuzberg-markdown-baseline:single"
|
||
|
|
);
|
||
|
|
assert_eq!(
|
||
|
|
make_aggregate_key("kreuzberg-plaintext-layout", OutputFormat::Plaintext, "batch"),
|
||
|
|
"kreuzberg-plaintext-layout:batch"
|
||
|
|
);
|
||
|
|
}
|
||
|
|
|
||
|
|
#[test]
|
||
|
|
fn test_make_aggregate_key_competitors() {
|
||
|
|
// Competitor frameworks include format in key
|
||
|
|
assert_eq!(
|
||
|
|
make_aggregate_key("docling", OutputFormat::Markdown, "single"),
|
||
|
|
"docling:markdown:single"
|
||
|
|
);
|
||
|
|
assert_eq!(
|
||
|
|
make_aggregate_key("pdfplumber", OutputFormat::Plaintext, "batch"),
|
||
|
|
"pdfplumber:plaintext:batch"
|
||
|
|
);
|
||
|
|
}
|
||
|
|
|
||
|
|
#[test]
|
||
|
|
fn test_aggregate_new_format_kreuzberg_key_shape() {
|
||
|
|
// kreuzberg-markdown-baseline results should produce slim keys
|
||
|
|
let results = vec![
|
||
|
|
create_test_result(
|
||
|
|
"kreuzberg-markdown-baseline",
|
||
|
|
"pdf",
|
||
|
|
OcrStatus::NotUsed,
|
||
|
|
100,
|
||
|
|
1_000_000.0,
|
||
|
|
10_000_000,
|
||
|
|
),
|
||
|
|
create_test_result(
|
||
|
|
"kreuzberg-markdown-baseline-batch",
|
||
|
|
"pdf",
|
||
|
|
OcrStatus::NotUsed,
|
||
|
|
80,
|
||
|
|
1_000_000.0,
|
||
|
|
10_000_000,
|
||
|
|
),
|
||
|
|
];
|
||
|
|
|
||
|
|
let aggregated = aggregate_new_format(&results);
|
||
|
|
|
||
|
|
assert_eq!(aggregated.by_framework_mode.len(), 2);
|
||
|
|
assert!(
|
||
|
|
aggregated
|
||
|
|
.by_framework_mode
|
||
|
|
.contains_key("kreuzberg-markdown-baseline:single")
|
||
|
|
);
|
||
|
|
assert!(
|
||
|
|
aggregated
|
||
|
|
.by_framework_mode
|
||
|
|
.contains_key("kreuzberg-markdown-baseline:batch")
|
||
|
|
);
|
||
|
|
|
||
|
|
let single_agg = &aggregated.by_framework_mode["kreuzberg-markdown-baseline:single"];
|
||
|
|
assert_eq!(single_agg.framework, "kreuzberg-markdown-baseline");
|
||
|
|
assert_eq!(single_agg.mode, "single");
|
||
|
|
}
|
||
|
|
|
||
|
|
#[test]
|
||
|
|
fn test_percentile_r7() {
|
||
|
|
let values = vec![1.0, 2.0, 3.0, 4.0, 5.0];
|
||
|
|
assert_eq!(percentile_r7(&values, 0.0), 1.0);
|
||
|
|
assert_eq!(percentile_r7(&values, 0.5), 3.0);
|
||
|
|
assert_eq!(percentile_r7(&values, 1.0), 5.0);
|
||
|
|
assert_eq!(percentile_r7(&[], 0.5), 0.0);
|
||
|
|
}
|
||
|
|
|
||
|
|
#[test]
|
||
|
|
fn test_aggregate_new_format() {
|
||
|
|
let results = vec![
|
||
|
|
create_test_result(
|
||
|
|
"kreuzberg-sync",
|
||
|
|
"pdf",
|
||
|
|
OcrStatus::NotUsed,
|
||
|
|
100,
|
||
|
|
1_000_000.0,
|
||
|
|
10_000_000,
|
||
|
|
),
|
||
|
|
create_test_result("kreuzberg-sync", "pdf", OcrStatus::Used, 200, 500_000.0, 20_000_000),
|
||
|
|
create_test_result(
|
||
|
|
"kreuzberg-batch",
|
||
|
|
"docx",
|
||
|
|
OcrStatus::NotUsed,
|
||
|
|
150,
|
||
|
|
750_000.0,
|
||
|
|
15_000_000,
|
||
|
|
),
|
||
|
|
];
|
||
|
|
|
||
|
|
let aggregated = aggregate_new_format(&results);
|
||
|
|
|
||
|
|
assert_eq!(aggregated.by_framework_mode.len(), 2);
|
||
|
|
// "kreuzberg-sync" is normalized to "kreuzberg:markdown:single"
|
||
|
|
assert!(aggregated.by_framework_mode.contains_key("kreuzberg:markdown:single"));
|
||
|
|
assert!(aggregated.by_framework_mode.contains_key("kreuzberg:markdown:batch"));
|
||
|
|
|
||
|
|
let single_agg = &aggregated.by_framework_mode["kreuzberg:markdown:single"];
|
||
|
|
assert_eq!(single_agg.framework, "kreuzberg");
|
||
|
|
assert_eq!(single_agg.mode, "single");
|
||
|
|
assert!(single_agg.cold_start.is_some());
|
||
|
|
|
||
|
|
let pdf_agg = &single_agg.by_file_type["pdf"];
|
||
|
|
assert!(pdf_agg.no_ocr.is_some());
|
||
|
|
assert!(pdf_agg.with_ocr.is_some());
|
||
|
|
|
||
|
|
assert_eq!(pdf_agg.no_ocr.as_ref().unwrap().successful_sample_count, 1);
|
||
|
|
assert_eq!(pdf_agg.with_ocr.as_ref().unwrap().successful_sample_count, 1);
|
||
|
|
}
|
||
|
|
|
||
|
|
#[test]
|
||
|
|
fn test_calculate_percentiles() {
|
||
|
|
let results = [
|
||
|
|
create_test_result("kreuzberg", "pdf", OcrStatus::NotUsed, 100, 1_000_000.0, 10_000_000),
|
||
|
|
create_test_result("kreuzberg", "pdf", OcrStatus::NotUsed, 200, 2_000_000.0, 20_000_000),
|
||
|
|
create_test_result("kreuzberg", "pdf", OcrStatus::NotUsed, 300, 3_000_000.0, 30_000_000),
|
||
|
|
];
|
||
|
|
|
||
|
|
let refs: Vec<&BenchmarkResult> = results.iter().collect();
|
||
|
|
let percentiles = calculate_percentiles(&refs);
|
||
|
|
|
||
|
|
assert_eq!(percentiles.successful_sample_count, 3);
|
||
|
|
assert_eq!(percentiles.total_sample_count, 3);
|
||
|
|
assert_eq!(percentiles.success_rate_percent, 100.0);
|
||
|
|
assert!(percentiles.duration.p50 > 0.0);
|
||
|
|
assert!(percentiles.throughput.p50 > 0.0);
|
||
|
|
assert!(percentiles.memory.p50 > 0.0);
|
||
|
|
}
|
||
|
|
|
||
|
|
#[test]
|
||
|
|
fn test_aggregate_cold_starts() {
|
||
|
|
let results = [
|
||
|
|
create_test_result("kreuzberg", "pdf", OcrStatus::NotUsed, 100, 1_000_000.0, 10_000_000),
|
||
|
|
create_test_result("kreuzberg", "pdf", OcrStatus::NotUsed, 200, 2_000_000.0, 20_000_000),
|
||
|
|
];
|
||
|
|
|
||
|
|
let refs: Vec<&BenchmarkResult> = results.iter().collect();
|
||
|
|
let cold_starts = aggregate_cold_starts(&refs);
|
||
|
|
|
||
|
|
assert!(cold_starts.is_some());
|
||
|
|
let cold_starts = cold_starts.unwrap();
|
||
|
|
assert_eq!(cold_starts.sample_count, 2);
|
||
|
|
assert!(cold_starts.p50_ms > 0.0);
|
||
|
|
}
|
||
|
|
|
||
|
|
#[test]
|
||
|
|
fn test_ocr_unknown_handling() {
|
||
|
|
// Test that Unknown OCR status is handled correctly
|
||
|
|
let results = vec![BenchmarkResult {
|
||
|
|
framework: "test-framework".to_string(),
|
||
|
|
file_path: PathBuf::from("/tmp/test1.pdf"),
|
||
|
|
file_size: 1024,
|
||
|
|
success: true,
|
||
|
|
error_message: None,
|
||
|
|
error_kind: ErrorKind::None,
|
||
|
|
duration: Duration::from_millis(100),
|
||
|
|
extraction_duration: None,
|
||
|
|
subprocess_overhead: None,
|
||
|
|
metrics: PerformanceMetrics {
|
||
|
|
peak_memory_bytes: 10_000_000,
|
||
|
|
avg_cpu_percent: 50.0,
|
||
|
|
throughput_bytes_per_sec: 10_240.0,
|
||
|
|
p50_memory_bytes: 8_000_000,
|
||
|
|
p95_memory_bytes: 9_500_000,
|
||
|
|
p99_memory_bytes: 9_900_000,
|
||
|
|
},
|
||
|
|
quality: None,
|
||
|
|
iterations: vec![],
|
||
|
|
statistics: None,
|
||
|
|
cold_start_duration: Some(Duration::from_millis(200)),
|
||
|
|
file_extension: "pdf".to_string(),
|
||
|
|
framework_capabilities: Default::default(),
|
||
|
|
pdf_metadata: None,
|
||
|
|
ocr_status: OcrStatus::Unknown, // Unknown status
|
||
|
|
extracted_text: None,
|
||
|
|
output_format: OutputFormat::Markdown,
|
||
|
|
}];
|
||
|
|
|
||
|
|
let aggregated = aggregate_new_format(&results);
|
||
|
|
|
||
|
|
// Unknown should be in no_ocr group
|
||
|
|
let framework_mode = aggregated
|
||
|
|
.by_framework_mode
|
||
|
|
.get("test-framework:markdown:single")
|
||
|
|
.unwrap();
|
||
|
|
let file_type = framework_mode.by_file_type.get("pdf").unwrap();
|
||
|
|
assert!(file_type.no_ocr.is_some());
|
||
|
|
assert_eq!(file_type.no_ocr.as_ref().unwrap().successful_sample_count, 1);
|
||
|
|
}
|
||
|
|
|
||
|
|
#[test]
|
||
|
|
fn test_failed_results_excluded_from_percentiles() {
|
||
|
|
// Test that failed results don't affect percentile calculations
|
||
|
|
let results = vec![
|
||
|
|
BenchmarkResult {
|
||
|
|
framework: "test-framework".to_string(),
|
||
|
|
file_path: PathBuf::from("/tmp/test1.pdf"),
|
||
|
|
file_size: 1024,
|
||
|
|
success: true,
|
||
|
|
error_message: None,
|
||
|
|
error_kind: ErrorKind::None,
|
||
|
|
duration: Duration::from_millis(100),
|
||
|
|
extraction_duration: None,
|
||
|
|
subprocess_overhead: None,
|
||
|
|
metrics: PerformanceMetrics {
|
||
|
|
peak_memory_bytes: 10_000_000,
|
||
|
|
avg_cpu_percent: 50.0,
|
||
|
|
throughput_bytes_per_sec: 10_240.0,
|
||
|
|
p50_memory_bytes: 8_000_000,
|
||
|
|
p95_memory_bytes: 9_500_000,
|
||
|
|
p99_memory_bytes: 9_900_000,
|
||
|
|
},
|
||
|
|
quality: None,
|
||
|
|
iterations: vec![],
|
||
|
|
statistics: None,
|
||
|
|
cold_start_duration: None,
|
||
|
|
file_extension: "pdf".to_string(),
|
||
|
|
framework_capabilities: Default::default(),
|
||
|
|
pdf_metadata: None,
|
||
|
|
ocr_status: OcrStatus::NotUsed,
|
||
|
|
extracted_text: None,
|
||
|
|
output_format: OutputFormat::Markdown,
|
||
|
|
},
|
||
|
|
BenchmarkResult {
|
||
|
|
framework: "test-framework".to_string(),
|
||
|
|
file_path: PathBuf::from("/tmp/test2.pdf"),
|
||
|
|
file_size: 2048,
|
||
|
|
success: false, // Failed result
|
||
|
|
error_message: Some("Test error".to_string()),
|
||
|
|
error_kind: ErrorKind::HarnessError,
|
||
|
|
duration: Duration::from_secs(0),
|
||
|
|
extraction_duration: None,
|
||
|
|
subprocess_overhead: None,
|
||
|
|
metrics: PerformanceMetrics {
|
||
|
|
peak_memory_bytes: 0,
|
||
|
|
avg_cpu_percent: 0.0,
|
||
|
|
throughput_bytes_per_sec: 0.0,
|
||
|
|
p50_memory_bytes: 0,
|
||
|
|
p95_memory_bytes: 0,
|
||
|
|
p99_memory_bytes: 0,
|
||
|
|
},
|
||
|
|
quality: None,
|
||
|
|
iterations: vec![],
|
||
|
|
statistics: None,
|
||
|
|
cold_start_duration: None,
|
||
|
|
file_extension: "pdf".to_string(),
|
||
|
|
framework_capabilities: Default::default(),
|
||
|
|
pdf_metadata: None,
|
||
|
|
ocr_status: OcrStatus::NotUsed,
|
||
|
|
extracted_text: None,
|
||
|
|
output_format: OutputFormat::Markdown,
|
||
|
|
},
|
||
|
|
];
|
||
|
|
|
||
|
|
let aggregated = aggregate_new_format(&results);
|
||
|
|
|
||
|
|
let framework_mode = aggregated
|
||
|
|
.by_framework_mode
|
||
|
|
.get("test-framework:markdown:single")
|
||
|
|
.unwrap();
|
||
|
|
let file_type = framework_mode.by_file_type.get("pdf").unwrap();
|
||
|
|
let no_ocr = file_type.no_ocr.as_ref().unwrap();
|
||
|
|
|
||
|
|
// successful_sample_count should only count successful results
|
||
|
|
assert_eq!(no_ocr.successful_sample_count, 1);
|
||
|
|
assert_eq!(no_ocr.total_sample_count, 2);
|
||
|
|
// success_rate_percent should account for all results
|
||
|
|
assert_eq!(no_ocr.success_rate_percent, 50.0); // 1 success / 2 total = 50%
|
||
|
|
// Percentiles based on 1 successful result
|
||
|
|
assert_eq!(no_ocr.duration.p50, 100.0);
|
||
|
|
}
|
||
|
|
|
||
|
|
#[test]
|
||
|
|
fn test_empty_input() {
|
||
|
|
let results: Vec<BenchmarkResult> = vec![];
|
||
|
|
let aggregated = aggregate_new_format(&results);
|
||
|
|
|
||
|
|
assert_eq!(aggregated.by_framework_mode.len(), 0);
|
||
|
|
assert_eq!(aggregated.metadata.total_results, 0);
|
||
|
|
}
|
||
|
|
|
||
|
|
#[test]
|
||
|
|
fn test_percentile_interpolation() {
|
||
|
|
// Test that p95 with [1,2,3,4,5] uses interpolation
|
||
|
|
let sorted = vec![1.0, 2.0, 3.0, 4.0, 5.0];
|
||
|
|
let p95 = percentile_r7(&sorted, 0.95);
|
||
|
|
|
||
|
|
// With linear interpolation: index = 0.95 * 4 = 3.8
|
||
|
|
// Result = values[3] * 0.2 + values[4] * 0.8 = 4.0 * 0.2 + 5.0 * 0.8 = 4.8
|
||
|
|
assert!((p95 - 4.8).abs() < 0.01);
|
||
|
|
}
|
||
|
|
|
||
|
|
// ============================================================================
|
||
|
|
// Tests for extraction_duration aggregation in new format
|
||
|
|
// ============================================================================
|
||
|
|
|
||
|
|
#[test]
|
||
|
|
fn test_calculate_percentiles_extraction_duration_all_present() {
|
||
|
|
// Test: All results have extraction_duration -> percentiles populated
|
||
|
|
let mut result1 = create_test_result("framework1", "pdf", OcrStatus::NotUsed, 100, 1_000_000.0, 10_000_000);
|
||
|
|
result1.extraction_duration = Some(Duration::from_millis(80));
|
||
|
|
|
||
|
|
let mut result2 = create_test_result("framework1", "pdf", OcrStatus::NotUsed, 150, 1_000_000.0, 10_000_000);
|
||
|
|
result2.extraction_duration = Some(Duration::from_millis(120));
|
||
|
|
|
||
|
|
let mut result3 = create_test_result("framework1", "pdf", OcrStatus::NotUsed, 200, 1_000_000.0, 10_000_000);
|
||
|
|
result3.extraction_duration = Some(Duration::from_millis(160));
|
||
|
|
|
||
|
|
let refs = vec![&result1, &result2, &result3];
|
||
|
|
let percentiles = calculate_percentiles(&refs);
|
||
|
|
|
||
|
|
assert!(percentiles.extraction_duration.is_some());
|
||
|
|
let ext_dur = percentiles.extraction_duration.as_ref().unwrap();
|
||
|
|
assert!((ext_dur.p50 - 120.0).abs() < 0.1); // median: 120
|
||
|
|
assert!(ext_dur.p95 > 120.0); // p95 should be between 120 and 160
|
||
|
|
assert!(ext_dur.p95 <= 160.0);
|
||
|
|
}
|
||
|
|
|
||
|
|
#[test]
|
||
|
|
fn test_calculate_percentiles_extraction_duration_all_none() {
|
||
|
|
// Test: All results have extraction_duration = None -> extraction_duration None
|
||
|
|
let result1 = create_test_result("framework1", "pdf", OcrStatus::NotUsed, 100, 1_000_000.0, 10_000_000);
|
||
|
|
let result2 = create_test_result("framework1", "pdf", OcrStatus::NotUsed, 150, 1_000_000.0, 10_000_000);
|
||
|
|
let result3 = create_test_result("framework1", "pdf", OcrStatus::NotUsed, 200, 1_000_000.0, 10_000_000);
|
||
|
|
|
||
|
|
let refs = vec![&result1, &result2, &result3];
|
||
|
|
let percentiles = calculate_percentiles(&refs);
|
||
|
|
|
||
|
|
assert!(percentiles.extraction_duration.is_none());
|
||
|
|
}
|
||
|
|
|
||
|
|
#[test]
|
||
|
|
fn test_calculate_percentiles_extraction_duration_mixed() {
|
||
|
|
// Test: Mixed Some/None extraction_duration -> only Some values used
|
||
|
|
let mut result1 = create_test_result("framework1", "pdf", OcrStatus::NotUsed, 100, 1_000_000.0, 10_000_000);
|
||
|
|
result1.extraction_duration = Some(Duration::from_millis(80));
|
||
|
|
|
||
|
|
let result2 = create_test_result("framework1", "pdf", OcrStatus::NotUsed, 150, 1_000_000.0, 10_000_000);
|
||
|
|
// result2.extraction_duration = None
|
||
|
|
|
||
|
|
let mut result3 = create_test_result("framework1", "pdf", OcrStatus::NotUsed, 200, 1_000_000.0, 10_000_000);
|
||
|
|
result3.extraction_duration = Some(Duration::from_millis(160));
|
||
|
|
|
||
|
|
let refs = vec![&result1, &result2, &result3];
|
||
|
|
let percentiles = calculate_percentiles(&refs);
|
||
|
|
|
||
|
|
assert!(percentiles.extraction_duration.is_some());
|
||
|
|
let ext_dur = percentiles.extraction_duration.as_ref().unwrap();
|
||
|
|
// Only 80 and 160 used, median should be 120
|
||
|
|
assert!((ext_dur.p50 - 120.0).abs() < 0.1);
|
||
|
|
}
|
||
|
|
|
||
|
|
#[test]
|
||
|
|
fn test_calculate_percentiles_extraction_duration_filters_invalid() {
|
||
|
|
// Test: NaN/infinite extraction durations filtered out
|
||
|
|
// Note: We can't directly create NaN with Duration, so we test the filtering logic
|
||
|
|
// by ensuring valid values are correctly processed
|
||
|
|
let mut result1 = create_test_result("framework1", "pdf", OcrStatus::NotUsed, 100, 1_000_000.0, 10_000_000);
|
||
|
|
result1.extraction_duration = Some(Duration::from_millis(80));
|
||
|
|
|
||
|
|
let mut result2 = create_test_result("framework1", "pdf", OcrStatus::NotUsed, 150, 1_000_000.0, 10_000_000);
|
||
|
|
result2.extraction_duration = Some(Duration::from_millis(120));
|
||
|
|
|
||
|
|
let mut result3 = create_test_result("framework1", "pdf", OcrStatus::NotUsed, 200, 1_000_000.0, 10_000_000);
|
||
|
|
result3.extraction_duration = Some(Duration::from_millis(160));
|
||
|
|
|
||
|
|
let refs = vec![&result1, &result2, &result3];
|
||
|
|
let percentiles = calculate_percentiles(&refs);
|
||
|
|
|
||
|
|
// All values should be present and valid
|
||
|
|
assert!(percentiles.extraction_duration.is_some());
|
||
|
|
let ext_dur = percentiles.extraction_duration.as_ref().unwrap();
|
||
|
|
assert!(ext_dur.p50.is_finite());
|
||
|
|
assert!(!ext_dur.p50.is_nan());
|
||
|
|
}
|
||
|
|
|
||
|
|
#[test]
fn test_calculate_percentiles_extraction_duration_with_failed_results() {
    // A failed result that still carries an extraction duration must not
    // contribute to the extraction_duration percentiles.
    let mut fast = create_test_result("framework1", "pdf", OcrStatus::NotUsed, 100, 1_000_000.0, 10_000_000);
    fast.extraction_duration = Some(Duration::from_millis(80));

    // NOTE(review): the failure is flagged through `success = false` and an error
    // message only; `error_kind` keeps whatever `create_test_result` assigns.
    // Confirm this matches how `calculate_percentiles` detects failures.
    let mut failed = create_test_result("framework1", "pdf", OcrStatus::NotUsed, 0, 0.0, 0);
    failed.success = false;
    failed.error_message = Some("Failed".to_string());
    failed.extraction_duration = Some(Duration::from_millis(50)); // expected to be skipped

    let mut slow = create_test_result("framework1", "pdf", OcrStatus::NotUsed, 200, 1_000_000.0, 10_000_000);
    slow.extraction_duration = Some(Duration::from_millis(160));

    let refs = vec![&fast, &failed, &slow];
    let percentiles = calculate_percentiles(&refs);

    // All three samples are counted, but only two of them succeeded.
    assert_eq!(percentiles.total_sample_count, 3);
    assert_eq!(percentiles.successful_sample_count, 2);

    // Only the 80 ms and 160 ms samples remain, so the median is 120 ms.
    assert!(percentiles.extraction_duration.is_some());
    let ext_dur = percentiles.extraction_duration.as_ref().unwrap();
    assert!((ext_dur.p50 - 120.0).abs() < 0.1);
}
|
||
|
|
|
||
|
|
#[test]
fn test_aggregate_by_ocr_status_extraction_duration() {
    // Extraction-duration percentiles must be computed separately for the
    // no-OCR group and the with-OCR group.
    let mut result_no_ocr_1 =
        create_test_result("framework1", "pdf", OcrStatus::NotUsed, 100, 1_000_000.0, 10_000_000);
    result_no_ocr_1.extraction_duration = Some(Duration::from_millis(80));

    let mut result_no_ocr_2 =
        create_test_result("framework1", "pdf", OcrStatus::NotUsed, 150, 1_000_000.0, 10_000_000);
    result_no_ocr_2.extraction_duration = Some(Duration::from_millis(120));

    let mut result_with_ocr = create_test_result("framework1", "pdf", OcrStatus::Used, 300, 500_000.0, 20_000_000);
    result_with_ocr.extraction_duration = Some(Duration::from_millis(250));

    let refs = vec![&result_no_ocr_1, &result_no_ocr_2, &result_with_ocr];
    let (no_ocr, with_ocr) = aggregate_by_ocr_status(&refs);

    // No-OCR group: median of [80, 120] ms.
    let no_ocr_perf = no_ocr.expect("no-OCR group should be present");
    let no_ocr_dur = no_ocr_perf
        .extraction_duration
        .as_ref()
        .expect("no-OCR group should report extraction_duration");
    // Compare with a tolerance, as the other extraction-duration tests do,
    // rather than relying on exact floating-point equality.
    assert!((no_ocr_dur.p50 - 100.0).abs() < 0.1);

    // With-OCR group: a single 250 ms sample.
    let with_ocr_perf = with_ocr.expect("with-OCR group should be present");
    let with_ocr_dur = with_ocr_perf
        .extraction_duration
        .as_ref()
        .expect("with-OCR group should report extraction_duration");
    assert!((with_ocr_dur.p50 - 250.0).abs() < 0.1);
}
|
||
|
|
|
||
|
|
#[test]
fn test_aggregate_new_format_extraction_duration_preserved() {
    // Extraction-duration statistics must survive the full aggregation pipeline.
    let make_result = |duration_ms, extraction_ms| {
        let mut result = create_test_result(
            "kreuzberg-sync",
            "pdf",
            OcrStatus::NotUsed,
            duration_ms,
            1_000_000.0,
            10_000_000,
        );
        result.extraction_duration = Some(Duration::from_millis(extraction_ms));
        result
    };

    let results = vec![make_result(100, 80), make_result(150, 120)];
    let aggregated = aggregate_new_format(&results);

    // NOTE(review): confirm that "kreuzberg:markdown:single" is the key the
    // aggregator actually emits for the "kreuzberg-sync" framework.
    let no_ocr = aggregated
        .by_framework_mode
        .get("kreuzberg:markdown:single")
        .unwrap()
        .by_file_type
        .get("pdf")
        .unwrap()
        .no_ocr
        .as_ref()
        .unwrap();

    // Median of [80, 120] ms.
    assert!(no_ocr.extraction_duration.is_some());
    let ext_dur = no_ocr.extraction_duration.as_ref().unwrap();
    assert!((ext_dur.p50 - 100.0).abs() < 0.1);
}
|
||
|
|
|
||
|
|
#[test]
fn test_calculate_percentiles_extraction_duration_single_value() {
    // With exactly one sample, every reported percentile equals that sample.
    let mut only_sample = create_test_result("framework1", "pdf", OcrStatus::NotUsed, 100, 1_000_000.0, 10_000_000);
    only_sample.extraction_duration = Some(Duration::from_millis(80));

    let refs = vec![&only_sample];
    let percentiles = calculate_percentiles(&refs);

    assert!(percentiles.extraction_duration.is_some());
    let ext_dur = percentiles.extraction_duration.as_ref().unwrap();

    // Checked in the order p50, p95, p99.
    for reported in vec![ext_dur.p50, ext_dur.p95, ext_dur.p99] {
        assert_eq!(reported, 80.0);
    }
}
|
||
|
|
|
||
|
|
#[test]
fn test_calculate_percentiles_extraction_duration_large_dataset() {
    // 100 samples whose extraction durations are 8, 16, ..., 800 ms.
    let results: Vec<BenchmarkResult> = (1..=100)
        .map(|step| {
            let mut sample =
                create_test_result("framework1", "pdf", OcrStatus::NotUsed, step * 10, 1_000_000.0, 10_000_000);
            sample.extraction_duration = Some(Duration::from_millis(step * 8));
            sample
        })
        .collect();

    let refs: Vec<&BenchmarkResult> = results.iter().collect();
    let percentiles = calculate_percentiles(&refs);

    assert!(percentiles.extraction_duration.is_some());
    let ext_dur = percentiles.extraction_duration.as_ref().unwrap();

    // The median falls between the 50th and 51st samples (400 ms and 408 ms).
    assert!((400.0..=410.0).contains(&ext_dur.p50));

    // Percentiles must be strictly increasing on this strictly increasing data set.
    assert!(ext_dur.p95 > ext_dur.p50);
    assert!(ext_dur.p99 > ext_dur.p95);
}
|
||
|
|
|
||
|
|
#[test]
fn test_calculate_percentiles_extraction_duration_no_extraction_some_failed() {
    // Neither result carries an extraction duration and one of them failed:
    // extraction_duration must be absent while the success rate is still reported.
    let failed = BenchmarkResult {
        framework: String::from("test"),
        file_path: PathBuf::from("test1.pdf"),
        file_size: 1024,
        success: false,
        error_message: Some(String::from("Error")),
        error_kind: ErrorKind::HarnessError,
        duration: Duration::ZERO,
        extraction_duration: None,
        subprocess_overhead: None,
        metrics: PerformanceMetrics {
            peak_memory_bytes: 0,
            avg_cpu_percent: 0.0,
            throughput_bytes_per_sec: 0.0,
            p50_memory_bytes: 0,
            p95_memory_bytes: 0,
            p99_memory_bytes: 0,
        },
        quality: None,
        iterations: Vec::new(),
        statistics: None,
        cold_start_duration: None,
        file_extension: String::from("pdf"),
        framework_capabilities: FrameworkCapabilities::default(),
        pdf_metadata: None,
        ocr_status: OcrStatus::NotUsed,
        extracted_text: None,
        output_format: OutputFormat::Markdown,
    };

    // Built by the shared helper; its extraction_duration is left untouched here.
    let succeeded = create_test_result("framework1", "pdf", OcrStatus::NotUsed, 100, 1_000_000.0, 10_000_000);

    let refs = vec![&failed, &succeeded];
    let percentiles = calculate_percentiles(&refs);

    assert!(percentiles.extraction_duration.is_none());
    // One success out of two samples.
    assert_eq!(percentiles.success_rate_percent, 50.0);
}
|
||
|
|
|
||
|
|
// ============================================================================
|
||
|
|
// Tests for CPU aggregation
|
||
|
|
// ============================================================================
|
||
|
|
|
||
|
|
#[test]
fn test_calculate_percentiles_cpu_populated() {
    // Results with a positive avg_cpu_percent must yield CPU percentiles.
    let with_cpu = |duration_ms, cpu_percent| {
        let mut result =
            create_test_result("framework1", "pdf", OcrStatus::NotUsed, duration_ms, 1_000_000.0, 10_000_000);
        result.metrics.avg_cpu_percent = cpu_percent;
        result
    };

    let low = with_cpu(100, 25.0);
    let high = with_cpu(150, 75.0);
    let middle = with_cpu(200, 50.0);

    let refs = vec![&low, &high, &middle];
    let percentiles = calculate_percentiles(&refs);

    assert!(percentiles.cpu.is_some());
    let cpu = percentiles.cpu.as_ref().unwrap();

    // Median of [25, 50, 75].
    assert_eq!(cpu.p50, 50.0);
    // Upper percentiles must not fall below the lower ones.
    assert!(cpu.p95 > cpu.p50);
    assert!(cpu.p99 >= cpu.p95);
}
|
||
|
|
|
||
|
|
#[test]
fn test_calculate_percentiles_cpu_zero_excluded() {
    // A lone sample with avg_cpu_percent == 0.0 is expected to be filtered out,
    // leaving no CPU statistics at all.
    let mut zero_cpu = create_test_result("framework1", "pdf", OcrStatus::NotUsed, 100, 1_000_000.0, 10_000_000);
    zero_cpu.metrics.avg_cpu_percent = 0.0;

    let refs = vec![&zero_cpu];
    let percentiles = calculate_percentiles(&refs);

    assert!(percentiles.cpu.is_none());
}
|
||
|
|
|
||
|
|
#[test]
fn test_calculate_percentiles_cpu_mixed_zero_and_nonzero() {
    // When zero and non-zero CPU readings are mixed, only the non-zero
    // readings are expected to feed the percentiles.
    let with_cpu = |duration_ms, cpu_percent| {
        let mut result =
            create_test_result("framework1", "pdf", OcrStatus::NotUsed, duration_ms, 1_000_000.0, 10_000_000);
        result.metrics.avg_cpu_percent = cpu_percent;
        result
    };

    let zero = with_cpu(100, 0.0); // expected to be dropped
    let forty = with_cpu(150, 40.0);
    let sixty = with_cpu(200, 60.0);

    let refs = vec![&zero, &forty, &sixty];
    let percentiles = calculate_percentiles(&refs);

    assert!(percentiles.cpu.is_some());
    let cpu = percentiles.cpu.as_ref().unwrap();

    // Median of the remaining [40, 60].
    assert_eq!(cpu.p50, 50.0);
}
|
||
|
|
|
||
|
|
#[test]
fn test_calculate_percentiles_cpu_failed_results_excluded() {
    // CPU readings attached to failed results must not influence the CPU percentiles.
    let mut succeeded = create_test_result("framework1", "pdf", OcrStatus::NotUsed, 100, 1_000_000.0, 10_000_000);
    succeeded.metrics.avg_cpu_percent = 30.0;

    let mut failed = create_test_result("framework1", "pdf", OcrStatus::NotUsed, 0, 0.0, 0);
    failed.success = false;
    failed.error_message = Some("Failed".to_string());
    failed.metrics.avg_cpu_percent = 90.0; // expected to be skipped

    let refs = vec![&succeeded, &failed];
    let percentiles = calculate_percentiles(&refs);

    assert!(percentiles.cpu.is_some());
    let cpu = percentiles.cpu.as_ref().unwrap();

    // Only the successful 30% reading is left.
    assert_eq!(cpu.p50, 30.0);
}
|
||
|
|
|
||
|
|
#[test]
fn test_comparison_cpu_ranking() {
    // In the CPU ranking the framework with the lower CPU usage takes rank 1,
    // regardless of which framework is faster.
    let mut cpu_hungry = create_test_result("fast-framework", "pdf", OcrStatus::NotUsed, 50, 2_000_000.0, 5_000_000);
    cpu_hungry.metrics.avg_cpu_percent = 80.0;

    let mut cpu_light = create_test_result("slow-framework", "pdf", OcrStatus::NotUsed, 200, 500_000.0, 20_000_000);
    cpu_light.metrics.avg_cpu_percent = 20.0;

    let results = vec![cpu_hungry, cpu_light];
    let aggregated = aggregate_new_format(&results);

    let ranking = &aggregated.comparison.cpu_ranking;
    assert!(!ranking.is_empty());

    // slow-framework uses less CPU, so it leads the ranking.
    let first = &ranking[0];
    assert_eq!(first.framework_mode, "slow-framework:markdown:single");
    assert_eq!(first.rank, 1);

    let second = &ranking[1];
    assert_eq!(second.framework_mode, "fast-framework:markdown:single");
    assert_eq!(second.rank, 2);
}
|
||
|
|
|
||
|
|
#[test]
fn test_deltas_include_cpu() {
    // The deltas computed against the baseline must carry the CPU delta fields.
    let mut baseline = create_test_result("baseline-fw", "pdf", OcrStatus::NotUsed, 50, 2_000_000.0, 5_000_000);
    baseline.metrics.avg_cpu_percent = 30.0;

    let mut challenger = create_test_result("other-fw", "pdf", OcrStatus::NotUsed, 200, 500_000.0, 20_000_000);
    challenger.metrics.avg_cpu_percent = 60.0;

    let results = vec![baseline, challenger];
    let aggregated = aggregate_new_format(&results);

    // baseline-fw is the faster framework (50 ms), so other-fw is the one
    // expected to have deltas recorded against it.
    let deltas = &aggregated.comparison.deltas_vs_baseline;
    let delta = deltas.get("other-fw:markdown:single").unwrap();

    // Absolute difference: 60 - 30 = 30 percentage points.
    assert_eq!(delta.cpu_delta_pp, 30.0);
    // Relative difference: (60 - 30) / 30 * 100 = 100%.
    assert!((delta.cpu_delta_percent - 100.0).abs() < 0.1);
}
|
||
|
|
}
|