Nomad changes
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s

This commit is contained in:
Henrik Jess Nielsen
2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions

View File

@@ -0,0 +1,142 @@
//! Framework adapter system
//!
//! Adapters provide a unified interface for extracting content across different
//! extraction frameworks (both Kreuzberg language bindings and open source alternatives).
//! This allows benchmarking any extraction framework against the same test fixtures.
use crate::{
Result,
types::{BenchmarkResult, OutputFormat},
};
use async_trait::async_trait;
use std::path::Path;
use std::time::Duration;
/// Common contract implemented by every extraction framework under benchmark.
///
/// An adapter wraps one extraction framework (a Kreuzberg language binding or an
/// open source alternative) so the harness can drive all of them the same way.
#[async_trait]
pub trait FrameworkAdapter: Send + Sync {
    /// Name identifying the framework (e.g., "kreuzberg-rust", "kreuzberg-python").
    fn name(&self) -> &str;

    /// Reports whether the adapter can process the given file type.
    ///
    /// # Arguments
    /// * `file_type` - File extension without the dot (e.g., "pdf", "docx")
    fn supports_format(&self, file_type: &str) -> bool;

    /// Reports whether one particular file must be excluded for this adapter.
    ///
    /// Override this for files known to cause trouble (e.g., timeouts in WASM
    /// for very large OCR-heavy documents). The default never skips anything.
    ///
    /// # Arguments
    /// * `file_name` - The file name (not the full path) to check
    fn should_skip_file(&self, _file_name: &str) -> bool {
        false
    }

    /// Lists the output formats this adapter can produce.
    ///
    /// # Returns
    /// * `Vec<OutputFormat>` - Supported formats; the default is plaintext only
    fn supported_output_formats(&self) -> Vec<OutputFormat> {
        vec![OutputFormat::Plaintext]
    }

    /// Extracts the content of a single document.
    ///
    /// # Arguments
    /// * `file_path` - Path to the document to extract
    /// * `timeout` - Maximum time to wait for the extraction
    /// * `force_ocr` - When true, force OCR even if the document has a text layer
    /// * `output_format` - Output format for extraction (markdown or plaintext)
    ///
    /// # Returns
    /// * `Ok(BenchmarkResult)` - Successful extraction with metrics
    /// * `Err(Error)` - Extraction failed
    async fn extract(
        &self,
        file_path: &Path,
        timeout: Duration,
        force_ocr: bool,
        output_format: OutputFormat,
    ) -> Result<BenchmarkResult>;

    /// Extracts several documents, ideally through the framework's batch API.
    ///
    /// Frameworks with native batch support should override this method and
    /// call their optimized batch entry point (e.g., Kreuzberg's
    /// `batch_extract_files()`). The default implementation simply awaits
    /// `extract()` for each file in order and stops at the first error.
    ///
    /// # Arguments
    /// * `file_paths` - Paths to documents to extract
    /// * `timeout` - Maximum time to wait for each extraction
    /// * `force_ocr` - Per-file force_ocr flags (must be same length as file_paths)
    /// * `output_format` - Output format for extraction
    ///
    /// # Returns
    /// * `Ok(Vec<BenchmarkResult>)` - Results for all files, in input order
    /// * `Err(Error)` - Batch extraction failed
    async fn extract_batch(
        &self,
        file_paths: &[&Path],
        timeout: Duration,
        force_ocr: &[bool],
        output_format: OutputFormat,
    ) -> Result<Vec<BenchmarkResult>> {
        // A path without a matching entry in `force_ocr` is extracted with
        // `force_ocr = false`, so the flag stream is padded with `false`.
        let padded_flags = force_ocr.iter().copied().chain(std::iter::repeat(false));
        let mut collected = Vec::with_capacity(file_paths.len());
        for (path, flag) in file_paths.iter().zip(padded_flags) {
            collected.push(self.extract(path, timeout, flag, output_format).await?);
        }
        Ok(collected)
    }

    /// Reports whether `extract_batch()` is backed by an optimized implementation.
    ///
    /// Adapters that override `extract_batch()` should return true here.
    /// The default is false (sequential extraction).
    fn supports_batch(&self) -> bool {
        false
    }

    /// Version string of the wrapped framework; "unknown" unless overridden.
    fn version(&self) -> String {
        String::from("unknown")
    }

    /// Hook run before benchmarking starts; the default does nothing.
    async fn setup(&self) -> Result<()> {
        Ok(())
    }

    /// Hook run after benchmarking finishes; the default does nothing.
    async fn teardown(&self) -> Result<()> {
        Ok(())
    }

    /// Warms the framework up with one test extraction and times it.
    ///
    /// Called once before benchmarking to get the framework into a warm state;
    /// the measured time is the cold start (framework load + first extraction).
    /// The default implementation runs a single `extract()` on the warmup file
    /// with `force_ocr` disabled.
    ///
    /// # Arguments
    /// * `warmup_file` - Path to a small test file for warmup
    /// * `timeout` - Maximum time to wait for warmup
    /// * `output_format` - Output format for warmup extraction
    ///
    /// # Returns
    /// * `Ok(Duration)` - Cold start duration (framework load + first extraction)
    /// * `Err(Error)` - Warmup failed
    async fn warmup(&self, warmup_file: &Path, timeout: Duration, output_format: OutputFormat) -> Result<Duration> {
        let started = std::time::Instant::now();
        // The extraction result is discarded right away so that dropping it is
        // part of the measured interval.
        drop(self.extract(warmup_file, timeout, false, output_format).await?);
        Ok(started.elapsed())
    }
}

View File

@@ -0,0 +1,506 @@
use crate::{adapters::subprocess::SubprocessAdapter, error::Result};
use std::time::Duration;
use std::{env, path::PathBuf};
use super::ocr_flag;
/// Maximum per-extraction timeout for persistent adapters (seconds).
///
/// Passed to `with_max_timeout` by most adapter constructors in this file.
const PERSISTENT_MAX_TIMEOUT_SECS: u64 = 180;
/// Higher timeout for slow ML frameworks (mineru, pymupdf4llm) that load
/// large models and can take significantly longer on first extractions.
const SLOW_ML_TIMEOUT_SECS: u64 = 300;
/// Margin between the Python-side and Rust-side timeouts.
/// The Python script handles timeouts internally (via multiprocessing fork),
/// reporting the result as a JSON error. The Rust-side timeout is a safety net
/// that only fires if the Python side fails to respond.
const PYTHON_TIMEOUT_MARGIN_SECS: u64 = 30;
/// Python-side extraction timeout passed via `--timeout=N` CLI arg.
///
/// Evaluates to 150 (180 - 30).
///
/// NOTE(review): this value is derived from `PERSISTENT_MAX_TIMEOUT_SECS` only, yet it
/// is also passed to the adapters capped at `SLOW_ML_TIMEOUT_SECS`, where the effective
/// margin is therefore 150 s rather than 30 s — confirm this is intended.
const PYTHON_EXTRACTION_TIMEOUT_SECS: u64 = PERSISTENT_MAX_TIMEOUT_SECS - PYTHON_TIMEOUT_MARGIN_SECS;
/// Returns the file extensions (without dot) a framework is benchmarked on.
///
/// Each known framework name maps to the extensions it can actually process,
/// which keeps invalid benchmark combinations out of a run (e.g., Pandoc cannot
/// read PDFs). Any name not listed below receives a default set of common
/// document formats.
///
/// The extension tables live in one `match`; the conversion to owned strings
/// happens once at the end.
fn get_supported_formats(framework_name: &str) -> Vec<String> {
    let extensions: &[&str] = match framework_name {
        // Pandoc: 45+ input formats, but CANNOT read PDF (output only)
        // See: pandoc --list-input-formats
        // Only list formats that pandoc can auto-detect from file extension
        // and reliably convert to plain text via --to=plain.
        // Excluded: pptx, xlsx (return empty text), bib (needs explicit --from=biblatex),
        // ris (returns empty text), dbk (unreliable auto-detection)
        "pandoc" => &[
            // Office documents
            "docx", "odt",
            // Markup languages
            "md", "markdown", "rst", "org", "typst",
            // Web formats
            "html", "htm",
            // Data formats
            "csv", "tsv",
            // Scientific/technical
            "tex", "latex", "ipynb",
            // E-books
            "epub",
            // Other documents
            "rtf", "txt",
        ],
        // PDF-only frameworks:
        // - pdfplumber (built on pdfminer.six)
        // - pypdf (pure Python PDF library)
        // - playa-pdf (pure Python PDF library)
        // - pdfminer.six (Python PDF text extraction)
        // - pdftotext (Python binding for poppler's pdftotext)
        "pdfplumber" | "pypdf" | "playa-pdf" | "pdfminer" | "pdftotext" => &["pdf"],
        // PyMuPDF4LLM: PDF + formats via PyMuPDF/fitz
        // See: https://pymupdf.readthedocs.io/en/latest/how-to-open-a-file.html
        // Note: many non-PDF formats return empty content — tracked as EmptyContent errors
        "pymupdf4llm" => &[
            // Documents
            "pdf",
            // E-books
            "epub",
            // Vector/text
            "svg", "txt",
            // Images (for OCR) - gif and webp NOT supported by PyMuPDF
            "png", "jpg", "jpeg", "bmp", "tiff", "tif",
        ],
        // Docling: 15+ format types, 38+ extensions
        // See: https://docling-project.github.io/docling/usage/supported_formats/
        "docling" => &[
            // Office documents
            "pdf", "docx", "pptx", "xlsx",
            // Web/markup
            "html", "htm", "md", "markdown", "asciidoc",
            // Data formats
            "csv",
            // Scientific/publishing
            "jats",
            // Subtitles
            "vtt",
            // Images (converted to PDF internally for layout analysis)
            "png", "jpg", "jpeg", "tiff", "tif", "bmp", "webp",
        ],
        // Tika: 1500+ formats for detection, extensive text extraction
        // See: https://tika.apache.org/ and tika-mimetypes.xml
        "tika" => &[
            // Office documents (Microsoft)
            "pdf", "docx", "doc", "pptx", "ppt", "ppsx", "pptm", "xlsx", "xls", "xlsm", "xlsb",
            // Office documents (OpenDocument)
            "odt", "ods",
            // Other documents
            "rtf", "epub",
            // Web/markup
            "html", "htm", "xml", "svg", "md", "txt",
            // Data formats
            "csv", "tsv", "json", "yaml", "yml", "toml",
            // Email
            "eml", "msg",
            // Scientific/technical (typst not supported - too new)
            "tex", "latex", "bib", "rst", "org", "ipynb",
            // Images (metadata + OCR)
            "png", "jpg", "jpeg", "gif", "bmp", "tiff", "tif", "webp", "jp2",
            // Archives
            "zip", "tar", "gz", "7z",
        ],
        // MarkItDown: 25+ formats with optional dependencies
        // See: https://github.com/microsoft/markitdown
        // Note: MarkItDown OUTPUTS markdown, so md/txt are not conversion inputs
        "markitdown" => &[
            // Office documents
            "pdf", "docx", "pptx", "xlsx", "xls",
            // Web/markup (md, txt not valid - outputs markdown)
            "html", "htm", "xml",
            // Data formats
            "csv", "json",
            // E-books & notebooks
            "epub", "ipynb",
            // Email
            "msg",
            // Images (with Azure Document Intelligence)
            "png", "jpg", "jpeg", "bmp", "tiff", "tif",
            // Archives
            "zip",
        ],
        // Unstructured: 31+ partitionable formats
        // See: https://docs.unstructured.io/ui/supported-file-types
        "unstructured" => &[
            // Office documents (Microsoft)
            "pdf", "docx", "doc", "pptx", "ppt", "xlsx", "xls",
            // Office documents (OpenDocument)
            "odt",
            // Other documents
            "rtf", "epub",
            // Web/markup
            "html", "htm", "xml", "md", "rst", "org", "txt",
            // Data formats (json NOT supported for partitioning)
            "csv", "tsv",
            // Email
            "eml", "msg",
            // Images (requires hi_res strategy)
            "png", "jpg", "jpeg", "tiff", "tif", "bmp",
        ],
        // MinerU: PDF and PNG/JPG images ONLY
        // See: https://github.com/opendatalab/MinerU - cli/common.py defines actual formats
        "mineru" => &[
            // Documents
            "pdf",
            // Images (only png, jpg confirmed in source)
            "png", "jpg",
        ],
        // Default: common document formats for unknown frameworks
        _ => &[
            "pdf", "docx", "doc", "xlsx", "xls", "pptx", "ppt", "txt", "md", "html", "xml", "json",
        ],
    };

    extensions.iter().map(|ext| ext.to_string()).collect()
}
/// Builds the subprocess adapter that runs Docling through `docling_extract.py`.
///
/// The child command is whatever `find_python_with_framework("docling")`
/// resolves, followed by the wrapper script path, `--timeout=N` (the
/// Python-side extraction timeout), the OCR flag for `ocr_enabled`, and the
/// literal `sync` argument. The adapter's timeout is capped at
/// `PERSISTENT_MAX_TIMEOUT_SECS`.
///
/// # Errors
/// Propagates the error when the wrapper script or a suitable Python
/// launcher cannot be located.
pub fn create_docling_adapter(ocr_enabled: bool) -> Result<SubprocessAdapter> {
    const FRAMEWORK: &str = "docling";

    let wrapper_script = get_script_path("docling_extract.py")?;
    let (program, mut argv) = find_python_with_framework(FRAMEWORK)?;
    argv.extend([
        wrapper_script.to_string_lossy().to_string(),
        format!("--timeout={}", PYTHON_EXTRACTION_TIMEOUT_SECS),
        ocr_flag(ocr_enabled),
        "sync".to_string(),
    ]);

    let adapter = SubprocessAdapter::new(FRAMEWORK, program, argv, vec![], get_supported_formats(FRAMEWORK))
        .with_max_timeout(Duration::from_secs(PERSISTENT_MAX_TIMEOUT_SECS));
    Ok(adapter)
}
/// Builds the subprocess adapter that runs Unstructured through
/// `unstructured_extract.py`.
///
/// Arguments appended after the resolved Python launcher, in order: the
/// wrapper script path, `--timeout=N` (the Python-side extraction timeout),
/// the OCR flag for `ocr_enabled`, and the literal `sync` argument. The
/// adapter's timeout is capped at `PERSISTENT_MAX_TIMEOUT_SECS`.
///
/// # Errors
/// Propagates the error when the wrapper script or a suitable Python
/// launcher cannot be located.
pub fn create_unstructured_adapter(ocr_enabled: bool) -> Result<SubprocessAdapter> {
    const FRAMEWORK: &str = "unstructured";

    let wrapper_script = get_script_path("unstructured_extract.py")?;
    let (program, mut argv) = find_python_with_framework(FRAMEWORK)?;
    argv.extend([
        wrapper_script.to_string_lossy().to_string(),
        format!("--timeout={}", PYTHON_EXTRACTION_TIMEOUT_SECS),
        ocr_flag(ocr_enabled),
        "sync".to_string(),
    ]);

    let adapter = SubprocessAdapter::new(FRAMEWORK, program, argv, vec![], get_supported_formats(FRAMEWORK))
        .with_max_timeout(Duration::from_secs(PERSISTENT_MAX_TIMEOUT_SECS));
    Ok(adapter)
}
/// Builds the subprocess adapter that runs MarkItDown through
/// `markitdown_extract.py`.
///
/// Arguments appended after the resolved Python launcher, in order: the
/// wrapper script path, `--timeout=N` (the Python-side extraction timeout),
/// the OCR flag for `ocr_enabled`, and the literal `sync` argument. The
/// adapter's timeout is capped at `PERSISTENT_MAX_TIMEOUT_SECS`.
///
/// # Errors
/// Propagates the error when the wrapper script or a suitable Python
/// launcher cannot be located.
pub fn create_markitdown_adapter(ocr_enabled: bool) -> Result<SubprocessAdapter> {
    const FRAMEWORK: &str = "markitdown";

    let wrapper_script = get_script_path("markitdown_extract.py")?;
    let (program, mut argv) = find_python_with_framework(FRAMEWORK)?;
    argv.extend([
        wrapper_script.to_string_lossy().to_string(),
        format!("--timeout={}", PYTHON_EXTRACTION_TIMEOUT_SECS),
        ocr_flag(ocr_enabled),
        "sync".to_string(),
    ]);

    let adapter = SubprocessAdapter::new(FRAMEWORK, program, argv, vec![], get_supported_formats(FRAMEWORK))
        .with_max_timeout(Duration::from_secs(PERSISTENT_MAX_TIMEOUT_SECS));
    Ok(adapter)
}
/// Creates a subprocess adapter for Pandoc (universal document converter)
pub fn create_pandoc_adapter() -> Result<SubprocessAdapter> {
which::which("pandoc").map_err(|_| {
crate::Error::Config(
"pandoc not found. Install with: brew install pandoc (macOS) or apt install pandoc (Linux)".to_string(),
)
})?;
let script_path = get_script_path("pandoc_extract.sh")?;
let command = PathBuf::from("bash");
let args = vec![script_path.to_string_lossy().to_string()];
let supported_formats = get_supported_formats("pandoc");
Ok(
SubprocessAdapter::new("pandoc", command, args, vec![], supported_formats)
.with_max_timeout(Duration::from_secs(180)),
)
}
/// Helper function to get the path to a wrapper script
fn get_script_path(script_name: &str) -> Result<PathBuf> {
if let Ok(manifest_dir) = env::var("CARGO_MANIFEST_DIR") {
let script_path = PathBuf::from(manifest_dir).join("scripts").join(script_name);
if script_path.exists() {
return Ok(script_path);
}
}
let script_path = PathBuf::from("tools/benchmark-harness/scripts").join(script_name);
if script_path.exists() {
return Ok(script_path);
}
Err(crate::error::Error::Config(format!(
"Script not found: {}",
script_name
)))
}
/// Helper function to find Python interpreter with a specific open source extraction framework installed
///
/// Returns (command, args) where command is the executable and args are the base arguments
fn find_python_with_framework(framework: &str) -> Result<(PathBuf, Vec<String>)> {
if which::which("uv").is_ok() {
// Use `uv run <script>` which runs the script with the project's
// Python environment (.venv). Framework dependencies are installed
// via pyproject.toml dependency groups (bench-*).
return Ok((PathBuf::from("uv"), vec!["run".to_string()]));
}
let python_candidates = vec!["python3", "python"];
for candidate in python_candidates {
if let Ok(python_path) = which::which(candidate) {
let check = std::process::Command::new(&python_path)
.arg("-c")
.arg(format!("import {}", framework))
.output();
if let Ok(output) = check
&& output.status.success()
{
return Ok((python_path, vec![]));
}
}
}
Err(crate::error::Error::Config(format!(
"No Python interpreter found with {} installed. Install with: pip install {}",
framework, framework
)))
}
/// Helper to find Java runtime
fn find_java() -> Result<PathBuf> {
which::which("java").map_err(|_| crate::Error::Config("Java runtime not found".to_string()))
}
/// Returns the first `tika-app-*.jar` found directly inside `lib_dir`.
///
/// Yields `None` when the directory cannot be read or holds no matching file.
/// Entries are visited in the order `read_dir` produces them, so with several
/// matching JARs the choice is whatever the platform lists first.
fn find_tika_jar_in(lib_dir: &std::path::Path) -> Option<PathBuf> {
    std::fs::read_dir(lib_dir)
        .ok()?
        .flatten()
        .map(|entry| entry.path())
        .find(|path| {
            path.file_name()
                .and_then(|name| name.to_str())
                .is_some_and(|name| name.starts_with("tika-app-") && name.ends_with(".jar"))
        })
}

/// Helper to locate the Tika JAR (auto-detect from libs/ or env var).
///
/// Lookup order:
/// 1. `$CARGO_MANIFEST_DIR/libs/tika-app-*.jar` (only when the variable is set)
/// 2. `tools/benchmark-harness/libs/tika-app-*.jar`, relative to the current
///    working directory
/// 3. The file named by the `TIKA_JAR` environment variable, if it exists
///
/// # Errors
/// Returns a configuration error containing a download command when no JAR is
/// found. The version in that command comes from `TIKA_VERSION`, defaulting
/// to `3.2.3`.
fn get_tika_jar_path() -> Result<PathBuf> {
    let manifest_libs = env::var("CARGO_MANIFEST_DIR")
        .ok()
        .map(|manifest_dir| PathBuf::from(manifest_dir).join("libs"));
    let fallback_libs = PathBuf::from("tools/benchmark-harness/libs");

    let detected = manifest_libs
        .into_iter()
        .chain(std::iter::once(fallback_libs))
        .find_map(|lib_dir| find_tika_jar_in(&lib_dir));
    if let Some(jar) = detected {
        return Ok(jar);
    }

    // An explicit TIKA_JAR is only honoured when the file really exists.
    if let Ok(jar_path) = env::var("TIKA_JAR") {
        let path = PathBuf::from(jar_path);
        if path.exists() {
            return Ok(path);
        }
    }

    let version = env::var("TIKA_VERSION").unwrap_or_else(|_| "3.2.3".to_string());
    Err(crate::Error::Config(format!(
        "Tika JAR not found. Download: curl -fsSL -o tools/benchmark-harness/libs/tika-app-{version}.jar https://repo1.maven.org/maven2/org/apache/tika/tika-app/{version}/tika-app-{version}.jar"
    )))
}
/// Builds the subprocess adapter for Apache Tika.
///
/// The child process is `java` with fixed JVM flags (`-server`, 512 MB initial
/// and 2 GB maximum heap, G1 GC), the Tika JAR on the classpath, the
/// `TikaExtract.java` wrapper, the OCR flag for `ocr_enabled`, and the literal
/// `sync` argument. The adapter's timeout is capped at 180 seconds (the same
/// value as `PERSISTENT_MAX_TIMEOUT_SECS`).
///
/// # Errors
/// Propagates the error when the Tika JAR, the wrapper source, or a Java
/// runtime cannot be located.
pub fn create_tika_adapter(ocr_enabled: bool) -> Result<SubprocessAdapter> {
    let tika_jar = get_tika_jar_path()?;
    let wrapper_source = get_script_path("TikaExtract.java")?;
    let java = find_java()?;

    // Fixed JVM flags; `-cp` must be directly followed by the JAR path.
    let mut jvm_args: Vec<String> = ["-server", "-Xms512m", "-Xmx2g", "-XX:+UseG1GC", "-cp"]
        .iter()
        .map(|flag| flag.to_string())
        .collect();
    jvm_args.extend([
        tika_jar.to_string_lossy().to_string(),
        wrapper_source.to_string_lossy().to_string(),
        ocr_flag(ocr_enabled),
        "sync".to_string(),
    ]);

    let adapter = SubprocessAdapter::new("tika", java, jvm_args, vec![], get_supported_formats("tika"))
        .with_max_timeout(Duration::from_secs(180));
    Ok(adapter)
}
/// Builds the subprocess adapter that runs PyMuPDF4LLM through
/// `pymupdf4llm_extract.py`.
///
/// Arguments appended after the resolved Python launcher, in order: the
/// wrapper script path, `--timeout=N` (the Python-side extraction timeout),
/// the OCR flag for `ocr_enabled`, and the literal `sync` argument. The
/// adapter's timeout is capped at the larger `SLOW_ML_TIMEOUT_SECS`.
///
/// NOTE(review): the Python-side `--timeout` still uses
/// `PYTHON_EXTRACTION_TIMEOUT_SECS`, which is derived from the regular
/// persistent timeout rather than from `SLOW_ML_TIMEOUT_SECS` — confirm that
/// is intended.
///
/// # Errors
/// Propagates the error when the wrapper script or a suitable Python
/// launcher cannot be located.
pub fn create_pymupdf4llm_adapter(ocr_enabled: bool) -> Result<SubprocessAdapter> {
    const FRAMEWORK: &str = "pymupdf4llm";

    let wrapper_script = get_script_path("pymupdf4llm_extract.py")?;
    let (program, mut argv) = find_python_with_framework(FRAMEWORK)?;
    argv.extend([
        wrapper_script.to_string_lossy().to_string(),
        format!("--timeout={}", PYTHON_EXTRACTION_TIMEOUT_SECS),
        ocr_flag(ocr_enabled),
        "sync".to_string(),
    ]);

    let adapter = SubprocessAdapter::new(FRAMEWORK, program, argv, vec![], get_supported_formats(FRAMEWORK))
        .with_max_timeout(Duration::from_secs(SLOW_ML_TIMEOUT_SECS));
    Ok(adapter)
}
/// Builds the subprocess adapter that runs pdfplumber through
/// `pdfplumber_extract.py`.
///
/// Arguments appended after the resolved Python launcher, in order: the
/// wrapper script path, `--timeout=N` (the Python-side extraction timeout),
/// the OCR flag for `ocr_enabled`, and the literal `sync` argument. The
/// adapter's timeout is capped at `PERSISTENT_MAX_TIMEOUT_SECS`.
///
/// # Errors
/// Propagates the error when the wrapper script or a suitable Python
/// launcher cannot be located.
pub fn create_pdfplumber_adapter(ocr_enabled: bool) -> Result<SubprocessAdapter> {
    const FRAMEWORK: &str = "pdfplumber";

    let wrapper_script = get_script_path("pdfplumber_extract.py")?;
    let (program, mut argv) = find_python_with_framework(FRAMEWORK)?;
    argv.extend([
        wrapper_script.to_string_lossy().to_string(),
        format!("--timeout={}", PYTHON_EXTRACTION_TIMEOUT_SECS),
        ocr_flag(ocr_enabled),
        "sync".to_string(),
    ]);

    let adapter = SubprocessAdapter::new(FRAMEWORK, program, argv, vec![], get_supported_formats(FRAMEWORK))
        .with_max_timeout(Duration::from_secs(PERSISTENT_MAX_TIMEOUT_SECS));
    Ok(adapter)
}
/// Builds the subprocess adapter that runs pypdf through `pypdf_extract.py`.
///
/// Arguments appended after the resolved Python launcher, in order: the
/// wrapper script path, `--timeout=N` (the Python-side extraction timeout),
/// the OCR flag for `ocr_enabled`, and the literal `sync` argument. The
/// adapter's timeout is capped at `PERSISTENT_MAX_TIMEOUT_SECS`.
///
/// # Errors
/// Propagates the error when the wrapper script or a suitable Python
/// launcher cannot be located.
pub fn create_pypdf_adapter(ocr_enabled: bool) -> Result<SubprocessAdapter> {
    const FRAMEWORK: &str = "pypdf";

    let wrapper_script = get_script_path("pypdf_extract.py")?;
    let (program, mut argv) = find_python_with_framework(FRAMEWORK)?;
    argv.extend([
        wrapper_script.to_string_lossy().to_string(),
        format!("--timeout={}", PYTHON_EXTRACTION_TIMEOUT_SECS),
        ocr_flag(ocr_enabled),
        "sync".to_string(),
    ]);

    let adapter = SubprocessAdapter::new(FRAMEWORK, program, argv, vec![], get_supported_formats(FRAMEWORK))
        .with_max_timeout(Duration::from_secs(PERSISTENT_MAX_TIMEOUT_SECS));
    Ok(adapter)
}
/// Builds the subprocess adapter that runs playa-pdf through
/// `playa_pdf_extract.py`.
///
/// The adapter is registered as `playa-pdf`, while the Python launcher is
/// probed with the module name `playa`. Arguments appended after the
/// launcher, in order: the wrapper script path, `--timeout=N` (the
/// Python-side extraction timeout), the OCR flag for `ocr_enabled`, and the
/// literal `sync` argument. The adapter's timeout is capped at
/// `PERSISTENT_MAX_TIMEOUT_SECS`.
///
/// # Errors
/// Propagates the error when the wrapper script or a suitable Python
/// launcher cannot be located.
pub fn create_playa_pdf_adapter(ocr_enabled: bool) -> Result<SubprocessAdapter> {
    // Adapter/format-table key and importable module name differ for this framework.
    const ADAPTER_NAME: &str = "playa-pdf";
    const PYTHON_MODULE: &str = "playa";

    let wrapper_script = get_script_path("playa_pdf_extract.py")?;
    let (program, mut argv) = find_python_with_framework(PYTHON_MODULE)?;
    argv.extend([
        wrapper_script.to_string_lossy().to_string(),
        format!("--timeout={}", PYTHON_EXTRACTION_TIMEOUT_SECS),
        ocr_flag(ocr_enabled),
        "sync".to_string(),
    ]);

    let adapter = SubprocessAdapter::new(ADAPTER_NAME, program, argv, vec![], get_supported_formats(ADAPTER_NAME))
        .with_max_timeout(Duration::from_secs(PERSISTENT_MAX_TIMEOUT_SECS));
    Ok(adapter)
}
/// Builds the subprocess adapter that runs pdfminer.six through
/// `pdfminer_extract.py`.
///
/// Arguments appended after the resolved Python launcher, in order: the
/// wrapper script path, `--timeout=N` (the Python-side extraction timeout),
/// the OCR flag for `ocr_enabled`, and the literal `sync` argument. The
/// adapter's timeout is capped at `PERSISTENT_MAX_TIMEOUT_SECS`.
///
/// # Errors
/// Propagates the error when the wrapper script or a suitable Python
/// launcher cannot be located.
pub fn create_pdfminer_adapter(ocr_enabled: bool) -> Result<SubprocessAdapter> {
    const FRAMEWORK: &str = "pdfminer";

    let wrapper_script = get_script_path("pdfminer_extract.py")?;
    let (program, mut argv) = find_python_with_framework(FRAMEWORK)?;
    argv.extend([
        wrapper_script.to_string_lossy().to_string(),
        format!("--timeout={}", PYTHON_EXTRACTION_TIMEOUT_SECS),
        ocr_flag(ocr_enabled),
        "sync".to_string(),
    ]);

    let adapter = SubprocessAdapter::new(FRAMEWORK, program, argv, vec![], get_supported_formats(FRAMEWORK))
        .with_max_timeout(Duration::from_secs(PERSISTENT_MAX_TIMEOUT_SECS));
    Ok(adapter)
}
/// Builds the subprocess adapter that runs pdftotext through
/// `pdftotext_extract.py`.
///
/// Requires poppler-utils system package for the Python pdftotext binding.
///
/// Arguments appended after the resolved Python launcher, in order: the
/// wrapper script path, `--timeout=N` (the Python-side extraction timeout),
/// the OCR flag for `ocr_enabled`, and the literal `sync` argument. The
/// adapter's timeout is capped at `PERSISTENT_MAX_TIMEOUT_SECS`.
///
/// # Errors
/// Propagates the error when the wrapper script or a suitable Python
/// launcher cannot be located.
pub fn create_pdftotext_adapter(ocr_enabled: bool) -> Result<SubprocessAdapter> {
    const FRAMEWORK: &str = "pdftotext";

    let wrapper_script = get_script_path("pdftotext_extract.py")?;
    let (program, mut argv) = find_python_with_framework(FRAMEWORK)?;
    argv.extend([
        wrapper_script.to_string_lossy().to_string(),
        format!("--timeout={}", PYTHON_EXTRACTION_TIMEOUT_SECS),
        ocr_flag(ocr_enabled),
        "sync".to_string(),
    ]);

    let adapter = SubprocessAdapter::new(FRAMEWORK, program, argv, vec![], get_supported_formats(FRAMEWORK))
        .with_max_timeout(Duration::from_secs(PERSISTENT_MAX_TIMEOUT_SECS));
    Ok(adapter)
}
/// Builds the subprocess adapter that runs MinerU through `mineru_extract.py`.
///
/// Arguments appended after the resolved Python launcher, in order: the
/// wrapper script path, `--timeout=N` (the Python-side extraction timeout),
/// the OCR flag for `ocr_enabled`, and the literal `sync` argument. The
/// adapter's timeout is capped at the larger `SLOW_ML_TIMEOUT_SECS`.
///
/// NOTE(review): the Python-side `--timeout` still uses
/// `PYTHON_EXTRACTION_TIMEOUT_SECS`, which is derived from the regular
/// persistent timeout rather than from `SLOW_ML_TIMEOUT_SECS` — confirm that
/// is intended.
///
/// # Errors
/// Propagates the error when the wrapper script or a suitable Python
/// launcher cannot be located.
pub fn create_mineru_adapter(ocr_enabled: bool) -> Result<SubprocessAdapter> {
    const FRAMEWORK: &str = "mineru";

    let wrapper_script = get_script_path("mineru_extract.py")?;
    let (program, mut argv) = find_python_with_framework(FRAMEWORK)?;
    argv.extend([
        wrapper_script.to_string_lossy().to_string(),
        format!("--timeout={}", PYTHON_EXTRACTION_TIMEOUT_SECS),
        ocr_flag(ocr_enabled),
        "sync".to_string(),
    ]);

    let adapter = SubprocessAdapter::new(FRAMEWORK, program, argv, vec![], get_supported_formats(FRAMEWORK))
        .with_max_timeout(Duration::from_secs(SLOW_ML_TIMEOUT_SECS));
    Ok(adapter)
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Whether the script is found depends on the working directory and on
    /// `CARGO_MANIFEST_DIR`, so a failed lookup is tolerated; a successful
    /// lookup, however, must point at an existing file with the requested name.
    #[test]
    fn test_get_script_path() {
        if let Ok(path) = get_script_path("docling_extract.py") {
            assert!(
                path.ends_with("docling_extract.py"),
                "resolved path {} does not end with the requested script name",
                path.display()
            );
            assert!(path.exists(), "resolved path {} does not exist", path.display());
        }
    }

    /// Smoke test: every constructor must return (with `Ok` or `Err`) without
    /// panicking, whether or not the external tools are installed.
    #[tokio::test]
    async fn test_adapter_creation() {
        let _ = create_docling_adapter(true);
        let _ = create_unstructured_adapter(true);
        let _ = create_markitdown_adapter(true);
        let _ = create_pandoc_adapter();
        let _ = create_tika_adapter(true);
        let _ = create_pymupdf4llm_adapter(true);
        let _ = create_pdfplumber_adapter(true);
        let _ = create_mineru_adapter(true);
        let _ = create_pypdf_adapter(true);
        let _ = create_pdfminer_adapter(true);
        let _ = create_pdftotext_adapter(true);
        let _ = create_playa_pdf_adapter(true);
    }
}

View File

@@ -0,0 +1,166 @@
//! Kreuzberg adapter for Wave 2 benchmark harness.
//!
//! Provides subprocess-based extraction via kreuzberg with support for:
//! - Three pipelines: baseline, layout, paddle-ocr
//! - Single-file and batch extraction modes
//! - JSON envelope parsing (ExtractEnvelope and BatchEnvelope)
use crate::{
adapters::subprocess::SubprocessAdapter,
error::Result,
types::{KreuzbergPipeline, OutputFormat},
};
use std::path::PathBuf;
use which::which;
/// Creates a Kreuzberg adapter for the given pipeline and configuration.
///
/// The adapter invokes the kreuzberg CLI with either the `extract` or the
/// `batch` subcommand, JSON output, the requested content format, the
/// pipeline-specific flags, and `--pdf-backend pdf-oxide`. Its name has the
/// form `kreuzberg-<format>-<pipeline>` with a `-batch` suffix in batch mode.
///
/// # Arguments
/// * `pipeline` - The pipeline variant (baseline, layout, paddle-ocr)
/// * `output_format` - Output format for extraction (markdown or plaintext)
/// * `batch` - Whether to use batch extraction mode
///
/// # Returns
/// * `Ok(SubprocessAdapter)` - Configured adapter ready for extraction
/// * `Err(Error)` - If kreuzberg cannot be located
pub fn create_kreuzberg_adapter(
    pipeline: KreuzbergPipeline,
    output_format: OutputFormat,
    batch: bool,
) -> Result<SubprocessAdapter> {
    let cli_path = locate_kreuzberg_cli()?;

    // The CLI spells plaintext as "plain", while the adapter name uses "plaintext".
    let (content_format, format_slug) = match output_format {
        OutputFormat::Markdown => ("markdown", "markdown"),
        OutputFormat::Plaintext => ("plain", "plaintext"),
    };

    // Build command arguments
    let subcommand = if batch { "batch" } else { "extract" };
    let mut args: Vec<String> = [subcommand, "--format", "json", "--content-format", content_format]
        .iter()
        .map(|arg| arg.to_string())
        .collect();

    // Add pipeline-specific flags
    let pipeline_flags: &[&str] = match pipeline {
        // No additional flags for baseline
        KreuzbergPipeline::Baseline => &[],
        // `--layout` is Option<bool> with `num_args = 0..=1`, so `--layout true` parses.
        // `--use-layout-for-markdown` is a plain `bool` presence flag — appending "true"
        // as a second token leaves the literal "true" as an orphan positional argument
        // and clap rejects the whole invocation, producing the 100% harness-error
        // pattern observed on the Kreuzberg Layout variant in the dashboard.
        KreuzbergPipeline::Layout => &["--layout", "true", "--use-layout-for-markdown"],
        KreuzbergPipeline::PaddleOcr => &[
            "--ocr",
            "true",
            "--ocr-backend",
            "paddle-ocr",
            "--force-ocr",
            "true",
        ],
    };
    args.extend(pipeline_flags.iter().map(|flag| flag.to_string()));

    // Forward-compat marker: always specify pdf-backend
    args.extend(["--pdf-backend", "pdf-oxide"].iter().map(|arg| arg.to_string()));

    let batch_suffix = if batch { "-batch" } else { "" };
    let framework_name = format!("kreuzberg-{}-{}{}", format_slug, pipeline.as_str(), batch_suffix);

    // Each extension is listed exactly once ("json" used to appear twice).
    let supported_formats: Vec<String> = [
        "pdf", "docx", "doc", "xlsx", "xls", "pptx", "ppt", "txt", "md", "html", "xml", "json", "odt", "ods", "odp",
        "epub", "rtf", "csv", "yaml", "png", "jpg", "jpeg", "gif", "bmp", "tiff", "tif", "webp", "zip", "tar", "gz",
        "7z",
    ]
    .iter()
    .map(|ext| ext.to_string())
    .collect();

    let adapter = if batch {
        SubprocessAdapter::with_batch_support(&framework_name, cli_path, args, vec![], supported_formats)
    } else {
        SubprocessAdapter::new(&framework_name, cli_path, args, vec![], supported_formats)
    };
    Ok(adapter)
}
/// Locates the kreuzberg executable.
///
/// Searches in priority order:
/// 1. `target/release/kreuzberg`
/// 2. `target/debug/kreuzberg`
/// 3. `which kreuzberg`
///
/// # Returns
/// * `Ok(PathBuf)` - Path to the executable
/// * `Err(Error)` - If kreuzberg cannot be found
fn locate_kreuzberg_cli() -> Result<PathBuf> {
// Try release build first
let release_path = PathBuf::from("target/release/kreuzberg");
if release_path.exists() {
return Ok(release_path);
}
// Try debug build
let debug_path = PathBuf::from("target/debug/kreuzberg");
if debug_path.exists() {
return Ok(debug_path);
}
// Try system PATH
if let Ok(path) = which("kreuzberg") {
return Ok(path);
}
Err(crate::Error::Benchmark(
"kreuzberg binary not found. Build with: cargo build --release -p kreuzberg-cli --features all".to_string(),
))
}
#[cfg(test)]
mod tests {
    use super::*;

    /// The baseline pipeline contributes "baseline" to adapter names.
    #[test]
    fn test_pipeline_baseline_str() {
        let slug = KreuzbergPipeline::Baseline.as_str();
        assert_eq!(slug, "baseline");
    }

    /// The layout pipeline contributes "layout" to adapter names.
    #[test]
    fn test_pipeline_layout_str() {
        let slug = KreuzbergPipeline::Layout.as_str();
        assert_eq!(slug, "layout");
    }

    /// The PaddleOCR pipeline contributes "paddle-ocr" to adapter names.
    #[test]
    fn test_pipeline_paddle_ocr_str() {
        let slug = KreuzbergPipeline::PaddleOcr.as_str();
        assert_eq!(slug, "paddle-ocr");
    }

    /// Markdown renders as "markdown".
    #[test]
    fn test_output_format_markdown() {
        let rendered = OutputFormat::Markdown.to_string();
        assert_eq!(rendered, "markdown");
    }

    /// Plaintext renders as "plaintext".
    #[test]
    fn test_output_format_plaintext() {
        let rendered = OutputFormat::Plaintext.to_string();
        assert_eq!(rendered, "plaintext");
    }
}

View File

@@ -0,0 +1,39 @@
//! Framework adapter implementations
pub mod external;
pub mod kreuzberg;
pub mod subprocess;
pub use external::{
create_docling_adapter, create_markitdown_adapter, create_mineru_adapter, create_pandoc_adapter,
create_pdfminer_adapter, create_pdfplumber_adapter, create_pdftotext_adapter, create_playa_pdf_adapter,
create_pymupdf4llm_adapter, create_pypdf_adapter, create_tika_adapter, create_unstructured_adapter,
};
pub use kreuzberg::create_kreuzberg_adapter;
pub use subprocess::SubprocessAdapter;
/// Maps the OCR toggle to the command-line switch handed to wrapper scripts.
///
/// Yields `--ocr` when `ocr_enabled` is true and `--no-ocr` otherwise.
pub(crate) fn ocr_flag(ocr_enabled: bool) -> String {
    let switch = if ocr_enabled { "--ocr" } else { "--no-ocr" };
    switch.to_owned()
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Enabled OCR maps to the `--ocr` switch.
    #[test]
    fn test_ocr_flag_when_enabled() {
        assert_eq!(ocr_flag(true), "--ocr", "Should return '--ocr' when enabled");
    }

    /// Disabled OCR maps to the `--no-ocr` switch.
    #[test]
    fn test_ocr_flag_when_disabled() {
        assert_eq!(ocr_flag(false), "--no-ocr", "Should return '--no-ocr' when disabled");
    }
}

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,474 @@
//! Benchmark configuration
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::path::{Path, PathBuf};
use std::time::Duration;
use crate::types::DiskSizeInfo;
use crate::{Error, Result};
/// Benchmark execution mode
///
/// Selects whether a run measures per-file latency or overall throughput.
/// The type is `Copy` and serializable via serde.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum BenchmarkMode {
/// Single-file mode: Sequential execution (max_concurrent=1) for fair latency comparison
SingleFile,
/// Batch mode: Concurrent execution to measure throughput
Batch,
}
/// CPU/memory profiling configuration for benchmark analysis
///
/// Holds the sampling frequency, batch size, memory-sampling interval, and
/// sample-count threshold used when profiling benchmark runs.
///
/// # Sampling Frequency
///
/// `validate` accepts a `sampling_frequency` of 100-10000 Hz. The adaptive helper
/// `calculate_optimal_frequency` is narrower: it aims for 500 samples over the
/// estimated task duration and clamps its result to 100-500 Hz, so it never
/// returns more than 500 Hz, even for very short tasks.
///
/// # Task Duration Amplification
///
/// When profiling is enabled, tasks can be amplified (repeated multiple times) to increase
/// profiling duration and reduce variance in sample collection.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ProfilingConfig {
/// Enable/disable CPU profiling
pub enabled: bool,
/// CPU sampling frequency in Hz (100-10000)
/// Adjusted adaptively based on estimated task duration
pub sampling_frequency: i32,
/// Minimum task duration in milliseconds for adaptive frequency calculation
/// Tasks shorter than this use higher sampling frequencies
pub task_duration_ms: u64,
/// Number of documents per profiling batch
/// Larger batches provide more samples but increase memory usage
pub batch_size: usize,
/// Memory sample collection interval in milliseconds (0 = disabled)
pub memory_sampling_interval_ms: u64,
/// Enable flamegraph generation after profiling completes
pub flamegraph_enabled: bool,
/// Minimum number of samples required for a valid profile
/// Profiles with fewer samples may have high variance
pub sample_count_threshold: usize,
}
impl Default for ProfilingConfig {
fn default() -> Self {
Self {
enabled: false,
sampling_frequency: 1000,
task_duration_ms: 500,
batch_size: 10,
memory_sampling_interval_ms: 10,
flamegraph_enabled: true,
sample_count_threshold: 500,
}
}
}
impl ProfilingConfig {
    /// Create a new profiling configuration with validation
    ///
    /// Every field not covered by a parameter takes its value from
    /// [`Default::default`] (profiling disabled, 500 ms task duration, 10 ms
    /// memory sampling interval, flamegraphs enabled).
    ///
    /// # Arguments
    ///
    /// * `sampling_frequency` - CPU sampling frequency in Hz (100-10000)
    /// * `batch_size` - Number of documents per profiling batch (must be > 0)
    /// * `sample_count_threshold` - Minimum samples for valid profile (must be > 0)
    ///
    /// # Errors
    ///
    /// Returns [`crate::Error::Config`] if any configuration value is invalid
    pub fn new(sampling_frequency: i32, batch_size: usize, sample_count_threshold: usize) -> crate::Result<Self> {
        let config = Self {
            sampling_frequency,
            batch_size,
            sample_count_threshold,
            ..Self::default()
        };
        config.validate()?;
        Ok(config)
    }

    /// Validate the profiling configuration
    ///
    /// Checks, in order: `sampling_frequency` within 100-10000 Hz, `batch_size`
    /// non-zero, `sample_count_threshold` non-zero. The first failing check
    /// determines the error.
    ///
    /// # Errors
    ///
    /// Returns [`crate::Error::Config`] if any configuration value is invalid
    pub fn validate(&self) -> crate::Result<()> {
        if !(100..=10000).contains(&self.sampling_frequency) {
            return Err(crate::Error::Config(format!(
                "sampling_frequency must be 100-10000 Hz, got {}",
                self.sampling_frequency
            )));
        }
        if self.batch_size == 0 {
            return Err(crate::Error::Config("batch_size must be > 0".to_string()));
        }
        if self.sample_count_threshold == 0 {
            return Err(crate::Error::Config("sample_count_threshold must be > 0".to_string()));
        }
        Ok(())
    }

    /// Calculate optimal sampling frequency based on estimated task duration
    ///
    /// Uses realistic sysinfo limits (100-500 Hz) to achieve target sample count.
    /// sysinfo cannot reliably achieve >500 Hz on most systems due to:
    /// - Process scheduling granularity
    /// - System call overhead
    /// - File descriptor refresh costs
    ///
    /// Target: 500 samples minimum for statistical significance
    ///
    /// # Arguments
    ///
    /// * `estimated_duration_ms` - Estimated task duration in milliseconds;
    ///   `0` yields the maximum frequency
    ///
    /// # Returns
    ///
    /// Optimal sampling frequency in Hz (clamped to 100-500 range)
    pub fn calculate_optimal_frequency(estimated_duration_ms: u64) -> i32 {
        const TARGET_SAMPLE_COUNT: u64 = 500;
        const REALISTIC_MIN_HZ: i32 = 100;
        const REALISTIC_MAX_HZ: i32 = 500;

        // `checked_div` is `None` only for a zero duration, which maps to the
        // fastest realistic rate.
        match (TARGET_SAMPLE_COUNT * 1000).checked_div(estimated_duration_ms) {
            None => REALISTIC_MAX_HZ,
            Some(required_hz) => i32::try_from(required_hz)
                .unwrap_or(i32::MAX)
                .clamp(REALISTIC_MIN_HZ, REALISTIC_MAX_HZ),
        }
    }

    /// Calculate sampling interval in milliseconds from frequency in Hz
    ///
    /// Converts sampling frequency to the actual interval between samples
    /// using integer division (`1000 / frequency`).
    ///
    /// A zero or negative frequency is treated as 1 Hz. Previously `0` caused
    /// a division-by-zero panic and negative values wrapped around when cast
    /// to an unsigned integer.
    ///
    /// # Arguments
    ///
    /// * `sampling_frequency_hz` - Sampling frequency in Hz
    ///
    /// # Returns
    ///
    /// Sampling interval in milliseconds (minimum 1ms, maximum 1000ms)
    pub fn calculate_sample_interval_ms(sampling_frequency_hz: i32) -> u64 {
        let frequency_hz = u64::try_from(sampling_frequency_hz).unwrap_or(0).max(1);
        (1000 / frequency_hz).max(1)
    }
}
/// Configuration for benchmark runs
///
/// Construct it with `BenchmarkConfig::new` (which validates) or start from
/// `Default` and call `BenchmarkConfig::validate` after adjusting fields.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BenchmarkConfig {
/// File types to include (e.g., ["pdf", "docx"])
///
/// The default is `None`.
pub file_types: Option<Vec<String>>,
/// Timeout for each extraction
///
/// A zero timeout is rejected by `BenchmarkConfig::validate`. The default is 1800 seconds.
pub timeout: Duration,
/// Maximum number of concurrent extractions
///
/// Must be greater than zero, and exactly 1 when `benchmark_mode` is
/// `BenchmarkMode::SingleFile`. The default is `num_cpus::get()`.
pub max_concurrent: usize,
/// Output directory for results
///
/// The default is the relative path `results`.
pub output_dir: PathBuf,
/// Whether to include quality assessment
///
/// The default is `false`.
pub measure_quality: bool,
/// Benchmark execution mode (single-file or batch)
///
/// The default is `BenchmarkMode::Batch`.
pub benchmark_mode: BenchmarkMode,
/// Number of warmup iterations (discarded from statistics)
///
/// The default is 1. This field is not checked by `BenchmarkConfig::validate`.
pub warmup_iterations: usize,
/// Number of benchmark iterations for statistical analysis
///
/// Must be greater than zero. The default is 3.
pub benchmark_iterations: usize,
/// Profiling configuration for CPU/memory analysis
///
/// `BenchmarkConfig::validate` also runs `ProfilingConfig::validate` on this value.
pub profiling: ProfilingConfig,
/// Whether OCR is enabled for this benchmark run.
/// When false, fixtures that require OCR (images, scanned PDFs) are excluded.
///
/// The default is `false`.
pub ocr_enabled: bool,
}
impl Default for BenchmarkConfig {
    /// Baseline benchmark settings.
    ///
    /// Batch mode, one warmup and three measured iterations, a 1800-second
    /// timeout, results written to `results`, no file-type restriction, and
    /// quality measurement and OCR both switched off.
    fn default() -> Self {
        // One concurrent extraction per CPU reported by `num_cpus`.
        let max_concurrent = num_cpus::get();
        let timeout = Duration::from_secs(1800);
        let output_dir = PathBuf::from("results");

        Self {
            output_dir,
            timeout,
            max_concurrent,
            benchmark_mode: BenchmarkMode::Batch,
            warmup_iterations: 1,
            benchmark_iterations: 3,
            file_types: None,
            measure_quality: false,
            ocr_enabled: false,
            profiling: ProfilingConfig::default(),
        }
    }
}
impl BenchmarkConfig {
    /// Create a new benchmark configuration with validation.
    ///
    /// Fields without a parameter get fixed values: no file-type filter,
    /// quality measurement off, one warmup iteration, default profiling
    /// settings, and OCR off.
    ///
    /// # Arguments
    ///
    /// * `output_dir` - Directory for results
    /// * `max_concurrent` - Maximum concurrent extractions (must be > 0)
    /// * `benchmark_iterations` - Number of iterations (must be > 0)
    /// * `timeout` - Timeout per extraction (must be non-zero)
    /// * `benchmark_mode` - SingleFile or Batch mode
    ///
    /// # Errors
    ///
    /// Returns [`crate::Error::Config`] if any configuration value is invalid
    pub fn new(
        output_dir: PathBuf,
        max_concurrent: usize,
        benchmark_iterations: usize,
        timeout: Duration,
        benchmark_mode: BenchmarkMode,
    ) -> crate::Result<Self> {
        let config = Self {
            file_types: None,
            timeout,
            max_concurrent,
            output_dir,
            measure_quality: false,
            benchmark_mode,
            warmup_iterations: 1,
            benchmark_iterations,
            profiling: ProfilingConfig::default(),
            ocr_enabled: false,
        };
        config.validate()?;
        Ok(config)
    }

    /// Validate the configuration.
    ///
    /// Checks, in order: the timeout is non-zero, `max_concurrent` and
    /// `benchmark_iterations` are non-zero, single-file mode uses exactly one
    /// concurrent extraction, and the nested profiling configuration is valid.
    ///
    /// # Errors
    ///
    /// Returns [`crate::Error::Config`] if any configuration value is invalid
    pub fn validate(&self) -> crate::Result<()> {
        // `is_zero` looks at the whole duration. Comparing `as_secs()` to 0
        // would also reject valid sub-second timeouts such as 500 ms.
        if self.timeout.is_zero() {
            return Err(crate::Error::Config("Timeout must be > 0".to_string()));
        }
        if self.max_concurrent == 0 {
            return Err(crate::Error::Config("max_concurrent must be > 0".to_string()));
        }
        if self.benchmark_iterations == 0 {
            return Err(crate::Error::Config("benchmark_iterations must be > 0".to_string()));
        }
        if self.benchmark_mode == BenchmarkMode::SingleFile && self.max_concurrent != 1 {
            return Err(crate::Error::Config(
                "single-file mode requires max_concurrent=1".to_string(),
            ));
        }
        self.profiling.validate()
    }
}
/// Load framework disk sizes from JSON configuration file
pub fn load_framework_sizes(config_path: &Path) -> Result<HashMap<String, DiskSizeInfo>> {
let json_content = std::fs::read_to_string(config_path).map_err(Error::Io)?;
let sizes: HashMap<String, DiskSizeInfo> = serde_json::from_str(&json_content)
.map_err(|e| Error::Benchmark(format!("Failed to parse framework sizes: {}", e)))?;
Ok(sizes)
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Call the validating `BenchmarkConfig` constructor with the output
    /// directory fixed to `/tmp/results` and the timeout given in seconds.
    fn benchmark_config(
        max_concurrent: usize,
        benchmark_iterations: usize,
        timeout_secs: u64,
        mode: BenchmarkMode,
    ) -> crate::Result<BenchmarkConfig> {
        BenchmarkConfig::new(
            PathBuf::from("/tmp/results"),
            max_concurrent,
            benchmark_iterations,
            Duration::from_secs(timeout_secs),
            mode,
        )
    }

    /// Assert that `outcome` is an error and return its display text.
    fn rejection_text<T: std::fmt::Debug>(outcome: crate::Result<T>) -> String {
        assert!(outcome.is_err());
        outcome.unwrap_err().to_string()
    }

    // ---- BenchmarkConfig::validate ----

    #[test]
    fn test_valid_batch_config() {
        assert!(benchmark_config(4, 3, 180, BenchmarkMode::Batch).is_ok());
    }

    #[test]
    fn test_valid_single_file_config() {
        assert!(benchmark_config(1, 3, 180, BenchmarkMode::SingleFile).is_ok());
    }

    #[test]
    fn test_zero_timeout_rejected() {
        let text = rejection_text(benchmark_config(4, 3, 0, BenchmarkMode::Batch));
        assert!(text.contains("Timeout must be > 0"));
    }

    #[test]
    fn test_zero_max_concurrent_rejected() {
        let text = rejection_text(benchmark_config(0, 3, 180, BenchmarkMode::Batch));
        assert!(text.contains("max_concurrent must be > 0"));
    }

    #[test]
    fn test_zero_iterations_rejected() {
        let text = rejection_text(benchmark_config(4, 0, 180, BenchmarkMode::Batch));
        assert!(text.contains("benchmark_iterations must be > 0"));
    }

    #[test]
    fn test_single_file_mode_requires_max_concurrent_one() {
        // Four concurrent extractions is deliberately not 1.
        let text = rejection_text(benchmark_config(4, 3, 180, BenchmarkMode::SingleFile));
        assert!(text.contains("single-file mode requires max_concurrent=1"));
    }

    #[test]
    fn test_default_config_validates() {
        // The default is batch mode with `max_concurrent` taken from the CPU
        // count, so this only fails on a machine that reports zero CPUs.
        assert!(BenchmarkConfig::default().validate().is_ok());
    }

    // ---- ProfilingConfig::validate ----

    #[test]
    fn test_valid_profiling_config() {
        assert!(ProfilingConfig::new(1000, 10, 500).is_ok());
    }

    #[test]
    fn test_profiling_frequency_too_low() {
        let text = rejection_text(ProfilingConfig::new(50, 10, 500));
        assert!(text.contains("sampling_frequency must be 100-10000 Hz"));
    }

    #[test]
    fn test_profiling_frequency_too_high() {
        let text = rejection_text(ProfilingConfig::new(20_000, 10, 500));
        assert!(text.contains("sampling_frequency must be 100-10000 Hz"));
    }

    #[test]
    fn test_profiling_zero_batch_size() {
        let text = rejection_text(ProfilingConfig::new(1000, 0, 500));
        assert!(text.contains("batch_size must be > 0"));
    }

    #[test]
    fn test_profiling_zero_sample_threshold() {
        let text = rejection_text(ProfilingConfig::new(1000, 10, 0));
        assert!(text.contains("sample_count_threshold must be > 0"));
    }

    #[test]
    fn test_profiling_boundary_frequencies() {
        // Both ends of the inclusive range are accepted.
        for accepted_hz in [100, 10000] {
            assert!(ProfilingConfig::new(accepted_hz, 1, 1).is_ok());
        }
        // One step outside either end is rejected.
        for rejected_hz in [99, 10001] {
            assert!(ProfilingConfig::new(rejected_hz, 1, 1).is_err());
        }
    }

    // ---- frequency / interval helpers ----

    #[test]
    fn test_optimal_frequency_zero_duration() {
        // Zero duration short-circuits to the 500 Hz ceiling.
        assert_eq!(ProfilingConfig::calculate_optimal_frequency(0), 500);
    }

    #[test]
    fn test_optimal_frequency_short_task() {
        // 500 * 1000 / 100 = 5000, which is clamped down to 500.
        assert_eq!(ProfilingConfig::calculate_optimal_frequency(100), 500);
    }

    #[test]
    fn test_optimal_frequency_long_task() {
        // 500 * 1000 / 10000 = 50, which is clamped up to 100.
        assert_eq!(ProfilingConfig::calculate_optimal_frequency(10_000), 100);
    }

    #[test]
    fn test_sample_interval_calculation() {
        for (frequency_hz, expected_ms) in [(1000, 1), (100, 10), (500, 2)] {
            assert_eq!(ProfilingConfig::calculate_sample_interval_ms(frequency_hz), expected_ms);
        }
    }
}

View File

@@ -0,0 +1,198 @@
//! Loading benchmark results from disk for consolidation
//!
//! This module provides `load_run_results` which recursively loads benchmark
//! result JSON files from a directory tree, tagging them with batch mode info
//! inferred from directory names.
use crate::types::BenchmarkResult;
use crate::{Error, Result};
use std::fs;
use std::path::Path;
/// Load benchmark results from `results.json` files in a directory.
///
/// Recursively walks the given directory, loading any `results.json` files found.
/// For directories whose name ends with `-batch`, the framework name in each result
/// is suffixed with `-batch` so that the aggregation layer can distinguish single-
/// vs batch-mode results.
///
/// # Errors
///
/// Returns [`Error::Io`] if the directory cannot be read, or [`Error::Benchmark`]
/// if a `results.json` file contains invalid JSON or fails validation.
pub fn load_run_results(dir: &Path) -> Result<Vec<BenchmarkResult>> {
let mut results = Vec::new();
for entry in fs::read_dir(dir).map_err(Error::Io)? {
let entry = entry.map_err(Error::Io)?;
let path = entry.path();
if path.is_file() && path.file_name().is_some_and(|n| n == "results.json") {
eprintln!("Loading results from {}", path.display());
let json_content = fs::read_to_string(&path).map_err(Error::Io)?;
let mut run_results: Vec<BenchmarkResult> = serde_json::from_str(&json_content)
.map_err(|e| Error::Benchmark(format!("Failed to parse {}: {}", path.display(), e)))?;
// Infer benchmark mode from the parent directory name.
// The runner outputs to `benchmark-results/{FRAMEWORK}-{MODE}/results.json`
// where MODE is "batch" or "single-file". The framework field inside
// results.json does NOT include the mode, so we tag it here to allow
// the aggregation to distinguish single vs batch results.
let dir_name = dir.file_name().and_then(|n| n.to_str()).unwrap_or("");
let is_batch = dir_name.ends_with("-batch");
if is_batch {
for result in &mut run_results {
if !result.framework.ends_with("-batch") {
result.framework = format!("{}-batch", result.framework);
}
}
}
// Validate loaded results
for result in &run_results {
crate::output::validate_result(result)
.map_err(|e| Error::Benchmark(format!("Invalid result in {}: {}", path.display(), e)))?;
}
results.extend(run_results);
} else if path.is_dir() {
match load_run_results(&path) {
Ok(mut run_results) => results.append(&mut run_results),
Err(e) => eprintln!("Warning: Failed to load results from {}: {}", path.display(), e),
}
}
}
Ok(results)
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::types::{ErrorKind, FrameworkCapabilities, OutputFormat, PerformanceMetrics};
    use std::time::Duration;

    /// Minimal successful `BenchmarkResult` attributed to `framework`.
    fn make_result(framework: &str) -> BenchmarkResult {
        let metrics = PerformanceMetrics {
            peak_memory_bytes: 1_000_000,
            avg_cpu_percent: 50.0,
            throughput_bytes_per_sec: 10_240.0,
            p50_memory_bytes: 900_000,
            p95_memory_bytes: 950_000,
            p99_memory_bytes: 990_000,
        };
        BenchmarkResult {
            framework: framework.to_owned(),
            file_path: std::path::PathBuf::from("test.pdf"),
            file_size: 1024,
            success: true,
            error_message: None,
            error_kind: ErrorKind::None,
            duration: Duration::from_millis(100),
            extraction_duration: None,
            subprocess_overhead: None,
            metrics,
            quality: None,
            iterations: vec![],
            statistics: None,
            cold_start_duration: None,
            file_extension: "pdf".to_owned(),
            framework_capabilities: FrameworkCapabilities::default(),
            pdf_metadata: None,
            ocr_status: Default::default(),
            extracted_text: None,
            output_format: OutputFormat::Markdown,
        }
    }

    /// Serialize `results` into `<dir>/results.json`, panicking with
    /// `write_context` if the write fails.
    fn write_results_json(dir: &Path, results: &[BenchmarkResult], write_context: &str) {
        let json = serde_json::to_string(results).expect("serialize");
        fs::write(dir.join("results.json"), json).expect(write_context);
    }

    /// Create `<root>/kreuzberg-rust-batch` holding one result for `framework`.
    fn seed_batch_dir(root: &Path, framework: &str) {
        let batch_dir = root.join("kreuzberg-rust-batch");
        fs::create_dir_all(&batch_dir).expect("create subdir");
        write_results_json(&batch_dir, &[make_result(framework)], "write");
    }

    #[test]
    fn test_load_single_results_file() {
        let tmp = tempfile::tempdir().expect("create temp dir");
        write_results_json(tmp.path(), &[make_result("kreuzberg-rust")], "write");

        let loaded = load_run_results(tmp.path()).expect("load");
        assert_eq!(loaded.len(), 1);
        assert_eq!(loaded[0].framework, "kreuzberg-rust");
    }

    #[test]
    fn test_batch_directory_tags_framework_name() {
        let tmp = tempfile::tempdir().expect("create temp dir");
        seed_batch_dir(tmp.path(), "kreuzberg-rust");

        let loaded = load_run_results(tmp.path()).expect("load");
        assert_eq!(loaded.len(), 1);
        assert_eq!(loaded[0].framework, "kreuzberg-rust-batch");
    }

    #[test]
    fn test_batch_suffix_not_doubled() {
        let tmp = tempfile::tempdir().expect("create temp dir");
        seed_batch_dir(tmp.path(), "kreuzberg-rust-batch");

        let loaded = load_run_results(tmp.path()).expect("load");
        assert_eq!(loaded.len(), 1);
        assert_eq!(loaded[0].framework, "kreuzberg-rust-batch");
    }

    #[test]
    fn test_recursive_loading() {
        let tmp = tempfile::tempdir().expect("create temp dir");
        let first = tmp.path().join("framework-a");
        let second = tmp.path().join("framework-b");
        fs::create_dir_all(&first).expect("create subdir 1");
        fs::create_dir_all(&second).expect("create subdir 2");
        write_results_json(&first, &[make_result("framework-a")], "write a");
        write_results_json(&second, &[make_result("framework-b")], "write b");

        let loaded = load_run_results(tmp.path()).expect("load");
        assert_eq!(loaded.len(), 2);
        // Directory iteration order is unspecified, so check membership only.
        for expected in ["framework-a", "framework-b"] {
            assert!(loaded.iter().any(|result| result.framework == expected));
        }
    }

    #[test]
    fn test_malformed_json_returns_error() {
        let tmp = tempfile::tempdir().expect("create temp dir");
        fs::write(tmp.path().join("results.json"), "NOT VALID JSON").expect("write");

        let outcome = load_run_results(tmp.path());
        assert!(outcome.is_err());
        let text = outcome.unwrap_err().to_string();
        assert!(text.contains("Failed to parse"));
    }

    #[test]
    fn test_empty_directory_returns_empty_vec() {
        let tmp = tempfile::tempdir().expect("create temp dir");
        assert!(load_run_results(tmp.path()).expect("load").is_empty());
    }

    #[test]
    fn test_nonexistent_directory_returns_error() {
        let missing = Path::new("/tmp/nonexistent_benchmark_dir_12345");
        assert!(load_run_results(missing).is_err());
    }
}

View File

@@ -0,0 +1,148 @@
//! Corpus discovery and filtering for benchmark documents.
//!
//! Builds on the existing [`FixtureManager`] to provide structured corpus access
//! with filtering by file type, ground truth availability, and name patterns.
use crate::Result;
use crate::fixture::FixtureManager;
use std::path::{Path, PathBuf};
/// A document in the benchmark corpus with resolved paths.
///
/// Instances are produced by `build_corpus`, one per fixture that passes the
/// supplied `CorpusFilter`.
#[derive(Debug, Clone)]
pub struct CorpusDocument {
/// Human-readable name (fixture stem, e.g. "nougat_001")
///
/// Taken from the file stem of the fixture path; empty when the stem is
/// missing or not valid UTF-8.
pub name: String,
/// Absolute path to the source document
///
/// Produced by the fixture's `resolve_document_path` relative to the
/// directory containing the fixture file.
/// NOTE(review): absoluteness depends on that resolver — confirm.
pub document_path: PathBuf,
/// File type (e.g. "pdf", "docx")
pub file_type: String,
/// File size in bytes
///
/// Copied from the fixture's `file_size` field rather than read from disk here.
pub file_size: u64,
/// Absolute path to text ground truth (if available)
pub ground_truth_text: Option<PathBuf>,
/// Absolute path to markdown ground truth (if available)
pub ground_truth_markdown: Option<PathBuf>,
}
/// Filter criteria for corpus discovery.
///
/// All criteria are combined with AND in `build_corpus`. The `Default` value
/// applies no restriction at all.
#[derive(Debug, Clone, Default)]
pub struct CorpusFilter {
/// Only include these file types (None = all)
///
/// Matching is an exact comparison against the fixture's `file_type` string.
pub file_types: Option<Vec<String>>,
/// Require text ground truth
pub require_ground_truth: bool,
/// Require markdown ground truth
pub require_markdown_ground_truth: bool,
/// Maximum file size in bytes (None = no limit)
///
/// The limit is inclusive: only fixtures strictly larger than it are skipped.
pub max_file_size: Option<u64>,
/// Only include fixtures whose name contains one of these strings
///
/// Plain substring match against the fixture stem; an empty list disables
/// name filtering.
pub name_patterns: Vec<String>,
}
/// Load fixtures from `fixtures_dir` and return those that satisfy `filter`.
///
/// `fixtures_dir` may be a directory (all fixtures in it are loaded) or a
/// single fixture file. For each fixture, the filters are applied in this
/// order: name patterns, file type, file size, then ground-truth
/// availability. The surviving documents are returned sorted by name.
///
/// # Errors
///
/// Propagates any error reported by the [`FixtureManager`] while loading.
pub fn build_corpus(fixtures_dir: &Path, filter: &CorpusFilter) -> Result<Vec<CorpusDocument>> {
    let mut manager = FixtureManager::new();
    if fixtures_dir.is_dir() {
        manager.load_fixtures_from_dir(fixtures_dir)?;
    } else {
        manager.load_fixture(fixtures_dir)?;
    }

    let mut corpus = Vec::new();
    for (fixture_path, fixture) in manager.fixtures() {
        // A fixture path without a parent gives nothing to resolve against.
        let Some(fixture_dir) = fixture_path.parent() else {
            continue;
        };

        let name = fixture_path
            .file_stem()
            .and_then(|stem| stem.to_str())
            .unwrap_or("")
            .to_string();

        // Name filter: no patterns means everything passes, otherwise any
        // single substring hit is enough.
        let name_accepted = filter.name_patterns.is_empty()
            || filter
                .name_patterns
                .iter()
                .any(|pattern| name.contains(pattern.as_str()));
        if !name_accepted {
            continue;
        }

        // File type filter.
        if let Some(ref allowed_types) = filter.file_types {
            if !allowed_types.contains(&fixture.file_type) {
                continue;
            }
        }

        // File size filter (limit is inclusive).
        if let Some(size_limit) = filter.max_file_size {
            if fixture.file_size > size_limit {
                continue;
            }
        }

        let document_path = fixture.resolve_document_path(fixture_dir);
        let ground_truth_text = fixture.resolve_ground_truth_path(fixture_dir);
        let ground_truth_markdown = fixture.resolve_ground_truth_markdown_path(fixture_dir);

        // Ground truth requirements.
        let missing_text = filter.require_ground_truth && ground_truth_text.is_none();
        let missing_markdown = filter.require_markdown_ground_truth && ground_truth_markdown.is_none();
        if missing_text || missing_markdown {
            continue;
        }

        corpus.push(CorpusDocument {
            name,
            document_path,
            file_type: fixture.file_type.clone(),
            file_size: fixture.file_size,
            ground_truth_text,
            ground_truth_markdown,
        });
    }

    corpus.sort_by(|left, right| left.name.cmp(&right.name));
    Ok(corpus)
}
/// Convenience: all PDFs with text ground truth.
pub fn pdf_corpus(fixtures_dir: &Path) -> Result<Vec<CorpusDocument>> {
build_corpus(
fixtures_dir,
&CorpusFilter {
file_types: Some(vec!["pdf".to_string()]),
require_ground_truth: true,
..Default::default()
},
)
}
/// Convenience: all PDFs with markdown ground truth.
pub fn pdf_markdown_corpus(fixtures_dir: &Path) -> Result<Vec<CorpusDocument>> {
build_corpus(
fixtures_dir,
&CorpusFilter {
file_types: Some(vec!["pdf".to_string()]),
require_ground_truth: true,
require_markdown_ground_truth: true,
..Default::default()
},
)
}
#[cfg(test)]
mod tests {
    use super::*;

    /// The default filter must not exclude anything.
    #[test]
    fn test_default_filter_is_permissive() {
        // Destructure exhaustively so a newly added field has to be
        // considered here as well.
        let CorpusFilter {
            file_types,
            require_ground_truth,
            require_markdown_ground_truth,
            max_file_size,
            name_patterns,
        } = CorpusFilter::default();

        assert!(file_types.is_none());
        assert!(!require_ground_truth);
        assert!(!require_markdown_ground_truth);
        assert!(max_file_size.is_none());
        assert!(name_patterns.is_empty());
    }
}

View File

@@ -0,0 +1,228 @@
//! Per-document diagnostic output for poor-scoring documents.
//!
//! When a document scores below the diagnostic threshold, this module generates
//! detailed diagnostics showing unmatched blocks, missing/extra tokens, cross-type
//! matches, and noise issues. Results are written to `/tmp/kreuzberg_diagnose/`.
use crate::noise_detection::DiagnosticReport;
use serde::Serialize;
/// Full diagnostic report for a single document with poor scores.
///
/// Built by `diagnose_document` and serialized to `diagnostic.json` by
/// `write_diagnostic_files`.
#[derive(Debug, Serialize)]
pub struct DocumentDiagnostic {
/// Name of the document being diagnosed.
pub doc_name: String,
/// File type (e.g., "pdf", "docx").
pub file_type: String,
/// Pipeline that produced the extraction.
pub pipeline: String,
/// Structural F1 score.
///
/// Set to 0.0 when no markdown ground truth was supplied, because the
/// structural comparison is skipped in that case.
pub sf1: f64,
/// Token F1 score.
///
/// Computed from the tokenized extracted content and the text ground truth.
pub tf1: f64,
/// GT blocks that had no match in the extracted output.
///
/// Empty when no markdown ground truth was supplied.
pub unmatched_gt_blocks: Vec<BlockPreview>,
/// Extracted blocks that had no match in the ground truth.
///
/// Empty when no markdown ground truth was supplied.
pub unmatched_extracted_blocks: Vec<BlockPreview>,
/// Blocks that matched across different types (e.g., heading matched as paragraph).
///
/// Empty when no markdown ground truth was supplied.
pub cross_type_matches: Vec<CrossTypeMatch>,
/// Top tokens present in GT but missing in extraction (recall misses).
///
/// Capped at 30 entries.
/// NOTE(review): the `usize` is presumably an occurrence count — confirm against `compute_token_diff`.
pub top_missing_tokens: Vec<(String, usize)>,
/// Top tokens present in extraction but absent from GT (precision misses).
///
/// Capped at 30 entries.
pub top_extra_tokens: Vec<(String, usize)>,
/// Noise detection results for the extracted content.
pub noise: DiagnosticReport,
}
/// A preview of a single markdown block for diagnostic output.
#[derive(Debug, Serialize)]
pub struct BlockPreview {
/// Block type name (e.g., "H1", "Paragraph", "Table").
///
/// Produced by calling `to_string()` on the parsed block's type.
pub block_type: String,
/// First 120 characters of the block content.
///
/// `truncate` appends "..." when the content had to be shortened.
pub content_preview: String,
/// Block index in the parsed sequence.
pub index: usize,
}
/// A match between blocks of different types.
///
/// Only the two type names and the two scores are kept; the block contents
/// themselves are not stored.
#[derive(Debug, Serialize)]
pub struct CrossTypeMatch {
/// Ground truth block type.
pub gt_type: String,
/// Extracted block type.
pub extracted_type: String,
/// Token-level content similarity (0.0-1.0).
pub content_similarity: f64,
/// Type compatibility score (0.0-1.0).
pub type_compatibility: f64,
}
/// Truncate a string to `max_len` characters, appending "..." if truncated.
///
/// The limit is counted in Unicode scalar values (`char`s), not bytes, so a
/// string of multi-byte characters that fits within `max_len` characters is
/// returned unchanged.
///
/// # Arguments
///
/// * `s` - Text to shorten
/// * `max_len` - Maximum number of characters to keep before the ellipsis
///
/// # Returns
///
/// `s` itself when it has at most `max_len` characters, otherwise its first
/// `max_len` characters followed by "...".
fn truncate(s: &str, max_len: usize) -> String {
    // The byte offset of the character at position `max_len`, if it exists,
    // is exactly where the kept prefix ends. It is always a char boundary.
    match s.char_indices().nth(max_len) {
        Some((cut, _)) => format!("{}...", &s[..cut]),
        None => s.to_string(),
    }
}
/// Produce the diagnostic report for one poorly scoring document.
///
/// Three independent analyses are combined:
/// 1. Structural comparison against the markdown ground truth (only when
///    `gt_markdown` is `Some`), yielding unmatched blocks, cross-type
///    matches, and the structural F1. Without markdown ground truth, these
///    lists are empty and the structural F1 is reported as 0.0.
/// 2. Token comparison of `extracted_content` against `gt_text`, yielding the
///    token F1 and up to 30 missing and 30 extra tokens.
/// 3. Noise detection on `extracted_content`.
pub fn diagnose_document(
    doc_name: &str,
    file_type: &str,
    pipeline_name: &str,
    extracted_content: &str,
    gt_text: &str,
    gt_markdown: Option<&str>,
) -> DocumentDiagnostic {
    // 1. Structural diagnostics.
    let (unmatched_gt_blocks, unmatched_extracted_blocks, cross_type_matches, sf1) = match gt_markdown {
        Some(reference_md) => {
            let (quality, structure) =
                crate::markdown_quality::score_structural_quality_diagnostic(extracted_content, reference_md);

            let mut gt_previews: Vec<BlockPreview> = Vec::new();
            for (position, block) in structure.unmatched_gt.iter() {
                gt_previews.push(BlockPreview {
                    block_type: block.block_type.to_string(),
                    content_preview: truncate(&block.content, 120),
                    index: *position,
                });
            }

            let mut extracted_previews: Vec<BlockPreview> = Vec::new();
            for (position, block) in structure.unmatched_extracted.iter() {
                extracted_previews.push(BlockPreview {
                    block_type: block.block_type.to_string(),
                    content_preview: truncate(&block.content, 120),
                    index: *position,
                });
            }

            let mut type_mismatches: Vec<CrossTypeMatch> = Vec::new();
            for (gt_block, extracted_block, similarity, compatibility) in structure.cross_type_matches.iter() {
                type_mismatches.push(CrossTypeMatch {
                    gt_type: gt_block.block_type.to_string(),
                    extracted_type: extracted_block.block_type.to_string(),
                    content_similarity: *similarity,
                    type_compatibility: *compatibility,
                });
            }

            (gt_previews, extracted_previews, type_mismatches, quality.structural_f1)
        }
        None => (Vec::new(), Vec::new(), Vec::new(), 0.0),
    };

    // 2. Token-level diff, keeping only the first 30 entries of each list.
    let extracted_tokens = crate::quality::tokenize(extracted_content);
    let reference_tokens = crate::quality::tokenize(gt_text);
    let tf1 = crate::quality::compute_f1(&extracted_tokens, &reference_tokens);
    let (mut top_missing_tokens, mut top_extra_tokens) =
        crate::quality::compute_token_diff(&extracted_tokens, &reference_tokens);
    top_missing_tokens.truncate(30);
    top_extra_tokens.truncate(30);

    // 3. Noise detection.
    let noise = crate::noise_detection::detect_noise(extracted_content);

    DocumentDiagnostic {
        doc_name: doc_name.to_string(),
        file_type: file_type.to_string(),
        pipeline: pipeline_name.to_string(),
        sf1,
        tf1,
        unmatched_gt_blocks,
        unmatched_extracted_blocks,
        cross_type_matches,
        top_missing_tokens,
        top_extra_tokens,
        noise,
    }
}
/// Write diagnostic files to `/tmp/kreuzberg_diagnose/{doc_name}/`.
///
/// Creates the directory and writes:
/// - `gt.md` — ground truth markdown (if available)
/// - `extracted.md` — extracted output
/// - `diagnostic.json` — serialized `DocumentDiagnostic`
pub fn write_diagnostic_files(
diag: &DocumentDiagnostic,
gt_markdown: Option<&str>,
extracted_content: &str,
) -> std::io::Result<()> {
let dir = std::path::PathBuf::from("/tmp/kreuzberg_diagnose").join(format!("{}_{}", diag.doc_name, diag.file_type));
std::fs::create_dir_all(&dir)?;
if let Some(md) = gt_markdown {
std::fs::write(dir.join("gt.md"), md)?;
}
std::fs::write(dir.join("extracted.md"), extracted_content)?;
let json = serde_json::to_string_pretty(diag).map_err(std::io::Error::other)?;
std::fs::write(dir.join("diagnostic.json"), json)?;
Ok(())
}
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_truncate_short() {
        let untouched = truncate("hello", 120);
        assert_eq!(untouched, "hello");
    }

    #[test]
    fn test_truncate_long() {
        let shortened = truncate(&"a".repeat(200), 120);
        assert!(shortened.ends_with("..."));
        // 120 kept characters plus the three-character ellipsis.
        assert_eq!(shortened.len(), 123);
    }

    #[test]
    fn test_diagnose_document_no_markdown_gt() {
        let report = diagnose_document("test_doc", "pdf", "baseline", "hello world", "hello world", None);
        assert_eq!(report.doc_name, "test_doc");
        assert_eq!(report.file_type, "pdf");
        // Without markdown ground truth the structural section stays empty.
        assert!(report.unmatched_gt_blocks.is_empty());
        assert!(report.unmatched_extracted_blocks.is_empty());
        assert!(report.cross_type_matches.is_empty());
    }

    #[test]
    fn test_diagnose_document_with_markdown_gt() {
        let gt_md = "# Title\n\nSome content here.\n\n## Missing Section\n\nMore text.";
        let report = diagnose_document(
            "test_doc",
            "pdf",
            "layout",
            "# Title\n\nSome content here.",
            "Title Some content here.",
            Some(gt_md),
        );
        assert_eq!(report.pipeline, "layout");
        // The missing section must surface as an unmatched GT block or as missing tokens.
        let nothing_reported = report.unmatched_gt_blocks.is_empty() && report.top_missing_tokens.is_empty();
        assert!(!nothing_reported);
    }

    #[test]
    fn test_write_diagnostic_files() {
        let report = diagnose_document("write_test", "pdf", "baseline", "extracted text", "ground truth", None);
        let outcome = write_diagnostic_files(&report, Some("# GT"), "extracted text");
        assert!(outcome.is_ok());

        let out_dir = std::path::Path::new("/tmp/kreuzberg_diagnose/write_test_pdf");
        for file_name in ["gt.md", "extracted.md", "diagnostic.json"] {
            assert!(out_dir.join(file_name).exists());
        }

        // Best-effort cleanup; a failure here is not a test failure.
        let _ = std::fs::remove_dir_all(out_dir);
    }
}

View File

@@ -0,0 +1,407 @@
//! Embedding benchmark: throughput, latency, and batch-size sweep across presets.
//!
//! Measures embedding generation performance for each preset (fast, balanced,
//! quality, multilingual) including:
//! - Model warm-up latency (first-call overhead: download + ONNX init)
//! - Steady-state throughput: chunks/sec at default batch size
//! - Batch size sweep: throughput at batch sizes 8, 16, 32, 64, 128
//!
//! Requires ONNX Runtime on the system. See `kreuzberg::embeddings` for installation
//! instructions.
use std::time::Instant;
use rayon::prelude::*;
use kreuzberg::embeddings::{EMBEDDING_PRESETS, EmbeddingPreset};
use kreuzberg::{Chunk, ChunkMetadata, EmbeddingConfig, EmbeddingModelType};
/// Fill in `chunk.embedding` for every chunk via the public `embed_texts` API.
///
/// The chunk texts are cloned into one list, embedded in a single
/// `embed_texts` call, and the resulting vectors are written back in order.
/// An empty slice returns immediately without calling the embedding backend.
///
/// # Errors
///
/// Propagates any error returned by `kreuzberg::embed_texts`; in that case no
/// chunk is modified.
fn embed_chunks(chunks: &mut [Chunk], config: &EmbeddingConfig) -> kreuzberg::Result<()> {
    if chunks.is_empty() {
        return Ok(());
    }

    let mut texts: Vec<String> = Vec::with_capacity(chunks.len());
    for chunk in chunks.iter() {
        texts.push(chunk.content.clone());
    }

    let vectors = kreuzberg::embed_texts(texts, config)?;
    chunks
        .iter_mut()
        .zip(vectors)
        .for_each(|(chunk, vector)| chunk.embedding = Some(vector));
    Ok(())
}
/// Number of chunks to embed for throughput measurement.
///
/// `run_embed_benchmark` generates this many chunks for each preset's
/// throughput pass and divides by it to derive the per-chunk figures.
const THROUGHPUT_CHUNK_COUNT: usize = 100;
/// Number of words per chunk used in throughput measurement.
///
/// Also used for the single warm-up chunk in `run_embed_benchmark`.
const WORDS_PER_CHUNK: usize = 200;
/// Batch sizes to sweep.
///
/// NOTE(review): presumably each value becomes `EmbeddingConfig::batch_size`
/// for one sweep run — confirm against the sweep loop.
const BATCH_SIZES: &[usize] = &[8, 16, 32, 64, 128];
/// Per-preset benchmark results.
///
/// One entry is recorded for every preset whose warm-up and throughput passes
/// both succeed; presets that fail either pass are skipped.
#[derive(Debug)]
pub struct PresetResult {
/// Preset name, copied from the embedding preset.
pub name: String,
/// Embedding dimensions reported by the preset.
pub dimensions: usize,
/// Model warm-up time in milliseconds (first call: download check + ONNX init).
///
/// Measured around embedding a single chunk with batch size 1.
pub warm_ms: f64,
/// Total time to embed `THROUGHPUT_CHUNK_COUNT` chunks at default batch size (ms).
pub total_ms: f64,
/// Chunks per second at default batch size.
///
/// Computed as `THROUGHPUT_CHUNK_COUNT / (total_ms / 1000)`.
pub chunks_per_sec: f64,
/// Milliseconds per chunk at default batch size.
///
/// Computed as `total_ms / THROUGHPUT_CHUNK_COUNT`.
pub ms_per_chunk: f64,
}
/// Per-batch-size result for the sweep (run on the "balanced" preset).
#[derive(Debug)]
pub struct BatchSweepResult {
/// Batch size used for this sweep run.
pub batch_size: usize,
/// Total time to embed `THROUGHPUT_CHUNK_COUNT` chunks (ms).
pub total_ms: f64,
/// Chunks per second at this batch size.
/// NOTE(review): presumably derived from `total_ms` the same way as `PresetResult::chunks_per_sec` — confirm.
pub chunks_per_sec: f64,
/// Milliseconds per chunk at this batch size.
pub ms_per_chunk: f64,
}
/// Parallel inference benchmark result.
///
/// Compares a sequential baseline with a rayon-parallel run over the same workload.
#[derive(Debug)]
pub struct ParallelResult {
/// Number of batches in the workload.
pub num_batches: usize,
/// Number of chunks in each batch.
pub chunks_per_batch: usize,
/// Total number of chunks processed.
/// NOTE(review): presumably `num_batches * chunks_per_batch` — confirm where this is populated.
pub total_chunks: usize,
/// Sequential baseline time in milliseconds.
pub sequential_ms: f64,
/// Sequential throughput in chunks per second.
pub sequential_chunks_per_sec: f64,
/// Parallel (rayon) time in milliseconds.
pub parallel_ms: f64,
/// Parallel throughput in chunks per second.
pub parallel_chunks_per_sec: f64,
/// Speedup factor (sequential_ms / parallel_ms).
pub speedup: f64,
}
/// Full embed benchmark output.
///
/// Returned by `run_embed_benchmark`.
#[derive(Debug)]
pub struct EmbedBenchmarkResults {
/// One entry per preset that completed both warm-up and throughput passes.
pub presets: Vec<PresetResult>,
/// Results of the batch size sweep, one entry per batch size.
pub batch_sweep: Vec<BatchSweepResult>,
/// Sequential-vs-parallel comparison.
/// NOTE(review): presumably `None` when the parallel benchmark is skipped or fails — confirm.
pub parallel: Option<ParallelResult>,
}
/// Build `count` synthetic chunks of `words_per_chunk` words each.
///
/// Words are drawn from a fixed vocabulary with a deterministic index
/// formula, so repeated calls with the same arguments yield identical text.
/// Chunks differ from each other because the starting offset depends on the
/// chunk index. Every chunk has no embedding, `byte_start` 0, `byte_end`
/// equal to its text length in bytes, and no page, heading, or image metadata.
fn generate_test_chunks(count: usize, words_per_chunk: usize) -> Vec<Chunk> {
    // Fixed vocabulary that the index formula below cycles through.
    const WORDS: &[&str] = &[
        "the", "quick", "brown", "fox", "jumps", "over", "lazy", "dog",
        "in", "a", "field", "of", "green", "grass", "under", "blue",
        "sky", "with", "white", "clouds", "floating", "gently", "by", "as",
        "birds", "sing", "their", "songs", "and", "children", "play", "happily",
        "near", "river", "bank", "where", "water", "flows", "crystal", "clear",
        "through", "ancient", "stones", "document", "extraction", "embedding", "vector", "semantic",
        "search", "retrieval", "augmented", "generation", "neural", "network", "transformer", "attention",
        "mechanism", "tokenizer", "inference", "batch", "processing",
    ];

    let mut chunks = Vec::with_capacity(count);
    for chunk_index in 0..count {
        // The chunk index shifts the starting word; the word index steps through the list.
        let mut words = Vec::with_capacity(words_per_chunk);
        for word_index in 0..words_per_chunk {
            words.push(WORDS[(chunk_index * 7 + word_index * 3) % WORDS.len()]);
        }
        let content = words.join(" ");

        let metadata = ChunkMetadata {
            byte_start: 0,
            byte_end: content.len(),
            token_count: None,
            chunk_index,
            total_chunks: count,
            first_page: None,
            last_page: None,
            heading_context: None,
            image_indices: Vec::new(),
        };
        chunks.push(Chunk {
            content,
            embedding: None,
            chunk_type: Default::default(),
            metadata,
        });
    }
    chunks
}
/// Create the `EmbeddingConfig` used to benchmark `preset` with `batch_size`.
///
/// The model is selected by preset name. Normalization is on, download
/// progress output is off, and cache directory, acceleration, and embedding
/// duration limit are all left unset.
fn config_for_preset(preset: &EmbeddingPreset, batch_size: usize) -> EmbeddingConfig {
    let model = EmbeddingModelType::Preset {
        name: preset.name.to_string(),
    };
    EmbeddingConfig {
        model,
        batch_size,
        normalize: true,
        show_download_progress: false,
        cache_dir: None,
        acceleration: None,
        max_embed_duration_secs: None,
    }
}
/// Run the full embedding benchmark.
///
/// Prints a formatted table to stdout and returns structured results.
pub fn run_embed_benchmark() -> EmbedBenchmarkResults {
println!("\n=== Embedding Benchmark ===\n");
println!(
"Generating {} test chunks (~{} words each)...",
THROUGHPUT_CHUNK_COUNT, WORDS_PER_CHUNK
);
// --- Per-preset throughput ---
let mut preset_results: Vec<PresetResult> = Vec::new();
for preset in EMBEDDING_PRESETS.iter() {
println!(
"\n[{}] {} dims — {}",
preset.name, preset.dimensions, preset.description
);
// Step 1: Warm-up (first call initializes ONNX session; may download model).
let mut warmup_chunks = generate_test_chunks(1, WORDS_PER_CHUNK);
let warmup_config = config_for_preset(preset, 1);
print!(" Warming up model...");
let warm_start = Instant::now();
match embed_chunks(&mut warmup_chunks, &warmup_config) {
Ok(()) => {}
Err(e) => {
println!(" SKIP ({})", e);
continue;
}
}
let warm_ms = warm_start.elapsed().as_secs_f64() * 1000.0;
println!(" {:.0} ms", warm_ms);
// Step 2: Throughput measurement at default batch size (32).
let mut chunks = generate_test_chunks(THROUGHPUT_CHUNK_COUNT, WORDS_PER_CHUNK);
let throughput_config = config_for_preset(preset, 32);
print!(" Throughput ({} chunks, batch=32)...", THROUGHPUT_CHUNK_COUNT);
let t_start = Instant::now();
match embed_chunks(&mut chunks, &throughput_config) {
Ok(()) => {}
Err(e) => {
println!(" ERROR: {}", e);
continue;
}
}
let total_ms = t_start.elapsed().as_secs_f64() * 1000.0;
let chunks_per_sec = THROUGHPUT_CHUNK_COUNT as f64 / (total_ms / 1000.0);
let ms_per_chunk = total_ms / THROUGHPUT_CHUNK_COUNT as f64;
println!(
" {:.1} ms total → {:.1} chunks/sec, {:.2} ms/chunk",
total_ms, chunks_per_sec, ms_per_chunk
);
preset_results.push(PresetResult {
name: preset.name.clone(),
dimensions: preset.dimensions,
warm_ms,
total_ms,
chunks_per_sec,
ms_per_chunk,
});
}
// --- Batch size sweep on "balanced" preset ---
println!(
"\n--- Batch size sweep (balanced preset, {} chunks) ---\n",
THROUGHPUT_CHUNK_COUNT
);
let balanced = match EMBEDDING_PRESETS.iter().find(|p| p.name == "balanced") {
Some(p) => p,
None => {
eprintln!("WARNING: 'balanced' preset not found; skipping batch sweep.");
return EmbedBenchmarkResults {
presets: preset_results,
batch_sweep: Vec::new(),
parallel: None,
};
}
};
let mut sweep_results: Vec<BatchSweepResult> = Vec::new();
println!(
"{:>12} {:>12} {:>14} {:>12}",
"batch_size", "total_ms", "chunks/sec", "ms/chunk"
);
println!("{}", "-".repeat(55));
for &batch_size in BATCH_SIZES {
let mut chunks = generate_test_chunks(THROUGHPUT_CHUNK_COUNT, WORDS_PER_CHUNK);
let config = config_for_preset(balanced, batch_size);
let t_start = Instant::now();
match embed_chunks(&mut chunks, &config) {
Ok(()) => {}
Err(e) => {
println!("{:>12} ERROR: {}", batch_size, e);
continue;
}
}
let total_ms = t_start.elapsed().as_secs_f64() * 1000.0;
let chunks_per_sec = THROUGHPUT_CHUNK_COUNT as f64 / (total_ms / 1000.0);
let ms_per_chunk = total_ms / THROUGHPUT_CHUNK_COUNT as f64;
println!(
"{:>12} {:>12.1} {:>14.1} {:>12.2}",
batch_size, total_ms, chunks_per_sec, ms_per_chunk
);
sweep_results.push(BatchSweepResult {
batch_size,
total_ms,
chunks_per_sec,
ms_per_chunk,
});
}
// --- Parallel inference test ---
println!("\n--- Parallel inference test (balanced preset) ---\n");
let parallel_batches: usize = 8;
let chunks_per_batch: usize = 50;
// Generate independent batches (one per simulated "document").
let mut batches: Vec<Vec<Chunk>> = (0..parallel_batches)
.map(|_| generate_test_chunks(chunks_per_batch, WORDS_PER_CHUNK))
.collect();
let parallel_config = config_for_preset(balanced, 32);
// Sequential baseline: process each batch one after another.
let mut seq_batches = batches.clone();
let seq_start = Instant::now();
for batch in &mut seq_batches {
embed_chunks(batch, &parallel_config).expect("Sequential embedding failed");
}
let seq_ms = seq_start.elapsed().as_secs_f64() * 1000.0;
// Parallel via rayon: each thread calls engine.embed(&self) concurrently.
// This works because EmbeddingEngine uses thread-local ONNX sessions
// behind Arc<EmbeddingEngine>, so concurrent reads are safe.
let par_start = Instant::now();
batches.par_iter_mut().for_each(|batch| {
embed_chunks(batch, &parallel_config).expect("Parallel embedding failed");
});
let par_ms = par_start.elapsed().as_secs_f64() * 1000.0;
let total_chunks = parallel_batches * chunks_per_batch;
let speedup = seq_ms / par_ms;
let seq_chunks_per_sec = total_chunks as f64 / (seq_ms / 1000.0);
let par_chunks_per_sec = total_chunks as f64 / (par_ms / 1000.0);
println!(
"{} batches x {} chunks = {} total chunks",
parallel_batches, chunks_per_batch, total_chunks
);
println!(" Sequential: {:.0} ms ({:.1} chunks/sec)", seq_ms, seq_chunks_per_sec);
println!(" Parallel: {:.0} ms ({:.1} chunks/sec)", par_ms, par_chunks_per_sec);
println!(" Speedup: {:.2}x", speedup);
let parallel_result = Some(ParallelResult {
num_batches: parallel_batches,
chunks_per_batch,
total_chunks,
sequential_ms: seq_ms,
sequential_chunks_per_sec: seq_chunks_per_sec,
parallel_ms: par_ms,
parallel_chunks_per_sec: par_chunks_per_sec,
speedup,
});
// --- Summary table ---
if !preset_results.is_empty() {
println!("\n=== Summary ===\n");
println!(
"{:<14} {:>6} {:>10} {:>12} {:>12}",
"preset", "dims", "warm_ms", "chunks/sec", "ms/chunk"
);
println!("{}", "-".repeat(60));
for r in &preset_results {
println!(
"{:<14} {:>6} {:>10.0} {:>12.1} {:>12.2}",
r.name, r.dimensions, r.warm_ms, r.chunks_per_sec, r.ms_per_chunk
);
}
}
EmbedBenchmarkResults {
presets: preset_results,
batch_sweep: sweep_results,
parallel: parallel_result,
}
}

View File

@@ -0,0 +1,64 @@
//! Error types for the benchmark harness
use std::path::PathBuf;
use thiserror::Error;
/// Result type alias for benchmark harness operations.
///
/// Shorthand for `std::result::Result<T, Error>`, where [`Error`] is this
/// module's error enum.
pub type Result<T> = std::result::Result<T, Error>;
/// Errors that can occur during benchmark operations.
///
/// The `Display` text of each variant is defined by its `#[error(...)]`
/// attribute. `Io` and `Json` carry `#[from]`, so the `?` operator converts
/// `std::io::Error` and `serde_json::Error` into this type automatically.
#[derive(Error, Debug)]
pub enum Error {
/// I/O error occurred (wraps the underlying `std::io::Error`).
#[error("I/O error: {0}")]
Io(#[from] std::io::Error),
/// JSON serialization/deserialization error (wraps `serde_json::Error`).
#[error("JSON error: {0}")]
Json(#[from] serde_json::Error),
/// Fixture validation error: `path` is the fixture file, `reason` explains
/// which validation rule was violated.
#[error("Invalid fixture at {path}: {reason}")]
InvalidFixture { path: PathBuf, reason: String },
/// Fixture file (or fixture directory) not found at the given path.
#[error("Fixture file not found: {0}")]
FixtureNotFound(PathBuf),
/// Test document not found at the given path.
#[error("Test document not found: {0}")]
DocumentNotFound(PathBuf),
/// Framework extraction error: names the framework, the file it was
/// processing, and a free-form failure message.
#[error("Framework '{framework}' failed on {file}: {message}")]
ExtractionFailed {
framework: String,
file: PathBuf,
message: String,
},
/// Configuration error
#[error("Configuration error: {0}")]
Config(String),
/// Benchmark execution error
#[error("Benchmark error: {0}")]
Benchmark(String),
/// Framework-reported extraction error (the framework returned {"error": "..."})
/// This is distinct from Benchmark - the framework ran but couldn't extract.
/// The message is displayed verbatim, without any prefix.
#[error("{0}")]
FrameworkError(String),
/// Framework returned empty or missing content — ran successfully but produced nothing.
#[error("Empty content: {0}")]
EmptyContent(String),
/// Timeout error
#[error("Timeout: {0}")]
Timeout(String),
/// Profiling error
#[error("Profiling error: {0}")]
Profiling(String),
}

View File

@@ -0,0 +1,855 @@
//! Fixture loading and management
//!
//! Fixtures are JSON files that describe test documents and their metadata.
//!
//! ## Fixture Format
//!
//! ```json
//! {
//! "document": "path/to/document.pdf",
//! "file_type": "pdf",
//! "file_size": 1024000,
//! "expected_frameworks": ["kreuzberg", "docling"],
//! // Note: frameworks can be Kreuzberg language bindings or open source extraction alternatives
//! "metadata": {
//! "title": "Test Document",
//! "pages": 10,
//! "requires_ocr": false // Optional: override OCR requirement detection
//! },
//! "ground_truth": {
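//! "text_file": "path/to/ground_truth.txt",
//! "markdown_file": "path/to/ground_truth.md", // Optional: markdown ground truth for structural scoring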
//! "source": "pdf_text_layer"
//! }
//! }
//! ```
use crate::{Error, Result};
use serde::{Deserialize, Serialize};
use std::collections::{HashMap, HashSet};
use std::path::{Path, PathBuf};
/// A fixture describing a test document.
///
/// Deserialized from a fixture JSON file. `document` and `file_type` are
/// checked by `Fixture::validate` when loading through `Fixture::from_file`;
/// fields marked `#[serde(default)]` may be omitted from the JSON.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Fixture {
/// Path to the test document (relative to fixture file).
/// Absolute paths are rejected during validation.
pub document: PathBuf,
/// File type (extension without dot, e.g., "pdf").
/// Must not be empty; also used by `requires_ocr` to detect image formats.
pub file_type: String,
/// File size in bytes
pub file_size: u64,
/// Extraction frameworks that should be able to process this file
/// (can be Kreuzberg language bindings or open source extraction alternatives).
/// Defaults to an empty list when absent.
#[serde(default)]
pub expected_frameworks: Vec<String>,
/// Additional metadata about the document.
/// A boolean `requires_ocr` entry, when present, overrides the file-type
/// based OCR detection in `requires_ocr`.
#[serde(default)]
pub metadata: HashMap<String, serde_json::Value>,
/// Ground truth for quality assessment (optional)
#[serde(default)]
pub ground_truth: Option<GroundTruth>,
}
/// Ground truth data for quality assessment.
///
/// Both file paths are interpreted relative to the directory containing the
/// fixture JSON file, and both are omitted from serialized output when `None`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct GroundTruth {
/// Path to ground truth text file (optional — some fixtures only have markdown GT)
#[serde(default, skip_serializing_if = "Option::is_none")]
pub text_file: Option<PathBuf>,
/// Path to ground truth markdown file for structural quality scoring (optional)
#[serde(default, skip_serializing_if = "Option::is_none")]
pub markdown_file: Option<PathBuf>,
/// Source of the ground truth, e.g. "pdf_text_layer", "markdown_file" or
/// "manual". The complete set of accepted values is the allow-list checked
/// in `Fixture::validate`; any other value makes the fixture invalid.
pub source: String,
}
impl Fixture {
/// Load a fixture from a JSON file
pub fn from_file(path: impl AsRef<Path>) -> Result<Self> {
let path = path.as_ref();
let contents = std::fs::read_to_string(path).map_err(Error::Io)?;
let fixture: Fixture = serde_json::from_str(&contents)?;
fixture.validate(path)?;
Ok(fixture)
}
/// Validate the fixture
///
/// Performs comprehensive validation including:
/// - Path validation (relative paths only)
/// - File type validation (non-empty)
/// - Ground truth validation:
/// - Relative path requirement
/// - Valid source type
/// - File existence check (relative to fixture directory)
fn validate(&self, fixture_path: &Path) -> Result<()> {
if self.document.is_absolute() {
return Err(Error::InvalidFixture {
path: fixture_path.to_path_buf(),
reason: "document path must be relative".to_string(),
});
}
if self.file_type.is_empty() {
return Err(Error::InvalidFixture {
path: fixture_path.to_path_buf(),
reason: "file_type cannot be empty".to_string(),
});
}
if let Some(gt) = &self.ground_truth {
if let Some(ref tf) = gt.text_file
&& tf.is_absolute()
{
return Err(Error::InvalidFixture {
path: fixture_path.to_path_buf(),
reason: "ground_truth.text_file must be relative".to_string(),
});
}
if !matches!(
gt.source.as_str(),
"pdf_text_layer"
| "markdown_file"
| "manual"
| "vision"
| "python-docx"
| "python-pptx"
| "openpyxl"
| "codex-vision"
| "raw_source"
| "pandoc"
| "python_email"
| "extract_msg"
| "nbformat"
| "xml_parse"
| "beautifulsoup"
| "xlrd"
| "antiword"
| "libreoffice"
| "odfpy"
| "ebooklib"
| "striprtf"
| "pyxlsb"
| "olefile"
| "omnidocbench"
| "mistral-pixtral"
) {
return Err(Error::InvalidFixture {
path: fixture_path.to_path_buf(),
reason: format!("invalid ground_truth.source: {}", gt.source),
});
}
// Validate that ground truth file exists at load time
// Use fixture directory as the base for relative paths
if let (Some(fixture_dir), Some(tf)) = (fixture_path.parent(), &gt.text_file) {
let ground_truth_path = fixture_dir.join(tf);
if !ground_truth_path.exists() {
return Err(Error::InvalidFixture {
path: fixture_path.to_path_buf(),
reason: format!(
"ground truth file not found: {} (resolved to {})",
tf.display(),
ground_truth_path.display()
),
});
}
// Validate markdown ground truth file if specified
if let Some(ref md_file) = gt.markdown_file {
if md_file.is_absolute() {
return Err(Error::InvalidFixture {
path: fixture_path.to_path_buf(),
reason: "ground_truth.markdown_file must be relative".to_string(),
});
}
let md_path = fixture_dir.join(md_file);
if !md_path.exists() {
return Err(Error::InvalidFixture {
path: fixture_path.to_path_buf(),
reason: format!(
"ground truth markdown file not found: {} (resolved to {})",
md_file.display(),
md_path.display()
),
});
}
}
}
}
Ok(())
}
/// Resolve document path relative to fixture file
pub fn resolve_document_path(&self, fixture_dir: &Path) -> PathBuf {
fixture_dir.join(&self.document)
}
/// Resolve ground truth path relative to fixture file
pub fn resolve_ground_truth_path(&self, fixture_dir: &Path) -> Option<PathBuf> {
self.ground_truth
.as_ref()
.and_then(|gt| gt.text_file.as_ref().map(|tf| fixture_dir.join(tf)))
}
/// Resolve ground truth markdown path relative to fixture file
pub fn resolve_ground_truth_markdown_path(&self, fixture_dir: &Path) -> Option<PathBuf> {
self.ground_truth
.as_ref()
.and_then(|gt| gt.markdown_file.as_ref().map(|mf| fixture_dir.join(mf)))
}
/// Determine if this fixture requires OCR based on file type and metadata
pub fn requires_ocr(&self) -> bool {
// Check if explicitly marked in metadata
if let Some(requires_ocr) = self.metadata.get("requires_ocr").and_then(|v| v.as_bool()) {
return requires_ocr;
}
// Infer from file type - images always need OCR
matches!(
self.file_type.to_lowercase().as_str(),
"jpg" | "jpeg" | "png" | "gif" | "bmp" | "tiff" | "tif" | "webp" | "jp2" | "jpx" | "jpm" | "mj2"
)
}
}
/// Manages loading and accessing fixtures.
///
/// Each entry pairs the path of the fixture JSON file with the fixture parsed
/// from it, in the order the fixtures were loaded.
pub struct FixtureManager {
fixtures: Vec<(PathBuf, Fixture)>,
}
impl FixtureManager {
    /// Create a new empty fixture manager
    pub fn new() -> Self {
        Self { fixtures: Vec::new() }
    }

    /// Load a single fixture file and append it to the manager.
    ///
    /// # Errors
    /// Returns `Error::FixtureNotFound` if `path` does not exist, or whatever
    /// `Fixture::from_file` reports for unreadable, malformed or invalid files.
    pub fn load_fixture(&mut self, path: impl AsRef<Path>) -> Result<()> {
        let path = path.as_ref();
        if !path.exists() {
            return Err(Error::FixtureNotFound(path.to_path_buf()));
        }
        let fixture = Fixture::from_file(path)?;
        self.fixtures.push((path.to_path_buf(), fixture));
        Ok(())
    }

    /// Parse profiling fixtures from environment variable
    ///
    /// Reads the `PROFILING_FIXTURES` environment variable (comma-separated fixture names).
    /// Names are trimmed and empty entries dropped. Returns `None` when the
    /// variable is unset, unreadable, or contains no names.
    ///
    /// # Examples
    ///
    /// ```text
    /// PROFILING_FIXTURES="pdf_small,pdf_medium,docx_simple" -> {pdf_small, pdf_medium, docx_simple}
    /// ```
    fn get_profiling_fixtures() -> Option<HashSet<String>> {
        let raw = std::env::var("PROFILING_FIXTURES").ok()?;
        let names: HashSet<String> = raw
            .split(',')
            .map(|s| s.trim().to_string())
            .filter(|s| !s.is_empty())
            .collect();
        (!names.is_empty()).then_some(names)
    }

    /// Load all fixtures from a directory (recursively)
    ///
    /// If the `PROFILING_FIXTURES` environment variable is set, only fixtures matching
    /// the specified names (comma-separated) will be loaded. Otherwise, all fixtures are loaded.
    ///
    /// Individual fixtures that fail to load are reported on stderr and
    /// skipped; they do not make this call fail.
    ///
    /// # Errors
    /// Returns `Error::FixtureNotFound` if `dir` does not exist and `Error::Io`
    /// if a directory cannot be read.
    pub fn load_fixtures_from_dir(&mut self, dir: impl AsRef<Path>) -> Result<()> {
        self.load_fixtures_from_dir_internal(dir, true)
    }

    /// Recursively append the path of every `*.json` file under `dir` to `out`.
    ///
    /// Only paths are collected here; nothing is parsed, so each fixture is
    /// loaded exactly once by the caller.
    fn collect_fixture_paths(dir: &Path, out: &mut Vec<PathBuf>) -> Result<()> {
        if !dir.exists() {
            return Err(Error::FixtureNotFound(dir.to_path_buf()));
        }
        for entry in std::fs::read_dir(dir)? {
            let path = entry?.path();
            if path.is_dir() {
                Self::collect_fixture_paths(&path, out)?;
            } else if path.extension().and_then(|s| s.to_str()) == Some("json") {
                out.push(path);
            }
        }
        Ok(())
    }

    /// Load every path in `paths`, recording failures in `failed` instead of aborting.
    fn load_all(&mut self, paths: &[PathBuf], failed: &mut Vec<(PathBuf, String)>) {
        for fixture_path in paths {
            if let Err(e) = self.load_fixture(fixture_path) {
                failed.push((fixture_path.clone(), e.to_string()));
            }
        }
    }

    /// Internal method for loading fixtures from a directory (with filter control)
    ///
    /// With `apply_filter` set and a non-empty `PROFILING_FIXTURES`, only
    /// fixtures whose file stem is listed are loaded; if none of them loads,
    /// a warning is printed and all fixtures are loaded instead.
    fn load_fixtures_from_dir_internal(&mut self, dir: impl AsRef<Path>, apply_filter: bool) -> Result<()> {
        let mut all_fixtures: Vec<PathBuf> = Vec::new();
        Self::collect_fixture_paths(dir.as_ref(), &mut all_fixtures)?;

        let total_fixtures = all_fixtures.len();
        let mut failed_fixtures: Vec<(PathBuf, String)> = Vec::new();

        let profiling_set = if apply_filter {
            Self::get_profiling_fixtures()
        } else {
            None
        };

        match profiling_set {
            Some(profiling_set) => {
                let mut fixture_names: Vec<String> = Vec::new();
                for fixture_path in &all_fixtures {
                    let Some(stem) = fixture_path.file_stem().and_then(|s| s.to_str()) else {
                        continue;
                    };
                    if !profiling_set.contains(stem) {
                        continue;
                    }
                    match self.load_fixture(fixture_path) {
                        Ok(()) => fixture_names.push(stem.to_string()),
                        Err(e) => failed_fixtures.push((fixture_path.clone(), e.to_string())),
                    }
                }

                if fixture_names.is_empty() {
                    eprintln!(
                        "Warning: PROFILING_FIXTURES set but no matching fixtures found. \
                         Loading all {} fixtures.",
                        total_fixtures
                    );
                    self.load_all(&all_fixtures, &mut failed_fixtures);
                } else {
                    fixture_names.sort();
                    eprintln!(
                        "Profiling mode: Using {} of {} fixtures: {}",
                        fixture_names.len(),
                        total_fixtures,
                        fixture_names.join(", ")
                    );
                }
            }
            None => self.load_all(&all_fixtures, &mut failed_fixtures),
        }

        // Report failed fixtures if any occurred
        if !failed_fixtures.is_empty() {
            eprintln!(
                "Warning: {} of {} fixtures failed to load:",
                failed_fixtures.len(),
                total_fixtures
            );
            for (path, error) in failed_fixtures {
                eprintln!(" - {}: {}", path.display(), error);
            }
        }
        Ok(())
    }

    /// Get all loaded fixtures
    pub fn fixtures(&self) -> &[(PathBuf, Fixture)] {
        &self.fixtures
    }

    /// Get count of loaded fixtures
    pub fn len(&self) -> usize {
        self.fixtures.len()
    }

    /// Check if empty
    pub fn is_empty(&self) -> bool {
        self.fixtures.is_empty()
    }

    /// Filter fixtures by file type.
    ///
    /// Returns clones of every fixture whose `file_type` equals one of
    /// `file_types` (exact, case-sensitive comparison).
    pub fn filter_by_type(&self, file_types: &[String]) -> Vec<(PathBuf, Fixture)> {
        self.fixtures
            .iter()
            .filter(|(_, fixture)| file_types.contains(&fixture.file_type))
            .cloned()
            .collect()
    }

    /// Retain only the fixtures belonging to shard `index` of `total` shards.
    ///
    /// Fixtures are sorted by path for deterministic ordering, then assigned
    /// round-robin to shards. This ensures even distribution across shards
    /// regardless of file type or size ordering.
    ///
    /// `index` is 1-based (1..=total).
    ///
    /// # Panics
    /// Panics if `index` is outside `1..=total` (which includes `total == 0`).
    pub fn retain_shard(&mut self, index: usize, total: usize) {
        assert!((1..=total).contains(&index), "shard index must be 1..=total");
        // Sort by path for deterministic assignment across jobs
        self.fixtures.sort_by(|a, b| a.0.cmp(&b.0));
        let shard_index = index - 1; // convert to 0-based
        self.fixtures = std::mem::take(&mut self.fixtures)
            .into_iter()
            .enumerate()
            .filter(|(i, _)| i % total == shard_index)
            .map(|(_, f)| f)
            .collect();
    }
}
impl Default for FixtureManager {
    /// The default manager holds no fixtures, exactly like `FixtureManager::new()`.
    fn default() -> Self {
        Self { fixtures: Vec::new() }
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use std::sync::{Mutex, MutexGuard};
    use tempfile::TempDir;

    /// Serializes every test that depends on `PROFILING_FIXTURES`.
    static ENV_LOCK: Mutex<()> = Mutex::new(());

    const PROFILING_VAR: &str = "PROFILING_FIXTURES";

    /// Holds `ENV_LOCK` and pins `PROFILING_FIXTURES` to a known state.
    ///
    /// The variable is removed again on drop, including when the owning test
    /// panics, so one failing test cannot leak its setting into the others.
    struct ProfilingEnv {
        _guard: MutexGuard<'static, ()>,
    }

    impl ProfilingEnv {
        /// Acquire the lock, then set the variable (`Some`) or clear it (`None`).
        fn with(value: Option<&str>) -> Self {
            // A test that panicked while holding the lock poisons it; the `()`
            // payload cannot be left inconsistent, so recovering is fine.
            let guard = ENV_LOCK.lock().unwrap_or_else(|poisoned| poisoned.into_inner());
            // SAFETY: every mutation of this variable in this module happens
            // while ENV_LOCK is held.
            // NOTE(review): this does not exclude unrelated threads reading the
            // process environment concurrently; accepted for test code.
            unsafe {
                match value {
                    Some(v) => std::env::set_var(PROFILING_VAR, v),
                    None => std::env::remove_var(PROFILING_VAR),
                }
            }
            Self { _guard: guard }
        }
    }

    impl Drop for ProfilingEnv {
        fn drop(&mut self) {
            // SAFETY: ENV_LOCK is still held here; the guard field is released
            // only after this body returns. Same caveat as in `with`.
            unsafe {
                std::env::remove_var(PROFILING_VAR);
            }
        }
    }

    /// Minimal valid fixture for `document` with the given `file_type`.
    fn sample_fixture(document: &str, file_type: &str) -> Fixture {
        Fixture {
            document: PathBuf::from(document),
            file_type: file_type.to_string(),
            file_size: 1024,
            expected_frameworks: Vec::new(),
            metadata: HashMap::new(),
            ground_truth: None,
        }
    }

    /// PDF fixture whose ground truth points at `text_file` with source "manual".
    fn fixture_with_text_ground_truth(text_file: &str) -> Fixture {
        Fixture {
            ground_truth: Some(GroundTruth {
                text_file: Some(PathBuf::from(text_file)),
                markdown_file: None,
                source: "manual".to_string(),
            }),
            ..sample_fixture("test.pdf", "pdf")
        }
    }

    /// Serialize `fixture` to `<dir>/<name>.json` and return that path.
    fn write_fixture(dir: &Path, name: &str, fixture: &Fixture) -> PathBuf {
        let path = dir.join(format!("{}.json", name));
        std::fs::write(&path, serde_json::to_string(fixture).unwrap()).unwrap();
        path
    }

    /// Write one PDF fixture JSON per name into `dir`.
    fn write_pdf_fixtures(dir: &Path, names: &[&str]) {
        for name in names {
            write_fixture(dir, name, &sample_fixture(&format!("{}.pdf", name), "pdf"));
        }
    }

    /// File stems of all fixtures currently loaded in `manager`.
    fn loaded_stems(manager: &FixtureManager) -> Vec<String> {
        manager
            .fixtures()
            .iter()
            .filter_map(|(path, _)| path.file_stem().and_then(|s| s.to_str()).map(|s| s.to_string()))
            .collect()
    }

    #[test]
    fn test_fixture_validation() {
        let fixture = Fixture {
            expected_frameworks: vec!["kreuzberg".to_string()],
            ..sample_fixture("test.pdf", "pdf")
        };
        assert!(fixture.validate(Path::new("fixture.json")).is_ok());
    }

    #[test]
    fn test_absolute_path_rejected() {
        #[cfg(windows)]
        let absolute_path = "C:\\absolute\\path\\test.pdf";
        #[cfg(not(windows))]
        let absolute_path = "/absolute/path/test.pdf";
        let fixture = sample_fixture(absolute_path, "pdf");
        assert!(fixture.validate(Path::new("fixture.json")).is_err());
    }

    #[test]
    fn test_fixture_manager_load() {
        let temp_dir = TempDir::new().unwrap();
        let fixture_path = write_fixture(temp_dir.path(), "test", &sample_fixture("test.pdf", "pdf"));
        let mut manager = FixtureManager::new();
        assert!(manager.load_fixture(&fixture_path).is_ok());
        assert_eq!(manager.len(), 1);
    }

    #[test]
    fn test_profiling_fixtures_with_env_var() {
        let _env = ProfilingEnv::with(Some("pdf_small,docx_simple"));
        let temp_dir = TempDir::new().unwrap();
        write_pdf_fixtures(
            temp_dir.path(),
            &["pdf_small", "pdf_medium", "docx_simple", "html_simple"],
        );
        let mut manager = FixtureManager::new();
        manager.load_fixtures_from_dir(temp_dir.path()).unwrap();
        assert_eq!(manager.len(), 2);
        let loaded_names = loaded_stems(&manager);
        assert!(loaded_names.contains(&"pdf_small".to_string()));
        assert!(loaded_names.contains(&"docx_simple".to_string()));
        assert!(!loaded_names.contains(&"pdf_medium".to_string()));
        assert!(!loaded_names.contains(&"html_simple".to_string()));
    }

    #[test]
    fn test_profiling_fixtures_all_when_env_not_set() {
        let _env = ProfilingEnv::with(None);
        let temp_dir = TempDir::new().unwrap();
        write_pdf_fixtures(temp_dir.path(), &["pdf_small", "pdf_medium", "docx_simple"]);
        let mut manager = FixtureManager::new();
        manager.load_fixtures_from_dir(temp_dir.path()).unwrap();
        assert_eq!(manager.len(), 3);
    }

    #[test]
    fn test_profiling_fixtures_with_whitespace() {
        let _env = ProfilingEnv::with(Some("pdf_small , pdf_medium , docx_simple"));
        let temp_dir = TempDir::new().unwrap();
        write_pdf_fixtures(temp_dir.path(), &["pdf_small", "pdf_medium", "docx_simple"]);
        let mut manager = FixtureManager::new();
        manager.load_fixtures_from_dir(temp_dir.path()).unwrap();
        assert_eq!(manager.len(), 3);
    }

    #[test]
    fn test_profiling_fixtures_partial_match() {
        let _env = ProfilingEnv::with(Some("pdf_small,nonexistent_fixture"));
        let temp_dir = TempDir::new().unwrap();
        write_pdf_fixtures(temp_dir.path(), &["pdf_small", "pdf_medium", "docx_simple"]);
        let mut manager = FixtureManager::new();
        manager.load_fixtures_from_dir(temp_dir.path()).unwrap();
        assert_eq!(manager.len(), 1);
        assert!(loaded_stems(&manager).contains(&"pdf_small".to_string()));
    }

    #[test]
    fn test_requires_ocr_for_image_types() {
        // Covers every extension listed in `Fixture::requires_ocr`.
        let image_types = [
            "jpg", "jpeg", "png", "gif", "bmp", "tiff", "tif", "webp", "jp2", "jpx", "jpm", "mj2",
        ];
        for file_type in image_types {
            let fixture = sample_fixture(&format!("test.{}", file_type), file_type);
            assert!(
                fixture.requires_ocr(),
                "Expected file type {} to require OCR",
                file_type
            );
        }
    }

    #[test]
    fn test_requires_ocr_for_non_image_types() {
        let non_image_types = ["pdf", "docx", "txt", "html", "md"];
        for file_type in non_image_types {
            let fixture = sample_fixture(&format!("test.{}", file_type), file_type);
            assert!(
                !fixture.requires_ocr(),
                "Expected file type {} to not require OCR",
                file_type
            );
        }
    }

    #[test]
    fn test_requires_ocr_explicit_metadata_true() {
        let mut fixture = sample_fixture("test.pdf", "pdf");
        fixture
            .metadata
            .insert("requires_ocr".to_string(), serde_json::json!(true));
        // PDF normally doesn't require OCR, but metadata overrides this
        assert!(fixture.requires_ocr());
    }

    #[test]
    fn test_requires_ocr_explicit_metadata_false() {
        let mut fixture = sample_fixture("test.png", "png");
        fixture
            .metadata
            .insert("requires_ocr".to_string(), serde_json::json!(false));
        // PNG normally requires OCR, but metadata overrides this
        assert!(!fixture.requires_ocr());
    }

    #[test]
    fn test_requires_ocr_case_insensitive() {
        let fixture = sample_fixture("test.JPG", "JPG");
        assert!(fixture.requires_ocr());
    }

    #[test]
    fn test_ground_truth_file_existence_validation() {
        let temp_dir = TempDir::new().unwrap();
        let fixture = fixture_with_text_ground_truth("nonexistent_ground_truth.txt");
        let fixture_path = write_fixture(temp_dir.path(), "test", &fixture);
        // Should fail because ground truth file doesn't exist
        match Fixture::from_file(&fixture_path) {
            Err(Error::InvalidFixture { reason, .. }) => {
                assert!(reason.contains("ground truth file not found"));
            }
            _ => panic!("Expected InvalidFixture error with 'ground truth file not found'"),
        }
    }

    #[test]
    fn test_ground_truth_file_existence_validation_success() {
        let temp_dir = TempDir::new().unwrap();
        // Create the ground truth file
        std::fs::write(temp_dir.path().join("ground_truth.txt"), "Sample ground truth text").unwrap();
        let fixture = fixture_with_text_ground_truth("ground_truth.txt");
        let fixture_path = write_fixture(temp_dir.path(), "test", &fixture);
        // Should succeed because ground truth file exists
        assert!(Fixture::from_file(&fixture_path).is_ok());
    }

    #[test]
    fn test_fixture_load_with_mixed_success_and_failure() {
        let _env = ProfilingEnv::with(None);
        let temp_dir = TempDir::new().unwrap();
        // Create valid fixture
        write_fixture(temp_dir.path(), "valid", &sample_fixture("test.pdf", "pdf"));
        // Create invalid fixture (missing ground truth file)
        write_fixture(
            temp_dir.path(),
            "invalid",
            &fixture_with_text_ground_truth("nonexistent.txt"),
        );
        let mut manager = FixtureManager::new();
        // Should succeed overall (returns Ok), but report failed fixtures
        let result = manager.load_fixtures_from_dir(temp_dir.path());
        assert!(result.is_ok());
        // Should have loaded only the valid fixture
        assert_eq!(manager.len(), 1);
    }
}

View File

@@ -0,0 +1,83 @@
//! Fast benchmark groups: curated document subsets for targeted iteration.
/// A named benchmark group with a description and list of doc name patterns.
///
/// All fields are `'static`, so groups can be declared in a `const` table and
/// cloned cheaply.
#[derive(Debug, Clone)]
pub struct BenchmarkGroup {
    /// Short identifier used to select the group.
    pub name: &'static str,
    /// Human-readable summary of what the group exercises.
    pub description: &'static str,
    /// Document name patterns (matched via `contains`, same as --doc).
    pub docs: &'static [&'static str],
}
/// All built-in benchmark groups, in the order they are listed by `group_names`.
///
/// Groups are not disjoint: the same document pattern may appear in several
/// groups (for example "senate-expenditures" is in both "tables" and
/// "ocr-fallback", and "nougat_028" is in both "structure" and "multicolumn").
///
/// NOTE(review): patterns are documented on `BenchmarkGroup::docs` as matched
/// via `contains`, so a pattern such as "nics-background-checks-2015-11" would
/// presumably also select "nics-background-checks-2015-11-rotated" — confirm
/// against the code that applies the patterns.
pub const GROUPS: &[BenchmarkGroup] = &[
BenchmarkGroup {
name: "tables",
description: "Table extraction quality (wide tables, borderless, receipts)",
docs: &[
"senate-expenditures",
"nics-background-checks-2015-11",
"SPARSE-2024-INV-1234_borderless_table",
"RECEIPT-2024-TXN-98765_retail_purchase",
"REPAIR-2022-INV-001_multipage",
"redp5110_sampled",
"table-curves-example",
],
},
BenchmarkGroup {
name: "structure",
description: "Heading/structure detection (SF1 regressions)",
docs: &[
"pdfa_040",
"nougat_028",
"nougat_018",
"pdfa_033",
"pdf_structure",
"hello_structure",
"word365_structure",
"figure_structure",
],
},
BenchmarkGroup {
name: "multicolumn",
description: "Multi-column and magazine-style layouts",
docs: &[
"nougat_028",
"2305.03393v1",
"2206.01062",
"2203.01017v2",
"federal-register-2020-17221",
],
},
BenchmarkGroup {
name: "text-quality",
description: "RTL, special chars, encoding, OCR edge cases",
docs: &[
"right_to_left_02",
"right_to_left_03",
"annotations-unicode-issues",
"pdfa_033",
"test-punkt",
"issue-1114-dedupe-chars",
],
},
BenchmarkGroup {
name: "ocr-fallback",
description: "Documents where native extraction fails and OCR should trigger",
docs: &[
"senate-expenditures",
"la-precinct-bulletin-2014-p1",
"scotus-transcript-p1",
"issue-848",
"nics-background-checks-2015-11-rotated",
],
},
];
/// Look up a benchmark group by name, ignoring ASCII case.
///
/// Returns the first entry of `GROUPS` whose name matches, or `None` when no
/// group has that name.
pub fn find_group(name: &str) -> Option<&'static BenchmarkGroup> {
    for group in GROUPS {
        if group.name.eq_ignore_ascii_case(name) {
            return Some(group);
        }
    }
    None
}
/// List all available group names.
pub fn group_names() -> Vec<&'static str> {
GROUPS.iter().map(|g| g.name).collect()
}

View File

@@ -0,0 +1,90 @@
//! Benchmark harness for comparing document extraction frameworks.
//!
//! This crate provides infrastructure for benchmarking Kreuzberg against other
//! document extraction frameworks, measuring performance (throughput, memory, latency)
//! and quality (F1 scores, text accuracy).
//!
//! # Dual-use pattern
//!
//! The harness serves two distinct workflows through the CLI subcommands:
//!
//! - **CI benchmarking** (`run` / `consolidate`): automated multi-framework
//! performance sweeps that produce JSON artifacts consumed by dashboards.
//! `run` executes one framework at a time; `consolidate` merges per-framework
//! result files into a single ranked report.
//!
//! - **Local quality assessment** (`compare` / `pipeline-benchmark`): interactive
//! tools for developers tuning extraction quality. `compare` runs multiple
//! Kreuzberg pipeline configurations side-by-side on the corpus, printing an
//! SF1/TF1 table. `pipeline-benchmark` extends this with timing data.
//!
//! # Module organization
//!
//! | Module | Purpose |
//! |--------|---------|
//! | [`adapter`] / [`adapters`] | Framework adapter trait and concrete implementations (native, Node, Python, Ruby). |
//! | [`aggregate`] | Consolidation aggregation: groups results by framework/mode/file-type, computes percentiles. |
//! | [`comparison`] | Multi-pipeline quality comparison on the corpus with guardrail thresholds. |
//! | [`config`] | Configuration types for benchmark runs and profiling. |
//! | [`consolidate`] | Recursive loading of `results.json` files from disk. |
//! | [`corpus`] | Test corpus discovery and filtering. |
//! | [`fixture`] | Fixture loading and validation. |
//! | [`markdown_quality`] | Structural F1 scoring via fuzzy cross-type block matching. |
//! | [`quality`] | Token-level (bag-of-words) text and numeric F1 scoring. |
//! | [`runner`] | Benchmark execution orchestrator (warmup, iterations, resource monitoring). |
//! | [`stats`] | Percentile calculations (R-7 interpolation) and NaN sanitization. |
//! | [`types`] | Core data types (`BenchmarkResult`, `QualityMetrics`, etc.). |
pub mod adapter;
pub mod adapters;
pub mod aggregate;
pub mod comparison;
pub mod config;
pub mod consolidate;
pub mod corpus;
pub mod diagnostics;
pub mod embed_benchmark;
pub mod error;
pub mod fixture;
pub mod groups;
pub mod markdown_quality;
pub mod model_benchmark;
pub mod monitoring;
pub mod noise_detection;
pub mod output;
pub mod pipeline_benchmark;
pub mod pool_metrics;
pub mod profile_report;
pub mod profiling;
pub mod quality;
pub mod registry;
pub mod runner;
pub mod sizes;
pub mod stats;
pub mod survey;
pub mod types;
pub mod validate_gt;
pub use adapter::FrameworkAdapter;
pub use aggregate::{
ComparisonData, ConsolidationMetadata, DeltaMetrics, DurationPercentiles, FileTypeAggregation,
FrameworkModeAggregation, NewConsolidatedResults, PerFixtureRow, Percentiles, PerformancePercentiles,
QualityPercentiles, RankedFramework, aggregate_new_format,
};
pub use config::{BenchmarkConfig, BenchmarkMode, ProfilingConfig, load_framework_sizes};
pub use consolidate::load_run_results;
pub use error::{Error, Result};
pub use fixture::{Fixture, FixtureManager};
pub use monitoring::{ResourceMonitor, ResourceSample, ResourceStats};
pub use output::{write_by_extension_analysis, write_json};
pub use pool_metrics::{FilePoolMetrics, PoolMetricsReport};
pub use profile_report::{Hotspot, MemorySnapshot, ProfileReport};
pub use quality::{compute_quality, compute_quality_with_structure};
pub use registry::AdapterRegistry;
pub use runner::BenchmarkRunner;
pub use types::{BenchmarkResult, DiskSizeInfo, FrameworkCapabilities, KreuzbergPipeline, OutputFormat, PdfMetadata};
pub use sizes::{
FrameworkSize, FrameworkSizes, load_framework_sizes as load_sizes_json, measure_framework_sizes,
save_framework_sizes,
};

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,173 @@
//! Layout model A/B benchmark: compare layout detection configurations on rendered PDF pages.
//!
//! Replaces `crates/kreuzberg/tests/layout_model_benchmark.rs`.
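//! The current implementation compares two table model configurations on end-to-end
//! extraction time and Markdown heading count (a proxy for detected regions); cold start,
//! inference latency, and class distribution are not measured separately yet.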
use crate::Result;
use crate::corpus::{self, CorpusFilter};
use kreuzberg::core::config::layout::TableModel;
use std::path::PathBuf;
use std::time::Instant;
/// Translate a table-model name from the benchmark configuration into a `TableModel`.
///
/// Matching is exact and case-sensitive. Recognized names are `"tatr"`,
/// `"slanet_wired"`, `"slanet_wireless"`, `"slanet_plus"`, `"slanet_auto"` and
/// `"disabled"`. Any other string silently falls back to `TableModel::default()`.
fn parse_table_model(s: &str) -> TableModel {
    if s == "tatr" {
        return TableModel::Tatr;
    }
    if s == "slanet_wired" {
        return TableModel::SlanetWired;
    }
    if s == "slanet_wireless" {
        return TableModel::SlanetWireless;
    }
    if s == "slanet_plus" {
        return TableModel::SlanetPlus;
    }
    if s == "slanet_auto" {
        return TableModel::SlanetAuto;
    }
    if s == "disabled" {
        return TableModel::Disabled;
    }
    // Unknown names are not an error here; the library default is used instead.
    TableModel::default()
}
/// Settings for an A/B table-model benchmark run.
pub struct ModelBenchmarkConfig {
    /// Directory passed to the corpus builder to discover fixture documents.
    pub fixtures_dir: PathBuf,
    /// Name of the first table model to benchmark (e.g. `"tatr"`).
    pub model_a: String,
    /// Name of the second table model to benchmark (e.g. `"slanet_auto"`).
    pub model_b: String,
    /// Page limit per document.
    /// NOTE(review): `run_model_benchmark` in this file never reads this field —
    /// confirm whether it is consumed elsewhere.
    pub max_pages: usize,
}

impl Default for ModelBenchmarkConfig {
    /// Defaults: repository fixture directory, `tatr` versus `slanet_auto`, 3 pages.
    fn default() -> Self {
        ModelBenchmarkConfig {
            max_pages: 3,
            model_b: String::from("slanet_auto"),
            model_a: String::from("tatr"),
            fixtures_dir: PathBuf::from("tools/benchmark-harness/fixtures"),
        }
    }
}
/// Per-document model comparison result.
///
/// `run_model_benchmark` pushes one of these for every document in the corpus,
/// whether or not either extraction succeeded.
#[derive(Debug)]
pub struct ModelDocResult {
/// Document name, cloned from the corpus entry.
pub name: String,
/// Wall-clock milliseconds spent on the model A extraction attempt
/// (also recorded when the attempt fails or times out).
pub model_a_ms: f64,
/// Wall-clock milliseconds spent on the model B extraction attempt
/// (also recorded when the attempt fails or times out).
pub model_b_ms: f64,
/// Count of output lines starting with `#` for model A, used as a proxy for
/// detected regions; 0 when no extraction result is available.
pub model_a_regions: usize,
/// Count of output lines starting with `#` for model B, used as a proxy for
/// detected regions; 0 when no extraction result is available.
pub model_b_regions: usize,
}
/// Run model benchmark (stub — full implementation requires layout model API).
///
/// Builds a PDF-only corpus (ground truth required, files up to 5 MB) from
/// `config.fixtures_dir`, then extracts every document twice — once with
/// `config.model_a` and once with `config.model_b` as the table model — and
/// records wall-clock time plus a Markdown heading count for each attempt.
/// A full implementation would directly invoke the ONNX models on rendered pages.
///
/// Extraction failures and timeouts do not abort the run: they are logged to
/// stderr and recorded with a region count of 0.
///
/// # Errors
/// Returns an error only when the corpus cannot be built.
pub async fn run_model_benchmark(config: &ModelBenchmarkConfig) -> Result<Vec<ModelDocResult>> {
    let filter = CorpusFilter {
        file_types: Some(vec!["pdf".to_string()]),
        require_ground_truth: true,
        name_patterns: Vec::new(),
        max_file_size: Some(5_000_000), // Skip huge PDFs for model benchmarks
        ..Default::default()
    };
    let docs = corpus::build_corpus(&config.fixtures_dir, &filter)?;
    eprintln!(
        "Model benchmark: {} documents, models: {} vs {}",
        docs.len(),
        config.model_a,
        config.model_b
    );
    let mut results = Vec::with_capacity(docs.len());
    for doc in &docs {
        // Model A first, then model B, so both runs see the same document back to back.
        let (model_a_ms, model_a_regions) =
            time_table_model_extraction(&doc.document_path, &doc.name, &config.model_a).await;
        let (model_b_ms, model_b_regions) =
            time_table_model_extraction(&doc.document_path, &doc.name, &config.model_b).await;
        results.push(ModelDocResult {
            name: doc.name.clone(),
            model_a_ms,
            model_b_ms,
            model_a_regions,
            model_b_regions,
        });
    }
    Ok(results)
}

/// Extract one document as Markdown with the named table model.
///
/// Returns `(elapsed_ms, heading_count)`. The heading count (lines starting
/// with `#`) is a proxy for detected regions. The attempt is capped at 180
/// seconds; on timeout or extraction error a message is written to stderr and
/// the heading count is 0, while the elapsed time is still reported.
async fn time_table_model_extraction(
    document_path: impl AsRef<std::path::Path>,
    doc_name: &str,
    model_name: &str,
) -> (f64, usize) {
    let extraction_config = kreuzberg::ExtractionConfig {
        output_format: kreuzberg::core::config::OutputFormat::Markdown,
        layout: Some(kreuzberg::core::config::layout::LayoutDetectionConfig {
            table_model: parse_table_model(model_name),
            ..Default::default()
        }),
        ..Default::default()
    };
    let started = Instant::now();
    let outcome = tokio::time::timeout(
        std::time::Duration::from_secs(180),
        kreuzberg::extract_file(document_path.as_ref(), None, &extraction_config),
    )
    .await;
    let elapsed_ms = started.elapsed().as_secs_f64() * 1000.0;
    let heading_count = match outcome {
        Ok(Ok(extracted)) => extracted.content.lines().filter(|l| l.starts_with('#')).count(),
        Ok(Err(err)) => {
            // Previously dropped silently; without this a failed extraction is
            // indistinguishable from a document that simply has no headings.
            eprintln!("  ERROR {}/{}: {}", doc_name, model_name, err);
            0
        }
        Err(_) => {
            eprintln!("  TIMEOUT {}/{}", doc_name, model_name);
            0
        }
    };
    (elapsed_ms, heading_count)
}
/// Print model benchmark results table.
///
/// Writes a header, one row per document and an `AVERAGE` row to stderr.
/// Document names longer than 24 characters are truncated to 24 characters.
///
/// # Arguments
/// * `results` - Rows to print; may be empty
/// * `model_a` - Label used in the model A column headers
/// * `model_b` - Label used in the model B column headers
pub fn print_model_table(results: &[ModelDocResult], model_a: &str, model_b: &str) {
    eprintln!(
        "{:<25} {:>10} {:>10} {:>10} {:>10}",
        "Document",
        format!("{} ms", model_a),
        format!("{} ms", model_b),
        format!("{} rgns", model_a),
        format!("{} rgns", model_b),
    );
    eprintln!("{}", "-".repeat(70));
    for r in results {
        // Cut on a character boundary: slicing at a fixed byte offset panics
        // when that offset lands inside a multi-byte UTF-8 character.
        let shown_name = match r.name.char_indices().nth(24) {
            Some((cut, _)) => &r.name[..cut],
            None => r.name.as_str(),
        };
        eprintln!(
            "{:<25} {:>10.0} {:>10.0} {:>10} {:>10}",
            shown_name, r.model_a_ms, r.model_b_ms, r.model_a_regions, r.model_b_regions,
        );
    }
    eprintln!("{}", "-".repeat(70));
    if results.is_empty() {
        // An average over zero rows is undefined; avoid printing NaN.
        eprintln!("{:<25} {:>10} {:>10}", "AVERAGE", "n/a", "n/a");
        return;
    }
    let n = results.len() as f64;
    let avg_a: f64 = results.iter().map(|r| r.model_a_ms).sum::<f64>() / n;
    let avg_b: f64 = results.iter().map(|r| r.model_b_ms).sum::<f64>() / n;
    eprintln!("{:<25} {:>10.0} {:>10.0}", "AVERAGE", avg_a, avg_b);
}

View File

@@ -0,0 +1,884 @@
//! Resource monitoring for benchmark execution
//!
//! This module provides real-time monitoring of CPU and memory usage during
//! document extraction, with percentile calculations for performance analysis.
//! When the "memory-profiling" feature is enabled, provides additional allocation
//! hotspot analysis and heap snapshot tracking.
//!
//! # Measurement Methodology
//!
//! Both memory and CPU measurements include the entire process tree (parent + all
//! child processes). This is critical for accurate measurement of extraction
//! frameworks that spawn subprocesses (e.g., pandoc, tika). Without this,
//! measurements would only capture the idle wrapper process, not the actual
//! extraction work happening in child processes.
//!
//! Changed in v4.0: Previously only measured parent process memory.
//! Changed in v4.3.7: CPU now also measures the entire process tree (previously
//! only measured parent process CPU, causing near-zero readings for subprocess-based
//! frameworks).
use std::sync::Arc;
use std::sync::atomic::{AtomicBool, Ordering};
use std::time::Duration;
use sysinfo::{Pid, ProcessRefreshKind, ProcessesToUpdate, System};
use tokio::sync::Mutex;
/// Pick a sampling interval (in milliseconds) from the size of the file being processed.
///
/// * below 100,000 bytes: 1 ms, for fine-grained measurement of short runs
/// * 100,000 up to (but excluding) 10,000,000 bytes: 5 ms
/// * 10,000,000 bytes and above: 10 ms, to reduce sampling overhead
pub fn adaptive_sampling_interval_ms(file_size: u64) -> u64 {
    match file_size {
        0..=99_999 => 1,
        100_000..=9_999_999 => 5,
        _ => 10,
    }
}
/// Memory state of the monitored process tree at one sampling instant.
///
/// Holds resident and virtual memory sizes and, when the `memory-profiling`
/// feature is enabled, an optional heap allocation figure. Used for memory
/// growth analysis and leak detection.
#[derive(Debug, Clone)]
pub struct MemorySnapshot {
    /// Time elapsed since monitoring started
    pub timestamp: Duration,
    /// Resident Set Size in bytes (physical memory in use)
    pub rss_bytes: u64,
    /// Virtual memory size in bytes
    pub vm_bytes: u64,
    /// Major page faults recorded with this snapshot
    pub page_faults: u64,
    /// Heap bytes allocated (present only with the memory-profiling feature)
    #[cfg(feature = "memory-profiling")]
    pub heap_allocated: Option<u64>,
}

impl MemorySnapshot {
    /// Build a snapshot without heap data (feature `memory-profiling` disabled).
    #[cfg(not(feature = "memory-profiling"))]
    fn new(timestamp: Duration, rss_bytes: u64, vm_bytes: u64, page_faults: u64) -> Self {
        MemorySnapshot {
            page_faults,
            vm_bytes,
            rss_bytes,
            timestamp,
        }
    }

    /// Build a snapshot that also carries the optional heap figure
    /// (feature `memory-profiling` enabled).
    #[cfg(feature = "memory-profiling")]
    fn new(timestamp: Duration, rss_bytes: u64, vm_bytes: u64, page_faults: u64, heap_allocated: Option<u64>) -> Self {
        MemorySnapshot {
            heap_allocated,
            page_faults,
            vm_bytes,
            rss_bytes,
            timestamp,
        }
    }
}
/// Allocation site with count and size information
///
/// Only available when memory-profiling feature is enabled.
/// NOTE(review): nothing in this file constructs an `AllocationSite` yet; the
/// `allocation_hotspots` list built by `calculate_stats` is always empty.
#[cfg(feature = "memory-profiling")]
#[derive(Debug, Clone)]
pub struct AllocationSite {
/// Source location (file:line format)
pub location: String,
/// Total bytes allocated from this site
pub bytes_allocated: u64,
/// Number of allocations from this site
pub allocation_count: u64,
}
/// Sample of resource usage at a point in time
///
/// Memory and CPU figures cover the monitored process and its descendants,
/// as summed by the `collect_process_tree_*` helpers.
#[derive(Debug, Clone, Copy)]
pub struct ResourceSample {
/// Memory usage in bytes (RSS)
pub memory_bytes: u64,
/// Virtual memory size in bytes
pub vm_size_bytes: u64,
/// Major page faults count.
/// The samplers in this file always store 0 here; page faults are not measured yet.
pub page_faults: u64,
/// CPU usage percentage normalized across cores (0.0 - 100.0)
/// Includes the entire process tree (parent + all child processes).
/// Computed as the summed per-process CPU usage divided by the core count.
pub cpu_percent: f64,
/// Timestamp when sample was taken (relative to monitoring start).
/// `snapshot_current_memory` sets this to 0.
pub timestamp_ms: u64,
}
/// Collect the PIDs of the direct children of `parent_pid`.
///
/// Scans every process known to `system` and keeps those whose parent PID is
/// `parent_pid`. Only one level is returned; callers that need the whole
/// subtree recurse on each returned PID.
fn get_child_processes(parent_pid: Pid, system: &System) -> Vec<Pid> {
    let mut children = Vec::new();
    for (pid, process) in system.processes() {
        if process.parent() == Some(parent_pid) {
            children.push(*pid);
        }
    }
    children
}
/// Sum the RSS memory of a process and all of its descendants.
///
/// Walks the process tree depth-first, adding the memory of `pid` and of every
/// process reachable through parent links. This matters for frameworks that do
/// their extraction work in spawned subprocesses.
///
/// # Arguments
/// * `pid` - Root of the process tree to measure
/// * `system` - System instance whose process information has been refreshed
///
/// # Returns
/// Total RSS in bytes for the tree, or 0 when `pid` is not known to `system`
fn collect_process_tree_memory(pid: Pid, system: &System) -> u64 {
    match system.process(pid) {
        None => 0,
        // Start from the root's own memory and add each child's subtree total.
        Some(root) => get_child_processes(pid, system)
            .into_iter()
            .fold(root.memory(), |subtotal, child| {
                subtotal + collect_process_tree_memory(child, system)
            }),
    }
}
/// Sum the virtual memory size of a process and all of its descendants.
///
/// Same traversal as `collect_process_tree_memory`, but reads the virtual
/// memory size of each process instead of its RSS.
///
/// # Arguments
/// * `pid` - Root of the process tree to measure
/// * `system` - System instance whose process information has been refreshed
///
/// # Returns
/// Total virtual memory in bytes for the tree, or 0 when `pid` is not known to `system`
fn collect_process_tree_vm(pid: Pid, system: &System) -> u64 {
    system.process(pid).map_or(0, |root| {
        let descendants_vm: u64 = get_child_processes(pid, system)
            .into_iter()
            .map(|child| collect_process_tree_vm(child, system))
            .sum();
        root.virtual_memory() + descendants_vm
    })
}
/// Sum the CPU usage of a process and all of its descendants.
///
/// Uses the same traversal as `collect_process_tree_memory` so that CPU and
/// memory figures describe the same set of processes. Measuring only the root
/// would report near-zero CPU for subprocess-based frameworks (tika, pandoc,
/// etc.), where the wrapper idles while children do the extraction work.
///
/// # Arguments
/// * `pid` - Root of the process tree to measure
/// * `system` - System instance whose process information has been refreshed
///
/// # Returns
/// Sum of the per-process CPU usage values for the tree (not normalized by
/// core count), or 0.0 when `pid` is not known to `system`
fn collect_process_tree_cpu(pid: Pid, system: &System) -> f64 {
    match system.process(pid) {
        Some(root) => {
            let own_usage = root.cpu_usage() as f64;
            // Children are added one by one onto the root's own usage.
            get_child_processes(pid, system)
                .into_iter()
                .fold(own_usage, |subtotal, child| {
                    subtotal + collect_process_tree_cpu(child, system)
                })
        }
        None => 0.0,
    }
}
/// Resource monitor that samples CPU and memory usage periodically
///
/// Tracks both low-level CPU/memory metrics and optional heap allocation data.
/// Use the "memory-profiling" feature for enhanced allocation analysis.
///
/// All collections are behind `Arc` so the background sampling task spawned by
/// `start()` can share them with the monitor.
pub struct ResourceMonitor {
/// Samples appended by the background task, one per sampling tick.
samples: Arc<Mutex<Vec<ResourceSample>>>,
/// Memory snapshots appended alongside each sample.
snapshots: Arc<Mutex<Vec<MemorySnapshot>>>,
/// True while sampling is active; cleared by `stop()` to end the background loop.
running: Arc<AtomicBool>,
/// Root of the process tree being measured.
pid: Pid,
/// Baseline RSS captured at start(), used to compute delta-based memory metrics.
/// This removes the effect of pre-loaded models/runtimes from per-extraction measurements.
baseline_memory_bytes: Arc<Mutex<u64>>,
}
impl ResourceMonitor {
/// Build a monitor that measures the current process and its children.
///
/// Nothing is sampled until `start()` is called.
///
/// # Panics
/// Panics if the current process ID cannot be determined.
pub fn new() -> Self {
    let current_pid = sysinfo::get_current_pid().expect("Failed to get current PID");
    Self {
        pid: current_pid,
        running: Arc::new(AtomicBool::new(false)),
        baseline_memory_bytes: Arc::new(Mutex::new(0)),
        samples: Arc::new(Mutex::new(Vec::new())),
        snapshots: Arc::new(Mutex::new(Vec::new())),
    }
}
/// Build a monitor that measures the process tree rooted at `pid`.
///
/// Intended for persistent-mode subprocesses whose extraction server PID is
/// known: the measurements then describe that process tree instead of the
/// harness process. Nothing is sampled until `start()` is called.
pub fn new_for_pid(pid: u32) -> Self {
    let target_pid = Pid::from_u32(pid);
    Self {
        pid: target_pid,
        running: Arc::new(AtomicBool::new(false)),
        baseline_memory_bytes: Arc::new(Mutex::new(0)),
        samples: Arc::new(Mutex::new(Vec::new())),
        snapshots: Arc::new(Mutex::new(Vec::new())),
    }
}
/// Read the number of heap bytes currently allocated, as reported by jemalloc.
///
/// Compiled only with the "memory-profiling" feature. The jemalloc epoch is
/// advanced before the `stats.allocated` value is read.
/// Returns `None` if either jemalloc call fails.
#[cfg(feature = "memory-profiling")]
fn capture_heap_stats() -> Option<u64> {
    use tikv_jemalloc_ctl::{epoch, stats};
    // The returned epoch value itself is not needed; only the advance matters.
    epoch::mib().and_then(|e| e.advance()).ok()?;
    stats::allocated::mib()
        .and_then(|a| a.read())
        .ok()
        .map(|allocated| allocated as u64)
}
/// Start monitoring resources in the background
///
/// Spawns a detached background task that samples memory and CPU usage of the
/// monitored process tree at the specified interval until `stop()` is called.
/// When "memory-profiling" feature is enabled, also captures heap allocation data.
///
/// Calling `start()` while the monitor is already running is a no-op. Samples
/// and snapshots are appended to the monitor's existing collections; they are
/// not cleared here.
///
/// # Arguments
/// * `sample_interval` - How often to sample (e.g., Duration::from_millis(10))
pub async fn start(&self, sample_interval: Duration) {
    // swap() returns the previous value: true means a sampler is already active.
    if self.running.swap(true, Ordering::SeqCst) {
        return;
    }
    let samples = Arc::clone(&self.samples);
    let snapshots = Arc::clone(&self.snapshots);
    let running = Arc::clone(&self.running);
    let baseline_memory = Arc::clone(&self.baseline_memory_bytes);
    let pid = self.pid;
    tokio::spawn(async move {
        let mut system = System::new();
        let start = std::time::Instant::now();
        let refresh_kind = ProcessRefreshKind::nothing().with_memory().with_cpu();
        // The core count is the same for every sample, so query it once
        // instead of on every tick.
        let cpu_count = num_cpus::get() as f64;
        // Establish baseline for CPU delta calculation.
        // sysinfo computes cpu_usage() as a diff between two consecutive refreshes,
        // so the first refresh after System::new() always returns 0.0.
        // By doing a baseline refresh here, the first in-loop sample will have
        // a prior measurement to compare against and yield real CPU values.
        system.refresh_processes_specifics(ProcessesToUpdate::All, false, refresh_kind);
        // Capture baseline RSS before extraction starts.
        // This allows delta-based memory reporting: peak_during_extraction - baseline.
        // Without this, pre-loaded models (e.g. PaddleOCR ~362MB) inflate every
        // extraction's memory measurement, even for plain text files.
        let baseline_rss = collect_process_tree_memory(pid, &system);
        *baseline_memory.lock().await = baseline_rss;
        tokio::time::sleep(sample_interval).await;
        while running.load(Ordering::SeqCst) {
            // Refresh all processes to track child processes spawned by the benchmark.
            // Note: refresh_cpu_usage() is NOT called here — it refreshes global CPU counters,
            // not per-process CPU. Per-process CPU is computed by refresh_processes_specifics
            // as a delta between consecutive calls on the same System instance.
            system.refresh_processes_specifics(ProcessesToUpdate::All, false, refresh_kind);
            // Skip the tick entirely when the monitored process is not visible.
            if system.process(pid).is_some() {
                let elapsed = start.elapsed();
                // Collect CPU from entire process tree (parent + all children)
                // so subprocess work is captured, not just the idle parent process.
                let tree_cpu = collect_process_tree_cpu(pid, &system);
                let normalized_cpu_percent = tree_cpu / cpu_count;
                // Collect memory from entire process tree (parent + all children)
                let tree_memory = collect_process_tree_memory(pid, &system);
                let tree_vm = collect_process_tree_vm(pid, &system);
                let sample = ResourceSample {
                    memory_bytes: tree_memory,
                    vm_size_bytes: tree_vm,
                    page_faults: 0,
                    cpu_percent: normalized_cpu_percent,
                    timestamp_ms: elapsed.as_millis() as u64,
                };
                // Heap statistics exist only with the memory-profiling feature.
                #[cfg(feature = "memory-profiling")]
                let snapshot =
                    MemorySnapshot::new(elapsed, tree_memory, tree_vm, 0, Self::capture_heap_stats());
                #[cfg(not(feature = "memory-profiling"))]
                let snapshot = MemorySnapshot::new(elapsed, tree_memory, tree_vm, 0);
                samples.lock().await.push(sample);
                snapshots.lock().await.push(snapshot);
            }
            tokio::time::sleep(sample_interval).await;
        }
    });
}
/// Take a single synchronous memory and CPU measurement of the current process tree.
///
/// Useful as a fallback when the background sampler collects zero samples
/// (e.g., sub-millisecond extractions that complete before the first sample).
/// Performs two refreshes with a 50ms gap to get a valid CPU delta.
pub fn snapshot_current_memory(&self) -> ResourceSample {
let mut system = System::new();
let refresh_kind = ProcessRefreshKind::nothing().with_memory().with_cpu();
// First refresh establishes the CPU baseline
system.refresh_processes_specifics(ProcessesToUpdate::All, false, refresh_kind);
std::thread::sleep(std::time::Duration::from_millis(50));
// Second refresh computes the CPU delta
system.refresh_processes_specifics(ProcessesToUpdate::All, false, refresh_kind);
let tree_memory = collect_process_tree_memory(self.pid, &system);
let tree_vm = collect_process_tree_vm(self.pid, &system);
let cpu_count = num_cpus::get() as f64;
let tree_cpu = collect_process_tree_cpu(self.pid, &system);
let normalized_cpu_percent = tree_cpu / cpu_count;
ResourceSample {
memory_bytes: tree_memory,
vm_size_bytes: tree_vm,
page_faults: 0,
cpu_percent: normalized_cpu_percent,
timestamp_ms: 0,
}
}
/// Stop monitoring and return a copy of the samples collected so far.
///
/// Clears the running flag, waits 20ms to give the background task a chance to
/// notice it, then clones the sample list.
pub async fn stop(&self) -> Vec<ResourceSample> {
    self.running.store(false, Ordering::SeqCst);
    tokio::time::sleep(Duration::from_millis(20)).await;
    let collected = self.samples.lock().await.to_vec();
    collected
}
/// Return a copy of every memory snapshot collected so far.
///
/// One snapshot is recorded per sampling tick, in chronological order.
pub async fn get_snapshots(&self) -> Vec<MemorySnapshot> {
    let collected = self.snapshots.lock().await.to_vec();
    collected
}
/// Return the snapshot with the highest RSS.
///
/// When several snapshots share the highest RSS, the latest one is returned.
/// Returns `None` if no snapshots have been collected.
pub async fn peak_snapshot(&self) -> Option<MemorySnapshot> {
    let guard = self.snapshots.lock().await;
    let peak = guard.iter().max_by(|a, b| a.rss_bytes.cmp(&b.rss_bytes));
    peak.cloned()
}
/// Return the memory growth trajectory as `(timestamp, rss_bytes)` pairs.
///
/// One pair per collected snapshot, in collection order. Useful for telling
/// sustained growth apart from temporary spikes.
pub async fn growth_trajectory(&self) -> Vec<(Duration, u64)> {
    let guard = self.snapshots.lock().await;
    let mut trajectory = Vec::with_capacity(guard.len());
    for snapshot in guard.iter() {
        trajectory.push((snapshot.timestamp, snapshot.rss_bytes));
    }
    trajectory
}
/// Report whether the collected snapshots look like a memory leak.
///
/// A leak is reported when RSS grew by more than 5% from the first to the last
/// snapshot and the final RSS is more than 20% of the peak RSS, which avoids
/// flagging temporary allocations that were released again.
/// Returns `false` when fewer than two snapshots are available.
pub async fn detect_leaks(&self) -> bool {
    let snapshots = self.snapshots.lock().await;
    match snapshots.as_slice() {
        // The pattern only matches when there are at least two snapshots.
        [first, .., last] => {
            let start_rss = first.rss_bytes as f64;
            let end_rss = last.rss_bytes as f64;
            let peak_rss = snapshots.iter().map(|s| s.rss_bytes as f64).fold(0.0, f64::max);
            let grew_enough = ((end_rss - start_rss) / start_rss) * 100.0 > 5.0;
            let mostly_retained = (end_rss / peak_rss) * 100.0 > 20.0;
            grew_enough && mostly_retained
        }
        _ => false,
    }
}
/// Calculate a percentile of the given values (no interpolation).
///
/// The values are sorted and the element at index
/// `floor((len - 1) * percentile)` is returned.
///
/// # Arguments
/// * `values` - Values to rank; they do not need to be sorted
/// * `percentile` - Percentile to calculate (0.0 - 1.0); values outside this
///   range are clamped to it, and NaN is treated as 0.0
///
/// # Returns
/// The selected value, or 0 when `values` is empty
fn calculate_percentile(mut values: Vec<u64>, percentile: f64) -> u64 {
    if values.is_empty() {
        return 0;
    }
    values.sort_unstable();
    let last = values.len() - 1;
    // Clamp so an out-of-range percentile cannot index past the end.
    let rank = (last as f64 * percentile.clamp(0.0, 1.0)) as usize;
    values[rank.min(last)]
}
/// Return the baseline RSS (in bytes) recorded by the background task in `start()`.
///
/// The value is 0 until that task has stored its first measurement.
pub async fn baseline_memory(&self) -> u64 {
    let baseline = self.baseline_memory_bytes.lock().await;
    *baseline
}
/// Calculate resource statistics from samples and snapshots
///
/// Memory values are reported as deltas from `baseline_bytes`, which represents
/// the process tree RSS before extraction started. This removes the effect of
/// pre-loaded models and runtimes from per-extraction measurements.
/// Virtual memory and the leak heuristic use absolute (non-delta) values.
///
/// Pass `baseline_bytes = 0` to get absolute RSS (legacy behavior).
///
/// # Arguments
/// * `samples` - Background samples; when empty, `snapshots` is used as a fallback
/// * `snapshots` - Memory snapshots, copied into the result and used for leak detection
/// * `baseline_bytes` - RSS subtracted (saturating at 0) from every memory value
///
/// # Returns
/// Aggregated statistics; `ResourceStats::default()` when both inputs are empty
pub fn calculate_stats(
    samples: &[ResourceSample],
    snapshots: &[MemorySnapshot],
    baseline_bytes: u64,
) -> ResourceStats {
    if samples.is_empty() {
        // If no background samples but snapshots are available, use snapshot RSS as fallback
        if !snapshots.is_empty() {
            let peak_rss = snapshots
                .iter()
                .map(|s| s.rss_bytes.saturating_sub(baseline_bytes))
                .max()
                .unwrap_or(0);
            let peak_vm = snapshots.iter().map(|s| s.vm_bytes).max().unwrap_or(0);
            return ResourceStats {
                peak_memory_bytes: peak_rss,
                peak_vm_bytes: peak_vm,
                p50_memory_bytes: peak_rss,
                p95_memory_bytes: peak_rss,
                p99_memory_bytes: peak_rss,
                sample_count: snapshots.len(),
                snapshots: snapshots.to_vec(),
                ..Default::default()
            };
        }
        return ResourceStats::default();
    }
    // Subtract baseline from memory samples to get delta (incremental cost of this extraction).
    let memory_values: Vec<u64> = samples
        .iter()
        .map(|s| s.memory_bytes.saturating_sub(baseline_bytes))
        .collect();
    let cpu_values: Vec<f64> = samples.iter().map(|s| s.cpu_percent).collect();
    let vm_values: Vec<u64> = samples.iter().map(|s| s.vm_size_bytes).collect();
    let peak_memory = *memory_values.iter().max().unwrap_or(&0);
    let peak_vm = *vm_values.iter().max().unwrap_or(&0);
    let avg_cpu = cpu_values.iter().sum::<f64>() / cpu_values.len() as f64;
    let memory_growth_rate_mb_s = match (samples.first(), samples.last()) {
        (Some(first), Some(last)) if samples.len() >= 2 => {
            let first_memory = first.memory_bytes.saturating_sub(baseline_bytes);
            let last_memory = last.memory_bytes.saturating_sub(baseline_bytes);
            // Saturating: callers may pass samples that are not in time order, and a
            // plain subtraction would underflow (panic in debug, wrap in release).
            let duration_ms = last.timestamp_ms.saturating_sub(first.timestamp_ms);
            // A zero-length window is treated as one second to avoid dividing by zero.
            let duration_s = if duration_ms > 0 {
                duration_ms as f64 / 1000.0
            } else {
                1.0
            };
            // Shrinking memory counts as zero growth.
            let memory_delta_bytes = last_memory.saturating_sub(first_memory) as f64;
            memory_delta_bytes / 1_048_576.0 / duration_s
        }
        _ => 0.0,
    };
    // Leak heuristic: >5% growth from first to last snapshot while the final RSS
    // is still >20% of the peak.
    let leak_detected = if snapshots.len() >= 2 {
        let start_rss = snapshots[0].rss_bytes as f64;
        let end_rss = snapshots[snapshots.len() - 1].rss_bytes as f64;
        let peak_rss = snapshots.iter().map(|s| s.rss_bytes as f64).fold(0.0, f64::max);
        if peak_rss > 0.0 {
            let growth_percent = ((end_rss - start_rss) / start_rss) * 100.0;
            let retained_percent = (end_rss / peak_rss) * 100.0;
            growth_percent > 5.0 && retained_percent > 20.0
        } else {
            false
        }
    } else {
        false
    };
    let total_page_faults = samples.last().map(|s| s.page_faults).unwrap_or(0);
    ResourceStats {
        peak_memory_bytes: peak_memory,
        peak_vm_bytes: peak_vm,
        total_page_faults,
        memory_growth_rate_mb_s,
        avg_cpu_percent: avg_cpu,
        p50_memory_bytes: Self::calculate_percentile(memory_values.clone(), 0.50),
        p95_memory_bytes: Self::calculate_percentile(memory_values.clone(), 0.95),
        p99_memory_bytes: Self::calculate_percentile(memory_values, 0.99),
        sample_count: samples.len(),
        snapshots: snapshots.to_vec(),
        #[cfg(feature = "memory-profiling")]
        allocation_hotspots: Vec::new(), // TODO: Extract from jemalloc profiles
        leak_detected,
    }
}
}
impl Default for ResourceMonitor {
fn default() -> Self {
Self::new()
}
}
/// Resource usage statistics
///
/// Aggregated metrics from benchmark execution including percentiles,
/// growth rates, and optional allocation hotspot analysis.
/// Produced by `ResourceMonitor::calculate_stats`; memory figures are deltas
/// from the baseline passed to that function.
#[derive(Debug, Clone, Default)]
pub struct ResourceStats {
/// Peak memory usage in bytes
pub peak_memory_bytes: u64,
/// Peak virtual memory size in bytes
pub peak_vm_bytes: u64,
/// Total major page faults (taken from the last sample)
pub total_page_faults: u64,
/// Memory growth rate in MB/s (never negative; shrinking memory reports 0)
pub memory_growth_rate_mb_s: f64,
/// Average CPU usage percentage (mean over all samples)
pub avg_cpu_percent: f64,
/// 50th percentile (median) memory usage
pub p50_memory_bytes: u64,
/// 95th percentile memory usage
pub p95_memory_bytes: u64,
/// 99th percentile memory usage
pub p99_memory_bytes: u64,
/// Number of samples collected
pub sample_count: usize,
/// Complete memory snapshots for detailed analysis
pub snapshots: Vec<MemorySnapshot>,
/// Memory allocation hotspots (only with memory-profiling feature).
/// Currently always empty: `calculate_stats` does not extract them yet.
#[cfg(feature = "memory-profiling")]
pub allocation_hotspots: Vec<AllocationSite>,
/// Whether memory leak was detected (RSS growing without release)
pub leak_detected: bool,
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_adaptive_sampling_interval_small_file() {
let interval = adaptive_sampling_interval_ms(50_000);
assert_eq!(interval, 1, "Small file (50KB) should use 1ms interval");
}
#[test]
fn test_adaptive_sampling_interval_boundary_100kb() {
let interval = adaptive_sampling_interval_ms(100_000);
assert_eq!(interval, 5, "Exactly 100KB boundary should use 5ms interval");
}
#[test]
fn test_adaptive_sampling_interval_medium_file() {
let interval = adaptive_sampling_interval_ms(1_000_000);
assert_eq!(interval, 5, "Medium file (1MB) should use 5ms interval");
}
#[test]
fn test_adaptive_sampling_interval_boundary_10mb() {
let interval = adaptive_sampling_interval_ms(10_000_000);
assert_eq!(interval, 10, "Exactly 10MB boundary should use 10ms interval");
}
#[test]
fn test_adaptive_sampling_interval_large_file() {
let interval = adaptive_sampling_interval_ms(100_000_000);
assert_eq!(interval, 10, "Large file (100MB) should use 10ms interval");
}
#[test]
fn test_adaptive_sampling_interval_zero_bytes() {
let interval = adaptive_sampling_interval_ms(0);
assert_eq!(interval, 1, "Zero byte file should use 1ms interval");
}
#[test]
fn test_adaptive_sampling_interval_max_u64() {
let interval = adaptive_sampling_interval_ms(u64::MAX);
assert_eq!(interval, 10, "u64::MAX should use 10ms interval");
}
#[test]
fn test_calculate_percentile() {
let values = vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10];
assert_eq!(ResourceMonitor::calculate_percentile(values.clone(), 0.0), 1);
assert_eq!(ResourceMonitor::calculate_percentile(values.clone(), 0.5), 5);
assert_eq!(ResourceMonitor::calculate_percentile(values.clone(), 0.95), 9);
assert_eq!(ResourceMonitor::calculate_percentile(values, 1.0), 10);
}
#[test]
fn test_calculate_percentile_single_value() {
let values = vec![42];
assert_eq!(ResourceMonitor::calculate_percentile(values, 0.5), 42);
}
#[test]
fn test_calculate_percentile_empty() {
let values = vec![];
assert_eq!(ResourceMonitor::calculate_percentile(values, 0.5), 0);
}
#[tokio::test]
async fn test_resource_monitor_basic() {
let monitor = ResourceMonitor::new();
// 25ms interval + 500ms sleep gives ~20 samples even on a slow CI
// runner; the previous 10/100ms ratio occasionally produced 0
// samples on macOS CI when the first tick missed the deadline.
monitor.start(Duration::from_millis(25)).await;
tokio::time::sleep(Duration::from_millis(500)).await;
let samples = monitor.stop().await;
assert!(!samples.is_empty(), "Should have collected samples");
assert!(samples.len() >= 2, "Should have at least 2 samples");
}
#[tokio::test]
async fn test_resource_stats_calculation() {
let samples = vec![
ResourceSample {
memory_bytes: 100,
vm_size_bytes: 500,
page_faults: 10,
cpu_percent: 10.0,
timestamp_ms: 0,
},
ResourceSample {
memory_bytes: 200,
vm_size_bytes: 600,
page_faults: 20,
cpu_percent: 20.0,
timestamp_ms: 10,
},
ResourceSample {
memory_bytes: 150,
vm_size_bytes: 550,
page_faults: 25,
cpu_percent: 15.0,
timestamp_ms: 20,
},
];
let snapshots = vec![
MemorySnapshot::new(
Duration::from_millis(0),
100,
500,
10,
#[cfg(feature = "memory-profiling")]
None,
),
MemorySnapshot::new(
Duration::from_millis(10),
200,
600,
20,
#[cfg(feature = "memory-profiling")]
None,
),
MemorySnapshot::new(
Duration::from_millis(20),
150,
550,
25,
#[cfg(feature = "memory-profiling")]
None,
),
];
let stats = ResourceMonitor::calculate_stats(&samples, &snapshots, 0);
assert_eq!(stats.peak_memory_bytes, 200);
assert_eq!(stats.peak_vm_bytes, 600);
assert_eq!(stats.total_page_faults, 25);
assert_eq!(stats.p50_memory_bytes, 150);
assert!((stats.avg_cpu_percent - 15.0).abs() < 0.1);
assert_eq!(stats.sample_count, 3);
assert!(stats.memory_growth_rate_mb_s >= 0.0);
assert_eq!(stats.snapshots.len(), 3);
}
#[tokio::test]
async fn test_resource_stats_empty() {
let stats = ResourceMonitor::calculate_stats(&[], &[], 0);
assert_eq!(stats.peak_memory_bytes, 0);
assert_eq!(stats.sample_count, 0);
}
#[tokio::test]
async fn test_leak_detection() {
    // Memory goes 1000 -> 2000 -> 1200: it ends above the starting level
    // after the spike, which the detector is expected to flag.
    let snapshot = |elapsed_ms: u64, rss, vm| {
        MemorySnapshot::new(
            Duration::from_millis(elapsed_ms),
            rss,
            vm,
            0,
            #[cfg(feature = "memory-profiling")]
            None,
        )
    };
    let snapshots = vec![
        snapshot(0, 1000, 5000),
        snapshot(10, 2000, 6000),
        snapshot(20, 1200, 5500),
    ];
    let final_sample = ResourceSample {
        memory_bytes: 1200,
        vm_size_bytes: 5500,
        page_faults: 0,
        cpu_percent: 0.0,
        timestamp_ms: 20,
    };
    let samples = vec![final_sample];

    let stats = ResourceMonitor::calculate_stats(&samples, &snapshots, 0);
    assert!(
        stats.leak_detected,
        "Should detect leak with >5% growth and >20% retention"
    );
}
#[tokio::test]
async fn test_no_leak_detection_temporary_spike() {
    // Memory goes 1000 -> 5000 -> 1001: the spike is almost fully released,
    // so no leak should be reported.
    let snapshot = |elapsed_ms: u64, rss, vm| {
        MemorySnapshot::new(
            Duration::from_millis(elapsed_ms),
            rss,
            vm,
            0,
            #[cfg(feature = "memory-profiling")]
            None,
        )
    };
    let snapshots = vec![
        snapshot(0, 1000, 5000),
        snapshot(10, 5000, 9000),
        snapshot(20, 1001, 5001),
    ];
    let final_sample = ResourceSample {
        memory_bytes: 1001,
        vm_size_bytes: 5001,
        page_faults: 0,
        cpu_percent: 0.0,
        timestamp_ms: 20,
    };
    let samples = vec![final_sample];

    let stats = ResourceMonitor::calculate_stats(&samples, &snapshots, 0);
    assert!(!stats.leak_detected, "Should not detect leak when memory is released");
}
#[tokio::test]
async fn test_snapshot_collection() {
    let monitor = ResourceMonitor::new();
    monitor.start(Duration::from_millis(10)).await;
    tokio::time::sleep(Duration::from_millis(50)).await;

    // The monitor is still running while these accessors are queried, so the
    // order of the calls below is kept exactly as is.
    let collected = monitor.get_snapshots().await;
    assert!(
        !collected.is_empty(),
        "Should have collected snapshots during monitoring"
    );

    assert!(monitor.peak_snapshot().await.is_some(), "Should find peak snapshot");

    let trajectory_len = monitor.growth_trajectory().await.len();
    assert_eq!(
        trajectory_len,
        collected.len(),
        "Trajectory should match snapshot count"
    );

    monitor.stop().await;
}
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,662 @@
//! Output writers for benchmark results
//!
//! This module provides functionality for persisting benchmark results to disk
//! in JSON format.
use crate::stats::percentile_r7;
use crate::types::{BenchmarkResult, ErrorKind};
use crate::{Error, Result};
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::fs;
use std::path::Path;
/// Validate a benchmark result for invalid states
///
/// # Arguments
/// * `result` - The benchmark result to validate
///
/// # Returns
/// * `Ok(())` if valid, `Err` with description if invalid
pub fn validate_result(result: &BenchmarkResult) -> Result<()> {
// Note: duration=0 is valid for sub-millisecond extractions (e.g., simple JSON files).
// We only record millisecond precision, so very fast extractions show as 0ms.
// Check for invalid state: success=true with error message
if result.success && result.error_message.is_some() {
return Err(Error::Benchmark(format!(
"Invalid result state for {}/{}: success=true but error_message is set",
result.framework,
result.file_path.display()
)));
}
// Check for invalid state: success=false without error message
if !result.success && result.error_message.is_none() {
return Err(Error::Benchmark(format!(
"Invalid result state for {}/{}: success=false but error_message is None",
result.framework,
result.file_path.display()
)));
}
// Check for invalid state: success=true but error_kind is not None
if result.success && result.error_kind != ErrorKind::None {
return Err(Error::Benchmark(format!(
"Invalid result state for {}/{}: success=true but error_kind is {:?}",
result.framework,
result.file_path.display(),
result.error_kind
)));
}
Ok(())
}
/// Write benchmark results to JSON file
///
/// # Arguments
/// * `results` - Vector of benchmark results to write
/// * `output_path` - Path to output JSON file
pub fn write_json(results: &[BenchmarkResult], output_path: &Path) -> Result<()> {
// Validate all results before writing
for result in results {
validate_result(result)?;
}
if let Some(parent) = output_path.parent() {
fs::create_dir_all(parent).map_err(Error::Io)?;
}
let json = serde_json::to_string_pretty(results)
.map_err(|e| Error::Benchmark(format!("Failed to serialize results: {}", e)))?;
fs::write(output_path, json).map_err(Error::Io)?;
Ok(())
}
/// Per-framework statistics for a specific file extension.
///
/// Produced by `calculate_framework_stats`. All timing, throughput and memory
/// aggregates are computed over successful results only; they are `0.0`
/// (or `None` for the extraction timings) when there is nothing to aggregate.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FrameworkExtensionStats {
    /// Number of results in the group (successful or not)
    pub count: usize,
    /// Number of successful extractions
    pub successful: usize,
    /// Number of framework-side extraction errors (not our fault)
    pub framework_errors: usize,
    /// Number of harness-side errors (potentially our fault)
    pub harness_errors: usize,
    /// Number of extractions that timed out
    pub timeouts: usize,
    /// Number of extractions that returned empty content
    pub empty_content: usize,
    /// Error messages of all failed results (any error kind) with occurrence counts.
    ///
    /// Omitted from the JSON when empty, so `default` is required for a report
    /// without errors to deserialize again instead of failing on a missing field.
    #[serde(default, skip_serializing_if = "HashMap::is_empty")]
    pub error_details: HashMap<String, usize>,
    /// Success rate (0.0-1.0); 0.0 when `count` is 0
    pub success_rate: f64,
    /// Average wall-clock duration in milliseconds (includes subprocess overhead)
    pub avg_duration_ms: f64,
    /// Median wall-clock duration in milliseconds
    pub median_duration_ms: f64,
    /// P95 wall-clock duration in milliseconds
    pub p95_duration_ms: f64,
    /// Average pure extraction duration in milliseconds (excludes subprocess overhead);
    /// `None` when no successful result reported an extraction duration
    #[serde(skip_serializing_if = "Option::is_none")]
    pub avg_extraction_duration_ms: Option<f64>,
    /// Median pure extraction duration in milliseconds
    #[serde(skip_serializing_if = "Option::is_none")]
    pub median_extraction_duration_ms: Option<f64>,
    /// P95 pure extraction duration in milliseconds
    #[serde(skip_serializing_if = "Option::is_none")]
    pub p95_extraction_duration_ms: Option<f64>,
    /// Average throughput in MB/s (bytes per second divided by 1,000,000)
    pub avg_throughput_mbps: f64,
    /// Average peak memory in MB (bytes divided by 1,000,000)
    pub avg_peak_memory_mb: f64,
}
/// Analysis of results grouped by file extension
///
/// Built by `analyze_by_extension`, one value per distinct file extension.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ExtensionAnalysis {
/// Total number of files with this extension.
///
/// Computed as the largest per-framework result count for the extension
/// (0 if there are no frameworks), not as a count of distinct file paths.
pub total_files: usize,
/// Per-framework performance statistics, keyed by framework name
pub framework_stats: HashMap<String, FrameworkExtensionStats>,
}
/// Complete by-extension analysis result
///
/// This is the value serialized by `write_by_extension_analysis`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ByExtensionReport {
/// Per-extension analysis, keyed by each result's `file_extension` string
pub by_extension: HashMap<String, ExtensionAnalysis>,
}
/// Build a per-extension, per-framework statistics report.
///
/// Results are bucketed first by `file_extension` and then by `framework`;
/// each bucket is summarized with `calculate_framework_stats`. The
/// `total_files` of an extension is the size of its largest framework bucket.
///
/// # Arguments
/// * `results` - Vector of benchmark results to analyze
///
/// # Returns
/// * ByExtensionReport with statistics grouped by extension and framework
pub fn analyze_by_extension(results: &[BenchmarkResult]) -> ByExtensionReport {
    // extension -> framework -> results of that framework for that extension
    let mut grouped: HashMap<String, HashMap<String, Vec<&BenchmarkResult>>> = HashMap::new();
    for entry in results {
        grouped
            .entry(entry.file_extension.clone())
            .or_default()
            .entry(entry.framework.clone())
            .or_default()
            .push(entry);
    }

    let by_extension = grouped
        .into_iter()
        .map(|(extension, per_framework)| {
            let total_files = per_framework.values().map(Vec::len).max().unwrap_or(0);
            let framework_stats = per_framework
                .into_iter()
                .map(|(framework, runs)| (framework, calculate_framework_stats(&runs)))
                .collect();
            (
                extension,
                ExtensionAnalysis {
                    total_files,
                    framework_stats,
                },
            )
        })
        .collect();

    ByExtensionReport { by_extension }
}
/// Calculate statistics for a framework's results
fn calculate_framework_stats(results: &[&BenchmarkResult]) -> FrameworkExtensionStats {
let count = results.len();
let successful = results.iter().filter(|r| r.success).count();
let success_rate = if count > 0 {
successful as f64 / count as f64
} else {
0.0
};
let framework_errors = results
.iter()
.filter(|r| r.error_kind == ErrorKind::FrameworkError)
.count();
let harness_errors = results
.iter()
.filter(|r| r.error_kind == ErrorKind::HarnessError)
.count();
let timeouts = results.iter().filter(|r| r.error_kind == ErrorKind::Timeout).count();
let empty_content = results
.iter()
.filter(|r| r.error_kind == ErrorKind::EmptyContent)
.count();
let mut error_details: HashMap<String, usize> = HashMap::new();
for result in results.iter().filter(|r| !r.success) {
if let Some(msg) = &result.error_message {
*error_details.entry(msg.clone()).or_insert(0) += 1;
}
}
let successful_results: Vec<&&BenchmarkResult> = results.iter().filter(|r| r.success).collect();
let avg_duration_ms = if !successful_results.is_empty() {
successful_results
.iter()
.map(|r| r.duration.as_secs_f64() * 1000.0)
.sum::<f64>()
/ successful_results.len() as f64
} else {
0.0
};
let mut durations: Vec<f64> = successful_results
.iter()
.map(|r| r.duration.as_secs_f64() * 1000.0)
.collect();
durations.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
let median_duration_ms = if !durations.is_empty() {
percentile_r7(&durations, 0.50)
} else {
0.0
};
let p95_duration_ms = if !durations.is_empty() {
percentile_r7(&durations, 0.95)
} else {
0.0
};
// Extraction duration stats (pure extraction time, excludes subprocess overhead)
let mut extraction_durations: Vec<f64> = successful_results
.iter()
.filter_map(|r| r.extraction_duration.map(|d| d.as_secs_f64() * 1000.0))
.filter(|v| !v.is_nan() && v.is_finite())
.collect();
extraction_durations.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
let avg_extraction_duration_ms = if !extraction_durations.is_empty() {
Some(extraction_durations.iter().sum::<f64>() / extraction_durations.len() as f64)
} else {
None
};
let median_extraction_duration_ms = if !extraction_durations.is_empty() {
Some(percentile_r7(&extraction_durations, 0.50))
} else {
None
};
let p95_extraction_duration_ms = if !extraction_durations.is_empty() {
Some(percentile_r7(&extraction_durations, 0.95))
} else {
None
};
let avg_throughput_mbps = if !successful_results.is_empty() {
successful_results
.iter()
.map(|r| r.metrics.throughput_bytes_per_sec / 1_000_000.0)
.sum::<f64>()
/ successful_results.len() as f64
} else {
0.0
};
let avg_peak_memory_mb = if !successful_results.is_empty() {
successful_results
.iter()
.map(|r| r.metrics.peak_memory_bytes as f64 / 1_000_000.0)
.sum::<f64>()
/ successful_results.len() as f64
} else {
0.0
};
FrameworkExtensionStats {
count,
successful,
framework_errors,
harness_errors,
timeouts,
empty_content,
error_details,
success_rate,
avg_duration_ms,
median_duration_ms,
p95_duration_ms,
avg_extraction_duration_ms,
median_extraction_duration_ms,
p95_extraction_duration_ms,
avg_throughput_mbps,
avg_peak_memory_mb,
}
}
/// Write by-extension analysis to JSON file
///
/// # Arguments
/// * `results` - Vector of benchmark results to analyze
/// * `output_path` - Path to output JSON file (e.g., "by-extension.json")
pub fn write_by_extension_analysis(results: &[BenchmarkResult], output_path: &Path) -> Result<()> {
let report = analyze_by_extension(results);
if let Some(parent) = output_path.parent() {
fs::create_dir_all(parent).map_err(Error::Io)?;
}
let json = serde_json::to_string_pretty(&report)
.map_err(|e| Error::Benchmark(format!("Failed to serialize extension analysis: {}", e)))?;
fs::write(output_path, json).map_err(Error::Io)?;
Ok(())
}
#[cfg(test)]
mod tests {
use super::*;
use crate::types::{FrameworkCapabilities, OcrStatus, OutputFormat, PerformanceMetrics};
use std::path::PathBuf;
use std::time::Duration;
use tempfile::TempDir;
fn create_benchmark_result(
framework: &str,
success: bool,
duration_ms: u64,
extraction_duration_ms: Option<u64>,
throughput_bps: f64,
memory_bytes: u64,
) -> BenchmarkResult {
BenchmarkResult {
framework: framework.to_string(),
file_path: PathBuf::from(format!("/tmp/{}.txt", framework)),
file_size: 1024,
success,
error_message: if success { None } else { Some("Test error".to_string()) },
error_kind: if success {
ErrorKind::None
} else {
ErrorKind::HarnessError
},
duration: Duration::from_millis(duration_ms),
extraction_duration: extraction_duration_ms.map(Duration::from_millis),
subprocess_overhead: extraction_duration_ms.map(|ed| Duration::from_millis(duration_ms.saturating_sub(ed))),
metrics: PerformanceMetrics {
peak_memory_bytes: memory_bytes,
avg_cpu_percent: 50.0,
throughput_bytes_per_sec: throughput_bps,
p50_memory_bytes: memory_bytes,
p95_memory_bytes: memory_bytes,
p99_memory_bytes: memory_bytes,
},
quality: None,
iterations: vec![],
statistics: None,
cold_start_duration: None,
file_extension: "txt".to_string(),
framework_capabilities: FrameworkCapabilities::default(),
pdf_metadata: None,
ocr_status: OcrStatus::Unknown,
extracted_text: None,
output_format: OutputFormat::Markdown,
}
}
#[test]
fn test_write_json() {
    let temp_dir = TempDir::new().unwrap();
    let output_path = temp_dir.path().join("results.json");

    // Start from the shared fixture and override the fields that differ:
    // a fixed file path and distinct memory percentiles.
    let mut result = create_benchmark_result("test-framework", true, 1000, None, 1024.0, 10_000_000);
    result.file_path = PathBuf::from("/tmp/test.txt");
    result.metrics.p50_memory_bytes = 8_000_000;
    result.metrics.p95_memory_bytes = 9_500_000;
    result.metrics.p99_memory_bytes = 9_900_000;
    let results = vec![result];

    write_json(&results, &output_path).unwrap();
    assert!(output_path.exists());

    // Round-trip: the written file must parse back into the same number of results.
    let written = fs::read_to_string(&output_path).unwrap();
    let round_tripped: Vec<BenchmarkResult> = serde_json::from_str(&written).unwrap();
    assert_eq!(round_tripped.len(), 1);
    assert_eq!(round_tripped[0].framework, "test-framework");
}
#[test]
fn test_write_json_creates_directory() {
    // Writing into a not-yet-existing subdirectory must create it.
    let temp_dir = TempDir::new().unwrap();
    let nested_path = temp_dir.path().join("subdir/results.json");
    let no_results: Vec<BenchmarkResult> = Vec::new();

    write_json(&no_results, &nested_path).unwrap();

    assert!(nested_path.exists());
    assert!(nested_path.parent().unwrap().exists());
}
// ============================================================================
// Tests for extraction_duration statistics in calculate_framework_stats
// ============================================================================
#[test]
fn test_framework_stats_extraction_duration_all_present() {
    // Every run reports an extraction time, so all three extraction
    // statistics must be populated.
    let runs: Vec<BenchmarkResult> = [(100, 80), (150, 120), (200, 160)]
        .iter()
        .map(|&(wall_ms, extraction_ms)| {
            create_benchmark_result("framework1", true, wall_ms, Some(extraction_ms), 1_000_000.0, 10_000_000)
        })
        .collect();
    let run_refs: Vec<&BenchmarkResult> = runs.iter().collect();

    let stats = calculate_framework_stats(&run_refs);

    assert_eq!(stats.count, 3);
    assert_eq!(stats.successful, 3);
    assert!(stats.p95_extraction_duration_ms.is_some());
    // Average of 80, 120, 160 = 120 ms
    assert!(matches!(stats.avg_extraction_duration_ms, Some(avg) if (avg - 120.0).abs() < 0.1));
    // Median of 80, 120, 160 = 120 ms
    assert!(matches!(stats.median_extraction_duration_ms, Some(median) if (median - 120.0).abs() < 0.1));
}
#[test]
fn test_framework_stats_extraction_duration_all_none() {
    // No run reports an extraction time, so no extraction statistic exists.
    let runs: Vec<BenchmarkResult> = [100, 150, 200]
        .iter()
        .map(|&wall_ms| create_benchmark_result("framework1", true, wall_ms, None, 1_000_000.0, 10_000_000))
        .collect();
    let run_refs: Vec<&BenchmarkResult> = runs.iter().collect();

    let stats = calculate_framework_stats(&run_refs);

    assert_eq!(stats.count, 3);
    assert_eq!(stats.successful, 3);
    assert_eq!(stats.avg_extraction_duration_ms, None);
    assert_eq!(stats.median_extraction_duration_ms, None);
    assert_eq!(stats.p95_extraction_duration_ms, None);
}
#[test]
fn test_framework_stats_extraction_duration_mixed_some_none() {
    // Runs without an extraction time are ignored by the extraction statistics.
    let runs: Vec<BenchmarkResult> = [(100, Some(80)), (150, None), (200, Some(160))]
        .iter()
        .map(|&(wall_ms, extraction_ms)| {
            create_benchmark_result("framework1", true, wall_ms, extraction_ms, 1_000_000.0, 10_000_000)
        })
        .collect();
    let run_refs: Vec<&BenchmarkResult> = runs.iter().collect();

    let stats = calculate_framework_stats(&run_refs);

    assert_eq!(stats.count, 3);
    assert_eq!(stats.successful, 3);
    assert!(stats.median_extraction_duration_ms.is_some());
    // Only 80 and 160 ms, average = 120 ms
    assert!(matches!(stats.avg_extraction_duration_ms, Some(avg) if (avg - 120.0).abs() < 0.1));
}
#[test]
fn test_framework_stats_extraction_duration_filters_nan() {
    // `Duration` cannot represent NaN or infinity, so non-finite inputs cannot
    // be injected here. This test only confirms that ordinary finite values
    // (80, 120, 160) pass through the finite-value filter untouched.
    let runs: Vec<BenchmarkResult> = [(100, 80), (150, 120), (200, 160)]
        .iter()
        .map(|&(wall_ms, extraction_ms)| {
            create_benchmark_result("framework1", true, wall_ms, Some(extraction_ms), 1_000_000.0, 10_000_000)
        })
        .collect();
    let run_refs: Vec<&BenchmarkResult> = runs.iter().collect();

    let stats = calculate_framework_stats(&run_refs);

    assert_eq!(stats.count, 3);
    // All three values are valid (80, 120, 160)
    assert_eq!(stats.avg_extraction_duration_ms, Some(120.0));
}
#[test]
fn test_framework_stats_extraction_duration_empty_results() {
    // An empty input yields zeroed counters/aggregates and no extraction stats.
    let no_runs: [&BenchmarkResult; 0] = [];

    let stats = calculate_framework_stats(&no_runs);

    assert_eq!(stats.count, 0);
    assert_eq!(stats.successful, 0);
    assert_eq!(stats.success_rate, 0.0);
    for wall_clock_stat in [stats.avg_duration_ms, stats.median_duration_ms, stats.p95_duration_ms] {
        assert_eq!(wall_clock_stat, 0.0);
    }
    assert_eq!(stats.avg_extraction_duration_ms, None);
    assert_eq!(stats.median_extraction_duration_ms, None);
    assert_eq!(stats.p95_extraction_duration_ms, None);
}
#[test]
fn test_framework_stats_extraction_duration_only_failed_results() {
    // Failed runs never contribute to the extraction statistics.
    let failures: Vec<BenchmarkResult> = (0..2)
        .map(|_| create_benchmark_result("framework1", false, 0, None, 0.0, 0))
        .collect();
    let failure_refs: Vec<&BenchmarkResult> = failures.iter().collect();

    let stats = calculate_framework_stats(&failure_refs);

    assert_eq!(stats.count, 2);
    assert_eq!(stats.successful, 0);
    assert_eq!(stats.avg_extraction_duration_ms, None);
    assert_eq!(stats.median_extraction_duration_ms, None);
    assert_eq!(stats.p95_extraction_duration_ms, None);
}
#[test]
fn test_framework_stats_extraction_duration_single_value() {
    // With a single observation, mean, median and P95 all collapse onto it.
    let only_run = create_benchmark_result("framework1", true, 100, Some(80), 1_000_000.0, 10_000_000);

    let stats = calculate_framework_stats(&[&only_run]);

    assert_eq!(stats.count, 1);
    assert_eq!(stats.successful, 1);
    assert_eq!(stats.avg_extraction_duration_ms, Some(80.0));
    assert_eq!(stats.median_extraction_duration_ms, Some(80.0));
    assert_eq!(stats.p95_extraction_duration_ms, Some(80.0));
}
#[test]
fn test_framework_stats_success_rate_with_extraction_duration() {
    // Two successes with extraction times plus one failure without any.
    let runs = [
        create_benchmark_result("framework1", true, 100, Some(80), 1_000_000.0, 10_000_000),
        create_benchmark_result("framework1", true, 150, Some(120), 1_000_000.0, 10_000_000),
        create_benchmark_result("framework1", false, 0, None, 0.0, 0),
    ];
    let run_refs: Vec<&BenchmarkResult> = runs.iter().collect();

    let stats = calculate_framework_stats(&run_refs);

    assert_eq!(stats.count, 3);
    assert_eq!(stats.successful, 2);
    assert_eq!(stats.success_rate, 2.0 / 3.0);
    // Only successful results have extraction_duration: average of 80 and 120 = 100
    assert!(matches!(stats.avg_extraction_duration_ms, Some(avg) if (avg - 100.0).abs() < 0.1));
}
#[test]
fn test_framework_stats_large_number_extraction_durations() {
    // 100 runs with wall times 10, 20, ..., 1000 ms and extraction times
    // 8, 16, ..., 800 ms.
    let runs: Vec<BenchmarkResult> = (1..=100u64)
        .map(|step| create_benchmark_result("framework1", true, step * 10, Some(step * 8), 1_000_000.0, 10_000_000))
        .collect();
    let run_refs: Vec<&BenchmarkResult> = runs.iter().collect();

    let stats = calculate_framework_stats(&run_refs);

    assert_eq!(stats.count, 100);
    assert_eq!(stats.successful, 100);
    // Average of 8, 16, 24, ..., 800 = 8*(1+2+...+100)/100 = 8*5050/100 = 404
    let step_total: u64 = (1..=100).sum();
    let expected_avg = 8.0 * step_total as f64 / 100.0;
    assert!((stats.avg_extraction_duration_ms.unwrap() - expected_avg).abs() < 1.0);
    // Median and P95 are only checked for presence.
    assert!(stats.median_extraction_duration_ms.is_some());
    assert!(stats.p95_extraction_duration_ms.is_some());
}
#[test]
fn test_analyze_by_extension_with_extraction_duration() {
    // Integration test: extraction statistics survive the grouping done by
    // analyze_by_extension.
    let runs: Vec<BenchmarkResult> = [(100, 80), (150, 120)]
        .iter()
        .map(|&(wall_ms, extraction_ms)| {
            create_benchmark_result("framework1", true, wall_ms, Some(extraction_ms), 1_000_000.0, 10_000_000)
        })
        .collect();

    let report = analyze_by_extension(&runs);

    let txt_analysis = report.by_extension.get("txt");
    assert!(txt_analysis.is_some());
    let framework1 = txt_analysis.unwrap().framework_stats.get("framework1");
    assert!(framework1.is_some());
    let framework1 = framework1.unwrap();
    assert!(framework1.avg_extraction_duration_ms.is_some());
    assert!(framework1.median_extraction_duration_ms.is_some());
    assert!(framework1.p95_extraction_duration_ms.is_some());
}
#[test]
fn test_analyze_by_extension_mixed_extraction_duration() {
    // Two PDF runs of the same framework; only the first reports an extraction time.
    let as_pdf = |run: BenchmarkResult| BenchmarkResult {
        file_extension: "pdf".to_string(),
        ..run
    };
    let runs = vec![
        as_pdf(create_benchmark_result("framework1", true, 100, Some(80), 1_000_000.0, 10_000_000)),
        as_pdf(create_benchmark_result("framework1", true, 150, None, 1_000_000.0, 10_000_000)),
    ];

    let report = analyze_by_extension(&runs);

    assert!(report.by_extension.contains_key("pdf"));
    let pdf_stats = &report.by_extension["pdf"].framework_stats["framework1"];
    // Should have extraction_duration stats (only from the run that has Some)
    assert_eq!(pdf_stats.avg_extraction_duration_ms, Some(80.0));
}
}

View File

@@ -0,0 +1,545 @@
//! Pipeline benchmark: exhaustive quality + timing comparison across
//! all extraction configurations on the full document corpus.
//!
//! The table below lists eight configurations (P1-P8). `default_paths()`
//! returns a six-entry set (presumably P1-P6; confirm against the
//! `Pipeline` enum).
//!
//! | ID | Name                    | Config                                                          |
//! |----|-------------------------|-----------------------------------------------------------------|
//! | P1 | native                  | output_format: Markdown                                         |
//! | P2 | native+layout           | output_format: Markdown, layout: fast                           |
//! | P3 | tesseract               | output_format: Markdown, ocr: tesseract, force                  |
//! | P4 | tesseract+layout        | P3 + layout: fast                                               |
//! | P5 | paddleocr               | output_format: Markdown, ocr: paddleocr, force (mobile default) |
//! | P6 | paddleocr+layout        | P5 + layout: accurate                                           |
//! | P7 | paddleocr-server        | P5 + model_tier: server                                         |
//! | P8 | paddleocr-server+layout | P7 + layout: accurate                                           |
use crate::Result;
use crate::comparison::{Pipeline, PipelineResult};
use crate::corpus::{self, CorpusDocument, CorpusFilter};
use crate::markdown_quality::{MdBlockType, parse_markdown_blocks, score_structural_quality_normalized};
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::path::{Path, PathBuf};
/// Which pipeline paths to include.
///
/// Configuration consumed by `run_pipeline_benchmark`. Fields that function
/// does not read are noted below.
pub struct PipelineBenchmarkConfig {
/// Root directory of the fixtures; the corpus is built from it and it is
/// passed on to each extraction.
pub fixtures_dir: PathBuf,
/// Pipelines to run for every document, in this order.
pub paths: Vec<Pipeline>,
/// Name patterns forwarded to the corpus filter (`CorpusFilter::name_patterns`).
pub doc_filter: Vec<String>,
/// When true, extracted outputs and ground truth are written (best effort)
/// under `/tmp/kreuzberg_pipeline/<document name>/`.
pub dump_outputs: bool,
// NOTE(review): not read by run_pipeline_benchmark; presumably the destination
// of a JSON run summary written by the caller. Confirm.
pub json_output: Option<PathBuf>,
// NOTE(review): not read by run_pipeline_benchmark; presumably forwarded by the
// caller to the `sort_by` parameter of the print functions. Confirm.
pub sort_by: SortMetric,
// NOTE(review): not read by run_pipeline_benchmark; presumably forwarded by the
// caller to the `bottom_n` parameter of the print functions. Confirm.
pub bottom_n: Option<usize>,
// NOTE(review): not read by run_pipeline_benchmark; presumably selects whether
// the caller also prints the per-block triage breakdown. Confirm.
pub triage_blocks: bool,
}
/// Metric to sort by in triage view.
///
/// The sort key for each variant is produced by `SortMetric::extract`;
/// `SortMetric::parse` maps the strings "sf1", "tf1" and "time" to variants.
#[derive(Debug, Clone, Copy, Default)]
pub enum SortMetric {
/// Sort on `PipelineResult::sf1` (the default).
#[default]
Sf1,
/// Sort on `PipelineResult::tf1`.
Tf1,
/// Sort on `PipelineResult::time_ms`, negated so that an ascending sort
/// puts the slowest documents first.
Time,
}
impl SortMetric {
    /// Parse a metric name ("sf1", "tf1" or "time"); any other string yields `None`.
    ///
    /// Matching is exact and case-sensitive.
    pub fn parse(s: &str) -> Option<Self> {
        const NAMES: [(&str, SortMetric); 3] = [
            ("sf1", SortMetric::Sf1),
            ("tf1", SortMetric::Tf1),
            ("time", SortMetric::Time),
        ];
        NAMES
            .iter()
            .find(|(name, _)| *name == s)
            .map(|&(_, metric)| metric)
    }

    /// Sort key of `pr` for this metric, arranged so that smaller means "worse".
    ///
    /// Scores are returned as they are. Times are negated (ascending sort =
    /// slowest first), and a NaN time maps to negative infinity.
    fn extract(&self, pr: &PipelineResult) -> f64 {
        match *self {
            SortMetric::Sf1 => pr.sf1,
            SortMetric::Tf1 => pr.tf1,
            SortMetric::Time if pr.time_ms.is_nan() => f64::NEG_INFINITY,
            SortMetric::Time => -pr.time_ms,
        }
    }
}
/// Result for one document across all selected pipeline paths.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PipelineDocResult {
/// Document name, copied from the corpus document.
pub name: String,
/// File type, copied from the corpus document.
pub file_type: String,
/// File size, copied from the corpus document (presumably in bytes; confirm).
pub file_size: u64,
/// One entry per configured pipeline, in the order of the configured paths.
pub results: Vec<PipelineResult>,
}
/// Per-pipeline aggregate statistics.
///
/// NOTE(review): values of this type are constructed outside the code visible
/// here; the field meanings below are inferred from the field names. Confirm
/// against the code that fills them.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PipelineAggregate {
// Presumably the pipeline's display name.
pub pipeline: String,
// Presumably means over the scored documents.
pub mean_sf1: f64,
pub mean_tf1: f64,
pub mean_time_ms: f64,
// Presumably 50th percentiles (medians).
pub p50_sf1: f64,
pub p50_tf1: f64,
pub p50_time_ms: f64,
// Presumably the 90th percentile of extraction time.
pub p90_time_ms: f64,
}
/// Full benchmark run summary for JSON serialization.
///
/// NOTE(review): values of this type are constructed outside the code visible
/// here; the formats of `timestamp` and `git_sha` are not established by this
/// file section. Confirm against the code that fills them.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PipelineRunSummary {
pub timestamp: String,
pub git_sha: String,
// Presumably the lengths of `docs` and of the pipeline set, respectively.
pub doc_count: usize,
pub pipeline_count: usize,
/// Aggregate statistics, one `PipelineAggregate` per entry.
pub aggregates: Vec<PipelineAggregate>,
/// Per-document results.
pub docs: Vec<PipelineDocResult>,
}
/// The six pipelines benchmarked by default, in execution order:
/// baseline, layout, Tesseract, Tesseract+layout, Paddle and Paddle+layout.
pub fn default_paths() -> Vec<Pipeline> {
    Vec::from([
        Pipeline::Baseline,
        Pipeline::Layout,
        Pipeline::Tesseract,
        Pipeline::TesseractLayout,
        Pipeline::Paddle,
        Pipeline::PaddleLayout,
    ])
}
/// Run one pipeline on one document and score the output against ground truth.
///
/// The text score (TF1) comes from `comparison::score_document`. The
/// structural scores (SF1, order, per-type F1) use heading-level-normalized
/// scoring and are NaN / empty when there is no markdown ground truth or when
/// it contains nothing but paragraphs. A failed extraction is scored as empty
/// content. The token diffs are truncated to 50 entries each.
async fn extract_and_score(
    pipeline: Pipeline,
    doc: &CorpusDocument,
    gt_text: &str,
    gt_markdown: Option<&str>,
    fixtures_dir: &Path,
) -> PipelineResult {
    // Upper bound (in bytes) on the content handed to structural scoring,
    // to keep scoring time bounded.
    const MAX_SCORED_BYTES: usize = 50_000;

    let (extracted, time_ms) = crate::comparison::extract_pipeline(pipeline, doc, fixtures_dir).await;
    let content = extracted.unwrap_or_default();

    // Only the text F1 of the basic scorer is used; its structural numbers are
    // superseded by the normalized scoring below.
    let (tf1, _, _, _) = crate::comparison::score_document(&content, gt_text, gt_markdown);

    // Ground truth that is all paragraphs carries no structure to compare
    // against and would only produce meaningless 0% scores.
    let structured_gt = gt_markdown.filter(|&md| {
        parse_markdown_blocks(md)
            .iter()
            .any(|block| !matches!(block.block_type, MdBlockType::Paragraph))
    });

    let (sf1, order_score, per_type_sf1) = match structured_gt {
        None => (f64::NAN, f64::NAN, HashMap::new()),
        Some(md) => {
            // Largest char boundary at or below the cap, so the slice is valid UTF-8.
            let scored_len = if content.len() > MAX_SCORED_BYTES {
                (0..=MAX_SCORED_BYTES)
                    .rev()
                    .find(|&idx| content.is_char_boundary(idx))
                    .unwrap_or(0)
            } else {
                content.len()
            };
            // Use heading-level-normalized scoring (H1≡H2≡H3 etc.)
            let quality = score_structural_quality_normalized(&content[..scored_len], md);
            let per_type: HashMap<String, f64> = quality
                .per_type
                .iter()
                .map(|(block_type, score)| (block_type.to_string(), score.f1))
                .collect();
            (quality.structural_f1, quality.order_score, per_type)
        }
    };

    let extracted_tokens = crate::quality::tokenize(&content);
    let expected_tokens = crate::quality::tokenize(gt_text);
    let (mut missing_tokens, mut extra_tokens) =
        crate::quality::compute_token_diff(&extracted_tokens, &expected_tokens);
    missing_tokens.truncate(50);
    extra_tokens.truncate(50);

    PipelineResult {
        pipeline,
        sf1,
        tf1,
        order_score,
        per_type_sf1,
        time_ms,
        missing_tokens,
        extra_tokens,
        content,
    }
}
/// Run the pipeline benchmark.
///
/// Builds the corpus of documents that have ground truth (optionally narrowed
/// by `config.doc_filter`), runs every pipeline in `config.paths` on every
/// document and returns one `PipelineDocResult` per document. Progress is
/// reported on stderr.
///
/// Unreadable ground-truth files only produce a warning: missing text is
/// treated as empty, and missing markdown disables structural scoring.
///
/// When `config.dump_outputs` is set, each pipeline's output and the ground
/// truth are written under `/tmp/kreuzberg_pipeline/<document name>/`. Dumping
/// is best effort, so write failures are ignored. The ground-truth files are
/// written once per document rather than once per pipeline.
///
/// # Errors
/// Returns an error if the corpus cannot be built.
pub async fn run_pipeline_benchmark(config: &PipelineBenchmarkConfig) -> Result<Vec<PipelineDocResult>> {
    let filter = CorpusFilter {
        file_types: None, // All formats with ground truth
        require_ground_truth: true,
        name_patterns: config.doc_filter.clone(),
        ..Default::default()
    };
    let docs = corpus::build_corpus(&config.fixtures_dir, &filter)?;
    eprintln!(
        "Pipeline benchmark: {} documents, {} paths",
        docs.len(),
        config.paths.len()
    );

    let dump_dir = if config.dump_outputs {
        let dir = PathBuf::from("/tmp/kreuzberg_pipeline");
        let _ = std::fs::create_dir_all(&dir);
        Some(dir)
    } else {
        None
    };

    let total = docs.len();
    let mut results = Vec::with_capacity(total);
    for (idx, doc) in docs.iter().enumerate() {
        eprint!("\r[{}/{}] {} ...", idx + 1, total, doc.name);

        let gt_text = match doc.ground_truth_text.as_ref() {
            Some(p) => match std::fs::read_to_string(p) {
                Ok(s) => s,
                Err(e) => {
                    eprintln!("Warning: failed to read ground truth text {}: {}", p.display(), e);
                    String::new()
                }
            },
            None => String::new(),
        };
        let gt_markdown = match doc.ground_truth_markdown.as_ref() {
            Some(p) => match std::fs::read_to_string(p) {
                Ok(s) => Some(s),
                Err(e) => {
                    eprintln!("Warning: failed to read ground truth markdown {}: {}", p.display(), e);
                    None
                }
            },
            None => None,
        };

        // Prepare the per-document dump directory and write the ground truth
        // once. Nothing is created when no pipeline will run for the document.
        let doc_dump_dir = match dump_dir.as_ref() {
            Some(dir) if !config.paths.is_empty() => {
                let doc_dir = dir.join(&doc.name);
                let _ = std::fs::create_dir_all(&doc_dir);
                if let Some(gt_md) = gt_markdown.as_ref() {
                    let _ = std::fs::write(doc_dir.join("ground_truth.md"), gt_md);
                }
                let _ = std::fs::write(doc_dir.join("ground_truth_text.txt"), &gt_text);
                Some(doc_dir)
            }
            _ => None,
        };

        let mut pipeline_results = Vec::with_capacity(config.paths.len());
        for &pipeline in &config.paths {
            let pr = extract_and_score(pipeline, doc, &gt_text, gt_markdown.as_deref(), &config.fixtures_dir).await;
            if let Some(doc_dir) = doc_dump_dir.as_ref() {
                let _ = std::fs::write(doc_dir.join(format!("{}.md", pipeline.name())), &pr.content);
            }
            pipeline_results.push(pr);
        }

        // SF1 is NaN for documents that were deliberately not scored. Report
        // those as "n/a" instead of a misleading 0%.
        let scored_sf1: Vec<f64> = pipeline_results
            .iter()
            .map(|r| r.sf1)
            .filter(|v| !v.is_nan())
            .collect();
        let sf1_label = if scored_sf1.is_empty() {
            "n/a".to_string()
        } else {
            let best_sf1 = scored_sf1.iter().copied().fold(0.0_f64, f64::max);
            format!("{:.0}%", best_sf1 * 100.0)
        };
        let best_time = pipeline_results
            .iter()
            .map(|r| r.time_ms)
            .filter(|t| !t.is_nan())
            .fold(f64::INFINITY, f64::min);

        if best_time.is_infinite() {
            eprintln!("\r[{}/{}] {:<30} SF1:{}", idx + 1, total, doc.name, sf1_label);
        } else {
            eprintln!(
                "\r[{}/{}] {:<30} SF1:{} {:.0}ms",
                idx + 1,
                total,
                doc.name,
                sf1_label,
                best_time
            );
        }

        results.push(PipelineDocResult {
            name: doc.name.clone(),
            file_type: doc.file_type.clone(),
            file_size: doc.file_size,
            results: pipeline_results,
        });
    }
    Ok(results)
}
/// Print a per-document + aggregate matrix table to stderr.
///
/// One row is printed per document with, for every pipeline, its SF1 and TF1
/// scores (as percentages) and its time in milliseconds, followed by an
/// `AVERAGE` row. NaN values are rendered as a padded `N/A` cell and are left
/// out of the averages.
///
/// # Arguments
/// * `results` - Per-document results; the pipeline columns are taken from the first document
/// * `sort_by` - Metric used to rank documents when `bottom_n` is set
/// * `bottom_n` - When `Some(n)`, only the `n` documents with the lowest worst-case
///   score are listed; the averages always cover every document
///
/// # Panics
/// Panics if a document has fewer pipeline results than the first document.
pub fn print_pipeline_table(results: &[PipelineDocResult], sort_by: SortMetric, bottom_n: Option<usize>) {
    // Mean of the non-NaN values, or `None` when nothing is left to average.
    fn mean_ignoring_nan(values: impl Iterator<Item = f64>) -> Option<f64> {
        let kept: Vec<f64> = values.filter(|v| !v.is_nan()).collect();
        if kept.is_empty() {
            None
        } else {
            Some(kept.iter().sum::<f64>() / kept.len() as f64)
        }
    }

    // 8-column percentage cell. NaN becomes a padded `N/A` so the columns stay aligned
    // with the `{:>8}` header cells.
    fn percent_cell(score: f64) -> String {
        if score.is_nan() {
            format!("{:>8}", "N/A")
        } else {
            format!("{:>7.1}%", score * 100.0)
        }
    }

    if results.is_empty() {
        eprintln!("No results.");
        return;
    }

    // Optionally sort and truncate for triage view
    let display_results: Vec<&PipelineDocResult> = match bottom_n {
        Some(n) => {
            let mut sorted: Vec<&PipelineDocResult> = results.iter().collect();
            // Sort by the worst (min) score across all pipelines for the chosen metric
            sorted.sort_by(|a, b| {
                let a_worst = a
                    .results
                    .iter()
                    .map(|pr| sort_by.extract(pr))
                    .fold(f64::INFINITY, f64::min);
                let b_worst = b
                    .results
                    .iter()
                    .map(|pr| sort_by.extract(pr))
                    .fold(f64::INFINITY, f64::min);
                a_worst.partial_cmp(&b_worst).unwrap_or(std::cmp::Ordering::Equal)
            });
            sorted.into_iter().take(n).collect()
        }
        None => results.iter().collect(),
    };

    let pipelines: Vec<&str> = results[0].results.iter().map(|r| r.pipeline.name()).collect();
    // 36 = document (30) + space + type (5); 26 = three cells of 8/8/7 plus separators.
    let rule = "-".repeat(36 + pipelines.len() * 26);

    // Header
    eprint!("{:<30} {:>5}", "Document", "Type");
    for p in &pipelines {
        eprint!(" {:>8} {:>8} {:>7}", format!("{} SF1", p), "TF1", "ms");
    }
    eprintln!();
    eprintln!("{}", rule);

    for doc in &display_results {
        // Truncate on a character boundary: slicing at a fixed byte offset panics
        // when the 29th byte falls inside a multi-byte character.
        let shown_name = match doc.name.char_indices().nth(29) {
            Some((cut, _)) => &doc.name[..cut],
            None => doc.name.as_str(),
        };
        eprint!("{:<30} {:>5}", shown_name, &doc.file_type);
        for pr in &doc.results {
            let time_str = if pr.time_ms.is_nan() {
                format!("{:>7}", "N/A")
            } else {
                format!("{:>7.0}", pr.time_ms)
            };
            eprint!(" {} {} {}", percent_cell(pr.sf1), percent_cell(pr.tf1), time_str);
        }
        eprintln!();
    }

    // Averages (always over all results, not just displayed)
    let total_docs = results.len();
    eprintln!("{}", rule);
    eprint!("{:<30} {:>5}", "AVERAGE", "");
    for i in 0..pipelines.len() {
        let sf1 = mean_ignoring_nan(results.iter().map(|r| r.results[i].sf1)).unwrap_or(0.0);
        let tf1 = mean_ignoring_nan(results.iter().map(|r| r.results[i].tf1)).unwrap_or(0.0);
        match mean_ignoring_nan(results.iter().map(|r| r.results[i].time_ms)) {
            Some(ms) => eprint!(" {:>7.1}% {:>7.1}% {:>7.0}", sf1 * 100.0, tf1 * 100.0, ms),
            None => eprint!(" {:>7.1}% {:>7.1}% {:>7}", sf1 * 100.0, tf1 * 100.0, "N/A"),
        }
    }
    eprintln!();

    // Report how many docs were excluded from SF1 average (judged on the first pipeline).
    // `first()` keeps a document with no pipeline results from panicking here.
    let sf1_excluded = results
        .iter()
        .filter(|r| r.results.first().map_or(false, |pr| pr.sf1.is_nan()))
        .count();
    if sf1_excluded > 0 {
        eprintln!(
            " (SF1 averaged over {}/{} docs; {} paragraph-only docs excluded)",
            total_docs - sf1_excluded,
            total_docs,
            sf1_excluded
        );
    }
}
/// Print the per-block-type SF1 breakdown of the weakest documents to stderr.
///
/// Documents are ranked by the lowest value of `sort_by` across their pipelines
/// (ascending), and the first `bottom_n` are listed. For each pipeline of a listed
/// document the overall SF1 is printed, followed by the SF1 of every block type
/// that has an entry in `per_type_sf1`.
///
/// Prints nothing when `results` is empty.
pub fn print_triage_blocks(results: &[PipelineDocResult], sort_by: SortMetric, bottom_n: usize) {
    const BLOCK_TYPES: [&str; 7] = ["H1", "H2", "H3", "Table", "Code", "ListItem", "Paragraph"];

    if results.is_empty() {
        return;
    }

    // Lowest score any pipeline reached on a document for the chosen metric.
    let worst_score = |doc: &PipelineDocResult| -> f64 {
        let mut lowest = f64::INFINITY;
        for pr in &doc.results {
            lowest = lowest.min(sort_by.extract(pr));
        }
        lowest
    };

    // Rank ascending so the weakest documents come first.
    let mut ranked: Vec<&PipelineDocResult> = results.iter().collect();
    ranked.sort_by(|lhs, rhs| {
        worst_score(*lhs)
            .partial_cmp(&worst_score(*rhs))
            .unwrap_or(std::cmp::Ordering::Equal)
    });

    eprintln!("\nPer-block-type F1 breakdown (bottom {} documents):", bottom_n);
    for doc in ranked.iter().take(bottom_n) {
        eprintln!("\n {}", doc.name);
        for pr in &doc.results {
            // Only block types that have a recorded score are shown.
            let mut cells: Vec<String> = Vec::new();
            for bt in BLOCK_TYPES.iter() {
                if let Some(v) = pr.per_type_sf1.get(*bt) {
                    cells.push(format!("{}:{:.0}%", bt, v * 100.0));
                }
            }
            let blocks_str = cells.join(" ");
            eprintln!(
                " {:<18} SF1:{:.0}% {}",
                pr.pipeline.name(),
                pr.sf1 * 100.0,
                blocks_str
            );
        }
    }
}
/// Nearest-rank percentile of an ascending-sorted slice.
///
/// `p` is a fraction (0.5 = median). The rank is `p * (len - 1)` rounded to the
/// nearest index and clamped to the last element. Returns `0.0` for an empty slice.
fn percentile(sorted: &[f64], p: f64) -> f64 {
    match sorted.len() {
        0 => 0.0,
        len => {
            let last = len - 1;
            let rank = (p * last as f64).round() as usize;
            sorted[rank.min(last)]
        }
    }
}
/// Compute per-pipeline aggregate statistics.
///
/// The set of pipelines is taken from the first document. For each pipeline the
/// SF1, TF1 and time values of all documents are collected with NaN entries
/// dropped, so a single missing measurement cannot turn a mean into NaN or
/// disturb the sort that the percentiles rely on.
///
/// * `mean_sf1` / `mean_tf1` are `0.0` when no valid value exists.
/// * `mean_time_ms` is NaN when no valid timing exists.
/// * Percentiles come from `percentile`, which yields `0.0` for an empty series.
///
/// Returns an empty vector when `results` is empty.
///
/// # Panics
/// Panics if a document has fewer pipeline results than the first document.
pub fn compute_aggregates(results: &[PipelineDocResult]) -> Vec<PipelineAggregate> {
    // Drop NaN values and sort the rest ascending, ready for mean and percentile.
    fn sorted_non_nan(values: impl Iterator<Item = f64>) -> Vec<f64> {
        let mut kept: Vec<f64> = values.filter(|v| !v.is_nan()).collect();
        kept.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
        kept
    }

    // Arithmetic mean, or `fallback` for an empty series.
    fn mean_or(values: &[f64], fallback: f64) -> f64 {
        if values.is_empty() {
            fallback
        } else {
            values.iter().sum::<f64>() / values.len() as f64
        }
    }

    let first = match results.first() {
        Some(doc) => doc,
        None => return Vec::new(),
    };

    first
        .results
        .iter()
        .enumerate()
        .map(|(i, head)| {
            // SF1 is NaN for docs without structural ground truth; TF1 and time are
            // filtered the same way so all three series are treated consistently.
            let sf1s = sorted_non_nan(results.iter().map(|r| r.results[i].sf1));
            let tf1s = sorted_non_nan(results.iter().map(|r| r.results[i].tf1));
            let times = sorted_non_nan(results.iter().map(|r| r.results[i].time_ms));

            PipelineAggregate {
                pipeline: head.pipeline.name().to_string(),
                mean_sf1: mean_or(&sf1s, 0.0),
                mean_tf1: mean_or(&tf1s, 0.0),
                mean_time_ms: mean_or(&times, f64::NAN),
                p50_sf1: percentile(&sf1s, 0.5),
                p50_tf1: percentile(&tf1s, 0.5),
                p50_time_ms: percentile(&times, 0.5),
                p90_time_ms: percentile(&times, 0.9),
            }
        })
        .collect()
}
/// Assemble the serializable summary of a benchmark run.
///
/// Records the current UTC time (RFC 3339), the short git SHA of `HEAD` as
/// reported by `git rev-parse --short HEAD` (an empty string when the command
/// cannot be run or its output is not UTF-8), the document and pipeline counts,
/// the per-pipeline aggregates, and a copy of every per-document result.
pub fn build_summary(results: &[PipelineDocResult]) -> PipelineRunSummary {
    let git_output = std::process::Command::new("git")
        .args(["rev-parse", "--short", "HEAD"])
        .output();
    let git_sha = match git_output {
        Ok(out) => match String::from_utf8(out.stdout) {
            Ok(text) => text.trim().to_string(),
            Err(_) => String::new(),
        },
        Err(_) => String::new(),
    };

    let timestamp = chrono::Utc::now().to_rfc3339();
    // The pipeline count is taken from the first document; zero when there are none.
    let pipeline_count = results.first().map_or(0, |doc| doc.results.len());

    PipelineRunSummary {
        timestamp,
        git_sha,
        doc_count: results.len(),
        pipeline_count,
        aggregates: compute_aggregates(results),
        docs: results.to_vec(),
    }
}
/// Write the run summary to a JSON file.
pub fn write_json_output(results: &[PipelineDocResult], path: &std::path::Path) -> Result<()> {
let summary = build_summary(results);
if let Some(parent) = path.parent() {
std::fs::create_dir_all(parent).map_err(crate::Error::Io)?;
}
let json = serde_json::to_string_pretty(&summary)
.map_err(|e| crate::Error::Benchmark(format!("Failed to serialize: {}", e)))?;
std::fs::write(path, json).map_err(crate::Error::Io)?;
eprintln!("JSON output written to: {}", path.display());
Ok(())
}

View File

@@ -0,0 +1,134 @@
//! Pool metrics collection and reporting
//!
//! This module provides infrastructure for collecting and reporting metrics
//! from pool operations during document extraction, helping to identify
//! allocation patterns and pool efficiency.
use std::collections::HashMap;
use std::fs;
use std::path::Path;
/// Aggregate metrics for a single file extraction
///
/// One entry per extracted file; `PoolMetricsReport` summarises a list of these.
#[derive(Debug, Clone)]
pub struct FilePoolMetrics {
/// File name shown in the printed summary and emitted as `file_name` in JSON
pub file_name: String,
/// MIME type of the file, emitted as `mime_type` in JSON
pub mime_type: String,
/// File size, printed with a "bytes" unit in the summary
pub file_size: usize,
/// String-pool acquire count, emitted as `string_pool.total_acquires`
pub string_pool_acquires: usize,
/// String-pool reuse count, emitted as `string_pool.total_reuses`
pub string_pool_reuses: usize,
/// String-pool hit rate as a percentage (bucketed at 25/50/75/90 and emitted as
/// `hit_rate_percent`)
pub string_pool_hit_rate: f64,
}
/// Aggregate metrics for all extractions
///
/// Built with `PoolMetricsReport::from_files`, which derives the summary fields
/// from the per-file hit rates.
#[derive(Debug, Clone)]
pub struct PoolMetricsReport {
/// Number of entries in `files`
pub total_files: usize,
/// Per-file metrics the summary was computed from
pub files: Vec<FilePoolMetrics>,
/// Mean of the per-file string-pool hit rates (0.0 when there are no files)
pub average_hit_rate: f64,
/// Lowest per-file string-pool hit rate
pub min_hit_rate: f64,
/// Highest per-file string-pool hit rate
pub max_hit_rate: f64,
}
impl PoolMetricsReport {
    /// Calculate overall statistics from individual file metrics
    ///
    /// The average, minimum and maximum are taken over the per-file string-pool
    /// hit rates. For an empty `files` list all three are `0.0`, so the report
    /// never carries an infinite value into the printed summary or the JSON output.
    pub fn from_files(files: Vec<FilePoolMetrics>) -> Self {
        let total_files = files.len();
        let (average_hit_rate, min_hit_rate, max_hit_rate) = if files.is_empty() {
            (0.0, 0.0, 0.0)
        } else {
            let rates = || files.iter().map(|f| f.string_pool_hit_rate);
            (
                rates().sum::<f64>() / total_files as f64,
                rates().fold(f64::INFINITY, f64::min),
                rates().fold(0.0, f64::max),
            )
        };
        PoolMetricsReport {
            total_files,
            files,
            average_hit_rate,
            min_hit_rate,
            max_hit_rate,
        }
    }

    /// Serialize to JSON format
    ///
    /// The document has three sections: `metadata` (format version and the local
    /// timestamp at serialization time), `summary` (file count and hit-rate
    /// statistics) and `files` (one object per file with its string-pool counters).
    ///
    /// # Errors
    /// Returns the `serde_json` error if serialization fails.
    pub fn to_json(&self) -> Result<String, serde_json::Error> {
        serde_json::to_string_pretty(&serde_json::json!({
            "metadata": {
                "version": "1.0",
                "timestamp": chrono::Local::now().to_rfc3339(),
            },
            "summary": {
                "total_files": self.total_files,
                "average_hit_rate": self.average_hit_rate,
                "min_hit_rate": self.min_hit_rate,
                "max_hit_rate": self.max_hit_rate,
            },
            "files": self.files.iter().map(|f| serde_json::json!({
                "file_name": f.file_name,
                "mime_type": f.mime_type,
                "file_size": f.file_size,
                "string_pool": {
                    "total_acquires": f.string_pool_acquires,
                    "total_reuses": f.string_pool_reuses,
                    "hit_rate_percent": f.string_pool_hit_rate,
                }
            })).collect::<Vec<_>>(),
        }))
    }

    /// Write report to file
    ///
    /// # Errors
    /// Returns an error if serialization or the file write fails.
    pub fn write_to_file<P: AsRef<Path>>(&self, path: P) -> Result<(), Box<dyn std::error::Error>> {
        let json = self.to_json()?;
        fs::write(path, json)?;
        Ok(())
    }

    /// Print human-readable summary
    ///
    /// Writes to stdout: the file count, the average/min/max hit rate, a
    /// histogram of files per hit-rate bucket, and the five files with the
    /// lowest hit rate.
    pub fn print_summary(&self) {
        println!("\n=== Pool Metrics Report ===");
        println!("Total files analyzed: {}", self.total_files);
        println!(
            "Hit rate (avg): {:.2}% (min: {:.2}%, max: {:.2}%)",
            self.average_hit_rate, self.min_hit_rate, self.max_hit_rate
        );

        let mut ranges = HashMap::new();
        for file in &self.files {
            let range = if file.string_pool_hit_rate < 25.0 {
                "0-25%"
            } else if file.string_pool_hit_rate < 50.0 {
                "25-50%"
            } else if file.string_pool_hit_rate < 75.0 {
                "50-75%"
            } else if file.string_pool_hit_rate < 90.0 {
                "75-90%"
            } else {
                "90%+"
            };
            *ranges.entry(range).or_insert(0) += 1;
        }

        println!("\nHit rate distribution:");
        // Fixed order so the buckets print ascending regardless of map iteration order.
        for range in &["0-25%", "25-50%", "50-75%", "75-90%", "90%+"] {
            let count = ranges.get(range).unwrap_or(&0);
            println!(" {}: {} files", range, count);
        }

        println!("\nBottom 5 performers (lowest hit rate):");
        // Sort references instead of cloning every entry (and its strings).
        let mut sorted: Vec<&FilePoolMetrics> = self.files.iter().collect();
        sorted.sort_by(|a, b| {
            a.string_pool_hit_rate
                .partial_cmp(&b.string_pool_hit_rate)
                .unwrap_or(std::cmp::Ordering::Equal)
        });
        for file in sorted.iter().take(5) {
            println!(
                " {} ({:.2}% hit rate, {} bytes)",
                file.file_name, file.string_pool_hit_rate, file.file_size
            );
        }
    }
}

View File

@@ -0,0 +1,963 @@
//! Comprehensive profiling report generation with hotspot analysis
//!
//! This module provides infrastructure for generating detailed profiling reports from
//! CPU profile data. Reports include top function hotspots, memory trajectory analysis,
//! actionable recommendations, and sample quality metrics.
//!
//! # Report Components
//!
//! - **Summary Statistics**: Sample count, profiling duration, effective sampling frequency
//! - **Top Hotspots**: Top 10 functions by sample count with percentages
//! - **Memory Trajectory**: Memory usage snapshots over profiling duration (when available)
//! - **Recommendations**: Actionable insights based on sample quality and profiling data
//!
//! # Sample Quality Guidelines
//!
//! - **< 100 samples**: Profile may have high variance, increase duration or frequency
//! - **100-499 samples**: Acceptable for basic analysis, consider longer runs
//! - **500-999 samples**: Good quality profile with reliable hotspot identification
//! - **1000+ samples**: Excellent quality with strong statistical confidence
//!
//! # HTML Report Format
//!
//! Reports are generated as self-contained HTML documents with inline CSS, requiring
//! no external dependencies. The HTML is viewable in any modern web browser.
#[cfg(feature = "profiling")]
use crate::profiling::ProfilingResult;
use std::time::Duration;
/// Comprehensive profiling report with hotspot analysis
///
/// Contains aggregated profiling metrics, top functions, and analysis recommendations
/// suitable for performance optimization decisions.
///
/// `ProfileReport::default()` yields an empty report (zero samples, zero duration,
/// no hotspots, snapshots or recommendations).
#[derive(Debug, Clone)]
pub struct ProfileReport {
/// Total number of CPU samples collected
pub sample_count: usize,
/// Total profiling duration
pub duration: Duration,
/// Effective sampling frequency (samples collected per second);
/// `from_profiling_result` sets it to 0.0 when the duration is zero
pub effective_frequency: f64,
/// Top 10 functions by sample count
pub top_hotspots: Vec<Hotspot>,
/// Memory usage trajectory (if available); `from_profiling_result` leaves it empty,
/// and the HTML report omits the memory section when it is empty
pub memory_trajectory: Vec<MemorySnapshot>,
/// Actionable recommendations based on profile quality
pub recommendations: Vec<String>,
}
/// Individual function hotspot identified in the profile
///
/// Represents a function that consumed significant CPU samples during profiling.
/// Each hotspot becomes one row of the HTML hotspots table.
#[derive(Debug, Clone)]
pub struct Hotspot {
/// Function name or symbol (demangled if possible); shown truncated to 50
/// characters in the table, with the full name as the cell tooltip
pub function_name: String,
/// Number of samples attributed to this function
pub samples: usize,
/// Percentage of total samples (0.0-100.0); the table bar is drawn 3 px per
/// percentage point, capped at 300 px
pub percentage: f64,
/// File location if available (filename:line)
pub file_location: Option<String>,
}
/// Memory usage snapshot at a point in time
///
/// Used to track memory growth patterns during profiling. The HTML report
/// currently prints only how many snapshots were collected.
#[derive(Debug, Clone)]
pub struct MemorySnapshot {
/// Relative time from profiling start in milliseconds
pub timestamp_ms: u64,
/// Memory usage in bytes (RSS)
pub memory_bytes: u64,
}
impl Default for ProfileReport {
fn default() -> Self {
Self {
sample_count: 0,
duration: Duration::ZERO,
effective_frequency: 0.0,
top_hotspots: Vec::new(),
memory_trajectory: Vec::new(),
recommendations: Vec::new(),
}
}
}
impl ProfileReport {
/// Build a `ProfileReport` from a finished profiling run.
///
/// Copies the sample count and duration from `result`, derives the effective
/// sampling frequency (samples per second, `0.0` for a zero-length run), and
/// fills in the hotspot list and the quality-based recommendations. The memory
/// trajectory is left empty.
///
/// # Arguments
///
/// * `result` - ProfilingResult from ProfileGuard::finish()
/// * `framework_name` - Name of the framework being profiled (for reporting)
///
/// # Returns
///
/// A ProfileReport with hotspot analysis and recommendations
///
/// # Note
///
/// This function is only available when the `profiling` feature is enabled.
#[cfg(feature = "profiling")]
pub fn from_profiling_result(result: &ProfilingResult, framework_name: &str) -> Self {
    let elapsed_secs = result.duration.as_secs_f64();
    // Guard the division: a zero-length run has no meaningful rate.
    let effective_frequency = if elapsed_secs > 0.0 {
        result.sample_count as f64 / elapsed_secs
    } else {
        0.0
    };

    Self {
        sample_count: result.sample_count,
        duration: result.duration,
        effective_frequency,
        top_hotspots: Self::extract_top_hotspots(&result.report, result.sample_count),
        memory_trajectory: Vec::new(),
        recommendations: Self::generate_recommendations(result.sample_count, framework_name),
    }
}
/// Produce the hotspot list for a profile (placeholder implementation).
///
/// The pprof `Report` is not inspected yet. When any samples were collected, a
/// single placeholder entry is returned that carries all of them at 100%;
/// with zero samples the list is empty.
///
/// # Arguments
///
/// * `_report` - pprof Report containing collected profile data (currently unused)
/// * `total_samples` - Total sample count attributed to the placeholder entry
///
/// # Returns
///
/// An empty vector, or a vector holding the single placeholder hotspot
///
/// Note: real per-function extraction would need sample-level data from pprof,
/// e.g. by parsing its protobuf output or an intermediate report format.
#[cfg(feature = "profiling")]
fn extract_top_hotspots(_report: &pprof::Report, total_samples: usize) -> Vec<Hotspot> {
    match total_samples {
        0 => Vec::new(),
        samples => vec![Hotspot {
            function_name: String::from(
                "[profile data collected - hotspot extraction requires pprof API enhancement]",
            ),
            samples,
            percentage: 100.0,
            file_location: None,
        }],
    }
}
/// Build the list of recommendations for a profile.
///
/// The list always starts with a line naming the framework and the sample
/// count. It is followed by one quality note chosen by sample-count band
/// (two notes for fewer than 50 samples) and, for the `kreuzberg`, `python`
/// and `ruby` frameworks, one framework-specific hint.
///
/// # Arguments
///
/// * `sample_count` - Number of samples collected
/// * `framework_name` - Name of the profiled framework
///
/// # Returns
///
/// Vector of actionable recommendations
#[allow(dead_code)]
fn generate_recommendations(sample_count: usize, framework_name: &str) -> Vec<String> {
    // Quality notes per sample-count band; same bands as the sample quality label.
    let quality_notes: &[&str] = match sample_count {
        0..=49 => &[
            "Very low sample count (<50): Profile may be unreliable. Increase profiling duration \
or sampling frequency for better accuracy.",
            "Consider running the benchmark with amplified iterations (see --profiling-amplification) \
to collect more samples.",
        ],
        50..=99 => &[
            "Low sample count (<100): Profile has high variance. Increase profiling duration or \
consider longer-running benchmarks.",
        ],
        100..=499 => &[
            "Acceptable sample count (100-500): Profile is suitable for basic hotspot identification, \
but confidence in percentages is moderate. Consider longer runs for more precision.",
        ],
        500..=999 => &["Good sample count (500-1000): Profile quality is reliable for identifying hotspots."],
        _ => &[
            "Excellent sample count (1000+): Profile has high statistical confidence. \
Hotspot percentages are reliable for optimization decisions.",
        ],
    };

    // Optional hint for the frameworks that have one.
    let framework_hint: Option<&str> = match framework_name {
        "kreuzberg" => Some(
            "Kreuzberg profile analysis: Focus on PDF parsing (pdf module) and text extraction \
(text module) hotspots.",
        ),
        "python" => Some(
            "Python bindings: High overhead in PyO3 marshalling may appear in hotspots. \
Consider optimizing PyO3 FFI boundary.",
        ),
        "ruby" => Some(
            "Ruby bindings: GIL contention may limit threading performance. \
Verify Magnus FFI overhead in hotspot analysis.",
        ),
        _ => None,
    };

    let mut notes: Vec<String> = Vec::with_capacity(1 + quality_notes.len() + 1);
    notes.push(format!(
        "Profiling data collected for {} framework with {} samples",
        framework_name, sample_count
    ));
    notes.extend(quality_notes.iter().map(|note| note.to_string()));
    notes.extend(framework_hint.map(|hint| hint.to_string()));
    notes
}
/// Render the report as a self-contained HTML page.
///
/// The page embeds its stylesheet inline and contains, in order: the summary
/// statistics table (samples, duration in ms, effective frequency, quality
/// label), the hotspots table, the memory section (only when snapshots exist)
/// and the recommendations list.
///
/// # Returns
///
/// HTML string with the formatted report
pub fn generate_html(&self) -> String {
    // The memory section is omitted entirely when no snapshots were recorded.
    let memory_section = if !self.memory_trajectory.is_empty() {
        self.render_memory_chart()
    } else {
        String::new()
    };

    // Placeholders are positional; the argument order below must match the template.
    format!(
        r#"<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Profiling Report</title>
<style>
{}
</style>
</head>
<body>
<div class="container">
<header class="report-header">
<h1>CPU Profile Report</h1>
<p class="subtitle">Comprehensive hotspot analysis and recommendations</p>
</header>
<section class="summary-stats">
<h2>Profiling Summary</h2>
<table class="stats-table">
<tr>
<td class="stat-label">Total Samples Collected:</td>
<td class="stat-value">{}</td>
</tr>
<tr>
<td class="stat-label">Profiling Duration:</td>
<td class="stat-value">{} ms</td>
</tr>
<tr>
<td class="stat-label">Effective Frequency:</td>
<td class="stat-value">{:.1} samples/sec</td>
</tr>
<tr>
<td class="stat-label">Sample Quality:</td>
<td class="stat-value">{}</td>
</tr>
</table>
</section>
<section class="hotspots-section">
<h2>Top 10 Hotspots</h2>
{}
</section>
{}
<section class="recommendations-section">
<h2>Recommendations</h2>
{}
</section>
<footer class="report-footer">
<p>Generated by Kreuzberg Benchmark Harness</p>
</footer>
</div>
</body>
</html>"#,
        Self::css_styles(),
        self.sample_count,
        self.duration.as_millis(),
        self.effective_frequency,
        self.sample_quality_label(),
        self.render_hotspots_table(),
        memory_section,
        self.render_recommendations()
    )
}
/// Map the sample count to a quality label.
///
/// Bands: below 50 "Very Low", below 100 "Low", below 500 "Acceptable",
/// below 1000 "Good", otherwise "Excellent".
fn sample_quality_label(&self) -> &str {
    let samples = self.sample_count;
    if samples < 50 {
        "Very Low"
    } else if samples < 100 {
        "Low"
    } else if samples < 500 {
        "Acceptable"
    } else if samples < 1000 {
        "Good"
    } else {
        "Excellent"
    }
}
/// Render hotspots table in HTML
fn render_hotspots_table(&self) -> String {
if self.top_hotspots.is_empty() {
return "<p class=\"no-data\">No hotspots captured in profile</p>".to_string();
}
let rows: String = self
.top_hotspots
.iter()
.enumerate()
.map(|(idx, hotspot)| {
let bar_width = (hotspot.percentage * 3.0).min(300.0);
format!(
r#"<tr>
<td class="rank">{}</td>
<td class="function-name" title="{}">{}</td>
<td class="sample-count">{}</td>
<td class="percentage">
<div class="bar-container">
<div class="bar" style="width: {}px"></div>
<span class="percentage-text">{:.1}%</span>
</div>
</td>
</tr>"#,
idx + 1,
hotspot.function_name,
Self::truncate_function_name(&hotspot.function_name, 50),
hotspot.samples,
bar_width,
hotspot.percentage
)
})
.collect();
format!(
r#"<table class="hotspots-table">
<thead>
<tr>
<th class="rank-col">Rank</th>
<th class="function-col">Function</th>
<th class="samples-col">Samples</th>
<th class="percentage-col">Percentage</th>
</tr>
</thead>
<tbody>
{}
</tbody>
</table>"#,
rows
)
}
/// Render the recommendations as an HTML unordered list.
///
/// Each recommendation becomes one HTML-escaped `<li>`. Returns an empty string
/// when there are no recommendations.
fn render_recommendations(&self) -> String {
    if self.recommendations.is_empty() {
        return String::new();
    }

    let mut list = String::from("<ul class=\"recommendations-list\">");
    for rec in &self.recommendations {
        list.push_str("<li>");
        list.push_str(&html_escape(rec));
        list.push_str("</li>");
    }
    list.push_str("</ul>");
    list
}
/// Render the memory trajectory section (placeholder for a future chart).
///
/// Currently reports only how many snapshots were collected. Returns an empty
/// string when there are none.
fn render_memory_chart(&self) -> String {
    let snapshot_count = self.memory_trajectory.len();
    if snapshot_count == 0 {
        return String::new();
    }

    format!(
        r#"<section class="memory-section">
<h2>Memory Trajectory</h2>
<p class="note">Memory profiling data ({} snapshots collected)</p>
</section>"#,
        snapshot_count
    )
}
/// Truncate long function names for display
///
/// Names longer than `max_len` bytes are cut to at most `max_len - 3` bytes and
/// suffixed with `...`; shorter names are returned unchanged.
///
/// The cut is moved back to the nearest character boundary, so names containing
/// multi-byte characters never cause a slicing panic. When `max_len` is smaller
/// than 3 the result is just `...`.
fn truncate_function_name(name: &str, max_len: usize) -> String {
    if name.len() <= max_len {
        return name.to_string();
    }
    // `cut` is below `name.len()` here, and index 0 is always a boundary,
    // so the loop terminates without going out of range.
    let mut cut = max_len.saturating_sub(3);
    while !name.is_char_boundary(cut) {
        cut -= 1;
    }
    format!("{}...", &name[..cut])
}
/// Inline CSS styles for the HTML report
///
/// Self-contained styles requiring no external dependencies.
/// Includes responsive design and print-friendly styles.
///
/// The returned text is inserted verbatim into the `<style>` element of the
/// generated page. Its class names must stay in sync with the markup emitted by
/// the `render_*` methods and `generate_html`.
fn css_styles() -> &'static str {
r#"
* {
margin: 0;
padding: 0;
box-sizing: border-box;
}
body {
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, Oxygen, Ubuntu, Cantarell, sans-serif;
line-height: 1.6;
color: #333;
background: linear-gradient(135deg, #f5f7fa 0%, #c3cfe2 100%);
min-height: 100vh;
padding: 20px;
}
.container {
max-width: 1200px;
margin: 0 auto;
background: white;
border-radius: 8px;
box-shadow: 0 10px 40px rgba(0, 0, 0, 0.1);
overflow: hidden;
}
.report-header {
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
color: white;
padding: 40px 30px;
text-align: center;
}
.report-header h1 {
font-size: 2.5em;
margin-bottom: 10px;
font-weight: 700;
}
.subtitle {
font-size: 1.1em;
opacity: 0.95;
font-weight: 300;
}
section {
padding: 40px 30px;
border-bottom: 1px solid #e0e0e0;
}
section:last-of-type {
border-bottom: none;
}
h2 {
color: #667eea;
font-size: 1.8em;
margin-bottom: 25px;
font-weight: 700;
}
.summary-stats {
background: #f9fafb;
}
.stats-table {
width: 100%;
border-collapse: collapse;
}
.stats-table tr {
border-bottom: 1px solid #e5e7eb;
}
.stats-table tr:last-child {
border-bottom: none;
}
.stat-label {
font-weight: 600;
color: #1f2937;
padding: 12px 16px;
width: 40%;
}
.stat-value {
padding: 12px 16px;
color: #667eea;
font-weight: 500;
font-size: 1.1em;
}
.hotspots-table {
width: 100%;
border-collapse: collapse;
}
.hotspots-table thead {
background: #f0f4ff;
border-bottom: 2px solid #e0e7ff;
}
.hotspots-table th {
padding: 15px;
text-align: left;
font-weight: 600;
color: #667eea;
font-size: 0.95em;
text-transform: uppercase;
letter-spacing: 0.5px;
}
.hotspots-table tbody tr {
border-bottom: 1px solid #e5e7eb;
transition: background 0.2s;
}
.hotspots-table tbody tr:hover {
background: #f9fafb;
}
.hotspots-table td {
padding: 12px 15px;
font-size: 0.95em;
}
.rank {
font-weight: 700;
color: #667eea;
text-align: center;
width: 50px;
}
.rank-col {
width: 50px;
}
.function-col {
width: 40%;
}
.samples-col {
width: 15%;
}
.percentage-col {
width: 35%;
}
.function-name {
font-family: 'Courier New', monospace;
font-size: 0.9em;
color: #1f2937;
word-break: break-all;
}
.sample-count {
font-weight: 500;
color: #764ba2;
}
.percentage {
min-width: 300px;
}
.bar-container {
position: relative;
height: 28px;
display: flex;
align-items: center;
}
.bar {
height: 20px;
background: linear-gradient(90deg, #667eea 0%, #764ba2 100%);
border-radius: 3px;
min-width: 2px;
transition: all 0.2s;
}
.bar-container:hover .bar {
filter: brightness(1.1);
}
.percentage-text {
margin-left: 10px;
font-weight: 600;
color: #764ba2;
font-size: 0.9em;
min-width: 50px;
}
.recommendations-section {
background: #f0fdf4;
}
.recommendations-list {
list-style: none;
margin-left: 0;
}
.recommendations-list li {
padding: 12px 16px;
margin-bottom: 10px;
background: white;
border-left: 4px solid #10b981;
border-radius: 4px;
color: #1f2937;
}
.recommendations-list li:before {
content: "✓ ";
color: #10b981;
font-weight: bold;
margin-right: 8px;
}
.memory-section {
background: #f0f9ff;
}
.note {
color: #666;
font-style: italic;
margin-top: 10px;
}
.no-data {
color: #999;
text-align: center;
padding: 20px;
font-style: italic;
}
.report-footer {
background: #f3f4f6;
text-align: center;
color: #666;
font-size: 0.9em;
padding: 20px !important;
border-top: 1px solid #e5e7eb;
border-bottom: none;
}
@media (max-width: 768px) {
.container {
border-radius: 0;
}
.report-header {
padding: 30px 20px;
}
.report-header h1 {
font-size: 1.8em;
}
section {
padding: 25px 20px;
}
h2 {
font-size: 1.4em;
}
.hotspots-table,
.stats-table {
font-size: 0.9em;
}
.hotspots-table td,
.hotspots-table th,
.stats-table td {
padding: 8px 10px;
}
.function-col {
width: 100%;
}
.percentage-col {
width: 100%;
}
.function-name {
display: block;
margin-bottom: 5px;
}
.percentage {
min-width: auto;
margin-top: 10px;
}
}
@media print {
body {
background: white;
}
.container {
box-shadow: none;
border-radius: 0;
}
.report-header {
page-break-after: avoid;
}
section {
page-break-inside: avoid;
}
}
"#
}
}
/// Escape the five HTML-significant characters in `s`.
///
/// `&`, `<`, `>`, `"` and `'` are replaced by `&amp;`, `&lt;`, `&gt;`, `&quot;`
/// and `&#39;`; every other character is copied through unchanged.
fn html_escape(s: &str) -> String {
    let mut escaped = String::with_capacity(s.len());
    for ch in s.chars() {
        match ch {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            '"' => escaped.push_str("&quot;"),
            '\'' => escaped.push_str("&#39;"),
            other => escaped.push(other),
        }
    }
    escaped
}
// Unit tests for report construction, quality labelling, recommendation text,
// name truncation, HTML escaping and HTML rendering.
#[cfg(test)]
mod tests {
use super::*;
// The default report is completely empty.
#[test]
fn test_profile_report_default() {
let report = ProfileReport::default();
assert_eq!(report.sample_count, 0);
assert_eq!(report.duration, Duration::ZERO);
assert_eq!(report.effective_frequency, 0.0);
assert!(report.top_hotspots.is_empty());
assert!(report.recommendations.is_empty());
}
// One representative sample count per quality band.
#[test]
fn test_sample_quality_label() {
let mut report = ProfileReport {
sample_count: 25,
..Default::default()
};
assert_eq!(report.sample_quality_label(), "Very Low");
report.sample_count = 75;
assert_eq!(report.sample_quality_label(), "Low");
report.sample_count = 250;
assert_eq!(report.sample_quality_label(), "Acceptable");
report.sample_count = 750;
assert_eq!(report.sample_quality_label(), "Good");
report.sample_count = 1500;
assert_eq!(report.sample_quality_label(), "Excellent");
}
// Index 0 is always the header line; the quality notes start at index 1.
#[test]
fn test_generate_recommendations_very_low_samples() {
let recommendations = ProfileReport::generate_recommendations(25, "kreuzberg");
assert!(recommendations.len() >= 3);
assert!(recommendations[1].contains("Very low sample count"));
assert!(recommendations[2].contains("amplified iterations"));
}
#[test]
fn test_generate_recommendations_good_samples() {
let recommendations = ProfileReport::generate_recommendations(750, "kreuzberg");
assert!(recommendations[1].contains("Good sample count"));
}
#[test]
fn test_generate_recommendations_excellent_samples() {
let recommendations = ProfileReport::generate_recommendations(2000, "python");
assert!(recommendations[1].contains("Excellent"));
}
// A truncated name is exactly `max_len` long, ellipsis included (ASCII input).
#[test]
fn test_truncate_function_name() {
let long_name = "this_is_a_very_long_function_name_that_should_be_truncated_for_display";
let truncated = ProfileReport::truncate_function_name(long_name, 30);
assert_eq!(truncated.len(), 30);
assert!(truncated.ends_with("..."));
}
// Names within the limit are returned unchanged.
#[test]
fn test_truncate_function_name_short() {
let short_name = "short";
let result = ProfileReport::truncate_function_name(short_name, 30);
assert_eq!(result, "short");
}
// Each of the five special characters maps to its entity.
#[test]
fn test_html_escape() {
assert_eq!(html_escape("hello"), "hello");
assert_eq!(html_escape("<script>"), "&lt;script&gt;");
assert_eq!(html_escape("a&b"), "a&amp;b");
assert_eq!(html_escape("\"quote\""), "&quot;quote&quot;");
assert_eq!(html_escape("'apostrophe'"), "&#39;apostrophe&#39;");
}
// An empty report still renders a full page, with the "no data" placeholder.
#[test]
fn test_generate_html_empty_report() {
let report = ProfileReport::default();
let html = report.generate_html();
assert!(html.contains("<!DOCTYPE html>"));
assert!(html.contains("CPU Profile Report"));
assert!(html.contains("0</td>"));
assert!(html.contains("Very Low</td>"));
assert!(html.contains("No hotspots captured"));
}
// Hotspot rows, recommendations and the quality label all reach the page.
#[test]
fn test_generate_html_with_hotspots() {
let report = ProfileReport {
sample_count: 1000,
duration: Duration::from_millis(1000),
effective_frequency: 1000.0,
top_hotspots: vec![
Hotspot {
function_name: "extraction_function".to_string(),
samples: 500,
percentage: 50.0,
file_location: None,
},
Hotspot {
function_name: "text_processing".to_string(),
samples: 300,
percentage: 30.0,
file_location: None,
},
],
recommendations: vec!["Good profile quality".to_string()],
..Default::default()
};
let html = report.generate_html();
assert!(html.contains("1000</td>"));
assert!(html.contains("extraction_function"));
assert!(html.contains("500"));
assert!(html.contains("50.0%"));
assert!(html.contains("Good profile quality"));
assert!(html.contains("Excellent"));
}
// NOTE(review): this only reads back the value the test itself assigned; the
// frequency computation in `from_profiling_result` is not exercised here.
#[test]
fn test_effective_frequency_calculation() {
let report = ProfileReport {
sample_count: 1000,
duration: Duration::from_secs(2),
effective_frequency: 500.0,
top_hotspots: Vec::new(),
memory_trajectory: Vec::new(),
recommendations: Vec::new(),
};
assert_eq!(report.effective_frequency, 500.0);
}
// NOTE(review): checks the default field value only, not the zero-duration
// branch of `from_profiling_result`.
#[test]
fn test_effective_frequency_zero_duration() {
let report = ProfileReport::default();
assert_eq!(report.effective_frequency, 0.0);
}
#[test]
fn test_hotspots_render_empty() {
let report = ProfileReport::default();
let html = report.render_hotspots_table();
assert!(html.contains("No hotspots captured"));
}
#[test]
fn test_hotspots_render_with_data() {
let report = ProfileReport {
top_hotspots: vec![
Hotspot {
function_name: "func_one".to_string(),
samples: 100,
percentage: 50.0,
file_location: None,
},
Hotspot {
function_name: "func_two".to_string(),
samples: 50,
percentage: 25.0,
file_location: None,
},
],
..Default::default()
};
let html = report.render_hotspots_table();
assert!(html.contains("func_one"));
assert!(html.contains("100"));
assert!(html.contains("50.0%"));
assert!(html.contains("func_two"));
assert!(html.contains("50"));
assert!(html.contains("25.0%"));
}
// Spot-check that the responsive and print rules are part of the stylesheet.
#[test]
fn test_css_styles_present() {
let css = ProfileReport::css_styles();
assert!(css.contains("@media (max-width: 768px)"));
assert!(css.contains("@media print"));
assert!(css.contains("border-radius"));
assert!(css.contains("font-family"));
}
}

View File

@@ -0,0 +1,418 @@
//! CPU and memory profiling module for benchmark analysis
//!
//! This module provides infrastructure for capturing CPU and memory profiles during benchmark
//! execution. CPU profiles are captured using the pprof profiler at 1000 Hz frequency and can
//! be exported as SVG flamegraphs for performance analysis. Memory profiles use jemalloc when
//! the `memory-profiling` feature is enabled.
//!
//! # Feature Gates
//!
//! - `profiling`: Enables CPU profiling with pprof (available on non-Windows platforms)
//! - `memory-profiling`: Enables memory profiling with jemalloc
//!
//! # Usage
//!
//! ```rust,no_run
//! use benchmark_harness::profiling::ProfileGuard;
//! use std::path::Path;
//!
//! fn example() -> Result<(), Box<dyn std::error::Error>> {
//! // Create a profiler guard
//! let guard = ProfileGuard::new(1000)?;
//!
//! // ... run code to profile ...
//!
//! // Finish profiling and generate flamegraph
//! let result = guard.finish()?;
//! result.generate_flamegraph(Path::new("profile.svg"))?;
//! Ok(())
//! }
//! ```
//!
//! # Overhead
//!
//! - CPU profiling at 1000 Hz typically adds 1-5% overhead to benchmark execution time.
//! - Memory profiling with jemalloc adds minimal overhead (~1-2%) in production builds.
//! - The profiler blocklists system libraries (libc, libpthread, libgcc, libm) to reduce noise from standard library calls.
use crate::Result;
use std::path::Path;
#[cfg(all(feature = "profiling", not(target_os = "windows")))]
use std::time::Duration;
/// CPU profiler with RAII semantics
///
/// Automatically stops profiling when dropped. Captures CPU samples at the specified
/// frequency (typically 1000 Hz). Uses pprof under the hood with blocklist for system
/// libraries (libc, libpthread, libgcc, libm) to focus on application code.
///
/// # Platform Support
///
/// Only available on non-Windows platforms where pprof is fully supported.
/// The type is compiled only when the `profiling` feature is enabled.
///
/// # Safety
///
/// Profiling involves signal handling and system-level hooks. The pprof library
/// ensures thread safety, but profiling should not be enabled in multi-threaded
/// contexts where signal handlers might interfere with other operations.
#[cfg(all(feature = "profiling", not(target_os = "windows")))]
pub struct ProfileGuard {
/// The profiler guard from pprof, stored in an Option for safe drop
guard: Option<pprof::ProfilerGuard<'static>>,
/// Start time for duration calculation (taken right after the profiler is built)
start_time: std::time::Instant,
/// Configured sampling frequency in Hz (the value after clamping to 100-10000)
sampling_frequency: i32,
}
#[cfg(all(feature = "profiling", not(target_os = "windows")))]
impl ProfileGuard {
    /// Start a CPU profiling session at the requested sampling rate
    ///
    /// The rate is clamped into `100..=10000` Hz before it is handed to pprof, and the
    /// clamped value is what [`ProfileGuard::sampling_frequency`] later reports.
    ///
    /// # Arguments
    ///
    /// * `frequency` - Desired sampling frequency in Hz
    ///
    /// # Errors
    ///
    /// Returns [`Error::Profiling`](crate::Error::Profiling) when pprof fails to build
    /// the profiler guard.
    pub fn new(frequency: i32) -> Result<Self> {
        let sampling_frequency = frequency.clamp(100, 10000);
        let builder = pprof::ProfilerGuardBuilder::default()
            .frequency(sampling_frequency)
            .blocklist(&["libc", "libpthread", "libgcc", "libm"]);

        match builder.build() {
            Ok(guard) => Ok(Self {
                guard: Some(guard),
                // Taken after the guard exists so setup time is not counted.
                start_time: std::time::Instant::now(),
                sampling_frequency,
            }),
            Err(e) => Err(crate::Error::Profiling(format!(
                "Failed to initialize profiler: {}",
                e
            ))),
        }
    }

    /// Sampling frequency in Hz that this session was configured with (post-clamp)
    pub fn sampling_frequency(&self) -> i32 {
        self.sampling_frequency
    }

    /// Estimate how many samples should have been taken so far
    ///
    /// Computed as `elapsed whole milliseconds * frequency / 1000`, rounded up. This is
    /// an expectation derived from wall-clock time, not a count read from the profiler.
    pub fn estimated_sample_count(&self) -> usize {
        let millis = self.start_time.elapsed().as_millis() as u64;
        let hz = f64::from(self.sampling_frequency);
        let expected = millis as f64 * hz / 1000.0;
        expected.ceil() as usize
    }

    /// Consume the guard and build the profiling report
    ///
    /// Records the elapsed duration and the estimated sample count, takes the pprof
    /// guard out of `self`, and asks it for a report.
    ///
    /// # Errors
    ///
    /// Returns [`Error::Profiling`](crate::Error::Profiling) if the pprof guard is no
    /// longer present or if pprof cannot build the report.
    pub fn finish(mut self) -> Result<ProfilingResult> {
        let duration = self.start_time.elapsed();
        let sample_count = self.estimated_sample_count();

        let Some(guard) = self.guard.take() else {
            return Err(crate::Error::Profiling("Profiler already finished".to_string()));
        };

        let built = guard.report().build();
        match built {
            Ok(report) => Ok(ProfilingResult {
                duration,
                sample_count,
                report,
            }),
            Err(e) => Err(crate::Error::Profiling(format!(
                "Failed to generate profiler report: {}",
                e
            ))),
        }
    }
}
#[cfg(all(feature = "profiling", not(target_os = "windows")))]
impl Drop for ProfileGuard {
    /// Release the pprof guard if `finish` has not already taken it.
    fn drop(&mut self) {
        drop(self.guard.take());
    }
}
/// Result of CPU profiling containing captured profile data
///
/// Built by `ProfileGuard::finish`.
///
/// # Note on Serialization
///
/// No serialization derive is attached to this struct. The `report` and `duration`
/// fields are not meant to be serialized; only `sample_count` is intended to be
/// carried into JSON or other formats.
#[cfg(all(feature = "profiling", not(target_os = "windows")))]
pub struct ProfilingResult {
/// Wall-clock time elapsed between guard creation and the call to `finish`
pub duration: Duration,
/// Estimated number of samples (elapsed time multiplied by the sampling frequency);
/// this is not a count read back from the pprof report
pub sample_count: usize,
/// The pprof report containing profile data
pub report: pprof::Report,
}
#[cfg(all(feature = "profiling", not(target_os = "windows")))]
impl ProfilingResult {
    /// Write the captured profile as a flamegraph SVG
    ///
    /// Any missing parent directories of `output_path` are created first, then the file
    /// is created (truncating an existing one) and the report is rendered into it. A
    /// confirmation line is printed to stderr on success.
    ///
    /// # Arguments
    ///
    /// * `output_path` - Destination of the SVG file
    ///
    /// # Errors
    ///
    /// Returns [`Error::Profiling`](crate::Error::Profiling) if:
    /// - the parent directory cannot be created
    /// - the output file cannot be created
    /// - pprof fails to render the flamegraph
    pub fn generate_flamegraph(&self, output_path: &Path) -> Result<()> {
        // A bare file name has an empty parent; there is nothing to create then.
        let parent_dir = output_path
            .parent()
            .filter(|dir| !dir.as_os_str().is_empty());
        if let Some(dir) = parent_dir {
            std::fs::create_dir_all(dir)
                .map_err(|e| crate::Error::Profiling(format!("Failed to create output directory: {}", e)))?;
        }

        let svg = match std::fs::File::create(output_path) {
            Ok(handle) => handle,
            Err(e) => {
                return Err(crate::Error::Profiling(format!(
                    "Failed to create output file: {}",
                    e
                )));
            }
        };

        if let Err(e) = self.report.flamegraph(svg) {
            return Err(crate::Error::Profiling(format!(
                "Failed to generate flamegraph: {}",
                e
            )));
        }

        eprintln!("Flamegraph written to: {}", output_path.display());
        Ok(())
    }
}
/// Stand-in profiling API used when real profiling is unavailable
///
/// Compiled only when the `profiling` feature is off or the target is Windows. It
/// mirrors the method names of the real `ProfileGuard` / `ProfilingResult` so call
/// sites do not need their own conditional compilation.
#[cfg(not(all(feature = "profiling", not(target_os = "windows"))))]
pub mod noop {
    use crate::Result;
    use std::path::Path;

    /// Result type returned by the stub profiler; carries no profile data
    pub struct ProfilingResult {
        /// Always `Duration::ZERO` when produced by the stub guard
        pub duration: std::time::Duration,
        /// Always `0` when produced by the stub guard
        pub sample_count: usize,
    }

    impl ProfilingResult {
        /// Report on stderr that profiling is unavailable; writes no file
        #[inline(always)]
        pub fn generate_flamegraph(&self, _output_path: &Path) -> Result<()> {
            eprintln!("Profiling is not available on this platform or feature is disabled");
            Ok(())
        }
    }

    /// Stub guard that records only the (clamped) requested frequency
    pub struct ProfileGuard {
        sampling_frequency: i32,
    }

    impl ProfileGuard {
        /// Build a stub guard; never fails
        ///
        /// The frequency is clamped to `100..=10000` exactly like the real profiler.
        #[inline(always)]
        pub fn new(frequency: i32) -> Result<Self> {
            let sampling_frequency = frequency.clamp(100, 10000);
            Ok(Self { sampling_frequency })
        }

        /// Clamped sampling frequency in Hz passed at construction
        #[inline(always)]
        pub fn sampling_frequency(&self) -> i32 {
            self.sampling_frequency
        }

        /// Always `0`: the stub collects no samples
        #[inline(always)]
        pub fn estimated_sample_count(&self) -> usize {
            0
        }

        /// Consume the stub and return an empty result
        #[inline(always)]
        pub fn finish(self) -> Result<ProfilingResult> {
            let empty = ProfilingResult {
                duration: std::time::Duration::ZERO,
                sample_count: 0,
            };
            Ok(empty)
        }
    }
}
/// Re-export the appropriate implementation based on feature and platform
#[cfg(not(all(feature = "profiling", not(target_os = "windows"))))]
pub use noop::{ProfileGuard, ProfilingResult};
/// Refresh jemalloc statistics and prepare the location for a heap profile
///
/// Despite its name, this function does **not** write a heap dump. It performs three
/// steps:
///
/// 1. advances the jemalloc `epoch`, which refreshes jemalloc's cached statistics;
/// 2. creates the parent directory of `path` if it does not exist;
/// 3. derives the intended dump location by replacing the extension of `path` with
///    `.heap` and reports it on stderr.
///
/// Earlier revisions documented this as writing a jemalloc heap dump and printed
/// "Heap profile ready at: …", which sent readers looking for a file that was never
/// created. The message now states what actually happened.
///
/// TODO: trigger a real dump through jemalloc's `prof.dump` control once the build is
/// known to ship a profiling-enabled jemalloc.
///
/// # Arguments
///
/// * `path` - Base path of the intended heap dump; its extension is replaced by `heap`
///
/// # Errors
///
/// Returns [`Error::Profiling`](crate::Error::Profiling) if:
/// - the jemalloc epoch cannot be looked up or advanced
/// - the parent directory cannot be created
#[cfg(feature = "memory-profiling")]
pub fn dump_heap_profile(path: &Path) -> Result<()> {
    use tikv_jemalloc_ctl::epoch;

    epoch::mib()
        .map_err(|e| crate::Error::Profiling(format!("Failed to get epoch mib: {}", e)))?
        .advance()
        .map_err(|e| crate::Error::Profiling(format!("Failed to advance epoch: {}", e)))?;

    if let Some(parent) = path.parent()
        && !parent.as_os_str().is_empty()
    {
        std::fs::create_dir_all(parent)
            .map_err(|e| crate::Error::Profiling(format!("Failed to create output directory: {}", e)))?;
    }

    let heap_path = path.with_extension("heap");
    eprintln!(
        "jemalloc memory statistics have been updated; no heap dump file was written (intended target: {})",
        heap_path.display()
    );
    Ok(())
}
/// No-op heap dump when memory profiling is disabled
///
/// Compiled only when the `memory-profiling` feature is off. It ignores the path,
/// prints a notice to stderr, and always returns `Ok(())`, so callers do not need to
/// guard the call with their own `cfg`.
#[cfg(not(feature = "memory-profiling"))]
#[inline(always)]
pub fn dump_heap_profile(_path: &Path) -> Result<()> {
eprintln!("Memory profiling is not enabled (feature 'memory-profiling' required)");
Ok(())
}
// Tests are split by configuration: exactly one of the two inner modules is compiled,
// depending on whether the real profiler or the no-op stub is active.
#[cfg(test)]
mod tests {
// Runs when the `profiling` feature is off or the target is Windows, i.e. when
// `ProfileGuard` resolves to the stub re-exported from `noop`.
#[cfg(not(all(feature = "profiling", not(target_os = "windows"))))]
mod profiling_disabled {
use crate::profiling::ProfileGuard;
use std::path::Path;
// The stub must construct, finish, and report zero samples.
#[test]
fn test_noop_profile_guard() -> crate::Result<()> {
let guard = ProfileGuard::new(1000)?;
let result = guard.finish()?;
assert_eq!(result.sample_count, 0);
Ok(())
}
// The stub's flamegraph call only prints a notice; the path is never touched.
#[test]
fn test_noop_generate_flamegraph() -> crate::Result<()> {
let guard = ProfileGuard::new(1000)?;
let result = guard.finish()?;
result.generate_flamegraph(Path::new("/tmp/noop.svg"))?;
Ok(())
}
}
// Runs against the real pprof-backed profiler. Every test here is `#[ignore]`d and
// must be requested explicitly.
// NOTE(review): the reason for `#[ignore]` is not stated in this file — presumably
// because only one pprof session can be active per process; confirm.
#[cfg(all(feature = "profiling", not(target_os = "windows")))]
mod profiling_enabled {
use crate::profiling::ProfileGuard;
use tempfile::TempDir;
// Smoke test: the profiler can be started and dropped without calling `finish`.
#[test]
#[ignore]
fn test_profile_guard_creation() -> crate::Result<()> {
let _guard = ProfileGuard::new(1000)?;
Ok(())
}
// End-to-end: profile a small workload and write the SVG into a temp directory.
#[test]
#[ignore]
fn test_generate_flamegraph() -> crate::Result<()> {
let guard = ProfileGuard::new(1000)?;
let _sum: u64 = (0..1_000_000).sum();
let result = guard.finish()?;
let temp_dir = TempDir::new()?;
let output_path = temp_dir.path().join("profile.svg");
result.generate_flamegraph(&output_path)?;
assert!(output_path.exists(), "Flamegraph file should exist");
Ok(())
}
// `generate_flamegraph` must create missing intermediate directories itself.
#[test]
#[ignore]
fn test_profile_guard_creates_parent_directories() -> crate::Result<()> {
let guard = ProfileGuard::new(1000)?;
let _sum: u64 = (0..1_000_000).sum();
let result = guard.finish()?;
let temp_dir = TempDir::new()?;
let nested_path = temp_dir.path().join("nested").join("dirs").join("profile.svg");
result.generate_flamegraph(&nested_path)?;
assert!(nested_path.exists(), "Nested directories should be created");
assert!(nested_path.parent().unwrap().exists());
Ok(())
}
}
}

View File

@@ -0,0 +1,423 @@
//! Quality scoring module for benchmark results.
//!
//! Computes F1-based quality metrics by comparing extracted text against ground truth.
//! Uses token-level (bag-of-words) precision and recall.
//!
//! # Scoring weights
//!
//! Text-only scoring uses a **0.6 / 0.4 text / numeric split**:
//!
//! ```text
//! quality_score = 0.6 * f1_text + 0.4 * f1_numeric
//! ```
//!
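//! Numeric tokens receive disproportionate weight (40% despite typically being
//! a small fraction of the token count) because financial documents, scientific
//! papers, and tabular data depend heavily on number accuracy. A single wrong
//! digit can invalidate an entire table row or equation.
//!
//! When neither the extracted text nor the ground truth contains a numeric token,
//! the numeric term is dropped and `quality_score = f1_text`; otherwise the
//! empty-vs-empty numeric comparison (F1 = 1.0) would add a free 0.4 to every score.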
//!
//! When markdown ground truth is available, **combined scoring** kicks in:
//!
//! ```text
//! quality_score = 0.5 * f1_text + 0.2 * f1_numeric + 0.3 * f1_layout
//! ```
//!
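//! The layout component (`f1_layout`) comes from [`crate::markdown_quality`]
//! and captures structural fidelity (headings, tables, code blocks, etc.).
//! If neither text contains numeric tokens, the numeric weight is redistributed
//! in the same 5:3 ratio: `quality_score = 0.625 * f1_text + 0.375 * f1_layout`.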
//!
//! # Tokenization
//!
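//! Tokenization is intentionally simple: markdown link and image syntax is reduced
//! to its visible text, the text is lowercased and split on whitespace, and every
//! character that is not alphanumeric, a period, or a comma is removed. Leading and
//! trailing periods and commas are then trimmed, so they survive only inside a
//! token (preserving decimal numbers like "3.14" and European format "3,14").
//! Numeric tokens with at most 15 digits are normalized through `f64`, so "15.0"
//! and "15" compare equal. This preserves punctuation that is semantically
//! meaningful while ignoring decorative punctuation.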
use crate::types::{OutputFormat, QualityMetrics};
use regex::Regex;
use std::collections::HashMap;
use std::sync::LazyLock;
/// Regex to strip markdown image syntax `![alt](url)` → `alt`
///
/// Capture group 1 is the alt text. The alt text may not contain `]` and the URL may
/// not contain `)`, so nested brackets or parenthesised URLs are not matched as a whole.
/// Compiled once on first use; the `expect` can only fire if the literal is malformed.
static MD_IMAGE_RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"!\[([^\]]*)\]\([^)]*\)").expect("invalid regex"));
/// Regex to strip markdown link syntax `[text](url)` → `text`
///
/// Capture group 1 is the link text. This pattern also matches the `[alt](url)` tail of
/// an image, which is why images are stripped with `MD_IMAGE_RE` first.
static MD_LINK_RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\[([^\]]*)\]\([^)]*\)").expect("invalid regex"));
/// Replace markdown images and links with their visible text.
///
/// `![alt](url)` becomes `alt` and `[text](url)` becomes `text`, so URL fragments
/// never reach the tokenizer. Images are handled first because the link pattern
/// would otherwise consume the bracketed part of an image and leave the `!` behind.
fn strip_markdown_links(text: &str) -> String {
    let without_images = MD_IMAGE_RE.replace_all(text, "$1");
    let without_links = MD_LINK_RE.replace_all(&without_images, "$1");
    without_links.into_owned()
}
/// Score an extraction against ground truth, adding a layout component when possible.
///
/// The text and numeric F1 scores always come from [`compute_quality`]. What happens
/// next depends on the inputs:
///
/// * `output_format` is `Plaintext`: the text-only metrics are returned unchanged and
///   `f1_score_layout` stays `None`, even if markdown ground truth was supplied.
/// * Any other format with `ground_truth_markdown == None`: same text-only metrics.
/// * Any other format with markdown ground truth: the structural F1 from
///   `crate::markdown_quality::score_structural_quality` is stored in
///   `f1_score_layout` and the score becomes
///   `0.5 * f1_text + 0.2 * f1_numeric + 0.3 * f1_layout`, or
///   `0.625 * f1_text + 0.375 * f1_layout` when neither text has numeric tokens.
///
/// `correct` is `true` when the final `quality_score` is at least `0.95`.
pub fn compute_quality_with_structure(
    extracted: &str,
    ground_truth: &str,
    ground_truth_markdown: Option<&str>,
    output_format: OutputFormat,
) -> QualityMetrics {
    let mut metrics = compute_quality(extracted, ground_truth);

    // Plaintext output never receives structural scoring.
    if output_format == OutputFormat::Plaintext {
        return metrics;
    }
    // Without markdown ground truth there is nothing to compare structure against.
    let Some(md_gt) = ground_truth_markdown else {
        return metrics;
    };

    let layout_f1 = crate::markdown_quality::score_structural_quality(extracted, md_gt).structural_f1;
    metrics.f1_score_layout = Some(layout_f1);

    let text_f1 = metrics.f1_score_text;
    metrics.quality_score = if has_any_numeric_tokens(extracted, ground_truth) {
        0.5 * text_f1 + 0.2 * metrics.f1_score_numeric + 0.3 * layout_f1
    } else {
        // Same 5:3 text-to-layout ratio with the numeric share removed.
        0.625 * text_f1 + 0.375 * layout_f1
    };
    metrics.correct = metrics.quality_score >= 0.95;
    metrics
}
/// Compute text-only quality metrics for an extraction.
///
/// Steps:
/// 1. Tokenize both inputs with [`tokenize`].
/// 2. Compute the bag-of-words F1 over all tokens (`f1_score_text`).
/// 3. Compute the same F1 over the tokens that contain a numeric character
///    (`f1_score_numeric`).
/// 4. Combine: `quality_score = 0.6 * f1_text + 0.4 * f1_numeric`, except that when
///    neither side has any numeric token the score is just `f1_text` (the both-empty
///    numeric F1 of 1.0 would otherwise hand out a free 0.4).
/// 5. Record the per-token deficit and surplus lists and set `correct` when the score
///    is at least `0.95`.
///
/// `f1_score_layout` is always `None` here.
pub fn compute_quality(extracted: &str, ground_truth: &str) -> QualityMetrics {
    let ext_tokens = tokenize(extracted);
    let gt_tokens = tokenize(ground_truth);
    let ext_numbers = filter_numeric(&ext_tokens);
    let gt_numbers = filter_numeric(&gt_tokens);

    let f1_score_text = compute_f1(&ext_tokens, &gt_tokens);
    let f1_score_numeric = compute_f1(&ext_numbers, &gt_numbers);

    let no_numbers_anywhere = ext_numbers.is_empty() && gt_numbers.is_empty();
    let quality_score = if no_numbers_anywhere {
        f1_score_text
    } else {
        0.6 * f1_score_text + 0.4 * f1_score_numeric
    };

    let (missing_tokens, extra_tokens) = compute_token_diff(&ext_tokens, &gt_tokens);

    QualityMetrics {
        f1_score_text,
        f1_score_numeric,
        f1_score_layout: None,
        quality_score,
        missing_tokens,
        extra_tokens,
        correct: quality_score >= 0.95,
    }
}
/// Tokenize text into normalized, lowercase comparison tokens.
///
/// Processing order:
/// 1. Markdown links and images are reduced to their visible text.
/// 2. The text is lowercased and split on whitespace.
/// 3. Within each word, every character that is not alphanumeric, `.` or `,` is
///    dropped; leading and trailing `.` / `,` are then trimmed, so they survive only
///    inside a token (e.g. "3.14", "3,14").
/// 4. Words that end up empty are discarded.
/// 5. Numeric tokens are canonicalized so "15.0" and "15" compare equal.
///
/// # Returns
///
/// The tokens in input order; duplicates are preserved.
pub fn tokenize(text: &str) -> Vec<String> {
    let text = strip_markdown_links(text);
    text.to_lowercase()
        .split_whitespace()
        .filter_map(|word| {
            // First pass: keep alphanumeric characters, periods, and commas.
            let kept: String = word
                .chars()
                .filter(|c| c.is_alphanumeric() || *c == '.' || *c == ',')
                .collect();
            // Second pass: strip leading/trailing periods and commas.
            let trimmed = kept.trim_matches(|c: char| c == '.' || c == ',');
            (!trimmed.is_empty()).then(|| normalize_numeric_token(trimmed))
        })
        .collect()
}

/// Canonicalize a numeric token via `f64` ("15.0" -> "15", "100.00" -> "100").
///
/// A token is rewritten only when all of the following hold:
/// - it contains at least one ASCII digit; `f64::from_str` also accepts the words
///   "inf", "infinity" and "nan", which must stay ordinary words;
/// - it contains at most 15 ASCII digits, because `f64` carries only about 15.9
///   significant decimal digits and longer numbers would be silently rounded;
/// - it parses to a finite value, so an overflowing literal such as "1e999" is not
///   collapsed to "inf".
///
/// Every other token is returned unchanged.
fn normalize_numeric_token(token: &str) -> String {
    let digit_count = token.chars().filter(char::is_ascii_digit).count();
    if !(1..=15).contains(&digit_count) {
        return token.to_string();
    }
    match token.parse::<f64>() {
        Ok(value) if value.is_finite() => value.to_string(),
        _ => token.to_string(),
    }
}
/// Report whether at least one of the two texts yields a numeric token.
///
/// Both texts are tokenized with [`tokenize`]; the result selects between the
/// scoring formula with a numeric term and the one without.
fn has_any_numeric_tokens(text_a: &str, text_b: &str) -> bool {
    let a_numeric = filter_numeric(&tokenize(text_a));
    let b_numeric = filter_numeric(&tokenize(text_b));
    !(a_numeric.is_empty() && b_numeric.is_empty())
}
/// Collect the tokens that contain at least one numeric character.
///
/// "Numeric" follows `char::is_numeric`, so digits from non-Latin scripts count as
/// well. Order and duplicates of the input are preserved; matching tokens are cloned.
fn filter_numeric(tokens: &[String]) -> Vec<String> {
    let mut numeric = Vec::new();
    for token in tokens {
        if token.chars().any(char::is_numeric) {
            numeric.push(token.clone());
        }
    }
    numeric
}
/// Bag-of-words F1 between extracted tokens and ground-truth tokens.
///
/// The overlap is the multiset intersection: for every distinct ground-truth token,
/// `min(count in truth, count in extracted)`. Tokens that occur only in the extraction
/// add nothing to the overlap and are penalized through the precision denominator.
///
/// * precision = overlap / number of extracted tokens
/// * recall    = overlap / number of ground-truth tokens
/// * F1        = 2 * precision * recall / (precision + recall)
///
/// Special cases: both inputs empty gives `1.0`; exactly one empty gives `0.0`; no
/// overlap gives `0.0`.
pub fn compute_f1(extracted: &[String], truth: &[String]) -> f64 {
    match (extracted.is_empty(), truth.is_empty()) {
        (true, true) => return 1.0, // nothing expected, nothing produced
        (true, false) | (false, true) => return 0.0,
        (false, false) => {}
    }

    let extracted_counts = build_counts(extracted);
    let truth_counts = build_counts(truth);

    let mut overlap = 0usize;
    for (token, gt_count) in &truth_counts {
        let ext_count = extracted_counts.get(token).copied().unwrap_or(0);
        overlap += ext_count.min(*gt_count);
    }

    let overlap = overlap as f64;
    let precision = overlap / extracted.len() as f64;
    let recall = overlap / truth.len() as f64;
    let denominator = precision + recall;
    if denominator == 0.0 {
        0.0
    } else {
        2.0 * precision * recall / denominator
    }
}

/// Count how often each token occurs; keys borrow from the input slice.
fn build_counts(tokens: &[String]) -> HashMap<&str, usize> {
    tokens.iter().fold(HashMap::new(), |mut counts, token| {
        *counts.entry(token.as_str()).or_default() += 1;
        counts
    })
}
/// Pair of `(missing_tokens, extra_tokens)` lists; each entry is `(token, count)`.
pub type TokenDiff = (Vec<(String, usize)>, Vec<(String, usize)>);

/// Compute token-level diff between extracted and ground truth token bags.
///
/// Returns `(missing_tokens, extra_tokens)` where:
/// - `missing_tokens`: tokens that occur more often in the ground truth than in the
///   extraction, with the deficit as count (recall misses)
/// - `extra_tokens`: tokens that occur more often in the extraction than in the ground
///   truth, with the surplus as count (precision misses)
///
/// Both lists are sorted by count descending; ties are broken by token in ascending
/// lexicographic order so the output is identical from run to run (hash-map iteration
/// order alone would shuffle equal-count entries).
pub fn compute_token_diff(extracted: &[String], truth: &[String]) -> TokenDiff {
    /// Orders entries by count descending, then by token ascending.
    fn by_count_desc_then_token(a: &(String, usize), b: &(String, usize)) -> std::cmp::Ordering {
        b.1.cmp(&a.1).then_with(|| a.0.cmp(&b.0))
    }

    // Signed balance per token: +1 for each ground-truth occurrence, -1 for each
    // extracted occurrence. Positive means missing, negative means extra.
    let mut balance: HashMap<&str, isize> = HashMap::new();
    for token in truth {
        *balance.entry(token.as_str()).or_insert(0) += 1;
    }
    for token in extracted {
        *balance.entry(token.as_str()).or_insert(0) -= 1;
    }

    let mut missing: Vec<(String, usize)> = Vec::new();
    let mut extra: Vec<(String, usize)> = Vec::new();
    for (token, delta) in balance {
        match delta.cmp(&0) {
            std::cmp::Ordering::Greater => missing.push((token.to_string(), delta.unsigned_abs())),
            std::cmp::Ordering::Less => extra.push((token.to_string(), delta.unsigned_abs())),
            std::cmp::Ordering::Equal => {}
        }
    }

    missing.sort_unstable_by(by_count_desc_then_token);
    extra.sort_unstable_by(by_count_desc_then_token);
    (missing, extra)
}
// Unit tests for tokenization, F1 computation, and the text-only quality score.
#[cfg(test)]
mod tests {
use super::*;
// Identical inputs must score a perfect text F1 and a perfect overall score.
#[test]
fn test_identical_text() {
let text = "Hello world this is a test";
let result = compute_quality(text, text);
assert!((result.f1_score_text - 1.0).abs() < 0.001);
assert!((result.quality_score - 1.0).abs() < 0.01); // text-only scoring (no numerics on either side)
}
// Disjoint vocabularies have an empty intersection, hence F1 = 0.
#[test]
fn test_completely_different() {
let result = compute_quality("alpha beta gamma", "one two three");
assert_eq!(result.f1_score_text, 0.0);
}
#[test]
fn test_partial_overlap() {
let result = compute_quality("hello world foo", "hello world bar");
// Extracted: {hello, world, foo}, Truth: {hello, world, bar}
// Intersection: {hello, world} = 2
// Precision: 2/3, Recall: 2/3, F1: 2/3
assert!((result.f1_score_text - 2.0 / 3.0).abs() < 0.001);
}
// Matching numeric tokens ("42", "7") must yield a perfect numeric F1.
#[test]
fn test_numeric_scoring() {
let result = compute_quality("page 42 section 7", "page 42 section 7");
assert!((result.f1_score_numeric - 1.0).abs() < 0.001);
}
// Both sides empty is treated as a vacuously perfect match.
#[test]
fn test_empty_inputs() {
let result = compute_quality("", "");
assert!((result.f1_score_text - 1.0).abs() < 0.001);
}
// An empty extraction against non-empty ground truth scores zero.
#[test]
fn test_empty_extracted() {
let result = compute_quality("", "some ground truth");
assert_eq!(result.f1_score_text, 0.0);
}
// Trailing commas and exclamation marks must not create distinct tokens.
#[test]
fn test_punctuation_stripped() {
let result = compute_quality("hello, world!", "hello world");
assert!((result.f1_score_text - 1.0).abs() < 0.001);
}
// Tokenization lowercases, so case differences do not affect the score.
#[test]
fn test_case_insensitive() {
let result = compute_quality("Hello World", "hello world");
assert!((result.f1_score_text - 1.0).abs() < 0.001);
}
#[test]
fn test_tokenize_number_normalization() {
// "15.0" and "15" should produce the same token
let tokens_a = tokenize("15.0");
let tokens_b = tokenize("15");
assert_eq!(tokens_a, tokens_b, "15.0 and 15 should normalize to the same token");
assert_eq!(tokens_a, vec!["15"]);
// "100.00" should normalize to "100"
assert_eq!(tokenize("100.00"), vec!["100"]);
}
// Number normalization must carry through to the F1 computation.
#[test]
fn test_compute_f1_number_equivalence() {
let extracted = tokenize("price 15.0 dollars");
let truth = tokenize("price 15 dollars");
let f1 = compute_f1(&extracted, &truth);
assert!(
(f1 - 1.0).abs() < 0.001,
"F1 should be 1.0 for semantically equivalent numeric tokens, got {f1}"
);
}
#[test]
fn test_tokenize_preserves_decimals() {
// Non-trailing-zero decimals must be preserved
assert_eq!(tokenize("3.14"), vec!["3.14"]);
assert_eq!(tokenize("0.5"), vec!["0.5"]);
assert_eq!(tokenize("12.345"), vec!["12.345"]);
}
#[test]
fn test_no_numbers_no_boost() {
// Two texts with no numeric tokens should score based on text_f1 only,
// not get a free 0.4 boost from both-empty numeric F1.
let result = compute_quality("hello world foo", "hello world bar");
// text F1: intersection {hello, world} = 2, precision=2/3, recall=2/3, F1=2/3
let expected_text_f1 = 2.0 / 3.0;
assert!(
(result.f1_score_text - expected_text_f1).abs() < 0.001,
"text F1 should be 2/3, got {}",
result.f1_score_text
);
// quality_score should equal text_f1 (no numeric component)
assert!(
(result.quality_score - expected_text_f1).abs() < 0.001,
"quality_score should equal text F1 ({expected_text_f1}) when no numbers, got {}",
result.quality_score
);
}
#[test]
fn test_url_stripped_from_tokens() {
// Markdown links should not produce URL component tokens
let tokens = tokenize("[link text](https://example.com)");
assert_eq!(tokens, vec!["link", "text"]);
// Markdown images should not produce URL component tokens
let tokens = tokenize("![alt text](https://example.com/image.png)");
assert_eq!(tokens, vec!["alt", "text"]);
// Mixed content
let tokens = tokenize("See [docs](https://example.com/docs) for details");
assert_eq!(tokens, vec!["see", "docs", "for", "details"]);
}
#[test]
fn test_large_number_preserved() {
// 17-digit number should not be mangled by f64 precision loss
let tokens = tokenize("10000000000000001");
assert_eq!(
tokens,
vec!["10000000000000001"],
"17-digit number should be preserved as-is, not rounded by f64"
);
// 15-digit number (including the trailing zero) should still be normalized
let tokens = tokenize("12345678901234.0");
assert_eq!(
tokens,
vec!["12345678901234"],
"15-digit number with trailing .0 should still normalize"
);
}
}

View File

@@ -0,0 +1,133 @@
//! Adapter registry for managing framework adapters
//!
//! The registry provides a central place to register and retrieve adapters
//! for different extraction frameworks.
use crate::Error;
use crate::adapter::FrameworkAdapter;
use ahash::AHashMap;
use std::sync::Arc;
/// Registry for framework adapters
///
/// Stores adapters by name and provides lookup and iteration capabilities.
pub struct AdapterRegistry {
adapters: AHashMap<String, Arc<dyn FrameworkAdapter>>,
}
impl AdapterRegistry {
/// Create a new empty registry
pub fn new() -> Self {
Self {
adapters: AHashMap::new(),
}
}
/// Register an adapter
///
/// # Arguments
/// * `adapter` - The adapter to register
///
/// # Returns
/// * `Ok(())` - Adapter registered successfully
/// * `Err(Error::Config)` - Adapter with same name already exists
pub fn register(&mut self, adapter: Arc<dyn FrameworkAdapter>) -> crate::Result<()> {
let name = adapter.name().to_string();
if self.adapters.contains_key(&name) {
return Err(Error::Config(format!("Adapter '{}' is already registered", name)));
}
self.adapters.insert(name, adapter);
Ok(())
}
/// Get an adapter by name
///
/// # Arguments
/// * `name` - The adapter name
///
/// # Returns
/// * `Some(Arc<dyn FrameworkAdapter>)` - Adapter found
/// * `None` - No adapter with that name
pub fn get(&self, name: &str) -> Option<Arc<dyn FrameworkAdapter>> {
self.adapters.get(name).cloned()
}
/// Check if an adapter is registered
pub fn contains(&self, name: &str) -> bool {
self.adapters.contains_key(name)
}
/// Get all registered adapter names
pub fn adapter_names(&self) -> Vec<String> {
self.adapters.keys().cloned().collect()
}
/// Get all registered adapters
pub fn adapters(&self) -> Vec<Arc<dyn FrameworkAdapter>> {
self.adapters.values().cloned().collect()
}
/// Get the number of registered adapters
pub fn len(&self) -> usize {
self.adapters.len()
}
/// Check if the registry is empty
pub fn is_empty(&self) -> bool {
self.adapters.is_empty()
}
/// Remove an adapter by name
///
/// # Returns
/// * `Some(Arc<dyn FrameworkAdapter>)` - The removed adapter
/// * `None` - No adapter with that name
pub fn remove(&mut self, name: &str) -> Option<Arc<dyn FrameworkAdapter>> {
self.adapters.remove(name)
}
/// Clear all adapters
pub fn clear(&mut self) {
self.adapters.clear();
}
}
impl Default for AdapterRegistry {
fn default() -> Self {
Self::new()
}
}
// Tests for the empty-registry behavior; no concrete adapter is constructed here.
#[cfg(test)]
mod tests {
    use super::*;

    /// A freshly created registry holds nothing.
    #[test]
    fn test_registry_creation() {
        let registry = AdapterRegistry::new();
        assert_eq!(registry.len(), 0);
        assert!(registry.is_empty());
    }

    /// Listing names on an empty registry yields an empty list.
    #[test]
    fn test_adapter_names_empty() {
        let names = AdapterRegistry::new().adapter_names();
        assert!(names.is_empty());
    }

    /// `contains` is false for a name that was never registered.
    #[test]
    fn test_contains_nonexistent() {
        let registry = AdapterRegistry::new();
        let found = registry.contains("nonexistent");
        assert!(!found);
    }

    /// `get` returns `None` for a name that was never registered.
    #[test]
    fn test_get_nonexistent() {
        let registry = AdapterRegistry::new();
        assert!(registry.get("nonexistent").is_none());
    }
}

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,414 @@
//! Statistical utilities for benchmark analysis
//!
//! This module provides shared statistical functions used across the benchmark harness.
/// Calculate percentile using R-7 linear interpolation method
///
/// The R-7 method is the default percentile calculation method in R and provides
/// linear interpolation between order statistics for improved accuracy over simpler
/// rounding-based methods.
///
/// # Arguments
/// * `sorted_values` - Sorted array of values (must be sorted for correct results;
///   the function does not check or sort)
/// * `p` - Percentile to calculate (0.0 - 1.0, where 0.5 = median, 0.95 = 95th percentile).
///   Values outside that range are clamped, so `p < 0.0` yields the first element and
///   `p > 1.0` yields the last.
///
/// # Returns
/// The calculated percentile value, `0.0` if the array is empty, the sole element for
/// single-element input, or NaN if `p` is NaN and there are at least two elements.
///
/// # Panics
/// This function does not panic. Out-of-range `p` is clamped rather than used as an
/// index, and empty input returns 0.0.
///
/// # Example
/// ```ignore
/// let values = vec![1.0, 2.0, 3.0, 4.0, 5.0];
/// let p50 = percentile_r7(&values, 0.50); // Median
/// let p95 = percentile_r7(&values, 0.95); // 95th percentile
/// ```
pub(crate) fn percentile_r7(sorted_values: &[f64], p: f64) -> f64 {
    let n = sorted_values.len();
    match n {
        0 => return 0.0,
        1 => return sorted_values[0],
        _ => {}
    }
    if p.is_nan() {
        // No meaningful rank exists; propagate NaN instead of picking an element.
        return f64::NAN;
    }

    let last = n - 1;
    // Fractional rank into the sorted data; clamping keeps it within [0, n - 1].
    let index = p.clamp(0.0, 1.0) * (n as f64 - 1.0);
    let lower = (index.floor() as usize).min(last);
    let upper = (index.ceil() as usize).min(last);

    if lower == upper {
        sorted_values[lower]
    } else {
        let weight = index - lower as f64;
        sorted_values[lower] * (1.0 - weight) + sorted_values[upper] * weight
    }
}
/// Map non-finite floats to `0.0`, passing finite values through untouched
///
/// NaN, positive infinity, and negative infinity all become `0.0`. This keeps
/// statistical output JSON-serializable.
pub(crate) fn sanitize_f64(v: f64) -> f64 {
    if v.is_nan() || v.is_infinite() {
        return 0.0;
    }
    v
}
/// Compute mean, Bessel-corrected sample variance, and standard deviation
///
/// Non-finite inputs (NaN, ±infinity) are discarded first. With no finite value left
/// the result is `(0.0, 0.0, 0.0)`; with exactly one it is `(that value, 0.0, 0.0)`.
/// Otherwise the variance divides the summed squared deviations by `count - 1`.
///
/// # Arguments
/// * `values` - Samples; non-finite entries are ignored
///
/// # Returns
/// Tuple of (mean, sample_variance, standard_deviation)
#[allow(dead_code)]
pub(crate) fn calculate_variance(values: &[f64]) -> (f64, f64, f64) {
    let finite: Vec<f64> = values.iter().copied().filter(|v| v.is_finite()).collect();

    match finite.as_slice() {
        [] => return (0.0, 0.0, 0.0),
        [only] => return (*only, 0.0, 0.0),
        _ => {}
    }

    let count = finite.len();
    let mean = finite.iter().sum::<f64>() / count as f64;
    let squared_deviations = finite.iter().map(|v| (v - mean).powi(2)).sum::<f64>();
    let variance = squared_deviations / (count - 1) as f64;
    (mean, variance, variance.sqrt())
}
#[cfg(test)]
mod tests {
use super::*;
// Test 1: Empty input returns 0.0
#[test]
fn test_percentile_r7_empty() {
let values: Vec<f64> = vec![];
assert_eq!(percentile_r7(&values, 0.5), 0.0);
}
// Test 2: Single element returns that element
#[test]
fn test_percentile_r7_single_value() {
let values = vec![42.0];
assert_eq!(percentile_r7(&values, 0.5), 42.0);
assert_eq!(percentile_r7(&values, 0.95), 42.0);
assert_eq!(percentile_r7(&values, 0.0), 42.0);
assert_eq!(percentile_r7(&values, 1.0), 42.0);
}
// Test 3: Two elements - p0, p50, p100
#[test]
fn test_percentile_r7_two_values_all_percentiles() {
let values = vec![10.0, 20.0];
// p0 (minimum)
let p0 = percentile_r7(&values, 0.0);
assert_eq!(p0, 10.0);
// p50 (median/midpoint)
let p50 = percentile_r7(&values, 0.5);
assert_eq!(p50, 15.0);
// p100 (maximum)
let p100 = percentile_r7(&values, 1.0);
assert_eq!(p100, 20.0);
}
// Test 4: Known R-7 values for [1,2,3,4,5]
// p50=3.0, p95=4.8, p25=2.0, p75=4.0
#[test]
fn test_percentile_r7_five_values_known_values() {
let values = vec![1.0, 2.0, 3.0, 4.0, 5.0];
// p50 (median) - should be exactly 3.0
let p50 = percentile_r7(&values, 0.50);
assert_eq!(p50, 3.0);
// p95 (95th percentile) - should be 4.8
let p95 = percentile_r7(&values, 0.95);
assert!((p95 - 4.8).abs() < 0.0001);
// p25 (25th percentile) - should be 2.0
let p25 = percentile_r7(&values, 0.25);
assert_eq!(p25, 2.0);
// p75 (75th percentile) - should be 4.0
let p75 = percentile_r7(&values, 0.75);
assert_eq!(p75, 4.0);
// p0 and p100 should be min/max
let p0 = percentile_r7(&values, 0.0);
assert_eq!(p0, 1.0);
let p100 = percentile_r7(&values, 1.0);
assert_eq!(p100, 5.0);
}
// Test 5: All identical values
#[test]
fn test_percentile_r7_identical_values() {
let values = vec![7.0, 7.0, 7.0, 7.0, 7.0];
// All percentiles should return the same value
assert_eq!(percentile_r7(&values, 0.0), 7.0);
assert_eq!(percentile_r7(&values, 0.25), 7.0);
assert_eq!(percentile_r7(&values, 0.5), 7.0);
assert_eq!(percentile_r7(&values, 0.75), 7.0);
assert_eq!(percentile_r7(&values, 0.95), 7.0);
assert_eq!(percentile_r7(&values, 1.0), 7.0);
}
// Test 6: Negative values
#[test]
fn test_percentile_r7_negative_values() {
    let values = vec![-5.0, -3.0, -1.0, 0.0, 2.0];

    // Minimum, median and maximum land exactly on samples.
    assert_eq!(percentile_r7(&values, 0.0), -5.0);
    assert_eq!(percentile_r7(&values, 0.50), -1.0);
    assert_eq!(percentile_r7(&values, 1.0), 2.0);

    // p95 interpolates between the two largest samples (0.0 and 2.0), so it
    // must be strictly positive and no larger than the maximum.
    let p95 = percentile_r7(&values, 0.95);
    assert!(0.0 < p95 && p95 <= 2.0);
}
// Test 7: Large dataset (100 elements)
#[test]
fn test_percentile_r7_many_values() {
    let values: Vec<f64> = (1..=100u32).map(f64::from).collect();

    // With the samples 1..=100 the position is p * 99 and values[i] == i + 1:
    //   p50 -> 49.50 -> halfway between 50 and 51          = 50.5
    //   p95 -> 94.05 -> 95 * 0.95 + 96 * 0.05              = 95.05
    //   p25 -> 24.75 -> 25 * 0.25 + 26 * 0.75              = 25.75
    //   p75 -> 74.25 -> 75 * 0.75 + 76 * 0.25              = 75.25
    let cases = [(0.50, 50.5), (0.95, 95.05), (0.25, 25.75), (0.75, 75.25)];
    for (p, expected) in cases {
        let got = percentile_r7(&values, p);
        assert!((got - expected).abs() < 0.01, "percentile {p}: got {got}");
    }
}
// Test 8: Edge percentiles - p0 returns the first element, p100 the last
#[test]
fn test_percentile_r7_edge_percentiles() {
    // Deliberately unsorted: the function expects sorted input, and this pins
    // down what the edge percentiles do when that expectation is violated.
    let values = vec![3.0, 1.0, 9.0, 2.0, 7.0];

    // p0   -> position 0 * (5 - 1) = 0 -> values[0] = 3.0
    // p100 -> position 1 * (5 - 1) = 4 -> values[4] = 7.0
    let first = percentile_r7(&values, 0.0);
    let last = percentile_r7(&values, 1.0);
    assert_eq!((first, last), (3.0, 7.0));
}
// Test 9: Properly sorted input for correct edge percentiles
#[test]
fn test_percentile_r7_sorted_edge_percentiles() {
    // Already in ascending order, so p0 / p100 are the true minimum / maximum.
    let values = vec![1.0, 2.0, 3.0, 7.0, 9.0];
    let bounds = (percentile_r7(&values, 0.0), percentile_r7(&values, 1.0));
    assert_eq!(bounds, (1.0, 9.0));
}
// Test 10: Non-sorted input behavior
#[test]
fn test_percentile_r7_unsorted_input_behavior() {
    // The function expects sorted input and reads by position. For this
    // particular permutation the element at the median position happens to be
    // the true median, so both calls below agree.
    let unsorted = vec![5.0, 1.0, 3.0, 2.0, 4.0];

    // position = 0.5 * (5 - 1) = 2 -> unsorted[2] = 3.0
    assert_eq!(percentile_r7(&unsorted, 0.50), 3.0);

    // Same data in ascending order for comparison.
    let mut sorted = unsorted.clone();
    sorted.sort_by(f64::total_cmp);
    assert_eq!(sorted, vec![1.0, 2.0, 3.0, 4.0, 5.0]);

    // position 2 -> sorted[2] = 3.0 (the true median)
    assert_eq!(percentile_r7(&sorted, 0.50), 3.0);
}
// Test 11: Three-element array for completeness
#[test]
fn test_percentile_r7_three_values() {
    let values = vec![10.0, 20.0, 30.0];

    // position = p * (3 - 1):
    //   p0   -> 0.0 -> values[0]                        = 10
    //   p50  -> 1.0 -> values[1]                        = 20
    //   p100 -> 2.0 -> values[2]                        = 30
    //   p25  -> 0.5 -> halfway between 10 and 20        = 15
    //   p75  -> 1.5 -> halfway between 20 and 30        = 25
    let cases = [(0.0, 10.0), (0.50, 20.0), (1.0, 30.0), (0.25, 15.0), (0.75, 25.0)];
    for (p, expected) in cases {
        assert_eq!(percentile_r7(&values, p), expected, "percentile {p}");
    }
}
// Test 12: Floating-point precision with decimal values
#[test]
fn test_percentile_r7_floating_point_values() {
    let values = vec![1.5, 2.7, 3.2, 4.1, 5.9];

    // Positions 2.0, 1.0 and 3.0 land on samples, so the result is exact.
    for (p, expected) in [(0.50, 3.2), (0.25, 2.7), (0.75, 4.1)] {
        assert_eq!(percentile_r7(&values, p), expected, "percentile {p}");
    }

    // p95 -> position 0.95 * (5 - 1) = 3.8, between values[3]=4.1 and values[4]=5.9:
    // 4.1 * 0.2 + 5.9 * 0.8 = 0.82 + 4.72 = 5.54
    let p95 = percentile_r7(&values, 0.95);
    assert!((p95 - 5.54).abs() < 0.0001);
}
// Test 13: Very large percentile values (near 1.0)
#[test]
fn test_percentile_r7_high_percentiles() {
    let values = vec![1.0, 2.0, 3.0, 4.0, 5.0];

    // Both positions fall between values[3]=4 and values[4]=5:
    //   p99   -> 3.96  -> 4 * 0.04  + 5 * 0.96  = 4.96
    //   p99.9 -> 3.996 -> 4 * 0.004 + 5 * 0.996 = 4.996
    for (p, expected) in [(0.99, 4.96), (0.999, 4.996)] {
        let got = percentile_r7(&values, p);
        assert!((got - expected).abs() < 0.0001, "percentile {p}: got {got}");
    }
}
// Test 14: Very small percentile values (near 0.0)
#[test]
fn test_percentile_r7_low_percentiles() {
    let values = vec![1.0, 2.0, 3.0, 4.0, 5.0];

    // Both positions fall between values[0]=1 and values[1]=2:
    //   p1   -> 0.04  -> 1 * 0.96  + 2 * 0.04  = 1.04
    //   p0.1 -> 0.004 -> 1 * 0.996 + 2 * 0.004 = 1.004
    for (p, expected) in [(0.01, 1.04), (0.001, 1.004)] {
        let got = percentile_r7(&values, p);
        assert!((got - expected).abs() < 0.0001, "percentile {p}: got {got}");
    }
}
// ---- sanitize_f64 tests ----
#[test]
fn test_sanitize_f64_finite() {
    // Finite inputs pass through untouched.
    for x in [42.0, -1.5, 0.0] {
        assert_eq!(sanitize_f64(x), x);
    }
}
#[test]
fn test_sanitize_f64_nan() {
    // NaN is replaced by zero.
    let cleaned = sanitize_f64(f64::NAN);
    assert_eq!(cleaned, 0.0);
}
#[test]
fn test_sanitize_f64_infinity() {
    // Both infinities are replaced by zero.
    for x in [f64::INFINITY, f64::NEG_INFINITY] {
        assert_eq!(sanitize_f64(x), 0.0);
    }
}
// ---- calculate_variance tests ----
#[test]
fn test_calculate_variance_empty() {
    // No samples: mean, variance and standard deviation are all zero.
    let stats = calculate_variance(&[]);
    assert_eq!(stats, (0.0, 0.0, 0.0));
}
#[test]
fn test_calculate_variance_single() {
    // One sample: the mean is that sample and there is no spread.
    let (mean, variance, std_dev) = calculate_variance(&[5.0]);
    assert!((mean - 5.0).abs() < 0.001);
    assert_eq!((variance, std_dev), (0.0, 0.0));
}
#[test]
fn test_calculate_variance_bessel_correction() {
    // [1, 2, 3]: mean = 2 and the sample variance divides by n - 1:
    // ((1-2)^2 + (2-2)^2 + (3-2)^2) / (3-1) = 1.0, hence std_dev = 1.0 as well.
    let close = |actual: f64, expected: f64| (actual - expected).abs() < 0.001;
    let (mean, variance, std_dev) = calculate_variance(&[1.0, 2.0, 3.0]);
    assert!(close(mean, 2.0));
    assert!(close(variance, 1.0));
    assert!(close(std_dev, 1.0));
}
#[test]
fn test_calculate_variance_filters_nan_and_inf() {
    // Non-finite entries are expected to be dropped, leaving [1.0, 2.0, 3.0].
    let values = [f64::NAN, 1.0, f64::INFINITY, 2.0, f64::NEG_INFINITY, 3.0];
    let close = |actual: f64, expected: f64| (actual - expected).abs() < 0.001;
    let (mean, variance, std_dev) = calculate_variance(&values);
    assert!(close(mean, 2.0));
    assert!(close(variance, 1.0));
    assert!(close(std_dev, 1.0));
}
#[test]
fn test_calculate_variance_all_nan() {
    // With nothing but NaN there is no usable sample: same result as empty input.
    let stats = calculate_variance(&[f64::NAN, f64::NAN]);
    assert_eq!(stats, (0.0, 0.0, 0.0));
}
#[test]
fn test_calculate_variance_identical_values() {
    // Identical samples: the mean is the shared value and the spread is zero.
    let (mean, variance, std_dev) = calculate_variance(&[5.0, 5.0, 5.0]);
    assert!((mean - 5.0).abs() < 0.001);
    for spread in [variance, std_dev] {
        assert!(spread.abs() < 0.001);
    }
}
}

View File

@@ -0,0 +1,130 @@
//! Corpus-wide extraction survey: extract all documents and print stats.
//!
//! Replaces `crates/kreuzberg/tests/pdf_markdown_all_docs.rs`.
use crate::Result;
use crate::corpus::{self, CorpusFilter};
use std::path::PathBuf;
use std::time::Instant;
/// Survey configuration.
///
/// Tells the survey which fixtures directory to build the corpus from and
/// which file types to include.
#[derive(Debug, Clone)]
pub struct SurveyConfig {
    /// Directory the corpus is built from.
    pub fixtures_dir: PathBuf,
    /// File-type filter; forwarded unchanged to the corpus filter.
    pub file_types: Option<Vec<String>>,
}
/// Stats for one document.
///
/// The structural counts are line-based heuristics computed over the
/// extracted Markdown; they are survey indicators, not a Markdown parse.
#[derive(Debug, Clone)]
pub struct DocStats {
    /// Document name as reported by the corpus.
    pub name: String,
    /// File type as reported by the corpus.
    pub file_type: String,
    /// File size as reported by the corpus (displayed as KB by dividing by 1024).
    pub file_size: u64,
    /// Length of the extracted content in bytes (`String::len`); 0 on failure.
    pub content_length: usize,
    /// Number of content lines starting with `#`.
    pub heading_count: usize,
    /// Number of content lines that start and end with `|` and contain no `---`.
    pub table_row_count: usize,
    /// Number of content lines that look like bullet or numbered list items.
    pub list_item_count: usize,
    /// Wall-clock time spent on the extraction attempt, in milliseconds.
    pub extraction_ms: f64,
    /// Error or timeout message; `None` when extraction succeeded.
    pub error: Option<String>,
}
/// Run the survey: extract every document and collect stats.
///
/// Builds the corpus from `config.fixtures_dir` (filtered by
/// `config.file_types`), extracts each document as Markdown with a 180 second
/// timeout, and records one [`DocStats`] per document. Extraction failures and
/// timeouts do not abort the survey: they are stored in `DocStats::error` with
/// empty content. Progress is written to stderr.
///
/// # Errors
/// Returns an error only when the corpus itself cannot be built.
pub async fn run_survey(config: &SurveyConfig) -> Result<Vec<DocStats>> {
    let filter = CorpusFilter {
        file_types: config.file_types.clone(),
        ..Default::default()
    };
    let docs = corpus::build_corpus(&config.fixtures_dir, &filter)?;
    let total = docs.len();
    eprintln!("Survey: {} documents", total);

    let extraction_config = kreuzberg::ExtractionConfig {
        output_format: kreuzberg::core::config::OutputFormat::Markdown,
        ..Default::default()
    };

    let mut results = Vec::new();
    for (idx, doc) in docs.iter().enumerate() {
        eprint!("[{}/{}] {} ...", idx + 1, total, doc.name);

        let started = Instant::now();
        let outcome = tokio::time::timeout(
            std::time::Duration::from_secs(180),
            kreuzberg::extract_file(&doc.document_path, None, &extraction_config),
        )
        .await;
        let (content, error) = match outcome {
            Err(_) => (String::new(), Some("timeout (180s)".to_string())),
            Ok(Err(e)) => (String::new(), Some(e.to_string())),
            Ok(Ok(extracted)) => (extracted.content, None),
        };
        let extraction_ms = started.elapsed().as_secs_f64() * 1000.0;

        // Classify every line once; a line may count towards several categories.
        let mut heading_count = 0usize;
        let mut table_row_count = 0usize;
        let mut list_item_count = 0usize;
        for line in content.lines() {
            if line.starts_with('#') {
                heading_count += 1;
            }
            // Table rows are pipe-delimited; lines containing `---` are treated
            // as header separators and left out.
            if line.starts_with('|') && line.ends_with('|') && !line.contains("---") {
                table_row_count += 1;
            }
            let trimmed = line.trim_start();
            let is_bullet = ["- ", "* ", "+ "].iter().any(|marker| trimmed.starts_with(*marker));
            // Loose numbered-item check: starts with a digit and contains ". ".
            let is_numbered = trimmed.len() >= 3
                && trimmed.chars().next().is_some_and(|c| c.is_ascii_digit())
                && trimmed.contains(". ");
            if is_bullet || is_numbered {
                list_item_count += 1;
            }
        }

        eprintln!(" {:.0}ms", extraction_ms);
        results.push(DocStats {
            name: doc.name.clone(),
            file_type: doc.file_type.clone(),
            file_size: doc.file_size,
            content_length: content.len(),
            heading_count,
            table_row_count,
            list_item_count,
            extraction_ms,
            error,
        });
    }
    Ok(results)
}
/// Print survey stats table.
///
/// Writes a header, one row per document and a summary line (document count,
/// total extraction time in seconds, number of errors) to stderr. Document
/// names longer than 29 characters are truncated; rows whose `error` is set
/// are marked with `ERR`.
pub fn print_survey_table(results: &[DocStats]) {
    /// Maximum number of characters of a document name shown in a row.
    const NAME_CHARS: usize = 29;

    eprintln!(
        "{:<30} {:>6} {:>8} {:>8} {:>5} {:>6} {:>5} {:>8}",
        "Document", "Type", "Size KB", "Content", "Hdgs", "TRows", "Lists", "Time ms"
    );
    eprintln!("{}", "-".repeat(90));
    for s in results {
        let status = if s.error.is_some() { "ERR" } else { "" };
        // Cut on a character boundary. Slicing at a fixed byte offset panics
        // when that offset falls inside a multi-byte UTF-8 character.
        let name = s
            .name
            .char_indices()
            .nth(NAME_CHARS)
            .map_or(s.name.as_str(), |(end, _)| &s.name[..end]);
        // The time column is 8 wide so it lines up with the "Time ms" header.
        eprintln!(
            "{:<30} {:>6} {:>8.0} {:>8} {:>5} {:>6} {:>5} {:>8.0} {}",
            name,
            s.file_type,
            s.file_size as f64 / 1024.0,
            s.content_length,
            s.heading_count,
            s.table_row_count,
            s.list_item_count,
            s.extraction_ms,
            status,
        );
    }
    // Summary
    let n = results.len();
    let total_time: f64 = results.iter().map(|s| s.extraction_ms).sum();
    let errors = results.iter().filter(|s| s.error.is_some()).count();
    eprintln!("{}", "-".repeat(90));
    eprintln!(
        "Total: {} documents, {:.1}s extraction time, {} errors",
        n,
        total_time / 1000.0,
        errors
    );
}

View File

@@ -0,0 +1,408 @@
//! Core types for benchmark results and metrics
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::path::PathBuf;
use std::str::FromStr;
use std::time::Duration;
/// Output format for document extraction
///
/// Serialized by serde under the lowercase variant name (`rename_all = "lowercase"`).
/// `Markdown` is the `Default` variant.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Default, Serialize, Deserialize)]
#[serde(rename_all = "lowercase")]
pub enum OutputFormat {
/// Markdown output format with structure preservation
#[default]
Markdown,
/// Plain text output format
Plaintext,
}
impl std::fmt::Display for OutputFormat {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
OutputFormat::Markdown => write!(f, "markdown"),
OutputFormat::Plaintext => write!(f, "plaintext"),
}
}
}
impl FromStr for OutputFormat {
    type Err = String;

    /// Parses a format name case-insensitively.
    ///
    /// Accepts `markdown` / `md` and `plaintext` / `text` / `txt`; anything
    /// else yields an error message that echoes the input as given.
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        let normalized = s.to_lowercase();
        if matches!(normalized.as_str(), "markdown" | "md") {
            return Ok(OutputFormat::Markdown);
        }
        if matches!(normalized.as_str(), "plaintext" | "text" | "txt") {
            return Ok(OutputFormat::Plaintext);
        }
        Err(format!(
            "unknown output format: {}. Valid: markdown, md, plaintext, text, txt",
            s
        ))
    }
}
/// Default output format for backward compatibility with old results
///
/// Used as the serde `default` for `BenchmarkResult::output_format`, so result
/// files that lack the field deserialize as Markdown.
fn default_output_format() -> OutputFormat {
OutputFormat::Markdown
}
/// Kreuzberg extraction pipeline variant
///
/// Serialized by serde under the lowercase variant name, except `PaddleOcr`,
/// which is explicitly renamed to `paddle-ocr`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "lowercase")]
pub enum KreuzbergPipeline {
/// Baseline: text extraction without layout or OCR
Baseline,
/// Layout: layout detection and structure preservation
Layout,
/// PaddleOCR: OCR with PaddleOCR backend
#[serde(rename = "paddle-ocr")]
PaddleOcr,
}
impl KreuzbergPipeline {
/// Get the string representation of the pipeline
pub fn as_str(self) -> &'static str {
match self {
KreuzbergPipeline::Baseline => "baseline",
KreuzbergPipeline::Layout => "layout",
KreuzbergPipeline::PaddleOcr => "paddle-ocr",
}
}
}
impl std::fmt::Display for KreuzbergPipeline {
    /// Writes the canonical pipeline name returned by `as_str`.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let name = self.as_str();
        f.write_str(name)
    }
}
impl FromStr for KreuzbergPipeline {
    type Err = String;

    /// Parses a pipeline name case-insensitively.
    ///
    /// `paddle-ocr` is also accepted as `paddle_ocr` and `paddleocr`; anything
    /// unrecognized yields an error message that echoes the input as given.
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        let pipeline = match s.to_lowercase().as_str() {
            "baseline" => KreuzbergPipeline::Baseline,
            "layout" => KreuzbergPipeline::Layout,
            "paddle-ocr" | "paddle_ocr" | "paddleocr" => KreuzbergPipeline::PaddleOcr,
            _ => {
                return Err(format!(
                    "unknown Kreuzberg pipeline: {}. Valid: baseline, layout, paddle-ocr",
                    s
                ));
            }
        };
        Ok(pipeline)
    }
}
/// OCR usage status for a benchmark extraction
///
/// Serialized in snake_case (`used`, `not_used`, `unknown`); `Unknown` is the
/// default, which is also what a missing field deserializes to wherever the
/// field is marked `#[serde(default)]`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Default)]
#[serde(rename_all = "snake_case")]
pub enum OcrStatus {
/// OCR was used for this extraction
Used,
/// OCR was not used for this extraction
NotUsed,
/// Unknown whether OCR was used
#[default]
Unknown,
}
/// Categorizes the source of a benchmark error.
///
/// This distinction is critical: framework errors are the framework's fault
/// (e.g. pdfplumber can't parse a malformed PDF), while harness errors are
/// our fault (e.g. timeout, process crash, invalid output format).
///
/// Serialized in snake_case; `None` is the default variant.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Default)]
#[serde(rename_all = "snake_case")]
pub enum ErrorKind {
/// The framework itself reported an extraction error (returned `{"error": "..."}`)
/// This is NOT our fault - the framework couldn't handle this file.
FrameworkError,
/// A harness-level error: process crash, invalid JSON output, etc.
/// This IS potentially our fault or an infrastructure issue.
HarnessError,
/// Extraction timed out (exceeded the configured timeout duration).
Timeout,
/// Framework returned empty or missing content (ran but produced nothing).
EmptyContent,
/// No error occurred
#[default]
None,
}
/// Complete benchmark result for a single file extraction
///
/// Fields marked `#[serde(default)]` may be absent from result files and then
/// take their default value on deserialization.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BenchmarkResult {
/// Framework that performed the extraction
pub framework: String,
/// Output format used for extraction (markdown or plaintext)
/// Falls back to `default_output_format()` when absent from the input.
#[serde(default = "default_output_format")]
pub output_format: OutputFormat,
/// Path to the test document
pub file_path: PathBuf,
/// File size in bytes
pub file_size: u64,
/// Whether extraction succeeded
pub success: bool,
/// Error message if extraction failed
pub error_message: Option<String>,
/// Categorizes the error source (framework vs harness)
#[serde(default)]
pub error_kind: ErrorKind,
/// Total wall-clock duration (process spawn + extraction)
/// For single iteration: the actual duration
/// For multiple iterations: mean duration across all iterations
pub duration: Duration,
/// Pure extraction time (reported by subprocess via _extraction_time_ms)
/// Only available for external frameworks with internal timing
pub extraction_duration: Option<Duration>,
/// Subprocess overhead (duration - extraction_duration)
/// Only available when extraction_duration is present
pub subprocess_overhead: Option<Duration>,
/// Performance metrics (averaged across iterations if multiple)
pub metrics: PerformanceMetrics,
/// Quality metrics (if ground truth available)
pub quality: Option<QualityMetrics>,
/// Individual iteration results (empty for single iteration)
pub iterations: Vec<IterationResult>,
/// Statistical analysis of durations across iterations
/// Only present when multiple iterations were run
pub statistics: Option<DurationStatistics>,
/// Cold start duration: Time from framework not loaded to ready and warm state
/// This is measured during the first warmup extraction and represents the
/// initial framework load time (imports, initializations, etc.)
pub cold_start_duration: Option<Duration>,
/// File extension without dot (e.g., "pdf", "docx")
/// Extracted from file_path for per-extension analysis
pub file_extension: String,
/// Framework capability metadata at time of extraction
/// Contains OCR support, batch support, async support flags
pub framework_capabilities: FrameworkCapabilities,
/// PDF-specific metadata (only present for PDF files)
/// Includes text layer detection results and OCR strategy
pub pdf_metadata: Option<PdfMetadata>,
/// OCR usage status for this extraction
#[serde(default)]
pub ocr_status: OcrStatus,
/// Extracted text content (for quality assessment)
/// Not serialized to output JSON to save space
/// (`#[serde(skip)]` also means it is `None` after deserialization).
#[serde(skip)]
pub extracted_text: Option<String>,
}
impl BenchmarkResult {
    /// Build the key that identifies a framework / output format / execution
    /// mode combination.
    ///
    /// The three parts are joined with `:` in that order, e.g.
    /// `kreuzberg-rust:markdown:batch`.
    pub fn framework_key(&self, execution_mode: &str) -> String {
        let Self {
            framework,
            output_format,
            ..
        } = self;
        format!("{framework}:{output_format}:{execution_mode}")
    }
}
/// Performance metrics collected during extraction
///
/// Memory figures are in bytes, CPU usage is a percentage and throughput is
/// in bytes per second.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PerformanceMetrics {
/// Peak memory usage in bytes
pub peak_memory_bytes: u64,
/// Average CPU usage percentage (0-100)
pub avg_cpu_percent: f64,
/// Throughput in bytes per second
pub throughput_bytes_per_sec: f64,
/// 50th percentile memory usage in bytes
pub p50_memory_bytes: u64,
/// 95th percentile memory usage in bytes
pub p95_memory_bytes: u64,
/// 99th percentile memory usage in bytes
pub p99_memory_bytes: u64,
}
/// Quality metrics comparing extraction output to ground truth
///
/// The optional layout score and the two token lists are omitted from the
/// serialized output when absent/empty and default to `None`/empty when
/// missing from the input.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct QualityMetrics {
/// Text token F1 score (0.0-1.0)
pub f1_score_text: f64,
/// Numeric token F1 score (0.0-1.0)
pub f1_score_numeric: f64,
/// Layout/structure F1 score (0.0-1.0), optional for plaintext mode
#[serde(default, skip_serializing_if = "Option::is_none")]
pub f1_score_layout: Option<f64>,
/// Overall text quality score (0.0-1.0)
pub quality_score: f64,
/// Tokens in ground truth but missing/under-represented in extraction (recall misses).
/// Each entry is (token, deficit_count). Sorted by count descending.
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub missing_tokens: Vec<(String, usize)>,
/// Tokens in extraction but not in ground truth or over-represented (precision misses).
/// Each entry is (token, surplus_count). Sorted by count descending.
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub extra_tokens: Vec<(String, usize)>,
/// Whether the extraction is considered correct (quality_score >= 0.95).
/// Defaults to `false` when absent from the input.
#[serde(default)]
pub correct: bool,
}
/// Framework capability metadata
///
/// Records the capabilities of the framework at the time of extraction,
/// enabling proper analysis and comparison of results based on framework features.
///
/// Every field is `#[serde(default)]`, so any of them may be missing from the
/// input; `Default` yields empty lists, `false` flags, an empty version string
/// and no installation size.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct FrameworkCapabilities {
/// Extensions this framework supports (e.g., ["pdf", "docx"])
#[serde(default)]
pub supported_extensions: Vec<String>,
/// Whether framework supports OCR
#[serde(default)]
pub ocr_support: bool,
/// Whether framework supports batch processing
#[serde(default)]
pub batch_support: bool,
/// Whether framework supports async extraction
#[serde(default)]
pub async_support: bool,
/// Output formats this framework supports
#[serde(default)]
pub supported_output_formats: Vec<OutputFormat>,
/// Framework version
#[serde(default)]
pub version: String,
/// Disk installation size (if known)
#[serde(default)]
pub installation_size: Option<DiskSizeInfo>,
}
/// Serde `skip_serializing_if` predicate: true when the value is zero.
///
/// Takes a reference because serde passes the field by reference.
fn is_zero_u64(v: &u64) -> bool {
    matches!(*v, 0)
}
/// Disk installation size information for a framework
///
/// All sizes are in bytes. `size_bytes`, `method` and `description` are
/// required on deserialization; the remaining fields default to zero/empty.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DiskSizeInfo {
/// Total size in bytes (package + system deps)
pub size_bytes: u64,
/// Package-only size in bytes (before adding system deps)
#[serde(default)]
pub package_bytes: u64,
/// System dependency size in bytes (libreoffice, tesseract, ffmpeg, etc.)
#[serde(default)]
pub system_deps_bytes: u64,
/// ML model size in bytes (auto-downloaded on first use)
/// Omitted from the serialized output when zero.
#[serde(default, skip_serializing_if = "is_zero_u64")]
pub model_bytes: u64,
/// Measurement method (e.g., "binary_size", "pip_package", "npm_package")
pub method: String,
/// Human-readable description
pub description: String,
/// Breakdown of system dependency sizes by package name
/// Keys are package names (e.g., "poppler-utils"), values are installed sizes in bytes.
/// Only populated when runtime measurement via dpkg-query succeeds.
/// Omitted from the serialized output when empty.
#[serde(default, skip_serializing_if = "HashMap::is_empty")]
pub system_deps_detail: HashMap<String, u64>,
}
/// PDF-specific metadata
///
/// Contains PDF text layer detection results and OCR strategy used.
/// Only populated for PDF documents.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PdfMetadata {
/// Whether PDF has a quality text layer
/// Detected via pdftotext/pdffonts/pypdf
pub has_text_layer: bool,
/// Detection method used ("pdftotext", "pdffonts", "pypdf", "fallback")
pub detection_method: String,
/// Number of pages in the PDF
/// (optional; `None` when no page count is recorded)
pub page_count: Option<u32>,
/// Whether OCR was enabled for this extraction
pub ocr_enabled: bool,
/// Text extraction quality hint (0.0-1.0)
/// 0.0 = scanned image, 1.0 = native text
pub text_quality_score: Option<f64>,
}
/// Result from a single benchmark iteration
///
/// Stored in `BenchmarkResult::iterations`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct IterationResult {
/// Iteration number (0-indexed)
pub iteration: usize,
/// Total wall-clock duration for this iteration
pub duration: Duration,
/// Pure extraction time (if available from subprocess)
pub extraction_duration: Option<Duration>,
/// Performance metrics for this iteration
pub metrics: PerformanceMetrics,
}
/// Statistical analysis of durations across multiple iterations
///
/// Every figure is a `Duration` except the standard deviation, which is a
/// plain `f64` number of milliseconds.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DurationStatistics {
/// Mean duration
pub mean: Duration,
/// Median duration
pub median: Duration,
/// Standard deviation (in milliseconds as f64)
pub std_dev_ms: f64,
/// Minimum duration
pub min: Duration,
/// Maximum duration
pub max: Duration,
/// 95th percentile duration
pub p95: Duration,
/// 99th percentile duration
pub p99: Duration,
/// Number of iterations included in statistics
pub sample_count: usize,
}

View File

@@ -0,0 +1,488 @@
//! Ground truth validation and HTML-to-GFM cleanup
//!
//! Replaces the Python scripts `validate_ground_truth.py` and `cleanup_html_in_gt.py`
//! with a single Rust module that can report HTML issues and optionally fix them in-place.
use crate::{Fixture, Result};
use regex::Regex;
use std::path::{Path, PathBuf};
/// Configuration for the validate-gt subcommand.
///
/// Consumed by [`validate_ground_truth`].
pub struct ValidateGtConfig {
/// Directory containing fixture JSON files.
/// It is searched recursively for `*.json` files.
pub fixtures_dir: PathBuf,
/// When true, auto-convert HTML tags to GFM markdown in-place.
pub fix: bool,
}
/// Summary report produced by [`validate_ground_truth`].
///
/// Paths in the list fields are shown relative to the fixtures directory when
/// they lie inside it, otherwise as given.
pub struct ValidateGtReport {
/// Number of fixture JSON files that were loaded successfully.
pub total_fixtures: usize,
/// Fixtures whose referenced text GT file exists on disk.
pub with_text_gt: usize,
/// Fixtures whose referenced markdown GT file exists on disk.
pub with_markdown_gt: usize,
/// Fixtures counted as lacking a text GT file.
pub missing_text_gt: usize,
/// Fixtures counted as lacking a markdown GT file.
pub missing_markdown_gt: usize,
/// Files smaller than 10 bytes: (relative path, size).
pub small_gt_files: Vec<(String, u64)>,
/// Markdown GT files containing HTML: (path, list of tags found).
pub html_issues: Vec<(String, Vec<String>)>,
/// Number of fixes applied (only non-zero when `--fix` is used).
pub fixes_applied: usize,
/// GT files containing noise issues (Warning or Error severity): (path, issue_count).
pub noisy_gt_files: Vec<(String, usize)>,
/// GT files with low block diversity (no headings for files > 100 bytes).
pub low_diversity_gt: Vec<String>,
}
// ---------------------------------------------------------------------------
// HTML detection
// ---------------------------------------------------------------------------
/// Common HTML tags that should not appear in GFM ground truth.
///
/// The names are turned into a regex alternation by `html_tag_regex`. The
/// `"a "` entry carries a trailing space and is given special treatment there.
///
/// NOTE(review): `hr` is converted by `convert_html_to_gfm` but is not listed
/// here, so a stray `<hr>` is not detected on its own — confirm whether that
/// is intended.
const HTML_TAG_NAMES: &[&str] = &[
"table", "tr", "td", "th", "b", "strong", "i", "em", "div", "span", "p", "br", "a ", "code", "pre", "img", "sup",
"sub", "ul", "ol", "li", "h1", "h2", "h3", "h4", "h5", "h6",
];
/// Build a regex that matches opening, closing or self-closing HTML tags for
/// the names listed in [`HTML_TAG_NAMES`].
///
/// The pattern is case-insensitive and has the shape
/// `</?(?:name1|name2|…)(?:\s[^>]*)?\s*/?>`: after the tag name only
/// whitespace (optionally followed by attributes), `/` or `>` may follow.
/// That tail already prevents a short name from matching a longer tag, so
/// `a` cannot match `<abbr>` and needs no special handling.
///
/// # Panics
/// Panics if the assembled pattern is not a valid regex, which would be a bug
/// in this function or in [`HTML_TAG_NAMES`].
fn html_tag_regex() -> Regex {
    // Build alternation: `table|tr|td|…|h6`.
    // Entries are trimmed so the historical `"a "` entry becomes plain `a`.
    // Encoding it as `a\s` consumed the whitespace that the attribute group
    // below needs, so `<a href="…">`, `<a>` and `</a>` were never matched.
    let alts: Vec<String> = HTML_TAG_NAMES
        .iter()
        .map(|name| regex::escape(name.trim_end()))
        .collect();
    let pattern = format!(r"(?i)</?(?:{})(?:\s[^>]*)?\s*/?>", alts.join("|"));
    Regex::new(&pattern).expect("invalid HTML tag regex")
}
/// Strip content inside fenced code blocks so we don't flag code examples.
///
/// Uses a line-by-line scanner because the `regex` crate does not support
/// backreferences needed to match opening/closing fences.
///
/// A fence opens on a line whose first non-whitespace characters are three or
/// more backticks or tildes. It closes on a line that, ignoring surrounding
/// whitespace, consists only of the same fence character repeated at least as
/// many times as in the opening fence (the CommonMark rule). The fence lines
/// and everything between them are dropped; an unclosed fence drops the rest
/// of the input.
///
/// Every kept line is written back followed by `\n`, so non-empty output
/// always ends with a newline.
fn strip_fenced_code_blocks(text: &str) -> String {
    let mut result = String::with_capacity(text.len());
    // `Some((fence character, opening fence length))` while inside a block.
    let mut open_fence: Option<(char, usize)> = None;

    for line in text.lines() {
        let trimmed = line.trim_start();

        if let Some((fence_char, fence_len)) = open_fence {
            let candidate = trimmed.trim_end();
            let run = candidate.chars().take_while(|&c| c == fence_char).count();
            // Fence characters are single-byte, so `run == candidate.len()`
            // means the line holds nothing but fence characters.
            if run >= fence_len && run == candidate.len() {
                open_fence = None;
            }
            // Skip all lines inside the fence, including the closing one.
            continue;
        }

        // Check for an opening fence: ``` or ~~~ (3+ chars).
        let fence_char = if trimmed.starts_with("```") {
            Some('`')
        } else if trimmed.starts_with("~~~") {
            Some('~')
        } else {
            None
        };
        if let Some(fence_char) = fence_char {
            let fence_len = trimmed.chars().take_while(|&c| c == fence_char).count();
            open_fence = Some((fence_char, fence_len));
            continue;
        }

        result.push_str(line);
        result.push('\n');
    }
    result
}
/// Strip inline code spans.
///
/// Removes every run of the form `` `…` `` (a backtick, one or more
/// non-backtick characters, a backtick) from `text`.
fn strip_inline_code(text: &str) -> String {
    Regex::new(r"`[^`]+`")
        .expect("inline code regex")
        .replace_all(text, "")
        .into_owned()
}
/// Detect HTML tags in a markdown string, returning the list of matched tags.
///
/// Fenced code blocks are removed first, then inline code spans, so HTML that
/// only appears inside code is not reported. Each entry is the matched tag
/// text, in order of appearance; duplicates are kept.
pub fn detect_html_tags(content: &str) -> Vec<String> {
    let without_fences = strip_fenced_code_blocks(content);
    let prose_only = strip_inline_code(&without_fences);
    let tag_re = html_tag_regex();

    let mut found = Vec::new();
    for hit in tag_re.find_iter(&prose_only) {
        found.push(hit.as_str().to_string());
    }
    found
}
// ---------------------------------------------------------------------------
// HTML-to-GFM conversion
// ---------------------------------------------------------------------------
/// Convert common HTML tags to their GFM equivalents.
///
/// This intentionally does **not** attempt to convert `<table>` blocks — those
/// are complex and should be flagged in report mode instead.
///
/// Rules are applied one after another, each to the output of the previous
/// one. Returns the converted text together with the total number of matches
/// that were replaced across all rules.
pub fn convert_html_to_gfm(content: &str) -> (String, usize) {
    /// Apply one regex substitution in place and return how many matches it replaced.
    fn substitute(text: &mut String, pattern: &str, replacement: &str) -> usize {
        let re = Regex::new(pattern).expect("regex");
        let hits = re.find_iter(text.as_str()).count();
        if hits > 0 {
            *text = re.replace_all(text.as_str(), replacement).into_owned();
        }
        hits
    }

    /// Inline rules that run before `<pre>` handling, in this order.
    const INLINE_RULES: [(&str, &str); 8] = [
        // <b>text</b> or <strong>text</strong> → **text**
        (r"(?is)<(?:b|strong)>(.*?)</(?:b|strong)>", "**$1**"),
        // <i>text</i> or <em>text</em> → *text*
        (r"(?is)<(?:i|em)>(.*?)</(?:i|em)>", "*$1*"),
        // <code>text</code> → `text`
        (r"(?is)<code>(.*?)</code>", "`$1`"),
        // <a href="url">text</a> → [text](url)
        (
            r#"(?is)<a\s+(?:[^>]*\s+)?href=["']([^"']*)["'][^>]*>(.*?)</a>"#,
            "[$2]($1)",
        ),
        // <br>, <br/>, <br /> → newline
        (r"(?i)<br\s*/?>", "\n"),
        // <hr>, <hr/>, <hr /> → ---
        (r"(?i)<hr\s*/?>", "---"),
        // <sup>text</sup> → text (no GFM equivalent)
        (r"(?is)<sup>(.*?)</sup>", "$1"),
        // <sub>text</sub> → text
        (r"(?is)<sub>(.*?)</sub>", "$1"),
    ];

    /// Wrapper tags removed after `<pre>` handling; their content is kept.
    const WRAPPER_RULES: [(&str, &str); 3] = [
        (r"(?i)</?div(?:\s[^>]*)?>", ""),
        (r"(?i)</?span(?:\s[^>]*)?>", ""),
        (r"(?i)</?p(?:\s[^>]*)?>", ""),
    ];

    let mut text = content.to_string();
    let mut count: usize = 0;

    for (pattern, replacement) in INLINE_RULES {
        count += substitute(&mut text, pattern, replacement);
    }

    // <pre>text</pre> → fenced code block (inner text trimmed)
    let pre_re = Regex::new(r"(?is)<pre>(.*?)</pre>").expect("pre regex");
    let pre_hits = pre_re.find_iter(&text).count();
    if pre_hits > 0 {
        text = pre_re
            .replace_all(&text, |caps: &regex::Captures| {
                let inner = caps[1].trim();
                format!("```\n{}\n```", inner)
            })
            .into_owned();
        count += pre_hits;
    }

    for (pattern, replacement) in WRAPPER_RULES {
        count += substitute(&mut text, pattern, replacement);
    }

    (text, count)
}
// ---------------------------------------------------------------------------
// Main validation entry point
// ---------------------------------------------------------------------------
/// Walk fixture JSON files, resolve GT paths, and produce a validation report.
///
/// When `config.fix` is true, HTML tags in markdown GT files are auto-converted
/// to GFM equivalents in-place.
///
/// Fixture files that fail to load are reported on stderr and skipped; they
/// are not included in `total_fixtures`. For every loaded fixture, each of the
/// text and markdown GT is counted exactly once as either present or missing.
/// A GT counts as missing when the fixture has no ground truth at all, names
/// no file of that kind, or names a file that does not exist.
///
/// # Errors
/// Returns an error when the fixtures directory does not exist or cannot be
/// read.
pub fn validate_ground_truth(config: &ValidateGtConfig) -> Result<ValidateGtReport> {
    let mut report = ValidateGtReport {
        total_fixtures: 0,
        with_text_gt: 0,
        with_markdown_gt: 0,
        missing_text_gt: 0,
        missing_markdown_gt: 0,
        small_gt_files: Vec::new(),
        html_issues: Vec::new(),
        fixes_applied: 0,
        noisy_gt_files: Vec::new(),
        low_diversity_gt: Vec::new(),
    };
    let fixture_files = collect_json_files(&config.fixtures_dir)?;
    for fixture_path in &fixture_files {
        let fixture = match Fixture::from_file(fixture_path) {
            Ok(f) => f,
            Err(e) => {
                eprintln!("Warning: failed to load fixture {}: {}", fixture_path.display(), e);
                continue;
            }
        };
        report.total_fixtures += 1;
        let Some(gt) = &fixture.ground_truth else {
            report.missing_text_gt += 1;
            report.missing_markdown_gt += 1;
            continue;
        };
        // Resolve paths relative to the fixture file's parent directory.
        let fixture_dir = fixture_path.parent().unwrap_or(Path::new("."));
        // --- text GT ---
        if let Some(ref tf) = gt.text_file {
            let text_path = fixture_dir.join(tf);
            if text_path.exists() {
                report.with_text_gt += 1;
                check_small_file(&text_path, &config.fixtures_dir, &mut report);
            } else {
                report.missing_text_gt += 1;
            }
        } else {
            report.missing_text_gt += 1;
        }
        // --- markdown GT ---
        if let Some(md_rel) = &gt.markdown_file {
            let md_path = fixture_dir.join(md_rel);
            if md_path.exists() {
                report.with_markdown_gt += 1;
                check_small_file(&md_path, &config.fixtures_dir, &mut report);
                check_html_in_markdown(&md_path, config.fix, &mut report);
                check_noise_in_markdown(&md_path, &config.fixtures_dir, &mut report);
                check_block_diversity(&md_path, &config.fixtures_dir, &mut report);
            } else {
                report.missing_markdown_gt += 1;
            }
        } else {
            // Mirror the text GT branch: a fixture that names no markdown GT
            // file is missing one, so present + missing adds up to the total.
            report.missing_markdown_gt += 1;
        }
    }
    Ok(report)
}
// ---------------------------------------------------------------------------
// Helpers
// ---------------------------------------------------------------------------
/// Recursively collect `*.json` files under `dir`.
fn collect_json_files(dir: &Path) -> Result<Vec<PathBuf>> {
let mut files = Vec::new();
if !dir.is_dir() {
return Err(crate::Error::Config(format!(
"Fixtures directory does not exist: {}",
dir.display()
)));
}
collect_json_recursive(dir, &mut files)?;
files.sort();
Ok(files)
}
fn collect_json_recursive(dir: &Path, out: &mut Vec<PathBuf>) -> Result<()> {
for entry in std::fs::read_dir(dir).map_err(crate::Error::Io)? {
let entry = entry.map_err(crate::Error::Io)?;
let path = entry.path();
if path.is_dir() {
collect_json_recursive(&path, out)?;
} else if path.extension().is_some_and(|ext| ext == "json") {
out.push(path);
}
}
Ok(())
}
/// Warn if a GT file is suspiciously small (<10 bytes).
fn check_small_file(path: &Path, base: &Path, report: &mut ValidateGtReport) {
if let Ok(meta) = std::fs::metadata(path)
&& meta.len() < 10
{
let display = path.strip_prefix(base).unwrap_or(path).display().to_string();
report.small_gt_files.push((display, meta.len()));
}
}
/// Check a markdown GT file for noise issues (Warning or Error severity).
fn check_noise_in_markdown(path: &Path, base: &Path, report: &mut ValidateGtReport) {
let Ok(content) = std::fs::read_to_string(path) else {
return;
};
let diagnostic = crate::noise_detection::detect_noise(&content);
let serious_count = diagnostic
.issues
.iter()
.filter(|issue| {
matches!(
issue.severity,
crate::noise_detection::Severity::Warning | crate::noise_detection::Severity::Error
)
})
.count();
if serious_count > 0 {
let display = path.strip_prefix(base).unwrap_or(path).display().to_string();
report.noisy_gt_files.push((display, serious_count));
}
}
/// Record markdown ground-truth files larger than 100 bytes that contain no
/// heading block in `report.low_diversity_gt`.
///
/// Files of 100 bytes or fewer are exempt. Files whose metadata or contents
/// cannot be read are skipped without being reported. The recorded path is
/// relative to `base` when `path` lies under it.
fn check_block_diversity(path: &Path, base: &Path, report: &mut ValidateGtReport) {
    let size = match std::fs::metadata(path) {
        Ok(meta) => meta.len(),
        Err(_) => return,
    };
    if size <= 100 {
        return;
    }

    let content = match std::fs::read_to_string(path) {
        Ok(text) => text,
        Err(_) => return,
    };

    let blocks = crate::markdown_quality::parse_markdown_blocks(&content);
    if blocks.iter().any(|block| block.block_type.is_heading()) {
        return;
    }

    let relative = path.strip_prefix(base).unwrap_or(path);
    report.low_diversity_gt.push(relative.display().to_string());
}
/// Check a markdown GT file for HTML tags; optionally fix in-place.
///
/// When `detect_html_tags` finds anything, the file's path (as given, not made
/// relative) and the detected tags are appended to `report.html_issues`.
///
/// If `fix` is set, the content is passed through `convert_html_to_gfm` and,
/// when that reports at least one change and the text actually differs, the
/// file is overwritten. `report.fixes_applied` grows by the reported change
/// count only when the write succeeds; a failed write is surfaced as a warning
/// on stderr instead of being dropped silently.
///
/// Unreadable files are skipped without being reported.
fn check_html_in_markdown(path: &Path, fix: bool, report: &mut ValidateGtReport) {
    let Ok(content) = std::fs::read_to_string(path) else {
        return;
    };

    let tags = detect_html_tags(&content);
    if tags.is_empty() {
        return;
    }
    // The issue is recorded even when a fix is applied below, so the report
    // reflects what was found on disk at the start of the run.
    report.html_issues.push((path.display().to_string(), tags));

    if !fix {
        return;
    }

    let (converted, n) = convert_html_to_gfm(&content);
    if n > 0 && converted != content {
        match std::fs::write(path, &converted) {
            Ok(()) => report.fixes_applied += n,
            // Best-effort: keep validating other files, but tell the user the
            // requested fix did not reach the disk.
            Err(e) => eprintln!(
                "Warning: failed to write fixed markdown {}: {}",
                path.display(),
                e
            ),
        }
    }
}
// ---------------------------------------------------------------------------
// Tests
// ---------------------------------------------------------------------------
#[cfg(test)]
mod tests {
    use super::*;

    /// Assert that `input` converts to `expected`, ignoring the change count.
    #[track_caller]
    fn assert_gfm(input: &str, expected: &str) {
        let (output, _) = convert_html_to_gfm(input);
        assert_eq!(output, expected);
    }

    /// Assert that `input` converts to `expected` and that the converter
    /// reported at least one change.
    #[track_caller]
    fn assert_gfm_changed(input: &str, expected: &str) {
        let (output, changes) = convert_html_to_gfm(input);
        assert_eq!(output, expected);
        assert!(changes > 0);
    }

    #[test]
    fn test_html_tag_detection() {
        let found =
            detect_html_tags("<b>bold</b> and <i>italic</i> and <table><tr><td>cell</td></tr></table>");
        assert!(!found.is_empty(), "should detect HTML tags");

        // Expected hits: <b>, </b>, <i>, </i>, <table>, <tr>, <td>, </td>, </tr>, </table>
        let has_tag_containing = |needle: &str| found.iter().any(|tag| tag.contains(needle));
        assert!(has_tag_containing("b>"), "should detect <b>");
        assert!(has_tag_containing("table"), "should detect <table>");
    }

    #[test]
    fn test_html_tag_detection_skips_code_blocks() {
        // Tags inside a fenced block or an inline code span must be ignored.
        let markdown = "```\n<b>not a tag</b>\n```\noutside `<i>also not</i>` here";
        let found = detect_html_tags(markdown);
        assert!(
            found.is_empty(),
            "should not detect tags inside code blocks or inline code"
        );
    }

    #[test]
    fn test_html_to_gfm_bold() {
        assert_gfm_changed("<b>text</b>", "**text**");
        assert_gfm("<strong>text</strong>", "**text**");
    }

    #[test]
    fn test_html_to_gfm_italic() {
        assert_gfm_changed("<i>text</i>", "*text*");
        assert_gfm("<em>text</em>", "*text*");
    }

    #[test]
    fn test_html_to_gfm_link() {
        assert_gfm_changed(
            r#"<a href="https://example.com">text</a>"#,
            "[text](https://example.com)",
        );
    }

    #[test]
    fn test_html_to_gfm_code() {
        assert_gfm_changed("<code>text</code>", "`text`");
    }

    #[test]
    fn test_html_to_gfm_br() {
        // All three spellings of the line-break tag map to a newline.
        assert_gfm_changed("line1<br>line2", "line1\nline2");
        assert_gfm("line1<br/>line2", "line1\nline2");
        assert_gfm("line1<br />line2", "line1\nline2");
    }

    #[test]
    fn test_strip_div_span() {
        assert_gfm_changed("<div>text</div>", "text");
        assert_gfm("<span>text</span>", "text");
    }

    #[test]
    fn test_html_to_gfm_pre() {
        assert_gfm_changed("<pre>some code</pre>", "```\nsome code\n```");
    }

    #[test]
    fn test_html_to_gfm_hr() {
        assert_gfm_changed("<hr>", "---");
    }

    #[test]
    fn test_html_to_gfm_sup_sub() {
        assert_gfm("<sup>text</sup>", "text");
        assert_gfm("<sub>text</sub>", "text");
    }
}