use criterion::{Criterion, criterion_group, criterion_main}; use kreuzberg::text::quality::calculate_quality_score; use std::hint::black_box; // ~1 KiB of typical paragraph text — no script or style content. fn corpus_clean_1kib() -> String { let paragraph = "The document processing pipeline extracts structured content from a wide \ variety of file formats including PDF, Office documents, plain text, and HTML. \ Quality scoring evaluates the extracted text along several dimensions: OCR \ artifact density, presence of script or style noise, navigational boilerplate, \ and structural coherence measured by sentence and paragraph metrics. "; paragraph.repeat(3) } // ~64 KiB — paragraph text interleaved with script/style noise blocks. fn corpus_noisy_64kib() -> String { let para = "This is a representative paragraph of body text extracted from a web page. \ It contains normal prose with proper punctuation and sentence boundaries. \ The quality scorer should assign this a high structural bonus. "; // ~1 KiB JS function body let js_block = |n: usize| -> String { format!( "\n", n ) }; // ~512 B CSS block let css_block = |n: usize| -> String { format!( "\n" ) }; // Naked JS function chunk (triggers JS_FUNCTION_PATTERN) let js_func_chunk = |n: usize| -> String { format!( "\nfunction renderSection{}(element, data) {{ return element.innerHTML = data; }}\n", n ) }; let mut buf = String::with_capacity(66_000); // 10 script blocks each ~1 KiB for i in 0..10 { buf.push_str(¶.repeat(6)); // ~1.2 KiB prose between blocks buf.push_str(&js_block(i)); } // 5 style blocks each ~512 B for i in 0..5 { buf.push_str(¶.repeat(4)); buf.push_str(&css_block(i)); } // 3 naked JS function chunks for i in 0..3 { buf.push_str(¶.repeat(3)); buf.push_str(&js_func_chunk(i)); } // Pad to ~64 KiB while buf.len() < 64 * 1024 { buf.push_str(para); } buf.truncate(64 * 1024); buf } // ~1 MiB — same ratio as 64 KiB but 16× larger. This is the case that hits the backtracker. fn corpus_noisy_1mib() -> String { let para = "This is a representative paragraph of body text extracted from a web page. \ It contains normal prose with proper punctuation and sentence boundaries. \ The quality scorer should assign this a high structural bonus. "; let js_block = |n: usize| -> String { format!( "\n", n ) }; let css_block = |n: usize| -> String { format!( "\n" ) }; let js_func_chunk = |n: usize| -> String { format!( "\nfunction renderSection{}(element, data) {{ return element.innerHTML = data; }}\n", n ) }; let mut buf = String::with_capacity(1_100_000); // Scale up by 16×: 160 script blocks, 80 style blocks, 48 function chunks for i in 0..160 { buf.push_str(¶.repeat(6)); buf.push_str(&js_block(i)); } for i in 0..80 { buf.push_str(¶.repeat(4)); buf.push_str(&css_block(i)); } for i in 0..48 { buf.push_str(¶.repeat(3)); buf.push_str(&js_func_chunk(i)); } while buf.len() < 1024 * 1024 { buf.push_str(para); } buf.truncate(1024 * 1024); buf } fn bench_quality_clean_1kib(criterion: &mut Criterion) { let text = corpus_clean_1kib(); criterion.bench_function("quality_clean_1kib", |b| { b.iter(|| calculate_quality_score(black_box(&text), black_box(None))) }); } fn bench_quality_noisy_64kib(criterion: &mut Criterion) { let text = corpus_noisy_64kib(); criterion.bench_function("quality_noisy_64kib", |b| { b.iter(|| calculate_quality_score(black_box(&text), black_box(None))) }); } fn bench_quality_noisy_1mib(criterion: &mut Criterion) { let text = corpus_noisy_1mib(); criterion.bench_function("quality_noisy_1mib", |b| { b.iter(|| calculate_quality_score(black_box(&text), black_box(None))) }); } criterion_group!( benches, bench_quality_clean_1kib, bench_quality_noisy_64kib, bench_quality_noisy_1mib, ); criterion_main!(benches);