//! Corpus discovery and filtering for benchmark documents.
//!
//! Builds on the existing [`FixtureManager`] to provide structured corpus access
//! with filtering by file type, ground truth availability, and name patterns.

use crate::Result;
use crate::fixture::FixtureManager;
use std::path::{Path, PathBuf};

/// A document in the benchmark corpus with resolved paths.
#[derive(Debug, Clone)]
pub struct CorpusDocument {
    /// Human-readable name (fixture stem, e.g. "nougat_001")
    pub name: String,
    /// Absolute path to the source document
    pub document_path: PathBuf,
    /// File type (e.g. "pdf", "docx")
    pub file_type: String,
    /// File size in bytes
    pub file_size: u64,
    /// Absolute path to text ground truth (if available)
    pub ground_truth_text: Option<PathBuf>,
    /// Absolute path to markdown ground truth (if available)
    pub ground_truth_markdown: Option<PathBuf>,
}

/// Filter criteria for corpus discovery.
///
/// Every criterion that is set must be satisfied for a fixture to be kept.
/// The [`Default`] value sets no criteria, so it keeps every fixture.
#[derive(Debug, Clone, Default)]
pub struct CorpusFilter {
    /// Only include these file types (None = all)
    pub file_types: Option<Vec<String>>,
    /// Require text ground truth
    pub require_ground_truth: bool,
    /// Require markdown ground truth
    pub require_markdown_ground_truth: bool,
    /// Maximum file size in bytes (None = no limit)
    pub max_file_size: Option<u64>,
    /// Only include fixtures whose name contains one of these strings
    /// (empty = no name filtering)
    pub name_patterns: Vec<String>,
}

/// Build a filtered corpus from the fixture directory.
pub fn build_corpus(fixtures_dir: &Path, filter: &CorpusFilter) -> Result> { let mut manager = FixtureManager::new(); if fixtures_dir.is_dir() { manager.load_fixtures_from_dir(fixtures_dir)?; } else { manager.load_fixture(fixtures_dir)?; } let mut docs = Vec::new(); for (fixture_path, fixture) in manager.fixtures() { let fixture_dir = match fixture_path.parent() { Some(d) => d, None => continue, }; let name = fixture_path .file_stem() .and_then(|s| s.to_str()) .unwrap_or("") .to_string(); // Apply name filter (match ANY pattern) if !filter.name_patterns.is_empty() && !filter.name_patterns.iter().any(|p| name.contains(p.as_str())) { continue; } // Apply file type filter if let Some(ref types) = filter.file_types && !types.contains(&fixture.file_type) { continue; } // Apply file size filter if let Some(max_size) = filter.max_file_size && fixture.file_size > max_size { continue; } let document_path = fixture.resolve_document_path(fixture_dir); let gt_text = fixture.resolve_ground_truth_path(fixture_dir); let gt_markdown = fixture.resolve_ground_truth_markdown_path(fixture_dir); // Apply ground truth filters if filter.require_ground_truth && gt_text.is_none() { continue; } if filter.require_markdown_ground_truth && gt_markdown.is_none() { continue; } docs.push(CorpusDocument { name, document_path, file_type: fixture.file_type.clone(), file_size: fixture.file_size, ground_truth_text: gt_text, ground_truth_markdown: gt_markdown, }); } docs.sort_by(|a, b| a.name.cmp(&b.name)); Ok(docs) } /// Convenience: all PDFs with text ground truth. pub fn pdf_corpus(fixtures_dir: &Path) -> Result> { build_corpus( fixtures_dir, &CorpusFilter { file_types: Some(vec!["pdf".to_string()]), require_ground_truth: true, ..Default::default() }, ) } /// Convenience: all PDFs with markdown ground truth. 
pub fn pdf_markdown_corpus(fixtures_dir: &Path) -> Result> { build_corpus( fixtures_dir, &CorpusFilter { file_types: Some(vec!["pdf".to_string()]), require_ground_truth: true, require_markdown_ground_truth: true, ..Default::default() }, ) } #[cfg(test)] mod tests { use super::*; #[test] fn test_default_filter_is_permissive() { let filter = CorpusFilter::default(); assert!(filter.file_types.is_none()); assert!(!filter.require_ground_truth); assert!(!filter.require_markdown_ground_truth); assert!(filter.max_file_size.is_none()); assert!(filter.name_patterns.is_empty()); } }