//! Extract command - Extract text and data from documents //! //! This module provides the extract and batch extract commands for processing single //! or multiple documents with customizable extraction configurations. use anyhow::{Context, Result}; use kreuzberg::{ BatchFileItem, ExtractionConfig, ExtractionResult, FileExtractionConfig, batch_extract_files_sync, extract_file_sync, }; use std::path::PathBuf; use std::time::Instant; use crate::{ WireFormat, output::{BatchEnvelope, ExtractEnvelope}, style, }; /// Execute single document extraction command pub fn extract_command( path: PathBuf, config: ExtractionConfig, mime_type: Option, format: WireFormat, ) -> Result<()> { let path_str = path.to_string_lossy().to_string(); let t0 = Instant::now(); let result = extract_file_sync(&path_str, mime_type.as_deref(), &config).with_context(|| { format!( "Failed to extract file '{}'. Ensure the file is readable and the format is supported.", path.display() ) })?; let extraction_time_ms = t0.elapsed().as_secs_f64() * 1000.0; match format { WireFormat::Text => { print!("{}", result.content); } WireFormat::Json => { let envelope = ExtractEnvelope { result, extraction_time_ms, }; println!( "{}", serde_json::to_string_pretty(&envelope).context("Failed to serialize extraction result to JSON")? ); } WireFormat::Toon => { println!( "{}", serde_toon::to_string(&result).context("Failed to serialize extraction result to TOON")? ); } } Ok(()) } /// Execute batch extraction command with optional per-file configuration overrides pub fn batch_command( paths: Vec, file_configs_map: Option>, config: ExtractionConfig, format: WireFormat, ) -> Result<()> { match format { WireFormat::Json => { // Run files one at a time to capture per-file wall-clock timings. // Per-file config overrides are honoured: files without an override use the // batch-level config directly; files with an override use a one-shot batch of // one item so the library's own merge logic applies. let mut results: Vec = Vec::with_capacity(paths.len()); let mut per_file_ms: Vec = Vec::with_capacity(paths.len()); let total_t0 = Instant::now(); for path in &paths { let path_str = path.to_string_lossy().to_string(); let has_file_config = file_configs_map.as_ref().and_then(|m| m.get(&path_str)).is_some(); let t0 = Instant::now(); let result = if has_file_config { // Delegate to the batch API (one item) so per-file merge logic is applied. let file_config = file_configs_map .as_ref() .and_then(|m| m.get(&path_str)) .map(|v| { serde_json::from_value::(v.clone()) .with_context(|| format!("Failed to parse file config for '{}'", path_str)) }) .transpose()?; let mut batch_results = batch_extract_files_sync( vec![BatchFileItem { path: path.clone(), config: file_config, }], &config, ) .with_context(|| { format!( "Failed to extract file '{}'. Ensure the file is readable and the format is supported.", path.display() ) })?; batch_results.remove(0) } else { extract_file_sync(&path_str, None, &config).with_context(|| { format!( "Failed to extract file '{}'. Ensure the file is readable and the format is supported.", path.display() ) })? }; per_file_ms.push(t0.elapsed().as_secs_f64() * 1000.0); results.push(result); } let total_ms = total_t0.elapsed().as_secs_f64() * 1000.0; let envelope = BatchEnvelope { results, total_ms, per_file_ms, }; println!( "{}", serde_json::to_string_pretty(&envelope) .context("Failed to serialize batch extraction results to JSON")? ); } WireFormat::Text => { let results = run_batch_sync(&paths, file_configs_map.as_ref(), &config)?; for (i, result) in results.iter().enumerate() { println!("{}", style::header(&format!("=== Document {} ===", i + 1))); println!("{} {}", style::label("MIME Type:"), style::success(&result.mime_type)); println!("{}\n{}", style::label("Content:"), result.content); println!(); } } WireFormat::Toon => { let results = run_batch_sync(&paths, file_configs_map.as_ref(), &config)?; println!( "{}", serde_toon::to_string(&results).context("Failed to serialize batch extraction results to TOON")? ); } } Ok(()) } /// Run batch extraction using the synchronous batch API for non-JSON output paths. fn run_batch_sync( paths: &[PathBuf], file_configs_map: Option<&std::collections::HashMap>, config: &ExtractionConfig, ) -> Result> { let items: Vec = paths .iter() .map(|p| { let path_str = p.to_string_lossy().to_string(); let file_config = file_configs_map .and_then(|m| m.get(&path_str)) .map(|v| { serde_json::from_value::(v.clone()) .with_context(|| format!("Failed to parse file config for '{}'", path_str)) }) .transpose()?; Ok(BatchFileItem { path: p.clone(), config: file_config, }) }) .collect::>>()?; batch_extract_files_sync(items, config) .context("Failed to batch extract documents. Check that all files are readable and formats are supported.") }