Nomad changes
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s

This commit is contained in:
Henrik Jess Nielsen
2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions

View File

@@ -0,0 +1,466 @@
//! Cache command - Manage cache operations
//!
//! This module provides commands for cache management including statistics,
//! clearing, manifest generation, and model warming.
use anyhow::{Context, Result};
use kreuzberg::cache;
use serde_json::json;
use std::path::PathBuf;
use crate::{WireFormat, style};
/// Execute the `cache stats` command.
///
/// Reads cache metadata through `cache::get_cache_metadata` and prints it in
/// the requested wire format.
///
/// # Arguments
///
/// * `cache_dir` - Explicit cache directory. When `None`, `.kreuzberg` inside
///   the current working directory is used.
/// * `format` - Output format: human-readable text, JSON, or TOON.
///
/// # Errors
///
/// Returns an error if no `cache_dir` was supplied and the current directory
/// cannot be determined, if the cache metadata cannot be read, or if the
/// result cannot be serialized.
pub fn stats_command(cache_dir: Option<PathBuf>, format: WireFormat) -> Result<()> {
    // Resolve the fallback lazily: an explicit directory must keep working
    // even when the current working directory is unavailable.
    let cache_path = match cache_dir {
        Some(dir) => dir,
        None => std::env::current_dir()
            .context("Failed to get current directory")?
            .join(".kreuzberg"),
    };
    let cache_dir_str = cache_path.to_string_lossy();
    let stats = cache::get_cache_metadata(&cache_dir_str).with_context(|| {
        format!(
            "Failed to get cache statistics from directory '{}'. Ensure the directory exists and is readable.",
            cache_dir_str
        )
    })?;

    // JSON and TOON share one payload; it is only built when one of them is requested.
    let payload = || {
        json!({
            "directory": cache_dir_str,
            "total_files": stats.total_files,
            "total_size_mb": stats.total_size_mb,
            "available_space_mb": stats.available_space_mb,
            "oldest_file_age_days": stats.oldest_file_age_days,
            "newest_file_age_days": stats.newest_file_age_days,
        })
    };

    match format {
        WireFormat::Text => {
            println!("{}", style::header("Cache Statistics"));
            println!("{}", style::dim("================"));
            println!("{} {}", style::label("Directory:"), style::success(&cache_dir_str));
            println!("{} {}", style::label("Total files:"), stats.total_files);
            println!("{} {:.2} MB", style::label("Total size:"), stats.total_size_mb);
            println!(
                "{} {:.2} MB",
                style::label("Available space:"),
                stats.available_space_mb
            );
            println!(
                "{} {:.2} days",
                style::label("Oldest file age:"),
                stats.oldest_file_age_days
            );
            println!(
                "{} {:.2} days",
                style::label("Newest file age:"),
                stats.newest_file_age_days
            );
        }
        WireFormat::Json => {
            println!(
                "{}",
                serde_json::to_string_pretty(&payload()).context("Failed to serialize cache statistics to JSON")?
            );
        }
        WireFormat::Toon => {
            println!(
                "{}",
                serde_toon::to_string(&payload()).context("Failed to serialize cache statistics to TOON")?
            );
        }
    }
    Ok(())
}
/// Execute the `cache clear` command.
///
/// Clears the cache directory through `cache::clear_cache_directory` and
/// reports the number of removed files and the freed space in megabytes.
///
/// # Arguments
///
/// * `cache_dir` - Explicit cache directory. When `None`, `.kreuzberg` inside
///   the current working directory is used.
/// * `format` - Output format: human-readable text, JSON, or TOON.
///
/// # Errors
///
/// Returns an error if no `cache_dir` was supplied and the current directory
/// cannot be determined, if clearing the cache fails, or if the result cannot
/// be serialized.
pub fn clear_command(cache_dir: Option<PathBuf>, format: WireFormat) -> Result<()> {
    // Resolve the fallback lazily: an explicit directory must keep working
    // even when the current working directory is unavailable.
    let cache_path = match cache_dir {
        Some(dir) => dir,
        None => std::env::current_dir()
            .context("Failed to get current directory")?
            .join(".kreuzberg"),
    };
    let cache_dir_str = cache_path.to_string_lossy();
    let (removed_files, freed_mb) = cache::clear_cache_directory(&cache_dir_str).with_context(|| {
        format!(
            "Failed to clear cache directory '{}'. Ensure you have write permissions.",
            cache_dir_str
        )
    })?;

    // JSON and TOON share one payload; it is only built when one of them is requested.
    let payload = || {
        json!({
            "directory": cache_dir_str,
            "removed_files": removed_files,
            "freed_mb": freed_mb,
        })
    };

    match format {
        WireFormat::Text => {
            println!("{}", style::success("Cache cleared successfully"));
            println!("{} {}", style::label("Directory:"), style::success(&cache_dir_str));
            println!("{} {}", style::label("Removed files:"), removed_files);
            println!("{} {:.2} MB", style::label("Freed space:"), freed_mb);
        }
        WireFormat::Json => {
            println!(
                "{}",
                serde_json::to_string_pretty(&payload()).context("Failed to serialize cache clear results to JSON")?
            );
        }
        WireFormat::Toon => {
            println!(
                "{}",
                serde_toon::to_string(&payload()).context("Failed to serialize cache clear results to TOON")?
            );
        }
    }
    Ok(())
}
/// Execute the `cache manifest` command: print the expected model files with
/// their sizes and checksums.
///
/// # Errors
///
/// Returns an error when the binary was built without either the
/// `paddle-ocr` or the `layout-detection` feature, or when the inner manifest
/// routine fails.
pub fn manifest_command(format: WireFormat) -> Result<()> {
    // Normal path: at least one model-providing feature is compiled in.
    #[cfg(any(feature = "paddle-ocr", feature = "layout-detection"))]
    {
        manifest_command_inner(format)
    }
    // Without any model-providing feature, every `extend` call in the inner
    // routine is `#[cfg]`-stripped, leaving `entries: Vec<_>` with no anchor
    // for type inference, so `e.size_bytes` would fail to compile with E0282.
    // Compile a clear runtime error instead, so that builds with a minimal
    // feature set still succeed.
    #[cfg(not(any(feature = "paddle-ocr", feature = "layout-detection")))]
    {
        let _ = format;
        anyhow::bail!(
            "manifest command unavailable: build kreuzberg-cli with at least one of \
             --features \"paddle-ocr\" or --features \"layout-detection\""
        );
    }
}
/// Collect the manifest entries of every compiled-in model manager and print
/// them in the requested format.
///
/// Only compiled when at least one model-providing feature is enabled, so the
/// element type of the entry list can always be inferred.
#[cfg(any(feature = "paddle-ocr", feature = "layout-detection"))]
fn manifest_command_inner(format: WireFormat) -> Result<()> {
    // Divisor used to express byte counts in megabytes.
    const BYTES_PER_MB: f64 = 1_048_576.0;

    let mut entries = Vec::new();
    #[cfg(feature = "paddle-ocr")]
    {
        entries.extend(kreuzberg::paddle_ocr::ModelManager::manifest());
    }
    #[cfg(feature = "layout-detection")]
    {
        entries.extend(kreuzberg::layout::LayoutModelManager::manifest());
    }
    #[cfg(feature = "paddle-ocr")]
    {
        entries.extend(kreuzberg::ocr::TessdataManager::manifest());
    }

    let total_size_bytes: u64 = entries.iter().map(|e| e.size_bytes).sum();
    let version = env!("CARGO_PKG_VERSION");

    // JSON and TOON share one payload; it is only built when one of them is requested.
    let payload = || {
        json!({
            "kreuzberg_version": version,
            "total_size_bytes": total_size_bytes,
            "model_count": entries.len(),
            "models": entries,
        })
    };

    match format {
        WireFormat::Json => {
            let rendered = serde_json::to_string_pretty(&payload()).context("Failed to serialize manifest to JSON")?;
            println!("{}", rendered);
        }
        WireFormat::Toon => {
            let rendered = serde_toon::to_string(&payload()).context("Failed to serialize manifest to TOON")?;
            println!("{}", rendered);
        }
        WireFormat::Text => {
            let version_note = format!("(kreuzberg {})", version);
            println!("{} {}", style::header("Model Manifest"), style::dim(&version_note));
            println!("{}", style::dim("===================================="));
            println!(
                "{:<50} {:>12} {}",
                style::label("PATH"),
                style::label("SIZE"),
                style::label("SHA256")
            );
            let rule = format!("{:<50} {:>12} ------", "----", "----");
            println!("{}", style::dim(&rule));

            for item in &entries {
                // A zero size is shown as "unknown".
                let size_column = match item.size_bytes {
                    0 => "unknown".to_string(),
                    bytes => format!("{:.1} MB", bytes as f64 / BYTES_PER_MB),
                };
                // Show at most the first 12 characters of the checksum, or "-" when absent.
                let sha_column: &str = match item.sha256.len() {
                    0 => "-",
                    len if len >= 12 => &item.sha256[..12],
                    _ => &item.sha256,
                };
                println!(
                    "{:<50} {:>12} {}",
                    item.relative_path,
                    size_column,
                    style::dim(sha_column)
                );
            }

            println!();
            println!(
                "{} {} files, {:.1} MB",
                style::label("Total:"),
                entries.len(),
                total_size_bytes as f64 / BYTES_PER_MB
            );
        }
    }
    Ok(())
}
/// Execute the `cache warm` command: eagerly download models into the cache.
///
/// Each group of models is handled only when the matching Cargo feature is
/// compiled in (`paddle-ocr`, `layout-detection`, `embeddings`,
/// `tree-sitter`). The outcome is reported as two lists, "downloaded" and
/// "already cached", in the requested wire format.
///
/// # Arguments
///
/// * `cache_dir` - Explicit cache directory; otherwise resolved by
///   `resolve_cache_base`.
/// * `format` - Output format: human-readable text, JSON, or TOON.
/// * `all_embeddings` - Warm every known embedding preset.
/// * `embedding_model` - Warm a single embedding preset by name (ignored when
///   `all_embeddings` is set).
/// * `all_table_models` - Also download the SLANeXT layout variants instead of
///   only the default layout models.
/// * `all_grammars` - Download every tree-sitter grammar.
/// * `grammar_groups` - Download the named tree-sitter grammar groups
///   (considered only when `all_grammars` is not set).
/// * `grammars` - Download the named tree-sitter grammars (considered only
///   when neither `all_grammars` nor `grammar_groups` is given).
///
/// # Errors
///
/// Returns an error if any download fails, if an unknown embedding preset is
/// requested, if an option requires a feature that was not compiled in, or if
/// the result cannot be serialized.
// The argument list mirrors the CLI flags one-to-one.
#[allow(clippy::too_many_arguments)]
pub fn warm_command(
    cache_dir: Option<PathBuf>,
    format: WireFormat,
    all_embeddings: bool,
    embedding_model: Option<String>,
    all_table_models: bool,
    all_grammars: bool,
    grammar_groups: Option<Vec<String>>,
    grammars: Option<Vec<String>>,
) -> Result<()> {
    let cache_base = resolve_cache_base(cache_dir);
    let mut downloaded: Vec<String> = Vec::new();
    let mut already_cached: Vec<String> = Vec::new();

    #[cfg(feature = "paddle-ocr")]
    {
        let paddle_dir = cache_base.join("paddle-ocr");
        let manager = kreuzberg::paddle_ocr::ModelManager::new(paddle_dir);
        // ensure_all_models downloads v2 det (server+mobile), cls (PP-LCNet),
        // doc_ori, v2 unified rec models, and all per-script rec families
        manager
            .ensure_all_models()
            .context("Failed to download PaddleOCR v2 models")?;
        downloaded.push("paddle-ocr v2 (server+mobile det, cls, doc_ori, unified+per-script rec)".to_string());
    }

    #[cfg(feature = "layout-detection")]
    {
        let layout_dir = cache_base.join("layout");
        let manager = kreuzberg::layout::LayoutModelManager::new(Some(layout_dir));
        if all_table_models {
            // Download rtdetr + tatr + all SLANeXT variants (~730MB).
            // Always delegate to `ensure_all_models`: the cache probes used
            // in this function only cover rtdetr and tatr, so short-circuiting
            // on them would skip the SLANeXT variants whenever the default
            // models happen to be present already.
            // NOTE(review): assumes `ensure_all_models` skips files that are
            // already cached — confirm against the manager implementation.
            manager
                .ensure_all_models()
                .context("Failed to download layout models")?;
            downloaded.push("layout (rtdetr, tatr, slanet variants)".to_string());
        } else {
            // Default: download only rtdetr + tatr
            let was_cached = manager.is_rtdetr_cached() && manager.is_tatr_cached();
            if was_cached {
                already_cached.push("layout (rtdetr, tatr)".to_string());
            } else {
                manager
                    .ensure_default_models()
                    .context("Failed to download layout models")?;
                downloaded.push("layout (rtdetr, tatr)".to_string());
            }
        }
    }

    #[cfg(feature = "paddle-ocr")]
    {
        let tessdata_dir = cache_base.join("tessdata");
        let manager = kreuzberg::ocr::TessdataManager::new(Some(tessdata_dir));
        let newly_downloaded = manager
            .ensure_all_languages()
            .context("Failed to download tessdata files")?;
        if newly_downloaded > 0 {
            downloaded.push(format!("tessdata ({newly_downloaded} languages)"));
        } else {
            already_cached.push("tessdata (all languages)".to_string());
        }
    }

    #[cfg(feature = "embeddings")]
    {
        let embeddings_dir = cache_base.join("embeddings");
        let presets_to_warm: Vec<kreuzberg::EmbeddingPreset> = if all_embeddings {
            kreuzberg::list_embedding_presets()
                .into_iter()
                .filter_map(|name| kreuzberg::get_embedding_preset(&name))
                .collect()
        } else if let Some(ref name) = embedding_model {
            match kreuzberg::get_embedding_preset(name) {
                Some(preset) => vec![preset],
                None => {
                    let available = kreuzberg::list_embedding_presets();
                    anyhow::bail!(
                        "Unknown embedding preset '{}'. Available: {}",
                        name,
                        available.join(", ")
                    );
                }
            }
        } else {
            vec![]
        };
        for preset in &presets_to_warm {
            let label = format!("embedding ({})", preset.name);
            kreuzberg::embeddings::warm_model(
                &kreuzberg::core::config::EmbeddingModelType::Preset {
                    name: preset.name.clone(),
                },
                Some(embeddings_dir.clone()),
            )
            .map_err(|e| anyhow::anyhow!("Failed to download embedding model '{}': {}", preset.name, e))?;
            downloaded.push(label);
        }
    }
    #[cfg(not(feature = "embeddings"))]
    {
        // Fail loudly rather than silently ignoring an embedding request.
        if all_embeddings || embedding_model.is_some() {
            anyhow::bail!("Embedding model warming requires the 'embeddings' feature to be enabled");
        }
    }

    // Tree-sitter grammar downloads
    #[cfg(feature = "tree-sitter")]
    {
        if all_grammars {
            let count =
                tree_sitter_language_pack::download_all().context("Failed to download all tree-sitter grammars")?;
            if count > 0 {
                downloaded.push(format!("tree-sitter grammars ({count} languages)"));
            } else {
                already_cached.push("tree-sitter grammars (all)".to_string());
            }
        } else if let Some(ref groups) = grammar_groups {
            let config = tree_sitter_language_pack::PackConfig {
                cache_dir: None,
                languages: None,
                groups: Some(groups.clone()),
            };
            tree_sitter_language_pack::init(&config).context("Failed to download tree-sitter grammar groups")?;
            downloaded.push(format!("tree-sitter grammars (groups: {})", groups.join(", ")));
        } else if let Some(ref langs) = grammars {
            let refs: Vec<&str> = langs.iter().map(String::as_str).collect();
            let count =
                tree_sitter_language_pack::download(&refs).context("Failed to download tree-sitter grammars")?;
            if count > 0 {
                downloaded.push(format!("tree-sitter grammars ({count} languages)"));
            } else {
                already_cached.push(format!("tree-sitter grammars ({})", langs.join(", ")));
            }
        }
    }
    #[cfg(not(feature = "tree-sitter"))]
    {
        // Fail loudly rather than silently ignoring a grammar request.
        if all_grammars || grammar_groups.is_some() || grammars.is_some() {
            anyhow::bail!("Tree-sitter grammar warming requires the 'tree-sitter' feature to be enabled");
        }
    }

    match format {
        WireFormat::Text => {
            if !downloaded.is_empty() {
                println!("{}", style::label("Downloaded:"));
                for d in &downloaded {
                    println!("  {}", style::success(d));
                }
            }
            if !already_cached.is_empty() {
                println!("{}", style::label("Already cached:"));
                for c in &already_cached {
                    println!("  {}", style::dim(c));
                }
            }
            println!(
                "All models ready in {}",
                style::success(&cache_base.display().to_string())
            );
        }
        WireFormat::Json => {
            let output = json!({
                "cache_dir": cache_base.to_string_lossy(),
                "downloaded": downloaded,
                "already_cached": already_cached,
            });
            println!(
                "{}",
                serde_json::to_string_pretty(&output).context("Failed to serialize warm results to JSON")?
            );
        }
        WireFormat::Toon => {
            let output = json!({
                "cache_dir": cache_base.to_string_lossy(),
                "downloaded": downloaded,
                "already_cached": already_cached,
            });
            println!(
                "{}",
                serde_toon::to_string(&output).context("Failed to serialize warm results to TOON")?
            );
        }
    }
    Ok(())
}
/// Resolve the cache base directory.
///
/// Precedence, highest first:
/// 1. the explicitly supplied `cache_dir`,
/// 2. the `KREUZBERG_CACHE_DIR` environment variable,
/// 3. `.kreuzberg` under the current working directory (or under `.` when the
///    working directory cannot be determined).
fn resolve_cache_base(cache_dir: Option<PathBuf>) -> PathBuf {
    cache_dir
        .or_else(|| std::env::var("KREUZBERG_CACHE_DIR").ok().map(PathBuf::from))
        .unwrap_or_else(|| {
            std::env::current_dir()
                .unwrap_or_else(|_| PathBuf::from("."))
                .join(".kreuzberg")
        })
}

View File

@@ -0,0 +1,61 @@
//! Chunk command implementation.
use anyhow::{Context, Result};
use crate::{WireFormat, style};
/// Execute the chunk command: split `text` into chunks and print them.
///
/// In text mode each chunk's content is printed, preceded by a
/// `--- chunk N ---` separator when there is more than one chunk. JSON and
/// TOON modes emit the chunk contents together with the chunk count, the
/// relevant chunking settings, and the input size in bytes.
///
/// # Errors
///
/// Returns an error if `text` is empty, if chunking fails, or if the result
/// cannot be serialized.
pub fn chunk_command(text: String, config: kreuzberg::ChunkingConfig, format: WireFormat) -> Result<()> {
    if text.is_empty() {
        anyhow::bail!("No text provided for chunking. Provide --text or pipe text via stdin.");
    }

    let chunked = kreuzberg::chunking::chunk_text(&text, &config, None).context("Failed to chunk text")?;

    // JSON and TOON share one payload; it is only built when one of them is requested.
    let payload = || {
        let contents: Vec<&str> = chunked.chunks.iter().map(|chunk| chunk.content.as_str()).collect();
        serde_json::json!({
            "chunks": contents,
            "chunk_count": chunked.chunk_count,
            "config": {
                "max_characters": config.max_characters,
                "overlap": config.overlap,
                "chunker_type": format!("{:?}", config.chunker_type),
            },
            "input_size_bytes": text.len(),
        })
    };

    match format {
        WireFormat::Text => {
            let show_separators = chunked.chunks.len() > 1;
            for (index, chunk) in chunked.chunks.iter().enumerate() {
                if show_separators {
                    let separator = format!("--- chunk {} ---", index + 1);
                    println!("{}", style::dim(&separator));
                }
                println!("{}", chunk.content);
            }
        }
        WireFormat::Json => {
            let rendered = serde_json::to_string_pretty(&payload()).context("Failed to serialize chunks to JSON")?;
            println!("{}", rendered);
        }
        WireFormat::Toon => {
            let rendered = serde_toon::to_string(&payload()).context("Failed to serialize chunks to TOON")?;
            println!("{}", rendered);
        }
    }
    Ok(())
}

View File

@@ -0,0 +1,51 @@
//! Config command - Configuration loading and discovery
//!
//! This module provides utilities for loading extraction configuration from files
//! or discovering them automatically in the project directory.
use anyhow::{Context, Result};
use kreuzberg::ExtractionConfig;
use std::path::PathBuf;
/// Loads extraction configuration from a file or discovers it automatically.
///
/// Resolution order used by the CLI:
/// 1. Explicit config file (when the `--config` flag is provided)
/// 2. Auto-discovered config (`kreuzberg.{toml,yaml,json}` in the current and parent directories)
/// 3. Default configuration (when no config file is found)
///
/// # Configuration File Formats
///
/// The format of an explicit file is chosen from the end of its path,
/// compared case-insensitively:
/// - `.toml`: TOML format (recommended for humans)
/// - `.yaml` / `.yml`: YAML format
/// - `.json`: JSON format
///
/// # Errors
///
/// Returns an error if:
/// - The explicit config file has an unsupported extension (must be .toml, .yaml, .yml, or .json)
/// - The config file cannot be read or parsed
/// - The config file contains invalid extraction settings
/// - Auto-discovery itself fails
pub fn load_config(config_path: Option<PathBuf>) -> Result<ExtractionConfig> {
    let path = match config_path {
        Some(explicit) => explicit,
        None => {
            // No explicit file: fall back to discovery, then to the defaults.
            return ExtractionConfig::discover()
                .map(|found| found.unwrap_or_else(ExtractionConfig::default))
                .context("Failed to auto-discover configuration file. Searched for kreuzberg.{toml,yaml,json} in current and parent directories. Use --config to specify an explicit path.");
        }
    };

    // Lower-case the whole path once so the suffix checks ignore case.
    let lowered = path.to_string_lossy().to_lowercase();
    let has_suffix = |suffix: &str| lowered.ends_with(suffix);

    let loaded = if has_suffix(".toml") {
        ExtractionConfig::from_toml_file(&path)
    } else if has_suffix(".yaml") || has_suffix(".yml") {
        ExtractionConfig::from_yaml_file(&path)
    } else if has_suffix(".json") {
        ExtractionConfig::from_json_file(&path)
    } else {
        anyhow::bail!("Config file must have .toml, .yaml, .yml, or .json extension (case-insensitive)");
    };

    loaded.with_context(|| format!("Failed to load configuration from '{}'. Ensure the file exists, is readable, and contains valid configuration.", path.display()))
}

View File

@@ -0,0 +1,161 @@
//! Embed command implementation.
use anyhow::{Context, Result};
use crate::{WireFormat, style};
/// Execute the embed command: generate embeddings for input texts.
///
/// When `provider` is `"local"` (default, also selected by an empty string),
/// uses the ONNX preset model named by `preset`.
/// When `provider` is `"llm"`, uses liter-llm with the specified model and API key.
/// When `provider` is `"plugin"`, dispatches to a pre-registered in-process embedding backend.
///
/// In text mode each embedding is printed as one comma-separated line,
/// preceded by a `# text N` marker when more than one text was supplied.
/// JSON and TOON modes emit the embeddings with the model label, the
/// dimension count, and the number of embeddings.
///
/// # Errors
///
/// Returns an error if `texts` is empty or contains an empty string, if the
/// provider is unknown or its required option (`llm_model`, `plugin_name`) is
/// missing, if the preset or plugin backend is not known, if embedding
/// generation fails, or if the result cannot be serialized.
pub fn embed_command(
    texts: Vec<String>,
    preset: &str,
    provider: &str,
    llm_model: Option<String>,
    llm_api_key: Option<String>,
    plugin_name: Option<String>,
    format: WireFormat,
) -> Result<()> {
    if texts.is_empty() {
        anyhow::bail!("No texts provided for embedding. Provide --text or pipe text via stdin.");
    }
    // Validate no empty texts; the reported position is 1-based.
    if let Some(index) = texts.iter().position(|t| t.is_empty()) {
        anyhow::bail!("Text at position {} is empty. All texts must be non-empty.", index + 1);
    }

    let (config, model_label) = match provider {
        "llm" => {
            let model = llm_model.as_deref().ok_or_else(|| {
                anyhow::anyhow!(
                    "--model is required when --provider is 'llm' (e.g., --model openai/text-embedding-3-small)"
                )
            })?;
            let llm_config = kreuzberg::LlmConfig {
                model: model.to_string(),
                api_key: llm_api_key,
                base_url: None,
                timeout_secs: None,
                max_retries: None,
                temperature: None,
                max_tokens: None,
            };
            let config = kreuzberg::EmbeddingConfig {
                model: kreuzberg::EmbeddingModelType::Llm { llm: llm_config },
                show_download_progress: true,
                ..Default::default()
            };
            (config, model.to_string())
        }
        "local" | "" => {
            // Validate preset for local provider
            let _preset_info = kreuzberg::get_embedding_preset(preset).with_context(|| {
                format!(
                    "Unknown embedding preset '{}'. Available: {:?}",
                    preset,
                    kreuzberg::list_embedding_presets()
                )
            })?;
            let config = kreuzberg::EmbeddingConfig {
                model: kreuzberg::EmbeddingModelType::Preset {
                    name: preset.to_string(),
                },
                show_download_progress: true,
                ..Default::default()
            };
            (config, preset.to_string())
        }
        "plugin" => {
            let name = plugin_name.as_deref().ok_or_else(|| {
                anyhow::anyhow!(
                    "--plugin NAME is required when --provider is 'plugin'. Register a backend via kreuzberg::plugins::register_embedding_backend first."
                )
            })?;
            if name.is_empty() {
                anyhow::bail!("--plugin NAME must not be empty.");
            }
            // Pre-flight: surface unknown backends with a list of registered names
            // (parity with the REST handler, which returns 422 for the same case).
            let available =
                kreuzberg::plugins::list_embedding_backends().context("Failed to read embedding backend registry")?;
            if !available.iter().any(|n| n == name) {
                anyhow::bail!(
                    "Embedding backend '{}' is not registered. Available backends: {}",
                    name,
                    if available.is_empty() {
                        "(none registered)".to_string()
                    } else {
                        available.join(", ")
                    }
                );
            }
            let config = kreuzberg::EmbeddingConfig {
                model: kreuzberg::EmbeddingModelType::Plugin { name: name.to_string() },
                ..Default::default()
            };
            (config, name.to_string())
        }
        other => {
            anyhow::bail!(
                "Unknown embedding provider '{}'. Valid providers: 'local' (default, ONNX), 'llm' (liter-llm), or 'plugin' (in-process backend).",
                other
            );
        }
    };

    // Only the number of inputs is needed after this point, so record it and
    // hand the texts over by value instead of cloning the whole vector.
    let text_count = texts.len();

    // Generate embeddings
    let embeddings = kreuzberg::embed_texts(texts, &config).context("Failed to generate embeddings")?;
    let dimensions = embeddings.first().map_or(0, |e| e.len());

    match format {
        WireFormat::Json => {
            let output = serde_json::json!({
                "embeddings": embeddings,
                "model": model_label,
                "dimensions": dimensions,
                "count": embeddings.len(),
            });
            println!(
                "{}",
                serde_json::to_string_pretty(&output).context("Failed to serialize embeddings to JSON")?
            );
        }
        WireFormat::Toon => {
            let output = serde_json::json!({
                "embeddings": embeddings,
                "model": model_label,
                "dimensions": dimensions,
                "count": embeddings.len(),
            });
            println!(
                "{}",
                serde_toon::to_string(&output).context("Failed to serialize embeddings to TOON")?
            );
        }
        WireFormat::Text => {
            for (i, embedding) in embeddings.iter().enumerate() {
                if text_count > 1 {
                    println!("{}", style::dim(&format!("# text {}", i + 1)));
                }
                let values: Vec<String> = embedding.iter().map(|v| format!("{v}")).collect();
                println!("{}", values.join(","));
            }
        }
    }
    Ok(())
}

View File

@@ -0,0 +1,180 @@
//! Extract command - Extract text and data from documents
//!
//! This module provides the extract and batch extract commands for processing single
//! or multiple documents with customizable extraction configurations.
use anyhow::{Context, Result};
use kreuzberg::{
BatchFileItem, ExtractionConfig, ExtractionResult, FileExtractionConfig, batch_extract_files_sync,
extract_file_sync,
};
use std::path::PathBuf;
use std::time::Instant;
use crate::{
WireFormat,
output::{BatchEnvelope, ExtractEnvelope},
style,
};
/// Execute the single-document extraction command.
///
/// Extracts `path` with `extract_file_sync`, measuring the wall-clock time of
/// the call. Text mode prints the extracted content without a trailing
/// newline, JSON mode prints an `ExtractEnvelope` (result plus timing in
/// milliseconds), and TOON mode prints the bare extraction result.
///
/// # Errors
///
/// Returns an error if extraction fails or if the result cannot be
/// serialized.
pub fn extract_command(
    path: PathBuf,
    config: ExtractionConfig,
    mime_type: Option<String>,
    format: WireFormat,
) -> Result<()> {
    let source = path.to_string_lossy().to_string();

    let started = Instant::now();
    let result = extract_file_sync(&source, mime_type.as_deref(), &config).with_context(|| {
        format!(
            "Failed to extract file '{}'. Ensure the file is readable and the format is supported.",
            path.display()
        )
    })?;
    let extraction_time_ms = started.elapsed().as_secs_f64() * 1000.0;

    match format {
        WireFormat::Json => {
            let envelope = ExtractEnvelope {
                result,
                extraction_time_ms,
            };
            let rendered =
                serde_json::to_string_pretty(&envelope).context("Failed to serialize extraction result to JSON")?;
            println!("{}", rendered);
        }
        WireFormat::Toon => {
            let rendered = serde_toon::to_string(&result).context("Failed to serialize extraction result to TOON")?;
            println!("{}", rendered);
        }
        WireFormat::Text => {
            print!("{}", result.content);
        }
    }
    Ok(())
}
/// Execute the batch extraction command with optional per-file configuration overrides.
///
/// `file_configs_map` maps a file path (as passed on the command line) to a
/// JSON value that is deserialized into a `FileExtractionConfig` override for
/// that file.
///
/// JSON mode processes files one at a time so that per-file wall-clock timings
/// can be reported in a `BatchEnvelope`; text and TOON modes delegate to the
/// synchronous batch API in a single call.
///
/// # Errors
///
/// Returns an error if a per-file override cannot be parsed, if extraction of
/// any file fails, if the batch API returns no result for a file, or if the
/// results cannot be serialized.
pub fn batch_command(
    paths: Vec<PathBuf>,
    file_configs_map: Option<std::collections::HashMap<String, serde_json::Value>>,
    config: ExtractionConfig,
    format: WireFormat,
) -> Result<()> {
    match format {
        WireFormat::Json => {
            // Run files one at a time to capture per-file wall-clock timings.
            // Per-file config overrides are honoured: files without an override use the
            // batch-level config directly; files with an override use a one-shot batch of
            // one item so the library's own merge logic applies.
            let mut results: Vec<ExtractionResult> = Vec::with_capacity(paths.len());
            let mut per_file_ms: Vec<f64> = Vec::with_capacity(paths.len());
            let total_t0 = Instant::now();
            for path in &paths {
                let path_str = path.to_string_lossy().to_string();
                // Look the override up once; it is parsed inside the timed section.
                let override_value = file_configs_map.as_ref().and_then(|m| m.get(&path_str));
                let t0 = Instant::now();
                let result = match override_value {
                    Some(value) => {
                        // Delegate to the batch API (one item) so per-file merge logic is applied.
                        let file_config = serde_json::from_value::<FileExtractionConfig>(value.clone())
                            .with_context(|| format!("Failed to parse file config for '{}'", path_str))?;
                        let batch_results = batch_extract_files_sync(
                            vec![BatchFileItem {
                                path: path.clone(),
                                config: Some(file_config),
                            }],
                            &config,
                        )
                        .with_context(|| {
                            format!(
                                "Failed to extract file '{}'. Ensure the file is readable and the format is supported.",
                                path.display()
                            )
                        })?;
                        // Report an empty batch result as an error rather than
                        // panicking on an out-of-bounds removal.
                        batch_results.into_iter().next().with_context(|| {
                            format!("Batch extraction returned no result for file '{}'.", path.display())
                        })?
                    }
                    None => extract_file_sync(&path_str, None, &config).with_context(|| {
                        format!(
                            "Failed to extract file '{}'. Ensure the file is readable and the format is supported.",
                            path.display()
                        )
                    })?,
                };
                per_file_ms.push(t0.elapsed().as_secs_f64() * 1000.0);
                results.push(result);
            }
            let total_ms = total_t0.elapsed().as_secs_f64() * 1000.0;
            let envelope = BatchEnvelope {
                results,
                total_ms,
                per_file_ms,
            };
            println!(
                "{}",
                serde_json::to_string_pretty(&envelope)
                    .context("Failed to serialize batch extraction results to JSON")?
            );
        }
        WireFormat::Text => {
            let results = run_batch_sync(&paths, file_configs_map.as_ref(), &config)?;
            for (i, result) in results.iter().enumerate() {
                println!("{}", style::header(&format!("=== Document {} ===", i + 1)));
                println!("{} {}", style::label("MIME Type:"), style::success(&result.mime_type));
                println!("{}\n{}", style::label("Content:"), result.content);
                println!();
            }
        }
        WireFormat::Toon => {
            let results = run_batch_sync(&paths, file_configs_map.as_ref(), &config)?;
            println!(
                "{}",
                serde_toon::to_string(&results).context("Failed to serialize batch extraction results to TOON")?
            );
        }
    }
    Ok(())
}
/// Run batch extraction through the synchronous batch API; used by the
/// non-JSON output paths.
///
/// Builds one `BatchFileItem` per path, attaching the per-file override found
/// in `file_configs_map` under the path's lossy string form (if any), and
/// submits all items in a single batch call.
///
/// # Errors
///
/// Returns an error on the first override that cannot be parsed as a
/// `FileExtractionConfig`, or if the batch extraction itself fails.
fn run_batch_sync(
    paths: &[PathBuf],
    file_configs_map: Option<&std::collections::HashMap<String, serde_json::Value>>,
    config: &ExtractionConfig,
) -> Result<Vec<ExtractionResult>> {
    let mut items: Vec<BatchFileItem> = Vec::with_capacity(paths.len());

    for path in paths {
        let key = path.to_string_lossy().to_string();
        let file_config = match file_configs_map.and_then(|overrides| overrides.get(&key)) {
            Some(raw) => Some(
                serde_json::from_value::<FileExtractionConfig>(raw.clone())
                    .with_context(|| format!("Failed to parse file config for '{}'", key))?,
            ),
            None => None,
        };
        items.push(BatchFileItem {
            path: path.clone(),
            config: file_config,
        });
    }

    batch_extract_files_sync(items, config)
        .context("Failed to batch extract documents. Check that all files are readable and formats are supported.")
}

View File

@@ -0,0 +1,116 @@
//! Extract structured command - Extract structured data from documents using an LLM.
//!
//! Reads a JSON schema file, configures LLM-based structured extraction, and
//! outputs the structured result parsed from the document.
use anyhow::{Context, Result};
use kreuzberg::{LlmConfig, StructuredExtractionConfig, extract_file_sync};
use std::path::PathBuf;
use crate::WireFormat;
/// Arguments for the extract-structured command.
///
/// Bundles every input of `extract_structured_command` into one value.
pub struct ExtractStructuredArgs {
/// Document to extract structured data from.
pub path: PathBuf,
/// File containing the JSON schema that the output should match.
pub schema_path: PathBuf,
/// Model identifier placed into the LLM configuration.
pub model: String,
/// Optional API key placed into the LLM configuration.
pub api_key: Option<String>,
/// Optional prompt forwarded to the structured-extraction configuration.
pub prompt: Option<String>,
/// Optional schema name; `"extraction"` is used when absent.
pub schema_name: Option<String>,
/// Value of the `strict` flag in the structured-extraction configuration.
pub strict: bool,
/// Optional extraction config file, resolved through `load_config`.
pub config_path: Option<PathBuf>,
/// Output format for the structured result.
pub format: WireFormat,
}
/// Execute the extract-structured command.
///
/// Reads a JSON schema from `schema_path`, builds an `ExtractionConfig` with
/// `structured_extraction` configured, extracts the document, and outputs the
/// `structured_output` field from the result.
pub fn extract_structured_command(args: ExtractStructuredArgs) -> Result<()> {
let ExtractStructuredArgs {
path,
schema_path,
model,
api_key,
prompt,
schema_name,
strict,
config_path,
format,
} = args;
// 1. Read and parse the JSON schema file
let schema_str = std::fs::read_to_string(&schema_path).with_context(|| {
format!(
"Failed to read JSON schema file '{}'. Ensure the file exists and is readable.",
schema_path.display()
)
})?;
let schema: serde_json::Value = serde_json::from_str(&schema_str).with_context(|| {
format!(
"Failed to parse JSON schema from '{}'. Ensure the file contains valid JSON.",
schema_path.display()
)
})?;
// 2. Build ExtractionConfig with structured_extraction
let mut config = super::load_config(config_path)?;
let llm_config = LlmConfig {
model,
api_key,
base_url: None,
timeout_secs: None,
max_retries: None,
temperature: None,
max_tokens: None,
};
config.structured_extraction = Some(StructuredExtractionConfig {
schema,
schema_name: schema_name.unwrap_or_else(|| "extraction".to_string()),
schema_description: None,
strict,
prompt,
llm: llm_config,
});
// 3. Call kreuzberg::extract_file_sync()
let path_str = path.to_string_lossy().to_string();
let result = extract_file_sync(&path_str, None, &config).with_context(|| {
format!(
"Failed to extract structured data from '{}'. Ensure the file is readable and the LLM configuration is correct.",
path.display()
)
})?;
// 4. Output result.structured_output (or error if None)
let structured = result.structured_output.with_context(|| {
"Structured extraction completed but returned no structured output. \
This may indicate the LLM failed to produce valid structured data matching the schema."
})?;
match format {
WireFormat::Json => {
println!(
"{}",
serde_json::to_string_pretty(&structured).context("Failed to serialize structured output to JSON")?
);
}
WireFormat::Toon => {
println!(
"{}",
serde_toon::to_string(&structured).context("Failed to serialize structured output to TOON")?
);
}
WireFormat::Text => {
// For text mode, pretty-print the JSON value
println!(
"{}",
serde_json::to_string_pretty(&structured).context("Failed to serialize structured output to text")?
);
}
}
Ok(())
}

View File

@@ -0,0 +1,48 @@
//! Command modules for Kreuzberg CLI
//!
//! This module organizes the CLI commands into focused submodules:
//! - `extract` - Document extraction commands
//! - `extract_structured` - LLM-based structured extraction command
//! - `cache` - Cache management operations
//! - `server` - API and MCP server commands
//! - `config` - Configuration loading and discovery
//! - `embed` - Embedding generation commands
//! - `chunk` - Text chunking commands
use anyhow::{Context, Result};
use std::io::Read;
pub mod cache;
pub mod chunk;
pub mod config;
#[cfg(feature = "embeddings")]
pub mod embed;
pub mod extract;
pub mod extract_structured;
pub mod overrides;
#[cfg(any(feature = "api", feature = "mcp"))]
pub mod server;
// Re-export command functions for convenience
pub use cache::{clear_command, manifest_command, stats_command, warm_command};
pub use chunk::chunk_command;
pub use config::load_config;
#[cfg(feature = "embeddings")]
pub use embed::embed_command;
pub use extract::{batch_command, extract_command};
#[cfg(feature = "mcp")]
pub use server::mcp_command;
#[cfg(feature = "api")]
pub use server::serve_command;
/// Read all of stdin and return it with surrounding whitespace removed.
///
/// # Errors
///
/// Returns an error if stdin cannot be read, or if nothing but whitespace
/// was received.
pub fn read_stdin() -> Result<String> {
    let mut buffer = String::new();
    std::io::stdin()
        .read_to_string(&mut buffer)
        .context("Failed to read from stdin")?;

    match buffer.trim() {
        "" => anyhow::bail!("No input received from stdin. Provide text via --text or pipe it to stdin."),
        text => Ok(text.to_string()),
    }
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,104 @@
//! Server command - Start API and MCP servers
//!
//! This module provides commands for starting the Kreuzberg API server
//! and the MCP (Model Context Protocol) server.
use anyhow::Result;
/// Execute the API server command.
///
/// Builds the server configuration in three layers, later layers overriding
/// earlier ones: the config file (or defaults when no file is given),
/// environment variable overrides, then the `cli_host` / `cli_port`
/// arguments. It then runs the server on a fresh Tokio runtime.
///
/// # Errors
///
/// Returns an error if the server configuration cannot be loaded, if the
/// environment overrides cannot be applied, if the runtime cannot be created,
/// or if the server fails.
#[cfg(feature = "api")]
pub fn serve_command(
    cli_host: Option<String>,
    cli_port: Option<u16>,
    extraction_config: kreuzberg::ExtractionConfig,
    config_path: Option<std::path::PathBuf>,
) -> Result<()> {
    use anyhow::Context;
    use kreuzberg::ServerConfig;

    // Load server config from same file or defaults
    let mut server_config = match config_path.as_ref() {
        Some(path) => ServerConfig::from_file(path).with_context(|| {
            format!(
                "Failed to load server configuration from '{}'. \
                 Ensure the file contains valid server settings under [server] section or at root level.",
                path.display()
            )
        })?,
        None => ServerConfig::default(),
    };

    // Apply environment variable overrides (precedence: env vars > config file)
    server_config.apply_env_overrides()?;

    // CLI args override everything (highest precedence)
    if let Some(host) = cli_host {
        server_config.host = host;
    }
    if let Some(port) = cli_port {
        server_config.port = port;
    }

    // Log the final configuration for debugging
    tracing::info!(
        "Starting Kreuzberg API server on http://{}",
        server_config.listen_addr()
    );

    let runtime = tokio::runtime::Runtime::new()?;
    // The config is cloned because the error message below still needs the listen address.
    let served = runtime.block_on(kreuzberg::api::serve_with_server_config(
        extraction_config,
        server_config.clone(),
    ));
    served.with_context(|| {
        format!(
            "Failed to start API server on {}. Ensure the port is not already in use and you have permission to bind to this address.",
            server_config.listen_addr()
        )
    })?;
    Ok(())
}
/// Launch the Kreuzberg MCP server over the requested transport.
///
/// `transport` is compared case-insensitively against `stdio` and `http`.
/// The host and port are only used by the HTTP transport, which is compiled
/// in only when the `mcp-http` feature is enabled; without that feature the
/// two parameters are accepted but ignored.
///
/// # Errors
///
/// Returns an error if the Tokio runtime cannot be created, the server fails,
/// the transport name is unknown, or `http` is requested in a build without
/// the `mcp-http` feature.
#[cfg(feature = "mcp")]
pub fn mcp_command(
    config: kreuzberg::ExtractionConfig,
    transport: String,
    #[cfg(feature = "mcp-http")] host: String,
    #[cfg(feature = "mcp-http")] port: u16,
    #[cfg(not(feature = "mcp-http"))] _host: String,
    #[cfg(not(feature = "mcp-http"))] _port: u16,
) -> Result<()> {
    tracing::debug!("Starting Kreuzberg MCP server with transport: {}", transport);

    let runtime = tokio::runtime::Runtime::new()?;
    let requested = transport.to_lowercase();

    match requested.as_str() {
        "stdio" => {
            let served = runtime.block_on(kreuzberg::mcp::start_mcp_server_with_config(config));
            served.map_err(|err| anyhow::anyhow!("Failed to start MCP server: {}", err))?;
        }
        "http" => {
            // Exactly one of the two blocks below survives conditional compilation.
            #[cfg(not(feature = "mcp-http"))]
            {
                anyhow::bail!(
                    "HTTP transport requires 'mcp-http' feature. \
                     Rebuild with: cargo build --features mcp-http"
                );
            }
            #[cfg(feature = "mcp-http")]
            {
                tracing::debug!("Starting MCP server on http://{}:{}", host, port);
                let served =
                    runtime.block_on(kreuzberg::mcp::start_mcp_server_http_with_config(&host, port, config));
                served.map_err(|err| anyhow::anyhow!("Failed to start MCP server on {}:{}: {}", host, port, err))?;
            }
        }
        unknown => {
            anyhow::bail!("Unknown transport '{}'. Use 'stdio' or 'http'", unknown);
        }
    }
    Ok(())
}

View File

@@ -0,0 +1,230 @@
//! Tree-sitter grammar management commands.
//!
//! This module provides commands for downloading, listing, and managing
//! tree-sitter grammar parsers via the tree-sitter-language-pack crate.
use anyhow::{Context, Result};
use serde_json::json;
use std::path::PathBuf;
use crate::{WireFormat, style};
/// Execute the tree-sitter download command.
///
/// Downloads tree-sitter grammar parsers based on the provided arguments:
/// - Specific languages by name
/// - All available languages (--all)
/// - Language groups (--groups)
pub fn download_command(
languages: Vec<String>,
all: bool,
groups: Option<Vec<String>>,
cache_dir: Option<PathBuf>,
format: WireFormat,
) -> Result<()> {
// Apply custom cache directory if provided
if let Some(ref dir) = cache_dir {
let config = tree_sitter_language_pack::PackConfig {
cache_dir: Some(dir.clone()),
languages: None,
groups: None,
};
tree_sitter_language_pack::configure(&config).context("Failed to configure custom cache directory")?;
}
let count: usize;
let description: String;
if all {
count = tree_sitter_language_pack::download_all().context("Failed to download all tree-sitter grammars")?;
description = "all available languages".to_string();
} else if let Some(ref group_list) = groups {
let config = tree_sitter_language_pack::PackConfig {
cache_dir: cache_dir.clone(),
languages: None,
groups: Some(group_list.clone()),
};
tree_sitter_language_pack::init(&config).context("Failed to download tree-sitter grammar groups")?;
count = 0; // init does not return a count
description = format!("groups: {}", group_list.join(", "));
} else if !languages.is_empty() {
let refs: Vec<&str> = languages.iter().map(String::as_str).collect();
count = tree_sitter_language_pack::download(&refs).context("Failed to download tree-sitter grammars")?;
description = format!("languages: {}", languages.join(", "));
} else {
anyhow::bail!("No languages specified. Use language names, --all, --groups, or --from-config.");
}
match format {
WireFormat::Text => {
println!("{}", style::header("Tree-sitter Download"));
println!("{}", style::dim("===================="));
println!("{} {}", style::label("Requested:"), description);
if groups.is_none() || all || !languages.is_empty() {
println!(
"{} {}",
style::label("Newly downloaded:"),
style::success(&count.to_string())
);
}
if let Some(ref dir) = cache_dir {
println!(
"{} {}",
style::label("Cache directory:"),
style::success(&dir.display().to_string())
);
}
println!("{}", style::success("Done"));
}
WireFormat::Json => {
let mut output = json!({
"requested": description,
"newly_downloaded": count,
});
if let Some(ref dir) = cache_dir {
output["cache_dir"] = json!(dir.to_string_lossy());
}
println!(
"{}",
serde_json::to_string_pretty(&output).context("Failed to serialize download results to JSON")?
);
}
WireFormat::Toon => {
let mut output = json!({
"requested": description,
"newly_downloaded": count,
});
if let Some(ref dir) = cache_dir {
output["cache_dir"] = json!(dir.to_string_lossy());
}
println!(
"{}",
serde_toon::to_string(&output).context("Failed to serialize download results to TOON")?
);
}
}
Ok(())
}
/// Execute the tree-sitter list command.
///
/// Prints either the downloaded grammars (`downloaded_only`) or every grammar
/// in the manifest. When `filter` is set, only names containing it are kept;
/// the comparison lowercases both sides.
///
/// # Errors
///
/// Returns an error if the manifest cannot be fetched or the result cannot be
/// serialized to the requested format.
pub fn list_command(downloaded_only: bool, filter: Option<String>, format: WireFormat) -> Result<()> {
    let languages = if downloaded_only {
        tree_sitter_language_pack::downloaded_languages()
    } else {
        tree_sitter_language_pack::manifest_languages().context("Failed to fetch tree-sitter language manifest")?
    };

    // Lowercase the needle once; `None` keeps every language.
    let needle = filter.as_ref().map(|f| f.to_lowercase());
    let filtered: Vec<&String> = languages
        .iter()
        .filter(|name| match &needle {
            Some(wanted) => name.to_lowercase().contains(wanted),
            None => true,
        })
        .collect();

    let source = if downloaded_only { "downloaded" } else { "available" };

    // Shared payload for the structured formats.
    let payload = || {
        json!({
            "source": source,
            "count": filtered.len(),
            "filter": filter,
            "languages": filtered,
        })
    };

    match format {
        WireFormat::Text => {
            let filter_note = match &filter {
                Some(f) => format!(", filter: '{f}'"),
                None => String::new(),
            };
            println!(
                "{} ({} {}{})",
                style::header("Tree-sitter Languages"),
                filtered.len(),
                source,
                filter_note
            );
            println!("{}", style::dim("====================="));
            for lang in &filtered {
                println!("  {}", style::success(lang));
            }
        }
        WireFormat::Json => {
            println!(
                "{}",
                serde_json::to_string_pretty(&payload()).context("Failed to serialize language list to JSON")?
            );
        }
        WireFormat::Toon => {
            println!(
                "{}",
                serde_toon::to_string(&payload()).context("Failed to serialize language list to TOON")?
            );
        }
    }
    Ok(())
}
/// Execute the tree-sitter cache-dir command.
///
/// Prints the cache directory reported by the language pack, either as a
/// labelled line or as a `{ "cache_dir": ... }` object in JSON/TOON.
///
/// # Errors
///
/// Returns an error if the directory cannot be determined or the result
/// cannot be serialized.
pub fn cache_dir_command(format: WireFormat) -> Result<()> {
    let dir = tree_sitter_language_pack::cache_dir().context("Failed to determine tree-sitter cache directory")?;
    let dir_str = dir.to_string_lossy();

    // Shared payload for the structured formats.
    let payload = || json!({ "cache_dir": dir_str });

    match format {
        WireFormat::Text => println!("{} {}", style::label("Cache directory:"), style::success(&dir_str)),
        WireFormat::Json => {
            let rendered =
                serde_json::to_string_pretty(&payload()).context("Failed to serialize cache directory to JSON")?;
            println!("{}", rendered);
        }
        WireFormat::Toon => {
            let rendered =
                serde_toon::to_string(&payload()).context("Failed to serialize cache directory to TOON")?;
            println!("{}", rendered);
        }
    }
    Ok(())
}
/// Execute the tree-sitter clean command.
///
/// Asks the language pack to clear its cache, then reports success in the
/// requested format (`{ "status": "cleared" }` for JSON/TOON).
///
/// # Errors
///
/// Returns an error if the cache cannot be cleaned or the result cannot be
/// serialized.
pub fn clean_command(format: WireFormat) -> Result<()> {
    tree_sitter_language_pack::clean_cache().context("Failed to clean tree-sitter cache")?;

    // Shared payload for the structured formats.
    let payload = || json!({ "status": "cleared" });

    match format {
        WireFormat::Text => println!("{}", style::success("Tree-sitter cache cleared successfully")),
        WireFormat::Json => {
            let rendered =
                serde_json::to_string_pretty(&payload()).context("Failed to serialize clean result to JSON")?;
            println!("{}", rendered);
        }
        WireFormat::Toon => {
            let rendered = serde_toon::to_string(&payload()).context("Failed to serialize clean result to TOON")?;
            println!("{}", rendered);
        }
    }
    Ok(())
}

View File

@@ -0,0 +1,238 @@
//! Logging helpers for the Kreuzberg CLI.
//!
//! Provides a [`build_env_filter`] function that layers default third-party
//! transport suppressions on top of whatever the caller or `RUST_LOG` specifies.
//! User-supplied per-target rules in `RUST_LOG` always win because a default
//! suppression is skipped whenever the user's filter already names that target.
use tracing_subscriber::EnvFilter;
/// Third-party crates that are noisy at their own default level.
///
/// These are added as *fallback* directives: if `RUST_LOG` or `level_override`
/// already contain a per-target rule for any of these crates it takes precedence,
/// so the user can still do `RUST_LOG=ureq=debug` to restore full transport logs.
///
/// Each entry is a `target=level` directive string. [`build_env_filter`] parses
/// every entry and skips those whose target already appears in the base filter.
const QUIET_DIRECTIVES: &[&str] = &[
// Targets limited to `warn`.
"ureq=warn",
"ureq_proto=warn",
"rustls=warn",
"hyper_util=warn",
// Targets pinned to `info`: the same as the default `info` root, so these
// two entries only make a difference when the root level is something else.
"hf_hub=info",
"tower_http=info",
];
/// Return the target portion of a `target=level` directive.
///
/// The split happens at the first `=`, so `"ureq=warn"` yields `Some("ureq")`.
/// A directive without any `=` (for example a bare level such as `"info"`)
/// yields `None`.
fn directive_target(directive: &str) -> Option<&str> {
    let (target, _level) = directive.split_once('=')?;
    Some(target)
}
/// Build an [`EnvFilter`] with third-party transport crates suppressed by default.
///
/// Precedence (highest first):
/// 1. Per-target directives already present in `RUST_LOG` (or `level_override`).
/// 2. The [`QUIET_DIRECTIVES`] suppressions added here.
/// 3. Root level: `level_override` → `RUST_LOG` → `"info"`.
///
/// Per-target directives that the user has already set are **not** overridden:
/// we skip adding a quiet directive when the base filter already contains a
/// rule for the same target crate. This is necessary because
/// [`EnvFilter::add_directive`] appends rather than guards — a later-added
/// per-target directive for the same crate takes precedence.
///
/// # Arguments
///
/// * `level_override` — explicit root-level string from a CLI flag (e.g. `"debug"`).
/// When `Some`, it replaces `RUST_LOG` entirely for the root level.
pub fn build_env_filter(level_override: Option<&str>) -> EnvFilter {
// Use try_new on user input so a malformed --log-level falls back to info
// instead of panicking the CLI.
let base = level_override
.and_then(|level| EnvFilter::try_new(level).ok())
.or_else(|| EnvFilter::try_from_default_env().ok())
.unwrap_or_else(|| EnvFilter::new("info"));
// Snapshot the existing directive set so we can skip quiet directives
// whose target the user has already configured explicitly.
let existing_targets: std::collections::HashSet<String> = base
.to_string()
.split(',')
.filter_map(|chunk| directive_target(chunk).map(|t| t.trim().to_string()))
.collect();
QUIET_DIRECTIVES
.iter()
.filter(|directive| {
// Only add the quiet directive when no per-target rule for this
// exact crate already exists. Word-boundary match via tokenized
// target set avoids `hf_hub` colliding with `hf_hub_server`.
directive_target(directive)
.map(|target| !existing_targets.contains(target))
.unwrap_or(true)
})
.fold(base, |filter, directive| {
filter.add_directive(directive.parse().expect("built-in logging directive must be valid"))
})
}
#[cfg(test)]
mod tests {
    use super::*;

    // NOTE(review): tests that call `build_env_filter(None)` consult the ambient
    // `RUST_LOG`; they assume it is unset (or does not mention the asserted
    // targets) in the environment that runs the suite — confirm for CI.

    /// Parse the directive string from an EnvFilter for assertion-level checks.
    ///
    /// `EnvFilter::to_string()` returns a comma-separated representation of all
    /// directives. We use this as a stable, public inspection surface.
    fn filter_directives(filter: &EnvFilter) -> String {
        filter.to_string()
    }

    /// Report whether `directives` contains `level` as a bare, target-less directive.
    ///
    /// The rendered filter is split into tokens so that the check neither
    /// depends on where the root directive is placed in the string nor matches
    /// per-target directives such as `ureq=debug`.
    fn has_root_level(directives: &str, level: &str) -> bool {
        directives.split(',').any(|token| token.trim() == level)
    }

    #[test]
    fn default_filter_suppresses_ureq() {
        // No env, no override → ureq and ureq_proto must be suppressed.
        let filter = build_env_filter(None);
        let directives = filter_directives(&filter);
        assert!(
            directives.contains("ureq=warn"),
            "ureq=warn must be present in default filter; got: {directives}"
        );
        assert!(
            directives.contains("ureq_proto=warn"),
            "ureq_proto=warn must be present in default filter; got: {directives}"
        );
        assert!(
            directives.contains("rustls=warn"),
            "rustls=warn must be present in default filter; got: {directives}"
        );
    }

    #[test]
    fn default_filter_keeps_kreuzberg_info() {
        // Root level info → kreuzberg has no suppression applied.
        let filter = build_env_filter(None);
        let directives = filter_directives(&filter);
        assert!(
            !directives.contains("kreuzberg=warn") && !directives.contains("kreuzberg=error"),
            "kreuzberg must not be suppressed in the default filter; got: {directives}"
        );
    }

    #[test]
    fn env_override_wins_for_third_party() {
        // Simulate RUST_LOG=ureq=debug by passing it as the level_override.
        // build_env_filter must detect the existing ureq= directive and skip the
        // ureq=warn suppression, so ureq=debug survives in the final filter.
        let filter = build_env_filter(Some("info,ureq=debug"));
        let directives = filter.to_string();
        assert!(
            directives.contains("ureq=debug"),
            "user-supplied ureq=debug must be preserved; got: {directives}"
        );
        assert!(
            !directives.contains("ureq=warn"),
            "ureq=warn suppression must not be added when user already set ureq=debug; got: {directives}"
        );
    }

    #[test]
    fn level_override_wins() {
        // CLI flag "debug" → root must be debug; suppression directives still present.
        let filter = build_env_filter(Some("debug"));
        let directives = filter_directives(&filter);
        assert!(
            has_root_level(&directives, "debug"),
            "root debug level must appear in filter with --log-level debug; got: {directives}"
        );
        // Suppression for ureq must still be layered on top.
        assert!(
            directives.contains("ureq=warn"),
            "ureq=warn suppression must still be present even under --log-level debug; got: {directives}"
        );
    }

    #[test]
    fn tower_http_suppressed_at_default() {
        // No override → tower_http must be suppressed.
        let filter = build_env_filter(None);
        let directives = filter_directives(&filter);
        assert!(
            directives.contains("tower_http=info") || directives.contains("tower_http=warn"),
            "tower_http must be suppressed at default level; got: {directives}"
        );
    }

    #[test]
    fn all_quiet_directives_are_valid() {
        // Ensure every built-in directive parses without panic.
        for directive in super::QUIET_DIRECTIVES {
            directive
                .parse::<tracing_subscriber::filter::Directive>()
                .expect("built-in directive is invalid");
        }
    }

    #[test]
    fn no_level_override_uses_info_root() {
        // Without RUST_LOG set and no override, root should default to info.
        // A bare `debug` or `trace` token anywhere in the rendered filter would
        // mean the root level is noisier than info.
        let filter = build_env_filter(None);
        let directives = filter_directives(&filter);
        let root_is_noisier_than_info = has_root_level(&directives, "debug") || has_root_level(&directives, "trace");
        assert!(
            !root_is_noisier_than_info,
            "default root level must not be debug/trace without RUST_LOG; got: {directives}"
        );
    }

    #[test]
    fn hf_hub_suppressed_at_default() {
        let filter = build_env_filter(None);
        let directives = filter_directives(&filter);
        assert!(
            directives.contains("hf_hub=info"),
            "hf_hub must be suppressed to info at default; got: {directives}"
        );
    }

    #[test]
    fn hyper_util_suppressed_at_default() {
        let filter = build_env_filter(None);
        let directives = filter_directives(&filter);
        assert!(
            directives.contains("hyper_util=warn"),
            "hyper_util must be suppressed to warn at default; got: {directives}"
        );
    }

    #[test]
    fn malformed_level_override_falls_back_to_info() {
        // Garbage CLI flag must NOT panic — try_new returns Err and we fall back
        // to RUST_LOG / info default.
        let filter = build_env_filter(Some(":::garbage"));
        let directives = filter_directives(&filter);
        // Quiet directives should still be layered, proving we recovered.
        assert!(
            directives.contains("ureq=warn"),
            "ureq=warn must still be present after malformed override; got: {directives}"
        );
    }

    #[test]
    fn similar_target_name_does_not_block_suppression() {
        // A user-supplied directive for `hf_hub_server` must NOT cause the
        // `hf_hub=info` suppression to be skipped (regression test for the
        // earlier substring-containment bug).
        let filter = build_env_filter(Some("info,hf_hub_server=debug"));
        let directives = filter.to_string();
        assert!(
            directives.contains("hf_hub_server=debug"),
            "user directive for hf_hub_server must survive; got: {directives}"
        );
        assert!(
            directives.contains("hf_hub=info"),
            "hf_hub=info suppression must still be applied; got: {directives}"
        );
    }
}

View File

@@ -0,0 +1,971 @@
//! Kreuzberg CLI - Command-line interface for document intelligence.
//!
//! This binary provides a command-line interface to the Kreuzberg document intelligence
//! library, supporting document extraction, MIME type detection, caching, and batch operations.
//!
//! # Architecture
//!
//! The CLI is built using `clap` for argument parsing and provides these main commands:
//! - `extract`: Extract text/data from a single document
//! - `batch`: Process multiple documents in parallel
//! - `detect`: Identify MIME type of a file
//! - `cache`: Manage cache (clear, stats)
//! - `serve`: Start API server (requires `api` feature)
//! - `version`: Show version information
//!
//! # Configuration
//!
//! The CLI supports configuration files in TOML, YAML, or JSON formats:
//! - Explicit: `--config path/to/config.toml`
//! - Auto-discovery: Searches for `kreuzberg.{toml,yaml,json}` in current and parent directories
//! - Inline JSON: `--config-json '{"ocr": {"backend": "tesseract"}}'`
//! - Command-line flags override config file settings
//!
//! Configuration precedence (highest to lowest):
//! 1. Individual CLI flags (--output-format, --ocr, etc.)
//! 2. Inline JSON config (--config-json or --config-json-base64)
//! 3. Config file (--config path.toml)
//! 4. Default values
//!
//! # Exit Codes
//!
//! - 0: Success
//! - Non-zero: Error (see stderr for details)
//!
//! # Examples
//!
//! ```bash
//! # Extract text from a PDF
//! kreuzberg extract document.pdf
//!
//! # Extract with OCR enabled
//! kreuzberg extract scanned.pdf --ocr true
//!
//! # Extract with inline JSON config
//! kreuzberg extract doc.pdf --config-json '{"ocr":{"backend":"tesseract"}}'
//!
//! # Batch processing
//! kreuzberg batch *.pdf --output-format json
//!
//! # Detect MIME type
//! kreuzberg detect unknown-file.bin
//! ```
#![deny(unsafe_code)]
mod commands;
mod logging;
mod output;
mod style;
use anyhow::{Context, Result};
use base64::{Engine as _, engine::general_purpose::STANDARD};
use clap::{CommandFactory, Parser, Subcommand};
#[cfg(feature = "embeddings")]
use commands::embed_command;
#[cfg(feature = "mcp")]
use commands::mcp_command;
use commands::overrides::ExtractionOverrides;
#[cfg(feature = "api")]
use commands::serve_command;
use commands::{
batch_command, chunk_command, clear_command, extract_command,
extract_structured::{ExtractStructuredArgs, extract_structured_command},
load_config, manifest_command, stats_command, warm_command,
};
use kreuzberg::{OutputFormat as ContentOutputFormat, detect_mime_type};
use serde_json::json;
use std::path::{Path, PathBuf};
/// Kreuzberg document intelligence CLI
// NOTE(review): with clap's derive API the `///` comments on this struct and
// its fields presumably double as `--help` text, so editing them changes
// user-visible output — confirm before rewording.
#[derive(Parser)]
#[command(name = "kreuzberg")]
#[command(version, about, long_about = None)]
struct Cli {
/// Set log level (trace, debug, info, warn, error). Overrides RUST_LOG env var.
// Declared `global = true`; presumably this lets the flag be given after a
// subcommand as well as before it.
#[arg(long, global = true)]
log_level: Option<String>,
// The subcommand to run; see `Commands` for the available variants.
#[command(subcommand)]
command: Commands,
}
// Top-level subcommands of the CLI. The `///` comments below are consumed by
// clap's derive macros as help text, so they describe user-facing behavior.
// Every `--format` flag is parsed through `WireFormat`, which accepts `text`,
// `json`, and `toon`; the help strings list all three.
#[derive(Subcommand)]
enum Commands {
    /// Extract text from a document
    Extract {
        /// Path to the document
        path: PathBuf,
        /// Path to config file (TOML, YAML, or JSON). If not specified, searches for kreuzberg.toml/yaml/json in current and parent directories.
        #[arg(short, long)]
        config: Option<PathBuf>,
        /// Inline JSON configuration. Applied after config file but before individual flags.
        ///
        /// Example: --config-json '{"ocr":{"backend":"tesseract"},"chunking":{"max_chars":1000}}'
        #[arg(long)]
        config_json: Option<String>,
        /// Base64-encoded JSON configuration. Useful for shell environments where quotes are problematic.
        ///
        /// Example: --config-json-base64 eyJvY3IiOnsiYmFja2VuZCI6InRlc3NlcmFjdCJ9fQ==
        #[arg(long)]
        config_json_base64: Option<String>,
        /// MIME type hint (auto-detected if not provided)
        #[arg(short, long)]
        mime_type: Option<String>,
        /// Output format for CLI results (text, json, or toon).
        ///
        /// Controls how the CLI displays results, not the extraction content format.
        #[arg(short, long, default_value = "text")]
        format: WireFormat,
        /// Extraction configuration overrides
        #[command(flatten)]
        overrides: ExtractionOverrides,
    },
    /// Extract structured data from a document using an LLM
    ExtractStructured {
        /// Path to the document file
        path: PathBuf,
        /// Path to JSON schema file defining the output structure
        #[arg(long)]
        schema: PathBuf,
        /// LLM model (e.g., "openai/gpt-4o")
        #[arg(long)]
        model: String,
        /// API key for the LLM provider
        #[arg(long)]
        api_key: Option<String>,
        /// Custom Jinja2 prompt template
        #[arg(long)]
        prompt: Option<String>,
        /// Schema name
        #[arg(long, default_value = "extraction")]
        schema_name: Option<String>,
        /// Enable strict mode
        #[arg(long)]
        strict: bool,
        /// Config file path
        #[arg(short, long)]
        config: Option<PathBuf>,
        /// Output format (text, json, or toon)
        #[arg(short, long, default_value = "json")]
        format: WireFormat,
    },
    /// Batch extract from multiple documents
    Batch {
        /// Paths to documents
        paths: Vec<PathBuf>,
        /// Path to config file (TOML, YAML, or JSON). If not specified, searches for kreuzberg.toml/yaml/json in current and parent directories.
        #[arg(short, long)]
        config: Option<PathBuf>,
        /// Inline JSON configuration. Applied after config file but before individual flags.
        ///
        /// Example: --config-json '{"ocr":{"backend":"tesseract"},"chunking":{"max_chars":1000}}'
        #[arg(long)]
        config_json: Option<String>,
        /// Base64-encoded JSON configuration. Useful for shell environments where quotes are problematic.
        ///
        /// Example: --config-json-base64 eyJvY3IiOnsiYmFja2VuZCI6InRlc3NlcmFjdCJ9fQ==
        #[arg(long)]
        config_json_base64: Option<String>,
        /// Output format for CLI results (text, json, or toon).
        ///
        /// Controls how the CLI displays results, not the extraction content format.
        #[arg(short, long, default_value = "json")]
        format: WireFormat,
        /// Extraction configuration overrides
        #[command(flatten)]
        overrides: ExtractionOverrides,
        /// Path to a JSON file mapping file paths to per-file extraction config overrides.
        /// The JSON should be an object where keys are file paths and values are FileExtractionConfig objects.
        /// Example: {"doc1.pdf": {"force_ocr": true}, "doc2.pdf": {"output_format": "markdown"}}
        #[arg(long)]
        file_configs: Option<PathBuf>,
    },
    /// Detect MIME type of a file
    Detect {
        /// Path to the file
        path: PathBuf,
        /// Output format (text, json, or toon)
        #[arg(short, long, default_value = "text")]
        format: WireFormat,
    },
    /// List all supported document formats
    Formats {
        /// Output format (text, json, or toon)
        #[arg(short, long, default_value = "text")]
        format: WireFormat,
    },
    /// Show version information
    Version {
        /// Output format (text, json, or toon)
        #[arg(short, long, default_value = "text")]
        format: WireFormat,
    },
    /// Cache management operations
    Cache {
        #[command(subcommand)]
        command: CacheCommands,
    },
    /// Start the API server
    ///
    /// Configuration is loaded with the following precedence (highest to lowest):
    /// 1. CLI arguments (--host, --port)
    /// 2. Environment variables (KREUZBERG_HOST, KREUZBERG_PORT)
    /// 3. Config file (TOML, YAML, or JSON)
    /// 4. Built-in defaults (127.0.0.1:8000)
    ///
    /// The config file can contain both extraction and server settings under [server] section.
    #[cfg(feature = "api")]
    Serve {
        /// Host to bind to (e.g., "127.0.0.1" or "0.0.0.0"). CLI arg overrides config file and env vars.
        #[arg(short = 'H', long)]
        host: Option<String>,
        /// Port to bind to. CLI arg overrides config file and env vars.
        #[arg(short, long)]
        port: Option<u16>,
        /// Path to config file (TOML, YAML, or JSON). If not specified, searches for kreuzberg.toml/yaml/json in current and parent directories.
        #[arg(short, long)]
        config: Option<PathBuf>,
    },
    /// Start the MCP (Model Context Protocol) server
    #[cfg(feature = "mcp")]
    Mcp {
        /// Path to config file (TOML, YAML, or JSON). If not specified, searches for kreuzberg.toml/yaml/json in current and parent directories.
        #[arg(short, long)]
        config: Option<PathBuf>,
        /// Transport mode: stdio (default) or http
        #[arg(long, default_value = "stdio")]
        transport: String,
        /// HTTP host (only for --transport http)
        #[arg(long, default_value = "127.0.0.1")]
        host: String,
        /// HTTP port (only for --transport http)
        #[arg(long, default_value = "8001")]
        port: u16,
    },
    /// API utilities
    #[cfg(feature = "api")]
    Api {
        #[command(subcommand)]
        command: ApiCommands,
    },
    /// Generate embeddings for text
    ///
    /// Generates vector embeddings for one or more text inputs using a specified preset model
    /// or an LLM provider. Reads from --text flag or stdin if no text is provided.
    #[cfg(feature = "embeddings")]
    Embed {
        /// Text to embed. Can be specified multiple times for batch embedding.
        #[arg(long)]
        text: Vec<String>,
        /// Embedding preset (fast, balanced, quality, multilingual). Used with --provider local.
        #[arg(long, default_value = "balanced")]
        preset: String,
        /// Embedding provider: "local" (default, ONNX), "llm" (liter-llm), or "plugin" (registered in-process backend)
        #[arg(long, default_value = "local")]
        provider: String,
        /// LLM model for provider-hosted embeddings (e.g., "openai/text-embedding-3-small").
        /// Required when --provider is "llm".
        #[arg(long)]
        model: Option<String>,
        /// API key for the LLM provider
        #[arg(long)]
        api_key: Option<String>,
        /// Name of a pre-registered in-process embedding backend.
        /// Required when --provider is "plugin". The backend must have been
        /// registered via `kreuzberg::plugins::register_embedding_backend`
        /// before this command runs.
        #[arg(long)]
        plugin: Option<String>,
        /// Output format (text, json, or toon)
        #[arg(short, long, default_value = "json")]
        format: WireFormat,
    },
    /// Chunk text for processing
    ///
    /// Splits text into chunks using configurable size and overlap.
    /// Reads from --text flag or stdin if no text is provided.
    Chunk {
        /// Text to chunk. If not provided, reads from stdin.
        #[arg(long)]
        text: Option<String>,
        /// Path to config file (TOML, YAML, or JSON)
        #[arg(short, long)]
        config: Option<PathBuf>,
        /// Chunk size in characters
        #[arg(long)]
        chunk_size: Option<usize>,
        /// Chunk overlap in characters
        #[arg(long)]
        chunk_overlap: Option<usize>,
        /// Chunker type: text, markdown, yaml, or semantic
        #[arg(long, default_value = "text")]
        chunker_type: String,
        /// Tokenizer model for token-based chunk sizing (e.g., "Xenova/gpt-4o").
        /// Requires the chunking-tokenizers feature.
        #[arg(long)]
        chunking_tokenizer: Option<String>,
        /// Topic threshold for semantic chunking (0.0-1.0, default: 0.75)
        #[arg(long)]
        topic_threshold: Option<f32>,
        /// Output format (text, json, or toon)
        #[arg(short, long, default_value = "json")]
        format: WireFormat,
    },
    /// Generate shell completions
    ///
    /// Outputs shell completion scripts for the specified shell.
    /// Install with: eval "$(kreuzberg completions bash)"
    Completions {
        /// Shell to generate completions for
        #[arg(value_enum)]
        shell: clap_complete::Shell,
    },
}
// Subcommands nested under `Commands::Api`. Compiled only when the `api`
// feature is enabled, the same gate that `Commands::Api` carries.
#[cfg(feature = "api")]
#[derive(Subcommand)]
enum ApiCommands {
/// Output the OpenAPI schema (JSON)
///
/// Prints the full OpenAPI 3.1 specification for the kreuzberg REST API.
/// Useful for code generation, documentation, and API client tooling.
// Takes no arguments of its own.
Schema,
}
// Subcommands nested under `Commands::Cache`. The `///` comments below are
// consumed by clap's derive macros as help text. Every `--format` flag is
// parsed through `WireFormat`, which accepts `text`, `json`, and `toon`; the
// help strings list all three.
#[derive(Subcommand)]
enum CacheCommands {
    /// Show cache statistics
    Stats {
        /// Cache directory (default: .kreuzberg in current directory)
        #[arg(short, long)]
        cache_dir: Option<PathBuf>,
        /// Output format (text, json, or toon)
        #[arg(short, long, default_value = "text")]
        format: WireFormat,
    },
    /// Clear the cache
    Clear {
        /// Cache directory (default: .kreuzberg in current directory)
        #[arg(short, long)]
        cache_dir: Option<PathBuf>,
        /// Output format (text, json, or toon)
        #[arg(short, long, default_value = "text")]
        format: WireFormat,
    },
    /// Output model manifest (expected model files, checksums, sizes)
    ///
    /// Outputs a JSON manifest of all model files required by kreuzberg,
    /// including their relative paths, SHA256 checksums, and sizes.
    /// Used for pre-populating model caches in containerized deployments.
    Manifest {
        /// Output format (text, json, or toon)
        #[arg(short, long, default_value = "json")]
        format: WireFormat,
    },
    /// Download all models eagerly
    ///
    /// Downloads all PaddleOCR and layout detection models for all supported
    /// languages. Unlike normal operation which downloads lazily on first use,
    /// this ensures all models are present in the cache directory.
    ///
    /// Use --all-embeddings to also download all 4 embedding model presets,
    /// or --embedding-model <preset> to download a specific one.
    ///
    /// By default, only the core layout models (rtdetr + tatr) are downloaded.
    /// Use --all-table-models to also download SLANeXT variants (~730MB).
    Warm {
        /// Cache directory (default: .kreuzberg in current directory, or KREUZBERG_CACHE_DIR)
        #[arg(short, long)]
        cache_dir: Option<PathBuf>,
        /// Output format (text, json, or toon)
        #[arg(short, long, default_value = "text")]
        format: WireFormat,
        /// Download all embedding model presets (fast, balanced, quality, multilingual)
        #[arg(long)]
        all_embeddings: bool,
        /// Download a specific embedding model preset
        #[arg(long, value_name = "PRESET")]
        embedding_model: Option<String>,
        /// Download all table structure models including SLANeXT variants (~730MB)
        #[arg(
            long,
            help = "Download all table structure models including SLANeXT variants (~730MB)"
        )]
        all_table_models: bool,
        /// Download all tree-sitter grammar parsers
        #[arg(long)]
        all_grammars: bool,
        /// Download specific tree-sitter grammar groups (comma-separated: web,systems,scripting,data,jvm,functional)
        #[arg(long, value_name = "GROUPS", value_delimiter = ',')]
        grammar_groups: Option<Vec<String>>,
        /// Download specific tree-sitter grammars by language name (comma-separated)
        #[arg(long, value_name = "LANGUAGES", value_delimiter = ',')]
        grammars: Option<Vec<String>>,
    },
}
/// Serialization format the CLI uses when printing a command's result.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum WireFormat {
    Text,
    Json,
    Toon,
}

/// Parses a format name case-insensitively (`text`, `json`, or `toon`).
///
/// Any other input yields an error string that echoes the input as given and
/// lists the accepted names.
impl std::str::FromStr for WireFormat {
    type Err = String;

    fn from_str(s: &str) -> std::result::Result<Self, Self::Err> {
        // Accepted spellings, compared against the lowercased input.
        const KNOWN: [(&str, WireFormat); 3] = [
            ("text", WireFormat::Text),
            ("json", WireFormat::Json),
            ("toon", WireFormat::Toon),
        ];

        let wanted = s.to_lowercase();
        KNOWN
            .iter()
            .find(|(name, _)| *name == wanted)
            .map(|&(_, format)| format)
            .ok_or_else(|| format!("Invalid format: {}. Use 'text', 'json', or 'toon'", s))
    }
}
/// Content output format for extraction results.
///
/// Controls the format of the extracted content (not the CLI output format).
// NOTE(review): this enum derives `clap::ValueEnum`, so the `///` comments on
// the variants are presumably shown as value help — confirm before rewording.
// Each variant has a same-named counterpart in `ContentOutputFormat`; see the
// `From` impl that follows.
#[derive(Clone, Copy, Debug, PartialEq, Eq, clap::ValueEnum)]
enum ContentOutputFormatArg {
/// Plain text (default)
Plain,
/// Markdown format
Markdown,
/// Djot markup format
Djot,
/// HTML format
Html,
/// JSON tree format with heading-driven sections
Json,
}
impl From<ContentOutputFormatArg> for ContentOutputFormat {
fn from(arg: ContentOutputFormatArg) -> Self {
match arg {
ContentOutputFormatArg::Plain => ContentOutputFormat::Plain,
ContentOutputFormatArg::Markdown => ContentOutputFormat::Markdown,
ContentOutputFormatArg::Djot => ContentOutputFormat::Djot,
ContentOutputFormatArg::Html => ContentOutputFormat::Html,
ContentOutputFormatArg::Json => ContentOutputFormat::Json,
}
}
}
/// Checks that `path` names an existing regular file.
///
/// The two checks run in order, so a missing path is reported as "not found"
/// and an existing non-file (such as a directory) as "not a file".
///
/// # Errors
///
/// Returns an error if:
/// - `path.exists()` is false
/// - `path.is_file()` is false
fn validate_file_exists(path: &Path) -> Result<()> {
    anyhow::ensure!(
        path.exists(),
        "File not found: '{}'. Please check that the file exists and is accessible.",
        path.display()
    );
    anyhow::ensure!(
        path.is_file(),
        "Path is not a file: '{}'. Please provide a path to a regular file.",
        path.display()
    );
    Ok(())
}
/// Validates chunking parameters for correctness.
///
/// Ensures that chunking configuration makes sense: size must be positive and reasonable,
/// and overlap must be smaller than chunk size. This prevents common configuration errors
/// that would lead to cryptic failures from the underlying library.
///
/// Parameters that are `None` are not checked; the overlap check only runs
/// when both values are provided.
///
/// # Errors
///
/// Returns an error if:
/// - `chunk_size` is 0 (must be at least 1 character)
/// - `chunk_size` exceeds 1,000,000 characters (to prevent excessive memory usage)
/// - `chunk_overlap` is greater than or equal to `chunk_size` (overlap must be smaller)
fn validate_chunk_params(chunk_size: Option<usize>, chunk_overlap: Option<usize>) -> Result<()> {
    // Largest accepted chunk size, inclusive.
    const MAX_CHUNK_SIZE: usize = 1_000_000;

    if let Some(size) = chunk_size {
        if size == 0 {
            anyhow::bail!("Invalid chunk size: {}. Chunk size must be greater than 0.", size);
        }
        if size > MAX_CHUNK_SIZE {
            // The limit is inclusive, so the message says "must not exceed"
            // rather than "must be less than".
            anyhow::bail!(
                "Invalid chunk size: {}. Chunk size must not exceed 1,000,000 characters to avoid excessive memory usage.",
                size
            );
        }
    }

    if let (Some(overlap), Some(size)) = (chunk_overlap, chunk_size) {
        if overlap >= size {
            anyhow::bail!(
                "Invalid chunk overlap: {}. Overlap ({}) must be less than chunk size ({}).",
                overlap,
                overlap,
                size
            );
        }
    }
    Ok(())
}
/// Checks the list of paths handed to a batch extraction.
///
/// The list must be non-empty, and every entry must pass `validate_file_exists`.
/// Paths are checked in order and validation stops at the first failure, whose
/// error is annotated with the entry's 1-based position.
///
/// # Errors
///
/// Returns an error if:
/// - `paths` is empty (at least one file is required)
/// - any path does not exist or is not a regular file
fn validate_batch_paths(paths: &[PathBuf]) -> Result<()> {
    if paths.is_empty() {
        anyhow::bail!("No files provided for batch extraction. Please provide at least one file path.");
    }

    // Pair every path with its human-friendly (1-based) position.
    paths.iter().zip(1usize..).try_for_each(|(candidate, position)| {
        validate_file_exists(candidate).with_context(|| format!("Invalid file at position {}", position))
    })
}
/// Overlays an inline JSON override onto `config`.
///
/// `config_json` takes priority: when it is present it is parsed and merged, and
/// `config_json_base64` is not looked at. Otherwise, if `config_json_base64` is
/// present, it is base64-decoded, interpreted as UTF-8, parsed as JSON and merged.
/// When neither is present `config` is left unchanged.
///
/// # Errors
///
/// Returns an error if the JSON cannot be parsed, the base64 payload cannot be
/// decoded or is not valid UTF-8, or the merge into the existing config fails.
fn apply_json_overrides(
    config: &mut kreuzberg::ExtractionConfig,
    config_json: Option<String>,
    config_json_base64: Option<String>,
) -> Result<()> {
    // Plain JSON wins over the base64 form.
    if let Some(raw) = config_json {
        let overrides: serde_json::Value =
            serde_json::from_str(&raw).context("Failed to parse --config-json as JSON")?;
        let merged =
            merge_json_into_config(config, overrides).context("Failed to merge --config-json with file config")?;
        *config = merged;
        return Ok(());
    }

    let Some(encoded) = config_json_base64 else {
        return Ok(());
    };

    let decoded = STANDARD
        .decode(&encoded)
        .context("Failed to decode base64 in --config-json-base64")?;
    let text = String::from_utf8(decoded).context("Base64-decoded content is not valid UTF-8")?;
    let overrides: serde_json::Value =
        serde_json::from_str(&text).context("Failed to parse decoded --config-json-base64 as JSON")?;
    let merged = merge_json_into_config(config, overrides)
        .context("Failed to merge --config-json-base64 with file config")?;
    *config = merged;
    Ok(())
}
/// Produces a new extraction config by merging `json_value` over `base_config`.
///
/// The JSON value is serialized back to a string and handed to kreuzberg's
/// `merge_config_json`, which performs the actual merge. `base_config` itself is
/// not modified.
///
/// # Errors
///
/// Returns an error if serialization or the merge fails. In both cases the
/// underlying error is converted to an `anyhow` error carrying its display text.
fn merge_json_into_config(
    base_config: &kreuzberg::ExtractionConfig,
    json_value: serde_json::Value,
) -> Result<kreuzberg::ExtractionConfig> {
    let serialized = serde_json::to_string(&json_value).map_err(|err| anyhow::anyhow!("{}", err))?;
    let merged = kreuzberg::core::config::merge::merge_config_json(base_config, &serialized)
        .map_err(|err| anyhow::anyhow!("{}", err))?;
    Ok(merged)
}
/// CLI entry point.
///
/// Parses the command line, installs a tracing subscriber that writes to stderr,
/// and dispatches to the handler for the selected subcommand. Any error returned
/// by a handler or a validation step is propagated to the caller via `?`.
fn main() -> Result<()> {
let cli = Cli::parse();
// Logs are written to stderr. The result of `try_init` is deliberately discarded,
// so a failure to install the subscriber does not abort the command.
let env_filter = logging::build_env_filter(cli.log_level.as_deref());
let _ = tracing_subscriber::fmt()
.with_env_filter(env_filter)
.with_writer(std::io::stderr)
.try_init();
match cli.command {
// Single-file extraction. Inputs are validated before any config is loaded; the
// config is then layered in this order: config file, inline JSON, CLI overrides.
Commands::Extract {
path,
config: config_path,
config_json,
config_json_base64,
mime_type,
format,
overrides,
} => {
validate_file_exists(&path)?;
overrides.validate()?;
let mut config = load_config(config_path)?;
apply_json_overrides(&mut config, config_json, config_json_base64)?;
overrides.apply(&mut config);
extract_command(path, config, mime_type, format)?;
}
// Structured extraction. Both the document and the schema must be existing
// regular files; everything else is forwarded unchanged in `ExtractStructuredArgs`.
Commands::ExtractStructured {
path,
schema,
model,
api_key,
prompt,
schema_name,
strict,
config,
format,
} => {
validate_file_exists(&path)?;
validate_file_exists(&schema)?;
extract_structured_command(ExtractStructuredArgs {
path,
schema_path: schema,
model,
api_key,
prompt,
schema_name,
strict,
config_path: config,
format,
})?;
}
// Batch extraction. Uses the same config layering as `Extract`.
Commands::Batch {
paths,
config: config_path,
config_json,
config_json_base64,
format,
overrides,
file_configs,
} => {
validate_batch_paths(&paths)?;
overrides.validate()?;
let mut config = load_config(config_path)?;
apply_json_overrides(&mut config, config_json, config_json_base64)?;
overrides.apply(&mut config);
// Optional per-file configs: a JSON file parsed as an object mapping string keys
// to arbitrary JSON values (presumably keyed by file path — confirm against
// `batch_command`).
let file_configs_map = if let Some(file_configs_path) = file_configs {
let file_configs_json = std::fs::read_to_string(&file_configs_path)
.with_context(|| format!("Failed to read file configs from '{}'", file_configs_path.display()))?;
let map: std::collections::HashMap<String, serde_json::Value> =
serde_json::from_str(&file_configs_json).with_context(|| {
format!(
"Failed to parse file configs JSON from '{}'",
file_configs_path.display()
)
})?;
Some(map)
} else {
None
};
batch_command(paths, file_configs_map, config, format)?;
}
// MIME type detection for a single file.
Commands::Detect { path, format } => {
validate_file_exists(&path)?;
// Lossy conversion: non-UTF-8 path components are replaced. The string is cloned
// for the call because it is reused in the JSON/TOON payloads below.
// NOTE(review): the meaning of the `true` argument is defined by
// `detect_mime_type` and is not visible here — confirm.
let path_str = path.to_string_lossy().to_string();
let mime_type = detect_mime_type(path_str.clone(), true).with_context(|| {
format!(
"Failed to detect MIME type for file '{}'. Ensure the file is readable.",
path.display()
)
})?;
match format {
WireFormat::Text => {
println!("{}", style::success(&mime_type));
}
WireFormat::Json => {
let output = json!({
"path": path_str,
"mime_type": mime_type,
});
println!(
"{}",
serde_json::to_string_pretty(&output)
.context("Failed to serialize MIME type detection result to JSON")?
);
}
// TOON output carries the same payload as the JSON branch.
WireFormat::Toon => {
let output = json!({
"path": path_str,
"mime_type": mime_type,
});
println!(
"{}",
serde_toon::to_string(&output)
.context("Failed to serialize MIME type detection result to TOON")?
);
}
}
}
// Lists the supported formats as a two-column table (text) or serialized as-is.
Commands::Formats { format } => {
let formats = kreuzberg::core::mime::list_supported_formats();
match format {
WireFormat::Text => {
println!("{:<15} {}", style::label("EXTENSION"), style::label("MIME TYPE"));
println!("{}", style::dim(&format!("{:<15} ---------", "---------")));
for f in &formats {
println!("{:<15} {}", style::success(&format!(".{}", f.extension)), f.mime_type);
}
}
WireFormat::Json => {
println!(
"{}",
serde_json::to_string_pretty(&formats).context("Failed to serialize formats to JSON")?
);
}
WireFormat::Toon => {
println!(
"{}",
serde_toon::to_string(&formats).context("Failed to serialize formats to TOON")?
);
}
}
}
// Package name and version are baked in at compile time from Cargo metadata.
Commands::Version { format } => {
let version = env!("CARGO_PKG_VERSION");
let name = env!("CARGO_PKG_NAME");
match format {
WireFormat::Text => {
println!("{} {}", style::label(name), style::success(version));
}
WireFormat::Json => {
let output = json!({
"name": name,
"version": version,
});
println!(
"{}",
serde_json::to_string_pretty(&output)
.context("Failed to serialize version information to JSON")?
);
}
WireFormat::Toon => {
let output = json!({
"name": name,
"version": version,
});
println!(
"{}",
serde_toon::to_string(&output).context("Failed to serialize version information to TOON")?
);
}
}
}
// Only compiled with the `api` feature. Environment overrides are applied on top
// of the loaded config; the config path is also passed on to `serve_command`.
#[cfg(feature = "api")]
Commands::Serve {
host: cli_host,
port: cli_port,
config: config_path,
} => {
let mut extraction_config = load_config(config_path.clone())?;
extraction_config.apply_env_overrides()?;
serve_command(cli_host, cli_port, extraction_config, config_path)?;
}
// Only compiled with the `mcp` feature. `host` and `port` are listed under both
// `mcp-http` and `not(mcp-http)`, so they are bound in every build that has `mcp`.
#[cfg(feature = "mcp")]
Commands::Mcp {
config: config_path,
transport,
#[cfg(feature = "mcp-http")]
host,
#[cfg(feature = "mcp-http")]
port,
#[cfg(not(feature = "mcp-http"))]
host,
#[cfg(not(feature = "mcp-http"))]
port,
} => {
let mut config = load_config(config_path)?;
config.apply_env_overrides()?;
mcp_command(config, transport, host, port)?;
}
// Cache management: each subcommand forwards its arguments to a dedicated handler.
Commands::Cache { command } => match command {
CacheCommands::Stats { cache_dir, format } => {
stats_command(cache_dir, format)?;
}
CacheCommands::Clear { cache_dir, format } => {
clear_command(cache_dir, format)?;
}
CacheCommands::Manifest { format } => {
manifest_command(format)?;
}
CacheCommands::Warm {
cache_dir,
format,
all_embeddings,
embedding_model,
all_table_models,
all_grammars,
grammar_groups,
grammars,
} => {
warm_command(
cache_dir,
format,
all_embeddings,
embedding_model,
all_table_models,
all_grammars,
grammar_groups,
grammars,
)?;
}
},
// Only compiled with the `api` feature: prints the OpenAPI document to stdout.
#[cfg(feature = "api")]
Commands::Api { command } => match command {
ApiCommands::Schema => {
println!("{}", kreuzberg::api::openapi::openapi_json());
}
},
// Only compiled with the `embeddings` feature. With no text arguments, the whole
// of stdin is read and used as the single input.
#[cfg(feature = "embeddings")]
Commands::Embed {
text,
preset,
provider,
model,
api_key,
plugin,
format,
} => {
let texts = if text.is_empty() {
vec![commands::read_stdin()?]
} else {
text
};
embed_command(texts, &preset, &provider, model, api_key, plugin, format)?;
}
// Text chunking. CLI flags are applied on top of the `chunking` section of the
// loaded config (or its default when that section is absent).
Commands::Chunk {
text,
config: config_path,
chunk_size,
chunk_overlap,
chunker_type,
chunking_tokenizer,
topic_threshold,
format,
} => {
// Fall back to stdin when --text is not given.
let input = match text {
Some(t) => t,
None => commands::read_stdin().context("No --text provided and failed to read from stdin")?,
};
// Only the CLI-supplied values are validated here; size/overlap values coming
// from the config file are not passed through `validate_chunk_params`.
validate_chunk_params(chunk_size, chunk_overlap)?;
let base_config = load_config(config_path)?;
let mut chunking_config = base_config.chunking.unwrap_or_default();
if let Some(size) = chunk_size {
chunking_config.max_characters = size;
// If user set chunk_size but not overlap, clamp overlap to fit
if chunk_overlap.is_none() && chunking_config.overlap >= size {
chunking_config.overlap = size / 4;
}
}
if let Some(overlap) = chunk_overlap {
chunking_config.overlap = overlap;
}
// Any string other than "markdown", "yaml" or "semantic" selects the Text chunker.
match chunker_type.as_str() {
"markdown" => chunking_config.chunker_type = kreuzberg::ChunkerType::Markdown,
"yaml" => chunking_config.chunker_type = kreuzberg::ChunkerType::Yaml,
"semantic" => chunking_config.chunker_type = kreuzberg::ChunkerType::Semantic,
_ => chunking_config.chunker_type = kreuzberg::ChunkerType::Text,
}
// A tokenizer name switches sizing to tokenizer-based, with no explicit cache dir.
if let Some(ref tokenizer) = chunking_tokenizer {
chunking_config.sizing = kreuzberg::ChunkSizing::Tokenizer {
model: tokenizer.clone(),
cache_dir: None,
};
}
// Leave the configured threshold untouched unless one was given on the CLI.
if topic_threshold.is_some() {
chunking_config.topic_threshold = topic_threshold;
}
chunk_command(input, chunking_config, format)?;
}
// Writes the completion script for `shell` to stdout, using "kreuzberg" as the
// binary name.
Commands::Completions { shell } => {
let mut cmd = Cli::command();
clap_complete::generate(shell, &mut cmd, "kreuzberg", &mut std::io::stdout());
}
}
Ok(())
}

View File

@@ -0,0 +1,32 @@
//! JSON envelope types for CLI output.
//!
//! When `--format json` is used, extraction results are wrapped in these envelopes
//! so tooling (such as the benchmark harness) can read timing information without
//! parsing stderr or running a separate profiling tool.
use kreuzberg::ExtractionResult;
use serde::Serialize;
/// Single-file extraction result with wall-clock timing.
///
/// Emitted to stdout by `kreuzberg extract --format json`.
// The field names below become the JSON keys produced by the `Serialize` derive
// (no rename attributes are applied), so renaming a field changes the output that
// downstream tooling reads.
#[derive(Debug, Serialize)]
pub struct ExtractEnvelope {
/// The extraction result (content, metadata, tables, …).
pub result: ExtractionResult,
/// Wall-clock time for the extraction call in milliseconds.
pub extraction_time_ms: f64,
}
/// Batch extraction results with per-file and total timing.
///
/// Emitted to stdout by `kreuzberg batch --format json`.
// The field names below become the JSON keys produced by the `Serialize` derive
// (no rename attributes are applied), so renaming a field changes the output that
// downstream tooling reads.
//
// `results` and `per_file_ms` are two independent vectors: nothing in this type
// enforces that they have the same length, so code constructing the envelope is
// responsible for keeping them aligned.
#[derive(Debug, Serialize)]
pub struct BatchEnvelope {
/// One result per input file, in input order.
pub results: Vec<ExtractionResult>,
/// Total wall-clock time for the whole batch in milliseconds.
pub total_ms: f64,
/// Per-file wall-clock times in milliseconds, aligned with `results`.
pub per_file_ms: Vec<f64>,
}

View File

@@ -0,0 +1,104 @@
//! CLI color styling helpers using `anstyle`.
//!
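//! Provides styled output for the kreuzberg CLI. Respects the `NO_COLOR`
//! environment variable (<https://no-color.org/>). No terminal detection is
//! performed in this module: styling is applied whenever `NO_COLOR` permits it,
//! including when output is redirected to a file or pipe.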
use anstyle::{AnsiColor, Effects, Style};
use std::sync::OnceLock;
// Style palette used by the public helpers in this module (`header`, `success`,
// `dim`, `label`). Each entry is a compile-time constant, so it is built purely
// from `anstyle` constructors that are usable in `const` context.

/// Bold blue for section headers.
const HEADER: Style = Style::new()
.fg_color(Some(anstyle::Color::Ansi(AnsiColor::Blue)))
.effects(Effects::BOLD);
/// Green for success values (MIME types, file paths, versions).
const SUCCESS: Style = Style::new().fg_color(Some(anstyle::Color::Ansi(AnsiColor::Green)));
/// Dim for metadata, separators, secondary info.
const DIM: Style = Style::new().effects(Effects::DIMMED);
/// Bold for labels in key-value pairs.
const LABEL: Style = Style::new().effects(Effects::BOLD);
/// Check whether color output is enabled.
///
/// Returns `false` if the `NO_COLOR` environment variable is present and
/// non-empty (regardless of its value). An unset or empty `NO_COLOR` leaves
/// colors enabled, as the specification requires.
///
/// The environment is inspected once, on the first call; the answer is cached
/// for the rest of the process, so later changes to `NO_COLOR` are not observed.
///
/// See <https://no-color.org/> for the specification.
pub fn is_color_enabled() -> bool {
    static ENABLED: OnceLock<bool> = OnceLock::new();
    *ENABLED.get_or_init(|| {
        // Per no-color.org only a present, non-empty value opts out of color.
        let opted_out = std::env::var_os("NO_COLOR").is_some_and(|value| !value.is_empty());
        !opted_out
    })
}
/// Wraps `text` in the escape sequences for `style`, if colors are enabled.
///
/// When `is_color_enabled()` reports `false`, an unmodified copy of `text` is
/// returned instead; otherwise the result is the style's render sequence, the
/// text, and the style's reset sequence, in that order.
fn styled(text: &str, style: Style) -> String {
    if !is_color_enabled() {
        return text.to_string();
    }

    let open = style.render();
    let close = style.render_reset();
    format!("{}{}{}", open, text, close)
}
/// Style text as a section header (bold blue).
///
/// Delegates to `styled` with the `HEADER` style, so the result is an unmodified
/// copy of `text` whenever `is_color_enabled()` is false.
pub fn header(text: &str) -> String {
styled(text, HEADER)
}
/// Style text as a success value (green).
///
/// Delegates to `styled` with the `SUCCESS` style, so the result is an unmodified
/// copy of `text` whenever `is_color_enabled()` is false.
pub fn success(text: &str) -> String {
styled(text, SUCCESS)
}
/// Style text as dim/secondary (dimmed).
///
/// Delegates to `styled` with the `DIM` style, so the result is an unmodified
/// copy of `text` whenever `is_color_enabled()` is false.
pub fn dim(text: &str) -> String {
styled(text, DIM)
}
/// Style text as a label (bold).
///
/// Delegates to `styled` with the `LABEL` style, so the result is an unmodified
/// copy of `text` whenever `is_color_enabled()` is false.
pub fn label(text: &str) -> String {
styled(text, LABEL)
}
#[cfg(test)]
mod tests {
    use super::*;

    /// An attribute-free `Style` must render to nothing, leaving the text untouched.
    #[test]
    fn test_styled_returns_plain_text_when_no_color() {
        // `is_color_enabled` caches its answer in a `OnceLock`, so `NO_COLOR` cannot be
        // toggled per test. This test does not touch the environment; it checks the
        // formatting that `styled` performs, using an empty style.
        let plain = Style::new();
        let rendered = format!("{}{}{}", plain.render(), "hello", plain.render_reset());
        assert_eq!(rendered, "hello");
    }

    /// A style with a foreground color must emit ANSI escapes around the text.
    #[test]
    fn test_styled_applies_ansi_when_style_present() {
        let green = Style::new().fg_color(Some(anstyle::Color::Ansi(AnsiColor::Green)));
        let rendered = format!("{}{}{}", green.render(), "ok", green.render_reset());
        for needle in ["\x1b[", "ok"] {
            assert!(rendered.contains(needle));
        }
    }

    /// Smoke test: every public helper yields non-empty output for non-empty input.
    #[test]
    fn test_helper_functions_return_strings() {
        for rendered in [header("h"), success("s"), dim("d"), label("l")] {
            assert!(!rendered.is_empty());
        }
    }

    /// The cached color decision must agree with `NO_COLOR` as seen by this process.
    #[test]
    fn test_is_color_enabled_respects_no_color_env() {
        match std::env::var_os("NO_COLOR") {
            // Variable absent: colors are enabled.
            None => assert!(is_color_enabled()),
            // Variable present with a non-empty value: colors are disabled.
            Some(value) if !value.is_empty() => assert!(!is_color_enabled()),
            // Present but empty: no-color.org treats this like "unset", so the
            // outcome is intentionally not pinned here.
            Some(_) => {}
        }
    }
}