Nomad changes

2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions
--- a/crates/kreuzberg-cli/src/commands/cache.rs
+++ b/crates/kreuzberg-cli/src/commands/cache.rs
@@ -0,0 +1,466 @@
+//! Cache command - Manage cache operations
+//!
+//! This module provides commands for cache management including statistics,
+//! clearing, manifest generation, and model warming.
+
+use anyhow::{Context, Result};
+use kreuzberg::cache;
+use serde_json::json;
+use std::path::PathBuf;
+
+use crate::{WireFormat, style};
+
+/// Execute the cache stats command: print metadata for the cache directory.
+///
+/// `cache_dir` defaults to `.kreuzberg` under the current working directory.
+pub fn stats_command(cache_dir: Option<PathBuf>, format: WireFormat) -> Result<()> {
+    // Look up the cwd only when it is needed, so an explicit `cache_dir` never fails on it.
+    let cache_path = match cache_dir {
+        Some(dir) => dir,
+        None => std::env::current_dir()
+            .context("Failed to get current directory")?
+            .join(".kreuzberg"),
+    };
+    let cache_dir_str = cache_path.to_string_lossy();
+
+    let stats = cache::get_cache_metadata(&cache_dir_str).with_context(|| {
+        format!(
+            "Failed to get cache statistics from directory '{}'. Ensure the directory exists and is readable.",
+            cache_dir_str
+        )
+    })?;
+
+    // Shared by the JSON and TOON arms; built lazily so text output skips it.
+    let payload = || {
+        json!({
+            "directory": cache_dir_str,
+            "total_files": stats.total_files,
+            "total_size_mb": stats.total_size_mb,
+            "available_space_mb": stats.available_space_mb,
+            "oldest_file_age_days": stats.oldest_file_age_days,
+            "newest_file_age_days": stats.newest_file_age_days,
+        })
+    };
+
+    match format {
+        WireFormat::Text => {
+            println!("{}", style::header("Cache Statistics"));
+            println!("{}", style::dim("================"));
+            println!("{} {}", style::label("Directory:"), style::success(&cache_dir_str));
+            println!("{} {}", style::label("Total files:"), stats.total_files);
+            println!("{} {:.2} MB", style::label("Total size:"), stats.total_size_mb);
+            println!(
+                "{} {:.2} MB",
+                style::label("Available space:"),
+                stats.available_space_mb
+            );
+            println!(
+                "{} {:.2} days",
+                style::label("Oldest file age:"),
+                stats.oldest_file_age_days
+            );
+            println!(
+                "{} {:.2} days",
+                style::label("Newest file age:"),
+                stats.newest_file_age_days
+            );
+        }
+        WireFormat::Json => {
+            println!(
+                "{}",
+                serde_json::to_string_pretty(&payload()).context("Failed to serialize cache statistics to JSON")?
+            );
+        }
+        WireFormat::Toon => {
+            println!(
+                "{}",
+                serde_toon::to_string(&payload()).context("Failed to serialize cache statistics to TOON")?
+            );
+        }
+    }
+
+    Ok(())
+}
+
+/// Execute the cache clear command: clear the cache directory and report what was freed.
+pub fn clear_command(cache_dir: Option<PathBuf>, format: WireFormat) -> Result<()> {
+    // Look up the cwd only when it is needed, so an explicit `cache_dir` never fails on it.
+    let cache_path = match cache_dir {
+        Some(dir) => dir,
+        None => std::env::current_dir()
+            .context("Failed to get current directory")?
+            .join(".kreuzberg"),
+    };
+    let cache_dir_str = cache_path.to_string_lossy();
+
+    let (removed_files, freed_mb) = cache::clear_cache_directory(&cache_dir_str).with_context(|| {
+        format!(
+            "Failed to clear cache directory '{}'. Ensure you have write permissions.",
+            cache_dir_str
+        )
+    })?;
+
+    let payload = || {
+        json!({
+            "directory": cache_dir_str,
+            "removed_files": removed_files,
+            "freed_mb": freed_mb,
+        })
+    };
+
+    match format {
+        WireFormat::Text => {
+            println!("{}", style::success("Cache cleared successfully"));
+            println!("{} {}", style::label("Directory:"), style::success(&cache_dir_str));
+            println!("{} {}", style::label("Removed files:"), removed_files);
+            println!("{} {:.2} MB", style::label("Freed space:"), freed_mb);
+        }
+        WireFormat::Json => {
+            println!(
+                "{}",
+                serde_json::to_string_pretty(&payload()).context("Failed to serialize cache clear results to JSON")?
+            );
+        }
+        WireFormat::Toon => {
+            println!(
+                "{}",
+                serde_toon::to_string(&payload()).context("Failed to serialize cache clear results to TOON")?
+            );
+        }
+    }
+
+    Ok(())
+}
+
+/// Execute cache manifest command - outputs expected model files with checksums.
+pub fn manifest_command(format: WireFormat) -> Result<()> {
+    // Without at least one model-providing feature, every `extend` call
+    // in `manifest_command_inner` is `#[cfg]`-stripped and `entries: Vec<_>`
+    // has no anchor for type inference, so the `e.size_bytes` closure there
+    // would fail to compile (E0282). Bail with a clear error instead so that
+    // a build without those features (minimal configurations) still succeeds.
+    #[cfg(not(any(feature = "paddle-ocr", feature = "layout-detection")))]
+    {
+        let _ = format;
+        anyhow::bail!(
+            "manifest command unavailable: build kreuzberg-cli with at least one of \
+             --features \"paddle-ocr\" or --features \"layout-detection\""
+        );
+    }
+
+    #[cfg(any(feature = "paddle-ocr", feature = "layout-detection"))]
+    {
+        manifest_command_inner(format)
+    }
+}
+
+/// Print the model manifest (path, size, SHA-256) for every enabled model family.
+///
+/// Entries come from each feature-gated manager's `manifest()` and are rendered
+/// as a text table, JSON, or TOON according to `format`.
+#[cfg(any(feature = "paddle-ocr", feature = "layout-detection"))]
+fn manifest_command_inner(format: WireFormat) -> Result<()> {
+    let mut entries = Vec::new();
+
+    #[cfg(feature = "paddle-ocr")]
+    {
+        entries.extend(kreuzberg::paddle_ocr::ModelManager::manifest());
+    }
+
+    #[cfg(feature = "layout-detection")]
+    {
+        entries.extend(kreuzberg::layout::LayoutModelManager::manifest());
+    }
+
+    #[cfg(feature = "paddle-ocr")]
+    {
+        entries.extend(kreuzberg::ocr::TessdataManager::manifest());
+    }
+
+    let total_size_bytes: u64 = entries.iter().map(|e| e.size_bytes).sum();
+    let version = env!("CARGO_PKG_VERSION");
+
+    let payload = || {
+        json!({
+            "kreuzberg_version": version,
+            "total_size_bytes": total_size_bytes,
+            "model_count": entries.len(),
+            "models": entries,
+        })
+    };
+
+    match format {
+        WireFormat::Text => {
+            println!(
+                "{} {}",
+                style::header("Model Manifest"),
+                style::dim(&format!("(kreuzberg {})", version))
+            );
+            println!("{}", style::dim("===================================="));
+            println!(
+                "{:<50} {:>12} {}",
+                style::label("PATH"),
+                style::label("SIZE"),
+                style::label("SHA256")
+            );
+            println!("{}", style::dim(&format!("{:<50} {:>12} ------", "----", "----")));
+            for entry in &entries {
+                let size_str = if entry.size_bytes > 0 {
+                    format!("{:.1} MB", entry.size_bytes as f64 / 1_048_576.0)
+                } else {
+                    "unknown".to_string()
+                };
+                let sha_display = if entry.sha256.is_empty() {
+                    "-"
+                } else {
+                    // `get` cannot panic: a non-char-boundary cut shows the full value.
+                    entry.sha256.get(..12).unwrap_or(&entry.sha256)
+                };
+                println!(
+                    "{:<50} {:>12} {}",
+                    entry.relative_path,
+                    size_str,
+                    style::dim(sha_display)
+                );
+            }
+            println!();
+            println!(
+                "{} {} files, {:.1} MB",
+                style::label("Total:"),
+                entries.len(),
+                total_size_bytes as f64 / 1_048_576.0
+            );
+        }
+        WireFormat::Json => {
+            println!(
+                "{}",
+                serde_json::to_string_pretty(&payload()).context("Failed to serialize manifest to JSON")?
+            );
+        }
+        WireFormat::Toon => {
+            println!(
+                "{}",
+                serde_toon::to_string(&payload()).context("Failed to serialize manifest to TOON")?
+            );
+        }
+    }
+
+    Ok(())
+}
+
+/// Execute the cache warm command: eagerly download models and report what was fetched.
+#[allow(clippy::too_many_arguments)]
+pub fn warm_command(
+    cache_dir: Option<PathBuf>,
+    format: WireFormat,
+    all_embeddings: bool,
+    embedding_model: Option<String>,
+    all_table_models: bool,
+    all_grammars: bool,
+    grammar_groups: Option<Vec<String>>,
+    grammars: Option<Vec<String>>,
+) -> Result<()> {
+    let cache_base = resolve_cache_base(cache_dir);
+
+    let mut downloaded: Vec<String> = Vec::new();
+    let mut already_cached: Vec<String> = Vec::new();
+
+    #[cfg(feature = "paddle-ocr")]
+    {
+        let paddle_dir = cache_base.join("paddle-ocr");
+        let manager = kreuzberg::paddle_ocr::ModelManager::new(paddle_dir);
+
+        // ensure_all_models downloads v2 det (server+mobile), cls (PP-LCNet),
+        // doc_ori, v2 unified rec models, and all per-script rec families
+        manager
+            .ensure_all_models()
+            .context("Failed to download PaddleOCR v2 models")?;
+        downloaded.push("paddle-ocr v2 (server+mobile det, cls, doc_ori, unified+per-script rec)".to_string());
+    }
+
+    #[cfg(feature = "layout-detection")]
+    {
+        let layout_dir = cache_base.join("layout");
+        let manager = kreuzberg::layout::LayoutModelManager::new(Some(layout_dir));
+
+        if all_table_models {
+            // Download rtdetr + tatr + all SLANeXT variants (~730MB)
+            let was_cached = manager.is_rtdetr_cached() && manager.is_tatr_cached();
+            // Always ensure: only rtdetr/tatr can be probed, so a cached pair must not
+            // skip the SLANeXT download; `was_cached` now only picks the report bucket.
+            manager
+                .ensure_all_models()
+                .context("Failed to download layout models")?;
+            if was_cached {
+                already_cached.push("layout (rtdetr, tatr, slanet variants)".to_string());
+            } else {
+                downloaded.push("layout (rtdetr, tatr, slanet variants)".to_string());
+            }
+        } else {
+            // Default: download only rtdetr + tatr
+            let was_cached = manager.is_rtdetr_cached() && manager.is_tatr_cached();
+            if was_cached {
+                already_cached.push("layout (rtdetr, tatr)".to_string());
+            } else {
+                manager
+                    .ensure_default_models()
+                    .context("Failed to download layout models")?;
+                downloaded.push("layout (rtdetr, tatr)".to_string());
+            }
+        }
+    }
+
+    #[cfg(feature = "paddle-ocr")]
+    {
+        let tessdata_dir = cache_base.join("tessdata");
+        let manager = kreuzberg::ocr::TessdataManager::new(Some(tessdata_dir));
+
+        let newly_downloaded = manager
+            .ensure_all_languages()
+            .context("Failed to download tessdata files")?;
+
+        if newly_downloaded > 0 {
+            downloaded.push(format!("tessdata ({newly_downloaded} languages)"));
+        } else {
+            already_cached.push("tessdata (all languages)".to_string());
+        }
+    }
+
+    #[cfg(feature = "embeddings")]
+    {
+        let embeddings_dir = cache_base.join("embeddings");
+        let presets_to_warm: Vec<kreuzberg::EmbeddingPreset> = if all_embeddings {
+            kreuzberg::list_embedding_presets()
+                .into_iter()
+                .filter_map(|name| kreuzberg::get_embedding_preset(&name))
+                .collect()
+        } else if let Some(ref name) = embedding_model {
+            match kreuzberg::get_embedding_preset(name) {
+                Some(preset) => vec![preset],
+                None => {
+                    let available = kreuzberg::list_embedding_presets();
+                    anyhow::bail!(
+                        "Unknown embedding preset '{}'. Available: {}",
+                        name,
+                        available.join(", ")
+                    );
+                }
+            }
+        } else {
+            vec![]
+        };
+
+        for preset in &presets_to_warm {
+            let label = format!("embedding ({})", preset.name);
+            kreuzberg::embeddings::warm_model(
+                &kreuzberg::core::config::EmbeddingModelType::Preset {
+                    name: preset.name.clone(),
+                },
+                Some(embeddings_dir.clone()),
+            )
+            .map_err(|e| anyhow::anyhow!("Failed to download embedding model '{}': {}", preset.name, e))?;
+            downloaded.push(label);
+        }
+    }
+
+    #[cfg(not(feature = "embeddings"))]
+    {
+        if all_embeddings || embedding_model.is_some() {
+            anyhow::bail!("Embedding model warming requires the 'embeddings' feature to be enabled");
+        }
+    }
+
+    // Tree-sitter grammar downloads
+    #[cfg(feature = "tree-sitter")]
+    {
+        if all_grammars {
+            let count =
+                tree_sitter_language_pack::download_all().context("Failed to download all tree-sitter grammars")?;
+            if count > 0 {
+                downloaded.push(format!("tree-sitter grammars ({count} languages)"));
+            } else {
+                already_cached.push("tree-sitter grammars (all)".to_string());
+            }
+        } else if let Some(ref groups) = grammar_groups {
+            let config = tree_sitter_language_pack::PackConfig {
+                cache_dir: None,
+                languages: None,
+                groups: Some(groups.clone()),
+            };
+            tree_sitter_language_pack::init(&config).context("Failed to download tree-sitter grammar groups")?;
+            downloaded.push(format!("tree-sitter grammars (groups: {})", groups.join(", ")));
+        } else if let Some(ref langs) = grammars {
+            let refs: Vec<&str> = langs.iter().map(String::as_str).collect();
+            let count =
+                tree_sitter_language_pack::download(&refs).context("Failed to download tree-sitter grammars")?;
+            if count > 0 {
+                downloaded.push(format!("tree-sitter grammars ({count} languages)"));
+            } else {
+                already_cached.push(format!("tree-sitter grammars ({})", langs.join(", ")));
+            }
+        }
+    }
+
+    #[cfg(not(feature = "tree-sitter"))]
+    {
+        if all_grammars || grammar_groups.is_some() || grammars.is_some() {
+            anyhow::bail!("Tree-sitter grammar warming requires the 'tree-sitter' feature to be enabled");
+        }
+    }
+
+    let payload = || {
+        json!({
+            "cache_dir": cache_base.to_string_lossy(),
+            "downloaded": downloaded,
+            "already_cached": already_cached,
+        })
+    };
+
+    match format {
+        WireFormat::Text => {
+            if !downloaded.is_empty() {
+                println!("{}", style::label("Downloaded:"));
+                for d in &downloaded {
+                    println!("  {}", style::success(d));
+                }
+            }
+            if !already_cached.is_empty() {
+                println!("{}", style::label("Already cached:"));
+                for c in &already_cached {
+                    println!("  {}", style::dim(c));
+                }
+            }
+            println!(
+                "All models ready in {}",
+                style::success(&cache_base.display().to_string())
+            );
+        }
+        WireFormat::Json => {
+            println!(
+                "{}",
+                serde_json::to_string_pretty(&payload()).context("Failed to serialize warm results to JSON")?
+            );
+        }
+        WireFormat::Toon => {
+            println!(
+                "{}",
+                serde_toon::to_string(&payload()).context("Failed to serialize warm results to TOON")?
+            );
+        }
+    }
+
+    Ok(())
+}
+
+/// Resolve the cache base directory.
+///
+/// Precedence: explicit `cache_dir`, then `KREUZBERG_CACHE_DIR`, then `./.kreuzberg`.
+fn resolve_cache_base(cache_dir: Option<PathBuf>) -> PathBuf {
+    cache_dir
+        .or_else(|| std::env::var("KREUZBERG_CACHE_DIR").ok().map(PathBuf::from))
+        .unwrap_or_else(|| {
+            std::env::current_dir()
+                .unwrap_or_else(|_| PathBuf::from("."))
+                .join(".kreuzberg")
+        })
+}
--- a/crates/kreuzberg-cli/src/commands/chunk.rs
+++ b/crates/kreuzberg-cli/src/commands/chunk.rs
@@ -0,0 +1,61 @@
+//! Chunk command implementation.
+
+use anyhow::{Context, Result};
+
+use crate::{WireFormat, style};
+
+/// Execute the chunk command: split text into chunks.
+///
+/// Splits `text` according to `config` and prints the result in `format`.
+/// JSON and TOON output carry the chunk contents, the chunk count, a summary
+/// of the chunking configuration, and the input size in bytes. Text output
+/// prints each chunk, with a `--- chunk N ---` separator when there are several.
+/// Fails if `text` is empty, chunking fails, or serialization fails.
+pub fn chunk_command(text: String, config: kreuzberg::ChunkingConfig, format: WireFormat) -> Result<()> {
+    if text.is_empty() {
+        anyhow::bail!("No text provided for chunking. Provide --text or pipe text via stdin.");
+    }
+
+    let result = kreuzberg::chunking::chunk_text(&text, &config, None).context("Failed to chunk text")?;
+
+    // Shared by the JSON and TOON arms; built lazily so text output skips it.
+    let payload = || {
+        let chunks: Vec<&str> = result.chunks.iter().map(|c| c.content.as_str()).collect();
+        serde_json::json!({
+            "chunks": chunks,
+            "chunk_count": result.chunk_count,
+            "config": {
+                "max_characters": config.max_characters,
+                "overlap": config.overlap,
+                "chunker_type": format!("{:?}", config.chunker_type),
+            },
+            "input_size_bytes": text.len(),
+        })
+    };
+
+    match format {
+        WireFormat::Json => {
+            println!(
+                "{}",
+                serde_json::to_string_pretty(&payload()).context("Failed to serialize chunks to JSON")?
+            );
+        }
+        WireFormat::Toon => {
+            println!(
+                "{}",
+                serde_toon::to_string(&payload()).context("Failed to serialize chunks to TOON")?
+            );
+        }
+        WireFormat::Text => {
+            let numbered = result.chunks.len() > 1;
+            for (index, chunk) in result.chunks.iter().enumerate() {
+                if numbered {
+                    println!("{}", style::dim(&format!("--- chunk {} ---", index + 1)));
+                }
+                println!("{}", chunk.content);
+            }
+        }
+    }
+
+    Ok(())
+}
--- a/crates/kreuzberg-cli/src/commands/config.rs
+++ b/crates/kreuzberg-cli/src/commands/config.rs
@@ -0,0 +1,51 @@
+//! Config command - Configuration loading and discovery
+//!
+//! This module provides utilities for loading extraction configuration from files
+//! or discovering them automatically in the project directory.
+
+use anyhow::{Context, Result};
+use kreuzberg::ExtractionConfig;
+use std::path::PathBuf;
+
+/// Load the extraction configuration for the CLI.
+///
+/// Resolution order:
+/// 1. The explicit file passed via `--config`, when one is given
+/// 2. An auto-discovered `kreuzberg.{toml,yaml,json}` in the current or a parent directory
+/// 3. The default configuration, when discovery finds nothing
+///
+/// # Configuration File Formats
+///
+/// The parser for an explicit file is picked from its extension, compared case-insensitively:
+/// - `.toml`: TOML
+/// - `.yaml` / `.yml`: YAML
+/// - `.json`: JSON
+///
+/// # Errors
+///
+/// Returns an error if:
+/// - The explicit config file has none of the supported extensions
+/// - The explicit config file cannot be loaded by the matching parser
+/// - Auto-discovery itself reports an error
+pub fn load_config(config_path: Option<PathBuf>) -> Result<ExtractionConfig> {
+    // No explicit path: try discovery, then fall back to the defaults.
+    let Some(path) = config_path else {
+        return ExtractionConfig::discover()
+            .map(|found| found.unwrap_or_else(ExtractionConfig::default))
+            .context("Failed to auto-discover configuration file. Searched for kreuzberg.{toml,yaml,json} in current and parent directories. Use --config to specify an explicit path.");
+    };
+
+    // Pick the parser from the lower-cased path suffix.
+    let path_lower = path.to_string_lossy().to_lowercase();
+    let has_suffix = |suffix: &str| path_lower.ends_with(suffix);
+    let loaded = if has_suffix(".toml") {
+        ExtractionConfig::from_toml_file(&path)
+    } else if has_suffix(".yaml") || has_suffix(".yml") {
+        ExtractionConfig::from_yaml_file(&path)
+    } else if has_suffix(".json") {
+        ExtractionConfig::from_json_file(&path)
+    } else {
+        anyhow::bail!("Config file must have .toml, .yaml, .yml, or .json extension (case-insensitive)");
+    };
+    loaded.with_context(|| format!("Failed to load configuration from '{}'. Ensure the file exists, is readable, and contains valid configuration.", path.display()))
+}
--- a/crates/kreuzberg-cli/src/commands/embed.rs
+++ b/crates/kreuzberg-cli/src/commands/embed.rs
@@ -0,0 +1,161 @@
+//! Embed command implementation.
+
+use anyhow::{Context, Result};
+
+use crate::{WireFormat, style};
+
+/// Execute the embed command: generate embeddings for input texts.
+///
+/// When `provider` is `"local"` (default), uses the ONNX preset model.
+/// When `provider` is `"llm"`, uses liter-llm with the specified model and API key.
+/// When `provider` is `"plugin"`, dispatches to a pre-registered in-process embedding backend.
+pub fn embed_command(
+    texts: Vec<String>,
+    preset: &str,
+    provider: &str,
+    llm_model: Option<String>,
+    llm_api_key: Option<String>,
+    plugin_name: Option<String>,
+    format: WireFormat,
+) -> Result<()> {
+    if texts.is_empty() {
+        anyhow::bail!("No texts provided for embedding. Provide --text or pipe text via stdin.");
+    }
+
+    // Validate no empty texts
+    for (i, t) in texts.iter().enumerate() {
+        if t.is_empty() {
+            anyhow::bail!("Text at position {} is empty. All texts must be non-empty.", i + 1);
+        }
+    }
+
+    let (config, model_label) = match provider {
+        "llm" => {
+            let model = llm_model.as_deref().ok_or_else(|| {
+                anyhow::anyhow!(
+                    "--model is required when --provider is 'llm' (e.g., --model openai/text-embedding-3-small)"
+                )
+            })?;
+
+            let llm_config = kreuzberg::LlmConfig {
+                model: model.to_string(),
+                api_key: llm_api_key,
+                base_url: None,
+                timeout_secs: None,
+                max_retries: None,
+                temperature: None,
+                max_tokens: None,
+            };
+
+            let config = kreuzberg::EmbeddingConfig {
+                model: kreuzberg::EmbeddingModelType::Llm { llm: llm_config },
+                show_download_progress: true,
+                ..Default::default()
+            };
+
+            (config, model.to_string())
+        }
+        "local" | "" => {
+            // Validate preset for local provider
+            let _preset_info = kreuzberg::get_embedding_preset(preset).with_context(|| {
+                format!(
+                    "Unknown embedding preset '{}'. Available: {:?}",
+                    preset,
+                    kreuzberg::list_embedding_presets()
+                )
+            })?;
+
+            let config = kreuzberg::EmbeddingConfig {
+                model: kreuzberg::EmbeddingModelType::Preset {
+                    name: preset.to_string(),
+                },
+                show_download_progress: true,
+                ..Default::default()
+            };
+
+            (config, preset.to_string())
+        }
+        "plugin" => {
+            let name = plugin_name.as_deref().ok_or_else(|| {
+                anyhow::anyhow!(
+                    "--plugin NAME is required when --provider is 'plugin'. Register a backend via kreuzberg::plugins::register_embedding_backend first."
+                )
+            })?;
+            if name.is_empty() {
+                anyhow::bail!("--plugin NAME must not be empty.");
+            }
+
+            // Pre-flight: surface unknown backends with a list of registered names
+            // (parity with the REST handler, which returns 422 for the same case).
+            let available =
+                kreuzberg::plugins::list_embedding_backends().context("Failed to read embedding backend registry")?;
+            if !available.iter().any(|n| n == name) {
+                anyhow::bail!(
+                    "Embedding backend '{}' is not registered. Available backends: {}",
+                    name,
+                    if available.is_empty() {
+                        "(none registered)".to_string()
+                    } else {
+                        available.join(", ")
+                    }
+                );
+            }
+
+            let config = kreuzberg::EmbeddingConfig {
+                model: kreuzberg::EmbeddingModelType::Plugin { name: name.to_string() },
+                ..Default::default()
+            };
+
+            (config, name.to_string())
+        }
+        other => {
+            anyhow::bail!(
+                "Unknown embedding provider '{}'. Valid providers: 'local' (default, ONNX), 'llm' (liter-llm), or 'plugin' (in-process backend).",
+                other
+            );
+        }
+    };
+
+    // Generate embeddings
+    // Record the count first so `texts` can be moved instead of cloned.
+    let text_count = texts.len();
+    let embeddings = kreuzberg::embed_texts(texts, &config).context("Failed to generate embeddings")?;
+
+    let dimensions = embeddings.first().map(|e| e.len()).unwrap_or(0);
+
+    // Shared by the JSON and TOON arms; built lazily so text output skips it.
+    let payload = || {
+        serde_json::json!({
+            "embeddings": embeddings,
+            "model": model_label,
+            "dimensions": dimensions,
+            "count": embeddings.len(),
+        })
+    };
+
+    match format {
+        WireFormat::Json => {
+            println!(
+                "{}",
+                serde_json::to_string_pretty(&payload()).context("Failed to serialize embeddings to JSON")?
+            );
+        }
+        WireFormat::Toon => {
+            println!(
+                "{}",
+                serde_toon::to_string(&payload()).context("Failed to serialize embeddings to TOON")?
+            );
+        }
+        WireFormat::Text => {
+            for (i, embedding) in embeddings.iter().enumerate() {
+                if text_count > 1 {
+                    println!("{}", style::dim(&format!("# text {}", i + 1)));
+                }
+                let values: Vec<String> = embedding.iter().map(|v| format!("{v}")).collect();
+                println!("{}", values.join(","));
+            }
+        }
+    }
+
+    Ok(())
+}
--- a/crates/kreuzberg-cli/src/commands/extract.rs
+++ b/crates/kreuzberg-cli/src/commands/extract.rs
@@ -0,0 +1,180 @@
+//! Extract command - Extract text and data from documents
+//!
+//! This module provides the extract and batch extract commands for processing single
+//! or multiple documents with customizable extraction configurations.
+
+use anyhow::{Context, Result};
+use kreuzberg::{
+    BatchFileItem, ExtractionConfig, ExtractionResult, FileExtractionConfig, batch_extract_files_sync,
+    extract_file_sync,
+};
+use std::path::PathBuf;
+use std::time::Instant;
+
+use crate::{
+    WireFormat,
+    output::{BatchEnvelope, ExtractEnvelope},
+    style,
+};
+
+/// Execute the single-document extraction command.
+///
+/// Extracts `path` using `config` (passing `mime_type` through when given),
+/// measures the wall-clock duration of the extraction call, and prints the
+/// result according to `format`:
+/// - `Text`: the extracted content, without an added trailing newline.
+/// - `Json`: an [`ExtractEnvelope`] holding the result and `extraction_time_ms`.
+/// - `Toon`: the extraction result on its own (no timing envelope).
+///
+/// # Errors
+///
+/// Returns an error when extraction fails or the result cannot be serialized.
+pub fn extract_command(
+    path: PathBuf,
+    config: ExtractionConfig,
+    mime_type: Option<String>,
+    format: WireFormat,
+) -> Result<()> {
+    let source = path.to_string_lossy().into_owned();
+
+    let started = Instant::now();
+    let result = extract_file_sync(&source, mime_type.as_deref(), &config).with_context(|| {
+        format!(
+            "Failed to extract file '{}'. Ensure the file is readable and the format is supported.",
+            path.display()
+        )
+    })?;
+    let extraction_time_ms = started.elapsed().as_secs_f64() * 1000.0;
+
+    match format {
+        WireFormat::Text => print!("{}", result.content),
+        WireFormat::Json => {
+            let envelope = ExtractEnvelope {
+                result,
+                extraction_time_ms,
+            };
+            // Serialize before printing so a failure emits nothing on stdout.
+            let rendered =
+                serde_json::to_string_pretty(&envelope).context("Failed to serialize extraction result to JSON")?;
+            println!("{rendered}");
+        }
+        WireFormat::Toon => {
+            let rendered = serde_toon::to_string(&result).context("Failed to serialize extraction result to TOON")?;
+            println!("{rendered}");
+        }
+    }
+
+    Ok(())
+}
+
+/// Execute batch extraction command with optional per-file configuration overrides.
+///
+/// `file_configs_map` maps a path string (the lossy string form of an entry in
+/// `paths`) to a JSON value that is deserialized into a [`FileExtractionConfig`].
+///
+/// Output depends on `format`:
+/// - `Json`: files are extracted one at a time so that a per-file wall-clock
+///   timing can be recorded; the results are wrapped in a [`BatchEnvelope`].
+/// - `Text` / `Toon`: all files go through [`run_batch_sync`] in one batch call.
+///
+/// # Errors
+///
+/// Returns an error when a per-file override cannot be parsed, when extraction
+/// fails, when the single-item batch call yields no result, or when the results
+/// cannot be serialized.
+pub fn batch_command(
+    paths: Vec<PathBuf>,
+    file_configs_map: Option<std::collections::HashMap<String, serde_json::Value>>,
+    config: ExtractionConfig,
+    format: WireFormat,
+) -> Result<()> {
+    match format {
+        WireFormat::Json => {
+            // Run files one at a time to capture per-file wall-clock timings.
+            // Per-file config overrides are honoured: files without an override use the
+            // batch-level config directly; files with an override use a one-shot batch of
+            // one item so the library's own merge logic applies.
+            let mut results: Vec<ExtractionResult> = Vec::with_capacity(paths.len());
+            let mut per_file_ms: Vec<f64> = Vec::with_capacity(paths.len());
+            let total_t0 = Instant::now();
+
+            for path in &paths {
+                let path_str = path.to_string_lossy().to_string();
+                // Single lookup: the presence of an override decides which API is used.
+                let override_value = file_configs_map.as_ref().and_then(|m| m.get(&path_str));
+
+                let t0 = Instant::now();
+                let result = match override_value {
+                    Some(value) => {
+                        let file_config = serde_json::from_value::<FileExtractionConfig>(value.clone())
+                            .with_context(|| format!("Failed to parse file config for '{}'", path_str))?;
+                        batch_extract_files_sync(
+                            vec![BatchFileItem {
+                                path: path.clone(),
+                                config: Some(file_config),
+                            }],
+                            &config,
+                        )
+                        .with_context(|| extraction_failure_message(path))?
+                        .into_iter()
+                        .next()
+                        // An empty result vector is reported as an error instead of panicking.
+                        .with_context(|| format!("Batch extraction returned no result for '{}'.", path.display()))?
+                    }
+                    None => extract_file_sync(&path_str, None, &config)
+                        .with_context(|| extraction_failure_message(path))?,
+                };
+                per_file_ms.push(t0.elapsed().as_secs_f64() * 1000.0);
+                results.push(result);
+            }
+
+            let total_ms = total_t0.elapsed().as_secs_f64() * 1000.0;
+            let envelope = BatchEnvelope {
+                results,
+                total_ms,
+                per_file_ms,
+            };
+            println!(
+                "{}",
+                serde_json::to_string_pretty(&envelope)
+                    .context("Failed to serialize batch extraction results to JSON")?
+            );
+        }
+        WireFormat::Text => {
+            let results = run_batch_sync(&paths, file_configs_map.as_ref(), &config)?;
+            for (i, result) in results.iter().enumerate() {
+                println!("{}", style::header(&format!("=== Document {} ===", i + 1)));
+                println!("{} {}", style::label("MIME Type:"), style::success(&result.mime_type));
+                println!("{}\n{}", style::label("Content:"), result.content);
+                println!();
+            }
+        }
+        WireFormat::Toon => {
+            let results = run_batch_sync(&paths, file_configs_map.as_ref(), &config)?;
+            println!(
+                "{}",
+                serde_toon::to_string(&results).context("Failed to serialize batch extraction results to TOON")?
+            );
+        }
+    }
+
+    Ok(())
+}
+
+/// Build the shared "failed to extract" error context for a single file.
+fn extraction_failure_message(path: &std::path::Path) -> String {
+    format!(
+        "Failed to extract file '{}'. Ensure the file is readable and the format is supported.",
+        path.display()
+    )
+}
+
+/// Extract every path in one call to the synchronous batch API.
+///
+/// Used by the non-JSON output paths. For each path, an entry in
+/// `file_configs_map` keyed by the path's lossy string form is deserialized into
+/// a [`FileExtractionConfig`] and attached to that item; paths without an entry
+/// carry no per-file config.
+///
+/// # Errors
+///
+/// Returns an error on the first override that fails to parse, or when the
+/// batch extraction call itself fails.
+fn run_batch_sync(
+    paths: &[PathBuf],
+    file_configs_map: Option<&std::collections::HashMap<String, serde_json::Value>>,
+    config: &ExtractionConfig,
+) -> Result<Vec<ExtractionResult>> {
+    let mut items: Vec<BatchFileItem> = Vec::with_capacity(paths.len());
+
+    for path in paths {
+        let key = path.to_string_lossy().to_string();
+        let file_config = match file_configs_map.and_then(|overrides| overrides.get(&key)) {
+            Some(raw) => {
+                let parsed = serde_json::from_value::<FileExtractionConfig>(raw.clone())
+                    .with_context(|| format!("Failed to parse file config for '{}'", key))?;
+                Some(parsed)
+            }
+            None => None,
+        };
+        items.push(BatchFileItem {
+            path: path.clone(),
+            config: file_config,
+        });
+    }
+
+    batch_extract_files_sync(items, config)
+        .context("Failed to batch extract documents. Check that all files are readable and formats are supported.")
+}
--- a/crates/kreuzberg-cli/src/commands/extract_structured.rs
+++ b/crates/kreuzberg-cli/src/commands/extract_structured.rs
@@ -0,0 +1,116 @@
+//! Extract structured command - Extract structured data from documents using an LLM.
+//!
+//! Reads a JSON schema file, configures LLM-based structured extraction, and
+//! outputs the structured result parsed from the document.
+
+use anyhow::{Context, Result};
+use kreuzberg::{LlmConfig, StructuredExtractionConfig, extract_file_sync};
+use std::path::PathBuf;
+
+use crate::WireFormat;
+
+/// Arguments for the extract-structured command.
+///
+/// Consumed by value by [`extract_structured_command`].
+pub struct ExtractStructuredArgs {
+    /// Document to extract structured data from.
+    pub path: PathBuf,
+    /// File containing the JSON schema; it is read and parsed as JSON.
+    pub schema_path: PathBuf,
+    /// Model identifier passed through to the LLM configuration.
+    pub model: String,
+    /// Optional API key passed through to the LLM configuration.
+    pub api_key: Option<String>,
+    /// Optional prompt passed through to the structured-extraction configuration.
+    pub prompt: Option<String>,
+    /// Schema name; `"extraction"` is used when this is `None`.
+    pub schema_name: Option<String>,
+    /// Passed through as the `strict` flag of the structured-extraction configuration.
+    pub strict: bool,
+    /// Optional configuration file handed to `load_config` for the base extraction config.
+    pub config_path: Option<PathBuf>,
+    /// Output format for the structured result.
+    pub format: WireFormat,
+}
+
+/// Execute the extract-structured command.
+///
+/// Steps, in order:
+/// 1. Read `schema_path` and parse it as JSON.
+/// 2. Load the base `ExtractionConfig` and attach a `structured_extraction`
+///    section built from the schema and the LLM arguments.
+/// 3. Extract the document with `extract_file_sync`.
+/// 4. Print the result's `structured_output` in the requested format. `Text`
+///    output is pretty-printed JSON, the same rendering as `Json`.
+///
+/// # Errors
+///
+/// Returns an error when the schema cannot be read or parsed, the config cannot
+/// be loaded, extraction fails, no structured output was produced, or the
+/// output cannot be serialized.
+pub fn extract_structured_command(args: ExtractStructuredArgs) -> Result<()> {
+    // Schema first: a bad schema path fails before any config loading or extraction.
+    let raw_schema = std::fs::read_to_string(&args.schema_path).with_context(|| {
+        format!(
+            "Failed to read JSON schema file '{}'. Ensure the file exists and is readable.",
+            args.schema_path.display()
+        )
+    })?;
+    let schema: serde_json::Value = serde_json::from_str(&raw_schema).with_context(|| {
+        format!(
+            "Failed to parse JSON schema from '{}'. Ensure the file contains valid JSON.",
+            args.schema_path.display()
+        )
+    })?;
+
+    let mut config = super::load_config(args.config_path)?;
+    config.structured_extraction = Some(StructuredExtractionConfig {
+        schema,
+        schema_name: args.schema_name.unwrap_or_else(|| "extraction".to_string()),
+        schema_description: None,
+        strict: args.strict,
+        prompt: args.prompt,
+        llm: LlmConfig {
+            model: args.model,
+            api_key: args.api_key,
+            base_url: None,
+            timeout_secs: None,
+            max_retries: None,
+            temperature: None,
+            max_tokens: None,
+        },
+    });
+
+    let document = args.path.to_string_lossy().to_string();
+    let extraction = extract_file_sync(&document, None, &config).with_context(|| {
+        format!(
+            "Failed to extract structured data from '{}'. Ensure the file is readable and the LLM configuration is correct.",
+            args.path.display()
+        )
+    })?;
+
+    // A successful extraction without structured output is still an error for this command.
+    let structured = extraction.structured_output.context(
+        "Structured extraction completed but returned no structured output. \
+         This may indicate the LLM failed to produce valid structured data matching the schema.",
+    )?;
+
+    match args.format {
+        WireFormat::Json => {
+            let rendered =
+                serde_json::to_string_pretty(&structured).context("Failed to serialize structured output to JSON")?;
+            println!("{rendered}");
+        }
+        WireFormat::Toon => {
+            let rendered =
+                serde_toon::to_string(&structured).context("Failed to serialize structured output to TOON")?;
+            println!("{rendered}");
+        }
+        WireFormat::Text => {
+            // Text mode has no dedicated rendering; it pretty-prints the JSON value.
+            let rendered =
+                serde_json::to_string_pretty(&structured).context("Failed to serialize structured output to text")?;
+            println!("{rendered}");
+        }
+    }
+
+    Ok(())
+}
--- a/crates/kreuzberg-cli/src/commands/mod.rs
+++ b/crates/kreuzberg-cli/src/commands/mod.rs
@@ -0,0 +1,48 @@
+//! Command modules for Kreuzberg CLI
+//!
+//! This module organizes the CLI commands into focused submodules:
+//! - `extract` - Document extraction commands
+//! - `cache` - Cache management operations
+//! - `server` - API and MCP server commands
+//! - `config` - Configuration loading and discovery
+//! - `embed` - Embedding generation commands
+//! - `chunk` - Text chunking commands
+
+use anyhow::{Context, Result};
+use std::io::Read;
+
+pub mod cache;
+pub mod chunk;
+pub mod config;
+#[cfg(feature = "embeddings")]
+pub mod embed;
+pub mod extract;
+pub mod extract_structured;
+pub mod overrides;
+#[cfg(any(feature = "api", feature = "mcp"))]
+pub mod server;
+
+// Re-export command functions for convenience
+pub use cache::{clear_command, manifest_command, stats_command, warm_command};
+pub use chunk::chunk_command;
+pub use config::load_config;
+#[cfg(feature = "embeddings")]
+pub use embed::embed_command;
+pub use extract::{batch_command, extract_command};
+#[cfg(feature = "mcp")]
+pub use server::mcp_command;
+#[cfg(feature = "api")]
+pub use server::serve_command;
+
+/// Read all of stdin as text and return it with surrounding whitespace removed.
+///
+/// # Errors
+///
+/// Returns an error when stdin cannot be read, or when the input is empty
+/// after trimming.
+pub fn read_stdin() -> Result<String> {
+    let mut buffer = String::new();
+    std::io::stdin()
+        .read_to_string(&mut buffer)
+        .context("Failed to read from stdin")?;
+
+    match buffer.trim() {
+        "" => anyhow::bail!("No input received from stdin. Provide text via --text or pipe it to stdin."),
+        text => Ok(text.to_string()),
+    }
+}
--- a/crates/kreuzberg-cli/src/commands/overrides.rs
+++ b/crates/kreuzberg-cli/src/commands/overrides.rs
--- a/crates/kreuzberg-cli/src/commands/server.rs
+++ b/crates/kreuzberg-cli/src/commands/server.rs
@@ -0,0 +1,104 @@
+//! Server command - Start API and MCP servers
+//!
+//! This module provides commands for starting the Kreuzberg API server
+//! and the MCP (Model Context Protocol) server.
+
+use anyhow::Result;
+
+/// Execute the API server command.
+///
+/// The server configuration is resolved in increasing order of precedence:
+/// the file at `config_path` (or defaults when none is given), then
+/// environment-variable overrides, then the `cli_host` / `cli_port` arguments.
+/// The server is then run on a newly created Tokio runtime, blocking the
+/// calling thread.
+///
+/// # Errors
+///
+/// Returns an error when the config file cannot be loaded, applying the
+/// environment overrides fails, the runtime cannot be created, or the server
+/// future resolves to an error.
+#[cfg(feature = "api")]
+pub fn serve_command(
+    cli_host: Option<String>,
+    cli_port: Option<u16>,
+    extraction_config: kreuzberg::ExtractionConfig,
+    config_path: Option<std::path::PathBuf>,
+) -> Result<()> {
+    use anyhow::Context;
+    use kreuzberg::ServerConfig;
+
+    // Lowest precedence: the config file when given, otherwise built-in defaults.
+    let mut server_config = match &config_path {
+        Some(path) => ServerConfig::from_file(path).with_context(|| {
+            format!(
+                "Failed to load server configuration from '{}'. \
+                 Ensure the file contains valid server settings under [server] section or at root level.",
+                path.display()
+            )
+        })?,
+        None => ServerConfig::default(),
+    };
+
+    // Environment variables take precedence over the config file.
+    server_config.apply_env_overrides()?;
+
+    // Explicit CLI arguments take precedence over everything else.
+    if let Some(host_override) = cli_host {
+        server_config.host = host_override;
+    }
+    if let Some(port_override) = cli_port {
+        server_config.port = port_override;
+    }
+
+    tracing::info!(
+        "Starting Kreuzberg API server on http://{}",
+        server_config.listen_addr()
+    );
+
+    let runtime = tokio::runtime::Runtime::new()?;
+    // The config is cloned because the error context below still reads it.
+    let server = kreuzberg::api::serve_with_server_config(extraction_config, server_config.clone());
+    runtime.block_on(server).with_context(|| {
+        format!(
+            "Failed to start API server on {}. Ensure the port is not already in use and you have permission to bind to this address.",
+            server_config.listen_addr()
+        )
+    })?;
+
+    Ok(())
+}
+
+/// Execute MCP server command
+///
+/// Creates a new Tokio runtime and blocks on the server selected by
+/// `transport`, which is compared case-insensitively:
+/// - `"stdio"` starts the MCP server via `start_mcp_server_with_config`.
+/// - `"http"` starts the HTTP MCP server on `host:port` when the `mcp-http`
+///   feature is enabled; without that feature it returns an error telling the
+///   user to rebuild with the feature.
+/// - Any other value is rejected with an error.
+///
+/// The host and port parameters are only read when `mcp-http` is enabled;
+/// otherwise they are accepted under underscore-prefixed names and ignored.
+///
+/// # Errors
+///
+/// Returns an error when the runtime cannot be created, the transport is
+/// unknown or unavailable, or the server future resolves to an error.
+#[cfg(feature = "mcp")]
+pub fn mcp_command(
+    config: kreuzberg::ExtractionConfig,
+    transport: String,
+    #[cfg(feature = "mcp-http")] host: String,
+    #[cfg(feature = "mcp-http")] port: u16,
+    #[cfg(not(feature = "mcp-http"))] _host: String,
+    #[cfg(not(feature = "mcp-http"))] _port: u16,
+) -> Result<()> {
+    tracing::debug!("Starting Kreuzberg MCP server with transport: {}", transport);
+    let rt = tokio::runtime::Runtime::new()?;
+
+    // Lowercase once so "STDIO" / "Http" are accepted as well.
+    match transport.to_lowercase().as_str() {
+        "stdio" => {
+            rt.block_on(kreuzberg::mcp::start_mcp_server_with_config(config))
+                .map_err(|e| anyhow::anyhow!("Failed to start MCP server: {}", e))?;
+        }
+        "http" => {
+            // Exactly one of the two cfg-gated blocks below is compiled in.
+            #[cfg(not(feature = "mcp-http"))]
+            {
+                anyhow::bail!(
+                    "HTTP transport requires 'mcp-http' feature. \
+                     Rebuild with: cargo build --features mcp-http"
+                );
+            }
+
+            #[cfg(feature = "mcp-http")]
+            {
+                tracing::debug!("Starting MCP server on http://{}:{}", host, port);
+                rt.block_on(kreuzberg::mcp::start_mcp_server_http_with_config(&host, port, config))
+                    .map_err(|e| anyhow::anyhow!("Failed to start MCP server on {}:{}: {}", host, port, e))?;
+            }
+        }
+        other => {
+            anyhow::bail!("Unknown transport '{}'. Use 'stdio' or 'http'", other);
+        }
+    }
+
+    Ok(())
+}
--- a/crates/kreuzberg-cli/src/commands/tree_sitter.rs
+++ b/crates/kreuzberg-cli/src/commands/tree_sitter.rs
@@ -0,0 +1,230 @@
+//! Tree-sitter grammar management commands.
+//!
+//! This module provides commands for downloading, listing, and managing
+//! tree-sitter grammar parsers via the tree-sitter-language-pack crate.
+
+use anyhow::{Context, Result};
+use serde_json::json;
+use std::path::PathBuf;
+
+use crate::{WireFormat, style};
+
+/// Execute the tree-sitter download command.
+///
+/// Downloads tree-sitter grammar parsers based on the provided arguments,
+/// checked in this order (the first match wins):
+/// 1. `all` — every available language.
+/// 2. `groups` — the named language groups.
+/// 3. `languages` — specific languages by name.
+///
+/// When `cache_dir` is given it is configured before any download starts.
+///
+/// The group download path does not report how many grammars were fetched, so
+/// the count is tracked as `Option<usize>`. Text output omits the
+/// "Newly downloaded" line when the count is unknown. JSON/TOON output keeps
+/// reporting `0` in that case so the existing wire format is unchanged.
+///
+/// # Errors
+///
+/// Returns an error when nothing was requested, when configuring the cache
+/// directory or downloading fails, or when the output cannot be serialized.
+pub fn download_command(
+    languages: Vec<String>,
+    all: bool,
+    groups: Option<Vec<String>>,
+    cache_dir: Option<PathBuf>,
+    format: WireFormat,
+) -> Result<()> {
+    // Apply custom cache directory if provided
+    if let Some(ref dir) = cache_dir {
+        let config = tree_sitter_language_pack::PackConfig {
+            cache_dir: Some(dir.clone()),
+            languages: None,
+            groups: None,
+        };
+        tree_sitter_language_pack::configure(&config).context("Failed to configure custom cache directory")?;
+    }
+
+    // `None` means "count unknown": the group path goes through `init`, which returns no count.
+    let (count, description): (Option<usize>, String) = if all {
+        let downloaded =
+            tree_sitter_language_pack::download_all().context("Failed to download all tree-sitter grammars")?;
+        (Some(downloaded), "all available languages".to_string())
+    } else if let Some(ref group_list) = groups {
+        let config = tree_sitter_language_pack::PackConfig {
+            cache_dir: cache_dir.clone(),
+            languages: None,
+            groups: Some(group_list.clone()),
+        };
+        tree_sitter_language_pack::init(&config).context("Failed to download tree-sitter grammar groups")?;
+        (None, format!("groups: {}", group_list.join(", ")))
+    } else if !languages.is_empty() {
+        let refs: Vec<&str> = languages.iter().map(String::as_str).collect();
+        let downloaded =
+            tree_sitter_language_pack::download(&refs).context("Failed to download tree-sitter grammars")?;
+        (Some(downloaded), format!("languages: {}", languages.join(", ")))
+    } else {
+        anyhow::bail!("No languages specified. Use language names, --all, --groups, or --from-config.");
+    };
+
+    // Shared payload for the structured formats; built lazily so text mode pays nothing.
+    let structured_output = || {
+        let mut output = json!({
+            "requested": description,
+            "newly_downloaded": count.unwrap_or(0),
+        });
+        if let Some(ref dir) = cache_dir {
+            output["cache_dir"] = json!(dir.to_string_lossy());
+        }
+        output
+    };
+
+    match format {
+        WireFormat::Text => {
+            println!("{}", style::header("Tree-sitter Download"));
+            println!("{}", style::dim("===================="));
+            println!("{} {}", style::label("Requested:"), description);
+            // Only report a count when the branch that ran actually produced one.
+            if let Some(downloaded) = count {
+                println!(
+                    "{} {}",
+                    style::label("Newly downloaded:"),
+                    style::success(&downloaded.to_string())
+                );
+            }
+            if let Some(ref dir) = cache_dir {
+                println!(
+                    "{} {}",
+                    style::label("Cache directory:"),
+                    style::success(&dir.display().to_string())
+                );
+            }
+            println!("{}", style::success("Done"));
+        }
+        WireFormat::Json => {
+            println!(
+                "{}",
+                serde_json::to_string_pretty(&structured_output())
+                    .context("Failed to serialize download results to JSON")?
+            );
+        }
+        WireFormat::Toon => {
+            println!(
+                "{}",
+                serde_toon::to_string(&structured_output()).context("Failed to serialize download results to TOON")?
+            );
+        }
+    }
+
+    Ok(())
+}
+
+/// Execute the tree-sitter list command.
+///
+/// Lists either the downloaded languages (`downloaded_only`) or the languages
+/// from the manifest, optionally keeping only names that contain `filter`
+/// (compared case-insensitively), and prints them in the requested format.
+///
+/// # Errors
+///
+/// Returns an error when the manifest cannot be fetched or the output cannot
+/// be serialized.
+pub fn list_command(downloaded_only: bool, filter: Option<String>, format: WireFormat) -> Result<()> {
+    let (languages, source) = if downloaded_only {
+        (tree_sitter_language_pack::downloaded_languages(), "downloaded")
+    } else {
+        let manifest = tree_sitter_language_pack::manifest_languages()
+            .context("Failed to fetch tree-sitter language manifest")?;
+        (manifest, "available")
+    };
+
+    // Lowercase the needle once; each candidate is lowercased as it is tested.
+    let needle = filter.as_deref().map(str::to_lowercase);
+    let filtered: Vec<&String> = languages
+        .iter()
+        .filter(|name| match &needle {
+            Some(wanted) => name.to_lowercase().contains(wanted.as_str()),
+            None => true,
+        })
+        .collect();
+
+    // Shared payload for the structured formats.
+    let payload = || {
+        json!({
+            "source": source,
+            "count": filtered.len(),
+            "filter": filter,
+            "languages": filtered,
+        })
+    };
+
+    match format {
+        WireFormat::Text => {
+            let filter_note = match &filter {
+                Some(f) => format!(", filter: '{f}'"),
+                None => String::new(),
+            };
+            println!(
+                "{} ({} {}{})",
+                style::header("Tree-sitter Languages"),
+                filtered.len(),
+                source,
+                filter_note
+            );
+            println!("{}", style::dim("====================="));
+            for lang in &filtered {
+                println!("  {}", style::success(lang));
+            }
+        }
+        WireFormat::Json => {
+            let rendered =
+                serde_json::to_string_pretty(&payload()).context("Failed to serialize language list to JSON")?;
+            println!("{rendered}");
+        }
+        WireFormat::Toon => {
+            let rendered = serde_toon::to_string(&payload()).context("Failed to serialize language list to TOON")?;
+            println!("{rendered}");
+        }
+    }
+
+    Ok(())
+}
+
+/// Execute the tree-sitter cache-dir command.
+///
+/// Prints the cache directory reported by `tree_sitter_language_pack::cache_dir`
+/// in the requested format.
+///
+/// # Errors
+///
+/// Returns an error when the directory cannot be determined or the output
+/// cannot be serialized.
+pub fn cache_dir_command(format: WireFormat) -> Result<()> {
+    let cache_path =
+        tree_sitter_language_pack::cache_dir().context("Failed to determine tree-sitter cache directory")?;
+    let shown = cache_path.to_string_lossy();
+
+    // Shared payload for the structured formats.
+    let payload = || json!({ "cache_dir": shown });
+
+    match format {
+        WireFormat::Text => println!("{} {}", style::label("Cache directory:"), style::success(&shown)),
+        WireFormat::Json => {
+            let rendered =
+                serde_json::to_string_pretty(&payload()).context("Failed to serialize cache directory to JSON")?;
+            println!("{rendered}");
+        }
+        WireFormat::Toon => {
+            let rendered = serde_toon::to_string(&payload()).context("Failed to serialize cache directory to TOON")?;
+            println!("{rendered}");
+        }
+    }
+
+    Ok(())
+}
+
+/// Execute the tree-sitter clean command.
+///
+/// Calls `tree_sitter_language_pack::clean_cache` and then reports success in
+/// the requested format.
+///
+/// # Errors
+///
+/// Returns an error when cleaning the cache fails or the output cannot be
+/// serialized.
+pub fn clean_command(format: WireFormat) -> Result<()> {
+    tree_sitter_language_pack::clean_cache().context("Failed to clean tree-sitter cache")?;
+
+    // Shared payload for the structured formats.
+    let payload = || json!({ "status": "cleared" });
+
+    match format {
+        WireFormat::Text => println!("{}", style::success("Tree-sitter cache cleared successfully")),
+        WireFormat::Json => {
+            let rendered =
+                serde_json::to_string_pretty(&payload()).context("Failed to serialize clean result to JSON")?;
+            println!("{rendered}");
+        }
+        WireFormat::Toon => {
+            let rendered = serde_toon::to_string(&payload()).context("Failed to serialize clean result to TOON")?;
+            println!("{rendered}");
+        }
+    }
+
+    Ok(())
+}
--- a/crates/kreuzberg-cli/src/logging.rs
+++ b/crates/kreuzberg-cli/src/logging.rs
@@ -0,0 +1,238 @@
+//! Logging helpers for the Kreuzberg CLI.
+//!
+//! Provides a [`build_env_filter`] function that layers default third-party
+//! transport suppressions on top of whatever the caller or `RUST_LOG` specifies.
+//! User-supplied per-target rules always win: a default suppression is skipped
+//! whenever the base filter already contains a directive for the same target.
+
+use tracing_subscriber::EnvFilter;
+
+/// Third-party crates that are noisy at their own default level.
+///
+/// These are added as *fallback* directives: if `RUST_LOG` or `level_override`
+/// already contain a per-target rule for any of these crates it takes precedence,
+/// so the user can still do `RUST_LOG=ureq=debug` to restore full transport logs.
+///
+/// Every entry has the form `target=level`; [`directive_target`] relies on the
+/// `=` separator to recover the target name.
+const QUIET_DIRECTIVES: &[&str] = &[
+    // Capped at `warn`.
+    "ureq=warn",
+    "ureq_proto=warn",
+    "rustls=warn",
+    "hyper_util=warn",
+    // Capped at `info` rather than `warn`.
+    "hf_hub=info",
+    "tower_http=info",
+];
+
+/// Return the target portion of a `target=level` directive such as `"ureq=warn"`.
+///
+/// The target is everything before the first `=`. A directive without `=`
+/// (for example a bare level like `"info"`) yields `None`.
+fn directive_target(directive: &str) -> Option<&str> {
+    let (target, _level) = directive.split_once('=')?;
+    Some(target)
+}
+
+/// Build an [`EnvFilter`] with third-party transport crates suppressed by default.
+///
+/// The base filter is chosen from the first source that yields a valid filter:
+/// 1. `level_override`, when it is `Some` and parses.
+/// 2. The default environment variable (`RUST_LOG`).
+/// 3. The literal `"info"`.
+///
+/// Each entry of [`QUIET_DIRECTIVES`] is then added unless the base filter's
+/// rendered directive list already contains a directive for exactly the same
+/// target. Skipping is what lets a user-supplied rule such as `ureq=debug`
+/// survive. Targets are compared as whole names, so a rule for `hf_hub_server`
+/// does not block the `hf_hub` suppression.
+///
+/// # Arguments
+///
+/// * `level_override` — explicit filter string from a CLI flag (e.g. `"debug"`).
+///   When it parses, `RUST_LOG` is not consulted. A malformed value is ignored
+///   rather than causing a panic.
+///
+/// # Panics
+///
+/// Panics if an entry of [`QUIET_DIRECTIVES`] is not a valid directive, which
+/// would be a bug in this module.
+pub fn build_env_filter(level_override: Option<&str>) -> EnvFilter {
+    // `try_new` is used on user input so a malformed --log-level falls through
+    // to the environment / "info" instead of panicking the CLI.
+    let parsed_override = level_override.and_then(|level| EnvFilter::try_new(level).ok());
+    let mut filter = match parsed_override {
+        Some(from_flag) => from_flag,
+        None => EnvFilter::try_from_default_env().unwrap_or_else(|_| EnvFilter::new("info")),
+    };
+
+    // Snapshot the targets the user already configured, taken from the
+    // comma-separated rendering of the base filter.
+    let rendered = filter.to_string();
+    let configured: std::collections::HashSet<&str> = rendered
+        .split(',')
+        .filter_map(directive_target)
+        .map(str::trim)
+        .collect();
+
+    for directive in QUIET_DIRECTIVES {
+        let user_configured = directive_target(directive)
+            .map(|target| configured.contains(target))
+            .unwrap_or(false);
+        if user_configured {
+            continue;
+        }
+        filter = filter.add_directive(directive.parse().expect("built-in logging directive must be valid"));
+    }
+
+    filter
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    // NOTE: tests that call `build_env_filter(None)` fall through to the
+    // process environment, so their outcome depends on `RUST_LOG` being unset
+    // (or at least not overriding the targets under test) when they run.
+
+    /// Parse the directive string from an EnvFilter for assertion-level checks.
+    ///
+    /// `EnvFilter::to_string()` returns a comma-separated representation of all
+    /// directives. We use this as a stable, public inspection surface.
+    fn filter_directives(filter: &EnvFilter) -> String {
+        filter.to_string()
+    }
+
+    #[test]
+    fn default_filter_suppresses_ureq() {
+        // No env, no override → ureq and ureq_proto must be suppressed.
+        let filter = build_env_filter(None);
+        let directives = filter_directives(&filter);
+        assert!(
+            directives.contains("ureq=warn"),
+            "ureq=warn must be present in default filter; got: {directives}"
+        );
+        assert!(
+            directives.contains("ureq_proto=warn"),
+            "ureq_proto=warn must be present in default filter; got: {directives}"
+        );
+        assert!(
+            directives.contains("rustls=warn"),
+            "rustls=warn must be present in default filter; got: {directives}"
+        );
+    }
+
+    #[test]
+    fn default_filter_keeps_kreuzberg_info() {
+        // Root level info → kreuzberg has no suppression applied.
+        let filter = build_env_filter(None);
+        let directives = filter_directives(&filter);
+        assert!(
+            !directives.contains("kreuzberg=warn") && !directives.contains("kreuzberg=error"),
+            "kreuzberg must not be suppressed in the default filter; got: {directives}"
+        );
+    }
+
+    #[test]
+    fn env_override_wins_for_third_party() {
+        // Simulate RUST_LOG=ureq=debug by passing it as the level_override.
+        // build_env_filter must detect the existing ureq= directive and skip the
+        // ureq=warn suppression, so ureq=debug survives in the final filter.
+        let filter = build_env_filter(Some("info,ureq=debug"));
+        let directives = filter.to_string();
+        assert!(
+            directives.contains("ureq=debug"),
+            "user-supplied ureq=debug must be preserved; got: {directives}"
+        );
+        assert!(
+            !directives.contains("ureq=warn"),
+            "ureq=warn suppression must not be added when user already set ureq=debug; got: {directives}"
+        );
+    }
+
+    #[test]
+    fn level_override_wins() {
+        // CLI flag "debug" → root must be debug; suppression directives still present.
+        let filter = build_env_filter(Some("debug"));
+        let directives = filter_directives(&filter);
+        assert!(
+            directives.contains("debug"),
+            "root debug level must appear in filter with --log-level debug; got: {directives}"
+        );
+        // Suppression for ureq must still be layered on top.
+        assert!(
+            directives.contains("ureq=warn"),
+            "ureq=warn suppression must still be present even under --log-level debug; got: {directives}"
+        );
+    }
+
+    #[test]
+    fn tower_http_suppressed_at_default() {
+        // No override → tower_http must be suppressed.
+        let filter = build_env_filter(None);
+        let directives = filter_directives(&filter);
+        assert!(
+            directives.contains("tower_http=info") || directives.contains("tower_http=warn"),
+            "tower_http must be suppressed at default level; got: {directives}"
+        );
+    }
+
+    #[test]
+    fn all_quiet_directives_are_valid() {
+        // Ensure every built-in directive parses without panic.
+        // This guards the `expect` in `build_env_filter`.
+        for directive in super::QUIET_DIRECTIVES {
+            directive
+                .parse::<tracing_subscriber::filter::Directive>()
+                .expect("built-in directive is invalid");
+        }
+    }
+
+    #[test]
+    fn no_level_override_uses_info_root() {
+        // Without RUST_LOG set and no override, root should default to info.
+        // The directive string must not open with debug or trace as the root level.
+        let filter = build_env_filter(None);
+        let directives = filter_directives(&filter);
+        // Root "debug" or "trace" as the first token would mean root is debug/trace.
+        let root_is_noisier_than_info = directives.starts_with("debug") || directives.starts_with("trace");
+        assert!(
+            !root_is_noisier_than_info,
+            "default root level must not be debug/trace without RUST_LOG; got: {directives}"
+        );
+    }
+
+    #[test]
+    fn hf_hub_suppressed_at_default() {
+        // No override → the hf_hub=info fallback directive must be present.
+        let filter = build_env_filter(None);
+        let directives = filter_directives(&filter);
+        assert!(
+            directives.contains("hf_hub=info"),
+            "hf_hub must be suppressed to info at default; got: {directives}"
+        );
+    }
+
+    #[test]
+    fn hyper_util_suppressed_at_default() {
+        // No override → the hyper_util=warn fallback directive must be present.
+        let filter = build_env_filter(None);
+        let directives = filter_directives(&filter);
+        assert!(
+            directives.contains("hyper_util=warn"),
+            "hyper_util must be suppressed to warn at default; got: {directives}"
+        );
+    }
+
+    #[test]
+    fn malformed_level_override_falls_back_to_info() {
+        // Garbage CLI flag must NOT panic — try_new returns Err and we fall back
+        // to RUST_LOG / info default.
+        let filter = build_env_filter(Some(":::garbage"));
+        let directives = filter_directives(&filter);
+        // Quiet directives should still be layered, proving we recovered.
+        assert!(
+            directives.contains("ureq=warn"),
+            "ureq=warn must still be present after malformed override; got: {directives}"
+        );
+    }
+
+    #[test]
+    fn similar_target_name_does_not_block_suppression() {
+        // A user-supplied directive for `hf_hub_server` must NOT cause the
+        // `hf_hub=info` suppression to be skipped (regression test for the
+        // earlier substring-containment bug).
+        let filter = build_env_filter(Some("info,hf_hub_server=debug"));
+        let directives = filter.to_string();
+        assert!(
+            directives.contains("hf_hub_server=debug"),
+            "user directive for hf_hub_server must survive; got: {directives}"
+        );
+        assert!(
+            directives.contains("hf_hub=info"),
+            "hf_hub=info suppression must still be applied; got: {directives}"
+        );
+    }
+}
--- a/crates/kreuzberg-cli/src/main.rs
+++ b/crates/kreuzberg-cli/src/main.rs
@@ -0,0 +1,971 @@
+//! Kreuzberg CLI - Command-line interface for document intelligence.
+//!
+//! This binary provides a command-line interface to the Kreuzberg document intelligence
+//! library, supporting document extraction, MIME type detection, caching, and batch operations.
+//!
+//! # Architecture
+//!
+//! The CLI is built using `clap` for argument parsing and provides these main commands:
+//! - `extract`: Extract text/data from a single document
+//! - `batch`: Process multiple documents in parallel
+//! - `detect`: Identify MIME type of a file
+//! - `cache`: Manage cache (stats, clear, manifest, warm)
+//! - `serve`: Start API server (requires `api` feature)
+//! - `version`: Show version information
+//!
+//! # Configuration
+//!
+//! The CLI supports configuration files in TOML, YAML, or JSON formats:
+//! - Explicit: `--config path/to/config.toml`
+//! - Auto-discovery: Searches for `kreuzberg.{toml,yaml,json}` in current and parent directories
+//! - Inline JSON: `--config-json '{"ocr": {"backend": "tesseract"}}'`
+//! - Command-line flags override config file settings
+//!
+//! Configuration precedence (highest to lowest):
+//! 1. Individual CLI flags (--output-format, --ocr, etc.)
+//! 2. Inline JSON config (--config-json or --config-json-base64)
+//! 3. Config file (--config path.toml)
+//! 4. Default values
+//!
+//! # Exit Codes
+//!
+//! - 0: Success
+//! - Non-zero: Error (see stderr for details)
+//!
+//! # Examples
+//!
+//! ```bash
+//! # Extract text from a PDF
+//! kreuzberg extract document.pdf
+//!
+//! # Extract with OCR enabled
+//! kreuzberg extract scanned.pdf --ocr true
+//!
+//! # Extract with inline JSON config
+//! kreuzberg extract doc.pdf --config-json '{"ocr":{"backend":"tesseract"}}'
+//!
+//! # Batch processing
+//! kreuzberg batch *.pdf --output-format json
+//!
+//! # Detect MIME type
+//! kreuzberg detect unknown-file.bin
+//! ```
+
+#![deny(unsafe_code)]
+
+mod commands;
+mod logging;
+mod output;
+mod style;
+
+use anyhow::{Context, Result};
+use base64::{Engine as _, engine::general_purpose::STANDARD};
+use clap::{CommandFactory, Parser, Subcommand};
+#[cfg(feature = "embeddings")]
+use commands::embed_command;
+#[cfg(feature = "mcp")]
+use commands::mcp_command;
+use commands::overrides::ExtractionOverrides;
+#[cfg(feature = "api")]
+use commands::serve_command;
+use commands::{
+    batch_command, chunk_command, clear_command, extract_command,
+    extract_structured::{ExtractStructuredArgs, extract_structured_command},
+    load_config, manifest_command, stats_command, warm_command,
+};
+use kreuzberg::{OutputFormat as ContentOutputFormat, detect_mime_type};
+use serde_json::json;
+use std::path::{Path, PathBuf};
+
+// Root argument parser for the binary.
+//
+// NOTE: `///` comments on clap-derived items are rendered as `--help` text,
+// so editing them changes user-visible output; maintainer notes in this type
+// are therefore written as plain `//` comments.
+/// Kreuzberg document intelligence CLI
+#[derive(Parser)]
+#[command(name = "kreuzberg")]
+#[command(version, about, long_about = None)]
+struct Cli {
+    // Declared `global = true`, so the flag is accepted after a subcommand as
+    // well as before it. `main` forwards the value to
+    // `logging::build_env_filter`.
+    /// Set log level (trace, debug, info, warn, error). Overrides RUST_LOG env var.
+    #[arg(long, global = true)]
+    log_level: Option<String>,
+
+    // The subcommand selected by the user; dispatched in `main`.
+    #[command(subcommand)]
+    command: Commands,
+}
+
+// Top-level subcommands of the CLI; each variant is handled by one arm of the
+// `match` in `main`.
+//
+// NOTE: the `///` comments below are rendered by clap as `--help` text, so
+// they are user-facing strings. Maintainer notes use plain `//` comments.
+#[derive(Subcommand)]
+enum Commands {
+    /// Extract text from a document
+    Extract {
+        /// Path to the document
+        path: PathBuf,
+
+        /// Path to config file (TOML, YAML, or JSON). If not specified, searches for kreuzberg.toml/yaml/json in current and parent directories.
+        #[arg(short, long)]
+        config: Option<PathBuf>,
+
+        /// Inline JSON configuration. Applied after config file but before individual flags.
+        ///
+        /// Example: --config-json '{"ocr":{"backend":"tesseract"},"chunking":{"max_chars":1000}}'
+        #[arg(long)]
+        config_json: Option<String>,
+
+        /// Base64-encoded JSON configuration. Useful for shell environments where quotes are problematic.
+        ///
+        /// Example: --config-json-base64 eyJvY3IiOnsiYmFja2VuZCI6InRlc3NlcmFjdCJ9fQ==
+        #[arg(long)]
+        config_json_base64: Option<String>,
+
+        /// MIME type hint (auto-detected if not provided)
+        #[arg(short, long)]
+        mime_type: Option<String>,
+
+        // Defaults to "text" here, whereas `Batch` defaults to "json".
+        /// Output format for CLI results (text or json).
+        ///
+        /// Controls how the CLI displays results, not the extraction content format.
+        #[arg(short, long, default_value = "text")]
+        format: WireFormat,
+
+        /// Extraction configuration overrides
+        #[command(flatten)]
+        overrides: ExtractionOverrides,
+    },
+
+    /// Extract structured data from a document using an LLM
+    ExtractStructured {
+        /// Path to the document file
+        path: PathBuf,
+
+        /// Path to JSON schema file defining the output structure
+        #[arg(long)]
+        schema: PathBuf,
+
+        /// LLM model (e.g., "openai/gpt-4o")
+        #[arg(long)]
+        model: String,
+
+        /// API key for the LLM provider
+        #[arg(long)]
+        api_key: Option<String>,
+
+        /// Custom Jinja2 prompt template
+        #[arg(long)]
+        prompt: Option<String>,
+
+        // NOTE(review): because a `default_value` is set, this `Option` is
+        // presumably always `Some` after parsing — confirm whether the
+        // `Option` wrapper is still needed by `ExtractStructuredArgs`.
+        /// Schema name
+        #[arg(long, default_value = "extraction")]
+        schema_name: Option<String>,
+
+        /// Enable strict mode
+        #[arg(long)]
+        strict: bool,
+
+        /// Config file path
+        #[arg(short, long)]
+        config: Option<PathBuf>,
+
+        /// Output format (text or json)
+        #[arg(short, long, default_value = "json")]
+        format: WireFormat,
+    },
+
+    /// Batch extract from multiple documents
+    Batch {
+        // No minimum count is declared at the clap level; an empty list is
+        // rejected by `validate_batch_paths`, which `main` calls first.
+        /// Paths to documents
+        paths: Vec<PathBuf>,
+
+        /// Path to config file (TOML, YAML, or JSON). If not specified, searches for kreuzberg.toml/yaml/json in current and parent directories.
+        #[arg(short, long)]
+        config: Option<PathBuf>,
+
+        /// Inline JSON configuration. Applied after config file but before individual flags.
+        ///
+        /// Example: --config-json '{"ocr":{"backend":"tesseract"},"chunking":{"max_chars":1000}}'
+        #[arg(long)]
+        config_json: Option<String>,
+
+        /// Base64-encoded JSON configuration. Useful for shell environments where quotes are problematic.
+        ///
+        /// Example: --config-json-base64 eyJvY3IiOnsiYmFja2VuZCI6InRlc3NlcmFjdCJ9fQ==
+        #[arg(long)]
+        config_json_base64: Option<String>,
+
+        /// Output format for CLI results (text or json).
+        ///
+        /// Controls how the CLI displays results, not the extraction content format.
+        #[arg(short, long, default_value = "json")]
+        format: WireFormat,
+
+        /// Extraction configuration overrides
+        #[command(flatten)]
+        overrides: ExtractionOverrides,
+
+        // `main` reads this file and parses it into a
+        // `HashMap<String, serde_json::Value>` before calling `batch_command`.
+        /// Path to a JSON file mapping file paths to per-file extraction config overrides.
+        /// The JSON should be an object where keys are file paths and values are FileExtractionConfig objects.
+        /// Example: {"doc1.pdf": {"force_ocr": true}, "doc2.pdf": {"output_format": "markdown"}}
+        #[arg(long)]
+        file_configs: Option<PathBuf>,
+    },
+
+    /// Detect MIME type of a file
+    Detect {
+        /// Path to the file
+        path: PathBuf,
+
+        /// Output format (text or json)
+        #[arg(short, long, default_value = "text")]
+        format: WireFormat,
+    },
+
+    /// List all supported document formats
+    Formats {
+        /// Output format (text or json)
+        #[arg(short, long, default_value = "text")]
+        format: WireFormat,
+    },
+
+    /// Show version information
+    Version {
+        /// Output format (text or json)
+        #[arg(short, long, default_value = "text")]
+        format: WireFormat,
+    },
+
+    /// Cache management operations
+    Cache {
+        #[command(subcommand)]
+        command: CacheCommands,
+    },
+
+    // Only compiled when the `api` feature is enabled.
+    /// Start the API server
+    ///
+    /// Configuration is loaded with the following precedence (highest to lowest):
+    /// 1. CLI arguments (--host, --port)
+    /// 2. Environment variables (KREUZBERG_HOST, KREUZBERG_PORT)
+    /// 3. Config file (TOML, YAML, or JSON)
+    /// 4. Built-in defaults (127.0.0.1:8000)
+    ///
+    /// The config file can contain both extraction and server settings under [server] section.
+    #[cfg(feature = "api")]
+    Serve {
+        // Short flag is `-H` (uppercase), set explicitly via `short = 'H'`.
+        /// Host to bind to (e.g., "127.0.0.1" or "0.0.0.0"). CLI arg overrides config file and env vars.
+        #[arg(short = 'H', long)]
+        host: Option<String>,
+
+        /// Port to bind to. CLI arg overrides config file and env vars.
+        #[arg(short, long)]
+        port: Option<u16>,
+
+        /// Path to config file (TOML, YAML, or JSON). If not specified, searches for kreuzberg.toml/yaml/json in current and parent directories.
+        #[arg(short, long)]
+        config: Option<PathBuf>,
+    },
+
+    // Only compiled when the `mcp` feature is enabled.
+    /// Start the MCP (Model Context Protocol) server
+    #[cfg(feature = "mcp")]
+    Mcp {
+        /// Path to config file (TOML, YAML, or JSON). If not specified, searches for kreuzberg.toml/yaml/json in current and parent directories.
+        #[arg(short, long)]
+        config: Option<PathBuf>,
+
+        // Free-form string at the clap level; `main` passes it unchanged to
+        // `mcp_command`, which is where it gets interpreted.
+        /// Transport mode: stdio (default) or http
+        #[arg(long, default_value = "stdio")]
+        transport: String,
+
+        /// HTTP host (only for --transport http)
+        #[arg(long, default_value = "127.0.0.1")]
+        host: String,
+
+        /// HTTP port (only for --transport http)
+        #[arg(long, default_value = "8001")]
+        port: u16,
+    },
+
+    // Only compiled when the `api` feature is enabled.
+    /// API utilities
+    #[cfg(feature = "api")]
+    Api {
+        #[command(subcommand)]
+        command: ApiCommands,
+    },
+
+    // Only compiled when the `embeddings` feature is enabled.
+    /// Generate embeddings for text
+    ///
+    /// Generates vector embeddings for one or more text inputs using a specified preset model
+    /// or an LLM provider. Reads from --text flag or stdin if no text is provided.
+    #[cfg(feature = "embeddings")]
+    Embed {
+        // When this list is empty, `main` reads a single input from stdin.
+        /// Text to embed. Can be specified multiple times for batch embedding.
+        #[arg(long)]
+        text: Vec<String>,
+
+        /// Embedding preset (fast, balanced, quality, multilingual). Used with --provider local.
+        #[arg(long, default_value = "balanced")]
+        preset: String,
+
+        /// Embedding provider: "local" (default, ONNX), "llm" (liter-llm), or "plugin" (registered in-process backend)
+        #[arg(long, default_value = "local")]
+        provider: String,
+
+        /// LLM model for provider-hosted embeddings (e.g., "openai/text-embedding-3-small").
+        /// Required when --provider is "llm".
+        #[arg(long)]
+        model: Option<String>,
+
+        /// API key for the LLM provider
+        #[arg(long)]
+        api_key: Option<String>,
+
+        /// Name of a pre-registered in-process embedding backend.
+        /// Required when --provider is "plugin". The backend must have been
+        /// registered via `kreuzberg::plugins::register_embedding_backend`
+        /// before this command runs.
+        #[arg(long)]
+        plugin: Option<String>,
+
+        /// Output format (text or json)
+        #[arg(short, long, default_value = "json")]
+        format: WireFormat,
+    },
+
+    /// Chunk text for processing
+    ///
+    /// Splits text into chunks using configurable size and overlap.
+    /// Reads from --text flag or stdin if no text is provided.
+    Chunk {
+        /// Text to chunk. If not provided, reads from stdin.
+        #[arg(long)]
+        text: Option<String>,
+
+        /// Path to config file (TOML, YAML, or JSON)
+        #[arg(short, long)]
+        config: Option<PathBuf>,
+
+        // Checked together with `chunk_overlap` by `validate_chunk_params`.
+        /// Chunk size in characters
+        #[arg(long)]
+        chunk_size: Option<usize>,
+
+        /// Chunk overlap in characters
+        #[arg(long)]
+        chunk_overlap: Option<usize>,
+
+        // Free-form string at the clap level; `main` matches it against the
+        // known chunker names.
+        // NOTE(review): because of the "text" default, this value is always
+        // present, so a chunker type set in the config file looks like it can
+        // never take effect for this command — confirm.
+        /// Chunker type: text, markdown, yaml, or semantic
+        #[arg(long, default_value = "text")]
+        chunker_type: String,
+
+        /// Tokenizer model for token-based chunk sizing (e.g., "Xenova/gpt-4o").
+        /// Requires the chunking-tokenizers feature.
+        #[arg(long)]
+        chunking_tokenizer: Option<String>,
+
+        /// Topic threshold for semantic chunking (0.0-1.0, default: 0.75)
+        #[arg(long)]
+        topic_threshold: Option<f32>,
+
+        /// Output format (text or json)
+        #[arg(short, long, default_value = "json")]
+        format: WireFormat,
+    },
+
+    /// Generate shell completions
+    ///
+    /// Outputs shell completion scripts for the specified shell.
+    /// Install with: eval "$(kreuzberg completions bash)"
+    Completions {
+        /// Shell to generate completions for
+        #[arg(value_enum)]
+        shell: clap_complete::Shell,
+    },
+}
+
+// Subcommands of `kreuzberg api`; only compiled when the `api` feature is
+// enabled. The `///` comments are rendered by clap as help text.
+#[cfg(feature = "api")]
+#[derive(Subcommand)]
+enum ApiCommands {
+    // Handled in `main` by printing `kreuzberg::api::openapi::openapi_json()`.
+    /// Output the OpenAPI schema (JSON)
+    ///
+    /// Prints the full OpenAPI 3.1 specification for the kreuzberg REST API.
+    /// Useful for code generation, documentation, and API client tooling.
+    Schema,
+}
+
+// Subcommands of `kreuzberg cache`. Each variant is forwarded by `main` to the
+// matching `*_command` function (`stats_command`, `clear_command`,
+// `manifest_command`, `warm_command`).
+//
+// NOTE: the `///` comments below are rendered by clap as `--help` text.
+#[derive(Subcommand)]
+enum CacheCommands {
+    /// Show cache statistics
+    Stats {
+        /// Cache directory (default: .kreuzberg in current directory)
+        #[arg(short, long)]
+        cache_dir: Option<PathBuf>,
+
+        /// Output format (text or json)
+        #[arg(short, long, default_value = "text")]
+        format: WireFormat,
+    },
+
+    /// Clear the cache
+    Clear {
+        /// Cache directory (default: .kreuzberg in current directory)
+        #[arg(short, long)]
+        cache_dir: Option<PathBuf>,
+
+        /// Output format (text or json)
+        #[arg(short, long, default_value = "text")]
+        format: WireFormat,
+    },
+
+    // Unlike the other cache subcommands, this one defaults to JSON output and
+    // takes no cache directory.
+    /// Output model manifest (expected model files, checksums, sizes)
+    ///
+    /// Outputs a JSON manifest of all model files required by kreuzberg,
+    /// including their relative paths, SHA256 checksums, and sizes.
+    /// Used for pre-populating model caches in containerized deployments.
+    Manifest {
+        /// Output format (text or json)
+        #[arg(short, long, default_value = "json")]
+        format: WireFormat,
+    },
+
+    /// Download all models eagerly
+    ///
+    /// Downloads all PaddleOCR and layout detection models for all supported
+    /// languages. Unlike normal operation which downloads lazily on first use,
+    /// this ensures all models are present in the cache directory.
+    ///
+    /// Use --all-embeddings to also download all 4 embedding model presets,
+    /// or --embedding-model <preset> to download a specific one.
+    ///
+    /// By default, only the core layout models (rtdetr + tatr) are downloaded.
+    /// Use --all-table-models to also download SLANeXT variants (~730MB).
+    Warm {
+        /// Cache directory (default: .kreuzberg in current directory, or KREUZBERG_CACHE_DIR)
+        #[arg(short, long)]
+        cache_dir: Option<PathBuf>,
+
+        /// Output format (text or json)
+        #[arg(short, long, default_value = "text")]
+        format: WireFormat,
+
+        /// Download all embedding model presets (fast, balanced, quality, multilingual)
+        #[arg(long)]
+        all_embeddings: bool,
+
+        /// Download a specific embedding model preset
+        #[arg(long, value_name = "PRESET")]
+        embedding_model: Option<String>,
+
+        // NOTE(review): the explicit `help = ...` repeats the doc comment
+        // verbatim; one of the two looks redundant — confirm before removing.
+        /// Download all table structure models including SLANeXT variants (~730MB)
+        #[arg(
+            long,
+            help = "Download all table structure models including SLANeXT variants (~730MB)"
+        )]
+        all_table_models: bool,
+
+        /// Download all tree-sitter grammar parsers
+        #[arg(long)]
+        all_grammars: bool,
+
+        // `value_delimiter = ','` makes clap split a single argument on commas
+        // into the `Vec`.
+        /// Download specific tree-sitter grammar groups (comma-separated: web,systems,scripting,data,jvm,functional)
+        #[arg(long, value_name = "GROUPS", value_delimiter = ',')]
+        grammar_groups: Option<Vec<String>>,
+
+        /// Download specific tree-sitter grammars by language name (comma-separated)
+        #[arg(long, value_name = "LANGUAGES", value_delimiter = ',')]
+        grammars: Option<Vec<String>>,
+    },
+}
+
+/// Serialization format used when the CLI prints its results.
+///
+/// This is the type of the `format` arguments declared in this file. It is
+/// built from a string through [`std::str::FromStr`], matching names
+/// case-insensitively.
+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
+enum WireFormat {
+    /// Selected by the name `text`.
+    Text,
+    /// Selected by the name `json`.
+    Json,
+    /// Selected by the name `toon`.
+    Toon,
+}
+
+impl std::str::FromStr for WireFormat {
+    type Err = String;
+
+    /// Parses a format name, ignoring case.
+    ///
+    /// # Errors
+    ///
+    /// Returns a message naming the rejected input and the accepted values
+    /// when `s` is not one of `text`, `json`, or `toon`.
+    fn from_str(s: &str) -> std::result::Result<Self, Self::Err> {
+        // Lookup table of accepted (lower-case) names and their variants.
+        const KNOWN: [(&str, WireFormat); 3] = [
+            ("text", WireFormat::Text),
+            ("json", WireFormat::Json),
+            ("toon", WireFormat::Toon),
+        ];
+
+        let lowered = s.to_lowercase();
+        KNOWN
+            .iter()
+            .find(|(name, _)| *name == lowered.as_str())
+            .map(|&(_, variant)| variant)
+            .ok_or_else(|| format!("Invalid format: {}. Use 'text', 'json', or 'toon'", s))
+    }
+}
+
+// CLI-side mirror of `kreuzberg::OutputFormat` (imported in this file as
+// `ContentOutputFormat`); converted through the `From` impl that follows this
+// enum. Derives `clap::ValueEnum`, so the variant doc comments are
+// user-facing help strings and are left untouched.
+/// Content output format for extraction results.
+///
+/// Controls the format of the extracted content (not the CLI output format).
+#[derive(Clone, Copy, Debug, PartialEq, Eq, clap::ValueEnum)]
+enum ContentOutputFormatArg {
+    /// Plain text (default)
+    Plain,
+    /// Markdown format
+    Markdown,
+    /// Djot markup format
+    Djot,
+    /// HTML format
+    Html,
+    /// JSON tree format with heading-driven sections
+    Json,
+}
+
+/// Maps each CLI-level variant onto the library variant of the same name.
+impl From<ContentOutputFormatArg> for ContentOutputFormat {
+    fn from(arg: ContentOutputFormatArg) -> Self {
+        // Kept as an exhaustive match (no `_` arm) so that adding a CLI
+        // variant without a mapping fails to compile.
+        match arg {
+            ContentOutputFormatArg::Plain => Self::Plain,
+            ContentOutputFormatArg::Markdown => Self::Markdown,
+            ContentOutputFormatArg::Djot => Self::Djot,
+            ContentOutputFormatArg::Html => Self::Html,
+            ContentOutputFormatArg::Json => Self::Json,
+        }
+    }
+}
+
+/// Checks that `path` refers to an existing regular file.
+///
+/// The lookup follows symbolic links. Any failure to read the file's metadata
+/// (missing path, permission problem, ...) is reported as "not found".
+///
+/// # Errors
+///
+/// Returns an error if:
+/// - Metadata for the path cannot be read (the path is treated as not existing)
+/// - The path exists but is not a regular file (e.g., is a directory)
+fn validate_file_exists(path: &Path) -> Result<()> {
+    match std::fs::metadata(path) {
+        Ok(meta) if meta.is_file() => Ok(()),
+        Ok(_) => anyhow::bail!(
+            "Path is not a file: '{}'. Please provide a path to a regular file.",
+            path.display()
+        ),
+        Err(_) => anyhow::bail!(
+            "File not found: '{}'. Please check that the file exists and is accessible.",
+            path.display()
+        ),
+    }
+}
+
+/// Validates chunking parameters for correctness.
+///
+/// Ensures that chunking configuration makes sense: size must be positive and reasonable,
+/// and overlap must be smaller than chunk size. This prevents common configuration errors
+/// that would lead to cryptic failures from the underlying library.
+///
+/// Only values that are actually supplied are checked: a `None` size skips the size
+/// checks, and the overlap check runs only when both values are `Some`.
+///
+/// # Errors
+///
+/// Returns an error if:
+/// - `chunk_size` is 0 (must be at least 1 character)
+/// - `chunk_size` exceeds 1,000,000 characters (to prevent excessive memory usage)
+/// - `chunk_overlap` is greater than or equal to `chunk_size` (overlap must be smaller)
+fn validate_chunk_params(chunk_size: Option<usize>, chunk_overlap: Option<usize>) -> Result<()> {
+    // Largest accepted chunk size (inclusive).
+    const MAX_CHUNK_SIZE: usize = 1_000_000;
+
+    if let Some(size) = chunk_size {
+        if size == 0 {
+            anyhow::bail!("Invalid chunk size: {}. Chunk size must be greater than 0.", size);
+        }
+        if size > MAX_CHUNK_SIZE {
+            // The limit is inclusive, so the message says "must not exceed"
+            // rather than "less than" (exactly 1,000,000 is accepted).
+            anyhow::bail!(
+                "Invalid chunk size: {}. Chunk size must not exceed 1,000,000 characters to avoid excessive memory usage.",
+                size
+            );
+        }
+    }
+
+    if let Some(overlap) = chunk_overlap
+        && let Some(size) = chunk_size
+        && overlap >= size
+    {
+        anyhow::bail!(
+            "Invalid chunk overlap: {}. Overlap ({}) must be less than chunk size ({}).",
+            overlap,
+            overlap,
+            size
+        );
+    }
+
+    Ok(())
+}
+
+/// Checks the input list of a batch extraction before any work starts.
+///
+/// The list must be non-empty, and every entry must pass
+/// `validate_file_exists`. Entries are checked in order and the first failure
+/// is returned, annotated with the entry's 1-based position.
+///
+/// # Errors
+///
+/// Returns an error if:
+/// - The paths array is empty (at least one file is required)
+/// - Any path does not exist or is not a regular file
+fn validate_batch_paths(paths: &[PathBuf]) -> Result<()> {
+    if paths.is_empty() {
+        anyhow::bail!("No files provided for batch extraction. Please provide at least one file path.");
+    }
+
+    paths.iter().enumerate().try_for_each(|(index, candidate)| {
+        let position = index + 1;
+        validate_file_exists(candidate).with_context(|| format!("Invalid file at position {}", position))
+    })
+}
+
+/// Apply inline JSON or base64 JSON overrides to an extraction config.
+///
+/// At most one source is applied. `config_json` takes precedence; if
+/// `config_json_base64` is supplied as well it is ignored, and a warning is
+/// written to stderr so the dropped input is not silent. When neither is
+/// supplied, `config` is left untouched.
+///
+/// # Errors
+///
+/// Returns an error if the base64 payload cannot be decoded or is not valid
+/// UTF-8, if the selected payload is not valid JSON, or if merging it into
+/// `config` fails.
+fn apply_json_overrides(
+    config: &mut kreuzberg::ExtractionConfig,
+    config_json: Option<String>,
+    config_json_base64: Option<String>,
+) -> Result<()> {
+    // Resolve the JSON text plus the error contexts that belong to its source,
+    // so parsing and merging below are written only once.
+    let (json_str, parse_context, merge_context) = match (config_json, config_json_base64) {
+        (Some(inline), shadowed) => {
+            if shadowed.is_some() {
+                eprintln!(
+                    "warning: both --config-json and --config-json-base64 were provided; --config-json-base64 is ignored"
+                );
+            }
+            (
+                inline,
+                "Failed to parse --config-json as JSON",
+                "Failed to merge --config-json with file config",
+            )
+        }
+        (None, Some(encoded)) => {
+            let json_bytes = STANDARD
+                .decode(&encoded)
+                .context("Failed to decode base64 in --config-json-base64")?;
+            let decoded = String::from_utf8(json_bytes).context("Base64-decoded content is not valid UTF-8")?;
+            (
+                decoded,
+                "Failed to parse decoded --config-json-base64 as JSON",
+                "Failed to merge --config-json-base64 with file config",
+            )
+        }
+        (None, None) => return Ok(()),
+    };
+
+    let json_value: serde_json::Value = serde_json::from_str(&json_str).context(parse_context)?;
+    *config = merge_json_into_config(config, json_value).context(merge_context)?;
+    Ok(())
+}
+
+/// Produces a new extraction config by merging `json_value` into `base_config`.
+///
+/// The value is serialized back to a JSON string and handed to
+/// `kreuzberg::core::config::merge::merge_config_json`, which performs the
+/// actual merge. `base_config` itself is not modified.
+///
+/// # Errors
+///
+/// Returns an error if serializing `json_value` fails or if the library merge
+/// reports a failure; in both cases only the error's display text is kept.
+fn merge_json_into_config(
+    base_config: &kreuzberg::ExtractionConfig,
+    json_value: serde_json::Value,
+) -> Result<kreuzberg::ExtractionConfig> {
+    let serialized = match serde_json::to_string(&json_value) {
+        Ok(text) => text,
+        Err(err) => return Err(anyhow::anyhow!("{}", err)),
+    };
+
+    match kreuzberg::core::config::merge::merge_config_json(base_config, &serialized) {
+        Ok(merged) => Ok(merged),
+        Err(err) => Err(anyhow::anyhow!("{}", err)),
+    }
+}
+
+/// CLI entry point: parses arguments, installs logging, and dispatches to the
+/// selected subcommand.
+///
+/// Log output is written to stderr, so stdout carries only command results.
+///
+/// # Errors
+///
+/// Returns the first error produced by argument validation, configuration
+/// loading, or the invoked command.
+fn main() -> Result<()> {
+    let cli = Cli::parse();
+
+    let env_filter = logging::build_env_filter(cli.log_level.as_deref());
+
+    // `try_init` fails when a global subscriber is already installed; that is
+    // deliberately ignored rather than aborting the command.
+    let _ = tracing_subscriber::fmt()
+        .with_env_filter(env_filter)
+        .with_writer(std::io::stderr)
+        .try_init();
+
+    match cli.command {
+        Commands::Extract {
+            path,
+            config: config_path,
+            config_json,
+            config_json_base64,
+            mime_type,
+            format,
+            overrides,
+        } => {
+            validate_file_exists(&path)?;
+            overrides.validate()?;
+
+            // Layer the configuration: file, then inline JSON, then flags.
+            let mut config = load_config(config_path)?;
+            apply_json_overrides(&mut config, config_json, config_json_base64)?;
+            overrides.apply(&mut config);
+
+            extract_command(path, config, mime_type, format)?;
+        }
+
+        Commands::ExtractStructured {
+            path,
+            schema,
+            model,
+            api_key,
+            prompt,
+            schema_name,
+            strict,
+            config,
+            format,
+        } => {
+            validate_file_exists(&path)?;
+            validate_file_exists(&schema)?;
+            extract_structured_command(ExtractStructuredArgs {
+                path,
+                schema_path: schema,
+                model,
+                api_key,
+                prompt,
+                schema_name,
+                strict,
+                config_path: config,
+                format,
+            })?;
+        }
+
+        Commands::Batch {
+            paths,
+            config: config_path,
+            config_json,
+            config_json_base64,
+            format,
+            overrides,
+            file_configs,
+        } => {
+            validate_batch_paths(&paths)?;
+            overrides.validate()?;
+
+            // Same layering as `extract`: file, then inline JSON, then flags.
+            let mut config = load_config(config_path)?;
+            apply_json_overrides(&mut config, config_json, config_json_base64)?;
+            overrides.apply(&mut config);
+
+            // Optional per-file overrides, kept as raw JSON values keyed by path.
+            let file_configs_map = if let Some(file_configs_path) = file_configs {
+                let file_configs_json = std::fs::read_to_string(&file_configs_path)
+                    .with_context(|| format!("Failed to read file configs from '{}'", file_configs_path.display()))?;
+                let map: std::collections::HashMap<String, serde_json::Value> =
+                    serde_json::from_str(&file_configs_json).with_context(|| {
+                        format!(
+                            "Failed to parse file configs JSON from '{}'",
+                            file_configs_path.display()
+                        )
+                    })?;
+                Some(map)
+            } else {
+                None
+            };
+            batch_command(paths, file_configs_map, config, format)?;
+        }
+
+        Commands::Detect { path, format } => {
+            validate_file_exists(&path)?;
+
+            let path_str = path.to_string_lossy().to_string();
+            let mime_type = detect_mime_type(path_str.clone(), true).with_context(|| {
+                format!(
+                    "Failed to detect MIME type for file '{}'. Ensure the file is readable.",
+                    path.display()
+                )
+            })?;
+
+            match format {
+                WireFormat::Text => {
+                    println!("{}", style::success(&mime_type));
+                }
+                WireFormat::Json => {
+                    let output = json!({
+                        "path": path_str,
+                        "mime_type": mime_type,
+                    });
+                    println!(
+                        "{}",
+                        serde_json::to_string_pretty(&output)
+                            .context("Failed to serialize MIME type detection result to JSON")?
+                    );
+                }
+                WireFormat::Toon => {
+                    let output = json!({
+                        "path": path_str,
+                        "mime_type": mime_type,
+                    });
+                    println!(
+                        "{}",
+                        serde_toon::to_string(&output)
+                            .context("Failed to serialize MIME type detection result to TOON")?
+                    );
+                }
+            }
+        }
+
+        Commands::Formats { format } => {
+            let formats = kreuzberg::core::mime::list_supported_formats();
+            match format {
+                WireFormat::Text => {
+                    println!("{:<15} {}", style::label("EXTENSION"), style::label("MIME TYPE"));
+                    println!("{}", style::dim(&format!("{:<15} ---------", "---------")));
+                    for f in &formats {
+                        println!("{:<15} {}", style::success(&format!(".{}", f.extension)), f.mime_type);
+                    }
+                }
+                WireFormat::Json => {
+                    println!(
+                        "{}",
+                        serde_json::to_string_pretty(&formats).context("Failed to serialize formats to JSON")?
+                    );
+                }
+                WireFormat::Toon => {
+                    println!(
+                        "{}",
+                        serde_toon::to_string(&formats).context("Failed to serialize formats to TOON")?
+                    );
+                }
+            }
+        }
+
+        Commands::Version { format } => {
+            // Both values are baked in at compile time from Cargo metadata.
+            let version = env!("CARGO_PKG_VERSION");
+            let name = env!("CARGO_PKG_NAME");
+
+            match format {
+                WireFormat::Text => {
+                    println!("{} {}", style::label(name), style::success(version));
+                }
+                WireFormat::Json => {
+                    let output = json!({
+                        "name": name,
+                        "version": version,
+                    });
+                    println!(
+                        "{}",
+                        serde_json::to_string_pretty(&output)
+                            .context("Failed to serialize version information to JSON")?
+                    );
+                }
+                WireFormat::Toon => {
+                    let output = json!({
+                        "name": name,
+                        "version": version,
+                    });
+                    println!(
+                        "{}",
+                        serde_toon::to_string(&output).context("Failed to serialize version information to TOON")?
+                    );
+                }
+            }
+        }
+
+        #[cfg(feature = "api")]
+        Commands::Serve {
+            host: cli_host,
+            port: cli_port,
+            config: config_path,
+        } => {
+            let mut extraction_config = load_config(config_path.clone())?;
+            extraction_config.apply_env_overrides()?;
+            serve_command(cli_host, cli_port, extraction_config, config_path)?;
+        }
+
+        #[cfg(feature = "mcp")]
+        Commands::Mcp {
+            config: config_path,
+            transport,
+            // `host` and `port` are bound unconditionally: the previous
+            // per-feature `#[cfg]` pairs bound the same names either way.
+            host,
+            port,
+        } => {
+            let mut config = load_config(config_path)?;
+            config.apply_env_overrides()?;
+            mcp_command(config, transport, host, port)?;
+        }
+
+        Commands::Cache { command } => match command {
+            CacheCommands::Stats { cache_dir, format } => {
+                stats_command(cache_dir, format)?;
+            }
+            CacheCommands::Clear { cache_dir, format } => {
+                clear_command(cache_dir, format)?;
+            }
+            CacheCommands::Manifest { format } => {
+                manifest_command(format)?;
+            }
+            CacheCommands::Warm {
+                cache_dir,
+                format,
+                all_embeddings,
+                embedding_model,
+                all_table_models,
+                all_grammars,
+                grammar_groups,
+                grammars,
+            } => {
+                warm_command(
+                    cache_dir,
+                    format,
+                    all_embeddings,
+                    embedding_model,
+                    all_table_models,
+                    all_grammars,
+                    grammar_groups,
+                    grammars,
+                )?;
+            }
+        },
+
+        #[cfg(feature = "api")]
+        Commands::Api { command } => match command {
+            ApiCommands::Schema => {
+                println!("{}", kreuzberg::api::openapi::openapi_json());
+            }
+        },
+
+        #[cfg(feature = "embeddings")]
+        Commands::Embed {
+            text,
+            preset,
+            provider,
+            model,
+            api_key,
+            plugin,
+            format,
+        } => {
+            // With no `--text` arguments, embed a single input read from stdin.
+            let texts = if text.is_empty() {
+                vec![commands::read_stdin()?]
+            } else {
+                text
+            };
+            embed_command(texts, &preset, &provider, model, api_key, plugin, format)?;
+        }
+
+        Commands::Chunk {
+            text,
+            config: config_path,
+            chunk_size,
+            chunk_overlap,
+            chunker_type,
+            chunking_tokenizer,
+            topic_threshold,
+            format,
+        } => {
+            // Reject bad flags before touching stdin, so a typo fails fast
+            // instead of first blocking on input.
+            validate_chunk_params(chunk_size, chunk_overlap)?;
+
+            // An unrecognised chunker name is an error rather than a silent
+            // fallback to the text chunker.
+            let resolved_chunker = match chunker_type.as_str() {
+                "text" => kreuzberg::ChunkerType::Text,
+                "markdown" => kreuzberg::ChunkerType::Markdown,
+                "yaml" => kreuzberg::ChunkerType::Yaml,
+                "semantic" => kreuzberg::ChunkerType::Semantic,
+                other => anyhow::bail!(
+                    "Invalid chunker type: '{}'. Use 'text', 'markdown', 'yaml', or 'semantic'.",
+                    other
+                ),
+            };
+
+            let input = match text {
+                Some(t) => t,
+                None => commands::read_stdin().context("No --text provided and failed to read from stdin")?,
+            };
+
+            let base_config = load_config(config_path)?;
+            let mut chunking_config = base_config.chunking.unwrap_or_default();
+
+            if let Some(size) = chunk_size {
+                chunking_config.max_characters = size;
+                // If user set chunk_size but not overlap, clamp overlap to fit
+                if chunk_overlap.is_none() && chunking_config.overlap >= size {
+                    chunking_config.overlap = size / 4;
+                }
+            }
+            if let Some(overlap) = chunk_overlap {
+                chunking_config.overlap = overlap;
+            }
+            chunking_config.chunker_type = resolved_chunker;
+            if let Some(model) = chunking_tokenizer {
+                chunking_config.sizing = kreuzberg::ChunkSizing::Tokenizer { model, cache_dir: None };
+            }
+            if topic_threshold.is_some() {
+                chunking_config.topic_threshold = topic_threshold;
+            }
+
+            chunk_command(input, chunking_config, format)?;
+        }
+
+        Commands::Completions { shell } => {
+            let mut cmd = Cli::command();
+            clap_complete::generate(shell, &mut cmd, "kreuzberg", &mut std::io::stdout());
+        }
+    }
+
+    Ok(())
+}
--- a/crates/kreuzberg-cli/src/output.rs
+++ b/crates/kreuzberg-cli/src/output.rs
@@ -0,0 +1,32 @@
+//! JSON envelope types for CLI output.
+//!
+//! When `--format json` is used, extraction results are wrapped in these envelopes
+//! so tooling (such as the benchmark harness) can read timing information without
+//! parsing stderr or running a separate profiling tool.
+
+use kreuzberg::ExtractionResult;
+use serde::Serialize;
+
+/// Single-file extraction result paired with its wall-clock timing.
+///
+/// Emitted to stdout by `kreuzberg extract --format json`; serialize-only (no `Deserialize`).
+#[derive(Debug, Serialize)]
+pub struct ExtractEnvelope {
+    /// The extraction result (content, metadata, tables, …), serialized under `result`.
+    pub result: ExtractionResult,
+    /// Wall-clock time for the extraction call, in (fractional) milliseconds.
+    pub extraction_time_ms: f64,
+}
+
+/// Batch extraction results with per-file and total timing.
+///
+/// Emitted to stdout by `kreuzberg batch --format json`; serialize-only (no `Deserialize`).
+#[derive(Debug, Serialize)]
+pub struct BatchEnvelope {
+    /// One result per input file, in input order.
+    pub results: Vec<ExtractionResult>,
+    /// Total wall-clock time for the whole batch, in (fractional) milliseconds.
+    pub total_ms: f64,
+    /// Per-file wall-clock times in milliseconds; entry `i` belongs to `results[i]`.
+    pub per_file_ms: Vec<f64>,
+}
--- a/crates/kreuzberg-cli/src/style.rs
+++ b/crates/kreuzberg-cli/src/style.rs
@@ -0,0 +1,104 @@
+//! CLI color styling helpers using `anstyle`.
+//!
+//! Provides styled output for the kreuzberg CLI. Respects the `NO_COLOR`
+//! environment variable (<https://no-color.org/>). No terminal detection is
+//! performed here, so colors stay on when output is piped or redirected.
+
+use anstyle::{AnsiColor, Effects, Style};
+use std::sync::OnceLock;
+
+/// Section headers: blue foreground, bold.
+///
+/// Like the rest of the palette below, this is a `const` evaluated at compile time.
+const HEADER: Style = AnsiColor::Blue.on_default().effects(Effects::BOLD);
+
+/// Success values (MIME types, file paths, versions): green foreground.
+const SUCCESS: Style = AnsiColor::Green.on_default();
+
+/// Metadata, separators, and other secondary info: dimmed.
+const DIM: Style = Style::new().dimmed();
+
+/// Labels in key-value pairs: bold.
+const LABEL: Style = Style::new().bold();
+
+/// Check whether color output is enabled.
+///
+/// Returns `false` only when the `NO_COLOR` environment variable is set to a
+/// non-empty value. Per <https://no-color.org/>, an unset or empty variable
+/// leaves colors on. The environment is read once; the result is then cached.
+pub fn is_color_enabled() -> bool {
+    static ENABLED: OnceLock<bool> = OnceLock::new();
+    // `filter` drops an empty value so that it counts the same as an unset variable.
+    *ENABLED.get_or_init(|| std::env::var_os("NO_COLOR").filter(|v| !v.is_empty()).is_none())
+}
+
+/// Wrap `text` in the escape codes of `style`, or copy it verbatim when colors are off.
+fn styled(text: &str, style: Style) -> String {
+    if !is_color_enabled() {
+        return text.to_string();
+    }
+    let (start, end) = (style.render(), style.render_reset());
+    format!("{start}{text}{end}")
+}
+
+/// Style `text` as a section header (bold blue); plain text comes back when colors are off.
+pub fn header(text: &str) -> String {
+    styled(text, HEADER)
+}
+
+/// Style `text` as a success value (green); plain text comes back when colors are off.
+pub fn success(text: &str) -> String {
+    styled(text, SUCCESS)
+}
+
+/// Style `text` as dim/secondary output (dimmed); plain text comes back when colors are off.
+pub fn dim(text: &str) -> String {
+    styled(text, DIM)
+}
+
+/// Style `text` as a label (bold); plain text comes back when colors are off.
+pub fn label(text: &str) -> String {
+    styled(text, LABEL)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_styled_returns_plain_text_when_no_color() {
+        // The color decision is cached per process, so check whichever mode is active.
+        let result = styled("hello", SUCCESS);
+        // Escape codes must appear exactly when colors are on; the text always survives.
+        assert_eq!(result.starts_with("\x1b["), is_color_enabled());
+        assert_eq!(result == "hello", !is_color_enabled());
+        assert!(result.contains("hello"));
+    }
+
+    #[test]
+    fn test_styled_applies_ansi_when_style_present() {
+        let style = Style::new().fg_color(Some(anstyle::Color::Ansi(AnsiColor::Green)));
+        let rendered = format!("{}{}{}", style.render(), "ok", style.render_reset());
+        // The rendered string should contain ANSI escape sequences.
+        assert!(rendered.contains("\x1b["));
+        assert!(rendered.contains("ok"));
+    }
+
+    #[test]
+    fn test_helper_functions_return_strings() {
+        // Styled or not, every helper must carry the input text through intact.
+        assert!(header("head").contains("head"));
+        assert!(success("succ").contains("succ"));
+        assert!(dim("dimmed").contains("dimmed"));
+        assert!(label("label").contains("label"));
+    }
+
+    #[test]
+    fn test_is_color_enabled_respects_no_color_env() {
+        // Empty NO_COLOR is skipped: the NO_COLOR spec treats it like an unset variable.
+        let raw = std::env::var_os("NO_COLOR");
+        if raw.as_deref() != Some(std::ffi::OsStr::new("")) {
+            assert_eq!(is_color_enabled(), raw.is_none());
+        }
+    }
+}