This commit is contained in:
86
crates/kreuzberg-cli/Cargo.toml
Normal file
86
crates/kreuzberg-cli/Cargo.toml
Normal file
@@ -0,0 +1,86 @@
|
||||
[package]
|
||||
name = "kreuzberg-cli"
|
||||
version.workspace = true
|
||||
edition.workspace = true
|
||||
rust-version.workspace = true
|
||||
authors.workspace = true
|
||||
description = "Command-line interface for Kreuzberg document intelligence"
|
||||
license.workspace = true
|
||||
repository.workspace = true
|
||||
homepage = "https://kreuzberg.dev"
|
||||
documentation = "https://docs.kreuzberg.dev"
|
||||
keywords = ["document", "extraction", "cli", "tool", "parser"]
|
||||
categories = ["command-line-utilities", "text-processing"]
|
||||
|
||||
[package.metadata.cargo-machete]
|
||||
ignored = ["serde_toon_format"]
|
||||
|
||||
[[bin]]
|
||||
name = "kreuzberg"
|
||||
path = "src/main.rs"
|
||||
|
||||
[features]
|
||||
default = [
|
||||
"embeddings",
|
||||
"html",
|
||||
"liter-llm",
|
||||
"ocr",
|
||||
"paddle-ocr",
|
||||
"layout-detection",
|
||||
"chunking-tokenizers",
|
||||
"tree-sitter",
|
||||
]
|
||||
ort-bundled = ["kreuzberg/ort-bundled"]
|
||||
|
||||
ocr = ["kreuzberg/ocr"]
|
||||
|
||||
api = ["kreuzberg/api"]
|
||||
mcp = ["kreuzberg/mcp"]
|
||||
mcp-http = ["kreuzberg/mcp-http"]
|
||||
embeddings = ["kreuzberg/embeddings"]
|
||||
paddle-ocr = ["kreuzberg/paddle-ocr"]
|
||||
layout-detection = ["kreuzberg/layout-detection"]
|
||||
chunking-tokenizers = ["kreuzberg/chunking-tokenizers"]
|
||||
html = ["kreuzberg/html"]
|
||||
liter-llm = ["kreuzberg/liter-llm"]
|
||||
tree-sitter = ["kreuzberg/tree-sitter", "dep:tree-sitter-language-pack"]
|
||||
all = [
|
||||
"default",
|
||||
"api",
|
||||
"html",
|
||||
"mcp",
|
||||
"mcp-http",
|
||||
"chunking-tokenizers",
|
||||
"tree-sitter",
|
||||
"liter-llm",
|
||||
]
|
||||
|
||||
[dependencies]
|
||||
|
||||
anstyle = "1"
|
||||
anyhow = { workspace = true }
|
||||
base64 = { workspace = true }
|
||||
clap = { workspace = true }
|
||||
clap_complete = "4.6"
|
||||
kreuzberg = { workspace = true, features = [
|
||||
"formats",
|
||||
"analysis",
|
||||
"tokio-runtime",
|
||||
"simd-utf8",
|
||||
"cli",
|
||||
] }
|
||||
serde = { workspace = true }
|
||||
serde_json = { workspace = true }
|
||||
serde_toon_format = { workspace = true }
|
||||
tokio = { workspace = true }
|
||||
tracing = { workspace = true }
|
||||
tracing-subscriber = { version = "0.3", features = ["env-filter", "fmt"] }
|
||||
tree-sitter-language-pack = { workspace = true, features = [
|
||||
"dynamic-loading",
|
||||
"download",
|
||||
"serde",
|
||||
], optional = true }
|
||||
|
||||
[dev-dependencies]
|
||||
tempfile = { workspace = true }
|
||||
ureq = { version = "3.3", features = ["json"] }
|
||||
1027
crates/kreuzberg-cli/README.md
Normal file
1027
crates/kreuzberg-cli/README.md
Normal file
File diff suppressed because it is too large
Load Diff
13
crates/kreuzberg-cli/build.rs
Normal file
13
crates/kreuzberg-cli/build.rs
Normal file
@@ -0,0 +1,13 @@
|
||||
/// Build script for the CLI crate.
///
/// Registers the custom `coverage` cfg with the compiler's cfg checker and adds
/// executable-relative runtime library search paths (rpath) on macOS and Linux
/// targets. Other targets get no extra linker arguments.
fn main() {
    // Without any `rerun-if-*` directive Cargo reruns the script whenever any
    // file in the package changes; the script only depends on itself.
    println!("cargo::rerun-if-changed=build.rs");
    println!("cargo::rustc-check-cfg=cfg(coverage)");

    // Cargo sets TARGET for every build-script invocation, so a missing value
    // is a broken build environment rather than a recoverable condition.
    let target = std::env::var("TARGET").expect("Cargo sets TARGET for build scripts");

    // All directives use the `cargo::` prefix (the script already relied on it
    // for `rustc-check-cfg`); the single-colon form is deprecated.
    // NOTE(review): the rpath entries presumably let shared libraries shipped
    // next to the binary be found at runtime - confirm.
    if target.contains("darwin") {
        println!("cargo::rustc-link-arg=-Wl,-rpath,@loader_path");
        println!("cargo::rustc-link-arg=-Wl,-rpath,@loader_path/.");
    } else if target.contains("linux") {
        println!("cargo::rustc-link-arg=-Wl,-rpath,$ORIGIN");
        println!("cargo::rustc-link-arg=-Wl,-rpath,$ORIGIN/.");
    }
}
|
||||
466
crates/kreuzberg-cli/src/commands/cache.rs
Normal file
466
crates/kreuzberg-cli/src/commands/cache.rs
Normal file
@@ -0,0 +1,466 @@
|
||||
//! Cache command - Manage cache operations
|
||||
//!
|
||||
//! This module provides commands for cache management including statistics,
|
||||
//! clearing, manifest generation, and model warming.
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use kreuzberg::cache;
|
||||
use serde_json::json;
|
||||
use std::path::PathBuf;
|
||||
|
||||
use crate::{WireFormat, style};
|
||||
|
||||
/// Execute cache stats command
|
||||
pub fn stats_command(cache_dir: Option<PathBuf>, format: WireFormat) -> Result<()> {
|
||||
let default_cache_dir = std::env::current_dir()
|
||||
.context("Failed to get current directory")?
|
||||
.join(".kreuzberg");
|
||||
|
||||
let cache_path = cache_dir.unwrap_or(default_cache_dir);
|
||||
let cache_dir_str = cache_path.to_string_lossy();
|
||||
|
||||
let stats = cache::get_cache_metadata(&cache_dir_str).with_context(|| {
|
||||
format!(
|
||||
"Failed to get cache statistics from directory '{}'. Ensure the directory exists and is readable.",
|
||||
cache_dir_str
|
||||
)
|
||||
})?;
|
||||
|
||||
match format {
|
||||
WireFormat::Text => {
|
||||
println!("{}", style::header("Cache Statistics"));
|
||||
println!("{}", style::dim("================"));
|
||||
println!("{} {}", style::label("Directory:"), style::success(&cache_dir_str));
|
||||
println!("{} {}", style::label("Total files:"), stats.total_files);
|
||||
println!("{} {:.2} MB", style::label("Total size:"), stats.total_size_mb);
|
||||
println!(
|
||||
"{} {:.2} MB",
|
||||
style::label("Available space:"),
|
||||
stats.available_space_mb
|
||||
);
|
||||
println!(
|
||||
"{} {:.2} days",
|
||||
style::label("Oldest file age:"),
|
||||
stats.oldest_file_age_days
|
||||
);
|
||||
println!(
|
||||
"{} {:.2} days",
|
||||
style::label("Newest file age:"),
|
||||
stats.newest_file_age_days
|
||||
);
|
||||
}
|
||||
WireFormat::Json => {
|
||||
let output = json!({
|
||||
"directory": cache_dir_str,
|
||||
"total_files": stats.total_files,
|
||||
"total_size_mb": stats.total_size_mb,
|
||||
"available_space_mb": stats.available_space_mb,
|
||||
"oldest_file_age_days": stats.oldest_file_age_days,
|
||||
"newest_file_age_days": stats.newest_file_age_days,
|
||||
});
|
||||
println!(
|
||||
"{}",
|
||||
serde_json::to_string_pretty(&output).context("Failed to serialize cache statistics to JSON")?
|
||||
);
|
||||
}
|
||||
WireFormat::Toon => {
|
||||
let output = json!({
|
||||
"directory": cache_dir_str,
|
||||
"total_files": stats.total_files,
|
||||
"total_size_mb": stats.total_size_mb,
|
||||
"available_space_mb": stats.available_space_mb,
|
||||
"oldest_file_age_days": stats.oldest_file_age_days,
|
||||
"newest_file_age_days": stats.newest_file_age_days,
|
||||
});
|
||||
println!(
|
||||
"{}",
|
||||
serde_toon::to_string(&output).context("Failed to serialize cache statistics to TOON")?
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Execute cache clear command
|
||||
pub fn clear_command(cache_dir: Option<PathBuf>, format: WireFormat) -> Result<()> {
|
||||
let default_cache_dir = std::env::current_dir()
|
||||
.context("Failed to get current directory")?
|
||||
.join(".kreuzberg");
|
||||
|
||||
let cache_path = cache_dir.unwrap_or(default_cache_dir);
|
||||
let cache_dir_str = cache_path.to_string_lossy();
|
||||
|
||||
let (removed_files, freed_mb) = cache::clear_cache_directory(&cache_dir_str).with_context(|| {
|
||||
format!(
|
||||
"Failed to clear cache directory '{}'. Ensure you have write permissions.",
|
||||
cache_dir_str
|
||||
)
|
||||
})?;
|
||||
|
||||
match format {
|
||||
WireFormat::Text => {
|
||||
println!("{}", style::success("Cache cleared successfully"));
|
||||
println!("{} {}", style::label("Directory:"), style::success(&cache_dir_str));
|
||||
println!("{} {}", style::label("Removed files:"), removed_files);
|
||||
println!("{} {:.2} MB", style::label("Freed space:"), freed_mb);
|
||||
}
|
||||
WireFormat::Json => {
|
||||
let output = json!({
|
||||
"directory": cache_dir_str,
|
||||
"removed_files": removed_files,
|
||||
"freed_mb": freed_mb,
|
||||
});
|
||||
println!(
|
||||
"{}",
|
||||
serde_json::to_string_pretty(&output).context("Failed to serialize cache clear results to JSON")?
|
||||
);
|
||||
}
|
||||
WireFormat::Toon => {
|
||||
let output = json!({
|
||||
"directory": cache_dir_str,
|
||||
"removed_files": removed_files,
|
||||
"freed_mb": freed_mb,
|
||||
});
|
||||
println!(
|
||||
"{}",
|
||||
serde_toon::to_string(&output).context("Failed to serialize cache clear results to TOON")?
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Execute cache manifest command - outputs expected model files with checksums.
|
||||
pub fn manifest_command(format: WireFormat) -> Result<()> {
|
||||
// Without at least one model-providing feature, every `extend` call
|
||||
// below is `#[cfg]`-stripped and `entries: Vec<_>` has no anchor for
|
||||
// type inference — `e.size_bytes` on the closure further down then
|
||||
// fails compilation with E0282. Bail with a clear error instead so
|
||||
// (or similar minimal configurations) succeeds.
|
||||
#[cfg(not(any(feature = "paddle-ocr", feature = "layout-detection")))]
|
||||
{
|
||||
let _ = format;
|
||||
anyhow::bail!(
|
||||
"manifest command unavailable: build kreuzberg-cli with at least one of \
|
||||
--features \"paddle-ocr\" or --features \"layout-detection\""
|
||||
);
|
||||
}
|
||||
|
||||
#[cfg(any(feature = "paddle-ocr", feature = "layout-detection"))]
|
||||
{
|
||||
manifest_command_inner(format)
|
||||
}
|
||||
}
|
||||
|
||||
/// Collects the model manifests of every compiled-in model source and prints
/// them as a table (text) or as a structured document (JSON / TOON).
///
/// Only compiled when `paddle-ocr` or `layout-detection` is enabled, so at
/// least one `extend` call below survives and fixes the entry type.
#[cfg(any(feature = "paddle-ocr", feature = "layout-detection"))]
fn manifest_command_inner(format: WireFormat) -> Result<()> {
    let mut models = Vec::new();

    #[cfg(feature = "paddle-ocr")]
    {
        models.extend(kreuzberg::paddle_ocr::ModelManager::manifest());
    }

    #[cfg(feature = "layout-detection")]
    {
        models.extend(kreuzberg::layout::LayoutModelManager::manifest());
    }

    #[cfg(feature = "paddle-ocr")]
    {
        models.extend(kreuzberg::ocr::TessdataManager::manifest());
    }

    let total_size_bytes: u64 = models.iter().map(|model| model.size_bytes).sum();
    let version = env!("CARGO_PKG_VERSION");

    match format {
        WireFormat::Text => {
            let title = style::header("Model Manifest");
            let version_note = format!("(kreuzberg {})", version);
            println!("{} {}", title, style::dim(&version_note));
            println!("{}", style::dim("===================================="));
            println!(
                "{:<50} {:>12} {}",
                style::label("PATH"),
                style::label("SIZE"),
                style::label("SHA256")
            );
            let rule = format!("{:<50} {:>12} ------", "----", "----");
            println!("{}", style::dim(&rule));

            for model in &models {
                // A zero size means the manifest does not know the file size.
                let size_column = match model.size_bytes {
                    0 => "unknown".to_string(),
                    bytes => format!("{:.1} MB", bytes as f64 / 1_048_576.0),
                };
                // Show at most the first 12 characters of the checksum, or a
                // dash when the manifest carries none.
                let sha_column = match model.sha256.len() {
                    0 => "-",
                    len if len >= 12 => &model.sha256[..12],
                    _ => &model.sha256[..],
                };
                println!(
                    "{:<50} {:>12} {}",
                    model.relative_path,
                    size_column,
                    style::dim(sha_column)
                );
            }

            println!();
            println!(
                "{} {} files, {:.1} MB",
                style::label("Total:"),
                models.len(),
                total_size_bytes as f64 / 1_048_576.0
            );
        }
        // JSON and TOON share one payload; only the serializer differs.
        WireFormat::Json | WireFormat::Toon => {
            let payload = json!({
                "kreuzberg_version": version,
                "total_size_bytes": total_size_bytes,
                "model_count": models.len(),
                "models": models,
            });
            let rendered = if matches!(format, WireFormat::Json) {
                serde_json::to_string_pretty(&payload).context("Failed to serialize manifest to JSON")?
            } else {
                serde_toon::to_string(&payload).context("Failed to serialize manifest to TOON")?
            };
            println!("{}", rendered);
        }
    }

    Ok(())
}
|
||||
|
||||
/// Execute cache warm command - eagerly downloads all models.
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub fn warm_command(
|
||||
cache_dir: Option<PathBuf>,
|
||||
format: WireFormat,
|
||||
all_embeddings: bool,
|
||||
embedding_model: Option<String>,
|
||||
all_table_models: bool,
|
||||
all_grammars: bool,
|
||||
grammar_groups: Option<Vec<String>>,
|
||||
grammars: Option<Vec<String>>,
|
||||
) -> Result<()> {
|
||||
let cache_base = resolve_cache_base(cache_dir);
|
||||
|
||||
let mut downloaded: Vec<String> = Vec::new();
|
||||
let mut already_cached: Vec<String> = Vec::new();
|
||||
|
||||
#[cfg(feature = "paddle-ocr")]
|
||||
{
|
||||
let paddle_dir = cache_base.join("paddle-ocr");
|
||||
let manager = kreuzberg::paddle_ocr::ModelManager::new(paddle_dir);
|
||||
|
||||
// ensure_all_models downloads v2 det (server+mobile), cls (PP-LCNet),
|
||||
// doc_ori, v2 unified rec models, and all per-script rec families
|
||||
manager
|
||||
.ensure_all_models()
|
||||
.context("Failed to download PaddleOCR v2 models")?;
|
||||
downloaded.push("paddle-ocr v2 (server+mobile det, cls, doc_ori, unified+per-script rec)".to_string());
|
||||
}
|
||||
|
||||
#[cfg(feature = "layout-detection")]
|
||||
{
|
||||
let layout_dir = cache_base.join("layout");
|
||||
let manager = kreuzberg::layout::LayoutModelManager::new(Some(layout_dir));
|
||||
|
||||
if all_table_models {
|
||||
// Download rtdetr + tatr + all SLANeXT variants (~730MB)
|
||||
let was_cached = manager.is_rtdetr_cached() && manager.is_tatr_cached();
|
||||
if was_cached {
|
||||
already_cached.push("layout (rtdetr, tatr, slanet variants)".to_string());
|
||||
} else {
|
||||
manager
|
||||
.ensure_all_models()
|
||||
.context("Failed to download layout models")?;
|
||||
downloaded.push("layout (rtdetr, tatr, slanet variants)".to_string());
|
||||
}
|
||||
} else {
|
||||
// Default: download only rtdetr + tatr
|
||||
let was_cached = manager.is_rtdetr_cached() && manager.is_tatr_cached();
|
||||
if was_cached {
|
||||
already_cached.push("layout (rtdetr, tatr)".to_string());
|
||||
} else {
|
||||
manager
|
||||
.ensure_default_models()
|
||||
.context("Failed to download layout models")?;
|
||||
downloaded.push("layout (rtdetr, tatr)".to_string());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(feature = "paddle-ocr")]
|
||||
{
|
||||
let tessdata_dir = cache_base.join("tessdata");
|
||||
let manager = kreuzberg::ocr::TessdataManager::new(Some(tessdata_dir));
|
||||
|
||||
let newly_downloaded = manager
|
||||
.ensure_all_languages()
|
||||
.context("Failed to download tessdata files")?;
|
||||
|
||||
if newly_downloaded > 0 {
|
||||
downloaded.push(format!("tessdata ({newly_downloaded} languages)"));
|
||||
} else {
|
||||
already_cached.push("tessdata (all languages)".to_string());
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(feature = "embeddings")]
|
||||
{
|
||||
let embeddings_dir = cache_base.join("embeddings");
|
||||
let presets_to_warm: Vec<kreuzberg::EmbeddingPreset> = if all_embeddings {
|
||||
kreuzberg::list_embedding_presets()
|
||||
.into_iter()
|
||||
.filter_map(|name| kreuzberg::get_embedding_preset(&name))
|
||||
.collect()
|
||||
} else if let Some(ref name) = embedding_model {
|
||||
match kreuzberg::get_embedding_preset(name) {
|
||||
Some(preset) => vec![preset],
|
||||
None => {
|
||||
let available = kreuzberg::list_embedding_presets();
|
||||
anyhow::bail!(
|
||||
"Unknown embedding preset '{}'. Available: {}",
|
||||
name,
|
||||
available.join(", ")
|
||||
);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
vec![]
|
||||
};
|
||||
|
||||
for preset in &presets_to_warm {
|
||||
let label = format!("embedding ({})", preset.name);
|
||||
kreuzberg::embeddings::warm_model(
|
||||
&kreuzberg::core::config::EmbeddingModelType::Preset {
|
||||
name: preset.name.clone(),
|
||||
},
|
||||
Some(embeddings_dir.clone()),
|
||||
)
|
||||
.map_err(|e| anyhow::anyhow!("Failed to download embedding model '{}': {}", preset.name, e))?;
|
||||
downloaded.push(label);
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(not(feature = "embeddings"))]
|
||||
{
|
||||
if all_embeddings || embedding_model.is_some() {
|
||||
anyhow::bail!("Embedding model warming requires the 'embeddings' feature to be enabled");
|
||||
}
|
||||
}
|
||||
|
||||
// Tree-sitter grammar downloads
|
||||
#[cfg(feature = "tree-sitter")]
|
||||
{
|
||||
if all_grammars {
|
||||
let count =
|
||||
tree_sitter_language_pack::download_all().context("Failed to download all tree-sitter grammars")?;
|
||||
if count > 0 {
|
||||
downloaded.push(format!("tree-sitter grammars ({count} languages)"));
|
||||
} else {
|
||||
already_cached.push("tree-sitter grammars (all)".to_string());
|
||||
}
|
||||
} else if let Some(ref groups) = grammar_groups {
|
||||
let config = tree_sitter_language_pack::PackConfig {
|
||||
cache_dir: None,
|
||||
languages: None,
|
||||
groups: Some(groups.clone()),
|
||||
};
|
||||
tree_sitter_language_pack::init(&config).context("Failed to download tree-sitter grammar groups")?;
|
||||
downloaded.push(format!("tree-sitter grammars (groups: {})", groups.join(", ")));
|
||||
} else if let Some(ref langs) = grammars {
|
||||
let refs: Vec<&str> = langs.iter().map(String::as_str).collect();
|
||||
let count =
|
||||
tree_sitter_language_pack::download(&refs).context("Failed to download tree-sitter grammars")?;
|
||||
if count > 0 {
|
||||
downloaded.push(format!("tree-sitter grammars ({count} languages)"));
|
||||
} else {
|
||||
already_cached.push(format!("tree-sitter grammars ({})", langs.join(", ")));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(not(feature = "tree-sitter"))]
|
||||
{
|
||||
if all_grammars || grammar_groups.is_some() || grammars.is_some() {
|
||||
anyhow::bail!("Tree-sitter grammar warming requires the 'tree-sitter' feature to be enabled");
|
||||
}
|
||||
}
|
||||
|
||||
match format {
|
||||
WireFormat::Text => {
|
||||
if !downloaded.is_empty() {
|
||||
println!("{}", style::label("Downloaded:"));
|
||||
for d in &downloaded {
|
||||
println!(" {}", style::success(d));
|
||||
}
|
||||
}
|
||||
if !already_cached.is_empty() {
|
||||
println!("{}", style::label("Already cached:"));
|
||||
for c in &already_cached {
|
||||
println!(" {}", style::dim(c));
|
||||
}
|
||||
}
|
||||
println!(
|
||||
"All models ready in {}",
|
||||
style::success(&cache_base.display().to_string())
|
||||
);
|
||||
}
|
||||
WireFormat::Json => {
|
||||
let output = json!({
|
||||
"cache_dir": cache_base.to_string_lossy(),
|
||||
"downloaded": downloaded,
|
||||
"already_cached": already_cached,
|
||||
});
|
||||
println!(
|
||||
"{}",
|
||||
serde_json::to_string_pretty(&output).context("Failed to serialize warm results to JSON")?
|
||||
);
|
||||
}
|
||||
WireFormat::Toon => {
|
||||
let output = json!({
|
||||
"cache_dir": cache_base.to_string_lossy(),
|
||||
"downloaded": downloaded,
|
||||
"already_cached": already_cached,
|
||||
});
|
||||
println!(
|
||||
"{}",
|
||||
serde_toon::to_string(&output).context("Failed to serialize warm results to TOON")?
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Resolve the cache base directory.
///
/// Resolution order:
/// 1. The explicit `cache_dir`, when provided.
/// 2. The `KREUZBERG_CACHE_DIR` environment variable, when set to a non-empty
///    value.
/// 3. `.kreuzberg` under the current directory, or under `.` when the current
///    directory cannot be determined.
fn resolve_cache_base(cache_dir: Option<PathBuf>) -> PathBuf {
    if let Some(dir) = cache_dir {
        return dir;
    }

    // `var_os` keeps paths that are not valid UTF-8 (which `var` would reject
    // and silently skip). An empty value is treated as unset, because an empty
    // path would make every joined sub-directory relative to the working
    // directory.
    let from_env = std::env::var_os("KREUZBERG_CACHE_DIR").filter(|value| !value.is_empty());
    if let Some(env_path) = from_env {
        return PathBuf::from(env_path);
    }

    std::env::current_dir()
        .unwrap_or_else(|_| PathBuf::from("."))
        .join(".kreuzberg")
}
|
||||
61
crates/kreuzberg-cli/src/commands/chunk.rs
Normal file
61
crates/kreuzberg-cli/src/commands/chunk.rs
Normal file
@@ -0,0 +1,61 @@
|
||||
//! Chunk command implementation.
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
|
||||
use crate::{WireFormat, style};
|
||||
|
||||
/// Execute the chunk command: split text into chunks.
|
||||
pub fn chunk_command(text: String, config: kreuzberg::ChunkingConfig, format: WireFormat) -> Result<()> {
|
||||
if text.is_empty() {
|
||||
anyhow::bail!("No text provided for chunking. Provide --text or pipe text via stdin.");
|
||||
}
|
||||
|
||||
let result = kreuzberg::chunking::chunk_text(&text, &config, None).context("Failed to chunk text")?;
|
||||
|
||||
match format {
|
||||
WireFormat::Json => {
|
||||
let chunks: Vec<&str> = result.chunks.iter().map(|c| c.content.as_str()).collect();
|
||||
let output = serde_json::json!({
|
||||
"chunks": chunks,
|
||||
"chunk_count": result.chunk_count,
|
||||
"config": {
|
||||
"max_characters": config.max_characters,
|
||||
"overlap": config.overlap,
|
||||
"chunker_type": format!("{:?}", config.chunker_type),
|
||||
},
|
||||
"input_size_bytes": text.len(),
|
||||
});
|
||||
println!(
|
||||
"{}",
|
||||
serde_json::to_string_pretty(&output).context("Failed to serialize chunks to JSON")?
|
||||
);
|
||||
}
|
||||
WireFormat::Toon => {
|
||||
let chunks: Vec<&str> = result.chunks.iter().map(|c| c.content.as_str()).collect();
|
||||
let output = serde_json::json!({
|
||||
"chunks": chunks,
|
||||
"chunk_count": result.chunk_count,
|
||||
"config": {
|
||||
"max_characters": config.max_characters,
|
||||
"overlap": config.overlap,
|
||||
"chunker_type": format!("{:?}", config.chunker_type),
|
||||
},
|
||||
"input_size_bytes": text.len(),
|
||||
});
|
||||
println!(
|
||||
"{}",
|
||||
serde_toon::to_string(&output).context("Failed to serialize chunks to TOON")?
|
||||
);
|
||||
}
|
||||
WireFormat::Text => {
|
||||
for (i, chunk) in result.chunks.iter().enumerate() {
|
||||
if result.chunks.len() > 1 {
|
||||
println!("{}", style::dim(&format!("--- chunk {} ---", i + 1)));
|
||||
}
|
||||
println!("{}", chunk.content);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
51
crates/kreuzberg-cli/src/commands/config.rs
Normal file
51
crates/kreuzberg-cli/src/commands/config.rs
Normal file
@@ -0,0 +1,51 @@
|
||||
//! Config command - Configuration loading and discovery
|
||||
//!
|
||||
//! This module provides utilities for loading extraction configuration from files
|
||||
//! or discovering them automatically in the project directory.
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use kreuzberg::ExtractionConfig;
|
||||
use std::path::PathBuf;
|
||||
|
||||
/// Loads extraction configuration from a file or discovers it automatically.
|
||||
///
|
||||
/// This function implements the CLI's configuration hierarchy:
|
||||
/// 1. Explicit config file (if `--config` flag provided)
|
||||
/// 2. Auto-discovered config (searches `kreuzberg.{toml,yaml,json}` in current and parent directories)
|
||||
/// 3. Default configuration (if no config file found)
|
||||
///
|
||||
/// # Configuration File Formats
|
||||
///
|
||||
/// Supports three formats, determined by file extension:
|
||||
/// - `.toml`: TOML format (recommended for humans)
|
||||
/// - `.yaml` / `.yml`: YAML format
|
||||
/// - `.json`: JSON format
|
||||
///
|
||||
/// # Errors
|
||||
///
|
||||
/// Returns an error if:
|
||||
/// - Explicit config file has unsupported extension (must be .toml, .yaml, .yml, or .json)
|
||||
/// - Config file cannot be read or parsed
|
||||
/// - Config file contains invalid extraction settings
|
||||
pub fn load_config(config_path: Option<PathBuf>) -> Result<ExtractionConfig> {
|
||||
if let Some(path) = config_path {
|
||||
let path_str = path.to_string_lossy();
|
||||
let path_lower = path_str.to_lowercase();
|
||||
let config = if path_lower.ends_with(".toml") {
|
||||
ExtractionConfig::from_toml_file(&path)
|
||||
} else if path_lower.ends_with(".yaml") || path_lower.ends_with(".yml") {
|
||||
ExtractionConfig::from_yaml_file(&path)
|
||||
} else if path_lower.ends_with(".json") {
|
||||
ExtractionConfig::from_json_file(&path)
|
||||
} else {
|
||||
anyhow::bail!("Config file must have .toml, .yaml, .yml, or .json extension (case-insensitive)");
|
||||
};
|
||||
config.with_context(|| format!("Failed to load configuration from '{}'. Ensure the file exists, is readable, and contains valid configuration.", path.display()))
|
||||
} else {
|
||||
match ExtractionConfig::discover() {
|
||||
Ok(Some(config)) => Ok(config),
|
||||
Ok(None) => Ok(ExtractionConfig::default()),
|
||||
Err(e) => Err(e).context("Failed to auto-discover configuration file. Searched for kreuzberg.{toml,yaml,json} in current and parent directories. Use --config to specify an explicit path."),
|
||||
}
|
||||
}
|
||||
}
|
||||
161
crates/kreuzberg-cli/src/commands/embed.rs
Normal file
161
crates/kreuzberg-cli/src/commands/embed.rs
Normal file
@@ -0,0 +1,161 @@
|
||||
//! Embed command implementation.
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
|
||||
use crate::{WireFormat, style};
|
||||
|
||||
/// Execute the embed command: generate embeddings for input texts.
|
||||
///
|
||||
/// When `provider` is `"local"` (default), uses the ONNX preset model.
|
||||
/// When `provider` is `"llm"`, uses liter-llm with the specified model and API key.
|
||||
/// When `provider` is `"plugin"`, dispatches to a pre-registered in-process embedding backend.
|
||||
pub fn embed_command(
|
||||
texts: Vec<String>,
|
||||
preset: &str,
|
||||
provider: &str,
|
||||
llm_model: Option<String>,
|
||||
llm_api_key: Option<String>,
|
||||
plugin_name: Option<String>,
|
||||
format: WireFormat,
|
||||
) -> Result<()> {
|
||||
if texts.is_empty() {
|
||||
anyhow::bail!("No texts provided for embedding. Provide --text or pipe text via stdin.");
|
||||
}
|
||||
|
||||
// Validate no empty texts
|
||||
for (i, t) in texts.iter().enumerate() {
|
||||
if t.is_empty() {
|
||||
anyhow::bail!("Text at position {} is empty. All texts must be non-empty.", i + 1);
|
||||
}
|
||||
}
|
||||
|
||||
let (config, model_label) = match provider {
|
||||
"llm" => {
|
||||
let model = llm_model.as_deref().ok_or_else(|| {
|
||||
anyhow::anyhow!(
|
||||
"--model is required when --provider is 'llm' (e.g., --model openai/text-embedding-3-small)"
|
||||
)
|
||||
})?;
|
||||
|
||||
let llm_config = kreuzberg::LlmConfig {
|
||||
model: model.to_string(),
|
||||
api_key: llm_api_key,
|
||||
base_url: None,
|
||||
timeout_secs: None,
|
||||
max_retries: None,
|
||||
temperature: None,
|
||||
max_tokens: None,
|
||||
};
|
||||
|
||||
let config = kreuzberg::EmbeddingConfig {
|
||||
model: kreuzberg::EmbeddingModelType::Llm { llm: llm_config },
|
||||
show_download_progress: true,
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
(config, model.to_string())
|
||||
}
|
||||
"local" | "" => {
|
||||
// Validate preset for local provider
|
||||
let _preset_info = kreuzberg::get_embedding_preset(preset).with_context(|| {
|
||||
format!(
|
||||
"Unknown embedding preset '{}'. Available: {:?}",
|
||||
preset,
|
||||
kreuzberg::list_embedding_presets()
|
||||
)
|
||||
})?;
|
||||
|
||||
let config = kreuzberg::EmbeddingConfig {
|
||||
model: kreuzberg::EmbeddingModelType::Preset {
|
||||
name: preset.to_string(),
|
||||
},
|
||||
show_download_progress: true,
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
(config, preset.to_string())
|
||||
}
|
||||
"plugin" => {
|
||||
let name = plugin_name.as_deref().ok_or_else(|| {
|
||||
anyhow::anyhow!(
|
||||
"--plugin NAME is required when --provider is 'plugin'. Register a backend via kreuzberg::plugins::register_embedding_backend first."
|
||||
)
|
||||
})?;
|
||||
if name.is_empty() {
|
||||
anyhow::bail!("--plugin NAME must not be empty.");
|
||||
}
|
||||
|
||||
// Pre-flight: surface unknown backends with a list of registered names
|
||||
// (parity with the REST handler, which returns 422 for the same case).
|
||||
let available =
|
||||
kreuzberg::plugins::list_embedding_backends().context("Failed to read embedding backend registry")?;
|
||||
if !available.iter().any(|n| n == name) {
|
||||
anyhow::bail!(
|
||||
"Embedding backend '{}' is not registered. Available backends: {}",
|
||||
name,
|
||||
if available.is_empty() {
|
||||
"(none registered)".to_string()
|
||||
} else {
|
||||
available.join(", ")
|
||||
}
|
||||
);
|
||||
}
|
||||
|
||||
let config = kreuzberg::EmbeddingConfig {
|
||||
model: kreuzberg::EmbeddingModelType::Plugin { name: name.to_string() },
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
(config, name.to_string())
|
||||
}
|
||||
other => {
|
||||
anyhow::bail!(
|
||||
"Unknown embedding provider '{}'. Valid providers: 'local' (default, ONNX), 'llm' (liter-llm), or 'plugin' (in-process backend).",
|
||||
other
|
||||
);
|
||||
}
|
||||
};
|
||||
|
||||
// Generate embeddings
|
||||
let embeddings = kreuzberg::embed_texts(texts.clone(), &config).context("Failed to generate embeddings")?;
|
||||
|
||||
let dimensions = embeddings.first().map(|e| e.len()).unwrap_or(0);
|
||||
|
||||
match format {
|
||||
WireFormat::Json => {
|
||||
let output = serde_json::json!({
|
||||
"embeddings": embeddings,
|
||||
"model": model_label,
|
||||
"dimensions": dimensions,
|
||||
"count": embeddings.len(),
|
||||
});
|
||||
println!(
|
||||
"{}",
|
||||
serde_json::to_string_pretty(&output).context("Failed to serialize embeddings to JSON")?
|
||||
);
|
||||
}
|
||||
WireFormat::Toon => {
|
||||
let output = serde_json::json!({
|
||||
"embeddings": embeddings,
|
||||
"model": model_label,
|
||||
"dimensions": dimensions,
|
||||
"count": embeddings.len(),
|
||||
});
|
||||
println!(
|
||||
"{}",
|
||||
serde_toon::to_string(&output).context("Failed to serialize embeddings to TOON")?
|
||||
);
|
||||
}
|
||||
WireFormat::Text => {
|
||||
for (i, embedding) in embeddings.iter().enumerate() {
|
||||
if texts.len() > 1 {
|
||||
println!("{}", style::dim(&format!("# text {}", i + 1)));
|
||||
}
|
||||
let values: Vec<String> = embedding.iter().map(|v| format!("{v}")).collect();
|
||||
println!("{}", values.join(","));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
180
crates/kreuzberg-cli/src/commands/extract.rs
Normal file
180
crates/kreuzberg-cli/src/commands/extract.rs
Normal file
@@ -0,0 +1,180 @@
|
||||
//! Extract command - Extract text and data from documents
|
||||
//!
|
||||
//! This module provides the extract and batch extract commands for processing single
|
||||
//! or multiple documents with customizable extraction configurations.
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use kreuzberg::{
|
||||
BatchFileItem, ExtractionConfig, ExtractionResult, FileExtractionConfig, batch_extract_files_sync,
|
||||
extract_file_sync,
|
||||
};
|
||||
use std::path::PathBuf;
|
||||
use std::time::Instant;
|
||||
|
||||
use crate::{
|
||||
WireFormat,
|
||||
output::{BatchEnvelope, ExtractEnvelope},
|
||||
style,
|
||||
};
|
||||
|
||||
/// Execute single document extraction command
|
||||
pub fn extract_command(
|
||||
path: PathBuf,
|
||||
config: ExtractionConfig,
|
||||
mime_type: Option<String>,
|
||||
format: WireFormat,
|
||||
) -> Result<()> {
|
||||
let path_str = path.to_string_lossy().to_string();
|
||||
|
||||
let t0 = Instant::now();
|
||||
let result = extract_file_sync(&path_str, mime_type.as_deref(), &config).with_context(|| {
|
||||
format!(
|
||||
"Failed to extract file '{}'. Ensure the file is readable and the format is supported.",
|
||||
path.display()
|
||||
)
|
||||
})?;
|
||||
let extraction_time_ms = t0.elapsed().as_secs_f64() * 1000.0;
|
||||
|
||||
match format {
|
||||
WireFormat::Text => {
|
||||
print!("{}", result.content);
|
||||
}
|
||||
WireFormat::Json => {
|
||||
let envelope = ExtractEnvelope {
|
||||
result,
|
||||
extraction_time_ms,
|
||||
};
|
||||
println!(
|
||||
"{}",
|
||||
serde_json::to_string_pretty(&envelope).context("Failed to serialize extraction result to JSON")?
|
||||
);
|
||||
}
|
||||
WireFormat::Toon => {
|
||||
println!(
|
||||
"{}",
|
||||
serde_toon::to_string(&result).context("Failed to serialize extraction result to TOON")?
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Execute batch extraction command with optional per-file configuration overrides
|
||||
pub fn batch_command(
|
||||
paths: Vec<PathBuf>,
|
||||
file_configs_map: Option<std::collections::HashMap<String, serde_json::Value>>,
|
||||
config: ExtractionConfig,
|
||||
format: WireFormat,
|
||||
) -> Result<()> {
|
||||
match format {
|
||||
WireFormat::Json => {
|
||||
// Run files one at a time to capture per-file wall-clock timings.
|
||||
// Per-file config overrides are honoured: files without an override use the
|
||||
// batch-level config directly; files with an override use a one-shot batch of
|
||||
// one item so the library's own merge logic applies.
|
||||
let mut results: Vec<ExtractionResult> = Vec::with_capacity(paths.len());
|
||||
let mut per_file_ms: Vec<f64> = Vec::with_capacity(paths.len());
|
||||
let total_t0 = Instant::now();
|
||||
|
||||
for path in &paths {
|
||||
let path_str = path.to_string_lossy().to_string();
|
||||
let has_file_config = file_configs_map.as_ref().and_then(|m| m.get(&path_str)).is_some();
|
||||
|
||||
let t0 = Instant::now();
|
||||
let result = if has_file_config {
|
||||
// Delegate to the batch API (one item) so per-file merge logic is applied.
|
||||
let file_config = file_configs_map
|
||||
.as_ref()
|
||||
.and_then(|m| m.get(&path_str))
|
||||
.map(|v| {
|
||||
serde_json::from_value::<FileExtractionConfig>(v.clone())
|
||||
.with_context(|| format!("Failed to parse file config for '{}'", path_str))
|
||||
})
|
||||
.transpose()?;
|
||||
let mut batch_results = batch_extract_files_sync(
|
||||
vec![BatchFileItem {
|
||||
path: path.clone(),
|
||||
config: file_config,
|
||||
}],
|
||||
&config,
|
||||
)
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to extract file '{}'. Ensure the file is readable and the format is supported.",
|
||||
path.display()
|
||||
)
|
||||
})?;
|
||||
batch_results.remove(0)
|
||||
} else {
|
||||
extract_file_sync(&path_str, None, &config).with_context(|| {
|
||||
format!(
|
||||
"Failed to extract file '{}'. Ensure the file is readable and the format is supported.",
|
||||
path.display()
|
||||
)
|
||||
})?
|
||||
};
|
||||
per_file_ms.push(t0.elapsed().as_secs_f64() * 1000.0);
|
||||
results.push(result);
|
||||
}
|
||||
|
||||
let total_ms = total_t0.elapsed().as_secs_f64() * 1000.0;
|
||||
let envelope = BatchEnvelope {
|
||||
results,
|
||||
total_ms,
|
||||
per_file_ms,
|
||||
};
|
||||
println!(
|
||||
"{}",
|
||||
serde_json::to_string_pretty(&envelope)
|
||||
.context("Failed to serialize batch extraction results to JSON")?
|
||||
);
|
||||
}
|
||||
WireFormat::Text => {
|
||||
let results = run_batch_sync(&paths, file_configs_map.as_ref(), &config)?;
|
||||
for (i, result) in results.iter().enumerate() {
|
||||
println!("{}", style::header(&format!("=== Document {} ===", i + 1)));
|
||||
println!("{} {}", style::label("MIME Type:"), style::success(&result.mime_type));
|
||||
println!("{}\n{}", style::label("Content:"), result.content);
|
||||
println!();
|
||||
}
|
||||
}
|
||||
WireFormat::Toon => {
|
||||
let results = run_batch_sync(&paths, file_configs_map.as_ref(), &config)?;
|
||||
println!(
|
||||
"{}",
|
||||
serde_toon::to_string(&results).context("Failed to serialize batch extraction results to TOON")?
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Run batch extraction using the synchronous batch API for non-JSON output paths.
|
||||
fn run_batch_sync(
|
||||
paths: &[PathBuf],
|
||||
file_configs_map: Option<&std::collections::HashMap<String, serde_json::Value>>,
|
||||
config: &ExtractionConfig,
|
||||
) -> Result<Vec<ExtractionResult>> {
|
||||
let items: Vec<BatchFileItem> = paths
|
||||
.iter()
|
||||
.map(|p| {
|
||||
let path_str = p.to_string_lossy().to_string();
|
||||
let file_config = file_configs_map
|
||||
.and_then(|m| m.get(&path_str))
|
||||
.map(|v| {
|
||||
serde_json::from_value::<FileExtractionConfig>(v.clone())
|
||||
.with_context(|| format!("Failed to parse file config for '{}'", path_str))
|
||||
})
|
||||
.transpose()?;
|
||||
Ok(BatchFileItem {
|
||||
path: p.clone(),
|
||||
config: file_config,
|
||||
})
|
||||
})
|
||||
.collect::<Result<Vec<_>>>()?;
|
||||
|
||||
batch_extract_files_sync(items, config)
|
||||
.context("Failed to batch extract documents. Check that all files are readable and formats are supported.")
|
||||
}
|
||||
116
crates/kreuzberg-cli/src/commands/extract_structured.rs
Normal file
116
crates/kreuzberg-cli/src/commands/extract_structured.rs
Normal file
@@ -0,0 +1,116 @@
|
||||
//! Extract structured command - Extract structured data from documents using an LLM.
|
||||
//!
|
||||
//! Reads a JSON schema file, configures LLM-based structured extraction, and
|
||||
//! outputs the structured result parsed from the document.
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use kreuzberg::{LlmConfig, StructuredExtractionConfig, extract_file_sync};
|
||||
use std::path::PathBuf;
|
||||
|
||||
use crate::WireFormat;
|
||||
|
||||
/// Arguments for the extract-structured command.
|
||||
pub struct ExtractStructuredArgs {
|
||||
pub path: PathBuf,
|
||||
pub schema_path: PathBuf,
|
||||
pub model: String,
|
||||
pub api_key: Option<String>,
|
||||
pub prompt: Option<String>,
|
||||
pub schema_name: Option<String>,
|
||||
pub strict: bool,
|
||||
pub config_path: Option<PathBuf>,
|
||||
pub format: WireFormat,
|
||||
}
|
||||
|
||||
/// Execute the extract-structured command.
|
||||
///
|
||||
/// Reads a JSON schema from `schema_path`, builds an `ExtractionConfig` with
|
||||
/// `structured_extraction` configured, extracts the document, and outputs the
|
||||
/// `structured_output` field from the result.
|
||||
pub fn extract_structured_command(args: ExtractStructuredArgs) -> Result<()> {
|
||||
let ExtractStructuredArgs {
|
||||
path,
|
||||
schema_path,
|
||||
model,
|
||||
api_key,
|
||||
prompt,
|
||||
schema_name,
|
||||
strict,
|
||||
config_path,
|
||||
format,
|
||||
} = args;
|
||||
// 1. Read and parse the JSON schema file
|
||||
let schema_str = std::fs::read_to_string(&schema_path).with_context(|| {
|
||||
format!(
|
||||
"Failed to read JSON schema file '{}'. Ensure the file exists and is readable.",
|
||||
schema_path.display()
|
||||
)
|
||||
})?;
|
||||
let schema: serde_json::Value = serde_json::from_str(&schema_str).with_context(|| {
|
||||
format!(
|
||||
"Failed to parse JSON schema from '{}'. Ensure the file contains valid JSON.",
|
||||
schema_path.display()
|
||||
)
|
||||
})?;
|
||||
|
||||
// 2. Build ExtractionConfig with structured_extraction
|
||||
let mut config = super::load_config(config_path)?;
|
||||
|
||||
let llm_config = LlmConfig {
|
||||
model,
|
||||
api_key,
|
||||
base_url: None,
|
||||
timeout_secs: None,
|
||||
max_retries: None,
|
||||
temperature: None,
|
||||
max_tokens: None,
|
||||
};
|
||||
|
||||
config.structured_extraction = Some(StructuredExtractionConfig {
|
||||
schema,
|
||||
schema_name: schema_name.unwrap_or_else(|| "extraction".to_string()),
|
||||
schema_description: None,
|
||||
strict,
|
||||
prompt,
|
||||
llm: llm_config,
|
||||
});
|
||||
|
||||
// 3. Call kreuzberg::extract_file_sync()
|
||||
let path_str = path.to_string_lossy().to_string();
|
||||
let result = extract_file_sync(&path_str, None, &config).with_context(|| {
|
||||
format!(
|
||||
"Failed to extract structured data from '{}'. Ensure the file is readable and the LLM configuration is correct.",
|
||||
path.display()
|
||||
)
|
||||
})?;
|
||||
|
||||
// 4. Output result.structured_output (or error if None)
|
||||
let structured = result.structured_output.with_context(|| {
|
||||
"Structured extraction completed but returned no structured output. \
|
||||
This may indicate the LLM failed to produce valid structured data matching the schema."
|
||||
})?;
|
||||
|
||||
match format {
|
||||
WireFormat::Json => {
|
||||
println!(
|
||||
"{}",
|
||||
serde_json::to_string_pretty(&structured).context("Failed to serialize structured output to JSON")?
|
||||
);
|
||||
}
|
||||
WireFormat::Toon => {
|
||||
println!(
|
||||
"{}",
|
||||
serde_toon::to_string(&structured).context("Failed to serialize structured output to TOON")?
|
||||
);
|
||||
}
|
||||
WireFormat::Text => {
|
||||
// For text mode, pretty-print the JSON value
|
||||
println!(
|
||||
"{}",
|
||||
serde_json::to_string_pretty(&structured).context("Failed to serialize structured output to text")?
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
48
crates/kreuzberg-cli/src/commands/mod.rs
Normal file
48
crates/kreuzberg-cli/src/commands/mod.rs
Normal file
@@ -0,0 +1,48 @@
|
||||
//! Command modules for Kreuzberg CLI
|
||||
//!
|
||||
//! This module organizes the CLI commands into focused submodules:
|
||||
//! - `extract` - Document extraction commands
|
||||
//! - `cache` - Cache management operations
|
||||
//! - `server` - API and MCP server commands
|
||||
//! - `config` - Configuration loading and discovery
|
||||
//! - `embed` - Embedding generation commands
|
||||
//! - `chunk` - Text chunking commands
//! - `extract_structured` - LLM-based structured data extraction
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use std::io::Read;
|
||||
|
||||
pub mod cache;
|
||||
pub mod chunk;
|
||||
pub mod config;
|
||||
#[cfg(feature = "embeddings")]
|
||||
pub mod embed;
|
||||
pub mod extract;
|
||||
pub mod extract_structured;
|
||||
pub mod overrides;
|
||||
#[cfg(any(feature = "api", feature = "mcp"))]
|
||||
pub mod server;
|
||||
|
||||
// Re-export command functions for convenience
|
||||
pub use cache::{clear_command, manifest_command, stats_command, warm_command};
|
||||
pub use chunk::chunk_command;
|
||||
pub use config::load_config;
|
||||
#[cfg(feature = "embeddings")]
|
||||
pub use embed::embed_command;
|
||||
pub use extract::{batch_command, extract_command};
|
||||
#[cfg(feature = "mcp")]
|
||||
pub use server::mcp_command;
|
||||
#[cfg(feature = "api")]
|
||||
pub use server::serve_command;
|
||||
|
||||
/// Read text from stdin, trimming whitespace.
|
||||
pub fn read_stdin() -> Result<String> {
|
||||
let mut input = String::new();
|
||||
std::io::stdin()
|
||||
.read_to_string(&mut input)
|
||||
.context("Failed to read from stdin")?;
|
||||
let trimmed = input.trim().to_string();
|
||||
if trimmed.is_empty() {
|
||||
anyhow::bail!("No input received from stdin. Provide text via --text or pipe it to stdin.");
|
||||
}
|
||||
Ok(trimmed)
|
||||
}
|
||||
1327
crates/kreuzberg-cli/src/commands/overrides.rs
Normal file
1327
crates/kreuzberg-cli/src/commands/overrides.rs
Normal file
File diff suppressed because it is too large
Load Diff
104
crates/kreuzberg-cli/src/commands/server.rs
Normal file
104
crates/kreuzberg-cli/src/commands/server.rs
Normal file
@@ -0,0 +1,104 @@
|
||||
//! Server command - Start API and MCP servers
|
||||
//!
|
||||
//! This module provides commands for starting the Kreuzberg API server
|
||||
//! and the MCP (Model Context Protocol) server.
|
||||
|
||||
use anyhow::Result;
|
||||
|
||||
/// Start the API server and block until it stops.
///
/// The server configuration is resolved in increasing order of precedence:
/// defaults (or the file at `config_path` when given), then environment
/// variable overrides, then `cli_host` / `cli_port`.
///
/// # Errors
///
/// Returns an error when the configuration file cannot be loaded, when the
/// environment overrides are rejected, when the runtime cannot be created, or
/// when the server fails to start.
#[cfg(feature = "api")]
pub fn serve_command(
    cli_host: Option<String>,
    cli_port: Option<u16>,
    extraction_config: kreuzberg::ExtractionConfig,
    config_path: Option<std::path::PathBuf>,
) -> Result<()> {
    use anyhow::Context;
    use kreuzberg::ServerConfig;

    // Lowest precedence: config file when given, otherwise defaults.
    let mut server_config = match &config_path {
        Some(path) => ServerConfig::from_file(path).with_context(|| {
            format!(
                "Failed to load server configuration from '{}'. \
                 Ensure the file contains valid server settings under [server] section or at root level.",
                path.display()
            )
        })?,
        None => ServerConfig::default(),
    };

    // Environment variables override the config file.
    server_config.apply_env_overrides()?;

    // CLI arguments override everything else.
    if let Some(host) = cli_host {
        server_config.host = host;
    }
    if let Some(port) = cli_port {
        server_config.port = port;
    }

    tracing::info!(
        "Starting Kreuzberg API server on http://{}",
        server_config.listen_addr()
    );

    let runtime = tokio::runtime::Runtime::new()?;
    // The config is cloned so the original stays available for the error message.
    let served = runtime.block_on(kreuzberg::api::serve_with_server_config(
        extraction_config,
        server_config.clone(),
    ));
    served.with_context(|| {
        format!(
            "Failed to start API server on {}. Ensure the port is not already in use and you have permission to bind to this address.",
            server_config.listen_addr()
        )
    })?;

    Ok(())
}
|
||||
|
||||
/// Start the MCP server over the requested transport and block until it stops.
///
/// `transport` is matched case-insensitively against `stdio` and `http`.
/// The host and port arguments are only used when the `mcp-http` feature is
/// enabled; without it, requesting `http` fails with a rebuild hint.
///
/// # Errors
///
/// Returns an error when the runtime cannot be created, when the server fails
/// to start, when `http` is requested without the `mcp-http` feature, or when
/// the transport name is not recognized.
#[cfg(feature = "mcp")]
pub fn mcp_command(
    config: kreuzberg::ExtractionConfig,
    transport: String,
    #[cfg(feature = "mcp-http")] host: String,
    #[cfg(feature = "mcp-http")] port: u16,
    #[cfg(not(feature = "mcp-http"))] _host: String,
    #[cfg(not(feature = "mcp-http"))] _port: u16,
) -> Result<()> {
    tracing::debug!("Starting Kreuzberg MCP server with transport: {}", transport);
    let runtime = tokio::runtime::Runtime::new()?;

    let transport_kind = transport.to_lowercase();
    if transport_kind == "stdio" {
        runtime
            .block_on(kreuzberg::mcp::start_mcp_server_with_config(config))
            .map_err(|err| anyhow::anyhow!("Failed to start MCP server: {}", err))?;
    } else if transport_kind == "http" {
        #[cfg(not(feature = "mcp-http"))]
        {
            anyhow::bail!(
                "HTTP transport requires 'mcp-http' feature. \
                 Rebuild with: cargo build --features mcp-http"
            );
        }

        #[cfg(feature = "mcp-http")]
        {
            tracing::debug!("Starting MCP server on http://{}:{}", host, port);
            runtime
                .block_on(kreuzberg::mcp::start_mcp_server_http_with_config(&host, port, config))
                .map_err(|err| anyhow::anyhow!("Failed to start MCP server on {}:{}: {}", host, port, err))?;
        }
    } else {
        // The message reports the lowercased name, i.e. the value that was matched.
        anyhow::bail!("Unknown transport '{}'. Use 'stdio' or 'http'", transport_kind.as_str());
    }

    Ok(())
}
|
||||
230
crates/kreuzberg-cli/src/commands/tree_sitter.rs
Normal file
230
crates/kreuzberg-cli/src/commands/tree_sitter.rs
Normal file
@@ -0,0 +1,230 @@
|
||||
//! Tree-sitter grammar management commands.
|
||||
//!
|
||||
//! This module provides commands for downloading, listing, and managing
|
||||
//! tree-sitter grammar parsers via the tree-sitter-language-pack crate.
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use serde_json::json;
|
||||
use std::path::PathBuf;
|
||||
|
||||
use crate::{WireFormat, style};
|
||||
|
||||
/// Execute the tree-sitter download command.
|
||||
///
|
||||
/// Downloads tree-sitter grammar parsers based on the provided arguments:
|
||||
/// - Specific languages by name
|
||||
/// - All available languages (--all)
|
||||
/// - Language groups (--groups)
|
||||
pub fn download_command(
|
||||
languages: Vec<String>,
|
||||
all: bool,
|
||||
groups: Option<Vec<String>>,
|
||||
cache_dir: Option<PathBuf>,
|
||||
format: WireFormat,
|
||||
) -> Result<()> {
|
||||
// Apply custom cache directory if provided
|
||||
if let Some(ref dir) = cache_dir {
|
||||
let config = tree_sitter_language_pack::PackConfig {
|
||||
cache_dir: Some(dir.clone()),
|
||||
languages: None,
|
||||
groups: None,
|
||||
};
|
||||
tree_sitter_language_pack::configure(&config).context("Failed to configure custom cache directory")?;
|
||||
}
|
||||
|
||||
let count: usize;
|
||||
let description: String;
|
||||
|
||||
if all {
|
||||
count = tree_sitter_language_pack::download_all().context("Failed to download all tree-sitter grammars")?;
|
||||
description = "all available languages".to_string();
|
||||
} else if let Some(ref group_list) = groups {
|
||||
let config = tree_sitter_language_pack::PackConfig {
|
||||
cache_dir: cache_dir.clone(),
|
||||
languages: None,
|
||||
groups: Some(group_list.clone()),
|
||||
};
|
||||
tree_sitter_language_pack::init(&config).context("Failed to download tree-sitter grammar groups")?;
|
||||
count = 0; // init does not return a count
|
||||
description = format!("groups: {}", group_list.join(", "));
|
||||
} else if !languages.is_empty() {
|
||||
let refs: Vec<&str> = languages.iter().map(String::as_str).collect();
|
||||
count = tree_sitter_language_pack::download(&refs).context("Failed to download tree-sitter grammars")?;
|
||||
description = format!("languages: {}", languages.join(", "));
|
||||
} else {
|
||||
anyhow::bail!("No languages specified. Use language names, --all, --groups, or --from-config.");
|
||||
}
|
||||
|
||||
match format {
|
||||
WireFormat::Text => {
|
||||
println!("{}", style::header("Tree-sitter Download"));
|
||||
println!("{}", style::dim("===================="));
|
||||
println!("{} {}", style::label("Requested:"), description);
|
||||
if groups.is_none() || all || !languages.is_empty() {
|
||||
println!(
|
||||
"{} {}",
|
||||
style::label("Newly downloaded:"),
|
||||
style::success(&count.to_string())
|
||||
);
|
||||
}
|
||||
if let Some(ref dir) = cache_dir {
|
||||
println!(
|
||||
"{} {}",
|
||||
style::label("Cache directory:"),
|
||||
style::success(&dir.display().to_string())
|
||||
);
|
||||
}
|
||||
println!("{}", style::success("Done"));
|
||||
}
|
||||
WireFormat::Json => {
|
||||
let mut output = json!({
|
||||
"requested": description,
|
||||
"newly_downloaded": count,
|
||||
});
|
||||
if let Some(ref dir) = cache_dir {
|
||||
output["cache_dir"] = json!(dir.to_string_lossy());
|
||||
}
|
||||
println!(
|
||||
"{}",
|
||||
serde_json::to_string_pretty(&output).context("Failed to serialize download results to JSON")?
|
||||
);
|
||||
}
|
||||
WireFormat::Toon => {
|
||||
let mut output = json!({
|
||||
"requested": description,
|
||||
"newly_downloaded": count,
|
||||
});
|
||||
if let Some(ref dir) = cache_dir {
|
||||
output["cache_dir"] = json!(dir.to_string_lossy());
|
||||
}
|
||||
println!(
|
||||
"{}",
|
||||
serde_toon::to_string(&output).context("Failed to serialize download results to TOON")?
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Execute the tree-sitter list command.
|
||||
///
|
||||
/// Lists available or downloaded tree-sitter languages, optionally filtering
|
||||
/// by a name substring.
|
||||
pub fn list_command(downloaded_only: bool, filter: Option<String>, format: WireFormat) -> Result<()> {
|
||||
let languages = if downloaded_only {
|
||||
tree_sitter_language_pack::downloaded_languages()
|
||||
} else {
|
||||
tree_sitter_language_pack::manifest_languages().context("Failed to fetch tree-sitter language manifest")?
|
||||
};
|
||||
|
||||
let filtered: Vec<&String> = if let Some(ref f) = filter {
|
||||
let lower = f.to_lowercase();
|
||||
languages.iter().filter(|l| l.to_lowercase().contains(&lower)).collect()
|
||||
} else {
|
||||
languages.iter().collect()
|
||||
};
|
||||
|
||||
let source = if downloaded_only { "downloaded" } else { "available" };
|
||||
|
||||
match format {
|
||||
WireFormat::Text => {
|
||||
println!(
|
||||
"{} ({} {}{})",
|
||||
style::header("Tree-sitter Languages"),
|
||||
filtered.len(),
|
||||
source,
|
||||
filter.as_ref().map(|f| format!(", filter: '{f}'")).unwrap_or_default()
|
||||
);
|
||||
println!("{}", style::dim("====================="));
|
||||
for lang in &filtered {
|
||||
println!(" {}", style::success(lang));
|
||||
}
|
||||
}
|
||||
WireFormat::Json => {
|
||||
let output = json!({
|
||||
"source": source,
|
||||
"count": filtered.len(),
|
||||
"filter": filter,
|
||||
"languages": filtered,
|
||||
});
|
||||
println!(
|
||||
"{}",
|
||||
serde_json::to_string_pretty(&output).context("Failed to serialize language list to JSON")?
|
||||
);
|
||||
}
|
||||
WireFormat::Toon => {
|
||||
let output = json!({
|
||||
"source": source,
|
||||
"count": filtered.len(),
|
||||
"filter": filter,
|
||||
"languages": filtered,
|
||||
});
|
||||
println!(
|
||||
"{}",
|
||||
serde_toon::to_string(&output).context("Failed to serialize language list to TOON")?
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Execute the tree-sitter cache-dir command.
|
||||
///
|
||||
/// Displays the effective cache directory for tree-sitter grammar parsers.
|
||||
pub fn cache_dir_command(format: WireFormat) -> Result<()> {
|
||||
let dir = tree_sitter_language_pack::cache_dir().context("Failed to determine tree-sitter cache directory")?;
|
||||
let dir_str = dir.to_string_lossy();
|
||||
|
||||
match format {
|
||||
WireFormat::Text => {
|
||||
println!("{} {}", style::label("Cache directory:"), style::success(&dir_str));
|
||||
}
|
||||
WireFormat::Json => {
|
||||
let output = json!({ "cache_dir": dir_str });
|
||||
println!(
|
||||
"{}",
|
||||
serde_json::to_string_pretty(&output).context("Failed to serialize cache directory to JSON")?
|
||||
);
|
||||
}
|
||||
WireFormat::Toon => {
|
||||
let output = json!({ "cache_dir": dir_str });
|
||||
println!(
|
||||
"{}",
|
||||
serde_toon::to_string(&output).context("Failed to serialize cache directory to TOON")?
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Execute the tree-sitter clean command.
|
||||
///
|
||||
/// Clears all cached tree-sitter grammar parser shared libraries.
|
||||
pub fn clean_command(format: WireFormat) -> Result<()> {
|
||||
tree_sitter_language_pack::clean_cache().context("Failed to clean tree-sitter cache")?;
|
||||
|
||||
match format {
|
||||
WireFormat::Text => {
|
||||
println!("{}", style::success("Tree-sitter cache cleared successfully"));
|
||||
}
|
||||
WireFormat::Json => {
|
||||
let output = json!({ "status": "cleared" });
|
||||
println!(
|
||||
"{}",
|
||||
serde_json::to_string_pretty(&output).context("Failed to serialize clean result to JSON")?
|
||||
);
|
||||
}
|
||||
WireFormat::Toon => {
|
||||
let output = json!({ "status": "cleared" });
|
||||
println!(
|
||||
"{}",
|
||||
serde_toon::to_string(&output).context("Failed to serialize clean result to TOON")?
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
238
crates/kreuzberg-cli/src/logging.rs
Normal file
238
crates/kreuzberg-cli/src/logging.rs
Normal file
@@ -0,0 +1,238 @@
|
||||
//! Logging helpers for the Kreuzberg CLI.
|
||||
//!
|
||||
//! Provides a [`build_env_filter`] function that layers default third-party
|
||||
//! transport suppressions on top of whatever the caller or `RUST_LOG` specifies.
|
||||
//! User-supplied per-target rules in `RUST_LOG` always win because
//! [`build_env_filter`] skips a default suppression whenever the base filter
//! already contains a directive for the same target.
|
||||
|
||||
use tracing_subscriber::EnvFilter;
|
||||
|
||||
/// Default `target=level` directives for third-party crates that are noisy
/// at their own default level.
///
/// These act as *fallbacks*: a directive is only applied when neither
/// `RUST_LOG` nor `level_override` already carries a rule for the same target,
/// so `RUST_LOG=ureq=debug` still restores full transport logs.
///
/// Every entry must parse as a tracing directive; the order here is the order
/// in which the directives are added to the filter.
const QUIET_DIRECTIVES: &[&str] = &[
    // HTTP / TLS transport crates: warnings and errors only.
    "ureq=warn",
    "ureq_proto=warn",
    "rustls=warn",
    "hyper_util=warn",
    // Capped at info.
    "hf_hub=info",
    "tower_http=info",
];
|
||||
|
||||
/// Return the target part of a `target=level` directive such as `"ureq=warn"`.
///
/// The target is everything before the first `=`. A directive without any
/// `=` (for example a bare level like `"info"`) yields `None`.
fn directive_target(directive: &str) -> Option<&str> {
    let separator = directive.find('=')?;
    Some(&directive[..separator])
}
|
||||
|
||||
/// Build an [`EnvFilter`] with third-party transport crates suppressed by default.
|
||||
///
|
||||
/// Precedence (highest first):
|
||||
/// 1. Per-target directives already present in `RUST_LOG` (or `level_override`).
|
||||
/// 2. The [`QUIET_DIRECTIVES`] suppressions added here.
|
||||
/// 3. Root level: `level_override` → `RUST_LOG` → `"info"`.
|
||||
///
|
||||
/// Per-target directives that the user has already set are **not** overridden:
|
||||
/// we skip adding a quiet directive when the base filter already contains a
|
||||
/// rule for the same target crate. This is necessary because
|
||||
/// [`EnvFilter::add_directive`] appends rather than guards — a later-added
|
||||
/// per-target directive for the same crate takes precedence.
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// * `level_override` — explicit root-level string from a CLI flag (e.g. `"debug"`).
|
||||
/// When `Some`, it replaces `RUST_LOG` entirely for the root level.
|
||||
pub fn build_env_filter(level_override: Option<&str>) -> EnvFilter {
|
||||
// Use try_new on user input so a malformed --log-level falls back to info
|
||||
// instead of panicking the CLI.
|
||||
let base = level_override
|
||||
.and_then(|level| EnvFilter::try_new(level).ok())
|
||||
.or_else(|| EnvFilter::try_from_default_env().ok())
|
||||
.unwrap_or_else(|| EnvFilter::new("info"));
|
||||
|
||||
// Snapshot the existing directive set so we can skip quiet directives
|
||||
// whose target the user has already configured explicitly.
|
||||
let existing_targets: std::collections::HashSet<String> = base
|
||||
.to_string()
|
||||
.split(',')
|
||||
.filter_map(|chunk| directive_target(chunk).map(|t| t.trim().to_string()))
|
||||
.collect();
|
||||
|
||||
QUIET_DIRECTIVES
|
||||
.iter()
|
||||
.filter(|directive| {
|
||||
// Only add the quiet directive when no per-target rule for this
|
||||
// exact crate already exists. Word-boundary match via tokenized
|
||||
// target set avoids `hf_hub` colliding with `hf_hub_server`.
|
||||
directive_target(directive)
|
||||
.map(|target| !existing_targets.contains(target))
|
||||
.unwrap_or(true)
|
||||
})
|
||||
.fold(base, |filter, directive| {
|
||||
filter.add_directive(directive.parse().expect("built-in logging directive must be valid"))
|
||||
})
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    // NOTE(review): the tests that pass `None` go through the default
    // environment lookup, so they presumably assume `RUST_LOG` is unset (or
    // carries no conflicting per-target rules) when the suite runs — confirm.

    /// Build a filter and return its directive string.
    ///
    /// `EnvFilter::to_string()` renders all directives as a comma-separated
    /// list, which serves as a public inspection surface for assertions.
    fn rendered(level_override: Option<&str>) -> String {
        build_env_filter(level_override).to_string()
    }

    #[test]
    fn default_filter_suppresses_ureq() {
        // No override: the transport crates must be capped at warn.
        let directives = rendered(None);
        assert!(
            directives.contains("ureq=warn"),
            "ureq=warn must be present in default filter; got: {directives}"
        );
        assert!(
            directives.contains("ureq_proto=warn"),
            "ureq_proto=warn must be present in default filter; got: {directives}"
        );
        assert!(
            directives.contains("rustls=warn"),
            "rustls=warn must be present in default filter; got: {directives}"
        );
    }

    #[test]
    fn default_filter_keeps_kreuzberg_info() {
        // No suppression is ever added for kreuzberg itself.
        let directives = rendered(None);
        assert!(
            !directives.contains("kreuzberg=warn") && !directives.contains("kreuzberg=error"),
            "kreuzberg must not be suppressed in the default filter; got: {directives}"
        );
    }

    #[test]
    fn env_override_wins_for_third_party() {
        // Stand-in for RUST_LOG=ureq=debug: the existing ureq= directive must
        // be detected so the ureq=warn suppression is skipped.
        let directives = rendered(Some("info,ureq=debug"));
        assert!(
            directives.contains("ureq=debug"),
            "user-supplied ureq=debug must be preserved; got: {directives}"
        );
        assert!(
            !directives.contains("ureq=warn"),
            "ureq=warn suppression must not be added when user already set ureq=debug; got: {directives}"
        );
    }

    #[test]
    fn level_override_wins() {
        // A root-level "debug" override must show up in the filter...
        let directives = rendered(Some("debug"));
        assert!(
            directives.contains("debug"),
            "root debug level must appear in filter with --log-level debug; got: {directives}"
        );
        // ...with the ureq suppression still layered on top.
        assert!(
            directives.contains("ureq=warn"),
            "ureq=warn suppression must still be present even under --log-level debug; got: {directives}"
        );
    }

    #[test]
    fn tower_http_suppressed_at_default() {
        let directives = rendered(None);
        assert!(
            directives.contains("tower_http=info") || directives.contains("tower_http=warn"),
            "tower_http must be suppressed at default level; got: {directives}"
        );
    }

    #[test]
    fn all_quiet_directives_are_valid() {
        // Every built-in directive must parse.
        for directive in super::QUIET_DIRECTIVES {
            directive
                .parse::<tracing_subscriber::filter::Directive>()
                .expect("built-in directive is invalid");
        }
    }

    #[test]
    fn no_level_override_uses_info_root() {
        // The rendered filter must not open with a debug/trace root level.
        let directives = rendered(None);
        let root_is_noisier_than_info = directives.starts_with("debug") || directives.starts_with("trace");
        assert!(
            !root_is_noisier_than_info,
            "default root level must not be debug/trace without RUST_LOG; got: {directives}"
        );
    }

    #[test]
    fn hf_hub_suppressed_at_default() {
        let directives = rendered(None);
        assert!(
            directives.contains("hf_hub=info"),
            "hf_hub must be suppressed to info at default; got: {directives}"
        );
    }

    #[test]
    fn hyper_util_suppressed_at_default() {
        let directives = rendered(None);
        assert!(
            directives.contains("hyper_util=warn"),
            "hyper_util must be suppressed to warn at default; got: {directives}"
        );
    }

    #[test]
    fn malformed_level_override_falls_back_to_info() {
        // A garbage override must not panic; the quiet directives being
        // present shows that a fallback filter was built.
        let directives = rendered(Some(":::garbage"));
        assert!(
            directives.contains("ureq=warn"),
            "ureq=warn must still be present after malformed override; got: {directives}"
        );
    }

    #[test]
    fn similar_target_name_does_not_block_suppression() {
        // Regression test for the earlier substring-containment bug: a rule
        // for `hf_hub_server` must not cause `hf_hub=info` to be skipped.
        let directives = rendered(Some("info,hf_hub_server=debug"));
        assert!(
            directives.contains("hf_hub_server=debug"),
            "user directive for hf_hub_server must survive; got: {directives}"
        );
        assert!(
            directives.contains("hf_hub=info"),
            "hf_hub=info suppression must still be applied; got: {directives}"
        );
    }
}
|
||||
971
crates/kreuzberg-cli/src/main.rs
Normal file
971
crates/kreuzberg-cli/src/main.rs
Normal file
@@ -0,0 +1,971 @@
|
||||
//! Kreuzberg CLI - Command-line interface for document intelligence.
|
||||
//!
|
||||
//! This binary provides a command-line interface to the Kreuzberg document intelligence
|
||||
//! library, supporting document extraction, MIME type detection, caching, and batch operations.
|
||||
//!
|
||||
//! # Architecture
|
||||
//!
|
||||
//! The CLI is built using `clap` for argument parsing and provides the following main commands:
|
||||
//! - `extract`: Extract text/data from a single document
|
||||
//! - `batch`: Process multiple documents in parallel
|
||||
//! - `detect`: Identify MIME type of a file
|
||||
//! - `cache`: Manage cache (clear, stats)
|
||||
//! - `serve`: Start API server (requires `api` feature)
|
||||
//! - `version`: Show version information
|
||||
//!
|
||||
//! # Configuration
|
||||
//!
|
||||
//! The CLI supports configuration files in TOML, YAML, or JSON formats:
|
||||
//! - Explicit: `--config path/to/config.toml`
|
||||
//! - Auto-discovery: Searches for `kreuzberg.{toml,yaml,json}` in current and parent directories
|
||||
//! - Inline JSON: `--config-json '{"ocr": {"backend": "tesseract"}}'`
|
||||
//! - Command-line flags override config file settings
|
||||
//!
|
||||
//! Configuration precedence (highest to lowest):
|
||||
//! 1. Individual CLI flags (--output-format, --ocr, etc.)
|
||||
//! 2. Inline JSON config (--config-json or --config-json-base64)
|
||||
//! 3. Config file (--config path.toml)
|
||||
//! 4. Default values
|
||||
//!
|
||||
//! # Exit Codes
|
||||
//!
|
||||
//! - 0: Success
|
||||
//! - Non-zero: Error (see stderr for details)
|
||||
//!
|
||||
//! # Examples
|
||||
//!
|
||||
//! ```bash
|
||||
//! # Extract text from a PDF
|
||||
//! kreuzberg extract document.pdf
|
||||
//!
|
||||
//! # Extract with OCR enabled
|
||||
//! kreuzberg extract scanned.pdf --ocr true
|
||||
//!
|
||||
//! # Extract with inline JSON config
|
||||
//! kreuzberg extract doc.pdf --config-json '{"ocr":{"backend":"tesseract"}}'
|
||||
//!
|
||||
//! # Batch processing
|
||||
//! kreuzberg batch *.pdf --output-format json
|
||||
//!
|
||||
//! # Detect MIME type
|
||||
//! kreuzberg detect unknown-file.bin
|
||||
//! ```
|
||||
|
||||
#![deny(unsafe_code)]
|
||||
|
||||
mod commands;
|
||||
mod logging;
|
||||
mod output;
|
||||
mod style;
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use base64::{Engine as _, engine::general_purpose::STANDARD};
|
||||
use clap::{CommandFactory, Parser, Subcommand};
|
||||
#[cfg(feature = "embeddings")]
|
||||
use commands::embed_command;
|
||||
#[cfg(feature = "mcp")]
|
||||
use commands::mcp_command;
|
||||
use commands::overrides::ExtractionOverrides;
|
||||
#[cfg(feature = "api")]
|
||||
use commands::serve_command;
|
||||
use commands::{
|
||||
batch_command, chunk_command, clear_command, extract_command,
|
||||
extract_structured::{ExtractStructuredArgs, extract_structured_command},
|
||||
load_config, manifest_command, stats_command, warm_command,
|
||||
};
|
||||
use kreuzberg::{OutputFormat as ContentOutputFormat, detect_mime_type};
|
||||
use serde_json::json;
|
||||
use std::path::{Path, PathBuf};
|
||||
|
||||
/// Kreuzberg document intelligence CLI
|
||||
#[derive(Parser)]
|
||||
#[command(name = "kreuzberg")]
|
||||
#[command(version, about, long_about = None)]
|
||||
struct Cli {
|
||||
/// Set log level (trace, debug, info, warn, error). Overrides RUST_LOG env var.
|
||||
#[arg(long, global = true)]
|
||||
log_level: Option<String>,
|
||||
|
||||
#[command(subcommand)]
|
||||
command: Commands,
|
||||
}
|
||||
|
||||
#[derive(Subcommand)]
|
||||
enum Commands {
|
||||
/// Extract text from a document
|
||||
Extract {
|
||||
/// Path to the document
|
||||
path: PathBuf,
|
||||
|
||||
/// Path to config file (TOML, YAML, or JSON). If not specified, searches for kreuzberg.toml/yaml/json in current and parent directories.
|
||||
#[arg(short, long)]
|
||||
config: Option<PathBuf>,
|
||||
|
||||
/// Inline JSON configuration. Applied after config file but before individual flags.
|
||||
///
|
||||
/// Example: --config-json '{"ocr":{"backend":"tesseract"},"chunking":{"max_chars":1000}}'
|
||||
#[arg(long)]
|
||||
config_json: Option<String>,
|
||||
|
||||
/// Base64-encoded JSON configuration. Useful for shell environments where quotes are problematic.
|
||||
///
|
||||
/// Example: --config-json-base64 eyJvY3IiOnsiYmFja2VuZCI6InRlc3NlcmFjdCJ9fQ==
|
||||
#[arg(long)]
|
||||
config_json_base64: Option<String>,
|
||||
|
||||
/// MIME type hint (auto-detected if not provided)
|
||||
#[arg(short, long)]
|
||||
mime_type: Option<String>,
|
||||
|
||||
/// Output format for CLI results (text or json).
|
||||
///
|
||||
/// Controls how the CLI displays results, not the extraction content format.
|
||||
#[arg(short, long, default_value = "text")]
|
||||
format: WireFormat,
|
||||
|
||||
/// Extraction configuration overrides
|
||||
#[command(flatten)]
|
||||
overrides: ExtractionOverrides,
|
||||
},
|
||||
|
||||
/// Extract structured data from a document using an LLM
|
||||
ExtractStructured {
|
||||
/// Path to the document file
|
||||
path: PathBuf,
|
||||
|
||||
/// Path to JSON schema file defining the output structure
|
||||
#[arg(long)]
|
||||
schema: PathBuf,
|
||||
|
||||
/// LLM model (e.g., "openai/gpt-4o")
|
||||
#[arg(long)]
|
||||
model: String,
|
||||
|
||||
/// API key for the LLM provider
|
||||
#[arg(long)]
|
||||
api_key: Option<String>,
|
||||
|
||||
/// Custom Jinja2 prompt template
|
||||
#[arg(long)]
|
||||
prompt: Option<String>,
|
||||
|
||||
/// Schema name
|
||||
#[arg(long, default_value = "extraction")]
|
||||
schema_name: Option<String>,
|
||||
|
||||
/// Enable strict mode
|
||||
#[arg(long)]
|
||||
strict: bool,
|
||||
|
||||
/// Config file path
|
||||
#[arg(short, long)]
|
||||
config: Option<PathBuf>,
|
||||
|
||||
/// Output format (text or json)
|
||||
#[arg(short, long, default_value = "json")]
|
||||
format: WireFormat,
|
||||
},
|
||||
|
||||
/// Batch extract from multiple documents
|
||||
Batch {
|
||||
/// Paths to documents
|
||||
paths: Vec<PathBuf>,
|
||||
|
||||
/// Path to config file (TOML, YAML, or JSON). If not specified, searches for kreuzberg.toml/yaml/json in current and parent directories.
|
||||
#[arg(short, long)]
|
||||
config: Option<PathBuf>,
|
||||
|
||||
/// Inline JSON configuration. Applied after config file but before individual flags.
|
||||
///
|
||||
/// Example: --config-json '{"ocr":{"backend":"tesseract"},"chunking":{"max_chars":1000}}'
|
||||
#[arg(long)]
|
||||
config_json: Option<String>,
|
||||
|
||||
/// Base64-encoded JSON configuration. Useful for shell environments where quotes are problematic.
|
||||
///
|
||||
/// Example: --config-json-base64 eyJvY3IiOnsiYmFja2VuZCI6InRlc3NlcmFjdCJ9fQ==
|
||||
#[arg(long)]
|
||||
config_json_base64: Option<String>,
|
||||
|
||||
/// Output format for CLI results (text or json).
|
||||
///
|
||||
/// Controls how the CLI displays results, not the extraction content format.
|
||||
#[arg(short, long, default_value = "json")]
|
||||
format: WireFormat,
|
||||
|
||||
/// Extraction configuration overrides
|
||||
#[command(flatten)]
|
||||
overrides: ExtractionOverrides,
|
||||
|
||||
/// Path to a JSON file mapping file paths to per-file extraction config overrides.
|
||||
/// The JSON should be an object where keys are file paths and values are FileExtractionConfig objects.
|
||||
/// Example: {"doc1.pdf": {"force_ocr": true}, "doc2.pdf": {"output_format": "markdown"}}
|
||||
#[arg(long)]
|
||||
file_configs: Option<PathBuf>,
|
||||
},
|
||||
|
||||
/// Detect MIME type of a file
|
||||
Detect {
|
||||
/// Path to the file
|
||||
path: PathBuf,
|
||||
|
||||
/// Output format (text or json)
|
||||
#[arg(short, long, default_value = "text")]
|
||||
format: WireFormat,
|
||||
},
|
||||
|
||||
/// List all supported document formats
|
||||
Formats {
|
||||
/// Output format (text or json)
|
||||
#[arg(short, long, default_value = "text")]
|
||||
format: WireFormat,
|
||||
},
|
||||
|
||||
/// Show version information
|
||||
Version {
|
||||
/// Output format (text or json)
|
||||
#[arg(short, long, default_value = "text")]
|
||||
format: WireFormat,
|
||||
},
|
||||
|
||||
/// Cache management operations
|
||||
Cache {
|
||||
#[command(subcommand)]
|
||||
command: CacheCommands,
|
||||
},
|
||||
|
||||
/// Start the API server
|
||||
///
|
||||
/// Configuration is loaded with the following precedence (highest to lowest):
|
||||
/// 1. CLI arguments (--host, --port)
|
||||
/// 2. Environment variables (KREUZBERG_HOST, KREUZBERG_PORT)
|
||||
/// 3. Config file (TOML, YAML, or JSON)
|
||||
/// 4. Built-in defaults (127.0.0.1:8000)
|
||||
///
|
||||
/// The config file can contain both extraction and server settings under [server] section.
|
||||
#[cfg(feature = "api")]
|
||||
Serve {
|
||||
/// Host to bind to (e.g., "127.0.0.1" or "0.0.0.0"). CLI arg overrides config file and env vars.
|
||||
#[arg(short = 'H', long)]
|
||||
host: Option<String>,
|
||||
|
||||
/// Port to bind to. CLI arg overrides config file and env vars.
|
||||
#[arg(short, long)]
|
||||
port: Option<u16>,
|
||||
|
||||
/// Path to config file (TOML, YAML, or JSON). If not specified, searches for kreuzberg.toml/yaml/json in current and parent directories.
|
||||
#[arg(short, long)]
|
||||
config: Option<PathBuf>,
|
||||
},
|
||||
|
||||
/// Start the MCP (Model Context Protocol) server
|
||||
#[cfg(feature = "mcp")]
|
||||
Mcp {
|
||||
/// Path to config file (TOML, YAML, or JSON). If not specified, searches for kreuzberg.toml/yaml/json in current and parent directories.
|
||||
#[arg(short, long)]
|
||||
config: Option<PathBuf>,
|
||||
|
||||
/// Transport mode: stdio (default) or http
|
||||
#[arg(long, default_value = "stdio")]
|
||||
transport: String,
|
||||
|
||||
/// HTTP host (only for --transport http)
|
||||
#[arg(long, default_value = "127.0.0.1")]
|
||||
host: String,
|
||||
|
||||
/// HTTP port (only for --transport http)
|
||||
#[arg(long, default_value = "8001")]
|
||||
port: u16,
|
||||
},
|
||||
|
||||
/// API utilities
|
||||
#[cfg(feature = "api")]
|
||||
Api {
|
||||
#[command(subcommand)]
|
||||
command: ApiCommands,
|
||||
},
|
||||
|
||||
/// Generate embeddings for text
|
||||
///
|
||||
/// Generates vector embeddings for one or more text inputs using a specified preset model
|
||||
/// or an LLM provider. Reads from --text flag or stdin if no text is provided.
|
||||
#[cfg(feature = "embeddings")]
|
||||
Embed {
|
||||
/// Text to embed. Can be specified multiple times for batch embedding.
|
||||
#[arg(long)]
|
||||
text: Vec<String>,
|
||||
|
||||
/// Embedding preset (fast, balanced, quality, multilingual). Used with --provider local.
|
||||
#[arg(long, default_value = "balanced")]
|
||||
preset: String,
|
||||
|
||||
/// Embedding provider: "local" (default, ONNX), "llm" (liter-llm), or "plugin" (registered in-process backend)
|
||||
#[arg(long, default_value = "local")]
|
||||
provider: String,
|
||||
|
||||
/// LLM model for provider-hosted embeddings (e.g., "openai/text-embedding-3-small").
|
||||
/// Required when --provider is "llm".
|
||||
#[arg(long)]
|
||||
model: Option<String>,
|
||||
|
||||
/// API key for the LLM provider
|
||||
#[arg(long)]
|
||||
api_key: Option<String>,
|
||||
|
||||
/// Name of a pre-registered in-process embedding backend.
|
||||
/// Required when --provider is "plugin". The backend must have been
|
||||
/// registered via `kreuzberg::plugins::register_embedding_backend`
|
||||
/// before this command runs.
|
||||
#[arg(long)]
|
||||
plugin: Option<String>,
|
||||
|
||||
/// Output format (text or json)
|
||||
#[arg(short, long, default_value = "json")]
|
||||
format: WireFormat,
|
||||
},
|
||||
|
||||
/// Chunk text for processing
|
||||
///
|
||||
/// Splits text into chunks using configurable size and overlap.
|
||||
/// Reads from --text flag or stdin if no text is provided.
|
||||
Chunk {
|
||||
/// Text to chunk. If not provided, reads from stdin.
|
||||
#[arg(long)]
|
||||
text: Option<String>,
|
||||
|
||||
/// Path to config file (TOML, YAML, or JSON)
|
||||
#[arg(short, long)]
|
||||
config: Option<PathBuf>,
|
||||
|
||||
/// Chunk size in characters
|
||||
#[arg(long)]
|
||||
chunk_size: Option<usize>,
|
||||
|
||||
/// Chunk overlap in characters
|
||||
#[arg(long)]
|
||||
chunk_overlap: Option<usize>,
|
||||
|
||||
/// Chunker type: text, markdown, yaml, or semantic
|
||||
#[arg(long, default_value = "text")]
|
||||
chunker_type: String,
|
||||
|
||||
/// Tokenizer model for token-based chunk sizing (e.g., "Xenova/gpt-4o").
|
||||
/// Requires the chunking-tokenizers feature.
|
||||
#[arg(long)]
|
||||
chunking_tokenizer: Option<String>,
|
||||
|
||||
/// Topic threshold for semantic chunking (0.0-1.0, default: 0.75)
|
||||
#[arg(long)]
|
||||
topic_threshold: Option<f32>,
|
||||
|
||||
/// Output format (text or json)
|
||||
#[arg(short, long, default_value = "json")]
|
||||
format: WireFormat,
|
||||
},
|
||||
|
||||
/// Generate shell completions
|
||||
///
|
||||
/// Outputs shell completion scripts for the specified shell.
|
||||
/// Install with: eval "$(kreuzberg completions bash)"
|
||||
Completions {
|
||||
/// Shell to generate completions for
|
||||
#[arg(value_enum)]
|
||||
shell: clap_complete::Shell,
|
||||
},
|
||||
}
|
||||
|
||||
// Subcommands of `kreuzberg api`; compiled only with the `api` feature.
// NOTE: the `///` comments are consumed by clap as help text.
#[cfg(feature = "api")]
#[derive(Subcommand)]
enum ApiCommands {
    /// Output the OpenAPI schema (JSON)
    ///
    /// Prints the full OpenAPI 3.1 specification for the kreuzberg REST API.
    /// Useful for code generation, documentation, and API client tooling.
    // Handled in `main` by printing `kreuzberg::api::openapi::openapi_json()`.
    Schema,
}
|
||||
|
||||
#[derive(Subcommand)]
|
||||
enum CacheCommands {
|
||||
/// Show cache statistics
|
||||
Stats {
|
||||
/// Cache directory (default: .kreuzberg in current directory)
|
||||
#[arg(short, long)]
|
||||
cache_dir: Option<PathBuf>,
|
||||
|
||||
/// Output format (text or json)
|
||||
#[arg(short, long, default_value = "text")]
|
||||
format: WireFormat,
|
||||
},
|
||||
|
||||
/// Clear the cache
|
||||
Clear {
|
||||
/// Cache directory (default: .kreuzberg in current directory)
|
||||
#[arg(short, long)]
|
||||
cache_dir: Option<PathBuf>,
|
||||
|
||||
/// Output format (text or json)
|
||||
#[arg(short, long, default_value = "text")]
|
||||
format: WireFormat,
|
||||
},
|
||||
|
||||
/// Output model manifest (expected model files, checksums, sizes)
|
||||
///
|
||||
/// Outputs a JSON manifest of all model files required by kreuzberg,
|
||||
/// including their relative paths, SHA256 checksums, and sizes.
|
||||
/// Used for pre-populating model caches in containerized deployments.
|
||||
Manifest {
|
||||
/// Output format (text or json)
|
||||
#[arg(short, long, default_value = "json")]
|
||||
format: WireFormat,
|
||||
},
|
||||
|
||||
/// Download all models eagerly
|
||||
///
|
||||
/// Downloads all PaddleOCR and layout detection models for all supported
|
||||
/// languages. Unlike normal operation which downloads lazily on first use,
|
||||
/// this ensures all models are present in the cache directory.
|
||||
///
|
||||
/// Use --all-embeddings to also download all 4 embedding model presets,
|
||||
/// or --embedding-model <preset> to download a specific one.
|
||||
///
|
||||
/// By default, only the core layout models (rtdetr + tatr) are downloaded.
|
||||
/// Use --all-table-models to also download SLANeXT variants (~730MB).
|
||||
Warm {
|
||||
/// Cache directory (default: .kreuzberg in current directory, or KREUZBERG_CACHE_DIR)
|
||||
#[arg(short, long)]
|
||||
cache_dir: Option<PathBuf>,
|
||||
|
||||
/// Output format (text or json)
|
||||
#[arg(short, long, default_value = "text")]
|
||||
format: WireFormat,
|
||||
|
||||
/// Download all embedding model presets (fast, balanced, quality, multilingual)
|
||||
#[arg(long)]
|
||||
all_embeddings: bool,
|
||||
|
||||
/// Download a specific embedding model preset
|
||||
#[arg(long, value_name = "PRESET")]
|
||||
embedding_model: Option<String>,
|
||||
|
||||
/// Download all table structure models including SLANeXT variants (~730MB)
|
||||
#[arg(
|
||||
long,
|
||||
help = "Download all table structure models including SLANeXT variants (~730MB)"
|
||||
)]
|
||||
all_table_models: bool,
|
||||
|
||||
/// Download all tree-sitter grammar parsers
|
||||
#[arg(long)]
|
||||
all_grammars: bool,
|
||||
|
||||
/// Download specific tree-sitter grammar groups (comma-separated: web,systems,scripting,data,jvm,functional)
|
||||
#[arg(long, value_name = "GROUPS", value_delimiter = ',')]
|
||||
grammar_groups: Option<Vec<String>>,
|
||||
|
||||
/// Download specific tree-sitter grammars by language name (comma-separated)
|
||||
#[arg(long, value_name = "LANGUAGES", value_delimiter = ',')]
|
||||
grammars: Option<Vec<String>>,
|
||||
},
|
||||
}
|
||||
|
||||
/// How the CLI renders its own results on stdout.
///
/// Parsed from the `--format` flag through the [`std::str::FromStr`] impl below.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum WireFormat {
    Text,
    Json,
    Toon,
}

impl std::str::FromStr for WireFormat {
    type Err = String;

    /// Parses `text`, `json`, or `toon`, ignoring letter case.
    ///
    /// # Errors
    ///
    /// Returns a message naming the rejected input and the accepted values
    /// when `s` matches none of them.
    fn from_str(s: &str) -> std::result::Result<Self, Self::Err> {
        let normalized = s.to_lowercase();
        let parsed = match normalized.as_str() {
            "text" => Some(Self::Text),
            "json" => Some(Self::Json),
            "toon" => Some(Self::Toon),
            _ => None,
        };
        // The error echoes the input exactly as the user typed it.
        parsed.ok_or_else(|| format!("Invalid format: {}. Use 'text', 'json', or 'toon'", s))
    }
}
|
||||
|
||||
/// Content output format for extraction results.
|
||||
///
|
||||
/// Controls the format of the extracted content (not the CLI output format).
|
||||
#[derive(Clone, Copy, Debug, PartialEq, Eq, clap::ValueEnum)]
|
||||
enum ContentOutputFormatArg {
|
||||
/// Plain text (default)
|
||||
Plain,
|
||||
/// Markdown format
|
||||
Markdown,
|
||||
/// Djot markup format
|
||||
Djot,
|
||||
/// HTML format
|
||||
Html,
|
||||
/// JSON tree format with heading-driven sections
|
||||
Json,
|
||||
}
|
||||
|
||||
impl From<ContentOutputFormatArg> for ContentOutputFormat {
|
||||
fn from(arg: ContentOutputFormatArg) -> Self {
|
||||
match arg {
|
||||
ContentOutputFormatArg::Plain => ContentOutputFormat::Plain,
|
||||
ContentOutputFormatArg::Markdown => ContentOutputFormat::Markdown,
|
||||
ContentOutputFormatArg::Djot => ContentOutputFormat::Djot,
|
||||
ContentOutputFormatArg::Html => ContentOutputFormat::Html,
|
||||
ContentOutputFormatArg::Json => ContentOutputFormat::Json,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Validates that a file exists and is accessible.
|
||||
///
|
||||
/// Checks that the path exists in the filesystem and points to a regular file
|
||||
/// (not a directory or special file). Provides user-friendly error messages if validation fails.
|
||||
///
|
||||
/// # Errors
|
||||
///
|
||||
/// Returns an error if:
|
||||
/// - The path does not exist in the filesystem
|
||||
/// - The path exists but is not a regular file (e.g., is a directory)
|
||||
fn validate_file_exists(path: &Path) -> Result<()> {
|
||||
if !path.exists() {
|
||||
anyhow::bail!(
|
||||
"File not found: '{}'. Please check that the file exists and is accessible.",
|
||||
path.display()
|
||||
);
|
||||
}
|
||||
if !path.is_file() {
|
||||
anyhow::bail!(
|
||||
"Path is not a file: '{}'. Please provide a path to a regular file.",
|
||||
path.display()
|
||||
);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Validates chunking parameters for correctness.
|
||||
///
|
||||
/// Ensures that chunking configuration makes sense: size must be positive and reasonable,
|
||||
/// and overlap must be smaller than chunk size. This prevents common configuration errors
|
||||
/// that would lead to cryptic failures from the underlying library.
|
||||
///
|
||||
/// # Errors
|
||||
///
|
||||
/// Returns an error if:
|
||||
/// - `chunk_size` is 0 (must be at least 1 character)
|
||||
/// - `chunk_size` exceeds 1,000,000 characters (to prevent excessive memory usage)
|
||||
/// - `chunk_overlap` is greater than or equal to `chunk_size` (overlap must be smaller)
|
||||
fn validate_chunk_params(chunk_size: Option<usize>, chunk_overlap: Option<usize>) -> Result<()> {
|
||||
if let Some(size) = chunk_size {
|
||||
if size == 0 {
|
||||
anyhow::bail!("Invalid chunk size: {}. Chunk size must be greater than 0.", size);
|
||||
}
|
||||
if size > 1_000_000 {
|
||||
anyhow::bail!(
|
||||
"Invalid chunk size: {}. Chunk size must be less than 1,000,000 characters to avoid excessive memory usage.",
|
||||
size
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
if let Some(overlap) = chunk_overlap
|
||||
&& let Some(size) = chunk_size
|
||||
&& overlap >= size
|
||||
{
|
||||
anyhow::bail!(
|
||||
"Invalid chunk overlap: {}. Overlap ({}) must be less than chunk size ({}).",
|
||||
overlap,
|
||||
overlap,
|
||||
size
|
||||
);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Validates batch extraction paths for correctness.
|
||||
///
|
||||
/// Ensures that at least one file path is provided and that all paths point to valid,
|
||||
/// accessible files. This prevents processing empty batches or failing mid-batch due
|
||||
/// to invalid paths.
|
||||
///
|
||||
/// # Errors
|
||||
///
|
||||
/// Returns an error if:
|
||||
/// - The paths array is empty (at least one file is required)
|
||||
/// - Any path does not exist or is not a regular file
|
||||
fn validate_batch_paths(paths: &[PathBuf]) -> Result<()> {
|
||||
if paths.is_empty() {
|
||||
anyhow::bail!("No files provided for batch extraction. Please provide at least one file path.");
|
||||
}
|
||||
|
||||
for (i, path) in paths.iter().enumerate() {
|
||||
validate_file_exists(path).with_context(|| format!("Invalid file at position {}", i + 1))?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Apply inline JSON or base64 JSON overrides to an extraction config.
|
||||
fn apply_json_overrides(
|
||||
config: &mut kreuzberg::ExtractionConfig,
|
||||
config_json: Option<String>,
|
||||
config_json_base64: Option<String>,
|
||||
) -> Result<()> {
|
||||
if let Some(json_str) = config_json {
|
||||
let json_value: serde_json::Value =
|
||||
serde_json::from_str(&json_str).context("Failed to parse --config-json as JSON")?;
|
||||
*config =
|
||||
merge_json_into_config(config, json_value).context("Failed to merge --config-json with file config")?;
|
||||
} else if let Some(base64_str) = config_json_base64 {
|
||||
let json_bytes = STANDARD
|
||||
.decode(&base64_str)
|
||||
.context("Failed to decode base64 in --config-json-base64")?;
|
||||
let json_str = String::from_utf8(json_bytes).context("Base64-decoded content is not valid UTF-8")?;
|
||||
let json_value: serde_json::Value =
|
||||
serde_json::from_str(&json_str).context("Failed to parse decoded --config-json-base64 as JSON")?;
|
||||
*config = merge_json_into_config(config, json_value)
|
||||
.context("Failed to merge --config-json-base64 with file config")?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Merges a JSON value into an existing extraction config via field-by-field override.
|
||||
fn merge_json_into_config(
|
||||
base_config: &kreuzberg::ExtractionConfig,
|
||||
json_value: serde_json::Value,
|
||||
) -> Result<kreuzberg::ExtractionConfig> {
|
||||
let json_str = serde_json::to_string(&json_value).map_err(|e| anyhow::anyhow!("{}", e))?;
|
||||
kreuzberg::core::config::merge::merge_config_json(base_config, &json_str).map_err(|e| anyhow::anyhow!("{}", e))
|
||||
}
|
||||
|
||||
fn main() -> Result<()> {
|
||||
let cli = Cli::parse();
|
||||
|
||||
let env_filter = logging::build_env_filter(cli.log_level.as_deref());
|
||||
|
||||
let _ = tracing_subscriber::fmt()
|
||||
.with_env_filter(env_filter)
|
||||
.with_writer(std::io::stderr)
|
||||
.try_init();
|
||||
|
||||
match cli.command {
|
||||
Commands::Extract {
|
||||
path,
|
||||
config: config_path,
|
||||
config_json,
|
||||
config_json_base64,
|
||||
mime_type,
|
||||
format,
|
||||
overrides,
|
||||
} => {
|
||||
validate_file_exists(&path)?;
|
||||
overrides.validate()?;
|
||||
|
||||
let mut config = load_config(config_path)?;
|
||||
apply_json_overrides(&mut config, config_json, config_json_base64)?;
|
||||
overrides.apply(&mut config);
|
||||
|
||||
extract_command(path, config, mime_type, format)?;
|
||||
}
|
||||
|
||||
Commands::ExtractStructured {
|
||||
path,
|
||||
schema,
|
||||
model,
|
||||
api_key,
|
||||
prompt,
|
||||
schema_name,
|
||||
strict,
|
||||
config,
|
||||
format,
|
||||
} => {
|
||||
validate_file_exists(&path)?;
|
||||
validate_file_exists(&schema)?;
|
||||
extract_structured_command(ExtractStructuredArgs {
|
||||
path,
|
||||
schema_path: schema,
|
||||
model,
|
||||
api_key,
|
||||
prompt,
|
||||
schema_name,
|
||||
strict,
|
||||
config_path: config,
|
||||
format,
|
||||
})?;
|
||||
}
|
||||
|
||||
Commands::Batch {
|
||||
paths,
|
||||
config: config_path,
|
||||
config_json,
|
||||
config_json_base64,
|
||||
format,
|
||||
overrides,
|
||||
file_configs,
|
||||
} => {
|
||||
validate_batch_paths(&paths)?;
|
||||
overrides.validate()?;
|
||||
|
||||
let mut config = load_config(config_path)?;
|
||||
apply_json_overrides(&mut config, config_json, config_json_base64)?;
|
||||
overrides.apply(&mut config);
|
||||
|
||||
let file_configs_map = if let Some(file_configs_path) = file_configs {
|
||||
let file_configs_json = std::fs::read_to_string(&file_configs_path)
|
||||
.with_context(|| format!("Failed to read file configs from '{}'", file_configs_path.display()))?;
|
||||
let map: std::collections::HashMap<String, serde_json::Value> =
|
||||
serde_json::from_str(&file_configs_json).with_context(|| {
|
||||
format!(
|
||||
"Failed to parse file configs JSON from '{}'",
|
||||
file_configs_path.display()
|
||||
)
|
||||
})?;
|
||||
Some(map)
|
||||
} else {
|
||||
None
|
||||
};
|
||||
batch_command(paths, file_configs_map, config, format)?;
|
||||
}
|
||||
|
||||
Commands::Detect { path, format } => {
|
||||
validate_file_exists(&path)?;
|
||||
|
||||
let path_str = path.to_string_lossy().to_string();
|
||||
let mime_type = detect_mime_type(path_str.clone(), true).with_context(|| {
|
||||
format!(
|
||||
"Failed to detect MIME type for file '{}'. Ensure the file is readable.",
|
||||
path.display()
|
||||
)
|
||||
})?;
|
||||
|
||||
match format {
|
||||
WireFormat::Text => {
|
||||
println!("{}", style::success(&mime_type));
|
||||
}
|
||||
WireFormat::Json => {
|
||||
let output = json!({
|
||||
"path": path_str,
|
||||
"mime_type": mime_type,
|
||||
});
|
||||
println!(
|
||||
"{}",
|
||||
serde_json::to_string_pretty(&output)
|
||||
.context("Failed to serialize MIME type detection result to JSON")?
|
||||
);
|
||||
}
|
||||
WireFormat::Toon => {
|
||||
let output = json!({
|
||||
"path": path_str,
|
||||
"mime_type": mime_type,
|
||||
});
|
||||
println!(
|
||||
"{}",
|
||||
serde_toon::to_string(&output)
|
||||
.context("Failed to serialize MIME type detection result to TOON")?
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Commands::Formats { format } => {
|
||||
let formats = kreuzberg::core::mime::list_supported_formats();
|
||||
match format {
|
||||
WireFormat::Text => {
|
||||
println!("{:<15} {}", style::label("EXTENSION"), style::label("MIME TYPE"));
|
||||
println!("{}", style::dim(&format!("{:<15} ---------", "---------")));
|
||||
for f in &formats {
|
||||
println!("{:<15} {}", style::success(&format!(".{}", f.extension)), f.mime_type);
|
||||
}
|
||||
}
|
||||
WireFormat::Json => {
|
||||
println!(
|
||||
"{}",
|
||||
serde_json::to_string_pretty(&formats).context("Failed to serialize formats to JSON")?
|
||||
);
|
||||
}
|
||||
WireFormat::Toon => {
|
||||
println!(
|
||||
"{}",
|
||||
serde_toon::to_string(&formats).context("Failed to serialize formats to TOON")?
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Commands::Version { format } => {
|
||||
let version = env!("CARGO_PKG_VERSION");
|
||||
let name = env!("CARGO_PKG_NAME");
|
||||
|
||||
match format {
|
||||
WireFormat::Text => {
|
||||
println!("{} {}", style::label(name), style::success(version));
|
||||
}
|
||||
WireFormat::Json => {
|
||||
let output = json!({
|
||||
"name": name,
|
||||
"version": version,
|
||||
});
|
||||
println!(
|
||||
"{}",
|
||||
serde_json::to_string_pretty(&output)
|
||||
.context("Failed to serialize version information to JSON")?
|
||||
);
|
||||
}
|
||||
WireFormat::Toon => {
|
||||
let output = json!({
|
||||
"name": name,
|
||||
"version": version,
|
||||
});
|
||||
println!(
|
||||
"{}",
|
||||
serde_toon::to_string(&output).context("Failed to serialize version information to TOON")?
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(feature = "api")]
|
||||
Commands::Serve {
|
||||
host: cli_host,
|
||||
port: cli_port,
|
||||
config: config_path,
|
||||
} => {
|
||||
let mut extraction_config = load_config(config_path.clone())?;
|
||||
extraction_config.apply_env_overrides()?;
|
||||
serve_command(cli_host, cli_port, extraction_config, config_path)?;
|
||||
}
|
||||
|
||||
#[cfg(feature = "mcp")]
|
||||
Commands::Mcp {
|
||||
config: config_path,
|
||||
transport,
|
||||
#[cfg(feature = "mcp-http")]
|
||||
host,
|
||||
#[cfg(feature = "mcp-http")]
|
||||
port,
|
||||
#[cfg(not(feature = "mcp-http"))]
|
||||
host,
|
||||
#[cfg(not(feature = "mcp-http"))]
|
||||
port,
|
||||
} => {
|
||||
let mut config = load_config(config_path)?;
|
||||
config.apply_env_overrides()?;
|
||||
mcp_command(config, transport, host, port)?;
|
||||
}
|
||||
|
||||
Commands::Cache { command } => match command {
|
||||
CacheCommands::Stats { cache_dir, format } => {
|
||||
stats_command(cache_dir, format)?;
|
||||
}
|
||||
CacheCommands::Clear { cache_dir, format } => {
|
||||
clear_command(cache_dir, format)?;
|
||||
}
|
||||
CacheCommands::Manifest { format } => {
|
||||
manifest_command(format)?;
|
||||
}
|
||||
CacheCommands::Warm {
|
||||
cache_dir,
|
||||
format,
|
||||
all_embeddings,
|
||||
embedding_model,
|
||||
all_table_models,
|
||||
all_grammars,
|
||||
grammar_groups,
|
||||
grammars,
|
||||
} => {
|
||||
warm_command(
|
||||
cache_dir,
|
||||
format,
|
||||
all_embeddings,
|
||||
embedding_model,
|
||||
all_table_models,
|
||||
all_grammars,
|
||||
grammar_groups,
|
||||
grammars,
|
||||
)?;
|
||||
}
|
||||
},
|
||||
|
||||
#[cfg(feature = "api")]
|
||||
Commands::Api { command } => match command {
|
||||
ApiCommands::Schema => {
|
||||
println!("{}", kreuzberg::api::openapi::openapi_json());
|
||||
}
|
||||
},
|
||||
|
||||
#[cfg(feature = "embeddings")]
|
||||
Commands::Embed {
|
||||
text,
|
||||
preset,
|
||||
provider,
|
||||
model,
|
||||
api_key,
|
||||
plugin,
|
||||
format,
|
||||
} => {
|
||||
let texts = if text.is_empty() {
|
||||
vec![commands::read_stdin()?]
|
||||
} else {
|
||||
text
|
||||
};
|
||||
embed_command(texts, &preset, &provider, model, api_key, plugin, format)?;
|
||||
}
|
||||
|
||||
Commands::Chunk {
|
||||
text,
|
||||
config: config_path,
|
||||
chunk_size,
|
||||
chunk_overlap,
|
||||
chunker_type,
|
||||
chunking_tokenizer,
|
||||
topic_threshold,
|
||||
format,
|
||||
} => {
|
||||
let input = match text {
|
||||
Some(t) => t,
|
||||
None => commands::read_stdin().context("No --text provided and failed to read from stdin")?,
|
||||
};
|
||||
|
||||
validate_chunk_params(chunk_size, chunk_overlap)?;
|
||||
|
||||
let base_config = load_config(config_path)?;
|
||||
let mut chunking_config = base_config.chunking.unwrap_or_default();
|
||||
|
||||
if let Some(size) = chunk_size {
|
||||
chunking_config.max_characters = size;
|
||||
// If user set chunk_size but not overlap, clamp overlap to fit
|
||||
if chunk_overlap.is_none() && chunking_config.overlap >= size {
|
||||
chunking_config.overlap = size / 4;
|
||||
}
|
||||
}
|
||||
if let Some(overlap) = chunk_overlap {
|
||||
chunking_config.overlap = overlap;
|
||||
}
|
||||
match chunker_type.as_str() {
|
||||
"markdown" => chunking_config.chunker_type = kreuzberg::ChunkerType::Markdown,
|
||||
"yaml" => chunking_config.chunker_type = kreuzberg::ChunkerType::Yaml,
|
||||
"semantic" => chunking_config.chunker_type = kreuzberg::ChunkerType::Semantic,
|
||||
_ => chunking_config.chunker_type = kreuzberg::ChunkerType::Text,
|
||||
}
|
||||
if let Some(ref tokenizer) = chunking_tokenizer {
|
||||
chunking_config.sizing = kreuzberg::ChunkSizing::Tokenizer {
|
||||
model: tokenizer.clone(),
|
||||
cache_dir: None,
|
||||
};
|
||||
}
|
||||
if topic_threshold.is_some() {
|
||||
chunking_config.topic_threshold = topic_threshold;
|
||||
}
|
||||
|
||||
chunk_command(input, chunking_config, format)?;
|
||||
}
|
||||
|
||||
Commands::Completions { shell } => {
|
||||
let mut cmd = Cli::command();
|
||||
clap_complete::generate(shell, &mut cmd, "kreuzberg", &mut std::io::stdout());
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
32
crates/kreuzberg-cli/src/output.rs
Normal file
32
crates/kreuzberg-cli/src/output.rs
Normal file
@@ -0,0 +1,32 @@
|
||||
//! JSON envelope types for CLI output.
|
||||
//!
|
||||
//! When `--format json` is used, extraction results are wrapped in these envelopes
|
||||
//! so tooling (such as the benchmark harness) can read timing information without
|
||||
//! parsing stderr or running a separate profiling tool.
|
||||
|
||||
use kreuzberg::ExtractionResult;
|
||||
use serde::Serialize;
|
||||
|
||||
/// Single-file extraction result with wall-clock timing.
|
||||
///
|
||||
/// Emitted to stdout by `kreuzberg extract --format json`.
|
||||
#[derive(Debug, Serialize)]
|
||||
pub struct ExtractEnvelope {
|
||||
/// The extraction result (content, metadata, tables, …).
|
||||
pub result: ExtractionResult,
|
||||
/// Wall-clock time for the extraction call in milliseconds.
|
||||
pub extraction_time_ms: f64,
|
||||
}
|
||||
|
||||
/// Batch extraction results with per-file and total timing.
|
||||
///
|
||||
/// Emitted to stdout by `kreuzberg batch --format json`.
|
||||
#[derive(Debug, Serialize)]
|
||||
pub struct BatchEnvelope {
|
||||
/// One result per input file, in input order.
|
||||
pub results: Vec<ExtractionResult>,
|
||||
/// Total wall-clock time for the whole batch in milliseconds.
|
||||
pub total_ms: f64,
|
||||
/// Per-file wall-clock times in milliseconds, aligned with `results`.
|
||||
pub per_file_ms: Vec<f64>,
|
||||
}
|
||||
104
crates/kreuzberg-cli/src/style.rs
Normal file
104
crates/kreuzberg-cli/src/style.rs
Normal file
@@ -0,0 +1,104 @@
|
||||
//! CLI color styling helpers using `anstyle`.
|
||||
//!
|
||||
//! Provides styled output for the kreuzberg CLI. Respects the `NO_COLOR`
|
||||
//! environment variable (<https://no-color.org/>) and disables colors
|
||||
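//! when it is set. NOTE(review): no terminal detection is visible in this
//! module — confirm whether callers handle non-terminal output themselves.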
|
||||
|
||||
use anstyle::{AnsiColor, Effects, Style};
|
||||
use std::sync::OnceLock;
|
||||
|
||||
/// Bold blue for section headers.
|
||||
const HEADER: Style = Style::new()
|
||||
.fg_color(Some(anstyle::Color::Ansi(AnsiColor::Blue)))
|
||||
.effects(Effects::BOLD);
|
||||
|
||||
/// Green for success values (MIME types, file paths, versions).
|
||||
const SUCCESS: Style = Style::new().fg_color(Some(anstyle::Color::Ansi(AnsiColor::Green)));
|
||||
|
||||
/// Dim for metadata, separators, secondary info.
|
||||
const DIM: Style = Style::new().effects(Effects::DIMMED);
|
||||
|
||||
/// Bold for labels in key-value pairs.
|
||||
const LABEL: Style = Style::new().effects(Effects::BOLD);
|
||||
|
||||
/// Report whether colored output should be produced.
///
/// The answer is `false` when the `NO_COLOR` environment variable is set
/// (to any value) and `true` otherwise. The environment is inspected only on
/// the first call; the result is cached for the rest of the process.
///
/// See <https://no-color.org/> for the specification.
pub fn is_color_enabled() -> bool {
    static COLOR_ALLOWED: OnceLock<bool> = OnceLock::new();

    let allowed = COLOR_ALLOWED.get_or_init(|| {
        let no_color = std::env::var_os("NO_COLOR");
        no_color.is_none()
    });
    *allowed
}
|
||||
|
||||
/// Apply an `anstyle::Style` to text if colors are enabled.
|
||||
fn styled(text: &str, style: Style) -> String {
|
||||
if is_color_enabled() {
|
||||
format!("{}{}{}", style.render(), text, style.render_reset())
|
||||
} else {
|
||||
text.to_string()
|
||||
}
|
||||
}
|
||||
|
||||
/// Style text as a section header (bold blue).
|
||||
pub fn header(text: &str) -> String {
|
||||
styled(text, HEADER)
|
||||
}
|
||||
|
||||
/// Style text as a success value (green).
|
||||
pub fn success(text: &str) -> String {
|
||||
styled(text, SUCCESS)
|
||||
}
|
||||
|
||||
/// Style text as dim/secondary (dimmed).
|
||||
pub fn dim(text: &str) -> String {
|
||||
styled(text, DIM)
|
||||
}
|
||||
|
||||
/// Style text as a label (bold).
|
||||
pub fn label(text: &str) -> String {
|
||||
styled(text, LABEL)
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Wrapping text with an empty `Style` must leave the text unchanged.
    #[test]
    fn test_styled_returns_plain_text_when_no_color() {
        // `is_color_enabled` caches its answer in a `OnceLock`, so the
        // environment cannot be toggled per test. Exercise the raw rendering
        // logic instead.
        let plain = Style::new();
        let wrapped = format!("{}{}{}", plain.render(), "hello", plain.render_reset());
        // A plain style renders to nothing, leaving only the text itself.
        assert_eq!(wrapped, "hello");
    }

    /// A style with a foreground color must emit ANSI escapes around the text.
    #[test]
    fn test_styled_applies_ansi_when_style_present() {
        let green = Style::new().fg_color(Some(anstyle::Color::Ansi(AnsiColor::Green)));
        let rendered = format!("{}{}{}", green.render(), "ok", green.render_reset());
        // Both the escape introducer and the payload must be present.
        assert!(rendered.contains("\x1b["));
        assert!(rendered.contains("ok"));
    }

    /// Smoke test: every public helper yields non-empty output for non-empty input.
    #[test]
    fn test_helper_functions_return_strings() {
        assert!(!header("h").is_empty());
        assert!(!success("s").is_empty());
        assert!(!dim("d").is_empty());
        assert!(!label("l").is_empty());
    }

    /// The cached color flag must agree with the presence of `NO_COLOR`.
    #[test]
    fn test_is_color_enabled_respects_no_color_env() {
        // The value is cached at first use, so this only checks that the cached
        // answer matches the environment as currently observed.
        let colors_expected = std::env::var_os("NO_COLOR").is_none();
        assert_eq!(is_color_enabled(), colors_expected);
    }
}
|
||||
937
crates/kreuzberg-cli/tests/commands_test.rs
Normal file
937
crates/kreuzberg-cli/tests/commands_test.rs
Normal file
@@ -0,0 +1,937 @@
|
||||
//! Integration tests for CLI commands (extract, detect, batch).
|
||||
//!
|
||||
//! These tests verify that the CLI commands work correctly end-to-end,
|
||||
//! including input validation, file processing, and output formatting.
|
||||
|
||||
use std::path::PathBuf;
|
||||
use std::process::Command;
|
||||
use tempfile::tempdir;
|
||||
|
||||
/// Get the path to the kreuzberg binary.
|
||||
fn get_binary_path() -> String {
|
||||
let manifest_dir = env!("CARGO_MANIFEST_DIR");
|
||||
format!("{}/../../target/debug/kreuzberg", manifest_dir)
|
||||
}
|
||||
|
||||
/// Get the test_documents directory path.
|
||||
fn get_test_documents_dir() -> PathBuf {
|
||||
let manifest_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
|
||||
manifest_dir.parent().unwrap().parent().unwrap().join("test_documents")
|
||||
}
|
||||
|
||||
/// Get a test file path relative to test_documents/.
|
||||
fn get_test_file(relative_path: &str) -> String {
|
||||
get_test_documents_dir()
|
||||
.join(relative_path)
|
||||
.to_string_lossy()
|
||||
.to_string()
|
||||
}
|
||||
|
||||
/// Build the binary before running tests.
///
/// Every test calls this, so the actual `cargo build` is guarded by a
/// process-wide [`std::sync::Once`]: the first caller performs the build while
/// concurrent callers wait, and later calls return immediately.
///
/// # Panics
///
/// Panics if `cargo` cannot be spawned or the build fails. A failed build
/// poisons the guard, so every later caller panics as well instead of running
/// against a stale or missing binary.
fn build_binary() {
    static BUILD_ONCE: std::sync::Once = std::sync::Once::new();

    BUILD_ONCE.call_once(|| {
        let status = Command::new("cargo")
            .args(["build", "--bin", "kreuzberg"])
            .status()
            .expect("Failed to build kreuzberg binary");

        assert!(status.success(), "Failed to build kreuzberg binary");
    });
}
|
||||
|
||||
/// `extract` on the plain-text fixture succeeds and prints non-empty output.
#[test]
fn test_extract_text_file() {
    build_binary();

    let fixture = get_test_file("text/simple.txt");
    let fixture_present = std::path::Path::new(&fixture).exists();
    if !fixture_present {
        tracing::debug!("Skipping test: {} not found", fixture);
        return;
    }

    let mut cli = Command::new(get_binary_path());
    cli.arg("extract").arg(&fixture);
    let run = cli.output().expect("Failed to execute extract command");

    assert!(
        run.status.success(),
        "Extract command failed: {}",
        String::from_utf8_lossy(&run.stderr)
    );

    let out_text = String::from_utf8_lossy(&run.stdout);
    assert!(!out_text.is_empty(), "Extract output should not be empty");
}
|
||||
|
||||
/// `extract --format json` prints a timing envelope wrapping the extraction result.
#[test]
fn test_extract_with_json_output() {
    build_binary();

    let fixture = get_test_file("text/simple.txt");
    if !std::path::Path::new(&fixture).exists() {
        tracing::debug!("Skipping test: {} not found", fixture);
        return;
    }

    let mut cli = Command::new(get_binary_path());
    cli.arg("extract").arg(&fixture).args(["--format", "json"]);
    let run = cli.output().expect("Failed to execute extract command");

    assert!(
        run.status.success(),
        "Extract command failed: {}",
        String::from_utf8_lossy(&run.stderr)
    );

    let out_text = String::from_utf8_lossy(&run.stdout);
    let envelope: serde_json::Value = match serde_json::from_str(&out_text) {
        Ok(value) => value,
        Err(_) => panic!("Output should be valid JSON, got: {}", out_text),
    };

    // Expected envelope shape: { result: ExtractionResult, extraction_time_ms: f64 }
    assert!(
        envelope.get("result").is_some(),
        "JSON envelope should have 'result' field"
    );
    assert!(
        envelope.get("extraction_time_ms").is_some(),
        "JSON envelope should have 'extraction_time_ms' field"
    );

    let inner = &envelope["result"];
    assert!(inner.get("content").is_some(), "result should have 'content' field");
    assert!(inner.get("mime_type").is_some(), "result should have 'mime_type' field");
}
|
||||
|
||||
/// `extract` with chunking flags returns a `chunks` array inside the JSON envelope.
#[test]
fn test_extract_with_chunking() {
    build_binary();

    let fixture = get_test_file("text/simple.txt");
    if !std::path::Path::new(&fixture).exists() {
        tracing::debug!("Skipping test: {} not found", fixture);
        return;
    }

    let chunk_flags = [
        "--chunk",
        "true",
        "--chunk-size",
        "100",
        "--chunk-overlap",
        "20",
        "--format",
        "json",
    ];
    let run = Command::new(get_binary_path())
        .arg("extract")
        .arg(&fixture)
        .args(chunk_flags)
        .output()
        .expect("Failed to execute extract command");

    assert!(
        run.status.success(),
        "Extract with chunking failed: {}",
        String::from_utf8_lossy(&run.stderr)
    );

    let out_text = String::from_utf8_lossy(&run.stdout);
    let envelope: serde_json::Value = serde_json::from_str(&out_text).expect("Should be valid JSON");

    // Chunks are nested under the envelope's `result` object.
    let inner = &envelope["result"];
    assert!(inner.get("chunks").is_some(), "result should have 'chunks' field");
    assert!(inner["chunks"].is_array(), "'chunks' should be an array");
}
|
||||
|
||||
/// `extract` on a nonexistent path fails and reports "File not found".
#[test]
fn test_extract_file_not_found() {
    build_binary();

    let mut cli = Command::new(get_binary_path());
    cli.arg("extract").arg("/nonexistent/file.txt");
    let run = cli.output().expect("Failed to execute extract command");

    let failed = !run.status.success();
    assert!(failed, "Extract should fail for nonexistent file");

    let err_text = String::from_utf8_lossy(&run.stderr);
    let mentions_missing = err_text.contains("File not found");
    assert!(
        mentions_missing,
        "Error should mention file not found, got: {}",
        err_text
    );
}
|
||||
|
||||
/// `extract` on a directory fails with an error saying the path is not a file.
#[test]
fn test_extract_directory_not_file() {
    build_binary();

    // The guard keeps the temporary directory alive until the end of the test.
    let dir_guard = tempdir().expect("Failed to create temp dir");
    let dir_arg = dir_guard.path().to_string_lossy().into_owned();

    let mut cli = Command::new(get_binary_path());
    cli.arg("extract").arg(&dir_arg);
    let run = cli.output().expect("Failed to execute extract command");

    let failed = !run.status.success();
    assert!(failed, "Extract should fail for directory");

    let err_text = String::from_utf8_lossy(&run.stderr);
    let explains = ["not a file", "regular file"]
        .iter()
        .any(|needle| err_text.contains(*needle));
    assert!(
        explains,
        "Error should mention path is not a file, got: {}",
        err_text
    );
}
|
||||
|
||||
/// `extract --chunk-size 0` is rejected with a chunk-size error.
#[test]
fn test_extract_invalid_chunk_size_zero() {
    build_binary();

    let fixture = get_test_file("text/simple.txt");
    if !std::path::Path::new(&fixture).exists() {
        tracing::debug!("Skipping test: {} not found", fixture);
        return;
    }

    let run = Command::new(get_binary_path())
        .arg("extract")
        .arg(&fixture)
        .args(["--chunk-size", "0"])
        .output()
        .expect("Failed to execute extract command");

    let failed = !run.status.success();
    assert!(failed, "Extract should fail for chunk size 0");

    let err_text = String::from_utf8_lossy(&run.stderr);
    let explains = ["Invalid chunk size", "must be greater than 0"]
        .iter()
        .any(|needle| err_text.contains(*needle));
    assert!(
        explains,
        "Error should mention invalid chunk size, got: {}",
        err_text
    );
}
|
||||
|
||||
/// `extract --chunk-size 2000000` is rejected as exceeding the chunk-size limit.
#[test]
fn test_extract_invalid_chunk_size_too_large() {
    build_binary();

    let fixture = get_test_file("text/simple.txt");
    if !std::path::Path::new(&fixture).exists() {
        tracing::debug!("Skipping test: {} not found", fixture);
        return;
    }

    let run = Command::new(get_binary_path())
        .arg("extract")
        .arg(&fixture)
        .args(["--chunk-size", "2000000"])
        .output()
        .expect("Failed to execute extract command");

    let failed = !run.status.success();
    assert!(failed, "Extract should fail for chunk size > 1M");

    let err_text = String::from_utf8_lossy(&run.stderr);
    let explains = ["Invalid chunk size", "1,000,000"]
        .iter()
        .any(|needle| err_text.contains(*needle));
    assert!(
        explains,
        "Error should mention chunk size limit, got: {}",
        err_text
    );
}
|
||||
|
||||
/// `extract` is rejected when the chunk overlap equals the chunk size.
#[test]
fn test_extract_invalid_overlap_equals_chunk_size() {
    build_binary();

    let fixture = get_test_file("text/simple.txt");
    if !std::path::Path::new(&fixture).exists() {
        tracing::debug!("Skipping test: {} not found", fixture);
        return;
    }

    let run = Command::new(get_binary_path())
        .arg("extract")
        .arg(&fixture)
        .args(["--chunk-size", "100", "--chunk-overlap", "100"])
        .output()
        .expect("Failed to execute extract command");

    let failed = !run.status.success();
    assert!(failed, "Extract should fail when overlap equals chunk size");

    let err_text = String::from_utf8_lossy(&run.stderr);
    let explains = ["Invalid chunk overlap", "must be less than chunk size"]
        .iter()
        .any(|needle| err_text.contains(*needle));
    assert!(
        explains,
        "Error should mention overlap constraint, got: {}",
        err_text
    );
}
|
||||
|
||||
/// `detect` on the plain-text fixture succeeds and reports a text MIME type.
#[test]
fn test_detect_mime_type() {
    build_binary();

    let fixture = get_test_file("text/simple.txt");
    if !std::path::Path::new(&fixture).exists() {
        tracing::debug!("Skipping test: {} not found", fixture);
        return;
    }

    let mut cli = Command::new(get_binary_path());
    cli.arg("detect").arg(&fixture);
    let run = cli.output().expect("Failed to execute detect command");

    assert!(
        run.status.success(),
        "Detect command failed: {}",
        String::from_utf8_lossy(&run.stderr)
    );

    let out_text = String::from_utf8_lossy(&run.stdout);
    assert!(!out_text.is_empty(), "Detect output should not be empty");

    let looks_like_text = ["text/plain", "text"]
        .iter()
        .any(|needle| out_text.contains(*needle));
    assert!(
        looks_like_text,
        "Should detect text MIME type, got: {}",
        out_text
    );
}
|
||||
|
||||
/// `detect --format json` prints a JSON object with `mime_type` and `path`.
#[test]
fn test_detect_with_json_output() {
    build_binary();

    let fixture = get_test_file("text/simple.txt");
    if !std::path::Path::new(&fixture).exists() {
        tracing::debug!("Skipping test: {} not found", fixture);
        return;
    }

    let mut cli = Command::new(get_binary_path());
    cli.arg("detect").arg(&fixture).args(["--format", "json"]);
    let run = cli.output().expect("Failed to execute detect command");

    assert!(
        run.status.success(),
        "Detect command failed: {}",
        String::from_utf8_lossy(&run.stderr)
    );

    let out_text = String::from_utf8_lossy(&run.stdout);
    let parsed: serde_json::Value = match serde_json::from_str(&out_text) {
        Ok(value) => value,
        Err(_) => panic!("Output should be valid JSON, got: {}", out_text),
    };

    assert!(parsed.get("mime_type").is_some(), "JSON should have 'mime_type' field");
    assert!(parsed.get("path").is_some(), "JSON should have 'path' field");
}
|
||||
|
||||
/// `detect` on a nonexistent path fails and reports "File not found".
#[test]
fn test_detect_file_not_found() {
    build_binary();

    let mut cli = Command::new(get_binary_path());
    cli.arg("detect").arg("/nonexistent/file.txt");
    let run = cli.output().expect("Failed to execute detect command");

    let failed = !run.status.success();
    assert!(failed, "Detect should fail for nonexistent file");

    let err_text = String::from_utf8_lossy(&run.stderr);
    let mentions_missing = err_text.contains("File not found");
    assert!(
        mentions_missing,
        "Error should mention file not found, got: {}",
        err_text
    );
}
|
||||
|
||||
/// `batch --format json` over two inputs yields an envelope with two results.
#[test]
fn test_batch_multiple_files() {
    build_binary();

    // The same fixture is passed twice; only the number of results matters.
    let first = get_test_file("text/simple.txt");
    let second = get_test_file("text/simple.txt");

    if !std::path::Path::new(&first).exists() {
        tracing::debug!("Skipping test: {} not found", first);
        return;
    }

    let mut cli = Command::new(get_binary_path());
    cli.arg("batch").arg(&first).arg(&second).args(["--format", "json"]);
    let run = cli.output().expect("Failed to execute batch command");

    assert!(
        run.status.success(),
        "Batch command failed: {}",
        String::from_utf8_lossy(&run.stderr)
    );

    let out_text = String::from_utf8_lossy(&run.stdout);
    let envelope: serde_json::Value = match serde_json::from_str(&out_text) {
        Ok(value) => value,
        Err(_) => panic!("Output should be valid JSON, got: {}", out_text),
    };

    // Expected envelope shape: { results: [...], total_ms, per_file_ms }
    assert!(
        envelope.get("results").is_some(),
        "Batch envelope should have 'results' field"
    );
    let results = &envelope["results"];
    assert!(results.is_array(), "'results' should be a JSON array");
    assert_eq!(results.as_array().unwrap().len(), 2, "Should have 2 results");
}
|
||||
|
||||
/// `batch` fails as a whole when one of its inputs does not exist.
#[test]
fn test_batch_with_missing_file() {
    build_binary();

    let present = get_test_file("text/simple.txt");
    if !std::path::Path::new(&present).exists() {
        tracing::debug!("Skipping test: {} not found", present);
        return;
    }

    let mut cli = Command::new(get_binary_path());
    cli.arg("batch").arg(&present).arg("/nonexistent/file.txt");
    let run = cli.output().expect("Failed to execute batch command");

    let failed = !run.status.success();
    assert!(failed, "Batch should fail when one file is missing");

    let err_text = String::from_utf8_lossy(&run.stderr);
    let explains = ["File not found", "Invalid file"]
        .iter()
        .any(|needle| err_text.contains(*needle));
    assert!(
        explains,
        "Error should mention file not found, got: {}",
        err_text
    );
}
|
||||
|
||||
/// `extract --help` succeeds and lists the description and chunking flags.
#[test]
fn test_extract_help() {
    build_binary();

    let mut cli = Command::new(get_binary_path());
    cli.arg("extract").arg("--help");
    let output = cli.output().expect("Failed to execute extract --help");

    assert!(output.status.success());

    let stdout = String::from_utf8_lossy(&output.stdout);
    assert!(stdout.contains("Extract text from a document"));
    assert!(stdout.contains("--chunk-size"));
    assert!(stdout.contains("--chunk-overlap"));
}
|
||||
|
||||
/// `detect --help` succeeds and shows the command description.
#[test]
fn test_detect_help() {
    build_binary();

    let mut cli = Command::new(get_binary_path());
    cli.arg("detect").arg("--help");
    let output = cli.output().expect("Failed to execute detect --help");

    assert!(output.status.success());

    let stdout = String::from_utf8_lossy(&output.stdout);
    assert!(stdout.contains("Detect MIME type"));
}
|
||||
|
||||
/// `batch --help` succeeds and shows the command description.
#[test]
fn test_batch_help() {
    build_binary();

    let mut cli = Command::new(get_binary_path());
    cli.arg("batch").arg("--help");
    let output = cli.output().expect("Failed to execute batch --help");

    assert!(output.status.success());

    let stdout = String::from_utf8_lossy(&output.stdout);
    assert!(stdout.contains("Batch extract from multiple documents"));
}
|
||||
|
||||
// ── Extract command flag parsing tests ──────────────────────────────
|
||||
|
||||
/// `extract --help` must list every extraction-override flag.
#[test]
fn test_extract_help_shows_all_extraction_override_flags() {
    build_binary();

    let mut cli = Command::new(get_binary_path());
    cli.arg("extract").arg("--help");
    let output = cli.output().expect("Failed to execute extract --help");

    assert!(output.status.success());

    let help_text = String::from_utf8_lossy(&output.stdout);

    // Every ExtractionOverrides flag expected in the help output.
    let expected_flags = [
        "--ocr",
        "--ocr-backend",
        "--ocr-language",
        "--force-ocr",
        "--no-cache",
        "--ocr-auto-rotate",
        "--chunk",
        "--chunk-size",
        "--chunk-overlap",
        "--chunking-tokenizer",
        "--content-format",
        "--include-structure",
        "--quality",
        "--detect-language",
        "--layout",
        "--layout-confidence",
        "--layout-table-model",
        "--acceleration",
        "--max-concurrent",
        "--max-threads",
        "--extract-pages",
        "--page-markers",
        "--extract-images",
        "--target-dpi",
        "--pdf-password",
        "--token-reduction",
        "--msg-codepage",
    ];

    // Report the first flag (in list order) that the help text lacks.
    let first_missing = expected_flags.iter().find(|flag| !help_text.contains(**flag));
    if let Some(flag) = first_missing {
        panic!(
            "Extract --help should show flag '{}', but it was not found in output:\n{}",
            flag, help_text
        );
    }
}
|
||||
|
||||
// ── Batch command flag parity test ──────────────────────────────────
|
||||
|
||||
/// `batch --help` must expose the same extraction-override flags as `extract --help`.
#[test]
fn test_batch_has_same_extraction_flags_as_extract() {
    build_binary();

    // Runs `<subcommand> --help`; the caller attaches its own failure message.
    let help_for = |subcommand: &str| Command::new(get_binary_path()).args([subcommand, "--help"]).output();

    let extract_output = help_for("extract").expect("Failed to execute extract --help");
    let batch_output = help_for("batch").expect("Failed to execute batch --help");

    assert!(extract_output.status.success());
    assert!(batch_output.status.success());

    let extract_text = String::from_utf8_lossy(&extract_output.stdout);
    let batch_text = String::from_utf8_lossy(&batch_output.stdout);

    // Flags that both commands are expected to accept.
    let shared_flags = [
        "--ocr",
        "--ocr-backend",
        "--ocr-language",
        "--force-ocr",
        "--no-cache",
        "--chunk",
        "--chunk-size",
        "--chunk-overlap",
        "--content-format",
        "--quality",
        "--detect-language",
        "--layout",
        "--layout-confidence",
        "--layout-table-model",
        "--acceleration",
        "--max-concurrent",
        "--max-threads",
        "--extract-pages",
        "--page-markers",
        "--extract-images",
        "--target-dpi",
        "--pdf-password",
        "--token-reduction",
        "--msg-codepage",
    ];

    for flag in shared_flags.iter() {
        let on_extract = extract_text.contains(*flag);
        assert!(on_extract, "Extract should have flag '{}' but it's missing", flag);

        let on_batch = batch_text.contains(*flag);
        assert!(
            on_batch,
            "Batch should have flag '{}' (parity with extract) but it's missing",
            flag
        );
    }
}
|
||||
|
||||
// ── Validation error tests ──────────────────────────────────────────
|
||||
//
|
||||
// NOTE: The CLI validates file existence *before* override validation,
|
||||
// so we must provide a real file to reach the override validation stage.
|
||||
|
||||
/// Create a temporary file and return its path as a String.
|
||||
/// The caller must keep the returned `tempfile::TempDir` alive for the
|
||||
/// duration of the test so the file is not deleted.
|
||||
fn create_temp_file() -> (tempfile::TempDir, String) {
|
||||
let dir = tempdir().expect("Failed to create temp dir");
|
||||
let file_path = dir.path().join("dummy.pdf");
|
||||
std::fs::write(&file_path, b"dummy content").expect("Failed to write temp file");
|
||||
let path_str = file_path.to_string_lossy().to_string();
|
||||
(dir, path_str)
|
||||
}
|
||||
|
||||
/// `extract --chunk-size 0` on an existing file fails with a chunk-size error.
#[test]
fn test_extract_chunk_size_zero_error() {
    build_binary();
    let (_guard, input) = create_temp_file();

    let mut cli = Command::new(get_binary_path());
    cli.args(["extract", "--chunk-size", "0"]).arg(&input);
    let run = cli.output().expect("Failed to execute extract command");

    let failed = !run.status.success();
    assert!(failed, "Should fail when chunk size is 0");

    let err_text = String::from_utf8_lossy(&run.stderr);
    let explains = ["chunk size", "Chunk size", "Invalid chunk size"]
        .iter()
        .any(|needle| err_text.contains(*needle));
    assert!(explains, "Error should mention chunk size, got: {}", err_text);
}
|
||||
|
||||
/// `extract` fails when the chunk overlap is larger than the chunk size.
#[test]
fn test_extract_chunk_overlap_exceeds_size_error() {
    build_binary();
    let (_guard, input) = create_temp_file();

    let mut cli = Command::new(get_binary_path());
    cli.args(["extract", "--chunk-size", "10", "--chunk-overlap", "20"])
        .arg(&input);
    let run = cli.output().expect("Failed to execute extract command");

    let failed = !run.status.success();
    assert!(failed, "Should fail when overlap exceeds chunk size");

    let err_text = String::from_utf8_lossy(&run.stderr);
    let explains = ["overlap", "Overlap", "Invalid chunk overlap"]
        .iter()
        .any(|needle| err_text.contains(*needle));
    assert!(
        explains,
        "Error should mention overlap constraint, got: {}",
        err_text
    );
}
|
||||
|
||||
/// `extract --layout-confidence 2.0` fails, either in validation or in argument parsing.
#[test]
fn test_extract_layout_confidence_out_of_range_error() {
    build_binary();
    let (_guard, input) = create_temp_file();

    let mut cli = Command::new(get_binary_path());
    cli.args(["extract", "--layout-confidence", "2.0"]).arg(&input);
    let run = cli.output().expect("Failed to execute extract command");

    // The flag is gated behind the layout-detection feature. A binary built
    // without it makes clap reject the flag as unknown, which is also a failure.
    let failed = !run.status.success();
    assert!(failed, "Should fail for layout confidence out of range");

    let err_text = String::from_utf8_lossy(&run.stderr);
    let explains = ["confidence", "layout", "unexpected argument"]
        .iter()
        .any(|needle| err_text.contains(*needle));
    assert!(
        explains,
        "Error should mention confidence or layout, got: {}",
        err_text
    );
}
|
||||
|
||||
/// `extract --layout false --layout-confidence 0.5` must be rejected.
#[test]
fn test_extract_layout_false_with_confidence_error() {
    build_binary();
    let (_guard, input) = create_temp_file();

    let mut cli = Command::new(get_binary_path());
    cli.args(["extract", "--layout", "false", "--layout-confidence", "0.5"])
        .arg(&input);
    let run = cli.output().expect("Failed to execute extract command");

    // With the layout-detection feature, validation rejects the combination.
    // Without it, clap rejects the unknown flags. Both outcomes are failures.
    let failed = !run.status.success();
    assert!(
        failed,
        "Should fail when --layout false is combined with --layout-confidence"
    );
}
|
||||
|
||||
/// `extract --target-dpi 0` fails with an error referring to the DPI setting.
#[test]
fn test_extract_target_dpi_zero_error() {
    build_binary();
    let (_guard, input) = create_temp_file();

    let mut cli = Command::new(get_binary_path());
    cli.args(["extract", "--target-dpi", "0"]).arg(&input);
    let run = cli.output().expect("Failed to execute extract command");

    let failed = !run.status.success();
    assert!(failed, "Should fail when target DPI is 0");

    let err_text = String::from_utf8_lossy(&run.stderr);
    let explains = ["DPI", "dpi", "target", "Invalid"]
        .iter()
        .any(|needle| err_text.contains(*needle));
    assert!(explains, "Error should mention DPI range, got: {}", err_text);
}
|
||||
|
||||
// ── Completions test ────────────────────────────────────────────────
|
||||
|
||||
/// `completions bash` succeeds and emits a script that references `kreuzberg`.
#[test]
fn test_completions_bash_produces_output() {
    build_binary();

    let output = Command::new(get_binary_path())
        .args(["completions", "bash"])
        .output()
        .expect("Failed to execute completions command");

    assert!(
        output.status.success(),
        "Completions command failed: {}",
        String::from_utf8_lossy(&output.stderr)
    );

    let stdout = String::from_utf8_lossy(&output.stdout);
    assert!(!stdout.is_empty(), "Completions output should not be empty");
    // bash completions should contain the command name
    assert!(
        stdout.contains("kreuzberg"),
        "Bash completions should reference 'kreuzberg', got: {}",
        // Truncate by characters, not bytes: a byte-index slice panics when the
        // cut lands inside a multi-byte character, hiding the real failure.
        stdout.chars().take(200).collect::<String>()
    );
}
|
||||
|
||||
/// `completions zsh` succeeds and prints a non-empty script.
#[test]
fn test_completions_zsh_produces_output() {
    build_binary();

    let mut cli = Command::new(get_binary_path());
    cli.arg("completions").arg("zsh");
    let run = cli.output().expect("Failed to execute completions command");

    assert!(
        run.status.success(),
        "Completions command failed: {}",
        String::from_utf8_lossy(&run.stderr)
    );

    let script = String::from_utf8_lossy(&run.stdout);
    assert!(!script.is_empty(), "Zsh completions output should not be empty");
}
|
||||
|
||||
/// `completions fish` succeeds and prints a non-empty script.
#[test]
fn test_completions_fish_produces_output() {
    build_binary();

    let mut cli = Command::new(get_binary_path());
    cli.arg("completions").arg("fish");
    let run = cli.output().expect("Failed to execute completions command");

    assert!(
        run.status.success(),
        "Completions command failed: {}",
        String::from_utf8_lossy(&run.stderr)
    );

    let script = String::from_utf8_lossy(&run.stdout);
    assert!(!script.is_empty(), "Fish completions output should not be empty");
}
|
||||
|
||||
// ── Embed help test ─────────────────────────────────────────────────
|
||||
|
||||
/// Checks that `kreuzberg embed --help` lists the expected flags and description.
///
/// The `embed` subcommand is feature-gated, so the test returns early when the
/// CLI reports the subcommand as unknown. Any other non-zero exit is a failure
/// and is reported together with the captured stderr.
#[test]
fn test_embed_help_shows_correct_flags() {
    build_binary();

    let output = Command::new(get_binary_path())
        .args(["embed", "--help"])
        .output()
        .expect("Failed to execute embed --help");

    if !output.status.success() {
        let stderr = String::from_utf8_lossy(&output.stderr);
        // If the embed subcommand was not compiled in, skip the test.
        if stderr.contains("unrecognized subcommand") || stderr.contains("invalid subcommand") {
            return;
        }
        // Previously this case fell through to the stdout assertions below and
        // failed with an unrelated message; surface the real error instead.
        panic!("Embed --help failed: {}", stderr);
    }

    let stdout = String::from_utf8_lossy(&output.stdout);
    for flag in ["--text", "--preset", "--format"] {
        assert!(
            stdout.contains(flag),
            "Embed help should show {} flag, got: {}",
            flag,
            stdout
        );
    }
    assert!(
        stdout.contains("Generate embeddings"),
        "Embed help should describe embedding generation, got: {}",
        stdout
    );
}
|
||||
|
||||
// ── Chunk help test ─────────────────────────────────────────────────
|
||||
|
||||
/// Checks that `kreuzberg chunk --help` succeeds and lists every expected flag
/// together with the subcommand description.
#[test]
fn test_chunk_help_shows_correct_flags() {
    build_binary();

    let mut cmd = Command::new(get_binary_path());
    cmd.args(["chunk", "--help"]);
    let result = cmd.output().expect("Failed to execute chunk --help");

    let err_text = String::from_utf8_lossy(&result.stderr);
    assert!(result.status.success(), "Chunk --help failed: {}", err_text);

    let help_text = String::from_utf8_lossy(&result.stdout);
    let expected_flags = ["--text", "--chunk-size", "--chunk-overlap", "--chunker-type", "--format"];
    for flag in expected_flags {
        assert!(
            help_text.contains(flag),
            "Chunk help should show {} flag, got: {}",
            flag,
            help_text
        );
    }
    assert!(
        help_text.contains("Chunk text"),
        "Chunk help should describe text chunking, got: {}",
        help_text
    );
}
|
||||
|
||||
// ── Style module NO_COLOR test ──────────────────────────────────────
|
||||
|
||||
/// With `NO_COLOR=1` in the environment, `kreuzberg detect` must not write ANSI
/// escape sequences to stdout. Returns without asserting when the sample
/// document is not present.
#[test]
fn test_no_color_env_disables_ansi_in_output() {
    build_binary();

    let sample = get_test_file("text/simple.txt");
    let sample_present = PathBuf::from(&sample).exists();
    if !sample_present {
        return;
    }

    let mut cmd = Command::new(get_binary_path());
    cmd.env("NO_COLOR", "1").args(["detect", &sample]);
    let result = cmd.output().expect("Failed to execute detect command");

    let err_text = String::from_utf8_lossy(&result.stderr);
    assert!(result.status.success(), "Detect failed: {}", err_text);

    let printed = String::from_utf8_lossy(&result.stdout);
    let has_escape = printed.contains("\x1b[");
    assert!(
        !has_escape,
        "Output should not contain ANSI escape sequences when NO_COLOR is set, got: {:?}",
        printed
    );
}
|
||||
|
||||
// ── Additional validation edge cases ────────────────────────────────
|
||||
|
||||
/// `extract --chunk-size 2000000` must be rejected with an error that mentions
/// the chunk size limit.
#[test]
fn test_extract_chunk_size_too_large_error() {
    build_binary();
    // Keep the directory handle bound for the duration of the test.
    let (_guard, input_path) = create_temp_file();

    let mut cmd = Command::new(get_binary_path());
    cmd.args(["extract", "--chunk-size", "2000000", &input_path]);
    let result = cmd.output().expect("Failed to execute extract command");

    assert!(!result.status.success(), "Should fail when chunk size exceeds limit");

    let err_text = String::from_utf8_lossy(&result.stderr);
    let mentions_limit = ["chunk size", "Chunk size", "1,000,000"]
        .iter()
        .any(|needle| err_text.contains(*needle));
    assert!(mentions_limit, "Error should mention chunk size limit, got: {}", err_text);
}
|
||||
|
||||
/// `extract --target-dpi 5000` must be rejected with an error that mentions the
/// DPI range.
#[test]
fn test_extract_target_dpi_too_high_error() {
    build_binary();
    // Keep the directory handle bound for the duration of the test.
    let (_guard, input_path) = create_temp_file();

    let mut cmd = Command::new(get_binary_path());
    cmd.args(["extract", "--target-dpi", "5000", &input_path]);
    let result = cmd.output().expect("Failed to execute extract command");

    assert!(!result.status.success(), "Should fail when target DPI exceeds limit");

    let err_text = String::from_utf8_lossy(&result.stderr);
    let mentions_dpi = ["DPI", "dpi", "2400", "Invalid"]
        .iter()
        .any(|needle| err_text.contains(*needle));
    assert!(mentions_dpi, "Error should mention DPI range, got: {}", err_text);
}
|
||||
617
crates/kreuzberg-cli/tests/config_discovery_test.rs
Normal file
617
crates/kreuzberg-cli/tests/config_discovery_test.rs
Normal file
@@ -0,0 +1,617 @@
|
||||
//! Integration tests for CLI config file discovery.
|
||||
//!
|
||||
//! These tests verify that the CLI correctly discovers and loads configuration files
|
||||
//! in various formats (.toml, .yaml, .json) with case-insensitive extension
|
||||
//! matching, explicit --config flag support, and proper error handling.
|
||||
|
||||
use std::fs;
|
||||
use std::path::PathBuf;
|
||||
use std::process::Command;
|
||||
use tempfile::tempdir;
|
||||
|
||||
/// Get the path to the kreuzberg binary.
|
||||
fn get_binary_path() -> String {
|
||||
let manifest_dir = env!("CARGO_MANIFEST_DIR");
|
||||
format!("{}/../../target/debug/kreuzberg", manifest_dir)
|
||||
}
|
||||
|
||||
/// Build the binary before running tests.
///
/// The build is triggered at most once per test executable: every test calls
/// this helper, and spawning a separate `cargo build` from each of them only
/// makes parallel tests queue on the same build.
///
/// # Panics
///
/// Panics if `cargo` cannot be spawned or the build exits with a failure status.
fn build_binary() {
    static BUILD_ONCE: std::sync::Once = std::sync::Once::new();

    BUILD_ONCE.call_once(|| {
        let status = Command::new("cargo")
            .args(["build", "--bin", "kreuzberg"])
            .status()
            .expect("Failed to build kreuzberg binary");

        assert!(status.success(), "Failed to build kreuzberg binary");
    });
}
|
||||
|
||||
/// Get the test_documents directory path.
|
||||
fn get_test_documents_dir() -> PathBuf {
|
||||
let manifest_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
|
||||
manifest_dir.parent().unwrap().parent().unwrap().join("test_documents")
|
||||
}
|
||||
|
||||
/// Get a test file path relative to test_documents/.
|
||||
fn get_test_file(relative_path: &str) -> String {
|
||||
get_test_documents_dir()
|
||||
.join(relative_path)
|
||||
.to_string_lossy()
|
||||
.to_string()
|
||||
}
|
||||
|
||||
/// Extraction run from a directory containing `.kreuzberg.toml` must succeed.
#[test]
fn test_discover_kreuzberg_toml_in_current_directory() {
    build_binary();

    let workdir = tempdir().unwrap();
    let toml_file = workdir.path().join(".kreuzberg.toml");
    let toml_body = r#"
use_cache = false
enable_quality_processing = false
"#;
    fs::write(&toml_file, toml_body).unwrap();

    let sample = get_test_file("text/simple.txt");
    if !PathBuf::from(&sample).exists() {
        tracing::debug!("Skipping test: {} not found", sample);
        return;
    }

    let mut cmd = Command::new(get_binary_path());
    cmd.current_dir(workdir.path()).args(["extract", sample.as_str()]);
    let result = cmd.output().expect("Failed to execute kreuzberg");

    let err_text = String::from_utf8_lossy(&result.stderr);
    assert!(result.status.success(), "Command failed: {}", err_text);
}
|
||||
|
||||
/// Extraction run from a directory containing `.kreuzberg.yaml` must succeed.
#[test]
fn test_discover_kreuzberg_yaml_in_current_directory() {
    build_binary();

    let workdir = tempdir().unwrap();
    let yaml_file = workdir.path().join(".kreuzberg.yaml");
    let yaml_body = r#"
use_cache: false
enable_quality_processing: false
"#;
    fs::write(&yaml_file, yaml_body).unwrap();

    let sample = get_test_file("text/simple.txt");
    if !PathBuf::from(&sample).exists() {
        tracing::debug!("Skipping test: {} not found", sample);
        return;
    }

    let mut cmd = Command::new(get_binary_path());
    cmd.current_dir(workdir.path()).args(["extract", sample.as_str()]);
    let result = cmd.output().expect("Failed to execute kreuzberg");

    let err_text = String::from_utf8_lossy(&result.stderr);
    assert!(result.status.success(), "Command failed: {}", err_text);
}
|
||||
|
||||
/// Extraction run from a directory containing `.kreuzberg.yml` must succeed.
///
/// The config file uses the short `.yml` extension so that this test covers a
/// different file name than the `.yaml` discovery test.
#[test]
fn test_discover_kreuzberg_yml_in_current_directory() {
    build_binary();

    let dir = tempdir().unwrap();
    // Fixed: this previously wrote `.kreuzberg.yaml`, duplicating the yaml test.
    let config_path = dir.path().join(".kreuzberg.yml");

    fs::write(
        &config_path,
        r#"
use_cache: false
enable_quality_processing: false
"#,
    )
    .unwrap();

    let test_file = get_test_file("text/simple.txt");
    if !PathBuf::from(&test_file).exists() {
        tracing::debug!("Skipping test: {} not found", test_file);
        return;
    }

    let output = Command::new(get_binary_path())
        .current_dir(dir.path())
        .args(["extract", test_file.as_str()])
        .output()
        .expect("Failed to execute kreuzberg");

    assert!(
        output.status.success(),
        "Command failed: {}",
        String::from_utf8_lossy(&output.stderr)
    );
}
|
||||
|
||||
/// Extraction run from a directory containing `.kreuzberg.json` must succeed.
#[test]
fn test_discover_kreuzberg_json_in_current_directory() {
    build_binary();

    let workdir = tempdir().unwrap();
    let json_file = workdir.path().join(".kreuzberg.json");
    let json_body = r#"{
"use_cache": false,
"enable_quality_processing": false
}"#;
    fs::write(&json_file, json_body).unwrap();

    let sample = get_test_file("text/simple.txt");
    if !PathBuf::from(&sample).exists() {
        tracing::debug!("Skipping test: {} not found", sample);
        return;
    }

    let mut cmd = Command::new(get_binary_path());
    cmd.current_dir(workdir.path()).args(["extract", sample.as_str()]);
    let result = cmd.output().expect("Failed to execute kreuzberg");

    let err_text = String::from_utf8_lossy(&result.stderr);
    assert!(result.status.success(), "Command failed: {}", err_text);
}
|
||||
|
||||
/// A config passed via `--config` with an upper-case `.TOML` extension must be accepted.
#[test]
fn test_case_insensitive_toml_extension() {
    build_binary();

    let workdir = tempdir().unwrap();
    let cfg_file = workdir.path().join("custom.TOML");
    let cfg_body = r#"
use_cache = false
"#;
    fs::write(&cfg_file, cfg_body).unwrap();

    let sample = get_test_file("text/simple.txt");
    if !PathBuf::from(&sample).exists() {
        tracing::debug!("Skipping test: {} not found", sample);
        return;
    }

    let cfg_arg = cfg_file.to_string_lossy().into_owned();

    let mut cmd = Command::new(get_binary_path());
    cmd.current_dir(workdir.path())
        .args(["extract", "--config", cfg_arg.as_str(), sample.as_str()]);
    let result = cmd.output().expect("Failed to execute kreuzberg");

    let err_text = String::from_utf8_lossy(&result.stderr);
    assert!(result.status.success(), "Command failed: {}", err_text);
}
|
||||
|
||||
/// A config passed via `--config` with a mixed-case `.Yaml` extension must be accepted.
#[test]
fn test_case_insensitive_yaml_extension() {
    build_binary();

    let workdir = tempdir().unwrap();
    let cfg_file = workdir.path().join("custom.Yaml");
    let cfg_body = r#"
use_cache: false
"#;
    fs::write(&cfg_file, cfg_body).unwrap();

    let sample = get_test_file("text/simple.txt");
    if !PathBuf::from(&sample).exists() {
        tracing::debug!("Skipping test: {} not found", sample);
        return;
    }

    let cfg_arg = cfg_file.to_string_lossy().into_owned();

    let mut cmd = Command::new(get_binary_path());
    cmd.current_dir(workdir.path())
        .args(["extract", "--config", cfg_arg.as_str(), sample.as_str()]);
    let result = cmd.output().expect("Failed to execute kreuzberg");

    let err_text = String::from_utf8_lossy(&result.stderr);
    assert!(result.status.success(), "Command failed: {}", err_text);
}
|
||||
|
||||
/// A config passed via `--config` with an upper-case `.YML` extension must be accepted.
#[test]
fn test_case_insensitive_yml_extension() {
    build_binary();

    let workdir = tempdir().unwrap();
    let cfg_file = workdir.path().join("custom.YML");
    let cfg_body = r#"
use_cache: false
"#;
    fs::write(&cfg_file, cfg_body).unwrap();

    let sample = get_test_file("text/simple.txt");
    if !PathBuf::from(&sample).exists() {
        tracing::debug!("Skipping test: {} not found", sample);
        return;
    }

    let cfg_arg = cfg_file.to_string_lossy().into_owned();

    let mut cmd = Command::new(get_binary_path());
    cmd.current_dir(workdir.path())
        .args(["extract", "--config", cfg_arg.as_str(), sample.as_str()]);
    let result = cmd.output().expect("Failed to execute kreuzberg");

    let err_text = String::from_utf8_lossy(&result.stderr);
    assert!(result.status.success(), "Command failed: {}", err_text);
}
|
||||
|
||||
/// A config passed via `--config` with an upper-case `.JSON` extension must be accepted.
#[test]
fn test_case_insensitive_json_extension() {
    build_binary();

    let workdir = tempdir().unwrap();
    let cfg_file = workdir.path().join("custom.JSON");
    let cfg_body = r#"{
"use_cache": false
}"#;
    fs::write(&cfg_file, cfg_body).unwrap();

    let sample = get_test_file("text/simple.txt");
    if !PathBuf::from(&sample).exists() {
        tracing::debug!("Skipping test: {} not found", sample);
        return;
    }

    let cfg_arg = cfg_file.to_string_lossy().into_owned();

    let mut cmd = Command::new(get_binary_path());
    cmd.current_dir(workdir.path())
        .args(["extract", "--config", cfg_arg.as_str(), sample.as_str()]);
    let result = cmd.output().expect("Failed to execute kreuzberg");

    let err_text = String::from_utf8_lossy(&result.stderr);
    assert!(result.status.success(), "Command failed: {}", err_text);
}
|
||||
|
||||
/// `extract --config <file>.toml` with a valid TOML config must succeed.
#[test]
fn test_explicit_config_path_toml() {
    build_binary();

    let workdir = tempdir().unwrap();
    let cfg_file = workdir.path().join("custom_config.toml");
    let cfg_body = r#"
use_cache = false
enable_quality_processing = false
"#;
    fs::write(&cfg_file, cfg_body).unwrap();

    let sample = get_test_file("text/simple.txt");
    if !PathBuf::from(&sample).exists() {
        tracing::debug!("Skipping test: {} not found", sample);
        return;
    }

    let cfg_arg = cfg_file.to_string_lossy().into_owned();

    let mut cmd = Command::new(get_binary_path());
    cmd.args(["extract", "--config", cfg_arg.as_str(), sample.as_str()]);
    let result = cmd.output().expect("Failed to execute kreuzberg");

    let err_text = String::from_utf8_lossy(&result.stderr);
    assert!(result.status.success(), "Command failed: {}", err_text);
}
|
||||
|
||||
/// `extract --config <file>.yaml` with a valid YAML config must succeed.
#[test]
fn test_explicit_config_path_yaml() {
    build_binary();

    let workdir = tempdir().unwrap();
    let cfg_file = workdir.path().join("custom_config.yaml");
    let cfg_body = r#"
use_cache: false
enable_quality_processing: false
"#;
    fs::write(&cfg_file, cfg_body).unwrap();

    let sample = get_test_file("text/simple.txt");
    if !PathBuf::from(&sample).exists() {
        tracing::debug!("Skipping test: {} not found", sample);
        return;
    }

    let cfg_arg = cfg_file.to_string_lossy().into_owned();

    let mut cmd = Command::new(get_binary_path());
    cmd.args(["extract", "--config", cfg_arg.as_str(), sample.as_str()]);
    let result = cmd.output().expect("Failed to execute kreuzberg");

    let err_text = String::from_utf8_lossy(&result.stderr);
    assert!(result.status.success(), "Command failed: {}", err_text);
}
|
||||
|
||||
/// `extract --config <file>.json` with a valid JSON config must succeed.
#[test]
fn test_explicit_config_path_json() {
    build_binary();

    let workdir = tempdir().unwrap();
    let cfg_file = workdir.path().join("custom_config.json");
    let cfg_body = r#"{
"use_cache": false,
"enable_quality_processing": false
}"#;
    fs::write(&cfg_file, cfg_body).unwrap();

    let sample = get_test_file("text/simple.txt");
    if !PathBuf::from(&sample).exists() {
        tracing::debug!("Skipping test: {} not found", sample);
        return;
    }

    let cfg_arg = cfg_file.to_string_lossy().into_owned();

    let mut cmd = Command::new(get_binary_path());
    cmd.args(["extract", "--config", cfg_arg.as_str(), sample.as_str()]);
    let result = cmd.output().expect("Failed to execute kreuzberg");

    let err_text = String::from_utf8_lossy(&result.stderr);
    assert!(result.status.success(), "Command failed: {}", err_text);
}
|
||||
|
||||
/// A `--config` file with an unsupported `.txt` extension must be rejected, and
/// the error must name at least one supported extension.
#[test]
fn test_invalid_config_extension() {
    build_binary();

    let workdir = tempdir().unwrap();
    let cfg_file = workdir.path().join("config.txt");
    fs::write(&cfg_file, "invalid content").unwrap();

    let sample = get_test_file("text/simple.txt");
    if !PathBuf::from(&sample).exists() {
        tracing::debug!("Skipping test: {} not found", sample);
        return;
    }

    let cfg_arg = cfg_file.to_string_lossy().into_owned();

    let mut cmd = Command::new(get_binary_path());
    cmd.args(["extract", "--config", cfg_arg.as_str(), sample.as_str()]);
    let result = cmd.output().expect("Failed to execute kreuzberg");

    assert!(!result.status.success());

    let err_text = String::from_utf8_lossy(&result.stderr);
    let names_extension = [".toml", ".yaml", ".json"]
        .iter()
        .any(|ext| err_text.contains(*ext));
    assert!(
        names_extension,
        "Error message should mention supported extensions: {}",
        err_text
    );
}
|
||||
|
||||
/// A syntactically broken TOML config must make `extract` exit with a failure status.
#[test]
fn test_malformed_toml_config() {
    build_binary();

    let workdir = tempdir().unwrap();
    let broken_cfg = workdir.path().join("bad_config.toml");
    fs::write(&broken_cfg, "use_cache = [[[[[").unwrap();

    let sample = get_test_file("text/simple.txt");
    if !PathBuf::from(&sample).exists() {
        tracing::debug!("Skipping test: {} not found", sample);
        return;
    }

    let cfg_arg = broken_cfg.to_string_lossy().into_owned();

    let mut cmd = Command::new(get_binary_path());
    cmd.args(["extract", "--config", cfg_arg.as_str(), sample.as_str()]);
    let result = cmd.output().expect("Failed to execute kreuzberg");

    assert!(!result.status.success());
}
|
||||
|
||||
/// A syntactically broken YAML config must make `extract` exit with a failure status.
#[test]
fn test_malformed_yaml_config() {
    build_binary();

    let workdir = tempdir().unwrap();
    let broken_cfg = workdir.path().join("bad_config.yaml");
    fs::write(&broken_cfg, "use_cache: [[[[[").unwrap();

    let sample = get_test_file("text/simple.txt");
    if !PathBuf::from(&sample).exists() {
        tracing::debug!("Skipping test: {} not found", sample);
        return;
    }

    let cfg_arg = broken_cfg.to_string_lossy().into_owned();

    let mut cmd = Command::new(get_binary_path());
    cmd.args(["extract", "--config", cfg_arg.as_str(), sample.as_str()]);
    let result = cmd.output().expect("Failed to execute kreuzberg");

    assert!(!result.status.success());
}
|
||||
|
||||
/// A syntactically broken JSON config must make `extract` exit with a failure status.
#[test]
fn test_malformed_json_config() {
    build_binary();

    let workdir = tempdir().unwrap();
    let broken_cfg = workdir.path().join("bad_config.json");
    fs::write(&broken_cfg, r#"{"use_cache": [[[[[}"#).unwrap();

    let sample = get_test_file("text/simple.txt");
    if !PathBuf::from(&sample).exists() {
        tracing::debug!("Skipping test: {} not found", sample);
        return;
    }

    let cfg_arg = broken_cfg.to_string_lossy().into_owned();

    let mut cmd = Command::new(get_binary_path());
    cmd.args(["extract", "--config", cfg_arg.as_str(), sample.as_str()]);
    let result = cmd.output().expect("Failed to execute kreuzberg");

    assert!(!result.status.success());
}
|
||||
|
||||
/// Pointing `--config` at a path that was never created must make `extract` fail.
#[test]
fn test_nonexistent_config_file() {
    build_binary();

    let workdir = tempdir().unwrap();
    // Intentionally never written to disk.
    let missing_cfg = workdir.path().join("nonexistent.toml");

    let sample = get_test_file("text/simple.txt");
    if !PathBuf::from(&sample).exists() {
        tracing::debug!("Skipping test: {} not found", sample);
        return;
    }

    let cfg_arg = missing_cfg.to_string_lossy().into_owned();

    let mut cmd = Command::new(get_binary_path());
    cmd.args(["extract", "--config", cfg_arg.as_str(), sample.as_str()]);
    let result = cmd.output().expect("Failed to execute kreuzberg");

    assert!(!result.status.success());
}
|
||||
|
||||
/// Running `extract` from an empty temporary directory (no config file written
/// by the test) must still succeed.
#[test]
fn test_default_config_when_no_file_found() {
    build_binary();

    let empty_dir = tempdir().unwrap();

    let sample = get_test_file("text/simple.txt");
    if !PathBuf::from(&sample).exists() {
        tracing::debug!("Skipping test: {} not found", sample);
        return;
    }

    let mut cmd = Command::new(get_binary_path());
    cmd.current_dir(empty_dir.path()).args(["extract", sample.as_str()]);
    let result = cmd.output().expect("Failed to execute kreuzberg");

    let err_text = String::from_utf8_lossy(&result.stderr);
    assert!(result.status.success(), "Command failed: {}", err_text);
}
|
||||
|
||||
/// A config whose `use_cache` value is a string instead of a boolean must make
/// `extract` exit with a failure status.
#[test]
fn test_invalid_config_values() {
    build_binary();

    let workdir = tempdir().unwrap();
    let bad_cfg = workdir.path().join("invalid.toml");
    let cfg_body = r#"
use_cache = "not_a_bool"
"#;
    fs::write(&bad_cfg, cfg_body).unwrap();

    let sample = get_test_file("text/simple.txt");
    if !PathBuf::from(&sample).exists() {
        tracing::debug!("Skipping test: {} not found", sample);
        return;
    }

    let cfg_arg = bad_cfg.to_string_lossy().into_owned();

    let mut cmd = Command::new(get_binary_path());
    cmd.args(["extract", "--config", cfg_arg.as_str(), sample.as_str()]);
    let result = cmd.output().expect("Failed to execute kreuzberg");

    assert!(!result.status.success());
}
|
||||
46
crates/kreuzberg-cli/tests/config_env_overrides_test.rs
Normal file
46
crates/kreuzberg-cli/tests/config_env_overrides_test.rs
Normal file
@@ -0,0 +1,46 @@
|
||||
//! Regression test for issue #773.
|
||||
//! Validates that environment variable overrides are correctly applied during configuration loading.
|
||||
|
||||
use kreuzberg::{EmbeddingModelType, ExtractionConfig};
|
||||
|
||||
/// Serializes the tests in this file that mutate process environment variables.
///
/// The test harness runs tests on several threads by default, and changing the
/// environment while another thread reads it is not safe, so every test that
/// calls `set_var`/`remove_var` holds this lock for its whole duration.
static ENV_LOCK: std::sync::Mutex<()> = std::sync::Mutex::new(());

/// Regression #773: `KREUZBERG_OCR_LANGUAGE` must be applied by
/// `apply_env_overrides`, creating the OCR config if necessary.
#[test]
fn test_regression_773_env_override_loading() {
    // A poisoned lock only means another env test panicked; the guard is still usable.
    let _env_guard = ENV_LOCK.lock().unwrap_or_else(|poisoned| poisoned.into_inner());

    let mut config = ExtractionConfig::default();

    if let Some(ref ocr) = config.ocr {
        assert_ne!(ocr.language, "fra");
    }

    // SAFETY: ENV_LOCK is held, so no other test in this file touches the
    // environment concurrently.
    unsafe { std::env::set_var("KREUZBERG_OCR_LANGUAGE", "fra") };
    let applied = config.apply_env_overrides();
    // Clean up before unwrapping so a failure does not leak the variable.
    // SAFETY: same invariant as above; ENV_LOCK is still held.
    unsafe { std::env::remove_var("KREUZBERG_OCR_LANGUAGE") };
    applied.expect("Failed to apply overrides");

    let ocr = config
        .ocr
        .expect("OCR config should be Some when KREUZBERG_OCR_LANGUAGE is set");
    assert_eq!(ocr.language, "fra");
}

/// Regression #773: `KREUZBERG_VLM_EMBEDDING_MODEL` must enable chunking with an
/// LLM embedding model of the given name and no API key.
#[test]
fn test_regression_773_vlm_embedding_env_override() {
    // A poisoned lock only means another env test panicked; the guard is still usable.
    let _env_guard = ENV_LOCK.lock().unwrap_or_else(|poisoned| poisoned.into_inner());

    let mut config = ExtractionConfig::default();

    // SAFETY: ENV_LOCK is held, so no other test in this file touches the
    // environment concurrently.
    unsafe { std::env::set_var("KREUZBERG_VLM_EMBEDDING_MODEL", "openai/text-embedding-3-small") };
    let applied = config.apply_env_overrides();
    // Clean up before unwrapping so a failure does not leak the variable.
    // SAFETY: same invariant as above; ENV_LOCK is still held.
    unsafe { std::env::remove_var("KREUZBERG_VLM_EMBEDDING_MODEL") };
    applied.expect("Failed to apply environment overrides");

    let chunking = config
        .chunking
        .expect("Chunking should be enabled when VLM embedding is set");
    let embedding = chunking.embedding.expect("Embedding should be configured");

    match embedding.model {
        EmbeddingModelType::Llm { llm } => {
            assert_eq!(llm.model, "openai/text-embedding-3-small");
            assert!(llm.api_key.is_none());
        }
        _ => panic!("Expected Llm embedding model type"),
    }
}
|
||||
344
crates/kreuzberg-cli/tests/config_tests.rs
Normal file
344
crates/kreuzberg-cli/tests/config_tests.rs
Normal file
@@ -0,0 +1,344 @@
|
||||
//! CLI configuration tests validating flags, aliases, and deprecation handling.
|
||||
//!
|
||||
//! This test suite verifies that:
|
||||
//! 1. --output-format flag works correctly for all format options
|
||||
//! 2. CLI flags properly override config file settings
|
||||
//! 3. Config merge precedence is maintained (CLI args > config file > defaults)
|
||||
//! 4. Configuration JSON can be passed inline
|
||||
//! 5. Alias handling for deprecated flags works as expected
|
||||
|
||||
#![allow(clippy::bool_assert_comparison)]
|
||||
#![allow(clippy::field_reassign_with_default)]
|
||||
|
||||
use std::path::PathBuf;
|
||||
use tempfile::TempDir;
|
||||
|
||||
/// Helper to create a temporary config file
|
||||
#[allow(dead_code)]
|
||||
fn create_test_config(dir: &TempDir, name: &str, content: &str) -> PathBuf {
|
||||
let config_path = dir.path().join(name);
|
||||
std::fs::write(&config_path, content).expect("Failed to write config file");
|
||||
config_path
|
||||
}
|
||||
|
||||
/// The default `ExtractionConfig` must use the `Plain` output format.
#[test]
fn test_output_format_flag_plain() {
    use kreuzberg::core::config::{ExtractionConfig, OutputFormat};

    let defaults = ExtractionConfig::default();
    assert_eq!(
        defaults.output_format,
        OutputFormat::Plain,
        "Default output format should be Plain"
    );
}
|
||||
|
||||
/// `OutputFormat::Markdown` must have the debug representation `Markdown`.
#[test]
fn test_output_format_flag_markdown() {
    use kreuzberg::core::config::OutputFormat;

    let rendered = format!("{:?}", OutputFormat::Markdown);
    assert_eq!(
        rendered, "Markdown",
        "Markdown format should have correct debug representation"
    );
}
|
||||
|
||||
/// `OutputFormat::Html` must have the debug representation `Html`.
#[test]
fn test_output_format_flag_html() {
    use kreuzberg::core::config::OutputFormat;

    let rendered = format!("{:?}", OutputFormat::Html);
    assert_eq!(
        rendered, "Html",
        "Html format should have correct debug representation"
    );
}
|
||||
|
||||
/// An `ExtractionConfig` with `output_format` set to Markdown must keep that
/// value and serialize it as the lowercase string `markdown`.
#[test]
fn test_extraction_config_with_output_format() {
    use kreuzberg::core::config::{ExtractionConfig, OutputFormat};

    let config = ExtractionConfig {
        output_format: OutputFormat::Markdown,
        ..ExtractionConfig::default()
    };
    assert_eq!(
        config.output_format,
        OutputFormat::Markdown,
        "output_format should be Markdown after assignment"
    );

    let as_json = serde_json::to_value(&config).expect("Failed to serialize");
    assert_eq!(
        as_json["output_format"], "markdown",
        "Serialized output_format should be 'markdown' (lowercase)"
    );
}
|
||||
|
||||
/// A JSON document with several fields set must deserialize into an
/// `ExtractionConfig` carrying those values.
#[test]
fn test_config_json_parsing_complete() {
    use kreuzberg::core::config::{ExtractionConfig, OutputFormat};

    let raw = serde_json::json!({
        "use_cache": true,
        "enable_quality_processing": true,
        "force_ocr": false,
        "output_format": "markdown",
        "result_format": "unified",
        "max_concurrent_extractions": 4,
    });

    let parsed = serde_json::from_value::<ExtractionConfig>(raw).expect("Failed to parse config JSON");

    assert!(parsed.use_cache);
    assert!(parsed.enable_quality_processing);
    assert!(!parsed.force_ocr);
    assert_eq!(parsed.output_format, OutputFormat::Markdown);
    assert_eq!(parsed.max_concurrent_extractions, Some(4));
}
|
||||
|
||||
/// Values assigned on top of the default config (standing in for CLI
/// overrides) must replace the defaults.
#[test]
fn test_config_merge_precedence_cli_overrides_default() {
    use kreuzberg::core::config::ExtractionConfig;

    // Simulate CLI override
    let overridden = ExtractionConfig {
        use_cache: false,
        force_ocr: true,
        ..ExtractionConfig::default()
    };

    assert!(!overridden.use_cache, "CLI override should change use_cache to false");
    assert!(overridden.force_ocr, "CLI override should change force_ocr to true");
}
|
||||
|
||||
/// Overriding one field of a file-style config (standing in for a CLI override)
/// must change that field and leave the others untouched.
#[test]
fn test_config_merge_precedence_cli_overrides_file() {
    use kreuzberg::core::config::ExtractionConfig;

    let from_file = ExtractionConfig {
        use_cache: true,
        force_ocr: false,
        ..ExtractionConfig::default()
    };

    // Simulate CLI override
    let merged = ExtractionConfig {
        use_cache: false,
        ..from_file.clone()
    };

    assert!(!merged.use_cache, "CLI should override file config for use_cache");
    assert!(!merged.force_ocr, "CLI should not affect fields not overridden");
}
|
||||
|
||||
/// Values parsed from a JSON config must differ from the corresponding defaults
/// for `use_cache` and `force_ocr`.
#[test]
fn test_config_file_precedence_over_defaults() {
    use kreuzberg::core::config::ExtractionConfig;

    let raw = serde_json::json!({
        "use_cache": false,
        "force_ocr": true,
    });
    let from_file = serde_json::from_value::<ExtractionConfig>(raw).expect("Failed to parse");
    let defaults = ExtractionConfig::default();

    assert_ne!(
        from_file.use_cache, defaults.use_cache,
        "File config should override default for use_cache"
    );
    assert_ne!(
        from_file.force_ocr, defaults.force_ocr,
        "File config should override default for force_ocr"
    );
}
|
||||
|
||||
/// Each `OutputFormat` variant must serialize to its lowercase name.
#[test]
fn test_output_format_serialization() {
    use kreuzberg::core::config::OutputFormat;

    assert_eq!(
        serde_json::to_value(OutputFormat::Plain).expect("Failed to serialize Plain"),
        "plain"
    );
    assert_eq!(
        serde_json::to_value(OutputFormat::Markdown).expect("Failed to serialize Markdown"),
        "markdown"
    );
    assert_eq!(
        serde_json::to_value(OutputFormat::Html).expect("Failed to serialize Html"),
        "html"
    );
}
|
||||
|
||||
/// Each lowercase format name must deserialize to the matching `OutputFormat` variant.
#[test]
fn test_output_format_deserialization() {
    use kreuzberg::core::config::OutputFormat;

    let from_plain =
        serde_json::from_value::<OutputFormat>(serde_json::json!("plain")).expect("Failed to deserialize plain");
    assert_eq!(from_plain, OutputFormat::Plain);

    let from_markdown =
        serde_json::from_value::<OutputFormat>(serde_json::json!("markdown")).expect("Failed to deserialize markdown");
    assert_eq!(from_markdown, OutputFormat::Markdown);

    let from_html =
        serde_json::from_value::<OutputFormat>(serde_json::json!("html")).expect("Failed to deserialize html");
    assert_eq!(from_html, OutputFormat::Html);
}
|
||||
|
||||
#[test]
fn test_extraction_config_roundtrip_with_output_format() {
    use kreuzberg::core::config::{ExtractionConfig, OutputFormat};

    // Serialize a config with a non-default-looking format to a JSON string,
    // parse it back, and compare the `output_format` field.
    let original = ExtractionConfig {
        output_format: OutputFormat::Markdown,
        ..Default::default()
    };

    let wire = serde_json::to_string(&original).expect("Failed to serialize");
    let restored = serde_json::from_str::<ExtractionConfig>(&wire).expect("Failed to deserialize");

    assert_eq!(
        original.output_format, restored.output_format,
        "output_format should survive serialization roundtrip"
    );
}
|
||||
|
||||
#[test]
fn test_config_with_all_output_formats() {
    use kreuzberg::core::config::{ExtractionConfig, OutputFormat};

    // Every built-in format must survive a config -> JSON value -> config trip.
    for expected in [OutputFormat::Plain, OutputFormat::Markdown, OutputFormat::Html] {
        let config = ExtractionConfig {
            output_format: expected.clone(),
            ..Default::default()
        };

        let as_json = serde_json::to_value(&config).expect("Failed to serialize");
        let restored = serde_json::from_value::<ExtractionConfig>(as_json).expect("Failed to deserialize");

        assert_eq!(
            expected, restored.output_format,
            "Format should be preserved for {:?}",
            expected
        );
    }
}
|
||||
|
||||
#[test]
fn test_config_partial_json_with_output_format() {
    use kreuzberg::core::config::{ExtractionConfig, OutputFormat};

    // A JSON object that sets only `output_format` must still parse.
    let partial = serde_json::json!({ "output_format": "markdown" });
    let config = serde_json::from_value::<ExtractionConfig>(partial).expect("Failed to parse partial config");

    assert_eq!(
        config.output_format,
        OutputFormat::Markdown,
        "output_format should be set from partial config"
    );

    // Fields absent from the JSON fall back to their defaults.
    assert!(config.use_cache, "use_cache should have default value");
}
|
||||
|
||||
#[test]
fn test_config_complete_json_structure() {
    // Serializing the default config must yield a JSON object that carries
    // each of the critical top-level keys listed below.
    let serialized = serde_json::to_value(kreuzberg::core::config::ExtractionConfig::default())
        .expect("Failed to serialize");
    let fields = serialized.as_object().expect("Should be object");

    // Columns: key that must exist, message shown when it is missing.
    let required = [
        ("output_format", "Should have output_format"),
        ("use_cache", "Should have use_cache"),
        ("enable_quality_processing", "Should have enable_quality_processing"),
        ("force_ocr", "Should have force_ocr"),
        ("result_format", "Should have result_format"),
    ];

    for (key, missing) in required {
        assert!(fields.contains_key(key), "{}", missing);
    }
}
|
||||
|
||||
#[test]
fn test_unknown_output_format_accepted_as_custom() {
    use kreuzberg::core::config::{ExtractionConfig, OutputFormat};

    // An unrecognised format name is expected to land in the
    // `OutputFormat::Custom(String)` catch-all instead of failing to parse.
    let input = serde_json::json!({ "output_format": "my_custom_renderer" });

    let config = match serde_json::from_value::<ExtractionConfig>(input) {
        Ok(config) => config,
        Err(err) => panic!(
            "Unknown output_format should be accepted as Custom variant; got: {:?}",
            Some(err)
        ),
    };

    assert_eq!(
        config.output_format,
        OutputFormat::Custom("my_custom_renderer".to_string()),
        "Unknown format string must deserialize as OutputFormat::Custom"
    );
}
|
||||
|
||||
#[test]
fn test_config_case_sensitivity() {
    use kreuzberg::core::config::{ExtractionConfig, OutputFormat};

    // Only the lowercase spelling is exercised here: "plain" must map to
    // `OutputFormat::Plain`. Other casings are not covered by this test.
    let input = serde_json::json!({"output_format": "plain"});

    let config = match serde_json::from_value::<ExtractionConfig>(input) {
        Ok(config) => config,
        Err(_) => panic!("lowercase 'plain' should be accepted"),
    };

    assert_eq!(config.output_format, OutputFormat::Plain);
}
|
||||
|
||||
#[test]
fn test_output_format_field_is_required_in_serialization() {
    // Even an untouched default config must emit an `output_format` key.
    let serialized = serde_json::to_value(kreuzberg::core::config::ExtractionConfig::default())
        .expect("Failed to serialize");

    let has_output_format = matches!(serialized.get("output_format"), Some(_));

    assert!(
        has_output_format,
        "output_format should always be present in serialization"
    );
}
|
||||
|
||||
#[test]
fn test_result_format_and_output_format_independent() {
    use kreuzberg::core::config::{ExtractionConfig, OutputFormat};

    // Only `output_format` is changed; `result_format` keeps its default and
    // must still be serialized alongside it as a string.
    let config = ExtractionConfig {
        output_format: OutputFormat::Markdown,
        ..Default::default()
    };

    let serialized = serde_json::to_value(&config).expect("Failed to serialize");

    assert_eq!(serialized["output_format"], "markdown");
    assert!(
        serialized["result_format"].is_string(),
        "result_format should also be present"
    );
}
|
||||
|
||||
#[test]
fn test_extraction_config_clone_preserves_format() {
    use kreuzberg::core::config::{ExtractionConfig, OutputFormat};

    // `Clone` must carry the chosen output format over to the copy.
    let source = ExtractionConfig {
        output_format: OutputFormat::Html,
        ..Default::default()
    };
    let copy = source.clone();

    assert_eq!(
        source.output_format, copy.output_format,
        "Cloned config should preserve output_format"
    );
}
|
||||
355
crates/kreuzberg-cli/tests/contract_cli.rs
Normal file
355
crates/kreuzberg-cli/tests/contract_cli.rs
Normal file
@@ -0,0 +1,355 @@
|
||||
//! CLI contract tests - verify CLI config parsing matches Rust core
|
||||
//!
|
||||
//! This test suite validates that the CLI's configuration parsing produces
|
||||
//! identical results to the Rust core library. It ensures that users get
|
||||
//! consistent behavior whether using the CLI, SDK, or MCP interfaces.
|
||||
|
||||
use kreuzberg::core::config::ExtractionConfig;
|
||||
use kreuzberg::core::config::OutputFormat;
|
||||
use serde_json::json;
|
||||
|
||||
#[test]
fn test_cli_config_json_flag_basic_parsing() {
    let raw = r#"{"use_cache": true, "output_format": "plain"}"#;

    // Path 1: deserialize the string directly, as the core library would.
    let direct: ExtractionConfig = serde_json::from_str(raw).expect("Failed to deserialize config string");

    // Path 2: go through an intermediate `serde_json::Value`, mimicking how
    // the CLI handles `--config-json`.
    let intermediate = serde_json::from_str::<serde_json::Value>(raw).expect("Failed to parse JSON string");
    let via_value =
        serde_json::from_value::<ExtractionConfig>(intermediate).expect("Failed to deserialize from JSON value");

    // Both paths must agree.
    assert_eq!(direct.use_cache, via_value.use_cache, "use_cache should be identical");
    assert_eq!(
        direct.output_format, via_value.output_format,
        "output_format should be identical"
    );
}
|
||||
|
||||
#[test]
fn test_cli_nested_config_deserialization() {
    // Nested `chunking` and `ocr` sections must populate the optional
    // sub-configs with the supplied values.
    let raw = r#"{
        "chunking": {
            "max_characters": 1000,
            "overlap": 200
        },
        "ocr": {
            "backend": "tesseract"
        }
    }"#;

    let config = serde_json::from_str::<ExtractionConfig>(raw).expect("Failed to deserialize nested config");

    // Presence of both sections is checked before any field is inspected.
    let chunking = config.chunking.expect("Chunking config should be present");
    let ocr = config.ocr.expect("OCR config should be present");

    assert_eq!(chunking.max_characters, 1000, "max_chars should be 1000");
    assert_eq!(chunking.overlap, 200, "max_overlap should be 200");
    assert_eq!(ocr.backend, "tesseract", "backend should be tesseract");
}
|
||||
|
||||
#[test]
fn test_cli_force_ocr_flag_parsing() {
    let config = serde_json::from_str::<ExtractionConfig>(r#"{"force_ocr": true}"#)
        .expect("Failed to deserialize force_ocr config");

    // The explicitly set flag is honoured...
    assert!(config.force_ocr, "force_ocr should be true");
    // ...while an untouched field keeps its default.
    assert!(config.use_cache, "use_cache should still be true by default");
}
|
||||
|
||||
#[test]
fn test_cli_max_concurrent_extractions_parsing() {
    // A numeric `max_concurrent_extractions` must parse into `Some(n)`.
    let parsed = serde_json::from_str::<ExtractionConfig>(r#"{"max_concurrent_extractions": 8}"#)
        .expect("Failed to deserialize concurrent extractions");

    assert_eq!(
        parsed.max_concurrent_extractions,
        Some(8),
        "max_concurrent_extractions should be 8"
    );
}
|
||||
|
||||
#[test]
fn test_cli_complex_config_deserialization() {
    // A config mixing top-level scalars with nested `ocr` and `chunking`
    // sections must deserialize in one pass.
    let raw = r#"{
        "use_cache": false,
        "enable_quality_processing": true,
        "force_ocr": true,
        "output_format": "markdown",
        "result_format": "unified",
        "max_concurrent_extractions": 16,
        "ocr": {
            "backend": "tesseract",
            "language": "eng"
        },
        "chunking": {
            "max_characters": 2000,
            "overlap": 400,
            "strategy": "sliding_window"
        }
    }"#;

    let config = serde_json::from_str::<ExtractionConfig>(raw).expect("Failed to deserialize complex config");

    // Top-level scalars.
    assert!(!config.use_cache);
    assert!(config.enable_quality_processing);
    assert!(config.force_ocr);
    assert_eq!(config.max_concurrent_extractions, Some(16));

    // Nested sections must exist before their fields are read.
    assert!(config.ocr.is_some());
    assert!(config.chunking.is_some());

    let ocr_section = config.ocr.unwrap();
    assert_eq!(ocr_section.backend, "tesseract");
    assert_eq!(ocr_section.language, "eng");

    let chunking_section = config.chunking.unwrap();
    assert_eq!(chunking_section.max_characters, 2000);
    assert_eq!(chunking_section.overlap, 400);
}
|
||||
|
||||
#[test]
fn test_cli_empty_config_uses_defaults() {
    // An empty JSON object must produce a config made entirely of defaults.
    let config = serde_json::from_str::<ExtractionConfig>(r#"{}"#).expect("Failed to deserialize empty config");

    assert!(config.use_cache, "Default use_cache should be true");
    assert!(
        config.enable_quality_processing,
        "Default enable_quality_processing should be true"
    );
    assert!(!config.force_ocr, "Default force_ocr should be false");
    assert_eq!(
        config.max_concurrent_extractions, None,
        "Default max_concurrent_extractions should be None"
    );
}
|
||||
|
||||
#[test]
fn test_cli_roundtrip_preserves_all_fields() {
    let raw = r#"{
        "use_cache": false,
        "force_ocr": true,
        "max_concurrent_extractions": 12
    }"#;

    // string -> config -> JSON value -> config
    let first_pass = serde_json::from_str::<ExtractionConfig>(raw).expect("Failed to deserialize");
    let as_value = serde_json::to_value(&first_pass).expect("Failed to serialize");
    let second_pass =
        serde_json::from_value::<ExtractionConfig>(as_value).expect("Failed to deserialize roundtripped config");

    // The explicitly set fields must come out of the second pass unchanged.
    assert!(!second_pass.use_cache);
    assert!(second_pass.force_ocr);
    assert_eq!(second_pass.max_concurrent_extractions, Some(12));
}
|
||||
|
||||
#[test]
fn test_cli_output_format_enum_parsing() {
    // Columns: JSON config text, variant it must produce.
    let cases = [
        (r#"{"output_format": "plain"}"#, OutputFormat::Plain),
        (r#"{"output_format": "markdown"}"#, OutputFormat::Markdown),
        (r#"{"output_format": "html"}"#, OutputFormat::Html),
    ];

    for (raw, expected) in cases {
        let parsed = match serde_json::from_str::<ExtractionConfig>(raw) {
            Ok(config) => config,
            Err(_) => panic!("Failed to deserialize {}", raw),
        };

        assert_eq!(
            parsed.output_format, expected,
            "output_format should match expected value"
        );
    }
}
|
||||
|
||||
#[test]
fn test_cli_result_format_enum_parsing() {
    // Both result-format spellings must be accepted; only parse success is
    // checked, not the resulting variant.
    let inputs = [
        r#"{"result_format": "unified"}"#,
        r#"{"result_format": "element_based"}"#,
    ];

    for raw in inputs {
        let outcome: Result<ExtractionConfig, _> = serde_json::from_str(raw);
        assert!(outcome.is_ok(), "Should deserialize result_format from {}", raw);
    }
}
|
||||
|
||||
#[test]
fn test_cli_base64_encoded_config_simulation() {
    // `Engine` supplies the `encode`/`decode` methods used on `STANDARD` below.
    use base64::engine::general_purpose::STANDARD;
    use base64::Engine;

    // Mimics the `--config-json-base64` flow: JSON text -> base64 -> JSON text -> config.
    let payload = json!({
        "force_ocr": true,
        "output_format": "markdown"
    })
    .to_string();

    let encoded = STANDARD.encode(&payload);

    // Decode the way the CLI would: bytes first, then UTF-8 text.
    let raw_bytes = STANDARD.decode(&encoded).expect("Failed to decode base64");
    let decoded = String::from_utf8(raw_bytes).expect("Failed to convert bytes to string");

    let config =
        serde_json::from_str::<ExtractionConfig>(&decoded).expect("Failed to deserialize base64-decoded config");

    assert!(config.force_ocr);
    assert_eq!(config.output_format, OutputFormat::Markdown);
}
|
||||
|
||||
#[test]
fn test_cli_partial_override_merging() {
    // Simulated CLI merge: serialize the defaults, overwrite the keys present
    // in the override object, then deserialize the merged document.
    let overrides = json!({"force_ocr": true, "use_cache": false});
    let mut merged_json =
        serde_json::to_value(ExtractionConfig::default()).expect("Failed to serialize base config");

    // The merge only happens when both sides are JSON objects; otherwise the
    // base document is left untouched.
    if let (Some(target), serde_json::Value::Object(source)) = (merged_json.as_object_mut(), overrides) {
        source.into_iter().for_each(|(key, value)| {
            target.insert(key, value);
        });
    }

    let merged = serde_json::from_value::<ExtractionConfig>(merged_json).expect("Failed to deserialize merged config");

    assert!(merged.force_ocr, "Override should apply force_ocr");
    assert!(!merged.use_cache, "Override should apply use_cache");
    assert!(
        merged.enable_quality_processing,
        "Unoverridden field should retain default"
    );
}
|
||||
|
||||
#[test]
fn test_cli_invalid_json_error_handling() {
    // Well-formed JSON carrying one key that `ExtractionConfig` does not define.
    let invalid_json_str = r#"{"force_ocr": true, "invalid_field": "value"}"#;

    // Whether unknown fields are ignored or rejected depends on whether the
    // config type opts into serde's `deny_unknown_fields`. Both outcomes are
    // accepted, but each must be verifiable, so the test can no longer pass
    // without asserting anything:
    match serde_json::from_str::<ExtractionConfig>(invalid_json_str) {
        // Lenient mode: the unknown key is dropped and known keys still apply.
        Ok(config) => assert!(
            config.force_ocr,
            "known fields must still be applied when an unknown field is present"
        ),
        // Strict mode: the failure must be attributable to the unknown key,
        // not to some unrelated parsing problem.
        Err(err) => {
            let message = err.to_string();
            assert!(
                message.contains("invalid_field"),
                "rejection should be caused by the unknown field, got: {}",
                message
            );
        }
    }
}
|
||||
|
||||
#[test]
fn test_cli_whitespace_handling_in_json() {
    // The same document in three layouts: compact, padded with spaces, and
    // spread over several indented lines.
    let layouts = [
        r#"{"force_ocr":true}"#,
        r#"{ "force_ocr" : true }"#,
        r#"{
            "force_ocr": true
        }"#,
    ];

    for raw in layouts {
        let parsed = match serde_json::from_str::<ExtractionConfig>(raw) {
            Ok(config) => config,
            Err(_) => panic!("Failed to parse: {}", raw),
        };

        assert!(parsed.force_ocr);
    }
}
|
||||
|
||||
#[test]
fn test_cli_numeric_boundary_values() {
    // Columns: JSON config text, expected `max_concurrent_extractions`.
    // Zero is included deliberately as an edge case.
    let cases = [
        (r#"{"max_concurrent_extractions": 1}"#, Some(1)),
        (r#"{"max_concurrent_extractions": 256}"#, Some(256)),
        (r#"{"max_concurrent_extractions": 0}"#, Some(0)),
    ];

    for (raw, expected) in cases {
        let parsed = match serde_json::from_str::<ExtractionConfig>(raw) {
            Ok(config) => config,
            Err(_) => panic!("Failed to parse: {}", raw),
        };

        assert_eq!(
            parsed.max_concurrent_extractions, expected,
            "Numeric values should be parsed correctly"
        );
    }
}
|
||||
|
||||
#[test]
fn test_cli_boolean_values_strict_parsing() {
    // JSON `true` / `false` literals must map onto the matching Rust bool.
    for (raw, expected) in [(r#"{"use_cache": true}"#, true), (r#"{"use_cache": false}"#, false)] {
        let parsed = match serde_json::from_str::<ExtractionConfig>(raw) {
            Ok(config) => config,
            Err(_) => panic!("Failed to parse: {}", raw),
        };

        assert_eq!(parsed.use_cache, expected);
    }
}
|
||||
|
||||
#[test]
fn test_cli_config_consistency_across_formats() {
    // Start from a config built in code rather than parsed from text.
    let built = ExtractionConfig {
        use_cache: false,
        enable_quality_processing: true,
        force_ocr: true,
        output_format: OutputFormat::Markdown,
        max_concurrent_extractions: Some(4),
        ..Default::default()
    };

    // config -> JSON value -> JSON text -> config, the last hop being what
    // the CLI does when it receives a JSON string.
    let wire = serde_json::to_value(&built)
        .expect("Failed to serialize")
        .to_string();
    let parsed = serde_json::from_str::<ExtractionConfig>(&wire).expect("Failed to deserialize from string");

    // Every explicitly set field must match after the trip.
    assert_eq!(parsed.use_cache, built.use_cache);
    assert_eq!(parsed.enable_quality_processing, built.enable_quality_processing);
    assert_eq!(parsed.force_ocr, built.force_ocr);
    assert_eq!(parsed.output_format, built.output_format);
    assert_eq!(parsed.max_concurrent_extractions, built.max_concurrent_extractions);
}
|
||||
|
||||
// Note: the `base64::Engine` trait required by the base64 test is imported inside that test function.
|
||||
603
crates/kreuzberg-cli/tests/e2e_config_test.rs
Normal file
603
crates/kreuzberg-cli/tests/e2e_config_test.rs
Normal file
@@ -0,0 +1,603 @@
|
||||
//! Comprehensive CLI end-to-end integration tests for configuration flags.
|
||||
//!
|
||||
//! This test suite validates the new configuration features including:
|
||||
//! - `--config-json` for inline JSON configuration
|
||||
//! - `--config-json-base64` for base64-encoded JSON configuration
|
||||
//! - `--output-format` flag with all variants (plain, markdown, djot, html)
|
||||
//! - Flag precedence (CLI args > JSON config > file > defaults)
|
||||
//! - Config merge scenarios and conflict detection
|
||||
//! - Error handling for invalid inputs
|
||||
//! - Real extraction with new formats
|
||||
|
||||
#![allow(clippy::bool_assert_comparison)]
|
||||
|
||||
use std::path::PathBuf;
|
||||
use std::process::Command;
|
||||
use tempfile::TempDir;
|
||||
|
||||
/// Get the path to the kreuzberg binary.
|
||||
fn get_binary_path() -> String {
|
||||
let manifest_dir = env!("CARGO_MANIFEST_DIR");
|
||||
format!("{}/../../target/debug/kreuzberg", manifest_dir)
|
||||
}
|
||||
|
||||
/// Get the test_documents directory path.
|
||||
fn get_test_documents_dir() -> PathBuf {
|
||||
let manifest_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
|
||||
manifest_dir.parent().unwrap().parent().unwrap().join("test_documents")
|
||||
}
|
||||
|
||||
/// Get a test file path relative to test_documents/.
|
||||
fn get_test_file(relative_path: &str) -> String {
|
||||
get_test_documents_dir()
|
||||
.join(relative_path)
|
||||
.to_string_lossy()
|
||||
.to_string()
|
||||
}
|
||||
|
||||
/// Build the `kreuzberg` binary before running tests.
///
/// Every test calls this helper, but the `cargo build` invocation is guarded
/// by a process-wide [`std::sync::Once`], so it runs at most once per test
/// binary; concurrent callers block until that first build has finished.
///
/// # Panics
///
/// Panics if `cargo` cannot be spawned or the build exits unsuccessfully. A
/// panic inside the guarded closure poisons the `Once`, so every later caller
/// panics as well instead of running against a missing or stale binary.
fn build_binary() {
    static BUILD: std::sync::Once = std::sync::Once::new();

    BUILD.call_once(|| {
        let status = Command::new("cargo")
            .args(["build", "--bin", "kreuzberg"])
            .status()
            .expect("Failed to build kreuzberg binary");

        assert!(status.success(), "Failed to build kreuzberg binary");
    });
}
|
||||
|
||||
/// Helper to create a temporary config file with specified content.
|
||||
fn create_test_config(dir: &TempDir, name: &str, content: &str) -> PathBuf {
|
||||
let config_path = dir.path().join(name);
|
||||
std::fs::write(&config_path, content).expect("Failed to write config file");
|
||||
config_path
|
||||
}
|
||||
|
||||
/// Helper to encode string as base64.
///
/// Hand-rolled encoder using the standard alphabet (`A-Z a-z 0-9 + /`) with
/// `=` padding. The input's UTF-8 bytes are consumed three at a time; each
/// group yields four output characters.
fn to_base64(input: &str) -> String {
    const ALPHABET: &[u8] = b"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";

    let mut encoded = String::new();

    for group in input.as_bytes().chunks(3) {
        // Missing trailing bytes in the final group are treated as zero.
        let first = u32::from(group[0]);
        let second = group.get(1).map_or(0, |&byte| u32::from(byte));
        let third = group.get(2).map_or(0, |&byte| u32::from(byte));
        let packed = (first << 16) | (second << 8) | third;

        // Looks up the alphabet character for the 6-bit slice starting at `shift`.
        let sextet = |shift: u32| ALPHABET[((packed >> shift) & 0x3F) as usize] as char;

        encoded.push(sextet(18));
        encoded.push(sextet(12));
        // Output positions that have no backing input byte become padding.
        encoded.push(if group.len() > 1 { sextet(6) } else { '=' });
        encoded.push(if group.len() > 2 { sextet(0) } else { '=' });
    }

    encoded
}
|
||||
|
||||
// ============================================================================
|
||||
// Test 1: --config-json inline flag with complex configuration
|
||||
// ============================================================================
|
||||
|
||||
#[test]
fn test_cli_config_json_inline() {
    build_binary();

    // Skip (rather than fail) when the fixture is not available.
    let test_file = get_test_file("text/simple.txt");
    if !std::path::Path::new(&test_file).exists() {
        eprintln!("Skipping test: {} not found", test_file);
        return;
    }

    // Run `extract` with an inline JSON configuration.
    let inline_config = r#"{"use_cache": false, "chunking": {"max_chars": 512}}"#;
    let run = Command::new(get_binary_path())
        .arg("extract")
        .arg(test_file.as_str())
        .arg("--config-json")
        .arg(inline_config)
        .output()
        .expect("Failed to execute extract command with --config-json");

    assert!(
        run.status.success(),
        "Extract command with --config-json failed: {}",
        String::from_utf8_lossy(&run.stderr)
    );

    let printed = String::from_utf8_lossy(&run.stdout);
    assert!(!printed.is_empty(), "Output should not be empty");
}
|
||||
|
||||
// ============================================================================
|
||||
// Test 2: --config-json-base64 flag for base64-encoded configuration
|
||||
// ============================================================================
|
||||
|
||||
#[test]
fn test_cli_config_json_base64() {
    build_binary();

    // Skip (rather than fail) when the fixture is not available.
    let test_file = get_test_file("text/simple.txt");
    if !std::path::Path::new(&test_file).exists() {
        eprintln!("Skipping test: {} not found", test_file);
        return;
    }

    // The JSON configuration is handed to the CLI in base64 form.
    let encoded_config = to_base64(r#"{"use_cache": false}"#);

    let run = Command::new(get_binary_path())
        .arg("extract")
        .arg(test_file.as_str())
        .arg("--config-json-base64")
        .arg(encoded_config.as_str())
        .output()
        .expect("Failed to execute extract command with --config-json-base64");

    assert!(
        run.status.success(),
        "Extract command with --config-json-base64 failed: {}",
        String::from_utf8_lossy(&run.stderr)
    );

    let printed = String::from_utf8_lossy(&run.stdout);
    assert!(!printed.is_empty(), "Output should not be empty");
}
|
||||
|
||||
// ============================================================================
|
||||
// Test 3: Flag precedence verification (CLI flags > JSON > file > defaults)
|
||||
// ============================================================================
|
||||
|
||||
#[test]
fn test_cli_flag_precedence() {
    build_binary();

    // Skip (rather than fail) when the fixture is not available.
    let test_file = get_test_file("text/simple.txt");
    if !std::path::Path::new(&test_file).exists() {
        eprintln!("Skipping test: {} not found", test_file);
        return;
    }

    let scratch = TempDir::new().expect("Failed to create temp directory");

    // Config file that enables the cache; the inline JSON below disables it.
    let file_settings = r#"
use_cache = true

[chunking]
max_chars = 1024
"#;
    let config_file = create_test_config(&scratch, "config.toml", file_settings);
    let config_arg = config_file.to_string_lossy();

    // Pass both sources. Only a successful exit is checked here, not which
    // value actually took effect.
    let run = Command::new(get_binary_path())
        .arg("extract")
        .arg(test_file.as_str())
        .arg("--config")
        .arg(config_arg.as_ref())
        .arg("--config-json")
        .arg(r#"{"use_cache": false}"#)
        .output()
        .expect("Failed to execute command with precedence test");

    assert!(
        run.status.success(),
        "Precedence test command failed: {}",
        String::from_utf8_lossy(&run.stderr)
    );
}
|
||||
|
||||
// ============================================================================
|
||||
// Test 4: --output-format flag with all variants (plain, markdown, djot, html)
|
||||
// ============================================================================
|
||||
|
||||
#[test]
fn test_cli_output_format_all_variants() {
    build_binary();

    // Skip (rather than fail) when the fixture is not available.
    let test_file = get_test_file("text/simple.txt");
    if !std::path::Path::new(&test_file).exists() {
        eprintln!("Skipping test: {} not found", test_file);
        return;
    }

    // Each `--output-format` value must exit successfully and print something.
    for format in ["plain", "markdown", "djot", "html"] {
        let run = match Command::new(get_binary_path())
            .args(["extract", test_file.as_str(), "--output-format", format])
            .output()
        {
            Ok(run) => run,
            Err(_) => panic!("Failed to execute extract with --output-format {}", format),
        };

        assert!(
            run.status.success(),
            "Extract command with --output-format {} failed: {}",
            format,
            String::from_utf8_lossy(&run.stderr)
        );

        let printed = String::from_utf8_lossy(&run.stdout);
        assert!(!printed.is_empty(), "Output for format {} should not be empty", format);
    }
}
|
||||
|
||||
// ============================================================================
|
||||
// Test 5: Output formats (text vs json) for extraction result
|
||||
// ============================================================================
|
||||
|
||||
#[test]
fn test_cli_result_format() {
    build_binary();

    // Skip (rather than fail) when the fixture is not available.
    let test_file = get_test_file("text/simple.txt");
    if !std::path::Path::new(&test_file).exists() {
        eprintln!("Skipping test: {} not found", test_file);
        return;
    }

    // Runs `extract <file> --format <format>`; `failure` is the panic message
    // used when the process cannot be launched.
    let run_extract = |format: &str, failure: &str| {
        Command::new(get_binary_path())
            .args(["extract", test_file.as_str(), "--format", format])
            .output()
            .expect(failure)
    };

    // --- text ---
    let text_run = run_extract("text", "Failed to execute extract with --format text");
    assert!(
        text_run.status.success(),
        "Text format output failed: {}",
        String::from_utf8_lossy(&text_run.stderr)
    );
    assert!(
        !String::from_utf8_lossy(&text_run.stdout).is_empty(),
        "Text output should not be empty"
    );

    // --- json ---
    let json_run = run_extract("json", "Failed to execute extract with --format json");
    assert!(
        json_run.status.success(),
        "JSON format output failed: {}",
        String::from_utf8_lossy(&json_run.stderr)
    );

    let json_text = String::from_utf8_lossy(&json_run.stdout);
    let envelope = match serde_json::from_str::<serde_json::Value>(&json_text) {
        Ok(value) => value,
        Err(_) => panic!("JSON output should be valid JSON, got: {}", json_text),
    };

    // The JSON output is an envelope wrapping the extraction result.
    assert!(
        envelope.get("result").is_some(),
        "JSON envelope should have 'result' field"
    );
    assert!(
        envelope.get("extraction_time_ms").is_some(),
        "JSON envelope should have 'extraction_time_ms' field"
    );

    let result = &envelope["result"];
    assert!(result.get("content").is_some(), "result should have 'content' field");
    assert!(
        result.get("mime_type").is_some(),
        "result should have 'mime_type' field"
    );
}
|
||||
|
||||
// ============================================================================
|
||||
// Test 6: Deprecated --content-format flag warning
|
||||
// ============================================================================
|
||||
|
||||
#[test]
fn test_cli_content_format_deprecated_warning() {
    build_binary();

    // Skip (rather than fail) when the fixture is not available.
    let test_file = get_test_file("text/simple.txt");
    if !std::path::Path::new(&test_file).exists() {
        eprintln!("Skipping test: {} not found", test_file);
        return;
    }

    // Invoke the deprecated `--content-format` flag.
    let run = Command::new(get_binary_path())
        .arg("extract")
        .arg(test_file.as_str())
        .arg("--content-format")
        .arg("plain")
        .output()
        .expect("Failed to execute extract with --content-format");

    // Deliberately lenient: the exact deprecation behaviour is an
    // implementation detail, so either a clean exit or any stdout output is
    // accepted as "did not crash".
    let exited_cleanly = run.status.success();
    let printed_something = !String::from_utf8_lossy(&run.stdout).is_empty();

    assert!(
        exited_cleanly || printed_something,
        "Command should succeed or produce output"
    );
}
|
||||
|
||||
// ============================================================================
|
||||
// Test 7: Config merge scenarios - multiple configuration sources
|
||||
// ============================================================================
|
||||
|
||||
#[test]
fn test_cli_config_merge_scenarios() {
    build_binary();

    let input = get_test_file("text/simple.txt");
    if !PathBuf::from(&input).exists() {
        eprintln!("Skipping test: {} not found", input);
        return;
    }

    let scratch = TempDir::new().expect("Failed to create temp directory");

    // Base configuration file: caching enabled plus a chunking section.
    let base_toml = r#"
use_cache = true

[chunking]
max_chars = 1024
"#;
    let base_path = create_test_config(&scratch, "base.toml", base_toml);
    let base_arg = base_path.to_string_lossy();

    // Pass the config file and an inline JSON snippet that sets the same
    // `use_cache` key to the opposite value.
    let cli_args: [&str; 6] = [
        "extract",
        input.as_str(),
        "--config",
        &*base_arg,
        "--config-json",
        r#"{"use_cache": false}"#,
    ];
    let run = Command::new(get_binary_path())
        .args(cli_args)
        .output()
        .expect("Failed to merge configs");

    // Only the exit status is checked; the merged values are not inspected.
    if !run.status.success() {
        panic!("Config merge failed: {}", String::from_utf8_lossy(&run.stderr));
    }
}
|
||||
|
||||
// ============================================================================
|
||||
// Test 8: Invalid JSON error handling
|
||||
// ============================================================================
|
||||
|
||||
#[test]
fn test_cli_invalid_json_error() {
    build_binary();

    let sample = get_test_file("text/simple.txt");
    if !PathBuf::from(&sample).exists() {
        eprintln!("Skipping test: {} not found", sample);
        return;
    }

    // Deliberately truncated object: the closing quote and brace are missing.
    let malformed = r#"{"invalid json without closing"#;

    let result = Command::new(get_binary_path())
        .arg("extract")
        .arg(sample.as_str())
        .arg("--config-json")
        .arg(malformed)
        .output()
        .expect("Failed to execute command");

    // The CLI must reject the malformed config...
    assert!(!result.status.success(), "Command should fail with invalid JSON");

    // ...and say something about it on at least one of the two streams.
    let stderr_silent = String::from_utf8_lossy(&result.stderr).is_empty();
    let stdout_silent = String::from_utf8_lossy(&result.stdout).is_empty();
    assert!(
        !(stderr_silent && stdout_silent),
        "Should provide feedback about invalid JSON"
    );
}
|
||||
|
||||
// ============================================================================
|
||||
// Test 9: Config flag conflicts
|
||||
// ============================================================================
|
||||
|
||||
#[test]
fn test_cli_conflicts() {
    build_binary();

    let test_file = get_test_file("text/simple.txt");
    if !PathBuf::from(&test_file).exists() {
        eprintln!("Skipping test: {} not found", test_file);
        return;
    }

    let temp_dir = TempDir::new().expect("Failed to create temp directory");
    let config_content = "use_cache = true\n";
    let config_path = create_test_config(&temp_dir, "config.toml", config_content);

    // Using both --config-json and --config-json-base64 might conflict
    let json_config = r#"{"use_cache": false}"#;
    let base64_config = to_base64(json_config);

    let output = Command::new(get_binary_path())
        .args([
            "extract",
            test_file.as_str(),
            "--config",
            config_path.to_string_lossy().as_ref(),
            "--config-json",
            r#"{"chunking": {"max_chars": 512}}"#,
            "--config-json-base64",
            base64_config.as_str(),
        ])
        .output()
        .expect("Failed to execute command with potential conflicts");

    // The behavior here depends on implementation: either the command succeeds
    // (last flag wins) or it reports an error (flags are mutually exclusive).
    // Both are acceptable, so the exit code's value is not checked. What is
    // checked is that the process exited on its own: `code()` is `None` when it
    // was terminated by a signal, which is what an abort or segfault looks like
    // on Unix.
    assert!(
        output.status.code().is_some(),
        "Command should exit with a status code instead of crashing, got {:?}; stderr: {}",
        output.status,
        String::from_utf8_lossy(&output.stderr)
    );
}
|
||||
|
||||
// ============================================================================
|
||||
// Test 10: Real end-to-end extraction with new config formats
|
||||
// ============================================================================
|
||||
|
||||
#[test]
fn test_cli_real_extraction() {
    build_binary();

    let document = get_test_file("text/simple.txt");
    if !PathBuf::from(&document).exists() {
        eprintln!("Skipping test: {} not found", document);
        return;
    }

    // End-to-end run combining the JSON envelope, an output format and an
    // inline JSON config in a single invocation.
    let cli_args = [
        "extract",
        document.as_str(),
        "--format",
        "json",
        "--output-format",
        "markdown",
        "--config-json",
        r#"{"use_cache": false, "disable_ocr": true}"#,
    ];
    let run = Command::new(get_binary_path())
        .args(cli_args)
        .output()
        .expect("Failed to execute full E2E extraction");

    if !run.status.success() {
        panic!("E2E extraction failed: {}", String::from_utf8_lossy(&run.stderr));
    }

    // stdout must parse as a JSON document.
    let stdout_text = String::from_utf8_lossy(&run.stdout);
    let envelope: serde_json::Value = match serde_json::from_str(&stdout_text) {
        Ok(value) => value,
        Err(_) => panic!("E2E output should be valid JSON, got: {}", stdout_text),
    };

    // Envelope level: `result` and `extraction_time_ms` must both be present.
    assert!(envelope.get("result").is_some(), "Missing 'result' envelope field");
    assert!(
        envelope.get("extraction_time_ms").is_some(),
        "Missing 'extraction_time_ms' field"
    );

    // Result level: `content` and `mime_type` must both be present.
    let result = &envelope["result"];
    assert!(result.get("content").is_some(), "Missing content field in result");
    assert!(result.get("mime_type").is_some(), "Missing mime_type field in result");
}
|
||||
|
||||
// ============================================================================
|
||||
// Additional Edge Cases and Robustness Tests
|
||||
// ============================================================================
|
||||
|
||||
#[test]
fn test_cli_empty_config_json() {
    build_binary();

    let sample = get_test_file("text/simple.txt");
    if !PathBuf::from(&sample).exists() {
        eprintln!("Skipping test: {} not found", sample);
        return;
    }

    // An empty JSON object overrides nothing, so extraction is expected to succeed.
    let empty_config = "{}";
    let mut extract = Command::new(get_binary_path());
    extract
        .arg("extract")
        .arg(sample.as_str())
        .arg("--config-json")
        .arg(empty_config);
    let run = extract.output().expect("Failed to execute with empty JSON config");

    assert!(run.status.success(), "Command with empty JSON config should succeed");
}
|
||||
|
||||
#[test]
fn test_cli_multiple_output_format_variants() {
    build_binary();

    let test_file = get_test_file("text/simple.txt");
    if !PathBuf::from(&test_file).exists() {
        eprintln!("Skipping test: {} not found", test_file);
        return;
    }

    // Test case-insensitive format argument
    let output = Command::new(get_binary_path())
        .args([
            "extract",
            test_file.as_str(),
            "--output-format",
            "MARKDOWN", // uppercase should work or fail predictably
        ])
        .output()
        .expect("Failed to execute");

    // Either outcome is acceptable: success (case-insensitive parsing) or a
    // graceful rejection of the value. The exit code's value is therefore not
    // checked, only that the process exited on its own. `code()` is `None` when
    // the process was terminated by a signal, i.e. it crashed rather than
    // failing predictably.
    assert!(
        output.status.code().is_some(),
        "Command should exit with a status code instead of crashing, got {:?}; stderr: {}",
        output.status,
        String::from_utf8_lossy(&output.stderr)
    );
}
|
||||
|
||||
#[test]
fn test_cli_config_json_with_nested_objects() {
    build_binary();

    let sample = get_test_file("text/simple.txt");
    if !PathBuf::from(&sample).exists() {
        eprintln!("Skipping test: {} not found", sample);
        return;
    }

    // Multi-line JSON with nested `chunking` and `language_detection` objects.
    let nested_config = r#"
{
"use_cache": false,
"chunking": {"max_chars": 512},
"language_detection": {
"enabled": true,
"confidence_threshold": 0.8
}
}
"#;

    let cli_args = ["extract", sample.as_str(), "--config-json", nested_config];
    let run = Command::new(get_binary_path())
        .args(cli_args)
        .output()
        .expect("Failed to execute with nested JSON config");

    // Accept either a successful run or a failure that explains itself on stderr.
    let succeeded = run.status.success();
    let explained = !String::from_utf8_lossy(&run.stderr).is_empty();
    assert!(
        succeeded || explained,
        "Complex config should either work or provide error"
    );
}
|
||||
237
crates/kreuzberg-cli/tests/extract_envelope.rs
Normal file
237
crates/kreuzberg-cli/tests/extract_envelope.rs
Normal file
@@ -0,0 +1,237 @@
|
||||
//! Integration tests for the JSON timing envelope added to `kreuzberg extract` and
|
||||
//! `kreuzberg batch`.
|
||||
//!
|
||||
//! Verifies:
|
||||
//! - `extract --format json` emits `{ result, extraction_time_ms }` shape
|
||||
//! - `batch --format json` emits `{ results, total_ms, per_file_ms }` shape
|
||||
//! - `result.metadata.ocr_used` exists as a bool field
|
||||
//! - `--pdf-backend xyz` exits non-zero and mentions "pdf-oxide"
|
||||
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::process::Command;
|
||||
|
||||
/// Returns path to the compiled `kreuzberg` binary (debug build).
|
||||
fn kreuzberg_bin() -> PathBuf {
|
||||
let manifest_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
|
||||
manifest_dir
|
||||
.parent()
|
||||
.expect("crates/kreuzberg-cli parent")
|
||||
.parent()
|
||||
.expect("crates parent")
|
||||
.join("target")
|
||||
.join("debug")
|
||||
.join("kreuzberg")
|
||||
}
|
||||
|
||||
/// Returns path to the small reference PDF used in these tests.
|
||||
fn pdf_fixture() -> PathBuf {
|
||||
let manifest_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
|
||||
manifest_dir
|
||||
.parent()
|
||||
.expect("crates/kreuzberg-cli parent")
|
||||
.parent()
|
||||
.expect("crates parent")
|
||||
.join("test_documents")
|
||||
.join("pdf")
|
||||
.join("pdfa_001.pdf")
|
||||
}
|
||||
|
||||
/// Returns path to the small plain-text fixture used for batch tests.
|
||||
fn txt_fixture() -> PathBuf {
|
||||
let manifest_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
|
||||
manifest_dir
|
||||
.parent()
|
||||
.expect("crates/kreuzberg-cli parent")
|
||||
.parent()
|
||||
.expect("crates parent")
|
||||
.join("test_documents")
|
||||
.join("text")
|
||||
.join("fake_text.txt")
|
||||
}
|
||||
|
||||
/// Build the `kreuzberg` binary once per test process. Panics on failure.
///
/// Every test calls this helper, so the `cargo build` invocation is guarded by
/// a [`std::sync::Once`]: the first caller runs the build while concurrent
/// callers block until it finishes, and later callers return immediately.
///
/// # Panics
///
/// Panics if `cargo` cannot be spawned or the build exits non-zero. A panic
/// inside the guarded closure poisons the `Once`, so every later call panics
/// as well instead of silently continuing without a binary.
fn build_binary() {
    static BUILD: std::sync::Once = std::sync::Once::new();

    BUILD.call_once(|| {
        let status = Command::new("cargo")
            .args(["build", "--bin", "kreuzberg"])
            .status()
            .expect("cargo build invocation failed");
        assert!(status.success(), "cargo build failed — binary unavailable");
    });
}
|
||||
|
||||
/// Skip-guard: returns `true` when the fixture exists so the test can run.
///
/// `Path::is_file` already returns `false` for a missing path (and for
/// directories), so no separate existence check is needed.
fn fixture_exists(path: &Path) -> bool {
    path.is_file()
}
|
||||
|
||||
// ── extract --format json envelope ──────────────────────────────────────────
|
||||
|
||||
#[test]
fn test_extract_json_has_result_and_timing() {
    build_binary();

    let fixture = pdf_fixture();
    if !fixture_exists(&fixture) {
        eprintln!("SKIP: PDF fixture not found at {}", fixture.display());
        return;
    }

    let fixture_arg = fixture.to_string_lossy();
    let run = Command::new(kreuzberg_bin())
        .args(["extract", &*fixture_arg, "--format", "json"])
        .output()
        .expect("failed to run kreuzberg extract");

    if !run.status.success() {
        panic!("extract exited non-zero: {}", String::from_utf8_lossy(&run.stderr));
    }

    let envelope: serde_json::Value =
        serde_json::from_slice(&run.stdout).expect("stdout is not valid JSON");

    // The envelope carries the extraction result next to a strictly positive timing.
    assert!(envelope.get("result").is_some(), "missing 'result' key in envelope");
    let elapsed_ms = envelope
        .get("extraction_time_ms")
        .and_then(serde_json::Value::as_f64)
        .expect("'extraction_time_ms' must be a number");
    assert!(
        elapsed_ms > 0.0,
        "extraction_time_ms must be positive, got {}",
        elapsed_ms
    );

    // `result.metadata.ocr_used` must be present and boolean.
    let ocr_flag = envelope["result"]["metadata"]
        .get("ocr_used")
        .expect("'result.metadata.ocr_used' must be present");
    let ocr_ran = ocr_flag
        .as_bool()
        .expect("'result.metadata.ocr_used' must be a boolean");

    // No OCR is requested here and the fixture is treated as a native-text
    // PDF, so the flag is expected to be false.
    assert!(!ocr_ran, "expected ocr_used=false for native PDF extraction");
}
|
||||
|
||||
// ── batch --format json envelope ─────────────────────────────────────────────
|
||||
|
||||
#[test]
fn test_batch_json_has_results_and_timing() {
    build_binary();

    let pdf_path = pdf_fixture();
    let txt_path = txt_fixture();
    if !(fixture_exists(&pdf_path) && fixture_exists(&txt_path)) {
        eprintln!("SKIP: one or more batch fixtures not found");
        return;
    }

    let pdf_arg = pdf_path.to_string_lossy();
    let txt_arg = txt_path.to_string_lossy();
    let run = Command::new(kreuzberg_bin())
        .args(["batch", &*pdf_arg, &*txt_arg, "--format", "json"])
        .output()
        .expect("failed to run kreuzberg batch");

    if !run.status.success() {
        panic!("batch exited non-zero: {}", String::from_utf8_lossy(&run.stderr));
    }

    let envelope: serde_json::Value =
        serde_json::from_slice(&run.stdout).expect("stdout is not valid JSON");

    // `results`: one entry per input file.
    let results = envelope
        .get("results")
        .and_then(serde_json::Value::as_array)
        .expect("'results' must be an array");
    assert_eq!(results.len(), 2, "expected 2 results for 2 input files");

    // `total_ms`: strictly positive wall-clock figure.
    let total_ms = envelope
        .get("total_ms")
        .and_then(serde_json::Value::as_f64)
        .expect("'total_ms' must be a number");
    assert!(total_ms > 0.0, "total_ms must be positive, got {}", total_ms);

    // `per_file_ms`: one strictly positive timing per input file.
    let per_file_ms = envelope
        .get("per_file_ms")
        .and_then(serde_json::Value::as_array)
        .expect("'per_file_ms' must be an array");
    assert_eq!(per_file_ms.len(), 2, "per_file_ms must have one entry per file");

    per_file_ms.iter().enumerate().for_each(|(idx, entry)| {
        let ms = entry.as_f64().expect("per_file_ms entry must be a number");
        assert!(ms > 0.0, "per_file_ms[{}] must be positive, got {}", idx, ms);
    });

    // Every result must expose `metadata.ocr_used` as a JSON boolean.
    results.iter().enumerate().for_each(|(idx, entry)| {
        let is_bool = matches!(
            entry["metadata"].get("ocr_used"),
            Some(serde_json::Value::Bool(_))
        );
        assert!(is_bool, "results[{}].metadata.ocr_used must be a bool", idx);
    });
}
|
||||
|
||||
// ── --pdf-backend validation ─────────────────────────────────────────────────
|
||||
|
||||
#[test]
fn test_pdf_backend_invalid_value_exits_nonzero() {
    build_binary();

    let fixture = pdf_fixture();
    if !fixture_exists(&fixture) {
        eprintln!("SKIP: PDF fixture not found at {}", fixture.display());
        return;
    }

    // "xyz" is passed as an unknown backend name and must be rejected.
    let fixture_arg = fixture.to_string_lossy();
    let mut extract = Command::new(kreuzberg_bin());
    extract.args(["extract", &*fixture_arg, "--pdf-backend", "xyz"]);
    let run = extract.output().expect("failed to run kreuzberg extract");

    assert!(
        !run.status.success(),
        "expected non-zero exit for unknown --pdf-backend"
    );

    // The diagnostic must mention "pdf-oxide".
    let diagnostics = String::from_utf8_lossy(&run.stderr);
    assert!(
        diagnostics.contains("pdf-oxide"),
        "error message should mention 'pdf-oxide', got: {}",
        diagnostics
    );
}
|
||||
|
||||
#[test]
fn test_pdf_backend_valid_value_succeeds() {
    build_binary();

    let fixture = pdf_fixture();
    if !fixture_exists(&fixture) {
        eprintln!("SKIP: PDF fixture not found at {}", fixture.display());
        return;
    }

    // Select the `pdf-oxide` backend explicitly and request the JSON envelope.
    let fixture_arg = fixture.to_string_lossy();
    let cli_args = [
        "extract",
        &*fixture_arg,
        "--pdf-backend",
        "pdf-oxide",
        "--format",
        "json",
    ];
    let run = Command::new(kreuzberg_bin())
        .args(cli_args)
        .output()
        .expect("failed to run kreuzberg extract");

    if !run.status.success() {
        panic!(
            "--pdf-backend pdf-oxide should succeed: {}",
            String::from_utf8_lossy(&run.stderr)
        );
    }

    // The envelope keeps its usual shape when a backend is chosen explicitly.
    let envelope: serde_json::Value =
        serde_json::from_slice(&run.stdout).expect("stdout is not valid JSON");
    assert!(envelope.get("result").is_some(), "missing 'result' key");
    assert!(
        envelope.get("extraction_time_ms").is_some(),
        "missing 'extraction_time_ms'"
    );
}
|
||||
153
crates/kreuzberg-cli/tests/server_test.rs
Normal file
153
crates/kreuzberg-cli/tests/server_test.rs
Normal file
@@ -0,0 +1,153 @@
|
||||
//! Integration tests for server commands (serve and mcp).
|
||||
|
||||
#[cfg(not(coverage))]
|
||||
use std::process::{Command, Stdio};
|
||||
#[cfg(not(coverage))]
|
||||
use std::thread;
|
||||
#[cfg(not(coverage))]
|
||||
use std::time::Duration;
|
||||
|
||||
#[cfg(not(coverage))]
#[test]
#[ignore]
fn test_serve_command_starts() {
    /// Kills and reaps the wrapped server process on drop, so a failed
    /// assertion below cannot leave a server running on the test port.
    /// (`std::process::Child` has no `Drop` implementation of its own.)
    struct ServerGuard(std::process::Child);

    impl Drop for ServerGuard {
        fn drop(&mut self) {
            // Best effort: the process may already have been killed and reaped.
            let _ = self.0.kill();
            let _ = self.0.wait();
        }
    }

    let status = Command::new("cargo")
        .args(["build", "--bin", "kreuzberg", "--features", "all"])
        .status()
        .expect("Failed to build binary");

    assert!(status.success(), "Failed to build kreuzberg binary");

    let mut server = ServerGuard(
        Command::new("./target/debug/kreuzberg")
            .args(["serve", "-H", "127.0.0.1", "-p", "18000"])
            .stdout(Stdio::piped())
            .stderr(Stdio::piped())
            .spawn()
            .expect("Failed to start server"),
    );

    // Fixed grace period for the server to bind its port before the first request.
    thread::sleep(Duration::from_secs(3));

    let mut health_response = ureq::get("http://127.0.0.1:18000/health")
        .call()
        .expect("Failed to call health endpoint");

    assert_eq!(health_response.status(), 200);

    let health_json: serde_json::Value = health_response
        .body_mut()
        .read_json()
        .expect("Failed to parse health response");

    assert_eq!(health_json["status"], "healthy");
    assert!(health_json["version"].is_string());

    let mut info_response = ureq::get("http://127.0.0.1:18000/info")
        .call()
        .expect("Failed to call info endpoint");

    assert_eq!(info_response.status(), 200);

    let info_json: serde_json::Value = info_response
        .body_mut()
        .read_json()
        .expect("Failed to parse info response");

    assert!(info_json["rust_backend"].as_bool().unwrap_or(false));

    // Explicit shutdown on the success path so kill/wait failures are still
    // reported; the guard only covers early exits.
    server.0.kill().expect("Failed to kill server");
    server.0.wait().expect("Failed to wait for server");
}
|
||||
|
||||
#[cfg(not(coverage))]
#[test]
#[ignore]
fn test_serve_command_with_config() {
    use std::fs;

    /// Removes the named file on drop, so the temporary config is cleaned up
    /// even when the test panics.
    struct ConfigFileGuard(&'static str);

    impl Drop for ConfigFileGuard {
        fn drop(&mut self) {
            std::fs::remove_file(self.0).ok();
        }
    }

    /// Kills and reaps the wrapped server process on drop, so a failed
    /// assertion cannot leave a server running on the test port.
    struct ServerGuard(std::process::Child);

    impl Drop for ServerGuard {
        fn drop(&mut self) {
            // Best effort: the process may already have been killed and reaped.
            let _ = self.0.kill();
            let _ = self.0.wait();
        }
    }

    let config_content = r#"
use_cache = true
enable_quality_processing = true

[ocr]
backend = "tesseract"
language = "eng"
"#;

    fs::write("test_config.toml", config_content).expect("Failed to write test config");
    // Declared before the server guard: locals drop in reverse order, so the
    // server is stopped first and the config file is removed afterwards.
    let _config_file = ConfigFileGuard("test_config.toml");

    let mut server = ServerGuard(
        Command::new("./target/debug/kreuzberg")
            .args(["serve", "-H", "127.0.0.1", "-p", "18001", "-c", "test_config.toml"])
            .stdout(Stdio::piped())
            .stderr(Stdio::piped())
            .spawn()
            .expect("Failed to start server"),
    );

    // Fixed grace period for the server to bind its port before the request.
    thread::sleep(Duration::from_secs(3));

    let health_response = ureq::get("http://127.0.0.1:18001/health").call();

    assert!(health_response.is_ok(), "Server should be running with custom config");

    // Explicit shutdown on the success path so kill/wait failures are still
    // reported; the guards only cover early exits and the file cleanup.
    server.0.kill().expect("Failed to kill server");
    server.0.wait().expect("Failed to wait for server");
}
|
||||
|
||||
#[cfg(not(coverage))]
#[test]
fn test_serve_command_help() {
    // Build with every feature enabled before asking for the `serve` help text.
    let build_status = Command::new("cargo")
        .arg("build")
        .args(["--bin", "kreuzberg"])
        .args(["--features", "all"])
        .status()
        .expect("Failed to build binary");
    assert!(build_status.success(), "Failed to build kreuzberg binary");

    // Whatever precedes the first "target" in Cargo's integration-test tmpdir
    // is used as the prefix for the debug binary's path.
    let binary_path = match env!("CARGO_TARGET_TMPDIR").split("target").next() {
        Some(prefix) => format!("{}target/debug/kreuzberg", prefix),
        None => "../target/debug/kreuzberg".to_string(),
    };

    let mut help_cmd = Command::new(&binary_path);
    help_cmd.arg("serve").arg("--help");
    let output = help_cmd.output().expect("Failed to execute command");

    assert!(output.status.success());

    // The help text must include the command summary and each documented flag.
    let stdout = String::from_utf8_lossy(&output.stdout);
    assert!(stdout.contains("Start the API server"));
    assert!(stdout.contains("--host"));
    assert!(stdout.contains("--port"));
    assert!(stdout.contains("--config"));
}
|
||||
|
||||
#[cfg(not(coverage))]
#[test]
fn test_mcp_command_help() {
    // Build with every feature enabled before asking for the `mcp` help text.
    let build_status = Command::new("cargo")
        .arg("build")
        .args(["--bin", "kreuzberg"])
        .args(["--features", "all"])
        .status()
        .expect("Failed to build binary");
    assert!(build_status.success(), "Failed to build kreuzberg binary");

    // Whatever precedes the first "target" in Cargo's integration-test tmpdir
    // is used as the prefix for the debug binary's path.
    let binary_path = match env!("CARGO_TARGET_TMPDIR").split("target").next() {
        Some(prefix) => format!("{}target/debug/kreuzberg", prefix),
        None => "../target/debug/kreuzberg".to_string(),
    };

    let mut help_cmd = Command::new(&binary_path);
    help_cmd.arg("mcp").arg("--help");
    let output = help_cmd.output().expect("Failed to execute command");

    assert!(output.status.success());

    // The help text must include the command summary and the config flag.
    let stdout = String::from_utf8_lossy(&output.stdout);
    assert!(stdout.contains("Start the MCP (Model Context Protocol) server"));
    assert!(stdout.contains("--config"));
}
|
||||
Reference in New Issue
Block a user