Nomad changes
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s

This commit is contained in:
Henrik Jess Nielsen
2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions

28
crates/config.m4 Normal file
View File

@@ -0,0 +1,28 @@
dnl Configuration for Rust-based PHP extension via ext-php-rs.
dnl This file enables phpize to compile the extension using cargo instead of make.
dnl
dnl PHP_ARG_ENABLE registers the --enable-kreuzberg configure switch, enabled by
dnl default, and stores the selected value in the shell variable PHP_KREUZBERG.
PHP_ARG_ENABLE([kreuzberg],
  [whether to enable the kreuzberg extension],
  [AS_HELP_STRING([--enable-kreuzberg],
    [Enable kreuzberg extension support])],
  [yes])

dnl Test the variable that PHP_ARG_ENABLE actually sets. The value may be yes or
dnl shared, so compare against no instead of matching yes exactly.
if test "$PHP_KREUZBERG" != "no"; then
  dnl Check that cargo is available
  AC_PATH_PROG([CARGO], [cargo], [no])
  if test "x$CARGO" = "xno"; then
    AC_MSG_ERROR([cargo is required to build this extension])
  fi

  dnl Build the Rust extension using cargo
  AC_MSG_NOTICE([Building Rust extension kreuzberg])

  dnl Set up the extension module
  PHP_NEW_EXTENSION(kreuzberg, [], $ext_shared)

  dnl Custom build: invoke cargo instead of make
  PHP_ADD_BUILD_DIR($ext_builddir)

  dnl The actual build is handled by the build.rs script;
  dnl cargo outputs the .so/.dylib/.dll which phpize will place in extension_dir.
fi

View File

@@ -0,0 +1,86 @@
# Manifest for the `kreuzberg-cli` crate. Version, edition, MSRV, authors, license and
# repository are inherited from the workspace.
[package]
name = "kreuzberg-cli"
version.workspace = true
edition.workspace = true
rust-version.workspace = true
authors.workspace = true
description = "Command-line interface for Kreuzberg document intelligence"
license.workspace = true
repository.workspace = true
homepage = "https://kreuzberg.dev"
documentation = "https://docs.kreuzberg.dev"
keywords = ["document", "extraction", "cli", "tool", "parser"]
categories = ["command-line-utilities", "text-processing"]
# cargo-machete is told not to report `serde_toon_format` as unused.
# NOTE(review): presumably because the sources reference it as `serde_toon::...`
# rather than by its package name — confirm.
[package.metadata.cargo-machete]
ignored = ["serde_toon_format"]
# The installed binary is named `kreuzberg`, not `kreuzberg-cli`.
[[bin]]
name = "kreuzberg"
path = "src/main.rs"
[features]
# Features enabled when the crate is built without `--no-default-features`.
default = [
"embeddings",
"html",
"liter-llm",
"ocr",
"paddle-ocr",
"layout-detection",
"chunking-tokenizers",
"tree-sitter",
]
# Each feature below forwards to the feature of the same name on the `kreuzberg` library.
ort-bundled = ["kreuzberg/ort-bundled"]
ocr = ["kreuzberg/ocr"]
api = ["kreuzberg/api"]
mcp = ["kreuzberg/mcp"]
mcp-http = ["kreuzberg/mcp-http"]
embeddings = ["kreuzberg/embeddings"]
paddle-ocr = ["kreuzberg/paddle-ocr"]
layout-detection = ["kreuzberg/layout-detection"]
chunking-tokenizers = ["kreuzberg/chunking-tokenizers"]
html = ["kreuzberg/html"]
liter-llm = ["kreuzberg/liter-llm"]
# `tree-sitter` additionally pulls in the optional `tree-sitter-language-pack` dependency.
tree-sitter = ["kreuzberg/tree-sitter", "dep:tree-sitter-language-pack"]
# `all` = `default` plus the server features (`api`, `mcp`, `mcp-http`); the remaining
# entries are already part of `default` and are listed again explicitly.
all = [
"default",
"api",
"html",
"mcp",
"mcp-http",
"chunking-tokenizers",
"tree-sitter",
"liter-llm",
]
[dependencies]
anstyle = "1"
anyhow = { workspace = true }
base64 = { workspace = true }
clap = { workspace = true }
clap_complete = "4.6"
# Library features the CLI always enables, independent of the feature flags above.
kreuzberg = { workspace = true, features = [
"formats",
"analysis",
"tokio-runtime",
"simd-utf8",
"cli",
] }
serde = { workspace = true }
serde_json = { workspace = true }
serde_toon_format = { workspace = true }
tokio = { workspace = true }
tracing = { workspace = true }
tracing-subscriber = { version = "0.3", features = ["env-filter", "fmt"] }
# Optional: only compiled when the `tree-sitter` feature is enabled.
tree-sitter-language-pack = { workspace = true, features = [
"dynamic-loading",
"download",
"serde",
], optional = true }
[dev-dependencies]
tempfile = { workspace = true }
ureq = { version = "3.3", features = ["json"] }

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,13 @@
/// Build script entry point.
///
/// Declares `cfg(coverage)` as an expected cfg and, on macOS and Linux targets,
/// emits linker arguments that add a loader-relative rpath to the binary.
fn main() {
    println!("cargo::rustc-check-cfg=cfg(coverage)");

    // Cargo exports the target triple to build scripts through `TARGET`.
    let triple = std::env::var("TARGET").unwrap();

    // Pick the rpath directives for the target platform; other targets get none.
    let directives: &[&str] = if triple.contains("darwin") {
        &[
            "cargo:rustc-link-arg=-Wl,-rpath,@loader_path",
            "cargo:rustc-link-arg=-Wl,-rpath,@loader_path/.",
        ]
    } else if triple.contains("linux") {
        &[
            "cargo:rustc-link-arg=-Wl,-rpath,$ORIGIN",
            "cargo:rustc-link-arg=-Wl,-rpath,$ORIGIN/.",
        ]
    } else {
        &[]
    };

    for directive in directives {
        println!("{}", directive);
    }
}

View File

@@ -0,0 +1,466 @@
//! Cache command - Manage cache operations
//!
//! This module provides commands for cache management including statistics,
//! clearing, manifest generation, and model warming.
use anyhow::{Context, Result};
use kreuzberg::cache;
use serde_json::json;
use std::path::PathBuf;
use crate::{WireFormat, style};
/// Execute cache stats command
///
/// Reads the cache metadata for `cache_dir` (or `./.kreuzberg` when no directory is
/// given) and prints it in the requested `format`.
///
/// # Errors
///
/// Returns an error if no `cache_dir` was given and the current directory cannot be
/// determined, if the cache metadata cannot be read, or if serialization fails.
pub fn stats_command(cache_dir: Option<PathBuf>, format: WireFormat) -> Result<()> {
    // Only consult the working directory when it is actually needed, so an explicit
    // `cache_dir` keeps working even if the current directory is unavailable.
    // NOTE(review): unlike `resolve_cache_base`, this default does not consult
    // `KREUZBERG_CACHE_DIR` — confirm that is intended.
    let cache_path = match cache_dir {
        Some(dir) => dir,
        None => std::env::current_dir()
            .context("Failed to get current directory")?
            .join(".kreuzberg"),
    };
    let cache_dir_str = cache_path.to_string_lossy();
    let stats = cache::get_cache_metadata(&cache_dir_str).with_context(|| {
        format!(
            "Failed to get cache statistics from directory '{}'. Ensure the directory exists and is readable.",
            cache_dir_str
        )
    })?;

    // JSON and TOON emit the same document; build it in one place.
    let payload = || {
        json!({
            "directory": cache_dir_str,
            "total_files": stats.total_files,
            "total_size_mb": stats.total_size_mb,
            "available_space_mb": stats.available_space_mb,
            "oldest_file_age_days": stats.oldest_file_age_days,
            "newest_file_age_days": stats.newest_file_age_days,
        })
    };

    match format {
        WireFormat::Text => {
            println!("{}", style::header("Cache Statistics"));
            println!("{}", style::dim("================"));
            println!("{} {}", style::label("Directory:"), style::success(&cache_dir_str));
            println!("{} {}", style::label("Total files:"), stats.total_files);
            println!("{} {:.2} MB", style::label("Total size:"), stats.total_size_mb);
            println!(
                "{} {:.2} MB",
                style::label("Available space:"),
                stats.available_space_mb
            );
            println!(
                "{} {:.2} days",
                style::label("Oldest file age:"),
                stats.oldest_file_age_days
            );
            println!(
                "{} {:.2} days",
                style::label("Newest file age:"),
                stats.newest_file_age_days
            );
        }
        WireFormat::Json => {
            println!(
                "{}",
                serde_json::to_string_pretty(&payload()).context("Failed to serialize cache statistics to JSON")?
            );
        }
        WireFormat::Toon => {
            println!(
                "{}",
                serde_toon::to_string(&payload()).context("Failed to serialize cache statistics to TOON")?
            );
        }
    }
    Ok(())
}
/// Execute cache clear command
///
/// Clears the cache directory `cache_dir` (or `./.kreuzberg` when no directory is
/// given) and reports the number of removed files and the freed space in `format`.
///
/// # Errors
///
/// Returns an error if no `cache_dir` was given and the current directory cannot be
/// determined, if clearing the directory fails, or if serialization fails.
pub fn clear_command(cache_dir: Option<PathBuf>, format: WireFormat) -> Result<()> {
    // Only consult the working directory when it is actually needed, so an explicit
    // `cache_dir` keeps working even if the current directory is unavailable.
    let cache_path = match cache_dir {
        Some(dir) => dir,
        None => std::env::current_dir()
            .context("Failed to get current directory")?
            .join(".kreuzberg"),
    };
    let cache_dir_str = cache_path.to_string_lossy();
    let (removed_files, freed_mb) = cache::clear_cache_directory(&cache_dir_str).with_context(|| {
        format!(
            "Failed to clear cache directory '{}'. Ensure you have write permissions.",
            cache_dir_str
        )
    })?;

    // JSON and TOON emit the same document; build it in one place.
    let payload = || {
        json!({
            "directory": cache_dir_str,
            "removed_files": removed_files,
            "freed_mb": freed_mb,
        })
    };

    match format {
        WireFormat::Text => {
            println!("{}", style::success("Cache cleared successfully"));
            println!("{} {}", style::label("Directory:"), style::success(&cache_dir_str));
            println!("{} {}", style::label("Removed files:"), removed_files);
            println!("{} {:.2} MB", style::label("Freed space:"), freed_mb);
        }
        WireFormat::Json => {
            println!(
                "{}",
                serde_json::to_string_pretty(&payload()).context("Failed to serialize cache clear results to JSON")?
            );
        }
        WireFormat::Toon => {
            println!(
                "{}",
                serde_toon::to_string(&payload()).context("Failed to serialize cache clear results to TOON")?
            );
        }
    }
    Ok(())
}
/// Execute cache manifest command - outputs expected model files with checksums.
///
/// When at least one of the `paddle-ocr` / `layout-detection` features is compiled in,
/// this forwards to `manifest_command_inner`. Otherwise it returns an error telling the
/// user which features to enable.
pub fn manifest_command(format: WireFormat) -> Result<()> {
// Without at least one model-providing feature, every `extend` call
// in `manifest_command_inner` is `#[cfg]`-stripped and `entries: Vec<_>` has no
// anchor for type inference — `e.size_bytes` in the closure there then
// fails compilation with E0282. Bail with a clear error instead so that a build
// with neither feature enabled (or similar minimal configurations) succeeds.
#[cfg(not(any(feature = "paddle-ocr", feature = "layout-detection")))]
{
// `format` is otherwise unused in this configuration; consume it explicitly.
let _ = format;
anyhow::bail!(
"manifest command unavailable: build kreuzberg-cli with at least one of \
--features \"paddle-ocr\" or --features \"layout-detection\""
);
}
#[cfg(any(feature = "paddle-ocr", feature = "layout-detection"))]
{
manifest_command_inner(format)
}
}
/// Gather the manifest entries of every compiled-in model provider and print them.
///
/// Entries are collected in a fixed order (PaddleOCR models, layout models, tessdata)
/// and rendered as a table, as pretty JSON, or as TOON depending on `format`.
#[cfg(any(feature = "paddle-ocr", feature = "layout-detection"))]
fn manifest_command_inner(format: WireFormat) -> Result<()> {
    let mut entries = Vec::new();
    #[cfg(feature = "paddle-ocr")]
    {
        entries.extend(kreuzberg::paddle_ocr::ModelManager::manifest());
    }
    #[cfg(feature = "layout-detection")]
    {
        entries.extend(kreuzberg::layout::LayoutModelManager::manifest());
    }
    // NOTE(review): the tessdata manifest is gated on `paddle-ocr`, not `ocr` — confirm.
    #[cfg(feature = "paddle-ocr")]
    {
        entries.extend(kreuzberg::ocr::TessdataManager::manifest());
    }

    let total_size_bytes: u64 = entries.iter().map(|e| e.size_bytes).sum();
    let version = env!("CARGO_PKG_VERSION");

    // Shared document for the two structured output formats.
    let payload = || {
        json!({
            "kreuzberg_version": version,
            "total_size_bytes": total_size_bytes,
            "model_count": entries.len(),
            "models": entries,
        })
    };

    match format {
        WireFormat::Json => {
            println!(
                "{}",
                serde_json::to_string_pretty(&payload()).context("Failed to serialize manifest to JSON")?
            );
        }
        WireFormat::Toon => {
            println!(
                "{}",
                serde_toon::to_string(&payload()).context("Failed to serialize manifest to TOON")?
            );
        }
        WireFormat::Text => {
            let version_note = format!("(kreuzberg {})", version);
            println!("{} {}", style::header("Model Manifest"), style::dim(&version_note));
            println!("{}", style::dim("===================================="));
            println!(
                "{:<50} {:>12} {}",
                style::label("PATH"),
                style::label("SIZE"),
                style::label("SHA256")
            );
            let rule = format!("{:<50} {:>12} ------", "----", "----");
            println!("{}", style::dim(&rule));

            for entry in &entries {
                // A zero size is rendered as "unknown" rather than "0.0 MB".
                let size_str = match entry.size_bytes {
                    0 => "unknown".to_string(),
                    bytes => format!("{:.1} MB", bytes as f64 / 1_048_576.0),
                };
                // Show at most the first 12 characters of the digest, "-" when empty.
                let sha_display: &str = match entry.sha256.len() {
                    n if n >= 12 => &entry.sha256[..12],
                    0 => "-",
                    _ => &entry.sha256,
                };
                println!(
                    "{:<50} {:>12} {}",
                    entry.relative_path,
                    size_str,
                    style::dim(sha_display)
                );
            }

            println!();
            println!(
                "{} {} files, {:.1} MB",
                style::label("Total:"),
                entries.len(),
                total_size_bytes as f64 / 1_048_576.0
            );
        }
    }
    Ok(())
}
/// Execute cache warm command - eagerly downloads all models.
///
/// Each model family is only processed when its cargo feature is compiled in:
/// PaddleOCR models and tessdata (`paddle-ocr`), layout models (`layout-detection`),
/// embedding presets (`embeddings`) and tree-sitter grammars (`tree-sitter`).
/// Labels of what was fetched or found in the cache are collected and printed in
/// the requested `format`.
///
/// # Errors
///
/// Returns an error if any download fails, if an unknown embedding preset is
/// requested, if embedding or grammar warming is requested without the matching
/// feature, or if serializing the report fails.
// The argument list mirrors the CLI flags one-to-one.
#[allow(clippy::too_many_arguments)]
pub fn warm_command(
    cache_dir: Option<PathBuf>,
    format: WireFormat,
    all_embeddings: bool,
    embedding_model: Option<String>,
    all_table_models: bool,
    all_grammars: bool,
    grammar_groups: Option<Vec<String>>,
    grammars: Option<Vec<String>>,
) -> Result<()> {
    let cache_base = resolve_cache_base(cache_dir);
    let mut downloaded: Vec<String> = Vec::new();
    let mut already_cached: Vec<String> = Vec::new();

    #[cfg(feature = "paddle-ocr")]
    {
        let paddle_dir = cache_base.join("paddle-ocr");
        let manager = kreuzberg::paddle_ocr::ModelManager::new(paddle_dir);
        // ensure_all_models downloads v2 det (server+mobile), cls (PP-LCNet),
        // doc_ori, v2 unified rec models, and all per-script rec families
        manager
            .ensure_all_models()
            .context("Failed to download PaddleOCR v2 models")?;
        downloaded.push("paddle-ocr v2 (server+mobile det, cls, doc_ori, unified+per-script rec)".to_string());
    }

    #[cfg(feature = "layout-detection")]
    {
        let layout_dir = cache_base.join("layout");
        let manager = kreuzberg::layout::LayoutModelManager::new(Some(layout_dir));
        // Only the two core models are probed here; the probe says nothing about
        // the SLANeXT variants.
        let core_cached = manager.is_rtdetr_cached() && manager.is_tatr_cached();
        if all_table_models {
            // Download rtdetr + tatr + all SLANeXT variants (~730MB).
            // Always run the full ensure step: skipping it whenever rtdetr and tatr
            // are cached would leave the SLANeXT variants missing after a default warm.
            // NOTE(review): assumes `ensure_all_models` skips files that are already
            // present — confirm.
            manager
                .ensure_all_models()
                .context("Failed to download layout models")?;
            let label = "layout (rtdetr, tatr, slanet variants)".to_string();
            if core_cached {
                already_cached.push(label);
            } else {
                downloaded.push(label);
            }
        } else if core_cached {
            // Default: only rtdetr + tatr are required, and both are present.
            already_cached.push("layout (rtdetr, tatr)".to_string());
        } else {
            // Default: download only rtdetr + tatr
            manager
                .ensure_default_models()
                .context("Failed to download layout models")?;
            downloaded.push("layout (rtdetr, tatr)".to_string());
        }
    }
    // Without layout support the flag has no effect; consume it to keep the build
    // free of unused-variable warnings.
    #[cfg(not(feature = "layout-detection"))]
    let _ = all_table_models;

    #[cfg(feature = "paddle-ocr")]
    {
        let tessdata_dir = cache_base.join("tessdata");
        let manager = kreuzberg::ocr::TessdataManager::new(Some(tessdata_dir));
        let newly_downloaded = manager
            .ensure_all_languages()
            .context("Failed to download tessdata files")?;
        if newly_downloaded > 0 {
            downloaded.push(format!("tessdata ({newly_downloaded} languages)"));
        } else {
            already_cached.push("tessdata (all languages)".to_string());
        }
    }

    #[cfg(feature = "embeddings")]
    {
        let embeddings_dir = cache_base.join("embeddings");
        // `--all-embeddings` wins over a single named preset; with neither flag no
        // embedding model is warmed.
        let presets_to_warm: Vec<kreuzberg::EmbeddingPreset> = if all_embeddings {
            kreuzberg::list_embedding_presets()
                .into_iter()
                .filter_map(|name| kreuzberg::get_embedding_preset(&name))
                .collect()
        } else if let Some(ref name) = embedding_model {
            match kreuzberg::get_embedding_preset(name) {
                Some(preset) => vec![preset],
                None => {
                    let available = kreuzberg::list_embedding_presets();
                    anyhow::bail!(
                        "Unknown embedding preset '{}'. Available: {}",
                        name,
                        available.join(", ")
                    );
                }
            }
        } else {
            vec![]
        };
        for preset in &presets_to_warm {
            let label = format!("embedding ({})", preset.name);
            kreuzberg::embeddings::warm_model(
                &kreuzberg::core::config::EmbeddingModelType::Preset {
                    name: preset.name.clone(),
                },
                Some(embeddings_dir.clone()),
            )
            .map_err(|e| anyhow::anyhow!("Failed to download embedding model '{}': {}", preset.name, e))?;
            downloaded.push(label);
        }
    }
    #[cfg(not(feature = "embeddings"))]
    {
        if all_embeddings || embedding_model.is_some() {
            anyhow::bail!("Embedding model warming requires the 'embeddings' feature to be enabled");
        }
    }

    // Tree-sitter grammar downloads
    #[cfg(feature = "tree-sitter")]
    {
        // Precedence: all grammars, then grammar groups, then individual languages.
        if all_grammars {
            let count =
                tree_sitter_language_pack::download_all().context("Failed to download all tree-sitter grammars")?;
            if count > 0 {
                downloaded.push(format!("tree-sitter grammars ({count} languages)"));
            } else {
                already_cached.push("tree-sitter grammars (all)".to_string());
            }
        } else if let Some(ref groups) = grammar_groups {
            let config = tree_sitter_language_pack::PackConfig {
                cache_dir: None,
                languages: None,
                groups: Some(groups.clone()),
            };
            tree_sitter_language_pack::init(&config).context("Failed to download tree-sitter grammar groups")?;
            downloaded.push(format!("tree-sitter grammars (groups: {})", groups.join(", ")));
        } else if let Some(ref langs) = grammars {
            let refs: Vec<&str> = langs.iter().map(String::as_str).collect();
            let count =
                tree_sitter_language_pack::download(&refs).context("Failed to download tree-sitter grammars")?;
            if count > 0 {
                downloaded.push(format!("tree-sitter grammars ({count} languages)"));
            } else {
                already_cached.push(format!("tree-sitter grammars ({})", langs.join(", ")));
            }
        }
    }
    #[cfg(not(feature = "tree-sitter"))]
    {
        if all_grammars || grammar_groups.is_some() || grammars.is_some() {
            anyhow::bail!("Tree-sitter grammar warming requires the 'tree-sitter' feature to be enabled");
        }
    }

    // JSON and TOON emit the same document; build it in one place.
    let payload = || {
        json!({
            "cache_dir": cache_base.to_string_lossy(),
            "downloaded": downloaded,
            "already_cached": already_cached,
        })
    };

    match format {
        WireFormat::Text => {
            if !downloaded.is_empty() {
                println!("{}", style::label("Downloaded:"));
                for d in &downloaded {
                    println!(" {}", style::success(d));
                }
            }
            if !already_cached.is_empty() {
                println!("{}", style::label("Already cached:"));
                for c in &already_cached {
                    println!(" {}", style::dim(c));
                }
            }
            println!(
                "All models ready in {}",
                style::success(&cache_base.display().to_string())
            );
        }
        WireFormat::Json => {
            println!(
                "{}",
                serde_json::to_string_pretty(&payload()).context("Failed to serialize warm results to JSON")?
            );
        }
        WireFormat::Toon => {
            println!(
                "{}",
                serde_toon::to_string(&payload()).context("Failed to serialize warm results to TOON")?
            );
        }
    }
    Ok(())
}
/// Resolve the cache base directory.
///
/// Lookup order: the explicit `cache_dir` argument, then the `KREUZBERG_CACHE_DIR`
/// environment variable, then `.kreuzberg` under the current directory (falling
/// back to `.` when the current directory cannot be determined).
fn resolve_cache_base(cache_dir: Option<PathBuf>) -> PathBuf {
    cache_dir
        .or_else(|| std::env::var("KREUZBERG_CACHE_DIR").ok().map(PathBuf::from))
        .unwrap_or_else(|| {
            let cwd = std::env::current_dir().unwrap_or_else(|_| PathBuf::from("."));
            cwd.join(".kreuzberg")
        })
}

View File

@@ -0,0 +1,61 @@
//! Chunk command implementation.
use anyhow::{Context, Result};
use crate::{WireFormat, style};
/// Execute the chunk command: split text into chunks.
///
/// Rejects empty input, chunks `text` according to `config`, and prints the result.
/// JSON and TOON output contain the chunk contents, the chunk count, the relevant
/// configuration values and the input size in bytes; text output prints the chunks
/// one after another, with a numbered separator when there is more than one.
pub fn chunk_command(text: String, config: kreuzberg::ChunkingConfig, format: WireFormat) -> Result<()> {
    if text.is_empty() {
        anyhow::bail!("No text provided for chunking. Provide --text or pipe text via stdin.");
    }
    let chunked = kreuzberg::chunking::chunk_text(&text, &config, None).context("Failed to chunk text")?;

    // Structured document shared by the JSON and TOON renderers.
    let structured = || {
        let contents: Vec<&str> = chunked.chunks.iter().map(|c| c.content.as_str()).collect();
        serde_json::json!({
            "chunks": contents,
            "chunk_count": chunked.chunk_count,
            "config": {
                "max_characters": config.max_characters,
                "overlap": config.overlap,
                "chunker_type": format!("{:?}", config.chunker_type),
            },
            "input_size_bytes": text.len(),
        })
    };

    match format {
        WireFormat::Text => {
            // Separators are only useful when there is more than one chunk.
            let numbered = chunked.chunks.len() > 1;
            for (index, piece) in chunked.chunks.iter().enumerate() {
                if numbered {
                    let separator = format!("--- chunk {} ---", index + 1);
                    println!("{}", style::dim(&separator));
                }
                println!("{}", piece.content);
            }
        }
        WireFormat::Json => {
            println!(
                "{}",
                serde_json::to_string_pretty(&structured()).context("Failed to serialize chunks to JSON")?
            );
        }
        WireFormat::Toon => {
            println!(
                "{}",
                serde_toon::to_string(&structured()).context("Failed to serialize chunks to TOON")?
            );
        }
    }
    Ok(())
}

View File

@@ -0,0 +1,51 @@
//! Config command - Configuration loading and discovery
//!
//! This module provides utilities for loading extraction configuration from files
//! or discovering them automatically in the project directory.
use anyhow::{Context, Result};
use kreuzberg::ExtractionConfig;
use std::path::PathBuf;
/// Loads extraction configuration from an explicit file or by auto-discovery.
///
/// Resolution order used by the CLI:
/// 1. The explicit config file, when `config_path` is `Some` (the `--config` flag)
/// 2. An auto-discovered config (`kreuzberg.{toml,yaml,json}` in the current and parent directories)
/// 3. The default configuration, when discovery finds nothing
///
/// # Configuration File Formats
///
/// The format of an explicit file is chosen by its extension, compared case-insensitively:
/// - `.toml`: TOML format (recommended for humans)
/// - `.yaml` / `.yml`: YAML format
/// - `.json`: JSON format
///
/// # Errors
///
/// Returns an error if:
/// - The explicit config file has an unsupported extension (must be .toml, .yaml, .yml, or .json)
/// - The config file cannot be read or parsed
/// - Auto-discovery itself fails
pub fn load_config(config_path: Option<PathBuf>) -> Result<ExtractionConfig> {
    let path = match config_path {
        Some(explicit) => explicit,
        None => {
            // No explicit file: discover one, or fall back to the defaults.
            return ExtractionConfig::discover()
                .map(|found| found.unwrap_or_else(ExtractionConfig::default))
                .context("Failed to auto-discover configuration file. Searched for kreuzberg.{toml,yaml,json} in current and parent directories. Use --config to specify an explicit path.");
        }
    };

    // Match on the lower-cased path so `.TOML`, `.Yaml`, ... are accepted too.
    let lowered = path.to_string_lossy().to_lowercase();
    let has_suffix = |suffix: &str| lowered.ends_with(suffix);

    let loaded = if has_suffix(".toml") {
        ExtractionConfig::from_toml_file(&path)
    } else if has_suffix(".yaml") || has_suffix(".yml") {
        ExtractionConfig::from_yaml_file(&path)
    } else if has_suffix(".json") {
        ExtractionConfig::from_json_file(&path)
    } else {
        anyhow::bail!("Config file must have .toml, .yaml, .yml, or .json extension (case-insensitive)");
    };

    loaded.with_context(|| format!("Failed to load configuration from '{}'. Ensure the file exists, is readable, and contains valid configuration.", path.display()))
}

View File

@@ -0,0 +1,161 @@
//! Embed command implementation.
use anyhow::{Context, Result};
use crate::{WireFormat, style};
/// Execute the embed command: generate embeddings for input texts.
///
/// When `provider` is `"local"` (default), uses the ONNX preset model.
/// When `provider` is `"llm"`, uses liter-llm with the specified model and API key.
/// When `provider` is `"plugin"`, dispatches to a pre-registered in-process embedding backend.
///
/// # Errors
///
/// Returns an error if `texts` is empty or contains an empty string, if the provider
/// is unknown or is missing its required option (`--model` for `llm`, `--plugin` for
/// `plugin`), if the preset or plugin backend is not known, if embedding generation
/// fails, or if serializing the output fails.
pub fn embed_command(
    texts: Vec<String>,
    preset: &str,
    provider: &str,
    llm_model: Option<String>,
    llm_api_key: Option<String>,
    plugin_name: Option<String>,
    format: WireFormat,
) -> Result<()> {
    if texts.is_empty() {
        anyhow::bail!("No texts provided for embedding. Provide --text or pipe text via stdin.");
    }
    // Validate no empty texts
    for (i, t) in texts.iter().enumerate() {
        if t.is_empty() {
            anyhow::bail!("Text at position {} is empty. All texts must be non-empty.", i + 1);
        }
    }

    let (config, model_label) = match provider {
        "llm" => {
            let model = llm_model.as_deref().ok_or_else(|| {
                anyhow::anyhow!(
                    "--model is required when --provider is 'llm' (e.g., --model openai/text-embedding-3-small)"
                )
            })?;
            let llm_config = kreuzberg::LlmConfig {
                model: model.to_string(),
                api_key: llm_api_key,
                base_url: None,
                timeout_secs: None,
                max_retries: None,
                temperature: None,
                max_tokens: None,
            };
            let config = kreuzberg::EmbeddingConfig {
                model: kreuzberg::EmbeddingModelType::Llm { llm: llm_config },
                show_download_progress: true,
                ..Default::default()
            };
            (config, model.to_string())
        }
        "local" | "" => {
            // Validate preset for local provider
            let _preset_info = kreuzberg::get_embedding_preset(preset).with_context(|| {
                format!(
                    "Unknown embedding preset '{}'. Available: {:?}",
                    preset,
                    kreuzberg::list_embedding_presets()
                )
            })?;
            let config = kreuzberg::EmbeddingConfig {
                model: kreuzberg::EmbeddingModelType::Preset {
                    name: preset.to_string(),
                },
                show_download_progress: true,
                ..Default::default()
            };
            (config, preset.to_string())
        }
        "plugin" => {
            let name = plugin_name.as_deref().ok_or_else(|| {
                anyhow::anyhow!(
                    "--plugin NAME is required when --provider is 'plugin'. Register a backend via kreuzberg::plugins::register_embedding_backend first."
                )
            })?;
            if name.is_empty() {
                anyhow::bail!("--plugin NAME must not be empty.");
            }
            // Pre-flight: surface unknown backends with a list of registered names
            // (parity with the REST handler, which returns 422 for the same case).
            let available =
                kreuzberg::plugins::list_embedding_backends().context("Failed to read embedding backend registry")?;
            if !available.iter().any(|n| n == name) {
                anyhow::bail!(
                    "Embedding backend '{}' is not registered. Available backends: {}",
                    name,
                    if available.is_empty() {
                        "(none registered)".to_string()
                    } else {
                        available.join(", ")
                    }
                );
            }
            let config = kreuzberg::EmbeddingConfig {
                model: kreuzberg::EmbeddingModelType::Plugin { name: name.to_string() },
                ..Default::default()
            };
            (config, name.to_string())
        }
        other => {
            anyhow::bail!(
                "Unknown embedding provider '{}'. Valid providers: 'local' (default, ONNX), 'llm' (liter-llm), or 'plugin' (in-process backend).",
                other
            );
        }
    };

    // Only the number of inputs is needed after this point, so remember it and hand
    // the texts over by value instead of cloning the whole input.
    let text_count = texts.len();

    // Generate embeddings
    let embeddings = kreuzberg::embed_texts(texts, &config).context("Failed to generate embeddings")?;
    let dimensions = embeddings.first().map(|e| e.len()).unwrap_or(0);

    // JSON and TOON emit the same document; build it in one place.
    let payload = || {
        serde_json::json!({
            "embeddings": embeddings,
            "model": model_label,
            "dimensions": dimensions,
            "count": embeddings.len(),
        })
    };

    match format {
        WireFormat::Json => {
            println!(
                "{}",
                serde_json::to_string_pretty(&payload()).context("Failed to serialize embeddings to JSON")?
            );
        }
        WireFormat::Toon => {
            println!(
                "{}",
                serde_toon::to_string(&payload()).context("Failed to serialize embeddings to TOON")?
            );
        }
        WireFormat::Text => {
            // One comma-separated line per embedding, with a numbered header when
            // more than one text was supplied.
            for (i, embedding) in embeddings.iter().enumerate() {
                if text_count > 1 {
                    println!("{}", style::dim(&format!("# text {}", i + 1)));
                }
                let values: Vec<String> = embedding.iter().map(|v| format!("{v}")).collect();
                println!("{}", values.join(","));
            }
        }
    }
    Ok(())
}

View File

@@ -0,0 +1,180 @@
//! Extract command - Extract text and data from documents
//!
//! This module provides the extract and batch extract commands for processing single
//! or multiple documents with customizable extraction configurations.
use anyhow::{Context, Result};
use kreuzberg::{
BatchFileItem, ExtractionConfig, ExtractionResult, FileExtractionConfig, batch_extract_files_sync,
extract_file_sync,
};
use std::path::PathBuf;
use std::time::Instant;
use crate::{
WireFormat,
output::{BatchEnvelope, ExtractEnvelope},
style,
};
/// Execute single document extraction command
///
/// Extracts `path` (optionally forcing `mime_type`), measures the wall-clock time of
/// the extraction, and prints the result. Text output is the raw content without a
/// trailing newline, JSON output wraps the result in an `ExtractEnvelope` together
/// with the timing, and TOON output serializes the bare result.
pub fn extract_command(
    path: PathBuf,
    config: ExtractionConfig,
    mime_type: Option<String>,
    format: WireFormat,
) -> Result<()> {
    let source = path.to_string_lossy().to_string();

    let started = Instant::now();
    let extracted = extract_file_sync(&source, mime_type.as_deref(), &config).with_context(|| {
        format!(
            "Failed to extract file '{}'. Ensure the file is readable and the format is supported.",
            path.display()
        )
    })?;
    let extraction_time_ms = started.elapsed().as_secs_f64() * 1000.0;

    match format {
        WireFormat::Toon => {
            println!(
                "{}",
                serde_toon::to_string(&extracted).context("Failed to serialize extraction result to TOON")?
            );
        }
        WireFormat::Json => {
            let envelope = ExtractEnvelope {
                result: extracted,
                extraction_time_ms,
            };
            println!(
                "{}",
                serde_json::to_string_pretty(&envelope).context("Failed to serialize extraction result to JSON")?
            );
        }
        // `print!`, not `println!`: the content is emitted exactly as extracted.
        WireFormat::Text => print!("{}", extracted.content),
    }
    Ok(())
}
/// Execute batch extraction command with optional per-file configuration overrides
///
/// `file_configs_map` maps a path (as given in `paths`) to a JSON value that is
/// parsed as a `FileExtractionConfig` override for that file.
///
/// JSON output processes the files one at a time so that per-file timings can be
/// reported in a `BatchEnvelope`; text and TOON output use the batch API directly.
///
/// # Errors
///
/// Returns an error if a per-file override cannot be parsed, if any extraction
/// fails, if the batch API returns no result for a file, or if serialization fails.
pub fn batch_command(
    paths: Vec<PathBuf>,
    file_configs_map: Option<std::collections::HashMap<String, serde_json::Value>>,
    config: ExtractionConfig,
    format: WireFormat,
) -> Result<()> {
    match format {
        WireFormat::Json => {
            // Run files one at a time to capture per-file wall-clock timings.
            // Per-file config overrides are honoured: files without an override use the
            // batch-level config directly; files with an override use a one-shot batch of
            // one item so the library's own merge logic applies.
            let mut results: Vec<ExtractionResult> = Vec::with_capacity(paths.len());
            let mut per_file_ms: Vec<f64> = Vec::with_capacity(paths.len());
            let total_t0 = Instant::now();
            for path in &paths {
                let path_str = path.to_string_lossy().to_string();
                // Look the override up once and reuse it below.
                let override_value = file_configs_map.as_ref().and_then(|m| m.get(&path_str));
                let t0 = Instant::now();
                let result = match override_value {
                    Some(raw) => {
                        // Delegate to the batch API (one item) so per-file merge logic is applied.
                        let file_config = serde_json::from_value::<FileExtractionConfig>(raw.clone())
                            .with_context(|| format!("Failed to parse file config for '{}'", path_str))?;
                        let batch_results = batch_extract_files_sync(
                            vec![BatchFileItem {
                                path: path.clone(),
                                config: Some(file_config),
                            }],
                            &config,
                        )
                        .with_context(|| {
                            format!(
                                "Failed to extract file '{}'. Ensure the file is readable and the format is supported.",
                                path.display()
                            )
                        })?;
                        // Report an empty result as an error instead of panicking on it.
                        batch_results.into_iter().next().with_context(|| {
                            format!("Batch extraction returned no result for file '{}'.", path.display())
                        })?
                    }
                    None => extract_file_sync(&path_str, None, &config).with_context(|| {
                        format!(
                            "Failed to extract file '{}'. Ensure the file is readable and the format is supported.",
                            path.display()
                        )
                    })?,
                };
                per_file_ms.push(t0.elapsed().as_secs_f64() * 1000.0);
                results.push(result);
            }
            let total_ms = total_t0.elapsed().as_secs_f64() * 1000.0;
            let envelope = BatchEnvelope {
                results,
                total_ms,
                per_file_ms,
            };
            println!(
                "{}",
                serde_json::to_string_pretty(&envelope)
                    .context("Failed to serialize batch extraction results to JSON")?
            );
        }
        WireFormat::Text => {
            let results = run_batch_sync(&paths, file_configs_map.as_ref(), &config)?;
            for (i, result) in results.iter().enumerate() {
                println!("{}", style::header(&format!("=== Document {} ===", i + 1)));
                println!("{} {}", style::label("MIME Type:"), style::success(&result.mime_type));
                println!("{}\n{}", style::label("Content:"), result.content);
                println!();
            }
        }
        WireFormat::Toon => {
            let results = run_batch_sync(&paths, file_configs_map.as_ref(), &config)?;
            println!(
                "{}",
                serde_toon::to_string(&results).context("Failed to serialize batch extraction results to TOON")?
            );
        }
    }
    Ok(())
}
/// Run batch extraction using the synchronous batch API for non-JSON output paths.
///
/// Builds one `BatchFileItem` per path, attaching the parsed per-file override when
/// `file_configs_map` has an entry for that path, and hands the items to
/// `batch_extract_files_sync`. Stops at the first override that fails to parse.
fn run_batch_sync(
    paths: &[PathBuf],
    file_configs_map: Option<&std::collections::HashMap<String, serde_json::Value>>,
    config: &ExtractionConfig,
) -> Result<Vec<ExtractionResult>> {
    let mut items: Vec<BatchFileItem> = Vec::with_capacity(paths.len());
    for path in paths {
        // Overrides are keyed by the lossy string form of the path.
        let key = path.to_string_lossy().to_string();
        let per_file = match file_configs_map.and_then(|m| m.get(&key)) {
            Some(raw) => Some(
                serde_json::from_value::<FileExtractionConfig>(raw.clone())
                    .with_context(|| format!("Failed to parse file config for '{}'", key))?,
            ),
            None => None,
        };
        items.push(BatchFileItem {
            path: path.clone(),
            config: per_file,
        });
    }

    batch_extract_files_sync(items, config)
        .context("Failed to batch extract documents. Check that all files are readable and formats are supported.")
}

View File

@@ -0,0 +1,116 @@
//! Extract structured command - Extract structured data from documents using an LLM.
//!
//! Reads a JSON schema file, configures LLM-based structured extraction, and
//! outputs the structured result parsed from the document.
use anyhow::{Context, Result};
use kreuzberg::{LlmConfig, StructuredExtractionConfig, extract_file_sync};
use std::path::PathBuf;
use crate::WireFormat;
/// Arguments for the extract-structured command.
///
/// Bundles everything `extract_structured_command` needs into a single value.
pub struct ExtractStructuredArgs {
/// Document to extract structured data from.
pub path: PathBuf,
/// File containing the JSON schema the output should follow.
pub schema_path: PathBuf,
/// LLM model identifier, forwarded as `LlmConfig::model`.
pub model: String,
/// Optional API key, forwarded as `LlmConfig::api_key`.
pub api_key: Option<String>,
/// Optional prompt, forwarded to the structured extraction config.
pub prompt: Option<String>,
/// Optional schema name; `"extraction"` is used when absent.
pub schema_name: Option<String>,
/// Forwarded as the `strict` flag of the structured extraction config.
pub strict: bool,
/// Optional extraction config file, resolved through `load_config`.
pub config_path: Option<PathBuf>,
/// Output format for the structured result.
pub format: WireFormat,
}
/// Run the `extract-structured` command from start to finish.
///
/// The schema file is read and parsed first, then the base configuration is
/// loaded and its `structured_extraction` section is replaced with one built
/// from the command arguments. The document is extracted synchronously and the
/// resulting `structured_output` is printed in the requested wire format.
///
/// # Errors
///
/// Returns an error when the schema file cannot be read or is not valid JSON,
/// when the configuration cannot be loaded, when extraction fails, when the
/// result carries no structured output, or when serialization fails.
pub fn extract_structured_command(args: ExtractStructuredArgs) -> Result<()> {
    let ExtractStructuredArgs {
        path: document,
        schema_path: schema_file,
        model,
        api_key,
        prompt,
        schema_name,
        strict,
        config_path: config_file,
        format: wire_format,
    } = args;

    // Schema: read the file, then parse it; both failures name the offending path.
    let raw_schema = std::fs::read_to_string(&schema_file).with_context(|| {
        format!(
            "Failed to read JSON schema file '{}'. Ensure the file exists and is readable.",
            schema_file.display()
        )
    })?;
    let schema: serde_json::Value = serde_json::from_str(&raw_schema).with_context(|| {
        format!(
            "Failed to parse JSON schema from '{}'. Ensure the file contains valid JSON.",
            schema_file.display()
        )
    })?;

    // Configuration: whatever the loaded config held for structured extraction is replaced.
    let mut config = super::load_config(config_file)?;
    config.structured_extraction = Some(StructuredExtractionConfig {
        schema,
        schema_name: schema_name.unwrap_or_else(|| String::from("extraction")),
        schema_description: None,
        strict,
        prompt,
        llm: LlmConfig {
            model,
            api_key,
            base_url: None,
            timeout_secs: None,
            max_retries: None,
            temperature: None,
            max_tokens: None,
        },
    });

    // Extraction: the path is passed as a (lossily converted) string.
    let document_str = document.to_string_lossy().into_owned();
    let extraction = extract_file_sync(&document_str, None, &config).with_context(|| {
        format!(
            "Failed to extract structured data from '{}'. Ensure the file is readable and the LLM configuration is correct.",
            document.display()
        )
    })?;

    // A successful extraction without structured output is reported as an error.
    let structured = extraction.structured_output.with_context(|| {
        "Structured extraction completed but returned no structured output. \
         This may indicate the LLM failed to produce valid structured data matching the schema."
    })?;

    // Text mode has no dedicated rendering; it prints pretty JSON like the JSON mode.
    match wire_format {
        WireFormat::Toon => {
            println!(
                "{}",
                serde_toon::to_string(&structured).context("Failed to serialize structured output to TOON")?
            );
        }
        WireFormat::Json => {
            println!(
                "{}",
                serde_json::to_string_pretty(&structured).context("Failed to serialize structured output to JSON")?
            );
        }
        WireFormat::Text => {
            println!(
                "{}",
                serde_json::to_string_pretty(&structured).context("Failed to serialize structured output to text")?
            );
        }
    }

    Ok(())
}

View File

@@ -0,0 +1,48 @@
//! Command modules for Kreuzberg CLI
//!
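//! This module organizes the CLI commands into focused submodules:
//! - `extract` - Document extraction commands
//! - `extract_structured` - LLM-backed structured extraction command
//! - `cache` - Cache management operations
//! - `server` - API and MCP server commands (built with the `api` or `mcp` feature)
//! - `config` - Configuration loading and discovery
//! - `embed` - Embedding generation commands (built with the `embeddings` feature)
//! - `chunk` - Text chunking commands
//! - `overrides` - Command-line extraction configuration overrides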
use anyhow::{Context, Result};
use std::io::Read;
pub mod cache;
pub mod chunk;
pub mod config;
#[cfg(feature = "embeddings")]
pub mod embed;
pub mod extract;
pub mod extract_structured;
pub mod overrides;
#[cfg(any(feature = "api", feature = "mcp"))]
pub mod server;
// Re-export command functions for convenience
pub use cache::{clear_command, manifest_command, stats_command, warm_command};
pub use chunk::chunk_command;
pub use config::load_config;
#[cfg(feature = "embeddings")]
pub use embed::embed_command;
pub use extract::{batch_command, extract_command};
#[cfg(feature = "mcp")]
pub use server::mcp_command;
#[cfg(feature = "api")]
pub use server::serve_command;
/// Collect everything on standard input and return it with surrounding
/// whitespace removed.
///
/// # Errors
///
/// Fails when stdin cannot be read into a `String`, or when the input is
/// empty after trimming.
pub fn read_stdin() -> Result<String> {
    let mut buffer = String::new();
    let mut stdin = std::io::stdin();
    stdin
        .read_to_string(&mut buffer)
        .context("Failed to read from stdin")?;

    match buffer.trim() {
        "" => anyhow::bail!("No input received from stdin. Provide text via --text or pipe it to stdin."),
        text => Ok(text.to_string()),
    }
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,104 @@
//! Server command - Start API and MCP servers
//!
//! This module provides commands for starting the Kreuzberg API server
//! and the MCP (Model Context Protocol) server.
use anyhow::Result;
/// Start the Kreuzberg API server and block until it stops.
///
/// Server settings are layered from lowest to highest precedence:
/// 1. `ServerConfig::from_file(config_path)` when a path is given, otherwise
///    `ServerConfig::default()`;
/// 2. environment overrides applied by `apply_env_overrides`;
/// 3. `cli_host` / `cli_port` when present.
///
/// # Errors
///
/// Fails when the configuration file cannot be loaded, when applying the
/// environment overrides fails, when the Tokio runtime cannot be created, or
/// when the server itself returns an error.
#[cfg(feature = "api")]
pub fn serve_command(
    cli_host: Option<String>,
    cli_port: Option<u16>,
    extraction_config: kreuzberg::ExtractionConfig,
    config_path: Option<std::path::PathBuf>,
) -> Result<()> {
    use anyhow::Context;
    use kreuzberg::ServerConfig;

    // Base layer: explicit config file, or the built-in defaults.
    let mut server_config = match &config_path {
        Some(path) => ServerConfig::from_file(path).with_context(|| {
            format!(
                "Failed to load server configuration from '{}'. \
                 Ensure the file contains valid server settings under [server] section or at root level.",
                path.display()
            )
        })?,
        None => ServerConfig::default(),
    };

    // Middle layer: environment variables beat the config file.
    server_config.apply_env_overrides()?;

    // Top layer: explicit CLI arguments beat everything else.
    if let Some(host) = cli_host {
        server_config.host = host;
    }
    if let Some(port) = cli_port {
        server_config.port = port;
    }

    tracing::info!(
        "Starting Kreuzberg API server on http://{}",
        server_config.listen_addr()
    );

    // A clone is handed to the server so the original stays available for the error message.
    let runtime = tokio::runtime::Runtime::new()?;
    let outcome = runtime.block_on(kreuzberg::api::serve_with_server_config(
        extraction_config,
        server_config.clone(),
    ));
    outcome.with_context(|| {
        format!(
            "Failed to start API server on {}. Ensure the port is not already in use and you have permission to bind to this address.",
            server_config.listen_addr()
        )
    })?;

    Ok(())
}
/// Start the MCP server on the requested transport and block until it stops.
///
/// `transport` is compared case-insensitively: `"stdio"` starts the stdio
/// server, `"http"` starts the HTTP server on `host:port` when the crate is
/// built with the `mcp-http` feature and fails with a rebuild hint otherwise.
/// Any other value is rejected.
///
/// # Errors
///
/// Fails when the Tokio runtime cannot be created, when the selected server
/// returns an error, when HTTP is requested without `mcp-http`, or when the
/// transport name is unknown.
#[cfg(feature = "mcp")]
pub fn mcp_command(
    config: kreuzberg::ExtractionConfig,
    transport: String,
    #[cfg(feature = "mcp-http")] host: String,
    #[cfg(feature = "mcp-http")] port: u16,
    #[cfg(not(feature = "mcp-http"))] _host: String,
    #[cfg(not(feature = "mcp-http"))] _port: u16,
) -> Result<()> {
    tracing::debug!("Starting Kreuzberg MCP server with transport: {}", transport);

    let runtime = tokio::runtime::Runtime::new()?;
    // The lowercased name is also what the "unknown transport" message reports.
    let transport_kind = transport.to_lowercase();

    if transport_kind == "stdio" {
        runtime
            .block_on(kreuzberg::mcp::start_mcp_server_with_config(config))
            .map_err(|e| anyhow::anyhow!("Failed to start MCP server: {}", e))?;
    } else if transport_kind == "http" {
        #[cfg(not(feature = "mcp-http"))]
        {
            anyhow::bail!(
                "HTTP transport requires 'mcp-http' feature. \
                 Rebuild with: cargo build --features mcp-http"
            );
        }
        #[cfg(feature = "mcp-http")]
        {
            tracing::debug!("Starting MCP server on http://{}:{}", host, port);
            runtime
                .block_on(kreuzberg::mcp::start_mcp_server_http_with_config(&host, port, config))
                .map_err(|e| anyhow::anyhow!("Failed to start MCP server on {}:{}: {}", host, port, e))?;
        }
    } else {
        anyhow::bail!("Unknown transport '{}'. Use 'stdio' or 'http'", transport_kind);
    }

    Ok(())
}

View File

@@ -0,0 +1,230 @@
//! Tree-sitter grammar management commands.
//!
//! This module provides commands for downloading, listing, and managing
//! tree-sitter grammar parsers via the tree-sitter-language-pack crate.
use anyhow::{Context, Result};
use serde_json::json;
use std::path::PathBuf;
use crate::{WireFormat, style};
/// Execute the tree-sitter download command.
///
/// Downloads tree-sitter grammar parsers. Exactly one selection is acted on,
/// chosen in this order of precedence:
/// 1. `all` - every available language;
/// 2. `groups` - the named language groups;
/// 3. `languages` - the named languages.
///
/// When `cache_dir` is given it is configured as the download location before
/// anything is fetched.
///
/// The group download does not report how many grammars were fetched. In text
/// mode the "Newly downloaded" line is therefore printed only when a real
/// count is available; JSON and TOON output keep reporting `0` in that case.
///
/// # Errors
///
/// Fails when the cache directory cannot be configured, when a download
/// fails, when no selection was supplied, or when serialization fails.
pub fn download_command(
    languages: Vec<String>,
    all: bool,
    groups: Option<Vec<String>>,
    cache_dir: Option<PathBuf>,
    format: WireFormat,
) -> Result<()> {
    // Apply custom cache directory if provided
    if let Some(ref dir) = cache_dir {
        let config = tree_sitter_language_pack::PackConfig {
            cache_dir: Some(dir.clone()),
            languages: None,
            groups: None,
        };
        tree_sitter_language_pack::configure(&config).context("Failed to configure custom cache directory")?;
    }

    // `count` is `None` when the branch taken cannot report a number (group download).
    let (count, description): (Option<usize>, String) = if all {
        let downloaded =
            tree_sitter_language_pack::download_all().context("Failed to download all tree-sitter grammars")?;
        (Some(downloaded), "all available languages".to_string())
    } else if let Some(ref group_list) = groups {
        let config = tree_sitter_language_pack::PackConfig {
            cache_dir: cache_dir.clone(),
            languages: None,
            groups: Some(group_list.clone()),
        };
        tree_sitter_language_pack::init(&config).context("Failed to download tree-sitter grammar groups")?;
        (None, format!("groups: {}", group_list.join(", ")))
    } else if !languages.is_empty() {
        let refs: Vec<&str> = languages.iter().map(String::as_str).collect();
        let downloaded =
            tree_sitter_language_pack::download(&refs).context("Failed to download tree-sitter grammars")?;
        (Some(downloaded), format!("languages: {}", languages.join(", ")))
    } else {
        anyhow::bail!("No languages specified. Use language names, --all, --groups, or --from-config.")
    };

    // Shared payload for the machine-readable formats; an unknown count stays `0`
    // so existing consumers see the same shape as before.
    let build_payload = || {
        let mut output = json!({
            "requested": description,
            "newly_downloaded": count.unwrap_or(0),
        });
        if let Some(ref dir) = cache_dir {
            output["cache_dir"] = json!(dir.to_string_lossy());
        }
        output
    };

    match format {
        WireFormat::Text => {
            println!("{}", style::header("Tree-sitter Download"));
            println!("{}", style::dim("===================="));
            println!("{} {}", style::label("Requested:"), description);
            // Only print a count that was actually reported by the download call.
            if let Some(downloaded) = count {
                println!(
                    "{} {}",
                    style::label("Newly downloaded:"),
                    style::success(&downloaded.to_string())
                );
            }
            if let Some(ref dir) = cache_dir {
                println!(
                    "{} {}",
                    style::label("Cache directory:"),
                    style::success(&dir.display().to_string())
                );
            }
            println!("{}", style::success("Done"));
        }
        WireFormat::Json => {
            println!(
                "{}",
                serde_json::to_string_pretty(&build_payload())
                    .context("Failed to serialize download results to JSON")?
            );
        }
        WireFormat::Toon => {
            println!(
                "{}",
                serde_toon::to_string(&build_payload()).context("Failed to serialize download results to TOON")?
            );
        }
    }

    Ok(())
}
/// Print the tree-sitter languages known to the language pack.
///
/// With `downloaded_only` the list comes from the already downloaded
/// languages, otherwise from the manifest. When `filter` is given, only names
/// containing it (compared in lowercase) are kept.
///
/// # Errors
///
/// Fails when the manifest cannot be fetched or when serialization fails.
pub fn list_command(downloaded_only: bool, filter: Option<String>, format: WireFormat) -> Result<()> {
    let (languages, source) = if downloaded_only {
        (tree_sitter_language_pack::downloaded_languages(), "downloaded")
    } else {
        (
            tree_sitter_language_pack::manifest_languages()
                .context("Failed to fetch tree-sitter language manifest")?,
            "available",
        )
    };

    // Lowercase the filter once; `None` keeps every language.
    let needle = filter.as_ref().map(|f| f.to_lowercase());
    let filtered: Vec<&String> = languages
        .iter()
        .filter(|lang| match &needle {
            Some(needle) => lang.to_lowercase().contains(needle),
            None => true,
        })
        .collect();

    // Identical payload for both machine-readable formats, built on demand.
    let payload = || {
        json!({
            "source": source,
            "count": filtered.len(),
            "filter": filter,
            "languages": filtered,
        })
    };

    match format {
        WireFormat::Text => {
            let filter_note = match filter.as_ref() {
                Some(f) => format!(", filter: '{f}'"),
                None => String::new(),
            };
            println!(
                "{} ({} {}{})",
                style::header("Tree-sitter Languages"),
                filtered.len(),
                source,
                filter_note
            );
            println!("{}", style::dim("====================="));
            for lang in &filtered {
                println!("  {}", style::success(lang));
            }
        }
        WireFormat::Json => {
            println!(
                "{}",
                serde_json::to_string_pretty(&payload()).context("Failed to serialize language list to JSON")?
            );
        }
        WireFormat::Toon => {
            println!(
                "{}",
                serde_toon::to_string(&payload()).context("Failed to serialize language list to TOON")?
            );
        }
    }

    Ok(())
}
/// Print the cache directory reported by the tree-sitter language pack.
///
/// # Errors
///
/// Fails when the cache directory cannot be determined or when
/// serialization fails.
pub fn cache_dir_command(format: WireFormat) -> Result<()> {
    let dir = tree_sitter_language_pack::cache_dir().context("Failed to determine tree-sitter cache directory")?;
    let dir_str = dir.to_string_lossy();

    // Identical payload for both machine-readable formats, built on demand.
    let payload = || json!({ "cache_dir": dir_str });

    match format {
        WireFormat::Json => {
            println!(
                "{}",
                serde_json::to_string_pretty(&payload()).context("Failed to serialize cache directory to JSON")?
            );
        }
        WireFormat::Toon => {
            println!(
                "{}",
                serde_toon::to_string(&payload()).context("Failed to serialize cache directory to TOON")?
            );
        }
        WireFormat::Text => {
            println!("{} {}", style::label("Cache directory:"), style::success(&dir_str));
        }
    }

    Ok(())
}
/// Clear the tree-sitter grammar cache and report the outcome.
///
/// # Errors
///
/// Fails when the cache cannot be cleaned or when serialization fails.
pub fn clean_command(format: WireFormat) -> Result<()> {
    tree_sitter_language_pack::clean_cache().context("Failed to clean tree-sitter cache")?;

    // Identical payload for both machine-readable formats, built on demand.
    let payload = || json!({ "status": "cleared" });

    match format {
        WireFormat::Json => {
            println!(
                "{}",
                serde_json::to_string_pretty(&payload()).context("Failed to serialize clean result to JSON")?
            );
        }
        WireFormat::Toon => {
            println!(
                "{}",
                serde_toon::to_string(&payload()).context("Failed to serialize clean result to TOON")?
            );
        }
        WireFormat::Text => {
            println!("{}", style::success("Tree-sitter cache cleared successfully"));
        }
    }

    Ok(())
}

View File

@@ -0,0 +1,238 @@
//! Logging helpers for the Kreuzberg CLI.
//!
//! Provides a [`build_env_filter`] function that layers default third-party
//! transport suppressions on top of whatever the caller or `RUST_LOG` specifies.
//! User-supplied per-target rules always win: a default suppression is only
//! added when the base filter has no directive for that exact target, because
//! [`EnvFilter::add_directive`] would otherwise replace the user's rule.
use tracing_subscriber::EnvFilter;
/// Third-party crates that are noisy at their own default level.
///
/// These are added as *fallback* directives: if `RUST_LOG` or `level_override`
/// already contain a per-target rule for any of these crates it takes precedence,
/// so the user can still do `RUST_LOG=ureq=debug` to restore full transport logs.
///
/// Every entry is written as `target=level`; [`build_env_filter`] parses each
/// one as a filter directive and uses the part before `=` to detect whether
/// the user already configured that target.
const QUIET_DIRECTIVES: &[&str] = &[
"ureq=warn",
"ureq_proto=warn",
"rustls=warn",
"hyper_util=warn",
"hf_hub=info",
"tower_http=info",
];
/// Return the target portion of a `target=level` directive.
///
/// The target is everything before the first `=`. A directive without any
/// `=` (for example a bare level such as `"info"`) yields `None`.
fn directive_target(directive: &str) -> Option<&str> {
    let eq_at = directive.find('=')?;
    Some(&directive[..eq_at])
}
/// Build an [`EnvFilter`] with third-party transport crates suppressed by default.
///
/// Precedence (highest first):
/// 1. Per-target directives already present in `RUST_LOG` (or `level_override`).
/// 2. The [`QUIET_DIRECTIVES`] suppressions added here.
/// 3. Root level: `level_override` → `RUST_LOG` → `"info"`.
///
/// Per-target directives that the user has already set are **not** overridden:
/// we skip adding a quiet directive when the base filter already contains a
/// rule for the same target crate. This is necessary because
/// [`EnvFilter::add_directive`] appends rather than guards — a later-added
/// per-target directive for the same crate takes precedence.
///
/// # Arguments
///
/// * `level_override` — explicit root-level string from a CLI flag (e.g. `"debug"`).
/// When `Some`, it replaces `RUST_LOG` entirely for the root level.
pub fn build_env_filter(level_override: Option<&str>) -> EnvFilter {
// Use try_new on user input so a malformed --log-level falls back to info
// instead of panicking the CLI.
let base = level_override
.and_then(|level| EnvFilter::try_new(level).ok())
.or_else(|| EnvFilter::try_from_default_env().ok())
.unwrap_or_else(|| EnvFilter::new("info"));
// Snapshot the existing directive set so we can skip quiet directives
// whose target the user has already configured explicitly.
let existing_targets: std::collections::HashSet<String> = base
.to_string()
.split(',')
.filter_map(|chunk| directive_target(chunk).map(|t| t.trim().to_string()))
.collect();
QUIET_DIRECTIVES
.iter()
.filter(|directive| {
// Only add the quiet directive when no per-target rule for this
// exact crate already exists. Word-boundary match via tokenized
// target set avoids `hf_hub` colliding with `hf_hub_server`.
directive_target(directive)
.map(|target| !existing_targets.contains(target))
.unwrap_or(true)
})
.fold(base, |filter, directive| {
filter.add_directive(directive.parse().expect("built-in logging directive must be valid"))
})
}
#[cfg(test)]
mod tests {
// Unit tests for `build_env_filter`. All assertions inspect the filter's
// rendered directive string rather than its runtime behavior.
//
// NOTE(review): tests that pass `None` (and the malformed-override test, which
// falls through to the same path) reach `EnvFilter::try_from_default_env()`,
// so their outcome presumably depends on the ambient `RUST_LOG` — confirm the
// test runner leaves it unset.
use super::*;
/// Parse the directive string from an EnvFilter for assertion-level checks.
///
/// `EnvFilter::to_string()` returns a comma-separated representation of all
/// directives. We use this as a stable, public inspection surface.
fn filter_directives(filter: &EnvFilter) -> String {
filter.to_string()
}
#[test]
fn default_filter_suppresses_ureq() {
// No env, no override → ureq and ureq_proto must be suppressed.
let filter = build_env_filter(None);
let directives = filter_directives(&filter);
assert!(
directives.contains("ureq=warn"),
"ureq=warn must be present in default filter; got: {directives}"
);
assert!(
directives.contains("ureq_proto=warn"),
"ureq_proto=warn must be present in default filter; got: {directives}"
);
assert!(
directives.contains("rustls=warn"),
"rustls=warn must be present in default filter; got: {directives}"
);
}
#[test]
fn default_filter_keeps_kreuzberg_info() {
// Root level info → kreuzberg has no suppression applied.
let filter = build_env_filter(None);
let directives = filter_directives(&filter);
assert!(
!directives.contains("kreuzberg=warn") && !directives.contains("kreuzberg=error"),
"kreuzberg must not be suppressed in the default filter; got: {directives}"
);
}
#[test]
fn env_override_wins_for_third_party() {
// Simulate RUST_LOG=ureq=debug by passing it as the level_override.
// build_env_filter must detect the existing ureq= directive and skip the
// ureq=warn suppression, so ureq=debug survives in the final filter.
let filter = build_env_filter(Some("info,ureq=debug"));
let directives = filter.to_string();
assert!(
directives.contains("ureq=debug"),
"user-supplied ureq=debug must be preserved; got: {directives}"
);
assert!(
!directives.contains("ureq=warn"),
"ureq=warn suppression must not be added when user already set ureq=debug; got: {directives}"
);
}
#[test]
fn level_override_wins() {
// CLI flag "debug" → root must be debug; suppression directives still present.
let filter = build_env_filter(Some("debug"));
let directives = filter_directives(&filter);
assert!(
directives.contains("debug"),
"root debug level must appear in filter with --log-level debug; got: {directives}"
);
// Suppression for ureq must still be layered on top.
assert!(
directives.contains("ureq=warn"),
"ureq=warn suppression must still be present even under --log-level debug; got: {directives}"
);
}
#[test]
fn tower_http_suppressed_at_default() {
// No override → tower_http must be suppressed.
// Either level is accepted here, although QUIET_DIRECTIVES lists `tower_http=info`.
let filter = build_env_filter(None);
let directives = filter_directives(&filter);
assert!(
directives.contains("tower_http=info") || directives.contains("tower_http=warn"),
"tower_http must be suppressed at default level; got: {directives}"
);
}
#[test]
fn all_quiet_directives_are_valid() {
// Ensure every built-in directive parses without panic.
for directive in super::QUIET_DIRECTIVES {
directive
.parse::<tracing_subscriber::filter::Directive>()
.expect("built-in directive is invalid");
}
}
#[test]
fn no_level_override_uses_info_root() {
// Without RUST_LOG set and no override, root should default to info.
// The directive string must not open with debug or trace as the root level.
let filter = build_env_filter(None);
let directives = filter_directives(&filter);
// Root "debug" or "trace" as the first token would mean root is debug/trace.
let root_is_noisier_than_info = directives.starts_with("debug") || directives.starts_with("trace");
assert!(
!root_is_noisier_than_info,
"default root level must not be debug/trace without RUST_LOG; got: {directives}"
);
}
#[test]
fn hf_hub_suppressed_at_default() {
// No override → the `hf_hub=info` entry of QUIET_DIRECTIVES must be present.
let filter = build_env_filter(None);
let directives = filter_directives(&filter);
assert!(
directives.contains("hf_hub=info"),
"hf_hub must be suppressed to info at default; got: {directives}"
);
}
#[test]
fn hyper_util_suppressed_at_default() {
// No override → the `hyper_util=warn` entry of QUIET_DIRECTIVES must be present.
let filter = build_env_filter(None);
let directives = filter_directives(&filter);
assert!(
directives.contains("hyper_util=warn"),
"hyper_util must be suppressed to warn at default; got: {directives}"
);
}
#[test]
fn malformed_level_override_falls_back_to_info() {
// Garbage CLI flag must NOT panic — try_new returns Err and we fall back
// to RUST_LOG / info default.
let filter = build_env_filter(Some(":::garbage"));
let directives = filter_directives(&filter);
// Quiet directives should still be layered, proving we recovered.
assert!(
directives.contains("ureq=warn"),
"ureq=warn must still be present after malformed override; got: {directives}"
);
}
#[test]
fn similar_target_name_does_not_block_suppression() {
// A user-supplied directive for `hf_hub_server` must NOT cause the
// `hf_hub=info` suppression to be skipped (regression test for the
// earlier substring-containment bug).
let filter = build_env_filter(Some("info,hf_hub_server=debug"));
let directives = filter.to_string();
assert!(
directives.contains("hf_hub_server=debug"),
"user directive for hf_hub_server must survive; got: {directives}"
);
assert!(
directives.contains("hf_hub=info"),
"hf_hub=info suppression must still be applied; got: {directives}"
);
}
}

View File

@@ -0,0 +1,971 @@
//! Kreuzberg CLI - Command-line interface for document intelligence.
//!
//! This binary provides a command-line interface to the Kreuzberg document intelligence
//! library, supporting document extraction, MIME type detection, caching, and batch operations.
//!
//! # Architecture
//!
//! The CLI is built using `clap` for argument parsing. Its main commands include:
//! - `extract`: Extract text/data from a single document
//! - `batch`: Process multiple documents in parallel
//! - `detect`: Identify MIME type of a file
//! - `cache`: Manage cache (clear, stats)
//! - `serve`: Start API server (requires `api` feature)
//! - `version`: Show version information
//!
//! # Configuration
//!
//! The CLI supports configuration files in TOML, YAML, or JSON formats:
//! - Explicit: `--config path/to/config.toml`
//! - Auto-discovery: Searches for `kreuzberg.{toml,yaml,json}` in current and parent directories
//! - Inline JSON: `--config-json '{"ocr": {"backend": "tesseract"}}'`
//! - Command-line flags override config file settings
//!
//! Configuration precedence (highest to lowest):
//! 1. Individual CLI flags (--output-format, --ocr, etc.)
//! 2. Inline JSON config (--config-json or --config-json-base64)
//! 3. Config file (--config path.toml)
//! 4. Default values
//!
//! # Exit Codes
//!
//! - 0: Success
//! - Non-zero: Error (see stderr for details)
//!
//! # Examples
//!
//! ```bash
//! # Extract text from a PDF
//! kreuzberg extract document.pdf
//!
//! # Extract with OCR enabled
//! kreuzberg extract scanned.pdf --ocr true
//!
//! # Extract with inline JSON config
//! kreuzberg extract doc.pdf --config-json '{"ocr":{"backend":"tesseract"}}'
//!
//! # Batch processing
//! kreuzberg batch *.pdf --output-format json
//!
//! # Detect MIME type
//! kreuzberg detect unknown-file.bin
//! ```
#![deny(unsafe_code)]
mod commands;
mod logging;
mod output;
mod style;
use anyhow::{Context, Result};
use base64::{Engine as _, engine::general_purpose::STANDARD};
use clap::{CommandFactory, Parser, Subcommand};
#[cfg(feature = "embeddings")]
use commands::embed_command;
#[cfg(feature = "mcp")]
use commands::mcp_command;
use commands::overrides::ExtractionOverrides;
#[cfg(feature = "api")]
use commands::serve_command;
use commands::{
batch_command, chunk_command, clear_command, extract_command,
extract_structured::{ExtractStructuredArgs, extract_structured_command},
load_config, manifest_command, stats_command, warm_command,
};
use kreuzberg::{OutputFormat as ContentOutputFormat, detect_mime_type};
use serde_json::json;
use std::path::{Path, PathBuf};
// Root argument parser for the `kreuzberg` binary.
// NOTE: the `///` comments in this struct are input to clap's derive macros
// and surface in `--help`; use `//` for maintainer-only notes.
/// Kreuzberg document intelligence CLI
#[derive(Parser)]
#[command(name = "kreuzberg")]
#[command(version, about, long_about = None)]
struct Cli {
/// Set log level (trace, debug, info, warn, error). Overrides RUST_LOG env var.
// Declared `global`, so the flag is shared with the subcommands.
#[arg(long, global = true)]
log_level: Option<String>,
// The subcommand to run; see `Commands`.
#[command(subcommand)]
command: Commands,
}
// Top-level subcommands of the `kreuzberg` binary.
//
// NOTE: the `///` comments on variants and fields are consumed by clap's
// derive macros and become `--help` text, so they are user-facing strings.
// Every `format` flag is parsed through `WireFormat::from_str`, which accepts
// `text`, `json`, and `toon`; the help text lists all three.
#[derive(Subcommand)]
enum Commands {
    /// Extract text from a document
    Extract {
        /// Path to the document
        path: PathBuf,
        /// Path to config file (TOML, YAML, or JSON). If not specified, searches for kreuzberg.toml/yaml/json in current and parent directories.
        #[arg(short, long)]
        config: Option<PathBuf>,
        /// Inline JSON configuration. Applied after config file but before individual flags.
        ///
        /// Example: --config-json '{"ocr":{"backend":"tesseract"},"chunking":{"max_chars":1000}}'
        #[arg(long)]
        config_json: Option<String>,
        /// Base64-encoded JSON configuration. Useful for shell environments where quotes are problematic.
        ///
        /// Example: --config-json-base64 eyJvY3IiOnsiYmFja2VuZCI6InRlc3NlcmFjdCJ9fQ==
        #[arg(long)]
        config_json_base64: Option<String>,
        /// MIME type hint (auto-detected if not provided)
        #[arg(short, long)]
        mime_type: Option<String>,
        /// Output format for CLI results (text, json, or toon).
        ///
        /// Controls how the CLI displays results, not the extraction content format.
        #[arg(short, long, default_value = "text")]
        format: WireFormat,
        /// Extraction configuration overrides
        #[command(flatten)]
        overrides: ExtractionOverrides,
    },
    /// Extract structured data from a document using an LLM
    ExtractStructured {
        /// Path to the document file
        path: PathBuf,
        /// Path to JSON schema file defining the output structure
        #[arg(long)]
        schema: PathBuf,
        /// LLM model (e.g., "openai/gpt-4o")
        #[arg(long)]
        model: String,
        /// API key for the LLM provider
        #[arg(long)]
        api_key: Option<String>,
        /// Custom Jinja2 prompt template
        #[arg(long)]
        prompt: Option<String>,
        /// Schema name
        #[arg(long, default_value = "extraction")]
        schema_name: Option<String>,
        /// Enable strict mode
        #[arg(long)]
        strict: bool,
        /// Config file path
        #[arg(short, long)]
        config: Option<PathBuf>,
        /// Output format (text, json, or toon)
        #[arg(short, long, default_value = "json")]
        format: WireFormat,
    },
    /// Batch extract from multiple documents
    Batch {
        /// Paths to documents
        paths: Vec<PathBuf>,
        /// Path to config file (TOML, YAML, or JSON). If not specified, searches for kreuzberg.toml/yaml/json in current and parent directories.
        #[arg(short, long)]
        config: Option<PathBuf>,
        /// Inline JSON configuration. Applied after config file but before individual flags.
        ///
        /// Example: --config-json '{"ocr":{"backend":"tesseract"},"chunking":{"max_chars":1000}}'
        #[arg(long)]
        config_json: Option<String>,
        /// Base64-encoded JSON configuration. Useful for shell environments where quotes are problematic.
        ///
        /// Example: --config-json-base64 eyJvY3IiOnsiYmFja2VuZCI6InRlc3NlcmFjdCJ9fQ==
        #[arg(long)]
        config_json_base64: Option<String>,
        /// Output format for CLI results (text, json, or toon).
        ///
        /// Controls how the CLI displays results, not the extraction content format.
        #[arg(short, long, default_value = "json")]
        format: WireFormat,
        /// Extraction configuration overrides
        #[command(flatten)]
        overrides: ExtractionOverrides,
        /// Path to a JSON file mapping file paths to per-file extraction config overrides.
        /// The JSON should be an object where keys are file paths and values are FileExtractionConfig objects.
        /// Example: {"doc1.pdf": {"force_ocr": true}, "doc2.pdf": {"output_format": "markdown"}}
        #[arg(long)]
        file_configs: Option<PathBuf>,
    },
    /// Detect MIME type of a file
    Detect {
        /// Path to the file
        path: PathBuf,
        /// Output format (text, json, or toon)
        #[arg(short, long, default_value = "text")]
        format: WireFormat,
    },
    /// List all supported document formats
    Formats {
        /// Output format (text, json, or toon)
        #[arg(short, long, default_value = "text")]
        format: WireFormat,
    },
    /// Show version information
    Version {
        /// Output format (text, json, or toon)
        #[arg(short, long, default_value = "text")]
        format: WireFormat,
    },
    /// Cache management operations
    Cache {
        #[command(subcommand)]
        command: CacheCommands,
    },
    /// Start the API server
    ///
    /// Configuration is loaded with the following precedence (highest to lowest):
    /// 1. CLI arguments (--host, --port)
    /// 2. Environment variables (KREUZBERG_HOST, KREUZBERG_PORT)
    /// 3. Config file (TOML, YAML, or JSON)
    /// 4. Built-in defaults (127.0.0.1:8000)
    ///
    /// The config file can contain both extraction and server settings under [server] section.
    #[cfg(feature = "api")]
    Serve {
        /// Host to bind to (e.g., "127.0.0.1" or "0.0.0.0"). CLI arg overrides config file and env vars.
        #[arg(short = 'H', long)]
        host: Option<String>,
        /// Port to bind to. CLI arg overrides config file and env vars.
        #[arg(short, long)]
        port: Option<u16>,
        /// Path to config file (TOML, YAML, or JSON). If not specified, searches for kreuzberg.toml/yaml/json in current and parent directories.
        #[arg(short, long)]
        config: Option<PathBuf>,
    },
    /// Start the MCP (Model Context Protocol) server
    #[cfg(feature = "mcp")]
    Mcp {
        /// Path to config file (TOML, YAML, or JSON). If not specified, searches for kreuzberg.toml/yaml/json in current and parent directories.
        #[arg(short, long)]
        config: Option<PathBuf>,
        /// Transport mode: stdio (default) or http
        #[arg(long, default_value = "stdio")]
        transport: String,
        /// HTTP host (only for --transport http)
        #[arg(long, default_value = "127.0.0.1")]
        host: String,
        /// HTTP port (only for --transport http)
        #[arg(long, default_value = "8001")]
        port: u16,
    },
    /// API utilities
    #[cfg(feature = "api")]
    Api {
        #[command(subcommand)]
        command: ApiCommands,
    },
    /// Generate embeddings for text
    ///
    /// Generates vector embeddings for one or more text inputs using a specified preset model
    /// or an LLM provider. Reads from --text flag or stdin if no text is provided.
    #[cfg(feature = "embeddings")]
    Embed {
        /// Text to embed. Can be specified multiple times for batch embedding.
        #[arg(long)]
        text: Vec<String>,
        /// Embedding preset (fast, balanced, quality, multilingual). Used with --provider local.
        #[arg(long, default_value = "balanced")]
        preset: String,
        /// Embedding provider: "local" (default, ONNX), "llm" (liter-llm), or "plugin" (registered in-process backend)
        #[arg(long, default_value = "local")]
        provider: String,
        /// LLM model for provider-hosted embeddings (e.g., "openai/text-embedding-3-small").
        /// Required when --provider is "llm".
        #[arg(long)]
        model: Option<String>,
        /// API key for the LLM provider
        #[arg(long)]
        api_key: Option<String>,
        /// Name of a pre-registered in-process embedding backend.
        /// Required when --provider is "plugin". The backend must have been
        /// registered via `kreuzberg::plugins::register_embedding_backend`
        /// before this command runs.
        #[arg(long)]
        plugin: Option<String>,
        /// Output format (text, json, or toon)
        #[arg(short, long, default_value = "json")]
        format: WireFormat,
    },
    /// Chunk text for processing
    ///
    /// Splits text into chunks using configurable size and overlap.
    /// Reads from --text flag or stdin if no text is provided.
    Chunk {
        /// Text to chunk. If not provided, reads from stdin.
        #[arg(long)]
        text: Option<String>,
        /// Path to config file (TOML, YAML, or JSON)
        #[arg(short, long)]
        config: Option<PathBuf>,
        /// Chunk size in characters
        #[arg(long)]
        chunk_size: Option<usize>,
        /// Chunk overlap in characters
        #[arg(long)]
        chunk_overlap: Option<usize>,
        /// Chunker type: text, markdown, yaml, or semantic
        #[arg(long, default_value = "text")]
        chunker_type: String,
        /// Tokenizer model for token-based chunk sizing (e.g., "Xenova/gpt-4o").
        /// Requires the chunking-tokenizers feature.
        #[arg(long)]
        chunking_tokenizer: Option<String>,
        /// Topic threshold for semantic chunking (0.0-1.0, default: 0.75)
        #[arg(long)]
        topic_threshold: Option<f32>,
        /// Output format (text, json, or toon)
        #[arg(short, long, default_value = "json")]
        format: WireFormat,
    },
    /// Generate shell completions
    ///
    /// Outputs shell completion scripts for the specified shell.
    /// Install with: eval "$(kreuzberg completions bash)"
    Completions {
        /// Shell to generate completions for
        #[arg(value_enum)]
        shell: clap_complete::Shell,
    },
}
// Subcommands nested under `Commands::Api`; compiled only with the `api` feature.
// NOTE: the `///` comments below are input to clap's `Subcommand` derive and
// surface in `--help`; use `//` for maintainer-only notes.
#[cfg(feature = "api")]
#[derive(Subcommand)]
enum ApiCommands {
/// Output the OpenAPI schema (JSON)
///
/// Prints the full OpenAPI 3.1 specification for the kreuzberg REST API.
/// Useful for code generation, documentation, and API client tooling.
Schema,
}
// Subcommands nested under `Commands::Cache`.
//
// NOTE: the `///` comments on variants and fields are consumed by clap's
// derive macros and become `--help` text, so they are user-facing strings.
// Every `format` flag is parsed through `WireFormat::from_str`, which accepts
// `text`, `json`, and `toon`; the help text lists all three.
#[derive(Subcommand)]
enum CacheCommands {
    /// Show cache statistics
    Stats {
        /// Cache directory (default: .kreuzberg in current directory)
        #[arg(short, long)]
        cache_dir: Option<PathBuf>,
        /// Output format (text, json, or toon)
        #[arg(short, long, default_value = "text")]
        format: WireFormat,
    },
    /// Clear the cache
    Clear {
        /// Cache directory (default: .kreuzberg in current directory)
        #[arg(short, long)]
        cache_dir: Option<PathBuf>,
        /// Output format (text, json, or toon)
        #[arg(short, long, default_value = "text")]
        format: WireFormat,
    },
    /// Output model manifest (expected model files, checksums, sizes)
    ///
    /// Outputs a JSON manifest of all model files required by kreuzberg,
    /// including their relative paths, SHA256 checksums, and sizes.
    /// Used for pre-populating model caches in containerized deployments.
    Manifest {
        /// Output format (text, json, or toon)
        #[arg(short, long, default_value = "json")]
        format: WireFormat,
    },
    /// Download all models eagerly
    ///
    /// Downloads all PaddleOCR and layout detection models for all supported
    /// languages. Unlike normal operation which downloads lazily on first use,
    /// this ensures all models are present in the cache directory.
    ///
    /// Use --all-embeddings to also download all 4 embedding model presets,
    /// or --embedding-model <preset> to download a specific one.
    ///
    /// By default, only the core layout models (rtdetr + tatr) are downloaded.
    /// Use --all-table-models to also download SLANeXT variants (~730MB).
    Warm {
        /// Cache directory (default: .kreuzberg in current directory, or KREUZBERG_CACHE_DIR)
        #[arg(short, long)]
        cache_dir: Option<PathBuf>,
        /// Output format (text, json, or toon)
        #[arg(short, long, default_value = "text")]
        format: WireFormat,
        /// Download all embedding model presets (fast, balanced, quality, multilingual)
        #[arg(long)]
        all_embeddings: bool,
        /// Download a specific embedding model preset
        #[arg(long, value_name = "PRESET")]
        embedding_model: Option<String>,
        /// Download all table structure models including SLANeXT variants (~730MB)
        #[arg(
            long,
            help = "Download all table structure models including SLANeXT variants (~730MB)"
        )]
        all_table_models: bool,
        /// Download all tree-sitter grammar parsers
        #[arg(long)]
        all_grammars: bool,
        /// Download specific tree-sitter grammar groups (comma-separated: web,systems,scripting,data,jvm,functional)
        #[arg(long, value_name = "GROUPS", value_delimiter = ',')]
        grammar_groups: Option<Vec<String>>,
        /// Download specific tree-sitter grammars by language name (comma-separated)
        #[arg(long, value_name = "LANGUAGES", value_delimiter = ',')]
        grammars: Option<Vec<String>>,
    },
}
/// Serialization format used for CLI output.
///
/// Parsed from a command-line string through [`std::str::FromStr`]; matching
/// is case-insensitive.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum WireFormat {
    Text,
    Json,
    Toon,
}

impl std::str::FromStr for WireFormat {
    type Err = String;

    /// Parses `text`, `json` or `toon` (in any letter case) into a [`WireFormat`].
    ///
    /// # Errors
    ///
    /// Returns a message naming the rejected input and listing the accepted
    /// values when `s` is none of them.
    fn from_str(s: &str) -> std::result::Result<Self, Self::Err> {
        // Accepted spellings, compared against the lowercased input.
        const KNOWN: [(&str, WireFormat); 3] = [
            ("text", WireFormat::Text),
            ("json", WireFormat::Json),
            ("toon", WireFormat::Toon),
        ];

        let lowered = s.to_lowercase();
        KNOWN
            .iter()
            .find(|(name, _)| *name == lowered.as_str())
            .map(|&(_, wire_format)| wire_format)
            .ok_or_else(|| format!("Invalid format: {}. Use 'text', 'json', or 'toon'", s))
    }
}
/// Content output format for extraction results.
///
/// Controls the format of the extracted content (not the CLI output format).
// CLI-side mirror of `ContentOutputFormat`; the `From` impl that follows maps
// each variant one-to-one.
// NOTE: this enum derives `clap::ValueEnum`, so the `///` comments on the
// variants may be surfaced by clap as help text for the possible values.
// Treat them as user-facing strings.
#[derive(Clone, Copy, Debug, PartialEq, Eq, clap::ValueEnum)]
enum ContentOutputFormatArg {
/// Plain text (default)
Plain,
/// Markdown format
Markdown,
/// Djot markup format
Djot,
/// HTML format
Html,
/// JSON tree format with heading-driven sections
Json,
}
/// Converts the CLI argument enum into the library's content format, variant for variant.
impl From<ContentOutputFormatArg> for ContentOutputFormat {
    fn from(arg: ContentOutputFormatArg) -> Self {
        // Deliberately exhaustive (no `_` arm) so that adding a CLI variant
        // forces this mapping to be updated.
        match arg {
            ContentOutputFormatArg::Plain => Self::Plain,
            ContentOutputFormatArg::Markdown => Self::Markdown,
            ContentOutputFormatArg::Djot => Self::Djot,
            ContentOutputFormatArg::Html => Self::Html,
            ContentOutputFormatArg::Json => Self::Json,
        }
    }
}
/// Validates that a file exists and is accessible.
///
/// Checks that the path exists in the filesystem and points to a regular file
/// (not a directory or special file). Provides user-friendly error messages if validation fails.
///
/// # Errors
///
/// Returns an error if:
/// - The path does not exist in the filesystem
/// - The path exists but is not a regular file (e.g., is a directory)
fn validate_file_exists(path: &Path) -> Result<()> {
if !path.exists() {
anyhow::bail!(
"File not found: '{}'. Please check that the file exists and is accessible.",
path.display()
);
}
if !path.is_file() {
anyhow::bail!(
"Path is not a file: '{}'. Please provide a path to a regular file.",
path.display()
);
}
Ok(())
}
/// Validates chunking parameters for correctness.
///
/// Ensures that chunking configuration makes sense: size must be positive and reasonable,
/// and overlap must be smaller than chunk size. This prevents common configuration errors
/// that would lead to cryptic failures from the underlying library.
///
/// The overlap is only checked when both values are supplied; a lone
/// `chunk_overlap` cannot be validated here because the effective size is not known.
///
/// # Errors
///
/// Returns an error if:
/// - `chunk_size` is 0 (must be at least 1 character)
/// - `chunk_size` exceeds 1,000,000 characters (to prevent excessive memory usage)
/// - `chunk_overlap` is greater than or equal to `chunk_size` (overlap must be smaller)
fn validate_chunk_params(chunk_size: Option<usize>, chunk_overlap: Option<usize>) -> Result<()> {
    // Largest accepted chunk size in characters; the bound is inclusive.
    const MAX_CHUNK_SIZE: usize = 1_000_000;

    if let Some(size) = chunk_size {
        if size == 0 {
            anyhow::bail!("Invalid chunk size: {}. Chunk size must be greater than 0.", size);
        }
        if size > MAX_CHUNK_SIZE {
            // Exactly 1,000,000 is accepted, so the message says "must not
            // exceed" rather than "must be less than".
            anyhow::bail!(
                "Invalid chunk size: {}. Chunk size must not exceed 1,000,000 characters to avoid excessive memory usage.",
                size
            );
        }
    }
    if let Some(overlap) = chunk_overlap
        && let Some(size) = chunk_size
        && overlap >= size
    {
        anyhow::bail!(
            "Invalid chunk overlap: {}. Overlap ({}) must be less than chunk size ({}).",
            overlap,
            overlap,
            size
        );
    }
    Ok(())
}
/// Checks the input list of a batch extraction before any work is started.
///
/// The list must be non-empty and every entry must pass
/// `validate_file_exists`, so a bad path is reported up front instead of
/// failing part-way through the batch.
///
/// # Errors
///
/// Returns an error if:
/// - `paths` is empty
/// - Any entry fails `validate_file_exists`; the error carries the entry's
///   1-based position as context
fn validate_batch_paths(paths: &[PathBuf]) -> Result<()> {
    if paths.is_empty() {
        anyhow::bail!("No files provided for batch extraction. Please provide at least one file path.");
    }
    // Pair each path with its 1-based position for the error context; stops at
    // the first invalid entry.
    paths.iter().zip(1usize..).try_for_each(|(candidate, position)| {
        validate_file_exists(candidate).with_context(|| format!("Invalid file at position {}", position))
    })
}
/// Apply inline JSON or base64 JSON overrides to an extraction config.
///
/// `config_json` takes precedence: when it is present, `config_json_base64`
/// is not looked at. When neither is given the config is left untouched.
///
/// # Errors
///
/// Returns an error if the base64 payload cannot be decoded or is not UTF-8,
/// if the JSON text does not parse, or if merging it into `config` fails.
fn apply_json_overrides(
    config: &mut kreuzberg::ExtractionConfig,
    config_json: Option<String>,
    config_json_base64: Option<String>,
) -> Result<()> {
    // Resolve the JSON text together with the error contexts that name the
    // flag it came from; the parse/merge steps below are shared.
    let (raw_json, parse_context, merge_context) = match (config_json, config_json_base64) {
        (Some(inline), _) => (
            inline,
            "Failed to parse --config-json as JSON",
            "Failed to merge --config-json with file config",
        ),
        (None, Some(encoded)) => {
            let decoded_bytes = STANDARD
                .decode(&encoded)
                .context("Failed to decode base64 in --config-json-base64")?;
            let decoded = String::from_utf8(decoded_bytes).context("Base64-decoded content is not valid UTF-8")?;
            (
                decoded,
                "Failed to parse decoded --config-json-base64 as JSON",
                "Failed to merge --config-json-base64 with file config",
            )
        }
        (None, None) => return Ok(()),
    };

    let overrides: serde_json::Value = serde_json::from_str(&raw_json).context(parse_context)?;
    *config = merge_json_into_config(config, overrides).context(merge_context)?;
    Ok(())
}
/// Produces a new extraction config by layering `json_value` over `base_config`.
///
/// The value is serialized back to a JSON string and handed to
/// `kreuzberg::core::config::merge::merge_config_json`, which performs the merge.
///
/// # Errors
///
/// Returns an error, carrying only the underlying error's display text, if
/// serialization or the merge fails.
fn merge_json_into_config(
    base_config: &kreuzberg::ExtractionConfig,
    json_value: serde_json::Value,
) -> Result<kreuzberg::ExtractionConfig> {
    let serialized = match serde_json::to_string(&json_value) {
        Ok(text) => text,
        Err(err) => return Err(anyhow::anyhow!("{}", err)),
    };
    match kreuzberg::core::config::merge::merge_config_json(base_config, &serialized) {
        Ok(merged) => Ok(merged),
        Err(err) => Err(anyhow::anyhow!("{}", err)),
    }
}
/// CLI entry point.
///
/// Parses the command line, installs a `tracing` subscriber that writes to
/// stderr, and dispatches to the handler for the selected subcommand.
///
/// # Errors
///
/// Propagates any validation, configuration or command error via `?`.
fn main() -> Result<()> {
let cli = Cli::parse();
let env_filter = logging::build_env_filter(cli.log_level.as_deref());
// Logs go to stderr so stdout stays reserved for command output. The
// `try_init` result is deliberately discarded (presumably so an
// already-installed global subscriber is not fatal — confirm).
let _ = tracing_subscriber::fmt()
.with_env_filter(env_filter)
.with_writer(std::io::stderr)
.try_init();
match cli.command {
// Single-file extraction. Order matters: input and flag validation first,
// then config file -> JSON overrides -> individual CLI flag overrides, so
// the flags are applied last.
Commands::Extract {
path,
config: config_path,
config_json,
config_json_base64,
mime_type,
format,
overrides,
} => {
validate_file_exists(&path)?;
overrides.validate()?;
let mut config = load_config(config_path)?;
apply_json_overrides(&mut config, config_json, config_json_base64)?;
overrides.apply(&mut config);
extract_command(path, config, mime_type, format)?;
}
// Structured extraction: both the document and the schema file must exist;
// the remaining arguments are bundled into `ExtractStructuredArgs`.
Commands::ExtractStructured {
path,
schema,
model,
api_key,
prompt,
schema_name,
strict,
config,
format,
} => {
validate_file_exists(&path)?;
validate_file_exists(&schema)?;
extract_structured_command(ExtractStructuredArgs {
path,
schema_path: schema,
model,
api_key,
prompt,
schema_name,
strict,
config_path: config,
format,
})?;
}
// Batch extraction: same config layering as `Extract`, plus an optional JSON
// file that is parsed into a string-keyed map of JSON values.
Commands::Batch {
paths,
config: config_path,
config_json,
config_json_base64,
format,
overrides,
file_configs,
} => {
validate_batch_paths(&paths)?;
overrides.validate()?;
let mut config = load_config(config_path)?;
apply_json_overrides(&mut config, config_json, config_json_base64)?;
overrides.apply(&mut config);
let file_configs_map = if let Some(file_configs_path) = file_configs {
let file_configs_json = std::fs::read_to_string(&file_configs_path)
.with_context(|| format!("Failed to read file configs from '{}'", file_configs_path.display()))?;
let map: std::collections::HashMap<String, serde_json::Value> =
serde_json::from_str(&file_configs_json).with_context(|| {
format!(
"Failed to parse file configs JSON from '{}'",
file_configs_path.display()
)
})?;
Some(map)
} else {
None
};
batch_command(paths, file_configs_map, config, format)?;
}
// MIME detection, printed in the requested wire format.
Commands::Detect { path, format } => {
validate_file_exists(&path)?;
// Lossy conversion: non-UTF-8 path bytes are replaced in `path_str`.
let path_str = path.to_string_lossy().to_string();
// `path_str` is cloned because it is reused in the JSON/TOON output below.
let mime_type = detect_mime_type(path_str.clone(), true).with_context(|| {
format!(
"Failed to detect MIME type for file '{}'. Ensure the file is readable.",
path.display()
)
})?;
match format {
WireFormat::Text => {
println!("{}", style::success(&mime_type));
}
WireFormat::Json => {
let output = json!({
"path": path_str,
"mime_type": mime_type,
});
println!(
"{}",
serde_json::to_string_pretty(&output)
.context("Failed to serialize MIME type detection result to JSON")?
);
}
WireFormat::Toon => {
let output = json!({
"path": path_str,
"mime_type": mime_type,
});
println!(
"{}",
serde_toon::to_string(&output)
.context("Failed to serialize MIME type detection result to TOON")?
);
}
}
}
// Lists supported formats: a two-column table in text mode, or the
// serialized list in JSON/TOON mode.
Commands::Formats { format } => {
let formats = kreuzberg::core::mime::list_supported_formats();
match format {
WireFormat::Text => {
println!("{:<15} {}", style::label("EXTENSION"), style::label("MIME TYPE"));
println!("{}", style::dim(&format!("{:<15} ---------", "---------")));
for f in &formats {
println!("{:<15} {}", style::success(&format!(".{}", f.extension)), f.mime_type);
}
}
WireFormat::Json => {
println!(
"{}",
serde_json::to_string_pretty(&formats).context("Failed to serialize formats to JSON")?
);
}
WireFormat::Toon => {
println!(
"{}",
serde_toon::to_string(&formats).context("Failed to serialize formats to TOON")?
);
}
}
}
// Name and version come from Cargo's compile-time package metadata.
Commands::Version { format } => {
let version = env!("CARGO_PKG_VERSION");
let name = env!("CARGO_PKG_NAME");
match format {
WireFormat::Text => {
println!("{} {}", style::label(name), style::success(version));
}
WireFormat::Json => {
let output = json!({
"name": name,
"version": version,
});
println!(
"{}",
serde_json::to_string_pretty(&output)
.context("Failed to serialize version information to JSON")?
);
}
WireFormat::Toon => {
let output = json!({
"name": name,
"version": version,
});
println!(
"{}",
serde_toon::to_string(&output).context("Failed to serialize version information to TOON")?
);
}
}
}
// API server (feature `api`): environment overrides are applied on top of the
// loaded config. `config_path` is cloned because it is also passed to
// `serve_command`.
#[cfg(feature = "api")]
Commands::Serve {
host: cli_host,
port: cli_port,
config: config_path,
} => {
let mut extraction_config = load_config(config_path.clone())?;
extraction_config.apply_env_overrides()?;
serve_command(cli_host, cli_port, extraction_config, config_path)?;
}
// MCP server (feature `mcp`). `host` and `port` are bound under both the
// `mcp-http` and the non-`mcp-http` cfg, so the names exist either way.
#[cfg(feature = "mcp")]
Commands::Mcp {
config: config_path,
transport,
#[cfg(feature = "mcp-http")]
host,
#[cfg(feature = "mcp-http")]
port,
#[cfg(not(feature = "mcp-http"))]
host,
#[cfg(not(feature = "mcp-http"))]
port,
} => {
let mut config = load_config(config_path)?;
config.apply_env_overrides()?;
mcp_command(config, transport, host, port)?;
}
// Cache maintenance: each subcommand forwards its arguments unchanged.
Commands::Cache { command } => match command {
CacheCommands::Stats { cache_dir, format } => {
stats_command(cache_dir, format)?;
}
CacheCommands::Clear { cache_dir, format } => {
clear_command(cache_dir, format)?;
}
CacheCommands::Manifest { format } => {
manifest_command(format)?;
}
CacheCommands::Warm {
cache_dir,
format,
all_embeddings,
embedding_model,
all_table_models,
all_grammars,
grammar_groups,
grammars,
} => {
warm_command(
cache_dir,
format,
all_embeddings,
embedding_model,
all_table_models,
all_grammars,
grammar_groups,
grammars,
)?;
}
},
// Prints the OpenAPI document produced by the library (feature `api`).
#[cfg(feature = "api")]
Commands::Api { command } => match command {
ApiCommands::Schema => {
println!("{}", kreuzberg::api::openapi::openapi_json());
}
},
// Embedding generation (feature `embeddings`): with no text arguments, a
// single input is read from stdin.
#[cfg(feature = "embeddings")]
Commands::Embed {
text,
preset,
provider,
model,
api_key,
plugin,
format,
} => {
let texts = if text.is_empty() {
vec![commands::read_stdin()?]
} else {
text
};
embed_command(texts, &preset, &provider, model, api_key, plugin, format)?;
}
// Text chunking: starts from the config file's chunking section (or its
// default) and layers the CLI flags on top.
Commands::Chunk {
text,
config: config_path,
chunk_size,
chunk_overlap,
chunker_type,
chunking_tokenizer,
topic_threshold,
format,
} => {
let input = match text {
Some(t) => t,
None => commands::read_stdin().context("No --text provided and failed to read from stdin")?,
};
// Only the CLI-supplied values are validated here; values coming from the
// config file are not checked by this call.
validate_chunk_params(chunk_size, chunk_overlap)?;
let base_config = load_config(config_path)?;
let mut chunking_config = base_config.chunking.unwrap_or_default();
if let Some(size) = chunk_size {
chunking_config.max_characters = size;
// If user set chunk_size but not overlap, clamp overlap to fit
if chunk_overlap.is_none() && chunking_config.overlap >= size {
chunking_config.overlap = size / 4;
}
}
if let Some(overlap) = chunk_overlap {
chunking_config.overlap = overlap;
}
// Any value other than the three listed names selects the plain text chunker.
match chunker_type.as_str() {
"markdown" => chunking_config.chunker_type = kreuzberg::ChunkerType::Markdown,
"yaml" => chunking_config.chunker_type = kreuzberg::ChunkerType::Yaml,
"semantic" => chunking_config.chunker_type = kreuzberg::ChunkerType::Semantic,
_ => chunking_config.chunker_type = kreuzberg::ChunkerType::Text,
}
// Supplying a tokenizer switches sizing to `ChunkSizing::Tokenizer`.
if let Some(ref tokenizer) = chunking_tokenizer {
chunking_config.sizing = kreuzberg::ChunkSizing::Tokenizer {
model: tokenizer.clone(),
cache_dir: None,
};
}
// Only overwrite the configured threshold when the flag was given.
if topic_threshold.is_some() {
chunking_config.topic_threshold = topic_threshold;
}
chunk_command(input, chunking_config, format)?;
}
// Writes a completion script for the chosen shell to stdout, using the
// binary name "kreuzberg".
Commands::Completions { shell } => {
let mut cmd = Cli::command();
clap_complete::generate(shell, &mut cmd, "kreuzberg", &mut std::io::stdout());
}
}
Ok(())
}

View File

@@ -0,0 +1,32 @@
//! JSON envelope types for CLI output.
//!
//! When `--format json` is used, extraction results are wrapped in these envelopes
//! so tooling (such as the benchmark harness) can read timing information without
//! parsing stderr or running a separate profiling tool.
use kreuzberg::ExtractionResult;
use serde::Serialize;
/// Single-file extraction result with wall-clock timing.
///
/// Emitted to stdout by `kreuzberg extract --format json`.
///
/// Serialization is derived, so the field names below are the JSON keys
/// (`result`, `extraction_time_ms`) that consumers read.
#[derive(Debug, Serialize)]
pub struct ExtractEnvelope {
/// The extraction result (content, metadata, tables, …).
pub result: ExtractionResult,
/// Wall-clock time for the extraction call in milliseconds.
pub extraction_time_ms: f64,
}
/// Batch extraction results with per-file and total timing.
///
/// Emitted to stdout by `kreuzberg batch --format json`.
///
/// Serialization is derived, so the field names below are the JSON keys
/// (`results`, `total_ms`, `per_file_ms`) that consumers read.
#[derive(Debug, Serialize)]
pub struct BatchEnvelope {
/// One result per input file, in input order.
pub results: Vec<ExtractionResult>,
/// Total wall-clock time for the whole batch in milliseconds.
pub total_ms: f64,
/// Per-file wall-clock times in milliseconds, aligned with `results`.
// The type does not enforce that this has the same length as `results`;
// that is up to whoever builds the envelope.
pub per_file_ms: Vec<f64>,
}

View File

@@ -0,0 +1,104 @@
//! CLI color styling helpers using `anstyle`.
//!
//! Provides styled output for the kreuzberg CLI. Respects the `NO_COLOR`
//! environment variable (<https://no-color.org/>): colors are disabled
//! whenever it is set. Terminal detection is not performed in this module.
use anstyle::{AnsiColor, Effects, Style};
use std::sync::OnceLock;
// Style palette shared by the helper functions in this module. All four are
// `const`, so they are built at compile time.

/// Bold blue for section headers.
const HEADER: Style = Style::new()
.fg_color(Some(anstyle::Color::Ansi(AnsiColor::Blue)))
.effects(Effects::BOLD);
/// Green for success values (MIME types, file paths, versions).
const SUCCESS: Style = Style::new().fg_color(Some(anstyle::Color::Ansi(AnsiColor::Green)));
/// Dim for metadata, separators, secondary info.
const DIM: Style = Style::new().effects(Effects::DIMMED);
/// Bold for labels in key-value pairs.
const LABEL: Style = Style::new().effects(Effects::BOLD);
/// Reports whether styled (ANSI-colored) output should be produced.
///
/// Colors are enabled unless the `NO_COLOR` environment variable is present;
/// its value is not inspected, so even an empty value disables colors.
/// The answer is computed on the first call and cached for the lifetime of
/// the process, so later changes to the environment are not observed.
///
/// See <https://no-color.org/> for the specification.
pub fn is_color_enabled() -> bool {
    static COLOR_ENABLED: OnceLock<bool> = OnceLock::new();
    let cached = COLOR_ENABLED.get_or_init(|| {
        let no_color = std::env::var_os("NO_COLOR");
        no_color.is_none()
    });
    *cached
}
/// Wraps `text` in the escape sequences for `style`, or returns it unchanged
/// when `is_color_enabled` reports that colors are off.
fn styled(text: &str, style: Style) -> String {
    if !is_color_enabled() {
        return text.to_string();
    }
    let prefix = style.render();
    let suffix = style.render_reset();
    format!("{}{}{}", prefix, text, suffix)
}
/// Style text as a section header (bold blue).
///
/// Returns `text` unchanged when colors are disabled (see `is_color_enabled`).
pub fn header(text: &str) -> String {
styled(text, HEADER)
}
/// Style text as a success value (green).
///
/// Returns `text` unchanged when colors are disabled (see `is_color_enabled`).
pub fn success(text: &str) -> String {
styled(text, SUCCESS)
}
/// Style text as dim/secondary (dimmed).
///
/// Returns `text` unchanged when colors are disabled (see `is_color_enabled`).
pub fn dim(text: &str) -> String {
styled(text, DIM)
}
/// Style text as a label (bold).
///
/// Returns `text` unchanged when colors are disabled (see `is_color_enabled`).
pub fn label(text: &str) -> String {
styled(text, LABEL)
}
#[cfg(test)]
mod tests {
    use super::*;

    /// A default (empty) `Style` adds no escape codes around the text.
    ///
    /// `is_color_enabled` caches its answer, so rather than toggling
    /// `NO_COLOR` this exercises the same `format!` expression `styled` uses.
    #[test]
    fn test_styled_returns_plain_text_when_no_color() {
        let plain = Style::new();
        let rendered = format!("{}{}{}", plain.render(), "hello", plain.render_reset());
        assert_eq!(rendered, "hello");
    }

    /// A style with a foreground color renders ANSI escape sequences around the text.
    #[test]
    fn test_styled_applies_ansi_when_style_present() {
        let green = Style::new().fg_color(Some(anstyle::Color::Ansi(AnsiColor::Green)));
        let prefix = green.render();
        let suffix = green.render_reset();
        let rendered = format!("{}{}{}", prefix, "ok", suffix);
        assert!(rendered.contains("\x1b["));
        assert!(rendered.contains("ok"));
    }

    /// Smoke test: every public helper yields non-empty output for non-empty input.
    #[test]
    fn test_helper_functions_return_strings() {
        let outputs = [header("h"), success("s"), dim("d"), label("l")];
        for output in outputs {
            assert!(!output.is_empty());
        }
    }

    /// The cached answer agrees with whether `NO_COLOR` is present in the
    /// environment of the test process.
    #[test]
    fn test_is_color_enabled_respects_no_color_env() {
        let no_color_present = std::env::var_os("NO_COLOR").is_some();
        let colors_expected = !no_color_present;
        assert_eq!(is_color_enabled(), colors_expected);
    }
}

View File

@@ -0,0 +1,937 @@
//! Integration tests for CLI commands (extract, detect, batch).
//!
//! These tests verify that the CLI commands work correctly end-to-end,
//! including input validation, file processing, and output formatting.
use std::path::PathBuf;
use std::process::Command;
use tempfile::tempdir;
/// Get the path to the kreuzberg binary.
///
/// Prefers `CARGO_BIN_EXE_kreuzberg`, which Cargo defines at compile time for
/// integration tests and which points at the binary Cargo itself built. That
/// path stays correct with a custom target directory or a non-debug profile.
/// When the variable is not defined, falls back to the conventional
/// `<manifest dir>/../../target/debug/kreuzberg` location.
fn get_binary_path() -> String {
    if let Some(exe) = option_env!("CARGO_BIN_EXE_kreuzberg") {
        return exe.to_string();
    }
    let manifest_dir = env!("CARGO_MANIFEST_DIR");
    format!("{}/../../target/debug/kreuzberg", manifest_dir)
}
/// Locate the `test_documents` directory, which sits two levels above this
/// crate's manifest directory.
///
/// # Panics
///
/// Panics if the manifest directory has fewer than two ancestors.
fn get_test_documents_dir() -> PathBuf {
    let crate_root = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
    // `ancestors()` yields the path itself first, so index 2 is the grandparent.
    let workspace_root = crate_root.ancestors().nth(2).unwrap();
    workspace_root.join("test_documents")
}
/// Resolve `relative_path` against `test_documents/` and return it as a
/// (lossily converted) string.
fn get_test_file(relative_path: &str) -> String {
    let full_path = get_test_documents_dir().join(relative_path);
    full_path.to_string_lossy().into_owned()
}
/// Build the binary before running tests.
///
/// Every test calls this, so the `cargo build` invocation is guarded by a
/// [`std::sync::Once`]: it runs once per test process, and concurrent callers
/// block until that build has finished.
///
/// # Panics
///
/// Panics if `cargo` cannot be spawned or the build fails. The `Once` is then
/// poisoned, so every later caller panics as well.
fn build_binary() {
    static BUILD: std::sync::Once = std::sync::Once::new();
    BUILD.call_once(|| {
        let status = Command::new("cargo")
            .args(["build", "--bin", "kreuzberg"])
            .status()
            .expect("Failed to build kreuzberg binary");
        assert!(status.success(), "Failed to build kreuzberg binary");
    });
}
/// Smoke test: `extract` on the plain-text fixture exits successfully and
/// prints something. Returns early when the fixture is absent.
#[test]
fn test_extract_text_file() {
    build_binary();

    let fixture = get_test_file("text/simple.txt");
    let fixture_present = PathBuf::from(&fixture).exists();
    if !fixture_present {
        tracing::debug!("Skipping test: {} not found", fixture);
        return;
    }

    let run = Command::new(get_binary_path())
        .arg("extract")
        .arg(fixture.as_str())
        .output()
        .expect("Failed to execute extract command");

    let stderr = String::from_utf8_lossy(&run.stderr);
    assert!(run.status.success(), "Extract command failed: {}", stderr);

    let stdout = String::from_utf8_lossy(&run.stdout);
    assert!(!stdout.is_empty(), "Extract output should not be empty");
}
/// `extract --format json` prints a timing envelope of the shape
/// `{ result: ExtractionResult, extraction_time_ms: f64 }`.
/// Returns early when the fixture is absent.
#[test]
fn test_extract_with_json_output() {
    build_binary();

    let fixture = get_test_file("text/simple.txt");
    if !PathBuf::from(&fixture).exists() {
        tracing::debug!("Skipping test: {} not found", fixture);
        return;
    }

    let run = Command::new(get_binary_path())
        .arg("extract")
        .arg(fixture.as_str())
        .args(["--format", "json"])
        .output()
        .expect("Failed to execute extract command");

    let stderr = String::from_utf8_lossy(&run.stderr);
    assert!(run.status.success(), "Extract command failed: {}", stderr);

    let stdout = String::from_utf8_lossy(&run.stdout);
    let envelope: serde_json::Value = match serde_json::from_str(&stdout) {
        Ok(value) => value,
        Err(_) => panic!("Output should be valid JSON, got: {}", stdout),
    };

    // Top-level envelope keys.
    assert!(
        envelope.get("result").is_some(),
        "JSON envelope should have 'result' field"
    );
    assert!(
        envelope.get("extraction_time_ms").is_some(),
        "JSON envelope should have 'extraction_time_ms' field"
    );

    // Keys of the wrapped extraction result.
    let result = &envelope["result"];
    assert!(result.get("content").is_some(), "result should have 'content' field");
    assert!(
        result.get("mime_type").is_some(),
        "result should have 'mime_type' field"
    );
}
/// With chunking enabled, the JSON envelope's `result` carries a `chunks` array.
/// Returns early when the fixture is absent.
#[test]
fn test_extract_with_chunking() {
    build_binary();

    let fixture = get_test_file("text/simple.txt");
    if !PathBuf::from(&fixture).exists() {
        tracing::debug!("Skipping test: {} not found", fixture);
        return;
    }

    let run = Command::new(get_binary_path())
        .arg("extract")
        .arg(fixture.as_str())
        .args(["--chunk", "true"])
        .args(["--chunk-size", "100"])
        .args(["--chunk-overlap", "20"])
        .args(["--format", "json"])
        .output()
        .expect("Failed to execute extract command");

    let stderr = String::from_utf8_lossy(&run.stderr);
    assert!(run.status.success(), "Extract with chunking failed: {}", stderr);

    let stdout = String::from_utf8_lossy(&run.stdout);
    let envelope: serde_json::Value = serde_json::from_str(&stdout).expect("Should be valid JSON");

    // The chunks live under `result` inside the envelope.
    let result = &envelope["result"];
    assert!(result.get("chunks").is_some(), "result should have 'chunks' field");
    assert!(result["chunks"].is_array(), "'chunks' should be an array");
}
/// `extract` on a path that does not exist fails and says the file was not found.
#[test]
fn test_extract_file_not_found() {
    build_binary();

    let run = Command::new(get_binary_path())
        .arg("extract")
        .arg("/nonexistent/file.txt")
        .output()
        .expect("Failed to execute extract command");

    let failed = !run.status.success();
    assert!(failed, "Extract should fail for nonexistent file");

    let stderr = String::from_utf8_lossy(&run.stderr);
    let mentions_missing_file = stderr.contains("File not found");
    assert!(
        mentions_missing_file,
        "Error should mention file not found, got: {}",
        stderr
    );
}
/// `extract` on a directory fails and explains that a regular file is required.
#[test]
fn test_extract_directory_not_file() {
    build_binary();

    // The temp dir is kept alive in `scratch` until the end of the test.
    let scratch = tempdir().expect("Failed to create temp dir");
    let scratch_path = scratch.path().to_string_lossy().to_string();

    let run = Command::new(get_binary_path())
        .arg("extract")
        .arg(scratch_path.as_str())
        .output()
        .expect("Failed to execute extract command");

    assert!(!run.status.success(), "Extract should fail for directory");

    let stderr = String::from_utf8_lossy(&run.stderr);
    let mentions_not_a_file = stderr.contains("not a file") || stderr.contains("regular file");
    assert!(
        mentions_not_a_file,
        "Error should mention path is not a file, got: {}",
        stderr
    );
}
/// `--chunk-size 0` is rejected with an error about the chunk size.
/// Returns early when the fixture is absent.
#[test]
fn test_extract_invalid_chunk_size_zero() {
    build_binary();

    let fixture = get_test_file("text/simple.txt");
    if !PathBuf::from(&fixture).exists() {
        tracing::debug!("Skipping test: {} not found", fixture);
        return;
    }

    let run = Command::new(get_binary_path())
        .arg("extract")
        .arg(fixture.as_str())
        .args(["--chunk-size", "0"])
        .output()
        .expect("Failed to execute extract command");

    assert!(!run.status.success(), "Extract should fail for chunk size 0");

    let stderr = String::from_utf8_lossy(&run.stderr);
    let mentions_chunk_size = stderr.contains("Invalid chunk size") || stderr.contains("must be greater than 0");
    assert!(
        mentions_chunk_size,
        "Error should mention invalid chunk size, got: {}",
        stderr
    );
}
/// A chunk size above one million is rejected with an error about the limit.
/// Returns early when the fixture is absent.
#[test]
fn test_extract_invalid_chunk_size_too_large() {
    build_binary();

    let fixture = get_test_file("text/simple.txt");
    if !PathBuf::from(&fixture).exists() {
        tracing::debug!("Skipping test: {} not found", fixture);
        return;
    }

    let run = Command::new(get_binary_path())
        .arg("extract")
        .arg(fixture.as_str())
        .args(["--chunk-size", "2000000"])
        .output()
        .expect("Failed to execute extract command");

    assert!(!run.status.success(), "Extract should fail for chunk size > 1M");

    let stderr = String::from_utf8_lossy(&run.stderr);
    let mentions_limit = stderr.contains("Invalid chunk size") || stderr.contains("1,000,000");
    assert!(
        mentions_limit,
        "Error should mention chunk size limit, got: {}",
        stderr
    );
}
/// An overlap equal to the chunk size is rejected with an error about the overlap.
/// Returns early when the fixture is absent.
#[test]
fn test_extract_invalid_overlap_equals_chunk_size() {
    build_binary();

    let fixture = get_test_file("text/simple.txt");
    if !PathBuf::from(&fixture).exists() {
        tracing::debug!("Skipping test: {} not found", fixture);
        return;
    }

    let run = Command::new(get_binary_path())
        .arg("extract")
        .arg(fixture.as_str())
        .args(["--chunk-size", "100"])
        .args(["--chunk-overlap", "100"])
        .output()
        .expect("Failed to execute extract command");

    assert!(
        !run.status.success(),
        "Extract should fail when overlap equals chunk size"
    );

    let stderr = String::from_utf8_lossy(&run.stderr);
    let mentions_overlap =
        stderr.contains("Invalid chunk overlap") || stderr.contains("must be less than chunk size");
    assert!(
        mentions_overlap,
        "Error should mention overlap constraint, got: {}",
        stderr
    );
}
/// `detect` on the plain-text fixture succeeds and reports a text MIME type.
/// Returns early when the fixture is absent.
#[test]
fn test_detect_mime_type() {
    build_binary();

    let fixture = get_test_file("text/simple.txt");
    if !PathBuf::from(&fixture).exists() {
        tracing::debug!("Skipping test: {} not found", fixture);
        return;
    }

    let run = Command::new(get_binary_path())
        .arg("detect")
        .arg(fixture.as_str())
        .output()
        .expect("Failed to execute detect command");

    let stderr = String::from_utf8_lossy(&run.stderr);
    assert!(run.status.success(), "Detect command failed: {}", stderr);

    let stdout = String::from_utf8_lossy(&run.stdout);
    assert!(!stdout.is_empty(), "Detect output should not be empty");

    let looks_like_text = stdout.contains("text/plain") || stdout.contains("text");
    assert!(looks_like_text, "Should detect text MIME type, got: {}", stdout);
}
/// `detect --format json` prints a JSON object with `mime_type` and `path` keys.
/// Returns early when the fixture is absent.
#[test]
fn test_detect_with_json_output() {
    build_binary();

    let fixture = get_test_file("text/simple.txt");
    if !PathBuf::from(&fixture).exists() {
        tracing::debug!("Skipping test: {} not found", fixture);
        return;
    }

    let run = Command::new(get_binary_path())
        .arg("detect")
        .arg(fixture.as_str())
        .args(["--format", "json"])
        .output()
        .expect("Failed to execute detect command");

    let stderr = String::from_utf8_lossy(&run.stderr);
    assert!(run.status.success(), "Detect command failed: {}", stderr);

    let stdout = String::from_utf8_lossy(&run.stdout);
    let detection: serde_json::Value = match serde_json::from_str(&stdout) {
        Ok(value) => value,
        Err(_) => panic!("Output should be valid JSON, got: {}", stdout),
    };

    assert!(
        detection.get("mime_type").is_some(),
        "JSON should have 'mime_type' field"
    );
    assert!(detection.get("path").is_some(), "JSON should have 'path' field");
}
/// `detect` on a path that does not exist fails and says the file was not found.
#[test]
fn test_detect_file_not_found() {
    build_binary();

    let run = Command::new(get_binary_path())
        .arg("detect")
        .arg("/nonexistent/file.txt")
        .output()
        .expect("Failed to execute detect command");

    let failed = !run.status.success();
    assert!(failed, "Detect should fail for nonexistent file");

    let stderr = String::from_utf8_lossy(&run.stderr);
    let mentions_missing_file = stderr.contains("File not found");
    assert!(
        mentions_missing_file,
        "Error should mention file not found, got: {}",
        stderr
    );
}
/// `batch --format json` on two inputs prints an envelope whose `results`
/// array has two entries. The same fixture is passed twice.
/// Returns early when the fixture is absent.
#[test]
fn test_batch_multiple_files() {
    build_binary();

    let first = get_test_file("text/simple.txt");
    let second = get_test_file("text/simple.txt");
    if !PathBuf::from(&first).exists() {
        tracing::debug!("Skipping test: {} not found", first);
        return;
    }

    let run = Command::new(get_binary_path())
        .arg("batch")
        .arg(first.as_str())
        .arg(second.as_str())
        .args(["--format", "json"])
        .output()
        .expect("Failed to execute batch command");

    let stderr = String::from_utf8_lossy(&run.stderr);
    assert!(run.status.success(), "Batch command failed: {}", stderr);

    let stdout = String::from_utf8_lossy(&run.stdout);
    let envelope: serde_json::Value = match serde_json::from_str(&stdout) {
        Ok(value) => value,
        Err(_) => panic!("Output should be valid JSON, got: {}", stdout),
    };

    // Batch envelope shape: { results: [...], total_ms, per_file_ms }.
    assert!(
        envelope.get("results").is_some(),
        "Batch envelope should have 'results' field"
    );
    let results = &envelope["results"];
    assert!(results.is_array(), "'results' should be a JSON array");
    assert_eq!(results.as_array().unwrap().len(), 2, "Should have 2 results");
}
/// `batch` fails as a whole when one of its inputs does not exist.
/// Returns early when the fixture is absent.
#[test]
fn test_batch_with_missing_file() {
    build_binary();

    let existing = get_test_file("text/simple.txt");
    if !PathBuf::from(&existing).exists() {
        tracing::debug!("Skipping test: {} not found", existing);
        return;
    }

    let run = Command::new(get_binary_path())
        .arg("batch")
        .arg(existing.as_str())
        .arg("/nonexistent/file.txt")
        .output()
        .expect("Failed to execute batch command");

    assert!(!run.status.success(), "Batch should fail when one file is missing");

    let stderr = String::from_utf8_lossy(&run.stderr);
    let mentions_bad_file = stderr.contains("File not found") || stderr.contains("Invalid file");
    assert!(
        mentions_bad_file,
        "Error should mention file not found, got: {}",
        stderr
    );
}
/// `extract --help` succeeds and shows the command summary plus the chunking flags.
#[test]
fn test_extract_help() {
    build_binary();

    let run = Command::new(get_binary_path())
        .arg("extract")
        .arg("--help")
        .output()
        .expect("Failed to execute extract --help");
    assert!(run.status.success());

    let help = String::from_utf8_lossy(&run.stdout);
    assert!(help.contains("Extract text from a document"));
    assert!(help.contains("--chunk-size"));
    assert!(help.contains("--chunk-overlap"));
}
/// `detect --help` succeeds and shows the command summary.
#[test]
fn test_detect_help() {
    build_binary();

    let run = Command::new(get_binary_path())
        .arg("detect")
        .arg("--help")
        .output()
        .expect("Failed to execute detect --help");
    assert!(run.status.success());

    let help = String::from_utf8_lossy(&run.stdout);
    assert!(help.contains("Detect MIME type"));
}
/// `batch --help` succeeds and shows the command summary.
#[test]
fn test_batch_help() {
    build_binary();

    let run = Command::new(get_binary_path())
        .arg("batch")
        .arg("--help")
        .output()
        .expect("Failed to execute batch --help");
    assert!(run.status.success());

    let help = String::from_utf8_lossy(&run.stdout);
    assert!(help.contains("Batch extract from multiple documents"));
}
// ── Extract command flag parsing tests ──────────────────────────────
/// Every extraction override flag is listed in `extract --help`.
#[test]
fn test_extract_help_shows_all_extraction_override_flags() {
    // Flags that must appear in the help output.
    const EXPECTED_FLAGS: [&str; 27] = [
        "--ocr",
        "--ocr-backend",
        "--ocr-language",
        "--force-ocr",
        "--no-cache",
        "--ocr-auto-rotate",
        "--chunk",
        "--chunk-size",
        "--chunk-overlap",
        "--chunking-tokenizer",
        "--content-format",
        "--include-structure",
        "--quality",
        "--detect-language",
        "--layout",
        "--layout-confidence",
        "--layout-table-model",
        "--acceleration",
        "--max-concurrent",
        "--max-threads",
        "--extract-pages",
        "--page-markers",
        "--extract-images",
        "--target-dpi",
        "--pdf-password",
        "--token-reduction",
        "--msg-codepage",
    ];

    build_binary();

    let run = Command::new(get_binary_path())
        .arg("extract")
        .arg("--help")
        .output()
        .expect("Failed to execute extract --help");
    assert!(run.status.success());

    let help = String::from_utf8_lossy(&run.stdout);
    EXPECTED_FLAGS.iter().for_each(|flag| {
        assert!(
            help.contains(flag),
            "Extract --help should show flag '{}', but it was not found in output:\n{}",
            flag,
            help
        );
    });
}
// ── Batch command flag parity test ──────────────────────────────────
/// The extraction override flags listed here appear in the help output of
/// both `extract` and `batch`.
#[test]
fn test_batch_has_same_extraction_flags_as_extract() {
    // Flags that both commands are expected to expose.
    const SHARED_FLAGS: [&str; 24] = [
        "--ocr",
        "--ocr-backend",
        "--ocr-language",
        "--force-ocr",
        "--no-cache",
        "--chunk",
        "--chunk-size",
        "--chunk-overlap",
        "--content-format",
        "--quality",
        "--detect-language",
        "--layout",
        "--layout-confidence",
        "--layout-table-model",
        "--acceleration",
        "--max-concurrent",
        "--max-threads",
        "--extract-pages",
        "--page-markers",
        "--extract-images",
        "--target-dpi",
        "--pdf-password",
        "--token-reduction",
        "--msg-codepage",
    ];

    build_binary();

    let extract_run = Command::new(get_binary_path())
        .arg("extract")
        .arg("--help")
        .output()
        .expect("Failed to execute extract --help");
    let batch_run = Command::new(get_binary_path())
        .arg("batch")
        .arg("--help")
        .output()
        .expect("Failed to execute batch --help");
    assert!(extract_run.status.success());
    assert!(batch_run.status.success());

    let extract_help = String::from_utf8_lossy(&extract_run.stdout);
    let batch_help = String::from_utf8_lossy(&batch_run.stdout);

    // Check each flag on `extract` first, then on `batch`.
    SHARED_FLAGS.iter().for_each(|flag| {
        assert!(
            extract_help.contains(flag),
            "Extract should have flag '{}' but it's missing",
            flag
        );
        assert!(
            batch_help.contains(flag),
            "Batch should have flag '{}' (parity with extract) but it's missing",
            flag
        );
    });
}
// ── Validation error tests ──────────────────────────────────────────
//
// NOTE: The CLI validates file existence *before* override validation,
// so we must provide a real file to reach the override validation stage.
/// Writes a placeholder `dummy.pdf` into a freshly created temporary directory.
///
/// Returns the directory handle together with the file path rendered as a
/// `String`. Keep the returned `tempfile::TempDir` alive for as long as the
/// path is needed so the file is not deleted.
fn create_temp_file() -> (tempfile::TempDir, String) {
    let temp_dir = tempdir().expect("Failed to create temp dir");
    let dummy = temp_dir.path().join("dummy.pdf");
    std::fs::write(&dummy, b"dummy content").expect("Failed to write temp file");
    let rendered = dummy.to_string_lossy().into_owned();
    (temp_dir, rendered)
}
/// `extract --chunk-size 0` must exit unsuccessfully and report a chunk-size error on stderr.
#[test]
fn test_extract_chunk_size_zero_error() {
    build_binary();
    // `_guard` keeps the temporary directory (and the input file) alive.
    let (_guard, input) = create_temp_file();

    let run = Command::new(get_binary_path())
        .arg("extract")
        .args(["--chunk-size", "0"])
        .arg(&input)
        .output()
        .expect("Failed to execute extract command");

    assert!(!run.status.success(), "Should fail when chunk size is 0");

    let stderr = String::from_utf8_lossy(&run.stderr);
    let mentions_chunk_size = ["chunk size", "Chunk size", "Invalid chunk size"]
        .iter()
        .any(|needle| stderr.contains(*needle));
    assert!(mentions_chunk_size, "Error should mention chunk size, got: {stderr}");
}
/// A chunk overlap (20) larger than the chunk size (10) must be rejected with
/// an overlap-related message on stderr.
#[test]
fn test_extract_chunk_overlap_exceeds_size_error() {
    build_binary();
    // `_guard` keeps the temporary directory (and the input file) alive.
    let (_guard, input) = create_temp_file();

    let run = Command::new(get_binary_path())
        .arg("extract")
        .args(["--chunk-size", "10"])
        .args(["--chunk-overlap", "20"])
        .arg(&input)
        .output()
        .expect("Failed to execute extract command");

    assert!(!run.status.success(), "Should fail when overlap exceeds chunk size");

    let stderr = String::from_utf8_lossy(&run.stderr);
    let mentions_overlap = ["overlap", "Overlap", "Invalid chunk overlap"]
        .iter()
        .any(|needle| stderr.contains(*needle));
    assert!(mentions_overlap, "Error should mention overlap constraint, got: {stderr}");
}
/// `extract --layout-confidence 2.0` must fail.
///
/// The flag is feature-gated behind layout-detection: a binary built without
/// that feature is expected to have clap reject the flag as unknown, which is
/// why "unexpected argument" is also accepted in the error text.
#[test]
fn test_extract_layout_confidence_out_of_range_error() {
    build_binary();
    // `_guard` keeps the temporary directory (and the input file) alive.
    let (_guard, input) = create_temp_file();

    let run = Command::new(get_binary_path())
        .arg("extract")
        .args(["--layout-confidence", "2.0"])
        .arg(&input)
        .output()
        .expect("Failed to execute extract command");

    assert!(
        !run.status.success(),
        "Should fail for layout confidence out of range"
    );

    let stderr = String::from_utf8_lossy(&run.stderr);
    let explains_failure = ["confidence", "layout", "unexpected argument"]
        .iter()
        .any(|needle| stderr.contains(*needle));
    assert!(
        explains_failure,
        "Error should mention confidence or layout, got: {stderr}"
    );
}
/// Combining `--layout false` with `--layout-confidence` must fail.
///
/// With the layout-detection feature enabled the validation should reject the
/// combination; without it, clap rejects the unknown flags. Either way the
/// process is expected to exit unsuccessfully.
#[test]
fn test_extract_layout_false_with_confidence_error() {
    build_binary();
    // `_guard` keeps the temporary directory (and the input file) alive.
    let (_guard, input) = create_temp_file();

    let run = Command::new(get_binary_path())
        .arg("extract")
        .args(["--layout", "false"])
        .args(["--layout-confidence", "0.5"])
        .arg(&input)
        .output()
        .expect("Failed to execute extract command");

    let failed = !run.status.success();
    assert!(
        failed,
        "Should fail when --layout false is combined with --layout-confidence"
    );
}
/// `extract --target-dpi 0` must exit unsuccessfully with a DPI-related message on stderr.
#[test]
fn test_extract_target_dpi_zero_error() {
    build_binary();
    // `_guard` keeps the temporary directory (and the input file) alive.
    let (_guard, input) = create_temp_file();

    let run = Command::new(get_binary_path())
        .arg("extract")
        .args(["--target-dpi", "0"])
        .arg(&input)
        .output()
        .expect("Failed to execute extract command");

    assert!(!run.status.success(), "Should fail when target DPI is 0");

    let stderr = String::from_utf8_lossy(&run.stderr);
    let mentions_dpi = ["DPI", "dpi", "target", "Invalid"]
        .iter()
        .any(|needle| stderr.contains(*needle));
    assert!(mentions_dpi, "Error should mention DPI range, got: {stderr}");
}
// ── Completions test ────────────────────────────────────────────────
/// `completions bash` must succeed, print something, and mention the
/// `kreuzberg` command name in the generated script.
#[test]
fn test_completions_bash_produces_output() {
    build_binary();
    let output = Command::new(get_binary_path())
        .args(["completions", "bash"])
        .output()
        .expect("Failed to execute completions command");
    assert!(
        output.status.success(),
        "Completions command failed: {}",
        String::from_utf8_lossy(&output.stderr)
    );
    let stdout = String::from_utf8_lossy(&output.stdout);
    assert!(!stdout.is_empty(), "Completions output should not be empty");
    // bash completions should contain the command name
    assert!(
        stdout.contains("kreuzberg"),
        "Bash completions should reference 'kreuzberg', got: {}",
        // Truncate by characters rather than bytes: slicing `&stdout[..200]`
        // panics when byte 200 falls inside a multi-byte character, which
        // would replace this diagnostic with an unrelated slicing panic.
        stdout.chars().take(200).collect::<String>()
    );
}
/// `completions zsh` must exit successfully and write a non-empty script to stdout.
#[test]
fn test_completions_zsh_produces_output() {
    build_binary();

    let run = Command::new(get_binary_path())
        .arg("completions")
        .arg("zsh")
        .output()
        .expect("Failed to execute completions command");

    if !run.status.success() {
        panic!(
            "Completions command failed: {}",
            String::from_utf8_lossy(&run.stderr)
        );
    }

    let script = String::from_utf8_lossy(&run.stdout);
    assert!(!script.is_empty(), "Zsh completions output should not be empty");
}
/// `completions fish` must exit successfully and write a non-empty script to stdout.
#[test]
fn test_completions_fish_produces_output() {
    build_binary();

    let run = Command::new(get_binary_path())
        .arg("completions")
        .arg("fish")
        .output()
        .expect("Failed to execute completions command");

    if !run.status.success() {
        panic!(
            "Completions command failed: {}",
            String::from_utf8_lossy(&run.stderr)
        );
    }

    let script = String::from_utf8_lossy(&run.stdout);
    assert!(!script.is_empty(), "Fish completions output should not be empty");
}
// ── Embed help test ─────────────────────────────────────────────────
/// `embed --help` must list the `--text`, `--preset` and `--format` flags and
/// describe embedding generation.
///
/// The `embed` subcommand is feature-gated. When the binary reports it as an
/// unknown subcommand the test is skipped; any other failure is reported with
/// the captured stderr instead of falling through to the help-text checks.
#[test]
fn test_embed_help_shows_correct_flags() {
    build_binary();
    let output = Command::new(get_binary_path())
        .args(["embed", "--help"])
        .output()
        .expect("Failed to execute embed --help");
    // embed is feature-gated; if not compiled, clap will show an error
    if !output.status.success() {
        let stderr = String::from_utf8_lossy(&output.stderr);
        // If embed subcommand doesn't exist, skip the test
        if stderr.contains("unrecognized subcommand") || stderr.contains("invalid subcommand") {
            return;
        }
        // Any other failure is a real error: surface stderr rather than
        // letting the stdout assertions below fail with an empty "got:".
        panic!("Embed --help failed: {}", stderr);
    }
    let stdout = String::from_utf8_lossy(&output.stdout);
    assert!(
        stdout.contains("--text"),
        "Embed help should show --text flag, got: {}",
        stdout
    );
    assert!(
        stdout.contains("--preset"),
        "Embed help should show --preset flag, got: {}",
        stdout
    );
    assert!(
        stdout.contains("--format"),
        "Embed help should show --format flag, got: {}",
        stdout
    );
    assert!(
        stdout.contains("Generate embeddings"),
        "Embed help should describe embedding generation, got: {}",
        stdout
    );
}
// ── Chunk help test ─────────────────────────────────────────────────
/// `chunk --help` must succeed, list the chunking flags, and describe text chunking.
#[test]
fn test_chunk_help_shows_correct_flags() {
    build_binary();

    let run = Command::new(get_binary_path())
        .arg("chunk")
        .arg("--help")
        .output()
        .expect("Failed to execute chunk --help");

    assert!(
        run.status.success(),
        "Chunk --help failed: {}",
        String::from_utf8_lossy(&run.stderr)
    );

    let help = String::from_utf8_lossy(&run.stdout);

    // Flags are checked in this order so the first missing one is reported.
    let required_flags = [
        "--text",
        "--chunk-size",
        "--chunk-overlap",
        "--chunker-type",
        "--format",
    ];
    for flag in required_flags {
        assert!(
            help.contains(flag),
            "Chunk help should show {flag} flag, got: {help}"
        );
    }

    assert!(
        help.contains("Chunk text"),
        "Chunk help should describe text chunking, got: {help}"
    );
}
// ── Style module NO_COLOR test ──────────────────────────────────────
/// With `NO_COLOR=1` in the environment, `detect` output must not contain the
/// ANSI escape introducer `ESC [`.
///
/// Returns early (without asserting anything) when the sample document is absent.
#[test]
fn test_no_color_env_disables_ansi_in_output() {
    build_binary();

    let sample = get_test_file("text/simple.txt");
    if !PathBuf::from(&sample).exists() {
        return;
    }

    // NO_COLOR is set only for the child process, not for this test process.
    let run = Command::new(get_binary_path())
        .env("NO_COLOR", "1")
        .arg("detect")
        .arg(&sample)
        .output()
        .expect("Failed to execute detect command");

    assert!(
        run.status.success(),
        "Detect failed: {}",
        String::from_utf8_lossy(&run.stderr)
    );

    let rendered = String::from_utf8_lossy(&run.stdout);
    let has_ansi = rendered.contains("\x1b[");
    assert!(
        !has_ansi,
        "Output should not contain ANSI escape sequences when NO_COLOR is set, got: {rendered:?}"
    );
}
// ── Additional validation edge cases ────────────────────────────────
/// `extract --chunk-size 2000000` must exit unsuccessfully with a chunk-size
/// message (or the `1,000,000` limit) on stderr.
#[test]
fn test_extract_chunk_size_too_large_error() {
    build_binary();
    // `_guard` keeps the temporary directory (and the input file) alive.
    let (_guard, input) = create_temp_file();

    let run = Command::new(get_binary_path())
        .arg("extract")
        .args(["--chunk-size", "2000000"])
        .arg(&input)
        .output()
        .expect("Failed to execute extract command");

    assert!(!run.status.success(), "Should fail when chunk size exceeds limit");

    let stderr = String::from_utf8_lossy(&run.stderr);
    let mentions_limit = ["chunk size", "Chunk size", "1,000,000"]
        .iter()
        .any(|needle| stderr.contains(*needle));
    assert!(mentions_limit, "Error should mention chunk size limit, got: {stderr}");
}
/// `extract --target-dpi 5000` must exit unsuccessfully with a DPI-related message on stderr.
#[test]
fn test_extract_target_dpi_too_high_error() {
    build_binary();
    // `_guard` keeps the temporary directory (and the input file) alive.
    let (_guard, input) = create_temp_file();

    let run = Command::new(get_binary_path())
        .arg("extract")
        .args(["--target-dpi", "5000"])
        .arg(&input)
        .output()
        .expect("Failed to execute extract command");

    assert!(!run.status.success(), "Should fail when target DPI exceeds limit");

    let stderr = String::from_utf8_lossy(&run.stderr);
    let mentions_dpi = ["DPI", "dpi", "2400", "Invalid"]
        .iter()
        .any(|needle| stderr.contains(*needle));
    assert!(mentions_dpi, "Error should mention DPI range, got: {stderr}");
}

View File

@@ -0,0 +1,617 @@
//! Integration tests for CLI config file discovery.
//!
//! These tests verify that the CLI correctly discovers and loads configuration files
//! in various formats (.toml, .yaml, .json) with case-insensitive extension
//! matching, explicit --config flag support, and proper error handling.
use std::fs;
use std::path::PathBuf;
use std::process::Command;
use tempfile::tempdir;
/// Returns the expected location of the debug `kreuzberg` binary, expressed
/// relative to this crate's manifest directory (two levels up, then `target/debug`).
fn get_binary_path() -> String {
    // Both pieces are compile-time literals, so they can be joined by `concat!`.
    concat!(env!("CARGO_MANIFEST_DIR"), "/../../target/debug/kreuzberg").to_string()
}
/// Runs `cargo build --bin kreuzberg` and panics unless the build succeeds.
fn build_binary() {
    let built = Command::new("cargo")
        .arg("build")
        .args(["--bin", "kreuzberg"])
        .status()
        .expect("Failed to build kreuzberg binary")
        .success();
    assert!(built, "Failed to build kreuzberg binary");
}
/// Returns the `test_documents` directory located two levels above this
/// crate's manifest directory.
fn get_test_documents_dir() -> PathBuf {
    let crate_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
    // ancestors(): index 0 is the crate dir itself, 1 its parent, 2 the grandparent.
    let root = crate_dir.ancestors().nth(2).unwrap();
    root.join("test_documents")
}
/// Resolves `relative_path` against the `test_documents` directory and returns
/// the result as a `String` (lossily converted if the path is not valid UTF-8).
fn get_test_file(relative_path: &str) -> String {
    let full_path = get_test_documents_dir().join(relative_path);
    full_path.to_string_lossy().into_owned()
}
/// Runs `extract` from a working directory that contains a `.kreuzberg.toml`
/// file and expects the command to succeed.
///
/// Skipped (early return) when the sample document is missing.
// NOTE(review): success alone does not prove the file was discovered and read.
#[test]
fn test_discover_kreuzberg_toml_in_current_directory() {
    build_binary();

    let workdir = tempdir().unwrap();
    let contents = r#"
use_cache = false
enable_quality_processing = false
"#;
    fs::write(workdir.path().join(".kreuzberg.toml"), contents).unwrap();

    let sample = get_test_file("text/simple.txt");
    if !PathBuf::from(&sample).exists() {
        tracing::debug!("Skipping test: {} not found", sample);
        return;
    }

    let run = Command::new(get_binary_path())
        .current_dir(workdir.path())
        .arg("extract")
        .arg(&sample)
        .output()
        .expect("Failed to execute kreuzberg");

    assert!(
        run.status.success(),
        "Command failed: {}",
        String::from_utf8_lossy(&run.stderr)
    );
}
/// Runs `extract` from a working directory that contains a `.kreuzberg.yaml`
/// file and expects the command to succeed.
///
/// Skipped (early return) when the sample document is missing.
#[test]
fn test_discover_kreuzberg_yaml_in_current_directory() {
    build_binary();

    let workdir = tempdir().unwrap();
    let contents = r#"
use_cache: false
enable_quality_processing: false
"#;
    fs::write(workdir.path().join(".kreuzberg.yaml"), contents).unwrap();

    let sample = get_test_file("text/simple.txt");
    if !PathBuf::from(&sample).exists() {
        tracing::debug!("Skipping test: {} not found", sample);
        return;
    }

    let run = Command::new(get_binary_path())
        .current_dir(workdir.path())
        .arg("extract")
        .arg(&sample)
        .output()
        .expect("Failed to execute kreuzberg");

    assert!(
        run.status.success(),
        "Command failed: {}",
        String::from_utf8_lossy(&run.stderr)
    );
}
/// Runs `extract` from a working directory that contains a `.kreuzberg.yml`
/// file (short YAML extension) and expects the command to succeed.
///
/// Skipped (early return) when the sample document is missing.
#[test]
fn test_discover_kreuzberg_yml_in_current_directory() {
    build_binary();
    let dir = tempdir().unwrap();
    // Use the `.yml` spelling: writing `.kreuzberg.yaml` here would make this
    // test an exact duplicate of the `.yaml` discovery test.
    let config_path = dir.path().join(".kreuzberg.yml");
    fs::write(
        &config_path,
        r#"
use_cache: false
enable_quality_processing: false
"#,
    )
    .unwrap();
    let test_file = get_test_file("text/simple.txt");
    if !PathBuf::from(&test_file).exists() {
        tracing::debug!("Skipping test: {} not found", test_file);
        return;
    }
    let output = Command::new(get_binary_path())
        .current_dir(dir.path())
        .args(["extract", test_file.as_str()])
        .output()
        .expect("Failed to execute kreuzberg");
    assert!(
        output.status.success(),
        "Command failed: {}",
        String::from_utf8_lossy(&output.stderr)
    );
}
/// Runs `extract` from a working directory that contains a `.kreuzberg.json`
/// file and expects the command to succeed.
///
/// Skipped (early return) when the sample document is missing.
#[test]
fn test_discover_kreuzberg_json_in_current_directory() {
    build_binary();

    let workdir = tempdir().unwrap();
    let contents = r#"{
"use_cache": false,
"enable_quality_processing": false
}"#;
    fs::write(workdir.path().join(".kreuzberg.json"), contents).unwrap();

    let sample = get_test_file("text/simple.txt");
    if !PathBuf::from(&sample).exists() {
        tracing::debug!("Skipping test: {} not found", sample);
        return;
    }

    let run = Command::new(get_binary_path())
        .current_dir(workdir.path())
        .arg("extract")
        .arg(&sample)
        .output()
        .expect("Failed to execute kreuzberg");

    assert!(
        run.status.success(),
        "Command failed: {}",
        String::from_utf8_lossy(&run.stderr)
    );
}
/// Passes a config file named `custom.TOML` (upper-case extension) via
/// `--config` and expects `extract` to succeed.
///
/// Skipped (early return) when the sample document is missing.
#[test]
fn test_case_insensitive_toml_extension() {
    build_binary();

    let workdir = tempdir().unwrap();
    let config_file = workdir.path().join("custom.TOML");
    let contents = r#"
use_cache = false
"#;
    fs::write(&config_file, contents).unwrap();

    let sample = get_test_file("text/simple.txt");
    if !PathBuf::from(&sample).exists() {
        tracing::debug!("Skipping test: {} not found", sample);
        return;
    }

    let cfg = config_file.to_string_lossy().to_string();
    let run = Command::new(get_binary_path())
        .current_dir(workdir.path())
        .arg("extract")
        .args(["--config", &cfg])
        .arg(&sample)
        .output()
        .expect("Failed to execute kreuzberg");

    assert!(
        run.status.success(),
        "Command failed: {}",
        String::from_utf8_lossy(&run.stderr)
    );
}
/// Passes a config file named `custom.Yaml` (mixed-case extension) via
/// `--config` and expects `extract` to succeed.
///
/// Skipped (early return) when the sample document is missing.
#[test]
fn test_case_insensitive_yaml_extension() {
    build_binary();

    let workdir = tempdir().unwrap();
    let config_file = workdir.path().join("custom.Yaml");
    let contents = r#"
use_cache: false
"#;
    fs::write(&config_file, contents).unwrap();

    let sample = get_test_file("text/simple.txt");
    if !PathBuf::from(&sample).exists() {
        tracing::debug!("Skipping test: {} not found", sample);
        return;
    }

    let cfg = config_file.to_string_lossy().to_string();
    let run = Command::new(get_binary_path())
        .current_dir(workdir.path())
        .arg("extract")
        .args(["--config", &cfg])
        .arg(&sample)
        .output()
        .expect("Failed to execute kreuzberg");

    assert!(
        run.status.success(),
        "Command failed: {}",
        String::from_utf8_lossy(&run.stderr)
    );
}
/// Passes a config file named `custom.YML` (upper-case short YAML extension)
/// via `--config` and expects `extract` to succeed.
///
/// Skipped (early return) when the sample document is missing.
#[test]
fn test_case_insensitive_yml_extension() {
    build_binary();

    let workdir = tempdir().unwrap();
    let config_file = workdir.path().join("custom.YML");
    let contents = r#"
use_cache: false
"#;
    fs::write(&config_file, contents).unwrap();

    let sample = get_test_file("text/simple.txt");
    if !PathBuf::from(&sample).exists() {
        tracing::debug!("Skipping test: {} not found", sample);
        return;
    }

    let cfg = config_file.to_string_lossy().to_string();
    let run = Command::new(get_binary_path())
        .current_dir(workdir.path())
        .arg("extract")
        .args(["--config", &cfg])
        .arg(&sample)
        .output()
        .expect("Failed to execute kreuzberg");

    assert!(
        run.status.success(),
        "Command failed: {}",
        String::from_utf8_lossy(&run.stderr)
    );
}
/// Passes a config file named `custom.JSON` (upper-case extension) via
/// `--config` and expects `extract` to succeed.
///
/// Skipped (early return) when the sample document is missing.
#[test]
fn test_case_insensitive_json_extension() {
    build_binary();

    let workdir = tempdir().unwrap();
    let config_file = workdir.path().join("custom.JSON");
    let contents = r#"{
"use_cache": false
}"#;
    fs::write(&config_file, contents).unwrap();

    let sample = get_test_file("text/simple.txt");
    if !PathBuf::from(&sample).exists() {
        tracing::debug!("Skipping test: {} not found", sample);
        return;
    }

    let cfg = config_file.to_string_lossy().to_string();
    let run = Command::new(get_binary_path())
        .current_dir(workdir.path())
        .arg("extract")
        .args(["--config", &cfg])
        .arg(&sample)
        .output()
        .expect("Failed to execute kreuzberg");

    assert!(
        run.status.success(),
        "Command failed: {}",
        String::from_utf8_lossy(&run.stderr)
    );
}
/// Supplies a TOML config through an explicit `--config <path>` argument and
/// expects `extract` to succeed.
///
/// Skipped (early return) when the sample document is missing.
#[test]
fn test_explicit_config_path_toml() {
    build_binary();

    let workdir = tempdir().unwrap();
    let config_file = workdir.path().join("custom_config.toml");
    let contents = r#"
use_cache = false
enable_quality_processing = false
"#;
    fs::write(&config_file, contents).unwrap();

    let sample = get_test_file("text/simple.txt");
    if !PathBuf::from(&sample).exists() {
        tracing::debug!("Skipping test: {} not found", sample);
        return;
    }

    let cfg = config_file.to_string_lossy().to_string();
    let run = Command::new(get_binary_path())
        .arg("extract")
        .args(["--config", &cfg])
        .arg(&sample)
        .output()
        .expect("Failed to execute kreuzberg");

    assert!(
        run.status.success(),
        "Command failed: {}",
        String::from_utf8_lossy(&run.stderr)
    );
}
/// Supplies a YAML config through an explicit `--config <path>` argument and
/// expects `extract` to succeed.
///
/// Skipped (early return) when the sample document is missing.
#[test]
fn test_explicit_config_path_yaml() {
    build_binary();

    let workdir = tempdir().unwrap();
    let config_file = workdir.path().join("custom_config.yaml");
    let contents = r#"
use_cache: false
enable_quality_processing: false
"#;
    fs::write(&config_file, contents).unwrap();

    let sample = get_test_file("text/simple.txt");
    if !PathBuf::from(&sample).exists() {
        tracing::debug!("Skipping test: {} not found", sample);
        return;
    }

    let cfg = config_file.to_string_lossy().to_string();
    let run = Command::new(get_binary_path())
        .arg("extract")
        .args(["--config", &cfg])
        .arg(&sample)
        .output()
        .expect("Failed to execute kreuzberg");

    assert!(
        run.status.success(),
        "Command failed: {}",
        String::from_utf8_lossy(&run.stderr)
    );
}
/// Supplies a JSON config through an explicit `--config <path>` argument and
/// expects `extract` to succeed.
///
/// Skipped (early return) when the sample document is missing.
#[test]
fn test_explicit_config_path_json() {
    build_binary();

    let workdir = tempdir().unwrap();
    let config_file = workdir.path().join("custom_config.json");
    let contents = r#"{
"use_cache": false,
"enable_quality_processing": false
}"#;
    fs::write(&config_file, contents).unwrap();

    let sample = get_test_file("text/simple.txt");
    if !PathBuf::from(&sample).exists() {
        tracing::debug!("Skipping test: {} not found", sample);
        return;
    }

    let cfg = config_file.to_string_lossy().to_string();
    let run = Command::new(get_binary_path())
        .arg("extract")
        .args(["--config", &cfg])
        .arg(&sample)
        .output()
        .expect("Failed to execute kreuzberg");

    assert!(
        run.status.success(),
        "Command failed: {}",
        String::from_utf8_lossy(&run.stderr)
    );
}
/// A `--config` file with an unsupported `.txt` extension must make `extract`
/// fail, and stderr must name at least one supported extension.
///
/// Skipped (early return) when the sample document is missing.
#[test]
fn test_invalid_config_extension() {
    build_binary();

    let workdir = tempdir().unwrap();
    let config_file = workdir.path().join("config.txt");
    fs::write(&config_file, "invalid content").unwrap();

    let sample = get_test_file("text/simple.txt");
    if !PathBuf::from(&sample).exists() {
        tracing::debug!("Skipping test: {} not found", sample);
        return;
    }

    let cfg = config_file.to_string_lossy().to_string();
    let run = Command::new(get_binary_path())
        .arg("extract")
        .args(["--config", &cfg])
        .arg(&sample)
        .output()
        .expect("Failed to execute kreuzberg");

    assert!(!run.status.success());

    let stderr = String::from_utf8_lossy(&run.stderr);
    let names_supported_ext = [".toml", ".yaml", ".json"]
        .iter()
        .any(|ext| stderr.contains(*ext));
    assert!(
        names_supported_ext,
        "Error message should mention supported extensions: {stderr}"
    );
}
/// A syntactically broken TOML config passed via `--config` must make `extract` fail.
///
/// Skipped (early return) when the sample document is missing.
#[test]
fn test_malformed_toml_config() {
    build_binary();

    let workdir = tempdir().unwrap();
    let config_file = workdir.path().join("bad_config.toml");
    fs::write(&config_file, "use_cache = [[[[[").unwrap();

    let sample = get_test_file("text/simple.txt");
    if !PathBuf::from(&sample).exists() {
        tracing::debug!("Skipping test: {} not found", sample);
        return;
    }

    let cfg = config_file.to_string_lossy().to_string();
    let run = Command::new(get_binary_path())
        .arg("extract")
        .args(["--config", &cfg])
        .arg(&sample)
        .output()
        .expect("Failed to execute kreuzberg");

    assert!(!run.status.success());
}
/// A syntactically broken YAML config passed via `--config` must make `extract` fail.
///
/// Skipped (early return) when the sample document is missing.
#[test]
fn test_malformed_yaml_config() {
    build_binary();

    let workdir = tempdir().unwrap();
    let config_file = workdir.path().join("bad_config.yaml");
    fs::write(&config_file, "use_cache: [[[[[").unwrap();

    let sample = get_test_file("text/simple.txt");
    if !PathBuf::from(&sample).exists() {
        tracing::debug!("Skipping test: {} not found", sample);
        return;
    }

    let cfg = config_file.to_string_lossy().to_string();
    let run = Command::new(get_binary_path())
        .arg("extract")
        .args(["--config", &cfg])
        .arg(&sample)
        .output()
        .expect("Failed to execute kreuzberg");

    assert!(!run.status.success());
}
/// A syntactically broken JSON config passed via `--config` must make `extract` fail.
///
/// Skipped (early return) when the sample document is missing.
#[test]
fn test_malformed_json_config() {
    build_binary();

    let workdir = tempdir().unwrap();
    let config_file = workdir.path().join("bad_config.json");
    fs::write(&config_file, r#"{"use_cache": [[[[[}"#).unwrap();

    let sample = get_test_file("text/simple.txt");
    if !PathBuf::from(&sample).exists() {
        tracing::debug!("Skipping test: {} not found", sample);
        return;
    }

    let cfg = config_file.to_string_lossy().to_string();
    let run = Command::new(get_binary_path())
        .arg("extract")
        .args(["--config", &cfg])
        .arg(&sample)
        .output()
        .expect("Failed to execute kreuzberg");

    assert!(!run.status.success());
}
/// Pointing `--config` at a path that was never created must make `extract` fail.
///
/// Skipped (early return) when the sample document is missing.
#[test]
fn test_nonexistent_config_file() {
    build_binary();

    let workdir = tempdir().unwrap();
    // Deliberately never written to disk.
    let missing_config = workdir.path().join("nonexistent.toml");

    let sample = get_test_file("text/simple.txt");
    if !PathBuf::from(&sample).exists() {
        tracing::debug!("Skipping test: {} not found", sample);
        return;
    }

    let cfg = missing_config.to_string_lossy().to_string();
    let run = Command::new(get_binary_path())
        .arg("extract")
        .args(["--config", &cfg])
        .arg(&sample)
        .output()
        .expect("Failed to execute kreuzberg");

    assert!(!run.status.success());
}
/// Runs `extract` from an empty temporary working directory, with no config
/// file written there and no `--config` flag, and expects success.
///
/// Skipped (early return) when the sample document is missing.
#[test]
fn test_default_config_when_no_file_found() {
    build_binary();

    let empty_dir = tempdir().unwrap();

    let sample = get_test_file("text/simple.txt");
    if !PathBuf::from(&sample).exists() {
        tracing::debug!("Skipping test: {} not found", sample);
        return;
    }

    let run = Command::new(get_binary_path())
        .current_dir(empty_dir.path())
        .arg("extract")
        .arg(&sample)
        .output()
        .expect("Failed to execute kreuzberg");

    assert!(
        run.status.success(),
        "Command failed: {}",
        String::from_utf8_lossy(&run.stderr)
    );
}
/// A well-formed TOML config whose `use_cache` value is a string instead of a
/// boolean must make `extract` fail.
///
/// Skipped (early return) when the sample document is missing.
#[test]
fn test_invalid_config_values() {
    build_binary();

    let workdir = tempdir().unwrap();
    let config_file = workdir.path().join("invalid.toml");
    let contents = r#"
use_cache = "not_a_bool"
"#;
    fs::write(&config_file, contents).unwrap();

    let sample = get_test_file("text/simple.txt");
    if !PathBuf::from(&sample).exists() {
        tracing::debug!("Skipping test: {} not found", sample);
        return;
    }

    let cfg = config_file.to_string_lossy().to_string();
    let run = Command::new(get_binary_path())
        .arg("extract")
        .args(["--config", &cfg])
        .arg(&sample)
        .output()
        .expect("Failed to execute kreuzberg");

    assert!(!run.status.success());
}

View File

@@ -0,0 +1,46 @@
//! Regression test for issue #773.
//! Validates that environment variable overrides are correctly applied during configuration loading.
use kreuzberg::{EmbeddingModelType, ExtractionConfig};
/// Regression test for issue #773: setting `KREUZBERG_OCR_LANGUAGE` must be
/// reflected in `config.ocr.language` after `apply_env_overrides`.
#[test]
fn test_regression_773_env_override_loading() {
    let mut config = ExtractionConfig::default();
    // Precondition: the default configuration must not already use "fra".
    if let Some(ref ocr) = config.ocr {
        assert_ne!(ocr.language, "fra");
    }
    // SAFETY: mutating the process environment is only sound while no other
    // thread accesses it concurrently.
    // NOTE(review): the test harness runs tests on multiple threads by
    // default; consider serializing tests that touch the environment.
    unsafe { std::env::set_var("KREUZBERG_OCR_LANGUAGE", "fra") };
    // Capture the result first so the variable is removed even when applying
    // the overrides fails; panicking before the cleanup would leak it into
    // every other test in this process.
    let applied = config.apply_env_overrides();
    // SAFETY: same requirement as for `set_var` above.
    unsafe { std::env::remove_var("KREUZBERG_OCR_LANGUAGE") };
    applied.expect("Failed to apply overrides");
    let ocr = config
        .ocr
        .expect("OCR config should be Some when KREUZBERG_OCR_LANGUAGE is set");
    assert_eq!(ocr.language, "fra");
}
/// Regression test for issue #773: setting `KREUZBERG_VLM_EMBEDDING_MODEL`
/// must enable chunking with an `Llm` embedding model carrying that model
/// name and no API key.
#[test]
fn test_regression_773_vlm_embedding_env_override() {
    let mut config = ExtractionConfig::default();
    // SAFETY: mutating the process environment is only sound while no other
    // thread accesses it concurrently.
    // NOTE(review): the test harness runs tests on multiple threads by
    // default; consider serializing tests that touch the environment.
    unsafe { std::env::set_var("KREUZBERG_VLM_EMBEDDING_MODEL", "openai/text-embedding-3-small") };
    // Capture the result first so the variable is removed even when applying
    // the overrides fails; panicking before the cleanup would leak it into
    // every other test in this process.
    let applied = config.apply_env_overrides();
    // SAFETY: same requirement as for `set_var` above.
    unsafe { std::env::remove_var("KREUZBERG_VLM_EMBEDDING_MODEL") };
    applied.expect("Failed to apply environment overrides");
    let chunking = config
        .chunking
        .expect("Chunking should be enabled when VLM embedding is set");
    let embedding = chunking.embedding.expect("Embedding should be configured");
    match embedding.model {
        EmbeddingModelType::Llm { llm } => {
            assert_eq!(llm.model, "openai/text-embedding-3-small");
            assert!(llm.api_key.is_none());
        }
        _ => panic!("Expected Llm embedding model type"),
    }
}

View File

@@ -0,0 +1,344 @@
//! CLI configuration tests validating flags, aliases, and deprecation handling.
//!
//! This test suite verifies that:
//! 1. --output-format flag works correctly for all format options
//! 2. CLI flags properly override config file settings
//! 3. Config merge precedence is maintained (CLI args > config file > defaults)
//! 4. Configuration JSON can be passed inline
//! 5. Alias handling for deprecated flags works as expected
#![allow(clippy::bool_assert_comparison)]
#![allow(clippy::field_reassign_with_default)]
use std::path::PathBuf;
use tempfile::TempDir;
/// Writes `content` to a file called `name` inside `dir` and returns the
/// file's full path. Panics if the file cannot be written.
#[allow(dead_code)]
fn create_test_config(dir: &TempDir, name: &str, content: &str) -> PathBuf {
    let target = dir.path().join(name);
    if std::fs::write(&target, content).is_err() {
        panic!("Failed to write config file");
    }
    target
}
/// The default `ExtractionConfig` must use `OutputFormat::Plain`.
#[test]
fn test_output_format_flag_plain() {
    use kreuzberg::core::config::{ExtractionConfig, OutputFormat};

    let defaults = ExtractionConfig::default();
    assert_eq!(
        defaults.output_format,
        OutputFormat::Plain,
        "Default output format should be Plain"
    );
}
/// `OutputFormat::Markdown` must render as `Markdown` through its `Debug` implementation.
#[test]
fn test_output_format_flag_markdown() {
    use kreuzberg::core::config::OutputFormat;

    let rendered = format!("{:?}", OutputFormat::Markdown);
    assert_eq!(
        rendered, "Markdown",
        "Markdown format should have correct debug representation"
    );
}
/// `OutputFormat::Html` must render as `Html` through its `Debug` implementation.
#[test]
fn test_output_format_flag_html() {
    use kreuzberg::core::config::OutputFormat;

    let rendered = format!("{:?}", OutputFormat::Html);
    assert_eq!(
        rendered, "Html",
        "Html format should have correct debug representation"
    );
}
/// Assigning `OutputFormat::Markdown` to a config must stick, and the config
/// must serialize that field as the lowercase string `"markdown"`.
#[test]
fn test_extraction_config_with_output_format() {
    use kreuzberg::core::config::{ExtractionConfig, OutputFormat};

    let mut config = ExtractionConfig::default();
    config.output_format = OutputFormat::Markdown;

    assert_eq!(
        config.output_format,
        OutputFormat::Markdown,
        "output_format should be Markdown after assignment"
    );

    let as_json = serde_json::to_value(&config).expect("Failed to serialize");
    assert_eq!(
        as_json["output_format"], "markdown",
        "Serialized output_format should be 'markdown' (lowercase)"
    );
}
/// A JSON object carrying several config fields must deserialize into an
/// `ExtractionConfig` with those exact values.
#[test]
fn test_config_json_parsing_complete() {
    use kreuzberg::core::config::{ExtractionConfig, OutputFormat};

    let raw = serde_json::json!({
        "use_cache": true,
        "enable_quality_processing": true,
        "force_ocr": false,
        "output_format": "markdown",
        "result_format": "unified",
        "max_concurrent_extractions": 4,
    });

    let parsed = serde_json::from_value::<ExtractionConfig>(raw).expect("Failed to parse config JSON");

    assert!(parsed.use_cache);
    assert!(parsed.enable_quality_processing);
    assert_eq!(parsed.force_ocr, false);
    assert_eq!(parsed.output_format, OutputFormat::Markdown);
    assert_eq!(parsed.max_concurrent_extractions, Some(4));
}
/// Simulates CLI overrides by assigning fields on a default config and checks
/// that the assigned values are the ones observed afterwards.
#[test]
fn test_config_merge_precedence_cli_overrides_default() {
    use kreuzberg::core::config::ExtractionConfig;

    // Stand-in for CLI overrides applied on top of the defaults.
    let overridden = ExtractionConfig {
        use_cache: false,
        force_ocr: true,
        ..ExtractionConfig::default()
    };

    assert_eq!(overridden.use_cache, false, "CLI override should change use_cache to false");
    assert_eq!(overridden.force_ocr, true, "CLI override should change force_ocr to true");
}
/// Simulates a CLI override on top of a file-provided config: the overridden
/// field changes while the untouched field keeps the file's value.
#[test]
fn test_config_merge_precedence_cli_overrides_file() {
    use kreuzberg::core::config::ExtractionConfig;

    // Stand-in for values that came from a config file.
    let from_file = ExtractionConfig {
        use_cache: true,
        force_ocr: false,
        ..ExtractionConfig::default()
    };

    // Stand-in for a single CLI override applied to a copy of the file config.
    let mut merged = from_file.clone();
    merged.use_cache = false;

    assert_eq!(
        merged.use_cache, false,
        "CLI should override file config for use_cache"
    );
    assert!(!merged.force_ocr, "CLI should not affect fields not overridden");
}
/// Values parsed from JSON (`use_cache: false`, `force_ocr: true`) must differ
/// from the corresponding values of the default config.
#[test]
fn test_config_file_precedence_over_defaults() {
    use kreuzberg::core::config::ExtractionConfig;

    let raw = serde_json::json!({
        "use_cache": false,
        "force_ocr": true,
    });
    let from_file = serde_json::from_value::<ExtractionConfig>(raw).expect("Failed to parse");
    let defaults = ExtractionConfig::default();

    assert_ne!(
        from_file.use_cache, defaults.use_cache,
        "File config should override default for use_cache"
    );
    assert_ne!(
        from_file.force_ocr, defaults.force_ocr,
        "File config should override default for force_ocr"
    );
}
/// Each built-in `OutputFormat` variant must serialize to its lowercase name.
#[test]
fn test_output_format_serialization() {
    use kreuzberg::core::config::OutputFormat;

    // (variant, expected JSON string, message used if serialization fails)
    let cases = [
        (OutputFormat::Plain, "plain", "Failed to serialize Plain"),
        (OutputFormat::Markdown, "markdown", "Failed to serialize Markdown"),
        (OutputFormat::Html, "html", "Failed to serialize Html"),
    ];

    for (variant, expected, failure) in cases {
        let encoded = serde_json::to_value(variant).expect(failure);
        assert_eq!(encoded, expected);
    }
}
/// Each lowercase format name must deserialize to the matching `OutputFormat` variant.
#[test]
fn test_output_format_deserialization() {
    use kreuzberg::core::config::OutputFormat;

    // (JSON string, expected variant, message used if deserialization fails)
    let cases = [
        ("plain", OutputFormat::Plain, "Failed to deserialize plain"),
        ("markdown", OutputFormat::Markdown, "Failed to deserialize markdown"),
        ("html", OutputFormat::Html, "Failed to deserialize html"),
    ];

    for (raw, expected, failure) in cases {
        let decoded: OutputFormat = serde_json::from_value(serde_json::json!(raw)).expect(failure);
        assert_eq!(decoded, expected);
    }
}
/// `output_format` must be unchanged after serializing a config to a JSON
/// string and parsing it back.
#[test]
fn test_extraction_config_roundtrip_with_output_format() {
    use kreuzberg::core::config::{ExtractionConfig, OutputFormat};

    let mut before = ExtractionConfig::default();
    before.output_format = OutputFormat::Markdown;

    let encoded = serde_json::to_string(&before).expect("Failed to serialize");
    let after = serde_json::from_str::<ExtractionConfig>(&encoded).expect("Failed to deserialize");

    assert_eq!(
        before.output_format, after.output_format,
        "output_format should survive serialization roundtrip"
    );
}
/// For each of `Plain`, `Markdown` and `Html`: a config carrying that format
/// must keep it across a JSON value round trip.
#[test]
fn test_config_with_all_output_formats() {
    use kreuzberg::core::config::{ExtractionConfig, OutputFormat};

    for expected in [OutputFormat::Plain, OutputFormat::Markdown, OutputFormat::Html] {
        let mut config = ExtractionConfig::default();
        config.output_format = expected.clone();

        let encoded = serde_json::to_value(&config).expect("Failed to serialize");
        let decoded = serde_json::from_value::<ExtractionConfig>(encoded).expect("Failed to deserialize");

        assert_eq!(
            expected, decoded.output_format,
            "Format should be preserved for {:?}",
            expected
        );
    }
}
/// A JSON object containing only `output_format` must parse, set that field,
/// and leave `use_cache` at a truthy default.
#[test]
fn test_config_partial_json_with_output_format() {
    use kreuzberg::core::config::{ExtractionConfig, OutputFormat};

    let raw = serde_json::json!({
        "output_format": "markdown",
    });
    let parsed = serde_json::from_value::<ExtractionConfig>(raw).expect("Failed to parse partial config");

    assert_eq!(
        parsed.output_format,
        OutputFormat::Markdown,
        "output_format should be set from partial config"
    );
    // Fields absent from the JSON fall back to their defaults.
    assert!(parsed.use_cache, "use_cache should have default value");
}
#[test]
fn test_config_complete_json_structure() {
    use kreuzberg::core::config::ExtractionConfig;

    // Serialize the default config and look at the resulting JSON object.
    let serialized = serde_json::to_value(&ExtractionConfig::default()).expect("Failed to serialize");
    let obj = serialized.as_object().expect("Should be object");

    // Keys that must always be emitted, checked in this order.
    let critical_keys = [
        "output_format",
        "use_cache",
        "enable_quality_processing",
        "force_ocr",
        "result_format",
    ];
    for key in critical_keys {
        assert!(obj.contains_key(key), "Should have {}", key);
    }
}
#[test]
fn test_unknown_output_format_accepted_as_custom() {
    use kreuzberg::core::config::{ExtractionConfig, OutputFormat};

    // A format name that matches no built-in variant is expected to land in
    // OutputFormat::Custom (a custom renderer name) instead of being rejected.
    let json = serde_json::json!({
        "output_format": "my_custom_renderer",
    });
    let config = match serde_json::from_value::<ExtractionConfig>(json) {
        Ok(config) => config,
        Err(err) => panic!(
            "Unknown output_format should be accepted as Custom variant; got: {:?}",
            Some(err)
        ),
    };
    let expected = OutputFormat::Custom("my_custom_renderer".to_string());
    assert_eq!(
        config.output_format, expected,
        "Unknown format string must deserialize as OutputFormat::Custom"
    );
}
#[test]
fn test_config_case_sensitivity() {
    use kreuzberg::core::config::{ExtractionConfig, OutputFormat};

    // Only the lowercase spelling "plain" is exercised here; other casings
    // are not checked by this test.
    let plain_lowercase = serde_json::json!({"output_format": "plain"});
    let result = serde_json::from_value::<ExtractionConfig>(plain_lowercase);
    assert!(result.is_ok(), "lowercase 'plain' should be accepted");

    let parsed_format = result.unwrap().output_format;
    assert_eq!(parsed_format, OutputFormat::Plain);
}
#[test]
fn test_output_format_field_is_required_in_serialization() {
    use kreuzberg::core::config::ExtractionConfig;

    // The default config, once serialized, must contain an output_format entry.
    let serialized = serde_json::to_value(&ExtractionConfig::default()).expect("Failed to serialize");
    let output_format_entry = serialized.get("output_format");
    assert!(
        output_format_entry.is_some(),
        "output_format should always be present in serialization"
    );
}
#[test]
fn test_result_format_and_output_format_independent() {
    use kreuzberg::core::config::{ExtractionConfig, OutputFormat};

    // Only output_format is overridden; result_format keeps its default and
    // must still be serialized as its own string field.
    let config = ExtractionConfig {
        output_format: OutputFormat::Markdown,
        ..ExtractionConfig::default()
    };
    let serialized = serde_json::to_value(&config).expect("Failed to serialize");

    assert_eq!(serialized["output_format"], "markdown");
    assert!(
        serialized["result_format"].is_string(),
        "result_format should also be present"
    );
}
#[test]
fn test_extraction_config_clone_preserves_format() {
    use kreuzberg::core::config::{ExtractionConfig, OutputFormat};

    // A clone must carry the same output_format as the value it was made from.
    let original = ExtractionConfig {
        output_format: OutputFormat::Html,
        ..ExtractionConfig::default()
    };
    let duplicate = original.clone();

    assert_eq!(
        original.output_format, duplicate.output_format,
        "Cloned config should preserve output_format"
    );
}

View File

@@ -0,0 +1,355 @@
//! CLI contract tests - verify CLI config parsing matches Rust core
//!
//! This test suite validates that the CLI's configuration parsing produces
//! identical results to the Rust core library. It ensures that users get
//! consistent behavior whether using the CLI, SDK, or MCP interfaces.
use kreuzberg::core::config::ExtractionConfig;
use kreuzberg::core::config::OutputFormat;
use serde_json::json;
#[test]
fn test_cli_config_json_flag_basic_parsing() {
    let config_str = r#"{"use_cache": true, "output_format": "plain"}"#;

    // Route 1: deserialize the string directly, the way the Rust core would.
    let rust_config = serde_json::from_str::<ExtractionConfig>(config_str).expect("Failed to deserialize config string");

    // Route 2: go through an intermediate JSON value, simulating how the CLI
    // handles --config-json.
    let intermediate = serde_json::from_str::<serde_json::Value>(config_str).expect("Failed to parse JSON string");
    let cli_config =
        serde_json::from_value::<ExtractionConfig>(intermediate).expect("Failed to deserialize from JSON value");

    // Both routes must agree on the parsed fields.
    assert_eq!(
        rust_config.use_cache, cli_config.use_cache,
        "use_cache should be identical"
    );
    assert_eq!(
        rust_config.output_format, cli_config.output_format,
        "output_format should be identical"
    );
}
#[test]
fn test_cli_nested_config_deserialization() {
    // Nested `chunking` and `ocr` sections must deserialize into their
    // optional sub-configs with the given values.
    let config_str = r#"{
"chunking": {
"max_characters": 1000,
"overlap": 200
},
"ocr": {
"backend": "tesseract"
}
}"#;
    let config: ExtractionConfig = serde_json::from_str(config_str).expect("Failed to deserialize nested config");

    // `expect` both checks presence and unwraps, so a missing section fails
    // with the same message the separate presence assertions used.
    let chunking = config.chunking.expect("Chunking config should be present");
    let ocr = config.ocr.expect("OCR config should be present");

    // Failure messages name the fields that are actually being compared.
    assert_eq!(chunking.max_characters, 1000, "max_characters should be 1000");
    assert_eq!(chunking.overlap, 200, "overlap should be 200");
    assert_eq!(ocr.backend, "tesseract", "backend should be tesseract");
}
#[test]
fn test_cli_force_ocr_flag_parsing() {
    // Only force_ocr is supplied; everything else must come from defaults.
    let parsed = serde_json::from_str::<ExtractionConfig>(r#"{"force_ocr": true}"#)
        .expect("Failed to deserialize force_ocr config");

    assert!(parsed.force_ocr, "force_ocr should be true");
    // use_cache was not mentioned in the JSON and is expected to stay true.
    assert!(parsed.use_cache, "use_cache should still be true by default");
}
#[test]
fn test_cli_max_concurrent_extractions_parsing() {
    // A bare integer in the JSON must end up as Some(n) in the optional field.
    let parsed = serde_json::from_str::<ExtractionConfig>(r#"{"max_concurrent_extractions": 8}"#)
        .expect("Failed to deserialize concurrent extractions");

    let limit = parsed.max_concurrent_extractions;
    assert_eq!(limit, Some(8), "max_concurrent_extractions should be 8");
}
#[test]
fn test_cli_complex_config_deserialization() {
    // A config that sets top-level flags together with nested ocr/chunking
    // sections must deserialize in one pass.
    let config_str = r#"{
"use_cache": false,
"enable_quality_processing": true,
"force_ocr": true,
"output_format": "markdown",
"result_format": "unified",
"max_concurrent_extractions": 16,
"ocr": {
"backend": "tesseract",
"language": "eng"
},
"chunking": {
"max_characters": 2000,
"overlap": 400,
"strategy": "sliding_window"
}
}"#;
    let parsed = serde_json::from_str::<ExtractionConfig>(config_str).expect("Failed to deserialize complex config");

    // Top-level scalar fields.
    assert!(!parsed.use_cache);
    assert!(parsed.enable_quality_processing);
    assert!(parsed.force_ocr);
    assert_eq!(parsed.max_concurrent_extractions, Some(16));

    // Both nested sections must be present before their contents are inspected.
    assert!(parsed.ocr.is_some());
    assert!(parsed.chunking.is_some());

    let ocr_section = parsed.ocr.unwrap();
    assert_eq!(ocr_section.backend, "tesseract");
    assert_eq!(ocr_section.language, "eng");

    let chunking_section = parsed.chunking.unwrap();
    assert_eq!(chunking_section.max_characters, 2000);
    assert_eq!(chunking_section.overlap, 400);
}
#[test]
fn test_cli_empty_config_uses_defaults() {
    // An empty JSON object supplies nothing, so every checked field must
    // hold its default.
    let defaults = serde_json::from_str::<ExtractionConfig>(r#"{}"#).expect("Failed to deserialize empty config");

    assert!(defaults.use_cache, "Default use_cache should be true");
    assert!(
        defaults.enable_quality_processing,
        "Default enable_quality_processing should be true"
    );
    assert!(!defaults.force_ocr, "Default force_ocr should be false");
    assert_eq!(
        defaults.max_concurrent_extractions, None,
        "Default max_concurrent_extractions should be None"
    );
}
#[test]
fn test_cli_roundtrip_preserves_all_fields() {
    let original_str = r#"{
"use_cache": false,
"force_ocr": true,
"max_concurrent_extractions": 12
}"#;

    // string -> config -> JSON value -> config
    let first_pass = serde_json::from_str::<ExtractionConfig>(original_str).expect("Failed to deserialize");
    let as_value = serde_json::to_value(&first_pass).expect("Failed to serialize");
    let second_pass =
        serde_json::from_value::<ExtractionConfig>(as_value).expect("Failed to deserialize roundtripped config");

    // The values set in the input must still be there after the second parse.
    assert!(!second_pass.use_cache);
    assert!(second_pass.force_ocr);
    assert_eq!(second_pass.max_concurrent_extractions, Some(12));
}
#[test]
fn test_cli_output_format_enum_parsing() {
    // Pairs of (input JSON, variant the output_format field must parse to).
    let expectations = [
        (r#"{"output_format": "plain"}"#, OutputFormat::Plain),
        (r#"{"output_format": "markdown"}"#, OutputFormat::Markdown),
        (r#"{"output_format": "html"}"#, OutputFormat::Html),
    ];
    for (config_str, expected_format) in expectations {
        let parsed = match serde_json::from_str::<ExtractionConfig>(config_str) {
            Ok(config) => config,
            Err(_) => panic!("Failed to deserialize {}", config_str),
        };
        assert_eq!(
            parsed.output_format, expected_format,
            "output_format should match expected value"
        );
    }
}
#[test]
fn test_cli_result_format_enum_parsing() {
    // Both result_format spellings must be accepted; the parsed value itself
    // is not inspected.
    let inputs = [
        r#"{"result_format": "unified"}"#,
        r#"{"result_format": "element_based"}"#,
    ];
    for config_str in inputs {
        let outcome: Result<ExtractionConfig, _> = serde_json::from_str(config_str);
        assert!(outcome.is_ok(), "Should deserialize result_format from {}", config_str);
    }
}
#[test]
fn test_cli_base64_encoded_config_simulation() {
    // The Engine trait provides `encode`/`decode` on the STANDARD engine.
    use base64::Engine;
    use base64::engine::general_purpose::STANDARD;

    // Simulates the --config-json-base64 flow: JSON text -> base64 -> JSON text -> config.
    let json_string = json!({
        "force_ocr": true,
        "output_format": "markdown"
    })
    .to_string();

    let encoded = STANDARD.encode(&json_string);

    // Decode the way the CLI would: base64 -> bytes -> UTF-8 string.
    let decoded_bytes = STANDARD.decode(&encoded).expect("Failed to decode base64");
    let decoded = String::from_utf8(decoded_bytes).expect("Failed to convert bytes to string");

    let config = serde_json::from_str::<ExtractionConfig>(&decoded).expect("Failed to deserialize base64-decoded config");
    assert!(config.force_ocr);
    assert_eq!(config.output_format, OutputFormat::Markdown);
}
#[test]
fn test_cli_partial_override_merging() {
    // Simulated CLI merge: serialize the defaults to JSON, overwrite the
    // overridden keys, then deserialize the merged object.
    let overrides = json!({"force_ocr": true, "use_cache": false});
    let mut merged_json =
        serde_json::to_value(&ExtractionConfig::default()).expect("Failed to serialize base config");

    // The merge only happens when both sides are JSON objects.
    if let (Some(base_obj), serde_json::Value::Object(override_obj)) = (merged_json.as_object_mut(), overrides) {
        override_obj.into_iter().for_each(|(key, value)| {
            base_obj.insert(key, value);
        });
    }

    let merged = serde_json::from_value::<ExtractionConfig>(merged_json).expect("Failed to deserialize merged config");
    assert!(merged.force_ocr, "Override should apply force_ocr");
    assert!(!merged.use_cache, "Override should apply use_cache");
    assert!(
        merged.enable_quality_processing,
        "Unoverridden field should retain default"
    );
}
#[test]
fn test_cli_invalid_json_error_handling() {
    // The input is well-formed JSON that carries a key the config does not
    // define. Whether that is rejected depends on the config's serde
    // attributes (deny_unknown_fields would reject it), so the test accepts
    // either outcome and only checks the known field when parsing succeeds.
    let invalid_json_str = r#"{"force_ocr": true, "invalid_field": "value"}"#;

    if let Ok(config) = serde_json::from_str::<ExtractionConfig>(invalid_json_str) {
        assert!(config.force_ocr);
    }
}
#[test]
fn test_cli_whitespace_handling_in_json() {
    // The same setting written with different whitespace layouts.
    let layouts = [
        r#"{"force_ocr":true}"#,     // compact
        r#"{ "force_ocr" : true }"#, // padded with spaces
        r#"{
"force_ocr": true
}"#, // spread over several lines
    ];
    for config_str in layouts {
        let parsed = match serde_json::from_str::<ExtractionConfig>(config_str) {
            Ok(config) => config,
            Err(_) => panic!("Failed to parse: {}", config_str),
        };
        assert!(parsed.force_ocr);
    }
}
#[test]
fn test_cli_numeric_boundary_values() {
    // Small, large, and zero values for max_concurrent_extractions, each
    // paired with the value the field must hold after parsing.
    let boundaries = [
        (r#"{"max_concurrent_extractions": 1}"#, Some(1)),
        (r#"{"max_concurrent_extractions": 256}"#, Some(256)),
        // Zero is accepted by the parser as-is.
        (r#"{"max_concurrent_extractions": 0}"#, Some(0)),
    ];
    for (config_str, expected_value) in boundaries {
        let parsed = match serde_json::from_str::<ExtractionConfig>(config_str) {
            Ok(config) => config,
            Err(_) => panic!("Failed to parse: {}", config_str),
        };
        assert_eq!(
            parsed.max_concurrent_extractions, expected_value,
            "Numeric values should be parsed correctly"
        );
    }
}
#[test]
fn test_cli_boolean_values_strict_parsing() {
    // JSON `true` and `false` must map onto the matching Rust bool.
    for expected_value in [true, false] {
        let config_str = if expected_value {
            r#"{"use_cache": true}"#
        } else {
            r#"{"use_cache": false}"#
        };
        let parsed = match serde_json::from_str::<ExtractionConfig>(config_str) {
            Ok(config) => config,
            Err(_) => panic!("Failed to parse: {}", config_str),
        };
        assert_eq!(parsed.use_cache, expected_value);
    }
}
#[test]
fn test_cli_config_consistency_across_formats() {
    // Start from a config built in code rather than parsed from text.
    let expected = ExtractionConfig {
        use_cache: false,
        enable_quality_processing: true,
        force_ocr: true,
        output_format: OutputFormat::Markdown,
        max_concurrent_extractions: Some(4),
        ..Default::default()
    };

    // config -> JSON value -> JSON text -> config, the last step simulating
    // what the CLI does with a JSON string.
    let json_string = serde_json::to_value(&expected)
        .expect("Failed to serialize")
        .to_string();
    let actual = serde_json::from_str::<ExtractionConfig>(&json_string).expect("Failed to deserialize from string");

    // Every explicitly set field must match after the trip.
    assert_eq!(actual.use_cache, expected.use_cache);
    assert_eq!(actual.enable_quality_processing, expected.enable_quality_processing);
    assert_eq!(actual.force_ocr, expected.force_ocr);
    assert_eq!(actual.output_format, expected.output_format);
    assert_eq!(actual.max_concurrent_extractions, expected.max_concurrent_extractions);
}
// Note: the base64 test brings `base64::Engine` into scope with a function-local `use`;
// no file-level re-export is needed.

View File

@@ -0,0 +1,603 @@
//! Comprehensive CLI end-to-end integration tests for configuration flags.
//!
//! This test suite validates the new configuration features including:
//! - `--config-json` for inline JSON configuration
//! - `--config-json-base64` for base64-encoded JSON configuration
//! - `--output-format` flag with all variants (plain, markdown, djot, html)
//! - Flag precedence (CLI args > JSON config > file > defaults)
//! - Config merge scenarios and conflict detection
//! - Error handling for invalid inputs
//! - Real extraction with new formats
#![allow(clippy::bool_assert_comparison)]
use std::path::PathBuf;
use std::process::Command;
use tempfile::TempDir;
/// Get the path to the kreuzberg binary.
///
/// Cargo sets `CARGO_BIN_EXE_kreuzberg` at compile time for integration tests
/// of a package that has a `kreuzberg` binary target; that path already
/// reflects the active profile and target directory, so it is used when
/// available. If the variable was not set at compile time, the previous
/// behaviour is kept: `<manifest dir>/../../target/debug/kreuzberg`.
fn get_binary_path() -> String {
    if let Some(exe) = option_env!("CARGO_BIN_EXE_kreuzberg") {
        return exe.to_string();
    }
    // Fallback: assumes the default target directory and a debug build.
    let manifest_dir = env!("CARGO_MANIFEST_DIR");
    format!("{}/../../target/debug/kreuzberg", manifest_dir)
}
/// Locate the `test_documents` directory, which sits two levels above this
/// crate's manifest directory.
///
/// Panics if the manifest directory has fewer than two ancestors.
fn get_test_documents_dir() -> PathBuf {
    let crate_dir = std::path::Path::new(env!("CARGO_MANIFEST_DIR"));
    // ancestors() yields the path itself first, so index 2 is the grandparent.
    let two_levels_up = crate_dir.ancestors().nth(2).unwrap();
    two_levels_up.join("test_documents")
}
/// Resolve `relative_path` against the `test_documents` directory and return
/// it as a `String` (lossily converted if the path is not valid UTF-8).
fn get_test_file(relative_path: &str) -> String {
    let absolute = get_test_documents_dir().join(relative_path);
    absolute.to_string_lossy().into_owned()
}
/// Build the `kreuzberg` binary, at most once per test process.
///
/// Every test in this file calls this helper first. Previously each call
/// spawned its own `cargo build`; the `Once` guard makes the first caller do
/// the build while later callers return immediately (callers that arrive
/// during the build block until it finishes).
///
/// # Panics
///
/// Panics if `cargo` cannot be spawned or the build fails. A panic inside
/// `call_once` poisons the guard, so every later call panics as well rather
/// than running tests against a missing binary.
fn build_binary() {
    static BUILD: std::sync::Once = std::sync::Once::new();
    BUILD.call_once(|| {
        let status = Command::new("cargo")
            .args(["build", "--bin", "kreuzberg"])
            .status()
            .expect("Failed to build kreuzberg binary");
        assert!(status.success(), "Failed to build kreuzberg binary");
    });
}
/// Write `content` to a file called `name` inside the temporary directory
/// `dir` and return the file's full path.
///
/// Panics if the file cannot be written.
fn create_test_config(dir: &TempDir, name: &str, content: &str) -> PathBuf {
    let destination = dir.path().join(name);
    std::fs::write(&destination, content).expect("Failed to write config file");
    destination
}
/// Encode `input` as standard-alphabet base64 with `=` padding.
///
/// Hand-rolled so the tests can build `--config-json-base64` arguments
/// without going through an encoder crate.
fn to_base64(input: &str) -> String {
    const ALPHABET: &[u8; 64] = b"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";

    let raw = input.as_bytes();
    let mut encoded = String::with_capacity((raw.len() + 2) / 3 * 4);

    // Each group of up to three bytes becomes four output characters.
    for group in raw.chunks(3) {
        let first = u32::from(group[0]);
        let second = group.get(1).copied().map_or(0, u32::from);
        let third = group.get(2).copied().map_or(0, u32::from);
        let packed = (first << 16) | (second << 8) | third;

        // Pick the 6-bit slice starting at `shift` and map it to its symbol.
        let symbol = |shift: u32| ALPHABET[((packed >> shift) & 0x3F) as usize] as char;

        encoded.push(symbol(18));
        encoded.push(symbol(12));
        // Positions that have no source byte behind them are padded.
        encoded.push(if group.len() > 1 { symbol(6) } else { '=' });
        encoded.push(if group.len() > 2 { symbol(0) } else { '=' });
    }
    encoded
}
// ============================================================================
// Test 1: --config-json inline flag with complex configuration
// ============================================================================
#[test]
fn test_cli_config_json_inline() {
    build_binary();
    let test_file = get_test_file("text/simple.txt");
    // The fixture is optional: without it the test is skipped, not failed.
    if !std::path::Path::new(&test_file).exists() {
        eprintln!("Skipping test: {} not found", test_file);
        return;
    }

    // The whole configuration is passed inline as one JSON argument.
    let inline_config = r#"{"use_cache": false, "chunking": {"max_chars": 512}}"#;
    let output = Command::new(get_binary_path())
        .arg("extract")
        .arg(test_file.as_str())
        .arg("--config-json")
        .arg(inline_config)
        .output()
        .expect("Failed to execute extract command with --config-json");

    assert!(
        output.status.success(),
        "Extract command with --config-json failed: {}",
        String::from_utf8_lossy(&output.stderr)
    );
    assert!(!output.stdout.is_empty(), "Output should not be empty");
}
// ============================================================================
// Test 2: --config-json-base64 flag for base64-encoded configuration
// ============================================================================
#[test]
fn test_cli_config_json_base64() {
    build_binary();
    let test_file = get_test_file("text/simple.txt");
    // The fixture is optional: without it the test is skipped, not failed.
    if !std::path::Path::new(&test_file).exists() {
        eprintln!("Skipping test: {} not found", test_file);
        return;
    }

    // The JSON config travels base64-encoded on the command line.
    let base64_config = to_base64(r#"{"use_cache": false}"#);
    let output = Command::new(get_binary_path())
        .arg("extract")
        .arg(test_file.as_str())
        .arg("--config-json-base64")
        .arg(base64_config.as_str())
        .output()
        .expect("Failed to execute extract command with --config-json-base64");

    assert!(
        output.status.success(),
        "Extract command with --config-json-base64 failed: {}",
        String::from_utf8_lossy(&output.stderr)
    );
    assert!(!output.stdout.is_empty(), "Output should not be empty");
}
// ============================================================================
// Test 3: Flag precedence verification (CLI flags > JSON > file > defaults)
// ============================================================================
#[test]
fn test_cli_flag_precedence() {
    build_binary();
    let test_file = get_test_file("text/simple.txt");
    // The fixture is optional: without it the test is skipped, not failed.
    if !std::path::Path::new(&test_file).exists() {
        eprintln!("Skipping test: {} not found", test_file);
        return;
    }

    // A config file that enables the cache ...
    let temp_dir = TempDir::new().expect("Failed to create temp directory");
    let config_content = r#"
use_cache = true
[chunking]
max_chars = 1024
"#;
    let config_path = create_test_config(&temp_dir, "config.toml", config_content);
    let config_arg = config_path.to_string_lossy();

    // ... combined with inline JSON that disables it. Only the exit status is
    // checked; the effective use_cache value is not inspected.
    let output = Command::new(get_binary_path())
        .arg("extract")
        .arg(test_file.as_str())
        .arg("--config")
        .arg(&*config_arg)
        .arg("--config-json")
        .arg(r#"{"use_cache": false}"#)
        .output()
        .expect("Failed to execute command with precedence test");

    assert!(
        output.status.success(),
        "Precedence test command failed: {}",
        String::from_utf8_lossy(&output.stderr)
    );
}
// ============================================================================
// Test 4: --output-format flag with all variants (plain, markdown, djot, html)
// ============================================================================
#[test]
fn test_cli_output_format_all_variants() {
    build_binary();
    let test_file = get_test_file("text/simple.txt");
    // The fixture is optional: without it the test is skipped, not failed.
    if !std::path::Path::new(&test_file).exists() {
        eprintln!("Skipping test: {} not found", test_file);
        return;
    }

    // Run one extraction per --output-format value; each must succeed and
    // write something to stdout.
    for format in ["plain", "markdown", "djot", "html"] {
        let output = Command::new(get_binary_path())
            .arg("extract")
            .arg(test_file.as_str())
            .arg("--output-format")
            .arg(format)
            .output()
            .unwrap_or_else(|_| panic!("Failed to execute extract with --output-format {}", format));

        assert!(
            output.status.success(),
            "Extract command with --output-format {} failed: {}",
            format,
            String::from_utf8_lossy(&output.stderr)
        );
        assert!(
            !output.stdout.is_empty(),
            "Output for format {} should not be empty",
            format
        );
    }
}
// ============================================================================
// Test 5: Output formats (text vs json) for extraction result
// ============================================================================
#[test]
fn test_cli_result_format() {
    build_binary();
    let test_file = get_test_file("text/simple.txt");
    // The fixture is optional: without it the test is skipped, not failed.
    if !std::path::Path::new(&test_file).exists() {
        eprintln!("Skipping test: {} not found", test_file);
        return;
    }

    // --format text: must succeed and print something.
    let output_text = Command::new(get_binary_path())
        .arg("extract")
        .arg(test_file.as_str())
        .arg("--format")
        .arg("text")
        .output()
        .expect("Failed to execute extract with --format text");
    assert!(
        output_text.status.success(),
        "Text format output failed: {}",
        String::from_utf8_lossy(&output_text.stderr)
    );
    assert!(!output_text.stdout.is_empty(), "Text output should not be empty");

    // --format json: must succeed and print parseable JSON.
    let output_json = Command::new(get_binary_path())
        .arg("extract")
        .arg(test_file.as_str())
        .arg("--format")
        .arg("json")
        .output()
        .expect("Failed to execute extract with --format json");
    assert!(
        output_json.status.success(),
        "JSON format output failed: {}",
        String::from_utf8_lossy(&output_json.stderr)
    );
    let json_content = String::from_utf8_lossy(&output_json.stdout);
    let value: serde_json::Value = match serde_json::from_str(&json_content) {
        Ok(value) => value,
        Err(_) => panic!("JSON output should be valid JSON, got: {}", json_content),
    };

    // The envelope wraps the extraction result and carries timing next to it.
    for key in ["result", "extraction_time_ms"] {
        assert!(value.get(key).is_some(), "JSON envelope should have '{}' field", key);
    }
    // Inside the wrapped result, content and mime_type must be present.
    for key in ["content", "mime_type"] {
        assert!(value["result"].get(key).is_some(), "result should have '{}' field", key);
    }
}
// ============================================================================
// Test 6: Deprecated --content-format flag warning
// ============================================================================
#[test]
fn test_cli_content_format_deprecated_warning() {
    build_binary();
    let test_file = get_test_file("text/simple.txt");
    // The fixture is optional: without it the test is skipped, not failed.
    if !std::path::Path::new(&test_file).exists() {
        eprintln!("Skipping test: {} not found", test_file);
        return;
    }

    // Invoke the deprecated --content-format flag.
    let output = Command::new(get_binary_path())
        .arg("extract")
        .arg(test_file.as_str())
        .arg("--content-format")
        .arg("plain")
        .output()
        .expect("Failed to execute extract with --content-format");

    // Deliberately lenient: the exact deprecation behaviour is left to the
    // implementation, so either a zero exit status or some stdout is enough.
    let produced_output = !output.stdout.is_empty();
    assert!(
        output.status.success() || produced_output,
        "Command should succeed or produce output"
    );
}
// ============================================================================
// Test 7: Config merge scenarios - multiple configuration sources
// ============================================================================
#[test]
fn test_cli_config_merge_scenarios() {
    build_binary();
    let test_file = get_test_file("text/simple.txt");
    // The fixture is optional: without it the test is skipped, not failed.
    if !std::path::Path::new(&test_file).exists() {
        eprintln!("Skipping test: {} not found", test_file);
        return;
    }

    // Base settings come from a TOML file in a temporary directory.
    let temp_dir = TempDir::new().expect("Failed to create temp directory");
    let config_content = r#"
use_cache = true
[chunking]
max_chars = 1024
"#;
    let config_path = create_test_config(&temp_dir, "base.toml", config_content);
    let config_arg = config_path.to_string_lossy();

    // File config plus inline JSON that touches one of the same keys; the
    // command must accept the combination.
    let output = Command::new(get_binary_path())
        .arg("extract")
        .arg(test_file.as_str())
        .arg("--config")
        .arg(&*config_arg)
        .arg("--config-json")
        .arg(r#"{"use_cache": false}"#)
        .output()
        .expect("Failed to merge configs");

    assert!(
        output.status.success(),
        "Config merge failed: {}",
        String::from_utf8_lossy(&output.stderr)
    );
}
// ============================================================================
// Test 8: Invalid JSON error handling
// ============================================================================
#[test]
fn test_cli_invalid_json_error() {
    build_binary();
    let test_file = get_test_file("text/simple.txt");
    // The fixture is optional: without it the test is skipped, not failed.
    if !std::path::Path::new(&test_file).exists() {
        eprintln!("Skipping test: {} not found", test_file);
        return;
    }

    // Truncated JSON: the string and the object are never closed.
    let malformed = r#"{"invalid json without closing"#;
    let output = Command::new(get_binary_path())
        .arg("extract")
        .arg(test_file.as_str())
        .arg("--config-json")
        .arg(malformed)
        .output()
        .expect("Failed to execute command");

    // The command must exit non-zero ...
    assert!(!output.status.success(), "Command should fail with invalid JSON");
    // ... and must say something on at least one of the two streams.
    let silent = output.stderr.is_empty() && output.stdout.is_empty();
    assert!(!silent, "Should provide feedback about invalid JSON");
}
// ============================================================================
// Test 9: Config flag conflicts
// ============================================================================
#[test]
fn test_cli_conflicts() {
    build_binary();
    let test_file = get_test_file("text/simple.txt");
    // The fixture is optional: without it the test is skipped, not failed.
    if !std::path::Path::new(&test_file).exists() {
        eprintln!("Skipping test: {} not found", test_file);
        return;
    }

    let temp_dir = TempDir::new().expect("Failed to create temp directory");
    let config_path = create_test_config(&temp_dir, "config.toml", "use_cache = true\n");
    let config_arg = config_path.to_string_lossy();

    // Supply all three config sources at once: file, inline JSON, and
    // base64-encoded JSON.
    let base64_config = to_base64(r#"{"use_cache": false}"#);
    let output = Command::new(get_binary_path())
        .arg("extract")
        .arg(test_file.as_str())
        .arg("--config")
        .arg(&*config_arg)
        .arg("--config-json")
        .arg(r#"{"chunking": {"max_chars": 512}}"#)
        .arg("--config-json-base64")
        .arg(base64_config.as_str())
        .output()
        .expect("Failed to execute command with potential conflicts");

    // Success (e.g. last flag wins) and a rejection (mutually exclusive flags)
    // are both acceptable; the test only requires that the process could be
    // run to completion, so the exit status is read and discarded.
    let _ = output.status.success();
}
// ============================================================================
// Test 10: Real end-to-end extraction with new config formats
// ============================================================================
#[test]
fn test_cli_real_extraction() {
    build_binary();
    let test_file = get_test_file("text/simple.txt");
    // The fixture is optional: without it the test is skipped, not failed.
    if !std::path::Path::new(&test_file).exists() {
        eprintln!("Skipping test: {} not found", test_file);
        return;
    }

    // One invocation combining --format, --output-format and --config-json.
    let output = Command::new(get_binary_path())
        .arg("extract")
        .arg(test_file.as_str())
        .args(["--format", "json"])
        .args(["--output-format", "markdown"])
        .arg("--config-json")
        .arg(r#"{"use_cache": false, "disable_ocr": true}"#)
        .output()
        .expect("Failed to execute full E2E extraction");
    assert!(
        output.status.success(),
        "E2E extraction failed: {}",
        String::from_utf8_lossy(&output.stderr)
    );

    // stdout must be a single parseable JSON document.
    let stdout = String::from_utf8_lossy(&output.stdout);
    let value: serde_json::Value = match serde_json::from_str(&stdout) {
        Ok(value) => value,
        Err(_) => panic!("E2E output should be valid JSON, got: {}", stdout),
    };

    // Envelope-level fields first, then the fields of the wrapped result.
    assert!(value.get("result").is_some(), "Missing 'result' envelope field");
    assert!(
        value.get("extraction_time_ms").is_some(),
        "Missing 'extraction_time_ms' field"
    );
    for key in ["content", "mime_type"] {
        assert!(value["result"].get(key).is_some(), "Missing {} field in result", key);
    }
}
// ============================================================================
// Additional Edge Cases and Robustness Tests
// ============================================================================
#[test]
fn test_cli_empty_config_json() {
    build_binary();
    let test_file = get_test_file("text/simple.txt");
    // The fixture is optional: without it the test is skipped, not failed.
    if !std::path::Path::new(&test_file).exists() {
        eprintln!("Skipping test: {} not found", test_file);
        return;
    }

    // An empty JSON object overrides nothing, so the run must still succeed.
    let output = Command::new(get_binary_path())
        .arg("extract")
        .arg(test_file.as_str())
        .arg("--config-json")
        .arg("{}")
        .output()
        .expect("Failed to execute with empty JSON config");

    let succeeded = output.status.success();
    assert!(succeeded, "Command with empty JSON config should succeed");
}
#[test]
fn test_cli_multiple_output_format_variants() {
    build_binary();
    let test_file = get_test_file("text/simple.txt");
    // The fixture is optional: without it the test is skipped, not failed.
    if !std::path::Path::new(&test_file).exists() {
        eprintln!("Skipping test: {} not found", test_file);
        return;
    }

    // Pass the format name in uppercase.
    let uppercase_format = "MARKDOWN";
    let output = Command::new(get_binary_path())
        .arg("extract")
        .arg(test_file.as_str())
        .arg("--output-format")
        .arg(uppercase_format)
        .output()
        .expect("Failed to execute");

    // Accepting the value and rejecting it are both fine; the test only
    // requires that the process could be run, so the status is discarded.
    let _ = output.status.success();
}
#[test]
fn test_cli_config_json_with_nested_objects() {
    build_binary();
    let test_file = get_test_file("text/simple.txt");
    // The fixture is optional: without it the test is skipped, not failed.
    if !std::path::Path::new(&test_file).exists() {
        eprintln!("Skipping test: {} not found", test_file);
        return;
    }

    // Multi-line JSON with two nested sections, passed as a single argument.
    let complex_config = r#"
{
"use_cache": false,
"chunking": {"max_chars": 512},
"language_detection": {
"enabled": true,
"confidence_threshold": 0.8
}
}
"#;
    let output = Command::new(get_binary_path())
        .arg("extract")
        .arg(test_file.as_str())
        .arg("--config-json")
        .arg(complex_config)
        .output()
        .expect("Failed to execute with nested JSON config");

    // Lenient check: either the run succeeds, or it reports something on stderr.
    let reported_error = !output.stderr.is_empty();
    assert!(
        output.status.success() || reported_error,
        "Complex config should either work or provide error"
    );
}

View File

@@ -0,0 +1,237 @@
//! Integration tests for the JSON timing envelope added to `kreuzberg extract` and
//! `kreuzberg batch`.
//!
//! Verifies:
//! - `extract --format json` emits `{ result, extraction_time_ms }` shape
//! - `batch --format json` emits `{ results, total_ms, per_file_ms }` shape
//! - `result.metadata.ocr_used` exists as a bool field
//! - `--pdf-backend xyz` exits non-zero and mentions "pdf-oxide"
use std::path::{Path, PathBuf};
use std::process::Command;
/// Returns the path to the compiled `kreuzberg` binary.
///
/// Cargo sets `CARGO_BIN_EXE_kreuzberg` at compile time for integration tests
/// of a package that has a `kreuzberg` binary target; that path already
/// reflects the active profile and target directory, so it is used when
/// available. If the variable was not set at compile time, the previous
/// behaviour is kept: `target/debug/kreuzberg` two levels above the crate
/// manifest directory.
fn kreuzberg_bin() -> PathBuf {
    if let Some(exe) = option_env!("CARGO_BIN_EXE_kreuzberg") {
        return PathBuf::from(exe);
    }
    // Fallback: assumes the default target directory and a debug build.
    let manifest_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
    manifest_dir
        .parent()
        .expect("crates/kreuzberg-cli parent")
        .parent()
        .expect("crates parent")
        .join("target")
        .join("debug")
        .join("kreuzberg")
}
/// Path of the reference PDF used by these tests:
/// `test_documents/pdf/pdfa_001.pdf`, two levels above the crate manifest
/// directory.
fn pdf_fixture() -> PathBuf {
    let crate_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
    let one_up = crate_dir.parent().expect("crates/kreuzberg-cli parent");
    let two_up = one_up.parent().expect("crates parent");

    let mut fixture = two_up.join("test_documents");
    fixture.push("pdf");
    fixture.push("pdfa_001.pdf");
    fixture
}
/// Path of the plain-text fixture used for batch tests:
/// `test_documents/text/fake_text.txt`, two levels above the crate manifest
/// directory.
fn txt_fixture() -> PathBuf {
    let crate_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
    let one_up = crate_dir.parent().expect("crates/kreuzberg-cli parent");
    let two_up = one_up.parent().expect("crates parent");

    let mut fixture = two_up.join("test_documents");
    fixture.push("text");
    fixture.push("fake_text.txt");
    fixture
}
/// Build the `kreuzberg` binary once per test process. Panics on failure.
///
/// Every test in this file calls this helper and the test harness runs them on
/// parallel threads, so the outcome of the first `cargo build` is memoized:
/// later callers reuse it instead of spawning another `cargo` that would only
/// queue up on the build-directory lock.
///
/// # Panics
///
/// Panics in every caller if `cargo` could not be spawned or the build exited
/// with a non-zero status.
fn build_binary() {
    static BUILD_OUTCOME: std::sync::OnceLock<Result<(), String>> = std::sync::OnceLock::new();

    let outcome = BUILD_OUTCOME.get_or_init(|| {
        let status = Command::new("cargo")
            .args(["build", "--bin", "kreuzberg"])
            .status()
            .map_err(|err| format!("cargo build invocation failed: {err:?}"))?;
        if status.success() {
            Ok(())
        } else {
            Err("cargo build failed — binary unavailable".to_string())
        }
    });

    // Re-raise the stored failure so each test reports the real cause.
    if let Err(message) = outcome {
        panic!("{message}");
    }
}
/// Skip-guard: reports whether `path` points at an existing regular file, so a
/// test can bail out early when its fixture is missing.
fn fixture_exists(path: &Path) -> bool {
    // `is_file` is already `false` for paths that do not exist.
    path.is_file()
}
// ── extract --format json envelope ──────────────────────────────────────────
/// `extract --format json` must print an envelope with a `result` object, a
/// positive numeric `extraction_time_ms`, and a boolean
/// `result.metadata.ocr_used` that is `false` for the native-text PDF fixture.
#[test]
fn test_extract_json_has_result_and_timing() {
    build_binary();

    let pdf = pdf_fixture();
    if !fixture_exists(&pdf) {
        eprintln!("SKIP: PDF fixture not found at {}", pdf.display());
        return;
    }

    let pdf_arg = pdf.to_string_lossy();
    let output = Command::new(kreuzberg_bin())
        .arg("extract")
        .arg(&*pdf_arg)
        .args(["--format", "json"])
        .output()
        .expect("failed to run kreuzberg extract");

    let stderr_text = String::from_utf8_lossy(&output.stderr);
    assert!(output.status.success(), "extract exited non-zero: {}", stderr_text);

    let envelope: serde_json::Value =
        serde_json::from_slice(&output.stdout).expect("stdout is not valid JSON");

    // Envelope shape
    assert!(envelope.get("result").is_some(), "missing 'result' key in envelope");
    let extraction_time_ms = envelope
        .get("extraction_time_ms")
        .and_then(serde_json::Value::as_f64)
        .expect("'extraction_time_ms' must be a number");
    assert!(
        extraction_time_ms > 0.0,
        "extraction_time_ms must be positive, got {extraction_time_ms}"
    );

    // ocr_used must be present and must be a JSON boolean.
    let metadata = &envelope["result"]["metadata"];
    let ocr_used = match metadata.get("ocr_used") {
        None => panic!("'result.metadata.ocr_used' must be present"),
        Some(flag) => flag
            .as_bool()
            .expect("'result.metadata.ocr_used' must be a boolean"),
    };

    // For a native-text PDF without --force-ocr, OCR should NOT have run.
    assert!(!ocr_used, "expected ocr_used=false for native PDF extraction");
}
// ── batch --format json envelope ─────────────────────────────────────────────
/// `batch --format json` over two files must print an envelope with a
/// two-element `results` array, a positive `total_ms`, a two-element
/// `per_file_ms` array of positive numbers, and a boolean `metadata.ocr_used`
/// on every result.
#[test]
fn test_batch_json_has_results_and_timing() {
    build_binary();

    let inputs = [pdf_fixture(), txt_fixture()];
    if inputs.iter().any(|fixture| !fixture_exists(fixture)) {
        eprintln!("SKIP: one or more batch fixtures not found");
        return;
    }

    let mut cli = Command::new(kreuzberg_bin());
    cli.arg("batch");
    for fixture in &inputs {
        cli.arg(&*fixture.to_string_lossy());
    }
    cli.args(["--format", "json"]);
    let output = cli.output().expect("failed to run kreuzberg batch");

    let stderr_text = String::from_utf8_lossy(&output.stderr);
    assert!(output.status.success(), "batch exited non-zero: {}", stderr_text);

    let envelope: serde_json::Value =
        serde_json::from_slice(&output.stdout).expect("stdout is not valid JSON");

    // Envelope shape
    let results = envelope
        .get("results")
        .and_then(serde_json::Value::as_array)
        .expect("'results' must be an array");
    assert_eq!(results.len(), 2, "expected 2 results for 2 input files");

    let total_ms = envelope
        .get("total_ms")
        .and_then(serde_json::Value::as_f64)
        .expect("'total_ms' must be a number");
    assert!(total_ms > 0.0, "total_ms must be positive, got {total_ms}");

    let per_file_ms = envelope
        .get("per_file_ms")
        .and_then(serde_json::Value::as_array)
        .expect("'per_file_ms' must be an array");
    assert_eq!(per_file_ms.len(), 2, "per_file_ms must have one entry per file");
    for (i, entry) in per_file_ms.iter().enumerate() {
        let ms = entry.as_f64().expect("per_file_ms entry must be a number");
        assert!(ms > 0.0, "per_file_ms[{i}] must be positive, got {ms}");
    }

    // Each result must have metadata.ocr_used as a bool
    for (i, result) in results.iter().enumerate() {
        let flag = result["metadata"].get("ocr_used");
        assert!(
            matches!(flag, Some(serde_json::Value::Bool(_))),
            "results[{i}].metadata.ocr_used must be a bool"
        );
    }
}
// ── --pdf-backend validation ─────────────────────────────────────────────────
/// An unrecognised `--pdf-backend` value must make `extract` exit non-zero with
/// an error on stderr that mentions `pdf-oxide`.
#[test]
fn test_pdf_backend_invalid_value_exits_nonzero() {
    build_binary();

    let pdf = pdf_fixture();
    if !fixture_exists(&pdf) {
        eprintln!("SKIP: PDF fixture not found at {}", pdf.display());
        return;
    }

    let pdf_arg = pdf.to_string_lossy();
    let mut cli = Command::new(kreuzberg_bin());
    cli.arg("extract").arg(&*pdf_arg).arg("--pdf-backend").arg("xyz");
    let output = cli.output().expect("failed to run kreuzberg extract");

    let exited_ok = output.status.success();
    assert!(!exited_ok, "expected non-zero exit for unknown --pdf-backend");

    let stderr = String::from_utf8_lossy(&output.stderr);
    let mentions_backend = stderr.contains("pdf-oxide");
    assert!(
        mentions_backend,
        "error message should mention 'pdf-oxide', got: {stderr}"
    );
}
/// `--pdf-backend pdf-oxide` must be accepted, and the JSON output must still
/// carry both envelope keys (`result` and `extraction_time_ms`).
#[test]
fn test_pdf_backend_valid_value_succeeds() {
    build_binary();

    let pdf = pdf_fixture();
    if !fixture_exists(&pdf) {
        eprintln!("SKIP: PDF fixture not found at {}", pdf.display());
        return;
    }

    let pdf_arg = pdf.to_string_lossy();
    let output = Command::new(kreuzberg_bin())
        .arg("extract")
        .arg(&*pdf_arg)
        .args(["--pdf-backend", "pdf-oxide"])
        .args(["--format", "json"])
        .output()
        .expect("failed to run kreuzberg extract");

    let stderr_text = String::from_utf8_lossy(&output.stderr);
    assert!(
        output.status.success(),
        "--pdf-backend pdf-oxide should succeed: {}",
        stderr_text
    );

    let envelope: serde_json::Value =
        serde_json::from_slice(&output.stdout).expect("stdout is not valid JSON");
    for (key, complaint) in [
        ("result", "missing 'result' key"),
        ("extraction_time_ms", "missing 'extraction_time_ms'"),
    ] {
        assert!(envelope.get(key).is_some(), "{}", complaint);
    }
}

View File

@@ -0,0 +1,153 @@
//! Integration tests for server commands (serve and mcp).
#[cfg(not(coverage))]
use std::process::{Command, Stdio};
#[cfg(not(coverage))]
use std::thread;
#[cfg(not(coverage))]
use std::time::Duration;
/// Builds the binary, starts `kreuzberg serve` on 127.0.0.1:18000 and checks
/// the `/health` and `/info` endpoints.
///
/// The server process is owned by a drop guard, so it is killed and reaped
/// even when one of the assertions below panics; previously a failed check
/// left the server running and holding the port.
#[cfg(not(coverage))]
#[test]
#[ignore]
fn test_serve_command_starts() {
    /// Kills and reaps the wrapped child process on drop (best effort).
    struct ServerGuard(std::process::Child);

    impl Drop for ServerGuard {
        fn drop(&mut self) {
            // Errors are ignored: the process may already have been reaped.
            let _ = self.0.kill();
            let _ = self.0.wait();
        }
    }

    let status = Command::new("cargo")
        .args(["build", "--bin", "kreuzberg", "--features", "all"])
        .status()
        .expect("Failed to build binary");
    assert!(status.success(), "Failed to build kreuzberg binary");

    let mut server = ServerGuard(
        Command::new("./target/debug/kreuzberg")
            .args(["serve", "-H", "127.0.0.1", "-p", "18000"])
            .stdout(Stdio::piped())
            .stderr(Stdio::piped())
            .spawn()
            .expect("Failed to start server"),
    );

    // Fixed grace period for the server to start listening.
    thread::sleep(Duration::from_secs(3));

    let mut health_response = ureq::get("http://127.0.0.1:18000/health")
        .call()
        .expect("Failed to call health endpoint");
    assert_eq!(health_response.status(), 200);
    let health_json: serde_json::Value = health_response
        .body_mut()
        .read_json()
        .expect("Failed to parse health response");
    assert_eq!(health_json["status"], "healthy");
    assert!(health_json["version"].is_string());

    let mut info_response = ureq::get("http://127.0.0.1:18000/info")
        .call()
        .expect("Failed to call info endpoint");
    assert_eq!(info_response.status(), 200);
    let info_json: serde_json::Value = info_response
        .body_mut()
        .read_json()
        .expect("Failed to parse info response");
    assert!(info_json["rust_backend"].as_bool().unwrap_or(false));

    // On the success path keep the original strict shutdown checks; the guard's
    // later drop is then a harmless no-op.
    server.0.kill().expect("Failed to kill server");
    server.0.wait().expect("Failed to wait for server");
}
/// Starts `kreuzberg serve` on 127.0.0.1:18001 with a TOML config written to
/// `test_config.toml` in the current directory and checks that `/health`
/// answers.
///
/// Teardown (kill, reap, delete the config file) runs before any assertion is
/// evaluated, so a failing check no longer leaks the server process or the
/// temporary config file.
#[cfg(not(coverage))]
#[test]
#[ignore]
fn test_serve_command_with_config() {
    use std::fs;

    let config_content = r#"
use_cache = true
enable_quality_processing = true
[ocr]
backend = "tesseract"
language = "eng"
"#;
    fs::write("test_config.toml", config_content).expect("Failed to write test config");

    let spawned = Command::new("./target/debug/kreuzberg")
        .args(["serve", "-H", "127.0.0.1", "-p", "18001", "-c", "test_config.toml"])
        .stdout(Stdio::piped())
        .stderr(Stdio::piped())
        .spawn();
    let mut child = match spawned {
        Ok(child) => child,
        Err(err) => {
            // Do not leave the config file behind when the server cannot start.
            fs::remove_file("test_config.toml").ok();
            panic!("Failed to start server: {err:?}");
        }
    };

    // Fixed grace period for the server to start listening.
    thread::sleep(Duration::from_secs(3));
    let health_response = ureq::get("http://127.0.0.1:18001/health").call();

    // Tear everything down first, then judge the results.
    let killed = child.kill();
    let reaped = child.wait();
    fs::remove_file("test_config.toml").ok();

    assert!(health_response.is_ok(), "Server should be running with custom config");
    killed.expect("Failed to kill server");
    reaped.expect("Failed to wait for server");
}
/// Builds the binary and checks that `kreuzberg serve --help` succeeds and
/// documents the `--host`, `--port` and `--config` options.
#[cfg(not(coverage))]
#[test]
fn test_serve_command_help() {
    let build_status = Command::new("cargo")
        .args(["build", "--bin", "kreuzberg", "--features", "all"])
        .status()
        .expect("Failed to build binary");
    assert!(build_status.success(), "Failed to build kreuzberg binary");

    // CARGO_TARGET_TMPDIR is a directory inside the target directory, so its
    // parent is the target directory itself. Using the parent instead of
    // splitting on the substring "target" also works when the checkout path
    // contains "target" earlier on or the target directory has a custom name.
    let target_dir = std::path::Path::new(env!("CARGO_TARGET_TMPDIR"))
        .parent()
        .expect("CARGO_TARGET_TMPDIR should have a parent directory");
    let binary_path = target_dir.join("debug").join("kreuzberg");

    let output = Command::new(&binary_path)
        .args(["serve", "--help"])
        .output()
        .expect("Failed to execute command");
    assert!(output.status.success());

    let stdout = String::from_utf8_lossy(&output.stdout);
    assert!(stdout.contains("Start the API server"));
    assert!(stdout.contains("--host"));
    assert!(stdout.contains("--port"));
    assert!(stdout.contains("--config"));
}
/// Builds the binary and checks that `kreuzberg mcp --help` succeeds and
/// documents the `--config` option.
#[cfg(not(coverage))]
#[test]
fn test_mcp_command_help() {
    let build_status = Command::new("cargo")
        .args(["build", "--bin", "kreuzberg", "--features", "all"])
        .status()
        .expect("Failed to build binary");
    assert!(build_status.success(), "Failed to build kreuzberg binary");

    // CARGO_TARGET_TMPDIR is a directory inside the target directory, so its
    // parent is the target directory itself. Using the parent instead of
    // splitting on the substring "target" also works when the checkout path
    // contains "target" earlier on or the target directory has a custom name.
    let target_dir = std::path::Path::new(env!("CARGO_TARGET_TMPDIR"))
        .parent()
        .expect("CARGO_TARGET_TMPDIR should have a parent directory");
    let binary_path = target_dir.join("debug").join("kreuzberg");

    let output = Command::new(&binary_path)
        .args(["mcp", "--help"])
        .output()
        .expect("Failed to execute command");
    assert!(output.status.success());

    let stdout = String::from_utf8_lossy(&output.stdout);
    assert!(stdout.contains("Start the MCP (Model Context Protocol) server"));
    assert!(stdout.contains("--config"));
}

46
crates/kreuzberg-ffi/Cargo.toml generated Normal file
View File

@@ -0,0 +1,46 @@
[package]
name = "kreuzberg-ffi"
version = "5.0.0-rc.3"
edition = "2021"
license = "Elastic-2.0"
description = "High-performance document intelligence library"
readme = false
keywords = ["document", "extraction", "ocr", "pdf", "text"]
categories = ["text-processing"]
repository = "https://github.com/kreuzberg-dev/kreuzberg"
# `serde_json`, `ahash`, and `tokio` are emitted unconditionally in `[dependencies]` below so the
# manifest is stable across regens (and so the C FFI codegen can pull them in
# when an async / Result-typed function appears in the API surface), but for
# umbrella crates with no async fns and no JSON-marshalled return types they
# are genuinely unused. The conditional `async-trait` / `futures-util` deps
# are similarly flagged when the umbrella has trait-bridge / streaming adapters
# configured but no actual async-trait / async-stream callsite in the generated
# FFI shim.
[package.metadata.cargo-machete]
ignored = ["ahash", "serde_json", "tokio", "async-trait"]
[lib]
crate-type = ["cdylib", "staticlib", "rlib"]
[features]
default = []
[dependencies]
ahash = "0.8"
async-trait = "0.1"
serde_json = "1"
tokio = { version = "1", features = ["full"] }
[target.'cfg(not(all(target_os = "android", target_arch = "x86_64")))'.dependencies]
kreuzberg = { path = "../kreuzberg", version = "5.0.0-rc.3", features = ["full", "pdf", "ocr", "paddle-ocr", "paddle-ocr-types", "layout-detection", "layout-types", "embeddings", "embedding-presets", "chunking", "keywords-yake", "keywords-rake", "language-detection", "html", "tree-sitter", "office", "email", "archives", "stopwords", "auto-rotate", "auto-rotate-types", "tokio-runtime", "api", "mcp", "liter-llm", "quality"] }
[target.'cfg(all(target_os = "android", target_arch = "x86_64"))'.dependencies]
kreuzberg = { path = "../kreuzberg", version = "5.0.0-rc.3", features = ["android-target"] }
[build-dependencies]
cbindgen = "0.29"
[dev-dependencies]
tempfile = "3"

295
crates/kreuzberg-ffi/README.md generated Normal file
View File

@@ -0,0 +1,295 @@
# FFI (C/C++)
<div align="center" style="display: flex; flex-wrap: wrap; gap: 8px; justify-content: center; margin: 20px 0;">
<a href="https://github.com/kreuzberg-dev/alef">
<img src="https://img.shields.io/badge/Bindings-alef%20%D7%90-007ec6" alt="Bindings">
</a>
<!-- Language Bindings -->
<a href="https://crates.io/crates/kreuzberg">
<img src="https://img.shields.io/crates/v/kreuzberg?label=Rust&color=007ec6" alt="Rust">
</a>
<a href="https://pypi.org/project/kreuzberg/">
<img src="https://img.shields.io/pypi/v/kreuzberg?label=Python&color=007ec6" alt="Python">
</a>
<a href="https://www.npmjs.com/package/@kreuzberg/node">
<img src="https://img.shields.io/npm/v/@kreuzberg/node?label=Node.js&color=007ec6" alt="Node.js">
</a>
<a href="https://www.npmjs.com/package/@kreuzberg/wasm">
<img src="https://img.shields.io/npm/v/@kreuzberg/wasm?label=WASM&color=007ec6" alt="WASM">
</a>
<a href="https://central.sonatype.com/artifact/dev.kreuzberg/kreuzberg">
<img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
</a>
<a href="https://github.com/kreuzberg-dev/kreuzberg/tree/main/packages/go/v5">
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v5*" alt="Go">
</a>
<a href="https://www.nuget.org/packages/Kreuzberg/">
<img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
</a>
<a href="https://packagist.org/packages/kreuzberg/kreuzberg">
<img src="https://img.shields.io/packagist/v/kreuzberg/kreuzberg?label=PHP&color=007ec6" alt="PHP">
</a>
<a href="https://rubygems.org/gems/kreuzberg">
<img src="https://img.shields.io/gem/v/kreuzberg?label=Ruby&color=007ec6" alt="Ruby">
</a>
<a href="https://hex.pm/packages/kreuzberg">
<img src="https://img.shields.io/hexpm/v/kreuzberg?label=Elixir&color=007ec6" alt="Elixir">
</a>
<a href="https://kreuzberg-dev.r-universe.dev/kreuzberg">
<img src="https://img.shields.io/badge/R-kreuzberg-007ec6" alt="R">
</a>
<a href="https://pub.dev/packages/kreuzberg">
<img src="https://img.shields.io/pub/v/kreuzberg?label=Dart&color=007ec6" alt="Dart">
</a>
<a href="https://central.sonatype.com/artifact/dev.kreuzberg/kreuzberg-android">
<img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg-android?label=Kotlin&color=007ec6" alt="Kotlin">
</a>
<a href="https://github.com/kreuzberg-dev/kreuzberg/tree/main/packages/swift">
<img src="https://img.shields.io/badge/Swift-SPM-007ec6" alt="Swift">
</a>
<a href="https://github.com/kreuzberg-dev/kreuzberg/tree/main/packages/zig">
<img src="https://img.shields.io/badge/Zig-package-007ec6" alt="Zig">
</a>
<a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
<img src="https://img.shields.io/badge/C-FFI-007ec6" alt="C FFI">
</a>
<a href="https://github.com/kreuzberg-dev/kreuzberg/pkgs/container/kreuzberg">
<img src="https://img.shields.io/badge/Docker-ghcr.io-007ec6?logo=docker&logoColor=white" alt="Docker">
</a>
<a href="https://github.com/kreuzberg-dev/kreuzberg/pkgs/container/charts%2Fkreuzberg">
<img src="https://img.shields.io/badge/Helm-ghcr.io-007ec6?logo=helm&logoColor=white" alt="Helm">
</a>
<!-- Project Info -->
<a href="https://github.com/kreuzberg-dev/kreuzberg/blob/main/LICENSE">
<img src="https://img.shields.io/badge/License-Elastic--2.0-007ec6" alt="License">
</a>
<a href="https://docs.kreuzberg.dev">
<img src="https://img.shields.io/badge/Docs-kreuzberg-007ec6" alt="Documentation">
</a>
<a href="https://huggingface.co/Kreuzberg">
<img src="https://img.shields.io/badge/Hugging%20Face-Kreuzberg-007ec6" alt="Hugging Face">
</a>
</div>
<div align="center" style="margin: 24px 0 0;">
<a href="https://kreuzberg.dev">
<img alt="Kreuzberg" src="https://github.com/user-attachments/assets/419fc06c-8313-4324-b159-4b4d3cfce5c0" />
</a>
</div>
<div align="center" style="display: flex; flex-wrap: wrap; gap: 12px; justify-content: center; margin: 28px 0 24px;">
<a href="https://discord.gg/xt9WY3GnKR">
<img height="22" src="https://img.shields.io/badge/Discord-Chat-007ec6?logo=discord&logoColor=white" alt="Join Discord">
</a>
<a href="https://docs.kreuzberg.dev/demo.html">
<img height="22" src="https://img.shields.io/badge/Live%20Demo-Open-007ec6?logo=webassembly&logoColor=white" alt="Live Demo">
</a>
</div>
Extract text, tables, images, and metadata from 90+ file formats (including PDF, Office documents, and images) and from source code in 300+ programming languages. These C/C++ FFI bindings provide a stable ABI for native integration, shared library distribution, and cross-language interop.
## What This Package Provides
- **Document intelligence core** — extract text, tables, images, metadata, entities, keywords, and code intelligence from one API.
- **Format coverage** — PDF, Office, images, HTML/XML, email, archives, notebooks, citations, scientific formats, and plain text.
- **OCR choices** — Tesseract, PaddleOCR, EasyOCR where supported, VLM OCR through liter-llm, and plugin hooks for custom backends.
- **Same engine as every binding** — Rust, Python, Node.js, Go, Java, PHP, Ruby, .NET, Elixir, R, WASM, Kotlin Android, Swift, Dart, Zig, and C FFI share the same Rust implementation.
- **C ABI** — stable shared library surface for custom hosts and secondary bindings.
## Installation
### Package Installation
Build the shared library from the workspace:
```bash
cargo build --release -p kreuzberg-ffi
```
The built artifacts are emitted under `target/release/` (`libkreuzberg_ffi.{so,dylib,a}`) along with the C header at `crates/kreuzberg-ffi/include/kreuzberg.h`.
### System Requirements
- A C/C++ toolchain (clang, gcc, or MSVC) and a Rust toolchain (`rustup`) for building from source
- A `pkg-config` or CMake-aware build system that can locate `libkreuzberg_ffi` and `kreuzberg.h`
- Optional: [ONNX Runtime](https://github.com/microsoft/onnxruntime/releases) version 1.22.x for embeddings support
- Optional: [Tesseract OCR](https://github.com/tesseract-ocr/tesseract) for OCR functionality
## Quick Start
### Basic Extraction
Extract text, metadata, and structure from any supported document format:
<!-- snippet not found: -->
### Common Use Cases
#### Extract with Custom Configuration
Most use cases benefit from configuration to control extraction behavior:
#### Table Extraction
See [Configuration Guide](https://docs.kreuzberg.dev/guides/configuration/) for table extraction options.
#### Processing Multiple Files
### Next Steps
- **[Installation Guide](https://docs.kreuzberg.dev/getting-started/installation/)** - Platform-specific setup
- **[API Documentation](https://docs.kreuzberg.dev/reference/api-python/)** - Complete API reference
- **[Examples & Guides](https://docs.kreuzberg.dev/)** - Full code examples and usage guides
- **[Configuration Guide](https://docs.kreuzberg.dev/guides/configuration/)** - Advanced configuration options
## Features
### Supported File Formats (90+)
90+ file formats across 8 major categories with intelligent format detection and comprehensive metadata extraction.
#### Office Documents
| Category | Formats | Capabilities |
|----------|---------|--------------|
| **Word Processing** | `.docx`, `.docm`, `.dotx`, `.dotm`, `.dot`, `.odt` | Full text, tables, images, metadata, styles |
| **Spreadsheets** | `.xlsx`, `.xlsm`, `.xlsb`, `.xls`, `.xla`, `.xlam`, `.xltm`, `.xltx`, `.xlt`, `.ods` | Sheet data, formulas, cell metadata, charts |
| **Presentations** | `.pptx`, `.pptm`, `.ppsx`, `.potx`, `.potm`, `.pot`, `.ppt` | Slides, speaker notes, images, metadata |
| **PDF** | `.pdf` | Text, tables, images, metadata, OCR support |
| **eBooks** | `.epub`, `.fb2` | Chapters, metadata, embedded resources |
| **Database** | `.dbf` | Table data extraction, field type support |
| **Hangul** | `.hwp`, `.hwpx` | Korean document format, text extraction |
#### Images (OCR-Enabled)
| Category | Formats | Features |
|----------|---------|----------|
| **Raster** | `.png`, `.jpg`, `.jpeg`, `.gif`, `.webp`, `.bmp`, `.tiff`, `.tif` | OCR, table detection, EXIF metadata, dimensions, color space |
| **Advanced** | `.jp2`, `.jpx`, `.jpm`, `.mj2`, `.jbig2`, `.jb2`, `.pnm`, `.pbm`, `.pgm`, `.ppm` | OCR via hayro-jpeg2000 (pure Rust decoder), JBIG2 support, table detection, format-specific metadata |
| **Vector** | `.svg` | DOM parsing, embedded text, graphics metadata |
#### Web & Data
| Category | Formats | Features |
|----------|---------|----------|
| **Markup** | `.html`, `.htm`, `.xhtml`, `.xml`, `.svg` | DOM parsing, metadata (Open Graph, Twitter Card), link extraction |
| **Structured Data** | `.json`, `.yaml`, `.yml`, `.toml`, `.csv`, `.tsv` | Schema detection, nested structures, validation |
| **Text & Markdown** | `.txt`, `.md`, `.markdown`, `.djot`, `.rst`, `.org`, `.rtf` | CommonMark, GFM, Djot, reStructuredText, Org Mode |
#### Email & Archives
| Category | Formats | Features |
|----------|---------|----------|
| **Email** | `.eml`, `.msg` | Headers, body (HTML/plain), attachments, threading |
| **Archives** | `.zip`, `.tar`, `.tgz`, `.gz`, `.7z` | File listing, nested archives, metadata |
#### Academic & Scientific
| Category | Formats | Features |
|----------|---------|----------|
| **Citations** | `.bib`, `.biblatex`, `.ris`, `.nbib`, `.enw`, `.csl` | Structured parsing: RIS (structured), PubMed/MEDLINE, EndNote XML (structured), BibTeX, CSL JSON |
| **Scientific** | `.tex`, `.latex`, `.typst`, `.jats`, `.ipynb`, `.docbook` | LaTeX, Jupyter notebooks, PubMed JATS |
| **Documentation** | `.opml`, `.pod`, `.mdoc`, `.troff` | Technical documentation formats |
#### Code Intelligence (300+ Languages)
| Feature | Description |
|---------|-------------|
| **Structure Extraction** | Functions, classes, methods, structs, interfaces, enums |
| **Import/Export Analysis** | Module dependencies, re-exports, wildcard imports |
| **Symbol Extraction** | Variables, constants, type aliases, properties |
| **Docstring Parsing** | Google, NumPy, Sphinx, JSDoc, RustDoc, and 10+ formats |
| **Diagnostics** | Parse errors with line/column positions |
| **Syntax-Aware Chunking** | Split code by semantic boundaries, not arbitrary byte offsets |
Powered by [tree-sitter-language-pack](https://github.com/kreuzberg-dev/tree-sitter-language-pack) — [documentation](https://docs.tree-sitter-language-pack.kreuzberg.dev).
**[Complete Format Reference](https://docs.kreuzberg.dev/reference/formats/)**
### Key Capabilities
- **Text Extraction** - Extract all text content with position and formatting information
- **Metadata Extraction** - Retrieve document properties, creation date, author, etc.
- **Table Extraction** - Parse tables with structure and cell content preservation
- **Image Extraction** - Extract embedded images and render page previews
- **OCR Support** - Integrate multiple OCR backends for scanned documents
- **Async/Await** - Non-blocking document processing with concurrent operations
- **Plugin System** - Extensible post-processing for custom text transformation
- **Embeddings** - Generate vector embeddings using ONNX Runtime models
- **Batch Processing** - Efficiently process multiple documents in parallel
- **Memory Efficient** - Stream large files without loading entirely into memory
- **Language Detection** - Detect and support multiple languages in documents
- **Code Intelligence** - Extract structure, imports, exports, symbols, and docstrings from [300+ programming languages](https://docs.tree-sitter-language-pack.kreuzberg.dev) via tree-sitter
- **Configuration** - Fine-grained control over extraction behavior
### Performance Characteristics
| Format | Speed | Memory | Notes |
|--------|-------|--------|-------|
| **PDF (text)** | 10-100 MB/s | ~50MB per doc | Fastest extraction |
| **Office docs** | 20-200 MB/s | ~100MB per doc | DOCX, XLSX, PPTX |
| **Images (OCR)** | 1-5 MB/s | Variable | Depends on OCR backend |
| **Archives** | 5-50 MB/s | ~200MB per doc | ZIP, TAR, etc. |
| **Web formats** | 50-200 MB/s | Streaming | HTML, XML, JSON |
## OCR Support
Kreuzberg supports multiple OCR backends for extracting text from scanned documents and images:
### OCR Configuration Example
<!-- snippet not found: -->
## Async Support
This binding provides full async/await support for non-blocking document processing:
<!-- snippet not found: -->
## Plugin System
Kreuzberg supports extensible post-processing plugins for custom text transformation and filtering.
For detailed plugin documentation, visit [Plugin System Guide](https://docs.kreuzberg.dev/guides/plugins/).
## Embeddings Support
Generate vector embeddings for extracted text using the built-in ONNX Runtime support. Requires ONNX Runtime installation.
**[Embeddings Guide](https://docs.kreuzberg.dev/features/#embeddings)**
## Configuration
For advanced configuration options including language detection, table extraction, OCR settings, and more:
**[Configuration Guide](https://docs.kreuzberg.dev/guides/configuration/)**
## Documentation
- **[Official Documentation](https://docs.kreuzberg.dev/)**
- **[API Reference](https://docs.kreuzberg.dev/reference/api-python/)**
- **[Examples & Guides](https://docs.kreuzberg.dev/)**
## Contributing
Contributions are welcome! See [Contributing Guide](https://github.com/kreuzberg-dev/kreuzberg/blob/main/CONTRIBUTING.md).
## Part of Kreuzberg.dev
- [Kreuzberg Cloud](https://github.com/kreuzberg-dev/kreuzberg-cloud) — managed extraction API with SDKs, dashboards, and observability.
- [kreuzcrawl](https://github.com/kreuzberg-dev/kreuzcrawl) — web crawling and scraping with HTML→Markdown and headless-Chrome fallback.
- [html-to-markdown](https://github.com/kreuzberg-dev/html-to-markdown) — fast, lossless HTML→Markdown engine.
- [liter-llm](https://github.com/kreuzberg-dev/liter-llm) — universal LLM API client with native bindings for 14 languages and 143 providers.
- [tree-sitter-language-pack](https://github.com/kreuzberg-dev/tree-sitter-language-pack) — tree-sitter grammars and code-intelligence primitives.
- [alef](https://github.com/kreuzberg-dev/alef) — the polyglot binding generator that produces this README and all per-language bindings.
- [Discord](https://discord.gg/xt9WY3GnKR) — community, roadmap, announcements.
## License
Elastic-2.0 License — see [LICENSE](../../LICENSE) for details.
## Support
- **Discord Community**: [Join our Discord](https://discord.gg/xt9WY3GnKR)
- **GitHub Issues**: [Report bugs](https://github.com/kreuzberg-dev/kreuzberg/issues)
- **Discussions**: [Ask questions](https://github.com/kreuzberg-dev/kreuzberg/discussions)

23
crates/kreuzberg-ffi/build.rs generated Normal file
View File

@@ -0,0 +1,23 @@
// This file is auto-generated by alef. DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
fn main() {
    // Generate the C header for this crate and write it to `include/kreuzberg.h`
    // (relative to the build script's working directory).
    let manifest_dir = std::env::var("CARGO_MANIFEST_DIR").unwrap();
    let bindings = cbindgen::generate(manifest_dir).expect("Unable to generate C bindings");
    bindings.write_to_file("include/kreuzberg.h");

    // Set @rpath-relative install_name on macOS so the cdylib can be relocated
    // (bundled into language packages like packages/go/.lib/<rid>/, packages/
    // java/src/main/resources/natives/<rid>/, etc.) and located via the consumer
    // binary's rpath at runtime. Without this, the install_name embeds the CI
    // runner build path (`/Users/runner/work/.../target/.../deps/lib<name>.dylib`)
    // and dyld fails to load the bundled copy from its actual location.
    let targets_macos = matches!(std::env::var("CARGO_CFG_TARGET_OS").as_deref(), Ok("macos"));
    if targets_macos {
        println!("cargo:rustc-link-arg-cdylib=-Wl,-install_name,@rpath/libkreuzberg_ffi.dylib");
    }

    // Mirror the freshly generated header into the Go package's include directory.
    let go_include_dir = std::path::Path::new("../../../packages/go/v5/include");
    let go_header = go_include_dir.join("kreuzberg.h");
    std::fs::create_dir_all(go_include_dir).expect("Unable to create Go include directory");
    std::fs::copy("include/kreuzberg.h", go_header).expect("Unable to copy header to Go include directory");
}

1561
crates/kreuzberg-ffi/cbindgen.toml generated Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,87 @@
# kreuzberg-ffi CMake config-mode find module
#
# Defines the imported target:
#   kreuzberg-ffi::kreuzberg-ffi
#
# Usage:
#   find_package(kreuzberg-ffi REQUIRED)
#   target_link_libraries(myapp PRIVATE kreuzberg-ffi::kreuzberg-ffi)

# Nothing to do when the target was already defined by an earlier find_package() call.
if(TARGET kreuzberg-ffi::kreuzberg-ffi)
  return()
endif()

# The install prefix is the parent of the directory that contains this file.
get_filename_component(_FFI_CMAKE_DIR "${CMAKE_CURRENT_LIST_FILE}" PATH)
get_filename_component(_FFI_PREFIX "${_FFI_CMAKE_DIR}/.." ABSOLUTE)

# Prefer the library shipped next to this file; fall back to the default search paths.
find_library(_FFI_LIBRARY
  NAMES kreuzberg_ffi libkreuzberg_ffi
  PATHS "${_FFI_PREFIX}/lib"
  NO_DEFAULT_PATH
)
if(NOT _FFI_LIBRARY)
  find_library(_FFI_LIBRARY NAMES kreuzberg_ffi libkreuzberg_ffi)
endif()

# Same two-step lookup for the public header.
find_path(_FFI_INCLUDE_DIR
  NAMES kreuzberg.h
  PATHS "${_FFI_PREFIX}/include"
  NO_DEFAULT_PATH
)
if(NOT _FFI_INCLUDE_DIR)
  find_path(_FFI_INCLUDE_DIR NAMES kreuzberg.h)
endif()

include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(kreuzberg-ffi
  REQUIRED_VARS _FFI_LIBRARY _FFI_INCLUDE_DIR
)

# find_package_handle_standard_args() names its result variable after the exact
# package name it was given, i.e. `kreuzberg-ffi_FOUND` (with a hyphen). Testing
# `kreuzberg_ffi_FOUND` (underscore) never succeeds, so the imported target
# would silently never be created.
if(kreuzberg-ffi_FOUND)
  # Derive the imported library type from the file extension.
  set(_FFI_LIB_TYPE UNKNOWN)
  if(_FFI_LIBRARY MATCHES "\\.(dylib|so)$" OR _FFI_LIBRARY MATCHES "\\.so\\.")
    set(_FFI_LIB_TYPE SHARED)
  elseif(_FFI_LIBRARY MATCHES "\\.dll$")
    set(_FFI_LIB_TYPE SHARED)
  elseif(_FFI_LIBRARY MATCHES "\\.(a|lib)$")
    set(_FFI_LIB_TYPE STATIC)
  endif()

  add_library(kreuzberg-ffi::kreuzberg-ffi ${_FFI_LIB_TYPE} IMPORTED)
  set_target_properties(kreuzberg-ffi::kreuzberg-ffi PROPERTIES
    IMPORTED_LOCATION "${_FFI_LIBRARY}"
    INTERFACE_INCLUDE_DIRECTORIES "${_FFI_INCLUDE_DIR}"
  )

  # On Windows a shared library is split into the runtime DLL and its import
  # library: point IMPORTED_LOCATION at the DLL and IMPORTED_IMPLIB at the
  # library located above.
  if(WIN32 AND _FFI_LIB_TYPE STREQUAL "SHARED")
    find_file(_FFI_DLL
      NAMES kreuzberg_ffi.dll libkreuzberg_ffi.dll
      PATHS "${_FFI_PREFIX}/bin" "${_FFI_PREFIX}/lib"
      NO_DEFAULT_PATH
    )
    if(_FFI_DLL)
      set_target_properties(kreuzberg-ffi::kreuzberg-ffi PROPERTIES
        IMPORTED_LOCATION "${_FFI_DLL}"
        IMPORTED_IMPLIB "${_FFI_LIBRARY}"
      )
    endif()
    unset(_FFI_DLL CACHE)
  endif()

  # Platform system libraries that consumers must link alongside the FFI library.
  if(APPLE)
    set_property(TARGET kreuzberg-ffi::kreuzberg-ffi APPEND PROPERTY
      INTERFACE_LINK_LIBRARIES "-framework CoreFoundation" "-framework Security" pthread)
  elseif(UNIX)
    set_property(TARGET kreuzberg-ffi::kreuzberg-ffi APPEND PROPERTY
      INTERFACE_LINK_LIBRARIES pthread dl m)
  elseif(WIN32)
    set_property(TARGET kreuzberg-ffi::kreuzberg-ffi APPEND PROPERTY
      INTERFACE_LINK_LIBRARIES ws2_32 userenv bcrypt)
  endif()

  unset(_FFI_LIB_TYPE)
endif()

mark_as_advanced(_FFI_LIBRARY _FFI_INCLUDE_DIR)
unset(_FFI_CMAKE_DIR)
unset(_FFI_PREFIX)

14155
crates/kreuzberg-ffi/include/kreuzberg.h generated Normal file

File diff suppressed because it is too large Load Diff

34419
crates/kreuzberg-ffi/src/lib.rs generated Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,177 @@
/// Regression test for GitHub #1059.
///
/// `kreuzberg_email_attachment_data` was the only byte-buffer accessor on a public
/// FFI-exposed DTO that did not follow the established `*_data(ptr, out_len: *mut usize)`
/// protocol used by `kreuzberg_extracted_image_data`, `kreuzberg_embedded_file_data`,
/// and `kreuzberg_batch_bytes_item_content`.
///
/// Because `EmailAttachment.data` is `Option<Bytes>` (the only optional byte buffer among
/// public types), alef's heuristic for emitting the two-parameter form did not trigger.
/// Callers had no way to know the valid length of the returned pointer, making any read
/// past the first byte undefined behaviour (especially for payloads containing 0x00).
///
/// The alef fix shipped with the 2-parameter form (`ptr`, `out_len`). These tests
/// lock in the correct 2-param ABI and verify the full-length contract for payloads
/// that contain embedded NUL bytes.
///
/// Per project rules: every unsafe block has a SAFETY comment.
use std::ffi::{c_char, CString};
use std::fs;
use std::path::Path;
use kreuzberg_ffi::{kreuzberg_email_attachment_free, kreuzberg_email_attachment_from_json, kreuzberg_last_error_code};
/// Builds the JSON for a minimal `EmailAttachment` whose `data` payload holds an
/// embedded NUL byte and ends in a high byte (0xEF), so that strlen-based or
/// "first byte only" readers are caught. Returned as a `CString` ready to be
/// handed across the FFI boundary.
fn attachment_json_with_nuls() -> CString {
    // 8 bytes: JPEG-ish magic, a NUL in the middle, a high byte at the end.
    // The `size` field is derived from this buffer, so the length is authoritative.
    let payload: Vec<u8> = vec![0xFF, 0xD8, 0xFF, 0x00, 0xDE, 0xAD, 0xBE, 0xEF];
    let payload_len = payload.len();
    let payload_json = serde_json::to_string(&payload).unwrap();

    let document = format!(
        r#"{{
"name": "test.bin",
"filename": "test.bin",
"mime_type": "application/octet-stream",
"size": {},
"is_image": false,
"data": {}
}}"#,
        payload_len, payload_json
    );

    CString::new(document).expect("valid UTF-8 JSON for test attachment")
}
/// The committed C header must declare the 2-parameter form for
/// `kreuzberg_email_attachment_data` (with `out_len`). This locks in the fix
/// for GitHub #1059 so a future regeneration cannot silently revert to the
/// 1-parameter form.
///
/// Only the accessor's own parameter list is inspected. Searching the whole
/// header for `out_len` would always succeed, because other byte-buffer
/// accessors in the same header already take an `out_len` parameter.
#[test]
fn email_attachment_data_accessor_must_provide_out_len_in_header() {
    // The function name followed by the opening parenthesis of its parameter
    // list; the parenthesis keeps longer identifiers that merely start with
    // the same name from matching.
    const ACCESSOR_OPEN: &str = "kreuzberg_email_attachment_data(";

    let header_path = Path::new(env!("CARGO_MANIFEST_DIR")).join("include/kreuzberg.h");
    let header = fs::read_to_string(&header_path).expect("committed kreuzberg.h must be readable by the test");

    // For every place the accessor is spelled with a parameter list, look only at
    // the text up to the closing parenthesis. Mentions in comments are tolerated
    // as long as at least one occurrence carries `out_len`.
    let has_out_len = header.match_indices(ACCESSOR_OPEN).any(|(start, matched)| {
        let after_paren = &header[start + matched.len()..];
        let params = after_paren.split(')').next().unwrap_or(after_paren);
        params.contains("out_len")
    });

    assert!(
        has_out_len,
        "GitHub #1059 regression: the declaration of kreuzberg_email_attachment_data \
in crates/kreuzberg-ffi/include/kreuzberg.h does not contain the required \
`out_len` parameter.\n\n\
Expected something like:\n uint8_t *kreuzberg_email_attachment_data(..., uintptr_t *out_len);\n\n\
Found the old 1-parameter form. Fix requires `task alef:generate` with an \
updated alef that handles Option<Bytes> fields for the FFI byte accessor heuristic.\n\n\
This is the lock-in test for #1059."
    );
}
/// An attachment deserialized with `"data": null` must make the accessor return
/// a null pointer and store 0 in `out_len`.
#[test]
fn email_attachment_data_none_returns_null_pointer() {
    let attachment_json = CString::new(
        r#"{"name":"empty","filename":"empty","mime_type":null,"size":null,"is_image":false,"data":null}"#,
    )
    .unwrap();
    let raw_json: *const c_char = attachment_json.as_ptr();
    // SAFETY: `raw_json` points into `attachment_json`, a NUL-terminated CString
    // that stays alive for the whole call.
    let attachment = unsafe { kreuzberg_email_attachment_from_json(raw_json) };
    assert!(
        !attachment.is_null(),
        "from_json should succeed (last_error_code={})",
        // SAFETY: no precondition; reads a thread-local.
        unsafe { kreuzberg_last_error_code() }
    );

    // Start from a sentinel so the check below proves the accessor wrote the 0.
    let mut reported_len = usize::MAX;
    // SAFETY: `attachment` is the non-null handle returned by from_json above and
    // `reported_len` is a live local the accessor may write to.
    let payload = unsafe { kreuzberg_ffi::kreuzberg_email_attachment_data(attachment, &mut reported_len) };
    assert!(
        payload.is_null(),
        "data must be null when the attachment has no payload"
    );
    assert_eq!(reported_len, 0, "out_len must be 0 when data is None");

    // SAFETY: `attachment` came from from_json and has not been freed yet; this
    // test is its only owner.
    unsafe { kreuzberg_email_attachment_free(attachment) };
}
/// An attachment with a binary payload must make the accessor return a non-null
/// pointer and report the exact byte count in `out_len`, counting the bytes that
/// follow an embedded NUL. This is the contract the 1-parameter bug (#1059) broke.
#[test]
fn email_attachment_data_with_out_len_returns_full_buffer_including_embedded_nuls() {
    let attachment_json = attachment_json_with_nuls();
    let raw_json: *const c_char = attachment_json.as_ptr();
    // SAFETY: `raw_json` points into the NUL-terminated CString created just above,
    // which stays alive for the whole call.
    let attachment = unsafe { kreuzberg_email_attachment_from_json(raw_json) };
    assert!(
        !attachment.is_null(),
        "from_json should succeed for our well-formed test attachment (last_error_code={})",
        // SAFETY: no precondition; reads a thread-local.
        unsafe { kreuzberg_last_error_code() }
    );

    let mut reported_len: usize = 0;
    // SAFETY: `attachment` is the non-null handle from from_json and `reported_len`
    // is a live local. The returned pointer borrows the handle's internal Bytes,
    // so it must not be freed here.
    let payload_ptr = unsafe { kreuzberg_ffi::kreuzberg_email_attachment_data(attachment, &mut reported_len) };
    assert!(
        !payload_ptr.is_null(),
        "data pointer must be non-null for an attachment we created with a Some(data) payload"
    );
    assert_eq!(
        reported_len, 8,
        "out_len must report the exact length of the Bytes payload (not 0, not guessed, not truncated at NUL)"
    );

    // SAFETY: `payload_ptr` is non-null and `reported_len` is the length the
    // accessor reported for it. The handle that owns the bytes is not freed until
    // after the last use of `payload`.
    let payload = unsafe { std::slice::from_raw_parts(payload_ptr, reported_len) };
    assert_eq!(payload.len(), 8);
    assert_eq!(payload[0], 0xFF);
    assert_eq!(payload[3], 0x00, "must be able to read the embedded NUL");
    assert_eq!(
        payload[7], 0xEF,
        "must be able to read bytes after the NUL (no truncation)"
    );

    // SAFETY: `attachment` came from from_json and has not been freed yet; this
    // test is its only owner.
    unsafe { kreuzberg_email_attachment_free(attachment) };
}
/// Verify that passing a null out_len pointer is safe: the accessor must not
/// segfault, and the data pointer itself must still be returned.
#[test]
fn email_attachment_data_null_out_len_is_safe() {
    let json = CString::new(
        r#"{"name":"hasdata.bin","filename":"hasdata.bin","mime_type":"application/octet-stream","size":4,"is_image":false,"data":[65,0,66,67]}"#,
    )
    .unwrap();
    // SAFETY: json is a valid null-terminated CString that outlives the call.
    let handle = unsafe { kreuzberg_email_attachment_from_json(json.as_ptr() as *const c_char) };
    // Report the FFI error code on failure, as the sibling tests do, so a broken
    // from_json is distinguishable from a broken accessor.
    assert!(
        !handle.is_null(),
        "from_json should succeed (last_error_code={})",
        // SAFETY: no precondition; reads a thread-local.
        unsafe { kreuzberg_last_error_code() }
    );
    // SAFETY: handle is valid; passing null for out_len is a defined contract
    // (the accessor null-checks before writing).
    let data_ptr = unsafe { kreuzberg_ffi::kreuzberg_email_attachment_data(handle, std::ptr::null_mut()) };
    assert!(
        !data_ptr.is_null(),
        "data pointer should be non-null when the attachment carries a payload"
    );
    // SAFETY: handle from from_json; we are the owner.
    unsafe { kreuzberg_email_attachment_free(handle) };
}

View File

@@ -0,0 +1,204 @@
/// Regression tests: vtable Bytes params carry companion length
///
/// The alef vtable generator previously emitted only `*const u8` for `&[u8]`
/// trait-method parameters without a companion `{name}_len: usize`. Binary
/// payloads contain embedded NUL bytes; read-until-NUL semantics silently
/// truncated every real image or document buffer at the first `0x00`.
///
/// Fix shipped in alef ≥ v0.19.21 and is present in the generated FFI shim.
/// These tests construct a vtable bridge directly, pass a buffer with an
/// embedded NUL at a known offset, and assert the full buffer is received.
///
/// Per-test state is passed via `user_data` — no global statics — so tests
/// are independent and can run in parallel without interfering.
use kreuzberg_ffi::{
KreuzbergDocumentExtractorBridge, KreuzbergDocumentExtractorVTable, KreuzbergOcrBackendBridge,
KreuzbergOcrBackendVTable,
};
use std::sync::atomic::{AtomicU8, AtomicUsize, Ordering};
// ── Per-test callback state ───────────────────────────────────────────────
/// What a vtable callback observed about the byte buffer it was handed.
///
/// The fields are atomics so a callback can record through the shared reference
/// it recovers from its `user_data` pointer.
struct CallbackState {
    /// Length argument of the most recent callback invocation.
    received_len: AtomicUsize,
    /// Final byte of that buffer; not written when the buffer was empty.
    received_last_byte: AtomicU8,
}

impl CallbackState {
    /// Create a state with both observations set to zero.
    fn new() -> Self {
        let received_len = AtomicUsize::new(0);
        let received_last_byte = AtomicU8::new(0);
        Self {
            received_len,
            received_last_byte,
        }
    }
}
// ── C callback stubs ─────────────────────────────────────────────────────
unsafe extern "C" fn ocr_process_image(
user_data: *const std::ffi::c_void,
image_bytes: *const u8,
image_bytes_len: usize,
_config: *const std::ffi::c_char,
out_result: *mut *mut std::ffi::c_char,
out_error: *mut *mut std::ffi::c_char,
) -> i32 {
// SAFETY: user_data points to a CallbackState that the calling test keeps alive.
let state = unsafe { &*(user_data as *const CallbackState) };
state.received_len.store(image_bytes_len, Ordering::SeqCst);
if image_bytes_len > 0 {
// SAFETY: caller guarantees image_bytes[0..image_bytes_len] is valid.
let last = unsafe { *image_bytes.add(image_bytes_len - 1) };
state.received_last_byte.store(last, Ordering::SeqCst);
}
unsafe { *out_result = std::ptr::null_mut() };
let msg = std::ffi::CString::new("stub").unwrap();
// SAFETY: caller owns out_error and will free it via kreuzberg_free_string.
unsafe { *out_error = msg.into_raw() };
1
}
unsafe extern "C" fn extractor_extract_bytes(
user_data: *const std::ffi::c_void,
content: *const u8,
content_len: usize,
_mime_type: *const std::ffi::c_char,
_config: *const std::ffi::c_char,
out_result: *mut *mut std::ffi::c_char,
out_error: *mut *mut std::ffi::c_char,
) -> i32 {
// SAFETY: user_data points to a CallbackState that the calling test keeps alive.
let state = unsafe { &*(user_data as *const CallbackState) };
state.received_len.store(content_len, Ordering::SeqCst);
if content_len > 0 {
// SAFETY: caller guarantees content[0..content_len] is valid.
let last = unsafe { *content.add(content_len - 1) };
state.received_last_byte.store(last, Ordering::SeqCst);
}
unsafe { *out_result = std::ptr::null_mut() };
let msg = std::ffi::CString::new("stub").unwrap();
unsafe { *out_error = msg.into_raw() };
1
}
// ── Tests ─────────────────────────────────────────────────────────────────
/// `OcrBackend::process_image` must hand the vtable callback the complete buffer
/// length even when the image bytes contain an embedded NUL.
#[tokio::test]
async fn ocr_backend_vtable_process_image_passes_full_length_with_embedded_nuls() {
    use kreuzberg::OcrBackend;

    // Eight bytes with a NUL at index 3; a strlen-style read would stop after three.
    let payload: Vec<u8> = vec![0xFF, 0xD8, 0xFF, 0x00, 0xDE, 0xAD, 0xBE, 0xEF];

    // Boxed so the address given to the bridge stays fixed for the whole test.
    let observed = Box::new(CallbackState::new());
    let user_data = (&*observed as *const CallbackState).cast::<std::ffi::c_void>();

    // Only the slot under test is populated; every other entry stays empty.
    let vtable = KreuzbergOcrBackendVTable {
        process_image: Some(ocr_process_image),
        process_image_file: None,
        name_fn: None,
        version_fn: None,
        initialize_fn: None,
        shutdown_fn: None,
        supports_language: None,
        backend_type: None,
        supported_languages: None,
        supports_table_detection: None,
        supports_document_processing: None,
        process_document: None,
        free_user_data: None,
    };
    // SAFETY: `observed` is declared before the bridge, so it is dropped after it
    // and outlives every callback invocation.
    let bridge = unsafe { KreuzbergOcrBackendBridge::new("test-ocr-stub".to_string(), vtable, user_data) };

    // The call's outcome is discarded; only what the callback recorded matters.
    let _ = bridge
        .process_image(&payload, &kreuzberg::OcrConfig::default())
        .await;

    let seen_len = observed.received_len.load(Ordering::SeqCst);
    assert_eq!(
        seen_len,
        8,
        "process_image vtable received wrong length (truncated at embedded NUL?)"
    );
    let seen_last_byte = observed.received_last_byte.load(Ordering::SeqCst);
    assert_eq!(
        seen_last_byte,
        0xEF,
        "process_image vtable could not read past the embedded NUL"
    );
}
/// `DocumentExtractor::extract_bytes` must hand the vtable callback the complete
/// buffer length even when the document bytes contain embedded NULs.
#[tokio::test]
async fn document_extractor_vtable_extract_bytes_passes_full_length_with_embedded_nuls() {
    use kreuzberg::DocumentExtractor;

    // Eight bytes with the first NUL at index 2; a strlen-style read would stop after two.
    let payload: Vec<u8> = vec![0x50, 0x4B, 0x00, 0x03, 0x14, 0x00, 0x00, 0x02];

    // Boxed so the address given to the bridge stays fixed for the whole test.
    let observed = Box::new(CallbackState::new());
    let user_data = (&*observed as *const CallbackState).cast::<std::ffi::c_void>();

    // Only the slot under test is populated; every other entry stays empty.
    let vtable = KreuzbergDocumentExtractorVTable {
        extract_bytes: Some(extractor_extract_bytes),
        extract_file: None,
        name_fn: None,
        version_fn: None,
        initialize_fn: None,
        shutdown_fn: None,
        supported_mime_types: None,
        priority: None,
        can_handle: None,
        free_user_data: None,
    };
    // SAFETY: `observed` is declared before the bridge, so it is dropped after it
    // and outlives every callback invocation.
    let bridge = unsafe { KreuzbergDocumentExtractorBridge::new("test-extractor-stub".to_string(), vtable, user_data) };

    // The call's outcome is discarded; only what the callback recorded matters.
    let _ = bridge
        .extract_bytes(
            &payload,
            "application/octet-stream",
            &kreuzberg::ExtractionConfig::default(),
        )
        .await;

    let seen_len = observed.received_len.load(Ordering::SeqCst);
    assert_eq!(
        seen_len,
        8,
        "extract_bytes vtable received wrong length (truncated at embedded NUL?)"
    );
    let seen_last_byte = observed.received_last_byte.load(Ordering::SeqCst);
    assert_eq!(
        seen_last_byte,
        0x02,
        "extract_bytes vtable could not read past the embedded NUL"
    );
}
/// Pins the `ImageKind` ordinals: PageRaster is 10 and Unknown is 11.
///
/// alef ≥ v0.19.21 inserted PageRaster between Mask (9) and Unknown, which moved
/// Unknown from 10 to 11. C/Go/Java/C# code that hardcoded Unknown = 10 has to be
/// updated; this test makes the renumbering visible to CI.
#[test]
fn image_kind_page_raster_is_10_and_unknown_is_11() {
    // SAFETY: pure integer dispatch, no pointers.
    let from_i32 = |raw| unsafe { kreuzberg_ffi::kreuzberg_image_kind_from_i32(raw) };

    assert_eq!(from_i32(10), 10, "PageRaster == 10");
    assert_eq!(from_i32(11), 11, "Unknown == 11");
    // 10 was the old Unknown ordinal. It must still be accepted, i.e. not mapped
    // to -1 (presumably the "invalid value" sentinel — confirm against the FFI).
    assert_ne!(from_i32(10), -1, "10 must be valid");
}

View File

@@ -0,0 +1,22 @@
[package]
name = "kreuzberg-jni"
version.workspace = true
edition.workspace = true
rust-version.workspace = true
authors.workspace = true
license.workspace = true
repository.workspace = true
homepage.workspace = true
publish = false
[lib]
crate-type = ["cdylib"]
name = "kreuzberg_jni"
[dependencies]
base64 = "0.22"
jni = "0.21"
kreuzberg-ffi = { path = "../kreuzberg-ffi" }
[lints]
workspace = true

File diff suppressed because it is too large Load Diff

32
crates/kreuzberg-node/Cargo.toml generated Normal file
View File

@@ -0,0 +1,32 @@
[package]
name = "kreuzberg-node"
version = "5.0.0-rc.3"
edition = "2024"
license = "Elastic-2.0"
description = "High-performance document intelligence library"
readme = false
keywords = ["document", "extraction", "ocr", "pdf", "text"]
categories = ["text-processing"]
# `serde_json` is emitted unconditionally below so the manifest is stable
# across regens, but for umbrella crates with no JSON-marshalled return types
# it is genuinely unused. The conditional `async-trait` / `futures-util` deps
# are similarly flagged when the umbrella has trait-bridge / streaming
# adapters configured but no actual async-trait callsite in this binding.
[package.metadata.cargo-machete]
ignored = ["serde_json", "async-trait"]
[lib]
crate-type = ["cdylib"]
[dependencies]
async-trait = "0.1"
kreuzberg = { version = "5.0.0-rc.3", path = "../kreuzberg", features = ["full", "pdf", "ocr", "paddle-ocr", "paddle-ocr-types", "layout-detection", "layout-types", "embeddings", "embedding-presets", "chunking", "keywords-yake", "keywords-rake", "language-detection", "html", "tree-sitter", "office", "email", "archives", "stopwords", "auto-rotate", "auto-rotate-types", "tokio-runtime", "api", "mcp", "liter-llm", "quality"] }
napi = { version = "3", features = ["async", "serde-json"] }
napi-derive = "3"
serde = { version = "1", features = ["derive"] }
serde_json = "1"
serde_with = "3"
[build-dependencies]
napi-build = "2"

93
crates/kreuzberg-node/LICENSE generated Normal file
View File

@@ -0,0 +1,93 @@
Elastic License 2.0 (ELv2)
Copyright 2025-2026 Kreuzberg, Inc.
Acceptance
By using the software, you agree to all of the terms and conditions below.
Copyright License
The licensor grants you a non-exclusive, royalty-free, worldwide,
non-sublicensable, non-transferable license to use, copy, distribute, make
available, and prepare derivative works of the software, in each case subject to
the limitations and conditions below.
Limitations
You may not provide the software to third parties as a hosted or managed
service, where the service provides users with access to any substantial set of
the features or functionality of the software.
You may not move, change, disable, or circumvent the license key functionality
in the software, and you may not remove or obscure any functionality in the
software that is protected by the license key.
You may not alter, remove, or obscure any licensing, copyright, or other notices
of the licensor in the software. Any use of the licensor's trademarks is subject
to applicable law.
Patents
The licensor grants you a license, under any patent claims the licensor can
license, or becomes able to license, to make, have made, use, sell, offer for
sale, import and have imported the software, in each case subject to the
limitations and conditions in this license. This license does not cover any
patent claims that you cause to be infringed by modifications or additions to the
software. If you or your company make any written claim that the software
infringes or contributes to infringement of any patent, your patent license for
the software granted under these terms ends immediately. If your company makes
such a claim, your patent license ends immediately for work on behalf of your
company.
Notices
You must ensure that anyone who gets a copy of any part of the software from you
also gets a copy of these terms.
If you modify the software, you must include in any modified copies of the
software prominent notices stating that you have modified the software.
No Other Rights
These terms do not imply any licenses other than those expressly granted in
these terms.
Termination
If you use the software in violation of these terms, such use is not licensed,
and your licenses will automatically terminate. If the licensor provides you with
a notice of your violation, and you cease all violation of this license no later
than 30 days after you receive that notice, your licenses will be reinstated
retroactively. However, if you violate these terms after such reinstatement, any
additional violation of these terms will cause your licenses to terminate
automatically and permanently.
No Liability
As far as the law allows, the software comes as is, without any warranty or
condition, and the licensor will not be liable to you for any damages arising out
of these terms or the use or nature of the software, under any kind of legal
claim.
Definitions
The licensor is the entity offering these terms, and the software is the
software the licensor makes available under these terms, including any portion
of it.
you refers to the individual or entity agreeing to these terms.
your company is any legal entity, sole proprietorship, or other kind of
organization that you work for, plus all organizations that have control over,
are under the control of, or are under common control with that organization.
control means ownership of substantially all the assets of an entity, or the
power to direct its management and policies by vote, contract, or otherwise.
Control can be direct or indirect.
your licenses are all the licenses granted to you for the software under these
terms.
use means anything you do with the software requiring one of your licenses.
trademark means trademarks, service marks, and similar rights.

488
crates/kreuzberg-node/README.md generated Normal file
View File

@@ -0,0 +1,488 @@
# TypeScript (Node.js)
<div align="center" style="display: flex; flex-wrap: wrap; gap: 8px; justify-content: center; margin: 20px 0;">
<a href="https://github.com/kreuzberg-dev/alef">
<img src="https://img.shields.io/badge/Bindings-alef%20%D7%90-007ec6" alt="Bindings">
</a>
<!-- Language Bindings -->
<a href="https://crates.io/crates/kreuzberg">
<img src="https://img.shields.io/crates/v/kreuzberg?label=Rust&color=007ec6" alt="Rust">
</a>
<a href="https://pypi.org/project/kreuzberg/">
<img src="https://img.shields.io/pypi/v/kreuzberg?label=Python&color=007ec6" alt="Python">
</a>
<a href="https://www.npmjs.com/package/@kreuzberg/node">
<img src="https://img.shields.io/npm/v/@kreuzberg/node?label=Node.js&color=007ec6" alt="Node.js">
</a>
<a href="https://www.npmjs.com/package/@kreuzberg/wasm">
<img src="https://img.shields.io/npm/v/@kreuzberg/wasm?label=WASM&color=007ec6" alt="WASM">
</a>
<a href="https://central.sonatype.com/artifact/dev.kreuzberg/kreuzberg">
<img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
</a>
<a href="https://github.com/kreuzberg-dev/kreuzberg/tree/main/packages/go/v5">
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v5*" alt="Go">
</a>
<a href="https://www.nuget.org/packages/Kreuzberg/">
<img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
</a>
<a href="https://packagist.org/packages/kreuzberg/kreuzberg">
<img src="https://img.shields.io/packagist/v/kreuzberg/kreuzberg?label=PHP&color=007ec6" alt="PHP">
</a>
<a href="https://rubygems.org/gems/kreuzberg">
<img src="https://img.shields.io/gem/v/kreuzberg?label=Ruby&color=007ec6" alt="Ruby">
</a>
<a href="https://hex.pm/packages/kreuzberg">
<img src="https://img.shields.io/hexpm/v/kreuzberg?label=Elixir&color=007ec6" alt="Elixir">
</a>
<a href="https://kreuzberg-dev.r-universe.dev/kreuzberg">
<img src="https://img.shields.io/badge/R-kreuzberg-007ec6" alt="R">
</a>
<a href="https://pub.dev/packages/kreuzberg">
<img src="https://img.shields.io/pub/v/kreuzberg?label=Dart&color=007ec6" alt="Dart">
</a>
<a href="https://central.sonatype.com/artifact/dev.kreuzberg/kreuzberg-android">
<img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg-android?label=Kotlin&color=007ec6" alt="Kotlin">
</a>
<a href="https://github.com/kreuzberg-dev/kreuzberg/tree/main/packages/swift">
<img src="https://img.shields.io/badge/Swift-SPM-007ec6" alt="Swift">
</a>
<a href="https://github.com/kreuzberg-dev/kreuzberg/tree/main/packages/zig">
<img src="https://img.shields.io/badge/Zig-package-007ec6" alt="Zig">
</a>
<a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
<img src="https://img.shields.io/badge/C-FFI-007ec6" alt="C FFI">
</a>
<a href="https://github.com/kreuzberg-dev/kreuzberg/pkgs/container/kreuzberg">
<img src="https://img.shields.io/badge/Docker-ghcr.io-007ec6?logo=docker&logoColor=white" alt="Docker">
</a>
<a href="https://github.com/kreuzberg-dev/kreuzberg/pkgs/container/charts%2Fkreuzberg">
<img src="https://img.shields.io/badge/Helm-ghcr.io-007ec6?logo=helm&logoColor=white" alt="Helm">
</a>
<!-- Project Info -->
<a href="https://github.com/kreuzberg-dev/kreuzberg/blob/main/LICENSE">
<img src="https://img.shields.io/badge/License-Elastic--2.0-007ec6" alt="License">
</a>
<a href="https://docs.kreuzberg.dev">
<img src="https://img.shields.io/badge/Docs-kreuzberg-007ec6" alt="Documentation">
</a>
<a href="https://huggingface.co/Kreuzberg">
<img src="https://img.shields.io/badge/Hugging%20Face-Kreuzberg-007ec6" alt="Hugging Face">
</a>
</div>
<div align="center" style="margin: 24px 0 0;">
<a href="https://kreuzberg.dev">
<img alt="Kreuzberg" src="https://github.com/user-attachments/assets/419fc06c-8313-4324-b159-4b4d3cfce5c0" />
</a>
</div>
<div align="center" style="display: flex; flex-wrap: wrap; gap: 12px; justify-content: center; margin: 28px 0 24px;">
<a href="https://discord.gg/xt9WY3GnKR">
<img height="22" src="https://img.shields.io/badge/Discord-Chat-007ec6?logo=discord&logoColor=white" alt="Join Discord">
</a>
<a href="https://docs.kreuzberg.dev/demo.html">
<img height="22" src="https://img.shields.io/badge/Live%20Demo-Open-007ec6?logo=webassembly&logoColor=white" alt="Live Demo">
</a>
</div>
Extract text, tables, images, and metadata from 90+ file formats and 300+ programming languages including PDF, Office documents, and images. Native NAPI-RS bindings for Node.js with superior performance, async/await support, and TypeScript type definitions.
## What This Package Provides
- **Document intelligence core** — extract text, tables, images, metadata, entities, keywords, and code intelligence from one API.
- **Format coverage** — PDF, Office, images, HTML/XML, email, archives, notebooks, citations, scientific formats, and plain text.
- **OCR choices** — Tesseract, PaddleOCR, EasyOCR where supported, VLM OCR through liter-llm, and plugin hooks for custom backends.
- **Same engine as every binding** — Rust, Python, Node.js, Go, Java, PHP, Ruby, .NET, Elixir, R, WASM, Kotlin Android, Swift, Dart, Zig, and C FFI share the same Rust implementation.
- **Node-first TypeScript API** — NAPI-RS package with typed options/results and async extraction.
## Installation
### Package Installation
```bash
pnpm add @kreuzberg/node
```
### System Requirements
- **Node.js 22+** required (NAPI-RS native bindings)
- Optional: [ONNX Runtime](https://github.com/microsoft/onnxruntime/releases) version 1.22.x for embeddings support
- Optional: [Tesseract OCR](https://github.com/tesseract-ocr/tesseract) for OCR functionality
### Platform Support
Pre-built binaries available for:
- macOS (arm64, x64)
- Linux (x64)
- Windows (x64)
## Quick Start
### Basic Extraction
Extract text, metadata, and structure from any supported document format:
```typescript title="TypeScript"
import { extractFileSync } from "@kreuzberg/node";
const config = {
useCache: true,
enableQualityProcessing: true,
};
const result = extractFileSync("document.pdf", null, config);
console.log(result.content);
console.log(`MIME Type: ${result.mimeType}`);
```
### Common Use Cases
#### Extract with Custom Configuration
Most use cases benefit from configuration to control extraction behavior:
**With OCR (for scanned documents):**
```typescript title="TypeScript"
import { extractFile } from "@kreuzberg/node";
const config = {
ocr: {
backend: "tesseract",
language: "eng+fra",
tesseractConfig: {
psm: 3,
},
},
};
const result = await extractFile("document.pdf", null, config);
console.log(result.content);
```
#### Table Extraction
```typescript title="TypeScript"
import { extractFileSync } from "@kreuzberg/node";
const result = extractFileSync("document.pdf");
result.tables?.forEach((table) => {
console.log(`Table with ${table.cells?.length ?? 0} rows`);
console.log(table.markdown);
table.cells?.forEach((row) => console.log(row.join(" | ")));
});
```
#### Processing Multiple Files
```typescript title="TypeScript"
import { batchExtractFilesSync } from "@kreuzberg/node";
const files = ["doc1.pdf", "doc2.docx", "doc3.pptx"];
const results = batchExtractFilesSync(files);
results.forEach((result, i) => {
console.log(`File ${i + 1}: ${result.content.length} characters`);
});
```
#### Async Processing
For non-blocking document processing:
```typescript title="TypeScript"
import { extractFile } from "@kreuzberg/node";
const result = await extractFile("document.pdf");
console.log(result.content);
```
#### Configuration Discovery
```typescript title="config_discovery.ts"
import { ExtractionConfig, extractFile } from "@kreuzberg/node";
const config = ExtractionConfig.discover();
if (config) {
console.log("Found configuration file");
const result = await extractFile("document.pdf", null, config);
console.log(result.content);
} else {
console.log("No configuration file found, using defaults");
const result = await extractFile("document.pdf");
console.log(result.content);
}
```
#### Worker Thread Pool
```typescript title="worker_pool.ts"
import {
createWorkerPool,
extractFileInWorker,
batchExtractFilesInWorker,
closeWorkerPool,
} from "@kreuzberg/node";
// Create a pool with 4 worker threads
const pool = createWorkerPool(4);
try {
// Extract single file in worker
const result = await extractFileInWorker(pool, "document.pdf", null, {
useCache: true,
});
console.log(result.content);
// Extract multiple files concurrently
const files = ["doc1.pdf", "doc2.docx", "doc3.xlsx"];
const results = await batchExtractFilesInWorker(pool, files, {
useCache: true,
});
results.forEach((result, i) => {
console.log(`File ${i + 1}: ${result.content.length} characters`);
});
} finally {
// Always close the pool when done
await closeWorkerPool(pool);
}
```
**Performance Benefits:**
- **Parallel Processing**: Multiple documents extracted simultaneously
- **CPU Utilization**: Maximizes multi-core CPU usage for large batches
- **Queue Management**: Automatically distributes work across available workers
- **Resource Control**: Prevents thread exhaustion with configurable pool size
**Best Practices:**
- Use worker pools for batches of 10+ documents
- Set pool size to the number of CPU cores (default behavior)
- Always close pools with `closeWorkerPool()` to prevent resource leaks
- Reuse pools across multiple batch operations for efficiency
### Next Steps
- **[Installation Guide](https://docs.kreuzberg.dev/getting-started/installation/)** - Platform-specific setup
- **[API Documentation](https://docs.kreuzberg.dev/reference/api-typescript/)** - Complete API reference
- **[Examples & Guides](https://docs.kreuzberg.dev/)** - Full code examples and usage guides
- **[Configuration Guide](https://docs.kreuzberg.dev/guides/configuration/)** - Advanced configuration options
## NAPI-RS Implementation Details
### Native Performance
This binding uses NAPI-RS to provide native Node.js bindings with:
- **Zero-copy data transfer** between JavaScript and Rust layers
- **Native thread pool** for concurrent document processing
- **Direct memory management** for efficient large document handling
- **Binary-compatible** pre-built native modules across platforms
### Threading Model
- Single documents are processed synchronously or asynchronously in a dedicated thread
- Batch operations distribute work across available CPU cores
- Thread count is configurable but defaults to system CPU count
- Long-running extractions block the event loop unless using async APIs
### Memory Management
- Large documents (> 100 MB) are streamed to avoid loading entirely into memory
- Temporary files are created in system temp directory for extraction
- Memory is automatically released after extraction completion
- ONNX models are cached in memory for repeated embeddings operations
## Features
### Supported File Formats (90+)
90+ file formats across 8 major categories with intelligent format detection and comprehensive metadata extraction.
#### Office Documents
| Category | Formats | Capabilities |
|----------|---------|--------------|
| **Word Processing** | `.docx`, `.docm`, `.dotx`, `.dotm`, `.dot`, `.odt` | Full text, tables, images, metadata, styles |
| **Spreadsheets** | `.xlsx`, `.xlsm`, `.xlsb`, `.xls`, `.xla`, `.xlam`, `.xltm`, `.xltx`, `.xlt`, `.ods` | Sheet data, formulas, cell metadata, charts |
| **Presentations** | `.pptx`, `.pptm`, `.ppsx`, `.potx`, `.potm`, `.pot`, `.ppt` | Slides, speaker notes, images, metadata |
| **PDF** | `.pdf` | Text, tables, images, metadata, OCR support |
| **eBooks** | `.epub`, `.fb2` | Chapters, metadata, embedded resources |
| **Database** | `.dbf` | Table data extraction, field type support |
| **Hangul** | `.hwp`, `.hwpx` | Korean document format, text extraction |
#### Images (OCR-Enabled)
| Category | Formats | Features |
|----------|---------|----------|
| **Raster** | `.png`, `.jpg`, `.jpeg`, `.gif`, `.webp`, `.bmp`, `.tiff`, `.tif` | OCR, table detection, EXIF metadata, dimensions, color space |
| **Advanced** | `.jp2`, `.jpx`, `.jpm`, `.mj2`, `.jbig2`, `.jb2`, `.pnm`, `.pbm`, `.pgm`, `.ppm` | OCR via hayro-jpeg2000 (pure Rust decoder), JBIG2 support, table detection, format-specific metadata |
| **Vector** | `.svg` | DOM parsing, embedded text, graphics metadata |
#### Web & Data
| Category | Formats | Features |
|----------|---------|----------|
| **Markup** | `.html`, `.htm`, `.xhtml`, `.xml`, `.svg` | DOM parsing, metadata (Open Graph, Twitter Card), link extraction |
| **Structured Data** | `.json`, `.yaml`, `.yml`, `.toml`, `.csv`, `.tsv` | Schema detection, nested structures, validation |
| **Text & Markdown** | `.txt`, `.md`, `.markdown`, `.djot`, `.rst`, `.org`, `.rtf` | CommonMark, GFM, Djot, reStructuredText, Org Mode |
#### Email & Archives
| Category | Formats | Features |
|----------|---------|----------|
| **Email** | `.eml`, `.msg` | Headers, body (HTML/plain), attachments, threading |
| **Archives** | `.zip`, `.tar`, `.tgz`, `.gz`, `.7z` | File listing, nested archives, metadata |
#### Academic & Scientific
| Category | Formats | Features |
|----------|---------|----------|
| **Citations** | `.bib`, `.biblatex`, `.ris`, `.nbib`, `.enw`, `.csl` | Structured parsing: RIS (structured), PubMed/MEDLINE, EndNote XML (structured), BibTeX, CSL JSON |
| **Scientific** | `.tex`, `.latex`, `.typst`, `.jats`, `.ipynb`, `.docbook` | LaTeX, Jupyter notebooks, PubMed JATS |
| **Documentation** | `.opml`, `.pod`, `.mdoc`, `.troff` | Technical documentation formats |
#### Code Intelligence (300+ Languages)
| Feature | Description |
|---------|-------------|
| **Structure Extraction** | Functions, classes, methods, structs, interfaces, enums |
| **Import/Export Analysis** | Module dependencies, re-exports, wildcard imports |
| **Symbol Extraction** | Variables, constants, type aliases, properties |
| **Docstring Parsing** | Google, NumPy, Sphinx, JSDoc, RustDoc, and 10+ formats |
| **Diagnostics** | Parse errors with line/column positions |
| **Syntax-Aware Chunking** | Split code by semantic boundaries, not arbitrary byte offsets |
Powered by [tree-sitter-language-pack](https://github.com/kreuzberg-dev/tree-sitter-language-pack) — [documentation](https://docs.tree-sitter-language-pack.kreuzberg.dev).
**[Complete Format Reference](https://docs.kreuzberg.dev/reference/formats/)**
### Key Capabilities
- **Text Extraction** - Extract all text content with position and formatting information
- **Metadata Extraction** - Retrieve document properties, creation date, author, etc.
- **Table Extraction** - Parse tables with structure and cell content preservation
- **Image Extraction** - Extract embedded images and render page previews
- **OCR Support** - Integrate multiple OCR backends for scanned documents
- **Async/Await** - Non-blocking document processing with concurrent operations
- **Plugin System** - Extensible post-processing for custom text transformation
- **Embeddings** - Generate vector embeddings using ONNX Runtime models
- **Batch Processing** - Efficiently process multiple documents in parallel
- **Memory Efficient** - Stream large files without loading entirely into memory
- **Language Detection** - Detect and support multiple languages in documents
- **Code Intelligence** - Extract structure, imports, exports, symbols, and docstrings from [300+ programming languages](https://docs.tree-sitter-language-pack.kreuzberg.dev) via tree-sitter
- **Configuration** - Fine-grained control over extraction behavior
### Performance Characteristics
| Format | Speed | Memory | Notes |
|--------|-------|--------|-------|
| **PDF (text)** | 10-100 MB/s | ~50MB per doc | Fastest extraction |
| **Office docs** | 20-200 MB/s | ~100MB per doc | DOCX, XLSX, PPTX |
| **Images (OCR)** | 1-5 MB/s | Variable | Depends on OCR backend |
| **Archives** | 5-50 MB/s | ~200MB per doc | ZIP, TAR, etc. |
| **Web formats** | 50-200 MB/s | Streaming | HTML, XML, JSON |
## OCR Support
Kreuzberg supports multiple OCR backends for extracting text from scanned documents and images:
- **Tesseract**
- **PaddleOCR**
### OCR Configuration Example
```typescript title="TypeScript"
import { extractFile } from "@kreuzberg/node";
const config = {
ocr: {
backend: "tesseract",
language: "eng+fra",
tesseractConfig: {
psm: 3,
},
},
};
const result = await extractFile("document.pdf", null, config);
console.log(result.content);
```
## Async Support
This binding provides full async/await support for non-blocking document processing:
```typescript title="TypeScript"
import { extractFile } from "@kreuzberg/node";
const result = await extractFile("document.pdf");
console.log(result.content);
```
## Plugin System
Kreuzberg supports extensible post-processing plugins for custom text transformation and filtering.
For detailed plugin documentation, visit [Plugin System Guide](https://docs.kreuzberg.dev/guides/plugins/).
## Embeddings Support
Generate vector embeddings for extracted text using the built-in ONNX Runtime support. Requires ONNX Runtime installation.
**[Embeddings Guide](https://docs.kreuzberg.dev/features/#embeddings)**
## Batch Processing
Process multiple documents efficiently:
```typescript title="TypeScript"
import { batchExtractFilesSync } from "@kreuzberg/node";
const files = ["doc1.pdf", "doc2.docx", "doc3.pptx"];
const results = batchExtractFilesSync(files);
results.forEach((result, i) => {
console.log(`File ${i + 1}: ${result.content.length} characters`);
});
```
## Configuration
For advanced configuration options including language detection, table extraction, OCR settings, and more:
**[Configuration Guide](https://docs.kreuzberg.dev/guides/configuration/)**
## Documentation
- **[Official Documentation](https://docs.kreuzberg.dev/)**
- **[API Reference](https://docs.kreuzberg.dev/reference/api-python/)**
- **[Examples & Guides](https://docs.kreuzberg.dev/)**
## Contributing
Contributions are welcome! See [Contributing Guide](https://github.com/kreuzberg-dev/kreuzberg/blob/main/CONTRIBUTING.md).
## Part of Kreuzberg.dev
- [Kreuzberg Cloud](https://github.com/kreuzberg-dev/kreuzberg-cloud) — managed extraction API with SDKs, dashboards, and observability.
- [kreuzcrawl](https://github.com/kreuzberg-dev/kreuzcrawl) — web crawling and scraping with HTML→Markdown and headless-Chrome fallback.
- [html-to-markdown](https://github.com/kreuzberg-dev/html-to-markdown) — fast, lossless HTML→Markdown engine.
- [liter-llm](https://github.com/kreuzberg-dev/liter-llm) — universal LLM API client with native bindings for 14 languages and 143 providers.
- [tree-sitter-language-pack](https://github.com/kreuzberg-dev/tree-sitter-language-pack) — tree-sitter grammars and code-intelligence primitives.
- [alef](https://github.com/kreuzberg-dev/alef) — the polyglot binding generator that produces this README and all per-language bindings.
- [Discord](https://discord.gg/xt9WY3GnKR) — community, roadmap, announcements.
## License
Elastic-2.0 License — see [LICENSE](../../LICENSE) for details.
## Support
- **Discord Community**: [Join our Discord](https://discord.gg/xt9WY3GnKR)
- **GitHub Issues**: [Report bugs](https://github.com/kreuzberg-dev/kreuzberg/issues)
- **Discussions**: [Ask questions](https://github.com/kreuzberg-dev/kreuzberg/discussions)

View File

@@ -0,0 +1,27 @@
/**
 * Attach the decoded format-specific metadata to a JsFormatMetadata object.
 *
 * `#[napi(getter)]` cannot be used on `#[napi(object)]` types, so the JSON
 * payload stored under key "0" is parsed here and exposed as a read-only,
 * non-enumerable property named after `fmt.format_type`.
 *
 * @param {*} fmt - Candidate metadata object; anything that is not a non-null
 *   object is returned untouched.
 * @returns {*} The same `fmt` reference, possibly with the extra property.
 */
export function wrapFormatMetadata(fmt) {
  if (fmt === null || typeof fmt !== "object") {
    return fmt;
  }

  const { format_type: variantName, 0: rawJson } = fmt;

  if (rawJson) {
    try {
      // Hidden from enumeration so serialising `fmt` still yields the original shape.
      Object.defineProperty(fmt, variantName, {
        value: JSON.parse(rawJson),
        enumerable: false,
        writable: false,
        configurable: false,
      });
    } catch (_err) {
      // Best effort: an unparsable payload (or a property that cannot be
      // defined) leaves `fmt` exactly as it was received.
    }
  }

  return fmt;
}

5488
crates/kreuzberg-node/index.d.ts generated vendored Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,15 @@
{
"name": "@kreuzberg/node-darwin-arm64",
"version": "5.0.0-rc.3",
"license": "Elastic-2.0",
"repository": {
"type": "git",
"url": "git+https://github.com/kreuzberg-dev/kreuzberg.git"
},
"main": "kreuzberg-node.darwin-arm64.node",
"files": ["kreuzberg-node.darwin-arm64.node"],
"os": ["darwin"],
"cpu": ["arm64"],
"engines": { "node": ">= 18" },
"publishConfig": { "access": "public" }
}

View File

@@ -0,0 +1,15 @@
{
"name": "@kreuzberg/node-darwin-x64",
"version": "5.0.0-rc.3",
"license": "Elastic-2.0",
"repository": {
"type": "git",
"url": "git+https://github.com/kreuzberg-dev/kreuzberg.git"
},
"main": "kreuzberg-node.darwin-x64.node",
"files": ["kreuzberg-node.darwin-x64.node"],
"os": ["darwin"],
"cpu": ["x64"],
"engines": { "node": ">= 18" },
"publishConfig": { "access": "public" }
}

View File

@@ -0,0 +1,16 @@
{
"name": "@kreuzberg/node-linux-arm64-gnu",
"version": "5.0.0-rc.3",
"license": "Elastic-2.0",
"repository": {
"type": "git",
"url": "git+https://github.com/kreuzberg-dev/kreuzberg.git"
},
"main": "kreuzberg-node.linux-arm64-gnu.node",
"files": ["kreuzberg-node.linux-arm64-gnu.node"],
"os": ["linux"],
"cpu": ["arm64"],
"libc": ["glibc"],
"engines": { "node": ">= 18" },
"publishConfig": { "access": "public" }
}

View File

@@ -0,0 +1,16 @@
{
"name": "@kreuzberg/node-linux-arm64-musl",
"version": "5.0.0-rc.3",
"license": "Elastic-2.0",
"repository": {
"type": "git",
"url": "git+https://github.com/kreuzberg-dev/kreuzberg.git"
},
"main": "kreuzberg-node.linux-arm64-musl.node",
"files": ["kreuzberg-node.linux-arm64-musl.node"],
"os": ["linux"],
"cpu": ["arm64"],
"libc": ["musl"],
"engines": { "node": ">= 18" },
"publishConfig": { "access": "public" }
}

View File

@@ -0,0 +1,16 @@
{
"name": "@kreuzberg/node-linux-x64-gnu",
"version": "5.0.0-rc.3",
"license": "Elastic-2.0",
"repository": {
"type": "git",
"url": "git+https://github.com/kreuzberg-dev/kreuzberg.git"
},
"main": "kreuzberg-node.linux-x64-gnu.node",
"files": ["kreuzberg-node.linux-x64-gnu.node"],
"os": ["linux"],
"cpu": ["x64"],
"libc": ["glibc"],
"engines": { "node": ">= 18" },
"publishConfig": { "access": "public" }
}

View File

@@ -0,0 +1,16 @@
{
"name": "@kreuzberg/node-linux-x64-musl",
"version": "5.0.0-rc.3",
"license": "Elastic-2.0",
"repository": {
"type": "git",
"url": "git+https://github.com/kreuzberg-dev/kreuzberg.git"
},
"main": "kreuzberg-node.linux-x64-musl.node",
"files": ["kreuzberg-node.linux-x64-musl.node"],
"os": ["linux"],
"cpu": ["x64"],
"libc": ["musl"],
"engines": { "node": ">= 18" },
"publishConfig": { "access": "public" }
}

View File

@@ -0,0 +1,15 @@
{
"name": "@kreuzberg/node-win32-arm64-msvc",
"version": "5.0.0-rc.3",
"license": "Elastic-2.0",
"repository": {
"type": "git",
"url": "git+https://github.com/kreuzberg-dev/kreuzberg.git"
},
"main": "kreuzberg-node.win32-arm64-msvc.node",
"files": ["kreuzberg-node.win32-arm64-msvc.node"],
"os": ["win32"],
"cpu": ["arm64"],
"engines": { "node": ">= 18" },
"publishConfig": { "access": "public" }
}

View File

@@ -0,0 +1,15 @@
{
"name": "@kreuzberg/node-win32-x64-msvc",
"version": "5.0.0-rc.3",
"license": "Elastic-2.0",
"repository": {
"type": "git",
"url": "git+https://github.com/kreuzberg-dev/kreuzberg.git"
},
"main": "kreuzberg-node.win32-x64-msvc.node",
"files": ["kreuzberg-node.win32-x64-msvc.node"],
"os": ["win32"],
"cpu": ["x64"],
"engines": { "node": ">= 18" },
"publishConfig": { "access": "public" }
}

52
crates/kreuzberg-node/package.json generated Normal file
View File

@@ -0,0 +1,52 @@
{
"name": "@kreuzberg/node",
"version": "5.0.0-rc.3",
"description": "High-performance document intelligence library",
"license": "Elastic-2.0",
"repository": {
"type": "git",
"url": "git+https://github.com/kreuzberg-dev/kreuzberg.git"
},
"main": "index.js",
"types": "index.d.ts",
"exports": {
".": {
"types": "./index.d.ts",
"require": "./index.js",
"default": "./index.js"
}
},
"files": ["index.js", "index.d.ts", "*.node"],
"optionalDependencies": {
"@kreuzberg/node-linux-x64-gnu": "5.0.0-rc.3",
"@kreuzberg/node-linux-arm64-gnu": "5.0.0-rc.3",
"@kreuzberg/node-linux-x64-musl": "5.0.0-rc.3",
"@kreuzberg/node-linux-arm64-musl": "5.0.0-rc.3",
"@kreuzberg/node-darwin-x64": "5.0.0-rc.3",
"@kreuzberg/node-darwin-arm64": "5.0.0-rc.3",
"@kreuzberg/node-win32-x64-msvc": "5.0.0-rc.3",
"@kreuzberg/node-win32-arm64-msvc": "5.0.0-rc.3"
},
"napi": {
"packageName": "@kreuzberg/node",
"binaryName": "kreuzberg-node",
"targets": [
"x86_64-unknown-linux-gnu",
"aarch64-unknown-linux-gnu",
"x86_64-unknown-linux-musl",
"aarch64-unknown-linux-musl",
"x86_64-apple-darwin",
"aarch64-apple-darwin",
"x86_64-pc-windows-msvc",
"aarch64-pc-windows-msvc"
]
},
"scripts": {
"build": "napi build --platform --release",
"artifacts": "napi artifacts",
"prepublishOnly": "napi prepublish -t npm --skip-optional-publish"
},
"engines": { "node": ">= 18" },
"publishConfig": { "access": "public" },
"devDependencies": { "@napi-rs/cli": "^3.6.2" }
}

15166
crates/kreuzberg-node/src/lib.rs generated Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,41 @@
[package]
name = "kreuzberg-paddle-ocr"
version.workspace = true
edition = "2024"
rust-version.workspace = true
authors.workspace = true
description = "PaddleOCR via ONNX Runtime for Kreuzberg - high-performance text recognition"
license = "MIT"
repository.workspace = true
homepage = "https://kreuzberg.dev"
documentation = "https://docs.rs/kreuzberg-paddle-ocr"
readme = "README.md"
keywords = ["paddle", "ocr", "onnx", "recognition", "detection"]
categories = ["computer-vision", "text-processing"]
exclude = ["tests/*", ".github/*"]
[package.metadata.docs.rs]
all-features = true
rustdoc-args = ["--cfg", "docsrs"]
[lib]
name = "kreuzberg_paddle_ocr"
crate-type = ["lib"]
[features]
default = []
load-dynamic = ["ort/load-dynamic"]
[dependencies]
geo-clipper = "0.9"
geo-types = "0.7"
image = { workspace = true }
# Crate-specific dependencies (not in workspace)
# Disable rayon - OCR parallelism is handled at higher level
imageproc = { version = "0.26", default-features = false }
ndarray = "0.17"
ort = { workspace = true, features = ["ndarray"] }
# Workspace dependencies
serde = { workspace = true }
thiserror = { workspace = true }

View File

@@ -0,0 +1,22 @@
MIT License
Copyright (c) 2024 mg-chao
Copyright (c) 2025 Na'aman Hirschfeld
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

View File

@@ -0,0 +1,57 @@
# kreuzberg-paddle-ocr
[![Bindings](https://img.shields.io/badge/Bindings-alef%20%D7%90-007ec6)](https://github.com/kreuzberg-dev/alef)
PaddleOCR via ONNX Runtime for Kreuzberg - high-performance text detection and recognition using PaddlePaddle's OCR models.
Based on the original [paddle-ocr-rs](https://github.com/mg-chao/paddle-ocr-rs) by [mg-chao](https://github.com/mg-chao), this vendored version includes improvements for Kreuzberg integration:
- **Workspace Dependency Alignment**: Uses Kreuzberg's workspace dependencies for consistency
- **Edition 2024**: Updated to Rust 2024 edition
- **ndarray Compatibility**: Aligned with Kreuzberg's ndarray version requirements
- **Integration**: Designed to work seamlessly with Kreuzberg's OCR backend system
## Features
- Text detection using DBNet (Differentiable Binarization)
- Text recognition using CRNN (Convolutional Recurrent Neural Network)
- Angle detection for rotated text
- Support for multiple languages via PaddleOCR models
- ONNX Runtime for efficient CPU inference
## ONNX Runtime Requirement
This crate requires **ONNX Runtime 1.24+** at runtime.
Install it:
- **macOS (Homebrew)**: `brew install onnxruntime`
- **Linux**: Download from [ONNX Runtime releases](https://github.com/microsoft/onnxruntime/releases)
- **Windows**: Download from [ONNX Runtime releases](https://github.com/microsoft/onnxruntime/releases)
## Usage
This crate is used internally by Kreuzberg when the `paddle-ocr` feature is enabled:
```toml
[dependencies]
kreuzberg = { version = "4.2", features = ["paddle-ocr"] }
```
## Models
PaddleOCR models are automatically downloaded and cached on first use. Supported models include:
- PP-OCRv5 server detection model
- PP-OCRv5 per-family recognition models (11 script families)
- PPOCRv2 mobile angle classification model
## License
This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
## Acknowledgements
This project is based on the original [paddle-ocr-rs](https://github.com/mg-chao/paddle-ocr-rs) by [mg-chao](https://github.com/mg-chao), originally licensed under Apache-2.0. We are grateful for the foundational work that made this integration possible.
The original paddle-ocr-rs provides Rust bindings for PaddlePaddle's OCR models via ONNX Runtime, enabling efficient text detection and recognition without Python dependencies.

View File

@@ -0,0 +1,139 @@
use crate::{
base_net::BaseNet,
constants::{IMAGENET_MEAN_VALUES, IMAGENET_NORM_VALUES},
ocr_error::OcrError,
ocr_result::Angle,
ocr_utils::OcrUtils,
};
use ort::{
inputs,
session::{Session, SessionOutputs},
value::Tensor,
};
// PP-LCNet_x1_0_textline_ori preprocessing (ImageNet normalization).
// Input: resize to 160×80 (W×H), normalize with ImageNet mean/std.
// Formula in substract_mean_normalize: (pixel - MEAN) * NORM
// For ImageNet: (pixel/255 - mean) / std = (pixel - mean*255) * (1/(std*255))
// V2 PP-LCNet angle classifier expects [3, 80, 160] input (NCHW).
/// Width, in pixels, that every crop is resized to before angle classification.
const ANGLE_DST_WIDTH: u32 = 160;
/// Height, in pixels, that every crop is resized to before angle classification.
const ANGLE_DST_HEIGHT: u32 = 80;
/// Number of leading scores read from the classifier output when picking the
/// winning class (presumably one score per orientation class — confirm against the model).
const ANGLE_COLS: usize = 2;
/// Text-line angle classifier backed by an ONNX Runtime session.
///
/// A freshly constructed value holds no session; inference methods return
/// `OcrError::SessionNotInitialized` until one has been set.
#[derive(Debug)]
pub struct AngleNet {
/// Loaded inference session, or `None` while no model has been loaded.
session: Option<Session>,
/// Input tensor names of the loaded model; the first entry is the one fed during inference.
input_names: Vec<String>,
}
// Storage hooks used by the shared `BaseNet` model-loading logic.
impl BaseNet for AngleNet {
/// Creates an empty classifier with no session and no input names.
fn new() -> Self {
Self {
session: None,
input_names: Vec::new(),
}
}
/// Replaces the stored model input names.
fn set_input_names(&mut self, input_names: Vec<String>) {
self.input_names = input_names;
}
/// Replaces the stored session; passing `None` unloads the model.
fn set_session(&mut self, session: Option<Session>) {
self.session = session;
}
}
impl AngleNet {
/// Computes one [`Angle`] per crop in `part_imgs`, in the same order.
///
/// * `do_angle` — when `false`, no inference runs and every crop receives
///   `Angle::default()`.
/// * `most_angle` — when `true` (and `do_angle` is set), every crop's index is
///   overwritten with the majority vote: `0` if the sum of the individual
///   indices is below half the crop count, otherwise `1`.
/// * `cls_thresh` — confidence threshold forwarded to the per-crop classifier.
///
/// # Errors
///
/// Returns the first error produced while classifying a crop.
pub fn get_angles(
    &self,
    part_imgs: &[image::RgbImage],
    do_angle: bool,
    most_angle: bool,
    cls_thresh: f32,
) -> Result<Vec<Angle>, OcrError> {
    if !do_angle {
        // Classification disabled: hand back neutral angles without touching the model.
        return Ok(part_imgs.iter().map(|_| Angle::default()).collect());
    }

    let mut angles = part_imgs
        .iter()
        .map(|crop| self.get_angle(crop, cls_thresh))
        .collect::<Result<Vec<Angle>, OcrError>>()?;

    if most_angle {
        let votes: i32 = angles.iter().map(|angle| angle.index).sum();
        let half_of_crops = angles.len() as f32 / 2.0;
        let majority_index = if (votes as f32) < half_of_crops { 0 } else { 1 };
        for angle in &mut angles {
            angle.index = majority_index;
        }
    }

    Ok(angles)
}
fn get_angle(&self, img_src: &image::RgbImage, cls_thresh: f32) -> Result<Angle, OcrError> {
let Some(session) = &self.session else {
return Err(OcrError::SessionNotInitialized);
};
let angle_img = image::imageops::resize(
img_src,
ANGLE_DST_WIDTH,
ANGLE_DST_HEIGHT,
image::imageops::FilterType::Triangle,
);
let input_tensors =
OcrUtils::substract_mean_normalize(&angle_img, &IMAGENET_MEAN_VALUES, &IMAGENET_NORM_VALUES);
let input_tensors = Tensor::from_array(input_tensors)?;
// SAFETY: ONNX Runtime C API is thread-safe for concurrent inference.
#[allow(unsafe_code)]
let outputs = unsafe {
let session_ptr = session as *const Session as *mut Session;
(*session_ptr).run(inputs![self.input_names[0].as_str() => input_tensors])?
};
let mut angle = Self::score_to_angle(&outputs, ANGLE_COLS)?;
// Only apply rotation if confidence exceeds threshold (matches PaddleOCR's cls_thresh=0.9)
if angle.score < cls_thresh {
angle.index = 0; // Keep original orientation when confidence is low
}
Ok(angle)
}
fn score_to_angle(output_tensor: &SessionOutputs, angle_cols: usize) -> Result<Angle, OcrError> {
let (_, red_data) = output_tensor.iter().next().ok_or_else(|| {
OcrError::Io(std::io::Error::new(
std::io::ErrorKind::InvalidData,
"No output tensors found in angle classification session output",
))
})?;
let src_data: Vec<f32> = red_data.try_extract_tensor::<f32>()?.1.to_vec();
let mut angle = Angle::default();
let mut max_value = f32::MIN;
let mut angle_index = 0;
for (i, value) in src_data.iter().take(angle_cols).enumerate() {
if *value > max_value {
max_value = *value;
angle_index = i as i32;
}
}
angle.index = angle_index;
angle.score = max_value;
Ok(angle)
}
}

View File

@@ -0,0 +1,78 @@
use ort::session::{
Session,
builder::{GraphOptimizationLevel, SessionBuilder},
};
use crate::ocr_error::OcrError;
/// Shared ONNX Runtime session handling for the OCR networks.
///
/// Implementors only provide storage (`new`, `set_input_names`, `set_session`);
/// session configuration and model loading are supplied by the default methods.
pub trait BaseNet {
    /// Creates a network with no model loaded.
    fn new() -> Self;

    /// Builds the `SessionBuilder` used to load a model.
    ///
    /// When `builder_fn` is `Some`, it receives a fresh builder and its result
    /// is used as-is (`num_thread` is then ignored). Otherwise the builder is
    /// configured with `GraphOptimizationLevel::All`, `num_thread` intra-op
    /// threads and a single inter-op thread.
    ///
    /// # Errors
    ///
    /// Returns `OcrError` if the builder cannot be created or configured.
    fn get_session_builder(
        &self,
        num_thread: usize,
        builder_fn: Option<fn(SessionBuilder) -> Result<SessionBuilder, ort::Error>>,
    ) -> Result<SessionBuilder, OcrError> {
        let builder = Session::builder()?;
        let builder = match builder_fn {
            Some(custom) => custom(builder)?,
            // The builder setters return an error type of their own; it is
            // re-wrapped as a plain `ort::Error` carrying the same message.
            None => builder
                .with_optimization_level(GraphOptimizationLevel::All)
                .map_err(|e| OcrError::Ort(ort::Error::new(e.message())))?
                .with_intra_threads(num_thread)
                .map_err(|e| OcrError::Ort(ort::Error::new(e.message())))?
                .with_inter_threads(1)
                .map_err(|e| OcrError::Ort(ort::Error::new(e.message())))?,
        };
        Ok(builder)
    }

    /// Stores the names of the model's input tensors.
    fn set_input_names(&mut self, input_names: Vec<String>);

    /// Stores (or clears, with `None`) the inference session.
    fn set_session(&mut self, session: Option<Session>);

    /// Adopts an already created session: records its input names, then stores it.
    fn init(&mut self, session: Session) {
        let input_names: Vec<String> = session.inputs().iter().map(|input| input.name().to_string()).collect();
        self.set_input_names(input_names);
        self.set_session(Some(session));
    }

    /// Loads a model from the file at `path` and installs it via [`BaseNet::init`].
    ///
    /// # Errors
    ///
    /// Returns `OcrError` if the session cannot be built or the model cannot
    /// be loaded, and `OcrError::Ort` if session creation panicked.
    fn init_model(
        &mut self,
        path: &str,
        num_thread: usize,
        builder_fn: Option<fn(SessionBuilder) -> Result<SessionBuilder, ort::Error>>,
    ) -> Result<(), OcrError> {
        // Wrap ORT session creation in catch_unwind to prevent mutex poisoning
        // on platforms where ORT initialization can panic (notably Windows).
        // `path` is borrowed directly; no owned copy is needed inside the closure.
        let session = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
            let mut builder = self.get_session_builder(num_thread, builder_fn)?;
            builder.commit_from_file(path).map_err(OcrError::from)
        }))
        .map_err(|_| OcrError::Ort(ort::Error::new("ORT session initialization panicked")))??;
        self.init(session);
        Ok(())
    }

    /// Loads a model from an in-memory buffer and installs it via [`BaseNet::init`].
    ///
    /// # Errors
    ///
    /// Returns `OcrError` if the session cannot be built or the model cannot
    /// be loaded, and `OcrError::Ort` if session creation panicked.
    fn init_model_from_memory(
        &mut self,
        model_bytes: &[u8],
        num_thread: usize,
        builder_fn: Option<fn(SessionBuilder) -> Result<SessionBuilder, ort::Error>>,
    ) -> Result<(), OcrError> {
        // Wrap ORT session creation in catch_unwind to prevent mutex poisoning
        // on platforms where ORT initialization can panic (notably Windows).
        let session = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
            let mut builder = self.get_session_builder(num_thread, builder_fn)?;
            builder.commit_from_memory(model_bytes).map_err(OcrError::from)
        }))
        .map_err(|_| OcrError::Ort(ort::Error::new("ORT session initialization panicked")))??;
        self.init(session);
        Ok(())
    }
}

View File

@@ -0,0 +1,33 @@
//! Shared normalization constants for PaddleOCR preprocessing.
//!
//! Two normalization schemes are used:
//!
//! - **ImageNet** (`IMAGENET_MEAN_VALUES` / `IMAGENET_NORM_VALUES`): used by the text
//! detection network (`DbNet`) and the angle classifier (`AngleNet`).
//! Formula: `(pixel - mean * 255) * (1 / (std * 255))`.
//!
//! - **CRNN** (`CRNN_MEAN_VALUES` / `CRNN_NORM_VALUES`): used by the text recognition
//! network (`CrnnNet`).
//! Formula: `(pixel - 127.5) * (1 / 127.5)`.
/// Per-channel ImageNet means in R, G, B order, expressed on the 0–255 pixel scale.
///
/// Computed from the unit-range means `[0.485, 0.456, 0.406]`.
/// Shared by `DbNet` (text detection) and `AngleNet` (angle classification).
pub(crate) const IMAGENET_MEAN_VALUES: [f32; 3] = [0.485 * 255.0, 0.456 * 255.0, 0.406 * 255.0];

/// Per-channel ImageNet scale factors in R, G, B order: `1 / (std * 255)`.
///
/// Computed from the unit-range standard deviations `[0.229, 0.224, 0.225]`.
/// Shared by `DbNet` (text detection) and `AngleNet` (angle classification).
pub(crate) const IMAGENET_NORM_VALUES: [f32; 3] = [1.0 / (0.229 * 255.0), 1.0 / (0.224 * 255.0), 1.0 / (0.225 * 255.0)];

/// Mean subtracted from every channel before text recognition: the mid-point `127.5`.
///
/// Consumed by `CrnnNet` (text recognition).
pub(crate) const CRNN_MEAN_VALUES: [f32; 3] = [127.5; 3];

/// Scale applied to every channel before text recognition: `1 / 127.5`.
///
/// Consumed by `CrnnNet` (text recognition).
pub(crate) const CRNN_NORM_VALUES: [f32; 3] = [1.0 / 127.5; 3];

View File

@@ -0,0 +1,393 @@
use ndarray::Array4;
use ort::session::Session;
use ort::value::Tensor;
use ort::{inputs, session::builder::SessionBuilder};
use std::collections::HashMap;
use crate::{
base_net::BaseNet,
constants::{CRNN_MEAN_VALUES, CRNN_NORM_VALUES},
ocr_error::OcrError,
ocr_result::TextLine,
ocr_utils::OcrUtils,
};
/// Height, in pixels, that every crop is resized to before recognition; the
/// target width is derived from it so the crop's aspect ratio is kept.
const CRNN_DST_HEIGHT: u32 = 48;
/// Text recognition network backed by an ONNX Runtime session.
///
/// A freshly constructed value holds no session and an empty dictionary;
/// recognition methods return `OcrError::SessionNotInitialized` until a model
/// has been loaded.
#[derive(Debug)]
pub struct CrnnNet {
/// Loaded inference session, or `None` while no model has been loaded.
session: Option<Session>,
/// Character dictionary: a predicted class index is looked up here, with
/// index 0 skipped during decoding as the CTC blank.
keys: Vec<String>,
/// Input tensor names of the loaded model; the first entry is the one fed during inference.
input_names: Vec<String>,
}
// Storage hooks used by the shared `BaseNet` model-loading logic.
impl BaseNet for CrnnNet {
/// Creates an empty recogniser: no session, no dictionary, no input names.
fn new() -> Self {
Self {
session: None,
keys: Vec::new(),
input_names: Vec::new(),
}
}
/// Replaces the stored model input names.
fn set_input_names(&mut self, input_names: Vec<String>) {
self.input_names = input_names;
}
/// Replaces the stored session; passing `None` unloads the model.
fn set_session(&mut self, session: Option<Session>) {
self.session = session;
}
}
impl CrnnNet {
/// Loads the recognition model from the file at `path`, then reads the
/// character dictionary from the model's `character` metadata entry.
///
/// `num_thread` and `builder_fn` are forwarded unchanged to `BaseNet::init_model`.
///
/// # Errors
///
/// Returns any error from session creation, or from reading the dictionary
/// (for example when the metadata entry is missing). In the latter case the
/// session has already been stored.
pub fn init_model(
&mut self,
path: &str,
num_thread: usize,
builder_fn: Option<fn(SessionBuilder) -> Result<SessionBuilder, ort::Error>>,
) -> Result<(), OcrError> {
// Explicit trait call: the inherent method of the same name would otherwise shadow it.
BaseNet::init_model(self, path, num_thread, builder_fn)?;
self.keys = self.get_keys()?;
Ok(())
}
/// Loads the recognition model from the file at `path` and the character
/// dictionary from the separate text file at `dict_file_path`.
///
/// `num_thread` and `builder_fn` are forwarded unchanged to `BaseNet::init_model`.
///
/// # Errors
///
/// Returns any error from session creation or from reading the dictionary
/// file. In the latter case the session has already been stored.
pub fn init_model_dict_file(
&mut self,
path: &str,
num_thread: usize,
builder_fn: Option<fn(SessionBuilder) -> Result<SessionBuilder, ort::Error>>,
dict_file_path: &str,
) -> Result<(), OcrError> {
BaseNet::init_model(self, path, num_thread, builder_fn)?;
self.read_keys_from_file(dict_file_path)?;
Ok(())
}
/// Loads the recognition model from `model_bytes`, then reads the character
/// dictionary from the model's `character` metadata entry.
///
/// `num_thread` and `builder_fn` are forwarded unchanged to
/// `BaseNet::init_model_from_memory`.
///
/// # Errors
///
/// Returns any error from session creation, or from reading the dictionary
/// (for example when the metadata entry is missing). In the latter case the
/// session has already been stored.
pub fn init_model_from_memory(
&mut self,
model_bytes: &[u8],
num_thread: usize,
builder_fn: Option<fn(SessionBuilder) -> Result<SessionBuilder, ort::Error>>,
) -> Result<(), OcrError> {
// Explicit trait call: the inherent method of the same name would otherwise shadow it.
BaseNet::init_model_from_memory(self, model_bytes, num_thread, builder_fn)?;
self.keys = self.get_keys()?;
Ok(())
}
fn get_keys(&mut self) -> Result<Vec<String>, OcrError> {
let session = self.session.as_ref().ok_or(OcrError::SessionNotInitialized)?;
let metadata = session.metadata()?;
let model_charater_list = metadata.custom("character").ok_or_else(|| {
OcrError::Io(std::io::Error::new(
std::io::ErrorKind::NotFound,
"crnn_net character not found in metadata",
))
})?;
// PP-OCRv5 model metadata already includes the CTC blank token ("#") at
// index 0 and the space token (" ") at the end. Do NOT prepend/append
// extra tokens — doing so shifts every character index by one and
// produces garbled output.
let keys: Vec<String> = model_charater_list.split('\n').map(|s: &str| s.to_string()).collect();
Ok(keys)
}
/// Replaces the character dictionary with the contents of the file at `path`.
///
/// The file holds one dictionary entry per line; line order defines the class
/// index of each entry. Both LF and CRLF line endings are accepted.
///
/// # Errors
///
/// Returns `OcrError` if the file cannot be read as UTF-8 text.
fn read_keys_from_file(&mut self, path: &str) -> Result<(), OcrError> {
    let content = std::fs::read_to_string(path)?;
    // PP-OCRv5 dict files already include the CTC blank token ("#") at
    // index 0 and the space token (" ") at the end. Do NOT prepend/append
    // extra tokens — doing so shifts every character index by one and
    // produces garbled output.
    //
    // Lines are split on '\n' (not `str::lines`) so the entry count matches
    // the file exactly. A trailing '\r' left by CRLF files is stripped;
    // otherwise every recognised character would carry a carriage return.
    // Only '\r' is removed, so the final " " entry survives.
    self.keys = content
        .split('\n')
        .map(|line| line.strip_suffix('\r').unwrap_or(line).to_string())
        .collect();
    Ok(())
}
/// Recognises the text in every crop of `part_imgs`, returning one
/// [`TextLine`] per crop in input order.
///
/// All crops are first recognised in batches of `batch_size`. Any result whose
/// score is NaN or below `angle_rollback_threshold` is then recognised again
/// from the image stored under the same index in `angle_rollback_records`, if
/// there is one; the second result replaces the first unconditionally.
///
/// # Errors
///
/// Returns the first error raised by either recognition pass.
pub fn get_text_lines(
    &self,
    part_imgs: &[image::RgbImage],
    angle_rollback_records: &HashMap<usize, image::RgbImage>,
    angle_rollback_threshold: f32,
    batch_size: u32,
) -> Result<Vec<TextLine>, OcrError> {
    if part_imgs.is_empty() {
        return Ok(Vec::new());
    }

    // First pass: batched recognition of every crop.
    let mut text_lines = self.get_text_lines_batched(part_imgs, batch_size)?;

    // Second pass: retry poorly scored crops from their rollback image.
    for (index, text_line) in text_lines.iter_mut().enumerate() {
        let score = text_line.text_score;
        let scored_poorly = score.is_nan() || score < angle_rollback_threshold;
        if !scored_poorly {
            continue;
        }
        if let Some(rollback_img) = angle_rollback_records.get(&index) {
            *text_line = self.get_text_line(rollback_img)?;
        }
    }

    Ok(text_lines)
}
/// Batch recognition: sort crops by width, group into batches, pad to max width,
/// run single ONNX inference per batch. Matches PaddleOCR/RapidOCR batching strategy.
fn get_text_lines_batched(
&self,
part_imgs: &[image::RgbImage],
batch_size: u32,
) -> Result<Vec<TextLine>, OcrError> {
let session = self.session.as_ref().ok_or(OcrError::SessionNotInitialized)?;
let batch_size = (batch_size as usize).max(1);
// Compute target widths and sort indices by aspect ratio (width/height)
let mut indexed_widths: Vec<(usize, u32)> = part_imgs
.iter()
.enumerate()
.map(|(i, img)| {
let scale = CRNN_DST_HEIGHT as f32 / img.height().max(1) as f32;
let dst_width = (img.width() as f32 * scale).ceil() as u32;
(i, dst_width.max(1))
})
.collect();
indexed_widths.sort_by_key(|&(_, w)| w);
let mut results: Vec<(usize, TextLine)> = Vec::with_capacity(part_imgs.len());
// Process in batches
for chunk in indexed_widths.chunks(batch_size) {
if chunk.len() == 1 {
// Single image — use existing path (no padding overhead)
let (orig_idx, _) = chunk[0];
let text_line = self.get_text_line(&part_imgs[orig_idx])?;
results.push((orig_idx, text_line));
continue;
}
// Find max width in this batch
let max_width = chunk.iter().map(|&(_, w)| w).max().unwrap_or(1);
// Build batch tensor [N, 3, 48, max_width] with zero-padding
let n = chunk.len();
let mut batch_data = Array4::<f32>::zeros((n, 3, CRNN_DST_HEIGHT as usize, max_width as usize));
for (batch_idx, &(orig_idx, dst_width)) in chunk.iter().enumerate() {
let img = &part_imgs[orig_idx];
let resized =
image::imageops::resize(img, dst_width, CRNN_DST_HEIGHT, image::imageops::FilterType::Triangle);
// Normalize and fill into batch tensor (zero-padded on right).
// Use raw slice access instead of per-pixel get_pixel() to
// eliminate millions of bounds checks in the hot loop.
let cols = resized.width() as usize;
let rows = resized.height() as usize;
let raw = resized.as_raw();
assert_eq!(raw.len(), rows * cols * 3, "unexpected image buffer size");
let adjusted = [
CRNN_MEAN_VALUES[0] * CRNN_NORM_VALUES[0],
CRNN_MEAN_VALUES[1] * CRNN_NORM_VALUES[1],
CRNN_MEAN_VALUES[2] * CRNN_NORM_VALUES[2],
];
for r in 0..rows {
for c in 0..cols {
let base = r * cols * 3 + c * 3;
for ch in 0..3 {
batch_data[[batch_idx, ch, r, c]] =
raw[base + ch] as f32 * CRNN_NORM_VALUES[ch] - adjusted[ch];
}
}
}
// Remaining columns stay zero (padding)
}
let input_tensor = Tensor::from_array(batch_data)?;
// SAFETY: ONNX Runtime C API is thread-safe for concurrent inference.
#[allow(unsafe_code)]
let outputs = unsafe {
let session_ptr = session as *const Session as *mut Session;
(*session_ptr).run(inputs![self.input_names[0].as_str() => input_tensor])?
};
let (_, output_value) = outputs.iter().next().ok_or_else(|| {
OcrError::Io(std::io::Error::new(
std::io::ErrorKind::InvalidData,
"No output tensors found in batched CRNN session output",
))
})?;
let (shape, flat_data) = output_value.try_extract_tensor::<f32>()?;
// Shape: [batch, timesteps, num_classes]
let batch_dim = *shape.first().unwrap_or(&1) as usize;
let timesteps = *shape.get(1).unwrap_or(&0) as usize;
let num_classes = *shape.get(2).unwrap_or(&0) as usize;
for (batch_idx, item) in chunk.iter().enumerate().take(batch_dim.min(n)) {
let offset = batch_idx * timesteps * num_classes;
let slice = &flat_data[offset..offset + timesteps * num_classes];
let text_line = Self::score_to_text_line(slice, timesteps, num_classes, &self.keys)?;
results.push((item.0, text_line));
}
}
// Reorder results back to original index order
results.sort_by_key(|&(idx, _)| idx);
Ok(results.into_iter().map(|(_, tl)| tl).collect())
}
/// Recognises the text in a single crop.
///
/// The crop is scaled to `CRNN_DST_HEIGHT` pixels high (width follows the
/// aspect ratio), normalised with the CRNN constants, fed to the model's first
/// input, and the first output is decoded into a [`TextLine`].
///
/// # Errors
///
/// * `OcrError::SessionNotInitialized` if no model has been loaded.
/// * `OcrError::Io` (`InvalidData`) if the model exposes no input names,
///   produces no output, or the output lacks dimensions 1 and 2.
/// * Any error raised by tensor creation or inference.
fn get_text_line(&self, img_src: &image::RgbImage) -> Result<TextLine, OcrError> {
    let Some(session) = &self.session else {
        return Err(OcrError::SessionNotInitialized);
    };
    // The session and its input names are set through separate setters, so a
    // session without names is possible; report it instead of panicking on `[0]`.
    let input_name = self.input_names.first().ok_or_else(|| {
        OcrError::Io(std::io::Error::new(
            std::io::ErrorKind::InvalidData,
            "CRNN model has no input names",
        ))
    })?;

    // Same guards as the batched path: a zero-height crop must not produce an
    // infinite scale, and the target width must be at least one pixel.
    let scale = CRNN_DST_HEIGHT as f32 / img_src.height().max(1) as f32;
    let dst_width = ((img_src.width() as f32 * scale).ceil() as u32).max(1);
    let src_resize = image::imageops::resize(
        img_src,
        dst_width,
        CRNN_DST_HEIGHT,
        image::imageops::FilterType::Triangle,
    );
    let input_tensors = OcrUtils::substract_mean_normalize(&src_resize, &CRNN_MEAN_VALUES, &CRNN_NORM_VALUES);
    let input_tensors = Tensor::from_array(input_tensors)?;

    // SAFETY: ONNX Runtime C API is thread-safe for concurrent inference.
    // NOTE(review): that argument covers the C side only. Obtaining a `&mut Session`
    // from a pointer derived from `&Session` violates Rust's aliasing rules unless
    // `Session` relies purely on interior mutability — consider storing the session
    // behind a `Mutex` (or similar) so this cast can be removed.
    #[allow(unsafe_code)]
    let outputs = unsafe {
        let session_ptr = session as *const Session as *mut Session;
        (*session_ptr).run(inputs![input_name.as_str() => input_tensors])?
    };

    let (_, red_data) = outputs.iter().next().ok_or_else(|| {
        OcrError::Io(std::io::Error::new(
            std::io::ErrorKind::InvalidData,
            "No output tensors found in CRNN session output",
        ))
    })?;
    let (shape, src_data) = red_data.try_extract_tensor::<f32>()?;

    // Output layout is [batch, timesteps, num_classes]; for decoding the
    // timesteps act as rows ("height") and the classes as columns ("width").
    let height = *shape.get(1).ok_or_else(|| {
        OcrError::Io(std::io::Error::new(
            std::io::ErrorKind::InvalidData,
            "CRNN output tensor missing height dimension (index 1)",
        ))
    })? as usize;
    let width = *shape.get(2).ok_or_else(|| {
        OcrError::Io(std::io::Error::new(
            std::io::ErrorKind::InvalidData,
            "CRNN output tensor missing width dimension (index 2)",
        ))
    })? as usize;

    // The extracted slice is decoded in place; no owned copy is required.
    Self::score_to_text_line(src_data, height, width, &self.keys)
}
/// Greedy CTC decoding of a row-major `height` × `width` score matrix.
///
/// For each of the `height` rows (timesteps) the highest-scoring column
/// (class) is selected, the earliest one on ties. The class contributes its
/// `keys` entry to the text unless it is class `0` (blank), lies outside
/// `keys`, or repeats the class selected on the previous row. `text_score` is
/// the mean winning score of the contributing rows, or `0.0` if none contributed.
///
/// # Panics
///
/// Panics if a row starts beyond the end of `output_data`.
fn score_to_text_line(
    output_data: &[f32],
    height: usize,
    width: usize,
    keys: &[String],
) -> Result<TextLine, OcrError> {
    let mut decoded = TextLine::default();
    let mut previous_class = 0usize;
    let mut score_total = 0.0f32;
    let mut emitted = 0i32;

    for step in 0..height {
        let row_start = step * width;
        let row_end = ((step + 1) * width).min(output_data.len());
        let row = &output_data[row_start..row_end];

        // Arg-max over the row; strict `>` keeps the first maximum.
        let mut best_class = 0usize;
        let mut best_score = f32::MIN;
        for (class, &score) in row.iter().enumerate() {
            if score > best_score {
                best_class = class;
                best_score = score;
            }
        }

        let is_blank = best_class == 0;
        let in_dictionary = best_class < keys.len();
        let repeats_previous = step > 0 && best_class == previous_class;
        if !is_blank && in_dictionary && !repeats_previous {
            decoded.text.push_str(&keys[best_class]);
            score_total += best_score;
            emitted += 1;
        }
        previous_class = best_class;
    }

    // No emitted characters means there is nothing to average.
    decoded.text_score = if emitted == 0 {
        0.0
    } else {
        score_total / emitted as f32
    };
    Ok(decoded)
}
}
#[cfg(test)]
mod tests {
    use super::*;

    /// A timestep whose best class is index 0 (the CTC blank) emits nothing.
    #[test]
    fn test_score_to_text_line_skips_blank_index() {
        // keys[0] = "#" (CTC blank), keys[1] = "a", keys[2] = "b"
        let keys = vec!["#".to_string(), "a".to_string(), "b".to_string()];
        // 3 timesteps, 3 classes each. Simulate: blank, "a", "b"
        let output = vec![
            1.0, 0.0, 0.0, // timestep 0: max at index 0 (blank) -> skip
            0.0, 0.9, 0.1, // timestep 1: max at index 1 ("a")
            0.0, 0.1, 0.8, // timestep 2: max at index 2 ("b")
        ];
        let result = CrnnNet::score_to_text_line(&output, 3, 3, &keys).unwrap();
        assert_eq!(result.text, "ab");
    }

    /// Consecutive timesteps that select the same class emit it only once.
    #[test]
    fn test_score_to_text_line_deduplicates_consecutive() {
        let keys = vec!["#".to_string(), "h".to_string(), "i".to_string()];
        // 4 timesteps: "h", "h", "i", "i" -> should deduplicate to "hi"
        let output = vec![
            0.0, 0.9, 0.0, // "h"
            0.0, 0.8, 0.0, // "h" again (same index, skip)
            0.0, 0.0, 0.9, // "i"
            0.0, 0.0, 0.8, // "i" again (same index, skip)
        ];
        let result = CrnnNet::score_to_text_line(&output, 4, 3, &keys).unwrap();
        assert_eq!(result.text, "hi");
    }

    /// A dictionary file is loaded verbatim, keeping the blank first and the space last.
    #[test]
    fn test_read_keys_from_file_preserves_dict_layout() {
        // Per-process directory: a fixed name would let concurrently running
        // test processes delete or overwrite each other's fixture.
        let dir = std::env::temp_dir().join(format!("kreuzberg_test_dict_{}", std::process::id()));
        std::fs::create_dir_all(&dir).unwrap();
        let dict_path = dir.join("test_dict.txt");
        // PP-OCRv5 dict files already include "#" (blank) at start and " " at end.
        std::fs::write(&dict_path, "#\na\nb\nc\n ").unwrap();

        let mut net = CrnnNet::new();
        net.read_keys_from_file(dict_path.to_str().unwrap()).unwrap();

        // Dict is loaded as-is: ["#", "a", "b", "c", " "]
        assert_eq!(net.keys[0], "#");
        assert_eq!(net.keys[1], "a");
        assert_eq!(net.keys[2], "b");
        assert_eq!(net.keys[3], "c");
        assert_eq!(net.keys[net.keys.len() - 1], " ");

        std::fs::remove_dir_all(&dir).ok();
    }
}

View File

@@ -0,0 +1,421 @@
use crate::{
base_net::BaseNet,
constants::{IMAGENET_MEAN_VALUES, IMAGENET_NORM_VALUES},
ocr_error::OcrError,
ocr_result::{self, TextBox},
ocr_utils::OcrUtils,
scale_param::ScaleParam,
};
use geo_clipper::{Clipper, EndType, JoinType};
use geo_types::{Coord, LineString, Polygon};
use ort::{inputs, session::SessionOutputs};
use ort::{session::Session, value::Tensor};
use std::cmp::Ordering;
/// Text-detection network wrapper.
///
/// Holds the ONNX Runtime session used by [`DbNet::get_text_boxes`] together with
/// the model's input names. Both stay empty until they are supplied through the
/// [`BaseNet`] setters.
#[derive(Debug)]
pub struct DbNet {
    /// Inference session; `None` until a session has been set.
    session: Option<Session>,
    /// Names of the model inputs; the first entry is the one fed during inference.
    input_names: Vec<String>,
}

impl BaseNet for DbNet {
    /// Creates a network with no session and no input names.
    fn new() -> Self {
        DbNet {
            input_names: Vec::new(),
            session: None,
        }
    }

    /// Replaces the stored model input names.
    fn set_input_names(&mut self, input_names: Vec<String>) {
        self.input_names = input_names;
    }

    /// Replaces the stored session (pass `None` to clear it).
    fn set_session(&mut self, session: Option<Session>) {
        self.session = session;
    }
}
impl DbNet {
pub fn get_text_boxes(
&self,
img_src: &image::RgbImage,
scale: &ScaleParam,
box_score_thresh: f32,
box_thresh: f32,
un_clip_ratio: f32,
thresh: f32,
) -> Result<Vec<TextBox>, OcrError> {
let Some(session) = &self.session else {
return Err(OcrError::SessionNotInitialized);
};
let src_resize = image::imageops::resize(
img_src,
scale.dst_width,
scale.dst_height,
image::imageops::FilterType::Triangle,
);
let input_tensors =
OcrUtils::substract_mean_normalize(&src_resize, &IMAGENET_MEAN_VALUES, &IMAGENET_NORM_VALUES);
let tensor = Tensor::from_array(input_tensors)?;
// SAFETY: ONNX Runtime's C API (OrtRun) is thread-safe for concurrent inference
// on the same session. The ort crate's `&mut self` requirement is overly
// conservative. This matches the pattern used in kreuzberg's embedding engine.
#[allow(unsafe_code)]
let outputs = unsafe {
let session_ptr = session as *const Session as *mut Session;
(*session_ptr).run(inputs![self.input_names[0].as_str() => tensor])?
};
let text_boxes = Self::get_text_boxes_core(
&outputs,
src_resize.height(),
src_resize.width(),
&ScaleParam::new(
scale.src_width,
scale.src_height,
scale.dst_width,
scale.dst_height,
scale.scale_width,
scale.scale_height,
),
box_score_thresh,
box_thresh,
un_clip_ratio,
thresh,
)?;
Ok(text_boxes)
}
fn get_text_boxes_core(
output_tensor: &SessionOutputs,
rows: u32,
cols: u32,
s: &ScaleParam,
box_score_thresh: f32,
_box_thresh: f32,
un_clip_ratio: f32,
thresh: f32,
) -> Result<Vec<TextBox>, OcrError> {
let max_side_thresh = 3.0;
let (_, red_data) = output_tensor.iter().next().ok_or_else(|| {
OcrError::Io(std::io::Error::new(
std::io::ErrorKind::InvalidData,
"No output tensors found in session output",
))
})?;
let pred_data: Vec<f32> = red_data.try_extract_tensor::<f32>()?.1.to_vec();
let cbuf_data: Vec<u8> = pred_data.iter().map(|pixel| (pixel * 255.0) as u8).collect();
let pred_img: image::ImageBuffer<image::Luma<f32>, Vec<f32>> =
image::ImageBuffer::from_vec(cols, rows, pred_data).ok_or_else(|| {
OcrError::Io(std::io::Error::new(
std::io::ErrorKind::InvalidData,
format!(
"Failed to create image buffer from predictions: {} x {} dimensions may be invalid",
cols, rows
),
))
})?;
let cbuf_img = image::GrayImage::from_vec(cols, rows, cbuf_data).ok_or_else(|| {
OcrError::Io(std::io::Error::new(
std::io::ErrorKind::InvalidData,
format!(
"Failed to create grayscale image buffer: {} x {} dimensions may be invalid",
cols, rows
),
))
})?;
let threshold_img = imageproc::contrast::threshold(
&cbuf_img,
(thresh * 255.0) as u8,
imageproc::contrast::ThresholdType::Binary,
);
// RapidOCR and PaddleOCR reference do NOT apply dilation before contour extraction.
// Dilation merges adjacent text regions, causing word concatenation.
let img_contours: Vec<imageproc::contours::Contour<i32>> = imageproc::contours::find_contours(&threshold_img);
// Pre-allocate based on contour count to avoid repeated reallocations.
let mut rs_boxes = Vec::with_capacity(img_contours.len());
for contour in img_contours {
if contour.points.len() <= 2 {
continue;
}
let mut max_side = 0.0;
let min_box = Self::get_mini_box(&contour.points, &mut max_side)?;
if max_side < max_side_thresh {
continue;
}
let score = Self::get_score(&contour, &pred_img)?;
if score < box_score_thresh {
continue;
}
let clip_box = Self::unclip(&min_box, un_clip_ratio)?;
if clip_box.is_empty() {
continue;
}
let mut clip_contour = Vec::new();
for point in &clip_box {
clip_contour.push(*point);
}
let mut max_side_clip = 0.0;
let clip_min_box = Self::get_mini_box(&clip_contour, &mut max_side_clip)?;
if max_side_clip < max_side_thresh + 2.0 {
continue;
}
let mut final_points = Vec::new();
for item in clip_min_box {
let x = (item.x / s.scale_width) as u32;
let ptx = x.min(s.src_width);
let y = (item.y / s.scale_height) as u32;
let pty = y.min(s.src_height);
final_points.push(ocr_result::Point { x: ptx, y: pty });
}
let text_box = TextBox {
score,
points: final_points,
};
rs_boxes.push(text_box);
}
Ok(rs_boxes)
}
/// Computes the minimum-area rectangle around `contour_points`.
///
/// Writes the length of the rectangle's shorter side into `min_edge_size` and
/// returns the four corners ordered as: left corner with the smaller y, right
/// corner with the smaller y, right corner with the larger y, left corner with the
/// larger y (top-left, top-right, bottom-right, bottom-left when y grows downward).
fn get_mini_box(
    contour_points: &[imageproc::point::Point<i32>],
    min_edge_size: &mut f32,
) -> Result<Vec<imageproc::point::Point<f32>>, OcrError> {
    let rect = imageproc::geometry::min_area_rect(contour_points);
    let mut corners: Vec<imageproc::point::Point<f32>> = rect
        .iter()
        .map(|p| imageproc::point::Point::new(p.x as f32, p.y as f32))
        .collect();

    // Euclidean distance between two corners, written out with plain multiplication.
    let side = |a: imageproc::point::Point<f32>, b: imageproc::point::Point<f32>| -> f32 {
        let dx = a.x - b.x;
        let dy = a.y - b.y;
        (dx * dx + dy * dy).sqrt()
    };
    let width = side(corners[0], corners[1]);
    let height = side(corners[1], corners[2]);
    *min_edge_size = width.min(height);

    // Stable sort by x; incomparable values fall through to `Less`.
    corners.sort_by(|a, b| a.x.partial_cmp(&b.x).unwrap_or(Ordering::Less));

    // The two leftmost corners come first after sorting, the two rightmost last.
    let (index_1, index_4) = if corners[1].y > corners[0].y { (0, 1) } else { (1, 0) };
    let (index_2, index_3) = if corners[3].y > corners[2].y { (2, 3) } else { (3, 2) };

    Ok(vec![
        corners[index_1],
        corners[index_2],
        corners[index_3],
        corners[index_4],
    ])
}
/// Returns the mean value of `f_map_mat` over the pixels covered by `contour`.
///
/// The contour's bounding box is clamped to the map, the contour is rasterized as a
/// filled polygon into a mask of that size, and the masked mean of the matching
/// crop is returned. An empty region of interest yields `0.0`.
fn get_score(
    contour: &imageproc::contours::Contour<i32>,
    f_map_mat: &image::ImageBuffer<image::Luma<f32>, Vec<f32>>,
) -> Result<f32, OcrError> {
    // Bounding box of the contour.
    let (mut xmin, mut xmax, mut ymin, mut ymax) = (i32::MAX, i32::MIN, i32::MAX, i32::MIN);
    for p in &contour.points {
        xmin = xmin.min(p.x);
        xmax = xmax.max(p.x);
        ymin = ymin.min(p.y);
        ymax = ymax.max(p.y);
    }

    // Clamp the box to the map.
    let map_w = f_map_mat.width() as i32;
    let map_h = f_map_mat.height() as i32;
    let xmin = xmin.max(0).min(map_w - 1);
    let xmax = xmax.max(0).min(map_w - 1);
    let ymin = ymin.max(0).min(map_h - 1);
    let ymax = ymax.max(0).min(map_h - 1);

    let roi_width = xmax - xmin + 1;
    let roi_height = ymax - ymin + 1;
    if roi_width <= 0 || roi_height <= 0 {
        return Ok(0.0);
    }

    // Rasterize the contour, shifted into ROI-local coordinates, as a filled mask.
    let mut mask = image::GrayImage::new(roi_width as u32, roi_height as u32);
    let local_points: Vec<imageproc::point::Point<i32>> = contour
        .points
        .iter()
        .map(|p| imageproc::point::Point::new(p.x - xmin, p.y - ymin))
        .collect();
    imageproc::drawing::draw_polygon_mut(&mut mask, local_points.as_slice(), image::Luma([255]));

    let roi = image::imageops::crop_imm(f_map_mat, xmin as u32, ymin as u32, roi_width as u32, roi_height as u32)
        .to_image();
    Ok(OcrUtils::calculate_mean_with_mask(&roi, &mask))
}
/// Expands the quadrilateral `box_points` outward and returns the new outline.
///
/// The offset distance is `area * unclip_ratio / perimeter`. An empty vector is
/// returned when both measured sides are shorter than 1.001 or when the offset
/// produces no polygon; otherwise the exterior ring of the first resulting polygon
/// is returned with coordinates truncated to `i32`.
fn unclip(
    box_points: &[imageproc::point::Point<f32>],
    unclip_ratio: f32,
) -> Result<Vec<imageproc::point::Point<i32>>, OcrError> {
    // Euclidean length of the edge between two corners of the box.
    let edge = |from: usize, to: usize| -> f32 {
        let dx = box_points[from].x - box_points[to].x;
        let dy = box_points[from].y - box_points[to].y;
        (dx * dx + dy * dy).sqrt()
    };
    let clip_rect_width = edge(0, 1);
    let clip_rect_height = edge(1, 2);
    if clip_rect_height < 1.001 && clip_rect_width < 1.001 {
        return Ok(Vec::new());
    }

    let ring: Vec<_> = box_points
        .iter()
        .map(|pt| Coord {
            x: pt.x as f64,
            y: pt.y as f64,
        })
        .collect();

    let area = Self::signed_polygon_area(box_points).abs();
    let perimeter = Self::length_of_points(box_points);
    let distance = area * unclip_ratio / perimeter as f32;

    let outline = Polygon::new(LineString::new(ring), vec![]);
    let solution = outline
        .offset(distance as f64, JoinType::Round(2.0), EndType::ClosedPolygon, 1.0)
        .0;

    // No resulting polygon means there is nothing to return.
    let Some(first_polygon) = solution.first() else {
        return Ok(Vec::new());
    };
    Ok(first_polygon
        .exterior()
        .points()
        .map(|ip| imageproc::point::Point::new(ip.x() as i32, ip.y() as i32))
        .collect())
}
/// Returns the signed area of the closed polygon described by `points`.
///
/// Sums `(x[i+1] - x[i]) * (y[i+1] + y[i]) / 2` over every edge, including the
/// closing edge from the last point back to the first. The sign depends on the
/// winding direction of `points`; callers that need a magnitude take `.abs()`.
/// An empty slice has area `0.0`.
fn signed_polygon_area(points: &[imageproc::point::Point<f32>]) -> f32 {
    // The previous implementation indexed `points[0]` unconditionally and panicked here.
    if points.is_empty() {
        return 0.0;
    }
    // Pair every vertex with its successor, wrapping around to the first vertex,
    // without copying the slice into a temporary vector.
    let successors = points.iter().cycle().skip(1);
    points.iter().zip(successors).fold(0.0, |area, (current, next)| {
        area + (next.x - current.x) * (next.y + current.y) / 2.0
    })
}
/// Returns the perimeter of the closed polygon described by `box_points`.
///
/// Edge lengths are accumulated in `f64`, in order, and include the closing edge
/// from the last point back to the first. An empty slice has length `0.0`.
fn length_of_points(box_points: &[imageproc::point::Point<f32>]) -> f64 {
    if box_points.is_empty() {
        return 0.0;
    }
    // Each vertex paired with the next one; the last vertex pairs with the first.
    let successors = box_points.iter().cycle().skip(1);
    box_points.iter().zip(successors).fold(0.0, |total, (from, to)| {
        let dx = to.x as f64 - from.x as f64;
        let dy = to.y as f64 - from.y as f64;
        total + (dx * dx + dy * dy).sqrt()
    })
}
}

View File

@@ -0,0 +1,32 @@
//! # kreuzberg-paddle-ocr
//!
//! PaddleOCR via ONNX Runtime for Kreuzberg - high-performance text detection and recognition.
//!
//! This crate is vendored from [paddle-ocr-rs](https://github.com/mg-chao/paddle-ocr-rs)
//! by mg-chao, with modifications for Kreuzberg integration.
//!
//! ## ONNX Runtime Requirement
//!
//! Requires **ONNX Runtime 1.24+** at runtime.
//!
//! ## Original License
//!
//! The original paddle-ocr-rs is licensed under Apache-2.0.
//! This vendored version is relicensed to MIT with the original author's copyright retained.
#![allow(clippy::too_many_arguments)]
pub mod angle_net;
pub mod base_net;
pub(crate) mod constants;
pub mod crnn_net;
pub mod db_net;
pub mod ocr_error;
pub mod ocr_lite;
pub mod ocr_result;
pub mod ocr_utils;
pub mod scale_param;
pub use ocr_error::OcrError;
pub use ocr_lite::OcrLite;
pub use ocr_result::{Angle, OcrResult, Point, TextBlock, TextBox, TextLine};

View File

@@ -0,0 +1,13 @@
use thiserror::Error;
/// Error type returned by the OCR pipeline.
///
/// The wrapping variants carry `#[from]`, so the underlying error types convert
/// into `OcrError` automatically when propagated with `?`.
#[derive(Error, Debug)]
pub enum OcrError {
/// An error reported by the `ort` crate.
#[error("Ort error: {0}")]
Ort(#[from] ort::Error),
/// An I/O error; also used in this crate to report invalid data.
#[error("IO error: {0}")]
Io(#[from] std::io::Error),
/// An error reported by the `image` crate.
#[error("Image error: {0}")]
ImageError(#[from] image::ImageError),
/// Inference was requested on a network whose session has not been set.
#[error("Session not initialized")]
SessionNotInitialized,
}

View File

@@ -0,0 +1,447 @@
use std::collections::HashMap;
use image::ImageBuffer;
use ort::session::builder::SessionBuilder;
use crate::{
angle_net::AngleNet,
base_net::BaseNet,
crnn_net::CrnnNet,
db_net::DbNet,
ocr_error::OcrError,
ocr_result::{OcrResult, Point, TextBlock, TextBox},
ocr_utils::OcrUtils,
scale_param::ScaleParam,
};
/// OCR pipeline that chains text detection, angle classification and text recognition.
///
/// Models must be loaded with one of the `init_models*` methods before calling
/// any `detect*` method.
#[derive(Debug)]
pub struct OcrLite {
// Produces the text boxes (`get_text_boxes`).
db_net: DbNet,
// Classifies the angle of each cropped text region (`get_angles`).
angle_net: AngleNet,
// Recognizes the text in each cropped region (`get_text_lines`).
crnn_net: CrnnNet,
}
// SAFETY: OcrLite inference methods (&self) use unsafe pointer casts to call
// ort Session::run, which is thread-safe at the ONNX Runtime C API level.
// After initialization (&mut self), no mutable state is accessed during inference.
// NOTE(review): this relies on the three wrapped networks holding no other
// thread-unsafe state — confirm whenever a field is added to them.
unsafe impl Send for OcrLite {}
unsafe impl Sync for OcrLite {}
impl Default for OcrLite {
fn default() -> Self {
Self::new()
}
}
impl OcrLite {
/// Creates a pipeline whose three networks are freshly constructed and have no
/// models loaded yet.
pub fn new() -> Self {
    let db_net = DbNet::new();
    let angle_net = AngleNet::new();
    let crnn_net = CrnnNet::new();
    Self {
        db_net,
        angle_net,
        crnn_net,
    }
}
/// Initializes the detection, angle and recognition networks from `det_path`,
/// `cls_path` and `rec_path` respectively.
///
/// `num_thread` is forwarded to each network's `init_model`; no custom session
/// builder is supplied.
///
/// # Errors
///
/// Returns the first initialization error. Networks after the failing one are
/// left uninitialized.
pub fn init_models(
&mut self,
det_path: &str,
cls_path: &str,
rec_path: &str,
num_thread: usize,
) -> Result<(), OcrError> {
self.db_net.init_model(det_path, num_thread, None)?;
self.angle_net.init_model(cls_path, num_thread, None)?;
self.crnn_net.init_model(rec_path, num_thread, None)?;
Ok(())
}
/// Like `init_models`, but initializes the recognition network through
/// `init_model_dict_file`, passing `dict_path` as its dictionary file.
///
/// # Errors
///
/// Returns the first initialization error. Networks after the failing one are
/// left uninitialized.
pub fn init_models_with_dict(
&mut self,
det_path: &str,
cls_path: &str,
rec_path: &str,
dict_path: &str,
num_thread: usize,
) -> Result<(), OcrError> {
self.db_net.init_model(det_path, num_thread, None)?;
self.angle_net.init_model(cls_path, num_thread, None)?;
self.crnn_net
.init_model_dict_file(rec_path, num_thread, None, dict_path)?;
Ok(())
}
/// Initializes all three networks with a caller-supplied session builder function.
///
/// `builder_fn` is passed to every network's `init_model`, and the thread count
/// is passed as `0`.
/// NOTE(review): presumably the builder is expected to configure threading
/// itself — confirm against `BaseNet::init_model`.
///
/// # Errors
///
/// Returns the first initialization error. Networks after the failing one are
/// left uninitialized.
pub fn init_models_custom(
&mut self,
det_path: &str,
cls_path: &str,
rec_path: &str,
builder_fn: fn(SessionBuilder) -> Result<SessionBuilder, ort::Error>,
) -> Result<(), OcrError> {
self.db_net.init_model(det_path, 0, Some(builder_fn))?;
self.angle_net.init_model(cls_path, 0, Some(builder_fn))?;
self.crnn_net.init_model(rec_path, 0, Some(builder_fn))?;
Ok(())
}
/// Initialize models with dictionary file and custom session builder.
///
/// Combines `init_models_with_dict` and `init_models_custom`: loads the
/// dictionary for the recognition model while applying a custom ORT
/// session builder (e.g. for GPU execution providers).
///
/// Unlike `init_models_custom`, `num_thread` is forwarded as given and
/// `builder_fn` is optional; `None` means no custom builder is applied.
///
/// # Errors
///
/// Returns the first initialization error. Networks after the failing one are
/// left uninitialized.
pub fn init_models_with_dict_custom(
&mut self,
det_path: &str,
cls_path: &str,
rec_path: &str,
dict_path: &str,
num_thread: usize,
builder_fn: Option<fn(SessionBuilder) -> Result<SessionBuilder, ort::Error>>,
) -> Result<(), OcrError> {
self.db_net.init_model(det_path, num_thread, builder_fn)?;
self.angle_net.init_model(cls_path, num_thread, builder_fn)?;
self.crnn_net
.init_model_dict_file(rec_path, num_thread, builder_fn, dict_path)?;
Ok(())
}
/// Initializes the three networks from in-memory model bytes instead of paths.
///
/// `det_bytes`, `cls_bytes` and `rec_bytes` go to the detection, angle and
/// recognition networks respectively. `num_thread` is forwarded to each
/// `init_model_from_memory`; no custom session builder is supplied.
///
/// # Errors
///
/// Returns the first initialization error. Networks after the failing one are
/// left uninitialized.
pub fn init_models_from_memory(
&mut self,
det_bytes: &[u8],
cls_bytes: &[u8],
rec_bytes: &[u8],
num_thread: usize,
) -> Result<(), OcrError> {
self.db_net.init_model_from_memory(det_bytes, num_thread, None)?;
self.angle_net.init_model_from_memory(cls_bytes, num_thread, None)?;
self.crnn_net.init_model_from_memory(rec_bytes, num_thread, None)?;
Ok(())
}
/// Initializes the three networks from in-memory model bytes with a
/// caller-supplied session builder function.
///
/// `builder_fn` is passed to every network's `init_model_from_memory`, and the
/// thread count is passed as `0`.
/// NOTE(review): presumably the builder is expected to configure threading
/// itself — confirm against `BaseNet::init_model_from_memory`.
///
/// # Errors
///
/// Returns the first initialization error. Networks after the failing one are
/// left uninitialized.
pub fn init_models_from_memory_custom(
&mut self,
det_bytes: &[u8],
cls_bytes: &[u8],
rec_bytes: &[u8],
builder_fn: fn(SessionBuilder) -> Result<SessionBuilder, ort::Error>,
) -> Result<(), OcrError> {
self.db_net.init_model_from_memory(det_bytes, 0, Some(builder_fn))?;
self.angle_net.init_model_from_memory(cls_bytes, 0, Some(builder_fn))?;
self.crnn_net.init_model_from_memory(rec_bytes, 0, Some(builder_fn))?;
Ok(())
}
/// Shared entry point behind the public `detect*` methods.
///
/// Picks the target side length, pads the image, derives the scale parameters and
/// hands everything to `detect_once`. The target side is the image's longer side
/// when `max_side_len` is `0` or larger than it, otherwise `max_side_len`; twice
/// the padding is then added.
fn detect_base(
    &self,
    img_src: &image::RgbImage,
    padding: u32,
    max_side_len: u32,
    box_score_thresh: f32,
    box_thresh: f32,
    un_clip_ratio: f32,
    do_angle: bool,
    most_angle: bool,
    angle_rollback: bool,
    angle_rollback_threshold: f32,
    cls_thresh: f32,
    thresh: f32,
) -> Result<OcrResult, OcrError> {
    let origin_max_side = img_src.width().max(img_src.height());
    let unpadded_target = if max_side_len == 0 || max_side_len > origin_max_side {
        origin_max_side
    } else {
        max_side_len
    };
    let resize = unpadded_target + 2 * padding;

    // `make_padding` borrows the input unchanged when `padding` is 0.
    let padding_src = OcrUtils::make_padding(img_src, padding)?;
    let scale = ScaleParam::get_scale_param(&padding_src, resize);

    self.detect_once(
        &padding_src,
        &scale,
        padding,
        box_score_thresh,
        box_thresh,
        un_clip_ratio,
        do_angle,
        most_angle,
        angle_rollback,
        angle_rollback_threshold,
        cls_thresh,
        thresh,
    )
}
/// Default value passed as `cls_thresh` to the angle classifier.
const DEFAULT_CLS_THRESH: f32 = 0.9;
/// Default binarization threshold passed as `thresh` to text detection.
const DEFAULT_THRESH: f32 = 0.3;
/// Batch size passed to the recognition network's `get_text_lines`.
const DEFAULT_REC_BATCH_SIZE: u32 = 6;

/// Detect text in image
///
/// Runs the full pipeline with angle rollback disabled and the default
/// classification and binarization thresholds.
///
/// # Arguments
///
/// - `img_src` - Input image
/// - `padding` - Padding width added during image transformation (improves detection)
/// - `max_side_len` - Maximum side length after transformation (larger images will be scaled down)
/// - `box_score_thresh` - Score threshold for text region detection
/// - `box_thresh` - Box threshold
/// - `un_clip_ratio` - Unclip ratio
/// - `do_angle` - Whether to perform angle detection
/// - `most_angle` - Use most common angle for all text regions
///
/// # Errors
///
/// Propagates any error from padding, detection, angle classification or recognition.
pub fn detect(
    &self,
    img_src: &image::RgbImage,
    padding: u32,
    max_side_len: u32,
    box_score_thresh: f32,
    box_thresh: f32,
    un_clip_ratio: f32,
    do_angle: bool,
    most_angle: bool,
) -> Result<OcrResult, OcrError> {
    self.detect_base(
        img_src,
        padding,
        max_side_len,
        box_score_thresh,
        box_thresh,
        un_clip_ratio,
        do_angle,
        most_angle,
        false, // angle rollback disabled
        0.0,   // rollback threshold, forwarded with rollback disabled
        Self::DEFAULT_CLS_THRESH,
        Self::DEFAULT_THRESH,
    )
}
/// Detect text, allowing an angle correction to be undone.
///
/// Same pipeline as `detect`, but with rollback enabled: the un-rotated copy of
/// every angle-corrected region is kept and handed to recognition together with
/// `angle_rollback_threshold`.
///
/// # Arguments
///
/// - `img_src` - Input image
/// - `padding` - Padding width added during image transformation
/// - `max_side_len` - Maximum side length after transformation
/// - `box_score_thresh` - Score threshold for text region detection
/// - `box_thresh` - Box threshold
/// - `un_clip_ratio` - Unclip ratio
/// - `do_angle` - Whether to perform angle detection
/// - `most_angle` - Use most common angle
/// - `angle_rollback_threshold` - If text score is below this value (or NaN), angle correction is reverted
pub fn detect_angle_rollback(
    &self,
    img_src: &image::RgbImage,
    padding: u32,
    max_side_len: u32,
    box_score_thresh: f32,
    box_thresh: f32,
    un_clip_ratio: f32,
    do_angle: bool,
    most_angle: bool,
    angle_rollback_threshold: f32,
) -> Result<OcrResult, OcrError> {
    let angle_rollback = true;
    let cls_thresh = Self::DEFAULT_CLS_THRESH;
    let thresh = Self::DEFAULT_THRESH;
    self.detect_base(
        img_src,
        padding,
        max_side_len,
        box_score_thresh,
        box_thresh,
        un_clip_ratio,
        do_angle,
        most_angle,
        angle_rollback,
        angle_rollback_threshold,
        cls_thresh,
        thresh,
    )
}
/// Opens the image at `img_path`, converts it to 8-bit RGB and runs `detect` on it.
///
/// All other arguments are forwarded to `detect` unchanged.
///
/// # Errors
///
/// Returns an error when the image cannot be opened or decoded, and propagates
/// any error from `detect`.
pub fn detect_from_path(
    &self,
    img_path: &str,
    padding: u32,
    max_side_len: u32,
    box_score_thresh: f32,
    box_thresh: f32,
    un_clip_ratio: f32,
    do_angle: bool,
    most_angle: bool,
) -> Result<OcrResult, OcrError> {
    let decoded = image::open(img_path)?;
    let rgb = decoded.to_rgb8();
    self.detect(
        &rgb,
        padding,
        max_side_len,
        box_score_thresh,
        box_thresh,
        un_clip_ratio,
        do_angle,
        most_angle,
    )
}
/// Orders text boxes top-to-bottom, then left-to-right.
///
/// The sort key is the `(y, x)` of each box's first point; a box without points
/// sorts as `(0, 0)`. The sort is stable, so boxes with equal keys keep their
/// relative order.
fn sort_text_boxes(text_boxes: &mut [TextBox]) {
    text_boxes.sort_by_key(|text_box| {
        text_box
            .points
            .first()
            .map_or((0, 0), |corner| (corner.y, corner.x))
    });
}
/// Runs one full OCR pass over an already padded image.
///
/// Steps: detect text boxes, sort them in reading order, crop one image per box,
/// classify each crop's angle, rotate crops whose angle index is `1` by 180
/// degrees, recognize the text, and assemble one `TextBlock` per recognized line.
///
/// `padding` is subtracted from every box coordinate so the returned points refer
/// to the unpadded image; coordinates that would go negative are clamped to `0`.
/// When `angle_rollback` is set, the un-rotated copy of every rotated crop is kept
/// and passed to recognition along with `angle_rollback_threshold`.
///
/// # Errors
///
/// Propagates any error from detection, angle classification or recognition.
fn detect_once(
    &self,
    img_src: &image::RgbImage,
    scale: &ScaleParam,
    padding: u32,
    box_score_thresh: f32,
    box_thresh: f32,
    un_clip_ratio: f32,
    do_angle: bool,
    most_angle: bool,
    angle_rollback: bool,
    angle_rollback_threshold: f32,
    cls_thresh: f32,
    thresh: f32,
) -> Result<OcrResult, OcrError> {
    let mut text_boxes =
        self.db_net
            .get_text_boxes(img_src, scale, box_score_thresh, box_thresh, un_clip_ratio, thresh)?;
    // Sort boxes in reading order (top-to-bottom, left-to-right)
    Self::sort_text_boxes(&mut text_boxes);

    let part_images = OcrUtils::get_part_images(img_src, &text_boxes);
    let angles = self
        .angle_net
        .get_angles(&part_images, do_angle, most_angle, cls_thresh)?;

    let mut rotated_images: Vec<image::RgbImage> = Vec::with_capacity(part_images.len());
    // Original (un-rotated) crops, keyed by crop index, kept only for rollback.
    let mut angle_rollback_records = HashMap::<usize, ImageBuffer<image::Rgb<u8>, Vec<u8>>>::new();
    for (index, (angle, mut part_image)) in angles.iter().zip(part_images).enumerate() {
        if angle.index == 1 {
            if angle_rollback {
                // Keep original copy
                angle_rollback_records.insert(index, part_image.clone());
            }
            OcrUtils::mat_rotate_clock_wise_180(&mut part_image);
        }
        rotated_images.push(part_image);
    }

    let text_lines = self.crnn_net.get_text_lines(
        &rotated_images,
        &angle_rollback_records,
        angle_rollback_threshold,
        Self::DEFAULT_REC_BATCH_SIZE,
    )?;

    // Zipping (rather than indexing by position) cannot panic if the three
    // sequences ever differ in length; the result is then as long as the shortest.
    let text_blocks = text_lines
        .into_iter()
        .zip(text_boxes.iter())
        .zip(angles.iter())
        .map(|((text_line, text_box), angle)| TextBlock {
            box_points: text_box
                .points
                .iter()
                .map(|p| Point {
                    // Exact integer subtraction, clamped at 0, instead of a
                    // lossy round trip through f32.
                    x: p.x.saturating_sub(padding),
                    y: p.y.saturating_sub(padding),
                })
                .collect(),
            box_score: text_box.score,
            angle_index: angle.index,
            angle_score: angle.score,
            text: text_line.text,
            text_score: text_line.text_score,
        })
        .collect();
    Ok(OcrResult { text_blocks })
}
}
// Unit tests for `OcrLite::sort_text_boxes`.
#[cfg(test)]
mod tests {
use super::*;
use crate::ocr_result::TextBox;
// Builds a 100 x 20 axis-aligned box whose first point (the sort key) is (x, y).
fn make_box(x: u32, y: u32) -> TextBox {
TextBox {
points: vec![
Point { x, y },
Point { x: x + 100, y },
Point { x: x + 100, y: y + 20 },
Point { x, y: y + 20 },
],
score: 0.9,
}
}
// Boxes at the same x are ordered by ascending y.
#[test]
fn test_sort_text_boxes_top_to_bottom() {
let mut boxes = vec![make_box(10, 100), make_box(10, 50), make_box(10, 10)];
OcrLite::sort_text_boxes(&mut boxes);
assert_eq!(boxes[0].points[0].y, 10);
assert_eq!(boxes[1].points[0].y, 50);
assert_eq!(boxes[2].points[0].y, 100);
}
#[test]
fn test_sort_text_boxes_same_line_left_to_right() {
// Boxes with the same Y are sorted left-to-right by X
let mut boxes = vec![make_box(200, 10), make_box(100, 10), make_box(50, 10)];
OcrLite::sort_text_boxes(&mut boxes);
assert_eq!(boxes[0].points[0].x, 50);
assert_eq!(boxes[1].points[0].x, 100);
assert_eq!(boxes[2].points[0].x, 200);
}
#[test]
fn test_sort_text_boxes_multi_line() {
// Boxes sorted strictly by (y, x): y=50/x=50, y=50/x=300, y=100/x=100, y=100/x=200
let mut boxes = vec![
make_box(300, 50), // line 1, right
make_box(100, 100), // line 2, left
make_box(50, 50), // line 1, left (same y=50)
make_box(200, 100), // line 2, right (same y=100)
];
OcrLite::sort_text_boxes(&mut boxes);
// Line 1 (y=50): left first, then right
assert_eq!(boxes[0].points[0].x, 50);
assert_eq!(boxes[1].points[0].x, 300);
// Line 2 (y=100): left first, then right
assert_eq!(boxes[2].points[0].x, 100);
assert_eq!(boxes[3].points[0].x, 200);
}
// Sorting an empty slice must not panic.
#[test]
fn test_sort_text_boxes_empty() {
let mut boxes: Vec<TextBox> = vec![];
OcrLite::sort_text_boxes(&mut boxes);
assert!(boxes.is_empty());
}
// Sorting a single box must leave it in place.
#[test]
fn test_sort_text_boxes_single() {
let mut boxes = vec![make_box(10, 20)];
OcrLite::sort_text_boxes(&mut boxes);
assert_eq!(boxes.len(), 1);
}
}

View File

@@ -0,0 +1,105 @@
use std::fmt::{self, Write};
use serde::{Deserialize, Serialize};
/// A pixel position with unsigned integer coordinates.
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
pub struct Point {
/// Horizontal coordinate.
pub x: u32,
/// Vertical coordinate.
pub y: u32,
}
/// A detected text region: its corner points and its detection score.
#[derive(Debug, Serialize, Deserialize)]
pub struct TextBox {
    /// Corner points of the region.
    pub points: Vec<Point>,
    /// Detection score of the region.
    pub score: f32,
}

impl fmt::Display for TextBox {
    /// Prints the score followed by the first four points, or only the score and
    /// the point count when fewer than four points are present.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self.points.as_slice() {
            [p0, p1, p2, p3, ..] => write!(
                f,
                "TextBox [score({}), [x: {}, y: {}], [x: {}, y: {}], [x: {}, y: {}], [x: {}, y: {}]]",
                self.score, p0.x, p0.y, p1.x, p1.y, p2.x, p2.y, p3.x, p3.y,
            ),
            // Fewer than four points: avoid indexing and report the count instead.
            _ => write!(
                f,
                "TextBox [score({}), points_count({})]",
                self.score,
                self.points.len()
            ),
        }
    }
}
/// Result of angle classification for one text region.
#[derive(Debug, Default)]
pub struct Angle {
    /// Class index; a negative value is displayed as "AngleDisabled".
    pub index: i32,
    /// Score reported for the class.
    pub score: f32,
}

impl fmt::Display for Angle {
    /// Prints the index and score, labelled "AngleDisabled" for a negative index
    /// and "Angle" otherwise.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let header = if self.index < 0 { "AngleDisabled" } else { "Angle" };
        write!(f, "{}[Index({}), Score({})]", header, self.index, self.score)
    }
}
/// Recognized text for one region together with its score.
#[derive(Debug, Default)]
pub struct TextLine {
    /// The recognized text.
    pub text: String,
    /// Score of the recognized text.
    pub text_score: f32,
}

impl fmt::Display for TextLine {
    /// Prints the text and its score on a single line.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let Self { text, text_score } = self;
        write!(f, "TextLine[Text({}),TextScore({})]", text, text_score)
    }
}
/// One OCR result entry: a text box with its angle classification and recognized text.
#[derive(Debug, Serialize, Deserialize)]
pub struct TextBlock {
/// Corner points of the text box.
pub box_points: Vec<Point>,
/// Detection score of the text box.
pub box_score: f32,
/// Angle class index for the region.
pub angle_index: i32,
/// Score of the angle classification.
pub angle_score: f32,
/// Recognized text.
pub text: String,
/// Score of the recognized text.
pub text_score: f32,
}
#[derive(Serialize, Deserialize)]
pub struct OcrResult {
pub text_blocks: Vec<TextBlock>,
}
impl fmt::Display for OcrResult {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
let mut str_builder = String::with_capacity(0);
for text_block in &self.text_blocks {
write!(
str_builder,
"TextBlock[BoxPointsLen({}), BoxScore({}), AngleIndex({}), AngleScore({}), Text({}), TextScore({})]",
text_block.box_points.len(),
text_block.box_score,
text_block.angle_index,
text_block.angle_score,
text_block.text,
text_block.text_score
)?;
}
f.write_str(&str_builder)
}
}

View File

@@ -0,0 +1,206 @@
use std::borrow::Cow;
use crate::{
ocr_error::OcrError,
ocr_result::{Point, TextBox},
};
use image::imageops;
use imageproc::geometric_transformations::{Interpolation, Projection};
use ndarray::{Array, Array4};
/// Stateless namespace for the image helper functions used by the OCR pipeline.
pub struct OcrUtils;
impl OcrUtils {
/// Converts an RGB image into a normalized `(1, 3, H, W)` tensor.
///
/// For each pixel and channel: `output[ch] = pixel[ch] * norm[ch] - mean[ch] * norm[ch]`.
/// The input is interleaved (R, G, B per pixel); the output stores each channel as
/// its own contiguous plane.
///
/// The `mean * norm` products are computed once up front, and each plane is
/// filled through a flat mutable slice.
///
/// # Panics
///
/// Panics if `mean_vals` or `norm_vals` has fewer than three elements.
pub fn substract_mean_normalize(img_src: &image::RgbImage, mean_vals: &[f32], norm_vals: &[f32]) -> Array4<f32> {
    let (rows, cols) = (img_src.height() as usize, img_src.width() as usize);
    let pixel_count = rows * cols;
    let mut input_tensor = Array::zeros((1, 3, rows, cols));

    // Per-channel offset, so the inner loop is a single multiply-subtract.
    let adjusted = [
        mean_vals[0] * norm_vals[0],
        mean_vals[1] * norm_vals[1],
        mean_vals[2] * norm_vals[2],
    ];
    let raw = img_src.as_raw();

    for ch in 0..3 {
        let (norm, adj) = (norm_vals[ch], adjusted[ch]);
        // View channel `ch` as a flat slice of `pixel_count` values.
        let plane = input_tensor
            .slice_mut(ndarray::s![0, ch, .., ..])
            .into_shape_with_order(pixel_count)
            .expect("contiguous plane slice");
        let plane_slice = plane.into_slice().expect("contiguous memory");
        // Each 3-byte chunk of `raw` is one pixel: [R, G, B].
        for (out, pixel) in plane_slice.iter_mut().zip(raw.chunks_exact(3)) {
            *out = pixel[ch] as f32 * norm - adj;
        }
    }
    input_tensor
}
/// Surrounds the image with a white border `padding` pixels wide on every side.
///
/// With `padding == 0` the input is returned borrowed, so no copy is made;
/// otherwise a new, larger image is allocated and the input is pasted at
/// `(padding, padding)`.
pub fn make_padding<'a>(img_src: &'a image::RgbImage, padding: u32) -> Result<Cow<'a, image::RgbImage>, OcrError> {
    if padding == 0 {
        return Ok(Cow::Borrowed(img_src));
    }
    let padded_width = img_src.width() + 2 * padding;
    let padded_height = img_src.height() + 2 * padding;
    // Start from an all-white canvas, then copy the source into its centre.
    let mut padded = image::RgbImage::from_pixel(padded_width, padded_height, image::Rgb([255, 255, 255]));
    image::imageops::replace(&mut padded, img_src, padding as i64, padding as i64);
    Ok(Cow::Owned(padded))
}
/// Crops one image per text box out of `img_src`, in the same order as `text_boxes`.
pub fn get_part_images(img_src: &image::RgbImage, text_boxes: &[TextBox]) -> Vec<image::RgbImage> {
    let mut part_images = Vec::with_capacity(text_boxes.len());
    for text_box in text_boxes {
        part_images.push(Self::get_rotate_crop_image(img_src, &text_box.points));
    }
    part_images
}
/// Cuts the quadrilateral `box_points` out of `img_src` and straightens it into an
/// upright rectangle.
///
/// The image is first cropped to the points' bounding box, then the four points
/// are mapped by a projective transform onto a rectangle whose width is the
/// distance between points 0 and 1 and whose height is the distance between
/// points 0 and 3. A result at least 1.5 times taller than wide is rotated by a
/// quarter turn so it becomes wider than tall.
///
/// Fallbacks:
/// - no points at all: an empty (0 x 0) image;
/// - fewer than four points, a zero-sized target, or no valid projection: the
///   plain bounding-box crop.
pub fn get_rotate_crop_image(img_src: &image::RgbImage, box_points: &[Point]) -> image::RgbImage {
    // Without points there is no bounding box; the subtraction below would
    // compute `0 - u32::MAX`, which panics in debug builds.
    if box_points.is_empty() {
        return image::RgbImage::new(0, 0);
    }
    let mut points = box_points.to_vec();
    // Calculate bounding box
    let (min_x, min_y, max_x, max_y) = points.iter().fold(
        (u32::MAX, u32::MAX, 0u32, 0u32),
        |(min_x, min_y, max_x, max_y), point| {
            (
                min_x.min(point.x),
                min_y.min(point.y),
                max_x.max(point.x),
                max_y.max(point.y),
            )
        },
    );
    // Crop image
    let img_crop = imageops::crop_imm(img_src, min_x, min_y, max_x - min_x, max_y - min_y).to_image();
    // Shift the points into the crop's coordinate system.
    for point in &mut points {
        point.x = point.x.saturating_sub(min_x);
        point.y = point.y.saturating_sub(min_y);
    }
    // Ensure we have enough points for transformation
    if points.len() < 4 {
        // Fallback: return the cropped image as-is if we don't have 4 points
        return img_crop;
    }
    // Target size: edge 0-1 is the width, edge 0-3 is the height.
    let dx_w = (points[0].x as i32 - points[1].x as i32) as f32;
    let dy_w = (points[0].y as i32 - points[1].y as i32) as f32;
    let img_crop_width = (dx_w * dx_w + dy_w * dy_w).sqrt() as u32;
    let dx_h = (points[0].x as i32 - points[3].x as i32) as f32;
    let dy_h = (points[0].y as i32 - points[3].y as i32) as f32;
    let img_crop_height = (dx_h * dx_h + dy_h * dy_h).sqrt() as u32;
    // Ensure dimensions are valid (non-zero)
    if img_crop_width == 0 || img_crop_height == 0 {
        return img_crop;
    }
    let src_points = [
        (points[0].x as f32, points[0].y as f32),
        (points[1].x as f32, points[1].y as f32),
        (points[2].x as f32, points[2].y as f32),
        (points[3].x as f32, points[3].y as f32),
    ];
    let dst_points = [
        (0.0, 0.0),
        (img_crop_width as f32, 0.0),
        (img_crop_width as f32, img_crop_height as f32),
        (0.0, img_crop_height as f32),
    ];
    // If projection cannot be created, return the cropped image as fallback
    let Some(projection) = Projection::from_control_points(src_points, dst_points) else {
        return img_crop;
    };
    let mut part_img = image::RgbImage::new(img_crop_width, img_crop_height);
    // Pixels that map outside the crop are filled with white.
    imageproc::geometric_transformations::warp_into(
        &img_crop,
        &projection,
        Interpolation::Nearest,
        image::Rgb([255, 255, 255]),
        &mut part_img,
    );
    // Rotate image if needed
    if part_img.height() >= part_img.width() * 3 / 2 {
        let mut rotated = image::RgbImage::new(part_img.height(), part_img.width());
        for (x, y, pixel) in part_img.enumerate_pixels() {
            rotated.put_pixel(y, part_img.width() - 1 - x, *pixel);
        }
        rotated
    } else {
        part_img
    }
}
/// Rotates `src` by 180 degrees in place.
pub fn mat_rotate_clock_wise_180(src: &mut image::RgbImage) {
imageops::rotate180_in_place(src);
}
/// Returns the mean of the `img` values at positions where `mask` is non-zero.
///
/// Both buffers are walked in lockstep through their raw storage. Returns `0.0`
/// when the mask selects no pixel.
///
/// # Panics
///
/// Panics if `img` and `mask` differ in width or height.
pub fn calculate_mean_with_mask(
    img: &image::ImageBuffer<image::Luma<f32>, Vec<f32>>,
    mask: &image::ImageBuffer<image::Luma<u8>, Vec<u8>>,
) -> f32 {
    assert_eq!(img.width(), mask.width());
    assert_eq!(img.height(), mask.height());
    let (sum, count) = img
        .as_raw()
        .iter()
        .zip(mask.as_raw().iter())
        .filter(|&(_, &m)| m > 0)
        .fold((0.0f32, 0u32), |(sum, count), (px, _)| (sum + *px, count + 1));
    if count == 0 { 0.0 } else { sum / count as f32 }
}
}

View File

@@ -0,0 +1,69 @@
#[derive(Debug)]
pub struct ScaleParam {
pub src_width: u32,
pub src_height: u32,
pub dst_width: u32,
pub dst_height: u32,
pub scale_width: f32,
pub scale_height: f32,
}
impl ScaleParam {
    /// Builds a `ScaleParam` from already-computed values.
    ///
    /// No validation or derivation is performed; every argument is stored as given.
    pub fn new(
        src_width: u32,
        src_height: u32,
        dst_width: u32,
        dst_height: u32,
        scale_width: f32,
        scale_height: f32,
    ) -> Self {
        Self {
            src_width,
            src_height,
            dst_width,
            dst_height,
            scale_width,
            scale_height,
        }
    }

    /// Derives the resize parameters that fit `src` into `target_size`.
    ///
    /// The longer side of `src` is scaled to `target_size` and the other side
    /// by the same ratio. Each destination dimension is then rounded down to a
    /// multiple of 32 and clamped to a minimum of 32, so neither dimension is
    /// ever zero. The scale factors are recomputed from the final destination
    /// size so that they reflect the rounding.
    pub fn get_scale_param(src: &image::RgbImage, target_size: u32) -> Self {
        let src_width = src.width();
        let src_height = src.height();

        // Scale relative to the longer side (either side when they are equal).
        let longest_side = src_width.max(src_height);
        let ratio = target_size as f32 / longest_side as f32;

        let dst_width = Self::align_down_to_32((src_width as f32 * ratio) as u32);
        let dst_height = Self::align_down_to_32((src_height as f32 * ratio) as u32);

        let scale_width = dst_width as f32 / src_width as f32;
        let scale_height = dst_height as f32 / src_height as f32;
        Self::new(src_width, src_height, dst_width, dst_height, scale_width, scale_height)
    }

    /// Rounds `value` down to a multiple of 32, never returning less than 32.
    fn align_down_to_32(value: u32) -> u32 {
        // Applied unconditionally: 0 is itself a multiple of 32, so guarding this
        // with `value % 32 != 0` would let a dimension that truncated to 0 through.
        ((value / 32) * 32).max(32)
    }
}
impl std::fmt::Display for ScaleParam {
    /// Writes all six fields as comma-separated `name:value` pairs on one line.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let Self {
            src_width,
            src_height,
            dst_width,
            dst_height,
            scale_width,
            scale_height,
        } = self;
        f.write_fmt(format_args!(
            "src_width:{},src_height:{},dst_width:{},dst_height:{},scale_width:{},scale_height:{}",
            src_width, src_height, dst_width, dst_height, scale_width, scale_height
        ))
    }
}

View File

@@ -0,0 +1,436 @@
//! Diagnostic test to trace PaddleOCR detection pipeline.
//!
//! This test isolates each step to determine where empty results originate.
//! Since this crate doesn't have PNG/image decoder features, we create test
//! images programmatically.
use std::path::PathBuf;
/// Returns the directory two levels above this crate's manifest directory
/// (`CARGO_MANIFEST_DIR`), which the tests treat as the workspace root.
///
/// Panics if the manifest directory has fewer than two ancestors.
fn get_workspace_root() -> PathBuf {
    let crate_dir = std::path::Path::new(env!("CARGO_MANIFEST_DIR"));
    // ancestors(): 0 = the path itself, 1 = its parent, 2 = its grandparent.
    crate_dir.ancestors().nth(2).unwrap().to_path_buf()
}
/// Returns the `.kreuzberg/paddle-ocr` directory under the workspace root,
/// where the tests look for the model files.
fn get_model_dir() -> PathBuf {
    let mut dir = get_workspace_root();
    dir.push(".kreuzberg/paddle-ocr");
    dir
}
/// Build a synthetic 200x100 white image carrying black, text-like marks.
///
/// Two shapes are drawn: an "H" made of 3-pixel-thick strokes and a solid
/// rectangle, so that the detection model has something to detect.
/// Generating the image in code avoids needing PNG decoder features.
fn create_test_image() -> image::RgbImage {
    const WIDTH: u32 = 200;
    const HEIGHT: u32 = 100;

    let white = image::Rgb([255, 255, 255]);
    let black = image::Rgb([0, 0, 0]);
    let mut img = image::RgbImage::from_pixel(WIDTH, HEIGHT, white);

    // Paints every pixel of the half-open rectangle `xs` x `ys` black.
    let mut fill = |xs: std::ops::Range<u32>, ys: std::ops::Range<u32>| {
        for y in ys {
            for x in xs.clone() {
                img.put_pixel(x, y, black);
            }
        }
    };

    // "H" shape.
    fill(20..23, 20..80); // left stem
    fill(55..58, 20..80); // right stem
    fill(20..58, 48..51); // crossbar

    // Thick solid block, meant to be very obvious.
    fill(80..180, 30..70);

    img
}
/// Runs the OcrLite pipeline on a synthetic image under several parameter
/// sets and prints what each run detects.
///
/// Returns early with a "SKIP" message when the detection model file is
/// missing. Panics if the models fail to initialize. Detection results are
/// only printed; nothing is asserted about them.
#[test]
fn diagnostic_detection_pipeline() {
    let model_dir = get_model_dir();
    let det_path = model_dir.join("det/model.onnx");
    if !det_path.exists() {
        eprintln!("SKIP: Models not downloaded at {:?}", model_dir);
        return;
    }

    // Discover the ORT library.
    discover_ort();

    eprintln!("=== PaddleOCR Diagnostic Test ===");
    eprintln!("Model dir: {:?}", model_dir);

    // Step 1: create the synthetic test image.
    let img = create_test_image();
    eprintln!("Step 1 - Test image created: {}x{}", img.width(), img.height());

    // Step 2: initialize OcrLite with the det/cls/rec model files.
    let cls_path = model_dir.join("cls/model.onnx");
    let rec_path = model_dir.join("rec/model.onnx");
    let mut ocr_lite = kreuzberg_paddle_ocr::OcrLite::new();
    let init_result = ocr_lite.init_models(
        det_path.to_str().unwrap(),
        cls_path.to_str().unwrap(),
        rec_path.to_str().unwrap(),
        1,
    );
    if let Err(e) = &init_result {
        eprintln!("Step 2 - FAILED to init models: {:?}", e);
        panic!("Model initialization failed: {:?}", e);
    }
    eprintln!("Step 2 - Models initialized successfully");

    // Step 3: run detection with various parameter sets.
    // Tuple layout: (name, padding, max_side, box_score, box_thresh, unclip, do_angle, most_angle)
    let test_cases = [
        ("A: Default params", 50u32, 960u32, 0.3f32, 0.5f32, 1.6f32, true, false),
        ("B: Very low thresholds", 50, 960, 0.01, 0.01, 1.6, false, false),
        ("C: No padding + low", 0, 960, 0.01, 0.01, 1.6, false, false),
        ("D: Higher unclip ratio", 50, 960, 0.1, 0.1, 3.0, false, false),
        ("E: No padding + medium", 0, 960, 0.1, 0.3, 2.0, false, false),
    ];

    // Becomes true as soon as any run reports at least one text block.
    let mut any_detected = false;
    for &(name, padding, max_side, box_score, box_thresh, unclip, do_angle, most_angle) in test_cases.iter() {
        eprintln!("\n--- Test {} ---", name);
        eprintln!(
            "  padding={}, max_side={}, box_score={}, box_thresh={}, unclip={}",
            padding, max_side, box_score, box_thresh, unclip
        );

        let outcome = ocr_lite.detect(
            &img,
            padding,
            max_side,
            box_score,
            box_thresh,
            unclip,
            do_angle,
            most_angle,
        );
        match &outcome {
            Err(e) => {
                eprintln!("  FAILED: {:?}", e);
            }
            Ok(ocr_result) => {
                eprintln!("  Result: {} text blocks", ocr_result.text_blocks.len());
                for (i, block) in ocr_result.text_blocks.iter().enumerate() {
                    eprintln!(
                        "    Block {}: text='{}', text_score={:.3}, box_score={:.3}",
                        i, block.text, block.text_score, block.box_score
                    );
                    any_detected = true;
                }
            }
        }
    }

    eprintln!("\n=== Diagnosis ===");
    if any_detected {
        eprintln!("RESULT: Detection works. Issue may be threshold-related or image-specific.");
    } else {
        eprintln!("RESULT: Detection model produces NO output regardless of thresholds.");
        eprintln!("This strongly suggests an ORT version compatibility issue.");
        eprintln!("  ort crate version: check Cargo.lock for current version");
        eprintln!("  ORT_DYLIB_PATH: {:?}", std::env::var("ORT_DYLIB_PATH"));
    }
}
/// Runs the detection model directly through ORT, bypassing OcrLite, to check
/// whether ORT works at all.
///
/// Returns early with a "SKIP" message when the detection model file is
/// missing. Loading or running the model panics on failure (`unwrap`). The
/// output statistics are printed only; nothing is asserted about them.
#[test]
fn diagnostic_raw_ort_inference() {
    let det_model = get_model_dir().join("det/model.onnx");
    if !det_model.exists() {
        eprintln!("SKIP: Detection model not found at {:?}", det_model);
        return;
    }
    discover_ort();
    eprintln!("=== Raw ORT Inference Test ===");

    // Load the model directly via ort.
    use ort::session::Session;
    let mut session = Session::builder().unwrap().commit_from_file(&det_model).unwrap();
    eprintln!("Model loaded successfully");

    eprintln!("Inputs:");
    for input in session.inputs() {
        eprintln!("  name='{}'", input.name());
    }
    eprintln!("Outputs:");
    for output in session.outputs() {
        eprintln!("  name='{}'", output.name());
    }

    // Small constant-valued test tensor in NCHW layout: batch=1, channels=3, h=32, w=32.
    let input_data: Vec<f32> = vec![0.5; 3 * 32 * 32];
    let array = ndarray::Array::from_shape_vec((1, 3, 32, 32), input_data).unwrap();
    let tensor = ort::value::Tensor::from_array(array).unwrap();
    let input_name = session.inputs()[0].name().to_string();

    eprintln!("\nRunning inference with 32x32 gray image...");
    let outputs = session.run(ort::inputs![input_name => tensor]).unwrap();

    // Inspect the first output only.
    let (output_name, output_value) = outputs.iter().next().unwrap();
    eprintln!("Output name: {}", output_name);
    let (output_shape, output_data) = output_value.try_extract_tensor::<f32>().unwrap();
    eprintln!("Output shape: {:?}", output_shape);
    eprintln!("Output len: {}", output_data.len());

    if output_data.is_empty() {
        return;
    }

    let min = output_data.iter().copied().fold(f32::INFINITY, f32::min);
    let max = output_data.iter().copied().fold(f32::NEG_INFINITY, f32::max);
    let mean = output_data.iter().sum::<f32>() / output_data.len() as f32;
    let non_zero = output_data.iter().filter(|v| **v > 0.001).count();
    eprintln!("Output stats: min={:.6}, max={:.6}, mean={:.6}", min, max, mean);
    eprintln!("Non-zero values (>0.001): {} / {}", non_zero, output_data.len());

    if max < 0.001 {
        eprintln!("\nDIAGNOSIS: Model outputs are essentially all zeros.");
        eprintln!("This confirms an ORT compatibility issue - model isn't executing correctly.");
    } else {
        eprintln!("\nDIAGNOSIS: Model produces non-zero output. ORT is working.");
    }
}
/// Diagnostic: test the CRNN recognition model directly.
///
/// Returns early with a "SKIP" message when `rec/model.onnx` is missing.
/// Otherwise the model is loaded through ORT, its input/output names and
/// selected metadata entries are printed, and two synthetic inputs are run
/// through it: a striped pattern and a uniform image. Statistics about the raw
/// outputs are printed; nothing is asserted about them. Loading or running the
/// model panics on failure (`unwrap`).
#[test]
fn diagnostic_crnn_model_output() {
    let model_dir = get_model_dir();
    let rec_model = model_dir.join("rec/model.onnx");
    if !rec_model.exists() {
        eprintln!("SKIP: Recognition model not found");
        return;
    }
    discover_ort();
    eprintln!("=== CRNN Recognition Model Diagnostic ===");

    use ort::session::Session;
    let mut session = Session::builder().unwrap().commit_from_file(&rec_model).unwrap();
    eprintln!("Model loaded successfully");
    eprintln!("Inputs:");
    for input in session.inputs() {
        eprintln!("  name='{}'", input.name());
    }
    eprintln!("Outputs:");
    for output in session.outputs() {
        eprintln!("  name='{}'", output.name());
    }

    // Check metadata for the character list. Scoped so that `metadata` is
    // dropped before the session is used for inference below.
    {
        let metadata = session.metadata().unwrap();
        eprintln!("Model metadata:");
        eprintln!("  description: {:?}", metadata.description());
        eprintln!("  producer: {:?}", metadata.producer());

        // Try to get the character key.
        match metadata.custom("character") {
            Some(chars) => {
                let bytes = chars.as_bytes();
                let char_count = chars.split('\n').count();
                eprintln!(
                    "  custom('character'): len={}, bytes={}, split_count={}",
                    chars.len(),
                    bytes.len(),
                    char_count
                );
                if chars.len() < 500 {
                    eprintln!("    value: {:?}", chars);
                } else {
                    let preview: String = chars.chars().take(100).collect();
                    eprintln!("    preview (first 100 chars): {:?}", preview);
                }
                // Check for null bytes or other encoding issues.
                let null_count = bytes.iter().filter(|&&b| b == 0).count();
                if null_count > 0 {
                    eprintln!("    WARNING: {} null bytes found in character string!", null_count);
                }
            }
            None => {
                eprintln!("  ERROR: No 'character' key in model metadata!");
            }
        }

        // Try other possible metadata keys.
        for key in [
            "character",
            "characters",
            "dict",
            "dictionary",
            "labels",
            "vocab",
            "alphabet",
        ] {
            if let Some(val) = metadata.custom(key) {
                // Limit the preview to at most 80 bytes, then back off to the
                // nearest char boundary: slicing a `str` in the middle of a
                // multi-byte character panics. Index 0 is always a boundary,
                // so the loop terminates.
                let mut preview_end = val.len().min(80);
                while !val.is_char_boundary(preview_end) {
                    preview_end -= 1;
                }
                eprintln!(
                    "  custom('{}'): len={}, preview={:?}",
                    key,
                    val.len(),
                    &val[..preview_end]
                );
            }
        }
    } // metadata dropped here

    // Test 1: Run inference with a simple input (height=48, width=200).
    // CRNN expects NCHW: [1, 3, 48, width]
    let h = 48usize;
    let w = 200usize;
    // Create a pattern that looks like text (alternating black/white vertical stripes).
    let mut input_data: Vec<f32> = vec![0.0; 3 * h * w];
    for c in 0..3 {
        for y in 10..38 {
            for x in (20..180).step_by(2) {
                input_data[c * h * w + y * w + x] = -1.0; // normalized black
            }
        }
    }
    let tensor =
        ort::value::Tensor::from_array(ndarray::Array::from_shape_vec((1, 3, h, w), input_data).unwrap()).unwrap();
    let input_name = session.inputs()[0].name().to_string();
    eprintln!("\nRunning CRNN with striped pattern (48x200)...");
    let outputs = session.run(ort::inputs![input_name => tensor]).unwrap();
    let (_, output_value) = outputs.iter().next().unwrap();
    let (shape, data) = output_value.try_extract_tensor::<f32>().unwrap();
    eprintln!("Output shape: {:?}", shape);
    eprintln!("Output total values: {}", data.len());

    if shape.len() >= 3 {
        // Dimension 1 is read as the number of time steps and dimension 2 as
        // the vocabulary size.
        let time_steps = shape[1] as usize;
        let vocab_size = shape[2] as usize;
        eprintln!("Time steps: {}, Vocabulary size: {}", time_steps, vocab_size);

        // Check if outputs are meaningful.
        let data_vec: Vec<f32> = data.to_vec();
        let min = data_vec.iter().cloned().fold(f32::INFINITY, f32::min);
        let max = data_vec.iter().cloned().fold(f32::NEG_INFINITY, f32::max);
        let mean: f32 = data_vec.iter().sum::<f32>() / data_vec.len() as f32;
        eprintln!("Overall stats: min={:.6}, max={:.6}, mean={:.6}", min, max, mean);

        // Check argmax distribution; index 0 is counted as the blank class.
        let mut argmax_zero_count = 0;
        let mut argmax_nonzero_count = 0;
        for t in 0..time_steps {
            let start = t * vocab_size;
            let end = start + vocab_size;
            let slice = &data_vec[start..end.min(data_vec.len())];
            let (max_idx, max_val) =
                slice.iter().enumerate().fold(
                    (0, f32::MIN),
                    |(mi, mv), (i, &v)| if v > mv { (i, v) } else { (mi, mv) },
                );
            if max_idx == 0 {
                argmax_zero_count += 1;
            } else {
                argmax_nonzero_count += 1;
            }
            // Print only the first few and the last couple of steps. The
            // subtraction cannot underflow: `||` short-circuits whenever
            // `t < 5`, which is always the case when `time_steps < 3`.
            if t < 5 || (t > time_steps - 3) {
                eprintln!("  Step {}: argmax={}, max_val={:.4}", t, max_idx, max_val);
            } else if t == 5 {
                eprintln!("  ... (skipping middle steps)");
            }
        }
        eprintln!(
            "\nArgmax distribution: {} blank (idx=0), {} non-blank",
            argmax_zero_count, argmax_nonzero_count
        );
        if argmax_nonzero_count == 0 {
            eprintln!("\nDIAGNOSIS: CRNN model outputs all blanks.");
            eprintln!("Possible causes:");
            eprintln!("  1. ORT version incompatibility with CRNN model");
            eprintln!("  2. Model is not executing graph correctly");
            eprintln!("  3. Input normalization mismatch");
        } else {
            eprintln!("\nDIAGNOSIS: CRNN model produces non-blank output. Recognition works.");
        }
    }

    // Drop outputs before reusing session.
    drop(outputs);

    // Test 2: Run with a uniform white image (should produce all blanks - valid baseline).
    let white_data: Vec<f32> = vec![1.0; 3 * h * w];
    let white_tensor =
        ort::value::Tensor::from_array(ndarray::Array::from_shape_vec((1, 3, h, w), white_data).unwrap()).unwrap();
    let input_name2 = session.inputs()[0].name().to_string();
    eprintln!("\nRunning CRNN with uniform white (48x200)...");
    let white_outputs = session.run(ort::inputs![input_name2 => white_tensor]).unwrap();
    let (_, white_val) = white_outputs.iter().next().unwrap();
    let (_, white_data_out) = white_val.try_extract_tensor::<f32>().unwrap();
    let white_vec: Vec<f32> = white_data_out.to_vec();
    let white_max = white_vec.iter().cloned().fold(f32::NEG_INFINITY, f32::max);
    let white_min = white_vec.iter().cloned().fold(f32::INFINITY, f32::min);
    eprintln!("White image output: min={:.6}, max={:.6}", white_min, white_max);
}
/// Make sure `ORT_DYLIB_PATH` points at an ONNX Runtime library if one can be found.
///
/// An existing `ORT_DYLIB_PATH` whose target exists on disk is left untouched.
/// Otherwise the first existing path from a fixed list of candidate `.dylib`
/// locations is exported as `ORT_DYLIB_PATH`. A warning is printed when
/// nothing is found.
fn discover_ort() {
    match std::env::var("ORT_DYLIB_PATH") {
        Ok(path) if std::path::Path::new(&path).exists() => {
            eprintln!("ORT found via ORT_DYLIB_PATH: {}", path);
            return;
        }
        // Unset, not valid Unicode, or pointing at a missing file: fall through.
        _ => {}
    }

    const CANDIDATES: [&str; 2] = [
        "/opt/homebrew/lib/libonnxruntime.dylib",
        "/usr/local/lib/libonnxruntime.dylib",
    ];
    let found = CANDIDATES
        .iter()
        .copied()
        .find(|candidate| std::path::Path::new(candidate).exists());

    match found {
        Some(candidate) => {
            eprintln!("Setting ORT_DYLIB_PATH={}", candidate);
            // NOTE(review): `set_var` is only sound while no other thread reads or
            // writes the environment. The test harness may run tests on several
            // threads — confirm this is acceptable (or run with `--test-threads=1`).
            unsafe { std::env::set_var("ORT_DYLIB_PATH", candidate) };
        }
        None => eprintln!("WARNING: Could not find ORT library!"),
    }
}

27
crates/kreuzberg-php/Cargo.toml generated Normal file
View File

@@ -0,0 +1,27 @@
[package]
name = "kreuzberg-php"
version = "5.0.0-rc.3"
edition = "2024"
license = "Elastic-2.0"
description = "High-performance document intelligence library"
readme = false
keywords = ["document", "extraction", "ocr", "pdf", "text"]
categories = ["text-processing"]
# `tokio`, `ahash` and `async-trait` are conditionally included but not directly used in PHP code.
[package.metadata.cargo-machete]
ignored = ["tokio", "ahash", "async-trait"]
[lib]
crate-type = ["cdylib"]
[features]
extension-module = []
[dependencies]
async-trait = "0.1"
ext-php-rs = "0.15"
kreuzberg = { version = "5.0.0-rc.3", path = "../kreuzberg", features = ["full", "pdf", "ocr", "paddle-ocr", "paddle-ocr-types", "layout-detection", "layout-types", "embeddings", "embedding-presets", "chunking", "keywords-yake", "keywords-rake", "language-detection", "html", "tree-sitter", "office", "email", "archives", "stopwords", "auto-rotate", "auto-rotate-types", "tokio-runtime", "api", "mcp", "liter-llm", "quality"] }
serde = { version = "1", features = ["derive"] }
serde_json = "1"
tokio = { version = "1", features = ["full"] }

93
crates/kreuzberg-php/src/LICENSE generated Normal file
View File

@@ -0,0 +1,93 @@
Elastic License 2.0 (ELv2)
Copyright 2025-2026 Kreuzberg, Inc.
Acceptance
By using the software, you agree to all of the terms and conditions below.
Copyright License
The licensor grants you a non-exclusive, royalty-free, worldwide,
non-sublicensable, non-transferable license to use, copy, distribute, make
available, and prepare derivative works of the software, in each case subject to
the limitations and conditions below.
Limitations
You may not provide the software to third parties as a hosted or managed
service, where the service provides users with access to any substantial set of
the features or functionality of the software.
You may not move, change, disable, or circumvent the license key functionality
in the software, and you may not remove or obscure any functionality in the
software that is protected by the license key.
You may not alter, remove, or obscure any licensing, copyright, or other notices
of the licensor in the software. Any use of the licensor's trademarks is subject
to applicable law.
Patents
The licensor grants you a license, under any patent claims the licensor can
license, or becomes able to license, to make, have made, use, sell, offer for
sale, import and have imported the software, in each case subject to the
limitations and conditions in this license. This license does not cover any
patent claims that you cause to be infringed by modifications or additions to the
software. If you or your company make any written claim that the software
infringes or contributes to infringement of any patent, your patent license for
the software granted under these terms ends immediately. If your company makes
such a claim, your patent license ends immediately for work on behalf of your
company.
Notices
You must ensure that anyone who gets a copy of any part of the software from you
also gets a copy of these terms.
If you modify the software, you must include in any modified copies of the
software prominent notices stating that you have modified the software.
No Other Rights
These terms do not imply any licenses other than those expressly granted in
these terms.
Termination
If you use the software in violation of these terms, such use is not licensed,
and your licenses will automatically terminate. If the licensor provides you with
a notice of your violation, and you cease all violation of this license no later
than 30 days after you receive that notice, your licenses will be reinstated
retroactively. However, if you violate these terms after such reinstatement, any
additional violation of these terms will cause your licenses to terminate
automatically and permanently.
No Liability
As far as the law allows, the software comes as is, without any warranty or
condition, and the licensor will not be liable to you for any damages arising out
of these terms or the use or nature of the software, under any kind of legal
claim.
Definitions
The licensor is the entity offering these terms, and the software is the
software the licensor makes available under these terms, including any portion
of it.
you refers to the individual or entity agreeing to these terms.
your company is any legal entity, sole proprietorship, or other kind of
organization that you work for, plus all organizations that have control over,
are under the control of, or are under common control with that organization.
control means ownership of substantially all the assets of an entity, or the
power to direct its management and policies by vote, contract, or otherwise.
Control can be direct or indirect.
your licenses are all the licenses granted to you for the software under these
terms.
use means anything you do with the software requiring one of your licenses.
trademark means trademarks, service marks, and similar rights.

34
crates/kreuzberg-php/src/composer.json generated Normal file
View File

@@ -0,0 +1,34 @@
{
"name": "kreuzberg-dev/kreuzberg",
"description": "High-performance document intelligence library",
"license": "Elastic-2.0",
"type": "php-ext",
"require": {
"php": ">=8.2"
},
"require-dev": {
"phpstan/phpstan": "^2.1",
"friendsofphp/php-cs-fixer": "^3.95",
"phpunit/phpunit": "^13.1"
},
"autoload": {
"psr-4": {
"Kreuzberg\\": "src/"
}
},
"scripts": {
"phpstan": "php -d detect_unicode=0 vendor/bin/phpstan --configuration=phpstan.neon --memory-limit=512M",
"format": "php vendor/bin/php-cs-fixer fix --quiet",
"format:check": "php vendor/bin/php-cs-fixer fix --dry-run --quiet",
"test": "php vendor/bin/phpunit",
"lint": "@phpstan",
"lint:fix": "php vendor/bin/php-cs-fixer fix --quiet && php -d detect_unicode=0 vendor/bin/phpstan --configuration=phpstan.neon --memory-limit=512M"
},
"php-ext": {
"extension-name": "kreuzberg",
"support-zts": true,
"support-nts": true,
"download-url-method": ["pre-packaged-binary", "composer-default"]
},
"keywords": ["document", "extraction", "ocr", "pdf", "text"]
}

4696
crates/kreuzberg-php/src/composer.lock generated Normal file

File diff suppressed because it is too large Load Diff

19159
crates/kreuzberg-php/src/lib.rs generated Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,2 @@
parameters:
ignoreErrors: []

12
crates/kreuzberg-php/src/phpstan.neon generated Normal file
View File

@@ -0,0 +1,12 @@
includes:
- phpstan-baseline.neon
parameters:
level: max
paths:
- src
scanFiles:
- stubs/kreuzberg_extension.php
treatPhpDocTypesAsCertain: false
reportUnmatchedIgnoredErrors: false
tmpDir: var/cache/phpstan

34
crates/kreuzberg-py/Cargo.toml generated Normal file
View File

@@ -0,0 +1,34 @@
[package]
name = "kreuzberg-py"
version = "5.0.0-rc.3"
edition = "2024"
license = "Elastic-2.0"
description = "High-performance document intelligence library"
readme = false
keywords = ["document", "extraction", "ocr", "pdf", "text"]
categories = ["text-processing"]
# `pyo3-async-runtimes` and `serde_json` are emitted unconditionally below so
# the manifest is stable across regens, but for umbrella crates with no
# async fns or no JSON-marshalled return types they are genuinely unused.
# The conditional `async-trait` / `tokio` / `futures` deps are similarly
# flagged when the umbrella has trait-bridge / streaming adapters configured
# but no actual async-trait / async callsite in the generated PyO3 shim.
[package.metadata.cargo-machete]
ignored = ["pyo3-async-runtimes", "serde_json", "async-trait", "tokio"]
[lib]
name = "_kreuzberg"
crate-type = ["cdylib"]
[features]
extension-module = ["pyo3/extension-module", "pyo3/abi3-py310"]
[dependencies]
async-trait = "0.1"
kreuzberg = { version = "5.0.0-rc.3", path = "../kreuzberg", features = ["full", "pdf", "ocr", "paddle-ocr", "paddle-ocr-types", "layout-detection", "layout-types", "embeddings", "embedding-presets", "chunking", "keywords-yake", "keywords-rake", "language-detection", "html", "tree-sitter", "office", "email", "archives", "stopwords", "auto-rotate", "auto-rotate-types", "tokio-runtime", "api", "mcp", "liter-llm", "quality"] }
pyo3 = { version = "0.28" }
pyo3-async-runtimes = { version = "0.28", features = ["tokio-runtime"] }
serde = { version = "1", features = ["derive"] }
serde_json = "1"
tokio = { version = "1", features = ["rt-multi-thread"] }

93
crates/kreuzberg-py/src/LICENSE generated Normal file
View File

@@ -0,0 +1,93 @@
Elastic License 2.0 (ELv2)
Copyright 2025-2026 Kreuzberg, Inc.
Acceptance
By using the software, you agree to all of the terms and conditions below.
Copyright License
The licensor grants you a non-exclusive, royalty-free, worldwide,
non-sublicensable, non-transferable license to use, copy, distribute, make
available, and prepare derivative works of the software, in each case subject to
the limitations and conditions below.
Limitations
You may not provide the software to third parties as a hosted or managed
service, where the service provides users with access to any substantial set of
the features or functionality of the software.
You may not move, change, disable, or circumvent the license key functionality
in the software, and you may not remove or obscure any functionality in the
software that is protected by the license key.
You may not alter, remove, or obscure any licensing, copyright, or other notices
of the licensor in the software. Any use of the licensor's trademarks is subject
to applicable law.
Patents
The licensor grants you a license, under any patent claims the licensor can
license, or becomes able to license, to make, have made, use, sell, offer for
sale, import and have imported the software, in each case subject to the
limitations and conditions in this license. This license does not cover any
patent claims that you cause to be infringed by modifications or additions to the
software. If you or your company make any written claim that the software
infringes or contributes to infringement of any patent, your patent license for
the software granted under these terms ends immediately. If your company makes
such a claim, your patent license ends immediately for work on behalf of your
company.
Notices
You must ensure that anyone who gets a copy of any part of the software from you
also gets a copy of these terms.
If you modify the software, you must include in any modified copies of the
software prominent notices stating that you have modified the software.
No Other Rights
These terms do not imply any licenses other than those expressly granted in
these terms.
Termination
If you use the software in violation of these terms, such use is not licensed,
and your licenses will automatically terminate. If the licensor provides you with
a notice of your violation, and you cease all violation of this license no later
than 30 days after you receive that notice, your licenses will be reinstated
retroactively. However, if you violate these terms after such reinstatement, any
additional violation of these terms will cause your licenses to terminate
automatically and permanently.
No Liability
As far as the law allows, the software comes as is, without any warranty or
condition, and the licensor will not be liable to you for any damages arising out
of these terms or the use or nature of the software, under any kind of legal
claim.
Definitions
The licensor is the entity offering these terms, and the software is the
software the licensor makes available under these terms, including any portion
of it.
you refers to the individual or entity agreeing to these terms.
your company is any legal entity, sole proprietorship, or other kind of
organization that you work for, plus all organizations that have control over,
are under the control of, or are under common control with that organization.
control means ownership of substantially all the assets of an entity, or the
power to direct its management and policies by vote, contract, or otherwise.
Control can be direct or indirect.
your licenses are all the licenses granted to you for the software under these
terms.
use means anything you do with the software requiring one of your licenses.
trademark means trademarks, service marks, and similar rights.

View File

17712
crates/kreuzberg-py/src/lib.rs generated Normal file

File diff suppressed because it is too large Load Diff

105
crates/kreuzberg-py/src/pyproject.toml generated Normal file
View File

@@ -0,0 +1,105 @@
[build-system]
build-backend = "maturin"
requires = [ "maturin>=1,<2" ]
[project]
name = "kreuzberg"
version = "5.0.0rc3"
description = "High-performance document intelligence library"
keywords = [ "document", "extraction", "ocr", "pdf", "text" ]
license = "Elastic-2.0"
license-files = [ "LICENSE" ]
authors = [ { name = "Na'aman Hirschfeld <naaman@kreuzberg.dev>" } ]
requires-python = ">=3.10"
classifiers = [
"Programming Language :: Python :: 3 :: Only",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.12",
"Programming Language :: Python :: 3.13",
"Programming Language :: Python :: 3.14",
]
urls.repository = "https://github.com/kreuzberg-dev/kreuzberg"
homepage = "https://kreuzberg.dev"
[dependency-groups]
dev = [ "mypy>=1.19", "ruff>=0.14.8" ]
[tool.maturin]
module-name = "kreuzberg._kreuzberg"
manifest-path = "../../crates/kreuzberg-py/Cargo.toml"
# abi3-py310 produces a single wheel per platform that loads on Python 3.10+,
# avoiding a per-Python-version build matrix.
features = [ "pyo3/extension-module", "pyo3/abi3-py310" ]
python-packages = [ "kreuzberg" ]
# Bundle the core Rust crate so `pip install` can build from sdist on
# platforms without a precompiled wheel (e.g. Alpine/musl). Without this
# the workspace [patch.crates-io] (when present) points at a path that is
# missing from the tarball and the source build fails.
include = [
{ path = "../../crates/kreuzberg/**/*", format = "sdist" },
]
[tool.ruff]
target-version = "py310"
line-length = 120
format.docstring-code-line-length = 120
format.docstring-code-format = true
lint.select = [ "ALL" ]
lint.ignore = [
"ANN401",
"ASYNC109",
"ASYNC110",
"BLE001",
"COM812",
"D100",
"D104",
"D107",
"D205",
"E501",
"EM",
"FBT",
"FIX",
"ISC001",
"PD011",
"PGH003",
"PLR2004",
"PLW0603",
"S104",
"S110",
"S603",
"TD",
"TRY",
]
lint.per-file-ignores."kreuzberg/__init__.py" = [ "I001" ]
# The alef Python codegen still emits cosmetic warnings on the wrapper
# modules: api.py keeps the legacy `from typing import AsyncIterator` and a
# single-line import block, options.py carries # noqa: TC001 / F401 markers
# that turn out unused on every regen, __init__.py star-imports re-sort with
# a different convention. Silence these specific rules on the wrappers until
# the codegen is updated to emit ruff-clean output.
lint.per-file-ignores."kreuzberg/api.py" = [ "F401", "I001", "UP035" ]
lint.per-file-ignores."kreuzberg/options.py" = [ "F401", "RUF100" ]
lint.per-file-ignores."tests/**" = [ "ANN", "D103", "PLR2004", "S101" ]
lint.mccabe.max-complexity = 15
lint.pydocstyle.convention = "google"
lint.pylint.max-args = 10
lint.pylint.max-branches = 15
lint.pylint.max-returns = 10
[tool.mypy]
python_version = "3.10"
strict = true
show_error_codes = true
implicit_reexport = false
namespace_packages = true
overrides = [
# The alef-emitted `api.py` wrapper has a structural mismatch between its
# `options.*` dataclass signatures and the `_internal_bindings.*` pyclass
# types pyo3 accepts/returns at runtime. pyo3 reconciles them dynamically via
# FromPyObject — the Python e2e suite exercises the runtime path — but mypy
# sees only the static-type discrepancy. Disable the four error codes the
# discrepancy raises until the codegen emits matching `_to_rust_*` calls and
# casts the return values.
{ module = "kreuzberg.api", disable_error_code = [ "call-arg", "arg-type", "return-value", "attr-defined" ] },
]

316
crates/kreuzberg-py/src/uv.lock generated Normal file
View File

@@ -0,0 +1,316 @@
version = 1
revision = 3
requires-python = ">=3.10"
resolution-markers = [
"python_full_version >= '3.15'",
"python_full_version < '3.15'",
]
[[package]]
name = "ast-serialize"
version = "0.5.0"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/81/9d/09e27731bd5864a9ce04e3244074e674bb8936bf62b45e0357248717adac/ast_serialize-0.5.0.tar.gz", hash = "sha256:5880091bfe6f4f986f22866375c2e884843e7a0b6343ae41aeea659613d879b6", size = 61157, upload-time = "2026-05-17T17:48:29.429Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/c0/9a/13dde51ba9e15f8b97957ab7cb0120d0e381524d651c6bd630b9c359227f/ast_serialize-0.5.0-cp314-cp314t-macosx_10_12_x86_64.whl", hash = "sha256:8f5c14f169eb0972c0c21bada5358b23d6047c76583b005234f865b11f1fa00a", size = 1183520, upload-time = "2026-05-17T17:47:30.831Z" },
{ url = "https://files.pythonhosted.org/packages/37/de/5a7f0a9fe68944f536632a5af84676739c7d2582be42deb082634bf3a754/ast_serialize-0.5.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:7d1a2de9de5be04652f0ed60738356ef94f66db37924a9499fffe98dc491aa0b", size = 1175779, upload-time = "2026-05-17T17:47:32.551Z" },
{ url = "https://files.pythonhosted.org/packages/9c/81/0bb853e76e4f6e9a1855d569003c59e19ffac45f7079d91505d1bb212f92/ast_serialize-0.5.0-cp314-cp314t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:be5173fb66f9b49026d9d5a2ff0fc7c7009077107c0eb285b2d60fdf1fe10bd1", size = 1233750, upload-time = "2026-05-17T17:47:34.731Z" },
{ url = "https://files.pythonhosted.org/packages/e5/d3/4cf705beeccc08754d0bbda99aefff26110e209b9a07ac8a6b60eec48531/ast_serialize-0.5.0-cp314-cp314t-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:f8015cd071ac1339924ee2b8098c93e00e155f30a16f40ec9816fcf84f4753f6", size = 1235942, upload-time = "2026-05-17T17:47:36.287Z" },
{ url = "https://files.pythonhosted.org/packages/26/c8/ee097e437ea27dd2b8b227865c875492b585650a5802a22d82b304c8201b/ast_serialize-0.5.0-cp314-cp314t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5499e8797edff2a9186aa313ed382c6b422e798e9332d9953badcee6e69a88f2", size = 1442517, upload-time = "2026-05-17T17:47:38.17Z" },
{ url = "https://files.pythonhosted.org/packages/ff/bd/68063442838f1ba68ec72b5436430bc75b3bb17a1a3c3063f09b0c05ae2b/ast_serialize-0.5.0-cp314-cp314t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:6848f2a093fb5548751a9a09bff8fcd229e2bbeb0e3331f391b6ae6d26cd9903", size = 1254081, upload-time = "2026-05-17T17:47:39.826Z" },
{ url = "https://files.pythonhosted.org/packages/50/e2/1e520793bc6a4e4524a6ab022391e827825eaa0c3811828bfdc6852eca26/ast_serialize-0.5.0-cp314-cp314t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:832d4c998e0b091fd60a6d6bceee535483c4d490de9ba85003af835225719261", size = 1259910, upload-time = "2026-05-17T17:47:41.369Z" },
{ url = "https://files.pythonhosted.org/packages/4e/e1/49b60f467979979cfe6913b43948ff25bca971ad0591d181812f163a988e/ast_serialize-0.5.0-cp314-cp314t-manylinux_2_31_riscv64.whl", hash = "sha256:16db7c62ec0b8efe1d7afd283a388d8f74f2605d56032e5a37747d2de8dba027", size = 1250678, upload-time = "2026-05-17T17:47:43.702Z" },
{ url = "https://files.pythonhosted.org/packages/74/ba/66ab9555de6275677566f6574e5ef6c29cb185ea866f643bc06f8280a8ee/ast_serialize-0.5.0-cp314-cp314t-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:baf5eb061eb5bccade4128ad42da33787d72f6013809cd1b590376ece8b3c937", size = 1301603, upload-time = "2026-05-17T17:47:46.256Z" },
{ url = "https://files.pythonhosted.org/packages/66/42/6aca9b9abc710014b2be9059689e5dd1679339e78f567ffb4d255a9e2050/ast_serialize-0.5.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:104e4a35bd7c124173c41760ef9aaea17ddb3f86c65cb643671d59afbe3ee94c", size = 1410332, upload-time = "2026-05-17T17:47:47.899Z" },
{ url = "https://files.pythonhosted.org/packages/47/68/2f76594432a22581ecf878b5e75a9b8601c24b2241cf0bbeb1e21fcf370c/ast_serialize-0.5.0-cp314-cp314t-musllinux_1_2_armv7l.whl", hash = "sha256:36be371028fc1675acb38a331bde160dbab7ff907fdf00b67eb6911aa106951b", size = 1509979, upload-time = "2026-05-17T17:47:50.942Z" },
{ url = "https://files.pythonhosted.org/packages/40/ac/a93c9b58292653f6c595752f677a08e608f903b710594909e9231a389b3b/ast_serialize-0.5.0-cp314-cp314t-musllinux_1_2_i686.whl", hash = "sha256:061ee58bdb52341c8201a6df41182a977736bae3b7ded87ca7176ca25a8a47ab", size = 1505002, upload-time = "2026-05-17T17:47:54.093Z" },
{ url = "https://files.pythonhosted.org/packages/14/2e/b278f68c497ee2f1d1576cbbef8db5281cd4a5f2db040537592ac9c8862e/ast_serialize-0.5.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:b15219e9cdc9f53f6f4cb51c009203507228226148c05c5e8fe451c28b435eb3", size = 1456231, upload-time = "2026-05-17T17:47:56.311Z" },
{ url = "https://files.pythonhosted.org/packages/0b/43/419be1c566a4c504cd8fd60ce2f84e790f295495c0f327cfaeadf3d51012/ast_serialize-0.5.0-cp314-cp314t-win32.whl", hash = "sha256:842d1c004bb466c7df036f95fabef789570541922b10976b12f5592a69cf0b38", size = 1058668, upload-time = "2026-05-17T17:47:58.305Z" },
{ url = "https://files.pythonhosted.org/packages/03/6f/c9d4d549295ed05111aeb8853232d1afd9d0a179fddb01eeffbb3a4a6842/ast_serialize-0.5.0-cp314-cp314t-win_amd64.whl", hash = "sha256:b0c06d760909b095cc466356dfccd05a1c7233a6ca191c020dca2c6a6f16c24c", size = 1101075, upload-time = "2026-05-17T17:48:00.35Z" },
{ url = "https://files.pythonhosted.org/packages/d0/8e/d00c5ab30c58222e07d62956fca86c59d91b9ad32997e633c38b526623a3/ast_serialize-0.5.0-cp314-cp314t-win_arm64.whl", hash = "sha256:787baedb0262cc49e8ce37cc15c00ae818e46a165a3b36f5e21ed174998104cb", size = 1075347, upload-time = "2026-05-17T17:48:01.753Z" },
{ url = "https://files.pythonhosted.org/packages/e0/9e/dc2530acb3a60dc6e46d65abf27d1d9f86721694757906a148d90a6860de/ast_serialize-0.5.0-cp39-abi3-macosx_10_12_x86_64.whl", hash = "sha256:0668aa9459cfa8c9c49ddd2163ebcf43088ba045ef7492af6fe22e0098303101", size = 1191380, upload-time = "2026-05-17T17:48:03.738Z" },
{ url = "https://files.pythonhosted.org/packages/26/0a/bd3d18a582f273d6c843d16bb9e22e9e16365ff7991e92f18f798e9f1224/ast_serialize-0.5.0-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:bf683d6363edf2b39eed6b6d4fe22d34b6203867a67e27134d9e2a2680c4bc4a", size = 1183879, upload-time = "2026-05-17T17:48:05.463Z" },
{ url = "https://files.pythonhosted.org/packages/40/ae/1f919100f8620887af58fcc381c61a1f218cdf89c6e155f87b213e61010a/ast_serialize-0.5.0-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9cc22cf0c9be65e71cf88fda130af60d61eb4a79370ad4cfe7900d48a4aa2211", size = 1244529, upload-time = "2026-05-17T17:48:07.008Z" },
{ url = "https://files.pythonhosted.org/packages/c6/ca/6376559dcce707cdbc1d0d9a13c8d3baaaa501e949ce0ebdc4230cd881aa/ast_serialize-0.5.0-cp39-abi3-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:f66173891548c9f2726bf27957b41cabce12fa679dc6da505ddbde4d4b3b31cf", size = 1240560, upload-time = "2026-05-17T17:48:08.46Z" },
{ url = "https://files.pythonhosted.org/packages/35/b2/a620e206b5aeb7efbf2710336df57d457cffbb3991076bbcc1147ef9abd4/ast_serialize-0.5.0-cp39-abi3-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e42d729ef2be96a14efbad355093284739e3670ece3e534f82cc8832790911d9", size = 1451172, upload-time = "2026-05-17T17:48:09.922Z" },
{ url = "https://files.pythonhosted.org/packages/fa/e0/4ad5c04c24a40481b2935ce9a0ccdb6023dc8b667167d06ae530cc3512f2/ast_serialize-0.5.0-cp39-abi3-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b725026bafa801dbd7310eb13a75f0a2e370e7e51b2cb225f9d21fcfadf919ee", size = 1265072, upload-time = "2026-05-17T17:48:11.469Z" },
{ url = "https://files.pythonhosted.org/packages/b2/71/4d1d479aa56d0101c40e17720c3d6ac2af7269ea0487a80b18e7bfd1a5b7/ast_serialize-0.5.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b54f60c1d78767a53b67eaa663f0dfac3afe606aa07f1301572f588b73d64809", size = 1270488, upload-time = "2026-05-17T17:48:13.575Z" },
{ url = "https://files.pythonhosted.org/packages/6d/4f/0de1bbe06f6edef9fde4ed12ca8e7b3ec7e6e2bd4e672c5af487f7957665/ast_serialize-0.5.0-cp39-abi3-manylinux_2_31_riscv64.whl", hash = "sha256:27d51654fc240a1e87e742d353d98eb45b75f62f129086b3596ab53df2ac2a43", size = 1260702, upload-time = "2026-05-17T17:48:15.141Z" },
{ url = "https://files.pythonhosted.org/packages/75/61/e00872439cfdddcc3c1b6cdaa6e5d904ba8e26a18807c67c4e14409d0ca8/ast_serialize-0.5.0-cp39-abi3-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:2782c36237c46dd1674542f2109740ea5ea485a169bf1431939ada0434e17934", size = 1311182, upload-time = "2026-05-17T17:48:16.779Z" },
{ url = "https://files.pythonhosted.org/packages/76/8e/699a5b955f7926956c95e9e1d74132acad73c2fe7a426f94da89123c20aa/ast_serialize-0.5.0-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:1943db345233cc7194a470f13afa9c59772c0b123dea0c9414c4d4ca54369759", size = 1421410, upload-time = "2026-05-17T17:48:18.527Z" },
{ url = "https://files.pythonhosted.org/packages/a9/ae/d5b7626874478997adc7a29ab28accf21e596fb590c944290401dfd0b29e/ast_serialize-0.5.0-cp39-abi3-musllinux_1_2_armv7l.whl", hash = "sha256:df1c00022cbbcb064bfaa505aa9c9295362443ce5dacb459d1331d3da353f887", size = 1516587, upload-time = "2026-05-17T17:48:20.133Z" },
{ url = "https://files.pythonhosted.org/packages/0c/ce/b59e02a82d9c4244d64cde502e0b00e83e38816abe19155ceb5437402c7f/ast_serialize-0.5.0-cp39-abi3-musllinux_1_2_i686.whl", hash = "sha256:cae65289fc456fde04af979a2be09302ef5d8ab92ef23e596d6746dc267ada27", size = 1515171, upload-time = "2026-05-17T17:48:21.921Z" },
{ url = "https://files.pythonhosted.org/packages/8b/38/d8d90042747d05aa08d4efcf1c99035a5f670a6bf4c214d31644392afbca/ast_serialize-0.5.0-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:239a4c354e8d676e9d94631d1d4a64edc6b266f86ff3a5a80aedd344f342c01d", size = 1464668, upload-time = "2026-05-17T17:48:23.544Z" },
{ url = "https://files.pythonhosted.org/packages/dd/51/5b840c4df7334104cecffa28f23904fe81ca89ca223d2450e288de39fd3c/ast_serialize-0.5.0-cp39-abi3-win32.whl", hash = "sha256:143a4ef63285a075871908fda3672dc21864b83a8ec3ee12304aa3e4c5387b9a", size = 1068311, upload-time = "2026-05-17T17:48:25.027Z" },
{ url = "https://files.pythonhosted.org/packages/41/11/ca5672c7d491825bc4cd6702dea106a6b60d928707712ec257c7833ae476/ast_serialize-0.5.0-cp39-abi3-win_amd64.whl", hash = "sha256:cf25572c526add400f26a4750dc6ce0c3bb93fc1f75e7ae0cad4ce4f2cd5c590", size = 1108931, upload-time = "2026-05-17T17:48:26.591Z" },
{ url = "https://files.pythonhosted.org/packages/45/19/cc8bd127d28a43da249aa955cfd164cf8fd534e79e42cea96c4854d72fd0/ast_serialize-0.5.0-cp39-abi3-win_arm64.whl", hash = "sha256:92a31c9c20d25a076edaeec76b128a3535d74a24f340b9a8a7e96c9b86dc9642", size = 1081181, upload-time = "2026-05-17T17:48:28.122Z" },
]
[[package]]
name = "kreuzberg"
version = "5.0.0rc1"
source = { editable = "." }
[package.dev-dependencies]
dev = [
{ name = "mypy" },
{ name = "ruff" },
]
[package.metadata]
[package.metadata.requires-dev]
dev = [
{ name = "mypy", specifier = ">=1.19" },
{ name = "ruff", specifier = ">=0.14.8" },
]
[[package]]
name = "librt"
version = "0.11.0"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/40/08/9e7f6b5d2b5bed6ad055cdd5925f192bb403a51280f86b56554d9d0699a2/librt-0.11.0.tar.gz", hash = "sha256:075dc3ef4458a278e0195cbf6ac9d38808d9b906c5a6c7f7f79c3888276a3fb1", size = 200139, upload-time = "2026-05-10T18:17:25.138Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/83/10/37fd9e9ba96cb0bd742dfb20fc3d082e54bdbec759d7300df927f360ef07/librt-0.11.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:6e94ebfcfa2d5e9926d6c3b9aa4617ffc42a845b4321fb84021b872358c82a0f", size = 141706, upload-time = "2026-05-10T18:15:16.129Z" },
{ url = "https://files.pythonhosted.org/packages/cf/72/1b1466f358e4a0b728051f69bc27e67b432c6eaa2e05b88db49d3785ae0d/librt-0.11.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:ae627397a2f351560440d872d6f7c8dbb4072e57868e7b2fc5b8b430fe489d45", size = 142605, upload-time = "2026-05-10T18:15:18.148Z" },
{ url = "https://files.pythonhosted.org/packages/ca/85/ed26dd2f6bc9a0baf48306433e579e8d354d70b2bcb78134ed950a5d0e1e/librt-0.11.0-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:dc329359321b67d24efdf4bc69012b0597001649544db662c001db5a0184794c", size = 476555, upload-time = "2026-05-10T18:15:19.569Z" },
{ url = "https://files.pythonhosted.org/packages/66/fe/11891191c0e0a3fd617724e891f6e67a71a7658974a892b9a9a97fdb2977/librt-0.11.0-cp310-cp310-manylinux2014_i686.manylinux_2_17_i686.manylinux_2_28_i686.whl", hash = "sha256:7e82e642ab0f7608ce2fe53d76ca2280a9ee33a1b06556142c7c6fe80a86fc33", size = 468434, upload-time = "2026-05-10T18:15:20.87Z" },
{ url = "https://files.pythonhosted.org/packages/6f/50/5ec949d7f9ce1a07af903aa3e13abb98b717923bdead6e719b2f824ccc07/librt-0.11.0-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:88145c15c67731d54283d135b03244028c750cc9edc334a96a4f5950ebdb2884", size = 496918, upload-time = "2026-05-10T18:15:22.616Z" },
{ url = "https://files.pythonhosted.org/packages/ea/c4/177336c7524e34875a38bf668e88b193a6723a4eb4045d07f74df6e1506c/librt-0.11.0-cp310-cp310-manylinux_2_34_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:9d36a51b3d93320b686588e27123f4995804dbf1bce81df78c02fc3c6eea9280", size = 490334, upload-time = "2026-05-10T18:15:24.2Z" },
{ url = "https://files.pythonhosted.org/packages/13/1f/da3112f7569eda3b49f9a2629bae1fe059812b6085df16c885f6454dff49/librt-0.11.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:d00f3ac06a2a8b246327f11e186a53a100a4d5c7ed52346367e5ec751d51586c", size = 511287, upload-time = "2026-05-10T18:15:26.226Z" },
{ url = "https://files.pythonhosted.org/packages/fa/94/03fec301522e172d105581431223be56b27594ff46440ebfbb658a3735d5/librt-0.11.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:461bbceede621f1ffb8839755f8663e886087ee7af16294cab7fb4d782c62eeb", size = 517202, upload-time = "2026-05-10T18:15:27.965Z" },
{ url = "https://files.pythonhosted.org/packages/b7/6e/339f6e5a7b413ce014f1917a756dae630fe59cc99f34153205b1cb540901/librt-0.11.0-cp310-cp310-musllinux_1_2_riscv64.whl", hash = "sha256:0cad8a4d6a8ff03c9b76f9414caccd78e7cfbc8a2e12fa334d8e1d9932753783", size = 497517, upload-time = "2026-05-10T18:15:29.614Z" },
{ url = "https://files.pythonhosted.org/packages/cd/43/acdd5ce317cb46e8253ca9bfbdb8b12e68a24d745949336a7f3d5fb79ba0/librt-0.11.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:f37aa505b3cf60701562eddb32df74b12a9e380c207fd8b06dd157a943ac7ea0", size = 538878, upload-time = "2026-05-10T18:15:30.928Z" },
{ url = "https://files.pythonhosted.org/packages/29/b5/7a25bb12e3172839f647f196b3e988318b7bb1ca7501732a225c4dce2ec0/librt-0.11.0-cp310-cp310-win32.whl", hash = "sha256:94663a21534637f0e787ec2a2a756022df6e5b7b2335a5cdd7d8e33d68a2af89", size = 100070, upload-time = "2026-05-10T18:15:32.551Z" },
{ url = "https://files.pythonhosted.org/packages/c6/0d/ebbcf4d77999c02c937b05d2b90ff4cd4dcc7e9a365ba132329ac1fe7a0f/librt-0.11.0-cp310-cp310-win_amd64.whl", hash = "sha256:dec7db73758c2b54953fd8b7fe348c45188fe26b39ee18446196edd08453a5d4", size = 117918, upload-time = "2026-05-10T18:15:33.678Z" },
{ url = "https://files.pythonhosted.org/packages/fe/87/2bf31fe17587b29e3f93ec31421e2b1e1c3e349b8bf6c7c313dbad1d5340/librt-0.11.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:93d95bd45b7d58343d8b90d904450a545144eec19a002511163426f8ab1fae29", size = 141092, upload-time = "2026-05-10T18:15:34.795Z" },
{ url = "https://files.pythonhosted.org/packages/cf/08/5c5bf772920b7ebac6e32bc91a643e0ab3870199c0b542356d3baa83970a/librt-0.11.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:4ee278c769a713638cdacd4c0436d72156e75df3ebc0166ab2b9dc43acc386c9", size = 142035, upload-time = "2026-05-10T18:15:36.242Z" },
{ url = "https://files.pythonhosted.org/packages/06/20/662a03d254e5b000d838e8b345d83303ddb768c080fd488e40634c0fa66b/librt-0.11.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f230cb1cbc9faaa616f9a678f530ebcf186e414b6bcbd88b960e4ba1b92428d5", size = 475022, upload-time = "2026-05-10T18:15:37.56Z" },
{ url = "https://files.pythonhosted.org/packages/de/f3/aa81523e45184c6ec23dc7f63263362ec55f80a09d424c012359ecbe7e35/librt-0.11.0-cp311-cp311-manylinux2014_i686.manylinux_2_17_i686.manylinux_2_28_i686.whl", hash = "sha256:5d63c855d86938d9de93e265c9bd8c705b51ec494de5738340ee93767a686e4b", size = 467273, upload-time = "2026-05-10T18:15:39.182Z" },
{ url = "https://files.pythonhosted.org/packages/6b/6f/59c74b560ca8853834d5501d589c8a2519f4184f273a085ffd0f37a1cc47/librt-0.11.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:993f028be9e96a08d31df3479ac80d99be374d17f3b78e4796b3fd3c913d4e89", size = 497083, upload-time = "2026-05-10T18:15:40.634Z" },
{ url = "https://files.pythonhosted.org/packages/fe/7b/5aa4d2c9600a719401160bf7055417df0b2a47439b9d88286ce45e56b65f/librt-0.11.0-cp311-cp311-manylinux_2_34_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:258d73a0aa66a055e65b2e4d1b8cdb23b9d132c5bb915d9547d804fcaed116cc", size = 489139, upload-time = "2026-05-10T18:15:41.934Z" },
{ url = "https://files.pythonhosted.org/packages/d6/31/9143803d7da6856a69153785768c4936864430eec0fd9461c3ea527d9922/librt-0.11.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:0827efe7854718f04aaddf6496e96960a956e676fe1d0f04eb41511fd8ad06d5", size = 508442, upload-time = "2026-05-10T18:15:43.206Z" },
{ url = "https://files.pythonhosted.org/packages/2f/5a/bce08184488426bda4ccc2c4964ac048c8f68ae89bd7120082eef4233cfd/librt-0.11.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:7753e57d6e12d019c0d8786f1c09c709f4c3fcc57c3887b24e36e6c06ec938b7", size = 514230, upload-time = "2026-05-10T18:15:44.761Z" },
{ url = "https://files.pythonhosted.org/packages/89/8c/bb5e213d254b7505a0e658da199d8ab719086632ce09eef311ab27976523/librt-0.11.0-cp311-cp311-musllinux_1_2_riscv64.whl", hash = "sha256:11bd19822431cc21af9f27374e7ae2e58103c7d98bda823536a6c47f6bb2bb3d", size = 494231, upload-time = "2026-05-10T18:15:46.308Z" },
{ url = "https://files.pythonhosted.org/packages/9d/fb/541cdad5b1ab1300398c74c4c9a497b88e5074c21b1244c8f49731d3a284/librt-0.11.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:22bdf239b219d3993761a148ffa134b19e52e9989c84f845d5d7b71d70a17412", size = 537585, upload-time = "2026-05-10T18:15:47.629Z" },
{ url = "https://files.pythonhosted.org/packages/8f/f2/464bb69295c320cb06bddb4f14a4ec67934ee14b2bffb12b19fb7ab287ba/librt-0.11.0-cp311-cp311-win32.whl", hash = "sha256:46c60b61e308eb535fbd6fa622b1ee1bb2815691c1ad9c98bf7b84952ec3bc8d", size = 100509, upload-time = "2026-05-10T18:15:49.157Z" },
{ url = "https://files.pythonhosted.org/packages/6d/e7/a17ee1788f9e4fbf548c19f4afa07c92089b9e24fef6cb2410863781ef4c/librt-0.11.0-cp311-cp311-win_amd64.whl", hash = "sha256:902e546ff044f579ff1c953ff5fce97b636fe9e3943996b2177710c6ef076f73", size = 118628, upload-time = "2026-05-10T18:15:50.345Z" },
{ url = "https://files.pythonhosted.org/packages/cc/c7/6c766214f9f9903bcfcfbef97d807af8d8f5aa3502d247858ab17582d212/librt-0.11.0-cp311-cp311-win_arm64.whl", hash = "sha256:65ac3bc20f78aa0ee5ae84baa68917f89fef4af63e941084dd019a0d0e749f0c", size = 103122, upload-time = "2026-05-10T18:15:52.068Z" },
{ url = "https://files.pythonhosted.org/packages/8b/d0/07c77e067f0838949b43bd89232c29d72efebb9d2801a9750184eb706b71/librt-0.11.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:b87504f1690a23b9a2cca841191a04f83895d4fc2dd04df91d82b1a04ca2ad46", size = 144147, upload-time = "2026-05-10T18:15:53.227Z" },
{ url = "https://files.pythonhosted.org/packages/7a/24/8493538fa4f62f982686398a5b8f68008138a75086abdea19ade64bf4255/librt-0.11.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40071fc5fe0ce8daa6de616702314a01e1250711682b0523d6ab8d4525910cb3", size = 143614, upload-time = "2026-05-10T18:15:54.657Z" },
{ url = "https://files.pythonhosted.org/packages/ff/1e/f8bad050810d9171f34a1648ed910e56814c2ba61639f2bd53c6377ae24b/librt-0.11.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:137e79445c896a0ea7b265f52d23954e05b64222ee1af69e2cb34219067cbb67", size = 485538, upload-time = "2026-05-10T18:15:56.117Z" },
{ url = "https://files.pythonhosted.org/packages/c0/fe/3594ebfbaf03084ba4b120c9ba5c3183fd938a48725e9bbe6ff0a5159ad8/librt-0.11.0-cp312-cp312-manylinux2014_i686.manylinux_2_17_i686.manylinux_2_28_i686.whl", hash = "sha256:cca6644054e78746d8d4ef238681f9c34ff8b584fe6b988ecebb8db3b15e622a", size = 479623, upload-time = "2026-05-10T18:15:57.544Z" },
{ url = "https://files.pythonhosted.org/packages/b0/da/5d1876984b3746c85dbd219dbfcb73c85f54ee263fd32e5b2a632ec14571/librt-0.11.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d5b0eea49f5562861ee8d757a32ef7d559c1d35be2aaaa1ec28941d74c9ffc8a", size = 513082, upload-time = "2026-05-10T18:15:58.805Z" },
{ url = "https://files.pythonhosted.org/packages/19/6e/55bdf5d5ca00c3e18430690bf2c953d8d3ffd3c337418173d33dec985dc9/librt-0.11.0-cp312-cp312-manylinux_2_34_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:0d1029d7e1ae1a7e647ed6fb5df8c4ce2dffefb7a9f5fd1376a4554d96dac09f", size = 508105, upload-time = "2026-05-10T18:16:00.2Z" },
{ url = "https://files.pythonhosted.org/packages/07/10/f1f23a7c595ee90ece4d35c851e5d104b1311a887ed1b4ac4c35bbd13da8/librt-0.11.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:bc3ce6b33c5828d9e80592011a5c584cb2ce86edbc4088405f70da47dc1d1b3b", size = 522268, upload-time = "2026-05-10T18:16:01.708Z" },
{ url = "https://files.pythonhosted.org/packages/b6/02/5720f5697a7f54b78b3aefbe20df3a48cedcff1276618c4aa481177942ed/librt-0.11.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:936c5995f3514a42111f20099397d8177c79b4d7e70961e396c6f5a0a3566766", size = 527348, upload-time = "2026-05-10T18:16:03.496Z" },
{ url = "https://files.pythonhosted.org/packages/50/db/b4a47c6f91db4ff76348a0b3dd0cc65e090a078b765a810a62ff9434c3d3/librt-0.11.0-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:9bc0ca6ad9381cbe8e4aa6e5726e4c80c78115a6e9723c599ed1d73e092bc49d", size = 516294, upload-time = "2026-05-10T18:16:05.173Z" },
{ url = "https://files.pythonhosted.org/packages/9e/58/9384b2f4eb1ed1d273d40948a7c5c4b2360213b402ef3be4641c06299f9c/librt-0.11.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:070aa8c26c0a74774317a72df8851facc7f0f012a5b406557ac56992d92e1ec8", size = 553608, upload-time = "2026-05-10T18:16:06.839Z" },
{ url = "https://files.pythonhosted.org/packages/21/7b/5aa8848a7c6a9278c79375146da1812e695754ceec5f005e6043461a7315/librt-0.11.0-cp312-cp312-win32.whl", hash = "sha256:6bf14feb84b05ae945277395451998c89c54d0def4070eb5c08de544930b245a", size = 101879, upload-time = "2026-05-10T18:16:08.103Z" },
{ url = "https://files.pythonhosted.org/packages/37/33/8a745436944947575b584231750a41417de1a38cf6a2e9251d1065651c09/librt-0.11.0-cp312-cp312-win_amd64.whl", hash = "sha256:75672f0bc524ede266287d532d7923dbce94c7514ad07627bac3d0c6d92cc4d9", size = 119831, upload-time = "2026-05-10T18:16:09.174Z" },
{ url = "https://files.pythonhosted.org/packages/59/67/a6739ac96e28b7855808bdb0370e250606104a859750d209e5a0716fe7ab/librt-0.11.0-cp312-cp312-win_arm64.whl", hash = "sha256:2f10cf143e4a9bb0f4f5af568a00df94a2d69ef41c2579584454bb0fe5cc642c", size = 103470, upload-time = "2026-05-10T18:16:10.369Z" },
{ url = "https://files.pythonhosted.org/packages/82/61/e59168d4d0bf2bf90f4f0caf7a001bfc60254c3af4586013b04dc3ef517b/librt-0.11.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:78dc31f7fdfe9c9d0eb0e8f42d139db230e826415bbcabd9f0e9faaaee909894", size = 144119, upload-time = "2026-05-10T18:16:11.771Z" },
{ url = "https://files.pythonhosted.org/packages/61/fd/caa1d60b12f7dd79ccea23054e06eeaebe266a5f52c40a6b651069200ce5/librt-0.11.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:fa475675db22290c3158e1d42326d0f5a65f04f44a0e68c3630a25b53560fb9c", size = 143565, upload-time = "2026-05-10T18:16:13.334Z" },
{ url = "https://files.pythonhosted.org/packages/b8/a9/dc744f5c2b4978d48db970be29f22716d3413d28b14ad99740817315cf2c/librt-0.11.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:621db29691044bdeda22e789e482e1b0f3a985d90e3426c9c6d17606416205ea", size = 485395, upload-time = "2026-05-10T18:16:14.729Z" },
{ url = "https://files.pythonhosted.org/packages/8f/21/7f8e97a1e4dae952a5a95948f6f8507a173bc1e669f54340bba6ca1ca31b/librt-0.11.0-cp313-cp313-manylinux2014_i686.manylinux_2_17_i686.manylinux_2_28_i686.whl", hash = "sha256:a9010e2ed5b3a9e158c5fd966b3ab7e834bb3d3aacc8f66c91dd4b57a3799230", size = 479383, upload-time = "2026-05-10T18:16:16.321Z" },
{ url = "https://files.pythonhosted.org/packages/a6/6d/d8ee9c114bebf2c50e29ec2aa940826fccb62a645c3e4c18760987d0e16d/librt-0.11.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:7c39513d8b7477a2e1ed8c43fc21c524e8d5a0f8d4e8b7b074dbdbe7820a08e2", size = 513010, upload-time = "2026-05-10T18:16:17.647Z" },
{ url = "https://files.pythonhosted.org/packages/f0/43/0b5708af2bd30a46400e72ba6bdaa8f066f15fb9a688527e34220e8d6c06/librt-0.11.0-cp313-cp313-manylinux_2_34_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:7aef3cf1d5af86e770ab04bfd993dfc4ae8b8c17f66fb77dd4a7d50de7bbb1a3", size = 508433, upload-time = "2026-05-10T18:16:19.309Z" },
{ url = "https://files.pythonhosted.org/packages/4a/50/356187247d09013490481033183b3532b58acf8028bcb34b2b56a375c9b2/librt-0.11.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:557183ddc36babe46b27dd60facbd5adb4492181a5be887587d57cda6e092f21", size = 522595, upload-time = "2026-05-10T18:16:20.642Z" },
{ url = "https://files.pythonhosted.org/packages/40/e7/c6ac4240899c7f3248079d5a9900debe0dadb3fdeaf856684c987105ba47/librt-0.11.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:83d3e1f72bd42f6c5c0b7daec530c3f829bd02db42c70b8ddf0c2d90a2459930", size = 527255, upload-time = "2026-05-10T18:16:22.352Z" },
{ url = "https://files.pythonhosted.org/packages/eb/b5/a81322dbeedeeaf9c1ee6f001734d28a09d8383ac9e6779bc24bbd0743c6/librt-0.11.0-cp313-cp313-musllinux_1_2_riscv64.whl", hash = "sha256:4ce1f21fbe589bc1afd7872dece84fb0e1144f794a288e58a10d2c54a55c43be", size = 516847, upload-time = "2026-05-10T18:16:23.627Z" },
{ url = "https://files.pythonhosted.org/packages/ae/66/6e6323787d592b55204a42595ff1102da5115601b53a7e9ddebc889a6da5/librt-0.11.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:970b09f7044ea2b64c9da42fd3d335666518cfd1c6e8a182c95da73d0214b41e", size = 553920, upload-time = "2026-05-10T18:16:25.025Z" },
{ url = "https://files.pythonhosted.org/packages/9c/21/623f8ca230857102066d9ca8c6c1734995908c4d0d1bee7bb2ef0021cb33/librt-0.11.0-cp313-cp313-win32.whl", hash = "sha256:78fddc31cd4d3caa897ad5d31f856b1faadc9474021ad6cb182b9018793e254e", size = 101898, upload-time = "2026-05-10T18:16:26.649Z" },
{ url = "https://files.pythonhosted.org/packages/b3/1d/b4ebd44dd723f768469007515cb92251e0ae286c94c140f374801140fa74/librt-0.11.0-cp313-cp313-win_amd64.whl", hash = "sha256:8ca8aa88751a775870b764e93bad5135385f563cb8dcee399abf034ea4d3cb47", size = 119812, upload-time = "2026-05-10T18:16:27.859Z" },
{ url = "https://files.pythonhosted.org/packages/3b/e4/b2f4ca7965ca373b491cdb4bc25cdb30c1649ca81a8782056a83850292a9/librt-0.11.0-cp313-cp313-win_arm64.whl", hash = "sha256:96f044bb325fd9cf1a723015638c219e9143f0dfbc0ca54c565df2b7fc748b44", size = 103448, upload-time = "2026-05-10T18:16:29.066Z" },
{ url = "https://files.pythonhosted.org/packages/29/eb/dbce197da4e227779e56b5735f2decc3eb36e55a1cdbf1bd65d6639d76c1/librt-0.11.0-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:4a017a95e5837dc15a8c5661d60e05daa96b90908b1aa6b7acdf443cd25c8ebd", size = 143345, upload-time = "2026-05-10T18:16:30.674Z" },
{ url = "https://files.pythonhosted.org/packages/76/a3/254bebd0c11c8ba684018efb8006ff22e466abce445215cca6c778e7d9de/librt-0.11.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:b1ecbd9819deccc39b7542bf4d2a740d8a620694d39989e58661d3763458f8d4", size = 143131, upload-time = "2026-05-10T18:16:32.037Z" },
{ url = "https://files.pythonhosted.org/packages/f1/3f/f77d6122d21ac7bf6ae8a7dfced1bd2a7ac545d3273ebdcaf8042f6d619f/librt-0.11.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:7da327dacd7be8f8ec36547373550744a3cc0e536d54665cd83f8bcd961200e8", size = 477024, upload-time = "2026-05-10T18:16:33.493Z" },
{ url = "https://files.pythonhosted.org/packages/ac/0a/2c996dadebaa7d9bbbd43ef2d4f3e66b6da545f838a41694ef6172cebec8/librt-0.11.0-cp314-cp314-manylinux2014_i686.manylinux_2_17_i686.manylinux_2_28_i686.whl", hash = "sha256:0dc56b1f8d06e60db362cc3fdae206681817f86ce4725d34511473487f12a34b", size = 474221, upload-time = "2026-05-10T18:16:34.864Z" },
{ url = "https://files.pythonhosted.org/packages/0a/7e/f5d92af8486b8272c23b3e686b46ff72d89c8169585eb61eef01a2ac7147/librt-0.11.0-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:05fb8fb2ab90e21c8d12ea240d744ad514da9baf381ebfa70d91d20d21713175", size = 505174, upload-time = "2026-05-10T18:16:36.705Z" },
{ url = "https://files.pythonhosted.org/packages/af/1a/cb0734fe86398eb33193ab753b7326255c74cac5eb09e76b9b16536e7adb/librt-0.11.0-cp314-cp314-manylinux_2_34_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:cae74872be221df4374d10fec61f93ed1513b9546ea84f2c0bf73ab3e9bd0b03", size = 497216, upload-time = "2026-05-10T18:16:38.418Z" },
{ url = "https://files.pythonhosted.org/packages/18/06/094820f91558b66e29943c0ec41c9914f460f48dd51fc503c3101e10842d/librt-0.11.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:32bcc918c0148eb7e3d57385125bac7e5f9e4359d05f07448b09f6f778c2f31c", size = 513921, upload-time = "2026-05-10T18:16:39.848Z" },
{ url = "https://files.pythonhosted.org/packages/0b/c2/00de9018871a282f530cacb457d5ec0428f6ac7e6fedde9aff7468d9fb04/librt-0.11.0-cp314-cp314-musllinux_1_2_i686.whl", hash = "sha256:f9743fc99135d5f78d2454435615f6dec0473ca507c26ce9d92b10b562a280d3", size = 520850, upload-time = "2026-05-10T18:16:41.471Z" },
{ url = "https://files.pythonhosted.org/packages/51/9d/64631832348fd1834fb3a61b996434edddaaf25a31d03b0a76273159d2cf/librt-0.11.0-cp314-cp314-musllinux_1_2_riscv64.whl", hash = "sha256:5ba067f4aadae8fda802d91d2124c90c42195ff32d9161d3549e6d05cfe26f96", size = 504237, upload-time = "2026-05-10T18:16:43.15Z" },
{ url = "https://files.pythonhosted.org/packages/a5/ec/ae5525eb16edc827a044e7bb8777a455ff95d4bca9379e7e6bddd7383647/librt-0.11.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:de3bf945454d032f9e390b85c4072e0a0570bf825421c8be0e71209fa65e1abe", size = 546261, upload-time = "2026-05-10T18:16:44.408Z" },
{ url = "https://files.pythonhosted.org/packages/5a/09/adce371f27ca039411da9659f7430fcc2ba6cd0c7b3e4467a0f091be7fa9/librt-0.11.0-cp314-cp314-win32.whl", hash = "sha256:d2277a05f6dcb9fd13db9566aac4fabd68c3ea1ea46ee5567d4eef8efa495a2f", size = 96965, upload-time = "2026-05-10T18:16:46.039Z" },
{ url = "https://files.pythonhosted.org/packages/d6/ee/8ac720d98548f173c7ce2e632a7ca94673f74cacd5c8162a84af5b35958a/librt-0.11.0-cp314-cp314-win_amd64.whl", hash = "sha256:ab73e8db5e3f564d812c1f5c3a175930a5f9bc96ccb5e3b22a34d7858b401cf7", size = 115151, upload-time = "2026-05-10T18:16:47.133Z" },
{ url = "https://files.pythonhosted.org/packages/94/20/c900cf14efeb09b6bef2b2dff20779f73464b97fd58d1c6bccc379588ae3/librt-0.11.0-cp314-cp314-win_arm64.whl", hash = "sha256:aea3caa317752e3a466fa8af45d91ee0ea8c7fdd96e42b0a8dd9b76a7931eba1", size = 98850, upload-time = "2026-05-10T18:16:48.597Z" },
{ url = "https://files.pythonhosted.org/packages/0c/71/944bfe4b64e12abffcd3c15e1cce07f72f3d55655083786285f4dedeb532/librt-0.11.0-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:d1b36540d7aaf9b9101b3a6f376c8d8e9f7a9aec93ed05918f2c69d493ffef72", size = 151138, upload-time = "2026-05-10T18:16:49.839Z" },
{ url = "https://files.pythonhosted.org/packages/b6/10/99e64a5c86989357fda078c8143c533389585f6473b7439172dd8f3b3b2d/librt-0.11.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:efbb343ab2ce3540f4ecbe6315d677ed70f37cd9a72b1e58066c918ca83acbaa", size = 151976, upload-time = "2026-05-10T18:16:51.062Z" },
{ url = "https://files.pythonhosted.org/packages/21/31/5072ad880946d83e5ea4147d6d018c78eefce85b77819b19bdd0ee229435/librt-0.11.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:aa0dd688aab3f7914d3e6e5e3554978e0383312fb8e771d84be008a35b9ee548", size = 557927, upload-time = "2026-05-10T18:16:52.632Z" },
{ url = "https://files.pythonhosted.org/packages/5e/8d/70b5fb7cfbab60edbe7381614ab985da58e144fbf465c86d44c95f43cdca/librt-0.11.0-cp314-cp314t-manylinux2014_i686.manylinux_2_17_i686.manylinux_2_28_i686.whl", hash = "sha256:f5fb36b8c6c63fdcbb1d526d94c0d1331610d43f4118cc1beb4efef4f3faacb2", size = 539698, upload-time = "2026-05-10T18:16:53.934Z" },
{ url = "https://files.pythonhosted.org/packages/fa/a3/ba3495a0b3edbd24a4cae0d1d3c64f39a9fc45d06e812101289b50c1a619/librt-0.11.0-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4a9a237d13addb93715b6fee74023d5ee3469b53fce527626c0e088aa585805f", size = 577162, upload-time = "2026-05-10T18:16:55.589Z" },
{ url = "https://files.pythonhosted.org/packages/f7/db/36e25fb81f99937ff1b96612a1dc9fd66f039cb9cc3aee12c01fac31aab9/librt-0.11.0-cp314-cp314t-manylinux_2_34_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:5ddd17bd87b2c56ddd60e546a7984a2e64c4e8eab92fb4cf3830a48ad5469d51", size = 566494, upload-time = "2026-05-10T18:16:56.975Z" },
{ url = "https://files.pythonhosted.org/packages/33/0d/3f622b47f0b013eeb9cf4cc07ae9bfe378d832a4eec998b2b209fe84244d/librt-0.11.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:bd43992b4473d42f12ff9e68326079f0696d9d4e6000e8f39a0238d482ba6ee2", size = 596858, upload-time = "2026-05-10T18:16:58.374Z" },
{ url = "https://files.pythonhosted.org/packages/a9/02/71b90bc93039c46a2000651f6ad60122b114c8f54c4ad306e0e96f5b75ad/librt-0.11.0-cp314-cp314t-musllinux_1_2_i686.whl", hash = "sha256:f8e3e8056dd674e279741485e2e512d6e9a751c7455809d0114e6ebf8d781085", size = 590318, upload-time = "2026-05-10T18:16:59.676Z" },
{ url = "https://files.pythonhosted.org/packages/04/04/418cb3f75621e2b761fb1ab0f017f4d70a1a72a6e7c74ee4f7e8d198c2f3/librt-0.11.0-cp314-cp314t-musllinux_1_2_riscv64.whl", hash = "sha256:c1f708d8ae9c56cf38a903c44297243d2ec83fd82b396b977e0144a3e76217e3", size = 575115, upload-time = "2026-05-10T18:17:01.007Z" },
{ url = "https://files.pythonhosted.org/packages/cc/2c/5a2183ac58dd911f26b5d7e7d7d8f1d87fcecdddd99d6c12169a258ff62c/librt-0.11.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:0add982e0e7b9fc14cf4b33789d5f13f66581889b88c2f58099f6ce8f92617bd", size = 617918, upload-time = "2026-05-10T18:17:02.682Z" },
{ url = "https://files.pythonhosted.org/packages/15/1f/dc6771a52592a4451be6effa200cbfc9cec61e4393d3033d81a9d307961d/librt-0.11.0-cp314-cp314t-win32.whl", hash = "sha256:2b481d846ac894c4e8403c5fd0e87c5d11d6499e404b474602508a224ff531c8", size = 103562, upload-time = "2026-05-10T18:17:03.99Z" },
{ url = "https://files.pythonhosted.org/packages/62/4a/7d1415567027286a75ba1093ec4aca11f073e0f559c530cf3e0a757ad55c/librt-0.11.0-cp314-cp314t-win_amd64.whl", hash = "sha256:28edb433edde181112a908c78907af28f964eabc15f4dd16c9d66c834302677c", size = 124327, upload-time = "2026-05-10T18:17:05.465Z" },
{ url = "https://files.pythonhosted.org/packages/ce/62/b40b382fa0c66fee1478073eb8db352a4a6beda4a1adccf1df911d8c289c/librt-0.11.0-cp314-cp314t-win_arm64.whl", hash = "sha256:dee008f20b542e3cd162ba338a7f9ec0f6d23d395f66fe8aeeec3c9d067ea253", size = 102572, upload-time = "2026-05-10T18:17:06.809Z" },
]
[[package]]
name = "mypy"
version = "2.1.0"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "ast-serialize" },
{ name = "librt", marker = "platform_python_implementation != 'PyPy'" },
{ name = "mypy-extensions" },
{ name = "pathspec" },
{ name = "tomli", marker = "python_full_version < '3.11'" },
{ name = "typing-extensions" },
]
sdist = { url = "https://files.pythonhosted.org/packages/82/15/cca9d88503549ed6fedeaa1d448cdddd542ee8a490232d732e278036fbf2/mypy-2.1.0.tar.gz", hash = "sha256:81e76ad12c2d804512e9b13240d1588316531bfba07558286078bfbce9613633", size = 3898359, upload-time = "2026-05-11T18:37:36.237Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/a4/71/d351dca3e9b30da2328ee9d445c88b8388072808ebfbc49eb69d30b67749/mypy-2.1.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:11a6beb180257a805961aea9ec591bbd0bd17f1e18d35b8456d57aee5bedfedc", size = 14778792, upload-time = "2026-05-11T18:36:23.605Z" },
{ url = "https://files.pythonhosted.org/packages/2f/45/7d51594b644c17c0bcf74ed8cd5fc33b324276d708e8506f220b70dab9d9/mypy-2.1.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:8ef78c1d306bbf9a8a12f526c44902c9c28dffd6c52c52bf6a72641ce18d3849", size = 13645739, upload-time = "2026-05-11T18:37:22.752Z" },
{ url = "https://files.pythonhosted.org/packages/65/01/455c31b170e9468265074840bf18863a8482a24103fdaabe4e199392aa5f/mypy-2.1.0-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c209a90853081ff01d01ee895cafe10f7db1474e0d95beaeef0f6c1db9119bbd", size = 14074199, upload-time = "2026-05-11T18:35:09.292Z" },
{ url = "https://files.pythonhosted.org/packages/41/5a/93093f0b29a9e982deafde698f740a2eb2e05886e79ccf0594c7fd5413a3/mypy-2.1.0-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:47cebf61abde7c088a4e27718a8b13a81655686b2e9c251f5c0915a802248166", size = 14953128, upload-time = "2026-05-11T18:31:57.678Z" },
{ url = "https://files.pythonhosted.org/packages/7f/2f/a196f5331d96170ad3d28f144d2aba690d4b2911381f68d51e489c7ab82a/mypy-2.1.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:d57a90ae5e872138a425ec328edbc9b235d1934c4377881a33ec05b341acc9a8", size = 15249378, upload-time = "2026-05-11T18:33:00.101Z" },
{ url = "https://files.pythonhosted.org/packages/54/de/94d321cc12da9f71341ac0c270efbed5c725750c7b4c334d957de9a087d9/mypy-2.1.0-cp310-cp310-win_amd64.whl", hash = "sha256:aea7f7a8a55b459c34275fc468ada6ca7c173a5e43a68f5dbe588a563d8a06b8", size = 11060994, upload-time = "2026-05-11T18:33:18.848Z" },
{ url = "https://files.pythonhosted.org/packages/e1/62/0c27ca55219a7c764a7fb88c7bb2b7b2f9780ade8bbf16bc8ed8400eef6b/mypy-2.1.0-cp310-cp310-win_arm64.whl", hash = "sha256:c989640253f0d76843e9c6c1bbf4bd48c5e85ada61bde4beb37cb3eca035685e", size = 9976743, upload-time = "2026-05-11T18:31:25.554Z" },
{ url = "https://files.pythonhosted.org/packages/0a/a1/639f3024794a2a15899cb90707fe02e044c4412794c39c5769fd3df2e2ef/mypy-2.1.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:a683016b16fe2f572dc04c72be7ee0504ac1605a265d0200f5cea695fb788f41", size = 14691685, upload-time = "2026-05-11T18:33:27.973Z" },
{ url = "https://files.pythonhosted.org/packages/3b/08/9a585dea4325f20d8b80dc78623fa50d1fd2173b710f6237afd6ba6ab39b/mypy-2.1.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:1a293c534adb55271fef24a26da04b855540a8c13cc07bc5917b9fd2c394f2ca", size = 13555165, upload-time = "2026-05-11T18:32:16.107Z" },
{ url = "https://files.pythonhosted.org/packages/81/dc/7c42cc9c6cb01e8eb09961f1f738741d3e9c7e9d5c5b30ec69222625cd5f/mypy-2.1.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:7406f4d048e71e576f5356d317e5b0a9e666dfd966bd99f9d14ca06e1a341538", size = 13994376, upload-time = "2026-05-11T18:32:39.256Z" },
{ url = "https://files.pythonhosted.org/packages/d4/fa/285946c33bce716e082c11dfeee9ee196eaf1f5042efb3581a31f9f205e4/mypy-2.1.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:e0210d626fc8b31ccc90233754c7bc90e1f43205e85d96387f7db1285b55c398", size = 14864618, upload-time = "2026-05-11T18:34:49.765Z" },
{ url = "https://files.pythonhosted.org/packages/2b/83/82397f48af6c27e295d57979ded8490c9829040152cf7571b2f026aeb9a0/mypy-2.1.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:3712c20deed54e814eaaa825603bada8ea1c390670a397c95b98405347acc563", size = 15102063, upload-time = "2026-05-11T18:34:05.855Z" },
{ url = "https://files.pythonhosted.org/packages/40/68/b02dec39057b88eb03dc0aa854732e26e8361f34f9d0e20c7614967d1eba/mypy-2.1.0-cp311-cp311-win_amd64.whl", hash = "sha256:fcaa0e479066e31f7cceb6a3bea39cb22b2ff51a6b2f24f193d19179ba17c389", size = 11060564, upload-time = "2026-05-11T18:35:36.494Z" },
{ url = "https://files.pythonhosted.org/packages/cf/a8/ea3dcbef31f99b634f2ee23bb0321cbc8c1b388b76a861eb849f13c347dc/mypy-2.1.0-cp311-cp311-win_arm64.whl", hash = "sha256:0b1a5260c95aa443083f9ed3592662941951bca3d4ca224a5dc517c38b7cf666", size = 9966983, upload-time = "2026-05-11T18:37:14.139Z" },
{ url = "https://files.pythonhosted.org/packages/95/b1/55861beb5c339b44f9a2ba92df9e2cb1eeb4ae1eee674cdf7772c797778b/mypy-2.1.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:244358bf1c0da7722230bce60683d52e8e9fd030554926f15b747a84efb5b3af", size = 14874381, upload-time = "2026-05-11T18:37:31.784Z" },
{ url = "https://files.pythonhosted.org/packages/0b/b3/b7f770114b7d0ac92d0f76e8d93c2780844a70488a90e91821927850da86/mypy-2.1.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:4ec7c57657493c7a75534df2751c8ae2cda383c16ecc55d2106c54476b1b16f6", size = 13665501, upload-time = "2026-05-11T18:34:23.063Z" },
{ url = "https://files.pythonhosted.org/packages/b6/f3/8ae2037967e2126689a0c11d99e2b707134a565191e92c60ca2572aec60a/mypy-2.1.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d8161b6ff4392410023224f0969d17db93e1e154bc3e4ba62598e720723ae211", size = 14045750, upload-time = "2026-05-11T18:31:48.151Z" },
{ url = "https://files.pythonhosted.org/packages/a0/32/615eb5911859e43d054941b0d0a7d06cfa2870eba86529cf385b052b111c/mypy-2.1.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:bf03e12003084a67395184d3eb8cbd6a489dc3655b5664b28c210a9e2403ab0b", size = 15061630, upload-time = "2026-05-11T18:37:06.898Z" },
{ url = "https://files.pythonhosted.org/packages/d4/03/4eafbfff8bfab1b87082741eae6e6a624028c984e6708b73bce2a8570c9d/mypy-2.1.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:20509760fd791c51579d573153407d226385ec1f8bcce55d730b354f3336bc22", size = 15288831, upload-time = "2026-05-11T18:31:18.07Z" },
{ url = "https://files.pythonhosted.org/packages/99/ee/919661478e5891a3c96e549c036e467e64563ab85995b10c53c8358e16a3/mypy-2.1.0-cp312-cp312-win_amd64.whl", hash = "sha256:6753d0c1fdd6b1a23b9e4f283ce80b2153b724adcb2653b20b85a8a28ac6436b", size = 11135228, upload-time = "2026-05-11T18:34:31.23Z" },
{ url = "https://files.pythonhosted.org/packages/24/0a/6a12b9782ca0831a553192f351679f4548abc9d19a7cc93bb7feb02084c7/mypy-2.1.0-cp312-cp312-win_arm64.whl", hash = "sha256:98ebb6589bb3b6d0c6f0c459d53ca55b8091fbc13d277c4041c885392e8195e8", size = 10040684, upload-time = "2026-05-11T18:36:48.199Z" },
{ url = "https://files.pythonhosted.org/packages/6e/dd/c7191469c777f07689c032a8f7326e393ea34c92d6d76eb7ce5ba57ea66d/mypy-2.1.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:35aac3bb114e03888f535d5eb51b8bafbb3266586b599da1940f9b1be3ec5bd5", size = 14852174, upload-time = "2026-05-11T18:31:38.929Z" },
{ url = "https://files.pythonhosted.org/packages/55/8c/aed55408879043d72bb9135f4d0d19a02b886dd569631e113e3d2706cb8d/mypy-2.1.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:8de55a8c861f2a49331f807be98d90caeceeef520bde13d43a160207f8af613e", size = 13651542, upload-time = "2026-05-11T18:36:04.636Z" },
{ url = "https://files.pythonhosted.org/packages/3a/8e/f371a824b1f1fa8ea6e3dbb8703d232977d572be2329554a3bc4d960302f/mypy-2.1.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5fdf2941a07434af755837d9880f7d7d25f1dacb1af9dcd4b9b66f2220a3024e", size = 14033929, upload-time = "2026-05-11T18:35:55.742Z" },
{ url = "https://files.pythonhosted.org/packages/94/21/f54be870d6dd53a82c674407e0f8eed7174b05ec78d42e5abd7b42e84fd5/mypy-2.1.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:e195b817c13f02352a9c124301f9f30f078405444679b6753c1b96b6eed37285", size = 15039200, upload-time = "2026-05-11T18:33:10.281Z" },
{ url = "https://files.pythonhosted.org/packages/17/99/bf21748626a40ce59fd29a39386ab46afec88b7bd2f0fa6c3a97c995523f/mypy-2.1.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:5431d42af987ebd92ba2f71d45c85ed41d8e6ca9f5fd209a69f68f707d2469e5", size = 15272690, upload-time = "2026-05-11T18:32:07.205Z" },
{ url = "https://files.pythonhosted.org/packages/d6/d7/9e90d2cf47100bea550ed2bc7b0d4de3a62181d84d5e37da0003e8462637/mypy-2.1.0-cp313-cp313-win_amd64.whl", hash = "sha256:767fe8c66dc3e01e19e1737d4c38ebefead16125e1b8e58ad421903b376f5c65", size = 11147435, upload-time = "2026-05-11T18:33:56.477Z" },
{ url = "https://files.pythonhosted.org/packages/ec/46/e5c449e858798e35ffc90946282a27c62a77be743fe17480e4977374eb91/mypy-2.1.0-cp313-cp313-win_arm64.whl", hash = "sha256:ecfe70d43775ab99562ab128ce49854a362044c9f894961f68f898c23cb7429d", size = 10035052, upload-time = "2026-05-11T18:32:30.049Z" },
{ url = "https://files.pythonhosted.org/packages/b0/ca/b279a672e874aedd5498ae25f722dacc8aa86bbffb939b3f97cbb1cf6686/mypy-2.1.0-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:7354c5a7f69d9345c3d6e69921d57088eea3ddeeb6b20d34c1b3855b02c36ec2", size = 14848422, upload-time = "2026-05-11T18:35:45.984Z" },
{ url = "https://files.pythonhosted.org/packages/27/e6/3efe56c631d959b9b4454e208b0ac4b7f4f58b404c89f8bec7b49efdfc21/mypy-2.1.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:49890d4f76ac9e06ec117f9e09f3174da70a620a0c300953d8595c926e80947f", size = 13677374, upload-time = "2026-05-11T18:36:57.188Z" },
{ url = "https://files.pythonhosted.org/packages/84/7f/8107ea87a44fd1f1b59882442f033c9c3488c127201b1d1d15f1cbd6022e/mypy-2.1.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:761be68e023ef5d94678772396a8af1220030f80837a3afd8d0aef3b419666f4", size = 14055743, upload-time = "2026-05-11T18:35:18.361Z" },
{ url = "https://files.pythonhosted.org/packages/51/4d/b6d34db183133b83761b9199a82d31557cdbb70a380d8c3b3438e11882a3/mypy-2.1.0-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c90345fc182dc363b891350457ec69c35140858538f38b4540845afcc32b1aef", size = 15020937, upload-time = "2026-05-11T18:34:59.618Z" },
{ url = "https://files.pythonhosted.org/packages/ff/d7/f08360c691d758acb02f45022c34d98b92892f4ea756644e1000d4b9f3d8/mypy-2.1.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:b84802e7b5a6daf1f5e15bc9fcd7ddae77be13981ffab037f1c67bb84d67d135", size = 15253371, upload-time = "2026-05-11T18:36:41.081Z" },
{ url = "https://files.pythonhosted.org/packages/67/1b/09460a13719530a19bce27bd3bc8449e83569dd2ba7faf51c9c3c30c0b61/mypy-2.1.0-cp314-cp314-win_amd64.whl", hash = "sha256:022c771234936ceac541ebaf836fe9e2abeb3f5e09aff21588fe543ff006fe21", size = 11326429, upload-time = "2026-05-11T18:34:13.526Z" },
{ url = "https://files.pythonhosted.org/packages/40/62/75dbf0f82f7b6680340efc614af29dd0b3c17b8a4f1cd09b8bd2fd6bc814/mypy-2.1.0-cp314-cp314-win_arm64.whl", hash = "sha256:498207db725cec88829a6a5c2fc771205fd043719ef98bc49aba8fb9fc4e6d57", size = 10218799, upload-time = "2026-05-11T18:32:23.491Z" },
{ url = "https://files.pythonhosted.org/packages/b2/66/caca04ed7d972fb6eb6dd1ccd6df1de5c38fae8c5b3dc1c4e8e0d85ee6b9/mypy-2.1.0-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:7d5e5cad0efeba72b93cd17490cc0d69c5ac9ca132994fe3fb0314808aeeb83e", size = 15923458, upload-time = "2026-05-11T18:35:28.64Z" },
{ url = "https://files.pythonhosted.org/packages/ed/52/2d90cbe49d014b13ed7ff337930c30bad35893fe38a1e4641e756bb62191/mypy-2.1.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:ff715050c127d724fd260a2e666e7747fdd83511c0c47d449d98238970aef780", size = 14757697, upload-time = "2026-05-11T18:36:14.208Z" },
{ url = "https://files.pythonhosted.org/packages/ac/37/d98f4a14e081b238992d0ed96b6d39c7cc0148c9699eb71eaa68629665ea/mypy-2.1.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:82208da9e09414d520e912d3e462d454854bed0810b71540bb016dcbca7308fd", size = 15405638, upload-time = "2026-05-11T18:33:48.249Z" },
{ url = "https://files.pythonhosted.org/packages/a3/c2/15c46613b24a84fad2aea1248bf9619b99c2767ae9071fe224c179a0b7d4/mypy-2.1.0-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:e79ebc1b904b84f0310dff7469655a9c36c7a68bddb37bdd42b67a332df61d08", size = 16215852, upload-time = "2026-05-11T18:32:50.296Z" },
{ url = "https://files.pythonhosted.org/packages/5c/90/9c16a57f482c76d25f6379762b56bbf65c711d8158cf271fb2802cfb0640/mypy-2.1.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:e583edc957cfb0deb142079162ae826f58449b116c1d442f2d91c69d9fced081", size = 16452695, upload-time = "2026-05-11T18:33:38.182Z" },
{ url = "https://files.pythonhosted.org/packages/0f/4c/215a4eeb63cacc5f17f516691ea7285d11e249802b942476bff15922a314/mypy-2.1.0-cp314-cp314t-win_amd64.whl", hash = "sha256:b33b6cd332695bba180d55e717a79d3038e479a2c49cc5eb3d53603409b9a5d7", size = 12866622, upload-time = "2026-05-11T18:34:39.945Z" },
{ url = "https://files.pythonhosted.org/packages/4b/50/1043e1db5f455ffe4c9ab22747cd8ca2bc492b1e4f4e21b130a44ee2b217/mypy-2.1.0-cp314-cp314t-win_arm64.whl", hash = "sha256:4f910fe825376a7b66ef7ca8c98e5a149e8cd64c19ae71d84047a74ee060d4e6", size = 10610798, upload-time = "2026-05-11T18:36:31.444Z" },
{ url = "https://files.pythonhosted.org/packages/0d/2a/13ca1f292f6db1b98ff495ef3467736b331621c5917cad984b7043e7348d/mypy-2.1.0-py3-none-any.whl", hash = "sha256:a663814603a5c563fb87a4f96fb473eeb30d1f5a4885afcf44f9db000a366289", size = 2693302, upload-time = "2026-05-11T18:31:29.246Z" },
]
[[package]]
name = "mypy-extensions"
version = "1.1.0"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/a2/6e/371856a3fb9d31ca8dac321cda606860fa4548858c0cc45d9d1d4ca2628b/mypy_extensions-1.1.0.tar.gz", hash = "sha256:52e68efc3284861e772bbcd66823fde5ae21fd2fdb51c62a211403730b916558", size = 6343, upload-time = "2025-04-22T14:54:24.164Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/79/7b/2c79738432f5c924bef5071f933bcc9efd0473bac3b4aa584a6f7c1c8df8/mypy_extensions-1.1.0-py3-none-any.whl", hash = "sha256:1be4cccdb0f2482337c4743e60421de3a356cd97508abadd57d47403e94f5505", size = 4963, upload-time = "2025-04-22T14:54:22.983Z" },
]
[[package]]
name = "pathspec"
version = "1.1.1"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/5a/82/42f767fc1c1143d6fd36efb827202a2d997a375e160a71eb2888a925aac1/pathspec-1.1.1.tar.gz", hash = "sha256:17db5ecd524104a120e173814c90367a96a98d07c45b2e10c2f3919fff91bf5a", size = 135180, upload-time = "2026-04-27T01:46:08.907Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/f1/d9/7fb5aa316bc299258e68c73ba3bddbc499654a07f151cba08f6153988714/pathspec-1.1.1-py3-none-any.whl", hash = "sha256:a00ce642f577bf7f473932318056212bc4f8bfdf53128c78bbd5af0b9b20b189", size = 57328, upload-time = "2026-04-27T01:46:07.06Z" },
]
[[package]]
name = "ruff"
version = "0.15.14"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/dc/8a/8bce2894573e9dae6ff4d77fe34ad727d79b9e6238ad288c5638990d90f6/ruff-0.15.14.tar.gz", hash = "sha256:48e866b165be4a9bdbf310f7d3c9a07edef2fe8cd63ffeb4e00bb590506ebf9f", size = 4700910, upload-time = "2026-05-21T14:34:55.177Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/b9/c8/74a92c6ff9fcfb4f1f947126d3ebee8389276e161ecc85de5bda7cda51bd/ruff-0.15.14-py3-none-linux_armv6l.whl", hash = "sha256:8dd2db9416e487c8d4b01fa7056bb02c4d05969d4f8d17a08c229c2f4ff3c108", size = 10739177, upload-time = "2026-05-21T14:34:37.332Z" },
{ url = "https://files.pythonhosted.org/packages/45/91/254a35c20acc38a7223c9d2d594af12e794432464f2cdeb52af1dc4a892d/ruff-0.15.14-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:be4ff55af755bd71a00ab3dc6bd7ffc467bd76e0df6881e286c2e3d23e8fb43b", size = 11144969, upload-time = "2026-05-21T14:34:43.978Z" },
{ url = "https://files.pythonhosted.org/packages/56/9e/d13e40f83b8d0a94430e6778ce1d94a43b38cf2efe63278bdd2b4c65abbf/ruff-0.15.14-py3-none-macosx_11_0_arm64.whl", hash = "sha256:48d5909d7d06276ce7dde6d32bfa4b0d4cb2651145cd8ee4b440722cbc77832f", size = 10478207, upload-time = "2026-05-21T14:34:48.378Z" },
{ url = "https://files.pythonhosted.org/packages/8d/f1/b15a7839fa4f332f8acec78e20564f26bb2d866e3d21710b877fd0263000/ruff-0.15.14-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ca8cbfa94c4f90984a67561978602746d4cd27103568f745fa90eee3f0d4107d", size = 10818459, upload-time = "2026-05-21T14:34:22.318Z" },
{ url = "https://files.pythonhosted.org/packages/45/33/53d651177f84f94b400a0e27f8824eeada3dddc9d5ee8aeb048f4352a520/ruff-0.15.14-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:9a6bbc0333f1ab053423bcbf6226477d266ca7cec7738c4c8e3f55647803f3c4", size = 10541800, upload-time = "2026-05-21T14:34:20.209Z" },
{ url = "https://files.pythonhosted.org/packages/b8/a6/868f87e0bf9786ed24b5d0d0ad8676b8a94fd1912f42cddf9cfc7857818a/ruff-0.15.14-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8a24a4f7605d7003a6674d4387651effd939dead3fddd0f36561eb77a9a2e542", size = 11342149, upload-time = "2026-05-21T14:34:46.365Z" },
{ url = "https://files.pythonhosted.org/packages/a7/8b/38cd5c19faffdcc05a408d2b78edccc69492ab9720eadb49ea15ef80d768/ruff-0.15.14-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:049b5326e53ed80978f2fc041a280603f69dd6b0c95464342a2bb4572d9d9e2f", size = 12212563, upload-time = "2026-05-21T14:34:28.579Z" },
{ url = "https://files.pythonhosted.org/packages/3e/4d/a3c5b874a556d5731e3e657aaf04311bb76f0a5c3ec220ed43051be6b64b/ruff-0.15.14-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d4ed42e6696c8dfa5f06728e6441993901f548eb92d73bc472cb5a38d1395fbf", size = 11493299, upload-time = "2026-05-21T14:34:41.836Z" },
{ url = "https://files.pythonhosted.org/packages/1e/c0/56472c251d09858a53e51efbd485b09e1995d8731668b76d52e5dd6ee0f1/ruff-0.15.14-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:715c543cf450c4888251f91c52f1942a800541d9bddd7ac060aa4e6b77ae7cba", size = 11455931, upload-time = "2026-05-21T14:34:57.276Z" },
{ url = "https://files.pythonhosted.org/packages/2c/4a/e2e7b4d8dbf233d4eace59c75bc3435fa6d8bd3bae82d351d4e4300c0fd1/ruff-0.15.14-py3-none-manylinux_2_31_riscv64.whl", hash = "sha256:72ebab6013ec887d439d8b7593737a0a4ffb06d45d209d4e4bf2e92813082d3f", size = 11400794, upload-time = "2026-05-21T14:34:39.773Z" },
{ url = "https://files.pythonhosted.org/packages/97/c7/83c0539fe34c3e09136204d1e75d6052492364e0b3cb05e9465423f567d7/ruff-0.15.14-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:49072d36abdbe97a8dd7f480afe9c675699c0c495d4c84076e2c1203c4550581", size = 10804759, upload-time = "2026-05-21T14:34:31.045Z" },
{ url = "https://files.pythonhosted.org/packages/86/a6/18f2bfc095a2ab4a78745644e428205532ce6653a5d0fa8501572891534d/ruff-0.15.14-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:958522aee105068640c2c2ceae08f413ae44d922f52a1374ac13d6a96032fc93", size = 10539517, upload-time = "2026-05-21T14:34:53.064Z" },
{ url = "https://files.pythonhosted.org/packages/54/3a/5a8b3b69c654d4e4bf1d246ac5b49cbcdac6eaab6905925f8915f31e3b80/ruff-0.15.14-py3-none-musllinux_1_2_i686.whl", hash = "sha256:f3707da619a143a2e8830e2abab8224478d69ace2d28cb6c20543ae97c36bf61", size = 11065169, upload-time = "2026-05-21T14:34:24.484Z" },
{ url = "https://files.pythonhosted.org/packages/ed/c5/8864e4e7925b836ea354b31d57641ec03830564e281a8b6f061f8c3e0ec1/ruff-0.15.14-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:bb01d645694e3ec0102105d07ef2d53703970407d59c04e59d3ba0b7a1d53553", size = 11560214, upload-time = "2026-05-21T14:34:50.975Z" },
{ url = "https://files.pythonhosted.org/packages/36/38/012bf76752e1f89ed50b77b99532d90f3a3e287bc7918e1fc0948ac866ac/ruff-0.15.14-py3-none-win32.whl", hash = "sha256:6d0c1ad2a0ab718d39b6d8fd2217981ce4d625cd96a720095f798fb47d8b13e6", size = 10805548, upload-time = "2026-05-21T14:34:33.453Z" },
{ url = "https://files.pythonhosted.org/packages/d1/b7/4ea2c170f10ad760fff2a5250beb18897719dc8b52b53a24cddbb9dd3f19/ruff-0.15.14-py3-none-win_amd64.whl", hash = "sha256:802342981e056db3851a7836e5b070f8f15f67d4a685ae2a6160939d364b2902", size = 11939523, upload-time = "2026-05-21T14:34:18.077Z" },
{ url = "https://files.pythonhosted.org/packages/62/d5/bc97ff895ec35cf3925d4bd60f3b39d822f377a446906ec9bcc87405e59b/ruff-0.15.14-py3-none-win_arm64.whl", hash = "sha256:ff47b90a9ef6a40c9e2f3b479c1fb78531adf055b94c1eba0a7ba04b31951826", size = 11208607, upload-time = "2026-05-21T14:34:26.525Z" },
]
[[package]]
name = "tomli"
version = "2.4.1"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/22/de/48c59722572767841493b26183a0d1cc411d54fd759c5607c4590b6563a6/tomli-2.4.1.tar.gz", hash = "sha256:7c7e1a961a0b2f2472c1ac5b69affa0ae1132c39adcb67aba98568702b9cc23f", size = 17543, upload-time = "2026-03-25T20:22:03.828Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/f4/11/db3d5885d8528263d8adc260bb2d28ebf1270b96e98f0e0268d32b8d9900/tomli-2.4.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:f8f0fc26ec2cc2b965b7a3b87cd19c5c6b8c5e5f436b984e85f486d652285c30", size = 154704, upload-time = "2026-03-25T20:21:10.473Z" },
{ url = "https://files.pythonhosted.org/packages/6d/f7/675db52c7e46064a9aa928885a9b20f4124ecb9bc2e1ce74c9106648d202/tomli-2.4.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:4ab97e64ccda8756376892c53a72bd1f964e519c77236368527f758fbc36a53a", size = 149454, upload-time = "2026-03-25T20:21:12.036Z" },
{ url = "https://files.pythonhosted.org/packages/61/71/81c50943cf953efa35bce7646caab3cf457a7d8c030b27cfb40d7235f9ee/tomli-2.4.1-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:96481a5786729fd470164b47cdb3e0e58062a496f455ee41b4403be77cb5a076", size = 237561, upload-time = "2026-03-25T20:21:13.098Z" },
{ url = "https://files.pythonhosted.org/packages/48/c1/f41d9cb618acccca7df82aaf682f9b49013c9397212cb9f53219e3abac37/tomli-2.4.1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5a881ab208c0baf688221f8cecc5401bd291d67e38a1ac884d6736cbcd8247e9", size = 243824, upload-time = "2026-03-25T20:21:14.569Z" },
{ url = "https://files.pythonhosted.org/packages/22/e4/5a816ecdd1f8ca51fb756ef684b90f2780afc52fc67f987e3c61d800a46d/tomli-2.4.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:47149d5bd38761ac8be13a84864bf0b7b70bc051806bc3669ab1cbc56216b23c", size = 242227, upload-time = "2026-03-25T20:21:15.712Z" },
{ url = "https://files.pythonhosted.org/packages/6b/49/2b2a0ef529aa6eec245d25f0c703e020a73955ad7edf73e7f54ddc608aa5/tomli-2.4.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:ec9bfaf3ad2df51ace80688143a6a4ebc09a248f6ff781a9945e51937008fcbc", size = 247859, upload-time = "2026-03-25T20:21:17.001Z" },
{ url = "https://files.pythonhosted.org/packages/83/bd/6c1a630eaca337e1e78c5903104f831bda934c426f9231429396ce3c3467/tomli-2.4.1-cp311-cp311-win32.whl", hash = "sha256:ff2983983d34813c1aeb0fa89091e76c3a22889ee83ab27c5eeb45100560c049", size = 97204, upload-time = "2026-03-25T20:21:18.079Z" },
{ url = "https://files.pythonhosted.org/packages/42/59/71461df1a885647e10b6bb7802d0b8e66480c61f3f43079e0dcd315b3954/tomli-2.4.1-cp311-cp311-win_amd64.whl", hash = "sha256:5ee18d9ebdb417e384b58fe414e8d6af9f4e7a0ae761519fb50f721de398dd4e", size = 108084, upload-time = "2026-03-25T20:21:18.978Z" },
{ url = "https://files.pythonhosted.org/packages/b8/83/dceca96142499c069475b790e7913b1044c1a4337e700751f48ed723f883/tomli-2.4.1-cp311-cp311-win_arm64.whl", hash = "sha256:c2541745709bad0264b7d4705ad453b76ccd191e64aa6f0fc66b69a293a45ece", size = 95285, upload-time = "2026-03-25T20:21:20.309Z" },
{ url = "https://files.pythonhosted.org/packages/c1/ba/42f134a3fe2b370f555f44b1d72feebb94debcab01676bf918d0cb70e9aa/tomli-2.4.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:c742f741d58a28940ce01d58f0ab2ea3ced8b12402f162f4d534dfe18ba1cd6a", size = 155924, upload-time = "2026-03-25T20:21:21.626Z" },
{ url = "https://files.pythonhosted.org/packages/dc/c7/62d7a17c26487ade21c5422b646110f2162f1fcc95980ef7f63e73c68f14/tomli-2.4.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:7f86fd587c4ed9dd76f318225e7d9b29cfc5a9d43de44e5754db8d1128487085", size = 150018, upload-time = "2026-03-25T20:21:23.002Z" },
{ url = "https://files.pythonhosted.org/packages/5c/05/79d13d7c15f13bdef410bdd49a6485b1c37d28968314eabee452c22a7fda/tomli-2.4.1-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ff18e6a727ee0ab0388507b89d1bc6a22b138d1e2fa56d1ad494586d61d2eae9", size = 244948, upload-time = "2026-03-25T20:21:24.04Z" },
{ url = "https://files.pythonhosted.org/packages/10/90/d62ce007a1c80d0b2c93e02cab211224756240884751b94ca72df8a875ca/tomli-2.4.1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:136443dbd7e1dee43c68ac2694fde36b2849865fa258d39bf822c10e8068eac5", size = 253341, upload-time = "2026-03-25T20:21:25.177Z" },
{ url = "https://files.pythonhosted.org/packages/1a/7e/caf6496d60152ad4ed09282c1885cca4eea150bfd007da84aea07bcc0a3e/tomli-2.4.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:5e262d41726bc187e69af7825504c933b6794dc3fbd5945e41a79bb14c31f585", size = 248159, upload-time = "2026-03-25T20:21:26.364Z" },
{ url = "https://files.pythonhosted.org/packages/99/e7/c6f69c3120de34bbd882c6fba7975f3d7a746e9218e56ab46a1bc4b42552/tomli-2.4.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:5cb41aa38891e073ee49d55fbc7839cfdb2bc0e600add13874d048c94aadddd1", size = 253290, upload-time = "2026-03-25T20:21:27.46Z" },
{ url = "https://files.pythonhosted.org/packages/d6/2f/4a3c322f22c5c66c4b836ec58211641a4067364f5dcdd7b974b4c5da300c/tomli-2.4.1-cp312-cp312-win32.whl", hash = "sha256:da25dc3563bff5965356133435b757a795a17b17d01dbc0f42fb32447ddfd917", size = 98141, upload-time = "2026-03-25T20:21:28.492Z" },
{ url = "https://files.pythonhosted.org/packages/24/22/4daacd05391b92c55759d55eaee21e1dfaea86ce5c571f10083360adf534/tomli-2.4.1-cp312-cp312-win_amd64.whl", hash = "sha256:52c8ef851d9a240f11a88c003eacb03c31fc1c9c4ec64a99a0f922b93874fda9", size = 108847, upload-time = "2026-03-25T20:21:29.386Z" },
{ url = "https://files.pythonhosted.org/packages/68/fd/70e768887666ddd9e9f5d85129e84910f2db2796f9096aa02b721a53098d/tomli-2.4.1-cp312-cp312-win_arm64.whl", hash = "sha256:f758f1b9299d059cc3f6546ae2af89670cb1c4d48ea29c3cacc4fe7de3058257", size = 95088, upload-time = "2026-03-25T20:21:30.677Z" },
{ url = "https://files.pythonhosted.org/packages/07/06/b823a7e818c756d9a7123ba2cda7d07bc2dd32835648d1a7b7b7a05d848d/tomli-2.4.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:36d2bd2ad5fb9eaddba5226aa02c8ec3fa4f192631e347b3ed28186d43be6b54", size = 155866, upload-time = "2026-03-25T20:21:31.65Z" },
{ url = "https://files.pythonhosted.org/packages/14/6f/12645cf7f08e1a20c7eb8c297c6f11d31c1b50f316a7e7e1e1de6e2e7b7e/tomli-2.4.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:eb0dc4e38e6a1fd579e5d50369aa2e10acfc9cace504579b2faabb478e76941a", size = 149887, upload-time = "2026-03-25T20:21:33.028Z" },
{ url = "https://files.pythonhosted.org/packages/5c/e0/90637574e5e7212c09099c67ad349b04ec4d6020324539297b634a0192b0/tomli-2.4.1-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c7f2c7f2b9ca6bdeef8f0fa897f8e05085923eb091721675170254cbc5b02897", size = 243704, upload-time = "2026-03-25T20:21:34.51Z" },
{ url = "https://files.pythonhosted.org/packages/10/8f/d3ddb16c5a4befdf31a23307f72828686ab2096f068eaf56631e136c1fdd/tomli-2.4.1-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:f3c6818a1a86dd6dca7ddcaaf76947d5ba31aecc28cb1b67009a5877c9a64f3f", size = 251628, upload-time = "2026-03-25T20:21:36.012Z" },
{ url = "https://files.pythonhosted.org/packages/e3/f1/dbeeb9116715abee2485bf0a12d07a8f31af94d71608c171c45f64c0469d/tomli-2.4.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:d312ef37c91508b0ab2cee7da26ec0b3ed2f03ce12bd87a588d771ae15dcf82d", size = 247180, upload-time = "2026-03-25T20:21:37.136Z" },
{ url = "https://files.pythonhosted.org/packages/d3/74/16336ffd19ed4da28a70959f92f506233bd7cfc2332b20bdb01591e8b1d1/tomli-2.4.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:51529d40e3ca50046d7606fa99ce3956a617f9b36380da3b7f0dd3dd28e68cb5", size = 251674, upload-time = "2026-03-25T20:21:38.298Z" },
{ url = "https://files.pythonhosted.org/packages/16/f9/229fa3434c590ddf6c0aa9af64d3af4b752540686cace29e6281e3458469/tomli-2.4.1-cp313-cp313-win32.whl", hash = "sha256:2190f2e9dd7508d2a90ded5ed369255980a1bcdd58e52f7fe24b8162bf9fedbd", size = 97976, upload-time = "2026-03-25T20:21:39.316Z" },
{ url = "https://files.pythonhosted.org/packages/6a/1e/71dfd96bcc1c775420cb8befe7a9d35f2e5b1309798f009dca17b7708c1e/tomli-2.4.1-cp313-cp313-win_amd64.whl", hash = "sha256:8d65a2fbf9d2f8352685bc1364177ee3923d6baf5e7f43ea4959d7d8bc326a36", size = 108755, upload-time = "2026-03-25T20:21:40.248Z" },
{ url = "https://files.pythonhosted.org/packages/83/7a/d34f422a021d62420b78f5c538e5b102f62bea616d1d75a13f0a88acb04a/tomli-2.4.1-cp313-cp313-win_arm64.whl", hash = "sha256:4b605484e43cdc43f0954ddae319fb75f04cc10dd80d830540060ee7cd0243cd", size = 95265, upload-time = "2026-03-25T20:21:41.219Z" },
{ url = "https://files.pythonhosted.org/packages/3c/fb/9a5c8d27dbab540869f7c1f8eb0abb3244189ce780ba9cd73f3770662072/tomli-2.4.1-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:fd0409a3653af6c147209d267a0e4243f0ae46b011aa978b1080359fddc9b6cf", size = 155726, upload-time = "2026-03-25T20:21:42.23Z" },
{ url = "https://files.pythonhosted.org/packages/62/05/d2f816630cc771ad836af54f5001f47a6f611d2d39535364f148b6a92d6b/tomli-2.4.1-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:a120733b01c45e9a0c34aeef92bf0cf1d56cfe81ed9d47d562f9ed591a9828ac", size = 149859, upload-time = "2026-03-25T20:21:43.386Z" },
{ url = "https://files.pythonhosted.org/packages/ce/48/66341bdb858ad9bd0ceab5a86f90eddab127cf8b046418009f2125630ecb/tomli-2.4.1-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:559db847dc486944896521f68d8190be1c9e719fced785720d2216fe7022b662", size = 244713, upload-time = "2026-03-25T20:21:44.474Z" },
{ url = "https://files.pythonhosted.org/packages/df/6d/c5fad00d82b3c7a3ab6189bd4b10e60466f22cfe8a08a9394185c8a8111c/tomli-2.4.1-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:01f520d4f53ef97964a240a035ec2a869fe1a37dde002b57ebc4417a27ccd853", size = 252084, upload-time = "2026-03-25T20:21:45.62Z" },
{ url = "https://files.pythonhosted.org/packages/00/71/3a69e86f3eafe8c7a59d008d245888051005bd657760e96d5fbfb0b740c2/tomli-2.4.1-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:7f94b27a62cfad8496c8d2513e1a222dd446f095fca8987fceef261225538a15", size = 247973, upload-time = "2026-03-25T20:21:46.937Z" },
{ url = "https://files.pythonhosted.org/packages/67/50/361e986652847fec4bd5e4a0208752fbe64689c603c7ae5ea7cb16b1c0ca/tomli-2.4.1-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:ede3e6487c5ef5d28634ba3f31f989030ad6af71edfb0055cbbd14189ff240ba", size = 256223, upload-time = "2026-03-25T20:21:48.467Z" },
{ url = "https://files.pythonhosted.org/packages/8c/9a/b4173689a9203472e5467217e0154b00e260621caa227b6fa01feab16998/tomli-2.4.1-cp314-cp314-win32.whl", hash = "sha256:3d48a93ee1c9b79c04bb38772ee1b64dcf18ff43085896ea460ca8dec96f35f6", size = 98973, upload-time = "2026-03-25T20:21:49.526Z" },
{ url = "https://files.pythonhosted.org/packages/14/58/640ac93bf230cd27d002462c9af0d837779f8773bc03dee06b5835208214/tomli-2.4.1-cp314-cp314-win_amd64.whl", hash = "sha256:88dceee75c2c63af144e456745e10101eb67361050196b0b6af5d717254dddf7", size = 109082, upload-time = "2026-03-25T20:21:50.506Z" },
{ url = "https://files.pythonhosted.org/packages/d5/2f/702d5e05b227401c1068f0d386d79a589bb12bf64c3d2c72ce0631e3bc49/tomli-2.4.1-cp314-cp314-win_arm64.whl", hash = "sha256:b8c198f8c1805dc42708689ed6864951fd2494f924149d3e4bce7710f8eb5232", size = 96490, upload-time = "2026-03-25T20:21:51.474Z" },
{ url = "https://files.pythonhosted.org/packages/45/4b/b877b05c8ba62927d9865dd980e34a755de541eb65fffba52b4cc495d4d2/tomli-2.4.1-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:d4d8fe59808a54658fcc0160ecfb1b30f9089906c50b23bcb4c69eddc19ec2b4", size = 164263, upload-time = "2026-03-25T20:21:52.543Z" },
{ url = "https://files.pythonhosted.org/packages/24/79/6ab420d37a270b89f7195dec5448f79400d9e9c1826df982f3f8e97b24fd/tomli-2.4.1-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:7008df2e7655c495dd12d2a4ad038ff878d4ca4b81fccaf82b714e07eae4402c", size = 160736, upload-time = "2026-03-25T20:21:53.674Z" },
{ url = "https://files.pythonhosted.org/packages/02/e0/3630057d8eb170310785723ed5adcdfb7d50cb7e6455f85ba8a3deed642b/tomli-2.4.1-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1d8591993e228b0c930c4bb0db464bdad97b3289fb981255d6c9a41aedc84b2d", size = 270717, upload-time = "2026-03-25T20:21:55.129Z" },
{ url = "https://files.pythonhosted.org/packages/7a/b4/1613716072e544d1a7891f548d8f9ec6ce2faf42ca65acae01d76ea06bb0/tomli-2.4.1-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:734e20b57ba95624ecf1841e72b53f6e186355e216e5412de414e3c51e5e3c41", size = 278461, upload-time = "2026-03-25T20:21:56.228Z" },
{ url = "https://files.pythonhosted.org/packages/05/38/30f541baf6a3f6df77b3df16b01ba319221389e2da59427e221ef417ac0c/tomli-2.4.1-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:8a650c2dbafa08d42e51ba0b62740dae4ecb9338eefa093aa5c78ceb546fcd5c", size = 274855, upload-time = "2026-03-25T20:21:57.653Z" },
{ url = "https://files.pythonhosted.org/packages/77/a3/ec9dd4fd2c38e98de34223b995a3b34813e6bdadf86c75314c928350ed14/tomli-2.4.1-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:504aa796fe0569bb43171066009ead363de03675276d2d121ac1a4572397870f", size = 283144, upload-time = "2026-03-25T20:21:59.089Z" },
{ url = "https://files.pythonhosted.org/packages/ef/be/605a6261cac79fba2ec0c9827e986e00323a1945700969b8ee0b30d85453/tomli-2.4.1-cp314-cp314t-win32.whl", hash = "sha256:b1d22e6e9387bf4739fbe23bfa80e93f6b0373a7f1b96c6227c32bef95a4d7a8", size = 108683, upload-time = "2026-03-25T20:22:00.214Z" },
{ url = "https://files.pythonhosted.org/packages/12/64/da524626d3b9cc40c168a13da8335fe1c51be12c0a63685cc6db7308daae/tomli-2.4.1-cp314-cp314t-win_amd64.whl", hash = "sha256:2c1c351919aca02858f740c6d33adea0c5deea37f9ecca1cc1ef9e884a619d26", size = 121196, upload-time = "2026-03-25T20:22:01.169Z" },
{ url = "https://files.pythonhosted.org/packages/5a/cd/e80b62269fc78fc36c9af5a6b89c835baa8af28ff5ad28c7028d60860320/tomli-2.4.1-cp314-cp314t-win_arm64.whl", hash = "sha256:eab21f45c7f66c13f2a9e0e1535309cee140182a9cdae1e041d02e47291e8396", size = 100393, upload-time = "2026-03-25T20:22:02.137Z" },
{ url = "https://files.pythonhosted.org/packages/7b/61/cceae43728b7de99d9b847560c262873a1f6c98202171fd5ed62640b494b/tomli-2.4.1-py3-none-any.whl", hash = "sha256:0d85819802132122da43cb86656f8d1f8c6587d54ae7dcaf30e90533028b49fe", size = 14583, upload-time = "2026-03-25T20:22:03.012Z" },
]
[[package]]
name = "typing-extensions"
version = "4.15.0"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/72/94/1a15dd82efb362ac84269196e94cf00f187f7ed21c242792a923cdb1c61f/typing_extensions-4.15.0.tar.gz", hash = "sha256:0cea48d173cc12fa28ecabc3b837ea3cf6f38c6d1136f85cbaaf598984861466", size = 109391, upload-time = "2025-08-25T13:49:26.313Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/18/67/36e9267722cc04a6b9f15c7f3441c2363321a3ea07da7ae0c0707beb2a9c/typing_extensions-4.15.0-py3-none-any.whl", hash = "sha256:f0fa19c6845758ab08074a0cfa8b7aecb71c999ca73d62883bc25cc018c4e548", size = 44614, upload-time = "2025-08-25T13:49:24.86Z" },
]

View File

@@ -0,0 +1,13 @@
{
"extends": ["@commitlint/config-conventional"],
"rules": {
"body-max-line-length": [2, "always", 100],
"header-max-length": [2, "always", 100],
"subject-case": [2, "never", ["sentence-case", "start-case", "pascal-case", "upper-case"]],
"type-enum": [
2,
"always",
["feat", "fix", "docs", "style", "refactor", "perf", "test", "build", "ci", "chore", "revert"]
]
}
}

View File

@@ -0,0 +1,2 @@
/third_party/
/tessdata/

2933
crates/kreuzberg-tesseract/Cargo.lock generated Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,64 @@
[package]
name = "kreuzberg-tesseract"
version.workspace = true
edition.workspace = true
rust-version.workspace = true
authors.workspace = true
description = "Rust bindings for Tesseract OCR with cross-compilation, C++17, and caching improvements"
license = "MIT"
repository.workspace = true
homepage = "https://kreuzberg.dev"
documentation = "https://docs.kreuzberg.dev"
readme = "README.md"
keywords = ["tesseract", "ocr", "bindings", "vision", "recognition"]
categories = ["external-ffi-bindings", "computer-vision", "text-processing"]
build = "build.rs"
links = "kreuzberg_tesseract"
exclude = ["tessdata/*", "third_party/*"]
[package.metadata.docs.rs]
features = ["docs-only"]
rustdoc-args = ["--cfg", "docsrs"]
[package.metadata.cargo-machete]
ignored = ["cc", "cmake", "reqwest", "zip"]
[lib]
name = "kreuzberg_tesseract"
crate-type = ["lib"]
[features]
default = ["static-linking"]
build-tesseract = ["cc", "cmake", "reqwest", "zip"]
build-tesseract-wasm = ["cmake", "reqwest", "zip"]
# Bundle eng.traineddata into the compiled crate so WASM builds can run OCR
# without runtime tessdata loading. Uses ~4 MB of binary size (tessdata_fast).
bundle-tessdata-eng = []
static-linking = ["build-tesseract"]
dynamic-linking = []
[dependencies]
thiserror = { workspace = true }
[build-dependencies]
cc = { version = "^1.2.63", optional = true }
cmake = { version = "0.1.58", optional = true }
zip = { version = ">=7.0.0", optional = true, default-features = false, features = [
"deflate-flate2-zlib-rs",
] }
[target.'cfg(not(target_os = "windows"))'.build-dependencies]
reqwest = { workspace = true, default-features = false, features = [
"blocking",
"rustls",
], optional = true }
# Use native-tls on Windows to avoid aws-lc-sys CMake build issues with MinGW
[target.'cfg(target_os = "windows")'.build-dependencies]
reqwest = { workspace = true, default-features = false, features = [
"blocking",
"native-tls",
], optional = true }
[dev-dependencies]
image = { workspace = true, features = ["png"] }

View File

@@ -0,0 +1,22 @@
MIT License
Copyright (c) 2024 Cafer Can Gündoğdu
Copyright (c) 2025 Na'aman Hirschfeld
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

View File

@@ -0,0 +1,405 @@
# kreuzberg-tesseract
[![Bindings](https://img.shields.io/badge/Bindings-alef%20%D7%90-007ec6)](https://github.com/kreuzberg-dev/alef)
Rust bindings for Tesseract OCR with built-in compilation of Tesseract and Leptonica libraries. Provides a safe and idiomatic Rust interface to Tesseract's functionality while handling the complexity of compiling the underlying C++ libraries.
Based on the original [tesseract-rs](https://github.com/cafercangundogdu/tesseract-rs) by Cafer Can Gündoğdu, this maintained version adds critical improvements for production use:
- **C++17 Support**: Upgraded for Tesseract 5.5.1 which requires C++17 filesystem
- **Cross-Compilation**: Fixed CXX compiler detection for cross-platform builds
- **Architecture Validation**: Validates target architecture before using cached libraries
- **Windows Static Linking**: Fixed MSVC static linking issues
- **Build Caching**: Improved caching with OUT_DIR-based cache directory
- **MinGW Support**: Added support for MinGW toolchains
## Features
- Safe Rust bindings for Tesseract OCR
- **Multiple linking options:**
- **Static linking** (default): Built-in compilation with no runtime dependencies
- **Dynamic linking**: Link to system-installed libraries for faster builds
- Uses existing Tesseract training data (expects English data for tests)
- High-level Rust API for common OCR tasks
- Caching of compiled libraries for faster subsequent builds
- Support for multiple operating systems (Linux, macOS, Windows)
## Installation
### Static Linking (Default)
Static linking builds Tesseract and Leptonica from source and embeds them in your binary. No runtime dependencies required:
```toml
[dependencies]
kreuzberg-tesseract = "1.0.0-rc.1"
# or explicitly:
kreuzberg-tesseract = { version = "1.0.0-rc.1", features = ["static-linking"] }
```
### Dynamic Linking
Dynamic linking uses system-installed Tesseract and Leptonica libraries. Faster builds, but requires libraries installed on the system:
```toml
[dependencies]
kreuzberg-tesseract = { version = "1.0.0-rc.1", features = ["dynamic-linking"], default-features = false }
```
**System requirements for dynamic linking:**
- Tesseract 5.x libraries installed (`libtesseract`, `libleptonica`)
- macOS: `brew install tesseract leptonica`
- Ubuntu/Debian: `sudo apt-get install libtesseract-dev libleptonica-dev`
- RHEL/CentOS/Fedora: `sudo dnf install tesseract-devel leptonica-devel`
- Windows: Install from [Tesseract releases](https://github.com/tesseract-ocr/tesseract/releases) or vcpkg
### Development Dependencies
For development and testing, you'll also need these dependencies:
```toml
[dev-dependencies]
image = "0.25.5"
```
## System Requirements
### For Static Linking (Default)
When building with static linking, the crate will compile Tesseract and Leptonica from source. You need:
- Rust 1.85.0 or later
- A C++ compiler (e.g., gcc, clang, MSVC on Windows)
- CMake 3.x or later
- Internet connection (for downloading Tesseract source code)
### For Dynamic Linking
When using dynamic linking with system-installed libraries, you need:
- Rust 1.85.0 or later
- Tesseract 5.x and Leptonica libraries installed on your system (see Installation section)
No C++ compiler, CMake, or source download is required for dynamic linking builds, since nothing is compiled from source.
For a full development environment checklist (including optional tooling suggestions), see [CONTRIBUTING.md](../../CONTRIBUTING.md).
## Environment Variables
The following environment variables affect the build and test process:
### Build Variables
- `CARGO_CLEAN`: If set, cleans the cache directory before building
- `RUSTC_WRAPPER`: If set to "sccache", enables compiler caching with sccache
- `CC`: Compiler selection for C code (affects Linux builds)
- `HOME` (Unix) or `APPDATA` (Windows): Used to determine cache directory location
- `TESSERACT_RS_CACHE_DIR`: Optional override for the cache root. When unset or not writable, the build falls back to the default OS-specific directory, and if that still fails, a temporary directory under the system temp folder is used automatically.
### Test Variables
- `TESSDATA_PREFIX` (Optional): Path to override the default tessdata directory. If not set, the crate will use its default cache directory.
## Cache and Data Directories
The crate uses the following directory structure based on your operating system:
- macOS: `~/Library/Application Support/tesseract-rs`
- Linux: `~/.tesseract-rs`
- Windows: `%APPDATA%/tesseract-rs`
The cache includes:
- Compiled Tesseract and Leptonica libraries
- Third-party source code
Training data is not downloaded during the build. Provide `eng.traineddata` (and any other languages you need) via `TESSDATA_PREFIX` or your system Tesseract installation.
## Testing
The project includes several integration tests that verify OCR functionality. To run the tests:
1. Ensure you have the required test dependencies:
```toml
[dev-dependencies]
image = "0.25.9"
```
2. Run the tests:
```bash
cargo test
```
Note: Make sure `eng.traineddata` is available in your tessdata directory before running tests. If `TESSDATA_PREFIX` is not set, the tests look in the default cache location. You can point the tests at a custom tessdata directory by setting:
```bash
# Linux/macOS
export TESSDATA_PREFIX=/path/to/custom/tessdata
# Windows (PowerShell)
$env:TESSDATA_PREFIX="C:\path\to\custom\tessdata"
```
Available test cases:
- OCR on English sample images
- Error handling and invalid input coverage
Test images are sourced from the shared `test_documents/` directory in the repository:
- `images/test_hello_world.png`: Simple English text
- `tables/simple_table.png`: Basic table with English headers
## Usage
Here's a basic example of how to use `kreuzberg-tesseract`:
```rust
use std::path::PathBuf;
use std::error::Error;
use kreuzberg_tesseract::TesseractAPI;
/// Returns the default tessdata directory for the current operating system.
///
/// The location is `<cache root>/tessdata`, where the cache root is:
/// - macOS: `$HOME/Library/Application Support/tesseract-rs`
/// - Linux: `$HOME/.tesseract-rs`
/// - Windows: `%APPDATA%/tesseract-rs`
///
/// # Panics
///
/// Panics if the required environment variable (`HOME` or `APPDATA`) is not
/// set, or if the operating system is none of the three listed above.
fn get_default_tessdata_dir() -> PathBuf {
    // Work out the per-OS cache root first; `tessdata` is appended once below.
    let cache_root = if cfg!(target_os = "macos") {
        let home = std::env::var("HOME").expect("HOME environment variable not set");
        PathBuf::from(home)
            .join("Library")
            .join("Application Support")
            .join("tesseract-rs")
    } else if cfg!(target_os = "linux") {
        let home = std::env::var("HOME").expect("HOME environment variable not set");
        PathBuf::from(home).join(".tesseract-rs")
    } else if cfg!(target_os = "windows") {
        let app_data = std::env::var("APPDATA").expect("APPDATA environment variable not set");
        PathBuf::from(app_data).join("tesseract-rs")
    } else {
        panic!("Unsupported operating system")
    };

    cache_root.join("tessdata")
}
/// Picks the tessdata directory to use and reports the choice on stdout.
///
/// `TESSDATA_PREFIX` wins when it is set to a readable value; otherwise the
/// platform default from `get_default_tessdata_dir` is used.
fn get_tessdata_dir() -> PathBuf {
    // Explicit override: return early when the variable is available.
    if let Ok(prefix) = std::env::var("TESSDATA_PREFIX") {
        let tessdata = PathBuf::from(prefix);
        println!("Using TESSDATA_PREFIX directory: {:?}", tessdata);
        return tessdata;
    }

    // Variable missing (or not valid Unicode): fall back to the default location.
    let fallback = get_default_tessdata_dir();
    println!(
        "TESSDATA_PREFIX not set, using default directory: {:?}",
        fallback
    );
    fallback
}
/// Recognizes a hand-drawn digit "9" on a 24x24 single-channel bitmap.
fn main() -> Result<(), Box<dyn Error>> {
    let api = TesseractAPI::new()?;

    // Resolve tessdata (TESSDATA_PREFIX if set, otherwise the default location)
    let tessdata_dir = get_tessdata_dir();
    api.init(tessdata_dir.to_str().unwrap(), "eng")?;

    let width = 24;
    let height = 24;
    let bytes_per_pixel = 1;
    let bytes_per_line = width * bytes_per_pixel;

    // Start from an all-white canvas (255 = white, 0 = black)
    let mut image_data = vec![255u8; width * height];

    // Paint the strokes of the digit 9 inside the 10x15 glyph box
    for y in 4..19 {
        for x in 7..17 {
            let is_ink = matches!(
                (x, y),
                // Top, middle and bottom horizontal bars
                (8..=15, 4 | 11 | 18)
                    // Left side of the upper loop
                    | (7, 4..=10)
                    // Right side: upper loop plus the descending stroke
                    | (16, 4..=18)
            );
            if is_ink {
                image_data[y * width + x] = 0;
            }
        }
    }

    // Hand the bitmap to Tesseract
    api.set_image(
        &image_data,
        width.try_into().unwrap(),
        height.try_into().unwrap(),
        bytes_per_pixel.try_into().unwrap(),
        bytes_per_line.try_into().unwrap(),
    )?;

    // Restrict recognition to digits
    api.set_variable("tessedit_char_whitelist", "0123456789")?;

    // Page segmentation mode 10: treat the image as a single character
    api.set_variable("tessedit_pageseg_mode", "10")?;

    // Run OCR and print the result
    let text = api.get_utf8_text()?;
    println!("Recognized text: {}", text.trim());

    Ok(())
}
```
## Advanced Usage
The API provides additional functionality for more complex OCR tasks, including thread-safe operations:
```rust
use kreuzberg_tesseract::TesseractAPI;
use std::sync::Arc;
use std::thread;
use std::error::Error;
/// Runs OCR on the same image from three threads, each using its own clone of
/// a configured `TesseractAPI`.
fn main() -> Result<(), Box<dyn Error>> {
    let tessdata_dir = get_tessdata_dir();
    let api = TesseractAPI::new()?;

    // Configure the primary API instance; clones are taken from it below
    api.init(tessdata_dir.to_str().unwrap(), "eng")?;
    api.set_variable("tessedit_pageseg_mode", "1")?;

    // Decode the image once and share the pixel buffer between threads
    let (image_data, width, height) = load_test_image("sample_text.png")?;
    let image_data = Arc::new(image_data);

    // Start every worker first, then wait for all of them
    let workers: Vec<_> = (0..3)
        .map(|_| {
            let worker_api = api.clone(); // Clones the API with all configurations
            let pixels = Arc::clone(&image_data);

            thread::spawn(move || {
                // Each thread sets the image on its own clone (3 bytes per pixel)
                let res = worker_api.set_image(
                    &pixels,
                    width as i32,
                    height as i32,
                    3,
                    3 * width as i32,
                );
                assert!(res.is_ok());

                // Perform OCR in parallel
                let text = worker_api.get_utf8_text().expect("Failed to get text");
                println!("Thread result: {}", text);
            })
        })
        .collect();

    // Wait for all threads to complete
    for worker in workers {
        worker.join().unwrap();
    }

    Ok(())
}
// Helper function to get tessdata directory.
// Placeholder only: the body is intentionally elided here. Copy the
// `get_tessdata_dir` implementation from the basic example above, and add
// `use std::path::PathBuf;` to this example's imports.
fn get_tessdata_dir() -> PathBuf {
// ... (implementation as shown in basic example)
}
// Helper function to load a test image.
// Opens `filename`, converts it to 8-bit RGB, and returns the raw pixel
// buffer together with the image width and height.
fn load_test_image(filename: &str) -> Result<(Vec<u8>, u32, u32), Box<dyn Error>> {
    let rgb = image::open(filename)?.to_rgb8();
    // Read the dimensions before `into_raw` consumes the image buffer.
    let width = rgb.width();
    let height = rgb.height();
    Ok((rgb.into_raw(), width, height))
}
```
## Building
### Static Linking (Default)
With static linking, the crate will automatically download and compile Tesseract and Leptonica during the build process. This may take some time on the first build (5-10 minutes), but subsequent builds will use the cached libraries.
To clean the cache and force a rebuild:
```bash
CARGO_CLEAN=1 cargo build
```
### Dynamic Linking
With dynamic linking, the build is much faster (seconds instead of minutes) since it only links against system-installed libraries:
```bash
cargo build --no-default-features --features dynamic-linking
```
**Note**: Dynamic linking requires Tesseract and Leptonica to be installed on your system (see Installation section).
## Documentation
For more detailed information, please check the [API documentation](https://docs.rs/kreuzberg-tesseract).
## License
This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
## Acknowledgements
This project is based on the original [tesseract-rs](https://github.com/cafercangundogdu/tesseract-rs) by [Cafer Can Gündoğdu](https://github.com/cafercangundogdu). We are grateful for the foundational work that made this project possible.
## Contributing
We welcome contributions! Please see our [Contributing Guide](../../CONTRIBUTING.md) for details.
### Quick Start for Contributors
1. Fork and clone the repository
2. Install uv and set up git hooks:
```bash
curl -LsSf https://astral.sh/uv/install.sh | sh
uvx prek install
```
3. Make your changes following our commit message format
4. Run tests: `cargo test`
5. Submit a Pull Request
Our commit messages follow the [Conventional Commits](https://www.conventionalcommits.org/) specification.
## Third-Party Acknowledgements
This project uses [Tesseract OCR](https://github.com/tesseract-ocr/tesseract) and [Leptonica](http://leptonica.org/). We are grateful to the maintainers and contributors of these projects.
```text
```

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,74 @@
# Tesseract WASM Patches
This directory contains patches needed to compile Tesseract for WebAssembly (WASM) targets using WASI SDK.
These patches are vendored from the [tesseract-wasm](https://github.com/robertknight/tesseract-wasm) project and have been proven to work with WASM compilation.
## Patches
### tesseract.diff
A comprehensive patch that makes Tesseract compatible with WASM compilation. The patch includes the following changes:
#### 1. CMakeLists.txt Modifications
- **New CMake option**: `BUILD_TESSERACT_BINARY` (default: ON)
- Allows disabling the Tesseract CLI binary build, which is not needed for WASM
- Wraps all executable and installation targets for the tesseract binary
- **Disabled components for WASM**:
- Removes OpenCL support (`src/opencl/*.cpp`) - not applicable to WASM
- Removes viewer support (`src/viewer/*.cpp`) - UI components not needed for WASM
- Removes C API bindings (`src/api/capi.cpp`) - only hocrrenderer is kept
- Removes PDF and rendering support files:
- `src/api/renderer.cpp`
- `src/api/altorenderer.cpp`
- `src/api/lstmboxrenderer.cpp`
- `src/api/pdfrenderer.cpp`
- `src/api/wordstrboxrenderer.cpp`
#### 2. SIMD Detection Fixes (src/arch/simddetect.cpp)
- Guards CPUID detection with `#if !defined(__wasm__)`
- Prevents attempts to use CPU feature detection that doesn't exist in WASM
- The HAS_CPUID macro is only defined for non-WASM builds
- This allows the code to gracefully handle WASM's SIMD limitations
#### 3. Pointer Type Fixes (src/ccmain/pageiterator.cpp, src/ccmain/pagesegmain.cpp, src/ccmain/tesseractclass.cpp)
**Changed from stack allocation to heap allocation** in `tesseractclass.h`:
- `pixa_debug_` changed from `DebugPixa` to `std::unique_ptr<DebugPixa>`
- This prevents large allocations on the stack, which is limited in WASM
**Updated all references** throughout the codebase:
- `.get()` calls added where raw pointers are needed
- Arrow operator `->` replaces dot operator `.` for member access
- Null checks added before dereferencing to prevent crashes
**Affected functions**:
- `PageIterator::Orientation()` - added null vector check
- `Tesseract::AutoPageSeg()` - updated pointer passing
- `Tesseract::SetupPageSegAndDetectOrientation()` - multiple pointer updates
- `Tesseract::Clear()` - added null check before WritePDF
- `Tesseract::PrepareForPageseg()` - updated Split() calls
- `Tesseract::PrepareForTessOCR()` - updated Split() calls
#### 4. Additional Fixes
- **Orientation detection**: Changed comparison from `> 0.0F` to `>= 0.0F` in `pageiterator.cpp` to handle null vectors gracefully when orientation info is not available
## How to Apply
These patches are applied during the WASM build process. They modify the Tesseract source code to:
1. Disable WASM-incompatible features (OpenCL, viewers, renderers)
2. Prevent CPUID detection in WASM environment
3. Use heap allocation instead of stack allocation for large objects
4. Handle missing pointer initialization gracefully
## Source
These patches are based on the proven WASM compilation approach used by the tesseract.js project, which successfully compiles Tesseract to WebAssembly and deploys it in production environments.

View File

@@ -0,0 +1,199 @@
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8c6845cb..fdcfc4a8 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -90,6 +90,7 @@ option(ENABLE_LTO "Enable link-time optimization" OFF)
option(FAST_FLOAT "Enable float for LSTM" ON)
option(ENABLE_OPENCL "Enable unsupported experimental OpenCL support" OFF)
option(BUILD_TRAINING_TOOLS "Build training tools" ON)
+option(BUILD_TESSERACT_BINARY "Build Tesseract binary" ON)
option(BUILD_TESTS "Build tests" OFF)
option(USE_SYSTEM_ICU "Use system ICU" OFF)
option(DISABLE_ARCHIVE "Disable build with libarchive (if available)" OFF)
@@ -565,9 +566,7 @@ file(
src/cutil/*.cpp
src/dict/*.cpp
src/lstm/*.cpp
- src/opencl/*.cpp
src/textord/*.cpp
- src/viewer/*.cpp
src/wordrec/*.cpp)
if(DISABLED_LEGACY_ENGINE)
@@ -714,13 +713,7 @@ file(
set(TESSERACT_SRC
${TESSERACT_SRC}
src/api/baseapi.cpp
- src/api/capi.cpp
- src/api/renderer.cpp
- src/api/altorenderer.cpp
- src/api/hocrrenderer.cpp
- src/api/lstmboxrenderer.cpp
- src/api/pdfrenderer.cpp
- src/api/wordstrboxrenderer.cpp)
+ src/api/hocrrenderer.cpp)
set(TESSERACT_CONFIGS
tessdata/configs/alto
@@ -858,14 +851,16 @@ endif()
# EXECUTABLE tesseract
# ##############################################################################
-add_executable(tesseract src/tesseract.cpp)
-target_link_libraries(tesseract libtesseract)
-if(HAVE_TIFFIO_H AND WIN32)
- target_link_libraries(tesseract ${TIFF_LIBRARIES})
-endif()
+if(BUILD_TESSERACT_BINARY)
+ add_executable(tesseract src/tesseract.cpp)
+ target_link_libraries(tesseract libtesseract)
+ if(HAVE_TIFFIO_H AND WIN32)
+ target_link_libraries(tesseract ${TIFF_LIBRARIES})
+ endif()
-if(OPENMP_BUILD AND UNIX)
- target_link_libraries(tesseract pthread)
+ if(OPENMP_BUILD AND UNIX)
+ target_link_libraries(tesseract pthread)
+ endif()
endif()
# ##############################################################################
@@ -899,7 +894,11 @@ write_basic_package_version_file(
install(FILES ${CMAKE_CURRENT_BINARY_DIR}/tesseract.pc
DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig)
-install(TARGETS tesseract DESTINATION bin)
+
+if(BUILD_TESSERACT_BINARY)
+ install(TARGETS tesseract DESTINATION bin)
+endif()
+
install(
TARGETS libtesseract
EXPORT TesseractTargets
diff --git a/src/arch/simddetect.cpp b/src/arch/simddetect.cpp
index 1afe5a5d..cb8c6d4c 100644
--- a/src/arch/simddetect.cpp
+++ b/src/arch/simddetect.cpp
@@ -40,10 +40,12 @@
#endif
+#if !defined(__wasm__)
#if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_FMA) || defined(HAVE_SSE4_1)
// See https://en.wikipedia.org/wiki/CPUID.
# define HAS_CPUID
#endif
+#endif
#if defined(HAS_CPUID)
# if defined(__GNUC__)
diff --git a/src/ccmain/pageiterator.cpp b/src/ccmain/pageiterator.cpp
index 64ff7f66..c0f80e5f 100644
--- a/src/ccmain/pageiterator.cpp
+++ b/src/ccmain/pageiterator.cpp
@@ -582,7 +582,9 @@ void PageIterator::Orientation(tesseract::Orientation *orientation,
up_in_image.rotate(block->re_rotation());
if (up_in_image.x() == 0.0F) {
- if (up_in_image.y() > 0.0F) {
+ // tesseract-wasm note: `up_in_image` will be a null vector if orientation
+ // info is not available. In that case, assume page up.
+ if (up_in_image.y() >= 0.0F) {
*orientation = ORIENTATION_PAGE_UP;
} else {
*orientation = ORIENTATION_PAGE_DOWN;
diff --git a/src/ccmain/pagesegmain.cpp b/src/ccmain/pagesegmain.cpp
index 0af44607..718e73ef 100644
--- a/src/ccmain/pagesegmain.cpp
+++ b/src/ccmain/pagesegmain.cpp
@@ -222,7 +222,7 @@ int Tesseract::AutoPageSeg(PageSegMode pageseg_mode, BLOCK_LIST *blocks, TO_BLOC
}
#endif // ndef DISABLED_LEGACY_ENGINE
result = finder->FindBlocks(pageseg_mode, scaled_color_, scaled_factor_, to_block,
- photomask_pix, pix_thresholds_, pix_grey_, &pixa_debug_,
+ photomask_pix, pix_thresholds_, pix_grey_, pixa_debug_.get(),
&found_blocks, diacritic_blobs, to_blocks);
if (result >= 0) {
finder->GetDeskewVectors(&deskew_, &reskew_);
@@ -279,17 +279,17 @@ ColumnFinder *Tesseract::SetupPageSegAndDetectOrientation(PageSegMode pageseg_mo
ICOORD bleft(0, 0);
ASSERT_HOST(pix_binary_ != nullptr);
- if (tessedit_dump_pageseg_images) {
- pixa_debug_.AddPix(pix_binary_, "PageSegInput");
+ if (tessedit_dump_pageseg_images && pixa_debug_) {
+ pixa_debug_->AddPix(pix_binary_, "PageSegInput");
}
// Leptonica is used to find the rule/separator lines in the input.
LineFinder::FindAndRemoveLines(source_resolution_, textord_tabfind_show_vlines, pix_binary_,
&vertical_x, &vertical_y, music_mask_pix, &v_lines, &h_lines);
- if (tessedit_dump_pageseg_images) {
- pixa_debug_.AddPix(pix_binary_, "NoLines");
+ if (tessedit_dump_pageseg_images && pixa_debug_) {
+ pixa_debug_->AddPix(pix_binary_, "NoLines");
}
// Leptonica is used to find a mask of the photo regions in the input.
- *photo_mask_pix = ImageFind::FindImages(pix_binary_, &pixa_debug_);
+ *photo_mask_pix = ImageFind::FindImages(pix_binary_, pixa_debug_.get());
if (tessedit_dump_pageseg_images) {
Image pix_no_image_ = nullptr;
if (*photo_mask_pix != nullptr) {
@@ -297,7 +297,7 @@ ColumnFinder *Tesseract::SetupPageSegAndDetectOrientation(PageSegMode pageseg_mo
} else {
pix_no_image_ = pix_binary_.clone();
}
- pixa_debug_.AddPix(pix_no_image_, "NoImages");
+ pixa_debug_->AddPix(pix_no_image_, "NoImages");
pix_no_image_.destroy();
}
if (!PSM_COL_FIND_ENABLED(pageseg_mode)) {
diff --git a/src/ccmain/tesseractclass.cpp b/src/ccmain/tesseractclass.cpp
index fd58ac87..517f925e 100644
--- a/src/ccmain/tesseractclass.cpp
+++ b/src/ccmain/tesseractclass.cpp
@@ -487,8 +487,10 @@ Dict &Tesseract::getDict() {
}
void Tesseract::Clear() {
- std::string debug_name = imagebasename + "_debug.pdf";
- pixa_debug_.WritePDF(debug_name.c_str());
+ if (pixa_debug_) {
+ std::string debug_name = imagebasename + "_debug.pdf";
+ pixa_debug_->WritePDF(debug_name.c_str());
+ }
pix_binary_.destroy();
pix_grey_.destroy();
pix_thresholds_.destroy();
@@ -572,7 +574,7 @@ void Tesseract::PrepareForPageseg() {
// the newly split image.
splitter_.set_orig_pix(pix_binary());
splitter_.set_pageseg_split_strategy(max_pageseg_strategy);
- if (splitter_.Split(true, &pixa_debug_)) {
+ if (splitter_.Split(true, pixa_debug_.get())) {
ASSERT_HOST(splitter_.splitted_image());
pix_binary_.destroy();
pix_binary_ = splitter_.splitted_image().clone();
@@ -599,7 +601,7 @@ void Tesseract::PrepareForTessOCR(BLOCK_LIST *block_list, Tesseract *osd_tess, O
splitter_.set_segmentation_block_list(block_list);
splitter_.set_ocr_split_strategy(max_ocr_strategy);
// Run the splitter for OCR
- bool split_for_ocr = splitter_.Split(false, &pixa_debug_);
+ bool split_for_ocr = splitter_.Split(false, pixa_debug_.get());
// Restore pix_binary to the binarized original pix for future reference.
ASSERT_HOST(splitter_.orig_pix());
pix_binary_.destroy();
diff --git a/src/ccmain/tesseractclass.h b/src/ccmain/tesseractclass.h
index 732bb9e6..030aa5bc 100644
--- a/src/ccmain/tesseractclass.h
+++ b/src/ccmain/tesseractclass.h
@@ -986,7 +986,7 @@ private:
// Thresholds that were used to generate the thresholded image from grey.
Image pix_thresholds_;
// Debug images. If non-empty, will be written on destruction.
- DebugPixa pixa_debug_;
+ std::unique_ptr<DebugPixa> pixa_debug_;
// Input image resolution after any scaling. The resolution is not well
// transmitted by operations on Pix, so we keep an independent record here.
int source_resolution_;

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,77 @@
use crate::api::TessDeleteText;
use crate::error::{Result, TesseractError};
use std::ffi::CStr;
use std::os::raw::{c_char, c_float, c_int, c_void};
use std::sync::{Arc, Mutex};
/// Owning wrapper around a raw choice-iterator handle.
///
/// The pointer is stored behind a `Mutex`; every method in this file locks it
/// before handing the pointer to a `TessChoiceIterator*` function. The handle
/// is released by the `Drop` implementation.
pub struct ChoiceIterator {
// Raw handle passed to the `TessChoiceIterator*` FFI functions.
handle: Arc<Mutex<*mut c_void>>,
}
// SAFETY(review): raw pointers are neither `Send` nor `Sync` by default, so
// these impls assert thread-safety by hand. They rely on all access going
// through the mutex above and on the underlying iterator being usable from
// any thread — TODO confirm against Tesseract's threading guarantees.
unsafe impl Send for ChoiceIterator {}
unsafe impl Sync for ChoiceIterator {}
impl ChoiceIterator {
    /// Wraps a raw choice-iterator handle.
    ///
    /// # Arguments
    ///
    /// * `handle` - Pointer to the underlying choice iterator. The wrapper
    ///   takes ownership: the handle is deleted when the wrapper is dropped.
    pub fn new(handle: *mut c_void) -> Self {
        ChoiceIterator {
            handle: Arc::new(Mutex::new(handle)),
        }
    }

    /// Advances to the next choice.
    ///
    /// # Returns
    ///
    /// `Ok(true)` if the underlying call reports a non-zero result (another
    /// choice is available), `Ok(false)` otherwise.
    ///
    /// # Errors
    ///
    /// Returns `TesseractError::MutexLockError` if the handle mutex is poisoned.
    pub fn next(&self) -> Result<bool> {
        let handle = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
        // SAFETY: the pointer is the handle given to `new`; the mutex guard
        // serialises access for the duration of the call.
        Ok(unsafe { TessChoiceIteratorNext(*handle) != 0 })
    }

    /// Gets the UTF-8 text for the current choice.
    ///
    /// # Returns
    ///
    /// An owned copy of the text of the current choice.
    ///
    /// # Errors
    ///
    /// * `TesseractError::MutexLockError` if the handle mutex is poisoned.
    /// * `TesseractError::NullPointerError` if Tesseract returns a null string.
    /// * `TesseractError::Utf8Error` if the returned bytes are not valid UTF-8.
    pub fn get_utf8_text(&self) -> Result<String> {
        let handle = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
        // SAFETY: the pointer is the handle given to `new`; the mutex guard
        // serialises access for the duration of the call.
        let text_ptr = unsafe { TessChoiceIteratorGetUTF8Text(*handle) };
        if text_ptr.is_null() {
            return Err(TesseractError::NullPointerError);
        }
        // Unlike the result-iterator text getters, the choice iterator returns
        // a pointer into storage owned by Tesseract. It must NOT be released
        // with `TessDeleteText`; doing so frees memory we do not own. We copy
        // the text while the lock is still held so the iterator cannot be
        // advanced underneath us. (The `TessDeleteText` import is therefore no
        // longer needed by this file and can be removed.)
        //
        // SAFETY: `text_ptr` is non-null and, per the Tesseract API, points to
        // a NUL-terminated string that stays valid until the iterator moves.
        let text = unsafe { CStr::from_ptr(text_ptr) }.to_str()?.to_owned();
        Ok(text)
    }

    /// Gets the confidence of the current choice.
    ///
    /// # Returns
    ///
    /// The confidence value reported by Tesseract as an `f32`.
    ///
    /// # Errors
    ///
    /// Returns `TesseractError::MutexLockError` if the handle mutex is poisoned.
    pub fn confidence(&self) -> Result<f32> {
        let handle = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
        // SAFETY: the pointer is the handle given to `new`; the mutex guard
        // serialises access for the duration of the call.
        Ok(unsafe { TessChoiceIteratorConfidence(*handle) })
    }
}
impl Drop for ChoiceIterator {
    /// Deletes the wrapped iterator handle when the wrapper goes away.
    ///
    /// If the mutex is poisoned the handle is left untouched (and therefore
    /// not freed) rather than risking a panic inside `drop`.
    fn drop(&mut self) {
        match self.handle.lock() {
            // SAFETY(review): the pointer is the handle given to `new`; this
            // is the only place in this file that deletes it.
            Ok(raw) => unsafe { TessChoiceIteratorDelete(*raw) },
            Err(_) => {}
        }
    }
}
// Raw bindings for the choice-iterator functions called by `ChoiceIterator`.
// Every function takes the raw iterator handle as its only argument.
ffi_extern! {
// Called from `Drop` to release the iterator handle.
fn TessChoiceIteratorDelete(handle: *mut c_void);
// A non-zero return is reported by `ChoiceIterator::next` as `true`.
fn TessChoiceIteratorNext(handle: *mut c_void) -> c_int;
// May return null; `ChoiceIterator::get_utf8_text` maps that to `NullPointerError`.
fn TessChoiceIteratorGetUTF8Text(handle: *mut c_void) -> *mut c_char;
// Returned unchanged by `ChoiceIterator::confidence`.
fn TessChoiceIteratorConfidence(handle: *mut c_void) -> c_float;
}

View File

@@ -0,0 +1,373 @@
/// Page segmentation mode, represented as a C-compatible enum.
///
/// Discriminants are spelled out explicitly; they are presumably kept in sync
/// with Tesseract's own page-segmentation enum — verify when upgrading.
#[repr(C)]
#[derive(Debug, Clone, Copy, PartialEq)]
#[allow(non_camel_case_types)]
pub enum TessPageSegMode {
    PSM_OSD_ONLY = 0,
    PSM_AUTO_OSD = 1,
    PSM_AUTO_ONLY = 2,
    PSM_AUTO = 3,
    PSM_SINGLE_COLUMN = 4,
    PSM_SINGLE_BLOCK_VERT_TEXT = 5,
    PSM_SINGLE_BLOCK = 6,
    PSM_SINGLE_LINE = 7,
    PSM_SINGLE_WORD = 8,
    PSM_CIRCLE_WORD = 9,
    PSM_SINGLE_CHAR = 10,
    PSM_SPARSE_TEXT = 11,
    PSM_SPARSE_TEXT_OSD = 12,
    PSM_RAW_LINE = 13,
    PSM_COUNT = 14,
}

impl TessPageSegMode {
    /// Lenient conversion: any value outside `0..=14` maps to `PSM_AUTO`.
    pub fn from_int(value: i32) -> Self {
        Self::try_from_int(value).unwrap_or(Self::PSM_AUTO)
    }

    /// Strict conversion: returns `None` for any value outside `0..=14`.
    pub fn try_from_int(value: i32) -> Option<Self> {
        let mode = match value {
            0 => Self::PSM_OSD_ONLY,
            1 => Self::PSM_AUTO_OSD,
            2 => Self::PSM_AUTO_ONLY,
            3 => Self::PSM_AUTO,
            4 => Self::PSM_SINGLE_COLUMN,
            5 => Self::PSM_SINGLE_BLOCK_VERT_TEXT,
            6 => Self::PSM_SINGLE_BLOCK,
            7 => Self::PSM_SINGLE_LINE,
            8 => Self::PSM_SINGLE_WORD,
            9 => Self::PSM_CIRCLE_WORD,
            10 => Self::PSM_SINGLE_CHAR,
            11 => Self::PSM_SPARSE_TEXT,
            12 => Self::PSM_SPARSE_TEXT_OSD,
            13 => Self::PSM_RAW_LINE,
            14 => Self::PSM_COUNT,
            _ => return None,
        };
        Some(mode)
    }
}
/// Page iterator level, represented as a C-compatible enum.
///
/// Discriminants are spelled out explicitly; they are presumably kept in sync
/// with Tesseract's own iterator-level enum — verify when upgrading.
#[repr(C)]
#[derive(Debug, Clone, Copy, PartialEq)]
#[allow(non_camel_case_types)]
pub enum TessPageIteratorLevel {
    RIL_BLOCK = 0,
    RIL_PARA = 1,
    RIL_TEXTLINE = 2,
    RIL_WORD = 3,
    RIL_SYMBOL = 4,
}

impl TessPageIteratorLevel {
    /// Lenient conversion: any value outside `0..=4` maps to `RIL_BLOCK`.
    pub fn from_int(value: i32) -> Self {
        Self::try_from_int(value).unwrap_or(Self::RIL_BLOCK)
    }

    /// Safely convert an integer to a TessPageIteratorLevel, returning None for invalid values.
    ///
    /// Provided for parity with the other enums in this file, which all offer
    /// a strict counterpart to `from_int`.
    pub fn try_from_int(value: i32) -> Option<Self> {
        match value {
            0 => Some(Self::RIL_BLOCK),
            1 => Some(Self::RIL_PARA),
            2 => Some(Self::RIL_TEXTLINE),
            3 => Some(Self::RIL_WORD),
            4 => Some(Self::RIL_SYMBOL),
            _ => None,
        }
    }
}
/// Block type of a layout region, represented as a C-compatible enum.
///
/// Discriminants are spelled out explicitly; they are presumably kept in sync
/// with Tesseract's own block-type enum — verify when upgrading.
#[repr(C)]
#[derive(Debug, Clone, Copy, PartialEq)]
#[allow(non_camel_case_types)]
pub enum TessPolyBlockType {
    PT_UNKNOWN = 0,
    PT_FLOWING_TEXT = 1,
    PT_HEADING_TEXT = 2,
    PT_PULLOUT_TEXT = 3,
    PT_EQUATION = 4,
    PT_INLINE_EQUATION = 5,
    PT_TABLE = 6,
    PT_VERTICAL_TEXT = 7,
    PT_CAPTION_TEXT = 8,
    PT_FLOWING_IMAGE = 9,
    PT_HEADING_IMAGE = 10,
    PT_PULLOUT_IMAGE = 11,
    PT_HORZ_LINE = 12,
    PT_VERT_LINE = 13,
    PT_NOISE = 14,
    PT_COUNT = 15,
}

impl TessPolyBlockType {
    /// Lenient conversion: any value outside `0..=15` maps to `PT_UNKNOWN`.
    pub fn from_int(value: i32) -> Self {
        Self::try_from_int(value).unwrap_or(Self::PT_UNKNOWN)
    }

    /// Strict conversion: returns `None` for any value outside `0..=15`.
    pub fn try_from_int(value: i32) -> Option<Self> {
        let block_type = match value {
            0 => Self::PT_UNKNOWN,
            1 => Self::PT_FLOWING_TEXT,
            2 => Self::PT_HEADING_TEXT,
            3 => Self::PT_PULLOUT_TEXT,
            4 => Self::PT_EQUATION,
            5 => Self::PT_INLINE_EQUATION,
            6 => Self::PT_TABLE,
            7 => Self::PT_VERTICAL_TEXT,
            8 => Self::PT_CAPTION_TEXT,
            9 => Self::PT_FLOWING_IMAGE,
            10 => Self::PT_HEADING_IMAGE,
            11 => Self::PT_PULLOUT_IMAGE,
            12 => Self::PT_HORZ_LINE,
            13 => Self::PT_VERT_LINE,
            14 => Self::PT_NOISE,
            15 => Self::PT_COUNT,
            _ => return None,
        };
        Some(block_type)
    }
}
/// Page orientation, represented as a C-compatible enum.
///
/// Discriminants are spelled out explicitly; they are presumably kept in sync
/// with Tesseract's own orientation enum — verify when upgrading.
#[repr(C)]
#[derive(Debug, Clone, Copy, PartialEq)]
#[allow(non_camel_case_types)]
pub enum TessOrientation {
    ORIENTATION_PAGE_UP = 0,
    ORIENTATION_PAGE_RIGHT = 1,
    ORIENTATION_PAGE_DOWN = 2,
    ORIENTATION_PAGE_LEFT = 3,
}

impl TessOrientation {
    /// Lenient conversion: any value outside `0..=3` maps to `ORIENTATION_PAGE_UP`.
    pub fn from_int(value: i32) -> Self {
        Self::try_from_int(value).unwrap_or(Self::ORIENTATION_PAGE_UP)
    }

    /// Strict conversion: returns `None` for any value outside `0..=3`.
    pub fn try_from_int(value: i32) -> Option<Self> {
        let orientation = match value {
            0 => Self::ORIENTATION_PAGE_UP,
            1 => Self::ORIENTATION_PAGE_RIGHT,
            2 => Self::ORIENTATION_PAGE_DOWN,
            3 => Self::ORIENTATION_PAGE_LEFT,
            _ => return None,
        };
        Some(orientation)
    }
}
/// Paragraph justification, represented as a C-compatible enum.
///
/// Discriminants are spelled out explicitly; they are presumably kept in sync
/// with Tesseract's own justification enum — verify when upgrading.
#[repr(C)]
#[derive(Debug, Clone, Copy, PartialEq)]
#[allow(non_camel_case_types)]
pub enum TessParagraphJustification {
    JUSTIFICATION_UNKNOWN = 0,
    JUSTIFICATION_LEFT = 1,
    JUSTIFICATION_CENTER = 2,
    JUSTIFICATION_RIGHT = 3,
}

impl TessParagraphJustification {
    /// Lenient conversion: any value outside `0..=3` maps to `JUSTIFICATION_UNKNOWN`.
    pub fn from_int(value: i32) -> Self {
        Self::try_from_int(value).unwrap_or(Self::JUSTIFICATION_UNKNOWN)
    }

    /// Safely convert an integer to a TessParagraphJustification, returning None for invalid values.
    ///
    /// Provided for parity with the other enums in this file, which all offer
    /// a strict counterpart to `from_int`.
    pub fn try_from_int(value: i32) -> Option<Self> {
        match value {
            0 => Some(Self::JUSTIFICATION_UNKNOWN),
            1 => Some(Self::JUSTIFICATION_LEFT),
            2 => Some(Self::JUSTIFICATION_CENTER),
            3 => Some(Self::JUSTIFICATION_RIGHT),
            _ => None,
        }
    }
}
/// Writing direction, represented as a C-compatible enum.
///
/// Discriminants are spelled out explicitly; they are presumably kept in sync
/// with Tesseract's own writing-direction enum — verify when upgrading.
#[repr(C)]
#[derive(Debug, Clone, Copy, PartialEq)]
#[allow(non_camel_case_types)]
pub enum TessWritingDirection {
    WRITING_DIRECTION_LEFT_TO_RIGHT = 0,
    WRITING_DIRECTION_RIGHT_TO_LEFT = 1,
    WRITING_DIRECTION_TOP_TO_BOTTOM = 2,
}

impl TessWritingDirection {
    /// Lenient conversion: any value outside `0..=2` maps to
    /// `WRITING_DIRECTION_LEFT_TO_RIGHT`.
    pub fn from_int(value: i32) -> Self {
        Self::try_from_int(value).unwrap_or(Self::WRITING_DIRECTION_LEFT_TO_RIGHT)
    }

    /// Strict conversion: returns `None` for any value outside `0..=2`.
    pub fn try_from_int(value: i32) -> Option<Self> {
        let direction = match value {
            0 => Self::WRITING_DIRECTION_LEFT_TO_RIGHT,
            1 => Self::WRITING_DIRECTION_RIGHT_TO_LEFT,
            2 => Self::WRITING_DIRECTION_TOP_TO_BOTTOM,
            _ => return None,
        };
        Some(direction)
    }
}
/// Text-line ordering, represented as a C-compatible enum.
///
/// Discriminants are spelled out explicitly; they are presumably kept in sync
/// with Tesseract's own textline-order enum — verify when upgrading.
#[repr(C)]
#[derive(Debug, Clone, Copy, PartialEq)]
#[allow(non_camel_case_types)]
pub enum TessTextlineOrder {
    TEXTLINE_ORDER_LEFT_TO_RIGHT = 0,
    TEXTLINE_ORDER_RIGHT_TO_LEFT = 1,
    TEXTLINE_ORDER_TOP_TO_BOTTOM = 2,
}

impl TessTextlineOrder {
    /// Lenient conversion: any value outside `0..=2` maps to
    /// `TEXTLINE_ORDER_LEFT_TO_RIGHT`.
    pub fn from_int(value: i32) -> Self {
        Self::try_from_int(value).unwrap_or(Self::TEXTLINE_ORDER_LEFT_TO_RIGHT)
    }

    /// Strict conversion: returns `None` for any value outside `0..=2`.
    pub fn try_from_int(value: i32) -> Option<Self> {
        let order = match value {
            0 => Self::TEXTLINE_ORDER_LEFT_TO_RIGHT,
            1 => Self::TEXTLINE_ORDER_RIGHT_TO_LEFT,
            2 => Self::TEXTLINE_ORDER_TOP_TO_BOTTOM,
            _ => return None,
        };
        Some(order)
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Known values map to their variants; unknown values fall back to `PSM_AUTO`.
    #[test]
    fn test_page_seg_mode_from_int() {
        let cases = [
            (0, TessPageSegMode::PSM_OSD_ONLY),
            (3, TessPageSegMode::PSM_AUTO),
            (10, TessPageSegMode::PSM_SINGLE_CHAR),
            (999, TessPageSegMode::PSM_AUTO),
        ];
        for (raw, expected) in cases {
            assert_eq!(TessPageSegMode::from_int(raw), expected);
        }
    }

    /// Casting a variant to `i32` yields its declared discriminant.
    #[test]
    fn test_page_seg_mode_conversion() {
        assert_eq!(TessPageSegMode::PSM_SINGLE_LINE as i32, 7);
    }

    /// Unknown values fall back to `RIL_BLOCK`.
    #[test]
    fn test_page_iterator_level_from_int() {
        let cases = [
            (0, TessPageIteratorLevel::RIL_BLOCK),
            (3, TessPageIteratorLevel::RIL_WORD),
            (-1, TessPageIteratorLevel::RIL_BLOCK),
        ];
        for (raw, expected) in cases {
            assert_eq!(TessPageIteratorLevel::from_int(raw), expected);
        }
    }

    /// Unknown values fall back to `PT_UNKNOWN`.
    #[test]
    fn test_poly_block_type_from_int() {
        let cases = [
            (1, TessPolyBlockType::PT_FLOWING_TEXT),
            (6, TessPolyBlockType::PT_TABLE),
            (100, TessPolyBlockType::PT_UNKNOWN),
        ];
        for (raw, expected) in cases {
            assert_eq!(TessPolyBlockType::from_int(raw), expected);
        }
    }

    /// Unknown values fall back to `ORIENTATION_PAGE_UP`.
    #[test]
    fn test_orientation_from_int() {
        let cases = [
            (0, TessOrientation::ORIENTATION_PAGE_UP),
            (2, TessOrientation::ORIENTATION_PAGE_DOWN),
            (5, TessOrientation::ORIENTATION_PAGE_UP),
        ];
        for (raw, expected) in cases {
            assert_eq!(TessOrientation::from_int(raw), expected);
        }
    }

    /// Unknown values fall back to `JUSTIFICATION_UNKNOWN`.
    #[test]
    fn test_paragraph_justification_from_int() {
        let cases = [
            (1, TessParagraphJustification::JUSTIFICATION_LEFT),
            (3, TessParagraphJustification::JUSTIFICATION_RIGHT),
            (-1, TessParagraphJustification::JUSTIFICATION_UNKNOWN),
        ];
        for (raw, expected) in cases {
            assert_eq!(TessParagraphJustification::from_int(raw), expected);
        }
    }

    /// Unknown values fall back to `WRITING_DIRECTION_LEFT_TO_RIGHT`.
    #[test]
    fn test_writing_direction_from_int() {
        let cases = [
            (0, TessWritingDirection::WRITING_DIRECTION_LEFT_TO_RIGHT),
            (1, TessWritingDirection::WRITING_DIRECTION_RIGHT_TO_LEFT),
            (10, TessWritingDirection::WRITING_DIRECTION_LEFT_TO_RIGHT),
        ];
        for (raw, expected) in cases {
            assert_eq!(TessWritingDirection::from_int(raw), expected);
        }
    }

    /// Unknown values fall back to `TEXTLINE_ORDER_LEFT_TO_RIGHT`.
    #[test]
    fn test_textline_order_from_int() {
        let cases = [
            (0, TessTextlineOrder::TEXTLINE_ORDER_LEFT_TO_RIGHT),
            (2, TessTextlineOrder::TEXTLINE_ORDER_TOP_TO_BOTTOM),
            (99, TessTextlineOrder::TEXTLINE_ORDER_LEFT_TO_RIGHT),
        ];
        for (raw, expected) in cases {
            assert_eq!(TessTextlineOrder::from_int(raw), expected);
        }
    }

    /// Compile-time check that every enum in this file is `Copy`.
    #[test]
    fn test_enums_are_copy() {
        fn assert_copy<T: Copy>() {}
        assert_copy::<TessPageSegMode>();
        assert_copy::<TessPageIteratorLevel>();
        assert_copy::<TessPolyBlockType>();
        assert_copy::<TessOrientation>();
        assert_copy::<TessParagraphJustification>();
        assert_copy::<TessWritingDirection>();
        assert_copy::<TessTextlineOrder>();
    }
}

View File

@@ -0,0 +1,85 @@
use std::str::Utf8Error;
use thiserror::Error;
/// Errors that can occur when using the Tesseract API.
///
/// The `Display` text of each variant is the string given in its
/// `#[error(...)]` attribute. All variants except `Utf8Error` and
/// `InvalidEnumValue` carry no payload.
#[derive(Error, Debug)]
pub enum TesseractError {
/// Tesseract could not be initialised.
#[error("Failed to initialize Tesseract")]
InitError,
/// An image could not be handed to Tesseract.
#[error("Failed to set image")]
SetImageError,
/// An OCR or image-processing call reported failure (for example a
/// non-zero status code from a native function).
#[error("OCR operation failed")]
OcrError,
/// Text returned by Tesseract was not valid UTF-8. Wraps the standard
/// library error; `#[from]` enables conversion via `?`.
#[error("Invalid UTF-8 in Tesseract output")]
Utf8Error(#[from] Utf8Error),
/// A mutex guarding a native handle could not be locked.
#[error("Failed to lock mutex")]
MutexLockError,
/// A Tesseract variable could not be set.
#[error("Failed to set variable")]
SetVariableError,
/// A Tesseract variable could not be read.
#[error("Failed to get variable")]
GetVariableError,
/// A native call returned a null pointer where a value was required.
#[error("Null pointer error")]
NullPointerError,
/// A caller-supplied parameter was rejected.
#[error("Invalid parameter")]
InvalidParameterError,
/// Layout analysis failed.
#[error("Layout analysis failed")]
AnalyseLayoutError,
/// Page processing failed.
#[error("Page processing failed")]
ProcessPagesError,
/// An I/O operation failed. No underlying error value is carried.
#[error("I/O error")]
IoError,
/// Generic mutex failure.
/// NOTE(review): overlaps with `MutexLockError` — confirm which call sites
/// use which before merging or removing either.
#[error("Mutex error")]
MutexError,
/// Image dimensions were rejected.
#[error("Invalid dimensions")]
InvalidDimensions,
/// The bytes-per-pixel value was rejected.
#[error("Invalid bytes per pixel")]
InvalidBytesPerPixel,
/// The bytes-per-line value was rejected.
#[error("Invalid bytes per line")]
InvalidBytesPerLine,
/// The supplied image data was rejected (for example a buffer whose length
/// does not match the stated dimensions).
#[error("Invalid image data")]
InvalidImageData,
/// An operation was attempted on something not yet initialised.
#[error("Uninitialized error")]
UninitializedError,
/// An integer did not correspond to any variant of the target enum; the
/// offending value is carried and shown in the message.
#[error("Invalid enum value: {0}")]
InvalidEnumValue(i32),
/// A string destined for a C API contained an interior NUL byte.
#[error("String contains null byte")]
NullByteInString,
}
/// Result type for Tesseract operations.
///
/// Shorthand for `std::result::Result<T, TesseractError>`.
pub type Result<T> = std::result::Result<T, TesseractError>;
#[cfg(test)]
mod tests {
    use super::*;

    /// `Display` output matches the `#[error(...)]` strings.
    #[test]
    fn test_error_display() {
        let expectations = [
            (TesseractError::InitError, "Failed to initialize Tesseract"),
            (TesseractError::SetImageError, "Failed to set image"),
            (TesseractError::OcrError, "OCR operation failed"),
        ];
        for (error, message) in expectations {
            assert_eq!(error.to_string(), message);
        }
    }

    /// A std `Utf8Error` converts into the `Utf8Error` variant.
    #[test]
    fn test_utf8_error_conversion() {
        let invalid_utf8 = vec![0xFF, 0xFE];
        let utf8_error = std::str::from_utf8(&invalid_utf8).unwrap_err();
        let tess_error = TesseractError::from(utf8_error);
        assert!(
            matches!(tess_error, TesseractError::Utf8Error(_)),
            "Expected Utf8Error variant"
        );
    }

    /// Compile-time check that the error type can cross thread boundaries.
    #[test]
    fn test_error_is_send_sync() {
        fn assert_send_sync<T: Send + Sync>() {}
        assert_send_sync::<TesseractError>();
    }
}

View File

@@ -0,0 +1,807 @@
//! Safe Leptonica Pix wrapper for image preprocessing before OCR.
//!
//! Provides a safe Rust wrapper around the Leptonica image-processing library.
//! `Pix` is the core Leptonica image type. All methods return `Result<Pix>`,
//! and the wrapper takes care of proper memory management via `Drop`.
//!
//! ## Pixel format
//!
//! Leptonica's 32 bpp format stores each pixel as a native 32-bit integer
//! with the logical layout (MSB→LSB): `R G B A`, i.e.
//! `(r << 24) | (g << 16) | (b << 8) | alpha`. Leptonica accesses
//! individual channels via bit-shift on the integer value, not via
//! byte-addressed pointer arithmetic, so the packing is identical on both
//! big- and little-endian hosts. Do **not** call `pixEndianByteSwap` after
//! writing pixels this way — doing so inverts the channel order.
//!
//! ## `pixDeskew` requires a binary (1 bpp) image
//!
//! Call `to_grayscale()` followed by `adaptive_threshold()` before `deskew()`.
//! `pixDeskew` internally calls `pixFindSkewSweepAndSearchScorePivot` which
//! operates on 1-bit images only; passing a colour image will return a null
//! pointer.
use crate::error::{Result, TesseractError};
use std::ffi::c_void;
// ---------------------------------------------------------------------------
// Raw Leptonica FFI declarations
// ---------------------------------------------------------------------------
// Raw Leptonica entry points used by the safe `Pix` wrapper in this file.
// `PIX *` and `BOX *` are both represented as `*mut c_void`.
#[cfg(any(feature = "build-tesseract", feature = "build-tesseract-wasm"))]
ffi_extern! {
/// Allocates a new Pix with the given dimensions and bit depth.
fn pixCreate(width: i32, height: i32, depth: i32) -> *mut c_void;
/// Frees a Pix and sets the caller's pointer to null.
///
/// Leptonica uses a double-pointer convention: `*ppix` is set to null
/// after the call so that accidental double-frees are a no-op.
fn pixDestroy(ppix: *mut *mut c_void);
/// Sets the horizontal and vertical resolution (DPI) on a Pix.
///
/// Returns 0 on success, non-zero on error.
fn pixSetResolution(pix: *mut c_void, xres: i32, yres: i32) -> i32;
/// Returns the width of the Pix in pixels.
fn pixGetWidth(pix: *const c_void) -> i32;
/// Returns the height of the Pix in pixels.
fn pixGetHeight(pix: *const c_void) -> i32;
/// Returns the bit depth of the Pix (1, 2, 4, 8, 16, or 32).
fn pixGetDepth(pix: *const c_void) -> i32;
/// Returns the number of 32-bit words per row (words-per-line).
fn pixGetWpl(pix: *const c_void) -> i32;
/// Returns a mutable pointer to the start of the pixel data array.
///
/// The data is stored as rows of 32-bit words; each word covers 32/depth pixels.
fn pixGetData(pix: *mut c_void) -> *mut u32;
/// Deskews a 1 bpp image using a sweep-and-search algorithm.
///
/// `redsearch` is the reduction factor used during the search; pass 0 for
/// the Leptonica default (2x reduction). Returns a new deskewed Pix on
/// success, or null on failure. The input Pix is **not** consumed.
fn pixDeskew(pixs: *mut c_void, redsearch: i32) -> *mut c_void;
/// Estimates the skew angle and confidence for a 1 bpp image.
///
/// Writes the angle (degrees, positive = counter-clockwise) into `*pangle`
/// and a confidence score (0–1) into `*pconf`. Returns 0 on success.
fn pixFindSkew(pixs: *mut c_void, pangle: *mut f32, pconf: *mut f32) -> i32;
/// Applies Otsu adaptive thresholding to produce a binarised Pix.
///
/// `sx`/`sy` are the tile dimensions; `smoothx`/`smoothy` are half-widths
/// for smoothing the threshold map; `scorefract` controls threshold acceptance
/// (typical value: 0.1). `ppixth` (optional) receives the threshold image;
/// `ppixd` receives the binarised output.
fn pixOtsuAdaptiveThreshold(
pixs: *mut c_void,
sx: i32,
sy: i32,
smoothx: i32,
smoothy: i32,
scorefract: f32,
ppixth: *mut *mut c_void,
ppixd: *mut *mut c_void,
) -> i32;
/// Normalises the background of a grayscale image using morphological operations.
///
/// `reduction` is the subsampling factor (e.g. 4), `size` is the morphological
/// structuring-element half-size (e.g. 15), and `bgval` is the target background
/// value (e.g. 200). Returns a new normalised Pix, or null on failure.
fn pixBackgroundNormMorph(
pixs: *mut c_void,
pixim: *mut c_void,
reduction: i32,
size: i32,
bgval: i32,
) -> *mut c_void;
/// Applies unsharp masking to sharpen a grayscale or colour Pix.
///
/// `halfwidth` is the half-size of the blur kernel; `fract` controls the
/// sharpening strength (0.0–1.0 typical). Returns a new Pix, or null on failure.
fn pixUnsharpMasking(pixs: *mut c_void, halfwidth: i32, fract: f32) -> *mut c_void;
/// Scales a Pix by independent x and y factors using the best available method.
///
/// Returns a new scaled Pix, or null on failure. The input Pix is **not** consumed.
fn pixScale(pixs: *mut c_void, scalex: f32, scaley: f32) -> *mut c_void;
/// Converts an RGB (32 bpp) Pix to 8 bpp grayscale.
///
/// `rwt`, `gwt`, `bwt` are the red, green, and blue channel weights; pass
/// 0.0 for all three to use Leptonica's default weights. Returns a new
/// 8 bpp Pix, or null on failure.
///
/// NOTE(review): this comment previously described the defaults as "equal
/// weights"; Leptonica's built-in defaults appear to be unequal per-channel
/// weights — confirm against the Leptonica `pixConvertRGBToGray` docs.
fn pixConvertRGBToGray(pixs: *mut c_void, rwt: f32, gwt: f32, bwt: f32) -> *mut c_void;
/// Creates a Leptonica BOX with the given coordinates.
fn boxCreate(x: i32, y: i32, w: i32, h: i32) -> *mut c_void;
/// Frees a Leptonica BOX.
fn boxDestroy(pbox: *mut *mut c_void);
/// Clips a rectangular region from a Pix.
///
/// Returns a new Pix containing the clipped region, or null on failure.
/// `pboxc` (optional) receives the actual clipped box; pass null to ignore.
fn pixClipRectangle(pixs: *mut c_void, box_: *mut c_void, pboxc: *mut *mut c_void) -> *mut c_void;
/// Counts connected components in a 1 bpp image.
///
/// `connectivity` is 4 or 8. Writes the count to `*pcount`.
/// Returns 0 on success.
fn pixCountConnComp(pix: *mut c_void, connectivity: i32, pcount: *mut i32) -> i32;
/// Retrieves the horizontal and vertical resolution (DPI) from a Pix.
///
/// Writes the x-resolution into `*pxres` and y-resolution into `*pyres`.
/// Returns 0 on success, non-zero on error.
fn pixGetResolution(pix: *const c_void, pxres: *mut i32, pyres: *mut i32) -> i32;
}
// ---------------------------------------------------------------------------
// Safe Pix wrapper
// ---------------------------------------------------------------------------
/// Safe wrapper around a Leptonica `PIX *` image object.
///
/// Owns the underlying allocation and frees it in `Drop`. All methods that
/// return a new image allocate a fresh `Pix`; the receiver is never consumed.
///
/// # Thread safety
///
/// `Pix` is `Send` because Leptonica image objects are independent heap
/// allocations with no shared mutable state. Concurrent mutation from multiple
/// threads is **not** safe (no `Sync`).
#[cfg(any(feature = "build-tesseract", feature = "build-tesseract-wasm"))]
pub struct Pix {
// Raw `PIX *`. The constructors and image operations visible in this file
// only wrap a pointer after checking that it is non-null.
ptr: *mut c_void,
}
#[cfg(any(feature = "build-tesseract", feature = "build-tesseract-wasm"))]
impl std::fmt::Debug for Pix {
    /// Formats the wrapper as a struct named `Pix` with a single `ptr` field
    /// holding the raw pointer; pixel contents are not printed.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let mut builder = f.debug_struct("Pix");
        builder.field("ptr", &self.ptr);
        builder.finish()
    }
}
// SAFETY: A Pix owns a uniquely heap-allocated Leptonica PIX. There is no
// interior mutability shared across thread boundaries, so transferring
// ownership to another thread is safe.
// `Sync` is intentionally not implemented: the struct documentation states
// that concurrent mutation from multiple threads is not safe.
#[cfg(any(feature = "build-tesseract", feature = "build-tesseract-wasm"))]
unsafe impl Send for Pix {}
#[cfg(any(feature = "build-tesseract", feature = "build-tesseract-wasm"))]
impl Pix {
// -----------------------------------------------------------------------
// Construction
// -----------------------------------------------------------------------
/// Creates a 32 bpp Leptonica Pix from a packed RGB byte slice.
///
/// `data` must contain exactly `width * height * 3` bytes in left-to-right,
/// top-to-bottom, `R G B` interleaved order.
///
/// The DPI is set to 300 × 300 which is a sensible default for OCR input.
///
/// # Errors
///
/// Returns `TesseractError::InvalidImageData` if `data` length does not
/// match `width * height * 3`, if either dimension is zero, or if either
/// dimension does not fit in the signed 32-bit integer Leptonica expects.
///
/// Returns `TesseractError::NullPointerError` if Leptonica's `pixCreate`
/// or `pixGetData` returns null.
///
/// # Examples
///
/// ```rust,no_run
/// # use kreuzberg_tesseract::Pix;
/// let rgb = vec![255u8; 4 * 4 * 3]; // 4×4 white image
/// let pix = Pix::from_raw_rgb(&rgb, 4, 4).unwrap();
/// assert_eq!(pix.width(), 4);
/// assert_eq!(pix.height(), 4);
/// assert_eq!(pix.depth(), 32);
/// ```
pub fn from_raw_rgb(data: &[u8], width: u32, height: u32) -> Result<Pix> {
    let cols = width as usize;
    let rows = height as usize;
    let expected = cols
        .checked_mul(rows)
        .and_then(|n| n.checked_mul(3))
        .ok_or(TesseractError::InvalidImageData)?;
    if data.len() != expected || width == 0 || height == 0 {
        return Err(TesseractError::InvalidImageData);
    }
    // Leptonica takes signed 32-bit dimensions. A plain `as i32` cast would
    // silently wrap values above `i32::MAX` to negative numbers, so reject
    // them up front instead.
    if width > i32::MAX as u32 || height > i32::MAX as u32 {
        return Err(TesseractError::InvalidImageData);
    }

    // SAFETY: pixCreate() allocates a new PIX with the requested dimensions.
    // Both dimensions were checked above to be positive and representable as
    // i32, and the depth (32) is a valid bit depth. A null return is handled
    // immediately below.
    let pix_ptr = unsafe { pixCreate(width as i32, height as i32, 32) };
    if pix_ptr.is_null() {
        return Err(TesseractError::NullPointerError);
    }

    // SAFETY: pix_ptr is a valid, non-null Pix that we own exclusively at
    // this point; it has not been exposed to any other code.
    let data_ptr = unsafe { pixGetData(pix_ptr) };
    if data_ptr.is_null() {
        // Clean up before returning the error.
        // SAFETY: pix_ptr is a valid non-null allocation from pixCreate().
        // Passing a pointer to a local copy satisfies Leptonica's
        // double-pointer convention; the copy is nulled by the call.
        let mut doomed = pix_ptr;
        unsafe { pixDestroy(&mut doomed) };
        return Err(TesseractError::NullPointerError);
    }

    // SAFETY: pixGetWpl() only reads the header of a valid Pix.
    // For a 32 bpp image each pixel occupies exactly one 32-bit word, so
    // wpl == width and `row * wpl + col` stays inside the pixel buffer.
    let wpl = unsafe { pixGetWpl(pix_ptr) } as usize;

    // Leptonica's 32 bpp format stores each pixel as a native 32-bit integer
    // with the logical layout (MSB→LSB) R G B A, i.e.
    // `(r << 24) | (g << 16) | (b << 8) | alpha`. Channels are accessed by
    // bit-shift, not byte address, so the packing is the same on big- and
    // little-endian hosts and no byte swap must be applied afterwards.
    //
    // `cols * 3` is non-zero (width != 0) and cannot overflow because
    // `cols * rows * 3` was computed with checked arithmetic above.
    for (row, line) in data.chunks_exact(cols * 3).enumerate() {
        for (col, rgb) in line.chunks_exact(3).enumerate() {
            let word: u32 = (u32::from(rgb[0]) << 24)
                | (u32::from(rgb[1]) << 16)
                | (u32::from(rgb[2]) << 8)
                | 0xFF;
            // SAFETY: data_ptr points at the writable pixel buffer of the
            // Pix. The offset is in bounds because row < height and
            // col < width <= wpl.
            unsafe {
                *data_ptr.add(row * wpl + col) = word;
            }
        }
    }

    // Set a sensible default DPI for OCR processing. The status code is
    // deliberately ignored: the image is still usable without a resolution.
    // SAFETY: pix_ptr is valid and non-null.
    unsafe { pixSetResolution(pix_ptr, 300, 300) };

    Ok(Pix { ptr: pix_ptr })
}
// -----------------------------------------------------------------------
// Image processing operations
// -----------------------------------------------------------------------
/// Produces a deskewed copy of this image; the receiver is left untouched.
///
/// **Note:** `pixDeskew` requires a 1 bpp (binary) image. Call
/// `to_grayscale()` followed by `adaptive_threshold()` before invoking
/// this method on a colour or grayscale Pix.
///
/// # Errors
///
/// Returns `TesseractError::NullPointerError` if Leptonica returns null
/// (typically because the input is not 1 bpp or the image is too small).
///
/// # Examples
///
/// ```rust,no_run
/// # use kreuzberg_tesseract::Pix;
/// # let rgb = vec![0u8; 100 * 100 * 3];
/// # let pix = Pix::from_raw_rgb(&rgb, 100, 100).unwrap();
/// let gray = pix.to_grayscale().unwrap();
/// let binary = gray.adaptive_threshold(32, 32).unwrap();
/// let deskewed = binary.deskew().unwrap();
/// ```
pub fn deskew(&self) -> Result<Pix> {
    // SAFETY: self.ptr is a valid non-null Pix owned by `self`. pixDeskew()
    // does not take ownership of its input; it hands back a separate
    // allocation (or null on failure), which is checked before wrapping.
    // The second argument 0 selects Leptonica's default search reduction.
    let deskewed = unsafe { pixDeskew(self.ptr, 0) };
    if deskewed.is_null() {
        return Err(TesseractError::NullPointerError);
    }
    Ok(Pix { ptr: deskewed })
}
/// Estimates the skew angle (degrees) and confidence (0–1) for this image.
///
/// A positive angle indicates counter-clockwise skew. Confidence near 1.0
/// means a clear dominant skew direction was found.
///
/// **Note:** Like `deskew`, this operates on 1 bpp images.
///
/// # Errors
///
/// Returns `TesseractError::OcrError` if `pixFindSkew` returns a non-zero
/// status (e.g. insufficient contrast or wrong bit depth).
///
/// # Examples
///
/// ```rust,no_run
/// # use kreuzberg_tesseract::Pix;
/// # let rgb = vec![0u8; 100 * 100 * 3];
/// # let pix = Pix::from_raw_rgb(&rgb, 100, 100).unwrap();
/// let gray = pix.to_grayscale().unwrap();
/// let binary = gray.adaptive_threshold(32, 32).unwrap();
/// let (angle, confidence) = binary.find_skew().unwrap();
/// println!("Skew: {angle:.2}° (confidence {confidence:.2})");
/// ```
pub fn find_skew(&self) -> Result<(f32, f32)> {
    let (mut skew_degrees, mut confidence) = (0.0_f32, 0.0_f32);
    // SAFETY: self.ptr is valid and non-null. Both out-parameters point at
    // locals that outlive the call, so pixFindSkew() may write to them.
    let rc = unsafe { pixFindSkew(self.ptr, &mut skew_degrees, &mut confidence) };
    match rc {
        0 => Ok((skew_degrees, confidence)),
        _ => Err(TesseractError::OcrError),
    }
}
/// Produces a binarised copy of this image via Otsu adaptive thresholding.
///
/// The threshold is computed per local region; `tile_width` and
/// `tile_height` give the size of those regions. Values around 16 to 64 work
/// well for typical document images, and smaller tiles follow local contrast
/// more closely.
///
/// # Errors
///
/// Returns `TesseractError::OcrError` if `pixOtsuAdaptiveThreshold` returns a
/// non-zero status, or `TesseractError::NullPointerError` if it reports
/// success but yields a null image.
///
/// # Examples
///
/// ```rust,no_run
/// # use kreuzberg_tesseract::Pix;
/// # let rgb = vec![128u8; 64 * 64 * 3];
/// # let pix = Pix::from_raw_rgb(&rgb, 64, 64).unwrap();
/// let gray = pix.to_grayscale().unwrap();
/// let binary = gray.adaptive_threshold(32, 32).unwrap();
/// assert_eq!(binary.depth(), 1);
/// ```
pub fn adaptive_threshold(&self, tile_width: i32, tile_height: i32) -> Result<Pix> {
    let mut binarised: *mut c_void = std::ptr::null_mut();
    // SAFETY: `self.ptr` is a valid non-null Pix. The threshold-map output
    // (ppixth) is passed as null because that intermediate image is not
    // needed. `binarised` is a local out-pointer written by the call; it is
    // null-checked before being wrapped in a Pix.
    let status = unsafe {
        pixOtsuAdaptiveThreshold(
            self.ptr,
            tile_width,
            tile_height,
            0,                    // smoothx: no smoothing
            0,                    // smoothy: no smoothing
            0.1,                  // scorefract: Leptonica-recommended default
            std::ptr::null_mut(), // ppixth: threshold map not requested
            &mut binarised,
        )
    };
    // The status is examined first, so a failing call always maps to
    // `OcrError` regardless of what was written to the out-pointer.
    match (status, binarised.is_null()) {
        (0, false) => Ok(Pix { ptr: binarised }),
        (0, true) => Err(TesseractError::NullPointerError),
        _ => Err(TesseractError::OcrError),
    }
}
/// Reads the image resolution as `(horizontal_dpi, vertical_dpi)`.
///
/// # Errors
///
/// Returns `TesseractError::OcrError` if `pixGetResolution` fails.
pub fn get_resolution(&self) -> Result<(i32, i32)> {
    let (mut xres, mut yres) = (0_i32, 0_i32);
    // SAFETY: `self.ptr` is a valid non-null Pix, and both out-pointers
    // target live stack locals. pixGetResolution reads the Pix header.
    let status = unsafe { pixGetResolution(self.ptr, &mut xres, &mut yres) };
    match status {
        0 => Ok((xres, yres)),
        _ => Err(TesseractError::OcrError),
    }
}
/// Stores a new horizontal (`xres`) and vertical (`yres`) resolution, in DPI,
/// on this image.
///
/// # Errors
///
/// Returns `TesseractError::OcrError` if `pixSetResolution` fails.
pub fn set_resolution(&mut self, xres: i32, yres: i32) -> Result<()> {
    // SAFETY: `self.ptr` is a valid non-null Pix. pixSetResolution only
    // writes two integer fields in the Pix header.
    let status = unsafe { pixSetResolution(self.ptr, xres, yres) };
    match status {
        0 => Ok(()),
        _ => Err(TesseractError::OcrError),
    }
}
/// Ensures the image has a valid (non-zero) DPI resolution.
///
/// If *either* the x or the y resolution is zero, both are set to 72 DPI as
/// a safe fallback. This prevents Leptonica operations that depend on
/// resolution metadata from producing incorrect results.
///
/// This is best-effort: if the resolution cannot be read, nothing is
/// changed, and the status returned by `pixSetResolution` is ignored.
///
/// Note that the underlying Pix is modified through the raw pointer even
/// though this method only takes `&self`.
fn ensure_valid_resolution(&self) {
if let Ok((xres, yres)) = self.get_resolution()
&& (xres == 0 || yres == 0)
{
// SAFETY: self.ptr is valid. We set a safe default DPI.
unsafe { pixSetResolution(self.ptr, 72, 72) };
}
}
/// Returns a new Pix whose background has been normalised with
/// morphological operations.
///
/// This is a useful preprocessing step for documents with uneven
/// illumination or a non-white background. The source image's pixels are
/// left untouched, although a missing (zero) resolution is first replaced
/// with a fallback DPI.
///
/// # Errors
///
/// Returns `TesseractError::NullPointerError` if `pixBackgroundNormMorph`
/// returns null.
///
/// # Examples
///
/// ```rust,no_run
/// # use kreuzberg_tesseract::Pix;
/// # let rgb = vec![200u8; 100 * 100 * 3];
/// # let pix = Pix::from_raw_rgb(&rgb, 100, 100).unwrap();
/// let gray = pix.to_grayscale().unwrap();
/// let normalised = gray.background_normalize().unwrap();
/// ```
pub fn background_normalize(&self) -> Result<Pix> {
    self.ensure_valid_resolution();
    // SAFETY: `self.ptr` is a valid non-null Pix. The mask image (pixim) is
    // passed as null, meaning no mask. pixBackgroundNormMorph() returns a
    // newly allocated Pix, or null on failure.
    let normalised = unsafe {
        pixBackgroundNormMorph(
            self.ptr,
            std::ptr::null_mut(), // pixim: no mask
            4,                    // reduction: 4x subsampling
            15,                   // size: morphological SE half-size
            200,                  // bgval: target background value
        )
    };
    if normalised.is_null() {
        return Err(TesseractError::NullPointerError);
    }
    Ok(Pix { ptr: normalised })
}
/// Sharpens this image with unsharp masking and returns the result as a new
/// Pix.
///
/// `halfwidth` is the half-size of the blur kernel (e.g. 1 to 5).
/// `fract` is the sharpening fraction in the range 0.0 to 1.0; values
/// around 0.3 to 0.5 produce visible sharpening without artefacts.
///
/// # Errors
///
/// Returns `TesseractError::NullPointerError` if `pixUnsharpMasking`
/// returns null.
///
/// # Examples
///
/// ```rust,no_run
/// # use kreuzberg_tesseract::Pix;
/// # let rgb = vec![128u8; 64 * 64 * 3];
/// # let pix = Pix::from_raw_rgb(&rgb, 64, 64).unwrap();
/// let sharpened = pix.unsharp_mask(2, 0.4).unwrap();
/// ```
pub fn unsharp_mask(&self, halfwidth: i32, fract: f32) -> Result<Pix> {
    self.ensure_valid_resolution();
    // SAFETY: `self.ptr` is valid and non-null. pixUnsharpMasking() returns
    // a new Pix without modifying or taking ownership of the source.
    let sharpened = unsafe { pixUnsharpMasking(self.ptr, halfwidth, fract) };
    if sharpened.is_null() {
        return Err(TesseractError::NullPointerError);
    }
    Ok(Pix { ptr: sharpened })
}
/// Returns a copy of this image scaled by `sx` horizontally and `sy`
/// vertically.
///
/// Leptonica automatically chooses the best scaling algorithm based on
/// the scale factors and bit depth (area mapping for downscaling,
/// linear interpolation for upscaling).
///
/// # Errors
///
/// Returns `TesseractError::NullPointerError` if `pixScale` returns null.
///
/// # Examples
///
/// ```rust,no_run
/// # use kreuzberg_tesseract::Pix;
/// # let rgb = vec![255u8; 40 * 40 * 3];
/// # let pix = Pix::from_raw_rgb(&rgb, 40, 40).unwrap();
/// let upscaled = pix.scale(2.0, 2.0).unwrap();
/// assert_eq!(upscaled.width(), 80);
/// assert_eq!(upscaled.height(), 80);
/// ```
pub fn scale(&self, sx: f32, sy: f32) -> Result<Pix> {
    // SAFETY: `self.ptr` is valid and non-null. pixScale() creates a new Pix
    // and does not modify the source.
    let scaled = unsafe { pixScale(self.ptr, sx, sy) };
    if scaled.is_null() {
        return Err(TesseractError::NullPointerError);
    }
    Ok(Pix { ptr: scaled })
}
/// Extracts a rectangular sub-region of this image as a new Pix.
///
/// The rectangle is given in pixel space: `(x, y)` is its top-left corner,
/// `w` its width and `h` its height. Only the pixels inside that rectangle
/// end up in the returned image.
///
/// # Errors
///
/// Returns `TesseractError::NullPointerError` if the temporary BOX cannot be
/// created or if the crop itself fails.
pub fn clip_rectangle(&self, x: i32, y: i32, w: i32, h: i32) -> Result<Pix> {
    // SAFETY: boxCreate allocates a new BOX on the heap.
    let mut region = unsafe { boxCreate(x, y, w, h) };
    if region.is_null() {
        return Err(TesseractError::NullPointerError);
    }
    // SAFETY: pixClipRectangle returns a new Pix clipped to the BOX region.
    // pboxc is null because the clipped box coordinates are not needed back.
    let clipped = unsafe { pixClipRectangle(self.ptr, region, std::ptr::null_mut()) };
    // SAFETY: `region` is the BOX allocated above; it is released exactly
    // once here, whether or not the clip succeeded.
    unsafe { boxDestroy(&mut region) };
    if clipped.is_null() {
        return Err(TesseractError::NullPointerError);
    }
    Ok(Pix { ptr: clipped })
}
/// Returns the number of connected components in a 1 bpp (binary) image.
///
/// `connectivity` should be 4 or 8.
///
/// # Errors
///
/// Returns `TesseractError::OcrError` if `pixCountConnComp` fails
/// (e.g., wrong bit depth — image must be 1 bpp).
pub fn count_connected_components(&self, connectivity: i32) -> Result<i32> {
    let mut components = 0_i32;
    // SAFETY: `self.ptr` is a valid Pix, and `components` is a live stack
    // local used as the out-parameter.
    let status = unsafe { pixCountConnComp(self.ptr, connectivity, &mut components) };
    match status {
        0 => Ok(components),
        _ => Err(TesseractError::OcrError),
    }
}
/// Produces an 8 bpp grayscale Pix from this 32 bpp RGB image.
///
/// All three weight parameters are passed as 0.0, which instructs Leptonica
/// to use its default perceptual weights (approx. 0.299 R, 0.587 G,
/// 0.114 B).
///
/// # Errors
///
/// Returns `TesseractError::NullPointerError` if `pixConvertRGBToGray`
/// returns null (e.g. the source is not 32 bpp).
///
/// # Examples
///
/// ```rust,no_run
/// # use kreuzberg_tesseract::Pix;
/// # let rgb = vec![100u8, 150u8, 200u8].repeat(10 * 10);
/// # let pix = Pix::from_raw_rgb(&rgb, 10, 10).unwrap();
/// let gray = pix.to_grayscale().unwrap();
/// assert_eq!(gray.depth(), 8);
/// ```
pub fn to_grayscale(&self) -> Result<Pix> {
    self.ensure_valid_resolution();
    // SAFETY: `self.ptr` is valid and non-null. pixConvertRGBToGray() returns
    // a new 8 bpp Pix; the source is not modified.
    let gray = unsafe { pixConvertRGBToGray(self.ptr, 0.0, 0.0, 0.0) };
    if gray.is_null() {
        return Err(TesseractError::NullPointerError);
    }
    Ok(Pix { ptr: gray })
}
// -----------------------------------------------------------------------
// Accessors
// -----------------------------------------------------------------------
/// Exposes the underlying Leptonica `PIX *` without transferring ownership.
///
/// Intended for passing this image to `TesseractAPI::set_image_2`.
///
/// # Safety
///
/// Calling this method is safe, but using the returned pointer comes with
/// obligations:
///
/// * The `Pix` must outlive every use of the pointer.
///   `TessBaseAPISetImage2` **borrows** the pointer — it does not take
///   ownership — so the `Pix` must remain alive until after
///   `TessBaseAPIRecognize` (or any other Tesseract call that consumes the
///   image data) has completed. Dropping the `Pix` while Tesseract holds
///   the pointer will result in a use-after-free.
/// * The caller must **not** free the pointer; `Pix::drop` is solely
///   responsible for deallocation via `pixDestroy`.
pub fn as_ptr(&self) -> *mut c_void {
    self.ptr
}
/// Image width, in pixels.
///
/// # Examples
///
/// ```rust,no_run
/// # use kreuzberg_tesseract::Pix;
/// # let pix = Pix::from_raw_rgb(&vec![0u8; 8 * 6 * 3], 8, 6).unwrap();
/// assert_eq!(pix.width(), 8);
/// ```
pub fn width(&self) -> i32 {
    // SAFETY: `self.ptr` is a valid non-null Pix. pixGetWidth() only reads
    // the Pix header struct and mutates nothing.
    unsafe { pixGetWidth(self.ptr) }
}
/// Image height, in pixels.
///
/// # Examples
///
/// ```rust,no_run
/// # use kreuzberg_tesseract::Pix;
/// # let pix = Pix::from_raw_rgb(&vec![0u8; 8 * 6 * 3], 8, 6).unwrap();
/// assert_eq!(pix.height(), 6);
/// ```
pub fn height(&self) -> i32 {
    // SAFETY: `self.ptr` is a valid non-null Pix. pixGetHeight() only reads
    // the Pix header struct.
    unsafe { pixGetHeight(self.ptr) }
}
/// Bit depth of the image (1, 8, or 32 for this module's usage).
///
/// # Examples
///
/// ```rust,no_run
/// # use kreuzberg_tesseract::Pix;
/// # let pix = Pix::from_raw_rgb(&vec![0u8; 4 * 4 * 3], 4, 4).unwrap();
/// assert_eq!(pix.depth(), 32);
/// ```
pub fn depth(&self) -> i32 {
    // SAFETY: `self.ptr` is a valid non-null Pix. pixGetDepth() only reads
    // the Pix header struct.
    unsafe { pixGetDepth(self.ptr) }
}
}
// ---------------------------------------------------------------------------
// Drop implementation
// ---------------------------------------------------------------------------
#[cfg(any(feature = "build-tesseract", feature = "build-tesseract-wasm"))]
impl Drop for Pix {
    /// Releases the underlying Leptonica image, if there is one.
    fn drop(&mut self) {
        if self.ptr.is_null() {
            // Nothing was allocated (or it was already released).
            return;
        }
        // SAFETY: `self.ptr` is a non-null Leptonica PIX that this value
        // allocated and owns exclusively. pixDestroy() takes a double
        // pointer, frees the image and sets `*ppix` to null, so it must be
        // called exactly once per allocation. Because Leptonica nulls
        // `self.ptr`, a hypothetical second drop() would take the early
        // return above instead of double-freeing.
        unsafe { pixDestroy(&mut self.ptr) };
    }
}
// ---------------------------------------------------------------------------
// Tests
// ---------------------------------------------------------------------------
#[cfg(test)]
#[cfg(any(feature = "build-tesseract", feature = "build-tesseract-wasm"))]
mod tests {
    use super::*;

    /// Builds a `width` x `height` RGB Pix whose every byte equals `fill`.
    fn make_rgb_pix(width: u32, height: u32, fill: u8) -> Pix {
        let byte_len = (width * height * 3) as usize;
        let pixels = vec![fill; byte_len];
        Pix::from_raw_rgb(&pixels, width, height).expect("from_raw_rgb failed")
    }

    /// A Pix built from raw RGB reports the requested size and 32 bpp.
    #[test]
    fn test_from_raw_rgb_dimensions() {
        let image = make_rgb_pix(16, 8, 200);
        assert_eq!(image.width(), 16);
        assert_eq!(image.height(), 8);
        assert_eq!(image.depth(), 32);
    }

    /// A buffer whose length does not match the dimensions is rejected.
    #[test]
    fn test_from_raw_rgb_wrong_length() {
        let too_short = vec![0u8; 10]; // too short for 4×4
        let failure = Pix::from_raw_rgb(&too_short, 4, 4).unwrap_err();
        assert!(matches!(failure, TesseractError::InvalidImageData));
    }

    /// A zero width or a zero height is rejected.
    #[test]
    fn test_from_raw_rgb_zero_dimensions() {
        for (width, height) in [(0, 4), (4, 0)] {
            let failure = Pix::from_raw_rgb(&[], width, height).unwrap_err();
            assert!(matches!(failure, TesseractError::InvalidImageData));
        }
    }

    /// A successfully constructed Pix never exposes a null pointer.
    #[test]
    fn test_as_ptr_is_non_null() {
        let image = make_rgb_pix(8, 8, 128);
        let raw = image.as_ptr();
        assert!(!raw.is_null());
    }

    /// Grayscale conversion keeps the dimensions and yields 8 bpp.
    #[test]
    fn test_to_grayscale() {
        let colour = make_rgb_pix(32, 32, 150);
        let gray = colour.to_grayscale().expect("to_grayscale failed");
        assert_eq!(gray.width(), 32);
        assert_eq!(gray.height(), 32);
        assert_eq!(gray.depth(), 8);
    }

    /// Scaling by 2.0 in both directions doubles width and height.
    #[test]
    fn test_scale_up() {
        let original = make_rgb_pix(20, 10, 100);
        let doubled = original.scale(2.0, 2.0).expect("scale failed");
        assert_eq!(doubled.width(), 40);
        assert_eq!(doubled.height(), 20);
    }

    /// Unsharp masking leaves the image dimensions unchanged.
    #[test]
    fn test_unsharp_mask_returns_same_dimensions() {
        let original = make_rgb_pix(32, 32, 200);
        let sharpened = original.unsharp_mask(2, 0.4).expect("unsharp_mask failed");
        assert_eq!(sharpened.width(), 32);
        assert_eq!(sharpened.height(), 32);
    }

    /// Adaptive thresholding of a grayscale image yields a 1 bpp image.
    #[test]
    fn test_adaptive_threshold_produces_1bpp() {
        let colour = make_rgb_pix(64, 64, 180);
        let gray = colour.to_grayscale().expect("to_grayscale failed");
        let binary = gray.adaptive_threshold(32, 32).expect("adaptive_threshold failed");
        assert_eq!(binary.depth(), 1);
    }
}

View File

@@ -0,0 +1,218 @@
#![cfg_attr(
not(any(feature = "build-tesseract", feature = "build-tesseract-wasm")),
allow(unused_variables, dead_code)
)]
#![allow(clippy::arc_with_non_send_sync)]
#![allow(clippy::missing_transmute_annotations)]
#![allow(clippy::type_complexity)]
#![allow(clippy::new_without_default)]
#![allow(clippy::not_unsafe_ptr_arg_deref)]
#![allow(clippy::cmp_null)]
//! # kreuzberg-tesseract
//!
//! `kreuzberg-tesseract` provides safe Rust bindings for Tesseract OCR with built-in compilation
//! of Tesseract and Leptonica libraries. This crate aims to make OCR functionality
//! easily accessible in Rust projects while handling the complexity of interfacing
//! with the underlying C++ libraries.
//!
//! ## Usage
//!
//! Here's a basic example of how to use `kreuzberg-tesseract`:
//!
//! ```rust
//! use std::path::PathBuf;
//! use std::error::Error;
//! use kreuzberg_tesseract::TesseractAPI;
//!
//! fn get_default_tessdata_dir() -> PathBuf {
//! if cfg!(target_os = "macos") {
//! let home_dir = std::env::var("HOME").expect("HOME environment variable not set");
//! PathBuf::from(home_dir)
//! .join("Library")
//! .join("Application Support")
//! .join("kreuzberg-tesseract")
//! .join("tessdata")
//! } else if cfg!(target_os = "linux") {
//! let home_dir = std::env::var("HOME").expect("HOME environment variable not set");
//! PathBuf::from(home_dir)
//! .join(".kreuzberg-tesseract")
//! .join("tessdata")
//! } else if cfg!(target_os = "windows") {
//! PathBuf::from(std::env::var("APPDATA").expect("APPDATA environment variable not set"))
//! .join("kreuzberg-tesseract")
//! .join("tessdata")
//! } else {
//! panic!("Unsupported operating system");
//! }
//! }
//!
//! fn get_tessdata_dir() -> PathBuf {
//! match std::env::var("TESSDATA_PREFIX") {
//! Ok(dir) => {
//! let path = PathBuf::from(dir);
//! let path = if path.ends_with("tessdata") { path } else { path.join("tessdata") };
//! println!("Using TESSDATA_PREFIX directory: {:?}", path);
//! path
//! }
//! Err(_) => {
//! let default_dir = get_default_tessdata_dir();
//! println!(
//! "TESSDATA_PREFIX not set, using default directory: {:?}",
//! default_dir
//! );
//! default_dir
//! }
//! }
//! }
//!
//! fn main() -> Result<(), Box<dyn Error>> {
//! let api = TesseractAPI::new()?;
//!
//! // Get tessdata directory (uses default location or TESSDATA_PREFIX if set)
//! let tessdata_dir = get_tessdata_dir();
//! api.init(tessdata_dir.to_str().unwrap(), "eng")?;
//!
//! let width = 24;
//! let height = 24;
//! let bytes_per_pixel = 1;
//! let bytes_per_line = width * bytes_per_pixel;
//!
//! // Initialize image data with all white pixels
//! let mut image_data = vec![255u8; width * height];
//!
//! // Draw number 9 with clearer distinction
//! for y in 4..19 {
//! for x in 7..17 {
//! // Top bar
//! if y == 4 && x >= 8 && x <= 15 {
//! image_data[y * width + x] = 0;
//! }
//! // Top curve left side
//! if y >= 4 && y <= 10 && x == 7 {
//! image_data[y * width + x] = 0;
//! }
//! // Top curve right side
//! if y >= 4 && y <= 11 && x == 16 {
//! image_data[y * width + x] = 0;
//! }
//! // Middle bar
//! if y == 11 && x >= 8 && x <= 15 {
//! image_data[y * width + x] = 0;
//! }
//! // Bottom right vertical line
//! if y >= 11 && y <= 18 && x == 16 {
//! image_data[y * width + x] = 0;
//! }
//! // Bottom bar
//! if y == 18 && x >= 8 && x <= 15 {
//! image_data[y * width + x] = 0;
//! }
//! }
//! }
//!
//! // Set the image data
//! api.set_image(&image_data, width.try_into().unwrap(), height.try_into().unwrap(), bytes_per_pixel.try_into().unwrap(), bytes_per_line.try_into().unwrap())?;
//!
//! // Set whitelist for digits only
//! api.set_variable("tessedit_char_whitelist", "0123456789")?;
//!
//! // Set PSM mode to single character
//! api.set_variable("tessedit_pageseg_mode", "10")?;
//!
//! // Get the recognized text
//! let text = api.get_utf8_text()?;
//! println!("Recognized text: {}", text.trim());
//!
//! Ok(())
//! }
//! ```
/// Declare FFI functions with `extern "C-unwind"` on native targets (to catch
/// C++ exceptions from Tesseract/Leptonica) and `extern "C"` on WASM (where
/// the LLVM backend does not support `cleanupret` / C++ unwinding).
///
/// The macro accepts a list of foreign function signatures, each of the form
/// `[attributes] [visibility] fn name(arg: Type, ...) [-> Ret];`, and emits
/// the same list twice: once inside each `extern` block, with the two blocks
/// gated on mutually exclusive `target_arch` conditions so that exactly one
/// of them is compiled.
macro_rules! ffi_extern {
(
$(
// Zero or more attributes (including doc comments) on the declaration.
$(#[$meta:meta])*
// Optional visibility, name, `ident: type` arguments (trailing comma
// allowed) and an optional return type.
$vis:vis fn $name:ident($($arg:ident : $ty:ty),* $(,)?) $(-> $ret:ty)?;
)*
) => {
// Native targets: foreign unwinding is permitted across the boundary.
#[cfg(not(target_arch = "wasm32"))]
unsafe extern "C-unwind" {
$(
$(#[$meta])*
$vis fn $name($($arg : $ty),*) $(-> $ret)?;
)*
}
// wasm32: plain C ABI.
#[cfg(target_arch = "wasm32")]
unsafe extern "C" {
$(
$(#[$meta])*
$vis fn $name($($arg : $ty),*) $(-> $ret)?;
)*
}
};
}
pub use error::{Result, TesseractError};
mod error;
// WASM: Override __cxa_atexit to be a no-op. WASI SDK's __cxa_atexit calls calloc during
// C++ static initialization, which crashes because dlmalloc's heap isn't properly set up
// for wasm32-unknown-unknown. Since WASM modules never exit normally, atexit handlers
// are unnecessary.
//
// The module is only compiled for wasm32 targets other than Emscripten.
#[cfg(all(target_arch = "wasm32", not(target_os = "emscripten")))]
mod wasm_compat {
/// No-op replacement for the C++ runtime's `__cxa_atexit`.
///
/// All three arguments (the handler, its argument and the DSO handle) are
/// ignored, so the handler is never registered and never runs. The function
/// always returns 0 to report success to the caller.
///
/// # Safety
///
/// The function does not dereference any of its arguments.
/// NOTE(review): it is exported unmangled and is presumably only ever called
/// by the C++ runtime linked into the module — confirm nothing relies on
/// registered handlers actually running.
#[unsafe(no_mangle)]
pub unsafe extern "C" fn __cxa_atexit(
_func: Option<unsafe extern "C" fn(*mut core::ffi::c_void)>,
_arg: *mut core::ffi::c_void,
_dso_handle: *mut core::ffi::c_void,
) -> i32 {
0 // Success, but don't actually register anything
}
}
mod page_iterator;
pub use page_iterator::{BlockInfo, PageIterator, ParaInfo};
mod result_iterator;
pub use result_iterator::{FontAttributes, ResultIterator, WordData};
mod choice_iterator;
pub use choice_iterator::ChoiceIterator;
mod monitor;
pub use monitor::TessMonitor;
mod result_renderer;
pub use result_renderer::TessResultRenderer;
mod mutable_iterator;
pub use mutable_iterator::MutableIterator;
mod enums;
pub use enums::{
TessOrientation, TessPageIteratorLevel, TessPageSegMode, TessParagraphJustification, TessPolyBlockType,
TessTextlineOrder, TessWritingDirection,
};
mod api;
pub use api::{BoundingBoxArray, TesseractAPI};
pub mod leptonica;
#[cfg(any(feature = "build-tesseract", feature = "build-tesseract-wasm"))]
pub use leptonica::Pix;
/// Gives access to the English `eng.traineddata` blob embedded at compile
/// time.
///
/// This variant is compiled when the `bundle-tessdata-eng` feature is
/// enabled and always returns `Some`.
///
/// The bundled data is the `tessdata_fast` variant (~4 MB) downloaded by
/// `build.rs` to `TESSDATA_PREFIX_BUNDLED/tessdata/eng.traineddata`. Embedding
/// it lets WASM builds drive Tesseract OCR without filesystem access or
/// runtime fetches.
#[cfg(feature = "bundle-tessdata-eng")]
pub fn bundled_eng_traineddata() -> Option<&'static [u8]> {
    // The path is resolved at compile time from the environment variable
    // `TESSDATA_PREFIX_BUNDLED`.
    const ENG_TRAINEDDATA: &[u8] = include_bytes!(concat!(
        env!("TESSDATA_PREFIX_BUNDLED"),
        "/tessdata/eng.traineddata"
    ));
    Some(ENG_TRAINEDDATA)
}
/// Fallback used when the `bundle-tessdata-eng` feature is disabled: no
/// trained data is embedded, so this always returns `None`.
#[cfg(not(feature = "bundle-tessdata-eng"))]
pub fn bundled_eng_traineddata() -> Option<&'static [u8]> {
    None
}

View File

@@ -0,0 +1,68 @@
use crate::error::{Result, TesseractError};
use std::os::raw::{c_int, c_void};
use std::sync::{Arc, Mutex};
/// Owning wrapper around a native Tesseract monitor object.
///
/// The raw handle is obtained from `TessMonitorCreate` and released with
/// `TessMonitorDelete` when the wrapper is dropped.
pub struct TessMonitor {
// Raw monitor pointer; every access in this file goes through the mutex.
handle: Arc<Mutex<*mut c_void>>,
}
// SAFETY (NOTE(review)): the raw pointer is only used while the mutex is
// held, which serialises access from Rust. This presumably relies on the
// native monitor being usable from any thread — confirm against Tesseract.
unsafe impl Send for TessMonitor {}
unsafe impl Sync for TessMonitor {}
impl TessMonitor {
    /// Allocates a native monitor via `TessMonitorCreate` and wraps it.
    ///
    /// # Returns
    ///
    /// The newly created `TessMonitor`. The pointer returned by
    /// `TessMonitorCreate` is stored as-is, without a null check.
    pub fn new() -> Self {
        let raw = unsafe { TessMonitorCreate() };
        Self {
            handle: Arc::new(Mutex::new(raw)),
        }
    }

    /// Configures the monitor's deadline.
    ///
    /// # Arguments
    ///
    /// * `deadline` - Deadline in milliseconds.
    ///
    /// # Errors
    ///
    /// Returns a `TesseractError::MutexLockError` if the mutex lock fails.
    pub fn set_deadline(&self, deadline: i32) -> Result<()> {
        let guard = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
        let monitor = *guard;
        // The guard stays alive until the end of the function, so the lock
        // is held for the duration of the native call.
        unsafe { TessMonitorSetDeadlineMSecs(monitor, deadline) };
        Ok(())
    }

    /// Queries the progress value currently reported by the monitor.
    ///
    /// # Returns
    ///
    /// The progress as an `i32`, exactly as returned by
    /// `TessMonitorGetProgress`.
    ///
    /// # Errors
    ///
    /// Returns a `TesseractError::MutexLockError` if the mutex lock fails.
    pub fn get_progress(&self) -> Result<i32> {
        let guard = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
        let progress = unsafe { TessMonitorGetProgress(*guard) };
        Ok(progress)
    }
}
impl Drop for TessMonitor {
    /// Releases the native monitor.
    ///
    /// The monitor is freed even when the mutex is poisoned: a poisoned lock
    /// still guards a live allocation, and skipping the delete would leak it.
    fn drop(&mut self) {
        let mut handle = match self.handle.lock() {
            Ok(guard) => guard,
            // Recover the guard instead of silently leaking the monitor.
            Err(poisoned) => poisoned.into_inner(),
        };
        if !handle.is_null() {
            // SAFETY: the handle is non-null and is released exactly once;
            // it is nulled immediately afterwards so no stale pointer remains.
            unsafe { TessMonitorDelete(*handle) };
            *handle = std::ptr::null_mut();
        }
    }
}
// Foreign declarations for the Tesseract monitor functions used above.
ffi_extern! {
// Creates a monitor and returns its opaque handle.
pub fn TessMonitorCreate() -> *mut c_void;
// Destroys a monitor previously returned by `TessMonitorCreate`.
pub fn TessMonitorDelete(monitor: *mut c_void);
// Sets the monitor deadline, in milliseconds.
pub fn TessMonitorSetDeadlineMSecs(monitor: *mut c_void, deadline: c_int);
// Returns the monitor's current progress value.
pub fn TessMonitorGetProgress(monitor: *mut c_void) -> c_int;
}

View File

@@ -0,0 +1,197 @@
use crate::error::{Result, TesseractError};
use std::ffi::CStr;
use std::os::raw::{c_char, c_void};
use std::sync::Arc;
use std::sync::Mutex;
use crate::result_iterator::{
TessResultIteratorConfidence, TessResultIteratorGetUTF8Text, TessResultIteratorNext,
TessResultIteratorSymbolIsDropcap, TessResultIteratorSymbolIsSubscript, TessResultIteratorSymbolIsSuperscript,
TessResultIteratorWordFontAttributes, TessResultIteratorWordIsFromDictionary, TessResultIteratorWordIsNumeric,
TessResultIteratorWordRecognitionLanguage,
};
/// Owning wrapper around a native Tesseract iterator handle.
///
/// All methods forward to the `TessResultIterator*` C functions, and the
/// handle is released with `TessResultIteratorDelete` on drop.
pub struct MutableIterator {
// Raw iterator pointer; every access in this file goes through the mutex.
handle: Arc<Mutex<*mut c_void>>,
}
// SAFETY (NOTE(review)): the raw pointer is only used while the mutex is
// held, which serialises access from Rust. This presumably relies on the
// native iterator being usable from any thread — confirm against Tesseract.
unsafe impl Send for MutableIterator {}
unsafe impl Sync for MutableIterator {}
impl MutableIterator {
/// Creates a new instance of the MutableIterator.
///
/// # Arguments
///
/// * `handle` - Pointer to the MutableIterator.
pub fn new(handle: *mut c_void) -> Self {
MutableIterator {
handle: Arc::new(Mutex::new(handle)),
}
}
/// Gets the UTF-8 text for the current iterator position.
///
/// # Arguments
///
/// * `level` - Level of the text.
///
/// # Errors
///
/// * `TesseractError::MutexError` if the handle mutex cannot be locked.
/// * `TesseractError::NullPointerError` if Tesseract returns a null string.
/// * The error converted from `Utf8Error` if the returned bytes are not
///   valid UTF-8.
pub fn get_utf8_text(&self, level: i32) -> Result<String> {
    let handle = self.handle.lock().map_err(|_| TesseractError::MutexError)?;
    let text_ptr = unsafe { TessResultIteratorGetUTF8Text(*handle, level) };
    if text_ptr.is_null() {
        return Err(TesseractError::NullPointerError);
    }
    // Copy the text into an owned `String` (or capture the UTF-8 error)
    // *before* releasing the buffer, and only propagate the error afterwards.
    // Propagating with `?` first would skip `TessDeleteText` and leak the
    // Tesseract-allocated string whenever the bytes are not valid UTF-8.
    //
    // SAFETY: `text_ptr` is non-null and points at a NUL-terminated string
    // returned by Tesseract; it stays valid until `TessDeleteText` below.
    let text = unsafe { CStr::from_ptr(text_ptr) }.to_str().map(str::to_owned);
    // SAFETY: the buffer came from Tesseract and is released exactly once,
    // after the last borrow of it has ended.
    unsafe { TessDeleteText(text_ptr as *mut c_char) };
    Ok(text?)
}
/// Reads the confidence value for the current iterator position.
///
/// # Arguments
///
/// * `level` - Level of the confidence.
///
/// # Errors
///
/// Returns `TesseractError::MutexError` if the handle mutex cannot be locked.
pub fn confidence(&self, level: i32) -> Result<f32> {
    let guard = self.handle.lock().map_err(|_| TesseractError::MutexError)?;
    let value = unsafe { TessResultIteratorConfidence(*guard, level) };
    Ok(value)
}
/// Reports the recognition language of the current word.
///
/// # Returns
///
/// The recognition language as an owned `String`. The string returned by
/// Tesseract is copied and is not freed by this method.
///
/// # Errors
///
/// * `TesseractError::MutexError` if the handle mutex cannot be locked.
/// * `TesseractError::NullPointerError` if Tesseract returns a null string.
/// * The error converted from `Utf8Error` if the bytes are not valid UTF-8.
pub fn word_recognition_language(&self) -> Result<String> {
    let guard = self.handle.lock().map_err(|_| TesseractError::MutexError)?;
    let lang_ptr = unsafe { TessResultIteratorWordRecognitionLanguage(*guard) };
    if lang_ptr.is_null() {
        return Err(TesseractError::NullPointerError);
    }
    let language = unsafe { CStr::from_ptr(lang_ptr) }.to_str()?;
    Ok(language.to_owned())
}
/// Retrieves the font attributes of the current word.
///
/// # Returns
///
/// A tuple `(is_bold, is_italic, is_underlined, is_monospace, is_serif,
/// is_smallcaps, pointsize, font_id)`. The six flags are converted to `bool`
/// by comparing the raw values against zero.
///
/// # Errors
///
/// * `TesseractError::MutexError` if the handle mutex cannot be locked.
/// * `TesseractError::InvalidParameterError` if the native call returns 0.
pub fn word_font_attributes(&self) -> Result<(bool, bool, bool, bool, bool, bool, i32, i32)> {
    let guard = self.handle.lock().map_err(|_| TesseractError::MutexError)?;

    // Out-parameters filled in by the native call.
    let mut bold = 0;
    let mut italic = 0;
    let mut underlined = 0;
    let mut monospace = 0;
    let mut serif = 0;
    let mut smallcaps = 0;
    let mut pointsize = 0;
    let mut font_id = 0;

    let status = unsafe {
        TessResultIteratorWordFontAttributes(
            *guard,
            &mut bold,
            &mut italic,
            &mut underlined,
            &mut monospace,
            &mut serif,
            &mut smallcaps,
            &mut pointsize,
            &mut font_id,
        )
    };
    if status == 0 {
        return Err(TesseractError::InvalidParameterError);
    }

    Ok((
        bold != 0,
        italic != 0,
        underlined != 0,
        monospace != 0,
        serif != 0,
        smallcaps != 0,
        pointsize,
        font_id,
    ))
}
/// Tells whether the current word is from the dictionary.
///
/// # Returns
///
/// `Ok(true)` when the native call reports a non-zero value, `Ok(false)`
/// otherwise.
///
/// # Errors
///
/// Returns `TesseractError::MutexError` if the handle mutex cannot be locked.
pub fn word_is_from_dictionary(&self) -> Result<bool> {
    let guard = self.handle.lock().map_err(|_| TesseractError::MutexError)?;
    let flag = unsafe { TessResultIteratorWordIsFromDictionary(*guard) };
    Ok(flag != 0)
}
/// Tells whether the current word is numeric.
///
/// # Returns
///
/// `Ok(true)` when the native call reports a non-zero value, `Ok(false)`
/// otherwise.
///
/// # Errors
///
/// Returns `TesseractError::MutexError` if the handle mutex cannot be locked.
pub fn word_is_numeric(&self) -> Result<bool> {
    let guard = self.handle.lock().map_err(|_| TesseractError::MutexError)?;
    let flag = unsafe { TessResultIteratorWordIsNumeric(*guard) };
    Ok(flag != 0)
}
/// Tells whether the current symbol is superscript.
///
/// # Returns
///
/// `Ok(true)` when the native call reports a non-zero value, `Ok(false)`
/// otherwise.
///
/// # Errors
///
/// Returns `TesseractError::MutexError` if the handle mutex cannot be locked.
pub fn symbol_is_superscript(&self) -> Result<bool> {
    let guard = self.handle.lock().map_err(|_| TesseractError::MutexError)?;
    let flag = unsafe { TessResultIteratorSymbolIsSuperscript(*guard) };
    Ok(flag != 0)
}
/// Tells whether the current symbol is subscript.
///
/// # Returns
///
/// `Ok(true)` when the native call reports a non-zero value, `Ok(false)`
/// otherwise.
///
/// # Errors
///
/// Returns `TesseractError::MutexError` if the handle mutex cannot be locked.
pub fn symbol_is_subscript(&self) -> Result<bool> {
    let guard = self.handle.lock().map_err(|_| TesseractError::MutexError)?;
    let flag = unsafe { TessResultIteratorSymbolIsSubscript(*guard) };
    Ok(flag != 0)
}
/// Tells whether the current symbol is a dropcap.
///
/// # Returns
///
/// `Ok(true)` when the native call reports a non-zero value, `Ok(false)`
/// otherwise.
///
/// # Errors
///
/// Returns `TesseractError::MutexError` if the handle mutex cannot be locked.
pub fn symbol_is_dropcap(&self) -> Result<bool> {
    let guard = self.handle.lock().map_err(|_| TesseractError::MutexError)?;
    let flag = unsafe { TessResultIteratorSymbolIsDropcap(*guard) };
    Ok(flag != 0)
}
/// Advances the iterator to the next element at the given level.
///
/// # Arguments
///
/// * `level` - Level of the iterator.
///
/// # Returns
///
/// `Ok(true)` when the native call reports a non-zero value (the advance
/// succeeded), `Ok(false)` otherwise.
///
/// # Errors
///
/// Returns `TesseractError::MutexError` if the handle mutex cannot be locked.
pub fn next(&self, level: i32) -> Result<bool> {
    let guard = self.handle.lock().map_err(|_| TesseractError::MutexError)?;
    let advanced = unsafe { TessResultIteratorNext(*guard, level) };
    Ok(advanced != 0)
}
}
impl Drop for MutableIterator {
    /// Releases the native iterator.
    ///
    /// The iterator is freed even when the mutex is poisoned: a poisoned lock
    /// still guards a live allocation, and skipping the delete would leak it.
    fn drop(&mut self) {
        let mut handle = match self.handle.lock() {
            Ok(guard) => guard,
            // Recover the guard instead of silently leaking the iterator.
            Err(poisoned) => poisoned.into_inner(),
        };
        if !handle.is_null() {
            // SAFETY: the handle is non-null and is released exactly once;
            // it is nulled immediately afterwards so no stale pointer remains.
            unsafe { TessResultIteratorDelete(*handle) };
            *handle = std::ptr::null_mut();
        }
    }
}
// Foreign declarations used by `MutableIterator` that are not imported from
// `crate::result_iterator`.
ffi_extern! {
// Destroys a native iterator handle.
pub fn TessResultIteratorDelete(handle: *mut c_void);
// Frees a text buffer returned by Tesseract (used after `get_utf8_text`).
pub fn TessDeleteText(text: *mut c_char);
}

View File

@@ -0,0 +1,421 @@
use crate::TesseractError;
use crate::enums::{
TessOrientation, TessPageIteratorLevel, TessParagraphJustification, TessPolyBlockType, TessTextlineOrder,
TessWritingDirection,
};
use crate::error::Result;
use std::os::raw::{c_float, c_int, c_void};
use std::sync::Arc;
use std::sync::Mutex;
/// Block-level layout information from Tesseract.
///
/// Combines a block's type with its bounding box.
#[derive(Debug, Clone)]
pub struct BlockInfo {
/// Type of the block as classified by Tesseract.
pub block_type: TessPolyBlockType,
/// Left edge of the bounding box (presumably image pixel coordinates — confirm).
pub left: i32,
/// Top edge of the bounding box.
pub top: i32,
/// Right edge of the bounding box.
pub right: i32,
/// Bottom edge of the bounding box.
pub bottom: i32,
}
/// Paragraph-level information from Tesseract.
#[derive(Debug, Clone)]
pub struct ParaInfo {
/// Justification of the paragraph.
pub justification: TessParagraphJustification,
/// Whether the paragraph is a list item.
pub is_list_item: bool,
/// Whether the paragraph is flagged as a "crown".
/// NOTE(review): exact meaning comes from Tesseract's paragraph model — confirm.
pub is_crown: bool,
/// Indentation of the paragraph's first line (units not visible here — confirm).
pub first_line_indent: i32,
/// Left edge of the paragraph's bounding box (presumably — confirm against the producer).
pub left: i32,
/// Top edge of the paragraph's bounding box.
pub top: i32,
/// Right edge of the paragraph's bounding box.
pub right: i32,
/// Bottom edge of the paragraph's bounding box.
pub bottom: i32,
}
/// Wrapper around a native Tesseract page-iterator handle.
///
/// Methods lock the mutex and forward to the `TessPageIterator*` C functions.
pub struct PageIterator {
/// Raw iterator pointer, shared behind a mutex. The field is public, so
/// other code may hold and lock the same handle.
pub handle: Arc<Mutex<*mut c_void>>,
}
// SAFETY (NOTE(review)): the methods in this file only use the raw pointer
// while the mutex is held. This presumably relies on the native iterator being
// usable from any thread — confirm against Tesseract.
unsafe impl Send for PageIterator {}
unsafe impl Sync for PageIterator {}
impl PageIterator {
/// Creates a new instance of the PageIterator.
///
/// # Arguments
///
/// * `handle` - Pointer to the PageIterator.
///
/// # Returns
///
/// Returns the new instance of the PageIterator.
pub fn new(handle: *mut c_void) -> Self {
PageIterator {
handle: Arc::new(Mutex::new(handle)),
}
}
/// Resets the iterator to the beginning by calling `TessPageIteratorBegin`.
///
/// # Errors
///
/// Returns `TesseractError::MutexLockError` if the handle mutex cannot be
/// locked.
pub fn begin(&self) -> Result<()> {
    let guard = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
    let iterator = *guard;
    // The guard stays alive until the end of the function, so the lock is
    // held for the duration of the native call.
    unsafe { TessPageIteratorBegin(iterator) };
    Ok(())
}
/// Advances the iterator to the next element at the given level.
///
/// # Arguments
///
/// * `level` - Level of the iterator.
///
/// # Returns
///
/// `Ok(true)` when the native call reports a non-zero value (the advance
/// succeeded), `Ok(false)` otherwise.
///
/// # Errors
///
/// Returns `TesseractError::MutexLockError` if the handle mutex cannot be
/// locked.
pub fn next(&self, level: TessPageIteratorLevel) -> Result<bool> {
    let guard = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
    let advanced = unsafe { TessPageIteratorNext(*guard, level as c_int) };
    Ok(advanced != 0)
}
/// Tells whether the iterator is positioned at the beginning of an element
/// of the specified level.
///
/// # Arguments
///
/// * `level` - Level of the iterator.
///
/// # Returns
///
/// `Ok(true)` if at the beginning, `Ok(false)` otherwise.
///
/// # Errors
///
/// Returns `TesseractError::MutexLockError` if the handle mutex cannot be
/// locked.
pub fn is_at_beginning_of(&self, level: TessPageIteratorLevel) -> Result<bool> {
    let guard = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
    let flag = unsafe { TessPageIteratorIsAtBeginningOf(*guard, level as c_int) };
    Ok(flag != 0)
}
/// Tells whether the iterator is positioned at the final element of the
/// specified level.
///
/// # Arguments
///
/// * `level` - Level of the iterator.
/// * `element` - Element of the iterator.
///
/// # Returns
///
/// `Ok(true)` if at the final element, `Ok(false)` otherwise.
///
/// # Errors
///
/// Returns `TesseractError::MutexLockError` if the handle mutex cannot be
/// locked.
pub fn is_at_final_element(&self, level: TessPageIteratorLevel, element: TessPageIteratorLevel) -> Result<bool> {
    let guard = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
    let flag = unsafe { TessPageIteratorIsAtFinalElement(*guard, level as c_int, element as c_int) };
    Ok(flag != 0)
}
/// Retrieves the bounding box of the current element at the given level.
///
/// # Arguments
///
/// * `level` - Level of the bounding box.
///
/// # Returns
///
/// The box as `(left, top, right, bottom)`.
///
/// # Errors
///
/// * `TesseractError::MutexLockError` if the handle mutex cannot be locked.
/// * `TesseractError::InvalidParameterError` if the native call returns 0.
pub fn bounding_box(&self, level: TessPageIteratorLevel) -> Result<(i32, i32, i32, i32)> {
    let guard = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;

    // Out-parameters filled in by the native call.
    let (mut left, mut top, mut right, mut bottom) = (0, 0, 0, 0);
    let found = unsafe {
        TessPageIteratorBoundingBox(*guard, level as c_int, &mut left, &mut top, &mut right, &mut bottom)
    };
    if found == 0 {
        return Err(TesseractError::InvalidParameterError);
    }
    Ok((left, top, right, bottom))
}
/// Reports the type of the block at the current iterator position.
///
/// # Returns
///
/// The block type, converted from the raw integer with
/// `TessPolyBlockType::from_int`.
///
/// # Errors
///
/// Returns `TesseractError::MutexLockError` if the handle mutex cannot be
/// locked.
pub fn block_type(&self) -> Result<TessPolyBlockType> {
    let guard = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
    let raw_type = unsafe { TessPageIteratorBlockType(*guard) };
    let block_type = TessPolyBlockType::from_int(raw_type);
    Ok(block_type)
}
/// Gets the baseline of the current iterator.
///
/// # Arguments
///
/// * `level` - Level of the baseline.
///
/// # Returns
///
/// Returns the baseline as a tuple if successful, otherwise returns an error.
pub fn baseline(&self, level: i32) -> Result<(i32, i32, i32, i32)> {
let mut x1 = 0;
let mut y1 = 0;
let mut x2 = 0;
let mut y2 = 0;
let handle = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
let result = unsafe { TessPageIteratorBaseline(*handle, level, &mut x1, &mut y1, &mut x2, &mut y2) };
if result == 0 {
Err(TesseractError::InvalidParameterError)
} else {
Ok((x1, y1, x2, y2))
}
}
/// Reads page orientation data at the iterator's current position.
///
/// # Returns
///
/// `Ok((orientation, writing_direction, textline_order, deskew_angle))`, where the three
/// integer outputs are converted through their respective `from_int` constructors and the
/// deskew angle is returned as written by the C call.
///
/// # Errors
///
/// * `TesseractError::MutexLockError` if the handle mutex cannot be locked.
/// * `TesseractError::InvalidParameterError` if the declared integer return value is zero.
pub fn orientation(&self) -> Result<(TessOrientation, TessWritingDirection, TessTextlineOrder, f32)> {
    let guard = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
    let mut raw_orientation = 0;
    let mut raw_direction = 0;
    let mut raw_order = 0;
    let mut deskew_angle = 0.0;
    // SAFETY: every out-pointer references a distinct local that outlives the call, and the
    // iterator pointer is used only while the mutex guard is held. Assumes the pointer
    // refers to a live TessPageIterator — not verified here.
    //
    // NOTE(review): Tesseract's public capi.h appears to declare
    // TessPageIteratorOrientation as returning `void`. If that holds for the linked
    // library, `status` is not a real success flag — confirm against the binding.
    let status = unsafe {
        TessPageIteratorOrientation(
            *guard,
            &mut raw_orientation,
            &mut raw_direction,
            &mut raw_order,
            &mut deskew_angle,
        )
    };
    if status == 0 {
        return Err(TesseractError::InvalidParameterError);
    }
    let orientation = TessOrientation::from_int(raw_orientation);
    let writing_direction = TessWritingDirection::from_int(raw_direction);
    let textline_order = TessTextlineOrder::from_int(raw_order);
    Ok((orientation, writing_direction, textline_order, deskew_angle))
}
/// Collects every block on the page while holding the handle mutex once.
///
/// The iterator is rewound with `TessPageIteratorBegin` and then stepped at `RIL_BLOCK`
/// level. For each position the block type and bounding box are read; positions whose
/// bounding-box call returns zero are skipped.
///
/// # Returns
///
/// `Ok(Vec<BlockInfo>)` with one entry per position that yielded a bounding box.
///
/// # Errors
///
/// `TesseractError::MutexLockError` if the handle mutex cannot be locked.
pub fn extract_all_blocks(&self) -> Result<Vec<BlockInfo>> {
    let guard = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
    let iter_ptr = *guard;
    let block_level = TessPageIteratorLevel::RIL_BLOCK as c_int;
    let mut collected = Vec::new();
    // SAFETY: `guard` stays alive until the function returns, so every call below runs
    // with the mutex held. Assumes `iter_ptr` is a live TessPageIterator — not verified here.
    unsafe { TessPageIteratorBegin(iter_ptr) };
    let mut more = true;
    while more {
        // SAFETY: same pointer and lock as above; the call takes no out-parameters.
        let raw_kind = unsafe { TessPageIteratorBlockType(iter_ptr) };
        let (mut left, mut top, mut right, mut bottom): (c_int, c_int, c_int, c_int) = (0, 0, 0, 0);
        // SAFETY: the four out-pointers reference distinct locals that outlive the call.
        let have_box = unsafe {
            TessPageIteratorBoundingBox(iter_ptr, block_level, &mut left, &mut top, &mut right, &mut bottom)
        } != 0;
        if have_box {
            collected.push(BlockInfo {
                block_type: TessPolyBlockType::from_int(raw_kind),
                left,
                top,
                right,
                bottom,
            });
        }
        // SAFETY: same pointer and lock as above. A zero result ends the traversal.
        more = unsafe { TessPageIteratorNext(iter_ptr, block_level) } != 0;
    }
    Ok(collected)
}
/// Collects every paragraph on the page while holding the handle mutex once.
///
/// The iterator is rewound with `TessPageIteratorBegin` and then stepped at `RIL_PARA`
/// level. For each position the paragraph info and bounding box are read; a position is
/// recorded only when both calls report a non-zero status.
///
/// # Returns
///
/// `Ok(Vec<ParaInfo>)` with one entry per recorded position.
///
/// # Errors
///
/// `TesseractError::MutexLockError` if the handle mutex cannot be locked.
pub fn extract_all_paragraphs(&self) -> Result<Vec<ParaInfo>> {
    let guard = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
    let iter_ptr = *guard;
    let para_level = TessPageIteratorLevel::RIL_PARA as c_int;
    let mut collected = Vec::new();
    // SAFETY: `guard` stays alive until the function returns, so every call below runs
    // with the mutex held. Assumes `iter_ptr` is a live TessPageIterator — not verified here.
    unsafe { TessPageIteratorBegin(iter_ptr) };
    let mut more = true;
    while more {
        // The two flag outputs are declared as `*mut c_int` in the binding, so they are
        // received into `c_int` locals and converted to `bool` afterwards rather than
        // being written through a pointer to a one-byte Rust `bool`.
        let mut raw_justification: c_int = 0;
        let mut list_item_flag: c_int = 0;
        let mut crown_flag: c_int = 0;
        let mut first_line_indent: c_int = 0;
        // SAFETY: every out-pointer references a distinct local that outlives the call.
        //
        // NOTE(review): Tesseract's public capi.h appears to declare
        // TessPageIteratorParagraphInfo as returning `void`. If that holds for the linked
        // library, `have_info` is not a real success flag — confirm against the binding.
        let have_info = unsafe {
            TessPageIteratorParagraphInfo(
                iter_ptr,
                &mut raw_justification,
                &mut list_item_flag,
                &mut crown_flag,
                &mut first_line_indent,
            )
        } != 0;
        let (mut left, mut top, mut right, mut bottom): (c_int, c_int, c_int, c_int) = (0, 0, 0, 0);
        // SAFETY: the four out-pointers reference distinct locals that outlive the call.
        let have_box = unsafe {
            TessPageIteratorBoundingBox(iter_ptr, para_level, &mut left, &mut top, &mut right, &mut bottom)
        } != 0;
        if have_info && have_box {
            collected.push(ParaInfo {
                justification: TessParagraphJustification::from_int(raw_justification),
                is_list_item: list_item_flag != 0,
                is_crown: crown_flag != 0,
                first_line_indent,
                left,
                top,
                right,
                bottom,
            });
        }
        // SAFETY: same pointer and lock as above. A zero result ends the traversal.
        more = unsafe { TessPageIteratorNext(iter_ptr, para_level) } != 0;
    }
    Ok(collected)
}
/// Reads paragraph metadata at the iterator's current position.
///
/// # Returns
///
/// `Ok((justification, is_list_item, is_crown, first_line_indent))`. The justification is
/// converted through `TessParagraphJustification::from_int`; the two flags are `true` when
/// the C call wrote a non-zero integer.
///
/// # Errors
///
/// * `TesseractError::MutexLockError` if the handle mutex cannot be locked.
/// * `TesseractError::InvalidParameterError` if the declared integer return value is zero.
pub fn paragraph_info(&self) -> Result<(TessParagraphJustification, bool, bool, i32)> {
    let guard = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
    // The flag outputs are declared as `*mut c_int` in the binding, so they are received
    // into `c_int` locals and converted afterwards instead of pointing at a Rust `bool`.
    let mut raw_justification = 0;
    let mut list_item_flag: c_int = 0;
    let mut crown_flag: c_int = 0;
    let mut indent = 0;
    // SAFETY: every out-pointer references a distinct local that outlives the call, and the
    // iterator pointer is used only while the mutex guard is held. Assumes the pointer
    // refers to a live TessPageIterator — not verified here.
    //
    // NOTE(review): Tesseract's public capi.h appears to declare
    // TessPageIteratorParagraphInfo as returning `void`. If that holds for the linked
    // library, `status` is not a real success flag — confirm against the binding.
    let status = unsafe {
        TessPageIteratorParagraphInfo(
            *guard,
            &mut raw_justification,
            &mut list_item_flag,
            &mut crown_flag,
            &mut indent,
        )
    };
    if status == 0 {
        return Err(TesseractError::InvalidParameterError);
    }
    let justification = TessParagraphJustification::from_int(raw_justification);
    Ok((justification, list_item_flag != 0, crown_flag != 0, indent))
}
}
impl Drop for PageIterator {
    /// Deletes the underlying Tesseract page iterator.
    ///
    /// If the handle mutex cannot be locked, nothing is deleted and the handle is left
    /// alone, so `drop` itself never panics.
    fn drop(&mut self) {
        if let Ok(guard) = self.handle.lock() {
            let raw = *guard;
            // SAFETY: the pointer is passed to the matching delete function while the
            // mutex is held. Assumes this value is the only owner responsible for freeing
            // the handle — TODO confirm no other holder of the pointer frees it too.
            unsafe { TessPageIteratorDelete(raw) };
        }
    }
}
// Foreign declarations for the Tesseract page-iterator functions called in this file.
// Each function takes an opaque `handle` pointer; callers in this file compare the integer
// return values against zero to decide success.
//
// NOTE(review): Tesseract's public capi.h appears to declare
// `TessPageIteratorOrientation` and `TessPageIteratorParagraphInfo` as returning `void`,
// whereas they are declared `-> c_int` below and their results are treated as status flags
// by the callers. Confirm against the header of the linked Tesseract version; fixing it
// means changing these declarations together with every caller.
ffi_extern! {
// Frees the iterator.
pub fn TessPageIteratorDelete(handle: *mut c_void);
// Rewinds the iterator; used before the bulk extraction loops.
pub fn TessPageIteratorBegin(handle: *mut c_void);
// Advances at `level`; callers treat zero as "no further element".
pub fn TessPageIteratorNext(handle: *mut c_void, level: c_int) -> c_int;
pub fn TessPageIteratorIsAtBeginningOf(handle: *mut c_void, level: c_int) -> c_int;
pub fn TessPageIteratorIsAtFinalElement(handle: *mut c_void, level: c_int, element: c_int) -> c_int;
// Writes the box through the four out-pointers; callers treat zero as failure.
pub fn TessPageIteratorBoundingBox(
handle: *mut c_void,
level: c_int,
left: *mut c_int,
top: *mut c_int,
right: *mut c_int,
bottom: *mut c_int,
) -> c_int;
// Returns an integer that callers convert with `TessPolyBlockType::from_int`.
pub fn TessPageIteratorBlockType(handle: *mut c_void) -> c_int;
// Writes the baseline end points through the out-pointers; callers treat zero as failure.
pub fn TessPageIteratorBaseline(
handle: *mut c_void,
level: c_int,
x1: *mut c_int,
y1: *mut c_int,
x2: *mut c_int,
y2: *mut c_int,
) -> c_int;
// See the NOTE(review) above regarding the declared return type.
pub fn TessPageIteratorOrientation(
handle: *mut c_void,
orientation: *mut c_int,
writing_direction: *mut c_int,
textline_order: *mut c_int,
deskew_angle: *mut c_float,
) -> c_int;
// Takes a handle and returns an opaque pointer; presumably the handle is the Tesseract
// API object and the result a page iterator — verify against the caller.
pub fn TessBaseAPIGetIterator(handle: *mut c_void) -> *mut c_void;
// The two flag outputs are `c_int`-sized; callers pass `c_int` locals, not Rust `bool`s.
// See the NOTE(review) above regarding the declared return type.
pub fn TessPageIteratorParagraphInfo(
handle: *mut c_void,
justification: *mut c_int,
is_list_item: *mut c_int,
is_crown: *mut c_int,
first_line_indent: *mut c_int,
) -> c_int;
}

View File

@@ -0,0 +1,589 @@
use crate::api::TessDeleteText;
use crate::enums::TessPageIteratorLevel;
use crate::error::{Result, TesseractError};
use std::ffi::CStr;
use std::os::raw::{c_char, c_float, c_int, c_void};
use std::sync::{Arc, Mutex};
/// Font attributes detected by Tesseract for a word.
///
/// The boolean fields are `true` when the corresponding integer written by
/// `TessResultIteratorWordFontAttributes` was non-zero.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct FontAttributes {
    /// Word is rendered in a bold face.
    pub is_bold: bool,
    /// Word is rendered in an italic face.
    pub is_italic: bool,
    /// Word is underlined.
    pub is_underlined: bool,
    /// Word uses a monospace font.
    pub is_monospace: bool,
    /// Word uses a serif font.
    pub is_serif: bool,
    /// Word is set in small caps.
    pub is_smallcaps: bool,
    /// Point size as written by Tesseract.
    pub pointsize: i32,
    /// Font identifier as written by Tesseract.
    pub font_id: i32,
}
/// Complete word data extracted in a single mutex lock.
///
/// Implements `PartialEq` so extraction results can be compared directly; note that
/// `confidence` is an `f32`, so a NaN confidence never compares equal.
#[derive(Debug, Clone, PartialEq)]
pub struct WordData {
    /// UTF-8 text of the word.
    pub text: String,
    /// Left edge of the word's bounding box (presumably image pixels — confirm).
    pub left: i32,
    /// Top edge of the word's bounding box.
    pub top: i32,
    /// Right edge of the word's bounding box.
    pub right: i32,
    /// Bottom edge of the word's bounding box.
    pub bottom: i32,
    /// Confidence value returned by `TessResultIteratorConfidence` for the word.
    pub confidence: f32,
    /// Font attributes, or `None` when the font-attribute call reported failure.
    pub font_attrs: Option<FontAttributes>,
}
/// Wrapper around a raw Tesseract result-iterator pointer.
///
/// The pointer is stored behind `Arc<Mutex<..>>`; the methods of this type lock the mutex
/// before passing the pointer to the C API, and `Drop` deletes the iterator.
pub struct ResultIterator {
/// Shared, mutex-guarded raw iterator pointer. The field is public, so code outside this
/// type can clone the `Arc` and reach the pointer — NOTE(review): confirm such users
/// always go through the lock and never outlive or free the handle themselves.
pub handle: Arc<Mutex<*mut c_void>>,
}
// SAFETY (NOTE(review)): a raw pointer is neither `Send` nor `Sync`, so these impls are an
// explicit promise. They rely on every access in this file going through the mutex and,
// presumably, on the Tesseract iterator being usable from any thread as long as calls are
// serialised — TODO confirm against Tesseract's threading guarantees.
unsafe impl Send for ResultIterator {}
unsafe impl Sync for ResultIterator {}
impl ResultIterator {
/// Creates a new instance of the ResultIterator.
///
/// # Arguments
///
/// * `handle` - Pointer to the ResultIterator.
///
/// # Returns
///
/// Returns the new instance of the ResultIterator.
pub fn new(handle: *mut c_void) -> Self {
ResultIterator {
handle: Arc::new(Mutex::new(handle)),
}
}
/// Gets the UTF-8 text of the element at the iterator's current position.
///
/// The C string returned by Tesseract is copied into an owned `String` and then released
/// with `TessDeleteText`. The buffer is released on every path, including when the bytes
/// are not valid UTF-8.
///
/// # Arguments
///
/// * `level` - Page-iterator level of the text to read.
///
/// # Returns
///
/// The text as an owned `String`.
///
/// # Errors
///
/// * `TesseractError::MutexLockError` if the handle mutex cannot be locked.
/// * `TesseractError::NullPointerError` if Tesseract returns a null text pointer.
/// * A UTF-8 conversion error if the returned bytes are not valid UTF-8.
pub fn get_utf8_text(&self, level: TessPageIteratorLevel) -> Result<String> {
    let handle = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
    // SAFETY: the iterator pointer is used only while the mutex guard is held. Assumes it
    // refers to a live result iterator — not verified here.
    let text_ptr = unsafe { TessResultIteratorGetUTF8Text(*handle, level as c_int) };
    if text_ptr.is_null() {
        return Err(TesseractError::NullPointerError);
    }
    // Decode into an owned value *before* propagating any error: returning early with `?`
    // at this point would skip `TessDeleteText` and leak the Tesseract-owned buffer.
    // SAFETY: `text_ptr` is non-null and is expected to point at a NUL-terminated string
    // allocated by Tesseract; it stays valid until `TessDeleteText` below.
    let decoded = unsafe { CStr::from_ptr(text_ptr) }.to_str().map(str::to_owned);
    // SAFETY: `text_ptr` came from `TessResultIteratorGetUTF8Text`, is released exactly
    // once here, and is not touched afterwards (`decoded` owns its own copy).
    unsafe { TessDeleteText(text_ptr as *mut c_char) };
    Ok(decoded?)
}
/// Reads the confidence for the element at the iterator's current position.
///
/// # Arguments
///
/// * `level` - Page-iterator level the confidence refers to.
///
/// # Returns
///
/// `Ok` with the `f32` returned by `TessResultIteratorConfidence`.
///
/// # Errors
///
/// `TesseractError::MutexLockError` if the handle mutex cannot be locked.
pub fn confidence(&self, level: TessPageIteratorLevel) -> Result<f32> {
    let guard = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
    // SAFETY: the pointer is used only while the mutex guard is held, and the call takes no
    // out-parameters. Assumes the pointer refers to a live result iterator — not verified.
    let score = unsafe { TessResultIteratorConfidence(*guard, level as c_int) };
    Ok(score)
}
/// Reads the recognition language of the current word.
///
/// The string returned by Tesseract is copied; it is not freed by this method.
///
/// # Returns
///
/// The language as an owned `String`.
///
/// # Errors
///
/// * `TesseractError::MutexLockError` if the handle mutex cannot be locked.
/// * `TesseractError::NullPointerError` if Tesseract returns a null pointer.
/// * A UTF-8 conversion error if the returned bytes are not valid UTF-8.
pub fn word_recognition_language(&self) -> Result<String> {
    let guard = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
    // SAFETY: the pointer is used only while the mutex guard is held. Assumes it refers to
    // a live result iterator — not verified here.
    let lang_ptr = unsafe { TessResultIteratorWordRecognitionLanguage(*guard) };
    if lang_ptr.is_null() {
        return Err(TesseractError::NullPointerError);
    }
    // SAFETY: `lang_ptr` is non-null and is expected to point at a NUL-terminated string
    // owned by Tesseract; it is only read, and copied while the lock is still held.
    let language = unsafe { CStr::from_ptr(lang_ptr) }.to_str()?;
    Ok(language.to_owned())
}
/// Gets the font attributes of the current iterator.
///
/// # Returns
///
/// Returns the font attributes as a tuple if successful, otherwise returns an error.
pub fn word_font_attributes(&self) -> Result<(bool, bool, bool, bool, bool, bool, i32, i32)> {
let handle = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
let mut is_bold = 0;
let mut is_italic = 0;
let mut is_underlined = 0;
let mut is_monospace = 0;
let mut is_serif = 0;
let mut is_smallcaps = 0;
let mut pointsize = 0;
let mut font_id = 0;
// SAFETY: TessResultIteratorWordFontAttributes() takes output parameter pointers
// and fills them with font attribute values. This is safe because:
// 1. *handle is a valid pointer to an initialized ResultIterator (mutex-guarded)
// 2. All mutable references (&mut ...) are valid local stack variables
// 3. Each reference has a distinct memory location (no aliasing)
// 4. The references outlive the FFI call (defined on stack, used immediately after)
// 5. The function writes output i32 values (0/1 for bools, integers for size/id)
// 6. Each reference has exclusive mutable access (Rust borrow checker enforces this)
// 7. The output parameters are independent (function cannot cause data races)
let result = unsafe {
TessResultIteratorWordFontAttributes(
*handle,
&mut is_bold,
&mut is_italic,
&mut is_underlined,
&mut is_monospace,
&mut is_serif,
&mut is_smallcaps,
&mut pointsize,
&mut font_id,
)
};
if result == 0 {
Err(TesseractError::InvalidParameterError)
} else {
Ok((
is_bold != 0,
is_italic != 0,
is_underlined != 0,
is_monospace != 0,
is_serif != 0,
is_smallcaps != 0,
pointsize,
font_id,
))
}
}
/// Reports whether Tesseract flags the current word as coming from its dictionary.
///
/// # Returns
///
/// `Ok(true)` when `TessResultIteratorWordIsFromDictionary` returns non-zero,
/// `Ok(false)` otherwise.
///
/// # Errors
///
/// `TesseractError::MutexLockError` if the handle mutex cannot be locked.
pub fn word_is_from_dictionary(&self) -> Result<bool> {
    let guard = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
    // SAFETY: the pointer is used only while the mutex guard is held, and the call takes no
    // out-parameters. Assumes the pointer refers to a live result iterator — not verified.
    let flag = unsafe { TessResultIteratorWordIsFromDictionary(*guard) };
    Ok(flag != 0)
}
/// Reports whether Tesseract flags the current word as numeric.
///
/// # Returns
///
/// `Ok(true)` when `TessResultIteratorWordIsNumeric` returns non-zero, `Ok(false)` otherwise.
///
/// # Errors
///
/// `TesseractError::MutexLockError` if the handle mutex cannot be locked.
pub fn word_is_numeric(&self) -> Result<bool> {
    let guard = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
    // SAFETY: the pointer is used only while the mutex guard is held, and the call takes no
    // out-parameters. Assumes the pointer refers to a live result iterator — not verified.
    let flag = unsafe { TessResultIteratorWordIsNumeric(*guard) };
    Ok(flag != 0)
}
/// Reports whether Tesseract flags the current symbol as superscript.
///
/// # Returns
///
/// `Ok(true)` when `TessResultIteratorSymbolIsSuperscript` returns non-zero,
/// `Ok(false)` otherwise.
///
/// # Errors
///
/// `TesseractError::MutexLockError` if the handle mutex cannot be locked.
pub fn symbol_is_superscript(&self) -> Result<bool> {
    let guard = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
    // SAFETY: the pointer is used only while the mutex guard is held, and the call takes no
    // out-parameters. Assumes the pointer refers to a live result iterator — not verified.
    let flag = unsafe { TessResultIteratorSymbolIsSuperscript(*guard) };
    Ok(flag != 0)
}
/// Reports whether Tesseract flags the current symbol as subscript.
///
/// # Returns
///
/// `Ok(true)` when `TessResultIteratorSymbolIsSubscript` returns non-zero,
/// `Ok(false)` otherwise.
///
/// # Errors
///
/// `TesseractError::MutexLockError` if the handle mutex cannot be locked.
pub fn symbol_is_subscript(&self) -> Result<bool> {
    let guard = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
    // SAFETY: the pointer is used only while the mutex guard is held, and the call takes no
    // out-parameters. Assumes the pointer refers to a live result iterator — not verified.
    let flag = unsafe { TessResultIteratorSymbolIsSubscript(*guard) };
    Ok(flag != 0)
}
/// Reports whether Tesseract flags the current symbol as a drop cap.
///
/// # Returns
///
/// `Ok(true)` when `TessResultIteratorSymbolIsDropcap` returns non-zero,
/// `Ok(false)` otherwise.
///
/// # Errors
///
/// `TesseractError::MutexLockError` if the handle mutex cannot be locked.
pub fn symbol_is_dropcap(&self) -> Result<bool> {
    let guard = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
    // SAFETY: the pointer is used only while the mutex guard is held, and the call takes no
    // out-parameters. Assumes the pointer refers to a live result iterator — not verified.
    let flag = unsafe { TessResultIteratorSymbolIsDropcap(*guard) };
    Ok(flag != 0)
}
/// Advances the iterator to the next element at `level`.
///
/// # Arguments
///
/// * `level` - Page-iterator level to step by.
///
/// # Returns
///
/// `Ok(true)` when `TessResultIteratorNext` returns non-zero, `Ok(false)` when it returns
/// zero.
///
/// # Errors
///
/// `TesseractError::MutexLockError` if the handle mutex cannot be locked.
pub fn next(&self, level: TessPageIteratorLevel) -> Result<bool> {
    let guard = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
    // SAFETY: the call changes the iterator's position, which is why it runs while the
    // mutex guard is held. Assumes the pointer refers to a live result iterator — not
    // verified here.
    let advanced = unsafe { TessResultIteratorNext(*guard, level as c_int) };
    Ok(advanced != 0)
}
/// Gets the current word from the iterator with its bounding box and confidence.
///
/// # Returns
///
/// Returns a tuple of (text, left, top, right, bottom, confidence) if successful
pub fn get_word_with_bounds(&self) -> Result<(String, i32, i32, i32, i32, f32)> {
let text = self.get_utf8_text(TessPageIteratorLevel::RIL_WORD)?;
let (left, top, right, bottom) = self.get_bounding_box(TessPageIteratorLevel::RIL_WORD)?;
let confidence = self.confidence(TessPageIteratorLevel::RIL_WORD)?;
Ok((text, left, top, right, bottom, confidence))
}
/// Steps the iterator forward by one word.
///
/// # Returns
///
/// `Ok(true)` if the iterator moved to another word, `Ok(false)` if there are no more
/// words; errors are those of `next`.
pub fn next_word(&self) -> Result<bool> {
    let word_level = TessPageIteratorLevel::RIL_WORD;
    self.next(word_level)
}
/// Gets the word information for the current position in the iterator.
/// Should be called before next() to ensure valid data.
///
/// # Returns
/// Returns a tuple of (text, left, top, right, bottom, confidence) if successful
pub fn get_current_word(&self) -> Result<(String, i32, i32, i32, i32, f32)> {
let text = self.get_utf8_text(TessPageIteratorLevel::RIL_WORD)?;
let (left, top, right, bottom) = self.get_bounding_box(TessPageIteratorLevel::RIL_WORD)?;
let confidence = self.confidence(TessPageIteratorLevel::RIL_WORD)?;
Ok((text, left, top, right, bottom, confidence))
}
/// Reads the bounding box of the element at the iterator's current position.
///
/// # Arguments
///
/// * `level` - Page-iterator level whose box is requested.
///
/// # Returns
///
/// `Ok((left, top, right, bottom))` when the C call reports a non-zero status.
///
/// # Errors
///
/// * `TesseractError::MutexLockError` if the handle mutex cannot be locked.
/// * `TesseractError::InvalidParameterError` if the C call returns zero.
pub fn get_bounding_box(&self, level: TessPageIteratorLevel) -> Result<(i32, i32, i32, i32)> {
    let guard = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
    let (mut left, mut top, mut right, mut bottom): (c_int, c_int, c_int, c_int) = (0, 0, 0, 0);
    // SAFETY: the result-iterator pointer is handed to the page-iterator bounding-box
    // function while the mutex guard is held; the four out-pointers reference distinct
    // locals that outlive the call and are read only after it returns. Assumes the
    // pointer refers to a live iterator — not verified here.
    let status = unsafe {
        TessPageIteratorBoundingBox(*guard, level as c_int, &mut left, &mut top, &mut right, &mut bottom)
    };
    if status != 0 {
        Ok((left, top, right, bottom))
    } else {
        Err(TesseractError::InvalidParameterError)
    }
}
/// Collects the data of every word while holding the handle mutex once.
///
/// The iterator is rewound with `TessPageIteratorBegin` first, so words consumed by
/// earlier calls are visited again. Each position is read through
/// `extract_word_data_unlocked` and the iterator is then stepped at `RIL_WORD` level.
///
/// Positions that fail with `NullPointerError`, `InvalidParameterError` or `Utf8Error`
/// are skipped so that one bad word does not discard the rest.
///
/// # Returns
///
/// `Ok(Vec<WordData>)` with one entry per successfully read word.
///
/// # Errors
///
/// `TesseractError::MutexLockError` if the handle mutex cannot be locked, or any error
/// from the per-word extraction other than the three skipped kinds.
pub fn extract_all_words(&self) -> Result<Vec<WordData>> {
    let guard = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
    let iter_ptr = *guard;
    let word_level = TessPageIteratorLevel::RIL_WORD as c_int;
    let mut collected = Vec::new();
    // SAFETY: `guard` stays alive until the function returns, so every call below runs
    // with the mutex held. The result-iterator pointer is passed to the page-iterator
    // `Begin` function to rewind it. Assumes `iter_ptr` is a live iterator — not verified.
    unsafe { TessPageIteratorBegin(iter_ptr) };
    let mut more = true;
    while more {
        // The helper does not lock; the lock held by `guard` covers it.
        match extract_word_data_unlocked(iter_ptr) {
            Ok(word) => collected.push(word),
            Err(err) => {
                let skippable = matches!(
                    err,
                    TesseractError::NullPointerError
                        | TesseractError::InvalidParameterError
                        | TesseractError::Utf8Error(_)
                );
                if !skippable {
                    return Err(err);
                }
            }
        }
        // SAFETY: same pointer and lock as above. A zero result ends the traversal.
        more = unsafe { TessResultIteratorNext(iter_ptr, word_level) } != 0;
    }
    Ok(collected)
}
/// Reads text, bounding box, confidence and font attributes of the current word under a
/// single mutex acquisition.
///
/// The lock is taken once and the raw pointer is handed to `extract_word_data_unlocked`,
/// which performs all FFI calls.
///
/// # Returns
///
/// A [`WordData`] for the current word.
///
/// # Errors
///
/// `TesseractError::MutexLockError` if the handle mutex cannot be locked, or any error
/// returned by the extraction helper.
pub fn extract_word_data(&self) -> Result<WordData> {
    let guard = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
    let iter_ptr = *guard;
    // `guard` lives until the end of the function, so the helper runs under the lock.
    extract_word_data_unlocked(iter_ptr)
}
}
/// Extracts word data from a raw iterator handle without acquiring the mutex.
///
/// Reads, in order: the word text, its bounding box, its confidence and its font
/// attributes. A failed font-attribute query is not an error; it yields
/// `font_attrs: None`.
///
/// The text buffer returned by Tesseract is released with `TessDeleteText` on every path,
/// including when the bytes are not valid UTF-8.
///
/// The caller MUST hold the mutex lock for the `ResultIterator` this handle belongs to
/// before calling this function. Passing a handle that is not mutex-guarded, or calling
/// this function concurrently on the same handle, is undefined behaviour.
///
/// # Errors
///
/// * `TesseractError::NullPointerError` if Tesseract returns a null text pointer.
/// * A UTF-8 conversion error if the text is not valid UTF-8.
/// * `TesseractError::InvalidParameterError` if the bounding-box call returns zero.
fn extract_word_data_unlocked(raw: *mut c_void) -> Result<WordData> {
    let word_level = TessPageIteratorLevel::RIL_WORD as c_int;
    // SAFETY: the caller holds the mutex for `raw` (see the contract above). Assumes `raw`
    // refers to a live result iterator — not verified here.
    let text_ptr = unsafe { TessResultIteratorGetUTF8Text(raw, word_level) };
    if text_ptr.is_null() {
        return Err(TesseractError::NullPointerError);
    }
    // Decode into an owned value *before* propagating any error: returning early with `?`
    // at this point would skip `TessDeleteText` and leak the Tesseract-owned buffer.
    // SAFETY: `text_ptr` is non-null and is expected to point at a NUL-terminated string
    // allocated by Tesseract; it stays valid until `TessDeleteText` below.
    let decoded = unsafe { CStr::from_ptr(text_ptr) }.to_str().map(str::to_owned);
    // SAFETY: `text_ptr` came from `TessResultIteratorGetUTF8Text`, is released exactly
    // once here, and is not touched afterwards (`decoded` owns its own copy).
    unsafe { TessDeleteText(text_ptr as *mut c_char) };
    let text = decoded?;
    let mut left = 0;
    let mut top = 0;
    let mut right = 0;
    let mut bottom = 0;
    // SAFETY: the four out-pointers reference distinct locals that outlive the call; the
    // caller holds the mutex for `raw`.
    let bbox_result = unsafe {
        TessPageIteratorBoundingBox(raw, word_level, &mut left, &mut top, &mut right, &mut bottom)
    };
    if bbox_result == 0 {
        return Err(TesseractError::InvalidParameterError);
    }
    // SAFETY: the call takes no out-parameters; the caller holds the mutex for `raw`.
    let confidence = unsafe { TessResultIteratorConfidence(raw, word_level) };
    // Collect font attributes; treat any failure as absent rather than propagating the error.
    let font_attrs = {
        let mut is_bold = 0;
        let mut is_italic = 0;
        let mut is_underlined = 0;
        let mut is_monospace = 0;
        let mut is_serif = 0;
        let mut is_smallcaps = 0;
        let mut pointsize = 0;
        let mut font_id = 0;
        // SAFETY: all eight out-pointers reference distinct locals that outlive the call;
        // the caller holds the mutex for `raw`.
        let result = unsafe {
            TessResultIteratorWordFontAttributes(
                raw,
                &mut is_bold,
                &mut is_italic,
                &mut is_underlined,
                &mut is_monospace,
                &mut is_serif,
                &mut is_smallcaps,
                &mut pointsize,
                &mut font_id,
            )
        };
        if result != 0 {
            Some(FontAttributes {
                is_bold: is_bold != 0,
                is_italic: is_italic != 0,
                is_underlined: is_underlined != 0,
                is_monospace: is_monospace != 0,
                is_serif: is_serif != 0,
                is_smallcaps: is_smallcaps != 0,
                pointsize,
                font_id,
            })
        } else {
            None
        }
    };
    Ok(WordData {
        text,
        left,
        top,
        right,
        bottom,
        confidence,
        font_attrs,
    })
}
impl Drop for ResultIterator {
    /// Deletes the underlying Tesseract result iterator.
    ///
    /// The delete call is made only when the handle mutex can be locked. If locking fails
    /// (the mutex is poisoned), the handle is deliberately left alone so that `drop` never
    /// panics; the iterator is then not freed by this value.
    fn drop(&mut self) {
        if let Ok(guard) = self.handle.lock() {
            let raw = *guard;
            // SAFETY: the pointer is passed to the matching delete function while the
            // mutex is held and is not used again by this value afterwards. Assumes this
            // value is the only owner responsible for freeing the handle — TODO confirm
            // that no other holder of the shared `Arc` frees or keeps using the pointer.
            unsafe { TessResultIteratorDelete(raw) };
        }
    }
}
// Foreign declarations for the Tesseract result-iterator functions called in this file.
// The whole block is compiled only when the `build-tesseract` or `build-tesseract-wasm`
// feature is enabled. `TessPageIteratorBegin` and `TessPageIteratorBoundingBox` are
// page-iterator functions that this file calls with a result-iterator handle.
//
// NOTE(review): Tesseract's public capi.h appears to declare
// `TessResultIteratorWordFontAttributes` as returning `const char *` (the font name),
// whereas it is declared `-> c_int` below and its result is compared against zero by the
// callers. Confirm against the header of the linked Tesseract version; fixing it means
// changing this declaration together with every caller.
#[cfg(any(feature = "build-tesseract", feature = "build-tesseract-wasm"))]
ffi_extern! {
// Frees the iterator.
pub fn TessResultIteratorDelete(handle: *mut c_void);
// Rewinds the iterator; used before bulk word extraction.
pub fn TessPageIteratorBegin(handle: *mut c_void);
// Returns a string that callers release with `TessDeleteText`; may be null.
pub fn TessResultIteratorGetUTF8Text(handle: *mut c_void, level: c_int) -> *mut c_char;
pub fn TessResultIteratorConfidence(handle: *mut c_void, level: c_int) -> c_float;
// Returns a string that callers copy but do not free; may be null.
pub fn TessResultIteratorWordRecognitionLanguage(handle: *mut c_void) -> *const c_char;
// The six flag outputs are `c_int`-sized. See the NOTE(review) above regarding the
// declared return type.
pub fn TessResultIteratorWordFontAttributes(
handle: *mut c_void,
is_bold: *mut c_int,
is_italic: *mut c_int,
is_underlined: *mut c_int,
is_monospace: *mut c_int,
is_serif: *mut c_int,
is_smallcaps: *mut c_int,
pointsize: *mut c_int,
font_id: *mut c_int,
) -> c_int;
// The predicates below return an integer that callers compare against zero.
pub fn TessResultIteratorWordIsFromDictionary(handle: *mut c_void) -> c_int;
pub fn TessResultIteratorWordIsNumeric(handle: *mut c_void) -> c_int;
pub fn TessResultIteratorSymbolIsSuperscript(handle: *mut c_void) -> c_int;
pub fn TessResultIteratorSymbolIsSubscript(handle: *mut c_void) -> c_int;
pub fn TessResultIteratorSymbolIsDropcap(handle: *mut c_void) -> c_int;
// Advances at `level`; callers treat zero as "no further element".
pub fn TessResultIteratorNext(handle: *mut c_void, level: c_int) -> c_int;
// Writes the box through the four out-pointers; callers treat zero as failure.
pub fn TessPageIteratorBoundingBox(
handle: *mut c_void,
level: c_int,
left: *mut c_int,
top: *mut c_int,
right: *mut c_int,
bottom: *mut c_int,
) -> c_int;
}

View File

@@ -0,0 +1,212 @@
use crate::TesseractAPI;
use crate::error::{Result, TesseractError};
use std::ffi::{CStr, CString};
use std::os::raw::{c_char, c_int, c_void};
use std::sync::Arc;
use std::sync::Mutex;
/// Wrapper around a raw Tesseract result-renderer pointer.
///
/// Instances are created by the `new_*_renderer` constructors, which reject a null pointer
/// returned by the C API.
pub struct TessResultRenderer {
/// Shared, mutex-guarded raw renderer pointer.
handle: Arc<Mutex<*mut c_void>>,
}
// SAFETY (NOTE(review)): a raw pointer is neither `Send` nor `Sync`, so these impls are an
// explicit promise. They presumably rely on all access going through the mutex and on the
// Tesseract renderer being usable from any thread when calls are serialised — TODO confirm.
unsafe impl Send for TessResultRenderer {}
unsafe impl Sync for TessResultRenderer {}
impl TessResultRenderer {
/// Creates a plain-text renderer.
///
/// # Arguments
///
/// * `outputbase` - Output base path handed to `TessTextRendererCreate`.
///
/// # Returns
///
/// A `TessResultRenderer` wrapping the pointer returned by the C API.
///
/// # Errors
///
/// * `TesseractError::NullByteInString` if `outputbase` contains an interior NUL byte.
/// * `TesseractError::NullPointerError` if the C API returns a null renderer.
pub fn new_text_renderer(outputbase: &str) -> Result<Self> {
    let c_outputbase = CString::new(outputbase).map_err(|_| TesseractError::NullByteInString)?;
    // SAFETY: `c_outputbase` is a NUL-terminated string that stays alive for the whole call.
    let raw = unsafe { TessTextRendererCreate(c_outputbase.as_ptr()) };
    if raw.is_null() {
        return Err(TesseractError::NullPointerError);
    }
    Ok(TessResultRenderer {
        handle: Arc::new(Mutex::new(raw)),
    })
}
/// Creates a new instance of the TessResultRenderer for HOCR.
///
/// # Arguments
///
/// * `outputbase` - Output base path.
///
/// # Returns
///
/// Returns the new instance of the TessResultRenderer.
pub fn new_hocr_renderer(outputbase: &str) -> Result<Self> {
let outputbase = CString::new(outputbase).map_err(|_| TesseractError::NullByteInString)?;
let handle = unsafe { TessHOcrRendererCreate(outputbase.as_ptr()) };
if handle.is_null() {
Err(TesseractError::NullPointerError)
} else {
Ok(TessResultRenderer {
handle: Arc::new(Mutex::new(handle)),
})
}
}
/// Creates a new instance of the TessResultRenderer for PDF.
///
/// # Arguments
///
/// * `outputbase` - Output base path.
/// * `datadir` - Data directory path.
/// * `textonly` - Whether to include text only.
///
/// # Returns
///
/// Returns the new instance of the TessResultRenderer.
pub fn new_pdf_renderer(outputbase: &str, datadir: &str, textonly: bool) -> Result<Self> {
let outputbase = CString::new(outputbase).map_err(|_| TesseractError::NullByteInString)?;
let datadir = CString::new(datadir).map_err(|_| TesseractError::NullByteInString)?;
let handle = unsafe { TessPDFRendererCreate(outputbase.as_ptr(), datadir.as_ptr(), textonly as c_int) };
if handle.is_null() {
Err(TesseractError::NullPointerError)
} else {
Ok(TessResultRenderer {
handle: Arc::new(Mutex::new(handle)),
})
}
}
/// Begins a new document.
///
/// # Arguments
///
/// * `title` - Title of the document.
///
/// # Returns
///
/// Returns `true` if the document was created successfully, otherwise returns `false`.
///
/// # Errors
///
/// Returns a `TesseractError` if the string contains a null byte or if the mutex lock fails.
pub fn begin_document(&self, title: &str) -> Result<bool> {
let title = CString::new(title).map_err(|_| TesseractError::NullByteInString)?;
let handle = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
Ok(unsafe { TessResultRendererBeginDocument(*handle, title.as_ptr()) != 0 })
}
/// Adds an image to the document.
///
/// # Arguments
///
/// * `api` - The TesseractAPI instance.
///
/// # Returns
///
/// Returns `true` if the image was added successfully, otherwise returns `false`.
///
/// # Errors
///
/// Returns a `TesseractError::MutexLockError` if either mutex lock fails.
pub fn add_image(&self, api: &TesseractAPI) -> Result<bool> {
let api_handle = api.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
let handle = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
Ok(unsafe { TessResultRendererAddImage(*handle, *api_handle) != 0 })
}
/// Ends the document.
///
/// # Returns
///
/// Returns `true` if the document was ended successfully, otherwise returns `false`.
///
/// # Errors
///
/// Returns a `TesseractError::MutexLockError` if the mutex lock fails.
pub fn end_document(&self) -> Result<bool> {
let handle = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
Ok(unsafe { TessResultRendererEndDocument(*handle) != 0 })
}
/// Gets the extension of the document.
///
/// # Returns
///
/// Returns the extension as a `String` if successful, otherwise returns an error.
///
/// # Errors
///
/// Returns a `TesseractError::MutexLockError` if the mutex lock fails,
/// `TesseractError::NullPointerError` if the extension pointer is null,
/// or `TesseractError::Utf8Error` if the extension contains invalid UTF-8.
pub fn get_extension(&self) -> Result<String> {
let handle = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
let ext_ptr = unsafe { TessResultRendererExtention(*handle) };
if ext_ptr.is_null() {
Err(TesseractError::NullPointerError)
} else {
let c_str = unsafe { CStr::from_ptr(ext_ptr) };
Ok(c_str.to_str()?.to_owned())
}
}
/// Gets the title of the document.
///
/// # Returns
///
/// Returns the title as a `String` if successful, otherwise returns an error.
///
/// # Errors
///
/// Returns a `TesseractError::MutexLockError` if the mutex lock fails,
/// `TesseractError::NullPointerError` if the title pointer is null,
/// or `TesseractError::Utf8Error` if the title contains invalid UTF-8.
pub fn get_title(&self) -> Result<String> {
let handle = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
let title_ptr = unsafe { TessResultRendererTitle(*handle) };
if title_ptr.is_null() {
Err(TesseractError::NullPointerError)
} else {
let c_str = unsafe { CStr::from_ptr(title_ptr) };
Ok(c_str.to_str()?.to_owned())
}
}
/// Gets the number of images in the document.
///
/// # Returns
///
/// Returns the number of images as an `i32`.
///
/// # Errors
///
/// Returns a `TesseractError::MutexLockError` if the mutex lock fails.
pub fn get_image_num(&self) -> Result<i32> {
let handle = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
Ok(unsafe { TessResultRendererImageNum(*handle) })
}
}
impl Drop for TessResultRenderer {
    /// Releases the underlying renderer through `TessDeleteResultRenderer`.
    ///
    /// If the mutex is poisoned the delete call is skipped, so dropping never
    /// panics.
    fn drop(&mut self) {
        match self.handle.lock() {
            Ok(renderer) => {
                // SAFETY: the pointer was non-null when stored by a constructor and
                // is not used again after this call because `self` is being dropped.
                unsafe { TessDeleteResultRenderer(*renderer) }
            }
            // Poisoned lock: leave the handle alone instead of panicking in drop.
            Err(_) => {}
        }
    }
}
// Raw declarations of the Tesseract C-API result-renderer functions wrapped by
// `TessResultRenderer`. Renderer and API handles are passed as untyped
// `*mut c_void` pointers.
ffi_extern! {
// Constructors. The wrapper treats a null return as failure.
pub fn TessTextRendererCreate(outputbase: *const c_char) -> *mut c_void;
pub fn TessHOcrRendererCreate(outputbase: *const c_char) -> *mut c_void;
// `textonly` is a C int flag; the wrapper passes a Rust `bool` converted to 0/1.
pub fn TessPDFRendererCreate(outputbase: *const c_char, datadir: *const c_char, textonly: c_int) -> *mut c_void;
// Destroys the renderer; called from the wrapper's `Drop` impl.
pub fn TessDeleteResultRenderer(renderer: *mut c_void);
// Document lifecycle calls. The wrapper methods treat a non-zero return as success.
pub fn TessResultRendererBeginDocument(renderer: *mut c_void, title: *const c_char) -> c_int;
pub fn TessResultRendererAddImage(renderer: *mut c_void, api: *mut c_void) -> c_int;
pub fn TessResultRendererEndDocument(renderer: *mut c_void) -> c_int;
// NOTE: the spelling `Extention` must match the exported C symbol name
// (verify against capi.h before "fixing" it).
// Both string getters return `const char *`; the wrapper copies the contents
// into an owned `String` and never frees the pointer.
pub fn TessResultRendererExtention(renderer: *mut c_void) -> *const c_char;
pub fn TessResultRendererTitle(renderer: *mut c_void) -> *const c_char;
// Returned to callers unchanged as an `i32` by `get_image_num`.
pub fn TessResultRendererImageNum(renderer: *mut c_void) -> c_int;
}

Some files were not shown because too many files have changed in this diff Show More