This commit is contained in:
28
crates/config.m4
Normal file
28
crates/config.m4
Normal file
@@ -0,0 +1,28 @@
|
||||
dnl Configuration for Rust-based PHP extension via ext-php-rs.
dnl This file enables phpize to compile the extension using cargo instead of make.

PHP_ARG_ENABLE([kreuzberg],
  [whether to enable the kreuzberg extension],
  [AS_HELP_STRING([--enable-kreuzberg],
    [Enable kreuzberg extension support])],
  [yes])

dnl PHP_ARG_ENABLE stores its result in the PHP_KREUZBERG shell variable
dnl (PHP_ followed by the upper-cased argument name). Anything other than
dnl "no" means the extension was requested.
if test "$PHP_KREUZBERG" != "no"; then
  dnl Check that cargo is available
  AC_PATH_PROG([CARGO], [cargo], [no])
  if test "x$CARGO" = "xno"; then
    AC_MSG_ERROR([cargo is required to build this extension])
  fi

  dnl Build the Rust extension using cargo
  AC_MSG_NOTICE([Building Rust extension kreuzberg])

  dnl Set up the extension module
  PHP_NEW_EXTENSION(kreuzberg, [], $ext_shared)

  dnl Custom build: invoke cargo instead of make
  PHP_ADD_BUILD_DIR($ext_builddir)

  dnl The actual build is handled by the build.rs script;
  dnl cargo outputs the .so/.dylib/.dll which phpize will place in extension_dir.
fi
|
||||
86
crates/kreuzberg-cli/Cargo.toml
Normal file
86
crates/kreuzberg-cli/Cargo.toml
Normal file
@@ -0,0 +1,86 @@
|
||||
[package]
|
||||
name = "kreuzberg-cli"
|
||||
version.workspace = true
|
||||
edition.workspace = true
|
||||
rust-version.workspace = true
|
||||
authors.workspace = true
|
||||
description = "Command-line interface for Kreuzberg document intelligence"
|
||||
license.workspace = true
|
||||
repository.workspace = true
|
||||
homepage = "https://kreuzberg.dev"
|
||||
documentation = "https://docs.kreuzberg.dev"
|
||||
keywords = ["document", "extraction", "cli", "tool", "parser"]
|
||||
categories = ["command-line-utilities", "text-processing"]
|
||||
|
||||
[package.metadata.cargo-machete]
|
||||
ignored = ["serde_toon_format"]
|
||||
|
||||
[[bin]]
|
||||
name = "kreuzberg"
|
||||
path = "src/main.rs"
|
||||
|
||||
[features]
|
||||
default = [
|
||||
"embeddings",
|
||||
"html",
|
||||
"liter-llm",
|
||||
"ocr",
|
||||
"paddle-ocr",
|
||||
"layout-detection",
|
||||
"chunking-tokenizers",
|
||||
"tree-sitter",
|
||||
]
|
||||
ort-bundled = ["kreuzberg/ort-bundled"]
|
||||
|
||||
ocr = ["kreuzberg/ocr"]
|
||||
|
||||
api = ["kreuzberg/api"]
|
||||
mcp = ["kreuzberg/mcp"]
|
||||
mcp-http = ["kreuzberg/mcp-http"]
|
||||
embeddings = ["kreuzberg/embeddings"]
|
||||
paddle-ocr = ["kreuzberg/paddle-ocr"]
|
||||
layout-detection = ["kreuzberg/layout-detection"]
|
||||
chunking-tokenizers = ["kreuzberg/chunking-tokenizers"]
|
||||
html = ["kreuzberg/html"]
|
||||
liter-llm = ["kreuzberg/liter-llm"]
|
||||
tree-sitter = ["kreuzberg/tree-sitter", "dep:tree-sitter-language-pack"]
|
||||
all = [
|
||||
"default",
|
||||
"api",
|
||||
"html",
|
||||
"mcp",
|
||||
"mcp-http",
|
||||
"chunking-tokenizers",
|
||||
"tree-sitter",
|
||||
"liter-llm",
|
||||
]
|
||||
|
||||
[dependencies]
|
||||
|
||||
anstyle = "1"
|
||||
anyhow = { workspace = true }
|
||||
base64 = { workspace = true }
|
||||
clap = { workspace = true }
|
||||
clap_complete = "4.6"
|
||||
kreuzberg = { workspace = true, features = [
|
||||
"formats",
|
||||
"analysis",
|
||||
"tokio-runtime",
|
||||
"simd-utf8",
|
||||
"cli",
|
||||
] }
|
||||
serde = { workspace = true }
|
||||
serde_json = { workspace = true }
|
||||
serde_toon_format = { workspace = true }
|
||||
tokio = { workspace = true }
|
||||
tracing = { workspace = true }
|
||||
tracing-subscriber = { version = "0.3", features = ["env-filter", "fmt"] }
|
||||
tree-sitter-language-pack = { workspace = true, features = [
|
||||
"dynamic-loading",
|
||||
"download",
|
||||
"serde",
|
||||
], optional = true }
|
||||
|
||||
[dev-dependencies]
|
||||
tempfile = { workspace = true }
|
||||
ureq = { version = "3.3", features = ["json"] }
|
||||
1027
crates/kreuzberg-cli/README.md
Normal file
1027
crates/kreuzberg-cli/README.md
Normal file
File diff suppressed because it is too large
Load Diff
13
crates/kreuzberg-cli/build.rs
Normal file
13
crates/kreuzberg-cli/build.rs
Normal file
@@ -0,0 +1,13 @@
|
||||
/// Build script entry point.
///
/// Registers the custom `coverage` cfg with Cargo's check-cfg machinery and,
/// for macOS and Linux targets, emits two linker rpath arguments anchored at
/// the platform's "directory of the loading binary" token (`@loader_path` on
/// macOS, `$ORIGIN` on Linux). Other targets get no extra linker arguments.
fn main() {
    println!("cargo::rustc-check-cfg=cfg(coverage)");

    // Cargo sets `TARGET` for build scripts; the unwrap only fires when the
    // script is run outside of Cargo.
    let triple = std::env::var("TARGET").unwrap();

    // Pick the rpath anchor token for the target platform, if it has one.
    let anchor = if triple.contains("darwin") {
        Some("@loader_path")
    } else if triple.contains("linux") {
        Some("$ORIGIN")
    } else {
        None
    };

    if let Some(anchor) = anchor {
        println!("cargo:rustc-link-arg=-Wl,-rpath,{}", anchor);
        println!("cargo:rustc-link-arg=-Wl,-rpath,{}/.", anchor);
    }
}
|
||||
466
crates/kreuzberg-cli/src/commands/cache.rs
Normal file
466
crates/kreuzberg-cli/src/commands/cache.rs
Normal file
@@ -0,0 +1,466 @@
|
||||
//! Cache command - Manage cache operations
|
||||
//!
|
||||
//! This module provides commands for cache management including statistics,
|
||||
//! clearing, manifest generation, and model warming.
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use kreuzberg::cache;
|
||||
use serde_json::json;
|
||||
use std::path::PathBuf;
|
||||
|
||||
use crate::{WireFormat, style};
|
||||
|
||||
/// Execute cache stats command
|
||||
pub fn stats_command(cache_dir: Option<PathBuf>, format: WireFormat) -> Result<()> {
|
||||
let default_cache_dir = std::env::current_dir()
|
||||
.context("Failed to get current directory")?
|
||||
.join(".kreuzberg");
|
||||
|
||||
let cache_path = cache_dir.unwrap_or(default_cache_dir);
|
||||
let cache_dir_str = cache_path.to_string_lossy();
|
||||
|
||||
let stats = cache::get_cache_metadata(&cache_dir_str).with_context(|| {
|
||||
format!(
|
||||
"Failed to get cache statistics from directory '{}'. Ensure the directory exists and is readable.",
|
||||
cache_dir_str
|
||||
)
|
||||
})?;
|
||||
|
||||
match format {
|
||||
WireFormat::Text => {
|
||||
println!("{}", style::header("Cache Statistics"));
|
||||
println!("{}", style::dim("================"));
|
||||
println!("{} {}", style::label("Directory:"), style::success(&cache_dir_str));
|
||||
println!("{} {}", style::label("Total files:"), stats.total_files);
|
||||
println!("{} {:.2} MB", style::label("Total size:"), stats.total_size_mb);
|
||||
println!(
|
||||
"{} {:.2} MB",
|
||||
style::label("Available space:"),
|
||||
stats.available_space_mb
|
||||
);
|
||||
println!(
|
||||
"{} {:.2} days",
|
||||
style::label("Oldest file age:"),
|
||||
stats.oldest_file_age_days
|
||||
);
|
||||
println!(
|
||||
"{} {:.2} days",
|
||||
style::label("Newest file age:"),
|
||||
stats.newest_file_age_days
|
||||
);
|
||||
}
|
||||
WireFormat::Json => {
|
||||
let output = json!({
|
||||
"directory": cache_dir_str,
|
||||
"total_files": stats.total_files,
|
||||
"total_size_mb": stats.total_size_mb,
|
||||
"available_space_mb": stats.available_space_mb,
|
||||
"oldest_file_age_days": stats.oldest_file_age_days,
|
||||
"newest_file_age_days": stats.newest_file_age_days,
|
||||
});
|
||||
println!(
|
||||
"{}",
|
||||
serde_json::to_string_pretty(&output).context("Failed to serialize cache statistics to JSON")?
|
||||
);
|
||||
}
|
||||
WireFormat::Toon => {
|
||||
let output = json!({
|
||||
"directory": cache_dir_str,
|
||||
"total_files": stats.total_files,
|
||||
"total_size_mb": stats.total_size_mb,
|
||||
"available_space_mb": stats.available_space_mb,
|
||||
"oldest_file_age_days": stats.oldest_file_age_days,
|
||||
"newest_file_age_days": stats.newest_file_age_days,
|
||||
});
|
||||
println!(
|
||||
"{}",
|
||||
serde_toon::to_string(&output).context("Failed to serialize cache statistics to TOON")?
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Execute cache clear command
|
||||
pub fn clear_command(cache_dir: Option<PathBuf>, format: WireFormat) -> Result<()> {
|
||||
let default_cache_dir = std::env::current_dir()
|
||||
.context("Failed to get current directory")?
|
||||
.join(".kreuzberg");
|
||||
|
||||
let cache_path = cache_dir.unwrap_or(default_cache_dir);
|
||||
let cache_dir_str = cache_path.to_string_lossy();
|
||||
|
||||
let (removed_files, freed_mb) = cache::clear_cache_directory(&cache_dir_str).with_context(|| {
|
||||
format!(
|
||||
"Failed to clear cache directory '{}'. Ensure you have write permissions.",
|
||||
cache_dir_str
|
||||
)
|
||||
})?;
|
||||
|
||||
match format {
|
||||
WireFormat::Text => {
|
||||
println!("{}", style::success("Cache cleared successfully"));
|
||||
println!("{} {}", style::label("Directory:"), style::success(&cache_dir_str));
|
||||
println!("{} {}", style::label("Removed files:"), removed_files);
|
||||
println!("{} {:.2} MB", style::label("Freed space:"), freed_mb);
|
||||
}
|
||||
WireFormat::Json => {
|
||||
let output = json!({
|
||||
"directory": cache_dir_str,
|
||||
"removed_files": removed_files,
|
||||
"freed_mb": freed_mb,
|
||||
});
|
||||
println!(
|
||||
"{}",
|
||||
serde_json::to_string_pretty(&output).context("Failed to serialize cache clear results to JSON")?
|
||||
);
|
||||
}
|
||||
WireFormat::Toon => {
|
||||
let output = json!({
|
||||
"directory": cache_dir_str,
|
||||
"removed_files": removed_files,
|
||||
"freed_mb": freed_mb,
|
||||
});
|
||||
println!(
|
||||
"{}",
|
||||
serde_toon::to_string(&output).context("Failed to serialize cache clear results to TOON")?
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Execute cache manifest command - outputs expected model files with checksums.
|
||||
pub fn manifest_command(format: WireFormat) -> Result<()> {
|
||||
// Without at least one model-providing feature, every `extend` call
|
||||
// below is `#[cfg]`-stripped and `entries: Vec<_>` has no anchor for
|
||||
// type inference — `e.size_bytes` on the closure further down then
|
||||
// fails compilation with E0282. Bail with a clear error instead so
|
||||
// (or similar minimal configurations) succeeds.
|
||||
#[cfg(not(any(feature = "paddle-ocr", feature = "layout-detection")))]
|
||||
{
|
||||
let _ = format;
|
||||
anyhow::bail!(
|
||||
"manifest command unavailable: build kreuzberg-cli with at least one of \
|
||||
--features \"paddle-ocr\" or --features \"layout-detection\""
|
||||
);
|
||||
}
|
||||
|
||||
#[cfg(any(feature = "paddle-ocr", feature = "layout-detection"))]
|
||||
{
|
||||
manifest_command_inner(format)
|
||||
}
|
||||
}
|
||||
|
||||
/// Collect the model manifest from every enabled provider and print it.
///
/// Entries are gathered, in this order, from the PaddleOCR model manager
/// (`paddle-ocr`), the layout model manager (`layout-detection`), and the
/// tessdata manager (also gated on `paddle-ocr`). The text format prints a
/// table with path, size, and a shortened SHA-256, followed by a totals line.
/// The JSON and TOON formats emit the version, total size, entry count, and
/// the full entries.
///
/// # Errors
///
/// Returns an error only if serializing the structured output fails.
#[cfg(any(feature = "paddle-ocr", feature = "layout-detection"))]
fn manifest_command_inner(format: WireFormat) -> Result<()> {
    let mut entries = Vec::new();

    #[cfg(feature = "paddle-ocr")]
    {
        entries.extend(kreuzberg::paddle_ocr::ModelManager::manifest());
    }

    #[cfg(feature = "layout-detection")]
    {
        entries.extend(kreuzberg::layout::LayoutModelManager::manifest());
    }

    #[cfg(feature = "paddle-ocr")]
    {
        entries.extend(kreuzberg::ocr::TessdataManager::manifest());
    }

    let total_size_bytes: u64 = entries.iter().map(|e| e.size_bytes).sum();
    let version = env!("CARGO_PKG_VERSION");

    // JSON and TOON share one payload; build it on demand.
    let payload = || {
        json!({
            "kreuzberg_version": version,
            "total_size_bytes": total_size_bytes,
            "model_count": entries.len(),
            "models": entries,
        })
    };

    match format {
        WireFormat::Text => {
            println!(
                "{} {}",
                style::header("Model Manifest"),
                style::dim(&format!("(kreuzberg {})", version))
            );
            println!("{}", style::dim("===================================="));
            println!(
                "{:<50} {:>12} {}",
                style::label("PATH"),
                style::label("SIZE"),
                style::label("SHA256")
            );
            println!("{}", style::dim(&format!("{:<50} {:>12} ------", "----", "----")));
            for entry in &entries {
                // A size of zero is displayed as "unknown".
                let size_str = if entry.size_bytes > 0 {
                    format!("{:.1} MB", entry.size_bytes as f64 / 1_048_576.0)
                } else {
                    "unknown".to_string()
                };
                // Show at most the first 12 bytes of the hash. `get` returns
                // `None` for short strings (and for a non-char-boundary cut)
                // instead of panicking like a direct `[..12]` slice would.
                let sha_display: &str = match entry.sha256.get(..12) {
                    Some(prefix) => prefix,
                    None if entry.sha256.is_empty() => "-",
                    None => &entry.sha256[..],
                };
                println!(
                    "{:<50} {:>12} {}",
                    entry.relative_path,
                    size_str,
                    style::dim(sha_display)
                );
            }
            println!();
            println!(
                "{} {} files, {:.1} MB",
                style::label("Total:"),
                entries.len(),
                total_size_bytes as f64 / 1_048_576.0
            );
        }
        WireFormat::Json => {
            println!(
                "{}",
                serde_json::to_string_pretty(&payload()).context("Failed to serialize manifest to JSON")?
            );
        }
        WireFormat::Toon => {
            println!(
                "{}",
                serde_toon::to_string(&payload()).context("Failed to serialize manifest to TOON")?
            );
        }
    }

    Ok(())
}
|
||||
|
||||
/// Execute cache warm command - eagerly downloads all models.
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub fn warm_command(
|
||||
cache_dir: Option<PathBuf>,
|
||||
format: WireFormat,
|
||||
all_embeddings: bool,
|
||||
embedding_model: Option<String>,
|
||||
all_table_models: bool,
|
||||
all_grammars: bool,
|
||||
grammar_groups: Option<Vec<String>>,
|
||||
grammars: Option<Vec<String>>,
|
||||
) -> Result<()> {
|
||||
let cache_base = resolve_cache_base(cache_dir);
|
||||
|
||||
let mut downloaded: Vec<String> = Vec::new();
|
||||
let mut already_cached: Vec<String> = Vec::new();
|
||||
|
||||
#[cfg(feature = "paddle-ocr")]
|
||||
{
|
||||
let paddle_dir = cache_base.join("paddle-ocr");
|
||||
let manager = kreuzberg::paddle_ocr::ModelManager::new(paddle_dir);
|
||||
|
||||
// ensure_all_models downloads v2 det (server+mobile), cls (PP-LCNet),
|
||||
// doc_ori, v2 unified rec models, and all per-script rec families
|
||||
manager
|
||||
.ensure_all_models()
|
||||
.context("Failed to download PaddleOCR v2 models")?;
|
||||
downloaded.push("paddle-ocr v2 (server+mobile det, cls, doc_ori, unified+per-script rec)".to_string());
|
||||
}
|
||||
|
||||
#[cfg(feature = "layout-detection")]
|
||||
{
|
||||
let layout_dir = cache_base.join("layout");
|
||||
let manager = kreuzberg::layout::LayoutModelManager::new(Some(layout_dir));
|
||||
|
||||
if all_table_models {
|
||||
// Download rtdetr + tatr + all SLANeXT variants (~730MB)
|
||||
let was_cached = manager.is_rtdetr_cached() && manager.is_tatr_cached();
|
||||
if was_cached {
|
||||
already_cached.push("layout (rtdetr, tatr, slanet variants)".to_string());
|
||||
} else {
|
||||
manager
|
||||
.ensure_all_models()
|
||||
.context("Failed to download layout models")?;
|
||||
downloaded.push("layout (rtdetr, tatr, slanet variants)".to_string());
|
||||
}
|
||||
} else {
|
||||
// Default: download only rtdetr + tatr
|
||||
let was_cached = manager.is_rtdetr_cached() && manager.is_tatr_cached();
|
||||
if was_cached {
|
||||
already_cached.push("layout (rtdetr, tatr)".to_string());
|
||||
} else {
|
||||
manager
|
||||
.ensure_default_models()
|
||||
.context("Failed to download layout models")?;
|
||||
downloaded.push("layout (rtdetr, tatr)".to_string());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(feature = "paddle-ocr")]
|
||||
{
|
||||
let tessdata_dir = cache_base.join("tessdata");
|
||||
let manager = kreuzberg::ocr::TessdataManager::new(Some(tessdata_dir));
|
||||
|
||||
let newly_downloaded = manager
|
||||
.ensure_all_languages()
|
||||
.context("Failed to download tessdata files")?;
|
||||
|
||||
if newly_downloaded > 0 {
|
||||
downloaded.push(format!("tessdata ({newly_downloaded} languages)"));
|
||||
} else {
|
||||
already_cached.push("tessdata (all languages)".to_string());
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(feature = "embeddings")]
|
||||
{
|
||||
let embeddings_dir = cache_base.join("embeddings");
|
||||
let presets_to_warm: Vec<kreuzberg::EmbeddingPreset> = if all_embeddings {
|
||||
kreuzberg::list_embedding_presets()
|
||||
.into_iter()
|
||||
.filter_map(|name| kreuzberg::get_embedding_preset(&name))
|
||||
.collect()
|
||||
} else if let Some(ref name) = embedding_model {
|
||||
match kreuzberg::get_embedding_preset(name) {
|
||||
Some(preset) => vec![preset],
|
||||
None => {
|
||||
let available = kreuzberg::list_embedding_presets();
|
||||
anyhow::bail!(
|
||||
"Unknown embedding preset '{}'. Available: {}",
|
||||
name,
|
||||
available.join(", ")
|
||||
);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
vec![]
|
||||
};
|
||||
|
||||
for preset in &presets_to_warm {
|
||||
let label = format!("embedding ({})", preset.name);
|
||||
kreuzberg::embeddings::warm_model(
|
||||
&kreuzberg::core::config::EmbeddingModelType::Preset {
|
||||
name: preset.name.clone(),
|
||||
},
|
||||
Some(embeddings_dir.clone()),
|
||||
)
|
||||
.map_err(|e| anyhow::anyhow!("Failed to download embedding model '{}': {}", preset.name, e))?;
|
||||
downloaded.push(label);
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(not(feature = "embeddings"))]
|
||||
{
|
||||
if all_embeddings || embedding_model.is_some() {
|
||||
anyhow::bail!("Embedding model warming requires the 'embeddings' feature to be enabled");
|
||||
}
|
||||
}
|
||||
|
||||
// Tree-sitter grammar downloads
|
||||
#[cfg(feature = "tree-sitter")]
|
||||
{
|
||||
if all_grammars {
|
||||
let count =
|
||||
tree_sitter_language_pack::download_all().context("Failed to download all tree-sitter grammars")?;
|
||||
if count > 0 {
|
||||
downloaded.push(format!("tree-sitter grammars ({count} languages)"));
|
||||
} else {
|
||||
already_cached.push("tree-sitter grammars (all)".to_string());
|
||||
}
|
||||
} else if let Some(ref groups) = grammar_groups {
|
||||
let config = tree_sitter_language_pack::PackConfig {
|
||||
cache_dir: None,
|
||||
languages: None,
|
||||
groups: Some(groups.clone()),
|
||||
};
|
||||
tree_sitter_language_pack::init(&config).context("Failed to download tree-sitter grammar groups")?;
|
||||
downloaded.push(format!("tree-sitter grammars (groups: {})", groups.join(", ")));
|
||||
} else if let Some(ref langs) = grammars {
|
||||
let refs: Vec<&str> = langs.iter().map(String::as_str).collect();
|
||||
let count =
|
||||
tree_sitter_language_pack::download(&refs).context("Failed to download tree-sitter grammars")?;
|
||||
if count > 0 {
|
||||
downloaded.push(format!("tree-sitter grammars ({count} languages)"));
|
||||
} else {
|
||||
already_cached.push(format!("tree-sitter grammars ({})", langs.join(", ")));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(not(feature = "tree-sitter"))]
|
||||
{
|
||||
if all_grammars || grammar_groups.is_some() || grammars.is_some() {
|
||||
anyhow::bail!("Tree-sitter grammar warming requires the 'tree-sitter' feature to be enabled");
|
||||
}
|
||||
}
|
||||
|
||||
match format {
|
||||
WireFormat::Text => {
|
||||
if !downloaded.is_empty() {
|
||||
println!("{}", style::label("Downloaded:"));
|
||||
for d in &downloaded {
|
||||
println!(" {}", style::success(d));
|
||||
}
|
||||
}
|
||||
if !already_cached.is_empty() {
|
||||
println!("{}", style::label("Already cached:"));
|
||||
for c in &already_cached {
|
||||
println!(" {}", style::dim(c));
|
||||
}
|
||||
}
|
||||
println!(
|
||||
"All models ready in {}",
|
||||
style::success(&cache_base.display().to_string())
|
||||
);
|
||||
}
|
||||
WireFormat::Json => {
|
||||
let output = json!({
|
||||
"cache_dir": cache_base.to_string_lossy(),
|
||||
"downloaded": downloaded,
|
||||
"already_cached": already_cached,
|
||||
});
|
||||
println!(
|
||||
"{}",
|
||||
serde_json::to_string_pretty(&output).context("Failed to serialize warm results to JSON")?
|
||||
);
|
||||
}
|
||||
WireFormat::Toon => {
|
||||
let output = json!({
|
||||
"cache_dir": cache_base.to_string_lossy(),
|
||||
"downloaded": downloaded,
|
||||
"already_cached": already_cached,
|
||||
});
|
||||
println!(
|
||||
"{}",
|
||||
serde_toon::to_string(&output).context("Failed to serialize warm results to TOON")?
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Resolve the cache base directory.
///
/// Precedence, first match wins:
/// 1. The explicit `cache_dir` argument.
/// 2. The `KREUZBERG_CACHE_DIR` environment variable.
/// 3. `.kreuzberg` inside the current working directory, or `./.kreuzberg`
///    when the current directory cannot be determined.
///
/// The environment variable is read with `var_os`, so a path that is not
/// valid UTF-8 is honoured instead of being silently ignored.
fn resolve_cache_base(cache_dir: Option<PathBuf>) -> PathBuf {
    cache_dir
        .or_else(|| std::env::var_os("KREUZBERG_CACHE_DIR").map(PathBuf::from))
        .unwrap_or_else(|| {
            std::env::current_dir()
                .unwrap_or_else(|_| PathBuf::from("."))
                .join(".kreuzberg")
        })
}
|
||||
61
crates/kreuzberg-cli/src/commands/chunk.rs
Normal file
61
crates/kreuzberg-cli/src/commands/chunk.rs
Normal file
@@ -0,0 +1,61 @@
|
||||
//! Chunk command implementation.
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
|
||||
use crate::{WireFormat, style};
|
||||
|
||||
/// Execute the chunk command: split text into chunks.
|
||||
pub fn chunk_command(text: String, config: kreuzberg::ChunkingConfig, format: WireFormat) -> Result<()> {
|
||||
if text.is_empty() {
|
||||
anyhow::bail!("No text provided for chunking. Provide --text or pipe text via stdin.");
|
||||
}
|
||||
|
||||
let result = kreuzberg::chunking::chunk_text(&text, &config, None).context("Failed to chunk text")?;
|
||||
|
||||
match format {
|
||||
WireFormat::Json => {
|
||||
let chunks: Vec<&str> = result.chunks.iter().map(|c| c.content.as_str()).collect();
|
||||
let output = serde_json::json!({
|
||||
"chunks": chunks,
|
||||
"chunk_count": result.chunk_count,
|
||||
"config": {
|
||||
"max_characters": config.max_characters,
|
||||
"overlap": config.overlap,
|
||||
"chunker_type": format!("{:?}", config.chunker_type),
|
||||
},
|
||||
"input_size_bytes": text.len(),
|
||||
});
|
||||
println!(
|
||||
"{}",
|
||||
serde_json::to_string_pretty(&output).context("Failed to serialize chunks to JSON")?
|
||||
);
|
||||
}
|
||||
WireFormat::Toon => {
|
||||
let chunks: Vec<&str> = result.chunks.iter().map(|c| c.content.as_str()).collect();
|
||||
let output = serde_json::json!({
|
||||
"chunks": chunks,
|
||||
"chunk_count": result.chunk_count,
|
||||
"config": {
|
||||
"max_characters": config.max_characters,
|
||||
"overlap": config.overlap,
|
||||
"chunker_type": format!("{:?}", config.chunker_type),
|
||||
},
|
||||
"input_size_bytes": text.len(),
|
||||
});
|
||||
println!(
|
||||
"{}",
|
||||
serde_toon::to_string(&output).context("Failed to serialize chunks to TOON")?
|
||||
);
|
||||
}
|
||||
WireFormat::Text => {
|
||||
for (i, chunk) in result.chunks.iter().enumerate() {
|
||||
if result.chunks.len() > 1 {
|
||||
println!("{}", style::dim(&format!("--- chunk {} ---", i + 1)));
|
||||
}
|
||||
println!("{}", chunk.content);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
51
crates/kreuzberg-cli/src/commands/config.rs
Normal file
51
crates/kreuzberg-cli/src/commands/config.rs
Normal file
@@ -0,0 +1,51 @@
|
||||
//! Config command - Configuration loading and discovery
|
||||
//!
|
||||
//! This module provides utilities for loading extraction configuration from files
|
||||
//! or discovering them automatically in the project directory.
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use kreuzberg::ExtractionConfig;
|
||||
use std::path::PathBuf;
|
||||
|
||||
/// Loads extraction configuration from a file or discovers it automatically.
|
||||
///
|
||||
/// This function implements the CLI's configuration hierarchy:
|
||||
/// 1. Explicit config file (if `--config` flag provided)
|
||||
/// 2. Auto-discovered config (searches `kreuzberg.{toml,yaml,json}` in current and parent directories)
|
||||
/// 3. Default configuration (if no config file found)
|
||||
///
|
||||
/// # Configuration File Formats
|
||||
///
|
||||
/// Supports three formats, determined by file extension:
|
||||
/// - `.toml`: TOML format (recommended for humans)
|
||||
/// - `.yaml` / `.yml`: YAML format
|
||||
/// - `.json`: JSON format
|
||||
///
|
||||
/// # Errors
|
||||
///
|
||||
/// Returns an error if:
|
||||
/// - Explicit config file has unsupported extension (must be .toml, .yaml, .yml, or .json)
|
||||
/// - Config file cannot be read or parsed
|
||||
/// - Config file contains invalid extraction settings
|
||||
pub fn load_config(config_path: Option<PathBuf>) -> Result<ExtractionConfig> {
|
||||
if let Some(path) = config_path {
|
||||
let path_str = path.to_string_lossy();
|
||||
let path_lower = path_str.to_lowercase();
|
||||
let config = if path_lower.ends_with(".toml") {
|
||||
ExtractionConfig::from_toml_file(&path)
|
||||
} else if path_lower.ends_with(".yaml") || path_lower.ends_with(".yml") {
|
||||
ExtractionConfig::from_yaml_file(&path)
|
||||
} else if path_lower.ends_with(".json") {
|
||||
ExtractionConfig::from_json_file(&path)
|
||||
} else {
|
||||
anyhow::bail!("Config file must have .toml, .yaml, .yml, or .json extension (case-insensitive)");
|
||||
};
|
||||
config.with_context(|| format!("Failed to load configuration from '{}'. Ensure the file exists, is readable, and contains valid configuration.", path.display()))
|
||||
} else {
|
||||
match ExtractionConfig::discover() {
|
||||
Ok(Some(config)) => Ok(config),
|
||||
Ok(None) => Ok(ExtractionConfig::default()),
|
||||
Err(e) => Err(e).context("Failed to auto-discover configuration file. Searched for kreuzberg.{toml,yaml,json} in current and parent directories. Use --config to specify an explicit path."),
|
||||
}
|
||||
}
|
||||
}
|
||||
161
crates/kreuzberg-cli/src/commands/embed.rs
Normal file
161
crates/kreuzberg-cli/src/commands/embed.rs
Normal file
@@ -0,0 +1,161 @@
|
||||
//! Embed command implementation.
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
|
||||
use crate::{WireFormat, style};
|
||||
|
||||
/// Execute the embed command: generate embeddings for input texts.
|
||||
///
|
||||
/// When `provider` is `"local"` (default), uses the ONNX preset model.
|
||||
/// When `provider` is `"llm"`, uses liter-llm with the specified model and API key.
|
||||
/// When `provider` is `"plugin"`, dispatches to a pre-registered in-process embedding backend.
|
||||
pub fn embed_command(
|
||||
texts: Vec<String>,
|
||||
preset: &str,
|
||||
provider: &str,
|
||||
llm_model: Option<String>,
|
||||
llm_api_key: Option<String>,
|
||||
plugin_name: Option<String>,
|
||||
format: WireFormat,
|
||||
) -> Result<()> {
|
||||
if texts.is_empty() {
|
||||
anyhow::bail!("No texts provided for embedding. Provide --text or pipe text via stdin.");
|
||||
}
|
||||
|
||||
// Validate no empty texts
|
||||
for (i, t) in texts.iter().enumerate() {
|
||||
if t.is_empty() {
|
||||
anyhow::bail!("Text at position {} is empty. All texts must be non-empty.", i + 1);
|
||||
}
|
||||
}
|
||||
|
||||
let (config, model_label) = match provider {
|
||||
"llm" => {
|
||||
let model = llm_model.as_deref().ok_or_else(|| {
|
||||
anyhow::anyhow!(
|
||||
"--model is required when --provider is 'llm' (e.g., --model openai/text-embedding-3-small)"
|
||||
)
|
||||
})?;
|
||||
|
||||
let llm_config = kreuzberg::LlmConfig {
|
||||
model: model.to_string(),
|
||||
api_key: llm_api_key,
|
||||
base_url: None,
|
||||
timeout_secs: None,
|
||||
max_retries: None,
|
||||
temperature: None,
|
||||
max_tokens: None,
|
||||
};
|
||||
|
||||
let config = kreuzberg::EmbeddingConfig {
|
||||
model: kreuzberg::EmbeddingModelType::Llm { llm: llm_config },
|
||||
show_download_progress: true,
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
(config, model.to_string())
|
||||
}
|
||||
"local" | "" => {
|
||||
// Validate preset for local provider
|
||||
let _preset_info = kreuzberg::get_embedding_preset(preset).with_context(|| {
|
||||
format!(
|
||||
"Unknown embedding preset '{}'. Available: {:?}",
|
||||
preset,
|
||||
kreuzberg::list_embedding_presets()
|
||||
)
|
||||
})?;
|
||||
|
||||
let config = kreuzberg::EmbeddingConfig {
|
||||
model: kreuzberg::EmbeddingModelType::Preset {
|
||||
name: preset.to_string(),
|
||||
},
|
||||
show_download_progress: true,
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
(config, preset.to_string())
|
||||
}
|
||||
"plugin" => {
|
||||
let name = plugin_name.as_deref().ok_or_else(|| {
|
||||
anyhow::anyhow!(
|
||||
"--plugin NAME is required when --provider is 'plugin'. Register a backend via kreuzberg::plugins::register_embedding_backend first."
|
||||
)
|
||||
})?;
|
||||
if name.is_empty() {
|
||||
anyhow::bail!("--plugin NAME must not be empty.");
|
||||
}
|
||||
|
||||
// Pre-flight: surface unknown backends with a list of registered names
|
||||
// (parity with the REST handler, which returns 422 for the same case).
|
||||
let available =
|
||||
kreuzberg::plugins::list_embedding_backends().context("Failed to read embedding backend registry")?;
|
||||
if !available.iter().any(|n| n == name) {
|
||||
anyhow::bail!(
|
||||
"Embedding backend '{}' is not registered. Available backends: {}",
|
||||
name,
|
||||
if available.is_empty() {
|
||||
"(none registered)".to_string()
|
||||
} else {
|
||||
available.join(", ")
|
||||
}
|
||||
);
|
||||
}
|
||||
|
||||
let config = kreuzberg::EmbeddingConfig {
|
||||
model: kreuzberg::EmbeddingModelType::Plugin { name: name.to_string() },
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
(config, name.to_string())
|
||||
}
|
||||
other => {
|
||||
anyhow::bail!(
|
||||
"Unknown embedding provider '{}'. Valid providers: 'local' (default, ONNX), 'llm' (liter-llm), or 'plugin' (in-process backend).",
|
||||
other
|
||||
);
|
||||
}
|
||||
};
|
||||
|
||||
// Generate embeddings
|
||||
let embeddings = kreuzberg::embed_texts(texts.clone(), &config).context("Failed to generate embeddings")?;
|
||||
|
||||
let dimensions = embeddings.first().map(|e| e.len()).unwrap_or(0);
|
||||
|
||||
match format {
|
||||
WireFormat::Json => {
|
||||
let output = serde_json::json!({
|
||||
"embeddings": embeddings,
|
||||
"model": model_label,
|
||||
"dimensions": dimensions,
|
||||
"count": embeddings.len(),
|
||||
});
|
||||
println!(
|
||||
"{}",
|
||||
serde_json::to_string_pretty(&output).context("Failed to serialize embeddings to JSON")?
|
||||
);
|
||||
}
|
||||
WireFormat::Toon => {
|
||||
let output = serde_json::json!({
|
||||
"embeddings": embeddings,
|
||||
"model": model_label,
|
||||
"dimensions": dimensions,
|
||||
"count": embeddings.len(),
|
||||
});
|
||||
println!(
|
||||
"{}",
|
||||
serde_toon::to_string(&output).context("Failed to serialize embeddings to TOON")?
|
||||
);
|
||||
}
|
||||
WireFormat::Text => {
|
||||
for (i, embedding) in embeddings.iter().enumerate() {
|
||||
if texts.len() > 1 {
|
||||
println!("{}", style::dim(&format!("# text {}", i + 1)));
|
||||
}
|
||||
let values: Vec<String> = embedding.iter().map(|v| format!("{v}")).collect();
|
||||
println!("{}", values.join(","));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
180
crates/kreuzberg-cli/src/commands/extract.rs
Normal file
180
crates/kreuzberg-cli/src/commands/extract.rs
Normal file
@@ -0,0 +1,180 @@
|
||||
//! Extract command - Extract text and data from documents
|
||||
//!
|
||||
//! This module provides the extract and batch extract commands for processing single
|
||||
//! or multiple documents with customizable extraction configurations.
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use kreuzberg::{
|
||||
BatchFileItem, ExtractionConfig, ExtractionResult, FileExtractionConfig, batch_extract_files_sync,
|
||||
extract_file_sync,
|
||||
};
|
||||
use std::path::PathBuf;
|
||||
use std::time::Instant;
|
||||
|
||||
use crate::{
|
||||
WireFormat,
|
||||
output::{BatchEnvelope, ExtractEnvelope},
|
||||
style,
|
||||
};
|
||||
|
||||
/// Execute single document extraction command
|
||||
pub fn extract_command(
|
||||
path: PathBuf,
|
||||
config: ExtractionConfig,
|
||||
mime_type: Option<String>,
|
||||
format: WireFormat,
|
||||
) -> Result<()> {
|
||||
let path_str = path.to_string_lossy().to_string();
|
||||
|
||||
let t0 = Instant::now();
|
||||
let result = extract_file_sync(&path_str, mime_type.as_deref(), &config).with_context(|| {
|
||||
format!(
|
||||
"Failed to extract file '{}'. Ensure the file is readable and the format is supported.",
|
||||
path.display()
|
||||
)
|
||||
})?;
|
||||
let extraction_time_ms = t0.elapsed().as_secs_f64() * 1000.0;
|
||||
|
||||
match format {
|
||||
WireFormat::Text => {
|
||||
print!("{}", result.content);
|
||||
}
|
||||
WireFormat::Json => {
|
||||
let envelope = ExtractEnvelope {
|
||||
result,
|
||||
extraction_time_ms,
|
||||
};
|
||||
println!(
|
||||
"{}",
|
||||
serde_json::to_string_pretty(&envelope).context("Failed to serialize extraction result to JSON")?
|
||||
);
|
||||
}
|
||||
WireFormat::Toon => {
|
||||
println!(
|
||||
"{}",
|
||||
serde_toon::to_string(&result).context("Failed to serialize extraction result to TOON")?
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Execute batch extraction command with optional per-file configuration overrides
|
||||
pub fn batch_command(
|
||||
paths: Vec<PathBuf>,
|
||||
file_configs_map: Option<std::collections::HashMap<String, serde_json::Value>>,
|
||||
config: ExtractionConfig,
|
||||
format: WireFormat,
|
||||
) -> Result<()> {
|
||||
match format {
|
||||
WireFormat::Json => {
|
||||
// Run files one at a time to capture per-file wall-clock timings.
|
||||
// Per-file config overrides are honoured: files without an override use the
|
||||
// batch-level config directly; files with an override use a one-shot batch of
|
||||
// one item so the library's own merge logic applies.
|
||||
let mut results: Vec<ExtractionResult> = Vec::with_capacity(paths.len());
|
||||
let mut per_file_ms: Vec<f64> = Vec::with_capacity(paths.len());
|
||||
let total_t0 = Instant::now();
|
||||
|
||||
for path in &paths {
|
||||
let path_str = path.to_string_lossy().to_string();
|
||||
let has_file_config = file_configs_map.as_ref().and_then(|m| m.get(&path_str)).is_some();
|
||||
|
||||
let t0 = Instant::now();
|
||||
let result = if has_file_config {
|
||||
// Delegate to the batch API (one item) so per-file merge logic is applied.
|
||||
let file_config = file_configs_map
|
||||
.as_ref()
|
||||
.and_then(|m| m.get(&path_str))
|
||||
.map(|v| {
|
||||
serde_json::from_value::<FileExtractionConfig>(v.clone())
|
||||
.with_context(|| format!("Failed to parse file config for '{}'", path_str))
|
||||
})
|
||||
.transpose()?;
|
||||
let mut batch_results = batch_extract_files_sync(
|
||||
vec![BatchFileItem {
|
||||
path: path.clone(),
|
||||
config: file_config,
|
||||
}],
|
||||
&config,
|
||||
)
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to extract file '{}'. Ensure the file is readable and the format is supported.",
|
||||
path.display()
|
||||
)
|
||||
})?;
|
||||
batch_results.remove(0)
|
||||
} else {
|
||||
extract_file_sync(&path_str, None, &config).with_context(|| {
|
||||
format!(
|
||||
"Failed to extract file '{}'. Ensure the file is readable and the format is supported.",
|
||||
path.display()
|
||||
)
|
||||
})?
|
||||
};
|
||||
per_file_ms.push(t0.elapsed().as_secs_f64() * 1000.0);
|
||||
results.push(result);
|
||||
}
|
||||
|
||||
let total_ms = total_t0.elapsed().as_secs_f64() * 1000.0;
|
||||
let envelope = BatchEnvelope {
|
||||
results,
|
||||
total_ms,
|
||||
per_file_ms,
|
||||
};
|
||||
println!(
|
||||
"{}",
|
||||
serde_json::to_string_pretty(&envelope)
|
||||
.context("Failed to serialize batch extraction results to JSON")?
|
||||
);
|
||||
}
|
||||
WireFormat::Text => {
|
||||
let results = run_batch_sync(&paths, file_configs_map.as_ref(), &config)?;
|
||||
for (i, result) in results.iter().enumerate() {
|
||||
println!("{}", style::header(&format!("=== Document {} ===", i + 1)));
|
||||
println!("{} {}", style::label("MIME Type:"), style::success(&result.mime_type));
|
||||
println!("{}\n{}", style::label("Content:"), result.content);
|
||||
println!();
|
||||
}
|
||||
}
|
||||
WireFormat::Toon => {
|
||||
let results = run_batch_sync(&paths, file_configs_map.as_ref(), &config)?;
|
||||
println!(
|
||||
"{}",
|
||||
serde_toon::to_string(&results).context("Failed to serialize batch extraction results to TOON")?
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Run batch extraction using the synchronous batch API for non-JSON output paths.
|
||||
fn run_batch_sync(
|
||||
paths: &[PathBuf],
|
||||
file_configs_map: Option<&std::collections::HashMap<String, serde_json::Value>>,
|
||||
config: &ExtractionConfig,
|
||||
) -> Result<Vec<ExtractionResult>> {
|
||||
let items: Vec<BatchFileItem> = paths
|
||||
.iter()
|
||||
.map(|p| {
|
||||
let path_str = p.to_string_lossy().to_string();
|
||||
let file_config = file_configs_map
|
||||
.and_then(|m| m.get(&path_str))
|
||||
.map(|v| {
|
||||
serde_json::from_value::<FileExtractionConfig>(v.clone())
|
||||
.with_context(|| format!("Failed to parse file config for '{}'", path_str))
|
||||
})
|
||||
.transpose()?;
|
||||
Ok(BatchFileItem {
|
||||
path: p.clone(),
|
||||
config: file_config,
|
||||
})
|
||||
})
|
||||
.collect::<Result<Vec<_>>>()?;
|
||||
|
||||
batch_extract_files_sync(items, config)
|
||||
.context("Failed to batch extract documents. Check that all files are readable and formats are supported.")
|
||||
}
|
||||
116
crates/kreuzberg-cli/src/commands/extract_structured.rs
Normal file
116
crates/kreuzberg-cli/src/commands/extract_structured.rs
Normal file
@@ -0,0 +1,116 @@
|
||||
//! Extract structured command - Extract structured data from documents using an LLM.
|
||||
//!
|
||||
//! Reads a JSON schema file, configures LLM-based structured extraction, and
|
||||
//! outputs the structured result parsed from the document.
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use kreuzberg::{LlmConfig, StructuredExtractionConfig, extract_file_sync};
|
||||
use std::path::PathBuf;
|
||||
|
||||
use crate::WireFormat;
|
||||
|
||||
/// Arguments for the extract-structured command.
///
/// Consumed by value by `extract_structured_command`, which destructures every field.
pub struct ExtractStructuredArgs {
    /// Document to extract structured data from.
    pub path: PathBuf,
    /// File containing the JSON schema; read and parsed as JSON before extraction.
    pub schema_path: PathBuf,
    /// Model identifier, forwarded as `LlmConfig::model`.
    pub model: String,
    /// Optional API key, forwarded as `LlmConfig::api_key`.
    pub api_key: Option<String>,
    /// Optional prompt, forwarded as `StructuredExtractionConfig::prompt`.
    pub prompt: Option<String>,
    /// Schema name; the command falls back to `"extraction"` when `None`.
    pub schema_name: Option<String>,
    /// Forwarded as `StructuredExtractionConfig::strict`.
    pub strict: bool,
    /// Optional configuration file path, passed to `load_config`.
    pub config_path: Option<PathBuf>,
    /// Output wire format used when printing the structured result.
    pub format: WireFormat,
}
|
||||
|
||||
/// Execute the extract-structured command.
|
||||
///
|
||||
/// Reads a JSON schema from `schema_path`, builds an `ExtractionConfig` with
|
||||
/// `structured_extraction` configured, extracts the document, and outputs the
|
||||
/// `structured_output` field from the result.
|
||||
pub fn extract_structured_command(args: ExtractStructuredArgs) -> Result<()> {
|
||||
let ExtractStructuredArgs {
|
||||
path,
|
||||
schema_path,
|
||||
model,
|
||||
api_key,
|
||||
prompt,
|
||||
schema_name,
|
||||
strict,
|
||||
config_path,
|
||||
format,
|
||||
} = args;
|
||||
// 1. Read and parse the JSON schema file
|
||||
let schema_str = std::fs::read_to_string(&schema_path).with_context(|| {
|
||||
format!(
|
||||
"Failed to read JSON schema file '{}'. Ensure the file exists and is readable.",
|
||||
schema_path.display()
|
||||
)
|
||||
})?;
|
||||
let schema: serde_json::Value = serde_json::from_str(&schema_str).with_context(|| {
|
||||
format!(
|
||||
"Failed to parse JSON schema from '{}'. Ensure the file contains valid JSON.",
|
||||
schema_path.display()
|
||||
)
|
||||
})?;
|
||||
|
||||
// 2. Build ExtractionConfig with structured_extraction
|
||||
let mut config = super::load_config(config_path)?;
|
||||
|
||||
let llm_config = LlmConfig {
|
||||
model,
|
||||
api_key,
|
||||
base_url: None,
|
||||
timeout_secs: None,
|
||||
max_retries: None,
|
||||
temperature: None,
|
||||
max_tokens: None,
|
||||
};
|
||||
|
||||
config.structured_extraction = Some(StructuredExtractionConfig {
|
||||
schema,
|
||||
schema_name: schema_name.unwrap_or_else(|| "extraction".to_string()),
|
||||
schema_description: None,
|
||||
strict,
|
||||
prompt,
|
||||
llm: llm_config,
|
||||
});
|
||||
|
||||
// 3. Call kreuzberg::extract_file_sync()
|
||||
let path_str = path.to_string_lossy().to_string();
|
||||
let result = extract_file_sync(&path_str, None, &config).with_context(|| {
|
||||
format!(
|
||||
"Failed to extract structured data from '{}'. Ensure the file is readable and the LLM configuration is correct.",
|
||||
path.display()
|
||||
)
|
||||
})?;
|
||||
|
||||
// 4. Output result.structured_output (or error if None)
|
||||
let structured = result.structured_output.with_context(|| {
|
||||
"Structured extraction completed but returned no structured output. \
|
||||
This may indicate the LLM failed to produce valid structured data matching the schema."
|
||||
})?;
|
||||
|
||||
match format {
|
||||
WireFormat::Json => {
|
||||
println!(
|
||||
"{}",
|
||||
serde_json::to_string_pretty(&structured).context("Failed to serialize structured output to JSON")?
|
||||
);
|
||||
}
|
||||
WireFormat::Toon => {
|
||||
println!(
|
||||
"{}",
|
||||
serde_toon::to_string(&structured).context("Failed to serialize structured output to TOON")?
|
||||
);
|
||||
}
|
||||
WireFormat::Text => {
|
||||
// For text mode, pretty-print the JSON value
|
||||
println!(
|
||||
"{}",
|
||||
serde_json::to_string_pretty(&structured).context("Failed to serialize structured output to text")?
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
48
crates/kreuzberg-cli/src/commands/mod.rs
Normal file
48
crates/kreuzberg-cli/src/commands/mod.rs
Normal file
@@ -0,0 +1,48 @@
|
||||
//! Command modules for Kreuzberg CLI
|
||||
//!
|
||||
//! This module organizes the CLI commands into focused submodules:
|
||||
//! - `extract` - Document extraction commands
|
||||
//! - `cache` - Cache management operations
|
||||
//! - `server` - API and MCP server commands
|
||||
//! - `config` - Configuration loading and discovery
|
||||
//! - `embed` - Embedding generation commands
|
||||
//! - `chunk` - Text chunking commands
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use std::io::Read;
|
||||
|
||||
pub mod cache;
|
||||
pub mod chunk;
|
||||
pub mod config;
|
||||
#[cfg(feature = "embeddings")]
|
||||
pub mod embed;
|
||||
pub mod extract;
|
||||
pub mod extract_structured;
|
||||
pub mod overrides;
|
||||
#[cfg(any(feature = "api", feature = "mcp"))]
|
||||
pub mod server;
|
||||
|
||||
// Re-export command functions for convenience
|
||||
pub use cache::{clear_command, manifest_command, stats_command, warm_command};
|
||||
pub use chunk::chunk_command;
|
||||
pub use config::load_config;
|
||||
#[cfg(feature = "embeddings")]
|
||||
pub use embed::embed_command;
|
||||
pub use extract::{batch_command, extract_command};
|
||||
#[cfg(feature = "mcp")]
|
||||
pub use server::mcp_command;
|
||||
#[cfg(feature = "api")]
|
||||
pub use server::serve_command;
|
||||
|
||||
/// Read text from stdin, trimming whitespace.
|
||||
pub fn read_stdin() -> Result<String> {
|
||||
let mut input = String::new();
|
||||
std::io::stdin()
|
||||
.read_to_string(&mut input)
|
||||
.context("Failed to read from stdin")?;
|
||||
let trimmed = input.trim().to_string();
|
||||
if trimmed.is_empty() {
|
||||
anyhow::bail!("No input received from stdin. Provide text via --text or pipe it to stdin.");
|
||||
}
|
||||
Ok(trimmed)
|
||||
}
|
||||
1327
crates/kreuzberg-cli/src/commands/overrides.rs
Normal file
1327
crates/kreuzberg-cli/src/commands/overrides.rs
Normal file
File diff suppressed because it is too large
Load Diff
104
crates/kreuzberg-cli/src/commands/server.rs
Normal file
104
crates/kreuzberg-cli/src/commands/server.rs
Normal file
@@ -0,0 +1,104 @@
|
||||
//! Server command - Start API and MCP servers
|
||||
//!
|
||||
//! This module provides commands for starting the Kreuzberg API server
|
||||
//! and the MCP (Model Context Protocol) server.
|
||||
|
||||
use anyhow::Result;
|
||||
|
||||
/// Start the Kreuzberg API server and block until it stops.
///
/// The server configuration is resolved in increasing order of precedence:
/// config file (or defaults when no file is given), then environment variable
/// overrides, then the `cli_host` / `cli_port` arguments.
///
/// # Errors
///
/// Returns an error when the config file cannot be loaded, when applying
/// environment overrides fails, when the Tokio runtime cannot be created, or when
/// the server fails to start.
#[cfg(feature = "api")]
pub fn serve_command(
    cli_host: Option<String>,
    cli_port: Option<u16>,
    extraction_config: kreuzberg::ExtractionConfig,
    config_path: Option<std::path::PathBuf>,
) -> Result<()> {
    use anyhow::Context;
    use kreuzberg::ServerConfig;

    // Lowest precedence: the config file, or built-in defaults without one.
    let mut server_config = match config_path.as_ref() {
        Some(path) => ServerConfig::from_file(path).with_context(|| {
            format!(
                "Failed to load server configuration from '{}'. \
                 Ensure the file contains valid server settings under [server] section or at root level.",
                path.display()
            )
        })?,
        None => ServerConfig::default(),
    };

    // Environment variables override the config file.
    server_config.apply_env_overrides()?;

    // CLI arguments override everything else.
    if let Some(host) = cli_host {
        server_config.host = host;
    }
    if let Some(port) = cli_port {
        server_config.port = port;
    }

    tracing::info!(
        "Starting Kreuzberg API server on http://{}",
        server_config.listen_addr()
    );

    let runtime = tokio::runtime::Runtime::new()?;
    // The config is cloned because the original is still needed for the error message.
    let server = kreuzberg::api::serve_with_server_config(extraction_config, server_config.clone());
    runtime.block_on(server).with_context(|| {
        format!(
            "Failed to start API server on {}. Ensure the port is not already in use and you have permission to bind to this address.",
            server_config.listen_addr()
        )
    })?;

    Ok(())
}
|
||||
|
||||
/// Start the MCP server on the requested transport and block until it stops.
///
/// `transport` is matched case-insensitively against `"stdio"` and `"http"`.
/// The HTTP transport is only available when the `mcp-http` feature is enabled;
/// without it the host and port arguments are accepted but unused.
///
/// # Errors
///
/// Returns an error when the Tokio runtime cannot be created, when the server
/// fails to start, when HTTP is requested without the `mcp-http` feature, or when
/// the transport name is not recognised.
#[cfg(feature = "mcp")]
pub fn mcp_command(
    config: kreuzberg::ExtractionConfig,
    transport: String,
    #[cfg(feature = "mcp-http")] host: String,
    #[cfg(feature = "mcp-http")] port: u16,
    #[cfg(not(feature = "mcp-http"))] _host: String,
    #[cfg(not(feature = "mcp-http"))] _port: u16,
) -> Result<()> {
    tracing::debug!("Starting Kreuzberg MCP server with transport: {}", transport);
    let runtime = tokio::runtime::Runtime::new()?;
    let transport_kind = transport.to_lowercase();

    match transport_kind.as_str() {
        "stdio" => {
            let outcome = runtime.block_on(kreuzberg::mcp::start_mcp_server_with_config(config));
            outcome.map_err(|err| anyhow::anyhow!("Failed to start MCP server: {}", err))?;
        }
        "http" => {
            #[cfg(not(feature = "mcp-http"))]
            {
                anyhow::bail!(
                    "HTTP transport requires 'mcp-http' feature. \
                     Rebuild with: cargo build --features mcp-http"
                );
            }

            #[cfg(feature = "mcp-http")]
            {
                tracing::debug!("Starting MCP server on http://{}:{}", host, port);
                let outcome =
                    runtime.block_on(kreuzberg::mcp::start_mcp_server_http_with_config(&host, port, config));
                outcome.map_err(|err| anyhow::anyhow!("Failed to start MCP server on {}:{}: {}", host, port, err))?;
            }
        }
        unknown => {
            anyhow::bail!("Unknown transport '{}'. Use 'stdio' or 'http'", unknown);
        }
    }

    Ok(())
}
|
||||
230
crates/kreuzberg-cli/src/commands/tree_sitter.rs
Normal file
230
crates/kreuzberg-cli/src/commands/tree_sitter.rs
Normal file
@@ -0,0 +1,230 @@
|
||||
//! Tree-sitter grammar management commands.
|
||||
//!
|
||||
//! This module provides commands for downloading, listing, and managing
|
||||
//! tree-sitter grammar parsers via the tree-sitter-language-pack crate.
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use serde_json::json;
|
||||
use std::path::PathBuf;
|
||||
|
||||
use crate::{WireFormat, style};
|
||||
|
||||
/// Execute the tree-sitter download command.
|
||||
///
|
||||
/// Downloads tree-sitter grammar parsers based on the provided arguments:
|
||||
/// - Specific languages by name
|
||||
/// - All available languages (--all)
|
||||
/// - Language groups (--groups)
|
||||
pub fn download_command(
|
||||
languages: Vec<String>,
|
||||
all: bool,
|
||||
groups: Option<Vec<String>>,
|
||||
cache_dir: Option<PathBuf>,
|
||||
format: WireFormat,
|
||||
) -> Result<()> {
|
||||
// Apply custom cache directory if provided
|
||||
if let Some(ref dir) = cache_dir {
|
||||
let config = tree_sitter_language_pack::PackConfig {
|
||||
cache_dir: Some(dir.clone()),
|
||||
languages: None,
|
||||
groups: None,
|
||||
};
|
||||
tree_sitter_language_pack::configure(&config).context("Failed to configure custom cache directory")?;
|
||||
}
|
||||
|
||||
let count: usize;
|
||||
let description: String;
|
||||
|
||||
if all {
|
||||
count = tree_sitter_language_pack::download_all().context("Failed to download all tree-sitter grammars")?;
|
||||
description = "all available languages".to_string();
|
||||
} else if let Some(ref group_list) = groups {
|
||||
let config = tree_sitter_language_pack::PackConfig {
|
||||
cache_dir: cache_dir.clone(),
|
||||
languages: None,
|
||||
groups: Some(group_list.clone()),
|
||||
};
|
||||
tree_sitter_language_pack::init(&config).context("Failed to download tree-sitter grammar groups")?;
|
||||
count = 0; // init does not return a count
|
||||
description = format!("groups: {}", group_list.join(", "));
|
||||
} else if !languages.is_empty() {
|
||||
let refs: Vec<&str> = languages.iter().map(String::as_str).collect();
|
||||
count = tree_sitter_language_pack::download(&refs).context("Failed to download tree-sitter grammars")?;
|
||||
description = format!("languages: {}", languages.join(", "));
|
||||
} else {
|
||||
anyhow::bail!("No languages specified. Use language names, --all, --groups, or --from-config.");
|
||||
}
|
||||
|
||||
match format {
|
||||
WireFormat::Text => {
|
||||
println!("{}", style::header("Tree-sitter Download"));
|
||||
println!("{}", style::dim("===================="));
|
||||
println!("{} {}", style::label("Requested:"), description);
|
||||
if groups.is_none() || all || !languages.is_empty() {
|
||||
println!(
|
||||
"{} {}",
|
||||
style::label("Newly downloaded:"),
|
||||
style::success(&count.to_string())
|
||||
);
|
||||
}
|
||||
if let Some(ref dir) = cache_dir {
|
||||
println!(
|
||||
"{} {}",
|
||||
style::label("Cache directory:"),
|
||||
style::success(&dir.display().to_string())
|
||||
);
|
||||
}
|
||||
println!("{}", style::success("Done"));
|
||||
}
|
||||
WireFormat::Json => {
|
||||
let mut output = json!({
|
||||
"requested": description,
|
||||
"newly_downloaded": count,
|
||||
});
|
||||
if let Some(ref dir) = cache_dir {
|
||||
output["cache_dir"] = json!(dir.to_string_lossy());
|
||||
}
|
||||
println!(
|
||||
"{}",
|
||||
serde_json::to_string_pretty(&output).context("Failed to serialize download results to JSON")?
|
||||
);
|
||||
}
|
||||
WireFormat::Toon => {
|
||||
let mut output = json!({
|
||||
"requested": description,
|
||||
"newly_downloaded": count,
|
||||
});
|
||||
if let Some(ref dir) = cache_dir {
|
||||
output["cache_dir"] = json!(dir.to_string_lossy());
|
||||
}
|
||||
println!(
|
||||
"{}",
|
||||
serde_toon::to_string(&output).context("Failed to serialize download results to TOON")?
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Execute the tree-sitter list command.
|
||||
///
|
||||
/// Lists available or downloaded tree-sitter languages, optionally filtering
|
||||
/// by a name substring.
|
||||
pub fn list_command(downloaded_only: bool, filter: Option<String>, format: WireFormat) -> Result<()> {
|
||||
let languages = if downloaded_only {
|
||||
tree_sitter_language_pack::downloaded_languages()
|
||||
} else {
|
||||
tree_sitter_language_pack::manifest_languages().context("Failed to fetch tree-sitter language manifest")?
|
||||
};
|
||||
|
||||
let filtered: Vec<&String> = if let Some(ref f) = filter {
|
||||
let lower = f.to_lowercase();
|
||||
languages.iter().filter(|l| l.to_lowercase().contains(&lower)).collect()
|
||||
} else {
|
||||
languages.iter().collect()
|
||||
};
|
||||
|
||||
let source = if downloaded_only { "downloaded" } else { "available" };
|
||||
|
||||
match format {
|
||||
WireFormat::Text => {
|
||||
println!(
|
||||
"{} ({} {}{})",
|
||||
style::header("Tree-sitter Languages"),
|
||||
filtered.len(),
|
||||
source,
|
||||
filter.as_ref().map(|f| format!(", filter: '{f}'")).unwrap_or_default()
|
||||
);
|
||||
println!("{}", style::dim("====================="));
|
||||
for lang in &filtered {
|
||||
println!(" {}", style::success(lang));
|
||||
}
|
||||
}
|
||||
WireFormat::Json => {
|
||||
let output = json!({
|
||||
"source": source,
|
||||
"count": filtered.len(),
|
||||
"filter": filter,
|
||||
"languages": filtered,
|
||||
});
|
||||
println!(
|
||||
"{}",
|
||||
serde_json::to_string_pretty(&output).context("Failed to serialize language list to JSON")?
|
||||
);
|
||||
}
|
||||
WireFormat::Toon => {
|
||||
let output = json!({
|
||||
"source": source,
|
||||
"count": filtered.len(),
|
||||
"filter": filter,
|
||||
"languages": filtered,
|
||||
});
|
||||
println!(
|
||||
"{}",
|
||||
serde_toon::to_string(&output).context("Failed to serialize language list to TOON")?
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Execute the tree-sitter cache-dir command.
|
||||
///
|
||||
/// Displays the effective cache directory for tree-sitter grammar parsers.
|
||||
pub fn cache_dir_command(format: WireFormat) -> Result<()> {
|
||||
let dir = tree_sitter_language_pack::cache_dir().context("Failed to determine tree-sitter cache directory")?;
|
||||
let dir_str = dir.to_string_lossy();
|
||||
|
||||
match format {
|
||||
WireFormat::Text => {
|
||||
println!("{} {}", style::label("Cache directory:"), style::success(&dir_str));
|
||||
}
|
||||
WireFormat::Json => {
|
||||
let output = json!({ "cache_dir": dir_str });
|
||||
println!(
|
||||
"{}",
|
||||
serde_json::to_string_pretty(&output).context("Failed to serialize cache directory to JSON")?
|
||||
);
|
||||
}
|
||||
WireFormat::Toon => {
|
||||
let output = json!({ "cache_dir": dir_str });
|
||||
println!(
|
||||
"{}",
|
||||
serde_toon::to_string(&output).context("Failed to serialize cache directory to TOON")?
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Execute the tree-sitter clean command.
|
||||
///
|
||||
/// Clears all cached tree-sitter grammar parser shared libraries.
|
||||
pub fn clean_command(format: WireFormat) -> Result<()> {
|
||||
tree_sitter_language_pack::clean_cache().context("Failed to clean tree-sitter cache")?;
|
||||
|
||||
match format {
|
||||
WireFormat::Text => {
|
||||
println!("{}", style::success("Tree-sitter cache cleared successfully"));
|
||||
}
|
||||
WireFormat::Json => {
|
||||
let output = json!({ "status": "cleared" });
|
||||
println!(
|
||||
"{}",
|
||||
serde_json::to_string_pretty(&output).context("Failed to serialize clean result to JSON")?
|
||||
);
|
||||
}
|
||||
WireFormat::Toon => {
|
||||
let output = json!({ "status": "cleared" });
|
||||
println!(
|
||||
"{}",
|
||||
serde_toon::to_string(&output).context("Failed to serialize clean result to TOON")?
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
238
crates/kreuzberg-cli/src/logging.rs
Normal file
238
crates/kreuzberg-cli/src/logging.rs
Normal file
@@ -0,0 +1,238 @@
|
||||
//! Logging helpers for the Kreuzberg CLI.
|
||||
//!
|
||||
//! Provides a [`build_env_filter`] function that layers default third-party
|
||||
//! transport suppressions on top of whatever the caller or `RUST_LOG` specifies.
|
||||
//! User-supplied per-target rules in `RUST_LOG` always win because
|
||||
//! [`build_env_filter`] skips a default suppression when its target already has a directive.
|
||||
|
||||
use tracing_subscriber::EnvFilter;
|
||||
|
||||
/// Third-party crates that are noisy at their own default level.
///
/// Each entry is a `target=level` directive. These are added as *fallback*
/// directives: if `RUST_LOG` or `level_override` already contain a per-target
/// rule for any of these crates it takes precedence, so the user can still do
/// `RUST_LOG=ureq=debug` to restore full transport logs.
///
/// Every entry must parse as a tracing directive; `build_env_filter` panics on
/// an entry that does not.
const QUIET_DIRECTIVES: &[&str] = &[
    "ureq=warn",
    "ureq_proto=warn",
    "rustls=warn",
    "hyper_util=warn",
    "hf_hub=info",
    "tower_http=info",
];
|
||||
|
||||
/// Return the target portion of a `target=level` directive such as `"ureq=warn"`.
///
/// The target is everything before the first `=`, returned untrimmed. A
/// directive without any `=` (for example a bare level like `"info"`) yields `None`.
fn directive_target(directive: &str) -> Option<&str> {
    let separator = directive.find('=')?;
    Some(&directive[..separator])
}
|
||||
|
||||
/// Build an [`EnvFilter`] with third-party transport crates suppressed by default.
|
||||
///
|
||||
/// Precedence (highest first):
|
||||
/// 1. Per-target directives already present in `RUST_LOG` (or `level_override`).
|
||||
/// 2. The [`QUIET_DIRECTIVES`] suppressions added here.
|
||||
/// 3. Root level: `level_override` → `RUST_LOG` → `"info"`.
|
||||
///
|
||||
/// Per-target directives that the user has already set are **not** overridden:
|
||||
/// we skip adding a quiet directive when the base filter already contains a
|
||||
/// rule for the same target crate. This is necessary because
|
||||
/// [`EnvFilter::add_directive`] appends rather than guards — a later-added
|
||||
/// per-target directive for the same crate takes precedence.
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// * `level_override` — explicit root-level string from a CLI flag (e.g. `"debug"`).
|
||||
/// When `Some`, it replaces `RUST_LOG` entirely for the root level.
|
||||
pub fn build_env_filter(level_override: Option<&str>) -> EnvFilter {
|
||||
// Use try_new on user input so a malformed --log-level falls back to info
|
||||
// instead of panicking the CLI.
|
||||
let base = level_override
|
||||
.and_then(|level| EnvFilter::try_new(level).ok())
|
||||
.or_else(|| EnvFilter::try_from_default_env().ok())
|
||||
.unwrap_or_else(|| EnvFilter::new("info"));
|
||||
|
||||
// Snapshot the existing directive set so we can skip quiet directives
|
||||
// whose target the user has already configured explicitly.
|
||||
let existing_targets: std::collections::HashSet<String> = base
|
||||
.to_string()
|
||||
.split(',')
|
||||
.filter_map(|chunk| directive_target(chunk).map(|t| t.trim().to_string()))
|
||||
.collect();
|
||||
|
||||
QUIET_DIRECTIVES
|
||||
.iter()
|
||||
.filter(|directive| {
|
||||
// Only add the quiet directive when no per-target rule for this
|
||||
// exact crate already exists. Word-boundary match via tokenized
|
||||
// target set avoids `hf_hub` colliding with `hf_hub_server`.
|
||||
directive_target(directive)
|
||||
.map(|target| !existing_targets.contains(target))
|
||||
.unwrap_or(true)
|
||||
})
|
||||
.fold(base, |filter, directive| {
|
||||
filter.add_directive(directive.parse().expect("built-in logging directive must be valid"))
|
||||
})
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
/// Parse the directive string from an EnvFilter for assertion-level checks.
|
||||
///
|
||||
/// `EnvFilter::to_string()` returns a comma-separated representation of all
|
||||
/// directives. We use this as a stable, public inspection surface.
|
||||
fn filter_directives(filter: &EnvFilter) -> String {
|
||||
filter.to_string()
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn default_filter_suppresses_ureq() {
|
||||
// No env, no override → ureq and ureq_proto must be suppressed.
|
||||
let filter = build_env_filter(None);
|
||||
let directives = filter_directives(&filter);
|
||||
assert!(
|
||||
directives.contains("ureq=warn"),
|
||||
"ureq=warn must be present in default filter; got: {directives}"
|
||||
);
|
||||
assert!(
|
||||
directives.contains("ureq_proto=warn"),
|
||||
"ureq_proto=warn must be present in default filter; got: {directives}"
|
||||
);
|
||||
assert!(
|
||||
directives.contains("rustls=warn"),
|
||||
"rustls=warn must be present in default filter; got: {directives}"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn default_filter_keeps_kreuzberg_info() {
|
||||
// Root level info → kreuzberg has no suppression applied.
|
||||
let filter = build_env_filter(None);
|
||||
let directives = filter_directives(&filter);
|
||||
assert!(
|
||||
!directives.contains("kreuzberg=warn") && !directives.contains("kreuzberg=error"),
|
||||
"kreuzberg must not be suppressed in the default filter; got: {directives}"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn env_override_wins_for_third_party() {
|
||||
// Simulate RUST_LOG=ureq=debug by passing it as the level_override.
|
||||
// build_env_filter must detect the existing ureq= directive and skip the
|
||||
// ureq=warn suppression, so ureq=debug survives in the final filter.
|
||||
let filter = build_env_filter(Some("info,ureq=debug"));
|
||||
let directives = filter.to_string();
|
||||
assert!(
|
||||
directives.contains("ureq=debug"),
|
||||
"user-supplied ureq=debug must be preserved; got: {directives}"
|
||||
);
|
||||
assert!(
|
||||
!directives.contains("ureq=warn"),
|
||||
"ureq=warn suppression must not be added when user already set ureq=debug; got: {directives}"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn level_override_wins() {
|
||||
// CLI flag "debug" → root must be debug; suppression directives still present.
|
||||
let filter = build_env_filter(Some("debug"));
|
||||
let directives = filter_directives(&filter);
|
||||
assert!(
|
||||
directives.contains("debug"),
|
||||
"root debug level must appear in filter with --log-level debug; got: {directives}"
|
||||
);
|
||||
// Suppression for ureq must still be layered on top.
|
||||
assert!(
|
||||
directives.contains("ureq=warn"),
|
||||
"ureq=warn suppression must still be present even under --log-level debug; got: {directives}"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn tower_http_suppressed_at_default() {
|
||||
// No override → tower_http must be suppressed.
|
||||
let filter = build_env_filter(None);
|
||||
let directives = filter_directives(&filter);
|
||||
assert!(
|
||||
directives.contains("tower_http=info") || directives.contains("tower_http=warn"),
|
||||
"tower_http must be suppressed at default level; got: {directives}"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn all_quiet_directives_are_valid() {
|
||||
// Ensure every built-in directive parses without panic.
|
||||
for directive in super::QUIET_DIRECTIVES {
|
||||
directive
|
||||
.parse::<tracing_subscriber::filter::Directive>()
|
||||
.expect("built-in directive is invalid");
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn no_level_override_uses_info_root() {
|
||||
// Without RUST_LOG set and no override, root should default to info.
|
||||
// The directive string must not open with debug or trace as the root level.
|
||||
let filter = build_env_filter(None);
|
||||
let directives = filter_directives(&filter);
|
||||
// Root "debug" or "trace" as the first token would mean root is debug/trace.
|
||||
let root_is_noisier_than_info = directives.starts_with("debug") || directives.starts_with("trace");
|
||||
assert!(
|
||||
!root_is_noisier_than_info,
|
||||
"default root level must not be debug/trace without RUST_LOG; got: {directives}"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn hf_hub_suppressed_at_default() {
|
||||
let filter = build_env_filter(None);
|
||||
let directives = filter_directives(&filter);
|
||||
assert!(
|
||||
directives.contains("hf_hub=info"),
|
||||
"hf_hub must be suppressed to info at default; got: {directives}"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn hyper_util_suppressed_at_default() {
|
||||
let filter = build_env_filter(None);
|
||||
let directives = filter_directives(&filter);
|
||||
assert!(
|
||||
directives.contains("hyper_util=warn"),
|
||||
"hyper_util must be suppressed to warn at default; got: {directives}"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
fn malformed_level_override_falls_back_to_info() {
    // A garbage `--log-level` value must not panic: building the filter has to
    // recover and still produce a usable filter.
    const GARBAGE_OVERRIDE: &str = ":::garbage";

    let recovered_filter = build_env_filter(Some(GARBAGE_OVERRIDE));
    let directives = filter_directives(&recovered_filter);

    // The quiet directives still being layered on proves the fallback path ran.
    assert!(
        directives.contains("ureq=warn"),
        "ureq=warn must still be present after malformed override; got: {directives}"
    );
}
|
||||
|
||||
#[test]
fn similar_target_name_does_not_block_suppression() {
    // Regression test for the earlier substring-containment bug: a user
    // directive for `hf_hub_server` shares a prefix with `hf_hub`, and must not
    // cause the built-in `hf_hub=info` suppression to be skipped.
    let user_override = "info,hf_hub_server=debug";
    let directives = build_env_filter(Some(user_override)).to_string();

    // The user's own directive is kept...
    assert!(
        directives.contains("hf_hub_server=debug"),
        "user directive for hf_hub_server must survive; got: {directives}"
    );
    // ...and the built-in suppression is applied alongside it.
    assert!(
        directives.contains("hf_hub=info"),
        "hf_hub=info suppression must still be applied; got: {directives}"
    );
}
|
||||
}
|
||||
971
crates/kreuzberg-cli/src/main.rs
Normal file
971
crates/kreuzberg-cli/src/main.rs
Normal file
@@ -0,0 +1,971 @@
|
||||
//! Kreuzberg CLI - Command-line interface for document intelligence.
|
||||
//!
|
||||
//! This binary provides a command-line interface to the Kreuzberg document intelligence
|
||||
//! library, supporting document extraction, MIME type detection, caching, and batch operations.
|
||||
//!
|
||||
//! # Architecture
|
||||
//!
|
||||
//! The CLI is built using `clap` for argument parsing and provides these main commands:
|
||||
//! - `extract`: Extract text/data from a single document
|
||||
//! - `batch`: Process multiple documents in parallel
|
||||
//! - `detect`: Identify MIME type of a file
|
||||
//! - `cache`: Manage cache (clear, stats)
|
||||
//! - `serve`: Start API server (requires `api` feature)
|
||||
//! - `version`: Show version information
|
||||
//!
|
||||
//! # Configuration
|
||||
//!
|
||||
//! The CLI supports configuration files in TOML, YAML, or JSON formats:
|
||||
//! - Explicit: `--config path/to/config.toml`
|
||||
//! - Auto-discovery: Searches for `kreuzberg.{toml,yaml,json}` in current and parent directories
|
||||
//! - Inline JSON: `--config-json '{"ocr": {"backend": "tesseract"}}'`
|
||||
//! - Command-line flags override config file settings
|
||||
//!
|
||||
//! Configuration precedence (highest to lowest):
|
||||
//! 1. Individual CLI flags (--output-format, --ocr, etc.)
|
||||
//! 2. Inline JSON config (--config-json or --config-json-base64)
|
||||
//! 3. Config file (--config path.toml)
|
||||
//! 4. Default values
|
||||
//!
|
||||
//! # Exit Codes
|
||||
//!
|
||||
//! - 0: Success
|
||||
//! - Non-zero: Error (see stderr for details)
|
||||
//!
|
||||
//! # Examples
|
||||
//!
|
||||
//! ```bash
|
||||
//! # Extract text from a PDF
|
||||
//! kreuzberg extract document.pdf
|
||||
//!
|
||||
//! # Extract with OCR enabled
|
||||
//! kreuzberg extract scanned.pdf --ocr true
|
||||
//!
|
||||
//! # Extract with inline JSON config
|
||||
//! kreuzberg extract doc.pdf --config-json '{"ocr":{"backend":"tesseract"}}'
|
||||
//!
|
||||
//! # Batch processing
|
||||
//! kreuzberg batch *.pdf --output-format json
|
||||
//!
|
||||
//! # Detect MIME type
|
||||
//! kreuzberg detect unknown-file.bin
|
||||
//! ```
|
||||
|
||||
#![deny(unsafe_code)]
|
||||
|
||||
mod commands;
|
||||
mod logging;
|
||||
mod output;
|
||||
mod style;
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use base64::{Engine as _, engine::general_purpose::STANDARD};
|
||||
use clap::{CommandFactory, Parser, Subcommand};
|
||||
#[cfg(feature = "embeddings")]
|
||||
use commands::embed_command;
|
||||
#[cfg(feature = "mcp")]
|
||||
use commands::mcp_command;
|
||||
use commands::overrides::ExtractionOverrides;
|
||||
#[cfg(feature = "api")]
|
||||
use commands::serve_command;
|
||||
use commands::{
|
||||
batch_command, chunk_command, clear_command, extract_command,
|
||||
extract_structured::{ExtractStructuredArgs, extract_structured_command},
|
||||
load_config, manifest_command, stats_command, warm_command,
|
||||
};
|
||||
use kreuzberg::{OutputFormat as ContentOutputFormat, detect_mime_type};
|
||||
use serde_json::json;
|
||||
use std::path::{Path, PathBuf};
|
||||
|
||||
/// Kreuzberg document intelligence CLI
|
||||
#[derive(Parser)]
|
||||
#[command(name = "kreuzberg")]
|
||||
#[command(version, about, long_about = None)]
|
||||
struct Cli {
|
||||
/// Set log level (trace, debug, info, warn, error). Overrides RUST_LOG env var.
|
||||
#[arg(long, global = true)]
|
||||
log_level: Option<String>,
|
||||
|
||||
#[command(subcommand)]
|
||||
command: Commands,
|
||||
}
|
||||
|
||||
#[derive(Subcommand)]
|
||||
enum Commands {
|
||||
/// Extract text from a document
|
||||
Extract {
|
||||
/// Path to the document
|
||||
path: PathBuf,
|
||||
|
||||
/// Path to config file (TOML, YAML, or JSON). If not specified, searches for kreuzberg.toml/yaml/json in current and parent directories.
|
||||
#[arg(short, long)]
|
||||
config: Option<PathBuf>,
|
||||
|
||||
/// Inline JSON configuration. Applied after config file but before individual flags.
|
||||
///
|
||||
/// Example: --config-json '{"ocr":{"backend":"tesseract"},"chunking":{"max_chars":1000}}'
|
||||
#[arg(long)]
|
||||
config_json: Option<String>,
|
||||
|
||||
/// Base64-encoded JSON configuration. Useful for shell environments where quotes are problematic.
|
||||
///
|
||||
/// Example: --config-json-base64 eyJvY3IiOnsiYmFja2VuZCI6InRlc3NlcmFjdCJ9fQ==
|
||||
#[arg(long)]
|
||||
config_json_base64: Option<String>,
|
||||
|
||||
/// MIME type hint (auto-detected if not provided)
|
||||
#[arg(short, long)]
|
||||
mime_type: Option<String>,
|
||||
|
||||
/// Output format for CLI results (text or json).
|
||||
///
|
||||
/// Controls how the CLI displays results, not the extraction content format.
|
||||
#[arg(short, long, default_value = "text")]
|
||||
format: WireFormat,
|
||||
|
||||
/// Extraction configuration overrides
|
||||
#[command(flatten)]
|
||||
overrides: ExtractionOverrides,
|
||||
},
|
||||
|
||||
/// Extract structured data from a document using an LLM
|
||||
ExtractStructured {
|
||||
/// Path to the document file
|
||||
path: PathBuf,
|
||||
|
||||
/// Path to JSON schema file defining the output structure
|
||||
#[arg(long)]
|
||||
schema: PathBuf,
|
||||
|
||||
/// LLM model (e.g., "openai/gpt-4o")
|
||||
#[arg(long)]
|
||||
model: String,
|
||||
|
||||
/// API key for the LLM provider
|
||||
#[arg(long)]
|
||||
api_key: Option<String>,
|
||||
|
||||
/// Custom Jinja2 prompt template
|
||||
#[arg(long)]
|
||||
prompt: Option<String>,
|
||||
|
||||
/// Schema name
|
||||
#[arg(long, default_value = "extraction")]
|
||||
schema_name: Option<String>,
|
||||
|
||||
/// Enable strict mode
|
||||
#[arg(long)]
|
||||
strict: bool,
|
||||
|
||||
/// Config file path
|
||||
#[arg(short, long)]
|
||||
config: Option<PathBuf>,
|
||||
|
||||
/// Output format (text or json)
|
||||
#[arg(short, long, default_value = "json")]
|
||||
format: WireFormat,
|
||||
},
|
||||
|
||||
/// Batch extract from multiple documents
|
||||
Batch {
|
||||
/// Paths to documents
|
||||
paths: Vec<PathBuf>,
|
||||
|
||||
/// Path to config file (TOML, YAML, or JSON). If not specified, searches for kreuzberg.toml/yaml/json in current and parent directories.
|
||||
#[arg(short, long)]
|
||||
config: Option<PathBuf>,
|
||||
|
||||
/// Inline JSON configuration. Applied after config file but before individual flags.
|
||||
///
|
||||
/// Example: --config-json '{"ocr":{"backend":"tesseract"},"chunking":{"max_chars":1000}}'
|
||||
#[arg(long)]
|
||||
config_json: Option<String>,
|
||||
|
||||
/// Base64-encoded JSON configuration. Useful for shell environments where quotes are problematic.
|
||||
///
|
||||
/// Example: --config-json-base64 eyJvY3IiOnsiYmFja2VuZCI6InRlc3NlcmFjdCJ9fQ==
|
||||
#[arg(long)]
|
||||
config_json_base64: Option<String>,
|
||||
|
||||
/// Output format for CLI results (text or json).
|
||||
///
|
||||
/// Controls how the CLI displays results, not the extraction content format.
|
||||
#[arg(short, long, default_value = "json")]
|
||||
format: WireFormat,
|
||||
|
||||
/// Extraction configuration overrides
|
||||
#[command(flatten)]
|
||||
overrides: ExtractionOverrides,
|
||||
|
||||
/// Path to a JSON file mapping file paths to per-file extraction config overrides.
|
||||
/// The JSON should be an object where keys are file paths and values are FileExtractionConfig objects.
|
||||
/// Example: {"doc1.pdf": {"force_ocr": true}, "doc2.pdf": {"output_format": "markdown"}}
|
||||
#[arg(long)]
|
||||
file_configs: Option<PathBuf>,
|
||||
},
|
||||
|
||||
/// Detect MIME type of a file
|
||||
Detect {
|
||||
/// Path to the file
|
||||
path: PathBuf,
|
||||
|
||||
/// Output format (text or json)
|
||||
#[arg(short, long, default_value = "text")]
|
||||
format: WireFormat,
|
||||
},
|
||||
|
||||
/// List all supported document formats
|
||||
Formats {
|
||||
/// Output format (text or json)
|
||||
#[arg(short, long, default_value = "text")]
|
||||
format: WireFormat,
|
||||
},
|
||||
|
||||
/// Show version information
|
||||
Version {
|
||||
/// Output format (text or json)
|
||||
#[arg(short, long, default_value = "text")]
|
||||
format: WireFormat,
|
||||
},
|
||||
|
||||
/// Cache management operations
|
||||
Cache {
|
||||
#[command(subcommand)]
|
||||
command: CacheCommands,
|
||||
},
|
||||
|
||||
/// Start the API server
|
||||
///
|
||||
/// Configuration is loaded with the following precedence (highest to lowest):
|
||||
/// 1. CLI arguments (--host, --port)
|
||||
/// 2. Environment variables (KREUZBERG_HOST, KREUZBERG_PORT)
|
||||
/// 3. Config file (TOML, YAML, or JSON)
|
||||
/// 4. Built-in defaults (127.0.0.1:8000)
|
||||
///
|
||||
/// The config file can contain both extraction and server settings under [server] section.
|
||||
#[cfg(feature = "api")]
|
||||
Serve {
|
||||
/// Host to bind to (e.g., "127.0.0.1" or "0.0.0.0"). CLI arg overrides config file and env vars.
|
||||
#[arg(short = 'H', long)]
|
||||
host: Option<String>,
|
||||
|
||||
/// Port to bind to. CLI arg overrides config file and env vars.
|
||||
#[arg(short, long)]
|
||||
port: Option<u16>,
|
||||
|
||||
/// Path to config file (TOML, YAML, or JSON). If not specified, searches for kreuzberg.toml/yaml/json in current and parent directories.
|
||||
#[arg(short, long)]
|
||||
config: Option<PathBuf>,
|
||||
},
|
||||
|
||||
/// Start the MCP (Model Context Protocol) server
|
||||
#[cfg(feature = "mcp")]
|
||||
Mcp {
|
||||
/// Path to config file (TOML, YAML, or JSON). If not specified, searches for kreuzberg.toml/yaml/json in current and parent directories.
|
||||
#[arg(short, long)]
|
||||
config: Option<PathBuf>,
|
||||
|
||||
/// Transport mode: stdio (default) or http
|
||||
#[arg(long, default_value = "stdio")]
|
||||
transport: String,
|
||||
|
||||
/// HTTP host (only for --transport http)
|
||||
#[arg(long, default_value = "127.0.0.1")]
|
||||
host: String,
|
||||
|
||||
/// HTTP port (only for --transport http)
|
||||
#[arg(long, default_value = "8001")]
|
||||
port: u16,
|
||||
},
|
||||
|
||||
/// API utilities
|
||||
#[cfg(feature = "api")]
|
||||
Api {
|
||||
#[command(subcommand)]
|
||||
command: ApiCommands,
|
||||
},
|
||||
|
||||
/// Generate embeddings for text
|
||||
///
|
||||
/// Generates vector embeddings for one or more text inputs using a specified preset model
|
||||
/// or an LLM provider. Reads from --text flag or stdin if no text is provided.
|
||||
#[cfg(feature = "embeddings")]
|
||||
Embed {
|
||||
/// Text to embed. Can be specified multiple times for batch embedding.
|
||||
#[arg(long)]
|
||||
text: Vec<String>,
|
||||
|
||||
/// Embedding preset (fast, balanced, quality, multilingual). Used with --provider local.
|
||||
#[arg(long, default_value = "balanced")]
|
||||
preset: String,
|
||||
|
||||
/// Embedding provider: "local" (default, ONNX), "llm" (liter-llm), or "plugin" (registered in-process backend)
|
||||
#[arg(long, default_value = "local")]
|
||||
provider: String,
|
||||
|
||||
/// LLM model for provider-hosted embeddings (e.g., "openai/text-embedding-3-small").
|
||||
/// Required when --provider is "llm".
|
||||
#[arg(long)]
|
||||
model: Option<String>,
|
||||
|
||||
/// API key for the LLM provider
|
||||
#[arg(long)]
|
||||
api_key: Option<String>,
|
||||
|
||||
/// Name of a pre-registered in-process embedding backend.
|
||||
/// Required when --provider is "plugin". The backend must have been
|
||||
/// registered via `kreuzberg::plugins::register_embedding_backend`
|
||||
/// before this command runs.
|
||||
#[arg(long)]
|
||||
plugin: Option<String>,
|
||||
|
||||
/// Output format (text or json)
|
||||
#[arg(short, long, default_value = "json")]
|
||||
format: WireFormat,
|
||||
},
|
||||
|
||||
/// Chunk text for processing
|
||||
///
|
||||
/// Splits text into chunks using configurable size and overlap.
|
||||
/// Reads from --text flag or stdin if no text is provided.
|
||||
Chunk {
|
||||
/// Text to chunk. If not provided, reads from stdin.
|
||||
#[arg(long)]
|
||||
text: Option<String>,
|
||||
|
||||
/// Path to config file (TOML, YAML, or JSON)
|
||||
#[arg(short, long)]
|
||||
config: Option<PathBuf>,
|
||||
|
||||
/// Chunk size in characters
|
||||
#[arg(long)]
|
||||
chunk_size: Option<usize>,
|
||||
|
||||
/// Chunk overlap in characters
|
||||
#[arg(long)]
|
||||
chunk_overlap: Option<usize>,
|
||||
|
||||
/// Chunker type: text, markdown, yaml, or semantic
|
||||
#[arg(long, default_value = "text")]
|
||||
chunker_type: String,
|
||||
|
||||
/// Tokenizer model for token-based chunk sizing (e.g., "Xenova/gpt-4o").
|
||||
/// Requires the chunking-tokenizers feature.
|
||||
#[arg(long)]
|
||||
chunking_tokenizer: Option<String>,
|
||||
|
||||
/// Topic threshold for semantic chunking (0.0-1.0, default: 0.75)
|
||||
#[arg(long)]
|
||||
topic_threshold: Option<f32>,
|
||||
|
||||
/// Output format (text or json)
|
||||
#[arg(short, long, default_value = "json")]
|
||||
format: WireFormat,
|
||||
},
|
||||
|
||||
/// Generate shell completions
|
||||
///
|
||||
/// Outputs shell completion scripts for the specified shell.
|
||||
/// Install with: eval "$(kreuzberg completions bash)"
|
||||
Completions {
|
||||
/// Shell to generate completions for
|
||||
#[arg(value_enum)]
|
||||
shell: clap_complete::Shell,
|
||||
},
|
||||
}
|
||||
|
||||
// Subcommands of `kreuzberg api`; compiled only with the `api` feature.
// The `///` text on the variant is rendered by clap as its help output.
#[cfg(feature = "api")]
#[derive(Subcommand)]
enum ApiCommands {
    /// Output the OpenAPI schema (JSON)
    ///
    /// Prints the full OpenAPI 3.1 specification for the kreuzberg REST API.
    /// Useful for code generation, documentation, and API client tooling.
    Schema,
}
|
||||
|
||||
#[derive(Subcommand)]
|
||||
enum CacheCommands {
|
||||
/// Show cache statistics
|
||||
Stats {
|
||||
/// Cache directory (default: .kreuzberg in current directory)
|
||||
#[arg(short, long)]
|
||||
cache_dir: Option<PathBuf>,
|
||||
|
||||
/// Output format (text or json)
|
||||
#[arg(short, long, default_value = "text")]
|
||||
format: WireFormat,
|
||||
},
|
||||
|
||||
/// Clear the cache
|
||||
Clear {
|
||||
/// Cache directory (default: .kreuzberg in current directory)
|
||||
#[arg(short, long)]
|
||||
cache_dir: Option<PathBuf>,
|
||||
|
||||
/// Output format (text or json)
|
||||
#[arg(short, long, default_value = "text")]
|
||||
format: WireFormat,
|
||||
},
|
||||
|
||||
/// Output model manifest (expected model files, checksums, sizes)
|
||||
///
|
||||
/// Outputs a JSON manifest of all model files required by kreuzberg,
|
||||
/// including their relative paths, SHA256 checksums, and sizes.
|
||||
/// Used for pre-populating model caches in containerized deployments.
|
||||
Manifest {
|
||||
/// Output format (text or json)
|
||||
#[arg(short, long, default_value = "json")]
|
||||
format: WireFormat,
|
||||
},
|
||||
|
||||
/// Download all models eagerly
|
||||
///
|
||||
/// Downloads all PaddleOCR and layout detection models for all supported
|
||||
/// languages. Unlike normal operation which downloads lazily on first use,
|
||||
/// this ensures all models are present in the cache directory.
|
||||
///
|
||||
/// Use --all-embeddings to also download all 4 embedding model presets,
|
||||
/// or --embedding-model <preset> to download a specific one.
|
||||
///
|
||||
/// By default, only the core layout models (rtdetr + tatr) are downloaded.
|
||||
/// Use --all-table-models to also download SLANeXT variants (~730MB).
|
||||
Warm {
|
||||
/// Cache directory (default: .kreuzberg in current directory, or KREUZBERG_CACHE_DIR)
|
||||
#[arg(short, long)]
|
||||
cache_dir: Option<PathBuf>,
|
||||
|
||||
/// Output format (text or json)
|
||||
#[arg(short, long, default_value = "text")]
|
||||
format: WireFormat,
|
||||
|
||||
/// Download all embedding model presets (fast, balanced, quality, multilingual)
|
||||
#[arg(long)]
|
||||
all_embeddings: bool,
|
||||
|
||||
/// Download a specific embedding model preset
|
||||
#[arg(long, value_name = "PRESET")]
|
||||
embedding_model: Option<String>,
|
||||
|
||||
/// Download all table structure models including SLANeXT variants (~730MB)
|
||||
#[arg(
|
||||
long,
|
||||
help = "Download all table structure models including SLANeXT variants (~730MB)"
|
||||
)]
|
||||
all_table_models: bool,
|
||||
|
||||
/// Download all tree-sitter grammar parsers
|
||||
#[arg(long)]
|
||||
all_grammars: bool,
|
||||
|
||||
/// Download specific tree-sitter grammar groups (comma-separated: web,systems,scripting,data,jvm,functional)
|
||||
#[arg(long, value_name = "GROUPS", value_delimiter = ',')]
|
||||
grammar_groups: Option<Vec<String>>,
|
||||
|
||||
/// Download specific tree-sitter grammars by language name (comma-separated)
|
||||
#[arg(long, value_name = "LANGUAGES", value_delimiter = ',')]
|
||||
grammars: Option<Vec<String>>,
|
||||
},
|
||||
}
|
||||
|
||||
/// How the CLI renders its own results on stdout.
///
/// Parsed from the `--format` flag through the [`std::str::FromStr`] impl below.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum WireFormat {
    /// Selected by the name `text`.
    Text,
    /// Selected by the name `json`.
    Json,
    /// Selected by the name `toon`.
    Toon,
}

impl std::str::FromStr for WireFormat {
    type Err = String;

    /// Parses a format name, ignoring letter case.
    ///
    /// # Errors
    ///
    /// Returns a message naming the rejected input and the accepted names when
    /// `s` is not one of `text`, `json` or `toon`.
    fn from_str(s: &str) -> std::result::Result<Self, Self::Err> {
        // Name → variant table; the first case-insensitive match wins.
        const KNOWN_FORMATS: [(&str, WireFormat); 3] = [
            ("text", WireFormat::Text),
            ("json", WireFormat::Json),
            ("toon", WireFormat::Toon),
        ];

        KNOWN_FORMATS
            .into_iter()
            .find(|(name, _)| s.eq_ignore_ascii_case(name))
            .map(|(_, format)| format)
            .ok_or_else(|| format!("Invalid format: {}. Use 'text', 'json', or 'toon'", s))
    }
}
|
||||
|
||||
/// Content output format for extraction results.
|
||||
///
|
||||
/// Controls the format of the extracted content (not the CLI output format).
|
||||
#[derive(Clone, Copy, Debug, PartialEq, Eq, clap::ValueEnum)]
|
||||
enum ContentOutputFormatArg {
|
||||
/// Plain text (default)
|
||||
Plain,
|
||||
/// Markdown format
|
||||
Markdown,
|
||||
/// Djot markup format
|
||||
Djot,
|
||||
/// HTML format
|
||||
Html,
|
||||
/// JSON tree format with heading-driven sections
|
||||
Json,
|
||||
}
|
||||
|
||||
impl From<ContentOutputFormatArg> for ContentOutputFormat {
|
||||
fn from(arg: ContentOutputFormatArg) -> Self {
|
||||
match arg {
|
||||
ContentOutputFormatArg::Plain => ContentOutputFormat::Plain,
|
||||
ContentOutputFormatArg::Markdown => ContentOutputFormat::Markdown,
|
||||
ContentOutputFormatArg::Djot => ContentOutputFormat::Djot,
|
||||
ContentOutputFormatArg::Html => ContentOutputFormat::Html,
|
||||
ContentOutputFormatArg::Json => ContentOutputFormat::Json,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Validates that a file exists and is accessible.
|
||||
///
|
||||
/// Checks that the path exists in the filesystem and points to a regular file
|
||||
/// (not a directory or special file). Provides user-friendly error messages if validation fails.
|
||||
///
|
||||
/// # Errors
|
||||
///
|
||||
/// Returns an error if:
|
||||
/// - The path does not exist in the filesystem
|
||||
/// - The path exists but is not a regular file (e.g., is a directory)
|
||||
fn validate_file_exists(path: &Path) -> Result<()> {
|
||||
if !path.exists() {
|
||||
anyhow::bail!(
|
||||
"File not found: '{}'. Please check that the file exists and is accessible.",
|
||||
path.display()
|
||||
);
|
||||
}
|
||||
if !path.is_file() {
|
||||
anyhow::bail!(
|
||||
"Path is not a file: '{}'. Please provide a path to a regular file.",
|
||||
path.display()
|
||||
);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Validates chunking parameters for correctness.
|
||||
///
|
||||
/// Ensures that chunking configuration makes sense: size must be positive and reasonable,
|
||||
/// and overlap must be smaller than chunk size. This prevents common configuration errors
|
||||
/// that would lead to cryptic failures from the underlying library.
|
||||
///
|
||||
/// # Errors
|
||||
///
|
||||
/// Returns an error if:
|
||||
/// - `chunk_size` is 0 (must be at least 1 character)
|
||||
/// - `chunk_size` exceeds 1,000,000 characters (to prevent excessive memory usage)
|
||||
/// - `chunk_overlap` is greater than or equal to `chunk_size` (overlap must be smaller)
|
||||
fn validate_chunk_params(chunk_size: Option<usize>, chunk_overlap: Option<usize>) -> Result<()> {
|
||||
if let Some(size) = chunk_size {
|
||||
if size == 0 {
|
||||
anyhow::bail!("Invalid chunk size: {}. Chunk size must be greater than 0.", size);
|
||||
}
|
||||
if size > 1_000_000 {
|
||||
anyhow::bail!(
|
||||
"Invalid chunk size: {}. Chunk size must be less than 1,000,000 characters to avoid excessive memory usage.",
|
||||
size
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
if let Some(overlap) = chunk_overlap
|
||||
&& let Some(size) = chunk_size
|
||||
&& overlap >= size
|
||||
{
|
||||
anyhow::bail!(
|
||||
"Invalid chunk overlap: {}. Overlap ({}) must be less than chunk size ({}).",
|
||||
overlap,
|
||||
overlap,
|
||||
size
|
||||
);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Validates batch extraction paths for correctness.
|
||||
///
|
||||
/// Ensures that at least one file path is provided and that all paths point to valid,
|
||||
/// accessible files. This prevents processing empty batches or failing mid-batch due
|
||||
/// to invalid paths.
|
||||
///
|
||||
/// # Errors
|
||||
///
|
||||
/// Returns an error if:
|
||||
/// - The paths array is empty (at least one file is required)
|
||||
/// - Any path does not exist or is not a regular file
|
||||
fn validate_batch_paths(paths: &[PathBuf]) -> Result<()> {
|
||||
if paths.is_empty() {
|
||||
anyhow::bail!("No files provided for batch extraction. Please provide at least one file path.");
|
||||
}
|
||||
|
||||
for (i, path) in paths.iter().enumerate() {
|
||||
validate_file_exists(path).with_context(|| format!("Invalid file at position {}", i + 1))?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Apply inline JSON or base64 JSON overrides to an extraction config.
|
||||
fn apply_json_overrides(
|
||||
config: &mut kreuzberg::ExtractionConfig,
|
||||
config_json: Option<String>,
|
||||
config_json_base64: Option<String>,
|
||||
) -> Result<()> {
|
||||
if let Some(json_str) = config_json {
|
||||
let json_value: serde_json::Value =
|
||||
serde_json::from_str(&json_str).context("Failed to parse --config-json as JSON")?;
|
||||
*config =
|
||||
merge_json_into_config(config, json_value).context("Failed to merge --config-json with file config")?;
|
||||
} else if let Some(base64_str) = config_json_base64 {
|
||||
let json_bytes = STANDARD
|
||||
.decode(&base64_str)
|
||||
.context("Failed to decode base64 in --config-json-base64")?;
|
||||
let json_str = String::from_utf8(json_bytes).context("Base64-decoded content is not valid UTF-8")?;
|
||||
let json_value: serde_json::Value =
|
||||
serde_json::from_str(&json_str).context("Failed to parse decoded --config-json-base64 as JSON")?;
|
||||
*config = merge_json_into_config(config, json_value)
|
||||
.context("Failed to merge --config-json-base64 with file config")?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Merges a JSON value into an existing extraction config via field-by-field override.
|
||||
fn merge_json_into_config(
|
||||
base_config: &kreuzberg::ExtractionConfig,
|
||||
json_value: serde_json::Value,
|
||||
) -> Result<kreuzberg::ExtractionConfig> {
|
||||
let json_str = serde_json::to_string(&json_value).map_err(|e| anyhow::anyhow!("{}", e))?;
|
||||
kreuzberg::core::config::merge::merge_config_json(base_config, &json_str).map_err(|e| anyhow::anyhow!("{}", e))
|
||||
}
|
||||
|
||||
fn main() -> Result<()> {
|
||||
let cli = Cli::parse();
|
||||
|
||||
let env_filter = logging::build_env_filter(cli.log_level.as_deref());
|
||||
|
||||
let _ = tracing_subscriber::fmt()
|
||||
.with_env_filter(env_filter)
|
||||
.with_writer(std::io::stderr)
|
||||
.try_init();
|
||||
|
||||
match cli.command {
|
||||
Commands::Extract {
|
||||
path,
|
||||
config: config_path,
|
||||
config_json,
|
||||
config_json_base64,
|
||||
mime_type,
|
||||
format,
|
||||
overrides,
|
||||
} => {
|
||||
validate_file_exists(&path)?;
|
||||
overrides.validate()?;
|
||||
|
||||
let mut config = load_config(config_path)?;
|
||||
apply_json_overrides(&mut config, config_json, config_json_base64)?;
|
||||
overrides.apply(&mut config);
|
||||
|
||||
extract_command(path, config, mime_type, format)?;
|
||||
}
|
||||
|
||||
Commands::ExtractStructured {
|
||||
path,
|
||||
schema,
|
||||
model,
|
||||
api_key,
|
||||
prompt,
|
||||
schema_name,
|
||||
strict,
|
||||
config,
|
||||
format,
|
||||
} => {
|
||||
validate_file_exists(&path)?;
|
||||
validate_file_exists(&schema)?;
|
||||
extract_structured_command(ExtractStructuredArgs {
|
||||
path,
|
||||
schema_path: schema,
|
||||
model,
|
||||
api_key,
|
||||
prompt,
|
||||
schema_name,
|
||||
strict,
|
||||
config_path: config,
|
||||
format,
|
||||
})?;
|
||||
}
|
||||
|
||||
Commands::Batch {
|
||||
paths,
|
||||
config: config_path,
|
||||
config_json,
|
||||
config_json_base64,
|
||||
format,
|
||||
overrides,
|
||||
file_configs,
|
||||
} => {
|
||||
validate_batch_paths(&paths)?;
|
||||
overrides.validate()?;
|
||||
|
||||
let mut config = load_config(config_path)?;
|
||||
apply_json_overrides(&mut config, config_json, config_json_base64)?;
|
||||
overrides.apply(&mut config);
|
||||
|
||||
let file_configs_map = if let Some(file_configs_path) = file_configs {
|
||||
let file_configs_json = std::fs::read_to_string(&file_configs_path)
|
||||
.with_context(|| format!("Failed to read file configs from '{}'", file_configs_path.display()))?;
|
||||
let map: std::collections::HashMap<String, serde_json::Value> =
|
||||
serde_json::from_str(&file_configs_json).with_context(|| {
|
||||
format!(
|
||||
"Failed to parse file configs JSON from '{}'",
|
||||
file_configs_path.display()
|
||||
)
|
||||
})?;
|
||||
Some(map)
|
||||
} else {
|
||||
None
|
||||
};
|
||||
batch_command(paths, file_configs_map, config, format)?;
|
||||
}
|
||||
|
||||
Commands::Detect { path, format } => {
|
||||
validate_file_exists(&path)?;
|
||||
|
||||
let path_str = path.to_string_lossy().to_string();
|
||||
let mime_type = detect_mime_type(path_str.clone(), true).with_context(|| {
|
||||
format!(
|
||||
"Failed to detect MIME type for file '{}'. Ensure the file is readable.",
|
||||
path.display()
|
||||
)
|
||||
})?;
|
||||
|
||||
match format {
|
||||
WireFormat::Text => {
|
||||
println!("{}", style::success(&mime_type));
|
||||
}
|
||||
WireFormat::Json => {
|
||||
let output = json!({
|
||||
"path": path_str,
|
||||
"mime_type": mime_type,
|
||||
});
|
||||
println!(
|
||||
"{}",
|
||||
serde_json::to_string_pretty(&output)
|
||||
.context("Failed to serialize MIME type detection result to JSON")?
|
||||
);
|
||||
}
|
||||
WireFormat::Toon => {
|
||||
let output = json!({
|
||||
"path": path_str,
|
||||
"mime_type": mime_type,
|
||||
});
|
||||
println!(
|
||||
"{}",
|
||||
serde_toon::to_string(&output)
|
||||
.context("Failed to serialize MIME type detection result to TOON")?
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Commands::Formats { format } => {
|
||||
let formats = kreuzberg::core::mime::list_supported_formats();
|
||||
match format {
|
||||
WireFormat::Text => {
|
||||
println!("{:<15} {}", style::label("EXTENSION"), style::label("MIME TYPE"));
|
||||
println!("{}", style::dim(&format!("{:<15} ---------", "---------")));
|
||||
for f in &formats {
|
||||
println!("{:<15} {}", style::success(&format!(".{}", f.extension)), f.mime_type);
|
||||
}
|
||||
}
|
||||
WireFormat::Json => {
|
||||
println!(
|
||||
"{}",
|
||||
serde_json::to_string_pretty(&formats).context("Failed to serialize formats to JSON")?
|
||||
);
|
||||
}
|
||||
WireFormat::Toon => {
|
||||
println!(
|
||||
"{}",
|
||||
serde_toon::to_string(&formats).context("Failed to serialize formats to TOON")?
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Commands::Version { format } => {
|
||||
let version = env!("CARGO_PKG_VERSION");
|
||||
let name = env!("CARGO_PKG_NAME");
|
||||
|
||||
match format {
|
||||
WireFormat::Text => {
|
||||
println!("{} {}", style::label(name), style::success(version));
|
||||
}
|
||||
WireFormat::Json => {
|
||||
let output = json!({
|
||||
"name": name,
|
||||
"version": version,
|
||||
});
|
||||
println!(
|
||||
"{}",
|
||||
serde_json::to_string_pretty(&output)
|
||||
.context("Failed to serialize version information to JSON")?
|
||||
);
|
||||
}
|
||||
WireFormat::Toon => {
|
||||
let output = json!({
|
||||
"name": name,
|
||||
"version": version,
|
||||
});
|
||||
println!(
|
||||
"{}",
|
||||
serde_toon::to_string(&output).context("Failed to serialize version information to TOON")?
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(feature = "api")]
|
||||
Commands::Serve {
|
||||
host: cli_host,
|
||||
port: cli_port,
|
||||
config: config_path,
|
||||
} => {
|
||||
let mut extraction_config = load_config(config_path.clone())?;
|
||||
extraction_config.apply_env_overrides()?;
|
||||
serve_command(cli_host, cli_port, extraction_config, config_path)?;
|
||||
}
|
||||
|
||||
#[cfg(feature = "mcp")]
|
||||
Commands::Mcp {
|
||||
config: config_path,
|
||||
transport,
|
||||
#[cfg(feature = "mcp-http")]
|
||||
host,
|
||||
#[cfg(feature = "mcp-http")]
|
||||
port,
|
||||
#[cfg(not(feature = "mcp-http"))]
|
||||
host,
|
||||
#[cfg(not(feature = "mcp-http"))]
|
||||
port,
|
||||
} => {
|
||||
let mut config = load_config(config_path)?;
|
||||
config.apply_env_overrides()?;
|
||||
mcp_command(config, transport, host, port)?;
|
||||
}
|
||||
|
||||
Commands::Cache { command } => match command {
|
||||
CacheCommands::Stats { cache_dir, format } => {
|
||||
stats_command(cache_dir, format)?;
|
||||
}
|
||||
CacheCommands::Clear { cache_dir, format } => {
|
||||
clear_command(cache_dir, format)?;
|
||||
}
|
||||
CacheCommands::Manifest { format } => {
|
||||
manifest_command(format)?;
|
||||
}
|
||||
CacheCommands::Warm {
|
||||
cache_dir,
|
||||
format,
|
||||
all_embeddings,
|
||||
embedding_model,
|
||||
all_table_models,
|
||||
all_grammars,
|
||||
grammar_groups,
|
||||
grammars,
|
||||
} => {
|
||||
warm_command(
|
||||
cache_dir,
|
||||
format,
|
||||
all_embeddings,
|
||||
embedding_model,
|
||||
all_table_models,
|
||||
all_grammars,
|
||||
grammar_groups,
|
||||
grammars,
|
||||
)?;
|
||||
}
|
||||
},
|
||||
|
||||
#[cfg(feature = "api")]
|
||||
Commands::Api { command } => match command {
|
||||
ApiCommands::Schema => {
|
||||
println!("{}", kreuzberg::api::openapi::openapi_json());
|
||||
}
|
||||
},
|
||||
|
||||
#[cfg(feature = "embeddings")]
|
||||
Commands::Embed {
|
||||
text,
|
||||
preset,
|
||||
provider,
|
||||
model,
|
||||
api_key,
|
||||
plugin,
|
||||
format,
|
||||
} => {
|
||||
let texts = if text.is_empty() {
|
||||
vec![commands::read_stdin()?]
|
||||
} else {
|
||||
text
|
||||
};
|
||||
embed_command(texts, &preset, &provider, model, api_key, plugin, format)?;
|
||||
}
|
||||
|
||||
Commands::Chunk {
|
||||
text,
|
||||
config: config_path,
|
||||
chunk_size,
|
||||
chunk_overlap,
|
||||
chunker_type,
|
||||
chunking_tokenizer,
|
||||
topic_threshold,
|
||||
format,
|
||||
} => {
|
||||
let input = match text {
|
||||
Some(t) => t,
|
||||
None => commands::read_stdin().context("No --text provided and failed to read from stdin")?,
|
||||
};
|
||||
|
||||
validate_chunk_params(chunk_size, chunk_overlap)?;
|
||||
|
||||
let base_config = load_config(config_path)?;
|
||||
let mut chunking_config = base_config.chunking.unwrap_or_default();
|
||||
|
||||
if let Some(size) = chunk_size {
|
||||
chunking_config.max_characters = size;
|
||||
// If user set chunk_size but not overlap, clamp overlap to fit
|
||||
if chunk_overlap.is_none() && chunking_config.overlap >= size {
|
||||
chunking_config.overlap = size / 4;
|
||||
}
|
||||
}
|
||||
if let Some(overlap) = chunk_overlap {
|
||||
chunking_config.overlap = overlap;
|
||||
}
|
||||
match chunker_type.as_str() {
|
||||
"markdown" => chunking_config.chunker_type = kreuzberg::ChunkerType::Markdown,
|
||||
"yaml" => chunking_config.chunker_type = kreuzberg::ChunkerType::Yaml,
|
||||
"semantic" => chunking_config.chunker_type = kreuzberg::ChunkerType::Semantic,
|
||||
_ => chunking_config.chunker_type = kreuzberg::ChunkerType::Text,
|
||||
}
|
||||
if let Some(ref tokenizer) = chunking_tokenizer {
|
||||
chunking_config.sizing = kreuzberg::ChunkSizing::Tokenizer {
|
||||
model: tokenizer.clone(),
|
||||
cache_dir: None,
|
||||
};
|
||||
}
|
||||
if topic_threshold.is_some() {
|
||||
chunking_config.topic_threshold = topic_threshold;
|
||||
}
|
||||
|
||||
chunk_command(input, chunking_config, format)?;
|
||||
}
|
||||
|
||||
Commands::Completions { shell } => {
|
||||
let mut cmd = Cli::command();
|
||||
clap_complete::generate(shell, &mut cmd, "kreuzberg", &mut std::io::stdout());
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
32
crates/kreuzberg-cli/src/output.rs
Normal file
32
crates/kreuzberg-cli/src/output.rs
Normal file
@@ -0,0 +1,32 @@
|
||||
//! JSON envelope types for CLI output.
|
||||
//!
|
||||
//! When `--format json` is used, extraction results are wrapped in these envelopes
|
||||
//! so tooling (such as the benchmark harness) can read timing information without
|
||||
//! parsing stderr or running a separate profiling tool.
|
||||
|
||||
use kreuzberg::ExtractionResult;
|
||||
use serde::Serialize;
|
||||
|
||||
/// Single-file extraction result with wall-clock timing.
|
||||
///
|
||||
/// Emitted to stdout by `kreuzberg extract --format json`.
|
||||
#[derive(Debug, Serialize)]
|
||||
pub struct ExtractEnvelope {
|
||||
/// The extraction result (content, metadata, tables, …).
|
||||
pub result: ExtractionResult,
|
||||
/// Wall-clock time for the extraction call in milliseconds.
|
||||
pub extraction_time_ms: f64,
|
||||
}
|
||||
|
||||
/// Batch extraction results with per-file and total timing.
|
||||
///
|
||||
/// Emitted to stdout by `kreuzberg batch --format json`.
|
||||
#[derive(Debug, Serialize)]
|
||||
pub struct BatchEnvelope {
|
||||
/// One result per input file, in input order.
|
||||
pub results: Vec<ExtractionResult>,
|
||||
/// Total wall-clock time for the whole batch in milliseconds.
|
||||
pub total_ms: f64,
|
||||
/// Per-file wall-clock times in milliseconds, aligned with `results`.
|
||||
pub per_file_ms: Vec<f64>,
|
||||
}
|
||||
104
crates/kreuzberg-cli/src/style.rs
Normal file
104
crates/kreuzberg-cli/src/style.rs
Normal file
@@ -0,0 +1,104 @@
|
||||
//! CLI color styling helpers using `anstyle`.
|
||||
//!
|
||||
//! Provides styled output for the kreuzberg CLI. Respects the `NO_COLOR`
//! environment variable (<https://no-color.org/>).
//!
//! Note: this module does not currently check whether output is a terminal;
//! `NO_COLOR` is the only signal that disables colors.
|
||||
|
||||
use anstyle::{AnsiColor, Effects, Style};
|
||||
use std::sync::OnceLock;
|
||||
|
||||
/// Bold blue for section headers.
|
||||
const HEADER: Style = Style::new()
|
||||
.fg_color(Some(anstyle::Color::Ansi(AnsiColor::Blue)))
|
||||
.effects(Effects::BOLD);
|
||||
|
||||
/// Green for success values (MIME types, file paths, versions).
|
||||
const SUCCESS: Style = Style::new().fg_color(Some(anstyle::Color::Ansi(AnsiColor::Green)));
|
||||
|
||||
/// Dim for metadata, separators, secondary info.
|
||||
const DIM: Style = Style::new().effects(Effects::DIMMED);
|
||||
|
||||
/// Bold for labels in key-value pairs.
|
||||
const LABEL: Style = Style::new().effects(Effects::BOLD);
|
||||
|
||||
/// Report whether colored output is enabled.
///
/// Returns `false` when the `NO_COLOR` environment variable is set, whatever
/// its value, and `true` otherwise. The environment is inspected on the first
/// call only; later calls return the cached answer.
///
/// See <https://no-color.org/> for the specification.
pub fn is_color_enabled() -> bool {
    static ENABLED: OnceLock<bool> = OnceLock::new();

    let enabled = ENABLED.get_or_init(|| {
        let no_color = std::env::var_os("NO_COLOR");
        no_color.is_none()
    });
    *enabled
}
|
||||
|
||||
/// Apply an `anstyle::Style` to text if colors are enabled.
|
||||
fn styled(text: &str, style: Style) -> String {
|
||||
if is_color_enabled() {
|
||||
format!("{}{}{}", style.render(), text, style.render_reset())
|
||||
} else {
|
||||
text.to_string()
|
||||
}
|
||||
}
|
||||
|
||||
/// Style text as a section header (bold blue).
|
||||
pub fn header(text: &str) -> String {
|
||||
styled(text, HEADER)
|
||||
}
|
||||
|
||||
/// Style text as a success value (green).
|
||||
pub fn success(text: &str) -> String {
|
||||
styled(text, SUCCESS)
|
||||
}
|
||||
|
||||
/// Style text as dim/secondary (dimmed).
|
||||
pub fn dim(text: &str) -> String {
|
||||
styled(text, DIM)
|
||||
}
|
||||
|
||||
/// Style text as a label (bold).
|
||||
pub fn label(text: &str) -> String {
|
||||
styled(text, LABEL)
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// An unstyled `Style` must not add anything around the text.
    #[test]
    fn test_styled_returns_plain_text_when_no_color() {
        // `is_color_enabled` caches its answer in a `OnceLock`, so the
        // formatting expression is exercised directly with a default style.
        let plain = Style::new();
        let rendered = format!("{}{}{}", plain.render(), "hello", plain.render_reset());

        assert_eq!(rendered, "hello");
    }

    /// A style with a foreground color must emit ANSI escape sequences.
    #[test]
    fn test_styled_applies_ansi_when_style_present() {
        let green = Style::new().fg_color(Some(anstyle::Color::Ansi(AnsiColor::Green)));
        let output = format!("{}{}{}", green.render(), "ok", green.render_reset());

        assert!(output.contains("\x1b["));
        assert!(output.contains("ok"));
    }

    /// Smoke test: every helper yields non-empty output for non-empty input.
    #[test]
    fn test_helper_functions_return_strings() {
        assert!(!header("h").is_empty());
        assert!(!success("s").is_empty());
        assert!(!dim("d").is_empty());
        assert!(!label("l").is_empty());
    }

    /// The cached flag should agree with the presence of `NO_COLOR`.
    #[test]
    fn test_is_color_enabled_respects_no_color_env() {
        // The `OnceLock` value cannot be reset, so only the relationship
        // "NO_COLOR absent => colors enabled" is verified here.
        let no_color_set = std::env::var_os("NO_COLOR").is_some();

        assert_eq!(is_color_enabled(), !no_color_set);
    }
}
|
||||
937
crates/kreuzberg-cli/tests/commands_test.rs
Normal file
937
crates/kreuzberg-cli/tests/commands_test.rs
Normal file
@@ -0,0 +1,937 @@
|
||||
//! Integration tests for CLI commands (extract, detect, batch).
|
||||
//!
|
||||
//! These tests verify that the CLI commands work correctly end-to-end,
|
||||
//! including input validation, file processing, and output formatting.
|
||||
|
||||
use std::path::PathBuf;
|
||||
use std::process::Command;
|
||||
use tempfile::tempdir;
|
||||
|
||||
/// Get the path to the kreuzberg binary.
|
||||
fn get_binary_path() -> String {
|
||||
let manifest_dir = env!("CARGO_MANIFEST_DIR");
|
||||
format!("{}/../../target/debug/kreuzberg", manifest_dir)
|
||||
}
|
||||
|
||||
/// Get the test_documents directory path.
|
||||
fn get_test_documents_dir() -> PathBuf {
|
||||
let manifest_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
|
||||
manifest_dir.parent().unwrap().parent().unwrap().join("test_documents")
|
||||
}
|
||||
|
||||
/// Get a test file path relative to test_documents/.
|
||||
fn get_test_file(relative_path: &str) -> String {
|
||||
get_test_documents_dir()
|
||||
.join(relative_path)
|
||||
.to_string_lossy()
|
||||
.to_string()
|
||||
}
|
||||
|
||||
/// Build the kreuzberg binary before running tests.
///
/// Every test calls this helper, so the `cargo build` invocation is guarded by
/// a `Once`: it runs a single time per test process instead of once per test.
///
/// # Panics
///
/// Panics if `cargo` cannot be spawned or the build fails. A failed build
/// poisons the `Once`, so later callers panic as well rather than running
/// against a missing or stale binary.
fn build_binary() {
    static BUILD: std::sync::Once = std::sync::Once::new();

    BUILD.call_once(|| {
        let status = Command::new("cargo")
            .args(["build", "--bin", "kreuzberg"])
            .status()
            .expect("Failed to build kreuzberg binary");

        assert!(status.success(), "Failed to build kreuzberg binary");
    });
}
|
||||
|
||||
/// `extract` on the plain-text fixture succeeds and writes to stdout.
#[test]
fn test_extract_text_file() {
    build_binary();

    let fixture = get_test_file("text/simple.txt");
    if !PathBuf::from(&fixture).exists() {
        tracing::debug!("Skipping test: {} not found", fixture);
        return;
    }

    let mut cmd = Command::new(get_binary_path());
    cmd.args(["extract", fixture.as_str()]);
    let output = cmd.output().expect("Failed to execute extract command");

    assert!(
        output.status.success(),
        "Extract command failed: {}",
        String::from_utf8_lossy(&output.stderr)
    );

    let printed = String::from_utf8_lossy(&output.stdout);
    assert!(!printed.is_empty(), "Extract output should not be empty");
}
|
||||
|
||||
/// `extract --format json` emits the timing envelope around the result.
#[test]
fn test_extract_with_json_output() {
    build_binary();

    let fixture = get_test_file("text/simple.txt");
    if !PathBuf::from(&fixture).exists() {
        tracing::debug!("Skipping test: {} not found", fixture);
        return;
    }

    let mut cmd = Command::new(get_binary_path());
    cmd.args(["extract", fixture.as_str(), "--format", "json"]);
    let output = cmd.output().expect("Failed to execute extract command");

    assert!(
        output.status.success(),
        "Extract command failed: {}",
        String::from_utf8_lossy(&output.stderr)
    );

    let stdout = String::from_utf8_lossy(&output.stdout);
    let parsed: serde_json::Result<serde_json::Value> = serde_json::from_str(&stdout);
    assert!(parsed.is_ok(), "Output should be valid JSON, got: {}", stdout);
    let envelope = parsed.unwrap();

    // Envelope shape: { result: ExtractionResult, extraction_time_ms: f64 }
    assert!(
        envelope.get("result").is_some(),
        "JSON envelope should have 'result' field"
    );
    assert!(
        envelope.get("extraction_time_ms").is_some(),
        "JSON envelope should have 'extraction_time_ms' field"
    );

    let result = &envelope["result"];
    assert!(result.get("content").is_some(), "result should have 'content' field");
    assert!(
        result.get("mime_type").is_some(),
        "result should have 'mime_type' field"
    );
}
|
||||
|
||||
/// `extract` with chunking flags reports a `chunks` array under `result`.
#[test]
fn test_extract_with_chunking() {
    build_binary();

    let fixture = get_test_file("text/simple.txt");
    if !PathBuf::from(&fixture).exists() {
        tracing::debug!("Skipping test: {} not found", fixture);
        return;
    }

    let cli_args = [
        "extract",
        fixture.as_str(),
        "--chunk",
        "true",
        "--chunk-size",
        "100",
        "--chunk-overlap",
        "20",
        "--format",
        "json",
    ];
    let output = Command::new(get_binary_path())
        .args(cli_args)
        .output()
        .expect("Failed to execute extract command");

    assert!(
        output.status.success(),
        "Extract with chunking failed: {}",
        String::from_utf8_lossy(&output.stderr)
    );

    let stdout = String::from_utf8_lossy(&output.stdout);
    let envelope: serde_json::Value = serde_json::from_str(&stdout).expect("Should be valid JSON");

    // The chunks live inside the envelope's `result` object.
    let result = &envelope["result"];
    assert!(result.get("chunks").is_some(), "result should have 'chunks' field");
    assert!(result["chunks"].is_array(), "'chunks' should be an array");
}
|
||||
|
||||
/// `extract` on a nonexistent path fails and reports "File not found".
#[test]
fn test_extract_file_not_found() {
    build_binary();

    let mut cmd = Command::new(get_binary_path());
    cmd.args(["extract", "/nonexistent/file.txt"]);
    let output = cmd.output().expect("Failed to execute extract command");

    assert!(!output.status.success(), "Extract should fail for nonexistent file");

    let err_text = String::from_utf8_lossy(&output.stderr);
    assert!(
        err_text.contains("File not found"),
        "Error should mention file not found, got: {}",
        err_text
    );
}
|
||||
|
||||
/// `extract` on a directory fails with a "not a file" style message.
#[test]
fn test_extract_directory_not_file() {
    build_binary();

    let scratch = tempdir().expect("Failed to create temp dir");
    let dir_arg = scratch.path().to_string_lossy().to_string();

    let mut cmd = Command::new(get_binary_path());
    cmd.args(["extract", dir_arg.as_str()]);
    let output = cmd.output().expect("Failed to execute extract command");

    assert!(!output.status.success(), "Extract should fail for directory");

    let err_text = String::from_utf8_lossy(&output.stderr);
    let mentions_not_a_file = err_text.contains("not a file") || err_text.contains("regular file");
    assert!(
        mentions_not_a_file,
        "Error should mention path is not a file, got: {}",
        err_text
    );
}
|
||||
|
||||
/// `extract --chunk-size 0` is rejected with a chunk-size error.
#[test]
fn test_extract_invalid_chunk_size_zero() {
    build_binary();

    let fixture = get_test_file("text/simple.txt");
    if !PathBuf::from(&fixture).exists() {
        tracing::debug!("Skipping test: {} not found", fixture);
        return;
    }

    let mut cmd = Command::new(get_binary_path());
    cmd.args(["extract", fixture.as_str(), "--chunk-size", "0"]);
    let output = cmd.output().expect("Failed to execute extract command");

    assert!(!output.status.success(), "Extract should fail for chunk size 0");

    let err_text = String::from_utf8_lossy(&output.stderr);
    let mentions_size = err_text.contains("Invalid chunk size") || err_text.contains("must be greater than 0");
    assert!(
        mentions_size,
        "Error should mention invalid chunk size, got: {}",
        err_text
    );
}
|
||||
|
||||
/// `extract --chunk-size 2000000` is rejected as exceeding the size limit.
#[test]
fn test_extract_invalid_chunk_size_too_large() {
    build_binary();

    let fixture = get_test_file("text/simple.txt");
    if !PathBuf::from(&fixture).exists() {
        tracing::debug!("Skipping test: {} not found", fixture);
        return;
    }

    let mut cmd = Command::new(get_binary_path());
    cmd.args(["extract", fixture.as_str(), "--chunk-size", "2000000"]);
    let output = cmd.output().expect("Failed to execute extract command");

    assert!(!output.status.success(), "Extract should fail for chunk size > 1M");

    let err_text = String::from_utf8_lossy(&output.stderr);
    let mentions_limit = err_text.contains("Invalid chunk size") || err_text.contains("1,000,000");
    assert!(
        mentions_limit,
        "Error should mention chunk size limit, got: {}",
        err_text
    );
}
|
||||
|
||||
/// `extract` rejects an overlap equal to the chunk size.
#[test]
fn test_extract_invalid_overlap_equals_chunk_size() {
    build_binary();

    let fixture = get_test_file("text/simple.txt");
    if !PathBuf::from(&fixture).exists() {
        tracing::debug!("Skipping test: {} not found", fixture);
        return;
    }

    let cli_args = [
        "extract",
        fixture.as_str(),
        "--chunk-size",
        "100",
        "--chunk-overlap",
        "100",
    ];
    let output = Command::new(get_binary_path())
        .args(cli_args)
        .output()
        .expect("Failed to execute extract command");

    assert!(
        !output.status.success(),
        "Extract should fail when overlap equals chunk size"
    );

    let err_text = String::from_utf8_lossy(&output.stderr);
    let mentions_overlap =
        err_text.contains("Invalid chunk overlap") || err_text.contains("must be less than chunk size");
    assert!(
        mentions_overlap,
        "Error should mention overlap constraint, got: {}",
        err_text
    );
}
|
||||
|
||||
/// `detect` on the plain-text fixture reports a text MIME type.
#[test]
fn test_detect_mime_type() {
    build_binary();

    let fixture = get_test_file("text/simple.txt");
    if !PathBuf::from(&fixture).exists() {
        tracing::debug!("Skipping test: {} not found", fixture);
        return;
    }

    let mut cmd = Command::new(get_binary_path());
    cmd.args(["detect", fixture.as_str()]);
    let output = cmd.output().expect("Failed to execute detect command");

    assert!(
        output.status.success(),
        "Detect command failed: {}",
        String::from_utf8_lossy(&output.stderr)
    );

    let printed = String::from_utf8_lossy(&output.stdout);
    assert!(!printed.is_empty(), "Detect output should not be empty");

    let looks_like_text = printed.contains("text/plain") || printed.contains("text");
    assert!(looks_like_text, "Should detect text MIME type, got: {}", printed);
}
|
||||
|
||||
/// `detect --format json` emits JSON with `mime_type` and `path` fields.
#[test]
fn test_detect_with_json_output() {
    build_binary();

    let fixture = get_test_file("text/simple.txt");
    if !PathBuf::from(&fixture).exists() {
        tracing::debug!("Skipping test: {} not found", fixture);
        return;
    }

    let mut cmd = Command::new(get_binary_path());
    cmd.args(["detect", fixture.as_str(), "--format", "json"]);
    let output = cmd.output().expect("Failed to execute detect command");

    assert!(
        output.status.success(),
        "Detect command failed: {}",
        String::from_utf8_lossy(&output.stderr)
    );

    let stdout = String::from_utf8_lossy(&output.stdout);
    let parsed: serde_json::Result<serde_json::Value> = serde_json::from_str(&stdout);
    assert!(parsed.is_ok(), "Output should be valid JSON, got: {}", stdout);

    let document = parsed.unwrap();
    assert!(document.get("mime_type").is_some(), "JSON should have 'mime_type' field");
    assert!(document.get("path").is_some(), "JSON should have 'path' field");
}
|
||||
|
||||
/// `detect` on a nonexistent path fails and reports "File not found".
#[test]
fn test_detect_file_not_found() {
    build_binary();

    let mut cmd = Command::new(get_binary_path());
    cmd.args(["detect", "/nonexistent/file.txt"]);
    let output = cmd.output().expect("Failed to execute detect command");

    assert!(!output.status.success(), "Detect should fail for nonexistent file");

    let err_text = String::from_utf8_lossy(&output.stderr);
    assert!(
        err_text.contains("File not found"),
        "Error should mention file not found, got: {}",
        err_text
    );
}
|
||||
|
||||
/// `batch --format json` with two inputs returns an envelope of two results.
#[test]
fn test_batch_multiple_files() {
    build_binary();

    // The same fixture is passed twice; only the result count matters here.
    let first = get_test_file("text/simple.txt");
    let second = get_test_file("text/simple.txt");

    if !PathBuf::from(&first).exists() {
        tracing::debug!("Skipping test: {} not found", first);
        return;
    }

    let mut cmd = Command::new(get_binary_path());
    cmd.args(["batch", first.as_str(), second.as_str(), "--format", "json"]);
    let output = cmd.output().expect("Failed to execute batch command");

    assert!(
        output.status.success(),
        "Batch command failed: {}",
        String::from_utf8_lossy(&output.stderr)
    );

    let stdout = String::from_utf8_lossy(&output.stdout);
    let parsed: serde_json::Result<serde_json::Value> = serde_json::from_str(&stdout);
    assert!(parsed.is_ok(), "Output should be valid JSON, got: {}", stdout);
    let envelope = parsed.unwrap();

    // Envelope shape: { results: [...], total_ms, per_file_ms }
    assert!(
        envelope.get("results").is_some(),
        "Batch envelope should have 'results' field"
    );
    let results = &envelope["results"];
    assert!(results.is_array(), "'results' should be a JSON array");
    assert_eq!(results.as_array().unwrap().len(), 2, "Should have 2 results");
}
|
||||
|
||||
/// `batch` fails when one of the inputs does not exist.
#[test]
fn test_batch_with_missing_file() {
    build_binary();

    let existing = get_test_file("text/simple.txt");
    if !PathBuf::from(&existing).exists() {
        tracing::debug!("Skipping test: {} not found", existing);
        return;
    }

    let mut cmd = Command::new(get_binary_path());
    cmd.args(["batch", existing.as_str(), "/nonexistent/file.txt"]);
    let output = cmd.output().expect("Failed to execute batch command");

    assert!(!output.status.success(), "Batch should fail when one file is missing");

    let err_text = String::from_utf8_lossy(&output.stderr);
    let mentions_missing = err_text.contains("File not found") || err_text.contains("Invalid file");
    assert!(
        mentions_missing,
        "Error should mention file not found, got: {}",
        err_text
    );
}
|
||||
|
||||
/// `extract --help` succeeds and mentions the description and chunk flags.
#[test]
fn test_extract_help() {
    build_binary();

    let mut cmd = Command::new(get_binary_path());
    cmd.args(["extract", "--help"]);
    let output = cmd.output().expect("Failed to execute extract --help");

    assert!(output.status.success());

    let help_text = String::from_utf8_lossy(&output.stdout);
    assert!(help_text.contains("Extract text from a document"));
    assert!(help_text.contains("--chunk-size"));
    assert!(help_text.contains("--chunk-overlap"));
}
|
||||
|
||||
/// `detect --help` succeeds and mentions MIME type detection.
#[test]
fn test_detect_help() {
    build_binary();

    let mut cmd = Command::new(get_binary_path());
    cmd.args(["detect", "--help"]);
    let output = cmd.output().expect("Failed to execute detect --help");

    assert!(output.status.success());

    let help_text = String::from_utf8_lossy(&output.stdout);
    assert!(help_text.contains("Detect MIME type"));
}
|
||||
|
||||
/// `batch --help` succeeds and mentions batch extraction.
#[test]
fn test_batch_help() {
    build_binary();

    let mut cmd = Command::new(get_binary_path());
    cmd.args(["batch", "--help"]);
    let output = cmd.output().expect("Failed to execute batch --help");

    assert!(output.status.success());

    let help_text = String::from_utf8_lossy(&output.stdout);
    assert!(help_text.contains("Batch extract from multiple documents"));
}
|
||||
|
||||
// ── Extract command flag parsing tests ──────────────────────────────
|
||||
|
||||
/// `extract --help` lists every extraction-override flag.
#[test]
fn test_extract_help_shows_all_extraction_override_flags() {
    build_binary();

    let mut cmd = Command::new(get_binary_path());
    cmd.args(["extract", "--help"]);
    let output = cmd.output().expect("Failed to execute extract --help");

    assert!(output.status.success());

    let help_text = String::from_utf8_lossy(&output.stdout);

    // Flags expected to appear in the help output.
    let expected_flags = [
        "--ocr",
        "--ocr-backend",
        "--ocr-language",
        "--force-ocr",
        "--no-cache",
        "--ocr-auto-rotate",
        "--chunk",
        "--chunk-size",
        "--chunk-overlap",
        "--chunking-tokenizer",
        "--content-format",
        "--include-structure",
        "--quality",
        "--detect-language",
        "--layout",
        "--layout-confidence",
        "--layout-table-model",
        "--acceleration",
        "--max-concurrent",
        "--max-threads",
        "--extract-pages",
        "--page-markers",
        "--extract-images",
        "--target-dpi",
        "--pdf-password",
        "--token-reduction",
        "--msg-codepage",
    ];

    expected_flags.iter().for_each(|flag| {
        assert!(
            help_text.contains(flag),
            "Extract --help should show flag '{}', but it was not found in output:\n{}",
            flag,
            help_text
        );
    });
}
|
||||
|
||||
// ── Batch command flag parity test ──────────────────────────────────
|
||||
|
||||
/// `batch --help` and `extract --help` both list the shared override flags.
#[test]
fn test_batch_has_same_extraction_flags_as_extract() {
    build_binary();

    let mut extract_cmd = Command::new(get_binary_path());
    extract_cmd.args(["extract", "--help"]);
    let extract_output = extract_cmd.output().expect("Failed to execute extract --help");

    let mut batch_cmd = Command::new(get_binary_path());
    batch_cmd.args(["batch", "--help"]);
    let batch_output = batch_cmd.output().expect("Failed to execute batch --help");

    assert!(extract_output.status.success());
    assert!(batch_output.status.success());

    let extract_help = String::from_utf8_lossy(&extract_output.stdout);
    let batch_help = String::from_utf8_lossy(&batch_output.stdout);

    // Flags that must be present on both subcommands.
    let shared_flags = [
        "--ocr",
        "--ocr-backend",
        "--ocr-language",
        "--force-ocr",
        "--no-cache",
        "--chunk",
        "--chunk-size",
        "--chunk-overlap",
        "--content-format",
        "--quality",
        "--detect-language",
        "--layout",
        "--layout-confidence",
        "--layout-table-model",
        "--acceleration",
        "--max-concurrent",
        "--max-threads",
        "--extract-pages",
        "--page-markers",
        "--extract-images",
        "--target-dpi",
        "--pdf-password",
        "--token-reduction",
        "--msg-codepage",
    ];

    shared_flags.iter().for_each(|flag| {
        assert!(
            extract_help.contains(flag),
            "Extract should have flag '{}' but it's missing",
            flag
        );
        assert!(
            batch_help.contains(flag),
            "Batch should have flag '{}' (parity with extract) but it's missing",
            flag
        );
    });
}
|
||||
|
||||
// ── Validation error tests ──────────────────────────────────────────
|
||||
//
|
||||
// NOTE: The CLI validates file existence *before* override validation,
|
||||
// so we must provide a real file to reach the override validation stage.
|
||||
|
||||
/// Create a temporary file and return its path as a String.
|
||||
/// The caller must keep the returned `tempfile::TempDir` alive for the
|
||||
/// duration of the test so the file is not deleted.
|
||||
fn create_temp_file() -> (tempfile::TempDir, String) {
|
||||
let dir = tempdir().expect("Failed to create temp dir");
|
||||
let file_path = dir.path().join("dummy.pdf");
|
||||
std::fs::write(&file_path, b"dummy content").expect("Failed to write temp file");
|
||||
let path_str = file_path.to_string_lossy().to_string();
|
||||
(dir, path_str)
|
||||
}
|
||||
|
||||
/// `extract --chunk-size 0` on an existing file fails mentioning chunk size.
#[test]
fn test_extract_chunk_size_zero_error() {
    build_binary();
    let (_guard, input_path) = create_temp_file();

    let mut cmd = Command::new(get_binary_path());
    cmd.args(["extract", "--chunk-size", "0", input_path.as_str()]);
    let output = cmd.output().expect("Failed to execute extract command");

    assert!(!output.status.success(), "Should fail when chunk size is 0");

    let err_text = String::from_utf8_lossy(&output.stderr);
    let mentions_size = err_text.contains("chunk size")
        || err_text.contains("Chunk size")
        || err_text.contains("Invalid chunk size");
    assert!(mentions_size, "Error should mention chunk size, got: {}", err_text);
}
|
||||
|
||||
/// `extract` fails when the overlap is larger than the chunk size.
#[test]
fn test_extract_chunk_overlap_exceeds_size_error() {
    build_binary();
    let (_guard, input_path) = create_temp_file();

    let mut cmd = Command::new(get_binary_path());
    cmd.args([
        "extract",
        "--chunk-size",
        "10",
        "--chunk-overlap",
        "20",
        input_path.as_str(),
    ]);
    let output = cmd.output().expect("Failed to execute extract command");

    assert!(!output.status.success(), "Should fail when overlap exceeds chunk size");

    let err_text = String::from_utf8_lossy(&output.stderr);
    let mentions_overlap = err_text.contains("overlap")
        || err_text.contains("Overlap")
        || err_text.contains("Invalid chunk overlap");
    assert!(
        mentions_overlap,
        "Error should mention overlap constraint, got: {}",
        err_text
    );
}
|
||||
|
||||
/// `extract --layout-confidence 2.0` is rejected.
#[test]
fn test_extract_layout_confidence_out_of_range_error() {
    build_binary();
    let (_guard, input_path) = create_temp_file();

    let mut cmd = Command::new(get_binary_path());
    cmd.args(["extract", "--layout-confidence", "2.0", input_path.as_str()]);
    let output = cmd.output().expect("Failed to execute extract command");

    // The flag is feature-gated behind layout-detection. A binary built
    // without that feature makes clap reject the flag as unknown, which is
    // also a failure, so either way the command must not succeed.
    assert!(
        !output.status.success(),
        "Should fail for layout confidence out of range"
    );

    let err_text = String::from_utf8_lossy(&output.stderr);
    let mentions_layout = err_text.contains("confidence")
        || err_text.contains("layout")
        || err_text.contains("unexpected argument");
    assert!(
        mentions_layout,
        "Error should mention confidence or layout, got: {}",
        err_text
    );
}
|
||||
|
||||
/// `extract --layout false --layout-confidence 0.5` is rejected.
#[test]
fn test_extract_layout_false_with_confidence_error() {
    build_binary();
    let (_guard, input_path) = create_temp_file();

    let mut cmd = Command::new(get_binary_path());
    cmd.args([
        "extract",
        "--layout",
        "false",
        "--layout-confidence",
        "0.5",
        input_path.as_str(),
    ]);
    let output = cmd.output().expect("Failed to execute extract command");

    // With the layout-detection feature, validation should reject this
    // combination; without it, clap rejects the unknown flags.
    assert!(
        !output.status.success(),
        "Should fail when --layout false is combined with --layout-confidence"
    );
}
|
||||
|
||||
/// `extract --target-dpi 0` is rejected with a DPI-related error.
#[test]
fn test_extract_target_dpi_zero_error() {
    build_binary();
    let (_guard, input_path) = create_temp_file();

    let mut cmd = Command::new(get_binary_path());
    cmd.args(["extract", "--target-dpi", "0", input_path.as_str()]);
    let output = cmd.output().expect("Failed to execute extract command");

    assert!(!output.status.success(), "Should fail when target DPI is 0");

    let err_text = String::from_utf8_lossy(&output.stderr);
    let mentions_dpi = err_text.contains("DPI")
        || err_text.contains("dpi")
        || err_text.contains("target")
        || err_text.contains("Invalid");
    assert!(mentions_dpi, "Error should mention DPI range, got: {}", err_text);
}
|
||||
|
||||
// ── Completions test ────────────────────────────────────────────────
|
||||
|
||||
/// `completions bash` succeeds and emits a script that references the
/// `kreuzberg` command name.
#[test]
fn test_completions_bash_produces_output() {
    build_binary();

    let output = Command::new(get_binary_path())
        .args(["completions", "bash"])
        .output()
        .expect("Failed to execute completions command");

    assert!(
        output.status.success(),
        "Completions command failed: {}",
        String::from_utf8_lossy(&output.stderr)
    );

    let stdout = String::from_utf8_lossy(&output.stdout);
    assert!(!stdout.is_empty(), "Completions output should not be empty");

    // Bash completions should contain the command name. The preview in the
    // failure message is cut by characters rather than bytes: slicing a `str`
    // at a byte offset panics when the offset is not on a char boundary, which
    // would hide the real assertion failure.
    assert!(
        stdout.contains("kreuzberg"),
        "Bash completions should reference 'kreuzberg', got: {}",
        stdout.chars().take(200).collect::<String>()
    );
}
|
||||
|
||||
/// `completions zsh` succeeds and writes a non-empty script to stdout.
#[test]
fn test_completions_zsh_produces_output() {
    build_binary();

    let mut cmd = Command::new(get_binary_path());
    cmd.args(["completions", "zsh"]);
    let output = cmd.output().expect("Failed to execute completions command");

    assert!(
        output.status.success(),
        "Completions command failed: {}",
        String::from_utf8_lossy(&output.stderr)
    );

    let script = String::from_utf8_lossy(&output.stdout);
    assert!(!script.is_empty(), "Zsh completions output should not be empty");
}
|
||||
|
||||
/// `completions fish` succeeds and writes a non-empty script to stdout.
#[test]
fn test_completions_fish_produces_output() {
    build_binary();

    let mut cmd = Command::new(get_binary_path());
    cmd.args(["completions", "fish"]);
    let output = cmd.output().expect("Failed to execute completions command");

    assert!(
        output.status.success(),
        "Completions command failed: {}",
        String::from_utf8_lossy(&output.stderr)
    );

    let script = String::from_utf8_lossy(&output.stdout);
    assert!(!script.is_empty(), "Fish completions output should not be empty");
}
|
||||
|
||||
// ── Embed help test ─────────────────────────────────────────────────
|
||||
|
||||
/// `kreuzberg embed --help` should list its main flags and its description.
///
/// The `embed` subcommand is feature-gated, so the test returns early when
/// the CLI reports that the subcommand is unknown.
#[test]
fn test_embed_help_shows_correct_flags() {
    build_binary();

    let output = Command::new(get_binary_path())
        .arg("embed")
        .arg("--help")
        .output()
        .expect("Failed to execute embed --help");

    // A non-zero exit with an "unknown subcommand" message means the feature
    // was not compiled in; treat that as a skip.
    if !output.status.success() {
        let stderr = String::from_utf8_lossy(&output.stderr);
        let subcommand_missing = ["unrecognized subcommand", "invalid subcommand"]
            .iter()
            .any(|marker| stderr.contains(*marker));
        if subcommand_missing {
            return;
        }
    }

    let help_text = String::from_utf8_lossy(&output.stdout);
    for flag in ["--text", "--preset", "--format"] {
        assert!(
            help_text.contains(flag),
            "Embed help should show {} flag, got: {}",
            flag,
            help_text
        );
    }
    assert!(
        help_text.contains("Generate embeddings"),
        "Embed help should describe embedding generation, got: {}",
        help_text
    );
}
|
||||
|
||||
// ── Chunk help test ─────────────────────────────────────────────────
|
||||
|
||||
/// `kreuzberg chunk --help` should succeed and list every chunking flag plus
/// the subcommand description.
#[test]
fn test_chunk_help_shows_correct_flags() {
    build_binary();

    let output = Command::new(get_binary_path())
        .arg("chunk")
        .arg("--help")
        .output()
        .expect("Failed to execute chunk --help");

    let stderr = String::from_utf8_lossy(&output.stderr);
    assert!(output.status.success(), "Chunk --help failed: {}", stderr);

    let help_text = String::from_utf8_lossy(&output.stdout);
    let expected_flags = [
        "--text",
        "--chunk-size",
        "--chunk-overlap",
        "--chunker-type",
        "--format",
    ];
    for flag in expected_flags {
        assert!(
            help_text.contains(flag),
            "Chunk help should show {} flag, got: {}",
            flag,
            help_text
        );
    }
    assert!(
        help_text.contains("Chunk text"),
        "Chunk help should describe text chunking, got: {}",
        help_text
    );
}
|
||||
|
||||
// ── Style module NO_COLOR test ──────────────────────────────────────
|
||||
|
||||
/// With `NO_COLOR` set, `kreuzberg detect` must not emit ANSI escape
/// sequences on stdout. Returns early when the fixture file is absent.
#[test]
fn test_no_color_env_disables_ansi_in_output() {
    build_binary();

    let fixture = get_test_file("text/simple.txt");
    let fixture_present = PathBuf::from(&fixture).exists();
    if !fixture_present {
        return;
    }

    let mut cli = Command::new(get_binary_path());
    cli.env("NO_COLOR", "1").arg("detect").arg(&fixture);
    let result = cli.output().expect("Failed to execute detect command");

    assert!(
        result.status.success(),
        "Detect failed: {}",
        String::from_utf8_lossy(&result.stderr)
    );

    // "\x1b[" is the CSI prefix that starts every ANSI colour/style sequence.
    let printed = String::from_utf8_lossy(&result.stdout);
    let has_ansi = printed.contains("\x1b[");
    assert!(
        !has_ansi,
        "Output should not contain ANSI escape sequences when NO_COLOR is set, got: {:?}",
        printed
    );
}
|
||||
|
||||
// ── Additional validation edge cases ────────────────────────────────
|
||||
|
||||
/// `extract --chunk-size 2000000` must be rejected with an error that refers
/// to the chunk size (or the "1,000,000" limit).
#[test]
fn test_extract_chunk_size_too_large_error() {
    build_binary();
    let (_dir, file_path) = create_temp_file();

    let mut cli = Command::new(get_binary_path());
    cli.arg("extract").arg("--chunk-size").arg("2000000").arg(&file_path);
    let result = cli.output().expect("Failed to execute extract command");

    let rejected = !result.status.success();
    assert!(rejected, "Should fail when chunk size exceeds limit");

    let stderr = String::from_utf8_lossy(&result.stderr);
    let mentions_limit = ["chunk size", "Chunk size", "1,000,000"]
        .iter()
        .any(|needle| stderr.contains(*needle));
    assert!(
        mentions_limit,
        "Error should mention chunk size limit, got: {}",
        stderr
    );
}
|
||||
|
||||
/// `extract --target-dpi 5000` must be rejected with an error that refers to
/// the DPI setting.
#[test]
fn test_extract_target_dpi_too_high_error() {
    build_binary();
    let (_dir, file_path) = create_temp_file();

    let mut cli = Command::new(get_binary_path());
    cli.arg("extract").arg("--target-dpi").arg("5000").arg(&file_path);
    let result = cli.output().expect("Failed to execute extract command");

    let rejected = !result.status.success();
    assert!(rejected, "Should fail when target DPI exceeds limit");

    let stderr = String::from_utf8_lossy(&result.stderr);
    let mentions_dpi = ["DPI", "dpi", "2400", "Invalid"]
        .iter()
        .any(|needle| stderr.contains(*needle));
    assert!(mentions_dpi, "Error should mention DPI range, got: {}", stderr);
}
|
||||
617
crates/kreuzberg-cli/tests/config_discovery_test.rs
Normal file
617
crates/kreuzberg-cli/tests/config_discovery_test.rs
Normal file
@@ -0,0 +1,617 @@
|
||||
//! Integration tests for CLI config file discovery.
|
||||
//!
|
||||
//! These tests verify that the CLI correctly discovers and loads configuration files
|
||||
//! in various formats (.toml, .yaml, .json) with case-insensitive extension
|
||||
//! matching, explicit --config flag support, and proper error handling.
|
||||
|
||||
use std::fs;
|
||||
use std::path::PathBuf;
|
||||
use std::process::Command;
|
||||
use tempfile::tempdir;
|
||||
|
||||
/// Get the path to the kreuzberg binary.
|
||||
fn get_binary_path() -> String {
|
||||
let manifest_dir = env!("CARGO_MANIFEST_DIR");
|
||||
format!("{}/../../target/debug/kreuzberg", manifest_dir)
|
||||
}
|
||||
|
||||
/// Runs `cargo build --bin kreuzberg` and panics if cargo cannot be spawned
/// or exits with a failure status.
fn build_binary() {
    let mut cargo = Command::new("cargo");
    cargo.arg("build").arg("--bin").arg("kreuzberg");

    let exit_status = cargo.status().expect("Failed to build kreuzberg binary");
    assert!(exit_status.success(), "Failed to build kreuzberg binary");
}
|
||||
|
||||
/// Get the test_documents directory path.
|
||||
fn get_test_documents_dir() -> PathBuf {
|
||||
let manifest_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
|
||||
manifest_dir.parent().unwrap().parent().unwrap().join("test_documents")
|
||||
}
|
||||
|
||||
/// Get a test file path relative to test_documents/.
|
||||
fn get_test_file(relative_path: &str) -> String {
|
||||
get_test_documents_dir()
|
||||
.join(relative_path)
|
||||
.to_string_lossy()
|
||||
.to_string()
|
||||
}
|
||||
|
||||
/// Runs `extract` from a temp directory that contains `.kreuzberg.toml` and
/// expects a successful exit. Only the exit status is checked.
#[test]
fn test_discover_kreuzberg_toml_in_current_directory() {
    build_binary();

    let workdir = tempdir().unwrap();
    let toml_body = r#"
use_cache = false
enable_quality_processing = false
"#;
    fs::write(workdir.path().join(".kreuzberg.toml"), toml_body).unwrap();

    let test_file = get_test_file("text/simple.txt");
    if !PathBuf::from(&test_file).exists() {
        tracing::debug!("Skipping test: {} not found", test_file);
        return;
    }

    let mut cli = Command::new(get_binary_path());
    cli.current_dir(workdir.path()).arg("extract").arg(&test_file);
    let result = cli.output().expect("Failed to execute kreuzberg");

    let stderr = String::from_utf8_lossy(&result.stderr);
    assert!(result.status.success(), "Command failed: {}", stderr);
}
|
||||
|
||||
/// Runs `extract` from a temp directory that contains `.kreuzberg.yaml` and
/// expects a successful exit. Only the exit status is checked.
#[test]
fn test_discover_kreuzberg_yaml_in_current_directory() {
    build_binary();

    let workdir = tempdir().unwrap();
    let yaml_body = r#"
use_cache: false
enable_quality_processing: false
"#;
    fs::write(workdir.path().join(".kreuzberg.yaml"), yaml_body).unwrap();

    let test_file = get_test_file("text/simple.txt");
    if !PathBuf::from(&test_file).exists() {
        tracing::debug!("Skipping test: {} not found", test_file);
        return;
    }

    let mut cli = Command::new(get_binary_path());
    cli.current_dir(workdir.path()).arg("extract").arg(&test_file);
    let result = cli.output().expect("Failed to execute kreuzberg");

    let stderr = String::from_utf8_lossy(&result.stderr);
    assert!(result.status.success(), "Command failed: {}", stderr);
}
|
||||
|
||||
/// Runs `extract` from a temp directory that contains `.kreuzberg.yml` (the
/// short YAML extension) and expects a successful exit.
///
/// Only the exit status is checked; the test does not verify that the file
/// was actually picked up.
#[test]
fn test_discover_kreuzberg_yml_in_current_directory() {
    build_binary();

    let dir = tempdir().unwrap();
    // This test covers the `.yml` spelling; `.yaml` has its own test.
    let config_path = dir.path().join(".kreuzberg.yml");

    fs::write(
        &config_path,
        r#"
use_cache: false
enable_quality_processing: false
"#,
    )
    .unwrap();

    let test_file = get_test_file("text/simple.txt");
    if !PathBuf::from(&test_file).exists() {
        tracing::debug!("Skipping test: {} not found", test_file);
        return;
    }

    let output = Command::new(get_binary_path())
        .current_dir(dir.path())
        .args(["extract", test_file.as_str()])
        .output()
        .expect("Failed to execute kreuzberg");

    assert!(
        output.status.success(),
        "Command failed: {}",
        String::from_utf8_lossy(&output.stderr)
    );
}
|
||||
|
||||
/// Runs `extract` from a temp directory that contains `.kreuzberg.json` and
/// expects a successful exit. Only the exit status is checked.
#[test]
fn test_discover_kreuzberg_json_in_current_directory() {
    build_binary();

    let workdir = tempdir().unwrap();
    let json_body = r#"{
"use_cache": false,
"enable_quality_processing": false
}"#;
    fs::write(workdir.path().join(".kreuzberg.json"), json_body).unwrap();

    let test_file = get_test_file("text/simple.txt");
    if !PathBuf::from(&test_file).exists() {
        tracing::debug!("Skipping test: {} not found", test_file);
        return;
    }

    let mut cli = Command::new(get_binary_path());
    cli.current_dir(workdir.path()).arg("extract").arg(&test_file);
    let result = cli.output().expect("Failed to execute kreuzberg");

    let stderr = String::from_utf8_lossy(&result.stderr);
    assert!(result.status.success(), "Command failed: {}", stderr);
}
|
||||
|
||||
/// A config passed via `--config` with an upper-case `.TOML` extension must
/// be accepted (the command exits successfully).
#[test]
fn test_case_insensitive_toml_extension() {
    build_binary();

    let workdir = tempdir().unwrap();
    let config_file = workdir.path().join("custom.TOML");
    let toml_body = r#"
use_cache = false
"#;
    fs::write(&config_file, toml_body).unwrap();

    let test_file = get_test_file("text/simple.txt");
    if !PathBuf::from(&test_file).exists() {
        tracing::debug!("Skipping test: {} not found", test_file);
        return;
    }

    let config_arg = String::from(config_file.to_string_lossy());
    let mut cli = Command::new(get_binary_path());
    cli.current_dir(workdir.path())
        .arg("extract")
        .arg("--config")
        .arg(&config_arg)
        .arg(&test_file);
    let result = cli.output().expect("Failed to execute kreuzberg");

    let stderr = String::from_utf8_lossy(&result.stderr);
    assert!(result.status.success(), "Command failed: {}", stderr);
}
|
||||
|
||||
/// A config passed via `--config` with a mixed-case `.Yaml` extension must
/// be accepted (the command exits successfully).
#[test]
fn test_case_insensitive_yaml_extension() {
    build_binary();

    let workdir = tempdir().unwrap();
    let config_file = workdir.path().join("custom.Yaml");
    let yaml_body = r#"
use_cache: false
"#;
    fs::write(&config_file, yaml_body).unwrap();

    let test_file = get_test_file("text/simple.txt");
    if !PathBuf::from(&test_file).exists() {
        tracing::debug!("Skipping test: {} not found", test_file);
        return;
    }

    let config_arg = String::from(config_file.to_string_lossy());
    let mut cli = Command::new(get_binary_path());
    cli.current_dir(workdir.path())
        .arg("extract")
        .arg("--config")
        .arg(&config_arg)
        .arg(&test_file);
    let result = cli.output().expect("Failed to execute kreuzberg");

    let stderr = String::from_utf8_lossy(&result.stderr);
    assert!(result.status.success(), "Command failed: {}", stderr);
}
|
||||
|
||||
/// A config passed via `--config` with an upper-case `.YML` extension must
/// be accepted (the command exits successfully).
#[test]
fn test_case_insensitive_yml_extension() {
    build_binary();

    let workdir = tempdir().unwrap();
    let config_file = workdir.path().join("custom.YML");
    let yaml_body = r#"
use_cache: false
"#;
    fs::write(&config_file, yaml_body).unwrap();

    let test_file = get_test_file("text/simple.txt");
    if !PathBuf::from(&test_file).exists() {
        tracing::debug!("Skipping test: {} not found", test_file);
        return;
    }

    let config_arg = String::from(config_file.to_string_lossy());
    let mut cli = Command::new(get_binary_path());
    cli.current_dir(workdir.path())
        .arg("extract")
        .arg("--config")
        .arg(&config_arg)
        .arg(&test_file);
    let result = cli.output().expect("Failed to execute kreuzberg");

    let stderr = String::from_utf8_lossy(&result.stderr);
    assert!(result.status.success(), "Command failed: {}", stderr);
}
|
||||
|
||||
/// A config passed via `--config` with an upper-case `.JSON` extension must
/// be accepted (the command exits successfully).
#[test]
fn test_case_insensitive_json_extension() {
    build_binary();

    let workdir = tempdir().unwrap();
    let config_file = workdir.path().join("custom.JSON");
    let json_body = r#"{
"use_cache": false
}"#;
    fs::write(&config_file, json_body).unwrap();

    let test_file = get_test_file("text/simple.txt");
    if !PathBuf::from(&test_file).exists() {
        tracing::debug!("Skipping test: {} not found", test_file);
        return;
    }

    let config_arg = String::from(config_file.to_string_lossy());
    let mut cli = Command::new(get_binary_path());
    cli.current_dir(workdir.path())
        .arg("extract")
        .arg("--config")
        .arg(&config_arg)
        .arg(&test_file);
    let result = cli.output().expect("Failed to execute kreuzberg");

    let stderr = String::from_utf8_lossy(&result.stderr);
    assert!(result.status.success(), "Command failed: {}", stderr);
}
|
||||
|
||||
/// `extract --config <path>.toml` with a valid TOML file must succeed.
#[test]
fn test_explicit_config_path_toml() {
    build_binary();

    let workdir = tempdir().unwrap();
    let config_file = workdir.path().join("custom_config.toml");
    let toml_body = r#"
use_cache = false
enable_quality_processing = false
"#;
    fs::write(&config_file, toml_body).unwrap();

    let test_file = get_test_file("text/simple.txt");
    if !PathBuf::from(&test_file).exists() {
        tracing::debug!("Skipping test: {} not found", test_file);
        return;
    }

    let config_arg = String::from(config_file.to_string_lossy());
    let mut cli = Command::new(get_binary_path());
    cli.arg("extract").arg("--config").arg(&config_arg).arg(&test_file);
    let result = cli.output().expect("Failed to execute kreuzberg");

    let stderr = String::from_utf8_lossy(&result.stderr);
    assert!(result.status.success(), "Command failed: {}", stderr);
}
|
||||
|
||||
/// `extract --config <path>.yaml` with a valid YAML file must succeed.
#[test]
fn test_explicit_config_path_yaml() {
    build_binary();

    let workdir = tempdir().unwrap();
    let config_file = workdir.path().join("custom_config.yaml");
    let yaml_body = r#"
use_cache: false
enable_quality_processing: false
"#;
    fs::write(&config_file, yaml_body).unwrap();

    let test_file = get_test_file("text/simple.txt");
    if !PathBuf::from(&test_file).exists() {
        tracing::debug!("Skipping test: {} not found", test_file);
        return;
    }

    let config_arg = String::from(config_file.to_string_lossy());
    let mut cli = Command::new(get_binary_path());
    cli.arg("extract").arg("--config").arg(&config_arg).arg(&test_file);
    let result = cli.output().expect("Failed to execute kreuzberg");

    let stderr = String::from_utf8_lossy(&result.stderr);
    assert!(result.status.success(), "Command failed: {}", stderr);
}
|
||||
|
||||
/// `extract --config <path>.json` with a valid JSON file must succeed.
#[test]
fn test_explicit_config_path_json() {
    build_binary();

    let workdir = tempdir().unwrap();
    let config_file = workdir.path().join("custom_config.json");
    let json_body = r#"{
"use_cache": false,
"enable_quality_processing": false
}"#;
    fs::write(&config_file, json_body).unwrap();

    let test_file = get_test_file("text/simple.txt");
    if !PathBuf::from(&test_file).exists() {
        tracing::debug!("Skipping test: {} not found", test_file);
        return;
    }

    let config_arg = String::from(config_file.to_string_lossy());
    let mut cli = Command::new(get_binary_path());
    cli.arg("extract").arg("--config").arg(&config_arg).arg(&test_file);
    let result = cli.output().expect("Failed to execute kreuzberg");

    let stderr = String::from_utf8_lossy(&result.stderr);
    assert!(result.status.success(), "Command failed: {}", stderr);
}
|
||||
|
||||
/// A `--config` file with an unsupported extension (`.txt`) must make the
/// command fail, and stderr must name at least one supported extension.
#[test]
fn test_invalid_config_extension() {
    build_binary();

    let workdir = tempdir().unwrap();
    let config_file = workdir.path().join("config.txt");
    fs::write(&config_file, "invalid content").unwrap();

    let test_file = get_test_file("text/simple.txt");
    if !PathBuf::from(&test_file).exists() {
        tracing::debug!("Skipping test: {} not found", test_file);
        return;
    }

    let config_arg = String::from(config_file.to_string_lossy());
    let mut cli = Command::new(get_binary_path());
    cli.arg("extract").arg("--config").arg(&config_arg).arg(&test_file);
    let result = cli.output().expect("Failed to execute kreuzberg");

    assert!(!result.status.success());

    let stderr = String::from_utf8_lossy(&result.stderr);
    let names_supported_extension = [".toml", ".yaml", ".json"]
        .iter()
        .any(|ext| stderr.contains(*ext));
    assert!(
        names_supported_extension,
        "Error message should mention supported extensions: {}",
        stderr
    );
}
|
||||
|
||||
/// A syntactically broken TOML config passed via `--config` must make the
/// command exit with a failure status.
#[test]
fn test_malformed_toml_config() {
    build_binary();

    let workdir = tempdir().unwrap();
    let config_file = workdir.path().join("bad_config.toml");
    fs::write(&config_file, "use_cache = [[[[[").unwrap();

    let test_file = get_test_file("text/simple.txt");
    if !PathBuf::from(&test_file).exists() {
        tracing::debug!("Skipping test: {} not found", test_file);
        return;
    }

    let config_arg = String::from(config_file.to_string_lossy());
    let mut cli = Command::new(get_binary_path());
    cli.arg("extract").arg("--config").arg(&config_arg).arg(&test_file);
    let result = cli.output().expect("Failed to execute kreuzberg");

    assert!(!result.status.success());
}
|
||||
|
||||
/// A syntactically broken YAML config passed via `--config` must make the
/// command exit with a failure status.
#[test]
fn test_malformed_yaml_config() {
    build_binary();

    let workdir = tempdir().unwrap();
    let config_file = workdir.path().join("bad_config.yaml");
    fs::write(&config_file, "use_cache: [[[[[").unwrap();

    let test_file = get_test_file("text/simple.txt");
    if !PathBuf::from(&test_file).exists() {
        tracing::debug!("Skipping test: {} not found", test_file);
        return;
    }

    let config_arg = String::from(config_file.to_string_lossy());
    let mut cli = Command::new(get_binary_path());
    cli.arg("extract").arg("--config").arg(&config_arg).arg(&test_file);
    let result = cli.output().expect("Failed to execute kreuzberg");

    assert!(!result.status.success());
}
|
||||
|
||||
/// A syntactically broken JSON config passed via `--config` must make the
/// command exit with a failure status.
#[test]
fn test_malformed_json_config() {
    build_binary();

    let workdir = tempdir().unwrap();
    let config_file = workdir.path().join("bad_config.json");
    fs::write(&config_file, r#"{"use_cache": [[[[[}"#).unwrap();

    let test_file = get_test_file("text/simple.txt");
    if !PathBuf::from(&test_file).exists() {
        tracing::debug!("Skipping test: {} not found", test_file);
        return;
    }

    let config_arg = String::from(config_file.to_string_lossy());
    let mut cli = Command::new(get_binary_path());
    cli.arg("extract").arg("--config").arg(&config_arg).arg(&test_file);
    let result = cli.output().expect("Failed to execute kreuzberg");

    assert!(!result.status.success());
}
|
||||
|
||||
/// Pointing `--config` at a path that was never created must make the
/// command exit with a failure status.
#[test]
fn test_nonexistent_config_file() {
    build_binary();

    let workdir = tempdir().unwrap();
    // Deliberately not written to disk.
    let missing_config = workdir.path().join("nonexistent.toml");

    let test_file = get_test_file("text/simple.txt");
    if !PathBuf::from(&test_file).exists() {
        tracing::debug!("Skipping test: {} not found", test_file);
        return;
    }

    let config_arg = String::from(missing_config.to_string_lossy());
    let mut cli = Command::new(get_binary_path());
    cli.arg("extract").arg("--config").arg(&config_arg).arg(&test_file);
    let result = cli.output().expect("Failed to execute kreuzberg");

    assert!(!result.status.success());
}
|
||||
|
||||
/// Running `extract` from an empty temp directory (no config file written)
/// must still succeed.
#[test]
fn test_default_config_when_no_file_found() {
    build_binary();

    let workdir = tempdir().unwrap();

    let test_file = get_test_file("text/simple.txt");
    if !PathBuf::from(&test_file).exists() {
        tracing::debug!("Skipping test: {} not found", test_file);
        return;
    }

    let mut cli = Command::new(get_binary_path());
    cli.current_dir(workdir.path()).arg("extract").arg(&test_file);
    let result = cli.output().expect("Failed to execute kreuzberg");

    let stderr = String::from_utf8_lossy(&result.stderr);
    assert!(result.status.success(), "Command failed: {}", stderr);
}
|
||||
|
||||
/// A well-formed TOML config whose `use_cache` value is a string instead of
/// a boolean must make the command exit with a failure status.
#[test]
fn test_invalid_config_values() {
    build_binary();

    let workdir = tempdir().unwrap();
    let config_file = workdir.path().join("invalid.toml");
    let toml_body = r#"
use_cache = "not_a_bool"
"#;
    fs::write(&config_file, toml_body).unwrap();

    let test_file = get_test_file("text/simple.txt");
    if !PathBuf::from(&test_file).exists() {
        tracing::debug!("Skipping test: {} not found", test_file);
        return;
    }

    let config_arg = String::from(config_file.to_string_lossy());
    let mut cli = Command::new(get_binary_path());
    cli.arg("extract").arg("--config").arg(&config_arg).arg(&test_file);
    let result = cli.output().expect("Failed to execute kreuzberg");

    assert!(!result.status.success());
}
|
||||
46
crates/kreuzberg-cli/tests/config_env_overrides_test.rs
Normal file
46
crates/kreuzberg-cli/tests/config_env_overrides_test.rs
Normal file
@@ -0,0 +1,46 @@
|
||||
//! Regression test for issue #773.
|
||||
//! Validates that environment variable overrides are correctly applied during configuration loading.
|
||||
|
||||
use kreuzberg::{EmbeddingModelType, ExtractionConfig};
|
||||
|
||||
/// Regression test for issue #773: setting `KREUZBERG_OCR_LANGUAGE` must be
/// reflected in `config.ocr.language` after `apply_env_overrides()`.
#[test]
fn test_regression_773_env_override_loading() {
    let mut config = ExtractionConfig::default();

    // Guard against a default that already equals the override value, which
    // would make the final assertion meaningless.
    if let Some(ref ocr) = config.ocr {
        assert_ne!(ocr.language, "fra");
    }

    // SAFETY: modifying the process environment is only sound while no other
    // thread reads or writes it concurrently.
    // NOTE(review): tests in this file may run on parallel threads — consider
    // serializing them if this ever becomes flaky.
    unsafe { std::env::set_var("KREUZBERG_OCR_LANGUAGE", "fra") };
    // Keep the result and unwrap it only after the variable is removed, so a
    // failure here cannot leak the override into other tests in this process.
    let applied = config.apply_env_overrides();
    // SAFETY: same invariant as for `set_var` above.
    unsafe { std::env::remove_var("KREUZBERG_OCR_LANGUAGE") };
    applied.expect("Failed to apply overrides");

    let ocr = config
        .ocr
        .expect("OCR config should be Some when KREUZBERG_OCR_LANGUAGE is set");
    assert_eq!(ocr.language, "fra");
}
|
||||
|
||||
/// Regression test for issue #773: setting `KREUZBERG_VLM_EMBEDDING_MODEL`
/// must yield a chunking config whose embedding model is the `Llm` variant
/// with the given model name and no API key.
#[test]
fn test_regression_773_vlm_embedding_env_override() {
    let mut config = ExtractionConfig::default();

    // SAFETY: modifying the process environment is only sound while no other
    // thread reads or writes it concurrently.
    // NOTE(review): tests in this file may run on parallel threads — consider
    // serializing them if this ever becomes flaky.
    unsafe { std::env::set_var("KREUZBERG_VLM_EMBEDDING_MODEL", "openai/text-embedding-3-small") };
    // Keep the result and unwrap it only after the variable is removed, so a
    // failure here cannot leak the override into other tests in this process.
    let applied = config.apply_env_overrides();
    // SAFETY: same invariant as for `set_var` above.
    unsafe { std::env::remove_var("KREUZBERG_VLM_EMBEDDING_MODEL") };
    applied.expect("Failed to apply environment overrides");

    let chunking = config
        .chunking
        .expect("Chunking should be enabled when VLM embedding is set");
    let embedding = chunking.embedding.expect("Embedding should be configured");

    match embedding.model {
        EmbeddingModelType::Llm { llm } => {
            assert_eq!(llm.model, "openai/text-embedding-3-small");
            assert!(llm.api_key.is_none());
        }
        _ => panic!("Expected Llm embedding model type"),
    }
}
|
||||
344
crates/kreuzberg-cli/tests/config_tests.rs
Normal file
344
crates/kreuzberg-cli/tests/config_tests.rs
Normal file
@@ -0,0 +1,344 @@
|
||||
//! CLI configuration tests validating flags, aliases, and deprecation handling.
|
||||
//!
|
||||
//! This test suite verifies that:
|
||||
//! 1. --output-format flag works correctly for all format options
|
||||
//! 2. CLI flags properly override config file settings
|
||||
//! 3. Config merge precedence is maintained (CLI args > config file > defaults)
|
||||
//! 4. Configuration JSON can be passed inline
|
||||
//! 5. Alias handling for deprecated flags works as expected
|
||||
|
||||
#![allow(clippy::bool_assert_comparison)]
|
||||
#![allow(clippy::field_reassign_with_default)]
|
||||
|
||||
use std::path::PathBuf;
|
||||
use tempfile::TempDir;
|
||||
|
||||
/// Helper to create a temporary config file
|
||||
#[allow(dead_code)]
|
||||
fn create_test_config(dir: &TempDir, name: &str, content: &str) -> PathBuf {
|
||||
let config_path = dir.path().join(name);
|
||||
std::fs::write(&config_path, content).expect("Failed to write config file");
|
||||
config_path
|
||||
}
|
||||
|
||||
/// A default `ExtractionConfig` uses `OutputFormat::Plain`.
///
/// Only the library default is checked here; no CLI flag is parsed.
#[test]
fn test_output_format_flag_plain() {
    use kreuzberg::core::config::{ExtractionConfig, OutputFormat};

    let defaults = ExtractionConfig::default();
    assert_eq!(
        defaults.output_format,
        OutputFormat::Plain,
        "Default output format should be Plain"
    );
}
|
||||
|
||||
/// `OutputFormat::Markdown` has the `Debug` representation `"Markdown"`.
#[test]
fn test_output_format_flag_markdown() {
    use kreuzberg::core::config::OutputFormat;

    let debug_repr = format!("{:?}", OutputFormat::Markdown);
    assert_eq!(
        debug_repr, "Markdown",
        "Markdown format should have correct debug representation"
    );
}
|
||||
|
||||
/// `OutputFormat::Html` has the `Debug` representation `"Html"`.
#[test]
fn test_output_format_flag_html() {
    use kreuzberg::core::config::OutputFormat;

    let debug_repr = format!("{:?}", OutputFormat::Html);
    assert_eq!(
        debug_repr, "Html",
        "Html format should have correct debug representation"
    );
}
|
||||
|
||||
/// An `ExtractionConfig` carrying `OutputFormat::Markdown` keeps that value
/// and serializes it as the lowercase string `"markdown"`.
#[test]
fn test_extraction_config_with_output_format() {
    use kreuzberg::core::config::{ExtractionConfig, OutputFormat};

    let config = ExtractionConfig {
        output_format: OutputFormat::Markdown,
        ..ExtractionConfig::default()
    };
    assert_eq!(
        config.output_format,
        OutputFormat::Markdown,
        "output_format should be Markdown after assignment"
    );

    let as_json = serde_json::to_value(&config).expect("Failed to serialize");
    let rendered_format = &as_json["output_format"];
    assert_eq!(
        *rendered_format, "markdown",
        "Serialized output_format should be 'markdown' (lowercase)"
    );
}
|
||||
|
||||
/// A JSON object with several top-level settings deserializes into an
/// `ExtractionConfig` whose fields match the input.
#[test]
fn test_config_json_parsing_complete() {
    use kreuzberg::core::config::{ExtractionConfig, OutputFormat};

    let raw = serde_json::json!({
        "use_cache": true,
        "enable_quality_processing": true,
        "force_ocr": false,
        "output_format": "markdown",
        "result_format": "unified",
        "max_concurrent_extractions": 4,
    });

    let parsed: ExtractionConfig = serde_json::from_value(raw).expect("Failed to parse config JSON");

    assert!(parsed.use_cache);
    assert!(parsed.enable_quality_processing);
    assert!(!parsed.force_ocr);
    assert_eq!(parsed.output_format, OutputFormat::Markdown);
    assert_eq!(parsed.max_concurrent_extractions, Some(4));
}
|
||||
|
||||
/// Values assigned on top of a default config (standing in for CLI
/// overrides) replace the defaults.
#[test]
fn test_config_merge_precedence_cli_overrides_default() {
    use kreuzberg::core::config::ExtractionConfig;

    // Simulated CLI override applied over the defaults.
    let overridden = ExtractionConfig {
        use_cache: false,
        force_ocr: true,
        ..ExtractionConfig::default()
    };

    assert!(!overridden.use_cache, "CLI override should change use_cache to false");
    assert!(overridden.force_ocr, "CLI override should change force_ocr to true");
}
|
||||
|
||||
/// A value overridden on a clone of the file-derived config changes only
/// that field; untouched fields keep the file's value.
#[test]
fn test_config_merge_precedence_cli_overrides_file() {
    use kreuzberg::core::config::ExtractionConfig;

    // Stand-in for settings loaded from a config file.
    let from_file = ExtractionConfig {
        use_cache: true,
        force_ocr: false,
        ..ExtractionConfig::default()
    };

    // Simulated CLI override of a single field.
    let merged = ExtractionConfig {
        use_cache: false,
        ..from_file.clone()
    };

    assert!(!merged.use_cache, "CLI should override file config for use_cache");
    assert!(!merged.force_ocr, "CLI should not affect fields not overridden");
}
|
||||
|
||||
/// A config deserialized with `use_cache: false` and `force_ocr: true`
/// differs from `ExtractionConfig::default()` in both fields.
#[test]
fn test_config_file_precedence_over_defaults() {
    use kreuzberg::core::config::ExtractionConfig;

    let file_values = serde_json::json!({
        "use_cache": false,
        "force_ocr": true,
    });
    let from_file: ExtractionConfig = serde_json::from_value(file_values).expect("Failed to parse");
    let defaults = ExtractionConfig::default();

    assert_ne!(
        from_file.use_cache, defaults.use_cache,
        "File config should override default for use_cache"
    );
    assert_ne!(
        from_file.force_ocr, defaults.force_ocr,
        "File config should override default for force_ocr"
    );
}
|
||||
|
||||
/// `Plain`, `Markdown` and `Html` serialize to the lowercase JSON strings
/// `"plain"`, `"markdown"` and `"html"`.
#[test]
fn test_output_format_serialization() {
    use kreuzberg::core::config::OutputFormat;

    let cases = [
        (OutputFormat::Plain, "plain", "Failed to serialize Plain"),
        (OutputFormat::Markdown, "markdown", "Failed to serialize Markdown"),
        (OutputFormat::Html, "html", "Failed to serialize Html"),
    ];

    for (variant, expected, failure_message) in cases {
        let encoded = serde_json::to_value(variant).expect(failure_message);
        assert_eq!(encoded, expected);
    }
}
|
||||
|
||||
#[test]
fn test_output_format_deserialization() {
    use kreuzberg::core::config::OutputFormat;

    // (JSON string, expected variant, message used if deserialization fails)
    let cases = [
        ("plain", OutputFormat::Plain, "Failed to deserialize plain"),
        ("markdown", OutputFormat::Markdown, "Failed to deserialize markdown"),
        ("html", OutputFormat::Html, "Failed to deserialize html"),
    ];

    for (text, expected, failure) in cases {
        let parsed: OutputFormat = serde_json::from_value(serde_json::json!(text)).expect(failure);
        assert_eq!(parsed, expected);
    }
}
|
||||
|
||||
#[test]
fn test_extraction_config_roundtrip_with_output_format() {
    use kreuzberg::core::config::{ExtractionConfig, OutputFormat};

    // Start from defaults with a non-default-looking output format set explicitly.
    let before = ExtractionConfig {
        output_format: OutputFormat::Markdown,
        ..ExtractionConfig::default()
    };

    // Serialize to a JSON string and read it straight back.
    let encoded = serde_json::to_string(&before).expect("Failed to serialize");
    let after: ExtractionConfig = serde_json::from_str(&encoded).expect("Failed to deserialize");

    assert_eq!(
        before.output_format, after.output_format,
        "output_format should survive serialization roundtrip"
    );
}
|
||||
|
||||
#[test]
fn test_config_with_all_output_formats() {
    use kreuzberg::core::config::{ExtractionConfig, OutputFormat};

    // Each listed variant must come back unchanged after a JSON value roundtrip.
    for format in [OutputFormat::Plain, OutputFormat::Markdown, OutputFormat::Html] {
        let config = ExtractionConfig {
            output_format: format.clone(),
            ..ExtractionConfig::default()
        };

        let as_json = serde_json::to_value(&config).expect("Failed to serialize");
        let restored: ExtractionConfig = serde_json::from_value(as_json).expect("Failed to deserialize");

        assert_eq!(
            format, restored.output_format,
            "Format should be preserved for {:?}",
            format
        );
    }
}
|
||||
|
||||
#[test]
fn test_config_partial_json_with_output_format() {
    use kreuzberg::core::config::{ExtractionConfig, OutputFormat};

    // A config that specifies nothing but `output_format`.
    let partial = serde_json::json!({
        "output_format": "markdown",
    });

    let parsed: ExtractionConfig = serde_json::from_value(partial).expect("Failed to parse partial config");

    assert_eq!(
        parsed.output_format,
        OutputFormat::Markdown,
        "output_format should be set from partial config"
    );

    // Omitted fields are expected to fall back to their defaults.
    assert!(parsed.use_cache, "use_cache should have default value");
}
|
||||
|
||||
#[test]
fn test_config_complete_json_structure() {
    // Serialize the default config and inspect the resulting JSON object.
    let serialized =
        serde_json::to_value(kreuzberg::core::config::ExtractionConfig::default()).expect("Failed to serialize");
    let fields = serialized.as_object().expect("Should be object");

    // Keys that must be present in the serialized form.
    let required = [
        "output_format",
        "use_cache",
        "enable_quality_processing",
        "force_ocr",
        "result_format",
    ];

    for key in required {
        assert!(fields.contains_key(key), "Should have {}", key);
    }
}
|
||||
|
||||
#[test]
fn test_unknown_output_format_accepted_as_custom() {
    use kreuzberg::core::config::{ExtractionConfig, OutputFormat};

    // An unrecognized format name is expected to land in `OutputFormat::Custom`
    // (treated as a custom renderer name) instead of failing deserialization.
    let input = serde_json::json!({
        "output_format": "my_custom_renderer",
    });

    let parsed: Result<ExtractionConfig, _> = serde_json::from_value(input);

    assert!(
        parsed.is_ok(),
        "Unknown output_format should be accepted as Custom variant; got: {:?}",
        parsed.err()
    );

    let config = parsed.unwrap();
    assert_eq!(
        config.output_format,
        OutputFormat::Custom("my_custom_renderer".to_string()),
        "Unknown format string must deserialize as OutputFormat::Custom"
    );
}
|
||||
|
||||
#[test]
fn test_config_case_sensitivity() {
    use kreuzberg::core::config::{ExtractionConfig, OutputFormat};

    // Checks that the lowercase spelling "plain" deserializes to `OutputFormat::Plain`.
    // Other casings are not exercised by this test.
    let input = serde_json::json!({"output_format": "plain"});
    let parsed: Result<ExtractionConfig, _> = serde_json::from_value(input);

    assert!(parsed.is_ok(), "lowercase 'plain' should be accepted");

    let config = parsed.unwrap();
    assert_eq!(config.output_format, OutputFormat::Plain);
}
|
||||
|
||||
#[test]
fn test_output_format_field_is_required_in_serialization() {
    // Even an untouched default config must emit an `output_format` key.
    let defaults = kreuzberg::core::config::ExtractionConfig::default();
    let serialized = serde_json::to_value(&defaults).expect("Failed to serialize");

    let has_output_format = serialized.get("output_format").is_some();
    assert!(
        has_output_format,
        "output_format should always be present in serialization"
    );
}
|
||||
|
||||
#[test]
fn test_result_format_and_output_format_independent() {
    use kreuzberg::core::config::{ExtractionConfig, OutputFormat};

    // Change only `output_format`; `result_format` is left at its default.
    let config = {
        let mut cfg = ExtractionConfig::default();
        cfg.output_format = OutputFormat::Markdown;
        cfg
    };

    let serialized = serde_json::to_value(&config).expect("Failed to serialize");

    assert_eq!(serialized["output_format"], "markdown");
    // `result_format` must still be serialized as its own string field.
    assert!(
        serialized["result_format"].is_string(),
        "result_format should also be present"
    );
}
|
||||
|
||||
#[test]
fn test_extraction_config_clone_preserves_format() {
    use kreuzberg::core::config::{ExtractionConfig, OutputFormat};

    // Build a config with an explicit output format, then clone it.
    let source = ExtractionConfig {
        output_format: OutputFormat::Html,
        ..ExtractionConfig::default()
    };
    let copy = source.clone();

    assert_eq!(
        source.output_format, copy.output_format,
        "Cloned config should preserve output_format"
    );
}
|
||||
355
crates/kreuzberg-cli/tests/contract_cli.rs
Normal file
355
crates/kreuzberg-cli/tests/contract_cli.rs
Normal file
@@ -0,0 +1,355 @@
|
||||
//! CLI contract tests - verify CLI config parsing matches Rust core
|
||||
//!
|
||||
//! This test suite validates that the CLI's configuration parsing produces
|
||||
//! identical results to the Rust core library. It ensures that users get
|
||||
//! consistent behavior whether using the CLI, SDK, or MCP interfaces.
|
||||
|
||||
use kreuzberg::core::config::ExtractionConfig;
|
||||
use kreuzberg::core::config::OutputFormat;
|
||||
use serde_json::json;
|
||||
|
||||
#[test]
fn test_cli_config_json_flag_basic_parsing() {
    let raw = r#"{"use_cache": true, "output_format": "plain"}"#;

    // Path 1: deserialize the string directly, as the Rust core would.
    let direct: ExtractionConfig = serde_json::from_str(raw).expect("Failed to deserialize config string");

    // Path 2: emulate `--config-json`, going through an intermediate JSON value.
    let intermediate: serde_json::Value = serde_json::from_str(raw).expect("Failed to parse JSON string");
    let via_cli: ExtractionConfig =
        serde_json::from_value(intermediate).expect("Failed to deserialize from JSON value");

    // Both paths must agree on every field that was set.
    assert_eq!(direct.use_cache, via_cli.use_cache, "use_cache should be identical");
    assert_eq!(
        direct.output_format, via_cli.output_format,
        "output_format should be identical"
    );
}
|
||||
|
||||
#[test]
fn test_cli_nested_config_deserialization() {
    // Nested `chunking` and `ocr` sections must deserialize into their sub-configs.
    let config_str = r#"{
        "chunking": {
            "max_characters": 1000,
            "overlap": 200
        },
        "ocr": {
            "backend": "tesseract"
        }
    }"#;

    let config: ExtractionConfig = serde_json::from_str(config_str).expect("Failed to deserialize nested config");

    assert!(config.chunking.is_some(), "Chunking config should be present");
    assert!(config.ocr.is_some(), "OCR config should be present");

    // Failure messages name the fields actually being checked, so a failing
    // assertion points at the right key.
    let chunking = config.chunking.unwrap();
    assert_eq!(chunking.max_characters, 1000, "max_characters should be 1000");
    assert_eq!(chunking.overlap, 200, "overlap should be 200");

    let ocr = config.ocr.unwrap();
    assert_eq!(ocr.backend, "tesseract", "backend should be tesseract");
}
|
||||
|
||||
#[test]
fn test_cli_force_ocr_flag_parsing() {
    // Only `force_ocr` is supplied; everything else should come from defaults.
    let parsed: ExtractionConfig =
        serde_json::from_str(r#"{"force_ocr": true}"#).expect("Failed to deserialize force_ocr config");

    assert!(parsed.force_ocr, "force_ocr should be true");
    // An unspecified field keeps its default value.
    assert!(parsed.use_cache, "use_cache should still be true by default");
}
|
||||
|
||||
#[test]
fn test_cli_max_concurrent_extractions_parsing() {
    // A numeric limit in JSON should surface as `Some(limit)` on the config.
    let raw = r#"{"max_concurrent_extractions": 8}"#;
    let parsed: ExtractionConfig = serde_json::from_str(raw).expect("Failed to deserialize concurrent extractions");

    let limit = parsed.max_concurrent_extractions;
    assert_eq!(limit, Some(8), "max_concurrent_extractions should be 8");
}
|
||||
|
||||
#[test]
fn test_cli_complex_config_deserialization() {
    // A config that sets top-level flags and both nested sections at once.
    let raw = r#"{
        "use_cache": false,
        "enable_quality_processing": true,
        "force_ocr": true,
        "output_format": "markdown",
        "result_format": "unified",
        "max_concurrent_extractions": 16,
        "ocr": {
            "backend": "tesseract",
            "language": "eng"
        },
        "chunking": {
            "max_characters": 2000,
            "overlap": 400,
            "strategy": "sliding_window"
        }
    }"#;

    let parsed: ExtractionConfig = serde_json::from_str(raw).expect("Failed to deserialize complex config");

    // Top-level scalar fields.
    assert!(!parsed.use_cache);
    assert!(parsed.enable_quality_processing);
    assert!(parsed.force_ocr);
    assert_eq!(parsed.max_concurrent_extractions, Some(16));

    // Nested sections must both be present before their contents are inspected.
    assert!(parsed.ocr.is_some());
    assert!(parsed.chunking.is_some());

    let ocr_section = parsed.ocr.unwrap();
    assert_eq!(ocr_section.backend, "tesseract");
    assert_eq!(ocr_section.language, "eng");

    let chunking_section = parsed.chunking.unwrap();
    assert_eq!(chunking_section.max_characters, 2000);
    assert_eq!(chunking_section.overlap, 400);
}
|
||||
|
||||
#[test]
fn test_cli_empty_config_uses_defaults() {
    // An empty JSON object must yield a config made entirely of default values.
    let parsed: ExtractionConfig = serde_json::from_str(r#"{}"#).expect("Failed to deserialize empty config");

    assert!(parsed.use_cache, "Default use_cache should be true");
    assert!(
        parsed.enable_quality_processing,
        "Default enable_quality_processing should be true"
    );
    assert!(!parsed.force_ocr, "Default force_ocr should be false");
    assert_eq!(
        parsed.max_concurrent_extractions, None,
        "Default max_concurrent_extractions should be None"
    );
}
|
||||
|
||||
#[test]
fn test_cli_roundtrip_preserves_all_fields() {
    let raw = r#"{
        "use_cache": false,
        "force_ocr": true,
        "max_concurrent_extractions": 12
    }"#;

    // String -> config -> JSON value -> config.
    let first_pass: ExtractionConfig = serde_json::from_str(raw).expect("Failed to deserialize");
    let as_value = serde_json::to_value(&first_pass).expect("Failed to serialize");
    let second_pass: ExtractionConfig =
        serde_json::from_value(as_value).expect("Failed to deserialize roundtripped config");

    // The explicitly set fields must survive both hops.
    assert!(!second_pass.use_cache);
    assert!(second_pass.force_ocr);
    assert_eq!(second_pass.max_concurrent_extractions, Some(12));
}
|
||||
|
||||
#[test]
fn test_cli_output_format_enum_parsing() {
    // Each JSON snippet is paired with the variant it must deserialize to.
    let cases = [
        (r#"{"output_format": "plain"}"#, OutputFormat::Plain),
        (r#"{"output_format": "markdown"}"#, OutputFormat::Markdown),
        (r#"{"output_format": "html"}"#, OutputFormat::Html),
    ];

    for (raw, expected) in cases {
        let parsed: ExtractionConfig = match serde_json::from_str(raw) {
            Ok(config) => config,
            Err(_) => panic!("Failed to deserialize {}", raw),
        };

        assert_eq!(
            parsed.output_format, expected,
            "output_format should match expected value"
        );
    }
}
|
||||
|
||||
#[test]
fn test_cli_result_format_enum_parsing() {
    // Both `result_format` spellings must be accepted by the deserializer.
    let inputs = [
        r#"{"result_format": "unified"}"#,
        r#"{"result_format": "element_based"}"#,
    ];

    for raw in inputs {
        let accepted = serde_json::from_str::<ExtractionConfig>(raw).is_ok();
        assert!(accepted, "Should deserialize result_format from {}", raw);
    }
}
|
||||
|
||||
#[test]
fn test_cli_base64_encoded_config_simulation() {
    // Brings `encode`/`decode` into scope for the standard base64 engine.
    use base64::Engine;

    // Mimics the `--config-json-base64` flow: JSON -> base64 -> JSON -> config.
    let payload = json!({
        "force_ocr": true,
        "output_format": "markdown"
    })
    .to_string();

    let encoded = base64::engine::general_purpose::STANDARD.encode(&payload);

    // Decode the way the CLI would, then interpret the bytes as UTF-8 text.
    let raw_bytes = base64::engine::general_purpose::STANDARD
        .decode(&encoded)
        .expect("Failed to decode base64");
    let decoded = String::from_utf8(raw_bytes).expect("Failed to convert bytes to string");

    let parsed: ExtractionConfig = serde_json::from_str(&decoded).expect("Failed to deserialize base64-decoded config");

    assert!(parsed.force_ocr);
    assert_eq!(parsed.output_format, OutputFormat::Markdown);
}
|
||||
|
||||
#[test]
fn test_cli_partial_override_merging() {
    // Emulates a CLI merge: defaults are serialized to JSON, override keys are
    // written on top, and the result is deserialized back into a config.
    let overrides = json!({"force_ocr": true, "use_cache": false});
    let mut merged_json =
        serde_json::to_value(ExtractionConfig::default()).expect("Failed to serialize base config");

    // Merging only happens when both sides are JSON objects.
    if let (Some(target), serde_json::Value::Object(patch)) = (merged_json.as_object_mut(), overrides) {
        target.extend(patch);
    }

    let merged: ExtractionConfig = serde_json::from_value(merged_json).expect("Failed to deserialize merged config");

    assert!(merged.force_ocr, "Override should apply force_ocr");
    assert!(!merged.use_cache, "Override should apply use_cache");
    assert!(
        merged.enable_quality_processing,
        "Unoverridden field should retain default"
    );
}
|
||||
|
||||
#[test]
fn test_cli_invalid_json_error_handling() {
    // Syntactically valid JSON that carries a key the config does not define.
    let raw = r#"{"force_ocr": true, "invalid_field": "value"}"#;

    // Whether this is accepted depends on the deserializer's unknown-field policy,
    // so the test is deliberately lenient: a rejection is tolerated, and an
    // accepted config must still carry the known field.
    match serde_json::from_str::<ExtractionConfig>(raw) {
        Ok(config) => assert!(config.force_ocr),
        Err(_) => {}
    }
}
|
||||
|
||||
#[test]
fn test_cli_whitespace_handling_in_json() {
    // The same document written with three different whitespace layouts.
    let layouts = [
        // Compact, no spaces
        r#"{"force_ocr":true}"#,
        // Padded with extra spaces
        r#"{ "force_ocr" : true }"#,
        // Spread over several lines
        r#"{
"force_ocr": true
}"#,
    ];

    for raw in layouts {
        let parsed: ExtractionConfig = match serde_json::from_str(raw) {
            Ok(config) => config,
            Err(_) => panic!("Failed to parse: {}", raw),
        };

        assert!(parsed.force_ocr);
    }
}
|
||||
|
||||
#[test]
fn test_cli_numeric_boundary_values() {
    // Small, large and zero values for `max_concurrent_extractions`;
    // zero is included as an edge case and is expected to parse as `Some(0)`.
    let cases = [
        (r#"{"max_concurrent_extractions": 1}"#, Some(1)),
        (r#"{"max_concurrent_extractions": 256}"#, Some(256)),
        (r#"{"max_concurrent_extractions": 0}"#, Some(0)),
    ];

    for (raw, expected) in cases {
        let parsed: ExtractionConfig = match serde_json::from_str(raw) {
            Ok(config) => config,
            Err(_) => panic!("Failed to parse: {}", raw),
        };

        assert_eq!(
            parsed.max_concurrent_extractions, expected,
            "Numeric values should be parsed correctly"
        );
    }
}
|
||||
|
||||
#[test]
fn test_cli_boolean_values_strict_parsing() {
    // JSON `true` / `false` literals must map onto the matching Rust booleans.
    let cases = [
        (r#"{"use_cache": true}"#, true),
        (r#"{"use_cache": false}"#, false),
    ];

    for (raw, expected) in cases {
        let parsed: ExtractionConfig = match serde_json::from_str(raw) {
            Ok(config) => config,
            Err(_) => panic!("Failed to parse: {}", raw),
        };

        assert_eq!(parsed.use_cache, expected);
    }
}
|
||||
|
||||
#[test]
fn test_cli_config_consistency_across_formats() {
    // A config assembled in code rather than parsed from text.
    let built = ExtractionConfig {
        use_cache: false,
        enable_quality_processing: true,
        force_ocr: true,
        output_format: OutputFormat::Markdown,
        max_concurrent_extractions: Some(4),
        ..Default::default()
    };

    // Config -> JSON value -> JSON string -> config, as the CLI would receive it.
    let as_text = serde_json::to_value(&built)
        .expect("Failed to serialize")
        .to_string();
    let reparsed: ExtractionConfig = serde_json::from_str(&as_text).expect("Failed to deserialize from string");

    // Every explicitly set field must match after the roundtrip.
    assert_eq!(reparsed.use_cache, built.use_cache);
    assert_eq!(reparsed.enable_quality_processing, built.enable_quality_processing);
    assert_eq!(reparsed.force_ocr, built.force_ocr);
    assert_eq!(reparsed.output_format, built.output_format);
    assert_eq!(reparsed.max_concurrent_extractions, built.max_concurrent_extractions);
}
|
||||
|
||||
// Note: `base64::Engine` is imported inside `test_cli_base64_encoded_config_simulation`
// itself, so no file-level import or re-export is required for the base64 test.
|
||||
603
crates/kreuzberg-cli/tests/e2e_config_test.rs
Normal file
603
crates/kreuzberg-cli/tests/e2e_config_test.rs
Normal file
@@ -0,0 +1,603 @@
|
||||
//! Comprehensive CLI end-to-end integration tests for configuration flags.
|
||||
//!
|
||||
//! This test suite validates the new configuration features including:
|
||||
//! - `--config-json` for inline JSON configuration
|
||||
//! - `--config-json-base64` for base64-encoded JSON configuration
|
||||
//! - `--output-format` flag with all variants (plain, markdown, djot, html)
|
||||
//! - Flag precedence (CLI args > JSON config > file > defaults)
|
||||
//! - Config merge scenarios and conflict detection
|
||||
//! - Error handling for invalid inputs
|
||||
//! - Real extraction with new formats
|
||||
|
||||
#![allow(clippy::bool_assert_comparison)]
|
||||
|
||||
use std::path::PathBuf;
|
||||
use std::process::Command;
|
||||
use tempfile::TempDir;
|
||||
|
||||
/// Get the path to the kreuzberg binary.
|
||||
fn get_binary_path() -> String {
|
||||
let manifest_dir = env!("CARGO_MANIFEST_DIR");
|
||||
format!("{}/../../target/debug/kreuzberg", manifest_dir)
|
||||
}
|
||||
|
||||
/// Get the test_documents directory path.
|
||||
fn get_test_documents_dir() -> PathBuf {
|
||||
let manifest_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
|
||||
manifest_dir.parent().unwrap().parent().unwrap().join("test_documents")
|
||||
}
|
||||
|
||||
/// Get a test file path relative to test_documents/.
|
||||
fn get_test_file(relative_path: &str) -> String {
|
||||
get_test_documents_dir()
|
||||
.join(relative_path)
|
||||
.to_string_lossy()
|
||||
.to_string()
|
||||
}
|
||||
|
||||
/// Build the `kreuzberg` binary, at most once per test process.
///
/// Every test calls this helper, and tests in one binary may run in parallel.
/// The build is guarded by a `Once` so `cargo build` is spawned a single time
/// rather than once per test; later callers block until that build finishes.
///
/// # Panics
/// Panics if `cargo` cannot be spawned or the build exits unsuccessfully. A
/// panic inside the guarded section poisons the `Once`, so subsequent callers
/// panic as well instead of running against a missing binary.
fn build_binary() {
    static BUILD: std::sync::Once = std::sync::Once::new();

    BUILD.call_once(|| {
        let status = Command::new("cargo")
            .args(["build", "--bin", "kreuzberg"])
            .status()
            .expect("Failed to build kreuzberg binary");

        assert!(status.success(), "Failed to build kreuzberg binary");
    });
}
|
||||
|
||||
/// Helper to create a temporary config file with specified content.
|
||||
fn create_test_config(dir: &TempDir, name: &str, content: &str) -> PathBuf {
|
||||
let config_path = dir.path().join(name);
|
||||
std::fs::write(&config_path, content).expect("Failed to write config file");
|
||||
config_path
|
||||
}
|
||||
|
||||
/// Helper to encode a string as standard, padded base64 (RFC 4648 alphabet).
///
/// The input's UTF-8 bytes are processed in groups of three; each group becomes
/// four output characters, with `=` padding when the final group is short.
/// An empty input yields an empty string.
fn to_base64(input: &str) -> String {
    const CHARSET: &[u8] = b"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";

    let bytes = input.as_bytes();
    // Every (possibly partial) 3-byte group produces exactly 4 characters.
    let mut encoded = String::with_capacity(bytes.len().div_ceil(3) * 4);

    for group in bytes.chunks(3) {
        // Missing trailing bytes are treated as zero when packing the 24-bit word.
        let b0 = u32::from(group[0]);
        let b1 = group.get(1).copied().map_or(0, u32::from);
        let b2 = group.get(2).copied().map_or(0, u32::from);
        let word = (b0 << 16) | (b1 << 8) | b2;

        // Extracts the 6-bit value at `shift` and maps it through the alphabet.
        let sextet = |shift: u32| char::from(CHARSET[((word >> shift) & 0x3F) as usize]);

        encoded.push(sextet(18));
        encoded.push(sextet(12));
        encoded.push(if group.len() > 1 { sextet(6) } else { '=' });
        encoded.push(if group.len() > 2 { sextet(0) } else { '=' });
    }

    encoded
}
|
||||
|
||||
// ============================================================================
|
||||
// Test 1: --config-json inline flag with complex configuration
|
||||
// ============================================================================
|
||||
|
||||
#[test]
fn test_cli_config_json_inline() {
    build_binary();

    // Skip (rather than fail) when the fixture is not checked out.
    let test_file = get_test_file("text/simple.txt");
    if !std::path::Path::new(&test_file).exists() {
        eprintln!("Skipping test: {} not found", test_file);
        return;
    }

    // Pass the whole configuration inline as a JSON string.
    let inline_config = r#"{"use_cache": false, "chunking": {"max_chars": 512}}"#;
    let output = Command::new(get_binary_path())
        .args(["extract", test_file.as_str(), "--config-json", inline_config])
        .output()
        .expect("Failed to execute extract command with --config-json");

    assert!(
        output.status.success(),
        "Extract command with --config-json failed: {}",
        String::from_utf8_lossy(&output.stderr)
    );
    assert!(!output.stdout.is_empty(), "Output should not be empty");
}
|
||||
|
||||
// ============================================================================
|
||||
// Test 2: --config-json-base64 flag for base64-encoded configuration
|
||||
// ============================================================================
|
||||
|
||||
#[test]
fn test_cli_config_json_base64() {
    build_binary();

    // Skip (rather than fail) when the fixture is not checked out.
    let test_file = get_test_file("text/simple.txt");
    if !std::path::Path::new(&test_file).exists() {
        eprintln!("Skipping test: {} not found", test_file);
        return;
    }

    // The JSON config is handed to the CLI in base64-encoded form.
    let encoded_config = to_base64(r#"{"use_cache": false}"#);

    let output = Command::new(get_binary_path())
        .args([
            "extract",
            test_file.as_str(),
            "--config-json-base64",
            encoded_config.as_str(),
        ])
        .output()
        .expect("Failed to execute extract command with --config-json-base64");

    assert!(
        output.status.success(),
        "Extract command with --config-json-base64 failed: {}",
        String::from_utf8_lossy(&output.stderr)
    );
    assert!(!output.stdout.is_empty(), "Output should not be empty");
}
|
||||
|
||||
// ============================================================================
|
||||
// Test 3: Flag precedence verification (CLI flags > JSON > file > defaults)
|
||||
// ============================================================================
|
||||
|
||||
#[test]
fn test_cli_flag_precedence() {
    build_binary();

    // Skip (rather than fail) when the fixture is not checked out.
    let test_file = get_test_file("text/simple.txt");
    if !std::path::Path::new(&test_file).exists() {
        eprintln!("Skipping test: {} not found", test_file);
        return;
    }

    let temp_dir = TempDir::new().expect("Failed to create temp directory");

    // The file enables the cache; the inline JSON passed below disables it.
    let config_path = create_test_config(
        &temp_dir,
        "config.toml",
        r#"
use_cache = true

[chunking]
max_chars = 1024
"#,
    );
    let config_arg = config_path.to_string_lossy();

    // Supply both sources at once; the command must accept the combination.
    let output = Command::new(get_binary_path())
        .args([
            "extract",
            test_file.as_str(),
            "--config",
            &*config_arg,
            "--config-json",
            r#"{"use_cache": false}"#,
        ])
        .output()
        .expect("Failed to execute command with precedence test");

    assert!(
        output.status.success(),
        "Precedence test command failed: {}",
        String::from_utf8_lossy(&output.stderr)
    );
}
|
||||
|
||||
// ============================================================================
|
||||
// Test 4: --output-format flag with all variants (plain, markdown, djot, html)
|
||||
// ============================================================================
|
||||
|
||||
#[test]
fn test_cli_output_format_all_variants() {
    build_binary();

    // Skip (rather than fail) when the fixture is not checked out.
    let test_file = get_test_file("text/simple.txt");
    if !std::path::Path::new(&test_file).exists() {
        eprintln!("Skipping test: {} not found", test_file);
        return;
    }

    // Run one extraction per `--output-format` value.
    for format in ["plain", "markdown", "djot", "html"] {
        let run = Command::new(get_binary_path())
            .args(["extract", test_file.as_str(), "--output-format", format])
            .output();
        let output = match run {
            Ok(output) => output,
            Err(_) => panic!("Failed to execute extract with --output-format {}", format),
        };

        assert!(
            output.status.success(),
            "Extract command with --output-format {} failed: {}",
            format,
            String::from_utf8_lossy(&output.stderr)
        );
        assert!(
            !output.stdout.is_empty(),
            "Output for format {} should not be empty",
            format
        );
    }
}
|
||||
|
||||
// ============================================================================
|
||||
// Test 5: Output formats (text vs json) for extraction result
|
||||
// ============================================================================
|
||||
|
||||
#[test]
fn test_cli_result_format() {
    build_binary();

    // Skip (rather than fail) when the fixture is not checked out.
    let test_file = get_test_file("text/simple.txt");
    if !std::path::Path::new(&test_file).exists() {
        eprintln!("Skipping test: {} not found", test_file);
        return;
    }

    // --- `--format text` -----------------------------------------------------
    let text_run = Command::new(get_binary_path())
        .args(["extract", test_file.as_str(), "--format", "text"])
        .output()
        .expect("Failed to execute extract with --format text");

    assert!(
        text_run.status.success(),
        "Text format output failed: {}",
        String::from_utf8_lossy(&text_run.stderr)
    );
    assert!(!text_run.stdout.is_empty(), "Text output should not be empty");

    // --- `--format json` -----------------------------------------------------
    let json_run = Command::new(get_binary_path())
        .args(["extract", test_file.as_str(), "--format", "json"])
        .output()
        .expect("Failed to execute extract with --format json");

    assert!(
        json_run.status.success(),
        "JSON format output failed: {}",
        String::from_utf8_lossy(&json_run.stderr)
    );

    let json_stdout = String::from_utf8_lossy(&json_run.stdout);
    let envelope: serde_json::Value = match serde_json::from_str(&json_stdout) {
        Ok(value) => value,
        Err(_) => panic!("JSON output should be valid JSON, got: {}", json_stdout),
    };

    // The output is an envelope whose `result` object holds the extraction itself.
    for key in ["result", "extraction_time_ms"] {
        assert!(
            envelope.get(key).is_some(),
            "JSON envelope should have '{}' field",
            key
        );
    }

    let result = &envelope["result"];
    for key in ["content", "mime_type"] {
        assert!(result.get(key).is_some(), "result should have '{}' field", key);
    }
}
|
||||
|
||||
// ============================================================================
|
||||
// Test 6: Deprecated --content-format flag warning
|
||||
// ============================================================================
|
||||
|
||||
#[test]
fn test_cli_content_format_deprecated_warning() {
    build_binary();

    // Skip (rather than fail) when the fixture is not checked out.
    let test_file = get_test_file("text/simple.txt");
    if !std::path::Path::new(&test_file).exists() {
        eprintln!("Skipping test: {} not found", test_file);
        return;
    }

    // Invoke the deprecated `--content-format` flag.
    let output = Command::new(get_binary_path())
        .args(["extract", test_file.as_str(), "--content-format", "plain"])
        .output()
        .expect("Failed to execute extract with --content-format");

    // This is only a smoke check: the exact deprecation behaviour is an
    // implementation detail, so either a clean exit or any stdout is accepted.
    let succeeded = output.status.success();
    let produced_output = !output.stdout.is_empty();
    assert!(
        succeeded || produced_output,
        "Command should succeed or produce output"
    );
}
|
||||
|
||||
// ============================================================================
|
||||
// Test 7: Config merge scenarios - multiple configuration sources
|
||||
// ============================================================================
|
||||
|
||||
/// Combines a TOML config file (`--config`) with inline JSON (`--config-json`)
/// in a single `extract` invocation and requires the command to succeed.
///
/// Only the exit status is checked; which source wins for `use_cache` is not
/// inspected by this test.
#[test]
fn test_cli_config_merge_scenarios() {
    build_binary();

    let input = get_test_file("text/simple.txt");
    if !PathBuf::from(&input).exists() {
        eprintln!("Skipping test: {} not found", input);
        return;
    }

    let scratch = TempDir::new().expect("Failed to create temp directory");

    // Base configuration on disk; the inline JSON below targets the same `use_cache` key.
    let base_toml = r#"
use_cache = true

[chunking]
max_chars = 1024
"#;
    let base_path = create_test_config(&scratch, "base.toml", base_toml);
    let base_arg = base_path.to_string_lossy();

    let cli_args = [
        "extract",
        input.as_str(),
        "--config",
        base_arg.as_ref(),
        "--config-json",
        r#"{"use_cache": false}"#,
    ];
    let run = Command::new(get_binary_path())
        .args(cli_args)
        .output()
        .expect("Failed to merge configs");

    let stderr = String::from_utf8_lossy(&run.stderr);
    assert!(run.status.success(), "Config merge failed: {}", stderr);
}
|
||||
|
||||
// ============================================================================
|
||||
// Test 8: Invalid JSON error handling
|
||||
// ============================================================================
|
||||
|
||||
/// Passes malformed JSON to `--config-json` and expects a graceful failure:
/// a non-zero exit status plus some diagnostic text on stderr or stdout.
#[test]
fn test_cli_invalid_json_error() {
    build_binary();

    let input = get_test_file("text/simple.txt");
    if !PathBuf::from(&input).exists() {
        eprintln!("Skipping test: {} not found", input);
        return;
    }

    // Deliberately truncated JSON object (no closing quote or brace).
    let malformed = r#"{"invalid json without closing"#;

    let run = Command::new(get_binary_path())
        .args(["extract", input.as_str(), "--config-json", malformed])
        .output()
        .expect("Failed to execute command");

    assert!(!run.status.success(), "Command should fail with invalid JSON");

    // Some feedback must reach the user on at least one of the two streams.
    let stderr_text = String::from_utf8_lossy(&run.stderr);
    let stdout_text = String::from_utf8_lossy(&run.stdout);
    let silent = stderr_text.is_empty() && stdout_text.is_empty();
    assert!(!silent, "Should provide feedback about invalid JSON");
}
|
||||
|
||||
// ============================================================================
|
||||
// Test 9: Config flag conflicts
|
||||
// ============================================================================
|
||||
|
||||
/// Supplies `--config`, `--config-json` and `--config-json-base64` together.
///
/// Whether the CLI accepts the combination (e.g. last flag wins) or rejects it
/// as mutually exclusive is implementation-defined, so both a zero and a
/// non-zero exit are accepted. What is *not* accepted is a crash: the process
/// must exit with a real exit code (not be killed by a signal) and that code
/// must not be 101, the status a Rust binary exits with when it panics.
#[test]
fn test_cli_conflicts() {
    build_binary();

    let test_file = get_test_file("text/simple.txt");
    if !PathBuf::from(&test_file).exists() {
        eprintln!("Skipping test: {} not found", test_file);
        return;
    }

    let temp_dir = TempDir::new().expect("Failed to create temp directory");
    let config_content = "use_cache = true\n";
    let config_path = create_test_config(&temp_dir, "config.toml", config_content);

    // Using both --config-json and --config-json-base64 might conflict
    let json_config = r#"{"use_cache": false}"#;
    let base64_config = to_base64(json_config);

    let output = Command::new(get_binary_path())
        .args([
            "extract",
            test_file.as_str(),
            "--config",
            config_path.to_string_lossy().as_ref(),
            "--config-json",
            r#"{"chunking": {"max_chars": 512}}"#,
            "--config-json-base64",
            base64_config.as_str(),
        ])
        .output()
        .expect("Failed to execute command with potential conflicts");

    let stderr = String::from_utf8_lossy(&output.stderr);
    let exit_code = output.status.code();

    // `code()` is `None` on Unix when the process was terminated by a signal.
    assert!(
        exit_code.is_some(),
        "CLI was terminated by a signal instead of exiting normally: {}",
        stderr
    );
    // 101 is the exit status of a panicking Rust process.
    assert_ne!(
        exit_code,
        Some(101),
        "CLI panicked when given conflicting config flags: {}",
        stderr
    );
}
|
||||
|
||||
// ============================================================================
|
||||
// Test 10: Real end-to-end extraction with new config formats
|
||||
// ============================================================================
|
||||
|
||||
/// End-to-end run of `extract` combining `--format json`, `--output-format markdown`
/// and an inline `--config-json`.
///
/// Requires a successful exit, stdout that parses as JSON, and the
/// `{ result, extraction_time_ms }` envelope with `content` and `mime_type`
/// inside `result`.
#[test]
fn test_cli_real_extraction() {
    build_binary();

    let sample = get_test_file("text/simple.txt");
    if !PathBuf::from(&sample).exists() {
        eprintln!("Skipping test: {} not found", sample);
        return;
    }

    let cli_args = [
        "extract",
        sample.as_str(),
        "--format",
        "json",
        "--output-format",
        "markdown",
        "--config-json",
        r#"{"use_cache": false, "disable_ocr": true}"#,
    ];
    let run = Command::new(get_binary_path())
        .args(cli_args)
        .output()
        .expect("Failed to execute full E2E extraction");

    assert!(
        run.status.success(),
        "E2E extraction failed: {}",
        String::from_utf8_lossy(&run.stderr)
    );

    let stdout = String::from_utf8_lossy(&run.stdout);

    // Parsing failure is a test failure that echoes the offending output.
    let envelope: serde_json::Value = match serde_json::from_str(&stdout) {
        Ok(parsed) => parsed,
        Err(_) => panic!("E2E output should be valid JSON, got: {}", stdout),
    };

    // Top-level envelope keys.
    assert!(envelope.get("result").is_some(), "Missing 'result' envelope field");
    assert!(
        envelope.get("extraction_time_ms").is_some(),
        "Missing 'extraction_time_ms' field"
    );

    // Keys nested under `result`.
    let result = &envelope["result"];
    assert!(result.get("content").is_some(), "Missing content field in result");
    assert!(result.get("mime_type").is_some(), "Missing mime_type field in result");
}
|
||||
|
||||
// ============================================================================
|
||||
// Additional Edge Cases and Robustness Tests
|
||||
// ============================================================================
|
||||
|
||||
/// Passes an empty JSON object (`{}`) to `--config-json` and requires `extract`
/// to exit successfully.
#[test]
fn test_cli_empty_config_json() {
    build_binary();

    let input = get_test_file("text/simple.txt");
    if !PathBuf::from(&input).exists() {
        eprintln!("Skipping test: {} not found", input);
        return;
    }

    let mut command = Command::new(get_binary_path());
    command.args(["extract", input.as_str(), "--config-json", "{}"]);

    let exit_ok = command
        .output()
        .expect("Failed to execute with empty JSON config")
        .status
        .success();

    assert!(exit_ok, "Command with empty JSON config should succeed");
}
|
||||
|
||||
/// Passes an upper-case value (`MARKDOWN`) to `--output-format`.
///
/// The CLI may either parse the value case-insensitively (exit 0) or reject it
/// (non-zero exit); both are accepted. The test fails only if the process
/// crashes: it must produce a real exit code (not be killed by a signal) and
/// that code must not be 101, the status of a panicking Rust process.
#[test]
fn test_cli_multiple_output_format_variants() {
    build_binary();

    let test_file = get_test_file("text/simple.txt");
    if !PathBuf::from(&test_file).exists() {
        eprintln!("Skipping test: {} not found", test_file);
        return;
    }

    // Test case-insensitive format argument
    let output = Command::new(get_binary_path())
        .args([
            "extract",
            test_file.as_str(),
            "--output-format",
            "MARKDOWN", // uppercase should work or fail predictably
        ])
        .output()
        .expect("Failed to execute");

    let stderr = String::from_utf8_lossy(&output.stderr);
    let exit_code = output.status.code();

    // `code()` is `None` on Unix when the process was terminated by a signal.
    assert!(
        exit_code.is_some(),
        "CLI was terminated by a signal instead of exiting normally: {}",
        stderr
    );
    // 101 is the exit status of a panicking Rust process.
    assert_ne!(
        exit_code,
        Some(101),
        "CLI panicked on upper-case --output-format value: {}",
        stderr
    );
}
|
||||
|
||||
/// Feeds a multi-line JSON document with nested objects to `--config-json`.
///
/// The run is accepted when it exits successfully, or when it fails but writes
/// something to stderr.
#[test]
fn test_cli_config_json_with_nested_objects() {
    build_binary();

    let input = get_test_file("text/simple.txt");
    if !PathBuf::from(&input).exists() {
        eprintln!("Skipping test: {} not found", input);
        return;
    }

    // Nested configuration: a scalar, a one-key object and a two-key object.
    let nested_json = r#"
{
"use_cache": false,
"chunking": {"max_chars": 512},
"language_detection": {
"enabled": true,
"confidence_threshold": 0.8
}
}
"#;

    let mut command = Command::new(get_binary_path());
    command.args(["extract", input.as_str(), "--config-json", nested_json]);
    let run = command
        .output()
        .expect("Failed to execute with nested JSON config");

    let reported_error = !String::from_utf8_lossy(&run.stderr).is_empty();
    assert!(
        run.status.success() || reported_error,
        "Complex config should either work or provide error"
    );
}
|
||||
237
crates/kreuzberg-cli/tests/extract_envelope.rs
Normal file
237
crates/kreuzberg-cli/tests/extract_envelope.rs
Normal file
@@ -0,0 +1,237 @@
|
||||
//! Integration tests for the JSON timing envelope added to `kreuzberg extract` and
|
||||
//! `kreuzberg batch`.
|
||||
//!
|
||||
//! Verifies:
|
||||
//! - `extract --format json` emits `{ result, extraction_time_ms }` shape
|
||||
//! - `batch --format json` emits `{ results, total_ms, per_file_ms }` shape
|
||||
//! - `result.metadata.ocr_used` exists as a bool field
|
||||
//! - `--pdf-backend xyz` exits non-zero and mentions "pdf-oxide"
|
||||
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::process::Command;
|
||||
|
||||
/// Returns path to the compiled `kreuzberg` binary (debug build).
|
||||
fn kreuzberg_bin() -> PathBuf {
|
||||
let manifest_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
|
||||
manifest_dir
|
||||
.parent()
|
||||
.expect("crates/kreuzberg-cli parent")
|
||||
.parent()
|
||||
.expect("crates parent")
|
||||
.join("target")
|
||||
.join("debug")
|
||||
.join("kreuzberg")
|
||||
}
|
||||
|
||||
/// Returns path to the small reference PDF used in these tests.
|
||||
fn pdf_fixture() -> PathBuf {
|
||||
let manifest_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
|
||||
manifest_dir
|
||||
.parent()
|
||||
.expect("crates/kreuzberg-cli parent")
|
||||
.parent()
|
||||
.expect("crates parent")
|
||||
.join("test_documents")
|
||||
.join("pdf")
|
||||
.join("pdfa_001.pdf")
|
||||
}
|
||||
|
||||
/// Returns path to the small plain-text fixture used for batch tests.
|
||||
fn txt_fixture() -> PathBuf {
|
||||
let manifest_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
|
||||
manifest_dir
|
||||
.parent()
|
||||
.expect("crates/kreuzberg-cli parent")
|
||||
.parent()
|
||||
.expect("crates parent")
|
||||
.join("test_documents")
|
||||
.join("text")
|
||||
.join("fake_text.txt")
|
||||
}
|
||||
|
||||
/// Builds the `kreuzberg` binary at most once per test process.
///
/// Every test calls this helper, but the `cargo build` invocation is guarded
/// by a [`std::sync::Once`], so only the first caller actually runs it while
/// concurrent callers block until it finishes.
///
/// # Panics
///
/// Panics if `cargo` cannot be spawned or the build fails. A panic inside the
/// guarded closure poisons the `Once`, so every later caller panics as well
/// instead of running against a missing binary.
fn build_binary() {
    static BUILD: std::sync::Once = std::sync::Once::new();

    BUILD.call_once(|| {
        let status = Command::new("cargo")
            .args(["build", "--bin", "kreuzberg"])
            .status()
            .expect("cargo build invocation failed");
        assert!(status.success(), "cargo build failed — binary unavailable");
    });
}
|
||||
|
||||
/// Skip-guard for fixture-dependent tests: `true` only when `path` refers to
/// an existing regular file.
fn fixture_exists(path: &Path) -> bool {
    // `is_file` already yields `false` for paths that do not exist.
    path.is_file()
}
|
||||
|
||||
// ── extract --format json envelope ──────────────────────────────────────────
|
||||
|
||||
/// `extract --format json` on the PDF fixture must emit the
/// `{ result, extraction_time_ms }` envelope, with a positive numeric timing
/// and a boolean `result.metadata.ocr_used` that is `false`.
#[test]
fn test_extract_json_has_result_and_timing() {
    build_binary();

    let pdf = pdf_fixture();
    if !fixture_exists(&pdf) {
        eprintln!("SKIP: PDF fixture not found at {}", pdf.display());
        return;
    }

    let pdf_arg = pdf.to_string_lossy();
    let run = Command::new(kreuzberg_bin())
        .args(["extract", pdf_arg.as_ref(), "--format", "json"])
        .output()
        .expect("failed to run kreuzberg extract");

    assert!(
        run.status.success(),
        "extract exited non-zero: {}",
        String::from_utf8_lossy(&run.stderr)
    );

    let json: serde_json::Value = serde_json::from_slice(&run.stdout).expect("stdout is not valid JSON");

    // Envelope: `result` present, `extraction_time_ms` numeric and > 0.
    assert!(json.get("result").is_some(), "missing 'result' key in envelope");
    let extraction_time_ms = json["extraction_time_ms"]
        .as_f64()
        .expect("'extraction_time_ms' must be a number");
    assert!(
        extraction_time_ms > 0.0,
        "extraction_time_ms must be positive, got {extraction_time_ms}"
    );

    // `ocr_used` must be present and typed as a boolean.
    let metadata = &json["result"]["metadata"];
    let ocr_field = metadata
        .get("ocr_used")
        .expect("'result.metadata.ocr_used' must be present");
    let ocr_used = ocr_field
        .as_bool()
        .expect("'result.metadata.ocr_used' must be a boolean");

    // No --force-ocr was passed, so the test expects OCR not to have run.
    assert!(!ocr_used, "expected ocr_used=false for native PDF extraction");
}
|
||||
|
||||
// ── batch --format json envelope ─────────────────────────────────────────────
|
||||
|
||||
/// `batch --format json` over two fixtures must emit the
/// `{ results, total_ms, per_file_ms }` envelope: two results, a positive
/// total, one positive timing per file, and a boolean `metadata.ocr_used` on
/// every result.
#[test]
fn test_batch_json_has_results_and_timing() {
    build_binary();

    let pdf = pdf_fixture();
    let txt = txt_fixture();
    let fixtures_ready = fixture_exists(&pdf) && fixture_exists(&txt);
    if !fixtures_ready {
        eprintln!("SKIP: one or more batch fixtures not found");
        return;
    }

    let pdf_arg = pdf.to_string_lossy();
    let txt_arg = txt.to_string_lossy();
    let run = Command::new(kreuzberg_bin())
        .args(["batch", pdf_arg.as_ref(), txt_arg.as_ref(), "--format", "json"])
        .output()
        .expect("failed to run kreuzberg batch");

    assert!(
        run.status.success(),
        "batch exited non-zero: {}",
        String::from_utf8_lossy(&run.stderr)
    );

    let json: serde_json::Value = serde_json::from_slice(&run.stdout).expect("stdout is not valid JSON");

    // `results`: one entry per input file.
    let results = json["results"].as_array().expect("'results' must be an array");
    assert_eq!(results.len(), 2, "expected 2 results for 2 input files");

    // `total_ms`: positive number.
    let total_ms = json["total_ms"].as_f64().expect("'total_ms' must be a number");
    assert!(total_ms > 0.0, "total_ms must be positive, got {total_ms}");

    // `per_file_ms`: one positive number per input file.
    let per_file_ms = json["per_file_ms"]
        .as_array()
        .expect("'per_file_ms' must be an array");
    assert_eq!(per_file_ms.len(), 2, "per_file_ms must have one entry per file");

    let mut i = 0;
    for timing in per_file_ms {
        let ms = timing.as_f64().expect("per_file_ms entry must be a number");
        assert!(ms > 0.0, "per_file_ms[{i}] must be positive, got {ms}");
        i += 1;
    }

    // Every result carries `metadata.ocr_used` typed as a boolean.
    for (i, result) in results.iter().enumerate() {
        let has_bool_flag = result["metadata"]["ocr_used"].is_boolean();
        assert!(has_bool_flag, "results[{i}].metadata.ocr_used must be a bool");
    }
}
|
||||
|
||||
// ── --pdf-backend validation ─────────────────────────────────────────────────
|
||||
|
||||
/// An unknown `--pdf-backend` value must make `extract` exit non-zero with an
/// error on stderr that mentions `pdf-oxide`.
#[test]
fn test_pdf_backend_invalid_value_exits_nonzero() {
    build_binary();

    let pdf = pdf_fixture();
    if !fixture_exists(&pdf) {
        eprintln!("SKIP: PDF fixture not found at {}", pdf.display());
        return;
    }

    let pdf_arg = pdf.to_string_lossy();
    let mut command = Command::new(kreuzberg_bin());
    command.args(["extract", pdf_arg.as_ref(), "--pdf-backend", "xyz"]);
    let run = command.output().expect("failed to run kreuzberg extract");

    let exited_ok = run.status.success();
    assert!(!exited_ok, "expected non-zero exit for unknown --pdf-backend");

    let stderr = String::from_utf8_lossy(&run.stderr);
    let mentions_backend = stderr.contains("pdf-oxide");
    assert!(
        mentions_backend,
        "error message should mention 'pdf-oxide', got: {stderr}"
    );
}
|
||||
|
||||
/// `--pdf-backend pdf-oxide` combined with `--format json` must succeed and
/// still emit the `{ result, extraction_time_ms }` envelope.
#[test]
fn test_pdf_backend_valid_value_succeeds() {
    build_binary();

    let pdf = pdf_fixture();
    if !fixture_exists(&pdf) {
        eprintln!("SKIP: PDF fixture not found at {}", pdf.display());
        return;
    }

    let pdf_arg = pdf.to_string_lossy();
    let cli_args = [
        "extract",
        pdf_arg.as_ref(),
        "--pdf-backend",
        "pdf-oxide",
        "--format",
        "json",
    ];
    let run = Command::new(kreuzberg_bin())
        .args(cli_args)
        .output()
        .expect("failed to run kreuzberg extract");

    assert!(
        run.status.success(),
        "--pdf-backend pdf-oxide should succeed: {}",
        String::from_utf8_lossy(&run.stderr)
    );

    let json: serde_json::Value = serde_json::from_slice(&run.stdout).expect("stdout is not valid JSON");

    let has_result = json.get("result").is_some();
    assert!(has_result, "missing 'result' key");

    let has_timing = json.get("extraction_time_ms").is_some();
    assert!(has_timing, "missing 'extraction_time_ms'");
}
|
||||
153
crates/kreuzberg-cli/tests/server_test.rs
Normal file
153
crates/kreuzberg-cli/tests/server_test.rs
Normal file
@@ -0,0 +1,153 @@
|
||||
//! Integration tests for server commands (serve and mcp).
|
||||
|
||||
#[cfg(not(coverage))]
|
||||
use std::process::{Command, Stdio};
|
||||
#[cfg(not(coverage))]
|
||||
use std::thread;
|
||||
#[cfg(not(coverage))]
|
||||
use std::time::Duration;
|
||||
|
||||
/// Builds the CLI with all features, starts `kreuzberg serve` on
/// 127.0.0.1:18000 and checks the `/health` and `/info` endpoints.
///
/// The server process is wrapped in a drop guard so it is killed and reaped
/// even when one of the assertions below panics. The binary is located
/// relative to `CARGO_TARGET_TMPDIR` rather than the current working
/// directory.
#[cfg(not(coverage))]
#[test]
#[ignore]
fn test_serve_command_starts() {
    /// Kills and reaps the wrapped server process when dropped.
    struct ServerGuard(std::process::Child);

    impl Drop for ServerGuard {
        fn drop(&mut self) {
            // Best effort: the process may already have exited.
            let _ = self.0.kill();
            let _ = self.0.wait();
        }
    }

    let status = Command::new("cargo")
        .args(["build", "--bin", "kreuzberg", "--features", "all"])
        .status()
        .expect("Failed to build binary");

    assert!(status.success(), "Failed to build kreuzberg binary");

    // CARGO_TARGET_TMPDIR lives inside the target directory. Probe its parent
    // and grandparent for `debug/kreuzberg`; the grandparent covers a
    // target-triple subdirectory.
    let exe_name = format!("kreuzberg{}", std::env::consts::EXE_SUFFIX);
    let binary_path = std::path::Path::new(env!("CARGO_TARGET_TMPDIR"))
        .ancestors()
        .skip(1)
        .take(2)
        .map(|root| root.join("debug").join(&exe_name))
        .find(|candidate| candidate.is_file())
        .expect("kreuzberg debug binary not found near CARGO_TARGET_TMPDIR");

    let _server = ServerGuard(
        Command::new(&binary_path)
            .args(["serve", "-H", "127.0.0.1", "-p", "18000"])
            .stdout(Stdio::piped())
            .stderr(Stdio::piped())
            .spawn()
            .expect("Failed to start server"),
    );

    // Give the server time to bind its listener before probing it.
    thread::sleep(Duration::from_secs(3));

    let mut health_response = ureq::get("http://127.0.0.1:18000/health")
        .call()
        .expect("Failed to call health endpoint");

    assert_eq!(health_response.status(), 200);

    let health_json: serde_json::Value = health_response
        .body_mut()
        .read_json()
        .expect("Failed to parse health response");

    assert_eq!(health_json["status"], "healthy");
    assert!(health_json["version"].is_string());

    let mut info_response = ureq::get("http://127.0.0.1:18000/info")
        .call()
        .expect("Failed to call info endpoint");

    assert_eq!(info_response.status(), 200);

    let info_json: serde_json::Value = info_response
        .body_mut()
        .read_json()
        .expect("Failed to parse info response");

    assert!(info_json["rust_backend"].as_bool().unwrap_or(false));

    // `_server` is dropped here, which kills and reaps the server process.
}
|
||||
|
||||
/// Starts `kreuzberg serve` on 127.0.0.1:18001 with a custom TOML config
/// (`-c`) and checks that `/health` answers.
///
/// The test is self-contained: it builds the binary itself, writes the config
/// into `CARGO_TARGET_TMPDIR` instead of the working directory, and uses drop
/// guards so both the server process and the config file are cleaned up even
/// when an assertion panics.
#[cfg(not(coverage))]
#[test]
#[ignore]
fn test_serve_command_with_config() {
    use std::fs;

    /// Removes the wrapped file when dropped.
    struct TempConfig(std::path::PathBuf);

    impl Drop for TempConfig {
        fn drop(&mut self) {
            let _ = fs::remove_file(&self.0);
        }
    }

    /// Kills and reaps the wrapped server process when dropped.
    struct ServerGuard(std::process::Child);

    impl Drop for ServerGuard {
        fn drop(&mut self) {
            // Best effort: the process may already have exited.
            let _ = self.0.kill();
            let _ = self.0.wait();
        }
    }

    let status = Command::new("cargo")
        .args(["build", "--bin", "kreuzberg", "--features", "all"])
        .status()
        .expect("Failed to build binary");

    assert!(status.success(), "Failed to build kreuzberg binary");

    let tmp_dir = std::path::Path::new(env!("CARGO_TARGET_TMPDIR"));

    // Probe the parent and grandparent of the tmp dir for `debug/kreuzberg`;
    // the grandparent covers a target-triple subdirectory.
    let exe_name = format!("kreuzberg{}", std::env::consts::EXE_SUFFIX);
    let binary_path = tmp_dir
        .ancestors()
        .skip(1)
        .take(2)
        .map(|root| root.join("debug").join(&exe_name))
        .find(|candidate| candidate.is_file())
        .expect("kreuzberg debug binary not found near CARGO_TARGET_TMPDIR");

    let config_content = r#"
use_cache = true
enable_quality_processing = true

[ocr]
backend = "tesseract"
language = "eng"
"#;

    let config = TempConfig(tmp_dir.join("test_config.toml"));
    fs::write(&config.0, config_content).expect("Failed to write test config");

    // Declared after `config`, so it is dropped first: the server stops before
    // its config file is removed.
    let _server = ServerGuard(
        Command::new(&binary_path)
            .args(["serve", "-H", "127.0.0.1", "-p", "18001", "-c"])
            .arg(&config.0)
            .stdout(Stdio::piped())
            .stderr(Stdio::piped())
            .spawn()
            .expect("Failed to start server"),
    );

    // Give the server time to bind its listener before probing it.
    thread::sleep(Duration::from_secs(3));

    let health_response = ureq::get("http://127.0.0.1:18001/health").call();

    assert!(health_response.is_ok(), "Server should be running with custom config");
}
|
||||
|
||||
/// Builds the CLI with all features and checks that `kreuzberg serve --help`
/// exits successfully and documents the `--host`, `--port` and `--config`
/// options.
///
/// The binary is located by walking up from `CARGO_TARGET_TMPDIR` rather than
/// by splitting the path on the literal text `"target"`, which breaks when a
/// custom target directory is used or when an earlier path component contains
/// that text.
#[cfg(not(coverage))]
#[test]
fn test_serve_command_help() {
    let build_status = Command::new("cargo")
        .args(["build", "--bin", "kreuzberg", "--features", "all"])
        .status()
        .expect("Failed to build binary");

    assert!(build_status.success(), "Failed to build kreuzberg binary");

    // Probe the parent and grandparent of the tmp dir for `debug/kreuzberg`;
    // the grandparent covers a target-triple subdirectory.
    let exe_name = format!("kreuzberg{}", std::env::consts::EXE_SUFFIX);
    let binary_path = std::path::Path::new(env!("CARGO_TARGET_TMPDIR"))
        .ancestors()
        .skip(1)
        .take(2)
        .map(|root| root.join("debug").join(&exe_name))
        .find(|candidate| candidate.is_file())
        .expect("kreuzberg debug binary not found near CARGO_TARGET_TMPDIR");

    let output = Command::new(&binary_path)
        .args(["serve", "--help"])
        .output()
        .expect("Failed to execute command");

    assert!(output.status.success());

    let stdout = String::from_utf8_lossy(&output.stdout);
    assert!(stdout.contains("Start the API server"));
    assert!(stdout.contains("--host"));
    assert!(stdout.contains("--port"));
    assert!(stdout.contains("--config"));
}
|
||||
|
||||
/// Builds the CLI with all features and checks that `kreuzberg mcp --help`
/// exits successfully and documents the `--config` option.
///
/// The binary is located by walking up from `CARGO_TARGET_TMPDIR` rather than
/// by splitting the path on the literal text `"target"`, which breaks when a
/// custom target directory is used or when an earlier path component contains
/// that text.
#[cfg(not(coverage))]
#[test]
fn test_mcp_command_help() {
    let build_status = Command::new("cargo")
        .args(["build", "--bin", "kreuzberg", "--features", "all"])
        .status()
        .expect("Failed to build binary");

    assert!(build_status.success(), "Failed to build kreuzberg binary");

    // Probe the parent and grandparent of the tmp dir for `debug/kreuzberg`;
    // the grandparent covers a target-triple subdirectory.
    let exe_name = format!("kreuzberg{}", std::env::consts::EXE_SUFFIX);
    let binary_path = std::path::Path::new(env!("CARGO_TARGET_TMPDIR"))
        .ancestors()
        .skip(1)
        .take(2)
        .map(|root| root.join("debug").join(&exe_name))
        .find(|candidate| candidate.is_file())
        .expect("kreuzberg debug binary not found near CARGO_TARGET_TMPDIR");

    let output = Command::new(&binary_path)
        .args(["mcp", "--help"])
        .output()
        .expect("Failed to execute command");

    assert!(output.status.success());

    let stdout = String::from_utf8_lossy(&output.stdout);
    assert!(stdout.contains("Start the MCP (Model Context Protocol) server"));
    assert!(stdout.contains("--config"));
}
|
||||
46
crates/kreuzberg-ffi/Cargo.toml
generated
Normal file
46
crates/kreuzberg-ffi/Cargo.toml
generated
Normal file
@@ -0,0 +1,46 @@
|
||||
[package]
|
||||
name = "kreuzberg-ffi"
|
||||
version = "5.0.0-rc.3"
|
||||
edition = "2021"
|
||||
license = "Elastic-2.0"
|
||||
description = "High-performance document intelligence library"
|
||||
readme = false
|
||||
keywords = ["document", "extraction", "ocr", "pdf", "text"]
|
||||
categories = ["text-processing"]
|
||||
repository = "https://github.com/kreuzberg-dev/kreuzberg"
|
||||
|
||||
# `serde_json`, `ahash`, and `tokio` are emitted unconditionally below so the
|
||||
# manifest is stable across regens (and so the C FFI codegen can pull them in
|
||||
# when an async / Result-typed function appears in the API surface), but for
|
||||
# umbrella crates with no async fns and no JSON-marshalled return types they
|
||||
# are genuinely unused. The conditional `async-trait` / `futures-util` deps
|
||||
# are similarly flagged when the umbrella has trait-bridge / streaming adapters
|
||||
# configured but no actual async-trait / async-stream callsite in the generated
|
||||
# FFI shim.
|
||||
[package.metadata.cargo-machete]
|
||||
ignored = ["ahash", "serde_json", "tokio", "async-trait"]
|
||||
|
||||
[lib]
|
||||
crate-type = ["cdylib", "staticlib", "rlib"]
|
||||
|
||||
[features]
|
||||
default = []
|
||||
|
||||
[dependencies]
|
||||
ahash = "0.8"
|
||||
async-trait = "0.1"
|
||||
serde_json = "1"
|
||||
tokio = { version = "1", features = ["full"] }
|
||||
|
||||
[target.'cfg(not(all(target_os = "android", target_arch = "x86_64")))'.dependencies]
|
||||
kreuzberg = { path = "../kreuzberg", version = "5.0.0-rc.3", features = ["full", "pdf", "ocr", "paddle-ocr", "paddle-ocr-types", "layout-detection", "layout-types", "embeddings", "embedding-presets", "chunking", "keywords-yake", "keywords-rake", "language-detection", "html", "tree-sitter", "office", "email", "archives", "stopwords", "auto-rotate", "auto-rotate-types", "tokio-runtime", "api", "mcp", "liter-llm", "quality"] }
|
||||
|
||||
[target.'cfg(all(target_os = "android", target_arch = "x86_64"))'.dependencies]
|
||||
kreuzberg = { path = "../kreuzberg", version = "5.0.0-rc.3", features = ["android-target"] }
|
||||
|
||||
|
||||
[build-dependencies]
|
||||
cbindgen = "0.29"
|
||||
|
||||
[dev-dependencies]
|
||||
tempfile = "3"
|
||||
295
crates/kreuzberg-ffi/README.md
generated
Normal file
295
crates/kreuzberg-ffi/README.md
generated
Normal file
@@ -0,0 +1,295 @@
|
||||
# FFI (C/C++)
|
||||
|
||||
<div align="center" style="display: flex; flex-wrap: wrap; gap: 8px; justify-content: center; margin: 20px 0;">
|
||||
<a href="https://github.com/kreuzberg-dev/alef">
|
||||
<img src="https://img.shields.io/badge/Bindings-alef%20%D7%90-007ec6" alt="Bindings">
|
||||
</a>
|
||||
<!-- Language Bindings -->
|
||||
<a href="https://crates.io/crates/kreuzberg">
|
||||
<img src="https://img.shields.io/crates/v/kreuzberg?label=Rust&color=007ec6" alt="Rust">
|
||||
</a>
|
||||
<a href="https://pypi.org/project/kreuzberg/">
|
||||
<img src="https://img.shields.io/pypi/v/kreuzberg?label=Python&color=007ec6" alt="Python">
|
||||
</a>
|
||||
<a href="https://www.npmjs.com/package/@kreuzberg/node">
|
||||
<img src="https://img.shields.io/npm/v/@kreuzberg/node?label=Node.js&color=007ec6" alt="Node.js">
|
||||
</a>
|
||||
<a href="https://www.npmjs.com/package/@kreuzberg/wasm">
|
||||
<img src="https://img.shields.io/npm/v/@kreuzberg/wasm?label=WASM&color=007ec6" alt="WASM">
|
||||
</a>
|
||||
<a href="https://central.sonatype.com/artifact/dev.kreuzberg/kreuzberg">
|
||||
<img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
|
||||
</a>
|
||||
<a href="https://github.com/kreuzberg-dev/kreuzberg/tree/main/packages/go/v5">
|
||||
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v5*" alt="Go">
|
||||
</a>
|
||||
<a href="https://www.nuget.org/packages/Kreuzberg/">
|
||||
<img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
|
||||
</a>
|
||||
<a href="https://packagist.org/packages/kreuzberg/kreuzberg">
|
||||
<img src="https://img.shields.io/packagist/v/kreuzberg/kreuzberg?label=PHP&color=007ec6" alt="PHP">
|
||||
</a>
|
||||
<a href="https://rubygems.org/gems/kreuzberg">
|
||||
<img src="https://img.shields.io/gem/v/kreuzberg?label=Ruby&color=007ec6" alt="Ruby">
|
||||
</a>
|
||||
<a href="https://hex.pm/packages/kreuzberg">
|
||||
<img src="https://img.shields.io/hexpm/v/kreuzberg?label=Elixir&color=007ec6" alt="Elixir">
|
||||
</a>
|
||||
<a href="https://kreuzberg-dev.r-universe.dev/kreuzberg">
|
||||
<img src="https://img.shields.io/badge/R-kreuzberg-007ec6" alt="R">
|
||||
</a>
|
||||
<a href="https://pub.dev/packages/kreuzberg">
|
||||
<img src="https://img.shields.io/pub/v/kreuzberg?label=Dart&color=007ec6" alt="Dart">
|
||||
</a>
|
||||
<a href="https://central.sonatype.com/artifact/dev.kreuzberg/kreuzberg-android">
|
||||
<img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg-android?label=Kotlin&color=007ec6" alt="Kotlin">
|
||||
</a>
|
||||
<a href="https://github.com/kreuzberg-dev/kreuzberg/tree/main/packages/swift">
|
||||
<img src="https://img.shields.io/badge/Swift-SPM-007ec6" alt="Swift">
|
||||
</a>
|
||||
<a href="https://github.com/kreuzberg-dev/kreuzberg/tree/main/packages/zig">
|
||||
<img src="https://img.shields.io/badge/Zig-package-007ec6" alt="Zig">
|
||||
</a>
|
||||
<a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
|
||||
<img src="https://img.shields.io/badge/C-FFI-007ec6" alt="C FFI">
|
||||
</a>
|
||||
<a href="https://github.com/kreuzberg-dev/kreuzberg/pkgs/container/kreuzberg">
|
||||
<img src="https://img.shields.io/badge/Docker-ghcr.io-007ec6?logo=docker&logoColor=white" alt="Docker">
|
||||
</a>
|
||||
<a href="https://github.com/kreuzberg-dev/kreuzberg/pkgs/container/charts%2Fkreuzberg">
|
||||
<img src="https://img.shields.io/badge/Helm-ghcr.io-007ec6?logo=helm&logoColor=white" alt="Helm">
|
||||
</a>
|
||||
|
||||
<!-- Project Info -->
|
||||
<a href="https://github.com/kreuzberg-dev/kreuzberg/blob/main/LICENSE">
|
||||
<img src="https://img.shields.io/badge/License-Elastic--2.0-007ec6" alt="License">
|
||||
</a>
|
||||
<a href="https://docs.kreuzberg.dev">
|
||||
<img src="https://img.shields.io/badge/Docs-kreuzberg-007ec6" alt="Documentation">
|
||||
</a>
|
||||
<a href="https://huggingface.co/Kreuzberg">
|
||||
<img src="https://img.shields.io/badge/Hugging%20Face-Kreuzberg-007ec6" alt="Hugging Face">
|
||||
</a>
|
||||
</div>
|
||||
|
||||
<div align="center" style="margin: 24px 0 0;">
|
||||
<a href="https://kreuzberg.dev">
|
||||
<img alt="Kreuzberg" src="https://github.com/user-attachments/assets/419fc06c-8313-4324-b159-4b4d3cfce5c0" />
|
||||
</a>
|
||||
</div>
|
||||
|
||||
<div align="center" style="display: flex; flex-wrap: wrap; gap: 12px; justify-content: center; margin: 28px 0 24px;">
|
||||
<a href="https://discord.gg/xt9WY3GnKR">
|
||||
<img height="22" src="https://img.shields.io/badge/Discord-Chat-007ec6?logo=discord&logoColor=white" alt="Join Discord">
|
||||
</a>
|
||||
<a href="https://docs.kreuzberg.dev/demo.html">
|
||||
<img height="22" src="https://img.shields.io/badge/Live%20Demo-Open-007ec6?logo=webassembly&logoColor=white" alt="Live Demo">
|
||||
</a>
|
||||
</div>
|
||||
|
||||
Extract text, tables, images, and metadata from 90+ file formats and 300+ programming languages including PDF, Office documents, and images. C/C++ FFI bindings providing a stable ABI for native integration, shared library distribution, and cross-language interop.
|
||||
|
||||
## What This Package Provides
|
||||
|
||||
- **Document intelligence core** — extract text, tables, images, metadata, entities, keywords, and code intelligence from one API.
|
||||
- **Format coverage** — PDF, Office, images, HTML/XML, email, archives, notebooks, citations, scientific formats, and plain text.
|
||||
- **OCR choices** — Tesseract, PaddleOCR, EasyOCR where supported, VLM OCR through liter-llm, and plugin hooks for custom backends.
|
||||
- **Same engine as every binding** — Rust, Python, Node.js, Go, Java, PHP, Ruby, .NET, Elixir, R, WASM, Kotlin Android, Swift, Dart, Zig, and C FFI share the same Rust implementation.
|
||||
- **C ABI** — stable shared library surface for custom hosts and secondary bindings.
|
||||
|
||||
## Installation
|
||||
|
||||
### Package Installation
|
||||
|
||||
Build the shared library from the workspace:
|
||||
|
||||
```bash
|
||||
cargo build --release -p kreuzberg-ffi
|
||||
```
|
||||
|
||||
The build emits the library artifacts under `target/release/` (`libkreuzberg_ffi.{so,dylib,a}`) and writes the generated C header to `crates/kreuzberg-ffi/include/kreuzberg.h`.
|
||||
|
||||
### System Requirements
|
||||
- A C/C++ toolchain (clang, gcc, or MSVC) and a Rust toolchain (`rustup`) for building from source
|
||||
- A `pkg-config` or CMake-aware build system that can locate `libkreuzberg_ffi` and `kreuzberg.h`
|
||||
- Optional: [ONNX Runtime](https://github.com/microsoft/onnxruntime/releases) version 1.22.x for embeddings support
|
||||
- Optional: [Tesseract OCR](https://github.com/tesseract-ocr/tesseract) for OCR functionality
|
||||
|
||||
## Quick Start
|
||||
|
||||
### Basic Extraction
|
||||
|
||||
Extract text, metadata, and structure from any supported document format:
|
||||
|
||||
<!-- snippet not found: -->
|
||||
|
||||
### Common Use Cases
|
||||
|
||||
#### Extract with Custom Configuration
|
||||
|
||||
Most use cases benefit from configuration to control extraction behavior:
|
||||
|
||||
#### Table Extraction
|
||||
|
||||
See [Configuration Guide](https://docs.kreuzberg.dev/guides/configuration/) for table extraction options.
|
||||
|
||||
#### Processing Multiple Files
|
||||
|
||||
### Next Steps
|
||||
|
||||
- **[Installation Guide](https://docs.kreuzberg.dev/getting-started/installation/)** - Platform-specific setup
|
||||
- **[API Documentation](https://docs.kreuzberg.dev/reference/api-python/)** - Complete API reference
|
||||
- **[Examples & Guides](https://docs.kreuzberg.dev/)** - Full code examples and usage guides
|
||||
- **[Configuration Guide](https://docs.kreuzberg.dev/guides/configuration/)** - Advanced configuration options
|
||||
|
||||
## Features
|
||||
|
||||
### Supported File Formats (90+)
|
||||
|
||||
90+ file formats across 8 major categories with intelligent format detection and comprehensive metadata extraction.
|
||||
|
||||
#### Office Documents
|
||||
|
||||
| Category | Formats | Capabilities |
|
||||
|----------|---------|--------------|
|
||||
| **Word Processing** | `.docx`, `.docm`, `.dotx`, `.dotm`, `.dot`, `.odt` | Full text, tables, images, metadata, styles |
|
||||
| **Spreadsheets** | `.xlsx`, `.xlsm`, `.xlsb`, `.xls`, `.xla`, `.xlam`, `.xltm`, `.xltx`, `.xlt`, `.ods` | Sheet data, formulas, cell metadata, charts |
|
||||
| **Presentations** | `.pptx`, `.pptm`, `.ppsx`, `.potx`, `.potm`, `.pot`, `.ppt` | Slides, speaker notes, images, metadata |
|
||||
| **PDF** | `.pdf` | Text, tables, images, metadata, OCR support |
|
||||
| **eBooks** | `.epub`, `.fb2` | Chapters, metadata, embedded resources |
|
||||
| **Database** | `.dbf` | Table data extraction, field type support |
|
||||
| **Hangul** | `.hwp`, `.hwpx` | Korean document format, text extraction |
|
||||
|
||||
#### Images (OCR-Enabled)
|
||||
|
||||
| Category | Formats | Features |
|
||||
|----------|---------|----------|
|
||||
| **Raster** | `.png`, `.jpg`, `.jpeg`, `.gif`, `.webp`, `.bmp`, `.tiff`, `.tif` | OCR, table detection, EXIF metadata, dimensions, color space |
|
||||
| **Advanced** | `.jp2`, `.jpx`, `.jpm`, `.mj2`, `.jbig2`, `.jb2`, `.pnm`, `.pbm`, `.pgm`, `.ppm` | OCR via hayro-jpeg2000 (pure Rust decoder), JBIG2 support, table detection, format-specific metadata |
|
||||
| **Vector** | `.svg` | DOM parsing, embedded text, graphics metadata |
|
||||
|
||||
#### Web & Data
|
||||
|
||||
| Category | Formats | Features |
|
||||
|----------|---------|----------|
|
||||
| **Markup** | `.html`, `.htm`, `.xhtml`, `.xml`, `.svg` | DOM parsing, metadata (Open Graph, Twitter Card), link extraction |
|
||||
| **Structured Data** | `.json`, `.yaml`, `.yml`, `.toml`, `.csv`, `.tsv` | Schema detection, nested structures, validation |
|
||||
| **Text & Markdown** | `.txt`, `.md`, `.markdown`, `.djot`, `.rst`, `.org`, `.rtf` | CommonMark, GFM, Djot, reStructuredText, Org Mode |
|
||||
|
||||
#### Email & Archives
|
||||
|
||||
| Category | Formats | Features |
|
||||
|----------|---------|----------|
|
||||
| **Email** | `.eml`, `.msg` | Headers, body (HTML/plain), attachments, threading |
|
||||
| **Archives** | `.zip`, `.tar`, `.tgz`, `.gz`, `.7z` | File listing, nested archives, metadata |
|
||||
|
||||
#### Academic & Scientific
|
||||
|
||||
| Category | Formats | Features |
|
||||
|----------|---------|----------|
|
||||
| **Citations** | `.bib`, `.biblatex`, `.ris`, `.nbib`, `.enw`, `.csl` | Structured parsing: RIS (structured), PubMed/MEDLINE, EndNote XML (structured), BibTeX, CSL JSON |
|
||||
| **Scientific** | `.tex`, `.latex`, `.typst`, `.jats`, `.ipynb`, `.docbook` | LaTeX, Jupyter notebooks, PubMed JATS |
|
||||
| **Documentation** | `.opml`, `.pod`, `.mdoc`, `.troff` | Technical documentation formats |
|
||||
|
||||
#### Code Intelligence (300+ Languages)
|
||||
|
||||
| Feature | Description |
|
||||
|---------|-------------|
|
||||
| **Structure Extraction** | Functions, classes, methods, structs, interfaces, enums |
|
||||
| **Import/Export Analysis** | Module dependencies, re-exports, wildcard imports |
|
||||
| **Symbol Extraction** | Variables, constants, type aliases, properties |
|
||||
| **Docstring Parsing** | Google, NumPy, Sphinx, JSDoc, RustDoc, and 10+ formats |
|
||||
| **Diagnostics** | Parse errors with line/column positions |
|
||||
| **Syntax-Aware Chunking** | Split code by semantic boundaries, not arbitrary byte offsets |
|
||||
|
||||
Powered by [tree-sitter-language-pack](https://github.com/kreuzberg-dev/tree-sitter-language-pack) — [documentation](https://docs.tree-sitter-language-pack.kreuzberg.dev).
|
||||
|
||||
**[Complete Format Reference](https://docs.kreuzberg.dev/reference/formats/)**
|
||||
|
||||
### Key Capabilities
|
||||
|
||||
- **Text Extraction** - Extract all text content with position and formatting information
|
||||
- **Metadata Extraction** - Retrieve document properties, creation date, author, etc.
|
||||
- **Table Extraction** - Parse tables with structure and cell content preservation
|
||||
- **Image Extraction** - Extract embedded images and render page previews
|
||||
- **OCR Support** - Integrate multiple OCR backends for scanned documents
|
||||
- **Async/Await** - Non-blocking document processing with concurrent operations
|
||||
- **Plugin System** - Extensible post-processing for custom text transformation
|
||||
- **Embeddings** - Generate vector embeddings using ONNX Runtime models
|
||||
- **Batch Processing** - Efficiently process multiple documents in parallel
|
||||
- **Memory Efficient** - Stream large files without loading entirely into memory
|
||||
- **Language Detection** - Detect and support multiple languages in documents
|
||||
- **Code Intelligence** - Extract structure, imports, exports, symbols, and docstrings from [300+ programming languages](https://docs.tree-sitter-language-pack.kreuzberg.dev) via tree-sitter
|
||||
- **Configuration** - Fine-grained control over extraction behavior
|
||||
|
||||
### Performance Characteristics
|
||||
|
||||
| Format | Speed | Memory | Notes |
|
||||
|--------|-------|--------|-------|
|
||||
| **PDF (text)** | 10-100 MB/s | ~50MB per doc | Fastest extraction |
|
||||
| **Office docs** | 20-200 MB/s | ~100MB per doc | DOCX, XLSX, PPTX |
|
||||
| **Images (OCR)** | 1-5 MB/s | Variable | Depends on OCR backend |
|
||||
| **Archives** | 5-50 MB/s | ~200MB per doc | ZIP, TAR, etc. |
|
||||
| **Web formats** | 50-200 MB/s | Streaming | HTML, XML, JSON |
|
||||
|
||||
## OCR Support
|
||||
|
||||
Kreuzberg supports multiple OCR backends for extracting text from scanned documents and images:
|
||||
|
||||
### OCR Configuration Example
|
||||
|
||||
<!-- snippet not found: -->
|
||||
|
||||
## Async Support
|
||||
|
||||
This binding provides full async/await support for non-blocking document processing:
|
||||
|
||||
<!-- snippet not found: -->
|
||||
|
||||
## Plugin System
|
||||
|
||||
Kreuzberg supports extensible post-processing plugins for custom text transformation and filtering.
|
||||
|
||||
For detailed plugin documentation, visit [Plugin System Guide](https://docs.kreuzberg.dev/guides/plugins/).
|
||||
|
||||
## Embeddings Support
|
||||
|
||||
Generate vector embeddings for extracted text using the built-in ONNX Runtime support. Requires ONNX Runtime installation.
|
||||
|
||||
**[Embeddings Guide](https://docs.kreuzberg.dev/features/#embeddings)**
|
||||
|
||||
## Configuration
|
||||
|
||||
For advanced configuration options including language detection, table extraction, OCR settings, and more:
|
||||
|
||||
**[Configuration Guide](https://docs.kreuzberg.dev/guides/configuration/)**
|
||||
|
||||
## Documentation
|
||||
|
||||
- **[Official Documentation](https://docs.kreuzberg.dev/)**
|
||||
- **[API Reference](https://docs.kreuzberg.dev/reference/api-python/)**
|
||||
- **[Examples & Guides](https://docs.kreuzberg.dev/)**
|
||||
|
||||
## Contributing
|
||||
|
||||
Contributions are welcome! See [Contributing Guide](https://github.com/kreuzberg-dev/kreuzberg/blob/main/CONTRIBUTING.md).
|
||||
|
||||
## Part of Kreuzberg.dev
|
||||
|
||||
- [Kreuzberg Cloud](https://github.com/kreuzberg-dev/kreuzberg-cloud) — managed extraction API with SDKs, dashboards, and observability.
|
||||
- [kreuzcrawl](https://github.com/kreuzberg-dev/kreuzcrawl) — web crawling and scraping with HTML→Markdown and headless-Chrome fallback.
|
||||
- [html-to-markdown](https://github.com/kreuzberg-dev/html-to-markdown) — fast, lossless HTML→Markdown engine.
|
||||
- [liter-llm](https://github.com/kreuzberg-dev/liter-llm) — universal LLM API client with native bindings for 14 languages and 143 providers.
|
||||
- [tree-sitter-language-pack](https://github.com/kreuzberg-dev/tree-sitter-language-pack) — tree-sitter grammars and code-intelligence primitives.
|
||||
- [alef](https://github.com/kreuzberg-dev/alef) — the polyglot binding generator that produces this README and all per-language bindings.
|
||||
- [Discord](https://discord.gg/xt9WY3GnKR) — community, roadmap, announcements.
|
||||
|
||||
## License
|
||||
|
||||
Elastic-2.0 License — see [LICENSE](../../LICENSE) for details.
|
||||
|
||||
## Support
|
||||
|
||||
- **Discord Community**: [Join our Discord](https://discord.gg/xt9WY3GnKR)
|
||||
- **GitHub Issues**: [Report bugs](https://github.com/kreuzberg-dev/kreuzberg/issues)
|
||||
- **Discussions**: [Ask questions](https://github.com/kreuzberg-dev/kreuzberg/discussions)
|
||||
23
crates/kreuzberg-ffi/build.rs
generated
Normal file
23
crates/kreuzberg-ffi/build.rs
generated
Normal file
@@ -0,0 +1,23 @@
|
||||
// This file is auto-generated by alef. DO NOT EDIT.
|
||||
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
fn main() {
|
||||
let crate_dir = std::env::var("CARGO_MANIFEST_DIR").unwrap();
|
||||
cbindgen::generate(crate_dir)
|
||||
.expect("Unable to generate C bindings")
|
||||
.write_to_file("include/kreuzberg.h");
|
||||
|
||||
// Set @rpath-relative install_name on macOS so the cdylib can be relocated
|
||||
// (bundled into language packages like packages/go/.lib/<rid>/, packages/
|
||||
// java/src/main/resources/natives/<rid>/, etc.) and located via the consumer
|
||||
// binary's rpath at runtime. Without this, the install_name embeds the CI
|
||||
// runner build path (`/Users/runner/work/.../target/.../deps/lib<name>.dylib`)
|
||||
// and dyld fails to load the bundled copy from its actual location.
|
||||
if std::env::var("CARGO_CFG_TARGET_OS").as_deref() == Ok("macos") {
|
||||
println!("cargo:rustc-link-arg-cdylib=-Wl,-install_name,@rpath/libkreuzberg_ffi.dylib");
|
||||
}
|
||||
|
||||
let go_include_dir = std::path::Path::new("../../../packages/go/v5/include");
|
||||
std::fs::create_dir_all(go_include_dir).expect("Unable to create Go include directory");
|
||||
std::fs::copy("include/kreuzberg.h", go_include_dir.join("kreuzberg.h"))
|
||||
.expect("Unable to copy header to Go include directory");
|
||||
}
|
||||
1561
crates/kreuzberg-ffi/cbindgen.toml
generated
Normal file
1561
crates/kreuzberg-ffi/cbindgen.toml
generated
Normal file
File diff suppressed because it is too large
Load Diff
87
crates/kreuzberg-ffi/cmake/kreuzberg-ffi-config.cmake
generated
Normal file
87
crates/kreuzberg-ffi/cmake/kreuzberg-ffi-config.cmake
generated
Normal file
@@ -0,0 +1,87 @@
|
||||
# kreuzberg-ffi CMake config-mode find module
#
# Defines the imported target:
#   kreuzberg-ffi::kreuzberg-ffi
#
# Usage:
#   find_package(kreuzberg-ffi REQUIRED)
#   target_link_libraries(myapp PRIVATE kreuzberg-ffi::kreuzberg-ffi)

# A repeated find_package() call must not try to re-create the imported target.
if(TARGET kreuzberg-ffi::kreuzberg-ffi)
  return()
endif()

# The prefix is the parent of the directory that contains this file.
get_filename_component(_FFI_CMAKE_DIR "${CMAKE_CURRENT_LIST_FILE}" PATH)
get_filename_component(_FFI_PREFIX "${_FFI_CMAKE_DIR}/.." ABSOLUTE)

# Look next to this config file first; fall back to the default search paths.
find_library(_FFI_LIBRARY
  NAMES kreuzberg_ffi libkreuzberg_ffi
  PATHS "${_FFI_PREFIX}/lib"
  NO_DEFAULT_PATH
)
if(NOT _FFI_LIBRARY)
  find_library(_FFI_LIBRARY NAMES kreuzberg_ffi libkreuzberg_ffi)
endif()

find_path(_FFI_INCLUDE_DIR
  NAMES kreuzberg.h
  PATHS "${_FFI_PREFIX}/include"
  NO_DEFAULT_PATH
)
if(NOT _FFI_INCLUDE_DIR)
  find_path(_FFI_INCLUDE_DIR NAMES kreuzberg.h)
endif()

include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(kreuzberg-ffi
  REQUIRED_VARS _FFI_LIBRARY _FFI_INCLUDE_DIR
)

# find_package_handle_standard_args() reports its result in <PackageName>_FOUND,
# using the package name exactly as passed above -- hyphen included. Testing an
# underscore spelling would never be true and the target would never be defined.
if(kreuzberg-ffi_FOUND)
  # Derive the imported library type from the file extension of what was found.
  set(_FFI_LIB_TYPE UNKNOWN)
  if(_FFI_LIBRARY MATCHES "\\.(dylib|so)$" OR _FFI_LIBRARY MATCHES "\\.so\\.")
    set(_FFI_LIB_TYPE SHARED)
  elseif(_FFI_LIBRARY MATCHES "\\.dll$")
    set(_FFI_LIB_TYPE SHARED)
  elseif(_FFI_LIBRARY MATCHES "\\.(a|lib)$")
    set(_FFI_LIB_TYPE STATIC)
  endif()

  add_library(kreuzberg-ffi::kreuzberg-ffi ${_FFI_LIB_TYPE} IMPORTED)
  set_target_properties(kreuzberg-ffi::kreuzberg-ffi PROPERTIES
    IMPORTED_LOCATION "${_FFI_LIBRARY}"
    INTERFACE_INCLUDE_DIRECTORIES "${_FFI_INCLUDE_DIR}"
  )

  # On Windows a shared library is a DLL plus an import library: when the DLL
  # is found under the prefix, point IMPORTED_LOCATION at it and record the
  # library located above as the import library.
  if(WIN32 AND _FFI_LIB_TYPE STREQUAL "SHARED")
    find_file(_FFI_DLL
      NAMES kreuzberg_ffi.dll libkreuzberg_ffi.dll
      PATHS "${_FFI_PREFIX}/bin" "${_FFI_PREFIX}/lib"
      NO_DEFAULT_PATH
    )
    if(_FFI_DLL)
      set_target_properties(kreuzberg-ffi::kreuzberg-ffi PROPERTIES
        IMPORTED_LOCATION "${_FFI_DLL}"
        IMPORTED_IMPLIB "${_FFI_LIBRARY}"
      )
    endif()
    unset(_FFI_DLL CACHE)
  endif()

  # Platform libraries that consumers of the target must also link against.
  if(APPLE)
    set_property(TARGET kreuzberg-ffi::kreuzberg-ffi APPEND PROPERTY
      INTERFACE_LINK_LIBRARIES "-framework CoreFoundation" "-framework Security" pthread)
  elseif(UNIX)
    set_property(TARGET kreuzberg-ffi::kreuzberg-ffi APPEND PROPERTY
      INTERFACE_LINK_LIBRARIES pthread dl m)
  elseif(WIN32)
    set_property(TARGET kreuzberg-ffi::kreuzberg-ffi APPEND PROPERTY
      INTERFACE_LINK_LIBRARIES ws2_32 userenv bcrypt)
  endif()

  unset(_FFI_LIB_TYPE)
endif()

mark_as_advanced(_FFI_LIBRARY _FFI_INCLUDE_DIR)
unset(_FFI_CMAKE_DIR)
unset(_FFI_PREFIX)
|
||||
14155
crates/kreuzberg-ffi/include/kreuzberg.h
generated
Normal file
14155
crates/kreuzberg-ffi/include/kreuzberg.h
generated
Normal file
File diff suppressed because it is too large
Load Diff
34419
crates/kreuzberg-ffi/src/lib.rs
generated
Normal file
34419
crates/kreuzberg-ffi/src/lib.rs
generated
Normal file
File diff suppressed because it is too large
Load Diff
177
crates/kreuzberg-ffi/tests/email_attachment_data_len.rs
generated
Normal file
177
crates/kreuzberg-ffi/tests/email_attachment_data_len.rs
generated
Normal file
@@ -0,0 +1,177 @@
|
||||
/// Regression test for GitHub #1059.
|
||||
///
|
||||
/// `kreuzberg_email_attachment_data` was the only byte-buffer accessor on a public
|
||||
/// FFI-exposed DTO that did not follow the established `*_data(ptr, out_len: *mut usize)`
|
||||
/// protocol used by `kreuzberg_extracted_image_data`, `kreuzberg_embedded_file_data`,
|
||||
/// and `kreuzberg_batch_bytes_item_content`.
|
||||
///
|
||||
/// Because `EmailAttachment.data` is `Option<Bytes>` (the only optional byte buffer among
|
||||
/// public types), alef's heuristic for emitting the two-parameter form did not trigger.
|
||||
/// Callers had no way to know the valid length of the returned pointer, making any read
|
||||
/// past the first byte undefined behaviour (especially for payloads containing 0x00).
|
||||
///
|
||||
/// The alef fix shipped with the 2-parameter form (`ptr`, `out_len`). These tests
|
||||
/// lock in the correct 2-param ABI and verify the full-length contract for payloads
|
||||
/// that contain embedded NUL bytes.
|
||||
///
|
||||
/// Per project rules: every unsafe block has a SAFETY comment.
|
||||
use std::ffi::{c_char, CString};
|
||||
use std::fs;
|
||||
use std::path::Path;
|
||||
|
||||
use kreuzberg_ffi::{kreuzberg_email_attachment_free, kreuzberg_email_attachment_from_json, kreuzberg_last_error_code};
|
||||
|
||||
/// Construct a minimal EmailAttachment JSON with a data payload that contains
|
||||
/// an embedded NUL and a trailing high byte (0xEF). This defeats any strlen-based
|
||||
/// or "read first byte only" implementations.
|
||||
fn attachment_json_with_nuls() -> CString {
|
||||
// 8 bytes: JPEG-ish magic + NUL in the middle + high byte at the end.
|
||||
// Length is authoritative and known.
|
||||
let data: Vec<u8> = vec![0xFF, 0xD8, 0xFF, 0x00, 0xDE, 0xAD, 0xBE, 0xEF];
|
||||
let json = format!(
|
||||
r#"{{
|
||||
"name": "test.bin",
|
||||
"filename": "test.bin",
|
||||
"mime_type": "application/octet-stream",
|
||||
"size": {},
|
||||
"is_image": false,
|
||||
"data": {}
|
||||
}}"#,
|
||||
data.len(),
|
||||
serde_json::to_string(&data).unwrap()
|
||||
);
|
||||
CString::new(json).expect("valid UTF-8 JSON for test attachment")
|
||||
}
|
||||
|
||||
/// Returns `true` when `header` contains a declaration (or call-shaped mention)
/// of `function` whose parenthesised parameter list contains `out_len`.
///
/// Every occurrence of `function` immediately followed by `(` is inspected and
/// only the text up to the first `)` is considered, so an `out_len` parameter
/// that belongs to a *different* function can never satisfy the check.
fn declaration_has_out_len(header: &str, function: &str) -> bool {
    let needle = format!("{}(", function);
    header.match_indices(needle.as_str()).any(|(start, _)| {
        let after_paren = &header[start + needle.len()..];
        let params = after_paren.split(')').next().unwrap_or(after_paren);
        params.contains("out_len")
    })
}

/// The committed C header must declare the 2-parameter form for
/// `kreuzberg_email_attachment_data` (with `out_len`). This locks in the fix
/// for GitHub #1059 so a future regeneration cannot silently revert to the
/// 1-parameter form.
#[test]
fn email_attachment_data_accessor_must_provide_out_len_in_header() {
    let header_path = Path::new(env!("CARGO_MANIFEST_DIR")).join("include/kreuzberg.h");
    let header = fs::read_to_string(&header_path).expect("committed kreuzberg.h must be readable by the test");

    // Scope the check to this function's own parameter list: many other
    // accessors in the header already take `out_len`, so a file-wide search
    // for the two substrings would pass even with the 1-parameter form.
    let has_out_len = declaration_has_out_len(&header, "kreuzberg_email_attachment_data");

    assert!(
        has_out_len,
        "GitHub #1059 regression: the declaration of kreuzberg_email_attachment_data \
         in crates/kreuzberg-ffi/include/kreuzberg.h does not contain the required \
         `out_len` parameter.\n\n\
         Expected something like:\n uint8_t *kreuzberg_email_attachment_data(..., uintptr_t *out_len);\n\n\
         Found the old 1-parameter form. Fix requires `task alef:generate` with an \
         updated alef that handles Option<Bytes> fields for the FFI byte accessor heuristic.\n\n\
         This is the lock-in test for #1059."
    );
}
|
||||
|
||||
/// An attachment deserialised with `"data": null` must make the accessor
/// return a null pointer and overwrite `out_len` with 0.
#[test]
fn email_attachment_data_none_returns_null_pointer() {
    let json = CString::new(
        r#"{"name":"empty","filename":"empty","mime_type":null,"size":null,"is_image":false,"data":null}"#,
    )
    .unwrap();
    let raw_json: *const c_char = json.as_ptr();

    // SAFETY: raw_json points into `json`, a live NUL-terminated CString.
    let handle = unsafe { kreuzberg_email_attachment_from_json(raw_json) };
    if handle.is_null() {
        // SAFETY: no precondition; reads a thread-local.
        let code = unsafe { kreuzberg_last_error_code() };
        panic!("from_json should succeed (last_error_code={})", code);
    }

    // Seed with a sentinel so that an accessor which forgets to write
    // out_len is caught by the equality check below.
    let mut reported_len: usize = usize::MAX;
    // SAFETY: handle is the non-null pointer from_json just returned;
    // reported_len is a live local the accessor may write through.
    let data_ptr = unsafe { kreuzberg_ffi::kreuzberg_email_attachment_data(handle, &mut reported_len) };

    assert!(
        data_ptr.is_null(),
        "data must be null when the attachment has no payload"
    );
    assert_eq!(reported_len, 0, "out_len must be 0 when data is None");

    // SAFETY: handle was produced by from_json and is released exactly once here.
    unsafe { kreuzberg_email_attachment_free(handle) };
}
|
||||
|
||||
/// Core contract behind #1059: for an attachment that carries bytes, the
/// accessor returns a non-null pointer and reports the exact payload length
/// through `out_len`, so bytes located after an embedded NUL stay readable.
#[test]
fn email_attachment_data_with_out_len_returns_full_buffer_including_embedded_nuls() {
    let json = attachment_json_with_nuls();
    let raw_json: *const c_char = json.as_ptr();

    // SAFETY: raw_json points into `json`, a live NUL-terminated CString.
    let handle = unsafe { kreuzberg_email_attachment_from_json(raw_json) };
    if handle.is_null() {
        // SAFETY: no precondition; reads a thread-local.
        let code = unsafe { kreuzberg_last_error_code() };
        panic!(
            "from_json should succeed for our well-formed test attachment (last_error_code={})",
            code
        );
    }

    let mut reported_len: usize = 0;
    // SAFETY: handle is non-null and was just returned by from_json;
    // reported_len is a live local. The returned pointer borrows from the
    // handle, so this test must not free it separately.
    let data_ptr = unsafe { kreuzberg_ffi::kreuzberg_email_attachment_data(handle, &mut reported_len) };

    assert!(
        !data_ptr.is_null(),
        "data pointer must be non-null for an attachment we created with a Some(data) payload"
    );
    assert_eq!(
        reported_len, 8,
        "out_len must report the exact length of the Bytes payload (not 0, not guessed, not truncated at NUL)"
    );

    // SAFETY: data_ptr was checked non-null, reported_len was written by the
    // accessor for that same buffer, and the owning handle has not been
    // freed yet, so the range [0, reported_len) is still backed by it.
    let bytes = unsafe { std::slice::from_raw_parts(data_ptr, reported_len) };

    assert_eq!(bytes.len(), 8);
    assert_eq!(bytes[0], 0xFF);
    assert_eq!(bytes[3], 0x00, "must be able to read the embedded NUL");
    assert_eq!(
        bytes[7], 0xEF,
        "must be able to read bytes after the NUL (no truncation)"
    );

    // SAFETY: handle was produced by from_json and is released exactly once here.
    unsafe { kreuzberg_email_attachment_free(handle) };
}
|
||||
|
||||
/// Calling the accessor with a null `out_len` must neither crash nor suppress
/// the returned data pointer.
#[test]
fn email_attachment_data_null_out_len_is_safe() {
    let json = CString::new(
        r#"{"name":"hasdata.bin","filename":"hasdata.bin","mime_type":"application/octet-stream","size":4,"is_image":false,"data":[65,0,66,67]}"#,
    )
    .unwrap();
    let raw_json: *const c_char = json.as_ptr();

    // SAFETY: raw_json points into `json`, a live NUL-terminated CString.
    let handle = unsafe { kreuzberg_email_attachment_from_json(raw_json) };
    assert!(!handle.is_null());

    let no_out_len: *mut usize = std::ptr::null_mut();
    // SAFETY: handle is valid; a null out_len is part of the accessor's
    // contract (it null-checks before writing).
    let data_ptr = unsafe { kreuzberg_ffi::kreuzberg_email_attachment_data(handle, no_out_len) };

    assert!(
        !data_ptr.is_null(),
        "data pointer should be non-null when the attachment carries a payload"
    );

    // SAFETY: handle was produced by from_json and is released exactly once here.
    unsafe { kreuzberg_email_attachment_free(handle) };
}
|
||||
204
crates/kreuzberg-ffi/tests/vtable_bytes_len.rs
generated
Normal file
204
crates/kreuzberg-ffi/tests/vtable_bytes_len.rs
generated
Normal file
@@ -0,0 +1,204 @@
|
||||
/// Regression tests: vtable Bytes params carry companion length
|
||||
///
|
||||
/// The alef vtable generator previously emitted only `*const u8` for `&[u8]`
|
||||
/// trait-method parameters without a companion `{name}_len: usize`. Binary
|
||||
/// payloads contain embedded NUL bytes; read-until-NUL semantics silently
|
||||
/// truncated every real image or document buffer at the first `0x00`.
|
||||
///
|
||||
/// Fix shipped in alef ≥ v0.19.21 and is present in the generated FFI shim.
|
||||
/// These tests construct a vtable bridge directly, pass a buffer with an
|
||||
/// embedded NUL at a known offset, and assert the full buffer is received.
|
||||
///
|
||||
/// Per-test state is passed via `user_data` — no global statics — so tests
|
||||
/// are independent and can run in parallel without interfering.
|
||||
use kreuzberg_ffi::{
|
||||
KreuzbergDocumentExtractorBridge, KreuzbergDocumentExtractorVTable, KreuzbergOcrBackendBridge,
|
||||
KreuzbergOcrBackendVTable,
|
||||
};
|
||||
use std::sync::atomic::{AtomicU8, AtomicUsize, Ordering};
|
||||
|
||||
// ── Per-test callback state ───────────────────────────────────────────────
|
||||
|
||||
/// What a C callback stub observed about the byte buffer it was handed.
///
/// Atomics are used so the stub can record values through a shared reference.
struct CallbackState {
    /// Length argument the callback received.
    received_len: AtomicUsize,
    /// Final byte of the received buffer (left at 0 when the buffer was empty).
    received_last_byte: AtomicU8,
}

impl CallbackState {
    /// Creates a state with both observations zeroed.
    fn new() -> Self {
        let received_len = AtomicUsize::new(0);
        let received_last_byte = AtomicU8::new(0);
        Self {
            received_len,
            received_last_byte,
        }
    }
}
|
||||
|
||||
// ── C callback stubs ─────────────────────────────────────────────────────
|
||||
|
||||
unsafe extern "C" fn ocr_process_image(
|
||||
user_data: *const std::ffi::c_void,
|
||||
image_bytes: *const u8,
|
||||
image_bytes_len: usize,
|
||||
_config: *const std::ffi::c_char,
|
||||
out_result: *mut *mut std::ffi::c_char,
|
||||
out_error: *mut *mut std::ffi::c_char,
|
||||
) -> i32 {
|
||||
// SAFETY: user_data points to a CallbackState that the calling test keeps alive.
|
||||
let state = unsafe { &*(user_data as *const CallbackState) };
|
||||
state.received_len.store(image_bytes_len, Ordering::SeqCst);
|
||||
if image_bytes_len > 0 {
|
||||
// SAFETY: caller guarantees image_bytes[0..image_bytes_len] is valid.
|
||||
let last = unsafe { *image_bytes.add(image_bytes_len - 1) };
|
||||
state.received_last_byte.store(last, Ordering::SeqCst);
|
||||
}
|
||||
unsafe { *out_result = std::ptr::null_mut() };
|
||||
let msg = std::ffi::CString::new("stub").unwrap();
|
||||
// SAFETY: caller owns out_error and will free it via kreuzberg_free_string.
|
||||
unsafe { *out_error = msg.into_raw() };
|
||||
1
|
||||
}
|
||||
|
||||
unsafe extern "C" fn extractor_extract_bytes(
|
||||
user_data: *const std::ffi::c_void,
|
||||
content: *const u8,
|
||||
content_len: usize,
|
||||
_mime_type: *const std::ffi::c_char,
|
||||
_config: *const std::ffi::c_char,
|
||||
out_result: *mut *mut std::ffi::c_char,
|
||||
out_error: *mut *mut std::ffi::c_char,
|
||||
) -> i32 {
|
||||
// SAFETY: user_data points to a CallbackState that the calling test keeps alive.
|
||||
let state = unsafe { &*(user_data as *const CallbackState) };
|
||||
state.received_len.store(content_len, Ordering::SeqCst);
|
||||
if content_len > 0 {
|
||||
// SAFETY: caller guarantees content[0..content_len] is valid.
|
||||
let last = unsafe { *content.add(content_len - 1) };
|
||||
state.received_last_byte.store(last, Ordering::SeqCst);
|
||||
}
|
||||
unsafe { *out_result = std::ptr::null_mut() };
|
||||
let msg = std::ffi::CString::new("stub").unwrap();
|
||||
unsafe { *out_error = msg.into_raw() };
|
||||
1
|
||||
}
|
||||
|
||||
// ── Tests ─────────────────────────────────────────────────────────────────
|
||||
|
||||
/// OcrBackend.process_image must pass the full buffer length even when
|
||||
/// the payload contains embedded NUL bytes.
|
||||
#[tokio::test]
|
||||
async fn ocr_backend_vtable_process_image_passes_full_length_with_embedded_nuls() {
|
||||
// 8-byte buffer; NUL at index 3. strlen-style reads would stop at 3.
|
||||
let image_bytes: Vec<u8> = vec![0xFF, 0xD8, 0xFF, 0x00, 0xDE, 0xAD, 0xBE, 0xEF];
|
||||
|
||||
let state = Box::new(CallbackState::new());
|
||||
let state_ptr = state.as_ref() as *const CallbackState as *const std::ffi::c_void;
|
||||
|
||||
let vtable = KreuzbergOcrBackendVTable {
|
||||
process_image: Some(ocr_process_image),
|
||||
process_image_file: None,
|
||||
name_fn: None,
|
||||
version_fn: None,
|
||||
initialize_fn: None,
|
||||
shutdown_fn: None,
|
||||
supports_language: None,
|
||||
backend_type: None,
|
||||
supported_languages: None,
|
||||
supports_table_detection: None,
|
||||
supports_document_processing: None,
|
||||
process_document: None,
|
||||
free_user_data: None,
|
||||
};
|
||||
|
||||
// SAFETY: state lives for the duration of this test and outlives the bridge.
|
||||
let bridge = unsafe { KreuzbergOcrBackendBridge::new("test-ocr-stub".to_string(), vtable, state_ptr) };
|
||||
|
||||
use kreuzberg::OcrBackend;
|
||||
let _ = bridge
|
||||
.process_image(&image_bytes, &kreuzberg::OcrConfig::default())
|
||||
.await;
|
||||
|
||||
assert_eq!(
|
||||
state.received_len.load(Ordering::SeqCst),
|
||||
8,
|
||||
"process_image vtable received wrong length (truncated at embedded NUL?)"
|
||||
);
|
||||
assert_eq!(
|
||||
state.received_last_byte.load(Ordering::SeqCst),
|
||||
0xEF,
|
||||
"process_image vtable could not read past the embedded NUL"
|
||||
);
|
||||
}
|
||||
|
||||
/// DocumentExtractor.extract_bytes must pass the full buffer length even when
|
||||
/// the document bytes contain embedded NUL bytes.
|
||||
#[tokio::test]
|
||||
async fn document_extractor_vtable_extract_bytes_passes_full_length_with_embedded_nuls() {
|
||||
// 8-byte buffer; NUL at index 2.
|
||||
let content: Vec<u8> = vec![0x50, 0x4B, 0x00, 0x03, 0x14, 0x00, 0x00, 0x02];
|
||||
|
||||
let state = Box::new(CallbackState::new());
|
||||
let state_ptr = state.as_ref() as *const CallbackState as *const std::ffi::c_void;
|
||||
|
||||
let vtable = KreuzbergDocumentExtractorVTable {
|
||||
extract_bytes: Some(extractor_extract_bytes),
|
||||
extract_file: None,
|
||||
name_fn: None,
|
||||
version_fn: None,
|
||||
initialize_fn: None,
|
||||
shutdown_fn: None,
|
||||
supported_mime_types: None,
|
||||
priority: None,
|
||||
can_handle: None,
|
||||
free_user_data: None,
|
||||
};
|
||||
|
||||
// SAFETY: state lives for the duration of this test and outlives the bridge.
|
||||
let bridge = unsafe { KreuzbergDocumentExtractorBridge::new("test-extractor-stub".to_string(), vtable, state_ptr) };
|
||||
|
||||
use kreuzberg::DocumentExtractor;
|
||||
let _ = bridge
|
||||
.extract_bytes(
|
||||
&content,
|
||||
"application/octet-stream",
|
||||
&kreuzberg::ExtractionConfig::default(),
|
||||
)
|
||||
.await;
|
||||
|
||||
assert_eq!(
|
||||
state.received_len.load(Ordering::SeqCst),
|
||||
8,
|
||||
"extract_bytes vtable received wrong length (truncated at embedded NUL?)"
|
||||
);
|
||||
assert_eq!(
|
||||
state.received_last_byte.load(Ordering::SeqCst),
|
||||
0x02,
|
||||
"extract_bytes vtable could not read past the embedded NUL"
|
||||
);
|
||||
}
|
||||
|
||||
/// ImageKind numeric values: PageRaster must be 10 and Unknown must be 11.
///
/// alef ≥ v0.19.21 added PageRaster between Mask (9) and Unknown, bumping
/// Unknown from 10 → 11. Any C/Go/Java/C# code that hardcoded Unknown = 10
/// must be updated; this test pins the new ordinals so the renumbering is
/// visible to CI.
///
/// NOTE(review): the assertions below only show that 10 and 11 are both
/// accepted by `kreuzberg_image_kind_from_i32` and returned unchanged. They
/// do not by themselves tie 10 to `PageRaster` or 11 to `Unknown` — confirm
/// against the enum definition if a stronger pin is wanted.
#[test]
fn image_kind_page_raster_is_10_and_unknown_is_11() {
    // SAFETY: pure integer dispatch, no pointers.
    let page_raster = unsafe { kreuzberg_ffi::kreuzberg_image_kind_from_i32(10) };
    // SAFETY: pure integer dispatch, no pointers.
    let unknown = unsafe { kreuzberg_ffi::kreuzberg_image_kind_from_i32(11) };

    assert_eq!(page_raster, 10, "PageRaster == 10");
    assert_eq!(unknown, 11, "Unknown == 11");

    // 10 must be a recognised discriminant, i.e. not rejected
    // (presumably -1 is the "invalid value" sentinel — TODO confirm).
    assert_ne!(page_raster, -1, "10 must be valid");
}
|
||||
22
crates/kreuzberg-jni/Cargo.toml
Normal file
22
crates/kreuzberg-jni/Cargo.toml
Normal file
@@ -0,0 +1,22 @@
|
||||
[package]
|
||||
name = "kreuzberg-jni"
|
||||
version.workspace = true
|
||||
edition.workspace = true
|
||||
rust-version.workspace = true
|
||||
authors.workspace = true
|
||||
license.workspace = true
|
||||
repository.workspace = true
|
||||
homepage.workspace = true
|
||||
publish = false
|
||||
|
||||
[lib]
|
||||
crate-type = ["cdylib"]
|
||||
name = "kreuzberg_jni"
|
||||
|
||||
[dependencies]
|
||||
base64 = "0.22"
|
||||
jni = "0.21"
|
||||
kreuzberg-ffi = { path = "../kreuzberg-ffi" }
|
||||
|
||||
[lints]
|
||||
workspace = true
|
||||
1183
crates/kreuzberg-jni/src/lib.rs
Normal file
1183
crates/kreuzberg-jni/src/lib.rs
Normal file
File diff suppressed because it is too large
Load Diff
32
crates/kreuzberg-node/Cargo.toml
generated
Normal file
32
crates/kreuzberg-node/Cargo.toml
generated
Normal file
@@ -0,0 +1,32 @@
|
||||
[package]
|
||||
name = "kreuzberg-node"
|
||||
version = "5.0.0-rc.3"
|
||||
edition = "2024"
|
||||
license = "Elastic-2.0"
|
||||
description = "High-performance document intelligence library"
|
||||
readme = false
|
||||
keywords = ["document", "extraction", "ocr", "pdf", "text"]
|
||||
categories = ["text-processing"]
|
||||
|
||||
# `serde_json` is emitted unconditionally below so the manifest is stable
|
||||
# across regens, but for umbrella crates with no JSON-marshalled return types
|
||||
# it is genuinely unused. The conditional `async-trait` / `futures-util` deps
|
||||
# are similarly flagged when the umbrella has trait-bridge / streaming
|
||||
# adapters configured but no actual async-trait callsite in this binding.
|
||||
[package.metadata.cargo-machete]
|
||||
ignored = ["serde_json", "async-trait"]
|
||||
|
||||
[lib]
|
||||
crate-type = ["cdylib"]
|
||||
|
||||
[dependencies]
|
||||
async-trait = "0.1"
|
||||
kreuzberg = { version = "5.0.0-rc.3", path = "../kreuzberg", features = ["full", "pdf", "ocr", "paddle-ocr", "paddle-ocr-types", "layout-detection", "layout-types", "embeddings", "embedding-presets", "chunking", "keywords-yake", "keywords-rake", "language-detection", "html", "tree-sitter", "office", "email", "archives", "stopwords", "auto-rotate", "auto-rotate-types", "tokio-runtime", "api", "mcp", "liter-llm", "quality"] }
|
||||
napi = { version = "3", features = ["async", "serde-json"] }
|
||||
napi-derive = "3"
|
||||
serde = { version = "1", features = ["derive"] }
|
||||
serde_json = "1"
|
||||
serde_with = "3"
|
||||
|
||||
[build-dependencies]
|
||||
napi-build = "2"
|
||||
93
crates/kreuzberg-node/LICENSE
generated
Normal file
93
crates/kreuzberg-node/LICENSE
generated
Normal file
@@ -0,0 +1,93 @@
|
||||
Elastic License 2.0 (ELv2)
|
||||
|
||||
Copyright 2025-2026 Kreuzberg, Inc.
|
||||
|
||||
Acceptance
|
||||
|
||||
By using the software, you agree to all of the terms and conditions below.
|
||||
|
||||
Copyright License
|
||||
|
||||
The licensor grants you a non-exclusive, royalty-free, worldwide,
|
||||
non-sublicensable, non-transferable license to use, copy, distribute, make
|
||||
available, and prepare derivative works of the software, in each case subject to
|
||||
the limitations and conditions below.
|
||||
|
||||
Limitations
|
||||
|
||||
You may not provide the software to third parties as a hosted or managed
|
||||
service, where the service provides users with access to any substantial set of
|
||||
the features or functionality of the software.
|
||||
|
||||
You may not move, change, disable, or circumvent the license key functionality
|
||||
in the software, and you may not remove or obscure any functionality in the
|
||||
software that is protected by the license key.
|
||||
|
||||
You may not alter, remove, or obscure any licensing, copyright, or other notices
|
||||
of the licensor in the software. Any use of the licensor's trademarks is subject
|
||||
to applicable law.
|
||||
|
||||
Patents
|
||||
|
||||
The licensor grants you a license, under any patent claims the licensor can
|
||||
license, or becomes able to license, to make, have made, use, sell, offer for
|
||||
sale, import and have imported the software, in each case subject to the
|
||||
limitations and conditions in this license. This license does not cover any
|
||||
patent claims that you cause to be infringed by modifications or additions to the
|
||||
software. If you or your company make any written claim that the software
|
||||
infringes or contributes to infringement of any patent, your patent license for
|
||||
the software granted under these terms ends immediately. If your company makes
|
||||
such a claim, your patent license ends immediately for work on behalf of your
|
||||
company.
|
||||
|
||||
Notices
|
||||
|
||||
You must ensure that anyone who gets a copy of any part of the software from you
|
||||
also gets a copy of these terms.
|
||||
|
||||
If you modify the software, you must include in any modified copies of the
|
||||
software prominent notices stating that you have modified the software.
|
||||
|
||||
No Other Rights
|
||||
|
||||
These terms do not imply any licenses other than those expressly granted in
|
||||
these terms.
|
||||
|
||||
Termination
|
||||
|
||||
If you use the software in violation of these terms, such use is not licensed,
|
||||
and your licenses will automatically terminate. If the licensor provides you with
|
||||
a notice of your violation, and you cease all violation of this license no later
|
||||
than 30 days after you receive that notice, your licenses will be reinstated
|
||||
retroactively. However, if you violate these terms after such reinstatement, any
|
||||
additional violation of these terms will cause your licenses to terminate
|
||||
automatically and permanently.
|
||||
|
||||
No Liability
|
||||
|
||||
As far as the law allows, the software comes as is, without any warranty or
|
||||
condition, and the licensor will not be liable to you for any damages arising out
|
||||
of these terms or the use or nature of the software, under any kind of legal
|
||||
claim.
|
||||
|
||||
Definitions
|
||||
|
||||
The licensor is the entity offering these terms, and the software is the
|
||||
software the licensor makes available under these terms, including any portion
|
||||
of it.
|
||||
|
||||
you refers to the individual or entity agreeing to these terms.
|
||||
|
||||
your company is any legal entity, sole proprietorship, or other kind of
|
||||
organization that you work for, plus all organizations that have control over,
|
||||
are under the control of, or are under common control with that organization.
|
||||
control means ownership of substantially all the assets of an entity, or the
|
||||
power to direct its management and policies by vote, contract, or otherwise.
|
||||
Control can be direct or indirect.
|
||||
|
||||
your licenses are all the licenses granted to you for the software under these
|
||||
terms.
|
||||
|
||||
use means anything you do with the software requiring one of your licenses.
|
||||
|
||||
trademark means trademarks, service marks, and similar rights.
|
||||
488
crates/kreuzberg-node/README.md
generated
Normal file
488
crates/kreuzberg-node/README.md
generated
Normal file
@@ -0,0 +1,488 @@
|
||||
# TypeScript (Node.js)
|
||||
|
||||
<div align="center" style="display: flex; flex-wrap: wrap; gap: 8px; justify-content: center; margin: 20px 0;">
|
||||
<a href="https://github.com/kreuzberg-dev/alef">
|
||||
<img src="https://img.shields.io/badge/Bindings-alef%20%D7%90-007ec6" alt="Bindings">
|
||||
</a>
|
||||
<!-- Language Bindings -->
|
||||
<a href="https://crates.io/crates/kreuzberg">
|
||||
<img src="https://img.shields.io/crates/v/kreuzberg?label=Rust&color=007ec6" alt="Rust">
|
||||
</a>
|
||||
<a href="https://pypi.org/project/kreuzberg/">
|
||||
<img src="https://img.shields.io/pypi/v/kreuzberg?label=Python&color=007ec6" alt="Python">
|
||||
</a>
|
||||
<a href="https://www.npmjs.com/package/@kreuzberg/node">
|
||||
<img src="https://img.shields.io/npm/v/@kreuzberg/node?label=Node.js&color=007ec6" alt="Node.js">
|
||||
</a>
|
||||
<a href="https://www.npmjs.com/package/@kreuzberg/wasm">
|
||||
<img src="https://img.shields.io/npm/v/@kreuzberg/wasm?label=WASM&color=007ec6" alt="WASM">
|
||||
</a>
|
||||
<a href="https://central.sonatype.com/artifact/dev.kreuzberg/kreuzberg">
|
||||
<img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
|
||||
</a>
|
||||
<a href="https://github.com/kreuzberg-dev/kreuzberg/tree/main/packages/go/v5">
|
||||
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v5*" alt="Go">
|
||||
</a>
|
||||
<a href="https://www.nuget.org/packages/Kreuzberg/">
|
||||
<img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
|
||||
</a>
|
||||
<a href="https://packagist.org/packages/kreuzberg/kreuzberg">
|
||||
<img src="https://img.shields.io/packagist/v/kreuzberg/kreuzberg?label=PHP&color=007ec6" alt="PHP">
|
||||
</a>
|
||||
<a href="https://rubygems.org/gems/kreuzberg">
|
||||
<img src="https://img.shields.io/gem/v/kreuzberg?label=Ruby&color=007ec6" alt="Ruby">
|
||||
</a>
|
||||
<a href="https://hex.pm/packages/kreuzberg">
|
||||
<img src="https://img.shields.io/hexpm/v/kreuzberg?label=Elixir&color=007ec6" alt="Elixir">
|
||||
</a>
|
||||
<a href="https://kreuzberg-dev.r-universe.dev/kreuzberg">
|
||||
<img src="https://img.shields.io/badge/R-kreuzberg-007ec6" alt="R">
|
||||
</a>
|
||||
<a href="https://pub.dev/packages/kreuzberg">
|
||||
<img src="https://img.shields.io/pub/v/kreuzberg?label=Dart&color=007ec6" alt="Dart">
|
||||
</a>
|
||||
<a href="https://central.sonatype.com/artifact/dev.kreuzberg/kreuzberg-android">
|
||||
<img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg-android?label=Kotlin&color=007ec6" alt="Kotlin">
|
||||
</a>
|
||||
<a href="https://github.com/kreuzberg-dev/kreuzberg/tree/main/packages/swift">
|
||||
<img src="https://img.shields.io/badge/Swift-SPM-007ec6" alt="Swift">
|
||||
</a>
|
||||
<a href="https://github.com/kreuzberg-dev/kreuzberg/tree/main/packages/zig">
|
||||
<img src="https://img.shields.io/badge/Zig-package-007ec6" alt="Zig">
|
||||
</a>
|
||||
<a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
|
||||
<img src="https://img.shields.io/badge/C-FFI-007ec6" alt="C FFI">
|
||||
</a>
|
||||
<a href="https://github.com/kreuzberg-dev/kreuzberg/pkgs/container/kreuzberg">
|
||||
<img src="https://img.shields.io/badge/Docker-ghcr.io-007ec6?logo=docker&logoColor=white" alt="Docker">
|
||||
</a>
|
||||
<a href="https://github.com/kreuzberg-dev/kreuzberg/pkgs/container/charts%2Fkreuzberg">
|
||||
<img src="https://img.shields.io/badge/Helm-ghcr.io-007ec6?logo=helm&logoColor=white" alt="Helm">
|
||||
</a>
|
||||
|
||||
<!-- Project Info -->
|
||||
<a href="https://github.com/kreuzberg-dev/kreuzberg/blob/main/LICENSE">
|
||||
<img src="https://img.shields.io/badge/License-Elastic--2.0-007ec6" alt="License">
|
||||
</a>
|
||||
<a href="https://docs.kreuzberg.dev">
|
||||
<img src="https://img.shields.io/badge/Docs-kreuzberg-007ec6" alt="Documentation">
|
||||
</a>
|
||||
<a href="https://huggingface.co/Kreuzberg">
|
||||
<img src="https://img.shields.io/badge/Hugging%20Face-Kreuzberg-007ec6" alt="Hugging Face">
|
||||
</a>
|
||||
</div>
|
||||
|
||||
<div align="center" style="margin: 24px 0 0;">
|
||||
<a href="https://kreuzberg.dev">
|
||||
<img alt="Kreuzberg" src="https://github.com/user-attachments/assets/419fc06c-8313-4324-b159-4b4d3cfce5c0" />
|
||||
</a>
|
||||
</div>
|
||||
|
||||
<div align="center" style="display: flex; flex-wrap: wrap; gap: 12px; justify-content: center; margin: 28px 0 24px;">
|
||||
<a href="https://discord.gg/xt9WY3GnKR">
|
||||
<img height="22" src="https://img.shields.io/badge/Discord-Chat-007ec6?logo=discord&logoColor=white" alt="Join Discord">
|
||||
</a>
|
||||
<a href="https://docs.kreuzberg.dev/demo.html">
|
||||
<img height="22" src="https://img.shields.io/badge/Live%20Demo-Open-007ec6?logo=webassembly&logoColor=white" alt="Live Demo">
|
||||
</a>
|
||||
</div>
|
||||
|
||||
Extract text, tables, images, and metadata from 90+ file formats — including PDF, Office documents, and images — and analyze source code in 300+ programming languages. Native NAPI-RS bindings for Node.js with superior performance, async/await support, and TypeScript type definitions.
|
||||
|
||||
## What This Package Provides
|
||||
|
||||
- **Document intelligence core** — extract text, tables, images, metadata, entities, keywords, and code intelligence from one API.
|
||||
- **Format coverage** — PDF, Office, images, HTML/XML, email, archives, notebooks, citations, scientific formats, and plain text.
|
||||
- **OCR choices** — Tesseract, PaddleOCR, EasyOCR where supported, VLM OCR through liter-llm, and plugin hooks for custom backends.
|
||||
- **Same engine as every binding** — Rust, Python, Node.js, Go, Java, PHP, Ruby, .NET, Elixir, R, WASM, Kotlin Android, Swift, Dart, Zig, and C FFI share the same Rust implementation.
|
||||
- **Node-first TypeScript API** — NAPI-RS package with typed options/results and async extraction.
|
||||
|
||||
## Installation
|
||||
|
||||
### Package Installation
|
||||
|
||||
```bash
|
||||
pnpm add @kreuzberg/node
|
||||
```
|
||||
|
||||
### System Requirements
|
||||
- **Node.js 22+** required (NAPI-RS native bindings)
|
||||
- Optional: [ONNX Runtime](https://github.com/microsoft/onnxruntime/releases) version 1.22.x for embeddings support
|
||||
- Optional: [Tesseract OCR](https://github.com/tesseract-ocr/tesseract) for OCR functionality
|
||||
|
||||
### Platform Support
|
||||
|
||||
Pre-built binaries available for:
|
||||
- macOS (arm64, x64)
|
||||
- Linux (x64)
|
||||
- Windows (x64)
|
||||
|
||||
## Quick Start
|
||||
|
||||
### Basic Extraction
|
||||
|
||||
Extract text, metadata, and structure from any supported document format:
|
||||
|
||||
```typescript title="TypeScript"
|
||||
import { extractFileSync } from "@kreuzberg/node";
|
||||
|
||||
const config = {
|
||||
useCache: true,
|
||||
enableQualityProcessing: true,
|
||||
};
|
||||
|
||||
const result = extractFileSync("document.pdf", null, config);
|
||||
|
||||
console.log(result.content);
|
||||
console.log(`MIME Type: ${result.mimeType}`);
|
||||
```
|
||||
|
||||
### Common Use Cases
|
||||
|
||||
#### Extract with Custom Configuration
|
||||
|
||||
Most use cases benefit from configuration to control extraction behavior:
|
||||
|
||||
**With OCR (for scanned documents):**
|
||||
|
||||
```typescript title="TypeScript"
|
||||
import { extractFile } from "@kreuzberg/node";
|
||||
|
||||
const config = {
|
||||
ocr: {
|
||||
backend: "tesseract",
|
||||
language: "eng+fra",
|
||||
tesseractConfig: {
|
||||
psm: 3,
|
||||
},
|
||||
},
|
||||
};
|
||||
|
||||
const result = await extractFile("document.pdf", null, config);
|
||||
console.log(result.content);
|
||||
```
|
||||
|
||||
#### Table Extraction
|
||||
|
||||
```typescript title="TypeScript"
|
||||
import { extractFileSync } from "@kreuzberg/node";
|
||||
|
||||
const result = extractFileSync("document.pdf");
|
||||
|
||||
result.tables?.forEach((table) => {
|
||||
console.log(`Table with ${table.cells?.length ?? 0} rows`);
|
||||
console.log(table.markdown);
|
||||
table.cells?.forEach((row) => console.log(row.join(" | ")));
|
||||
});
|
||||
```
|
||||
|
||||
#### Processing Multiple Files
|
||||
|
||||
```typescript title="TypeScript"
|
||||
import { batchExtractFilesSync } from "@kreuzberg/node";
|
||||
|
||||
const files = ["doc1.pdf", "doc2.docx", "doc3.pptx"];
|
||||
const results = batchExtractFilesSync(files);
|
||||
|
||||
results.forEach((result, i) => {
|
||||
console.log(`File ${i + 1}: ${result.content.length} characters`);
|
||||
});
|
||||
```
|
||||
|
||||
#### Async Processing
|
||||
|
||||
For non-blocking document processing:
|
||||
|
||||
```typescript title="TypeScript"
|
||||
import { extractFile } from "@kreuzberg/node";
|
||||
|
||||
const result = await extractFile("document.pdf");
|
||||
console.log(result.content);
|
||||
```
|
||||
|
||||
#### Configuration Discovery
|
||||
|
||||
```typescript title="config_discovery.ts"
|
||||
import { ExtractionConfig, extractFile } from "@kreuzberg/node";
|
||||
|
||||
const config = ExtractionConfig.discover();
|
||||
if (config) {
|
||||
console.log("Found configuration file");
|
||||
const result = await extractFile("document.pdf", null, config);
|
||||
console.log(result.content);
|
||||
} else {
|
||||
console.log("No configuration file found, using defaults");
|
||||
const result = await extractFile("document.pdf");
|
||||
console.log(result.content);
|
||||
}
|
||||
```
|
||||
|
||||
#### Worker Thread Pool
|
||||
|
||||
```typescript title="worker_pool.ts"
|
||||
import {
|
||||
createWorkerPool,
|
||||
extractFileInWorker,
|
||||
batchExtractFilesInWorker,
|
||||
closeWorkerPool,
|
||||
} from "@kreuzberg/node";
|
||||
|
||||
// Create a pool with 4 worker threads
|
||||
const pool = createWorkerPool(4);
|
||||
|
||||
try {
|
||||
// Extract single file in worker
|
||||
const result = await extractFileInWorker(pool, "document.pdf", null, {
|
||||
useCache: true,
|
||||
});
|
||||
console.log(result.content);
|
||||
|
||||
// Extract multiple files concurrently
|
||||
const files = ["doc1.pdf", "doc2.docx", "doc3.xlsx"];
|
||||
const results = await batchExtractFilesInWorker(pool, files, {
|
||||
useCache: true,
|
||||
});
|
||||
|
||||
results.forEach((result, i) => {
|
||||
console.log(`File ${i + 1}: ${result.content.length} characters`);
|
||||
});
|
||||
} finally {
|
||||
// Always close the pool when done
|
||||
await closeWorkerPool(pool);
|
||||
}
|
||||
```
|
||||
|
||||
**Performance Benefits:**
|
||||
- **Parallel Processing**: Multiple documents extracted simultaneously
|
||||
- **CPU Utilization**: Maximizes multi-core CPU usage for large batches
|
||||
- **Queue Management**: Automatically distributes work across available workers
|
||||
- **Resource Control**: Prevents thread exhaustion with configurable pool size
|
||||
|
||||
**Best Practices:**
|
||||
- Use worker pools for batches of 10+ documents
|
||||
- Set pool size to number of CPU cores (default behavior)
|
||||
- Always close pools with `closeWorkerPool()` to prevent resource leaks
|
||||
- Reuse pools across multiple batch operations for efficiency
|
||||
|
||||
### Next Steps
|
||||
|
||||
- **[Installation Guide](https://docs.kreuzberg.dev/getting-started/installation/)** - Platform-specific setup
|
||||
- **[API Documentation](https://docs.kreuzberg.dev/reference/api-typescript/)** - Complete API reference
|
||||
- **[Examples & Guides](https://docs.kreuzberg.dev/)** - Full code examples and usage guides
|
||||
- **[Configuration Guide](https://docs.kreuzberg.dev/guides/configuration/)** - Advanced configuration options
|
||||
|
||||
## NAPI-RS Implementation Details
|
||||
|
||||
### Native Performance
|
||||
|
||||
This binding uses NAPI-RS to provide native Node.js bindings with:
|
||||
|
||||
- **Zero-copy data transfer** between JavaScript and Rust layers
|
||||
- **Native thread pool** for concurrent document processing
|
||||
- **Direct memory management** for efficient large document handling
|
||||
- **Binary-compatible** pre-built native modules across platforms
|
||||
|
||||
### Threading Model
|
||||
|
||||
- Single documents are processed synchronously or asynchronously in a dedicated thread
|
||||
- Batch operations distribute work across available CPU cores
|
||||
- Thread count is configurable but defaults to system CPU count
|
||||
- Long-running extractions block the event loop when called through the synchronous APIs; use the async APIs to keep it responsive
|
||||
|
||||
### Memory Management
|
||||
|
||||
- Large documents (> 100 MB) are streamed to avoid loading entirely into memory
|
||||
- Temporary files are created in system temp directory for extraction
|
||||
- Memory is automatically released after extraction completion
|
||||
- ONNX models are cached in memory for repeated embeddings operations
|
||||
|
||||
## Features
|
||||
|
||||
### Supported File Formats (90+)
|
||||
|
||||
90+ file formats across 8 major categories with intelligent format detection and comprehensive metadata extraction.
|
||||
|
||||
#### Office Documents
|
||||
|
||||
| Category | Formats | Capabilities |
|
||||
|----------|---------|--------------|
|
||||
| **Word Processing** | `.docx`, `.docm`, `.dotx`, `.dotm`, `.dot`, `.odt` | Full text, tables, images, metadata, styles |
|
||||
| **Spreadsheets** | `.xlsx`, `.xlsm`, `.xlsb`, `.xls`, `.xla`, `.xlam`, `.xltm`, `.xltx`, `.xlt`, `.ods` | Sheet data, formulas, cell metadata, charts |
|
||||
| **Presentations** | `.pptx`, `.pptm`, `.ppsx`, `.potx`, `.potm`, `.pot`, `.ppt` | Slides, speaker notes, images, metadata |
|
||||
| **PDF** | `.pdf` | Text, tables, images, metadata, OCR support |
|
||||
| **eBooks** | `.epub`, `.fb2` | Chapters, metadata, embedded resources |
|
||||
| **Database** | `.dbf` | Table data extraction, field type support |
|
||||
| **Hangul** | `.hwp`, `.hwpx` | Korean document format, text extraction |
|
||||
|
||||
#### Images (OCR-Enabled)
|
||||
|
||||
| Category | Formats | Features |
|
||||
|----------|---------|----------|
|
||||
| **Raster** | `.png`, `.jpg`, `.jpeg`, `.gif`, `.webp`, `.bmp`, `.tiff`, `.tif` | OCR, table detection, EXIF metadata, dimensions, color space |
|
||||
| **Advanced** | `.jp2`, `.jpx`, `.jpm`, `.mj2`, `.jbig2`, `.jb2`, `.pnm`, `.pbm`, `.pgm`, `.ppm` | OCR via hayro-jpeg2000 (pure Rust decoder), JBIG2 support, table detection, format-specific metadata |
|
||||
| **Vector** | `.svg` | DOM parsing, embedded text, graphics metadata |
|
||||
|
||||
#### Web & Data
|
||||
|
||||
| Category | Formats | Features |
|
||||
|----------|---------|----------|
|
||||
| **Markup** | `.html`, `.htm`, `.xhtml`, `.xml`, `.svg` | DOM parsing, metadata (Open Graph, Twitter Card), link extraction |
|
||||
| **Structured Data** | `.json`, `.yaml`, `.yml`, `.toml`, `.csv`, `.tsv` | Schema detection, nested structures, validation |
|
||||
| **Text & Markdown** | `.txt`, `.md`, `.markdown`, `.djot`, `.rst`, `.org`, `.rtf` | CommonMark, GFM, Djot, reStructuredText, Org Mode |
|
||||
|
||||
#### Email & Archives
|
||||
|
||||
| Category | Formats | Features |
|
||||
|----------|---------|----------|
|
||||
| **Email** | `.eml`, `.msg` | Headers, body (HTML/plain), attachments, threading |
|
||||
| **Archives** | `.zip`, `.tar`, `.tgz`, `.gz`, `.7z` | File listing, nested archives, metadata |
|
||||
|
||||
#### Academic & Scientific
|
||||
|
||||
| Category | Formats | Features |
|
||||
|----------|---------|----------|
|
||||
| **Citations** | `.bib`, `.biblatex`, `.ris`, `.nbib`, `.enw`, `.csl` | Structured parsing: RIS (structured), PubMed/MEDLINE, EndNote XML (structured), BibTeX, CSL JSON |
|
||||
| **Scientific** | `.tex`, `.latex`, `.typst`, `.jats`, `.ipynb`, `.docbook` | LaTeX, Jupyter notebooks, PubMed JATS |
|
||||
| **Documentation** | `.opml`, `.pod`, `.mdoc`, `.troff` | Technical documentation formats |
|
||||
|
||||
#### Code Intelligence (300+ Languages)
|
||||
|
||||
| Feature | Description |
|
||||
|---------|-------------|
|
||||
| **Structure Extraction** | Functions, classes, methods, structs, interfaces, enums |
|
||||
| **Import/Export Analysis** | Module dependencies, re-exports, wildcard imports |
|
||||
| **Symbol Extraction** | Variables, constants, type aliases, properties |
|
||||
| **Docstring Parsing** | Google, NumPy, Sphinx, JSDoc, RustDoc, and 10+ formats |
|
||||
| **Diagnostics** | Parse errors with line/column positions |
|
||||
| **Syntax-Aware Chunking** | Split code by semantic boundaries, not arbitrary byte offsets |
|
||||
|
||||
Powered by [tree-sitter-language-pack](https://github.com/kreuzberg-dev/tree-sitter-language-pack) — [documentation](https://docs.tree-sitter-language-pack.kreuzberg.dev).
|
||||
|
||||
**[Complete Format Reference](https://docs.kreuzberg.dev/reference/formats/)**
|
||||
|
||||
### Key Capabilities
|
||||
|
||||
- **Text Extraction** - Extract all text content with position and formatting information
|
||||
- **Metadata Extraction** - Retrieve document properties, creation date, author, etc.
|
||||
- **Table Extraction** - Parse tables with structure and cell content preservation
|
||||
- **Image Extraction** - Extract embedded images and render page previews
|
||||
- **OCR Support** - Integrate multiple OCR backends for scanned documents
|
||||
- **Async/Await** - Non-blocking document processing with concurrent operations
|
||||
- **Plugin System** - Extensible post-processing for custom text transformation
|
||||
- **Embeddings** - Generate vector embeddings using ONNX Runtime models
|
||||
- **Batch Processing** - Efficiently process multiple documents in parallel
|
||||
- **Memory Efficient** - Stream large files without loading entirely into memory
|
||||
- **Language Detection** - Detect and support multiple languages in documents
|
||||
- **Code Intelligence** - Extract structure, imports, exports, symbols, and docstrings from [300+ programming languages](https://docs.tree-sitter-language-pack.kreuzberg.dev) via tree-sitter
|
||||
- **Configuration** - Fine-grained control over extraction behavior
|
||||
|
||||
### Performance Characteristics
|
||||
|
||||
| Format | Speed | Memory | Notes |
|
||||
|--------|-------|--------|-------|
|
||||
| **PDF (text)** | 10-100 MB/s | ~50MB per doc | Fastest extraction |
|
||||
| **Office docs** | 20-200 MB/s | ~100MB per doc | DOCX, XLSX, PPTX |
|
||||
| **Images (OCR)** | 1-5 MB/s | Variable | Depends on OCR backend |
|
||||
| **Archives** | 5-50 MB/s | ~200MB per doc | ZIP, TAR, etc. |
|
||||
| **Web formats** | 50-200 MB/s | Streaming | HTML, XML, JSON |
|
||||
|
||||
## OCR Support
|
||||
|
||||
Kreuzberg supports multiple OCR backends for extracting text from scanned documents and images:
|
||||
|
||||
- **Tesseract**
|
||||
|
||||
- **PaddleOCR**
|
||||
|
||||
### OCR Configuration Example
|
||||
|
||||
```typescript title="TypeScript"
|
||||
import { extractFile } from "@kreuzberg/node";
|
||||
|
||||
const config = {
|
||||
ocr: {
|
||||
backend: "tesseract",
|
||||
language: "eng+fra",
|
||||
tesseractConfig: {
|
||||
psm: 3,
|
||||
},
|
||||
},
|
||||
};
|
||||
|
||||
const result = await extractFile("document.pdf", null, config);
|
||||
console.log(result.content);
|
||||
```
|
||||
|
||||
## Async Support
|
||||
|
||||
This binding provides full async/await support for non-blocking document processing:
|
||||
|
||||
```typescript title="TypeScript"
|
||||
import { extractFile } from "@kreuzberg/node";
|
||||
|
||||
const result = await extractFile("document.pdf");
|
||||
console.log(result.content);
|
||||
```
|
||||
|
||||
## Plugin System
|
||||
|
||||
Kreuzberg supports extensible post-processing plugins for custom text transformation and filtering.
|
||||
|
||||
For detailed plugin documentation, visit [Plugin System Guide](https://docs.kreuzberg.dev/guides/plugins/).
|
||||
|
||||
## Embeddings Support
|
||||
|
||||
Generate vector embeddings for extracted text using the built-in ONNX Runtime support. Requires ONNX Runtime installation.
|
||||
|
||||
**[Embeddings Guide](https://docs.kreuzberg.dev/features/#embeddings)**
|
||||
|
||||
## Batch Processing
|
||||
|
||||
Process multiple documents efficiently:
|
||||
|
||||
```typescript title="TypeScript"
|
||||
import { batchExtractFilesSync } from "@kreuzberg/node";
|
||||
|
||||
const files = ["doc1.pdf", "doc2.docx", "doc3.pptx"];
|
||||
const results = batchExtractFilesSync(files);
|
||||
|
||||
results.forEach((result, i) => {
|
||||
console.log(`File ${i + 1}: ${result.content.length} characters`);
|
||||
});
|
||||
```
|
||||
|
||||
## Configuration
|
||||
|
||||
For advanced configuration options including language detection, table extraction, OCR settings, and more:
|
||||
|
||||
**[Configuration Guide](https://docs.kreuzberg.dev/guides/configuration/)**
|
||||
|
||||
## Documentation
|
||||
|
||||
- **[Official Documentation](https://docs.kreuzberg.dev/)**
|
||||
- **[API Reference](https://docs.kreuzberg.dev/reference/api-python/)**
|
||||
- **[Examples & Guides](https://docs.kreuzberg.dev/)**
|
||||
|
||||
## Contributing
|
||||
|
||||
Contributions are welcome! See [Contributing Guide](https://github.com/kreuzberg-dev/kreuzberg/blob/main/CONTRIBUTING.md).
|
||||
|
||||
## Part of Kreuzberg.dev
|
||||
|
||||
- [Kreuzberg Cloud](https://github.com/kreuzberg-dev/kreuzberg-cloud) — managed extraction API with SDKs, dashboards, and observability.
|
||||
- [kreuzcrawl](https://github.com/kreuzberg-dev/kreuzcrawl) — web crawling and scraping with HTML→Markdown and headless-Chrome fallback.
|
||||
- [html-to-markdown](https://github.com/kreuzberg-dev/html-to-markdown) — fast, lossless HTML→Markdown engine.
|
||||
- [liter-llm](https://github.com/kreuzberg-dev/liter-llm) — universal LLM API client with native bindings for 14 languages and 143 providers.
|
||||
- [tree-sitter-language-pack](https://github.com/kreuzberg-dev/tree-sitter-language-pack) — tree-sitter grammars and code-intelligence primitives.
|
||||
- [alef](https://github.com/kreuzberg-dev/alef) — the polyglot binding generator that produces this README and all per-language bindings.
|
||||
- [Discord](https://discord.gg/xt9WY3GnKR) — community, roadmap, announcements.
|
||||
|
||||
## License
|
||||
|
||||
Elastic-2.0 License — see [LICENSE](../../LICENSE) for details.
|
||||
|
||||
## Support
|
||||
|
||||
- **Discord Community**: [Join our Discord](https://discord.gg/xt9WY3GnKR)
|
||||
- **GitHub Issues**: [Report bugs](https://github.com/kreuzberg-dev/kreuzberg/issues)
|
||||
- **Discussions**: [Ask questions](https://github.com/kreuzberg-dev/kreuzberg/discussions)
|
||||
27
crates/kreuzberg-node/format-metadata-wrapper.js
generated
Normal file
27
crates/kreuzberg-node/format-metadata-wrapper.js
generated
Normal file
@@ -0,0 +1,27 @@
|
||||
/**
 * Wrap a JsFormatMetadata object to expose its format-specific metadata.
 *
 * This works around the limitation that `#[napi(getter)]` doesn't work on
 * `#[napi(object)]`: the object carries the variant name in `format_type` and
 * the variant's data as a JSON string under the key `"0"`. The decoded data is
 * attached to the same object as a read-only, non-enumerable property named
 * after the variant.
 *
 * @param {object} fmt - Metadata object to decorate; mutated in place.
 * @returns {*} The same `fmt` value that was passed in. Non-object inputs and
 *   objects without a usable tag or payload are returned untouched.
 */
export function wrapFormatMetadata(fmt) {
  if (!fmt || typeof fmt !== "object") return fmt;

  const tag = fmt.format_type;
  const payload = fmt["0"];

  // Without a variant name there is nothing meaningful to attach. Defining the
  // property anyway would create one literally called "undefined".
  if (typeof tag !== "string" || tag === "") return fmt;
  if (!payload) return fmt;

  try {
    // Add the typed variant property as a non-enumerable property so it does
    // not show up in Object.keys() or JSON.stringify().
    Object.defineProperty(fmt, tag, {
      value: JSON.parse(payload),
      enumerable: false,
      writable: false,
      configurable: false,
    });
  } catch {
    // Best effort: ignore malformed JSON payloads, and also defineProperty
    // failures (e.g. the property was already attached by an earlier call).
  }

  return fmt;
}
|
||||
5488
crates/kreuzberg-node/index.d.ts
generated
vendored
Normal file
5488
crates/kreuzberg-node/index.d.ts
generated
vendored
Normal file
File diff suppressed because it is too large
Load Diff
15
crates/kreuzberg-node/npm/darwin-arm64/package.json
generated
Normal file
15
crates/kreuzberg-node/npm/darwin-arm64/package.json
generated
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"name": "@kreuzberg/node-darwin-arm64",
|
||||
"version": "5.0.0-rc.3",
|
||||
"license": "Elastic-2.0",
|
||||
"repository": {
|
||||
"type": "git",
|
||||
"url": "git+https://github.com/kreuzberg-dev/kreuzberg.git"
|
||||
},
|
||||
"main": "kreuzberg-node.darwin-arm64.node",
|
||||
"files": ["kreuzberg-node.darwin-arm64.node"],
|
||||
"os": ["darwin"],
|
||||
"cpu": ["arm64"],
|
||||
"engines": { "node": ">= 18" },
|
||||
"publishConfig": { "access": "public" }
|
||||
}
|
||||
15
crates/kreuzberg-node/npm/darwin-x64/package.json
generated
Normal file
15
crates/kreuzberg-node/npm/darwin-x64/package.json
generated
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"name": "@kreuzberg/node-darwin-x64",
|
||||
"version": "5.0.0-rc.3",
|
||||
"license": "Elastic-2.0",
|
||||
"repository": {
|
||||
"type": "git",
|
||||
"url": "git+https://github.com/kreuzberg-dev/kreuzberg.git"
|
||||
},
|
||||
"main": "kreuzberg-node.darwin-x64.node",
|
||||
"files": ["kreuzberg-node.darwin-x64.node"],
|
||||
"os": ["darwin"],
|
||||
"cpu": ["x64"],
|
||||
"engines": { "node": ">= 18" },
|
||||
"publishConfig": { "access": "public" }
|
||||
}
|
||||
16
crates/kreuzberg-node/npm/linux-arm64-gnu/package.json
generated
Normal file
16
crates/kreuzberg-node/npm/linux-arm64-gnu/package.json
generated
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"name": "@kreuzberg/node-linux-arm64-gnu",
|
||||
"version": "5.0.0-rc.3",
|
||||
"license": "Elastic-2.0",
|
||||
"repository": {
|
||||
"type": "git",
|
||||
"url": "git+https://github.com/kreuzberg-dev/kreuzberg.git"
|
||||
},
|
||||
"main": "kreuzberg-node.linux-arm64-gnu.node",
|
||||
"files": ["kreuzberg-node.linux-arm64-gnu.node"],
|
||||
"os": ["linux"],
|
||||
"cpu": ["arm64"],
|
||||
"libc": ["glibc"],
|
||||
"engines": { "node": ">= 18" },
|
||||
"publishConfig": { "access": "public" }
|
||||
}
|
||||
16
crates/kreuzberg-node/npm/linux-arm64-musl/package.json
generated
Normal file
16
crates/kreuzberg-node/npm/linux-arm64-musl/package.json
generated
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"name": "@kreuzberg/node-linux-arm64-musl",
|
||||
"version": "5.0.0-rc.3",
|
||||
"license": "Elastic-2.0",
|
||||
"repository": {
|
||||
"type": "git",
|
||||
"url": "git+https://github.com/kreuzberg-dev/kreuzberg.git"
|
||||
},
|
||||
"main": "kreuzberg-node.linux-arm64-musl.node",
|
||||
"files": ["kreuzberg-node.linux-arm64-musl.node"],
|
||||
"os": ["linux"],
|
||||
"cpu": ["arm64"],
|
||||
"libc": ["musl"],
|
||||
"engines": { "node": ">= 18" },
|
||||
"publishConfig": { "access": "public" }
|
||||
}
|
||||
16
crates/kreuzberg-node/npm/linux-x64-gnu/package.json
generated
Normal file
16
crates/kreuzberg-node/npm/linux-x64-gnu/package.json
generated
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"name": "@kreuzberg/node-linux-x64-gnu",
|
||||
"version": "5.0.0-rc.3",
|
||||
"license": "Elastic-2.0",
|
||||
"repository": {
|
||||
"type": "git",
|
||||
"url": "git+https://github.com/kreuzberg-dev/kreuzberg.git"
|
||||
},
|
||||
"main": "kreuzberg-node.linux-x64-gnu.node",
|
||||
"files": ["kreuzberg-node.linux-x64-gnu.node"],
|
||||
"os": ["linux"],
|
||||
"cpu": ["x64"],
|
||||
"libc": ["glibc"],
|
||||
"engines": { "node": ">= 18" },
|
||||
"publishConfig": { "access": "public" }
|
||||
}
|
||||
16
crates/kreuzberg-node/npm/linux-x64-musl/package.json
generated
Normal file
16
crates/kreuzberg-node/npm/linux-x64-musl/package.json
generated
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"name": "@kreuzberg/node-linux-x64-musl",
|
||||
"version": "5.0.0-rc.3",
|
||||
"license": "Elastic-2.0",
|
||||
"repository": {
|
||||
"type": "git",
|
||||
"url": "git+https://github.com/kreuzberg-dev/kreuzberg.git"
|
||||
},
|
||||
"main": "kreuzberg-node.linux-x64-musl.node",
|
||||
"files": ["kreuzberg-node.linux-x64-musl.node"],
|
||||
"os": ["linux"],
|
||||
"cpu": ["x64"],
|
||||
"libc": ["musl"],
|
||||
"engines": { "node": ">= 18" },
|
||||
"publishConfig": { "access": "public" }
|
||||
}
|
||||
15
crates/kreuzberg-node/npm/win32-arm64-msvc/package.json
generated
Normal file
15
crates/kreuzberg-node/npm/win32-arm64-msvc/package.json
generated
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"name": "@kreuzberg/node-win32-arm64-msvc",
|
||||
"version": "5.0.0-rc.3",
|
||||
"license": "Elastic-2.0",
|
||||
"repository": {
|
||||
"type": "git",
|
||||
"url": "git+https://github.com/kreuzberg-dev/kreuzberg.git"
|
||||
},
|
||||
"main": "kreuzberg-node.win32-arm64-msvc.node",
|
||||
"files": ["kreuzberg-node.win32-arm64-msvc.node"],
|
||||
"os": ["win32"],
|
||||
"cpu": ["arm64"],
|
||||
"engines": { "node": ">= 18" },
|
||||
"publishConfig": { "access": "public" }
|
||||
}
|
||||
15
crates/kreuzberg-node/npm/win32-x64-msvc/package.json
generated
Normal file
15
crates/kreuzberg-node/npm/win32-x64-msvc/package.json
generated
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"name": "@kreuzberg/node-win32-x64-msvc",
|
||||
"version": "5.0.0-rc.3",
|
||||
"license": "Elastic-2.0",
|
||||
"repository": {
|
||||
"type": "git",
|
||||
"url": "git+https://github.com/kreuzberg-dev/kreuzberg.git"
|
||||
},
|
||||
"main": "kreuzberg-node.win32-x64-msvc.node",
|
||||
"files": ["kreuzberg-node.win32-x64-msvc.node"],
|
||||
"os": ["win32"],
|
||||
"cpu": ["x64"],
|
||||
"engines": { "node": ">= 18" },
|
||||
"publishConfig": { "access": "public" }
|
||||
}
|
||||
52
crates/kreuzberg-node/package.json
generated
Normal file
52
crates/kreuzberg-node/package.json
generated
Normal file
@@ -0,0 +1,52 @@
|
||||
{
|
||||
"name": "@kreuzberg/node",
|
||||
"version": "5.0.0-rc.3",
|
||||
"description": "High-performance document intelligence library",
|
||||
"license": "Elastic-2.0",
|
||||
"repository": {
|
||||
"type": "git",
|
||||
"url": "git+https://github.com/kreuzberg-dev/kreuzberg.git"
|
||||
},
|
||||
"main": "index.js",
|
||||
"types": "index.d.ts",
|
||||
"exports": {
|
||||
".": {
|
||||
"types": "./index.d.ts",
|
||||
"require": "./index.js",
|
||||
"default": "./index.js"
|
||||
}
|
||||
},
|
||||
"files": ["index.js", "index.d.ts", "*.node"],
|
||||
"optionalDependencies": {
|
||||
"@kreuzberg/node-linux-x64-gnu": "5.0.0-rc.3",
|
||||
"@kreuzberg/node-linux-arm64-gnu": "5.0.0-rc.3",
|
||||
"@kreuzberg/node-linux-x64-musl": "5.0.0-rc.3",
|
||||
"@kreuzberg/node-linux-arm64-musl": "5.0.0-rc.3",
|
||||
"@kreuzberg/node-darwin-x64": "5.0.0-rc.3",
|
||||
"@kreuzberg/node-darwin-arm64": "5.0.0-rc.3",
|
||||
"@kreuzberg/node-win32-x64-msvc": "5.0.0-rc.3",
|
||||
"@kreuzberg/node-win32-arm64-msvc": "5.0.0-rc.3"
|
||||
},
|
||||
"napi": {
|
||||
"packageName": "@kreuzberg/node",
|
||||
"binaryName": "kreuzberg-node",
|
||||
"targets": [
|
||||
"x86_64-unknown-linux-gnu",
|
||||
"aarch64-unknown-linux-gnu",
|
||||
"x86_64-unknown-linux-musl",
|
||||
"aarch64-unknown-linux-musl",
|
||||
"x86_64-apple-darwin",
|
||||
"aarch64-apple-darwin",
|
||||
"x86_64-pc-windows-msvc",
|
||||
"aarch64-pc-windows-msvc"
|
||||
]
|
||||
},
|
||||
"scripts": {
|
||||
"build": "napi build --platform --release",
|
||||
"artifacts": "napi artifacts",
|
||||
"prepublishOnly": "napi prepublish -t npm --skip-optional-publish"
|
||||
},
|
||||
"engines": { "node": ">= 18" },
|
||||
"publishConfig": { "access": "public" },
|
||||
"devDependencies": { "@napi-rs/cli": "^3.6.2" }
|
||||
}
|
||||
15166
crates/kreuzberg-node/src/lib.rs
generated
Normal file
15166
crates/kreuzberg-node/src/lib.rs
generated
Normal file
File diff suppressed because it is too large
Load Diff
41
crates/kreuzberg-paddle-ocr/Cargo.toml
Normal file
41
crates/kreuzberg-paddle-ocr/Cargo.toml
Normal file
@@ -0,0 +1,41 @@
|
||||
[package]
|
||||
name = "kreuzberg-paddle-ocr"
|
||||
version.workspace = true
|
||||
edition = "2024"
|
||||
rust-version.workspace = true
|
||||
authors.workspace = true
|
||||
description = "PaddleOCR via ONNX Runtime for Kreuzberg - high-performance text recognition"
|
||||
license = "MIT"
|
||||
repository.workspace = true
|
||||
homepage = "https://kreuzberg.dev"
|
||||
documentation = "https://docs.rs/kreuzberg-paddle-ocr"
|
||||
readme = "README.md"
|
||||
keywords = ["paddle", "ocr", "onnx", "recognition", "detection"]
|
||||
categories = ["computer-vision", "text-processing"]
|
||||
exclude = ["tests/*", ".github/*"]
|
||||
|
||||
[package.metadata.docs.rs]
|
||||
all-features = true
|
||||
rustdoc-args = ["--cfg", "docsrs"]
|
||||
|
||||
[lib]
|
||||
name = "kreuzberg_paddle_ocr"
|
||||
crate-type = ["lib"]
|
||||
|
||||
[features]
|
||||
default = []
|
||||
load-dynamic = ["ort/load-dynamic"]
|
||||
|
||||
[dependencies]
|
||||
geo-clipper = "0.9"
|
||||
geo-types = "0.7"
|
||||
image = { workspace = true }
|
||||
|
||||
# Crate-specific dependencies (not in workspace)
|
||||
# Disable rayon - OCR parallelism is handled at higher level
|
||||
imageproc = { version = "0.26", default-features = false }
|
||||
ndarray = "0.17"
|
||||
ort = { workspace = true, features = ["ndarray"] }
|
||||
# Workspace dependencies
|
||||
serde = { workspace = true }
|
||||
thiserror = { workspace = true }
|
||||
22
crates/kreuzberg-paddle-ocr/LICENSE
Normal file
22
crates/kreuzberg-paddle-ocr/LICENSE
Normal file
@@ -0,0 +1,22 @@
|
||||
MIT License
|
||||
|
||||
Copyright (c) 2024 mg-chao
|
||||
Copyright (c) 2025 Na'aman Hirschfeld
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all
|
||||
copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
||||
57
crates/kreuzberg-paddle-ocr/README.md
Normal file
57
crates/kreuzberg-paddle-ocr/README.md
Normal file
@@ -0,0 +1,57 @@
|
||||
# kreuzberg-paddle-ocr
|
||||
|
||||
[](https://github.com/kreuzberg-dev/alef)
|
||||
|
||||
PaddleOCR via ONNX Runtime for Kreuzberg - high-performance text detection and recognition using PaddlePaddle's OCR models.
|
||||
|
||||
Based on the original [paddle-ocr-rs](https://github.com/mg-chao/paddle-ocr-rs) by [mg-chao](https://github.com/mg-chao), this vendored version includes improvements for Kreuzberg integration:
|
||||
|
||||
- **Workspace Dependency Alignment**: Uses Kreuzberg's workspace dependencies for consistency
|
||||
- **Edition 2024**: Updated to Rust 2024 edition
|
||||
- **ndarray Compatibility**: Aligned with Kreuzberg's ndarray version requirements
|
||||
- **Integration**: Designed to work seamlessly with Kreuzberg's OCR backend system
|
||||
|
||||
## Features
|
||||
|
||||
- Text detection using DBNet (Differentiable Binarization)
|
||||
- Text recognition using CRNN (Convolutional Recurrent Neural Network)
|
||||
- Angle detection for rotated text
|
||||
- Support for multiple languages via PaddleOCR models
|
||||
- ONNX Runtime for efficient CPU inference
|
||||
|
||||
## ONNX Runtime Requirement
|
||||
|
||||
This crate requires **ONNX Runtime 1.24+** at runtime.
|
||||
|
||||
Install it:
|
||||
|
||||
- **macOS (Homebrew)**: `brew install onnxruntime`
|
||||
- **Linux**: Download from [ONNX Runtime releases](https://github.com/microsoft/onnxruntime/releases)
|
||||
- **Windows**: Download from [ONNX Runtime releases](https://github.com/microsoft/onnxruntime/releases)
|
||||
|
||||
## Usage
|
||||
|
||||
This crate is used internally by Kreuzberg when the `paddle-ocr` feature is enabled:
|
||||
|
||||
```toml
|
||||
[dependencies]
|
||||
kreuzberg = { version = "4.2", features = ["paddle-ocr"] }
|
||||
```
|
||||
|
||||
## Models
|
||||
|
||||
PaddleOCR models are automatically downloaded and cached on first use. Supported models include:
|
||||
|
||||
- PP-OCRv5 server detection model
|
||||
- PP-OCRv5 per-family recognition models (11 script families)
|
||||
- PP-OCRv2 mobile angle classification model
|
||||
|
||||
## License
|
||||
|
||||
This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
|
||||
|
||||
## Acknowledgements
|
||||
|
||||
This project is based on the original [paddle-ocr-rs](https://github.com/mg-chao/paddle-ocr-rs) by [mg-chao](https://github.com/mg-chao), originally licensed under Apache-2.0. We are grateful for the foundational work that made this integration possible.
|
||||
|
||||
The original paddle-ocr-rs provides Rust bindings for PaddlePaddle's OCR models via ONNX Runtime, enabling efficient text detection and recognition without Python dependencies.
|
||||
139
crates/kreuzberg-paddle-ocr/src/angle_net.rs
Normal file
139
crates/kreuzberg-paddle-ocr/src/angle_net.rs
Normal file
@@ -0,0 +1,139 @@
|
||||
use crate::{
|
||||
base_net::BaseNet,
|
||||
constants::{IMAGENET_MEAN_VALUES, IMAGENET_NORM_VALUES},
|
||||
ocr_error::OcrError,
|
||||
ocr_result::Angle,
|
||||
ocr_utils::OcrUtils,
|
||||
};
|
||||
|
||||
use ort::{
|
||||
inputs,
|
||||
session::{Session, SessionOutputs},
|
||||
value::Tensor,
|
||||
};
|
||||
|
||||
// PP-LCNet_x1_0_textline_ori preprocessing (ImageNet normalization).
|
||||
// Input: resize to 160×80 (W×H), normalize with ImageNet mean/std.
|
||||
// Formula in substract_mean_normalize: (pixel - MEAN) * NORM
|
||||
// For ImageNet: (pixel/255 - mean) / std = (pixel - mean*255) * (1/(std*255))
|
||||
// V2 PP-LCNet angle classifier expects [3, 80, 160] input (NCHW).
|
||||
/// Width, in pixels, that every crop is resized to before angle classification.
const ANGLE_DST_WIDTH: u32 = 160;
|
||||
/// Height, in pixels, that every crop is resized to before angle classification.
const ANGLE_DST_HEIGHT: u32 = 80;
|
||||
/// Number of leading scores read from the classifier output; the position of
/// the largest one becomes the predicted angle index.
const ANGLE_COLS: usize = 2;
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct AngleNet {
|
||||
session: Option<Session>,
|
||||
input_names: Vec<String>,
|
||||
}
|
||||
|
||||
impl BaseNet for AngleNet {
|
||||
fn new() -> Self {
|
||||
Self {
|
||||
session: None,
|
||||
input_names: Vec::new(),
|
||||
}
|
||||
}
|
||||
|
||||
fn set_input_names(&mut self, input_names: Vec<String>) {
|
||||
self.input_names = input_names;
|
||||
}
|
||||
|
||||
fn set_session(&mut self, session: Option<Session>) {
|
||||
self.session = session;
|
||||
}
|
||||
}
|
||||
|
||||
impl AngleNet {
|
||||
pub fn get_angles(
|
||||
&self,
|
||||
part_imgs: &[image::RgbImage],
|
||||
do_angle: bool,
|
||||
most_angle: bool,
|
||||
cls_thresh: f32,
|
||||
) -> Result<Vec<Angle>, OcrError> {
|
||||
// Pre-allocate — we know exact count upfront.
|
||||
let mut angles = Vec::with_capacity(part_imgs.len());
|
||||
|
||||
if do_angle {
|
||||
for img in part_imgs {
|
||||
let angle = self.get_angle(img, cls_thresh)?;
|
||||
angles.push(angle);
|
||||
}
|
||||
} else {
|
||||
angles.extend(part_imgs.iter().map(|_| Angle::default()));
|
||||
}
|
||||
|
||||
if do_angle && most_angle {
|
||||
let sum: i32 = angles.iter().map(|x| x.index).sum();
|
||||
let half_percent = angles.len() as f32 / 2.0;
|
||||
let most_angle_index = if (sum as f32) < half_percent { 0 } else { 1 };
|
||||
|
||||
for angle in angles.iter_mut() {
|
||||
angle.index = most_angle_index;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(angles)
|
||||
}
|
||||
|
||||
fn get_angle(&self, img_src: &image::RgbImage, cls_thresh: f32) -> Result<Angle, OcrError> {
|
||||
let Some(session) = &self.session else {
|
||||
return Err(OcrError::SessionNotInitialized);
|
||||
};
|
||||
|
||||
let angle_img = image::imageops::resize(
|
||||
img_src,
|
||||
ANGLE_DST_WIDTH,
|
||||
ANGLE_DST_HEIGHT,
|
||||
image::imageops::FilterType::Triangle,
|
||||
);
|
||||
|
||||
let input_tensors =
|
||||
OcrUtils::substract_mean_normalize(&angle_img, &IMAGENET_MEAN_VALUES, &IMAGENET_NORM_VALUES);
|
||||
|
||||
let input_tensors = Tensor::from_array(input_tensors)?;
|
||||
|
||||
// SAFETY: ONNX Runtime C API is thread-safe for concurrent inference.
|
||||
#[allow(unsafe_code)]
|
||||
let outputs = unsafe {
|
||||
let session_ptr = session as *const Session as *mut Session;
|
||||
(*session_ptr).run(inputs![self.input_names[0].as_str() => input_tensors])?
|
||||
};
|
||||
|
||||
let mut angle = Self::score_to_angle(&outputs, ANGLE_COLS)?;
|
||||
|
||||
// Only apply rotation if confidence exceeds threshold (matches PaddleOCR's cls_thresh=0.9)
|
||||
if angle.score < cls_thresh {
|
||||
angle.index = 0; // Keep original orientation when confidence is low
|
||||
}
|
||||
|
||||
Ok(angle)
|
||||
}
|
||||
|
||||
fn score_to_angle(output_tensor: &SessionOutputs, angle_cols: usize) -> Result<Angle, OcrError> {
|
||||
let (_, red_data) = output_tensor.iter().next().ok_or_else(|| {
|
||||
OcrError::Io(std::io::Error::new(
|
||||
std::io::ErrorKind::InvalidData,
|
||||
"No output tensors found in angle classification session output",
|
||||
))
|
||||
})?;
|
||||
|
||||
let src_data: Vec<f32> = red_data.try_extract_tensor::<f32>()?.1.to_vec();
|
||||
|
||||
let mut angle = Angle::default();
|
||||
let mut max_value = f32::MIN;
|
||||
let mut angle_index = 0;
|
||||
|
||||
for (i, value) in src_data.iter().take(angle_cols).enumerate() {
|
||||
if *value > max_value {
|
||||
max_value = *value;
|
||||
angle_index = i as i32;
|
||||
}
|
||||
}
|
||||
|
||||
angle.index = angle_index;
|
||||
angle.score = max_value;
|
||||
Ok(angle)
|
||||
}
|
||||
}
|
||||
78
crates/kreuzberg-paddle-ocr/src/base_net.rs
Normal file
78
crates/kreuzberg-paddle-ocr/src/base_net.rs
Normal file
@@ -0,0 +1,78 @@
|
||||
use ort::session::{
|
||||
Session,
|
||||
builder::{GraphOptimizationLevel, SessionBuilder},
|
||||
};
|
||||
|
||||
use crate::ocr_error::OcrError;
|
||||
|
||||
pub trait BaseNet {
|
||||
fn new() -> Self;
|
||||
|
||||
fn get_session_builder(
|
||||
&self,
|
||||
num_thread: usize,
|
||||
builder_fn: Option<fn(SessionBuilder) -> Result<SessionBuilder, ort::Error>>,
|
||||
) -> Result<SessionBuilder, OcrError> {
|
||||
let builder = Session::builder()?;
|
||||
let builder = match builder_fn {
|
||||
Some(custom) => custom(builder)?,
|
||||
None => builder
|
||||
.with_optimization_level(GraphOptimizationLevel::All)
|
||||
.map_err(|e| OcrError::Ort(ort::Error::new(e.message())))?
|
||||
.with_intra_threads(num_thread)
|
||||
.map_err(|e| OcrError::Ort(ort::Error::new(e.message())))?
|
||||
.with_inter_threads(1)
|
||||
.map_err(|e| OcrError::Ort(ort::Error::new(e.message())))?,
|
||||
};
|
||||
|
||||
Ok(builder)
|
||||
}
|
||||
|
||||
fn set_input_names(&mut self, input_names: Vec<String>);
|
||||
fn set_session(&mut self, session: Option<Session>);
|
||||
|
||||
fn init(&mut self, session: Session) {
|
||||
let input_names: Vec<String> = session.inputs().iter().map(|input| input.name().to_string()).collect();
|
||||
|
||||
self.set_input_names(input_names);
|
||||
self.set_session(Some(session));
|
||||
}
|
||||
|
||||
fn init_model(
|
||||
&mut self,
|
||||
path: &str,
|
||||
num_thread: usize,
|
||||
builder_fn: Option<fn(SessionBuilder) -> Result<SessionBuilder, ort::Error>>,
|
||||
) -> Result<(), OcrError> {
|
||||
// Wrap ORT session creation in catch_unwind to prevent mutex poisoning
|
||||
// on platforms where ORT initialization can panic (notably Windows).
|
||||
let path_owned = path.to_string();
|
||||
let session = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
|
||||
let mut builder = self.get_session_builder(num_thread, builder_fn)?;
|
||||
builder.commit_from_file(&path_owned).map_err(OcrError::from)
|
||||
}))
|
||||
.map_err(|_| OcrError::Ort(ort::Error::new("ORT session initialization panicked")))??;
|
||||
self.init(session);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn init_model_from_memory(
|
||||
&mut self,
|
||||
model_bytes: &[u8],
|
||||
num_thread: usize,
|
||||
builder_fn: Option<fn(SessionBuilder) -> Result<SessionBuilder, ort::Error>>,
|
||||
) -> Result<(), OcrError> {
|
||||
// Wrap ORT session creation in catch_unwind to prevent mutex poisoning
|
||||
// on platforms where ORT initialization can panic (notably Windows).
|
||||
let session = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
|
||||
let mut builder = self.get_session_builder(num_thread, builder_fn)?;
|
||||
builder.commit_from_memory(model_bytes).map_err(OcrError::from)
|
||||
}))
|
||||
.map_err(|_| OcrError::Ort(ort::Error::new("ORT session initialization panicked")))??;
|
||||
|
||||
self.init(session);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
33
crates/kreuzberg-paddle-ocr/src/constants.rs
Normal file
33
crates/kreuzberg-paddle-ocr/src/constants.rs
Normal file
@@ -0,0 +1,33 @@
|
||||
//! Shared normalization constants for PaddleOCR preprocessing.
|
||||
//!
|
||||
//! Two normalization schemes are used:
|
||||
//!
|
||||
//! - **ImageNet** (`IMAGENET_MEAN_VALUES` / `IMAGENET_NORM_VALUES`): used by the text
|
||||
//! detection network (`DbNet`) and the angle classifier (`AngleNet`).
|
||||
//! Formula: `(pixel - mean * 255) * (1 / (std * 255))`.
|
||||
//!
|
||||
//! - **CRNN** (`CRNN_MEAN_VALUES` / `CRNN_NORM_VALUES`): used by the text recognition
|
||||
//! network (`CrnnNet`).
|
||||
//! Formula: `(pixel - 127.5) * (1 / 127.5)`.
|
||||
|
||||
/// Per-channel (R, G, B) ImageNet means on the 0-255 pixel scale.
///
/// Each entry is the conventional ImageNet mean (`0.485`, `0.456`, `0.406`)
/// multiplied by 255. Shared by `DbNet` (text detection) and `AngleNet`
/// (angle classification).
pub(crate) const IMAGENET_MEAN_VALUES: [f32; 3] = [
    0.485 * 255.0,
    0.456 * 255.0,
    0.406 * 255.0,
];

/// Per-channel (R, G, B) ImageNet scale factors, i.e. `1 / (std * 255)`.
///
/// Built from the ImageNet standard deviations `0.229`, `0.224`, `0.225`.
/// Shared by `DbNet` (text detection) and `AngleNet` (angle classification).
pub(crate) const IMAGENET_NORM_VALUES: [f32; 3] = [
    1.0 / (0.229 * 255.0),
    1.0 / (0.224 * 255.0),
    1.0 / (0.225 * 255.0),
];

/// Per-channel (R, G, B) mean for `CrnnNet` (text recognition): `127.5` everywhere.
pub(crate) const CRNN_MEAN_VALUES: [f32; 3] = [127.5; 3];

/// Per-channel (R, G, B) scale factor for `CrnnNet` (text recognition):
/// `1 / 127.5` everywhere.
pub(crate) const CRNN_NORM_VALUES: [f32; 3] = [1.0 / 127.5; 3];
|
||||
393
crates/kreuzberg-paddle-ocr/src/crnn_net.rs
Normal file
393
crates/kreuzberg-paddle-ocr/src/crnn_net.rs
Normal file
@@ -0,0 +1,393 @@
|
||||
use ndarray::Array4;
|
||||
use ort::session::Session;
|
||||
use ort::value::Tensor;
|
||||
use ort::{inputs, session::builder::SessionBuilder};
|
||||
use std::collections::HashMap;
|
||||
|
||||
use crate::{
|
||||
base_net::BaseNet,
|
||||
constants::{CRNN_MEAN_VALUES, CRNN_NORM_VALUES},
|
||||
ocr_error::OcrError,
|
||||
ocr_result::TextLine,
|
||||
ocr_utils::OcrUtils,
|
||||
};
|
||||
|
||||
/// Target height, in pixels, used to derive each crop's scaled width for recognition.
const CRNN_DST_HEIGHT: u32 = 48;
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct CrnnNet {
|
||||
session: Option<Session>,
|
||||
keys: Vec<String>,
|
||||
input_names: Vec<String>,
|
||||
}
|
||||
|
||||
impl BaseNet for CrnnNet {
|
||||
fn new() -> Self {
|
||||
Self {
|
||||
session: None,
|
||||
keys: Vec::new(),
|
||||
input_names: Vec::new(),
|
||||
}
|
||||
}
|
||||
|
||||
fn set_input_names(&mut self, input_names: Vec<String>) {
|
||||
self.input_names = input_names;
|
||||
}
|
||||
|
||||
fn set_session(&mut self, session: Option<Session>) {
|
||||
self.session = session;
|
||||
}
|
||||
}
|
||||
|
||||
impl CrnnNet {
|
||||
pub fn init_model(
|
||||
&mut self,
|
||||
path: &str,
|
||||
num_thread: usize,
|
||||
builder_fn: Option<fn(SessionBuilder) -> Result<SessionBuilder, ort::Error>>,
|
||||
) -> Result<(), OcrError> {
|
||||
BaseNet::init_model(self, path, num_thread, builder_fn)?;
|
||||
|
||||
self.keys = self.get_keys()?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn init_model_dict_file(
|
||||
&mut self,
|
||||
path: &str,
|
||||
num_thread: usize,
|
||||
builder_fn: Option<fn(SessionBuilder) -> Result<SessionBuilder, ort::Error>>,
|
||||
dict_file_path: &str,
|
||||
) -> Result<(), OcrError> {
|
||||
BaseNet::init_model(self, path, num_thread, builder_fn)?;
|
||||
|
||||
self.read_keys_from_file(dict_file_path)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn init_model_from_memory(
|
||||
&mut self,
|
||||
model_bytes: &[u8],
|
||||
num_thread: usize,
|
||||
builder_fn: Option<fn(SessionBuilder) -> Result<SessionBuilder, ort::Error>>,
|
||||
) -> Result<(), OcrError> {
|
||||
BaseNet::init_model_from_memory(self, model_bytes, num_thread, builder_fn)?;
|
||||
|
||||
self.keys = self.get_keys()?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn get_keys(&mut self) -> Result<Vec<String>, OcrError> {
|
||||
let session = self.session.as_ref().ok_or(OcrError::SessionNotInitialized)?;
|
||||
|
||||
let metadata = session.metadata()?;
|
||||
let model_charater_list = metadata.custom("character").ok_or_else(|| {
|
||||
OcrError::Io(std::io::Error::new(
|
||||
std::io::ErrorKind::NotFound,
|
||||
"crnn_net character not found in metadata",
|
||||
))
|
||||
})?;
|
||||
|
||||
// PP-OCRv5 model metadata already includes the CTC blank token ("#") at
|
||||
// index 0 and the space token (" ") at the end. Do NOT prepend/append
|
||||
// extra tokens — doing so shifts every character index by one and
|
||||
// produces garbled output.
|
||||
let keys: Vec<String> = model_charater_list.split('\n').map(|s: &str| s.to_string()).collect();
|
||||
|
||||
Ok(keys)
|
||||
}
|
||||
|
||||
/// Replaces the character dictionary with the contents of the file at `path`,
/// one key per line.
///
/// Lines are split on `\n`; a single trailing `\r` is removed from each line
/// so dictionaries saved with CRLF line endings yield the same keys as LF
/// ones. No other whitespace is trimmed, because the space token (" ") is a
/// legitimate key.
///
/// # Errors
///
/// Returns an `OcrError` when the file cannot be read as UTF-8 text.
fn read_keys_from_file(&mut self, path: &str) -> Result<(), OcrError> {
    let content = std::fs::read_to_string(path)?;

    // PP-OCRv5 dict files already include the CTC blank token ("#") at
    // index 0 and the space token (" ") at the end. Do NOT prepend/append
    // extra tokens - doing so shifts every character index by one and
    // produces garbled output.
    let keys: Vec<String> = content
        .split('\n')
        .map(|line| line.strip_suffix('\r').unwrap_or(line).to_string())
        .collect();

    self.keys = keys;
    Ok(())
}
|
||||
|
||||
pub fn get_text_lines(
|
||||
&self,
|
||||
part_imgs: &[image::RgbImage],
|
||||
angle_rollback_records: &HashMap<usize, image::RgbImage>,
|
||||
angle_rollback_threshold: f32,
|
||||
batch_size: u32,
|
||||
) -> Result<Vec<TextLine>, OcrError> {
|
||||
if part_imgs.is_empty() {
|
||||
return Ok(Vec::new());
|
||||
}
|
||||
|
||||
// Batch recognition: sort by aspect ratio, batch, pad to max width
|
||||
let mut text_lines = self.get_text_lines_batched(part_imgs, batch_size)?;
|
||||
|
||||
// Angle rollback: re-recognize individual images that scored poorly
|
||||
for (index, text_line) in text_lines.iter_mut().enumerate() {
|
||||
if (text_line.text_score.is_nan() || text_line.text_score < angle_rollback_threshold)
|
||||
&& let Some(angle_rollback_record) = angle_rollback_records.get(&index)
|
||||
{
|
||||
*text_line = self.get_text_line(angle_rollback_record)?;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(text_lines)
|
||||
}
|
||||
|
||||
/// Batch recognition: sort crops by width, group into batches, pad to max width,
|
||||
/// run single ONNX inference per batch. Matches PaddleOCR/RapidOCR batching strategy.
|
||||
fn get_text_lines_batched(
|
||||
&self,
|
||||
part_imgs: &[image::RgbImage],
|
||||
batch_size: u32,
|
||||
) -> Result<Vec<TextLine>, OcrError> {
|
||||
let session = self.session.as_ref().ok_or(OcrError::SessionNotInitialized)?;
|
||||
let batch_size = (batch_size as usize).max(1);
|
||||
|
||||
// Compute target widths and sort indices by aspect ratio (width/height)
|
||||
let mut indexed_widths: Vec<(usize, u32)> = part_imgs
|
||||
.iter()
|
||||
.enumerate()
|
||||
.map(|(i, img)| {
|
||||
let scale = CRNN_DST_HEIGHT as f32 / img.height().max(1) as f32;
|
||||
let dst_width = (img.width() as f32 * scale).ceil() as u32;
|
||||
(i, dst_width.max(1))
|
||||
})
|
||||
.collect();
|
||||
indexed_widths.sort_by_key(|&(_, w)| w);
|
||||
|
||||
let mut results: Vec<(usize, TextLine)> = Vec::with_capacity(part_imgs.len());
|
||||
|
||||
// Process in batches
|
||||
for chunk in indexed_widths.chunks(batch_size) {
|
||||
if chunk.len() == 1 {
|
||||
// Single image — use existing path (no padding overhead)
|
||||
let (orig_idx, _) = chunk[0];
|
||||
let text_line = self.get_text_line(&part_imgs[orig_idx])?;
|
||||
results.push((orig_idx, text_line));
|
||||
continue;
|
||||
}
|
||||
|
||||
// Find max width in this batch
|
||||
let max_width = chunk.iter().map(|&(_, w)| w).max().unwrap_or(1);
|
||||
|
||||
// Build batch tensor [N, 3, 48, max_width] with zero-padding
|
||||
let n = chunk.len();
|
||||
let mut batch_data = Array4::<f32>::zeros((n, 3, CRNN_DST_HEIGHT as usize, max_width as usize));
|
||||
|
||||
for (batch_idx, &(orig_idx, dst_width)) in chunk.iter().enumerate() {
|
||||
let img = &part_imgs[orig_idx];
|
||||
let resized =
|
||||
image::imageops::resize(img, dst_width, CRNN_DST_HEIGHT, image::imageops::FilterType::Triangle);
|
||||
|
||||
// Normalize and fill into batch tensor (zero-padded on right).
|
||||
// Use raw slice access instead of per-pixel get_pixel() to
|
||||
// eliminate millions of bounds checks in the hot loop.
|
||||
let cols = resized.width() as usize;
|
||||
let rows = resized.height() as usize;
|
||||
let raw = resized.as_raw();
|
||||
assert_eq!(raw.len(), rows * cols * 3, "unexpected image buffer size");
|
||||
let adjusted = [
|
||||
CRNN_MEAN_VALUES[0] * CRNN_NORM_VALUES[0],
|
||||
CRNN_MEAN_VALUES[1] * CRNN_NORM_VALUES[1],
|
||||
CRNN_MEAN_VALUES[2] * CRNN_NORM_VALUES[2],
|
||||
];
|
||||
for r in 0..rows {
|
||||
for c in 0..cols {
|
||||
let base = r * cols * 3 + c * 3;
|
||||
for ch in 0..3 {
|
||||
batch_data[[batch_idx, ch, r, c]] =
|
||||
raw[base + ch] as f32 * CRNN_NORM_VALUES[ch] - adjusted[ch];
|
||||
}
|
||||
}
|
||||
}
|
||||
// Remaining columns stay zero (padding)
|
||||
}
|
||||
|
||||
let input_tensor = Tensor::from_array(batch_data)?;
|
||||
|
||||
// SAFETY: ONNX Runtime C API is thread-safe for concurrent inference.
|
||||
#[allow(unsafe_code)]
|
||||
let outputs = unsafe {
|
||||
let session_ptr = session as *const Session as *mut Session;
|
||||
(*session_ptr).run(inputs![self.input_names[0].as_str() => input_tensor])?
|
||||
};
|
||||
|
||||
let (_, output_value) = outputs.iter().next().ok_or_else(|| {
|
||||
OcrError::Io(std::io::Error::new(
|
||||
std::io::ErrorKind::InvalidData,
|
||||
"No output tensors found in batched CRNN session output",
|
||||
))
|
||||
})?;
|
||||
|
||||
let (shape, flat_data) = output_value.try_extract_tensor::<f32>()?;
|
||||
// Shape: [batch, timesteps, num_classes]
|
||||
let batch_dim = *shape.first().unwrap_or(&1) as usize;
|
||||
let timesteps = *shape.get(1).unwrap_or(&0) as usize;
|
||||
let num_classes = *shape.get(2).unwrap_or(&0) as usize;
|
||||
|
||||
for (batch_idx, item) in chunk.iter().enumerate().take(batch_dim.min(n)) {
|
||||
let offset = batch_idx * timesteps * num_classes;
|
||||
let slice = &flat_data[offset..offset + timesteps * num_classes];
|
||||
let text_line = Self::score_to_text_line(slice, timesteps, num_classes, &self.keys)?;
|
||||
results.push((item.0, text_line));
|
||||
}
|
||||
}
|
||||
|
||||
// Reorder results back to original index order
|
||||
results.sort_by_key(|&(idx, _)| idx);
|
||||
Ok(results.into_iter().map(|(_, tl)| tl).collect())
|
||||
}
|
||||
|
||||
fn get_text_line(&self, img_src: &image::RgbImage) -> Result<TextLine, OcrError> {
|
||||
let Some(session) = &self.session else {
|
||||
return Err(OcrError::SessionNotInitialized);
|
||||
};
|
||||
|
||||
let scale = CRNN_DST_HEIGHT as f32 / img_src.height() as f32;
|
||||
let dst_width = (img_src.width() as f32 * scale).ceil() as u32;
|
||||
|
||||
let src_resize = image::imageops::resize(
|
||||
img_src,
|
||||
dst_width,
|
||||
CRNN_DST_HEIGHT,
|
||||
image::imageops::FilterType::Triangle,
|
||||
);
|
||||
|
||||
let input_tensors = OcrUtils::substract_mean_normalize(&src_resize, &CRNN_MEAN_VALUES, &CRNN_NORM_VALUES);
|
||||
|
||||
let input_tensors = Tensor::from_array(input_tensors)?;
|
||||
|
||||
// SAFETY: ONNX Runtime C API is thread-safe for concurrent inference.
|
||||
#[allow(unsafe_code)]
|
||||
let outputs = unsafe {
|
||||
let session_ptr = session as *const Session as *mut Session;
|
||||
(*session_ptr).run(inputs![self.input_names[0].as_str() => input_tensors])?
|
||||
};
|
||||
|
||||
let (_, red_data) = outputs.iter().next().ok_or_else(|| {
|
||||
OcrError::Io(std::io::Error::new(
|
||||
std::io::ErrorKind::InvalidData,
|
||||
"No output tensors found in CRNN session output",
|
||||
))
|
||||
})?;
|
||||
|
||||
let (shape, src_data) = red_data.try_extract_tensor::<f32>()?;
|
||||
let dimensions = shape;
|
||||
let height = *dimensions.get(1).ok_or_else(|| {
|
||||
OcrError::Io(std::io::Error::new(
|
||||
std::io::ErrorKind::InvalidData,
|
||||
"CRNN output tensor missing height dimension (index 1)",
|
||||
))
|
||||
})? as usize;
|
||||
let width = *dimensions.get(2).ok_or_else(|| {
|
||||
OcrError::Io(std::io::Error::new(
|
||||
std::io::ErrorKind::InvalidData,
|
||||
"CRNN output tensor missing width dimension (index 2)",
|
||||
))
|
||||
})? as usize;
|
||||
let src_data: Vec<f32> = src_data.to_vec();
|
||||
|
||||
Self::score_to_text_line(&src_data, height, width, &self.keys)
|
||||
}
|
||||
|
||||
fn score_to_text_line(
|
||||
output_data: &[f32],
|
||||
height: usize,
|
||||
width: usize,
|
||||
keys: &[String],
|
||||
) -> Result<TextLine, OcrError> {
|
||||
let mut text_line = TextLine::default();
|
||||
let mut last_index = 0;
|
||||
|
||||
let mut text_score_sum = 0.0;
|
||||
let mut text_score_count = 0;
|
||||
for i in 0..height {
|
||||
let start = i * width;
|
||||
let stop = (i + 1) * width;
|
||||
let slice = &output_data[start..stop.min(output_data.len())];
|
||||
|
||||
let (max_index, max_value) =
|
||||
slice
|
||||
.iter()
|
||||
.enumerate()
|
||||
.fold((0, f32::MIN), |(max_idx, max_val), (idx, &val)| {
|
||||
if val > max_val { (idx, val) } else { (max_idx, max_val) }
|
||||
});
|
||||
|
||||
if max_index > 0 && max_index < keys.len() && !(i > 0 && max_index == last_index) {
|
||||
text_line.text.push_str(&keys[max_index]);
|
||||
text_score_sum += max_value;
|
||||
text_score_count += 1;
|
||||
}
|
||||
last_index = max_index;
|
||||
}
|
||||
|
||||
// Avoid division by zero: handle case where no characters were found
|
||||
text_line.text_score = if text_score_count > 0 {
|
||||
text_score_sum / text_score_count as f32
|
||||
} else {
|
||||
0.0
|
||||
};
|
||||
Ok(text_line)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Index 0 is the CTC blank and must never be emitted.
    #[test]
    fn test_score_to_text_line_skips_blank_index() {
        // keys[0] = "#" (CTC blank), keys[1] = "a", keys[2] = "b"
        let keys = vec!["#".to_string(), "a".to_string(), "b".to_string()];
        // 3 timesteps, 3 classes each. Simulate: blank, "a", "b"
        let output = vec![
            1.0, 0.0, 0.0, // timestep 0: max at index 0 (blank) -> skip
            0.0, 0.9, 0.1, // timestep 1: max at index 1 ("a")
            0.0, 0.1, 0.8, // timestep 2: max at index 2 ("b")
        ];
        let result = CrnnNet::score_to_text_line(&output, 3, 3, &keys).unwrap();
        assert_eq!(result.text, "ab");
    }

    /// Consecutive timesteps with the same winning index collapse to one character.
    #[test]
    fn test_score_to_text_line_deduplicates_consecutive() {
        let keys = vec!["#".to_string(), "h".to_string(), "i".to_string()];
        // 4 timesteps: "h", "h", "i", "i" -> should deduplicate to "hi"
        let output = vec![
            0.0, 0.9, 0.0, // "h"
            0.0, 0.8, 0.0, // "h" again (same index, skip)
            0.0, 0.0, 0.9, // "i"
            0.0, 0.0, 0.8, // "i" again (same index, skip)
        ];
        let result = CrnnNet::score_to_text_line(&output, 4, 3, &keys).unwrap();
        assert_eq!(result.text, "hi");
    }

    /// The dictionary file is loaded verbatim: no tokens are prepended or appended.
    #[test]
    fn test_read_keys_from_file_preserves_dict_layout() {
        // Include the process id so concurrently running test processes do not
        // create and delete the same directory underneath each other.
        let dir = std::env::temp_dir().join(format!("kreuzberg_test_dict_{}", std::process::id()));
        std::fs::create_dir_all(&dir).unwrap();
        let dict_path = dir.join("test_dict.txt");
        // PP-OCRv5 dict files already include "#" (blank) at start and " " at end.
        std::fs::write(&dict_path, "#\na\nb\nc\n ").unwrap();

        let mut net = CrnnNet::new();
        let load_result = net.read_keys_from_file(dict_path.to_str().unwrap());

        // Clean up before asserting so a failing assertion does not leak the directory.
        std::fs::remove_dir_all(&dir).ok();
        load_result.unwrap();

        // Dict is loaded as-is: ["#", "a", "b", "c", " "]
        assert_eq!(net.keys.len(), 5);
        assert_eq!(net.keys[0], "#");
        assert_eq!(net.keys[1], "a");
        assert_eq!(net.keys[2], "b");
        assert_eq!(net.keys[3], "c");
        assert_eq!(net.keys[net.keys.len() - 1], " ");
    }
}
|
||||
421
crates/kreuzberg-paddle-ocr/src/db_net.rs
Normal file
421
crates/kreuzberg-paddle-ocr/src/db_net.rs
Normal file
@@ -0,0 +1,421 @@
|
||||
use crate::{
|
||||
base_net::BaseNet,
|
||||
constants::{IMAGENET_MEAN_VALUES, IMAGENET_NORM_VALUES},
|
||||
ocr_error::OcrError,
|
||||
ocr_result::{self, TextBox},
|
||||
ocr_utils::OcrUtils,
|
||||
scale_param::ScaleParam,
|
||||
};
|
||||
use geo_clipper::{Clipper, EndType, JoinType};
|
||||
use geo_types::{Coord, LineString, Polygon};
|
||||
use ort::{inputs, session::SessionOutputs};
|
||||
use ort::{session::Session, value::Tensor};
|
||||
use std::cmp::Ordering;
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct DbNet {
|
||||
session: Option<Session>,
|
||||
input_names: Vec<String>,
|
||||
}
|
||||
|
||||
impl BaseNet for DbNet {
|
||||
fn new() -> Self {
|
||||
Self {
|
||||
session: None,
|
||||
input_names: Vec::new(),
|
||||
}
|
||||
}
|
||||
|
||||
fn set_input_names(&mut self, input_names: Vec<String>) {
|
||||
self.input_names = input_names;
|
||||
}
|
||||
|
||||
fn set_session(&mut self, session: Option<Session>) {
|
||||
self.session = session;
|
||||
}
|
||||
}
|
||||
|
||||
impl DbNet {
|
||||
pub fn get_text_boxes(
|
||||
&self,
|
||||
img_src: &image::RgbImage,
|
||||
scale: &ScaleParam,
|
||||
box_score_thresh: f32,
|
||||
box_thresh: f32,
|
||||
un_clip_ratio: f32,
|
||||
thresh: f32,
|
||||
) -> Result<Vec<TextBox>, OcrError> {
|
||||
let Some(session) = &self.session else {
|
||||
return Err(OcrError::SessionNotInitialized);
|
||||
};
|
||||
|
||||
let src_resize = image::imageops::resize(
|
||||
img_src,
|
||||
scale.dst_width,
|
||||
scale.dst_height,
|
||||
image::imageops::FilterType::Triangle,
|
||||
);
|
||||
|
||||
let input_tensors =
|
||||
OcrUtils::substract_mean_normalize(&src_resize, &IMAGENET_MEAN_VALUES, &IMAGENET_NORM_VALUES);
|
||||
|
||||
let tensor = Tensor::from_array(input_tensors)?;
|
||||
|
||||
// SAFETY: ONNX Runtime's C API (OrtRun) is thread-safe for concurrent inference
|
||||
// on the same session. The ort crate's `&mut self` requirement is overly
|
||||
// conservative. This matches the pattern used in kreuzberg's embedding engine.
|
||||
#[allow(unsafe_code)]
|
||||
let outputs = unsafe {
|
||||
let session_ptr = session as *const Session as *mut Session;
|
||||
(*session_ptr).run(inputs![self.input_names[0].as_str() => tensor])?
|
||||
};
|
||||
|
||||
let text_boxes = Self::get_text_boxes_core(
|
||||
&outputs,
|
||||
src_resize.height(),
|
||||
src_resize.width(),
|
||||
&ScaleParam::new(
|
||||
scale.src_width,
|
||||
scale.src_height,
|
||||
scale.dst_width,
|
||||
scale.dst_height,
|
||||
scale.scale_width,
|
||||
scale.scale_height,
|
||||
),
|
||||
box_score_thresh,
|
||||
box_thresh,
|
||||
un_clip_ratio,
|
||||
thresh,
|
||||
)?;
|
||||
|
||||
Ok(text_boxes)
|
||||
}
|
||||
|
||||
fn get_text_boxes_core(
|
||||
output_tensor: &SessionOutputs,
|
||||
rows: u32,
|
||||
cols: u32,
|
||||
s: &ScaleParam,
|
||||
box_score_thresh: f32,
|
||||
_box_thresh: f32,
|
||||
un_clip_ratio: f32,
|
||||
thresh: f32,
|
||||
) -> Result<Vec<TextBox>, OcrError> {
|
||||
let max_side_thresh = 3.0;
|
||||
|
||||
let (_, red_data) = output_tensor.iter().next().ok_or_else(|| {
|
||||
OcrError::Io(std::io::Error::new(
|
||||
std::io::ErrorKind::InvalidData,
|
||||
"No output tensors found in session output",
|
||||
))
|
||||
})?;
|
||||
|
||||
let pred_data: Vec<f32> = red_data.try_extract_tensor::<f32>()?.1.to_vec();
|
||||
|
||||
let cbuf_data: Vec<u8> = pred_data.iter().map(|pixel| (pixel * 255.0) as u8).collect();
|
||||
|
||||
let pred_img: image::ImageBuffer<image::Luma<f32>, Vec<f32>> =
|
||||
image::ImageBuffer::from_vec(cols, rows, pred_data).ok_or_else(|| {
|
||||
OcrError::Io(std::io::Error::new(
|
||||
std::io::ErrorKind::InvalidData,
|
||||
format!(
|
||||
"Failed to create image buffer from predictions: {} x {} dimensions may be invalid",
|
||||
cols, rows
|
||||
),
|
||||
))
|
||||
})?;
|
||||
|
||||
let cbuf_img = image::GrayImage::from_vec(cols, rows, cbuf_data).ok_or_else(|| {
|
||||
OcrError::Io(std::io::Error::new(
|
||||
std::io::ErrorKind::InvalidData,
|
||||
format!(
|
||||
"Failed to create grayscale image buffer: {} x {} dimensions may be invalid",
|
||||
cols, rows
|
||||
),
|
||||
))
|
||||
})?;
|
||||
|
||||
let threshold_img = imageproc::contrast::threshold(
|
||||
&cbuf_img,
|
||||
(thresh * 255.0) as u8,
|
||||
imageproc::contrast::ThresholdType::Binary,
|
||||
);
|
||||
|
||||
// RapidOCR and PaddleOCR reference do NOT apply dilation before contour extraction.
|
||||
// Dilation merges adjacent text regions, causing word concatenation.
|
||||
let img_contours: Vec<imageproc::contours::Contour<i32>> = imageproc::contours::find_contours(&threshold_img);
|
||||
|
||||
// Pre-allocate based on contour count to avoid repeated reallocations.
|
||||
let mut rs_boxes = Vec::with_capacity(img_contours.len());
|
||||
|
||||
for contour in img_contours {
|
||||
if contour.points.len() <= 2 {
|
||||
continue;
|
||||
}
|
||||
|
||||
let mut max_side = 0.0;
|
||||
let min_box = Self::get_mini_box(&contour.points, &mut max_side)?;
|
||||
if max_side < max_side_thresh {
|
||||
continue;
|
||||
}
|
||||
|
||||
let score = Self::get_score(&contour, &pred_img)?;
|
||||
if score < box_score_thresh {
|
||||
continue;
|
||||
}
|
||||
|
||||
let clip_box = Self::unclip(&min_box, un_clip_ratio)?;
|
||||
if clip_box.is_empty() {
|
||||
continue;
|
||||
}
|
||||
|
||||
let mut clip_contour = Vec::new();
|
||||
for point in &clip_box {
|
||||
clip_contour.push(*point);
|
||||
}
|
||||
|
||||
let mut max_side_clip = 0.0;
|
||||
let clip_min_box = Self::get_mini_box(&clip_contour, &mut max_side_clip)?;
|
||||
if max_side_clip < max_side_thresh + 2.0 {
|
||||
continue;
|
||||
}
|
||||
|
||||
let mut final_points = Vec::new();
|
||||
for item in clip_min_box {
|
||||
let x = (item.x / s.scale_width) as u32;
|
||||
let ptx = x.min(s.src_width);
|
||||
|
||||
let y = (item.y / s.scale_height) as u32;
|
||||
let pty = y.min(s.src_height);
|
||||
|
||||
final_points.push(ocr_result::Point { x: ptx, y: pty });
|
||||
}
|
||||
|
||||
let text_box = TextBox {
|
||||
score,
|
||||
points: final_points,
|
||||
};
|
||||
|
||||
rs_boxes.push(text_box);
|
||||
}
|
||||
|
||||
Ok(rs_boxes)
|
||||
}
|
||||
|
||||
fn get_mini_box(
|
||||
contour_points: &[imageproc::point::Point<i32>],
|
||||
min_edge_size: &mut f32,
|
||||
) -> Result<Vec<imageproc::point::Point<f32>>, OcrError> {
|
||||
let rect = imageproc::geometry::min_area_rect(contour_points);
|
||||
|
||||
let mut rect_points: Vec<imageproc::point::Point<f32>> = rect
|
||||
.iter()
|
||||
.map(|p| imageproc::point::Point::new(p.x as f32, p.y as f32))
|
||||
.collect();
|
||||
|
||||
// Direct multiplication instead of .powi(2) — avoids function call overhead.
|
||||
let dx_w = rect_points[0].x - rect_points[1].x;
|
||||
let dy_w = rect_points[0].y - rect_points[1].y;
|
||||
let width = (dx_w * dx_w + dy_w * dy_w).sqrt();
|
||||
let dx_h = rect_points[1].x - rect_points[2].x;
|
||||
let dy_h = rect_points[1].y - rect_points[2].y;
|
||||
let height = (dx_h * dx_h + dy_h * dy_h).sqrt();
|
||||
|
||||
*min_edge_size = width.min(height);
|
||||
|
||||
rect_points.sort_by(|a, b| {
|
||||
if a.x > b.x {
|
||||
return Ordering::Greater;
|
||||
}
|
||||
if a.x == b.x {
|
||||
return Ordering::Equal;
|
||||
}
|
||||
Ordering::Less
|
||||
});
|
||||
|
||||
let mut box_points = Vec::new();
|
||||
let index_1;
|
||||
let index_4;
|
||||
if rect_points[1].y > rect_points[0].y {
|
||||
index_1 = 0;
|
||||
index_4 = 1;
|
||||
} else {
|
||||
index_1 = 1;
|
||||
index_4 = 0;
|
||||
}
|
||||
|
||||
let index_2;
|
||||
let index_3;
|
||||
if rect_points[3].y > rect_points[2].y {
|
||||
index_2 = 2;
|
||||
index_3 = 3;
|
||||
} else {
|
||||
index_2 = 3;
|
||||
index_3 = 2;
|
||||
}
|
||||
|
||||
box_points.push(rect_points[index_1]);
|
||||
box_points.push(rect_points[index_2]);
|
||||
box_points.push(rect_points[index_3]);
|
||||
box_points.push(rect_points[index_4]);
|
||||
|
||||
Ok(box_points)
|
||||
}
|
||||
|
||||
fn get_score(
|
||||
contour: &imageproc::contours::Contour<i32>,
|
||||
f_map_mat: &image::ImageBuffer<image::Luma<f32>, Vec<f32>>,
|
||||
) -> Result<f32, OcrError> {
|
||||
// Initialize boundary values
|
||||
let mut xmin = i32::MAX;
|
||||
let mut xmax = i32::MIN;
|
||||
let mut ymin = i32::MAX;
|
||||
let mut ymax = i32::MIN;
|
||||
|
||||
// Find contour bounding box
|
||||
for point in contour.points.iter() {
|
||||
let x = point.x;
|
||||
let y = point.y;
|
||||
|
||||
if x < xmin {
|
||||
xmin = x;
|
||||
}
|
||||
if x > xmax {
|
||||
xmax = x;
|
||||
}
|
||||
if y < ymin {
|
||||
ymin = y;
|
||||
}
|
||||
if y > ymax {
|
||||
ymax = y;
|
||||
}
|
||||
}
|
||||
|
||||
let width = f_map_mat.width() as i32;
|
||||
let height = f_map_mat.height() as i32;
|
||||
|
||||
xmin = xmin.max(0).min(width - 1);
|
||||
xmax = xmax.max(0).min(width - 1);
|
||||
ymin = ymin.max(0).min(height - 1);
|
||||
ymax = ymax.max(0).min(height - 1);
|
||||
|
||||
let roi_width = xmax - xmin + 1;
|
||||
let roi_height = ymax - ymin + 1;
|
||||
|
||||
if roi_width <= 0 || roi_height <= 0 {
|
||||
return Ok(0.0);
|
||||
}
|
||||
|
||||
let mut mask = image::GrayImage::new(roi_width as u32, roi_height as u32);
|
||||
|
||||
let mut pts = Vec::<imageproc::point::Point<i32>>::new();
|
||||
for point in contour.points.iter() {
|
||||
pts.push(imageproc::point::Point::new(point.x - xmin, point.y - ymin));
|
||||
}
|
||||
|
||||
imageproc::drawing::draw_polygon_mut(&mut mask, pts.as_slice(), image::Luma([255]));
|
||||
|
||||
let cropped_img =
|
||||
image::imageops::crop_imm(f_map_mat, xmin as u32, ymin as u32, roi_width as u32, roi_height as u32)
|
||||
.to_image();
|
||||
|
||||
let mean = OcrUtils::calculate_mean_with_mask(&cropped_img, &mask);
|
||||
|
||||
Ok(mean)
|
||||
}
|
||||
|
||||
fn unclip(
|
||||
box_points: &[imageproc::point::Point<f32>],
|
||||
unclip_ratio: f32,
|
||||
) -> Result<Vec<imageproc::point::Point<i32>>, OcrError> {
|
||||
// Direct multiplication instead of .powi(2) — avoids function call overhead.
|
||||
let dx_w = box_points[0].x - box_points[1].x;
|
||||
let dy_w = box_points[0].y - box_points[1].y;
|
||||
let clip_rect_width = (dx_w * dx_w + dy_w * dy_w).sqrt();
|
||||
let dx_h = box_points[1].x - box_points[2].x;
|
||||
let dy_h = box_points[1].y - box_points[2].y;
|
||||
let clip_rect_height = (dx_h * dx_h + dy_h * dy_h).sqrt();
|
||||
|
||||
if clip_rect_height < 1.001 && clip_rect_width < 1.001 {
|
||||
return Ok(Vec::new());
|
||||
}
|
||||
|
||||
let mut the_cliper_pts = Vec::new();
|
||||
for pt in box_points {
|
||||
let a1 = Coord {
|
||||
x: pt.x as f64,
|
||||
y: pt.y as f64,
|
||||
};
|
||||
the_cliper_pts.push(a1);
|
||||
}
|
||||
|
||||
let area = Self::signed_polygon_area(box_points).abs();
|
||||
let length = Self::length_of_points(box_points);
|
||||
let distance = area * unclip_ratio / length as f32;
|
||||
|
||||
let co = Polygon::new(LineString::new(the_cliper_pts), vec![]);
|
||||
let solution = co
|
||||
.offset(distance as f64, JoinType::Round(2.0), EndType::ClosedPolygon, 1.0)
|
||||
.0;
|
||||
|
||||
if solution.is_empty() {
|
||||
return Ok(Vec::new());
|
||||
}
|
||||
|
||||
let first_polygon = solution.first().ok_or_else(|| {
|
||||
OcrError::Io(std::io::Error::new(
|
||||
std::io::ErrorKind::InvalidData,
|
||||
"Polygon solution list was empty after offset operation",
|
||||
))
|
||||
})?;
|
||||
|
||||
let ret_pts: Vec<_> = first_polygon
|
||||
.exterior()
|
||||
.points()
|
||||
.map(|ip| imageproc::point::Point::new(ip.x() as i32, ip.y() as i32))
|
||||
.collect();
|
||||
|
||||
Ok(ret_pts)
|
||||
}
|
||||
|
||||
fn signed_polygon_area(points: &[imageproc::point::Point<f32>]) -> f32 {
|
||||
let num_points = points.len();
|
||||
let mut pts = Vec::with_capacity(num_points + 1);
|
||||
pts.extend_from_slice(points);
|
||||
pts.push(points[0]);
|
||||
|
||||
let mut area = 0.0;
|
||||
for i in 0..num_points {
|
||||
area += (pts[i + 1].x - pts[i].x) * (pts[i + 1].y + pts[i].y) / 2.0;
|
||||
}
|
||||
|
||||
area
|
||||
}
|
||||
|
||||
fn length_of_points(box_points: &[imageproc::point::Point<f32>]) -> f64 {
|
||||
if box_points.is_empty() {
|
||||
return 0.0;
|
||||
}
|
||||
|
||||
let mut length = 0.0;
|
||||
let mut x0 = box_points[0].x as f64;
|
||||
let mut y0 = box_points[0].y as f64;
|
||||
|
||||
for pt in &box_points[1..] {
|
||||
let x1 = pt.x as f64;
|
||||
let y1 = pt.y as f64;
|
||||
let dx = x1 - x0;
|
||||
let dy = y1 - y0;
|
||||
length += (dx * dx + dy * dy).sqrt();
|
||||
x0 = x1;
|
||||
y0 = y1;
|
||||
}
|
||||
|
||||
// Closing segment back to first point
|
||||
let dx = box_points[0].x as f64 - x0;
|
||||
let dy = box_points[0].y as f64 - y0;
|
||||
length += (dx * dx + dy * dy).sqrt();
|
||||
|
||||
length
|
||||
}
|
||||
}
|
||||
32
crates/kreuzberg-paddle-ocr/src/lib.rs
Normal file
32
crates/kreuzberg-paddle-ocr/src/lib.rs
Normal file
@@ -0,0 +1,32 @@
|
||||
//! # kreuzberg-paddle-ocr
|
||||
//!
|
||||
//! PaddleOCR via ONNX Runtime for Kreuzberg - high-performance text detection and recognition.
|
||||
//!
|
||||
//! This crate is vendored from [paddle-ocr-rs](https://github.com/mg-chao/paddle-ocr-rs)
|
||||
//! by mg-chao, with modifications for Kreuzberg integration.
|
||||
//!
|
||||
//! ## ONNX Runtime Requirement
|
||||
//!
|
||||
//! Requires **ONNX Runtime 1.24+** at runtime.
|
||||
//!
|
||||
//! ## Original License
|
||||
//!
|
||||
//! The original paddle-ocr-rs is licensed under Apache-2.0.
|
||||
//! This vendored version is relicensed to MIT with the original author's copyright retained.
|
||||
|
||||
#![allow(clippy::too_many_arguments)]
|
||||
|
||||
pub mod angle_net;
|
||||
pub mod base_net;
|
||||
pub(crate) mod constants;
|
||||
pub mod crnn_net;
|
||||
pub mod db_net;
|
||||
pub mod ocr_error;
|
||||
pub mod ocr_lite;
|
||||
pub mod ocr_result;
|
||||
pub mod ocr_utils;
|
||||
pub mod scale_param;
|
||||
|
||||
pub use ocr_error::OcrError;
|
||||
pub use ocr_lite::OcrLite;
|
||||
pub use ocr_result::{Angle, OcrResult, Point, TextBlock, TextBox, TextLine};
|
||||
13
crates/kreuzberg-paddle-ocr/src/ocr_error.rs
Normal file
13
crates/kreuzberg-paddle-ocr/src/ocr_error.rs
Normal file
@@ -0,0 +1,13 @@
|
||||
use thiserror::Error;
|
||||
|
||||
#[derive(Error, Debug)]
|
||||
pub enum OcrError {
|
||||
#[error("Ort error: {0}")]
|
||||
Ort(#[from] ort::Error),
|
||||
#[error("IO error: {0}")]
|
||||
Io(#[from] std::io::Error),
|
||||
#[error("Image error: {0}")]
|
||||
ImageError(#[from] image::ImageError),
|
||||
#[error("Session not initialized")]
|
||||
SessionNotInitialized,
|
||||
}
|
||||
447
crates/kreuzberg-paddle-ocr/src/ocr_lite.rs
Normal file
447
crates/kreuzberg-paddle-ocr/src/ocr_lite.rs
Normal file
@@ -0,0 +1,447 @@
|
||||
use std::collections::HashMap;
|
||||
|
||||
use image::ImageBuffer;
|
||||
use ort::session::builder::SessionBuilder;
|
||||
|
||||
use crate::{
|
||||
angle_net::AngleNet,
|
||||
base_net::BaseNet,
|
||||
crnn_net::CrnnNet,
|
||||
db_net::DbNet,
|
||||
ocr_error::OcrError,
|
||||
ocr_result::{OcrResult, Point, TextBlock, TextBox},
|
||||
ocr_utils::OcrUtils,
|
||||
scale_param::ScaleParam,
|
||||
};
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct OcrLite {
|
||||
db_net: DbNet,
|
||||
angle_net: AngleNet,
|
||||
crnn_net: CrnnNet,
|
||||
}
|
||||
|
||||
// SAFETY: OcrLite inference methods (&self) use unsafe pointer casts to call
|
||||
// ort Session::run, which is thread-safe at the ONNX Runtime C API level.
|
||||
// After initialization (&mut self), no mutable state is accessed during inference.
|
||||
unsafe impl Send for OcrLite {}
|
||||
unsafe impl Sync for OcrLite {}
|
||||
|
||||
impl Default for OcrLite {
|
||||
fn default() -> Self {
|
||||
Self::new()
|
||||
}
|
||||
}
|
||||
|
||||
impl OcrLite {
|
||||
pub fn new() -> Self {
|
||||
Self {
|
||||
db_net: DbNet::new(),
|
||||
angle_net: AngleNet::new(),
|
||||
crnn_net: CrnnNet::new(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn init_models(
|
||||
&mut self,
|
||||
det_path: &str,
|
||||
cls_path: &str,
|
||||
rec_path: &str,
|
||||
num_thread: usize,
|
||||
) -> Result<(), OcrError> {
|
||||
self.db_net.init_model(det_path, num_thread, None)?;
|
||||
self.angle_net.init_model(cls_path, num_thread, None)?;
|
||||
self.crnn_net.init_model(rec_path, num_thread, None)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn init_models_with_dict(
|
||||
&mut self,
|
||||
det_path: &str,
|
||||
cls_path: &str,
|
||||
rec_path: &str,
|
||||
dict_path: &str,
|
||||
num_thread: usize,
|
||||
) -> Result<(), OcrError> {
|
||||
self.db_net.init_model(det_path, num_thread, None)?;
|
||||
self.angle_net.init_model(cls_path, num_thread, None)?;
|
||||
self.crnn_net
|
||||
.init_model_dict_file(rec_path, num_thread, None, dict_path)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn init_models_custom(
|
||||
&mut self,
|
||||
det_path: &str,
|
||||
cls_path: &str,
|
||||
rec_path: &str,
|
||||
builder_fn: fn(SessionBuilder) -> Result<SessionBuilder, ort::Error>,
|
||||
) -> Result<(), OcrError> {
|
||||
self.db_net.init_model(det_path, 0, Some(builder_fn))?;
|
||||
self.angle_net.init_model(cls_path, 0, Some(builder_fn))?;
|
||||
self.crnn_net.init_model(rec_path, 0, Some(builder_fn))?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Initialize models with dictionary file and custom session builder.
|
||||
///
|
||||
/// Combines `init_models_with_dict` and `init_models_custom`: loads the
|
||||
/// dictionary for the recognition model while applying a custom ORT
|
||||
/// session builder (e.g. for GPU execution providers).
|
||||
pub fn init_models_with_dict_custom(
|
||||
&mut self,
|
||||
det_path: &str,
|
||||
cls_path: &str,
|
||||
rec_path: &str,
|
||||
dict_path: &str,
|
||||
num_thread: usize,
|
||||
builder_fn: Option<fn(SessionBuilder) -> Result<SessionBuilder, ort::Error>>,
|
||||
) -> Result<(), OcrError> {
|
||||
self.db_net.init_model(det_path, num_thread, builder_fn)?;
|
||||
self.angle_net.init_model(cls_path, num_thread, builder_fn)?;
|
||||
self.crnn_net
|
||||
.init_model_dict_file(rec_path, num_thread, builder_fn, dict_path)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn init_models_from_memory(
|
||||
&mut self,
|
||||
det_bytes: &[u8],
|
||||
cls_bytes: &[u8],
|
||||
rec_bytes: &[u8],
|
||||
num_thread: usize,
|
||||
) -> Result<(), OcrError> {
|
||||
self.db_net.init_model_from_memory(det_bytes, num_thread, None)?;
|
||||
self.angle_net.init_model_from_memory(cls_bytes, num_thread, None)?;
|
||||
self.crnn_net.init_model_from_memory(rec_bytes, num_thread, None)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn init_models_from_memory_custom(
|
||||
&mut self,
|
||||
det_bytes: &[u8],
|
||||
cls_bytes: &[u8],
|
||||
rec_bytes: &[u8],
|
||||
builder_fn: fn(SessionBuilder) -> Result<SessionBuilder, ort::Error>,
|
||||
) -> Result<(), OcrError> {
|
||||
self.db_net.init_model_from_memory(det_bytes, 0, Some(builder_fn))?;
|
||||
self.angle_net.init_model_from_memory(cls_bytes, 0, Some(builder_fn))?;
|
||||
self.crnn_net.init_model_from_memory(rec_bytes, 0, Some(builder_fn))?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn detect_base(
|
||||
&self,
|
||||
img_src: &image::RgbImage,
|
||||
padding: u32,
|
||||
max_side_len: u32,
|
||||
box_score_thresh: f32,
|
||||
box_thresh: f32,
|
||||
un_clip_ratio: f32,
|
||||
do_angle: bool,
|
||||
most_angle: bool,
|
||||
angle_rollback: bool,
|
||||
angle_rollback_threshold: f32,
|
||||
cls_thresh: f32,
|
||||
thresh: f32,
|
||||
) -> Result<OcrResult, OcrError> {
|
||||
let origin_max_side = img_src.width().max(img_src.height());
|
||||
let mut resize;
|
||||
if max_side_len == 0 || max_side_len > origin_max_side {
|
||||
resize = origin_max_side;
|
||||
} else {
|
||||
resize = max_side_len;
|
||||
}
|
||||
resize += 2 * padding;
|
||||
|
||||
// Cow avoids cloning the image when padding=0 (the common case).
|
||||
let padding_src = OcrUtils::make_padding(img_src, padding)?;
|
||||
|
||||
let scale = ScaleParam::get_scale_param(&padding_src, resize);
|
||||
|
||||
self.detect_once(
|
||||
&padding_src,
|
||||
&scale,
|
||||
padding,
|
||||
box_score_thresh,
|
||||
box_thresh,
|
||||
un_clip_ratio,
|
||||
do_angle,
|
||||
most_angle,
|
||||
angle_rollback,
|
||||
angle_rollback_threshold,
|
||||
cls_thresh,
|
||||
thresh,
|
||||
)
|
||||
}
|
||||
|
||||
/// Detect text in image
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// - `img_src` - Input image
|
||||
/// - `padding` - Padding width added during image transformation (improves detection)
|
||||
/// - `max_side_len` - Maximum side length after transformation (larger images will be scaled down)
|
||||
/// - `box_score_thresh` - Score threshold for text region detection
|
||||
/// - `box_thresh` - Box threshold
|
||||
/// - `un_clip_ratio` - Unclip ratio
|
||||
/// - `do_angle` - Whether to perform angle detection
|
||||
/// - `most_angle` - Use most common angle for all text regions
|
||||
const DEFAULT_CLS_THRESH: f32 = 0.9;
|
||||
const DEFAULT_THRESH: f32 = 0.3;
|
||||
const DEFAULT_REC_BATCH_SIZE: u32 = 6;
|
||||
|
||||
pub fn detect(
|
||||
&self,
|
||||
img_src: &image::RgbImage,
|
||||
padding: u32,
|
||||
max_side_len: u32,
|
||||
box_score_thresh: f32,
|
||||
box_thresh: f32,
|
||||
un_clip_ratio: f32,
|
||||
do_angle: bool,
|
||||
most_angle: bool,
|
||||
) -> Result<OcrResult, OcrError> {
|
||||
self.detect_base(
|
||||
img_src,
|
||||
padding,
|
||||
max_side_len,
|
||||
box_score_thresh,
|
||||
box_thresh,
|
||||
un_clip_ratio,
|
||||
do_angle,
|
||||
most_angle,
|
||||
false,
|
||||
0.0,
|
||||
Self::DEFAULT_CLS_THRESH,
|
||||
Self::DEFAULT_THRESH,
|
||||
)
|
||||
}
|
||||
|
||||
/// Detect text with angle rollback support
|
||||
///
|
||||
/// When `do_angle` is true, if the image was angle-corrected but recognition
|
||||
/// result is poor, the angle correction will be reverted.
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// - `img_src` - Input image
|
||||
/// - `padding` - Padding width added during image transformation
|
||||
/// - `max_side_len` - Maximum side length after transformation
|
||||
/// - `box_score_thresh` - Score threshold for text region detection
|
||||
/// - `box_thresh` - Box threshold
|
||||
/// - `un_clip_ratio` - Unclip ratio
|
||||
/// - `do_angle` - Whether to perform angle detection
|
||||
/// - `most_angle` - Use most common angle
|
||||
/// - `angle_rollback_threshold` - If text score is below this value (or NaN), angle correction is reverted
|
||||
pub fn detect_angle_rollback(
|
||||
&self,
|
||||
img_src: &image::RgbImage,
|
||||
padding: u32,
|
||||
max_side_len: u32,
|
||||
box_score_thresh: f32,
|
||||
box_thresh: f32,
|
||||
un_clip_ratio: f32,
|
||||
do_angle: bool,
|
||||
most_angle: bool,
|
||||
angle_rollback_threshold: f32,
|
||||
) -> Result<OcrResult, OcrError> {
|
||||
self.detect_base(
|
||||
img_src,
|
||||
padding,
|
||||
max_side_len,
|
||||
box_score_thresh,
|
||||
box_thresh,
|
||||
un_clip_ratio,
|
||||
do_angle,
|
||||
most_angle,
|
||||
true,
|
||||
angle_rollback_threshold,
|
||||
Self::DEFAULT_CLS_THRESH,
|
||||
Self::DEFAULT_THRESH,
|
||||
)
|
||||
}
|
||||
|
||||
pub fn detect_from_path(
|
||||
&self,
|
||||
img_path: &str,
|
||||
padding: u32,
|
||||
max_side_len: u32,
|
||||
box_score_thresh: f32,
|
||||
box_thresh: f32,
|
||||
un_clip_ratio: f32,
|
||||
do_angle: bool,
|
||||
most_angle: bool,
|
||||
) -> Result<OcrResult, OcrError> {
|
||||
let img_src = image::open(img_path)?.to_rgb8();
|
||||
|
||||
self.detect(
|
||||
&img_src,
|
||||
padding,
|
||||
max_side_len,
|
||||
box_score_thresh,
|
||||
box_thresh,
|
||||
un_clip_ratio,
|
||||
do_angle,
|
||||
most_angle,
|
||||
)
|
||||
}
|
||||
|
||||
/// Sort text boxes in reading order: top-to-bottom, left-to-right.
|
||||
///
|
||||
/// Sorts by top-left Y coordinate first, then by top-left X coordinate within
|
||||
/// the same Y. Matches PaddleOCR Python's `sorted_boxes` primary ordering.
|
||||
fn sort_text_boxes(text_boxes: &mut [TextBox]) {
|
||||
text_boxes.sort_by(|a, b| {
|
||||
let ay = a.points.first().map_or(0, |p| p.y);
|
||||
let ax = a.points.first().map_or(0, |p| p.x);
|
||||
let by = b.points.first().map_or(0, |p| p.y);
|
||||
let bx = b.points.first().map_or(0, |p| p.x);
|
||||
(ay, ax).cmp(&(by, bx))
|
||||
});
|
||||
}
|
||||
|
||||
fn detect_once(
|
||||
&self,
|
||||
img_src: &image::RgbImage,
|
||||
scale: &ScaleParam,
|
||||
padding: u32,
|
||||
box_score_thresh: f32,
|
||||
box_thresh: f32,
|
||||
un_clip_ratio: f32,
|
||||
do_angle: bool,
|
||||
most_angle: bool,
|
||||
angle_rollback: bool,
|
||||
angle_rollback_threshold: f32,
|
||||
cls_thresh: f32,
|
||||
thresh: f32,
|
||||
) -> Result<OcrResult, OcrError> {
|
||||
let mut text_boxes =
|
||||
self.db_net
|
||||
.get_text_boxes(img_src, scale, box_score_thresh, box_thresh, un_clip_ratio, thresh)?;
|
||||
|
||||
// Sort boxes in reading order (top-to-bottom, left-to-right)
|
||||
Self::sort_text_boxes(&mut text_boxes);
|
||||
|
||||
let part_images = OcrUtils::get_part_images(img_src, &text_boxes);
|
||||
|
||||
let angles = self
|
||||
.angle_net
|
||||
.get_angles(&part_images, do_angle, most_angle, cls_thresh)?;
|
||||
|
||||
let mut rotated_images: Vec<image::RgbImage> = Vec::with_capacity(part_images.len());
|
||||
|
||||
// Angle correction rollback
|
||||
let mut angle_rollback_records = HashMap::<usize, ImageBuffer<image::Rgb<u8>, Vec<u8>>>::new();
|
||||
|
||||
for (index, (angle, mut part_image)) in angles.iter().zip(part_images).enumerate() {
|
||||
if angle.index == 1 {
|
||||
if angle_rollback {
|
||||
// Keep original copy
|
||||
angle_rollback_records.insert(index, part_image.clone());
|
||||
}
|
||||
|
||||
OcrUtils::mat_rotate_clock_wise_180(&mut part_image);
|
||||
}
|
||||
rotated_images.push(part_image);
|
||||
}
|
||||
|
||||
let text_lines = self.crnn_net.get_text_lines(
|
||||
&rotated_images,
|
||||
&angle_rollback_records,
|
||||
angle_rollback_threshold,
|
||||
Self::DEFAULT_REC_BATCH_SIZE,
|
||||
)?;
|
||||
|
||||
let mut text_blocks = Vec::with_capacity(text_lines.len());
|
||||
for (i, text_line) in text_lines.into_iter().enumerate() {
|
||||
text_blocks.push(TextBlock {
|
||||
box_points: text_boxes[i]
|
||||
.points
|
||||
.iter()
|
||||
.map(|p| Point {
|
||||
x: ((p.x as f32) - padding as f32) as u32,
|
||||
y: ((p.y as f32) - padding as f32) as u32,
|
||||
})
|
||||
.collect(),
|
||||
box_score: text_boxes[i].score,
|
||||
angle_index: angles[i].index,
|
||||
angle_score: angles[i].score,
|
||||
text: text_line.text,
|
||||
text_score: text_line.text_score,
|
||||
});
|
||||
}
|
||||
|
||||
Ok(OcrResult { text_blocks })
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use crate::ocr_result::TextBox;

    /// Axis-aligned 100x20 box whose first (top-left) point is `(x, y)`.
    fn make_box(x: u32, y: u32) -> TextBox {
        let (right, bottom) = (x + 100, y + 20);
        TextBox {
            points: vec![
                Point { x, y },
                Point { x: right, y },
                Point { x: right, y: bottom },
                Point { x, y: bottom },
            ],
            score: 0.9,
        }
    }

    /// Top-left x coordinate of every box, in slice order.
    fn top_left_xs(boxes: &[TextBox]) -> Vec<u32> {
        boxes.iter().map(|b| b.points[0].x).collect()
    }

    /// Top-left y coordinate of every box, in slice order.
    fn top_left_ys(boxes: &[TextBox]) -> Vec<u32> {
        boxes.iter().map(|b| b.points[0].y).collect()
    }

    #[test]
    fn test_sort_text_boxes_top_to_bottom() {
        let mut boxes = vec![make_box(10, 100), make_box(10, 50), make_box(10, 10)];
        OcrLite::sort_text_boxes(&mut boxes);
        assert_eq!(top_left_ys(&boxes), [10, 50, 100]);
    }

    #[test]
    fn test_sort_text_boxes_same_line_left_to_right() {
        // Equal y: the x coordinate decides.
        let mut boxes = vec![make_box(200, 10), make_box(100, 10), make_box(50, 10)];
        OcrLite::sort_text_boxes(&mut boxes);
        assert_eq!(top_left_xs(&boxes), [50, 100, 200]);
    }

    #[test]
    fn test_sort_text_boxes_multi_line() {
        // Strict (y, x) order: y=50/x=50, y=50/x=300, y=100/x=100, y=100/x=200
        let mut boxes = vec![
            make_box(300, 50),  // line 1, right
            make_box(100, 100), // line 2, left
            make_box(50, 50),   // line 1, left (same y=50)
            make_box(200, 100), // line 2, right (same y=100)
        ];
        OcrLite::sort_text_boxes(&mut boxes);

        // Line 1 (y=50) left then right, followed by line 2 (y=100) left then right.
        assert_eq!(top_left_xs(&boxes), [50, 300, 100, 200]);
    }

    #[test]
    fn test_sort_text_boxes_empty() {
        let mut boxes: Vec<TextBox> = Vec::new();
        OcrLite::sort_text_boxes(&mut boxes);
        assert!(boxes.is_empty());
    }

    #[test]
    fn test_sort_text_boxes_single() {
        let mut boxes = vec![make_box(10, 20)];
        OcrLite::sort_text_boxes(&mut boxes);
        assert_eq!(boxes.len(), 1);
    }
}
|
||||
105
crates/kreuzberg-paddle-ocr/src/ocr_result.rs
Normal file
105
crates/kreuzberg-paddle-ocr/src/ocr_result.rs
Normal file
@@ -0,0 +1,105 @@
|
||||
use std::fmt::{self, Write};
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
/// A 2-D coordinate with unsigned integer components, used for the corner
/// points of text boxes.
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
pub struct Point {
    /// Horizontal component.
    pub x: u32,
    /// Vertical component.
    pub y: u32,
}
|
||||
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
pub struct TextBox {
|
||||
pub points: Vec<Point>,
|
||||
pub score: f32,
|
||||
}
|
||||
|
||||
impl fmt::Display for TextBox {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
// SAFETY: We must have at least 4 points in a valid TextBox
|
||||
// This is enforced at the OCR processing level, but we check bounds here for safety
|
||||
if self.points.len() < 4 {
|
||||
return write!(
|
||||
f,
|
||||
"TextBox [score({}), points_count({})]",
|
||||
self.score,
|
||||
self.points.len()
|
||||
);
|
||||
}
|
||||
|
||||
write!(
|
||||
f,
|
||||
"TextBox [score({}), [x: {}, y: {}], [x: {}, y: {}], [x: {}, y: {}], [x: {}, y: {}]]",
|
||||
self.score,
|
||||
self.points[0].x,
|
||||
self.points[0].y,
|
||||
self.points[1].x,
|
||||
self.points[1].y,
|
||||
self.points[2].x,
|
||||
self.points[2].y,
|
||||
self.points[3].x,
|
||||
self.points[3].y,
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
/// Result of angle classification for one text crop.
#[derive(Debug, Default)]
pub struct Angle {
    /// Class index; a negative value is displayed as `AngleDisabled`.
    pub index: i32,
    /// Score reported alongside `index`.
    pub score: f32,
}

impl fmt::Display for Angle {
    /// Renders as `Angle[Index(i), Score(s)]`, with the `AngleDisabled`
    /// prefix when `index` is negative.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let label = if self.index < 0 { "AngleDisabled" } else { "Angle" };
        write!(f, "{}[Index({}), Score({})]", label, self.index, self.score)
    }
}
|
||||
|
||||
/// Recognised text for one crop together with its score.
#[derive(Debug, Default)]
pub struct TextLine {
    /// The recognised characters.
    pub text: String,
    /// Score reported for `text`.
    pub text_score: f32,
}

impl fmt::Display for TextLine {
    /// Renders as `TextLine[Text(..),TextScore(..)]`.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let Self { text, text_score } = self;
        write!(f, "TextLine[Text({}),TextScore({})]", text, text_score)
    }
}
|
||||
|
||||
/// One recognised text region: box geometry, angle classification and text.
#[derive(Debug, Serialize, Deserialize)]
pub struct TextBlock {
    /// Corner points of the text box.
    pub box_points: Vec<Point>,
    /// Score of the detected box.
    pub box_score: f32,

    /// Angle class index assigned to the region.
    pub angle_index: i32,
    /// Score reported with `angle_index`.
    pub angle_score: f32,

    /// Recognised text.
    pub text: String,
    /// Score reported with `text`.
    pub text_score: f32,
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct OcrResult {
|
||||
pub text_blocks: Vec<TextBlock>,
|
||||
}
|
||||
|
||||
impl fmt::Display for OcrResult {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
let mut str_builder = String::with_capacity(0);
|
||||
for text_block in &self.text_blocks {
|
||||
write!(
|
||||
str_builder,
|
||||
"TextBlock[BoxPointsLen({}), BoxScore({}), AngleIndex({}), AngleScore({}), Text({}), TextScore({})]",
|
||||
text_block.box_points.len(),
|
||||
text_block.box_score,
|
||||
text_block.angle_index,
|
||||
text_block.angle_score,
|
||||
text_block.text,
|
||||
text_block.text_score
|
||||
)?;
|
||||
}
|
||||
f.write_str(&str_builder)
|
||||
}
|
||||
}
|
||||
206
crates/kreuzberg-paddle-ocr/src/ocr_utils.rs
Normal file
206
crates/kreuzberg-paddle-ocr/src/ocr_utils.rs
Normal file
@@ -0,0 +1,206 @@
|
||||
use std::borrow::Cow;
|
||||
|
||||
use crate::{
|
||||
ocr_error::OcrError,
|
||||
ocr_result::{Point, TextBox},
|
||||
};
|
||||
use image::imageops;
|
||||
use imageproc::geometric_transformations::{Interpolation, Projection};
|
||||
use ndarray::{Array, Array4};
|
||||
|
||||
pub struct OcrUtils;
|
||||
|
||||
impl OcrUtils {
|
||||
/// Normalize image pixels and transpose from HWC (row-major RGB) to CHW tensor format.
|
||||
///
|
||||
/// Formula per pixel: `output[ch] = pixel[ch] * norm[ch] - mean[ch] * norm[ch]`
|
||||
///
|
||||
/// This is a hot path called once per page. Key optimizations:
|
||||
/// - Pre-computes `mean * norm` constants (avoids repeated multiply)
|
||||
/// - Writes each channel plane contiguously via `as_slice_mut()`, enabling
|
||||
/// LLVM auto-vectorization (NEON on ARM64, SSE/AVX on x86-64). The previous
|
||||
/// approach used `tensor[[0, ch, r, c]]` which scattered writes across planes
|
||||
/// and prevented any vectorization.
|
||||
pub fn substract_mean_normalize(img_src: &image::RgbImage, mean_vals: &[f32], norm_vals: &[f32]) -> Array4<f32> {
|
||||
let cols = img_src.width() as usize;
|
||||
let rows = img_src.height() as usize;
|
||||
let pixel_count = rows * cols;
|
||||
|
||||
let mut input_tensor = Array::zeros((1, 3, rows, cols));
|
||||
|
||||
let adjusted = [
|
||||
mean_vals[0] * norm_vals[0],
|
||||
mean_vals[1] * norm_vals[1],
|
||||
mean_vals[2] * norm_vals[2],
|
||||
];
|
||||
|
||||
let raw = img_src.as_raw();
|
||||
|
||||
// Write each channel plane as a contiguous slice. ndarray stores (1,3,H,W)
|
||||
// in C-contiguous (row-major) order, so plane [0,ch] is a contiguous H*W block.
|
||||
// This enables LLVM to auto-vectorize the inner loop (4-8 f32 ops per cycle).
|
||||
for ch in 0..3 {
|
||||
let norm = norm_vals[ch];
|
||||
let adj = adjusted[ch];
|
||||
let plane = input_tensor
|
||||
.slice_mut(ndarray::s![0, ch, .., ..])
|
||||
.into_shape_with_order(pixel_count)
|
||||
.expect("contiguous plane slice");
|
||||
let plane_slice = plane.into_slice().expect("contiguous memory");
|
||||
|
||||
for (i, out) in plane_slice.iter_mut().enumerate() {
|
||||
// raw is HWC: pixel i has R at raw[i*3], G at raw[i*3+1], B at raw[i*3+2]
|
||||
*out = raw[i * 3 + ch] as f32 * norm - adj;
|
||||
}
|
||||
}
|
||||
|
||||
input_tensor
|
||||
}
|
||||
|
||||
/// Add white padding around the image, or borrow it unchanged when padding=0.
|
||||
/// Returns Cow to avoid cloning the image in the common no-padding case.
|
||||
pub fn make_padding<'a>(img_src: &'a image::RgbImage, padding: u32) -> Result<Cow<'a, image::RgbImage>, OcrError> {
|
||||
if padding == 0 {
|
||||
return Ok(Cow::Borrowed(img_src));
|
||||
}
|
||||
|
||||
let width = img_src.width();
|
||||
let height = img_src.height();
|
||||
|
||||
let mut padding_src = image::RgbImage::new(width + 2 * padding, height + 2 * padding);
|
||||
imageproc::drawing::draw_filled_rect_mut(
|
||||
&mut padding_src,
|
||||
imageproc::rect::Rect::at(0, 0).of_size(width + 2 * padding, height + 2 * padding),
|
||||
image::Rgb([255, 255, 255]),
|
||||
);
|
||||
|
||||
image::imageops::replace(&mut padding_src, img_src, padding as i64, padding as i64);
|
||||
|
||||
Ok(Cow::Owned(padding_src))
|
||||
}
|
||||
|
||||
pub fn get_part_images(img_src: &image::RgbImage, text_boxes: &[TextBox]) -> Vec<image::RgbImage> {
|
||||
text_boxes
|
||||
.iter()
|
||||
.map(|text_box| Self::get_rotate_crop_image(img_src, &text_box.points))
|
||||
.collect()
|
||||
}
|
||||
|
||||
pub fn get_rotate_crop_image(img_src: &image::RgbImage, box_points: &[Point]) -> image::RgbImage {
|
||||
let mut points = box_points.to_vec();
|
||||
|
||||
// Calculate bounding box
|
||||
let (min_x, min_y, max_x, max_y) = points.iter().fold(
|
||||
(u32::MAX, u32::MAX, 0u32, 0u32),
|
||||
|(min_x, min_y, max_x, max_y), point| {
|
||||
(
|
||||
min_x.min(point.x),
|
||||
min_y.min(point.y),
|
||||
max_x.max(point.x),
|
||||
max_y.max(point.y),
|
||||
)
|
||||
},
|
||||
);
|
||||
|
||||
// Crop image
|
||||
let img_crop = imageops::crop_imm(img_src, min_x, min_y, max_x - min_x, max_y - min_y).to_image();
|
||||
|
||||
for point in &mut points {
|
||||
point.x = point.x.saturating_sub(min_x);
|
||||
point.y = point.y.saturating_sub(min_y);
|
||||
}
|
||||
|
||||
// Ensure we have enough points for transformation
|
||||
if points.len() < 4 {
|
||||
// Fallback: return the cropped image as-is if we don't have 4 points
|
||||
return img_crop;
|
||||
}
|
||||
|
||||
// Direct multiplication instead of .pow(2) — avoids integer power function overhead.
|
||||
let dx_w = (points[0].x as i32 - points[1].x as i32) as f32;
|
||||
let dy_w = (points[0].y as i32 - points[1].y as i32) as f32;
|
||||
let img_crop_width = (dx_w * dx_w + dy_w * dy_w).sqrt() as u32;
|
||||
let dx_h = (points[0].x as i32 - points[3].x as i32) as f32;
|
||||
let dy_h = (points[0].y as i32 - points[3].y as i32) as f32;
|
||||
let img_crop_height = (dx_h * dx_h + dy_h * dy_h).sqrt() as u32;
|
||||
|
||||
// Ensure dimensions are valid (non-zero)
|
||||
if img_crop_width == 0 || img_crop_height == 0 {
|
||||
return img_crop;
|
||||
}
|
||||
|
||||
let src_points = [
|
||||
(points[0].x as f32, points[0].y as f32),
|
||||
(points[1].x as f32, points[1].y as f32),
|
||||
(points[2].x as f32, points[2].y as f32),
|
||||
(points[3].x as f32, points[3].y as f32),
|
||||
];
|
||||
|
||||
let dst_points = [
|
||||
(0.0, 0.0),
|
||||
(img_crop_width as f32, 0.0),
|
||||
(img_crop_width as f32, img_crop_height as f32),
|
||||
(0.0, img_crop_height as f32),
|
||||
];
|
||||
|
||||
let projection = match Projection::from_control_points(src_points, dst_points) {
|
||||
Some(proj) => proj,
|
||||
None => {
|
||||
// If projection cannot be created, return the cropped image as fallback
|
||||
return img_crop;
|
||||
}
|
||||
};
|
||||
|
||||
let mut part_img = image::RgbImage::new(img_crop_width, img_crop_height);
|
||||
imageproc::geometric_transformations::warp_into(
|
||||
&img_crop,
|
||||
&projection,
|
||||
Interpolation::Nearest,
|
||||
image::Rgb([255, 255, 255]),
|
||||
&mut part_img,
|
||||
);
|
||||
|
||||
// Rotate image if needed
|
||||
if part_img.height() >= part_img.width() * 3 / 2 {
|
||||
let mut rotated = image::RgbImage::new(part_img.height(), part_img.width());
|
||||
|
||||
for (x, y, pixel) in part_img.enumerate_pixels() {
|
||||
rotated.put_pixel(y, part_img.width() - 1 - x, *pixel);
|
||||
}
|
||||
|
||||
rotated
|
||||
} else {
|
||||
part_img
|
||||
}
|
||||
}
|
||||
|
||||
/// Rotate `src` by 180 degrees in place.
///
/// Thin wrapper around `imageops::rotate180_in_place`.
pub fn mat_rotate_clock_wise_180(src: &mut image::RgbImage) {
    imageops::rotate180_in_place(src);
}
|
||||
|
||||
/// Compute mean of f32 image values where mask > 0.
|
||||
///
|
||||
/// Uses raw slice access instead of per-pixel get_pixel() for better
|
||||
/// cache behavior and to enable auto-vectorization of the reduction.
|
||||
pub fn calculate_mean_with_mask(
|
||||
img: &image::ImageBuffer<image::Luma<f32>, Vec<f32>>,
|
||||
mask: &image::ImageBuffer<image::Luma<u8>, Vec<u8>>,
|
||||
) -> f32 {
|
||||
assert_eq!(img.width(), mask.width());
|
||||
assert_eq!(img.height(), mask.height());
|
||||
|
||||
let img_raw = img.as_raw();
|
||||
let mask_raw = mask.as_raw();
|
||||
let mut sum: f32 = 0.0;
|
||||
let mut count: u32 = 0;
|
||||
|
||||
for (px, &m) in img_raw.iter().zip(mask_raw.iter()) {
|
||||
if m > 0 {
|
||||
sum += *px;
|
||||
count += 1;
|
||||
}
|
||||
}
|
||||
|
||||
if count == 0 { 0.0 } else { sum / count as f32 }
|
||||
}
|
||||
}
|
||||
69
crates/kreuzberg-paddle-ocr/src/scale_param.rs
Normal file
69
crates/kreuzberg-paddle-ocr/src/scale_param.rs
Normal file
@@ -0,0 +1,69 @@
|
||||
#[derive(Debug)]
|
||||
pub struct ScaleParam {
|
||||
pub src_width: u32,
|
||||
pub src_height: u32,
|
||||
pub dst_width: u32,
|
||||
pub dst_height: u32,
|
||||
pub scale_width: f32,
|
||||
pub scale_height: f32,
|
||||
}
|
||||
|
||||
impl ScaleParam {
|
||||
pub fn new(
|
||||
src_width: u32,
|
||||
src_height: u32,
|
||||
dst_width: u32,
|
||||
dst_height: u32,
|
||||
scale_width: f32,
|
||||
scale_height: f32,
|
||||
) -> Self {
|
||||
Self {
|
||||
src_width,
|
||||
src_height,
|
||||
dst_width,
|
||||
dst_height,
|
||||
scale_width,
|
||||
scale_height,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn get_scale_param(src: &image::RgbImage, target_size: u32) -> Self {
|
||||
let src_width = src.width();
|
||||
let src_height = src.height();
|
||||
let mut dst_width;
|
||||
let mut dst_height;
|
||||
|
||||
let ratio: f32 = if src_width > src_height {
|
||||
target_size as f32 / src_width as f32
|
||||
} else {
|
||||
target_size as f32 / src_height as f32
|
||||
};
|
||||
|
||||
dst_width = (src_width as f32 * ratio) as u32;
|
||||
dst_height = (src_height as f32 * ratio) as u32;
|
||||
|
||||
if dst_width % 32 != 0 {
|
||||
dst_width = (dst_width / 32) * 32;
|
||||
dst_width = dst_width.max(32);
|
||||
}
|
||||
if dst_height % 32 != 0 {
|
||||
dst_height = (dst_height / 32) * 32;
|
||||
dst_height = dst_height.max(32);
|
||||
}
|
||||
|
||||
let scale_width = dst_width as f32 / src_width as f32;
|
||||
let scale_height = dst_height as f32 / src_height as f32;
|
||||
|
||||
Self::new(src_width, src_height, dst_width, dst_height, scale_width, scale_height)
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Display for ScaleParam {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(
|
||||
f,
|
||||
"src_width:{},src_height:{},dst_width:{},dst_height:{},scale_width:{},scale_height:{}",
|
||||
self.src_width, self.src_height, self.dst_width, self.dst_height, self.scale_width, self.scale_height
|
||||
)
|
||||
}
|
||||
}
|
||||
436
crates/kreuzberg-paddle-ocr/tests/diagnostic.rs
Normal file
436
crates/kreuzberg-paddle-ocr/tests/diagnostic.rs
Normal file
@@ -0,0 +1,436 @@
|
||||
//! Diagnostic test to trace PaddleOCR detection pipeline.
|
||||
//!
|
||||
//! This test isolates each step to determine where empty results originate.
|
||||
//! Since this crate doesn't have PNG/image decoder features, we create test
|
||||
//! images programmatically.
|
||||
|
||||
use std::path::PathBuf;
|
||||
|
||||
fn get_workspace_root() -> PathBuf {
|
||||
let manifest_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
|
||||
manifest_dir.parent().unwrap().parent().unwrap().to_path_buf()
|
||||
}
|
||||
|
||||
fn get_model_dir() -> PathBuf {
|
||||
get_workspace_root().join(".kreuzberg/paddle-ocr")
|
||||
}
|
||||
|
||||
/// Create a simple test image with black text "HELLO" on white background.
|
||||
/// This avoids needing PNG decoder features.
|
||||
fn create_test_image() -> image::RgbImage {
|
||||
let width = 200u32;
|
||||
let height = 100u32;
|
||||
let mut img = image::RgbImage::from_pixel(width, height, image::Rgb([255, 255, 255]));
|
||||
|
||||
// Draw a thick black rectangle to simulate text (a simple "block" pattern)
|
||||
// This ensures the detection model has SOMETHING to detect
|
||||
let black = image::Rgb([0, 0, 0]);
|
||||
|
||||
// Draw "H" shape (x: 20-60, y: 20-80)
|
||||
for y in 20..80 {
|
||||
img.put_pixel(20, y, black);
|
||||
img.put_pixel(21, y, black);
|
||||
img.put_pixel(22, y, black);
|
||||
}
|
||||
for y in 20..80 {
|
||||
img.put_pixel(55, y, black);
|
||||
img.put_pixel(56, y, black);
|
||||
img.put_pixel(57, y, black);
|
||||
}
|
||||
for x in 20..58 {
|
||||
img.put_pixel(x, 48, black);
|
||||
img.put_pixel(x, 49, black);
|
||||
img.put_pixel(x, 50, black);
|
||||
}
|
||||
|
||||
// Draw thick solid block to be very obvious (x: 80-180, y: 30-70)
|
||||
for y in 30..70 {
|
||||
for x in 80..180 {
|
||||
img.put_pixel(x, y, black);
|
||||
}
|
||||
}
|
||||
|
||||
img
|
||||
}
|
||||
|
||||
/// Runs the full OCR pipeline on the synthetic test image with several
/// parameter sets and prints what each one detects.
///
/// The test returns early when the detection model file is missing. It
/// panics only if model initialisation fails; detection failures are logged.
#[test]
fn diagnostic_detection_pipeline() {
    let model_dir = get_model_dir();
    let det_path = model_dir.join("det/model.onnx");

    if !det_path.exists() {
        eprintln!("SKIP: Models not downloaded at {:?}", model_dir);
        return;
    }

    // Discover ORT library
    discover_ort();

    eprintln!("=== PaddleOCR Diagnostic Test ===");
    eprintln!("Model dir: {:?}", model_dir);

    // Step 1: Create test image
    let img = create_test_image();
    eprintln!("Step 1 - Test image created: {}x{}", img.width(), img.height());

    // Step 2: Initialize OcrLite
    let mut ocr_lite = kreuzberg_paddle_ocr::OcrLite::new();
    let cls_path = model_dir.join("cls/model.onnx");
    let rec_path = model_dir.join("rec/model.onnx");

    let init_result = ocr_lite.init_models(
        det_path.to_str().unwrap(),
        cls_path.to_str().unwrap(),
        rec_path.to_str().unwrap(),
        1,
    );

    if let Err(e) = &init_result {
        eprintln!("Step 2 - FAILED to init models: {:?}", e);
        panic!("Model initialization failed: {:?}", e);
    }
    eprintln!("Step 2 - Models initialized successfully");

    // Step 3: Run detection with various parameter sets
    // (name, padding, max_side, box_score, box_thresh, unclip, do_angle, most_angle)
    let test_cases: [(&str, u32, u32, f32, f32, f32, bool, bool); 5] = [
        ("A: Default params", 50, 960, 0.3, 0.5, 1.6, true, false),
        ("B: Very low thresholds", 50, 960, 0.01, 0.01, 1.6, false, false),
        ("C: No padding + low", 0, 960, 0.01, 0.01, 1.6, false, false),
        ("D: Higher unclip ratio", 50, 960, 0.1, 0.1, 3.0, false, false),
        ("E: No padding + medium", 0, 960, 0.1, 0.3, 2.0, false, false),
    ];

    let mut any_detected = false;

    for (name, padding, max_side, box_score, box_thresh, unclip, do_angle, most_angle) in test_cases {
        eprintln!("\n--- Test {} ---", name);
        eprintln!(
            " padding={}, max_side={}, box_score={}, box_thresh={}, unclip={}",
            padding, max_side, box_score, box_thresh, unclip
        );

        let outcome = ocr_lite.detect(
            &img, padding, max_side, box_score, box_thresh, unclip, do_angle, most_angle,
        );

        match outcome {
            Ok(ocr_result) => {
                eprintln!(" Result: {} text blocks", ocr_result.text_blocks.len());
                for (i, block) in ocr_result.text_blocks.iter().enumerate() {
                    eprintln!(
                        " Block {}: text='{}', text_score={:.3}, box_score={:.3}",
                        i, block.text, block.text_score, block.box_score
                    );
                }
                // A single block from any parameter set counts as "detection works".
                if !ocr_result.text_blocks.is_empty() {
                    any_detected = true;
                }
            }
            Err(e) => eprintln!(" FAILED: {:?}", e),
        }
    }

    eprintln!("\n=== Diagnosis ===");
    if any_detected {
        eprintln!("RESULT: Detection works. Issue may be threshold-related or image-specific.");
    } else {
        eprintln!("RESULT: Detection model produces NO output regardless of thresholds.");
        eprintln!("This strongly suggests an ORT version compatibility issue.");
        eprintln!(" ort crate version: check Cargo.lock for current version");
        eprintln!(" ORT_DYLIB_PATH: {:?}", std::env::var("ORT_DYLIB_PATH"));
    }
}
|
||||
|
||||
/// Feeds a constant 32x32 tensor straight into the detection model through
/// `ort`, bypassing `OcrLite`, and prints statistics of the first output to
/// show whether ORT executes the model at all.
///
/// Returns early when the detection model file is missing.
#[test]
fn diagnostic_raw_ort_inference() {
    use ort::session::Session;

    let det_model = get_model_dir().join("det/model.onnx");

    if !det_model.exists() {
        eprintln!("SKIP: Detection model not found at {:?}", det_model);
        return;
    }

    discover_ort();

    eprintln!("=== Raw ORT Inference Test ===");

    // Load model directly via ort
    let mut session = Session::builder().unwrap().commit_from_file(&det_model).unwrap();

    eprintln!("Model loaded successfully");
    eprintln!("Inputs:");
    for input in session.inputs() {
        eprintln!(" name='{}'", input.name());
    }
    eprintln!("Outputs:");
    for output in session.outputs() {
        eprintln!(" name='{}'", output.name());
    }

    // Small NCHW tensor (batch=1, channels=3, h=32, w=32) filled with 0.5.
    let gray = ndarray::Array::from_shape_vec((1, 3, 32, 32), vec![0.5f32; 3 * 32 * 32]).unwrap();
    let tensor = ort::value::Tensor::from_array(gray).unwrap();

    let input_name = session.inputs()[0].name().to_string();
    eprintln!("\nRunning inference with 32x32 gray image...");

    let outputs = session.run(ort::inputs![input_name => tensor]).unwrap();

    // Only the first output is inspected.
    let (output_name, output_value) = outputs.iter().next().unwrap();
    eprintln!("Output name: {}", output_name);

    let (output_shape, output_data) = output_value.try_extract_tensor::<f32>().unwrap();

    eprintln!("Output shape: {:?}", output_shape);
    eprintln!("Output len: {}", output_data.len());

    if output_data.is_empty() {
        return;
    }

    let min = output_data.iter().copied().fold(f32::INFINITY, f32::min);
    let max = output_data.iter().copied().fold(f32::NEG_INFINITY, f32::max);
    let mean = output_data.iter().sum::<f32>() / output_data.len() as f32;
    let non_zero = output_data.iter().filter(|&&v| v > 0.001).count();

    eprintln!("Output stats: min={:.6}, max={:.6}, mean={:.6}", min, max, mean);
    eprintln!("Non-zero values (>0.001): {} / {}", non_zero, output_data.len());

    if max < 0.001 {
        eprintln!("\nDIAGNOSIS: Model outputs are essentially all zeros.");
        eprintln!("This confirms an ORT compatibility issue - model isn't executing correctly.");
    } else {
        eprintln!("\nDIAGNOSIS: Model produces non-zero output. ORT is working.");
    }
}
|
||||
|
||||
/// Diagnostic: test the CRNN recognition model directly.
|
||||
#[test]
|
||||
fn diagnostic_crnn_model_output() {
|
||||
let model_dir = get_model_dir();
|
||||
let rec_model = model_dir.join("rec/model.onnx");
|
||||
|
||||
if !rec_model.exists() {
|
||||
eprintln!("SKIP: Recognition model not found");
|
||||
return;
|
||||
}
|
||||
|
||||
discover_ort();
|
||||
|
||||
eprintln!("=== CRNN Recognition Model Diagnostic ===");
|
||||
|
||||
use ort::session::Session;
|
||||
|
||||
let mut session = Session::builder().unwrap().commit_from_file(&rec_model).unwrap();
|
||||
|
||||
eprintln!("Model loaded successfully");
|
||||
eprintln!("Inputs:");
|
||||
for input in session.inputs() {
|
||||
eprintln!(" name='{}'", input.name());
|
||||
}
|
||||
eprintln!("Outputs:");
|
||||
for output in session.outputs() {
|
||||
eprintln!(" name='{}'", output.name());
|
||||
}
|
||||
|
||||
// Check metadata for character list
|
||||
{
|
||||
let metadata = session.metadata().unwrap();
|
||||
|
||||
// Check all metadata custom keys
|
||||
eprintln!("Model metadata:");
|
||||
eprintln!(" description: {:?}", metadata.description());
|
||||
eprintln!(" producer: {:?}", metadata.producer());
|
||||
|
||||
// Try to get the character key
|
||||
match metadata.custom("character") {
|
||||
Some(chars) => {
|
||||
let bytes = chars.as_bytes();
|
||||
let char_count = chars.split('\n').count();
|
||||
eprintln!(
|
||||
" custom('character'): len={}, bytes={}, split_count={}",
|
||||
chars.len(),
|
||||
bytes.len(),
|
||||
char_count
|
||||
);
|
||||
if chars.len() < 500 {
|
||||
eprintln!(" value: {:?}", chars);
|
||||
} else {
|
||||
let preview: String = chars.chars().take(100).collect();
|
||||
eprintln!(" preview (first 100 chars): {:?}", preview);
|
||||
}
|
||||
|
||||
// Check for null bytes or other encoding issues
|
||||
let null_count = bytes.iter().filter(|&&b| b == 0).count();
|
||||
if null_count > 0 {
|
||||
eprintln!(" WARNING: {} null bytes found in character string!", null_count);
|
||||
}
|
||||
}
|
||||
None => {
|
||||
eprintln!(" ERROR: No 'character' key in model metadata!");
|
||||
}
|
||||
}
|
||||
|
||||
// Try other possible metadata keys
|
||||
for key in [
|
||||
"character",
|
||||
"characters",
|
||||
"dict",
|
||||
"dictionary",
|
||||
"labels",
|
||||
"vocab",
|
||||
"alphabet",
|
||||
] {
|
||||
if let Some(val) = metadata.custom(key) {
|
||||
eprintln!(
|
||||
" custom('{}'): len={}, preview={:?}",
|
||||
key,
|
||||
val.len(),
|
||||
&val[..val.len().min(80)]
|
||||
);
|
||||
}
|
||||
}
|
||||
} // metadata dropped here
|
||||
|
||||
// Test 1: Run inference with a simple input (height=48, width=200)
|
||||
// CRNN expects NCHW: [1, 3, 48, width]
|
||||
let h = 48usize;
|
||||
let w = 200usize;
|
||||
|
||||
// Create a pattern that looks like text (alternating black/white vertical stripes)
|
||||
let mut input_data: Vec<f32> = vec![0.0; 3 * h * w];
|
||||
for c in 0..3 {
|
||||
for y in 10..38 {
|
||||
for x in (20..180).step_by(2) {
|
||||
input_data[c * h * w + y * w + x] = -1.0; // normalized black
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let tensor =
|
||||
ort::value::Tensor::from_array(ndarray::Array::from_shape_vec((1, 3, h, w), input_data).unwrap()).unwrap();
|
||||
|
||||
let input_name = session.inputs()[0].name().to_string();
|
||||
eprintln!("\nRunning CRNN with striped pattern (48x200)...");
|
||||
|
||||
let outputs = session.run(ort::inputs![input_name => tensor]).unwrap();
|
||||
|
||||
let (_, output_value) = outputs.iter().next().unwrap();
|
||||
let (shape, data) = output_value.try_extract_tensor::<f32>().unwrap();
|
||||
|
||||
eprintln!("Output shape: {:?}", shape);
|
||||
eprintln!("Output total values: {}", data.len());
|
||||
|
||||
if shape.len() >= 3 {
|
||||
let time_steps = shape[1] as usize;
|
||||
let vocab_size = shape[2] as usize;
|
||||
eprintln!("Time steps: {}, Vocabulary size: {}", time_steps, vocab_size);
|
||||
|
||||
// Check if outputs are meaningful
|
||||
let data_vec: Vec<f32> = data.to_vec();
|
||||
let min = data_vec.iter().cloned().fold(f32::INFINITY, f32::min);
|
||||
let max = data_vec.iter().cloned().fold(f32::NEG_INFINITY, f32::max);
|
||||
let mean: f32 = data_vec.iter().sum::<f32>() / data_vec.len() as f32;
|
||||
eprintln!("Overall stats: min={:.6}, max={:.6}, mean={:.6}", min, max, mean);
|
||||
|
||||
// Check argmax distribution
|
||||
let mut argmax_zero_count = 0;
|
||||
let mut argmax_nonzero_count = 0;
|
||||
for t in 0..time_steps {
|
||||
let start = t * vocab_size;
|
||||
let end = start + vocab_size;
|
||||
let slice = &data_vec[start..end.min(data_vec.len())];
|
||||
|
||||
let (max_idx, max_val) =
|
||||
slice.iter().enumerate().fold(
|
||||
(0, f32::MIN),
|
||||
|(mi, mv), (i, &v)| if v > mv { (i, v) } else { (mi, mv) },
|
||||
);
|
||||
|
||||
if max_idx == 0 {
|
||||
argmax_zero_count += 1;
|
||||
} else {
|
||||
argmax_nonzero_count += 1;
|
||||
}
|
||||
|
||||
if t < 5 || (t > time_steps - 3) {
|
||||
eprintln!(" Step {}: argmax={}, max_val={:.4}", t, max_idx, max_val);
|
||||
} else if t == 5 {
|
||||
eprintln!(" ... (skipping middle steps)");
|
||||
}
|
||||
}
|
||||
|
||||
eprintln!(
|
||||
"\nArgmax distribution: {} blank (idx=0), {} non-blank",
|
||||
argmax_zero_count, argmax_nonzero_count
|
||||
);
|
||||
|
||||
if argmax_nonzero_count == 0 {
|
||||
eprintln!("\nDIAGNOSIS: CRNN model outputs all blanks.");
|
||||
eprintln!("Possible causes:");
|
||||
eprintln!(" 1. ORT version incompatibility with CRNN model");
|
||||
eprintln!(" 2. Model is not executing graph correctly");
|
||||
eprintln!(" 3. Input normalization mismatch");
|
||||
} else {
|
||||
eprintln!("\nDIAGNOSIS: CRNN model produces non-blank output. Recognition works.");
|
||||
}
|
||||
}
|
||||
|
||||
// Drop outputs before reusing session
|
||||
drop(outputs);
|
||||
|
||||
// Test 2: Run with a uniform white image (should produce all blanks - valid baseline)
|
||||
let white_data: Vec<f32> = vec![1.0; 3 * h * w];
|
||||
let white_tensor =
|
||||
ort::value::Tensor::from_array(ndarray::Array::from_shape_vec((1, 3, h, w), white_data).unwrap()).unwrap();
|
||||
|
||||
let input_name2 = session.inputs()[0].name().to_string();
|
||||
eprintln!("\nRunning CRNN with uniform white (48x200)...");
|
||||
let white_outputs = session.run(ort::inputs![input_name2 => white_tensor]).unwrap();
|
||||
let (_, white_val) = white_outputs.iter().next().unwrap();
|
||||
let (_, white_data_out) = white_val.try_extract_tensor::<f32>().unwrap();
|
||||
let white_vec: Vec<f32> = white_data_out.to_vec();
|
||||
let white_max = white_vec.iter().cloned().fold(f32::NEG_INFINITY, f32::max);
|
||||
let white_min = white_vec.iter().cloned().fold(f32::INFINITY, f32::min);
|
||||
eprintln!("White image output: min={:.6}, max={:.6}", white_min, white_max);
|
||||
}
|
||||
|
||||
/// Makes sure `ORT_DYLIB_PATH` refers to an ONNX Runtime dynamic library.
///
/// If the variable is already set and the path it names exists, that path is
/// reported on stderr and nothing is changed. Otherwise two hard-coded candidate
/// paths are probed in order, and the first one that exists is exported through
/// `ORT_DYLIB_PATH`. When neither exists, only a warning is printed to stderr.
fn discover_ort() {
    const ENV_KEY: &str = "ORT_DYLIB_PATH";
    const KNOWN_LOCATIONS: [&str; 2] = [
        "/opt/homebrew/lib/libonnxruntime.dylib",
        "/usr/local/lib/libonnxruntime.dylib",
    ];

    // An already-configured path only counts when it actually exists on disk.
    let preset = std::env::var(ENV_KEY)
        .ok()
        .filter(|configured| std::path::Path::new(configured).exists());
    if let Some(path) = preset {
        eprintln!("ORT found via ORT_DYLIB_PATH: {}", path);
        return;
    }

    // First candidate that exists wins; order of KNOWN_LOCATIONS is the priority.
    let located = KNOWN_LOCATIONS
        .iter()
        .copied()
        .find(|location| std::path::Path::new(location).exists());

    match located {
        Some(candidate) => {
            eprintln!("Setting ORT_DYLIB_PATH={}", candidate);
            // SAFETY: `set_var` is only sound while no other thread is reading or
            // writing the process environment. Presumably this runs before any
            // threads are spawned — TODO confirm against the caller.
            unsafe { std::env::set_var(ENV_KEY, candidate) };
        }
        None => eprintln!("WARNING: Could not find ORT library!"),
    }
}
|
||||
27
crates/kreuzberg-php/Cargo.toml
generated
Normal file
27
crates/kreuzberg-php/Cargo.toml
generated
Normal file
@@ -0,0 +1,27 @@
|
||||
[package]
|
||||
name = "kreuzberg-php"
|
||||
version = "5.0.0-rc.3"
|
||||
edition = "2024"
|
||||
license = "Elastic-2.0"
|
||||
description = "High-performance document intelligence library"
|
||||
readme = false
|
||||
keywords = ["document", "extraction", "ocr", "pdf", "text"]
|
||||
categories = ["text-processing"]
|
||||
|
||||
# `tokio`, `ahash`, and `async-trait` are conditionally included but not directly used in PHP code.
|
||||
[package.metadata.cargo-machete]
|
||||
ignored = ["tokio", "ahash", "async-trait"]
|
||||
|
||||
[lib]
|
||||
crate-type = ["cdylib"]
|
||||
|
||||
[features]
|
||||
extension-module = []
|
||||
|
||||
[dependencies]
|
||||
async-trait = "0.1"
|
||||
ext-php-rs = "0.15"
|
||||
kreuzberg = { version = "5.0.0-rc.3", path = "../kreuzberg", features = ["full", "pdf", "ocr", "paddle-ocr", "paddle-ocr-types", "layout-detection", "layout-types", "embeddings", "embedding-presets", "chunking", "keywords-yake", "keywords-rake", "language-detection", "html", "tree-sitter", "office", "email", "archives", "stopwords", "auto-rotate", "auto-rotate-types", "tokio-runtime", "api", "mcp", "liter-llm", "quality"] }
|
||||
serde = { version = "1", features = ["derive"] }
|
||||
serde_json = "1"
|
||||
tokio = { version = "1", features = ["full"] }
|
||||
93
crates/kreuzberg-php/src/LICENSE
generated
Normal file
93
crates/kreuzberg-php/src/LICENSE
generated
Normal file
@@ -0,0 +1,93 @@
|
||||
Elastic License 2.0 (ELv2)
|
||||
|
||||
Copyright 2025-2026 Kreuzberg, Inc.
|
||||
|
||||
Acceptance
|
||||
|
||||
By using the software, you agree to all of the terms and conditions below.
|
||||
|
||||
Copyright License
|
||||
|
||||
The licensor grants you a non-exclusive, royalty-free, worldwide,
|
||||
non-sublicensable, non-transferable license to use, copy, distribute, make
|
||||
available, and prepare derivative works of the software, in each case subject to
|
||||
the limitations and conditions below.
|
||||
|
||||
Limitations
|
||||
|
||||
You may not provide the software to third parties as a hosted or managed
|
||||
service, where the service provides users with access to any substantial set of
|
||||
the features or functionality of the software.
|
||||
|
||||
You may not move, change, disable, or circumvent the license key functionality
|
||||
in the software, and you may not remove or obscure any functionality in the
|
||||
software that is protected by the license key.
|
||||
|
||||
You may not alter, remove, or obscure any licensing, copyright, or other notices
|
||||
of the licensor in the software. Any use of the licensor's trademarks is subject
|
||||
to applicable law.
|
||||
|
||||
Patents
|
||||
|
||||
The licensor grants you a license, under any patent claims the licensor can
|
||||
license, or becomes able to license, to make, have made, use, sell, offer for
|
||||
sale, import and have imported the software, in each case subject to the
|
||||
limitations and conditions in this license. This license does not cover any
|
||||
patent claims that you cause to be infringed by modifications or additions to the
|
||||
software. If you or your company make any written claim that the software
|
||||
infringes or contributes to infringement of any patent, your patent license for
|
||||
the software granted under these terms ends immediately. If your company makes
|
||||
such a claim, your patent license ends immediately for work on behalf of your
|
||||
company.
|
||||
|
||||
Notices
|
||||
|
||||
You must ensure that anyone who gets a copy of any part of the software from you
|
||||
also gets a copy of these terms.
|
||||
|
||||
If you modify the software, you must include in any modified copies of the
|
||||
software prominent notices stating that you have modified the software.
|
||||
|
||||
No Other Rights
|
||||
|
||||
These terms do not imply any licenses other than those expressly granted in
|
||||
these terms.
|
||||
|
||||
Termination
|
||||
|
||||
If you use the software in violation of these terms, such use is not licensed,
|
||||
and your licenses will automatically terminate. If the licensor provides you with
|
||||
a notice of your violation, and you cease all violation of this license no later
|
||||
than 30 days after you receive that notice, your licenses will be reinstated
|
||||
retroactively. However, if you violate these terms after such reinstatement, any
|
||||
additional violation of these terms will cause your licenses to terminate
|
||||
automatically and permanently.
|
||||
|
||||
No Liability
|
||||
|
||||
As far as the law allows, the software comes as is, without any warranty or
|
||||
condition, and the licensor will not be liable to you for any damages arising out
|
||||
of these terms or the use or nature of the software, under any kind of legal
|
||||
claim.
|
||||
|
||||
Definitions
|
||||
|
||||
The licensor is the entity offering these terms, and the software is the
|
||||
software the licensor makes available under these terms, including any portion
|
||||
of it.
|
||||
|
||||
you refers to the individual or entity agreeing to these terms.
|
||||
|
||||
your company is any legal entity, sole proprietorship, or other kind of
|
||||
organization that you work for, plus all organizations that have control over,
|
||||
are under the control of, or are under common control with that organization.
|
||||
control means ownership of substantially all the assets of an entity, or the
|
||||
power to direct its management and policies by vote, contract, or otherwise.
|
||||
Control can be direct or indirect.
|
||||
|
||||
your licenses are all the licenses granted to you for the software under these
|
||||
terms.
|
||||
|
||||
use means anything you do with the software requiring one of your licenses.
|
||||
|
||||
trademark means trademarks, service marks, and similar rights.
|
||||
34
crates/kreuzberg-php/src/composer.json
generated
Normal file
34
crates/kreuzberg-php/src/composer.json
generated
Normal file
@@ -0,0 +1,34 @@
|
||||
{
|
||||
"name": "kreuzberg-dev/kreuzberg",
|
||||
"description": "High-performance document intelligence library",
|
||||
"license": "Elastic-2.0",
|
||||
"type": "php-ext",
|
||||
"require": {
|
||||
"php": ">=8.2"
|
||||
},
|
||||
"require-dev": {
|
||||
"phpstan/phpstan": "^2.1",
|
||||
"friendsofphp/php-cs-fixer": "^3.95",
|
||||
"phpunit/phpunit": "^13.1"
|
||||
},
|
||||
"autoload": {
|
||||
"psr-4": {
|
||||
"Kreuzberg\\": "src/"
|
||||
}
|
||||
},
|
||||
"scripts": {
|
||||
"phpstan": "php -d detect_unicode=0 vendor/bin/phpstan --configuration=phpstan.neon --memory-limit=512M",
|
||||
"format": "php vendor/bin/php-cs-fixer fix --quiet",
|
||||
"format:check": "php vendor/bin/php-cs-fixer fix --dry-run --quiet",
|
||||
"test": "php vendor/bin/phpunit",
|
||||
"lint": "@phpstan",
|
||||
"lint:fix": "php vendor/bin/php-cs-fixer fix --quiet && php -d detect_unicode=0 vendor/bin/phpstan --configuration=phpstan.neon --memory-limit=512M"
|
||||
},
|
||||
"php-ext": {
|
||||
"extension-name": "kreuzberg",
|
||||
"support-zts": true,
|
||||
"support-nts": true,
|
||||
"download-url-method": ["pre-packaged-binary", "composer-default"]
|
||||
},
|
||||
"keywords": ["document", "extraction", "ocr", "pdf", "text"]
|
||||
}
|
||||
4696
crates/kreuzberg-php/src/composer.lock
generated
Normal file
4696
crates/kreuzberg-php/src/composer.lock
generated
Normal file
File diff suppressed because it is too large
Load Diff
19159
crates/kreuzberg-php/src/lib.rs
generated
Normal file
19159
crates/kreuzberg-php/src/lib.rs
generated
Normal file
File diff suppressed because it is too large
Load Diff
2
crates/kreuzberg-php/src/phpstan-baseline.neon
generated
Normal file
2
crates/kreuzberg-php/src/phpstan-baseline.neon
generated
Normal file
@@ -0,0 +1,2 @@
|
||||
parameters:
|
||||
ignoreErrors: []
|
||||
12
crates/kreuzberg-php/src/phpstan.neon
generated
Normal file
12
crates/kreuzberg-php/src/phpstan.neon
generated
Normal file
@@ -0,0 +1,12 @@
|
||||
includes:
|
||||
- phpstan-baseline.neon
|
||||
|
||||
parameters:
|
||||
level: max
|
||||
paths:
|
||||
- src
|
||||
scanFiles:
|
||||
- stubs/kreuzberg_extension.php
|
||||
treatPhpDocTypesAsCertain: false
|
||||
reportUnmatchedIgnoredErrors: false
|
||||
tmpDir: var/cache/phpstan
|
||||
34
crates/kreuzberg-py/Cargo.toml
generated
Normal file
34
crates/kreuzberg-py/Cargo.toml
generated
Normal file
@@ -0,0 +1,34 @@
|
||||
[package]
|
||||
name = "kreuzberg-py"
|
||||
version = "5.0.0-rc.3"
|
||||
edition = "2024"
|
||||
license = "Elastic-2.0"
|
||||
description = "High-performance document intelligence library"
|
||||
readme = false
|
||||
keywords = ["document", "extraction", "ocr", "pdf", "text"]
|
||||
categories = ["text-processing"]
|
||||
|
||||
# `pyo3-async-runtimes` and `serde_json` are emitted unconditionally below so
|
||||
# the manifest is stable across regens, but for umbrella crates with no
|
||||
# async fns or no JSON-marshalled return types they are genuinely unused.
|
||||
# The conditional `async-trait` / `tokio` / `futures` deps are similarly
|
||||
# flagged when the umbrella has trait-bridge / streaming adapters configured
|
||||
# but no actual async-trait / async callsite in the generated PyO3 shim.
|
||||
[package.metadata.cargo-machete]
|
||||
ignored = ["pyo3-async-runtimes", "serde_json", "async-trait", "tokio"]
|
||||
|
||||
[lib]
|
||||
name = "_kreuzberg"
|
||||
crate-type = ["cdylib"]
|
||||
|
||||
[features]
|
||||
extension-module = ["pyo3/extension-module", "pyo3/abi3-py310"]
|
||||
|
||||
[dependencies]
|
||||
async-trait = "0.1"
|
||||
kreuzberg = { version = "5.0.0-rc.3", path = "../kreuzberg", features = ["full", "pdf", "ocr", "paddle-ocr", "paddle-ocr-types", "layout-detection", "layout-types", "embeddings", "embedding-presets", "chunking", "keywords-yake", "keywords-rake", "language-detection", "html", "tree-sitter", "office", "email", "archives", "stopwords", "auto-rotate", "auto-rotate-types", "tokio-runtime", "api", "mcp", "liter-llm", "quality"] }
|
||||
pyo3 = { version = "0.28" }
|
||||
pyo3-async-runtimes = { version = "0.28", features = ["tokio-runtime"] }
|
||||
serde = { version = "1", features = ["derive"] }
|
||||
serde_json = "1"
|
||||
tokio = { version = "1", features = ["rt-multi-thread"] }
|
||||
93
crates/kreuzberg-py/src/LICENSE
generated
Normal file
93
crates/kreuzberg-py/src/LICENSE
generated
Normal file
@@ -0,0 +1,93 @@
|
||||
Elastic License 2.0 (ELv2)
|
||||
|
||||
Copyright 2025-2026 Kreuzberg, Inc.
|
||||
|
||||
Acceptance
|
||||
|
||||
By using the software, you agree to all of the terms and conditions below.
|
||||
|
||||
Copyright License
|
||||
|
||||
The licensor grants you a non-exclusive, royalty-free, worldwide,
|
||||
non-sublicensable, non-transferable license to use, copy, distribute, make
|
||||
available, and prepare derivative works of the software, in each case subject to
|
||||
the limitations and conditions below.
|
||||
|
||||
Limitations
|
||||
|
||||
You may not provide the software to third parties as a hosted or managed
|
||||
service, where the service provides users with access to any substantial set of
|
||||
the features or functionality of the software.
|
||||
|
||||
You may not move, change, disable, or circumvent the license key functionality
|
||||
in the software, and you may not remove or obscure any functionality in the
|
||||
software that is protected by the license key.
|
||||
|
||||
You may not alter, remove, or obscure any licensing, copyright, or other notices
|
||||
of the licensor in the software. Any use of the licensor's trademarks is subject
|
||||
to applicable law.
|
||||
|
||||
Patents
|
||||
|
||||
The licensor grants you a license, under any patent claims the licensor can
|
||||
license, or becomes able to license, to make, have made, use, sell, offer for
|
||||
sale, import and have imported the software, in each case subject to the
|
||||
limitations and conditions in this license. This license does not cover any
|
||||
patent claims that you cause to be infringed by modifications or additions to the
|
||||
software. If you or your company make any written claim that the software
|
||||
infringes or contributes to infringement of any patent, your patent license for
|
||||
the software granted under these terms ends immediately. If your company makes
|
||||
such a claim, your patent license ends immediately for work on behalf of your
|
||||
company.
|
||||
|
||||
Notices
|
||||
|
||||
You must ensure that anyone who gets a copy of any part of the software from you
|
||||
also gets a copy of these terms.
|
||||
|
||||
If you modify the software, you must include in any modified copies of the
|
||||
software prominent notices stating that you have modified the software.
|
||||
|
||||
No Other Rights
|
||||
|
||||
These terms do not imply any licenses other than those expressly granted in
|
||||
these terms.
|
||||
|
||||
Termination
|
||||
|
||||
If you use the software in violation of these terms, such use is not licensed,
|
||||
and your licenses will automatically terminate. If the licensor provides you with
|
||||
a notice of your violation, and you cease all violation of this license no later
|
||||
than 30 days after you receive that notice, your licenses will be reinstated
|
||||
retroactively. However, if you violate these terms after such reinstatement, any
|
||||
additional violation of these terms will cause your licenses to terminate
|
||||
automatically and permanently.
|
||||
|
||||
No Liability
|
||||
|
||||
As far as the law allows, the software comes as is, without any warranty or
|
||||
condition, and the licensor will not be liable to you for any damages arising out
|
||||
of these terms or the use or nature of the software, under any kind of legal
|
||||
claim.
|
||||
|
||||
Definitions
|
||||
|
||||
The licensor is the entity offering these terms, and the software is the
|
||||
software the licensor makes available under these terms, including any portion
|
||||
of it.
|
||||
|
||||
you refers to the individual or entity agreeing to these terms.
|
||||
|
||||
your company is any legal entity, sole proprietorship, or other kind of
|
||||
organization that you work for, plus all organizations that have control over,
|
||||
are under the control of, or are under common control with that organization.
|
||||
control means ownership of substantially all the assets of an entity, or the
|
||||
power to direct its management and policies by vote, contract, or otherwise.
|
||||
Control can be direct or indirect.
|
||||
|
||||
your licenses are all the licenses granted to you for the software under these
|
||||
terms.
|
||||
|
||||
use means anything you do with the software requiring one of your licenses.
|
||||
|
||||
trademark means trademarks, service marks, and similar rights.
|
||||
0
crates/kreuzberg-py/src/kreuzberg/py.typed
generated
Normal file
0
crates/kreuzberg-py/src/kreuzberg/py.typed
generated
Normal file
17712
crates/kreuzberg-py/src/lib.rs
generated
Normal file
17712
crates/kreuzberg-py/src/lib.rs
generated
Normal file
File diff suppressed because it is too large
Load Diff
105
crates/kreuzberg-py/src/pyproject.toml
generated
Normal file
105
crates/kreuzberg-py/src/pyproject.toml
generated
Normal file
@@ -0,0 +1,105 @@
|
||||
[build-system]
|
||||
build-backend = "maturin"
|
||||
requires = [ "maturin>=1,<2" ]
|
||||
|
||||
[project]
|
||||
name = "kreuzberg"
|
||||
version = "5.0.0rc3"
|
||||
description = "High-performance document intelligence library"
|
||||
keywords = [ "document", "extraction", "ocr", "pdf", "text" ]
|
||||
license = "Elastic-2.0"
|
||||
license-files = [ "LICENSE" ]
|
||||
authors = [ { name = "Na'aman Hirschfeld <naaman@kreuzberg.dev>" } ]
|
||||
requires-python = ">=3.10"
|
||||
classifiers = [
|
||||
"Programming Language :: Python :: 3 :: Only",
|
||||
"Programming Language :: Python :: 3.10",
|
||||
"Programming Language :: Python :: 3.11",
|
||||
"Programming Language :: Python :: 3.12",
|
||||
"Programming Language :: Python :: 3.13",
|
||||
"Programming Language :: Python :: 3.14",
|
||||
]
|
||||
urls.repository = "https://github.com/kreuzberg-dev/kreuzberg"
|
||||
homepage = "https://kreuzberg.dev"
|
||||
|
||||
[dependency-groups]
|
||||
dev = [ "mypy>=1.19", "ruff>=0.14.8" ]
|
||||
|
||||
[tool.maturin]
|
||||
module-name = "kreuzberg._kreuzberg"
|
||||
manifest-path = "../../crates/kreuzberg-py/Cargo.toml"
|
||||
# abi3-py310 produces a single wheel per platform that loads on Python 3.10+,
|
||||
# avoiding a per-Python-version build matrix.
|
||||
features = [ "pyo3/extension-module", "pyo3/abi3-py310" ]
|
||||
python-packages = [ "kreuzberg" ]
|
||||
# Bundle the core Rust crate so `pip install` can build from sdist on
|
||||
# platforms without a precompiled wheel (e.g. Alpine/musl). Without this
|
||||
# the workspace [patch.crates-io] (when present) points at a path that is
|
||||
# missing from the tarball and the source build fails.
|
||||
include = [
|
||||
{ path = "../../crates/kreuzberg/**/*", format = "sdist" },
|
||||
]
|
||||
|
||||
[tool.ruff]
|
||||
target-version = "py310"
|
||||
line-length = 120
|
||||
format.docstring-code-line-length = 120
|
||||
format.docstring-code-format = true
|
||||
lint.select = [ "ALL" ]
|
||||
lint.ignore = [
|
||||
"ANN401",
|
||||
"ASYNC109",
|
||||
"ASYNC110",
|
||||
"BLE001",
|
||||
"COM812",
|
||||
"D100",
|
||||
"D104",
|
||||
"D107",
|
||||
"D205",
|
||||
"E501",
|
||||
"EM",
|
||||
"FBT",
|
||||
"FIX",
|
||||
"ISC001",
|
||||
"PD011",
|
||||
"PGH003",
|
||||
"PLR2004",
|
||||
"PLW0603",
|
||||
"S104",
|
||||
"S110",
|
||||
"S603",
|
||||
"TD",
|
||||
"TRY",
|
||||
]
|
||||
lint.per-file-ignores."kreuzberg/__init__.py" = [ "I001" ]
|
||||
# The alef Python codegen still emits cosmetic warnings on the wrapper
|
||||
# modules: api.py keeps the legacy `from typing import AsyncIterator` and a
|
||||
# single-line import block, options.py carries # noqa: TC001 / F401 markers
|
||||
# that turn out unused on every regen, __init__.py star-imports re-sort with
|
||||
# a different convention. Silence these specific rules on the wrappers until
|
||||
# the codegen is updated to emit ruff-clean output.
|
||||
lint.per-file-ignores."kreuzberg/api.py" = [ "F401", "I001", "UP035" ]
|
||||
lint.per-file-ignores."kreuzberg/options.py" = [ "F401", "RUF100" ]
|
||||
lint.per-file-ignores."tests/**" = [ "ANN", "D103", "PLR2004", "S101" ]
|
||||
lint.mccabe.max-complexity = 15
|
||||
lint.pydocstyle.convention = "google"
|
||||
lint.pylint.max-args = 10
|
||||
lint.pylint.max-branches = 15
|
||||
lint.pylint.max-returns = 10
|
||||
|
||||
[tool.mypy]
|
||||
python_version = "3.10"
|
||||
strict = true
|
||||
show_error_codes = true
|
||||
implicit_reexport = false
|
||||
namespace_packages = true
|
||||
overrides = [
|
||||
# The alef-emitted `api.py` wrapper has a structural mismatch between its
|
||||
# `options.*` dataclass signatures and the `_internal_bindings.*` pyclass
|
||||
# types pyo3 accepts/returns at runtime. pyo3 reconciles them dynamically via
|
||||
# FromPyObject — the Python e2e suite exercises the runtime path — but mypy
|
||||
# sees only the static-type discrepancy. Disable the four error codes the
|
||||
# discrepancy raises until the codegen emits matching `_to_rust_*` calls and
|
||||
# casts the return values.
|
||||
{ module = "kreuzberg.api", disable_error_code = [ "call-arg", "arg-type", "return-value", "attr-defined" ] },
|
||||
]
|
||||
316
crates/kreuzberg-py/src/uv.lock
generated
Normal file
316
crates/kreuzberg-py/src/uv.lock
generated
Normal file
@@ -0,0 +1,316 @@
|
||||
version = 1
|
||||
revision = 3
|
||||
requires-python = ">=3.10"
|
||||
resolution-markers = [
|
||||
"python_full_version >= '3.15'",
|
||||
"python_full_version < '3.15'",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "ast-serialize"
|
||||
version = "0.5.0"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/81/9d/09e27731bd5864a9ce04e3244074e674bb8936bf62b45e0357248717adac/ast_serialize-0.5.0.tar.gz", hash = "sha256:5880091bfe6f4f986f22866375c2e884843e7a0b6343ae41aeea659613d879b6", size = 61157, upload-time = "2026-05-17T17:48:29.429Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/c0/9a/13dde51ba9e15f8b97957ab7cb0120d0e381524d651c6bd630b9c359227f/ast_serialize-0.5.0-cp314-cp314t-macosx_10_12_x86_64.whl", hash = "sha256:8f5c14f169eb0972c0c21bada5358b23d6047c76583b005234f865b11f1fa00a", size = 1183520, upload-time = "2026-05-17T17:47:30.831Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/37/de/5a7f0a9fe68944f536632a5af84676739c7d2582be42deb082634bf3a754/ast_serialize-0.5.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:7d1a2de9de5be04652f0ed60738356ef94f66db37924a9499fffe98dc491aa0b", size = 1175779, upload-time = "2026-05-17T17:47:32.551Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/9c/81/0bb853e76e4f6e9a1855d569003c59e19ffac45f7079d91505d1bb212f92/ast_serialize-0.5.0-cp314-cp314t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:be5173fb66f9b49026d9d5a2ff0fc7c7009077107c0eb285b2d60fdf1fe10bd1", size = 1233750, upload-time = "2026-05-17T17:47:34.731Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/e5/d3/4cf705beeccc08754d0bbda99aefff26110e209b9a07ac8a6b60eec48531/ast_serialize-0.5.0-cp314-cp314t-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:f8015cd071ac1339924ee2b8098c93e00e155f30a16f40ec9816fcf84f4753f6", size = 1235942, upload-time = "2026-05-17T17:47:36.287Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/26/c8/ee097e437ea27dd2b8b227865c875492b585650a5802a22d82b304c8201b/ast_serialize-0.5.0-cp314-cp314t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5499e8797edff2a9186aa313ed382c6b422e798e9332d9953badcee6e69a88f2", size = 1442517, upload-time = "2026-05-17T17:47:38.17Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/ff/bd/68063442838f1ba68ec72b5436430bc75b3bb17a1a3c3063f09b0c05ae2b/ast_serialize-0.5.0-cp314-cp314t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:6848f2a093fb5548751a9a09bff8fcd229e2bbeb0e3331f391b6ae6d26cd9903", size = 1254081, upload-time = "2026-05-17T17:47:39.826Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/50/e2/1e520793bc6a4e4524a6ab022391e827825eaa0c3811828bfdc6852eca26/ast_serialize-0.5.0-cp314-cp314t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:832d4c998e0b091fd60a6d6bceee535483c4d490de9ba85003af835225719261", size = 1259910, upload-time = "2026-05-17T17:47:41.369Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/4e/e1/49b60f467979979cfe6913b43948ff25bca971ad0591d181812f163a988e/ast_serialize-0.5.0-cp314-cp314t-manylinux_2_31_riscv64.whl", hash = "sha256:16db7c62ec0b8efe1d7afd283a388d8f74f2605d56032e5a37747d2de8dba027", size = 1250678, upload-time = "2026-05-17T17:47:43.702Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/74/ba/66ab9555de6275677566f6574e5ef6c29cb185ea866f643bc06f8280a8ee/ast_serialize-0.5.0-cp314-cp314t-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:baf5eb061eb5bccade4128ad42da33787d72f6013809cd1b590376ece8b3c937", size = 1301603, upload-time = "2026-05-17T17:47:46.256Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/66/42/6aca9b9abc710014b2be9059689e5dd1679339e78f567ffb4d255a9e2050/ast_serialize-0.5.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:104e4a35bd7c124173c41760ef9aaea17ddb3f86c65cb643671d59afbe3ee94c", size = 1410332, upload-time = "2026-05-17T17:47:47.899Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/47/68/2f76594432a22581ecf878b5e75a9b8601c24b2241cf0bbeb1e21fcf370c/ast_serialize-0.5.0-cp314-cp314t-musllinux_1_2_armv7l.whl", hash = "sha256:36be371028fc1675acb38a331bde160dbab7ff907fdf00b67eb6911aa106951b", size = 1509979, upload-time = "2026-05-17T17:47:50.942Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/40/ac/a93c9b58292653f6c595752f677a08e608f903b710594909e9231a389b3b/ast_serialize-0.5.0-cp314-cp314t-musllinux_1_2_i686.whl", hash = "sha256:061ee58bdb52341c8201a6df41182a977736bae3b7ded87ca7176ca25a8a47ab", size = 1505002, upload-time = "2026-05-17T17:47:54.093Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/14/2e/b278f68c497ee2f1d1576cbbef8db5281cd4a5f2db040537592ac9c8862e/ast_serialize-0.5.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:b15219e9cdc9f53f6f4cb51c009203507228226148c05c5e8fe451c28b435eb3", size = 1456231, upload-time = "2026-05-17T17:47:56.311Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/0b/43/419be1c566a4c504cd8fd60ce2f84e790f295495c0f327cfaeadf3d51012/ast_serialize-0.5.0-cp314-cp314t-win32.whl", hash = "sha256:842d1c004bb466c7df036f95fabef789570541922b10976b12f5592a69cf0b38", size = 1058668, upload-time = "2026-05-17T17:47:58.305Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/03/6f/c9d4d549295ed05111aeb8853232d1afd9d0a179fddb01eeffbb3a4a6842/ast_serialize-0.5.0-cp314-cp314t-win_amd64.whl", hash = "sha256:b0c06d760909b095cc466356dfccd05a1c7233a6ca191c020dca2c6a6f16c24c", size = 1101075, upload-time = "2026-05-17T17:48:00.35Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/d0/8e/d00c5ab30c58222e07d62956fca86c59d91b9ad32997e633c38b526623a3/ast_serialize-0.5.0-cp314-cp314t-win_arm64.whl", hash = "sha256:787baedb0262cc49e8ce37cc15c00ae818e46a165a3b36f5e21ed174998104cb", size = 1075347, upload-time = "2026-05-17T17:48:01.753Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/e0/9e/dc2530acb3a60dc6e46d65abf27d1d9f86721694757906a148d90a6860de/ast_serialize-0.5.0-cp39-abi3-macosx_10_12_x86_64.whl", hash = "sha256:0668aa9459cfa8c9c49ddd2163ebcf43088ba045ef7492af6fe22e0098303101", size = 1191380, upload-time = "2026-05-17T17:48:03.738Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/26/0a/bd3d18a582f273d6c843d16bb9e22e9e16365ff7991e92f18f798e9f1224/ast_serialize-0.5.0-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:bf683d6363edf2b39eed6b6d4fe22d34b6203867a67e27134d9e2a2680c4bc4a", size = 1183879, upload-time = "2026-05-17T17:48:05.463Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/40/ae/1f919100f8620887af58fcc381c61a1f218cdf89c6e155f87b213e61010a/ast_serialize-0.5.0-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9cc22cf0c9be65e71cf88fda130af60d61eb4a79370ad4cfe7900d48a4aa2211", size = 1244529, upload-time = "2026-05-17T17:48:07.008Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/c6/ca/6376559dcce707cdbc1d0d9a13c8d3baaaa501e949ce0ebdc4230cd881aa/ast_serialize-0.5.0-cp39-abi3-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:f66173891548c9f2726bf27957b41cabce12fa679dc6da505ddbde4d4b3b31cf", size = 1240560, upload-time = "2026-05-17T17:48:08.46Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/35/b2/a620e206b5aeb7efbf2710336df57d457cffbb3991076bbcc1147ef9abd4/ast_serialize-0.5.0-cp39-abi3-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e42d729ef2be96a14efbad355093284739e3670ece3e534f82cc8832790911d9", size = 1451172, upload-time = "2026-05-17T17:48:09.922Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/fa/e0/4ad5c04c24a40481b2935ce9a0ccdb6023dc8b667167d06ae530cc3512f2/ast_serialize-0.5.0-cp39-abi3-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b725026bafa801dbd7310eb13a75f0a2e370e7e51b2cb225f9d21fcfadf919ee", size = 1265072, upload-time = "2026-05-17T17:48:11.469Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/b2/71/4d1d479aa56d0101c40e17720c3d6ac2af7269ea0487a80b18e7bfd1a5b7/ast_serialize-0.5.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b54f60c1d78767a53b67eaa663f0dfac3afe606aa07f1301572f588b73d64809", size = 1270488, upload-time = "2026-05-17T17:48:13.575Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/6d/4f/0de1bbe06f6edef9fde4ed12ca8e7b3ec7e6e2bd4e672c5af487f7957665/ast_serialize-0.5.0-cp39-abi3-manylinux_2_31_riscv64.whl", hash = "sha256:27d51654fc240a1e87e742d353d98eb45b75f62f129086b3596ab53df2ac2a43", size = 1260702, upload-time = "2026-05-17T17:48:15.141Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/75/61/e00872439cfdddcc3c1b6cdaa6e5d904ba8e26a18807c67c4e14409d0ca8/ast_serialize-0.5.0-cp39-abi3-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:2782c36237c46dd1674542f2109740ea5ea485a169bf1431939ada0434e17934", size = 1311182, upload-time = "2026-05-17T17:48:16.779Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/76/8e/699a5b955f7926956c95e9e1d74132acad73c2fe7a426f94da89123c20aa/ast_serialize-0.5.0-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:1943db345233cc7194a470f13afa9c59772c0b123dea0c9414c4d4ca54369759", size = 1421410, upload-time = "2026-05-17T17:48:18.527Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/a9/ae/d5b7626874478997adc7a29ab28accf21e596fb590c944290401dfd0b29e/ast_serialize-0.5.0-cp39-abi3-musllinux_1_2_armv7l.whl", hash = "sha256:df1c00022cbbcb064bfaa505aa9c9295362443ce5dacb459d1331d3da353f887", size = 1516587, upload-time = "2026-05-17T17:48:20.133Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/0c/ce/b59e02a82d9c4244d64cde502e0b00e83e38816abe19155ceb5437402c7f/ast_serialize-0.5.0-cp39-abi3-musllinux_1_2_i686.whl", hash = "sha256:cae65289fc456fde04af979a2be09302ef5d8ab92ef23e596d6746dc267ada27", size = 1515171, upload-time = "2026-05-17T17:48:21.921Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/8b/38/d8d90042747d05aa08d4efcf1c99035a5f670a6bf4c214d31644392afbca/ast_serialize-0.5.0-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:239a4c354e8d676e9d94631d1d4a64edc6b266f86ff3a5a80aedd344f342c01d", size = 1464668, upload-time = "2026-05-17T17:48:23.544Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/dd/51/5b840c4df7334104cecffa28f23904fe81ca89ca223d2450e288de39fd3c/ast_serialize-0.5.0-cp39-abi3-win32.whl", hash = "sha256:143a4ef63285a075871908fda3672dc21864b83a8ec3ee12304aa3e4c5387b9a", size = 1068311, upload-time = "2026-05-17T17:48:25.027Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/41/11/ca5672c7d491825bc4cd6702dea106a6b60d928707712ec257c7833ae476/ast_serialize-0.5.0-cp39-abi3-win_amd64.whl", hash = "sha256:cf25572c526add400f26a4750dc6ce0c3bb93fc1f75e7ae0cad4ce4f2cd5c590", size = 1108931, upload-time = "2026-05-17T17:48:26.591Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/45/19/cc8bd127d28a43da249aa955cfd164cf8fd534e79e42cea96c4854d72fd0/ast_serialize-0.5.0-cp39-abi3-win_arm64.whl", hash = "sha256:92a31c9c20d25a076edaeec76b128a3535d74a24f340b9a8a7e96c9b86dc9642", size = 1081181, upload-time = "2026-05-17T17:48:28.122Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "kreuzberg"
|
||||
version = "5.0.0rc1"
|
||||
source = { editable = "." }
|
||||
|
||||
[package.dev-dependencies]
|
||||
dev = [
|
||||
{ name = "mypy" },
|
||||
{ name = "ruff" },
|
||||
]
|
||||
|
||||
[package.metadata]
|
||||
|
||||
[package.metadata.requires-dev]
|
||||
dev = [
|
||||
{ name = "mypy", specifier = ">=1.19" },
|
||||
{ name = "ruff", specifier = ">=0.14.8" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "librt"
|
||||
version = "0.11.0"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/40/08/9e7f6b5d2b5bed6ad055cdd5925f192bb403a51280f86b56554d9d0699a2/librt-0.11.0.tar.gz", hash = "sha256:075dc3ef4458a278e0195cbf6ac9d38808d9b906c5a6c7f7f79c3888276a3fb1", size = 200139, upload-time = "2026-05-10T18:17:25.138Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/83/10/37fd9e9ba96cb0bd742dfb20fc3d082e54bdbec759d7300df927f360ef07/librt-0.11.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:6e94ebfcfa2d5e9926d6c3b9aa4617ffc42a845b4321fb84021b872358c82a0f", size = 141706, upload-time = "2026-05-10T18:15:16.129Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/cf/72/1b1466f358e4a0b728051f69bc27e67b432c6eaa2e05b88db49d3785ae0d/librt-0.11.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:ae627397a2f351560440d872d6f7c8dbb4072e57868e7b2fc5b8b430fe489d45", size = 142605, upload-time = "2026-05-10T18:15:18.148Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/ca/85/ed26dd2f6bc9a0baf48306433e579e8d354d70b2bcb78134ed950a5d0e1e/librt-0.11.0-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:dc329359321b67d24efdf4bc69012b0597001649544db662c001db5a0184794c", size = 476555, upload-time = "2026-05-10T18:15:19.569Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/66/fe/11891191c0e0a3fd617724e891f6e67a71a7658974a892b9a9a97fdb2977/librt-0.11.0-cp310-cp310-manylinux2014_i686.manylinux_2_17_i686.manylinux_2_28_i686.whl", hash = "sha256:7e82e642ab0f7608ce2fe53d76ca2280a9ee33a1b06556142c7c6fe80a86fc33", size = 468434, upload-time = "2026-05-10T18:15:20.87Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/6f/50/5ec949d7f9ce1a07af903aa3e13abb98b717923bdead6e719b2f824ccc07/librt-0.11.0-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:88145c15c67731d54283d135b03244028c750cc9edc334a96a4f5950ebdb2884", size = 496918, upload-time = "2026-05-10T18:15:22.616Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/ea/c4/177336c7524e34875a38bf668e88b193a6723a4eb4045d07f74df6e1506c/librt-0.11.0-cp310-cp310-manylinux_2_34_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:9d36a51b3d93320b686588e27123f4995804dbf1bce81df78c02fc3c6eea9280", size = 490334, upload-time = "2026-05-10T18:15:24.2Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/13/1f/da3112f7569eda3b49f9a2629bae1fe059812b6085df16c885f6454dff49/librt-0.11.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:d00f3ac06a2a8b246327f11e186a53a100a4d5c7ed52346367e5ec751d51586c", size = 511287, upload-time = "2026-05-10T18:15:26.226Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/fa/94/03fec301522e172d105581431223be56b27594ff46440ebfbb658a3735d5/librt-0.11.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:461bbceede621f1ffb8839755f8663e886087ee7af16294cab7fb4d782c62eeb", size = 517202, upload-time = "2026-05-10T18:15:27.965Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/b7/6e/339f6e5a7b413ce014f1917a756dae630fe59cc99f34153205b1cb540901/librt-0.11.0-cp310-cp310-musllinux_1_2_riscv64.whl", hash = "sha256:0cad8a4d6a8ff03c9b76f9414caccd78e7cfbc8a2e12fa334d8e1d9932753783", size = 497517, upload-time = "2026-05-10T18:15:29.614Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/cd/43/acdd5ce317cb46e8253ca9bfbdb8b12e68a24d745949336a7f3d5fb79ba0/librt-0.11.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:f37aa505b3cf60701562eddb32df74b12a9e380c207fd8b06dd157a943ac7ea0", size = 538878, upload-time = "2026-05-10T18:15:30.928Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/29/b5/7a25bb12e3172839f647f196b3e988318b7bb1ca7501732a225c4dce2ec0/librt-0.11.0-cp310-cp310-win32.whl", hash = "sha256:94663a21534637f0e787ec2a2a756022df6e5b7b2335a5cdd7d8e33d68a2af89", size = 100070, upload-time = "2026-05-10T18:15:32.551Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/c6/0d/ebbcf4d77999c02c937b05d2b90ff4cd4dcc7e9a365ba132329ac1fe7a0f/librt-0.11.0-cp310-cp310-win_amd64.whl", hash = "sha256:dec7db73758c2b54953fd8b7fe348c45188fe26b39ee18446196edd08453a5d4", size = 117918, upload-time = "2026-05-10T18:15:33.678Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/fe/87/2bf31fe17587b29e3f93ec31421e2b1e1c3e349b8bf6c7c313dbad1d5340/librt-0.11.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:93d95bd45b7d58343d8b90d904450a545144eec19a002511163426f8ab1fae29", size = 141092, upload-time = "2026-05-10T18:15:34.795Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/cf/08/5c5bf772920b7ebac6e32bc91a643e0ab3870199c0b542356d3baa83970a/librt-0.11.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:4ee278c769a713638cdacd4c0436d72156e75df3ebc0166ab2b9dc43acc386c9", size = 142035, upload-time = "2026-05-10T18:15:36.242Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/06/20/662a03d254e5b000d838e8b345d83303ddb768c080fd488e40634c0fa66b/librt-0.11.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f230cb1cbc9faaa616f9a678f530ebcf186e414b6bcbd88b960e4ba1b92428d5", size = 475022, upload-time = "2026-05-10T18:15:37.56Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/de/f3/aa81523e45184c6ec23dc7f63263362ec55f80a09d424c012359ecbe7e35/librt-0.11.0-cp311-cp311-manylinux2014_i686.manylinux_2_17_i686.manylinux_2_28_i686.whl", hash = "sha256:5d63c855d86938d9de93e265c9bd8c705b51ec494de5738340ee93767a686e4b", size = 467273, upload-time = "2026-05-10T18:15:39.182Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/6b/6f/59c74b560ca8853834d5501d589c8a2519f4184f273a085ffd0f37a1cc47/librt-0.11.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:993f028be9e96a08d31df3479ac80d99be374d17f3b78e4796b3fd3c913d4e89", size = 497083, upload-time = "2026-05-10T18:15:40.634Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/fe/7b/5aa4d2c9600a719401160bf7055417df0b2a47439b9d88286ce45e56b65f/librt-0.11.0-cp311-cp311-manylinux_2_34_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:258d73a0aa66a055e65b2e4d1b8cdb23b9d132c5bb915d9547d804fcaed116cc", size = 489139, upload-time = "2026-05-10T18:15:41.934Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/d6/31/9143803d7da6856a69153785768c4936864430eec0fd9461c3ea527d9922/librt-0.11.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:0827efe7854718f04aaddf6496e96960a956e676fe1d0f04eb41511fd8ad06d5", size = 508442, upload-time = "2026-05-10T18:15:43.206Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/2f/5a/bce08184488426bda4ccc2c4964ac048c8f68ae89bd7120082eef4233cfd/librt-0.11.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:7753e57d6e12d019c0d8786f1c09c709f4c3fcc57c3887b24e36e6c06ec938b7", size = 514230, upload-time = "2026-05-10T18:15:44.761Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/89/8c/bb5e213d254b7505a0e658da199d8ab719086632ce09eef311ab27976523/librt-0.11.0-cp311-cp311-musllinux_1_2_riscv64.whl", hash = "sha256:11bd19822431cc21af9f27374e7ae2e58103c7d98bda823536a6c47f6bb2bb3d", size = 494231, upload-time = "2026-05-10T18:15:46.308Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/9d/fb/541cdad5b1ab1300398c74c4c9a497b88e5074c21b1244c8f49731d3a284/librt-0.11.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:22bdf239b219d3993761a148ffa134b19e52e9989c84f845d5d7b71d70a17412", size = 537585, upload-time = "2026-05-10T18:15:47.629Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/8f/f2/464bb69295c320cb06bddb4f14a4ec67934ee14b2bffb12b19fb7ab287ba/librt-0.11.0-cp311-cp311-win32.whl", hash = "sha256:46c60b61e308eb535fbd6fa622b1ee1bb2815691c1ad9c98bf7b84952ec3bc8d", size = 100509, upload-time = "2026-05-10T18:15:49.157Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/6d/e7/a17ee1788f9e4fbf548c19f4afa07c92089b9e24fef6cb2410863781ef4c/librt-0.11.0-cp311-cp311-win_amd64.whl", hash = "sha256:902e546ff044f579ff1c953ff5fce97b636fe9e3943996b2177710c6ef076f73", size = 118628, upload-time = "2026-05-10T18:15:50.345Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/cc/c7/6c766214f9f9903bcfcfbef97d807af8d8f5aa3502d247858ab17582d212/librt-0.11.0-cp311-cp311-win_arm64.whl", hash = "sha256:65ac3bc20f78aa0ee5ae84baa68917f89fef4af63e941084dd019a0d0e749f0c", size = 103122, upload-time = "2026-05-10T18:15:52.068Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/8b/d0/07c77e067f0838949b43bd89232c29d72efebb9d2801a9750184eb706b71/librt-0.11.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:b87504f1690a23b9a2cca841191a04f83895d4fc2dd04df91d82b1a04ca2ad46", size = 144147, upload-time = "2026-05-10T18:15:53.227Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/7a/24/8493538fa4f62f982686398a5b8f68008138a75086abdea19ade64bf4255/librt-0.11.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40071fc5fe0ce8daa6de616702314a01e1250711682b0523d6ab8d4525910cb3", size = 143614, upload-time = "2026-05-10T18:15:54.657Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/ff/1e/f8bad050810d9171f34a1648ed910e56814c2ba61639f2bd53c6377ae24b/librt-0.11.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:137e79445c896a0ea7b265f52d23954e05b64222ee1af69e2cb34219067cbb67", size = 485538, upload-time = "2026-05-10T18:15:56.117Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/c0/fe/3594ebfbaf03084ba4b120c9ba5c3183fd938a48725e9bbe6ff0a5159ad8/librt-0.11.0-cp312-cp312-manylinux2014_i686.manylinux_2_17_i686.manylinux_2_28_i686.whl", hash = "sha256:cca6644054e78746d8d4ef238681f9c34ff8b584fe6b988ecebb8db3b15e622a", size = 479623, upload-time = "2026-05-10T18:15:57.544Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/b0/da/5d1876984b3746c85dbd219dbfcb73c85f54ee263fd32e5b2a632ec14571/librt-0.11.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d5b0eea49f5562861ee8d757a32ef7d559c1d35be2aaaa1ec28941d74c9ffc8a", size = 513082, upload-time = "2026-05-10T18:15:58.805Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/19/6e/55bdf5d5ca00c3e18430690bf2c953d8d3ffd3c337418173d33dec985dc9/librt-0.11.0-cp312-cp312-manylinux_2_34_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:0d1029d7e1ae1a7e647ed6fb5df8c4ce2dffefb7a9f5fd1376a4554d96dac09f", size = 508105, upload-time = "2026-05-10T18:16:00.2Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/07/10/f1f23a7c595ee90ece4d35c851e5d104b1311a887ed1b4ac4c35bbd13da8/librt-0.11.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:bc3ce6b33c5828d9e80592011a5c584cb2ce86edbc4088405f70da47dc1d1b3b", size = 522268, upload-time = "2026-05-10T18:16:01.708Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/b6/02/5720f5697a7f54b78b3aefbe20df3a48cedcff1276618c4aa481177942ed/librt-0.11.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:936c5995f3514a42111f20099397d8177c79b4d7e70961e396c6f5a0a3566766", size = 527348, upload-time = "2026-05-10T18:16:03.496Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/50/db/b4a47c6f91db4ff76348a0b3dd0cc65e090a078b765a810a62ff9434c3d3/librt-0.11.0-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:9bc0ca6ad9381cbe8e4aa6e5726e4c80c78115a6e9723c599ed1d73e092bc49d", size = 516294, upload-time = "2026-05-10T18:16:05.173Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/9e/58/9384b2f4eb1ed1d273d40948a7c5c4b2360213b402ef3be4641c06299f9c/librt-0.11.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:070aa8c26c0a74774317a72df8851facc7f0f012a5b406557ac56992d92e1ec8", size = 553608, upload-time = "2026-05-10T18:16:06.839Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/21/7b/5aa8848a7c6a9278c79375146da1812e695754ceec5f005e6043461a7315/librt-0.11.0-cp312-cp312-win32.whl", hash = "sha256:6bf14feb84b05ae945277395451998c89c54d0def4070eb5c08de544930b245a", size = 101879, upload-time = "2026-05-10T18:16:08.103Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/37/33/8a745436944947575b584231750a41417de1a38cf6a2e9251d1065651c09/librt-0.11.0-cp312-cp312-win_amd64.whl", hash = "sha256:75672f0bc524ede266287d532d7923dbce94c7514ad07627bac3d0c6d92cc4d9", size = 119831, upload-time = "2026-05-10T18:16:09.174Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/59/67/a6739ac96e28b7855808bdb0370e250606104a859750d209e5a0716fe7ab/librt-0.11.0-cp312-cp312-win_arm64.whl", hash = "sha256:2f10cf143e4a9bb0f4f5af568a00df94a2d69ef41c2579584454bb0fe5cc642c", size = 103470, upload-time = "2026-05-10T18:16:10.369Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/82/61/e59168d4d0bf2bf90f4f0caf7a001bfc60254c3af4586013b04dc3ef517b/librt-0.11.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:78dc31f7fdfe9c9d0eb0e8f42d139db230e826415bbcabd9f0e9faaaee909894", size = 144119, upload-time = "2026-05-10T18:16:11.771Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/61/fd/caa1d60b12f7dd79ccea23054e06eeaebe266a5f52c40a6b651069200ce5/librt-0.11.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:fa475675db22290c3158e1d42326d0f5a65f04f44a0e68c3630a25b53560fb9c", size = 143565, upload-time = "2026-05-10T18:16:13.334Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/b8/a9/dc744f5c2b4978d48db970be29f22716d3413d28b14ad99740817315cf2c/librt-0.11.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:621db29691044bdeda22e789e482e1b0f3a985d90e3426c9c6d17606416205ea", size = 485395, upload-time = "2026-05-10T18:16:14.729Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/8f/21/7f8e97a1e4dae952a5a95948f6f8507a173bc1e669f54340bba6ca1ca31b/librt-0.11.0-cp313-cp313-manylinux2014_i686.manylinux_2_17_i686.manylinux_2_28_i686.whl", hash = "sha256:a9010e2ed5b3a9e158c5fd966b3ab7e834bb3d3aacc8f66c91dd4b57a3799230", size = 479383, upload-time = "2026-05-10T18:16:16.321Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/a6/6d/d8ee9c114bebf2c50e29ec2aa940826fccb62a645c3e4c18760987d0e16d/librt-0.11.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:7c39513d8b7477a2e1ed8c43fc21c524e8d5a0f8d4e8b7b074dbdbe7820a08e2", size = 513010, upload-time = "2026-05-10T18:16:17.647Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/f0/43/0b5708af2bd30a46400e72ba6bdaa8f066f15fb9a688527e34220e8d6c06/librt-0.11.0-cp313-cp313-manylinux_2_34_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:7aef3cf1d5af86e770ab04bfd993dfc4ae8b8c17f66fb77dd4a7d50de7bbb1a3", size = 508433, upload-time = "2026-05-10T18:16:19.309Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/4a/50/356187247d09013490481033183b3532b58acf8028bcb34b2b56a375c9b2/librt-0.11.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:557183ddc36babe46b27dd60facbd5adb4492181a5be887587d57cda6e092f21", size = 522595, upload-time = "2026-05-10T18:16:20.642Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/40/e7/c6ac4240899c7f3248079d5a9900debe0dadb3fdeaf856684c987105ba47/librt-0.11.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:83d3e1f72bd42f6c5c0b7daec530c3f829bd02db42c70b8ddf0c2d90a2459930", size = 527255, upload-time = "2026-05-10T18:16:22.352Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/eb/b5/a81322dbeedeeaf9c1ee6f001734d28a09d8383ac9e6779bc24bbd0743c6/librt-0.11.0-cp313-cp313-musllinux_1_2_riscv64.whl", hash = "sha256:4ce1f21fbe589bc1afd7872dece84fb0e1144f794a288e58a10d2c54a55c43be", size = 516847, upload-time = "2026-05-10T18:16:23.627Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/ae/66/6e6323787d592b55204a42595ff1102da5115601b53a7e9ddebc889a6da5/librt-0.11.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:970b09f7044ea2b64c9da42fd3d335666518cfd1c6e8a182c95da73d0214b41e", size = 553920, upload-time = "2026-05-10T18:16:25.025Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/9c/21/623f8ca230857102066d9ca8c6c1734995908c4d0d1bee7bb2ef0021cb33/librt-0.11.0-cp313-cp313-win32.whl", hash = "sha256:78fddc31cd4d3caa897ad5d31f856b1faadc9474021ad6cb182b9018793e254e", size = 101898, upload-time = "2026-05-10T18:16:26.649Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/b3/1d/b4ebd44dd723f768469007515cb92251e0ae286c94c140f374801140fa74/librt-0.11.0-cp313-cp313-win_amd64.whl", hash = "sha256:8ca8aa88751a775870b764e93bad5135385f563cb8dcee399abf034ea4d3cb47", size = 119812, upload-time = "2026-05-10T18:16:27.859Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/3b/e4/b2f4ca7965ca373b491cdb4bc25cdb30c1649ca81a8782056a83850292a9/librt-0.11.0-cp313-cp313-win_arm64.whl", hash = "sha256:96f044bb325fd9cf1a723015638c219e9143f0dfbc0ca54c565df2b7fc748b44", size = 103448, upload-time = "2026-05-10T18:16:29.066Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/29/eb/dbce197da4e227779e56b5735f2decc3eb36e55a1cdbf1bd65d6639d76c1/librt-0.11.0-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:4a017a95e5837dc15a8c5661d60e05daa96b90908b1aa6b7acdf443cd25c8ebd", size = 143345, upload-time = "2026-05-10T18:16:30.674Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/76/a3/254bebd0c11c8ba684018efb8006ff22e466abce445215cca6c778e7d9de/librt-0.11.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:b1ecbd9819deccc39b7542bf4d2a740d8a620694d39989e58661d3763458f8d4", size = 143131, upload-time = "2026-05-10T18:16:32.037Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/f1/3f/f77d6122d21ac7bf6ae8a7dfced1bd2a7ac545d3273ebdcaf8042f6d619f/librt-0.11.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:7da327dacd7be8f8ec36547373550744a3cc0e536d54665cd83f8bcd961200e8", size = 477024, upload-time = "2026-05-10T18:16:33.493Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/ac/0a/2c996dadebaa7d9bbbd43ef2d4f3e66b6da545f838a41694ef6172cebec8/librt-0.11.0-cp314-cp314-manylinux2014_i686.manylinux_2_17_i686.manylinux_2_28_i686.whl", hash = "sha256:0dc56b1f8d06e60db362cc3fdae206681817f86ce4725d34511473487f12a34b", size = 474221, upload-time = "2026-05-10T18:16:34.864Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/0a/7e/f5d92af8486b8272c23b3e686b46ff72d89c8169585eb61eef01a2ac7147/librt-0.11.0-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:05fb8fb2ab90e21c8d12ea240d744ad514da9baf381ebfa70d91d20d21713175", size = 505174, upload-time = "2026-05-10T18:16:36.705Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/af/1a/cb0734fe86398eb33193ab753b7326255c74cac5eb09e76b9b16536e7adb/librt-0.11.0-cp314-cp314-manylinux_2_34_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:cae74872be221df4374d10fec61f93ed1513b9546ea84f2c0bf73ab3e9bd0b03", size = 497216, upload-time = "2026-05-10T18:16:38.418Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/18/06/094820f91558b66e29943c0ec41c9914f460f48dd51fc503c3101e10842d/librt-0.11.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:32bcc918c0148eb7e3d57385125bac7e5f9e4359d05f07448b09f6f778c2f31c", size = 513921, upload-time = "2026-05-10T18:16:39.848Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/0b/c2/00de9018871a282f530cacb457d5ec0428f6ac7e6fedde9aff7468d9fb04/librt-0.11.0-cp314-cp314-musllinux_1_2_i686.whl", hash = "sha256:f9743fc99135d5f78d2454435615f6dec0473ca507c26ce9d92b10b562a280d3", size = 520850, upload-time = "2026-05-10T18:16:41.471Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/51/9d/64631832348fd1834fb3a61b996434edddaaf25a31d03b0a76273159d2cf/librt-0.11.0-cp314-cp314-musllinux_1_2_riscv64.whl", hash = "sha256:5ba067f4aadae8fda802d91d2124c90c42195ff32d9161d3549e6d05cfe26f96", size = 504237, upload-time = "2026-05-10T18:16:43.15Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/a5/ec/ae5525eb16edc827a044e7bb8777a455ff95d4bca9379e7e6bddd7383647/librt-0.11.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:de3bf945454d032f9e390b85c4072e0a0570bf825421c8be0e71209fa65e1abe", size = 546261, upload-time = "2026-05-10T18:16:44.408Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/5a/09/adce371f27ca039411da9659f7430fcc2ba6cd0c7b3e4467a0f091be7fa9/librt-0.11.0-cp314-cp314-win32.whl", hash = "sha256:d2277a05f6dcb9fd13db9566aac4fabd68c3ea1ea46ee5567d4eef8efa495a2f", size = 96965, upload-time = "2026-05-10T18:16:46.039Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/d6/ee/8ac720d98548f173c7ce2e632a7ca94673f74cacd5c8162a84af5b35958a/librt-0.11.0-cp314-cp314-win_amd64.whl", hash = "sha256:ab73e8db5e3f564d812c1f5c3a175930a5f9bc96ccb5e3b22a34d7858b401cf7", size = 115151, upload-time = "2026-05-10T18:16:47.133Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/94/20/c900cf14efeb09b6bef2b2dff20779f73464b97fd58d1c6bccc379588ae3/librt-0.11.0-cp314-cp314-win_arm64.whl", hash = "sha256:aea3caa317752e3a466fa8af45d91ee0ea8c7fdd96e42b0a8dd9b76a7931eba1", size = 98850, upload-time = "2026-05-10T18:16:48.597Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/0c/71/944bfe4b64e12abffcd3c15e1cce07f72f3d55655083786285f4dedeb532/librt-0.11.0-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:d1b36540d7aaf9b9101b3a6f376c8d8e9f7a9aec93ed05918f2c69d493ffef72", size = 151138, upload-time = "2026-05-10T18:16:49.839Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/b6/10/99e64a5c86989357fda078c8143c533389585f6473b7439172dd8f3b3b2d/librt-0.11.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:efbb343ab2ce3540f4ecbe6315d677ed70f37cd9a72b1e58066c918ca83acbaa", size = 151976, upload-time = "2026-05-10T18:16:51.062Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/21/31/5072ad880946d83e5ea4147d6d018c78eefce85b77819b19bdd0ee229435/librt-0.11.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:aa0dd688aab3f7914d3e6e5e3554978e0383312fb8e771d84be008a35b9ee548", size = 557927, upload-time = "2026-05-10T18:16:52.632Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/5e/8d/70b5fb7cfbab60edbe7381614ab985da58e144fbf465c86d44c95f43cdca/librt-0.11.0-cp314-cp314t-manylinux2014_i686.manylinux_2_17_i686.manylinux_2_28_i686.whl", hash = "sha256:f5fb36b8c6c63fdcbb1d526d94c0d1331610d43f4118cc1beb4efef4f3faacb2", size = 539698, upload-time = "2026-05-10T18:16:53.934Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/fa/a3/ba3495a0b3edbd24a4cae0d1d3c64f39a9fc45d06e812101289b50c1a619/librt-0.11.0-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4a9a237d13addb93715b6fee74023d5ee3469b53fce527626c0e088aa585805f", size = 577162, upload-time = "2026-05-10T18:16:55.589Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/f7/db/36e25fb81f99937ff1b96612a1dc9fd66f039cb9cc3aee12c01fac31aab9/librt-0.11.0-cp314-cp314t-manylinux_2_34_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:5ddd17bd87b2c56ddd60e546a7984a2e64c4e8eab92fb4cf3830a48ad5469d51", size = 566494, upload-time = "2026-05-10T18:16:56.975Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/33/0d/3f622b47f0b013eeb9cf4cc07ae9bfe378d832a4eec998b2b209fe84244d/librt-0.11.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:bd43992b4473d42f12ff9e68326079f0696d9d4e6000e8f39a0238d482ba6ee2", size = 596858, upload-time = "2026-05-10T18:16:58.374Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/a9/02/71b90bc93039c46a2000651f6ad60122b114c8f54c4ad306e0e96f5b75ad/librt-0.11.0-cp314-cp314t-musllinux_1_2_i686.whl", hash = "sha256:f8e3e8056dd674e279741485e2e512d6e9a751c7455809d0114e6ebf8d781085", size = 590318, upload-time = "2026-05-10T18:16:59.676Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/04/04/418cb3f75621e2b761fb1ab0f017f4d70a1a72a6e7c74ee4f7e8d198c2f3/librt-0.11.0-cp314-cp314t-musllinux_1_2_riscv64.whl", hash = "sha256:c1f708d8ae9c56cf38a903c44297243d2ec83fd82b396b977e0144a3e76217e3", size = 575115, upload-time = "2026-05-10T18:17:01.007Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/cc/2c/5a2183ac58dd911f26b5d7e7d7d8f1d87fcecdddd99d6c12169a258ff62c/librt-0.11.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:0add982e0e7b9fc14cf4b33789d5f13f66581889b88c2f58099f6ce8f92617bd", size = 617918, upload-time = "2026-05-10T18:17:02.682Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/15/1f/dc6771a52592a4451be6effa200cbfc9cec61e4393d3033d81a9d307961d/librt-0.11.0-cp314-cp314t-win32.whl", hash = "sha256:2b481d846ac894c4e8403c5fd0e87c5d11d6499e404b474602508a224ff531c8", size = 103562, upload-time = "2026-05-10T18:17:03.99Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/62/4a/7d1415567027286a75ba1093ec4aca11f073e0f559c530cf3e0a757ad55c/librt-0.11.0-cp314-cp314t-win_amd64.whl", hash = "sha256:28edb433edde181112a908c78907af28f964eabc15f4dd16c9d66c834302677c", size = 124327, upload-time = "2026-05-10T18:17:05.465Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/ce/62/b40b382fa0c66fee1478073eb8db352a4a6beda4a1adccf1df911d8c289c/librt-0.11.0-cp314-cp314t-win_arm64.whl", hash = "sha256:dee008f20b542e3cd162ba338a7f9ec0f6d23d395f66fe8aeeec3c9d067ea253", size = 102572, upload-time = "2026-05-10T18:17:06.809Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "mypy"
|
||||
version = "2.1.0"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
dependencies = [
|
||||
{ name = "ast-serialize" },
|
||||
{ name = "librt", marker = "platform_python_implementation != 'PyPy'" },
|
||||
{ name = "mypy-extensions" },
|
||||
{ name = "pathspec" },
|
||||
{ name = "tomli", marker = "python_full_version < '3.11'" },
|
||||
{ name = "typing-extensions" },
|
||||
]
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/82/15/cca9d88503549ed6fedeaa1d448cdddd542ee8a490232d732e278036fbf2/mypy-2.1.0.tar.gz", hash = "sha256:81e76ad12c2d804512e9b13240d1588316531bfba07558286078bfbce9613633", size = 3898359, upload-time = "2026-05-11T18:37:36.237Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/a4/71/d351dca3e9b30da2328ee9d445c88b8388072808ebfbc49eb69d30b67749/mypy-2.1.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:11a6beb180257a805961aea9ec591bbd0bd17f1e18d35b8456d57aee5bedfedc", size = 14778792, upload-time = "2026-05-11T18:36:23.605Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/2f/45/7d51594b644c17c0bcf74ed8cd5fc33b324276d708e8506f220b70dab9d9/mypy-2.1.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:8ef78c1d306bbf9a8a12f526c44902c9c28dffd6c52c52bf6a72641ce18d3849", size = 13645739, upload-time = "2026-05-11T18:37:22.752Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/65/01/455c31b170e9468265074840bf18863a8482a24103fdaabe4e199392aa5f/mypy-2.1.0-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c209a90853081ff01d01ee895cafe10f7db1474e0d95beaeef0f6c1db9119bbd", size = 14074199, upload-time = "2026-05-11T18:35:09.292Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/41/5a/93093f0b29a9e982deafde698f740a2eb2e05886e79ccf0594c7fd5413a3/mypy-2.1.0-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:47cebf61abde7c088a4e27718a8b13a81655686b2e9c251f5c0915a802248166", size = 14953128, upload-time = "2026-05-11T18:31:57.678Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/7f/2f/a196f5331d96170ad3d28f144d2aba690d4b2911381f68d51e489c7ab82a/mypy-2.1.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:d57a90ae5e872138a425ec328edbc9b235d1934c4377881a33ec05b341acc9a8", size = 15249378, upload-time = "2026-05-11T18:33:00.101Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/54/de/94d321cc12da9f71341ac0c270efbed5c725750c7b4c334d957de9a087d9/mypy-2.1.0-cp310-cp310-win_amd64.whl", hash = "sha256:aea7f7a8a55b459c34275fc468ada6ca7c173a5e43a68f5dbe588a563d8a06b8", size = 11060994, upload-time = "2026-05-11T18:33:18.848Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/e1/62/0c27ca55219a7c764a7fb88c7bb2b7b2f9780ade8bbf16bc8ed8400eef6b/mypy-2.1.0-cp310-cp310-win_arm64.whl", hash = "sha256:c989640253f0d76843e9c6c1bbf4bd48c5e85ada61bde4beb37cb3eca035685e", size = 9976743, upload-time = "2026-05-11T18:31:25.554Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/0a/a1/639f3024794a2a15899cb90707fe02e044c4412794c39c5769fd3df2e2ef/mypy-2.1.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:a683016b16fe2f572dc04c72be7ee0504ac1605a265d0200f5cea695fb788f41", size = 14691685, upload-time = "2026-05-11T18:33:27.973Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/3b/08/9a585dea4325f20d8b80dc78623fa50d1fd2173b710f6237afd6ba6ab39b/mypy-2.1.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:1a293c534adb55271fef24a26da04b855540a8c13cc07bc5917b9fd2c394f2ca", size = 13555165, upload-time = "2026-05-11T18:32:16.107Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/81/dc/7c42cc9c6cb01e8eb09961f1f738741d3e9c7e9d5c5b30ec69222625cd5f/mypy-2.1.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:7406f4d048e71e576f5356d317e5b0a9e666dfd966bd99f9d14ca06e1a341538", size = 13994376, upload-time = "2026-05-11T18:32:39.256Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/d4/fa/285946c33bce716e082c11dfeee9ee196eaf1f5042efb3581a31f9f205e4/mypy-2.1.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:e0210d626fc8b31ccc90233754c7bc90e1f43205e85d96387f7db1285b55c398", size = 14864618, upload-time = "2026-05-11T18:34:49.765Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/2b/83/82397f48af6c27e295d57979ded8490c9829040152cf7571b2f026aeb9a0/mypy-2.1.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:3712c20deed54e814eaaa825603bada8ea1c390670a397c95b98405347acc563", size = 15102063, upload-time = "2026-05-11T18:34:05.855Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/40/68/b02dec39057b88eb03dc0aa854732e26e8361f34f9d0e20c7614967d1eba/mypy-2.1.0-cp311-cp311-win_amd64.whl", hash = "sha256:fcaa0e479066e31f7cceb6a3bea39cb22b2ff51a6b2f24f193d19179ba17c389", size = 11060564, upload-time = "2026-05-11T18:35:36.494Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/cf/a8/ea3dcbef31f99b634f2ee23bb0321cbc8c1b388b76a861eb849f13c347dc/mypy-2.1.0-cp311-cp311-win_arm64.whl", hash = "sha256:0b1a5260c95aa443083f9ed3592662941951bca3d4ca224a5dc517c38b7cf666", size = 9966983, upload-time = "2026-05-11T18:37:14.139Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/95/b1/55861beb5c339b44f9a2ba92df9e2cb1eeb4ae1eee674cdf7772c797778b/mypy-2.1.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:244358bf1c0da7722230bce60683d52e8e9fd030554926f15b747a84efb5b3af", size = 14874381, upload-time = "2026-05-11T18:37:31.784Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/0b/b3/b7f770114b7d0ac92d0f76e8d93c2780844a70488a90e91821927850da86/mypy-2.1.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:4ec7c57657493c7a75534df2751c8ae2cda383c16ecc55d2106c54476b1b16f6", size = 13665501, upload-time = "2026-05-11T18:34:23.063Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/b6/f3/8ae2037967e2126689a0c11d99e2b707134a565191e92c60ca2572aec60a/mypy-2.1.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d8161b6ff4392410023224f0969d17db93e1e154bc3e4ba62598e720723ae211", size = 14045750, upload-time = "2026-05-11T18:31:48.151Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/a0/32/615eb5911859e43d054941b0d0a7d06cfa2870eba86529cf385b052b111c/mypy-2.1.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:bf03e12003084a67395184d3eb8cbd6a489dc3655b5664b28c210a9e2403ab0b", size = 15061630, upload-time = "2026-05-11T18:37:06.898Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/d4/03/4eafbfff8bfab1b87082741eae6e6a624028c984e6708b73bce2a8570c9d/mypy-2.1.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:20509760fd791c51579d573153407d226385ec1f8bcce55d730b354f3336bc22", size = 15288831, upload-time = "2026-05-11T18:31:18.07Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/99/ee/919661478e5891a3c96e549c036e467e64563ab85995b10c53c8358e16a3/mypy-2.1.0-cp312-cp312-win_amd64.whl", hash = "sha256:6753d0c1fdd6b1a23b9e4f283ce80b2153b724adcb2653b20b85a8a28ac6436b", size = 11135228, upload-time = "2026-05-11T18:34:31.23Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/24/0a/6a12b9782ca0831a553192f351679f4548abc9d19a7cc93bb7feb02084c7/mypy-2.1.0-cp312-cp312-win_arm64.whl", hash = "sha256:98ebb6589bb3b6d0c6f0c459d53ca55b8091fbc13d277c4041c885392e8195e8", size = 10040684, upload-time = "2026-05-11T18:36:48.199Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/6e/dd/c7191469c777f07689c032a8f7326e393ea34c92d6d76eb7ce5ba57ea66d/mypy-2.1.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:35aac3bb114e03888f535d5eb51b8bafbb3266586b599da1940f9b1be3ec5bd5", size = 14852174, upload-time = "2026-05-11T18:31:38.929Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/55/8c/aed55408879043d72bb9135f4d0d19a02b886dd569631e113e3d2706cb8d/mypy-2.1.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:8de55a8c861f2a49331f807be98d90caeceeef520bde13d43a160207f8af613e", size = 13651542, upload-time = "2026-05-11T18:36:04.636Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/3a/8e/f371a824b1f1fa8ea6e3dbb8703d232977d572be2329554a3bc4d960302f/mypy-2.1.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5fdf2941a07434af755837d9880f7d7d25f1dacb1af9dcd4b9b66f2220a3024e", size = 14033929, upload-time = "2026-05-11T18:35:55.742Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/94/21/f54be870d6dd53a82c674407e0f8eed7174b05ec78d42e5abd7b42e84fd5/mypy-2.1.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:e195b817c13f02352a9c124301f9f30f078405444679b6753c1b96b6eed37285", size = 15039200, upload-time = "2026-05-11T18:33:10.281Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/17/99/bf21748626a40ce59fd29a39386ab46afec88b7bd2f0fa6c3a97c995523f/mypy-2.1.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:5431d42af987ebd92ba2f71d45c85ed41d8e6ca9f5fd209a69f68f707d2469e5", size = 15272690, upload-time = "2026-05-11T18:32:07.205Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/d6/d7/9e90d2cf47100bea550ed2bc7b0d4de3a62181d84d5e37da0003e8462637/mypy-2.1.0-cp313-cp313-win_amd64.whl", hash = "sha256:767fe8c66dc3e01e19e1737d4c38ebefead16125e1b8e58ad421903b376f5c65", size = 11147435, upload-time = "2026-05-11T18:33:56.477Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/ec/46/e5c449e858798e35ffc90946282a27c62a77be743fe17480e4977374eb91/mypy-2.1.0-cp313-cp313-win_arm64.whl", hash = "sha256:ecfe70d43775ab99562ab128ce49854a362044c9f894961f68f898c23cb7429d", size = 10035052, upload-time = "2026-05-11T18:32:30.049Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/b0/ca/b279a672e874aedd5498ae25f722dacc8aa86bbffb939b3f97cbb1cf6686/mypy-2.1.0-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:7354c5a7f69d9345c3d6e69921d57088eea3ddeeb6b20d34c1b3855b02c36ec2", size = 14848422, upload-time = "2026-05-11T18:35:45.984Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/27/e6/3efe56c631d959b9b4454e208b0ac4b7f4f58b404c89f8bec7b49efdfc21/mypy-2.1.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:49890d4f76ac9e06ec117f9e09f3174da70a620a0c300953d8595c926e80947f", size = 13677374, upload-time = "2026-05-11T18:36:57.188Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/84/7f/8107ea87a44fd1f1b59882442f033c9c3488c127201b1d1d15f1cbd6022e/mypy-2.1.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:761be68e023ef5d94678772396a8af1220030f80837a3afd8d0aef3b419666f4", size = 14055743, upload-time = "2026-05-11T18:35:18.361Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/51/4d/b6d34db183133b83761b9199a82d31557cdbb70a380d8c3b3438e11882a3/mypy-2.1.0-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c90345fc182dc363b891350457ec69c35140858538f38b4540845afcc32b1aef", size = 15020937, upload-time = "2026-05-11T18:34:59.618Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/ff/d7/f08360c691d758acb02f45022c34d98b92892f4ea756644e1000d4b9f3d8/mypy-2.1.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:b84802e7b5a6daf1f5e15bc9fcd7ddae77be13981ffab037f1c67bb84d67d135", size = 15253371, upload-time = "2026-05-11T18:36:41.081Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/67/1b/09460a13719530a19bce27bd3bc8449e83569dd2ba7faf51c9c3c30c0b61/mypy-2.1.0-cp314-cp314-win_amd64.whl", hash = "sha256:022c771234936ceac541ebaf836fe9e2abeb3f5e09aff21588fe543ff006fe21", size = 11326429, upload-time = "2026-05-11T18:34:13.526Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/40/62/75dbf0f82f7b6680340efc614af29dd0b3c17b8a4f1cd09b8bd2fd6bc814/mypy-2.1.0-cp314-cp314-win_arm64.whl", hash = "sha256:498207db725cec88829a6a5c2fc771205fd043719ef98bc49aba8fb9fc4e6d57", size = 10218799, upload-time = "2026-05-11T18:32:23.491Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/b2/66/caca04ed7d972fb6eb6dd1ccd6df1de5c38fae8c5b3dc1c4e8e0d85ee6b9/mypy-2.1.0-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:7d5e5cad0efeba72b93cd17490cc0d69c5ac9ca132994fe3fb0314808aeeb83e", size = 15923458, upload-time = "2026-05-11T18:35:28.64Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/ed/52/2d90cbe49d014b13ed7ff337930c30bad35893fe38a1e4641e756bb62191/mypy-2.1.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:ff715050c127d724fd260a2e666e7747fdd83511c0c47d449d98238970aef780", size = 14757697, upload-time = "2026-05-11T18:36:14.208Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/ac/37/d98f4a14e081b238992d0ed96b6d39c7cc0148c9699eb71eaa68629665ea/mypy-2.1.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:82208da9e09414d520e912d3e462d454854bed0810b71540bb016dcbca7308fd", size = 15405638, upload-time = "2026-05-11T18:33:48.249Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/a3/c2/15c46613b24a84fad2aea1248bf9619b99c2767ae9071fe224c179a0b7d4/mypy-2.1.0-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:e79ebc1b904b84f0310dff7469655a9c36c7a68bddb37bdd42b67a332df61d08", size = 16215852, upload-time = "2026-05-11T18:32:50.296Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/5c/90/9c16a57f482c76d25f6379762b56bbf65c711d8158cf271fb2802cfb0640/mypy-2.1.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:e583edc957cfb0deb142079162ae826f58449b116c1d442f2d91c69d9fced081", size = 16452695, upload-time = "2026-05-11T18:33:38.182Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/0f/4c/215a4eeb63cacc5f17f516691ea7285d11e249802b942476bff15922a314/mypy-2.1.0-cp314-cp314t-win_amd64.whl", hash = "sha256:b33b6cd332695bba180d55e717a79d3038e479a2c49cc5eb3d53603409b9a5d7", size = 12866622, upload-time = "2026-05-11T18:34:39.945Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/4b/50/1043e1db5f455ffe4c9ab22747cd8ca2bc492b1e4f4e21b130a44ee2b217/mypy-2.1.0-cp314-cp314t-win_arm64.whl", hash = "sha256:4f910fe825376a7b66ef7ca8c98e5a149e8cd64c19ae71d84047a74ee060d4e6", size = 10610798, upload-time = "2026-05-11T18:36:31.444Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/0d/2a/13ca1f292f6db1b98ff495ef3467736b331621c5917cad984b7043e7348d/mypy-2.1.0-py3-none-any.whl", hash = "sha256:a663814603a5c563fb87a4f96fb473eeb30d1f5a4885afcf44f9db000a366289", size = 2693302, upload-time = "2026-05-11T18:31:29.246Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "mypy-extensions"
|
||||
version = "1.1.0"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/a2/6e/371856a3fb9d31ca8dac321cda606860fa4548858c0cc45d9d1d4ca2628b/mypy_extensions-1.1.0.tar.gz", hash = "sha256:52e68efc3284861e772bbcd66823fde5ae21fd2fdb51c62a211403730b916558", size = 6343, upload-time = "2025-04-22T14:54:24.164Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/79/7b/2c79738432f5c924bef5071f933bcc9efd0473bac3b4aa584a6f7c1c8df8/mypy_extensions-1.1.0-py3-none-any.whl", hash = "sha256:1be4cccdb0f2482337c4743e60421de3a356cd97508abadd57d47403e94f5505", size = 4963, upload-time = "2025-04-22T14:54:22.983Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pathspec"
|
||||
version = "1.1.1"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/5a/82/42f767fc1c1143d6fd36efb827202a2d997a375e160a71eb2888a925aac1/pathspec-1.1.1.tar.gz", hash = "sha256:17db5ecd524104a120e173814c90367a96a98d07c45b2e10c2f3919fff91bf5a", size = 135180, upload-time = "2026-04-27T01:46:08.907Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/f1/d9/7fb5aa316bc299258e68c73ba3bddbc499654a07f151cba08f6153988714/pathspec-1.1.1-py3-none-any.whl", hash = "sha256:a00ce642f577bf7f473932318056212bc4f8bfdf53128c78bbd5af0b9b20b189", size = 57328, upload-time = "2026-04-27T01:46:07.06Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "ruff"
|
||||
version = "0.15.14"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/dc/8a/8bce2894573e9dae6ff4d77fe34ad727d79b9e6238ad288c5638990d90f6/ruff-0.15.14.tar.gz", hash = "sha256:48e866b165be4a9bdbf310f7d3c9a07edef2fe8cd63ffeb4e00bb590506ebf9f", size = 4700910, upload-time = "2026-05-21T14:34:55.177Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/b9/c8/74a92c6ff9fcfb4f1f947126d3ebee8389276e161ecc85de5bda7cda51bd/ruff-0.15.14-py3-none-linux_armv6l.whl", hash = "sha256:8dd2db9416e487c8d4b01fa7056bb02c4d05969d4f8d17a08c229c2f4ff3c108", size = 10739177, upload-time = "2026-05-21T14:34:37.332Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/45/91/254a35c20acc38a7223c9d2d594af12e794432464f2cdeb52af1dc4a892d/ruff-0.15.14-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:be4ff55af755bd71a00ab3dc6bd7ffc467bd76e0df6881e286c2e3d23e8fb43b", size = 11144969, upload-time = "2026-05-21T14:34:43.978Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/56/9e/d13e40f83b8d0a94430e6778ce1d94a43b38cf2efe63278bdd2b4c65abbf/ruff-0.15.14-py3-none-macosx_11_0_arm64.whl", hash = "sha256:48d5909d7d06276ce7dde6d32bfa4b0d4cb2651145cd8ee4b440722cbc77832f", size = 10478207, upload-time = "2026-05-21T14:34:48.378Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/8d/f1/b15a7839fa4f332f8acec78e20564f26bb2d866e3d21710b877fd0263000/ruff-0.15.14-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ca8cbfa94c4f90984a67561978602746d4cd27103568f745fa90eee3f0d4107d", size = 10818459, upload-time = "2026-05-21T14:34:22.318Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/45/33/53d651177f84f94b400a0e27f8824eeada3dddc9d5ee8aeb048f4352a520/ruff-0.15.14-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:9a6bbc0333f1ab053423bcbf6226477d266ca7cec7738c4c8e3f55647803f3c4", size = 10541800, upload-time = "2026-05-21T14:34:20.209Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/b8/a6/868f87e0bf9786ed24b5d0d0ad8676b8a94fd1912f42cddf9cfc7857818a/ruff-0.15.14-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8a24a4f7605d7003a6674d4387651effd939dead3fddd0f36561eb77a9a2e542", size = 11342149, upload-time = "2026-05-21T14:34:46.365Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/a7/8b/38cd5c19faffdcc05a408d2b78edccc69492ab9720eadb49ea15ef80d768/ruff-0.15.14-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:049b5326e53ed80978f2fc041a280603f69dd6b0c95464342a2bb4572d9d9e2f", size = 12212563, upload-time = "2026-05-21T14:34:28.579Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/3e/4d/a3c5b874a556d5731e3e657aaf04311bb76f0a5c3ec220ed43051be6b64b/ruff-0.15.14-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d4ed42e6696c8dfa5f06728e6441993901f548eb92d73bc472cb5a38d1395fbf", size = 11493299, upload-time = "2026-05-21T14:34:41.836Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/1e/c0/56472c251d09858a53e51efbd485b09e1995d8731668b76d52e5dd6ee0f1/ruff-0.15.14-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:715c543cf450c4888251f91c52f1942a800541d9bddd7ac060aa4e6b77ae7cba", size = 11455931, upload-time = "2026-05-21T14:34:57.276Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/2c/4a/e2e7b4d8dbf233d4eace59c75bc3435fa6d8bd3bae82d351d4e4300c0fd1/ruff-0.15.14-py3-none-manylinux_2_31_riscv64.whl", hash = "sha256:72ebab6013ec887d439d8b7593737a0a4ffb06d45d209d4e4bf2e92813082d3f", size = 11400794, upload-time = "2026-05-21T14:34:39.773Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/97/c7/83c0539fe34c3e09136204d1e75d6052492364e0b3cb05e9465423f567d7/ruff-0.15.14-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:49072d36abdbe97a8dd7f480afe9c675699c0c495d4c84076e2c1203c4550581", size = 10804759, upload-time = "2026-05-21T14:34:31.045Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/86/a6/18f2bfc095a2ab4a78745644e428205532ce6653a5d0fa8501572891534d/ruff-0.15.14-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:958522aee105068640c2c2ceae08f413ae44d922f52a1374ac13d6a96032fc93", size = 10539517, upload-time = "2026-05-21T14:34:53.064Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/54/3a/5a8b3b69c654d4e4bf1d246ac5b49cbcdac6eaab6905925f8915f31e3b80/ruff-0.15.14-py3-none-musllinux_1_2_i686.whl", hash = "sha256:f3707da619a143a2e8830e2abab8224478d69ace2d28cb6c20543ae97c36bf61", size = 11065169, upload-time = "2026-05-21T14:34:24.484Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/ed/c5/8864e4e7925b836ea354b31d57641ec03830564e281a8b6f061f8c3e0ec1/ruff-0.15.14-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:bb01d645694e3ec0102105d07ef2d53703970407d59c04e59d3ba0b7a1d53553", size = 11560214, upload-time = "2026-05-21T14:34:50.975Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/36/38/012bf76752e1f89ed50b77b99532d90f3a3e287bc7918e1fc0948ac866ac/ruff-0.15.14-py3-none-win32.whl", hash = "sha256:6d0c1ad2a0ab718d39b6d8fd2217981ce4d625cd96a720095f798fb47d8b13e6", size = 10805548, upload-time = "2026-05-21T14:34:33.453Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/d1/b7/4ea2c170f10ad760fff2a5250beb18897719dc8b52b53a24cddbb9dd3f19/ruff-0.15.14-py3-none-win_amd64.whl", hash = "sha256:802342981e056db3851a7836e5b070f8f15f67d4a685ae2a6160939d364b2902", size = 11939523, upload-time = "2026-05-21T14:34:18.077Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/62/d5/bc97ff895ec35cf3925d4bd60f3b39d822f377a446906ec9bcc87405e59b/ruff-0.15.14-py3-none-win_arm64.whl", hash = "sha256:ff47b90a9ef6a40c9e2f3b479c1fb78531adf055b94c1eba0a7ba04b31951826", size = 11208607, upload-time = "2026-05-21T14:34:26.525Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tomli"
|
||||
version = "2.4.1"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/22/de/48c59722572767841493b26183a0d1cc411d54fd759c5607c4590b6563a6/tomli-2.4.1.tar.gz", hash = "sha256:7c7e1a961a0b2f2472c1ac5b69affa0ae1132c39adcb67aba98568702b9cc23f", size = 17543, upload-time = "2026-03-25T20:22:03.828Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/f4/11/db3d5885d8528263d8adc260bb2d28ebf1270b96e98f0e0268d32b8d9900/tomli-2.4.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:f8f0fc26ec2cc2b965b7a3b87cd19c5c6b8c5e5f436b984e85f486d652285c30", size = 154704, upload-time = "2026-03-25T20:21:10.473Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/6d/f7/675db52c7e46064a9aa928885a9b20f4124ecb9bc2e1ce74c9106648d202/tomli-2.4.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:4ab97e64ccda8756376892c53a72bd1f964e519c77236368527f758fbc36a53a", size = 149454, upload-time = "2026-03-25T20:21:12.036Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/61/71/81c50943cf953efa35bce7646caab3cf457a7d8c030b27cfb40d7235f9ee/tomli-2.4.1-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:96481a5786729fd470164b47cdb3e0e58062a496f455ee41b4403be77cb5a076", size = 237561, upload-time = "2026-03-25T20:21:13.098Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/48/c1/f41d9cb618acccca7df82aaf682f9b49013c9397212cb9f53219e3abac37/tomli-2.4.1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5a881ab208c0baf688221f8cecc5401bd291d67e38a1ac884d6736cbcd8247e9", size = 243824, upload-time = "2026-03-25T20:21:14.569Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/22/e4/5a816ecdd1f8ca51fb756ef684b90f2780afc52fc67f987e3c61d800a46d/tomli-2.4.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:47149d5bd38761ac8be13a84864bf0b7b70bc051806bc3669ab1cbc56216b23c", size = 242227, upload-time = "2026-03-25T20:21:15.712Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/6b/49/2b2a0ef529aa6eec245d25f0c703e020a73955ad7edf73e7f54ddc608aa5/tomli-2.4.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:ec9bfaf3ad2df51ace80688143a6a4ebc09a248f6ff781a9945e51937008fcbc", size = 247859, upload-time = "2026-03-25T20:21:17.001Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/83/bd/6c1a630eaca337e1e78c5903104f831bda934c426f9231429396ce3c3467/tomli-2.4.1-cp311-cp311-win32.whl", hash = "sha256:ff2983983d34813c1aeb0fa89091e76c3a22889ee83ab27c5eeb45100560c049", size = 97204, upload-time = "2026-03-25T20:21:18.079Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/42/59/71461df1a885647e10b6bb7802d0b8e66480c61f3f43079e0dcd315b3954/tomli-2.4.1-cp311-cp311-win_amd64.whl", hash = "sha256:5ee18d9ebdb417e384b58fe414e8d6af9f4e7a0ae761519fb50f721de398dd4e", size = 108084, upload-time = "2026-03-25T20:21:18.978Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/b8/83/dceca96142499c069475b790e7913b1044c1a4337e700751f48ed723f883/tomli-2.4.1-cp311-cp311-win_arm64.whl", hash = "sha256:c2541745709bad0264b7d4705ad453b76ccd191e64aa6f0fc66b69a293a45ece", size = 95285, upload-time = "2026-03-25T20:21:20.309Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/c1/ba/42f134a3fe2b370f555f44b1d72feebb94debcab01676bf918d0cb70e9aa/tomli-2.4.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:c742f741d58a28940ce01d58f0ab2ea3ced8b12402f162f4d534dfe18ba1cd6a", size = 155924, upload-time = "2026-03-25T20:21:21.626Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/dc/c7/62d7a17c26487ade21c5422b646110f2162f1fcc95980ef7f63e73c68f14/tomli-2.4.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:7f86fd587c4ed9dd76f318225e7d9b29cfc5a9d43de44e5754db8d1128487085", size = 150018, upload-time = "2026-03-25T20:21:23.002Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/5c/05/79d13d7c15f13bdef410bdd49a6485b1c37d28968314eabee452c22a7fda/tomli-2.4.1-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ff18e6a727ee0ab0388507b89d1bc6a22b138d1e2fa56d1ad494586d61d2eae9", size = 244948, upload-time = "2026-03-25T20:21:24.04Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/10/90/d62ce007a1c80d0b2c93e02cab211224756240884751b94ca72df8a875ca/tomli-2.4.1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:136443dbd7e1dee43c68ac2694fde36b2849865fa258d39bf822c10e8068eac5", size = 253341, upload-time = "2026-03-25T20:21:25.177Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/1a/7e/caf6496d60152ad4ed09282c1885cca4eea150bfd007da84aea07bcc0a3e/tomli-2.4.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:5e262d41726bc187e69af7825504c933b6794dc3fbd5945e41a79bb14c31f585", size = 248159, upload-time = "2026-03-25T20:21:26.364Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/99/e7/c6f69c3120de34bbd882c6fba7975f3d7a746e9218e56ab46a1bc4b42552/tomli-2.4.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:5cb41aa38891e073ee49d55fbc7839cfdb2bc0e600add13874d048c94aadddd1", size = 253290, upload-time = "2026-03-25T20:21:27.46Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/d6/2f/4a3c322f22c5c66c4b836ec58211641a4067364f5dcdd7b974b4c5da300c/tomli-2.4.1-cp312-cp312-win32.whl", hash = "sha256:da25dc3563bff5965356133435b757a795a17b17d01dbc0f42fb32447ddfd917", size = 98141, upload-time = "2026-03-25T20:21:28.492Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/24/22/4daacd05391b92c55759d55eaee21e1dfaea86ce5c571f10083360adf534/tomli-2.4.1-cp312-cp312-win_amd64.whl", hash = "sha256:52c8ef851d9a240f11a88c003eacb03c31fc1c9c4ec64a99a0f922b93874fda9", size = 108847, upload-time = "2026-03-25T20:21:29.386Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/68/fd/70e768887666ddd9e9f5d85129e84910f2db2796f9096aa02b721a53098d/tomli-2.4.1-cp312-cp312-win_arm64.whl", hash = "sha256:f758f1b9299d059cc3f6546ae2af89670cb1c4d48ea29c3cacc4fe7de3058257", size = 95088, upload-time = "2026-03-25T20:21:30.677Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/07/06/b823a7e818c756d9a7123ba2cda7d07bc2dd32835648d1a7b7b7a05d848d/tomli-2.4.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:36d2bd2ad5fb9eaddba5226aa02c8ec3fa4f192631e347b3ed28186d43be6b54", size = 155866, upload-time = "2026-03-25T20:21:31.65Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/14/6f/12645cf7f08e1a20c7eb8c297c6f11d31c1b50f316a7e7e1e1de6e2e7b7e/tomli-2.4.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:eb0dc4e38e6a1fd579e5d50369aa2e10acfc9cace504579b2faabb478e76941a", size = 149887, upload-time = "2026-03-25T20:21:33.028Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/5c/e0/90637574e5e7212c09099c67ad349b04ec4d6020324539297b634a0192b0/tomli-2.4.1-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c7f2c7f2b9ca6bdeef8f0fa897f8e05085923eb091721675170254cbc5b02897", size = 243704, upload-time = "2026-03-25T20:21:34.51Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/10/8f/d3ddb16c5a4befdf31a23307f72828686ab2096f068eaf56631e136c1fdd/tomli-2.4.1-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:f3c6818a1a86dd6dca7ddcaaf76947d5ba31aecc28cb1b67009a5877c9a64f3f", size = 251628, upload-time = "2026-03-25T20:21:36.012Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/e3/f1/dbeeb9116715abee2485bf0a12d07a8f31af94d71608c171c45f64c0469d/tomli-2.4.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:d312ef37c91508b0ab2cee7da26ec0b3ed2f03ce12bd87a588d771ae15dcf82d", size = 247180, upload-time = "2026-03-25T20:21:37.136Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/d3/74/16336ffd19ed4da28a70959f92f506233bd7cfc2332b20bdb01591e8b1d1/tomli-2.4.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:51529d40e3ca50046d7606fa99ce3956a617f9b36380da3b7f0dd3dd28e68cb5", size = 251674, upload-time = "2026-03-25T20:21:38.298Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/16/f9/229fa3434c590ddf6c0aa9af64d3af4b752540686cace29e6281e3458469/tomli-2.4.1-cp313-cp313-win32.whl", hash = "sha256:2190f2e9dd7508d2a90ded5ed369255980a1bcdd58e52f7fe24b8162bf9fedbd", size = 97976, upload-time = "2026-03-25T20:21:39.316Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/6a/1e/71dfd96bcc1c775420cb8befe7a9d35f2e5b1309798f009dca17b7708c1e/tomli-2.4.1-cp313-cp313-win_amd64.whl", hash = "sha256:8d65a2fbf9d2f8352685bc1364177ee3923d6baf5e7f43ea4959d7d8bc326a36", size = 108755, upload-time = "2026-03-25T20:21:40.248Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/83/7a/d34f422a021d62420b78f5c538e5b102f62bea616d1d75a13f0a88acb04a/tomli-2.4.1-cp313-cp313-win_arm64.whl", hash = "sha256:4b605484e43cdc43f0954ddae319fb75f04cc10dd80d830540060ee7cd0243cd", size = 95265, upload-time = "2026-03-25T20:21:41.219Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/3c/fb/9a5c8d27dbab540869f7c1f8eb0abb3244189ce780ba9cd73f3770662072/tomli-2.4.1-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:fd0409a3653af6c147209d267a0e4243f0ae46b011aa978b1080359fddc9b6cf", size = 155726, upload-time = "2026-03-25T20:21:42.23Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/62/05/d2f816630cc771ad836af54f5001f47a6f611d2d39535364f148b6a92d6b/tomli-2.4.1-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:a120733b01c45e9a0c34aeef92bf0cf1d56cfe81ed9d47d562f9ed591a9828ac", size = 149859, upload-time = "2026-03-25T20:21:43.386Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/ce/48/66341bdb858ad9bd0ceab5a86f90eddab127cf8b046418009f2125630ecb/tomli-2.4.1-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:559db847dc486944896521f68d8190be1c9e719fced785720d2216fe7022b662", size = 244713, upload-time = "2026-03-25T20:21:44.474Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/df/6d/c5fad00d82b3c7a3ab6189bd4b10e60466f22cfe8a08a9394185c8a8111c/tomli-2.4.1-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:01f520d4f53ef97964a240a035ec2a869fe1a37dde002b57ebc4417a27ccd853", size = 252084, upload-time = "2026-03-25T20:21:45.62Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/00/71/3a69e86f3eafe8c7a59d008d245888051005bd657760e96d5fbfb0b740c2/tomli-2.4.1-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:7f94b27a62cfad8496c8d2513e1a222dd446f095fca8987fceef261225538a15", size = 247973, upload-time = "2026-03-25T20:21:46.937Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/67/50/361e986652847fec4bd5e4a0208752fbe64689c603c7ae5ea7cb16b1c0ca/tomli-2.4.1-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:ede3e6487c5ef5d28634ba3f31f989030ad6af71edfb0055cbbd14189ff240ba", size = 256223, upload-time = "2026-03-25T20:21:48.467Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/8c/9a/b4173689a9203472e5467217e0154b00e260621caa227b6fa01feab16998/tomli-2.4.1-cp314-cp314-win32.whl", hash = "sha256:3d48a93ee1c9b79c04bb38772ee1b64dcf18ff43085896ea460ca8dec96f35f6", size = 98973, upload-time = "2026-03-25T20:21:49.526Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/14/58/640ac93bf230cd27d002462c9af0d837779f8773bc03dee06b5835208214/tomli-2.4.1-cp314-cp314-win_amd64.whl", hash = "sha256:88dceee75c2c63af144e456745e10101eb67361050196b0b6af5d717254dddf7", size = 109082, upload-time = "2026-03-25T20:21:50.506Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/d5/2f/702d5e05b227401c1068f0d386d79a589bb12bf64c3d2c72ce0631e3bc49/tomli-2.4.1-cp314-cp314-win_arm64.whl", hash = "sha256:b8c198f8c1805dc42708689ed6864951fd2494f924149d3e4bce7710f8eb5232", size = 96490, upload-time = "2026-03-25T20:21:51.474Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/45/4b/b877b05c8ba62927d9865dd980e34a755de541eb65fffba52b4cc495d4d2/tomli-2.4.1-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:d4d8fe59808a54658fcc0160ecfb1b30f9089906c50b23bcb4c69eddc19ec2b4", size = 164263, upload-time = "2026-03-25T20:21:52.543Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/24/79/6ab420d37a270b89f7195dec5448f79400d9e9c1826df982f3f8e97b24fd/tomli-2.4.1-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:7008df2e7655c495dd12d2a4ad038ff878d4ca4b81fccaf82b714e07eae4402c", size = 160736, upload-time = "2026-03-25T20:21:53.674Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/02/e0/3630057d8eb170310785723ed5adcdfb7d50cb7e6455f85ba8a3deed642b/tomli-2.4.1-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1d8591993e228b0c930c4bb0db464bdad97b3289fb981255d6c9a41aedc84b2d", size = 270717, upload-time = "2026-03-25T20:21:55.129Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/7a/b4/1613716072e544d1a7891f548d8f9ec6ce2faf42ca65acae01d76ea06bb0/tomli-2.4.1-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:734e20b57ba95624ecf1841e72b53f6e186355e216e5412de414e3c51e5e3c41", size = 278461, upload-time = "2026-03-25T20:21:56.228Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/05/38/30f541baf6a3f6df77b3df16b01ba319221389e2da59427e221ef417ac0c/tomli-2.4.1-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:8a650c2dbafa08d42e51ba0b62740dae4ecb9338eefa093aa5c78ceb546fcd5c", size = 274855, upload-time = "2026-03-25T20:21:57.653Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/77/a3/ec9dd4fd2c38e98de34223b995a3b34813e6bdadf86c75314c928350ed14/tomli-2.4.1-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:504aa796fe0569bb43171066009ead363de03675276d2d121ac1a4572397870f", size = 283144, upload-time = "2026-03-25T20:21:59.089Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/ef/be/605a6261cac79fba2ec0c9827e986e00323a1945700969b8ee0b30d85453/tomli-2.4.1-cp314-cp314t-win32.whl", hash = "sha256:b1d22e6e9387bf4739fbe23bfa80e93f6b0373a7f1b96c6227c32bef95a4d7a8", size = 108683, upload-time = "2026-03-25T20:22:00.214Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/12/64/da524626d3b9cc40c168a13da8335fe1c51be12c0a63685cc6db7308daae/tomli-2.4.1-cp314-cp314t-win_amd64.whl", hash = "sha256:2c1c351919aca02858f740c6d33adea0c5deea37f9ecca1cc1ef9e884a619d26", size = 121196, upload-time = "2026-03-25T20:22:01.169Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/5a/cd/e80b62269fc78fc36c9af5a6b89c835baa8af28ff5ad28c7028d60860320/tomli-2.4.1-cp314-cp314t-win_arm64.whl", hash = "sha256:eab21f45c7f66c13f2a9e0e1535309cee140182a9cdae1e041d02e47291e8396", size = 100393, upload-time = "2026-03-25T20:22:02.137Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/7b/61/cceae43728b7de99d9b847560c262873a1f6c98202171fd5ed62640b494b/tomli-2.4.1-py3-none-any.whl", hash = "sha256:0d85819802132122da43cb86656f8d1f8c6587d54ae7dcaf30e90533028b49fe", size = 14583, upload-time = "2026-03-25T20:22:03.012Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "typing-extensions"
|
||||
version = "4.15.0"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/72/94/1a15dd82efb362ac84269196e94cf00f187f7ed21c242792a923cdb1c61f/typing_extensions-4.15.0.tar.gz", hash = "sha256:0cea48d173cc12fa28ecabc3b837ea3cf6f38c6d1136f85cbaaf598984861466", size = 109391, upload-time = "2025-08-25T13:49:26.313Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/18/67/36e9267722cc04a6b9f15c7f3441c2363321a3ea07da7ae0c0707beb2a9c/typing_extensions-4.15.0-py3-none-any.whl", hash = "sha256:f0fa19c6845758ab08074a0cfa8b7aecb71c999ca73d62883bc25cc018c4e548", size = 44614, upload-time = "2025-08-25T13:49:24.86Z" },
|
||||
]
|
||||
13
crates/kreuzberg-tesseract/.commitlintrc.json
Normal file
13
crates/kreuzberg-tesseract/.commitlintrc.json
Normal file
@@ -0,0 +1,13 @@
|
||||
{
|
||||
"extends": ["@commitlint/config-conventional"],
|
||||
"rules": {
|
||||
"body-max-line-length": [2, "always", 100],
|
||||
"header-max-length": [2, "always", 100],
|
||||
"subject-case": [2, "never", ["sentence-case", "start-case", "pascal-case", "upper-case"]],
|
||||
"type-enum": [
|
||||
2,
|
||||
"always",
|
||||
["feat", "fix", "docs", "style", "refactor", "perf", "test", "build", "ci", "chore", "revert"]
|
||||
]
|
||||
}
|
||||
}
|
||||
2
crates/kreuzberg-tesseract/.crate-ignore
Normal file
2
crates/kreuzberg-tesseract/.crate-ignore
Normal file
@@ -0,0 +1,2 @@
|
||||
/third_party/
|
||||
/tessdata/
|
||||
2933
crates/kreuzberg-tesseract/Cargo.lock
generated
Normal file
2933
crates/kreuzberg-tesseract/Cargo.lock
generated
Normal file
File diff suppressed because it is too large
Load Diff
64
crates/kreuzberg-tesseract/Cargo.toml
Normal file
64
crates/kreuzberg-tesseract/Cargo.toml
Normal file
@@ -0,0 +1,64 @@
|
||||
[package]
|
||||
name = "kreuzberg-tesseract"
|
||||
version.workspace = true
|
||||
edition.workspace = true
|
||||
rust-version.workspace = true
|
||||
authors.workspace = true
|
||||
description = "Rust bindings for Tesseract OCR with cross-compilation, C++17, and caching improvements"
|
||||
license = "MIT"
|
||||
repository.workspace = true
|
||||
homepage = "https://kreuzberg.dev"
|
||||
documentation = "https://docs.kreuzberg.dev"
|
||||
readme = "README.md"
|
||||
keywords = ["tesseract", "ocr", "bindings", "vision", "recognition"]
|
||||
categories = ["external-ffi-bindings", "computer-vision", "text-processing"]
|
||||
build = "build.rs"
|
||||
links = "kreuzberg_tesseract"
|
||||
exclude = ["tessdata/*", "third_party/*"]
|
||||
|
||||
[package.metadata.docs.rs]
|
||||
features = ["docs-only"]
|
||||
rustdoc-args = ["--cfg", "docsrs"]
|
||||
|
||||
[package.metadata.cargo-machete]
|
||||
ignored = ["cc", "cmake", "reqwest", "zip"]
|
||||
|
||||
[lib]
|
||||
name = "kreuzberg_tesseract"
|
||||
crate-type = ["lib"]
|
||||
|
||||
[features]
|
||||
default = ["static-linking"]
|
||||
build-tesseract = ["cc", "cmake", "reqwest", "zip"]
|
||||
build-tesseract-wasm = ["cmake", "reqwest", "zip"]
|
||||
# Bundle eng.traineddata into the compiled crate so WASM builds can run OCR
|
||||
# without runtime tessdata loading. Uses ~4 MB of binary size (tessdata_fast).
|
||||
bundle-tessdata-eng = []
|
||||
static-linking = ["build-tesseract"]
|
||||
dynamic-linking = []
|
||||
|
||||
[dependencies]
|
||||
thiserror = { workspace = true }
|
||||
|
||||
[build-dependencies]
|
||||
cc = { version = "^1.2.63", optional = true }
|
||||
cmake = { version = "0.1.58", optional = true }
|
||||
zip = { version = ">=7.0.0", optional = true, default-features = false, features = [
|
||||
"deflate-flate2-zlib-rs",
|
||||
] }
|
||||
|
||||
[target.'cfg(not(target_os = "windows"))'.build-dependencies]
|
||||
reqwest = { workspace = true, default-features = false, features = [
|
||||
"blocking",
|
||||
"rustls",
|
||||
], optional = true }
|
||||
|
||||
# Use native-tls on Windows to avoid aws-lc-sys CMake build issues with MinGW
|
||||
[target.'cfg(target_os = "windows")'.build-dependencies]
|
||||
reqwest = { workspace = true, default-features = false, features = [
|
||||
"blocking",
|
||||
"native-tls",
|
||||
], optional = true }
|
||||
|
||||
[dev-dependencies]
|
||||
image = { workspace = true, features = ["png"] }
|
||||
22
crates/kreuzberg-tesseract/LICENSE
Normal file
22
crates/kreuzberg-tesseract/LICENSE
Normal file
@@ -0,0 +1,22 @@
|
||||
MIT License
|
||||
|
||||
Copyright (c) 2024 Cafer Can Gündoğdu
|
||||
Copyright (c) 2025 Na'aman Hirschfeld
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all
|
||||
copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
||||
405
crates/kreuzberg-tesseract/README.md
Normal file
405
crates/kreuzberg-tesseract/README.md
Normal file
@@ -0,0 +1,405 @@
|
||||
# kreuzberg-tesseract
|
||||
|
||||
[](https://github.com/kreuzberg-dev/alef)
|
||||
|
||||
Rust bindings for Tesseract OCR with built-in compilation of Tesseract and Leptonica libraries. Provides a safe and idiomatic Rust interface to Tesseract's functionality while handling the complexity of compiling the underlying C++ libraries.
|
||||
|
||||
Based on the original [tesseract-rs](https://github.com/cafercangundogdu/tesseract-rs) by Cafer Can Gündoğdu, this maintained version adds critical improvements for production use:
|
||||
|
||||
- **C++17 Support**: Upgraded for Tesseract 5.5.1 which requires C++17 filesystem
|
||||
- **Cross-Compilation**: Fixed CXX compiler detection for cross-platform builds
|
||||
- **Architecture Validation**: Validates target architecture before using cached libraries
|
||||
- **Windows Static Linking**: Fixed MSVC static linking issues
|
||||
- **Build Caching**: Improved caching with OUT_DIR-based cache directory
|
||||
- **MinGW Support**: Added support for MinGW toolchains
|
||||
|
||||
## Features
|
||||
|
||||
- Safe Rust bindings for Tesseract OCR
|
||||
- **Multiple linking options:**
|
||||
- **Static linking** (default): Built-in compilation with no runtime dependencies
|
||||
- **Dynamic linking**: Link to system-installed libraries for faster builds
|
||||
- Uses existing Tesseract training data (expects English data for tests)
|
||||
- High-level Rust API for common OCR tasks
|
||||
- Caching of compiled libraries for faster subsequent builds
|
||||
- Support for multiple operating systems (Linux, macOS, Windows)
|
||||
|
||||
## Installation
|
||||
|
||||
### Static Linking (Default)
|
||||
|
||||
Static linking builds Tesseract and Leptonica from source and embeds them in your binary. No runtime dependencies required:
|
||||
|
||||
```toml
|
||||
[dependencies]
|
||||
kreuzberg-tesseract = "1.0.0-rc.1"
|
||||
# or explicitly:
|
||||
kreuzberg-tesseract = { version = "1.0.0-rc.1", features = ["static-linking"] }
|
||||
```
|
||||
|
||||
### Dynamic Linking
|
||||
|
||||
Dynamic linking uses system-installed Tesseract and Leptonica libraries. Faster builds, but requires libraries installed on the system:
|
||||
|
||||
```toml
|
||||
[dependencies]
|
||||
kreuzberg-tesseract = { version = "1.0.0-rc.1", features = ["dynamic-linking"], default-features = false }
|
||||
```
|
||||
|
||||
**System requirements for dynamic linking:**
|
||||
|
||||
- Tesseract 5.x libraries installed (`libtesseract`, `libleptonica`)
|
||||
- macOS: `brew install tesseract leptonica`
|
||||
- Ubuntu/Debian: `sudo apt-get install libtesseract-dev libleptonica-dev`
|
||||
- RHEL/CentOS/Fedora: `sudo dnf install tesseract-devel leptonica-devel`
|
||||
- Windows: Install from [Tesseract releases](https://github.com/tesseract-ocr/tesseract/releases) or vcpkg
|
||||
|
||||
### Development Dependencies
|
||||
|
||||
For development and testing, you'll also need these dependencies:
|
||||
|
||||
```toml
|
||||
[dev-dependencies]
|
||||
image = "0.25.9"
|
||||
```
|
||||
|
||||
## System Requirements
|
||||
|
||||
### For Static Linking (Default)
|
||||
|
||||
When building with static linking, the crate will compile Tesseract and Leptonica from source. You need:
|
||||
|
||||
- Rust 1.85.0 or later
|
||||
- A C++ compiler (e.g., gcc, clang, MSVC on Windows)
|
||||
- CMake 3.x or later
|
||||
- Internet connection (for downloading Tesseract source code)
|
||||
|
||||
### For Dynamic Linking
|
||||
|
||||
When using dynamic linking with system-installed libraries, you need:
|
||||
|
||||
- Rust 1.85.0 or later
|
||||
- Tesseract 5.x and Leptonica libraries installed on your system (see Installation section)
|
||||
- No internet connection is needed for Tesseract itself (the Tesseract/Leptonica sources are not downloaded when linking against system libraries)
|
||||
|
||||
No C++ compiler or CMake required for dynamic linking builds.
|
||||
|
||||
For a full development environment checklist (including optional tooling suggestions), see [CONTRIBUTING.md](../../CONTRIBUTING.md).
|
||||
|
||||
## Environment Variables
|
||||
|
||||
The following environment variables affect the build and test process:
|
||||
|
||||
### Build Variables
|
||||
|
||||
- `CARGO_CLEAN`: If set, cleans the cache directory before building
|
||||
- `RUSTC_WRAPPER`: If set to "sccache", enables compiler caching with sccache
|
||||
- `CC`: Compiler selection for C code (affects Linux builds)
|
||||
- `HOME` (Unix) or `APPDATA` (Windows): Used to determine cache directory location
|
||||
- `TESSERACT_RS_CACHE_DIR`: Optional override for the cache root. When unset or not writable, the build falls back to the default OS-specific directory, and if that still fails, a temporary directory under the system temp folder is used automatically.
|
||||
|
||||
### Test Variables
|
||||
|
||||
- `TESSDATA_PREFIX` (Optional): Path to override the default tessdata directory. If not set, the crate will use its default cache directory.
|
||||
|
||||
## Cache and Data Directories
|
||||
|
||||
The crate uses the following directory structure based on your operating system:
|
||||
|
||||
- macOS: `~/Library/Application Support/tesseract-rs`
|
||||
- Linux: `~/.tesseract-rs`
|
||||
- Windows: `%APPDATA%/tesseract-rs`
|
||||
|
||||
The cache includes:
|
||||
|
||||
- Compiled Tesseract and Leptonica libraries
|
||||
- Third-party source code
|
||||
|
||||
Training data is not downloaded during the build. Provide `eng.traineddata` (and any other languages you need) via `TESSDATA_PREFIX` or your system Tesseract installation.
|
||||
|
||||
## Testing
|
||||
|
||||
The project includes several integration tests that verify OCR functionality. To run the tests:
|
||||
|
||||
1. Ensure you have the required test dependencies:
|
||||
|
||||
```toml
|
||||
[dev-dependencies]
|
||||
image = "0.25.9"
|
||||
```
|
||||
|
||||
2. Run the tests:
|
||||
|
||||
```bash
|
||||
cargo test
|
||||
```
|
||||
|
||||
Note: Make sure `eng.traineddata` is available in your tessdata directory before running tests. If `TESSDATA_PREFIX` is not set, the tests look in the default cache location. You can point the tests at a custom tessdata directory by setting:
|
||||
|
||||
```bash
|
||||
# Linux/macOS
|
||||
export TESSDATA_PREFIX=/path/to/custom/tessdata
|
||||
|
||||
# Windows (PowerShell)
|
||||
$env:TESSDATA_PREFIX="C:\path\to\custom\tessdata"
|
||||
```
|
||||
|
||||
Available test cases:
|
||||
|
||||
- OCR on English sample images
|
||||
- Error handling and invalid input coverage
|
||||
|
||||
Test images are sourced from the shared `test_documents/` directory in the repository:
|
||||
|
||||
- `images/test_hello_world.png`: Simple English text
|
||||
- `tables/simple_table.png`: Basic table with English headers
|
||||
|
||||
## Usage
|
||||
|
||||
Here's a basic example of how to use `kreuzberg-tesseract`:
|
||||
|
||||
```rust
|
||||
use std::path::PathBuf;
|
||||
use std::error::Error;
|
||||
use kreuzberg_tesseract::TesseractAPI;
|
||||
|
||||
fn get_default_tessdata_dir() -> PathBuf {
|
||||
if cfg!(target_os = "macos") {
|
||||
let home_dir = std::env::var("HOME").expect("HOME environment variable not set");
|
||||
PathBuf::from(home_dir)
|
||||
.join("Library")
|
||||
.join("Application Support")
|
||||
.join("tesseract-rs")
|
||||
.join("tessdata")
|
||||
} else if cfg!(target_os = "linux") {
|
||||
let home_dir = std::env::var("HOME").expect("HOME environment variable not set");
|
||||
PathBuf::from(home_dir)
|
||||
.join(".tesseract-rs")
|
||||
.join("tessdata")
|
||||
} else if cfg!(target_os = "windows") {
|
||||
PathBuf::from(std::env::var("APPDATA").expect("APPDATA environment variable not set"))
|
||||
.join("tesseract-rs")
|
||||
.join("tessdata")
|
||||
} else {
|
||||
panic!("Unsupported operating system");
|
||||
}
|
||||
}
|
||||
|
||||
fn get_tessdata_dir() -> PathBuf {
|
||||
match std::env::var("TESSDATA_PREFIX") {
|
||||
Ok(dir) => {
|
||||
let path = PathBuf::from(dir);
|
||||
println!("Using TESSDATA_PREFIX directory: {:?}", path);
|
||||
path
|
||||
}
|
||||
Err(_) => {
|
||||
let default_dir = get_default_tessdata_dir();
|
||||
println!(
|
||||
"TESSDATA_PREFIX not set, using default directory: {:?}",
|
||||
default_dir
|
||||
);
|
||||
default_dir
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn main() -> Result<(), Box<dyn Error>> {
|
||||
let api = TesseractAPI::new()?;
|
||||
|
||||
// Get tessdata directory (uses default location or TESSDATA_PREFIX if set)
|
||||
let tessdata_dir = get_tessdata_dir();
|
||||
api.init(tessdata_dir.to_str().unwrap(), "eng")?;
|
||||
|
||||
let width = 24;
|
||||
let height = 24;
|
||||
let bytes_per_pixel = 1;
|
||||
let bytes_per_line = width * bytes_per_pixel;
|
||||
|
||||
// Initialize image data with all white pixels
|
||||
let mut image_data = vec![255u8; width * height];
|
||||
|
||||
// Draw number 9 with clearer distinction
|
||||
for y in 4..19 {
|
||||
for x in 7..17 {
|
||||
// Top bar
|
||||
if y == 4 && x >= 8 && x <= 15 {
|
||||
image_data[y * width + x] = 0;
|
||||
}
|
||||
// Top curve left side
|
||||
if y >= 4 && y <= 10 && x == 7 {
|
||||
image_data[y * width + x] = 0;
|
||||
}
|
||||
// Top curve right side
|
||||
if y >= 4 && y <= 11 && x == 16 {
|
||||
image_data[y * width + x] = 0;
|
||||
}
|
||||
// Middle bar
|
||||
if y == 11 && x >= 8 && x <= 15 {
|
||||
image_data[y * width + x] = 0;
|
||||
}
|
||||
// Bottom right vertical line
|
||||
if y >= 11 && y <= 18 && x == 16 {
|
||||
image_data[y * width + x] = 0;
|
||||
}
|
||||
// Bottom bar
|
||||
if y == 18 && x >= 8 && x <= 15 {
|
||||
image_data[y * width + x] = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Set the image data
|
||||
api.set_image(
|
||||
&image_data,
|
||||
width.try_into().unwrap(),
|
||||
height.try_into().unwrap(),
|
||||
bytes_per_pixel.try_into().unwrap(),
|
||||
bytes_per_line.try_into().unwrap(),
|
||||
)?;
|
||||
|
||||
// Set whitelist for digits only
|
||||
api.set_variable("tessedit_char_whitelist", "0123456789")?;
|
||||
|
||||
// Set PSM mode to single character
|
||||
api.set_variable("tessedit_pageseg_mode", "10")?;
|
||||
|
||||
// Get the recognized text
|
||||
let text = api.get_utf8_text()?;
|
||||
println!("Recognized text: {}", text.trim());
|
||||
|
||||
Ok(())
|
||||
}
|
||||
```
|
||||
|
||||
## Advanced Usage
|
||||
|
||||
The API provides additional functionality for more complex OCR tasks, including thread-safe operations:
|
||||
|
||||
```rust
|
||||
use kreuzberg_tesseract::TesseractAPI;
|
||||
use std::sync::Arc;
|
||||
use std::thread;
|
||||
use std::error::Error;
|
||||
|
||||
fn main() -> Result<(), Box<dyn Error>> {
|
||||
let tessdata_dir = get_tessdata_dir();
|
||||
let api = TesseractAPI::new()?;
|
||||
|
||||
// Initialize the main API
|
||||
api.init(tessdata_dir.to_str().unwrap(), "eng")?;
|
||||
api.set_variable("tessedit_pageseg_mode", "1")?;
|
||||
|
||||
// Load and prepare image data
|
||||
let (image_data, width, height) = load_test_image("sample_text.png")?;
|
||||
|
||||
// Share image data across threads
|
||||
let image_data = Arc::new(image_data);
|
||||
let mut handles = vec![];
|
||||
|
||||
// Spawn multiple threads for parallel OCR processing
|
||||
for _ in 0..3 {
|
||||
let api_clone = api.clone(); // Clones the API with all configurations
|
||||
let image_data = Arc::clone(&image_data);
|
||||
|
||||
let handle = thread::spawn(move || {
|
||||
// Set image in each thread
|
||||
let res = api_clone.set_image(
|
||||
&image_data,
|
||||
width as i32,
|
||||
height as i32,
|
||||
3,
|
||||
3 * width as i32,
|
||||
);
|
||||
assert!(res.is_ok());
|
||||
|
||||
// Perform OCR in parallel
|
||||
let text = api_clone.get_utf8_text()
|
||||
.expect("Failed to get text");
|
||||
println!("Thread result: {}", text);
|
||||
});
|
||||
handles.push(handle);
|
||||
}
|
||||
|
||||
// Wait for all threads to complete
|
||||
for handle in handles {
|
||||
handle.join().unwrap();
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// Helper function to get tessdata directory
|
||||
fn get_tessdata_dir() -> PathBuf {
|
||||
// ... (implementation as shown in basic example)
|
||||
}
|
||||
|
||||
// Helper function to load test image
|
||||
fn load_test_image(filename: &str) -> Result<(Vec<u8>, u32, u32), Box<dyn Error>> {
|
||||
let img = image::open(filename)?
|
||||
.to_rgb8();
|
||||
let (width, height) = img.dimensions();
|
||||
Ok((img.into_raw(), width, height))
|
||||
}
|
||||
```
|
||||
|
||||
## Building
|
||||
|
||||
### Static Linking (Default)
|
||||
|
||||
With static linking, the crate will automatically download and compile Tesseract and Leptonica during the build process. This may take some time on the first build (5-10 minutes), but subsequent builds will use the cached libraries.
|
||||
|
||||
To clean the cache and force a rebuild:
|
||||
|
||||
```bash
|
||||
CARGO_CLEAN=1 cargo build
|
||||
```
|
||||
|
||||
### Dynamic Linking
|
||||
|
||||
With dynamic linking, the build is much faster (seconds instead of minutes) since it only links against system-installed libraries:
|
||||
|
||||
```bash
|
||||
cargo build --no-default-features --features dynamic-linking
|
||||
```
|
||||
|
||||
**Note**: Dynamic linking requires Tesseract and Leptonica to be installed on your system (see Installation section).
|
||||
|
||||
## Documentation
|
||||
|
||||
For more detailed information, please check the [API documentation](https://docs.rs/kreuzberg-tesseract).
|
||||
|
||||
## License
|
||||
|
||||
This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
|
||||
|
||||
## Acknowledgements
|
||||
|
||||
This project is based on the original [tesseract-rs](https://github.com/cafercangundogdu/tesseract-rs) by [Cafer Can Gündoğdu](https://github.com/cafercangundogdu). We are grateful for the foundational work that made this project possible.
|
||||
|
||||
## Contributing
|
||||
|
||||
We welcome contributions! Please see our [Contributing Guide](../../CONTRIBUTING.md) for details.
|
||||
|
||||
### Quick Start for Contributors
|
||||
|
||||
1. Fork and clone the repository
|
||||
2. Install uv and set up git hooks:
|
||||
|
||||
```bash
|
||||
curl -LsSf https://astral.sh/uv/install.sh | sh
|
||||
uvx prek install
|
||||
```
|
||||
|
||||
3. Make your changes following our commit message format
|
||||
4. Run tests: `cargo test`
|
||||
5. Submit a Pull Request
|
||||
|
||||
Our commit messages follow the [Conventional Commits](https://www.conventionalcommits.org/) specification.
|
||||
|
||||
## Third-Party Acknowledgements
|
||||
|
||||
This project uses [Tesseract OCR](https://github.com/tesseract-ocr/tesseract) and [Leptonica](http://leptonica.org/). We are grateful to the maintainers and contributors of these projects.
|
||||
|
||||
```text
|
||||
|
||||
```
|
||||
2011
crates/kreuzberg-tesseract/build.rs
Normal file
2011
crates/kreuzberg-tesseract/build.rs
Normal file
File diff suppressed because it is too large
Load Diff
74
crates/kreuzberg-tesseract/patches/README.md
Normal file
74
crates/kreuzberg-tesseract/patches/README.md
Normal file
@@ -0,0 +1,74 @@
|
||||
# Tesseract WASM Patches
|
||||
|
||||
This directory contains patches needed to compile Tesseract for WebAssembly (WASM) targets using WASI SDK.
|
||||
|
||||
These patches are vendored from the [tesseract-wasm](https://github.com/robertknight/tesseract-wasm) project and have been proven to work with WASM compilation.
|
||||
|
||||
## Patches
|
||||
|
||||
### tesseract.diff
|
||||
|
||||
A comprehensive patch that makes Tesseract compatible with WASM compilation. The patch includes the following changes:
|
||||
|
||||
#### 1. CMakeLists.txt Modifications
|
||||
|
||||
- **New CMake option**: `BUILD_TESSERACT_BINARY` (default: ON)
|
||||
- Allows disabling the Tesseract CLI binary build, which is not needed for WASM
|
||||
- Wraps all executable and installation targets for the tesseract binary
|
||||
|
||||
- **Disabled components for WASM**:
|
||||
- Removes OpenCL support (`src/opencl/*.cpp`) - not applicable to WASM
|
||||
- Removes viewer support (`src/viewer/*.cpp`) - UI components not needed for WASM
|
||||
- Removes C API bindings (`src/api/capi.cpp`) - only hocrrenderer is kept
|
||||
- Removes PDF and rendering support files:
|
||||
- `src/api/renderer.cpp`
|
||||
- `src/api/altorenderer.cpp`
|
||||
- `src/api/lstmboxrenderer.cpp`
|
||||
- `src/api/pdfrenderer.cpp`
|
||||
- `src/api/wordstrboxrenderer.cpp`
|
||||
|
||||
#### 2. SIMD Detection Fixes (src/arch/simddetect.cpp)
|
||||
|
||||
- Guards CPUID detection with `#if !defined(__wasm__)`
|
||||
- Prevents attempts to use CPU feature detection that doesn't exist in WASM
|
||||
- The HAS_CPUID macro is only defined for non-WASM builds
|
||||
- This allows the code to gracefully handle WASM's SIMD limitations
|
||||
|
||||
#### 3. Pointer Type Fixes (src/ccmain/pageiterator.cpp, src/ccmain/pagesegmain.cpp, src/ccmain/tesseractclass.cpp)
|
||||
|
||||
**Changed from stack allocation to heap allocation** in `tesseractclass.h`:
|
||||
|
||||
- `pixa_debug_` changed from `DebugPixa` to `std::unique_ptr<DebugPixa>`
|
||||
- This prevents large allocations on the stack, which is limited in WASM
|
||||
|
||||
**Updated all references** throughout the codebase:
|
||||
|
||||
- `.get()` calls added where raw pointers are needed
|
||||
- Arrow operator `->` replaces dot operator `.` for member access
|
||||
- Null checks added before dereferencing to prevent crashes
|
||||
|
||||
**Affected functions**:
|
||||
|
||||
- `PageIterator::Orientation()` - added null vector check
|
||||
- `Tesseract::AutoPageSeg()` - updated pointer passing
|
||||
- `Tesseract::SetupPageSegAndDetectOrientation()` - multiple pointer updates
|
||||
- `Tesseract::Clear()` - added null check before WritePDF
|
||||
- `Tesseract::PrepareForPageseg()` - updated Split() calls
|
||||
- `Tesseract::PrepareForTessOCR()` - updated Split() calls
|
||||
|
||||
#### 4. Additional Fixes
|
||||
|
||||
- **Orientation detection**: Changed comparison from `> 0.0F` to `>= 0.0F` in `pageiterator.cpp` to handle null vectors gracefully when orientation info is not available
|
||||
|
||||
## How to Apply
|
||||
|
||||
These patches are applied during the WASM build process. They modify the Tesseract source code to:
|
||||
|
||||
1. Disable WASM-incompatible features (OpenCL, viewers, renderers)
|
||||
2. Prevent CPUID detection in WASM environment
|
||||
3. Use heap allocation instead of stack allocation for large objects
|
||||
4. Handle missing pointer initialization gracefully
|
||||
|
||||
## Source
|
||||
|
||||
These patches are based on the proven WASM compilation approach used by the tesseract-wasm project, which successfully compiles Tesseract to WebAssembly and deploys it in production environments.
|
||||
199
crates/kreuzberg-tesseract/patches/tesseract.diff
Normal file
199
crates/kreuzberg-tesseract/patches/tesseract.diff
Normal file
@@ -0,0 +1,199 @@
|
||||
diff --git a/CMakeLists.txt b/CMakeLists.txt
|
||||
index 8c6845cb..fdcfc4a8 100644
|
||||
--- a/CMakeLists.txt
|
||||
+++ b/CMakeLists.txt
|
||||
@@ -90,6 +90,7 @@ option(ENABLE_LTO "Enable link-time optimization" OFF)
|
||||
option(FAST_FLOAT "Enable float for LSTM" ON)
|
||||
option(ENABLE_OPENCL "Enable unsupported experimental OpenCL support" OFF)
|
||||
option(BUILD_TRAINING_TOOLS "Build training tools" ON)
|
||||
+option(BUILD_TESSERACT_BINARY "Build Tesseract binary" ON)
|
||||
option(BUILD_TESTS "Build tests" OFF)
|
||||
option(USE_SYSTEM_ICU "Use system ICU" OFF)
|
||||
option(DISABLE_ARCHIVE "Disable build with libarchive (if available)" OFF)
|
||||
@@ -565,9 +566,7 @@ file(
|
||||
src/cutil/*.cpp
|
||||
src/dict/*.cpp
|
||||
src/lstm/*.cpp
|
||||
- src/opencl/*.cpp
|
||||
src/textord/*.cpp
|
||||
- src/viewer/*.cpp
|
||||
src/wordrec/*.cpp)
|
||||
|
||||
if(DISABLED_LEGACY_ENGINE)
|
||||
@@ -714,13 +713,7 @@ file(
|
||||
set(TESSERACT_SRC
|
||||
${TESSERACT_SRC}
|
||||
src/api/baseapi.cpp
|
||||
- src/api/capi.cpp
|
||||
- src/api/renderer.cpp
|
||||
- src/api/altorenderer.cpp
|
||||
- src/api/hocrrenderer.cpp
|
||||
- src/api/lstmboxrenderer.cpp
|
||||
- src/api/pdfrenderer.cpp
|
||||
- src/api/wordstrboxrenderer.cpp)
|
||||
+ src/api/hocrrenderer.cpp)
|
||||
|
||||
set(TESSERACT_CONFIGS
|
||||
tessdata/configs/alto
|
||||
@@ -858,14 +851,16 @@ endif()
|
||||
# EXECUTABLE tesseract
|
||||
# ##############################################################################
|
||||
|
||||
-add_executable(tesseract src/tesseract.cpp)
|
||||
-target_link_libraries(tesseract libtesseract)
|
||||
-if(HAVE_TIFFIO_H AND WIN32)
|
||||
- target_link_libraries(tesseract ${TIFF_LIBRARIES})
|
||||
-endif()
|
||||
+if(BUILD_TESSERACT_BINARY)
|
||||
+ add_executable(tesseract src/tesseract.cpp)
|
||||
+ target_link_libraries(tesseract libtesseract)
|
||||
+ if(HAVE_TIFFIO_H AND WIN32)
|
||||
+ target_link_libraries(tesseract ${TIFF_LIBRARIES})
|
||||
+ endif()
|
||||
|
||||
-if(OPENMP_BUILD AND UNIX)
|
||||
- target_link_libraries(tesseract pthread)
|
||||
+ if(OPENMP_BUILD AND UNIX)
|
||||
+ target_link_libraries(tesseract pthread)
|
||||
+ endif()
|
||||
endif()
|
||||
|
||||
# ##############################################################################
|
||||
@@ -899,7 +894,11 @@ write_basic_package_version_file(
|
||||
|
||||
install(FILES ${CMAKE_CURRENT_BINARY_DIR}/tesseract.pc
|
||||
DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig)
|
||||
-install(TARGETS tesseract DESTINATION bin)
|
||||
+
|
||||
+if(BUILD_TESSERACT_BINARY)
|
||||
+ install(TARGETS tesseract DESTINATION bin)
|
||||
+endif()
|
||||
+
|
||||
install(
|
||||
TARGETS libtesseract
|
||||
EXPORT TesseractTargets
|
||||
diff --git a/src/arch/simddetect.cpp b/src/arch/simddetect.cpp
|
||||
index 1afe5a5d..cb8c6d4c 100644
|
||||
--- a/src/arch/simddetect.cpp
|
||||
+++ b/src/arch/simddetect.cpp
|
||||
@@ -40,10 +40,12 @@
|
||||
|
||||
#endif
|
||||
|
||||
+#if !defined(__wasm__)
|
||||
#if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_FMA) || defined(HAVE_SSE4_1)
|
||||
// See https://en.wikipedia.org/wiki/CPUID.
|
||||
# define HAS_CPUID
|
||||
#endif
|
||||
+#endif
|
||||
|
||||
#if defined(HAS_CPUID)
|
||||
# if defined(__GNUC__)
|
||||
diff --git a/src/ccmain/pageiterator.cpp b/src/ccmain/pageiterator.cpp
|
||||
index 64ff7f66..c0f80e5f 100644
|
||||
--- a/src/ccmain/pageiterator.cpp
|
||||
+++ b/src/ccmain/pageiterator.cpp
|
||||
@@ -582,7 +582,9 @@ void PageIterator::Orientation(tesseract::Orientation *orientation,
|
||||
up_in_image.rotate(block->re_rotation());
|
||||
|
||||
if (up_in_image.x() == 0.0F) {
|
||||
- if (up_in_image.y() > 0.0F) {
|
||||
+ // tesseract-wasm note: `up_in_image` will be a null vector if orientation
|
||||
+ // info is not available. In that case, assume page up.
|
||||
+ if (up_in_image.y() >= 0.0F) {
|
||||
*orientation = ORIENTATION_PAGE_UP;
|
||||
} else {
|
||||
*orientation = ORIENTATION_PAGE_DOWN;
|
||||
diff --git a/src/ccmain/pagesegmain.cpp b/src/ccmain/pagesegmain.cpp
|
||||
index 0af44607..718e73ef 100644
|
||||
--- a/src/ccmain/pagesegmain.cpp
|
||||
+++ b/src/ccmain/pagesegmain.cpp
|
||||
@@ -222,7 +222,7 @@ int Tesseract::AutoPageSeg(PageSegMode pageseg_mode, BLOCK_LIST *blocks, TO_BLOC
|
||||
}
|
||||
#endif // ndef DISABLED_LEGACY_ENGINE
|
||||
result = finder->FindBlocks(pageseg_mode, scaled_color_, scaled_factor_, to_block,
|
||||
- photomask_pix, pix_thresholds_, pix_grey_, &pixa_debug_,
|
||||
+ photomask_pix, pix_thresholds_, pix_grey_, pixa_debug_.get(),
|
||||
&found_blocks, diacritic_blobs, to_blocks);
|
||||
if (result >= 0) {
|
||||
finder->GetDeskewVectors(&deskew_, &reskew_);
|
||||
@@ -279,17 +279,17 @@ ColumnFinder *Tesseract::SetupPageSegAndDetectOrientation(PageSegMode pageseg_mo
|
||||
ICOORD bleft(0, 0);
|
||||
|
||||
ASSERT_HOST(pix_binary_ != nullptr);
|
||||
- if (tessedit_dump_pageseg_images) {
|
||||
- pixa_debug_.AddPix(pix_binary_, "PageSegInput");
|
||||
+ if (tessedit_dump_pageseg_images && pixa_debug_) {
|
||||
+ pixa_debug_->AddPix(pix_binary_, "PageSegInput");
|
||||
}
|
||||
// Leptonica is used to find the rule/separator lines in the input.
|
||||
LineFinder::FindAndRemoveLines(source_resolution_, textord_tabfind_show_vlines, pix_binary_,
|
||||
&vertical_x, &vertical_y, music_mask_pix, &v_lines, &h_lines);
|
||||
- if (tessedit_dump_pageseg_images) {
|
||||
- pixa_debug_.AddPix(pix_binary_, "NoLines");
|
||||
+ if (tessedit_dump_pageseg_images && pixa_debug_) {
|
||||
+ pixa_debug_->AddPix(pix_binary_, "NoLines");
|
||||
}
|
||||
// Leptonica is used to find a mask of the photo regions in the input.
|
||||
- *photo_mask_pix = ImageFind::FindImages(pix_binary_, &pixa_debug_);
|
||||
+ *photo_mask_pix = ImageFind::FindImages(pix_binary_, pixa_debug_.get());
|
||||
if (tessedit_dump_pageseg_images) {
|
||||
Image pix_no_image_ = nullptr;
|
||||
if (*photo_mask_pix != nullptr) {
|
||||
@@ -297,7 +297,7 @@ ColumnFinder *Tesseract::SetupPageSegAndDetectOrientation(PageSegMode pageseg_mo
|
||||
} else {
|
||||
pix_no_image_ = pix_binary_.clone();
|
||||
}
|
||||
- pixa_debug_.AddPix(pix_no_image_, "NoImages");
|
||||
+ pixa_debug_->AddPix(pix_no_image_, "NoImages");
|
||||
pix_no_image_.destroy();
|
||||
}
|
||||
if (!PSM_COL_FIND_ENABLED(pageseg_mode)) {
|
||||
diff --git a/src/ccmain/tesseractclass.cpp b/src/ccmain/tesseractclass.cpp
|
||||
index fd58ac87..517f925e 100644
|
||||
--- a/src/ccmain/tesseractclass.cpp
|
||||
+++ b/src/ccmain/tesseractclass.cpp
|
||||
@@ -487,8 +487,10 @@ Dict &Tesseract::getDict() {
|
||||
}
|
||||
|
||||
void Tesseract::Clear() {
|
||||
- std::string debug_name = imagebasename + "_debug.pdf";
|
||||
- pixa_debug_.WritePDF(debug_name.c_str());
|
||||
+ if (pixa_debug_) {
|
||||
+ std::string debug_name = imagebasename + "_debug.pdf";
|
||||
+ pixa_debug_->WritePDF(debug_name.c_str());
|
||||
+ }
|
||||
pix_binary_.destroy();
|
||||
pix_grey_.destroy();
|
||||
pix_thresholds_.destroy();
|
||||
@@ -572,7 +574,7 @@ void Tesseract::PrepareForPageseg() {
|
||||
// the newly split image.
|
||||
splitter_.set_orig_pix(pix_binary());
|
||||
splitter_.set_pageseg_split_strategy(max_pageseg_strategy);
|
||||
- if (splitter_.Split(true, &pixa_debug_)) {
|
||||
+ if (splitter_.Split(true, pixa_debug_.get())) {
|
||||
ASSERT_HOST(splitter_.splitted_image());
|
||||
pix_binary_.destroy();
|
||||
pix_binary_ = splitter_.splitted_image().clone();
|
||||
@@ -599,7 +601,7 @@ void Tesseract::PrepareForTessOCR(BLOCK_LIST *block_list, Tesseract *osd_tess, O
|
||||
splitter_.set_segmentation_block_list(block_list);
|
||||
splitter_.set_ocr_split_strategy(max_ocr_strategy);
|
||||
// Run the splitter for OCR
|
||||
- bool split_for_ocr = splitter_.Split(false, &pixa_debug_);
|
||||
+ bool split_for_ocr = splitter_.Split(false, pixa_debug_.get());
|
||||
// Restore pix_binary to the binarized original pix for future reference.
|
||||
ASSERT_HOST(splitter_.orig_pix());
|
||||
pix_binary_.destroy();
|
||||
diff --git a/src/ccmain/tesseractclass.h b/src/ccmain/tesseractclass.h
|
||||
index 732bb9e6..030aa5bc 100644
|
||||
--- a/src/ccmain/tesseractclass.h
|
||||
+++ b/src/ccmain/tesseractclass.h
|
||||
@@ -986,7 +986,7 @@ private:
|
||||
// Thresholds that were used to generate the thresholded image from grey.
|
||||
Image pix_thresholds_;
|
||||
// Debug images. If non-empty, will be written on destruction.
|
||||
- DebugPixa pixa_debug_;
|
||||
+ std::unique_ptr<DebugPixa> pixa_debug_;
|
||||
// Input image resolution after any scaling. The resolution is not well
|
||||
// transmitted by operations on Pix, so we keep an independent record here.
|
||||
int source_resolution_;
|
||||
2309
crates/kreuzberg-tesseract/src/api.rs
Normal file
2309
crates/kreuzberg-tesseract/src/api.rs
Normal file
File diff suppressed because it is too large
Load Diff
77
crates/kreuzberg-tesseract/src/choice_iterator.rs
Normal file
77
crates/kreuzberg-tesseract/src/choice_iterator.rs
Normal file
@@ -0,0 +1,77 @@
|
||||
use crate::api::TessDeleteText;
|
||||
use crate::error::{Result, TesseractError};
|
||||
use std::ffi::CStr;
|
||||
use std::os::raw::{c_char, c_float, c_int, c_void};
|
||||
use std::sync::{Arc, Mutex};
|
||||
|
||||
pub struct ChoiceIterator {
|
||||
handle: Arc<Mutex<*mut c_void>>,
|
||||
}
|
||||
|
||||
unsafe impl Send for ChoiceIterator {}
|
||||
unsafe impl Sync for ChoiceIterator {}
|
||||
|
||||
impl ChoiceIterator {
|
||||
/// Creates a new instance of the ChoiceIterator.
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// * `handle` - Pointer to the ChoiceIterator.
|
||||
pub fn new(handle: *mut c_void) -> Self {
|
||||
ChoiceIterator {
|
||||
handle: Arc::new(Mutex::new(handle)),
|
||||
}
|
||||
}
|
||||
|
||||
/// Gets the next choice.
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// Returns `true` if the next choice is successful, otherwise returns `false`.
|
||||
pub fn next(&self) -> Result<bool> {
|
||||
let handle = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
|
||||
Ok(unsafe { TessChoiceIteratorNext(*handle) != 0 })
|
||||
}
|
||||
|
||||
/// Gets the UTF-8 text for the current choice.
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// Returns the UTF-8 text as a `String` if successful, otherwise returns an error.
|
||||
pub fn get_utf8_text(&self) -> Result<String> {
|
||||
let handle = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
|
||||
let text_ptr = unsafe { TessChoiceIteratorGetUTF8Text(*handle) };
|
||||
if text_ptr.is_null() {
|
||||
return Err(TesseractError::NullPointerError);
|
||||
}
|
||||
let c_str = unsafe { CStr::from_ptr(text_ptr) };
|
||||
let result = c_str.to_str()?.to_owned();
|
||||
unsafe { TessDeleteText(text_ptr) };
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
/// Gets the confidence of the current choice.
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// Returns the confidence as a `f32`.
|
||||
pub fn confidence(&self) -> Result<f32> {
|
||||
let handle = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
|
||||
Ok(unsafe { TessChoiceIteratorConfidence(*handle) })
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for ChoiceIterator {
|
||||
fn drop(&mut self) {
|
||||
if let Ok(handle) = self.handle.lock() {
|
||||
unsafe { TessChoiceIteratorDelete(*handle) };
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
ffi_extern! {
|
||||
fn TessChoiceIteratorDelete(handle: *mut c_void);
|
||||
fn TessChoiceIteratorNext(handle: *mut c_void) -> c_int;
|
||||
fn TessChoiceIteratorGetUTF8Text(handle: *mut c_void) -> *mut c_char;
|
||||
fn TessChoiceIteratorConfidence(handle: *mut c_void) -> c_float;
|
||||
}
|
||||
373
crates/kreuzberg-tesseract/src/enums.rs
Normal file
373
crates/kreuzberg-tesseract/src/enums.rs
Normal file
@@ -0,0 +1,373 @@
|
||||
/// Page segmentation modes, with explicit discriminants `0..=14`.
///
/// The enum is `#[repr(C)]` with fixed values so it can be passed across the
/// FFI boundary as an integer-backed C enum.
/// NOTE(review): the values are expected to mirror Tesseract's own
/// `PageSegMode` — confirm against the linked Tesseract version.
#[repr(C)]
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[allow(non_camel_case_types)]
pub enum TessPageSegMode {
    PSM_OSD_ONLY = 0,
    PSM_AUTO_OSD = 1,
    PSM_AUTO_ONLY = 2,
    PSM_AUTO = 3,
    PSM_SINGLE_COLUMN = 4,
    PSM_SINGLE_BLOCK_VERT_TEXT = 5,
    PSM_SINGLE_BLOCK = 6,
    PSM_SINGLE_LINE = 7,
    PSM_SINGLE_WORD = 8,
    PSM_CIRCLE_WORD = 9,
    PSM_SINGLE_CHAR = 10,
    PSM_SPARSE_TEXT = 11,
    PSM_SPARSE_TEXT_OSD = 12,
    PSM_RAW_LINE = 13,
    PSM_COUNT = 14,
}

impl TessPageSegMode {
    /// Converts a raw integer into a mode, falling back to `PSM_AUTO` for any
    /// value outside `0..=14`.
    ///
    /// Use [`TessPageSegMode::try_from_int`] when an unknown value must be
    /// detected instead of silently replaced.
    pub fn from_int(value: i32) -> Self {
        // Single source of truth: the mapping lives in `try_from_int`.
        Self::try_from_int(value).unwrap_or(Self::PSM_AUTO)
    }

    /// Safely convert an integer to a TessPageSegMode, returning None for invalid values.
    pub fn try_from_int(value: i32) -> Option<Self> {
        let mode = match value {
            0 => Self::PSM_OSD_ONLY,
            1 => Self::PSM_AUTO_OSD,
            2 => Self::PSM_AUTO_ONLY,
            3 => Self::PSM_AUTO,
            4 => Self::PSM_SINGLE_COLUMN,
            5 => Self::PSM_SINGLE_BLOCK_VERT_TEXT,
            6 => Self::PSM_SINGLE_BLOCK,
            7 => Self::PSM_SINGLE_LINE,
            8 => Self::PSM_SINGLE_WORD,
            9 => Self::PSM_CIRCLE_WORD,
            10 => Self::PSM_SINGLE_CHAR,
            11 => Self::PSM_SPARSE_TEXT,
            12 => Self::PSM_SPARSE_TEXT_OSD,
            13 => Self::PSM_RAW_LINE,
            14 => Self::PSM_COUNT,
            _ => return None,
        };
        Some(mode)
    }
}
|
||||
|
||||
/// Page iterator levels, with explicit discriminants `0..=4`.
///
/// The enum is `#[repr(C)]` with fixed values so it can be passed across the
/// FFI boundary as an integer-backed C enum.
/// NOTE(review): the values are expected to mirror Tesseract's own
/// `PageIteratorLevel` — confirm against the linked Tesseract version.
#[repr(C)]
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[allow(non_camel_case_types)]
pub enum TessPageIteratorLevel {
    RIL_BLOCK = 0,
    RIL_PARA = 1,
    RIL_TEXTLINE = 2,
    RIL_WORD = 3,
    RIL_SYMBOL = 4,
}

impl TessPageIteratorLevel {
    /// Converts a raw integer into a level, falling back to `RIL_BLOCK` for
    /// any value outside `0..=4`.
    ///
    /// Use [`TessPageIteratorLevel::try_from_int`] when an unknown value must
    /// be detected instead of silently replaced.
    pub fn from_int(value: i32) -> Self {
        // Single source of truth: the mapping lives in `try_from_int`.
        Self::try_from_int(value).unwrap_or(Self::RIL_BLOCK)
    }

    /// Safely convert an integer to a TessPageIteratorLevel, returning None for invalid values.
    pub fn try_from_int(value: i32) -> Option<Self> {
        let level = match value {
            0 => Self::RIL_BLOCK,
            1 => Self::RIL_PARA,
            2 => Self::RIL_TEXTLINE,
            3 => Self::RIL_WORD,
            4 => Self::RIL_SYMBOL,
            _ => return None,
        };
        Some(level)
    }
}
|
||||
|
||||
/// Block (region) types, with explicit discriminants `0..=15`.
///
/// The enum is `#[repr(C)]` with fixed values so it can be passed across the
/// FFI boundary as an integer-backed C enum.
/// NOTE(review): the values are expected to mirror Tesseract's own
/// `PolyBlockType` — confirm against the linked Tesseract version.
#[repr(C)]
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[allow(non_camel_case_types)]
pub enum TessPolyBlockType {
    PT_UNKNOWN = 0,
    PT_FLOWING_TEXT = 1,
    PT_HEADING_TEXT = 2,
    PT_PULLOUT_TEXT = 3,
    PT_EQUATION = 4,
    PT_INLINE_EQUATION = 5,
    PT_TABLE = 6,
    PT_VERTICAL_TEXT = 7,
    PT_CAPTION_TEXT = 8,
    PT_FLOWING_IMAGE = 9,
    PT_HEADING_IMAGE = 10,
    PT_PULLOUT_IMAGE = 11,
    PT_HORZ_LINE = 12,
    PT_VERT_LINE = 13,
    PT_NOISE = 14,
    PT_COUNT = 15,
}

impl TessPolyBlockType {
    /// Converts a raw integer into a block type, falling back to `PT_UNKNOWN`
    /// for any value outside `0..=15`.
    ///
    /// Use [`TessPolyBlockType::try_from_int`] when an unknown value must be
    /// detected instead of silently replaced.
    pub fn from_int(value: i32) -> Self {
        // Single source of truth: the mapping lives in `try_from_int`.
        Self::try_from_int(value).unwrap_or(Self::PT_UNKNOWN)
    }

    /// Safely convert an integer to a TessPolyBlockType, returning None for invalid values.
    pub fn try_from_int(value: i32) -> Option<Self> {
        let block_type = match value {
            0 => Self::PT_UNKNOWN,
            1 => Self::PT_FLOWING_TEXT,
            2 => Self::PT_HEADING_TEXT,
            3 => Self::PT_PULLOUT_TEXT,
            4 => Self::PT_EQUATION,
            5 => Self::PT_INLINE_EQUATION,
            6 => Self::PT_TABLE,
            7 => Self::PT_VERTICAL_TEXT,
            8 => Self::PT_CAPTION_TEXT,
            9 => Self::PT_FLOWING_IMAGE,
            10 => Self::PT_HEADING_IMAGE,
            11 => Self::PT_PULLOUT_IMAGE,
            12 => Self::PT_HORZ_LINE,
            13 => Self::PT_VERT_LINE,
            14 => Self::PT_NOISE,
            15 => Self::PT_COUNT,
            _ => return None,
        };
        Some(block_type)
    }
}
|
||||
|
||||
/// Page orientations, with explicit discriminants `0..=3`.
///
/// The enum is `#[repr(C)]` with fixed values so it can be passed across the
/// FFI boundary as an integer-backed C enum.
/// NOTE(review): the values are expected to mirror Tesseract's own
/// `Orientation` — confirm against the linked Tesseract version.
#[repr(C)]
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[allow(non_camel_case_types)]
pub enum TessOrientation {
    ORIENTATION_PAGE_UP = 0,
    ORIENTATION_PAGE_RIGHT = 1,
    ORIENTATION_PAGE_DOWN = 2,
    ORIENTATION_PAGE_LEFT = 3,
}

impl TessOrientation {
    /// Converts a raw integer into an orientation, falling back to
    /// `ORIENTATION_PAGE_UP` for any value outside `0..=3`.
    ///
    /// Use [`TessOrientation::try_from_int`] when an unknown value must be
    /// detected instead of silently replaced.
    pub fn from_int(value: i32) -> Self {
        // Single source of truth: the mapping lives in `try_from_int`.
        Self::try_from_int(value).unwrap_or(Self::ORIENTATION_PAGE_UP)
    }

    /// Safely convert an integer to a TessOrientation, returning None for invalid values.
    pub fn try_from_int(value: i32) -> Option<Self> {
        let orientation = match value {
            0 => Self::ORIENTATION_PAGE_UP,
            1 => Self::ORIENTATION_PAGE_RIGHT,
            2 => Self::ORIENTATION_PAGE_DOWN,
            3 => Self::ORIENTATION_PAGE_LEFT,
            _ => return None,
        };
        Some(orientation)
    }
}
|
||||
|
||||
/// Paragraph justification kinds, with explicit discriminants `0..=3`.
///
/// The enum is `#[repr(C)]` with fixed values so it can be passed across the
/// FFI boundary as an integer-backed C enum.
/// NOTE(review): the values are expected to mirror Tesseract's own
/// `ParagraphJustification` — confirm against the linked Tesseract version.
#[repr(C)]
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[allow(non_camel_case_types)]
pub enum TessParagraphJustification {
    JUSTIFICATION_UNKNOWN = 0,
    JUSTIFICATION_LEFT = 1,
    JUSTIFICATION_CENTER = 2,
    JUSTIFICATION_RIGHT = 3,
}

impl TessParagraphJustification {
    /// Converts a raw integer into a justification, falling back to
    /// `JUSTIFICATION_UNKNOWN` for any value outside `0..=3`.
    ///
    /// Use [`TessParagraphJustification::try_from_int`] when an unknown value
    /// must be detected instead of silently replaced.
    pub fn from_int(value: i32) -> Self {
        // Single source of truth: the mapping lives in `try_from_int`.
        Self::try_from_int(value).unwrap_or(Self::JUSTIFICATION_UNKNOWN)
    }

    /// Safely convert an integer to a TessParagraphJustification, returning None for invalid values.
    pub fn try_from_int(value: i32) -> Option<Self> {
        let justification = match value {
            0 => Self::JUSTIFICATION_UNKNOWN,
            1 => Self::JUSTIFICATION_LEFT,
            2 => Self::JUSTIFICATION_CENTER,
            3 => Self::JUSTIFICATION_RIGHT,
            _ => return None,
        };
        Some(justification)
    }
}
|
||||
|
||||
/// Writing directions, with explicit discriminants `0..=2`.
///
/// The enum is `#[repr(C)]` with fixed values so it can be passed across the
/// FFI boundary as an integer-backed C enum.
/// NOTE(review): the values are expected to mirror Tesseract's own
/// `WritingDirection` — confirm against the linked Tesseract version.
#[repr(C)]
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[allow(non_camel_case_types)]
pub enum TessWritingDirection {
    WRITING_DIRECTION_LEFT_TO_RIGHT = 0,
    WRITING_DIRECTION_RIGHT_TO_LEFT = 1,
    WRITING_DIRECTION_TOP_TO_BOTTOM = 2,
}

impl TessWritingDirection {
    /// Converts a raw integer into a writing direction, falling back to
    /// `WRITING_DIRECTION_LEFT_TO_RIGHT` for any value outside `0..=2`.
    ///
    /// Use [`TessWritingDirection::try_from_int`] when an unknown value must
    /// be detected instead of silently replaced.
    pub fn from_int(value: i32) -> Self {
        // Single source of truth: the mapping lives in `try_from_int`.
        Self::try_from_int(value).unwrap_or(Self::WRITING_DIRECTION_LEFT_TO_RIGHT)
    }

    /// Safely convert an integer to a TessWritingDirection, returning None for invalid values.
    pub fn try_from_int(value: i32) -> Option<Self> {
        let direction = match value {
            0 => Self::WRITING_DIRECTION_LEFT_TO_RIGHT,
            1 => Self::WRITING_DIRECTION_RIGHT_TO_LEFT,
            2 => Self::WRITING_DIRECTION_TOP_TO_BOTTOM,
            _ => return None,
        };
        Some(direction)
    }
}
|
||||
|
||||
/// Text-line orderings, with explicit discriminants `0..=2`.
///
/// The enum is `#[repr(C)]` with fixed values so it can be passed across the
/// FFI boundary as an integer-backed C enum.
/// NOTE(review): the values are expected to mirror Tesseract's own
/// `TextlineOrder` — confirm against the linked Tesseract version.
#[repr(C)]
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[allow(non_camel_case_types)]
pub enum TessTextlineOrder {
    TEXTLINE_ORDER_LEFT_TO_RIGHT = 0,
    TEXTLINE_ORDER_RIGHT_TO_LEFT = 1,
    TEXTLINE_ORDER_TOP_TO_BOTTOM = 2,
}

impl TessTextlineOrder {
    /// Converts a raw integer into a text-line order, falling back to
    /// `TEXTLINE_ORDER_LEFT_TO_RIGHT` for any value outside `0..=2`.
    ///
    /// Use [`TessTextlineOrder::try_from_int`] when an unknown value must be
    /// detected instead of silently replaced.
    pub fn from_int(value: i32) -> Self {
        // Single source of truth: the mapping lives in `try_from_int`.
        Self::try_from_int(value).unwrap_or(Self::TEXTLINE_ORDER_LEFT_TO_RIGHT)
    }

    /// Safely convert an integer to a TessTextlineOrder, returning None for invalid values.
    pub fn try_from_int(value: i32) -> Option<Self> {
        let order = match value {
            0 => Self::TEXTLINE_ORDER_LEFT_TO_RIGHT,
            1 => Self::TEXTLINE_ORDER_RIGHT_TO_LEFT,
            2 => Self::TEXTLINE_ORDER_TOP_TO_BOTTOM,
            _ => return None,
        };
        Some(order)
    }
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    //! Unit tests for the integer <-> enum conversions.
    //!
    //! `from_int` tests pin the documented fallback for out-of-range values;
    //! `try_from_int` tests pin that the same values are reported as `None`.

    use super::*;

    #[test]
    fn test_page_seg_mode_from_int() {
        assert_eq!(TessPageSegMode::from_int(0), TessPageSegMode::PSM_OSD_ONLY);
        assert_eq!(TessPageSegMode::from_int(3), TessPageSegMode::PSM_AUTO);
        assert_eq!(TessPageSegMode::from_int(10), TessPageSegMode::PSM_SINGLE_CHAR);
        assert_eq!(TessPageSegMode::from_int(999), TessPageSegMode::PSM_AUTO);
    }

    #[test]
    fn test_page_seg_mode_try_from_int() {
        assert_eq!(TessPageSegMode::try_from_int(0), Some(TessPageSegMode::PSM_OSD_ONLY));
        assert_eq!(TessPageSegMode::try_from_int(13), Some(TessPageSegMode::PSM_RAW_LINE));
        assert_eq!(TessPageSegMode::try_from_int(15), None);
        assert_eq!(TessPageSegMode::try_from_int(-1), None);
    }

    #[test]
    fn test_page_seg_mode_conversion() {
        let mode = TessPageSegMode::PSM_SINGLE_LINE;
        assert_eq!(mode as i32, 7);
    }

    #[test]
    fn test_page_iterator_level_from_int() {
        assert_eq!(TessPageIteratorLevel::from_int(0), TessPageIteratorLevel::RIL_BLOCK);
        assert_eq!(TessPageIteratorLevel::from_int(3), TessPageIteratorLevel::RIL_WORD);
        assert_eq!(TessPageIteratorLevel::from_int(-1), TessPageIteratorLevel::RIL_BLOCK);
    }

    #[test]
    fn test_poly_block_type_from_int() {
        assert_eq!(TessPolyBlockType::from_int(1), TessPolyBlockType::PT_FLOWING_TEXT);
        assert_eq!(TessPolyBlockType::from_int(6), TessPolyBlockType::PT_TABLE);
        assert_eq!(TessPolyBlockType::from_int(100), TessPolyBlockType::PT_UNKNOWN);
    }

    #[test]
    fn test_poly_block_type_try_from_int() {
        assert_eq!(TessPolyBlockType::try_from_int(6), Some(TessPolyBlockType::PT_TABLE));
        assert_eq!(TessPolyBlockType::try_from_int(14), Some(TessPolyBlockType::PT_NOISE));
        assert_eq!(TessPolyBlockType::try_from_int(16), None);
        assert_eq!(TessPolyBlockType::try_from_int(-1), None);
    }

    #[test]
    fn test_orientation_from_int() {
        assert_eq!(TessOrientation::from_int(0), TessOrientation::ORIENTATION_PAGE_UP);
        assert_eq!(TessOrientation::from_int(2), TessOrientation::ORIENTATION_PAGE_DOWN);
        assert_eq!(TessOrientation::from_int(5), TessOrientation::ORIENTATION_PAGE_UP);
    }

    #[test]
    fn test_orientation_try_from_int() {
        assert_eq!(
            TessOrientation::try_from_int(3),
            Some(TessOrientation::ORIENTATION_PAGE_LEFT)
        );
        assert_eq!(TessOrientation::try_from_int(4), None);
        assert_eq!(TessOrientation::try_from_int(-1), None);
    }

    #[test]
    fn test_paragraph_justification_from_int() {
        assert_eq!(
            TessParagraphJustification::from_int(1),
            TessParagraphJustification::JUSTIFICATION_LEFT
        );
        assert_eq!(
            TessParagraphJustification::from_int(3),
            TessParagraphJustification::JUSTIFICATION_RIGHT
        );
        assert_eq!(
            TessParagraphJustification::from_int(-1),
            TessParagraphJustification::JUSTIFICATION_UNKNOWN
        );
    }

    #[test]
    fn test_writing_direction_from_int() {
        assert_eq!(
            TessWritingDirection::from_int(0),
            TessWritingDirection::WRITING_DIRECTION_LEFT_TO_RIGHT
        );
        assert_eq!(
            TessWritingDirection::from_int(1),
            TessWritingDirection::WRITING_DIRECTION_RIGHT_TO_LEFT
        );
        assert_eq!(
            TessWritingDirection::from_int(10),
            TessWritingDirection::WRITING_DIRECTION_LEFT_TO_RIGHT
        );
    }

    #[test]
    fn test_writing_direction_try_from_int() {
        assert_eq!(
            TessWritingDirection::try_from_int(2),
            Some(TessWritingDirection::WRITING_DIRECTION_TOP_TO_BOTTOM)
        );
        assert_eq!(TessWritingDirection::try_from_int(3), None);
        assert_eq!(TessWritingDirection::try_from_int(-1), None);
    }

    #[test]
    fn test_textline_order_from_int() {
        assert_eq!(
            TessTextlineOrder::from_int(0),
            TessTextlineOrder::TEXTLINE_ORDER_LEFT_TO_RIGHT
        );
        assert_eq!(
            TessTextlineOrder::from_int(2),
            TessTextlineOrder::TEXTLINE_ORDER_TOP_TO_BOTTOM
        );
        assert_eq!(
            TessTextlineOrder::from_int(99),
            TessTextlineOrder::TEXTLINE_ORDER_LEFT_TO_RIGHT
        );
    }

    #[test]
    fn test_textline_order_try_from_int() {
        assert_eq!(
            TessTextlineOrder::try_from_int(1),
            Some(TessTextlineOrder::TEXTLINE_ORDER_RIGHT_TO_LEFT)
        );
        assert_eq!(TessTextlineOrder::try_from_int(3), None);
        assert_eq!(TessTextlineOrder::try_from_int(-1), None);
    }

    #[test]
    fn test_enums_are_copy() {
        fn assert_copy<T: Copy>() {}
        assert_copy::<TessPageSegMode>();
        assert_copy::<TessPageIteratorLevel>();
        assert_copy::<TessPolyBlockType>();
        assert_copy::<TessOrientation>();
        assert_copy::<TessParagraphJustification>();
        assert_copy::<TessWritingDirection>();
        assert_copy::<TessTextlineOrder>();
    }
}
|
||||
85
crates/kreuzberg-tesseract/src/error.rs
Normal file
85
crates/kreuzberg-tesseract/src/error.rs
Normal file
@@ -0,0 +1,85 @@
|
||||
use std::str::Utf8Error;
|
||||
use thiserror::Error;
|
||||
|
||||
/// Errors that can occur when using the Tesseract API.
///
/// Each variant's `Display` text is the message given in its `#[error(...)]`
/// attribute. Only [`TesseractError::Utf8Error`] carries an underlying source
/// error and only [`TesseractError::InvalidEnumValue`] carries a payload.
#[derive(Error, Debug)]
pub enum TesseractError {
    /// Tesseract could not be initialized.
    #[error("Failed to initialize Tesseract")]
    InitError,
    /// The input image could not be set.
    #[error("Failed to set image")]
    SetImageError,
    /// An OCR operation failed.
    #[error("OCR operation failed")]
    OcrError,
    /// Text returned by Tesseract was not valid UTF-8; wraps the `Utf8Error`
    /// and is produced automatically by `?` through the `#[from]` conversion.
    #[error("Invalid UTF-8 in Tesseract output")]
    Utf8Error(#[from] Utf8Error),
    /// A mutex guarding a native handle could not be locked.
    #[error("Failed to lock mutex")]
    MutexLockError,
    /// A Tesseract variable could not be set.
    #[error("Failed to set variable")]
    SetVariableError,
    /// A Tesseract variable could not be read.
    #[error("Failed to get variable")]
    GetVariableError,
    /// A null pointer was encountered where a valid pointer was required.
    #[error("Null pointer error")]
    NullPointerError,
    /// A parameter was invalid.
    #[error("Invalid parameter")]
    InvalidParameterError,
    /// Layout analysis failed.
    #[error("Layout analysis failed")]
    AnalyseLayoutError,
    /// Page processing failed.
    #[error("Page processing failed")]
    ProcessPagesError,
    /// An I/O error occurred (no underlying error is attached).
    #[error("I/O error")]
    IoError,
    /// A mutex-related failure.
    // NOTE(review): overlaps with `MutexLockError`; how callers distinguish
    // the two is not visible from this file.
    #[error("Mutex error")]
    MutexError,
    /// Image dimensions were invalid.
    #[error("Invalid dimensions")]
    InvalidDimensions,
    /// The bytes-per-pixel value was invalid.
    #[error("Invalid bytes per pixel")]
    InvalidBytesPerPixel,
    /// The bytes-per-line value was invalid.
    #[error("Invalid bytes per line")]
    InvalidBytesPerLine,
    /// The image data was invalid.
    #[error("Invalid image data")]
    InvalidImageData,
    /// Something was used before being initialized.
    #[error("Uninitialized error")]
    UninitializedError,
    /// An integer did not correspond to a known enum value; carries the
    /// offending integer, which is included in the message.
    #[error("Invalid enum value: {0}")]
    InvalidEnumValue(i32),
    /// A string contained an interior NUL byte.
    #[error("String contains null byte")]
    NullByteInString,
}

/// Result type for Tesseract operations.
///
/// Shorthand for `std::result::Result<T, TesseractError>`.
pub type Result<T> = std::result::Result<T, TesseractError>;
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Each listed variant renders exactly the message from its `#[error]` attribute.
    #[test]
    fn test_error_display() {
        let expectations = [
            (TesseractError::InitError, "Failed to initialize Tesseract"),
            (TesseractError::SetImageError, "Failed to set image"),
            (TesseractError::OcrError, "OCR operation failed"),
        ];

        for (error, message) in expectations {
            assert_eq!(error.to_string(), message);
        }
    }

    /// A `Utf8Error` converts into the `Utf8Error` variant via `From`.
    #[test]
    fn test_utf8_error_conversion() {
        let invalid_utf8 = vec![0xFF, 0xFE];
        let source = std::str::from_utf8(&invalid_utf8).unwrap_err();
        let converted = TesseractError::from(source);

        assert!(
            matches!(converted, TesseractError::Utf8Error(_)),
            "Expected Utf8Error variant"
        );
    }

    /// The error type can be moved to and shared between threads.
    #[test]
    fn test_error_is_send_sync() {
        fn assert_send_sync<T: Send + Sync>() {}
        assert_send_sync::<TesseractError>();
    }
}
|
||||
807
crates/kreuzberg-tesseract/src/leptonica.rs
Normal file
807
crates/kreuzberg-tesseract/src/leptonica.rs
Normal file
@@ -0,0 +1,807 @@
|
||||
//! Safe Leptonica Pix wrapper for image preprocessing before OCR.
|
||||
//!
|
||||
//! Provides a safe Rust wrapper around the Leptonica image-processing library.
|
||||
//! `Pix` is the core Leptonica image type. All methods return `Result<Pix>`,
|
||||
//! and the wrapper takes care of proper memory management via `Drop`.
|
||||
//!
|
||||
//! ## Pixel format
|
||||
//!
|
||||
//! Leptonica's 32 bpp format stores each pixel as a native 32-bit integer
|
||||
//! with the logical layout (MSB→LSB): `R G B A`, i.e.
|
||||
//! `(r << 24) | (g << 16) | (b << 8) | alpha`. Leptonica accesses
|
||||
//! individual channels via bit-shift on the integer value, not via
|
||||
//! byte-addressed pointer arithmetic, so the packing is identical on both
|
||||
//! big- and little-endian hosts. Do **not** call `pixEndianByteSwap` after
|
||||
//! writing pixels this way — doing so inverts the channel order.
|
||||
//!
|
||||
//! ## `pixDeskew` requires a binary (1 bpp) image
|
||||
//!
|
||||
//! Call `to_grayscale()` followed by `adaptive_threshold()` before `deskew()`.
|
||||
//! `pixDeskew` internally calls `pixFindSkewSweepAndSearchScorePivot` which
|
||||
//! operates on 1-bit images only; passing a colour image will return a null
|
||||
//! pointer.
|
||||
|
||||
use crate::error::{Result, TesseractError};
|
||||
use std::ffi::c_void;
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Raw Leptonica FFI declarations
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
#[cfg(any(feature = "build-tesseract", feature = "build-tesseract-wasm"))]
ffi_extern! {
    /// Allocates a new Pix with the given dimensions and bit depth.
    ///
    /// Returns null on failure.
    fn pixCreate(width: i32, height: i32, depth: i32) -> *mut c_void;

    /// Frees a Pix and sets the caller's pointer to null.
    ///
    /// Leptonica uses a double-pointer convention: `*ppix` is set to null
    /// after the call so that accidental double-frees are a no-op.
    fn pixDestroy(ppix: *mut *mut c_void);

    /// Sets the horizontal and vertical resolution (DPI) on a Pix.
    ///
    /// Returns 0 on success, non-zero on error.
    fn pixSetResolution(pix: *mut c_void, xres: i32, yres: i32) -> i32;

    /// Returns the width of the Pix in pixels.
    fn pixGetWidth(pix: *const c_void) -> i32;

    /// Returns the height of the Pix in pixels.
    fn pixGetHeight(pix: *const c_void) -> i32;

    /// Returns the bit depth of the Pix (1, 2, 4, 8, 16, or 32).
    fn pixGetDepth(pix: *const c_void) -> i32;

    /// Returns the number of 32-bit words per row (words-per-line).
    fn pixGetWpl(pix: *const c_void) -> i32;

    /// Returns a mutable pointer to the start of the pixel data array.
    ///
    /// The data is stored as rows of 32-bit words; each word covers 32/depth pixels.
    fn pixGetData(pix: *mut c_void) -> *mut u32;

    /// Deskews an image using a sweep-and-search skew estimate.
    ///
    /// `redsearch` is the reduction factor used during the search; pass 0 for
    /// the Leptonica default (2x reduction). Returns a deskewed Pix on
    /// success, or null on failure. The input Pix is **not** consumed.
    ///
    /// Leptonica documents `pixs` as "any depth" (it binarises internally when
    /// needed) and documents that a ref-counted *clone* of the input is
    /// returned when the estimated angle is too small or the confidence too
    /// low to justify a rotation.
    fn pixDeskew(pixs: *mut c_void, redsearch: i32) -> *mut c_void;

    /// Estimates the skew angle and confidence for a 1 bpp image.
    ///
    /// Writes the angle (degrees, positive = counter-clockwise) into `*pangle`
    /// and a confidence score (0–1) into `*pconf`. Returns 0 on success.
    fn pixFindSkew(pixs: *mut c_void, pangle: *mut f32, pconf: *mut f32) -> i32;

    /// Applies Otsu adaptive thresholding to produce a binarised Pix.
    ///
    /// `sx`/`sy` are the tile dimensions; `smoothx`/`smoothy` are half-widths
    /// for smoothing the threshold map; `scorefract` controls threshold acceptance
    /// (typical value: 0.1). `ppixth` (optional) receives the threshold image;
    /// `ppixd` receives the binarised output. Returns 0 on success.
    fn pixOtsuAdaptiveThreshold(
        pixs: *mut c_void,
        sx: i32,
        sy: i32,
        smoothx: i32,
        smoothy: i32,
        scorefract: f32,
        ppixth: *mut *mut c_void,
        ppixd: *mut *mut c_void,
    ) -> i32;

    /// Normalises the background of an image using morphological operations.
    ///
    /// `pixim` is an optional mask image; pass null when no mask is used.
    /// `reduction` is the subsampling factor (e.g. 4), `size` is the morphological
    /// structuring-element half-size (e.g. 15), and `bgval` is the target background
    /// value (e.g. 200). Returns a new normalised Pix, or null on failure.
    fn pixBackgroundNormMorph(
        pixs: *mut c_void,
        pixim: *mut c_void,
        reduction: i32,
        size: i32,
        bgval: i32,
    ) -> *mut c_void;

    /// Applies unsharp masking to sharpen a grayscale or colour Pix.
    ///
    /// `halfwidth` is the half-size of the blur kernel; `fract` controls the
    /// sharpening strength (0.0–1.0 typical). Returns a new Pix, or null on failure.
    fn pixUnsharpMasking(pixs: *mut c_void, halfwidth: i32, fract: f32) -> *mut c_void;

    /// Scales a Pix by independent x and y factors using the best available method.
    ///
    /// Returns a new scaled Pix, or null on failure. The input Pix is **not** consumed.
    fn pixScale(pixs: *mut c_void, scalex: f32, scaley: f32) -> *mut c_void;

    /// Converts an RGB (32 bpp) Pix to 8 bpp grayscale.
    ///
    /// `rwt`, `gwt`, `bwt` are the red, green, and blue channel weights; pass
    /// 0.0 for all three to use Leptonica's built-in default weights. Those
    /// defaults are **not** equal: green is weighted most heavily
    /// (0.3 / 0.5 / 0.2 for R / G / B in current Leptonica sources). Returns a
    /// new 8 bpp Pix, or null on failure.
    fn pixConvertRGBToGray(pixs: *mut c_void, rwt: f32, gwt: f32, bwt: f32) -> *mut c_void;

    /// Creates a Leptonica BOX with the given coordinates.
    fn boxCreate(x: i32, y: i32, w: i32, h: i32) -> *mut c_void;

    /// Frees a Leptonica BOX and sets the caller's pointer to null.
    fn boxDestroy(pbox: *mut *mut c_void);

    /// Clips a rectangular region from a Pix.
    ///
    /// Returns a new Pix containing the clipped region, or null on failure.
    /// `pboxc` (optional) receives the actual clipped box; pass null to ignore.
    fn pixClipRectangle(pixs: *mut c_void, box_: *mut c_void, pboxc: *mut *mut c_void) -> *mut c_void;

    /// Counts connected components in a 1 bpp image.
    ///
    /// `connectivity` is 4 or 8. Writes the count to `*pcount`.
    /// Returns 0 on success.
    fn pixCountConnComp(pix: *mut c_void, connectivity: i32, pcount: *mut i32) -> i32;

    /// Retrieves the horizontal and vertical resolution (DPI) from a Pix.
    ///
    /// Writes the x-resolution into `*pxres` and y-resolution into `*pyres`.
    /// Returns 0 on success, non-zero on error.
    fn pixGetResolution(pix: *const c_void, pxres: *mut i32, pyres: *mut i32) -> i32;
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Safe Pix wrapper
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/// Safe wrapper around a Leptonica `PIX *` image object.
|
||||
///
|
||||
/// Owns the underlying allocation and frees it in `Drop`. All methods that
|
||||
/// return a new image allocate a fresh `Pix`; the receiver is never consumed.
|
||||
///
|
||||
/// # Thread safety
|
||||
///
|
||||
/// `Pix` is `Send` because Leptonica image objects are independent heap
|
||||
/// allocations with no shared mutable state. Concurrent mutation from multiple
|
||||
/// threads is **not** safe (no `Sync`).
|
||||
#[cfg(any(feature = "build-tesseract", feature = "build-tesseract-wasm"))]
|
||||
pub struct Pix {
|
||||
// Raw Leptonica `PIX *` handle. Kept private so it can only be obtained
// through the constructors in this module; released with `pixDestroy`
// when the wrapper is dropped.
ptr: *mut c_void,
|
||||
}
|
||||
|
||||
#[cfg(any(feature = "build-tesseract", feature = "build-tesseract-wasm"))]
impl std::fmt::Debug for Pix {
    /// Formats the wrapper as `Pix { ptr: <address> }`; pixel data is never printed.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let mut repr = f.debug_struct("Pix");
        repr.field("ptr", &self.ptr);
        repr.finish()
    }
}
|
||||
|
||||
// SAFETY: A Pix owns a uniquely heap-allocated Leptonica PIX. There is no
|
||||
// interior mutability shared across thread boundaries, so transferring
|
||||
// ownership to another thread is safe.
// `Sync` is deliberately not implemented: sharing `&Pix` across threads is
// not supported.
// NOTE(review): "uniquely" holds only if no Leptonica call used by this
// module hands back a ref-counted clone of its input — TODO confirm for
// `pixDeskew` and `pixUnsharpMasking`.
|
||||
#[cfg(any(feature = "build-tesseract", feature = "build-tesseract-wasm"))]
|
||||
unsafe impl Send for Pix {}
|
||||
|
||||
#[cfg(any(feature = "build-tesseract", feature = "build-tesseract-wasm"))]
|
||||
impl Pix {
|
||||
// -----------------------------------------------------------------------
|
||||
// Construction
|
||||
// -----------------------------------------------------------------------
|
||||
|
||||
/// Creates a 32 bpp Leptonica Pix from a packed RGB byte slice.
|
||||
///
|
||||
/// `data` must contain exactly `width * height * 3` bytes in left-to-right,
|
||||
/// top-to-bottom, `R G B` interleaved order.
|
||||
///
|
||||
/// The DPI is set to 300 × 300 which is a sensible default for OCR input.
|
||||
///
|
||||
/// # Errors
|
||||
///
|
||||
/// Returns `TesseractError::InvalidImageData` if `data` length does not
|
||||
/// match `width * height * 3`, if either dimension is zero, or if
|
||||
/// Leptonica's `pixCreate` returns null.
|
||||
///
|
||||
/// # Examples
|
||||
///
|
||||
/// ```rust,no_run
|
||||
/// # use kreuzberg_tesseract::Pix;
|
||||
/// let rgb = vec![255u8; 4 * 4 * 3]; // 4×4 white image
|
||||
/// let pix = Pix::from_raw_rgb(&rgb, 4, 4).unwrap();
|
||||
/// assert_eq!(pix.width(), 4);
|
||||
/// assert_eq!(pix.height(), 4);
|
||||
/// assert_eq!(pix.depth(), 32);
|
||||
/// ```
|
||||
pub fn from_raw_rgb(data: &[u8], width: u32, height: u32) -> Result<Pix> {
|
||||
let expected = (width as usize)
|
||||
.checked_mul(height as usize)
|
||||
.and_then(|n| n.checked_mul(3))
|
||||
.ok_or(TesseractError::InvalidImageData)?;
|
||||
|
||||
if data.len() != expected || width == 0 || height == 0 {
|
||||
return Err(TesseractError::InvalidImageData);
|
||||
}
|
||||
|
||||
// SAFETY: pixCreate() allocates a new PIX with the requested dimensions.
|
||||
// It is safe because:
|
||||
// 1. width, height, and depth (32) are valid positive integers.
|
||||
// 2. pixCreate() documents that it returns null only on allocation
|
||||
// failure, which we check immediately below.
|
||||
let pix_ptr = unsafe { pixCreate(width as i32, height as i32, 32) };
|
||||
if pix_ptr.is_null() {
|
||||
return Err(TesseractError::NullPointerError);
|
||||
}
|
||||
|
||||
// SAFETY: pixGetData() returns a mutable pointer into the allocated pixel
|
||||
// buffer that is valid for the lifetime of the Pix. We own pix_ptr
|
||||
// exclusively at this point and have not exposed it to any other code.
|
||||
let data_ptr = unsafe { pixGetData(pix_ptr) };
|
||||
if data_ptr.is_null() {
|
||||
// Clean up before returning the error.
|
||||
// SAFETY: pix_ptr is a valid non-null allocation from pixCreate().
|
||||
// Passing &mut pix_ptr satisfies the double-pointer convention; after
|
||||
// this call pix_ptr is set to null by Leptonica.
|
||||
let mut ptr = pix_ptr;
|
||||
unsafe { pixDestroy(&mut ptr) };
|
||||
return Err(TesseractError::NullPointerError);
|
||||
}
|
||||
|
||||
// SAFETY: pixGetWpl() is a pure read of the Pix header that is always
|
||||
// valid for a correctly-allocated Pix.
|
||||
// For a 32 bpp image, each pixel occupies exactly one 32-bit word, so
|
||||
// wpl == width (no padding bytes). The loop below uses `row * wpl + col`
|
||||
// to index into the pixel data, which is within bounds because col < width <= wpl.
|
||||
let wpl = unsafe { pixGetWpl(pix_ptr) } as usize;
|
||||
|
||||
// Write RGB pixels into the Leptonica data buffer.
|
||||
//
|
||||
// Leptonica's 32 bpp pixel format stores each pixel as a native
|
||||
// 32-bit integer word with the logical layout (MSB→LSB): R G B A,
|
||||
// i.e. `(r << 24) | (g << 16) | (b << 8) | alpha`. This is the
|
||||
// same bit pattern regardless of host endianness — Leptonica treats
|
||||
// the data as an array of 32-bit integers and accesses individual
|
||||
// bytes via bit-shift, not via byte-addressed pointer arithmetic.
|
||||
//
|
||||
// Therefore we pack directly as `(r << 24) | (g << 16) | (b << 8) | 0xFF`
|
||||
// and write the resulting u32 without any byte-swapping. Calling
|
||||
// `pixEndianByteSwap` would invert the channel order, producing
|
||||
// A B G R instead of R G B A.
|
||||
for row in 0..(height as usize) {
|
||||
for col in 0..(width as usize) {
|
||||
let src = (row * width as usize + col) * 3;
|
||||
let r = data[src] as u32;
|
||||
let g = data[src + 1] as u32;
|
||||
let b = data[src + 2] as u32;
|
||||
// Pack channels as (MSB) R G B A (LSB) in the 32-bit integer.
|
||||
let word: u32 = (r << 24) | (g << 16) | (b << 8) | 0xFF;
|
||||
// SAFETY: data_ptr is a valid writable pointer into the Leptonica
|
||||
// pixel buffer. The offset `row * wpl + col` is within bounds because:
|
||||
// 1. wpl >= width (Leptonica pads rows to 32-bit word boundaries).
|
||||
// 2. row < height and col < width by loop invariants.
|
||||
unsafe {
|
||||
*data_ptr.add(row * wpl + col) = word;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Set a sensible default DPI for OCR processing.
|
||||
// SAFETY: pix_ptr is valid and non-null. pixSetResolution only writes
|
||||
// two integer fields in the Pix header.
|
||||
unsafe { pixSetResolution(pix_ptr, 300, 300) };
|
||||
|
||||
Ok(Pix { ptr: pix_ptr })
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------------
|
||||
// Image processing operations
|
||||
// -----------------------------------------------------------------------
|
||||
|
||||
/// Deskews this image, returning a new corrected Pix.
|
||||
///
|
||||
/// **Note:** `pixDeskew` requires a 1 bpp (binary) image. Call
|
||||
/// `to_grayscale()` followed by `adaptive_threshold()` before invoking
|
||||
/// this method on a colour or grayscale Pix.
|
||||
///
|
||||
/// # Errors
|
||||
///
|
||||
/// Returns `TesseractError::NullPointerError` if Leptonica returns null
|
||||
/// (typically because the input is not 1 bpp or the image is too small).
|
||||
///
|
||||
/// # Examples
|
||||
///
|
||||
/// ```rust,no_run
|
||||
/// # use kreuzberg_tesseract::Pix;
|
||||
/// # let rgb = vec![0u8; 100 * 100 * 3];
|
||||
/// # let pix = Pix::from_raw_rgb(&rgb, 100, 100).unwrap();
|
||||
/// let gray = pix.to_grayscale().unwrap();
|
||||
/// let binary = gray.adaptive_threshold(32, 32).unwrap();
|
||||
/// let deskewed = binary.deskew().unwrap();
|
||||
/// ```
|
||||
pub fn deskew(&self) -> Result<Pix> {
|
||||
// SAFETY: self.ptr is a valid non-null Pix we own. pixDeskew() does
|
||||
// not take ownership; it creates and returns a new Pix allocation.
|
||||
// We check for null to handle the case where the operation fails
|
||||
// (e.g. input is not 1 bpp).
|
||||
let result = unsafe { pixDeskew(self.ptr, 0) };
|
||||
if result.is_null() {
|
||||
Err(TesseractError::NullPointerError)
|
||||
} else {
|
||||
Ok(Pix { ptr: result })
|
||||
}
|
||||
}
|
||||
|
||||
/// Estimates the skew angle (degrees) and confidence (0–1) for this image.
|
||||
///
|
||||
/// A positive angle indicates counter-clockwise skew. Confidence near 1.0
|
||||
/// means a clear dominant skew direction was found.
|
||||
///
|
||||
/// **Note:** Like `deskew`, this operates on 1 bpp images.
|
||||
///
|
||||
/// # Errors
|
||||
///
|
||||
/// Returns `TesseractError::OcrError` if `pixFindSkew` returns a non-zero
|
||||
/// status (e.g. insufficient contrast or wrong bit depth).
|
||||
///
|
||||
/// # Examples
|
||||
///
|
||||
/// ```rust,no_run
|
||||
/// # use kreuzberg_tesseract::Pix;
|
||||
/// # let rgb = vec![0u8; 100 * 100 * 3];
|
||||
/// # let pix = Pix::from_raw_rgb(&rgb, 100, 100).unwrap();
|
||||
/// let gray = pix.to_grayscale().unwrap();
|
||||
/// let binary = gray.adaptive_threshold(32, 32).unwrap();
|
||||
/// let (angle, confidence) = binary.find_skew().unwrap();
|
||||
/// println!("Skew: {angle:.2}° (confidence {confidence:.2})");
|
||||
/// ```
|
||||
pub fn find_skew(&self) -> Result<(f32, f32)> {
|
||||
let mut angle: f32 = 0.0;
|
||||
let mut conf: f32 = 0.0;
|
||||
// SAFETY: self.ptr is valid and non-null. We pass pointers to local
|
||||
// stack-allocated f32 values, which are valid write targets for the
|
||||
// duration of this call. pixFindSkew() writes into them and returns
|
||||
// an integer status code.
|
||||
let status = unsafe { pixFindSkew(self.ptr, &mut angle, &mut conf) };
|
||||
if status != 0 {
|
||||
Err(TesseractError::OcrError)
|
||||
} else {
|
||||
Ok((angle, conf))
|
||||
}
|
||||
}
|
||||
|
||||
/// Binarises this image using Otsu adaptive thresholding.
|
||||
///
|
||||
/// `tile_width` and `tile_height` control the size of the local regions
|
||||
/// used to compute the threshold. Values around 16–64 work well for typical
|
||||
/// document images; smaller tiles follow local contrast more closely.
|
||||
///
|
||||
/// # Errors
|
||||
///
|
||||
/// Returns `TesseractError::NullPointerError` if Leptonica returns null, or
|
||||
/// `TesseractError::OcrError` if `pixOtsuAdaptiveThreshold` returns a
|
||||
/// non-zero status.
|
||||
///
|
||||
/// # Examples
|
||||
///
|
||||
/// ```rust,no_run
|
||||
/// # use kreuzberg_tesseract::Pix;
|
||||
/// # let rgb = vec![128u8; 64 * 64 * 3];
|
||||
/// # let pix = Pix::from_raw_rgb(&rgb, 64, 64).unwrap();
|
||||
/// let gray = pix.to_grayscale().unwrap();
|
||||
/// let binary = gray.adaptive_threshold(32, 32).unwrap();
|
||||
/// assert_eq!(binary.depth(), 1);
|
||||
/// ```
|
||||
pub fn adaptive_threshold(&self, tile_width: i32, tile_height: i32) -> Result<Pix> {
|
||||
let mut result: *mut c_void = std::ptr::null_mut();
|
||||
// SAFETY: self.ptr is a valid non-null Pix. We pass null for ppixth
|
||||
// because we do not need the intermediate threshold image. result is a
|
||||
// local pointer that will be written by pixOtsuAdaptiveThreshold(); we
|
||||
// check it for null before wrapping in a Pix.
|
||||
let status = unsafe {
|
||||
pixOtsuAdaptiveThreshold(
|
||||
self.ptr,
|
||||
tile_width,
|
||||
tile_height,
|
||||
0, // smoothx: no smoothing
|
||||
0, // smoothy: no smoothing
|
||||
0.1, // scorefract: Leptonica-recommended default
|
||||
std::ptr::null_mut(), // ppixth: we don't need the threshold map
|
||||
&mut result,
|
||||
)
|
||||
};
|
||||
if status != 0 {
|
||||
return Err(TesseractError::OcrError);
|
||||
}
|
||||
if result.is_null() {
|
||||
return Err(TesseractError::NullPointerError);
|
||||
}
|
||||
Ok(Pix { ptr: result })
|
||||
}
|
||||
|
||||
/// Returns the horizontal and vertical resolution (DPI) of this image.
|
||||
///
|
||||
/// # Errors
|
||||
///
|
||||
/// Returns `TesseractError::OcrError` if `pixGetResolution` fails.
|
||||
pub fn get_resolution(&self) -> Result<(i32, i32)> {
|
||||
let mut xres: i32 = 0;
|
||||
let mut yres: i32 = 0;
|
||||
// SAFETY: self.ptr is a valid non-null Pix. xres and yres are valid
|
||||
// stack-allocated i32 values. pixGetResolution reads the Pix header.
|
||||
let status = unsafe { pixGetResolution(self.ptr, &mut xres, &mut yres) };
|
||||
if status != 0 {
|
||||
Err(TesseractError::OcrError)
|
||||
} else {
|
||||
Ok((xres, yres))
|
||||
}
|
||||
}
|
||||
|
||||
/// Sets the horizontal and vertical resolution (DPI) on this image.
|
||||
///
|
||||
/// # Errors
|
||||
///
|
||||
/// Returns `TesseractError::OcrError` if `pixSetResolution` fails.
|
||||
pub fn set_resolution(&mut self, xres: i32, yres: i32) -> Result<()> {
|
||||
// SAFETY: self.ptr is a valid non-null Pix. pixSetResolution only
|
||||
// writes two integer fields in the Pix header.
|
||||
let status = unsafe { pixSetResolution(self.ptr, xres, yres) };
|
||||
if status != 0 {
|
||||
Err(TesseractError::OcrError)
|
||||
} else {
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
/// Ensures the image has a valid (non-zero) DPI resolution.
|
||||
///
|
||||
/// If both x and y resolution are zero, sets them to 72 DPI as a
|
||||
/// safe fallback. This prevents Leptonica operations that depend on
|
||||
/// resolution metadata from producing incorrect results.
|
||||
fn ensure_valid_resolution(&self) {
|
||||
if let Ok((xres, yres)) = self.get_resolution()
|
||||
&& (xres == 0 || yres == 0)
|
||||
{
|
||||
// SAFETY: self.ptr is valid. We set a safe default DPI.
|
||||
unsafe { pixSetResolution(self.ptr, 72, 72) };
|
||||
}
|
||||
}
|
||||
|
||||
/// Normalises the background of this image using morphological operations.
|
||||
///
|
||||
/// Useful as a preprocessing step when the document has uneven illumination
|
||||
/// or a non-white background. Returns a new normalised Pix.
|
||||
///
|
||||
/// # Errors
|
||||
///
|
||||
/// Returns `TesseractError::NullPointerError` if `pixBackgroundNormMorph`
|
||||
/// returns null.
|
||||
///
|
||||
/// # Examples
|
||||
///
|
||||
/// ```rust,no_run
|
||||
/// # use kreuzberg_tesseract::Pix;
|
||||
/// # let rgb = vec![200u8; 100 * 100 * 3];
|
||||
/// # let pix = Pix::from_raw_rgb(&rgb, 100, 100).unwrap();
|
||||
/// let gray = pix.to_grayscale().unwrap();
|
||||
/// let normalised = gray.background_normalize().unwrap();
|
||||
/// ```
|
||||
pub fn background_normalize(&self) -> Result<Pix> {
|
||||
self.ensure_valid_resolution();
|
||||
// SAFETY: self.ptr is a valid non-null Pix. We pass null for pixim
|
||||
// (no mask image). pixBackgroundNormMorph() returns a newly allocated
|
||||
// Pix or null on failure.
|
||||
let result = unsafe {
|
||||
pixBackgroundNormMorph(
|
||||
self.ptr,
|
||||
std::ptr::null_mut(), // pixim: no mask
|
||||
4, // reduction: 4x subsampling
|
||||
15, // size: morphological SE half-size
|
||||
200, // bgval: target background value
|
||||
)
|
||||
};
|
||||
if result.is_null() {
|
||||
Err(TesseractError::NullPointerError)
|
||||
} else {
|
||||
Ok(Pix { ptr: result })
|
||||
}
|
||||
}
|
||||
|
||||
/// Applies unsharp masking to sharpen this image.
|
||||
///
|
||||
/// `halfwidth` is the half-size of the blur kernel (e.g. 1–5).
|
||||
/// `fract` is the sharpening fraction in the range 0.0–1.0; values
|
||||
/// around 0.3–0.5 produce visible sharpening without artefacts.
|
||||
///
|
||||
/// # Errors
|
||||
///
|
||||
/// Returns `TesseractError::NullPointerError` if `pixUnsharpMasking`
|
||||
/// returns null.
|
||||
///
|
||||
/// # Examples
|
||||
///
|
||||
/// ```rust,no_run
|
||||
/// # use kreuzberg_tesseract::Pix;
|
||||
/// # let rgb = vec![128u8; 64 * 64 * 3];
|
||||
/// # let pix = Pix::from_raw_rgb(&rgb, 64, 64).unwrap();
|
||||
/// let sharpened = pix.unsharp_mask(2, 0.4).unwrap();
|
||||
/// ```
|
||||
pub fn unsharp_mask(&self, halfwidth: i32, fract: f32) -> Result<Pix> {
|
||||
self.ensure_valid_resolution();
|
||||
// SAFETY: self.ptr is valid and non-null. pixUnsharpMasking() returns
|
||||
// a new Pix without modifying or taking ownership of the source.
|
||||
let result = unsafe { pixUnsharpMasking(self.ptr, halfwidth, fract) };
|
||||
if result.is_null() {
|
||||
Err(TesseractError::NullPointerError)
|
||||
} else {
|
||||
Ok(Pix { ptr: result })
|
||||
}
|
||||
}
|
||||
|
||||
/// Scales this image by independent x and y factors.
|
||||
///
|
||||
/// Leptonica automatically chooses the best scaling algorithm based on
|
||||
/// the scale factors and bit depth (area mapping for downscaling,
|
||||
/// linear interpolation for upscaling).
|
||||
///
|
||||
/// # Errors
|
||||
///
|
||||
/// Returns `TesseractError::NullPointerError` if `pixScale` returns null.
|
||||
///
|
||||
/// # Examples
|
||||
///
|
||||
/// ```rust,no_run
|
||||
/// # use kreuzberg_tesseract::Pix;
|
||||
/// # let rgb = vec![255u8; 40 * 40 * 3];
|
||||
/// # let pix = Pix::from_raw_rgb(&rgb, 40, 40).unwrap();
|
||||
/// let upscaled = pix.scale(2.0, 2.0).unwrap();
|
||||
/// assert_eq!(upscaled.width(), 80);
|
||||
/// assert_eq!(upscaled.height(), 80);
|
||||
/// ```
|
||||
pub fn scale(&self, sx: f32, sy: f32) -> Result<Pix> {
|
||||
// SAFETY: self.ptr is valid and non-null. pixScale() creates a new Pix
|
||||
// and does not modify the source.
|
||||
let result = unsafe { pixScale(self.ptr, sx, sy) };
|
||||
if result.is_null() {
|
||||
Err(TesseractError::NullPointerError)
|
||||
} else {
|
||||
Ok(Pix { ptr: result })
|
||||
}
|
||||
}
|
||||
|
||||
/// Clips a rectangular sub-region from this image.
|
||||
///
|
||||
/// Returns a new Pix containing only the pixels within the given rectangle.
|
||||
/// Coordinates are in pixel space: (x, y) is the top-left corner.
|
||||
///
|
||||
/// # Errors
|
||||
///
|
||||
/// Returns `TesseractError::NullPointerError` if the crop fails.
|
||||
pub fn clip_rectangle(&self, x: i32, y: i32, w: i32, h: i32) -> Result<Pix> {
|
||||
// SAFETY: boxCreate allocates a new BOX on the heap.
|
||||
let box_ = unsafe { boxCreate(x, y, w, h) };
|
||||
if box_.is_null() {
|
||||
return Err(TesseractError::NullPointerError);
|
||||
}
|
||||
// SAFETY: pixClipRectangle returns a new Pix clipped to the BOX region.
|
||||
// We pass null for pboxc (we don't need the clipped box coordinates back).
|
||||
let result = unsafe { pixClipRectangle(self.ptr, box_, std::ptr::null_mut()) };
|
||||
// SAFETY: Free the BOX we allocated.
|
||||
let mut box_mut = box_;
|
||||
unsafe { boxDestroy(&mut box_mut) };
|
||||
if result.is_null() {
|
||||
Err(TesseractError::NullPointerError)
|
||||
} else {
|
||||
Ok(Pix { ptr: result })
|
||||
}
|
||||
}
|
||||
|
||||
/// Counts connected components in a 1 bpp (binary) image.
|
||||
///
|
||||
/// `connectivity` should be 4 or 8.
|
||||
///
|
||||
/// # Errors
|
||||
///
|
||||
/// Returns `TesseractError::OcrError` if `pixCountConnComp` fails
|
||||
/// (e.g., wrong bit depth — image must be 1 bpp).
|
||||
pub fn count_connected_components(&self, connectivity: i32) -> Result<i32> {
|
||||
let mut count: i32 = 0;
|
||||
// SAFETY: self.ptr is a valid Pix. count is a valid stack local.
|
||||
let status = unsafe { pixCountConnComp(self.ptr, connectivity, &mut count) };
|
||||
if status != 0 {
|
||||
Err(TesseractError::OcrError)
|
||||
} else {
|
||||
Ok(count)
|
||||
}
|
||||
}
|
||||
|
||||
/// Converts this 32 bpp RGB image to an 8 bpp grayscale Pix.
|
||||
///
|
||||
/// Passing 0.0 for all weight parameters instructs Leptonica to use its
|
||||
/// default perceptual weights (approx. 0.299 R, 0.587 G, 0.114 B).
|
||||
///
|
||||
/// # Errors
|
||||
///
|
||||
/// Returns `TesseractError::NullPointerError` if `pixConvertRGBToGray`
|
||||
/// returns null (e.g. the source is not 32 bpp).
|
||||
///
|
||||
/// # Examples
|
||||
///
|
||||
/// ```rust,no_run
|
||||
/// # use kreuzberg_tesseract::Pix;
|
||||
/// # let rgb = vec![100u8, 150u8, 200u8].repeat(10 * 10);
|
||||
/// # let pix = Pix::from_raw_rgb(&rgb, 10, 10).unwrap();
|
||||
/// let gray = pix.to_grayscale().unwrap();
|
||||
/// assert_eq!(gray.depth(), 8);
|
||||
/// ```
|
||||
pub fn to_grayscale(&self) -> Result<Pix> {
|
||||
self.ensure_valid_resolution();
|
||||
// SAFETY: self.ptr is valid and non-null. pixConvertRGBToGray() returns
|
||||
// a new 8 bpp Pix; the source is not modified.
|
||||
let result = unsafe { pixConvertRGBToGray(self.ptr, 0.0, 0.0, 0.0) };
|
||||
if result.is_null() {
|
||||
Err(TesseractError::NullPointerError)
|
||||
} else {
|
||||
Ok(Pix { ptr: result })
|
||||
}
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------------
|
||||
// Accessors
|
||||
// -----------------------------------------------------------------------
|
||||
|
||||
/// Returns the raw Leptonica `PIX *` pointer.
|
||||
///
|
||||
/// Intended for passing this image to `TesseractAPI::set_image_2`.
|
||||
///
|
||||
/// # Safety
|
||||
///
|
||||
/// The caller must ensure the `Pix` outlives any use of the returned
|
||||
/// pointer. `TessBaseAPISetImage2` **borrows** the pointer — it does not
|
||||
/// take ownership — so the `Pix` must remain alive until after
|
||||
/// `TessBaseAPIRecognize` (or any other Tesseract call that consumes the
|
||||
/// image data) has completed. Dropping the `Pix` while Tesseract holds
|
||||
/// the pointer will result in a use-after-free.
|
||||
///
|
||||
/// The caller must **not** free the returned pointer; `Pix::drop` is
|
||||
/// solely responsible for deallocation via `pixDestroy`.
|
||||
pub fn as_ptr(&self) -> *mut c_void {
|
||||
self.ptr
|
||||
}
|
||||
|
||||
/// Returns the width of the image in pixels.
|
||||
///
|
||||
/// # Examples
|
||||
///
|
||||
/// ```rust,no_run
|
||||
/// # use kreuzberg_tesseract::Pix;
|
||||
/// # let pix = Pix::from_raw_rgb(&vec![0u8; 8 * 6 * 3], 8, 6).unwrap();
|
||||
/// assert_eq!(pix.width(), 8);
|
||||
/// ```
|
||||
pub fn width(&self) -> i32 {
|
||||
// SAFETY: self.ptr is a valid non-null Pix. pixGetWidth() is a pure
|
||||
// read of the Pix header struct; it does not mutate any state.
|
||||
unsafe { pixGetWidth(self.ptr) }
|
||||
}
|
||||
|
||||
/// Returns the height of the image in pixels.
|
||||
///
|
||||
/// # Examples
|
||||
///
|
||||
/// ```rust,no_run
|
||||
/// # use kreuzberg_tesseract::Pix;
|
||||
/// # let pix = Pix::from_raw_rgb(&vec![0u8; 8 * 6 * 3], 8, 6).unwrap();
|
||||
/// assert_eq!(pix.height(), 6);
|
||||
/// ```
|
||||
pub fn height(&self) -> i32 {
|
||||
// SAFETY: self.ptr is a valid non-null Pix. pixGetHeight() is a pure
|
||||
// read of the Pix header struct.
|
||||
unsafe { pixGetHeight(self.ptr) }
|
||||
}
|
||||
|
||||
/// Returns the bit depth of the image (1, 8, or 32 for this module's usage).
|
||||
///
|
||||
/// # Examples
|
||||
///
|
||||
/// ```rust,no_run
|
||||
/// # use kreuzberg_tesseract::Pix;
|
||||
/// # let pix = Pix::from_raw_rgb(&vec![0u8; 4 * 4 * 3], 4, 4).unwrap();
|
||||
/// assert_eq!(pix.depth(), 32);
|
||||
/// ```
|
||||
pub fn depth(&self) -> i32 {
|
||||
// SAFETY: self.ptr is a valid non-null Pix. pixGetDepth() is a pure
|
||||
// read of the Pix header struct.
|
||||
unsafe { pixGetDepth(self.ptr) }
|
||||
}
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Drop implementation
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
#[cfg(any(feature = "build-tesseract", feature = "build-tesseract-wasm"))]
impl Drop for Pix {
    /// Releases the underlying Leptonica image, if there is one.
    fn drop(&mut self) {
        if self.ptr.is_null() {
            // Nothing was allocated (or it was already released).
            return;
        }
        // SAFETY: self.ptr is a non-null Leptonica PIX held by this wrapper.
        // pixDestroy() takes a double pointer and nulls `self.ptr` after
        // freeing, so a repeated call would hit the guard above instead of
        // double-freeing.
        unsafe { pixDestroy(&mut self.ptr) };
    }
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Tests
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
#[cfg(test)]
#[cfg(any(feature = "build-tesseract", feature = "build-tesseract-wasm"))]
mod tests {
    use super::*;

    /// Builds a `width` × `height` RGB Pix whose every channel byte is `fill`.
    fn make_rgb_pix(width: u32, height: u32, fill: u8) -> Pix {
        let byte_len = (width * height * 3) as usize;
        let pixels = vec![fill; byte_len];
        Pix::from_raw_rgb(&pixels, width, height).expect("from_raw_rgb failed")
    }

    /// A freshly built Pix reports the requested size and 32 bpp depth.
    #[test]
    fn test_from_raw_rgb_dimensions() {
        let pix = make_rgb_pix(16, 8, 200);
        assert_eq!((pix.width(), pix.height(), pix.depth()), (16, 8, 32));
    }

    /// A buffer whose length does not match the dimensions is rejected.
    #[test]
    fn test_from_raw_rgb_wrong_length() {
        let too_short = [0u8; 10]; // a 4×4 image needs 48 bytes
        let outcome = Pix::from_raw_rgb(&too_short, 4, 4);
        assert!(matches!(outcome, Err(TesseractError::InvalidImageData)));
    }

    /// A zero width or a zero height is rejected.
    #[test]
    fn test_from_raw_rgb_zero_dimensions() {
        for (width, height) in [(0, 4), (4, 0)] {
            let outcome = Pix::from_raw_rgb(&[], width, height);
            assert!(matches!(outcome, Err(TesseractError::InvalidImageData)));
        }
    }

    /// The raw handle of a successfully built Pix is never null.
    #[test]
    fn test_as_ptr_is_non_null() {
        let pix = make_rgb_pix(8, 8, 128);
        let raw = pix.as_ptr();
        assert!(!raw.is_null());
    }

    /// Grayscale conversion keeps the size and yields an 8 bpp image.
    #[test]
    fn test_to_grayscale() {
        let gray = make_rgb_pix(32, 32, 150)
            .to_grayscale()
            .expect("to_grayscale failed");
        assert_eq!(gray.width(), 32);
        assert_eq!(gray.height(), 32);
        assert_eq!(gray.depth(), 8);
    }

    /// Scaling by 2 in both directions doubles both dimensions.
    #[test]
    fn test_scale_up() {
        let source = make_rgb_pix(20, 10, 100);
        let scaled = source.scale(2.0, 2.0).expect("scale failed");
        assert_eq!((scaled.width(), scaled.height()), (40, 20));
    }

    /// Sharpening does not change the image size.
    #[test]
    fn test_unsharp_mask_returns_same_dimensions() {
        let source = make_rgb_pix(32, 32, 200);
        let sharpened = source.unsharp_mask(2, 0.4).expect("unsharp_mask failed");
        assert_eq!((sharpened.width(), sharpened.height()), (32, 32));
    }

    /// Thresholding a grayscale image yields a 1 bpp image.
    #[test]
    fn test_adaptive_threshold_produces_1bpp() {
        let gray = make_rgb_pix(64, 64, 180)
            .to_grayscale()
            .expect("to_grayscale failed");
        let binary = gray
            .adaptive_threshold(32, 32)
            .expect("adaptive_threshold failed");
        assert_eq!(binary.depth(), 1);
    }
}
|
||||
218
crates/kreuzberg-tesseract/src/lib.rs
Normal file
218
crates/kreuzberg-tesseract/src/lib.rs
Normal file
@@ -0,0 +1,218 @@
|
||||
#![cfg_attr(
|
||||
not(any(feature = "build-tesseract", feature = "build-tesseract-wasm")),
|
||||
allow(unused_variables, dead_code)
|
||||
)]
|
||||
#![allow(clippy::arc_with_non_send_sync)]
|
||||
#![allow(clippy::missing_transmute_annotations)]
|
||||
#![allow(clippy::type_complexity)]
|
||||
#![allow(clippy::new_without_default)]
|
||||
#![allow(clippy::not_unsafe_ptr_arg_deref)]
|
||||
#![allow(clippy::cmp_null)]
|
||||
|
||||
//! # kreuzberg-tesseract
|
||||
//!
|
||||
//! `kreuzberg-tesseract` provides safe Rust bindings for Tesseract OCR with built-in compilation
|
||||
//! of Tesseract and Leptonica libraries. This crate aims to make OCR functionality
|
||||
//! easily accessible in Rust projects while handling the complexity of interfacing
|
||||
//! with the underlying C++ libraries.
|
||||
//!
|
||||
//! ## Usage
|
||||
//!
|
||||
//! Here's a basic example of how to use `kreuzberg-tesseract`:
|
||||
//!
|
||||
//! ```rust
|
||||
//! use std::path::PathBuf;
|
||||
//! use std::error::Error;
|
||||
//! use kreuzberg_tesseract::TesseractAPI;
|
||||
//!
|
||||
//! fn get_default_tessdata_dir() -> PathBuf {
|
||||
//! if cfg!(target_os = "macos") {
|
||||
//! let home_dir = std::env::var("HOME").expect("HOME environment variable not set");
|
||||
//! PathBuf::from(home_dir)
|
||||
//! .join("Library")
|
||||
//! .join("Application Support")
|
||||
//! .join("kreuzberg-tesseract")
|
||||
//! .join("tessdata")
|
||||
//! } else if cfg!(target_os = "linux") {
|
||||
//! let home_dir = std::env::var("HOME").expect("HOME environment variable not set");
|
||||
//! PathBuf::from(home_dir)
|
||||
//! .join(".kreuzberg-tesseract")
|
||||
//! .join("tessdata")
|
||||
//! } else if cfg!(target_os = "windows") {
|
||||
//! PathBuf::from(std::env::var("APPDATA").expect("APPDATA environment variable not set"))
|
||||
//! .join("kreuzberg-tesseract")
|
||||
//! .join("tessdata")
|
||||
//! } else {
|
||||
//! panic!("Unsupported operating system");
|
||||
//! }
|
||||
//! }
|
||||
//!
|
||||
//! fn get_tessdata_dir() -> PathBuf {
|
||||
//! match std::env::var("TESSDATA_PREFIX") {
|
||||
//! Ok(dir) => {
|
||||
//! let path = PathBuf::from(dir);
|
||||
//! let path = if path.ends_with("tessdata") { path } else { path.join("tessdata") };
|
||||
//! println!("Using TESSDATA_PREFIX directory: {:?}", path);
|
||||
//! path
|
||||
//! }
|
||||
//! Err(_) => {
|
||||
//! let default_dir = get_default_tessdata_dir();
|
||||
//! println!(
|
||||
//! "TESSDATA_PREFIX not set, using default directory: {:?}",
|
||||
//! default_dir
|
||||
//! );
|
||||
//! default_dir
|
||||
//! }
|
||||
//! }
|
||||
//! }
|
||||
//!
|
||||
//! fn main() -> Result<(), Box<dyn Error>> {
|
||||
//! let api = TesseractAPI::new()?;
|
||||
//!
|
||||
//! // Get tessdata directory (uses default location or TESSDATA_PREFIX if set)
|
||||
//! let tessdata_dir = get_tessdata_dir();
|
||||
//! api.init(tessdata_dir.to_str().unwrap(), "eng")?;
|
||||
//!
|
||||
//! let width = 24;
|
||||
//! let height = 24;
|
||||
//! let bytes_per_pixel = 1;
|
||||
//! let bytes_per_line = width * bytes_per_pixel;
|
||||
//!
|
||||
//! // Initialize image data with all white pixels
|
||||
//! let mut image_data = vec![255u8; width * height];
|
||||
//!
|
||||
//! // Draw number 9 with clearer distinction
|
||||
//! for y in 4..19 {
|
||||
//! for x in 7..17 {
|
||||
//! // Top bar
|
||||
//! if y == 4 && x >= 8 && x <= 15 {
|
||||
//! image_data[y * width + x] = 0;
|
||||
//! }
|
||||
//! // Top curve left side
|
||||
//! if y >= 4 && y <= 10 && x == 7 {
|
||||
//! image_data[y * width + x] = 0;
|
||||
//! }
|
||||
//! // Top curve right side
|
||||
//! if y >= 4 && y <= 11 && x == 16 {
|
||||
//! image_data[y * width + x] = 0;
|
||||
//! }
|
||||
//! // Middle bar
|
||||
//! if y == 11 && x >= 8 && x <= 15 {
|
||||
//! image_data[y * width + x] = 0;
|
||||
//! }
|
||||
//! // Bottom right vertical line
|
||||
//! if y >= 11 && y <= 18 && x == 16 {
|
||||
//! image_data[y * width + x] = 0;
|
||||
//! }
|
||||
//! // Bottom bar
|
||||
//! if y == 18 && x >= 8 && x <= 15 {
|
||||
//! image_data[y * width + x] = 0;
|
||||
//! }
|
||||
//! }
|
||||
//! }
|
||||
//!
|
||||
//! // Set the image data
|
||||
//! api.set_image(&image_data, width.try_into().unwrap(), height.try_into().unwrap(), bytes_per_pixel.try_into().unwrap(), bytes_per_line.try_into().unwrap())?;
|
||||
//!
|
||||
//! // Set whitelist for digits only
|
||||
//! api.set_variable("tessedit_char_whitelist", "0123456789")?;
|
||||
//!
|
||||
//! // Set PSM mode to single character
|
||||
//! api.set_variable("tessedit_pageseg_mode", "10")?;
|
||||
//!
|
||||
//! // Get the recognized text
|
||||
//! let text = api.get_utf8_text()?;
|
||||
//! println!("Recognized text: {}", text.trim());
|
||||
//!
|
||||
//! Ok(())
|
||||
//! }
|
||||
//! ```
|
||||
/// Declares a group of foreign functions with the ABI appropriate for the target.
///
/// * On native targets the declarations go into an `unsafe extern "C-unwind"`
///   block, so C++ exceptions raised inside Tesseract/Leptonica may cross the
///   FFI boundary.
/// * On `wasm32` they go into a plain `unsafe extern "C"` block, because the
///   LLVM backend there does not support `cleanupret` / C++ unwinding.
///
/// Each entry has the form `[attributes] [vis] fn name(arg: Type, ...) [-> Ret];`.
macro_rules! ffi_extern {
    (
        $(
            $(#[$attr:meta])*
            $visibility:vis fn $func:ident($($param:ident : $param_ty:ty),* $(,)?) $(-> $output:ty)?;
        )*
    ) => {
        // WASM: no unwinding support, so use the plain C ABI.
        #[cfg(target_arch = "wasm32")]
        unsafe extern "C" {
            $(
                $(#[$attr])*
                $visibility fn $func($($param : $param_ty),*) $(-> $output)?;
            )*
        }

        // Native: allow foreign exceptions to unwind across the boundary.
        #[cfg(not(target_arch = "wasm32"))]
        unsafe extern "C-unwind" {
            $(
                $(#[$attr])*
                $visibility fn $func($($param : $param_ty),*) $(-> $output)?;
            )*
        }
    };
}
|
||||
|
||||
pub use error::{Result, TesseractError};
|
||||
mod error;
|
||||
|
||||
// WASM-only shim: replace `__cxa_atexit` with a no-op.
//
// The WASI SDK implementation of `__cxa_atexit` calls `calloc` during C++ static
// initialization, which crashes because dlmalloc's heap isn't properly set up for
// wasm32-unknown-unknown. WASM modules never exit normally, so atexit handlers
// are unnecessary and can simply be dropped.
#[cfg(all(target_arch = "wasm32", not(target_os = "emscripten")))]
mod wasm_compat {
    use core::ffi::c_void;

    /// Shape of the callback that C++ runtimes hand to `__cxa_atexit`.
    type AtExitHandler = unsafe extern "C" fn(*mut c_void);

    /// No-op replacement for the C++ runtime's `__cxa_atexit`.
    ///
    /// All arguments are ignored and nothing is registered; the function always
    /// reports success by returning `0`.
    #[unsafe(no_mangle)]
    pub unsafe extern "C" fn __cxa_atexit(
        _func: Option<AtExitHandler>,
        _arg: *mut c_void,
        _dso_handle: *mut c_void,
    ) -> i32 {
        // Success, but don't actually register anything.
        0
    }
}
|
||||
mod page_iterator;
|
||||
pub use page_iterator::{BlockInfo, PageIterator, ParaInfo};
|
||||
mod result_iterator;
|
||||
pub use result_iterator::{FontAttributes, ResultIterator, WordData};
|
||||
mod choice_iterator;
|
||||
pub use choice_iterator::ChoiceIterator;
|
||||
mod monitor;
|
||||
pub use monitor::TessMonitor;
|
||||
mod result_renderer;
|
||||
pub use result_renderer::TessResultRenderer;
|
||||
mod mutable_iterator;
|
||||
pub use mutable_iterator::MutableIterator;
|
||||
mod enums;
|
||||
pub use enums::{
|
||||
TessOrientation, TessPageIteratorLevel, TessPageSegMode, TessParagraphJustification, TessPolyBlockType,
|
||||
TessTextlineOrder, TessWritingDirection,
|
||||
};
|
||||
mod api;
|
||||
pub use api::{BoundingBoxArray, TesseractAPI};
|
||||
pub mod leptonica;
|
||||
#[cfg(any(feature = "build-tesseract", feature = "build-tesseract-wasm"))]
|
||||
pub use leptonica::Pix;
|
||||
|
||||
/// Returns the English `eng.traineddata` blob embedded at compile time, if any.
///
/// * With the `bundle-tessdata-eng` feature enabled, the result is `Some` with
///   the bytes of `TESSDATA_PREFIX_BUNDLED/tessdata/eng.traineddata`. This is the
///   `tessdata_fast` variant (~4 MB) that `build.rs` downloads. Embedding it lets
///   WASM builds drive Tesseract OCR without filesystem access or runtime
///   fetches.
/// * With the feature disabled, the result is `None`.
pub fn bundled_eng_traineddata() -> Option<&'static [u8]> {
    // Exactly one of the two bindings below survives cfg-stripping.
    #[cfg(feature = "bundle-tessdata-eng")]
    let blob: Option<&'static [u8]> = Some(include_bytes!(concat!(
        env!("TESSDATA_PREFIX_BUNDLED"),
        "/tessdata/eng.traineddata"
    )));

    #[cfg(not(feature = "bundle-tessdata-eng"))]
    let blob: Option<&'static [u8]> = None;

    blob
}
|
||||
68
crates/kreuzberg-tesseract/src/monitor.rs
Normal file
68
crates/kreuzberg-tesseract/src/monitor.rs
Normal file
@@ -0,0 +1,68 @@
|
||||
use crate::error::{Result, TesseractError};
|
||||
use std::os::raw::{c_int, c_void};
|
||||
use std::sync::{Arc, Mutex};
|
||||
|
||||
/// Owning wrapper around the opaque monitor object created by `TessMonitorCreate`.
///
/// The raw pointer is kept behind a mutex and handed to `TessMonitorDelete` when
/// the wrapper is dropped.
pub struct TessMonitor {
    /// Raw monitor pointer; every FFI call locks the mutex before using it.
    handle: Arc<Mutex<*mut c_void>>,
}

// SAFETY: the raw pointer is only ever used while the mutex is held.
// NOTE(review): this additionally assumes the underlying Tesseract object has no
// thread affinity — confirm against the Tesseract documentation.
unsafe impl Send for TessMonitor {}
// SAFETY: see the `Send` impl above; shared access is serialised by the mutex.
unsafe impl Sync for TessMonitor {}
|
||||
|
||||
impl TessMonitor {
|
||||
/// Creates a new instance of the TessMonitor.
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// Returns the new instance of the TessMonitor.
|
||||
pub fn new() -> Self {
|
||||
let handle = unsafe { TessMonitorCreate() };
|
||||
TessMonitor {
|
||||
handle: Arc::new(Mutex::new(handle)),
|
||||
}
|
||||
}
|
||||
|
||||
/// Sets the deadline for the monitor.
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// * `deadline` - Deadline in milliseconds.
|
||||
///
|
||||
/// # Errors
|
||||
///
|
||||
/// Returns a `TesseractError::MutexLockError` if the mutex lock fails.
|
||||
pub fn set_deadline(&self, deadline: i32) -> Result<()> {
|
||||
let handle = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
|
||||
unsafe { TessMonitorSetDeadlineMSecs(*handle, deadline) };
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Gets the progress of the monitor.
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// Returns the progress as an `i32` if successful, otherwise returns an error.
|
||||
///
|
||||
/// # Errors
|
||||
///
|
||||
/// Returns a `TesseractError::MutexLockError` if the mutex lock fails.
|
||||
pub fn get_progress(&self) -> Result<i32> {
|
||||
let handle = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
|
||||
Ok(unsafe { TessMonitorGetProgress(*handle) })
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for TessMonitor {
|
||||
fn drop(&mut self) {
|
||||
if let Ok(handle) = self.handle.lock() {
|
||||
unsafe { TessMonitorDelete(*handle) };
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
ffi_extern! {
    /// Creates a monitor object; `TessMonitor::new` stores the returned pointer.
    pub fn TessMonitorCreate() -> *mut c_void;

    /// Destroys a monitor object; called from `TessMonitor`'s `Drop`.
    pub fn TessMonitorDelete(monitor: *mut c_void);

    /// Sets the monitor deadline, in milliseconds.
    pub fn TessMonitorSetDeadlineMSecs(monitor: *mut c_void, deadline: c_int);

    /// Returns the monitor's current progress value.
    pub fn TessMonitorGetProgress(monitor: *mut c_void) -> c_int;
}
|
||||
197
crates/kreuzberg-tesseract/src/mutable_iterator.rs
Normal file
197
crates/kreuzberg-tesseract/src/mutable_iterator.rs
Normal file
@@ -0,0 +1,197 @@
|
||||
use crate::error::{Result, TesseractError};
|
||||
use std::ffi::CStr;
|
||||
use std::os::raw::{c_char, c_void};
|
||||
use std::sync::Arc;
|
||||
use std::sync::Mutex;
|
||||
|
||||
use crate::result_iterator::{
|
||||
TessResultIteratorConfidence, TessResultIteratorGetUTF8Text, TessResultIteratorNext,
|
||||
TessResultIteratorSymbolIsDropcap, TessResultIteratorSymbolIsSubscript, TessResultIteratorSymbolIsSuperscript,
|
||||
TessResultIteratorWordFontAttributes, TessResultIteratorWordIsFromDictionary, TessResultIteratorWordIsNumeric,
|
||||
TessResultIteratorWordRecognitionLanguage,
|
||||
};
|
||||
|
||||
/// Owning wrapper around a raw Tesseract iterator pointer.
///
/// The methods drive it through the `TessResultIterator*` C functions, and the
/// pointer is released with `TessResultIteratorDelete` when the wrapper is
/// dropped.
pub struct MutableIterator {
    /// Raw iterator pointer; every FFI call locks the mutex before using it.
    handle: Arc<Mutex<*mut c_void>>,
}

// SAFETY: the raw pointer is only ever used while the mutex is held.
// NOTE(review): this additionally assumes the underlying Tesseract iterator has
// no thread affinity — confirm against the Tesseract documentation.
unsafe impl Send for MutableIterator {}
// SAFETY: see the `Send` impl above; shared access is serialised by the mutex.
unsafe impl Sync for MutableIterator {}
|
||||
|
||||
impl MutableIterator {
|
||||
/// Creates a new instance of the MutableIterator.
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// * `handle` - Pointer to the MutableIterator.
|
||||
pub fn new(handle: *mut c_void) -> Self {
|
||||
MutableIterator {
|
||||
handle: Arc::new(Mutex::new(handle)),
|
||||
}
|
||||
}
|
||||
|
||||
/// Gets the UTF-8 text for the current iterator.
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// * `level` - Level of the text.
|
||||
pub fn get_utf8_text(&self, level: i32) -> Result<String> {
|
||||
let handle = self.handle.lock().map_err(|_| TesseractError::MutexError)?;
|
||||
let text_ptr = unsafe { TessResultIteratorGetUTF8Text(*handle, level) };
|
||||
if text_ptr.is_null() {
|
||||
return Err(TesseractError::NullPointerError);
|
||||
}
|
||||
let c_str = unsafe { CStr::from_ptr(text_ptr) };
|
||||
let result = c_str.to_str()?.to_owned();
|
||||
unsafe { TessDeleteText(text_ptr as *mut c_char) };
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
/// Gets the confidence of the current iterator.
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// * `level` - Level of the confidence.
|
||||
pub fn confidence(&self, level: i32) -> Result<f32> {
|
||||
let handle = self.handle.lock().map_err(|_| TesseractError::MutexError)?;
|
||||
Ok(unsafe { TessResultIteratorConfidence(*handle, level) })
|
||||
}
|
||||
|
||||
/// Gets the recognition language of the current iterator.
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// Returns the recognition language as a `String` if successful, otherwise returns an error.
|
||||
pub fn word_recognition_language(&self) -> Result<String> {
|
||||
let handle = self.handle.lock().map_err(|_| TesseractError::MutexError)?;
|
||||
let lang_ptr = unsafe { TessResultIteratorWordRecognitionLanguage(*handle) };
|
||||
if lang_ptr.is_null() {
|
||||
return Err(TesseractError::NullPointerError);
|
||||
}
|
||||
let c_str = unsafe { CStr::from_ptr(lang_ptr) };
|
||||
Ok(c_str.to_str()?.to_owned())
|
||||
}
|
||||
|
||||
/// Gets the font attributes of the current iterator.
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// Returns the font attributes as a tuple if successful, otherwise returns an error.
|
||||
pub fn word_font_attributes(&self) -> Result<(bool, bool, bool, bool, bool, bool, i32, i32)> {
|
||||
let handle = self.handle.lock().map_err(|_| TesseractError::MutexError)?;
|
||||
let mut is_bold = 0;
|
||||
let mut is_italic = 0;
|
||||
let mut is_underlined = 0;
|
||||
let mut is_monospace = 0;
|
||||
let mut is_serif = 0;
|
||||
let mut is_smallcaps = 0;
|
||||
let mut pointsize = 0;
|
||||
let mut font_id = 0;
|
||||
|
||||
let result = unsafe {
|
||||
TessResultIteratorWordFontAttributes(
|
||||
*handle,
|
||||
&mut is_bold,
|
||||
&mut is_italic,
|
||||
&mut is_underlined,
|
||||
&mut is_monospace,
|
||||
&mut is_serif,
|
||||
&mut is_smallcaps,
|
||||
&mut pointsize,
|
||||
&mut font_id,
|
||||
)
|
||||
};
|
||||
|
||||
if result == 0 {
|
||||
Err(TesseractError::InvalidParameterError)
|
||||
} else {
|
||||
Ok((
|
||||
is_bold != 0,
|
||||
is_italic != 0,
|
||||
is_underlined != 0,
|
||||
is_monospace != 0,
|
||||
is_serif != 0,
|
||||
is_smallcaps != 0,
|
||||
pointsize,
|
||||
font_id,
|
||||
))
|
||||
}
|
||||
}
|
||||
|
||||
/// Checks if the current word is from the dictionary.
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// Returns `Ok(true)` if the current word is from the dictionary, otherwise returns `Ok(false)`.
|
||||
pub fn word_is_from_dictionary(&self) -> Result<bool> {
|
||||
let handle = self.handle.lock().map_err(|_| TesseractError::MutexError)?;
|
||||
Ok(unsafe { TessResultIteratorWordIsFromDictionary(*handle) != 0 })
|
||||
}
|
||||
|
||||
/// Checks if the current word is numeric.
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// Returns `Ok(true)` if the current word is numeric, otherwise returns `Ok(false)`.
|
||||
pub fn word_is_numeric(&self) -> Result<bool> {
|
||||
let handle = self.handle.lock().map_err(|_| TesseractError::MutexError)?;
|
||||
Ok(unsafe { TessResultIteratorWordIsNumeric(*handle) != 0 })
|
||||
}
|
||||
|
||||
/// Checks if the current symbol is superscript.
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// Returns `Ok(true)` if the current symbol is superscript, otherwise returns `Ok(false)`.
|
||||
pub fn symbol_is_superscript(&self) -> Result<bool> {
|
||||
let handle = self.handle.lock().map_err(|_| TesseractError::MutexError)?;
|
||||
Ok(unsafe { TessResultIteratorSymbolIsSuperscript(*handle) != 0 })
|
||||
}
|
||||
|
||||
/// Checks if the current symbol is subscript.
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// Returns `Ok(true)` if the current symbol is subscript, otherwise returns `Ok(false)`.
|
||||
pub fn symbol_is_subscript(&self) -> Result<bool> {
|
||||
let handle = self.handle.lock().map_err(|_| TesseractError::MutexError)?;
|
||||
Ok(unsafe { TessResultIteratorSymbolIsSubscript(*handle) != 0 })
|
||||
}
|
||||
|
||||
/// Checks if the current symbol is dropcap.
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// Returns `Ok(true)` if the current symbol is dropcap, otherwise returns `Ok(false)`.
|
||||
pub fn symbol_is_dropcap(&self) -> Result<bool> {
|
||||
let handle = self.handle.lock().map_err(|_| TesseractError::MutexError)?;
|
||||
Ok(unsafe { TessResultIteratorSymbolIsDropcap(*handle) != 0 })
|
||||
}
|
||||
|
||||
/// Gets the next iterator.
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// * `level` - Level of the iterator.
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// Returns `true` if the next iterator is successful, otherwise returns `false`.
|
||||
pub fn next(&self, level: i32) -> Result<bool> {
|
||||
let handle = self.handle.lock().map_err(|_| TesseractError::MutexError)?;
|
||||
Ok(unsafe { TessResultIteratorNext(*handle, level) != 0 })
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for MutableIterator {
|
||||
fn drop(&mut self) {
|
||||
if let Ok(handle) = self.handle.lock() {
|
||||
unsafe { TessResultIteratorDelete(*handle) };
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
ffi_extern! {
    /// Destroys a result iterator; called from `MutableIterator`'s `Drop`.
    pub fn TessResultIteratorDelete(handle: *mut c_void);

    /// Frees a string handed out by Tesseract, such as the result of
    /// `TessResultIteratorGetUTF8Text`.
    pub fn TessDeleteText(text: *mut c_char);
}
|
||||
421
crates/kreuzberg-tesseract/src/page_iterator.rs
Normal file
421
crates/kreuzberg-tesseract/src/page_iterator.rs
Normal file
@@ -0,0 +1,421 @@
|
||||
use crate::TesseractError;
|
||||
use crate::enums::{
|
||||
TessOrientation, TessPageIteratorLevel, TessParagraphJustification, TessPolyBlockType, TessTextlineOrder,
|
||||
TessWritingDirection,
|
||||
};
|
||||
use crate::error::Result;
|
||||
use std::os::raw::{c_float, c_int, c_void};
|
||||
use std::sync::Arc;
|
||||
use std::sync::Mutex;
|
||||
|
||||
/// Block-level layout information from Tesseract.
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct BlockInfo {
|
||||
pub block_type: TessPolyBlockType,
|
||||
pub left: i32,
|
||||
pub top: i32,
|
||||
pub right: i32,
|
||||
pub bottom: i32,
|
||||
}
|
||||
|
||||
/// Paragraph-level information from Tesseract.
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct ParaInfo {
|
||||
pub justification: TessParagraphJustification,
|
||||
pub is_list_item: bool,
|
||||
pub is_crown: bool,
|
||||
pub first_line_indent: i32,
|
||||
pub left: i32,
|
||||
pub top: i32,
|
||||
pub right: i32,
|
||||
pub bottom: i32,
|
||||
}
|
||||
|
||||
/// Owning wrapper around a raw Tesseract page-iterator pointer.
///
/// The pointer is released with `TessPageIteratorDelete` when the wrapper is
/// dropped.
pub struct PageIterator {
    /// Raw iterator pointer; every FFI call locks the mutex before using it.
    pub handle: Arc<Mutex<*mut c_void>>,
}

// SAFETY: the methods of this type only use the raw pointer while the mutex is held.
// NOTE(review): this additionally assumes the underlying Tesseract iterator has
// no thread affinity — confirm against the Tesseract documentation.
unsafe impl Send for PageIterator {}
// SAFETY: see the `Send` impl above; shared access is serialised by the mutex.
unsafe impl Sync for PageIterator {}
|
||||
|
||||
impl PageIterator {
|
||||
/// Creates a new instance of the PageIterator.
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// * `handle` - Pointer to the PageIterator.
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// Returns the new instance of the PageIterator.
|
||||
pub fn new(handle: *mut c_void) -> Self {
|
||||
PageIterator {
|
||||
handle: Arc::new(Mutex::new(handle)),
|
||||
}
|
||||
}
|
||||
|
||||
/// Begins the iteration.
|
||||
pub fn begin(&self) -> Result<()> {
|
||||
let handle = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
|
||||
unsafe { TessPageIteratorBegin(*handle) };
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Gets the next iterator.
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// * `level` - Level of the iterator.
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// Returns `Result<bool>` - `Ok(true)` if the next iterator is successful, `Ok(false)` otherwise.
|
||||
pub fn next(&self, level: TessPageIteratorLevel) -> Result<bool> {
|
||||
let handle = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
|
||||
Ok(unsafe { TessPageIteratorNext(*handle, level as c_int) != 0 })
|
||||
}
|
||||
|
||||
/// Checks if the current iterator is at the beginning of the specified level.
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// * `level` - Level of the iterator.
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// Returns `Result<bool>` - `Ok(true)` if at the beginning, `Ok(false)` otherwise.
|
||||
pub fn is_at_beginning_of(&self, level: TessPageIteratorLevel) -> Result<bool> {
|
||||
let handle = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
|
||||
Ok(unsafe { TessPageIteratorIsAtBeginningOf(*handle, level as c_int) != 0 })
|
||||
}
|
||||
|
||||
/// Checks if the current iterator is at the final element of the specified level.
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// * `level` - Level of the iterator.
|
||||
/// * `element` - Element of the iterator.
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// Returns `Result<bool>` - `Ok(true)` if at the final element, `Ok(false)` otherwise.
|
||||
pub fn is_at_final_element(&self, level: TessPageIteratorLevel, element: TessPageIteratorLevel) -> Result<bool> {
|
||||
let handle = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
|
||||
Ok(unsafe { TessPageIteratorIsAtFinalElement(*handle, level as c_int, element as c_int) != 0 })
|
||||
}
|
||||
|
||||
/// Gets the bounding box of the current iterator.
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// * `level` - Level of the bounding box.
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// Returns the bounding box as a tuple if successful, otherwise returns an error.
|
||||
pub fn bounding_box(&self, level: TessPageIteratorLevel) -> Result<(i32, i32, i32, i32)> {
|
||||
let mut left = 0;
|
||||
let mut top = 0;
|
||||
let mut right = 0;
|
||||
let mut bottom = 0;
|
||||
let handle = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
|
||||
let result = unsafe {
|
||||
TessPageIteratorBoundingBox(*handle, level as c_int, &mut left, &mut top, &mut right, &mut bottom)
|
||||
};
|
||||
if result == 0 {
|
||||
Err(TesseractError::InvalidParameterError)
|
||||
} else {
|
||||
Ok((left, top, right, bottom))
|
||||
}
|
||||
}
|
||||
|
||||
/// Gets the block type of the current iterator.
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// Returns the block type as a `TessPolyBlockType`.
|
||||
pub fn block_type(&self) -> Result<TessPolyBlockType> {
|
||||
let handle = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
|
||||
let block_type = unsafe { TessPageIteratorBlockType(*handle) };
|
||||
Ok(TessPolyBlockType::from_int(block_type))
|
||||
}
|
||||
|
||||
/// Gets the baseline of the current iterator.
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// * `level` - Level of the baseline.
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// Returns the baseline as a tuple if successful, otherwise returns an error.
|
||||
pub fn baseline(&self, level: i32) -> Result<(i32, i32, i32, i32)> {
|
||||
let mut x1 = 0;
|
||||
let mut y1 = 0;
|
||||
let mut x2 = 0;
|
||||
let mut y2 = 0;
|
||||
let handle = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
|
||||
let result = unsafe { TessPageIteratorBaseline(*handle, level, &mut x1, &mut y1, &mut x2, &mut y2) };
|
||||
if result == 0 {
|
||||
Err(TesseractError::InvalidParameterError)
|
||||
} else {
|
||||
Ok((x1, y1, x2, y2))
|
||||
}
|
||||
}
|
||||
|
||||
/// Gets the orientation of the current iterator.
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// Returns the orientation as a tuple if successful, otherwise returns an error.
|
||||
pub fn orientation(&self) -> Result<(TessOrientation, TessWritingDirection, TessTextlineOrder, f32)> {
|
||||
let mut orientation = 0;
|
||||
let mut writing_direction = 0;
|
||||
let mut textline_order = 0;
|
||||
let mut deskew_angle = 0.0;
|
||||
let handle = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
|
||||
let result = unsafe {
|
||||
TessPageIteratorOrientation(
|
||||
*handle,
|
||||
&mut orientation,
|
||||
&mut writing_direction,
|
||||
&mut textline_order,
|
||||
&mut deskew_angle,
|
||||
)
|
||||
};
|
||||
if result == 0 {
|
||||
Err(TesseractError::InvalidParameterError)
|
||||
} else {
|
||||
Ok((
|
||||
TessOrientation::from_int(orientation),
|
||||
TessWritingDirection::from_int(writing_direction),
|
||||
TessTextlineOrder::from_int(textline_order),
|
||||
deskew_angle,
|
||||
))
|
||||
}
|
||||
}
|
||||
|
||||
/// Extracts all blocks from the page in a single mutex-locked pass.
|
||||
///
|
||||
/// Resets the iterator to the beginning, then iterates at `RIL_BLOCK` level,
|
||||
/// collecting block type and bounding box for each block found.
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// Returns `Ok(Vec<BlockInfo>)` with one entry per block, or an error if the
|
||||
/// mutex cannot be acquired.
|
||||
pub fn extract_all_blocks(&self) -> Result<Vec<BlockInfo>> {
|
||||
let handle = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
|
||||
let level = TessPageIteratorLevel::RIL_BLOCK as c_int;
|
||||
let mut blocks = Vec::new();
|
||||
|
||||
// SAFETY: `*handle` is a valid non-null TessPageIterator pointer owned by this struct.
|
||||
// `TessPageIteratorBegin` resets the iterator to the first element and takes only
|
||||
// the pointer — no aliasing occurs because we hold the mutex for the duration.
|
||||
unsafe { TessPageIteratorBegin(*handle) };
|
||||
|
||||
loop {
|
||||
let block_type = unsafe {
|
||||
// SAFETY: `*handle` is valid; TessPageIteratorBlockType reads the current
|
||||
// iterator position and returns an integer enum value without taking ownership.
|
||||
TessPageIteratorBlockType(*handle)
|
||||
};
|
||||
|
||||
let mut left: c_int = 0;
|
||||
let mut top: c_int = 0;
|
||||
let mut right: c_int = 0;
|
||||
let mut bottom: c_int = 0;
|
||||
|
||||
let bbox_ok = unsafe {
|
||||
// SAFETY: `*handle` is valid; the four `*mut c_int` pointers point to local
|
||||
// stack variables whose lifetimes exceed this call.
|
||||
TessPageIteratorBoundingBox(*handle, level, &mut left, &mut top, &mut right, &mut bottom)
|
||||
};
|
||||
|
||||
if bbox_ok != 0 {
|
||||
blocks.push(BlockInfo {
|
||||
block_type: TessPolyBlockType::from_int(block_type),
|
||||
left,
|
||||
top,
|
||||
right,
|
||||
bottom,
|
||||
});
|
||||
}
|
||||
|
||||
let has_next = unsafe {
|
||||
// SAFETY: `*handle` is valid; TessPageIteratorNext advances the iterator
|
||||
// in-place and returns 0 when there are no more elements at this level.
|
||||
TessPageIteratorNext(*handle, level)
|
||||
};
|
||||
if has_next == 0 {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(blocks)
|
||||
}
|
||||
|
||||
/// Extracts all paragraphs from the page in a single mutex-locked pass.
|
||||
///
|
||||
/// Resets the iterator to the beginning, then iterates at `RIL_PARA` level,
|
||||
/// collecting paragraph metadata and bounding box for each paragraph found.
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// Returns `Ok(Vec<ParaInfo>)` with one entry per paragraph, or an error if the
|
||||
/// mutex cannot be acquired.
|
||||
pub fn extract_all_paragraphs(&self) -> Result<Vec<ParaInfo>> {
|
||||
let handle = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
|
||||
let level = TessPageIteratorLevel::RIL_PARA as c_int;
|
||||
let mut paragraphs = Vec::new();
|
||||
|
||||
// SAFETY: `*handle` is a valid non-null TessPageIterator pointer owned by this struct.
|
||||
// `TessPageIteratorBegin` resets the iterator to the first element; the mutex ensures
|
||||
// exclusive access for the entire loop.
|
||||
unsafe { TessPageIteratorBegin(*handle) };
|
||||
|
||||
loop {
|
||||
let mut justification: c_int = 0;
|
||||
// SAFETY: TessPageIteratorParagraphInfo expects BOOL* (int*) for is_list_item and
|
||||
// is_crown. Rust bool is 1 byte while C int is 4 bytes, so we use c_int temporaries
|
||||
// to avoid undefined behaviour (stack corruption) and convert afterwards.
|
||||
let mut is_list_item_raw: c_int = 0;
|
||||
let mut is_crown_raw: c_int = 0;
|
||||
let mut first_line_indent: c_int = 0;
|
||||
|
||||
let para_ok = unsafe {
|
||||
// SAFETY: `*handle` is valid; all output pointers reference stack variables
|
||||
// whose lifetimes exceed this call. TessPageIteratorParagraphInfo writes
|
||||
// through these pointers without retaining them.
|
||||
TessPageIteratorParagraphInfo(
|
||||
*handle,
|
||||
&mut justification,
|
||||
&mut is_list_item_raw,
|
||||
&mut is_crown_raw,
|
||||
&mut first_line_indent,
|
||||
)
|
||||
};
|
||||
|
||||
let is_list_item = is_list_item_raw != 0;
|
||||
let is_crown = is_crown_raw != 0;
|
||||
|
||||
let mut left: c_int = 0;
|
||||
let mut top: c_int = 0;
|
||||
let mut right: c_int = 0;
|
||||
let mut bottom: c_int = 0;
|
||||
|
||||
let bbox_ok = unsafe {
|
||||
// SAFETY: `*handle` is valid; the four `*mut c_int` pointers reference local
|
||||
// stack variables. TessPageIteratorBoundingBox does not retain these pointers.
|
||||
TessPageIteratorBoundingBox(*handle, level, &mut left, &mut top, &mut right, &mut bottom)
|
||||
};
|
||||
|
||||
if para_ok != 0 && bbox_ok != 0 {
|
||||
paragraphs.push(ParaInfo {
|
||||
justification: TessParagraphJustification::from_int(justification),
|
||||
is_list_item,
|
||||
is_crown,
|
||||
first_line_indent,
|
||||
left,
|
||||
top,
|
||||
right,
|
||||
bottom,
|
||||
});
|
||||
}
|
||||
|
||||
let has_next = unsafe {
|
||||
// SAFETY: `*handle` is valid; TessPageIteratorNext advances the iterator
|
||||
// in-place and returns 0 when there are no more elements at this level.
|
||||
TessPageIteratorNext(*handle, level)
|
||||
};
|
||||
if has_next == 0 {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(paragraphs)
|
||||
}
|
||||
|
||||
/// Gets the paragraph information of the current iterator.
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// Returns the paragraph information as a tuple if successful, otherwise returns an error.
|
||||
pub fn paragraph_info(&self) -> Result<(TessParagraphJustification, bool, bool, i32)> {
|
||||
let mut justification = 0;
|
||||
// SAFETY: TessPageIteratorParagraphInfo expects BOOL* (int*) for is_list_item and
|
||||
// is_crown. Rust bool is 1 byte while C int is 4 bytes, so we use c_int temporaries
|
||||
// to avoid undefined behaviour (stack corruption) and convert afterwards.
|
||||
let mut is_list_item_raw: c_int = 0;
|
||||
let mut is_crown_raw: c_int = 0;
|
||||
let mut first_line_indent = 0;
|
||||
let handle = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
|
||||
let result = unsafe {
|
||||
TessPageIteratorParagraphInfo(
|
||||
*handle,
|
||||
&mut justification,
|
||||
&mut is_list_item_raw,
|
||||
&mut is_crown_raw,
|
||||
&mut first_line_indent,
|
||||
)
|
||||
};
|
||||
if result == 0 {
|
||||
Err(TesseractError::InvalidParameterError)
|
||||
} else {
|
||||
Ok((
|
||||
TessParagraphJustification::from_int(justification),
|
||||
is_list_item_raw != 0,
|
||||
is_crown_raw != 0,
|
||||
first_line_indent,
|
||||
))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for PageIterator {
|
||||
fn drop(&mut self) {
|
||||
if let Ok(handle) = self.handle.lock() {
|
||||
unsafe { TessPageIteratorDelete(*handle) };
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Raw declarations of the Tesseract C API functions that operate on page iterators.
// All handles are opaque `*mut c_void` pointers; integer results are C `int`s that the
// callers in this file treat as booleans (0 = failure / no further element).
ffi_extern! {
    // Lifetime and traversal.
    pub fn TessPageIteratorDelete(handle: *mut c_void);
    pub fn TessPageIteratorBegin(handle: *mut c_void);
    pub fn TessPageIteratorNext(handle: *mut c_void, level: c_int) -> c_int;
    pub fn TessPageIteratorIsAtBeginningOf(handle: *mut c_void, level: c_int) -> c_int;
    pub fn TessPageIteratorIsAtFinalElement(handle: *mut c_void, level: c_int, element: c_int) -> c_int;

    // Geometry queries; results are written through the output pointers.
    pub fn TessPageIteratorBoundingBox(
        handle: *mut c_void,
        level: c_int,
        left: *mut c_int,
        top: *mut c_int,
        right: *mut c_int,
        bottom: *mut c_int,
    ) -> c_int;
    pub fn TessPageIteratorBlockType(handle: *mut c_void) -> c_int;
    pub fn TessPageIteratorBaseline(
        handle: *mut c_void,
        level: c_int,
        x1: *mut c_int,
        y1: *mut c_int,
        x2: *mut c_int,
        y2: *mut c_int,
    ) -> c_int;
    pub fn TessPageIteratorOrientation(
        handle: *mut c_void,
        orientation: *mut c_int,
        writing_direction: *mut c_int,
        textline_order: *mut c_int,
        deskew_angle: *mut c_float,
    ) -> c_int;

    // Obtains an iterator handle from a base API handle.
    pub fn TessBaseAPIGetIterator(handle: *mut c_void) -> *mut c_void;

    // `is_list_item` and `is_crown` are C `BOOL*` (int-sized) outputs, so callers must
    // pass `c_int` storage and convert to `bool` afterwards.
    pub fn TessPageIteratorParagraphInfo(
        handle: *mut c_void,
        justification: *mut c_int,
        is_list_item: *mut c_int,
        is_crown: *mut c_int,
        first_line_indent: *mut c_int,
    ) -> c_int;
}
|
||||
589
crates/kreuzberg-tesseract/src/result_iterator.rs
Normal file
589
crates/kreuzberg-tesseract/src/result_iterator.rs
Normal file
@@ -0,0 +1,589 @@
|
||||
use crate::api::TessDeleteText;
|
||||
use crate::enums::TessPageIteratorLevel;
|
||||
use crate::error::{Result, TesseractError};
|
||||
use std::ffi::CStr;
|
||||
use std::os::raw::{c_char, c_float, c_int, c_void};
|
||||
use std::sync::{Arc, Mutex};
|
||||
|
||||
/// Font attributes reported by Tesseract for a single word.
///
/// The boolean fields are derived from the integer flags written by
/// `TessResultIteratorWordFontAttributes` (non-zero means `true`).
#[derive(Clone, Debug)]
pub struct FontAttributes {
    /// Whether the word is set in a bold face.
    pub is_bold: bool,
    /// Whether the word is set in an italic face.
    pub is_italic: bool,
    /// Whether the word is underlined.
    pub is_underlined: bool,
    /// Whether the font is monospaced.
    pub is_monospace: bool,
    /// Whether the font is a serif font.
    pub is_serif: bool,
    /// Whether the word is set in small caps.
    pub is_smallcaps: bool,
    /// Point size as reported by Tesseract.
    pub pointsize: i32,
    /// Font identifier as reported by Tesseract.
    pub font_id: i32,
}
|
||||
|
||||
/// Complete word data extracted in a single mutex lock.
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct WordData {
|
||||
pub text: String,
|
||||
pub left: i32,
|
||||
pub top: i32,
|
||||
pub right: i32,
|
||||
pub bottom: i32,
|
||||
pub confidence: f32,
|
||||
pub font_attrs: Option<FontAttributes>,
|
||||
}
|
||||
|
||||
/// Owning wrapper around a native Tesseract result iterator.
///
/// The raw handle is kept behind a mutex so that every FFI call on it is serialized.
///
/// NOTE(review): `handle` is public and reference-counted, while `Drop` deletes the
/// native iterator unconditionally. A cloned `Arc` that outlives this value would hold
/// a pointer that has already been deleted; confirm no caller clones the handle.
pub struct ResultIterator {
    /// Mutex-guarded opaque pointer to the native iterator.
    pub handle: Arc<Mutex<*mut c_void>>,
}
|
||||
|
||||
unsafe impl Send for ResultIterator {}
|
||||
unsafe impl Sync for ResultIterator {}
|
||||
|
||||
impl ResultIterator {
|
||||
/// Creates a new instance of the ResultIterator.
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// * `handle` - Pointer to the ResultIterator.
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// Returns the new instance of the ResultIterator.
|
||||
pub fn new(handle: *mut c_void) -> Self {
|
||||
ResultIterator {
|
||||
handle: Arc::new(Mutex::new(handle)),
|
||||
}
|
||||
}
|
||||
|
||||
/// Gets the UTF-8 text of the current iterator.
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// * `level` - Level of the text.
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// Returns the UTF-8 text as a `String` if successful, otherwise returns an error.
|
||||
pub fn get_utf8_text(&self, level: TessPageIteratorLevel) -> Result<String> {
|
||||
let handle = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
|
||||
// SAFETY: TessResultIteratorGetUTF8Text() allocates and returns a pointer to a C string.
|
||||
// This is safe because:
|
||||
// 1. *handle is a valid pointer to an initialized ResultIterator (mutex-guarded)
|
||||
// 2. level is a valid TessPageIteratorLevel enum converted to c_int (in valid range)
|
||||
// 3. The returned pointer is either null (error) or a valid null-terminated C string
|
||||
// allocated on Tesseract's heap (must be freed with TessDeleteText)
|
||||
let text_ptr = unsafe { TessResultIteratorGetUTF8Text(*handle, level as c_int) };
|
||||
if text_ptr.is_null() {
|
||||
return Err(TesseractError::NullPointerError);
|
||||
}
|
||||
// SAFETY: We've verified text_ptr is non-null. The allocation/deallocation pattern is:
|
||||
// 1. text_ptr was allocated by TessResultIteratorGetUTF8Text() on the FFI boundary
|
||||
// 2. CStr::from_ptr(text_ptr) is safe: pointer is non-null and points to valid C string
|
||||
// 3. We read from the string (to_str() creates temporary immutable borrow)
|
||||
// 4. We immediately copy all data to owned String before deallocation
|
||||
// 5. The string data remains valid until TessDeleteText is called
|
||||
let c_str = unsafe { CStr::from_ptr(text_ptr) };
|
||||
let result = c_str.to_str()?.to_owned();
|
||||
// SAFETY: TessDeleteText() deallocates memory allocated by TessResultIteratorGetUTF8Text():
|
||||
// 1. text_ptr must be non-null (verified above)
|
||||
// 2. text_ptr came from the Tesseract API (trusted source, correct allocation)
|
||||
// 3. TessDeleteText() is the correct deallocation function for this allocation
|
||||
// 4. Must be called exactly once per allocation to avoid double-free (we ensure this)
|
||||
// 5. After this call, text_ptr is invalid; all uses must be via owned result String
|
||||
unsafe { TessDeleteText(text_ptr as *mut c_char) };
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
/// Gets the confidence of the current iterator.
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// * `level` - Level of the confidence.
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// Returns the confidence as a `f32`.
|
||||
pub fn confidence(&self, level: TessPageIteratorLevel) -> Result<f32> {
|
||||
let handle = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
|
||||
// SAFETY: TessResultIteratorConfidence() is safe because:
|
||||
// 1. *handle is a valid pointer to an initialized ResultIterator
|
||||
// 2. level is a valid TessPageIteratorLevel enum converted to c_int
|
||||
// 3. The function only reads state and returns an f32 value (copyable)
|
||||
// 4. No pointer operations or memory access is needed
|
||||
Ok(unsafe { TessResultIteratorConfidence(*handle, level as c_int) })
|
||||
}
|
||||
|
||||
/// Gets the recognition language of the current iterator.
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// Returns the recognition language as a `String` if successful, otherwise returns an error.
|
||||
pub fn word_recognition_language(&self) -> Result<String> {
|
||||
let handle = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
|
||||
// SAFETY: TessResultIteratorWordRecognitionLanguage() returns a pointer to a C string
|
||||
// in the iterator's memory. This is safe because:
|
||||
// 1. *handle is a valid pointer to an initialized ResultIterator
|
||||
// 2. The returned pointer is either null or a valid null-terminated C string
|
||||
let lang_ptr = unsafe { TessResultIteratorWordRecognitionLanguage(*handle) };
|
||||
if lang_ptr.is_null() {
|
||||
return Err(TesseractError::NullPointerError);
|
||||
}
|
||||
// SAFETY: We've verified lang_ptr is non-null. CStr::from_ptr() is safe because:
|
||||
// 1. lang_ptr points to a valid null-terminated C string managed by Tesseract
|
||||
// 2. We only read from it (to_str() creates temporary borrow)
|
||||
let c_str = unsafe { CStr::from_ptr(lang_ptr) };
|
||||
Ok(c_str.to_str()?.to_owned())
|
||||
}
|
||||
|
||||
/// Gets the font attributes of the current iterator.
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// Returns the font attributes as a tuple if successful, otherwise returns an error.
|
||||
pub fn word_font_attributes(&self) -> Result<(bool, bool, bool, bool, bool, bool, i32, i32)> {
|
||||
let handle = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
|
||||
let mut is_bold = 0;
|
||||
let mut is_italic = 0;
|
||||
let mut is_underlined = 0;
|
||||
let mut is_monospace = 0;
|
||||
let mut is_serif = 0;
|
||||
let mut is_smallcaps = 0;
|
||||
let mut pointsize = 0;
|
||||
let mut font_id = 0;
|
||||
|
||||
// SAFETY: TessResultIteratorWordFontAttributes() takes output parameter pointers
|
||||
// and fills them with font attribute values. This is safe because:
|
||||
// 1. *handle is a valid pointer to an initialized ResultIterator (mutex-guarded)
|
||||
// 2. All mutable references (&mut ...) are valid local stack variables
|
||||
// 3. Each reference has a distinct memory location (no aliasing)
|
||||
// 4. The references outlive the FFI call (defined on stack, used immediately after)
|
||||
// 5. The function writes output i32 values (0/1 for bools, integers for size/id)
|
||||
// 6. Each reference has exclusive mutable access (Rust borrow checker enforces this)
|
||||
// 7. The output parameters are independent (function cannot cause data races)
|
||||
let result = unsafe {
|
||||
TessResultIteratorWordFontAttributes(
|
||||
*handle,
|
||||
&mut is_bold,
|
||||
&mut is_italic,
|
||||
&mut is_underlined,
|
||||
&mut is_monospace,
|
||||
&mut is_serif,
|
||||
&mut is_smallcaps,
|
||||
&mut pointsize,
|
||||
&mut font_id,
|
||||
)
|
||||
};
|
||||
|
||||
if result == 0 {
|
||||
Err(TesseractError::InvalidParameterError)
|
||||
} else {
|
||||
Ok((
|
||||
is_bold != 0,
|
||||
is_italic != 0,
|
||||
is_underlined != 0,
|
||||
is_monospace != 0,
|
||||
is_serif != 0,
|
||||
is_smallcaps != 0,
|
||||
pointsize,
|
||||
font_id,
|
||||
))
|
||||
}
|
||||
}
|
||||
|
||||
/// Checks if the current iterator is from the dictionary.
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// Returns `true` if the current iterator is from the dictionary, otherwise returns `false`.
|
||||
pub fn word_is_from_dictionary(&self) -> Result<bool> {
|
||||
let handle = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
|
||||
// SAFETY: TessResultIteratorWordIsFromDictionary() is safe because:
|
||||
// 1. *handle is a valid pointer to an initialized ResultIterator
|
||||
// 2. The function only reads state and returns an i32 value (0 or non-zero)
|
||||
// 3. No pointer operations or memory modifications are needed
|
||||
Ok(unsafe { TessResultIteratorWordIsFromDictionary(*handle) != 0 })
|
||||
}
|
||||
|
||||
/// Checks if the current iterator is numeric.
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// Returns `true` if the current iterator is numeric, otherwise returns `false`.
|
||||
pub fn word_is_numeric(&self) -> Result<bool> {
|
||||
let handle = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
|
||||
// SAFETY: TessResultIteratorWordIsNumeric() is safe because:
|
||||
// 1. *handle is a valid pointer to an initialized ResultIterator
|
||||
// 2. The function only reads state and returns an i32 value
|
||||
// 3. No pointer operations or state modifications needed
|
||||
Ok(unsafe { TessResultIteratorWordIsNumeric(*handle) != 0 })
|
||||
}
|
||||
|
||||
/// Checks if the current iterator is superscript.
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// Returns `true` if the current iterator is superscript, otherwise returns `false`.
|
||||
pub fn symbol_is_superscript(&self) -> Result<bool> {
|
||||
let handle = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
|
||||
// SAFETY: TessResultIteratorSymbolIsSuperscript() is safe because:
|
||||
// 1. *handle is a valid pointer to an initialized ResultIterator
|
||||
// 2. The function only reads state and returns an i32 value
|
||||
// 3. No pointer operations or state modifications needed
|
||||
Ok(unsafe { TessResultIteratorSymbolIsSuperscript(*handle) != 0 })
|
||||
}
|
||||
|
||||
/// Checks if the current iterator is subscript.
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// Returns `true` if the current iterator is subscript, otherwise returns `false`.
|
||||
pub fn symbol_is_subscript(&self) -> Result<bool> {
|
||||
let handle = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
|
||||
// SAFETY: TessResultIteratorSymbolIsSubscript() is safe because:
|
||||
// 1. *handle is a valid pointer to an initialized ResultIterator
|
||||
// 2. The function only reads state and returns an i32 value
|
||||
// 3. No pointer operations or state modifications needed
|
||||
Ok(unsafe { TessResultIteratorSymbolIsSubscript(*handle) != 0 })
|
||||
}
|
||||
|
||||
/// Checks if the current iterator is dropcap.
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// Returns `true` if the current iterator is dropcap, otherwise returns `false`.
|
||||
pub fn symbol_is_dropcap(&self) -> Result<bool> {
|
||||
let handle = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
|
||||
// SAFETY: TessResultIteratorSymbolIsDropcap() is safe because:
|
||||
// 1. *handle is a valid pointer to an initialized ResultIterator
|
||||
// 2. The function only reads state and returns an i32 value
|
||||
// 3. No pointer operations or state modifications needed
|
||||
Ok(unsafe { TessResultIteratorSymbolIsDropcap(*handle) != 0 })
|
||||
}
|
||||
|
||||
/// Moves to the next iterator.
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// * `level` - Level of the next iterator.
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// Returns `true` if the next iterator exists, otherwise returns `false`.
|
||||
pub fn next(&self, level: TessPageIteratorLevel) -> Result<bool> {
|
||||
let handle = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
|
||||
// SAFETY: TessResultIteratorNext() is safe because:
|
||||
// 1. *handle is a valid pointer to an initialized ResultIterator
|
||||
// 2. level is a valid TessPageIteratorLevel enum converted to c_int
|
||||
// 3. The function modifies iterator state (advances position) and returns i32 result
|
||||
// 4. The mutex ensures exclusive access during state modification
|
||||
Ok(unsafe { TessResultIteratorNext(*handle, level as c_int) != 0 })
|
||||
}
|
||||
|
||||
/// Gets the current word from the iterator with its bounding box and confidence.
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// Returns a tuple of (text, left, top, right, bottom, confidence) if successful
|
||||
pub fn get_word_with_bounds(&self) -> Result<(String, i32, i32, i32, i32, f32)> {
|
||||
let text = self.get_utf8_text(TessPageIteratorLevel::RIL_WORD)?;
|
||||
let (left, top, right, bottom) = self.get_bounding_box(TessPageIteratorLevel::RIL_WORD)?;
|
||||
let confidence = self.confidence(TessPageIteratorLevel::RIL_WORD)?;
|
||||
|
||||
Ok((text, left, top, right, bottom, confidence))
|
||||
}
|
||||
|
||||
/// Advances the iterator to the next word.
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// Returns true if successful, false if there are no more words
|
||||
pub fn next_word(&self) -> Result<bool> {
|
||||
self.next(TessPageIteratorLevel::RIL_WORD)
|
||||
}
|
||||
|
||||
/// Gets the word information for the current position in the iterator.
|
||||
/// Should be called before next() to ensure valid data.
|
||||
///
|
||||
/// # Returns
|
||||
/// Returns a tuple of (text, left, top, right, bottom, confidence) if successful
|
||||
pub fn get_current_word(&self) -> Result<(String, i32, i32, i32, i32, f32)> {
|
||||
let text = self.get_utf8_text(TessPageIteratorLevel::RIL_WORD)?;
|
||||
let (left, top, right, bottom) = self.get_bounding_box(TessPageIteratorLevel::RIL_WORD)?;
|
||||
let confidence = self.confidence(TessPageIteratorLevel::RIL_WORD)?;
|
||||
|
||||
Ok((text, left, top, right, bottom, confidence))
|
||||
}
|
||||
|
||||
/// Gets the bounding box for the current element.
|
||||
pub fn get_bounding_box(&self, level: TessPageIteratorLevel) -> Result<(i32, i32, i32, i32)> {
|
||||
let mut left = 0;
|
||||
let mut top = 0;
|
||||
let mut right = 0;
|
||||
let mut bottom = 0;
|
||||
|
||||
let handle = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
|
||||
|
||||
// SAFETY: TessPageIteratorBoundingBox() queries iterator state and returns coordinates
|
||||
// via output parameters. This is safe because:
|
||||
// 1. *handle is a valid pointer to an initialized ResultIterator or PageIterator (mutex-guarded)
|
||||
// 2. level is a valid TessPageIteratorLevel enum converted to c_int (in valid range)
|
||||
// 3. All mutable references (&mut left, &mut top, &mut right, &mut bottom)
|
||||
// are valid local stack variables with distinct memory locations
|
||||
// 4. Each reference is exclusively borrowed (Rust enforces no aliasing)
|
||||
// 5. The references outlive the FFI call (defined on stack, used immediately after)
|
||||
// 6. The function writes four i32 coordinate values into these references
|
||||
// 7. No pointer escaping: the function only writes to these parameters, doesn't store them
|
||||
// 8. Return value indicates success/failure (checked below)
|
||||
let result = unsafe {
|
||||
TessPageIteratorBoundingBox(*handle, level as c_int, &mut left, &mut top, &mut right, &mut bottom)
|
||||
};
|
||||
|
||||
if result == 0 {
|
||||
Err(TesseractError::InvalidParameterError)
|
||||
} else {
|
||||
Ok((left, top, right, bottom))
|
||||
}
|
||||
}
|
||||
|
||||
/// Extracts all word data from the iterator in a single mutex lock.
|
||||
///
|
||||
/// Acquires the mutex once and iterates all words, collecting text, bounding box,
|
||||
/// confidence, and font attributes for each word. This is more efficient than
|
||||
/// calling individual methods in a loop since it avoids repeated mutex acquisitions.
|
||||
///
|
||||
/// The iterator is always reset to the beginning before traversal so that partial
|
||||
/// prior consumption does not cause words to be missed.
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// Returns a `Vec<WordData>` containing data for every word, or an error if the
|
||||
/// mutex cannot be acquired.
|
||||
pub fn extract_all_words(&self) -> Result<Vec<WordData>> {
|
||||
let handle = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
|
||||
let raw = *handle;
|
||||
let mut words = Vec::new();
|
||||
|
||||
// Reset to the first element before traversal. ResultIterator inherits from
|
||||
// PageIterator in C++, so TessPageIteratorBegin operates on the same handle.
|
||||
// SAFETY: raw is a valid mutex-guarded ResultIterator pointer; TessPageIteratorBegin
|
||||
// simply resets the internal position and does not allocate or free memory.
|
||||
unsafe { TessPageIteratorBegin(raw) };
|
||||
|
||||
loop {
|
||||
// SAFETY: raw is the mutex-guarded *mut c_void handle. All calls within this
|
||||
// loop are performed while holding the mutex lock, ensuring exclusive access.
|
||||
// We pass raw directly to the unlocked helper to avoid re-locking.
|
||||
match extract_word_data_unlocked(raw) {
|
||||
Ok(word) => words.push(word),
|
||||
// NullPointerError means the text pointer was null; skip this position.
|
||||
// InvalidParameterError means bounding box failed; skip this position.
|
||||
// Utf8Error means the text was not valid UTF-8; skip this word rather than
|
||||
// aborting, so the remaining words in the iterator are not lost.
|
||||
Err(TesseractError::NullPointerError)
|
||||
| Err(TesseractError::InvalidParameterError)
|
||||
| Err(TesseractError::Utf8Error(_)) => {}
|
||||
Err(e) => return Err(e),
|
||||
}
|
||||
|
||||
// SAFETY: TessResultIteratorNext() advances the iterator state and returns
|
||||
// non-zero if a next element exists. This is safe because:
|
||||
// 1. raw is a valid pointer to an initialized ResultIterator (mutex-guarded)
|
||||
// 2. RIL_WORD is a valid TessPageIteratorLevel enum value
|
||||
// 3. The mutex is held for the duration of this call (exclusive access)
|
||||
// 4. The function modifies iterator position and returns an i32 result
|
||||
let has_next = unsafe { TessResultIteratorNext(raw, TessPageIteratorLevel::RIL_WORD as c_int) != 0 };
|
||||
if !has_next {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(words)
|
||||
}
|
||||
|
||||
/// Extracts the current word's data in a single mutex lock.
|
||||
///
|
||||
/// Acquires the mutex once and calls all FFI functions (text, bounding box,
|
||||
/// confidence, font attributes) within that lock scope. More efficient than
|
||||
/// calling the individual methods separately when all fields are needed.
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// Returns a [`WordData`] struct if successful, otherwise returns an error.
|
||||
pub fn extract_word_data(&self) -> Result<WordData> {
|
||||
let handle = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
|
||||
extract_word_data_unlocked(*handle)
|
||||
}
|
||||
}
|
||||
|
||||
/// Extracts word data from a raw iterator handle without acquiring the mutex.
|
||||
///
|
||||
/// The caller MUST hold the mutex lock for the `ResultIterator` this handle belongs to
|
||||
/// before calling this function. Passing a handle that is not mutex-guarded, or calling
|
||||
/// this function concurrently on the same handle, is undefined behaviour.
|
||||
fn extract_word_data_unlocked(raw: *mut c_void) -> Result<WordData> {
|
||||
// SAFETY: TessResultIteratorGetUTF8Text() allocates and returns a pointer to a C string.
|
||||
// This is safe because:
|
||||
// 1. raw is a valid pointer to an initialized ResultIterator (caller holds mutex lock)
|
||||
// 2. RIL_WORD is a valid TessPageIteratorLevel enum value converted to c_int
|
||||
// 3. The returned pointer is either null (error) or a valid null-terminated C string
|
||||
// allocated on Tesseract's heap (must be freed with TessDeleteText)
|
||||
let text_ptr = unsafe { TessResultIteratorGetUTF8Text(raw, TessPageIteratorLevel::RIL_WORD as c_int) };
|
||||
if text_ptr.is_null() {
|
||||
return Err(TesseractError::NullPointerError);
|
||||
}
|
||||
// SAFETY: We've verified text_ptr is non-null. The allocation/deallocation pattern is:
|
||||
// 1. text_ptr was allocated by TessResultIteratorGetUTF8Text() on the FFI boundary
|
||||
// 2. CStr::from_ptr(text_ptr) is safe: pointer is non-null and points to valid C string
|
||||
// 3. We immediately copy all data to an owned String before deallocation
|
||||
// 4. The string data remains valid until TessDeleteText is called
|
||||
let text = {
|
||||
let c_str = unsafe { CStr::from_ptr(text_ptr) };
|
||||
let owned = c_str.to_str()?.to_owned();
|
||||
// SAFETY: TessDeleteText() deallocates memory allocated by TessResultIteratorGetUTF8Text():
|
||||
// 1. text_ptr is non-null (verified above)
|
||||
// 2. text_ptr came from the Tesseract API (correct allocation type)
|
||||
// 3. TessDeleteText() is the correct deallocation function for this allocation
|
||||
// 4. Called exactly once per allocation to avoid double-free
|
||||
// 5. owned String was already populated; text_ptr is no longer accessed after this call
|
||||
unsafe { TessDeleteText(text_ptr as *mut c_char) };
|
||||
owned
|
||||
};
|
||||
|
||||
let mut left = 0;
|
||||
let mut top = 0;
|
||||
let mut right = 0;
|
||||
let mut bottom = 0;
|
||||
// SAFETY: TessPageIteratorBoundingBox() queries iterator state and fills output parameters.
|
||||
// This is safe because:
|
||||
// 1. raw is a valid pointer to an initialized ResultIterator (caller holds mutex lock)
|
||||
// 2. RIL_WORD is a valid TessPageIteratorLevel enum value converted to c_int
|
||||
// 3. All mutable references are valid local stack variables with distinct memory locations
|
||||
// 4. Each reference is exclusively borrowed (Rust enforces no aliasing)
|
||||
// 5. The references outlive the FFI call (defined on stack, used immediately after)
|
||||
// 6. Return value indicates success/failure (checked below)
|
||||
let bbox_result = unsafe {
|
||||
TessPageIteratorBoundingBox(
|
||||
raw,
|
||||
TessPageIteratorLevel::RIL_WORD as c_int,
|
||||
&mut left,
|
||||
&mut top,
|
||||
&mut right,
|
||||
&mut bottom,
|
||||
)
|
||||
};
|
||||
if bbox_result == 0 {
|
||||
return Err(TesseractError::InvalidParameterError);
|
||||
}
|
||||
|
||||
// SAFETY: TessResultIteratorConfidence() reads iterator state and returns an f32 value.
|
||||
// This is safe because:
|
||||
// 1. raw is a valid pointer to an initialized ResultIterator (caller holds mutex lock)
|
||||
// 2. RIL_WORD is a valid TessPageIteratorLevel enum value converted to c_int
|
||||
// 3. The function only reads state and returns a copy (no pointer operations)
|
||||
let confidence = unsafe { TessResultIteratorConfidence(raw, TessPageIteratorLevel::RIL_WORD as c_int) };
|
||||
|
||||
// Collect font attributes; treat any failure as absent rather than propagating the error.
|
||||
let font_attrs = {
|
||||
let mut is_bold = 0;
|
||||
let mut is_italic = 0;
|
||||
let mut is_underlined = 0;
|
||||
let mut is_monospace = 0;
|
||||
let mut is_serif = 0;
|
||||
let mut is_smallcaps = 0;
|
||||
let mut pointsize = 0;
|
||||
let mut font_id = 0;
|
||||
// SAFETY: TessResultIteratorWordFontAttributes() fills output parameters with font info.
|
||||
// This is safe because:
|
||||
// 1. raw is a valid pointer to an initialized ResultIterator (caller holds mutex lock)
|
||||
// 2. All mutable references are valid local stack variables with distinct memory locations
|
||||
// 3. Each reference is exclusively borrowed (no aliasing)
|
||||
// 4. The references outlive the FFI call
|
||||
// 5. Return value is non-zero on success, zero on failure (checked below)
|
||||
let result = unsafe {
|
||||
TessResultIteratorWordFontAttributes(
|
||||
raw,
|
||||
&mut is_bold,
|
||||
&mut is_italic,
|
||||
&mut is_underlined,
|
||||
&mut is_monospace,
|
||||
&mut is_serif,
|
||||
&mut is_smallcaps,
|
||||
&mut pointsize,
|
||||
&mut font_id,
|
||||
)
|
||||
};
|
||||
if result != 0 {
|
||||
Some(FontAttributes {
|
||||
is_bold: is_bold != 0,
|
||||
is_italic: is_italic != 0,
|
||||
is_underlined: is_underlined != 0,
|
||||
is_monospace: is_monospace != 0,
|
||||
is_serif: is_serif != 0,
|
||||
is_smallcaps: is_smallcaps != 0,
|
||||
pointsize,
|
||||
font_id,
|
||||
})
|
||||
} else {
|
||||
None
|
||||
}
|
||||
};
|
||||
|
||||
Ok(WordData {
|
||||
text,
|
||||
left,
|
||||
top,
|
||||
right,
|
||||
bottom,
|
||||
confidence,
|
||||
font_attrs,
|
||||
})
|
||||
}
|
||||
|
||||
impl Drop for ResultIterator {
|
||||
fn drop(&mut self) {
|
||||
if let Ok(handle) = self.handle.lock() {
|
||||
// SAFETY: TessResultIteratorDelete() frees the ResultIterator handle allocated by Tesseract:
|
||||
// 1. We use .ok() pattern to handle poisoned mutex gracefully (no panic in Drop)
|
||||
// 2. *handle is a valid opaque pointer allocated by TessBaseAPIGetIterator()
|
||||
// or TessBaseAPIGetMutableIterator() - Tesseract owns this memory
|
||||
// 3. TessResultIteratorDelete() is the single correct way to deallocate this type
|
||||
// 4. The function must be called exactly once per allocation to avoid double-free
|
||||
// 5. After calling delete, the pointer is invalid; future use would cause use-after-free
|
||||
// 6. Drop impl never panics (we use .ok() guard), ensuring cleanup always executes
|
||||
// 7. If mutex is poisoned, handle cleanup is skipped (OS will reclaim process memory)
|
||||
unsafe { TessResultIteratorDelete(*handle) };
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Raw declarations of the Tesseract C API functions used by `ResultIterator`.
// They are only compiled when one of the Tesseract build features is enabled.
// `TessPageIteratorBegin` and `TessPageIteratorBoundingBox` are page-iterator functions
// that this file applies to result-iterator handles.
#[cfg(any(feature = "build-tesseract", feature = "build-tesseract-wasm"))]
ffi_extern! {
    pub fn TessResultIteratorDelete(handle: *mut c_void);
    pub fn TessPageIteratorBegin(handle: *mut c_void);

    // Returns a heap-allocated string that the caller must release with `TessDeleteText`.
    pub fn TessResultIteratorGetUTF8Text(handle: *mut c_void, level: c_int) -> *mut c_char;
    pub fn TessResultIteratorConfidence(handle: *mut c_void, level: c_int) -> c_float;
    // Declared as `*const c_char`; callers in this file copy the string and do not free it.
    pub fn TessResultIteratorWordRecognitionLanguage(handle: *mut c_void) -> *const c_char;

    // NOTE(review): upstream Tesseract's `capi.h` appears to declare this function as
    // returning `const char *` (the font name, NULL when unavailable) rather than an
    // integer. The callers in this file compare the result with 0; confirm the declared
    // return type against the linked Tesseract version, and change the declaration and
    // the callers together if it is a pointer.
    pub fn TessResultIteratorWordFontAttributes(
        handle: *mut c_void,
        is_bold: *mut c_int,
        is_italic: *mut c_int,
        is_underlined: *mut c_int,
        is_monospace: *mut c_int,
        is_serif: *mut c_int,
        is_smallcaps: *mut c_int,
        pointsize: *mut c_int,
        font_id: *mut c_int,
    ) -> c_int;

    // Boolean-style queries: callers treat any non-zero result as `true`.
    pub fn TessResultIteratorWordIsFromDictionary(handle: *mut c_void) -> c_int;
    pub fn TessResultIteratorWordIsNumeric(handle: *mut c_void) -> c_int;
    pub fn TessResultIteratorSymbolIsSuperscript(handle: *mut c_void) -> c_int;
    pub fn TessResultIteratorSymbolIsSubscript(handle: *mut c_void) -> c_int;
    pub fn TessResultIteratorSymbolIsDropcap(handle: *mut c_void) -> c_int;

    // Returns 0 when there is no further element at `level`.
    pub fn TessResultIteratorNext(handle: *mut c_void, level: c_int) -> c_int;

    pub fn TessPageIteratorBoundingBox(
        handle: *mut c_void,
        level: c_int,
        left: *mut c_int,
        top: *mut c_int,
        right: *mut c_int,
        bottom: *mut c_int,
    ) -> c_int;
}
|
||||
212
crates/kreuzberg-tesseract/src/result_renderer.rs
Normal file
212
crates/kreuzberg-tesseract/src/result_renderer.rs
Normal file
@@ -0,0 +1,212 @@
|
||||
use crate::TesseractAPI;
|
||||
use crate::error::{Result, TesseractError};
|
||||
use std::ffi::{CStr, CString};
|
||||
use std::os::raw::{c_char, c_int, c_void};
|
||||
use std::sync::Arc;
|
||||
use std::sync::Mutex;
|
||||
|
||||
/// Wrapper around a native Tesseract result renderer.
///
/// The raw handle is kept behind a mutex so that FFI calls on it are serialized.
pub struct TessResultRenderer {
    /// Mutex-guarded opaque pointer to the native renderer.
    handle: Arc<Mutex<*mut c_void>>,
}
|
||||
|
||||
unsafe impl Send for TessResultRenderer {}
|
||||
unsafe impl Sync for TessResultRenderer {}
|
||||
|
||||
impl TessResultRenderer {
|
||||
/// Creates a new instance of the TessResultRenderer.
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// * `outputbase` - Output base path.
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// Returns the new instance of the TessResultRenderer.
|
||||
pub fn new_text_renderer(outputbase: &str) -> Result<Self> {
|
||||
let outputbase = CString::new(outputbase).map_err(|_| TesseractError::NullByteInString)?;
|
||||
let handle = unsafe { TessTextRendererCreate(outputbase.as_ptr()) };
|
||||
if handle.is_null() {
|
||||
Err(TesseractError::NullPointerError)
|
||||
} else {
|
||||
Ok(TessResultRenderer {
|
||||
handle: Arc::new(Mutex::new(handle)),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
/// Creates a new instance of the TessResultRenderer for HOCR.
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// * `outputbase` - Output base path.
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// Returns the new instance of the TessResultRenderer.
|
||||
pub fn new_hocr_renderer(outputbase: &str) -> Result<Self> {
|
||||
let outputbase = CString::new(outputbase).map_err(|_| TesseractError::NullByteInString)?;
|
||||
let handle = unsafe { TessHOcrRendererCreate(outputbase.as_ptr()) };
|
||||
if handle.is_null() {
|
||||
Err(TesseractError::NullPointerError)
|
||||
} else {
|
||||
Ok(TessResultRenderer {
|
||||
handle: Arc::new(Mutex::new(handle)),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
/// Creates a new instance of the TessResultRenderer for PDF.
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// * `outputbase` - Output base path.
|
||||
/// * `datadir` - Data directory path.
|
||||
/// * `textonly` - Whether to include text only.
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// Returns the new instance of the TessResultRenderer.
|
||||
pub fn new_pdf_renderer(outputbase: &str, datadir: &str, textonly: bool) -> Result<Self> {
|
||||
let outputbase = CString::new(outputbase).map_err(|_| TesseractError::NullByteInString)?;
|
||||
let datadir = CString::new(datadir).map_err(|_| TesseractError::NullByteInString)?;
|
||||
let handle = unsafe { TessPDFRendererCreate(outputbase.as_ptr(), datadir.as_ptr(), textonly as c_int) };
|
||||
if handle.is_null() {
|
||||
Err(TesseractError::NullPointerError)
|
||||
} else {
|
||||
Ok(TessResultRenderer {
|
||||
handle: Arc::new(Mutex::new(handle)),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
/// Begins a new document.
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// * `title` - Title of the document.
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// Returns `true` if the document was created successfully, otherwise returns `false`.
|
||||
///
|
||||
/// # Errors
|
||||
///
|
||||
/// Returns a `TesseractError` if the string contains a null byte or if the mutex lock fails.
|
||||
pub fn begin_document(&self, title: &str) -> Result<bool> {
|
||||
let title = CString::new(title).map_err(|_| TesseractError::NullByteInString)?;
|
||||
let handle = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
|
||||
Ok(unsafe { TessResultRendererBeginDocument(*handle, title.as_ptr()) != 0 })
|
||||
}
|
||||
|
||||
/// Adds an image to the document.
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// * `api` - The TesseractAPI instance.
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// Returns `true` if the image was added successfully, otherwise returns `false`.
|
||||
///
|
||||
/// # Errors
|
||||
///
|
||||
/// Returns a `TesseractError::MutexLockError` if either mutex lock fails.
|
||||
pub fn add_image(&self, api: &TesseractAPI) -> Result<bool> {
|
||||
let api_handle = api.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
|
||||
let handle = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
|
||||
Ok(unsafe { TessResultRendererAddImage(*handle, *api_handle) != 0 })
|
||||
}
|
||||
|
||||
/// Ends the document.
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// Returns `true` if the document was ended successfully, otherwise returns `false`.
|
||||
///
|
||||
/// # Errors
|
||||
///
|
||||
/// Returns a `TesseractError::MutexLockError` if the mutex lock fails.
|
||||
pub fn end_document(&self) -> Result<bool> {
|
||||
let handle = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
|
||||
Ok(unsafe { TessResultRendererEndDocument(*handle) != 0 })
|
||||
}
|
||||
|
||||
/// Gets the extension of the document.
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// Returns the extension as a `String` if successful, otherwise returns an error.
|
||||
///
|
||||
/// # Errors
|
||||
///
|
||||
/// Returns a `TesseractError::MutexLockError` if the mutex lock fails,
|
||||
/// `TesseractError::NullPointerError` if the extension pointer is null,
|
||||
/// or `TesseractError::Utf8Error` if the extension contains invalid UTF-8.
|
||||
pub fn get_extension(&self) -> Result<String> {
|
||||
let handle = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
|
||||
let ext_ptr = unsafe { TessResultRendererExtention(*handle) };
|
||||
if ext_ptr.is_null() {
|
||||
Err(TesseractError::NullPointerError)
|
||||
} else {
|
||||
let c_str = unsafe { CStr::from_ptr(ext_ptr) };
|
||||
Ok(c_str.to_str()?.to_owned())
|
||||
}
|
||||
}
|
||||
|
||||
/// Gets the title of the document.
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// Returns the title as a `String` if successful, otherwise returns an error.
|
||||
///
|
||||
/// # Errors
|
||||
///
|
||||
/// Returns a `TesseractError::MutexLockError` if the mutex lock fails,
|
||||
/// `TesseractError::NullPointerError` if the title pointer is null,
|
||||
/// or `TesseractError::Utf8Error` if the title contains invalid UTF-8.
|
||||
pub fn get_title(&self) -> Result<String> {
|
||||
let handle = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
|
||||
let title_ptr = unsafe { TessResultRendererTitle(*handle) };
|
||||
if title_ptr.is_null() {
|
||||
Err(TesseractError::NullPointerError)
|
||||
} else {
|
||||
let c_str = unsafe { CStr::from_ptr(title_ptr) };
|
||||
Ok(c_str.to_str()?.to_owned())
|
||||
}
|
||||
}
|
||||
|
||||
/// Gets the number of images in the document.
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// Returns the number of images as an `i32`.
|
||||
///
|
||||
/// # Errors
|
||||
///
|
||||
/// Returns a `TesseractError::MutexLockError` if the mutex lock fails.
|
||||
pub fn get_image_num(&self) -> Result<i32> {
|
||||
let handle = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
|
||||
Ok(unsafe { TessResultRendererImageNum(*handle) })
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for TessResultRenderer {
|
||||
fn drop(&mut self) {
|
||||
if let Ok(handle) = self.handle.lock() {
|
||||
unsafe { TessDeleteResultRenderer(*handle) };
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Raw declarations of the Tesseract C API renderer functions called by
// `TessResultRenderer`. Renderer and API objects are passed as untyped `*mut c_void`
// handles.
// NOTE(review): `ffi_extern!` is a project macro defined outside this file; how it
// expands should be confirmed at its definition.
ffi_extern! {
|
||||
// Constructors: the wrappers treat a null return value as `NullPointerError`.
pub fn TessTextRendererCreate(outputbase: *const c_char) -> *mut c_void;
|
||||
pub fn TessHOcrRendererCreate(outputbase: *const c_char) -> *mut c_void;
|
||||
pub fn TessPDFRendererCreate(outputbase: *const c_char, datadir: *const c_char, textonly: c_int) -> *mut c_void;
|
||||
// Destructor: called from `Drop for TessResultRenderer`.
pub fn TessDeleteResultRenderer(renderer: *mut c_void);
|
||||
// Document lifecycle: the wrappers map a non-zero result to `true`.
pub fn TessResultRendererBeginDocument(renderer: *mut c_void, title: *const c_char) -> c_int;
|
||||
pub fn TessResultRendererAddImage(renderer: *mut c_void, api: *mut c_void) -> c_int;
|
||||
pub fn TessResultRendererEndDocument(renderer: *mut c_void) -> c_int;
|
||||
// Accessors: the string results are null-checked and copied by the wrappers.
// NOTE(review): `Extention` (sic) presumably mirrors the spelling of the exported C
// symbol — check Tesseract's `capi.h` before "correcting" it.
pub fn TessResultRendererExtention(renderer: *mut c_void) -> *const c_char;
|
||||
pub fn TessResultRendererTitle(renderer: *mut c_void) -> *const c_char;
|
||||
pub fn TessResultRendererImageNum(renderer: *mut c_void) -> c_int;
|
||||
}
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user