Nomad changes

2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions
--- a/crates/config.m4
+++ b/crates/config.m4
@@ -0,0 +1,28 @@
+dnl Configuration for Rust-based PHP extension via ext-php-rs.
+dnl This file enables phpize to compile the extension using cargo instead of make.
+
+dnl Declares the --enable-kreuzberg configure switch (default: yes).
+PHP_ARG_ENABLE([kreuzberg],
+  [whether to enable the kreuzberg extension],
+  [AS_HELP_STRING([--enable-kreuzberg],
+    [Enable kreuzberg extension support])],
+  [yes])
+
+dnl PHP_ARG_ENABLE stores its result in $PHP_KREUZBERG (there is no
+dnl $PHP_KREUZBERG_ENABLED). The value may be "yes" or "shared", so compare
+dnl against "no" rather than testing for "yes".
+if test "$PHP_KREUZBERG" != "no"; then
+  dnl Check that cargo is available
+  AC_PATH_PROG([CARGO], [cargo], [no])
+  if test "x$CARGO" = "xno"; then
+    AC_MSG_ERROR([cargo is required to build this extension])
+  fi
+
+  dnl Build the Rust extension using cargo
+  AC_MSG_NOTICE([Building Rust extension kreuzberg])
+
+  dnl Set up the extension module
+  PHP_NEW_EXTENSION(kreuzberg, [], $ext_shared)
+
+  dnl Custom build: invoke cargo instead of make
+  PHP_ADD_BUILD_DIR($ext_builddir)
+
+  dnl The actual build is handled by the build.rs script;
+  dnl cargo outputs the .so/.dylib/.dll which phpize will place in extension_dir.
+fi
--- a/crates/kreuzberg-cli/Cargo.toml
+++ b/crates/kreuzberg-cli/Cargo.toml
@@ -0,0 +1,86 @@
+[package]
+name = "kreuzberg-cli"
+version.workspace = true
+edition.workspace = true
+rust-version.workspace = true
+authors.workspace = true
+description = "Command-line interface for Kreuzberg document intelligence"
+license.workspace = true
+repository.workspace = true
+homepage = "https://kreuzberg.dev"
+documentation = "https://docs.kreuzberg.dev"
+keywords = ["document", "extraction", "cli", "tool", "parser"]
+categories = ["command-line-utilities", "text-processing"]
+
+[package.metadata.cargo-machete]
+ignored = ["serde_toon_format"]
+
+[[bin]]
+name = "kreuzberg"
+path = "src/main.rs"
+
+[features]
+# Features enabled unless the crate is built with `--no-default-features`.
+default = [
+    "embeddings",
+    "html",
+    "liter-llm",
+    "ocr",
+    "paddle-ocr",
+    "layout-detection",
+    "chunking-tokenizers",
+    "tree-sitter",
+]
+# Forwards to the `ort-bundled` feature of the `kreuzberg` crate; not part of
+# `default` or `all`, so it must be requested explicitly.
+ort-bundled = ["kreuzberg/ort-bundled"]
+
+ocr = ["kreuzberg/ocr"]
+
+# Each feature below forwards to the same-named feature of the `kreuzberg` crate.
+api = ["kreuzberg/api"]
+mcp = ["kreuzberg/mcp"]
+mcp-http = ["kreuzberg/mcp-http"]
+embeddings = ["kreuzberg/embeddings"]
+paddle-ocr = ["kreuzberg/paddle-ocr"]
+layout-detection = ["kreuzberg/layout-detection"]
+chunking-tokenizers = ["kreuzberg/chunking-tokenizers"]
+html = ["kreuzberg/html"]
+liter-llm = ["kreuzberg/liter-llm"]
+# Additionally pulls in the optional `tree-sitter-language-pack` dependency.
+tree-sitter = ["kreuzberg/tree-sitter", "dep:tree-sitter-language-pack"]
+# `default` plus `api`, `mcp` and `mcp-http`; the remaining entries are already
+# part of `default` and are listed again explicitly.
+all = [
+    "default",
+    "api",
+    "html",
+    "mcp",
+    "mcp-http",
+    "chunking-tokenizers",
+    "tree-sitter",
+    "liter-llm",
+]
+
+[dependencies]
+
+anstyle = "1"
+anyhow = { workspace = true }
+base64 = { workspace = true }
+clap = { workspace = true }
+clap_complete = "4.6"
+kreuzberg = { workspace = true, features = [
+    "formats",
+    "analysis",
+    "tokio-runtime",
+    "simd-utf8",
+    "cli",
+] }
+serde = { workspace = true }
+serde_json = { workspace = true }
+serde_toon_format = { workspace = true }
+tokio = { workspace = true }
+tracing = { workspace = true }
+tracing-subscriber = { version = "0.3", features = ["env-filter", "fmt"] }
+tree-sitter-language-pack = { workspace = true, features = [
+    "dynamic-loading",
+    "download",
+    "serde",
+], optional = true }
+
+[dev-dependencies]
+tempfile = { workspace = true }
+ureq = { version = "3.3", features = ["json"] }
--- a/crates/kreuzberg-cli/README.md
+++ b/crates/kreuzberg-cli/README.md
--- a/crates/kreuzberg-cli/build.rs
+++ b/crates/kreuzberg-cli/build.rs
@@ -0,0 +1,13 @@
+/// Build script for the `kreuzberg-cli` package.
+///
+/// Registers the custom `coverage` cfg with rustc's check-cfg lint and adds
+/// executable-relative runtime library search paths: `@loader_path` when the
+/// target triple contains `darwin`, `$ORIGIN` when it contains `linux`.
+fn main() {
+    // Nothing below depends on the crate sources. Without any `rerun-if-*`
+    // directive Cargo re-runs the script whenever a file in the package changes.
+    println!("cargo::rerun-if-changed=build.rs");
+    println!("cargo::rustc-check-cfg=cfg(coverage)");
+
+    // Cargo always exports TARGET to build scripts, so a missing value means the
+    // script was invoked outside of Cargo.
+    let target = std::env::var("TARGET").expect("Cargo sets TARGET for build scripts");
+
+    if target.contains("darwin") {
+        println!("cargo:rustc-link-arg=-Wl,-rpath,@loader_path");
+        println!("cargo:rustc-link-arg=-Wl,-rpath,@loader_path/.");
+    } else if target.contains("linux") {
+        println!("cargo:rustc-link-arg=-Wl,-rpath,$ORIGIN");
+        println!("cargo:rustc-link-arg=-Wl,-rpath,$ORIGIN/.");
+    }
+}
--- a/crates/kreuzberg-cli/src/commands/cache.rs
+++ b/crates/kreuzberg-cli/src/commands/cache.rs
@@ -0,0 +1,466 @@
+//! Cache command - Manage cache operations
+//!
+//! This module provides commands for cache management including statistics,
+//! clearing, manifest generation, and model warming.
+
+use anyhow::{Context, Result};
+use kreuzberg::cache;
+use serde_json::json;
+use std::path::PathBuf;
+
+use crate::{WireFormat, style};
+
+/// Execute cache stats command.
+///
+/// Reads the metadata of the cache directory through
+/// `cache::get_cache_metadata` and prints it in the requested `format`.
+/// When `cache_dir` is `None`, `.kreuzberg` inside the current working
+/// directory is used.
+///
+/// # Errors
+///
+/// Returns an error if no `cache_dir` was given and the current directory
+/// cannot be determined, if the cache metadata cannot be read, or if the
+/// result cannot be serialized.
+pub fn stats_command(cache_dir: Option<PathBuf>, format: WireFormat) -> Result<()> {
+    // Resolve the working directory only when it is actually needed, so an
+    // explicit `cache_dir` keeps working even if the cwd is unavailable.
+    let cache_path = match cache_dir {
+        Some(dir) => dir,
+        None => std::env::current_dir()
+            .context("Failed to get current directory")?
+            .join(".kreuzberg"),
+    };
+    let cache_dir_str = cache_path.to_string_lossy();
+
+    let stats = cache::get_cache_metadata(&cache_dir_str).with_context(|| {
+        format!(
+            "Failed to get cache statistics from directory '{}'. Ensure the directory exists and is readable.",
+            cache_dir_str
+        )
+    })?;
+
+    // Structured payload shared by the JSON and TOON outputs.
+    let payload = || {
+        json!({
+            "directory": cache_dir_str,
+            "total_files": stats.total_files,
+            "total_size_mb": stats.total_size_mb,
+            "available_space_mb": stats.available_space_mb,
+            "oldest_file_age_days": stats.oldest_file_age_days,
+            "newest_file_age_days": stats.newest_file_age_days,
+        })
+    };
+
+    match format {
+        WireFormat::Text => {
+            println!("{}", style::header("Cache Statistics"));
+            println!("{}", style::dim("================"));
+            println!("{} {}", style::label("Directory:"), style::success(&cache_dir_str));
+            println!("{} {}", style::label("Total files:"), stats.total_files);
+            println!("{} {:.2} MB", style::label("Total size:"), stats.total_size_mb);
+            println!(
+                "{} {:.2} MB",
+                style::label("Available space:"),
+                stats.available_space_mb
+            );
+            println!(
+                "{} {:.2} days",
+                style::label("Oldest file age:"),
+                stats.oldest_file_age_days
+            );
+            println!(
+                "{} {:.2} days",
+                style::label("Newest file age:"),
+                stats.newest_file_age_days
+            );
+        }
+        WireFormat::Json => {
+            println!(
+                "{}",
+                serde_json::to_string_pretty(&payload()).context("Failed to serialize cache statistics to JSON")?
+            );
+        }
+        WireFormat::Toon => {
+            println!(
+                "{}",
+                serde_toon::to_string(&payload()).context("Failed to serialize cache statistics to TOON")?
+            );
+        }
+    }
+
+    Ok(())
+}
+
+/// Execute cache clear command.
+///
+/// Clears the cache directory through `cache::clear_cache_directory` and
+/// prints the number of removed files and the freed space in the requested
+/// `format`. When `cache_dir` is `None`, `.kreuzberg` inside the current
+/// working directory is used.
+///
+/// # Errors
+///
+/// Returns an error if no `cache_dir` was given and the current directory
+/// cannot be determined, if clearing the directory fails, or if the result
+/// cannot be serialized.
+pub fn clear_command(cache_dir: Option<PathBuf>, format: WireFormat) -> Result<()> {
+    // Resolve the working directory only when it is actually needed, so an
+    // explicit `cache_dir` keeps working even if the cwd is unavailable.
+    let cache_path = match cache_dir {
+        Some(dir) => dir,
+        None => std::env::current_dir()
+            .context("Failed to get current directory")?
+            .join(".kreuzberg"),
+    };
+    let cache_dir_str = cache_path.to_string_lossy();
+
+    let (removed_files, freed_mb) = cache::clear_cache_directory(&cache_dir_str).with_context(|| {
+        format!(
+            "Failed to clear cache directory '{}'. Ensure you have write permissions.",
+            cache_dir_str
+        )
+    })?;
+
+    // Structured payload shared by the JSON and TOON outputs.
+    let payload = || {
+        json!({
+            "directory": cache_dir_str,
+            "removed_files": removed_files,
+            "freed_mb": freed_mb,
+        })
+    };
+
+    match format {
+        WireFormat::Text => {
+            println!("{}", style::success("Cache cleared successfully"));
+            println!("{} {}", style::label("Directory:"), style::success(&cache_dir_str));
+            println!("{} {}", style::label("Removed files:"), removed_files);
+            println!("{} {:.2} MB", style::label("Freed space:"), freed_mb);
+        }
+        WireFormat::Json => {
+            println!(
+                "{}",
+                serde_json::to_string_pretty(&payload()).context("Failed to serialize cache clear results to JSON")?
+            );
+        }
+        WireFormat::Toon => {
+            println!(
+                "{}",
+                serde_toon::to_string(&payload()).context("Failed to serialize cache clear results to TOON")?
+            );
+        }
+    }
+
+    Ok(())
+}
+
+/// Execute cache manifest command - outputs expected model files with checksums.
+///
+/// Delegates to `manifest_command_inner` when at least one of the
+/// `paddle-ocr` / `layout-detection` features is enabled.
+///
+/// # Errors
+///
+/// Always returns an error when the crate was built with neither
+/// `paddle-ocr` nor `layout-detection`; otherwise propagates whatever
+/// `manifest_command_inner` returns.
+pub fn manifest_command(format: WireFormat) -> Result<()> {
+    // Without at least one model-providing feature, every `extend` call in
+    // `manifest_command_inner` is `#[cfg]`-stripped and `entries: Vec<_>` has
+    // no anchor for type inference — `e.size_bytes` in its summing closure
+    // then fails compilation with E0282. Bail with a clear error instead so
+    // that a build without those features (or similar minimal
+    // configurations) succeeds.
+    #[cfg(not(any(feature = "paddle-ocr", feature = "layout-detection")))]
+    {
+        // Mark the parameter as used in this configuration.
+        let _ = format;
+        anyhow::bail!(
+            "manifest command unavailable: build kreuzberg-cli with at least one of \
+             --features \"paddle-ocr\" or --features \"layout-detection\""
+        );
+    }
+
+    #[cfg(any(feature = "paddle-ocr", feature = "layout-detection"))]
+    {
+        manifest_command_inner(format)
+    }
+}
+
+/// Gathers the manifests of every compiled-in model manager and prints them
+/// in the requested `format`, together with the crate version, the number of
+/// entries and their summed size.
+#[cfg(any(feature = "paddle-ocr", feature = "layout-detection"))]
+fn manifest_command_inner(format: WireFormat) -> Result<()> {
+    const BYTES_PER_MB: f64 = 1_048_576.0;
+
+    let mut models = Vec::new();
+
+    #[cfg(feature = "paddle-ocr")]
+    models.extend(kreuzberg::paddle_ocr::ModelManager::manifest());
+
+    #[cfg(feature = "layout-detection")]
+    models.extend(kreuzberg::layout::LayoutModelManager::manifest());
+
+    #[cfg(feature = "paddle-ocr")]
+    models.extend(kreuzberg::ocr::TessdataManager::manifest());
+
+    let total_size_bytes: u64 = models.iter().map(|model| model.size_bytes).sum();
+    let version = env!("CARGO_PKG_VERSION");
+
+    // Structured payload shared by the JSON and TOON outputs.
+    let payload = || {
+        json!({
+            "kreuzberg_version": version,
+            "total_size_bytes": total_size_bytes,
+            "model_count": models.len(),
+            "models": models,
+        })
+    };
+
+    match format {
+        WireFormat::Json => {
+            let rendered =
+                serde_json::to_string_pretty(&payload()).context("Failed to serialize manifest to JSON")?;
+            println!("{}", rendered);
+        }
+        WireFormat::Toon => {
+            let rendered = serde_toon::to_string(&payload()).context("Failed to serialize manifest to TOON")?;
+            println!("{}", rendered);
+        }
+        WireFormat::Text => {
+            let version_note = format!("(kreuzberg {})", version);
+            println!("{} {}", style::header("Model Manifest"), style::dim(&version_note));
+            println!("{}", style::dim("===================================="));
+            println!(
+                "{:<50} {:>12} {}",
+                style::label("PATH"),
+                style::label("SIZE"),
+                style::label("SHA256")
+            );
+            let rule = format!("{:<50} {:>12} ------", "----", "----");
+            println!("{}", style::dim(&rule));
+
+            for model in &models {
+                // A zero size is shown as "unknown".
+                let size_str = match model.size_bytes {
+                    0 => "unknown".to_string(),
+                    bytes => format!("{:.1} MB", bytes as f64 / BYTES_PER_MB),
+                };
+                // Show at most the first 12 characters of the checksum, "-" when absent.
+                let sha_display: &str = match model.sha256.len() {
+                    0 => "-",
+                    len if len >= 12 => &model.sha256[..12],
+                    _ => &model.sha256,
+                };
+                println!(
+                    "{:<50} {:>12} {}",
+                    model.relative_path,
+                    size_str,
+                    style::dim(sha_display)
+                );
+            }
+
+            println!();
+            println!(
+                "{} {} files, {:.1} MB",
+                style::label("Total:"),
+                models.len(),
+                total_size_bytes as f64 / BYTES_PER_MB
+            );
+        }
+    }
+
+    Ok(())
+}
+
+/// Execute cache warm command - eagerly downloads all models.
+///
+/// Every step is compiled in only when its Cargo feature is enabled:
+///
+/// - `paddle-ocr`: PaddleOCR models into `<cache>/paddle-ocr` and tessdata
+///   files into `<cache>/tessdata`.
+/// - `layout-detection`: layout models into `<cache>/layout`. With
+///   `all_table_models` the full set is ensured, otherwise only the default
+///   set (rtdetr + tatr).
+/// - `embeddings`: every preset when `all_embeddings` is set, otherwise the
+///   single preset named by `embedding_model`, otherwise none.
+/// - `tree-sitter`: grammars selected by `all_grammars`, else
+///   `grammar_groups`, else `grammars` (first match wins).
+///
+/// `<cache>` is `cache_dir` when given, otherwise the directory chosen by
+/// `resolve_cache_base`. A summary of downloaded and already-cached items is
+/// printed in the requested `format`.
+///
+/// # Errors
+///
+/// Returns an error if any download fails, if `embedding_model` names an
+/// unknown preset, if embedding or grammar warming is requested while the
+/// matching feature is disabled, or if the summary cannot be serialized.
+// One parameter per warm option; kept flat so existing callers stay unchanged.
+#[allow(clippy::too_many_arguments)]
+pub fn warm_command(
+    cache_dir: Option<PathBuf>,
+    format: WireFormat,
+    all_embeddings: bool,
+    embedding_model: Option<String>,
+    all_table_models: bool,
+    all_grammars: bool,
+    grammar_groups: Option<Vec<String>>,
+    grammars: Option<Vec<String>>,
+) -> Result<()> {
+    let cache_base = resolve_cache_base(cache_dir);
+
+    let mut downloaded: Vec<String> = Vec::new();
+    let mut already_cached: Vec<String> = Vec::new();
+
+    #[cfg(feature = "paddle-ocr")]
+    {
+        let paddle_dir = cache_base.join("paddle-ocr");
+        let manager = kreuzberg::paddle_ocr::ModelManager::new(paddle_dir);
+
+        // ensure_all_models downloads v2 det (server+mobile), cls (PP-LCNet),
+        // doc_ori, v2 unified rec models, and all per-script rec families
+        manager
+            .ensure_all_models()
+            .context("Failed to download PaddleOCR v2 models")?;
+        downloaded.push("paddle-ocr v2 (server+mobile det, cls, doc_ori, unified+per-script rec)".to_string());
+    }
+
+    #[cfg(feature = "layout-detection")]
+    {
+        let layout_dir = cache_base.join("layout");
+        let manager = kreuzberg::layout::LayoutModelManager::new(Some(layout_dir));
+
+        if all_table_models {
+            // Download rtdetr + tatr + all SLANeXT variants (~730MB).
+            // The available cache probes only cover rtdetr and tatr, so they
+            // cannot prove that the SLANeXT variants are present. Always run
+            // `ensure_all_models` instead of skipping it when just the two
+            // default models are cached; otherwise the variants would never
+            // be fetched once the default set exists.
+            // NOTE(review): presumably `ensure_all_models` skips files that
+            // are already on disk — confirm.
+            manager
+                .ensure_all_models()
+                .context("Failed to download layout models")?;
+            downloaded.push("layout (rtdetr, tatr, slanet variants)".to_string());
+        } else {
+            // Default: download only rtdetr + tatr
+            let was_cached = manager.is_rtdetr_cached() && manager.is_tatr_cached();
+            if was_cached {
+                already_cached.push("layout (rtdetr, tatr)".to_string());
+            } else {
+                manager
+                    .ensure_default_models()
+                    .context("Failed to download layout models")?;
+                downloaded.push("layout (rtdetr, tatr)".to_string());
+            }
+        }
+    }
+
+    #[cfg(feature = "paddle-ocr")]
+    {
+        let tessdata_dir = cache_base.join("tessdata");
+        let manager = kreuzberg::ocr::TessdataManager::new(Some(tessdata_dir));
+
+        let newly_downloaded = manager
+            .ensure_all_languages()
+            .context("Failed to download tessdata files")?;
+
+        if newly_downloaded > 0 {
+            downloaded.push(format!("tessdata ({newly_downloaded} languages)"));
+        } else {
+            already_cached.push("tessdata (all languages)".to_string());
+        }
+    }
+
+    #[cfg(feature = "embeddings")]
+    {
+        let embeddings_dir = cache_base.join("embeddings");
+        let presets_to_warm: Vec<kreuzberg::EmbeddingPreset> = if all_embeddings {
+            kreuzberg::list_embedding_presets()
+                .into_iter()
+                .filter_map(|name| kreuzberg::get_embedding_preset(&name))
+                .collect()
+        } else if let Some(ref name) = embedding_model {
+            match kreuzberg::get_embedding_preset(name) {
+                Some(preset) => vec![preset],
+                None => {
+                    let available = kreuzberg::list_embedding_presets();
+                    anyhow::bail!(
+                        "Unknown embedding preset '{}'. Available: {}",
+                        name,
+                        available.join(", ")
+                    );
+                }
+            }
+        } else {
+            // No embedding option given: nothing to warm.
+            vec![]
+        };
+
+        for preset in &presets_to_warm {
+            let label = format!("embedding ({})", preset.name);
+            kreuzberg::embeddings::warm_model(
+                &kreuzberg::core::config::EmbeddingModelType::Preset {
+                    name: preset.name.clone(),
+                },
+                Some(embeddings_dir.clone()),
+            )
+            .map_err(|e| anyhow::anyhow!("Failed to download embedding model '{}': {}", preset.name, e))?;
+            downloaded.push(label);
+        }
+    }
+
+    #[cfg(not(feature = "embeddings"))]
+    {
+        if all_embeddings || embedding_model.is_some() {
+            anyhow::bail!("Embedding model warming requires the 'embeddings' feature to be enabled");
+        }
+    }
+
+    // Tree-sitter grammar downloads
+    #[cfg(feature = "tree-sitter")]
+    {
+        if all_grammars {
+            let count =
+                tree_sitter_language_pack::download_all().context("Failed to download all tree-sitter grammars")?;
+            if count > 0 {
+                downloaded.push(format!("tree-sitter grammars ({count} languages)"));
+            } else {
+                already_cached.push("tree-sitter grammars (all)".to_string());
+            }
+        } else if let Some(ref groups) = grammar_groups {
+            let config = tree_sitter_language_pack::PackConfig {
+                cache_dir: None,
+                languages: None,
+                groups: Some(groups.clone()),
+            };
+            tree_sitter_language_pack::init(&config).context("Failed to download tree-sitter grammar groups")?;
+            downloaded.push(format!("tree-sitter grammars (groups: {})", groups.join(", ")));
+        } else if let Some(ref langs) = grammars {
+            let refs: Vec<&str> = langs.iter().map(String::as_str).collect();
+            let count =
+                tree_sitter_language_pack::download(&refs).context("Failed to download tree-sitter grammars")?;
+            if count > 0 {
+                downloaded.push(format!("tree-sitter grammars ({count} languages)"));
+            } else {
+                already_cached.push(format!("tree-sitter grammars ({})", langs.join(", ")));
+            }
+        }
+    }
+
+    #[cfg(not(feature = "tree-sitter"))]
+    {
+        if all_grammars || grammar_groups.is_some() || grammars.is_some() {
+            anyhow::bail!("Tree-sitter grammar warming requires the 'tree-sitter' feature to be enabled");
+        }
+    }
+
+    // Structured payload shared by the JSON and TOON outputs.
+    let payload = || {
+        json!({
+            "cache_dir": cache_base.to_string_lossy(),
+            "downloaded": downloaded,
+            "already_cached": already_cached,
+        })
+    };
+
+    match format {
+        WireFormat::Text => {
+            if !downloaded.is_empty() {
+                println!("{}", style::label("Downloaded:"));
+                for d in &downloaded {
+                    println!("  {}", style::success(d));
+                }
+            }
+            if !already_cached.is_empty() {
+                println!("{}", style::label("Already cached:"));
+                for c in &already_cached {
+                    println!("  {}", style::dim(c));
+                }
+            }
+            println!(
+                "All models ready in {}",
+                style::success(&cache_base.display().to_string())
+            );
+        }
+        WireFormat::Json => {
+            println!(
+                "{}",
+                serde_json::to_string_pretty(&payload()).context("Failed to serialize warm results to JSON")?
+            );
+        }
+        WireFormat::Toon => {
+            println!(
+                "{}",
+                serde_toon::to_string(&payload()).context("Failed to serialize warm results to TOON")?
+            );
+        }
+    }
+
+    Ok(())
+}
+
+/// Resolve the cache base directory.
+///
+/// Precedence:
+/// 1. the explicit `cache_dir`, returned unchanged;
+/// 2. the `KREUZBERG_CACHE_DIR` environment variable, when set and non-empty;
+/// 3. `.kreuzberg` inside the current working directory, or `./.kreuzberg`
+///    when the working directory cannot be determined.
+fn resolve_cache_base(cache_dir: Option<PathBuf>) -> PathBuf {
+    cache_dir
+        .or_else(|| {
+            // `var_os` keeps paths that are not valid UTF-8 usable. An empty
+            // value is treated as unset so it cannot yield an empty base path.
+            std::env::var_os("KREUZBERG_CACHE_DIR")
+                .filter(|value| !value.is_empty())
+                .map(PathBuf::from)
+        })
+        .unwrap_or_else(|| {
+            std::env::current_dir()
+                .unwrap_or_else(|_| PathBuf::from("."))
+                .join(".kreuzberg")
+        })
+}
--- a/crates/kreuzberg-cli/src/commands/chunk.rs
+++ b/crates/kreuzberg-cli/src/commands/chunk.rs
@@ -0,0 +1,61 @@
+//! Chunk command implementation.
+
+use anyhow::{Context, Result};
+
+use crate::{WireFormat, style};
+
+/// Execute the chunk command: split text into chunks.
+///
+/// Splits `text` with `kreuzberg::chunking::chunk_text` using `config` and
+/// prints the chunks in the requested `format`. Text output separates chunks
+/// with a `--- chunk N ---` line when there is more than one chunk.
+///
+/// # Errors
+///
+/// Returns an error if `text` is empty, if chunking fails, or if the result
+/// cannot be serialized.
+pub fn chunk_command(text: String, config: kreuzberg::ChunkingConfig, format: WireFormat) -> Result<()> {
+    if text.is_empty() {
+        anyhow::bail!("No text provided for chunking. Provide --text or pipe text via stdin.");
+    }
+
+    let chunked = kreuzberg::chunking::chunk_text(&text, &config, None).context("Failed to chunk text")?;
+
+    // Structured payload shared by the JSON and TOON outputs.
+    let payload = || {
+        let contents: Vec<&str> = chunked.chunks.iter().map(|chunk| chunk.content.as_str()).collect();
+        serde_json::json!({
+            "chunks": contents,
+            "chunk_count": chunked.chunk_count,
+            "config": {
+                "max_characters": config.max_characters,
+                "overlap": config.overlap,
+                "chunker_type": format!("{:?}", config.chunker_type),
+            },
+            "input_size_bytes": text.len(),
+        })
+    };
+
+    match format {
+        WireFormat::Text => {
+            let show_separators = chunked.chunks.len() > 1;
+            for (index, chunk) in chunked.chunks.iter().enumerate() {
+                if show_separators {
+                    let separator = format!("--- chunk {} ---", index + 1);
+                    println!("{}", style::dim(&separator));
+                }
+                println!("{}", chunk.content);
+            }
+        }
+        WireFormat::Json => {
+            let rendered =
+                serde_json::to_string_pretty(&payload()).context("Failed to serialize chunks to JSON")?;
+            println!("{}", rendered);
+        }
+        WireFormat::Toon => {
+            let rendered = serde_toon::to_string(&payload()).context("Failed to serialize chunks to TOON")?;
+            println!("{}", rendered);
+        }
+    }
+
+    Ok(())
+}
--- a/crates/kreuzberg-cli/src/commands/config.rs
+++ b/crates/kreuzberg-cli/src/commands/config.rs
@@ -0,0 +1,51 @@
+//! Config command - Configuration loading and discovery
+//!
+//! This module provides utilities for loading extraction configuration from files
+//! or discovering them automatically in the project directory.
+
+use anyhow::{Context, Result};
+use kreuzberg::ExtractionConfig;
+use std::path::PathBuf;
+
+/// Loads the extraction configuration for the CLI.
+///
+/// Resolution order:
+/// 1. Explicit config file (when `config_path` is `Some`, i.e. `--config` was given)
+/// 2. Auto-discovered config (searches `kreuzberg.{toml,yaml,json}` in current and parent directories)
+/// 3. Default configuration (when discovery finds nothing)
+///
+/// # Configuration File Formats
+///
+/// The loader for an explicit file is picked from the path's suffix,
+/// compared case-insensitively:
+/// - `.toml`: TOML format (recommended for humans)
+/// - `.yaml` / `.yml`: YAML format
+/// - `.json`: JSON format
+///
+/// # Errors
+///
+/// Returns an error if:
+/// - Explicit config file has unsupported extension (must be .toml, .yaml, .yml, or .json)
+/// - Config file cannot be read or parsed
+/// - Config file contains invalid extraction settings
+/// - Auto-discovery itself fails
+pub fn load_config(config_path: Option<PathBuf>) -> Result<ExtractionConfig> {
+    match config_path {
+        None => ExtractionConfig::discover()
+            .map(|found| found.unwrap_or_else(ExtractionConfig::default))
+            .context("Failed to auto-discover configuration file. Searched for kreuzberg.{toml,yaml,json} in current and parent directories. Use --config to specify an explicit path."),
+        Some(path) => {
+            let lowered = path.to_string_lossy().to_lowercase();
+            let has_suffix = |suffix: &str| lowered.ends_with(suffix);
+
+            let loaded = if has_suffix(".toml") {
+                ExtractionConfig::from_toml_file(&path)
+            } else if has_suffix(".yaml") || has_suffix(".yml") {
+                ExtractionConfig::from_yaml_file(&path)
+            } else if has_suffix(".json") {
+                ExtractionConfig::from_json_file(&path)
+            } else {
+                anyhow::bail!("Config file must have .toml, .yaml, .yml, or .json extension (case-insensitive)");
+            };
+
+            loaded.with_context(|| format!("Failed to load configuration from '{}'. Ensure the file exists, is readable, and contains valid configuration.", path.display()))
+        }
+    }
+}
--- a/crates/kreuzberg-cli/src/commands/embed.rs
+++ b/crates/kreuzberg-cli/src/commands/embed.rs
@@ -0,0 +1,161 @@
+//! Embed command implementation.
+
+use anyhow::{Context, Result};
+
+use crate::{WireFormat, style};
+
+/// Execute the embed command: generate embeddings for input texts.
+///
+/// When `provider` is `"local"` (default, also selected by `""`), uses the ONNX preset model named by `preset`.
+/// When `provider` is `"llm"`, uses liter-llm with the specified model and API key.
+/// When `provider` is `"plugin"`, dispatches to a pre-registered in-process embedding backend.
+///
+/// The embeddings are printed in the requested `format`. Text output writes
+/// one comma-separated line per embedding, preceded by a `# text N` line when
+/// more than one text was given.
+///
+/// # Errors
+///
+/// Returns an error if `texts` is empty or contains an empty string, if the
+/// provider is unknown, if a provider-specific argument is missing or invalid
+/// (`llm_model` for `"llm"`, `plugin_name` for `"plugin"`, an unknown
+/// `preset` for `"local"`), if embedding generation fails, or if the result
+/// cannot be serialized.
+pub fn embed_command(
+    texts: Vec<String>,
+    preset: &str,
+    provider: &str,
+    llm_model: Option<String>,
+    llm_api_key: Option<String>,
+    plugin_name: Option<String>,
+    format: WireFormat,
+) -> Result<()> {
+    if texts.is_empty() {
+        anyhow::bail!("No texts provided for embedding. Provide --text or pipe text via stdin.");
+    }
+
+    // Validate no empty texts
+    for (i, t) in texts.iter().enumerate() {
+        if t.is_empty() {
+            anyhow::bail!("Text at position {} is empty. All texts must be non-empty.", i + 1);
+        }
+    }
+
+    let (config, model_label) = match provider {
+        "llm" => {
+            let model = llm_model.as_deref().ok_or_else(|| {
+                anyhow::anyhow!(
+                    "--model is required when --provider is 'llm' (e.g., --model openai/text-embedding-3-small)"
+                )
+            })?;
+
+            let llm_config = kreuzberg::LlmConfig {
+                model: model.to_string(),
+                api_key: llm_api_key,
+                base_url: None,
+                timeout_secs: None,
+                max_retries: None,
+                temperature: None,
+                max_tokens: None,
+            };
+
+            let config = kreuzberg::EmbeddingConfig {
+                model: kreuzberg::EmbeddingModelType::Llm { llm: llm_config },
+                show_download_progress: true,
+                ..Default::default()
+            };
+
+            (config, model.to_string())
+        }
+        "local" | "" => {
+            // Validate preset for local provider
+            let _preset_info = kreuzberg::get_embedding_preset(preset).with_context(|| {
+                format!(
+                    "Unknown embedding preset '{}'. Available: {:?}",
+                    preset,
+                    kreuzberg::list_embedding_presets()
+                )
+            })?;
+
+            let config = kreuzberg::EmbeddingConfig {
+                model: kreuzberg::EmbeddingModelType::Preset {
+                    name: preset.to_string(),
+                },
+                show_download_progress: true,
+                ..Default::default()
+            };
+
+            (config, preset.to_string())
+        }
+        "plugin" => {
+            let name = plugin_name.as_deref().ok_or_else(|| {
+                anyhow::anyhow!(
+                    "--plugin NAME is required when --provider is 'plugin'. Register a backend via kreuzberg::plugins::register_embedding_backend first."
+                )
+            })?;
+            if name.is_empty() {
+                anyhow::bail!("--plugin NAME must not be empty.");
+            }
+
+            // Pre-flight: surface unknown backends with a list of registered names
+            // (parity with the REST handler, which returns 422 for the same case).
+            let available =
+                kreuzberg::plugins::list_embedding_backends().context("Failed to read embedding backend registry")?;
+            if !available.iter().any(|n| n == name) {
+                anyhow::bail!(
+                    "Embedding backend '{}' is not registered. Available backends: {}",
+                    name,
+                    if available.is_empty() {
+                        "(none registered)".to_string()
+                    } else {
+                        available.join(", ")
+                    }
+                );
+            }
+
+            let config = kreuzberg::EmbeddingConfig {
+                model: kreuzberg::EmbeddingModelType::Plugin { name: name.to_string() },
+                ..Default::default()
+            };
+
+            (config, name.to_string())
+        }
+        other => {
+            anyhow::bail!(
+                "Unknown embedding provider '{}'. Valid providers: 'local' (default, ONNX), 'llm' (liter-llm), or 'plugin' (in-process backend).",
+                other
+            );
+        }
+    };
+
+    // Only the input count is needed after this point; record it so `texts`
+    // can be moved into `embed_texts` instead of being cloned.
+    let label_each_text = texts.len() > 1;
+
+    // Generate embeddings
+    let embeddings = kreuzberg::embed_texts(texts, &config).context("Failed to generate embeddings")?;
+
+    // Dimension of the first embedding, 0 when nothing was returned.
+    let dimensions = embeddings.first().map(|e| e.len()).unwrap_or(0);
+
+    // Structured payload shared by the JSON and TOON outputs.
+    let payload = || {
+        serde_json::json!({
+            "embeddings": embeddings,
+            "model": model_label,
+            "dimensions": dimensions,
+            "count": embeddings.len(),
+        })
+    };
+
+    match format {
+        WireFormat::Json => {
+            println!(
+                "{}",
+                serde_json::to_string_pretty(&payload()).context("Failed to serialize embeddings to JSON")?
+            );
+        }
+        WireFormat::Toon => {
+            println!(
+                "{}",
+                serde_toon::to_string(&payload()).context("Failed to serialize embeddings to TOON")?
+            );
+        }
+        WireFormat::Text => {
+            for (i, embedding) in embeddings.iter().enumerate() {
+                if label_each_text {
+                    println!("{}", style::dim(&format!("# text {}", i + 1)));
+                }
+                let values: Vec<String> = embedding.iter().map(|v| format!("{v}")).collect();
+                println!("{}", values.join(","));
+            }
+        }
+    }
+
+    Ok(())
+}
--- a/crates/kreuzberg-cli/src/commands/extract.rs
+++ b/crates/kreuzberg-cli/src/commands/extract.rs
@@ -0,0 +1,180 @@
+//! Extract command - Extract text and data from documents
+//!
+//! This module provides the extract and batch extract commands for processing single
+//! or multiple documents with customizable extraction configurations.
+
+use anyhow::{Context, Result};
+use kreuzberg::{
+    BatchFileItem, ExtractionConfig, ExtractionResult, FileExtractionConfig, batch_extract_files_sync,
+    extract_file_sync,
+};
+use std::path::PathBuf;
+use std::time::Instant;
+
+use crate::{
+    WireFormat,
+    output::{BatchEnvelope, ExtractEnvelope},
+    style,
+};
+
+/// Extract a single document and print the result in the requested wire format.
+///
+/// The wall-clock duration of the extraction call is measured in milliseconds.
+/// Only the JSON output carries it (wrapped in an `ExtractEnvelope`); text output
+/// prints the extracted content verbatim and TOON output serializes the bare result.
+///
+/// # Errors
+///
+/// Returns an error when extraction of `path` fails or when the result cannot be
+/// serialized to the requested format.
+pub fn extract_command(
+    path: PathBuf,
+    config: ExtractionConfig,
+    mime_type: Option<String>,
+    format: WireFormat,
+) -> Result<()> {
+    let source = path.to_string_lossy().to_string();
+
+    let started = Instant::now();
+    let extraction = extract_file_sync(&source, mime_type.as_deref(), &config);
+    let result = extraction.with_context(|| {
+        format!(
+            "Failed to extract file '{}'. Ensure the file is readable and the format is supported.",
+            path.display()
+        )
+    })?;
+    let extraction_time_ms = started.elapsed().as_secs_f64() * 1000.0;
+
+    match format {
+        // Content is written as-is, without a trailing newline.
+        WireFormat::Text => print!("{}", result.content),
+        WireFormat::Json => {
+            let envelope = ExtractEnvelope {
+                result,
+                extraction_time_ms,
+            };
+            let rendered =
+                serde_json::to_string_pretty(&envelope).context("Failed to serialize extraction result to JSON")?;
+            println!("{rendered}");
+        }
+        WireFormat::Toon => {
+            let rendered =
+                serde_toon::to_string(&result).context("Failed to serialize extraction result to TOON")?;
+            println!("{rendered}");
+        }
+    }
+
+    Ok(())
+}
+
+/// Execute batch extraction command with optional per-file configuration overrides.
+///
+/// * `paths` - documents to extract, processed in the given order.
+/// * `file_configs_map` - optional per-file overrides keyed by the (lossy) string form
+///   of each path; each value is parsed as a `FileExtractionConfig`.
+/// * `config` - batch-level extraction configuration.
+/// * `format` - output format. JSON output runs files one at a time so that per-file
+///   timings can be reported; text and TOON output use the batch API in one call.
+///
+/// # Errors
+///
+/// Returns an error when a per-file override cannot be parsed, when extraction fails,
+/// when the one-item batch used for an overridden file yields no result, or when the
+/// results cannot be serialized.
+pub fn batch_command(
+    paths: Vec<PathBuf>,
+    file_configs_map: Option<std::collections::HashMap<String, serde_json::Value>>,
+    config: ExtractionConfig,
+    format: WireFormat,
+) -> Result<()> {
+    match format {
+        WireFormat::Json => {
+            // Run files one at a time to capture per-file wall-clock timings.
+            // Per-file config overrides are honoured: files without an override use the
+            // batch-level config directly; files with an override use a one-shot batch of
+            // one item so the library's own merge logic applies.
+            let mut results: Vec<ExtractionResult> = Vec::with_capacity(paths.len());
+            let mut per_file_ms: Vec<f64> = Vec::with_capacity(paths.len());
+            let total_t0 = Instant::now();
+
+            for path in &paths {
+                let path_str = path.to_string_lossy().to_string();
+                // Look the override up once; `Some` means this file has its own config.
+                let override_value = file_configs_map.as_ref().and_then(|m| m.get(&path_str));
+
+                let t0 = Instant::now();
+                let result = match override_value {
+                    Some(value) => {
+                        // Delegate to the batch API (one item) so per-file merge logic is applied.
+                        let file_config = serde_json::from_value::<FileExtractionConfig>(value.clone())
+                            .with_context(|| format!("Failed to parse file config for '{}'", path_str))?;
+                        batch_extract_files_sync(
+                            vec![BatchFileItem {
+                                path: path.clone(),
+                                config: Some(file_config),
+                            }],
+                            &config,
+                        )
+                        .with_context(|| {
+                            format!(
+                                "Failed to extract file '{}'. Ensure the file is readable and the format is supported.",
+                                path.display()
+                            )
+                        })?
+                        .into_iter()
+                        .next()
+                        // An empty result vector is reported as an error instead of
+                        // panicking on an out-of-bounds `remove(0)`.
+                        .with_context(|| format!("Batch extraction returned no result for '{}'.", path.display()))?
+                    }
+                    None => extract_file_sync(&path_str, None, &config).with_context(|| {
+                        format!(
+                            "Failed to extract file '{}'. Ensure the file is readable and the format is supported.",
+                            path.display()
+                        )
+                    })?,
+                };
+                per_file_ms.push(t0.elapsed().as_secs_f64() * 1000.0);
+                results.push(result);
+            }
+
+            let total_ms = total_t0.elapsed().as_secs_f64() * 1000.0;
+            let envelope = BatchEnvelope {
+                results,
+                total_ms,
+                per_file_ms,
+            };
+            println!(
+                "{}",
+                serde_json::to_string_pretty(&envelope)
+                    .context("Failed to serialize batch extraction results to JSON")?
+            );
+        }
+        WireFormat::Text => {
+            let results = run_batch_sync(&paths, file_configs_map.as_ref(), &config)?;
+            for (i, result) in results.iter().enumerate() {
+                println!("{}", style::header(&format!("=== Document {} ===", i + 1)));
+                println!("{} {}", style::label("MIME Type:"), style::success(&result.mime_type));
+                println!("{}\n{}", style::label("Content:"), result.content);
+                println!();
+            }
+        }
+        WireFormat::Toon => {
+            let results = run_batch_sync(&paths, file_configs_map.as_ref(), &config)?;
+            println!(
+                "{}",
+                serde_toon::to_string(&results).context("Failed to serialize batch extraction results to TOON")?
+            );
+        }
+    }
+
+    Ok(())
+}
+
+/// Extract every path in one call to the synchronous batch API.
+///
+/// Each path is paired with its optional override from `file_configs_map`, looked up
+/// by the lossy string form of the path and parsed as a `FileExtractionConfig`.
+/// Used by the output formats that do not report per-file timings.
+///
+/// # Errors
+///
+/// Returns an error on the first override that fails to parse, or when the batch
+/// extraction itself fails.
+fn run_batch_sync(
+    paths: &[PathBuf],
+    file_configs_map: Option<&std::collections::HashMap<String, serde_json::Value>>,
+    config: &ExtractionConfig,
+) -> Result<Vec<ExtractionResult>> {
+    let mut items: Vec<BatchFileItem> = Vec::with_capacity(paths.len());
+
+    for path in paths {
+        let key = path.to_string_lossy().to_string();
+        let file_config = match file_configs_map.and_then(|overrides| overrides.get(&key)) {
+            Some(raw) => Some(
+                serde_json::from_value::<FileExtractionConfig>(raw.clone())
+                    .with_context(|| format!("Failed to parse file config for '{}'", key))?,
+            ),
+            None => None,
+        };
+        items.push(BatchFileItem {
+            path: path.clone(),
+            config: file_config,
+        });
+    }
+
+    batch_extract_files_sync(items, config)
+        .context("Failed to batch extract documents. Check that all files are readable and formats are supported.")
+}
--- a/crates/kreuzberg-cli/src/commands/extract_structured.rs
+++ b/crates/kreuzberg-cli/src/commands/extract_structured.rs
@@ -0,0 +1,116 @@
+//! Extract structured command - Extract structured data from documents using an LLM.
+//!
+//! Reads a JSON schema file, configures LLM-based structured extraction, and
+//! outputs the structured result parsed from the document.
+
+use anyhow::{Context, Result};
+use kreuzberg::{LlmConfig, StructuredExtractionConfig, extract_file_sync};
+use std::path::PathBuf;
+
+use crate::WireFormat;
+
+/// Arguments for the extract-structured command.
+///
+/// Bundles everything `extract_structured_command` needs into a single value.
+pub struct ExtractStructuredArgs {
+    /// Document to extract structured data from.
+    pub path: PathBuf,
+    /// File containing the JSON schema; it is read and parsed as JSON.
+    pub schema_path: PathBuf,
+    /// Model identifier forwarded to the LLM configuration.
+    pub model: String,
+    /// Optional API key forwarded to the LLM configuration.
+    pub api_key: Option<String>,
+    /// Optional prompt forwarded to the structured extraction configuration.
+    pub prompt: Option<String>,
+    /// Schema name; the command falls back to `"extraction"` when this is `None`.
+    pub schema_name: Option<String>,
+    /// Value of the `strict` flag in the structured extraction configuration.
+    pub strict: bool,
+    /// Optional configuration file path handed to `load_config`.
+    pub config_path: Option<PathBuf>,
+    /// Output format used to print the structured result.
+    pub format: WireFormat,
+}
+
+/// Run LLM-backed structured extraction on one document and print the outcome.
+///
+/// Steps, in order: read and parse the JSON schema at `schema_path`, load the base
+/// `ExtractionConfig`, attach a `structured_extraction` section built from the
+/// arguments, extract the document, and print the `structured_output` field of the
+/// result in the requested format (text mode prints pretty JSON, like JSON mode).
+///
+/// # Errors
+///
+/// Returns an error when the schema cannot be read or parsed, when the configuration
+/// cannot be loaded, when extraction fails, when the result carries no structured
+/// output, or when the output cannot be serialized.
+pub fn extract_structured_command(args: ExtractStructuredArgs) -> Result<()> {
+    let ExtractStructuredArgs {
+        path,
+        schema_path,
+        model,
+        api_key,
+        prompt,
+        schema_name,
+        strict,
+        config_path,
+        format,
+    } = args;
+
+    // Step 1: load the schema text, then parse it as JSON.
+    let raw_schema = std::fs::read_to_string(&schema_path).with_context(|| {
+        format!(
+            "Failed to read JSON schema file '{}'. Ensure the file exists and is readable.",
+            schema_path.display()
+        )
+    })?;
+    let schema: serde_json::Value = serde_json::from_str(&raw_schema).with_context(|| {
+        format!(
+            "Failed to parse JSON schema from '{}'. Ensure the file contains valid JSON.",
+            schema_path.display()
+        )
+    })?;
+
+    // Step 2: start from the loaded config and attach the structured extraction section.
+    let mut config = super::load_config(config_path)?;
+    let structured_config = StructuredExtractionConfig {
+        schema,
+        schema_name: schema_name.unwrap_or_else(|| "extraction".to_string()),
+        schema_description: None,
+        strict,
+        prompt,
+        llm: LlmConfig {
+            model,
+            api_key,
+            base_url: None,
+            timeout_secs: None,
+            max_retries: None,
+            temperature: None,
+            max_tokens: None,
+        },
+    };
+    config.structured_extraction = Some(structured_config);
+
+    // Step 3: run the extraction.
+    let document = path.to_string_lossy().to_string();
+    let result = extract_file_sync(&document, None, &config).with_context(|| {
+        format!(
+            "Failed to extract structured data from '{}'. Ensure the file is readable and the LLM configuration is correct.",
+            path.display()
+        )
+    })?;
+
+    // Step 4: a missing structured output is reported as an error.
+    let structured = result.structured_output.context(
+        "Structured extraction completed but returned no structured output. \
+         This may indicate the LLM failed to produce valid structured data matching the schema.",
+    )?;
+
+    match format {
+        WireFormat::Json => {
+            let rendered =
+                serde_json::to_string_pretty(&structured).context("Failed to serialize structured output to JSON")?;
+            println!("{rendered}");
+        }
+        WireFormat::Toon => {
+            let rendered =
+                serde_toon::to_string(&structured).context("Failed to serialize structured output to TOON")?;
+            println!("{rendered}");
+        }
+        WireFormat::Text => {
+            // Text mode has no dedicated rendering: it pretty-prints the JSON value.
+            let rendered =
+                serde_json::to_string_pretty(&structured).context("Failed to serialize structured output to text")?;
+            println!("{rendered}");
+        }
+    }
+
+    Ok(())
+}
--- a/crates/kreuzberg-cli/src/commands/mod.rs
+++ b/crates/kreuzberg-cli/src/commands/mod.rs
@@ -0,0 +1,48 @@
+//! Command modules for Kreuzberg CLI
+//!
+//! This module organizes the CLI commands into focused submodules:
+//! - `extract` - Document extraction commands
+//! - `extract_structured` - LLM-based structured data extraction command
+//! - `cache` - Cache management operations
+//! - `server` - API and MCP server commands
+//! - `config` - Configuration loading and discovery
+//! - `embed` - Embedding generation commands
+//! - `chunk` - Text chunking commands
+
+use anyhow::{Context, Result};
+use std::io::Read;
+
+pub mod cache;
+pub mod chunk;
+pub mod config;
+#[cfg(feature = "embeddings")]
+pub mod embed;
+pub mod extract;
+pub mod extract_structured;
+pub mod overrides;
+#[cfg(any(feature = "api", feature = "mcp"))]
+pub mod server;
+
+// Re-export command functions for convenience
+pub use cache::{clear_command, manifest_command, stats_command, warm_command};
+pub use chunk::chunk_command;
+pub use config::load_config;
+#[cfg(feature = "embeddings")]
+pub use embed::embed_command;
+pub use extract::{batch_command, extract_command};
+#[cfg(feature = "mcp")]
+pub use server::mcp_command;
+#[cfg(feature = "api")]
+pub use server::serve_command;
+
+/// Read all of stdin and return it with surrounding whitespace removed.
+///
+/// # Errors
+///
+/// Returns an error when stdin cannot be read, or when the input is empty after
+/// trimming.
+pub fn read_stdin() -> Result<String> {
+    let mut buffer = String::new();
+    std::io::stdin()
+        .read_to_string(&mut buffer)
+        .context("Failed to read from stdin")?;
+
+    match buffer.trim() {
+        "" => anyhow::bail!("No input received from stdin. Provide text via --text or pipe it to stdin."),
+        text => Ok(text.to_string()),
+    }
+}
--- a/crates/kreuzberg-cli/src/commands/overrides.rs
+++ b/crates/kreuzberg-cli/src/commands/overrides.rs
--- a/crates/kreuzberg-cli/src/commands/server.rs
+++ b/crates/kreuzberg-cli/src/commands/server.rs
@@ -0,0 +1,104 @@
+//! Server command - Start API and MCP servers
+//!
+//! This module provides commands for starting the Kreuzberg API server
+//! and the MCP (Model Context Protocol) server.
+
+use anyhow::Result;
+
+/// Start the Kreuzberg API server and block until it stops.
+///
+/// The server configuration is resolved in layers, later ones winning: the file at
+/// `config_path` (or `ServerConfig::default()` when no path is given), then
+/// environment variable overrides, then the `cli_host` / `cli_port` arguments.
+///
+/// # Errors
+///
+/// Returns an error when the configuration file cannot be loaded, when applying the
+/// environment overrides fails, when the Tokio runtime cannot be created, or when the
+/// server itself returns an error.
+#[cfg(feature = "api")]
+pub fn serve_command(
+    cli_host: Option<String>,
+    cli_port: Option<u16>,
+    extraction_config: kreuzberg::ExtractionConfig,
+    config_path: Option<std::path::PathBuf>,
+) -> Result<()> {
+    use anyhow::Context;
+    use kreuzberg::ServerConfig;
+
+    // Layer 1: config file when given, defaults otherwise.
+    let mut server_config = match config_path.as_ref() {
+        Some(path) => ServerConfig::from_file(path).with_context(|| {
+            format!(
+                "Failed to load server configuration from '{}'. \
+                 Ensure the file contains valid server settings under [server] section or at root level.",
+                path.display()
+            )
+        })?,
+        None => ServerConfig::default(),
+    };
+
+    // Layer 2: environment variables take precedence over the config file.
+    server_config.apply_env_overrides()?;
+
+    // Layer 3: explicit CLI arguments take precedence over everything else.
+    if let Some(host) = cli_host {
+        server_config.host = host;
+    }
+    if let Some(port) = cli_port {
+        server_config.port = port;
+    }
+
+    // Log the resolved listen address.
+    tracing::info!(
+        "Starting Kreuzberg API server on http://{}",
+        server_config.listen_addr()
+    );
+
+    let runtime = tokio::runtime::Runtime::new()?;
+    // A clone is handed to the server so the original stays available for the error message.
+    let outcome = runtime.block_on(kreuzberg::api::serve_with_server_config(
+        extraction_config,
+        server_config.clone(),
+    ));
+    outcome.with_context(|| {
+        format!(
+            "Failed to start API server on {}. Ensure the port is not already in use and you have permission to bind to this address.",
+            server_config.listen_addr()
+        )
+    })?;
+
+    Ok(())
+}
+
+/// Execute MCP server command
+///
+/// Starts the MCP server on the transport named by `transport` (matched
+/// case-insensitively): `"stdio"` or `"http"`. Any other value is an error.
+///
+/// The host and port parameters are only used when the `mcp-http` feature is
+/// enabled; without it they are accepted under underscore-prefixed names and the
+/// `"http"` transport fails with a message asking for a rebuild with that feature.
+///
+/// # Errors
+///
+/// Returns an error when the Tokio runtime cannot be created, when the selected
+/// server fails, when `"http"` is requested without the `mcp-http` feature, or when
+/// the transport name is unknown.
+#[cfg(feature = "mcp")]
+pub fn mcp_command(
+    config: kreuzberg::ExtractionConfig,
+    transport: String,
+    #[cfg(feature = "mcp-http")] host: String,
+    #[cfg(feature = "mcp-http")] port: u16,
+    #[cfg(not(feature = "mcp-http"))] _host: String,
+    #[cfg(not(feature = "mcp-http"))] _port: u16,
+) -> Result<()> {
+    tracing::debug!("Starting Kreuzberg MCP server with transport: {}", transport);
+    let rt = tokio::runtime::Runtime::new()?;
+
+    // Transport names are compared in lowercase, so "STDIO" and "Http" are accepted.
+    match transport.to_lowercase().as_str() {
+        "stdio" => {
+            rt.block_on(kreuzberg::mcp::start_mcp_server_with_config(config))
+                .map_err(|e| anyhow::anyhow!("Failed to start MCP server: {}", e))?;
+        }
+        "http" => {
+            // Exactly one of the two cfg-gated blocks below is compiled in.
+            #[cfg(not(feature = "mcp-http"))]
+            {
+                anyhow::bail!(
+                    "HTTP transport requires 'mcp-http' feature. \
+                     Rebuild with: cargo build --features mcp-http"
+                );
+            }
+
+            #[cfg(feature = "mcp-http")]
+            {
+                tracing::debug!("Starting MCP server on http://{}:{}", host, port);
+                rt.block_on(kreuzberg::mcp::start_mcp_server_http_with_config(&host, port, config))
+                    .map_err(|e| anyhow::anyhow!("Failed to start MCP server on {}:{}: {}", host, port, e))?;
+            }
+        }
+        other => {
+            anyhow::bail!("Unknown transport '{}'. Use 'stdio' or 'http'", other);
+        }
+    }
+
+    Ok(())
+}
--- a/crates/kreuzberg-cli/src/commands/tree_sitter.rs
+++ b/crates/kreuzberg-cli/src/commands/tree_sitter.rs
@@ -0,0 +1,230 @@
+//! Tree-sitter grammar management commands.
+//!
+//! This module provides commands for downloading, listing, and managing
+//! tree-sitter grammar parsers via the tree-sitter-language-pack crate.
+
+use anyhow::{Context, Result};
+use serde_json::json;
+use std::path::PathBuf;
+
+use crate::{WireFormat, style};
+
+/// Execute the tree-sitter download command.
+///
+/// Downloads tree-sitter grammar parsers based on the provided arguments. Exactly one
+/// selection is acted on, checked in this order:
+/// 1. All available languages (`all`)
+/// 2. Language groups (`groups`)
+/// 3. Specific languages by name (`languages`)
+///
+/// When `cache_dir` is given it is applied before any download starts.
+///
+/// The group download does not report how many grammars were fetched, so the text
+/// output omits the "Newly downloaded" line for it. The JSON and TOON outputs keep
+/// their `newly_downloaded` field and report `0` in that case.
+///
+/// # Errors
+///
+/// Returns an error when the cache directory cannot be configured, when a download
+/// fails, when no selection was given at all, or when the summary cannot be
+/// serialized.
+pub fn download_command(
+    languages: Vec<String>,
+    all: bool,
+    groups: Option<Vec<String>>,
+    cache_dir: Option<PathBuf>,
+    format: WireFormat,
+) -> Result<()> {
+    // Apply custom cache directory if provided
+    if let Some(ref dir) = cache_dir {
+        let config = tree_sitter_language_pack::PackConfig {
+            cache_dir: Some(dir.clone()),
+            languages: None,
+            groups: None,
+        };
+        tree_sitter_language_pack::configure(&config).context("Failed to configure custom cache directory")?;
+    }
+
+    // `count` is `None` when the operation that ran does not report a count. Tracking
+    // this per branch keeps the text output correct even when both `groups` and
+    // positional languages were supplied (the groups branch wins and has no count).
+    let (count, description): (Option<usize>, String) = if all {
+        let downloaded =
+            tree_sitter_language_pack::download_all().context("Failed to download all tree-sitter grammars")?;
+        (Some(downloaded), "all available languages".to_string())
+    } else if let Some(ref group_list) = groups {
+        let config = tree_sitter_language_pack::PackConfig {
+            cache_dir: cache_dir.clone(),
+            languages: None,
+            groups: Some(group_list.clone()),
+        };
+        tree_sitter_language_pack::init(&config).context("Failed to download tree-sitter grammar groups")?;
+        // init does not return a count
+        (None, format!("groups: {}", group_list.join(", ")))
+    } else if !languages.is_empty() {
+        let refs: Vec<&str> = languages.iter().map(String::as_str).collect();
+        let downloaded =
+            tree_sitter_language_pack::download(&refs).context("Failed to download tree-sitter grammars")?;
+        (Some(downloaded), format!("languages: {}", languages.join(", ")))
+    } else {
+        anyhow::bail!("No languages specified. Use language names, --all, --groups, or --from-config.");
+    };
+
+    // Shared payload for the structured formats; the field set is unchanged.
+    let structured_output = || {
+        let mut output = json!({
+            "requested": description,
+            "newly_downloaded": count.unwrap_or(0),
+        });
+        if let Some(ref dir) = cache_dir {
+            output["cache_dir"] = json!(dir.to_string_lossy());
+        }
+        output
+    };
+
+    match format {
+        WireFormat::Text => {
+            println!("{}", style::header("Tree-sitter Download"));
+            println!("{}", style::dim("===================="));
+            println!("{} {}", style::label("Requested:"), description);
+            if let Some(downloaded) = count {
+                println!(
+                    "{} {}",
+                    style::label("Newly downloaded:"),
+                    style::success(&downloaded.to_string())
+                );
+            }
+            if let Some(ref dir) = cache_dir {
+                println!(
+                    "{} {}",
+                    style::label("Cache directory:"),
+                    style::success(&dir.display().to_string())
+                );
+            }
+            println!("{}", style::success("Done"));
+        }
+        WireFormat::Json => {
+            let output = structured_output();
+            println!(
+                "{}",
+                serde_json::to_string_pretty(&output).context("Failed to serialize download results to JSON")?
+            );
+        }
+        WireFormat::Toon => {
+            let output = structured_output();
+            println!(
+                "{}",
+                serde_toon::to_string(&output).context("Failed to serialize download results to TOON")?
+            );
+        }
+    }
+
+    Ok(())
+}
+
+/// Print the tree-sitter languages that are downloaded or available.
+///
+/// With `downloaded_only` the list comes from the downloaded languages, otherwise
+/// from the language manifest. When `filter` is set, only languages whose name
+/// contains it (compared in lowercase) are kept.
+///
+/// # Errors
+///
+/// Returns an error when the manifest cannot be fetched or when the list cannot be
+/// serialized to the requested format.
+pub fn list_command(downloaded_only: bool, filter: Option<String>, format: WireFormat) -> Result<()> {
+    let (languages, source) = if downloaded_only {
+        (tree_sitter_language_pack::downloaded_languages(), "downloaded")
+    } else {
+        (
+            tree_sitter_language_pack::manifest_languages().context("Failed to fetch tree-sitter language manifest")?,
+            "available",
+        )
+    };
+
+    // Lowercase the filter once, then keep every language that contains it.
+    let needle = filter.as_ref().map(|f| f.to_lowercase());
+    let filtered: Vec<&String> = languages
+        .iter()
+        .filter(|lang| match &needle {
+            Some(needle) => lang.to_lowercase().contains(needle),
+            None => true,
+        })
+        .collect();
+
+    // Shared payload for the structured formats.
+    let payload = || {
+        json!({
+            "source": source,
+            "count": filtered.len(),
+            "filter": filter,
+            "languages": filtered,
+        })
+    };
+
+    match format {
+        WireFormat::Text => {
+            let filter_note = match &filter {
+                Some(f) => format!(", filter: '{f}'"),
+                None => String::new(),
+            };
+            println!(
+                "{} ({} {}{})",
+                style::header("Tree-sitter Languages"),
+                filtered.len(),
+                source,
+                filter_note
+            );
+            println!("{}", style::dim("====================="));
+            for lang in &filtered {
+                println!("  {}", style::success(lang));
+            }
+        }
+        WireFormat::Json => {
+            let rendered =
+                serde_json::to_string_pretty(&payload()).context("Failed to serialize language list to JSON")?;
+            println!("{rendered}");
+        }
+        WireFormat::Toon => {
+            let rendered = serde_toon::to_string(&payload()).context("Failed to serialize language list to TOON")?;
+            println!("{rendered}");
+        }
+    }
+
+    Ok(())
+}
+
+/// Print the cache directory used for tree-sitter grammar parsers.
+///
+/// # Errors
+///
+/// Returns an error when the cache directory cannot be determined or when it cannot
+/// be serialized to the requested format.
+pub fn cache_dir_command(format: WireFormat) -> Result<()> {
+    let cache_path =
+        tree_sitter_language_pack::cache_dir().context("Failed to determine tree-sitter cache directory")?;
+    let shown = cache_path.to_string_lossy();
+
+    match format {
+        WireFormat::Text => println!("{} {}", style::label("Cache directory:"), style::success(&shown)),
+        WireFormat::Json => {
+            let payload = json!({ "cache_dir": shown });
+            let rendered =
+                serde_json::to_string_pretty(&payload).context("Failed to serialize cache directory to JSON")?;
+            println!("{rendered}");
+        }
+        WireFormat::Toon => {
+            let payload = json!({ "cache_dir": shown });
+            let rendered = serde_toon::to_string(&payload).context("Failed to serialize cache directory to TOON")?;
+            println!("{rendered}");
+        }
+    }
+
+    Ok(())
+}
+
+/// Clear the tree-sitter grammar cache and report the outcome.
+///
+/// # Errors
+///
+/// Returns an error when the cache cannot be cleaned or when the confirmation cannot
+/// be serialized to the requested format.
+pub fn clean_command(format: WireFormat) -> Result<()> {
+    tree_sitter_language_pack::clean_cache().context("Failed to clean tree-sitter cache")?;
+
+    match format {
+        WireFormat::Text => println!("{}", style::success("Tree-sitter cache cleared successfully")),
+        WireFormat::Json => {
+            let payload = json!({ "status": "cleared" });
+            let rendered =
+                serde_json::to_string_pretty(&payload).context("Failed to serialize clean result to JSON")?;
+            println!("{rendered}");
+        }
+        WireFormat::Toon => {
+            let payload = json!({ "status": "cleared" });
+            let rendered = serde_toon::to_string(&payload).context("Failed to serialize clean result to TOON")?;
+            println!("{rendered}");
+        }
+    }
+
+    Ok(())
+}
--- a/crates/kreuzberg-cli/src/logging.rs
+++ b/crates/kreuzberg-cli/src/logging.rs
@@ -0,0 +1,238 @@
+//! Logging helpers for the Kreuzberg CLI.
+//!
+//! Provides a [`build_env_filter`] function that layers default third-party
+//! transport suppressions on top of whatever the caller or `RUST_LOG` specifies.
+//! User-supplied per-target rules in `RUST_LOG` always win because
+//! [`build_env_filter`] skips a default suppression whenever the user has already
+//! configured a rule for the same target.
+
+use tracing_subscriber::EnvFilter;
+
+/// Third-party crates that are noisy at their own default level.
+///
+/// These are added as *fallback* directives: if `RUST_LOG` or `level_override`
+/// already contain a per-target rule for any of these crates it takes precedence,
+/// so the user can still do `RUST_LOG=ureq=debug` to restore full transport logs.
+///
+/// Every entry must use the `target=level` form: the target before `=` is what
+/// `build_env_filter` compares against the user's existing rules.
+const QUIET_DIRECTIVES: &[&str] = &[
+    // Capped at `warn`.
+    "ureq=warn",
+    "ureq_proto=warn",
+    "rustls=warn",
+    "hyper_util=warn",
+    // Capped at `info`.
+    "hf_hub=info",
+    "tower_http=info",
+];
+
+/// Return the target portion of a `target=level` directive such as `"ureq=warn"`.
+///
+/// The target is everything before the first `=`. A directive without any `=`
+/// (for example a bare level like `"info"`) yields `None`.
+fn directive_target(directive: &str) -> Option<&str> {
+    let separator = directive.find('=')?;
+    Some(&directive[..separator])
+}
+
+/// Build an [`EnvFilter`] in which noisy third-party crates are quietened by default.
+///
+/// The base filter is chosen from the first source that yields a valid filter:
+/// 1. `level_override`, when it is `Some` and parses.
+/// 2. The default environment variable (`RUST_LOG`).
+/// 3. The literal `"info"`.
+///
+/// Each entry of [`QUIET_DIRECTIVES`] is then added on top, except those whose target
+/// already has a rule in the base filter. Skipping is what lets a user rule such as
+/// `ureq=debug` survive: adding the quiet directive for the same target afterwards
+/// would take precedence over it.
+///
+/// # Arguments
+///
+/// * `level_override` — filter string from a CLI flag (e.g. `"debug"`). A value that
+///   fails to parse is ignored rather than causing a panic.
+///
+/// # Panics
+///
+/// Panics only if a built-in entry of [`QUIET_DIRECTIVES`] is not a valid directive.
+pub fn build_env_filter(level_override: Option<&str>) -> EnvFilter {
+    // `try_new` keeps a malformed --log-level from panicking the CLI; the next
+    // source in the chain is used instead.
+    let base = match level_override.and_then(|level| EnvFilter::try_new(level).ok()) {
+        Some(filter) => filter,
+        None => EnvFilter::try_from_default_env().unwrap_or_else(|_| EnvFilter::new("info")),
+    };
+
+    // Targets the base filter already has a rule for, taken from its
+    // comma-separated string form.
+    let rendered = base.to_string();
+    let existing_targets: std::collections::HashSet<String> = rendered
+        .split(',')
+        .filter_map(directive_target)
+        .map(|target| target.trim().to_string())
+        .collect();
+
+    let mut filter = base;
+    for directive in QUIET_DIRECTIVES {
+        // Exact-name lookup in the target set, so `hf_hub` is not confused with
+        // `hf_hub_server`.
+        let user_configured = directive_target(directive)
+            .map(|target| existing_targets.contains(target))
+            .unwrap_or(false);
+        if user_configured {
+            continue;
+        }
+        filter = filter.add_directive(directive.parse().expect("built-in logging directive must be valid"));
+    }
+    filter
+}
+
+// Unit tests for `build_env_filter`, asserting on the filter's string form.
+//
+// NOTE(review): tests that call `build_env_filter(None)` read the ambient `RUST_LOG`
+// through `EnvFilter::try_from_default_env`, so their outcome depends on the
+// environment the test run inherits.
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// Parse the directive string from an EnvFilter for assertion-level checks.
+    ///
+    /// `EnvFilter::to_string()` returns a comma-separated representation of all
+    /// directives. We use this as a stable, public inspection surface.
+    fn filter_directives(filter: &EnvFilter) -> String {
+        filter.to_string()
+    }
+
+    #[test]
+    fn default_filter_suppresses_ureq() {
+        // No env, no override → ureq and ureq_proto must be suppressed.
+        let filter = build_env_filter(None);
+        let directives = filter_directives(&filter);
+        assert!(
+            directives.contains("ureq=warn"),
+            "ureq=warn must be present in default filter; got: {directives}"
+        );
+        assert!(
+            directives.contains("ureq_proto=warn"),
+            "ureq_proto=warn must be present in default filter; got: {directives}"
+        );
+        assert!(
+            directives.contains("rustls=warn"),
+            "rustls=warn must be present in default filter; got: {directives}"
+        );
+    }
+
+    #[test]
+    fn default_filter_keeps_kreuzberg_info() {
+        // Root level info → kreuzberg has no suppression applied.
+        let filter = build_env_filter(None);
+        let directives = filter_directives(&filter);
+        assert!(
+            !directives.contains("kreuzberg=warn") && !directives.contains("kreuzberg=error"),
+            "kreuzberg must not be suppressed in the default filter; got: {directives}"
+        );
+    }
+
+    #[test]
+    fn env_override_wins_for_third_party() {
+        // Simulate RUST_LOG=ureq=debug by passing it as the level_override.
+        // build_env_filter must detect the existing ureq= directive and skip the
+        // ureq=warn suppression, so ureq=debug survives in the final filter.
+        let filter = build_env_filter(Some("info,ureq=debug"));
+        let directives = filter.to_string();
+        assert!(
+            directives.contains("ureq=debug"),
+            "user-supplied ureq=debug must be preserved; got: {directives}"
+        );
+        assert!(
+            !directives.contains("ureq=warn"),
+            "ureq=warn suppression must not be added when user already set ureq=debug; got: {directives}"
+        );
+    }
+
+    #[test]
+    fn level_override_wins() {
+        // CLI flag "debug" → root must be debug; suppression directives still present.
+        let filter = build_env_filter(Some("debug"));
+        let directives = filter_directives(&filter);
+        assert!(
+            directives.contains("debug"),
+            "root debug level must appear in filter with --log-level debug; got: {directives}"
+        );
+        // Suppression for ureq must still be layered on top.
+        assert!(
+            directives.contains("ureq=warn"),
+            "ureq=warn suppression must still be present even under --log-level debug; got: {directives}"
+        );
+    }
+
+    #[test]
+    fn tower_http_suppressed_at_default() {
+        // No override → tower_http must be suppressed.
+        let filter = build_env_filter(None);
+        let directives = filter_directives(&filter);
+        assert!(
+            directives.contains("tower_http=info") || directives.contains("tower_http=warn"),
+            "tower_http must be suppressed at default level; got: {directives}"
+        );
+    }
+
+    #[test]
+    fn all_quiet_directives_are_valid() {
+        // Ensure every built-in directive parses without panic.
+        for directive in super::QUIET_DIRECTIVES {
+            directive
+                .parse::<tracing_subscriber::filter::Directive>()
+                .expect("built-in directive is invalid");
+        }
+    }
+
+    #[test]
+    fn no_level_override_uses_info_root() {
+        // Without RUST_LOG set and no override, root should default to info.
+        // The directive string must not open with debug or trace as the root level.
+        let filter = build_env_filter(None);
+        let directives = filter_directives(&filter);
+        // Root "debug" or "trace" as the first token would mean root is debug/trace.
+        // NOTE(review): this only inspects the start of the string; it presumes the
+        // root level would be rendered first — confirm against EnvFilter's Display.
+        let root_is_noisier_than_info = directives.starts_with("debug") || directives.starts_with("trace");
+        assert!(
+            !root_is_noisier_than_info,
+            "default root level must not be debug/trace without RUST_LOG; got: {directives}"
+        );
+    }
+
+    #[test]
+    fn hf_hub_suppressed_at_default() {
+        let filter = build_env_filter(None);
+        let directives = filter_directives(&filter);
+        assert!(
+            directives.contains("hf_hub=info"),
+            "hf_hub must be suppressed to info at default; got: {directives}"
+        );
+    }
+
+    #[test]
+    fn hyper_util_suppressed_at_default() {
+        let filter = build_env_filter(None);
+        let directives = filter_directives(&filter);
+        assert!(
+            directives.contains("hyper_util=warn"),
+            "hyper_util must be suppressed to warn at default; got: {directives}"
+        );
+    }
+
+    #[test]
+    fn malformed_level_override_falls_back_to_info() {
+        // Garbage CLI flag must NOT panic — try_new returns Err and we fall back
+        // to RUST_LOG / info default.
+        let filter = build_env_filter(Some(":::garbage"));
+        let directives = filter_directives(&filter);
+        // Quiet directives should still be layered, proving we recovered.
+        // NOTE(review): the root level itself is not asserted here, despite the
+        // test name; only the recovery (no panic, suppressions present) is checked.
+        assert!(
+            directives.contains("ureq=warn"),
+            "ureq=warn must still be present after malformed override; got: {directives}"
+        );
+    }
+
+    #[test]
+    fn similar_target_name_does_not_block_suppression() {
+        // A user-supplied directive for `hf_hub_server` must NOT cause the
+        // `hf_hub=info` suppression to be skipped (regression test for the
+        // earlier substring-containment bug).
+        let filter = build_env_filter(Some("info,hf_hub_server=debug"));
+        let directives = filter.to_string();
+        assert!(
+            directives.contains("hf_hub_server=debug"),
+            "user directive for hf_hub_server must survive; got: {directives}"
+        );
+        assert!(
+            directives.contains("hf_hub=info"),
+            "hf_hub=info suppression must still be applied; got: {directives}"
+        );
+    }
+}
--- a/crates/kreuzberg-cli/src/main.rs
+++ b/crates/kreuzberg-cli/src/main.rs
@@ -0,0 +1,971 @@
+//! Kreuzberg CLI - Command-line interface for document intelligence.
+//!
+//! This binary provides a command-line interface to the Kreuzberg document intelligence
+//! library, supporting document extraction, MIME type detection, caching, and batch operations.
+//!
+//! # Architecture
+//!
+//! The CLI is built using `clap` for argument parsing and provides these main commands:
+//! - `extract`: Extract text/data from a single document
+//! - `batch`: Process multiple documents in parallel
+//! - `detect`: Identify MIME type of a file
+//! - `cache`: Manage cache (stats, clear, manifest, warm)
+//! - `serve`: Start API server (requires `api` feature)
+//! - `version`: Show version information
+//!
+//! # Configuration
+//!
+//! The CLI supports configuration files in TOML, YAML, or JSON formats:
+//! - Explicit: `--config path/to/config.toml`
+//! - Auto-discovery: Searches for `kreuzberg.{toml,yaml,json}` in current and parent directories
+//! - Inline JSON: `--config-json '{"ocr": {"backend": "tesseract"}}'`
+//! - Command-line flags override config file settings
+//!
+//! Configuration precedence (highest to lowest):
+//! 1. Individual CLI flags (--output-format, --ocr, etc.)
+//! 2. Inline JSON config (--config-json or --config-json-base64)
+//! 3. Config file (--config path.toml)
+//! 4. Default values
+//!
+//! # Exit Codes
+//!
+//! - 0: Success
+//! - Non-zero: Error (see stderr for details)
+//!
+//! # Examples
+//!
+//! ```bash
+//! # Extract text from a PDF
+//! kreuzberg extract document.pdf
+//!
+//! # Extract with OCR enabled
+//! kreuzberg extract scanned.pdf --ocr true
+//!
+//! # Extract with inline JSON config
+//! kreuzberg extract doc.pdf --config-json '{"ocr":{"backend":"tesseract"}}'
+//!
+//! # Batch processing
+//! kreuzberg batch *.pdf --output-format json
+//!
+//! # Detect MIME type
+//! kreuzberg detect unknown-file.bin
+//! ```
+
+#![deny(unsafe_code)]
+
+mod commands;
+mod logging;
+mod output;
+mod style;
+
+use anyhow::{Context, Result};
+use base64::{Engine as _, engine::general_purpose::STANDARD};
+use clap::{CommandFactory, Parser, Subcommand};
+#[cfg(feature = "embeddings")]
+use commands::embed_command;
+#[cfg(feature = "mcp")]
+use commands::mcp_command;
+use commands::overrides::ExtractionOverrides;
+#[cfg(feature = "api")]
+use commands::serve_command;
+use commands::{
+    batch_command, chunk_command, clear_command, extract_command,
+    extract_structured::{ExtractStructuredArgs, extract_structured_command},
+    load_config, manifest_command, stats_command, warm_command,
+};
+use kreuzberg::{OutputFormat as ContentOutputFormat, detect_mime_type};
+use serde_json::json;
+use std::path::{Path, PathBuf};
+
+/// Kreuzberg document intelligence CLI
+#[derive(Parser)] // clap derives the parser; `main` obtains an instance via `Cli::parse()`
+#[command(name = "kreuzberg")]
+#[command(version, about, long_about = None)]
+struct Cli {
+    /// Set log level (trace, debug, info, warn, error). Overrides RUST_LOG env var.
+    #[arg(long, global = true)] // `global = true`: the flag is also accepted on subcommands
+    log_level: Option<String>, // handed to `logging::build_env_filter` in `main`
+
+    #[command(subcommand)]
+    command: Commands, // the selected subcommand; `main` matches on it to dispatch
+}
+
+// Each variant is one subcommand; `main` matches on the parsed value to dispatch.
+// The `///` comments double as clap help text, so they are user-facing. Every
+// `format` flag is a `WireFormat`, whose `FromStr` impl accepts text, json, and toon.
+#[derive(Subcommand)]
+enum Commands {
+    /// Extract text from a document
+    Extract {
+        /// Path to the document
+        path: PathBuf,
+
+        /// Path to config file (TOML, YAML, or JSON). If not specified, searches for kreuzberg.toml/yaml/json in current and parent directories.
+        #[arg(short, long)]
+        config: Option<PathBuf>,
+
+        /// Inline JSON configuration. Applied after config file but before individual flags.
+        ///
+        /// Example: --config-json '{"ocr":{"backend":"tesseract"},"chunking":{"max_chars":1000}}'
+        #[arg(long)]
+        config_json: Option<String>,
+
+        /// Base64-encoded JSON configuration. Useful for shell environments where quotes are problematic.
+        ///
+        /// Example: --config-json-base64 eyJvY3IiOnsiYmFja2VuZCI6InRlc3NlcmFjdCJ9fQ==
+        #[arg(long)]
+        config_json_base64: Option<String>,
+
+        /// MIME type hint (auto-detected if not provided)
+        #[arg(short, long)]
+        mime_type: Option<String>,
+
+        /// Output format for CLI results (text, json, or toon).
+        ///
+        /// Controls how the CLI displays results, not the extraction content format.
+        #[arg(short, long, default_value = "text")]
+        format: WireFormat,
+
+        /// Extraction configuration overrides
+        #[command(flatten)]
+        overrides: ExtractionOverrides,
+    },
+
+    /// Extract structured data from a document using an LLM
+    ExtractStructured {
+        /// Path to the document file
+        path: PathBuf,
+
+        /// Path to JSON schema file defining the output structure
+        #[arg(long)]
+        schema: PathBuf,
+
+        /// LLM model (e.g., "openai/gpt-4o")
+        #[arg(long)]
+        model: String,
+
+        /// API key for the LLM provider
+        #[arg(long)]
+        api_key: Option<String>,
+
+        /// Custom Jinja2 prompt template
+        #[arg(long)]
+        prompt: Option<String>,
+
+        /// Schema name
+        // NOTE(review): with a default_value, clap presumably always yields Some(..) here — confirm.
+        #[arg(long, default_value = "extraction")]
+        schema_name: Option<String>,
+
+        /// Enable strict mode
+        #[arg(long)]
+        strict: bool,
+
+        /// Config file path
+        #[arg(short, long)]
+        config: Option<PathBuf>,
+
+        /// Output format (text, json, or toon)
+        #[arg(short, long, default_value = "json")]
+        format: WireFormat,
+    },
+
+    /// Batch extract from multiple documents
+    Batch {
+        /// Paths to documents
+        paths: Vec<PathBuf>,
+
+        /// Path to config file (TOML, YAML, or JSON). If not specified, searches for kreuzberg.toml/yaml/json in current and parent directories.
+        #[arg(short, long)]
+        config: Option<PathBuf>,
+
+        /// Inline JSON configuration. Applied after config file but before individual flags.
+        ///
+        /// Example: --config-json '{"ocr":{"backend":"tesseract"},"chunking":{"max_chars":1000}}'
+        #[arg(long)]
+        config_json: Option<String>,
+
+        /// Base64-encoded JSON configuration. Useful for shell environments where quotes are problematic.
+        ///
+        /// Example: --config-json-base64 eyJvY3IiOnsiYmFja2VuZCI6InRlc3NlcmFjdCJ9fQ==
+        #[arg(long)]
+        config_json_base64: Option<String>,
+
+        /// Output format for CLI results (text, json, or toon).
+        ///
+        /// Controls how the CLI displays results, not the extraction content format.
+        #[arg(short, long, default_value = "json")]
+        format: WireFormat,
+
+        /// Extraction configuration overrides
+        #[command(flatten)]
+        overrides: ExtractionOverrides,
+
+        /// Path to a JSON file mapping file paths to per-file extraction config overrides.
+        /// The JSON should be an object where keys are file paths and values are FileExtractionConfig objects.
+        /// Example: {"doc1.pdf": {"force_ocr": true}, "doc2.pdf": {"output_format": "markdown"}}
+        #[arg(long)]
+        file_configs: Option<PathBuf>,
+    },
+
+    /// Detect MIME type of a file
+    Detect {
+        /// Path to the file
+        path: PathBuf,
+
+        /// Output format (text, json, or toon)
+        #[arg(short, long, default_value = "text")]
+        format: WireFormat,
+    },
+
+    /// List all supported document formats
+    Formats {
+        /// Output format (text, json, or toon)
+        #[arg(short, long, default_value = "text")]
+        format: WireFormat,
+    },
+
+    /// Show version information
+    Version {
+        /// Output format (text, json, or toon)
+        #[arg(short, long, default_value = "text")]
+        format: WireFormat,
+    },
+
+    /// Cache management operations
+    Cache {
+        #[command(subcommand)]
+        command: CacheCommands,
+    },
+
+    /// Start the API server
+    ///
+    /// Configuration is loaded with the following precedence (highest to lowest):
+    /// 1. CLI arguments (--host, --port)
+    /// 2. Environment variables (KREUZBERG_HOST, KREUZBERG_PORT)
+    /// 3. Config file (TOML, YAML, or JSON)
+    /// 4. Built-in defaults (127.0.0.1:8000)
+    ///
+    /// The config file can contain both extraction and server settings under [server] section.
+    #[cfg(feature = "api")]
+    Serve {
+        /// Host to bind to (e.g., "127.0.0.1" or "0.0.0.0"). CLI arg overrides config file and env vars.
+        #[arg(short = 'H', long)]
+        host: Option<String>,
+
+        /// Port to bind to. CLI arg overrides config file and env vars.
+        #[arg(short, long)]
+        port: Option<u16>,
+
+        /// Path to config file (TOML, YAML, or JSON). If not specified, searches for kreuzberg.toml/yaml/json in current and parent directories.
+        #[arg(short, long)]
+        config: Option<PathBuf>,
+    },
+
+    /// Start the MCP (Model Context Protocol) server
+    #[cfg(feature = "mcp")]
+    Mcp {
+        /// Path to config file (TOML, YAML, or JSON). If not specified, searches for kreuzberg.toml/yaml/json in current and parent directories.
+        #[arg(short, long)]
+        config: Option<PathBuf>,
+
+        /// Transport mode: stdio (default) or http
+        #[arg(long, default_value = "stdio")]
+        transport: String,
+
+        /// HTTP host (only for --transport http)
+        #[arg(long, default_value = "127.0.0.1")]
+        host: String,
+
+        /// HTTP port (only for --transport http)
+        #[arg(long, default_value = "8001")]
+        port: u16,
+    },
+
+    /// API utilities
+    #[cfg(feature = "api")]
+    Api {
+        #[command(subcommand)]
+        command: ApiCommands,
+    },
+
+    /// Generate embeddings for text
+    ///
+    /// Generates vector embeddings for one or more text inputs using a specified preset model
+    /// or an LLM provider. Reads from --text flag or stdin if no text is provided.
+    #[cfg(feature = "embeddings")]
+    Embed {
+        /// Text to embed. Can be specified multiple times for batch embedding.
+        #[arg(long)]
+        text: Vec<String>,
+
+        /// Embedding preset (fast, balanced, quality, multilingual). Used with --provider local.
+        #[arg(long, default_value = "balanced")]
+        preset: String,
+
+        /// Embedding provider: "local" (default, ONNX), "llm" (liter-llm), or "plugin" (registered in-process backend)
+        #[arg(long, default_value = "local")]
+        provider: String,
+
+        /// LLM model for provider-hosted embeddings (e.g., "openai/text-embedding-3-small").
+        /// Required when --provider is "llm".
+        #[arg(long)]
+        model: Option<String>,
+
+        /// API key for the LLM provider
+        #[arg(long)]
+        api_key: Option<String>,
+
+        /// Name of a pre-registered in-process embedding backend.
+        /// Required when --provider is "plugin". The backend must have been
+        /// registered via `kreuzberg::plugins::register_embedding_backend`
+        /// before this command runs.
+        #[arg(long)]
+        plugin: Option<String>,
+
+        /// Output format (text, json, or toon)
+        #[arg(short, long, default_value = "json")]
+        format: WireFormat,
+    },
+
+    /// Chunk text for processing
+    ///
+    /// Splits text into chunks using configurable size and overlap.
+    /// Reads from --text flag or stdin if no text is provided.
+    Chunk {
+        /// Text to chunk. If not provided, reads from stdin.
+        #[arg(long)]
+        text: Option<String>,
+
+        /// Path to config file (TOML, YAML, or JSON)
+        #[arg(short, long)]
+        config: Option<PathBuf>,
+
+        /// Chunk size in characters
+        #[arg(long)]
+        chunk_size: Option<usize>,
+
+        /// Chunk overlap in characters
+        #[arg(long)]
+        chunk_overlap: Option<usize>,
+
+        /// Chunker type: text, markdown, yaml, or semantic
+        // NOTE(review): because of the "text" default this flag is always set, so `main`
+        // overwrites any chunker type loaded from the config file — confirm intended.
+        #[arg(long, default_value = "text")]
+        chunker_type: String,
+
+        /// Tokenizer model for token-based chunk sizing (e.g., "Xenova/gpt-4o").
+        /// Requires the chunking-tokenizers feature.
+        #[arg(long)]
+        chunking_tokenizer: Option<String>,
+
+        /// Topic threshold for semantic chunking (0.0-1.0, default: 0.75)
+        #[arg(long)]
+        topic_threshold: Option<f32>,
+
+        /// Output format (text, json, or toon)
+        #[arg(short, long, default_value = "json")]
+        format: WireFormat,
+    },
+
+    /// Generate shell completions
+    ///
+    /// Outputs shell completion scripts for the specified shell.
+    /// Install with: eval "$(kreuzberg completions bash)"
+    Completions {
+        /// Shell to generate completions for
+        #[arg(value_enum)]
+        shell: clap_complete::Shell,
+    },
+}
+
+#[cfg(feature = "api")] // compiled only when the `api` feature is enabled
+#[derive(Subcommand)]
+enum ApiCommands {
+    /// Output the OpenAPI schema (JSON)
+    ///
+    /// Prints the full OpenAPI 3.1 specification for the kreuzberg REST API.
+    /// Useful for code generation, documentation, and API client tooling.
+    Schema, // `main` handles this by printing `kreuzberg::api::openapi::openapi_json()`
+}
+
+// Subcommands of `kreuzberg cache`; `main` forwards each variant's fields to the
+// matching `*_command` function. The `///` comments double as clap help text.
+#[derive(Subcommand)]
+enum CacheCommands {
+    /// Show cache statistics
+    Stats {
+        /// Cache directory (default: .kreuzberg in current directory)
+        #[arg(short, long)]
+        cache_dir: Option<PathBuf>,
+
+        /// Output format (text, json, or toon)
+        #[arg(short, long, default_value = "text")]
+        format: WireFormat,
+    },
+
+    /// Clear the cache
+    Clear {
+        /// Cache directory (default: .kreuzberg in current directory)
+        #[arg(short, long)]
+        cache_dir: Option<PathBuf>,
+
+        /// Output format (text, json, or toon)
+        #[arg(short, long, default_value = "text")]
+        format: WireFormat,
+    },
+
+    /// Output model manifest (expected model files, checksums, sizes)
+    ///
+    /// Outputs a JSON manifest of all model files required by kreuzberg,
+    /// including their relative paths, SHA256 checksums, and sizes.
+    /// Used for pre-populating model caches in containerized deployments.
+    Manifest {
+        /// Output format (text, json, or toon)
+        #[arg(short, long, default_value = "json")]
+        format: WireFormat,
+    },
+
+    /// Download all models eagerly
+    ///
+    /// Downloads all PaddleOCR and layout detection models for all supported
+    /// languages. Unlike normal operation which downloads lazily on first use,
+    /// this ensures all models are present in the cache directory.
+    ///
+    /// Use --all-embeddings to also download all 4 embedding model presets,
+    /// or --embedding-model <preset> to download a specific one.
+    ///
+    /// By default, only the core layout models (rtdetr + tatr) are downloaded.
+    /// Use --all-table-models to also download SLANeXT variants (~730MB).
+    Warm {
+        /// Cache directory (default: .kreuzberg in current directory, or KREUZBERG_CACHE_DIR)
+        #[arg(short, long)]
+        cache_dir: Option<PathBuf>,
+
+        /// Output format (text, json, or toon)
+        #[arg(short, long, default_value = "text")]
+        format: WireFormat,
+
+        /// Download all embedding model presets (fast, balanced, quality, multilingual)
+        #[arg(long)]
+        all_embeddings: bool,
+
+        /// Download a specific embedding model preset
+        #[arg(long, value_name = "PRESET")]
+        embedding_model: Option<String>,
+
+        /// Download all table structure models including SLANeXT variants (~730MB)
+        #[arg(
+            long,
+            help = "Download all table structure models including SLANeXT variants (~730MB)"
+        )]
+        all_table_models: bool,
+
+        /// Download all tree-sitter grammar parsers
+        #[arg(long)]
+        all_grammars: bool,
+
+        /// Download specific tree-sitter grammar groups (comma-separated: web,systems,scripting,data,jvm,functional)
+        #[arg(long, value_name = "GROUPS", value_delimiter = ',')]
+        grammar_groups: Option<Vec<String>>,
+
+        /// Download specific tree-sitter grammars by language name (comma-separated)
+        #[arg(long, value_name = "LANGUAGES", value_delimiter = ',')]
+        grammars: Option<Vec<String>>,
+    },
+}
+
+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
+enum WireFormat { // how CLI results are rendered; parsed from strings by the `FromStr` impl in this file
+    Text,
+    Json, // `main` renders this with `serde_json::to_string_pretty`
+    Toon, // `main` renders this with `serde_toon::to_string`
+}
+
+impl std::str::FromStr for WireFormat {
+    type Err = String;
+
+    /// Parses a format name, ignoring case: `text`, `json`, or `toon`.
+    ///
+    /// # Errors
+    ///
+    /// Returns a message that echoes the rejected input and lists the accepted names.
+    fn from_str(s: &str) -> std::result::Result<Self, Self::Err> {
+        let normalized = s.to_lowercase();
+        let parsed = match normalized.as_str() {
+            "text" => Self::Text,
+            "json" => Self::Json,
+            "toon" => Self::Toon,
+            _ => return Err(format!("Invalid format: {}. Use 'text', 'json', or 'toon'", s)),
+        };
+        Ok(parsed)
+    }
+}
+
+/// Content output format for extraction results.
+///
+/// Controls the format of the extracted content (not the CLI output format).
+#[derive(Clone, Copy, Debug, PartialEq, Eq, clap::ValueEnum)] // ValueEnum: clap derives the accepted values from the variants
+enum ContentOutputFormatArg { // converted into the library's `ContentOutputFormat` by the `From` impl in this file
+    /// Plain text (default)
+    Plain,
+    /// Markdown format
+    Markdown,
+    /// Djot markup format
+    Djot,
+    /// HTML format
+    Html,
+    /// JSON tree format with heading-driven sections
+    Json,
+}
+
+impl From<ContentOutputFormatArg> for ContentOutputFormat {
+    /// Maps each CLI-facing variant onto the library variant of the same name.
+    fn from(arg: ContentOutputFormatArg) -> Self {
+        use ContentOutputFormatArg as Arg;
+
+        match arg {
+            Arg::Plain => Self::Plain,
+            Arg::Markdown => Self::Markdown,
+            Arg::Djot => Self::Djot,
+            Arg::Html => Self::Html,
+            Arg::Json => Self::Json,
+        }
+    }
+}
+
+/// Checks that `path` refers to an existing regular file.
+///
+/// Two guards run in order: the path must exist, and it must be a regular file
+/// rather than a directory or special file. Each failure yields a user-facing
+/// message that quotes the offending path.
+///
+/// # Errors
+///
+/// Returns an error if:
+/// - `path.exists()` is false
+/// - `path.is_file()` is false (e.g., the path is a directory)
+fn validate_file_exists(path: &Path) -> Result<()> {
+    anyhow::ensure!(
+        path.exists(),
+        "File not found: '{}'. Please check that the file exists and is accessible.",
+        path.display()
+    );
+    anyhow::ensure!(
+        path.is_file(),
+        "Path is not a file: '{}'. Please provide a path to a regular file.",
+        path.display()
+    );
+    Ok(())
+}
+
+/// Validates chunking parameters for correctness.
+///
+/// Ensures that chunking configuration makes sense: size must be positive and reasonable,
+/// and overlap must be smaller than chunk size. This prevents common configuration errors
+/// that would lead to cryptic failures from the underlying library.
+///
+/// The overlap check only runs when BOTH values are supplied; an overlap given
+/// without a chunk size is not compared against anything here.
+///
+/// # Errors
+///
+/// Returns an error if:
+/// - `chunk_size` is 0 (must be at least 1 character)
+/// - `chunk_size` exceeds 1,000,000 characters (to prevent excessive memory usage);
+///   exactly 1,000,000 is accepted
+/// - `chunk_overlap` is greater than or equal to `chunk_size` (overlap must be smaller)
+fn validate_chunk_params(chunk_size: Option<usize>, chunk_overlap: Option<usize>) -> Result<()> {
+    /// Largest accepted chunk size, in characters (inclusive).
+    const MAX_CHUNK_SIZE: usize = 1_000_000;
+
+    if let Some(size) = chunk_size {
+        if size == 0 {
+            anyhow::bail!("Invalid chunk size: {}. Chunk size must be greater than 0.", size);
+        }
+        if size > MAX_CHUNK_SIZE {
+            // The bound is inclusive, so the message says "at most" rather than "less than".
+            anyhow::bail!(
+                "Invalid chunk size: {}. Chunk size must be at most 1,000,000 characters to avoid excessive memory usage.",
+                size
+            );
+        }
+    }
+
+    if let Some(overlap) = chunk_overlap
+        && let Some(size) = chunk_size
+        && overlap >= size
+    {
+        anyhow::bail!(
+            "Invalid chunk overlap: {}. Overlap ({}) must be less than chunk size ({}).",
+            overlap,
+            overlap,
+            size
+        );
+    }
+
+    Ok(())
+}
+
+/// Checks the input list of a batch extraction before any work starts.
+///
+/// The list must be non-empty and every entry must pass `validate_file_exists`.
+/// Validation stops at the first bad entry, whose error is wrapped with its
+/// 1-based position in the list.
+///
+/// # Errors
+///
+/// Returns an error if:
+/// - `paths` is empty (at least one file is required)
+/// - Any path does not exist or is not a regular file
+fn validate_batch_paths(paths: &[PathBuf]) -> Result<()> {
+    anyhow::ensure!(
+        !paths.is_empty(),
+        "No files provided for batch extraction. Please provide at least one file path."
+    );
+
+    // Pair each path with its 1-based position for the error context.
+    paths.iter().zip(1usize..).try_for_each(|(path, position)| {
+        validate_file_exists(path).with_context(|| format!("Invalid file at position {}", position))
+    })
+}
+
+/// Merges an inline JSON override (plain or base64-encoded) into `config`.
+///
+/// At most one source is used: when `config_json` is present it wins and
+/// `config_json_base64` is not looked at. When neither is present, `config` is
+/// left untouched.
+///
+/// # Errors
+///
+/// Returns an error if the base64 payload cannot be decoded or is not UTF-8, if
+/// the text is not valid JSON, or if merging into the existing config fails.
+fn apply_json_overrides(
+    config: &mut kreuzberg::ExtractionConfig,
+    config_json: Option<String>,
+    config_json_base64: Option<String>,
+) -> Result<()> {
+    // Resolve the JSON text plus the flag-specific error contexts, then share
+    // the parse-and-merge tail between both sources.
+    let (json_text, parse_failure, merge_failure) = if let Some(inline) = config_json {
+        (
+            inline,
+            "Failed to parse --config-json as JSON",
+            "Failed to merge --config-json with file config",
+        )
+    } else if let Some(encoded) = config_json_base64 {
+        let decoded = STANDARD
+            .decode(&encoded)
+            .context("Failed to decode base64 in --config-json-base64")?;
+        (
+            String::from_utf8(decoded).context("Base64-decoded content is not valid UTF-8")?,
+            "Failed to parse decoded --config-json-base64 as JSON",
+            "Failed to merge --config-json-base64 with file config",
+        )
+    } else {
+        return Ok(());
+    };
+
+    let overrides: serde_json::Value = serde_json::from_str(&json_text).context(parse_failure)?;
+    *config = merge_json_into_config(config, overrides).context(merge_failure)?;
+    Ok(())
+}
+
+/// Produces a new extraction config by layering `json_value` over `base_config`.
+///
+/// The value is serialized back to a JSON string and handed to
+/// `kreuzberg::core::config::merge::merge_config_json`, which performs the merge.
+///
+/// # Errors
+///
+/// Returns an error carrying the display text of either the serialization
+/// failure or the merge failure.
+fn merge_json_into_config(
+    base_config: &kreuzberg::ExtractionConfig,
+    json_value: serde_json::Value,
+) -> Result<kreuzberg::ExtractionConfig> {
+    let overrides_json = serde_json::to_string(&json_value).map_err(|err| anyhow::anyhow!("{}", err))?;
+    let merged = kreuzberg::core::config::merge::merge_config_json(base_config, &overrides_json)
+        .map_err(|err| anyhow::anyhow!("{}", err))?;
+    Ok(merged)
+}
+
+/// CLI entry point: parses arguments, installs logging, and dispatches the subcommand.
+///
+/// # Errors
+///
+/// Propagates the first validation, configuration, serialization, or command
+/// error; returning it from `main` makes the process exit unsuccessfully.
+fn main() -> Result<()> {
+    let cli = Cli::parse();
+
+    let env_filter = logging::build_env_filter(cli.log_level.as_deref());
+
+    // Logs are written to stderr, keeping stdout for command output. The
+    // `try_init` result is deliberately discarded (presumably it only fails
+    // when a global subscriber is already installed).
+    let _ = tracing_subscriber::fmt()
+        .with_env_filter(env_filter)
+        .with_writer(std::io::stderr)
+        .try_init();
+
+    match cli.command {
+        Commands::Extract {
+            path,
+            config: config_path,
+            config_json,
+            config_json_base64,
+            mime_type,
+            format,
+            overrides,
+        } => {
+            validate_file_exists(&path)?;
+            overrides.validate()?;
+
+            // Layering order: config file, then inline JSON, then individual flags.
+            let mut config = load_config(config_path)?;
+            apply_json_overrides(&mut config, config_json, config_json_base64)?;
+            overrides.apply(&mut config);
+
+            extract_command(path, config, mime_type, format)?;
+        }
+
+        Commands::ExtractStructured {
+            path,
+            schema,
+            model,
+            api_key,
+            prompt,
+            schema_name,
+            strict,
+            config,
+            format,
+        } => {
+            validate_file_exists(&path)?;
+            validate_file_exists(&schema)?;
+            extract_structured_command(ExtractStructuredArgs {
+                path,
+                schema_path: schema,
+                model,
+                api_key,
+                prompt,
+                schema_name,
+                strict,
+                config_path: config,
+                format,
+            })?;
+        }
+
+        Commands::Batch {
+            paths,
+            config: config_path,
+            config_json,
+            config_json_base64,
+            format,
+            overrides,
+            file_configs,
+        } => {
+            validate_batch_paths(&paths)?;
+            overrides.validate()?;
+
+            // Same layering order as `extract`.
+            let mut config = load_config(config_path)?;
+            apply_json_overrides(&mut config, config_json, config_json_base64)?;
+            overrides.apply(&mut config);
+
+            // Optional per-file overrides, kept as raw JSON values keyed by path string.
+            let file_configs_map = if let Some(file_configs_path) = file_configs {
+                let file_configs_json = std::fs::read_to_string(&file_configs_path)
+                    .with_context(|| format!("Failed to read file configs from '{}'", file_configs_path.display()))?;
+                let map: std::collections::HashMap<String, serde_json::Value> =
+                    serde_json::from_str(&file_configs_json).with_context(|| {
+                        format!(
+                            "Failed to parse file configs JSON from '{}'",
+                            file_configs_path.display()
+                        )
+                    })?;
+                Some(map)
+            } else {
+                None
+            };
+            batch_command(paths, file_configs_map, config, format)?;
+        }
+
+        Commands::Detect { path, format } => {
+            validate_file_exists(&path)?;
+
+            let path_str = path.to_string_lossy().to_string();
+            let mime_type = detect_mime_type(path_str.clone(), true).with_context(|| {
+                format!(
+                    "Failed to detect MIME type for file '{}'. Ensure the file is readable.",
+                    path.display()
+                )
+            })?;
+
+            match format {
+                WireFormat::Text => {
+                    println!("{}", style::success(&mime_type));
+                }
+                WireFormat::Json => {
+                    let output = json!({
+                        "path": path_str,
+                        "mime_type": mime_type,
+                    });
+                    println!(
+                        "{}",
+                        serde_json::to_string_pretty(&output)
+                            .context("Failed to serialize MIME type detection result to JSON")?
+                    );
+                }
+                WireFormat::Toon => {
+                    let output = json!({
+                        "path": path_str,
+                        "mime_type": mime_type,
+                    });
+                    println!(
+                        "{}",
+                        serde_toon::to_string(&output)
+                            .context("Failed to serialize MIME type detection result to TOON")?
+                    );
+                }
+            }
+        }
+
+        Commands::Formats { format } => {
+            let formats = kreuzberg::core::mime::list_supported_formats();
+            match format {
+                WireFormat::Text => {
+                    println!("{:<15} {}", style::label("EXTENSION"), style::label("MIME TYPE"));
+                    println!("{}", style::dim(&format!("{:<15} ---------", "---------")));
+                    for f in &formats {
+                        println!("{:<15} {}", style::success(&format!(".{}", f.extension)), f.mime_type);
+                    }
+                }
+                WireFormat::Json => {
+                    println!(
+                        "{}",
+                        serde_json::to_string_pretty(&formats).context("Failed to serialize formats to JSON")?
+                    );
+                }
+                WireFormat::Toon => {
+                    println!(
+                        "{}",
+                        serde_toon::to_string(&formats).context("Failed to serialize formats to TOON")?
+                    );
+                }
+            }
+        }
+
+        Commands::Version { format } => {
+            // Both values are baked in at compile time from this crate's Cargo metadata.
+            let version = env!("CARGO_PKG_VERSION");
+            let name = env!("CARGO_PKG_NAME");
+
+            match format {
+                WireFormat::Text => {
+                    println!("{} {}", style::label(name), style::success(version));
+                }
+                WireFormat::Json => {
+                    let output = json!({
+                        "name": name,
+                        "version": version,
+                    });
+                    println!(
+                        "{}",
+                        serde_json::to_string_pretty(&output)
+                            .context("Failed to serialize version information to JSON")?
+                    );
+                }
+                WireFormat::Toon => {
+                    let output = json!({
+                        "name": name,
+                        "version": version,
+                    });
+                    println!(
+                        "{}",
+                        serde_toon::to_string(&output).context("Failed to serialize version information to TOON")?
+                    );
+                }
+            }
+        }
+
+        #[cfg(feature = "api")]
+        Commands::Serve {
+            host: cli_host,
+            port: cli_port,
+            config: config_path,
+        } => {
+            // The path is cloned because `serve_command` also receives it.
+            let mut extraction_config = load_config(config_path.clone())?;
+            extraction_config.apply_env_overrides()?;
+            serve_command(cli_host, cli_port, extraction_config, config_path)?;
+        }
+
+        // `host` and `port` are bound unconditionally: the previous cfg-gated
+        // duplicates bound the same names under both `mcp-http` and its negation.
+        #[cfg(feature = "mcp")]
+        Commands::Mcp {
+            config: config_path,
+            transport,
+            host,
+            port,
+        } => {
+            let mut config = load_config(config_path)?;
+            config.apply_env_overrides()?;
+            mcp_command(config, transport, host, port)?;
+        }
+
+        Commands::Cache { command } => match command {
+            CacheCommands::Stats { cache_dir, format } => {
+                stats_command(cache_dir, format)?;
+            }
+            CacheCommands::Clear { cache_dir, format } => {
+                clear_command(cache_dir, format)?;
+            }
+            CacheCommands::Manifest { format } => {
+                manifest_command(format)?;
+            }
+            CacheCommands::Warm {
+                cache_dir,
+                format,
+                all_embeddings,
+                embedding_model,
+                all_table_models,
+                all_grammars,
+                grammar_groups,
+                grammars,
+            } => {
+                warm_command(
+                    cache_dir,
+                    format,
+                    all_embeddings,
+                    embedding_model,
+                    all_table_models,
+                    all_grammars,
+                    grammar_groups,
+                    grammars,
+                )?;
+            }
+        },
+
+        #[cfg(feature = "api")]
+        Commands::Api { command } => match command {
+            ApiCommands::Schema => {
+                println!("{}", kreuzberg::api::openapi::openapi_json());
+            }
+        },
+
+        #[cfg(feature = "embeddings")]
+        Commands::Embed {
+            text,
+            preset,
+            provider,
+            model,
+            api_key,
+            plugin,
+            format,
+        } => {
+            // With no --text flags, the whole of stdin becomes the single input.
+            let texts = if text.is_empty() {
+                vec![commands::read_stdin()?]
+            } else {
+                text
+            };
+            embed_command(texts, &preset, &provider, model, api_key, plugin, format)?;
+        }
+
+        Commands::Chunk {
+            text,
+            config: config_path,
+            chunk_size,
+            chunk_overlap,
+            chunker_type,
+            chunking_tokenizer,
+            topic_threshold,
+            format,
+        } => {
+            // Check the flags before touching stdin so a bad invocation fails
+            // immediately instead of first waiting for input.
+            validate_chunk_params(chunk_size, chunk_overlap)?;
+
+            // Reject unknown chunker names instead of silently chunking as plain text.
+            let selected_chunker = match chunker_type.as_str() {
+                "text" => kreuzberg::ChunkerType::Text,
+                "markdown" => kreuzberg::ChunkerType::Markdown,
+                "yaml" => kreuzberg::ChunkerType::Yaml,
+                "semantic" => kreuzberg::ChunkerType::Semantic,
+                other => anyhow::bail!(
+                    "Invalid chunker type: '{}'. Use 'text', 'markdown', 'yaml', or 'semantic'.",
+                    other
+                ),
+            };
+
+            let input = match text {
+                Some(t) => t,
+                None => commands::read_stdin().context("No --text provided and failed to read from stdin")?,
+            };
+
+            let base_config = load_config(config_path)?;
+            let mut chunking_config = base_config.chunking.unwrap_or_default();
+
+            if let Some(size) = chunk_size {
+                chunking_config.max_characters = size;
+                // If user set chunk_size but not overlap, clamp overlap to fit
+                if chunk_overlap.is_none() && chunking_config.overlap >= size {
+                    chunking_config.overlap = size / 4;
+                }
+            }
+            if let Some(overlap) = chunk_overlap {
+                chunking_config.overlap = overlap;
+            }
+            chunking_config.chunker_type = selected_chunker;
+            if let Some(ref tokenizer) = chunking_tokenizer {
+                chunking_config.sizing = kreuzberg::ChunkSizing::Tokenizer {
+                    model: tokenizer.clone(),
+                    cache_dir: None,
+                };
+            }
+            if topic_threshold.is_some() {
+                chunking_config.topic_threshold = topic_threshold;
+            }
+
+            chunk_command(input, chunking_config, format)?;
+        }
+
+        Commands::Completions { shell } => {
+            let mut cmd = Cli::command();
+            clap_complete::generate(shell, &mut cmd, "kreuzberg", &mut std::io::stdout());
+        }
+    }
+
+    Ok(())
+}
--- a/crates/kreuzberg-cli/src/output.rs
+++ b/crates/kreuzberg-cli/src/output.rs
@@ -0,0 +1,32 @@
+//! JSON envelope types for CLI output.
+//!
+//! When `--format json` is used, extraction results are wrapped in these envelopes
+//! so tooling (such as the benchmark harness) can read timing information without
+//! parsing stderr or running a separate profiling tool.
+
+use kreuzberg::ExtractionResult;
+use serde::Serialize;
+
+/// Single-file extraction result with wall-clock timing.
+///
+/// Emitted to stdout by `kreuzberg extract --format json`.
+///
+/// The struct derives `Serialize` with no rename attributes, so the JSON
+/// object is keyed by the field names: `result` and `extraction_time_ms`.
+#[derive(Debug, Serialize)]
+pub struct ExtractEnvelope {
+    /// The extraction result (content, metadata, tables, …).
+    pub result: ExtractionResult,
+    /// Wall-clock time for the extraction call in milliseconds.
+    pub extraction_time_ms: f64,
+}
+
+/// Batch extraction results with per-file and total timing.
+///
+/// Emitted to stdout by `kreuzberg batch --format json`.
+///
+/// The struct derives `Serialize` with no rename attributes, so the JSON
+/// object is keyed by the field names: `results`, `total_ms` and `per_file_ms`.
+#[derive(Debug, Serialize)]
+pub struct BatchEnvelope {
+    /// One result per input file, in input order.
+    pub results: Vec<ExtractionResult>,
+    /// Total wall-clock time for the whole batch in milliseconds.
+    pub total_ms: f64,
+    /// Per-file wall-clock times in milliseconds, aligned with `results`
+    /// (entry `i` is the timing for `results[i]`).
+    pub per_file_ms: Vec<f64>,
+}
--- a/crates/kreuzberg-cli/src/style.rs
+++ b/crates/kreuzberg-cli/src/style.rs
@@ -0,0 +1,104 @@
+//! CLI color styling helpers using `anstyle`.
+//!
+//! Provides styled output for the kreuzberg CLI. Respects the `NO_COLOR`
+//! environment variable (<https://no-color.org/>). Note that this module does
+//! not itself check whether the output stream is a terminal.
+
+use anstyle::{AnsiColor, Effects, Style};
+use std::sync::OnceLock;
+
+/// Section headers: bold text with a blue foreground.
+const HEADER: Style = Style::new()
+    .effects(Effects::BOLD)
+    .fg_color(Some(anstyle::Color::Ansi(AnsiColor::Blue)));
+
+/// Success values (MIME types, file paths, versions): green foreground.
+const SUCCESS: Style = Style::new().fg_color(Some(anstyle::Color::Ansi(AnsiColor::Green)));
+
+/// Metadata, separators and other secondary info: dimmed text.
+const DIM: Style = Style::new().effects(Effects::DIMMED);
+
+/// Labels in key-value pairs: bold text.
+const LABEL: Style = Style::new().effects(Effects::BOLD);
+
+/// Report whether styled (colored) output should be produced.
+///
+/// Colors are turned off when the `NO_COLOR` environment variable is present,
+/// whatever its value. The environment is inspected only on the first call;
+/// the answer is cached for the rest of the process.
+///
+/// See <https://no-color.org/> for the specification.
+///
+/// NOTE(review): the no-color.org text speaks of the variable being "present
+/// and not an empty string"; this implementation also treats an empty value
+/// as "set". Confirm whether that is intended.
+pub fn is_color_enabled() -> bool {
+    static COLOR_ALLOWED: OnceLock<bool> = OnceLock::new();
+    let probe_env = || std::env::var_os("NO_COLOR").is_none();
+    *COLOR_ALLOWED.get_or_init(probe_env)
+}
+
+/// Wrap `text` in the escape sequences of `style`, or return it untouched
+/// when colors are disabled (see [`is_color_enabled`]).
+fn styled(text: &str, style: Style) -> String {
+    if !is_color_enabled() {
+        return text.to_string();
+    }
+
+    let (open, close) = (style.render(), style.render_reset());
+    format!("{open}{text}{close}")
+}
+
+/// Render `text` with the section-header style (bold blue) when colors are enabled.
+pub fn header(text: &str) -> String {
+    styled(text, HEADER)
+}
+
+/// Render `text` with the success style (green) when colors are enabled.
+pub fn success(text: &str) -> String {
+    styled(text, SUCCESS)
+}
+
+/// Render `text` with the secondary style (dimmed) when colors are enabled.
+pub fn dim(text: &str) -> String {
+    styled(text, DIM)
+}
+
+/// Render `text` with the label style (bold) when colors are enabled.
+pub fn label(text: &str) -> String {
+    styled(text, LABEL)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_styled_returns_plain_text_when_no_color() {
+        // `is_color_enabled` caches its answer in a OnceLock, so flipping
+        // NO_COLOR inside a test would go unnoticed; check the raw rendering
+        // logic with an empty style instead.
+        let plain = Style::new();
+        let text = "hello";
+        let result = format!("{}{}{}", plain.render(), text, plain.render_reset());
+        // An empty Style emits no ANSI codes, leaving only the text itself.
+        assert_eq!(result, "hello");
+    }
+
+    #[test]
+    fn test_styled_applies_ansi_when_style_present() {
+        let green = Style::new().fg_color(Some(anstyle::Color::Ansi(AnsiColor::Green)));
+        let rendered = format!("{}{}{}", green.render(), "ok", green.render_reset());
+        // A colored style must surround the text with ANSI escape sequences.
+        assert!(rendered.contains("\x1b["));
+        assert!(rendered.contains("ok"));
+    }
+
+    #[test]
+    fn test_helper_functions_return_strings() {
+        // Smoke test: every helper yields non-empty output for non-empty input.
+        let outputs = [header("h"), success("s"), dim("d"), label("l")];
+        for styled_text in &outputs {
+            assert!(!styled_text.is_empty());
+        }
+    }
+
+    #[test]
+    fn test_is_color_enabled_respects_no_color_env() {
+        // The OnceLock-cached value cannot be reset, but it must agree with
+        // the environment as long as the environment has not changed since
+        // initialization: no NO_COLOR means colors are enabled.
+        let no_color_set = std::env::var_os("NO_COLOR").is_some();
+        assert_eq!(is_color_enabled(), !no_color_set);
+    }
+}
--- a/crates/kreuzberg-cli/tests/commands_test.rs
+++ b/crates/kreuzberg-cli/tests/commands_test.rs
@@ -0,0 +1,937 @@
+//! Integration tests for CLI commands (extract, detect, batch).
+//!
+//! These tests verify that the CLI commands work correctly end-to-end,
+//! including input validation, file processing, and output formatting.
+
+use std::path::PathBuf;
+use std::process::Command;
+use tempfile::tempdir;
+
+/// Get the path to the kreuzberg binary.
+///
+/// Uses `CARGO_BIN_EXE_kreuzberg`, which Cargo sets while compiling
+/// integration tests to the absolute path of the `kreuzberg` binary target it
+/// built for this test run. Unlike a hard-coded `target/debug/kreuzberg`, this
+/// honours a custom target directory, the active profile, and the platform's
+/// executable suffix.
+fn get_binary_path() -> String {
+    env!("CARGO_BIN_EXE_kreuzberg").to_string()
+}
+
+/// Locate the `test_documents` directory, which sits two levels above this
+/// crate's manifest directory.
+fn get_test_documents_dir() -> PathBuf {
+    let crate_root = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
+    // ancestors(): 0 = crate root, 1 = its parent, 2 = the grandparent.
+    let two_up = crate_root.ancestors().nth(2).unwrap();
+    two_up.join("test_documents")
+}
+
+/// Resolve `relative_path` against `test_documents/` and return it as a
+/// (lossily converted) `String`.
+fn get_test_file(relative_path: &str) -> String {
+    let full_path = get_test_documents_dir().join(relative_path);
+    full_path.to_string_lossy().into_owned()
+}
+
+/// Build the binary before running tests.
+///
+/// Every test in this file calls this helper, so the `cargo build` invocation
+/// is guarded by a `Once`: it runs a single time per test process instead of
+/// once per test, which avoids spawning dozens of cargo processes that only
+/// wait on each other's build lock.
+///
+/// # Panics
+///
+/// Panics if `cargo` cannot be spawned or the build fails. A panic poisons the
+/// `Once`, so every later caller panics as well rather than running against a
+/// missing or stale binary.
+fn build_binary() {
+    static BUILD: std::sync::Once = std::sync::Once::new();
+
+    BUILD.call_once(|| {
+        let status = Command::new("cargo")
+            .args(["build", "--bin", "kreuzberg"])
+            .status()
+            .expect("Failed to build kreuzberg binary");
+
+        assert!(status.success(), "Failed to build kreuzberg binary");
+    });
+}
+
+/// `extract` on a plain-text fixture succeeds and writes something to stdout.
+///
+/// Returns early (and passes) when the fixture file is absent.
+#[test]
+fn test_extract_text_file() {
+    build_binary();
+
+    let test_file = get_test_file("text/simple.txt");
+    if !std::path::Path::new(&test_file).exists() {
+        // Report the skip on stderr: this test binary installs no tracing
+        // subscriber, so a `tracing` event would never be displayed.
+        eprintln!("Skipping test: {} not found", test_file);
+        return;
+    }
+
+    let output = Command::new(get_binary_path())
+        .args(["extract", test_file.as_str()])
+        .output()
+        .expect("Failed to execute extract command");
+
+    assert!(
+        output.status.success(),
+        "Extract command failed: {}",
+        String::from_utf8_lossy(&output.stderr)
+    );
+
+    let stdout = String::from_utf8_lossy(&output.stdout);
+    assert!(!stdout.is_empty(), "Extract output should not be empty");
+}
+
+/// `extract --format json` prints a JSON envelope with `result` and
+/// `extraction_time_ms`, where `result` carries `content` and `mime_type`.
+///
+/// Returns early (and passes) when the fixture file is absent.
+#[test]
+fn test_extract_with_json_output() {
+    build_binary();
+
+    let test_file = get_test_file("text/simple.txt");
+    if !std::path::Path::new(&test_file).exists() {
+        // Report the skip on stderr: this test binary installs no tracing
+        // subscriber, so a `tracing` event would never be displayed.
+        eprintln!("Skipping test: {} not found", test_file);
+        return;
+    }
+
+    let output = Command::new(get_binary_path())
+        .args(["extract", test_file.as_str(), "--format", "json"])
+        .output()
+        .expect("Failed to execute extract command");
+
+    assert!(
+        output.status.success(),
+        "Extract command failed: {}",
+        String::from_utf8_lossy(&output.stderr)
+    );
+
+    let stdout = String::from_utf8_lossy(&output.stdout);
+
+    let json_result: serde_json::Result<serde_json::Value> = serde_json::from_str(&stdout);
+    assert!(json_result.is_ok(), "Output should be valid JSON, got: {}", stdout);
+
+    let json = json_result.unwrap();
+    // JSON output is wrapped in a timing envelope: { result: ExtractionResult, extraction_time_ms: f64 }
+    assert!(json.get("result").is_some(), "JSON envelope should have 'result' field");
+    assert!(
+        json.get("extraction_time_ms").is_some(),
+        "JSON envelope should have 'extraction_time_ms' field"
+    );
+    assert!(
+        json["result"].get("content").is_some(),
+        "result should have 'content' field"
+    );
+    assert!(
+        json["result"].get("mime_type").is_some(),
+        "result should have 'mime_type' field"
+    );
+}
+
+/// `extract` with chunking enabled emits a `chunks` array under `result` in
+/// the JSON envelope.
+///
+/// Returns early (and passes) when the fixture file is absent.
+#[test]
+fn test_extract_with_chunking() {
+    build_binary();
+
+    let test_file = get_test_file("text/simple.txt");
+    if !std::path::Path::new(&test_file).exists() {
+        // Report the skip on stderr: this test binary installs no tracing
+        // subscriber, so a `tracing` event would never be displayed.
+        eprintln!("Skipping test: {} not found", test_file);
+        return;
+    }
+
+    let output = Command::new(get_binary_path())
+        .args([
+            "extract",
+            test_file.as_str(),
+            "--chunk",
+            "true",
+            "--chunk-size",
+            "100",
+            "--chunk-overlap",
+            "20",
+            "--format",
+            "json",
+        ])
+        .output()
+        .expect("Failed to execute extract command");
+
+    assert!(
+        output.status.success(),
+        "Extract with chunking failed: {}",
+        String::from_utf8_lossy(&output.stderr)
+    );
+
+    let stdout = String::from_utf8_lossy(&output.stdout);
+    let json: serde_json::Value = serde_json::from_str(&stdout).expect("Should be valid JSON");
+
+    // JSON output is wrapped in an envelope; chunks live under result
+    assert!(
+        json["result"].get("chunks").is_some(),
+        "result should have 'chunks' field"
+    );
+    assert!(json["result"]["chunks"].is_array(), "'chunks' should be an array");
+}
+
+/// `extract` must fail and report "File not found" for a path that does not exist.
+#[test]
+fn test_extract_file_not_found() {
+    build_binary();
+
+    let mut cli = Command::new(get_binary_path());
+    cli.arg("extract").arg("/nonexistent/file.txt");
+    let run = cli.output().expect("Failed to execute extract command");
+
+    assert!(!run.status.success(), "Extract should fail for nonexistent file");
+
+    let err_text = String::from_utf8_lossy(&run.stderr);
+    assert!(
+        err_text.contains("File not found"),
+        "Error should mention file not found, got: {err_text}"
+    );
+}
+
+#[test]
+fn test_extract_directory_not_file() {
+    build_binary();
+
+    let tmp_dir = tempdir().expect("Failed to create temp dir");
+    let dir_path = tmp_dir.path().to_string_lossy().to_string();
+
+    let output = Command::new(get_binary_path())
+        .args(["extract", dir_path.as_str()])
+        .output()
+        .expect("Failed to execute extract command");
+
+    assert!(!output.status.success(), "Extract should fail for directory");
+
+    let stderr = String::from_utf8_lossy(&output.stderr);
+    assert!(
+        stderr.contains("not a file") || stderr.contains("regular file"),
+        "Error should mention path is not a file, got: {}",
+        stderr
+    );
+}
+
+/// `extract --chunk-size 0` on an existing fixture must fail with a
+/// chunk-size validation error.
+///
+/// Returns early (and passes) when the fixture file is absent.
+#[test]
+fn test_extract_invalid_chunk_size_zero() {
+    build_binary();
+
+    let test_file = get_test_file("text/simple.txt");
+    if !std::path::Path::new(&test_file).exists() {
+        // Report the skip on stderr: this test binary installs no tracing
+        // subscriber, so a `tracing` event would never be displayed.
+        eprintln!("Skipping test: {} not found", test_file);
+        return;
+    }
+
+    let output = Command::new(get_binary_path())
+        .args(["extract", test_file.as_str(), "--chunk-size", "0"])
+        .output()
+        .expect("Failed to execute extract command");
+
+    assert!(!output.status.success(), "Extract should fail for chunk size 0");
+
+    let stderr = String::from_utf8_lossy(&output.stderr);
+    assert!(
+        stderr.contains("Invalid chunk size") || stderr.contains("must be greater than 0"),
+        "Error should mention invalid chunk size, got: {}",
+        stderr
+    );
+}
+
+/// `extract --chunk-size 2000000` on an existing fixture must fail with a
+/// chunk-size limit error.
+///
+/// Returns early (and passes) when the fixture file is absent.
+#[test]
+fn test_extract_invalid_chunk_size_too_large() {
+    build_binary();
+
+    let test_file = get_test_file("text/simple.txt");
+    if !std::path::Path::new(&test_file).exists() {
+        // Report the skip on stderr: this test binary installs no tracing
+        // subscriber, so a `tracing` event would never be displayed.
+        eprintln!("Skipping test: {} not found", test_file);
+        return;
+    }
+
+    let output = Command::new(get_binary_path())
+        .args(["extract", test_file.as_str(), "--chunk-size", "2000000"])
+        .output()
+        .expect("Failed to execute extract command");
+
+    assert!(!output.status.success(), "Extract should fail for chunk size > 1M");
+
+    let stderr = String::from_utf8_lossy(&output.stderr);
+    assert!(
+        stderr.contains("Invalid chunk size") || stderr.contains("1,000,000"),
+        "Error should mention chunk size limit, got: {}",
+        stderr
+    );
+}
+
+/// `extract` must fail when `--chunk-overlap` equals `--chunk-size`.
+///
+/// Returns early (and passes) when the fixture file is absent.
+#[test]
+fn test_extract_invalid_overlap_equals_chunk_size() {
+    build_binary();
+
+    let test_file = get_test_file("text/simple.txt");
+    if !std::path::Path::new(&test_file).exists() {
+        // Report the skip on stderr: this test binary installs no tracing
+        // subscriber, so a `tracing` event would never be displayed.
+        eprintln!("Skipping test: {} not found", test_file);
+        return;
+    }
+
+    let output = Command::new(get_binary_path())
+        .args([
+            "extract",
+            test_file.as_str(),
+            "--chunk-size",
+            "100",
+            "--chunk-overlap",
+            "100",
+        ])
+        .output()
+        .expect("Failed to execute extract command");
+
+    assert!(
+        !output.status.success(),
+        "Extract should fail when overlap equals chunk size"
+    );
+
+    let stderr = String::from_utf8_lossy(&output.stderr);
+    assert!(
+        stderr.contains("Invalid chunk overlap") || stderr.contains("must be less than chunk size"),
+        "Error should mention overlap constraint, got: {}",
+        stderr
+    );
+}
+
+/// `detect` on a plain-text fixture succeeds and prints a text MIME type.
+///
+/// Returns early (and passes) when the fixture file is absent.
+#[test]
+fn test_detect_mime_type() {
+    build_binary();
+
+    let test_file = get_test_file("text/simple.txt");
+    if !std::path::Path::new(&test_file).exists() {
+        // Report the skip on stderr: this test binary installs no tracing
+        // subscriber, so a `tracing` event would never be displayed.
+        eprintln!("Skipping test: {} not found", test_file);
+        return;
+    }
+
+    let output = Command::new(get_binary_path())
+        .args(["detect", test_file.as_str()])
+        .output()
+        .expect("Failed to execute detect command");
+
+    assert!(
+        output.status.success(),
+        "Detect command failed: {}",
+        String::from_utf8_lossy(&output.stderr)
+    );
+
+    let stdout = String::from_utf8_lossy(&output.stdout);
+    assert!(!stdout.is_empty(), "Detect output should not be empty");
+    // Checking for "text" alone is equivalent to the former
+    // `"text/plain" || "text"` test: any output containing "text/plain"
+    // necessarily contains "text".
+    assert!(
+        stdout.contains("text"),
+        "Should detect text MIME type, got: {}",
+        stdout
+    );
+}
+
+/// `detect --format json` prints a JSON object with `mime_type` and `path`.
+///
+/// Returns early (and passes) when the fixture file is absent.
+#[test]
+fn test_detect_with_json_output() {
+    build_binary();
+
+    let test_file = get_test_file("text/simple.txt");
+    if !std::path::Path::new(&test_file).exists() {
+        // Report the skip on stderr: this test binary installs no tracing
+        // subscriber, so a `tracing` event would never be displayed.
+        eprintln!("Skipping test: {} not found", test_file);
+        return;
+    }
+
+    let output = Command::new(get_binary_path())
+        .args(["detect", test_file.as_str(), "--format", "json"])
+        .output()
+        .expect("Failed to execute detect command");
+
+    assert!(
+        output.status.success(),
+        "Detect command failed: {}",
+        String::from_utf8_lossy(&output.stderr)
+    );
+
+    let stdout = String::from_utf8_lossy(&output.stdout);
+
+    let json_result: serde_json::Result<serde_json::Value> = serde_json::from_str(&stdout);
+    assert!(json_result.is_ok(), "Output should be valid JSON, got: {}", stdout);
+
+    let json = json_result.unwrap();
+    assert!(json.get("mime_type").is_some(), "JSON should have 'mime_type' field");
+    assert!(json.get("path").is_some(), "JSON should have 'path' field");
+}
+
+/// `detect` must fail and report "File not found" for a path that does not exist.
+#[test]
+fn test_detect_file_not_found() {
+    build_binary();
+
+    let mut cli = Command::new(get_binary_path());
+    cli.arg("detect").arg("/nonexistent/file.txt");
+    let run = cli.output().expect("Failed to execute detect command");
+
+    assert!(!run.status.success(), "Detect should fail for nonexistent file");
+
+    let err_text = String::from_utf8_lossy(&run.stderr);
+    assert!(
+        err_text.contains("File not found"),
+        "Error should mention file not found, got: {err_text}"
+    );
+}
+
+/// `batch --format json` over two inputs prints an envelope whose `results`
+/// array has one entry per input.
+///
+/// Both inputs are the same fixture, so a single existence check covers them.
+/// Returns early (and passes) when the fixture file is absent.
+#[test]
+fn test_batch_multiple_files() {
+    build_binary();
+
+    let file1 = get_test_file("text/simple.txt");
+    let file2 = get_test_file("text/simple.txt");
+
+    if !std::path::Path::new(&file1).exists() {
+        // Report the skip on stderr: this test binary installs no tracing
+        // subscriber, so a `tracing` event would never be displayed.
+        eprintln!("Skipping test: {} not found", file1);
+        return;
+    }
+
+    let output = Command::new(get_binary_path())
+        .args(["batch", file1.as_str(), file2.as_str(), "--format", "json"])
+        .output()
+        .expect("Failed to execute batch command");
+
+    assert!(
+        output.status.success(),
+        "Batch command failed: {}",
+        String::from_utf8_lossy(&output.stderr)
+    );
+
+    let stdout = String::from_utf8_lossy(&output.stdout);
+
+    let json_result: serde_json::Result<serde_json::Value> = serde_json::from_str(&stdout);
+    assert!(json_result.is_ok(), "Output should be valid JSON, got: {}", stdout);
+
+    let json = json_result.unwrap();
+    // Batch JSON output is wrapped in a timing envelope: { results: [...], total_ms, per_file_ms }
+    assert!(
+        json.get("results").is_some(),
+        "Batch envelope should have 'results' field"
+    );
+    assert!(json["results"].is_array(), "'results' should be a JSON array");
+    assert_eq!(json["results"].as_array().unwrap().len(), 2, "Should have 2 results");
+}
+
+/// `batch` must fail when any one of its inputs does not exist.
+///
+/// Returns early (and passes) when the valid fixture file is absent.
+#[test]
+fn test_batch_with_missing_file() {
+    build_binary();
+
+    let valid_file = get_test_file("text/simple.txt");
+
+    if !std::path::Path::new(&valid_file).exists() {
+        // Report the skip on stderr: this test binary installs no tracing
+        // subscriber, so a `tracing` event would never be displayed.
+        eprintln!("Skipping test: {} not found", valid_file);
+        return;
+    }
+
+    let output = Command::new(get_binary_path())
+        .args(["batch", valid_file.as_str(), "/nonexistent/file.txt"])
+        .output()
+        .expect("Failed to execute batch command");
+
+    assert!(!output.status.success(), "Batch should fail when one file is missing");
+
+    let stderr = String::from_utf8_lossy(&output.stderr);
+    assert!(
+        stderr.contains("File not found") || stderr.contains("Invalid file"),
+        "Error should mention file not found, got: {}",
+        stderr
+    );
+}
+
+/// `extract --help` succeeds and mentions its description and chunking flags.
+#[test]
+fn test_extract_help() {
+    build_binary();
+
+    let run = Command::new(get_binary_path())
+        .arg("extract")
+        .arg("--help")
+        .output()
+        .expect("Failed to execute extract --help");
+
+    assert!(run.status.success());
+
+    let help_text = String::from_utf8_lossy(&run.stdout);
+    assert!(help_text.contains("Extract text from a document"));
+    assert!(help_text.contains("--chunk-size"));
+    assert!(help_text.contains("--chunk-overlap"));
+}
+
+/// `detect --help` succeeds and mentions its description.
+#[test]
+fn test_detect_help() {
+    build_binary();
+
+    let run = Command::new(get_binary_path())
+        .arg("detect")
+        .arg("--help")
+        .output()
+        .expect("Failed to execute detect --help");
+
+    assert!(run.status.success());
+
+    let help_text = String::from_utf8_lossy(&run.stdout);
+    assert!(help_text.contains("Detect MIME type"));
+}
+
+/// `batch --help` succeeds and mentions its description.
+#[test]
+fn test_batch_help() {
+    build_binary();
+
+    let run = Command::new(get_binary_path())
+        .arg("batch")
+        .arg("--help")
+        .output()
+        .expect("Failed to execute batch --help");
+
+    assert!(run.status.success());
+
+    let help_text = String::from_utf8_lossy(&run.stdout);
+    assert!(help_text.contains("Batch extract from multiple documents"));
+}
+
+// ── Extract command flag parsing tests ──────────────────────────────
+
+/// Every extraction-override flag in the list below must be advertised by
+/// `extract --help`.
+#[test]
+fn test_extract_help_shows_all_extraction_override_flags() {
+    build_binary();
+
+    let run = Command::new(get_binary_path())
+        .arg("extract")
+        .arg("--help")
+        .output()
+        .expect("Failed to execute extract --help");
+
+    assert!(run.status.success());
+
+    let help_text = String::from_utf8_lossy(&run.stdout);
+
+    // The ExtractionOverrides flags that must appear in the help output.
+    let expected_flags = [
+        "--ocr",
+        "--ocr-backend",
+        "--ocr-language",
+        "--force-ocr",
+        "--no-cache",
+        "--ocr-auto-rotate",
+        "--chunk",
+        "--chunk-size",
+        "--chunk-overlap",
+        "--chunking-tokenizer",
+        "--content-format",
+        "--include-structure",
+        "--quality",
+        "--detect-language",
+        "--layout",
+        "--layout-confidence",
+        "--layout-table-model",
+        "--acceleration",
+        "--max-concurrent",
+        "--max-threads",
+        "--extract-pages",
+        "--page-markers",
+        "--extract-images",
+        "--target-dpi",
+        "--pdf-password",
+        "--token-reduction",
+        "--msg-codepage",
+    ];
+
+    // Fail on the first flag, in list order, that the help text lacks.
+    for flag in expected_flags.iter() {
+        if !help_text.contains(*flag) {
+            panic!(
+                "Extract --help should show flag '{}', but it was not found in output:\n{}",
+                flag, help_text
+            );
+        }
+    }
+}
+
+// ── Batch command flag parity test ──────────────────────────────────
+
+/// Each flag in the shared list must be advertised by both `extract --help`
+/// and `batch --help`.
+#[test]
+fn test_batch_has_same_extraction_flags_as_extract() {
+    build_binary();
+
+    let extract_run = Command::new(get_binary_path())
+        .arg("extract")
+        .arg("--help")
+        .output()
+        .expect("Failed to execute extract --help");
+
+    let batch_run = Command::new(get_binary_path())
+        .arg("batch")
+        .arg("--help")
+        .output()
+        .expect("Failed to execute batch --help");
+
+    assert!(extract_run.status.success());
+    assert!(batch_run.status.success());
+
+    let extract_help = String::from_utf8_lossy(&extract_run.stdout);
+    let batch_help = String::from_utf8_lossy(&batch_run.stdout);
+
+    // Extraction override flags expected on both commands.
+    let shared_flags = [
+        "--ocr",
+        "--ocr-backend",
+        "--ocr-language",
+        "--force-ocr",
+        "--no-cache",
+        "--chunk",
+        "--chunk-size",
+        "--chunk-overlap",
+        "--content-format",
+        "--quality",
+        "--detect-language",
+        "--layout",
+        "--layout-confidence",
+        "--layout-table-model",
+        "--acceleration",
+        "--max-concurrent",
+        "--max-threads",
+        "--extract-pages",
+        "--page-markers",
+        "--extract-images",
+        "--target-dpi",
+        "--pdf-password",
+        "--token-reduction",
+        "--msg-codepage",
+    ];
+
+    // For each flag, check `extract` first and then `batch`.
+    for flag in shared_flags.iter() {
+        if !extract_help.contains(*flag) {
+            panic!("Extract should have flag '{}' but it's missing", flag);
+        }
+        if !batch_help.contains(*flag) {
+            panic!(
+                "Batch should have flag '{}' (parity with extract) but it's missing",
+                flag
+            );
+        }
+    }
+}
+
+// ── Validation error tests ──────────────────────────────────────────
+//
+// NOTE: The CLI validates file existence *before* override validation,
+// so we must provide a real file to reach the override validation stage.
+
+/// Write a small `dummy.pdf` into a fresh temporary directory and return the
+/// directory guard together with the file's path as a `String`.
+///
+/// Keep the returned `tempfile::TempDir` alive for as long as the file is
+/// needed; dropping it removes the directory and the file with it.
+fn create_temp_file() -> (tempfile::TempDir, String) {
+    let scratch = tempdir().expect("Failed to create temp dir");
+    let dummy = scratch.path().join("dummy.pdf");
+    std::fs::write(&dummy, b"dummy content").expect("Failed to write temp file");
+    (scratch, dummy.to_string_lossy().into_owned())
+}
+
+/// `extract --chunk-size 0` on a real (temporary) file must fail with an
+/// error that mentions the chunk size.
+#[test]
+fn test_extract_chunk_size_zero_error() {
+    build_binary();
+    let (_keep_alive, input) = create_temp_file();
+
+    let run = Command::new(get_binary_path())
+        .args(["extract", "--chunk-size", "0"])
+        .arg(&input)
+        .output()
+        .expect("Failed to execute extract command");
+
+    assert!(!run.status.success(), "Should fail when chunk size is 0");
+
+    let err_text = String::from_utf8_lossy(&run.stderr);
+    let mentions_chunk_size = ["chunk size", "Chunk size", "Invalid chunk size"]
+        .iter()
+        .any(|needle| err_text.contains(*needle));
+    assert!(
+        mentions_chunk_size,
+        "Error should mention chunk size, got: {err_text}"
+    );
+}
+
+/// `extract` must fail when `--chunk-overlap` is larger than `--chunk-size`,
+/// with an error that mentions the overlap.
+#[test]
+fn test_extract_chunk_overlap_exceeds_size_error() {
+    build_binary();
+    let (_keep_alive, input) = create_temp_file();
+
+    let run = Command::new(get_binary_path())
+        .args(["extract", "--chunk-size", "10", "--chunk-overlap", "20"])
+        .arg(&input)
+        .output()
+        .expect("Failed to execute extract command");
+
+    assert!(!run.status.success(), "Should fail when overlap exceeds chunk size");
+
+    let err_text = String::from_utf8_lossy(&run.stderr);
+    let mentions_overlap = ["overlap", "Overlap", "Invalid chunk overlap"]
+        .iter()
+        .any(|needle| err_text.contains(*needle));
+    assert!(
+        mentions_overlap,
+        "Error should mention overlap constraint, got: {err_text}"
+    );
+}
+
+/// `extract --layout-confidence 2.0` must fail, either through validation or
+/// because the flag is unknown to the binary.
+#[test]
+fn test_extract_layout_confidence_out_of_range_error() {
+    build_binary();
+    let (_keep_alive, input) = create_temp_file();
+
+    let run = Command::new(get_binary_path())
+        .args(["extract", "--layout-confidence", "2.0"])
+        .arg(&input)
+        .output()
+        .expect("Failed to execute extract command");
+
+    // The flag is feature-gated behind layout-detection. A binary built
+    // without that feature has clap reject it as an unknown argument, which
+    // is also a failure.
+    assert!(
+        !run.status.success(),
+        "Should fail for layout confidence out of range"
+    );
+
+    let err_text = String::from_utf8_lossy(&run.stderr);
+    let mentions_cause = ["confidence", "layout", "unexpected argument"]
+        .iter()
+        .any(|needle| err_text.contains(*needle));
+    assert!(
+        mentions_cause,
+        "Error should mention confidence or layout, got: {err_text}"
+    );
+}
+
+/// Combining `--layout false` with `--layout-confidence` must fail.
+#[test]
+fn test_extract_layout_false_with_confidence_error() {
+    build_binary();
+    let (_keep_alive, input) = create_temp_file();
+
+    let run = Command::new(get_binary_path())
+        .args(["extract", "--layout", "false", "--layout-confidence", "0.5"])
+        .arg(&input)
+        .output()
+        .expect("Failed to execute extract command");
+
+    // With the layout-detection feature, validation should reject the
+    // combination; without it, clap rejects the unknown flags. Either way
+    // the command must not succeed.
+    assert!(
+        !run.status.success(),
+        "Should fail when --layout false is combined with --layout-confidence"
+    );
+}
+
+#[test]
+fn test_extract_target_dpi_zero_error() {
+    build_binary();
+    let (_dir, file_path) = create_temp_file();
+
+    let output = Command::new(get_binary_path())
+        .args(["extract", "--target-dpi", "0", &file_path])
+        .output()
+        .expect("Failed to execute extract command");
+
+    assert!(!output.status.success(), "Should fail when target DPI is 0");
+
+    let stderr = String::from_utf8_lossy(&output.stderr);
+    assert!(
+        stderr.contains("DPI") || stderr.contains("dpi") || stderr.contains("target") || stderr.contains("Invalid"),
+        "Error should mention DPI range, got: {}",
+        stderr
+    );
+}
+
+// ── Completions test ────────────────────────────────────────────────
+
+/// `completions bash` succeeds and emits a script that references `kreuzberg`.
+#[test]
+fn test_completions_bash_produces_output() {
+    build_binary();
+
+    let output = Command::new(get_binary_path())
+        .args(["completions", "bash"])
+        .output()
+        .expect("Failed to execute completions command");
+
+    assert!(
+        output.status.success(),
+        "Completions command failed: {}",
+        String::from_utf8_lossy(&output.stderr)
+    );
+
+    let stdout = String::from_utf8_lossy(&output.stdout);
+    assert!(!stdout.is_empty(), "Completions output should not be empty");
+    // bash completions should contain the command name.
+    // The preview is cut by characters, not bytes: slicing a `str` at a byte
+    // offset panics if the offset falls inside a multi-byte character, which
+    // would mask the real assertion failure.
+    assert!(
+        stdout.contains("kreuzberg"),
+        "Bash completions should reference 'kreuzberg', got: {}",
+        stdout.chars().take(200).collect::<String>()
+    );
+}
+
+/// `completions zsh` succeeds and prints a non-empty script.
+#[test]
+fn test_completions_zsh_produces_output() {
+    build_binary();
+
+    let run = Command::new(get_binary_path())
+        .arg("completions")
+        .arg("zsh")
+        .output()
+        .expect("Failed to execute completions command");
+
+    if !run.status.success() {
+        panic!(
+            "Completions command failed: {}",
+            String::from_utf8_lossy(&run.stderr)
+        );
+    }
+
+    let script = String::from_utf8_lossy(&run.stdout);
+    assert!(!script.is_empty(), "Zsh completions output should not be empty");
+}
+
+/// `completions fish` succeeds and prints a non-empty script.
+#[test]
+fn test_completions_fish_produces_output() {
+    build_binary();
+
+    let run = Command::new(get_binary_path())
+        .arg("completions")
+        .arg("fish")
+        .output()
+        .expect("Failed to execute completions command");
+
+    if !run.status.success() {
+        panic!(
+            "Completions command failed: {}",
+            String::from_utf8_lossy(&run.stderr)
+        );
+    }
+
+    let script = String::from_utf8_lossy(&run.stdout);
+    assert!(!script.is_empty(), "Fish completions output should not be empty");
+}
+
+// ── Embed help test ─────────────────────────────────────────────────
+
+/// `embed --help` lists `--text`, `--preset`, `--format` and its description.
+///
+/// `embed` is feature-gated: when the binary reports the subcommand as
+/// unknown, the test is skipped. Any other failure of `embed --help` is
+/// reported directly with the command's stderr instead of falling through to
+/// the stdout assertions, which would otherwise fail with a misleading
+/// "flag missing" message.
+#[test]
+fn test_embed_help_shows_correct_flags() {
+    build_binary();
+
+    let output = Command::new(get_binary_path())
+        .args(["embed", "--help"])
+        .output()
+        .expect("Failed to execute embed --help");
+
+    if !output.status.success() {
+        let stderr = String::from_utf8_lossy(&output.stderr);
+        if stderr.contains("unrecognized subcommand") || stderr.contains("invalid subcommand") {
+            // The subcommand is not compiled into this binary.
+            eprintln!("Skipping test: embed subcommand not available");
+            return;
+        }
+        panic!("Embed --help failed: {}", stderr);
+    }
+
+    let stdout = String::from_utf8_lossy(&output.stdout);
+    assert!(
+        stdout.contains("--text"),
+        "Embed help should show --text flag, got: {}",
+        stdout
+    );
+    assert!(
+        stdout.contains("--preset"),
+        "Embed help should show --preset flag, got: {}",
+        stdout
+    );
+    assert!(
+        stdout.contains("--format"),
+        "Embed help should show --format flag, got: {}",
+        stdout
+    );
+    assert!(
+        stdout.contains("Generate embeddings"),
+        "Embed help should describe embedding generation, got: {}",
+        stdout
+    );
+}
+
+// ── Chunk help test ─────────────────────────────────────────────────
+
+/// `chunk --help` succeeds and lists its flags and description.
+#[test]
+fn test_chunk_help_shows_correct_flags() {
+    build_binary();
+
+    let run = Command::new(get_binary_path())
+        .arg("chunk")
+        .arg("--help")
+        .output()
+        .expect("Failed to execute chunk --help");
+
+    assert!(
+        run.status.success(),
+        "Chunk --help failed: {}",
+        String::from_utf8_lossy(&run.stderr)
+    );
+
+    let help_text = String::from_utf8_lossy(&run.stdout);
+
+    // (text that must appear, failure message prefix), checked in this order.
+    let expectations = [
+        ("--text", "Chunk help should show --text flag"),
+        ("--chunk-size", "Chunk help should show --chunk-size flag"),
+        ("--chunk-overlap", "Chunk help should show --chunk-overlap flag"),
+        ("--chunker-type", "Chunk help should show --chunker-type flag"),
+        ("--format", "Chunk help should show --format flag"),
+        ("Chunk text", "Chunk help should describe text chunking"),
+    ];
+
+    for &(needle, complaint) in expectations.iter() {
+        assert!(help_text.contains(needle), "{}, got: {}", complaint, help_text);
+    }
+}
+
+// ── Style module NO_COLOR test ──────────────────────────────────────
+
+/// With `NO_COLOR` set in the child's environment, `detect` output must
+/// contain no ANSI escape sequences.
+///
+/// Returns early (and passes) when the fixture file is absent.
+#[test]
+fn test_no_color_env_disables_ansi_in_output() {
+    build_binary();
+
+    let test_file = get_test_file("text/simple.txt");
+    if !std::path::Path::new(&test_file).exists() {
+        // Say so on stderr rather than passing silently, matching the other
+        // fixture-dependent tests in this file.
+        eprintln!("Skipping test: {} not found", test_file);
+        return;
+    }
+
+    // Run with NO_COLOR set - output should have no ANSI escape sequences
+    let output = Command::new(get_binary_path())
+        .env("NO_COLOR", "1")
+        .args(["detect", &test_file])
+        .output()
+        .expect("Failed to execute detect command");
+
+    assert!(
+        output.status.success(),
+        "Detect failed: {}",
+        String::from_utf8_lossy(&output.stderr)
+    );
+
+    let stdout = String::from_utf8_lossy(&output.stdout);
+    assert!(
+        !stdout.contains("\x1b["),
+        "Output should not contain ANSI escape sequences when NO_COLOR is set, got: {:?}",
+        stdout
+    );
+}
+
+// ── Additional validation edge cases ────────────────────────────────
+
+/// `extract --chunk-size 2000000` on a real (temporary) file must fail with
+/// an error that mentions the chunk size or its limit.
+#[test]
+fn test_extract_chunk_size_too_large_error() {
+    build_binary();
+    let (_keep_alive, input) = create_temp_file();
+
+    let run = Command::new(get_binary_path())
+        .args(["extract", "--chunk-size", "2000000"])
+        .arg(&input)
+        .output()
+        .expect("Failed to execute extract command");
+
+    assert!(!run.status.success(), "Should fail when chunk size exceeds limit");
+
+    let err_text = String::from_utf8_lossy(&run.stderr);
+    let mentions_limit = ["chunk size", "Chunk size", "1,000,000"]
+        .iter()
+        .any(|needle| err_text.contains(*needle));
+    assert!(
+        mentions_limit,
+        "Error should mention chunk size limit, got: {err_text}"
+    );
+}
+
+#[test]
+fn test_extract_target_dpi_too_high_error() {
+    build_binary();
+    let (_dir, file_path) = create_temp_file();
+
+    let output = Command::new(get_binary_path())
+        .args(["extract", "--target-dpi", "5000", &file_path])
+        .output()
+        .expect("Failed to execute extract command");
+
+    assert!(!output.status.success(), "Should fail when target DPI exceeds limit");
+
+    let stderr = String::from_utf8_lossy(&output.stderr);
+    assert!(
+        stderr.contains("DPI") || stderr.contains("dpi") || stderr.contains("2400") || stderr.contains("Invalid"),
+        "Error should mention DPI range, got: {}",
+        stderr
+    );
+}
--- a/crates/kreuzberg-cli/tests/config_discovery_test.rs
+++ b/crates/kreuzberg-cli/tests/config_discovery_test.rs
@@ -0,0 +1,617 @@
+//! Integration tests for CLI config file discovery.
+//!
+//! These tests verify that the CLI correctly discovers and loads configuration files
+//! in various formats (.toml, .yaml, .json) with case-insensitive extension
+//! matching, explicit --config flag support, and proper error handling.
+
+use std::fs;
+use std::path::PathBuf;
+use std::process::Command;
+use tempfile::tempdir;
+
+/// Location of the debug `kreuzberg` binary, expressed relative to this crate's
+/// manifest directory (two levels up, then `target/debug/kreuzberg`).
+fn get_binary_path() -> String {
+    // Both pieces are known at compile time, so the path is assembled with `concat!`.
+    concat!(env!("CARGO_MANIFEST_DIR"), "/../../target/debug/kreuzberg").to_string()
+}
+
+/// Runs `cargo build --bin kreuzberg` and panics if cargo cannot be started or
+/// exits with a failure status.
+fn build_binary() {
+    let mut cargo = Command::new("cargo");
+    cargo.args(["build", "--bin", "kreuzberg"]);
+
+    let exit = cargo.status().expect("Failed to build kreuzberg binary");
+    assert!(exit.success(), "Failed to build kreuzberg binary");
+}
+
+/// Path of the `test_documents` directory, located two levels above this crate's
+/// manifest directory.
+fn get_test_documents_dir() -> PathBuf {
+    let crate_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
+    // `ancestors()` yields the path itself first, so index 2 is the grandparent.
+    let grandparent = crate_dir.ancestors().nth(2).unwrap();
+    grandparent.join("test_documents")
+}
+
+/// Resolves `relative_path` against `test_documents/` and returns it as a
+/// (lossily converted) `String`.
+fn get_test_file(relative_path: &str) -> String {
+    let full_path = get_test_documents_dir().join(relative_path);
+    full_path.to_string_lossy().into_owned()
+}
+
+/// Places a `.kreuzberg.toml` in the working directory (no `--config` flag) and
+/// expects `extract` on the sample text file to succeed.
+#[test]
+fn test_discover_kreuzberg_toml_in_current_directory() {
+    build_binary();
+
+    let workdir = tempdir().unwrap();
+    let toml_body = r#"
+use_cache = false
+enable_quality_processing = false
+"#;
+    fs::write(workdir.path().join(".kreuzberg.toml"), toml_body).unwrap();
+
+    // Without the fixture there is nothing to extract, so the test returns early.
+    let sample = get_test_file("text/simple.txt");
+    if !PathBuf::from(&sample).exists() {
+        tracing::debug!("Skipping test: {} not found", sample);
+        return;
+    }
+
+    let run = Command::new(get_binary_path())
+        .current_dir(workdir.path())
+        .args(["extract", sample.as_str()])
+        .output()
+        .expect("Failed to execute kreuzberg");
+
+    let stderr = String::from_utf8_lossy(&run.stderr);
+    assert!(run.status.success(), "Command failed: {}", stderr);
+}
+
+/// Places a `.kreuzberg.yaml` in the working directory (no `--config` flag) and
+/// expects `extract` on the sample text file to succeed.
+#[test]
+fn test_discover_kreuzberg_yaml_in_current_directory() {
+    build_binary();
+
+    let workdir = tempdir().unwrap();
+    let yaml_body = r#"
+use_cache: false
+enable_quality_processing: false
+"#;
+    fs::write(workdir.path().join(".kreuzberg.yaml"), yaml_body).unwrap();
+
+    // Without the fixture there is nothing to extract, so the test returns early.
+    let sample = get_test_file("text/simple.txt");
+    if !PathBuf::from(&sample).exists() {
+        tracing::debug!("Skipping test: {} not found", sample);
+        return;
+    }
+
+    let run = Command::new(get_binary_path())
+        .current_dir(workdir.path())
+        .args(["extract", sample.as_str()])
+        .output()
+        .expect("Failed to execute kreuzberg");
+
+    let stderr = String::from_utf8_lossy(&run.stderr);
+    assert!(run.status.success(), "Command failed: {}", stderr);
+}
+
+/// Places a `.kreuzberg.yml` (short YAML extension) in the working directory,
+/// without passing `--config`, and expects `extract` on the sample text file to
+/// succeed.
+#[test]
+fn test_discover_kreuzberg_yml_in_current_directory() {
+    build_binary();
+
+    let dir = tempdir().unwrap();
+    // This test covers the `.yml` spelling; the `.yaml` spelling has its own test.
+    let config_path = dir.path().join(".kreuzberg.yml");
+
+    fs::write(
+        &config_path,
+        r#"
+use_cache: false
+enable_quality_processing: false
+"#,
+    )
+    .unwrap();
+
+    // Without the fixture there is nothing to extract, so the test returns early.
+    let test_file = get_test_file("text/simple.txt");
+    if !PathBuf::from(&test_file).exists() {
+        tracing::debug!("Skipping test: {} not found", test_file);
+        return;
+    }
+
+    let output = Command::new(get_binary_path())
+        .current_dir(dir.path())
+        .args(["extract", test_file.as_str()])
+        .output()
+        .expect("Failed to execute kreuzberg");
+
+    assert!(
+        output.status.success(),
+        "Command failed: {}",
+        String::from_utf8_lossy(&output.stderr)
+    );
+}
+
+/// Places a `.kreuzberg.json` in the working directory (no `--config` flag) and
+/// expects `extract` on the sample text file to succeed.
+#[test]
+fn test_discover_kreuzberg_json_in_current_directory() {
+    build_binary();
+
+    let workdir = tempdir().unwrap();
+    let json_body = r#"{
+    "use_cache": false,
+    "enable_quality_processing": false
+}"#;
+    fs::write(workdir.path().join(".kreuzberg.json"), json_body).unwrap();
+
+    // Without the fixture there is nothing to extract, so the test returns early.
+    let sample = get_test_file("text/simple.txt");
+    if !PathBuf::from(&sample).exists() {
+        tracing::debug!("Skipping test: {} not found", sample);
+        return;
+    }
+
+    let run = Command::new(get_binary_path())
+        .current_dir(workdir.path())
+        .args(["extract", sample.as_str()])
+        .output()
+        .expect("Failed to execute kreuzberg");
+
+    let stderr = String::from_utf8_lossy(&run.stderr);
+    assert!(run.status.success(), "Command failed: {}", stderr);
+}
+
+/// Passes `--config custom.TOML` (upper-case extension) and expects `extract` to succeed.
+#[test]
+fn test_case_insensitive_toml_extension() {
+    build_binary();
+
+    let workdir = tempdir().unwrap();
+    let config_file = workdir.path().join("custom.TOML");
+    let toml_body = r#"
+use_cache = false
+"#;
+    fs::write(&config_file, toml_body).unwrap();
+
+    // Without the fixture there is nothing to extract, so the test returns early.
+    let sample = get_test_file("text/simple.txt");
+    if !PathBuf::from(&sample).exists() {
+        tracing::debug!("Skipping test: {} not found", sample);
+        return;
+    }
+
+    let config_arg = config_file.to_string_lossy().into_owned();
+    let mut cmd = Command::new(get_binary_path());
+    cmd.current_dir(workdir.path())
+        .args(["extract", "--config", config_arg.as_str(), sample.as_str()]);
+    let run = cmd.output().expect("Failed to execute kreuzberg");
+
+    let stderr = String::from_utf8_lossy(&run.stderr);
+    assert!(run.status.success(), "Command failed: {}", stderr);
+}
+
+/// Passes `--config custom.Yaml` (mixed-case extension) and expects `extract` to succeed.
+#[test]
+fn test_case_insensitive_yaml_extension() {
+    build_binary();
+
+    let workdir = tempdir().unwrap();
+    let config_file = workdir.path().join("custom.Yaml");
+    let yaml_body = r#"
+use_cache: false
+"#;
+    fs::write(&config_file, yaml_body).unwrap();
+
+    // Without the fixture there is nothing to extract, so the test returns early.
+    let sample = get_test_file("text/simple.txt");
+    if !PathBuf::from(&sample).exists() {
+        tracing::debug!("Skipping test: {} not found", sample);
+        return;
+    }
+
+    let config_arg = config_file.to_string_lossy().into_owned();
+    let mut cmd = Command::new(get_binary_path());
+    cmd.current_dir(workdir.path())
+        .args(["extract", "--config", config_arg.as_str(), sample.as_str()]);
+    let run = cmd.output().expect("Failed to execute kreuzberg");
+
+    let stderr = String::from_utf8_lossy(&run.stderr);
+    assert!(run.status.success(), "Command failed: {}", stderr);
+}
+
+/// Passes `--config custom.YML` (upper-case short extension) and expects `extract` to succeed.
+#[test]
+fn test_case_insensitive_yml_extension() {
+    build_binary();
+
+    let workdir = tempdir().unwrap();
+    let config_file = workdir.path().join("custom.YML");
+    let yaml_body = r#"
+use_cache: false
+"#;
+    fs::write(&config_file, yaml_body).unwrap();
+
+    // Without the fixture there is nothing to extract, so the test returns early.
+    let sample = get_test_file("text/simple.txt");
+    if !PathBuf::from(&sample).exists() {
+        tracing::debug!("Skipping test: {} not found", sample);
+        return;
+    }
+
+    let config_arg = config_file.to_string_lossy().into_owned();
+    let mut cmd = Command::new(get_binary_path());
+    cmd.current_dir(workdir.path())
+        .args(["extract", "--config", config_arg.as_str(), sample.as_str()]);
+    let run = cmd.output().expect("Failed to execute kreuzberg");
+
+    let stderr = String::from_utf8_lossy(&run.stderr);
+    assert!(run.status.success(), "Command failed: {}", stderr);
+}
+
+/// Passes `--config custom.JSON` (upper-case extension) and expects `extract` to succeed.
+#[test]
+fn test_case_insensitive_json_extension() {
+    build_binary();
+
+    let workdir = tempdir().unwrap();
+    let config_file = workdir.path().join("custom.JSON");
+    let json_body = r#"{
+    "use_cache": false
+}"#;
+    fs::write(&config_file, json_body).unwrap();
+
+    // Without the fixture there is nothing to extract, so the test returns early.
+    let sample = get_test_file("text/simple.txt");
+    if !PathBuf::from(&sample).exists() {
+        tracing::debug!("Skipping test: {} not found", sample);
+        return;
+    }
+
+    let config_arg = config_file.to_string_lossy().into_owned();
+    let mut cmd = Command::new(get_binary_path());
+    cmd.current_dir(workdir.path())
+        .args(["extract", "--config", config_arg.as_str(), sample.as_str()]);
+    let run = cmd.output().expect("Failed to execute kreuzberg");
+
+    let stderr = String::from_utf8_lossy(&run.stderr);
+    assert!(run.status.success(), "Command failed: {}", stderr);
+}
+
+/// Points `--config` at a TOML file in a temporary directory and expects `extract` to succeed.
+#[test]
+fn test_explicit_config_path_toml() {
+    build_binary();
+
+    let scratch = tempdir().unwrap();
+    let config_file = scratch.path().join("custom_config.toml");
+    let toml_body = r#"
+use_cache = false
+enable_quality_processing = false
+"#;
+    fs::write(&config_file, toml_body).unwrap();
+
+    // Without the fixture there is nothing to extract, so the test returns early.
+    let sample = get_test_file("text/simple.txt");
+    if !PathBuf::from(&sample).exists() {
+        tracing::debug!("Skipping test: {} not found", sample);
+        return;
+    }
+
+    let config_arg = config_file.to_string_lossy().into_owned();
+    let run = Command::new(get_binary_path())
+        .arg("extract")
+        .arg("--config")
+        .arg(config_arg.as_str())
+        .arg(sample.as_str())
+        .output()
+        .expect("Failed to execute kreuzberg");
+
+    let stderr = String::from_utf8_lossy(&run.stderr);
+    assert!(run.status.success(), "Command failed: {}", stderr);
+}
+
+/// Points `--config` at a YAML file in a temporary directory and expects `extract` to succeed.
+#[test]
+fn test_explicit_config_path_yaml() {
+    build_binary();
+
+    let scratch = tempdir().unwrap();
+    let config_file = scratch.path().join("custom_config.yaml");
+    let yaml_body = r#"
+use_cache: false
+enable_quality_processing: false
+"#;
+    fs::write(&config_file, yaml_body).unwrap();
+
+    // Without the fixture there is nothing to extract, so the test returns early.
+    let sample = get_test_file("text/simple.txt");
+    if !PathBuf::from(&sample).exists() {
+        tracing::debug!("Skipping test: {} not found", sample);
+        return;
+    }
+
+    let config_arg = config_file.to_string_lossy().into_owned();
+    let run = Command::new(get_binary_path())
+        .arg("extract")
+        .arg("--config")
+        .arg(config_arg.as_str())
+        .arg(sample.as_str())
+        .output()
+        .expect("Failed to execute kreuzberg");
+
+    let stderr = String::from_utf8_lossy(&run.stderr);
+    assert!(run.status.success(), "Command failed: {}", stderr);
+}
+
+/// Points `--config` at a JSON file in a temporary directory and expects `extract` to succeed.
+#[test]
+fn test_explicit_config_path_json() {
+    build_binary();
+
+    let scratch = tempdir().unwrap();
+    let config_file = scratch.path().join("custom_config.json");
+    let json_body = r#"{
+    "use_cache": false,
+    "enable_quality_processing": false
+}"#;
+    fs::write(&config_file, json_body).unwrap();
+
+    // Without the fixture there is nothing to extract, so the test returns early.
+    let sample = get_test_file("text/simple.txt");
+    if !PathBuf::from(&sample).exists() {
+        tracing::debug!("Skipping test: {} not found", sample);
+        return;
+    }
+
+    let config_arg = config_file.to_string_lossy().into_owned();
+    let run = Command::new(get_binary_path())
+        .arg("extract")
+        .arg("--config")
+        .arg(config_arg.as_str())
+        .arg(sample.as_str())
+        .output()
+        .expect("Failed to execute kreuzberg");
+
+    let stderr = String::from_utf8_lossy(&run.stderr);
+    assert!(run.status.success(), "Command failed: {}", stderr);
+}
+
+/// Passing a `.txt` file via `--config` must fail, and stderr must name at least
+/// one of `.toml`, `.yaml` or `.json`.
+#[test]
+fn test_invalid_config_extension() {
+    build_binary();
+
+    let scratch = tempdir().unwrap();
+    let config_file = scratch.path().join("config.txt");
+    fs::write(&config_file, "invalid content").unwrap();
+
+    // Without the fixture there is nothing to extract, so the test returns early.
+    let sample = get_test_file("text/simple.txt");
+    if !PathBuf::from(&sample).exists() {
+        tracing::debug!("Skipping test: {} not found", sample);
+        return;
+    }
+
+    let config_arg = config_file.to_string_lossy().into_owned();
+    let mut cmd = Command::new(get_binary_path());
+    cmd.args(["extract", "--config", config_arg.as_str(), sample.as_str()]);
+    let run = cmd.output().expect("Failed to execute kreuzberg");
+
+    assert!(!run.status.success());
+
+    let stderr = String::from_utf8_lossy(&run.stderr);
+    let names_supported_extension = [".toml", ".yaml", ".json"]
+        .iter()
+        .any(|ext| stderr.contains(*ext));
+    assert!(
+        names_supported_extension,
+        "Error message should mention supported extensions: {}",
+        stderr
+    );
+}
+
+/// A syntactically broken TOML file passed via `--config` must make `extract` fail.
+#[test]
+fn test_malformed_toml_config() {
+    build_binary();
+
+    let scratch = tempdir().unwrap();
+    let broken_config = scratch.path().join("bad_config.toml");
+    fs::write(&broken_config, "use_cache = [[[[[").unwrap();
+
+    // Without the fixture there is nothing to extract, so the test returns early.
+    let sample = get_test_file("text/simple.txt");
+    if !PathBuf::from(&sample).exists() {
+        tracing::debug!("Skipping test: {} not found", sample);
+        return;
+    }
+
+    let config_arg = broken_config.to_string_lossy().into_owned();
+    let mut cmd = Command::new(get_binary_path());
+    cmd.args(["extract", "--config", config_arg.as_str(), sample.as_str()]);
+    let run = cmd.output().expect("Failed to execute kreuzberg");
+
+    assert!(!run.status.success());
+}
+
+/// A syntactically broken YAML file passed via `--config` must make `extract` fail.
+#[test]
+fn test_malformed_yaml_config() {
+    build_binary();
+
+    let scratch = tempdir().unwrap();
+    let broken_config = scratch.path().join("bad_config.yaml");
+    fs::write(&broken_config, "use_cache: [[[[[").unwrap();
+
+    // Without the fixture there is nothing to extract, so the test returns early.
+    let sample = get_test_file("text/simple.txt");
+    if !PathBuf::from(&sample).exists() {
+        tracing::debug!("Skipping test: {} not found", sample);
+        return;
+    }
+
+    let config_arg = broken_config.to_string_lossy().into_owned();
+    let mut cmd = Command::new(get_binary_path());
+    cmd.args(["extract", "--config", config_arg.as_str(), sample.as_str()]);
+    let run = cmd.output().expect("Failed to execute kreuzberg");
+
+    assert!(!run.status.success());
+}
+
+/// A syntactically broken JSON file passed via `--config` must make `extract` fail.
+#[test]
+fn test_malformed_json_config() {
+    build_binary();
+
+    let scratch = tempdir().unwrap();
+    let broken_config = scratch.path().join("bad_config.json");
+    fs::write(&broken_config, r#"{"use_cache": [[[[[}"#).unwrap();
+
+    // Without the fixture there is nothing to extract, so the test returns early.
+    let sample = get_test_file("text/simple.txt");
+    if !PathBuf::from(&sample).exists() {
+        tracing::debug!("Skipping test: {} not found", sample);
+        return;
+    }
+
+    let config_arg = broken_config.to_string_lossy().into_owned();
+    let mut cmd = Command::new(get_binary_path());
+    cmd.args(["extract", "--config", config_arg.as_str(), sample.as_str()]);
+    let run = cmd.output().expect("Failed to execute kreuzberg");
+
+    assert!(!run.status.success());
+}
+
+/// `--config` pointing at a path that was never created must make `extract` fail.
+#[test]
+fn test_nonexistent_config_file() {
+    build_binary();
+
+    let scratch = tempdir().unwrap();
+    // Only the path is built; no file is written to it.
+    let missing_config = scratch.path().join("nonexistent.toml");
+
+    let sample = get_test_file("text/simple.txt");
+    if !PathBuf::from(&sample).exists() {
+        tracing::debug!("Skipping test: {} not found", sample);
+        return;
+    }
+
+    let config_arg = missing_config.to_string_lossy().into_owned();
+    let mut cmd = Command::new(get_binary_path());
+    cmd.args(["extract", "--config", config_arg.as_str(), sample.as_str()]);
+    let run = cmd.output().expect("Failed to execute kreuzberg");
+
+    assert!(!run.status.success());
+}
+
+/// Runs `extract` from an empty temporary directory, with no `--config` flag,
+/// and expects it to succeed.
+#[test]
+fn test_default_config_when_no_file_found() {
+    build_binary();
+
+    let empty_dir = tempdir().unwrap();
+
+    // Without the fixture there is nothing to extract, so the test returns early.
+    let sample = get_test_file("text/simple.txt");
+    if !PathBuf::from(&sample).exists() {
+        tracing::debug!("Skipping test: {} not found", sample);
+        return;
+    }
+
+    let mut cmd = Command::new(get_binary_path());
+    cmd.current_dir(empty_dir.path()).args(["extract", sample.as_str()]);
+    let run = cmd.output().expect("Failed to execute kreuzberg");
+
+    let stderr = String::from_utf8_lossy(&run.stderr);
+    assert!(run.status.success(), "Command failed: {}", stderr);
+}
+
+/// A TOML config that sets `use_cache` to a string instead of a boolean must make
+/// `extract` fail.
+#[test]
+fn test_invalid_config_values() {
+    build_binary();
+
+    let scratch = tempdir().unwrap();
+    let config_file = scratch.path().join("invalid.toml");
+    let toml_body = r#"
+use_cache = "not_a_bool"
+"#;
+    fs::write(&config_file, toml_body).unwrap();
+
+    // Without the fixture there is nothing to extract, so the test returns early.
+    let sample = get_test_file("text/simple.txt");
+    if !PathBuf::from(&sample).exists() {
+        tracing::debug!("Skipping test: {} not found", sample);
+        return;
+    }
+
+    let config_arg = config_file.to_string_lossy().into_owned();
+    let mut cmd = Command::new(get_binary_path());
+    cmd.args(["extract", "--config", config_arg.as_str(), sample.as_str()]);
+    let run = cmd.output().expect("Failed to execute kreuzberg");
+
+    assert!(!run.status.success());
+}
--- a/crates/kreuzberg-cli/tests/config_env_overrides_test.rs
+++ b/crates/kreuzberg-cli/tests/config_env_overrides_test.rs
@@ -0,0 +1,46 @@
+//! Regression test for issue #773.
+//! Validates that environment variable overrides are correctly applied during configuration loading.
+
+use kreuzberg::{EmbeddingModelType, ExtractionConfig};
+
+/// Regression #773: with `KREUZBERG_OCR_LANGUAGE=fra` set, `apply_env_overrides`
+/// must leave the config with an OCR section whose language is `fra`.
+#[test]
+fn test_regression_773_env_override_loading() {
+    let mut config = ExtractionConfig::default();
+
+    // Guard against a vacuous pass: the default must not already be "fra".
+    if let Some(ref ocr) = config.ocr {
+        assert_ne!(ocr.language, "fra");
+    }
+
+    // SAFETY: changing the process environment is only sound while no other thread
+    // reads or writes it. NOTE(review): the test harness may run tests on several
+    // threads; consider serializing the env-mutating tests in this file.
+    unsafe { std::env::set_var("KREUZBERG_OCR_LANGUAGE", "fra") };
+    let applied = config.apply_env_overrides();
+    // Clean up before inspecting the result so a failure cannot leave the variable
+    // set for other tests running in the same process.
+    // SAFETY: same condition as for `set_var` above.
+    unsafe { std::env::remove_var("KREUZBERG_OCR_LANGUAGE") };
+    applied.expect("Failed to apply overrides");
+
+    let ocr = config
+        .ocr
+        .expect("OCR config should be Some when KREUZBERG_OCR_LANGUAGE is set");
+    assert_eq!(ocr.language, "fra");
+}
+
+/// Regression #773: with `KREUZBERG_VLM_EMBEDDING_MODEL` set, `apply_env_overrides`
+/// must produce a chunking config whose embedding model is the `Llm` variant with
+/// that model name and no API key.
+#[test]
+fn test_regression_773_vlm_embedding_env_override() {
+    let mut config = ExtractionConfig::default();
+
+    // SAFETY: changing the process environment is only sound while no other thread
+    // reads or writes it. NOTE(review): the test harness may run tests on several
+    // threads; consider serializing the env-mutating tests in this file.
+    unsafe { std::env::set_var("KREUZBERG_VLM_EMBEDDING_MODEL", "openai/text-embedding-3-small") };
+    let applied = config.apply_env_overrides();
+    // Clean up before inspecting the result so a failure cannot leave the variable
+    // set for other tests running in the same process.
+    // SAFETY: same condition as for `set_var` above.
+    unsafe { std::env::remove_var("KREUZBERG_VLM_EMBEDDING_MODEL") };
+    applied.expect("Failed to apply environment overrides");
+
+    let chunking = config
+        .chunking
+        .expect("Chunking should be enabled when VLM embedding is set");
+    let embedding = chunking.embedding.expect("Embedding should be configured");
+
+    match embedding.model {
+        EmbeddingModelType::Llm { llm } => {
+            assert_eq!(llm.model, "openai/text-embedding-3-small");
+            assert!(llm.api_key.is_none());
+        }
+        _ => panic!("Expected Llm embedding model type"),
+    }
+}
--- a/crates/kreuzberg-cli/tests/config_tests.rs
+++ b/crates/kreuzberg-cli/tests/config_tests.rs
@@ -0,0 +1,344 @@
+//! CLI configuration tests validating flags, aliases, and deprecation handling.
+//!
+//! This test suite verifies that:
+//! 1. --output-format flag works correctly for all format options
+//! 2. CLI flags properly override config file settings
+//! 3. Config merge precedence is maintained (CLI args > config file > defaults)
+//! 4. Configuration JSON can be passed inline
+//! 5. Alias handling for deprecated flags works as expected
+
+#![allow(clippy::bool_assert_comparison)]
+#![allow(clippy::field_reassign_with_default)]
+
+use std::path::PathBuf;
+use tempfile::TempDir;
+
+/// Writes `content` to a file called `name` inside `dir` and returns the file's path.
+///
+/// Panics if the file cannot be written.
+#[allow(dead_code)]
+fn create_test_config(dir: &TempDir, name: &str, content: &str) -> PathBuf {
+    let target = dir.path().join(name);
+    let written = std::fs::write(&target, content);
+    written.expect("Failed to write config file");
+    target
+}
+
+/// The default `ExtractionConfig` must report `OutputFormat::Plain`.
+#[test]
+fn test_output_format_flag_plain() {
+    use kreuzberg::core::config::{ExtractionConfig, OutputFormat};
+
+    // Only the library default is inspected here; no CLI arguments are parsed.
+    let defaults = ExtractionConfig::default();
+    assert_eq!(
+        defaults.output_format,
+        OutputFormat::Plain,
+        "Default output format should be Plain"
+    );
+}
+
+/// `OutputFormat::Markdown` must have the `Debug` representation `Markdown`.
+#[test]
+fn test_output_format_flag_markdown() {
+    use kreuzberg::core::config::OutputFormat;
+
+    let rendered = format!("{:?}", OutputFormat::Markdown);
+    assert_eq!(
+        rendered, "Markdown",
+        "Markdown format should have correct debug representation"
+    );
+}
+
+/// `OutputFormat::Html` must have the `Debug` representation `Html`.
+#[test]
+fn test_output_format_flag_html() {
+    use kreuzberg::core::config::OutputFormat;
+
+    let rendered = format!("{:?}", OutputFormat::Html);
+    assert_eq!(
+        rendered, "Html",
+        "Html format should have correct debug representation"
+    );
+}
+
+/// A config built with `output_format = Markdown` keeps that value and serializes
+/// it as the lowercase string `markdown`.
+#[test]
+fn test_extraction_config_with_output_format() {
+    use kreuzberg::core::config::{ExtractionConfig, OutputFormat};
+
+    let config = ExtractionConfig {
+        output_format: OutputFormat::Markdown,
+        ..ExtractionConfig::default()
+    };
+    assert_eq!(
+        config.output_format,
+        OutputFormat::Markdown,
+        "output_format should be Markdown after assignment"
+    );
+
+    let as_json = serde_json::to_value(&config).expect("Failed to serialize");
+    assert_eq!(
+        as_json["output_format"], "markdown",
+        "Serialized output_format should be 'markdown' (lowercase)"
+    );
+}
+
+/// A JSON object setting several top-level fields deserializes into an
+/// `ExtractionConfig` carrying those values.
+#[test]
+fn test_config_json_parsing_complete() {
+    use kreuzberg::core::config::{ExtractionConfig, OutputFormat};
+
+    let payload = serde_json::json!({
+        "use_cache": true,
+        "enable_quality_processing": true,
+        "force_ocr": false,
+        "output_format": "markdown",
+        "result_format": "unified",
+        "max_concurrent_extractions": 4,
+    });
+
+    let parsed: ExtractionConfig = serde_json::from_value(payload).expect("Failed to parse config JSON");
+
+    // `result_format` is part of the input but is not asserted on here.
+    assert!(parsed.use_cache);
+    assert!(parsed.enable_quality_processing);
+    assert!(!parsed.force_ocr);
+    assert_eq!(parsed.output_format, OutputFormat::Markdown);
+    assert_eq!(parsed.max_concurrent_extractions, Some(4));
+}
+
+/// Simulates CLI overrides on top of the defaults by setting `use_cache` and
+/// `force_ocr` explicitly, then checks both fields hold the overriding values.
+#[test]
+fn test_config_merge_precedence_cli_overrides_default() {
+    use kreuzberg::core::config::ExtractionConfig;
+
+    // Stand-in for what the CLI layer would do; no argument parsing is involved.
+    let overridden = ExtractionConfig {
+        use_cache: false,
+        force_ocr: true,
+        ..ExtractionConfig::default()
+    };
+
+    assert!(!overridden.use_cache, "CLI override should change use_cache to false");
+    assert!(overridden.force_ocr, "CLI override should change force_ocr to true");
+}
+
+/// Simulates a CLI override applied to a file-derived config: the overridden field
+/// changes while a field that was not overridden keeps the file's value.
+#[test]
+fn test_config_merge_precedence_cli_overrides_file() {
+    use kreuzberg::core::config::ExtractionConfig;
+
+    // Stand-in for a config loaded from a file.
+    let from_file = ExtractionConfig {
+        use_cache: true,
+        force_ocr: false,
+        ..ExtractionConfig::default()
+    };
+
+    // Stand-in for the CLI layer overriding a single field.
+    let merged = ExtractionConfig {
+        use_cache: false,
+        ..from_file.clone()
+    };
+
+    assert!(!merged.use_cache, "CLI should override file config for use_cache");
+    assert!(!merged.force_ocr, "CLI should not affect fields not overridden");
+}
+
+/// Values parsed from JSON (`use_cache: false`, `force_ocr: true`) must differ
+/// from what `ExtractionConfig::default()` reports for the same fields.
+#[test]
+fn test_config_file_precedence_over_defaults() {
+    use kreuzberg::core::config::ExtractionConfig;
+
+    let payload = serde_json::json!({
+        "use_cache": false,
+        "force_ocr": true,
+    });
+    let from_file: ExtractionConfig = serde_json::from_value(payload).expect("Failed to parse");
+    let defaults = ExtractionConfig::default();
+
+    assert_ne!(
+        from_file.use_cache, defaults.use_cache,
+        "File config should override default for use_cache"
+    );
+    assert_ne!(
+        from_file.force_ocr, defaults.force_ocr,
+        "File config should override default for force_ocr"
+    );
+}
+
+/// Each built-in `OutputFormat` variant serializes to its lowercase name.
+#[test]
+fn test_output_format_serialization() {
+    use kreuzberg::core::config::OutputFormat;
+
+    // (variant, expected JSON string, panic message if serialization fails)
+    let cases = [
+        (OutputFormat::Plain, "plain", "Failed to serialize Plain"),
+        (OutputFormat::Markdown, "markdown", "Failed to serialize Markdown"),
+        (OutputFormat::Html, "html", "Failed to serialize Html"),
+    ];
+
+    for (variant, expected, failure) in cases {
+        let encoded = serde_json::to_value(variant).expect(failure);
+        assert_eq!(encoded, expected);
+    }
+}
+
+/// Each lowercase format name deserializes to the matching `OutputFormat` variant.
+#[test]
+fn test_output_format_deserialization() {
+    use kreuzberg::core::config::OutputFormat;
+
+    // (JSON string, expected variant, panic message if deserialization fails)
+    let cases = [
+        ("plain", OutputFormat::Plain, "Failed to deserialize plain"),
+        ("markdown", OutputFormat::Markdown, "Failed to deserialize markdown"),
+        ("html", OutputFormat::Html, "Failed to deserialize html"),
+    ];
+
+    for (raw, expected, failure) in cases {
+        let decoded: OutputFormat = serde_json::from_value(serde_json::json!(raw)).expect(failure);
+        assert_eq!(decoded, expected);
+    }
+}
+
+/// Serializing a config to a JSON string and parsing it back keeps `output_format`.
+#[test]
+fn test_extraction_config_roundtrip_with_output_format() {
+    use kreuzberg::core::config::{ExtractionConfig, OutputFormat};
+
+    let mut before = ExtractionConfig::default();
+    before.output_format = OutputFormat::Markdown;
+
+    let encoded = serde_json::to_string(&before).expect("Failed to serialize");
+    let after: ExtractionConfig = serde_json::from_str(&encoded).expect("Failed to deserialize");
+
+    assert_eq!(
+        before.output_format, after.output_format,
+        "output_format should survive serialization roundtrip"
+    );
+}
+
+/// For `Plain`, `Markdown` and `Html`, a config round-tripped through a JSON value
+/// must come back with the same `output_format`.
+#[test]
+fn test_config_with_all_output_formats() {
+    use kreuzberg::core::config::{ExtractionConfig, OutputFormat};
+
+    for variant in [OutputFormat::Plain, OutputFormat::Markdown, OutputFormat::Html] {
+        let mut config = ExtractionConfig::default();
+        config.output_format = variant.clone();
+
+        let encoded = serde_json::to_value(&config).expect("Failed to serialize");
+        let decoded: ExtractionConfig = serde_json::from_value(encoded).expect("Failed to deserialize");
+
+        assert_eq!(
+            variant, decoded.output_format,
+            "Format should be preserved for {:?}",
+            variant
+        );
+    }
+}
+
+/// A JSON object containing only `output_format` parses successfully; `use_cache`,
+/// which is absent from the input, ends up `true`.
+#[test]
+fn test_config_partial_json_with_output_format() {
+    use kreuzberg::core::config::{ExtractionConfig, OutputFormat};
+
+    let payload = serde_json::json!({
+        "output_format": "markdown",
+    });
+    let parsed: ExtractionConfig = serde_json::from_value(payload).expect("Failed to parse partial config");
+
+    assert_eq!(
+        parsed.output_format,
+        OutputFormat::Markdown,
+        "output_format should be set from partial config"
+    );
+
+    // A field missing from the input is expected to fall back to its default.
+    assert!(parsed.use_cache, "use_cache should have default value");
+}
+
+/// The serialized default config is a JSON object containing a fixed set of
+/// top-level keys.
+#[test]
+fn test_config_complete_json_structure() {
+    let defaults = kreuzberg::core::config::ExtractionConfig::default();
+    let encoded = serde_json::to_value(&defaults).expect("Failed to serialize");
+    let fields = encoded.as_object().expect("Should be object");
+
+    // Keys that must be present in the serialized form.
+    let required_keys = [
+        "output_format",
+        "use_cache",
+        "enable_quality_processing",
+        "force_ocr",
+        "result_format",
+    ];
+    for key in required_keys {
+        assert!(fields.contains_key(key), "Should have {}", key);
+    }
+}
+
+/// An `output_format` string that matches no built-in format must deserialize
+/// successfully as `OutputFormat::Custom` holding that string, not be rejected.
+#[test]
+fn test_unknown_output_format_accepted_as_custom() {
+    use kreuzberg::core::config::{ExtractionConfig, OutputFormat};
+
+    let payload = serde_json::json!({
+        "output_format": "my_custom_renderer",
+    });
+
+    let parsed = match serde_json::from_value::<ExtractionConfig>(payload) {
+        Ok(config) => config,
+        // `Some(..)` keeps the failure text in the same shape as printing `Result::err()`.
+        Err(err) => panic!(
+            "Unknown output_format should be accepted as Custom variant; got: {:?}",
+            Some(err)
+        ),
+    };
+
+    assert_eq!(
+        parsed.output_format,
+        OutputFormat::Custom("my_custom_renderer".to_string()),
+        "Unknown format string must deserialize as OutputFormat::Custom"
+    );
+}
+
+/// The lowercase spelling `plain` must deserialize to `OutputFormat::Plain`.
+///
+/// Only the lowercase form is exercised; other casings are not covered here.
+#[test]
+fn test_config_case_sensitivity() {
+    use kreuzberg::core::config::{ExtractionConfig, OutputFormat};
+
+    let payload = serde_json::json!({"output_format": "plain"});
+    let outcome: Result<ExtractionConfig, _> = serde_json::from_value(payload);
+
+    assert!(outcome.is_ok(), "lowercase 'plain' should be accepted");
+    let parsed = outcome.unwrap();
+    assert_eq!(parsed.output_format, OutputFormat::Plain);
+}
+
+/// The serialized default config must contain an `output_format` entry.
+#[test]
+fn test_output_format_field_is_required_in_serialization() {
+    let defaults = kreuzberg::core::config::ExtractionConfig::default();
+    let encoded = serde_json::to_value(&defaults).expect("Failed to serialize");
+
+    let has_output_format = encoded.get("output_format").is_some();
+    assert!(
+        has_output_format,
+        "output_format should always be present in serialization"
+    );
+}
+
+/// After changing only `output_format`, the serialized config reports the new
+/// `output_format` and still carries `result_format` as a string.
+#[test]
+fn test_result_format_and_output_format_independent() {
+    use kreuzberg::core::config::{ExtractionConfig, OutputFormat};
+
+    // `result_format` is deliberately left at whatever the default provides.
+    let config = ExtractionConfig {
+        output_format: OutputFormat::Markdown,
+        ..ExtractionConfig::default()
+    };
+
+    let encoded = serde_json::to_value(&config).expect("Failed to serialize");
+
+    assert_eq!(encoded["output_format"], "markdown");
+    assert!(
+        encoded["result_format"].is_string(),
+        "result_format should also be present"
+    );
+}
+
+/// `ExtractionConfig::clone` must carry `output_format` over unchanged.
+#[test]
+fn test_extraction_config_clone_preserves_format() {
+    use kreuzberg::core::config::{ExtractionConfig, OutputFormat};
+
+    let mut source = ExtractionConfig::default();
+    source.output_format = OutputFormat::Html;
+
+    let copy = source.clone();
+
+    assert_eq!(
+        source.output_format, copy.output_format,
+        "Cloned config should preserve output_format"
+    );
+}
--- a/crates/kreuzberg-cli/tests/contract_cli.rs
+++ b/crates/kreuzberg-cli/tests/contract_cli.rs
@@ -0,0 +1,355 @@
+//! CLI contract tests - verify CLI config parsing matches Rust core
+//!
+//! This test suite validates that the CLI's configuration parsing produces
+//! identical results to the Rust core library. It ensures that users get
+//! consistent behavior whether using the CLI, SDK, or MCP interfaces.
+
+use kreuzberg::core::config::ExtractionConfig;
+use kreuzberg::core::config::OutputFormat;
+use serde_json::json;
+
+/// Deserializing straight from a string and going through an intermediate
+/// `serde_json::Value` must yield the same `use_cache` and `output_format`.
+#[test]
+fn test_cli_config_json_flag_basic_parsing() {
+    let raw = r#"{"use_cache": true, "output_format": "plain"}"#;
+
+    // Path 1: string -> config, as the Rust core would parse it.
+    let direct: ExtractionConfig = serde_json::from_str(raw).expect("Failed to deserialize config string");
+
+    // Path 2: string -> Value -> config, simulating CLI --config-json handling.
+    let intermediate: serde_json::Value = serde_json::from_str(raw).expect("Failed to parse JSON string");
+    let via_value: ExtractionConfig =
+        serde_json::from_value(intermediate).expect("Failed to deserialize from JSON value");
+
+    // Both paths must agree field by field.
+    assert_eq!(direct.use_cache, via_value.use_cache, "use_cache should be identical");
+    assert_eq!(
+        direct.output_format, via_value.output_format,
+        "output_format should be identical"
+    );
+}
+
+/// Nested `chunking` and `ocr` objects must deserialize into their typed
+/// sub-configs with the supplied values.
+#[test]
+fn test_cli_nested_config_deserialization() {
+    let config_str = r#"{
+        "chunking": {
+            "max_characters": 1000,
+            "overlap": 200
+        },
+        "ocr": {
+            "backend": "tesseract"
+        }
+    }"#;
+
+    let config: ExtractionConfig = serde_json::from_str(config_str).expect("Failed to deserialize nested config");
+
+    // `expect` reports the missing section by name instead of a bare `unwrap` panic.
+    let chunking = config.chunking.expect("Chunking config should be present");
+    // Failure messages use the real field names so they point at the right JSON key.
+    assert_eq!(chunking.max_characters, 1000, "max_characters should be 1000");
+    assert_eq!(chunking.overlap, 200, "overlap should be 200");
+
+    let ocr = config.ocr.expect("OCR config should be present");
+    assert_eq!(ocr.backend, "tesseract", "backend should be tesseract");
+}
+
+/// A payload containing only `force_ocr` must set that flag and leave
+/// `use_cache` enabled.
+#[test]
+fn test_cli_force_ocr_flag_parsing() {
+    let parsed = serde_json::from_str::<ExtractionConfig>(r#"{"force_ocr": true}"#)
+        .expect("Failed to deserialize force_ocr config");
+
+    assert!(parsed.force_ocr, "force_ocr should be true");
+    // A field absent from the payload is expected to keep its default.
+    assert!(parsed.use_cache, "use_cache should still be true by default");
+}
+
+/// An integer `max_concurrent_extractions` must deserialize into `Some(n)`.
+#[test]
+fn test_cli_max_concurrent_extractions_parsing() {
+    let payload = r#"{"max_concurrent_extractions": 8}"#;
+    let parsed = serde_json::from_str::<ExtractionConfig>(payload)
+        .expect("Failed to deserialize concurrent extractions");
+
+    let limit = parsed.max_concurrent_extractions;
+    assert_eq!(limit, Some(8), "max_concurrent_extractions should be 8");
+}
+
+/// A payload mixing top-level flags with nested `ocr` and `chunking` objects
+/// must deserialize with every asserted field carrying the supplied value.
+#[test]
+fn test_cli_complex_config_deserialization() {
+    let config_str = r#"{
+        "use_cache": false,
+        "enable_quality_processing": true,
+        "force_ocr": true,
+        "output_format": "markdown",
+        "result_format": "unified",
+        "max_concurrent_extractions": 16,
+        "ocr": {
+            "backend": "tesseract",
+            "language": "eng"
+        },
+        "chunking": {
+            "max_characters": 2000,
+            "overlap": 400,
+            "strategy": "sliding_window"
+        }
+    }"#;
+
+    let config: ExtractionConfig = serde_json::from_str(config_str).expect("Failed to deserialize complex config");
+
+    // Verify top-level fields
+    assert!(!config.use_cache);
+    assert!(config.enable_quality_processing);
+    assert!(config.force_ocr);
+    // The payload sets `output_format`; previously this value was never checked.
+    assert_eq!(config.output_format, OutputFormat::Markdown);
+    assert_eq!(config.max_concurrent_extractions, Some(16));
+
+    // Verify nested configs
+    let ocr = config.ocr.expect("ocr section should be present");
+    assert_eq!(ocr.backend, "tesseract");
+    assert_eq!(ocr.language, "eng");
+
+    let chunking = config.chunking.expect("chunking section should be present");
+    assert_eq!(chunking.max_characters, 2000);
+    assert_eq!(chunking.overlap, 400);
+}
+
+/// An empty JSON object must deserialize to a config carrying the defaults
+/// asserted below.
+#[test]
+fn test_cli_empty_config_uses_defaults() {
+    let defaults = serde_json::from_str::<ExtractionConfig>(r#"{}"#).expect("Failed to deserialize empty config");
+
+    // Boolean defaults.
+    assert!(defaults.use_cache, "Default use_cache should be true");
+    assert!(
+        defaults.enable_quality_processing,
+        "Default enable_quality_processing should be true"
+    );
+    assert!(!defaults.force_ocr, "Default force_ocr should be false");
+
+    // Optional numeric default.
+    assert_eq!(
+        defaults.max_concurrent_extractions, None,
+        "Default max_concurrent_extractions should be None"
+    );
+}
+
+/// Parse -> serialize -> parse must preserve the three fields set in the
+/// original payload.
+#[test]
+fn test_cli_roundtrip_preserves_all_fields() {
+    let payload = r#"{
+        "use_cache": false,
+        "force_ocr": true,
+        "max_concurrent_extractions": 12
+    }"#;
+
+    // First pass: text -> config.
+    let first_pass: ExtractionConfig = serde_json::from_str(payload).expect("Failed to deserialize");
+
+    // Second pass: config -> Value -> config.
+    let as_value = serde_json::to_value(&first_pass).expect("Failed to serialize");
+    let second_pass = serde_json::from_value::<ExtractionConfig>(as_value)
+        .expect("Failed to deserialize roundtripped config");
+
+    assert!(!second_pass.use_cache);
+    assert!(second_pass.force_ocr);
+    assert_eq!(second_pass.max_concurrent_extractions, Some(12));
+}
+
+/// Each lowercase `output_format` string must map to its `OutputFormat` variant.
+#[test]
+fn test_cli_output_format_enum_parsing() {
+    let cases = [
+        (r#"{"output_format": "plain"}"#, OutputFormat::Plain),
+        (r#"{"output_format": "markdown"}"#, OutputFormat::Markdown),
+        (r#"{"output_format": "html"}"#, OutputFormat::Html),
+    ];
+
+    for (payload, want) in cases {
+        let parsed: ExtractionConfig = match serde_json::from_str(payload) {
+            Ok(config) => config,
+            Err(_) => panic!("Failed to deserialize {}", payload),
+        };
+
+        assert_eq!(parsed.output_format, want, "output_format should match expected value");
+    }
+}
+
+#[test]
+fn test_cli_result_format_enum_parsing() {
+    let test_cases = vec![
+        r#"{"result_format": "unified"}"#,
+        r#"{"result_format": "element_based"}"#,
+    ];
+
+    for config_str in test_cases {
+        let result = serde_json::from_str::<ExtractionConfig>(config_str);
+        assert!(result.is_ok(), "Should deserialize result_format from {}", config_str);
+    }
+}
+
+/// Simulates `--config-json-base64`: JSON is base64-encoded, decoded again,
+/// and the decoded text must deserialize with the original field values.
+#[test]
+fn test_cli_base64_encoded_config_simulation() {
+    // The `Engine` trait provides `encode`/`decode` on the STANDARD engine.
+    use base64::Engine;
+    use base64::engine::general_purpose::STANDARD;
+
+    let payload = json!({
+        "force_ocr": true,
+        "output_format": "markdown"
+    })
+    .to_string();
+
+    // Encode, then decode the way the CLI would.
+    let encoded = STANDARD.encode(&payload);
+    let raw_bytes = STANDARD.decode(&encoded).expect("Failed to decode base64");
+    let decoded = String::from_utf8(raw_bytes).expect("Failed to convert bytes to string");
+
+    let parsed: ExtractionConfig = serde_json::from_str(&decoded).expect("Failed to deserialize base64-decoded config");
+
+    assert!(parsed.force_ocr);
+    assert_eq!(parsed.output_format, OutputFormat::Markdown);
+}
+
+/// Overlaying a partial JSON object on the serialized defaults must change
+/// only the overridden keys.
+#[test]
+fn test_cli_partial_override_merging() {
+    let overrides = json!({"force_ocr": true, "use_cache": false});
+
+    // Start from the serialized defaults, then overlay the overrides key by key.
+    let mut merged_json =
+        serde_json::to_value(ExtractionConfig::default()).expect("Failed to serialize base config");
+
+    // The merge only happens when both sides are JSON objects.
+    if let Some(target) = merged_json.as_object_mut() {
+        if let serde_json::Value::Object(entries) = overrides {
+            for (key, value) in entries {
+                target.insert(key, value);
+            }
+        }
+    }
+
+    let merged = serde_json::from_value::<ExtractionConfig>(merged_json).expect("Failed to deserialize merged config");
+
+    assert!(merged.force_ocr, "Override should apply force_ocr");
+    assert!(!merged.use_cache, "Override should apply use_cache");
+    assert!(
+        merged.enable_quality_processing,
+        "Unoverridden field should retain default"
+    );
+}
+
+/// Documents how a syntactically valid payload with an unknown key is handled.
+///
+/// Two outcomes are acceptable, and each one is now asserted (previously the
+/// failure branch was silently ignored, so the test could never fail):
+/// - the unknown key is ignored and the known field is applied, or
+/// - deserialization is rejected *because of* the unknown key (what serde's
+///   `deny_unknown_fields` would do).
+#[test]
+fn test_cli_invalid_json_error_handling() {
+    let invalid_json_str = r#"{"force_ocr": true, "invalid_field": "value"}"#;
+
+    match serde_json::from_str::<ExtractionConfig>(invalid_json_str) {
+        // Unknown field ignored: the known field must still be honoured.
+        Ok(config) => assert!(config.force_ocr),
+        // Rejected: the error has to be about the unknown key, not something else.
+        Err(err) => {
+            let message = err.to_string();
+            assert!(
+                message.contains("invalid_field"),
+                "Rejection should be caused by the unknown field, got: {}",
+                message
+            );
+        }
+    }
+}
+
+/// The same payload must parse identically regardless of JSON whitespace.
+#[test]
+fn test_cli_whitespace_handling_in_json() {
+    let variants = [
+        r#"{"force_ocr":true}"#,     // compact
+        r#"{ "force_ocr" : true }"#, // padded with spaces
+        r#"{
+            "force_ocr": true
+        }"#, // multi-line with indentation
+    ];
+
+    for payload in variants {
+        let parsed: ExtractionConfig = match serde_json::from_str(payload) {
+            Ok(config) => config,
+            Err(_) => panic!("Failed to parse: {}", payload),
+        };
+
+        assert!(parsed.force_ocr);
+    }
+}
+
+/// `max_concurrent_extractions` must round-trip small, large and zero values.
+#[test]
+fn test_cli_numeric_boundary_values() {
+    let cases = [
+        (r#"{"max_concurrent_extractions": 1}"#, Some(1)),
+        (r#"{"max_concurrent_extractions": 256}"#, Some(256)),
+        // Zero is accepted by the parser as-is; no validation is expected here.
+        (r#"{"max_concurrent_extractions": 0}"#, Some(0)),
+    ];
+
+    for (payload, want) in cases {
+        let parsed: ExtractionConfig = match serde_json::from_str(payload) {
+            Ok(config) => config,
+            Err(_) => panic!("Failed to parse: {}", payload),
+        };
+
+        assert_eq!(
+            parsed.max_concurrent_extractions, want,
+            "Numeric values should be parsed correctly"
+        );
+    }
+}
+
+/// Boolean fields accept JSON `true`/`false` and nothing else.
+///
+/// The positive cases check both literals; the negative cases feed truthy/falsy
+/// look-alikes, which is the part the test name promises but previously never
+/// exercised.
+#[test]
+fn test_cli_boolean_values_strict_parsing() {
+    // Real JSON booleans must map to the matching Rust value.
+    let accepted = [(r#"{"use_cache": true}"#, true), (r#"{"use_cache": false}"#, false)];
+
+    for (config_str, expected_value) in accepted {
+        let config: ExtractionConfig =
+            serde_json::from_str(config_str).unwrap_or_else(|_| panic!("Failed to parse: {}", config_str));
+
+        assert_eq!(config.use_cache, expected_value);
+    }
+
+    // Numbers and strings that merely look boolean must be rejected.
+    let rejected = [
+        r#"{"use_cache": 1}"#,
+        r#"{"use_cache": 0}"#,
+        r#"{"use_cache": "true"}"#,
+        r#"{"use_cache": "false"}"#,
+    ];
+
+    for config_str in rejected {
+        let result = serde_json::from_str::<ExtractionConfig>(config_str);
+        assert!(
+            result.is_err(),
+            "Non-boolean value should be rejected for use_cache: {}",
+            config_str
+        );
+    }
+}
+
+/// A config built in code must survive serialize -> JSON text -> deserialize
+/// with the five explicitly set fields intact.
+#[test]
+fn test_cli_config_consistency_across_formats() {
+    let built = ExtractionConfig {
+        use_cache: false,
+        enable_quality_processing: true,
+        force_ocr: true,
+        output_format: OutputFormat::Markdown,
+        max_concurrent_extractions: Some(4),
+        ..Default::default()
+    };
+
+    // Go through a JSON string, which is what the CLI receives.
+    let as_text = serde_json::to_value(&built).expect("Failed to serialize").to_string();
+    let reparsed = serde_json::from_str::<ExtractionConfig>(&as_text).expect("Failed to deserialize from string");
+
+    // Field-by-field comparison of everything set above.
+    assert_eq!(reparsed.use_cache, built.use_cache);
+    assert_eq!(reparsed.enable_quality_processing, built.enable_quality_processing);
+    assert_eq!(reparsed.force_ocr, built.force_ocr);
+    assert_eq!(reparsed.output_format, built.output_format);
+    assert_eq!(reparsed.max_concurrent_extractions, built.max_concurrent_extractions);
+}
+
+// Note: the `base64::Engine` trait needed by the base64 test is imported
+// locally inside `test_cli_base64_encoded_config_simulation`; this file does
+// not re-export anything.
--- a/crates/kreuzberg-cli/tests/e2e_config_test.rs
+++ b/crates/kreuzberg-cli/tests/e2e_config_test.rs
@@ -0,0 +1,603 @@
+//! Comprehensive CLI end-to-end integration tests for configuration flags.
+//!
+//! This test suite validates the new configuration features including:
+//! - `--config-json` for inline JSON configuration
+//! - `--config-json-base64` for base64-encoded JSON configuration
+//! - `--output-format` flag with all variants (plain, markdown, djot, html)
+//! - Flag precedence (CLI args > JSON config > file > defaults)
+//! - Config merge scenarios and conflict detection
+//! - Error handling for invalid inputs
+//! - Real extraction with new formats
+
+#![allow(clippy::bool_assert_comparison)]
+
+use std::path::PathBuf;
+use std::process::Command;
+use tempfile::TempDir;
+
+/// Get the path to the kreuzberg binary.
+///
+/// Uses `CARGO_BIN_EXE_kreuzberg`, which Cargo sets while compiling integration
+/// tests to the absolute path of this package's `kreuzberg` bin target. Unlike a
+/// hard-coded `target/debug/...` path, this follows the active profile and any
+/// custom target directory.
+fn get_binary_path() -> String {
+    env!("CARGO_BIN_EXE_kreuzberg").to_string()
+}
+
+/// Locate `test_documents/`, which sits two directory levels above this
+/// crate's manifest directory.
+///
+/// Panics if the manifest directory has fewer than two ancestors.
+fn get_test_documents_dir() -> PathBuf {
+    let crate_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
+    let crates_dir = crate_dir.parent().unwrap();
+    let repo_root = crates_dir.parent().unwrap();
+    repo_root.join("test_documents")
+}
+
+/// Resolve `relative_path` against `test_documents/` and return it as a
+/// `String` (non-UTF-8 bytes are replaced lossily).
+fn get_test_file(relative_path: &str) -> String {
+    let full_path = get_test_documents_dir().join(relative_path);
+    full_path.to_string_lossy().into_owned()
+}
+
+/// Build the `kreuzberg` binary before a test uses it.
+///
+/// Every test calls this, but the `cargo build` subprocess is only spawned by
+/// the first caller in this test process; a `Once` guard turns later calls into
+/// no-ops (concurrent callers wait until the first build has finished).
+///
+/// # Panics
+/// Panics if `cargo` cannot be spawned or the build fails. A failed first
+/// attempt poisons the guard, so later callers panic as well.
+fn build_binary() {
+    static BUILD: std::sync::Once = std::sync::Once::new();
+
+    BUILD.call_once(|| {
+        let status = Command::new("cargo")
+            .args(["build", "--bin", "kreuzberg"])
+            .status()
+            .expect("Failed to build kreuzberg binary");
+
+        assert!(status.success(), "Failed to build kreuzberg binary");
+    });
+}
+
+/// Write `content` to a file called `name` inside the temporary directory
+/// `dir` and return the file's path.
+///
+/// Panics if the file cannot be written.
+fn create_test_config(dir: &TempDir, name: &str, content: &str) -> PathBuf {
+    let target = dir.path().join(name);
+    match std::fs::write(&target, content) {
+        Ok(()) => target,
+        Err(err) => panic!("Failed to write config file: {:?}", err),
+    }
+}
+
+/// Encode `input` as standard base64 (alphabet `A-Za-z0-9+/`, `=` padding).
+///
+/// Hand-rolled so this test file needs no base64 crate: the input bytes are
+/// consumed in groups of up to three, each group producing four output chars.
+fn to_base64(input: &str) -> String {
+    const ALPHABET: &[u8; 64] = b"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
+
+    // Pick the 6-bit slice of `word` starting at `shift` and map it to its symbol.
+    let symbol = |word: u32, shift: u32| ALPHABET[((word >> shift) & 0x3F) as usize] as char;
+
+    let mut encoded = String::new();
+    for group in input.as_bytes().chunks(3) {
+        // Pack the group into the top bytes of a 24-bit word; missing bytes stay zero.
+        let word = group
+            .iter()
+            .enumerate()
+            .fold(0u32, |acc, (pos, &byte)| acc | (u32::from(byte) << (16 - 8 * pos)));
+
+        encoded.push(symbol(word, 18));
+        encoded.push(symbol(word, 12));
+        // A short final group is padded with '=' instead of emitting zero bits.
+        encoded.push(if group.len() > 1 { symbol(word, 6) } else { '=' });
+        encoded.push(if group.len() > 2 { symbol(word, 0) } else { '=' });
+    }
+
+    encoded
+}
+
+// ============================================================================
+// Test 1: --config-json inline flag with complex configuration
+// ============================================================================
+
+/// Runs `extract` with an inline `--config-json` payload and expects a
+/// successful exit with non-empty stdout. Skipped when the fixture is absent.
+#[test]
+fn test_cli_config_json_inline() {
+    build_binary();
+
+    let fixture = get_test_file("text/simple.txt");
+    if !std::path::Path::new(&fixture).exists() {
+        eprintln!("Skipping test: {} not found", fixture);
+        return;
+    }
+
+    let inline_config = r#"{"use_cache": false, "chunking": {"max_chars": 512}}"#;
+    let run = Command::new(get_binary_path())
+        .arg("extract")
+        .arg(fixture.as_str())
+        .arg("--config-json")
+        .arg(inline_config)
+        .output()
+        .expect("Failed to execute extract command with --config-json");
+
+    let stderr = String::from_utf8_lossy(&run.stderr);
+    assert!(
+        run.status.success(),
+        "Extract command with --config-json failed: {}",
+        stderr
+    );
+
+    // Lossy UTF-8 decoding is empty exactly when the raw bytes are empty.
+    assert!(!run.stdout.is_empty(), "Output should not be empty");
+}
+
+// ============================================================================
+// Test 2: --config-json-base64 flag for base64-encoded configuration
+// ============================================================================
+
+/// Runs `extract` with the config passed base64-encoded through
+/// `--config-json-base64` and expects a successful exit with non-empty stdout.
+/// Skipped when the fixture is absent.
+#[test]
+fn test_cli_config_json_base64() {
+    build_binary();
+
+    let fixture = get_test_file("text/simple.txt");
+    if !std::path::Path::new(&fixture).exists() {
+        eprintln!("Skipping test: {} not found", fixture);
+        return;
+    }
+
+    // The flag takes the JSON document in base64 form.
+    let encoded_config = to_base64(r#"{"use_cache": false}"#);
+
+    let run = Command::new(get_binary_path())
+        .arg("extract")
+        .arg(fixture.as_str())
+        .arg("--config-json-base64")
+        .arg(encoded_config.as_str())
+        .output()
+        .expect("Failed to execute extract command with --config-json-base64");
+
+    let stderr = String::from_utf8_lossy(&run.stderr);
+    assert!(
+        run.status.success(),
+        "Extract command with --config-json-base64 failed: {}",
+        stderr
+    );
+
+    assert!(!run.stdout.is_empty(), "Output should not be empty");
+}
+
+// ============================================================================
+// Test 3: Flag precedence verification (CLI flags > JSON > file > defaults)
+// ============================================================================
+
+/// Passes a TOML `--config` file together with a conflicting `--config-json`
+/// payload and expects the command to succeed.
+///
+/// Only the exit status is checked; which value actually wins is not inspected.
+#[test]
+fn test_cli_flag_precedence() {
+    build_binary();
+
+    let fixture = get_test_file("text/simple.txt");
+    if !std::path::Path::new(&fixture).exists() {
+        eprintln!("Skipping test: {} not found", fixture);
+        return;
+    }
+
+    let scratch = TempDir::new().expect("Failed to create temp directory");
+
+    // File-level settings that the inline JSON below contradicts.
+    let toml_path = create_test_config(
+        &scratch,
+        "config.toml",
+        r#"
+use_cache = true
+
+[chunking]
+max_chars = 1024
+"#,
+    );
+
+    // Inline JSON flips `use_cache` relative to the file.
+    let run = Command::new(get_binary_path())
+        .arg("extract")
+        .arg(fixture.as_str())
+        .arg("--config")
+        .arg(toml_path.to_string_lossy().as_ref())
+        .arg("--config-json")
+        .arg(r#"{"use_cache": false}"#)
+        .output()
+        .expect("Failed to execute command with precedence test");
+
+    let stderr = String::from_utf8_lossy(&run.stderr);
+    assert!(run.status.success(), "Precedence test command failed: {}", stderr);
+}
+
+// ============================================================================
+// Test 4: --output-format flag with all variants (plain, markdown, djot, html)
+// ============================================================================
+
+/// Runs `extract --output-format <fmt>` for each listed format and expects a
+/// successful exit with non-empty stdout every time.
+#[test]
+fn test_cli_output_format_all_variants() {
+    build_binary();
+
+    let fixture = get_test_file("text/simple.txt");
+    if !std::path::Path::new(&fixture).exists() {
+        eprintln!("Skipping test: {} not found", fixture);
+        return;
+    }
+
+    for format in ["plain", "markdown", "djot", "html"] {
+        let run = match Command::new(get_binary_path())
+            .arg("extract")
+            .arg(fixture.as_str())
+            .arg("--output-format")
+            .arg(format)
+            .output()
+        {
+            Ok(run) => run,
+            Err(_) => panic!("Failed to execute extract with --output-format {}", format),
+        };
+
+        let stderr = String::from_utf8_lossy(&run.stderr);
+        assert!(
+            run.status.success(),
+            "Extract command with --output-format {} failed: {}",
+            format,
+            stderr
+        );
+
+        assert!(!run.stdout.is_empty(), "Output for format {} should not be empty", format);
+    }
+}
+
+// ============================================================================
+// Test 5: Output formats (text vs json) for extraction result
+// ============================================================================
+
+/// Exercises `--format text` and `--format json`.
+///
+/// Text mode must succeed with non-empty stdout. JSON mode must succeed, emit
+/// valid JSON, and expose the envelope keys `result` and `extraction_time_ms`
+/// plus `content` and `mime_type` inside `result`.
+#[test]
+fn test_cli_result_format() {
+    build_binary();
+
+    let fixture = get_test_file("text/simple.txt");
+    if !std::path::Path::new(&fixture).exists() {
+        eprintln!("Skipping test: {} not found", fixture);
+        return;
+    }
+
+    // --- text mode ---
+    let text_run = Command::new(get_binary_path())
+        .arg("extract")
+        .arg(fixture.as_str())
+        .arg("--format")
+        .arg("text")
+        .output()
+        .expect("Failed to execute extract with --format text");
+
+    let text_stderr = String::from_utf8_lossy(&text_run.stderr);
+    assert!(text_run.status.success(), "Text format output failed: {}", text_stderr);
+    assert!(!text_run.stdout.is_empty(), "Text output should not be empty");
+
+    // --- JSON mode ---
+    let json_run = Command::new(get_binary_path())
+        .arg("extract")
+        .arg(fixture.as_str())
+        .arg("--format")
+        .arg("json")
+        .output()
+        .expect("Failed to execute extract with --format json");
+
+    let json_stderr = String::from_utf8_lossy(&json_run.stderr);
+    assert!(json_run.status.success(), "JSON format output failed: {}", json_stderr);
+
+    let json_text = String::from_utf8_lossy(&json_run.stdout);
+    let envelope: serde_json::Value = match serde_json::from_str(&json_text) {
+        Ok(value) => value,
+        Err(_) => panic!("JSON output should be valid JSON, got: {}", json_text),
+    };
+
+    // Envelope-level keys.
+    assert!(
+        envelope.get("result").is_some(),
+        "JSON envelope should have 'result' field"
+    );
+    assert!(
+        envelope.get("extraction_time_ms").is_some(),
+        "JSON envelope should have 'extraction_time_ms' field"
+    );
+
+    // Keys nested under `result`.
+    let result = &envelope["result"];
+    assert!(result.get("content").is_some(), "result should have 'content' field");
+    assert!(
+        result.get("mime_type").is_some(),
+        "result should have 'mime_type' field"
+    );
+}
+
+// ============================================================================
+// Test 6: Deprecated --content-format flag warning
+// ============================================================================
+
+/// Invokes `extract` with the deprecated `--content-format` flag.
+///
+/// The check is deliberately loose: the run passes if the command exits
+/// successfully or writes anything to stdout. No warning text is inspected.
+#[test]
+fn test_cli_content_format_deprecated_warning() {
+    build_binary();
+
+    let fixture = get_test_file("text/simple.txt");
+    if !std::path::Path::new(&fixture).exists() {
+        eprintln!("Skipping test: {} not found", fixture);
+        return;
+    }
+
+    let run = Command::new(get_binary_path())
+        .arg("extract")
+        .arg(fixture.as_str())
+        .arg("--content-format")
+        .arg("plain")
+        .output()
+        .expect("Failed to execute extract with --content-format");
+
+    // Either outcome is accepted; the exact deprecation behaviour is left to the CLI.
+    let succeeded = run.status.success();
+    let wrote_stdout = !run.stdout.is_empty();
+    assert!(succeeded || wrote_stdout, "Command should succeed or produce output");
+}
+
+// ============================================================================
+// Test 7: Config merge scenarios - multiple configuration sources
+// ============================================================================
+
+/// Combines a TOML config file with an inline JSON payload that overrides one
+/// of its keys and expects the command to succeed.
+///
+/// Only the exit status is checked; the merged values are not inspected.
+#[test]
+fn test_cli_config_merge_scenarios() {
+    build_binary();
+
+    let fixture = get_test_file("text/simple.txt");
+    if !std::path::Path::new(&fixture).exists() {
+        eprintln!("Skipping test: {} not found", fixture);
+        return;
+    }
+
+    let scratch = TempDir::new().expect("Failed to create temp directory");
+
+    // Base layer: settings coming from a file on disk.
+    let base_path = create_test_config(
+        &scratch,
+        "base.toml",
+        r#"
+use_cache = true
+
+[chunking]
+max_chars = 1024
+"#,
+    );
+
+    // Overlay: inline JSON that touches a key also present in the file.
+    let run = Command::new(get_binary_path())
+        .arg("extract")
+        .arg(fixture.as_str())
+        .arg("--config")
+        .arg(base_path.to_string_lossy().as_ref())
+        .arg("--config-json")
+        .arg(r#"{"use_cache": false}"#)
+        .output()
+        .expect("Failed to merge configs");
+
+    let stderr = String::from_utf8_lossy(&run.stderr);
+    assert!(run.status.success(), "Config merge failed: {}", stderr);
+}
+
+// ============================================================================
+// Test 8: Invalid JSON error handling
+// ============================================================================
+
+/// Passes malformed JSON to `--config-json` and expects a non-zero exit plus
+/// some output on stderr or stdout.
+#[test]
+fn test_cli_invalid_json_error() {
+    build_binary();
+
+    let fixture = get_test_file("text/simple.txt");
+    if !std::path::Path::new(&fixture).exists() {
+        eprintln!("Skipping test: {} not found", fixture);
+        return;
+    }
+
+    // Unterminated string and object: not parseable as JSON.
+    let malformed = r#"{"invalid json without closing"#;
+
+    let run = Command::new(get_binary_path())
+        .arg("extract")
+        .arg(fixture.as_str())
+        .arg("--config-json")
+        .arg(malformed)
+        .output()
+        .expect("Failed to execute command");
+
+    assert!(!run.status.success(), "Command should fail with invalid JSON");
+
+    // Any diagnostic on either stream counts as feedback.
+    let gave_feedback = !run.stderr.is_empty() || !run.stdout.is_empty();
+    assert!(gave_feedback, "Should provide feedback about invalid JSON");
+}
+
+// ============================================================================
+// Test 9: Config flag conflicts
+// ============================================================================
+
+/// Supplies `--config`, `--config-json` and `--config-json-base64` together.
+///
+/// Whether the CLI accepts the combination or rejects it as conflicting is left
+/// to the implementation, so both success and an ordinary error exit pass. What
+/// must not happen is a crash: the process has to exit with a status code (not
+/// be killed by a signal) and that code must not be the Rust panic status.
+/// Previously the status was discarded and nothing was asserted.
+#[test]
+fn test_cli_conflicts() {
+    build_binary();
+
+    let test_file = get_test_file("text/simple.txt");
+    if !PathBuf::from(&test_file).exists() {
+        eprintln!("Skipping test: {} not found", test_file);
+        return;
+    }
+
+    let temp_dir = TempDir::new().expect("Failed to create temp directory");
+    let config_content = "use_cache = true\n";
+    let config_path = create_test_config(&temp_dir, "config.toml", config_content);
+
+    // Using both --config-json and --config-json-base64 might conflict
+    let json_config = r#"{"use_cache": false}"#;
+    let base64_config = to_base64(json_config);
+
+    let output = Command::new(get_binary_path())
+        .args([
+            "extract",
+            test_file.as_str(),
+            "--config",
+            config_path.to_string_lossy().as_ref(),
+            "--config-json",
+            r#"{"chunking": {"max_chars": 512}}"#,
+            "--config-json-base64",
+            base64_config.as_str(),
+        ])
+        .output()
+        .expect("Failed to execute command with potential conflicts");
+
+    // `code()` is `None` when the process was terminated by a signal (Unix).
+    let exit_code = output.status.code();
+    assert!(
+        exit_code.is_some(),
+        "Command was terminated abnormally: {:?}",
+        output.status
+    );
+    // 101 is the status a Rust program exits with when it panics.
+    assert_ne!(
+        exit_code,
+        Some(101),
+        "Command panicked: {}",
+        String::from_utf8_lossy(&output.stderr)
+    );
+}
+
+// ============================================================================
+// Test 10: Real end-to-end extraction with new config formats
+// ============================================================================
+
+/// End-to-end run combining `--format json`, `--output-format markdown` and an
+/// inline `--config-json`.
+///
+/// Expects success, valid JSON on stdout, the envelope keys `result` and
+/// `extraction_time_ms`, and `content` / `mime_type` inside `result`.
+#[test]
+fn test_cli_real_extraction() {
+    build_binary();
+
+    let fixture = get_test_file("text/simple.txt");
+    if !std::path::Path::new(&fixture).exists() {
+        eprintln!("Skipping test: {} not found", fixture);
+        return;
+    }
+
+    let run = Command::new(get_binary_path())
+        .arg("extract")
+        .arg(fixture.as_str())
+        .arg("--format")
+        .arg("json")
+        .arg("--output-format")
+        .arg("markdown")
+        .arg("--config-json")
+        .arg(r#"{"use_cache": false, "disable_ocr": true}"#)
+        .output()
+        .expect("Failed to execute full E2E extraction");
+
+    let stderr = String::from_utf8_lossy(&run.stderr);
+    assert!(run.status.success(), "E2E extraction failed: {}", stderr);
+
+    // stdout must be a single JSON document.
+    let stdout = String::from_utf8_lossy(&run.stdout);
+    let envelope: serde_json::Value = match serde_json::from_str(&stdout) {
+        Ok(value) => value,
+        Err(_) => panic!("E2E output should be valid JSON, got: {}", stdout),
+    };
+
+    // Envelope-level keys.
+    assert!(envelope.get("result").is_some(), "Missing 'result' envelope field");
+    assert!(
+        envelope.get("extraction_time_ms").is_some(),
+        "Missing 'extraction_time_ms' field"
+    );
+
+    // Keys nested under `result`.
+    let result = &envelope["result"];
+    assert!(result.get("content").is_some(), "Missing content field in result");
+    assert!(result.get("mime_type").is_some(), "Missing mime_type field in result");
+}
+
+// ============================================================================
+// Additional Edge Cases and Robustness Tests
+// ============================================================================
+
+/// `--config-json {}` (an empty object) must be accepted and the command must
+/// exit successfully.
+#[test]
+fn test_cli_empty_config_json() {
+    build_binary();
+
+    let fixture = get_test_file("text/simple.txt");
+    if !std::path::Path::new(&fixture).exists() {
+        eprintln!("Skipping test: {} not found", fixture);
+        return;
+    }
+
+    let run = Command::new(get_binary_path())
+        .arg("extract")
+        .arg(fixture.as_str())
+        .arg("--config-json")
+        .arg("{}")
+        .output()
+        .expect("Failed to execute with empty JSON config");
+
+    let succeeded = run.status.success();
+    assert!(succeeded, "Command with empty JSON config should succeed");
+}
+
+/// Passes an upper-case value (`MARKDOWN`) to `--output-format`.
+///
+/// Accepting it (case-insensitive parsing) and rejecting it with an ordinary
+/// error exit are both fine. "Failing gracefully" is what gets enforced: the
+/// process must exit with a status code rather than being killed by a signal,
+/// and that code must not be the Rust panic status. Previously the status was
+/// discarded and nothing was asserted.
+#[test]
+fn test_cli_multiple_output_format_variants() {
+    build_binary();
+
+    let test_file = get_test_file("text/simple.txt");
+    if !PathBuf::from(&test_file).exists() {
+        eprintln!("Skipping test: {} not found", test_file);
+        return;
+    }
+
+    // Test case-insensitive format argument
+    let output = Command::new(get_binary_path())
+        .args([
+            "extract",
+            test_file.as_str(),
+            "--output-format",
+            "MARKDOWN", // uppercase should work or fail predictably
+        ])
+        .output()
+        .expect("Failed to execute");
+
+    // `code()` is `None` when the process was terminated by a signal (Unix).
+    let exit_code = output.status.code();
+    assert!(
+        exit_code.is_some(),
+        "Command was terminated abnormally: {:?}",
+        output.status
+    );
+    // 101 is the status a Rust program exits with when it panics.
+    assert_ne!(
+        exit_code,
+        Some(101),
+        "Command panicked: {}",
+        String::from_utf8_lossy(&output.stderr)
+    );
+}
+
+/// Feeds a multi-line, nested JSON document to `--config-json`.
+///
+/// The run passes if the command succeeds or, failing that, writes something
+/// to stderr.
+#[test]
+fn test_cli_config_json_with_nested_objects() {
+    build_binary();
+
+    let fixture = get_test_file("text/simple.txt");
+    if !std::path::Path::new(&fixture).exists() {
+        eprintln!("Skipping test: {} not found", fixture);
+        return;
+    }
+
+    // Nested payload, passed to the CLI verbatim including the newlines.
+    let nested_config = r#"
+{
+    "use_cache": false,
+    "chunking": {"max_chars": 512},
+    "language_detection": {
+        "enabled": true,
+        "confidence_threshold": 0.8
+    }
+}
+"#;
+
+    let run = Command::new(get_binary_path())
+        .arg("extract")
+        .arg(fixture.as_str())
+        .arg("--config-json")
+        .arg(nested_config)
+        .output()
+        .expect("Failed to execute with nested JSON config");
+
+    // Lossy UTF-8 decoding is empty exactly when the raw bytes are empty.
+    let reported_error = !run.stderr.is_empty();
+    assert!(
+        run.status.success() || reported_error,
+        "Complex config should either work or provide error"
+    );
+}
--- a/crates/kreuzberg-cli/tests/extract_envelope.rs
+++ b/crates/kreuzberg-cli/tests/extract_envelope.rs
@@ -0,0 +1,237 @@
+//! Integration tests for the JSON timing envelope added to `kreuzberg extract` and
+//! `kreuzberg batch`.
+//!
+//! Verifies:
+//!  - `extract --format json` emits `{ result, extraction_time_ms }` shape
+//!  - `batch --format json` emits `{ results, total_ms, per_file_ms }` shape
+//!  - `result.metadata.ocr_used` exists as a bool field
+//!  - `--pdf-backend xyz` exits non-zero and mentions "pdf-oxide"
+
+use std::path::{Path, PathBuf};
+use std::process::Command;
+
+/// Returns the path to the `kreuzberg` binary under test.
+///
+/// Cargo exports `CARGO_BIN_EXE_<name>` while compiling integration tests and builds that
+/// binary target beforehand, so the returned path follows the active profile and any custom
+/// target directory instead of assuming `<workspace>/target/debug/kreuzberg`.
+fn kreuzberg_bin() -> PathBuf {
+    PathBuf::from(env!("CARGO_BIN_EXE_kreuzberg"))
+}
+
+/// Locates the reference PDF, `test_documents/pdf/pdfa_001.pdf`, two directories above this
+/// crate's manifest directory.
+///
+/// # Panics
+///
+/// Panics if the manifest directory has fewer than two ancestors.
+fn pdf_fixture() -> PathBuf {
+    let crate_dir = Path::new(env!("CARGO_MANIFEST_DIR"));
+    let crates_dir = crate_dir.parent().expect("crates/kreuzberg-cli parent");
+    let workspace_root = crates_dir.parent().expect("crates parent");
+
+    let mut fixture = workspace_root.to_path_buf();
+    fixture.extend(["test_documents", "pdf", "pdfa_001.pdf"]);
+    fixture
+}
+
+/// Locates the plain-text batch fixture, `test_documents/text/fake_text.txt`, two directories
+/// above this crate's manifest directory.
+///
+/// # Panics
+///
+/// Panics if the manifest directory has fewer than two ancestors.
+fn txt_fixture() -> PathBuf {
+    let crate_dir = Path::new(env!("CARGO_MANIFEST_DIR"));
+    let crates_dir = crate_dir.parent().expect("crates/kreuzberg-cli parent");
+    let workspace_root = crates_dir.parent().expect("crates parent");
+
+    [
+        workspace_root,
+        Path::new("test_documents"),
+        Path::new("text"),
+        Path::new("fake_text.txt"),
+    ]
+    .iter()
+    .collect()
+}
+
+/// Builds the `kreuzberg` binary, at most once per test process.
+///
+/// Every test calls this first. The `Once` guard lets only the first caller spawn
+/// `cargo build`; concurrent callers block until that build has finished, and later callers
+/// return immediately.
+///
+/// # Panics
+///
+/// Panics if `cargo` cannot be spawned or the build fails. A panic inside the guarded closure
+/// poisons the `Once`, so every subsequent caller panics as well instead of running against a
+/// missing binary.
+fn build_binary() {
+    static BUILD: std::sync::Once = std::sync::Once::new();
+
+    BUILD.call_once(|| {
+        let status = Command::new("cargo")
+            .args(["build", "--bin", "kreuzberg"])
+            .status()
+            .expect("cargo build invocation failed");
+        assert!(status.success(), "cargo build failed — binary unavailable");
+    });
+}
+
+/// Skip-guard: returns `true` when `path` points at an existing regular file, so the test
+/// that needs it can run.
+///
+/// `Path::is_file` already returns `false` for a missing path, so no separate existence check
+/// is needed.
+fn fixture_exists(path: &Path) -> bool {
+    path.is_file()
+}
+
+// ── extract --format json envelope ──────────────────────────────────────────
+
+/// `extract --format json` must wrap its output in a `{ result, extraction_time_ms }` envelope.
+///
+/// Also checks that `result.metadata.ocr_used` is a boolean and is `false` for the reference
+/// PDF when OCR was not forced. Skips (passes) when the PDF fixture is missing.
+#[test]
+fn test_extract_json_has_result_and_timing() {
+    build_binary();
+
+    let pdf = pdf_fixture();
+    if !fixture_exists(&pdf) {
+        eprintln!("SKIP: PDF fixture not found at {}", pdf.display());
+        return;
+    }
+
+    // Pass the path as an `OsStr`: a lossy UTF-8 round-trip could alter a path that is not
+    // valid UTF-8, handing the CLI a different file than the one checked above.
+    let output = Command::new(kreuzberg_bin())
+        .arg("extract")
+        .arg(&pdf)
+        .args(["--format", "json"])
+        .output()
+        .expect("failed to run kreuzberg extract");
+
+    assert!(
+        output.status.success(),
+        "extract exited non-zero: {}",
+        String::from_utf8_lossy(&output.stderr)
+    );
+
+    let json: serde_json::Value = serde_json::from_slice(&output.stdout).expect("stdout is not valid JSON");
+
+    // Envelope shape
+    assert!(json.get("result").is_some(), "missing 'result' key in envelope");
+    let extraction_time_ms = json
+        .get("extraction_time_ms")
+        .and_then(|v| v.as_f64())
+        .expect("'extraction_time_ms' must be a number");
+    assert!(
+        extraction_time_ms > 0.0,
+        "extraction_time_ms must be positive, got {extraction_time_ms}"
+    );
+
+    // ocr_used field must exist as a bool
+    let ocr_used = json["result"]["metadata"]
+        .get("ocr_used")
+        .expect("'result.metadata.ocr_used' must be present")
+        .as_bool()
+        .expect("'result.metadata.ocr_used' must be a boolean");
+    // For a native-text PDF without --force-ocr, OCR should NOT have run.
+    assert!(!ocr_used, "expected ocr_used=false for native PDF extraction");
+}
+
+// ── batch --format json envelope ─────────────────────────────────────────────
+
+/// `batch --format json` must emit a `{ results, total_ms, per_file_ms }` envelope.
+///
+/// Runs the batch over the PDF and plain-text fixtures and checks that both arrays hold one
+/// entry per input, that every timing is a positive number, and that each result carries a
+/// boolean `metadata.ocr_used`. Skips (passes) when either fixture is missing.
+#[test]
+fn test_batch_json_has_results_and_timing() {
+    build_binary();
+
+    let pdf = pdf_fixture();
+    let txt = txt_fixture();
+    if !fixture_exists(&pdf) || !fixture_exists(&txt) {
+        eprintln!("SKIP: one or more batch fixtures not found");
+        return;
+    }
+
+    // Pass the paths as `OsStr`s: a lossy UTF-8 round-trip could alter a path that is not
+    // valid UTF-8, handing the CLI different files than the ones checked above.
+    let output = Command::new(kreuzberg_bin())
+        .arg("batch")
+        .arg(&pdf)
+        .arg(&txt)
+        .args(["--format", "json"])
+        .output()
+        .expect("failed to run kreuzberg batch");
+
+    assert!(
+        output.status.success(),
+        "batch exited non-zero: {}",
+        String::from_utf8_lossy(&output.stderr)
+    );
+
+    let json: serde_json::Value = serde_json::from_slice(&output.stdout).expect("stdout is not valid JSON");
+
+    // Envelope shape
+    let results = json
+        .get("results")
+        .and_then(|v| v.as_array())
+        .expect("'results' must be an array");
+    assert_eq!(results.len(), 2, "expected 2 results for 2 input files");
+
+    let total_ms = json
+        .get("total_ms")
+        .and_then(|v| v.as_f64())
+        .expect("'total_ms' must be a number");
+    assert!(total_ms > 0.0, "total_ms must be positive, got {total_ms}");
+
+    let per_file_ms = json
+        .get("per_file_ms")
+        .and_then(|v| v.as_array())
+        .expect("'per_file_ms' must be an array");
+    assert_eq!(per_file_ms.len(), 2, "per_file_ms must have one entry per file");
+
+    for (i, timing) in per_file_ms.iter().enumerate() {
+        let ms = timing.as_f64().expect("per_file_ms entry must be a number");
+        assert!(ms > 0.0, "per_file_ms[{i}] must be positive, got {ms}");
+    }
+
+    // Each result must have metadata.ocr_used as a bool
+    for (i, result) in results.iter().enumerate() {
+        assert!(
+            result["metadata"].get("ocr_used").and_then(|v| v.as_bool()).is_some(),
+            "results[{i}].metadata.ocr_used must be a bool"
+        );
+    }
+}
+
+// ── --pdf-backend validation ─────────────────────────────────────────────────
+
+#[test]
+fn test_pdf_backend_invalid_value_exits_nonzero() {
+    build_binary();
+
+    let pdf = pdf_fixture();
+    if !fixture_exists(&pdf) {
+        eprintln!("SKIP: PDF fixture not found at {}", pdf.display());
+        return;
+    }
+
+    let output = Command::new(kreuzberg_bin())
+        .args(["extract", &pdf.to_string_lossy(), "--pdf-backend", "xyz"])
+        .output()
+        .expect("failed to run kreuzberg extract");
+
+    assert!(
+        !output.status.success(),
+        "expected non-zero exit for unknown --pdf-backend"
+    );
+
+    let stderr = String::from_utf8_lossy(&output.stderr);
+    assert!(
+        stderr.contains("pdf-oxide"),
+        "error message should mention 'pdf-oxide', got: {stderr}"
+    );
+}
+
+/// `--pdf-backend pdf-oxide` must be accepted and still produce the JSON envelope
+/// (`result` plus `extraction_time_ms`).
+///
+/// Skips (passes) when the PDF fixture is missing.
+#[test]
+fn test_pdf_backend_valid_value_succeeds() {
+    build_binary();
+
+    let pdf = pdf_fixture();
+    if !fixture_exists(&pdf) {
+        eprintln!("SKIP: PDF fixture not found at {}", pdf.display());
+        return;
+    }
+
+    // Pass the path as an `OsStr`: a lossy UTF-8 round-trip could alter a path that is not
+    // valid UTF-8, handing the CLI a different file than the one checked above.
+    let output = Command::new(kreuzberg_bin())
+        .arg("extract")
+        .arg(&pdf)
+        .args(["--pdf-backend", "pdf-oxide", "--format", "json"])
+        .output()
+        .expect("failed to run kreuzberg extract");
+
+    assert!(
+        output.status.success(),
+        "--pdf-backend pdf-oxide should succeed: {}",
+        String::from_utf8_lossy(&output.stderr)
+    );
+
+    let json: serde_json::Value = serde_json::from_slice(&output.stdout).expect("stdout is not valid JSON");
+    assert!(json.get("result").is_some(), "missing 'result' key");
+    assert!(json.get("extraction_time_ms").is_some(), "missing 'extraction_time_ms'");
+}
--- a/crates/kreuzberg-cli/tests/server_test.rs
+++ b/crates/kreuzberg-cli/tests/server_test.rs
@@ -0,0 +1,153 @@
+//! Integration tests for server commands (serve and mcp).
+
+#[cfg(not(coverage))]
+use std::process::{Command, Stdio};
+#[cfg(not(coverage))]
+use std::thread;
+#[cfg(not(coverage))]
+use std::time::Duration;
+
+/// Smoke test for `kreuzberg serve`: starts the server on 127.0.0.1:18000 and probes the
+/// `/health` and `/info` endpoints.
+///
+/// Marked `#[ignore]`, so it only runs when ignored tests are requested explicitly.
+#[cfg(not(coverage))]
+#[test]
+#[ignore]
+fn test_serve_command_starts() {
+    /// Kills and reaps the spawned server when dropped, so a failing assertion cannot leave
+    /// a server process behind that keeps the port occupied.
+    struct ServerGuard(std::process::Child);
+
+    impl Drop for ServerGuard {
+        fn drop(&mut self) {
+            // Best effort: the process may already have exited on its own.
+            let _ = self.0.kill();
+            let _ = self.0.wait();
+        }
+    }
+
+    let status = Command::new("cargo")
+        .args(["build", "--bin", "kreuzberg", "--features", "all"])
+        .status()
+        .expect("Failed to build binary");
+
+    assert!(status.success(), "Failed to build kreuzberg binary");
+
+    // Resolve the binary from Cargo's target directory rather than the current working
+    // directory. `CARGO_TARGET_TMPDIR` lives inside the target directory, next to `debug/`.
+    let binary_path = std::path::Path::new(env!("CARGO_TARGET_TMPDIR"))
+        .parent()
+        .expect("CARGO_TARGET_TMPDIR should have a parent directory")
+        .join("debug")
+        .join("kreuzberg");
+
+    // The server's output is never read here, so discard it instead of piping it: an
+    // undrained pipe can stall the child once its buffer fills up.
+    let _server = ServerGuard(
+        Command::new(&binary_path)
+            .args(["serve", "-H", "127.0.0.1", "-p", "18000"])
+            .stdout(Stdio::null())
+            .stderr(Stdio::null())
+            .spawn()
+            .expect("Failed to start server"),
+    );
+
+    // Give the server time to bind its listener before probing it.
+    thread::sleep(Duration::from_secs(3));
+
+    let mut health_response = ureq::get("http://127.0.0.1:18000/health")
+        .call()
+        .expect("Failed to call health endpoint");
+
+    assert_eq!(health_response.status(), 200);
+
+    let health_json: serde_json::Value = health_response
+        .body_mut()
+        .read_json()
+        .expect("Failed to parse health response");
+
+    assert_eq!(health_json["status"], "healthy");
+    assert!(health_json["version"].is_string());
+
+    let mut info_response = ureq::get("http://127.0.0.1:18000/info")
+        .call()
+        .expect("Failed to call info endpoint");
+
+    assert_eq!(info_response.status(), 200);
+
+    let info_json: serde_json::Value = info_response
+        .body_mut()
+        .read_json()
+        .expect("Failed to parse info response");
+
+    assert!(info_json["rust_backend"].as_bool().unwrap_or(false));
+
+    // `_server` is dropped here, which stops and reaps the server.
+}
+
+/// Starts `kreuzberg serve` on 127.0.0.1:18001 with a TOML config passed via `-c` and checks
+/// that `/health` answers.
+///
+/// Marked `#[ignore]`, so it only runs when ignored tests are requested explicitly.
+#[cfg(not(coverage))]
+#[test]
+#[ignore]
+fn test_serve_command_with_config() {
+    /// Stops the server (if one was started) and deletes the config file when dropped, so
+    /// neither outlives a failing assertion.
+    struct Cleanup {
+        server: Option<std::process::Child>,
+        config_path: std::path::PathBuf,
+    }
+
+    impl Drop for Cleanup {
+        fn drop(&mut self) {
+            if let Some(mut server) = self.server.take() {
+                // Best effort: the process may already have exited on its own.
+                let _ = server.kill();
+                let _ = server.wait();
+            }
+            let _ = std::fs::remove_file(&self.config_path);
+        }
+    }
+
+    // Build here as well so the test does not depend on another test having run first.
+    let status = Command::new("cargo")
+        .args(["build", "--bin", "kreuzberg", "--features", "all"])
+        .status()
+        .expect("Failed to build binary");
+
+    assert!(status.success(), "Failed to build kreuzberg binary");
+
+    let config_content = r#"
+use_cache = true
+enable_quality_processing = true
+
+[ocr]
+backend = "tesseract"
+language = "eng"
+"#;
+
+    // `CARGO_TARGET_TMPDIR` is Cargo's scratch directory for integration tests; it lives
+    // inside the target directory, next to `debug/`. Keeping the config there avoids
+    // littering the working directory.
+    let scratch_dir = std::path::Path::new(env!("CARGO_TARGET_TMPDIR"));
+    let binary_path = scratch_dir
+        .parent()
+        .expect("CARGO_TARGET_TMPDIR should have a parent directory")
+        .join("debug")
+        .join("kreuzberg");
+
+    let mut cleanup = Cleanup {
+        server: None,
+        config_path: scratch_dir.join("serve_test_config.toml"),
+    };
+
+    std::fs::write(&cleanup.config_path, config_content).expect("Failed to write test config");
+
+    // The server's output is never read here, so discard it instead of piping it: an
+    // undrained pipe can stall the child once its buffer fills up.
+    cleanup.server = Some(
+        Command::new(&binary_path)
+            .args(["serve", "-H", "127.0.0.1", "-p", "18001", "-c"])
+            .arg(&cleanup.config_path)
+            .stdout(Stdio::null())
+            .stderr(Stdio::null())
+            .spawn()
+            .expect("Failed to start server"),
+    );
+
+    // Give the server time to bind its listener before probing it.
+    thread::sleep(Duration::from_secs(3));
+
+    let health_response = ureq::get("http://127.0.0.1:18001/health").call();
+
+    assert!(health_response.is_ok(), "Server should be running with custom config");
+
+    // `cleanup` is dropped here: the server is stopped and the config file removed.
+}
+
+/// `kreuzberg serve --help` must succeed and describe the `--host`, `--port` and `--config`
+/// options.
+#[cfg(not(coverage))]
+#[test]
+fn test_serve_command_help() {
+    let build_status = Command::new("cargo")
+        .args(["build", "--bin", "kreuzberg", "--features", "all"])
+        .status()
+        .expect("Failed to build binary");
+
+    assert!(build_status.success(), "Failed to build kreuzberg binary");
+
+    // `CARGO_TARGET_TMPDIR` lives inside the target directory, next to `debug/`, so its
+    // parent is the directory to search. Walking the path this way avoids cutting the string
+    // at the first "target" substring, which breaks when a parent directory name contains
+    // that word or the target directory is named differently.
+    let binary_path = std::path::Path::new(env!("CARGO_TARGET_TMPDIR"))
+        .parent()
+        .expect("CARGO_TARGET_TMPDIR should have a parent directory")
+        .join("debug")
+        .join("kreuzberg");
+
+    let output = Command::new(&binary_path)
+        .args(["serve", "--help"])
+        .output()
+        .expect("Failed to execute command");
+
+    assert!(output.status.success());
+
+    let stdout = String::from_utf8_lossy(&output.stdout);
+    assert!(stdout.contains("Start the API server"));
+    assert!(stdout.contains("--host"));
+    assert!(stdout.contains("--port"));
+    assert!(stdout.contains("--config"));
+}
+
+/// `kreuzberg mcp --help` must succeed and describe the MCP server and its `--config` option.
+#[cfg(not(coverage))]
+#[test]
+fn test_mcp_command_help() {
+    let build_status = Command::new("cargo")
+        .args(["build", "--bin", "kreuzberg", "--features", "all"])
+        .status()
+        .expect("Failed to build binary");
+
+    assert!(build_status.success(), "Failed to build kreuzberg binary");
+
+    // `CARGO_TARGET_TMPDIR` lives inside the target directory, next to `debug/`, so its
+    // parent is the directory to search. Walking the path this way avoids cutting the string
+    // at the first "target" substring, which breaks when a parent directory name contains
+    // that word or the target directory is named differently.
+    let binary_path = std::path::Path::new(env!("CARGO_TARGET_TMPDIR"))
+        .parent()
+        .expect("CARGO_TARGET_TMPDIR should have a parent directory")
+        .join("debug")
+        .join("kreuzberg");
+
+    let output = Command::new(&binary_path)
+        .args(["mcp", "--help"])
+        .output()
+        .expect("Failed to execute command");
+
+    assert!(output.status.success());
+
+    let stdout = String::from_utf8_lossy(&output.stdout);
+    assert!(stdout.contains("Start the MCP (Model Context Protocol) server"));
+    assert!(stdout.contains("--config"));
+}
--- a/crates/kreuzberg-ffi/Cargo.toml
+++ b/crates/kreuzberg-ffi/Cargo.toml
@@ -0,0 +1,46 @@
+[package]
+name = "kreuzberg-ffi"
+version = "5.0.0-rc.3"
+edition = "2021"
+license = "Elastic-2.0"
+description = "High-performance document intelligence library"
+readme = false
+keywords = ["document", "extraction", "ocr", "pdf", "text"]
+categories = ["text-processing"]
+repository = "https://github.com/kreuzberg-dev/kreuzberg"
+
+# `serde_json`, `ahash`, and `tokio` are emitted unconditionally in `[dependencies]` below so
+# the manifest is stable across regenerations (and so the C FFI codegen can pull them in
+# when an async / Result-typed function appears in the API surface), but for
+# umbrella crates with no async fns and no JSON-marshalled return types they
+# are genuinely unused. The conditional `async-trait` / `futures-util` deps
+# are similarly flagged when the umbrella has trait-bridge / streaming adapters
+# configured but no actual async-trait / async-stream callsite in the generated
+# FFI shim.
+[package.metadata.cargo-machete]
+ignored = ["ahash", "serde_json", "tokio", "async-trait"]
+
+[lib]
+crate-type = ["cdylib", "staticlib", "rlib"]
+
+[features]
+default = []
+
+[dependencies]
+ahash = "0.8"
+async-trait = "0.1"
+serde_json = "1"
+tokio = { version = "1", features = ["full"] }
+
+[target.'cfg(not(all(target_os = "android", target_arch = "x86_64")))'.dependencies]
+kreuzberg = { path = "../kreuzberg", version = "5.0.0-rc.3", features = ["full", "pdf", "ocr", "paddle-ocr", "paddle-ocr-types", "layout-detection", "layout-types", "embeddings", "embedding-presets", "chunking", "keywords-yake", "keywords-rake", "language-detection", "html", "tree-sitter", "office", "email", "archives", "stopwords", "auto-rotate", "auto-rotate-types", "tokio-runtime", "api", "mcp", "liter-llm", "quality"] }
+
+[target.'cfg(all(target_os = "android", target_arch = "x86_64"))'.dependencies]
+kreuzberg = { path = "../kreuzberg", version = "5.0.0-rc.3", features = ["android-target"] }
+
+
+[build-dependencies]
+cbindgen = "0.29"
+
+[dev-dependencies]
+tempfile = "3"
--- a/crates/kreuzberg-ffi/README.md
+++ b/crates/kreuzberg-ffi/README.md
@@ -0,0 +1,295 @@
+# FFI (C/C++)
+
+<div align="center" style="display: flex; flex-wrap: wrap; gap: 8px; justify-content: center; margin: 20px 0;">
+  <a href="https://github.com/kreuzberg-dev/alef">
+    <img src="https://img.shields.io/badge/Bindings-alef%20%D7%90-007ec6" alt="Bindings">
+  </a>
+  <!-- Language Bindings -->
+  <a href="https://crates.io/crates/kreuzberg">
+    <img src="https://img.shields.io/crates/v/kreuzberg?label=Rust&color=007ec6" alt="Rust">
+  </a>
+  <a href="https://pypi.org/project/kreuzberg/">
+    <img src="https://img.shields.io/pypi/v/kreuzberg?label=Python&color=007ec6" alt="Python">
+  </a>
+  <a href="https://www.npmjs.com/package/@kreuzberg/node">
+    <img src="https://img.shields.io/npm/v/@kreuzberg/node?label=Node.js&color=007ec6" alt="Node.js">
+  </a>
+  <a href="https://www.npmjs.com/package/@kreuzberg/wasm">
+    <img src="https://img.shields.io/npm/v/@kreuzberg/wasm?label=WASM&color=007ec6" alt="WASM">
+  </a>
+  <a href="https://central.sonatype.com/artifact/dev.kreuzberg/kreuzberg">
+    <img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
+  </a>
+  <a href="https://github.com/kreuzberg-dev/kreuzberg/tree/main/packages/go/v5">
+    <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v5*" alt="Go">
+  </a>
+  <a href="https://www.nuget.org/packages/Kreuzberg/">
+    <img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
+  </a>
+  <a href="https://packagist.org/packages/kreuzberg/kreuzberg">
+    <img src="https://img.shields.io/packagist/v/kreuzberg/kreuzberg?label=PHP&color=007ec6" alt="PHP">
+  </a>
+  <a href="https://rubygems.org/gems/kreuzberg">
+    <img src="https://img.shields.io/gem/v/kreuzberg?label=Ruby&color=007ec6" alt="Ruby">
+  </a>
+  <a href="https://hex.pm/packages/kreuzberg">
+    <img src="https://img.shields.io/hexpm/v/kreuzberg?label=Elixir&color=007ec6" alt="Elixir">
+  </a>
+  <a href="https://kreuzberg-dev.r-universe.dev/kreuzberg">
+    <img src="https://img.shields.io/badge/R-kreuzberg-007ec6" alt="R">
+  </a>
+  <a href="https://pub.dev/packages/kreuzberg">
+    <img src="https://img.shields.io/pub/v/kreuzberg?label=Dart&color=007ec6" alt="Dart">
+  </a>
+  <a href="https://central.sonatype.com/artifact/dev.kreuzberg/kreuzberg-android">
+    <img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg-android?label=Kotlin&color=007ec6" alt="Kotlin">
+  </a>
+  <a href="https://github.com/kreuzberg-dev/kreuzberg/tree/main/packages/swift">
+    <img src="https://img.shields.io/badge/Swift-SPM-007ec6" alt="Swift">
+  </a>
+  <a href="https://github.com/kreuzberg-dev/kreuzberg/tree/main/packages/zig">
+    <img src="https://img.shields.io/badge/Zig-package-007ec6" alt="Zig">
+  </a>
+  <a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
+    <img src="https://img.shields.io/badge/C-FFI-007ec6" alt="C FFI">
+  </a>
+  <a href="https://github.com/kreuzberg-dev/kreuzberg/pkgs/container/kreuzberg">
+    <img src="https://img.shields.io/badge/Docker-ghcr.io-007ec6?logo=docker&logoColor=white" alt="Docker">
+  </a>
+  <a href="https://github.com/kreuzberg-dev/kreuzberg/pkgs/container/charts%2Fkreuzberg">
+    <img src="https://img.shields.io/badge/Helm-ghcr.io-007ec6?logo=helm&logoColor=white" alt="Helm">
+  </a>
+
+  <!-- Project Info -->
+  <a href="https://github.com/kreuzberg-dev/kreuzberg/blob/main/LICENSE">
+    <img src="https://img.shields.io/badge/License-Elastic--2.0-007ec6" alt="License">
+  </a>
+  <a href="https://docs.kreuzberg.dev">
+    <img src="https://img.shields.io/badge/Docs-kreuzberg-007ec6" alt="Documentation">
+  </a>
+  <a href="https://huggingface.co/Kreuzberg">
+    <img src="https://img.shields.io/badge/Hugging%20Face-Kreuzberg-007ec6" alt="Hugging Face">
+  </a>
+</div>
+
+<div align="center" style="margin: 24px 0 0;">
+  <a href="https://kreuzberg.dev">
+    <img alt="Kreuzberg" src="https://github.com/user-attachments/assets/419fc06c-8313-4324-b159-4b4d3cfce5c0" />
+  </a>
+</div>
+
+<div align="center" style="display: flex; flex-wrap: wrap; gap: 12px; justify-content: center; margin: 28px 0 24px;">
+  <a href="https://discord.gg/xt9WY3GnKR">
+    <img height="22" src="https://img.shields.io/badge/Discord-Chat-007ec6?logo=discord&logoColor=white" alt="Join Discord">
+  </a>
+  <a href="https://docs.kreuzberg.dev/demo.html">
+    <img height="22" src="https://img.shields.io/badge/Live%20Demo-Open-007ec6?logo=webassembly&logoColor=white" alt="Live Demo">
+  </a>
+</div>
+
+Extract text, tables, images, and metadata from 90+ file formats (including PDF, Office documents, and images) and from source code in 300+ programming languages. These C/C++ FFI bindings provide a stable ABI for native integration, shared library distribution, and cross-language interop.
+
+## What This Package Provides
+
+- **Document intelligence core** — extract text, tables, images, metadata, entities, keywords, and code intelligence from one API.
+- **Format coverage** — PDF, Office, images, HTML/XML, email, archives, notebooks, citations, scientific formats, and plain text.
+- **OCR choices** — Tesseract, PaddleOCR, EasyOCR where supported, VLM OCR through liter-llm, and plugin hooks for custom backends.
+- **Same engine as every binding** — Rust, Python, Node.js, Go, Java, PHP, Ruby, .NET, Elixir, R, WASM, Kotlin Android, Swift, Dart, Zig, and C FFI share the same Rust implementation.
+- **C ABI** — stable shared library surface for custom hosts and secondary bindings.
+
+## Installation
+
+### Package Installation
+
+Build the shared library from the workspace:
+
+```bash
+cargo build --release -p kreuzberg-ffi
+```
+
+The built libraries are emitted under `target/release/` (`libkreuzberg_ffi.{so,dylib,a}`), and the build script writes the C header to `crates/kreuzberg-ffi/include/kreuzberg.h`.
+
+### System Requirements
+- A C/C++ toolchain (clang, gcc, or MSVC) and a Rust toolchain (`rustup`) for building from source
+- A `pkg-config` or CMake-aware build system that can locate `libkreuzberg_ffi` and `kreuzberg.h`
+- Optional: [ONNX Runtime](https://github.com/microsoft/onnxruntime/releases) version 1.22.x for embeddings support
+- Optional: [Tesseract OCR](https://github.com/tesseract-ocr/tesseract) for OCR functionality
+
+## Quick Start
+
+### Basic Extraction
+
+Extract text, metadata, and structure from any supported document format:
+
+<!-- snippet not found:  -->
+
+### Common Use Cases
+
+#### Extract with Custom Configuration
+
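+Most use cases benefit from configuration to control extraction behavior. See the [Configuration Guide](https://docs.kreuzberg.dev/guides/configuration/) for the available options.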
+
+#### Table Extraction
+
+See [Configuration Guide](https://docs.kreuzberg.dev/guides/configuration/) for table extraction options.
+
+#### Processing Multiple Files
+
+### Next Steps
+
+- **[Installation Guide](https://docs.kreuzberg.dev/getting-started/installation/)** - Platform-specific setup
+- **[API Documentation](https://docs.kreuzberg.dev/reference/api-python/)** - Complete API reference
+- **[Examples & Guides](https://docs.kreuzberg.dev/)** - Full code examples and usage guides
+- **[Configuration Guide](https://docs.kreuzberg.dev/guides/configuration/)** - Advanced configuration options
+
+## Features
+
+### Supported File Formats (90+)
+
+90+ file formats across 8 major categories with intelligent format detection and comprehensive metadata extraction.
+
+#### Office Documents
+
+| Category | Formats | Capabilities |
+|----------|---------|--------------|
+| **Word Processing** | `.docx`, `.docm`, `.dotx`, `.dotm`, `.dot`, `.odt` | Full text, tables, images, metadata, styles |
+| **Spreadsheets** | `.xlsx`, `.xlsm`, `.xlsb`, `.xls`, `.xla`, `.xlam`, `.xltm`, `.xltx`, `.xlt`, `.ods` | Sheet data, formulas, cell metadata, charts |
+| **Presentations** | `.pptx`, `.pptm`, `.ppsx`, `.potx`, `.potm`, `.pot`, `.ppt` | Slides, speaker notes, images, metadata |
+| **PDF** | `.pdf` | Text, tables, images, metadata, OCR support |
+| **eBooks** | `.epub`, `.fb2` | Chapters, metadata, embedded resources |
+| **Database** | `.dbf` | Table data extraction, field type support |
+| **Hangul** | `.hwp`, `.hwpx` | Korean document format, text extraction |
+
+#### Images (OCR-Enabled)
+
+| Category | Formats | Features |
+|----------|---------|----------|
+| **Raster** | `.png`, `.jpg`, `.jpeg`, `.gif`, `.webp`, `.bmp`, `.tiff`, `.tif` | OCR, table detection, EXIF metadata, dimensions, color space |
+| **Advanced** | `.jp2`, `.jpx`, `.jpm`, `.mj2`, `.jbig2`, `.jb2`, `.pnm`, `.pbm`, `.pgm`, `.ppm` | OCR via hayro-jpeg2000 (pure Rust decoder), JBIG2 support, table detection, format-specific metadata |
+| **Vector** | `.svg` | DOM parsing, embedded text, graphics metadata |
+
+#### Web & Data
+
+| Category | Formats | Features |
+|----------|---------|----------|
+| **Markup** | `.html`, `.htm`, `.xhtml`, `.xml`, `.svg` | DOM parsing, metadata (Open Graph, Twitter Card), link extraction |
+| **Structured Data** | `.json`, `.yaml`, `.yml`, `.toml`, `.csv`, `.tsv` | Schema detection, nested structures, validation |
+| **Text & Markdown** | `.txt`, `.md`, `.markdown`, `.djot`, `.rst`, `.org`, `.rtf` | CommonMark, GFM, Djot, reStructuredText, Org Mode |
+
+#### Email & Archives
+
+| Category | Formats | Features |
+|----------|---------|----------|
+| **Email** | `.eml`, `.msg` | Headers, body (HTML/plain), attachments, threading |
+| **Archives** | `.zip`, `.tar`, `.tgz`, `.gz`, `.7z` | File listing, nested archives, metadata |
+
+#### Academic & Scientific
+
+| Category | Formats | Features |
+|----------|---------|----------|
+| **Citations** | `.bib`, `.biblatex`, `.ris`, `.nbib`, `.enw`, `.csl` | Structured parsing: RIS (structured), PubMed/MEDLINE, EndNote XML (structured), BibTeX, CSL JSON |
+| **Scientific** | `.tex`, `.latex`, `.typst`, `.jats`, `.ipynb`, `.docbook` | LaTeX, Jupyter notebooks, PubMed JATS |
+| **Documentation** | `.opml`, `.pod`, `.mdoc`, `.troff` | Technical documentation formats |
+
+#### Code Intelligence (300+ Languages)
+
+| Feature | Description |
+|---------|-------------|
+| **Structure Extraction** | Functions, classes, methods, structs, interfaces, enums |
+| **Import/Export Analysis** | Module dependencies, re-exports, wildcard imports |
+| **Symbol Extraction** | Variables, constants, type aliases, properties |
+| **Docstring Parsing** | Google, NumPy, Sphinx, JSDoc, RustDoc, and 10+ other formats |
+| **Diagnostics** | Parse errors with line/column positions |
+| **Syntax-Aware Chunking** | Split code by semantic boundaries, not arbitrary byte offsets |
+
+Powered by [tree-sitter-language-pack](https://github.com/kreuzberg-dev/tree-sitter-language-pack) — [documentation](https://docs.tree-sitter-language-pack.kreuzberg.dev).
+
+**[Complete Format Reference](https://docs.kreuzberg.dev/reference/formats/)**
+
+### Key Capabilities
+
+- **Text Extraction** - Extract all text content with position and formatting information
+- **Metadata Extraction** - Retrieve document properties, creation date, author, etc.
+- **Table Extraction** - Parse tables with structure and cell content preservation
+- **Image Extraction** - Extract embedded images and render page previews
+- **OCR Support** - Integrate multiple OCR backends for scanned documents
+- **Async/Await** - Non-blocking document processing with concurrent operations
+- **Plugin System** - Extensible post-processing for custom text transformation
+- **Embeddings** - Generate vector embeddings using ONNX Runtime models
+- **Batch Processing** - Efficiently process multiple documents in parallel
+- **Memory Efficient** - Stream large files without loading entirely into memory
+- **Language Detection** - Detect and support multiple languages in documents
+- **Code Intelligence** - Extract structure, imports, exports, symbols, and docstrings from [300+ programming languages](https://docs.tree-sitter-language-pack.kreuzberg.dev) via tree-sitter
+- **Configuration** - Fine-grained control over extraction behavior
+
+### Performance Characteristics
+
+| Format | Speed | Memory | Notes |
+|--------|-------|--------|-------|
+| **PDF (text)** | 10-100 MB/s | ~50MB per doc | Fastest extraction |
+| **Office docs** | 20-200 MB/s | ~100MB per doc | DOCX, XLSX, PPTX |
+| **Images (OCR)** | 1-5 MB/s | Variable | Depends on OCR backend |
+| **Archives** | 5-50 MB/s | ~200MB per doc | ZIP, TAR, etc. |
+| **Web formats** | 50-200 MB/s | Streaming | HTML, XML, JSON |
+
+## OCR Support
+
+Kreuzberg supports multiple OCR backends for extracting text from scanned documents and images, including Tesseract and PaddleOCR.
+
+### OCR Configuration Example
+
+<!-- snippet not found:  -->
+
+## Async Support
+
+This binding provides full async/await support for non-blocking document processing:
+
+<!-- snippet not found:  -->
+
+## Plugin System
+
+Kreuzberg supports extensible post-processing plugins for custom text transformation and filtering.
+
+For detailed plugin documentation, visit [Plugin System Guide](https://docs.kreuzberg.dev/guides/plugins/).
+
+## Embeddings Support
+
+Generate vector embeddings for extracted text using the built-in ONNX Runtime support. Requires ONNX Runtime installation.
+
+**[Embeddings Guide](https://docs.kreuzberg.dev/features/#embeddings)**
+
+## Configuration
+
+For advanced configuration options including language detection, table extraction, OCR settings, and more:
+
+**[Configuration Guide](https://docs.kreuzberg.dev/guides/configuration/)**
+
+## Documentation
+
+- **[Official Documentation](https://docs.kreuzberg.dev/)**
+- **[API Reference](https://docs.kreuzberg.dev/reference/api-python/)**
+- **[Examples & Guides](https://docs.kreuzberg.dev/)**
+
+## Contributing
+
+Contributions are welcome! See [Contributing Guide](https://github.com/kreuzberg-dev/kreuzberg/blob/main/CONTRIBUTING.md).
+
+## Part of Kreuzberg.dev
+
+- [Kreuzberg Cloud](https://github.com/kreuzberg-dev/kreuzberg-cloud) — managed extraction API with SDKs, dashboards, and observability.
+- [kreuzcrawl](https://github.com/kreuzberg-dev/kreuzcrawl) — web crawling and scraping with HTML→Markdown and headless-Chrome fallback.
+- [html-to-markdown](https://github.com/kreuzberg-dev/html-to-markdown) — fast, lossless HTML→Markdown engine.
+- [liter-llm](https://github.com/kreuzberg-dev/liter-llm) — universal LLM API client with native bindings for 14 languages and 143 providers.
+- [tree-sitter-language-pack](https://github.com/kreuzberg-dev/tree-sitter-language-pack) — tree-sitter grammars and code-intelligence primitives.
+- [alef](https://github.com/kreuzberg-dev/alef) — the polyglot binding generator that produces this README and all per-language bindings.
+- [Discord](https://discord.gg/xt9WY3GnKR) — community, roadmap, announcements.
+
+## License
+
+Elastic-2.0 License — see [LICENSE](../../LICENSE) for details.
+
+## Support
+
+- **Discord Community**: [Join our Discord](https://discord.gg/xt9WY3GnKR)
+- **GitHub Issues**: [Report bugs](https://github.com/kreuzberg-dev/kreuzberg/issues)
+- **Discussions**: [Ask questions](https://github.com/kreuzberg-dev/kreuzberg/discussions)
--- a/crates/kreuzberg-ffi/build.rs
+++ b/crates/kreuzberg-ffi/build.rs
@@ -0,0 +1,23 @@
+// This file is auto-generated by alef. DO NOT EDIT.
+// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
+/// Build script: generates the C header with cbindgen, sets a relocatable install name for
+/// the macOS cdylib, and copies the header into the Go package's include directory.
+///
+/// Panics if `CARGO_MANIFEST_DIR` is unset, if cbindgen fails, or if the Go include directory
+/// cannot be created or written to.
+fn main() {
+    // cbindgen scans the crate rooted at the manifest directory.
+    let crate_dir = std::env::var("CARGO_MANIFEST_DIR").unwrap();
+    // The output path is relative, so it is resolved against the script's working directory.
+    // NOTE(review): presumably the package root, where Cargo runs build scripts — confirm.
+    cbindgen::generate(crate_dir)
+        .expect("Unable to generate C bindings")
+        .write_to_file("include/kreuzberg.h");
+
+    // Set @rpath-relative install_name on macOS so the cdylib can be relocated
+    // (bundled into language packages like packages/go/.lib/<rid>/, packages/
+    // java/src/main/resources/natives/<rid>/, etc.) and located via the consumer
+    // binary's rpath at runtime. Without this, the install_name embeds the CI
+    // runner build path (`/Users/runner/work/.../target/.../deps/lib<name>.dylib`)
+    // and dyld fails to load the bundled copy from its actual location.
+    if std::env::var("CARGO_CFG_TARGET_OS").as_deref() == Ok("macos") {
+        println!("cargo:rustc-link-arg-cdylib=-Wl,-install_name,@rpath/libkreuzberg_ffi.dylib");
+    }
+
+    // Mirror the generated header next to the Go bindings.
+    // NOTE(review): relative to `crates/kreuzberg-ffi` this path climbs three levels, i.e. one
+    // level above the directory that contains `crates/` — confirm that is where
+    // `packages/go/v5` lives in this checkout.
+    // NOTE(review): both steps are fatal, so the build fails wherever that location is not
+    // writable — confirm this is intended outside the monorepo.
+    let go_include_dir = std::path::Path::new("../../../packages/go/v5/include");
+    std::fs::create_dir_all(go_include_dir).expect("Unable to create Go include directory");
+    std::fs::copy("include/kreuzberg.h", go_include_dir.join("kreuzberg.h"))
+        .expect("Unable to copy header to Go include directory");
+}
--- a/crates/kreuzberg-ffi/cbindgen.toml
+++ b/crates/kreuzberg-ffi/cbindgen.toml
--- a/crates/kreuzberg-ffi/cmake/kreuzberg-ffi-config.cmake
+++ b/crates/kreuzberg-ffi/cmake/kreuzberg-ffi-config.cmake
@@ -0,0 +1,87 @@
+# kreuzberg-ffi CMake config-mode find module
+#
+# Defines the imported target:
+#   kreuzberg-ffi::kreuzberg-ffi
+#
+# Usage:
+#   find_package(kreuzberg-ffi REQUIRED)
+#   target_link_libraries(myapp PRIVATE kreuzberg-ffi::kreuzberg-ffi)
+
+if(TARGET kreuzberg-ffi::kreuzberg-ffi)
+  return()
+endif()
+
+get_filename_component(_FFI_CMAKE_DIR "${CMAKE_CURRENT_LIST_FILE}" PATH)
+get_filename_component(_FFI_PREFIX "${_FFI_CMAKE_DIR}/.." ABSOLUTE)
+
+find_library(_FFI_LIBRARY
+  NAMES kreuzberg_ffi libkreuzberg_ffi
+  PATHS "${_FFI_PREFIX}/lib"
+  NO_DEFAULT_PATH
+)
+if(NOT _FFI_LIBRARY)
+  find_library(_FFI_LIBRARY NAMES kreuzberg_ffi libkreuzberg_ffi)
+endif()
+
+find_path(_FFI_INCLUDE_DIR
+  NAMES kreuzberg.h
+  PATHS "${_FFI_PREFIX}/include"
+  NO_DEFAULT_PATH
+)
+if(NOT _FFI_INCLUDE_DIR)
+  find_path(_FFI_INCLUDE_DIR NAMES kreuzberg.h)
+endif()
+
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(kreuzberg-ffi
+  REQUIRED_VARS _FFI_LIBRARY _FFI_INCLUDE_DIR
+)
+
+# find_package_handle_standard_args() sets <PackageName>_FOUND using the exact
+# name it was given (`kreuzberg-ffi`, with a hyphen), so test that variable.
+if(kreuzberg-ffi_FOUND)
+  # Infer the imported library type from the file extension of the found library.
+  set(_FFI_LIB_TYPE UNKNOWN)
+  if(_FFI_LIBRARY MATCHES "\\.(dylib|so)$" OR _FFI_LIBRARY MATCHES "\\.so\\.")
+    set(_FFI_LIB_TYPE SHARED)
+  elseif(_FFI_LIBRARY MATCHES "\\.dll$")
+    set(_FFI_LIB_TYPE SHARED)
+  elseif(_FFI_LIBRARY MATCHES "\\.(a|lib)$")
+    set(_FFI_LIB_TYPE STATIC)
+  endif()
+
+  add_library(kreuzberg-ffi::kreuzberg-ffi ${_FFI_LIB_TYPE} IMPORTED)
+  set_target_properties(kreuzberg-ffi::kreuzberg-ffi PROPERTIES
+    IMPORTED_LOCATION "${_FFI_LIBRARY}"
+    INTERFACE_INCLUDE_DIRECTORIES "${_FFI_INCLUDE_DIR}"
+  )
+
+  # On Windows a shared library needs both the DLL (runtime) and the import library.
+  if(WIN32 AND _FFI_LIB_TYPE STREQUAL "SHARED")
+    find_file(_FFI_DLL
+      NAMES kreuzberg_ffi.dll libkreuzberg_ffi.dll
+      PATHS "${_FFI_PREFIX}/bin" "${_FFI_PREFIX}/lib"
+      NO_DEFAULT_PATH
+    )
+    if(_FFI_DLL)
+      set_target_properties(kreuzberg-ffi::kreuzberg-ffi PROPERTIES
+        IMPORTED_LOCATION "${_FFI_DLL}"
+        IMPORTED_IMPLIB "${_FFI_LIBRARY}"
+      )
+    endif()
+    unset(_FFI_DLL CACHE)
+  endif()
+
+  # Platform system libraries that consumers must link alongside the FFI library.
+  if(APPLE)
+    set_property(TARGET kreuzberg-ffi::kreuzberg-ffi APPEND PROPERTY
+      INTERFACE_LINK_LIBRARIES "-framework CoreFoundation" "-framework Security" pthread)
+  elseif(UNIX)
+    set_property(TARGET kreuzberg-ffi::kreuzberg-ffi APPEND PROPERTY
+      INTERFACE_LINK_LIBRARIES pthread dl m)
+  elseif(WIN32)
+    set_property(TARGET kreuzberg-ffi::kreuzberg-ffi APPEND PROPERTY
+      INTERFACE_LINK_LIBRARIES ws2_32 userenv bcrypt)
+  endif()
+
+  unset(_FFI_LIB_TYPE)
+endif()
+
+mark_as_advanced(_FFI_LIBRARY _FFI_INCLUDE_DIR)
+unset(_FFI_CMAKE_DIR)
+unset(_FFI_PREFIX)
--- a/crates/kreuzberg-ffi/include/kreuzberg.h
+++ b/crates/kreuzberg-ffi/include/kreuzberg.h
--- a/crates/kreuzberg-ffi/src/lib.rs
+++ b/crates/kreuzberg-ffi/src/lib.rs
--- a/crates/kreuzberg-ffi/tests/email_attachment_data_len.rs
+++ b/crates/kreuzberg-ffi/tests/email_attachment_data_len.rs
@@ -0,0 +1,177 @@
+/// Regression test for GitHub #1059.
+///
+/// `kreuzberg_email_attachment_data` was the only byte-buffer accessor on a public
+/// FFI-exposed DTO that did not follow the established `*_data(ptr, out_len: *mut usize)`
+/// protocol used by `kreuzberg_extracted_image_data`, `kreuzberg_embedded_file_data`,
+/// and `kreuzberg_batch_bytes_item_content`.
+///
+/// Because `EmailAttachment.data` is `Option<Bytes>` (the only optional byte buffer among
+/// public types), alef's heuristic for emitting the two-parameter form did not trigger.
+/// Callers had no way to know the valid length of the returned pointer, making any read
+/// past the first byte undefined behaviour (especially for payloads containing 0x00).
+///
+/// The alef fix shipped with the 2-parameter form (`ptr`, `out_len`). These tests
+/// lock in the correct 2-param ABI and verify the full-length contract for payloads
+/// that contain embedded NUL bytes.
+///
+/// Per project rules: every unsafe block has a SAFETY comment.
+use std::ffi::{c_char, CString};
+use std::fs;
+use std::path::Path;
+
+use kreuzberg_ffi::{kreuzberg_email_attachment_free, kreuzberg_email_attachment_from_json, kreuzberg_last_error_code};
+
+/// Construct a minimal EmailAttachment JSON with a data payload that contains
+/// an embedded NUL and a trailing high byte (0xEF). This defeats any strlen-based
+/// or "read first byte only" implementations.
+fn attachment_json_with_nuls() -> CString {
+    // Fixed 8-byte payload: JPEG-like magic, a NUL at index 3, and a high byte
+    // (0xEF) at the end. Its length is known up front and reported as "size".
+    let payload: Vec<u8> = vec![0xFF, 0xD8, 0xFF, 0x00, 0xDE, 0xAD, 0xBE, 0xEF];
+    let payload_len = payload.len();
+    // Serialized as a JSON array of numbers for the "data" field.
+    let payload_json = serde_json::to_string(&payload).unwrap();
+
+    let document = format!(
+        r#"{{
+            "name": "test.bin",
+            "filename": "test.bin",
+            "mime_type": "application/octet-stream",
+            "size": {},
+            "is_image": false,
+            "data": {}
+        }}"#,
+        payload_len, payload_json
+    );
+
+    CString::new(document).expect("valid UTF-8 JSON for test attachment")
+}
+
+/// The committed C header must declare the 2-parameter form for
+/// `kreuzberg_email_attachment_data` (with `out_len`). This locks in the fix
+/// for GitHub #1059 so a future regeneration cannot silently revert to the
+/// 1-parameter form.
+#[test]
+fn email_attachment_data_accessor_must_provide_out_len_in_header() {
+    let header_path = Path::new(env!("CARGO_MANIFEST_DIR")).join("include/kreuzberg.h");
+    let header = fs::read_to_string(&header_path).expect("committed kreuzberg.h must be readable by the test");
+
+    // Scope the check to this function's own declaration. Other byte-buffer
+    // accessors follow the same `out_len` protocol, so a header-wide
+    // `contains("out_len")` would still pass if this one regressed to the
+    // 1-parameter form. A declaration is taken to run from the function name's
+    // opening parenthesis up to the terminating `;`.
+    let has_out_len = header
+        .match_indices("kreuzberg_email_attachment_data(")
+        .any(|(start, _)| {
+            let rest = &header[start..];
+            let end = rest.find(';').unwrap_or(rest.len());
+            rest[..end].contains("out_len")
+        });
+
+    assert!(
+        has_out_len,
+        "GitHub #1059 regression: the declaration of kreuzberg_email_attachment_data \
+         in crates/kreuzberg-ffi/include/kreuzberg.h does not contain the required \
+         `out_len` parameter.\n\n\
+         Expected something like:\n    uint8_t *kreuzberg_email_attachment_data(..., uintptr_t *out_len);\n\n\
+         Found the old 1-parameter form. Fix requires `task alef:generate` with an \
+         updated alef that handles Option<Bytes> fields for the FFI byte accessor heuristic.\n\n\
+         This is the lock-in test for #1059."
+    );
+}
+
+/// When an attachment has no data payload the accessor must return a null pointer
+/// and write 0 to out_len.
+#[test]
+fn email_attachment_data_none_returns_null_pointer() {
+    // Attachment JSON whose "data" field is explicitly null (no payload).
+    let json = CString::new(
+        r#"{"name":"empty","filename":"empty","mime_type":null,"size":null,"is_image":false,"data":null}"#,
+    )
+    .unwrap();
+
+    // SAFETY: json is valid null-terminated UTF-8.
+    let handle = unsafe { kreuzberg_email_attachment_from_json(json.as_ptr() as *const c_char) };
+    assert!(
+        !handle.is_null(),
+        "from_json should succeed (last_error_code={})",
+        // SAFETY: no precondition; reads a thread-local.
+        unsafe { kreuzberg_last_error_code() }
+    );
+
+    // Seed with a sentinel so the `0` assertion below proves the accessor actually
+    // wrote to out_len rather than leaving it untouched.
+    let mut out_len: usize = usize::MAX;
+    // SAFETY: handle is a valid non-null pointer returned by from_json;
+    // out_len is a valid stack-allocated usize.
+    let data_ptr = unsafe { kreuzberg_ffi::kreuzberg_email_attachment_data(handle, &mut out_len) };
+
+    assert!(
+        data_ptr.is_null(),
+        "data must be null when the attachment has no payload"
+    );
+    assert_eq!(out_len, 0, "out_len must be 0 when data is None");
+
+    // SAFETY: handle came from from_json; we are the sole owner.
+    unsafe { kreuzberg_email_attachment_free(handle) };
+}
+
+/// When an attachment carries a binary payload the accessor must return a non-null
+/// pointer and write the exact byte count — including bytes past any embedded NUL —
+/// to out_len. This is the core contract broken by the 1-parameter bug (#1059).
+#[test]
+fn email_attachment_data_with_out_len_returns_full_buffer_including_embedded_nuls() {
+    let json = attachment_json_with_nuls();
+    // SAFETY: json is a valid null-terminated CString we just created.
+    let handle = unsafe { kreuzberg_email_attachment_from_json(json.as_ptr() as *const c_char) };
+    assert!(
+        !handle.is_null(),
+        "from_json should succeed for our well-formed test attachment (last_error_code={})",
+        // SAFETY: no precondition; reads a thread-local.
+        unsafe { kreuzberg_last_error_code() }
+    );
+
+    // Starts at 0, so the `8` assertion below can only pass if the accessor wrote
+    // the real length.
+    let mut out_len: usize = 0;
+
+    // SAFETY: handle is non-null and freshly allocated by from_json;
+    // out_len is a valid stack-allocated usize. The returned pointer must not
+    // be freed by us — it borrows the internal Bytes of the handle.
+    let data_ptr = unsafe { kreuzberg_ffi::kreuzberg_email_attachment_data(handle, &mut out_len) };
+
+    assert!(
+        !data_ptr.is_null(),
+        "data pointer must be non-null for an attachment we created with a Some(data) payload"
+    );
+    assert_eq!(
+        out_len, 8,
+        "out_len must report the exact length of the Bytes payload (not 0, not guessed, not truncated at NUL)"
+    );
+
+    // SAFETY: data_ptr is valid for [0..out_len] because:
+    // - it came from the handle's internal Bytes (which we control),
+    // - out_len was written by the accessor,
+    // - the handle is still alive (we have not called free yet).
+    let slice = unsafe { std::slice::from_raw_parts(data_ptr, out_len) };
+
+    // The indices below mirror the payload built by `attachment_json_with_nuls`:
+    // 0xFF first, the NUL at index 3, and 0xEF as the final byte (index 7).
+    assert_eq!(slice.len(), 8);
+    assert_eq!(slice[0], 0xFF);
+    assert_eq!(slice[3], 0x00, "must be able to read the embedded NUL");
+    assert_eq!(
+        slice[7], 0xEF,
+        "must be able to read bytes after the NUL (no truncation)"
+    );
+
+    // Cleanup
+    // SAFETY: handle came from from_json; we are the owner.
+    unsafe { kreuzberg_email_attachment_free(handle) };
+}
+
+/// Verify that passing a null out_len pointer is safe: the accessor must not
+/// segfault, and the data pointer itself must still be returned.
+#[test]
+fn email_attachment_data_null_out_len_is_safe() {
+    // Four-byte payload [65, 0, 66, 67] with an embedded NUL; this test only checks
+    // that a pointer comes back, not the payload contents.
+    let json = CString::new(
+        r#"{"name":"hasdata.bin","filename":"hasdata.bin","mime_type":"application/octet-stream","size":4,"is_image":false,"data":[65,0,66,67]}"#,
+    )
+    .unwrap();
+
+    // SAFETY: json is a valid NUL-terminated CString that stays alive across this call.
+    let handle = unsafe { kreuzberg_email_attachment_from_json(json.as_ptr() as *const c_char) };
+    assert!(!handle.is_null());
+
+    // SAFETY: handle is valid; passing null for out_len is a defined contract
+    // (the accessor null-checks before writing).
+    let data_ptr = unsafe { kreuzberg_ffi::kreuzberg_email_attachment_data(handle, std::ptr::null_mut()) };
+
+    assert!(
+        !data_ptr.is_null(),
+        "data pointer should be non-null when the attachment carries a payload"
+    );
+
+    // SAFETY: handle from from_json; we are the owner.
+    unsafe { kreuzberg_email_attachment_free(handle) };
+}
--- a/crates/kreuzberg-ffi/tests/vtable_bytes_len.rs
+++ b/crates/kreuzberg-ffi/tests/vtable_bytes_len.rs
@@ -0,0 +1,204 @@
+/// Regression tests: vtable Bytes params carry companion length
+///
+/// The alef vtable generator previously emitted only `*const u8` for `&[u8]`
+/// trait-method parameters without a companion `{name}_len: usize`. Binary
+/// payloads contain embedded NUL bytes; read-until-NUL semantics silently
+/// truncated every real image or document buffer at the first `0x00`.
+///
+/// Fix shipped in alef ≥ v0.19.21 and is present in the generated FFI shim.
+/// These tests construct a vtable bridge directly, pass a buffer with an
+/// embedded NUL at a known offset, and assert the full buffer is received.
+///
+/// Per-test state is passed via `user_data` — no global statics — so tests
+/// are independent and can run in parallel without interfering.
+use kreuzberg_ffi::{
+    KreuzbergDocumentExtractorBridge, KreuzbergDocumentExtractorVTable, KreuzbergOcrBackendBridge,
+    KreuzbergOcrBackendVTable,
+};
+use std::sync::atomic::{AtomicU8, AtomicUsize, Ordering};
+
+// ── Per-test callback state ───────────────────────────────────────────────
+
+/// Observations recorded by a C callback stub and read back by the test that
+/// handed it over through `user_data`.
+struct CallbackState {
+    /// Byte count the callback was given.
+    received_len: AtomicUsize,
+    /// Final byte of the buffer the callback was given.
+    received_last_byte: AtomicU8,
+}
+
+impl CallbackState {
+    /// Creates a state with both observations zeroed.
+    fn new() -> Self {
+        let received_len = AtomicUsize::new(0);
+        let received_last_byte = AtomicU8::new(0);
+        Self {
+            received_len,
+            received_last_byte,
+        }
+    }
+}
+
+// ── C callback stubs ─────────────────────────────────────────────────────
+
+/// Stub `process_image` vtable callback.
+///
+/// Records the received buffer length and, for a non-empty buffer, its last
+/// byte in the `CallbackState` behind `user_data`. It then writes null to
+/// `*out_result`, a freshly allocated "stub" C string to `*out_error`, and
+/// returns 1.
+///
+/// # Safety
+///
+/// `user_data` must point to a live `CallbackState`; `image_bytes` must be
+/// readable for `image_bytes_len` bytes; `out_result` and `out_error` must be
+/// valid for writes.
+unsafe extern "C" fn ocr_process_image(
+    user_data: *const std::ffi::c_void,
+    image_bytes: *const u8,
+    image_bytes_len: usize,
+    _config: *const std::ffi::c_char,
+    out_result: *mut *mut std::ffi::c_char,
+    out_error: *mut *mut std::ffi::c_char,
+) -> i32 {
+    // SAFETY: user_data points to a CallbackState that the calling test keeps alive.
+    let state = unsafe { &*(user_data as *const CallbackState) };
+    state.received_len.store(image_bytes_len, Ordering::SeqCst);
+    if image_bytes_len > 0 {
+        // SAFETY: caller guarantees image_bytes[0..image_bytes_len] is valid.
+        let last = unsafe { *image_bytes.add(image_bytes_len - 1) };
+        state.received_last_byte.store(last, Ordering::SeqCst);
+    }
+    // SAFETY: assumes the bridge passes a valid, writable out_result pointer —
+    // TODO confirm against the generated shim.
+    unsafe { *out_result = std::ptr::null_mut() };
+    let msg = std::ffi::CString::new("stub").unwrap();
+    // SAFETY: caller owns out_error and will free it via kreuzberg_free_string.
+    unsafe { *out_error = msg.into_raw() };
+    // Non-zero status paired with an error string; presumably the bridge treats
+    // this as a failure — verify against the bridge implementation.
+    1
+}
+
+/// Stub `extract_bytes` vtable callback.
+///
+/// Records the received content length and, for a non-empty buffer, its last
+/// byte in the `CallbackState` behind `user_data`. It then writes null to
+/// `*out_result`, a freshly allocated "stub" C string to `*out_error`, and
+/// returns 1.
+///
+/// # Safety
+///
+/// `user_data` must point to a live `CallbackState`; `content` must be
+/// readable for `content_len` bytes; `out_result` and `out_error` must be
+/// valid for writes.
+unsafe extern "C" fn extractor_extract_bytes(
+    user_data: *const std::ffi::c_void,
+    content: *const u8,
+    content_len: usize,
+    _mime_type: *const std::ffi::c_char,
+    _config: *const std::ffi::c_char,
+    out_result: *mut *mut std::ffi::c_char,
+    out_error: *mut *mut std::ffi::c_char,
+) -> i32 {
+    // SAFETY: user_data points to a CallbackState that the calling test keeps alive.
+    let state = unsafe { &*(user_data as *const CallbackState) };
+    state.received_len.store(content_len, Ordering::SeqCst);
+    if content_len > 0 {
+        // SAFETY: caller guarantees content[0..content_len] is valid.
+        let last = unsafe { *content.add(content_len - 1) };
+        state.received_last_byte.store(last, Ordering::SeqCst);
+    }
+    // SAFETY: assumes the bridge passes a valid, writable out_result pointer —
+    // TODO confirm against the generated shim.
+    unsafe { *out_result = std::ptr::null_mut() };
+    let msg = std::ffi::CString::new("stub").unwrap();
+    // SAFETY: assumes out_error is a valid, writable pointer and that the caller
+    // takes ownership of the string released by into_raw — TODO confirm.
+    unsafe { *out_error = msg.into_raw() };
+    // Non-zero status paired with an error string; presumably the bridge treats
+    // this as a failure — verify against the bridge implementation.
+    1
+}
+
+// ── Tests ─────────────────────────────────────────────────────────────────
+
+/// OcrBackend.process_image must pass the full buffer length even when
+/// the payload contains embedded NUL bytes.
+#[tokio::test]
+async fn ocr_backend_vtable_process_image_passes_full_length_with_embedded_nuls() {
+    // 8-byte buffer; NUL at index 3. strlen-style reads would stop at 3.
+    let image_bytes: Vec<u8> = vec![0xFF, 0xD8, 0xFF, 0x00, 0xDE, 0xAD, 0xBE, 0xEF];
+
+    // The bridge only receives a raw pointer to the state; `state` itself stays
+    // owned by this test so the assertions below can read it back.
+    let state = Box::new(CallbackState::new());
+    let state_ptr = state.as_ref() as *const CallbackState as *const std::ffi::c_void;
+
+    // Only `process_image` is wired up; every other slot is left empty.
+    let vtable = KreuzbergOcrBackendVTable {
+        process_image: Some(ocr_process_image),
+        process_image_file: None,
+        name_fn: None,
+        version_fn: None,
+        initialize_fn: None,
+        shutdown_fn: None,
+        supports_language: None,
+        backend_type: None,
+        supported_languages: None,
+        supports_table_detection: None,
+        supports_document_processing: None,
+        process_document: None,
+        free_user_data: None,
+    };
+
+    // SAFETY: state lives for the duration of this test and outlives the bridge.
+    let bridge = unsafe { KreuzbergOcrBackendBridge::new("test-ocr-stub".to_string(), vtable, state_ptr) };
+
+    use kreuzberg::OcrBackend;
+    // The call's result is discarded: the stub returns 1 and sets an error string,
+    // so only the side effects recorded in `state` matter here.
+    let _ = bridge
+        .process_image(&image_bytes, &kreuzberg::OcrConfig::default())
+        .await;
+
+    assert_eq!(
+        state.received_len.load(Ordering::SeqCst),
+        8,
+        "process_image vtable received wrong length (truncated at embedded NUL?)"
+    );
+    assert_eq!(
+        state.received_last_byte.load(Ordering::SeqCst),
+        0xEF,
+        "process_image vtable could not read past the embedded NUL"
+    );
+}
+
+/// DocumentExtractor.extract_bytes must pass the full buffer length even when
+/// the document bytes contain embedded NUL bytes.
+#[tokio::test]
+async fn document_extractor_vtable_extract_bytes_passes_full_length_with_embedded_nuls() {
+    // 8-byte buffer; NUL at index 2.
+    let content: Vec<u8> = vec![0x50, 0x4B, 0x00, 0x03, 0x14, 0x00, 0x00, 0x02];
+
+    // The bridge only receives a raw pointer to the state; `state` itself stays
+    // owned by this test so the assertions below can read it back.
+    let state = Box::new(CallbackState::new());
+    let state_ptr = state.as_ref() as *const CallbackState as *const std::ffi::c_void;
+
+    // Only `extract_bytes` is wired up; every other slot is left empty.
+    let vtable = KreuzbergDocumentExtractorVTable {
+        extract_bytes: Some(extractor_extract_bytes),
+        extract_file: None,
+        name_fn: None,
+        version_fn: None,
+        initialize_fn: None,
+        shutdown_fn: None,
+        supported_mime_types: None,
+        priority: None,
+        can_handle: None,
+        free_user_data: None,
+    };
+
+    // SAFETY: state lives for the duration of this test and outlives the bridge.
+    let bridge = unsafe { KreuzbergDocumentExtractorBridge::new("test-extractor-stub".to_string(), vtable, state_ptr) };
+
+    use kreuzberg::DocumentExtractor;
+    // The call's result is discarded: the stub returns 1 and sets an error string,
+    // so only the side effects recorded in `state` matter here.
+    let _ = bridge
+        .extract_bytes(
+            &content,
+            "application/octet-stream",
+            &kreuzberg::ExtractionConfig::default(),
+        )
+        .await;
+
+    assert_eq!(
+        state.received_len.load(Ordering::SeqCst),
+        8,
+        "extract_bytes vtable received wrong length (truncated at embedded NUL?)"
+    );
+    assert_eq!(
+        state.received_last_byte.load(Ordering::SeqCst),
+        0x02,
+        "extract_bytes vtable could not read past the embedded NUL"
+    );
+}
+
+/// ImageKind numeric values: PageRaster must be 10 and Unknown must be 11.
+///
+/// alef ≥ v0.19.21 added PageRaster between Mask (9) and Unknown, bumping
+/// Unknown from 10 → 11. Any C/Go/Java/C# code that hardcoded Unknown = 10
+/// must be updated; this test pins the new ordinals so the renumbering is
+/// visible to CI.
+#[test]
+fn image_kind_page_raster_is_10_and_unknown_is_11() {
+    // SAFETY: pure integer dispatch, no pointers.
+    let page_raster = unsafe { kreuzberg_ffi::kreuzberg_image_kind_from_i32(10) };
+    assert_eq!(page_raster, 10, "PageRaster == 10");
+
+    // SAFETY: pure integer dispatch, no pointers.
+    let unknown = unsafe { kreuzberg_ffi::kreuzberg_image_kind_from_i32(11) };
+    assert_eq!(unknown, 11, "Unknown == 11");
+
+    // 10 was the old Unknown ordinal; it must still be accepted rather than
+    // mapped to -1 (presumably the "invalid" result — confirm against the shim).
+    // SAFETY: pure integer dispatch, no pointers.
+    let former_unknown = unsafe { kreuzberg_ffi::kreuzberg_image_kind_from_i32(10) };
+    assert_ne!(former_unknown, -1, "10 must be valid");
+}
--- a/crates/kreuzberg-jni/Cargo.toml
+++ b/crates/kreuzberg-jni/Cargo.toml
@@ -0,0 +1,22 @@
+[package]
+name = "kreuzberg-jni"
+version.workspace = true
+edition.workspace = true
+rust-version.workspace = true
+authors.workspace = true
+license.workspace = true
+repository.workspace = true
+homepage.workspace = true
+publish = false
+
+[lib]
+crate-type = ["cdylib"]
+name = "kreuzberg_jni"
+
+[dependencies]
+base64 = "0.22"
+jni = "0.21"
+kreuzberg-ffi = { path = "../kreuzberg-ffi" }
+
+[lints]
+workspace = true
--- a/crates/kreuzberg-jni/src/lib.rs
+++ b/crates/kreuzberg-jni/src/lib.rs
--- a/crates/kreuzberg-node/Cargo.toml
+++ b/crates/kreuzberg-node/Cargo.toml
@@ -0,0 +1,32 @@
+[package]
+name = "kreuzberg-node"
+version = "5.0.0-rc.3"
+edition = "2024"
+license = "Elastic-2.0"
+description = "High-performance document intelligence library"
+readme = false
+keywords = ["document", "extraction", "ocr", "pdf", "text"]
+categories = ["text-processing"]
+
+# `serde_json` is emitted unconditionally below so the manifest is stable
+# across regens, but for umbrella crates with no JSON-marshalled return types
+# it is genuinely unused. The conditional `async-trait` / `futures-util` deps
+# are similarly flagged when the umbrella has trait-bridge / streaming
+# adapters configured but no actual async-trait callsite in this binding.
+[package.metadata.cargo-machete]
+ignored = ["serde_json", "async-trait"]
+
+[lib]
+crate-type = ["cdylib"]
+
+[dependencies]
+async-trait = "0.1"
+kreuzberg = { version = "5.0.0-rc.3", path = "../kreuzberg", features = ["full", "pdf", "ocr", "paddle-ocr", "paddle-ocr-types", "layout-detection", "layout-types", "embeddings", "embedding-presets", "chunking", "keywords-yake", "keywords-rake", "language-detection", "html", "tree-sitter", "office", "email", "archives", "stopwords", "auto-rotate", "auto-rotate-types", "tokio-runtime", "api", "mcp", "liter-llm", "quality"] }
+napi = { version = "3", features = ["async", "serde-json"] }
+napi-derive = "3"
+serde = { version = "1", features = ["derive"] }
+serde_json = "1"
+serde_with = "3"
+
+[build-dependencies]
+napi-build = "2"
--- a/crates/kreuzberg-node/LICENSE
+++ b/crates/kreuzberg-node/LICENSE
@@ -0,0 +1,93 @@
+Elastic License 2.0 (ELv2)
+
+Copyright 2025-2026 Kreuzberg, Inc.
+
+Acceptance
+
+By using the software, you agree to all of the terms and conditions below.
+
+Copyright License
+
+The licensor grants you a non-exclusive, royalty-free, worldwide,
+non-sublicensable, non-transferable license to use, copy, distribute, make
+available, and prepare derivative works of the software, in each case subject to
+the limitations and conditions below.
+
+Limitations
+
+You may not provide the software to third parties as a hosted or managed
+service, where the service provides users with access to any substantial set of
+the features or functionality of the software.
+
+You may not move, change, disable, or circumvent the license key functionality
+in the software, and you may not remove or obscure any functionality in the
+software that is protected by the license key.
+
+You may not alter, remove, or obscure any licensing, copyright, or other notices
+of the licensor in the software. Any use of the licensor's trademarks is subject
+to applicable law.
+
+Patents
+
+The licensor grants you a license, under any patent claims the licensor can
+license, or becomes able to license, to make, have made, use, sell, offer for
+sale, import and have imported the software, in each case subject to the
+limitations and conditions in this license. This license does not cover any
+patent claims that you cause to be infringed by modifications or additions to the
+software. If you or your company make any written claim that the software
+infringes or contributes to infringement of any patent, your patent license for
+the software granted under these terms ends immediately. If your company makes
+such a claim, your patent license ends immediately for work on behalf of your
+company.
+
+Notices
+
+You must ensure that anyone who gets a copy of any part of the software from you
+also gets a copy of these terms.
+
+If you modify the software, you must include in any modified copies of the
+software prominent notices stating that you have modified the software.
+
+No Other Rights
+
+These terms do not imply any licenses other than those expressly granted in
+these terms.
+
+Termination
+
+If you use the software in violation of these terms, such use is not licensed,
+and your licenses will automatically terminate. If the licensor provides you with
+a notice of your violation, and you cease all violation of this license no later
+than 30 days after you receive that notice, your licenses will be reinstated
+retroactively. However, if you violate these terms after such reinstatement, any
+additional violation of these terms will cause your licenses to terminate
+automatically and permanently.
+
+No Liability
+
+As far as the law allows, the software comes as is, without any warranty or
+condition, and the licensor will not be liable to you for any damages arising out
+of these terms or the use or nature of the software, under any kind of legal
+claim.
+
+Definitions
+
+The licensor is the entity offering these terms, and the software is the
+software the licensor makes available under these terms, including any portion
+of it.
+
+you refers to the individual or entity agreeing to these terms.
+
+your company is any legal entity, sole proprietorship, or other kind of
+organization that you work for, plus all organizations that have control over,
+are under the control of, or are under common control with that organization.
+control means ownership of substantially all the assets of an entity, or the
+power to direct its management and policies by vote, contract, or otherwise.
+Control can be direct or indirect.
+
+your licenses are all the licenses granted to you for the software under these
+terms.
+
+use means anything you do with the software requiring one of your licenses.
+
+trademark means trademarks, service marks, and similar rights.
--- a/crates/kreuzberg-node/README.md
+++ b/crates/kreuzberg-node/README.md
@@ -0,0 +1,488 @@
+# TypeScript (Node.js)
+
+<div align="center" style="display: flex; flex-wrap: wrap; gap: 8px; justify-content: center; margin: 20px 0;">
+  <a href="https://github.com/kreuzberg-dev/alef">
+    <img src="https://img.shields.io/badge/Bindings-alef%20%D7%90-007ec6" alt="Bindings">
+  </a>
+  <!-- Language Bindings -->
+  <a href="https://crates.io/crates/kreuzberg">
+    <img src="https://img.shields.io/crates/v/kreuzberg?label=Rust&color=007ec6" alt="Rust">
+  </a>
+  <a href="https://pypi.org/project/kreuzberg/">
+    <img src="https://img.shields.io/pypi/v/kreuzberg?label=Python&color=007ec6" alt="Python">
+  </a>
+  <a href="https://www.npmjs.com/package/@kreuzberg/node">
+    <img src="https://img.shields.io/npm/v/@kreuzberg/node?label=Node.js&color=007ec6" alt="Node.js">
+  </a>
+  <a href="https://www.npmjs.com/package/@kreuzberg/wasm">
+    <img src="https://img.shields.io/npm/v/@kreuzberg/wasm?label=WASM&color=007ec6" alt="WASM">
+  </a>
+  <a href="https://central.sonatype.com/artifact/dev.kreuzberg/kreuzberg">
+    <img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
+  </a>
+  <a href="https://github.com/kreuzberg-dev/kreuzberg/tree/main/packages/go/v5">
+    <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v5*" alt="Go">
+  </a>
+  <a href="https://www.nuget.org/packages/Kreuzberg/">
+    <img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
+  </a>
+  <a href="https://packagist.org/packages/kreuzberg/kreuzberg">
+    <img src="https://img.shields.io/packagist/v/kreuzberg/kreuzberg?label=PHP&color=007ec6" alt="PHP">
+  </a>
+  <a href="https://rubygems.org/gems/kreuzberg">
+    <img src="https://img.shields.io/gem/v/kreuzberg?label=Ruby&color=007ec6" alt="Ruby">
+  </a>
+  <a href="https://hex.pm/packages/kreuzberg">
+    <img src="https://img.shields.io/hexpm/v/kreuzberg?label=Elixir&color=007ec6" alt="Elixir">
+  </a>
+  <a href="https://kreuzberg-dev.r-universe.dev/kreuzberg">
+    <img src="https://img.shields.io/badge/R-kreuzberg-007ec6" alt="R">
+  </a>
+  <a href="https://pub.dev/packages/kreuzberg">
+    <img src="https://img.shields.io/pub/v/kreuzberg?label=Dart&color=007ec6" alt="Dart">
+  </a>
+  <a href="https://central.sonatype.com/artifact/dev.kreuzberg/kreuzberg-android">
+    <img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg-android?label=Kotlin&color=007ec6" alt="Kotlin">
+  </a>
+  <a href="https://github.com/kreuzberg-dev/kreuzberg/tree/main/packages/swift">
+    <img src="https://img.shields.io/badge/Swift-SPM-007ec6" alt="Swift">
+  </a>
+  <a href="https://github.com/kreuzberg-dev/kreuzberg/tree/main/packages/zig">
+    <img src="https://img.shields.io/badge/Zig-package-007ec6" alt="Zig">
+  </a>
+  <a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
+    <img src="https://img.shields.io/badge/C-FFI-007ec6" alt="C FFI">
+  </a>
+  <a href="https://github.com/kreuzberg-dev/kreuzberg/pkgs/container/kreuzberg">
+    <img src="https://img.shields.io/badge/Docker-ghcr.io-007ec6?logo=docker&logoColor=white" alt="Docker">
+  </a>
+  <a href="https://github.com/kreuzberg-dev/kreuzberg/pkgs/container/charts%2Fkreuzberg">
+    <img src="https://img.shields.io/badge/Helm-ghcr.io-007ec6?logo=helm&logoColor=white" alt="Helm">
+  </a>
+
+  <!-- Project Info -->
+  <a href="https://github.com/kreuzberg-dev/kreuzberg/blob/main/LICENSE">
+    <img src="https://img.shields.io/badge/License-Elastic--2.0-007ec6" alt="License">
+  </a>
+  <a href="https://docs.kreuzberg.dev">
+    <img src="https://img.shields.io/badge/Docs-kreuzberg-007ec6" alt="Documentation">
+  </a>
+  <a href="https://huggingface.co/Kreuzberg">
+    <img src="https://img.shields.io/badge/Hugging%20Face-Kreuzberg-007ec6" alt="Hugging Face">
+  </a>
+</div>
+
+<div align="center" style="margin: 24px 0 0;">
+  <a href="https://kreuzberg.dev">
+    <img alt="Kreuzberg" src="https://github.com/user-attachments/assets/419fc06c-8313-4324-b159-4b4d3cfce5c0" />
+  </a>
+</div>
+
+<div align="center" style="display: flex; flex-wrap: wrap; gap: 12px; justify-content: center; margin: 28px 0 24px;">
+  <a href="https://discord.gg/xt9WY3GnKR">
+    <img height="22" src="https://img.shields.io/badge/Discord-Chat-007ec6?logo=discord&logoColor=white" alt="Join Discord">
+  </a>
+  <a href="https://docs.kreuzberg.dev/demo.html">
+    <img height="22" src="https://img.shields.io/badge/Live%20Demo-Open-007ec6?logo=webassembly&logoColor=white" alt="Live Demo">
+  </a>
+</div>
+
+Extract text, tables, images, and metadata from 90+ file formats — including PDF, Office documents, and images — plus code intelligence for 300+ programming languages. Native NAPI-RS bindings for Node.js with superior performance, async/await support, and TypeScript type definitions.
+
+## What This Package Provides
+
+- **Document intelligence core** — extract text, tables, images, metadata, entities, keywords, and code intelligence from one API.
+- **Format coverage** — PDF, Office, images, HTML/XML, email, archives, notebooks, citations, scientific formats, and plain text.
+- **OCR choices** — Tesseract, PaddleOCR, EasyOCR where supported, VLM OCR through liter-llm, and plugin hooks for custom backends.
+- **Same engine as every binding** — Rust, Python, Node.js, Go, Java, PHP, Ruby, .NET, Elixir, R, WASM, Kotlin Android, Swift, Dart, Zig, and C FFI share the same Rust implementation.
+- **Node-first TypeScript API** — NAPI-RS package with typed options/results and async extraction.
+
+## Installation
+
+### Package Installation
+
+```bash
+pnpm add @kreuzberg/node
+```
+
+### System Requirements
+- **Node.js 22+** required (NAPI-RS native bindings)
+- Optional: [ONNX Runtime](https://github.com/microsoft/onnxruntime/releases) version 1.22.x for embeddings support
+- Optional: [Tesseract OCR](https://github.com/tesseract-ocr/tesseract) for OCR functionality
+
+### Platform Support
+
+Pre-built binaries available for:
+- macOS (arm64, x64)
+- Linux (x64)
+- Windows (x64)
+
+## Quick Start
+
+### Basic Extraction
+
+Extract text, metadata, and structure from any supported document format:
+
+```typescript title="TypeScript"
+import { extractFileSync } from "@kreuzberg/node";
+
+const config = {
+  useCache: true,
+  enableQualityProcessing: true,
+};
+
+const result = extractFileSync("document.pdf", null, config);
+
+console.log(result.content);
+console.log(`MIME Type: ${result.mimeType}`);
+```
+
+### Common Use Cases
+
+#### Extract with Custom Configuration
+
+Most use cases benefit from configuration to control extraction behavior:
+
+**With OCR (for scanned documents):**
+
+```typescript title="TypeScript"
+import { extractFile } from "@kreuzberg/node";
+
+const config = {
+  ocr: {
+    backend: "tesseract",
+    language: "eng+fra",
+    tesseractConfig: {
+      psm: 3,
+    },
+  },
+};
+
+const result = await extractFile("document.pdf", null, config);
+console.log(result.content);
+```
+
+#### Table Extraction
+
+```typescript title="TypeScript"
+import { extractFileSync } from "@kreuzberg/node";
+
+const result = extractFileSync("document.pdf");
+
+result.tables?.forEach((table) => {
+  console.log(`Table with ${table.cells?.length ?? 0} rows`);
+  console.log(table.markdown);
+  table.cells?.forEach((row) => console.log(row.join(" | ")));
+});
+```
+
+#### Processing Multiple Files
+
+```typescript title="TypeScript"
+import { batchExtractFilesSync } from "@kreuzberg/node";
+
+const files = ["doc1.pdf", "doc2.docx", "doc3.pptx"];
+const results = batchExtractFilesSync(files);
+
+results.forEach((result, i) => {
+  console.log(`File ${i + 1}: ${result.content.length} characters`);
+});
+```
+
+#### Async Processing
+
+For non-blocking document processing:
+
+```typescript title="TypeScript"
+import { extractFile } from "@kreuzberg/node";
+
+const result = await extractFile("document.pdf");
+console.log(result.content);
+```
+
+#### Configuration Discovery
+
+```typescript title="config_discovery.ts"
+import { ExtractionConfig, extractFile } from "@kreuzberg/node";
+
+const config = ExtractionConfig.discover();
+if (config) {
+  console.log("Found configuration file");
+  const result = await extractFile("document.pdf", null, config);
+  console.log(result.content);
+} else {
+  console.log("No configuration file found, using defaults");
+  const result = await extractFile("document.pdf");
+  console.log(result.content);
+}
+```
+
+#### Worker Thread Pool
+
+```typescript title="worker_pool.ts"
+import {
+  createWorkerPool,
+  extractFileInWorker,
+  batchExtractFilesInWorker,
+  closeWorkerPool,
+} from "@kreuzberg/node";
+
+// Create a pool with 4 worker threads
+const pool = createWorkerPool(4);
+
+try {
+  // Extract single file in worker
+  const result = await extractFileInWorker(pool, "document.pdf", null, {
+    useCache: true,
+  });
+  console.log(result.content);
+
+  // Extract multiple files concurrently
+  const files = ["doc1.pdf", "doc2.docx", "doc3.xlsx"];
+  const results = await batchExtractFilesInWorker(pool, files, {
+    useCache: true,
+  });
+
+  results.forEach((result, i) => {
+    console.log(`File ${i + 1}: ${result.content.length} characters`);
+  });
+} finally {
+  // Always close the pool when done
+  await closeWorkerPool(pool);
+}
+```
+
+**Performance Benefits:**
+- **Parallel Processing**: Multiple documents extracted simultaneously
+- **CPU Utilization**: Maximizes multi-core CPU usage for large batches
+- **Queue Management**: Automatically distributes work across available workers
+- **Resource Control**: Prevents thread exhaustion with configurable pool size
+
+**Best Practices:**
+- Use worker pools for batches of 10+ documents
+- Set pool size to number of CPU cores (default behavior)
+- Always close pools with `closeWorkerPool()` to prevent resource leaks
+- Reuse pools across multiple batch operations for efficiency
+
+### Next Steps
+
+- **[Installation Guide](https://docs.kreuzberg.dev/getting-started/installation/)** - Platform-specific setup
+- **[API Documentation](https://docs.kreuzberg.dev/reference/api-typescript/)** - Complete API reference
+- **[Examples & Guides](https://docs.kreuzberg.dev/)** - Full code examples and usage guides
+- **[Configuration Guide](https://docs.kreuzberg.dev/guides/configuration/)** - Advanced configuration options
+
+## NAPI-RS Implementation Details
+
+### Native Performance
+
+This binding uses NAPI-RS to provide native Node.js bindings with:
+
+- **Zero-copy data transfer** between JavaScript and Rust layers
+- **Native thread pool** for concurrent document processing
+- **Direct memory management** for efficient large document handling
+- **Binary-compatible** pre-built native modules across platforms
+
+### Threading Model
+
+- Single documents are processed synchronously or asynchronously in a dedicated thread
+- Batch operations distribute work across available CPU cores
+- Thread count is configurable but defaults to system CPU count
+- Long-running extractions block the event loop unless using async APIs
+
+### Memory Management
+
+- Large documents (> 100 MB) are streamed to avoid loading entirely into memory
+- Temporary files are created in system temp directory for extraction
+- Memory is automatically released after extraction completion
+- ONNX models are cached in memory for repeated embeddings operations
+
+## Features
+
+### Supported File Formats (90+)
+
+90+ file formats across 8 major categories with intelligent format detection and comprehensive metadata extraction.
+
+#### Office Documents
+
+| Category | Formats | Capabilities |
+|----------|---------|--------------|
+| **Word Processing** | `.docx`, `.docm`, `.dotx`, `.dotm`, `.dot`, `.odt` | Full text, tables, images, metadata, styles |
+| **Spreadsheets** | `.xlsx`, `.xlsm`, `.xlsb`, `.xls`, `.xla`, `.xlam`, `.xltm`, `.xltx`, `.xlt`, `.ods` | Sheet data, formulas, cell metadata, charts |
+| **Presentations** | `.pptx`, `.pptm`, `.ppsx`, `.potx`, `.potm`, `.pot`, `.ppt` | Slides, speaker notes, images, metadata |
+| **PDF** | `.pdf` | Text, tables, images, metadata, OCR support |
+| **eBooks** | `.epub`, `.fb2` | Chapters, metadata, embedded resources |
+| **Database** | `.dbf` | Table data extraction, field type support |
+| **Hangul** | `.hwp`, `.hwpx` | Korean document format, text extraction |
+
+#### Images (OCR-Enabled)
+
+| Category | Formats | Features |
+|----------|---------|----------|
+| **Raster** | `.png`, `.jpg`, `.jpeg`, `.gif`, `.webp`, `.bmp`, `.tiff`, `.tif` | OCR, table detection, EXIF metadata, dimensions, color space |
+| **Advanced** | `.jp2`, `.jpx`, `.jpm`, `.mj2`, `.jbig2`, `.jb2`, `.pnm`, `.pbm`, `.pgm`, `.ppm` | OCR via hayro-jpeg2000 (pure Rust decoder), JBIG2 support, table detection, format-specific metadata |
+| **Vector** | `.svg` | DOM parsing, embedded text, graphics metadata |
+
+#### Web & Data
+
+| Category | Formats | Features |
+|----------|---------|----------|
+| **Markup** | `.html`, `.htm`, `.xhtml`, `.xml`, `.svg` | DOM parsing, metadata (Open Graph, Twitter Card), link extraction |
+| **Structured Data** | `.json`, `.yaml`, `.yml`, `.toml`, `.csv`, `.tsv` | Schema detection, nested structures, validation |
+| **Text & Markdown** | `.txt`, `.md`, `.markdown`, `.djot`, `.rst`, `.org`, `.rtf` | CommonMark, GFM, Djot, reStructuredText, Org Mode |
+
+#### Email & Archives
+
+| Category | Formats | Features |
+|----------|---------|----------|
+| **Email** | `.eml`, `.msg` | Headers, body (HTML/plain), attachments, threading |
+| **Archives** | `.zip`, `.tar`, `.tgz`, `.gz`, `.7z` | File listing, nested archives, metadata |
+
+#### Academic & Scientific
+
+| Category | Formats | Features |
+|----------|---------|----------|
+| **Citations** | `.bib`, `.biblatex`, `.ris`, `.nbib`, `.enw`, `.csl` | Structured parsing: RIS (structured), PubMed/MEDLINE, EndNote XML (structured), BibTeX, CSL JSON |
+| **Scientific** | `.tex`, `.latex`, `.typst`, `.jats`, `.ipynb`, `.docbook` | LaTeX, Jupyter notebooks, PubMed JATS |
+| **Documentation** | `.opml`, `.pod`, `.mdoc`, `.troff` | Technical documentation formats |
+
+#### Code Intelligence (300+ Languages)
+
+| Feature | Description |
+|---------|-------------|
+| **Structure Extraction** | Functions, classes, methods, structs, interfaces, enums |
+| **Import/Export Analysis** | Module dependencies, re-exports, wildcard imports |
+| **Symbol Extraction** | Variables, constants, type aliases, properties |
+| **Docstring Parsing** | Google, NumPy, Sphinx, JSDoc, RustDoc, and 10+ formats |
+| **Diagnostics** | Parse errors with line/column positions |
+| **Syntax-Aware Chunking** | Split code by semantic boundaries, not arbitrary byte offsets |
+
+Powered by [tree-sitter-language-pack](https://github.com/kreuzberg-dev/tree-sitter-language-pack) — [documentation](https://docs.tree-sitter-language-pack.kreuzberg.dev).
+
+**[Complete Format Reference](https://docs.kreuzberg.dev/reference/formats/)**
+
+### Key Capabilities
+
+- **Text Extraction** - Extract all text content with position and formatting information
+- **Metadata Extraction** - Retrieve document properties, creation date, author, etc.
+- **Table Extraction** - Parse tables with structure and cell content preservation
+- **Image Extraction** - Extract embedded images and render page previews
+- **OCR Support** - Integrate multiple OCR backends for scanned documents
+- **Async/Await** - Non-blocking document processing with concurrent operations
+- **Plugin System** - Extensible post-processing for custom text transformation
+- **Embeddings** - Generate vector embeddings using ONNX Runtime models
+- **Batch Processing** - Efficiently process multiple documents in parallel
+- **Memory Efficient** - Stream large files without loading entirely into memory
+- **Language Detection** - Detect and support multiple languages in documents
+- **Code Intelligence** - Extract structure, imports, exports, symbols, and docstrings from [300+ programming languages](https://docs.tree-sitter-language-pack.kreuzberg.dev) via tree-sitter
+- **Configuration** - Fine-grained control over extraction behavior
+
+### Performance Characteristics
+
+| Format | Speed | Memory | Notes |
+|--------|-------|--------|-------|
+| **PDF (text)** | 10-100 MB/s | ~50MB per doc | Fastest extraction |
+| **Office docs** | 20-200 MB/s | ~100MB per doc | DOCX, XLSX, PPTX |
+| **Images (OCR)** | 1-5 MB/s | Variable | Depends on OCR backend |
+| **Archives** | 5-50 MB/s | ~200MB per doc | ZIP, TAR, etc. |
+| **Web formats** | 50-200 MB/s | Streaming | HTML, XML, JSON |
+
+## OCR Support
+
+Kreuzberg supports multiple OCR backends for extracting text from scanned documents and images:
+
+- **Tesseract**
+
+- **PaddleOCR**
+
+### OCR Configuration Example
+
+```typescript title="TypeScript"
+import { extractFile } from "@kreuzberg/node";
+
+const config = {
+  ocr: {
+    backend: "tesseract",
+    language: "eng+fra",
+    tesseractConfig: {
+      psm: 3,
+    },
+  },
+};
+
+const result = await extractFile("document.pdf", null, config);
+console.log(result.content);
+```
+
+## Async Support
+
+This binding provides full async/await support for non-blocking document processing:
+
+```typescript title="TypeScript"
+import { extractFile } from "@kreuzberg/node";
+
+const result = await extractFile("document.pdf");
+console.log(result.content);
+```
+
+## Plugin System
+
+Kreuzberg supports extensible post-processing plugins for custom text transformation and filtering.
+
+For detailed plugin documentation, visit [Plugin System Guide](https://docs.kreuzberg.dev/guides/plugins/).
+
+## Embeddings Support
+
+Generate vector embeddings for extracted text using the built-in ONNX Runtime support. Requires ONNX Runtime installation.
+
+**[Embeddings Guide](https://docs.kreuzberg.dev/features/#embeddings)**
+
+## Batch Processing
+
+Process multiple documents efficiently:
+
+```typescript title="TypeScript"
+import { batchExtractFilesSync } from "@kreuzberg/node";
+
+const files = ["doc1.pdf", "doc2.docx", "doc3.pptx"];
+const results = batchExtractFilesSync(files);
+
+results.forEach((result, i) => {
+  console.log(`File ${i + 1}: ${result.content.length} characters`);
+});
+```
+
+## Configuration
+
+For advanced configuration options including language detection, table extraction, OCR settings, and more:
+
+**[Configuration Guide](https://docs.kreuzberg.dev/guides/configuration/)**
+
+## Documentation
+
+- **[Official Documentation](https://docs.kreuzberg.dev/)**
+- **[API Reference](https://docs.kreuzberg.dev/reference/api-typescript/)**
+- **[Examples & Guides](https://docs.kreuzberg.dev/)**
+
+## Contributing
+
+Contributions are welcome! See [Contributing Guide](https://github.com/kreuzberg-dev/kreuzberg/blob/main/CONTRIBUTING.md).
+
+## Part of Kreuzberg.dev
+
+- [Kreuzberg Cloud](https://github.com/kreuzberg-dev/kreuzberg-cloud) — managed extraction API with SDKs, dashboards, and observability.
+- [kreuzcrawl](https://github.com/kreuzberg-dev/kreuzcrawl) — web crawling and scraping with HTML→Markdown and headless-Chrome fallback.
+- [html-to-markdown](https://github.com/kreuzberg-dev/html-to-markdown) — fast, lossless HTML→Markdown engine.
+- [liter-llm](https://github.com/kreuzberg-dev/liter-llm) — universal LLM API client with native bindings for 14 languages and 143 providers.
+- [tree-sitter-language-pack](https://github.com/kreuzberg-dev/tree-sitter-language-pack) — tree-sitter grammars and code-intelligence primitives.
+- [alef](https://github.com/kreuzberg-dev/alef) — the polyglot binding generator that produces this README and all per-language bindings.
+- [Discord](https://discord.gg/xt9WY3GnKR) — community, roadmap, announcements.
+
+## License
+
+Elastic-2.0 License — see [LICENSE](../../LICENSE) for details.
+
+## Support
+
+- **Discord Community**: [Join our Discord](https://discord.gg/xt9WY3GnKR)
+- **GitHub Issues**: [Report bugs](https://github.com/kreuzberg-dev/kreuzberg/issues)
+- **Discussions**: [Ask questions](https://github.com/kreuzberg-dev/kreuzberg/discussions)
--- a/crates/kreuzberg-node/format-metadata-wrapper.js
+++ b/crates/kreuzberg-node/format-metadata-wrapper.js
@@ -0,0 +1,27 @@
+// Wrap JsFormatMetadata to add getters for format-specific metadata
+// This works around the limitation that #[napi(getter)] doesn't work on #[napi(object)]
+
+export function wrapFormatMetadata(fmt) {
+	if (!fmt || typeof fmt !== "object") return fmt;
+
+	const tag = fmt.format_type;
+	const payload = fmt["0"];
+
+	if (!payload) return fmt;
+
+	try {
+		const data = JSON.parse(payload);
+
+		// Add the typed variant property as a non-enumerable property
+		Object.defineProperty(fmt, tag, {
+			value: data,
+			enumerable: false,
+			writable: false,
+			configurable: false,
+		});
+	} catch (e) {
+		// Ignore JSON parse errors
+	}
+
+	return fmt;
+}
--- a/crates/kreuzberg-node/index.d.ts
+++ b/crates/kreuzberg-node/index.d.ts
--- a/crates/kreuzberg-node/npm/darwin-arm64/package.json
+++ b/crates/kreuzberg-node/npm/darwin-arm64/package.json
@@ -0,0 +1,15 @@
+{
+  "name": "@kreuzberg/node-darwin-arm64",
+  "version": "5.0.0-rc.3",
+  "license": "Elastic-2.0",
+  "repository": {
+    "type": "git",
+    "url": "git+https://github.com/kreuzberg-dev/kreuzberg.git"
+  },
+  "main": "kreuzberg-node.darwin-arm64.node",
+  "files": ["kreuzberg-node.darwin-arm64.node"],
+  "os": ["darwin"],
+  "cpu": ["arm64"],
+  "engines": { "node": ">= 18" },
+  "publishConfig": { "access": "public" }
+}
--- a/crates/kreuzberg-node/npm/darwin-x64/package.json
+++ b/crates/kreuzberg-node/npm/darwin-x64/package.json
@@ -0,0 +1,15 @@
+{
+  "name": "@kreuzberg/node-darwin-x64",
+  "version": "5.0.0-rc.3",
+  "license": "Elastic-2.0",
+  "repository": {
+    "type": "git",
+    "url": "git+https://github.com/kreuzberg-dev/kreuzberg.git"
+  },
+  "main": "kreuzberg-node.darwin-x64.node",
+  "files": ["kreuzberg-node.darwin-x64.node"],
+  "os": ["darwin"],
+  "cpu": ["x64"],
+  "engines": { "node": ">= 18" },
+  "publishConfig": { "access": "public" }
+}
--- a/crates/kreuzberg-node/npm/linux-arm64-gnu/package.json
+++ b/crates/kreuzberg-node/npm/linux-arm64-gnu/package.json
@@ -0,0 +1,16 @@
+{
+  "name": "@kreuzberg/node-linux-arm64-gnu",
+  "version": "5.0.0-rc.3",
+  "license": "Elastic-2.0",
+  "repository": {
+    "type": "git",
+    "url": "git+https://github.com/kreuzberg-dev/kreuzberg.git"
+  },
+  "main": "kreuzberg-node.linux-arm64-gnu.node",
+  "files": ["kreuzberg-node.linux-arm64-gnu.node"],
+  "os": ["linux"],
+  "cpu": ["arm64"],
+  "libc": ["glibc"],
+  "engines": { "node": ">= 18" },
+  "publishConfig": { "access": "public" }
+}
--- a/crates/kreuzberg-node/npm/linux-arm64-musl/package.json
+++ b/crates/kreuzberg-node/npm/linux-arm64-musl/package.json
@@ -0,0 +1,16 @@
+{
+  "name": "@kreuzberg/node-linux-arm64-musl",
+  "version": "5.0.0-rc.3",
+  "license": "Elastic-2.0",
+  "repository": {
+    "type": "git",
+    "url": "git+https://github.com/kreuzberg-dev/kreuzberg.git"
+  },
+  "main": "kreuzberg-node.linux-arm64-musl.node",
+  "files": ["kreuzberg-node.linux-arm64-musl.node"],
+  "os": ["linux"],
+  "cpu": ["arm64"],
+  "libc": ["musl"],
+  "engines": { "node": ">= 18" },
+  "publishConfig": { "access": "public" }
+}
--- a/crates/kreuzberg-node/npm/linux-x64-gnu/package.json
+++ b/crates/kreuzberg-node/npm/linux-x64-gnu/package.json
@@ -0,0 +1,16 @@
+{
+  "name": "@kreuzberg/node-linux-x64-gnu",
+  "version": "5.0.0-rc.3",
+  "license": "Elastic-2.0",
+  "repository": {
+    "type": "git",
+    "url": "git+https://github.com/kreuzberg-dev/kreuzberg.git"
+  },
+  "main": "kreuzberg-node.linux-x64-gnu.node",
+  "files": ["kreuzberg-node.linux-x64-gnu.node"],
+  "os": ["linux"],
+  "cpu": ["x64"],
+  "libc": ["glibc"],
+  "engines": { "node": ">= 18" },
+  "publishConfig": { "access": "public" }
+}
--- a/crates/kreuzberg-node/npm/linux-x64-musl/package.json
+++ b/crates/kreuzberg-node/npm/linux-x64-musl/package.json
@@ -0,0 +1,16 @@
+{
+  "name": "@kreuzberg/node-linux-x64-musl",
+  "version": "5.0.0-rc.3",
+  "license": "Elastic-2.0",
+  "repository": {
+    "type": "git",
+    "url": "git+https://github.com/kreuzberg-dev/kreuzberg.git"
+  },
+  "main": "kreuzberg-node.linux-x64-musl.node",
+  "files": ["kreuzberg-node.linux-x64-musl.node"],
+  "os": ["linux"],
+  "cpu": ["x64"],
+  "libc": ["musl"],
+  "engines": { "node": ">= 18" },
+  "publishConfig": { "access": "public" }
+}
--- a/crates/kreuzberg-node/npm/win32-arm64-msvc/package.json
+++ b/crates/kreuzberg-node/npm/win32-arm64-msvc/package.json
@@ -0,0 +1,15 @@
+{
+  "name": "@kreuzberg/node-win32-arm64-msvc",
+  "version": "5.0.0-rc.3",
+  "license": "Elastic-2.0",
+  "repository": {
+    "type": "git",
+    "url": "git+https://github.com/kreuzberg-dev/kreuzberg.git"
+  },
+  "main": "kreuzberg-node.win32-arm64-msvc.node",
+  "files": ["kreuzberg-node.win32-arm64-msvc.node"],
+  "os": ["win32"],
+  "cpu": ["arm64"],
+  "engines": { "node": ">= 18" },
+  "publishConfig": { "access": "public" }
+}
--- a/crates/kreuzberg-node/npm/win32-x64-msvc/package.json
+++ b/crates/kreuzberg-node/npm/win32-x64-msvc/package.json
@@ -0,0 +1,15 @@
+{
+  "name": "@kreuzberg/node-win32-x64-msvc",
+  "version": "5.0.0-rc.3",
+  "license": "Elastic-2.0",
+  "repository": {
+    "type": "git",
+    "url": "git+https://github.com/kreuzberg-dev/kreuzberg.git"
+  },
+  "main": "kreuzberg-node.win32-x64-msvc.node",
+  "files": ["kreuzberg-node.win32-x64-msvc.node"],
+  "os": ["win32"],
+  "cpu": ["x64"],
+  "engines": { "node": ">= 18" },
+  "publishConfig": { "access": "public" }
+}
--- a/crates/kreuzberg-node/package.json
+++ b/crates/kreuzberg-node/package.json
@@ -0,0 +1,52 @@
+{
+  "name": "@kreuzberg/node",
+  "version": "5.0.0-rc.3",
+  "description": "High-performance document intelligence library",
+  "license": "Elastic-2.0",
+  "repository": {
+    "type": "git",
+    "url": "git+https://github.com/kreuzberg-dev/kreuzberg.git"
+  },
+  "main": "index.js",
+  "types": "index.d.ts",
+  "exports": {
+    ".": {
+      "types": "./index.d.ts",
+      "require": "./index.js",
+      "default": "./index.js"
+    }
+  },
+  "files": ["index.js", "index.d.ts", "*.node"],
+  "optionalDependencies": {
+    "@kreuzberg/node-linux-x64-gnu": "5.0.0-rc.3",
+    "@kreuzberg/node-linux-arm64-gnu": "5.0.0-rc.3",
+    "@kreuzberg/node-linux-x64-musl": "5.0.0-rc.3",
+    "@kreuzberg/node-linux-arm64-musl": "5.0.0-rc.3",
+    "@kreuzberg/node-darwin-x64": "5.0.0-rc.3",
+    "@kreuzberg/node-darwin-arm64": "5.0.0-rc.3",
+    "@kreuzberg/node-win32-x64-msvc": "5.0.0-rc.3",
+    "@kreuzberg/node-win32-arm64-msvc": "5.0.0-rc.3"
+  },
+  "napi": {
+    "packageName": "@kreuzberg/node",
+    "binaryName": "kreuzberg-node",
+    "targets": [
+      "x86_64-unknown-linux-gnu",
+      "aarch64-unknown-linux-gnu",
+      "x86_64-unknown-linux-musl",
+      "aarch64-unknown-linux-musl",
+      "x86_64-apple-darwin",
+      "aarch64-apple-darwin",
+      "x86_64-pc-windows-msvc",
+      "aarch64-pc-windows-msvc"
+    ]
+  },
+  "scripts": {
+    "build": "napi build --platform --release",
+    "artifacts": "napi artifacts",
+    "prepublishOnly": "napi prepublish -t npm --skip-optional-publish"
+  },
+  "engines": { "node": ">= 18" },
+  "publishConfig": { "access": "public" },
+  "devDependencies": { "@napi-rs/cli": "^3.6.2" }
+}
--- a/crates/kreuzberg-node/src/lib.rs
+++ b/crates/kreuzberg-node/src/lib.rs
--- a/crates/kreuzberg-paddle-ocr/Cargo.toml
+++ b/crates/kreuzberg-paddle-ocr/Cargo.toml
@@ -0,0 +1,41 @@
+[package]
+name = "kreuzberg-paddle-ocr"
+version.workspace = true
+edition = "2024"
+rust-version.workspace = true
+authors.workspace = true
+description = "PaddleOCR via ONNX Runtime for Kreuzberg - high-performance text recognition"
+license = "MIT"
+repository.workspace = true
+homepage = "https://kreuzberg.dev"
+documentation = "https://docs.rs/kreuzberg-paddle-ocr"
+readme = "README.md"
+keywords = ["paddle", "ocr", "onnx", "recognition", "detection"]
+categories = ["computer-vision", "text-processing"]
+exclude = ["tests/*", ".github/*"]
+
+[package.metadata.docs.rs]
+all-features = true
+rustdoc-args = ["--cfg", "docsrs"]
+
+[lib]
+name = "kreuzberg_paddle_ocr"
+crate-type = ["lib"]
+
+[features]
+default = []
+load-dynamic = ["ort/load-dynamic"]
+
+[dependencies]
+geo-clipper = "0.9"
+geo-types = "0.7"
+image = { workspace = true }
+
+# Crate-specific dependencies (not in workspace)
+# Disable rayon - OCR parallelism is handled at higher level
+imageproc = { version = "0.26", default-features = false }
+ndarray = "0.17"
+ort = { workspace = true, features = ["ndarray"] }
+# Workspace dependencies
+serde = { workspace = true }
+thiserror = { workspace = true }
--- a/crates/kreuzberg-paddle-ocr/LICENSE
+++ b/crates/kreuzberg-paddle-ocr/LICENSE
@@ -0,0 +1,22 @@
+MIT License
+
+Copyright (c) 2024 mg-chao
+Copyright (c) 2025 Na'aman Hirschfeld
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
--- a/crates/kreuzberg-paddle-ocr/README.md
+++ b/crates/kreuzberg-paddle-ocr/README.md
@@ -0,0 +1,57 @@
+# kreuzberg-paddle-ocr
+
+[![Bindings](https://img.shields.io/badge/Bindings-alef%20%D7%90-007ec6)](https://github.com/kreuzberg-dev/alef)
+
+PaddleOCR via ONNX Runtime for Kreuzberg - high-performance text detection and recognition using PaddlePaddle's OCR models.
+
+Based on the original [paddle-ocr-rs](https://github.com/mg-chao/paddle-ocr-rs) by [mg-chao](https://github.com/mg-chao), this vendored version includes improvements for Kreuzberg integration:
+
+- **Workspace Dependency Alignment**: Uses Kreuzberg's workspace dependencies for consistency
+- **Edition 2024**: Updated to Rust 2024 edition
+- **ndarray Compatibility**: Aligned with Kreuzberg's ndarray version requirements
+- **Integration**: Designed to work seamlessly with Kreuzberg's OCR backend system
+
+## Features
+
+- Text detection using DBNet (Differentiable Binarization)
+- Text recognition using CRNN (Convolutional Recurrent Neural Network)
+- Angle detection for rotated text
+- Support for multiple languages via PaddleOCR models
+- ONNX Runtime for efficient CPU inference
+
+## ONNX Runtime Requirement
+
+This crate requires **ONNX Runtime 1.24+** at runtime.
+
+Install it:
+
+- **macOS (Homebrew)**: `brew install onnxruntime`
+- **Linux**: Download from [ONNX Runtime releases](https://github.com/microsoft/onnxruntime/releases)
+- **Windows**: Download from [ONNX Runtime releases](https://github.com/microsoft/onnxruntime/releases)
+
+## Usage
+
+This crate is used internally by Kreuzberg when the `paddle-ocr` feature is enabled:
+
+```toml
+[dependencies]
+kreuzberg = { version = "4.2", features = ["paddle-ocr"] }
+```
+
+## Models
+
+PaddleOCR models are automatically downloaded and cached on first use. Supported models include:
+
+- PP-OCRv5 server detection model
+- PP-OCRv5 per-family recognition models (11 script families)
+- PPOCRv2 mobile angle classification model
+
+## License
+
+This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
+
+## Acknowledgements
+
+This project is based on the original [paddle-ocr-rs](https://github.com/mg-chao/paddle-ocr-rs) by [mg-chao](https://github.com/mg-chao), originally licensed under Apache-2.0. We are grateful for the foundational work that made this integration possible.
+
+The original paddle-ocr-rs provides Rust bindings for PaddlePaddle's OCR models via ONNX Runtime, enabling efficient text detection and recognition without Python dependencies.
--- a/crates/kreuzberg-paddle-ocr/src/angle_net.rs
+++ b/crates/kreuzberg-paddle-ocr/src/angle_net.rs
@@ -0,0 +1,139 @@
+use crate::{
+    base_net::BaseNet,
+    constants::{IMAGENET_MEAN_VALUES, IMAGENET_NORM_VALUES},
+    ocr_error::OcrError,
+    ocr_result::Angle,
+    ocr_utils::OcrUtils,
+};
+
+use ort::{
+    inputs,
+    session::{Session, SessionOutputs},
+    value::Tensor,
+};
+
+// PP-LCNet_x1_0_textline_ori preprocessing (ImageNet normalization).
+// Input: resize to 160×80 (W×H), normalize with ImageNet mean/std.
+// Formula in substract_mean_normalize: (pixel - MEAN) * NORM
+// For ImageNet: (pixel/255 - mean) / std = (pixel - mean*255) * (1/(std*255))
+// V2 PP-LCNet angle classifier expects [3, 80, 160] input (NCHW).
+/// Width, in pixels, that every crop is resized to before classification.
+const ANGLE_DST_WIDTH: u32 = 160;
+/// Height, in pixels, that every crop is resized to before classification.
+const ANGLE_DST_HEIGHT: u32 = 80;
+/// Number of leading output scores compared when choosing the angle class.
+const ANGLE_COLS: usize = 2;
+
+/// Text-line angle classifier that runs its model through an ONNX Runtime session.
+#[derive(Debug)]
+pub struct AngleNet {
+    /// Loaded inference session; `None` until one is supplied via `set_session`.
+    session: Option<Session>,
+    /// Names of the model inputs; the first entry is used to feed the image tensor.
+    input_names: Vec<String>,
+}
+
+impl BaseNet for AngleNet {
+    /// Builds an empty classifier: no session is loaded and no input names are known.
+    fn new() -> Self {
+        let session = None;
+        let input_names = Vec::new();
+
+        Self { session, input_names }
+    }
+
+    /// Replaces the stored model input names.
+    fn set_input_names(&mut self, input_names: Vec<String>) {
+        self.input_names = input_names;
+    }
+
+    /// Replaces the stored session; passing `None` unloads the current one.
+    fn set_session(&mut self, session: Option<Session>) {
+        self.session = session;
+    }
+}
+
+impl AngleNet {
+    /// Classifies the orientation of every cropped text-line image.
+    ///
+    /// * `part_imgs` - crops to classify; the result has one [`Angle`] per crop, in order.
+    /// * `do_angle` - when `false` no inference runs and every entry is `Angle::default()`.
+    /// * `most_angle` - when `true` (and `do_angle` is set) every entry's `index` is
+    ///   overwritten with the majority class of the whole batch.
+    /// * `cls_thresh` - minimum score required to keep a non-zero class for a single crop.
+    ///
+    /// # Errors
+    ///
+    /// Returns the first error produced while classifying a crop (see `get_angle`).
+    pub fn get_angles(
+        &self,
+        part_imgs: &[image::RgbImage],
+        do_angle: bool,
+        most_angle: bool,
+        cls_thresh: f32,
+    ) -> Result<Vec<Angle>, OcrError> {
+        if !do_angle {
+            // Classification disabled: report the default angle for every crop.
+            return Ok(part_imgs.iter().map(|_| Angle::default()).collect());
+        }
+
+        // Stops at the first crop that fails, exactly like a loop with `?` would.
+        let mut angles = part_imgs
+            .iter()
+            .map(|img| self.get_angle(img, cls_thresh))
+            .collect::<Result<Vec<Angle>, OcrError>>()?;
+
+        if most_angle {
+            // Majority vote over the class indices: the sum of indices is compared with
+            // half the batch size, and a tie (including an empty batch) resolves to 1.
+            let index_sum: i32 = angles.iter().map(|angle| angle.index).sum();
+            let half_count = angles.len() as f32 / 2.0;
+            let majority_index = if (index_sum as f32) < half_count { 0 } else { 1 };
+
+            for angle in &mut angles {
+                angle.index = majority_index;
+            }
+        }
+
+        Ok(angles)
+    }
+
+    /// Classifies a single crop.
+    ///
+    /// The crop is resized to `ANGLE_DST_WIDTH` × `ANGLE_DST_HEIGHT`, normalized with the
+    /// ImageNet constants and fed to the session under the first registered input name.
+    ///
+    /// # Errors
+    ///
+    /// * `OcrError::SessionNotInitialized` when no session has been set.
+    /// * `OcrError::Io` (`InvalidData`) when no input name is registered or the session
+    ///   produced no output.
+    /// * Any error raised while building the input tensor or running inference.
+    fn get_angle(&self, img_src: &image::RgbImage, cls_thresh: f32) -> Result<Angle, OcrError> {
+        let Some(session) = &self.session else {
+            return Err(OcrError::SessionNotInitialized);
+        };
+
+        // Report a missing input name as an error instead of panicking on `[0]`.
+        let input_name = self.input_names.first().ok_or_else(|| {
+            OcrError::Io(std::io::Error::new(
+                std::io::ErrorKind::InvalidData,
+                "No input names registered for angle classification session",
+            ))
+        })?;
+
+        let angle_img = image::imageops::resize(
+            img_src,
+            ANGLE_DST_WIDTH,
+            ANGLE_DST_HEIGHT,
+            image::imageops::FilterType::Triangle,
+        );
+
+        let input_tensors =
+            OcrUtils::substract_mean_normalize(&angle_img, &IMAGENET_MEAN_VALUES, &IMAGENET_NORM_VALUES);
+
+        let input_tensors = Tensor::from_array(input_tensors)?;
+
+        // SAFETY: ONNX Runtime C API is thread-safe for concurrent inference.
+        // NOTE(review): the justification above covers the C API only. Producing a
+        // `&mut Session` from a shared reference conflicts with Rust's aliasing rules
+        // regardless; consider storing the session behind a `Mutex` or taking `&mut self`.
+        #[allow(unsafe_code)]
+        let outputs = unsafe {
+            let session_ptr = session as *const Session as *mut Session;
+            (*session_ptr).run(inputs![input_name.as_str() => input_tensors])?
+        };
+
+        let mut angle = Self::score_to_angle(&outputs, ANGLE_COLS)?;
+
+        // Only apply rotation if confidence exceeds threshold (matches PaddleOCR's cls_thresh=0.9)
+        if angle.score < cls_thresh {
+            angle.index = 0; // Keep original orientation when confidence is low
+        }
+
+        Ok(angle)
+    }
+
+    /// Picks the best-scoring class among the first `angle_cols` values of the first output.
+    ///
+    /// Ties keep the lowest index. When the output holds no values the result is index `0`
+    /// with a score of `f32::MIN`.
+    ///
+    /// # Errors
+    ///
+    /// Returns `OcrError::Io` (`InvalidData`) when the session produced no output, or the
+    /// extraction error when the output is not an `f32` tensor.
+    fn score_to_angle(output_tensor: &SessionOutputs, angle_cols: usize) -> Result<Angle, OcrError> {
+        let (_, red_data) = output_tensor.iter().next().ok_or_else(|| {
+            OcrError::Io(std::io::Error::new(
+                std::io::ErrorKind::InvalidData,
+                "No output tensors found in angle classification session output",
+            ))
+        })?;
+
+        // Read the scores in place; copying them into a `Vec` first is unnecessary.
+        let (_, scores) = red_data.try_extract_tensor::<f32>()?;
+
+        let mut best_score = f32::MIN;
+        let mut best_index = 0;
+
+        for (i, &score) in scores.iter().take(angle_cols).enumerate() {
+            // Strict comparison so the earliest maximum wins.
+            if score > best_score {
+                best_score = score;
+                best_index = i as i32;
+            }
+        }
+
+        let mut angle = Angle::default();
+        angle.index = best_index;
+        angle.score = best_score;
+        Ok(angle)
+    }
+}
--- a/crates/kreuzberg-paddle-ocr/src/base_net.rs
+++ b/crates/kreuzberg-paddle-ocr/src/base_net.rs
@@ -0,0 +1,78 @@
+use ort::session::{
+    Session,
+    builder::{GraphOptimizationLevel, SessionBuilder},
+};
+
+use crate::ocr_error::OcrError;
+
+/// Session plumbing shared by the OCR networks: building an ONNX Runtime session,
+/// loading a model from disk or memory, and recording the model's input names.
+pub trait BaseNet {
+    /// Creates a network value; implementors decide its initial state.
+    fn new() -> Self;
+
+    /// Produces the session builder used to load a model.
+    ///
+    /// When `builder_fn` is supplied it fully configures the builder and `num_thread`
+    /// is not applied here. Otherwise the builder uses the highest graph optimization
+    /// level, `num_thread` intra-op threads and a single inter-op thread.
+    fn get_session_builder(
+        &self,
+        num_thread: usize,
+        builder_fn: Option<fn(SessionBuilder) -> Result<SessionBuilder, ort::Error>>,
+    ) -> Result<SessionBuilder, OcrError> {
+        let builder = Session::builder()?;
+
+        // A caller-provided customizer replaces the default configuration entirely.
+        if let Some(customize) = builder_fn {
+            return Ok(customize(builder)?);
+        }
+
+        let builder = builder
+            .with_optimization_level(GraphOptimizationLevel::All)
+            .map_err(|e| OcrError::Ort(ort::Error::new(e.message())))?;
+        let builder = builder
+            .with_intra_threads(num_thread)
+            .map_err(|e| OcrError::Ort(ort::Error::new(e.message())))?;
+        let builder = builder
+            .with_inter_threads(1)
+            .map_err(|e| OcrError::Ort(ort::Error::new(e.message())))?;
+
+        Ok(builder)
+    }
+
+    /// Stores the model's input names.
+    fn set_input_names(&mut self, input_names: Vec<String>);
+    /// Stores (or clears, with `None`) the inference session.
+    fn set_session(&mut self, session: Option<Session>);
+
+    /// Adopts a ready session: records the names of its inputs, then stores the session.
+    fn init(&mut self, session: Session) {
+        let mut input_names: Vec<String> = Vec::new();
+        for input in session.inputs().iter() {
+            input_names.push(input.name().to_string());
+        }
+
+        self.set_input_names(input_names);
+        self.set_session(Some(session));
+    }
+
+    /// Loads the model stored at `path` and adopts the resulting session.
+    ///
+    /// # Errors
+    ///
+    /// Returns the builder or load error, or an `OcrError::Ort` carrying
+    /// "ORT session initialization panicked" when session creation panics.
+    fn init_model(
+        &mut self,
+        path: &str,
+        num_thread: usize,
+        builder_fn: Option<fn(SessionBuilder) -> Result<SessionBuilder, ort::Error>>,
+    ) -> Result<(), OcrError> {
+        // Session creation runs under catch_unwind so that a panic inside ORT
+        // initialization (notably on Windows) surfaces as an error rather than
+        // poisoning mutexes held further up the stack.
+        let outcome = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
+            let mut builder = self.get_session_builder(num_thread, builder_fn)?;
+            builder.commit_from_file(path).map_err(OcrError::from)
+        }));
+
+        let session = match outcome {
+            Ok(loaded) => loaded?,
+            Err(_) => {
+                return Err(OcrError::Ort(ort::Error::new("ORT session initialization panicked")));
+            }
+        };
+
+        self.init(session);
+        Ok(())
+    }
+
+    /// Loads a model from an in-memory buffer and adopts the resulting session.
+    ///
+    /// # Errors
+    ///
+    /// Returns the builder or load error, or an `OcrError::Ort` carrying
+    /// "ORT session initialization panicked" when session creation panics.
+    fn init_model_from_memory(
+        &mut self,
+        model_bytes: &[u8],
+        num_thread: usize,
+        builder_fn: Option<fn(SessionBuilder) -> Result<SessionBuilder, ort::Error>>,
+    ) -> Result<(), OcrError> {
+        // Same panic containment as `init_model`.
+        let outcome = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
+            let mut builder = self.get_session_builder(num_thread, builder_fn)?;
+            builder.commit_from_memory(model_bytes).map_err(OcrError::from)
+        }));
+
+        let session = match outcome {
+            Ok(loaded) => loaded?,
+            Err(_) => {
+                return Err(OcrError::Ort(ort::Error::new("ORT session initialization panicked")));
+            }
+        };
+
+        self.init(session);
+        Ok(())
+    }
+}
--- a/crates/kreuzberg-paddle-ocr/src/constants.rs
+++ b/crates/kreuzberg-paddle-ocr/src/constants.rs
@@ -0,0 +1,33 @@
+//! Shared normalization constants for PaddleOCR preprocessing.
+//!
+//! Each scheme is a pair of per-channel arrays: a mean that is subtracted from the
+//! pixel value and a factor the difference is then multiplied by.
+//!
+//! Two normalization schemes are used:
+//!
+//! - **ImageNet** (`IMAGENET_MEAN_VALUES` / `IMAGENET_NORM_VALUES`): used by the text
+//!   detection network (`DbNet`) and the angle classifier (`AngleNet`).
+//!   Formula: `(pixel - mean * 255) * (1 / (std * 255))`.
+//!
+//! - **CRNN** (`CRNN_MEAN_VALUES` / `CRNN_NORM_VALUES`): used by the text recognition
+//!   network (`CrnnNet`).
+//!   Formula: `(pixel - 127.5) * (1 / 127.5)`, which sends 0 to -1.0 and 255 to 1.0.
+
+/// ImageNet channel means (R, G, B), pre-multiplied by 255.
+///
+/// Derived from `[0.485, 0.456, 0.406]` (per-channel ImageNet means).
+/// Used by `DbNet` (text detection) and `AngleNet` (angle classification).
+pub(crate) const IMAGENET_MEAN_VALUES: [f32; 3] = [0.485 * 255.0, 0.456 * 255.0, 0.406 * 255.0];
+
+/// ImageNet channel normalization factors (R, G, B), equal to `1 / (std * 255)`.
+///
+/// Derived from `[0.229, 0.224, 0.225]` (per-channel ImageNet standard deviations).
+/// Used by `DbNet` (text detection) and `AngleNet` (angle classification).
+pub(crate) const IMAGENET_NORM_VALUES: [f32; 3] = [1.0 / (0.229 * 255.0), 1.0 / (0.224 * 255.0), 1.0 / (0.225 * 255.0)];
+
+/// CRNN channel means (R, G, B): `127.5` for all channels.
+///
+/// Used by `CrnnNet` (text recognition).
+pub(crate) const CRNN_MEAN_VALUES: [f32; 3] = [127.5, 127.5, 127.5];
+
+/// CRNN channel normalization factors (R, G, B): `1 / 127.5` for all channels.
+///
+/// Used by `CrnnNet` (text recognition).
+pub(crate) const CRNN_NORM_VALUES: [f32; 3] = [1.0 / 127.5, 1.0 / 127.5, 1.0 / 127.5];
--- a/crates/kreuzberg-paddle-ocr/src/crnn_net.rs
+++ b/crates/kreuzberg-paddle-ocr/src/crnn_net.rs
@@ -0,0 +1,393 @@
+use ndarray::Array4;
+use ort::session::Session;
+use ort::value::Tensor;
+use ort::{inputs, session::builder::SessionBuilder};
+use std::collections::HashMap;
+
+use crate::{
+    base_net::BaseNet,
+    constants::{CRNN_MEAN_VALUES, CRNN_NORM_VALUES},
+    ocr_error::OcrError,
+    ocr_result::TextLine,
+    ocr_utils::OcrUtils,
+};
+
+/// Height, in pixels, that every text crop is resized to before recognition.
+const CRNN_DST_HEIGHT: u32 = 48;
+
+/// Text-recognition network: an ONNX Runtime session together with the character
+/// dictionary used to decode its output.
+#[derive(Debug)]
+pub struct CrnnNet {
+    /// Recognition session; `None` until a model has been loaded.
+    session: Option<Session>,
+    /// Character dictionary; the class index picked at each timestep indexes into it.
+    keys: Vec<String>,
+    /// Input names reported by the session; the first one is used when running inference.
+    input_names: Vec<String>,
+}
+
+impl BaseNet for CrnnNet {
+    /// Creates an empty network: no session, no dictionary and no input names.
+    fn new() -> Self {
+        Self {
+            session: None,
+            keys: Vec::new(),
+            input_names: Vec::new(),
+        }
+    }
+
+    /// Replaces the stored input names.
+    fn set_input_names(&mut self, input_names: Vec<String>) {
+        self.input_names = input_names;
+    }
+
+    /// Replaces the stored session; passing `None` drops the current one.
+    fn set_session(&mut self, session: Option<Session>) {
+        self.session = session;
+    }
+}
+
+impl CrnnNet {
+    /// Loads the recognition model from `path` and takes the character dictionary
+    /// from the model's own metadata.
+    ///
+    /// # Errors
+    ///
+    /// Fails when the session cannot be created or when `get_keys` cannot read the
+    /// dictionary. In the latter case the session stays loaded but `keys` is unchanged.
+    pub fn init_model(
+        &mut self,
+        path: &str,
+        num_thread: usize,
+        builder_fn: Option<fn(SessionBuilder) -> Result<SessionBuilder, ort::Error>>,
+    ) -> Result<(), OcrError> {
+        <Self as BaseNet>::init_model(self, path, num_thread, builder_fn)?;
+
+        let keys = self.get_keys()?;
+        self.keys = keys;
+
+        Ok(())
+    }
+
+    /// Loads the recognition model from `path` and the character dictionary from the
+    /// separate text file at `dict_file_path`.
+    ///
+    /// # Errors
+    ///
+    /// Fails when the session cannot be created or the dictionary file cannot be read.
+    pub fn init_model_dict_file(
+        &mut self,
+        path: &str,
+        num_thread: usize,
+        builder_fn: Option<fn(SessionBuilder) -> Result<SessionBuilder, ort::Error>>,
+        dict_file_path: &str,
+    ) -> Result<(), OcrError> {
+        <Self as BaseNet>::init_model(self, path, num_thread, builder_fn)?;
+
+        // The dictionary is only read once the model itself has loaded successfully.
+        self.read_keys_from_file(dict_file_path)
+    }
+
+    /// Loads the recognition model from `model_bytes` and takes the character
+    /// dictionary from the model's own metadata.
+    ///
+    /// # Errors
+    ///
+    /// Fails when the session cannot be created or when `get_keys` cannot read the
+    /// dictionary. In the latter case the session stays loaded but `keys` is unchanged.
+    pub fn init_model_from_memory(
+        &mut self,
+        model_bytes: &[u8],
+        num_thread: usize,
+        builder_fn: Option<fn(SessionBuilder) -> Result<SessionBuilder, ort::Error>>,
+    ) -> Result<(), OcrError> {
+        <Self as BaseNet>::init_model_from_memory(self, model_bytes, num_thread, builder_fn)?;
+
+        let keys = self.get_keys()?;
+        self.keys = keys;
+
+        Ok(())
+    }
+
+    /// Reads the character dictionary stored under the `character` key of the model's
+    /// custom metadata, one entry per `\n`-separated line.
+    ///
+    /// # Errors
+    ///
+    /// `OcrError::SessionNotInitialized` when no session is loaded, any error from the
+    /// metadata query, or a `NotFound` I/O error when the `character` key is absent.
+    fn get_keys(&mut self) -> Result<Vec<String>, OcrError> {
+        let Some(session) = self.session.as_ref() else {
+            return Err(OcrError::SessionNotInitialized);
+        };
+
+        let metadata = session.metadata()?;
+        let Some(character_list) = metadata.custom("character") else {
+            return Err(OcrError::Io(std::io::Error::new(
+                std::io::ErrorKind::NotFound,
+                "crnn_net character not found in metadata",
+            )));
+        };
+
+        // PP-OCRv5 model metadata already includes the CTC blank token ("#") at
+        // index 0 and the space token (" ") at the end.  Do NOT prepend/append
+        // extra tokens — doing so shifts every character index by one and
+        // produces garbled output.
+        let mut keys: Vec<String> = Vec::new();
+        for entry in character_list.split('\n') {
+            keys.push(entry.to_string());
+        }
+
+        Ok(keys)
+    }
+
+    /// Loads the character dictionary from the text file at `path`, one entry per line.
+    ///
+    /// Lines are separated by `\n`. A trailing `\r` left behind by CRLF line endings is
+    /// removed so that entries never carry a stray carriage return into the recognized
+    /// text. Nothing else is trimmed, because whitespace entries such as the space
+    /// token are meaningful.
+    ///
+    /// # Errors
+    ///
+    /// Returns `OcrError::Io` when the file cannot be read as UTF-8 text.
+    fn read_keys_from_file(&mut self, path: &str) -> Result<(), OcrError> {
+        let content = std::fs::read_to_string(path)?;
+
+        // PP-OCRv5 dict files already include the CTC blank token ("#") at
+        // index 0 and the space token (" ") at the end.  Do NOT prepend/append
+        // extra tokens — doing so shifts every character index by one and
+        // produces garbled output.
+        self.keys = content
+            .split('\n')
+            .map(|line| line.strip_suffix('\r').unwrap_or(line).to_string())
+            .collect();
+
+        Ok(())
+    }
+
+    /// Recognizes the text in every crop of `part_imgs`; results follow the order of
+    /// `part_imgs`.
+    ///
+    /// Crops are first recognized in batches of `batch_size`. Any result whose score is
+    /// NaN or below `angle_rollback_threshold` is then recognized again from the image
+    /// stored under the same index in `angle_rollback_records`, if there is one, and
+    /// that second result replaces the batched one.
+    ///
+    /// # Errors
+    ///
+    /// Propagates any error from the batched pass or from a re-recognition.
+    pub fn get_text_lines(
+        &self,
+        part_imgs: &[image::RgbImage],
+        angle_rollback_records: &HashMap<usize, image::RgbImage>,
+        angle_rollback_threshold: f32,
+        batch_size: u32,
+    ) -> Result<Vec<TextLine>, OcrError> {
+        if part_imgs.is_empty() {
+            return Ok(Vec::new());
+        }
+
+        // Batch recognition: sort by aspect ratio, batch, pad to max width
+        let mut text_lines = self.get_text_lines_batched(part_imgs, batch_size)?;
+
+        // Angle rollback: re-recognize individual images that scored poorly
+        for (index, text_line) in text_lines.iter_mut().enumerate() {
+            let scored_poorly = text_line.text_score.is_nan() || text_line.text_score < angle_rollback_threshold;
+            if !scored_poorly {
+                continue;
+            }
+
+            if let Some(rollback_img) = angle_rollback_records.get(&index) {
+                *text_line = self.get_text_line(rollback_img)?;
+            }
+        }
+
+        Ok(text_lines)
+    }
+
+    /// Batch recognition: sort crops by target width, group them into batches, pad each
+    /// batch to its widest crop, and run a single ONNX inference per batch. Matches the
+    /// PaddleOCR/RapidOCR batching strategy.
+    ///
+    /// A `batch_size` of 0 is treated as 1. A batch that holds exactly one crop is sent
+    /// through `get_text_line` instead, which needs no padding. The returned lines are
+    /// in the order of `part_imgs`, one per crop.
+    ///
+    /// # Errors
+    ///
+    /// Returns `OcrError::SessionNotInitialized` when no session is loaded, any error
+    /// reported by ONNX Runtime, and an `InvalidData` I/O error when the batched output
+    /// is missing, lacks the `[batch, timesteps, num_classes]` dimensions, or holds
+    /// fewer rows than crops were submitted. Rejecting such output keeps the result
+    /// aligned index-for-index with `part_imgs`.
+    ///
+    /// # Panics
+    ///
+    /// Panics if a resized crop's raw buffer is not `rows * cols * 3` bytes long.
+    fn get_text_lines_batched(
+        &self,
+        part_imgs: &[image::RgbImage],
+        batch_size: u32,
+    ) -> Result<Vec<TextLine>, OcrError> {
+        let session = self.session.as_ref().ok_or(OcrError::SessionNotInitialized)?;
+        let batch_size = (batch_size as usize).max(1);
+
+        let invalid_output =
+            |msg: &'static str| OcrError::Io(std::io::Error::new(std::io::ErrorKind::InvalidData, msg));
+
+        // Compute target widths and sort indices by aspect ratio (width/height)
+        let mut indexed_widths: Vec<(usize, u32)> = part_imgs
+            .iter()
+            .enumerate()
+            .map(|(i, img)| {
+                let scale = CRNN_DST_HEIGHT as f32 / img.height().max(1) as f32;
+                let dst_width = (img.width() as f32 * scale).ceil() as u32;
+                (i, dst_width.max(1))
+            })
+            .collect();
+        indexed_widths.sort_by_key(|&(_, w)| w);
+
+        // `pixel * norm - mean * norm` equals `(pixel - mean) * norm`; the second product
+        // is the same for every pixel of every crop, so it is computed once up front.
+        let adjusted = [
+            CRNN_MEAN_VALUES[0] * CRNN_NORM_VALUES[0],
+            CRNN_MEAN_VALUES[1] * CRNN_NORM_VALUES[1],
+            CRNN_MEAN_VALUES[2] * CRNN_NORM_VALUES[2],
+        ];
+
+        let mut results: Vec<(usize, TextLine)> = Vec::with_capacity(part_imgs.len());
+
+        // Process in batches
+        for chunk in indexed_widths.chunks(batch_size) {
+            if chunk.len() == 1 {
+                // Single image — use existing path (no padding overhead)
+                let (orig_idx, _) = chunk[0];
+                let text_line = self.get_text_line(&part_imgs[orig_idx])?;
+                results.push((orig_idx, text_line));
+                continue;
+            }
+
+            // Find max width in this batch
+            let max_width = chunk.iter().map(|&(_, w)| w).max().unwrap_or(1);
+
+            // Build batch tensor [N, 3, 48, max_width] with zero-padding
+            let n = chunk.len();
+            let mut batch_data = Array4::<f32>::zeros((n, 3, CRNN_DST_HEIGHT as usize, max_width as usize));
+
+            for (batch_idx, &(orig_idx, dst_width)) in chunk.iter().enumerate() {
+                let img = &part_imgs[orig_idx];
+                let resized =
+                    image::imageops::resize(img, dst_width, CRNN_DST_HEIGHT, image::imageops::FilterType::Triangle);
+
+                // Normalize and fill into batch tensor (zero-padded on right).
+                // Use raw slice access instead of per-pixel get_pixel() to
+                // eliminate millions of bounds checks in the hot loop.
+                let cols = resized.width() as usize;
+                let rows = resized.height() as usize;
+                let raw = resized.as_raw();
+                assert_eq!(raw.len(), rows * cols * 3, "unexpected image buffer size");
+                for r in 0..rows {
+                    for c in 0..cols {
+                        let base = r * cols * 3 + c * 3;
+                        for ch in 0..3 {
+                            batch_data[[batch_idx, ch, r, c]] =
+                                raw[base + ch] as f32 * CRNN_NORM_VALUES[ch] - adjusted[ch];
+                        }
+                    }
+                }
+                // Remaining columns stay zero (padding)
+            }
+
+            let input_tensor = Tensor::from_array(batch_data)?;
+
+            // SAFETY: ONNX Runtime C API is thread-safe for concurrent inference.
+            // NOTE(review): this obtains `&mut Session` from a shared reference through a
+            // raw-pointer cast, which is generally undefined behavior under Rust's aliasing
+            // rules regardless of what the C API guarantees — consider guarding the session
+            // (e.g. with a `Mutex`) instead.
+            #[allow(unsafe_code)]
+            let outputs = unsafe {
+                let session_ptr = session as *const Session as *mut Session;
+                (*session_ptr).run(inputs![self.input_names[0].as_str() => input_tensor])?
+            };
+
+            let (_, output_value) = outputs
+                .iter()
+                .next()
+                .ok_or_else(|| invalid_output("No output tensors found in batched CRNN session output"))?;
+
+            let (shape, flat_data) = output_value.try_extract_tensor::<f32>()?;
+            // Shape: [batch, timesteps, num_classes]
+            let (Some(&batch_dim), Some(&timesteps), Some(&num_classes)) = (shape.first(), shape.get(1), shape.get(2))
+            else {
+                return Err(invalid_output(
+                    "Batched CRNN output tensor is missing a [batch, timesteps, num_classes] dimension",
+                ));
+            };
+            let batch_dim = batch_dim as usize;
+            let timesteps = timesteps as usize;
+            let num_classes = num_classes as usize;
+
+            // Every submitted crop needs its own output row; otherwise the returned lines
+            // would no longer line up with `part_imgs`.
+            if batch_dim < n {
+                return Err(invalid_output(
+                    "Batched CRNN output tensor has fewer rows than the submitted batch",
+                ));
+            }
+
+            let stride = timesteps * num_classes;
+            for (batch_idx, &(orig_idx, _)) in chunk.iter().enumerate() {
+                let offset = batch_idx * stride;
+                let scores = flat_data
+                    .get(offset..offset + stride)
+                    .ok_or_else(|| invalid_output("Batched CRNN output tensor is shorter than its shape implies"))?;
+                let text_line = Self::score_to_text_line(scores, timesteps, num_classes, &self.keys)?;
+                results.push((orig_idx, text_line));
+            }
+        }
+
+        // Reorder results back to original index order
+        results.sort_by_key(|&(idx, _)| idx);
+        Ok(results.into_iter().map(|(_, tl)| tl).collect())
+    }
+
+    /// Recognizes the text in a single crop.
+    ///
+    /// The crop is resized to a height of `CRNN_DST_HEIGHT` with its width scaled by the
+    /// same factor (rounded up), normalized with the CRNN constants and run through the
+    /// session; the first output is then decoded by `score_to_text_line`.
+    ///
+    /// Degenerate crops are guarded the same way as in `get_text_lines_batched`: a zero
+    /// height is treated as 1 when computing the scale and the target width is at least
+    /// 1, so an empty crop can no longer turn into an infinite scale factor and a
+    /// saturated, gigantic resize width.
+    ///
+    /// # Errors
+    ///
+    /// `OcrError::SessionNotInitialized` when no session is loaded, any error reported
+    /// by ONNX Runtime, and an `InvalidData` I/O error when the session yields no output
+    /// or the output tensor lacks dimension 1 or 2.
+    fn get_text_line(&self, img_src: &image::RgbImage) -> Result<TextLine, OcrError> {
+        let Some(session) = &self.session else {
+            return Err(OcrError::SessionNotInitialized);
+        };
+
+        let scale = CRNN_DST_HEIGHT as f32 / img_src.height().max(1) as f32;
+        let dst_width = ((img_src.width() as f32 * scale).ceil() as u32).max(1);
+
+        let src_resize = image::imageops::resize(
+            img_src,
+            dst_width,
+            CRNN_DST_HEIGHT,
+            image::imageops::FilterType::Triangle,
+        );
+
+        let input_tensors = OcrUtils::substract_mean_normalize(&src_resize, &CRNN_MEAN_VALUES, &CRNN_NORM_VALUES);
+
+        let input_tensors = Tensor::from_array(input_tensors)?;
+
+        // SAFETY: ONNX Runtime C API is thread-safe for concurrent inference.
+        // NOTE(review): this obtains `&mut Session` from a shared reference through a
+        // raw-pointer cast, which is generally undefined behavior under Rust's aliasing
+        // rules regardless of what the C API guarantees — consider guarding the session
+        // (e.g. with a `Mutex`) instead.
+        #[allow(unsafe_code)]
+        let outputs = unsafe {
+            let session_ptr = session as *const Session as *mut Session;
+            (*session_ptr).run(inputs![self.input_names[0].as_str() => input_tensors])?
+        };
+
+        let (_, red_data) = outputs.iter().next().ok_or_else(|| {
+            OcrError::Io(std::io::Error::new(
+                std::io::ErrorKind::InvalidData,
+                "No output tensors found in CRNN session output",
+            ))
+        })?;
+
+        let (shape, src_data) = red_data.try_extract_tensor::<f32>()?;
+        // Dimension 1 is the number of timesteps, dimension 2 the number of classes.
+        let height = *shape.get(1).ok_or_else(|| {
+            OcrError::Io(std::io::Error::new(
+                std::io::ErrorKind::InvalidData,
+                "CRNN output tensor missing height dimension (index 1)",
+            ))
+        })? as usize;
+        let width = *shape.get(2).ok_or_else(|| {
+            OcrError::Io(std::io::Error::new(
+                std::io::ErrorKind::InvalidData,
+                "CRNN output tensor missing width dimension (index 2)",
+            ))
+        })? as usize;
+
+        // The decoder only reads the scores, so the borrowed tensor data is passed as-is.
+        Self::score_to_text_line(src_data, height, width, &self.keys)
+    }
+
+    /// Decodes a `height` x `width` score matrix (timesteps x classes, row-major) into text.
+    ///
+    /// For each timestep the highest-scoring class is picked (the first one on ties).
+    /// Class 0 is the blank and is skipped, a class equal to the previous timestep's
+    /// pick is skipped as a repeat, and classes beyond `keys` are ignored. Every other
+    /// pick appends `keys[class]` to the text. `text_score` is the mean winning score
+    /// of the appended entries, or `0.0` when nothing was appended.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `output_data` holds fewer than `(height - 1) * width` values.
+    fn score_to_text_line(
+        output_data: &[f32],
+        height: usize,
+        width: usize,
+        keys: &[String],
+    ) -> Result<TextLine, OcrError> {
+        let mut decoded = TextLine::default();
+        let mut previous_class = 0;
+        let mut score_total = 0.0;
+        let mut emitted = 0;
+
+        for step in 0..height {
+            let begin = step * width;
+            let end = ((step + 1) * width).min(output_data.len());
+            let row = &output_data[begin..end];
+
+            // Arg-max over the row; strict `>` keeps the earliest class on ties.
+            let mut best_class = 0;
+            let mut best_score = f32::MIN;
+            for (class, &score) in row.iter().enumerate() {
+                if score > best_score {
+                    best_class = class;
+                    best_score = score;
+                }
+            }
+
+            let is_blank = best_class == 0;
+            let is_repeat = step > 0 && best_class == previous_class;
+            if !is_blank && best_class < keys.len() && !is_repeat {
+                decoded.text.push_str(&keys[best_class]);
+                score_total += best_score;
+                emitted += 1;
+            }
+
+            // Blanks are remembered too, so the same class on both sides of a blank
+            // is emitted twice.
+            previous_class = best_class;
+        }
+
+        // Avoid division by zero: handle case where no characters were found
+        decoded.text_score = if emitted > 0 {
+            score_total / emitted as f32
+        } else {
+            0.0
+        };
+        Ok(decoded)
+    }
+}
+
+// Unit tests for output decoding and dictionary loading; none of them loads a model.
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// A timestep whose best class is index 0 (the blank) contributes no text.
+    #[test]
+    fn test_score_to_text_line_skips_blank_index() {
+        // keys[0] = "#" (CTC blank), keys[1] = "a", keys[2] = "b"
+        let keys = vec!["#".to_string(), "a".to_string(), "b".to_string()];
+        // 3 timesteps, 3 classes each. Simulate: blank, "a", "b"
+        let output = vec![
+            1.0, 0.0, 0.0, // timestep 0: max at index 0 (blank) -> skip
+            0.0, 0.9, 0.1, // timestep 1: max at index 1 ("a")
+            0.0, 0.1, 0.8, // timestep 2: max at index 2 ("b")
+        ];
+        let result = CrnnNet::score_to_text_line(&output, 3, 3, &keys).unwrap();
+        assert_eq!(result.text, "ab");
+    }
+
+    /// Consecutive timesteps that pick the same class emit that character only once.
+    #[test]
+    fn test_score_to_text_line_deduplicates_consecutive() {
+        let keys = vec!["#".to_string(), "h".to_string(), "i".to_string()];
+        // 4 timesteps: "h", "h", "i", "i" -> should deduplicate to "hi"
+        let output = vec![
+            0.0, 0.9, 0.0, // "h"
+            0.0, 0.8, 0.0, // "h" again (same index, skip)
+            0.0, 0.0, 0.9, // "i"
+            0.0, 0.0, 0.8, // "i" again (same index, skip)
+        ];
+        let result = CrnnNet::score_to_text_line(&output, 4, 3, &keys).unwrap();
+        assert_eq!(result.text, "hi");
+    }
+
+    /// The dictionary file is loaded line by line without adding or trimming entries.
+    #[test]
+    fn test_read_keys_from_file_preserves_dict_layout() {
+        // Uses a fixed directory name under the system temp dir; it is removed at the
+        // end of the test (best effort, and not when an assertion fails first).
+        let dir = std::env::temp_dir().join("kreuzberg_test_dict");
+        std::fs::create_dir_all(&dir).unwrap();
+        let dict_path = dir.join("test_dict.txt");
+        // PP-OCRv5 dict files already include "#" (blank) at start and " " at end.
+        std::fs::write(&dict_path, "#\na\nb\nc\n ").unwrap();
+
+        let mut net = CrnnNet::new();
+        net.read_keys_from_file(dict_path.to_str().unwrap()).unwrap();
+
+        // Dict is loaded as-is: ["#", "a", "b", "c", " "]
+        assert_eq!(net.keys[0], "#");
+        assert_eq!(net.keys[1], "a");
+        assert_eq!(net.keys[2], "b");
+        assert_eq!(net.keys[3], "c");
+        assert_eq!(net.keys[net.keys.len() - 1], " ");
+
+        std::fs::remove_dir_all(&dir).ok();
+    }
+}
--- a/crates/kreuzberg-paddle-ocr/src/db_net.rs
+++ b/crates/kreuzberg-paddle-ocr/src/db_net.rs
@@ -0,0 +1,421 @@
+use crate::{
+    base_net::BaseNet,
+    constants::{IMAGENET_MEAN_VALUES, IMAGENET_NORM_VALUES},
+    ocr_error::OcrError,
+    ocr_result::{self, TextBox},
+    ocr_utils::OcrUtils,
+    scale_param::ScaleParam,
+};
+use geo_clipper::{Clipper, EndType, JoinType};
+use geo_types::{Coord, LineString, Polygon};
+use ort::{inputs, session::SessionOutputs};
+use ort::{session::Session, value::Tensor};
+use std::cmp::Ordering;
+
+/// Text-detection network: an ONNX Runtime session whose output is turned into text boxes.
+#[derive(Debug)]
+pub struct DbNet {
+    /// Detection session; `None` until a model has been loaded.
+    session: Option<Session>,
+    /// Input names reported by the session; the first one is used when running inference.
+    input_names: Vec<String>,
+}
+
+impl BaseNet for DbNet {
+    /// Creates an empty network: no session and no input names.
+    fn new() -> Self {
+        Self {
+            session: None,
+            input_names: Vec::new(),
+        }
+    }
+
+    /// Replaces the stored input names.
+    fn set_input_names(&mut self, input_names: Vec<String>) {
+        self.input_names = input_names;
+    }
+
+    /// Replaces the stored session; passing `None` drops the current one.
+    fn set_session(&mut self, session: Option<Session>) {
+        self.session = session;
+    }
+}
+
+impl DbNet {
+    /// Detects text regions in `img_src`.
+    ///
+    /// The image is resized to `scale.dst_width` x `scale.dst_height`, normalized with
+    /// the ImageNet constants and run through the detection session. The session output
+    /// is converted into boxes by `get_text_boxes_core`, which receives the remaining
+    /// arguments unchanged.
+    ///
+    /// # Errors
+    ///
+    /// `OcrError::SessionNotInitialized` when no session is loaded, any error reported
+    /// by ONNX Runtime, and any error from `get_text_boxes_core`.
+    pub fn get_text_boxes(
+        &self,
+        img_src: &image::RgbImage,
+        scale: &ScaleParam,
+        box_score_thresh: f32,
+        box_thresh: f32,
+        un_clip_ratio: f32,
+        thresh: f32,
+    ) -> Result<Vec<TextBox>, OcrError> {
+        let session = self.session.as_ref().ok_or(OcrError::SessionNotInitialized)?;
+
+        let resized = image::imageops::resize(
+            img_src,
+            scale.dst_width,
+            scale.dst_height,
+            image::imageops::FilterType::Triangle,
+        );
+        let normalized = OcrUtils::substract_mean_normalize(&resized, &IMAGENET_MEAN_VALUES, &IMAGENET_NORM_VALUES);
+        let tensor = Tensor::from_array(normalized)?;
+
+        // SAFETY: ONNX Runtime's C API (OrtRun) is thread-safe for concurrent inference
+        // on the same session. The ort crate's `&mut self` requirement is overly
+        // conservative. This matches the pattern used in kreuzberg's embedding engine.
+        #[allow(unsafe_code)]
+        let outputs = unsafe {
+            let session_ptr = session as *const Session as *mut Session;
+            (*session_ptr).run(inputs![self.input_names[0].as_str() => tensor])?
+        };
+
+        // A fresh `ScaleParam` built from the caller's fields is what the core routine sees.
+        let core_scale = ScaleParam::new(
+            scale.src_width,
+            scale.src_height,
+            scale.dst_width,
+            scale.dst_height,
+            scale.scale_width,
+            scale.scale_height,
+        );
+
+        Self::get_text_boxes_core(
+            &outputs,
+            resized.height(),
+            resized.width(),
+            &core_scale,
+            box_score_thresh,
+            box_thresh,
+            un_clip_ratio,
+            thresh,
+        )
+    }
+
+    /// Converts the detector's first output tensor into text boxes in source-image
+    /// coordinates.
+    ///
+    /// The predictions are read as a `cols` x `rows` map, binarized at `thresh` and
+    /// traced into contours. A contour is dropped when it has at most two points, when
+    /// the shorter edge of its minimum-area box is below 3, when its score from
+    /// `get_score` is below `box_score_thresh`, when `unclip` yields nothing, or when
+    /// the shorter edge of the expanded box is below 5. Surviving corners are divided
+    /// by the scale factors in `s` and clamped to `s.src_width` / `s.src_height`.
+    ///
+    /// `_box_thresh` is accepted but not used.
+    ///
+    /// # Errors
+    ///
+    /// An `InvalidData` I/O error when there is no output tensor or the prediction
+    /// count does not fit `cols` x `rows`, plus any error from tensor extraction or
+    /// the geometry helpers.
+    fn get_text_boxes_core(
+        output_tensor: &SessionOutputs,
+        rows: u32,
+        cols: u32,
+        s: &ScaleParam,
+        box_score_thresh: f32,
+        _box_thresh: f32,
+        un_clip_ratio: f32,
+        thresh: f32,
+    ) -> Result<Vec<TextBox>, OcrError> {
+        let max_side_thresh = 3.0;
+
+        let Some((_, red_data)) = output_tensor.iter().next() else {
+            return Err(OcrError::Io(std::io::Error::new(
+                std::io::ErrorKind::InvalidData,
+                "No output tensors found in session output",
+            )));
+        };
+
+        let pred_data: Vec<f32> = red_data.try_extract_tensor::<f32>()?.1.to_vec();
+
+        // 8-bit copy of the predictions, used only for thresholding and contour tracing.
+        let cbuf_data: Vec<u8> = pred_data.iter().map(|pixel| (pixel * 255.0) as u8).collect();
+
+        let Some(pred_img) = image::ImageBuffer::<image::Luma<f32>, Vec<f32>>::from_vec(cols, rows, pred_data) else {
+            return Err(OcrError::Io(std::io::Error::new(
+                std::io::ErrorKind::InvalidData,
+                format!(
+                    "Failed to create image buffer from predictions: {} x {} dimensions may be invalid",
+                    cols, rows
+                ),
+            )));
+        };
+
+        let Some(cbuf_img) = image::GrayImage::from_vec(cols, rows, cbuf_data) else {
+            return Err(OcrError::Io(std::io::Error::new(
+                std::io::ErrorKind::InvalidData,
+                format!(
+                    "Failed to create grayscale image buffer: {} x {} dimensions may be invalid",
+                    cols, rows
+                ),
+            )));
+        };
+
+        let threshold_img = imageproc::contrast::threshold(
+            &cbuf_img,
+            (thresh * 255.0) as u8,
+            imageproc::contrast::ThresholdType::Binary,
+        );
+
+        // RapidOCR and PaddleOCR reference do NOT apply dilation before contour extraction.
+        // Dilation merges adjacent text regions, causing word concatenation.
+        let img_contours: Vec<imageproc::contours::Contour<i32>> = imageproc::contours::find_contours(&threshold_img);
+
+        // Pre-allocate based on contour count to avoid repeated reallocations.
+        let mut boxes = Vec::with_capacity(img_contours.len());
+
+        for contour in img_contours {
+            if contour.points.len() <= 2 {
+                continue;
+            }
+
+            // `get_mini_box` reports the shorter edge of the fitted box through its out-parameter.
+            let mut shorter_edge = 0.0;
+            let min_box = Self::get_mini_box(&contour.points, &mut shorter_edge)?;
+            if shorter_edge < max_side_thresh {
+                continue;
+            }
+
+            let score = Self::get_score(&contour, &pred_img)?;
+            if score < box_score_thresh {
+                continue;
+            }
+
+            let clip_box = Self::unclip(&min_box, un_clip_ratio)?;
+            if clip_box.is_empty() {
+                continue;
+            }
+
+            let mut shorter_edge_clip = 0.0;
+            let clip_min_box = Self::get_mini_box(&clip_box, &mut shorter_edge_clip)?;
+            if shorter_edge_clip < max_side_thresh + 2.0 {
+                continue;
+            }
+
+            // Map back to source-image coordinates and clamp to the source size.
+            let points = clip_min_box
+                .into_iter()
+                .map(|corner| ocr_result::Point {
+                    x: ((corner.x / s.scale_width) as u32).min(s.src_width),
+                    y: ((corner.y / s.scale_height) as u32).min(s.src_height),
+                })
+                .collect();
+
+            boxes.push(TextBox { score, points });
+        }
+
+        Ok(boxes)
+    }
+
+    /// Fits a minimum-area rectangle around `contour_points` and returns its four
+    /// corners in a fixed order.
+    ///
+    /// The shorter of two adjacent rectangle edges is written to `min_edge_size`. The
+    /// corners are sorted by `x`; the result is then: the lower-`y` corner of the two
+    /// leftmost, the lower-`y` corner of the two rightmost, the higher-`y` corner of
+    /// the two rightmost, and the higher-`y` corner of the two leftmost. With `y`
+    /// growing downward (assumed image convention) that reads top-left, top-right,
+    /// bottom-right, bottom-left.
+    fn get_mini_box(
+        contour_points: &[imageproc::point::Point<i32>],
+        min_edge_size: &mut f32,
+    ) -> Result<Vec<imageproc::point::Point<f32>>, OcrError> {
+        let mut corners: Vec<imageproc::point::Point<f32>> = imageproc::geometry::min_area_rect(contour_points)
+            .iter()
+            .map(|p| imageproc::point::Point::new(p.x as f32, p.y as f32))
+            .collect();
+
+        // Euclidean distance between two corners, by index.
+        let edge = |a: usize, b: usize| {
+            let dx = corners[a].x - corners[b].x;
+            let dy = corners[a].y - corners[b].y;
+            (dx * dx + dy * dy).sqrt()
+        };
+        *min_edge_size = edge(0, 1).min(edge(1, 2));
+
+        // Incomparable values (NaN) order as `Less`.
+        corners.sort_by(|a, b| a.x.partial_cmp(&b.x).unwrap_or(Ordering::Less));
+
+        // After sorting, indices 0/1 are the left pair and 2/3 the right pair.
+        let (first, fourth) = if corners[1].y > corners[0].y { (0, 1) } else { (1, 0) };
+        let (second, third) = if corners[3].y > corners[2].y { (2, 3) } else { (3, 2) };
+
+        Ok(vec![corners[first], corners[second], corners[third], corners[fourth]])
+    }
+
+    /// Scores a contour against the prediction map `f_map_mat`.
+    ///
+    /// The contour's bounding box is clamped to the map, the contour is filled into a
+    /// mask of that size, and the matching crop of the map is reduced to a single value
+    /// by `OcrUtils::calculate_mean_with_mask`. Returns `0.0` when the clamped bounding
+    /// box has no area.
+    fn get_score(
+        contour: &imageproc::contours::Contour<i32>,
+        f_map_mat: &image::ImageBuffer<image::Luma<f32>, Vec<f32>>,
+    ) -> Result<f32, OcrError> {
+        // Bounding box of the contour in one pass.
+        let (mut xmin, mut xmax, mut ymin, mut ymax) = contour.points.iter().fold(
+            (i32::MAX, i32::MIN, i32::MAX, i32::MIN),
+            |(x_lo, x_hi, y_lo, y_hi), p| (x_lo.min(p.x), x_hi.max(p.x), y_lo.min(p.y), y_hi.max(p.y)),
+        );
+
+        let width = f_map_mat.width() as i32;
+        let height = f_map_mat.height() as i32;
+
+        // Keep the box inside the map.
+        xmin = xmin.max(0).min(width - 1);
+        xmax = xmax.max(0).min(width - 1);
+        ymin = ymin.max(0).min(height - 1);
+        ymax = ymax.max(0).min(height - 1);
+
+        let roi_width = xmax - xmin + 1;
+        let roi_height = ymax - ymin + 1;
+        if roi_width <= 0 || roi_height <= 0 {
+            return Ok(0.0);
+        }
+
+        // Fill the contour, shifted into the box's local coordinates, into a mask.
+        let mut mask = image::GrayImage::new(roi_width as u32, roi_height as u32);
+        let pts: Vec<imageproc::point::Point<i32>> = contour
+            .points
+            .iter()
+            .map(|p| imageproc::point::Point::new(p.x - xmin, p.y - ymin))
+            .collect();
+        imageproc::drawing::draw_polygon_mut(&mut mask, pts.as_slice(), image::Luma([255]));
+
+        let cropped_img =
+            image::imageops::crop_imm(f_map_mat, xmin as u32, ymin as u32, roi_width as u32, roi_height as u32)
+                .to_image();
+
+        Ok(OcrUtils::calculate_mean_with_mask(&cropped_img, &mask))
+    }
+
+    /// Expands the box `box_points` outward by `area * unclip_ratio / perimeter` and
+    /// returns the outline of the first resulting polygon, truncated to integers.
+    ///
+    /// Returns an empty vector when both measured box edges are shorter than 1.001 or
+    /// when the offset produces no polygon.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `box_points` has fewer than three points.
+    fn unclip(
+        box_points: &[imageproc::point::Point<f32>],
+        unclip_ratio: f32,
+    ) -> Result<Vec<imageproc::point::Point<i32>>, OcrError> {
+        // Euclidean distance between two box corners, by index.
+        let edge = |a: usize, b: usize| {
+            let dx = box_points[a].x - box_points[b].x;
+            let dy = box_points[a].y - box_points[b].y;
+            (dx * dx + dy * dy).sqrt()
+        };
+        let clip_rect_width = edge(0, 1);
+        let clip_rect_height = edge(1, 2);
+
+        if clip_rect_height < 1.001 && clip_rect_width < 1.001 {
+            return Ok(Vec::new());
+        }
+
+        let ring: Vec<Coord<f64>> = box_points
+            .iter()
+            .map(|pt| Coord {
+                x: pt.x as f64,
+                y: pt.y as f64,
+            })
+            .collect();
+
+        let area = Self::signed_polygon_area(box_points).abs();
+        let length = Self::length_of_points(box_points);
+        let distance = area * unclip_ratio / length as f32;
+
+        let expanded = Polygon::new(LineString::new(ring), vec![])
+            .offset(distance as f64, JoinType::Round(2.0), EndType::ClosedPolygon, 1.0)
+            .0;
+
+        let Some(first_polygon) = expanded.first() else {
+            return Ok(Vec::new());
+        };
+
+        Ok(first_polygon
+            .exterior()
+            .points()
+            .map(|ip| imageproc::point::Point::new(ip.x() as i32, ip.y() as i32))
+            .collect())
+    }
+
+    /// Signed area of the closed polygon through `points`, summed edge by edge as
+    /// `(x_next - x_cur) * (y_next + y_cur) / 2`, with the last point joined back to
+    /// the first. The sign depends on the winding direction of the points.
+    ///
+    /// Returns `0.0` for an empty slice, like `length_of_points` does, instead of
+    /// panicking.
+    fn signed_polygon_area(points: &[imageproc::point::Point<f32>]) -> f32 {
+        // Pair every vertex with its successor; `cycle` wraps the last one back to the
+        // first without copying the slice.
+        let successors = points.iter().cycle().skip(1);
+
+        points
+            .iter()
+            .zip(successors)
+            .fold(0.0, |area, (cur, next)| area + (next.x - cur.x) * (next.y + cur.y) / 2.0)
+    }
+
+    /// Perimeter of the closed polygon through `box_points`: the sum of the distances
+    /// between consecutive points plus the closing segment back to the first point.
+    ///
+    /// Returns `0.0` for an empty slice. Distances are computed in `f64`.
+    fn length_of_points(box_points: &[imageproc::point::Point<f32>]) -> f64 {
+        // Pair every point with its successor; the last point pairs with the first.
+        let successors = box_points.iter().cycle().skip(1);
+
+        box_points.iter().zip(successors).fold(0.0, |total, (from, to)| {
+            let dx = to.x as f64 - from.x as f64;
+            let dy = to.y as f64 - from.y as f64;
+            total + (dx * dx + dy * dy).sqrt()
+        })
+    }
+}
--- a/crates/kreuzberg-paddle-ocr/src/lib.rs
+++ b/crates/kreuzberg-paddle-ocr/src/lib.rs
@@ -0,0 +1,32 @@
+//! # kreuzberg-paddle-ocr
+//!
+//! PaddleOCR via ONNX Runtime for Kreuzberg - high-performance text detection and recognition.
+//!
+//! This crate is vendored from [paddle-ocr-rs](https://github.com/mg-chao/paddle-ocr-rs)
+//! by mg-chao, with modifications for Kreuzberg integration.
+//!
+//! ## ONNX Runtime Requirement
+//!
+//! Requires **ONNX Runtime 1.24+** at runtime.
+//!
+//! ## Original License
+//!
+//! The original paddle-ocr-rs is licensed under Apache-2.0.
+//! This vendored version is relicensed to MIT with the original author's copyright retained.
+
+#![allow(clippy::too_many_arguments)]
+
+pub mod angle_net;
+pub mod base_net;
+pub(crate) mod constants;
+pub mod crnn_net;
+pub mod db_net;
+pub mod ocr_error;
+pub mod ocr_lite;
+pub mod ocr_result;
+pub mod ocr_utils;
+pub mod scale_param;
+
+pub use ocr_error::OcrError;
+pub use ocr_lite::OcrLite;
+pub use ocr_result::{Angle, OcrResult, Point, TextBlock, TextBox, TextLine};
--- a/crates/kreuzberg-paddle-ocr/src/ocr_error.rs
+++ b/crates/kreuzberg-paddle-ocr/src/ocr_error.rs
@@ -0,0 +1,13 @@
+use thiserror::Error;
+
+/// Errors returned by the OCR networks in this crate.
+#[derive(Error, Debug)]
+pub enum OcrError {
+    /// An error reported by ONNX Runtime through the `ort` crate.
+    #[error("Ort error: {0}")]
+    Ort(#[from] ort::Error),
+    /// An I/O failure. The networks also build this variant themselves, with
+    /// `InvalidData` or `NotFound` kinds, for unusable model output or metadata.
+    #[error("IO error: {0}")]
+    Io(#[from] std::io::Error),
+    /// An error from the `image` crate.
+    #[error("Image error: {0}")]
+    ImageError(#[from] image::ImageError),
+    /// A network was used before a model session was loaded into it.
+    #[error("Session not initialized")]
+    SessionNotInitialized,
+}
--- a/crates/kreuzberg-paddle-ocr/src/ocr_lite.rs
+++ b/crates/kreuzberg-paddle-ocr/src/ocr_lite.rs
@@ -0,0 +1,447 @@
+use std::collections::HashMap;
+
+use image::ImageBuffer;
+use ort::session::builder::SessionBuilder;
+
+use crate::{
+    angle_net::AngleNet,
+    base_net::BaseNet,
+    crnn_net::CrnnNet,
+    db_net::DbNet,
+    ocr_error::OcrError,
+    ocr_result::{OcrResult, Point, TextBlock, TextBox},
+    ocr_utils::OcrUtils,
+    scale_param::ScaleParam,
+};
+
+/// OCR pipeline bundling the three networks used by the `detect*` methods.
+#[derive(Debug)]
+pub struct OcrLite {
+    /// Detection network; supplies the text boxes for an image.
+    db_net: DbNet,
+    /// Angle network; supplies one angle result per cropped text region.
+    angle_net: AngleNet,
+    /// Recognition network; turns cropped text regions into text lines.
+    crnn_net: CrnnNet,
+}
+
+// SAFETY: OcrLite inference methods (&self) use unsafe pointer casts to call
+// ort Session::run, which is thread-safe at the ONNX Runtime C API level.
+// After initialization (&mut self), no mutable state is accessed during inference.
+// NOTE(review): this argument depends on the internals of `DbNet`, `AngleNet` and
+// `CrnnNet`, which live in other modules — re-verify it whenever those types gain
+// new fields or interior mutability.
+unsafe impl Send for OcrLite {}
+unsafe impl Sync for OcrLite {}
+
+// `OcrLite::default()` is the same as `OcrLite::new()`.
+impl Default for OcrLite {
+    fn default() -> Self {
+        OcrLite::new()
+    }
+}
+
+impl OcrLite {
+    /// Create an `OcrLite` with freshly constructed detection, angle and
+    /// recognition networks.
+    ///
+    /// Models are loaded separately through the `init_models*` methods.
+    pub fn new() -> Self {
+        let db_net = DbNet::new();
+        let angle_net = AngleNet::new();
+        let crnn_net = CrnnNet::new();
+
+        Self {
+            db_net,
+            angle_net,
+            crnn_net,
+        }
+    }
+
+    /// Load the detection, classification and recognition models from files.
+    ///
+    /// # Arguments
+    ///
+    /// - `det_path` - Model path handed to the detection network
+    /// - `cls_path` - Model path handed to the angle network
+    /// - `rec_path` - Model path handed to the recognition network
+    /// - `num_thread` - Thread count forwarded to every network
+    ///
+    /// # Errors
+    ///
+    /// Returns the first loader error; networks after the failing one are not initialized.
+    pub fn init_models(
+        &mut self,
+        det_path: &str,
+        cls_path: &str,
+        rec_path: &str,
+        num_thread: usize,
+    ) -> Result<(), OcrError> {
+        // No custom session builder for any of the three networks.
+        let no_builder: Option<fn(SessionBuilder) -> Result<SessionBuilder, ort::Error>> = None;
+
+        self.db_net.init_model(det_path, num_thread, no_builder)?;
+        self.angle_net.init_model(cls_path, num_thread, no_builder)?;
+        self.crnn_net.init_model(rec_path, num_thread, no_builder)?;
+
+        Ok(())
+    }
+
+    /// Load all three models from files, reading the recognition dictionary
+    /// from `dict_path`.
+    ///
+    /// Equivalent to [`OcrLite::init_models_with_dict_custom`] without a
+    /// custom session builder.
+    ///
+    /// # Errors
+    ///
+    /// Returns the first loader error; networks after the failing one are not initialized.
+    pub fn init_models_with_dict(
+        &mut self,
+        det_path: &str,
+        cls_path: &str,
+        rec_path: &str,
+        dict_path: &str,
+        num_thread: usize,
+    ) -> Result<(), OcrError> {
+        self.init_models_with_dict_custom(det_path, cls_path, rec_path, dict_path, num_thread, None)
+    }
+
+    /// Load all three models from files, applying `builder_fn` to each
+    /// network's session builder.
+    ///
+    /// # Errors
+    ///
+    /// Returns the first loader error; networks after the failing one are not initialized.
+    pub fn init_models_custom(
+        &mut self,
+        det_path: &str,
+        cls_path: &str,
+        rec_path: &str,
+        builder_fn: fn(SessionBuilder) -> Result<SessionBuilder, ort::Error>,
+    ) -> Result<(), OcrError> {
+        // NOTE(review): a thread count of 0 is passed here — presumably threading is
+        // left to `builder_fn`; confirm against `init_model`.
+        let threads = 0;
+        let builder = Some(builder_fn);
+
+        self.db_net.init_model(det_path, threads, builder)?;
+        self.angle_net.init_model(cls_path, threads, builder)?;
+        self.crnn_net.init_model(rec_path, threads, builder)?;
+
+        Ok(())
+    }
+
+    /// Initialize models with dictionary file and custom session builder.
+    ///
+    /// Combines `init_models_with_dict` and `init_models_custom`: loads the
+    /// dictionary for the recognition model while applying a custom ORT
+    /// session builder (e.g. for GPU execution providers).
+    ///
+    /// # Arguments
+    ///
+    /// - `det_path` - Model path handed to the detection network
+    /// - `cls_path` - Model path handed to the angle network
+    /// - `rec_path` - Model path handed to the recognition network
+    /// - `dict_path` - Dictionary path handed to the recognition network
+    /// - `num_thread` - Thread count forwarded to every network
+    /// - `builder_fn` - Optional session-builder hook forwarded to every network
+    ///
+    /// # Errors
+    ///
+    /// Returns the first loader error; networks after the failing one are not initialized.
+    pub fn init_models_with_dict_custom(
+        &mut self,
+        det_path: &str,
+        cls_path: &str,
+        rec_path: &str,
+        dict_path: &str,
+        num_thread: usize,
+        builder_fn: Option<fn(SessionBuilder) -> Result<SessionBuilder, ort::Error>>,
+    ) -> Result<(), OcrError> {
+        // Networks are initialized in order: detection, angle, recognition.
+        self.db_net.init_model(det_path, num_thread, builder_fn)?;
+        self.angle_net.init_model(cls_path, num_thread, builder_fn)?;
+        self.crnn_net
+            .init_model_dict_file(rec_path, num_thread, builder_fn, dict_path)?;
+        Ok(())
+    }
+
+    /// Load the detection, classification and recognition models from
+    /// in-memory byte buffers.
+    ///
+    /// # Arguments
+    ///
+    /// - `det_bytes` - Model bytes handed to the detection network
+    /// - `cls_bytes` - Model bytes handed to the angle network
+    /// - `rec_bytes` - Model bytes handed to the recognition network
+    /// - `num_thread` - Thread count forwarded to every network
+    ///
+    /// # Errors
+    ///
+    /// Returns the first loader error; networks after the failing one are not initialized.
+    pub fn init_models_from_memory(
+        &mut self,
+        det_bytes: &[u8],
+        cls_bytes: &[u8],
+        rec_bytes: &[u8],
+        num_thread: usize,
+    ) -> Result<(), OcrError> {
+        // No custom session builder for any of the three networks.
+        let no_builder: Option<fn(SessionBuilder) -> Result<SessionBuilder, ort::Error>> = None;
+
+        self.db_net.init_model_from_memory(det_bytes, num_thread, no_builder)?;
+        self.angle_net.init_model_from_memory(cls_bytes, num_thread, no_builder)?;
+        self.crnn_net.init_model_from_memory(rec_bytes, num_thread, no_builder)?;
+
+        Ok(())
+    }
+
+    /// Load all three models from in-memory byte buffers, applying
+    /// `builder_fn` to each network's session builder.
+    ///
+    /// # Errors
+    ///
+    /// Returns the first loader error; networks after the failing one are not initialized.
+    pub fn init_models_from_memory_custom(
+        &mut self,
+        det_bytes: &[u8],
+        cls_bytes: &[u8],
+        rec_bytes: &[u8],
+        builder_fn: fn(SessionBuilder) -> Result<SessionBuilder, ort::Error>,
+    ) -> Result<(), OcrError> {
+        // NOTE(review): a thread count of 0 is passed here — presumably threading is
+        // left to `builder_fn`; confirm against `init_model_from_memory`.
+        let threads = 0;
+        let builder = Some(builder_fn);
+
+        self.db_net.init_model_from_memory(det_bytes, threads, builder)?;
+        self.angle_net.init_model_from_memory(cls_bytes, threads, builder)?;
+        self.crnn_net.init_model_from_memory(rec_bytes, threads, builder)?;
+
+        Ok(())
+    }
+
+    /// Shared implementation behind the public `detect*` entry points.
+    ///
+    /// Picks the target side length, pads the image, derives the scale
+    /// parameters, and runs one detection pass through `detect_once`.
+    fn detect_base(
+        &self,
+        img_src: &image::RgbImage,
+        padding: u32,
+        max_side_len: u32,
+        box_score_thresh: f32,
+        box_thresh: f32,
+        un_clip_ratio: f32,
+        do_angle: bool,
+        most_angle: bool,
+        angle_rollback: bool,
+        angle_rollback_threshold: f32,
+        cls_thresh: f32,
+        thresh: f32,
+    ) -> Result<OcrResult, OcrError> {
+        // A `max_side_len` of 0, or one larger than the image, keeps the
+        // image's own longest side.
+        let longest_side = img_src.width().max(img_src.height());
+        let target_side = if max_side_len == 0 || max_side_len > longest_side {
+            longest_side
+        } else {
+            max_side_len
+        };
+        // The border added on both sides counts towards the target size.
+        let resize = target_side + 2 * padding;
+
+        // `make_padding` hands back a `Cow`, so nothing is copied when `padding` is 0.
+        let padded = OcrUtils::make_padding(img_src, padding)?;
+        let scale = ScaleParam::get_scale_param(&padded, resize);
+
+        self.detect_once(
+            &padded,
+            &scale,
+            padding,
+            box_score_thresh,
+            box_thresh,
+            un_clip_ratio,
+            do_angle,
+            most_angle,
+            angle_rollback,
+            angle_rollback_threshold,
+            cls_thresh,
+            thresh,
+        )
+    }
+
+    /// Classification threshold the public `detect*` methods pass to the angle network.
+    const DEFAULT_CLS_THRESH: f32 = 0.9;
+    /// Threshold the public `detect*` methods pass to the detection network.
+    const DEFAULT_THRESH: f32 = 0.3;
+    /// Value passed to the recognition network as its batch size.
+    const DEFAULT_REC_BATCH_SIZE: u32 = 6;
+
+    /// Detect text in image
+    ///
+    /// Angle rollback is disabled; use [`OcrLite::detect_angle_rollback`] to enable it.
+    ///
+    /// # Arguments
+    ///
+    /// - `img_src` - Input image
+    /// - `padding` - Padding width added during image transformation (improves detection)
+    /// - `max_side_len` - Maximum side length after transformation (larger images will be scaled down)
+    /// - `box_score_thresh` - Score threshold for text region detection
+    /// - `box_thresh` - Box threshold
+    /// - `un_clip_ratio` - Unclip ratio
+    /// - `do_angle` - Whether to perform angle detection
+    /// - `most_angle` - Use most common angle for all text regions
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if padding the image or any of the three networks fails.
+    pub fn detect(
+        &self,
+        img_src: &image::RgbImage,
+        padding: u32,
+        max_side_len: u32,
+        box_score_thresh: f32,
+        box_thresh: f32,
+        un_clip_ratio: f32,
+        do_angle: bool,
+        most_angle: bool,
+    ) -> Result<OcrResult, OcrError> {
+        self.detect_base(
+            img_src,
+            padding,
+            max_side_len,
+            box_score_thresh,
+            box_thresh,
+            un_clip_ratio,
+            do_angle,
+            most_angle,
+            // Rollback is off, so its threshold is unused.
+            false,
+            0.0,
+            Self::DEFAULT_CLS_THRESH,
+            Self::DEFAULT_THRESH,
+        )
+    }
+
+    /// Detect text, with angle rollback enabled.
+    ///
+    /// Works like [`OcrLite::detect`], except that when `do_angle` is true and
+    /// an angle-corrected region recognizes poorly, the angle correction is
+    /// reverted for that region.
+    ///
+    /// # Arguments
+    ///
+    /// - `img_src` - Input image
+    /// - `padding` - Padding width added during image transformation
+    /// - `max_side_len` - Maximum side length after transformation
+    /// - `box_score_thresh` - Score threshold for text region detection
+    /// - `box_thresh` - Box threshold
+    /// - `un_clip_ratio` - Unclip ratio
+    /// - `do_angle` - Whether to perform angle detection
+    /// - `most_angle` - Use most common angle
+    /// - `angle_rollback_threshold` - If text score is below this value (or NaN), angle correction is reverted
+    pub fn detect_angle_rollback(
+        &self,
+        img_src: &image::RgbImage,
+        padding: u32,
+        max_side_len: u32,
+        box_score_thresh: f32,
+        box_thresh: f32,
+        un_clip_ratio: f32,
+        do_angle: bool,
+        most_angle: bool,
+        angle_rollback_threshold: f32,
+    ) -> Result<OcrResult, OcrError> {
+        // This entry point always switches rollback on.
+        let angle_rollback = true;
+        let cls_thresh = Self::DEFAULT_CLS_THRESH;
+        let thresh = Self::DEFAULT_THRESH;
+
+        self.detect_base(
+            img_src,
+            padding,
+            max_side_len,
+            box_score_thresh,
+            box_thresh,
+            un_clip_ratio,
+            do_angle,
+            most_angle,
+            angle_rollback,
+            angle_rollback_threshold,
+            cls_thresh,
+            thresh,
+        )
+    }
+
+    /// Load the image at `img_path` and run [`OcrLite::detect`] on it.
+    ///
+    /// The file is opened with `image::open` and converted to 8-bit RGB; all
+    /// other arguments are forwarded unchanged.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if the image cannot be opened or decoded, or if detection fails.
+    pub fn detect_from_path(
+        &self,
+        img_path: &str,
+        padding: u32,
+        max_side_len: u32,
+        box_score_thresh: f32,
+        box_thresh: f32,
+        un_clip_ratio: f32,
+        do_angle: bool,
+        most_angle: bool,
+    ) -> Result<OcrResult, OcrError> {
+        let decoded = image::open(img_path)?;
+        let rgb = decoded.to_rgb8();
+
+        self.detect(
+            &rgb,
+            padding,
+            max_side_len,
+            box_score_thresh,
+            box_thresh,
+            un_clip_ratio,
+            do_angle,
+            most_angle,
+        )
+    }
+
+    /// Order text boxes for reading: top-to-bottom, then left-to-right.
+    ///
+    /// The sort key is the first point of each box, compared by Y and then by
+    /// X. A box without points sorts as if its first point were `(0, 0)`.
+    /// The sort is stable. Matches the primary ordering of PaddleOCR Python's
+    /// `sorted_boxes`.
+    fn sort_text_boxes(text_boxes: &mut [TextBox]) {
+        text_boxes.sort_by_key(|text_box| {
+            text_box
+                .points
+                .first()
+                .map_or((0, 0), |corner| (corner.y, corner.x))
+        });
+    }
+
+    /// Run the full OCR pipeline once on an already padded image.
+    ///
+    /// Steps, in order:
+    /// 1. detect text boxes and sort them into reading order;
+    /// 2. crop one image per box;
+    /// 3. classify the angle of every crop and rotate crops whose angle index
+    ///    is 1 by 180°, keeping the unrotated copy when `angle_rollback` is set;
+    /// 4. recognize the text of every crop;
+    /// 5. build the result, shifting box coordinates back by `padding` so they
+    ///    refer to the unpadded image.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if any of the three networks fails.
+    fn detect_once(
+        &self,
+        img_src: &image::RgbImage,
+        scale: &ScaleParam,
+        padding: u32,
+        box_score_thresh: f32,
+        box_thresh: f32,
+        un_clip_ratio: f32,
+        do_angle: bool,
+        most_angle: bool,
+        angle_rollback: bool,
+        angle_rollback_threshold: f32,
+        cls_thresh: f32,
+        thresh: f32,
+    ) -> Result<OcrResult, OcrError> {
+        let mut text_boxes =
+            self.db_net
+                .get_text_boxes(img_src, scale, box_score_thresh, box_thresh, un_clip_ratio, thresh)?;
+
+        // Sort boxes in reading order (top-to-bottom, left-to-right)
+        Self::sort_text_boxes(&mut text_boxes);
+
+        let part_images = OcrUtils::get_part_images(img_src, &text_boxes);
+
+        let angles = self
+            .angle_net
+            .get_angles(&part_images, do_angle, most_angle, cls_thresh)?;
+
+        let mut rotated_images: Vec<image::RgbImage> = Vec::with_capacity(part_images.len());
+
+        // Unrotated copies of the crops that were flipped, keyed by crop index,
+        // so the recognizer can fall back to them.
+        let mut angle_rollback_records = HashMap::<usize, ImageBuffer<image::Rgb<u8>, Vec<u8>>>::new();
+
+        for (index, (angle, mut part_image)) in angles.iter().zip(part_images).enumerate() {
+            if angle.index == 1 {
+                if angle_rollback {
+                    // Keep original copy
+                    angle_rollback_records.insert(index, part_image.clone());
+                }
+
+                OcrUtils::mat_rotate_clock_wise_180(&mut part_image);
+            }
+            rotated_images.push(part_image);
+        }
+
+        let text_lines = self.crnn_net.get_text_lines(
+            &rotated_images,
+            &angle_rollback_records,
+            angle_rollback_threshold,
+            Self::DEFAULT_REC_BATCH_SIZE,
+        )?;
+
+        // Pair every recognized line with its box and angle. Zipping instead of
+        // indexing means a length mismatch truncates the result rather than panicking.
+        let text_blocks = text_lines
+            .into_iter()
+            .zip(text_boxes.iter().zip(angles.iter()))
+            .map(|(text_line, (text_box, angle))| TextBlock {
+                box_points: text_box
+                    .points
+                    .iter()
+                    .map(|p| Point {
+                        // Integer subtraction clamped at 0: exact for every `u32`,
+                        // unlike a round-trip through `f32`.
+                        x: p.x.saturating_sub(padding),
+                        y: p.y.saturating_sub(padding),
+                    })
+                    .collect(),
+                box_score: text_box.score,
+                angle_index: angle.index,
+                angle_score: angle.score,
+                text: text_line.text,
+                text_score: text_line.text_score,
+            })
+            .collect();
+
+        Ok(OcrResult { text_blocks })
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::ocr_result::TextBox;
+
+    /// Build a 100x20 axis-aligned box whose first point is `(x, y)`.
+    fn make_box(x: u32, y: u32) -> TextBox {
+        let (right, bottom) = (x + 100, y + 20);
+        TextBox {
+            points: vec![
+                Point { x, y },
+                Point { x: right, y },
+                Point { x: right, y: bottom },
+                Point { x, y: bottom },
+            ],
+            score: 0.9,
+        }
+    }
+
+    /// X coordinate of each box's first point, in slice order.
+    fn first_xs(boxes: &[TextBox]) -> Vec<u32> {
+        boxes.iter().map(|b| b.points[0].x).collect()
+    }
+
+    /// Y coordinate of each box's first point, in slice order.
+    fn first_ys(boxes: &[TextBox]) -> Vec<u32> {
+        boxes.iter().map(|b| b.points[0].y).collect()
+    }
+
+    #[test]
+    fn test_sort_text_boxes_top_to_bottom() {
+        let mut boxes = vec![make_box(10, 100), make_box(10, 50), make_box(10, 10)];
+        OcrLite::sort_text_boxes(&mut boxes);
+        assert_eq!(first_ys(&boxes), vec![10, 50, 100]);
+    }
+
+    #[test]
+    fn test_sort_text_boxes_same_line_left_to_right() {
+        // Equal Y values fall back to ordering by X.
+        let mut boxes = vec![make_box(200, 10), make_box(100, 10), make_box(50, 10)];
+        OcrLite::sort_text_boxes(&mut boxes);
+        assert_eq!(first_xs(&boxes), vec![50, 100, 200]);
+    }
+
+    #[test]
+    fn test_sort_text_boxes_multi_line() {
+        // Strict (y, x) ordering: the y=50 line comes first, left to right,
+        // followed by the y=100 line, left to right.
+        let mut boxes = vec![
+            make_box(300, 50),  // line 1, right
+            make_box(100, 100), // line 2, left
+            make_box(50, 50),   // line 1, left
+            make_box(200, 100), // line 2, right
+        ];
+        OcrLite::sort_text_boxes(&mut boxes);
+        assert_eq!(first_xs(&boxes), vec![50, 300, 100, 200]);
+    }
+
+    #[test]
+    fn test_sort_text_boxes_empty() {
+        let mut boxes: Vec<TextBox> = Vec::new();
+        OcrLite::sort_text_boxes(&mut boxes);
+        assert!(boxes.is_empty());
+    }
+
+    #[test]
+    fn test_sort_text_boxes_single() {
+        let mut boxes = vec![make_box(10, 20)];
+        OcrLite::sort_text_boxes(&mut boxes);
+        assert_eq!(boxes.len(), 1);
+    }
+}
--- a/crates/kreuzberg-paddle-ocr/src/ocr_result.rs
+++ b/crates/kreuzberg-paddle-ocr/src/ocr_result.rs
@@ -0,0 +1,105 @@
+use std::fmt::{self, Write};
+
+use serde::{Deserialize, Serialize};
+
+/// A 2D point with unsigned integer coordinates.
+#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
+pub struct Point {
+    /// X coordinate.
+    pub x: u32,
+    /// Y coordinate.
+    pub y: u32,
+}
+
+/// A text region described by its corner points and a score.
+#[derive(Debug, Serialize, Deserialize)]
+pub struct TextBox {
+    /// Corner points of the region; the `Display` impl prints the first four.
+    pub points: Vec<Point>,
+    /// Score attached to this box.
+    pub score: f32,
+}
+
+impl fmt::Display for TextBox {
+    /// Formats the score followed by the first four corner points.
+    ///
+    /// A box with fewer than four points is rendered as its score and point
+    /// count instead, so formatting never indexes out of bounds.
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        match self.points.as_slice() {
+            [p0, p1, p2, p3, ..] => write!(
+                f,
+                "TextBox [score({}), [x: {}, y: {}], [x: {}, y: {}], [x: {}, y: {}], [x: {}, y: {}]]",
+                self.score, p0.x, p0.y, p1.x, p1.y, p2.x, p2.y, p3.x, p3.y,
+            ),
+            too_few => write!(
+                f,
+                "TextBox [score({}), points_count({})]",
+                self.score,
+                too_few.len()
+            ),
+        }
+    }
+}
+
+/// Angle result for one text region.
+#[derive(Debug, Default)]
+pub struct Angle {
+    /// Angle index; the `Display` impl labels negative values as `AngleDisabled`.
+    pub index: i32,
+    /// Score attached to the angle result.
+    pub score: f32,
+}
+
+impl fmt::Display for Angle {
+    /// Formats as `Angle[...]`, or `AngleDisabled[...]` when the index is negative.
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        let Self { index, score } = self;
+        let label = if *index < 0 { "AngleDisabled" } else { "Angle" };
+        write!(f, "{}[Index({}), Score({})]", label, index, score)
+    }
+}
+
+/// Recognized text for one text region.
+#[derive(Debug, Default)]
+pub struct TextLine {
+    /// The recognized text.
+    pub text: String,
+    /// Score attached to the recognized text.
+    pub text_score: f32,
+}
+
+impl fmt::Display for TextLine {
+    /// Formats the text and its score on a single line.
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        let Self { text, text_score } = self;
+        write!(f, "TextLine[Text({}),TextScore({})]", text, text_score)
+    }
+}
+
+/// One entry of an OCR result: a text box together with its angle result
+/// and recognized text.
+#[derive(Debug, Serialize, Deserialize)]
+pub struct TextBlock {
+    /// Corner points of the text box.
+    pub box_points: Vec<Point>,
+    /// Score of the text box.
+    pub box_score: f32,
+
+    /// Angle index for this region.
+    pub angle_index: i32,
+    /// Score of the angle result.
+    pub angle_score: f32,
+
+    /// Recognized text.
+    pub text: String,
+    /// Score of the recognized text.
+    pub text_score: f32,
+}
+
+/// The outcome of one OCR run: every recognized text block.
+///
+/// Derives `Debug` like every other public type in this module, so results
+/// can be logged and used with `assert!`-style macros.
+#[derive(Debug, Serialize, Deserialize)]
+pub struct OcrResult {
+    /// Text blocks in the order they were produced.
+    pub text_blocks: Vec<TextBlock>,
+}
+
+impl fmt::Display for OcrResult {
+    /// Writes every text block back to back, with no separator in between.
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        // Render into a scratch buffer first, then emit it in a single call.
+        let mut rendered = String::new();
+        self.text_blocks.iter().try_for_each(|block| {
+            write!(
+                rendered,
+                "TextBlock[BoxPointsLen({}), BoxScore({}), AngleIndex({}), AngleScore({}), Text({}), TextScore({})]",
+                block.box_points.len(),
+                block.box_score,
+                block.angle_index,
+                block.angle_score,
+                block.text,
+                block.text_score
+            )
+        })?;
+        f.write_str(&rendered)
+    }
+}
--- a/crates/kreuzberg-paddle-ocr/src/ocr_utils.rs
+++ b/crates/kreuzberg-paddle-ocr/src/ocr_utils.rs
@@ -0,0 +1,206 @@
+use std::borrow::Cow;
+
+use crate::{
+    ocr_error::OcrError,
+    ocr_result::{Point, TextBox},
+};
+use image::imageops;
+use imageproc::geometric_transformations::{Interpolation, Projection};
+use ndarray::{Array, Array4};
+
+/// Namespace type for the image helper functions used by the OCR pipeline.
+pub struct OcrUtils;
+
+impl OcrUtils {
+    /// Normalize an RGB image and convert it from interleaved HWC bytes to a
+    /// `(1, 3, H, W)` CHW tensor.
+    ///
+    /// Per channel: `output[ch] = pixel[ch] * norm[ch] - mean[ch] * norm[ch]`.
+    ///
+    /// This is a hot path, so:
+    /// - the `mean * norm` products are computed once up front;
+    /// - each channel plane is filled through one contiguous slice rather than
+    ///   through `tensor[[0, ch, r, c]]` indexing, which keeps the inner loop
+    ///   simple enough for LLVM to auto-vectorize.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `mean_vals` or `norm_vals` has fewer than three elements.
+    pub fn substract_mean_normalize(img_src: &image::RgbImage, mean_vals: &[f32], norm_vals: &[f32]) -> Array4<f32> {
+        let width = img_src.width() as usize;
+        let height = img_src.height() as usize;
+        let plane_len = height * width;
+
+        let mut tensor = Array::zeros((1, 3, height, width));
+
+        // mean[ch] * norm[ch], hoisted out of the per-pixel loop.
+        let offsets: [f32; 3] = std::array::from_fn(|ch| mean_vals[ch] * norm_vals[ch]);
+
+        let pixels = img_src.as_raw();
+
+        // The tensor is C-contiguous, so plane [0, ch] is one H*W run of memory.
+        for (ch, &offset) in offsets.iter().enumerate() {
+            let scale = norm_vals[ch];
+            let plane_view = tensor
+                .slice_mut(ndarray::s![0, ch, .., ..])
+                .into_shape_with_order(plane_len)
+                .expect("contiguous plane slice");
+            let plane = plane_view.into_slice().expect("contiguous memory");
+
+            // Each 3-byte chunk of `pixels` is one pixel in R, G, B order.
+            for (out, rgb) in plane.iter_mut().zip(pixels.chunks_exact(3)) {
+                *out = rgb[ch] as f32 * scale - offset;
+            }
+        }
+
+        tensor
+    }
+
+    /// Add a white border of `padding` pixels on every side of the image, or
+    /// borrow the image unchanged when `padding` is 0.
+    ///
+    /// Returns a `Cow` so the common no-padding case does not clone the image.
+    ///
+    /// # Errors
+    ///
+    /// Currently never fails; the `Result` is kept for interface stability.
+    pub fn make_padding<'a>(img_src: &'a image::RgbImage, padding: u32) -> Result<Cow<'a, image::RgbImage>, OcrError> {
+        if padding == 0 {
+            return Ok(Cow::Borrowed(img_src));
+        }
+
+        let border = 2 * padding;
+        let white = image::Rgb([255, 255, 255]);
+
+        // Allocate the canvas already filled with white, instead of
+        // zero-filling it and then drawing a full-size rectangle over it.
+        let mut padded = image::RgbImage::from_pixel(img_src.width() + border, img_src.height() + border, white);
+
+        // Copy the source into the middle, leaving `padding` pixels on each side.
+        image::imageops::replace(&mut padded, img_src, padding as i64, padding as i64);
+
+        Ok(Cow::Owned(padded))
+    }
+
+    /// Crop one straightened image per text box, in the same order as `text_boxes`.
+    pub fn get_part_images(img_src: &image::RgbImage, text_boxes: &[TextBox]) -> Vec<image::RgbImage> {
+        let mut crops = Vec::with_capacity(text_boxes.len());
+        for text_box in text_boxes {
+            crops.push(Self::get_rotate_crop_image(img_src, &text_box.points));
+        }
+        crops
+    }
+
+    /// Cut the region described by `box_points` out of `img_src` and straighten it.
+    ///
+    /// The bounding box of all points is cropped first. The first four points
+    /// are then mapped, in order, onto the top-left, top-right, bottom-right
+    /// and bottom-left corners of an upright rectangle whose width is the
+    /// distance between points 0 and 1 and whose height is the distance
+    /// between points 0 and 3. If the result is at least 1.5 times taller than
+    /// wide, it is rotated by 90° counter-clockwise.
+    ///
+    /// Fallbacks:
+    /// - no points: an empty (0x0) image;
+    /// - fewer than four points, a zero-sized target rectangle, or a
+    ///   projection that cannot be computed: the plain bounding-box crop.
+    pub fn get_rotate_crop_image(img_src: &image::RgbImage, box_points: &[Point]) -> image::RgbImage {
+        // Without points there is no bounding box: the fold below would leave
+        // the minimum at `u32::MAX` and the maximum at 0, and `max - min`
+        // would underflow.
+        if box_points.is_empty() {
+            return image::RgbImage::new(0, 0);
+        }
+
+        let mut points = box_points.to_vec();
+
+        // Calculate bounding box
+        let (min_x, min_y, max_x, max_y) = points.iter().fold(
+            (u32::MAX, u32::MAX, 0u32, 0u32),
+            |(min_x, min_y, max_x, max_y), point| {
+                (
+                    min_x.min(point.x),
+                    min_y.min(point.y),
+                    max_x.max(point.x),
+                    max_y.max(point.y),
+                )
+            },
+        );
+
+        // Crop image; `max >= min` holds because there is at least one point.
+        let img_crop = imageops::crop_imm(img_src, min_x, min_y, max_x - min_x, max_y - min_y).to_image();
+
+        // Express the points relative to the crop's top-left corner.
+        for point in &mut points {
+            point.x = point.x.saturating_sub(min_x);
+            point.y = point.y.saturating_sub(min_y);
+        }
+
+        // Ensure we have enough points for transformation
+        if points.len() < 4 {
+            // Fallback: return the cropped image as-is if we don't have 4 points
+            return img_crop;
+        }
+
+        // Direct multiplication instead of .pow(2) — avoids integer power function overhead.
+        let dx_w = (points[0].x as i32 - points[1].x as i32) as f32;
+        let dy_w = (points[0].y as i32 - points[1].y as i32) as f32;
+        let img_crop_width = (dx_w * dx_w + dy_w * dy_w).sqrt() as u32;
+        let dx_h = (points[0].x as i32 - points[3].x as i32) as f32;
+        let dy_h = (points[0].y as i32 - points[3].y as i32) as f32;
+        let img_crop_height = (dx_h * dx_h + dy_h * dy_h).sqrt() as u32;
+
+        // Ensure dimensions are valid (non-zero)
+        if img_crop_width == 0 || img_crop_height == 0 {
+            return img_crop;
+        }
+
+        let src_points = [
+            (points[0].x as f32, points[0].y as f32),
+            (points[1].x as f32, points[1].y as f32),
+            (points[2].x as f32, points[2].y as f32),
+            (points[3].x as f32, points[3].y as f32),
+        ];
+
+        let dst_points = [
+            (0.0, 0.0),
+            (img_crop_width as f32, 0.0),
+            (img_crop_width as f32, img_crop_height as f32),
+            (0.0, img_crop_height as f32),
+        ];
+
+        // If projection cannot be created, return the cropped image as fallback
+        let Some(projection) = Projection::from_control_points(src_points, dst_points) else {
+            return img_crop;
+        };
+
+        // Pixels that map outside the crop are filled with white.
+        let mut part_img = image::RgbImage::new(img_crop_width, img_crop_height);
+        imageproc::geometric_transformations::warp_into(
+            &img_crop,
+            &projection,
+            Interpolation::Nearest,
+            image::Rgb([255, 255, 255]),
+            &mut part_img,
+        );
+
+        // Rotate image if needed: a region at least 1.5x taller than wide is
+        // turned by 90° counter-clockwise.
+        if part_img.height() >= part_img.width() * 3 / 2 {
+            let last_column = part_img.width() - 1;
+            let mut rotated = image::RgbImage::new(part_img.height(), part_img.width());
+
+            for (x, y, pixel) in part_img.enumerate_pixels() {
+                rotated.put_pixel(y, last_column - x, *pixel);
+            }
+
+            rotated
+        } else {
+            part_img
+        }
+    }
+
+    /// Rotate `src` by 180° in place.
+    pub fn mat_rotate_clock_wise_180(src: &mut image::RgbImage) {
+        image::imageops::rotate180_in_place(src);
+    }
+
+    /// Mean of the `img` values whose corresponding `mask` value is non-zero.
+    ///
+    /// Returns `0.0` when the mask selects nothing. Works on the raw buffers
+    /// rather than per-pixel `get_pixel()` calls, which is friendlier to the
+    /// cache and to auto-vectorization.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `img` and `mask` differ in width or height.
+    pub fn calculate_mean_with_mask(
+        img: &image::ImageBuffer<image::Luma<f32>, Vec<f32>>,
+        mask: &image::ImageBuffer<image::Luma<u8>, Vec<u8>>,
+    ) -> f32 {
+        assert_eq!(img.width(), mask.width());
+        assert_eq!(img.height(), mask.height());
+
+        // Accumulate, in buffer order, the sum and count of the selected values.
+        let (total, selected) = img
+            .as_raw()
+            .iter()
+            .zip(mask.as_raw().iter())
+            .filter(|&(_, &m)| m > 0)
+            .fold((0.0f32, 0u32), |(total, selected), (&value, _)| {
+                (total + value, selected + 1)
+            });
+
+        if selected == 0 {
+            0.0
+        } else {
+            total / selected as f32
+        }
+    }
+}
--- a/crates/kreuzberg-paddle-ocr/src/scale_param.rs
+++ b/crates/kreuzberg-paddle-ocr/src/scale_param.rs
@@ -0,0 +1,69 @@
+/// Source size, target size, and the per-axis scale factors between them.
+#[derive(Debug)]
+pub struct ScaleParam {
+    /// Width of the source image.
+    pub src_width: u32,
+    /// Height of the source image.
+    pub src_height: u32,
+    /// Target width.
+    pub dst_width: u32,
+    /// Target height.
+    pub dst_height: u32,
+    /// Horizontal scale factor; `get_scale_param` sets it to `dst_width / src_width`.
+    pub scale_width: f32,
+    /// Vertical scale factor; `get_scale_param` sets it to `dst_height / src_height`.
+    pub scale_height: f32,
+}
+
+impl ScaleParam {
+    /// Build a `ScaleParam` from explicit values; nothing is validated or derived.
+    pub fn new(
+        src_width: u32,
+        src_height: u32,
+        dst_width: u32,
+        dst_height: u32,
+        scale_width: f32,
+        scale_height: f32,
+    ) -> Self {
+        Self {
+            src_width,
+            src_height,
+            dst_width,
+            dst_height,
+            scale_width,
+            scale_height,
+        }
+    }
+
+    /// Compute the scaling that fits the longest side of `src` to `target_size`.
+    ///
+    /// Both sides are scaled by the same ratio, then each is rounded down to a
+    /// multiple of 32 with a floor of 32. The floor also applies when a side
+    /// scales down to 0 (for example a very thin image), so the target never
+    /// has a zero-sized dimension. The returned scale factors are the actual
+    /// `dst / src` ratios after rounding, so they can differ per axis.
+    pub fn get_scale_param(src: &image::RgbImage, target_size: u32) -> Self {
+        // Round down to a multiple of 32, never going below 32.
+        fn snap_to_32(value: u32) -> u32 {
+            ((value / 32) * 32).max(32)
+        }
+
+        let src_width = src.width();
+        let src_height = src.height();
+
+        // One ratio for both axes, chosen so the longest side hits `target_size`.
+        let longest_side = src_width.max(src_height);
+        let ratio: f32 = target_size as f32 / longest_side as f32;
+
+        let dst_width = snap_to_32((src_width as f32 * ratio) as u32);
+        let dst_height = snap_to_32((src_height as f32 * ratio) as u32);
+
+        let scale_width = dst_width as f32 / src_width as f32;
+        let scale_height = dst_height as f32 / src_height as f32;
+
+        Self::new(src_width, src_height, dst_width, dst_height, scale_width, scale_height)
+    }
+}
+
+impl std::fmt::Display for ScaleParam {
+    /// Formats all six fields as comma-separated `name:value` pairs.
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        let Self {
+            src_width,
+            src_height,
+            dst_width,
+            dst_height,
+            scale_width,
+            scale_height,
+        } = self;
+
+        write!(
+            f,
+            "src_width:{},src_height:{},dst_width:{},dst_height:{},scale_width:{},scale_height:{}",
+            src_width, src_height, dst_width, dst_height, scale_width, scale_height
+        )
+    }
+}
--- a/crates/kreuzberg-paddle-ocr/tests/diagnostic.rs
+++ b/crates/kreuzberg-paddle-ocr/tests/diagnostic.rs
@@ -0,0 +1,436 @@
+//! Diagnostic test to trace PaddleOCR detection pipeline.
+//!
+//! This test isolates each step to determine where empty results originate.
+//! Since this crate doesn't have PNG/image decoder features, we create test
+//! images programmatically.
+
+use std::path::PathBuf;
+
+/// Returns the directory two levels above this crate's manifest directory,
+/// which these tests treat as the workspace root.
+///
+/// Panics if the manifest directory has fewer than two ancestors.
+fn get_workspace_root() -> PathBuf {
+    let crate_dir = std::path::Path::new(env!("CARGO_MANIFEST_DIR"));
+    // ancestors(): index 0 is the crate dir itself, 1 its parent, 2 its grandparent.
+    crate_dir.ancestors().nth(2).unwrap().to_path_buf()
+}
+
+/// Returns `<workspace root>/.kreuzberg/paddle-ocr`, where the tests look for the ONNX models.
+fn get_model_dir() -> PathBuf {
+    let mut model_dir = get_workspace_root();
+    model_dir.push(".kreuzberg/paddle-ocr");
+    model_dir
+}
+
+/// Builds a 200x100 white image with black shapes drawn on it.
+///
+/// The shapes are a block-letter "H" and a solid rectangle, so the detection
+/// model has high-contrast content to find. The image is generated in memory,
+/// which avoids needing PNG decoder features.
+fn create_test_image() -> image::RgbImage {
+    let white = image::Rgb([255, 255, 255]);
+    let black = image::Rgb([0, 0, 0]);
+    let mut img = image::RgbImage::from_pixel(200u32, 100u32, white);
+
+    // Paints every pixel in the half-open rectangle `xs` x `ys` black.
+    let mut fill_black = |xs: std::ops::Range<u32>, ys: std::ops::Range<u32>| {
+        for y in ys {
+            for x in xs.clone() {
+                img.put_pixel(x, y, black);
+            }
+        }
+    };
+
+    // "H" shape: two 3px-wide vertical bars joined by a 3px-tall crossbar.
+    fill_black(20..23, 20..80);
+    fill_black(55..58, 20..80);
+    fill_black(20..58, 48..51);
+
+    // Thick solid block to be very obvious (x: 80-180, y: 30-70).
+    fill_black(80..180, 30..70);
+
+    img
+}
+
+/// Runs `OcrLite::detect` on a synthetic image with several parameter sets and
+/// logs what each one finds.
+///
+/// This is a diagnostic, not an assertion-based test: it only panics when model
+/// initialization fails. It is skipped (early return) when the detection model
+/// file is not present under the model directory.
+#[test]
+fn diagnostic_detection_pipeline() {
+    let model_dir = get_model_dir();
+    let det_path = model_dir.join("det/model.onnx");
+
+    if !det_path.exists() {
+        eprintln!("SKIP: Models not downloaded at {:?}", model_dir);
+        return;
+    }
+
+    // Discover ORT library
+    discover_ort();
+
+    eprintln!("=== PaddleOCR Diagnostic Test ===");
+    eprintln!("Model dir: {:?}", model_dir);
+
+    // Step 1: synthetic input image.
+    let img = create_test_image();
+    eprintln!("Step 1 - Test image created: {}x{}", img.width(), img.height());
+
+    // Step 2: load the detection, classification and recognition models.
+    let mut ocr_lite = kreuzberg_paddle_ocr::OcrLite::new();
+    let cls_path = model_dir.join("cls/model.onnx");
+    let rec_path = model_dir.join("rec/model.onnx");
+
+    let init_result = ocr_lite.init_models(
+        det_path.to_str().unwrap(),
+        cls_path.to_str().unwrap(),
+        rec_path.to_str().unwrap(),
+        1,
+    );
+
+    if let Err(e) = &init_result {
+        eprintln!("Step 2 - FAILED to init models: {:?}", e);
+        panic!("Model initialization failed: {:?}", e);
+    }
+    eprintln!("Step 2 - Models initialized successfully");
+
+    // Step 3: run detection once per scenario.
+    // Tuple layout: (label, padding, max_side, box_score, box_thresh, unclip, do_angle, most_angle).
+    let scenarios = [
+        ("A: Default params", 50u32, 960u32, 0.3f32, 0.5f32, 1.6f32, true, false),
+        ("B: Very low thresholds", 50, 960, 0.01, 0.01, 1.6, false, false),
+        ("C: No padding + low", 0, 960, 0.01, 0.01, 1.6, false, false),
+        ("D: Higher unclip ratio", 50, 960, 0.1, 0.1, 3.0, false, false),
+        ("E: No padding + medium", 0, 960, 0.1, 0.3, 2.0, false, false),
+    ];
+
+    let mut any_detected = false;
+
+    for (name, padding, max_side, box_score, box_thresh, unclip, do_angle, most_angle) in scenarios {
+        eprintln!("\n--- Test {} ---", name);
+        eprintln!(
+            "  padding={}, max_side={}, box_score={}, box_thresh={}, unclip={}",
+            padding, max_side, box_score, box_thresh, unclip
+        );
+
+        let detection = ocr_lite.detect(
+            &img,
+            padding,
+            max_side,
+            box_score,
+            box_thresh,
+            unclip,
+            do_angle,
+            most_angle,
+        );
+
+        // A failing scenario is logged and skipped; the remaining ones still run.
+        let ocr_result = match detection {
+            Ok(ocr_result) => ocr_result,
+            Err(e) => {
+                eprintln!("  FAILED: {:?}", e);
+                continue;
+            }
+        };
+
+        let block_count = ocr_result.text_blocks.len();
+        eprintln!("  Result: {} text blocks", block_count);
+        for (i, block) in ocr_result.text_blocks.iter().enumerate() {
+            eprintln!(
+                "    Block {}: text='{}', text_score={:.3}, box_score={:.3}",
+                i, block.text, block.text_score, block.box_score
+            );
+        }
+        any_detected |= block_count > 0;
+    }
+
+    eprintln!("\n=== Diagnosis ===");
+    if any_detected {
+        eprintln!("RESULT: Detection works. Issue may be threshold-related or image-specific.");
+    } else {
+        eprintln!("RESULT: Detection model produces NO output regardless of thresholds.");
+        eprintln!("This strongly suggests an ORT version compatibility issue.");
+        eprintln!("  ort crate version: check Cargo.lock for current version");
+        eprintln!("  ORT_DYLIB_PATH: {:?}", std::env::var("ORT_DYLIB_PATH"));
+    }
+}
+
+/// Loads the detection model directly through `ort` and runs one inference on a
+/// constant 1x3x32x32 tensor, to check whether ORT works at all.
+///
+/// Logs the model's input/output names and basic statistics of the first output.
+/// Skipped (early return) when the detection model file is missing.
+#[test]
+fn diagnostic_raw_ort_inference() {
+    let det_model = get_model_dir().join("det/model.onnx");
+
+    if !det_model.exists() {
+        eprintln!("SKIP: Detection model not found at {:?}", det_model);
+        return;
+    }
+
+    discover_ort();
+
+    eprintln!("=== Raw ORT Inference Test ===");
+
+    // Load model directly via ort
+    use ort::session::Session;
+
+    let mut session = Session::builder().unwrap().commit_from_file(&det_model).unwrap();
+
+    eprintln!("Model loaded successfully");
+    eprintln!("Inputs:");
+    for input in session.inputs() {
+        eprintln!("  name='{}'", input.name());
+    }
+    eprintln!("Outputs:");
+    for output in session.outputs() {
+        eprintln!("  name='{}'", output.name());
+    }
+
+    // Small constant test tensor (NCHW format: batch=1, channels=3, h=32, w=32).
+    let pixels: Vec<f32> = vec![0.5; 3 * 32 * 32];
+    let array = ndarray::Array::from_shape_vec((1, 3, 32, 32), pixels).unwrap();
+    let tensor = ort::value::Tensor::from_array(array).unwrap();
+
+    let input_name = session.inputs()[0].name().to_string();
+    eprintln!("\nRunning inference with 32x32 gray image...");
+
+    let outputs = session.run(ort::inputs![input_name => tensor]).unwrap();
+
+    // Only the first output is inspected.
+    let (output_name, output_value) = outputs.iter().next().unwrap();
+    eprintln!("Output name: {}", output_name);
+
+    let (output_shape, output_data) = output_value.try_extract_tensor::<f32>().unwrap();
+
+    eprintln!("Output shape: {:?}", output_shape);
+    eprintln!("Output len: {}", output_data.len());
+
+    // Nothing to summarize for an empty output.
+    if output_data.is_empty() {
+        return;
+    }
+
+    let min = output_data.iter().copied().fold(f32::INFINITY, f32::min);
+    let max = output_data.iter().copied().fold(f32::NEG_INFINITY, f32::max);
+    let total: f32 = output_data.iter().sum();
+    let mean = total / output_data.len() as f32;
+    // Counts values above the 0.001 threshold (negative values are not counted).
+    let non_zero = output_data.iter().filter(|&&v| v > 0.001).count();
+
+    eprintln!("Output stats: min={:.6}, max={:.6}, mean={:.6}", min, max, mean);
+    eprintln!("Non-zero values (>0.001): {} / {}", non_zero, output_data.len());
+
+    if max >= 0.001 {
+        eprintln!("\nDIAGNOSIS: Model produces non-zero output. ORT is working.");
+    } else {
+        eprintln!("\nDIAGNOSIS: Model outputs are essentially all zeros.");
+        eprintln!("This confirms an ORT compatibility issue - model isn't executing correctly.");
+    }
+}
+
+/// Diagnostic: test the CRNN recognition model directly.
+///
+/// Loads `rec/model.onnx` through `ort`, dumps its input/output names and selected
+/// metadata keys (in particular the `character` dictionary), then runs two
+/// inferences on 1x3x48x200 tensors: a striped pattern and a uniform input. For the
+/// first run it logs overall statistics and how many time steps have argmax 0
+/// (treated here as the blank class); for the second it logs only min/max.
+///
+/// This test makes no assertions; it panics only if an `ort` call fails. It is
+/// skipped (early return) when the recognition model file is missing.
+#[test]
+fn diagnostic_crnn_model_output() {
+    let model_dir = get_model_dir();
+    let rec_model = model_dir.join("rec/model.onnx");
+
+    if !rec_model.exists() {
+        eprintln!("SKIP: Recognition model not found");
+        return;
+    }
+
+    discover_ort();
+
+    eprintln!("=== CRNN Recognition Model Diagnostic ===");
+
+    use ort::session::Session;
+
+    let mut session = Session::builder().unwrap().commit_from_file(&rec_model).unwrap();
+
+    eprintln!("Model loaded successfully");
+    eprintln!("Inputs:");
+    for input in session.inputs() {
+        eprintln!("  name='{}'", input.name());
+    }
+    eprintln!("Outputs:");
+    for output in session.outputs() {
+        eprintln!("  name='{}'", output.name());
+    }
+
+    // Check metadata for character list
+    {
+        let metadata = session.metadata().unwrap();
+
+        // Check all metadata custom keys
+        eprintln!("Model metadata:");
+        eprintln!("  description: {:?}", metadata.description());
+        eprintln!("  producer: {:?}", metadata.producer());
+
+        // Try to get the character key
+        match metadata.custom("character") {
+            Some(chars) => {
+                let bytes = chars.as_bytes();
+                let char_count = chars.split('\n').count();
+                eprintln!(
+                    "  custom('character'): len={}, bytes={}, split_count={}",
+                    chars.len(),
+                    bytes.len(),
+                    char_count
+                );
+                if chars.len() < 500 {
+                    eprintln!("  value: {:?}", chars);
+                } else {
+                    let preview: String = chars.chars().take(100).collect();
+                    eprintln!("  preview (first 100 chars): {:?}", preview);
+                }
+
+                // Check for null bytes or other encoding issues
+                let null_count = bytes.iter().filter(|&&b| b == 0).count();
+                if null_count > 0 {
+                    eprintln!("  WARNING: {} null bytes found in character string!", null_count);
+                }
+            }
+            None => {
+                eprintln!("  ERROR: No 'character' key in model metadata!");
+            }
+        }
+
+        // Try other possible metadata keys
+        for key in [
+            "character",
+            "characters",
+            "dict",
+            "dictionary",
+            "labels",
+            "vocab",
+            "alphabet",
+        ] {
+            if let Some(val) = metadata.custom(key) {
+                // Truncate by characters, not bytes: slicing the string at a fixed
+                // byte offset panics when that offset falls inside a multi-byte
+                // UTF-8 character, which is likely for non-ASCII dictionaries.
+                let preview: String = val.chars().take(80).collect();
+                eprintln!(
+                    "  custom('{}'): len={}, preview={:?}",
+                    key,
+                    val.len(),
+                    preview
+                );
+            }
+        }
+    } // metadata dropped here
+
+    // Test 1: Run inference with a simple input (height=48, width=200)
+    // CRNN expects NCHW: [1, 3, 48, width]
+    let h = 48usize;
+    let w = 200usize;
+
+    // Create a pattern that looks like text (alternating black/white vertical stripes)
+    let mut input_data: Vec<f32> = vec![0.0; 3 * h * w];
+    for c in 0..3 {
+        for y in 10..38 {
+            for x in (20..180).step_by(2) {
+                input_data[c * h * w + y * w + x] = -1.0; // normalized black
+            }
+        }
+    }
+
+    let tensor =
+        ort::value::Tensor::from_array(ndarray::Array::from_shape_vec((1, 3, h, w), input_data).unwrap()).unwrap();
+
+    let input_name = session.inputs()[0].name().to_string();
+    eprintln!("\nRunning CRNN with striped pattern (48x200)...");
+
+    let outputs = session.run(ort::inputs![input_name => tensor]).unwrap();
+
+    let (_, output_value) = outputs.iter().next().unwrap();
+    let (shape, data) = output_value.try_extract_tensor::<f32>().unwrap();
+
+    eprintln!("Output shape: {:?}", shape);
+    eprintln!("Output total values: {}", data.len());
+
+    if shape.len() >= 3 {
+        // Interprets the output as [batch, time_steps, vocab_size].
+        let time_steps = shape[1] as usize;
+        let vocab_size = shape[2] as usize;
+        eprintln!("Time steps: {}, Vocabulary size: {}", time_steps, vocab_size);
+
+        // Check if outputs are meaningful
+        let data_vec: Vec<f32> = data.to_vec();
+        let min = data_vec.iter().cloned().fold(f32::INFINITY, f32::min);
+        let max = data_vec.iter().cloned().fold(f32::NEG_INFINITY, f32::max);
+        let mean: f32 = data_vec.iter().sum::<f32>() / data_vec.len() as f32;
+        eprintln!("Overall stats: min={:.6}, max={:.6}, mean={:.6}", min, max, mean);
+
+        // Check argmax distribution
+        let mut argmax_zero_count = 0;
+        let mut argmax_nonzero_count = 0;
+        for t in 0..time_steps {
+            let start = t * vocab_size;
+            let end = start + vocab_size;
+            let slice = &data_vec[start..end.min(data_vec.len())];
+
+            let (max_idx, max_val) =
+                slice.iter().enumerate().fold(
+                    (0, f32::MIN),
+                    |(mi, mv), (i, &v)| if v > mv { (i, v) } else { (mi, mv) },
+                );
+
+            if max_idx == 0 {
+                argmax_zero_count += 1;
+            } else {
+                argmax_nonzero_count += 1;
+            }
+
+            // Log only the first five and last two steps. `time_steps - 3` cannot
+            // underflow: it is evaluated only when `t >= 5`, so `time_steps >= 6`.
+            if t < 5 || (t > time_steps - 3) {
+                eprintln!("  Step {}: argmax={}, max_val={:.4}", t, max_idx, max_val);
+            } else if t == 5 {
+                eprintln!("  ... (skipping middle steps)");
+            }
+        }
+
+        eprintln!(
+            "\nArgmax distribution: {} blank (idx=0), {} non-blank",
+            argmax_zero_count, argmax_nonzero_count
+        );
+
+        if argmax_nonzero_count == 0 {
+            eprintln!("\nDIAGNOSIS: CRNN model outputs all blanks.");
+            eprintln!("Possible causes:");
+            eprintln!("  1. ORT version incompatibility with CRNN model");
+            eprintln!("  2. Model is not executing graph correctly");
+            eprintln!("  3. Input normalization mismatch");
+        } else {
+            eprintln!("\nDIAGNOSIS: CRNN model produces non-blank output. Recognition works.");
+        }
+    }
+
+    // Drop outputs before reusing session
+    drop(outputs);
+
+    // Test 2: Run with a uniform white image (should produce all blanks - valid baseline)
+    let white_data: Vec<f32> = vec![1.0; 3 * h * w];
+    let white_tensor =
+        ort::value::Tensor::from_array(ndarray::Array::from_shape_vec((1, 3, h, w), white_data).unwrap()).unwrap();
+
+    let input_name2 = session.inputs()[0].name().to_string();
+    eprintln!("\nRunning CRNN with uniform white (48x200)...");
+    let white_outputs = session.run(ort::inputs![input_name2 => white_tensor]).unwrap();
+    let (_, white_val) = white_outputs.iter().next().unwrap();
+    let (_, white_data_out) = white_val.try_extract_tensor::<f32>().unwrap();
+    let white_vec: Vec<f32> = white_data_out.to_vec();
+    let white_max = white_vec.iter().cloned().fold(f32::NEG_INFINITY, f32::max);
+    let white_min = white_vec.iter().cloned().fold(f32::INFINITY, f32::min);
+    eprintln!("White image output: min={:.6}, max={:.6}", white_min, white_max);
+}
+
+/// Makes sure `ORT_DYLIB_PATH` points at an ONNX Runtime library if one can be found.
+///
+/// If the variable is already set to an existing path it is left alone. Otherwise the
+/// first existing entry from a fixed list of Homebrew-style `.dylib` locations is
+/// exported. When nothing is found a warning is logged and the environment is untouched.
+fn discover_ort() {
+    match std::env::var("ORT_DYLIB_PATH") {
+        Ok(path) if std::path::Path::new(&path).exists() => {
+            eprintln!("ORT found via ORT_DYLIB_PATH: {}", path);
+            return;
+        }
+        // Unset, not valid Unicode, or pointing at a missing file: fall back to probing.
+        _ => {}
+    }
+
+    const CANDIDATES: [&str; 2] = [
+        "/opt/homebrew/lib/libonnxruntime.dylib",
+        "/usr/local/lib/libonnxruntime.dylib",
+    ];
+
+    let found = CANDIDATES
+        .iter()
+        .copied()
+        .find(|candidate| std::path::Path::new(candidate).exists());
+
+    match found {
+        Some(candidate) => {
+            eprintln!("Setting ORT_DYLIB_PATH={}", candidate);
+            // NOTE(review): `set_var` is only sound while no other thread reads or
+            // writes the environment; the test harness runs tests on several threads
+            // by default, so confirm these tests are run single-threaded.
+            unsafe { std::env::set_var("ORT_DYLIB_PATH", candidate) };
+        }
+        None => eprintln!("WARNING: Could not find ORT library!"),
+    }
+}
--- a/crates/kreuzberg-php/Cargo.toml
+++ b/crates/kreuzberg-php/Cargo.toml
@@ -0,0 +1,27 @@
+[package]
+name = "kreuzberg-php"
+version = "5.0.0-rc.3"
+edition = "2024"
+license = "Elastic-2.0"
+description = "High-performance document intelligence library"
+readme = false
+keywords = ["document", "extraction", "ocr", "pdf", "text"]
+categories = ["text-processing"]
+
+# `ahash` and `futures-util` are conditionally included but not directly used in PHP code.
+[package.metadata.cargo-machete]
+ignored = ["tokio", "ahash", "async-trait"]
+
+[lib]
+crate-type = ["cdylib"]
+
+[features]
+extension-module = []
+
+[dependencies]
+async-trait = "0.1"
+ext-php-rs = "0.15"
+kreuzberg = { version = "5.0.0-rc.3", path = "../kreuzberg", features = ["full", "pdf", "ocr", "paddle-ocr", "paddle-ocr-types", "layout-detection", "layout-types", "embeddings", "embedding-presets", "chunking", "keywords-yake", "keywords-rake", "language-detection", "html", "tree-sitter", "office", "email", "archives", "stopwords", "auto-rotate", "auto-rotate-types", "tokio-runtime", "api", "mcp", "liter-llm", "quality"] }
+serde = { version = "1", features = ["derive"] }
+serde_json = "1"
+tokio = { version = "1", features = ["full"] }
--- a/crates/kreuzberg-php/src/LICENSE
+++ b/crates/kreuzberg-php/src/LICENSE
@@ -0,0 +1,93 @@
+Elastic License 2.0 (ELv2)
+
+Copyright 2025-2026 Kreuzberg, Inc.
+
+Acceptance
+
+By using the software, you agree to all of the terms and conditions below.
+
+Copyright License
+
+The licensor grants you a non-exclusive, royalty-free, worldwide,
+non-sublicensable, non-transferable license to use, copy, distribute, make
+available, and prepare derivative works of the software, in each case subject to
+the limitations and conditions below.
+
+Limitations
+
+You may not provide the software to third parties as a hosted or managed
+service, where the service provides users with access to any substantial set of
+the features or functionality of the software.
+
+You may not move, change, disable, or circumvent the license key functionality
+in the software, and you may not remove or obscure any functionality in the
+software that is protected by the license key.
+
+You may not alter, remove, or obscure any licensing, copyright, or other notices
+of the licensor in the software. Any use of the licensor's trademarks is subject
+to applicable law.
+
+Patents
+
+The licensor grants you a license, under any patent claims the licensor can
+license, or becomes able to license, to make, have made, use, sell, offer for
+sale, import and have imported the software, in each case subject to the
+limitations and conditions in this license. This license does not cover any
+patent claims that you cause to be infringed by modifications or additions to the
+software. If you or your company make any written claim that the software
+infringes or contributes to infringement of any patent, your patent license for
+the software granted under these terms ends immediately. If your company makes
+such a claim, your patent license ends immediately for work on behalf of your
+company.
+
+Notices
+
+You must ensure that anyone who gets a copy of any part of the software from you
+also gets a copy of these terms.
+
+If you modify the software, you must include in any modified copies of the
+software prominent notices stating that you have modified the software.
+
+No Other Rights
+
+These terms do not imply any licenses other than those expressly granted in
+these terms.
+
+Termination
+
+If you use the software in violation of these terms, such use is not licensed,
+and your licenses will automatically terminate. If the licensor provides you with
+a notice of your violation, and you cease all violation of this license no later
+than 30 days after you receive that notice, your licenses will be reinstated
+retroactively. However, if you violate these terms after such reinstatement, any
+additional violation of these terms will cause your licenses to terminate
+automatically and permanently.
+
+No Liability
+
+As far as the law allows, the software comes as is, without any warranty or
+condition, and the licensor will not be liable to you for any damages arising out
+of these terms or the use or nature of the software, under any kind of legal
+claim.
+
+Definitions
+
+The licensor is the entity offering these terms, and the software is the
+software the licensor makes available under these terms, including any portion
+of it.
+
+you refers to the individual or entity agreeing to these terms.
+
+your company is any legal entity, sole proprietorship, or other kind of
+organization that you work for, plus all organizations that have control over,
+are under the control of, or are under common control with that organization.
+control means ownership of substantially all the assets of an entity, or the
+power to direct its management and policies by vote, contract, or otherwise.
+Control can be direct or indirect.
+
+your licenses are all the licenses granted to you for the software under these
+terms.
+
+use means anything you do with the software requiring one of your licenses.
+
+trademark means trademarks, service marks, and similar rights.
--- a/crates/kreuzberg-php/src/composer.json
+++ b/crates/kreuzberg-php/src/composer.json
@@ -0,0 +1,34 @@
+{
+  "name": "kreuzberg-dev/kreuzberg",
+  "description": "High-performance document intelligence library",
+  "license": "Elastic-2.0",
+  "type": "php-ext",
+  "require": {
+    "php": ">=8.2"
+  },
+  "require-dev": {
+    "phpstan/phpstan": "^2.1",
+    "friendsofphp/php-cs-fixer": "^3.95",
+    "phpunit/phpunit": "^13.1"
+  },
+  "autoload": {
+    "psr-4": {
+      "Kreuzberg\\": "src/"
+    }
+  },
+  "scripts": {
+    "phpstan": "php -d detect_unicode=0 vendor/bin/phpstan --configuration=phpstan.neon --memory-limit=512M",
+    "format": "php vendor/bin/php-cs-fixer fix --quiet",
+    "format:check": "php vendor/bin/php-cs-fixer fix --dry-run --quiet",
+    "test": "php vendor/bin/phpunit",
+    "lint": "@phpstan",
+    "lint:fix": "php vendor/bin/php-cs-fixer fix --quiet && php -d detect_unicode=0 vendor/bin/phpstan --configuration=phpstan.neon --memory-limit=512M"
+  },
+  "php-ext": {
+    "extension-name": "kreuzberg",
+    "support-zts": true,
+    "support-nts": true,
+    "download-url-method": ["pre-packaged-binary", "composer-default"]
+  },
+  "keywords": ["document", "extraction", "ocr", "pdf", "text"]
+}
--- a/crates/kreuzberg-php/src/composer.lock
+++ b/crates/kreuzberg-php/src/composer.lock
--- a/crates/kreuzberg-php/src/lib.rs
+++ b/crates/kreuzberg-php/src/lib.rs
--- a/crates/kreuzberg-php/src/phpstan-baseline.neon
+++ b/crates/kreuzberg-php/src/phpstan-baseline.neon
@@ -0,0 +1,2 @@
+parameters:
+	ignoreErrors: []
--- a/crates/kreuzberg-php/src/phpstan.neon
+++ b/crates/kreuzberg-php/src/phpstan.neon
@@ -0,0 +1,12 @@
+includes:
+    - phpstan-baseline.neon
+
+parameters:
+    level: max
+    paths:
+        - src
+    scanFiles:
+        - stubs/kreuzberg_extension.php
+    treatPhpDocTypesAsCertain: false
+    reportUnmatchedIgnoredErrors: false
+    tmpDir: var/cache/phpstan
--- a/crates/kreuzberg-py/Cargo.toml
+++ b/crates/kreuzberg-py/Cargo.toml
@@ -0,0 +1,34 @@
+[package]
+name = "kreuzberg-py"
+version = "5.0.0-rc.3"
+edition = "2024"
+license = "Elastic-2.0"
+description = "High-performance document intelligence library"
+readme = false
+keywords = ["document", "extraction", "ocr", "pdf", "text"]
+categories = ["text-processing"]
+
+# `pyo3-async-runtimes` and `serde_json` are emitted unconditionally above so
+# the manifest is stable across regens, but for umbrella crates with no
+# async fns or no JSON-marshalled return types they are genuinely unused.
+# The conditional `async-trait` / `tokio` / `futures` deps are similarly
+# flagged when the umbrella has trait-bridge / streaming adapters configured
+# but no actual async-trait / async callsite in the generated PyO3 shim.
+[package.metadata.cargo-machete]
+ignored = ["pyo3-async-runtimes", "serde_json", "async-trait", "tokio"]
+
+[lib]
+name = "_kreuzberg"
+crate-type = ["cdylib"]
+
+[features]
+extension-module = ["pyo3/extension-module", "pyo3/abi3-py310"]
+
+[dependencies]
+async-trait = "0.1"
+kreuzberg = { version = "5.0.0-rc.3", path = "../kreuzberg", features = ["full", "pdf", "ocr", "paddle-ocr", "paddle-ocr-types", "layout-detection", "layout-types", "embeddings", "embedding-presets", "chunking", "keywords-yake", "keywords-rake", "language-detection", "html", "tree-sitter", "office", "email", "archives", "stopwords", "auto-rotate", "auto-rotate-types", "tokio-runtime", "api", "mcp", "liter-llm", "quality"] }
+pyo3 = { version = "0.28" }
+pyo3-async-runtimes = { version = "0.28", features = ["tokio-runtime"] }
+serde = { version = "1", features = ["derive"] }
+serde_json = "1"
+tokio = { version = "1", features = ["rt-multi-thread"] }
--- a/crates/kreuzberg-py/src/LICENSE
+++ b/crates/kreuzberg-py/src/LICENSE
@@ -0,0 +1,93 @@
+Elastic License 2.0 (ELv2)
+
+Copyright 2025-2026 Kreuzberg, Inc.
+
+Acceptance
+
+By using the software, you agree to all of the terms and conditions below.
+
+Copyright License
+
+The licensor grants you a non-exclusive, royalty-free, worldwide,
+non-sublicensable, non-transferable license to use, copy, distribute, make
+available, and prepare derivative works of the software, in each case subject to
+the limitations and conditions below.
+
+Limitations
+
+You may not provide the software to third parties as a hosted or managed
+service, where the service provides users with access to any substantial set of
+the features or functionality of the software.
+
+You may not move, change, disable, or circumvent the license key functionality
+in the software, and you may not remove or obscure any functionality in the
+software that is protected by the license key.
+
+You may not alter, remove, or obscure any licensing, copyright, or other notices
+of the licensor in the software. Any use of the licensor's trademarks is subject
+to applicable law.
+
+Patents
+
+The licensor grants you a license, under any patent claims the licensor can
+license, or becomes able to license, to make, have made, use, sell, offer for
+sale, import and have imported the software, in each case subject to the
+limitations and conditions in this license. This license does not cover any
+patent claims that you cause to be infringed by modifications or additions to the
+software. If you or your company make any written claim that the software
+infringes or contributes to infringement of any patent, your patent license for
+the software granted under these terms ends immediately. If your company makes
+such a claim, your patent license ends immediately for work on behalf of your
+company.
+
+Notices
+
+You must ensure that anyone who gets a copy of any part of the software from you
+also gets a copy of these terms.
+
+If you modify the software, you must include in any modified copies of the
+software prominent notices stating that you have modified the software.
+
+No Other Rights
+
+These terms do not imply any licenses other than those expressly granted in
+these terms.
+
+Termination
+
+If you use the software in violation of these terms, such use is not licensed,
+and your licenses will automatically terminate. If the licensor provides you with
+a notice of your violation, and you cease all violation of this license no later
+than 30 days after you receive that notice, your licenses will be reinstated
+retroactively. However, if you violate these terms after such reinstatement, any
+additional violation of these terms will cause your licenses to terminate
+automatically and permanently.
+
+No Liability
+
+As far as the law allows, the software comes as is, without any warranty or
+condition, and the licensor will not be liable to you for any damages arising out
+of these terms or the use or nature of the software, under any kind of legal
+claim.
+
+Definitions
+
+The licensor is the entity offering these terms, and the software is the
+software the licensor makes available under these terms, including any portion
+of it.
+
+you refers to the individual or entity agreeing to these terms.
+
+your company is any legal entity, sole proprietorship, or other kind of
+organization that you work for, plus all organizations that have control over,
+are under the control of, or are under common control with that organization.
+control means ownership of substantially all the assets of an entity, or the
+power to direct its management and policies by vote, contract, or otherwise.
+Control can be direct or indirect.
+
+your licenses are all the licenses granted to you for the software under these
+terms.
+
+use means anything you do with the software requiring one of your licenses.
+
+trademark means trademarks, service marks, and similar rights.
--- a/crates/kreuzberg-py/src/kreuzberg/py.typed
+++ b/crates/kreuzberg-py/src/kreuzberg/py.typed
--- a/crates/kreuzberg-py/src/lib.rs
+++ b/crates/kreuzberg-py/src/lib.rs
--- a/crates/kreuzberg-py/src/pyproject.toml
+++ b/crates/kreuzberg-py/src/pyproject.toml
@@ -0,0 +1,105 @@
+[build-system]
+build-backend = "maturin"
+requires = [ "maturin>=1,<2" ]
+
+[project]
+name = "kreuzberg"
+version = "5.0.0rc3"
+description = "High-performance document intelligence library"
+keywords = [ "document", "extraction", "ocr", "pdf", "text" ]
+license = "Elastic-2.0"
+license-files = [ "LICENSE" ]
+authors = [ { name = "Na'aman Hirschfeld <naaman@kreuzberg.dev>" } ]
+requires-python = ">=3.10"
+classifiers = [
+  "Programming Language :: Python :: 3 :: Only",
+  "Programming Language :: Python :: 3.10",
+  "Programming Language :: Python :: 3.11",
+  "Programming Language :: Python :: 3.12",
+  "Programming Language :: Python :: 3.13",
+  "Programming Language :: Python :: 3.14",
+]
+urls.repository = "https://github.com/kreuzberg-dev/kreuzberg"
+homepage = "https://kreuzberg.dev"
+
+[dependency-groups]
+dev = [ "mypy>=1.19", "ruff>=0.14.8" ]
+
+[tool.maturin]
+module-name = "kreuzberg._kreuzberg"
+manifest-path = "../../crates/kreuzberg-py/Cargo.toml"
+# abi3-py310 produces a single wheel per platform that loads on Python 3.10+,
+# avoiding a per-Python-version build matrix.
+features = [ "pyo3/extension-module", "pyo3/abi3-py310" ]
+python-packages = [ "kreuzberg" ]
+# Bundle the core Rust crate so `pip install` can build from sdist on
+# platforms without a precompiled wheel (e.g. Alpine/musl). Without this
+# the workspace [patch.crates-io] (when present) points at a path that is
+# missing from the tarball and the source build fails.
+include = [
+  { path = "../../crates/kreuzberg/**/*", format = "sdist" },
+]
+
+[tool.ruff]
+target-version = "py310"
+line-length = 120
+format.docstring-code-line-length = 120
+format.docstring-code-format = true
+lint.select = [ "ALL" ]
+lint.ignore = [
+  "ANN401",
+  "ASYNC109",
+  "ASYNC110",
+  "BLE001",
+  "COM812",
+  "D100",
+  "D104",
+  "D107",
+  "D205",
+  "E501",
+  "EM",
+  "FBT",
+  "FIX",
+  "ISC001",
+  "PD011",
+  "PGH003",
+  "PLR2004",
+  "PLW0603",
+  "S104",
+  "S110",
+  "S603",
+  "TD",
+  "TRY",
+]
+lint.per-file-ignores."kreuzberg/__init__.py" = [ "I001" ]
+# The alef Python codegen still emits cosmetic warnings on the wrapper
+# modules: api.py keeps the legacy `from typing import AsyncIterator` and a
+# single-line import block, options.py carries # noqa: TC001 / F401 markers
+# that turn out unused on every regen, __init__.py star-imports re-sort with
+# a different convention. Silence these specific rules on the wrappers until
+# the codegen is updated to emit ruff-clean output.
+lint.per-file-ignores."kreuzberg/api.py" = [ "F401", "I001", "UP035" ]
+lint.per-file-ignores."kreuzberg/options.py" = [ "F401", "RUF100" ]
+lint.per-file-ignores."tests/**" = [ "ANN", "D103", "PLR2004", "S101" ]
+lint.mccabe.max-complexity = 15
+lint.pydocstyle.convention = "google"
+lint.pylint.max-args = 10
+lint.pylint.max-branches = 15
+lint.pylint.max-returns = 10
+
+[tool.mypy]
+python_version = "3.10"
+strict = true
+show_error_codes = true
+implicit_reexport = false
+namespace_packages = true
+overrides = [
+  # The alef-emitted `api.py` wrapper has a structural mismatch between its
+  # `options.*` dataclass signatures and the `_internal_bindings.*` pyclass
+  # types pyo3 accepts/returns at runtime. pyo3 reconciles them dynamically via
+  # FromPyObject — the Python e2e suite exercises the runtime path — but mypy
+  # sees only the static-type discrepancy. Disable the four error codes the
+  # discrepancy raises until the codegen emits matching `_to_rust_*` calls and
+  # casts the return values.
+  { module = "kreuzberg.api", disable_error_code = [ "call-arg", "arg-type", "return-value", "attr-defined" ] },
+]
--- a/crates/kreuzberg-py/src/uv.lock
+++ b/crates/kreuzberg-py/src/uv.lock
@@ -0,0 +1,316 @@
+version = 1
+revision = 3
+requires-python = ">=3.10"
+resolution-markers = [
+    "python_full_version >= '3.15'",
+    "python_full_version < '3.15'",
+]
+
+[[package]]
+name = "ast-serialize"
+version = "0.5.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/81/9d/09e27731bd5864a9ce04e3244074e674bb8936bf62b45e0357248717adac/ast_serialize-0.5.0.tar.gz", hash = "sha256:5880091bfe6f4f986f22866375c2e884843e7a0b6343ae41aeea659613d879b6", size = 61157, upload-time = "2026-05-17T17:48:29.429Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/c0/9a/13dde51ba9e15f8b97957ab7cb0120d0e381524d651c6bd630b9c359227f/ast_serialize-0.5.0-cp314-cp314t-macosx_10_12_x86_64.whl", hash = "sha256:8f5c14f169eb0972c0c21bada5358b23d6047c76583b005234f865b11f1fa00a", size = 1183520, upload-time = "2026-05-17T17:47:30.831Z" },
+    { url = "https://files.pythonhosted.org/packages/37/de/5a7f0a9fe68944f536632a5af84676739c7d2582be42deb082634bf3a754/ast_serialize-0.5.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:7d1a2de9de5be04652f0ed60738356ef94f66db37924a9499fffe98dc491aa0b", size = 1175779, upload-time = "2026-05-17T17:47:32.551Z" },
+    { url = "https://files.pythonhosted.org/packages/9c/81/0bb853e76e4f6e9a1855d569003c59e19ffac45f7079d91505d1bb212f92/ast_serialize-0.5.0-cp314-cp314t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:be5173fb66f9b49026d9d5a2ff0fc7c7009077107c0eb285b2d60fdf1fe10bd1", size = 1233750, upload-time = "2026-05-17T17:47:34.731Z" },
+    { url = "https://files.pythonhosted.org/packages/e5/d3/4cf705beeccc08754d0bbda99aefff26110e209b9a07ac8a6b60eec48531/ast_serialize-0.5.0-cp314-cp314t-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:f8015cd071ac1339924ee2b8098c93e00e155f30a16f40ec9816fcf84f4753f6", size = 1235942, upload-time = "2026-05-17T17:47:36.287Z" },
+    { url = "https://files.pythonhosted.org/packages/26/c8/ee097e437ea27dd2b8b227865c875492b585650a5802a22d82b304c8201b/ast_serialize-0.5.0-cp314-cp314t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5499e8797edff2a9186aa313ed382c6b422e798e9332d9953badcee6e69a88f2", size = 1442517, upload-time = "2026-05-17T17:47:38.17Z" },
+    { url = "https://files.pythonhosted.org/packages/ff/bd/68063442838f1ba68ec72b5436430bc75b3bb17a1a3c3063f09b0c05ae2b/ast_serialize-0.5.0-cp314-cp314t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:6848f2a093fb5548751a9a09bff8fcd229e2bbeb0e3331f391b6ae6d26cd9903", size = 1254081, upload-time = "2026-05-17T17:47:39.826Z" },
+    { url = "https://files.pythonhosted.org/packages/50/e2/1e520793bc6a4e4524a6ab022391e827825eaa0c3811828bfdc6852eca26/ast_serialize-0.5.0-cp314-cp314t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:832d4c998e0b091fd60a6d6bceee535483c4d490de9ba85003af835225719261", size = 1259910, upload-time = "2026-05-17T17:47:41.369Z" },
+    { url = "https://files.pythonhosted.org/packages/4e/e1/49b60f467979979cfe6913b43948ff25bca971ad0591d181812f163a988e/ast_serialize-0.5.0-cp314-cp314t-manylinux_2_31_riscv64.whl", hash = "sha256:16db7c62ec0b8efe1d7afd283a388d8f74f2605d56032e5a37747d2de8dba027", size = 1250678, upload-time = "2026-05-17T17:47:43.702Z" },
+    { url = "https://files.pythonhosted.org/packages/74/ba/66ab9555de6275677566f6574e5ef6c29cb185ea866f643bc06f8280a8ee/ast_serialize-0.5.0-cp314-cp314t-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:baf5eb061eb5bccade4128ad42da33787d72f6013809cd1b590376ece8b3c937", size = 1301603, upload-time = "2026-05-17T17:47:46.256Z" },
+    { url = "https://files.pythonhosted.org/packages/66/42/6aca9b9abc710014b2be9059689e5dd1679339e78f567ffb4d255a9e2050/ast_serialize-0.5.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:104e4a35bd7c124173c41760ef9aaea17ddb3f86c65cb643671d59afbe3ee94c", size = 1410332, upload-time = "2026-05-17T17:47:47.899Z" },
+    { url = "https://files.pythonhosted.org/packages/47/68/2f76594432a22581ecf878b5e75a9b8601c24b2241cf0bbeb1e21fcf370c/ast_serialize-0.5.0-cp314-cp314t-musllinux_1_2_armv7l.whl", hash = "sha256:36be371028fc1675acb38a331bde160dbab7ff907fdf00b67eb6911aa106951b", size = 1509979, upload-time = "2026-05-17T17:47:50.942Z" },
+    { url = "https://files.pythonhosted.org/packages/40/ac/a93c9b58292653f6c595752f677a08e608f903b710594909e9231a389b3b/ast_serialize-0.5.0-cp314-cp314t-musllinux_1_2_i686.whl", hash = "sha256:061ee58bdb52341c8201a6df41182a977736bae3b7ded87ca7176ca25a8a47ab", size = 1505002, upload-time = "2026-05-17T17:47:54.093Z" },
+    { url = "https://files.pythonhosted.org/packages/14/2e/b278f68c497ee2f1d1576cbbef8db5281cd4a5f2db040537592ac9c8862e/ast_serialize-0.5.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:b15219e9cdc9f53f6f4cb51c009203507228226148c05c5e8fe451c28b435eb3", size = 1456231, upload-time = "2026-05-17T17:47:56.311Z" },
+    { url = "https://files.pythonhosted.org/packages/0b/43/419be1c566a4c504cd8fd60ce2f84e790f295495c0f327cfaeadf3d51012/ast_serialize-0.5.0-cp314-cp314t-win32.whl", hash = "sha256:842d1c004bb466c7df036f95fabef789570541922b10976b12f5592a69cf0b38", size = 1058668, upload-time = "2026-05-17T17:47:58.305Z" },
+    { url = "https://files.pythonhosted.org/packages/03/6f/c9d4d549295ed05111aeb8853232d1afd9d0a179fddb01eeffbb3a4a6842/ast_serialize-0.5.0-cp314-cp314t-win_amd64.whl", hash = "sha256:b0c06d760909b095cc466356dfccd05a1c7233a6ca191c020dca2c6a6f16c24c", size = 1101075, upload-time = "2026-05-17T17:48:00.35Z" },
+    { url = "https://files.pythonhosted.org/packages/d0/8e/d00c5ab30c58222e07d62956fca86c59d91b9ad32997e633c38b526623a3/ast_serialize-0.5.0-cp314-cp314t-win_arm64.whl", hash = "sha256:787baedb0262cc49e8ce37cc15c00ae818e46a165a3b36f5e21ed174998104cb", size = 1075347, upload-time = "2026-05-17T17:48:01.753Z" },
+    { url = "https://files.pythonhosted.org/packages/e0/9e/dc2530acb3a60dc6e46d65abf27d1d9f86721694757906a148d90a6860de/ast_serialize-0.5.0-cp39-abi3-macosx_10_12_x86_64.whl", hash = "sha256:0668aa9459cfa8c9c49ddd2163ebcf43088ba045ef7492af6fe22e0098303101", size = 1191380, upload-time = "2026-05-17T17:48:03.738Z" },
+    { url = "https://files.pythonhosted.org/packages/26/0a/bd3d18a582f273d6c843d16bb9e22e9e16365ff7991e92f18f798e9f1224/ast_serialize-0.5.0-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:bf683d6363edf2b39eed6b6d4fe22d34b6203867a67e27134d9e2a2680c4bc4a", size = 1183879, upload-time = "2026-05-17T17:48:05.463Z" },
+    { url = "https://files.pythonhosted.org/packages/40/ae/1f919100f8620887af58fcc381c61a1f218cdf89c6e155f87b213e61010a/ast_serialize-0.5.0-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9cc22cf0c9be65e71cf88fda130af60d61eb4a79370ad4cfe7900d48a4aa2211", size = 1244529, upload-time = "2026-05-17T17:48:07.008Z" },
+    { url = "https://files.pythonhosted.org/packages/c6/ca/6376559dcce707cdbc1d0d9a13c8d3baaaa501e949ce0ebdc4230cd881aa/ast_serialize-0.5.0-cp39-abi3-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:f66173891548c9f2726bf27957b41cabce12fa679dc6da505ddbde4d4b3b31cf", size = 1240560, upload-time = "2026-05-17T17:48:08.46Z" },
+    { url = "https://files.pythonhosted.org/packages/35/b2/a620e206b5aeb7efbf2710336df57d457cffbb3991076bbcc1147ef9abd4/ast_serialize-0.5.0-cp39-abi3-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e42d729ef2be96a14efbad355093284739e3670ece3e534f82cc8832790911d9", size = 1451172, upload-time = "2026-05-17T17:48:09.922Z" },
+    { url = "https://files.pythonhosted.org/packages/fa/e0/4ad5c04c24a40481b2935ce9a0ccdb6023dc8b667167d06ae530cc3512f2/ast_serialize-0.5.0-cp39-abi3-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b725026bafa801dbd7310eb13a75f0a2e370e7e51b2cb225f9d21fcfadf919ee", size = 1265072, upload-time = "2026-05-17T17:48:11.469Z" },
+    { url = "https://files.pythonhosted.org/packages/b2/71/4d1d479aa56d0101c40e17720c3d6ac2af7269ea0487a80b18e7bfd1a5b7/ast_serialize-0.5.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b54f60c1d78767a53b67eaa663f0dfac3afe606aa07f1301572f588b73d64809", size = 1270488, upload-time = "2026-05-17T17:48:13.575Z" },
+    { url = "https://files.pythonhosted.org/packages/6d/4f/0de1bbe06f6edef9fde4ed12ca8e7b3ec7e6e2bd4e672c5af487f7957665/ast_serialize-0.5.0-cp39-abi3-manylinux_2_31_riscv64.whl", hash = "sha256:27d51654fc240a1e87e742d353d98eb45b75f62f129086b3596ab53df2ac2a43", size = 1260702, upload-time = "2026-05-17T17:48:15.141Z" },
+    { url = "https://files.pythonhosted.org/packages/75/61/e00872439cfdddcc3c1b6cdaa6e5d904ba8e26a18807c67c4e14409d0ca8/ast_serialize-0.5.0-cp39-abi3-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:2782c36237c46dd1674542f2109740ea5ea485a169bf1431939ada0434e17934", size = 1311182, upload-time = "2026-05-17T17:48:16.779Z" },
+    { url = "https://files.pythonhosted.org/packages/76/8e/699a5b955f7926956c95e9e1d74132acad73c2fe7a426f94da89123c20aa/ast_serialize-0.5.0-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:1943db345233cc7194a470f13afa9c59772c0b123dea0c9414c4d4ca54369759", size = 1421410, upload-time = "2026-05-17T17:48:18.527Z" },
+    { url = "https://files.pythonhosted.org/packages/a9/ae/d5b7626874478997adc7a29ab28accf21e596fb590c944290401dfd0b29e/ast_serialize-0.5.0-cp39-abi3-musllinux_1_2_armv7l.whl", hash = "sha256:df1c00022cbbcb064bfaa505aa9c9295362443ce5dacb459d1331d3da353f887", size = 1516587, upload-time = "2026-05-17T17:48:20.133Z" },
+    { url = "https://files.pythonhosted.org/packages/0c/ce/b59e02a82d9c4244d64cde502e0b00e83e38816abe19155ceb5437402c7f/ast_serialize-0.5.0-cp39-abi3-musllinux_1_2_i686.whl", hash = "sha256:cae65289fc456fde04af979a2be09302ef5d8ab92ef23e596d6746dc267ada27", size = 1515171, upload-time = "2026-05-17T17:48:21.921Z" },
+    { url = "https://files.pythonhosted.org/packages/8b/38/d8d90042747d05aa08d4efcf1c99035a5f670a6bf4c214d31644392afbca/ast_serialize-0.5.0-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:239a4c354e8d676e9d94631d1d4a64edc6b266f86ff3a5a80aedd344f342c01d", size = 1464668, upload-time = "2026-05-17T17:48:23.544Z" },
+    { url = "https://files.pythonhosted.org/packages/dd/51/5b840c4df7334104cecffa28f23904fe81ca89ca223d2450e288de39fd3c/ast_serialize-0.5.0-cp39-abi3-win32.whl", hash = "sha256:143a4ef63285a075871908fda3672dc21864b83a8ec3ee12304aa3e4c5387b9a", size = 1068311, upload-time = "2026-05-17T17:48:25.027Z" },
+    { url = "https://files.pythonhosted.org/packages/41/11/ca5672c7d491825bc4cd6702dea106a6b60d928707712ec257c7833ae476/ast_serialize-0.5.0-cp39-abi3-win_amd64.whl", hash = "sha256:cf25572c526add400f26a4750dc6ce0c3bb93fc1f75e7ae0cad4ce4f2cd5c590", size = 1108931, upload-time = "2026-05-17T17:48:26.591Z" },
+    { url = "https://files.pythonhosted.org/packages/45/19/cc8bd127d28a43da249aa955cfd164cf8fd534e79e42cea96c4854d72fd0/ast_serialize-0.5.0-cp39-abi3-win_arm64.whl", hash = "sha256:92a31c9c20d25a076edaeec76b128a3535d74a24f340b9a8a7e96c9b86dc9642", size = 1081181, upload-time = "2026-05-17T17:48:28.122Z" },
+]
+
+[[package]]
+name = "kreuzberg"
+version = "5.0.0rc1"
+source = { editable = "." }
+
+[package.dev-dependencies]
+dev = [
+    { name = "mypy" },
+    { name = "ruff" },
+]
+
+[package.metadata]
+
+[package.metadata.requires-dev]
+dev = [
+    { name = "mypy", specifier = ">=1.19" },
+    { name = "ruff", specifier = ">=0.14.8" },
+]
+
+[[package]]
+name = "librt"
+version = "0.11.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/40/08/9e7f6b5d2b5bed6ad055cdd5925f192bb403a51280f86b56554d9d0699a2/librt-0.11.0.tar.gz", hash = "sha256:075dc3ef4458a278e0195cbf6ac9d38808d9b906c5a6c7f7f79c3888276a3fb1", size = 200139, upload-time = "2026-05-10T18:17:25.138Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/83/10/37fd9e9ba96cb0bd742dfb20fc3d082e54bdbec759d7300df927f360ef07/librt-0.11.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:6e94ebfcfa2d5e9926d6c3b9aa4617ffc42a845b4321fb84021b872358c82a0f", size = 141706, upload-time = "2026-05-10T18:15:16.129Z" },
+    { url = "https://files.pythonhosted.org/packages/cf/72/1b1466f358e4a0b728051f69bc27e67b432c6eaa2e05b88db49d3785ae0d/librt-0.11.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:ae627397a2f351560440d872d6f7c8dbb4072e57868e7b2fc5b8b430fe489d45", size = 142605, upload-time = "2026-05-10T18:15:18.148Z" },
+    { url = "https://files.pythonhosted.org/packages/ca/85/ed26dd2f6bc9a0baf48306433e579e8d354d70b2bcb78134ed950a5d0e1e/librt-0.11.0-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:dc329359321b67d24efdf4bc69012b0597001649544db662c001db5a0184794c", size = 476555, upload-time = "2026-05-10T18:15:19.569Z" },
+    { url = "https://files.pythonhosted.org/packages/66/fe/11891191c0e0a3fd617724e891f6e67a71a7658974a892b9a9a97fdb2977/librt-0.11.0-cp310-cp310-manylinux2014_i686.manylinux_2_17_i686.manylinux_2_28_i686.whl", hash = "sha256:7e82e642ab0f7608ce2fe53d76ca2280a9ee33a1b06556142c7c6fe80a86fc33", size = 468434, upload-time = "2026-05-10T18:15:20.87Z" },
+    { url = "https://files.pythonhosted.org/packages/6f/50/5ec949d7f9ce1a07af903aa3e13abb98b717923bdead6e719b2f824ccc07/librt-0.11.0-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:88145c15c67731d54283d135b03244028c750cc9edc334a96a4f5950ebdb2884", size = 496918, upload-time = "2026-05-10T18:15:22.616Z" },
+    { url = "https://files.pythonhosted.org/packages/ea/c4/177336c7524e34875a38bf668e88b193a6723a4eb4045d07f74df6e1506c/librt-0.11.0-cp310-cp310-manylinux_2_34_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:9d36a51b3d93320b686588e27123f4995804dbf1bce81df78c02fc3c6eea9280", size = 490334, upload-time = "2026-05-10T18:15:24.2Z" },
+    { url = "https://files.pythonhosted.org/packages/13/1f/da3112f7569eda3b49f9a2629bae1fe059812b6085df16c885f6454dff49/librt-0.11.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:d00f3ac06a2a8b246327f11e186a53a100a4d5c7ed52346367e5ec751d51586c", size = 511287, upload-time = "2026-05-10T18:15:26.226Z" },
+    { url = "https://files.pythonhosted.org/packages/fa/94/03fec301522e172d105581431223be56b27594ff46440ebfbb658a3735d5/librt-0.11.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:461bbceede621f1ffb8839755f8663e886087ee7af16294cab7fb4d782c62eeb", size = 517202, upload-time = "2026-05-10T18:15:27.965Z" },
+    { url = "https://files.pythonhosted.org/packages/b7/6e/339f6e5a7b413ce014f1917a756dae630fe59cc99f34153205b1cb540901/librt-0.11.0-cp310-cp310-musllinux_1_2_riscv64.whl", hash = "sha256:0cad8a4d6a8ff03c9b76f9414caccd78e7cfbc8a2e12fa334d8e1d9932753783", size = 497517, upload-time = "2026-05-10T18:15:29.614Z" },
+    { url = "https://files.pythonhosted.org/packages/cd/43/acdd5ce317cb46e8253ca9bfbdb8b12e68a24d745949336a7f3d5fb79ba0/librt-0.11.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:f37aa505b3cf60701562eddb32df74b12a9e380c207fd8b06dd157a943ac7ea0", size = 538878, upload-time = "2026-05-10T18:15:30.928Z" },
+    { url = "https://files.pythonhosted.org/packages/29/b5/7a25bb12e3172839f647f196b3e988318b7bb1ca7501732a225c4dce2ec0/librt-0.11.0-cp310-cp310-win32.whl", hash = "sha256:94663a21534637f0e787ec2a2a756022df6e5b7b2335a5cdd7d8e33d68a2af89", size = 100070, upload-time = "2026-05-10T18:15:32.551Z" },
+    { url = "https://files.pythonhosted.org/packages/c6/0d/ebbcf4d77999c02c937b05d2b90ff4cd4dcc7e9a365ba132329ac1fe7a0f/librt-0.11.0-cp310-cp310-win_amd64.whl", hash = "sha256:dec7db73758c2b54953fd8b7fe348c45188fe26b39ee18446196edd08453a5d4", size = 117918, upload-time = "2026-05-10T18:15:33.678Z" },
+    { url = "https://files.pythonhosted.org/packages/fe/87/2bf31fe17587b29e3f93ec31421e2b1e1c3e349b8bf6c7c313dbad1d5340/librt-0.11.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:93d95bd45b7d58343d8b90d904450a545144eec19a002511163426f8ab1fae29", size = 141092, upload-time = "2026-05-10T18:15:34.795Z" },
+    { url = "https://files.pythonhosted.org/packages/cf/08/5c5bf772920b7ebac6e32bc91a643e0ab3870199c0b542356d3baa83970a/librt-0.11.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:4ee278c769a713638cdacd4c0436d72156e75df3ebc0166ab2b9dc43acc386c9", size = 142035, upload-time = "2026-05-10T18:15:36.242Z" },
+    { url = "https://files.pythonhosted.org/packages/06/20/662a03d254e5b000d838e8b345d83303ddb768c080fd488e40634c0fa66b/librt-0.11.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f230cb1cbc9faaa616f9a678f530ebcf186e414b6bcbd88b960e4ba1b92428d5", size = 475022, upload-time = "2026-05-10T18:15:37.56Z" },
+    { url = "https://files.pythonhosted.org/packages/de/f3/aa81523e45184c6ec23dc7f63263362ec55f80a09d424c012359ecbe7e35/librt-0.11.0-cp311-cp311-manylinux2014_i686.manylinux_2_17_i686.manylinux_2_28_i686.whl", hash = "sha256:5d63c855d86938d9de93e265c9bd8c705b51ec494de5738340ee93767a686e4b", size = 467273, upload-time = "2026-05-10T18:15:39.182Z" },
+    { url = "https://files.pythonhosted.org/packages/6b/6f/59c74b560ca8853834d5501d589c8a2519f4184f273a085ffd0f37a1cc47/librt-0.11.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:993f028be9e96a08d31df3479ac80d99be374d17f3b78e4796b3fd3c913d4e89", size = 497083, upload-time = "2026-05-10T18:15:40.634Z" },
+    { url = "https://files.pythonhosted.org/packages/fe/7b/5aa4d2c9600a719401160bf7055417df0b2a47439b9d88286ce45e56b65f/librt-0.11.0-cp311-cp311-manylinux_2_34_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:258d73a0aa66a055e65b2e4d1b8cdb23b9d132c5bb915d9547d804fcaed116cc", size = 489139, upload-time = "2026-05-10T18:15:41.934Z" },
+    { url = "https://files.pythonhosted.org/packages/d6/31/9143803d7da6856a69153785768c4936864430eec0fd9461c3ea527d9922/librt-0.11.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:0827efe7854718f04aaddf6496e96960a956e676fe1d0f04eb41511fd8ad06d5", size = 508442, upload-time = "2026-05-10T18:15:43.206Z" },
+    { url = "https://files.pythonhosted.org/packages/2f/5a/bce08184488426bda4ccc2c4964ac048c8f68ae89bd7120082eef4233cfd/librt-0.11.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:7753e57d6e12d019c0d8786f1c09c709f4c3fcc57c3887b24e36e6c06ec938b7", size = 514230, upload-time = "2026-05-10T18:15:44.761Z" },
+    { url = "https://files.pythonhosted.org/packages/89/8c/bb5e213d254b7505a0e658da199d8ab719086632ce09eef311ab27976523/librt-0.11.0-cp311-cp311-musllinux_1_2_riscv64.whl", hash = "sha256:11bd19822431cc21af9f27374e7ae2e58103c7d98bda823536a6c47f6bb2bb3d", size = 494231, upload-time = "2026-05-10T18:15:46.308Z" },
+    { url = "https://files.pythonhosted.org/packages/9d/fb/541cdad5b1ab1300398c74c4c9a497b88e5074c21b1244c8f49731d3a284/librt-0.11.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:22bdf239b219d3993761a148ffa134b19e52e9989c84f845d5d7b71d70a17412", size = 537585, upload-time = "2026-05-10T18:15:47.629Z" },
+    { url = "https://files.pythonhosted.org/packages/8f/f2/464bb69295c320cb06bddb4f14a4ec67934ee14b2bffb12b19fb7ab287ba/librt-0.11.0-cp311-cp311-win32.whl", hash = "sha256:46c60b61e308eb535fbd6fa622b1ee1bb2815691c1ad9c98bf7b84952ec3bc8d", size = 100509, upload-time = "2026-05-10T18:15:49.157Z" },
+    { url = "https://files.pythonhosted.org/packages/6d/e7/a17ee1788f9e4fbf548c19f4afa07c92089b9e24fef6cb2410863781ef4c/librt-0.11.0-cp311-cp311-win_amd64.whl", hash = "sha256:902e546ff044f579ff1c953ff5fce97b636fe9e3943996b2177710c6ef076f73", size = 118628, upload-time = "2026-05-10T18:15:50.345Z" },
+    { url = "https://files.pythonhosted.org/packages/cc/c7/6c766214f9f9903bcfcfbef97d807af8d8f5aa3502d247858ab17582d212/librt-0.11.0-cp311-cp311-win_arm64.whl", hash = "sha256:65ac3bc20f78aa0ee5ae84baa68917f89fef4af63e941084dd019a0d0e749f0c", size = 103122, upload-time = "2026-05-10T18:15:52.068Z" },
+    { url = "https://files.pythonhosted.org/packages/8b/d0/07c77e067f0838949b43bd89232c29d72efebb9d2801a9750184eb706b71/librt-0.11.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:b87504f1690a23b9a2cca841191a04f83895d4fc2dd04df91d82b1a04ca2ad46", size = 144147, upload-time = "2026-05-10T18:15:53.227Z" },
+    { url = "https://files.pythonhosted.org/packages/7a/24/8493538fa4f62f982686398a5b8f68008138a75086abdea19ade64bf4255/librt-0.11.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40071fc5fe0ce8daa6de616702314a01e1250711682b0523d6ab8d4525910cb3", size = 143614, upload-time = "2026-05-10T18:15:54.657Z" },
+    { url = "https://files.pythonhosted.org/packages/ff/1e/f8bad050810d9171f34a1648ed910e56814c2ba61639f2bd53c6377ae24b/librt-0.11.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:137e79445c896a0ea7b265f52d23954e05b64222ee1af69e2cb34219067cbb67", size = 485538, upload-time = "2026-05-10T18:15:56.117Z" },
+    { url = "https://files.pythonhosted.org/packages/c0/fe/3594ebfbaf03084ba4b120c9ba5c3183fd938a48725e9bbe6ff0a5159ad8/librt-0.11.0-cp312-cp312-manylinux2014_i686.manylinux_2_17_i686.manylinux_2_28_i686.whl", hash = "sha256:cca6644054e78746d8d4ef238681f9c34ff8b584fe6b988ecebb8db3b15e622a", size = 479623, upload-time = "2026-05-10T18:15:57.544Z" },
+    { url = "https://files.pythonhosted.org/packages/b0/da/5d1876984b3746c85dbd219dbfcb73c85f54ee263fd32e5b2a632ec14571/librt-0.11.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d5b0eea49f5562861ee8d757a32ef7d559c1d35be2aaaa1ec28941d74c9ffc8a", size = 513082, upload-time = "2026-05-10T18:15:58.805Z" },
+    { url = "https://files.pythonhosted.org/packages/19/6e/55bdf5d5ca00c3e18430690bf2c953d8d3ffd3c337418173d33dec985dc9/librt-0.11.0-cp312-cp312-manylinux_2_34_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:0d1029d7e1ae1a7e647ed6fb5df8c4ce2dffefb7a9f5fd1376a4554d96dac09f", size = 508105, upload-time = "2026-05-10T18:16:00.2Z" },
+    { url = "https://files.pythonhosted.org/packages/07/10/f1f23a7c595ee90ece4d35c851e5d104b1311a887ed1b4ac4c35bbd13da8/librt-0.11.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:bc3ce6b33c5828d9e80592011a5c584cb2ce86edbc4088405f70da47dc1d1b3b", size = 522268, upload-time = "2026-05-10T18:16:01.708Z" },
+    { url = "https://files.pythonhosted.org/packages/b6/02/5720f5697a7f54b78b3aefbe20df3a48cedcff1276618c4aa481177942ed/librt-0.11.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:936c5995f3514a42111f20099397d8177c79b4d7e70961e396c6f5a0a3566766", size = 527348, upload-time = "2026-05-10T18:16:03.496Z" },
+    { url = "https://files.pythonhosted.org/packages/50/db/b4a47c6f91db4ff76348a0b3dd0cc65e090a078b765a810a62ff9434c3d3/librt-0.11.0-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:9bc0ca6ad9381cbe8e4aa6e5726e4c80c78115a6e9723c599ed1d73e092bc49d", size = 516294, upload-time = "2026-05-10T18:16:05.173Z" },
+    { url = "https://files.pythonhosted.org/packages/9e/58/9384b2f4eb1ed1d273d40948a7c5c4b2360213b402ef3be4641c06299f9c/librt-0.11.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:070aa8c26c0a74774317a72df8851facc7f0f012a5b406557ac56992d92e1ec8", size = 553608, upload-time = "2026-05-10T18:16:06.839Z" },
+    { url = "https://files.pythonhosted.org/packages/21/7b/5aa8848a7c6a9278c79375146da1812e695754ceec5f005e6043461a7315/librt-0.11.0-cp312-cp312-win32.whl", hash = "sha256:6bf14feb84b05ae945277395451998c89c54d0def4070eb5c08de544930b245a", size = 101879, upload-time = "2026-05-10T18:16:08.103Z" },
+    { url = "https://files.pythonhosted.org/packages/37/33/8a745436944947575b584231750a41417de1a38cf6a2e9251d1065651c09/librt-0.11.0-cp312-cp312-win_amd64.whl", hash = "sha256:75672f0bc524ede266287d532d7923dbce94c7514ad07627bac3d0c6d92cc4d9", size = 119831, upload-time = "2026-05-10T18:16:09.174Z" },
+    { url = "https://files.pythonhosted.org/packages/59/67/a6739ac96e28b7855808bdb0370e250606104a859750d209e5a0716fe7ab/librt-0.11.0-cp312-cp312-win_arm64.whl", hash = "sha256:2f10cf143e4a9bb0f4f5af568a00df94a2d69ef41c2579584454bb0fe5cc642c", size = 103470, upload-time = "2026-05-10T18:16:10.369Z" },
+    { url = "https://files.pythonhosted.org/packages/82/61/e59168d4d0bf2bf90f4f0caf7a001bfc60254c3af4586013b04dc3ef517b/librt-0.11.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:78dc31f7fdfe9c9d0eb0e8f42d139db230e826415bbcabd9f0e9faaaee909894", size = 144119, upload-time = "2026-05-10T18:16:11.771Z" },
+    { url = "https://files.pythonhosted.org/packages/61/fd/caa1d60b12f7dd79ccea23054e06eeaebe266a5f52c40a6b651069200ce5/librt-0.11.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:fa475675db22290c3158e1d42326d0f5a65f04f44a0e68c3630a25b53560fb9c", size = 143565, upload-time = "2026-05-10T18:16:13.334Z" },
+    { url = "https://files.pythonhosted.org/packages/b8/a9/dc744f5c2b4978d48db970be29f22716d3413d28b14ad99740817315cf2c/librt-0.11.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:621db29691044bdeda22e789e482e1b0f3a985d90e3426c9c6d17606416205ea", size = 485395, upload-time = "2026-05-10T18:16:14.729Z" },
+    { url = "https://files.pythonhosted.org/packages/8f/21/7f8e97a1e4dae952a5a95948f6f8507a173bc1e669f54340bba6ca1ca31b/librt-0.11.0-cp313-cp313-manylinux2014_i686.manylinux_2_17_i686.manylinux_2_28_i686.whl", hash = "sha256:a9010e2ed5b3a9e158c5fd966b3ab7e834bb3d3aacc8f66c91dd4b57a3799230", size = 479383, upload-time = "2026-05-10T18:16:16.321Z" },
+    { url = "https://files.pythonhosted.org/packages/a6/6d/d8ee9c114bebf2c50e29ec2aa940826fccb62a645c3e4c18760987d0e16d/librt-0.11.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:7c39513d8b7477a2e1ed8c43fc21c524e8d5a0f8d4e8b7b074dbdbe7820a08e2", size = 513010, upload-time = "2026-05-10T18:16:17.647Z" },
+    { url = "https://files.pythonhosted.org/packages/f0/43/0b5708af2bd30a46400e72ba6bdaa8f066f15fb9a688527e34220e8d6c06/librt-0.11.0-cp313-cp313-manylinux_2_34_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:7aef3cf1d5af86e770ab04bfd993dfc4ae8b8c17f66fb77dd4a7d50de7bbb1a3", size = 508433, upload-time = "2026-05-10T18:16:19.309Z" },
+    { url = "https://files.pythonhosted.org/packages/4a/50/356187247d09013490481033183b3532b58acf8028bcb34b2b56a375c9b2/librt-0.11.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:557183ddc36babe46b27dd60facbd5adb4492181a5be887587d57cda6e092f21", size = 522595, upload-time = "2026-05-10T18:16:20.642Z" },
+    { url = "https://files.pythonhosted.org/packages/40/e7/c6ac4240899c7f3248079d5a9900debe0dadb3fdeaf856684c987105ba47/librt-0.11.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:83d3e1f72bd42f6c5c0b7daec530c3f829bd02db42c70b8ddf0c2d90a2459930", size = 527255, upload-time = "2026-05-10T18:16:22.352Z" },
+    { url = "https://files.pythonhosted.org/packages/eb/b5/a81322dbeedeeaf9c1ee6f001734d28a09d8383ac9e6779bc24bbd0743c6/librt-0.11.0-cp313-cp313-musllinux_1_2_riscv64.whl", hash = "sha256:4ce1f21fbe589bc1afd7872dece84fb0e1144f794a288e58a10d2c54a55c43be", size = 516847, upload-time = "2026-05-10T18:16:23.627Z" },
+    { url = "https://files.pythonhosted.org/packages/ae/66/6e6323787d592b55204a42595ff1102da5115601b53a7e9ddebc889a6da5/librt-0.11.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:970b09f7044ea2b64c9da42fd3d335666518cfd1c6e8a182c95da73d0214b41e", size = 553920, upload-time = "2026-05-10T18:16:25.025Z" },
+    { url = "https://files.pythonhosted.org/packages/9c/21/623f8ca230857102066d9ca8c6c1734995908c4d0d1bee7bb2ef0021cb33/librt-0.11.0-cp313-cp313-win32.whl", hash = "sha256:78fddc31cd4d3caa897ad5d31f856b1faadc9474021ad6cb182b9018793e254e", size = 101898, upload-time = "2026-05-10T18:16:26.649Z" },
+    { url = "https://files.pythonhosted.org/packages/b3/1d/b4ebd44dd723f768469007515cb92251e0ae286c94c140f374801140fa74/librt-0.11.0-cp313-cp313-win_amd64.whl", hash = "sha256:8ca8aa88751a775870b764e93bad5135385f563cb8dcee399abf034ea4d3cb47", size = 119812, upload-time = "2026-05-10T18:16:27.859Z" },
+    { url = "https://files.pythonhosted.org/packages/3b/e4/b2f4ca7965ca373b491cdb4bc25cdb30c1649ca81a8782056a83850292a9/librt-0.11.0-cp313-cp313-win_arm64.whl", hash = "sha256:96f044bb325fd9cf1a723015638c219e9143f0dfbc0ca54c565df2b7fc748b44", size = 103448, upload-time = "2026-05-10T18:16:29.066Z" },
+    { url = "https://files.pythonhosted.org/packages/29/eb/dbce197da4e227779e56b5735f2decc3eb36e55a1cdbf1bd65d6639d76c1/librt-0.11.0-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:4a017a95e5837dc15a8c5661d60e05daa96b90908b1aa6b7acdf443cd25c8ebd", size = 143345, upload-time = "2026-05-10T18:16:30.674Z" },
+    { url = "https://files.pythonhosted.org/packages/76/a3/254bebd0c11c8ba684018efb8006ff22e466abce445215cca6c778e7d9de/librt-0.11.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:b1ecbd9819deccc39b7542bf4d2a740d8a620694d39989e58661d3763458f8d4", size = 143131, upload-time = "2026-05-10T18:16:32.037Z" },
+    { url = "https://files.pythonhosted.org/packages/f1/3f/f77d6122d21ac7bf6ae8a7dfced1bd2a7ac545d3273ebdcaf8042f6d619f/librt-0.11.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:7da327dacd7be8f8ec36547373550744a3cc0e536d54665cd83f8bcd961200e8", size = 477024, upload-time = "2026-05-10T18:16:33.493Z" },
+    { url = "https://files.pythonhosted.org/packages/ac/0a/2c996dadebaa7d9bbbd43ef2d4f3e66b6da545f838a41694ef6172cebec8/librt-0.11.0-cp314-cp314-manylinux2014_i686.manylinux_2_17_i686.manylinux_2_28_i686.whl", hash = "sha256:0dc56b1f8d06e60db362cc3fdae206681817f86ce4725d34511473487f12a34b", size = 474221, upload-time = "2026-05-10T18:16:34.864Z" },
+    { url = "https://files.pythonhosted.org/packages/0a/7e/f5d92af8486b8272c23b3e686b46ff72d89c8169585eb61eef01a2ac7147/librt-0.11.0-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:05fb8fb2ab90e21c8d12ea240d744ad514da9baf381ebfa70d91d20d21713175", size = 505174, upload-time = "2026-05-10T18:16:36.705Z" },
+    { url = "https://files.pythonhosted.org/packages/af/1a/cb0734fe86398eb33193ab753b7326255c74cac5eb09e76b9b16536e7adb/librt-0.11.0-cp314-cp314-manylinux_2_34_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:cae74872be221df4374d10fec61f93ed1513b9546ea84f2c0bf73ab3e9bd0b03", size = 497216, upload-time = "2026-05-10T18:16:38.418Z" },
+    { url = "https://files.pythonhosted.org/packages/18/06/094820f91558b66e29943c0ec41c9914f460f48dd51fc503c3101e10842d/librt-0.11.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:32bcc918c0148eb7e3d57385125bac7e5f9e4359d05f07448b09f6f778c2f31c", size = 513921, upload-time = "2026-05-10T18:16:39.848Z" },
+    { url = "https://files.pythonhosted.org/packages/0b/c2/00de9018871a282f530cacb457d5ec0428f6ac7e6fedde9aff7468d9fb04/librt-0.11.0-cp314-cp314-musllinux_1_2_i686.whl", hash = "sha256:f9743fc99135d5f78d2454435615f6dec0473ca507c26ce9d92b10b562a280d3", size = 520850, upload-time = "2026-05-10T18:16:41.471Z" },
+    { url = "https://files.pythonhosted.org/packages/51/9d/64631832348fd1834fb3a61b996434edddaaf25a31d03b0a76273159d2cf/librt-0.11.0-cp314-cp314-musllinux_1_2_riscv64.whl", hash = "sha256:5ba067f4aadae8fda802d91d2124c90c42195ff32d9161d3549e6d05cfe26f96", size = 504237, upload-time = "2026-05-10T18:16:43.15Z" },
+    { url = "https://files.pythonhosted.org/packages/a5/ec/ae5525eb16edc827a044e7bb8777a455ff95d4bca9379e7e6bddd7383647/librt-0.11.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:de3bf945454d032f9e390b85c4072e0a0570bf825421c8be0e71209fa65e1abe", size = 546261, upload-time = "2026-05-10T18:16:44.408Z" },
+    { url = "https://files.pythonhosted.org/packages/5a/09/adce371f27ca039411da9659f7430fcc2ba6cd0c7b3e4467a0f091be7fa9/librt-0.11.0-cp314-cp314-win32.whl", hash = "sha256:d2277a05f6dcb9fd13db9566aac4fabd68c3ea1ea46ee5567d4eef8efa495a2f", size = 96965, upload-time = "2026-05-10T18:16:46.039Z" },
+    { url = "https://files.pythonhosted.org/packages/d6/ee/8ac720d98548f173c7ce2e632a7ca94673f74cacd5c8162a84af5b35958a/librt-0.11.0-cp314-cp314-win_amd64.whl", hash = "sha256:ab73e8db5e3f564d812c1f5c3a175930a5f9bc96ccb5e3b22a34d7858b401cf7", size = 115151, upload-time = "2026-05-10T18:16:47.133Z" },
+    { url = "https://files.pythonhosted.org/packages/94/20/c900cf14efeb09b6bef2b2dff20779f73464b97fd58d1c6bccc379588ae3/librt-0.11.0-cp314-cp314-win_arm64.whl", hash = "sha256:aea3caa317752e3a466fa8af45d91ee0ea8c7fdd96e42b0a8dd9b76a7931eba1", size = 98850, upload-time = "2026-05-10T18:16:48.597Z" },
+    { url = "https://files.pythonhosted.org/packages/0c/71/944bfe4b64e12abffcd3c15e1cce07f72f3d55655083786285f4dedeb532/librt-0.11.0-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:d1b36540d7aaf9b9101b3a6f376c8d8e9f7a9aec93ed05918f2c69d493ffef72", size = 151138, upload-time = "2026-05-10T18:16:49.839Z" },
+    { url = "https://files.pythonhosted.org/packages/b6/10/99e64a5c86989357fda078c8143c533389585f6473b7439172dd8f3b3b2d/librt-0.11.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:efbb343ab2ce3540f4ecbe6315d677ed70f37cd9a72b1e58066c918ca83acbaa", size = 151976, upload-time = "2026-05-10T18:16:51.062Z" },
+    { url = "https://files.pythonhosted.org/packages/21/31/5072ad880946d83e5ea4147d6d018c78eefce85b77819b19bdd0ee229435/librt-0.11.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:aa0dd688aab3f7914d3e6e5e3554978e0383312fb8e771d84be008a35b9ee548", size = 557927, upload-time = "2026-05-10T18:16:52.632Z" },
+    { url = "https://files.pythonhosted.org/packages/5e/8d/70b5fb7cfbab60edbe7381614ab985da58e144fbf465c86d44c95f43cdca/librt-0.11.0-cp314-cp314t-manylinux2014_i686.manylinux_2_17_i686.manylinux_2_28_i686.whl", hash = "sha256:f5fb36b8c6c63fdcbb1d526d94c0d1331610d43f4118cc1beb4efef4f3faacb2", size = 539698, upload-time = "2026-05-10T18:16:53.934Z" },
+    { url = "https://files.pythonhosted.org/packages/fa/a3/ba3495a0b3edbd24a4cae0d1d3c64f39a9fc45d06e812101289b50c1a619/librt-0.11.0-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4a9a237d13addb93715b6fee74023d5ee3469b53fce527626c0e088aa585805f", size = 577162, upload-time = "2026-05-10T18:16:55.589Z" },
+    { url = "https://files.pythonhosted.org/packages/f7/db/36e25fb81f99937ff1b96612a1dc9fd66f039cb9cc3aee12c01fac31aab9/librt-0.11.0-cp314-cp314t-manylinux_2_34_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:5ddd17bd87b2c56ddd60e546a7984a2e64c4e8eab92fb4cf3830a48ad5469d51", size = 566494, upload-time = "2026-05-10T18:16:56.975Z" },
+    { url = "https://files.pythonhosted.org/packages/33/0d/3f622b47f0b013eeb9cf4cc07ae9bfe378d832a4eec998b2b209fe84244d/librt-0.11.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:bd43992b4473d42f12ff9e68326079f0696d9d4e6000e8f39a0238d482ba6ee2", size = 596858, upload-time = "2026-05-10T18:16:58.374Z" },
+    { url = "https://files.pythonhosted.org/packages/a9/02/71b90bc93039c46a2000651f6ad60122b114c8f54c4ad306e0e96f5b75ad/librt-0.11.0-cp314-cp314t-musllinux_1_2_i686.whl", hash = "sha256:f8e3e8056dd674e279741485e2e512d6e9a751c7455809d0114e6ebf8d781085", size = 590318, upload-time = "2026-05-10T18:16:59.676Z" },
+    { url = "https://files.pythonhosted.org/packages/04/04/418cb3f75621e2b761fb1ab0f017f4d70a1a72a6e7c74ee4f7e8d198c2f3/librt-0.11.0-cp314-cp314t-musllinux_1_2_riscv64.whl", hash = "sha256:c1f708d8ae9c56cf38a903c44297243d2ec83fd82b396b977e0144a3e76217e3", size = 575115, upload-time = "2026-05-10T18:17:01.007Z" },
+    { url = "https://files.pythonhosted.org/packages/cc/2c/5a2183ac58dd911f26b5d7e7d7d8f1d87fcecdddd99d6c12169a258ff62c/librt-0.11.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:0add982e0e7b9fc14cf4b33789d5f13f66581889b88c2f58099f6ce8f92617bd", size = 617918, upload-time = "2026-05-10T18:17:02.682Z" },
+    { url = "https://files.pythonhosted.org/packages/15/1f/dc6771a52592a4451be6effa200cbfc9cec61e4393d3033d81a9d307961d/librt-0.11.0-cp314-cp314t-win32.whl", hash = "sha256:2b481d846ac894c4e8403c5fd0e87c5d11d6499e404b474602508a224ff531c8", size = 103562, upload-time = "2026-05-10T18:17:03.99Z" },
+    { url = "https://files.pythonhosted.org/packages/62/4a/7d1415567027286a75ba1093ec4aca11f073e0f559c530cf3e0a757ad55c/librt-0.11.0-cp314-cp314t-win_amd64.whl", hash = "sha256:28edb433edde181112a908c78907af28f964eabc15f4dd16c9d66c834302677c", size = 124327, upload-time = "2026-05-10T18:17:05.465Z" },
+    { url = "https://files.pythonhosted.org/packages/ce/62/b40b382fa0c66fee1478073eb8db352a4a6beda4a1adccf1df911d8c289c/librt-0.11.0-cp314-cp314t-win_arm64.whl", hash = "sha256:dee008f20b542e3cd162ba338a7f9ec0f6d23d395f66fe8aeeec3c9d067ea253", size = 102572, upload-time = "2026-05-10T18:17:06.809Z" },
+]
+
+[[package]]
+name = "mypy"
+version = "2.1.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "ast-serialize" },
+    { name = "librt", marker = "platform_python_implementation != 'PyPy'" },
+    { name = "mypy-extensions" },
+    { name = "pathspec" },
+    { name = "tomli", marker = "python_full_version < '3.11'" },
+    { name = "typing-extensions" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/82/15/cca9d88503549ed6fedeaa1d448cdddd542ee8a490232d732e278036fbf2/mypy-2.1.0.tar.gz", hash = "sha256:81e76ad12c2d804512e9b13240d1588316531bfba07558286078bfbce9613633", size = 3898359, upload-time = "2026-05-11T18:37:36.237Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/a4/71/d351dca3e9b30da2328ee9d445c88b8388072808ebfbc49eb69d30b67749/mypy-2.1.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:11a6beb180257a805961aea9ec591bbd0bd17f1e18d35b8456d57aee5bedfedc", size = 14778792, upload-time = "2026-05-11T18:36:23.605Z" },
+    { url = "https://files.pythonhosted.org/packages/2f/45/7d51594b644c17c0bcf74ed8cd5fc33b324276d708e8506f220b70dab9d9/mypy-2.1.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:8ef78c1d306bbf9a8a12f526c44902c9c28dffd6c52c52bf6a72641ce18d3849", size = 13645739, upload-time = "2026-05-11T18:37:22.752Z" },
+    { url = "https://files.pythonhosted.org/packages/65/01/455c31b170e9468265074840bf18863a8482a24103fdaabe4e199392aa5f/mypy-2.1.0-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c209a90853081ff01d01ee895cafe10f7db1474e0d95beaeef0f6c1db9119bbd", size = 14074199, upload-time = "2026-05-11T18:35:09.292Z" },
+    { url = "https://files.pythonhosted.org/packages/41/5a/93093f0b29a9e982deafde698f740a2eb2e05886e79ccf0594c7fd5413a3/mypy-2.1.0-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:47cebf61abde7c088a4e27718a8b13a81655686b2e9c251f5c0915a802248166", size = 14953128, upload-time = "2026-05-11T18:31:57.678Z" },
+    { url = "https://files.pythonhosted.org/packages/7f/2f/a196f5331d96170ad3d28f144d2aba690d4b2911381f68d51e489c7ab82a/mypy-2.1.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:d57a90ae5e872138a425ec328edbc9b235d1934c4377881a33ec05b341acc9a8", size = 15249378, upload-time = "2026-05-11T18:33:00.101Z" },
+    { url = "https://files.pythonhosted.org/packages/54/de/94d321cc12da9f71341ac0c270efbed5c725750c7b4c334d957de9a087d9/mypy-2.1.0-cp310-cp310-win_amd64.whl", hash = "sha256:aea7f7a8a55b459c34275fc468ada6ca7c173a5e43a68f5dbe588a563d8a06b8", size = 11060994, upload-time = "2026-05-11T18:33:18.848Z" },
+    { url = "https://files.pythonhosted.org/packages/e1/62/0c27ca55219a7c764a7fb88c7bb2b7b2f9780ade8bbf16bc8ed8400eef6b/mypy-2.1.0-cp310-cp310-win_arm64.whl", hash = "sha256:c989640253f0d76843e9c6c1bbf4bd48c5e85ada61bde4beb37cb3eca035685e", size = 9976743, upload-time = "2026-05-11T18:31:25.554Z" },
+    { url = "https://files.pythonhosted.org/packages/0a/a1/639f3024794a2a15899cb90707fe02e044c4412794c39c5769fd3df2e2ef/mypy-2.1.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:a683016b16fe2f572dc04c72be7ee0504ac1605a265d0200f5cea695fb788f41", size = 14691685, upload-time = "2026-05-11T18:33:27.973Z" },
+    { url = "https://files.pythonhosted.org/packages/3b/08/9a585dea4325f20d8b80dc78623fa50d1fd2173b710f6237afd6ba6ab39b/mypy-2.1.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:1a293c534adb55271fef24a26da04b855540a8c13cc07bc5917b9fd2c394f2ca", size = 13555165, upload-time = "2026-05-11T18:32:16.107Z" },
+    { url = "https://files.pythonhosted.org/packages/81/dc/7c42cc9c6cb01e8eb09961f1f738741d3e9c7e9d5c5b30ec69222625cd5f/mypy-2.1.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:7406f4d048e71e576f5356d317e5b0a9e666dfd966bd99f9d14ca06e1a341538", size = 13994376, upload-time = "2026-05-11T18:32:39.256Z" },
+    { url = "https://files.pythonhosted.org/packages/d4/fa/285946c33bce716e082c11dfeee9ee196eaf1f5042efb3581a31f9f205e4/mypy-2.1.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:e0210d626fc8b31ccc90233754c7bc90e1f43205e85d96387f7db1285b55c398", size = 14864618, upload-time = "2026-05-11T18:34:49.765Z" },
+    { url = "https://files.pythonhosted.org/packages/2b/83/82397f48af6c27e295d57979ded8490c9829040152cf7571b2f026aeb9a0/mypy-2.1.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:3712c20deed54e814eaaa825603bada8ea1c390670a397c95b98405347acc563", size = 15102063, upload-time = "2026-05-11T18:34:05.855Z" },
+    { url = "https://files.pythonhosted.org/packages/40/68/b02dec39057b88eb03dc0aa854732e26e8361f34f9d0e20c7614967d1eba/mypy-2.1.0-cp311-cp311-win_amd64.whl", hash = "sha256:fcaa0e479066e31f7cceb6a3bea39cb22b2ff51a6b2f24f193d19179ba17c389", size = 11060564, upload-time = "2026-05-11T18:35:36.494Z" },
+    { url = "https://files.pythonhosted.org/packages/cf/a8/ea3dcbef31f99b634f2ee23bb0321cbc8c1b388b76a861eb849f13c347dc/mypy-2.1.0-cp311-cp311-win_arm64.whl", hash = "sha256:0b1a5260c95aa443083f9ed3592662941951bca3d4ca224a5dc517c38b7cf666", size = 9966983, upload-time = "2026-05-11T18:37:14.139Z" },
+    { url = "https://files.pythonhosted.org/packages/95/b1/55861beb5c339b44f9a2ba92df9e2cb1eeb4ae1eee674cdf7772c797778b/mypy-2.1.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:244358bf1c0da7722230bce60683d52e8e9fd030554926f15b747a84efb5b3af", size = 14874381, upload-time = "2026-05-11T18:37:31.784Z" },
+    { url = "https://files.pythonhosted.org/packages/0b/b3/b7f770114b7d0ac92d0f76e8d93c2780844a70488a90e91821927850da86/mypy-2.1.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:4ec7c57657493c7a75534df2751c8ae2cda383c16ecc55d2106c54476b1b16f6", size = 13665501, upload-time = "2026-05-11T18:34:23.063Z" },
+    { url = "https://files.pythonhosted.org/packages/b6/f3/8ae2037967e2126689a0c11d99e2b707134a565191e92c60ca2572aec60a/mypy-2.1.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d8161b6ff4392410023224f0969d17db93e1e154bc3e4ba62598e720723ae211", size = 14045750, upload-time = "2026-05-11T18:31:48.151Z" },
+    { url = "https://files.pythonhosted.org/packages/a0/32/615eb5911859e43d054941b0d0a7d06cfa2870eba86529cf385b052b111c/mypy-2.1.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:bf03e12003084a67395184d3eb8cbd6a489dc3655b5664b28c210a9e2403ab0b", size = 15061630, upload-time = "2026-05-11T18:37:06.898Z" },
+    { url = "https://files.pythonhosted.org/packages/d4/03/4eafbfff8bfab1b87082741eae6e6a624028c984e6708b73bce2a8570c9d/mypy-2.1.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:20509760fd791c51579d573153407d226385ec1f8bcce55d730b354f3336bc22", size = 15288831, upload-time = "2026-05-11T18:31:18.07Z" },
+    { url = "https://files.pythonhosted.org/packages/99/ee/919661478e5891a3c96e549c036e467e64563ab85995b10c53c8358e16a3/mypy-2.1.0-cp312-cp312-win_amd64.whl", hash = "sha256:6753d0c1fdd6b1a23b9e4f283ce80b2153b724adcb2653b20b85a8a28ac6436b", size = 11135228, upload-time = "2026-05-11T18:34:31.23Z" },
+    { url = "https://files.pythonhosted.org/packages/24/0a/6a12b9782ca0831a553192f351679f4548abc9d19a7cc93bb7feb02084c7/mypy-2.1.0-cp312-cp312-win_arm64.whl", hash = "sha256:98ebb6589bb3b6d0c6f0c459d53ca55b8091fbc13d277c4041c885392e8195e8", size = 10040684, upload-time = "2026-05-11T18:36:48.199Z" },
+    { url = "https://files.pythonhosted.org/packages/6e/dd/c7191469c777f07689c032a8f7326e393ea34c92d6d76eb7ce5ba57ea66d/mypy-2.1.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:35aac3bb114e03888f535d5eb51b8bafbb3266586b599da1940f9b1be3ec5bd5", size = 14852174, upload-time = "2026-05-11T18:31:38.929Z" },
+    { url = "https://files.pythonhosted.org/packages/55/8c/aed55408879043d72bb9135f4d0d19a02b886dd569631e113e3d2706cb8d/mypy-2.1.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:8de55a8c861f2a49331f807be98d90caeceeef520bde13d43a160207f8af613e", size = 13651542, upload-time = "2026-05-11T18:36:04.636Z" },
+    { url = "https://files.pythonhosted.org/packages/3a/8e/f371a824b1f1fa8ea6e3dbb8703d232977d572be2329554a3bc4d960302f/mypy-2.1.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5fdf2941a07434af755837d9880f7d7d25f1dacb1af9dcd4b9b66f2220a3024e", size = 14033929, upload-time = "2026-05-11T18:35:55.742Z" },
+    { url = "https://files.pythonhosted.org/packages/94/21/f54be870d6dd53a82c674407e0f8eed7174b05ec78d42e5abd7b42e84fd5/mypy-2.1.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:e195b817c13f02352a9c124301f9f30f078405444679b6753c1b96b6eed37285", size = 15039200, upload-time = "2026-05-11T18:33:10.281Z" },
+    { url = "https://files.pythonhosted.org/packages/17/99/bf21748626a40ce59fd29a39386ab46afec88b7bd2f0fa6c3a97c995523f/mypy-2.1.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:5431d42af987ebd92ba2f71d45c85ed41d8e6ca9f5fd209a69f68f707d2469e5", size = 15272690, upload-time = "2026-05-11T18:32:07.205Z" },
+    { url = "https://files.pythonhosted.org/packages/d6/d7/9e90d2cf47100bea550ed2bc7b0d4de3a62181d84d5e37da0003e8462637/mypy-2.1.0-cp313-cp313-win_amd64.whl", hash = "sha256:767fe8c66dc3e01e19e1737d4c38ebefead16125e1b8e58ad421903b376f5c65", size = 11147435, upload-time = "2026-05-11T18:33:56.477Z" },
+    { url = "https://files.pythonhosted.org/packages/ec/46/e5c449e858798e35ffc90946282a27c62a77be743fe17480e4977374eb91/mypy-2.1.0-cp313-cp313-win_arm64.whl", hash = "sha256:ecfe70d43775ab99562ab128ce49854a362044c9f894961f68f898c23cb7429d", size = 10035052, upload-time = "2026-05-11T18:32:30.049Z" },
+    { url = "https://files.pythonhosted.org/packages/b0/ca/b279a672e874aedd5498ae25f722dacc8aa86bbffb939b3f97cbb1cf6686/mypy-2.1.0-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:7354c5a7f69d9345c3d6e69921d57088eea3ddeeb6b20d34c1b3855b02c36ec2", size = 14848422, upload-time = "2026-05-11T18:35:45.984Z" },
+    { url = "https://files.pythonhosted.org/packages/27/e6/3efe56c631d959b9b4454e208b0ac4b7f4f58b404c89f8bec7b49efdfc21/mypy-2.1.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:49890d4f76ac9e06ec117f9e09f3174da70a620a0c300953d8595c926e80947f", size = 13677374, upload-time = "2026-05-11T18:36:57.188Z" },
+    { url = "https://files.pythonhosted.org/packages/84/7f/8107ea87a44fd1f1b59882442f033c9c3488c127201b1d1d15f1cbd6022e/mypy-2.1.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:761be68e023ef5d94678772396a8af1220030f80837a3afd8d0aef3b419666f4", size = 14055743, upload-time = "2026-05-11T18:35:18.361Z" },
+    { url = "https://files.pythonhosted.org/packages/51/4d/b6d34db183133b83761b9199a82d31557cdbb70a380d8c3b3438e11882a3/mypy-2.1.0-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c90345fc182dc363b891350457ec69c35140858538f38b4540845afcc32b1aef", size = 15020937, upload-time = "2026-05-11T18:34:59.618Z" },
+    { url = "https://files.pythonhosted.org/packages/ff/d7/f08360c691d758acb02f45022c34d98b92892f4ea756644e1000d4b9f3d8/mypy-2.1.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:b84802e7b5a6daf1f5e15bc9fcd7ddae77be13981ffab037f1c67bb84d67d135", size = 15253371, upload-time = "2026-05-11T18:36:41.081Z" },
+    { url = "https://files.pythonhosted.org/packages/67/1b/09460a13719530a19bce27bd3bc8449e83569dd2ba7faf51c9c3c30c0b61/mypy-2.1.0-cp314-cp314-win_amd64.whl", hash = "sha256:022c771234936ceac541ebaf836fe9e2abeb3f5e09aff21588fe543ff006fe21", size = 11326429, upload-time = "2026-05-11T18:34:13.526Z" },
+    { url = "https://files.pythonhosted.org/packages/40/62/75dbf0f82f7b6680340efc614af29dd0b3c17b8a4f1cd09b8bd2fd6bc814/mypy-2.1.0-cp314-cp314-win_arm64.whl", hash = "sha256:498207db725cec88829a6a5c2fc771205fd043719ef98bc49aba8fb9fc4e6d57", size = 10218799, upload-time = "2026-05-11T18:32:23.491Z" },
+    { url = "https://files.pythonhosted.org/packages/b2/66/caca04ed7d972fb6eb6dd1ccd6df1de5c38fae8c5b3dc1c4e8e0d85ee6b9/mypy-2.1.0-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:7d5e5cad0efeba72b93cd17490cc0d69c5ac9ca132994fe3fb0314808aeeb83e", size = 15923458, upload-time = "2026-05-11T18:35:28.64Z" },
+    { url = "https://files.pythonhosted.org/packages/ed/52/2d90cbe49d014b13ed7ff337930c30bad35893fe38a1e4641e756bb62191/mypy-2.1.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:ff715050c127d724fd260a2e666e7747fdd83511c0c47d449d98238970aef780", size = 14757697, upload-time = "2026-05-11T18:36:14.208Z" },
+    { url = "https://files.pythonhosted.org/packages/ac/37/d98f4a14e081b238992d0ed96b6d39c7cc0148c9699eb71eaa68629665ea/mypy-2.1.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:82208da9e09414d520e912d3e462d454854bed0810b71540bb016dcbca7308fd", size = 15405638, upload-time = "2026-05-11T18:33:48.249Z" },
+    { url = "https://files.pythonhosted.org/packages/a3/c2/15c46613b24a84fad2aea1248bf9619b99c2767ae9071fe224c179a0b7d4/mypy-2.1.0-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:e79ebc1b904b84f0310dff7469655a9c36c7a68bddb37bdd42b67a332df61d08", size = 16215852, upload-time = "2026-05-11T18:32:50.296Z" },
+    { url = "https://files.pythonhosted.org/packages/5c/90/9c16a57f482c76d25f6379762b56bbf65c711d8158cf271fb2802cfb0640/mypy-2.1.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:e583edc957cfb0deb142079162ae826f58449b116c1d442f2d91c69d9fced081", size = 16452695, upload-time = "2026-05-11T18:33:38.182Z" },
+    { url = "https://files.pythonhosted.org/packages/0f/4c/215a4eeb63cacc5f17f516691ea7285d11e249802b942476bff15922a314/mypy-2.1.0-cp314-cp314t-win_amd64.whl", hash = "sha256:b33b6cd332695bba180d55e717a79d3038e479a2c49cc5eb3d53603409b9a5d7", size = 12866622, upload-time = "2026-05-11T18:34:39.945Z" },
+    { url = "https://files.pythonhosted.org/packages/4b/50/1043e1db5f455ffe4c9ab22747cd8ca2bc492b1e4f4e21b130a44ee2b217/mypy-2.1.0-cp314-cp314t-win_arm64.whl", hash = "sha256:4f910fe825376a7b66ef7ca8c98e5a149e8cd64c19ae71d84047a74ee060d4e6", size = 10610798, upload-time = "2026-05-11T18:36:31.444Z" },
+    { url = "https://files.pythonhosted.org/packages/0d/2a/13ca1f292f6db1b98ff495ef3467736b331621c5917cad984b7043e7348d/mypy-2.1.0-py3-none-any.whl", hash = "sha256:a663814603a5c563fb87a4f96fb473eeb30d1f5a4885afcf44f9db000a366289", size = 2693302, upload-time = "2026-05-11T18:31:29.246Z" },
+]
+
+[[package]]
+name = "mypy-extensions"
+version = "1.1.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/a2/6e/371856a3fb9d31ca8dac321cda606860fa4548858c0cc45d9d1d4ca2628b/mypy_extensions-1.1.0.tar.gz", hash = "sha256:52e68efc3284861e772bbcd66823fde5ae21fd2fdb51c62a211403730b916558", size = 6343, upload-time = "2025-04-22T14:54:24.164Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/79/7b/2c79738432f5c924bef5071f933bcc9efd0473bac3b4aa584a6f7c1c8df8/mypy_extensions-1.1.0-py3-none-any.whl", hash = "sha256:1be4cccdb0f2482337c4743e60421de3a356cd97508abadd57d47403e94f5505", size = 4963, upload-time = "2025-04-22T14:54:22.983Z" },
+]
+
+[[package]]
+name = "pathspec"
+version = "1.1.1"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/5a/82/42f767fc1c1143d6fd36efb827202a2d997a375e160a71eb2888a925aac1/pathspec-1.1.1.tar.gz", hash = "sha256:17db5ecd524104a120e173814c90367a96a98d07c45b2e10c2f3919fff91bf5a", size = 135180, upload-time = "2026-04-27T01:46:08.907Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/f1/d9/7fb5aa316bc299258e68c73ba3bddbc499654a07f151cba08f6153988714/pathspec-1.1.1-py3-none-any.whl", hash = "sha256:a00ce642f577bf7f473932318056212bc4f8bfdf53128c78bbd5af0b9b20b189", size = 57328, upload-time = "2026-04-27T01:46:07.06Z" },
+]
+
+[[package]]
+name = "ruff"
+version = "0.15.14"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/dc/8a/8bce2894573e9dae6ff4d77fe34ad727d79b9e6238ad288c5638990d90f6/ruff-0.15.14.tar.gz", hash = "sha256:48e866b165be4a9bdbf310f7d3c9a07edef2fe8cd63ffeb4e00bb590506ebf9f", size = 4700910, upload-time = "2026-05-21T14:34:55.177Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/b9/c8/74a92c6ff9fcfb4f1f947126d3ebee8389276e161ecc85de5bda7cda51bd/ruff-0.15.14-py3-none-linux_armv6l.whl", hash = "sha256:8dd2db9416e487c8d4b01fa7056bb02c4d05969d4f8d17a08c229c2f4ff3c108", size = 10739177, upload-time = "2026-05-21T14:34:37.332Z" },
+    { url = "https://files.pythonhosted.org/packages/45/91/254a35c20acc38a7223c9d2d594af12e794432464f2cdeb52af1dc4a892d/ruff-0.15.14-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:be4ff55af755bd71a00ab3dc6bd7ffc467bd76e0df6881e286c2e3d23e8fb43b", size = 11144969, upload-time = "2026-05-21T14:34:43.978Z" },
+    { url = "https://files.pythonhosted.org/packages/56/9e/d13e40f83b8d0a94430e6778ce1d94a43b38cf2efe63278bdd2b4c65abbf/ruff-0.15.14-py3-none-macosx_11_0_arm64.whl", hash = "sha256:48d5909d7d06276ce7dde6d32bfa4b0d4cb2651145cd8ee4b440722cbc77832f", size = 10478207, upload-time = "2026-05-21T14:34:48.378Z" },
+    { url = "https://files.pythonhosted.org/packages/8d/f1/b15a7839fa4f332f8acec78e20564f26bb2d866e3d21710b877fd0263000/ruff-0.15.14-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ca8cbfa94c4f90984a67561978602746d4cd27103568f745fa90eee3f0d4107d", size = 10818459, upload-time = "2026-05-21T14:34:22.318Z" },
+    { url = "https://files.pythonhosted.org/packages/45/33/53d651177f84f94b400a0e27f8824eeada3dddc9d5ee8aeb048f4352a520/ruff-0.15.14-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:9a6bbc0333f1ab053423bcbf6226477d266ca7cec7738c4c8e3f55647803f3c4", size = 10541800, upload-time = "2026-05-21T14:34:20.209Z" },
+    { url = "https://files.pythonhosted.org/packages/b8/a6/868f87e0bf9786ed24b5d0d0ad8676b8a94fd1912f42cddf9cfc7857818a/ruff-0.15.14-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8a24a4f7605d7003a6674d4387651effd939dead3fddd0f36561eb77a9a2e542", size = 11342149, upload-time = "2026-05-21T14:34:46.365Z" },
+    { url = "https://files.pythonhosted.org/packages/a7/8b/38cd5c19faffdcc05a408d2b78edccc69492ab9720eadb49ea15ef80d768/ruff-0.15.14-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:049b5326e53ed80978f2fc041a280603f69dd6b0c95464342a2bb4572d9d9e2f", size = 12212563, upload-time = "2026-05-21T14:34:28.579Z" },
+    { url = "https://files.pythonhosted.org/packages/3e/4d/a3c5b874a556d5731e3e657aaf04311bb76f0a5c3ec220ed43051be6b64b/ruff-0.15.14-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d4ed42e6696c8dfa5f06728e6441993901f548eb92d73bc472cb5a38d1395fbf", size = 11493299, upload-time = "2026-05-21T14:34:41.836Z" },
+    { url = "https://files.pythonhosted.org/packages/1e/c0/56472c251d09858a53e51efbd485b09e1995d8731668b76d52e5dd6ee0f1/ruff-0.15.14-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:715c543cf450c4888251f91c52f1942a800541d9bddd7ac060aa4e6b77ae7cba", size = 11455931, upload-time = "2026-05-21T14:34:57.276Z" },
+    { url = "https://files.pythonhosted.org/packages/2c/4a/e2e7b4d8dbf233d4eace59c75bc3435fa6d8bd3bae82d351d4e4300c0fd1/ruff-0.15.14-py3-none-manylinux_2_31_riscv64.whl", hash = "sha256:72ebab6013ec887d439d8b7593737a0a4ffb06d45d209d4e4bf2e92813082d3f", size = 11400794, upload-time = "2026-05-21T14:34:39.773Z" },
+    { url = "https://files.pythonhosted.org/packages/97/c7/83c0539fe34c3e09136204d1e75d6052492364e0b3cb05e9465423f567d7/ruff-0.15.14-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:49072d36abdbe97a8dd7f480afe9c675699c0c495d4c84076e2c1203c4550581", size = 10804759, upload-time = "2026-05-21T14:34:31.045Z" },
+    { url = "https://files.pythonhosted.org/packages/86/a6/18f2bfc095a2ab4a78745644e428205532ce6653a5d0fa8501572891534d/ruff-0.15.14-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:958522aee105068640c2c2ceae08f413ae44d922f52a1374ac13d6a96032fc93", size = 10539517, upload-time = "2026-05-21T14:34:53.064Z" },
+    { url = "https://files.pythonhosted.org/packages/54/3a/5a8b3b69c654d4e4bf1d246ac5b49cbcdac6eaab6905925f8915f31e3b80/ruff-0.15.14-py3-none-musllinux_1_2_i686.whl", hash = "sha256:f3707da619a143a2e8830e2abab8224478d69ace2d28cb6c20543ae97c36bf61", size = 11065169, upload-time = "2026-05-21T14:34:24.484Z" },
+    { url = "https://files.pythonhosted.org/packages/ed/c5/8864e4e7925b836ea354b31d57641ec03830564e281a8b6f061f8c3e0ec1/ruff-0.15.14-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:bb01d645694e3ec0102105d07ef2d53703970407d59c04e59d3ba0b7a1d53553", size = 11560214, upload-time = "2026-05-21T14:34:50.975Z" },
+    { url = "https://files.pythonhosted.org/packages/36/38/012bf76752e1f89ed50b77b99532d90f3a3e287bc7918e1fc0948ac866ac/ruff-0.15.14-py3-none-win32.whl", hash = "sha256:6d0c1ad2a0ab718d39b6d8fd2217981ce4d625cd96a720095f798fb47d8b13e6", size = 10805548, upload-time = "2026-05-21T14:34:33.453Z" },
+    { url = "https://files.pythonhosted.org/packages/d1/b7/4ea2c170f10ad760fff2a5250beb18897719dc8b52b53a24cddbb9dd3f19/ruff-0.15.14-py3-none-win_amd64.whl", hash = "sha256:802342981e056db3851a7836e5b070f8f15f67d4a685ae2a6160939d364b2902", size = 11939523, upload-time = "2026-05-21T14:34:18.077Z" },
+    { url = "https://files.pythonhosted.org/packages/62/d5/bc97ff895ec35cf3925d4bd60f3b39d822f377a446906ec9bcc87405e59b/ruff-0.15.14-py3-none-win_arm64.whl", hash = "sha256:ff47b90a9ef6a40c9e2f3b479c1fb78531adf055b94c1eba0a7ba04b31951826", size = 11208607, upload-time = "2026-05-21T14:34:26.525Z" },
+]
+
+[[package]]
+name = "tomli"
+version = "2.4.1"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/22/de/48c59722572767841493b26183a0d1cc411d54fd759c5607c4590b6563a6/tomli-2.4.1.tar.gz", hash = "sha256:7c7e1a961a0b2f2472c1ac5b69affa0ae1132c39adcb67aba98568702b9cc23f", size = 17543, upload-time = "2026-03-25T20:22:03.828Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/f4/11/db3d5885d8528263d8adc260bb2d28ebf1270b96e98f0e0268d32b8d9900/tomli-2.4.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:f8f0fc26ec2cc2b965b7a3b87cd19c5c6b8c5e5f436b984e85f486d652285c30", size = 154704, upload-time = "2026-03-25T20:21:10.473Z" },
+    { url = "https://files.pythonhosted.org/packages/6d/f7/675db52c7e46064a9aa928885a9b20f4124ecb9bc2e1ce74c9106648d202/tomli-2.4.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:4ab97e64ccda8756376892c53a72bd1f964e519c77236368527f758fbc36a53a", size = 149454, upload-time = "2026-03-25T20:21:12.036Z" },
+    { url = "https://files.pythonhosted.org/packages/61/71/81c50943cf953efa35bce7646caab3cf457a7d8c030b27cfb40d7235f9ee/tomli-2.4.1-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:96481a5786729fd470164b47cdb3e0e58062a496f455ee41b4403be77cb5a076", size = 237561, upload-time = "2026-03-25T20:21:13.098Z" },
+    { url = "https://files.pythonhosted.org/packages/48/c1/f41d9cb618acccca7df82aaf682f9b49013c9397212cb9f53219e3abac37/tomli-2.4.1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5a881ab208c0baf688221f8cecc5401bd291d67e38a1ac884d6736cbcd8247e9", size = 243824, upload-time = "2026-03-25T20:21:14.569Z" },
+    { url = "https://files.pythonhosted.org/packages/22/e4/5a816ecdd1f8ca51fb756ef684b90f2780afc52fc67f987e3c61d800a46d/tomli-2.4.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:47149d5bd38761ac8be13a84864bf0b7b70bc051806bc3669ab1cbc56216b23c", size = 242227, upload-time = "2026-03-25T20:21:15.712Z" },
+    { url = "https://files.pythonhosted.org/packages/6b/49/2b2a0ef529aa6eec245d25f0c703e020a73955ad7edf73e7f54ddc608aa5/tomli-2.4.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:ec9bfaf3ad2df51ace80688143a6a4ebc09a248f6ff781a9945e51937008fcbc", size = 247859, upload-time = "2026-03-25T20:21:17.001Z" },
+    { url = "https://files.pythonhosted.org/packages/83/bd/6c1a630eaca337e1e78c5903104f831bda934c426f9231429396ce3c3467/tomli-2.4.1-cp311-cp311-win32.whl", hash = "sha256:ff2983983d34813c1aeb0fa89091e76c3a22889ee83ab27c5eeb45100560c049", size = 97204, upload-time = "2026-03-25T20:21:18.079Z" },
+    { url = "https://files.pythonhosted.org/packages/42/59/71461df1a885647e10b6bb7802d0b8e66480c61f3f43079e0dcd315b3954/tomli-2.4.1-cp311-cp311-win_amd64.whl", hash = "sha256:5ee18d9ebdb417e384b58fe414e8d6af9f4e7a0ae761519fb50f721de398dd4e", size = 108084, upload-time = "2026-03-25T20:21:18.978Z" },
+    { url = "https://files.pythonhosted.org/packages/b8/83/dceca96142499c069475b790e7913b1044c1a4337e700751f48ed723f883/tomli-2.4.1-cp311-cp311-win_arm64.whl", hash = "sha256:c2541745709bad0264b7d4705ad453b76ccd191e64aa6f0fc66b69a293a45ece", size = 95285, upload-time = "2026-03-25T20:21:20.309Z" },
+    { url = "https://files.pythonhosted.org/packages/c1/ba/42f134a3fe2b370f555f44b1d72feebb94debcab01676bf918d0cb70e9aa/tomli-2.4.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:c742f741d58a28940ce01d58f0ab2ea3ced8b12402f162f4d534dfe18ba1cd6a", size = 155924, upload-time = "2026-03-25T20:21:21.626Z" },
+    { url = "https://files.pythonhosted.org/packages/dc/c7/62d7a17c26487ade21c5422b646110f2162f1fcc95980ef7f63e73c68f14/tomli-2.4.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:7f86fd587c4ed9dd76f318225e7d9b29cfc5a9d43de44e5754db8d1128487085", size = 150018, upload-time = "2026-03-25T20:21:23.002Z" },
+    { url = "https://files.pythonhosted.org/packages/5c/05/79d13d7c15f13bdef410bdd49a6485b1c37d28968314eabee452c22a7fda/tomli-2.4.1-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ff18e6a727ee0ab0388507b89d1bc6a22b138d1e2fa56d1ad494586d61d2eae9", size = 244948, upload-time = "2026-03-25T20:21:24.04Z" },
+    { url = "https://files.pythonhosted.org/packages/10/90/d62ce007a1c80d0b2c93e02cab211224756240884751b94ca72df8a875ca/tomli-2.4.1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:136443dbd7e1dee43c68ac2694fde36b2849865fa258d39bf822c10e8068eac5", size = 253341, upload-time = "2026-03-25T20:21:25.177Z" },
+    { url = "https://files.pythonhosted.org/packages/1a/7e/caf6496d60152ad4ed09282c1885cca4eea150bfd007da84aea07bcc0a3e/tomli-2.4.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:5e262d41726bc187e69af7825504c933b6794dc3fbd5945e41a79bb14c31f585", size = 248159, upload-time = "2026-03-25T20:21:26.364Z" },
+    { url = "https://files.pythonhosted.org/packages/99/e7/c6f69c3120de34bbd882c6fba7975f3d7a746e9218e56ab46a1bc4b42552/tomli-2.4.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:5cb41aa38891e073ee49d55fbc7839cfdb2bc0e600add13874d048c94aadddd1", size = 253290, upload-time = "2026-03-25T20:21:27.46Z" },
+    { url = "https://files.pythonhosted.org/packages/d6/2f/4a3c322f22c5c66c4b836ec58211641a4067364f5dcdd7b974b4c5da300c/tomli-2.4.1-cp312-cp312-win32.whl", hash = "sha256:da25dc3563bff5965356133435b757a795a17b17d01dbc0f42fb32447ddfd917", size = 98141, upload-time = "2026-03-25T20:21:28.492Z" },
+    { url = "https://files.pythonhosted.org/packages/24/22/4daacd05391b92c55759d55eaee21e1dfaea86ce5c571f10083360adf534/tomli-2.4.1-cp312-cp312-win_amd64.whl", hash = "sha256:52c8ef851d9a240f11a88c003eacb03c31fc1c9c4ec64a99a0f922b93874fda9", size = 108847, upload-time = "2026-03-25T20:21:29.386Z" },
+    { url = "https://files.pythonhosted.org/packages/68/fd/70e768887666ddd9e9f5d85129e84910f2db2796f9096aa02b721a53098d/tomli-2.4.1-cp312-cp312-win_arm64.whl", hash = "sha256:f758f1b9299d059cc3f6546ae2af89670cb1c4d48ea29c3cacc4fe7de3058257", size = 95088, upload-time = "2026-03-25T20:21:30.677Z" },
+    { url = "https://files.pythonhosted.org/packages/07/06/b823a7e818c756d9a7123ba2cda7d07bc2dd32835648d1a7b7b7a05d848d/tomli-2.4.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:36d2bd2ad5fb9eaddba5226aa02c8ec3fa4f192631e347b3ed28186d43be6b54", size = 155866, upload-time = "2026-03-25T20:21:31.65Z" },
+    { url = "https://files.pythonhosted.org/packages/14/6f/12645cf7f08e1a20c7eb8c297c6f11d31c1b50f316a7e7e1e1de6e2e7b7e/tomli-2.4.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:eb0dc4e38e6a1fd579e5d50369aa2e10acfc9cace504579b2faabb478e76941a", size = 149887, upload-time = "2026-03-25T20:21:33.028Z" },
+    { url = "https://files.pythonhosted.org/packages/5c/e0/90637574e5e7212c09099c67ad349b04ec4d6020324539297b634a0192b0/tomli-2.4.1-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c7f2c7f2b9ca6bdeef8f0fa897f8e05085923eb091721675170254cbc5b02897", size = 243704, upload-time = "2026-03-25T20:21:34.51Z" },
+    { url = "https://files.pythonhosted.org/packages/10/8f/d3ddb16c5a4befdf31a23307f72828686ab2096f068eaf56631e136c1fdd/tomli-2.4.1-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:f3c6818a1a86dd6dca7ddcaaf76947d5ba31aecc28cb1b67009a5877c9a64f3f", size = 251628, upload-time = "2026-03-25T20:21:36.012Z" },
+    { url = "https://files.pythonhosted.org/packages/e3/f1/dbeeb9116715abee2485bf0a12d07a8f31af94d71608c171c45f64c0469d/tomli-2.4.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:d312ef37c91508b0ab2cee7da26ec0b3ed2f03ce12bd87a588d771ae15dcf82d", size = 247180, upload-time = "2026-03-25T20:21:37.136Z" },
+    { url = "https://files.pythonhosted.org/packages/d3/74/16336ffd19ed4da28a70959f92f506233bd7cfc2332b20bdb01591e8b1d1/tomli-2.4.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:51529d40e3ca50046d7606fa99ce3956a617f9b36380da3b7f0dd3dd28e68cb5", size = 251674, upload-time = "2026-03-25T20:21:38.298Z" },
+    { url = "https://files.pythonhosted.org/packages/16/f9/229fa3434c590ddf6c0aa9af64d3af4b752540686cace29e6281e3458469/tomli-2.4.1-cp313-cp313-win32.whl", hash = "sha256:2190f2e9dd7508d2a90ded5ed369255980a1bcdd58e52f7fe24b8162bf9fedbd", size = 97976, upload-time = "2026-03-25T20:21:39.316Z" },
+    { url = "https://files.pythonhosted.org/packages/6a/1e/71dfd96bcc1c775420cb8befe7a9d35f2e5b1309798f009dca17b7708c1e/tomli-2.4.1-cp313-cp313-win_amd64.whl", hash = "sha256:8d65a2fbf9d2f8352685bc1364177ee3923d6baf5e7f43ea4959d7d8bc326a36", size = 108755, upload-time = "2026-03-25T20:21:40.248Z" },
+    { url = "https://files.pythonhosted.org/packages/83/7a/d34f422a021d62420b78f5c538e5b102f62bea616d1d75a13f0a88acb04a/tomli-2.4.1-cp313-cp313-win_arm64.whl", hash = "sha256:4b605484e43cdc43f0954ddae319fb75f04cc10dd80d830540060ee7cd0243cd", size = 95265, upload-time = "2026-03-25T20:21:41.219Z" },
+    { url = "https://files.pythonhosted.org/packages/3c/fb/9a5c8d27dbab540869f7c1f8eb0abb3244189ce780ba9cd73f3770662072/tomli-2.4.1-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:fd0409a3653af6c147209d267a0e4243f0ae46b011aa978b1080359fddc9b6cf", size = 155726, upload-time = "2026-03-25T20:21:42.23Z" },
+    { url = "https://files.pythonhosted.org/packages/62/05/d2f816630cc771ad836af54f5001f47a6f611d2d39535364f148b6a92d6b/tomli-2.4.1-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:a120733b01c45e9a0c34aeef92bf0cf1d56cfe81ed9d47d562f9ed591a9828ac", size = 149859, upload-time = "2026-03-25T20:21:43.386Z" },
+    { url = "https://files.pythonhosted.org/packages/ce/48/66341bdb858ad9bd0ceab5a86f90eddab127cf8b046418009f2125630ecb/tomli-2.4.1-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:559db847dc486944896521f68d8190be1c9e719fced785720d2216fe7022b662", size = 244713, upload-time = "2026-03-25T20:21:44.474Z" },
+    { url = "https://files.pythonhosted.org/packages/df/6d/c5fad00d82b3c7a3ab6189bd4b10e60466f22cfe8a08a9394185c8a8111c/tomli-2.4.1-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:01f520d4f53ef97964a240a035ec2a869fe1a37dde002b57ebc4417a27ccd853", size = 252084, upload-time = "2026-03-25T20:21:45.62Z" },
+    { url = "https://files.pythonhosted.org/packages/00/71/3a69e86f3eafe8c7a59d008d245888051005bd657760e96d5fbfb0b740c2/tomli-2.4.1-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:7f94b27a62cfad8496c8d2513e1a222dd446f095fca8987fceef261225538a15", size = 247973, upload-time = "2026-03-25T20:21:46.937Z" },
+    { url = "https://files.pythonhosted.org/packages/67/50/361e986652847fec4bd5e4a0208752fbe64689c603c7ae5ea7cb16b1c0ca/tomli-2.4.1-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:ede3e6487c5ef5d28634ba3f31f989030ad6af71edfb0055cbbd14189ff240ba", size = 256223, upload-time = "2026-03-25T20:21:48.467Z" },
+    { url = "https://files.pythonhosted.org/packages/8c/9a/b4173689a9203472e5467217e0154b00e260621caa227b6fa01feab16998/tomli-2.4.1-cp314-cp314-win32.whl", hash = "sha256:3d48a93ee1c9b79c04bb38772ee1b64dcf18ff43085896ea460ca8dec96f35f6", size = 98973, upload-time = "2026-03-25T20:21:49.526Z" },
+    { url = "https://files.pythonhosted.org/packages/14/58/640ac93bf230cd27d002462c9af0d837779f8773bc03dee06b5835208214/tomli-2.4.1-cp314-cp314-win_amd64.whl", hash = "sha256:88dceee75c2c63af144e456745e10101eb67361050196b0b6af5d717254dddf7", size = 109082, upload-time = "2026-03-25T20:21:50.506Z" },
+    { url = "https://files.pythonhosted.org/packages/d5/2f/702d5e05b227401c1068f0d386d79a589bb12bf64c3d2c72ce0631e3bc49/tomli-2.4.1-cp314-cp314-win_arm64.whl", hash = "sha256:b8c198f8c1805dc42708689ed6864951fd2494f924149d3e4bce7710f8eb5232", size = 96490, upload-time = "2026-03-25T20:21:51.474Z" },
+    { url = "https://files.pythonhosted.org/packages/45/4b/b877b05c8ba62927d9865dd980e34a755de541eb65fffba52b4cc495d4d2/tomli-2.4.1-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:d4d8fe59808a54658fcc0160ecfb1b30f9089906c50b23bcb4c69eddc19ec2b4", size = 164263, upload-time = "2026-03-25T20:21:52.543Z" },
+    { url = "https://files.pythonhosted.org/packages/24/79/6ab420d37a270b89f7195dec5448f79400d9e9c1826df982f3f8e97b24fd/tomli-2.4.1-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:7008df2e7655c495dd12d2a4ad038ff878d4ca4b81fccaf82b714e07eae4402c", size = 160736, upload-time = "2026-03-25T20:21:53.674Z" },
+    { url = "https://files.pythonhosted.org/packages/02/e0/3630057d8eb170310785723ed5adcdfb7d50cb7e6455f85ba8a3deed642b/tomli-2.4.1-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1d8591993e228b0c930c4bb0db464bdad97b3289fb981255d6c9a41aedc84b2d", size = 270717, upload-time = "2026-03-25T20:21:55.129Z" },
+    { url = "https://files.pythonhosted.org/packages/7a/b4/1613716072e544d1a7891f548d8f9ec6ce2faf42ca65acae01d76ea06bb0/tomli-2.4.1-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:734e20b57ba95624ecf1841e72b53f6e186355e216e5412de414e3c51e5e3c41", size = 278461, upload-time = "2026-03-25T20:21:56.228Z" },
+    { url = "https://files.pythonhosted.org/packages/05/38/30f541baf6a3f6df77b3df16b01ba319221389e2da59427e221ef417ac0c/tomli-2.4.1-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:8a650c2dbafa08d42e51ba0b62740dae4ecb9338eefa093aa5c78ceb546fcd5c", size = 274855, upload-time = "2026-03-25T20:21:57.653Z" },
+    { url = "https://files.pythonhosted.org/packages/77/a3/ec9dd4fd2c38e98de34223b995a3b34813e6bdadf86c75314c928350ed14/tomli-2.4.1-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:504aa796fe0569bb43171066009ead363de03675276d2d121ac1a4572397870f", size = 283144, upload-time = "2026-03-25T20:21:59.089Z" },
+    { url = "https://files.pythonhosted.org/packages/ef/be/605a6261cac79fba2ec0c9827e986e00323a1945700969b8ee0b30d85453/tomli-2.4.1-cp314-cp314t-win32.whl", hash = "sha256:b1d22e6e9387bf4739fbe23bfa80e93f6b0373a7f1b96c6227c32bef95a4d7a8", size = 108683, upload-time = "2026-03-25T20:22:00.214Z" },
+    { url = "https://files.pythonhosted.org/packages/12/64/da524626d3b9cc40c168a13da8335fe1c51be12c0a63685cc6db7308daae/tomli-2.4.1-cp314-cp314t-win_amd64.whl", hash = "sha256:2c1c351919aca02858f740c6d33adea0c5deea37f9ecca1cc1ef9e884a619d26", size = 121196, upload-time = "2026-03-25T20:22:01.169Z" },
+    { url = "https://files.pythonhosted.org/packages/5a/cd/e80b62269fc78fc36c9af5a6b89c835baa8af28ff5ad28c7028d60860320/tomli-2.4.1-cp314-cp314t-win_arm64.whl", hash = "sha256:eab21f45c7f66c13f2a9e0e1535309cee140182a9cdae1e041d02e47291e8396", size = 100393, upload-time = "2026-03-25T20:22:02.137Z" },
+    { url = "https://files.pythonhosted.org/packages/7b/61/cceae43728b7de99d9b847560c262873a1f6c98202171fd5ed62640b494b/tomli-2.4.1-py3-none-any.whl", hash = "sha256:0d85819802132122da43cb86656f8d1f8c6587d54ae7dcaf30e90533028b49fe", size = 14583, upload-time = "2026-03-25T20:22:03.012Z" },
+]
+
+[[package]]
+name = "typing-extensions"
+version = "4.15.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/72/94/1a15dd82efb362ac84269196e94cf00f187f7ed21c242792a923cdb1c61f/typing_extensions-4.15.0.tar.gz", hash = "sha256:0cea48d173cc12fa28ecabc3b837ea3cf6f38c6d1136f85cbaaf598984861466", size = 109391, upload-time = "2025-08-25T13:49:26.313Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/18/67/36e9267722cc04a6b9f15c7f3441c2363321a3ea07da7ae0c0707beb2a9c/typing_extensions-4.15.0-py3-none-any.whl", hash = "sha256:f0fa19c6845758ab08074a0cfa8b7aecb71c999ca73d62883bc25cc018c4e548", size = 44614, upload-time = "2025-08-25T13:49:24.86Z" },
+]
--- a/crates/kreuzberg-tesseract/.commitlintrc.json
+++ b/crates/kreuzberg-tesseract/.commitlintrc.json
@@ -0,0 +1,13 @@
+{
+  "extends": ["@commitlint/config-conventional"],
+  "rules": {
+    "body-max-line-length": [2, "always", 100],
+    "header-max-length": [2, "always", 100],
+    "subject-case": [2, "never", ["sentence-case", "start-case", "pascal-case", "upper-case"]],
+    "type-enum": [
+      2,
+      "always",
+      ["feat", "fix", "docs", "style", "refactor", "perf", "test", "build", "ci", "chore", "revert"]
+    ]
+  }
+}
--- a/crates/kreuzberg-tesseract/.crate-ignore
+++ b/crates/kreuzberg-tesseract/.crate-ignore
@@ -0,0 +1,2 @@
+/third_party/
+/tessdata/
--- a/crates/kreuzberg-tesseract/Cargo.lock
+++ b/crates/kreuzberg-tesseract/Cargo.lock
--- a/crates/kreuzberg-tesseract/Cargo.toml
+++ b/crates/kreuzberg-tesseract/Cargo.toml
@@ -0,0 +1,64 @@
+[package]
+name = "kreuzberg-tesseract"
+version.workspace = true
+edition.workspace = true
+rust-version.workspace = true
+authors.workspace = true
+description = "Rust bindings for Tesseract OCR with cross-compilation, C++17, and caching improvements"
+license = "MIT"
+repository.workspace = true
+homepage = "https://kreuzberg.dev"
+documentation = "https://docs.kreuzberg.dev"
+readme = "README.md"
+keywords = ["tesseract", "ocr", "bindings", "vision", "recognition"]
+categories = ["external-ffi-bindings", "computer-vision", "text-processing"]
+build = "build.rs"
+links = "kreuzberg_tesseract"
+exclude = ["tessdata/*", "third_party/*"]
+
+[package.metadata.docs.rs]
+features = ["docs-only"]
+rustdoc-args = ["--cfg", "docsrs"]
+
+[package.metadata.cargo-machete]
+ignored = ["cc", "cmake", "reqwest", "zip"]
+
+[lib]
+name = "kreuzberg_tesseract"
+crate-type = ["lib"]
+
+[features]
+default = ["static-linking"]
+build-tesseract = ["cc", "cmake", "reqwest", "zip"]
+build-tesseract-wasm = ["cmake", "reqwest", "zip"]
+# Bundle eng.traineddata into the compiled crate so WASM builds can run OCR
+# without runtime tessdata loading. Uses ~4 MB of binary size (tessdata_fast).
+bundle-tessdata-eng = []
+static-linking = ["build-tesseract"]
+dynamic-linking = []
+
+[dependencies]
+thiserror = { workspace = true }
+
+[build-dependencies]
+cc = { version = "^1.2.63", optional = true }
+cmake = { version = "0.1.58", optional = true }
+zip = { version = ">=7.0.0", optional = true, default-features = false, features = [
+    "deflate-flate2-zlib-rs",
+] }
+
+[target.'cfg(not(target_os = "windows"))'.build-dependencies]
+reqwest = { workspace = true, default-features = false, features = [
+    "blocking",
+    "rustls",
+], optional = true }
+
+# Use native-tls on Windows to avoid aws-lc-sys CMake build issues with MinGW
+[target.'cfg(target_os = "windows")'.build-dependencies]
+reqwest = { workspace = true, default-features = false, features = [
+    "blocking",
+    "native-tls",
+], optional = true }
+
+[dev-dependencies]
+image = { workspace = true, features = ["png"] }
--- a/crates/kreuzberg-tesseract/LICENSE
+++ b/crates/kreuzberg-tesseract/LICENSE
@@ -0,0 +1,22 @@
+MIT License
+
+Copyright (c) 2024 Cafer Can Gündoğdu
+Copyright (c) 2025 Na'aman Hirschfeld
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
--- a/crates/kreuzberg-tesseract/README.md
+++ b/crates/kreuzberg-tesseract/README.md
@@ -0,0 +1,405 @@
+# kreuzberg-tesseract
+
+[![Bindings](https://img.shields.io/badge/Bindings-alef%20%D7%90-007ec6)](https://github.com/kreuzberg-dev/alef)
+
+Rust bindings for Tesseract OCR with built-in compilation of Tesseract and Leptonica libraries. Provides a safe and idiomatic Rust interface to Tesseract's functionality while handling the complexity of compiling the underlying C++ libraries.
+
+Based on the original [tesseract-rs](https://github.com/cafercangundogdu/tesseract-rs) by Cafer Can Gündoğdu, this maintained version adds critical improvements for production use:
+
+- **C++17 Support**: Upgraded for Tesseract 5.5.1, which requires the C++17 `std::filesystem` library
+- **Cross-Compilation**: Fixed CXX compiler detection for cross-platform builds
+- **Architecture Validation**: Validates target architecture before using cached libraries
+- **Windows Static Linking**: Fixed MSVC static linking issues
+- **Build Caching**: Improved caching with OUT_DIR-based cache directory
+- **MinGW Support**: Added support for MinGW toolchains
+
+## Features
+
+- Safe Rust bindings for Tesseract OCR
+- **Multiple linking options:**
+  - **Static linking** (default): Built-in compilation with no runtime dependencies
+  - **Dynamic linking**: Link to system-installed libraries for faster builds
+- Uses existing Tesseract training data (expects English data for tests)
+- High-level Rust API for common OCR tasks
+- Caching of compiled libraries for faster subsequent builds
+- Support for multiple operating systems (Linux, macOS, Windows)
+
+## Installation
+
+### Static Linking (Default)
+
+Static linking builds Tesseract and Leptonica from source and embeds them in your binary. No runtime dependencies required:
+
+```toml
+[dependencies]
+kreuzberg-tesseract = "1.0.0-rc.1"
+# or explicitly:
+kreuzberg-tesseract = { version = "1.0.0-rc.1", features = ["static-linking"] }
+```
+
+### Dynamic Linking
+
+Dynamic linking uses system-installed Tesseract and Leptonica libraries. Faster builds, but requires libraries installed on the system:
+
+```toml
+[dependencies]
+kreuzberg-tesseract = { version = "1.0.0-rc.1", features = ["dynamic-linking"], default-features = false }
+```
+
+**System requirements for dynamic linking:**
+
+- Tesseract 5.x libraries installed (`libtesseract`, `libleptonica`)
+- macOS: `brew install tesseract leptonica`
+- Ubuntu/Debian: `sudo apt-get install libtesseract-dev libleptonica-dev`
+- RHEL/CentOS/Fedora: `sudo dnf install tesseract-devel leptonica-devel`
+- Windows: Install from [Tesseract releases](https://github.com/tesseract-ocr/tesseract/releases) or vcpkg
+
+### Development Dependencies
+
+For development and testing, you'll also need these dependencies:
+
+```toml
+[dev-dependencies]
+image = "0.25.5"
+```
+
+## System Requirements
+
+### For Static Linking (Default)
+
+When building with static linking, the crate will compile Tesseract and Leptonica from source. You need:
+
+- Rust 1.85.0 or later
+- A C++ compiler (e.g., gcc, clang, MSVC on Windows)
+- CMake 3.x or later
+- Internet connection (for downloading Tesseract source code)
+
+### For Dynamic Linking
+
+When using dynamic linking with system-installed libraries, you need:
+
+- Rust 1.85.0 or later
+- Tesseract 5.x and Leptonica libraries installed on your system (see Installation section)
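+- Default features disabled (`default-features = false` / `--no-default-features`), so the Tesseract source download and compilation performed by `static-linking` are skipped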
+
+No C++ compiler or CMake required for dynamic linking builds.
+
+For a full development environment checklist (including optional tooling suggestions), see [CONTRIBUTING.md](../../CONTRIBUTING.md).
+
+## Environment Variables
+
+The following environment variables affect the build and test process:
+
+### Build Variables
+
+- `CARGO_CLEAN`: If set, cleans the cache directory before building
+- `RUSTC_WRAPPER`: If set to "sccache", enables compiler caching with sccache
+- `CC`: Compiler selection for C code (affects Linux builds)
+- `HOME` (Unix) or `APPDATA` (Windows): Used to determine cache directory location
+- `TESSERACT_RS_CACHE_DIR`: Optional override for the cache root. When unset or not writable, the build falls back to the default OS-specific directory, and if that still fails, a temporary directory under the system temp folder is used automatically.
+
+### Test Variables
+
+- `TESSDATA_PREFIX` (Optional): Path to override the default tessdata directory. If not set, the crate will use its default cache directory.
+
+## Cache and Data Directories
+
+The crate uses the following directory structure based on your operating system:
+
+- macOS: `~/Library/Application Support/tesseract-rs`
+- Linux: `~/.tesseract-rs`
+- Windows: `%APPDATA%/tesseract-rs`
+
+The cache includes:
+
+- Compiled Tesseract and Leptonica libraries
+- Third-party source code
+
+Training data is not downloaded during the build. Provide `eng.traineddata` (and any other languages you need) via `TESSDATA_PREFIX` or your system Tesseract installation.
+
+## Testing
+
+The project includes several integration tests that verify OCR functionality. To run the tests:
+
+1. Ensure you have the required test dependencies:
+
+   ```toml
+   [dev-dependencies]
+   image = "0.25.9"
+   ```
+
+2. Run the tests:
+
+   ```bash
+   cargo test
+   ```
+
+Note: Make sure `eng.traineddata` is available in your tessdata directory before running tests. If `TESSDATA_PREFIX` is not set, the tests look in the default cache location. You can point the tests at a custom tessdata directory by setting:
+
+```bash
+# Linux/macOS
+export TESSDATA_PREFIX=/path/to/custom/tessdata
+
+# Windows (PowerShell)
+$env:TESSDATA_PREFIX="C:\path\to\custom\tessdata"
+```
+
+Available test cases:
+
+- OCR on English sample images
+- Error handling and invalid input coverage
+
+Test images are sourced from the shared `test_documents/` directory in the repository:
+
+- `images/test_hello_world.png`: Simple English text
+- `tables/simple_table.png`: Basic table with English headers
+
+## Usage
+
+Here's a basic example of how to use `kreuzberg-tesseract`:
+
+```rust
+use std::path::PathBuf;
+use std::error::Error;
+use kreuzberg_tesseract::TesseractAPI;
+
+fn get_default_tessdata_dir() -> PathBuf {
+    if cfg!(target_os = "macos") {
+        let home_dir = std::env::var("HOME").expect("HOME environment variable not set");
+        PathBuf::from(home_dir)
+            .join("Library")
+            .join("Application Support")
+            .join("tesseract-rs")
+            .join("tessdata")
+    } else if cfg!(target_os = "linux") {
+        let home_dir = std::env::var("HOME").expect("HOME environment variable not set");
+        PathBuf::from(home_dir)
+            .join(".tesseract-rs")
+            .join("tessdata")
+    } else if cfg!(target_os = "windows") {
+        PathBuf::from(std::env::var("APPDATA").expect("APPDATA environment variable not set"))
+            .join("tesseract-rs")
+            .join("tessdata")
+    } else {
+        panic!("Unsupported operating system");
+    }
+}
+
+fn get_tessdata_dir() -> PathBuf {
+    match std::env::var("TESSDATA_PREFIX") {
+        Ok(dir) => {
+            let path = PathBuf::from(dir);
+            println!("Using TESSDATA_PREFIX directory: {:?}", path);
+            path
+        }
+        Err(_) => {
+            let default_dir = get_default_tessdata_dir();
+            println!(
+                "TESSDATA_PREFIX not set, using default directory: {:?}",
+                default_dir
+            );
+            default_dir
+        }
+    }
+}
+
+fn main() -> Result<(), Box<dyn Error>> {
+    let api = TesseractAPI::new()?;
+
+    // Get tessdata directory (uses default location or TESSDATA_PREFIX if set)
+    let tessdata_dir = get_tessdata_dir();
+    api.init(tessdata_dir.to_str().unwrap(), "eng")?;
+
+    let width = 24;
+    let height = 24;
+    let bytes_per_pixel = 1;
+    let bytes_per_line = width * bytes_per_pixel;
+
+    // Initialize image data with all white pixels
+    let mut image_data = vec![255u8; width * height];
+
+    // Draw number 9 with clearer distinction
+    for y in 4..19 {
+        for x in 7..17 {
+            // Top bar
+            if y == 4 && x >= 8 && x <= 15 {
+                image_data[y * width + x] = 0;
+            }
+            // Top curve left side
+            if y >= 4 && y <= 10 && x == 7 {
+                image_data[y * width + x] = 0;
+            }
+            // Top curve right side
+            if y >= 4 && y <= 11 && x == 16 {
+                image_data[y * width + x] = 0;
+            }
+            // Middle bar
+            if y == 11 && x >= 8 && x <= 15 {
+                image_data[y * width + x] = 0;
+            }
+            // Bottom right vertical line
+            if y >= 11 && y <= 18 && x == 16 {
+                image_data[y * width + x] = 0;
+            }
+            // Bottom bar
+            if y == 18 && x >= 8 && x <= 15 {
+                image_data[y * width + x] = 0;
+            }
+        }
+    }
+
+    // Set the image data
+    api.set_image(
+        &image_data,
+        width.try_into().unwrap(),
+        height.try_into().unwrap(),
+        bytes_per_pixel.try_into().unwrap(),
+        bytes_per_line.try_into().unwrap(),
+    )?;
+
+    // Set whitelist for digits only
+    api.set_variable("tessedit_char_whitelist", "0123456789")?;
+
+    // Set PSM mode to single character
+    api.set_variable("tessedit_pageseg_mode", "10")?;
+
+    // Get the recognized text
+    let text = api.get_utf8_text()?;
+    println!("Recognized text: {}", text.trim());
+
+    Ok(())
+}
+```
+
+## Advanced Usage
+
+The API provides additional functionality for more complex OCR tasks, including thread-safe operations:
+
+```rust
+use kreuzberg_tesseract::TesseractAPI;
+use std::sync::Arc;
+use std::thread;
+use std::error::Error;
+
+fn main() -> Result<(), Box<dyn Error>> {
+    let tessdata_dir = get_tessdata_dir();
+    let api = TesseractAPI::new()?;
+
+    // Initialize the main API
+    api.init(tessdata_dir.to_str().unwrap(), "eng")?;
+    api.set_variable("tessedit_pageseg_mode", "1")?;
+
+    // Load and prepare image data
+    let (image_data, width, height) = load_test_image("sample_text.png")?;
+
+    // Share image data across threads
+    let image_data = Arc::new(image_data);
+    let mut handles = vec![];
+
+    // Spawn multiple threads for parallel OCR processing
+    for _ in 0..3 {
+        let api_clone = api.clone(); // Clones the API with all configurations
+        let image_data = Arc::clone(&image_data);
+
+        let handle = thread::spawn(move || {
+            // Set image in each thread
+            let res = api_clone.set_image(
+                &image_data,
+                width as i32,
+                height as i32,
+                3,
+                3 * width as i32,
+            );
+            assert!(res.is_ok());
+
+            // Perform OCR in parallel
+            let text = api_clone.get_utf8_text()
+                .expect("Failed to get text");
+            println!("Thread result: {}", text);
+        });
+        handles.push(handle);
+    }
+
+    // Wait for all threads to complete
+    for handle in handles {
+        handle.join().unwrap();
+    }
+
+    Ok(())
+}
+
+// Helper function to get tessdata directory
+fn get_tessdata_dir() -> PathBuf {
+    // ... (implementation as shown in basic example)
+}
+
+// Helper function to load test image
+fn load_test_image(filename: &str) -> Result<(Vec<u8>, u32, u32), Box<dyn Error>> {
+    let img = image::open(filename)?
+        .to_rgb8();
+    let (width, height) = img.dimensions();
+    Ok((img.into_raw(), width, height))
+}
+```
+
+## Building
+
+### Static Linking (Default)
+
+With static linking, the crate will automatically download and compile Tesseract and Leptonica during the build process. This may take some time on the first build (5-10 minutes), but subsequent builds will use the cached libraries.
+
+To clean the cache and force a rebuild:
+
+```bash
+CARGO_CLEAN=1 cargo build
+```
+
+### Dynamic Linking
+
+With dynamic linking, the build is much faster (seconds instead of minutes) since it only links against system-installed libraries:
+
+```bash
+cargo build --no-default-features --features dynamic-linking
+```
+
+**Note**: Dynamic linking requires Tesseract and Leptonica to be installed on your system (see Installation section).
+
+## Documentation
+
+For more detailed information, please check the [API documentation](https://docs.rs/kreuzberg-tesseract).
+
+## License
+
+This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
+
+## Acknowledgements
+
+This project is based on the original [tesseract-rs](https://github.com/cafercangundogdu/tesseract-rs) by [Cafer Can Gündoğdu](https://github.com/cafercangundogdu). We are grateful for the foundational work that made this project possible.
+
+## Contributing
+
+We welcome contributions! Please see our [Contributing Guide](../../CONTRIBUTING.md) for details.
+
+### Quick Start for Contributors
+
+1. Fork and clone the repository
+2. Install uv and set up git hooks:
+
+   ```bash
+   curl -LsSf https://astral.sh/uv/install.sh | sh
+   uvx prek install
+   ```
+
+3. Make your changes following our commit message format
+4. Run tests: `cargo test`
+5. Submit a Pull Request
+
+Our commit messages follow the [Conventional Commits](https://www.conventionalcommits.org/) specification.
+
+## Third-Party Acknowledgements
+
+This project uses [Tesseract OCR](https://github.com/tesseract-ocr/tesseract) and [Leptonica](http://leptonica.org/). We are grateful to the maintainers and contributors of these projects.
+
+```text
+
+```
--- a/crates/kreuzberg-tesseract/build.rs
+++ b/crates/kreuzberg-tesseract/build.rs
--- a/crates/kreuzberg-tesseract/patches/README.md
+++ b/crates/kreuzberg-tesseract/patches/README.md
@@ -0,0 +1,74 @@
+# Tesseract WASM Patches
+
+This directory contains patches needed to compile Tesseract for WebAssembly (WASM) targets using WASI SDK.
+
+These patches are vendored from the [tesseract-wasm](https://github.com/naptha/tesseract.js) project and have been proven to work with WASM compilation.
+
+## Patches
+
+### tesseract.diff
+
+A comprehensive patch that makes Tesseract compatible with WASM compilation. The patch includes the following changes:
+
+#### 1. CMakeLists.txt Modifications
+
+- **New CMake option**: `BUILD_TESSERACT_BINARY` (default: ON)
+  - Allows disabling the Tesseract CLI binary build, which is not needed for WASM
+  - Wraps all executable and installation targets for the tesseract binary
+
+- **Disabled components for WASM**:
+  - Removes OpenCL support (`src/opencl/*.cpp`) - not applicable to WASM
+  - Removes viewer support (`src/viewer/*.cpp`) - UI components not needed for WASM
+  - Removes C API bindings (`src/api/capi.cpp`) - only hocrrenderer is kept
+  - Removes PDF and rendering support files:
+    - `src/api/renderer.cpp`
+    - `src/api/altorenderer.cpp`
+    - `src/api/lstmboxrenderer.cpp`
+    - `src/api/pdfrenderer.cpp`
+    - `src/api/wordstrboxrenderer.cpp`
+
+#### 2. SIMD Detection Fixes (src/arch/simddetect.cpp)
+
+- Guards CPUID detection with `#if !defined(__wasm__)`
+- Prevents attempts to use CPU feature detection, which does not exist in WASM
+- The HAS_CPUID macro is only defined for non-WASM builds
+- This allows the code to gracefully handle WASM's SIMD limitations
+
+#### 3. Pointer Type Fixes (src/ccmain/pageiterator.cpp, src/ccmain/pagesegmain.cpp, src/ccmain/tesseractclass.cpp)
+
+**Changed from stack allocation to heap allocation** in `tesseractclass.h`:
+
+- `pixa_debug_` changed from `DebugPixa` to `std::unique_ptr<DebugPixa>`
+- This prevents large allocations on the stack, which is limited in WASM
+
+**Updated all references** throughout the codebase:
+
+- `.get()` calls added where raw pointers are needed
+- Arrow operator `->` replaces dot operator `.` for member access
+- Null checks added before dereferencing to prevent crashes
+
+**Affected functions**:
+
+- `PageIterator::Orientation()` - added null vector check
+- `Tesseract::AutoPageSeg()` - updated pointer passing
+- `Tesseract::SetupPageSegAndDetectOrientation()` - multiple pointer updates
+- `Tesseract::Clear()` - added null check before WritePDF
+- `Tesseract::PrepareForPageseg()` - updated Split() calls
+- `Tesseract::PrepareForTessOCR()` - updated Split() calls
+
+#### 4. Additional Fixes
+
+- **Orientation detection**: Changed comparison from `> 0.0F` to `>= 0.0F` in `pageiterator.cpp` to handle null vectors gracefully when orientation info is not available
+
+## How to Apply
+
+These patches are applied during the WASM build process. They modify the Tesseract source code to:
+
+1. Disable WASM-incompatible features (OpenCL, viewers, renderers)
+2. Prevent CPUID detection in WASM environment
+3. Use heap allocation instead of stack allocation for large objects
+4. Handle missing pointer initialization gracefully
+
+## Source
+
+These patches are based on the proven WASM compilation approach used by the tesseract.js project, which successfully compiles Tesseract to WebAssembly and deploys it in production environments.
--- a/crates/kreuzberg-tesseract/patches/tesseract.diff
+++ b/crates/kreuzberg-tesseract/patches/tesseract.diff
@@ -0,0 +1,199 @@
+diff --git a/CMakeLists.txt b/CMakeLists.txt
+index 8c6845cb..fdcfc4a8 100644
+--- a/CMakeLists.txt
+++ b/CMakeLists.txt
+@@ -90,6 +90,7 @@ option(ENABLE_LTO "Enable link-time optimization" OFF)
+ option(FAST_FLOAT "Enable float for LSTM" ON)
+ option(ENABLE_OPENCL "Enable unsupported experimental OpenCL support" OFF)
+ option(BUILD_TRAINING_TOOLS "Build training tools" ON)
+option(BUILD_TESSERACT_BINARY "Build Tesseract binary" ON)
+ option(BUILD_TESTS "Build tests" OFF)
+ option(USE_SYSTEM_ICU "Use system ICU" OFF)
+ option(DISABLE_ARCHIVE "Disable build with libarchive (if available)" OFF)
+@@ -565,9 +566,7 @@ file(
+   src/cutil/*.cpp
+   src/dict/*.cpp
+   src/lstm/*.cpp
+-  src/opencl/*.cpp
+   src/textord/*.cpp
+-  src/viewer/*.cpp
+   src/wordrec/*.cpp)
+
+ if(DISABLED_LEGACY_ENGINE)
+@@ -714,13 +713,7 @@ file(
+ set(TESSERACT_SRC
+     ${TESSERACT_SRC}
+     src/api/baseapi.cpp
+-    src/api/capi.cpp
+-    src/api/renderer.cpp
+-    src/api/altorenderer.cpp
+-    src/api/hocrrenderer.cpp
+-    src/api/lstmboxrenderer.cpp
+-    src/api/pdfrenderer.cpp
+-    src/api/wordstrboxrenderer.cpp)
+    src/api/hocrrenderer.cpp)
+
+ set(TESSERACT_CONFIGS
+   tessdata/configs/alto
+@@ -858,14 +851,16 @@ endif()
+ # EXECUTABLE tesseract
+ # ##############################################################################
+
+-add_executable(tesseract src/tesseract.cpp)
+-target_link_libraries(tesseract libtesseract)
+-if(HAVE_TIFFIO_H AND WIN32)
+-  target_link_libraries(tesseract ${TIFF_LIBRARIES})
+-endif()
+if(BUILD_TESSERACT_BINARY)
+  add_executable(tesseract src/tesseract.cpp)
+  target_link_libraries(tesseract libtesseract)
+  if(HAVE_TIFFIO_H AND WIN32)
+    target_link_libraries(tesseract ${TIFF_LIBRARIES})
+  endif()
+
+-if(OPENMP_BUILD AND UNIX)
+-  target_link_libraries(tesseract pthread)
+  if(OPENMP_BUILD AND UNIX)
+    target_link_libraries(tesseract pthread)
+  endif()
+ endif()
+
+ # ##############################################################################
+@@ -899,7 +894,11 @@ write_basic_package_version_file(
+
+ install(FILES ${CMAKE_CURRENT_BINARY_DIR}/tesseract.pc
+         DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig)
+-install(TARGETS tesseract DESTINATION bin)
+
+if(BUILD_TESSERACT_BINARY)
+  install(TARGETS tesseract DESTINATION bin)
+endif()
+
+ install(
+   TARGETS libtesseract
+   EXPORT TesseractTargets
+diff --git a/src/arch/simddetect.cpp b/src/arch/simddetect.cpp
+index 1afe5a5d..cb8c6d4c 100644
+--- a/src/arch/simddetect.cpp
+++ b/src/arch/simddetect.cpp
+@@ -40,10 +40,12 @@
+
+ #endif
+
+#if !defined(__wasm__)
+ #if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_FMA) || defined(HAVE_SSE4_1)
+ // See https://en.wikipedia.org/wiki/CPUID.
+ #  define HAS_CPUID
+ #endif
+#endif
+
+ #if defined(HAS_CPUID)
+ #  if defined(__GNUC__)
+diff --git a/src/ccmain/pageiterator.cpp b/src/ccmain/pageiterator.cpp
+index 64ff7f66..c0f80e5f 100644
+--- a/src/ccmain/pageiterator.cpp
+++ b/src/ccmain/pageiterator.cpp
+@@ -582,7 +582,9 @@ void PageIterator::Orientation(tesseract::Orientation *orientation,
+   up_in_image.rotate(block->re_rotation());
+
+   if (up_in_image.x() == 0.0F) {
+-    if (up_in_image.y() > 0.0F) {
+    // tesseract-wasm note: `up_in_image` will be a null vector if orientation
+    // info is not available. In that case, assume page up.
+    if (up_in_image.y() >= 0.0F) {
+       *orientation = ORIENTATION_PAGE_UP;
+     } else {
+       *orientation = ORIENTATION_PAGE_DOWN;
+diff --git a/src/ccmain/pagesegmain.cpp b/src/ccmain/pagesegmain.cpp
+index 0af44607..718e73ef 100644
+--- a/src/ccmain/pagesegmain.cpp
+++ b/src/ccmain/pagesegmain.cpp
+@@ -222,7 +222,7 @@ int Tesseract::AutoPageSeg(PageSegMode pageseg_mode, BLOCK_LIST *blocks, TO_BLOC
+     }
+ #endif // ndef DISABLED_LEGACY_ENGINE
+     result = finder->FindBlocks(pageseg_mode, scaled_color_, scaled_factor_, to_block,
+-                                photomask_pix, pix_thresholds_, pix_grey_, &pixa_debug_,
+                                photomask_pix, pix_thresholds_, pix_grey_, pixa_debug_.get(),
+                                 &found_blocks, diacritic_blobs, to_blocks);
+     if (result >= 0) {
+       finder->GetDeskewVectors(&deskew_, &reskew_);
+@@ -279,17 +279,17 @@ ColumnFinder *Tesseract::SetupPageSegAndDetectOrientation(PageSegMode pageseg_mo
+   ICOORD bleft(0, 0);
+
+   ASSERT_HOST(pix_binary_ != nullptr);
+-  if (tessedit_dump_pageseg_images) {
+-    pixa_debug_.AddPix(pix_binary_, "PageSegInput");
+  if (tessedit_dump_pageseg_images && pixa_debug_) {
+    pixa_debug_->AddPix(pix_binary_, "PageSegInput");
+   }
+   // Leptonica is used to find the rule/separator lines in the input.
+   LineFinder::FindAndRemoveLines(source_resolution_, textord_tabfind_show_vlines, pix_binary_,
+                                  &vertical_x, &vertical_y, music_mask_pix, &v_lines, &h_lines);
+-  if (tessedit_dump_pageseg_images) {
+-    pixa_debug_.AddPix(pix_binary_, "NoLines");
+  if (tessedit_dump_pageseg_images && pixa_debug_) {
+    pixa_debug_->AddPix(pix_binary_, "NoLines");
+   }
+   // Leptonica is used to find a mask of the photo regions in the input.
+-  *photo_mask_pix = ImageFind::FindImages(pix_binary_, &pixa_debug_);
+  *photo_mask_pix = ImageFind::FindImages(pix_binary_, pixa_debug_.get());
+   if (tessedit_dump_pageseg_images) {
+     Image pix_no_image_ = nullptr;
+     if (*photo_mask_pix != nullptr) {
+@@ -297,7 +297,7 @@ ColumnFinder *Tesseract::SetupPageSegAndDetectOrientation(PageSegMode pageseg_mo
+     } else {
+       pix_no_image_ = pix_binary_.clone();
+     }
+-    pixa_debug_.AddPix(pix_no_image_, "NoImages");
+    if (pixa_debug_) { pixa_debug_->AddPix(pix_no_image_, "NoImages"); }
+     pix_no_image_.destroy();
+   }
+   if (!PSM_COL_FIND_ENABLED(pageseg_mode)) {
+diff --git a/src/ccmain/tesseractclass.cpp b/src/ccmain/tesseractclass.cpp
+index fd58ac87..517f925e 100644
+--- a/src/ccmain/tesseractclass.cpp
+++ b/src/ccmain/tesseractclass.cpp
+@@ -487,8 +487,10 @@ Dict &Tesseract::getDict() {
+ }
+
+ void Tesseract::Clear() {
+-  std::string debug_name = imagebasename + "_debug.pdf";
+-  pixa_debug_.WritePDF(debug_name.c_str());
+  if (pixa_debug_) {
+    std::string debug_name = imagebasename + "_debug.pdf";
+    pixa_debug_->WritePDF(debug_name.c_str());
+  }
+   pix_binary_.destroy();
+   pix_grey_.destroy();
+   pix_thresholds_.destroy();
+@@ -572,7 +574,7 @@ void Tesseract::PrepareForPageseg() {
+   // the newly split image.
+   splitter_.set_orig_pix(pix_binary());
+   splitter_.set_pageseg_split_strategy(max_pageseg_strategy);
+-  if (splitter_.Split(true, &pixa_debug_)) {
+  if (splitter_.Split(true, pixa_debug_.get())) {
+     ASSERT_HOST(splitter_.splitted_image());
+     pix_binary_.destroy();
+     pix_binary_ = splitter_.splitted_image().clone();
+@@ -599,7 +601,7 @@ void Tesseract::PrepareForTessOCR(BLOCK_LIST *block_list, Tesseract *osd_tess, O
+   splitter_.set_segmentation_block_list(block_list);
+   splitter_.set_ocr_split_strategy(max_ocr_strategy);
+   // Run the splitter for OCR
+-  bool split_for_ocr = splitter_.Split(false, &pixa_debug_);
+  bool split_for_ocr = splitter_.Split(false, pixa_debug_.get());
+   // Restore pix_binary to the binarized original pix for future reference.
+   ASSERT_HOST(splitter_.orig_pix());
+   pix_binary_.destroy();
+diff --git a/src/ccmain/tesseractclass.h b/src/ccmain/tesseractclass.h
+index 732bb9e6..030aa5bc 100644
+--- a/src/ccmain/tesseractclass.h
+++ b/src/ccmain/tesseractclass.h
+@@ -986,7 +986,7 @@ private:
+   // Thresholds that were used to generate the thresholded image from grey.
+   Image pix_thresholds_;
+   // Debug images. If non-empty, will be written on destruction.
+-  DebugPixa pixa_debug_;
+  std::unique_ptr<DebugPixa> pixa_debug_;
+   // Input image resolution after any scaling. The resolution is not well
+   // transmitted by operations on Pix, so we keep an independent record here.
+   int source_resolution_;
--- a/crates/kreuzberg-tesseract/src/api.rs
+++ b/crates/kreuzberg-tesseract/src/api.rs
--- a/crates/kreuzberg-tesseract/src/choice_iterator.rs
+++ b/crates/kreuzberg-tesseract/src/choice_iterator.rs
@@ -0,0 +1,77 @@
+use crate::error::{Result, TesseractError};
+use std::ffi::CStr;
+use std::os::raw::{c_char, c_float, c_int, c_void};
+use std::sync::{Arc, Mutex};
+
+/// Owning wrapper around a raw Tesseract choice-iterator handle.
+///
+/// Every call into Tesseract is made while the internal mutex is held, and the
+/// handle is passed to `TessChoiceIteratorDelete` when the wrapper is dropped.
+pub struct ChoiceIterator {
+    handle: Arc<Mutex<*mut c_void>>,
+}
+
+// SAFETY: the raw handle is only handed to Tesseract while the mutex is locked, so
+// at most one thread uses it at a time.
+// NOTE(review): this presumes the underlying C++ object has no thread affinity — confirm.
+unsafe impl Send for ChoiceIterator {}
+unsafe impl Sync for ChoiceIterator {}
+
+impl ChoiceIterator {
+    /// Creates a new instance of the ChoiceIterator.
+    ///
+    /// The wrapper takes ownership of `handle`: it is passed to
+    /// `TessChoiceIteratorDelete` when the wrapper is dropped, so the caller must not
+    /// free it or hand it to a second owner.
+    ///
+    /// # Arguments
+    ///
+    /// * `handle` - Pointer to the ChoiceIterator.
+    pub fn new(handle: *mut c_void) -> Self {
+        ChoiceIterator {
+            handle: Arc::new(Mutex::new(handle)),
+        }
+    }
+
+    /// Gets the next choice.
+    ///
+    /// # Returns
+    ///
+    /// Returns `true` if the next choice is successful, otherwise returns `false`.
+    ///
+    /// # Errors
+    ///
+    /// Returns `TesseractError::MutexLockError` if the handle mutex is poisoned.
+    pub fn next(&self) -> Result<bool> {
+        let handle = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
+        // SAFETY: the lock is held for the duration of the call; validity of the
+        // pointer itself is the responsibility of whoever called `new`.
+        Ok(unsafe { TessChoiceIteratorNext(*handle) != 0 })
+    }
+
+    /// Gets the UTF-8 text for the current choice.
+    ///
+    /// # Returns
+    ///
+    /// Returns the UTF-8 text as a `String` if successful, otherwise returns an error.
+    ///
+    /// # Errors
+    ///
+    /// * `TesseractError::MutexLockError` if the handle mutex is poisoned.
+    /// * `TesseractError::NullPointerError` if Tesseract returns a null pointer.
+    /// * `TesseractError::Utf8Error` if the returned bytes are not valid UTF-8.
+    pub fn get_utf8_text(&self) -> Result<String> {
+        let handle = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
+        // SAFETY: the lock is held for the duration of the call; validity of the
+        // pointer itself is the responsibility of whoever called `new`.
+        let text_ptr = unsafe { TessChoiceIteratorGetUTF8Text(*handle) };
+        if text_ptr.is_null() {
+            return Err(TesseractError::NullPointerError);
+        }
+        // Unlike the result-iterator text getters, the choice iterator returns a
+        // pointer into storage it owns. The string is therefore copied and must NOT
+        // be released with `TessDeleteText`; doing so frees memory we do not own.
+        //
+        // SAFETY: `text_ptr` was checked to be non-null and is expected to reference a
+        // NUL-terminated C string; the lock guard is still alive, so the iterator
+        // cannot be advanced or deleted through this wrapper while the bytes are copied.
+        let text = unsafe { CStr::from_ptr(text_ptr) }.to_str()?.to_owned();
+        Ok(text)
+    }
+
+    /// Gets the confidence of the current choice.
+    ///
+    /// # Returns
+    ///
+    /// Returns the confidence as a `f32`.
+    ///
+    /// # Errors
+    ///
+    /// Returns `TesseractError::MutexLockError` if the handle mutex is poisoned.
+    pub fn confidence(&self) -> Result<f32> {
+        let handle = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
+        // SAFETY: the lock is held for the duration of the call; validity of the
+        // pointer itself is the responsibility of whoever called `new`.
+        Ok(unsafe { TessChoiceIteratorConfidence(*handle) })
+    }
+}
+
+impl Drop for ChoiceIterator {
+    fn drop(&mut self) {
+        // A poisoned mutex still guards a live handle; recover the guard so the
+        // iterator is released instead of being leaked.
+        let guard = match self.handle.lock() {
+            Ok(guard) => guard,
+            Err(poisoned) => poisoned.into_inner(),
+        };
+        let raw = *guard;
+        if !raw.is_null() {
+            // SAFETY: this wrapper is the owner of the handle and is being dropped, so
+            // the handle is released exactly once and never used through it again.
+            unsafe { TessChoiceIteratorDelete(raw) };
+        }
+    }
+}
+
+// Raw declarations of the Tesseract C API functions used by `ChoiceIterator`.
+// Each one takes the raw iterator handle stored inside the wrapper.
+ffi_extern! {
+    // Called from `ChoiceIterator::drop` to release the iterator.
+    fn TessChoiceIteratorDelete(handle: *mut c_void);
+    // Called from `ChoiceIterator::next`, which treats a non-zero result as `true`.
+    fn TessChoiceIteratorNext(handle: *mut c_void) -> c_int;
+    // NOTE(review): the Tesseract C header appears to declare this return value as
+    // `const char *` owned by the iterator — confirm and consider `*const c_char`.
+    fn TessChoiceIteratorGetUTF8Text(handle: *mut c_void) -> *mut c_char;
+    // Called from `ChoiceIterator::confidence`; the value is returned unchanged.
+    fn TessChoiceIteratorConfidence(handle: *mut c_void) -> c_float;
+}
--- a/crates/kreuzberg-tesseract/src/enums.rs
+++ b/crates/kreuzberg-tesseract/src/enums.rs
@@ -0,0 +1,373 @@
+/// Tesseract page segmentation modes.
+///
+/// Declared `#[repr(C)]` with explicit discriminants, so `mode as i32` yields the
+/// integer listed next to each variant.
+#[repr(C)]
+#[derive(Debug, Clone, Copy, PartialEq)]
+// Variant names keep their upper-case, underscore-separated spelling.
+#[allow(non_camel_case_types)]
+pub enum TessPageSegMode {
+    PSM_OSD_ONLY = 0,
+    PSM_AUTO_OSD = 1,
+    PSM_AUTO_ONLY = 2,
+    PSM_AUTO = 3,
+    PSM_SINGLE_COLUMN = 4,
+    PSM_SINGLE_BLOCK_VERT_TEXT = 5,
+    PSM_SINGLE_BLOCK = 6,
+    PSM_SINGLE_LINE = 7,
+    PSM_SINGLE_WORD = 8,
+    PSM_CIRCLE_WORD = 9,
+    PSM_SINGLE_CHAR = 10,
+    PSM_SPARSE_TEXT = 11,
+    PSM_SPARSE_TEXT_OSD = 12,
+    PSM_RAW_LINE = 13,
+    PSM_COUNT = 14,
+}
+
+impl TessPageSegMode {
+    /// Maps `value` to its page segmentation mode, falling back to `PSM_AUTO` when
+    /// `value` is not one of the declared discriminants (`0..=14`).
+    ///
+    /// Use [`try_from_int`](Self::try_from_int) when unknown values must be detected.
+    pub fn from_int(value: i32) -> Self {
+        Self::try_from_int(value).unwrap_or(Self::PSM_AUTO)
+    }
+
+    /// Maps `value` to its page segmentation mode, or `None` when `value` is not one
+    /// of the declared discriminants.
+    pub fn try_from_int(value: i32) -> Option<Self> {
+        let mode = match value {
+            0 => Self::PSM_OSD_ONLY,
+            1 => Self::PSM_AUTO_OSD,
+            2 => Self::PSM_AUTO_ONLY,
+            3 => Self::PSM_AUTO,
+            4 => Self::PSM_SINGLE_COLUMN,
+            5 => Self::PSM_SINGLE_BLOCK_VERT_TEXT,
+            6 => Self::PSM_SINGLE_BLOCK,
+            7 => Self::PSM_SINGLE_LINE,
+            8 => Self::PSM_SINGLE_WORD,
+            9 => Self::PSM_CIRCLE_WORD,
+            10 => Self::PSM_SINGLE_CHAR,
+            11 => Self::PSM_SPARSE_TEXT,
+            12 => Self::PSM_SPARSE_TEXT_OSD,
+            13 => Self::PSM_RAW_LINE,
+            14 => Self::PSM_COUNT,
+            _ => return None,
+        };
+        Some(mode)
+    }
+}
+
+/// Tesseract page iterator levels.
+///
+/// Declared `#[repr(C)]` with explicit discriminants, so `level as i32` yields the
+/// integer listed next to each variant.
+#[repr(C)]
+#[derive(Debug, Clone, Copy, PartialEq)]
+// Variant names keep their upper-case, underscore-separated spelling.
+#[allow(non_camel_case_types)]
+pub enum TessPageIteratorLevel {
+    RIL_BLOCK = 0,
+    RIL_PARA = 1,
+    RIL_TEXTLINE = 2,
+    RIL_WORD = 3,
+    RIL_SYMBOL = 4,
+}
+
+impl TessPageIteratorLevel {
+    /// Maps `value` to its iterator level, falling back to `RIL_BLOCK` when `value`
+    /// is not one of the declared discriminants (`0..=4`).
+    ///
+    /// Use [`try_from_int`](Self::try_from_int) when unknown values must be detected.
+    pub fn from_int(value: i32) -> Self {
+        Self::try_from_int(value).unwrap_or(Self::RIL_BLOCK)
+    }
+
+    /// Maps `value` to its iterator level, or `None` when `value` is not one of the
+    /// declared discriminants.
+    pub fn try_from_int(value: i32) -> Option<Self> {
+        let level = match value {
+            0 => Self::RIL_BLOCK,
+            1 => Self::RIL_PARA,
+            2 => Self::RIL_TEXTLINE,
+            3 => Self::RIL_WORD,
+            4 => Self::RIL_SYMBOL,
+            _ => return None,
+        };
+        Some(level)
+    }
+}
+
+/// Tesseract poly-block (layout region) types.
+///
+/// Declared `#[repr(C)]` with explicit discriminants, so `block_type as i32` yields
+/// the integer listed next to each variant.
+#[repr(C)]
+#[derive(Debug, Clone, Copy, PartialEq)]
+// Variant names keep their upper-case, underscore-separated spelling.
+#[allow(non_camel_case_types)]
+pub enum TessPolyBlockType {
+    PT_UNKNOWN = 0,
+    PT_FLOWING_TEXT = 1,
+    PT_HEADING_TEXT = 2,
+    PT_PULLOUT_TEXT = 3,
+    PT_EQUATION = 4,
+    PT_INLINE_EQUATION = 5,
+    PT_TABLE = 6,
+    PT_VERTICAL_TEXT = 7,
+    PT_CAPTION_TEXT = 8,
+    PT_FLOWING_IMAGE = 9,
+    PT_HEADING_IMAGE = 10,
+    PT_PULLOUT_IMAGE = 11,
+    PT_HORZ_LINE = 12,
+    PT_VERT_LINE = 13,
+    PT_NOISE = 14,
+    PT_COUNT = 15,
+}
+
+impl TessPolyBlockType {
+    /// Maps `value` to its block type, falling back to `PT_UNKNOWN` when `value` is
+    /// not one of the declared discriminants (`0..=15`).
+    ///
+    /// Use [`try_from_int`](Self::try_from_int) when unknown values must be detected.
+    pub fn from_int(value: i32) -> Self {
+        Self::try_from_int(value).unwrap_or(Self::PT_UNKNOWN)
+    }
+
+    /// Maps `value` to its block type, or `None` when `value` is not one of the
+    /// declared discriminants.
+    pub fn try_from_int(value: i32) -> Option<Self> {
+        let block_type = match value {
+            0 => Self::PT_UNKNOWN,
+            1 => Self::PT_FLOWING_TEXT,
+            2 => Self::PT_HEADING_TEXT,
+            3 => Self::PT_PULLOUT_TEXT,
+            4 => Self::PT_EQUATION,
+            5 => Self::PT_INLINE_EQUATION,
+            6 => Self::PT_TABLE,
+            7 => Self::PT_VERTICAL_TEXT,
+            8 => Self::PT_CAPTION_TEXT,
+            9 => Self::PT_FLOWING_IMAGE,
+            10 => Self::PT_HEADING_IMAGE,
+            11 => Self::PT_PULLOUT_IMAGE,
+            12 => Self::PT_HORZ_LINE,
+            13 => Self::PT_VERT_LINE,
+            14 => Self::PT_NOISE,
+            15 => Self::PT_COUNT,
+            _ => return None,
+        };
+        Some(block_type)
+    }
+}
+
+/// Tesseract page orientations.
+///
+/// Declared `#[repr(C)]` with explicit discriminants, so `orientation as i32` yields
+/// the integer listed next to each variant.
+#[repr(C)]
+#[derive(Debug, Clone, Copy, PartialEq)]
+// Variant names keep their upper-case, underscore-separated spelling.
+#[allow(non_camel_case_types)]
+pub enum TessOrientation {
+    ORIENTATION_PAGE_UP = 0,
+    ORIENTATION_PAGE_RIGHT = 1,
+    ORIENTATION_PAGE_DOWN = 2,
+    ORIENTATION_PAGE_LEFT = 3,
+}
+
+impl TessOrientation {
+    /// Maps `value` to its orientation, falling back to `ORIENTATION_PAGE_UP` when
+    /// `value` is not one of the declared discriminants (`0..=3`).
+    ///
+    /// Use [`try_from_int`](Self::try_from_int) when unknown values must be detected.
+    pub fn from_int(value: i32) -> Self {
+        Self::try_from_int(value).unwrap_or(Self::ORIENTATION_PAGE_UP)
+    }
+
+    /// Maps `value` to its orientation, or `None` when `value` is not one of the
+    /// declared discriminants.
+    pub fn try_from_int(value: i32) -> Option<Self> {
+        let orientation = match value {
+            0 => Self::ORIENTATION_PAGE_UP,
+            1 => Self::ORIENTATION_PAGE_RIGHT,
+            2 => Self::ORIENTATION_PAGE_DOWN,
+            3 => Self::ORIENTATION_PAGE_LEFT,
+            _ => return None,
+        };
+        Some(orientation)
+    }
+}
+
+/// Tesseract paragraph justification values.
+///
+/// Declared `#[repr(C)]` with explicit discriminants, so `justification as i32`
+/// yields the integer listed next to each variant.
+#[repr(C)]
+#[derive(Debug, Clone, Copy, PartialEq)]
+// Variant names keep their upper-case, underscore-separated spelling.
+#[allow(non_camel_case_types)]
+pub enum TessParagraphJustification {
+    JUSTIFICATION_UNKNOWN = 0,
+    JUSTIFICATION_LEFT = 1,
+    JUSTIFICATION_CENTER = 2,
+    JUSTIFICATION_RIGHT = 3,
+}
+
+impl TessParagraphJustification {
+    /// Maps `value` to its justification, falling back to `JUSTIFICATION_UNKNOWN`
+    /// when `value` is not one of the declared discriminants (`0..=3`).
+    ///
+    /// Use [`try_from_int`](Self::try_from_int) when unknown values must be detected.
+    pub fn from_int(value: i32) -> Self {
+        Self::try_from_int(value).unwrap_or(Self::JUSTIFICATION_UNKNOWN)
+    }
+
+    /// Maps `value` to its justification, or `None` when `value` is not one of the
+    /// declared discriminants.
+    pub fn try_from_int(value: i32) -> Option<Self> {
+        let justification = match value {
+            0 => Self::JUSTIFICATION_UNKNOWN,
+            1 => Self::JUSTIFICATION_LEFT,
+            2 => Self::JUSTIFICATION_CENTER,
+            3 => Self::JUSTIFICATION_RIGHT,
+            _ => return None,
+        };
+        Some(justification)
+    }
+}
+
+/// Tesseract writing directions.
+///
+/// Declared `#[repr(C)]` with explicit discriminants, so `direction as i32` yields
+/// the integer listed next to each variant.
+#[repr(C)]
+#[derive(Debug, Clone, Copy, PartialEq)]
+// Variant names keep their upper-case, underscore-separated spelling.
+#[allow(non_camel_case_types)]
+pub enum TessWritingDirection {
+    WRITING_DIRECTION_LEFT_TO_RIGHT = 0,
+    WRITING_DIRECTION_RIGHT_TO_LEFT = 1,
+    WRITING_DIRECTION_TOP_TO_BOTTOM = 2,
+}
+
+impl TessWritingDirection {
+    /// Maps `value` to its writing direction, falling back to
+    /// `WRITING_DIRECTION_LEFT_TO_RIGHT` when `value` is not one of the declared
+    /// discriminants (`0..=2`).
+    ///
+    /// Use [`try_from_int`](Self::try_from_int) when unknown values must be detected.
+    pub fn from_int(value: i32) -> Self {
+        Self::try_from_int(value).unwrap_or(Self::WRITING_DIRECTION_LEFT_TO_RIGHT)
+    }
+
+    /// Maps `value` to its writing direction, or `None` when `value` is not one of
+    /// the declared discriminants.
+    pub fn try_from_int(value: i32) -> Option<Self> {
+        let direction = match value {
+            0 => Self::WRITING_DIRECTION_LEFT_TO_RIGHT,
+            1 => Self::WRITING_DIRECTION_RIGHT_TO_LEFT,
+            2 => Self::WRITING_DIRECTION_TOP_TO_BOTTOM,
+            _ => return None,
+        };
+        Some(direction)
+    }
+}
+
+/// Tesseract text-line orders.
+///
+/// Declared `#[repr(C)]` with explicit discriminants, so `order as i32` yields the
+/// integer listed next to each variant.
+#[repr(C)]
+#[derive(Debug, Clone, Copy, PartialEq)]
+// Variant names keep their upper-case, underscore-separated spelling.
+#[allow(non_camel_case_types)]
+pub enum TessTextlineOrder {
+    TEXTLINE_ORDER_LEFT_TO_RIGHT = 0,
+    TEXTLINE_ORDER_RIGHT_TO_LEFT = 1,
+    TEXTLINE_ORDER_TOP_TO_BOTTOM = 2,
+}
+
+impl TessTextlineOrder {
+    /// Maps `value` to its text-line order, falling back to
+    /// `TEXTLINE_ORDER_LEFT_TO_RIGHT` when `value` is not one of the declared
+    /// discriminants (`0..=2`).
+    ///
+    /// Use [`try_from_int`](Self::try_from_int) when unknown values must be detected.
+    pub fn from_int(value: i32) -> Self {
+        Self::try_from_int(value).unwrap_or(Self::TEXTLINE_ORDER_LEFT_TO_RIGHT)
+    }
+
+    /// Maps `value` to its text-line order, or `None` when `value` is not one of the
+    /// declared discriminants.
+    pub fn try_from_int(value: i32) -> Option<Self> {
+        let order = match value {
+            0 => Self::TEXTLINE_ORDER_LEFT_TO_RIGHT,
+            1 => Self::TEXTLINE_ORDER_RIGHT_TO_LEFT,
+            2 => Self::TEXTLINE_ORDER_TOP_TO_BOTTOM,
+            _ => return None,
+        };
+        Some(order)
+    }
+}
+
+/// Unit tests for the integer conversions of the Tesseract enums.
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_page_seg_mode_from_int() {
+        assert_eq!(TessPageSegMode::from_int(0), TessPageSegMode::PSM_OSD_ONLY);
+        assert_eq!(TessPageSegMode::from_int(3), TessPageSegMode::PSM_AUTO);
+        assert_eq!(TessPageSegMode::from_int(10), TessPageSegMode::PSM_SINGLE_CHAR);
+        assert_eq!(TessPageSegMode::from_int(999), TessPageSegMode::PSM_AUTO);
+    }
+
+    #[test]
+    fn test_page_seg_mode_conversion() {
+        let mode = TessPageSegMode::PSM_SINGLE_LINE;
+        assert_eq!(mode as i32, 7);
+    }
+
+    #[test]
+    fn test_page_iterator_level_from_int() {
+        assert_eq!(TessPageIteratorLevel::from_int(0), TessPageIteratorLevel::RIL_BLOCK);
+        assert_eq!(TessPageIteratorLevel::from_int(3), TessPageIteratorLevel::RIL_WORD);
+        assert_eq!(TessPageIteratorLevel::from_int(-1), TessPageIteratorLevel::RIL_BLOCK);
+    }
+
+    #[test]
+    fn test_poly_block_type_from_int() {
+        assert_eq!(TessPolyBlockType::from_int(1), TessPolyBlockType::PT_FLOWING_TEXT);
+        assert_eq!(TessPolyBlockType::from_int(6), TessPolyBlockType::PT_TABLE);
+        assert_eq!(TessPolyBlockType::from_int(100), TessPolyBlockType::PT_UNKNOWN);
+    }
+
+    #[test]
+    fn test_orientation_from_int() {
+        assert_eq!(TessOrientation::from_int(0), TessOrientation::ORIENTATION_PAGE_UP);
+        assert_eq!(TessOrientation::from_int(2), TessOrientation::ORIENTATION_PAGE_DOWN);
+        assert_eq!(TessOrientation::from_int(5), TessOrientation::ORIENTATION_PAGE_UP);
+    }
+
+    #[test]
+    fn test_paragraph_justification_from_int() {
+        assert_eq!(
+            TessParagraphJustification::from_int(1),
+            TessParagraphJustification::JUSTIFICATION_LEFT
+        );
+        assert_eq!(
+            TessParagraphJustification::from_int(3),
+            TessParagraphJustification::JUSTIFICATION_RIGHT
+        );
+        assert_eq!(
+            TessParagraphJustification::from_int(-1),
+            TessParagraphJustification::JUSTIFICATION_UNKNOWN
+        );
+    }
+
+    #[test]
+    fn test_writing_direction_from_int() {
+        assert_eq!(
+            TessWritingDirection::from_int(0),
+            TessWritingDirection::WRITING_DIRECTION_LEFT_TO_RIGHT
+        );
+        assert_eq!(
+            TessWritingDirection::from_int(1),
+            TessWritingDirection::WRITING_DIRECTION_RIGHT_TO_LEFT
+        );
+        assert_eq!(
+            TessWritingDirection::from_int(10),
+            TessWritingDirection::WRITING_DIRECTION_LEFT_TO_RIGHT
+        );
+    }
+
+    #[test]
+    fn test_textline_order_from_int() {
+        assert_eq!(
+            TessTextlineOrder::from_int(0),
+            TessTextlineOrder::TEXTLINE_ORDER_LEFT_TO_RIGHT
+        );
+        assert_eq!(
+            TessTextlineOrder::from_int(2),
+            TessTextlineOrder::TEXTLINE_ORDER_TOP_TO_BOTTOM
+        );
+        assert_eq!(
+            TessTextlineOrder::from_int(99),
+            TessTextlineOrder::TEXTLINE_ORDER_LEFT_TO_RIGHT
+        );
+    }
+
+    // The strict conversions must accept every declared discriminant...
+    #[test]
+    fn test_try_from_int_accepts_known_values() {
+        assert_eq!(
+            TessPageSegMode::try_from_int(7),
+            Some(TessPageSegMode::PSM_SINGLE_LINE)
+        );
+        assert_eq!(
+            TessPolyBlockType::try_from_int(15),
+            Some(TessPolyBlockType::PT_COUNT)
+        );
+        assert_eq!(
+            TessOrientation::try_from_int(3),
+            Some(TessOrientation::ORIENTATION_PAGE_LEFT)
+        );
+        assert_eq!(
+            TessWritingDirection::try_from_int(2),
+            Some(TessWritingDirection::WRITING_DIRECTION_TOP_TO_BOTTOM)
+        );
+        assert_eq!(
+            TessTextlineOrder::try_from_int(1),
+            Some(TessTextlineOrder::TEXTLINE_ORDER_RIGHT_TO_LEFT)
+        );
+    }
+
+    // ...and reject everything else instead of silently substituting a default.
+    #[test]
+    fn test_try_from_int_rejects_unknown_values() {
+        assert_eq!(TessPageSegMode::try_from_int(15), None);
+        assert_eq!(TessPageSegMode::try_from_int(-1), None);
+        assert_eq!(TessPolyBlockType::try_from_int(16), None);
+        assert_eq!(TessPolyBlockType::try_from_int(-1), None);
+        assert_eq!(TessOrientation::try_from_int(4), None);
+        assert_eq!(TessWritingDirection::try_from_int(3), None);
+        assert_eq!(TessTextlineOrder::try_from_int(3), None);
+    }
+
+    #[test]
+    fn test_enums_are_copy() {
+        fn assert_copy<T: Copy>() {}
+        assert_copy::<TessPageSegMode>();
+        assert_copy::<TessPageIteratorLevel>();
+        assert_copy::<TessPolyBlockType>();
+        assert_copy::<TessOrientation>();
+        assert_copy::<TessParagraphJustification>();
+        assert_copy::<TessWritingDirection>();
+        assert_copy::<TessTextlineOrder>();
+    }
+}
--- a/crates/kreuzberg-tesseract/src/error.rs
+++ b/crates/kreuzberg-tesseract/src/error.rs
@@ -0,0 +1,85 @@
+use std::str::Utf8Error;
+use thiserror::Error;
+
+/// Errors that can occur when using the Tesseract API.
+///
+/// The `Display` text of each variant is the string given in its `#[error(...)]`
+/// attribute.
+#[derive(Error, Debug)]
+pub enum TesseractError {
+    #[error("Failed to initialize Tesseract")]
+    InitError,
+    #[error("Failed to set image")]
+    SetImageError,
+    #[error("OCR operation failed")]
+    OcrError,
+    /// Wraps a `std::str::Utf8Error`; `#[from]` lets `?` perform the conversion.
+    #[error("Invalid UTF-8 in Tesseract output")]
+    Utf8Error(#[from] Utf8Error),
+    /// Returned, for example, by `ChoiceIterator` methods when locking the handle
+    /// mutex fails.
+    #[error("Failed to lock mutex")]
+    MutexLockError,
+    #[error("Failed to set variable")]
+    SetVariableError,
+    #[error("Failed to get variable")]
+    GetVariableError,
+    /// Returned, for example, by `ChoiceIterator::get_utf8_text` when Tesseract
+    /// yields a null text pointer.
+    #[error("Null pointer error")]
+    NullPointerError,
+    #[error("Invalid parameter")]
+    InvalidParameterError,
+    #[error("Layout analysis failed")]
+    AnalyseLayoutError,
+    #[error("Page processing failed")]
+    ProcessPagesError,
+    #[error("I/O error")]
+    IoError,
+    // NOTE(review): overlaps with `MutexLockError`; its usage is not visible from
+    // this file — confirm whether both variants are needed.
+    #[error("Mutex error")]
+    MutexError,
+    #[error("Invalid dimensions")]
+    InvalidDimensions,
+    #[error("Invalid bytes per pixel")]
+    InvalidBytesPerPixel,
+    #[error("Invalid bytes per line")]
+    InvalidBytesPerLine,
+    #[error("Invalid image data")]
+    InvalidImageData,
+    #[error("Uninitialized error")]
+    UninitializedError,
+    /// Carries the rejected integer, which is interpolated into the message.
+    #[error("Invalid enum value: {0}")]
+    InvalidEnumValue(i32),
+    #[error("String contains null byte")]
+    NullByteInString,
+}
+
+/// Result type for Tesseract operations.
+///
+/// Shorthand for `std::result::Result` with [`TesseractError`] as the error type.
+pub type Result<T> = std::result::Result<T, TesseractError>;
+
+/// Unit tests for `TesseractError` messages, conversions and auto traits.
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_error_display() {
+        let error = TesseractError::InitError;
+        assert_eq!(error.to_string(), "Failed to initialize Tesseract");
+
+        let error = TesseractError::SetImageError;
+        assert_eq!(error.to_string(), "Failed to set image");
+
+        let error = TesseractError::OcrError;
+        assert_eq!(error.to_string(), "OCR operation failed");
+    }
+
+    // `InvalidEnumValue` is the only variant whose message interpolates a payload,
+    // so check that the value actually appears in the rendered text.
+    #[test]
+    fn test_invalid_enum_value_display_includes_value() {
+        let error = TesseractError::InvalidEnumValue(42);
+        assert_eq!(error.to_string(), "Invalid enum value: 42");
+
+        let error = TesseractError::InvalidEnumValue(-1);
+        assert_eq!(error.to_string(), "Invalid enum value: -1");
+    }
+
+    #[test]
+    fn test_utf8_error_conversion() {
+        let invalid_utf8 = vec![0xFF, 0xFE];
+        let utf8_error = std::str::from_utf8(&invalid_utf8).unwrap_err();
+        let tess_error: TesseractError = utf8_error.into();
+
+        match tess_error {
+            TesseractError::Utf8Error(_) => {}
+            _ => panic!("Expected Utf8Error variant"),
+        }
+    }
+
+    #[test]
+    fn test_error_is_send_sync() {
+        fn assert_send_sync<T: Send + Sync>() {}
+        assert_send_sync::<TesseractError>();
+    }
+}
--- a/crates/kreuzberg-tesseract/src/leptonica.rs
+++ b/crates/kreuzberg-tesseract/src/leptonica.rs
@@ -0,0 +1,807 @@
+//! Safe Leptonica Pix wrapper for image preprocessing before OCR.
+//!
+//! Provides a safe Rust wrapper around the Leptonica image-processing library.
+//! `Pix` is the core Leptonica image type. All methods return `Result<Pix>`,
+//! and the wrapper takes care of proper memory management via `Drop`.
+//!
+//! ## Pixel format
+//!
+//! Leptonica's 32 bpp format stores each pixel as a native 32-bit integer
+//! with the logical layout (MSB→LSB): `R G B A`, i.e.
+//! `(r << 24) | (g << 16) | (b << 8) | alpha`.  Leptonica accesses
+//! individual channels via bit-shift on the integer value, not via
+//! byte-addressed pointer arithmetic, so the packing is identical on both
+//! big- and little-endian hosts.  Do **not** call `pixEndianByteSwap` after
+//! writing pixels this way — doing so inverts the channel order.
+//!
+//! ## `pixDeskew` requires a binary (1 bpp) image
+//!
+//! Call `to_grayscale()` followed by `adaptive_threshold()` before `deskew()`.
+//! `pixDeskew` internally calls `pixFindSkewSweepAndSearchScorePivot` which
+//! operates on 1-bit images only; passing a colour image will return a null
+//! pointer.
+
+use crate::error::{Result, TesseractError};
+use std::ffi::c_void;
+
+// ---------------------------------------------------------------------------
+// Raw Leptonica FFI declarations
+// ---------------------------------------------------------------------------
+
+// These declarations are only compiled when one of the bundled-build features
+// is enabled. Every `*mut c_void` / `*const c_void` below stands in for a
+// Leptonica `PIX *` or `BOX *`; the safe `Pix` wrapper further down is the
+// only intended caller.
+#[cfg(any(feature = "build-tesseract", feature = "build-tesseract-wasm"))]
+ffi_extern! {
+    /// Allocates a new Pix with the given dimensions and bit depth.
+    fn pixCreate(width: i32, height: i32, depth: i32) -> *mut c_void;
+
+    /// Frees a Pix and sets the caller's pointer to null.
+    ///
+    /// Leptonica uses a double-pointer convention: `*ppix` is set to null
+    /// after the call so that accidental double-frees are a no-op.
+    fn pixDestroy(ppix: *mut *mut c_void);
+
+    /// Sets the horizontal and vertical resolution (DPI) on a Pix.
+    ///
+    /// Returns 0 on success, non-zero on error.
+    fn pixSetResolution(pix: *mut c_void, xres: i32, yres: i32) -> i32;
+
+    /// Returns the width of the Pix in pixels.
+    fn pixGetWidth(pix: *const c_void) -> i32;
+
+    /// Returns the height of the Pix in pixels.
+    fn pixGetHeight(pix: *const c_void) -> i32;
+
+    /// Returns the bit depth of the Pix (1, 2, 4, 8, 16, or 32).
+    fn pixGetDepth(pix: *const c_void) -> i32;
+
+    /// Returns the number of 32-bit words per row (words-per-line).
+    fn pixGetWpl(pix: *const c_void) -> i32;
+
+    /// Returns a mutable pointer to the start of the pixel data array.
+    ///
+    /// The data is stored as rows of 32-bit words; each word covers 32/depth pixels.
+    fn pixGetData(pix: *mut c_void) -> *mut u32;
+
+    /// Deskews a 1 bpp image using a sweep-and-search algorithm.
+    ///
+    /// `redsearch` is the reduction factor used during the search; pass 0 for
+    /// the Leptonica default (2x reduction). Returns a new deskewed Pix on
+    /// success, or null on failure. The input Pix is **not** consumed.
+    ///
+    /// NOTE(review): upstream Leptonica describes `pixDeskew` as accepting any
+    /// depth (binarising internally) and as possibly returning a clone of the
+    /// input when no correction is applied — confirm against the bundled version.
+    fn pixDeskew(pixs: *mut c_void, redsearch: i32) -> *mut c_void;
+
+    /// Estimates the skew angle and confidence for a 1 bpp image.
+    ///
+    /// Writes the angle (degrees, positive = counter-clockwise) into `*pangle`
+    /// and a confidence score (0–1) into `*pconf`. Returns 0 on success.
+    ///
+    /// NOTE(review): upstream Leptonica documents the confidence as a ratio of
+    /// max/min sweep scores, which is not bounded by 1 — confirm the range.
+    fn pixFindSkew(pixs: *mut c_void, pangle: *mut f32, pconf: *mut f32) -> i32;
+
+    /// Applies Otsu adaptive thresholding to produce a binarised Pix.
+    ///
+    /// `sx`/`sy` are the tile dimensions; `smoothx`/`smoothy` are half-widths
+    /// for smoothing the threshold map; `scorefract` controls threshold acceptance
+    /// (typical value: 0.1). `ppixth` (optional) receives the threshold image;
+    /// `ppixd` receives the binarised output.
+    fn pixOtsuAdaptiveThreshold(
+        pixs: *mut c_void,
+        sx: i32,
+        sy: i32,
+        smoothx: i32,
+        smoothy: i32,
+        scorefract: f32,
+        ppixth: *mut *mut c_void,
+        ppixd: *mut *mut c_void,
+    ) -> i32;
+
+    /// Normalises the background of a grayscale image using morphological operations.
+    ///
+    /// `reduction` is the subsampling factor (e.g. 4), `size` is the morphological
+    /// structuring-element half-size (e.g. 15), and `bgval` is the target background
+    /// value (e.g. 200). Returns a new normalised Pix, or null on failure.
+    fn pixBackgroundNormMorph(
+        pixs: *mut c_void,
+        pixim: *mut c_void,
+        reduction: i32,
+        size: i32,
+        bgval: i32,
+    ) -> *mut c_void;
+
+    /// Applies unsharp masking to sharpen a grayscale or colour Pix.
+    ///
+    /// `halfwidth` is the half-size of the blur kernel; `fract` controls the
+    /// sharpening strength (0.0–1.0 typical). Returns a new Pix, or null on failure.
+    fn pixUnsharpMasking(pixs: *mut c_void, halfwidth: i32, fract: f32) -> *mut c_void;
+
+    /// Scales a Pix by independent x and y factors using the best available method.
+    ///
+    /// Returns a new scaled Pix, or null on failure. The input Pix is **not** consumed.
+    fn pixScale(pixs: *mut c_void, scalex: f32, scaley: f32) -> *mut c_void;
+
+    /// Converts an RGB (32 bpp) Pix to 8 bpp grayscale.
+    ///
+    /// `rwt`, `gwt`, `bwt` are the red, green, and blue channel weights; pass
+    /// 0.0 for all three to use Leptonica's built-in default weights. Returns a
+    /// new 8 bpp Pix, or null on failure.
+    ///
+    /// NOTE(review): this comment previously said "equal weights" while
+    /// `Pix::to_grayscale` says "perceptual weights"; upstream appears to use
+    /// 0.3 / 0.5 / 0.2 — confirm and align both comments.
+    fn pixConvertRGBToGray(pixs: *mut c_void, rwt: f32, gwt: f32, bwt: f32) -> *mut c_void;
+
+    /// Creates a Leptonica BOX with the given coordinates.
+    fn boxCreate(x: i32, y: i32, w: i32, h: i32) -> *mut c_void;
+
+    /// Frees a Leptonica BOX.
+    fn boxDestroy(pbox: *mut *mut c_void);
+
+    /// Clips a rectangular region from a Pix.
+    ///
+    /// Returns a new Pix containing the clipped region, or null on failure.
+    /// `pboxc` (optional) receives the actual clipped box; pass null to ignore.
+    fn pixClipRectangle(pixs: *mut c_void, box_: *mut c_void, pboxc: *mut *mut c_void) -> *mut c_void;
+
+    /// Counts connected components in a 1 bpp image.
+    ///
+    /// `connectivity` is 4 or 8. Writes the count to `*pcount`.
+    /// Returns 0 on success.
+    fn pixCountConnComp(pix: *mut c_void, connectivity: i32, pcount: *mut i32) -> i32;
+
+    /// Retrieves the horizontal and vertical resolution (DPI) from a Pix.
+    ///
+    /// Writes the x-resolution into `*pxres` and y-resolution into `*pyres`.
+    /// Returns 0 on success, non-zero on error.
+    fn pixGetResolution(pix: *const c_void, pxres: *mut i32, pyres: *mut i32) -> i32;
+
+}
+
+// ---------------------------------------------------------------------------
+// Safe Pix wrapper
+// ---------------------------------------------------------------------------
+
+/// Safe wrapper around a Leptonica `PIX *` image object.
+///
+/// Owns the underlying allocation and frees it in `Drop`. All methods that
+/// return a new image allocate a fresh `Pix`; the receiver is never consumed.
+///
+/// The type is only available when the `build-tesseract` or
+/// `build-tesseract-wasm` feature is enabled.
+///
+/// # Thread safety
+///
+/// `Pix` is `Send` because Leptonica image objects are independent heap
+/// allocations with no shared mutable state. Concurrent mutation from multiple
+/// threads is **not** safe (no `Sync`).
+#[cfg(any(feature = "build-tesseract", feature = "build-tesseract-wasm"))]
+pub struct Pix {
+    /// Raw `PIX *` handle, stored untyped. Released with `pixDestroy` when the
+    /// wrapper is dropped.
+    ptr: *mut c_void,
+}
+
+#[cfg(any(feature = "build-tesseract", feature = "build-tesseract-wasm"))]
+impl std::fmt::Debug for Pix {
+    /// Prints the wrapper as a struct with a single `ptr` field (the raw
+    /// handle address); pixel contents are not rendered.
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        let mut builder = f.debug_struct("Pix");
+        builder.field("ptr", &self.ptr);
+        builder.finish()
+    }
+}
+
+// SAFETY: A Pix owns a uniquely heap-allocated Leptonica PIX. There is no
+// interior mutability shared across thread boundaries, so transferring
+// ownership to another thread is safe.
+//
+// NOTE(review): Leptonica PIX objects are reference counted, and some
+// operations (reportedly `pixDeskew` when no correction is needed) can hand
+// back a clone that shares the allocation with the source. If two wrappers can
+// ever share one PIX, the uniqueness argument above needs revisiting — confirm
+// against the bundled Leptonica version.
+#[cfg(any(feature = "build-tesseract", feature = "build-tesseract-wasm"))]
+unsafe impl Send for Pix {}
+
+#[cfg(any(feature = "build-tesseract", feature = "build-tesseract-wasm"))]
+impl Pix {
+    // -----------------------------------------------------------------------
+    // Construction
+    // -----------------------------------------------------------------------
+
+    /// Creates a 32 bpp Leptonica Pix from a packed RGB byte slice.
+    ///
+    /// `data` must contain exactly `width * height * 3` bytes in left-to-right,
+    /// top-to-bottom, `R G B` interleaved order. Every pixel is written with
+    /// its low (alpha) byte set to `0xFF`.
+    ///
+    /// The DPI is set to 300 × 300 which is a sensible default for OCR input.
+    ///
+    /// # Errors
+    ///
+    /// * `TesseractError::InvalidImageData` if `data.len()` does not equal
+    ///   `width * height * 3` (including when that product overflows `usize`)
+    ///   or if either dimension is zero.
+    /// * `TesseractError::InvalidDimensions` if a dimension does not fit in the
+    ///   `i32` expected by Leptonica, or if the row stride Leptonica reports is
+    ///   smaller than `width`.
+    /// * `TesseractError::NullPointerError` if `pixCreate` or `pixGetData`
+    ///   returns null.
+    ///
+    /// # Examples
+    ///
+    /// ```rust,no_run
+    /// # use kreuzberg_tesseract::Pix;
+    /// let rgb = vec![255u8; 4 * 4 * 3]; // 4×4 white image
+    /// let pix = Pix::from_raw_rgb(&rgb, 4, 4).unwrap();
+    /// assert_eq!(pix.width(), 4);
+    /// assert_eq!(pix.height(), 4);
+    /// assert_eq!(pix.depth(), 32);
+    /// ```
+    pub fn from_raw_rgb(data: &[u8], width: u32, height: u32) -> Result<Pix> {
+        let expected = (width as usize)
+            .checked_mul(height as usize)
+            .and_then(|n| n.checked_mul(3))
+            .ok_or(TesseractError::InvalidImageData)?;
+
+        if data.len() != expected || width == 0 || height == 0 {
+            return Err(TesseractError::InvalidImageData);
+        }
+
+        // Leptonica takes signed 32-bit dimensions. Convert with a range check
+        // rather than `as`, which would silently wrap large values negative.
+        let c_width = i32::try_from(width).map_err(|_| TesseractError::InvalidDimensions)?;
+        let c_height = i32::try_from(height).map_err(|_| TesseractError::InvalidDimensions)?;
+
+        // SAFETY: pixCreate() is called with strictly positive width/height
+        // (checked above) and a depth of 32. A null return signals failure and
+        // is handled immediately below.
+        let pix_ptr = unsafe { pixCreate(c_width, c_height, 32) };
+        if pix_ptr.is_null() {
+            return Err(TesseractError::NullPointerError);
+        }
+        // Hand ownership to the wrapper right away so that every early return
+        // below releases the allocation through `Drop`.
+        let pix = Pix { ptr: pix_ptr };
+
+        // SAFETY: pix.ptr is the non-null Pix created above; it has not been
+        // exposed to any other code yet.
+        let data_ptr = unsafe { pixGetData(pix.ptr) };
+        if data_ptr.is_null() {
+            return Err(TesseractError::NullPointerError);
+        }
+
+        // SAFETY: pix.ptr is a valid Pix; pixGetWpl() only reads its header.
+        let raw_wpl = unsafe { pixGetWpl(pix.ptr) };
+        let wpl = usize::try_from(raw_wpl).map_err(|_| TesseractError::InvalidDimensions)?;
+        let width_px = width as usize;
+        // At 32 bpp one pixel occupies one word, so a row must hold at least
+        // `width` words. Verify it instead of assuming it, because the raw
+        // writes below rely on this for their bounds.
+        if wpl < width_px {
+            return Err(TesseractError::InvalidDimensions);
+        }
+
+        // Write RGB pixels into the Leptonica data buffer.
+        //
+        // Leptonica's 32 bpp pixel format stores each pixel as a native
+        // 32-bit integer word with the logical layout (MSB→LSB): R G B A,
+        // i.e. `(r << 24) | (g << 16) | (b << 8) | alpha`. The word is written
+        // as-is, without byte-swapping; calling `pixEndianByteSwap` afterwards
+        // would invert the channel order.
+        //
+        // `width_px * 3` is non-zero (width > 0) and cannot overflow because
+        // `expected` was computed with checked arithmetic.
+        for (row, src_row) in data.chunks_exact(width_px * 3).enumerate() {
+            for (col, rgb) in src_row.chunks_exact(3).enumerate() {
+                let word: u32 = (u32::from(rgb[0]) << 24)
+                    | (u32::from(rgb[1]) << 16)
+                    | (u32::from(rgb[2]) << 8)
+                    | 0xFF;
+                // SAFETY: data_ptr points at the Pix's pixel buffer, which
+                // holds `wpl` words for each of the `height` rows. `row` is
+                // below `height` because `data.len() == width * height * 3`,
+                // and `col < width <= wpl` was verified above, so the offset
+                // `row * wpl + col` stays inside the buffer.
+                unsafe {
+                    *data_ptr.add(row * wpl + col) = word;
+                }
+            }
+        }
+
+        // Set a sensible default DPI for OCR processing. The status code is
+        // ignored on purpose: a failure here leaves the image usable.
+        // SAFETY: pix.ptr is valid and non-null.
+        unsafe { pixSetResolution(pix.ptr, 300, 300) };
+
+        Ok(pix)
+    }
+
+    // -----------------------------------------------------------------------
+    // Image processing operations
+    // -----------------------------------------------------------------------
+
+    /// Produces a deskewed copy of this image via Leptonica's `pixDeskew`,
+    /// using the library's default search reduction (argument `0`).
+    ///
+    /// **Note:** `pixDeskew` requires a 1 bpp (binary) image. Call
+    /// `to_grayscale()` followed by `adaptive_threshold()` before invoking
+    /// this method on a colour or grayscale Pix.
+    ///
+    /// NOTE(review): upstream Leptonica documents `pixDeskew` as accepting any
+    /// depth and binarising internally — confirm whether the 1 bpp requirement
+    /// above still holds for the bundled version.
+    ///
+    /// # Errors
+    ///
+    /// Returns `TesseractError::NullPointerError` when Leptonica hands back a
+    /// null pointer.
+    ///
+    /// # Examples
+    ///
+    /// ```rust,no_run
+    /// # use kreuzberg_tesseract::Pix;
+    /// # let rgb = vec![0u8; 100 * 100 * 3];
+    /// # let pix = Pix::from_raw_rgb(&rgb, 100, 100).unwrap();
+    /// let gray = pix.to_grayscale().unwrap();
+    /// let binary = gray.adaptive_threshold(32, 32).unwrap();
+    /// let deskewed = binary.deskew().unwrap();
+    /// ```
+    pub fn deskew(&self) -> Result<Pix> {
+        // SAFETY: self.ptr is the non-null Pix owned by this wrapper.
+        // pixDeskew() leaves ownership of the input with us and returns a
+        // separate handle, which is null-checked before being wrapped.
+        let deskewed = unsafe { pixDeskew(self.ptr, 0) };
+        if deskewed.is_null() {
+            return Err(TesseractError::NullPointerError);
+        }
+        Ok(Pix { ptr: deskewed })
+    }
+
+    /// Measures the skew of this image, returning `(angle, confidence)` as
+    /// reported by Leptonica's `pixFindSkew`.
+    ///
+    /// The angle is in degrees; a positive angle indicates counter-clockwise
+    /// skew. A higher confidence means a clearer dominant skew direction.
+    ///
+    /// NOTE(review): the confidence was previously described as lying in 0–1;
+    /// upstream Leptonica documents it as a max/min score ratio that can
+    /// exceed 1 — confirm before relying on a fixed range.
+    ///
+    /// **Note:** Like `deskew`, this operates on 1 bpp images.
+    ///
+    /// # Errors
+    ///
+    /// Returns `TesseractError::OcrError` when `pixFindSkew` reports a
+    /// non-zero status.
+    ///
+    /// # Examples
+    ///
+    /// ```rust,no_run
+    /// # use kreuzberg_tesseract::Pix;
+    /// # let rgb = vec![0u8; 100 * 100 * 3];
+    /// # let pix = Pix::from_raw_rgb(&rgb, 100, 100).unwrap();
+    /// let gray = pix.to_grayscale().unwrap();
+    /// let binary = gray.adaptive_threshold(32, 32).unwrap();
+    /// let (angle, confidence) = binary.find_skew().unwrap();
+    /// println!("Skew: {angle:.2}° (confidence {confidence:.2})");
+    /// ```
+    pub fn find_skew(&self) -> Result<(f32, f32)> {
+        let (mut angle, mut conf) = (0.0_f32, 0.0_f32);
+        // SAFETY: self.ptr is valid and non-null. Both out-parameters point at
+        // locals that live for the whole call, so pixFindSkew() may write to
+        // them freely.
+        let status = unsafe { pixFindSkew(self.ptr, &mut angle, &mut conf) };
+        match status {
+            0 => Ok((angle, conf)),
+            _ => Err(TesseractError::OcrError),
+        }
+    }
+
+    /// Produces a binarised copy of this image with Leptonica's tiled Otsu
+    /// thresholding (`pixOtsuAdaptiveThreshold`).
+    ///
+    /// `tile_width` and `tile_height` set the size of the local regions for
+    /// which a threshold is computed. Values around 16–64 work well for typical
+    /// document images; smaller tiles follow local contrast more closely.
+    /// Smoothing is disabled and `scorefract` is fixed at 0.1.
+    ///
+    /// # Errors
+    ///
+    /// * `TesseractError::OcrError` if `pixOtsuAdaptiveThreshold` returns a
+    ///   non-zero status.
+    /// * `TesseractError::NullPointerError` if the call succeeds but yields a
+    ///   null output image.
+    ///
+    /// # Examples
+    ///
+    /// ```rust,no_run
+    /// # use kreuzberg_tesseract::Pix;
+    /// # let rgb = vec![128u8; 64 * 64 * 3];
+    /// # let pix = Pix::from_raw_rgb(&rgb, 64, 64).unwrap();
+    /// let gray = pix.to_grayscale().unwrap();
+    /// let binary = gray.adaptive_threshold(32, 32).unwrap();
+    /// assert_eq!(binary.depth(), 1);
+    /// ```
+    pub fn adaptive_threshold(&self, tile_width: i32, tile_height: i32) -> Result<Pix> {
+        // Output slot filled in by Leptonica.
+        let mut binarised: *mut c_void = std::ptr::null_mut();
+        // The intermediate threshold map is not needed, so no slot is supplied.
+        let no_threshold_map: *mut *mut c_void = std::ptr::null_mut();
+
+        // SAFETY: self.ptr is a valid non-null Pix. `binarised` is a local
+        // that outlives the call and receives the result; it is null-checked
+        // below before being wrapped in a Pix.
+        let status = unsafe {
+            pixOtsuAdaptiveThreshold(
+                self.ptr,
+                tile_width,
+                tile_height,
+                0,   // smoothx: no smoothing
+                0,   // smoothy: no smoothing
+                0.1, // scorefract
+                no_threshold_map,
+                &mut binarised,
+            )
+        };
+
+        // A failing status takes precedence over the null check.
+        match (status, binarised.is_null()) {
+            (0, false) => Ok(Pix { ptr: binarised }),
+            (0, true) => Err(TesseractError::NullPointerError),
+            _ => Err(TesseractError::OcrError),
+        }
+    }
+
+    /// Reads the resolution stored on this image as `(xres, yres)` in DPI.
+    ///
+    /// # Errors
+    ///
+    /// Returns `TesseractError::OcrError` when `pixGetResolution` reports a
+    /// non-zero status.
+    pub fn get_resolution(&self) -> Result<(i32, i32)> {
+        let (mut xres, mut yres) = (0_i32, 0_i32);
+        // SAFETY: self.ptr is a valid non-null Pix, and both out-parameters
+        // point at locals that live for the duration of the call.
+        let status = unsafe { pixGetResolution(self.ptr, &mut xres, &mut yres) };
+        if status == 0 {
+            Ok((xres, yres))
+        } else {
+            Err(TesseractError::OcrError)
+        }
+    }
+
+    /// Stores `xres` × `yres` (DPI) as the resolution of this image.
+    ///
+    /// # Errors
+    ///
+    /// Returns `TesseractError::OcrError` when `pixSetResolution` reports a
+    /// non-zero status.
+    pub fn set_resolution(&mut self, xres: i32, yres: i32) -> Result<()> {
+        // SAFETY: self.ptr is a valid non-null Pix, and `&mut self` guarantees
+        // exclusive access while the resolution is updated.
+        let status = unsafe { pixSetResolution(self.ptr, xres, yres) };
+        match status {
+            0 => Ok(()),
+            _ => Err(TesseractError::OcrError),
+        }
+    }
+
+    /// Makes sure the image carries a usable (non-zero) DPI resolution.
+    ///
+    /// If **either** the x or the y resolution is zero, both are reset to
+    /// 72 DPI as a fallback. Nothing is changed when both are already non-zero
+    /// or when the resolution cannot be read. The status returned by
+    /// `pixSetResolution` is ignored (best effort).
+    fn ensure_valid_resolution(&self) {
+        let Ok((xres, yres)) = self.get_resolution() else {
+            return;
+        };
+        if xres != 0 && yres != 0 {
+            return;
+        }
+        // SAFETY: self.ptr is valid. We set a safe default DPI.
+        unsafe { pixSetResolution(self.ptr, 72, 72) };
+    }
+
+    /// Returns a copy of this image with its background evened out by
+    /// Leptonica's `pixBackgroundNormMorph`.
+    ///
+    /// Useful as a preprocessing step when the document has uneven illumination
+    /// or a non-white background. The call uses no mask image, 4x reduction,
+    /// a size parameter of 15 and a target background value of 200. A zero
+    /// resolution on the source is replaced with a fallback first.
+    ///
+    /// # Errors
+    ///
+    /// Returns `TesseractError::NullPointerError` if `pixBackgroundNormMorph`
+    /// returns null.
+    ///
+    /// # Examples
+    ///
+    /// ```rust,no_run
+    /// # use kreuzberg_tesseract::Pix;
+    /// # let rgb = vec![200u8; 100 * 100 * 3];
+    /// # let pix = Pix::from_raw_rgb(&rgb, 100, 100).unwrap();
+    /// let gray = pix.to_grayscale().unwrap();
+    /// let normalised = gray.background_normalize().unwrap();
+    /// ```
+    pub fn background_normalize(&self) -> Result<Pix> {
+        /// Subsampling factor passed as `reduction`.
+        const REDUCTION: i32 = 4;
+        /// Structuring-element size passed as `size`.
+        const SEL_SIZE: i32 = 15;
+        /// Target background value passed as `bgval`.
+        const BACKGROUND_TARGET: i32 = 200;
+
+        self.ensure_valid_resolution();
+
+        // SAFETY: self.ptr is a valid non-null Pix. A null `pixim` means "no
+        // mask". The returned handle is either a new Pix or null, and is
+        // checked before being wrapped.
+        let normalised = unsafe {
+            pixBackgroundNormMorph(
+                self.ptr,
+                std::ptr::null_mut(),
+                REDUCTION,
+                SEL_SIZE,
+                BACKGROUND_TARGET,
+            )
+        };
+
+        (!normalised.is_null())
+            .then(|| Pix { ptr: normalised })
+            .ok_or(TesseractError::NullPointerError)
+    }
+
+    /// Returns a sharpened copy of this image using Leptonica's
+    /// `pixUnsharpMasking`.
+    ///
+    /// `halfwidth` is the half-size of the blur kernel (e.g. 1–5).
+    /// `fract` is the sharpening fraction in the range 0.0–1.0; values
+    /// around 0.3–0.5 produce visible sharpening without artefacts.
+    /// A zero resolution on the source is replaced with a fallback first.
+    ///
+    /// # Errors
+    ///
+    /// Returns `TesseractError::NullPointerError` if `pixUnsharpMasking`
+    /// returns null.
+    ///
+    /// # Examples
+    ///
+    /// ```rust,no_run
+    /// # use kreuzberg_tesseract::Pix;
+    /// # let rgb = vec![128u8; 64 * 64 * 3];
+    /// # let pix = Pix::from_raw_rgb(&rgb, 64, 64).unwrap();
+    /// let sharpened = pix.unsharp_mask(2, 0.4).unwrap();
+    /// ```
+    pub fn unsharp_mask(&self, halfwidth: i32, fract: f32) -> Result<Pix> {
+        self.ensure_valid_resolution();
+
+        // SAFETY: self.ptr is valid and non-null. The source stays owned by
+        // this wrapper; the returned handle is null-checked before use.
+        let sharpened = unsafe { pixUnsharpMasking(self.ptr, halfwidth, fract) };
+        if sharpened.is_null() {
+            return Err(TesseractError::NullPointerError);
+        }
+        Ok(Pix { ptr: sharpened })
+    }
+
+    /// Returns a copy of this image resized by `sx` horizontally and `sy`
+    /// vertically, using Leptonica's `pixScale`.
+    ///
+    /// Leptonica automatically chooses the best scaling algorithm based on
+    /// the scale factors and bit depth (area mapping for downscaling,
+    /// linear interpolation for upscaling).
+    ///
+    /// # Errors
+    ///
+    /// Returns `TesseractError::NullPointerError` if `pixScale` returns null.
+    ///
+    /// # Examples
+    ///
+    /// ```rust,no_run
+    /// # use kreuzberg_tesseract::Pix;
+    /// # let rgb = vec![255u8; 40 * 40 * 3];
+    /// # let pix = Pix::from_raw_rgb(&rgb, 40, 40).unwrap();
+    /// let upscaled = pix.scale(2.0, 2.0).unwrap();
+    /// assert_eq!(upscaled.width(), 80);
+    /// assert_eq!(upscaled.height(), 80);
+    /// ```
+    pub fn scale(&self, sx: f32, sy: f32) -> Result<Pix> {
+        // SAFETY: self.ptr is valid and non-null. pixScale() returns a
+        // separate handle, which is null-checked below.
+        let scaled = unsafe { pixScale(self.ptr, sx, sy) };
+        match scaled.is_null() {
+            true => Err(TesseractError::NullPointerError),
+            false => Ok(Pix { ptr: scaled }),
+        }
+    }
+
+    /// Extracts the rectangle `(x, y, w, h)` from this image as a new Pix.
+    ///
+    /// Coordinates are in pixel space, with `(x, y)` being the top-left corner
+    /// of the region. A temporary Leptonica BOX is created for the call and
+    /// released again before returning.
+    ///
+    /// # Errors
+    ///
+    /// Returns `TesseractError::NullPointerError` if either `boxCreate` or
+    /// `pixClipRectangle` returns null.
+    pub fn clip_rectangle(&self, x: i32, y: i32, w: i32, h: i32) -> Result<Pix> {
+        // SAFETY: boxCreate only takes plain integers and returns a BOX
+        // handle or null.
+        let mut region = unsafe { boxCreate(x, y, w, h) };
+        if region.is_null() {
+            return Err(TesseractError::NullPointerError);
+        }
+
+        // SAFETY: self.ptr is a valid Pix and `region` is the non-null BOX
+        // created above. Null for `pboxc` means the clipped box is not wanted.
+        let clipped = unsafe { pixClipRectangle(self.ptr, region, std::ptr::null_mut()) };
+
+        // SAFETY: `region` is the BOX allocated above and is not used again.
+        unsafe { boxDestroy(&mut region) };
+
+        if clipped.is_null() {
+            return Err(TesseractError::NullPointerError);
+        }
+        Ok(Pix { ptr: clipped })
+    }
+
+    /// Returns the number of connected components Leptonica finds in this
+    /// 1 bpp (binary) image.
+    ///
+    /// `connectivity` should be 4 or 8.
+    ///
+    /// # Errors
+    ///
+    /// Returns `TesseractError::OcrError` if `pixCountConnComp` reports a
+    /// non-zero status (e.g., wrong bit depth — image must be 1 bpp).
+    pub fn count_connected_components(&self, connectivity: i32) -> Result<i32> {
+        let mut components = 0_i32;
+        // SAFETY: self.ptr is a valid Pix, and `components` is a local that
+        // outlives the call and serves as the out-parameter.
+        let status = unsafe { pixCountConnComp(self.ptr, connectivity, &mut components) };
+        match status {
+            0 => Ok(components),
+            _ => Err(TesseractError::OcrError),
+        }
+    }
+
+    /// Returns an 8 bpp grayscale version of this 32 bpp RGB image.
+    ///
+    /// All three channel weights are passed as 0.0, which makes Leptonica fall
+    /// back to its built-in default weights. A zero resolution on the source
+    /// is replaced with a fallback first.
+    ///
+    /// NOTE(review): the defaults were described here as approx. 0.299 / 0.587 /
+    /// 0.114 and as "equal weights" on the FFI declaration; upstream appears
+    /// to use 0.3 / 0.5 / 0.2 — confirm against the bundled Leptonica.
+    ///
+    /// # Errors
+    ///
+    /// Returns `TesseractError::NullPointerError` if `pixConvertRGBToGray`
+    /// returns null (e.g. the source is not 32 bpp).
+    ///
+    /// # Examples
+    ///
+    /// ```rust,no_run
+    /// # use kreuzberg_tesseract::Pix;
+    /// # let rgb = vec![100u8, 150u8, 200u8].repeat(10 * 10);
+    /// # let pix = Pix::from_raw_rgb(&rgb, 10, 10).unwrap();
+    /// let gray = pix.to_grayscale().unwrap();
+    /// assert_eq!(gray.depth(), 8);
+    /// ```
+    pub fn to_grayscale(&self) -> Result<Pix> {
+        self.ensure_valid_resolution();
+
+        // SAFETY: self.ptr is valid and non-null. The returned handle is
+        // null-checked before it is wrapped.
+        let gray = unsafe { pixConvertRGBToGray(self.ptr, 0.0, 0.0, 0.0) };
+        (!gray.is_null())
+            .then(|| Pix { ptr: gray })
+            .ok_or(TesseractError::NullPointerError)
+    }
+
+    // -----------------------------------------------------------------------
+    // Accessors
+    // -----------------------------------------------------------------------
+
+    /// Returns the raw Leptonica `PIX *` pointer.
+    ///
+    /// Intended for passing this image to `TesseractAPI::set_image_2`.
+    /// Ownership is not transferred: the wrapper keeps the pointer and still
+    /// releases it on drop.
+    ///
+    /// # Safety
+    ///
+    /// This method is itself safe to call; the requirements below apply to
+    /// whatever is done with the returned pointer.
+    ///
+    /// The caller must ensure the `Pix` outlives any use of the returned
+    /// pointer.  `TessBaseAPISetImage2` **borrows** the pointer — it does not
+    /// take ownership — so the `Pix` must remain alive until after
+    /// `TessBaseAPIRecognize` (or any other Tesseract call that consumes the
+    /// image data) has completed.  Dropping the `Pix` while Tesseract holds
+    /// the pointer will result in a use-after-free.
+    ///
+    /// The caller must **not** free the returned pointer; `Pix::drop` is
+    /// solely responsible for deallocation via `pixDestroy`.
+    pub fn as_ptr(&self) -> *mut c_void {
+        self.ptr
+    }
+
+    /// Returns the width of the image in pixels.
+    ///
+    /// The value is whatever Leptonica's `pixGetWidth` reports for the
+    /// wrapped Pix, hence the signed `i32` return type.
+    ///
+    /// # Examples
+    ///
+    /// ```rust,no_run
+    /// # use kreuzberg_tesseract::Pix;
+    /// # let pix = Pix::from_raw_rgb(&vec![0u8; 8 * 6 * 3], 8, 6).unwrap();
+    /// assert_eq!(pix.width(), 8);
+    /// ```
+    pub fn width(&self) -> i32 {
+        // SAFETY: self.ptr is a valid non-null Pix. pixGetWidth() is a pure
+        // read of the Pix header struct; it does not mutate any state.
+        unsafe { pixGetWidth(self.ptr) }
+    }
+
+    /// Returns the height of the image in pixels.
+    ///
+    /// The value is whatever Leptonica's `pixGetHeight` reports for the
+    /// wrapped Pix, hence the signed `i32` return type.
+    ///
+    /// # Examples
+    ///
+    /// ```rust,no_run
+    /// # use kreuzberg_tesseract::Pix;
+    /// # let pix = Pix::from_raw_rgb(&vec![0u8; 8 * 6 * 3], 8, 6).unwrap();
+    /// assert_eq!(pix.height(), 6);
+    /// ```
+    pub fn height(&self) -> i32 {
+        // SAFETY: self.ptr is a valid non-null Pix. pixGetHeight() is a pure
+        // read of the Pix header struct.
+        unsafe { pixGetHeight(self.ptr) }
+    }
+
+    /// Returns the bit depth of the image (1, 8, or 32 for this module's usage).
+    ///
+    /// In this module, `from_raw_rgb` creates 32 bpp images, `to_grayscale`
+    /// yields 8 bpp and `adaptive_threshold` yields 1 bpp.
+    ///
+    /// # Examples
+    ///
+    /// ```rust,no_run
+    /// # use kreuzberg_tesseract::Pix;
+    /// # let pix = Pix::from_raw_rgb(&vec![0u8; 4 * 4 * 3], 4, 4).unwrap();
+    /// assert_eq!(pix.depth(), 32);
+    /// ```
+    pub fn depth(&self) -> i32 {
+        // SAFETY: self.ptr is a valid non-null Pix. pixGetDepth() is a pure
+        // read of the Pix header struct.
+        unsafe { pixGetDepth(self.ptr) }
+    }
+}
+
+// ---------------------------------------------------------------------------
+// Drop implementation
+// ---------------------------------------------------------------------------
+
+#[cfg(any(feature = "build-tesseract", feature = "build-tesseract-wasm"))]
+impl Drop for Pix {
+    /// Releases the wrapped Leptonica image, if any.
+    fn drop(&mut self) {
+        // Nothing to release for a null handle.
+        if self.ptr.is_null() {
+            return;
+        }
+        // SAFETY: self.ptr is a non-null Leptonica PIX held by this wrapper.
+        // pixDestroy() follows the double-pointer convention: it releases the
+        // image and nulls out `self.ptr`, so a repeated call would find a
+        // null handle and be skipped by the guard above.
+        unsafe { pixDestroy(&mut self.ptr) };
+    }
+}
+
+// ---------------------------------------------------------------------------
+// Tests
+// ---------------------------------------------------------------------------
+
+#[cfg(test)]
+#[cfg(any(feature = "build-tesseract", feature = "build-tesseract-wasm"))]
+mod tests {
+    use super::*;
+
+    /// Builds a `width` × `height` image in which every channel byte is `fill`.
+    fn make_rgb_pix(width: u32, height: u32, fill: u8) -> Pix {
+        let byte_count = (width * height * 3) as usize;
+        let pixels = vec![fill; byte_count];
+        Pix::from_raw_rgb(&pixels, width, height).expect("from_raw_rgb failed")
+    }
+
+    /// A freshly built image reports the requested size and 32 bpp.
+    #[test]
+    fn test_from_raw_rgb_dimensions() {
+        let image = make_rgb_pix(16, 8, 200);
+        assert_eq!(
+            (image.width(), image.height(), image.depth()),
+            (16, 8, 32)
+        );
+    }
+
+    /// A buffer that is too short for the stated size is rejected.
+    #[test]
+    fn test_from_raw_rgb_wrong_length() {
+        let too_short = vec![0u8; 10]; // 4×4 would need 48 bytes
+        let outcome = Pix::from_raw_rgb(&too_short, 4, 4);
+        assert!(matches!(
+            outcome.unwrap_err(),
+            TesseractError::InvalidImageData
+        ));
+    }
+
+    /// A zero width or a zero height is rejected.
+    #[test]
+    fn test_from_raw_rgb_zero_dimensions() {
+        for (width, height) in [(0, 4), (4, 0)] {
+            let outcome = Pix::from_raw_rgb(&[], width, height);
+            assert!(matches!(
+                outcome.unwrap_err(),
+                TesseractError::InvalidImageData
+            ));
+        }
+    }
+
+    /// The raw handle of a live image is never null.
+    #[test]
+    fn test_as_ptr_is_non_null() {
+        let image = make_rgb_pix(8, 8, 128);
+        let handle = image.as_ptr();
+        assert!(!handle.is_null());
+    }
+
+    /// Grayscale conversion keeps the size and yields 8 bpp.
+    #[test]
+    fn test_to_grayscale() {
+        let colour = make_rgb_pix(32, 32, 150);
+        let gray = colour.to_grayscale().expect("to_grayscale failed");
+        assert_eq!((gray.width(), gray.height()), (32, 32));
+        assert_eq!(gray.depth(), 8);
+    }
+
+    /// Scaling by 2 in both directions doubles both dimensions.
+    #[test]
+    fn test_scale_up() {
+        let small = make_rgb_pix(20, 10, 100);
+        let doubled = small.scale(2.0, 2.0).expect("scale failed");
+        assert_eq!(doubled.width(), 40);
+        assert_eq!(doubled.height(), 20);
+    }
+
+    /// Sharpening does not change the image size.
+    #[test]
+    fn test_unsharp_mask_returns_same_dimensions() {
+        let input = make_rgb_pix(32, 32, 200);
+        let output = input.unsharp_mask(2, 0.4).expect("unsharp_mask failed");
+        assert_eq!((output.width(), output.height()), (32, 32));
+    }
+
+    /// Thresholding a grayscale image yields a 1 bpp result.
+    #[test]
+    fn test_adaptive_threshold_produces_1bpp() {
+        let binary = make_rgb_pix(64, 64, 180)
+            .to_grayscale()
+            .expect("to_grayscale failed")
+            .adaptive_threshold(32, 32)
+            .expect("adaptive_threshold failed");
+        assert_eq!(binary.depth(), 1);
+    }
+}
--- a/crates/kreuzberg-tesseract/src/lib.rs
+++ b/crates/kreuzberg-tesseract/src/lib.rs
@@ -0,0 +1,218 @@
+#![cfg_attr(
+    not(any(feature = "build-tesseract", feature = "build-tesseract-wasm")),
+    allow(unused_variables, dead_code)
+)]
+#![allow(clippy::arc_with_non_send_sync)]
+#![allow(clippy::missing_transmute_annotations)]
+#![allow(clippy::type_complexity)]
+#![allow(clippy::new_without_default)]
+#![allow(clippy::not_unsafe_ptr_arg_deref)]
+#![allow(clippy::cmp_null)]
+
+//! # kreuzberg-tesseract
+//!
+//! `kreuzberg-tesseract` provides safe Rust bindings for Tesseract OCR with built-in compilation
+//! of Tesseract and Leptonica libraries. This crate aims to make OCR functionality
+//! easily accessible in Rust projects while handling the complexity of interfacing
+//! with the underlying C++ libraries.
+//!
+//! ## Usage
+//!
+//! Here's a basic example of how to use `kreuzberg-tesseract`:
+//!
+//! ```rust
+//! use std::path::PathBuf;
+//! use std::error::Error;
+//! use kreuzberg_tesseract::TesseractAPI;
+//!
+//! fn get_default_tessdata_dir() -> PathBuf {
+//!     if cfg!(target_os = "macos") {
+//!         let home_dir = std::env::var("HOME").expect("HOME environment variable not set");
+//!         PathBuf::from(home_dir)
+//!             .join("Library")
+//!             .join("Application Support")
+//!             .join("kreuzberg-tesseract")
+//!             .join("tessdata")
+//!     } else if cfg!(target_os = "linux") {
+//!         let home_dir = std::env::var("HOME").expect("HOME environment variable not set");
+//!         PathBuf::from(home_dir)
+//!             .join(".kreuzberg-tesseract")
+//!             .join("tessdata")
+//!     } else if cfg!(target_os = "windows") {
+//!         PathBuf::from(std::env::var("APPDATA").expect("APPDATA environment variable not set"))
+//!             .join("kreuzberg-tesseract")
+//!             .join("tessdata")
+//!     } else {
+//!         panic!("Unsupported operating system");
+//!     }
+//! }
+//!
+//! fn get_tessdata_dir() -> PathBuf {
+//!     match std::env::var("TESSDATA_PREFIX") {
+//!         Ok(dir) => {
+//!             let path = PathBuf::from(dir);
+//!             let path = if path.ends_with("tessdata") { path } else { path.join("tessdata") };
+//!             println!("Using TESSDATA_PREFIX directory: {:?}", path);
+//!             path
+//!         }
+//!         Err(_) => {
+//!             let default_dir = get_default_tessdata_dir();
+//!             println!(
+//!                 "TESSDATA_PREFIX not set, using default directory: {:?}",
+//!                 default_dir
+//!             );
+//!             default_dir
+//!         }
+//!     }
+//! }
+//!
+//! fn main() -> Result<(), Box<dyn Error>> {
+//!     let api = TesseractAPI::new()?;
+//!
+//!     // Get tessdata directory (uses default location or TESSDATA_PREFIX if set)
+//!     let tessdata_dir = get_tessdata_dir();
+//!     api.init(tessdata_dir.to_str().unwrap(), "eng")?;
+//!
+//!     let width = 24;
+//!     let height = 24;
+//!     let bytes_per_pixel = 1;
+//!     let bytes_per_line = width * bytes_per_pixel;
+//!
+//!     // Initialize image data with all white pixels
+//!     let mut image_data = vec![255u8; width * height];
+//!
+//!     // Draw number 9 with clearer distinction
+//!     for y in 4..19 {
+//!         for x in 7..17 {
+//!             // Top bar
+//!             if y == 4 && x >= 8 && x <= 15 {
+//!                 image_data[y * width + x] = 0;
+//!             }
+//!             // Top curve left side
+//!             if y >= 4 && y <= 10 && x == 7 {
+//!                 image_data[y * width + x] = 0;
+//!             }
+//!             // Top curve right side
+//!             if y >= 4 && y <= 11 && x == 16 {
+//!                 image_data[y * width + x] = 0;
+//!             }
+//!             // Middle bar
+//!             if y == 11 && x >= 8 && x <= 15 {
+//!                 image_data[y * width + x] = 0;
+//!             }
+//!             // Bottom right vertical line
+//!             if y >= 11 && y <= 18 && x == 16 {
+//!                 image_data[y * width + x] = 0;
+//!             }
+//!             // Bottom bar
+//!             if y == 18 && x >= 8 && x <= 15 {
+//!                 image_data[y * width + x] = 0;
+//!             }
+//!         }
+//!     }
+//!
+//!     // Set the image data
+//!     api.set_image(&image_data, width.try_into().unwrap(), height.try_into().unwrap(), bytes_per_pixel.try_into().unwrap(), bytes_per_line.try_into().unwrap())?;
+//!
+//!     // Set whitelist for digits only
+//!     api.set_variable("tessedit_char_whitelist", "0123456789")?;
+//!
+//!     // Set PSM mode to single character
+//!     api.set_variable("tessedit_pageseg_mode", "10")?;
+//!
+//!     // Get the recognized text
+//!     let text = api.get_utf8_text()?;
+//!     println!("Recognized text: {}", text.trim());
+//!
+//!     Ok(())
+//! }
+//! ```
+/// Declares foreign functions once and expands them into the `extern` block that
+/// matches the compilation target.
+///
+/// The input is a list of body-less function declarations. Each may carry
+/// attributes and a visibility, takes named arguments (a trailing comma is
+/// accepted) and has an optional return type.
+///
+/// * On native targets (anything but `wasm32`) the declarations are emitted inside
+///   `unsafe extern "C-unwind"`, so that C++ exceptions raised in
+///   Tesseract/Leptonica may unwind through the call.
+/// * On `wasm32` they are emitted inside `unsafe extern "C"`, because the LLVM
+///   backend does not support `cleanupret` / C++ unwinding there.
+macro_rules! ffi_extern {
+    (
+        $(
+            $(#[$meta:meta])*
+            $vis:vis fn $name:ident($($arg:ident : $ty:ty),* $(,)?) $(-> $ret:ty)?;
+        )*
+    ) => {
+        // Native targets: unwinding-aware ABI.
+        #[cfg(not(target_arch = "wasm32"))]
+        unsafe extern "C-unwind" {
+            $(
+                $(#[$meta])*
+                $vis fn $name($($arg : $ty),*) $(-> $ret)?;
+            )*
+        }
+
+        // WASM: identical declarations under the plain C ABI.
+        #[cfg(target_arch = "wasm32")]
+        unsafe extern "C" {
+            $(
+                $(#[$meta])*
+                $vis fn $name($($arg : $ty),*) $(-> $ret)?;
+            )*
+        }
+    };
+}
+
+pub use error::{Result, TesseractError};
+mod error;
+
+// WASM shim: replace `__cxa_atexit` with a no-op. The WASI SDK's `__cxa_atexit` calls
+// `calloc` during C++ static initialization, which crashes because dlmalloc's heap
+// isn't properly set up for wasm32-unknown-unknown. WASM modules never exit normally,
+// so atexit handlers are unnecessary.
+#[cfg(all(target_arch = "wasm32", not(target_os = "emscripten")))]
+mod wasm_compat {
+    use core::ffi::c_void;
+
+    /// Signature of the destructor callback handed to `__cxa_atexit`.
+    type AtExitCallback = unsafe extern "C" fn(*mut c_void);
+
+    /// No-op replacement for `__cxa_atexit`.
+    ///
+    /// All three arguments are ignored and nothing is registered; the function
+    /// always returns `0`.
+    #[unsafe(no_mangle)]
+    pub unsafe extern "C" fn __cxa_atexit(
+        _func: Option<AtExitCallback>,
+        _arg: *mut c_void,
+        _dso_handle: *mut c_void,
+    ) -> i32 {
+        // Report success without recording the callback.
+        0
+    }
+}
+mod page_iterator;
+pub use page_iterator::{BlockInfo, PageIterator, ParaInfo};
+mod result_iterator;
+pub use result_iterator::{FontAttributes, ResultIterator, WordData};
+mod choice_iterator;
+pub use choice_iterator::ChoiceIterator;
+mod monitor;
+pub use monitor::TessMonitor;
+mod result_renderer;
+pub use result_renderer::TessResultRenderer;
+mod mutable_iterator;
+pub use mutable_iterator::MutableIterator;
+mod enums;
+pub use enums::{
+    TessOrientation, TessPageIteratorLevel, TessPageSegMode, TessParagraphJustification, TessPolyBlockType,
+    TessTextlineOrder, TessWritingDirection,
+};
+mod api;
+pub use api::{BoundingBoxArray, TesseractAPI};
+pub mod leptonica;
+#[cfg(any(feature = "build-tesseract", feature = "build-tesseract-wasm"))]
+pub use leptonica::Pix;
+
+/// Returns the English `eng.traineddata` blob embedded at compile time, if any.
+///
+/// * With the `bundle-tessdata-eng` feature enabled the result is `Some`, holding
+///   the bytes of `TESSDATA_PREFIX_BUNDLED/tessdata/eng.traineddata`. This is the
+///   `tessdata_fast` variant (~4 MB) downloaded by `build.rs`. Embedding it lets
+///   WASM builds drive Tesseract OCR without filesystem access or runtime fetches.
+/// * With the feature disabled the result is `None`.
+pub fn bundled_eng_traineddata() -> Option<&'static [u8]> {
+    // Exactly one of the two cfg-gated blocks survives and becomes the return value.
+    #[cfg(feature = "bundle-tessdata-eng")]
+    {
+        const ENG_TRAINEDDATA: &[u8] = include_bytes!(concat!(
+            env!("TESSDATA_PREFIX_BUNDLED"),
+            "/tessdata/eng.traineddata"
+        ));
+        Some(ENG_TRAINEDDATA)
+    }
+
+    #[cfg(not(feature = "bundle-tessdata-eng"))]
+    {
+        None
+    }
+}
--- a/crates/kreuzberg-tesseract/src/monitor.rs
+++ b/crates/kreuzberg-tesseract/src/monitor.rs
@@ -0,0 +1,68 @@
+use crate::error::{Result, TesseractError};
+use std::os::raw::{c_int, c_void};
+use std::sync::{Arc, Mutex};
+
+/// Owning wrapper around the monitor pointer returned by `TessMonitorCreate`.
+///
+/// The pointer is kept behind a mutex and is passed to `TessMonitorDelete` when
+/// the wrapper is dropped.
+pub struct TessMonitor {
+    // Raw monitor pointer; every use in this module happens with the mutex locked.
+    handle: Arc<Mutex<*mut c_void>>,
+}
+
+// Raw pointers are neither `Send` nor `Sync`, hence the manual impls below.
+// NOTE(review): soundness relies on the native monitor object being usable from any
+// thread as long as access is serialised by the mutex — confirm against Tesseract.
+unsafe impl Send for TessMonitor {}
+unsafe impl Sync for TessMonitor {}
+
+impl TessMonitor {
+    /// Allocates a native monitor via `TessMonitorCreate` and wraps it.
+    ///
+    /// # Returns
+    ///
+    /// A `TessMonitor` owning the returned pointer. The pointer is stored as-is;
+    /// it is not checked for null.
+    pub fn new() -> Self {
+        let raw = unsafe { TessMonitorCreate() };
+        let handle = Arc::new(Mutex::new(raw));
+        Self { handle }
+    }
+
+    /// Forwards a deadline to the native monitor via `TessMonitorSetDeadlineMSecs`.
+    ///
+    /// # Arguments
+    ///
+    /// * `deadline` - Deadline in milliseconds.
+    ///
+    /// # Errors
+    ///
+    /// Returns `TesseractError::MutexLockError` if the handle mutex cannot be locked.
+    pub fn set_deadline(&self, deadline: i32) -> Result<()> {
+        match self.handle.lock() {
+            Ok(guard) => {
+                // The guard stays alive for the duration of the native call.
+                unsafe { TessMonitorSetDeadlineMSecs(*guard, deadline) };
+                Ok(())
+            }
+            Err(_) => Err(TesseractError::MutexLockError),
+        }
+    }
+
+    /// Reads the current progress value from the native monitor.
+    ///
+    /// # Returns
+    ///
+    /// The value returned by `TessMonitorGetProgress`, as an `i32`.
+    ///
+    /// # Errors
+    ///
+    /// Returns `TesseractError::MutexLockError` if the handle mutex cannot be locked.
+    pub fn get_progress(&self) -> Result<i32> {
+        let guard = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
+        let progress = unsafe { TessMonitorGetProgress(*guard) };
+        Ok(progress)
+    }
+}
+
+impl Drop for TessMonitor {
+    /// Releases the native monitor with `TessMonitorDelete`.
+    ///
+    /// If the mutex cannot be locked the pointer is left untouched.
+    fn drop(&mut self) {
+        let Ok(guard) = self.handle.lock() else {
+            return;
+        };
+        unsafe { TessMonitorDelete(*guard) };
+    }
+}
+
+// Native monitor entry points used by `TessMonitor`.
+// NOTE(review): signatures are presumably meant to mirror the Tesseract C API —
+// confirm against the linked library's header.
+ffi_extern! {
+    // Allocates a monitor; called from `TessMonitor::new`.
+    pub fn TessMonitorCreate() -> *mut c_void;
+    // Releases a monitor; called from `Drop for TessMonitor`.
+    pub fn TessMonitorDelete(monitor: *mut c_void);
+    // Sets the deadline in milliseconds; called from `TessMonitor::set_deadline`.
+    pub fn TessMonitorSetDeadlineMSecs(monitor: *mut c_void, deadline: c_int);
+    // Reads the progress value; called from `TessMonitor::get_progress`.
+    pub fn TessMonitorGetProgress(monitor: *mut c_void) -> c_int;
+}
--- a/crates/kreuzberg-tesseract/src/mutable_iterator.rs
+++ b/crates/kreuzberg-tesseract/src/mutable_iterator.rs
@@ -0,0 +1,197 @@
+use crate::error::{Result, TesseractError};
+use std::ffi::CStr;
+use std::os::raw::{c_char, c_void};
+use std::sync::Arc;
+use std::sync::Mutex;
+
+use crate::result_iterator::{
+    TessResultIteratorConfidence, TessResultIteratorGetUTF8Text, TessResultIteratorNext,
+    TessResultIteratorSymbolIsDropcap, TessResultIteratorSymbolIsSubscript, TessResultIteratorSymbolIsSuperscript,
+    TessResultIteratorWordFontAttributes, TessResultIteratorWordIsFromDictionary, TessResultIteratorWordIsNumeric,
+    TessResultIteratorWordRecognitionLanguage,
+};
+
+/// Owning wrapper around a native iterator pointer.
+///
+/// The pointer is handed to the `TessResultIterator*` functions by the methods of
+/// this type and is released with `TessResultIteratorDelete` on drop.
+pub struct MutableIterator {
+    // Raw iterator pointer; every use in this module happens with the mutex locked.
+    handle: Arc<Mutex<*mut c_void>>,
+}
+
+// Raw pointers are neither `Send` nor `Sync`, hence the manual impls below.
+// NOTE(review): soundness relies on the native iterator being usable from any
+// thread as long as access is serialised by the mutex — confirm against Tesseract.
+unsafe impl Send for MutableIterator {}
+unsafe impl Sync for MutableIterator {}
+
+impl MutableIterator {
+    /// Wraps an existing native iterator pointer.
+    ///
+    /// The wrapper takes over the pointer: it is passed to
+    /// `TessResultIteratorDelete` when the wrapper is dropped.
+    ///
+    /// # Arguments
+    ///
+    /// * `handle` - Pointer to the native iterator. It is stored as-is and is not
+    ///   checked for null.
+    pub fn new(handle: *mut c_void) -> Self {
+        let shared = Arc::new(Mutex::new(handle));
+        Self { handle: shared }
+    }
+
+    /// Gets the UTF-8 text at the iterator's current position.
+    ///
+    /// # Arguments
+    ///
+    /// * `level` - Level of the text.
+    ///
+    /// # Returns
+    ///
+    /// An owned copy of the text returned by `TessResultIteratorGetUTF8Text`.
+    ///
+    /// # Errors
+    ///
+    /// * `TesseractError::MutexError` if the handle mutex cannot be locked.
+    /// * `TesseractError::NullPointerError` if the native call returns a null pointer.
+    /// * A UTF-8 conversion error if the returned bytes are not valid UTF-8.
+    pub fn get_utf8_text(&self, level: i32) -> Result<String> {
+        let handle = self.handle.lock().map_err(|_| TesseractError::MutexError)?;
+        let text_ptr = unsafe { TessResultIteratorGetUTF8Text(*handle, level) };
+        if text_ptr.is_null() {
+            return Err(TesseractError::NullPointerError);
+        }
+
+        // Copy the text (or capture the decoding error) while the buffer is still alive.
+        // The `?` is deliberately deferred until after the buffer has been released, so
+        // that invalid UTF-8 no longer leaks the native allocation.
+        let decoded = unsafe { CStr::from_ptr(text_ptr) }.to_str().map(str::to_owned);
+        unsafe { TessDeleteText(text_ptr as *mut c_char) };
+
+        Ok(decoded?)
+    }
+
+    /// Returns the confidence reported by `TessResultIteratorConfidence` for the
+    /// current position.
+    ///
+    /// # Arguments
+    ///
+    /// * `level` - Level of the confidence.
+    ///
+    /// # Errors
+    ///
+    /// Returns `TesseractError::MutexError` if the handle mutex cannot be locked.
+    pub fn confidence(&self, level: i32) -> Result<f32> {
+        let guard = self.handle.lock().map_err(|_| TesseractError::MutexError)?;
+        let score = unsafe { TessResultIteratorConfidence(*guard, level) };
+        Ok(score)
+    }
+
+    /// Returns the recognition language of the current word.
+    ///
+    /// The string returned by `TessResultIteratorWordRecognitionLanguage` is copied
+    /// into an owned `String`; the native pointer is not released here.
+    ///
+    /// # Errors
+    ///
+    /// * `TesseractError::MutexError` if the handle mutex cannot be locked.
+    /// * `TesseractError::NullPointerError` if the native call returns a null pointer.
+    /// * A UTF-8 conversion error if the returned bytes are not valid UTF-8.
+    pub fn word_recognition_language(&self) -> Result<String> {
+        let guard = self.handle.lock().map_err(|_| TesseractError::MutexError)?;
+
+        let raw_lang = unsafe { TessResultIteratorWordRecognitionLanguage(*guard) };
+        if raw_lang.is_null() {
+            return Err(TesseractError::NullPointerError);
+        }
+
+        let language = unsafe { CStr::from_ptr(raw_lang) }.to_str()?;
+        Ok(language.to_owned())
+    }
+
+    /// Reads the font attributes reported for the current word.
+    ///
+    /// # Returns
+    ///
+    /// On success, the tuple `(is_bold, is_italic, is_underlined, is_monospace,
+    /// is_serif, is_smallcaps, pointsize, font_id)`.
+    ///
+    /// # Errors
+    ///
+    /// * `TesseractError::MutexError` if the handle mutex cannot be locked.
+    /// * `TesseractError::InvalidParameterError` if the native call returns `0`.
+    pub fn word_font_attributes(&self) -> Result<(bool, bool, bool, bool, bool, bool, i32, i32)> {
+        let guard = self.handle.lock().map_err(|_| TesseractError::MutexError)?;
+
+        // Out-parameters written by the native call; the flags come back as integers.
+        let (mut bold, mut italic, mut underlined) = (0, 0, 0);
+        let (mut monospace, mut serif, mut smallcaps) = (0, 0, 0);
+        let (mut point_size, mut font) = (0, 0);
+
+        let status = unsafe {
+            TessResultIteratorWordFontAttributes(
+                *guard,
+                &mut bold,
+                &mut italic,
+                &mut underlined,
+                &mut monospace,
+                &mut serif,
+                &mut smallcaps,
+                &mut point_size,
+                &mut font,
+            )
+        };
+
+        if status == 0 {
+            return Err(TesseractError::InvalidParameterError);
+        }
+
+        Ok((
+            bold != 0,
+            italic != 0,
+            underlined != 0,
+            monospace != 0,
+            serif != 0,
+            smallcaps != 0,
+            point_size,
+            font,
+        ))
+    }
+
+    /// Reports whether the current word is from the dictionary.
+    ///
+    /// # Returns
+    ///
+    /// `Ok(true)` when `TessResultIteratorWordIsFromDictionary` returns a non-zero
+    /// value, `Ok(false)` otherwise.
+    ///
+    /// # Errors
+    ///
+    /// Returns `TesseractError::MutexError` if the handle mutex cannot be locked.
+    pub fn word_is_from_dictionary(&self) -> Result<bool> {
+        let guard = self.handle.lock().map_err(|_| TesseractError::MutexError)?;
+        let flag = unsafe { TessResultIteratorWordIsFromDictionary(*guard) };
+        Ok(flag != 0)
+    }
+
+    /// Reports whether the current word is numeric.
+    ///
+    /// # Returns
+    ///
+    /// `Ok(true)` when `TessResultIteratorWordIsNumeric` returns a non-zero value,
+    /// `Ok(false)` otherwise.
+    ///
+    /// # Errors
+    ///
+    /// Returns `TesseractError::MutexError` if the handle mutex cannot be locked.
+    pub fn word_is_numeric(&self) -> Result<bool> {
+        let guard = self.handle.lock().map_err(|_| TesseractError::MutexError)?;
+        let flag = unsafe { TessResultIteratorWordIsNumeric(*guard) };
+        Ok(flag != 0)
+    }
+
+    /// Reports whether the current symbol is superscript.
+    ///
+    /// # Returns
+    ///
+    /// `Ok(true)` when `TessResultIteratorSymbolIsSuperscript` returns a non-zero
+    /// value, `Ok(false)` otherwise.
+    ///
+    /// # Errors
+    ///
+    /// Returns `TesseractError::MutexError` if the handle mutex cannot be locked.
+    pub fn symbol_is_superscript(&self) -> Result<bool> {
+        let guard = self.handle.lock().map_err(|_| TesseractError::MutexError)?;
+        let flag = unsafe { TessResultIteratorSymbolIsSuperscript(*guard) };
+        Ok(flag != 0)
+    }
+
+    /// Reports whether the current symbol is subscript.
+    ///
+    /// # Returns
+    ///
+    /// `Ok(true)` when `TessResultIteratorSymbolIsSubscript` returns a non-zero
+    /// value, `Ok(false)` otherwise.
+    ///
+    /// # Errors
+    ///
+    /// Returns `TesseractError::MutexError` if the handle mutex cannot be locked.
+    pub fn symbol_is_subscript(&self) -> Result<bool> {
+        let guard = self.handle.lock().map_err(|_| TesseractError::MutexError)?;
+        let flag = unsafe { TessResultIteratorSymbolIsSubscript(*guard) };
+        Ok(flag != 0)
+    }
+
+    /// Reports whether the current symbol is a dropcap.
+    ///
+    /// # Returns
+    ///
+    /// `Ok(true)` when `TessResultIteratorSymbolIsDropcap` returns a non-zero value,
+    /// `Ok(false)` otherwise.
+    ///
+    /// # Errors
+    ///
+    /// Returns `TesseractError::MutexError` if the handle mutex cannot be locked.
+    pub fn symbol_is_dropcap(&self) -> Result<bool> {
+        let guard = self.handle.lock().map_err(|_| TesseractError::MutexError)?;
+        let flag = unsafe { TessResultIteratorSymbolIsDropcap(*guard) };
+        Ok(flag != 0)
+    }
+
+    /// Advances the iterator to the next element at the given level.
+    ///
+    /// # Arguments
+    ///
+    /// * `level` - Level of the iterator.
+    ///
+    /// # Returns
+    ///
+    /// `Ok(true)` when `TessResultIteratorNext` returns a non-zero value,
+    /// `Ok(false)` otherwise.
+    ///
+    /// # Errors
+    ///
+    /// Returns `TesseractError::MutexError` if the handle mutex cannot be locked.
+    pub fn next(&self, level: i32) -> Result<bool> {
+        let guard = self.handle.lock().map_err(|_| TesseractError::MutexError)?;
+        let advanced = unsafe { TessResultIteratorNext(*guard, level) };
+        Ok(advanced != 0)
+    }
+}
+
+impl Drop for MutableIterator {
+    /// Releases the native iterator with `TessResultIteratorDelete`.
+    ///
+    /// If the mutex cannot be locked the pointer is left untouched.
+    fn drop(&mut self) {
+        let Ok(guard) = self.handle.lock() else {
+            return;
+        };
+        unsafe { TessResultIteratorDelete(*guard) };
+    }
+}
+
+// Native entry points declared by this module; the remaining `TessResultIterator*`
+// functions it calls are imported from `crate::result_iterator`.
+ffi_extern! {
+    // Releases an iterator; called from `Drop for MutableIterator`.
+    pub fn TessResultIteratorDelete(handle: *mut c_void);
+    // Releases a text buffer; called from `MutableIterator::get_utf8_text`.
+    pub fn TessDeleteText(text: *mut c_char);
+}
--- a/crates/kreuzberg-tesseract/src/page_iterator.rs
+++ b/crates/kreuzberg-tesseract/src/page_iterator.rs
@@ -0,0 +1,421 @@
+use crate::TesseractError;
+use crate::enums::{
+    TessOrientation, TessPageIteratorLevel, TessParagraphJustification, TessPolyBlockType, TessTextlineOrder,
+    TessWritingDirection,
+};
+use crate::error::Result;
+use std::os::raw::{c_float, c_int, c_void};
+use std::sync::Arc;
+use std::sync::Mutex;
+
+/// Block-level layout information from Tesseract.
+///
+/// `PageIterator::extract_all_blocks` produces one value per block for which a
+/// bounding box could be obtained.
+// NOTE(review): the coordinates are presumably image pixels — confirm against Tesseract.
+#[derive(Debug, Clone)]
+pub struct BlockInfo {
+    /// Block classification, converted from the raw block-type integer.
+    pub block_type: TessPolyBlockType,
+    /// Left edge of the block's bounding box.
+    pub left: i32,
+    /// Top edge of the block's bounding box.
+    pub top: i32,
+    /// Right edge of the block's bounding box.
+    pub right: i32,
+    /// Bottom edge of the block's bounding box.
+    pub bottom: i32,
+}
+
+/// Paragraph-level information from Tesseract.
+///
+/// `PageIterator::extract_all_paragraphs` produces one value per paragraph for
+/// which both the paragraph info and the bounding box could be obtained.
+// NOTE(review): coordinates and indent are presumably in image pixels — confirm
+// against Tesseract.
+#[derive(Debug, Clone)]
+pub struct ParaInfo {
+    /// Paragraph justification, converted from the raw integer value.
+    pub justification: TessParagraphJustification,
+    /// `true` when the native list-item flag is non-zero.
+    pub is_list_item: bool,
+    /// `true` when the native crown flag is non-zero.
+    pub is_crown: bool,
+    /// First-line indent as reported by the native call.
+    pub first_line_indent: i32,
+    /// Left edge of the paragraph's bounding box.
+    pub left: i32,
+    /// Top edge of the paragraph's bounding box.
+    pub top: i32,
+    /// Right edge of the paragraph's bounding box.
+    pub right: i32,
+    /// Bottom edge of the paragraph's bounding box.
+    pub bottom: i32,
+}
+
+/// Owning wrapper around a native page-iterator pointer.
+///
+/// The pointer is handed to the `TessPageIterator*` functions by the methods of
+/// this type and is released with `TessPageIteratorDelete` on drop.
+pub struct PageIterator {
+    /// Raw iterator pointer, guarded by a mutex. The field is public, so code
+    /// outside this module can reach the pointer as well.
+    pub handle: Arc<Mutex<*mut c_void>>,
+}
+
+// Raw pointers are neither `Send` nor `Sync`, hence the manual impls below.
+// NOTE(review): soundness relies on the native iterator being usable from any
+// thread as long as access is serialised by the mutex — confirm against Tesseract.
+unsafe impl Send for PageIterator {}
+unsafe impl Sync for PageIterator {}
+
+impl PageIterator {
+    /// Wraps an existing native page-iterator pointer.
+    ///
+    /// The wrapper takes over the pointer: it is passed to
+    /// `TessPageIteratorDelete` when the wrapper is dropped.
+    ///
+    /// # Arguments
+    ///
+    /// * `handle` - Pointer to the native page iterator. It is stored as-is and is
+    ///   not checked for null.
+    ///
+    /// # Returns
+    ///
+    /// The new `PageIterator`.
+    pub fn new(handle: *mut c_void) -> Self {
+        let shared = Arc::new(Mutex::new(handle));
+        Self { handle: shared }
+    }
+
+    /// Resets the iterator by calling `TessPageIteratorBegin`.
+    ///
+    /// # Errors
+    ///
+    /// Returns `TesseractError::MutexLockError` if the handle mutex cannot be locked.
+    pub fn begin(&self) -> Result<()> {
+        match self.handle.lock() {
+            Ok(guard) => {
+                unsafe { TessPageIteratorBegin(*guard) };
+                Ok(())
+            }
+            Err(_) => Err(TesseractError::MutexLockError),
+        }
+    }
+
+    /// Advances the iterator to the next element at the given level.
+    ///
+    /// # Arguments
+    ///
+    /// * `level` - Level of the iterator.
+    ///
+    /// # Returns
+    ///
+    /// `Ok(true)` when `TessPageIteratorNext` returns a non-zero value, `Ok(false)`
+    /// otherwise.
+    ///
+    /// # Errors
+    ///
+    /// Returns `TesseractError::MutexLockError` if the handle mutex cannot be locked.
+    pub fn next(&self, level: TessPageIteratorLevel) -> Result<bool> {
+        let guard = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
+        let advanced = unsafe { TessPageIteratorNext(*guard, level as c_int) };
+        Ok(advanced != 0)
+    }
+
+    /// Reports whether the iterator sits at the beginning of an element of `level`.
+    ///
+    /// # Arguments
+    ///
+    /// * `level` - Level of the iterator.
+    ///
+    /// # Returns
+    ///
+    /// `Ok(true)` when `TessPageIteratorIsAtBeginningOf` returns a non-zero value,
+    /// `Ok(false)` otherwise.
+    ///
+    /// # Errors
+    ///
+    /// Returns `TesseractError::MutexLockError` if the handle mutex cannot be locked.
+    pub fn is_at_beginning_of(&self, level: TessPageIteratorLevel) -> Result<bool> {
+        let guard = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
+        let at_start = unsafe { TessPageIteratorIsAtBeginningOf(*guard, level as c_int) };
+        Ok(at_start != 0)
+    }
+
+    /// Reports whether the iterator sits at the final `element` within `level`.
+    ///
+    /// # Arguments
+    ///
+    /// * `level` - Level of the iterator.
+    /// * `element` - Element of the iterator.
+    ///
+    /// # Returns
+    ///
+    /// `Ok(true)` when `TessPageIteratorIsAtFinalElement` returns a non-zero value,
+    /// `Ok(false)` otherwise.
+    ///
+    /// # Errors
+    ///
+    /// Returns `TesseractError::MutexLockError` if the handle mutex cannot be locked.
+    pub fn is_at_final_element(&self, level: TessPageIteratorLevel, element: TessPageIteratorLevel) -> Result<bool> {
+        let guard = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
+        let at_end = unsafe { TessPageIteratorIsAtFinalElement(*guard, level as c_int, element as c_int) };
+        Ok(at_end != 0)
+    }
+
+    /// Returns the bounding box of the current element at the given level.
+    ///
+    /// # Arguments
+    ///
+    /// * `level` - Level of the bounding box.
+    ///
+    /// # Returns
+    ///
+    /// On success, the tuple `(left, top, right, bottom)`.
+    ///
+    /// # Errors
+    ///
+    /// * `TesseractError::MutexLockError` if the handle mutex cannot be locked.
+    /// * `TesseractError::InvalidParameterError` if the native call returns `0`.
+    pub fn bounding_box(&self, level: TessPageIteratorLevel) -> Result<(i32, i32, i32, i32)> {
+        // Out-parameters written by the native call.
+        let (mut left, mut top, mut right, mut bottom) = (0, 0, 0, 0);
+
+        let guard = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
+        let found = unsafe {
+            TessPageIteratorBoundingBox(*guard, level as c_int, &mut left, &mut top, &mut right, &mut bottom)
+        };
+
+        if found != 0 {
+            Ok((left, top, right, bottom))
+        } else {
+            Err(TesseractError::InvalidParameterError)
+        }
+    }
+
+    /// Returns the block type at the iterator's current position.
+    ///
+    /// # Returns
+    ///
+    /// The raw value from `TessPageIteratorBlockType`, converted with
+    /// `TessPolyBlockType::from_int`.
+    ///
+    /// # Errors
+    ///
+    /// Returns `TesseractError::MutexLockError` if the handle mutex cannot be locked.
+    pub fn block_type(&self) -> Result<TessPolyBlockType> {
+        let guard = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
+        let raw_type = unsafe { TessPageIteratorBlockType(*guard) };
+        let converted = TessPolyBlockType::from_int(raw_type);
+        Ok(converted)
+    }
+
+    /// Returns the baseline of the current element at the given level.
+    ///
+    /// # Arguments
+    ///
+    /// * `level` - Level of the baseline, as a raw integer.
+    ///
+    /// # Returns
+    ///
+    /// On success, the tuple `(x1, y1, x2, y2)`.
+    ///
+    /// # Errors
+    ///
+    /// * `TesseractError::MutexLockError` if the handle mutex cannot be locked.
+    /// * `TesseractError::InvalidParameterError` if the native call returns `0`.
+    pub fn baseline(&self, level: i32) -> Result<(i32, i32, i32, i32)> {
+        // Out-parameters written by the native call: the two end points of the baseline.
+        let (mut x1, mut y1) = (0, 0);
+        let (mut x2, mut y2) = (0, 0);
+
+        let guard = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
+        let found = unsafe { TessPageIteratorBaseline(*guard, level, &mut x1, &mut y1, &mut x2, &mut y2) };
+
+        if found != 0 {
+            Ok((x1, y1, x2, y2))
+        } else {
+            Err(TesseractError::InvalidParameterError)
+        }
+    }
+
+    /// Returns the orientation data at the iterator's current position.
+    ///
+    /// # Returns
+    ///
+    /// On success, the tuple `(orientation, writing_direction, textline_order,
+    /// deskew_angle)`; the first three are converted from raw integers with the
+    /// respective `from_int` constructors.
+    ///
+    /// # Errors
+    ///
+    /// * `TesseractError::MutexLockError` if the handle mutex cannot be locked.
+    /// * `TesseractError::InvalidParameterError` if the native call returns `0`.
+    pub fn orientation(&self) -> Result<(TessOrientation, TessWritingDirection, TessTextlineOrder, f32)> {
+        // Out-parameters written by the native call.
+        let (mut raw_orientation, mut raw_direction, mut raw_order) = (0, 0, 0);
+        let mut deskew = 0.0;
+
+        let guard = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
+        let found = unsafe {
+            TessPageIteratorOrientation(
+                *guard,
+                &mut raw_orientation,
+                &mut raw_direction,
+                &mut raw_order,
+                &mut deskew,
+            )
+        };
+
+        if found == 0 {
+            return Err(TesseractError::InvalidParameterError);
+        }
+
+        Ok((
+            TessOrientation::from_int(raw_orientation),
+            TessWritingDirection::from_int(raw_direction),
+            TessTextlineOrder::from_int(raw_order),
+            deskew,
+        ))
+    }
+
+    /// Collects every block on the page while holding the mutex once.
+    ///
+    /// The iterator is first reset with `TessPageIteratorBegin` and then walked at
+    /// `RIL_BLOCK` level. For each position the block type and bounding box are
+    /// read; positions without a bounding box are skipped.
+    ///
+    /// # Returns
+    ///
+    /// `Ok(Vec<BlockInfo>)` with one entry per block that has a bounding box.
+    ///
+    /// # Errors
+    ///
+    /// Returns `TesseractError::MutexLockError` if the handle mutex cannot be locked.
+    pub fn extract_all_blocks(&self) -> Result<Vec<BlockInfo>> {
+        let guard = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
+        // The guard stays alive until the function returns, so the copied pointer is
+        // only used while the lock is held.
+        let iter_ptr = *guard;
+        let level = TessPageIteratorLevel::RIL_BLOCK as c_int;
+        let mut collected = Vec::new();
+
+        // SAFETY: relies on `iter_ptr` being the valid, non-null TessPageIterator pointer
+        // owned by this struct. The mutex is held for the whole walk, so no other call
+        // made through this wrapper touches the iterator concurrently.
+        unsafe { TessPageIteratorBegin(iter_ptr) };
+
+        loop {
+            // SAFETY: same pointer invariant as above; the call only reads the current
+            // iterator position and returns an integer enum value.
+            let raw_type = unsafe { TessPageIteratorBlockType(iter_ptr) };
+
+            let (mut left, mut top, mut right, mut bottom): (c_int, c_int, c_int, c_int) = (0, 0, 0, 0);
+
+            // SAFETY: the four out-pointers refer to locals that outlive the call.
+            let has_bbox = unsafe {
+                TessPageIteratorBoundingBox(iter_ptr, level, &mut left, &mut top, &mut right, &mut bottom)
+            } != 0;
+
+            if has_bbox {
+                let block_type = TessPolyBlockType::from_int(raw_type);
+                collected.push(BlockInfo {
+                    block_type,
+                    left,
+                    top,
+                    right,
+                    bottom,
+                });
+            }
+
+            // SAFETY: same pointer invariant; the iterator is advanced in place and the
+            // call returns 0 once no element is left at this level.
+            let advanced = unsafe { TessPageIteratorNext(iter_ptr, level) } != 0;
+            if !advanced {
+                break;
+            }
+        }
+
+        Ok(collected)
+    }
+
+    /// Collects every paragraph on the page while holding the mutex once.
+    ///
+    /// The iterator is first reset with `TessPageIteratorBegin` and then walked at
+    /// `RIL_PARA` level. For each position the paragraph info and bounding box are
+    /// read; a position is kept only if both native calls succeed.
+    ///
+    /// # Returns
+    ///
+    /// `Ok(Vec<ParaInfo>)` with one entry per paragraph for which both calls succeeded.
+    ///
+    /// # Errors
+    ///
+    /// Returns `TesseractError::MutexLockError` if the handle mutex cannot be locked.
+    pub fn extract_all_paragraphs(&self) -> Result<Vec<ParaInfo>> {
+        let guard = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
+        // The guard stays alive until the function returns, so the copied pointer is
+        // only used while the lock is held.
+        let iter_ptr = *guard;
+        let level = TessPageIteratorLevel::RIL_PARA as c_int;
+        let mut collected = Vec::new();
+
+        // SAFETY: relies on `iter_ptr` being the valid, non-null TessPageIterator pointer
+        // owned by this struct; the mutex is held for the entire loop.
+        unsafe { TessPageIteratorBegin(iter_ptr) };
+
+        loop {
+            // The native call expects BOOL* (int*) for the two flags. A Rust `bool` is one
+            // byte while a C int is four, so `c_int` temporaries are used and converted
+            // afterwards.
+            let mut raw_justification: c_int = 0;
+            let mut raw_list_item: c_int = 0;
+            let mut raw_crown: c_int = 0;
+            let mut first_line_indent: c_int = 0;
+
+            // SAFETY: all out-pointers refer to locals that outlive the call.
+            let has_para = unsafe {
+                TessPageIteratorParagraphInfo(
+                    iter_ptr,
+                    &mut raw_justification,
+                    &mut raw_list_item,
+                    &mut raw_crown,
+                    &mut first_line_indent,
+                )
+            } != 0;
+
+            let (mut left, mut top, mut right, mut bottom): (c_int, c_int, c_int, c_int) = (0, 0, 0, 0);
+
+            // SAFETY: the four out-pointers refer to locals that outlive the call.
+            // The bounding box is queried even when the paragraph query failed.
+            let has_bbox = unsafe {
+                TessPageIteratorBoundingBox(iter_ptr, level, &mut left, &mut top, &mut right, &mut bottom)
+            } != 0;
+
+            if has_para && has_bbox {
+                collected.push(ParaInfo {
+                    justification: TessParagraphJustification::from_int(raw_justification),
+                    is_list_item: raw_list_item != 0,
+                    is_crown: raw_crown != 0,
+                    first_line_indent,
+                    left,
+                    top,
+                    right,
+                    bottom,
+                });
+            }
+
+            // SAFETY: same pointer invariant; the iterator is advanced in place and the
+            // call returns 0 once no element is left at this level.
+            let advanced = unsafe { TessPageIteratorNext(iter_ptr, level) } != 0;
+            if !advanced {
+                break;
+            }
+        }
+
+        Ok(collected)
+    }
+
+    /// Returns the paragraph information at the iterator's current position.
+    ///
+    /// # Returns
+    ///
+    /// On success, the tuple `(justification, is_list_item, is_crown,
+    /// first_line_indent)`.
+    ///
+    /// # Errors
+    ///
+    /// * `TesseractError::MutexLockError` if the handle mutex cannot be locked.
+    /// * `TesseractError::InvalidParameterError` if the native call returns `0`.
+    pub fn paragraph_info(&self) -> Result<(TessParagraphJustification, bool, bool, i32)> {
+        // The native call expects BOOL* (int*) for the two flags. A Rust `bool` is one
+        // byte while a C int is four, so `c_int` temporaries are used and converted
+        // afterwards.
+        let mut raw_justification = 0;
+        let mut raw_list_item: c_int = 0;
+        let mut raw_crown: c_int = 0;
+        let mut indent = 0;
+
+        let guard = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
+        let found = unsafe {
+            TessPageIteratorParagraphInfo(
+                *guard,
+                &mut raw_justification,
+                &mut raw_list_item,
+                &mut raw_crown,
+                &mut indent,
+            )
+        };
+
+        if found == 0 {
+            return Err(TesseractError::InvalidParameterError);
+        }
+
+        let justification = TessParagraphJustification::from_int(raw_justification);
+        Ok((justification, raw_list_item != 0, raw_crown != 0, indent))
+    }
+}
+
+impl Drop for PageIterator {
+    /// Releases the native iterator with `TessPageIteratorDelete`.
+    ///
+    /// If the mutex cannot be locked the pointer is left untouched.
+    fn drop(&mut self) {
+        let Ok(guard) = self.handle.lock() else {
+            return;
+        };
+        unsafe { TessPageIteratorDelete(*guard) };
+    }
+}
+
+// Native page-iterator entry points used by `PageIterator`.
+// NOTE(review): signatures are presumably meant to mirror the Tesseract C API —
+// confirm against the linked library's header.
+ffi_extern! {
+    // Releases an iterator; called from `Drop for PageIterator`.
+    pub fn TessPageIteratorDelete(handle: *mut c_void);
+    // Resets the iterator; called from `begin` and the `extract_all_*` methods.
+    pub fn TessPageIteratorBegin(handle: *mut c_void);
+    // Advances the iterator; callers treat a non-zero result as "advanced".
+    pub fn TessPageIteratorNext(handle: *mut c_void, level: c_int) -> c_int;
+    // The two predicates below are interpreted as booleans (non-zero = true).
+    pub fn TessPageIteratorIsAtBeginningOf(handle: *mut c_void, level: c_int) -> c_int;
+    pub fn TessPageIteratorIsAtFinalElement(handle: *mut c_void, level: c_int, element: c_int) -> c_int;
+    // Writes the four edges through the out-pointers; callers treat 0 as failure.
+    pub fn TessPageIteratorBoundingBox(
+        handle: *mut c_void,
+        level: c_int,
+        left: *mut c_int,
+        top: *mut c_int,
+        right: *mut c_int,
+        bottom: *mut c_int,
+    ) -> c_int;
+    // Returns a raw block-type integer, converted with `TessPolyBlockType::from_int`.
+    pub fn TessPageIteratorBlockType(handle: *mut c_void) -> c_int;
+    // Writes two end points through the out-pointers; callers treat 0 as failure.
+    pub fn TessPageIteratorBaseline(
+        handle: *mut c_void,
+        level: c_int,
+        x1: *mut c_int,
+        y1: *mut c_int,
+        x2: *mut c_int,
+        y2: *mut c_int,
+    ) -> c_int;
+    // Writes three raw enum integers and a float angle; callers treat 0 as failure.
+    pub fn TessPageIteratorOrientation(
+        handle: *mut c_void,
+        orientation: *mut c_int,
+        writing_direction: *mut c_int,
+        textline_order: *mut c_int,
+        deskew_angle: *mut c_float,
+    ) -> c_int;
+    // Declared here but not called anywhere in this module.
+    pub fn TessBaseAPIGetIterator(handle: *mut c_void) -> *mut c_void;
+    // The two flag out-parameters are `c_int`, not `bool`; callers convert with `!= 0`.
+    pub fn TessPageIteratorParagraphInfo(
+        handle: *mut c_void,
+        justification: *mut c_int,
+        is_list_item: *mut c_int,
+        is_crown: *mut c_int,
+        first_line_indent: *mut c_int,
+    ) -> c_int;
+}
--- a/crates/kreuzberg-tesseract/src/result_iterator.rs
+++ b/crates/kreuzberg-tesseract/src/result_iterator.rs
@@ -0,0 +1,589 @@
+use crate::api::TessDeleteText;
+use crate::enums::TessPageIteratorLevel;
+use crate::error::{Result, TesseractError};
+use std::ffi::CStr;
+use std::os::raw::{c_char, c_float, c_int, c_void};
+use std::sync::{Arc, Mutex};
+
+/// Font attributes detected by Tesseract for a word.
+///
+/// Populated from the output parameters of `TessResultIteratorWordFontAttributes`:
+/// each boolean is `true` when the corresponding C flag came back non-zero, and the
+/// two integers are stored exactly as returned.
+#[derive(Debug, Clone)]
+pub struct FontAttributes {
+    /// `true` when the `is_bold` output flag was non-zero.
+    pub is_bold: bool,
+    /// `true` when the `is_italic` output flag was non-zero.
+    pub is_italic: bool,
+    /// `true` when the `is_underlined` output flag was non-zero.
+    pub is_underlined: bool,
+    /// `true` when the `is_monospace` output flag was non-zero.
+    pub is_monospace: bool,
+    /// `true` when the `is_serif` output flag was non-zero.
+    pub is_serif: bool,
+    /// `true` when the `is_smallcaps` output flag was non-zero.
+    pub is_smallcaps: bool,
+    /// Raw `pointsize` output value (presumably the font size in points — confirm
+    /// against the Tesseract documentation).
+    pub pointsize: i32,
+    /// Raw `font_id` output value as reported by Tesseract.
+    pub font_id: i32,
+}
+
+/// Complete word data extracted in a single mutex lock.
+///
+/// Produced by `extract_word_data_unlocked`, which queries Tesseract at the
+/// `RIL_WORD` level for every field.
+#[derive(Debug, Clone)]
+pub struct WordData {
+    /// UTF-8 text of the word, copied out of the buffer returned by Tesseract.
+    pub text: String,
+    /// Left edge reported by `TessPageIteratorBoundingBox` (presumably image pixel
+    /// coordinates — confirm).
+    pub left: i32,
+    /// Top edge reported by `TessPageIteratorBoundingBox`.
+    pub top: i32,
+    /// Right edge reported by `TessPageIteratorBoundingBox`.
+    pub right: i32,
+    /// Bottom edge reported by `TessPageIteratorBoundingBox`.
+    pub bottom: i32,
+    /// Value returned by `TessResultIteratorConfidence` for the word.
+    pub confidence: f32,
+    /// Font attributes, or `None` when the font-attribute query returned zero.
+    pub font_attrs: Option<FontAttributes>,
+}
+
+/// Wrapper around a raw Tesseract result-iterator pointer.
+///
+/// Every method locks `handle` before passing the pointer to the C API, and the
+/// `Drop` impl passes it to `TessResultIteratorDelete`.
+pub struct ResultIterator {
+    /// Raw iterator pointer guarded by a mutex. The field is public, so code outside
+    /// this type can clone the `Arc` or lock the mutex directly.
+    pub handle: Arc<Mutex<*mut c_void>>,
+}
+
+// SAFETY: NOTE(review) — raw pointers are neither `Send` nor `Sync`, so these impls are
+// manual assertions. Every method in this file reaches the pointer through the mutex,
+// which serialises access made via this type; whether the underlying Tesseract iterator
+// may be used from a thread other than the one that created it is an assumption that
+// cannot be verified from this file — confirm against Tesseract's threading guarantees.
+unsafe impl Send for ResultIterator {}
+unsafe impl Sync for ResultIterator {}
+
+impl ResultIterator {
+    /// Wraps a raw result-iterator pointer.
+    ///
+    /// The pointer is stored as given (no null check happens here) behind an
+    /// `Arc<Mutex<..>>`.
+    ///
+    /// # Arguments
+    ///
+    /// * `handle` - Raw iterator pointer to wrap.
+    ///
+    /// # Returns
+    ///
+    /// A `ResultIterator` holding `handle`.
+    pub fn new(handle: *mut c_void) -> Self {
+        let handle = Arc::new(Mutex::new(handle));
+        Self { handle }
+    }
+
+    /// Gets the UTF-8 text of the element at the current iterator position.
+    ///
+    /// # Arguments
+    ///
+    /// * `level` - Iterator level whose text is requested; passed to Tesseract as `c_int`.
+    ///
+    /// # Returns
+    ///
+    /// Returns the text as an owned `String`.
+    ///
+    /// # Errors
+    ///
+    /// * `TesseractError::MutexLockError` - the handle mutex is poisoned.
+    /// * `TesseractError::NullPointerError` - Tesseract returned a null text pointer.
+    /// * A UTF-8 conversion error - the returned bytes are not valid UTF-8. The
+    ///   Tesseract buffer is released in this case too.
+    pub fn get_utf8_text(&self, level: TessPageIteratorLevel) -> Result<String> {
+        let handle = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
+        // SAFETY: the pointer is read under the mutex and passed through unchanged; this
+        // relies on it being the live iterator supplied to `new`. The call yields either
+        // null or a C string that must later be released with `TessDeleteText`.
+        let text_ptr = unsafe { TessResultIteratorGetUTF8Text(*handle, level as c_int) };
+        if text_ptr.is_null() {
+            return Err(TesseractError::NullPointerError);
+        }
+        // SAFETY: `text_ptr` is non-null (checked above) and is expected to point at a
+        // NUL-terminated string that stays valid until `TessDeleteText` is called below.
+        // The conversion result owns its data (`String` or an error value), so nothing
+        // borrows from the buffer once this statement finishes.
+        let decoded = unsafe { CStr::from_ptr(text_ptr) }.to_str().map(str::to_owned);
+        // Release the buffer BEFORE propagating a conversion error; returning early with
+        // `?` on the line above would leak the allocation on invalid UTF-8.
+        // SAFETY: `text_ptr` came from `TessResultIteratorGetUTF8Text`, is freed exactly
+        // once here, and is not used afterwards.
+        unsafe { TessDeleteText(text_ptr as *mut c_char) };
+        Ok(decoded?)
+    }
+
+    /// Reads Tesseract's confidence value for the element at the current position.
+    ///
+    /// # Arguments
+    ///
+    /// * `level` - Iterator level the confidence is requested for.
+    ///
+    /// # Returns
+    ///
+    /// The value returned by `TessResultIteratorConfidence`, unmodified.
+    ///
+    /// # Errors
+    ///
+    /// `TesseractError::MutexLockError` if the handle mutex is poisoned.
+    pub fn confidence(&self, level: TessPageIteratorLevel) -> Result<f32> {
+        let guard = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
+        let raw = *guard;
+        // SAFETY: the pointer is read under the mutex and handed to Tesseract unchanged;
+        // soundness relies on it still being the live iterator supplied to `new`. The
+        // call returns a plain float, so no memory is handed back to Rust.
+        let score = unsafe { TessResultIteratorConfidence(raw, level as c_int) };
+        Ok(score)
+    }
+
+    /// Returns the recognition language Tesseract reports for the current word.
+    ///
+    /// The string is copied into an owned `String`; the pointer returned by Tesseract
+    /// is not freed here.
+    ///
+    /// # Errors
+    ///
+    /// * `TesseractError::MutexLockError` - the handle mutex is poisoned.
+    /// * `TesseractError::NullPointerError` - Tesseract returned a null pointer.
+    /// * A UTF-8 conversion error - the returned bytes are not valid UTF-8.
+    pub fn word_recognition_language(&self) -> Result<String> {
+        let guard = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
+        // SAFETY: the pointer is read under the mutex and handed to Tesseract unchanged;
+        // soundness relies on it still being the live iterator supplied to `new`.
+        let lang_ptr = unsafe { TessResultIteratorWordRecognitionLanguage(*guard) };
+        if lang_ptr.is_null() {
+            return Err(TesseractError::NullPointerError);
+        }
+        // SAFETY: `lang_ptr` is non-null (checked above) and is expected to reference a
+        // NUL-terminated string; it is only read, while the mutex guard is still held.
+        let language = unsafe { CStr::from_ptr(lang_ptr) }.to_str()?;
+        Ok(language.to_owned())
+    }
+
+    /// Queries the font attributes Tesseract reports for the current word.
+    ///
+    /// # Returns
+    ///
+    /// A tuple `(is_bold, is_italic, is_underlined, is_monospace, is_serif, is_smallcaps,
+    /// pointsize, font_id)`. Each flag is `true` when the matching output value was
+    /// non-zero; `pointsize` and `font_id` are returned as written by Tesseract.
+    ///
+    /// # Errors
+    ///
+    /// * `TesseractError::MutexLockError` - the handle mutex is poisoned.
+    /// * `TesseractError::InvalidParameterError` - the FFI call returned zero.
+    ///
+    /// NOTE(review): the zero check goes through an extern declared as returning `c_int`;
+    /// upstream `capi.h` appears to declare a `const char*` (font name) return — confirm.
+    pub fn word_font_attributes(&self) -> Result<(bool, bool, bool, bool, bool, bool, i32, i32)> {
+        let guard = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
+
+        // Out-parameters for the FFI call; each flag is converted to `bool` afterwards.
+        let (mut bold, mut italic, mut underlined) = (0, 0, 0);
+        let (mut monospace, mut serif, mut smallcaps) = (0, 0, 0);
+        let (mut size, mut id) = (0, 0);
+
+        // SAFETY: the pointer is read under the mutex and handed to Tesseract unchanged
+        // (relies on it being the live iterator supplied to `new`). The eight out-pointers
+        // reference distinct local variables that outlive the call.
+        let status = unsafe {
+            TessResultIteratorWordFontAttributes(
+                *guard,
+                &mut bold,
+                &mut italic,
+                &mut underlined,
+                &mut monospace,
+                &mut serif,
+                &mut smallcaps,
+                &mut size,
+                &mut id,
+            )
+        };
+
+        if status == 0 {
+            return Err(TesseractError::InvalidParameterError);
+        }
+
+        Ok((
+            bold != 0,
+            italic != 0,
+            underlined != 0,
+            monospace != 0,
+            serif != 0,
+            smallcaps != 0,
+            size,
+            id,
+        ))
+    }
+
+    /// Reports whether Tesseract flags the current word as a dictionary word.
+    ///
+    /// # Returns
+    ///
+    /// `Ok(true)` when `TessResultIteratorWordIsFromDictionary` returns non-zero,
+    /// `Ok(false)` when it returns zero.
+    ///
+    /// # Errors
+    ///
+    /// `TesseractError::MutexLockError` if the handle mutex is poisoned.
+    pub fn word_is_from_dictionary(&self) -> Result<bool> {
+        let guard = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
+        // SAFETY: the pointer is read under the mutex and handed to Tesseract unchanged;
+        // soundness relies on it still being the live iterator supplied to `new`.
+        let flag = unsafe { TessResultIteratorWordIsFromDictionary(*guard) };
+        Ok(flag != 0)
+    }
+
+    /// Reports whether Tesseract flags the current word as numeric.
+    ///
+    /// # Returns
+    ///
+    /// `Ok(true)` when `TessResultIteratorWordIsNumeric` returns non-zero,
+    /// `Ok(false)` when it returns zero.
+    ///
+    /// # Errors
+    ///
+    /// `TesseractError::MutexLockError` if the handle mutex is poisoned.
+    pub fn word_is_numeric(&self) -> Result<bool> {
+        let guard = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
+        // SAFETY: the pointer is read under the mutex and handed to Tesseract unchanged;
+        // soundness relies on it still being the live iterator supplied to `new`.
+        let flag = unsafe { TessResultIteratorWordIsNumeric(*guard) };
+        Ok(flag != 0)
+    }
+
+    /// Reports whether Tesseract flags the current symbol as superscript.
+    ///
+    /// # Returns
+    ///
+    /// `Ok(true)` when `TessResultIteratorSymbolIsSuperscript` returns non-zero,
+    /// `Ok(false)` when it returns zero.
+    ///
+    /// # Errors
+    ///
+    /// `TesseractError::MutexLockError` if the handle mutex is poisoned.
+    pub fn symbol_is_superscript(&self) -> Result<bool> {
+        let guard = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
+        // SAFETY: the pointer is read under the mutex and handed to Tesseract unchanged;
+        // soundness relies on it still being the live iterator supplied to `new`.
+        let flag = unsafe { TessResultIteratorSymbolIsSuperscript(*guard) };
+        Ok(flag != 0)
+    }
+
+    /// Reports whether Tesseract flags the current symbol as subscript.
+    ///
+    /// # Returns
+    ///
+    /// `Ok(true)` when `TessResultIteratorSymbolIsSubscript` returns non-zero,
+    /// `Ok(false)` when it returns zero.
+    ///
+    /// # Errors
+    ///
+    /// `TesseractError::MutexLockError` if the handle mutex is poisoned.
+    pub fn symbol_is_subscript(&self) -> Result<bool> {
+        let guard = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
+        // SAFETY: the pointer is read under the mutex and handed to Tesseract unchanged;
+        // soundness relies on it still being the live iterator supplied to `new`.
+        let flag = unsafe { TessResultIteratorSymbolIsSubscript(*guard) };
+        Ok(flag != 0)
+    }
+
+    /// Reports whether Tesseract flags the current symbol as a drop cap.
+    ///
+    /// # Returns
+    ///
+    /// `Ok(true)` when `TessResultIteratorSymbolIsDropcap` returns non-zero,
+    /// `Ok(false)` when it returns zero.
+    ///
+    /// # Errors
+    ///
+    /// `TesseractError::MutexLockError` if the handle mutex is poisoned.
+    pub fn symbol_is_dropcap(&self) -> Result<bool> {
+        let guard = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
+        // SAFETY: the pointer is read under the mutex and handed to Tesseract unchanged;
+        // soundness relies on it still being the live iterator supplied to `new`.
+        let flag = unsafe { TessResultIteratorSymbolIsDropcap(*guard) };
+        Ok(flag != 0)
+    }
+
+    /// Advances the iterator to the next element at the given level.
+    ///
+    /// # Arguments
+    ///
+    /// * `level` - Iterator level to step by; passed to Tesseract as `c_int`.
+    ///
+    /// # Returns
+    ///
+    /// `Ok(true)` when `TessResultIteratorNext` returns non-zero, `Ok(false)` when it
+    /// returns zero.
+    ///
+    /// # Errors
+    ///
+    /// `TesseractError::MutexLockError` if the handle mutex is poisoned.
+    pub fn next(&self, level: TessPageIteratorLevel) -> Result<bool> {
+        let guard = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
+        // SAFETY: the pointer is read under the mutex and handed to Tesseract unchanged
+        // (relies on it being the live iterator supplied to `new`). The guard is held for
+        // the whole call, so other users of this wrapper cannot touch the iterator while
+        // its position changes.
+        let advanced = unsafe { TessResultIteratorNext(*guard, level as c_int) };
+        Ok(advanced != 0)
+    }
+
+    /// Gets the current word with its bounding box and confidence as one snapshot.
+    ///
+    /// All values are read under a single acquisition of the handle mutex (through
+    /// `extract_word_data`). Locking separately for text, bounding box and confidence
+    /// would let a concurrent `next()` on another thread move the iterator between the
+    /// reads and produce a tuple that mixes two different words.
+    ///
+    /// # Returns
+    ///
+    /// Returns a tuple of (text, left, top, right, bottom, confidence) if successful.
+    ///
+    /// # Errors
+    ///
+    /// * `TesseractError::MutexLockError` - the handle mutex is poisoned.
+    /// * `TesseractError::NullPointerError` - Tesseract returned no text.
+    /// * A UTF-8 conversion error - the text is not valid UTF-8.
+    /// * `TesseractError::InvalidParameterError` - the bounding-box query returned zero.
+    pub fn get_word_with_bounds(&self) -> Result<(String, i32, i32, i32, i32, f32)> {
+        let WordData {
+            text,
+            left,
+            top,
+            right,
+            bottom,
+            confidence,
+            // Collected by the shared helper but not part of this method's tuple.
+            font_attrs: _,
+        } = self.extract_word_data()?;
+
+        Ok((text, left, top, right, bottom, confidence))
+    }
+
+    /// Steps the iterator forward by one word.
+    ///
+    /// Shorthand for calling `next` with `TessPageIteratorLevel::RIL_WORD`.
+    ///
+    /// # Returns
+    ///
+    /// `Ok(true)` if the iterator advanced, `Ok(false)` if there are no more words;
+    /// any error from `next` is passed through.
+    pub fn next_word(&self) -> Result<bool> {
+        let word_level = TessPageIteratorLevel::RIL_WORD;
+        self.next(word_level)
+    }
+
+    /// Returns the word at the iterator's current position.
+    ///
+    /// This is an alias for `get_word_with_bounds`; call it before `next()` moves the
+    /// iterator on.
+    ///
+    /// # Returns
+    /// A tuple of (text, left, top, right, bottom, confidence) if successful; errors
+    /// from `get_word_with_bounds` are passed through unchanged.
+    pub fn get_current_word(&self) -> Result<(String, i32, i32, i32, i32, f32)> {
+        self.get_word_with_bounds()
+    }
+
+    /// Queries the bounding box of the element at the current position.
+    ///
+    /// # Arguments
+    ///
+    /// * `level` - Iterator level whose box is requested; passed to Tesseract as `c_int`.
+    ///
+    /// # Returns
+    ///
+    /// `(left, top, right, bottom)` exactly as written by `TessPageIteratorBoundingBox`.
+    ///
+    /// # Errors
+    ///
+    /// * `TesseractError::MutexLockError` - the handle mutex is poisoned.
+    /// * `TesseractError::InvalidParameterError` - the FFI call returned zero.
+    pub fn get_bounding_box(&self, level: TessPageIteratorLevel) -> Result<(i32, i32, i32, i32)> {
+        let guard = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
+
+        // Out-parameters filled in by the FFI call below.
+        let (mut left, mut top, mut right, mut bottom) = (0, 0, 0, 0);
+
+        // SAFETY: the pointer is read under the mutex and handed to Tesseract unchanged
+        // (relies on it being the live iterator supplied to `new`). The four out-pointers
+        // reference distinct locals that outlive the call.
+        let status = unsafe {
+            TessPageIteratorBoundingBox(*guard, level as c_int, &mut left, &mut top, &mut right, &mut bottom)
+        };
+
+        if status == 0 {
+            return Err(TesseractError::InvalidParameterError);
+        }
+        Ok((left, top, right, bottom))
+    }
+
+    /// Collects data for every word reachable from the iterator under one mutex lock.
+    ///
+    /// The lock is taken once; the iterator is then rewound with `TessPageIteratorBegin`
+    /// and walked word by word, gathering text, bounding box, confidence and font
+    /// attributes through `extract_word_data_unlocked`. Rewinding first means earlier
+    /// partial consumption of the iterator does not cause words to be missed.
+    ///
+    /// Positions whose extraction fails with `NullPointerError`, `InvalidParameterError`
+    /// or `Utf8Error` are skipped rather than aborting the walk.
+    ///
+    /// # Returns
+    ///
+    /// A `Vec<WordData>` with one entry per successfully extracted word.
+    ///
+    /// # Errors
+    ///
+    /// `TesseractError::MutexLockError` if the handle mutex is poisoned; any extraction
+    /// error other than the three skipped kinds is returned as-is.
+    pub fn extract_all_words(&self) -> Result<Vec<WordData>> {
+        let guard = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
+        let raw = *guard;
+
+        // The same handle is passed to the page-iterator rewind function as to the
+        // result-iterator calls below.
+        // SAFETY: `raw` is read under the mutex, which stays locked until this function
+        // returns; soundness relies on it being the live iterator supplied to `new`.
+        unsafe { TessPageIteratorBegin(raw) };
+
+        let word_level = TessPageIteratorLevel::RIL_WORD as c_int;
+        let mut collected = Vec::new();
+        let mut more = true;
+        while more {
+            // The helper does not lock; the guard held above covers every call it makes.
+            match extract_word_data_unlocked(raw) {
+                Ok(word) => collected.push(word),
+                // Missing text, a failed bounding box, or non-UTF-8 text: drop this
+                // position only, so the remaining words are still returned.
+                Err(
+                    TesseractError::NullPointerError
+                    | TesseractError::InvalidParameterError
+                    | TesseractError::Utf8Error(_),
+                ) => {}
+                Err(other) => return Err(other),
+            }
+
+            // SAFETY: same handle, still under the lock; the call moves the iterator
+            // position and reports non-zero while another word exists.
+            more = unsafe { TessResultIteratorNext(raw, word_level) != 0 };
+        }
+
+        Ok(collected)
+    }
+
+    /// Reads every field of the current word under one mutex lock.
+    ///
+    /// The lock is taken once and held while `extract_word_data_unlocked` performs the
+    /// text, bounding-box, confidence and font-attribute queries, which avoids the
+    /// repeated locking of calling the individual accessors one after another.
+    ///
+    /// # Returns
+    ///
+    /// A [`WordData`] for the word at the current position.
+    ///
+    /// # Errors
+    ///
+    /// `TesseractError::MutexLockError` if the handle mutex is poisoned, otherwise any
+    /// error reported by `extract_word_data_unlocked`.
+    pub fn extract_word_data(&self) -> Result<WordData> {
+        let guard = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
+        let raw = *guard;
+        // `guard` stays alive until the end of this function, so the helper runs locked.
+        extract_word_data_unlocked(raw)
+    }
+}
+
+/// Extracts word data from a raw iterator handle without acquiring the mutex.
+///
+/// Reads the text, bounding box, confidence and font attributes of the word at the
+/// iterator's current position, all at the `RIL_WORD` level.
+///
+/// The caller MUST hold the mutex lock for the `ResultIterator` this handle belongs to
+/// before calling this function. Passing a handle that is not mutex-guarded, or calling
+/// this function concurrently on the same handle, is undefined behaviour.
+///
+/// # Errors
+///
+/// * `TesseractError::NullPointerError` - Tesseract returned a null text pointer.
+/// * A UTF-8 conversion error - the text is not valid UTF-8. The Tesseract buffer is
+///   released in this case too.
+/// * `TesseractError::InvalidParameterError` - the bounding-box query returned zero.
+///
+/// A font-attribute query that returns zero is not an error; `font_attrs` is `None`.
+fn extract_word_data_unlocked(raw: *mut c_void) -> Result<WordData> {
+    let word_level = TessPageIteratorLevel::RIL_WORD as c_int;
+
+    // SAFETY: per this function's contract the caller holds the mutex and `raw` is the
+    // iterator it guards. The call yields either null or a C string that must later be
+    // released with `TessDeleteText`.
+    let text_ptr = unsafe { TessResultIteratorGetUTF8Text(raw, word_level) };
+    if text_ptr.is_null() {
+        return Err(TesseractError::NullPointerError);
+    }
+    let text = {
+        // SAFETY: `text_ptr` is non-null (checked above) and is expected to point at a
+        // NUL-terminated string that stays valid until `TessDeleteText` below. The
+        // conversion result owns its data, so nothing borrows from the buffer afterwards.
+        let decoded = unsafe { CStr::from_ptr(text_ptr) }.to_str().map(str::to_owned);
+        // Release the buffer BEFORE propagating a conversion error; an early `?` on the
+        // line above would leak one allocation for every non-UTF-8 word.
+        // SAFETY: `text_ptr` came from `TessResultIteratorGetUTF8Text`, is freed exactly
+        // once here, and is not used afterwards.
+        unsafe { TessDeleteText(text_ptr as *mut c_char) };
+        decoded?
+    };
+
+    let mut left = 0;
+    let mut top = 0;
+    let mut right = 0;
+    let mut bottom = 0;
+    // SAFETY: same handle under the caller's lock; the four out-pointers reference
+    // distinct locals that outlive the call.
+    let bbox_result = unsafe {
+        TessPageIteratorBoundingBox(raw, word_level, &mut left, &mut top, &mut right, &mut bottom)
+    };
+    if bbox_result == 0 {
+        return Err(TesseractError::InvalidParameterError);
+    }
+
+    // SAFETY: same handle under the caller's lock; the call returns a plain float.
+    let confidence = unsafe { TessResultIteratorConfidence(raw, word_level) };
+
+    // Collect font attributes; treat any failure as absent rather than propagating the error.
+    let font_attrs = {
+        let mut is_bold = 0;
+        let mut is_italic = 0;
+        let mut is_underlined = 0;
+        let mut is_monospace = 0;
+        let mut is_serif = 0;
+        let mut is_smallcaps = 0;
+        let mut pointsize = 0;
+        let mut font_id = 0;
+        // SAFETY: same handle under the caller's lock; the eight out-pointers reference
+        // distinct locals that outlive the call.
+        let result = unsafe {
+            TessResultIteratorWordFontAttributes(
+                raw,
+                &mut is_bold,
+                &mut is_italic,
+                &mut is_underlined,
+                &mut is_monospace,
+                &mut is_serif,
+                &mut is_smallcaps,
+                &mut pointsize,
+                &mut font_id,
+            )
+        };
+        if result != 0 {
+            Some(FontAttributes {
+                is_bold: is_bold != 0,
+                is_italic: is_italic != 0,
+                is_underlined: is_underlined != 0,
+                is_monospace: is_monospace != 0,
+                is_serif: is_serif != 0,
+                is_smallcaps: is_smallcaps != 0,
+                pointsize,
+                font_id,
+            })
+        } else {
+            None
+        }
+    };
+
+    Ok(WordData {
+        text,
+        left,
+        top,
+        right,
+        bottom,
+        confidence,
+        font_attrs,
+    })
+}
+
+impl Drop for ResultIterator {
+    /// Hands the wrapped pointer to `TessResultIteratorDelete` when the wrapper is dropped.
+    ///
+    /// If the handle mutex is poisoned the lock attempt fails and the delete call is
+    /// skipped, so `drop` itself never panics on a poisoned lock; the iterator is then
+    /// left unfreed.
+    fn drop(&mut self) {
+        if let Ok(guard) = self.handle.lock() {
+            let raw = *guard;
+            // SAFETY: `drop` runs at most once per value, so this wrapper issues the delete
+            // at most once, and the pointer is not used by this wrapper afterwards.
+            // Soundness relies on `raw` being the iterator pointer supplied to `new` and on
+            // nothing else having freed it. NOTE(review): `handle` is a public `Arc`, so a
+            // clone taken elsewhere would dangle after this call — confirm no caller does so.
+            unsafe { TessResultIteratorDelete(raw) };
+        }
+    }
+}
+
+// Raw declarations of the Tesseract C API result-iterator entry points used in this
+// file. The block is compiled only when the `build-tesseract` or `build-tesseract-wasm`
+// feature is enabled.
+// `TessPageIteratorBegin` and `TessPageIteratorBoundingBox` are declared here with the
+// same signatures as in the page-iterator bindings, because this file calls them on a
+// result-iterator handle.
+// NOTE(review): `ffi_extern!` is defined elsewhere in the crate; presumably it wraps
+// these signatures in an `extern "C"` block — confirm against the macro definition.
+#[cfg(any(feature = "build-tesseract", feature = "build-tesseract-wasm"))]
+ffi_extern! {
+    pub fn TessResultIteratorDelete(handle: *mut c_void);
+    pub fn TessPageIteratorBegin(handle: *mut c_void);
+    // The returned buffer is released with `TessDeleteText` by the callers in this file.
+    pub fn TessResultIteratorGetUTF8Text(handle: *mut c_void, level: c_int) -> *mut c_char;
+    pub fn TessResultIteratorConfidence(handle: *mut c_void, level: c_int) -> c_float;
+    // The callers in this file copy the returned string and do not free it.
+    pub fn TessResultIteratorWordRecognitionLanguage(handle: *mut c_void) -> *const c_char;
+    // NOTE(review): upstream `capi.h` appears to declare this function as returning
+    // `const char*` (the font name) rather than an integer status. If so, the `c_int`
+    // declared here reads only part of the returned pointer, and the `== 0` / `!= 0`
+    // checks in the callers act as an approximate null test — verify against the
+    // Tesseract version being linked.
+    pub fn TessResultIteratorWordFontAttributes(
+        handle: *mut c_void,
+        is_bold: *mut c_int,
+        is_italic: *mut c_int,
+        is_underlined: *mut c_int,
+        is_monospace: *mut c_int,
+        is_serif: *mut c_int,
+        is_smallcaps: *mut c_int,
+        pointsize: *mut c_int,
+        font_id: *mut c_int,
+    ) -> c_int;
+    pub fn TessResultIteratorWordIsFromDictionary(handle: *mut c_void) -> c_int;
+    pub fn TessResultIteratorWordIsNumeric(handle: *mut c_void) -> c_int;
+    pub fn TessResultIteratorSymbolIsSuperscript(handle: *mut c_void) -> c_int;
+    pub fn TessResultIteratorSymbolIsSubscript(handle: *mut c_void) -> c_int;
+    pub fn TessResultIteratorSymbolIsDropcap(handle: *mut c_void) -> c_int;
+    pub fn TessResultIteratorNext(handle: *mut c_void, level: c_int) -> c_int;
+    pub fn TessPageIteratorBoundingBox(
+        handle: *mut c_void,
+        level: c_int,
+        left: *mut c_int,
+        top: *mut c_int,
+        right: *mut c_int,
+        bottom: *mut c_int,
+    ) -> c_int;
+}
--- a/crates/kreuzberg-tesseract/src/result_renderer.rs
+++ b/crates/kreuzberg-tesseract/src/result_renderer.rs
@@ -0,0 +1,212 @@
+use crate::TesseractAPI;
+use crate::error::{Result, TesseractError};
+use std::ffi::{CStr, CString};
+use std::os::raw::{c_char, c_int, c_void};
+use std::sync::Arc;
+use std::sync::Mutex;
+
+/// Wrapper around a raw Tesseract result renderer handle.
+///
+/// Instances are created by the `new_text_renderer`, `new_hocr_renderer` and
+/// `new_pdf_renderer` constructors, which store the pointer returned by Tesseract.
+/// The pointer is handed to `TessDeleteResultRenderer` when the wrapper is dropped.
+pub struct TessResultRenderer {
+    // Raw renderer pointer. Every method locks the mutex before passing the pointer to
+    // the C API, so calls made through this wrapper are serialized.
+    handle: Arc<Mutex<*mut c_void>>,
+}
+
+// SAFETY (both impls): raw pointers are neither `Send` nor `Sync`, so the compiler does
+// not derive these traits for the wrapper. Within this file the pointer is only read
+// while the surrounding `Mutex` is locked, which serializes all FFI calls.
+// NOTE(review): this additionally assumes the Tesseract renderer has no thread affinity,
+// i.e. it may be used and destroyed on a thread other than the one that created it —
+// confirm against the Tesseract documentation.
+unsafe impl Send for TessResultRenderer {}
+unsafe impl Sync for TessResultRenderer {}
+
+impl TessResultRenderer {
+    /// Wraps a raw renderer pointer handed back by a `Tess*RendererCreate` call.
+    ///
+    /// # Errors
+    ///
+    /// Returns `TesseractError::NullPointerError` if `raw` is null.
+    fn from_raw(raw: *mut c_void) -> Result<Self> {
+        if raw.is_null() {
+            return Err(TesseractError::NullPointerError);
+        }
+        Ok(Self {
+            handle: Arc::new(Mutex::new(raw)),
+        })
+    }
+
+    /// Locks the renderer pointer so it can be passed to the C API.
+    ///
+    /// # Errors
+    ///
+    /// Returns `TesseractError::MutexLockError` if the mutex is poisoned.
+    fn lock_handle(&self) -> Result<std::sync::MutexGuard<'_, *mut c_void>> {
+        self.handle.lock().map_err(|_| TesseractError::MutexLockError)
+    }
+
+    /// Creates a plain-text renderer.
+    ///
+    /// # Arguments
+    ///
+    /// * `outputbase` - Output base path passed to `TessTextRendererCreate`.
+    ///
+    /// # Errors
+    ///
+    /// Returns `TesseractError::NullByteInString` if `outputbase` contains a NUL byte,
+    /// or `TesseractError::NullPointerError` if Tesseract returns a null renderer.
+    pub fn new_text_renderer(outputbase: &str) -> Result<Self> {
+        let base = CString::new(outputbase).map_err(|_| TesseractError::NullByteInString)?;
+        // SAFETY: `base` is a valid NUL-terminated string that stays alive for the call.
+        let raw = unsafe { TessTextRendererCreate(base.as_ptr()) };
+        Self::from_raw(raw)
+    }
+
+    /// Creates an hOCR renderer.
+    ///
+    /// # Arguments
+    ///
+    /// * `outputbase` - Output base path passed to `TessHOcrRendererCreate`.
+    ///
+    /// # Errors
+    ///
+    /// Returns `TesseractError::NullByteInString` if `outputbase` contains a NUL byte,
+    /// or `TesseractError::NullPointerError` if Tesseract returns a null renderer.
+    pub fn new_hocr_renderer(outputbase: &str) -> Result<Self> {
+        let base = CString::new(outputbase).map_err(|_| TesseractError::NullByteInString)?;
+        // SAFETY: `base` is a valid NUL-terminated string that stays alive for the call.
+        let raw = unsafe { TessHOcrRendererCreate(base.as_ptr()) };
+        Self::from_raw(raw)
+    }
+
+    /// Creates a PDF renderer.
+    ///
+    /// # Arguments
+    ///
+    /// * `outputbase` - Output base path.
+    /// * `datadir` - Data directory path.
+    /// * `textonly` - Forwarded to Tesseract as `1` when `true` and `0` when `false`.
+    ///
+    /// # Errors
+    ///
+    /// Returns `TesseractError::NullByteInString` if either string contains a NUL byte,
+    /// or `TesseractError::NullPointerError` if Tesseract returns a null renderer.
+    pub fn new_pdf_renderer(outputbase: &str, datadir: &str, textonly: bool) -> Result<Self> {
+        let base = CString::new(outputbase).map_err(|_| TesseractError::NullByteInString)?;
+        let data = CString::new(datadir).map_err(|_| TesseractError::NullByteInString)?;
+        let text_only_flag = c_int::from(textonly);
+        // SAFETY: both strings are valid, NUL-terminated and stay alive for the call.
+        let raw = unsafe { TessPDFRendererCreate(base.as_ptr(), data.as_ptr(), text_only_flag) };
+        Self::from_raw(raw)
+    }
+
+    /// Begins a new document with the given title.
+    ///
+    /// # Arguments
+    ///
+    /// * `title` - Title of the document.
+    ///
+    /// # Returns
+    ///
+    /// `true` if `TessResultRendererBeginDocument` returned a non-zero value, `false`
+    /// otherwise.
+    ///
+    /// # Errors
+    ///
+    /// Returns `TesseractError::NullByteInString` if `title` contains a NUL byte, or
+    /// `TesseractError::MutexLockError` if the mutex lock fails.
+    pub fn begin_document(&self, title: &str) -> Result<bool> {
+        // The title is converted before the lock is taken, so an invalid title is
+        // reported even when the mutex is poisoned.
+        let c_title = CString::new(title).map_err(|_| TesseractError::NullByteInString)?;
+        let renderer = self.lock_handle()?;
+        // SAFETY: the renderer pointer was non-null when stored and the lock is held;
+        // `c_title` is a valid NUL-terminated string that outlives the call.
+        let status = unsafe { TessResultRendererBeginDocument(*renderer, c_title.as_ptr()) };
+        Ok(status != 0)
+    }
+
+    /// Adds the image currently held by `api` to the document.
+    ///
+    /// # Arguments
+    ///
+    /// * `api` - The TesseractAPI instance.
+    ///
+    /// # Returns
+    ///
+    /// `true` if `TessResultRendererAddImage` returned a non-zero value, `false` otherwise.
+    ///
+    /// # Errors
+    ///
+    /// Returns `TesseractError::MutexLockError` if either mutex lock fails.
+    pub fn add_image(&self, api: &TesseractAPI) -> Result<bool> {
+        // Lock order matters: the API handle is always taken before the renderer handle.
+        let api_ptr = api.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
+        let renderer = self.lock_handle()?;
+        // SAFETY: both locks are held for the duration of the call, and the renderer
+        // pointer was non-null when stored.
+        let status = unsafe { TessResultRendererAddImage(*renderer, *api_ptr) };
+        Ok(status != 0)
+    }
+
+    /// Ends the current document.
+    ///
+    /// # Returns
+    ///
+    /// `true` if `TessResultRendererEndDocument` returned a non-zero value, `false`
+    /// otherwise.
+    ///
+    /// # Errors
+    ///
+    /// Returns `TesseractError::MutexLockError` if the mutex lock fails.
+    pub fn end_document(&self) -> Result<bool> {
+        let renderer = self.lock_handle()?;
+        // SAFETY: the renderer pointer was non-null when stored and the lock is held.
+        let status = unsafe { TessResultRendererEndDocument(*renderer) };
+        Ok(status != 0)
+    }
+
+    /// Returns the renderer's file extension as an owned `String`.
+    ///
+    /// # Errors
+    ///
+    /// Returns `TesseractError::MutexLockError` if the mutex lock fails,
+    /// `TesseractError::NullPointerError` if Tesseract returns a null pointer,
+    /// or `TesseractError::Utf8Error` if the extension is not valid UTF-8.
+    pub fn get_extension(&self) -> Result<String> {
+        let renderer = self.lock_handle()?;
+        // SAFETY: the renderer pointer was non-null when stored and the lock is held.
+        let raw_ext = unsafe { TessResultRendererExtention(*renderer) };
+        if raw_ext.is_null() {
+            return Err(TesseractError::NullPointerError);
+        }
+        // SAFETY: `raw_ext` is non-null. NOTE(review): it is assumed to point at a
+        // NUL-terminated string that stays valid while the lock is held — confirm.
+        let extension = unsafe { CStr::from_ptr(raw_ext) }.to_str()?;
+        Ok(extension.to_owned())
+    }
+
+    /// Returns the document title as an owned `String`.
+    ///
+    /// # Errors
+    ///
+    /// Returns `TesseractError::MutexLockError` if the mutex lock fails,
+    /// `TesseractError::NullPointerError` if Tesseract returns a null pointer,
+    /// or `TesseractError::Utf8Error` if the title is not valid UTF-8.
+    pub fn get_title(&self) -> Result<String> {
+        let renderer = self.lock_handle()?;
+        // SAFETY: the renderer pointer was non-null when stored and the lock is held.
+        let raw_title = unsafe { TessResultRendererTitle(*renderer) };
+        if raw_title.is_null() {
+            return Err(TesseractError::NullPointerError);
+        }
+        // SAFETY: `raw_title` is non-null. NOTE(review): it is assumed to point at a
+        // NUL-terminated string that stays valid while the lock is held — confirm.
+        let title = unsafe { CStr::from_ptr(raw_title) }.to_str()?;
+        Ok(title.to_owned())
+    }
+
+    /// Returns the image number reported by `TessResultRendererImageNum`.
+    ///
+    /// # Errors
+    ///
+    /// Returns `TesseractError::MutexLockError` if the mutex lock fails.
+    pub fn get_image_num(&self) -> Result<i32> {
+        let renderer = self.lock_handle()?;
+        // SAFETY: the renderer pointer was non-null when stored and the lock is held.
+        let image_num = unsafe { TessResultRendererImageNum(*renderer) };
+        Ok(image_num)
+    }
+}
+
+impl Drop for TessResultRenderer {
+    /// Hands the renderer pointer back to Tesseract for deletion.
+    ///
+    /// If the mutex is poisoned the renderer is deliberately left undeleted; this keeps
+    /// `drop` from panicking at the cost of skipping the cleanup.
+    fn drop(&mut self) {
+        let Ok(renderer) = self.handle.lock() else {
+            return;
+        };
+        // SAFETY: the pointer was non-null when stored by a constructor, the lock is
+        // held, and the wrapper is being destroyed, so the pointer is not used again.
+        unsafe { TessDeleteResultRenderer(*renderer) };
+    }
+}
+
+// Raw declarations of the Tesseract C API renderer functions wrapped by
+// `TessResultRenderer`. Renderers are passed as untyped `*mut c_void` handles, so the
+// compiler cannot check that a pointer is valid; the wrapper methods are responsible
+// for only passing pointers obtained from the `*Create` functions below.
+ffi_extern! {
+    // Constructors: each returns a renderer pointer, which the wrapper checks for null.
+    pub fn TessTextRendererCreate(outputbase: *const c_char) -> *mut c_void;
+    pub fn TessHOcrRendererCreate(outputbase: *const c_char) -> *mut c_void;
+    pub fn TessPDFRendererCreate(outputbase: *const c_char, datadir: *const c_char, textonly: c_int) -> *mut c_void;
+    // Destructor: called once from the wrapper's `Drop` impl.
+    pub fn TessDeleteResultRenderer(renderer: *mut c_void);
+    // Document lifecycle: the wrapper treats a non-zero return value as success.
+    pub fn TessResultRendererBeginDocument(renderer: *mut c_void, title: *const c_char) -> c_int;
+    pub fn TessResultRendererAddImage(renderer: *mut c_void, api: *mut c_void) -> c_int;
+    pub fn TessResultRendererEndDocument(renderer: *mut c_void) -> c_int;
+    // Accessors: the returned strings are `const`, and the wrapper copies them rather
+    // than freeing them.
+    // NOTE(review): the spelling `Extention` appears to mirror the symbol name exported
+    // by upstream `capi.h`; confirm before "correcting" it, since a rename would break
+    // linking.
+    pub fn TessResultRendererExtention(renderer: *mut c_void) -> *const c_char;
+    pub fn TessResultRendererTitle(renderer: *mut c_void) -> *const c_char;
+    pub fn TessResultRendererImageNum(renderer: *mut c_void) -> c_int;
+}
--- a/Show More
+++ b/Show More