Nomad changes

2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions
--- a/crates/kreuzberg-tesseract/.commitlintrc.json
+++ b/crates/kreuzberg-tesseract/.commitlintrc.json
@@ -0,0 +1,13 @@
+{
+  "extends": ["@commitlint/config-conventional"],
+  "rules": {
+    "body-max-line-length": [2, "always", 100],
+    "header-max-length": [2, "always", 100],
+    "subject-case": [2, "never", ["sentence-case", "start-case", "pascal-case", "upper-case"]],
+    "type-enum": [
+      2,
+      "always",
+      ["feat", "fix", "docs", "style", "refactor", "perf", "test", "build", "ci", "chore", "revert"]
+    ]
+  }
+}
--- a/crates/kreuzberg-tesseract/.crate-ignore
+++ b/crates/kreuzberg-tesseract/.crate-ignore
@@ -0,0 +1,2 @@
+/third_party/
+/tessdata/
--- a/crates/kreuzberg-tesseract/Cargo.lock
+++ b/crates/kreuzberg-tesseract/Cargo.lock
--- a/crates/kreuzberg-tesseract/Cargo.toml
+++ b/crates/kreuzberg-tesseract/Cargo.toml
@@ -0,0 +1,64 @@
+[package]
+name = "kreuzberg-tesseract"
+version.workspace = true
+edition.workspace = true
+rust-version.workspace = true
+authors.workspace = true
+description = "Rust bindings for Tesseract OCR with cross-compilation, C++17, and caching improvements"
+license = "MIT"
+repository.workspace = true
+homepage = "https://kreuzberg.dev"
+documentation = "https://docs.kreuzberg.dev"
+readme = "README.md"
+keywords = ["tesseract", "ocr", "bindings", "vision", "recognition"]
+categories = ["external-ffi-bindings", "computer-vision", "text-processing"]
+build = "build.rs"
+links = "kreuzberg_tesseract"
+exclude = ["tessdata/*", "third_party/*"]
+
+[package.metadata.docs.rs]
+features = ["docs-only"]
+rustdoc-args = ["--cfg", "docsrs"]
+
+[package.metadata.cargo-machete]
+ignored = ["cc", "cmake", "reqwest", "zip"]
+
+[lib]
+name = "kreuzberg_tesseract"
+crate-type = ["lib"]
+
+[features]
+default = ["static-linking"]
+build-tesseract = ["cc", "cmake", "reqwest", "zip"]
+build-tesseract-wasm = ["cmake", "reqwest", "zip"]
+# Bundle eng.traineddata into the compiled crate so WASM builds can run OCR
+# without runtime tessdata loading. Uses ~4 MB of binary size (tessdata_fast).
+bundle-tessdata-eng = []
+static-linking = ["build-tesseract"]
+dynamic-linking = []
+
+[dependencies]
+thiserror = { workspace = true }
+
+[build-dependencies]
+cc = { version = "^1.2.63", optional = true }
+cmake = { version = "0.1.58", optional = true }
+zip = { version = ">=7.0.0", optional = true, default-features = false, features = [
+    "deflate-flate2-zlib-rs",
+] }
+
+[target.'cfg(not(target_os = "windows"))'.build-dependencies]
+reqwest = { workspace = true, default-features = false, features = [
+    "blocking",
+    "rustls",
+], optional = true }
+
+# Use native-tls on Windows to avoid aws-lc-sys CMake build issues with MinGW
+[target.'cfg(target_os = "windows")'.build-dependencies]
+reqwest = { workspace = true, default-features = false, features = [
+    "blocking",
+    "native-tls",
+], optional = true }
+
+[dev-dependencies]
+image = { workspace = true, features = ["png"] }
--- a/crates/kreuzberg-tesseract/LICENSE
+++ b/crates/kreuzberg-tesseract/LICENSE
@@ -0,0 +1,22 @@
+MIT License
+
+Copyright (c) 2024 Cafer Can Gündoğdu
+Copyright (c) 2025 Na'aman Hirschfeld
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
--- a/crates/kreuzberg-tesseract/README.md
+++ b/crates/kreuzberg-tesseract/README.md
@@ -0,0 +1,405 @@
+# kreuzberg-tesseract
+
+[![Bindings](https://img.shields.io/badge/Bindings-alef%20%D7%90-007ec6)](https://github.com/kreuzberg-dev/alef)
+
+Rust bindings for Tesseract OCR with built-in compilation of Tesseract and Leptonica libraries. Provides a safe and idiomatic Rust interface to Tesseract's functionality while handling the complexity of compiling the underlying C++ libraries.
+
+Based on the original [tesseract-rs](https://github.com/cafercangundogdu/tesseract-rs) by Cafer Can Gündoğdu, this maintained version adds critical improvements for production use:
+
+- **C++17 Support**: Upgraded for Tesseract 5.5.1 which requires C++17 filesystem
+- **Cross-Compilation**: Fixed CXX compiler detection for cross-platform builds
+- **Architecture Validation**: Validates target architecture before using cached libraries
+- **Windows Static Linking**: Fixed MSVC static linking issues
+- **Build Caching**: Improved caching with OUT_DIR-based cache directory
+- **MinGW Support**: Added support for MinGW toolchains
+
+## Features
+
+- Safe Rust bindings for Tesseract OCR
+- **Multiple linking options:**
+  - **Static linking** (default): Built-in compilation with no runtime dependencies
+  - **Dynamic linking**: Link to system-installed libraries for faster builds
+- Uses existing Tesseract training data (expects English data for tests)
+- High-level Rust API for common OCR tasks
+- Caching of compiled libraries for faster subsequent builds
+- Support for multiple operating systems (Linux, macOS, Windows)
+
+## Installation
+
+### Static Linking (Default)
+
+Static linking builds Tesseract and Leptonica from source and embeds them in your binary. No runtime dependencies required:
+
+```toml
+[dependencies]
+kreuzberg-tesseract = "1.0.0-rc.1"
+# or explicitly:
+kreuzberg-tesseract = { version = "1.0.0-rc.1", features = ["static-linking"] }
+```
+
+### Dynamic Linking
+
+Dynamic linking uses system-installed Tesseract and Leptonica libraries. Faster builds, but requires libraries installed on the system:
+
+```toml
+[dependencies]
+kreuzberg-tesseract = { version = "1.0.0-rc.1", features = ["dynamic-linking"], default-features = false }
+```
+
+**System requirements for dynamic linking:**
+
+- Tesseract 5.x libraries installed (`libtesseract`, `libleptonica`)
+- macOS: `brew install tesseract leptonica`
+- Ubuntu/Debian: `sudo apt-get install libtesseract-dev libleptonica-dev`
+- RHEL/CentOS/Fedora: `sudo dnf install tesseract-devel leptonica-devel`
+- Windows: Install from [Tesseract releases](https://github.com/tesseract-ocr/tesseract/releases) or vcpkg
+
+### Development Dependencies
+
+For development and testing, you'll also need these dependencies:
+
+```toml
+[dev-dependencies]
+image = "0.25.5"
+```
+
+## System Requirements
+
+### For Static Linking (Default)
+
+When building with static linking, the crate will compile Tesseract and Leptonica from source. You need:
+
+- Rust 1.85.0 or later
+- A C++ compiler (e.g., gcc, clang, MSVC on Windows)
+- CMake 3.x or later
+- Internet connection (for downloading Tesseract source code)
+
+### For Dynamic Linking
+
+When using dynamic linking with system-installed libraries, you need:
+
+- Rust 1.85.0 or later
+- Tesseract 5.x and Leptonica libraries installed on your system (see Installation section)
+
+No C++ compiler, CMake, or Internet connection is required for dynamic linking builds: nothing is downloaded or compiled from source.
+
+For a full development environment checklist (including optional tooling suggestions), see [CONTRIBUTING.md](../../CONTRIBUTING.md).
+
+## Environment Variables
+
+The following environment variables affect the build and test process:
+
+### Build Variables
+
+- `CARGO_CLEAN`: If set, cleans the cache directory before building
+- `RUSTC_WRAPPER`: If set to "sccache", enables compiler caching with sccache
+- `CC`: Compiler selection for C code (affects Linux builds)
+- `HOME` (Unix) or `APPDATA` (Windows): Used to determine cache directory location
+- `TESSERACT_RS_CACHE_DIR`: Optional override for the cache root. When unset or not writable, the build falls back to the default OS-specific directory, and if that still fails, a temporary directory under the system temp folder is used automatically.
+
+### Test Variables
+
+- `TESSDATA_PREFIX` (Optional): Path to override the default tessdata directory. If not set, the crate will use its default cache directory.
+
+## Cache and Data Directories
+
+The crate uses the following directory structure based on your operating system:
+
+- macOS: `~/Library/Application Support/tesseract-rs`
+- Linux: `~/.tesseract-rs`
+- Windows: `%APPDATA%/tesseract-rs`
+
+The cache includes:
+
+- Compiled Tesseract and Leptonica libraries
+- Third-party source code
+
+Training data is not downloaded during the build. Provide `eng.traineddata` (and any other languages you need) via `TESSDATA_PREFIX` or your system Tesseract installation.
+
+## Testing
+
+The project includes several integration tests that verify OCR functionality. To run the tests:
+
+1. Ensure you have the required test dependencies:
+
+   ```toml
+   [dev-dependencies]
+   image = "0.25.9"
+   ```
+
+2. Run the tests:
+
+   ```bash
+   cargo test
+   ```
+
+Note: Make sure `eng.traineddata` is available in your tessdata directory before running tests. If `TESSDATA_PREFIX` is not set, the tests look in the default cache location. You can point the tests at a custom tessdata directory by setting:
+
+```bash
+# Linux/macOS
+export TESSDATA_PREFIX=/path/to/custom/tessdata
+
+# Windows (PowerShell)
+$env:TESSDATA_PREFIX="C:\path\to\custom\tessdata"
+```
+
+Available test cases:
+
+- OCR on English sample images
+- Error handling and invalid input coverage
+
+Test images are sourced from the shared `test_documents/` directory in the repository:
+
+- `images/test_hello_world.png`: Simple English text
+- `tables/simple_table.png`: Basic table with English headers
+
+## Usage
+
+Here's a basic example of how to use `kreuzberg-tesseract`:
+
+```rust
+use std::path::PathBuf;
+use std::error::Error;
+use kreuzberg_tesseract::TesseractAPI;
+
+fn get_default_tessdata_dir() -> PathBuf {
+    if cfg!(target_os = "macos") {
+        let home_dir = std::env::var("HOME").expect("HOME environment variable not set");
+        PathBuf::from(home_dir)
+            .join("Library")
+            .join("Application Support")
+            .join("tesseract-rs")
+            .join("tessdata")
+    } else if cfg!(target_os = "linux") {
+        let home_dir = std::env::var("HOME").expect("HOME environment variable not set");
+        PathBuf::from(home_dir)
+            .join(".tesseract-rs")
+            .join("tessdata")
+    } else if cfg!(target_os = "windows") {
+        PathBuf::from(std::env::var("APPDATA").expect("APPDATA environment variable not set"))
+            .join("tesseract-rs")
+            .join("tessdata")
+    } else {
+        panic!("Unsupported operating system");
+    }
+}
+
+fn get_tessdata_dir() -> PathBuf {
+    match std::env::var("TESSDATA_PREFIX") {
+        Ok(dir) => {
+            let path = PathBuf::from(dir);
+            println!("Using TESSDATA_PREFIX directory: {:?}", path);
+            path
+        }
+        Err(_) => {
+            let default_dir = get_default_tessdata_dir();
+            println!(
+                "TESSDATA_PREFIX not set, using default directory: {:?}",
+                default_dir
+            );
+            default_dir
+        }
+    }
+}
+
+fn main() -> Result<(), Box<dyn Error>> {
+    let api = TesseractAPI::new()?;
+
+    // Get tessdata directory (uses default location or TESSDATA_PREFIX if set)
+    let tessdata_dir = get_tessdata_dir();
+    api.init(tessdata_dir.to_str().unwrap(), "eng")?;
+
+    let width = 24;
+    let height = 24;
+    let bytes_per_pixel = 1;
+    let bytes_per_line = width * bytes_per_pixel;
+
+    // Initialize image data with all white pixels
+    let mut image_data = vec![255u8; width * height];
+
+    // Draw number 9 with clearer distinction
+    for y in 4..19 {
+        for x in 7..17 {
+            // Top bar
+            if y == 4 && x >= 8 && x <= 15 {
+                image_data[y * width + x] = 0;
+            }
+            // Top curve left side
+            if y >= 4 && y <= 10 && x == 7 {
+                image_data[y * width + x] = 0;
+            }
+            // Top curve right side
+            if y >= 4 && y <= 11 && x == 16 {
+                image_data[y * width + x] = 0;
+            }
+            // Middle bar
+            if y == 11 && x >= 8 && x <= 15 {
+                image_data[y * width + x] = 0;
+            }
+            // Bottom right vertical line
+            if y >= 11 && y <= 18 && x == 16 {
+                image_data[y * width + x] = 0;
+            }
+            // Bottom bar
+            if y == 18 && x >= 8 && x <= 15 {
+                image_data[y * width + x] = 0;
+            }
+        }
+    }
+
+    // Set the image data
+    api.set_image(
+        &image_data,
+        width.try_into().unwrap(),
+        height.try_into().unwrap(),
+        bytes_per_pixel.try_into().unwrap(),
+        bytes_per_line.try_into().unwrap(),
+    )?;
+
+    // Set whitelist for digits only
+    api.set_variable("tessedit_char_whitelist", "0123456789")?;
+
+    // Set PSM mode to single character
+    api.set_variable("tessedit_pageseg_mode", "10")?;
+
+    // Get the recognized text
+    let text = api.get_utf8_text()?;
+    println!("Recognized text: {}", text.trim());
+
+    Ok(())
+}
+```
+
+## Advanced Usage
+
+The API provides additional functionality for more complex OCR tasks, including thread-safe operations:
+
+```rust
+use kreuzberg_tesseract::TesseractAPI;
+use std::sync::Arc;
+use std::thread;
+use std::error::Error;
+
+fn main() -> Result<(), Box<dyn Error>> {
+    let tessdata_dir = get_tessdata_dir();
+    let api = TesseractAPI::new()?;
+
+    // Initialize the main API
+    api.init(tessdata_dir.to_str().unwrap(), "eng")?;
+    api.set_variable("tessedit_pageseg_mode", "1")?;
+
+    // Load and prepare image data
+    let (image_data, width, height) = load_test_image("sample_text.png")?;
+
+    // Share image data across threads
+    let image_data = Arc::new(image_data);
+    let mut handles = vec![];
+
+    // Spawn multiple threads for parallel OCR processing
+    for _ in 0..3 {
+        let api_clone = api.clone(); // Clones the API with all configurations
+        let image_data = Arc::clone(&image_data);
+
+        let handle = thread::spawn(move || {
+            // Set image in each thread
+            let res = api_clone.set_image(
+                &image_data,
+                width as i32,
+                height as i32,
+                3,
+                3 * width as i32,
+            );
+            assert!(res.is_ok());
+
+            // Perform OCR in parallel
+            let text = api_clone.get_utf8_text()
+                .expect("Failed to get text");
+            println!("Thread result: {}", text);
+        });
+        handles.push(handle);
+    }
+
+    // Wait for all threads to complete
+    for handle in handles {
+        handle.join().unwrap();
+    }
+
+    Ok(())
+}
+
+// Helper function to get tessdata directory
+fn get_tessdata_dir() -> PathBuf {
+    // ... (implementation as shown in basic example)
+}
+
+// Helper function to load test image
+fn load_test_image(filename: &str) -> Result<(Vec<u8>, u32, u32), Box<dyn Error>> {
+    let img = image::open(filename)?
+        .to_rgb8();
+    let (width, height) = img.dimensions();
+    Ok((img.into_raw(), width, height))
+}
+```
+
+## Building
+
+### Static Linking (Default)
+
+With static linking, the crate will automatically download and compile Tesseract and Leptonica during the build process. This may take some time on the first build (5-10 minutes), but subsequent builds will use the cached libraries.
+
+To clean the cache and force a rebuild:
+
+```bash
+CARGO_CLEAN=1 cargo build
+```
+
+### Dynamic Linking
+
+With dynamic linking, the build is much faster (seconds instead of minutes) since it only links against system-installed libraries:
+
+```bash
+cargo build --no-default-features --features dynamic-linking
+```
+
+**Note**: Dynamic linking requires Tesseract and Leptonica to be installed on your system (see Installation section).
+
+## Documentation
+
+For more detailed information, please check the [API documentation](https://docs.rs/kreuzberg-tesseract).
+
+## License
+
+This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
+
+## Acknowledgements
+
+This project is based on the original [tesseract-rs](https://github.com/cafercangundogdu/tesseract-rs) by [Cafer Can Gündoğdu](https://github.com/cafercangundogdu). We are grateful for the foundational work that made this project possible.
+
+## Contributing
+
+We welcome contributions! Please see our [Contributing Guide](../../CONTRIBUTING.md) for details.
+
+### Quick Start for Contributors
+
+1. Fork and clone the repository
+2. Install uv and set up git hooks:
+
+   ```bash
+   curl -LsSf https://astral.sh/uv/install.sh | sh
+   uvx prek install
+   ```
+
+3. Make your changes following our commit message format
+4. Run tests: `cargo test`
+5. Submit a Pull Request
+
+Our commit messages follow the [Conventional Commits](https://www.conventionalcommits.org/) specification.
+
+## Acknowledgements
+
+This project uses [Tesseract OCR](https://github.com/tesseract-ocr/tesseract) and [Leptonica](http://leptonica.org/). We are grateful to the maintainers and contributors of these projects.
--- a/crates/kreuzberg-tesseract/build.rs
+++ b/crates/kreuzberg-tesseract/build.rs
--- a/crates/kreuzberg-tesseract/patches/README.md
+++ b/crates/kreuzberg-tesseract/patches/README.md
@@ -0,0 +1,74 @@
+# Tesseract WASM Patches
+
+This directory contains patches needed to compile Tesseract for WebAssembly (WASM) targets using WASI SDK.
+
+These patches are vendored from the [tesseract-wasm](https://github.com/robertknight/tesseract-wasm) project and have been proven to work with WASM compilation.
+
+## Patches
+
+### tesseract.diff
+
+A comprehensive patch that makes Tesseract compatible with WASM compilation. The patch includes the following changes:
+
+#### 1. CMakeLists.txt Modifications
+
+- **New CMake option**: `BUILD_TESSERACT_BINARY` (default: ON)
+  - Allows disabling the Tesseract CLI binary build, which is not needed for WASM
+  - Wraps all executable and installation targets for the tesseract binary
+
+- **Disabled components for WASM**:
+  - Removes OpenCL support (`src/opencl/*.cpp`) - not applicable to WASM
+  - Removes viewer support (`src/viewer/*.cpp`) - UI components not needed for WASM
+  - Removes C API bindings (`src/api/capi.cpp`)
+  - Removes PDF and rendering support files (of the renderers, only `src/api/hocrrenderer.cpp` is kept):
+    - `src/api/renderer.cpp`
+    - `src/api/altorenderer.cpp`
+    - `src/api/lstmboxrenderer.cpp`
+    - `src/api/pdfrenderer.cpp`
+    - `src/api/wordstrboxrenderer.cpp`
+
+#### 2. SIMD Detection Fixes (src/arch/simddetect.cpp)
+
+- Guards CPUID detection with `#if !defined(__wasm__)`
+- Prevents attempts to use CPUID-based CPU feature detection, which does not exist in WASM
+- The HAS_CPUID macro is only defined for non-WASM builds
+- This allows the code to gracefully handle WASM's SIMD limitations
+
+#### 3. Pointer Type Fixes (src/ccmain/pageiterator.cpp, src/ccmain/pagesegmain.cpp, src/ccmain/tesseractclass.cpp)
+
+**Changed from stack allocation to heap allocation** in `tesseractclass.h`:
+
+- `pixa_debug_` changed from `DebugPixa` to `std::unique_ptr<DebugPixa>`
+- This prevents large allocations on the stack, which is limited in WASM
+
+**Updated all references** throughout the codebase:
+
+- `.get()` calls added where raw pointers are needed
+- Arrow operator `->` replaces dot operator `.` for member access
+- Null checks added before dereferencing to prevent crashes
+
+**Affected functions**:
+
+- `PageIterator::Orientation()` - added null vector check
+- `Tesseract::AutoPageSeg()` - updated pointer passing
+- `Tesseract::SetupPageSegAndDetectOrientation()` - multiple pointer updates
+- `Tesseract::Clear()` - added null check before WritePDF
+- `Tesseract::PrepareForPageseg()` - updated Split() calls
+- `Tesseract::PrepareForTessOCR()` - updated Split() calls
+
+#### 4. Additional Fixes
+
+- **Orientation detection**: Changed comparison from `> 0.0F` to `>= 0.0F` in `pageiterator.cpp` to handle null vectors gracefully when orientation info is not available
+
+## How to Apply
+
+These patches are applied during the WASM build process. They modify the Tesseract source code to:
+
+1. Disable WASM-incompatible features (OpenCL, viewers, renderers)
+2. Prevent CPUID detection in WASM environment
+3. Use heap allocation instead of stack allocation for large objects
+4. Handle missing pointer initialization gracefully
+
+## Source
+
+These patches are based on the proven WASM compilation approach used by the tesseract-wasm project, which successfully compiles Tesseract to WebAssembly and deploys it in production environments.
--- a/crates/kreuzberg-tesseract/patches/tesseract.diff
+++ b/crates/kreuzberg-tesseract/patches/tesseract.diff
@@ -0,0 +1,199 @@
+diff --git a/CMakeLists.txt b/CMakeLists.txt
+index 8c6845cb..fdcfc4a8 100644
+--- a/CMakeLists.txt
+++ b/CMakeLists.txt
+@@ -90,6 +90,7 @@ option(ENABLE_LTO "Enable link-time optimization" OFF)
+ option(FAST_FLOAT "Enable float for LSTM" ON)
+ option(ENABLE_OPENCL "Enable unsupported experimental OpenCL support" OFF)
+ option(BUILD_TRAINING_TOOLS "Build training tools" ON)
+option(BUILD_TESSERACT_BINARY "Build Tesseract binary" ON)
+ option(BUILD_TESTS "Build tests" OFF)
+ option(USE_SYSTEM_ICU "Use system ICU" OFF)
+ option(DISABLE_ARCHIVE "Disable build with libarchive (if available)" OFF)
+@@ -565,9 +566,7 @@ file(
+   src/cutil/*.cpp
+   src/dict/*.cpp
+   src/lstm/*.cpp
+-  src/opencl/*.cpp
+   src/textord/*.cpp
+-  src/viewer/*.cpp
+   src/wordrec/*.cpp)
+
+ if(DISABLED_LEGACY_ENGINE)
+@@ -714,13 +713,7 @@ file(
+ set(TESSERACT_SRC
+     ${TESSERACT_SRC}
+     src/api/baseapi.cpp
+-    src/api/capi.cpp
+-    src/api/renderer.cpp
+-    src/api/altorenderer.cpp
+-    src/api/hocrrenderer.cpp
+-    src/api/lstmboxrenderer.cpp
+-    src/api/pdfrenderer.cpp
+-    src/api/wordstrboxrenderer.cpp)
+    src/api/hocrrenderer.cpp)
+
+ set(TESSERACT_CONFIGS
+   tessdata/configs/alto
+@@ -858,14 +851,16 @@ endif()
+ # EXECUTABLE tesseract
+ # ##############################################################################
+
+-add_executable(tesseract src/tesseract.cpp)
+-target_link_libraries(tesseract libtesseract)
+-if(HAVE_TIFFIO_H AND WIN32)
+-  target_link_libraries(tesseract ${TIFF_LIBRARIES})
+-endif()
+if(BUILD_TESSERACT_BINARY)
+  add_executable(tesseract src/tesseract.cpp)
+  target_link_libraries(tesseract libtesseract)
+  if(HAVE_TIFFIO_H AND WIN32)
+    target_link_libraries(tesseract ${TIFF_LIBRARIES})
+  endif()
+
+-if(OPENMP_BUILD AND UNIX)
+-  target_link_libraries(tesseract pthread)
+  if(OPENMP_BUILD AND UNIX)
+    target_link_libraries(tesseract pthread)
+  endif()
+ endif()
+
+ # ##############################################################################
+@@ -899,7 +894,11 @@ write_basic_package_version_file(
+
+ install(FILES ${CMAKE_CURRENT_BINARY_DIR}/tesseract.pc
+         DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig)
+-install(TARGETS tesseract DESTINATION bin)
+
+if(BUILD_TESSERACT_BINARY)
+  install(TARGETS tesseract DESTINATION bin)
+endif()
+
+ install(
+   TARGETS libtesseract
+   EXPORT TesseractTargets
+diff --git a/src/arch/simddetect.cpp b/src/arch/simddetect.cpp
+index 1afe5a5d..cb8c6d4c 100644
+--- a/src/arch/simddetect.cpp
+++ b/src/arch/simddetect.cpp
+@@ -40,10 +40,12 @@
+
+ #endif
+
+#if !defined(__wasm__)
+ #if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_FMA) || defined(HAVE_SSE4_1)
+ // See https://en.wikipedia.org/wiki/CPUID.
+ #  define HAS_CPUID
+ #endif
+#endif
+
+ #if defined(HAS_CPUID)
+ #  if defined(__GNUC__)
+diff --git a/src/ccmain/pageiterator.cpp b/src/ccmain/pageiterator.cpp
+index 64ff7f66..c0f80e5f 100644
+--- a/src/ccmain/pageiterator.cpp
+++ b/src/ccmain/pageiterator.cpp
+@@ -582,7 +582,9 @@ void PageIterator::Orientation(tesseract::Orientation *orientation,
+   up_in_image.rotate(block->re_rotation());
+
+   if (up_in_image.x() == 0.0F) {
+-    if (up_in_image.y() > 0.0F) {
+    // tesseract-wasm note: `up_in_image` will be a null vector if orientation
+    // info is not available. In that case, assume page up.
+    if (up_in_image.y() >= 0.0F) {
+       *orientation = ORIENTATION_PAGE_UP;
+     } else {
+       *orientation = ORIENTATION_PAGE_DOWN;
+diff --git a/src/ccmain/pagesegmain.cpp b/src/ccmain/pagesegmain.cpp
+index 0af44607..718e73ef 100644
+--- a/src/ccmain/pagesegmain.cpp
+++ b/src/ccmain/pagesegmain.cpp
+@@ -222,7 +222,7 @@ int Tesseract::AutoPageSeg(PageSegMode pageseg_mode, BLOCK_LIST *blocks, TO_BLOC
+     }
+ #endif // ndef DISABLED_LEGACY_ENGINE
+     result = finder->FindBlocks(pageseg_mode, scaled_color_, scaled_factor_, to_block,
+-                                photomask_pix, pix_thresholds_, pix_grey_, &pixa_debug_,
+                                photomask_pix, pix_thresholds_, pix_grey_, pixa_debug_.get(),
+                                 &found_blocks, diacritic_blobs, to_blocks);
+     if (result >= 0) {
+       finder->GetDeskewVectors(&deskew_, &reskew_);
+@@ -279,17 +279,17 @@ ColumnFinder *Tesseract::SetupPageSegAndDetectOrientation(PageSegMode pageseg_mo
+   ICOORD bleft(0, 0);
+
+   ASSERT_HOST(pix_binary_ != nullptr);
+-  if (tessedit_dump_pageseg_images) {
+-    pixa_debug_.AddPix(pix_binary_, "PageSegInput");
+  if (tessedit_dump_pageseg_images && pixa_debug_) {
+    pixa_debug_->AddPix(pix_binary_, "PageSegInput");
+   }
+   // Leptonica is used to find the rule/separator lines in the input.
+   LineFinder::FindAndRemoveLines(source_resolution_, textord_tabfind_show_vlines, pix_binary_,
+                                  &vertical_x, &vertical_y, music_mask_pix, &v_lines, &h_lines);
+-  if (tessedit_dump_pageseg_images) {
+-    pixa_debug_.AddPix(pix_binary_, "NoLines");
+  if (tessedit_dump_pageseg_images && pixa_debug_) {
+    pixa_debug_->AddPix(pix_binary_, "NoLines");
+   }
+   // Leptonica is used to find a mask of the photo regions in the input.
+-  *photo_mask_pix = ImageFind::FindImages(pix_binary_, &pixa_debug_);
+  *photo_mask_pix = ImageFind::FindImages(pix_binary_, pixa_debug_.get());
+   if (tessedit_dump_pageseg_images) {
+     Image pix_no_image_ = nullptr;
+     if (*photo_mask_pix != nullptr) {
+@@ -297,7 +297,7 @@ ColumnFinder *Tesseract::SetupPageSegAndDetectOrientation(PageSegMode pageseg_mo
+     } else {
+       pix_no_image_ = pix_binary_.clone();
+     }
+-    pixa_debug_.AddPix(pix_no_image_, "NoImages");
+    pixa_debug_->AddPix(pix_no_image_, "NoImages");
+     pix_no_image_.destroy();
+   }
+   if (!PSM_COL_FIND_ENABLED(pageseg_mode)) {
+diff --git a/src/ccmain/tesseractclass.cpp b/src/ccmain/tesseractclass.cpp
+index fd58ac87..517f925e 100644
+--- a/src/ccmain/tesseractclass.cpp
+++ b/src/ccmain/tesseractclass.cpp
+@@ -487,8 +487,10 @@ Dict &Tesseract::getDict() {
+ }
+
+ void Tesseract::Clear() {
+-  std::string debug_name = imagebasename + "_debug.pdf";
+-  pixa_debug_.WritePDF(debug_name.c_str());
+  if (pixa_debug_) {
+    std::string debug_name = imagebasename + "_debug.pdf";
+    pixa_debug_->WritePDF(debug_name.c_str());
+  }
+   pix_binary_.destroy();
+   pix_grey_.destroy();
+   pix_thresholds_.destroy();
+@@ -572,7 +574,7 @@ void Tesseract::PrepareForPageseg() {
+   // the newly split image.
+   splitter_.set_orig_pix(pix_binary());
+   splitter_.set_pageseg_split_strategy(max_pageseg_strategy);
+-  if (splitter_.Split(true, &pixa_debug_)) {
+  if (splitter_.Split(true, pixa_debug_.get())) {
+     ASSERT_HOST(splitter_.splitted_image());
+     pix_binary_.destroy();
+     pix_binary_ = splitter_.splitted_image().clone();
+@@ -599,7 +601,7 @@ void Tesseract::PrepareForTessOCR(BLOCK_LIST *block_list, Tesseract *osd_tess, O
+   splitter_.set_segmentation_block_list(block_list);
+   splitter_.set_ocr_split_strategy(max_ocr_strategy);
+   // Run the splitter for OCR
+-  bool split_for_ocr = splitter_.Split(false, &pixa_debug_);
+  bool split_for_ocr = splitter_.Split(false, pixa_debug_.get());
+   // Restore pix_binary to the binarized original pix for future reference.
+   ASSERT_HOST(splitter_.orig_pix());
+   pix_binary_.destroy();
+diff --git a/src/ccmain/tesseractclass.h b/src/ccmain/tesseractclass.h
+index 732bb9e6..030aa5bc 100644
+--- a/src/ccmain/tesseractclass.h
+++ b/src/ccmain/tesseractclass.h
+@@ -986,7 +986,7 @@ private:
+   // Thresholds that were used to generate the thresholded image from grey.
+   Image pix_thresholds_;
+   // Debug images. If non-empty, will be written on destruction.
+-  DebugPixa pixa_debug_;
+  std::unique_ptr<DebugPixa> pixa_debug_;
+   // Input image resolution after any scaling. The resolution is not well
+   // transmitted by operations on Pix, so we keep an independent record here.
+   int source_resolution_;
--- a/crates/kreuzberg-tesseract/src/api.rs
+++ b/crates/kreuzberg-tesseract/src/api.rs
--- a/crates/kreuzberg-tesseract/src/choice_iterator.rs
+++ b/crates/kreuzberg-tesseract/src/choice_iterator.rs
@@ -0,0 +1,77 @@
+use crate::api::TessDeleteText;
+use crate::error::{Result, TesseractError};
+use std::ffi::CStr;
+use std::os::raw::{c_char, c_float, c_int, c_void};
+use std::sync::{Arc, Mutex};
+
+/// Owning wrapper around a raw Tesseract choice-iterator handle.
+///
+/// The raw pointer is kept behind an `Arc<Mutex<..>>`; every method locks the
+/// mutex before handing the pointer to the C API, and dropping the wrapper
+/// releases the handle through `TessChoiceIteratorDelete`.
+pub struct ChoiceIterator {
+    // Opaque pointer passed to the `TessChoiceIterator*` FFI functions.
+    handle: Arc<Mutex<*mut c_void>>,
+}
+
+// SAFETY: NOTE(review) - assumption to confirm: the pointer is only handed to
+// the C library while the mutex is held, so cross-thread access is serialized.
+// This additionally relies on the underlying Tesseract iterator not being tied
+// to the thread that created it.
+unsafe impl Send for ChoiceIterator {}
+unsafe impl Sync for ChoiceIterator {}
+
+impl ChoiceIterator {
+    /// Creates a new instance of the ChoiceIterator.
+    ///
+    /// The wrapper takes ownership of `handle`: it is passed to
+    /// `TessChoiceIteratorDelete` when the wrapper is dropped, so the caller
+    /// must not free it separately.
+    ///
+    /// # Arguments
+    ///
+    /// * `handle` - Pointer to the ChoiceIterator.
+    pub fn new(handle: *mut c_void) -> Self {
+        ChoiceIterator {
+            handle: Arc::new(Mutex::new(handle)),
+        }
+    }
+
+    /// Gets the next choice.
+    ///
+    /// # Returns
+    ///
+    /// Returns `Ok(true)` if `TessChoiceIteratorNext` reports a non-zero value
+    /// (the iterator advanced), otherwise `Ok(false)`.
+    ///
+    /// # Errors
+    ///
+    /// Returns `TesseractError::MutexLockError` if the handle mutex is poisoned.
+    pub fn next(&self) -> Result<bool> {
+        let handle = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
+        Ok(unsafe { TessChoiceIteratorNext(*handle) != 0 })
+    }
+
+    /// Gets the UTF-8 text for the current choice.
+    ///
+    /// # Returns
+    ///
+    /// Returns the UTF-8 text as a `String` if successful, otherwise returns an error.
+    ///
+    /// # Errors
+    ///
+    /// * `TesseractError::MutexLockError` if the handle mutex is poisoned.
+    /// * `TesseractError::NullPointerError` if Tesseract returns a null pointer.
+    /// * A conversion error if the returned bytes are not valid UTF-8.
+    pub fn get_utf8_text(&self) -> Result<String> {
+        let handle = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
+        let text_ptr = unsafe { TessChoiceIteratorGetUTF8Text(*handle) };
+        if text_ptr.is_null() {
+            return Err(TesseractError::NullPointerError);
+        }
+        // Unlike the result iterators' `GetUTF8Text`, the choice iterator
+        // returns a pointer into its own internal storage. It must NOT be
+        // released with `TessDeleteText`; doing so frees memory we do not own.
+        // Copy the bytes into an owned `String` while the lock is still held,
+        // so the iterator cannot be advanced underneath the borrowed pointer.
+        let c_str = unsafe { CStr::from_ptr(text_ptr) };
+        Ok(c_str.to_str()?.to_owned())
+    }
+
+    /// Gets the confidence of the current choice.
+    ///
+    /// # Returns
+    ///
+    /// Returns the confidence as a `f32`, exactly as reported by
+    /// `TessChoiceIteratorConfidence`.
+    ///
+    /// # Errors
+    ///
+    /// Returns `TesseractError::MutexLockError` if the handle mutex is poisoned.
+    pub fn confidence(&self) -> Result<f32> {
+        let handle = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
+        Ok(unsafe { TessChoiceIteratorConfidence(*handle) })
+    }
+}
+
+impl Drop for ChoiceIterator {
+    /// Releases the underlying Tesseract choice iterator.
+    ///
+    /// If the mutex is poisoned the handle is left untouched rather than
+    /// panicking inside `drop`.
+    fn drop(&mut self) {
+        let Ok(raw) = self.handle.lock() else {
+            return;
+        };
+        unsafe { TessChoiceIteratorDelete(*raw) };
+    }
+}
+
+// Foreign declarations for the Tesseract choice-iterator functions used by
+// `ChoiceIterator` above. Every function takes the opaque iterator handle.
+// NOTE(review): `ffi_extern!` is a project macro defined outside this file;
+// presumably it expands to an `extern "C"` block - confirm.
+ffi_extern! {
+    // Destroys the iterator; called from `Drop`.
+    fn TessChoiceIteratorDelete(handle: *mut c_void);
+    // Advances the iterator; a non-zero result is treated as success by `next`.
+    fn TessChoiceIteratorNext(handle: *mut c_void) -> c_int;
+    // Returns the text of the current choice as a C string (may be null).
+    fn TessChoiceIteratorGetUTF8Text(handle: *mut c_void) -> *mut c_char;
+    // Returns the confidence value of the current choice.
+    fn TessChoiceIteratorConfidence(handle: *mut c_void) -> c_float;
+}
--- a/crates/kreuzberg-tesseract/src/enums.rs
+++ b/crates/kreuzberg-tesseract/src/enums.rs
@@ -0,0 +1,373 @@
+/// Page segmentation mode constants with explicit discriminants 0–14.
+///
+/// NOTE(review): presumably mirrors Tesseract's C `PageSegMode` enum — confirm
+/// the values against the bundled Tesseract headers.
+#[repr(C)]
+#[derive(Debug, Clone, Copy, PartialEq)]
+#[allow(non_camel_case_types)]
+pub enum TessPageSegMode {
+    PSM_OSD_ONLY = 0,
+    PSM_AUTO_OSD = 1,
+    PSM_AUTO_ONLY = 2,
+    PSM_AUTO = 3,
+    PSM_SINGLE_COLUMN = 4,
+    PSM_SINGLE_BLOCK_VERT_TEXT = 5,
+    PSM_SINGLE_BLOCK = 6,
+    PSM_SINGLE_LINE = 7,
+    PSM_SINGLE_WORD = 8,
+    PSM_CIRCLE_WORD = 9,
+    PSM_SINGLE_CHAR = 10,
+    PSM_SPARSE_TEXT = 11,
+    PSM_SPARSE_TEXT_OSD = 12,
+    PSM_RAW_LINE = 13,
+    // NOTE(review): looks like a count sentinel rather than a real mode, yet
+    // both conversions below accept 14 — confirm this is intended.
+    PSM_COUNT = 14,
+}
+
+impl TessPageSegMode {
+    /// Every variant of the enum; the lookup below matches on discriminant.
+    const VARIANTS: [Self; 15] = [
+        Self::PSM_OSD_ONLY,
+        Self::PSM_AUTO_OSD,
+        Self::PSM_AUTO_ONLY,
+        Self::PSM_AUTO,
+        Self::PSM_SINGLE_COLUMN,
+        Self::PSM_SINGLE_BLOCK_VERT_TEXT,
+        Self::PSM_SINGLE_BLOCK,
+        Self::PSM_SINGLE_LINE,
+        Self::PSM_SINGLE_WORD,
+        Self::PSM_CIRCLE_WORD,
+        Self::PSM_SINGLE_CHAR,
+        Self::PSM_SPARSE_TEXT,
+        Self::PSM_SPARSE_TEXT_OSD,
+        Self::PSM_RAW_LINE,
+        Self::PSM_COUNT,
+    ];
+
+    /// Converts an integer to a mode, falling back to `PSM_AUTO` for any value
+    /// that does not correspond to a variant.
+    pub fn from_int(value: i32) -> Self {
+        Self::try_from_int(value).unwrap_or(Self::PSM_AUTO)
+    }
+
+    /// Converts an integer to a mode, yielding `None` when no variant has that
+    /// discriminant.
+    pub fn try_from_int(value: i32) -> Option<Self> {
+        Self::VARIANTS
+            .iter()
+            .copied()
+            .find(|mode| *mode as i32 == value)
+    }
+}
+
+/// Page iterator level constants with explicit discriminants 0–4.
+///
+/// NOTE(review): presumably mirrors Tesseract's C `PageIteratorLevel` enum —
+/// confirm the values against the bundled Tesseract headers.
+#[repr(C)]
+#[derive(Debug, Clone, Copy, PartialEq)]
+#[allow(non_camel_case_types)]
+pub enum TessPageIteratorLevel {
+    RIL_BLOCK = 0,
+    RIL_PARA = 1,
+    RIL_TEXTLINE = 2,
+    RIL_WORD = 3,
+    RIL_SYMBOL = 4,
+}
+
+impl TessPageIteratorLevel {
+    /// Converts an integer to a level, falling back to `RIL_BLOCK` for any
+    /// value that does not correspond to a variant.
+    pub fn from_int(value: i32) -> Self {
+        Self::try_from_int(value).unwrap_or(Self::RIL_BLOCK)
+    }
+
+    /// Safely convert an integer to a TessPageIteratorLevel, returning None for invalid values.
+    ///
+    /// Provided for parity with the other enums in this module, which all
+    /// offer a non-lossy conversion next to `from_int`.
+    pub fn try_from_int(value: i32) -> Option<Self> {
+        match value {
+            0 => Some(Self::RIL_BLOCK),
+            1 => Some(Self::RIL_PARA),
+            2 => Some(Self::RIL_TEXTLINE),
+            3 => Some(Self::RIL_WORD),
+            4 => Some(Self::RIL_SYMBOL),
+            _ => None,
+        }
+    }
+}
+
+/// Poly-block type constants with explicit discriminants 0–15.
+///
+/// NOTE(review): presumably mirrors Tesseract's C `PolyBlockType` enum —
+/// confirm the values against the bundled Tesseract headers.
+#[repr(C)]
+#[derive(Debug, Clone, Copy, PartialEq)]
+#[allow(non_camel_case_types)]
+pub enum TessPolyBlockType {
+    PT_UNKNOWN = 0,
+    PT_FLOWING_TEXT = 1,
+    PT_HEADING_TEXT = 2,
+    PT_PULLOUT_TEXT = 3,
+    PT_EQUATION = 4,
+    PT_INLINE_EQUATION = 5,
+    PT_TABLE = 6,
+    PT_VERTICAL_TEXT = 7,
+    PT_CAPTION_TEXT = 8,
+    PT_FLOWING_IMAGE = 9,
+    PT_HEADING_IMAGE = 10,
+    PT_PULLOUT_IMAGE = 11,
+    PT_HORZ_LINE = 12,
+    PT_VERT_LINE = 13,
+    PT_NOISE = 14,
+    // NOTE(review): looks like a count sentinel, yet both conversions below
+    // accept 15 — confirm this is intended.
+    PT_COUNT = 15,
+}
+
+impl TessPolyBlockType {
+    /// Every variant of the enum; the lookup below matches on discriminant.
+    const VARIANTS: [Self; 16] = [
+        Self::PT_UNKNOWN,
+        Self::PT_FLOWING_TEXT,
+        Self::PT_HEADING_TEXT,
+        Self::PT_PULLOUT_TEXT,
+        Self::PT_EQUATION,
+        Self::PT_INLINE_EQUATION,
+        Self::PT_TABLE,
+        Self::PT_VERTICAL_TEXT,
+        Self::PT_CAPTION_TEXT,
+        Self::PT_FLOWING_IMAGE,
+        Self::PT_HEADING_IMAGE,
+        Self::PT_PULLOUT_IMAGE,
+        Self::PT_HORZ_LINE,
+        Self::PT_VERT_LINE,
+        Self::PT_NOISE,
+        Self::PT_COUNT,
+    ];
+
+    /// Converts an integer to a block type, falling back to `PT_UNKNOWN` for
+    /// any value that does not correspond to a variant.
+    pub fn from_int(value: i32) -> Self {
+        Self::try_from_int(value).unwrap_or(Self::PT_UNKNOWN)
+    }
+
+    /// Converts an integer to a block type, yielding `None` when no variant
+    /// has that discriminant.
+    pub fn try_from_int(value: i32) -> Option<Self> {
+        Self::VARIANTS
+            .iter()
+            .copied()
+            .find(|kind| *kind as i32 == value)
+    }
+}
+
+/// Page orientation constants with explicit discriminants 0–3.
+#[repr(C)]
+#[derive(Debug, Clone, Copy, PartialEq)]
+#[allow(non_camel_case_types)]
+pub enum TessOrientation {
+    ORIENTATION_PAGE_UP = 0,
+    ORIENTATION_PAGE_RIGHT = 1,
+    ORIENTATION_PAGE_DOWN = 2,
+    ORIENTATION_PAGE_LEFT = 3,
+}
+
+impl TessOrientation {
+    /// Every variant of the enum; the lookup below matches on discriminant.
+    const VARIANTS: [Self; 4] = [
+        Self::ORIENTATION_PAGE_UP,
+        Self::ORIENTATION_PAGE_RIGHT,
+        Self::ORIENTATION_PAGE_DOWN,
+        Self::ORIENTATION_PAGE_LEFT,
+    ];
+
+    /// Converts an integer to an orientation, falling back to
+    /// `ORIENTATION_PAGE_UP` for any value that does not correspond to a variant.
+    pub fn from_int(value: i32) -> Self {
+        Self::try_from_int(value).unwrap_or(Self::ORIENTATION_PAGE_UP)
+    }
+
+    /// Converts an integer to an orientation, yielding `None` when no variant
+    /// has that discriminant.
+    pub fn try_from_int(value: i32) -> Option<Self> {
+        Self::VARIANTS
+            .iter()
+            .copied()
+            .find(|orientation| *orientation as i32 == value)
+    }
+}
+
+/// Paragraph justification constants with explicit discriminants 0–3.
+#[repr(C)]
+#[derive(Debug, Clone, Copy, PartialEq)]
+#[allow(non_camel_case_types)]
+pub enum TessParagraphJustification {
+    JUSTIFICATION_UNKNOWN = 0,
+    JUSTIFICATION_LEFT = 1,
+    JUSTIFICATION_CENTER = 2,
+    JUSTIFICATION_RIGHT = 3,
+}
+
+impl TessParagraphJustification {
+    /// Converts an integer to a justification, falling back to
+    /// `JUSTIFICATION_UNKNOWN` for any value that does not correspond to a variant.
+    pub fn from_int(value: i32) -> Self {
+        Self::try_from_int(value).unwrap_or(Self::JUSTIFICATION_UNKNOWN)
+    }
+
+    /// Safely convert an integer to a TessParagraphJustification, returning None for invalid values.
+    ///
+    /// Provided for parity with the other enums in this module, which all
+    /// offer a non-lossy conversion next to `from_int`.
+    pub fn try_from_int(value: i32) -> Option<Self> {
+        match value {
+            0 => Some(Self::JUSTIFICATION_UNKNOWN),
+            1 => Some(Self::JUSTIFICATION_LEFT),
+            2 => Some(Self::JUSTIFICATION_CENTER),
+            3 => Some(Self::JUSTIFICATION_RIGHT),
+            _ => None,
+        }
+    }
+}
+
+/// Writing direction constants with explicit discriminants 0–2.
+#[repr(C)]
+#[derive(Debug, Clone, Copy, PartialEq)]
+#[allow(non_camel_case_types)]
+pub enum TessWritingDirection {
+    WRITING_DIRECTION_LEFT_TO_RIGHT = 0,
+    WRITING_DIRECTION_RIGHT_TO_LEFT = 1,
+    WRITING_DIRECTION_TOP_TO_BOTTOM = 2,
+}
+
+impl TessWritingDirection {
+    /// Every variant of the enum; the lookup below matches on discriminant.
+    const VARIANTS: [Self; 3] = [
+        Self::WRITING_DIRECTION_LEFT_TO_RIGHT,
+        Self::WRITING_DIRECTION_RIGHT_TO_LEFT,
+        Self::WRITING_DIRECTION_TOP_TO_BOTTOM,
+    ];
+
+    /// Converts an integer to a writing direction, falling back to
+    /// `WRITING_DIRECTION_LEFT_TO_RIGHT` for any value that does not
+    /// correspond to a variant.
+    pub fn from_int(value: i32) -> Self {
+        Self::try_from_int(value).unwrap_or(Self::WRITING_DIRECTION_LEFT_TO_RIGHT)
+    }
+
+    /// Converts an integer to a writing direction, yielding `None` when no
+    /// variant has that discriminant.
+    pub fn try_from_int(value: i32) -> Option<Self> {
+        Self::VARIANTS
+            .iter()
+            .copied()
+            .find(|direction| *direction as i32 == value)
+    }
+}
+
+/// Text-line order constants with explicit discriminants 0–2.
+#[repr(C)]
+#[derive(Debug, Clone, Copy, PartialEq)]
+#[allow(non_camel_case_types)]
+pub enum TessTextlineOrder {
+    TEXTLINE_ORDER_LEFT_TO_RIGHT = 0,
+    TEXTLINE_ORDER_RIGHT_TO_LEFT = 1,
+    TEXTLINE_ORDER_TOP_TO_BOTTOM = 2,
+}
+
+impl TessTextlineOrder {
+    /// Every variant of the enum; the lookup below matches on discriminant.
+    const VARIANTS: [Self; 3] = [
+        Self::TEXTLINE_ORDER_LEFT_TO_RIGHT,
+        Self::TEXTLINE_ORDER_RIGHT_TO_LEFT,
+        Self::TEXTLINE_ORDER_TOP_TO_BOTTOM,
+    ];
+
+    /// Converts an integer to a text-line order, falling back to
+    /// `TEXTLINE_ORDER_LEFT_TO_RIGHT` for any value that does not correspond
+    /// to a variant.
+    pub fn from_int(value: i32) -> Self {
+        Self::try_from_int(value).unwrap_or(Self::TEXTLINE_ORDER_LEFT_TO_RIGHT)
+    }
+
+    /// Converts an integer to a text-line order, yielding `None` when no
+    /// variant has that discriminant.
+    pub fn try_from_int(value: i32) -> Option<Self> {
+        Self::VARIANTS
+            .iter()
+            .copied()
+            .find(|order| *order as i32 == value)
+    }
+}
+
+/// Unit tests for the integer conversions on the enums in this module.
+///
+/// Covers both the lossy `from_int` fallbacks and the `try_from_int`
+/// conversions, which report out-of-range input as `None`.
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_page_seg_mode_from_int() {
+        assert_eq!(TessPageSegMode::from_int(0), TessPageSegMode::PSM_OSD_ONLY);
+        assert_eq!(TessPageSegMode::from_int(3), TessPageSegMode::PSM_AUTO);
+        assert_eq!(TessPageSegMode::from_int(10), TessPageSegMode::PSM_SINGLE_CHAR);
+        assert_eq!(TessPageSegMode::from_int(999), TessPageSegMode::PSM_AUTO);
+    }
+
+    #[test]
+    fn test_page_seg_mode_conversion() {
+        let mode = TessPageSegMode::PSM_SINGLE_LINE;
+        assert_eq!(mode as i32, 7);
+    }
+
+    #[test]
+    fn test_page_seg_mode_try_from_int() {
+        // Every in-range value must round-trip through its discriminant.
+        for value in 0..=14 {
+            assert_eq!(TessPageSegMode::try_from_int(value).map(|m| m as i32), Some(value));
+        }
+        assert_eq!(TessPageSegMode::try_from_int(-1), None);
+        assert_eq!(TessPageSegMode::try_from_int(15), None);
+    }
+
+    #[test]
+    fn test_page_iterator_level_from_int() {
+        assert_eq!(TessPageIteratorLevel::from_int(0), TessPageIteratorLevel::RIL_BLOCK);
+        assert_eq!(TessPageIteratorLevel::from_int(3), TessPageIteratorLevel::RIL_WORD);
+        assert_eq!(TessPageIteratorLevel::from_int(-1), TessPageIteratorLevel::RIL_BLOCK);
+    }
+
+    #[test]
+    fn test_poly_block_type_from_int() {
+        assert_eq!(TessPolyBlockType::from_int(1), TessPolyBlockType::PT_FLOWING_TEXT);
+        assert_eq!(TessPolyBlockType::from_int(6), TessPolyBlockType::PT_TABLE);
+        assert_eq!(TessPolyBlockType::from_int(100), TessPolyBlockType::PT_UNKNOWN);
+    }
+
+    #[test]
+    fn test_poly_block_type_try_from_int() {
+        for value in 0..=15 {
+            assert_eq!(TessPolyBlockType::try_from_int(value).map(|t| t as i32), Some(value));
+        }
+        assert_eq!(TessPolyBlockType::try_from_int(-1), None);
+        assert_eq!(TessPolyBlockType::try_from_int(16), None);
+    }
+
+    #[test]
+    fn test_orientation_from_int() {
+        assert_eq!(TessOrientation::from_int(0), TessOrientation::ORIENTATION_PAGE_UP);
+        assert_eq!(TessOrientation::from_int(2), TessOrientation::ORIENTATION_PAGE_DOWN);
+        assert_eq!(TessOrientation::from_int(5), TessOrientation::ORIENTATION_PAGE_UP);
+    }
+
+    #[test]
+    fn test_orientation_try_from_int() {
+        for value in 0..=3 {
+            assert_eq!(TessOrientation::try_from_int(value).map(|o| o as i32), Some(value));
+        }
+        assert_eq!(TessOrientation::try_from_int(-1), None);
+        assert_eq!(TessOrientation::try_from_int(4), None);
+    }
+
+    #[test]
+    fn test_paragraph_justification_from_int() {
+        assert_eq!(
+            TessParagraphJustification::from_int(1),
+            TessParagraphJustification::JUSTIFICATION_LEFT
+        );
+        assert_eq!(
+            TessParagraphJustification::from_int(3),
+            TessParagraphJustification::JUSTIFICATION_RIGHT
+        );
+        assert_eq!(
+            TessParagraphJustification::from_int(-1),
+            TessParagraphJustification::JUSTIFICATION_UNKNOWN
+        );
+    }
+
+    #[test]
+    fn test_writing_direction_from_int() {
+        assert_eq!(
+            TessWritingDirection::from_int(0),
+            TessWritingDirection::WRITING_DIRECTION_LEFT_TO_RIGHT
+        );
+        assert_eq!(
+            TessWritingDirection::from_int(1),
+            TessWritingDirection::WRITING_DIRECTION_RIGHT_TO_LEFT
+        );
+        assert_eq!(
+            TessWritingDirection::from_int(10),
+            TessWritingDirection::WRITING_DIRECTION_LEFT_TO_RIGHT
+        );
+    }
+
+    #[test]
+    fn test_writing_direction_try_from_int() {
+        for value in 0..=2 {
+            assert_eq!(TessWritingDirection::try_from_int(value).map(|d| d as i32), Some(value));
+        }
+        assert_eq!(TessWritingDirection::try_from_int(-1), None);
+        assert_eq!(TessWritingDirection::try_from_int(3), None);
+    }
+
+    #[test]
+    fn test_textline_order_from_int() {
+        assert_eq!(
+            TessTextlineOrder::from_int(0),
+            TessTextlineOrder::TEXTLINE_ORDER_LEFT_TO_RIGHT
+        );
+        assert_eq!(
+            TessTextlineOrder::from_int(2),
+            TessTextlineOrder::TEXTLINE_ORDER_TOP_TO_BOTTOM
+        );
+        assert_eq!(
+            TessTextlineOrder::from_int(99),
+            TessTextlineOrder::TEXTLINE_ORDER_LEFT_TO_RIGHT
+        );
+    }
+
+    #[test]
+    fn test_textline_order_try_from_int() {
+        for value in 0..=2 {
+            assert_eq!(TessTextlineOrder::try_from_int(value).map(|o| o as i32), Some(value));
+        }
+        assert_eq!(TessTextlineOrder::try_from_int(-1), None);
+        assert_eq!(TessTextlineOrder::try_from_int(3), None);
+    }
+
+    #[test]
+    fn test_enums_are_copy() {
+        fn assert_copy<T: Copy>() {}
+        assert_copy::<TessPageSegMode>();
+        assert_copy::<TessPageIteratorLevel>();
+        assert_copy::<TessPolyBlockType>();
+        assert_copy::<TessOrientation>();
+        assert_copy::<TessParagraphJustification>();
+        assert_copy::<TessWritingDirection>();
+        assert_copy::<TessTextlineOrder>();
+    }
+}
--- a/crates/kreuzberg-tesseract/src/error.rs
+++ b/crates/kreuzberg-tesseract/src/error.rs
@@ -0,0 +1,85 @@
+use std::str::Utf8Error;
+use thiserror::Error;
+
+/// Errors that can occur when using the Tesseract API.
+///
+/// The `Display` text of each variant comes from its `#[error("…")]`
+/// attribute. Apart from `Utf8Error` and `InvalidEnumValue`, the variants
+/// carry no payload, so they identify the failing step but not its cause.
+#[derive(Error, Debug)]
+pub enum TesseractError {
+    /// Tesseract could not be initialised.
+    #[error("Failed to initialize Tesseract")]
+    InitError,
+    /// An image could not be handed to Tesseract.
+    #[error("Failed to set image")]
+    SetImageError,
+    /// An OCR or image-analysis call reported failure.
+    #[error("OCR operation failed")]
+    OcrError,
+    /// Text returned over FFI was not valid UTF-8. `#[from]` lets `?` convert
+    /// a `std::str::Utf8Error` into this variant automatically.
+    #[error("Invalid UTF-8 in Tesseract output")]
+    Utf8Error(#[from] Utf8Error),
+    /// A mutex guarding a native handle could not be locked.
+    #[error("Failed to lock mutex")]
+    MutexLockError,
+    /// A Tesseract variable could not be set.
+    #[error("Failed to set variable")]
+    SetVariableError,
+    /// A Tesseract variable could not be read.
+    #[error("Failed to get variable")]
+    GetVariableError,
+    /// An FFI call returned a null pointer where a valid one was required.
+    #[error("Null pointer error")]
+    NullPointerError,
+    /// A caller-supplied parameter was rejected.
+    #[error("Invalid parameter")]
+    InvalidParameterError,
+    /// Layout analysis failed.
+    #[error("Layout analysis failed")]
+    AnalyseLayoutError,
+    /// Page processing failed.
+    #[error("Page processing failed")]
+    ProcessPagesError,
+    /// An I/O failure; note that no underlying `std::io::Error` is carried.
+    #[error("I/O error")]
+    IoError,
+    /// A mutex-related failure.
+    /// NOTE(review): overlaps with `MutexLockError` — confirm both are needed.
+    #[error("Mutex error")]
+    MutexError,
+    /// Image dimensions were rejected.
+    #[error("Invalid dimensions")]
+    InvalidDimensions,
+    /// The bytes-per-pixel value was rejected.
+    #[error("Invalid bytes per pixel")]
+    InvalidBytesPerPixel,
+    /// The bytes-per-line value was rejected.
+    #[error("Invalid bytes per line")]
+    InvalidBytesPerLine,
+    /// Image data was rejected, e.g. a buffer whose length does not match the
+    /// stated dimensions.
+    #[error("Invalid image data")]
+    InvalidImageData,
+    /// NOTE(review): presumably raised when an API is used before
+    /// initialisation — confirm at the call sites.
+    #[error("Uninitialized error")]
+    UninitializedError,
+    /// An integer did not map to a known enum variant. The payload is
+    /// interpolated into the message (presumably the rejected value — confirm
+    /// at the call sites).
+    #[error("Invalid enum value: {0}")]
+    InvalidEnumValue(i32),
+    /// A string destined for C contained an interior NUL byte.
+    #[error("String contains null byte")]
+    NullByteInString,
+}
+
+/// Result type for Tesseract operations: `std::result::Result` with the error
+/// type fixed to [`TesseractError`].
+pub type Result<T> = std::result::Result<T, TesseractError>;
+
+/// Unit tests for `TesseractError`: display text, `Utf8Error` conversion and
+/// the `Send + Sync` auto traits.
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// Each variant must render the message declared in its `#[error]` attribute.
+    #[test]
+    fn test_error_display() {
+        let expectations = [
+            (TesseractError::InitError, "Failed to initialize Tesseract"),
+            (TesseractError::SetImageError, "Failed to set image"),
+            (TesseractError::OcrError, "OCR operation failed"),
+        ];
+
+        for (error, message) in &expectations {
+            assert_eq!(error.to_string(), *message);
+        }
+    }
+
+    /// A `std::str::Utf8Error` must convert into the `Utf8Error` variant.
+    #[test]
+    fn test_utf8_error_conversion() {
+        let bytes = vec![0xFF, 0xFE];
+        let source = std::str::from_utf8(&bytes).unwrap_err();
+        let converted = TesseractError::from(source);
+
+        assert!(
+            matches!(converted, TesseractError::Utf8Error(_)),
+            "Expected Utf8Error variant"
+        );
+    }
+
+    /// The error type has to be usable across thread boundaries.
+    #[test]
+    fn test_error_is_send_sync() {
+        fn require_send_sync<T: Send + Sync>() {}
+        require_send_sync::<TesseractError>();
+    }
+}
--- a/crates/kreuzberg-tesseract/src/leptonica.rs
+++ b/crates/kreuzberg-tesseract/src/leptonica.rs
@@ -0,0 +1,807 @@
+//! Safe Leptonica Pix wrapper for image preprocessing before OCR.
+//!
+//! Provides a safe Rust wrapper around the Leptonica image-processing library.
+//! `Pix` is the core Leptonica image type. Methods that produce a new image
+//! return `Result<Pix>`, and the wrapper frees the underlying allocation via `Drop`.
+//!
+//! ## Pixel format
+//!
+//! Leptonica's 32 bpp format stores each pixel as a native 32-bit integer
+//! with the logical layout (MSB→LSB): `R G B A`, i.e.
+//! `(r << 24) | (g << 16) | (b << 8) | alpha`.  Leptonica accesses
+//! individual channels via bit-shift on the integer value, not via
+//! byte-addressed pointer arithmetic, so the packing is identical on both
+//! big- and little-endian hosts.  Do **not** call `pixEndianByteSwap` after
+//! writing pixels this way — doing so inverts the channel order.
+//!
+//! ## `pixDeskew` requires a binary (1 bpp) image
+//!
+//! Call `to_grayscale()` followed by `adaptive_threshold()` before `deskew()`.
+//! `pixDeskew` internally calls `pixFindSkewSweepAndSearchScorePivot` which
+//! operates on 1-bit images only; passing a colour image will return a null
+//! pointer.
+
+use crate::error::{Result, TesseractError};
+use std::ffi::c_void;
+
+// ---------------------------------------------------------------------------
+// Raw Leptonica FFI declarations
+// ---------------------------------------------------------------------------
+
+#[cfg(any(feature = "build-tesseract", feature = "build-tesseract-wasm"))]
+ffi_extern! {
+    /// Allocates a new Pix with the given dimensions and bit depth.
+    fn pixCreate(width: i32, height: i32, depth: i32) -> *mut c_void;
+
+    /// Frees a Pix and sets the caller's pointer to null.
+    ///
+    /// Leptonica uses a double-pointer convention: `*ppix` is set to null
+    /// after the call so that accidental double-frees are a no-op.
+    fn pixDestroy(ppix: *mut *mut c_void);
+
+    /// Sets the horizontal and vertical resolution (DPI) on a Pix.
+    ///
+    /// Returns 0 on success, non-zero on error.
+    fn pixSetResolution(pix: *mut c_void, xres: i32, yres: i32) -> i32;
+
+    /// Returns the width of the Pix in pixels.
+    fn pixGetWidth(pix: *const c_void) -> i32;
+
+    /// Returns the height of the Pix in pixels.
+    fn pixGetHeight(pix: *const c_void) -> i32;
+
+    /// Returns the bit depth of the Pix (1, 2, 4, 8, 16, or 32).
+    fn pixGetDepth(pix: *const c_void) -> i32;
+
+    /// Returns the number of 32-bit words per row (words-per-line).
+    fn pixGetWpl(pix: *const c_void) -> i32;
+
+    /// Returns a mutable pointer to the start of the pixel data array.
+    ///
+    /// The data is stored as rows of 32-bit words; each word covers 32/depth pixels.
+    fn pixGetData(pix: *mut c_void) -> *mut u32;
+
+    /// Deskews a 1 bpp image using a sweep-and-search algorithm.
+    ///
+    /// `redsearch` is the reduction factor used during the search; pass 0 for
+    /// the Leptonica default (2x reduction). Returns a new deskewed Pix on
+    /// success, or null on failure. The input Pix is **not** consumed.
+    ///
+    /// NOTE(review): upstream Leptonica documents `pixDeskew` as accepting
+    /// any depth (binarising internally) — confirm whether the 1 bpp
+    /// requirement stated here still holds for the bundled version.
+    fn pixDeskew(pixs: *mut c_void, redsearch: i32) -> *mut c_void;
+
+    /// Estimates the skew angle and confidence for a 1 bpp image.
+    ///
+    /// Writes the angle (degrees, positive = counter-clockwise) into `*pangle`
+    /// and a confidence score (0–1) into `*pconf`. Returns 0 on success.
+    fn pixFindSkew(pixs: *mut c_void, pangle: *mut f32, pconf: *mut f32) -> i32;
+
+    /// Applies Otsu adaptive thresholding to produce a binarised Pix.
+    ///
+    /// `sx`/`sy` are the tile dimensions; `smoothx`/`smoothy` are half-widths
+    /// for smoothing the threshold map; `scorefract` controls threshold acceptance
+    /// (typical value: 0.1). `ppixth` (optional) receives the threshold image;
+    /// `ppixd` receives the binarised output.
+    fn pixOtsuAdaptiveThreshold(
+        pixs: *mut c_void,
+        sx: i32,
+        sy: i32,
+        smoothx: i32,
+        smoothy: i32,
+        scorefract: f32,
+        ppixth: *mut *mut c_void,
+        ppixd: *mut *mut c_void,
+    ) -> i32;
+
+    /// Normalises the background of a grayscale image using morphological operations.
+    ///
+    /// `reduction` is the subsampling factor (e.g. 4), `size` is the morphological
+    /// structuring-element half-size (e.g. 15), and `bgval` is the target background
+    /// value (e.g. 200). Returns a new normalised Pix, or null on failure.
+    fn pixBackgroundNormMorph(
+        pixs: *mut c_void,
+        pixim: *mut c_void,
+        reduction: i32,
+        size: i32,
+        bgval: i32,
+    ) -> *mut c_void;
+
+    /// Applies unsharp masking to sharpen a grayscale or colour Pix.
+    ///
+    /// `halfwidth` is the half-size of the blur kernel; `fract` controls the
+    /// sharpening strength (0.0–1.0 typical). Returns a new Pix, or null on failure.
+    fn pixUnsharpMasking(pixs: *mut c_void, halfwidth: i32, fract: f32) -> *mut c_void;
+
+    /// Scales a Pix by independent x and y factors using the best available method.
+    ///
+    /// Returns a new scaled Pix, or null on failure. The input Pix is **not** consumed.
+    fn pixScale(pixs: *mut c_void, scalex: f32, scaley: f32) -> *mut c_void;
+
+    /// Converts an RGB (32 bpp) Pix to 8 bpp grayscale.
+    ///
+    /// `rwt`, `gwt`, `bwt` are the red, green, and blue channel weights; pass
+    /// 0.0 for all three to use Leptonica's built-in default weights (upstream
+    /// documents these as 0.3 / 0.5 / 0.2 rather than equal weights — TODO
+    /// confirm against the bundled Leptonica version). Returns a new 8 bpp
+    /// Pix, or null on failure.
+    fn pixConvertRGBToGray(pixs: *mut c_void, rwt: f32, gwt: f32, bwt: f32) -> *mut c_void;
+
+    /// Creates a Leptonica BOX with the given coordinates.
+    fn boxCreate(x: i32, y: i32, w: i32, h: i32) -> *mut c_void;
+
+    /// Frees a Leptonica BOX.
+    fn boxDestroy(pbox: *mut *mut c_void);
+
+    /// Clips a rectangular region from a Pix.
+    ///
+    /// Returns a new Pix containing the clipped region, or null on failure.
+    /// `pboxc` (optional) receives the actual clipped box; pass null to ignore.
+    fn pixClipRectangle(pixs: *mut c_void, box_: *mut c_void, pboxc: *mut *mut c_void) -> *mut c_void;
+
+    /// Counts connected components in a 1 bpp image.
+    ///
+    /// `connectivity` is 4 or 8. Writes the count to `*pcount`.
+    /// Returns 0 on success.
+    fn pixCountConnComp(pix: *mut c_void, connectivity: i32, pcount: *mut i32) -> i32;
+
+    /// Retrieves the horizontal and vertical resolution (DPI) from a Pix.
+    ///
+    /// Writes the x-resolution into `*pxres` and y-resolution into `*pyres`.
+    /// Returns 0 on success, non-zero on error.
+    fn pixGetResolution(pix: *const c_void, pxres: *mut i32, pyres: *mut i32) -> i32;
+
+}
+
+// ---------------------------------------------------------------------------
+// Safe Pix wrapper
+// ---------------------------------------------------------------------------
+
+/// Safe wrapper around a Leptonica `PIX *` image object.
+///
+/// Owns the underlying allocation and frees it in `Drop`. All methods that
+/// return a new image allocate a fresh `Pix`; the receiver is never consumed.
+///
+/// # Thread safety
+///
+/// `Pix` is `Send` because Leptonica image objects are independent heap
+/// allocations with no shared mutable state. Concurrent mutation from multiple
+/// threads is **not** safe (no `Sync`): the raw-pointer field keeps the
+/// compiler from deriving `Sync`, and no manual `Sync` impl is provided.
+#[cfg(any(feature = "build-tesseract", feature = "build-tesseract-wasm"))]
+pub struct Pix {
+    /// Raw Leptonica image pointer. The constructors in this module only wrap
+    /// a pointer after it has passed a null check.
+    ptr: *mut c_void,
+}
+
+/// Debug output shows only the raw pointer value; pixel data is not inspected.
+#[cfg(any(feature = "build-tesseract", feature = "build-tesseract-wasm"))]
+impl std::fmt::Debug for Pix {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        let mut builder = f.debug_struct("Pix");
+        builder.field("ptr", &self.ptr);
+        builder.finish()
+    }
+}
+
+// SAFETY: A Pix owns a uniquely heap-allocated Leptonica PIX. There is no
+// interior mutability shared across thread boundaries, so transferring
+// ownership to another thread is safe.
+// NOTE(review): this relies on the wrapper never sharing the underlying PIX
+// between two `Pix` values (e.g. through a reference-counted clone) — confirm
+// for the methods outside this view.
+#[cfg(any(feature = "build-tesseract", feature = "build-tesseract-wasm"))]
+unsafe impl Send for Pix {}
+
+#[cfg(any(feature = "build-tesseract", feature = "build-tesseract-wasm"))]
+impl Pix {
+    // -----------------------------------------------------------------------
+    // Construction
+    // -----------------------------------------------------------------------
+
+    /// Creates a 32 bpp Leptonica Pix from a packed RGB byte slice.
+    ///
+    /// `data` must contain exactly `width * height * 3` bytes in left-to-right,
+    /// top-to-bottom, `R G B` interleaved order.
+    ///
+    /// The DPI is set to 300 × 300 which is a sensible default for OCR input.
+    ///
+    /// # Errors
+    ///
+    /// * `TesseractError::InvalidImageData` if either dimension is zero or
+    ///   does not fit in an `i32`, if `width * height * 3` overflows `usize`,
+    ///   if `data.len()` does not equal `width * height * 3`, or if Leptonica
+    ///   reports fewer words per line than `width`.
+    /// * `TesseractError::NullPointerError` if `pixCreate` or `pixGetData`
+    ///   returns null.
+    ///
+    /// # Examples
+    ///
+    /// ```rust,no_run
+    /// # use kreuzberg_tesseract::Pix;
+    /// let rgb = vec![255u8; 4 * 4 * 3]; // 4×4 white image
+    /// let pix = Pix::from_raw_rgb(&rgb, 4, 4).unwrap();
+    /// assert_eq!(pix.width(), 4);
+    /// assert_eq!(pix.height(), 4);
+    /// assert_eq!(pix.depth(), 32);
+    /// ```
+    pub fn from_raw_rgb(data: &[u8], width: u32, height: u32) -> Result<Pix> {
+        // Leptonica takes `i32` dimensions; reject anything a plain `as i32`
+        // cast would silently turn into a negative number.
+        let max_dim = i32::MAX as u32;
+        if width == 0 || height == 0 || width > max_dim || height > max_dim {
+            return Err(TesseractError::InvalidImageData);
+        }
+
+        let cols = width as usize;
+        let rows = height as usize;
+        let row_bytes = cols.checked_mul(3).ok_or(TesseractError::InvalidImageData)?;
+        let expected = row_bytes
+            .checked_mul(rows)
+            .ok_or(TesseractError::InvalidImageData)?;
+        if data.len() != expected {
+            return Err(TesseractError::InvalidImageData);
+        }
+
+        // SAFETY: width and height are positive and were checked to fit in
+        // `i32`; 32 is a valid Leptonica depth. A null return is handled below.
+        let pix_ptr = unsafe { pixCreate(width as i32, height as i32, 32) };
+        if pix_ptr.is_null() {
+            return Err(TesseractError::NullPointerError);
+        }
+        // Hand ownership to the wrapper immediately so that every early return
+        // below releases the allocation through `Pix`'s `Drop` implementation.
+        let pix = Pix { ptr: pix_ptr };
+
+        // SAFETY: pix.ptr is the non-null allocation created above and has not
+        // been exposed to any other code.
+        let data_ptr = unsafe { pixGetData(pix.ptr) };
+        if data_ptr.is_null() {
+            return Err(TesseractError::NullPointerError);
+        }
+
+        // SAFETY: pixGetWpl only reads the header of the valid Pix.
+        let wpl = unsafe { pixGetWpl(pix.ptr) };
+        // The writes below index `row * wpl + col`; verify the row stride
+        // really covers `width` words instead of merely assuming it.
+        if wpl < 0 || (wpl as usize) < cols {
+            return Err(TesseractError::InvalidImageData);
+        }
+        let wpl = wpl as usize;
+
+        // Leptonica's 32 bpp format stores each pixel as a native 32-bit word
+        // laid out (MSB→LSB) as R G B A, i.e.
+        // `(r << 24) | (g << 16) | (b << 8) | alpha`. The word is written as an
+        // integer, so no byte swapping is needed on either endianness; calling
+        // `pixEndianByteSwap` afterwards would invert the channel order.
+        for (row, src_row) in data.chunks_exact(row_bytes).enumerate() {
+            let row_start = row * wpl;
+            for (col, rgb) in src_row.chunks_exact(3).enumerate() {
+                let word: u32 = (u32::from(rgb[0]) << 24)
+                    | (u32::from(rgb[1]) << 16)
+                    | (u32::from(rgb[2]) << 8)
+                    | 0xFF;
+                // SAFETY: row < height and col < width <= wpl (checked above),
+                // so the offset stays inside the `wpl * height` words that
+                // Leptonica is assumed to allocate for this image.
+                unsafe {
+                    *data_ptr.add(row_start + col) = word;
+                }
+            }
+        }
+
+        // Set a sensible default DPI for OCR processing. The status code is
+        // deliberately ignored: a missing resolution is not fatal to the image.
+        // SAFETY: pix.ptr is valid and non-null.
+        unsafe { pixSetResolution(pix.ptr, 300, 300) };
+
+        Ok(pix)
+    }
+
+    // -----------------------------------------------------------------------
+    // Image processing operations
+    // -----------------------------------------------------------------------
+
+    /// Produces a deskewed copy of this image; `self` is left untouched.
+    ///
+    /// **Note:** this crate treats `pixDeskew` as requiring a 1 bpp (binary)
+    /// image. Run `to_grayscale()` and then `adaptive_threshold()` first when
+    /// starting from a colour or grayscale Pix.
+    ///
+    /// # Errors
+    ///
+    /// Returns `TesseractError::NullPointerError` when Leptonica hands back a
+    /// null pointer (typically because the input is not 1 bpp or the image is
+    /// too small).
+    ///
+    /// # Examples
+    ///
+    /// ```rust,no_run
+    /// # use kreuzberg_tesseract::Pix;
+    /// # let rgb = vec![0u8; 100 * 100 * 3];
+    /// # let pix = Pix::from_raw_rgb(&rgb, 100, 100).unwrap();
+    /// let gray = pix.to_grayscale().unwrap();
+    /// let binary = gray.adaptive_threshold(32, 32).unwrap();
+    /// let deskewed = binary.deskew().unwrap();
+    /// ```
+    pub fn deskew(&self) -> Result<Pix> {
+        // SAFETY: self.ptr is the Pix owned by this wrapper. pixDeskew() does
+        // not take ownership of its input; it returns a separate allocation,
+        // or null when the operation fails. `0` selects Leptonica's default
+        // search reduction.
+        let deskewed = unsafe { pixDeskew(self.ptr, 0) };
+        if deskewed.is_null() {
+            return Err(TesseractError::NullPointerError);
+        }
+        Ok(Pix { ptr: deskewed })
+    }
+
+    /// Measures the skew of this image, returning `(angle, confidence)`.
+    ///
+    /// The angle is in degrees, with positive values indicating
+    /// counter-clockwise skew. The confidence lies in 0–1; values near 1.0
+    /// mean a clear dominant skew direction was found.
+    ///
+    /// **Note:** Like `deskew`, this operates on 1 bpp images.
+    ///
+    /// # Errors
+    ///
+    /// Returns `TesseractError::OcrError` when `pixFindSkew` reports a
+    /// non-zero status (e.g. insufficient contrast or wrong bit depth).
+    ///
+    /// # Examples
+    ///
+    /// ```rust,no_run
+    /// # use kreuzberg_tesseract::Pix;
+    /// # let rgb = vec![0u8; 100 * 100 * 3];
+    /// # let pix = Pix::from_raw_rgb(&rgb, 100, 100).unwrap();
+    /// let gray = pix.to_grayscale().unwrap();
+    /// let binary = gray.adaptive_threshold(32, 32).unwrap();
+    /// let (angle, confidence) = binary.find_skew().unwrap();
+    /// println!("Skew: {angle:.2}° (confidence {confidence:.2})");
+    /// ```
+    pub fn find_skew(&self) -> Result<(f32, f32)> {
+        let (mut angle, mut conf) = (0.0_f32, 0.0_f32);
+        // SAFETY: self.ptr is the Pix owned by this wrapper. Both out-pointers
+        // refer to local f32 values that stay alive for the whole call, so
+        // pixFindSkew() may write through them before returning its status.
+        match unsafe { pixFindSkew(self.ptr, &mut angle, &mut conf) } {
+            0 => Ok((angle, conf)),
+            _ => Err(TesseractError::OcrError),
+        }
+    }
+
+    /// Produces a binarised copy of this image via Otsu adaptive thresholding.
+    ///
+    /// The threshold is computed per tile; `tile_width` and `tile_height` set
+    /// the tile size. Values around 16–64 work well for typical document
+    /// images; smaller tiles follow local contrast more closely.
+    ///
+    /// # Errors
+    ///
+    /// Returns `TesseractError::OcrError` if `pixOtsuAdaptiveThreshold`
+    /// returns a non-zero status, or `TesseractError::NullPointerError` if it
+    /// reports success but leaves the output pointer null.
+    ///
+    /// # Examples
+    ///
+    /// ```rust,no_run
+    /// # use kreuzberg_tesseract::Pix;
+    /// # let rgb = vec![128u8; 64 * 64 * 3];
+    /// # let pix = Pix::from_raw_rgb(&rgb, 64, 64).unwrap();
+    /// let gray = pix.to_grayscale().unwrap();
+    /// let binary = gray.adaptive_threshold(32, 32).unwrap();
+    /// assert_eq!(binary.depth(), 1);
+    /// ```
+    pub fn adaptive_threshold(&self, tile_width: i32, tile_height: i32) -> Result<Pix> {
+        // Out-parameter that receives the binarised image.
+        let mut binarised: *mut c_void = std::ptr::null_mut();
+        // SAFETY: `self.ptr` is a valid non-null Pix. The threshold-map
+        // out-parameter is null because that intermediate image is not
+        // needed. `binarised` is a local that the call may overwrite; it is
+        // checked for null below before being wrapped in a `Pix`.
+        let status = unsafe {
+            pixOtsuAdaptiveThreshold(
+                self.ptr,
+                tile_width,
+                tile_height,
+                0,                    // smoothx: no smoothing
+                0,                    // smoothy: no smoothing
+                0.1,                  // scorefract: Leptonica-recommended default
+                std::ptr::null_mut(), // ppixth: threshold map not requested
+                &mut binarised,
+            )
+        };
+        // The status is examined first, exactly as before: a failing status
+        // wins over a null output pointer.
+        match (status, binarised.is_null()) {
+            (0, false) => Ok(Pix { ptr: binarised }),
+            (0, true) => Err(TesseractError::NullPointerError),
+            _ => Err(TesseractError::OcrError),
+        }
+    }
+
+    /// Reads the image resolution as `(x_dpi, y_dpi)`.
+    ///
+    /// # Errors
+    ///
+    /// Returns `TesseractError::OcrError` if `pixGetResolution` reports a
+    /// non-zero status.
+    pub fn get_resolution(&self) -> Result<(i32, i32)> {
+        let (mut xres, mut yres) = (0_i32, 0_i32);
+        // SAFETY: `self.ptr` is a valid non-null Pix, and both out-pointers
+        // refer to locals that live for the duration of the call.
+        let status = unsafe { pixGetResolution(self.ptr, &mut xres, &mut yres) };
+        match status {
+            0 => Ok((xres, yres)),
+            _ => Err(TesseractError::OcrError),
+        }
+    }
+
+    /// Stores a new horizontal (`xres`) and vertical (`yres`) resolution, in
+    /// DPI, on this image.
+    ///
+    /// # Errors
+    ///
+    /// Returns `TesseractError::OcrError` if `pixSetResolution` reports a
+    /// non-zero status.
+    pub fn set_resolution(&mut self, xres: i32, yres: i32) -> Result<()> {
+        // SAFETY: `self.ptr` is a valid non-null Pix to which we hold
+        // exclusive access (`&mut self`); the call only updates its
+        // resolution fields.
+        let status = unsafe { pixSetResolution(self.ptr, xres, yres) };
+        if status == 0 {
+            Ok(())
+        } else {
+            Err(TesseractError::OcrError)
+        }
+    }
+
+    /// Ensures the image carries a usable (non-zero) DPI resolution.
+    ///
+    /// If **either** the x or the y resolution is zero, both are reset to
+    /// 72 DPI as a safe fallback. This prevents Leptonica operations that
+    /// depend on resolution metadata from producing incorrect results.
+    ///
+    /// This is a best-effort fix-up: if the current resolution cannot be read
+    /// nothing is changed, and a failure to write the fallback is ignored.
+    ///
+    /// Note that although this takes `&self`, it updates the resolution
+    /// stored in the underlying Leptonica Pix through the raw pointer.
+    fn ensure_valid_resolution(&self) {
+        /// DPI written to both axes when the stored resolution is unusable.
+        const FALLBACK_DPI: i32 = 72;
+
+        let Ok((xres, yres)) = self.get_resolution() else {
+            return;
+        };
+        if xres == 0 || yres == 0 {
+            // SAFETY: `self.ptr` is a valid non-null Pix; the call only
+            // updates its resolution fields. The status is deliberately
+            // discarded because this helper is best-effort.
+            let _ = unsafe { pixSetResolution(self.ptr, FALLBACK_DPI, FALLBACK_DPI) };
+        }
+    }
+
+    /// Returns a copy of this image with its background normalised by
+    /// morphological operations.
+    ///
+    /// Handy as a preprocessing step for documents with uneven illumination
+    /// or a non-white background. The source image's pixels are left alone;
+    /// only its resolution metadata may be filled in with a fallback first.
+    ///
+    /// # Errors
+    ///
+    /// Returns `TesseractError::NullPointerError` if `pixBackgroundNormMorph`
+    /// returns null.
+    ///
+    /// # Examples
+    ///
+    /// ```rust,no_run
+    /// # use kreuzberg_tesseract::Pix;
+    /// # let rgb = vec![200u8; 100 * 100 * 3];
+    /// # let pix = Pix::from_raw_rgb(&rgb, 100, 100).unwrap();
+    /// let gray = pix.to_grayscale().unwrap();
+    /// let normalised = gray.background_normalize().unwrap();
+    /// ```
+    pub fn background_normalize(&self) -> Result<Pix> {
+        self.ensure_valid_resolution();
+        // SAFETY: `self.ptr` is a valid non-null Pix. The mask argument is
+        // null (no mask image). The returned pointer is either null, which is
+        // mapped to an error, or a Pix that the wrapper below takes over.
+        let normalised = unsafe {
+            pixBackgroundNormMorph(
+                self.ptr,
+                std::ptr::null_mut(), // pixim: no mask
+                4,                    // reduction: 4x subsampling
+                15,                   // size: morphological SE half-size
+                200,                  // bgval: target background value
+            )
+        };
+        if normalised.is_null() {
+            return Err(TesseractError::NullPointerError);
+        }
+        Ok(Pix { ptr: normalised })
+    }
+
+    /// Returns a sharpened copy of this image produced by unsharp masking.
+    ///
+    /// * `halfwidth` - half-size of the blur kernel (e.g. 1–5).
+    /// * `fract` - sharpening fraction in the range 0.0–1.0; values around
+    ///   0.3–0.5 produce visible sharpening without artefacts.
+    ///
+    /// # Errors
+    ///
+    /// Returns `TesseractError::NullPointerError` if `pixUnsharpMasking`
+    /// returns null.
+    ///
+    /// # Examples
+    ///
+    /// ```rust,no_run
+    /// # use kreuzberg_tesseract::Pix;
+    /// # let rgb = vec![128u8; 64 * 64 * 3];
+    /// # let pix = Pix::from_raw_rgb(&rgb, 64, 64).unwrap();
+    /// let sharpened = pix.unsharp_mask(2, 0.4).unwrap();
+    /// ```
+    pub fn unsharp_mask(&self, halfwidth: i32, fract: f32) -> Result<Pix> {
+        self.ensure_valid_resolution();
+        // SAFETY: `self.ptr` is valid and non-null, and the call only borrows
+        // it. A null return is mapped to an error; any other pointer is taken
+        // over by the `Pix` created below.
+        let sharpened = unsafe { pixUnsharpMasking(self.ptr, halfwidth, fract) };
+        if sharpened.is_null() {
+            return Err(TesseractError::NullPointerError);
+        }
+        Ok(Pix { ptr: sharpened })
+    }
+
+    /// Returns a copy of this image scaled by `sx` horizontally and `sy`
+    /// vertically.
+    ///
+    /// Leptonica automatically chooses the best scaling algorithm based on
+    /// the scale factors and bit depth (area mapping for downscaling,
+    /// linear interpolation for upscaling).
+    ///
+    /// # Errors
+    ///
+    /// Returns `TesseractError::NullPointerError` if `pixScale` returns null.
+    ///
+    /// # Examples
+    ///
+    /// ```rust,no_run
+    /// # use kreuzberg_tesseract::Pix;
+    /// # let rgb = vec![255u8; 40 * 40 * 3];
+    /// # let pix = Pix::from_raw_rgb(&rgb, 40, 40).unwrap();
+    /// let upscaled = pix.scale(2.0, 2.0).unwrap();
+    /// assert_eq!(upscaled.width(), 80);
+    /// assert_eq!(upscaled.height(), 80);
+    /// ```
+    pub fn scale(&self, sx: f32, sy: f32) -> Result<Pix> {
+        // SAFETY: `self.ptr` is valid and non-null, and pixScale() only
+        // borrows it. The result is null-checked before being wrapped.
+        let scaled = unsafe { pixScale(self.ptr, sx, sy) };
+        if scaled.is_null() {
+            return Err(TesseractError::NullPointerError);
+        }
+        Ok(Pix { ptr: scaled })
+    }
+
+    /// Extracts a rectangular sub-region of this image as a new Pix.
+    ///
+    /// The rectangle is given in pixel coordinates: `(x, y)` is its top-left
+    /// corner, `w` its width and `h` its height.
+    ///
+    /// # Errors
+    ///
+    /// Returns `TesseractError::NullPointerError` if either the temporary
+    /// BOX cannot be created or the clip itself returns null.
+    pub fn clip_rectangle(&self, x: i32, y: i32, w: i32, h: i32) -> Result<Pix> {
+        // SAFETY: boxCreate() takes plain integers and returns a
+        // heap-allocated BOX, or null on failure.
+        let mut region = unsafe { boxCreate(x, y, w, h) };
+        if region.is_null() {
+            return Err(TesseractError::NullPointerError);
+        }
+        // SAFETY: `self.ptr` is a valid Pix and `region` was null-checked
+        // above. The last argument is null because the clipped box
+        // coordinates are not needed back.
+        let clipped = unsafe { pixClipRectangle(self.ptr, region, std::ptr::null_mut()) };
+        // SAFETY: `region` is the BOX allocated above; it is released exactly
+        // once here, on both the success and the failure path.
+        unsafe { boxDestroy(&mut region) };
+        if clipped.is_null() {
+            return Err(TesseractError::NullPointerError);
+        }
+        Ok(Pix { ptr: clipped })
+    }
+
+    /// Returns the number of connected components in a 1 bpp (binary) image.
+    ///
+    /// `connectivity` should be 4 or 8.
+    ///
+    /// # Errors
+    ///
+    /// Returns `TesseractError::OcrError` if `pixCountConnComp` fails
+    /// (e.g., wrong bit depth — image must be 1 bpp).
+    pub fn count_connected_components(&self, connectivity: i32) -> Result<i32> {
+        let mut component_count = 0_i32;
+        // SAFETY: `self.ptr` is a valid Pix and the out-pointer refers to a
+        // local that lives for the duration of the call.
+        match unsafe { pixCountConnComp(self.ptr, connectivity, &mut component_count) } {
+            0 => Ok(component_count),
+            _ => Err(TesseractError::OcrError),
+        }
+    }
+
+    /// Produces an 8 bpp grayscale Pix from this 32 bpp RGB image.
+    ///
+    /// All three channel weights are passed as 0.0, which instructs Leptonica
+    /// to use its default perceptual weights (approx. 0.299 R, 0.587 G,
+    /// 0.114 B).
+    ///
+    /// # Errors
+    ///
+    /// Returns `TesseractError::NullPointerError` if `pixConvertRGBToGray`
+    /// returns null (e.g. the source is not 32 bpp).
+    ///
+    /// # Examples
+    ///
+    /// ```rust,no_run
+    /// # use kreuzberg_tesseract::Pix;
+    /// # let rgb = vec![100u8, 150u8, 200u8].repeat(10 * 10);
+    /// # let pix = Pix::from_raw_rgb(&rgb, 10, 10).unwrap();
+    /// let gray = pix.to_grayscale().unwrap();
+    /// assert_eq!(gray.depth(), 8);
+    /// ```
+    pub fn to_grayscale(&self) -> Result<Pix> {
+        self.ensure_valid_resolution();
+        // SAFETY: `self.ptr` is valid and non-null, and the conversion only
+        // borrows it. The result is null-checked before being wrapped.
+        let gray = unsafe { pixConvertRGBToGray(self.ptr, 0.0, 0.0, 0.0) };
+        if gray.is_null() {
+            return Err(TesseractError::NullPointerError);
+        }
+        Ok(Pix { ptr: gray })
+    }
+
+    // -----------------------------------------------------------------------
+    // Accessors
+    // -----------------------------------------------------------------------
+
+    /// Returns the raw Leptonica `PIX *` pointer held by this wrapper.
+    ///
+    /// Intended for passing this image to `TesseractAPI::set_image_2`.
+    /// Ownership is **not** transferred: the `Pix` keeps the pointer and still
+    /// releases it on drop.
+    ///
+    /// # Safety
+    ///
+    /// Calling this method is safe, but *using* the returned pointer is
+    /// subject to the following contract.
+    ///
+    /// The caller must ensure the `Pix` outlives any use of the returned
+    /// pointer.  `TessBaseAPISetImage2` **borrows** the pointer — it does not
+    /// take ownership — so the `Pix` must remain alive until after
+    /// `TessBaseAPIRecognize` (or any other Tesseract call that consumes the
+    /// image data) has completed.  Dropping the `Pix` while Tesseract holds
+    /// the pointer will result in a use-after-free.
+    ///
+    /// The caller must **not** free the returned pointer; `Pix::drop` is
+    /// solely responsible for deallocation via `pixDestroy`.
+    pub fn as_ptr(&self) -> *mut c_void {
+        self.ptr
+    }
+
+    /// Returns the width of the image in pixels, as reported by
+    /// `pixGetWidth`.
+    ///
+    /// # Examples
+    ///
+    /// ```rust,no_run
+    /// # use kreuzberg_tesseract::Pix;
+    /// # let pix = Pix::from_raw_rgb(&vec![0u8; 8 * 6 * 3], 8, 6).unwrap();
+    /// assert_eq!(pix.width(), 8);
+    /// ```
+    pub fn width(&self) -> i32 {
+        // SAFETY: self.ptr is a valid non-null Pix. pixGetWidth() is a pure
+        // read of the Pix header struct; it does not mutate any state.
+        unsafe { pixGetWidth(self.ptr) }
+    }
+
+    /// Returns the height of the image in pixels, as reported by
+    /// `pixGetHeight`.
+    ///
+    /// # Examples
+    ///
+    /// ```rust,no_run
+    /// # use kreuzberg_tesseract::Pix;
+    /// # let pix = Pix::from_raw_rgb(&vec![0u8; 8 * 6 * 3], 8, 6).unwrap();
+    /// assert_eq!(pix.height(), 6);
+    /// ```
+    pub fn height(&self) -> i32 {
+        // SAFETY: self.ptr is a valid non-null Pix. pixGetHeight() is a pure
+        // read of the Pix header struct.
+        unsafe { pixGetHeight(self.ptr) }
+    }
+
+    /// Returns the bit depth of the image in bits per pixel, as reported by
+    /// `pixGetDepth` (1, 8, or 32 for this module's usage).
+    ///
+    /// # Examples
+    ///
+    /// ```rust,no_run
+    /// # use kreuzberg_tesseract::Pix;
+    /// # let pix = Pix::from_raw_rgb(&vec![0u8; 4 * 4 * 3], 4, 4).unwrap();
+    /// assert_eq!(pix.depth(), 32);
+    /// ```
+    pub fn depth(&self) -> i32 {
+        // SAFETY: self.ptr is a valid non-null Pix. pixGetDepth() is a pure
+        // read of the Pix header struct.
+        unsafe { pixGetDepth(self.ptr) }
+    }
+}
+
+// ---------------------------------------------------------------------------
+// Drop implementation
+// ---------------------------------------------------------------------------
+
+#[cfg(any(feature = "build-tesseract", feature = "build-tesseract-wasm"))]
+impl Drop for Pix {
+    /// Releases the underlying Leptonica PIX, if there is one.
+    fn drop(&mut self) {
+        // Nothing to release when the wrapper holds no image.
+        if self.ptr.is_null() {
+            return;
+        }
+        // SAFETY: `self.ptr` is a non-null Leptonica PIX that this wrapper
+        // allocated and owns exclusively. pixDestroy() takes a double
+        // pointer, frees the image and resets `*ppix` to null, so `self.ptr`
+        // is null afterwards and a second drop() could not free it again.
+        unsafe { pixDestroy(&mut self.ptr) };
+    }
+}
+
+// ---------------------------------------------------------------------------
+// Tests
+// ---------------------------------------------------------------------------
+
+#[cfg(test)]
+#[cfg(any(feature = "build-tesseract", feature = "build-tesseract-wasm"))]
+mod tests {
+    use super::*;
+
+    /// Builds a `width` x `height` RGB Pix whose every channel byte is `fill`.
+    fn make_rgb_pix(width: u32, height: u32, fill: u8) -> Pix {
+        let byte_count = (width * height * 3) as usize;
+        let pixels = vec![fill; byte_count];
+        Pix::from_raw_rgb(&pixels, width, height).expect("from_raw_rgb failed")
+    }
+
+    /// A freshly built RGB Pix reports the requested size and 32 bpp depth.
+    #[test]
+    fn test_from_raw_rgb_dimensions() {
+        let pix = make_rgb_pix(16, 8, 200);
+        assert_eq!((pix.width(), pix.height()), (16, 8));
+        assert_eq!(pix.depth(), 32);
+    }
+
+    /// A buffer whose length does not match the dimensions is rejected.
+    #[test]
+    fn test_from_raw_rgb_wrong_length() {
+        // 10 bytes is too short for a 4×4 RGB image.
+        let too_short = vec![0u8; 10];
+        let err = Pix::from_raw_rgb(&too_short, 4, 4).unwrap_err();
+        assert!(matches!(err, TesseractError::InvalidImageData));
+    }
+
+    /// A zero width or a zero height is rejected.
+    #[test]
+    fn test_from_raw_rgb_zero_dimensions() {
+        for (width, height) in [(0, 4), (4, 0)] {
+            let err = Pix::from_raw_rgb(&[], width, height).unwrap_err();
+            assert!(matches!(err, TesseractError::InvalidImageData));
+        }
+    }
+
+    /// The raw pointer of a successfully created Pix is never null.
+    #[test]
+    fn test_as_ptr_is_non_null() {
+        let raw = make_rgb_pix(8, 8, 128).as_ptr();
+        assert!(!raw.is_null());
+    }
+
+    /// Grayscale conversion keeps the size and yields an 8 bpp image.
+    #[test]
+    fn test_to_grayscale() {
+        let gray = make_rgb_pix(32, 32, 150).to_grayscale().expect("to_grayscale failed");
+        assert_eq!((gray.width(), gray.height()), (32, 32));
+        assert_eq!(gray.depth(), 8);
+    }
+
+    /// Scaling by 2 in both directions doubles both dimensions.
+    #[test]
+    fn test_scale_up() {
+        let scaled = make_rgb_pix(20, 10, 100).scale(2.0, 2.0).expect("scale failed");
+        assert_eq!((scaled.width(), scaled.height()), (40, 20));
+    }
+
+    /// Unsharp masking does not change the image size.
+    #[test]
+    fn test_unsharp_mask_returns_same_dimensions() {
+        let sharpened = make_rgb_pix(32, 32, 200)
+            .unsharp_mask(2, 0.4)
+            .expect("unsharp_mask failed");
+        assert_eq!((sharpened.width(), sharpened.height()), (32, 32));
+    }
+
+    /// Adaptive thresholding of a grayscale image yields a 1 bpp image.
+    #[test]
+    fn test_adaptive_threshold_produces_1bpp() {
+        let gray = make_rgb_pix(64, 64, 180).to_grayscale().expect("to_grayscale failed");
+        let binary = gray.adaptive_threshold(32, 32).expect("adaptive_threshold failed");
+        assert_eq!(binary.depth(), 1);
+    }
+}
--- a/crates/kreuzberg-tesseract/src/lib.rs
+++ b/crates/kreuzberg-tesseract/src/lib.rs
@@ -0,0 +1,218 @@
+#![cfg_attr(
+    not(any(feature = "build-tesseract", feature = "build-tesseract-wasm")),
+    allow(unused_variables, dead_code)
+)]
+#![allow(clippy::arc_with_non_send_sync)]
+#![allow(clippy::missing_transmute_annotations)]
+#![allow(clippy::type_complexity)]
+#![allow(clippy::new_without_default)]
+#![allow(clippy::not_unsafe_ptr_arg_deref)]
+#![allow(clippy::cmp_null)]
+
+//! # kreuzberg-tesseract
+//!
+//! `kreuzberg-tesseract` provides safe Rust bindings for Tesseract OCR with built-in compilation
+//! of Tesseract and Leptonica libraries. This crate aims to make OCR functionality
+//! easily accessible in Rust projects while handling the complexity of interfacing
+//! with the underlying C++ libraries.
+//!
+//! ## Usage
+//!
+//! Here's a basic example of how to use `kreuzberg-tesseract`:
+//!
+//! ```rust
+//! use std::path::PathBuf;
+//! use std::error::Error;
+//! use kreuzberg_tesseract::TesseractAPI;
+//!
+//! fn get_default_tessdata_dir() -> PathBuf {
+//!     if cfg!(target_os = "macos") {
+//!         let home_dir = std::env::var("HOME").expect("HOME environment variable not set");
+//!         PathBuf::from(home_dir)
+//!             .join("Library")
+//!             .join("Application Support")
+//!             .join("kreuzberg-tesseract")
+//!             .join("tessdata")
+//!     } else if cfg!(target_os = "linux") {
+//!         let home_dir = std::env::var("HOME").expect("HOME environment variable not set");
+//!         PathBuf::from(home_dir)
+//!             .join(".kreuzberg-tesseract")
+//!             .join("tessdata")
+//!     } else if cfg!(target_os = "windows") {
+//!         PathBuf::from(std::env::var("APPDATA").expect("APPDATA environment variable not set"))
+//!             .join("kreuzberg-tesseract")
+//!             .join("tessdata")
+//!     } else {
+//!         panic!("Unsupported operating system");
+//!     }
+//! }
+//!
+//! fn get_tessdata_dir() -> PathBuf {
+//!     match std::env::var("TESSDATA_PREFIX") {
+//!         Ok(dir) => {
+//!             let path = PathBuf::from(dir);
+//!             let path = if path.ends_with("tessdata") { path } else { path.join("tessdata") };
+//!             println!("Using TESSDATA_PREFIX directory: {:?}", path);
+//!             path
+//!         }
+//!         Err(_) => {
+//!             let default_dir = get_default_tessdata_dir();
+//!             println!(
+//!                 "TESSDATA_PREFIX not set, using default directory: {:?}",
+//!                 default_dir
+//!             );
+//!             default_dir
+//!         }
+//!     }
+//! }
+//!
+//! fn main() -> Result<(), Box<dyn Error>> {
+//!     let api = TesseractAPI::new()?;
+//!
+//!     // Get tessdata directory (uses default location or TESSDATA_PREFIX if set)
+//!     let tessdata_dir = get_tessdata_dir();
+//!     api.init(tessdata_dir.to_str().unwrap(), "eng")?;
+//!
+//!     let width = 24;
+//!     let height = 24;
+//!     let bytes_per_pixel = 1;
+//!     let bytes_per_line = width * bytes_per_pixel;
+//!
+//!     // Initialize image data with all white pixels
+//!     let mut image_data = vec![255u8; width * height];
+//!
+//!     // Draw number 9 with clearer distinction
+//!     for y in 4..19 {
+//!         for x in 7..17 {
+//!             // Top bar
+//!             if y == 4 && x >= 8 && x <= 15 {
+//!                 image_data[y * width + x] = 0;
+//!             }
+//!             // Top curve left side
+//!             if y >= 4 && y <= 10 && x == 7 {
+//!                 image_data[y * width + x] = 0;
+//!             }
+//!             // Top curve right side
+//!             if y >= 4 && y <= 11 && x == 16 {
+//!                 image_data[y * width + x] = 0;
+//!             }
+//!             // Middle bar
+//!             if y == 11 && x >= 8 && x <= 15 {
+//!                 image_data[y * width + x] = 0;
+//!             }
+//!             // Bottom right vertical line
+//!             if y >= 11 && y <= 18 && x == 16 {
+//!                 image_data[y * width + x] = 0;
+//!             }
+//!             // Bottom bar
+//!             if y == 18 && x >= 8 && x <= 15 {
+//!                 image_data[y * width + x] = 0;
+//!             }
+//!         }
+//!     }
+//!
+//!     // Set the image data
+//!     api.set_image(&image_data, width.try_into().unwrap(), height.try_into().unwrap(), bytes_per_pixel.try_into().unwrap(), bytes_per_line.try_into().unwrap())?;
+//!
+//!     // Set whitelist for digits only
+//!     api.set_variable("tessedit_char_whitelist", "0123456789")?;
+//!
+//!     // Set PSM mode to single character
+//!     api.set_variable("tessedit_pageseg_mode", "10")?;
+//!
+//!     // Get the recognized text
+//!     let text = api.get_utf8_text()?;
+//!     println!("Recognized text: {}", text.trim());
+//!
+//!     Ok(())
+//! }
+//! ```
+/// Declares a list of foreign functions once and emits them under the ABI
+/// appropriate for the compilation target.
+///
+/// The input is a sequence of `fn` signatures, each optionally preceded by
+/// attributes and a visibility. The same signatures are expanded twice:
+///
+/// * on every target except `wasm32`, inside `unsafe extern "C-unwind"`, so
+///   that C++ exceptions from Tesseract/Leptonica may unwind across the FFI
+///   boundary;
+/// * on `wasm32`, inside plain `unsafe extern "C"`, because the LLVM backend
+///   does not support `cleanupret` / C++ unwinding there.
+///
+/// Exactly one of the two blocks is compiled for any given target.
+macro_rules! ffi_extern {
+    (
+        $(
+            // Attributes (including doc comments) attached to one declaration.
+            $(#[$meta:meta])*
+            // One signature: visibility, name, argument list (trailing comma
+            // allowed) and an optional return type.
+            $vis:vis fn $name:ident($($arg:ident : $ty:ty),* $(,)?) $(-> $ret:ty)?;
+        )*
+    ) => {
+        // Native targets: unwinding-capable ABI.
+        #[cfg(not(target_arch = "wasm32"))]
+        unsafe extern "C-unwind" {
+            $(
+                $(#[$meta])*
+                $vis fn $name($($arg : $ty),*) $(-> $ret)?;
+            )*
+        }
+
+        // WASM: plain C ABI, no unwinding.
+        #[cfg(target_arch = "wasm32")]
+        unsafe extern "C" {
+            $(
+                $(#[$meta])*
+                $vis fn $name($($arg : $ty),*) $(-> $ret)?;
+            )*
+        }
+    };
+}
+
+pub use error::{Result, TesseractError};
+mod error;
+
+// WASM: Override __cxa_atexit to be a no-op. WASI SDK's __cxa_atexit calls calloc during
+// C++ static initialization, which crashes because dlmalloc's heap isn't properly set up
+// for wasm32-unknown-unknown. Since WASM modules never exit normally, atexit handlers
+// are unnecessary.
+//
+// The module is compiled only for wasm32 targets other than Emscripten.
+#[cfg(all(target_arch = "wasm32", not(target_os = "emscripten")))]
+mod wasm_compat {
+    /// Replacement for the C++ runtime's `__cxa_atexit`.
+    ///
+    /// Exported under its unmangled name. All three arguments are ignored, so
+    /// nothing is ever registered, and `0` is always returned.
+    ///
+    /// # Safety
+    ///
+    /// The body touches none of its arguments, so any argument values are
+    /// acceptable. The function is `unsafe` only to match the C declaration.
+    #[unsafe(no_mangle)]
+    pub unsafe extern "C" fn __cxa_atexit(
+        _func: Option<unsafe extern "C" fn(*mut core::ffi::c_void)>,
+        _arg: *mut core::ffi::c_void,
+        _dso_handle: *mut core::ffi::c_void,
+    ) -> i32 {
+        0 // Success, but don't actually register anything
+    }
+}
+mod page_iterator;
+pub use page_iterator::{BlockInfo, PageIterator, ParaInfo};
+mod result_iterator;
+pub use result_iterator::{FontAttributes, ResultIterator, WordData};
+mod choice_iterator;
+pub use choice_iterator::ChoiceIterator;
+mod monitor;
+pub use monitor::TessMonitor;
+mod result_renderer;
+pub use result_renderer::TessResultRenderer;
+mod mutable_iterator;
+pub use mutable_iterator::MutableIterator;
+mod enums;
+pub use enums::{
+    TessOrientation, TessPageIteratorLevel, TessPageSegMode, TessParagraphJustification, TessPolyBlockType,
+    TessTextlineOrder, TessWritingDirection,
+};
+mod api;
+pub use api::{BoundingBoxArray, TesseractAPI};
+pub mod leptonica;
+#[cfg(any(feature = "build-tesseract", feature = "build-tesseract-wasm"))]
+pub use leptonica::Pix;
+
+/// Returns the English `eng.traineddata` blob embedded at compile time.
+///
+/// This variant is compiled when the `bundle-tessdata-eng` feature is enabled
+/// and always returns `Some`; without the feature the function returns `None`.
+///
+/// The bundled data is the `tessdata_fast` variant (~4 MB) downloaded by
+/// `build.rs` to `TESSDATA_PREFIX_BUNDLED/tessdata/eng.traineddata`. Embedding
+/// it lets WASM builds drive Tesseract OCR without filesystem access or
+/// runtime fetches.
+#[cfg(feature = "bundle-tessdata-eng")]
+pub fn bundled_eng_traineddata() -> Option<&'static [u8]> {
+    // The path is assembled at compile time from the build-script-provided
+    // `TESSDATA_PREFIX_BUNDLED` environment variable.
+    let blob: &'static [u8] = include_bytes!(concat!(
+        env!("TESSDATA_PREFIX_BUNDLED"),
+        "/tessdata/eng.traineddata"
+    ));
+    Some(blob)
+}
+
+/// Always returns `None`: this variant is compiled when the
+/// `bundle-tessdata-eng` feature is disabled, so no data is embedded.
+#[cfg(not(feature = "bundle-tessdata-eng"))]
+pub fn bundled_eng_traineddata() -> Option<&'static [u8]> {
+    Option::None
+}
--- a/crates/kreuzberg-tesseract/src/monitor.rs
+++ b/crates/kreuzberg-tesseract/src/monitor.rs
@@ -0,0 +1,68 @@
+use crate::error::{Result, TesseractError};
+use std::os::raw::{c_int, c_void};
+use std::sync::{Arc, Mutex};
+
+/// Owning wrapper around a Tesseract monitor handle.
+///
+/// The raw handle is obtained from `TessMonitorCreate` in
+/// [`TessMonitor::new`] and handed to `TessMonitorDelete` when the wrapper is
+/// dropped. Every access to it goes through the mutex.
+pub struct TessMonitor {
+    /// Raw monitor pointer returned by `TessMonitorCreate`, behind a mutex.
+    handle: Arc<Mutex<*mut c_void>>,
+}
+
+// SAFETY: NOTE(review): the raw pointer is only ever dereferenced by Tesseract
+// while the mutex is held, which serialises access from Rust. This assumes the
+// underlying monitor object may be used from a thread other than the one that
+// created it — confirm against Tesseract's threading guarantees.
+unsafe impl Send for TessMonitor {}
+unsafe impl Sync for TessMonitor {}
+
+impl TessMonitor {
+    /// Creates a new monitor by calling `TessMonitorCreate`.
+    ///
+    /// # Returns
+    ///
+    /// A `TessMonitor` wrapping the freshly created handle.
+    pub fn new() -> Self {
+        // SAFETY: TessMonitorCreate() takes no arguments; the returned handle
+        // is stored as-is and only used through the mutex from here on.
+        let raw = unsafe { TessMonitorCreate() };
+        Self {
+            handle: Arc::new(Mutex::new(raw)),
+        }
+    }
+
+    /// Sets the deadline for the monitor.
+    ///
+    /// # Arguments
+    ///
+    /// * `deadline` - Deadline in milliseconds.
+    ///
+    /// # Errors
+    ///
+    /// Returns a `TesseractError::MutexLockError` if the mutex lock fails.
+    pub fn set_deadline(&self, deadline: i32) -> Result<()> {
+        match self.handle.lock() {
+            Ok(guard) => {
+                // SAFETY: the handle is the one stored at construction and
+                // the lock is held for the duration of the call.
+                unsafe { TessMonitorSetDeadlineMSecs(*guard, deadline) };
+                Ok(())
+            }
+            Err(_) => Err(TesseractError::MutexLockError),
+        }
+    }
+
+    /// Reads the progress value currently reported by the monitor.
+    ///
+    /// # Returns
+    ///
+    /// The value returned by `TessMonitorGetProgress`.
+    ///
+    /// # Errors
+    ///
+    /// Returns a `TesseractError::MutexLockError` if the mutex lock fails.
+    pub fn get_progress(&self) -> Result<i32> {
+        match self.handle.lock() {
+            // SAFETY: the handle is the one stored at construction and the
+            // lock is held for the duration of the call.
+            Ok(guard) => Ok(unsafe { TessMonitorGetProgress(*guard) }),
+            Err(_) => Err(TesseractError::MutexLockError),
+        }
+    }
+}
+
+impl Drop for TessMonitor {
+    /// Releases the monitor handle via `TessMonitorDelete`.
+    ///
+    /// A poisoned mutex is recovered rather than skipped: poisoning only
+    /// records that another thread panicked while holding the lock, and
+    /// skipping the delete in that case would leak the handle.
+    fn drop(&mut self) {
+        let mut handle = self
+            .handle
+            .lock()
+            .unwrap_or_else(|poisoned| poisoned.into_inner());
+        if !(*handle).is_null() {
+            // SAFETY: the handle came from TessMonitorCreate() and has not
+            // been released yet (it is reset to null right after this call).
+            unsafe { TessMonitorDelete(*handle) };
+            *handle = std::ptr::null_mut();
+        }
+    }
+}
+
+// Foreign functions backing `TessMonitor`. They are declared through
+// `ffi_extern!`, which picks the ABI for the compilation target.
+ffi_extern! {
+    // Creates a monitor and returns its raw handle.
+    pub fn TessMonitorCreate() -> *mut c_void;
+    // Releases a monitor; called from `TessMonitor`'s `Drop`.
+    pub fn TessMonitorDelete(monitor: *mut c_void);
+    // Sets the monitor's deadline, in milliseconds.
+    pub fn TessMonitorSetDeadlineMSecs(monitor: *mut c_void, deadline: c_int);
+    // Returns the monitor's current progress value.
+    pub fn TessMonitorGetProgress(monitor: *mut c_void) -> c_int;
+}
--- a/crates/kreuzberg-tesseract/src/mutable_iterator.rs
+++ b/crates/kreuzberg-tesseract/src/mutable_iterator.rs
@@ -0,0 +1,197 @@
+use crate::error::{Result, TesseractError};
+use std::ffi::CStr;
+use std::os::raw::{c_char, c_void};
+use std::sync::Arc;
+use std::sync::Mutex;
+
+use crate::result_iterator::{
+    TessResultIteratorConfidence, TessResultIteratorGetUTF8Text, TessResultIteratorNext,
+    TessResultIteratorSymbolIsDropcap, TessResultIteratorSymbolIsSubscript, TessResultIteratorSymbolIsSuperscript,
+    TessResultIteratorWordFontAttributes, TessResultIteratorWordIsFromDictionary, TessResultIteratorWordIsNumeric,
+    TessResultIteratorWordRecognitionLanguage,
+};
+
+/// Owning wrapper around a raw Tesseract iterator pointer.
+///
+/// The pointer is supplied to [`MutableIterator::new`] and handed to
+/// `TessResultIteratorDelete` when the wrapper is dropped. Every access to it
+/// goes through the mutex.
+pub struct MutableIterator {
+    /// Raw iterator pointer, behind a mutex.
+    handle: Arc<Mutex<*mut c_void>>,
+}
+
+// SAFETY: NOTE(review): the raw pointer is only passed to Tesseract while the
+// mutex is held, which serialises access from Rust. This assumes the
+// underlying iterator may be used from a thread other than the one that
+// created it — confirm against Tesseract's threading guarantees.
+unsafe impl Send for MutableIterator {}
+unsafe impl Sync for MutableIterator {}
+
+impl MutableIterator {
+    /// Wraps a raw Tesseract iterator pointer.
+    ///
+    /// The wrapper takes over the pointer: it is passed to
+    /// `TessResultIteratorDelete` when the `MutableIterator` is dropped.
+    ///
+    /// # Arguments
+    ///
+    /// * `handle` - Raw iterator pointer obtained from Tesseract.
+    pub fn new(handle: *mut c_void) -> Self {
+        MutableIterator {
+            handle: Arc::new(Mutex::new(handle)),
+        }
+    }
+
+    /// Gets the UTF-8 text at the iterator's current position.
+    ///
+    /// # Arguments
+    ///
+    /// * `level` - Iterator level forwarded to Tesseract (presumably a
+    ///   `TessPageIteratorLevel` value — confirm against the caller).
+    ///
+    /// # Errors
+    ///
+    /// * `TesseractError::MutexError` if the mutex lock fails.
+    /// * `TesseractError::NullPointerError` if Tesseract returns no text.
+    /// * A UTF-8 conversion error if the returned bytes are not valid UTF-8.
+    ///   The Tesseract-allocated buffer is released in this case as well.
+    pub fn get_utf8_text(&self, level: i32) -> Result<String> {
+        let handle = self.handle.lock().map_err(|_| TesseractError::MutexError)?;
+        // SAFETY: the handle is the pointer stored at construction and the
+        // lock is held for the duration of the call.
+        let text_ptr = unsafe { TessResultIteratorGetUTF8Text(*handle, level) };
+        if text_ptr.is_null() {
+            return Err(TesseractError::NullPointerError);
+        }
+        // Copy the text out *before* propagating any UTF-8 error, so that the
+        // buffer below is released on every path (an early `?` here used to
+        // skip TessDeleteText and leak it).
+        // SAFETY: `text_ptr` is non-null and points at the NUL-terminated
+        // string Tesseract just returned; it is not freed until after the
+        // borrowed `&str` has been copied into an owned `String`.
+        let text = unsafe { CStr::from_ptr(text_ptr) }.to_str().map(str::to_owned);
+        // SAFETY: `text_ptr` was returned by Tesseract above and is released
+        // exactly once; nothing borrows from it any more.
+        unsafe { TessDeleteText(text_ptr as *mut c_char) };
+        Ok(text?)
+    }
+
+    /// Gets the confidence value at the iterator's current position.
+    ///
+    /// # Arguments
+    ///
+    /// * `level` - Iterator level forwarded to Tesseract.
+    ///
+    /// # Errors
+    ///
+    /// Returns `TesseractError::MutexError` if the mutex lock fails.
+    pub fn confidence(&self, level: i32) -> Result<f32> {
+        let handle = self.handle.lock().map_err(|_| TesseractError::MutexError)?;
+        // SAFETY: stored handle, used while the lock is held.
+        Ok(unsafe { TessResultIteratorConfidence(*handle, level) })
+    }
+
+    /// Gets the recognition language of the current word.
+    ///
+    /// # Returns
+    ///
+    /// The language as an owned `String`. The string returned by Tesseract is
+    /// only copied, never freed here.
+    ///
+    /// # Errors
+    ///
+    /// * `TesseractError::MutexError` if the mutex lock fails.
+    /// * `TesseractError::NullPointerError` if Tesseract returns null.
+    /// * A UTF-8 conversion error if the bytes are not valid UTF-8.
+    pub fn word_recognition_language(&self) -> Result<String> {
+        let handle = self.handle.lock().map_err(|_| TesseractError::MutexError)?;
+        // SAFETY: stored handle, used while the lock is held.
+        let lang_ptr = unsafe { TessResultIteratorWordRecognitionLanguage(*handle) };
+        if lang_ptr.is_null() {
+            return Err(TesseractError::NullPointerError);
+        }
+        // SAFETY: `lang_ptr` is non-null and points at a NUL-terminated
+        // string; it is copied into an owned `String` before returning.
+        let c_str = unsafe { CStr::from_ptr(lang_ptr) };
+        Ok(c_str.to_str()?.to_owned())
+    }
+
+    /// Gets the font attributes of the current word.
+    ///
+    /// # Returns
+    ///
+    /// A tuple `(is_bold, is_italic, is_underlined, is_monospace, is_serif,
+    /// is_smallcaps, pointsize, font_id)`.
+    ///
+    /// # Errors
+    ///
+    /// * `TesseractError::MutexError` if the mutex lock fails.
+    /// * `TesseractError::InvalidParameterError` if the Tesseract call
+    ///   returns zero.
+    pub fn word_font_attributes(&self) -> Result<(bool, bool, bool, bool, bool, bool, i32, i32)> {
+        let handle = self.handle.lock().map_err(|_| TesseractError::MutexError)?;
+        // Out-parameters filled in by Tesseract; the flags come back as
+        // integers and are converted to `bool` below.
+        let mut is_bold = 0;
+        let mut is_italic = 0;
+        let mut is_underlined = 0;
+        let mut is_monospace = 0;
+        let mut is_serif = 0;
+        let mut is_smallcaps = 0;
+        let mut pointsize = 0;
+        let mut font_id = 0;
+
+        // SAFETY: stored handle, used while the lock is held; every
+        // out-pointer refers to a local that outlives the call.
+        let result = unsafe {
+            TessResultIteratorWordFontAttributes(
+                *handle,
+                &mut is_bold,
+                &mut is_italic,
+                &mut is_underlined,
+                &mut is_monospace,
+                &mut is_serif,
+                &mut is_smallcaps,
+                &mut pointsize,
+                &mut font_id,
+            )
+        };
+
+        if result == 0 {
+            Err(TesseractError::InvalidParameterError)
+        } else {
+            Ok((
+                is_bold != 0,
+                is_italic != 0,
+                is_underlined != 0,
+                is_monospace != 0,
+                is_serif != 0,
+                is_smallcaps != 0,
+                pointsize,
+                font_id,
+            ))
+        }
+    }
+
+    /// Checks if the current word is from the dictionary.
+    ///
+    /// # Returns
+    ///
+    /// `Ok(true)` if Tesseract reports a non-zero value, `Ok(false)` otherwise.
+    ///
+    /// # Errors
+    ///
+    /// Returns `TesseractError::MutexError` if the mutex lock fails.
+    pub fn word_is_from_dictionary(&self) -> Result<bool> {
+        let handle = self.handle.lock().map_err(|_| TesseractError::MutexError)?;
+        // SAFETY: stored handle, used while the lock is held.
+        Ok(unsafe { TessResultIteratorWordIsFromDictionary(*handle) != 0 })
+    }
+
+    /// Checks if the current word is numeric.
+    ///
+    /// # Returns
+    ///
+    /// `Ok(true)` if Tesseract reports a non-zero value, `Ok(false)` otherwise.
+    ///
+    /// # Errors
+    ///
+    /// Returns `TesseractError::MutexError` if the mutex lock fails.
+    pub fn word_is_numeric(&self) -> Result<bool> {
+        let handle = self.handle.lock().map_err(|_| TesseractError::MutexError)?;
+        // SAFETY: stored handle, used while the lock is held.
+        Ok(unsafe { TessResultIteratorWordIsNumeric(*handle) != 0 })
+    }
+
+    /// Checks if the current symbol is superscript.
+    ///
+    /// # Returns
+    ///
+    /// `Ok(true)` if Tesseract reports a non-zero value, `Ok(false)` otherwise.
+    ///
+    /// # Errors
+    ///
+    /// Returns `TesseractError::MutexError` if the mutex lock fails.
+    pub fn symbol_is_superscript(&self) -> Result<bool> {
+        let handle = self.handle.lock().map_err(|_| TesseractError::MutexError)?;
+        // SAFETY: stored handle, used while the lock is held.
+        Ok(unsafe { TessResultIteratorSymbolIsSuperscript(*handle) != 0 })
+    }
+
+    /// Checks if the current symbol is subscript.
+    ///
+    /// # Returns
+    ///
+    /// `Ok(true)` if Tesseract reports a non-zero value, `Ok(false)` otherwise.
+    ///
+    /// # Errors
+    ///
+    /// Returns `TesseractError::MutexError` if the mutex lock fails.
+    pub fn symbol_is_subscript(&self) -> Result<bool> {
+        let handle = self.handle.lock().map_err(|_| TesseractError::MutexError)?;
+        // SAFETY: stored handle, used while the lock is held.
+        Ok(unsafe { TessResultIteratorSymbolIsSubscript(*handle) != 0 })
+    }
+
+    /// Checks if the current symbol is dropcap.
+    ///
+    /// # Returns
+    ///
+    /// `Ok(true)` if Tesseract reports a non-zero value, `Ok(false)` otherwise.
+    ///
+    /// # Errors
+    ///
+    /// Returns `TesseractError::MutexError` if the mutex lock fails.
+    pub fn symbol_is_dropcap(&self) -> Result<bool> {
+        let handle = self.handle.lock().map_err(|_| TesseractError::MutexError)?;
+        // SAFETY: stored handle, used while the lock is held.
+        Ok(unsafe { TessResultIteratorSymbolIsDropcap(*handle) != 0 })
+    }
+
+    /// Advances the iterator to the next element.
+    ///
+    /// # Arguments
+    ///
+    /// * `level` - Iterator level forwarded to Tesseract.
+    ///
+    /// # Returns
+    ///
+    /// `Ok(true)` if Tesseract reports a non-zero value for the step,
+    /// `Ok(false)` otherwise.
+    ///
+    /// # Errors
+    ///
+    /// Returns `TesseractError::MutexError` if the mutex lock fails.
+    pub fn next(&self, level: i32) -> Result<bool> {
+        let handle = self.handle.lock().map_err(|_| TesseractError::MutexError)?;
+        // SAFETY: stored handle, used while the lock is held.
+        Ok(unsafe { TessResultIteratorNext(*handle, level) != 0 })
+    }
+}
+
+impl Drop for MutableIterator {
+    /// Releases the iterator via `TessResultIteratorDelete`.
+    ///
+    /// A poisoned mutex is recovered rather than skipped: poisoning only
+    /// records that another thread panicked while holding the lock, and
+    /// skipping the delete in that case would leak the iterator.
+    fn drop(&mut self) {
+        let mut handle = self
+            .handle
+            .lock()
+            .unwrap_or_else(|poisoned| poisoned.into_inner());
+        if !(*handle).is_null() {
+            // SAFETY: the handle is the pointer this wrapper took over at
+            // construction and has not been released yet (it is reset to
+            // null right after this call).
+            unsafe { TessResultIteratorDelete(*handle) };
+            *handle = std::ptr::null_mut();
+        }
+    }
+}
+
+// Foreign functions declared by this module; the remaining iterator functions
+// it uses are imported from `crate::result_iterator`.
+ffi_extern! {
+    // Releases an iterator; called from `MutableIterator`'s `Drop`.
+    pub fn TessResultIteratorDelete(handle: *mut c_void);
+    // Frees a text buffer returned by Tesseract (used by `get_utf8_text`).
+    pub fn TessDeleteText(text: *mut c_char);
+}
--- a/crates/kreuzberg-tesseract/src/page_iterator.rs
+++ b/crates/kreuzberg-tesseract/src/page_iterator.rs
@@ -0,0 +1,421 @@
+use crate::TesseractError;
+use crate::enums::{
+    TessOrientation, TessPageIteratorLevel, TessParagraphJustification, TessPolyBlockType, TessTextlineOrder,
+    TessWritingDirection,
+};
+use crate::error::Result;
+use std::os::raw::{c_float, c_int, c_void};
+use std::sync::Arc;
+use std::sync::Mutex;
+
+/// Block-level layout information from Tesseract.
+///
+/// One value is produced per block by `PageIterator::extract_all_blocks`.
+#[derive(Debug, Clone)]
+pub struct BlockInfo {
+    /// Block type, converted from the integer returned by `TessPageIteratorBlockType`.
+    pub block_type: TessPolyBlockType,
+    /// Left edge of the bounding box as written by `TessPageIteratorBoundingBox`.
+    pub left: i32,
+    /// Top edge of the bounding box as written by `TessPageIteratorBoundingBox`.
+    pub top: i32,
+    /// Right edge of the bounding box as written by `TessPageIteratorBoundingBox`.
+    pub right: i32,
+    /// Bottom edge of the bounding box as written by `TessPageIteratorBoundingBox`.
+    pub bottom: i32,
+}
+
+/// Paragraph-level information from Tesseract.
+///
+/// One value is produced per paragraph by `PageIterator::extract_all_paragraphs`.
+#[derive(Debug, Clone)]
+pub struct ParaInfo {
+    /// Justification, converted from the integer written by `TessPageIteratorParagraphInfo`.
+    pub justification: TessParagraphJustification,
+    /// `true` when the list-item flag written by `TessPageIteratorParagraphInfo` is non-zero.
+    pub is_list_item: bool,
+    /// `true` when the crown flag written by `TessPageIteratorParagraphInfo` is non-zero.
+    pub is_crown: bool,
+    /// First-line indent value written by `TessPageIteratorParagraphInfo`.
+    pub first_line_indent: i32,
+    /// Left edge of the bounding box as written by `TessPageIteratorBoundingBox`.
+    pub left: i32,
+    /// Top edge of the bounding box as written by `TessPageIteratorBoundingBox`.
+    pub top: i32,
+    /// Right edge of the bounding box as written by `TessPageIteratorBoundingBox`.
+    pub right: i32,
+    /// Bottom edge of the bounding box as written by `TessPageIteratorBoundingBox`.
+    pub bottom: i32,
+}
+
+/// Wrapper around a raw Tesseract page-iterator handle.
+///
+/// Every method of this type locks the mutex before passing the pointer to Tesseract, and
+/// the `Drop` impl hands the pointer to `TessPageIteratorDelete`.
+pub struct PageIterator {
+    /// Raw iterator pointer, guarded by a mutex.
+    pub handle: Arc<Mutex<*mut c_void>>,
+}
+
+// NOTE(review): raw pointers are neither `Send` nor `Sync` by default, so these impls assert
+// it by hand. Methods in this file only use the pointer while holding the mutex; whether the
+// native object may be used from other threads is assumed here, not shown — confirm.
+unsafe impl Send for PageIterator {}
+unsafe impl Sync for PageIterator {}
+
+impl PageIterator {
+    /// Wraps a raw page-iterator pointer in a new `PageIterator`.
+    ///
+    /// # Arguments
+    ///
+    /// * `handle` - Raw pointer to the native page iterator; it is stored as given and
+    ///   passed to `TessPageIteratorDelete` when the wrapper is dropped.
+    ///
+    /// # Returns
+    ///
+    /// The wrapper holding `handle` behind an `Arc<Mutex<_>>`.
+    pub fn new(handle: *mut c_void) -> Self {
+        let guarded = Mutex::new(handle);
+        Self {
+            handle: Arc::new(guarded),
+        }
+    }
+
+    /// Resets the iterator by calling `TessPageIteratorBegin`.
+    ///
+    /// # Errors
+    ///
+    /// Returns `TesseractError::MutexLockError` when the handle lock cannot be acquired.
+    pub fn begin(&self) -> Result<()> {
+        let guard = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
+        // SAFETY: the guard is held for the whole call; the pointer is assumed to be the
+        // live iterator handed to `new`.
+        unsafe {
+            TessPageIteratorBegin(*guard);
+        }
+        Ok(())
+    }
+
+    /// Advances the iterator to the next element at the requested level.
+    ///
+    /// # Arguments
+    ///
+    /// * `level` - Granularity at which to advance.
+    ///
+    /// # Returns
+    ///
+    /// `Ok(true)` when `TessPageIteratorNext` reports a non-zero result, `Ok(false)` otherwise.
+    ///
+    /// # Errors
+    ///
+    /// Returns `TesseractError::MutexLockError` when the handle lock cannot be acquired.
+    pub fn next(&self, level: TessPageIteratorLevel) -> Result<bool> {
+        let guard = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
+        let level = level as c_int;
+        // SAFETY: the guard is held while the iterator position is mutated; the pointer is
+        // assumed to be the live iterator handed to `new`.
+        let advanced = unsafe { TessPageIteratorNext(*guard, level) };
+        Ok(advanced != 0)
+    }
+
+    /// Asks Tesseract whether the current position starts an element of the given level.
+    ///
+    /// # Arguments
+    ///
+    /// * `level` - Level to test against.
+    ///
+    /// # Returns
+    ///
+    /// `Ok(true)` when `TessPageIteratorIsAtBeginningOf` returns non-zero, `Ok(false)` otherwise.
+    ///
+    /// # Errors
+    ///
+    /// Returns `TesseractError::MutexLockError` when the handle lock cannot be acquired.
+    pub fn is_at_beginning_of(&self, level: TessPageIteratorLevel) -> Result<bool> {
+        let guard = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
+        let level = level as c_int;
+        // SAFETY: the guard is held for the whole call; the pointer is assumed to be the
+        // live iterator handed to `new`.
+        let at_start = unsafe { TessPageIteratorIsAtBeginningOf(*guard, level) };
+        Ok(at_start != 0)
+    }
+
+    /// Asks Tesseract whether the current `element` is the last one inside the enclosing `level`.
+    ///
+    /// # Arguments
+    ///
+    /// * `level` - Enclosing level, forwarded as the second argument.
+    /// * `element` - Element level, forwarded as the third argument.
+    ///
+    /// # Returns
+    ///
+    /// `Ok(true)` when `TessPageIteratorIsAtFinalElement` returns non-zero, `Ok(false)` otherwise.
+    ///
+    /// # Errors
+    ///
+    /// Returns `TesseractError::MutexLockError` when the handle lock cannot be acquired.
+    pub fn is_at_final_element(&self, level: TessPageIteratorLevel, element: TessPageIteratorLevel) -> Result<bool> {
+        let guard = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
+        let level = level as c_int;
+        let element = element as c_int;
+        // SAFETY: the guard is held for the whole call; the pointer is assumed to be the
+        // live iterator handed to `new`.
+        let is_final = unsafe { TessPageIteratorIsAtFinalElement(*guard, level, element) };
+        Ok(is_final != 0)
+    }
+
+    /// Fetches the bounding box of the current element at the given level.
+    ///
+    /// # Arguments
+    ///
+    /// * `level` - Level whose bounding box is requested.
+    ///
+    /// # Returns
+    ///
+    /// `Ok((left, top, right, bottom))` when Tesseract reports success.
+    ///
+    /// # Errors
+    ///
+    /// * `TesseractError::MutexLockError` when the handle lock cannot be acquired.
+    /// * `TesseractError::InvalidParameterError` when `TessPageIteratorBoundingBox` returns 0.
+    pub fn bounding_box(&self, level: TessPageIteratorLevel) -> Result<(i32, i32, i32, i32)> {
+        let mut left: c_int = 0;
+        let mut top: c_int = 0;
+        let mut right: c_int = 0;
+        let mut bottom: c_int = 0;
+        let guard = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
+        // SAFETY: the guard is held for the whole call and the four out-pointers refer to
+        // distinct locals that outlive it; the handle is assumed to be the live iterator
+        // handed to `new`.
+        let status = unsafe {
+            TessPageIteratorBoundingBox(*guard, level as c_int, &mut left, &mut top, &mut right, &mut bottom)
+        };
+        if status == 0 {
+            return Err(TesseractError::InvalidParameterError);
+        }
+        Ok((left, top, right, bottom))
+    }
+
+    /// Reads the block type at the current iterator position.
+    ///
+    /// # Returns
+    ///
+    /// The integer from `TessPageIteratorBlockType`, converted with `TessPolyBlockType::from_int`.
+    ///
+    /// # Errors
+    ///
+    /// Returns `TesseractError::MutexLockError` when the handle lock cannot be acquired.
+    pub fn block_type(&self) -> Result<TessPolyBlockType> {
+        let guard = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
+        // SAFETY: the guard is held for the whole call; the pointer is assumed to be the
+        // live iterator handed to `new`.
+        let raw_type = unsafe { TessPageIteratorBlockType(*guard) };
+        let converted = TessPolyBlockType::from_int(raw_type);
+        Ok(converted)
+    }
+
+    /// Fetches the baseline of the current element at the given level.
+    ///
+    /// # Arguments
+    ///
+    /// * `level` - Raw page-iterator level value, forwarded unchanged to Tesseract. Unlike
+    ///   the sibling methods this takes an `i32` rather than `TessPageIteratorLevel`.
+    ///
+    /// # Returns
+    ///
+    /// `Ok((x1, y1, x2, y2))` when Tesseract reports success.
+    ///
+    /// # Errors
+    ///
+    /// * `TesseractError::MutexLockError` when the handle lock cannot be acquired.
+    /// * `TesseractError::InvalidParameterError` when `TessPageIteratorBaseline` returns 0.
+    pub fn baseline(&self, level: i32) -> Result<(i32, i32, i32, i32)> {
+        let mut x1: c_int = 0;
+        let mut y1: c_int = 0;
+        let mut x2: c_int = 0;
+        let mut y2: c_int = 0;
+        let guard = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
+        // SAFETY: the guard is held for the whole call and the four out-pointers refer to
+        // distinct locals that outlive it; the handle is assumed to be the live iterator
+        // handed to `new`.
+        let status = unsafe { TessPageIteratorBaseline(*guard, level, &mut x1, &mut y1, &mut x2, &mut y2) };
+        if status == 0 {
+            return Err(TesseractError::InvalidParameterError);
+        }
+        Ok((x1, y1, x2, y2))
+    }
+
+    /// Gets the orientation of the current iterator.
+    ///
+    /// # Returns
+    ///
+    /// Returns `(orientation, writing_direction, textline_order, deskew_angle)` as written
+    /// by `TessPageIteratorOrientation`.
+    ///
+    /// # Errors
+    ///
+    /// Returns `TesseractError::MutexLockError` when the handle lock cannot be acquired.
+    pub fn orientation(&self) -> Result<(TessOrientation, TessWritingDirection, TessTextlineOrder, f32)> {
+        let mut orientation: c_int = 0;
+        let mut writing_direction: c_int = 0;
+        let mut textline_order: c_int = 0;
+        let mut deskew_angle: c_float = 0.0;
+        let handle = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
+        // Tesseract's C API declares `TessPageIteratorOrientation` as returning `void`, so
+        // there is no status to inspect: any value read back as a return code would be
+        // whatever happened to be in the return register. The call is therefore made for
+        // its out-parameters only and the result is deliberately not examined.
+        //
+        // SAFETY: the guard is held for the whole call and the four out-pointers refer to
+        // distinct locals that outlive it; the handle is assumed to be the live iterator
+        // handed to `new`.
+        unsafe {
+            TessPageIteratorOrientation(
+                *handle,
+                &mut orientation,
+                &mut writing_direction,
+                &mut textline_order,
+                &mut deskew_angle,
+            );
+        }
+        Ok((
+            TessOrientation::from_int(orientation),
+            TessWritingDirection::from_int(writing_direction),
+            TessTextlineOrder::from_int(textline_order),
+            deskew_angle,
+        ))
+    }
+
+    /// Collects every block on the page while holding the handle lock once.
+    ///
+    /// The iterator is rewound with `TessPageIteratorBegin`, then stepped at `RIL_BLOCK`
+    /// level. At each position the block type and bounding box are read; positions whose
+    /// bounding-box query fails are skipped.
+    ///
+    /// # Returns
+    ///
+    /// `Ok(Vec<BlockInfo>)` with one entry per position that yielded a bounding box.
+    ///
+    /// # Errors
+    ///
+    /// Returns `TesseractError::MutexLockError` when the handle lock cannot be acquired.
+    pub fn extract_all_blocks(&self) -> Result<Vec<BlockInfo>> {
+        let guard = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
+        let iter = *guard;
+        let block_level = TessPageIteratorLevel::RIL_BLOCK as c_int;
+        let mut found = Vec::new();
+
+        // SAFETY (applies to every call below): `guard` stays alive until the function
+        // returns, so the pointer copied into `iter` is only used under the lock. The
+        // pointer is assumed to be the live iterator handed to `new`.
+        unsafe { TessPageIteratorBegin(iter) };
+
+        loop {
+            // Read the type first, exactly as the position stands before advancing.
+            let raw_type = unsafe { TessPageIteratorBlockType(iter) };
+
+            let mut left: c_int = 0;
+            let mut top: c_int = 0;
+            let mut right: c_int = 0;
+            let mut bottom: c_int = 0;
+            // The out-pointers refer to the four locals above, which outlive the call.
+            let has_bbox =
+                unsafe { TessPageIteratorBoundingBox(iter, block_level, &mut left, &mut top, &mut right, &mut bottom) }
+                    != 0;
+
+            if has_bbox {
+                let block_type = TessPolyBlockType::from_int(raw_type);
+                found.push(BlockInfo {
+                    block_type,
+                    left,
+                    top,
+                    right,
+                    bottom,
+                });
+            }
+
+            // A zero result means there is nothing further at this level.
+            let advanced = unsafe { TessPageIteratorNext(iter, block_level) } != 0;
+            if !advanced {
+                break;
+            }
+        }
+
+        Ok(found)
+    }
+
+    /// Extracts all paragraphs from the page in a single mutex-locked pass.
+    ///
+    /// Resets the iterator to the beginning, then iterates at `RIL_PARA` level,
+    /// collecting paragraph metadata and bounding box for each paragraph found.
+    /// A position is recorded whenever its bounding-box query succeeds.
+    ///
+    /// # Returns
+    ///
+    /// Returns `Ok(Vec<ParaInfo>)` with one entry per paragraph, or an error if the
+    /// mutex cannot be acquired.
+    pub fn extract_all_paragraphs(&self) -> Result<Vec<ParaInfo>> {
+        let handle = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
+        let level = TessPageIteratorLevel::RIL_PARA as c_int;
+        let mut paragraphs = Vec::new();
+
+        // SAFETY: the guard is held for the entire loop, so the iterator is only touched
+        // under the lock. The pointer is assumed to be the live iterator handed to `new`.
+        unsafe { TessPageIteratorBegin(*handle) };
+
+        loop {
+            let mut justification: c_int = 0;
+            // TessPageIteratorParagraphInfo expects BOOL* (int*) for is_list_item and
+            // is_crown. Rust bool is 1 byte while C int is 4 bytes, so c_int temporaries
+            // are used and converted afterwards.
+            let mut is_list_item_raw: c_int = 0;
+            let mut is_crown_raw: c_int = 0;
+            let mut first_line_indent: c_int = 0;
+
+            // Tesseract's C API declares `TessPageIteratorParagraphInfo` as returning
+            // `void`, so it provides no status. Treating its "return value" as a success
+            // flag would test garbage and drop paragraphs at random, hence the result is
+            // deliberately not examined.
+            //
+            // SAFETY: the out-pointers refer to the four locals above, which outlive the
+            // call; the handle is used under the lock.
+            unsafe {
+                TessPageIteratorParagraphInfo(
+                    *handle,
+                    &mut justification,
+                    &mut is_list_item_raw,
+                    &mut is_crown_raw,
+                    &mut first_line_indent,
+                );
+            }
+
+            let mut left: c_int = 0;
+            let mut top: c_int = 0;
+            let mut right: c_int = 0;
+            let mut bottom: c_int = 0;
+
+            // SAFETY: the out-pointers refer to the four locals above, which outlive the
+            // call; the handle is used under the lock.
+            let bbox_ok = unsafe {
+                TessPageIteratorBoundingBox(*handle, level, &mut left, &mut top, &mut right, &mut bottom)
+            };
+
+            // The bounding-box call is the only one of the two that reports a status.
+            if bbox_ok != 0 {
+                paragraphs.push(ParaInfo {
+                    justification: TessParagraphJustification::from_int(justification),
+                    is_list_item: is_list_item_raw != 0,
+                    is_crown: is_crown_raw != 0,
+                    first_line_indent,
+                    left,
+                    top,
+                    right,
+                    bottom,
+                });
+            }
+
+            // SAFETY: the handle is used under the lock; a zero result means there are no
+            // more elements at this level.
+            let has_next = unsafe { TessPageIteratorNext(*handle, level) };
+            if has_next == 0 {
+                break;
+            }
+        }
+
+        Ok(paragraphs)
+    }
+
+    /// Gets the paragraph information of the current iterator.
+    ///
+    /// # Returns
+    ///
+    /// Returns `(justification, is_list_item, is_crown, first_line_indent)` as written by
+    /// `TessPageIteratorParagraphInfo`.
+    ///
+    /// # Errors
+    ///
+    /// Returns `TesseractError::MutexLockError` when the handle lock cannot be acquired.
+    pub fn paragraph_info(&self) -> Result<(TessParagraphJustification, bool, bool, i32)> {
+        let mut justification: c_int = 0;
+        // TessPageIteratorParagraphInfo expects BOOL* (int*) for is_list_item and
+        // is_crown. Rust bool is 1 byte while C int is 4 bytes, so c_int temporaries
+        // are used and converted afterwards.
+        let mut is_list_item_raw: c_int = 0;
+        let mut is_crown_raw: c_int = 0;
+        let mut first_line_indent: c_int = 0;
+        let handle = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
+        // Tesseract's C API declares `TessPageIteratorParagraphInfo` as returning `void`,
+        // so there is no status to check; reading one would test an indeterminate value
+        // and could report a spurious error. The call is made for its out-parameters only.
+        //
+        // SAFETY: the guard is held for the whole call and the four out-pointers refer to
+        // distinct locals that outlive it; the handle is assumed to be the live iterator
+        // handed to `new`.
+        unsafe {
+            TessPageIteratorParagraphInfo(
+                *handle,
+                &mut justification,
+                &mut is_list_item_raw,
+                &mut is_crown_raw,
+                &mut first_line_indent,
+            );
+        }
+        Ok((
+            TessParagraphJustification::from_int(justification),
+            is_list_item_raw != 0,
+            is_crown_raw != 0,
+            first_line_indent,
+        ))
+    }
+}
+
+impl Drop for PageIterator {
+    /// Releases the native page iterator behind `handle`.
+    fn drop(&mut self) {
+        // A poisoned mutex only records that some thread panicked while holding the guard;
+        // the pointer stored inside is unchanged. Recover it so the native iterator is
+        // still freed instead of being leaked.
+        let handle = self.handle.lock().unwrap_or_else(|poisoned| poisoned.into_inner());
+        // SAFETY: the lock is held during the call and `drop` runs once per wrapper, so the
+        // delete is issued once for this wrapper. The pointer is assumed to be the iterator
+        // handed to `new`.
+        unsafe { TessPageIteratorDelete(*handle) };
+    }
+}
+
+// Foreign declarations for the Tesseract page-iterator functions used by `PageIterator`.
+// Every `handle` parameter is the raw iterator pointer stored in `PageIterator::handle`.
+ffi_extern! {
+    // Called from `Drop for PageIterator`.
+    pub fn TessPageIteratorDelete(handle: *mut c_void);
+    // Rewinds the iterator; used by `begin` and the `extract_all_*` helpers.
+    pub fn TessPageIteratorNext(handle: *mut c_void, level: c_int) -> c_int;
+    // The wrappers treat a non-zero result from the two predicates below as `true`.
+    pub fn TessPageIteratorIsAtBeginningOf(handle: *mut c_void, level: c_int) -> c_int;
+    pub fn TessPageIteratorIsAtFinalElement(handle: *mut c_void, level: c_int, element: c_int) -> c_int;
+    // Writes the box through the four out-pointers; callers treat 0 as failure.
+    pub fn TessPageIteratorBoundingBox(
+        handle: *mut c_void,
+        level: c_int,
+        left: *mut c_int,
+        top: *mut c_int,
+        right: *mut c_int,
+        bottom: *mut c_int,
+    ) -> c_int;
+    // The returned integer is converted with `TessPolyBlockType::from_int`.
+    pub fn TessPageIteratorBlockType(handle: *mut c_void) -> c_int;
+    // Writes the baseline through the four out-pointers; callers treat 0 as failure.
+    pub fn TessPageIteratorBaseline(
+        handle: *mut c_void,
+        level: c_int,
+        x1: *mut c_int,
+        y1: *mut c_int,
+        x2: *mut c_int,
+        y2: *mut c_int,
+    ) -> c_int;
+    // NOTE(review): upstream Tesseract's capi.h appears to declare this function as
+    // returning `void`; if so the `c_int` return declared here carries no meaning and must
+    // not be used as a status — confirm against the bundled header.
+    pub fn TessPageIteratorOrientation(
+        handle: *mut c_void,
+        orientation: *mut c_int,
+        writing_direction: *mut c_int,
+        textline_order: *mut c_int,
+        deskew_angle: *mut c_float,
+    ) -> c_int;
+    // Takes the base API handle rather than an iterator and returns a new raw pointer.
+    pub fn TessBaseAPIGetIterator(handle: *mut c_void) -> *mut c_void;
+    // The two flag out-parameters are `int`-sized, hence `*mut c_int` rather than `*mut bool`.
+    // NOTE(review): upstream Tesseract's capi.h appears to declare this function as
+    // returning `void` as well — confirm against the bundled header.
+    pub fn TessPageIteratorParagraphInfo(
+        handle: *mut c_void,
+        justification: *mut c_int,
+        is_list_item: *mut c_int,
+        is_crown: *mut c_int,
+        first_line_indent: *mut c_int,
+    ) -> c_int;
+}
--- a/crates/kreuzberg-tesseract/src/result_iterator.rs
+++ b/crates/kreuzberg-tesseract/src/result_iterator.rs
@@ -0,0 +1,589 @@
+use crate::api::TessDeleteText;
+use crate::enums::TessPageIteratorLevel;
+use crate::error::{Result, TesseractError};
+use std::ffi::CStr;
+use std::os::raw::{c_char, c_float, c_int, c_void};
+use std::sync::{Arc, Mutex};
+
+/// Font attributes detected by Tesseract for a word.
+///
+/// The fields mirror, in order, the eight values returned by
+/// `ResultIterator::word_font_attributes`.
+#[derive(Debug, Clone)]
+pub struct FontAttributes {
+    /// Bold flag.
+    pub is_bold: bool,
+    /// Italic flag.
+    pub is_italic: bool,
+    /// Underline flag.
+    pub is_underlined: bool,
+    /// Monospace flag.
+    pub is_monospace: bool,
+    /// Serif flag.
+    pub is_serif: bool,
+    /// Small-caps flag.
+    pub is_smallcaps: bool,
+    /// Point size reported by Tesseract (presumably in typographic points — TODO confirm).
+    pub pointsize: i32,
+    /// Font identifier reported by Tesseract.
+    pub font_id: i32,
+}
+
+/// Complete word data extracted in a single mutex lock.
+///
+/// Values of this type are returned by `ResultIterator::extract_all_words`.
+#[derive(Debug, Clone)]
+pub struct WordData {
+    /// UTF-8 text of the word.
+    pub text: String,
+    /// Left edge of the word's bounding box.
+    pub left: i32,
+    /// Top edge of the word's bounding box.
+    pub top: i32,
+    /// Right edge of the word's bounding box.
+    pub right: i32,
+    /// Bottom edge of the word's bounding box.
+    pub bottom: i32,
+    /// Confidence value reported by Tesseract for the word.
+    pub confidence: f32,
+    /// Font attributes for the word; `None` presumably when they could not be obtained —
+    /// TODO confirm against the code that fills this struct.
+    pub font_attrs: Option<FontAttributes>,
+}
+
+/// Wrapper around a raw Tesseract result-iterator handle.
+///
+/// The methods of this type lock the mutex before passing the pointer to Tesseract.
+pub struct ResultIterator {
+    /// Raw iterator pointer, guarded by a mutex.
+    pub handle: Arc<Mutex<*mut c_void>>,
+}
+
+// NOTE(review): raw pointers are neither `Send` nor `Sync` by default, so these impls assert
+// it by hand. Methods in this file only use the pointer while holding the mutex; whether the
+// native object may be used from other threads is assumed here, not shown — confirm.
+unsafe impl Send for ResultIterator {}
+unsafe impl Sync for ResultIterator {}
+
+impl ResultIterator {
+    /// Wraps a raw result-iterator pointer in a new `ResultIterator`.
+    ///
+    /// # Arguments
+    ///
+    /// * `handle` - Raw pointer to the native result iterator; it is stored as given.
+    ///
+    /// # Returns
+    ///
+    /// The wrapper holding `handle` behind an `Arc<Mutex<_>>`.
+    pub fn new(handle: *mut c_void) -> Self {
+        let guarded = Mutex::new(handle);
+        Self {
+            handle: Arc::new(guarded),
+        }
+    }
+
+    /// Gets the UTF-8 text of the current iterator.
+    ///
+    /// # Arguments
+    ///
+    /// * `level` - Level of the text.
+    ///
+    /// # Returns
+    ///
+    /// Returns the UTF-8 text as a `String` if successful, otherwise returns an error.
+    ///
+    /// # Errors
+    ///
+    /// * `TesseractError::MutexLockError` when the handle lock cannot be acquired.
+    /// * `TesseractError::NullPointerError` when Tesseract returns a null text pointer.
+    /// * The UTF-8 conversion error when the returned bytes are not valid UTF-8. The native
+    ///   buffer is released in that case too.
+    pub fn get_utf8_text(&self, level: TessPageIteratorLevel) -> Result<String> {
+        let handle = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
+        // SAFETY: the guard is held for the whole call; the handle is assumed to be the live
+        // iterator handed to `new`. The returned pointer is either null or a NUL-terminated
+        // string that must be released with `TessDeleteText`.
+        let text_ptr = unsafe { TessResultIteratorGetUTF8Text(*handle, level as c_int) };
+        if text_ptr.is_null() {
+            return Err(TesseractError::NullPointerError);
+        }
+        // Decode into an owned value *without* `?` here: an early return on invalid UTF-8
+        // would skip the `TessDeleteText` call below and leak the native buffer.
+        //
+        // SAFETY: `text_ptr` is non-null and points to the NUL-terminated string returned
+        // above, which stays alive until it is deleted below.
+        let decoded = unsafe { CStr::from_ptr(text_ptr) }.to_str().map(str::to_owned);
+        // SAFETY: `text_ptr` came from `TessResultIteratorGetUTF8Text` and is freed exactly
+        // once; `decoded` owns its data (or holds only an error), so nothing borrows the
+        // buffer past this point.
+        unsafe { TessDeleteText(text_ptr as *mut c_char) };
+        Ok(decoded?)
+    }
+
+    /// Reads Tesseract's confidence for the current element at the given level.
+    ///
+    /// # Arguments
+    ///
+    /// * `level` - Level whose confidence is requested.
+    ///
+    /// # Returns
+    ///
+    /// The value returned by `TessResultIteratorConfidence`, unchanged.
+    ///
+    /// # Errors
+    ///
+    /// Returns `TesseractError::MutexLockError` when the handle lock cannot be acquired.
+    pub fn confidence(&self, level: TessPageIteratorLevel) -> Result<f32> {
+        let guard = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
+        let level = level as c_int;
+        // SAFETY: the guard is held for the whole call; the pointer is assumed to be the
+        // live iterator handed to `new`. Only a plain value is returned.
+        let score = unsafe { TessResultIteratorConfidence(*guard, level) };
+        Ok(score)
+    }
+
+    /// Returns the recognition language Tesseract reports for the current word.
+    ///
+    /// # Returns
+    ///
+    /// The language string copied into an owned `String`.
+    ///
+    /// # Errors
+    ///
+    /// * `TesseractError::MutexLockError` when the handle lock cannot be acquired.
+    /// * `TesseractError::NullPointerError` when Tesseract returns a null pointer.
+    /// * The UTF-8 conversion error when the returned bytes are not valid UTF-8.
+    pub fn word_recognition_language(&self) -> Result<String> {
+        let guard = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
+        // SAFETY: the guard is held for the whole call; the pointer is assumed to be the
+        // live iterator handed to `new`.
+        let lang_ptr = unsafe { TessResultIteratorWordRecognitionLanguage(*guard) };
+        if lang_ptr.is_null() {
+            return Err(TesseractError::NullPointerError);
+        }
+        // The string is not freed here; it is only read and copied.
+        //
+        // SAFETY: `lang_ptr` is non-null and is expected to reference a NUL-terminated
+        // string that remains valid while the guard is held.
+        let language = unsafe { CStr::from_ptr(lang_ptr) }.to_str()?;
+        Ok(language.to_owned())
+    }
+
+    /// Queries the font attributes Tesseract detected for the current word.
+    ///
+    /// # Returns
+    ///
+    /// `Ok((is_bold, is_italic, is_underlined, is_monospace, is_serif, is_smallcaps,
+    /// pointsize, font_id))`; each flag is `true` when the corresponding output is non-zero.
+    ///
+    /// # Errors
+    ///
+    /// * `TesseractError::MutexLockError` when the handle lock cannot be acquired.
+    /// * `TesseractError::InvalidParameterError` when the call's result equals 0.
+    pub fn word_font_attributes(&self) -> Result<(bool, bool, bool, bool, bool, bool, i32, i32)> {
+        let guard = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
+
+        // Integer slots for the six flags, followed by the two numeric outputs.
+        let mut bold = 0;
+        let mut italic = 0;
+        let mut underlined = 0;
+        let mut monospace = 0;
+        let mut serif = 0;
+        let mut smallcaps = 0;
+        let mut point_size = 0;
+        let mut font = 0;
+
+        // NOTE(review): upstream's C API appears to return the font name (`const char*`)
+        // from this function rather than a status integer — confirm that the declaration
+        // used here matches the bundled header.
+        //
+        // SAFETY: the guard is held for the whole call and every out-pointer refers to a
+        // distinct local that outlives it; the handle is assumed to be the live iterator
+        // handed to `new`.
+        let status = unsafe {
+            TessResultIteratorWordFontAttributes(
+                *guard,
+                &mut bold,
+                &mut italic,
+                &mut underlined,
+                &mut monospace,
+                &mut serif,
+                &mut smallcaps,
+                &mut point_size,
+                &mut font,
+            )
+        };
+
+        if status == 0 {
+            return Err(TesseractError::InvalidParameterError);
+        }
+
+        Ok((
+            bold != 0,
+            italic != 0,
+            underlined != 0,
+            monospace != 0,
+            serif != 0,
+            smallcaps != 0,
+            point_size,
+            font,
+        ))
+    }
+
+    /// Reports whether Tesseract marks the current word as coming from its dictionary.
+    ///
+    /// # Returns
+    ///
+    /// `Ok(true)` when `TessResultIteratorWordIsFromDictionary` returns non-zero,
+    /// `Ok(false)` otherwise.
+    ///
+    /// # Errors
+    ///
+    /// Returns `TesseractError::MutexLockError` when the handle lock cannot be acquired.
+    pub fn word_is_from_dictionary(&self) -> Result<bool> {
+        let guard = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
+        // SAFETY: the guard is held for the whole call; the pointer is assumed to be the
+        // live iterator handed to `new`.
+        let flag = unsafe { TessResultIteratorWordIsFromDictionary(*guard) };
+        Ok(flag != 0)
+    }
+
+    /// Reports whether Tesseract marks the current word as numeric.
+    ///
+    /// # Returns
+    ///
+    /// `Ok(true)` when `TessResultIteratorWordIsNumeric` returns non-zero, `Ok(false)` otherwise.
+    ///
+    /// # Errors
+    ///
+    /// Returns `TesseractError::MutexLockError` when the handle lock cannot be acquired.
+    pub fn word_is_numeric(&self) -> Result<bool> {
+        let guard = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
+        // SAFETY: the guard is held for the whole call; the pointer is assumed to be the
+        // live iterator handed to `new`.
+        let flag = unsafe { TessResultIteratorWordIsNumeric(*guard) };
+        Ok(flag != 0)
+    }
+
+    /// Reports whether Tesseract marks the current symbol as superscript.
+    ///
+    /// # Returns
+    ///
+    /// `Ok(true)` when `TessResultIteratorSymbolIsSuperscript` returns non-zero,
+    /// `Ok(false)` otherwise.
+    ///
+    /// # Errors
+    ///
+    /// Returns `TesseractError::MutexLockError` when the handle lock cannot be acquired.
+    pub fn symbol_is_superscript(&self) -> Result<bool> {
+        let guard = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
+        // SAFETY: the guard is held for the whole call; the pointer is assumed to be the
+        // live iterator handed to `new`.
+        let flag = unsafe { TessResultIteratorSymbolIsSuperscript(*guard) };
+        Ok(flag != 0)
+    }
+
+    /// Reports whether Tesseract marks the current symbol as subscript.
+    ///
+    /// # Returns
+    ///
+    /// `Ok(true)` when `TessResultIteratorSymbolIsSubscript` returns non-zero,
+    /// `Ok(false)` otherwise.
+    ///
+    /// # Errors
+    ///
+    /// Returns `TesseractError::MutexLockError` when the handle lock cannot be acquired.
+    pub fn symbol_is_subscript(&self) -> Result<bool> {
+        let guard = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
+        // SAFETY: the guard is held for the whole call; the pointer is assumed to be the
+        // live iterator handed to `new`.
+        let flag = unsafe { TessResultIteratorSymbolIsSubscript(*guard) };
+        Ok(flag != 0)
+    }
+
+    /// Reports whether Tesseract marks the current symbol as a drop cap.
+    ///
+    /// # Returns
+    ///
+    /// `Ok(true)` when `TessResultIteratorSymbolIsDropcap` returns non-zero,
+    /// `Ok(false)` otherwise.
+    ///
+    /// # Errors
+    ///
+    /// Returns `TesseractError::MutexLockError` when the handle lock cannot be acquired.
+    pub fn symbol_is_dropcap(&self) -> Result<bool> {
+        let guard = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
+        // SAFETY: the guard is held for the whole call; the pointer is assumed to be the
+        // live iterator handed to `new`.
+        let flag = unsafe { TessResultIteratorSymbolIsDropcap(*guard) };
+        Ok(flag != 0)
+    }
+
+    /// Advances the iterator to the next element at the requested level.
+    ///
+    /// # Arguments
+    ///
+    /// * `level` - Granularity at which to advance.
+    ///
+    /// # Returns
+    ///
+    /// `Ok(true)` when `TessResultIteratorNext` returns non-zero, `Ok(false)` otherwise.
+    ///
+    /// # Errors
+    ///
+    /// Returns `TesseractError::MutexLockError` when the handle lock cannot be acquired.
+    pub fn next(&self, level: TessPageIteratorLevel) -> Result<bool> {
+        let guard = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
+        let level = level as c_int;
+        // SAFETY: the guard is held while the iterator position is mutated; the pointer is
+        // assumed to be the live iterator handed to `new`.
+        let advanced = unsafe { TessResultIteratorNext(*guard, level) };
+        Ok(advanced != 0)
+    }
+
+    /// Reads the current word together with its bounding box and confidence.
+    ///
+    /// Text, box and confidence are fetched through three separate calls, each of which
+    /// takes the handle lock on its own.
+    ///
+    /// # Returns
+    ///
+    /// `Ok((text, left, top, right, bottom, confidence))` when all three lookups succeed;
+    /// the first failing lookup's error otherwise.
+    pub fn get_word_with_bounds(&self) -> Result<(String, i32, i32, i32, i32, f32)> {
+        let text = self.get_utf8_text(TessPageIteratorLevel::RIL_WORD)?;
+        let bbox = self.get_bounding_box(TessPageIteratorLevel::RIL_WORD)?;
+        let confidence = self.confidence(TessPageIteratorLevel::RIL_WORD)?;
+
+        Ok((text, bbox.0, bbox.1, bbox.2, bbox.3, confidence))
+    }
+
+    /// Moves the iterator forward by one word.
+    ///
+    /// # Returns
+    ///
+    /// The result of `next` at `RIL_WORD` level: `Ok(true)` when it advanced, `Ok(false)`
+    /// when there are no more words.
+    pub fn next_word(&self) -> Result<bool> {
+        let word_level = TessPageIteratorLevel::RIL_WORD;
+        self.next(word_level)
+    }
+
+    /// Reads the word at the iterator's current position.
+    /// Should be called before next() to ensure valid data.
+    ///
+    /// This performs the same three word-level lookups as `get_word_with_bounds` and is
+    /// implemented by delegating to it.
+    ///
+    /// # Returns
+    /// `Ok((text, left, top, right, bottom, confidence))` when all lookups succeed.
+    pub fn get_current_word(&self) -> Result<(String, i32, i32, i32, i32, f32)> {
+        self.get_word_with_bounds()
+    }
+
+    /// Fetches the bounding box of the current element at the given level.
+    ///
+    /// # Returns
+    ///
+    /// `Ok((left, top, right, bottom))` when Tesseract reports success.
+    ///
+    /// # Errors
+    ///
+    /// * `TesseractError::MutexLockError` when the handle lock cannot be acquired.
+    /// * `TesseractError::InvalidParameterError` when `TessPageIteratorBoundingBox` returns 0.
+    pub fn get_bounding_box(&self, level: TessPageIteratorLevel) -> Result<(i32, i32, i32, i32)> {
+        let mut left: c_int = 0;
+        let mut top: c_int = 0;
+        let mut right: c_int = 0;
+        let mut bottom: c_int = 0;
+
+        let guard = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
+
+        // The page-iterator function is invoked on the result-iterator handle here.
+        //
+        // SAFETY: the guard is held for the whole call and the four out-pointers refer to
+        // distinct locals that outlive it; the handle is assumed to be the live iterator
+        // handed to `new`.
+        let status = unsafe {
+            TessPageIteratorBoundingBox(*guard, level as c_int, &mut left, &mut top, &mut right, &mut bottom)
+        };
+
+        if status == 0 {
+            return Err(TesseractError::InvalidParameterError);
+        }
+        Ok((left, top, right, bottom))
+    }
+
+    /// Gathers the data of every word in the iterator while holding the mutex once.
+    ///
+    /// The lock is taken a single time and kept for the whole traversal; text, bounding
+    /// box, confidence and font attributes are collected per word. Doing the work under
+    /// one lock avoids the repeated acquisitions that calling the individual accessors
+    /// in a loop would incur.
+    ///
+    /// Traversal always restarts from the first element, so words are not missed when
+    /// the iterator was partially consumed beforehand.
+    ///
+    /// Positions whose extraction fails with `NullPointerError`, `InvalidParameterError`
+    /// or `Utf8Error` are skipped; any other extraction error aborts the traversal.
+    ///
+    /// # Returns
+    ///
+    /// Returns a `Vec<WordData>` holding one entry per successfully extracted word, or
+    /// an error if the mutex cannot be acquired.
+    pub fn extract_all_words(&self) -> Result<Vec<WordData>> {
+        // The guard stays alive until the function returns, so every FFI call below
+        // runs with exclusive access to the iterator.
+        let guard = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
+        let iter_ptr = *guard;
+        let word_level = TessPageIteratorLevel::RIL_WORD as c_int;
+        let mut collected = Vec::new();
+
+        // Rewind first. ResultIterator inherits from PageIterator in C++, which is why
+        // the PageIterator entry point accepts this handle.
+        // SAFETY: iter_ptr is the mutex-guarded ResultIterator pointer; the call only
+        // resets the iterator position and neither allocates nor frees memory.
+        unsafe { TessPageIteratorBegin(iter_ptr) };
+
+        loop {
+            // The helper is the lock-free variant; calling it here is sound because the
+            // guard above is still held.
+            match extract_word_data_unlocked(iter_ptr) {
+                Ok(word) => collected.push(word),
+                // A null text pointer, a failed bounding box query or invalid UTF-8 only
+                // affects this position: drop it and keep the remaining words.
+                Err(
+                    TesseractError::NullPointerError
+                    | TesseractError::InvalidParameterError
+                    | TesseractError::Utf8Error(_),
+                ) => {}
+                Err(fatal) => return Err(fatal),
+            }
+
+            // SAFETY: iter_ptr is a valid, mutex-guarded ResultIterator pointer and
+            // word_level is a valid TessPageIteratorLevel value. The call advances the
+            // iterator and returns non-zero while another word is available.
+            let advanced = unsafe { TessResultIteratorNext(iter_ptr, word_level) };
+            if advanced == 0 {
+                break;
+            }
+        }
+
+        Ok(collected)
+    }
+
+    /// Reads every field of the current word under a single mutex acquisition.
+    ///
+    /// The lock is taken once and all FFI queries (text, bounding box, confidence,
+    /// font attributes) run inside that lock scope, which is cheaper than invoking
+    /// the individual accessors one after another when all fields are required.
+    ///
+    /// # Returns
+    ///
+    /// Returns a [`WordData`] struct if successful, otherwise returns an error.
+    pub fn extract_word_data(&self) -> Result<WordData> {
+        self.handle
+            .lock()
+            .map_err(|_| TesseractError::MutexLockError)
+            // The guard is owned by the closure, so the lock is held until the
+            // unlocked helper has returned.
+            .and_then(|guard| extract_word_data_unlocked(*guard))
+    }
+}
+
+/// Extracts word data from a raw iterator handle without acquiring the mutex.
+///
+/// Reads the UTF-8 text, bounding box, confidence and font attributes of the word at
+/// the iterator's current position. Font attributes are optional: when the font query
+/// reports failure, `font_attrs` is `None` instead of an error.
+///
+/// The caller MUST hold the mutex lock for the `ResultIterator` this handle belongs to
+/// before calling this function. Passing a handle that is not mutex-guarded, or calling
+/// this function concurrently on the same handle, is undefined behaviour.
+///
+/// # Errors
+///
+/// * `TesseractError::NullPointerError` - Tesseract returned a null text pointer.
+/// * `TesseractError::Utf8Error` - the returned text is not valid UTF-8.
+/// * `TesseractError::InvalidParameterError` - the bounding box query failed.
+fn extract_word_data_unlocked(raw: *mut c_void) -> Result<WordData> {
+    // SAFETY: TessResultIteratorGetUTF8Text() allocates and returns a pointer to a C string.
+    // This is safe because:
+    // 1. raw is a valid pointer to an initialized ResultIterator (caller holds mutex lock)
+    // 2. RIL_WORD is a valid TessPageIteratorLevel enum value converted to c_int
+    // 3. The returned pointer is either null (error) or a valid null-terminated C string
+    //    allocated on Tesseract's heap (must be freed with TessDeleteText)
+    let text_ptr = unsafe { TessResultIteratorGetUTF8Text(raw, TessPageIteratorLevel::RIL_WORD as c_int) };
+    if text_ptr.is_null() {
+        return Err(TesseractError::NullPointerError);
+    }
+    let text = {
+        // SAFETY: text_ptr is non-null (checked above) and points to a null-terminated
+        // C string that stays valid until TessDeleteText is called below.
+        //
+        // The conversion result is kept as a `Result` on purpose: propagating a UTF-8
+        // failure with `?` right here would return before the buffer is released and
+        // leak the Tesseract allocation for every invalid word.
+        let converted = unsafe { CStr::from_ptr(text_ptr) }.to_str().map(str::to_owned);
+        // SAFETY: TessDeleteText() deallocates memory allocated by TessResultIteratorGetUTF8Text():
+        // 1. text_ptr is non-null (verified above)
+        // 2. text_ptr came from the Tesseract API (correct allocation type)
+        // 3. Called exactly once per allocation, on both the success and the error path
+        // 4. `converted` owns its data (or holds only an error); text_ptr is not
+        //    accessed after this call
+        unsafe { TessDeleteText(text_ptr as *mut c_char) };
+        converted?
+    };
+
+    let mut left = 0;
+    let mut top = 0;
+    let mut right = 0;
+    let mut bottom = 0;
+    // SAFETY: TessPageIteratorBoundingBox() queries iterator state and fills output parameters.
+    // This is safe because:
+    // 1. raw is a valid pointer to an initialized ResultIterator (caller holds mutex lock)
+    // 2. RIL_WORD is a valid TessPageIteratorLevel enum value converted to c_int
+    // 3. All mutable references are valid local stack variables with distinct memory locations
+    // 4. Each reference is exclusively borrowed (Rust enforces no aliasing)
+    // 5. The references outlive the FFI call (defined on stack, used immediately after)
+    // 6. Return value indicates success/failure (checked below)
+    let bbox_result = unsafe {
+        TessPageIteratorBoundingBox(
+            raw,
+            TessPageIteratorLevel::RIL_WORD as c_int,
+            &mut left,
+            &mut top,
+            &mut right,
+            &mut bottom,
+        )
+    };
+    if bbox_result == 0 {
+        return Err(TesseractError::InvalidParameterError);
+    }
+
+    // SAFETY: TessResultIteratorConfidence() reads iterator state and returns an f32 value.
+    // This is safe because:
+    // 1. raw is a valid pointer to an initialized ResultIterator (caller holds mutex lock)
+    // 2. RIL_WORD is a valid TessPageIteratorLevel enum value converted to c_int
+    // 3. The function only reads state and returns a copy (no pointer operations)
+    let confidence = unsafe { TessResultIteratorConfidence(raw, TessPageIteratorLevel::RIL_WORD as c_int) };
+
+    // Collect font attributes; treat any failure as absent rather than propagating the error.
+    let font_attrs = {
+        let mut is_bold = 0;
+        let mut is_italic = 0;
+        let mut is_underlined = 0;
+        let mut is_monospace = 0;
+        let mut is_serif = 0;
+        let mut is_smallcaps = 0;
+        let mut pointsize = 0;
+        let mut font_id = 0;
+        // SAFETY: TessResultIteratorWordFontAttributes() fills output parameters with font info.
+        // This is safe because:
+        // 1. raw is a valid pointer to an initialized ResultIterator (caller holds mutex lock)
+        // 2. All mutable references are valid local stack variables with distinct memory locations
+        // 3. Each reference is exclusively borrowed (no aliasing)
+        // 4. The references outlive the FFI call
+        // 5. Return value is non-zero on success, zero on failure (checked below)
+        let result = unsafe {
+            TessResultIteratorWordFontAttributes(
+                raw,
+                &mut is_bold,
+                &mut is_italic,
+                &mut is_underlined,
+                &mut is_monospace,
+                &mut is_serif,
+                &mut is_smallcaps,
+                &mut pointsize,
+                &mut font_id,
+            )
+        };
+        if result != 0 {
+            Some(FontAttributes {
+                is_bold: is_bold != 0,
+                is_italic: is_italic != 0,
+                is_underlined: is_underlined != 0,
+                is_monospace: is_monospace != 0,
+                is_serif: is_serif != 0,
+                is_smallcaps: is_smallcaps != 0,
+                pointsize,
+                font_id,
+            })
+        } else {
+            None
+        }
+    };
+
+    Ok(WordData {
+        text,
+        left,
+        top,
+        right,
+        bottom,
+        confidence,
+        font_attrs,
+    })
+}
+
+impl Drop for ResultIterator {
+    /// Releases the Tesseract iterator when the wrapper goes out of scope.
+    ///
+    /// Never panics: a poisoned mutex is tolerated by skipping the cleanup.
+    fn drop(&mut self) {
+        match self.handle.lock() {
+            Ok(guard) => {
+                // SAFETY: TessResultIteratorDelete() frees the ResultIterator allocated by Tesseract:
+                // 1. *guard is the opaque pointer obtained from TessBaseAPIGetIterator()
+                //    or TessBaseAPIGetMutableIterator() - Tesseract owns this memory
+                // 2. TessResultIteratorDelete() is the matching deallocation function
+                // 3. drop() runs at most once, so the pointer is freed exactly once
+                // 4. The pointer is never used again after this call (the value is being dropped)
+                unsafe { TessResultIteratorDelete(*guard) };
+            }
+            Err(_) => {
+                // Poisoned mutex: the handle is intentionally not freed so that Drop
+                // never panics; the allocation is left for the OS to reclaim.
+            }
+        }
+    }
+}
+
+// Raw declarations of the Tesseract C API functions that operate on result/page
+// iterators. They are only compiled when one of the `build-tesseract*` features is
+// enabled. Every `handle` argument is the opaque iterator pointer.
+// NOTE(review): `ffi_extern!` is defined elsewhere; presumably it expands to the
+// `extern` block for these symbols - confirm against the macro definition.
+#[cfg(any(feature = "build-tesseract", feature = "build-tesseract-wasm"))]
+ffi_extern! {
+    // Frees the iterator; called from `Drop for ResultIterator`.
+    pub fn TessResultIteratorDelete(handle: *mut c_void);
+    // Rewinds the iterator; `extract_all_words` calls it before traversing.
+    pub fn TessPageIteratorBegin(handle: *mut c_void);
+    // Callers treat a null return as "no text" and release a non-null result with
+    // `TessDeleteText`.
+    pub fn TessResultIteratorGetUTF8Text(handle: *mut c_void, level: c_int) -> *mut c_char;
+    pub fn TessResultIteratorConfidence(handle: *mut c_void, level: c_int) -> c_float;
+    pub fn TessResultIteratorWordRecognitionLanguage(handle: *mut c_void) -> *const c_char;
+    // Fills the eight out-parameters; callers treat a non-zero return as success.
+    pub fn TessResultIteratorWordFontAttributes(
+        handle: *mut c_void,
+        is_bold: *mut c_int,
+        is_italic: *mut c_int,
+        is_underlined: *mut c_int,
+        is_monospace: *mut c_int,
+        is_serif: *mut c_int,
+        is_smallcaps: *mut c_int,
+        pointsize: *mut c_int,
+        font_id: *mut c_int,
+    ) -> c_int;
+    pub fn TessResultIteratorWordIsFromDictionary(handle: *mut c_void) -> c_int;
+    pub fn TessResultIteratorWordIsNumeric(handle: *mut c_void) -> c_int;
+    pub fn TessResultIteratorSymbolIsSuperscript(handle: *mut c_void) -> c_int;
+    pub fn TessResultIteratorSymbolIsSubscript(handle: *mut c_void) -> c_int;
+    pub fn TessResultIteratorSymbolIsDropcap(handle: *mut c_void) -> c_int;
+    // Advances to the next element at `level`; callers stop iterating on a zero return.
+    pub fn TessResultIteratorNext(handle: *mut c_void, level: c_int) -> c_int;
+    // Writes the box coordinates through the out-parameters; callers treat a zero
+    // return as failure.
+    pub fn TessPageIteratorBoundingBox(
+        handle: *mut c_void,
+        level: c_int,
+        left: *mut c_int,
+        top: *mut c_int,
+        right: *mut c_int,
+        bottom: *mut c_int,
+    ) -> c_int;
+}
--- a/crates/kreuzberg-tesseract/src/result_renderer.rs
+++ b/crates/kreuzberg-tesseract/src/result_renderer.rs
@@ -0,0 +1,212 @@
+use crate::TesseractAPI;
+use crate::error::{Result, TesseractError};
+use std::ffi::{CStr, CString};
+use std::os::raw::{c_char, c_int, c_void};
+use std::sync::Arc;
+use std::sync::Mutex;
+
+/// Safe wrapper around an opaque Tesseract result-renderer pointer.
+///
+/// The raw pointer is kept behind an `Arc<Mutex<..>>`: every method locks the mutex
+/// before handing the pointer to the C API, and `Drop` releases the renderer with
+/// `TessDeleteResultRenderer`.
+pub struct TessResultRenderer {
+    // Non-null renderer pointer returned by one of the `Tess*RendererCreate` calls.
+    handle: Arc<Mutex<*mut c_void>>,
+}
+
+// Raw pointers are neither `Send` nor `Sync`, so both traits have to be asserted by
+// hand for this wrapper.
+// NOTE(review): soundness relies on every access to the pointer going through the
+// mutex and on the underlying Tesseract renderer not being tied to the thread that
+// created it - confirm against the Tesseract documentation.
+unsafe impl Send for TessResultRenderer {}
+unsafe impl Sync for TessResultRenderer {}
+
+impl TessResultRenderer {
+    /// Wraps a renderer pointer returned by one of the `Tess*RendererCreate` calls.
+    ///
+    /// A null pointer is reported as `TesseractError::NullPointerError`.
+    fn from_raw(raw: *mut c_void) -> Result<Self> {
+        if raw.is_null() {
+            return Err(TesseractError::NullPointerError);
+        }
+        Ok(TessResultRenderer {
+            handle: Arc::new(Mutex::new(raw)),
+        })
+    }
+
+    /// Locks the handle mutex, mapping a poisoned lock to `TesseractError::MutexLockError`.
+    fn lock_handle(&self) -> Result<std::sync::MutexGuard<'_, *mut c_void>> {
+        self.handle.lock().map_err(|_| TesseractError::MutexLockError)
+    }
+
+    /// Copies a C string handed out by the renderer into an owned `String`.
+    ///
+    /// `ptr` must be null or point to a null-terminated string that stays valid for
+    /// the duration of the call. Null yields `TesseractError::NullPointerError`;
+    /// invalid UTF-8 yields the error converted from `Utf8Error`.
+    fn copy_c_str(ptr: *const c_char) -> Result<String> {
+        if ptr.is_null() {
+            return Err(TesseractError::NullPointerError);
+        }
+        // SAFETY: ptr is non-null (checked above) and, per this helper's contract,
+        // points to a null-terminated string that is valid while we copy it.
+        let borrowed = unsafe { CStr::from_ptr(ptr) };
+        Ok(borrowed.to_str()?.to_owned())
+    }
+
+    /// Builds a renderer that produces plain-text output.
+    ///
+    /// # Arguments
+    ///
+    /// * `outputbase` - Output base path.
+    ///
+    /// # Returns
+    ///
+    /// Returns the new instance of the TessResultRenderer.
+    ///
+    /// # Errors
+    ///
+    /// Returns `TesseractError::NullByteInString` if `outputbase` contains a null byte,
+    /// or `TesseractError::NullPointerError` if Tesseract returns a null renderer.
+    pub fn new_text_renderer(outputbase: &str) -> Result<Self> {
+        let base = CString::new(outputbase).map_err(|_| TesseractError::NullByteInString)?;
+        // SAFETY: base is a valid null-terminated string that outlives the call.
+        let raw = unsafe { TessTextRendererCreate(base.as_ptr()) };
+        Self::from_raw(raw)
+    }
+
+    /// Builds a renderer that produces HOCR output.
+    ///
+    /// # Arguments
+    ///
+    /// * `outputbase` - Output base path.
+    ///
+    /// # Returns
+    ///
+    /// Returns the new instance of the TessResultRenderer.
+    ///
+    /// # Errors
+    ///
+    /// Returns `TesseractError::NullByteInString` if `outputbase` contains a null byte,
+    /// or `TesseractError::NullPointerError` if Tesseract returns a null renderer.
+    pub fn new_hocr_renderer(outputbase: &str) -> Result<Self> {
+        let base = CString::new(outputbase).map_err(|_| TesseractError::NullByteInString)?;
+        // SAFETY: base is a valid null-terminated string that outlives the call.
+        let raw = unsafe { TessHOcrRendererCreate(base.as_ptr()) };
+        Self::from_raw(raw)
+    }
+
+    /// Builds a renderer that produces PDF output.
+    ///
+    /// # Arguments
+    ///
+    /// * `outputbase` - Output base path.
+    /// * `datadir` - Data directory path.
+    /// * `textonly` - Whether to include text only.
+    ///
+    /// # Returns
+    ///
+    /// Returns the new instance of the TessResultRenderer.
+    ///
+    /// # Errors
+    ///
+    /// Returns `TesseractError::NullByteInString` if either string contains a null byte,
+    /// or `TesseractError::NullPointerError` if Tesseract returns a null renderer.
+    pub fn new_pdf_renderer(outputbase: &str, datadir: &str, textonly: bool) -> Result<Self> {
+        let base = CString::new(outputbase).map_err(|_| TesseractError::NullByteInString)?;
+        let data = CString::new(datadir).map_err(|_| TesseractError::NullByteInString)?;
+        let text_only_flag = textonly as c_int;
+        // SAFETY: base and data are valid null-terminated strings that outlive the call.
+        let raw = unsafe { TessPDFRendererCreate(base.as_ptr(), data.as_ptr(), text_only_flag) };
+        Self::from_raw(raw)
+    }
+
+    /// Begins a new document.
+    ///
+    /// # Arguments
+    ///
+    /// * `title` - Title of the document.
+    ///
+    /// # Returns
+    ///
+    /// Returns `true` if the document was created successfully, otherwise returns `false`.
+    ///
+    /// # Errors
+    ///
+    /// Returns a `TesseractError` if the string contains a null byte or if the mutex lock fails.
+    pub fn begin_document(&self, title: &str) -> Result<bool> {
+        let c_title = CString::new(title).map_err(|_| TesseractError::NullByteInString)?;
+        let renderer = self.lock_handle()?;
+        // SAFETY: *renderer is the non-null, mutex-guarded renderer pointer and
+        // c_title is a valid null-terminated string that outlives the call.
+        let status = unsafe { TessResultRendererBeginDocument(*renderer, c_title.as_ptr()) };
+        Ok(status != 0)
+    }
+
+    /// Adds an image to the document.
+    ///
+    /// # Arguments
+    ///
+    /// * `api` - The TesseractAPI instance.
+    ///
+    /// # Returns
+    ///
+    /// Returns `true` if the image was added successfully, otherwise returns `false`.
+    ///
+    /// # Errors
+    ///
+    /// Returns a `TesseractError::MutexLockError` if either mutex lock fails.
+    pub fn add_image(&self, api: &TesseractAPI) -> Result<bool> {
+        // Lock order is fixed: the API handle first, then the renderer handle.
+        let api_guard = api.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
+        let renderer = self.lock_handle()?;
+        // SAFETY: both pointers are mutex-guarded and both guards are held for the
+        // duration of the call.
+        let status = unsafe { TessResultRendererAddImage(*renderer, *api_guard) };
+        Ok(status != 0)
+    }
+
+    /// Ends the document.
+    ///
+    /// # Returns
+    ///
+    /// Returns `true` if the document was ended successfully, otherwise returns `false`.
+    ///
+    /// # Errors
+    ///
+    /// Returns a `TesseractError::MutexLockError` if the mutex lock fails.
+    pub fn end_document(&self) -> Result<bool> {
+        let renderer = self.lock_handle()?;
+        // SAFETY: *renderer is the non-null, mutex-guarded renderer pointer.
+        let status = unsafe { TessResultRendererEndDocument(*renderer) };
+        Ok(status != 0)
+    }
+
+    /// Gets the extension of the document.
+    ///
+    /// # Returns
+    ///
+    /// Returns the extension as a `String` if successful, otherwise returns an error.
+    ///
+    /// # Errors
+    ///
+    /// Returns a `TesseractError::MutexLockError` if the mutex lock fails,
+    /// `TesseractError::NullPointerError` if the extension pointer is null,
+    /// or `TesseractError::Utf8Error` if the extension contains invalid UTF-8.
+    pub fn get_extension(&self) -> Result<String> {
+        let renderer = self.lock_handle()?;
+        // SAFETY: *renderer is the non-null, mutex-guarded renderer pointer. The
+        // returned string is copied while the lock is still held.
+        let ext_ptr = unsafe { TessResultRendererExtention(*renderer) };
+        Self::copy_c_str(ext_ptr)
+    }
+
+    /// Gets the title of the document.
+    ///
+    /// # Returns
+    ///
+    /// Returns the title as a `String` if successful, otherwise returns an error.
+    ///
+    /// # Errors
+    ///
+    /// Returns a `TesseractError::MutexLockError` if the mutex lock fails,
+    /// `TesseractError::NullPointerError` if the title pointer is null,
+    /// or `TesseractError::Utf8Error` if the title contains invalid UTF-8.
+    pub fn get_title(&self) -> Result<String> {
+        let renderer = self.lock_handle()?;
+        // SAFETY: *renderer is the non-null, mutex-guarded renderer pointer. The
+        // returned string is copied while the lock is still held.
+        let title_ptr = unsafe { TessResultRendererTitle(*renderer) };
+        Self::copy_c_str(title_ptr)
+    }
+
+    /// Gets the number of images in the document.
+    ///
+    /// # Returns
+    ///
+    /// Returns the number of images as an `i32`.
+    ///
+    /// # Errors
+    ///
+    /// Returns a `TesseractError::MutexLockError` if the mutex lock fails.
+    pub fn get_image_num(&self) -> Result<i32> {
+        let renderer = self.lock_handle()?;
+        // SAFETY: *renderer is the non-null, mutex-guarded renderer pointer.
+        let count = unsafe { TessResultRendererImageNum(*renderer) };
+        Ok(count)
+    }
+}
+
+impl Drop for TessResultRenderer {
+    /// Releases the Tesseract renderer when the wrapper is dropped.
+    fn drop(&mut self) {
+        match self.handle.lock() {
+            Ok(guard) => {
+                // SAFETY: *guard is the renderer pointer stored at construction time;
+                // drop() runs at most once, so the renderer is deleted exactly once
+                // and the pointer is not used afterwards.
+                unsafe { TessDeleteResultRenderer(*guard) };
+            }
+            Err(_) => {
+                // Poisoned mutex: cleanup is skipped so that Drop never panics.
+            }
+        }
+    }
+}
+
+// Raw declarations of the Tesseract C API renderer functions wrapped by
+// `TessResultRenderer`. Every `renderer` argument is the opaque renderer pointer.
+// NOTE(review): `ffi_extern!` is defined elsewhere; presumably it expands to the
+// `extern` block for these symbols - confirm against the macro definition.
+ffi_extern! {
+    // Constructors: the wrappers map a null return to `TesseractError::NullPointerError`.
+    pub fn TessTextRendererCreate(outputbase: *const c_char) -> *mut c_void;
+    pub fn TessHOcrRendererCreate(outputbase: *const c_char) -> *mut c_void;
+    pub fn TessPDFRendererCreate(outputbase: *const c_char, datadir: *const c_char, textonly: c_int) -> *mut c_void;
+    // Destructor; called from `Drop for TessResultRenderer`.
+    pub fn TessDeleteResultRenderer(renderer: *mut c_void);
+    // Document lifecycle: the wrappers report a non-zero return as `true`.
+    pub fn TessResultRendererBeginDocument(renderer: *mut c_void, title: *const c_char) -> c_int;
+    pub fn TessResultRendererAddImage(renderer: *mut c_void, api: *mut c_void) -> c_int;
+    pub fn TessResultRendererEndDocument(renderer: *mut c_void) -> c_int;
+    // The spelling "Extention" is the symbol name in Tesseract's capi.h; do not
+    // "correct" it. The wrappers copy the strings returned by the next two functions
+    // and never free them.
+    pub fn TessResultRendererExtention(renderer: *mut c_void) -> *const c_char;
+    pub fn TessResultRendererTitle(renderer: *mut c_void) -> *const c_char;
+    pub fn TessResultRendererImageNum(renderer: *mut c_void) -> c_int;
+}
--- a/crates/kreuzberg-tesseract/tests/integration_test.rs
+++ b/crates/kreuzberg-tesseract/tests/integration_test.rs
@@ -0,0 +1,211 @@
+use kreuzberg_tesseract::TesseractAPI;
+use std::path::{Path, PathBuf};
+
+/// Returns the platform-specific fallback tessdata directory.
+///
+/// * macOS: `$HOME/Library/Application Support/kreuzberg-tesseract/tessdata`
+/// * Linux: the first existing system tessdata directory, otherwise
+///   `$HOME/.kreuzberg-tesseract/tessdata`
+/// * Windows: `%APPDATA%/kreuzberg-tesseract/tessdata`
+///
+/// # Panics
+///
+/// Panics if the required environment variable is not set or the operating system
+/// is not one of the three above.
+fn get_default_tessdata_dir() -> PathBuf {
+    if cfg!(target_os = "macos") {
+        let home = std::env::var("HOME").expect("HOME environment variable not set");
+        return ["Library", "Application Support", "kreuzberg-tesseract", "tessdata"]
+            .iter()
+            .fold(PathBuf::from(home), |dir, part| dir.join(part));
+    }
+
+    if cfg!(target_os = "linux") {
+        // Prefer a system-wide installation; candidates are probed in this order.
+        let installed = [
+            "/usr/share/tesseract-ocr/5/tessdata",
+            "/usr/share/tesseract-ocr/tessdata",
+        ]
+        .iter()
+        .copied()
+        .map(PathBuf::from)
+        .find(|candidate| candidate.exists());
+        if let Some(dir) = installed {
+            return dir;
+        }
+
+        let home = std::env::var("HOME").expect("HOME environment variable not set");
+        let mut dir = PathBuf::from(home);
+        dir.push(".kreuzberg-tesseract");
+        dir.push("tessdata");
+        return dir;
+    }
+
+    if cfg!(target_os = "windows") {
+        let appdata = std::env::var("APPDATA").expect("APPDATA environment variable not set");
+        let mut dir = PathBuf::from(appdata);
+        dir.push("kreuzberg-tesseract");
+        dir.push("tessdata");
+        return dir;
+    }
+
+    panic!("Unsupported operating system")
+}
+
+/// Resolves the tessdata directory used by the tests.
+///
+/// When `TESSDATA_PREFIX` is set, it is used directly if its last component is
+/// `tessdata`, otherwise `tessdata` is appended. When the variable cannot be read,
+/// the platform default from `get_default_tessdata_dir` is used. The chosen
+/// directory is printed either way.
+fn get_tessdata_dir() -> PathBuf {
+    if let Ok(prefix) = std::env::var("TESSDATA_PREFIX") {
+        let mut tessdata_path = PathBuf::from(prefix);
+        if !tessdata_path.ends_with("tessdata") {
+            tessdata_path.push("tessdata");
+        }
+        println!("Using TESSDATA_PREFIX directory: {:?}", tessdata_path);
+        return tessdata_path;
+    }
+
+    let default_dir = get_default_tessdata_dir();
+    println!("TESSDATA_PREFIX not set, using default directory: {:?}", default_dir);
+    default_dir
+}
+
+/// Asserts that `eng.traineddata` is present in `tessdata_dir`.
+///
+/// # Panics
+///
+/// Panics with a hint about `TESSDATA_PREFIX` when the file does not exist.
+fn ensure_eng_traineddata_exists(tessdata_dir: &Path) {
+    let model_present = tessdata_dir.join("eng.traineddata").exists();
+    assert!(
+        model_present,
+        "eng.traineddata not found in {}. Set TESSDATA_PREFIX or install English tessdata.",
+        tessdata_dir.display()
+    );
+}
+
+/// Returns the path two levels above this crate's manifest directory.
+fn repo_root() -> PathBuf {
+    let mut root = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
+    root.push("..");
+    root.push("..");
+    root
+}
+
+/// Loads `test_documents/<relative>` under the repository root as raw RGB8 pixels.
+///
+/// Returns the pixel buffer together with the image width and height, or an error
+/// naming the path when the image cannot be opened.
+fn load_test_image(relative: &str) -> Result<(Vec<u8>, u32, u32), Box<dyn std::error::Error>> {
+    let path = repo_root().join("test_documents").join(relative);
+
+    let decoded = match image::open(&path) {
+        Ok(img) => img,
+        Err(e) => return Err(format!("Failed to open test image {}: {}", path.display(), e).into()),
+    };
+    let rgb = decoded.to_rgb8();
+    let (width, height) = rgb.dimensions();
+    Ok((rgb.into_raw(), width, height))
+}
+
+/// OCR of the hello-world fixture must yield text containing "hello" (case-insensitive).
+#[test]
+fn test_ocr_on_hello_world_image() {
+    let tessdata = get_tessdata_dir();
+    ensure_eng_traineddata_exists(&tessdata);
+
+    let engine = TesseractAPI::new().expect("Failed to create TesseractAPI");
+    let init_outcome = engine.init(tessdata.to_str().unwrap(), "eng");
+    init_outcome.expect("Failed to initialize Tesseract");
+
+    let (pixels, width, height) =
+        load_test_image("images/test_hello_world.png").expect("Failed to load test image");
+    // Three bytes per pixel (RGB8), so one row spans 3 * width bytes.
+    let row_bytes = 3 * width as i32;
+    engine
+        .set_image(&pixels, width as i32, height as i32, 3, row_bytes)
+        .expect("Failed to set image");
+
+    let recognized = engine.get_utf8_text().expect("Failed to perform OCR");
+    let found_hello = recognized.to_lowercase().contains("hello");
+    assert!(
+        found_hello,
+        "Text does not contain expected word. Found: {}",
+        recognized
+    );
+}
+
+/// OCR of the table fixture (page segmentation mode 1) must find both column headers.
+#[test]
+fn test_ocr_on_table_image() {
+    let tessdata = get_tessdata_dir();
+    ensure_eng_traineddata_exists(&tessdata);
+
+    let engine = TesseractAPI::new().expect("Failed to create TesseractAPI");
+    engine
+        .init(tessdata.to_str().unwrap(), "eng")
+        .expect("Failed to initialize Tesseract");
+    engine
+        .set_variable("tessedit_pageseg_mode", "1")
+        .expect("Failed to set PSM");
+
+    let (pixels, width, height) = load_test_image("images/simple_table.png").expect("Failed to load test image");
+    // Three bytes per pixel (RGB8), so one row spans 3 * width bytes.
+    let row_bytes = 3 * width as i32;
+    engine
+        .set_image(&pixels, width as i32, height as i32, 3, row_bytes)
+        .expect("Failed to set image");
+
+    let recognized = engine.get_utf8_text().expect("Failed to perform OCR");
+    let lowered = recognized.to_lowercase();
+    let has_headers = ["product", "price"].iter().all(|word| lowered.contains(*word));
+    assert!(
+        has_headers,
+        "Table text missing expected words. Found: {}",
+        recognized
+    );
+}
+
+/// Initialising with a language that has no traineddata must fail.
+#[test]
+fn test_invalid_language_code() {
+    let tessdata = get_tessdata_dir();
+    ensure_eng_traineddata_exists(&tessdata);
+
+    let engine = TesseractAPI::new().expect("Failed to create TesseractAPI");
+
+    let datapath = tessdata.to_str().unwrap();
+    assert!(engine.init(datapath, "invalid_lang").is_err());
+}
+
+/// Passing an empty pixel buffer with non-zero dimensions must be rejected.
+#[test]
+fn test_empty_image_data() {
+    let tessdata = get_tessdata_dir();
+    ensure_eng_traineddata_exists(&tessdata);
+
+    let engine = TesseractAPI::new().expect("Failed to create TesseractAPI");
+    engine
+        .init(tessdata.to_str().unwrap(), "eng")
+        .expect("Failed to initialize Tesseract");
+
+    let no_pixels: Vec<u8> = Vec::new();
+    assert!(engine.set_image(&no_pixels, 100, 100, 3, 300).is_err());
+}
+
+#[test]
+fn test_invalid_image_parameters() {
+    let tessdata_dir = get_tessdata_dir();
+    ensure_eng_traineddata_exists(&tessdata_dir);
+
+    let api = TesseractAPI::new().expect("Failed to create TesseractAPI");
+    api.init(tessdata_dir.to_str().unwrap(), "eng")
+        .expect("Failed to initialize Tesseract");
+
+    let (image_data, width, height) =
+        load_test_image("images/test_hello_world.png").expect("Failed to load test image");
+
+    let res = api.set_image(&image_data, -1, height as i32, 3, 3 * width as i32);
+    assert!(res.is_err());
+
+    let res = api.set_image(&image_data, width as i32, 0, 3, 3 * width as i32);
+    assert!(res.is_err());
+
+    let res = api.set_image(&image_data, width as i32, height as i32, 0, 3 * width as i32);
+    assert!(res.is_err());
+
+    let res = api.set_image(&image_data, width as i32, height as i32, 3, width as i32);
+    assert!(res.is_err());
+}
+
+/// Unknown variables are rejected while known ones (including empty values) are accepted.
+#[test]
+fn test_variable_setting() {
+    let tessdata = get_tessdata_dir();
+    ensure_eng_traineddata_exists(&tessdata);
+
+    let engine = TesseractAPI::new().expect("Failed to create TesseractAPI");
+    engine
+        .init(tessdata.to_str().unwrap(), "eng")
+        .expect("Failed to initialize Tesseract");
+
+    let unknown = engine.set_variable("invalid_variable_name", "1");
+    assert!(unknown.is_err());
+
+    let empty_whitelist = engine.set_variable("tessedit_char_whitelist", "");
+    assert!(empty_whitelist.is_ok());
+
+    let page_seg = engine.set_variable("tessedit_pageseg_mode", "1");
+    assert!(page_seg.is_ok());
+    let engine_mode = engine.set_variable("tessedit_ocr_engine_mode", "1");
+    assert!(engine_mode.is_ok());
+}
+
+/// The same API instance can set an image and run OCR repeatedly.
+#[test]
+fn test_multiple_operations() {
+    let tessdata = get_tessdata_dir();
+    ensure_eng_traineddata_exists(&tessdata);
+
+    let engine = TesseractAPI::new().expect("Failed to create TesseractAPI");
+    engine
+        .init(tessdata.to_str().unwrap(), "eng")
+        .expect("Failed to initialize Tesseract");
+
+    let (pixels, width, height) =
+        load_test_image("images/test_hello_world.png").expect("Failed to load test image");
+
+    for _round in 0..3 {
+        let set_outcome = engine.set_image(&pixels, width as i32, height as i32, 3, 3 * width as i32);
+        assert!(set_outcome.is_ok());
+        let recognized = engine.get_utf8_text().expect("Failed to perform OCR");
+        assert!(!recognized.is_empty());
+    }
+}