Nomad changes
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s

This commit is contained in:
Henrik Jess Nielsen
2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions

View File

@@ -0,0 +1,218 @@
// When neither Tesseract build feature is enabled, silence unused-variable and
// dead-code warnings crate-wide (presumably because the FFI-backed code paths
// are compiled out in that configuration — TODO confirm).
#![cfg_attr(
not(any(feature = "build-tesseract", feature = "build-tesseract-wasm")),
allow(unused_variables, dead_code)
)]
// Crate-wide Clippy suppressions.
// NOTE(review): the reason for each lint below is not recorded in this file;
// consider documenting it or narrowing the `allow` to the items that need it.
#![allow(clippy::arc_with_non_send_sync)]
#![allow(clippy::missing_transmute_annotations)]
#![allow(clippy::type_complexity)]
#![allow(clippy::new_without_default)]
#![allow(clippy::not_unsafe_ptr_arg_deref)]
#![allow(clippy::cmp_null)]
//! # kreuzberg-tesseract
//!
//! `kreuzberg-tesseract` provides safe Rust bindings for Tesseract OCR with built-in compilation
//! of Tesseract and Leptonica libraries. This crate aims to make OCR functionality
//! easily accessible in Rust projects while handling the complexity of interfacing
//! with the underlying C++ libraries.
//!
//! ## Usage
//!
//! Here's a basic example of how to use `kreuzberg-tesseract`:
//!
//! ```rust
//! use std::path::PathBuf;
//! use std::error::Error;
//! use kreuzberg_tesseract::TesseractAPI;
//!
//! fn get_default_tessdata_dir() -> PathBuf {
//! if cfg!(target_os = "macos") {
//! let home_dir = std::env::var("HOME").expect("HOME environment variable not set");
//! PathBuf::from(home_dir)
//! .join("Library")
//! .join("Application Support")
//! .join("kreuzberg-tesseract")
//! .join("tessdata")
//! } else if cfg!(target_os = "linux") {
//! let home_dir = std::env::var("HOME").expect("HOME environment variable not set");
//! PathBuf::from(home_dir)
//! .join(".kreuzberg-tesseract")
//! .join("tessdata")
//! } else if cfg!(target_os = "windows") {
//! PathBuf::from(std::env::var("APPDATA").expect("APPDATA environment variable not set"))
//! .join("kreuzberg-tesseract")
//! .join("tessdata")
//! } else {
//! panic!("Unsupported operating system");
//! }
//! }
//!
//! fn get_tessdata_dir() -> PathBuf {
//! match std::env::var("TESSDATA_PREFIX") {
//! Ok(dir) => {
//! let path = PathBuf::from(dir);
//! let path = if path.ends_with("tessdata") { path } else { path.join("tessdata") };
//! println!("Using TESSDATA_PREFIX directory: {:?}", path);
//! path
//! }
//! Err(_) => {
//! let default_dir = get_default_tessdata_dir();
//! println!(
//! "TESSDATA_PREFIX not set, using default directory: {:?}",
//! default_dir
//! );
//! default_dir
//! }
//! }
//! }
//!
//! fn main() -> Result<(), Box<dyn Error>> {
//! let api = TesseractAPI::new()?;
//!
//! // Get tessdata directory (uses default location or TESSDATA_PREFIX if set)
//! let tessdata_dir = get_tessdata_dir();
//! api.init(tessdata_dir.to_str().unwrap(), "eng")?;
//!
//! let width = 24;
//! let height = 24;
//! let bytes_per_pixel = 1;
//! let bytes_per_line = width * bytes_per_pixel;
//!
//! // Initialize image data with all white pixels
//! let mut image_data = vec![255u8; width * height];
//!
//! // Draw the digit 9 as black strokes on the white background
//! for y in 4..19 {
//! for x in 7..17 {
//! // Top bar
//! if y == 4 && x >= 8 && x <= 15 {
//! image_data[y * width + x] = 0;
//! }
//! // Top curve left side
//! if y >= 4 && y <= 10 && x == 7 {
//! image_data[y * width + x] = 0;
//! }
//! // Top curve right side
//! if y >= 4 && y <= 11 && x == 16 {
//! image_data[y * width + x] = 0;
//! }
//! // Middle bar
//! if y == 11 && x >= 8 && x <= 15 {
//! image_data[y * width + x] = 0;
//! }
//! // Bottom right vertical line
//! if y >= 11 && y <= 18 && x == 16 {
//! image_data[y * width + x] = 0;
//! }
//! // Bottom bar
//! if y == 18 && x >= 8 && x <= 15 {
//! image_data[y * width + x] = 0;
//! }
//! }
//! }
//!
//! // Set the image data
//! api.set_image(&image_data, width.try_into().unwrap(), height.try_into().unwrap(), bytes_per_pixel.try_into().unwrap(), bytes_per_line.try_into().unwrap())?;
//!
//! // Set whitelist for digits only
//! api.set_variable("tessedit_char_whitelist", "0123456789")?;
//!
//! // Set PSM mode to single character
//! api.set_variable("tessedit_pageseg_mode", "10")?;
//!
//! // Get the recognized text
//! let text = api.get_utf8_text()?;
//! println!("Recognized text: {}", text.trim());
//!
//! Ok(())
//! }
//! ```
/// Emits a group of foreign-function declarations under the ABI that suits the
/// compilation target.
///
/// * On `wasm32` the declarations use plain `extern "C"`, because the LLVM
///   backend there does not support `cleanupret` / C++ unwinding.
/// * On every other (native) target they use `extern "C-unwind"`, so that C++
///   exceptions raised inside Tesseract/Leptonica can be dealt with at the FFI
///   boundary.
///
/// Each entry is written like an ordinary foreign item: optional attributes,
/// optional visibility, then `fn name(arg: Type, ...) -> Ret;` (a trailing
/// comma in the argument list is accepted). The entries are reproduced in both
/// `unsafe extern` blocks and `cfg` keeps exactly one of them.
macro_rules! ffi_extern {
    (
        $(
            $(#[$attr:meta])*
            $v:vis fn $func:ident($($param:ident : $param_ty:ty),* $(,)?) $(-> $out:ty)?;
        )*
    ) => {
        // WASM: no unwinding support, fall back to the plain C ABI.
        #[cfg(target_arch = "wasm32")]
        unsafe extern "C" {
            $(
                $(#[$attr])*
                $v fn $func($($param : $param_ty),*) $(-> $out)?;
            )*
        }
        // Native: unwinding-aware C ABI.
        #[cfg(not(target_arch = "wasm32"))]
        unsafe extern "C-unwind" {
            $(
                $(#[$attr])*
                $v fn $func($($param : $param_ty),*) $(-> $out)?;
            )*
        }
    };
}
pub use error::{Result, TesseractError};
mod error;
// WASM: Override __cxa_atexit to be a no-op. WASI SDK's __cxa_atexit calls calloc during
// C++ static initialization, which crashes because dlmalloc's heap isn't properly set up
// for wasm32-unknown-unknown. Since WASM modules never exit normally, atexit handlers
// are unnecessary.
//
// NOTE(review): the `cfg` below enables this for every `wasm32` target except
// Emscripten, which is wider than the `wasm32-unknown-unknown` case described
// above — confirm that is intended.
#[cfg(all(target_arch = "wasm32", not(target_os = "emscripten")))]
mod wasm_compat {
/// No-op replacement for the C++ runtime's `__cxa_atexit`.
///
/// Exported under its unmangled name. All three arguments are ignored, nothing
/// is registered, and the function always returns `0`.
///
/// # Safety
///
/// The body never reads or dereferences any of its arguments, so the function
/// places no requirements on the values passed in.
#[unsafe(no_mangle)]
pub unsafe extern "C" fn __cxa_atexit(
_func: Option<unsafe extern "C" fn(*mut core::ffi::c_void)>,
_arg: *mut core::ffi::c_void,
_dso_handle: *mut core::ffi::c_void,
) -> i32 {
0 // Success, but don't actually register anything
}
}
mod page_iterator;
pub use page_iterator::{BlockInfo, PageIterator, ParaInfo};
mod result_iterator;
pub use result_iterator::{FontAttributes, ResultIterator, WordData};
mod choice_iterator;
pub use choice_iterator::ChoiceIterator;
mod monitor;
pub use monitor::TessMonitor;
mod result_renderer;
pub use result_renderer::TessResultRenderer;
mod mutable_iterator;
pub use mutable_iterator::MutableIterator;
mod enums;
pub use enums::{
TessOrientation, TessPageIteratorLevel, TessPageSegMode, TessParagraphJustification, TessPolyBlockType,
TessTextlineOrder, TessWritingDirection,
};
mod api;
pub use api::{BoundingBoxArray, TesseractAPI};
pub mod leptonica;
#[cfg(any(feature = "build-tesseract", feature = "build-tesseract-wasm"))]
pub use leptonica::Pix;
/// Gives access to the English `eng.traineddata` blob embedded at compile time.
///
/// * With the `bundle-tessdata-eng` feature enabled, returns `Some(bytes)`,
///   where `bytes` is the content of
///   `TESSDATA_PREFIX_BUNDLED/tessdata/eng.traineddata`. That file is the
///   `tessdata_fast` variant (~4 MB) downloaded by `build.rs`. Embedding it
///   lets WASM builds drive Tesseract OCR without filesystem access or runtime
///   fetches.
/// * With the feature disabled, returns `None`.
pub fn bundled_eng_traineddata() -> Option<&'static [u8]> {
    // Exactly one of the two blocks below survives conditional compilation
    // and becomes the function's tail expression.
    #[cfg(feature = "bundle-tessdata-eng")]
    {
        let blob: &'static [u8] = include_bytes!(concat!(
            env!("TESSDATA_PREFIX_BUNDLED"),
            "/tessdata/eng.traineddata"
        ));
        Some(blob)
    }
    #[cfg(not(feature = "bundle-tessdata-eng"))]
    {
        None
    }
}