#![cfg_attr( not(any(feature = "build-tesseract", feature = "build-tesseract-wasm")), allow(unused_variables, dead_code) )] #![allow(clippy::arc_with_non_send_sync)] #![allow(clippy::missing_transmute_annotations)] #![allow(clippy::type_complexity)] #![allow(clippy::new_without_default)] #![allow(clippy::not_unsafe_ptr_arg_deref)] #![allow(clippy::cmp_null)] //! # kreuzberg-tesseract //! //! `kreuzberg-tesseract` provides safe Rust bindings for Tesseract OCR with built-in compilation //! of Tesseract and Leptonica libraries. This crate aims to make OCR functionality //! easily accessible in Rust projects while handling the complexity of interfacing //! with the underlying C++ libraries. //! //! ## Usage //! //! Here's a basic example of how to use `kreuzberg-tesseract`: //! //! ```rust //! use std::path::PathBuf; //! use std::error::Error; //! use kreuzberg_tesseract::TesseractAPI; //! //! fn get_default_tessdata_dir() -> PathBuf { //! if cfg!(target_os = "macos") { //! let home_dir = std::env::var("HOME").expect("HOME environment variable not set"); //! PathBuf::from(home_dir) //! .join("Library") //! .join("Application Support") //! .join("kreuzberg-tesseract") //! .join("tessdata") //! } else if cfg!(target_os = "linux") { //! let home_dir = std::env::var("HOME").expect("HOME environment variable not set"); //! PathBuf::from(home_dir) //! .join(".kreuzberg-tesseract") //! .join("tessdata") //! } else if cfg!(target_os = "windows") { //! PathBuf::from(std::env::var("APPDATA").expect("APPDATA environment variable not set")) //! .join("kreuzberg-tesseract") //! .join("tessdata") //! } else { //! panic!("Unsupported operating system"); //! } //! } //! //! fn get_tessdata_dir() -> PathBuf { //! match std::env::var("TESSDATA_PREFIX") { //! Ok(dir) => { //! let path = PathBuf::from(dir); //! let path = if path.ends_with("tessdata") { path } else { path.join("tessdata") }; //! println!("Using TESSDATA_PREFIX directory: {:?}", path); //! path //! } //! Err(_) => { //! 
let default_dir = get_default_tessdata_dir(); //! println!( //! "TESSDATA_PREFIX not set, using default directory: {:?}", //! default_dir //! ); //! default_dir //! } //! } //! } //! //! fn main() -> Result<(), Box<dyn Error>> { //! let api = TesseractAPI::new()?; //! //! // Get tessdata directory (uses default location or TESSDATA_PREFIX if set) //! let tessdata_dir = get_tessdata_dir(); //! api.init(tessdata_dir.to_str().unwrap(), "eng")?; //! //! let width = 24; //! let height = 24; //! let bytes_per_pixel = 1; //! let bytes_per_line = width * bytes_per_pixel; //! //! // Initialize image data with all white pixels //! let mut image_data = vec![255u8; width * height]; //! //! // Draw number 9 with clearer distinction //! for y in 4..19 { //! for x in 7..17 { //! // Top bar //! if y == 4 && x >= 8 && x <= 15 { //! image_data[y * width + x] = 0; //! } //! // Top curve left side //! if y >= 4 && y <= 10 && x == 7 { //! image_data[y * width + x] = 0; //! } //! // Top curve right side //! if y >= 4 && y <= 11 && x == 16 { //! image_data[y * width + x] = 0; //! } //! // Middle bar //! if y == 11 && x >= 8 && x <= 15 { //! image_data[y * width + x] = 0; //! } //! // Bottom right vertical line //! if y >= 11 && y <= 18 && x == 16 { //! image_data[y * width + x] = 0; //! } //! // Bottom bar //! if y == 18 && x >= 8 && x <= 15 { //! image_data[y * width + x] = 0; //! } //! } //! } //! //! // Set the image data //! api.set_image(&image_data, width.try_into().unwrap(), height.try_into().unwrap(), bytes_per_pixel.try_into().unwrap(), bytes_per_line.try_into().unwrap())?; //! //! // Set whitelist for digits only //! api.set_variable("tessedit_char_whitelist", "0123456789")?; //! //! // Set PSM mode to single character //! api.set_variable("tessedit_pageseg_mode", "10")?; //! //! // Get the recognized text //! let text = api.get_utf8_text()?; //! println!("Recognized text: {}", text.trim()); //! //! Ok(()) //! } //! 
``` /// Declare FFI functions with `extern "C-unwind"` on native targets (to catch /// C++ exceptions from Tesseract/Leptonica) and `extern "C"` on WASM (where /// the LLVM backend does not support `cleanupret` / C++ unwinding). macro_rules! ffi_extern { ( $( $(#[$meta:meta])* $vis:vis fn $name:ident($($arg:ident : $ty:ty),* $(,)?) $(-> $ret:ty)?; )* ) => { #[cfg(not(target_arch = "wasm32"))] unsafe extern "C-unwind" { $( $(#[$meta])* $vis fn $name($($arg : $ty),*) $(-> $ret)?; )* } #[cfg(target_arch = "wasm32")] unsafe extern "C" { $( $(#[$meta])* $vis fn $name($($arg : $ty),*) $(-> $ret)?; )* } }; } pub use error::{Result, TesseractError}; mod error; // WASM: Override __cxa_atexit to be a no-op. WASI SDK's __cxa_atexit calls calloc during // C++ static initialization, which crashes because dlmalloc's heap isn't properly set up // for wasm32-unknown-unknown. Since WASM modules never exit normally, atexit handlers // are unnecessary. #[cfg(all(target_arch = "wasm32", not(target_os = "emscripten")))] mod wasm_compat { #[unsafe(no_mangle)] pub unsafe extern "C" fn __cxa_atexit( _func: Option, _arg: *mut core::ffi::c_void, _dso_handle: *mut core::ffi::c_void, ) -> i32 { 0 // Success, but don't actually register anything } } mod page_iterator; pub use page_iterator::{BlockInfo, PageIterator, ParaInfo}; mod result_iterator; pub use result_iterator::{FontAttributes, ResultIterator, WordData}; mod choice_iterator; pub use choice_iterator::ChoiceIterator; mod monitor; pub use monitor::TessMonitor; mod result_renderer; pub use result_renderer::TessResultRenderer; mod mutable_iterator; pub use mutable_iterator::MutableIterator; mod enums; pub use enums::{ TessOrientation, TessPageIteratorLevel, TessPageSegMode, TessParagraphJustification, TessPolyBlockType, TessTextlineOrder, TessWritingDirection, }; mod api; pub use api::{BoundingBoxArray, TesseractAPI}; pub mod leptonica; #[cfg(any(feature = "build-tesseract", feature = "build-tesseract-wasm"))] pub use leptonica::Pix; /// 
/// Provides the English `eng.traineddata` blob that was embedded at compile
/// time. This variant is compiled only when the `bundle-tessdata-eng` feature
/// is enabled and always yields `Some`; without the feature the function
/// returns `None` instead.
///
/// The bundled data is the `tessdata_fast` variant (~4 MB) downloaded by
/// `build.rs` to `TESSDATA_PREFIX_BUNDLED/tessdata/eng.traineddata`. Embedding
/// it lets WASM builds drive Tesseract OCR without filesystem access or
/// runtime fetches.
#[cfg(feature = "bundle-tessdata-eng")]
pub fn bundled_eng_traineddata() -> Option<&'static [u8]> {
    // The path is assembled at compile time from the `TESSDATA_PREFIX_BUNDLED`
    // environment variable, so the bytes end up inside the compiled artifact.
    static ENG_TRAINEDDATA: &[u8] = include_bytes!(concat!(
        env!("TESSDATA_PREFIX_BUNDLED"),
        "/tessdata/eng.traineddata"
    ));
    Some(ENG_TRAINEDDATA)
}

/// Fallback compiled when the `bundle-tessdata-eng` feature is disabled:
/// nothing is embedded, so the result is always `None`.
#[cfg(not(feature = "bundle-tessdata-eng"))]
pub fn bundled_eng_traineddata() -> Option<&'static [u8]> {
    Option::None
}