This commit is contained in:
218
crates/kreuzberg-tesseract/src/lib.rs
Normal file
218
crates/kreuzberg-tesseract/src/lib.rs
Normal file
@@ -0,0 +1,218 @@
|
||||
// When neither native nor WASM Tesseract build feature is enabled, silence
// unused-variable / dead-code warnings crate-wide (presumably because most of
// the bindings are compiled out in that configuration — TODO confirm).
#![cfg_attr(
    not(any(feature = "build-tesseract", feature = "build-tesseract-wasm")),
    allow(unused_variables, dead_code)
)]
// Crate-wide Clippy opt-outs.
// NOTE(review): these look like they exist for the raw-pointer FFI wrappers;
// consider narrowing them to the items that actually need them.
#![allow(clippy::arc_with_non_send_sync)]
#![allow(clippy::missing_transmute_annotations)]
#![allow(clippy::type_complexity)]
#![allow(clippy::new_without_default)]
#![allow(clippy::not_unsafe_ptr_arg_deref)]
#![allow(clippy::cmp_null)]
|
||||
|
||||
//! # kreuzberg-tesseract
//!
//! `kreuzberg-tesseract` provides safe Rust bindings for Tesseract OCR with built-in compilation
//! of Tesseract and Leptonica libraries. This crate aims to make OCR functionality
//! easily accessible in Rust projects while handling the complexity of interfacing
//! with the underlying C++ libraries.
//!
//! ## Usage
//!
//! Here's a basic example of how to use `kreuzberg-tesseract`:
//!
//! ```rust
//! use std::path::PathBuf;
//! use std::error::Error;
//! use kreuzberg_tesseract::TesseractAPI;
//!
//! fn get_default_tessdata_dir() -> PathBuf {
//!     if cfg!(target_os = "macos") {
//!         let home_dir = std::env::var("HOME").expect("HOME environment variable not set");
//!         PathBuf::from(home_dir)
//!             .join("Library")
//!             .join("Application Support")
//!             .join("kreuzberg-tesseract")
//!             .join("tessdata")
//!     } else if cfg!(target_os = "linux") {
//!         let home_dir = std::env::var("HOME").expect("HOME environment variable not set");
//!         PathBuf::from(home_dir)
//!             .join(".kreuzberg-tesseract")
//!             .join("tessdata")
//!     } else if cfg!(target_os = "windows") {
//!         PathBuf::from(std::env::var("APPDATA").expect("APPDATA environment variable not set"))
//!             .join("kreuzberg-tesseract")
//!             .join("tessdata")
//!     } else {
//!         panic!("Unsupported operating system");
//!     }
//! }
//!
//! fn get_tessdata_dir() -> PathBuf {
//!     match std::env::var("TESSDATA_PREFIX") {
//!         Ok(dir) => {
//!             let path = PathBuf::from(dir);
//!             let path = if path.ends_with("tessdata") { path } else { path.join("tessdata") };
//!             println!("Using TESSDATA_PREFIX directory: {:?}", path);
//!             path
//!         }
//!         Err(_) => {
//!             let default_dir = get_default_tessdata_dir();
//!             println!(
//!                 "TESSDATA_PREFIX not set, using default directory: {:?}",
//!                 default_dir
//!             );
//!             default_dir
//!         }
//!     }
//! }
//!
//! fn main() -> Result<(), Box<dyn Error>> {
//!     let api = TesseractAPI::new()?;
//!
//!     // Get tessdata directory (uses default location or TESSDATA_PREFIX if set)
//!     let tessdata_dir = get_tessdata_dir();
//!     api.init(tessdata_dir.to_str().unwrap(), "eng")?;
//!
//!     let width = 24;
//!     let height = 24;
//!     let bytes_per_pixel = 1;
//!     let bytes_per_line = width * bytes_per_pixel;
//!
//!     // Initialize image data with all white pixels
//!     let mut image_data = vec![255u8; width * height];
//!
//!     // Draw number 9 with clearer distinction
//!     for y in 4..19 {
//!         for x in 7..17 {
//!             // Top bar
//!             if y == 4 && x >= 8 && x <= 15 {
//!                 image_data[y * width + x] = 0;
//!             }
//!             // Top curve left side
//!             if y >= 4 && y <= 10 && x == 7 {
//!                 image_data[y * width + x] = 0;
//!             }
//!             // Top curve right side
//!             if y >= 4 && y <= 11 && x == 16 {
//!                 image_data[y * width + x] = 0;
//!             }
//!             // Middle bar
//!             if y == 11 && x >= 8 && x <= 15 {
//!                 image_data[y * width + x] = 0;
//!             }
//!             // Bottom right vertical line
//!             if y >= 11 && y <= 18 && x == 16 {
//!                 image_data[y * width + x] = 0;
//!             }
//!             // Bottom bar
//!             if y == 18 && x >= 8 && x <= 15 {
//!                 image_data[y * width + x] = 0;
//!             }
//!         }
//!     }
//!
//!     // Set the image data
//!     api.set_image(&image_data, width.try_into().unwrap(), height.try_into().unwrap(), bytes_per_pixel.try_into().unwrap(), bytes_per_line.try_into().unwrap())?;
//!
//!     // Set whitelist for digits only
//!     api.set_variable("tessedit_char_whitelist", "0123456789")?;
//!
//!     // Set PSM mode to single character
//!     api.set_variable("tessedit_pageseg_mode", "10")?;
//!
//!     // Get the recognized text
//!     let text = api.get_utf8_text()?;
//!     println!("Recognized text: {}", text.trim());
//!
//!     Ok(())
//! }
//! ```
|
||||
/// Declare FFI functions with `extern "C-unwind"` on native targets (to catch
/// C++ exceptions from Tesseract/Leptonica) and `extern "C"` on WASM (where
/// the LLVM backend does not support `cleanupret` / C++ unwinding).
///
/// The macro accepts zero or more foreign function declarations of the form
///
/// ```text
/// #[attr]* vis fn name(arg: Type, ...) -> Ret;
/// ```
///
/// where the attributes, the visibility, a trailing comma in the argument
/// list and the return type are all optional. Every declaration is emitted
/// twice, once per ABI, and a `#[cfg(target_arch = "wasm32")]` pair ensures
/// that exactly one of the two `unsafe extern` blocks survives compilation.
///
/// Because the functions live in an `unsafe extern` block without a `safe`
/// qualifier, calling any of them requires an `unsafe` block at the call site.
macro_rules! ffi_extern {
    (
        $(
            $(#[$meta:meta])*
            $vis:vis fn $name:ident($($arg:ident : $ty:ty),* $(,)?) $(-> $ret:ty)?;
        )*
    ) => {
        // Native targets: allow foreign unwinding across the FFI boundary.
        #[cfg(not(target_arch = "wasm32"))]
        unsafe extern "C-unwind" {
            $(
                $(#[$meta])*
                $vis fn $name($($arg : $ty),*) $(-> $ret)?;
            )*
        }

        // wasm32: plain C ABI, no unwinding support.
        #[cfg(target_arch = "wasm32")]
        unsafe extern "C" {
            $(
                $(#[$meta])*
                $vis fn $name($($arg : $ty),*) $(-> $ret)?;
            )*
        }
    };
}
|
||||
|
||||
pub use error::{Result, TesseractError};
|
||||
mod error;
|
||||
|
||||
// WASM: Override __cxa_atexit to be a no-op. WASI SDK's __cxa_atexit calls calloc during
// C++ static initialization, which crashes because dlmalloc's heap isn't properly set up
// for wasm32-unknown-unknown. Since WASM modules never exit normally, atexit handlers
// are unnecessary.
#[cfg(all(target_arch = "wasm32", not(target_os = "emscripten")))]
mod wasm_compat {
    /// No-op replacement for the C++ runtime's `__cxa_atexit` registration hook.
    ///
    /// All three arguments (`_func`, `_arg`, `_dso_handle`) are ignored: nothing
    /// is recorded, so no handler is ever invoked later. The function always
    /// returns `0`, reporting success to the caller.
    ///
    /// The symbol is exported unmangled so that it is the definition the
    /// linked C++ code resolves `__cxa_atexit` to.
    ///
    /// # Safety
    ///
    /// The body does not read or dereference any of its arguments, so no
    /// particular validity is required of them.
    #[unsafe(no_mangle)]
    pub unsafe extern "C" fn __cxa_atexit(
        _func: Option<unsafe extern "C" fn(*mut core::ffi::c_void)>,
        _arg: *mut core::ffi::c_void,
        _dso_handle: *mut core::ffi::c_void,
    ) -> i32 {
        0 // Success, but don't actually register anything
    }
}
|
||||
mod page_iterator;
|
||||
pub use page_iterator::{BlockInfo, PageIterator, ParaInfo};
|
||||
mod result_iterator;
|
||||
pub use result_iterator::{FontAttributes, ResultIterator, WordData};
|
||||
mod choice_iterator;
|
||||
pub use choice_iterator::ChoiceIterator;
|
||||
mod monitor;
|
||||
pub use monitor::TessMonitor;
|
||||
mod result_renderer;
|
||||
pub use result_renderer::TessResultRenderer;
|
||||
mod mutable_iterator;
|
||||
pub use mutable_iterator::MutableIterator;
|
||||
mod enums;
|
||||
pub use enums::{
|
||||
TessOrientation, TessPageIteratorLevel, TessPageSegMode, TessParagraphJustification, TessPolyBlockType,
|
||||
TessTextlineOrder, TessWritingDirection,
|
||||
};
|
||||
mod api;
|
||||
pub use api::{BoundingBoxArray, TesseractAPI};
|
||||
pub mod leptonica;
|
||||
#[cfg(any(feature = "build-tesseract", feature = "build-tesseract-wasm"))]
|
||||
pub use leptonica::Pix;
|
||||
|
||||
/// Returns the compile-time-bundled English `eng.traineddata` blob when the
/// `bundle-tessdata-eng` feature is enabled, otherwise `None`.
///
/// This is the feature-enabled variant, so it always returns `Some`.
///
/// The bundled data is the `tessdata_fast` variant (~4 MB) downloaded by
/// `build.rs` to `TESSDATA_PREFIX_BUNDLED/tessdata/eng.traineddata`. Embedding
/// it lets WASM builds drive Tesseract OCR without filesystem access or
/// runtime fetches.
#[cfg(feature = "bundle-tessdata-eng")]
pub fn bundled_eng_traineddata() -> Option<&'static [u8]> {
    // `env!` resolves `TESSDATA_PREFIX_BUNDLED` at compile time, so the build
    // fails if the variable is unset or the file is missing.
    Some(include_bytes!(concat!(
        env!("TESSDATA_PREFIX_BUNDLED"),
        "/tessdata/eng.traineddata"
    )))
}
|
||||
|
||||
/// Returns `None` when the `bundle-tessdata-eng` feature is disabled.
///
/// This is the fallback variant compiled when no English traineddata is
/// embedded in the binary; it has the same signature as the feature-enabled
/// variant so callers need no `cfg` of their own.
#[cfg(not(feature = "bundle-tessdata-eng"))]
pub fn bundled_eng_traineddata() -> Option<&'static [u8]> {
    None
}
|
||||
Reference in New Issue
Block a user