#![cfg_attr(
    not(any(feature = "build-tesseract", feature = "build-tesseract-wasm")),
    allow(unused_variables, dead_code)
)]
#![allow(clippy::arc_with_non_send_sync)]
#![allow(clippy::missing_transmute_annotations)]
#![allow(clippy::type_complexity)]
#![allow(clippy::new_without_default)]
#![allow(clippy::not_unsafe_ptr_arg_deref)]
#![allow(clippy::cmp_null)]

//! # kreuzberg-tesseract
//!
//! `kreuzberg-tesseract` provides safe Rust bindings for Tesseract OCR with built-in compilation
//! of the Tesseract and Leptonica libraries. This crate aims to make OCR functionality
//! easily accessible in Rust projects while handling the complexity of interfacing
//! with the underlying C/C++ libraries.
//!
//! ## Usage
//!
//! Here's a basic example of how to use `kreuzberg-tesseract`:
//!
//! ```rust
//! use std::path::PathBuf;
//! use std::error::Error;
//! use kreuzberg_tesseract::TesseractAPI;
//!
//! fn get_default_tessdata_dir() -> PathBuf {
//!     if cfg!(target_os = "macos") {
//!         let home_dir = std::env::var("HOME").expect("HOME environment variable not set");
//!         PathBuf::from(home_dir)
//!             .join("Library")
//!             .join("Application Support")
//!             .join("kreuzberg-tesseract")
//!             .join("tessdata")
//!     } else if cfg!(target_os = "linux") {
//!         let home_dir = std::env::var("HOME").expect("HOME environment variable not set");
//!         PathBuf::from(home_dir)
//!             .join(".kreuzberg-tesseract")
//!             .join("tessdata")
//!     } else if cfg!(target_os = "windows") {
//!         PathBuf::from(std::env::var("APPDATA").expect("APPDATA environment variable not set"))
//!             .join("kreuzberg-tesseract")
//!             .join("tessdata")
//!     } else {
//!         panic!("Unsupported operating system");
//!     }
//! }
//!
//! fn get_tessdata_dir() -> PathBuf {
//!     match std::env::var("TESSDATA_PREFIX") {
//!         Ok(dir) => {
//!             let path = PathBuf::from(dir);
//!             let path = if path.ends_with("tessdata") { path } else { path.join("tessdata") };
//!             println!("Using TESSDATA_PREFIX directory: {:?}", path);
//!             path
//!         }
//!         Err(_) => {
//!             let default_dir = get_default_tessdata_dir();
//!             println!(
//!                 "TESSDATA_PREFIX not set, using default directory: {:?}",
//!                 default_dir
//!             );
//!             default_dir
//!         }
//!     }
//! }
//!
//! fn main() -> Result<(), Box<dyn Error>> {
//!     let api = TesseractAPI::new()?;
//!
//!     // Get tessdata directory (uses default location or TESSDATA_PREFIX if set)
//!     let tessdata_dir = get_tessdata_dir();
//!     api.init(tessdata_dir.to_str().unwrap(), "eng")?;
//!
//!     let width = 24;
//!     let height = 24;
//!     let bytes_per_pixel = 1;
//!     let bytes_per_line = width * bytes_per_pixel;
//!
//!     // Initialize image data with all white pixels
//!     let mut image_data = vec![255u8; width * height];
//!
//!     // Draw number 9 with clearer distinction
//!     for y in 4..19 {
//!         for x in 7..17 {
//!             // Top bar
//!             if y == 4 && x >= 8 && x <= 15 {
//!                 image_data[y * width + x] = 0;
//!             }
//!             // Top curve left side
//!             if y >= 4 && y <= 10 && x == 7 {
//!                 image_data[y * width + x] = 0;
//!             }
//!             // Top curve right side
//!             if y >= 4 && y <= 11 && x == 16 {
//!                 image_data[y * width + x] = 0;
//!             }
//!             // Middle bar
//!             if y == 11 && x >= 8 && x <= 15 {
//!                 image_data[y * width + x] = 0;
//!             }
//!             // Bottom right vertical line
//!             if y >= 11 && y <= 18 && x == 16 {
//!                 image_data[y * width + x] = 0;
//!             }
//!             // Bottom bar
//!             if y == 18 && x >= 8 && x <= 15 {
//!                 image_data[y * width + x] = 0;
//!             }
//!         }
//!     }
//!
//!     // Set the image data
//!     api.set_image(&image_data, width.try_into().unwrap(), height.try_into().unwrap(), bytes_per_pixel.try_into().unwrap(), bytes_per_line.try_into().unwrap())?;
//!
//!     // Set whitelist for digits only
//!     api.set_variable("tessedit_char_whitelist", "0123456789")?;
//!
//!     // Set PSM mode to single character
//!     api.set_variable("tessedit_pageseg_mode", "10")?;
//!
//!     // Get the recognized text
//!     let text = api.get_utf8_text()?;
//!     println!("Recognized text: {}", text.trim());
//!
//!     Ok(())
//! }
//! ```

/// Declare FFI functions with `extern "C-unwind"` on native targets (so that
/// C++ exceptions raised inside Tesseract/Leptonica are allowed to unwind
/// across the FFI boundary) and `extern "C"` on WASM (where the LLVM backend
/// does not support `cleanupret` / C++ unwinding).
///
/// The macro accepts zero or more foreign function signatures of the form
/// `fn name(arg: Ty, ...) -> Ret;`. Each signature may carry attributes
/// (doc comments included) and a visibility, the argument list may end with a
/// trailing comma, and the return type is optional.
///
/// The same list of signatures is emitted twice, once per ABI, and exactly one
/// copy survives `cfg` evaluation for a given target. Both copies live in an
/// `unsafe extern` block and no signature is marked `safe`, so every call to a
/// declared function must be wrapped in `unsafe`.
macro_rules! ffi_extern {
    (
        $(
            $(#[$meta:meta])*
            $vis:vis fn $name:ident($($arg:ident : $ty:ty),* $(,)?) $(-> $ret:ty)?;
        )*
    ) => {
        // Native targets: permit unwinding out of the C++ side.
        #[cfg(not(target_arch = "wasm32"))]
        unsafe extern "C-unwind" {
            $(
                $(#[$meta])*
                $vis fn $name($($arg : $ty),*) $(-> $ret)?;
            )*
        }

        // WASM: plain C ABI, no unwinding support.
        #[cfg(target_arch = "wasm32")]
        unsafe extern "C" {
            $(
                $(#[$meta])*
                $vis fn $name($($arg : $ty),*) $(-> $ret)?;
            )*
        }
    };
}

pub use error::{Result, TesseractError};
mod error;

// WASM: Override __cxa_atexit to be a no-op. WASI SDK's __cxa_atexit calls calloc during
// C++ static initialization, which crashes because dlmalloc's heap isn't properly set up
// for wasm32-unknown-unknown. Since WASM modules never exit normally, atexit handlers
// are unnecessary.
#[cfg(all(target_arch = "wasm32", not(target_os = "emscripten")))]
mod wasm_compat {
    use core::ffi::c_void;

    /// No-op replacement for the C++ runtime's `__cxa_atexit`.
    ///
    /// Exported unmangled so that it is the symbol the linked C++ code resolves.
    /// All three arguments are ignored: nothing is registered, nothing is
    /// allocated, and `0` (success) is always returned.
    ///
    /// # Safety
    ///
    /// The body reads none of its arguments, so any values are accepted. The
    /// function is `unsafe` only because it stands in for a C runtime entry
    /// point and is meant to be called by that runtime, not by Rust code.
    #[unsafe(no_mangle)]
    pub unsafe extern "C" fn __cxa_atexit(
        _func: Option<unsafe extern "C" fn(*mut c_void)>,
        _arg: *mut c_void,
        _dso_handle: *mut c_void,
    ) -> i32 {
        0 // Success, but don't actually register anything
    }
}
mod page_iterator;
pub use page_iterator::{BlockInfo, PageIterator, ParaInfo};
mod result_iterator;
pub use result_iterator::{FontAttributes, ResultIterator, WordData};
mod choice_iterator;
pub use choice_iterator::ChoiceIterator;
mod monitor;
pub use monitor::TessMonitor;
mod result_renderer;
pub use result_renderer::TessResultRenderer;
mod mutable_iterator;
pub use mutable_iterator::MutableIterator;
mod enums;
pub use enums::{
    TessOrientation, TessPageIteratorLevel, TessPageSegMode, TessParagraphJustification, TessPolyBlockType,
    TessTextlineOrder, TessWritingDirection,
};
mod api;
pub use api::{BoundingBoxArray, TesseractAPI};
pub mod leptonica;
#[cfg(any(feature = "build-tesseract", feature = "build-tesseract-wasm"))]
pub use leptonica::Pix;

/// Returns the compile-time-bundled English `eng.traineddata` blob when the
/// `bundle-tessdata-eng` feature is enabled, otherwise `None`.
///
/// This is the feature-enabled variant and always returns `Some`.
///
/// The bundled data is the `tessdata_fast` variant (~4 MB) downloaded by
/// `build.rs` to `TESSDATA_PREFIX_BUNDLED/tessdata/eng.traineddata`. Embedding
/// it lets WASM builds drive Tesseract OCR without filesystem access or
/// runtime fetches.
///
/// The path is assembled with `env!`, so the `TESSDATA_PREFIX_BUNDLED`
/// environment variable must be set while the crate is compiled and the file
/// must exist at that location; otherwise compilation fails.
#[cfg(feature = "bundle-tessdata-eng")]
pub fn bundled_eng_traineddata() -> Option<&'static [u8]> {
    Some(include_bytes!(concat!(
        env!("TESSDATA_PREFIX_BUNDLED"),
        "/tessdata/eng.traineddata"
    )))
}

/// Returns the compile-time-bundled English `eng.traineddata` blob when the
/// `bundle-tessdata-eng` feature is enabled, otherwise `None`.
///
/// This is the feature-disabled variant: no trained data is embedded in the
/// binary, so it always returns `None`. Keeping the same signature in both
/// configurations lets callers use the function without feature gates.
#[cfg(not(feature = "bundle-tessdata-eng"))]
pub fn bundled_eng_traineddata() -> Option<&'static [u8]> {
    None
}