This commit is contained in:
13
crates/kreuzberg-tesseract/.commitlintrc.json
Normal file
13
crates/kreuzberg-tesseract/.commitlintrc.json
Normal file
@@ -0,0 +1,13 @@
|
||||
{
|
||||
"extends": ["@commitlint/config-conventional"],
|
||||
"rules": {
|
||||
"body-max-line-length": [2, "always", 100],
|
||||
"header-max-length": [2, "always", 100],
|
||||
"subject-case": [2, "never", ["sentence-case", "start-case", "pascal-case", "upper-case"]],
|
||||
"type-enum": [
|
||||
2,
|
||||
"always",
|
||||
["feat", "fix", "docs", "style", "refactor", "perf", "test", "build", "ci", "chore", "revert"]
|
||||
]
|
||||
}
|
||||
}
|
||||
2
crates/kreuzberg-tesseract/.crate-ignore
Normal file
2
crates/kreuzberg-tesseract/.crate-ignore
Normal file
@@ -0,0 +1,2 @@
|
||||
/third_party/
|
||||
/tessdata/
|
||||
2933
crates/kreuzberg-tesseract/Cargo.lock
generated
Normal file
2933
crates/kreuzberg-tesseract/Cargo.lock
generated
Normal file
File diff suppressed because it is too large
Load Diff
64
crates/kreuzberg-tesseract/Cargo.toml
Normal file
64
crates/kreuzberg-tesseract/Cargo.toml
Normal file
@@ -0,0 +1,64 @@
|
||||
[package]
|
||||
name = "kreuzberg-tesseract"
|
||||
version.workspace = true
|
||||
edition.workspace = true
|
||||
rust-version.workspace = true
|
||||
authors.workspace = true
|
||||
description = "Rust bindings for Tesseract OCR with cross-compilation, C++17, and caching improvements"
|
||||
license = "MIT"
|
||||
repository.workspace = true
|
||||
homepage = "https://kreuzberg.dev"
|
||||
documentation = "https://docs.kreuzberg.dev"
|
||||
readme = "README.md"
|
||||
keywords = ["tesseract", "ocr", "bindings", "vision", "recognition"]
|
||||
categories = ["external-ffi-bindings", "computer-vision", "text-processing"]
|
||||
build = "build.rs"
|
||||
links = "kreuzberg_tesseract"
|
||||
exclude = ["tessdata/*", "third_party/*"]
|
||||
|
||||
[package.metadata.docs.rs]
|
||||
features = ["docs-only"]
|
||||
rustdoc-args = ["--cfg", "docsrs"]
|
||||
|
||||
[package.metadata.cargo-machete]
|
||||
ignored = ["cc", "cmake", "reqwest", "zip"]
|
||||
|
||||
[lib]
|
||||
name = "kreuzberg_tesseract"
|
||||
crate-type = ["lib"]
|
||||
|
||||
[features]
|
||||
default = ["static-linking"]
|
||||
build-tesseract = ["cc", "cmake", "reqwest", "zip"]
|
||||
build-tesseract-wasm = ["cmake", "reqwest", "zip"]
|
||||
# Bundle eng.traineddata into the compiled crate so WASM builds can run OCR
|
||||
# without runtime tessdata loading. Uses ~4 MB of binary size (tessdata_fast).
|
||||
bundle-tessdata-eng = []
|
||||
static-linking = ["build-tesseract"]
|
||||
dynamic-linking = []
|
||||
|
||||
[dependencies]
|
||||
thiserror = { workspace = true }
|
||||
|
||||
[build-dependencies]
|
||||
cc = { version = "^1.2.63", optional = true }
|
||||
cmake = { version = "0.1.58", optional = true }
|
||||
zip = { version = ">=7.0.0", optional = true, default-features = false, features = [
|
||||
"deflate-flate2-zlib-rs",
|
||||
] }
|
||||
|
||||
[target.'cfg(not(target_os = "windows"))'.build-dependencies]
|
||||
reqwest = { workspace = true, default-features = false, features = [
|
||||
"blocking",
|
||||
"rustls",
|
||||
], optional = true }
|
||||
|
||||
# Use native-tls on Windows to avoid aws-lc-sys CMake build issues with MinGW
|
||||
[target.'cfg(target_os = "windows")'.build-dependencies]
|
||||
reqwest = { workspace = true, default-features = false, features = [
|
||||
"blocking",
|
||||
"native-tls",
|
||||
], optional = true }
|
||||
|
||||
[dev-dependencies]
|
||||
image = { workspace = true, features = ["png"] }
|
||||
22
crates/kreuzberg-tesseract/LICENSE
Normal file
22
crates/kreuzberg-tesseract/LICENSE
Normal file
@@ -0,0 +1,22 @@
|
||||
MIT License
|
||||
|
||||
Copyright (c) 2024 Cafer Can Gündoğdu
|
||||
Copyright (c) 2025 Na'aman Hirschfeld
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all
|
||||
copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
||||
405
crates/kreuzberg-tesseract/README.md
Normal file
405
crates/kreuzberg-tesseract/README.md
Normal file
@@ -0,0 +1,405 @@
|
||||
# kreuzberg-tesseract
|
||||
|
||||
[](https://github.com/kreuzberg-dev/alef)
|
||||
|
||||
Rust bindings for Tesseract OCR with built-in compilation of Tesseract and Leptonica libraries. Provides a safe and idiomatic Rust interface to Tesseract's functionality while handling the complexity of compiling the underlying C++ libraries.
|
||||
|
||||
Based on the original [tesseract-rs](https://github.com/cafercangundogdu/tesseract-rs) by Cafer Can Gündoğdu, this maintained version adds critical improvements for production use:
|
||||
|
||||
- **C++17 Support**: Upgraded for Tesseract 5.5.1 which requires C++17 filesystem
|
||||
- **Cross-Compilation**: Fixed CXX compiler detection for cross-platform builds
|
||||
- **Architecture Validation**: Validates target architecture before using cached libraries
|
||||
- **Windows Static Linking**: Fixed MSVC static linking issues
|
||||
- **Build Caching**: Improved caching with OUT_DIR-based cache directory
|
||||
- **MinGW Support**: Added support for MinGW toolchains
|
||||
|
||||
## Features
|
||||
|
||||
- Safe Rust bindings for Tesseract OCR
|
||||
- **Multiple linking options:**
|
||||
- **Static linking** (default): Built-in compilation with no runtime dependencies
|
||||
- **Dynamic linking**: Link to system-installed libraries for faster builds
|
||||
- Uses existing Tesseract training data (expects English data for tests)
|
||||
- High-level Rust API for common OCR tasks
|
||||
- Caching of compiled libraries for faster subsequent builds
|
||||
- Support for multiple operating systems (Linux, macOS, Windows)
|
||||
|
||||
## Installation
|
||||
|
||||
### Static Linking (Default)
|
||||
|
||||
Static linking builds Tesseract and Leptonica from source and embeds them in your binary. No runtime dependencies required:
|
||||
|
||||
```toml
|
||||
[dependencies]
|
||||
kreuzberg-tesseract = "1.0.0-rc.1"
|
||||
# or explicitly:
|
||||
kreuzberg-tesseract = { version = "1.0.0-rc.1", features = ["static-linking"] }
|
||||
```
|
||||
|
||||
### Dynamic Linking
|
||||
|
||||
Dynamic linking uses system-installed Tesseract and Leptonica libraries. Faster builds, but requires libraries installed on the system:
|
||||
|
||||
```toml
|
||||
[dependencies]
|
||||
kreuzberg-tesseract = { version = "1.0.0-rc.1", features = ["dynamic-linking"], default-features = false }
|
||||
```
|
||||
|
||||
**System requirements for dynamic linking:**
|
||||
|
||||
- Tesseract 5.x libraries installed (`libtesseract`, `libleptonica`)
|
||||
- macOS: `brew install tesseract leptonica`
|
||||
- Ubuntu/Debian: `sudo apt-get install libtesseract-dev libleptonica-dev`
|
||||
- RHEL/CentOS/Fedora: `sudo dnf install tesseract-devel leptonica-devel`
|
||||
- Windows: Install from [Tesseract releases](https://github.com/tesseract-ocr/tesseract/releases) or vcpkg
|
||||
|
||||
### Development Dependencies
|
||||
|
||||
For development and testing, you'll also need these dependencies:
|
||||
|
||||
```toml
|
||||
[dev-dependencies]
|
||||
image = "0.25.5"
|
||||
```
|
||||
|
||||
## System Requirements
|
||||
|
||||
### For Static Linking (Default)
|
||||
|
||||
When building with static linking, the crate will compile Tesseract and Leptonica from source. You need:
|
||||
|
||||
- Rust 1.85.0 or later
|
||||
- A C++ compiler (e.g., gcc, clang, MSVC on Windows)
|
||||
- CMake 3.x or later
|
||||
- Internet connection (for downloading Tesseract source code)
|
||||
|
||||
### For Dynamic Linking
|
||||
|
||||
When using dynamic linking with system-installed libraries, you need:
|
||||
|
||||
- Rust 1.85.0 or later
|
||||
- Tesseract 5.x and Leptonica libraries installed on your system (see Installation section)
|
||||
- Internet connection (for downloading Tesseract source code)
|
||||
|
||||
No C++ compiler or CMake required for dynamic linking builds.
|
||||
|
||||
For a full development environment checklist (including optional tooling suggestions), see [CONTRIBUTING.md](../../CONTRIBUTING.md).
|
||||
|
||||
## Environment Variables
|
||||
|
||||
The following environment variables affect the build and test process:
|
||||
|
||||
### Build Variables
|
||||
|
||||
- `CARGO_CLEAN`: If set, cleans the cache directory before building
|
||||
- `RUSTC_WRAPPER`: If set to "sccache", enables compiler caching with sccache
|
||||
- `CC`: Compiler selection for C code (affects Linux builds)
|
||||
- `HOME` (Unix) or `APPDATA` (Windows): Used to determine cache directory location
|
||||
- `TESSERACT_RS_CACHE_DIR`: Optional override for the cache root. When unset or not writable, the build falls back to the default OS-specific directory, and if that still fails, a temporary directory under the system temp folder is used automatically.
|
||||
|
||||
### Test Variables
|
||||
|
||||
- `TESSDATA_PREFIX` (Optional): Path to override the default tessdata directory. If not set, the crate will use its default cache directory.
|
||||
|
||||
## Cache and Data Directories
|
||||
|
||||
The crate uses the following directory structure based on your operating system:
|
||||
|
||||
- macOS: `~/Library/Application Support/tesseract-rs`
|
||||
- Linux: `~/.tesseract-rs`
|
||||
- Windows: `%APPDATA%/tesseract-rs`
|
||||
|
||||
The cache includes:
|
||||
|
||||
- Compiled Tesseract and Leptonica libraries
|
||||
- Third-party source code
|
||||
|
||||
Training data is not downloaded during the build. Provide `eng.traineddata` (and any other languages you need) via `TESSDATA_PREFIX` or your system Tesseract installation.
|
||||
|
||||
## Testing
|
||||
|
||||
The project includes several integration tests that verify OCR functionality. To run the tests:
|
||||
|
||||
1. Ensure you have the required test dependencies:
|
||||
|
||||
```toml
|
||||
[dev-dependencies]
|
||||
image = "0.25.9"
|
||||
```
|
||||
|
||||
2. Run the tests:
|
||||
|
||||
```bash
|
||||
cargo test
|
||||
```
|
||||
|
||||
Note: Make sure `eng.traineddata` is available in your tessdata directory before running tests. If `TESSDATA_PREFIX` is not set, the tests look in the default cache location. You can point the tests at a custom tessdata directory by setting:
|
||||
|
||||
```bash
|
||||
# Linux/macOS
|
||||
export TESSDATA_PREFIX=/path/to/custom/tessdata
|
||||
|
||||
# Windows (PowerShell)
|
||||
$env:TESSDATA_PREFIX="C:\path\to\custom\tessdata"
|
||||
```
|
||||
|
||||
Available test cases:
|
||||
|
||||
- OCR on English sample images
|
||||
- Error handling and invalid input coverage
|
||||
|
||||
Test images are sourced from the shared `test_documents/` directory in the repository:
|
||||
|
||||
- `images/test_hello_world.png`: Simple English text
|
||||
- `tables/simple_table.png`: Basic table with English headers
|
||||
|
||||
## Usage
|
||||
|
||||
Here's a basic example of how to use `kreuzberg-tesseract`:
|
||||
|
||||
```rust
|
||||
use std::path::PathBuf;
|
||||
use std::error::Error;
|
||||
use kreuzberg_tesseract::TesseractAPI;
|
||||
|
||||
/// Returns the default per-user tessdata directory for the current operating system.
///
/// * macOS: `$HOME/Library/Application Support/tesseract-rs/tessdata`
/// * Linux: `$HOME/.tesseract-rs/tessdata`
/// * Windows: `%APPDATA%/tesseract-rs/tessdata`
///
/// # Panics
///
/// Panics if the required environment variable (`HOME` or `APPDATA`) cannot be read,
/// or if the target operating system is none of the three listed above.
fn get_default_tessdata_dir() -> PathBuf {
    // Work out the OS-specific cache root first; `tessdata` is appended once at the end.
    let cache_root = if cfg!(target_os = "macos") {
        let home = std::env::var("HOME").expect("HOME environment variable not set");
        let mut root = PathBuf::from(home);
        root.push("Library");
        root.push("Application Support");
        root.push("tesseract-rs");
        root
    } else if cfg!(target_os = "linux") {
        let home = std::env::var("HOME").expect("HOME environment variable not set");
        let mut root = PathBuf::from(home);
        root.push(".tesseract-rs");
        root
    } else if cfg!(target_os = "windows") {
        let app_data = std::env::var("APPDATA").expect("APPDATA environment variable not set");
        let mut root = PathBuf::from(app_data);
        root.push("tesseract-rs");
        root
    } else {
        panic!("Unsupported operating system")
    };

    cache_root.join("tessdata")
}
|
||||
|
||||
fn get_tessdata_dir() -> PathBuf {
|
||||
match std::env::var("TESSDATA_PREFIX") {
|
||||
Ok(dir) => {
|
||||
let path = PathBuf::from(dir);
|
||||
println!("Using TESSDATA_PREFIX directory: {:?}", path);
|
||||
path
|
||||
}
|
||||
Err(_) => {
|
||||
let default_dir = get_default_tessdata_dir();
|
||||
println!(
|
||||
"TESSDATA_PREFIX not set, using default directory: {:?}",
|
||||
default_dir
|
||||
);
|
||||
default_dir
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn main() -> Result<(), Box<dyn Error>> {
|
||||
let api = TesseractAPI::new()?;
|
||||
|
||||
// Get tessdata directory (uses default location or TESSDATA_PREFIX if set)
|
||||
let tessdata_dir = get_tessdata_dir();
|
||||
api.init(tessdata_dir.to_str().unwrap(), "eng")?;
|
||||
|
||||
let width = 24;
|
||||
let height = 24;
|
||||
let bytes_per_pixel = 1;
|
||||
let bytes_per_line = width * bytes_per_pixel;
|
||||
|
||||
// Initialize image data with all white pixels
|
||||
let mut image_data = vec![255u8; width * height];
|
||||
|
||||
// Draw number 9 with clearer distinction
|
||||
for y in 4..19 {
|
||||
for x in 7..17 {
|
||||
// Top bar
|
||||
if y == 4 && x >= 8 && x <= 15 {
|
||||
image_data[y * width + x] = 0;
|
||||
}
|
||||
// Top curve left side
|
||||
if y >= 4 && y <= 10 && x == 7 {
|
||||
image_data[y * width + x] = 0;
|
||||
}
|
||||
// Top curve right side
|
||||
if y >= 4 && y <= 11 && x == 16 {
|
||||
image_data[y * width + x] = 0;
|
||||
}
|
||||
// Middle bar
|
||||
if y == 11 && x >= 8 && x <= 15 {
|
||||
image_data[y * width + x] = 0;
|
||||
}
|
||||
// Bottom right vertical line
|
||||
if y >= 11 && y <= 18 && x == 16 {
|
||||
image_data[y * width + x] = 0;
|
||||
}
|
||||
// Bottom bar
|
||||
if y == 18 && x >= 8 && x <= 15 {
|
||||
image_data[y * width + x] = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Set the image data
|
||||
api.set_image(
|
||||
&image_data,
|
||||
width.try_into().unwrap(),
|
||||
height.try_into().unwrap(),
|
||||
bytes_per_pixel.try_into().unwrap(),
|
||||
bytes_per_line.try_into().unwrap(),
|
||||
)?;
|
||||
|
||||
// Set whitelist for digits only
|
||||
api.set_variable("tessedit_char_whitelist", "0123456789")?;
|
||||
|
||||
// Set PSM mode to single character
|
||||
api.set_variable("tessedit_pageseg_mode", "10")?;
|
||||
|
||||
// Get the recognized text
|
||||
let text = api.get_utf8_text()?;
|
||||
println!("Recognized text: {}", text.trim());
|
||||
|
||||
Ok(())
|
||||
}
|
||||
```
|
||||
|
||||
## Advanced Usage
|
||||
|
||||
The API provides additional functionality for more complex OCR tasks, including thread-safe operations:
|
||||
|
||||
```rust
|
||||
use kreuzberg_tesseract::TesseractAPI;
|
||||
use std::sync::Arc;
|
||||
use std::thread;
|
||||
use std::error::Error;
|
||||
|
||||
fn main() -> Result<(), Box<dyn Error>> {
|
||||
let tessdata_dir = get_tessdata_dir();
|
||||
let api = TesseractAPI::new()?;
|
||||
|
||||
// Initialize the main API
|
||||
api.init(tessdata_dir.to_str().unwrap(), "eng")?;
|
||||
api.set_variable("tessedit_pageseg_mode", "1")?;
|
||||
|
||||
// Load and prepare image data
|
||||
let (image_data, width, height) = load_test_image("sample_text.png")?;
|
||||
|
||||
// Share image data across threads
|
||||
let image_data = Arc::new(image_data);
|
||||
let mut handles = vec![];
|
||||
|
||||
// Spawn multiple threads for parallel OCR processing
|
||||
for _ in 0..3 {
|
||||
let api_clone = api.clone(); // Clones the API with all configurations
|
||||
let image_data = Arc::clone(&image_data);
|
||||
|
||||
let handle = thread::spawn(move || {
|
||||
// Set image in each thread
|
||||
let res = api_clone.set_image(
|
||||
&image_data,
|
||||
width as i32,
|
||||
height as i32,
|
||||
3,
|
||||
3 * width as i32,
|
||||
);
|
||||
assert!(res.is_ok());
|
||||
|
||||
// Perform OCR in parallel
|
||||
let text = api_clone.get_utf8_text()
|
||||
.expect("Failed to get text");
|
||||
println!("Thread result: {}", text);
|
||||
});
|
||||
handles.push(handle);
|
||||
}
|
||||
|
||||
// Wait for all threads to complete
|
||||
for handle in handles {
|
||||
handle.join().unwrap();
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// Helper function to get tessdata directory
|
||||
fn get_tessdata_dir() -> PathBuf {
|
||||
// ... (implementation as shown in basic example)
|
||||
}
|
||||
|
||||
// Helper function to load test image
|
||||
fn load_test_image(filename: &str) -> Result<(Vec<u8>, u32, u32), Box<dyn Error>> {
|
||||
let img = image::open(filename)?
|
||||
.to_rgb8();
|
||||
let (width, height) = img.dimensions();
|
||||
Ok((img.into_raw(), width, height))
|
||||
}
|
||||
```
|
||||
|
||||
## Building
|
||||
|
||||
### Static Linking (Default)
|
||||
|
||||
With static linking, the crate will automatically download and compile Tesseract and Leptonica during the build process. This may take some time on the first build (5-10 minutes), but subsequent builds will use the cached libraries.
|
||||
|
||||
To clean the cache and force a rebuild:
|
||||
|
||||
```bash
|
||||
CARGO_CLEAN=1 cargo build
|
||||
```
|
||||
|
||||
### Dynamic Linking
|
||||
|
||||
With dynamic linking, the build is much faster (seconds instead of minutes) since it only links against system-installed libraries:
|
||||
|
||||
```bash
|
||||
cargo build --no-default-features --features dynamic-linking
|
||||
```
|
||||
|
||||
**Note**: Dynamic linking requires Tesseract and Leptonica to be installed on your system (see Installation section).
|
||||
|
||||
## Documentation
|
||||
|
||||
For more detailed information, please check the [API documentation](https://docs.rs/kreuzberg-tesseract).
|
||||
|
||||
## License
|
||||
|
||||
This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
|
||||
|
||||
## Acknowledgements
|
||||
|
||||
This project is based on the original [tesseract-rs](https://github.com/cafercangundogdu/tesseract-rs) by [Cafer Can Gündoğdu](https://github.com/cafercangundogdu). We are grateful for the foundational work that made this project possible.
|
||||
|
||||
## Contributing
|
||||
|
||||
We welcome contributions! Please see our [Contributing Guide](../../CONTRIBUTING.md) for details.
|
||||
|
||||
### Quick Start for Contributors
|
||||
|
||||
1. Fork and clone the repository
|
||||
2. Install uv and set up git hooks:
|
||||
|
||||
```bash
|
||||
curl -LsSf https://astral.sh/uv/install.sh | sh
|
||||
uvx prek install
|
||||
```
|
||||
|
||||
3. Make your changes following our commit message format
|
||||
4. Run tests: `cargo test`
|
||||
5. Submit a Pull Request
|
||||
|
||||
Our commit messages follow the [Conventional Commits](https://www.conventionalcommits.org/) specification.
|
||||
|
||||
## Third-Party Acknowledgements
|
||||
|
||||
This project uses [Tesseract OCR](https://github.com/tesseract-ocr/tesseract) and [Leptonica](http://leptonica.org/). We are grateful to the maintainers and contributors of these projects.
|
||||
|
||||
```text
|
||||
|
||||
```
|
||||
2011
crates/kreuzberg-tesseract/build.rs
Normal file
2011
crates/kreuzberg-tesseract/build.rs
Normal file
File diff suppressed because it is too large
Load Diff
74
crates/kreuzberg-tesseract/patches/README.md
Normal file
74
crates/kreuzberg-tesseract/patches/README.md
Normal file
@@ -0,0 +1,74 @@
|
||||
# Tesseract WASM Patches
|
||||
|
||||
This directory contains patches needed to compile Tesseract for WebAssembly (WASM) targets using WASI SDK.
|
||||
|
||||
These patches are vendored from the [tesseract-wasm](https://github.com/robertknight/tesseract-wasm) project and have been proven to work with WASM compilation.
|
||||
|
||||
## Patches
|
||||
|
||||
### tesseract.diff
|
||||
|
||||
A comprehensive patch that makes Tesseract compatible with WASM compilation. The patch includes the following changes:
|
||||
|
||||
#### 1. CMakeLists.txt Modifications
|
||||
|
||||
- **New CMake option**: `BUILD_TESSERACT_BINARY` (default: ON)
|
||||
- Allows disabling the Tesseract CLI binary build, which is not needed for WASM
|
||||
- Wraps all executable and installation targets for the tesseract binary
|
||||
|
||||
- **Disabled components for WASM**:
|
||||
- Removes OpenCL support (`src/opencl/*.cpp`) - not applicable to WASM
|
||||
- Removes viewer support (`src/viewer/*.cpp`) - UI components not needed for WASM
|
||||
- Removes C API bindings (`src/api/capi.cpp`) - only hocrrenderer is kept
|
||||
- Removes PDF and rendering support files:
|
||||
- `src/api/renderer.cpp`
|
||||
- `src/api/altorenderer.cpp`
|
||||
- `src/api/lstmboxrenderer.cpp`
|
||||
- `src/api/pdfrenderer.cpp`
|
||||
- `src/api/wordstrboxrenderer.cpp`
|
||||
|
||||
#### 2. SIMD Detection Fixes (src/arch/simddetect.cpp)
|
||||
|
||||
- Guards CPUID detection with `#if !defined(__wasm__)`
|
||||
- Prevents attempts to use CPU feature detection that doesn't exist in WASM
|
||||
- The HAS_CPUID macro is only defined for non-WASM builds
|
||||
- This allows the code to gracefully handle WASM's SIMD limitations
|
||||
|
||||
#### 3. Pointer Type Fixes (src/ccmain/pageiterator.cpp, src/ccmain/pagesegmain.cpp, src/ccmain/tesseractclass.cpp)
|
||||
|
||||
**Changed from stack allocation to heap allocation** in `tesseractclass.h`:
|
||||
|
||||
- `pixa_debug_` changed from `DebugPixa` to `std::unique_ptr<DebugPixa>`
|
||||
- This prevents large allocations on the stack, which is limited in WASM
|
||||
|
||||
**Updated all references** throughout the codebase:
|
||||
|
||||
- `.get()` calls added where raw pointers are needed
|
||||
- Arrow operator `->` replaces dot operator `.` for member access
|
||||
- Null checks added before dereferencing to prevent crashes
|
||||
|
||||
**Affected functions**:
|
||||
|
||||
- `PageIterator::Orientation()` - added null vector check
|
||||
- `Tesseract::AutoPageSeg()` - updated pointer passing
|
||||
- `Tesseract::SetupPageSegAndDetectOrientation()` - multiple pointer updates
|
||||
- `Tesseract::Clear()` - added null check before WritePDF
|
||||
- `Tesseract::PrepareForPageseg()` - updated Split() calls
|
||||
- `Tesseract::PrepareForTessOCR()` - updated Split() calls
|
||||
|
||||
#### 4. Additional Fixes
|
||||
|
||||
- **Orientation detection**: Changed comparison from `> 0.0F` to `>= 0.0F` in `pageiterator.cpp` to handle null vectors gracefully when orientation info is not available
|
||||
|
||||
## How to Apply
|
||||
|
||||
These patches are applied during the WASM build process. They modify the Tesseract source code to:
|
||||
|
||||
1. Disable WASM-incompatible features (OpenCL, viewers, renderers)
|
||||
2. Prevent CPUID detection in WASM environment
|
||||
3. Use heap allocation instead of stack allocation for large objects
|
||||
4. Handle missing pointer initialization gracefully
|
||||
|
||||
## Source
|
||||
|
||||
These patches are based on the proven WASM compilation approach used by the tesseract.js project, which successfully compiles Tesseract to WebAssembly and deploys it in production environments.
|
||||
199
crates/kreuzberg-tesseract/patches/tesseract.diff
Normal file
199
crates/kreuzberg-tesseract/patches/tesseract.diff
Normal file
@@ -0,0 +1,199 @@
|
||||
diff --git a/CMakeLists.txt b/CMakeLists.txt
|
||||
index 8c6845cb..fdcfc4a8 100644
|
||||
--- a/CMakeLists.txt
|
||||
+++ b/CMakeLists.txt
|
||||
@@ -90,6 +90,7 @@ option(ENABLE_LTO "Enable link-time optimization" OFF)
|
||||
option(FAST_FLOAT "Enable float for LSTM" ON)
|
||||
option(ENABLE_OPENCL "Enable unsupported experimental OpenCL support" OFF)
|
||||
option(BUILD_TRAINING_TOOLS "Build training tools" ON)
|
||||
+option(BUILD_TESSERACT_BINARY "Build Tesseract binary" ON)
|
||||
option(BUILD_TESTS "Build tests" OFF)
|
||||
option(USE_SYSTEM_ICU "Use system ICU" OFF)
|
||||
option(DISABLE_ARCHIVE "Disable build with libarchive (if available)" OFF)
|
||||
@@ -565,9 +566,7 @@ file(
|
||||
src/cutil/*.cpp
|
||||
src/dict/*.cpp
|
||||
src/lstm/*.cpp
|
||||
- src/opencl/*.cpp
|
||||
src/textord/*.cpp
|
||||
- src/viewer/*.cpp
|
||||
src/wordrec/*.cpp)
|
||||
|
||||
if(DISABLED_LEGACY_ENGINE)
|
||||
@@ -714,13 +713,7 @@ file(
|
||||
set(TESSERACT_SRC
|
||||
${TESSERACT_SRC}
|
||||
src/api/baseapi.cpp
|
||||
- src/api/capi.cpp
|
||||
- src/api/renderer.cpp
|
||||
- src/api/altorenderer.cpp
|
||||
- src/api/hocrrenderer.cpp
|
||||
- src/api/lstmboxrenderer.cpp
|
||||
- src/api/pdfrenderer.cpp
|
||||
- src/api/wordstrboxrenderer.cpp)
|
||||
+ src/api/hocrrenderer.cpp)
|
||||
|
||||
set(TESSERACT_CONFIGS
|
||||
tessdata/configs/alto
|
||||
@@ -858,14 +851,16 @@ endif()
|
||||
# EXECUTABLE tesseract
|
||||
# ##############################################################################
|
||||
|
||||
-add_executable(tesseract src/tesseract.cpp)
|
||||
-target_link_libraries(tesseract libtesseract)
|
||||
-if(HAVE_TIFFIO_H AND WIN32)
|
||||
- target_link_libraries(tesseract ${TIFF_LIBRARIES})
|
||||
-endif()
|
||||
+if(BUILD_TESSERACT_BINARY)
|
||||
+ add_executable(tesseract src/tesseract.cpp)
|
||||
+ target_link_libraries(tesseract libtesseract)
|
||||
+ if(HAVE_TIFFIO_H AND WIN32)
|
||||
+ target_link_libraries(tesseract ${TIFF_LIBRARIES})
|
||||
+ endif()
|
||||
|
||||
-if(OPENMP_BUILD AND UNIX)
|
||||
- target_link_libraries(tesseract pthread)
|
||||
+ if(OPENMP_BUILD AND UNIX)
|
||||
+ target_link_libraries(tesseract pthread)
|
||||
+ endif()
|
||||
endif()
|
||||
|
||||
# ##############################################################################
|
||||
@@ -899,7 +894,11 @@ write_basic_package_version_file(
|
||||
|
||||
install(FILES ${CMAKE_CURRENT_BINARY_DIR}/tesseract.pc
|
||||
DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig)
|
||||
-install(TARGETS tesseract DESTINATION bin)
|
||||
+
|
||||
+if(BUILD_TESSERACT_BINARY)
|
||||
+ install(TARGETS tesseract DESTINATION bin)
|
||||
+endif()
|
||||
+
|
||||
install(
|
||||
TARGETS libtesseract
|
||||
EXPORT TesseractTargets
|
||||
diff --git a/src/arch/simddetect.cpp b/src/arch/simddetect.cpp
|
||||
index 1afe5a5d..cb8c6d4c 100644
|
||||
--- a/src/arch/simddetect.cpp
|
||||
+++ b/src/arch/simddetect.cpp
|
||||
@@ -40,10 +40,12 @@
|
||||
|
||||
#endif
|
||||
|
||||
+#if !defined(__wasm__)
|
||||
#if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_FMA) || defined(HAVE_SSE4_1)
|
||||
// See https://en.wikipedia.org/wiki/CPUID.
|
||||
# define HAS_CPUID
|
||||
#endif
|
||||
+#endif
|
||||
|
||||
#if defined(HAS_CPUID)
|
||||
# if defined(__GNUC__)
|
||||
diff --git a/src/ccmain/pageiterator.cpp b/src/ccmain/pageiterator.cpp
|
||||
index 64ff7f66..c0f80e5f 100644
|
||||
--- a/src/ccmain/pageiterator.cpp
|
||||
+++ b/src/ccmain/pageiterator.cpp
|
||||
@@ -582,7 +582,9 @@ void PageIterator::Orientation(tesseract::Orientation *orientation,
|
||||
up_in_image.rotate(block->re_rotation());
|
||||
|
||||
if (up_in_image.x() == 0.0F) {
|
||||
- if (up_in_image.y() > 0.0F) {
|
||||
+ // tesseract-wasm note: `up_in_image` will be a null vector if orientation
|
||||
+ // info is not available. In that case, assume page up.
|
||||
+ if (up_in_image.y() >= 0.0F) {
|
||||
*orientation = ORIENTATION_PAGE_UP;
|
||||
} else {
|
||||
*orientation = ORIENTATION_PAGE_DOWN;
|
||||
diff --git a/src/ccmain/pagesegmain.cpp b/src/ccmain/pagesegmain.cpp
|
||||
index 0af44607..718e73ef 100644
|
||||
--- a/src/ccmain/pagesegmain.cpp
|
||||
+++ b/src/ccmain/pagesegmain.cpp
|
||||
@@ -222,7 +222,7 @@ int Tesseract::AutoPageSeg(PageSegMode pageseg_mode, BLOCK_LIST *blocks, TO_BLOC
|
||||
}
|
||||
#endif // ndef DISABLED_LEGACY_ENGINE
|
||||
result = finder->FindBlocks(pageseg_mode, scaled_color_, scaled_factor_, to_block,
|
||||
- photomask_pix, pix_thresholds_, pix_grey_, &pixa_debug_,
|
||||
+ photomask_pix, pix_thresholds_, pix_grey_, pixa_debug_.get(),
|
||||
&found_blocks, diacritic_blobs, to_blocks);
|
||||
if (result >= 0) {
|
||||
finder->GetDeskewVectors(&deskew_, &reskew_);
|
||||
@@ -279,17 +279,17 @@ ColumnFinder *Tesseract::SetupPageSegAndDetectOrientation(PageSegMode pageseg_mo
|
||||
ICOORD bleft(0, 0);
|
||||
|
||||
ASSERT_HOST(pix_binary_ != nullptr);
|
||||
- if (tessedit_dump_pageseg_images) {
|
||||
- pixa_debug_.AddPix(pix_binary_, "PageSegInput");
|
||||
+ if (tessedit_dump_pageseg_images && pixa_debug_) {
|
||||
+ pixa_debug_->AddPix(pix_binary_, "PageSegInput");
|
||||
}
|
||||
// Leptonica is used to find the rule/separator lines in the input.
|
||||
LineFinder::FindAndRemoveLines(source_resolution_, textord_tabfind_show_vlines, pix_binary_,
|
||||
&vertical_x, &vertical_y, music_mask_pix, &v_lines, &h_lines);
|
||||
- if (tessedit_dump_pageseg_images) {
|
||||
- pixa_debug_.AddPix(pix_binary_, "NoLines");
|
||||
+ if (tessedit_dump_pageseg_images && pixa_debug_) {
|
||||
+ pixa_debug_->AddPix(pix_binary_, "NoLines");
|
||||
}
|
||||
// Leptonica is used to find a mask of the photo regions in the input.
|
||||
- *photo_mask_pix = ImageFind::FindImages(pix_binary_, &pixa_debug_);
|
||||
+ *photo_mask_pix = ImageFind::FindImages(pix_binary_, pixa_debug_.get());
|
||||
if (tessedit_dump_pageseg_images) {
|
||||
Image pix_no_image_ = nullptr;
|
||||
if (*photo_mask_pix != nullptr) {
|
||||
@@ -297,7 +297,7 @@ ColumnFinder *Tesseract::SetupPageSegAndDetectOrientation(PageSegMode pageseg_mo
|
||||
} else {
|
||||
pix_no_image_ = pix_binary_.clone();
|
||||
}
|
||||
- pixa_debug_.AddPix(pix_no_image_, "NoImages");
|
||||
+ pixa_debug_->AddPix(pix_no_image_, "NoImages");
|
||||
pix_no_image_.destroy();
|
||||
}
|
||||
if (!PSM_COL_FIND_ENABLED(pageseg_mode)) {
|
||||
diff --git a/src/ccmain/tesseractclass.cpp b/src/ccmain/tesseractclass.cpp
|
||||
index fd58ac87..517f925e 100644
|
||||
--- a/src/ccmain/tesseractclass.cpp
|
||||
+++ b/src/ccmain/tesseractclass.cpp
|
||||
@@ -487,8 +487,10 @@ Dict &Tesseract::getDict() {
|
||||
}
|
||||
|
||||
void Tesseract::Clear() {
|
||||
- std::string debug_name = imagebasename + "_debug.pdf";
|
||||
- pixa_debug_.WritePDF(debug_name.c_str());
|
||||
+ if (pixa_debug_) {
|
||||
+ std::string debug_name = imagebasename + "_debug.pdf";
|
||||
+ pixa_debug_->WritePDF(debug_name.c_str());
|
||||
+ }
|
||||
pix_binary_.destroy();
|
||||
pix_grey_.destroy();
|
||||
pix_thresholds_.destroy();
|
||||
@@ -572,7 +574,7 @@ void Tesseract::PrepareForPageseg() {
|
||||
// the newly split image.
|
||||
splitter_.set_orig_pix(pix_binary());
|
||||
splitter_.set_pageseg_split_strategy(max_pageseg_strategy);
|
||||
- if (splitter_.Split(true, &pixa_debug_)) {
|
||||
+ if (splitter_.Split(true, pixa_debug_.get())) {
|
||||
ASSERT_HOST(splitter_.splitted_image());
|
||||
pix_binary_.destroy();
|
||||
pix_binary_ = splitter_.splitted_image().clone();
|
||||
@@ -599,7 +601,7 @@ void Tesseract::PrepareForTessOCR(BLOCK_LIST *block_list, Tesseract *osd_tess, O
|
||||
splitter_.set_segmentation_block_list(block_list);
|
||||
splitter_.set_ocr_split_strategy(max_ocr_strategy);
|
||||
// Run the splitter for OCR
|
||||
- bool split_for_ocr = splitter_.Split(false, &pixa_debug_);
|
||||
+ bool split_for_ocr = splitter_.Split(false, pixa_debug_.get());
|
||||
// Restore pix_binary to the binarized original pix for future reference.
|
||||
ASSERT_HOST(splitter_.orig_pix());
|
||||
pix_binary_.destroy();
|
||||
diff --git a/src/ccmain/tesseractclass.h b/src/ccmain/tesseractclass.h
|
||||
index 732bb9e6..030aa5bc 100644
|
||||
--- a/src/ccmain/tesseractclass.h
|
||||
+++ b/src/ccmain/tesseractclass.h
|
||||
@@ -986,7 +986,7 @@ private:
|
||||
// Thresholds that were used to generate the thresholded image from grey.
|
||||
Image pix_thresholds_;
|
||||
// Debug images. If non-empty, will be written on destruction.
|
||||
- DebugPixa pixa_debug_;
|
||||
+ std::unique_ptr<DebugPixa> pixa_debug_;
|
||||
// Input image resolution after any scaling. The resolution is not well
|
||||
// transmitted by operations on Pix, so we keep an independent record here.
|
||||
int source_resolution_;
|
||||
2309
crates/kreuzberg-tesseract/src/api.rs
Normal file
2309
crates/kreuzberg-tesseract/src/api.rs
Normal file
File diff suppressed because it is too large
Load Diff
77
crates/kreuzberg-tesseract/src/choice_iterator.rs
Normal file
77
crates/kreuzberg-tesseract/src/choice_iterator.rs
Normal file
@@ -0,0 +1,77 @@
|
||||
use crate::api::TessDeleteText;
|
||||
use crate::error::{Result, TesseractError};
|
||||
use std::ffi::CStr;
|
||||
use std::os::raw::{c_char, c_float, c_int, c_void};
|
||||
use std::sync::{Arc, Mutex};
|
||||
|
||||
/// Owning wrapper around a raw Tesseract choice-iterator handle.
///
/// The pointer is stored behind a mutex; every method of this type locks the
/// mutex before handing the pointer to the C API, and `Drop` releases the
/// handle.
#[derive(Debug)]
pub struct ChoiceIterator {
    /// Raw handle passed to the `TessChoiceIterator*` functions.
    handle: Arc<Mutex<*mut c_void>>,
}

// SAFETY: the raw pointer is never exposed outside this type and is only
// passed to the C API while the mutex is held, so calls on one handle are
// serialised.
// NOTE(review): this assumes the underlying Tesseract object has no thread
// affinity and may be used and freed from any thread — confirm.
unsafe impl Send for ChoiceIterator {}
// SAFETY: shared references only reach the pointer through the mutex (see the
// note on `Send` above).
unsafe impl Sync for ChoiceIterator {}
|
||||
|
||||
impl ChoiceIterator {
|
||||
/// Creates a new instance of the ChoiceIterator.
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// * `handle` - Pointer to the ChoiceIterator.
|
||||
pub fn new(handle: *mut c_void) -> Self {
|
||||
ChoiceIterator {
|
||||
handle: Arc::new(Mutex::new(handle)),
|
||||
}
|
||||
}
|
||||
|
||||
/// Gets the next choice.
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// Returns `true` if the next choice is successful, otherwise returns `false`.
|
||||
pub fn next(&self) -> Result<bool> {
|
||||
let handle = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
|
||||
Ok(unsafe { TessChoiceIteratorNext(*handle) != 0 })
|
||||
}
|
||||
|
||||
/// Gets the UTF-8 text for the current choice.
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// Returns the UTF-8 text as a `String` if successful, otherwise returns an error.
|
||||
pub fn get_utf8_text(&self) -> Result<String> {
|
||||
let handle = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
|
||||
let text_ptr = unsafe { TessChoiceIteratorGetUTF8Text(*handle) };
|
||||
if text_ptr.is_null() {
|
||||
return Err(TesseractError::NullPointerError);
|
||||
}
|
||||
let c_str = unsafe { CStr::from_ptr(text_ptr) };
|
||||
let result = c_str.to_str()?.to_owned();
|
||||
unsafe { TessDeleteText(text_ptr) };
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
/// Gets the confidence of the current choice.
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// Returns the confidence as a `f32`.
|
||||
pub fn confidence(&self) -> Result<f32> {
|
||||
let handle = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
|
||||
Ok(unsafe { TessChoiceIteratorConfidence(*handle) })
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for ChoiceIterator {
|
||||
fn drop(&mut self) {
|
||||
if let Ok(handle) = self.handle.lock() {
|
||||
unsafe { TessChoiceIteratorDelete(*handle) };
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Raw declarations of the choice-iterator functions used by `ChoiceIterator`.
// `ffi_extern!` is a macro defined outside this file.
ffi_extern! {
    // Called from `ChoiceIterator`'s `Drop` impl to release the handle.
    fn TessChoiceIteratorDelete(handle: *mut c_void);
    // A non-zero return is reported as `true` by `ChoiceIterator::next`.
    fn TessChoiceIteratorNext(handle: *mut c_void) -> c_int;
    // May return null; a non-null buffer is released with `TessDeleteText`
    // by `ChoiceIterator::get_utf8_text`.
    fn TessChoiceIteratorGetUTF8Text(handle: *mut c_void) -> *mut c_char;
    // The value is passed through unchanged by `ChoiceIterator::confidence`.
    fn TessChoiceIteratorConfidence(handle: *mut c_void) -> c_float;
}
|
||||
373
crates/kreuzberg-tesseract/src/enums.rs
Normal file
373
crates/kreuzberg-tesseract/src/enums.rs
Normal file
@@ -0,0 +1,373 @@
|
||||
/// Page segmentation mode constants with explicit integer discriminants.
#[repr(C)]
#[derive(Debug, Clone, Copy, PartialEq)]
#[allow(non_camel_case_types)]
pub enum TessPageSegMode {
    PSM_OSD_ONLY = 0,
    PSM_AUTO_OSD = 1,
    PSM_AUTO_ONLY = 2,
    PSM_AUTO = 3,
    PSM_SINGLE_COLUMN = 4,
    PSM_SINGLE_BLOCK_VERT_TEXT = 5,
    PSM_SINGLE_BLOCK = 6,
    PSM_SINGLE_LINE = 7,
    PSM_SINGLE_WORD = 8,
    PSM_CIRCLE_WORD = 9,
    PSM_SINGLE_CHAR = 10,
    PSM_SPARSE_TEXT = 11,
    PSM_SPARSE_TEXT_OSD = 12,
    PSM_RAW_LINE = 13,
    PSM_COUNT = 14,
}

impl TessPageSegMode {
    /// Converts an integer to a mode, substituting `PSM_AUTO` for any value
    /// outside `0..=14`.
    pub fn from_int(value: i32) -> Self {
        Self::try_from_int(value).unwrap_or(TessPageSegMode::PSM_AUTO)
    }

    /// Safely convert an integer to a TessPageSegMode, returning None for invalid values.
    pub fn try_from_int(value: i32) -> Option<Self> {
        // Entry `i` is the variant whose discriminant is `i`.
        const BY_VALUE: [TessPageSegMode; 15] = [
            TessPageSegMode::PSM_OSD_ONLY,
            TessPageSegMode::PSM_AUTO_OSD,
            TessPageSegMode::PSM_AUTO_ONLY,
            TessPageSegMode::PSM_AUTO,
            TessPageSegMode::PSM_SINGLE_COLUMN,
            TessPageSegMode::PSM_SINGLE_BLOCK_VERT_TEXT,
            TessPageSegMode::PSM_SINGLE_BLOCK,
            TessPageSegMode::PSM_SINGLE_LINE,
            TessPageSegMode::PSM_SINGLE_WORD,
            TessPageSegMode::PSM_CIRCLE_WORD,
            TessPageSegMode::PSM_SINGLE_CHAR,
            TessPageSegMode::PSM_SPARSE_TEXT,
            TessPageSegMode::PSM_SPARSE_TEXT_OSD,
            TessPageSegMode::PSM_RAW_LINE,
            TessPageSegMode::PSM_COUNT,
        ];
        if value < 0 {
            return None;
        }
        BY_VALUE.get(value as usize).copied()
    }
}
|
||||
|
||||
/// Page iterator level constants with explicit integer discriminants.
#[repr(C)]
#[derive(Debug, Clone, Copy, PartialEq)]
#[allow(non_camel_case_types)]
pub enum TessPageIteratorLevel {
    RIL_BLOCK = 0,
    RIL_PARA = 1,
    RIL_TEXTLINE = 2,
    RIL_WORD = 3,
    RIL_SYMBOL = 4,
}

impl TessPageIteratorLevel {
    /// Converts an integer to a level, substituting `RIL_BLOCK` for any value
    /// outside `0..=4`.
    pub fn from_int(value: i32) -> Self {
        Self::try_from_int(value).unwrap_or(TessPageIteratorLevel::RIL_BLOCK)
    }

    /// Safely convert an integer to a TessPageIteratorLevel, returning None for invalid values.
    ///
    /// Provided for parity with the other enums in this module, which all
    /// offer a non-lossy conversion next to `from_int`.
    pub fn try_from_int(value: i32) -> Option<Self> {
        match value {
            0 => Some(TessPageIteratorLevel::RIL_BLOCK),
            1 => Some(TessPageIteratorLevel::RIL_PARA),
            2 => Some(TessPageIteratorLevel::RIL_TEXTLINE),
            3 => Some(TessPageIteratorLevel::RIL_WORD),
            4 => Some(TessPageIteratorLevel::RIL_SYMBOL),
            _ => None,
        }
    }
}
|
||||
|
||||
/// Poly-block type constants with explicit integer discriminants.
#[repr(C)]
#[derive(Debug, Clone, Copy, PartialEq)]
#[allow(non_camel_case_types)]
pub enum TessPolyBlockType {
    PT_UNKNOWN = 0,
    PT_FLOWING_TEXT = 1,
    PT_HEADING_TEXT = 2,
    PT_PULLOUT_TEXT = 3,
    PT_EQUATION = 4,
    PT_INLINE_EQUATION = 5,
    PT_TABLE = 6,
    PT_VERTICAL_TEXT = 7,
    PT_CAPTION_TEXT = 8,
    PT_FLOWING_IMAGE = 9,
    PT_HEADING_IMAGE = 10,
    PT_PULLOUT_IMAGE = 11,
    PT_HORZ_LINE = 12,
    PT_VERT_LINE = 13,
    PT_NOISE = 14,
    PT_COUNT = 15,
}

impl TessPolyBlockType {
    /// Converts an integer to a block type, substituting `PT_UNKNOWN` for any
    /// value outside `0..=15`.
    pub fn from_int(value: i32) -> Self {
        Self::try_from_int(value).unwrap_or(TessPolyBlockType::PT_UNKNOWN)
    }

    /// Safely convert an integer to a TessPolyBlockType, returning None for invalid values.
    pub fn try_from_int(value: i32) -> Option<Self> {
        // Entry `i` is the variant whose discriminant is `i`.
        const BY_VALUE: [TessPolyBlockType; 16] = [
            TessPolyBlockType::PT_UNKNOWN,
            TessPolyBlockType::PT_FLOWING_TEXT,
            TessPolyBlockType::PT_HEADING_TEXT,
            TessPolyBlockType::PT_PULLOUT_TEXT,
            TessPolyBlockType::PT_EQUATION,
            TessPolyBlockType::PT_INLINE_EQUATION,
            TessPolyBlockType::PT_TABLE,
            TessPolyBlockType::PT_VERTICAL_TEXT,
            TessPolyBlockType::PT_CAPTION_TEXT,
            TessPolyBlockType::PT_FLOWING_IMAGE,
            TessPolyBlockType::PT_HEADING_IMAGE,
            TessPolyBlockType::PT_PULLOUT_IMAGE,
            TessPolyBlockType::PT_HORZ_LINE,
            TessPolyBlockType::PT_VERT_LINE,
            TessPolyBlockType::PT_NOISE,
            TessPolyBlockType::PT_COUNT,
        ];
        if value < 0 {
            return None;
        }
        BY_VALUE.get(value as usize).copied()
    }
}
|
||||
|
||||
/// Page orientation constants with explicit integer discriminants.
#[repr(C)]
#[derive(Debug, Clone, Copy, PartialEq)]
#[allow(non_camel_case_types)]
pub enum TessOrientation {
    ORIENTATION_PAGE_UP = 0,
    ORIENTATION_PAGE_RIGHT = 1,
    ORIENTATION_PAGE_DOWN = 2,
    ORIENTATION_PAGE_LEFT = 3,
}

impl TessOrientation {
    /// Converts an integer to an orientation, substituting
    /// `ORIENTATION_PAGE_UP` for any value outside `0..=3`.
    pub fn from_int(value: i32) -> Self {
        Self::try_from_int(value).unwrap_or(TessOrientation::ORIENTATION_PAGE_UP)
    }

    /// Safely convert an integer to a TessOrientation, returning None for invalid values.
    pub fn try_from_int(value: i32) -> Option<Self> {
        // Entry `i` is the variant whose discriminant is `i`.
        const BY_VALUE: [TessOrientation; 4] = [
            TessOrientation::ORIENTATION_PAGE_UP,
            TessOrientation::ORIENTATION_PAGE_RIGHT,
            TessOrientation::ORIENTATION_PAGE_DOWN,
            TessOrientation::ORIENTATION_PAGE_LEFT,
        ];
        if value < 0 {
            return None;
        }
        BY_VALUE.get(value as usize).copied()
    }
}
|
||||
|
||||
/// Paragraph justification constants with explicit integer discriminants.
#[repr(C)]
#[derive(Debug, Clone, Copy, PartialEq)]
#[allow(non_camel_case_types)]
pub enum TessParagraphJustification {
    JUSTIFICATION_UNKNOWN = 0,
    JUSTIFICATION_LEFT = 1,
    JUSTIFICATION_CENTER = 2,
    JUSTIFICATION_RIGHT = 3,
}

impl TessParagraphJustification {
    /// Converts an integer to a justification, substituting
    /// `JUSTIFICATION_UNKNOWN` for any value outside `0..=3`.
    pub fn from_int(value: i32) -> Self {
        Self::try_from_int(value).unwrap_or(TessParagraphJustification::JUSTIFICATION_UNKNOWN)
    }

    /// Safely convert an integer to a TessParagraphJustification, returning None for invalid values.
    ///
    /// Provided for parity with the other enums in this module, which all
    /// offer a non-lossy conversion next to `from_int`.
    pub fn try_from_int(value: i32) -> Option<Self> {
        match value {
            0 => Some(TessParagraphJustification::JUSTIFICATION_UNKNOWN),
            1 => Some(TessParagraphJustification::JUSTIFICATION_LEFT),
            2 => Some(TessParagraphJustification::JUSTIFICATION_CENTER),
            3 => Some(TessParagraphJustification::JUSTIFICATION_RIGHT),
            _ => None,
        }
    }
}
|
||||
|
||||
/// Writing direction constants with explicit integer discriminants.
#[repr(C)]
#[derive(Debug, Clone, Copy, PartialEq)]
#[allow(non_camel_case_types)]
pub enum TessWritingDirection {
    WRITING_DIRECTION_LEFT_TO_RIGHT = 0,
    WRITING_DIRECTION_RIGHT_TO_LEFT = 1,
    WRITING_DIRECTION_TOP_TO_BOTTOM = 2,
}

impl TessWritingDirection {
    /// Converts an integer to a writing direction, substituting
    /// `WRITING_DIRECTION_LEFT_TO_RIGHT` for any value outside `0..=2`.
    pub fn from_int(value: i32) -> Self {
        Self::try_from_int(value).unwrap_or(TessWritingDirection::WRITING_DIRECTION_LEFT_TO_RIGHT)
    }

    /// Safely convert an integer to a TessWritingDirection, returning None for invalid values.
    pub fn try_from_int(value: i32) -> Option<Self> {
        // Entry `i` is the variant whose discriminant is `i`.
        const BY_VALUE: [TessWritingDirection; 3] = [
            TessWritingDirection::WRITING_DIRECTION_LEFT_TO_RIGHT,
            TessWritingDirection::WRITING_DIRECTION_RIGHT_TO_LEFT,
            TessWritingDirection::WRITING_DIRECTION_TOP_TO_BOTTOM,
        ];
        if value < 0 {
            return None;
        }
        BY_VALUE.get(value as usize).copied()
    }
}
|
||||
|
||||
/// Text-line order constants with explicit integer discriminants.
#[repr(C)]
#[derive(Debug, Clone, Copy, PartialEq)]
#[allow(non_camel_case_types)]
pub enum TessTextlineOrder {
    TEXTLINE_ORDER_LEFT_TO_RIGHT = 0,
    TEXTLINE_ORDER_RIGHT_TO_LEFT = 1,
    TEXTLINE_ORDER_TOP_TO_BOTTOM = 2,
}

impl TessTextlineOrder {
    /// Converts an integer to a text-line order, substituting
    /// `TEXTLINE_ORDER_LEFT_TO_RIGHT` for any value outside `0..=2`.
    pub fn from_int(value: i32) -> Self {
        Self::try_from_int(value).unwrap_or(TessTextlineOrder::TEXTLINE_ORDER_LEFT_TO_RIGHT)
    }

    /// Safely convert an integer to a TessTextlineOrder, returning None for invalid values.
    pub fn try_from_int(value: i32) -> Option<Self> {
        // Entry `i` is the variant whose discriminant is `i`.
        const BY_VALUE: [TessTextlineOrder; 3] = [
            TessTextlineOrder::TEXTLINE_ORDER_LEFT_TO_RIGHT,
            TessTextlineOrder::TEXTLINE_ORDER_RIGHT_TO_LEFT,
            TessTextlineOrder::TEXTLINE_ORDER_TOP_TO_BOTTOM,
        ];
        if value < 0 {
            return None;
        }
        BY_VALUE.get(value as usize).copied()
    }
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    //! Unit tests for the integer conversions of the enums in this module.

    use super::*;

    #[test]
    fn test_page_seg_mode_from_int() {
        assert_eq!(TessPageSegMode::from_int(0), TessPageSegMode::PSM_OSD_ONLY);
        assert_eq!(TessPageSegMode::from_int(3), TessPageSegMode::PSM_AUTO);
        assert_eq!(TessPageSegMode::from_int(10), TessPageSegMode::PSM_SINGLE_CHAR);
        // Out-of-range input falls back to PSM_AUTO.
        assert_eq!(TessPageSegMode::from_int(999), TessPageSegMode::PSM_AUTO);
    }

    #[test]
    fn test_page_seg_mode_conversion() {
        assert_eq!(TessPageSegMode::PSM_SINGLE_LINE as i32, 7);
    }

    #[test]
    fn test_page_iterator_level_from_int() {
        assert_eq!(TessPageIteratorLevel::from_int(0), TessPageIteratorLevel::RIL_BLOCK);
        assert_eq!(TessPageIteratorLevel::from_int(3), TessPageIteratorLevel::RIL_WORD);
        assert_eq!(TessPageIteratorLevel::from_int(-1), TessPageIteratorLevel::RIL_BLOCK);
    }

    #[test]
    fn test_poly_block_type_from_int() {
        assert_eq!(TessPolyBlockType::from_int(1), TessPolyBlockType::PT_FLOWING_TEXT);
        assert_eq!(TessPolyBlockType::from_int(6), TessPolyBlockType::PT_TABLE);
        assert_eq!(TessPolyBlockType::from_int(100), TessPolyBlockType::PT_UNKNOWN);
    }

    #[test]
    fn test_orientation_from_int() {
        assert_eq!(TessOrientation::from_int(0), TessOrientation::ORIENTATION_PAGE_UP);
        assert_eq!(TessOrientation::from_int(2), TessOrientation::ORIENTATION_PAGE_DOWN);
        assert_eq!(TessOrientation::from_int(5), TessOrientation::ORIENTATION_PAGE_UP);
    }

    #[test]
    fn test_paragraph_justification_from_int() {
        assert_eq!(
            TessParagraphJustification::from_int(1),
            TessParagraphJustification::JUSTIFICATION_LEFT
        );
        assert_eq!(
            TessParagraphJustification::from_int(3),
            TessParagraphJustification::JUSTIFICATION_RIGHT
        );
        assert_eq!(
            TessParagraphJustification::from_int(-1),
            TessParagraphJustification::JUSTIFICATION_UNKNOWN
        );
    }

    #[test]
    fn test_writing_direction_from_int() {
        assert_eq!(
            TessWritingDirection::from_int(0),
            TessWritingDirection::WRITING_DIRECTION_LEFT_TO_RIGHT
        );
        assert_eq!(
            TessWritingDirection::from_int(1),
            TessWritingDirection::WRITING_DIRECTION_RIGHT_TO_LEFT
        );
        assert_eq!(
            TessWritingDirection::from_int(10),
            TessWritingDirection::WRITING_DIRECTION_LEFT_TO_RIGHT
        );
    }

    #[test]
    fn test_textline_order_from_int() {
        assert_eq!(
            TessTextlineOrder::from_int(0),
            TessTextlineOrder::TEXTLINE_ORDER_LEFT_TO_RIGHT
        );
        assert_eq!(
            TessTextlineOrder::from_int(2),
            TessTextlineOrder::TEXTLINE_ORDER_TOP_TO_BOTTOM
        );
        assert_eq!(
            TessTextlineOrder::from_int(99),
            TessTextlineOrder::TEXTLINE_ORDER_LEFT_TO_RIGHT
        );
    }

    /// Every valid discriminant must round-trip through `try_from_int`.
    #[test]
    fn test_try_from_int_accepts_every_valid_value() {
        for value in 0..=14 {
            assert_eq!(TessPageSegMode::try_from_int(value).map(|v| v as i32), Some(value));
        }
        for value in 0..=15 {
            assert_eq!(TessPolyBlockType::try_from_int(value).map(|v| v as i32), Some(value));
        }
        for value in 0..=3 {
            assert_eq!(TessOrientation::try_from_int(value).map(|v| v as i32), Some(value));
        }
        for value in 0..=2 {
            assert_eq!(TessWritingDirection::try_from_int(value).map(|v| v as i32), Some(value));
            assert_eq!(TessTextlineOrder::try_from_int(value).map(|v| v as i32), Some(value));
        }
    }

    /// Values just past the last discriminant, negatives and extremes are rejected.
    #[test]
    fn test_try_from_int_rejects_out_of_range_values() {
        assert_eq!(TessPageSegMode::try_from_int(15), None);
        assert_eq!(TessPolyBlockType::try_from_int(16), None);
        assert_eq!(TessOrientation::try_from_int(4), None);
        assert_eq!(TessWritingDirection::try_from_int(3), None);
        assert_eq!(TessTextlineOrder::try_from_int(3), None);

        for value in [-1, i32::MIN, i32::MAX] {
            assert_eq!(TessPageSegMode::try_from_int(value), None);
            assert_eq!(TessPolyBlockType::try_from_int(value), None);
            assert_eq!(TessOrientation::try_from_int(value), None);
            assert_eq!(TessWritingDirection::try_from_int(value), None);
            assert_eq!(TessTextlineOrder::try_from_int(value), None);
        }
    }

    #[test]
    fn test_enums_are_copy() {
        fn assert_copy<T: Copy>() {}

        assert_copy::<TessPageSegMode>();
        assert_copy::<TessPageIteratorLevel>();
        assert_copy::<TessPolyBlockType>();
        assert_copy::<TessOrientation>();
        assert_copy::<TessParagraphJustification>();
        assert_copy::<TessWritingDirection>();
        assert_copy::<TessTextlineOrder>();
    }
}
|
||||
85
crates/kreuzberg-tesseract/src/error.rs
Normal file
85
crates/kreuzberg-tesseract/src/error.rs
Normal file
@@ -0,0 +1,85 @@
|
||||
use std::str::Utf8Error;
|
||||
use thiserror::Error;
|
||||
|
||||
/// Errors that can occur when using the Tesseract API.
///
/// Each variant's `Display` text comes from its `#[error(...)]` attribute.
#[derive(Error, Debug)]
pub enum TesseractError {
    /// Tesseract could not be initialized.
    #[error("Failed to initialize Tesseract")]
    InitError,
    /// An image could not be set.
    #[error("Failed to set image")]
    SetImageError,
    /// An OCR operation failed.
    #[error("OCR operation failed")]
    OcrError,
    /// Text produced by Tesseract was not valid UTF-8; wraps the underlying
    /// `Utf8Error` and is created automatically via `From`.
    #[error("Invalid UTF-8 in Tesseract output")]
    Utf8Error(#[from] Utf8Error),
    /// A mutex could not be locked.
    #[error("Failed to lock mutex")]
    MutexLockError,
    /// A variable could not be set.
    #[error("Failed to set variable")]
    SetVariableError,
    /// A variable could not be read.
    #[error("Failed to get variable")]
    GetVariableError,
    /// A pointer that must be non-null was null.
    #[error("Null pointer error")]
    NullPointerError,
    /// A parameter was invalid.
    #[error("Invalid parameter")]
    InvalidParameterError,
    /// Layout analysis failed.
    #[error("Layout analysis failed")]
    AnalyseLayoutError,
    /// Page processing failed.
    #[error("Page processing failed")]
    ProcessPagesError,
    /// An I/O error occurred (no underlying error is carried).
    #[error("I/O error")]
    IoError,
    /// A mutex-related error.
    // NOTE(review): overlaps with `MutexLockError`; the distinction is not
    // visible from this file — confirm intended usage.
    #[error("Mutex error")]
    MutexError,
    /// Image dimensions were invalid.
    #[error("Invalid dimensions")]
    InvalidDimensions,
    /// The bytes-per-pixel value was invalid.
    #[error("Invalid bytes per pixel")]
    InvalidBytesPerPixel,
    /// The bytes-per-line value was invalid.
    #[error("Invalid bytes per line")]
    InvalidBytesPerLine,
    /// Image data was invalid.
    #[error("Invalid image data")]
    InvalidImageData,
    /// Something was used before being initialized.
    #[error("Uninitialized error")]
    UninitializedError,
    /// An integer did not correspond to a valid enum value; carries the
    /// offending integer.
    #[error("Invalid enum value: {0}")]
    InvalidEnumValue(i32),
    /// A string contained a null byte.
    #[error("String contains null byte")]
    NullByteInString,
}

/// Result type for Tesseract operations.
pub type Result<T> = std::result::Result<T, TesseractError>;
|
||||
|
||||
#[cfg(test)]
mod tests {
    //! Unit tests for `TesseractError` display text, conversions and auto traits.

    use super::*;

    #[test]
    fn test_error_display() {
        // Each variant paired with the message its `#[error]` attribute defines.
        let cases = [
            (TesseractError::InitError, "Failed to initialize Tesseract"),
            (TesseractError::SetImageError, "Failed to set image"),
            (TesseractError::OcrError, "OCR operation failed"),
        ];

        for (error, expected) in cases {
            assert_eq!(error.to_string(), expected);
        }
    }

    #[test]
    fn test_utf8_error_conversion() {
        let invalid_utf8 = vec![0xFF, 0xFE];
        let utf8_error = std::str::from_utf8(&invalid_utf8).unwrap_err();
        let tess_error = TesseractError::from(utf8_error);

        if !matches!(tess_error, TesseractError::Utf8Error(_)) {
            panic!("Expected Utf8Error variant");
        }
    }

    #[test]
    fn test_error_is_send_sync() {
        fn assert_send_sync<T: Send + Sync>() {}

        assert_send_sync::<TesseractError>();
    }
}
|
||||
807
crates/kreuzberg-tesseract/src/leptonica.rs
Normal file
807
crates/kreuzberg-tesseract/src/leptonica.rs
Normal file
@@ -0,0 +1,807 @@
|
||||
//! Safe Leptonica Pix wrapper for image preprocessing before OCR.
|
||||
//!
|
||||
//! Provides a safe Rust wrapper around the Leptonica image-processing library.
|
||||
//! `Pix` is the core Leptonica image type. All methods return `Result<Pix>`,
|
||||
//! and the wrapper takes care of proper memory management via `Drop`.
|
||||
//!
|
||||
//! ## Pixel format
|
||||
//!
|
||||
//! Leptonica's 32 bpp format stores each pixel as a native 32-bit integer
|
||||
//! with the logical layout (MSB→LSB): `R G B A`, i.e.
|
||||
//! `(r << 24) | (g << 16) | (b << 8) | alpha`. Leptonica accesses
|
||||
//! individual channels via bit-shift on the integer value, not via
|
||||
//! byte-addressed pointer arithmetic, so the packing is identical on both
|
||||
//! big- and little-endian hosts. Do **not** call `pixEndianByteSwap` after
|
||||
//! writing pixels this way — doing so inverts the channel order.
|
||||
//!
|
||||
//! ## `pixDeskew` requires a binary (1 bpp) image
|
||||
//!
|
||||
//! Call `to_grayscale()` followed by `adaptive_threshold()` before `deskew()`.
|
||||
//! `pixDeskew` internally calls `pixFindSkewSweepAndSearchScorePivot` which
|
||||
//! operates on 1-bit images only; passing a colour image will return a null
|
||||
//! pointer.
|
||||
|
||||
use crate::error::{Result, TesseractError};
|
||||
use std::ffi::c_void;
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Raw Leptonica FFI declarations
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
#[cfg(any(feature = "build-tesseract", feature = "build-tesseract-wasm"))]
ffi_extern! {
    /// Allocates a new Pix with the given dimensions and bit depth.
    fn pixCreate(width: i32, height: i32, depth: i32) -> *mut c_void;

    /// Frees a Pix and sets the caller's pointer to null.
    ///
    /// Leptonica uses a double-pointer convention: `*ppix` is set to null
    /// after the call so that accidental double-frees are a no-op.
    fn pixDestroy(ppix: *mut *mut c_void);

    /// Sets the horizontal and vertical resolution (DPI) on a Pix.
    ///
    /// Returns 0 on success, non-zero on error.
    fn pixSetResolution(pix: *mut c_void, xres: i32, yres: i32) -> i32;

    /// Returns the width of the Pix in pixels.
    fn pixGetWidth(pix: *const c_void) -> i32;

    /// Returns the height of the Pix in pixels.
    fn pixGetHeight(pix: *const c_void) -> i32;

    /// Returns the bit depth of the Pix (1, 2, 4, 8, 16, or 32).
    fn pixGetDepth(pix: *const c_void) -> i32;

    /// Returns the number of 32-bit words per row (words-per-line).
    fn pixGetWpl(pix: *const c_void) -> i32;

    /// Returns a mutable pointer to the start of the pixel data array.
    ///
    /// The data is stored as rows of 32-bit words; each word covers 32/depth pixels.
    fn pixGetData(pix: *mut c_void) -> *mut u32;

    /// Deskews an image using a sweep-and-search algorithm.
    ///
    /// `redsearch` is the reduction factor used during the search; pass 0 for
    /// the Leptonica default (2x reduction). Returns a new deskewed Pix on
    /// success, or null on failure. The input Pix is **not** consumed.
    ///
    /// NOTE(review): this wrapper treats the input as 1 bpp; upstream
    /// Leptonica documents `pixDeskew` as accepting any depth and binarising
    /// internally — confirm against the linked Leptonica version.
    fn pixDeskew(pixs: *mut c_void, redsearch: i32) -> *mut c_void;

    /// Estimates the skew angle and confidence for a 1 bpp image.
    ///
    /// Writes the angle required to deskew (in degrees) into `*pangle` and a
    /// confidence value into `*pconf`. Returns 0 on success.
    ///
    /// NOTE(review): upstream Leptonica documents clockwise rotation as the
    /// positive direction and the confidence as a max/min score ratio (not
    /// bounded to 0–1) — confirm before relying on sign or range.
    fn pixFindSkew(pixs: *mut c_void, pangle: *mut f32, pconf: *mut f32) -> i32;

    /// Applies Otsu adaptive thresholding to produce a binarised Pix.
    ///
    /// `sx`/`sy` are the tile dimensions; `smoothx`/`smoothy` are half-widths
    /// for smoothing the threshold map; `scorefract` controls threshold acceptance
    /// (typical value: 0.1). `ppixth` (optional) receives the threshold image;
    /// `ppixd` receives the binarised output.
    fn pixOtsuAdaptiveThreshold(
        pixs: *mut c_void,
        sx: i32,
        sy: i32,
        smoothx: i32,
        smoothy: i32,
        scorefract: f32,
        ppixth: *mut *mut c_void,
        ppixd: *mut *mut c_void,
    ) -> i32;

    /// Normalises the background of an image using morphological operations.
    ///
    /// `pixim` is presumably an optional mask Pix (null when unused) — TODO
    /// confirm. `reduction` is the subsampling factor (e.g. 4), `size` is the
    /// morphological structuring-element size (e.g. 15), and `bgval` is the
    /// target background value (e.g. 200). Returns a new normalised Pix, or
    /// null on failure.
    ///
    /// NOTE(review): upstream Leptonica describes `size` as the side of the
    /// square structuring element (an odd number), not a half-size — confirm.
    fn pixBackgroundNormMorph(
        pixs: *mut c_void,
        pixim: *mut c_void,
        reduction: i32,
        size: i32,
        bgval: i32,
    ) -> *mut c_void;

    /// Applies unsharp masking to sharpen a grayscale or colour Pix.
    ///
    /// `halfwidth` is the half-size of the blur kernel; `fract` controls the
    /// sharpening strength (0.0–1.0 typical). Returns a new Pix, or null on failure.
    fn pixUnsharpMasking(pixs: *mut c_void, halfwidth: i32, fract: f32) -> *mut c_void;

    /// Scales a Pix by independent x and y factors using the best available method.
    ///
    /// Returns a new scaled Pix, or null on failure. The input Pix is **not** consumed.
    fn pixScale(pixs: *mut c_void, scalex: f32, scaley: f32) -> *mut c_void;

    /// Converts an RGB (32 bpp) Pix to 8 bpp grayscale.
    ///
    /// `rwt`, `gwt`, `bwt` are the red, green, and blue channel weights; pass
    /// 0.0 for all three to use Leptonica's default weights. Returns a new
    /// 8 bpp Pix, or null on failure.
    ///
    /// NOTE(review): upstream Leptonica's defaults are not equal weights
    /// (documented as roughly 0.3 / 0.5 / 0.2 for R / G / B) — confirm.
    fn pixConvertRGBToGray(pixs: *mut c_void, rwt: f32, gwt: f32, bwt: f32) -> *mut c_void;

    /// Creates a Leptonica BOX with the given coordinates.
    fn boxCreate(x: i32, y: i32, w: i32, h: i32) -> *mut c_void;

    /// Frees a Leptonica BOX.
    fn boxDestroy(pbox: *mut *mut c_void);

    /// Clips a rectangular region from a Pix.
    ///
    /// Returns a new Pix containing the clipped region, or null on failure.
    /// `pboxc` (optional) receives the actual clipped box; pass null to ignore.
    fn pixClipRectangle(pixs: *mut c_void, box_: *mut c_void, pboxc: *mut *mut c_void) -> *mut c_void;

    /// Counts connected components in a 1 bpp image.
    ///
    /// `connectivity` is 4 or 8. Writes the count to `*pcount`.
    /// Returns 0 on success.
    fn pixCountConnComp(pix: *mut c_void, connectivity: i32, pcount: *mut i32) -> i32;

    /// Retrieves the horizontal and vertical resolution (DPI) from a Pix.
    ///
    /// Writes the x-resolution into `*pxres` and y-resolution into `*pyres`.
    /// Returns 0 on success, non-zero on error.
    fn pixGetResolution(pix: *const c_void, pxres: *mut i32, pyres: *mut i32) -> i32;

}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Safe Pix wrapper
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/// Safe wrapper around a Leptonica `PIX *` image object.
///
/// Owns the underlying allocation and frees it in `Drop`. All methods that
/// return a new image allocate a fresh `Pix`; the receiver is never consumed.
///
/// # Thread safety
///
/// `Pix` is `Send` because Leptonica image objects are independent heap
/// allocations with no shared mutable state. Concurrent mutation from multiple
/// threads is **not** safe (no `Sync`).
#[cfg(any(feature = "build-tesseract", feature = "build-tesseract-wasm"))]
pub struct Pix {
    /// Raw Leptonica image pointer, kept opaque as `*mut c_void`.
    ptr: *mut c_void,
}
|
||||
|
||||
#[cfg(any(feature = "build-tesseract", feature = "build-tesseract-wasm"))]
impl std::fmt::Debug for Pix {
    /// Formats the wrapper as a struct named `Pix` with a single `ptr` field
    /// showing the raw pointer value; no pixel data is read.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let mut builder = f.debug_struct("Pix");
        builder.field("ptr", &self.ptr);
        builder.finish()
    }
}
|
||||
|
||||
// SAFETY: A Pix owns a uniquely heap-allocated Leptonica PIX. There is no
// interior mutability shared across thread boundaries, so transferring
// ownership to another thread is safe.
// `Sync` is deliberately not implemented: only ownership transfer is claimed
// to be safe here, not shared access from several threads.
#[cfg(any(feature = "build-tesseract", feature = "build-tesseract-wasm"))]
unsafe impl Send for Pix {}
|
||||
|
||||
#[cfg(any(feature = "build-tesseract", feature = "build-tesseract-wasm"))]
|
||||
impl Pix {
|
||||
// -----------------------------------------------------------------------
|
||||
// Construction
|
||||
// -----------------------------------------------------------------------
|
||||
|
||||
/// Creates a 32 bpp Leptonica Pix from a packed RGB byte slice.
|
||||
///
|
||||
/// `data` must contain exactly `width * height * 3` bytes in left-to-right,
|
||||
/// top-to-bottom, `R G B` interleaved order.
|
||||
///
|
||||
/// The DPI is set to 300 × 300 which is a sensible default for OCR input.
|
||||
///
|
||||
/// # Errors
|
||||
///
|
||||
/// Returns `TesseractError::InvalidImageData` if `data` length does not
|
||||
/// match `width * height * 3`, if either dimension is zero, or if
|
||||
/// Leptonica's `pixCreate` returns null.
|
||||
///
|
||||
/// # Examples
|
||||
///
|
||||
/// ```rust,no_run
|
||||
/// # use kreuzberg_tesseract::Pix;
|
||||
/// let rgb = vec![255u8; 4 * 4 * 3]; // 4×4 white image
|
||||
/// let pix = Pix::from_raw_rgb(&rgb, 4, 4).unwrap();
|
||||
/// assert_eq!(pix.width(), 4);
|
||||
/// assert_eq!(pix.height(), 4);
|
||||
/// assert_eq!(pix.depth(), 32);
|
||||
/// ```
|
||||
pub fn from_raw_rgb(data: &[u8], width: u32, height: u32) -> Result<Pix> {
|
||||
let expected = (width as usize)
|
||||
.checked_mul(height as usize)
|
||||
.and_then(|n| n.checked_mul(3))
|
||||
.ok_or(TesseractError::InvalidImageData)?;
|
||||
|
||||
if data.len() != expected || width == 0 || height == 0 {
|
||||
return Err(TesseractError::InvalidImageData);
|
||||
}
|
||||
|
||||
// SAFETY: pixCreate() allocates a new PIX with the requested dimensions.
|
||||
// It is safe because:
|
||||
// 1. width, height, and depth (32) are valid positive integers.
|
||||
// 2. pixCreate() documents that it returns null only on allocation
|
||||
// failure, which we check immediately below.
|
||||
let pix_ptr = unsafe { pixCreate(width as i32, height as i32, 32) };
|
||||
if pix_ptr.is_null() {
|
||||
return Err(TesseractError::NullPointerError);
|
||||
}
|
||||
|
||||
// SAFETY: pixGetData() returns a mutable pointer into the allocated pixel
|
||||
// buffer that is valid for the lifetime of the Pix. We own pix_ptr
|
||||
// exclusively at this point and have not exposed it to any other code.
|
||||
let data_ptr = unsafe { pixGetData(pix_ptr) };
|
||||
if data_ptr.is_null() {
|
||||
// Clean up before returning the error.
|
||||
// SAFETY: pix_ptr is a valid non-null allocation from pixCreate().
|
||||
// Passing &mut pix_ptr satisfies the double-pointer convention; after
|
||||
// this call pix_ptr is set to null by Leptonica.
|
||||
let mut ptr = pix_ptr;
|
||||
unsafe { pixDestroy(&mut ptr) };
|
||||
return Err(TesseractError::NullPointerError);
|
||||
}
|
||||
|
||||
// SAFETY: pixGetWpl() is a pure read of the Pix header that is always
|
||||
// valid for a correctly-allocated Pix.
|
||||
// For a 32 bpp image, each pixel occupies exactly one 32-bit word, so
|
||||
// wpl == width (no padding bytes). The loop below uses `row * wpl + col`
|
||||
// to index into the pixel data, which is within bounds because col < width <= wpl.
|
||||
let wpl = unsafe { pixGetWpl(pix_ptr) } as usize;
|
||||
|
||||
// Write RGB pixels into the Leptonica data buffer.
|
||||
//
|
||||
// Leptonica's 32 bpp pixel format stores each pixel as a native
|
||||
// 32-bit integer word with the logical layout (MSB→LSB): R G B A,
|
||||
// i.e. `(r << 24) | (g << 16) | (b << 8) | alpha`. This is the
|
||||
// same bit pattern regardless of host endianness — Leptonica treats
|
||||
// the data as an array of 32-bit integers and accesses individual
|
||||
// bytes via bit-shift, not via byte-addressed pointer arithmetic.
|
||||
//
|
||||
// Therefore we pack directly as `(r << 24) | (g << 16) | (b << 8) | 0xFF`
|
||||
// and write the resulting u32 without any byte-swapping. Calling
|
||||
// `pixEndianByteSwap` would invert the channel order, producing
|
||||
// A B G R instead of R G B A.
|
||||
for row in 0..(height as usize) {
|
||||
for col in 0..(width as usize) {
|
||||
let src = (row * width as usize + col) * 3;
|
||||
let r = data[src] as u32;
|
||||
let g = data[src + 1] as u32;
|
||||
let b = data[src + 2] as u32;
|
||||
// Pack channels as (MSB) R G B A (LSB) in the 32-bit integer.
|
||||
let word: u32 = (r << 24) | (g << 16) | (b << 8) | 0xFF;
|
||||
// SAFETY: data_ptr is a valid writable pointer into the Leptonica
|
||||
// pixel buffer. The offset `row * wpl + col` is within bounds because:
|
||||
// 1. wpl >= width (Leptonica pads rows to 32-bit word boundaries).
|
||||
// 2. row < height and col < width by loop invariants.
|
||||
unsafe {
|
||||
*data_ptr.add(row * wpl + col) = word;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Set a sensible default DPI for OCR processing.
|
||||
// SAFETY: pix_ptr is valid and non-null. pixSetResolution only writes
|
||||
// two integer fields in the Pix header.
|
||||
unsafe { pixSetResolution(pix_ptr, 300, 300) };
|
||||
|
||||
Ok(Pix { ptr: pix_ptr })
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------------
|
||||
// Image processing operations
|
||||
// -----------------------------------------------------------------------
|
||||
|
||||
/// Deskews this image, returning a new corrected Pix.
|
||||
///
|
||||
/// **Note:** `pixDeskew` requires a 1 bpp (binary) image. Call
|
||||
/// `to_grayscale()` followed by `adaptive_threshold()` before invoking
|
||||
/// this method on a colour or grayscale Pix.
|
||||
///
|
||||
/// # Errors
|
||||
///
|
||||
/// Returns `TesseractError::NullPointerError` if Leptonica returns null
|
||||
/// (typically because the input is not 1 bpp or the image is too small).
|
||||
///
|
||||
/// # Examples
|
||||
///
|
||||
/// ```rust,no_run
|
||||
/// # use kreuzberg_tesseract::Pix;
|
||||
/// # let rgb = vec![0u8; 100 * 100 * 3];
|
||||
/// # let pix = Pix::from_raw_rgb(&rgb, 100, 100).unwrap();
|
||||
/// let gray = pix.to_grayscale().unwrap();
|
||||
/// let binary = gray.adaptive_threshold(32, 32).unwrap();
|
||||
/// let deskewed = binary.deskew().unwrap();
|
||||
/// ```
|
||||
pub fn deskew(&self) -> Result<Pix> {
|
||||
// SAFETY: self.ptr is a valid non-null Pix we own. pixDeskew() does
|
||||
// not take ownership; it creates and returns a new Pix allocation.
|
||||
// We check for null to handle the case where the operation fails
|
||||
// (e.g. input is not 1 bpp).
|
||||
let result = unsafe { pixDeskew(self.ptr, 0) };
|
||||
if result.is_null() {
|
||||
Err(TesseractError::NullPointerError)
|
||||
} else {
|
||||
Ok(Pix { ptr: result })
|
||||
}
|
||||
}
|
||||
|
||||
/// Estimates the skew angle (degrees) and confidence (0–1) for this image.
|
||||
///
|
||||
/// A positive angle indicates counter-clockwise skew. Confidence near 1.0
|
||||
/// means a clear dominant skew direction was found.
|
||||
///
|
||||
/// **Note:** Like `deskew`, this operates on 1 bpp images.
|
||||
///
|
||||
/// # Errors
|
||||
///
|
||||
/// Returns `TesseractError::OcrError` if `pixFindSkew` returns a non-zero
|
||||
/// status (e.g. insufficient contrast or wrong bit depth).
|
||||
///
|
||||
/// # Examples
|
||||
///
|
||||
/// ```rust,no_run
|
||||
/// # use kreuzberg_tesseract::Pix;
|
||||
/// # let rgb = vec![0u8; 100 * 100 * 3];
|
||||
/// # let pix = Pix::from_raw_rgb(&rgb, 100, 100).unwrap();
|
||||
/// let gray = pix.to_grayscale().unwrap();
|
||||
/// let binary = gray.adaptive_threshold(32, 32).unwrap();
|
||||
/// let (angle, confidence) = binary.find_skew().unwrap();
|
||||
/// println!("Skew: {angle:.2}° (confidence {confidence:.2})");
|
||||
/// ```
|
||||
pub fn find_skew(&self) -> Result<(f32, f32)> {
|
||||
let mut angle: f32 = 0.0;
|
||||
let mut conf: f32 = 0.0;
|
||||
// SAFETY: self.ptr is valid and non-null. We pass pointers to local
|
||||
// stack-allocated f32 values, which are valid write targets for the
|
||||
// duration of this call. pixFindSkew() writes into them and returns
|
||||
// an integer status code.
|
||||
let status = unsafe { pixFindSkew(self.ptr, &mut angle, &mut conf) };
|
||||
if status != 0 {
|
||||
Err(TesseractError::OcrError)
|
||||
} else {
|
||||
Ok((angle, conf))
|
||||
}
|
||||
}
|
||||
|
||||
/// Binarises this image using Otsu adaptive thresholding.
|
||||
///
|
||||
/// `tile_width` and `tile_height` control the size of the local regions
|
||||
/// used to compute the threshold. Values around 16–64 work well for typical
|
||||
/// document images; smaller tiles follow local contrast more closely.
|
||||
///
|
||||
/// # Errors
|
||||
///
|
||||
/// Returns `TesseractError::NullPointerError` if Leptonica returns null, or
|
||||
/// `TesseractError::OcrError` if `pixOtsuAdaptiveThreshold` returns a
|
||||
/// non-zero status.
|
||||
///
|
||||
/// # Examples
|
||||
///
|
||||
/// ```rust,no_run
|
||||
/// # use kreuzberg_tesseract::Pix;
|
||||
/// # let rgb = vec![128u8; 64 * 64 * 3];
|
||||
/// # let pix = Pix::from_raw_rgb(&rgb, 64, 64).unwrap();
|
||||
/// let gray = pix.to_grayscale().unwrap();
|
||||
/// let binary = gray.adaptive_threshold(32, 32).unwrap();
|
||||
/// assert_eq!(binary.depth(), 1);
|
||||
/// ```
|
||||
pub fn adaptive_threshold(&self, tile_width: i32, tile_height: i32) -> Result<Pix> {
|
||||
let mut result: *mut c_void = std::ptr::null_mut();
|
||||
// SAFETY: self.ptr is a valid non-null Pix. We pass null for ppixth
|
||||
// because we do not need the intermediate threshold image. result is a
|
||||
// local pointer that will be written by pixOtsuAdaptiveThreshold(); we
|
||||
// check it for null before wrapping in a Pix.
|
||||
let status = unsafe {
|
||||
pixOtsuAdaptiveThreshold(
|
||||
self.ptr,
|
||||
tile_width,
|
||||
tile_height,
|
||||
0, // smoothx: no smoothing
|
||||
0, // smoothy: no smoothing
|
||||
0.1, // scorefract: Leptonica-recommended default
|
||||
std::ptr::null_mut(), // ppixth: we don't need the threshold map
|
||||
&mut result,
|
||||
)
|
||||
};
|
||||
if status != 0 {
|
||||
return Err(TesseractError::OcrError);
|
||||
}
|
||||
if result.is_null() {
|
||||
return Err(TesseractError::NullPointerError);
|
||||
}
|
||||
Ok(Pix { ptr: result })
|
||||
}
|
||||
|
||||
/// Returns the horizontal and vertical resolution (DPI) of this image.
|
||||
///
|
||||
/// # Errors
|
||||
///
|
||||
/// Returns `TesseractError::OcrError` if `pixGetResolution` fails.
|
||||
pub fn get_resolution(&self) -> Result<(i32, i32)> {
|
||||
let mut xres: i32 = 0;
|
||||
let mut yres: i32 = 0;
|
||||
// SAFETY: self.ptr is a valid non-null Pix. xres and yres are valid
|
||||
// stack-allocated i32 values. pixGetResolution reads the Pix header.
|
||||
let status = unsafe { pixGetResolution(self.ptr, &mut xres, &mut yres) };
|
||||
if status != 0 {
|
||||
Err(TesseractError::OcrError)
|
||||
} else {
|
||||
Ok((xres, yres))
|
||||
}
|
||||
}
|
||||
|
||||
/// Sets the horizontal and vertical resolution (DPI) on this image.
|
||||
///
|
||||
/// # Errors
|
||||
///
|
||||
/// Returns `TesseractError::OcrError` if `pixSetResolution` fails.
|
||||
pub fn set_resolution(&mut self, xres: i32, yres: i32) -> Result<()> {
|
||||
// SAFETY: self.ptr is a valid non-null Pix. pixSetResolution only
|
||||
// writes two integer fields in the Pix header.
|
||||
let status = unsafe { pixSetResolution(self.ptr, xres, yres) };
|
||||
if status != 0 {
|
||||
Err(TesseractError::OcrError)
|
||||
} else {
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
/// Ensures the image has a valid (non-zero) DPI resolution.
|
||||
///
|
||||
/// If both x and y resolution are zero, sets them to 72 DPI as a
|
||||
/// safe fallback. This prevents Leptonica operations that depend on
|
||||
/// resolution metadata from producing incorrect results.
|
||||
fn ensure_valid_resolution(&self) {
|
||||
if let Ok((xres, yres)) = self.get_resolution()
|
||||
&& (xres == 0 || yres == 0)
|
||||
{
|
||||
// SAFETY: self.ptr is valid. We set a safe default DPI.
|
||||
unsafe { pixSetResolution(self.ptr, 72, 72) };
|
||||
}
|
||||
}
|
||||
|
||||
/// Normalises the background of this image using morphological operations.
|
||||
///
|
||||
/// Useful as a preprocessing step when the document has uneven illumination
|
||||
/// or a non-white background. Returns a new normalised Pix.
|
||||
///
|
||||
/// # Errors
|
||||
///
|
||||
/// Returns `TesseractError::NullPointerError` if `pixBackgroundNormMorph`
|
||||
/// returns null.
|
||||
///
|
||||
/// # Examples
|
||||
///
|
||||
/// ```rust,no_run
|
||||
/// # use kreuzberg_tesseract::Pix;
|
||||
/// # let rgb = vec![200u8; 100 * 100 * 3];
|
||||
/// # let pix = Pix::from_raw_rgb(&rgb, 100, 100).unwrap();
|
||||
/// let gray = pix.to_grayscale().unwrap();
|
||||
/// let normalised = gray.background_normalize().unwrap();
|
||||
/// ```
|
||||
pub fn background_normalize(&self) -> Result<Pix> {
|
||||
self.ensure_valid_resolution();
|
||||
// SAFETY: self.ptr is a valid non-null Pix. We pass null for pixim
|
||||
// (no mask image). pixBackgroundNormMorph() returns a newly allocated
|
||||
// Pix or null on failure.
|
||||
let result = unsafe {
|
||||
pixBackgroundNormMorph(
|
||||
self.ptr,
|
||||
std::ptr::null_mut(), // pixim: no mask
|
||||
4, // reduction: 4x subsampling
|
||||
15, // size: morphological SE half-size
|
||||
200, // bgval: target background value
|
||||
)
|
||||
};
|
||||
if result.is_null() {
|
||||
Err(TesseractError::NullPointerError)
|
||||
} else {
|
||||
Ok(Pix { ptr: result })
|
||||
}
|
||||
}
|
||||
|
||||
/// Applies unsharp masking to sharpen this image.
|
||||
///
|
||||
/// `halfwidth` is the half-size of the blur kernel (e.g. 1–5).
|
||||
/// `fract` is the sharpening fraction in the range 0.0–1.0; values
|
||||
/// around 0.3–0.5 produce visible sharpening without artefacts.
|
||||
///
|
||||
/// # Errors
|
||||
///
|
||||
/// Returns `TesseractError::NullPointerError` if `pixUnsharpMasking`
|
||||
/// returns null.
|
||||
///
|
||||
/// # Examples
|
||||
///
|
||||
/// ```rust,no_run
|
||||
/// # use kreuzberg_tesseract::Pix;
|
||||
/// # let rgb = vec![128u8; 64 * 64 * 3];
|
||||
/// # let pix = Pix::from_raw_rgb(&rgb, 64, 64).unwrap();
|
||||
/// let sharpened = pix.unsharp_mask(2, 0.4).unwrap();
|
||||
/// ```
|
||||
pub fn unsharp_mask(&self, halfwidth: i32, fract: f32) -> Result<Pix> {
|
||||
self.ensure_valid_resolution();
|
||||
// SAFETY: self.ptr is valid and non-null. pixUnsharpMasking() returns
|
||||
// a new Pix without modifying or taking ownership of the source.
|
||||
let result = unsafe { pixUnsharpMasking(self.ptr, halfwidth, fract) };
|
||||
if result.is_null() {
|
||||
Err(TesseractError::NullPointerError)
|
||||
} else {
|
||||
Ok(Pix { ptr: result })
|
||||
}
|
||||
}
|
||||
|
||||
/// Scales this image by independent x and y factors.
|
||||
///
|
||||
/// Leptonica automatically chooses the best scaling algorithm based on
|
||||
/// the scale factors and bit depth (area mapping for downscaling,
|
||||
/// linear interpolation for upscaling).
|
||||
///
|
||||
/// # Errors
|
||||
///
|
||||
/// Returns `TesseractError::NullPointerError` if `pixScale` returns null.
|
||||
///
|
||||
/// # Examples
|
||||
///
|
||||
/// ```rust,no_run
|
||||
/// # use kreuzberg_tesseract::Pix;
|
||||
/// # let rgb = vec![255u8; 40 * 40 * 3];
|
||||
/// # let pix = Pix::from_raw_rgb(&rgb, 40, 40).unwrap();
|
||||
/// let upscaled = pix.scale(2.0, 2.0).unwrap();
|
||||
/// assert_eq!(upscaled.width(), 80);
|
||||
/// assert_eq!(upscaled.height(), 80);
|
||||
/// ```
|
||||
pub fn scale(&self, sx: f32, sy: f32) -> Result<Pix> {
|
||||
// SAFETY: self.ptr is valid and non-null. pixScale() creates a new Pix
|
||||
// and does not modify the source.
|
||||
let result = unsafe { pixScale(self.ptr, sx, sy) };
|
||||
if result.is_null() {
|
||||
Err(TesseractError::NullPointerError)
|
||||
} else {
|
||||
Ok(Pix { ptr: result })
|
||||
}
|
||||
}
|
||||
|
||||
/// Clips a rectangular sub-region from this image.
|
||||
///
|
||||
/// Returns a new Pix containing only the pixels within the given rectangle.
|
||||
/// Coordinates are in pixel space: (x, y) is the top-left corner.
|
||||
///
|
||||
/// # Errors
|
||||
///
|
||||
/// Returns `TesseractError::NullPointerError` if the crop fails.
|
||||
pub fn clip_rectangle(&self, x: i32, y: i32, w: i32, h: i32) -> Result<Pix> {
|
||||
// SAFETY: boxCreate allocates a new BOX on the heap.
|
||||
let box_ = unsafe { boxCreate(x, y, w, h) };
|
||||
if box_.is_null() {
|
||||
return Err(TesseractError::NullPointerError);
|
||||
}
|
||||
// SAFETY: pixClipRectangle returns a new Pix clipped to the BOX region.
|
||||
// We pass null for pboxc (we don't need the clipped box coordinates back).
|
||||
let result = unsafe { pixClipRectangle(self.ptr, box_, std::ptr::null_mut()) };
|
||||
// SAFETY: Free the BOX we allocated.
|
||||
let mut box_mut = box_;
|
||||
unsafe { boxDestroy(&mut box_mut) };
|
||||
if result.is_null() {
|
||||
Err(TesseractError::NullPointerError)
|
||||
} else {
|
||||
Ok(Pix { ptr: result })
|
||||
}
|
||||
}
|
||||
|
||||
/// Counts connected components in a 1 bpp (binary) image.
|
||||
///
|
||||
/// `connectivity` should be 4 or 8.
|
||||
///
|
||||
/// # Errors
|
||||
///
|
||||
/// Returns `TesseractError::OcrError` if `pixCountConnComp` fails
|
||||
/// (e.g., wrong bit depth — image must be 1 bpp).
|
||||
pub fn count_connected_components(&self, connectivity: i32) -> Result<i32> {
|
||||
let mut count: i32 = 0;
|
||||
// SAFETY: self.ptr is a valid Pix. count is a valid stack local.
|
||||
let status = unsafe { pixCountConnComp(self.ptr, connectivity, &mut count) };
|
||||
if status != 0 {
|
||||
Err(TesseractError::OcrError)
|
||||
} else {
|
||||
Ok(count)
|
||||
}
|
||||
}
|
||||
|
||||
/// Converts this 32 bpp RGB image to an 8 bpp grayscale Pix.
|
||||
///
|
||||
/// Passing 0.0 for all weight parameters instructs Leptonica to use its
|
||||
/// default perceptual weights (approx. 0.299 R, 0.587 G, 0.114 B).
|
||||
///
|
||||
/// # Errors
|
||||
///
|
||||
/// Returns `TesseractError::NullPointerError` if `pixConvertRGBToGray`
|
||||
/// returns null (e.g. the source is not 32 bpp).
|
||||
///
|
||||
/// # Examples
|
||||
///
|
||||
/// ```rust,no_run
|
||||
/// # use kreuzberg_tesseract::Pix;
|
||||
/// # let rgb = vec![100u8, 150u8, 200u8].repeat(10 * 10);
|
||||
/// # let pix = Pix::from_raw_rgb(&rgb, 10, 10).unwrap();
|
||||
/// let gray = pix.to_grayscale().unwrap();
|
||||
/// assert_eq!(gray.depth(), 8);
|
||||
/// ```
|
||||
pub fn to_grayscale(&self) -> Result<Pix> {
|
||||
self.ensure_valid_resolution();
|
||||
// SAFETY: self.ptr is valid and non-null. pixConvertRGBToGray() returns
|
||||
// a new 8 bpp Pix; the source is not modified.
|
||||
let result = unsafe { pixConvertRGBToGray(self.ptr, 0.0, 0.0, 0.0) };
|
||||
if result.is_null() {
|
||||
Err(TesseractError::NullPointerError)
|
||||
} else {
|
||||
Ok(Pix { ptr: result })
|
||||
}
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------------
|
||||
// Accessors
|
||||
// -----------------------------------------------------------------------
|
||||
|
||||
/// Returns the raw Leptonica `PIX *` pointer.
|
||||
///
|
||||
/// Intended for passing this image to `TesseractAPI::set_image_2`.
|
||||
///
|
||||
/// # Safety
|
||||
///
|
||||
/// The caller must ensure the `Pix` outlives any use of the returned
|
||||
/// pointer. `TessBaseAPISetImage2` **borrows** the pointer — it does not
|
||||
/// take ownership — so the `Pix` must remain alive until after
|
||||
/// `TessBaseAPIRecognize` (or any other Tesseract call that consumes the
|
||||
/// image data) has completed. Dropping the `Pix` while Tesseract holds
|
||||
/// the pointer will result in a use-after-free.
|
||||
///
|
||||
/// The caller must **not** free the returned pointer; `Pix::drop` is
|
||||
/// solely responsible for deallocation via `pixDestroy`.
|
||||
pub fn as_ptr(&self) -> *mut c_void {
    // Hands out the raw pointer without transferring ownership; this
    // wrapper still destroys the Pix when it is dropped.
    self.ptr
}
|
||||
|
||||
/// Returns the width of the image in pixels.
|
||||
///
|
||||
/// # Examples
|
||||
///
|
||||
/// ```rust,no_run
|
||||
/// # use kreuzberg_tesseract::Pix;
|
||||
/// # let pix = Pix::from_raw_rgb(&vec![0u8; 8 * 6 * 3], 8, 6).unwrap();
|
||||
/// assert_eq!(pix.width(), 8);
|
||||
/// ```
|
||||
pub fn width(&self) -> i32 {
    // SAFETY: `self.ptr` is a valid non-null Pix. `pixGetWidth()` is a pure
    // read of the Pix header struct; it does not mutate any state.
    unsafe { pixGetWidth(self.ptr) }
}
|
||||
|
||||
/// Returns the height of the image in pixels.
|
||||
///
|
||||
/// # Examples
|
||||
///
|
||||
/// ```rust,no_run
|
||||
/// # use kreuzberg_tesseract::Pix;
|
||||
/// # let pix = Pix::from_raw_rgb(&vec![0u8; 8 * 6 * 3], 8, 6).unwrap();
|
||||
/// assert_eq!(pix.height(), 6);
|
||||
/// ```
|
||||
pub fn height(&self) -> i32 {
    // SAFETY: `self.ptr` is a valid non-null Pix. `pixGetHeight()` is a pure
    // read of the Pix header struct.
    unsafe { pixGetHeight(self.ptr) }
}
|
||||
|
||||
/// Returns the bit depth of the image (1, 8, or 32 for this module's usage).
|
||||
///
|
||||
/// # Examples
|
||||
///
|
||||
/// ```rust,no_run
|
||||
/// # use kreuzberg_tesseract::Pix;
|
||||
/// # let pix = Pix::from_raw_rgb(&vec![0u8; 4 * 4 * 3], 4, 4).unwrap();
|
||||
/// assert_eq!(pix.depth(), 32);
|
||||
/// ```
|
||||
pub fn depth(&self) -> i32 {
    // SAFETY: `self.ptr` is a valid non-null Pix. `pixGetDepth()` is a pure
    // read of the Pix header struct.
    unsafe { pixGetDepth(self.ptr) }
}
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Drop implementation
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
#[cfg(any(feature = "build-tesseract", feature = "build-tesseract-wasm"))]
impl Drop for Pix {
    /// Releases the underlying Leptonica PIX, if there is one.
    fn drop(&mut self) {
        // Nothing to free for a wrapper that holds no allocation.
        if self.ptr.is_null() {
            return;
        }
        // SAFETY: `self.ptr` is a non-null Leptonica PIX that we allocated
        // and own exclusively. `pixDestroy()` takes a double pointer, frees
        // the image and sets `*ppix` to null, so `self.ptr` is null
        // afterwards and a second destroy could not double-free.
        unsafe { pixDestroy(&mut self.ptr) };
    }
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Tests
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
#[cfg(test)]
#[cfg(any(feature = "build-tesseract", feature = "build-tesseract-wasm"))]
mod tests {
    use super::*;

    /// Builds a `width` x `height` RGB Pix whose every channel byte is `fill`.
    fn make_rgb_pix(width: u32, height: u32, fill: u8) -> Pix {
        let byte_count = (width * height * 3) as usize;
        let data = vec![fill; byte_count];
        Pix::from_raw_rgb(&data, width, height).expect("from_raw_rgb failed")
    }

    #[test]
    fn test_from_raw_rgb_dimensions() {
        let pix = make_rgb_pix(16, 8, 200);
        assert_eq!((pix.width(), pix.height(), pix.depth()), (16, 8, 32));
    }

    #[test]
    fn test_from_raw_rgb_wrong_length() {
        // 10 bytes is too short for a 4x4 RGB image (needs 48).
        let data = vec![0u8; 10];
        let err = Pix::from_raw_rgb(&data, 4, 4).unwrap_err();
        assert!(matches!(err, TesseractError::InvalidImageData));
    }

    #[test]
    fn test_from_raw_rgb_zero_dimensions() {
        // A zero width and a zero height must both be rejected.
        for (width, height) in [(0, 4), (4, 0)] {
            let err = Pix::from_raw_rgb(&[], width, height).unwrap_err();
            assert!(matches!(err, TesseractError::InvalidImageData));
        }
    }

    #[test]
    fn test_as_ptr_is_non_null() {
        let pix = make_rgb_pix(8, 8, 128);
        assert!(!pix.as_ptr().is_null());
    }

    #[test]
    fn test_to_grayscale() {
        let gray = make_rgb_pix(32, 32, 150)
            .to_grayscale()
            .expect("to_grayscale failed");
        assert_eq!((gray.width(), gray.height(), gray.depth()), (32, 32, 8));
    }

    #[test]
    fn test_scale_up() {
        let scaled = make_rgb_pix(20, 10, 100)
            .scale(2.0, 2.0)
            .expect("scale failed");
        assert_eq!((scaled.width(), scaled.height()), (40, 20));
    }

    #[test]
    fn test_unsharp_mask_returns_same_dimensions() {
        let sharpened = make_rgb_pix(32, 32, 200)
            .unsharp_mask(2, 0.4)
            .expect("unsharp_mask failed");
        assert_eq!((sharpened.width(), sharpened.height()), (32, 32));
    }

    #[test]
    fn test_adaptive_threshold_produces_1bpp() {
        let gray = make_rgb_pix(64, 64, 180)
            .to_grayscale()
            .expect("to_grayscale failed");
        let binary = gray
            .adaptive_threshold(32, 32)
            .expect("adaptive_threshold failed");
        assert_eq!(binary.depth(), 1);
    }
}
|
||||
218
crates/kreuzberg-tesseract/src/lib.rs
Normal file
218
crates/kreuzberg-tesseract/src/lib.rs
Normal file
@@ -0,0 +1,218 @@
|
||||
#![cfg_attr(
|
||||
not(any(feature = "build-tesseract", feature = "build-tesseract-wasm")),
|
||||
allow(unused_variables, dead_code)
|
||||
)]
|
||||
#![allow(clippy::arc_with_non_send_sync)]
|
||||
#![allow(clippy::missing_transmute_annotations)]
|
||||
#![allow(clippy::type_complexity)]
|
||||
#![allow(clippy::new_without_default)]
|
||||
#![allow(clippy::not_unsafe_ptr_arg_deref)]
|
||||
#![allow(clippy::cmp_null)]
|
||||
|
||||
//! # kreuzberg-tesseract
|
||||
//!
|
||||
//! `kreuzberg-tesseract` provides safe Rust bindings for Tesseract OCR with built-in compilation
|
||||
//! of Tesseract and Leptonica libraries. This crate aims to make OCR functionality
|
||||
//! easily accessible in Rust projects while handling the complexity of interfacing
|
||||
//! with the underlying C++ libraries.
|
||||
//!
|
||||
//! ## Usage
|
||||
//!
|
||||
//! Here's a basic example of how to use `kreuzberg-tesseract`:
|
||||
//!
|
||||
//! ```rust
|
||||
//! use std::path::PathBuf;
|
||||
//! use std::error::Error;
|
||||
//! use kreuzberg_tesseract::TesseractAPI;
|
||||
//!
|
||||
//! fn get_default_tessdata_dir() -> PathBuf {
|
||||
//! if cfg!(target_os = "macos") {
|
||||
//! let home_dir = std::env::var("HOME").expect("HOME environment variable not set");
|
||||
//! PathBuf::from(home_dir)
|
||||
//! .join("Library")
|
||||
//! .join("Application Support")
|
||||
//! .join("kreuzberg-tesseract")
|
||||
//! .join("tessdata")
|
||||
//! } else if cfg!(target_os = "linux") {
|
||||
//! let home_dir = std::env::var("HOME").expect("HOME environment variable not set");
|
||||
//! PathBuf::from(home_dir)
|
||||
//! .join(".kreuzberg-tesseract")
|
||||
//! .join("tessdata")
|
||||
//! } else if cfg!(target_os = "windows") {
|
||||
//! PathBuf::from(std::env::var("APPDATA").expect("APPDATA environment variable not set"))
|
||||
//! .join("kreuzberg-tesseract")
|
||||
//! .join("tessdata")
|
||||
//! } else {
|
||||
//! panic!("Unsupported operating system");
|
||||
//! }
|
||||
//! }
|
||||
//!
|
||||
//! fn get_tessdata_dir() -> PathBuf {
|
||||
//! match std::env::var("TESSDATA_PREFIX") {
|
||||
//! Ok(dir) => {
|
||||
//! let path = PathBuf::from(dir);
|
||||
//! let path = if path.ends_with("tessdata") { path } else { path.join("tessdata") };
|
||||
//! println!("Using TESSDATA_PREFIX directory: {:?}", path);
|
||||
//! path
|
||||
//! }
|
||||
//! Err(_) => {
|
||||
//! let default_dir = get_default_tessdata_dir();
|
||||
//! println!(
|
||||
//! "TESSDATA_PREFIX not set, using default directory: {:?}",
|
||||
//! default_dir
|
||||
//! );
|
||||
//! default_dir
|
||||
//! }
|
||||
//! }
|
||||
//! }
|
||||
//!
|
||||
//! fn main() -> Result<(), Box<dyn Error>> {
|
||||
//! let api = TesseractAPI::new()?;
|
||||
//!
|
||||
//! // Get tessdata directory (uses default location or TESSDATA_PREFIX if set)
|
||||
//! let tessdata_dir = get_tessdata_dir();
|
||||
//! api.init(tessdata_dir.to_str().unwrap(), "eng")?;
|
||||
//!
|
||||
//! let width = 24;
|
||||
//! let height = 24;
|
||||
//! let bytes_per_pixel = 1;
|
||||
//! let bytes_per_line = width * bytes_per_pixel;
|
||||
//!
|
||||
//! // Initialize image data with all white pixels
|
||||
//! let mut image_data = vec![255u8; width * height];
|
||||
//!
|
||||
//! // Draw number 9 with clearer distinction
|
||||
//! for y in 4..19 {
|
||||
//! for x in 7..17 {
|
||||
//! // Top bar
|
||||
//! if y == 4 && x >= 8 && x <= 15 {
|
||||
//! image_data[y * width + x] = 0;
|
||||
//! }
|
||||
//! // Top curve left side
|
||||
//! if y >= 4 && y <= 10 && x == 7 {
|
||||
//! image_data[y * width + x] = 0;
|
||||
//! }
|
||||
//! // Top curve right side
|
||||
//! if y >= 4 && y <= 11 && x == 16 {
|
||||
//! image_data[y * width + x] = 0;
|
||||
//! }
|
||||
//! // Middle bar
|
||||
//! if y == 11 && x >= 8 && x <= 15 {
|
||||
//! image_data[y * width + x] = 0;
|
||||
//! }
|
||||
//! // Bottom right vertical line
|
||||
//! if y >= 11 && y <= 18 && x == 16 {
|
||||
//! image_data[y * width + x] = 0;
|
||||
//! }
|
||||
//! // Bottom bar
|
||||
//! if y == 18 && x >= 8 && x <= 15 {
|
||||
//! image_data[y * width + x] = 0;
|
||||
//! }
|
||||
//! }
|
||||
//! }
|
||||
//!
|
||||
//! // Set the image data
|
||||
//! api.set_image(&image_data, width.try_into().unwrap(), height.try_into().unwrap(), bytes_per_pixel.try_into().unwrap(), bytes_per_line.try_into().unwrap())?;
|
||||
//!
|
||||
//! // Set whitelist for digits only
|
||||
//! api.set_variable("tessedit_char_whitelist", "0123456789")?;
|
||||
//!
|
||||
//! // Set PSM mode to single character
|
||||
//! api.set_variable("tessedit_pageseg_mode", "10")?;
|
||||
//!
|
||||
//! // Get the recognized text
|
||||
//! let text = api.get_utf8_text()?;
|
||||
//! println!("Recognized text: {}", text.trim());
|
||||
//!
|
||||
//! Ok(())
|
||||
//! }
|
||||
//! ```
|
||||
/// Declares a list of foreign functions with a target-dependent ABI.
///
/// On native targets the functions are declared `extern "C-unwind"` (to catch
/// C++ exceptions from Tesseract/Leptonica); on `wasm32` they are declared
/// `extern "C"` (where the LLVM backend does not support `cleanupret` / C++
/// unwinding). Attributes, visibility, parameters and the optional return
/// type of each declaration are forwarded unchanged to both variants.
macro_rules! ffi_extern {
    (
        $(
            $(#[$attr:meta])*
            $vis:vis fn $func:ident($($param:ident : $param_ty:ty),* $(,)?) $(-> $ret_ty:ty)?;
        )*
    ) => {
        // Native targets: allow unwinding across the FFI boundary.
        #[cfg(not(target_arch = "wasm32"))]
        unsafe extern "C-unwind" {
            $(
                $(#[$attr])*
                $vis fn $func($($param : $param_ty),*) $(-> $ret_ty)?;
            )*
        }

        // WASM: plain C ABI, no unwinding support.
        #[cfg(target_arch = "wasm32")]
        unsafe extern "C" {
            $(
                $(#[$attr])*
                $vis fn $func($($param : $param_ty),*) $(-> $ret_ty)?;
            )*
        }
    };
}
|
||||
|
||||
pub use error::{Result, TesseractError};
|
||||
mod error;
|
||||
|
||||
// WASM: Override __cxa_atexit to be a no-op. WASI SDK's __cxa_atexit calls calloc during
|
||||
// C++ static initialization, which crashes because dlmalloc's heap isn't properly set up
|
||||
// for wasm32-unknown-unknown. Since WASM modules never exit normally, atexit handlers
|
||||
// are unnecessary.
|
||||
#[cfg(all(target_arch = "wasm32", not(target_os = "emscripten")))]
mod wasm_compat {
    use core::ffi::c_void;

    /// No-op stand-in for `__cxa_atexit`: accepts the registration arguments,
    /// ignores them, and reports success.
    #[unsafe(no_mangle)]
    pub unsafe extern "C" fn __cxa_atexit(
        _func: Option<unsafe extern "C" fn(*mut c_void)>,
        _arg: *mut c_void,
        _dso_handle: *mut c_void,
    ) -> i32 {
        // Zero tells the caller the handler was accepted, although nothing
        // is actually registered.
        0
    }
}
|
||||
mod page_iterator;
|
||||
pub use page_iterator::{BlockInfo, PageIterator, ParaInfo};
|
||||
mod result_iterator;
|
||||
pub use result_iterator::{FontAttributes, ResultIterator, WordData};
|
||||
mod choice_iterator;
|
||||
pub use choice_iterator::ChoiceIterator;
|
||||
mod monitor;
|
||||
pub use monitor::TessMonitor;
|
||||
mod result_renderer;
|
||||
pub use result_renderer::TessResultRenderer;
|
||||
mod mutable_iterator;
|
||||
pub use mutable_iterator::MutableIterator;
|
||||
mod enums;
|
||||
pub use enums::{
|
||||
TessOrientation, TessPageIteratorLevel, TessPageSegMode, TessParagraphJustification, TessPolyBlockType,
|
||||
TessTextlineOrder, TessWritingDirection,
|
||||
};
|
||||
mod api;
|
||||
pub use api::{BoundingBoxArray, TesseractAPI};
|
||||
pub mod leptonica;
|
||||
#[cfg(any(feature = "build-tesseract", feature = "build-tesseract-wasm"))]
|
||||
pub use leptonica::Pix;
|
||||
|
||||
/// Returns the compile-time-bundled English `eng.traineddata` blob when the
|
||||
/// `bundle-tessdata-eng` feature is enabled, otherwise `None`.
|
||||
///
|
||||
/// The bundled data is the `tessdata_fast` variant (~4 MB) downloaded by
|
||||
/// `build.rs` to `TESSDATA_PREFIX_BUNDLED/tessdata/eng.traineddata`. Embedding
|
||||
/// it lets WASM builds drive Tesseract OCR without filesystem access or
|
||||
/// runtime fetches.
|
||||
#[cfg(feature = "bundle-tessdata-eng")]
pub fn bundled_eng_traineddata() -> Option<&'static [u8]> {
    // Embedded at compile time from the directory that build.rs exports.
    const ENG_TRAINEDDATA: &[u8] = include_bytes!(concat!(
        env!("TESSDATA_PREFIX_BUNDLED"),
        "/tessdata/eng.traineddata"
    ));
    Some(ENG_TRAINEDDATA)
}

/// Returns `None` when the `bundle-tessdata-eng` feature is disabled.
#[cfg(not(feature = "bundle-tessdata-eng"))]
pub fn bundled_eng_traineddata() -> Option<&'static [u8]> {
    // No blob is compiled in without the feature.
    None
}
|
||||
68
crates/kreuzberg-tesseract/src/monitor.rs
Normal file
68
crates/kreuzberg-tesseract/src/monitor.rs
Normal file
@@ -0,0 +1,68 @@
|
||||
use crate::error::{Result, TesseractError};
|
||||
use std::os::raw::{c_int, c_void};
|
||||
use std::sync::{Arc, Mutex};
|
||||
|
||||
pub struct TessMonitor {
|
||||
handle: Arc<Mutex<*mut c_void>>,
|
||||
}
|
||||
|
||||
unsafe impl Send for TessMonitor {}
|
||||
unsafe impl Sync for TessMonitor {}
|
||||
|
||||
impl TessMonitor {
|
||||
/// Creates a new instance of the TessMonitor.
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// Returns the new instance of the TessMonitor.
|
||||
pub fn new() -> Self {
|
||||
let handle = unsafe { TessMonitorCreate() };
|
||||
TessMonitor {
|
||||
handle: Arc::new(Mutex::new(handle)),
|
||||
}
|
||||
}
|
||||
|
||||
/// Sets the deadline for the monitor.
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// * `deadline` - Deadline in milliseconds.
|
||||
///
|
||||
/// # Errors
|
||||
///
|
||||
/// Returns a `TesseractError::MutexLockError` if the mutex lock fails.
|
||||
pub fn set_deadline(&self, deadline: i32) -> Result<()> {
|
||||
let handle = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
|
||||
unsafe { TessMonitorSetDeadlineMSecs(*handle, deadline) };
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Gets the progress of the monitor.
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// Returns the progress as an `i32` if successful, otherwise returns an error.
|
||||
///
|
||||
/// # Errors
|
||||
///
|
||||
/// Returns a `TesseractError::MutexLockError` if the mutex lock fails.
|
||||
pub fn get_progress(&self) -> Result<i32> {
|
||||
let handle = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
|
||||
Ok(unsafe { TessMonitorGetProgress(*handle) })
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for TessMonitor {
|
||||
fn drop(&mut self) {
|
||||
if let Ok(handle) = self.handle.lock() {
|
||||
unsafe { TessMonitorDelete(*handle) };
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Raw bindings to the monitor functions of the Tesseract C API.
// `monitor` is the pointer previously returned by `TessMonitorCreate`.
ffi_extern! {
    // Allocation / release pair used by `TessMonitor::new` and its `Drop` impl.
    pub fn TessMonitorCreate() -> *mut c_void;
    pub fn TessMonitorDelete(monitor: *mut c_void);
    // Accessors wrapped by `set_deadline` and `get_progress`.
    pub fn TessMonitorSetDeadlineMSecs(monitor: *mut c_void, deadline: c_int);
    pub fn TessMonitorGetProgress(monitor: *mut c_void) -> c_int;
}
|
||||
197
crates/kreuzberg-tesseract/src/mutable_iterator.rs
Normal file
197
crates/kreuzberg-tesseract/src/mutable_iterator.rs
Normal file
@@ -0,0 +1,197 @@
|
||||
use crate::error::{Result, TesseractError};
|
||||
use std::ffi::CStr;
|
||||
use std::os::raw::{c_char, c_void};
|
||||
use std::sync::Arc;
|
||||
use std::sync::Mutex;
|
||||
|
||||
use crate::result_iterator::{
|
||||
TessResultIteratorConfidence, TessResultIteratorGetUTF8Text, TessResultIteratorNext,
|
||||
TessResultIteratorSymbolIsDropcap, TessResultIteratorSymbolIsSubscript, TessResultIteratorSymbolIsSuperscript,
|
||||
TessResultIteratorWordFontAttributes, TessResultIteratorWordIsFromDictionary, TessResultIteratorWordIsNumeric,
|
||||
TessResultIteratorWordRecognitionLanguage,
|
||||
};
|
||||
|
||||
/// Owning wrapper around a native Tesseract iterator pointer.
///
/// The pointer is supplied by the caller of `MutableIterator::new` and is
/// released with `TessResultIteratorDelete` when the wrapper is dropped.
pub struct MutableIterator {
    /// Raw iterator pointer; every accessor locks the mutex before calling into
    /// the C API.
    handle: Arc<Mutex<*mut c_void>>,
}

// SAFETY: the raw pointer is private and is only passed to Tesseract while the
// mutex is held, so calls made through this wrapper are serialized.
// NOTE(review): this assumes the native iterator has no thread affinity — confirm.
unsafe impl Send for MutableIterator {}
unsafe impl Sync for MutableIterator {}
|
||||
|
||||
impl MutableIterator {
|
||||
/// Creates a new instance of the MutableIterator.
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// * `handle` - Pointer to the MutableIterator.
|
||||
pub fn new(handle: *mut c_void) -> Self {
|
||||
MutableIterator {
|
||||
handle: Arc::new(Mutex::new(handle)),
|
||||
}
|
||||
}
|
||||
|
||||
/// Gets the UTF-8 text for the current iterator.
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// * `level` - Level of the text.
|
||||
pub fn get_utf8_text(&self, level: i32) -> Result<String> {
|
||||
let handle = self.handle.lock().map_err(|_| TesseractError::MutexError)?;
|
||||
let text_ptr = unsafe { TessResultIteratorGetUTF8Text(*handle, level) };
|
||||
if text_ptr.is_null() {
|
||||
return Err(TesseractError::NullPointerError);
|
||||
}
|
||||
let c_str = unsafe { CStr::from_ptr(text_ptr) };
|
||||
let result = c_str.to_str()?.to_owned();
|
||||
unsafe { TessDeleteText(text_ptr as *mut c_char) };
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
/// Gets the confidence of the current iterator.
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// * `level` - Level of the confidence.
|
||||
pub fn confidence(&self, level: i32) -> Result<f32> {
|
||||
let handle = self.handle.lock().map_err(|_| TesseractError::MutexError)?;
|
||||
Ok(unsafe { TessResultIteratorConfidence(*handle, level) })
|
||||
}
|
||||
|
||||
/// Gets the recognition language of the current iterator.
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// Returns the recognition language as a `String` if successful, otherwise returns an error.
|
||||
pub fn word_recognition_language(&self) -> Result<String> {
|
||||
let handle = self.handle.lock().map_err(|_| TesseractError::MutexError)?;
|
||||
let lang_ptr = unsafe { TessResultIteratorWordRecognitionLanguage(*handle) };
|
||||
if lang_ptr.is_null() {
|
||||
return Err(TesseractError::NullPointerError);
|
||||
}
|
||||
let c_str = unsafe { CStr::from_ptr(lang_ptr) };
|
||||
Ok(c_str.to_str()?.to_owned())
|
||||
}
|
||||
|
||||
/// Gets the font attributes of the current iterator.
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// Returns the font attributes as a tuple if successful, otherwise returns an error.
|
||||
pub fn word_font_attributes(&self) -> Result<(bool, bool, bool, bool, bool, bool, i32, i32)> {
|
||||
let handle = self.handle.lock().map_err(|_| TesseractError::MutexError)?;
|
||||
let mut is_bold = 0;
|
||||
let mut is_italic = 0;
|
||||
let mut is_underlined = 0;
|
||||
let mut is_monospace = 0;
|
||||
let mut is_serif = 0;
|
||||
let mut is_smallcaps = 0;
|
||||
let mut pointsize = 0;
|
||||
let mut font_id = 0;
|
||||
|
||||
let result = unsafe {
|
||||
TessResultIteratorWordFontAttributes(
|
||||
*handle,
|
||||
&mut is_bold,
|
||||
&mut is_italic,
|
||||
&mut is_underlined,
|
||||
&mut is_monospace,
|
||||
&mut is_serif,
|
||||
&mut is_smallcaps,
|
||||
&mut pointsize,
|
||||
&mut font_id,
|
||||
)
|
||||
};
|
||||
|
||||
if result == 0 {
|
||||
Err(TesseractError::InvalidParameterError)
|
||||
} else {
|
||||
Ok((
|
||||
is_bold != 0,
|
||||
is_italic != 0,
|
||||
is_underlined != 0,
|
||||
is_monospace != 0,
|
||||
is_serif != 0,
|
||||
is_smallcaps != 0,
|
||||
pointsize,
|
||||
font_id,
|
||||
))
|
||||
}
|
||||
}
|
||||
|
||||
/// Checks if the current word is from the dictionary.
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// Returns `Ok(true)` if the current word is from the dictionary, otherwise returns `Ok(false)`.
|
||||
pub fn word_is_from_dictionary(&self) -> Result<bool> {
|
||||
let handle = self.handle.lock().map_err(|_| TesseractError::MutexError)?;
|
||||
Ok(unsafe { TessResultIteratorWordIsFromDictionary(*handle) != 0 })
|
||||
}
|
||||
|
||||
/// Checks if the current word is numeric.
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// Returns `Ok(true)` if the current word is numeric, otherwise returns `Ok(false)`.
|
||||
pub fn word_is_numeric(&self) -> Result<bool> {
|
||||
let handle = self.handle.lock().map_err(|_| TesseractError::MutexError)?;
|
||||
Ok(unsafe { TessResultIteratorWordIsNumeric(*handle) != 0 })
|
||||
}
|
||||
|
||||
/// Checks if the current symbol is superscript.
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// Returns `Ok(true)` if the current symbol is superscript, otherwise returns `Ok(false)`.
|
||||
pub fn symbol_is_superscript(&self) -> Result<bool> {
|
||||
let handle = self.handle.lock().map_err(|_| TesseractError::MutexError)?;
|
||||
Ok(unsafe { TessResultIteratorSymbolIsSuperscript(*handle) != 0 })
|
||||
}
|
||||
|
||||
/// Checks if the current symbol is subscript.
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// Returns `Ok(true)` if the current symbol is subscript, otherwise returns `Ok(false)`.
|
||||
pub fn symbol_is_subscript(&self) -> Result<bool> {
|
||||
let handle = self.handle.lock().map_err(|_| TesseractError::MutexError)?;
|
||||
Ok(unsafe { TessResultIteratorSymbolIsSubscript(*handle) != 0 })
|
||||
}
|
||||
|
||||
/// Checks if the current symbol is dropcap.
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// Returns `Ok(true)` if the current symbol is dropcap, otherwise returns `Ok(false)`.
|
||||
pub fn symbol_is_dropcap(&self) -> Result<bool> {
|
||||
let handle = self.handle.lock().map_err(|_| TesseractError::MutexError)?;
|
||||
Ok(unsafe { TessResultIteratorSymbolIsDropcap(*handle) != 0 })
|
||||
}
|
||||
|
||||
/// Gets the next iterator.
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// * `level` - Level of the iterator.
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// Returns `true` if the next iterator is successful, otherwise returns `false`.
|
||||
pub fn next(&self, level: i32) -> Result<bool> {
|
||||
let handle = self.handle.lock().map_err(|_| TesseractError::MutexError)?;
|
||||
Ok(unsafe { TessResultIteratorNext(*handle, level) != 0 })
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for MutableIterator {
|
||||
fn drop(&mut self) {
|
||||
if let Ok(handle) = self.handle.lock() {
|
||||
unsafe { TessResultIteratorDelete(*handle) };
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Raw bindings for the two release functions this module calls directly:
// the iterator deleter used by `Drop` and the string deleter used by
// `get_utf8_text`.
ffi_extern! {
    pub fn TessResultIteratorDelete(handle: *mut c_void);
    pub fn TessDeleteText(text: *mut c_char);
}
|
||||
421
crates/kreuzberg-tesseract/src/page_iterator.rs
Normal file
421
crates/kreuzberg-tesseract/src/page_iterator.rs
Normal file
@@ -0,0 +1,421 @@
|
||||
use crate::TesseractError;
|
||||
use crate::enums::{
|
||||
TessOrientation, TessPageIteratorLevel, TessParagraphJustification, TessPolyBlockType, TessTextlineOrder,
|
||||
TessWritingDirection,
|
||||
};
|
||||
use crate::error::Result;
|
||||
use std::os::raw::{c_float, c_int, c_void};
|
||||
use std::sync::Arc;
|
||||
use std::sync::Mutex;
|
||||
|
||||
/// Block-level layout information from Tesseract.
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct BlockInfo {
|
||||
pub block_type: TessPolyBlockType,
|
||||
pub left: i32,
|
||||
pub top: i32,
|
||||
pub right: i32,
|
||||
pub bottom: i32,
|
||||
}
|
||||
|
||||
/// Paragraph-level information from Tesseract.
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct ParaInfo {
|
||||
pub justification: TessParagraphJustification,
|
||||
pub is_list_item: bool,
|
||||
pub is_crown: bool,
|
||||
pub first_line_indent: i32,
|
||||
pub left: i32,
|
||||
pub top: i32,
|
||||
pub right: i32,
|
||||
pub bottom: i32,
|
||||
}
|
||||
|
||||
/// Owning wrapper around a native Tesseract page-iterator pointer.
///
/// The pointer is supplied by the caller of `PageIterator::new` and is released
/// with `TessPageIteratorDelete` when the wrapper is dropped.
pub struct PageIterator {
    /// Raw iterator pointer behind a mutex. The field is public, so code outside
    /// this module can lock it or clone the `Arc`.
    pub handle: Arc<Mutex<*mut c_void>>,
}

// SAFETY: every method in this module passes the raw pointer to Tesseract only
// while the mutex is held, so calls made through this wrapper are serialized.
// NOTE(review): `handle` is public, so external code can copy the pointer out of
// the mutex; the Send/Sync claim relies on callers not doing that — confirm.
unsafe impl Send for PageIterator {}
unsafe impl Sync for PageIterator {}
|
||||
|
||||
impl PageIterator {
|
||||
/// Creates a new instance of the PageIterator.
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// * `handle` - Pointer to the PageIterator.
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// Returns the new instance of the PageIterator.
|
||||
pub fn new(handle: *mut c_void) -> Self {
|
||||
PageIterator {
|
||||
handle: Arc::new(Mutex::new(handle)),
|
||||
}
|
||||
}
|
||||
|
||||
/// Begins the iteration.
|
||||
pub fn begin(&self) -> Result<()> {
|
||||
let handle = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
|
||||
unsafe { TessPageIteratorBegin(*handle) };
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Gets the next iterator.
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// * `level` - Level of the iterator.
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// Returns `Result<bool>` - `Ok(true)` if the next iterator is successful, `Ok(false)` otherwise.
|
||||
pub fn next(&self, level: TessPageIteratorLevel) -> Result<bool> {
|
||||
let handle = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
|
||||
Ok(unsafe { TessPageIteratorNext(*handle, level as c_int) != 0 })
|
||||
}
|
||||
|
||||
/// Checks if the current iterator is at the beginning of the specified level.
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// * `level` - Level of the iterator.
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// Returns `Result<bool>` - `Ok(true)` if at the beginning, `Ok(false)` otherwise.
|
||||
pub fn is_at_beginning_of(&self, level: TessPageIteratorLevel) -> Result<bool> {
|
||||
let handle = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
|
||||
Ok(unsafe { TessPageIteratorIsAtBeginningOf(*handle, level as c_int) != 0 })
|
||||
}
|
||||
|
||||
/// Checks if the current iterator is at the final element of the specified level.
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// * `level` - Level of the iterator.
|
||||
/// * `element` - Element of the iterator.
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// Returns `Result<bool>` - `Ok(true)` if at the final element, `Ok(false)` otherwise.
|
||||
pub fn is_at_final_element(&self, level: TessPageIteratorLevel, element: TessPageIteratorLevel) -> Result<bool> {
|
||||
let handle = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
|
||||
Ok(unsafe { TessPageIteratorIsAtFinalElement(*handle, level as c_int, element as c_int) != 0 })
|
||||
}
|
||||
|
||||
/// Gets the bounding box of the current iterator.
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// * `level` - Level of the bounding box.
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// Returns the bounding box as a tuple if successful, otherwise returns an error.
|
||||
pub fn bounding_box(&self, level: TessPageIteratorLevel) -> Result<(i32, i32, i32, i32)> {
|
||||
let mut left = 0;
|
||||
let mut top = 0;
|
||||
let mut right = 0;
|
||||
let mut bottom = 0;
|
||||
let handle = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
|
||||
let result = unsafe {
|
||||
TessPageIteratorBoundingBox(*handle, level as c_int, &mut left, &mut top, &mut right, &mut bottom)
|
||||
};
|
||||
if result == 0 {
|
||||
Err(TesseractError::InvalidParameterError)
|
||||
} else {
|
||||
Ok((left, top, right, bottom))
|
||||
}
|
||||
}
|
||||
|
||||
/// Gets the block type of the current iterator.
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// Returns the block type as a `TessPolyBlockType`.
|
||||
pub fn block_type(&self) -> Result<TessPolyBlockType> {
|
||||
let handle = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
|
||||
let block_type = unsafe { TessPageIteratorBlockType(*handle) };
|
||||
Ok(TessPolyBlockType::from_int(block_type))
|
||||
}
|
||||
|
||||
/// Gets the baseline of the current iterator.
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// * `level` - Level of the baseline.
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// Returns the baseline as a tuple if successful, otherwise returns an error.
|
||||
pub fn baseline(&self, level: i32) -> Result<(i32, i32, i32, i32)> {
|
||||
let mut x1 = 0;
|
||||
let mut y1 = 0;
|
||||
let mut x2 = 0;
|
||||
let mut y2 = 0;
|
||||
let handle = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
|
||||
let result = unsafe { TessPageIteratorBaseline(*handle, level, &mut x1, &mut y1, &mut x2, &mut y2) };
|
||||
if result == 0 {
|
||||
Err(TesseractError::InvalidParameterError)
|
||||
} else {
|
||||
Ok((x1, y1, x2, y2))
|
||||
}
|
||||
}
|
||||
|
||||
/// Gets the orientation of the current iterator.
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// Returns the orientation as a tuple if successful, otherwise returns an error.
|
||||
pub fn orientation(&self) -> Result<(TessOrientation, TessWritingDirection, TessTextlineOrder, f32)> {
|
||||
let mut orientation = 0;
|
||||
let mut writing_direction = 0;
|
||||
let mut textline_order = 0;
|
||||
let mut deskew_angle = 0.0;
|
||||
let handle = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
|
||||
let result = unsafe {
|
||||
TessPageIteratorOrientation(
|
||||
*handle,
|
||||
&mut orientation,
|
||||
&mut writing_direction,
|
||||
&mut textline_order,
|
||||
&mut deskew_angle,
|
||||
)
|
||||
};
|
||||
if result == 0 {
|
||||
Err(TesseractError::InvalidParameterError)
|
||||
} else {
|
||||
Ok((
|
||||
TessOrientation::from_int(orientation),
|
||||
TessWritingDirection::from_int(writing_direction),
|
||||
TessTextlineOrder::from_int(textline_order),
|
||||
deskew_angle,
|
||||
))
|
||||
}
|
||||
}
|
||||
|
||||
/// Extracts all blocks from the page in a single mutex-locked pass.
|
||||
///
|
||||
/// Resets the iterator to the beginning, then iterates at `RIL_BLOCK` level,
|
||||
/// collecting block type and bounding box for each block found.
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// Returns `Ok(Vec<BlockInfo>)` with one entry per block, or an error if the
|
||||
/// mutex cannot be acquired.
|
||||
pub fn extract_all_blocks(&self) -> Result<Vec<BlockInfo>> {
|
||||
let handle = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
|
||||
let level = TessPageIteratorLevel::RIL_BLOCK as c_int;
|
||||
let mut blocks = Vec::new();
|
||||
|
||||
// SAFETY: `*handle` is a valid non-null TessPageIterator pointer owned by this struct.
|
||||
// `TessPageIteratorBegin` resets the iterator to the first element and takes only
|
||||
// the pointer — no aliasing occurs because we hold the mutex for the duration.
|
||||
unsafe { TessPageIteratorBegin(*handle) };
|
||||
|
||||
loop {
|
||||
let block_type = unsafe {
|
||||
// SAFETY: `*handle` is valid; TessPageIteratorBlockType reads the current
|
||||
// iterator position and returns an integer enum value without taking ownership.
|
||||
TessPageIteratorBlockType(*handle)
|
||||
};
|
||||
|
||||
let mut left: c_int = 0;
|
||||
let mut top: c_int = 0;
|
||||
let mut right: c_int = 0;
|
||||
let mut bottom: c_int = 0;
|
||||
|
||||
let bbox_ok = unsafe {
|
||||
// SAFETY: `*handle` is valid; the four `*mut c_int` pointers point to local
|
||||
// stack variables whose lifetimes exceed this call.
|
||||
TessPageIteratorBoundingBox(*handle, level, &mut left, &mut top, &mut right, &mut bottom)
|
||||
};
|
||||
|
||||
if bbox_ok != 0 {
|
||||
blocks.push(BlockInfo {
|
||||
block_type: TessPolyBlockType::from_int(block_type),
|
||||
left,
|
||||
top,
|
||||
right,
|
||||
bottom,
|
||||
});
|
||||
}
|
||||
|
||||
let has_next = unsafe {
|
||||
// SAFETY: `*handle` is valid; TessPageIteratorNext advances the iterator
|
||||
// in-place and returns 0 when there are no more elements at this level.
|
||||
TessPageIteratorNext(*handle, level)
|
||||
};
|
||||
if has_next == 0 {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(blocks)
|
||||
}
|
||||
|
||||
/// Extracts all paragraphs from the page in a single mutex-locked pass.
|
||||
///
|
||||
/// Resets the iterator to the beginning, then iterates at `RIL_PARA` level,
|
||||
/// collecting paragraph metadata and bounding box for each paragraph found.
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// Returns `Ok(Vec<ParaInfo>)` with one entry per paragraph, or an error if the
|
||||
/// mutex cannot be acquired.
|
||||
pub fn extract_all_paragraphs(&self) -> Result<Vec<ParaInfo>> {
|
||||
let handle = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
|
||||
let level = TessPageIteratorLevel::RIL_PARA as c_int;
|
||||
let mut paragraphs = Vec::new();
|
||||
|
||||
// SAFETY: `*handle` is a valid non-null TessPageIterator pointer owned by this struct.
|
||||
// `TessPageIteratorBegin` resets the iterator to the first element; the mutex ensures
|
||||
// exclusive access for the entire loop.
|
||||
unsafe { TessPageIteratorBegin(*handle) };
|
||||
|
||||
loop {
|
||||
let mut justification: c_int = 0;
|
||||
// SAFETY: TessPageIteratorParagraphInfo expects BOOL* (int*) for is_list_item and
|
||||
// is_crown. Rust bool is 1 byte while C int is 4 bytes, so we use c_int temporaries
|
||||
// to avoid undefined behaviour (stack corruption) and convert afterwards.
|
||||
let mut is_list_item_raw: c_int = 0;
|
||||
let mut is_crown_raw: c_int = 0;
|
||||
let mut first_line_indent: c_int = 0;
|
||||
|
||||
let para_ok = unsafe {
|
||||
// SAFETY: `*handle` is valid; all output pointers reference stack variables
|
||||
// whose lifetimes exceed this call. TessPageIteratorParagraphInfo writes
|
||||
// through these pointers without retaining them.
|
||||
TessPageIteratorParagraphInfo(
|
||||
*handle,
|
||||
&mut justification,
|
||||
&mut is_list_item_raw,
|
||||
&mut is_crown_raw,
|
||||
&mut first_line_indent,
|
||||
)
|
||||
};
|
||||
|
||||
let is_list_item = is_list_item_raw != 0;
|
||||
let is_crown = is_crown_raw != 0;
|
||||
|
||||
let mut left: c_int = 0;
|
||||
let mut top: c_int = 0;
|
||||
let mut right: c_int = 0;
|
||||
let mut bottom: c_int = 0;
|
||||
|
||||
let bbox_ok = unsafe {
|
||||
// SAFETY: `*handle` is valid; the four `*mut c_int` pointers reference local
|
||||
// stack variables. TessPageIteratorBoundingBox does not retain these pointers.
|
||||
TessPageIteratorBoundingBox(*handle, level, &mut left, &mut top, &mut right, &mut bottom)
|
||||
};
|
||||
|
||||
if para_ok != 0 && bbox_ok != 0 {
|
||||
paragraphs.push(ParaInfo {
|
||||
justification: TessParagraphJustification::from_int(justification),
|
||||
is_list_item,
|
||||
is_crown,
|
||||
first_line_indent,
|
||||
left,
|
||||
top,
|
||||
right,
|
||||
bottom,
|
||||
});
|
||||
}
|
||||
|
||||
let has_next = unsafe {
|
||||
// SAFETY: `*handle` is valid; TessPageIteratorNext advances the iterator
|
||||
// in-place and returns 0 when there are no more elements at this level.
|
||||
TessPageIteratorNext(*handle, level)
|
||||
};
|
||||
if has_next == 0 {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(paragraphs)
|
||||
}
|
||||
|
||||
/// Gets the paragraph information of the current iterator.
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// Returns the paragraph information as a tuple if successful, otherwise returns an error.
|
||||
pub fn paragraph_info(&self) -> Result<(TessParagraphJustification, bool, bool, i32)> {
|
||||
let mut justification = 0;
|
||||
// SAFETY: TessPageIteratorParagraphInfo expects BOOL* (int*) for is_list_item and
|
||||
// is_crown. Rust bool is 1 byte while C int is 4 bytes, so we use c_int temporaries
|
||||
// to avoid undefined behaviour (stack corruption) and convert afterwards.
|
||||
let mut is_list_item_raw: c_int = 0;
|
||||
let mut is_crown_raw: c_int = 0;
|
||||
let mut first_line_indent = 0;
|
||||
let handle = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
|
||||
let result = unsafe {
|
||||
TessPageIteratorParagraphInfo(
|
||||
*handle,
|
||||
&mut justification,
|
||||
&mut is_list_item_raw,
|
||||
&mut is_crown_raw,
|
||||
&mut first_line_indent,
|
||||
)
|
||||
};
|
||||
if result == 0 {
|
||||
Err(TesseractError::InvalidParameterError)
|
||||
} else {
|
||||
Ok((
|
||||
TessParagraphJustification::from_int(justification),
|
||||
is_list_item_raw != 0,
|
||||
is_crown_raw != 0,
|
||||
first_line_indent,
|
||||
))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for PageIterator {
|
||||
fn drop(&mut self) {
|
||||
if let Ok(handle) = self.handle.lock() {
|
||||
unsafe { TessPageIteratorDelete(*handle) };
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Raw bindings to the page-iterator functions of the Tesseract C API.
// `handle` is a native page-iterator pointer; `level` / `element` are
// `TessPageIteratorLevel` values cast to `c_int` by the wrappers in this module.
ffi_extern! {
    // Lifetime and navigation.
    pub fn TessPageIteratorDelete(handle: *mut c_void);
    pub fn TessPageIteratorBegin(handle: *mut c_void);
    pub fn TessPageIteratorNext(handle: *mut c_void, level: c_int) -> c_int;
    pub fn TessPageIteratorIsAtBeginningOf(handle: *mut c_void, level: c_int) -> c_int;
    pub fn TessPageIteratorIsAtFinalElement(handle: *mut c_void, level: c_int, element: c_int) -> c_int;

    // Geometry and classification of the current element.
    pub fn TessPageIteratorBoundingBox(
        handle: *mut c_void,
        level: c_int,
        left: *mut c_int,
        top: *mut c_int,
        right: *mut c_int,
        bottom: *mut c_int,
    ) -> c_int;
    pub fn TessPageIteratorBlockType(handle: *mut c_void) -> c_int;
    pub fn TessPageIteratorBaseline(
        handle: *mut c_void,
        level: c_int,
        x1: *mut c_int,
        y1: *mut c_int,
        x2: *mut c_int,
        y2: *mut c_int,
    ) -> c_int;

    // NOTE(review): Tesseract's `capi.h` declares this function as returning
    // `void`; confirm the `c_int` return type before relying on the value.
    pub fn TessPageIteratorOrientation(
        handle: *mut c_void,
        orientation: *mut c_int,
        writing_direction: *mut c_int,
        textline_order: *mut c_int,
        deskew_angle: *mut c_float,
    ) -> c_int;

    pub fn TessBaseAPIGetIterator(handle: *mut c_void) -> *mut c_void;

    // `is_list_item` / `is_crown` are written as C ints, hence `*mut c_int`.
    // NOTE(review): Tesseract's `capi.h` declares this function as returning
    // `void`; confirm the `c_int` return type before relying on the value.
    pub fn TessPageIteratorParagraphInfo(
        handle: *mut c_void,
        justification: *mut c_int,
        is_list_item: *mut c_int,
        is_crown: *mut c_int,
        first_line_indent: *mut c_int,
    ) -> c_int;
}
|
||||
589
crates/kreuzberg-tesseract/src/result_iterator.rs
Normal file
589
crates/kreuzberg-tesseract/src/result_iterator.rs
Normal file
@@ -0,0 +1,589 @@
|
||||
use crate::api::TessDeleteText;
|
||||
use crate::enums::TessPageIteratorLevel;
|
||||
use crate::error::{Result, TesseractError};
|
||||
use std::ffi::CStr;
|
||||
use std::os::raw::{c_char, c_float, c_int, c_void};
|
||||
use std::sync::{Arc, Mutex};
|
||||
|
||||
/// Font attributes detected by Tesseract for a word.
///
/// The fields mirror, in order, the out-parameters of
/// `TessResultIteratorWordFontAttributes`. `Default` yields all flags `false`
/// and both integers `0`.
#[derive(Debug, Clone, PartialEq, Eq, Default)]
pub struct FontAttributes {
    /// Word is set in a bold face.
    pub is_bold: bool,
    /// Word is set in an italic face.
    pub is_italic: bool,
    /// Word is underlined.
    pub is_underlined: bool,
    /// Word is set in a monospace face.
    pub is_monospace: bool,
    /// Word is set in a serif face.
    pub is_serif: bool,
    /// Word is set in small caps.
    pub is_smallcaps: bool,
    /// Point size reported by Tesseract.
    pub pointsize: i32,
    /// Font identifier reported by Tesseract.
    pub font_id: i32,
}
|
||||
|
||||
/// Complete word data extracted in a single mutex lock.
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct WordData {
|
||||
pub text: String,
|
||||
pub left: i32,
|
||||
pub top: i32,
|
||||
pub right: i32,
|
||||
pub bottom: i32,
|
||||
pub confidence: f32,
|
||||
pub font_attrs: Option<FontAttributes>,
|
||||
}
|
||||
|
||||
/// Wrapper around a native Tesseract result-iterator pointer.
pub struct ResultIterator {
    /// Raw iterator pointer behind a mutex. The field is public, so code outside
    /// this module can lock it or clone the `Arc`.
    pub handle: Arc<Mutex<*mut c_void>>,
}

// SAFETY: the methods of this wrapper pass the raw pointer to Tesseract only
// while the mutex is held, so calls made through it are serialized.
// NOTE(review): `handle` is public, so external code can copy the pointer out of
// the mutex; the Send/Sync claim relies on callers not doing that — confirm.
unsafe impl Send for ResultIterator {}
unsafe impl Sync for ResultIterator {}
|
||||
|
||||
impl ResultIterator {
|
||||
/// Creates a new instance of the ResultIterator.
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// * `handle` - Pointer to the ResultIterator.
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// Returns the new instance of the ResultIterator.
|
||||
pub fn new(handle: *mut c_void) -> Self {
|
||||
ResultIterator {
|
||||
handle: Arc::new(Mutex::new(handle)),
|
||||
}
|
||||
}
|
||||
|
||||
/// Gets the UTF-8 text of the current iterator.
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// * `level` - Level of the text.
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// Returns the UTF-8 text as a `String` if successful, otherwise returns an error.
|
||||
pub fn get_utf8_text(&self, level: TessPageIteratorLevel) -> Result<String> {
|
||||
let handle = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
|
||||
// SAFETY: TessResultIteratorGetUTF8Text() allocates and returns a pointer to a C string.
|
||||
// This is safe because:
|
||||
// 1. *handle is a valid pointer to an initialized ResultIterator (mutex-guarded)
|
||||
// 2. level is a valid TessPageIteratorLevel enum converted to c_int (in valid range)
|
||||
// 3. The returned pointer is either null (error) or a valid null-terminated C string
|
||||
// allocated on Tesseract's heap (must be freed with TessDeleteText)
|
||||
let text_ptr = unsafe { TessResultIteratorGetUTF8Text(*handle, level as c_int) };
|
||||
if text_ptr.is_null() {
|
||||
return Err(TesseractError::NullPointerError);
|
||||
}
|
||||
// SAFETY: We've verified text_ptr is non-null. The allocation/deallocation pattern is:
|
||||
// 1. text_ptr was allocated by TessResultIteratorGetUTF8Text() on the FFI boundary
|
||||
// 2. CStr::from_ptr(text_ptr) is safe: pointer is non-null and points to valid C string
|
||||
// 3. We read from the string (to_str() creates temporary immutable borrow)
|
||||
// 4. We immediately copy all data to owned String before deallocation
|
||||
// 5. The string data remains valid until TessDeleteText is called
|
||||
let c_str = unsafe { CStr::from_ptr(text_ptr) };
|
||||
let result = c_str.to_str()?.to_owned();
|
||||
// SAFETY: TessDeleteText() deallocates memory allocated by TessResultIteratorGetUTF8Text():
|
||||
// 1. text_ptr must be non-null (verified above)
|
||||
// 2. text_ptr came from the Tesseract API (trusted source, correct allocation)
|
||||
// 3. TessDeleteText() is the correct deallocation function for this allocation
|
||||
// 4. Must be called exactly once per allocation to avoid double-free (we ensure this)
|
||||
// 5. After this call, text_ptr is invalid; all uses must be via owned result String
|
||||
unsafe { TessDeleteText(text_ptr as *mut c_char) };
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
/// Gets the confidence of the current iterator.
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// * `level` - Level of the confidence.
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// Returns the confidence as a `f32`.
|
||||
pub fn confidence(&self, level: TessPageIteratorLevel) -> Result<f32> {
|
||||
let handle = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
|
||||
// SAFETY: TessResultIteratorConfidence() is safe because:
|
||||
// 1. *handle is a valid pointer to an initialized ResultIterator
|
||||
// 2. level is a valid TessPageIteratorLevel enum converted to c_int
|
||||
// 3. The function only reads state and returns an f32 value (copyable)
|
||||
// 4. No pointer operations or memory access is needed
|
||||
Ok(unsafe { TessResultIteratorConfidence(*handle, level as c_int) })
|
||||
}
|
||||
|
||||
/// Gets the recognition language of the current iterator.
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// Returns the recognition language as a `String` if successful, otherwise returns an error.
|
||||
pub fn word_recognition_language(&self) -> Result<String> {
|
||||
let handle = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
|
||||
// SAFETY: TessResultIteratorWordRecognitionLanguage() returns a pointer to a C string
|
||||
// in the iterator's memory. This is safe because:
|
||||
// 1. *handle is a valid pointer to an initialized ResultIterator
|
||||
// 2. The returned pointer is either null or a valid null-terminated C string
|
||||
let lang_ptr = unsafe { TessResultIteratorWordRecognitionLanguage(*handle) };
|
||||
if lang_ptr.is_null() {
|
||||
return Err(TesseractError::NullPointerError);
|
||||
}
|
||||
// SAFETY: We've verified lang_ptr is non-null. CStr::from_ptr() is safe because:
|
||||
// 1. lang_ptr points to a valid null-terminated C string managed by Tesseract
|
||||
// 2. We only read from it (to_str() creates temporary borrow)
|
||||
let c_str = unsafe { CStr::from_ptr(lang_ptr) };
|
||||
Ok(c_str.to_str()?.to_owned())
|
||||
}
|
||||
|
||||
/// Gets the font attributes of the current iterator.
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// Returns the font attributes as a tuple if successful, otherwise returns an error.
|
||||
pub fn word_font_attributes(&self) -> Result<(bool, bool, bool, bool, bool, bool, i32, i32)> {
|
||||
let handle = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
|
||||
let mut is_bold = 0;
|
||||
let mut is_italic = 0;
|
||||
let mut is_underlined = 0;
|
||||
let mut is_monospace = 0;
|
||||
let mut is_serif = 0;
|
||||
let mut is_smallcaps = 0;
|
||||
let mut pointsize = 0;
|
||||
let mut font_id = 0;
|
||||
|
||||
// SAFETY: TessResultIteratorWordFontAttributes() takes output parameter pointers
|
||||
// and fills them with font attribute values. This is safe because:
|
||||
// 1. *handle is a valid pointer to an initialized ResultIterator (mutex-guarded)
|
||||
// 2. All mutable references (&mut ...) are valid local stack variables
|
||||
// 3. Each reference has a distinct memory location (no aliasing)
|
||||
// 4. The references outlive the FFI call (defined on stack, used immediately after)
|
||||
// 5. The function writes output i32 values (0/1 for bools, integers for size/id)
|
||||
// 6. Each reference has exclusive mutable access (Rust borrow checker enforces this)
|
||||
// 7. The output parameters are independent (function cannot cause data races)
|
||||
let result = unsafe {
|
||||
TessResultIteratorWordFontAttributes(
|
||||
*handle,
|
||||
&mut is_bold,
|
||||
&mut is_italic,
|
||||
&mut is_underlined,
|
||||
&mut is_monospace,
|
||||
&mut is_serif,
|
||||
&mut is_smallcaps,
|
||||
&mut pointsize,
|
||||
&mut font_id,
|
||||
)
|
||||
};
|
||||
|
||||
if result == 0 {
|
||||
Err(TesseractError::InvalidParameterError)
|
||||
} else {
|
||||
Ok((
|
||||
is_bold != 0,
|
||||
is_italic != 0,
|
||||
is_underlined != 0,
|
||||
is_monospace != 0,
|
||||
is_serif != 0,
|
||||
is_smallcaps != 0,
|
||||
pointsize,
|
||||
font_id,
|
||||
))
|
||||
}
|
||||
}
|
||||
|
||||
/// Checks if the current iterator is from the dictionary.
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// Returns `true` if the current iterator is from the dictionary, otherwise returns `false`.
|
||||
pub fn word_is_from_dictionary(&self) -> Result<bool> {
|
||||
let handle = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
|
||||
// SAFETY: TessResultIteratorWordIsFromDictionary() is safe because:
|
||||
// 1. *handle is a valid pointer to an initialized ResultIterator
|
||||
// 2. The function only reads state and returns an i32 value (0 or non-zero)
|
||||
// 3. No pointer operations or memory modifications are needed
|
||||
Ok(unsafe { TessResultIteratorWordIsFromDictionary(*handle) != 0 })
|
||||
}
|
||||
|
||||
/// Checks if the current iterator is numeric.
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// Returns `true` if the current iterator is numeric, otherwise returns `false`.
|
||||
pub fn word_is_numeric(&self) -> Result<bool> {
|
||||
let handle = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
|
||||
// SAFETY: TessResultIteratorWordIsNumeric() is safe because:
|
||||
// 1. *handle is a valid pointer to an initialized ResultIterator
|
||||
// 2. The function only reads state and returns an i32 value
|
||||
// 3. No pointer operations or state modifications needed
|
||||
Ok(unsafe { TessResultIteratorWordIsNumeric(*handle) != 0 })
|
||||
}
|
||||
|
||||
/// Checks if the current iterator is superscript.
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// Returns `true` if the current iterator is superscript, otherwise returns `false`.
|
||||
pub fn symbol_is_superscript(&self) -> Result<bool> {
|
||||
let handle = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
|
||||
// SAFETY: TessResultIteratorSymbolIsSuperscript() is safe because:
|
||||
// 1. *handle is a valid pointer to an initialized ResultIterator
|
||||
// 2. The function only reads state and returns an i32 value
|
||||
// 3. No pointer operations or state modifications needed
|
||||
Ok(unsafe { TessResultIteratorSymbolIsSuperscript(*handle) != 0 })
|
||||
}
|
||||
|
||||
/// Checks if the current iterator is subscript.
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// Returns `true` if the current iterator is subscript, otherwise returns `false`.
|
||||
pub fn symbol_is_subscript(&self) -> Result<bool> {
|
||||
let handle = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
|
||||
// SAFETY: TessResultIteratorSymbolIsSubscript() is safe because:
|
||||
// 1. *handle is a valid pointer to an initialized ResultIterator
|
||||
// 2. The function only reads state and returns an i32 value
|
||||
// 3. No pointer operations or state modifications needed
|
||||
Ok(unsafe { TessResultIteratorSymbolIsSubscript(*handle) != 0 })
|
||||
}
|
||||
|
||||
/// Checks if the current iterator is dropcap.
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// Returns `true` if the current iterator is dropcap, otherwise returns `false`.
|
||||
pub fn symbol_is_dropcap(&self) -> Result<bool> {
|
||||
let handle = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
|
||||
// SAFETY: TessResultIteratorSymbolIsDropcap() is safe because:
|
||||
// 1. *handle is a valid pointer to an initialized ResultIterator
|
||||
// 2. The function only reads state and returns an i32 value
|
||||
// 3. No pointer operations or state modifications needed
|
||||
Ok(unsafe { TessResultIteratorSymbolIsDropcap(*handle) != 0 })
|
||||
}
|
||||
|
||||
/// Moves to the next iterator.
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// * `level` - Level of the next iterator.
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// Returns `true` if the next iterator exists, otherwise returns `false`.
|
||||
pub fn next(&self, level: TessPageIteratorLevel) -> Result<bool> {
|
||||
let handle = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
|
||||
// SAFETY: TessResultIteratorNext() is safe because:
|
||||
// 1. *handle is a valid pointer to an initialized ResultIterator
|
||||
// 2. level is a valid TessPageIteratorLevel enum converted to c_int
|
||||
// 3. The function modifies iterator state (advances position) and returns i32 result
|
||||
// 4. The mutex ensures exclusive access during state modification
|
||||
Ok(unsafe { TessResultIteratorNext(*handle, level as c_int) != 0 })
|
||||
}
|
||||
|
||||
/// Gets the current word from the iterator with its bounding box and confidence.
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// Returns a tuple of (text, left, top, right, bottom, confidence) if successful
|
||||
pub fn get_word_with_bounds(&self) -> Result<(String, i32, i32, i32, i32, f32)> {
|
||||
let text = self.get_utf8_text(TessPageIteratorLevel::RIL_WORD)?;
|
||||
let (left, top, right, bottom) = self.get_bounding_box(TessPageIteratorLevel::RIL_WORD)?;
|
||||
let confidence = self.confidence(TessPageIteratorLevel::RIL_WORD)?;
|
||||
|
||||
Ok((text, left, top, right, bottom, confidence))
|
||||
}
|
||||
|
||||
/// Advances the iterator to the next word.
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// Returns true if successful, false if there are no more words
|
||||
pub fn next_word(&self) -> Result<bool> {
|
||||
self.next(TessPageIteratorLevel::RIL_WORD)
|
||||
}
|
||||
|
||||
/// Gets the word information for the current position in the iterator.
|
||||
/// Should be called before next() to ensure valid data.
|
||||
///
|
||||
/// # Returns
|
||||
/// Returns a tuple of (text, left, top, right, bottom, confidence) if successful
|
||||
pub fn get_current_word(&self) -> Result<(String, i32, i32, i32, i32, f32)> {
|
||||
let text = self.get_utf8_text(TessPageIteratorLevel::RIL_WORD)?;
|
||||
let (left, top, right, bottom) = self.get_bounding_box(TessPageIteratorLevel::RIL_WORD)?;
|
||||
let confidence = self.confidence(TessPageIteratorLevel::RIL_WORD)?;
|
||||
|
||||
Ok((text, left, top, right, bottom, confidence))
|
||||
}
|
||||
|
||||
/// Gets the bounding box for the current element.
|
||||
pub fn get_bounding_box(&self, level: TessPageIteratorLevel) -> Result<(i32, i32, i32, i32)> {
|
||||
let mut left = 0;
|
||||
let mut top = 0;
|
||||
let mut right = 0;
|
||||
let mut bottom = 0;
|
||||
|
||||
let handle = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
|
||||
|
||||
// SAFETY: TessPageIteratorBoundingBox() queries iterator state and returns coordinates
|
||||
// via output parameters. This is safe because:
|
||||
// 1. *handle is a valid pointer to an initialized ResultIterator or PageIterator (mutex-guarded)
|
||||
// 2. level is a valid TessPageIteratorLevel enum converted to c_int (in valid range)
|
||||
// 3. All mutable references (&mut left, &mut top, &mut right, &mut bottom)
|
||||
// are valid local stack variables with distinct memory locations
|
||||
// 4. Each reference is exclusively borrowed (Rust enforces no aliasing)
|
||||
// 5. The references outlive the FFI call (defined on stack, used immediately after)
|
||||
// 6. The function writes four i32 coordinate values into these references
|
||||
// 7. No pointer escaping: the function only writes to these parameters, doesn't store them
|
||||
// 8. Return value indicates success/failure (checked below)
|
||||
let result = unsafe {
|
||||
TessPageIteratorBoundingBox(*handle, level as c_int, &mut left, &mut top, &mut right, &mut bottom)
|
||||
};
|
||||
|
||||
if result == 0 {
|
||||
Err(TesseractError::InvalidParameterError)
|
||||
} else {
|
||||
Ok((left, top, right, bottom))
|
||||
}
|
||||
}
|
||||
|
||||
/// Extracts all word data from the iterator in a single mutex lock.
|
||||
///
|
||||
/// Acquires the mutex once and iterates all words, collecting text, bounding box,
|
||||
/// confidence, and font attributes for each word. This is more efficient than
|
||||
/// calling individual methods in a loop since it avoids repeated mutex acquisitions.
|
||||
///
|
||||
/// The iterator is always reset to the beginning before traversal so that partial
|
||||
/// prior consumption does not cause words to be missed.
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// Returns a `Vec<WordData>` containing data for every word, or an error if the
|
||||
/// mutex cannot be acquired.
|
||||
pub fn extract_all_words(&self) -> Result<Vec<WordData>> {
|
||||
let handle = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
|
||||
let raw = *handle;
|
||||
let mut words = Vec::new();
|
||||
|
||||
// Reset to the first element before traversal. ResultIterator inherits from
|
||||
// PageIterator in C++, so TessPageIteratorBegin operates on the same handle.
|
||||
// SAFETY: raw is a valid mutex-guarded ResultIterator pointer; TessPageIteratorBegin
|
||||
// simply resets the internal position and does not allocate or free memory.
|
||||
unsafe { TessPageIteratorBegin(raw) };
|
||||
|
||||
loop {
|
||||
// SAFETY: raw is the mutex-guarded *mut c_void handle. All calls within this
|
||||
// loop are performed while holding the mutex lock, ensuring exclusive access.
|
||||
// We pass raw directly to the unlocked helper to avoid re-locking.
|
||||
match extract_word_data_unlocked(raw) {
|
||||
Ok(word) => words.push(word),
|
||||
// NullPointerError means the text pointer was null; skip this position.
|
||||
// InvalidParameterError means bounding box failed; skip this position.
|
||||
// Utf8Error means the text was not valid UTF-8; skip this word rather than
|
||||
// aborting, so the remaining words in the iterator are not lost.
|
||||
Err(TesseractError::NullPointerError)
|
||||
| Err(TesseractError::InvalidParameterError)
|
||||
| Err(TesseractError::Utf8Error(_)) => {}
|
||||
Err(e) => return Err(e),
|
||||
}
|
||||
|
||||
// SAFETY: TessResultIteratorNext() advances the iterator state and returns
|
||||
// non-zero if a next element exists. This is safe because:
|
||||
// 1. raw is a valid pointer to an initialized ResultIterator (mutex-guarded)
|
||||
// 2. RIL_WORD is a valid TessPageIteratorLevel enum value
|
||||
// 3. The mutex is held for the duration of this call (exclusive access)
|
||||
// 4. The function modifies iterator position and returns an i32 result
|
||||
let has_next = unsafe { TessResultIteratorNext(raw, TessPageIteratorLevel::RIL_WORD as c_int) != 0 };
|
||||
if !has_next {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(words)
|
||||
}
|
||||
|
||||
/// Extracts the current word's data in a single mutex lock.
|
||||
///
|
||||
/// Acquires the mutex once and calls all FFI functions (text, bounding box,
|
||||
/// confidence, font attributes) within that lock scope. More efficient than
|
||||
/// calling the individual methods separately when all fields are needed.
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// Returns a [`WordData`] struct if successful, otherwise returns an error.
|
||||
pub fn extract_word_data(&self) -> Result<WordData> {
|
||||
let handle = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
|
||||
extract_word_data_unlocked(*handle)
|
||||
}
|
||||
}
|
||||
|
||||
/// Extracts word data from a raw iterator handle without acquiring the mutex.
|
||||
///
|
||||
/// The caller MUST hold the mutex lock for the `ResultIterator` this handle belongs to
|
||||
/// before calling this function. Passing a handle that is not mutex-guarded, or calling
|
||||
/// this function concurrently on the same handle, is undefined behaviour.
|
||||
fn extract_word_data_unlocked(raw: *mut c_void) -> Result<WordData> {
|
||||
// SAFETY: TessResultIteratorGetUTF8Text() allocates and returns a pointer to a C string.
|
||||
// This is safe because:
|
||||
// 1. raw is a valid pointer to an initialized ResultIterator (caller holds mutex lock)
|
||||
// 2. RIL_WORD is a valid TessPageIteratorLevel enum value converted to c_int
|
||||
// 3. The returned pointer is either null (error) or a valid null-terminated C string
|
||||
// allocated on Tesseract's heap (must be freed with TessDeleteText)
|
||||
let text_ptr = unsafe { TessResultIteratorGetUTF8Text(raw, TessPageIteratorLevel::RIL_WORD as c_int) };
|
||||
if text_ptr.is_null() {
|
||||
return Err(TesseractError::NullPointerError);
|
||||
}
|
||||
// SAFETY: We've verified text_ptr is non-null. The allocation/deallocation pattern is:
|
||||
// 1. text_ptr was allocated by TessResultIteratorGetUTF8Text() on the FFI boundary
|
||||
// 2. CStr::from_ptr(text_ptr) is safe: pointer is non-null and points to valid C string
|
||||
// 3. We immediately copy all data to an owned String before deallocation
|
||||
// 4. The string data remains valid until TessDeleteText is called
|
||||
let text = {
|
||||
let c_str = unsafe { CStr::from_ptr(text_ptr) };
|
||||
let owned = c_str.to_str()?.to_owned();
|
||||
// SAFETY: TessDeleteText() deallocates memory allocated by TessResultIteratorGetUTF8Text():
|
||||
// 1. text_ptr is non-null (verified above)
|
||||
// 2. text_ptr came from the Tesseract API (correct allocation type)
|
||||
// 3. TessDeleteText() is the correct deallocation function for this allocation
|
||||
// 4. Called exactly once per allocation to avoid double-free
|
||||
// 5. owned String was already populated; text_ptr is no longer accessed after this call
|
||||
unsafe { TessDeleteText(text_ptr as *mut c_char) };
|
||||
owned
|
||||
};
|
||||
|
||||
let mut left = 0;
|
||||
let mut top = 0;
|
||||
let mut right = 0;
|
||||
let mut bottom = 0;
|
||||
// SAFETY: TessPageIteratorBoundingBox() queries iterator state and fills output parameters.
|
||||
// This is safe because:
|
||||
// 1. raw is a valid pointer to an initialized ResultIterator (caller holds mutex lock)
|
||||
// 2. RIL_WORD is a valid TessPageIteratorLevel enum value converted to c_int
|
||||
// 3. All mutable references are valid local stack variables with distinct memory locations
|
||||
// 4. Each reference is exclusively borrowed (Rust enforces no aliasing)
|
||||
// 5. The references outlive the FFI call (defined on stack, used immediately after)
|
||||
// 6. Return value indicates success/failure (checked below)
|
||||
let bbox_result = unsafe {
|
||||
TessPageIteratorBoundingBox(
|
||||
raw,
|
||||
TessPageIteratorLevel::RIL_WORD as c_int,
|
||||
&mut left,
|
||||
&mut top,
|
||||
&mut right,
|
||||
&mut bottom,
|
||||
)
|
||||
};
|
||||
if bbox_result == 0 {
|
||||
return Err(TesseractError::InvalidParameterError);
|
||||
}
|
||||
|
||||
// SAFETY: TessResultIteratorConfidence() reads iterator state and returns an f32 value.
|
||||
// This is safe because:
|
||||
// 1. raw is a valid pointer to an initialized ResultIterator (caller holds mutex lock)
|
||||
// 2. RIL_WORD is a valid TessPageIteratorLevel enum value converted to c_int
|
||||
// 3. The function only reads state and returns a copy (no pointer operations)
|
||||
let confidence = unsafe { TessResultIteratorConfidence(raw, TessPageIteratorLevel::RIL_WORD as c_int) };
|
||||
|
||||
// Collect font attributes; treat any failure as absent rather than propagating the error.
|
||||
let font_attrs = {
|
||||
let mut is_bold = 0;
|
||||
let mut is_italic = 0;
|
||||
let mut is_underlined = 0;
|
||||
let mut is_monospace = 0;
|
||||
let mut is_serif = 0;
|
||||
let mut is_smallcaps = 0;
|
||||
let mut pointsize = 0;
|
||||
let mut font_id = 0;
|
||||
// SAFETY: TessResultIteratorWordFontAttributes() fills output parameters with font info.
|
||||
// This is safe because:
|
||||
// 1. raw is a valid pointer to an initialized ResultIterator (caller holds mutex lock)
|
||||
// 2. All mutable references are valid local stack variables with distinct memory locations
|
||||
// 3. Each reference is exclusively borrowed (no aliasing)
|
||||
// 4. The references outlive the FFI call
|
||||
// 5. Return value is non-zero on success, zero on failure (checked below)
|
||||
let result = unsafe {
|
||||
TessResultIteratorWordFontAttributes(
|
||||
raw,
|
||||
&mut is_bold,
|
||||
&mut is_italic,
|
||||
&mut is_underlined,
|
||||
&mut is_monospace,
|
||||
&mut is_serif,
|
||||
&mut is_smallcaps,
|
||||
&mut pointsize,
|
||||
&mut font_id,
|
||||
)
|
||||
};
|
||||
if result != 0 {
|
||||
Some(FontAttributes {
|
||||
is_bold: is_bold != 0,
|
||||
is_italic: is_italic != 0,
|
||||
is_underlined: is_underlined != 0,
|
||||
is_monospace: is_monospace != 0,
|
||||
is_serif: is_serif != 0,
|
||||
is_smallcaps: is_smallcaps != 0,
|
||||
pointsize,
|
||||
font_id,
|
||||
})
|
||||
} else {
|
||||
None
|
||||
}
|
||||
};
|
||||
|
||||
Ok(WordData {
|
||||
text,
|
||||
left,
|
||||
top,
|
||||
right,
|
||||
bottom,
|
||||
confidence,
|
||||
font_attrs,
|
||||
})
|
||||
}
|
||||
|
||||
impl Drop for ResultIterator {
|
||||
fn drop(&mut self) {
|
||||
if let Ok(handle) = self.handle.lock() {
|
||||
// SAFETY: TessResultIteratorDelete() frees the ResultIterator handle allocated by Tesseract:
|
||||
// 1. We use .ok() pattern to handle poisoned mutex gracefully (no panic in Drop)
|
||||
// 2. *handle is a valid opaque pointer allocated by TessBaseAPIGetIterator()
|
||||
// or TessBaseAPIGetMutableIterator() - Tesseract owns this memory
|
||||
// 3. TessResultIteratorDelete() is the single correct way to deallocate this type
|
||||
// 4. The function must be called exactly once per allocation to avoid double-free
|
||||
// 5. After calling delete, the pointer is invalid; future use would cause use-after-free
|
||||
// 6. Drop impl never panics (we use .ok() guard), ensuring cleanup always executes
|
||||
// 7. If mutex is poisoned, handle cleanup is skipped (OS will reclaim process memory)
|
||||
unsafe { TessResultIteratorDelete(*handle) };
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Native entry points used by `ResultIterator`. The `TessPageIterator*` functions are
// called with the same handle as the `TessResultIterator*` ones elsewhere in this file.
#[cfg(any(feature = "build-tesseract", feature = "build-tesseract-wasm"))]
ffi_extern! {
    // Destruction, rewinding and advancing.
    pub fn TessResultIteratorDelete(handle: *mut c_void);
    pub fn TessPageIteratorBegin(handle: *mut c_void);
    pub fn TessResultIteratorNext(handle: *mut c_void, level: c_int) -> c_int;

    // Text, confidence and geometry at a given level.
    pub fn TessResultIteratorGetUTF8Text(handle: *mut c_void, level: c_int) -> *mut c_char;
    pub fn TessResultIteratorConfidence(handle: *mut c_void, level: c_int) -> c_float;
    pub fn TessPageIteratorBoundingBox(
        handle: *mut c_void,
        level: c_int,
        left: *mut c_int,
        top: *mut c_int,
        right: *mut c_int,
        bottom: *mut c_int,
    ) -> c_int;

    // Word-level queries.
    pub fn TessResultIteratorWordRecognitionLanguage(handle: *mut c_void) -> *const c_char;
    pub fn TessResultIteratorWordFontAttributes(
        handle: *mut c_void,
        is_bold: *mut c_int,
        is_italic: *mut c_int,
        is_underlined: *mut c_int,
        is_monospace: *mut c_int,
        is_serif: *mut c_int,
        is_smallcaps: *mut c_int,
        pointsize: *mut c_int,
        font_id: *mut c_int,
    ) -> c_int;
    pub fn TessResultIteratorWordIsFromDictionary(handle: *mut c_void) -> c_int;
    pub fn TessResultIteratorWordIsNumeric(handle: *mut c_void) -> c_int;

    // Symbol-level queries.
    pub fn TessResultIteratorSymbolIsSuperscript(handle: *mut c_void) -> c_int;
    pub fn TessResultIteratorSymbolIsSubscript(handle: *mut c_void) -> c_int;
    pub fn TessResultIteratorSymbolIsDropcap(handle: *mut c_void) -> c_int;
}
|
||||
212
crates/kreuzberg-tesseract/src/result_renderer.rs
Normal file
212
crates/kreuzberg-tesseract/src/result_renderer.rs
Normal file
@@ -0,0 +1,212 @@
|
||||
use crate::TesseractAPI;
|
||||
use crate::error::{Result, TesseractError};
|
||||
use std::ffi::{CStr, CString};
|
||||
use std::os::raw::{c_char, c_int, c_void};
|
||||
use std::sync::Arc;
|
||||
use std::sync::Mutex;
|
||||
|
||||
/// Owning wrapper around a native Tesseract result renderer.
///
/// The raw renderer pointer is stored behind a mutex, and every method of this type
/// locks that mutex before passing the pointer to Tesseract.
pub struct TessResultRenderer {
    /// Raw renderer pointer as returned by one of the `Tess*RendererCreate` functions.
    handle: Arc<Mutex<*mut c_void>>,
}

// SAFETY: raw pointers are neither `Send` nor `Sync` by default, so both traits are
// asserted manually. All access to the pointer in this file goes through the mutex.
// NOTE(review): this also assumes the native renderer may be used and destroyed from a
// thread other than the one that created it; confirm against Tesseract.
unsafe impl Send for TessResultRenderer {}

// SAFETY: shared references only reach the pointer by locking the mutex, which
// serialises concurrent callers (same assumption as for `Send` above).
unsafe impl Sync for TessResultRenderer {}
|
||||
|
||||
impl TessResultRenderer {
|
||||
/// Creates a new instance of the TessResultRenderer.
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// * `outputbase` - Output base path.
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// Returns the new instance of the TessResultRenderer.
|
||||
pub fn new_text_renderer(outputbase: &str) -> Result<Self> {
|
||||
let outputbase = CString::new(outputbase).map_err(|_| TesseractError::NullByteInString)?;
|
||||
let handle = unsafe { TessTextRendererCreate(outputbase.as_ptr()) };
|
||||
if handle.is_null() {
|
||||
Err(TesseractError::NullPointerError)
|
||||
} else {
|
||||
Ok(TessResultRenderer {
|
||||
handle: Arc::new(Mutex::new(handle)),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
/// Creates a new instance of the TessResultRenderer for HOCR.
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// * `outputbase` - Output base path.
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// Returns the new instance of the TessResultRenderer.
|
||||
pub fn new_hocr_renderer(outputbase: &str) -> Result<Self> {
|
||||
let outputbase = CString::new(outputbase).map_err(|_| TesseractError::NullByteInString)?;
|
||||
let handle = unsafe { TessHOcrRendererCreate(outputbase.as_ptr()) };
|
||||
if handle.is_null() {
|
||||
Err(TesseractError::NullPointerError)
|
||||
} else {
|
||||
Ok(TessResultRenderer {
|
||||
handle: Arc::new(Mutex::new(handle)),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
/// Creates a new instance of the TessResultRenderer for PDF.
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// * `outputbase` - Output base path.
|
||||
/// * `datadir` - Data directory path.
|
||||
/// * `textonly` - Whether to include text only.
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// Returns the new instance of the TessResultRenderer.
|
||||
pub fn new_pdf_renderer(outputbase: &str, datadir: &str, textonly: bool) -> Result<Self> {
|
||||
let outputbase = CString::new(outputbase).map_err(|_| TesseractError::NullByteInString)?;
|
||||
let datadir = CString::new(datadir).map_err(|_| TesseractError::NullByteInString)?;
|
||||
let handle = unsafe { TessPDFRendererCreate(outputbase.as_ptr(), datadir.as_ptr(), textonly as c_int) };
|
||||
if handle.is_null() {
|
||||
Err(TesseractError::NullPointerError)
|
||||
} else {
|
||||
Ok(TessResultRenderer {
|
||||
handle: Arc::new(Mutex::new(handle)),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
/// Begins a new document.
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// * `title` - Title of the document.
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// Returns `true` if the document was created successfully, otherwise returns `false`.
|
||||
///
|
||||
/// # Errors
|
||||
///
|
||||
/// Returns a `TesseractError` if the string contains a null byte or if the mutex lock fails.
|
||||
pub fn begin_document(&self, title: &str) -> Result<bool> {
|
||||
let title = CString::new(title).map_err(|_| TesseractError::NullByteInString)?;
|
||||
let handle = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
|
||||
Ok(unsafe { TessResultRendererBeginDocument(*handle, title.as_ptr()) != 0 })
|
||||
}
|
||||
|
||||
/// Adds an image to the document.
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// * `api` - The TesseractAPI instance.
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// Returns `true` if the image was added successfully, otherwise returns `false`.
|
||||
///
|
||||
/// # Errors
|
||||
///
|
||||
/// Returns a `TesseractError::MutexLockError` if either mutex lock fails.
|
||||
pub fn add_image(&self, api: &TesseractAPI) -> Result<bool> {
|
||||
let api_handle = api.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
|
||||
let handle = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
|
||||
Ok(unsafe { TessResultRendererAddImage(*handle, *api_handle) != 0 })
|
||||
}
|
||||
|
||||
/// Ends the document.
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// Returns `true` if the document was ended successfully, otherwise returns `false`.
|
||||
///
|
||||
/// # Errors
|
||||
///
|
||||
/// Returns a `TesseractError::MutexLockError` if the mutex lock fails.
|
||||
pub fn end_document(&self) -> Result<bool> {
|
||||
let handle = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
|
||||
Ok(unsafe { TessResultRendererEndDocument(*handle) != 0 })
|
||||
}
|
||||
|
||||
/// Gets the extension of the document.
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// Returns the extension as a `String` if successful, otherwise returns an error.
|
||||
///
|
||||
/// # Errors
|
||||
///
|
||||
/// Returns a `TesseractError::MutexLockError` if the mutex lock fails,
|
||||
/// `TesseractError::NullPointerError` if the extension pointer is null,
|
||||
/// or `TesseractError::Utf8Error` if the extension contains invalid UTF-8.
|
||||
pub fn get_extension(&self) -> Result<String> {
|
||||
let handle = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
|
||||
let ext_ptr = unsafe { TessResultRendererExtention(*handle) };
|
||||
if ext_ptr.is_null() {
|
||||
Err(TesseractError::NullPointerError)
|
||||
} else {
|
||||
let c_str = unsafe { CStr::from_ptr(ext_ptr) };
|
||||
Ok(c_str.to_str()?.to_owned())
|
||||
}
|
||||
}
|
||||
|
||||
/// Gets the title of the document.
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// Returns the title as a `String` if successful, otherwise returns an error.
|
||||
///
|
||||
/// # Errors
|
||||
///
|
||||
/// Returns a `TesseractError::MutexLockError` if the mutex lock fails,
|
||||
/// `TesseractError::NullPointerError` if the title pointer is null,
|
||||
/// or `TesseractError::Utf8Error` if the title contains invalid UTF-8.
|
||||
pub fn get_title(&self) -> Result<String> {
|
||||
let handle = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
|
||||
let title_ptr = unsafe { TessResultRendererTitle(*handle) };
|
||||
if title_ptr.is_null() {
|
||||
Err(TesseractError::NullPointerError)
|
||||
} else {
|
||||
let c_str = unsafe { CStr::from_ptr(title_ptr) };
|
||||
Ok(c_str.to_str()?.to_owned())
|
||||
}
|
||||
}
|
||||
|
||||
/// Gets the number of images in the document.
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// Returns the number of images as an `i32`.
|
||||
///
|
||||
/// # Errors
|
||||
///
|
||||
/// Returns a `TesseractError::MutexLockError` if the mutex lock fails.
|
||||
pub fn get_image_num(&self) -> Result<i32> {
|
||||
let handle = self.handle.lock().map_err(|_| TesseractError::MutexLockError)?;
|
||||
Ok(unsafe { TessResultRendererImageNum(*handle) })
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for TessResultRenderer {
|
||||
fn drop(&mut self) {
|
||||
if let Ok(handle) = self.handle.lock() {
|
||||
unsafe { TessDeleteResultRenderer(*handle) };
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Native entry points used by `TessResultRenderer`.
ffi_extern! {
    // Construction and destruction.
    pub fn TessTextRendererCreate(outputbase: *const c_char) -> *mut c_void;
    pub fn TessHOcrRendererCreate(outputbase: *const c_char) -> *mut c_void;
    pub fn TessPDFRendererCreate(outputbase: *const c_char, datadir: *const c_char, textonly: c_int) -> *mut c_void;
    pub fn TessDeleteResultRenderer(renderer: *mut c_void);

    // Document life cycle.
    pub fn TessResultRendererBeginDocument(renderer: *mut c_void, title: *const c_char) -> c_int;
    pub fn TessResultRendererAddImage(renderer: *mut c_void, api: *mut c_void) -> c_int;
    pub fn TessResultRendererEndDocument(renderer: *mut c_void) -> c_int;

    // Read-only queries. "Extention" is the spelling of the symbol being linked against.
    pub fn TessResultRendererExtention(renderer: *mut c_void) -> *const c_char;
    pub fn TessResultRendererTitle(renderer: *mut c_void) -> *const c_char;
    pub fn TessResultRendererImageNum(renderer: *mut c_void) -> c_int;
}
|
||||
211
crates/kreuzberg-tesseract/tests/integration_test.rs
Normal file
211
crates/kreuzberg-tesseract/tests/integration_test.rs
Normal file
@@ -0,0 +1,211 @@
|
||||
use kreuzberg_tesseract::TesseractAPI;
|
||||
use std::path::{Path, PathBuf};
|
||||
|
||||
/// Picks the tessdata directory used when `TESSDATA_PREFIX` is not set.
///
/// * macOS: `$HOME/Library/Application Support/kreuzberg-tesseract/tessdata`
/// * Linux: the first existing system directory from a fixed list, otherwise
///   `$HOME/.kreuzberg-tesseract/tessdata`
/// * Windows: `%APPDATA%/kreuzberg-tesseract/tessdata`
///
/// # Panics
///
/// Panics if the required environment variable is missing, or on any other
/// operating system.
fn get_default_tessdata_dir() -> PathBuf {
    if cfg!(target_os = "macos") {
        let home = std::env::var("HOME").expect("HOME environment variable not set");
        return Path::new(&home)
            .join("Library")
            .join("Application Support")
            .join("kreuzberg-tesseract")
            .join("tessdata");
    }

    if cfg!(target_os = "linux") {
        // Prefer a system-wide installation, checked in this order.
        let installed = [
            "/usr/share/tesseract-ocr/5/tessdata",
            "/usr/share/tesseract-ocr/tessdata",
        ];
        if let Some(found) = installed.iter().map(Path::new).find(|dir| dir.exists()) {
            return found.to_path_buf();
        }
        let home = std::env::var("HOME").expect("HOME environment variable not set");
        return Path::new(&home).join(".kreuzberg-tesseract").join("tessdata");
    }

    if cfg!(target_os = "windows") {
        let app_data = std::env::var("APPDATA").expect("APPDATA environment variable not set");
        return PathBuf::from(app_data).join("kreuzberg-tesseract").join("tessdata");
    }

    panic!("Unsupported operating system");
}
|
||||
|
||||
fn get_tessdata_dir() -> PathBuf {
|
||||
match std::env::var("TESSDATA_PREFIX") {
|
||||
Ok(dir) => {
|
||||
let prefix_path = PathBuf::from(dir);
|
||||
let tessdata_path = if prefix_path.ends_with("tessdata") {
|
||||
prefix_path
|
||||
} else {
|
||||
prefix_path.join("tessdata")
|
||||
};
|
||||
println!("Using TESSDATA_PREFIX directory: {:?}", tessdata_path);
|
||||
tessdata_path
|
||||
}
|
||||
Err(_) => {
|
||||
let default_dir = get_default_tessdata_dir();
|
||||
println!("TESSDATA_PREFIX not set, using default directory: {:?}", default_dir);
|
||||
default_dir
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Fails the calling test early when the English model is missing.
///
/// # Panics
///
/// Panics if `eng.traineddata` does not exist inside `tessdata_dir`.
fn ensure_eng_traineddata_exists(tessdata_dir: &Path) {
    let model_file = tessdata_dir.join("eng.traineddata");
    if !model_file.exists() {
        panic!(
            "eng.traineddata not found in {}. Set TESSDATA_PREFIX or install English tessdata.",
            tessdata_dir.display()
        );
    }
}
|
||||
|
||||
fn repo_root() -> PathBuf {
|
||||
PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("..").join("..")
|
||||
}
|
||||
|
||||
fn load_test_image(relative: &str) -> Result<(Vec<u8>, u32, u32), Box<dyn std::error::Error>> {
|
||||
let mut path = repo_root();
|
||||
path.push("test_documents");
|
||||
path.push(relative);
|
||||
|
||||
let img = image::open(&path)
|
||||
.map_err(|e| format!("Failed to open test image {}: {}", path.display(), e))?
|
||||
.to_rgb8();
|
||||
let (width, height) = img.dimensions();
|
||||
Ok((img.into_raw(), width, height))
|
||||
}
|
||||
|
||||
/// OCR on the hello-world fixture must produce text containing "hello" (case-insensitive).
#[test]
fn test_ocr_on_hello_world_image() {
    let tessdata = get_tessdata_dir();
    ensure_eng_traineddata_exists(&tessdata);

    let engine = TesseractAPI::new().expect("Failed to create TesseractAPI");
    engine
        .init(tessdata.to_str().unwrap(), "eng")
        .expect("Failed to initialize Tesseract");

    let (pixels, width, height) =
        load_test_image("images/test_hello_world.png").expect("Failed to load test image");
    // Three bytes per pixel, so one row is 3 * width bytes.
    let (w, h) = (width as i32, height as i32);
    engine.set_image(&pixels, w, h, 3, 3 * w).expect("Failed to set image");

    let recognized = engine.get_utf8_text().expect("Failed to perform OCR");
    let found_hello = recognized.to_lowercase().contains("hello");
    assert!(found_hello, "Text does not contain expected word. Found: {}", recognized);
}
|
||||
|
||||
/// OCR on the simple-table fixture must find both "product" and "price" (case-insensitive).
#[test]
fn test_ocr_on_table_image() {
    let tessdata = get_tessdata_dir();
    ensure_eng_traineddata_exists(&tessdata);

    let engine = TesseractAPI::new().expect("Failed to create TesseractAPI");
    engine
        .init(tessdata.to_str().unwrap(), "eng")
        .expect("Failed to initialize Tesseract");
    engine
        .set_variable("tessedit_pageseg_mode", "1")
        .expect("Failed to set PSM");

    let (pixels, width, height) = load_test_image("images/simple_table.png").expect("Failed to load test image");
    // Three bytes per pixel, so one row is 3 * width bytes.
    let (w, h) = (width as i32, height as i32);
    engine.set_image(&pixels, w, h, 3, 3 * w).expect("Failed to set image");

    let recognized = engine.get_utf8_text().expect("Failed to perform OCR");
    let folded = recognized.to_lowercase();
    let has_all = ["product", "price"].iter().all(|word| folded.contains(word));
    assert!(has_all, "Table text missing expected words. Found: {}", recognized);
}
|
||||
|
||||
/// Initialising with a language that has no traineddata must fail.
#[test]
fn test_invalid_language_code() {
    let tessdata = get_tessdata_dir();
    ensure_eng_traineddata_exists(&tessdata);

    let engine = TesseractAPI::new().expect("Failed to create TesseractAPI");

    let outcome = engine.init(tessdata.to_str().unwrap(), "invalid_lang");
    assert!(outcome.is_err());
}
|
||||
|
||||
/// Setting an empty pixel buffer with non-zero dimensions must be rejected.
#[test]
fn test_empty_image_data() {
    let tessdata = get_tessdata_dir();
    ensure_eng_traineddata_exists(&tessdata);

    let engine = TesseractAPI::new().expect("Failed to create TesseractAPI");
    engine
        .init(tessdata.to_str().unwrap(), "eng")
        .expect("Failed to initialize Tesseract");

    let no_pixels = Vec::<u8>::new();
    let outcome = engine.set_image(&no_pixels, 100, 100, 3, 300);
    assert!(outcome.is_err());
}
|
||||
|
||||
/// Feeds `set_image` a valid pixel buffer together with one invalid argument
/// at a time and expects every such call to return an error.
#[test]
fn test_invalid_image_parameters() {
    let data_dir = get_tessdata_dir();
    ensure_eng_traineddata_exists(&data_dir);

    let api = TesseractAPI::new().expect("Failed to create TesseractAPI");
    let data_path = data_dir.to_str().unwrap();
    api.init(data_path, "eng").expect("Failed to initialize Tesseract");

    let (pixels, width, height) =
        load_test_image("images/test_hello_world.png").expect("Failed to load test image");
    let (width, height) = (width as i32, height as i32);
    let stride = 3 * width;

    // Each row lists the four arguments that follow the pixel buffer; exactly
    // one of them deviates from the valid call `(width, height, 3, 3 * width)`.
    let rejected_arguments = [
        (-1, height, 3, stride),    // width of -1
        (width, 0, 3, stride),      // height of 0
        (width, height, 0, stride), // fourth argument of 0
        (width, height, 3, width),  // last argument of `width` instead of `3 * width`
    ];

    for (w, h, fourth, last) in rejected_arguments {
        let res = api.set_image(&pixels, w, h, fourth, last);
        assert!(res.is_err());
    }
}
|
||||
|
||||
/// Checks which `set_variable` calls succeed: an unknown variable name is
/// rejected, while an empty `tessedit_char_whitelist` and the value `"1"` for
/// `tessedit_pageseg_mode` and `tessedit_ocr_engine_mode` are accepted.
#[test]
fn test_variable_setting() {
    let data_dir = get_tessdata_dir();
    ensure_eng_traineddata_exists(&data_dir);

    let api = TesseractAPI::new().expect("Failed to create TesseractAPI");
    let data_path = data_dir.to_str().unwrap();
    api.init(data_path, "eng").expect("Failed to initialize Tesseract");

    // A name that is not a known variable must be refused.
    let res = api.set_variable("invalid_variable_name", "1");
    assert!(res.is_err());

    // An empty value for an existing variable is expected to be accepted.
    let res = api.set_variable("tessedit_char_whitelist", "");
    assert!(res.is_ok());

    // Existing variables with the value "1".
    assert!(api.set_variable("tessedit_pageseg_mode", "1").is_ok());
    assert!(api.set_variable("tessedit_ocr_engine_mode", "1").is_ok());
}
|
||||
|
||||
/// Sets the same image and extracts text three times in a row on a single
/// API instance, expecting every round to succeed with non-empty text.
#[test]
fn test_multiple_operations() {
    let data_dir = get_tessdata_dir();
    ensure_eng_traineddata_exists(&data_dir);

    let api = TesseractAPI::new().expect("Failed to create TesseractAPI");
    let data_path = data_dir.to_str().unwrap();
    api.init(data_path, "eng").expect("Failed to initialize Tesseract");

    let (pixels, width, height) =
        load_test_image("images/test_hello_world.png").expect("Failed to load test image");
    // These values are identical for every round, so compute them once.
    let (width, height) = (width as i32, height as i32);
    let stride = 3 * width;

    for _round in 0..3 {
        let res = api.set_image(&pixels, width, height, 3, stride);
        assert!(res.is_ok());

        let text = api.get_utf8_text().expect("Failed to perform OCR");
        assert!(!text.is_empty());
    }
}
|
||||
Reference in New Issue
Block a user