This commit is contained in:
74
crates/kreuzberg-tesseract/patches/README.md
Normal file
74
crates/kreuzberg-tesseract/patches/README.md
Normal file
@@ -0,0 +1,74 @@
|
||||
# Tesseract WASM Patches
|
||||
|
||||
This directory contains patches needed to compile Tesseract for WebAssembly (WASM) targets using the WASI SDK.
|
||||
|
||||
These patches are vendored from the [tesseract-wasm](https://github.com/robertknight/tesseract-wasm) project and have been proven to work with WASM compilation.
|
||||
|
||||
## Patches
|
||||
|
||||
### tesseract.diff
|
||||
|
||||
A comprehensive patch that makes Tesseract compatible with WASM compilation. The patch includes the following changes:
|
||||
|
||||
#### 1. CMakeLists.txt Modifications
|
||||
|
||||
- **New CMake option**: `BUILD_TESSERACT_BINARY` (default: ON)
|
||||
- Allows disabling the Tesseract CLI binary build, which is not needed for WASM
|
||||
- Wraps all executable and installation targets for the tesseract binary
|
||||
|
||||
- **Disabled components for WASM**:
|
||||
- Removes OpenCL support (`src/opencl/*.cpp`) - not applicable to WASM
|
||||
- Removes viewer support (`src/viewer/*.cpp`) - UI components not needed for WASM
|
||||
- Removes C API bindings (`src/api/capi.cpp`); of the `src/api` renderer sources, only `src/api/hocrrenderer.cpp` is kept
|
||||
- Removes PDF and rendering support files:
|
||||
- `src/api/renderer.cpp`
|
||||
- `src/api/altorenderer.cpp`
|
||||
- `src/api/lstmboxrenderer.cpp`
|
||||
- `src/api/pdfrenderer.cpp`
|
||||
- `src/api/wordstrboxrenderer.cpp`
|
||||
|
||||
#### 2. SIMD Detection Fixes (src/arch/simddetect.cpp)
|
||||
|
||||
- Guards CPUID detection with `#if !defined(__wasm__)`
|
||||
- Prevents attempts to use CPU feature detection that doesn't exist in WASM
|
||||
- The HAS_CPUID macro is only defined for non-WASM builds
|
||||
- This allows the code to gracefully handle WASM's SIMD limitations
|
||||
|
||||
#### 3. Pointer Type Fixes (src/ccmain/tesseractclass.h, src/ccmain/tesseractclass.cpp, src/ccmain/pagesegmain.cpp)
|
||||
|
||||
**Changed from stack allocation to heap allocation** in `tesseractclass.h`:
|
||||
|
||||
- `pixa_debug_` changed from `DebugPixa` to `std::unique_ptr<DebugPixa>`
|
||||
- This prevents large allocations on the stack, which is limited in WASM
|
||||
|
||||
**Updated all references** throughout the codebase:
|
||||
|
||||
- `.get()` calls added where raw pointers are needed
|
||||
- Arrow operator `->` replaces dot operator `.` for member access
|
||||
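- Null checks added before most dereferences to prevent crashes; note that the `NoImages` `AddPix` call in `Tesseract::SetupPageSegAndDetectOrientation()` is still guarded only by `tessedit_dump_pageseg_images`, not by a null check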
|
||||
|
||||
**Affected functions**:
|
||||
|
||||
- `PageIterator::Orientation()` - a null `up_in_image` vector is now treated as page up (a comparison change, not a pointer update)
|
||||
- `Tesseract::AutoPageSeg()` - updated pointer passing
|
||||
- `Tesseract::SetupPageSegAndDetectOrientation()` - multiple pointer updates
|
||||
- `Tesseract::Clear()` - added null check before WritePDF
|
||||
- `Tesseract::PrepareForPageseg()` - updated Split() calls
|
||||
- `Tesseract::PrepareForTessOCR()` - updated Split() calls
|
||||
|
||||
#### 4. Additional Fixes
|
||||
|
||||
- **Orientation detection**: Changed comparison from `> 0.0F` to `>= 0.0F` in `pageiterator.cpp` to handle null vectors gracefully when orientation info is not available
|
||||
|
||||
## How to Apply
|
||||
|
||||
These patches are applied during the WASM build process. They modify the Tesseract source code to:
|
||||
|
||||
1. Disable features that are WASM-incompatible or not needed (OpenCL, viewer, C API, and all renderers except hOCR)
|
||||
2. Prevent CPUID detection in WASM environment
|
||||
3. Use heap allocation instead of stack allocation for large objects
|
||||
4. Handle missing pointer initialization gracefully
|
||||
|
||||
## Source
|
||||
|
||||
These patches are based on the proven WASM compilation approach used by the tesseract-wasm project, which successfully compiles Tesseract to WebAssembly and deploys it in production environments.
|
||||
199
crates/kreuzberg-tesseract/patches/tesseract.diff
Normal file
199
crates/kreuzberg-tesseract/patches/tesseract.diff
Normal file
@@ -0,0 +1,199 @@
|
||||
diff --git a/CMakeLists.txt b/CMakeLists.txt
|
||||
index 8c6845cb..fdcfc4a8 100644
|
||||
--- a/CMakeLists.txt
|
||||
+++ b/CMakeLists.txt
|
||||
@@ -90,6 +90,7 @@ option(ENABLE_LTO "Enable link-time optimization" OFF)
|
||||
option(FAST_FLOAT "Enable float for LSTM" ON)
|
||||
option(ENABLE_OPENCL "Enable unsupported experimental OpenCL support" OFF)
|
||||
option(BUILD_TRAINING_TOOLS "Build training tools" ON)
|
||||
+option(BUILD_TESSERACT_BINARY "Build Tesseract binary" ON)
|
||||
option(BUILD_TESTS "Build tests" OFF)
|
||||
option(USE_SYSTEM_ICU "Use system ICU" OFF)
|
||||
option(DISABLE_ARCHIVE "Disable build with libarchive (if available)" OFF)
|
||||
@@ -565,9 +566,7 @@ file(
|
||||
src/cutil/*.cpp
|
||||
src/dict/*.cpp
|
||||
src/lstm/*.cpp
|
||||
- src/opencl/*.cpp
|
||||
src/textord/*.cpp
|
||||
- src/viewer/*.cpp
|
||||
src/wordrec/*.cpp)
|
||||
|
||||
if(DISABLED_LEGACY_ENGINE)
|
||||
@@ -714,13 +713,7 @@ file(
|
||||
set(TESSERACT_SRC
|
||||
${TESSERACT_SRC}
|
||||
src/api/baseapi.cpp
|
||||
- src/api/capi.cpp
|
||||
- src/api/renderer.cpp
|
||||
- src/api/altorenderer.cpp
|
||||
- src/api/hocrrenderer.cpp
|
||||
- src/api/lstmboxrenderer.cpp
|
||||
- src/api/pdfrenderer.cpp
|
||||
- src/api/wordstrboxrenderer.cpp)
|
||||
+ src/api/hocrrenderer.cpp)
|
||||
|
||||
set(TESSERACT_CONFIGS
|
||||
tessdata/configs/alto
|
||||
@@ -858,14 +851,16 @@ endif()
|
||||
# EXECUTABLE tesseract
|
||||
# ##############################################################################
|
||||
|
||||
-add_executable(tesseract src/tesseract.cpp)
|
||||
-target_link_libraries(tesseract libtesseract)
|
||||
-if(HAVE_TIFFIO_H AND WIN32)
|
||||
- target_link_libraries(tesseract ${TIFF_LIBRARIES})
|
||||
-endif()
|
||||
+if(BUILD_TESSERACT_BINARY)
|
||||
+ add_executable(tesseract src/tesseract.cpp)
|
||||
+ target_link_libraries(tesseract libtesseract)
|
||||
+ if(HAVE_TIFFIO_H AND WIN32)
|
||||
+ target_link_libraries(tesseract ${TIFF_LIBRARIES})
|
||||
+ endif()
|
||||
|
||||
-if(OPENMP_BUILD AND UNIX)
|
||||
- target_link_libraries(tesseract pthread)
|
||||
+ if(OPENMP_BUILD AND UNIX)
|
||||
+ target_link_libraries(tesseract pthread)
|
||||
+ endif()
|
||||
endif()
|
||||
|
||||
# ##############################################################################
|
||||
@@ -899,7 +894,11 @@ write_basic_package_version_file(
|
||||
|
||||
install(FILES ${CMAKE_CURRENT_BINARY_DIR}/tesseract.pc
|
||||
DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig)
|
||||
-install(TARGETS tesseract DESTINATION bin)
|
||||
+
|
||||
+if(BUILD_TESSERACT_BINARY)
|
||||
+ install(TARGETS tesseract DESTINATION bin)
|
||||
+endif()
|
||||
+
|
||||
install(
|
||||
TARGETS libtesseract
|
||||
EXPORT TesseractTargets
|
||||
diff --git a/src/arch/simddetect.cpp b/src/arch/simddetect.cpp
|
||||
index 1afe5a5d..cb8c6d4c 100644
|
||||
--- a/src/arch/simddetect.cpp
|
||||
+++ b/src/arch/simddetect.cpp
|
||||
@@ -40,10 +40,12 @@
|
||||
|
||||
#endif
|
||||
|
||||
+#if !defined(__wasm__)
|
||||
#if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_FMA) || defined(HAVE_SSE4_1)
|
||||
// See https://en.wikipedia.org/wiki/CPUID.
|
||||
# define HAS_CPUID
|
||||
#endif
|
||||
+#endif
|
||||
|
||||
#if defined(HAS_CPUID)
|
||||
# if defined(__GNUC__)
|
||||
diff --git a/src/ccmain/pageiterator.cpp b/src/ccmain/pageiterator.cpp
|
||||
index 64ff7f66..c0f80e5f 100644
|
||||
--- a/src/ccmain/pageiterator.cpp
|
||||
+++ b/src/ccmain/pageiterator.cpp
|
||||
@@ -582,7 +582,9 @@ void PageIterator::Orientation(tesseract::Orientation *orientation,
|
||||
up_in_image.rotate(block->re_rotation());
|
||||
|
||||
if (up_in_image.x() == 0.0F) {
|
||||
- if (up_in_image.y() > 0.0F) {
|
||||
+ // tesseract-wasm note: `up_in_image` will be a null vector if orientation
|
||||
+ // info is not available. In that case, assume page up.
|
||||
+ if (up_in_image.y() >= 0.0F) {
|
||||
*orientation = ORIENTATION_PAGE_UP;
|
||||
} else {
|
||||
*orientation = ORIENTATION_PAGE_DOWN;
|
||||
diff --git a/src/ccmain/pagesegmain.cpp b/src/ccmain/pagesegmain.cpp
|
||||
index 0af44607..718e73ef 100644
|
||||
--- a/src/ccmain/pagesegmain.cpp
|
||||
+++ b/src/ccmain/pagesegmain.cpp
|
||||
@@ -222,7 +222,7 @@ int Tesseract::AutoPageSeg(PageSegMode pageseg_mode, BLOCK_LIST *blocks, TO_BLOC
|
||||
}
|
||||
#endif // ndef DISABLED_LEGACY_ENGINE
|
||||
result = finder->FindBlocks(pageseg_mode, scaled_color_, scaled_factor_, to_block,
|
||||
- photomask_pix, pix_thresholds_, pix_grey_, &pixa_debug_,
|
||||
+ photomask_pix, pix_thresholds_, pix_grey_, pixa_debug_.get(),
|
||||
&found_blocks, diacritic_blobs, to_blocks);
|
||||
if (result >= 0) {
|
||||
finder->GetDeskewVectors(&deskew_, &reskew_);
|
||||
@@ -279,17 +279,17 @@ ColumnFinder *Tesseract::SetupPageSegAndDetectOrientation(PageSegMode pageseg_mo
|
||||
ICOORD bleft(0, 0);
|
||||
|
||||
ASSERT_HOST(pix_binary_ != nullptr);
|
||||
- if (tessedit_dump_pageseg_images) {
|
||||
- pixa_debug_.AddPix(pix_binary_, "PageSegInput");
|
||||
+ if (tessedit_dump_pageseg_images && pixa_debug_) {
|
||||
+ pixa_debug_->AddPix(pix_binary_, "PageSegInput");
|
||||
}
|
||||
// Leptonica is used to find the rule/separator lines in the input.
|
||||
LineFinder::FindAndRemoveLines(source_resolution_, textord_tabfind_show_vlines, pix_binary_,
|
||||
&vertical_x, &vertical_y, music_mask_pix, &v_lines, &h_lines);
|
||||
- if (tessedit_dump_pageseg_images) {
|
||||
- pixa_debug_.AddPix(pix_binary_, "NoLines");
|
||||
+ if (tessedit_dump_pageseg_images && pixa_debug_) {
|
||||
+ pixa_debug_->AddPix(pix_binary_, "NoLines");
|
||||
}
|
||||
// Leptonica is used to find a mask of the photo regions in the input.
|
||||
- *photo_mask_pix = ImageFind::FindImages(pix_binary_, &pixa_debug_);
|
||||
+ *photo_mask_pix = ImageFind::FindImages(pix_binary_, pixa_debug_.get());
|
||||
if (tessedit_dump_pageseg_images) {
|
||||
Image pix_no_image_ = nullptr;
|
||||
if (*photo_mask_pix != nullptr) {
|
||||
@@ -297,7 +297,7 @@ ColumnFinder *Tesseract::SetupPageSegAndDetectOrientation(PageSegMode pageseg_mo
|
||||
} else {
|
||||
pix_no_image_ = pix_binary_.clone();
|
||||
}
|
||||
- pixa_debug_.AddPix(pix_no_image_, "NoImages");
|
||||
+ pixa_debug_->AddPix(pix_no_image_, "NoImages");
|
||||
pix_no_image_.destroy();
|
||||
}
|
||||
if (!PSM_COL_FIND_ENABLED(pageseg_mode)) {
|
||||
diff --git a/src/ccmain/tesseractclass.cpp b/src/ccmain/tesseractclass.cpp
|
||||
index fd58ac87..517f925e 100644
|
||||
--- a/src/ccmain/tesseractclass.cpp
|
||||
+++ b/src/ccmain/tesseractclass.cpp
|
||||
@@ -487,8 +487,10 @@ Dict &Tesseract::getDict() {
|
||||
}
|
||||
|
||||
void Tesseract::Clear() {
|
||||
- std::string debug_name = imagebasename + "_debug.pdf";
|
||||
- pixa_debug_.WritePDF(debug_name.c_str());
|
||||
+ if (pixa_debug_) {
|
||||
+ std::string debug_name = imagebasename + "_debug.pdf";
|
||||
+ pixa_debug_->WritePDF(debug_name.c_str());
|
||||
+ }
|
||||
pix_binary_.destroy();
|
||||
pix_grey_.destroy();
|
||||
pix_thresholds_.destroy();
|
||||
@@ -572,7 +574,7 @@ void Tesseract::PrepareForPageseg() {
|
||||
// the newly split image.
|
||||
splitter_.set_orig_pix(pix_binary());
|
||||
splitter_.set_pageseg_split_strategy(max_pageseg_strategy);
|
||||
- if (splitter_.Split(true, &pixa_debug_)) {
|
||||
+ if (splitter_.Split(true, pixa_debug_.get())) {
|
||||
ASSERT_HOST(splitter_.splitted_image());
|
||||
pix_binary_.destroy();
|
||||
pix_binary_ = splitter_.splitted_image().clone();
|
||||
@@ -599,7 +601,7 @@ void Tesseract::PrepareForTessOCR(BLOCK_LIST *block_list, Tesseract *osd_tess, O
|
||||
splitter_.set_segmentation_block_list(block_list);
|
||||
splitter_.set_ocr_split_strategy(max_ocr_strategy);
|
||||
// Run the splitter for OCR
|
||||
- bool split_for_ocr = splitter_.Split(false, &pixa_debug_);
|
||||
+ bool split_for_ocr = splitter_.Split(false, pixa_debug_.get());
|
||||
// Restore pix_binary to the binarized original pix for future reference.
|
||||
ASSERT_HOST(splitter_.orig_pix());
|
||||
pix_binary_.destroy();
|
||||
diff --git a/src/ccmain/tesseractclass.h b/src/ccmain/tesseractclass.h
|
||||
index 732bb9e6..030aa5bc 100644
|
||||
--- a/src/ccmain/tesseractclass.h
|
||||
+++ b/src/ccmain/tesseractclass.h
|
||||
@@ -986,7 +986,7 @@ private:
|
||||
// Thresholds that were used to generate the thresholded image from grey.
|
||||
Image pix_thresholds_;
|
||||
// Debug images. If non-empty, will be written on destruction.
|
||||
- DebugPixa pixa_debug_;
|
||||
+ std::unique_ptr<DebugPixa> pixa_debug_;
|
||||
// Input image resolution after any scaling. The resolution is not well
|
||||
// transmitted by operations on Pix, so we keep an independent record here.
|
||||
int source_resolution_;
|
||||
Reference in New Issue
Block a user