2012 lines
91 KiB
Rust
2012 lines
91 KiB
Rust
|
|
#![allow(clippy::uninlined_format_args)]
|
||
|
|
|
||
|
|
#[cfg(any(feature = "build-tesseract", feature = "build-tesseract-wasm"))]
|
||
|
|
mod build_tesseract {
|
||
|
|
use cmake::Config;
|
||
|
|
use std::env;
|
||
|
|
use std::fs;
|
||
|
|
use std::path::{Path, PathBuf};
|
||
|
|
|
||
|
|
/// Leptonica release tag whose source archive `leptonica_url()` points at.
const LEPTONICA_VERSION: &str = "1.87.0";
/// Tesseract release tag whose source archive `tesseract_url()` points at.
const TESSERACT_VERSION: &str = "5.5.2";
|
||
|
|
|
||
|
|
fn leptonica_url() -> String {
|
||
|
|
format!(
|
||
|
|
"https://codeload.github.com/DanBloomberg/leptonica/zip/refs/tags/{}",
|
||
|
|
LEPTONICA_VERSION
|
||
|
|
)
|
||
|
|
}
|
||
|
|
|
||
|
|
fn tesseract_url() -> String {
|
||
|
|
format!(
|
||
|
|
"https://codeload.github.com/tesseract-ocr/tesseract/zip/refs/tags/{}",
|
||
|
|
TESSERACT_VERSION
|
||
|
|
)
|
||
|
|
}
|
||
|
|
|
||
|
|
/// Derives a cache directory from Cargo's `OUT_DIR`.
///
/// Strips the last four path components from `OUT_DIR` and appends
/// `kreuzberg-tesseract-cache`. Returns `None` when `OUT_DIR` is unset or the
/// path has fewer than four parent levels.
fn workspace_cache_dir_from_out_dir() -> Option<PathBuf> {
    let out_dir = PathBuf::from(env::var_os("OUT_DIR")?);
    // `ancestors()` yields the path itself first, so index 4 is four levels up.
    let cache_root = out_dir.ancestors().nth(4)?;
    Some(cache_root.join("kreuzberg-tesseract-cache"))
}
|
||
|
|
|
||
|
|
fn get_preferred_out_dir() -> PathBuf {
|
||
|
|
if let Ok(custom) = env::var("TESSERACT_RS_CACHE_DIR") {
|
||
|
|
return PathBuf::from(custom);
|
||
|
|
}
|
||
|
|
|
||
|
|
if cfg!(target_os = "windows") {
|
||
|
|
return PathBuf::from("C:\\tess");
|
||
|
|
}
|
||
|
|
|
||
|
|
if let Some(workspace_cache) = workspace_cache_dir_from_out_dir() {
|
||
|
|
return workspace_cache;
|
||
|
|
}
|
||
|
|
|
||
|
|
if cfg!(target_os = "macos") {
|
||
|
|
let home_dir = env::var("HOME").unwrap_or_else(|_| {
|
||
|
|
env::var("USER")
|
||
|
|
.map(|user| format!("/Users/{}", user))
|
||
|
|
.expect("Neither HOME nor USER environment variable set")
|
||
|
|
});
|
||
|
|
PathBuf::from(home_dir)
|
||
|
|
.join("Library")
|
||
|
|
.join("Application Support")
|
||
|
|
.join("kreuzberg-tesseract")
|
||
|
|
} else if cfg!(target_os = "linux") {
|
||
|
|
let home_dir = env::var("HOME").unwrap_or_else(|_| {
|
||
|
|
env::var("USER")
|
||
|
|
.map(|user| format!("/home/{}", user))
|
||
|
|
.expect("Neither HOME nor USER environment variable set")
|
||
|
|
});
|
||
|
|
PathBuf::from(home_dir).join(".kreuzberg-tesseract")
|
||
|
|
} else {
|
||
|
|
panic!("Unsupported operating system");
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
/// Returns the triple being compiled for: `TARGET` if set, otherwise `HOST`,
/// otherwise an empty string.
fn target_triple() -> String {
    env::var("TARGET")
        .or_else(|_| env::var("HOST"))
        .unwrap_or_default()
}
|
||
|
|
|
||
|
|
/// Plain substring test: does the triple `target` contain `needle`?
///
/// No component-wise parsing is done, so `needle` matches anywhere in the
/// triple.
fn target_matches(target: &str, needle: &str) -> bool {
    target.contains(needle)
}
|
||
|
|
|
||
|
|
fn is_windows_target(target: &str) -> bool {
|
||
|
|
target_matches(target, "windows")
|
||
|
|
}
|
||
|
|
|
||
|
|
fn is_macos_target(target: &str) -> bool {
|
||
|
|
target_matches(target, "apple-darwin")
|
||
|
|
}
|
||
|
|
|
||
|
|
fn is_linux_target(target: &str) -> bool {
|
||
|
|
target_matches(target, "linux")
|
||
|
|
}
|
||
|
|
|
||
|
|
fn is_msvc_target(target: &str) -> bool {
|
||
|
|
is_windows_target(target) && target_matches(target, "msvc")
|
||
|
|
}
|
||
|
|
|
||
|
|
fn is_mingw_target(target: &str) -> bool {
|
||
|
|
is_windows_target(target) && target_matches(target, "gnu")
|
||
|
|
}
|
||
|
|
|
||
|
|
fn is_wasm_target(target: &str) -> bool {
|
||
|
|
target_matches(target, "wasm32") || target_matches(target, "wasm64")
|
||
|
|
}
|
||
|
|
|
||
|
|
fn is_android_target(target: &str) -> bool {
|
||
|
|
target_matches(target, "android")
|
||
|
|
}
|
||
|
|
|
||
|
|
/// Translates a Rust Android target triple into the matching NDK ABI name.
///
/// The architecture is detected by substring: `aarch64` → `arm64-v8a`,
/// `x86_64` → `x86_64`, `i686` → `x86`. Every other triple falls back to
/// `armeabi-v7a`.
fn android_abi(target: &str) -> &'static str {
    // Checked in order; the first architecture found in the triple wins.
    const ARCH_TO_ABI: [(&str, &str); 3] = [
        ("aarch64", "arm64-v8a"),
        ("x86_64", "x86_64"),
        ("i686", "x86"),
    ];

    ARCH_TO_ABI
        .iter()
        .find(|&&(arch, _)| target.contains(arch))
        .map_or("armeabi-v7a", |&(_, abi)| abi)
}
|
||
|
|
|
||
|
|
/// Locates the API-versioned NDK `clang++` driver for an ABI.
///
/// The candidate path has the form
/// `{ndk_home}/toolchains/llvm/prebuilt/{host-tag}/bin/{triple}{api}-clang++`,
/// e.g. `.../darwin-x86_64/bin/aarch64-linux-android21-clang++`.
///
/// * `ndk_home` – root directory of the NDK installation.
/// * `abi` – NDK ABI name; anything other than `arm64-v8a`, `x86_64` or `x86`
///   is treated as 32-bit ARM.
/// * `api` – Android API level appended to the compiler triple.
///
/// Returns the first candidate that exists on disk, or `None`.
fn ndk_clangxx(ndk_home: &str, abi: &str, api: u32) -> Option<String> {
    // NDK ships darwin-x86_64 binaries even on Apple Silicon (runs via Rosetta),
    // so the x86_64 tag is probed first on both host families.
    let host_tags: [&str; 2] = if cfg!(target_os = "macos") {
        ["darwin-x86_64", "darwin-aarch64"]
    } else {
        ["linux-x86_64", "linux-aarch64"]
    };

    let triple_prefix = match abi {
        "arm64-v8a" => "aarch64-linux-android",
        "x86_64" => "x86_64-linux-android",
        "x86" => "i686-linux-android",
        _ => "armv7a-linux-androideabi",
    };
    let driver = format!("{}{}-clang++", triple_prefix, api);

    host_tags
        .iter()
        .map(|tag| format!("{}/toolchains/llvm/prebuilt/{}/bin/{}", ndk_home, tag, driver))
        .find(|candidate| Path::new(candidate).exists())
}
|
||
|
|
|
||
|
|
/// Detect whether the build is driven by cargo-zigbuild, which wraps the
/// C toolchain in a `zigcc`/`zigcxx` shim. zig's bundled libstdc++ has
/// `std::filesystem` inline (no standalone `libstdc++fs`) and its clang
/// splits `avx512f` from `evex512`, so tesseract's AVX512 intrinsics
/// fail to compile. Both workarounds below gate on this.
///
/// Returns `true` when any compiler/linker-related environment variable
/// (`CC`, `CXX`, `RUSTC_LINKER`, `CC_*`, `CXX_*`, `CARGO_TARGET_*_LINKER`)
/// has a value containing `zigcc`, `zigcxx` or `cargo-zigbuild`.
fn is_zigbuild() -> bool {
    const ZIG_MARKERS: [&str; 3] = ["zigcc", "zigcxx", "cargo-zigbuild"];

    // `env::vars()` panics as soon as it meets a variable whose name or value
    // is not valid Unicode, even an unrelated one. Iterate the raw OS strings
    // and convert lossily instead; the markers are plain ASCII, so the lossy
    // conversion cannot hide a match.
    env::vars_os().any(|(key, value)| {
        let key = key.to_string_lossy();
        // CC_* / CXX_* covers target-prefixed vars that cargo-zigbuild sets
        // when HOST == TARGET (e.g. `CC_x86_64_unknown_linux_gnu`).
        let key_relevant = matches!(key.as_ref(), "CC" | "CXX" | "RUSTC_LINKER")
            || key.starts_with("CC_")
            || key.starts_with("CXX_")
            || (key.starts_with("CARGO_TARGET_") && key.ends_with("_LINKER"));
        if !key_relevant {
            return false;
        }

        let value = value.to_string_lossy();
        ZIG_MARKERS.iter().any(|marker| value.contains(*marker))
    })
}
|
||
|
|
|
||
|
|
/// Picks the C++ compiler handed to CMake.
///
/// Lookup order (empty values are ignored, e.g. when CI "unsets" a variable
/// by exporting it as an empty string through `GITHUB_ENV`):
/// 1. `CXX` – explicit override.
/// 2. `CXX_{target}` with hyphens replaced by underscores
///    (e.g. `CXX_x86_64_unknown_linux_musl`).
/// 3. `fallback` (e.g. "clang++" or "g++").
fn resolve_cxx_compiler(target: &str, fallback: &str) -> String {
    // Reads a variable and treats "unset", "not Unicode" and "empty" alike.
    let non_empty_var = |name: &str| env::var(name).ok().filter(|value| !value.is_empty());

    non_empty_var("CXX")
        .or_else(|| {
            let target_env = target.replace('-', "_");
            non_empty_var(&format!("CXX_{target_env}"))
        })
        .unwrap_or_else(|| fallback.to_string())
}
|
||
|
|
|
||
|
|
/// Resolve a MinGW compiler to an absolute path.
///
/// On Windows CI runners (GitHub Actions), both MSVC and MinGW toolchains
/// are present. CMake may pick up MSVC's cl.exe even when
/// `CMAKE_CXX_COMPILER=g++` is set, producing MSVC-ABI objects that
/// MinGW's linker cannot link. Using the absolute path prevents this.
///
/// Search order:
/// 1. The `CXX` env var (when `name` contains `++`) or the `CC` env var
///    (otherwise), accepted only if it is an absolute path that exists.
///    The value is returned unchanged.
/// 2. `C:\msys64\{ucrt64,mingw64,clang64,usr}\bin\{name}.exe`, returned with
///    forward slashes.
/// 3. The bare `name` (rely on PATH); a `cargo:warning` is emitted.
fn resolve_mingw_compiler(name: &str) -> String {
    // 1. Environment override.
    let env_key = match name.contains("++") {
        true => "CXX",
        false => "CC",
    };
    if let Some(configured) = env::var(env_key).ok().filter(|value| !value.is_empty()) {
        let usable = {
            let configured_path = Path::new(&configured);
            configured_path.is_absolute() && configured_path.exists()
        };
        if usable {
            return configured;
        }
    }

    // 2. Well-known MSYS2 subsystem directories, in preference order.
    let exe_name = format!("{}.exe", name);
    let located = ["ucrt64", "mingw64", "clang64", "usr"]
        .iter()
        .map(|subsystem| Path::new(r"C:\msys64").join(subsystem).join("bin").join(&exe_name))
        .find(|candidate| candidate.exists());

    match located {
        Some(candidate) => {
            let path = candidate.to_string_lossy().replace('\\', "/");
            eprintln!("Resolved MinGW {} to {}", name, path);
            path
        }
        // 3. Nothing found: let PATH lookup decide.
        None => {
            println!(
                "cargo:warning=Could not resolve absolute path for MinGW {}, using bare name",
                name
            );
            name.to_string()
        }
    }
}
|
||
|
|
|
||
|
|
/// Create a g++ wrapper script for musl cross-compilation.
///
/// When cross-compiling from a glibc host to a musl target, plain g++ picks up
/// glibc C headers, producing objects with glibc-versioned symbols (e.g.
/// `__isoc23_sscanf@@GLIBC_2.38`) incompatible with musl linking.
///
/// The generated script (`{OUT_DIR}/musl-g++.sh`, mode 0755) runs `g++` with
/// musl's C header directory (`/usr/include/{arch}-linux-musl`) prepended via
/// `-isystem` so that musl's headers shadow glibc's. Unlike libc++ (which uses
/// wrapper `<stddef.h>` etc. with `#include_next`), libstdc++ includes C
/// headers directly from `<cstdlib>` etc., so `-isystem` shadowing works
/// correctly without `-nostdinc`.
///
/// Some glibc-specific C++ platform headers (e.g. `os_defines.h`,
/// `libc-header-start.h`, `floatn.h`) still get picked up from gcc's built-in
/// include paths. They use `__GLIBC_PREREQ()` and `__GLIBC_USE()`, which musl
/// does not define, so the script defines those (and `__GNUC_PREREQ`) as
/// macros evaluating to 0 and the glibc-guarded code paths are skipped.
///
/// Returns the script path, or `None` when the target is not musl, the host
/// is itself musl, the musl include directory is missing, or the script could
/// not be written or made executable.
///
/// # Panics
/// Panics if `OUT_DIR` is not set once a wrapper is actually required.
#[cfg(unix)]
fn create_musl_cxx_wrapper(target: &str) -> Option<String> {
    use std::os::unix::fs::PermissionsExt;

    // Only needed for cross-compilation from glibc host to musl target
    let host = env::var("HOST").unwrap_or_default();
    let cross_to_musl = target.contains("musl") && !host.contains("musl");
    if !cross_to_musl {
        return None;
    }

    // Detect musl include directory: /usr/include/{arch}-linux-musl
    let arch = target.split('-').next().unwrap_or("x86_64");
    let musl_include = format!("/usr/include/{arch}-linux-musl");
    if !Path::new(&musl_include).exists() {
        eprintln!("musl include dir not found at {musl_include}, skipping wrapper");
        return None;
    }

    // One entry per line of the generated shell script.
    let script = format!(
        concat!(
            "#!/bin/sh\n",
            "# Auto-generated musl-g++ wrapper for cross-compilation.\n",
            "# Prepends musl C headers so they shadow glibc's.\n",
            "# Defines glibc compat macros as 0 for musl -- handles os_defines.h,\n",
            "# libc-header-start.h, floatn.h etc. that use __GLIBC_PREREQ().\n",
            "# Also defines __GNUC_PREREQ for floatn.h which checks compiler version.\n",
            "exec g++ -isystem \"{musl_include}\" \\\n",
            "'-D__GLIBC_PREREQ(maj,min)=0' \\\n",
            "'-D__GLIBC_USE(F)=0' \\\n",
            "'-D__GNUC_PREREQ(maj,min)=0' \\\n",
            "\"$@\"\n",
        ),
        musl_include = musl_include
    );

    // Write wrapper script to OUT_DIR and mark it executable; any I/O failure
    // simply means "no wrapper".
    let out_dir = env::var("OUT_DIR").unwrap();
    let wrapper_path = format!("{out_dir}/musl-g++.sh");
    fs::write(&wrapper_path, &script)
        .and_then(|()| fs::set_permissions(&wrapper_path, fs::Permissions::from_mode(0o755)))
        .ok()?;

    eprintln!("Created musl g++ wrapper at {wrapper_path} (musl headers: {musl_include})");
    Some(wrapper_path)
}

/// Non-Unix hosts never generate a wrapper script; always returns `None`.
#[cfg(not(unix))]
fn create_musl_cxx_wrapper(_target: &str) -> Option<String> {
    None
}
|
||
|
|
|
||
|
|
fn prepare_out_dir() -> PathBuf {
|
||
|
|
let preferred = get_preferred_out_dir();
|
||
|
|
match fs::create_dir_all(&preferred) {
|
||
|
|
Ok(_) => preferred,
|
||
|
|
Err(err) => {
|
||
|
|
println!(
|
||
|
|
"cargo:warning=Failed to create cache dir {:?}: {}. Falling back to temp dir.",
|
||
|
|
preferred, err
|
||
|
|
);
|
||
|
|
let fallback = env::temp_dir().join("kreuzberg-tesseract-cache");
|
||
|
|
fs::create_dir_all(&fallback).expect("Failed to create fallback cache directory in temp dir");
|
||
|
|
fallback
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
/// Find the WASI SDK installation directory.
///
/// A directory counts as an SDK root when it contains `share/wasi-sysroot`.
/// Locations are probed in this order:
/// 1. `WASI_SDK_PATH`.
/// 2. Any `wasi-sdk-*` entry directly under `/opt` or `$HOME` (directory
///    listing order, first match wins).
/// 3. `$HOME/wasi-sdk`, `/opt/wasi-sdk`, `/usr/local/opt/wasi-sdk`.
///
/// # Errors
/// Returns an installation hint when none of the locations qualifies.
fn find_wasi_sdk() -> Result<PathBuf, String> {
    let has_sysroot = |root: &Path| root.join("share/wasi-sysroot").exists();

    if let Ok(sdk_path) = env::var("WASI_SDK_PATH") {
        let explicit = PathBuf::from(sdk_path);
        if has_sysroot(&explicit) {
            return Ok(explicit);
        }
    }

    let home = env::var("HOME").unwrap_or_default();

    // Versioned directories take precedence over the unversioned defaults.
    for base in ["/opt", home.as_str()] {
        let Ok(entries) = fs::read_dir(base) else {
            continue;
        };
        let versioned = entries
            .flatten()
            .filter(|entry| entry.file_name().to_string_lossy().starts_with("wasi-sdk-"))
            .map(|entry| entry.path())
            .find(|path| has_sysroot(path));
        if let Some(path) = versioned {
            return Ok(path);
        }
    }

    let fixed_locations = [
        PathBuf::from(&home).join("wasi-sdk"),
        PathBuf::from("/opt/wasi-sdk"),
        PathBuf::from("/usr/local/opt/wasi-sdk"),
    ];
    fixed_locations
        .iter()
        .find(|path| has_sysroot(path))
        .cloned()
        .ok_or_else(|| {
            "WASI SDK not found. Install from https://github.com/WebAssembly/wasi-sdk/releases and set WASI_SDK_PATH"
                .to_string()
        })
}
|
||
|
|
|
||
|
|
/// Returns the path of the WASI SDK CMake toolchain file,
/// `{wasi_sdk_dir}/share/cmake/wasi-sdk.cmake`.
///
/// # Panics
/// Panics when that file does not exist.
fn find_wasi_toolchain(wasi_sdk_dir: &Path) -> PathBuf {
    let toolchain_file = wasi_sdk_dir.join("share/cmake/wasi-sdk.cmake");
    assert!(
        toolchain_file.exists(),
        "Could not find WASI SDK CMake toolchain file at: {}\nEnsure WASI SDK is properly installed.",
        toolchain_file.display()
    );
    eprintln!("Found WASI SDK toolchain: {}", toolchain_file.display());
    toolchain_file
}
|
||
|
|
|
||
|
|
/// Returns the path of the WASI SDK pthread CMake toolchain file,
/// `{wasi_sdk_dir}/share/cmake/wasi-sdk-pthread.cmake` (for C++ code using
/// std::mutex/std::thread).
///
/// # Panics
/// Panics when that file does not exist.
#[allow(dead_code)]
fn find_wasi_pthread_toolchain(wasi_sdk_dir: &Path) -> PathBuf {
    let toolchain_file = wasi_sdk_dir.join("share/cmake/wasi-sdk-pthread.cmake");
    assert!(
        toolchain_file.exists(),
        "Could not find WASI SDK pthread CMake toolchain at: {}\nEnsure WASI SDK is properly installed.",
        toolchain_file.display()
    );
    println!(
        "cargo:warning=Found WASI SDK pthread toolchain: {}",
        toolchain_file.display()
    );
    toolchain_file
}
|
||
|
|
|
||
|
|
/// Find the compiler-rt builtins library in WASI SDK.
///
/// Scans every entry of `{wasi_sdk_dir}/lib/clang/` and returns the first
/// `{entry}/lib/wasi` directory that contains `libclang_rt.builtins-wasm32.a`.
/// Returns `None` when `lib/clang` cannot be read or no entry qualifies.
fn find_wasi_compiler_rt(wasi_sdk_dir: &Path) -> Option<PathBuf> {
    fs::read_dir(wasi_sdk_dir.join("lib/clang"))
        .ok()?
        .flatten()
        .map(|version_dir| version_dir.path().join("lib/wasi"))
        .find(|rt_dir| rt_dir.join("libclang_rt.builtins-wasm32.a").exists())
}
|
||
|
|
|
||
|
|
pub fn build() {
|
||
|
|
let target = target_triple();
|
||
|
|
|
||
|
|
if is_wasm_target(&target) {
|
||
|
|
println!(
|
||
|
|
"cargo:warning=Detected WASM target: {}, routing to build_wasm()",
|
||
|
|
target
|
||
|
|
);
|
||
|
|
return build_wasm();
|
||
|
|
}
|
||
|
|
|
||
|
|
let custom_out_dir = prepare_out_dir();
|
||
|
|
let windows_target = is_windows_target(&target);
|
||
|
|
let msvc_target = is_msvc_target(&target);
|
||
|
|
let mingw_target = is_mingw_target(&target);
|
||
|
|
let android_target = is_android_target(&target);
|
||
|
|
|
||
|
|
eprintln!("custom_out_dir: {:?}", custom_out_dir);
|
||
|
|
|
||
|
|
let cache_dir = custom_out_dir.join("cache");
|
||
|
|
|
||
|
|
if env::var("CARGO_CLEAN").is_ok() {
|
||
|
|
clean_cache(&cache_dir);
|
||
|
|
}
|
||
|
|
|
||
|
|
std::fs::create_dir_all(&cache_dir).expect("Failed to create cache directory");
|
||
|
|
|
||
|
|
let out_dir = custom_out_dir.clone();
|
||
|
|
let project_dir = custom_out_dir.clone();
|
||
|
|
let third_party_dir = project_dir.join("third_party");
|
||
|
|
|
||
|
|
let leptonica_dir = if third_party_dir.join("leptonica").exists() {
|
||
|
|
eprintln!("Using existing leptonica source");
|
||
|
|
third_party_dir.join("leptonica")
|
||
|
|
} else {
|
||
|
|
fs::create_dir_all(&third_party_dir).expect("Failed to create third_party directory");
|
||
|
|
download_and_extract(&third_party_dir, &leptonica_url(), "leptonica")
|
||
|
|
};
|
||
|
|
|
||
|
|
let tesseract_dir = if third_party_dir.join("tesseract").exists() {
|
||
|
|
eprintln!("Using existing tesseract source");
|
||
|
|
third_party_dir.join("tesseract")
|
||
|
|
} else {
|
||
|
|
fs::create_dir_all(&third_party_dir).expect("Failed to create third_party directory");
|
||
|
|
download_and_extract(&third_party_dir, &tesseract_url(), "tesseract")
|
||
|
|
};
|
||
|
|
|
||
|
|
let (cmake_cxx_flags, cmake_c_flags, additional_defines) = get_os_specific_config();
|
||
|
|
|
||
|
|
let leptonica_install_dir = out_dir.join("leptonica");
|
||
|
|
let leptonica_cache_dir = cache_dir.join("leptonica");
|
||
|
|
|
||
|
|
let leptonica_link_name = build_or_use_cached(
|
||
|
|
"leptonica",
|
||
|
|
&leptonica_cache_dir,
|
||
|
|
&leptonica_install_dir,
|
||
|
|
|| {
|
||
|
|
let mut leptonica_config = Config::new(&leptonica_dir);
|
||
|
|
|
||
|
|
let leptonica_src_dir = leptonica_dir.join("src");
|
||
|
|
let environ_h_path = leptonica_src_dir.join("environ.h");
|
||
|
|
|
||
|
|
if environ_h_path.exists() {
|
||
|
|
let environ_h = std::fs::read_to_string(&environ_h_path)
|
||
|
|
.expect("Failed to read environ.h")
|
||
|
|
.replace("#define HAVE_LIBZ 1", "#define HAVE_LIBZ 0")
|
||
|
|
.replace("#ifdef NO_CONSOLE_IO", "#define NO_CONSOLE_IO\n#ifdef NO_CONSOLE_IO");
|
||
|
|
std::fs::write(environ_h_path, environ_h).expect("Failed to write environ.h");
|
||
|
|
}
|
||
|
|
|
||
|
|
let makefile_static_path = leptonica_dir.join("prog").join("makefile.static");
|
||
|
|
|
||
|
|
let leptonica_src_cmakelists = leptonica_dir.join("src").join("CMakeLists.txt");
|
||
|
|
|
||
|
|
if leptonica_src_cmakelists.exists() {
|
||
|
|
let cmakelists = std::fs::read_to_string(&leptonica_src_cmakelists)
|
||
|
|
.expect("Failed to read leptonica src CMakeLists.txt");
|
||
|
|
let patched = cmakelists.replace(
|
||
|
|
"if(MINGW)\n set_target_properties(\n leptonica PROPERTIES SUFFIX\n \"-${PROJECT_VERSION}${CMAKE_SHARED_LIBRARY_SUFFIX}\")\nendif(MINGW)\n",
|
||
|
|
"if(MINGW AND BUILD_SHARED_LIBS)\n set_target_properties(\n leptonica PROPERTIES SUFFIX\n \"-${PROJECT_VERSION}${CMAKE_SHARED_LIBRARY_SUFFIX}\")\nendif()\n",
|
||
|
|
);
|
||
|
|
if patched != cmakelists {
|
||
|
|
std::fs::write(&leptonica_src_cmakelists, patched)
|
||
|
|
.expect("Failed to patch leptonica src CMakeLists.txt");
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
if makefile_static_path.exists() {
|
||
|
|
let makefile_static = std::fs::read_to_string(&makefile_static_path)
|
||
|
|
.expect("Failed to read makefile.static")
|
||
|
|
.replace(
|
||
|
|
"ALL_LIBS = $(LEPTLIB) -ltiff -ljpeg -lpng -lz -lm",
|
||
|
|
"ALL_LIBS = $(LEPTLIB) -lm",
|
||
|
|
);
|
||
|
|
std::fs::write(makefile_static_path, makefile_static).expect("Failed to write makefile.static");
|
||
|
|
}
|
||
|
|
|
||
|
|
if windows_target {
|
||
|
|
if mingw_target {
|
||
|
|
leptonica_config.generator("Unix Makefiles");
|
||
|
|
leptonica_config.define("CMAKE_MAKE_PROGRAM", "mingw32-make");
|
||
|
|
leptonica_config.define("MSYS2_ARG_CONV_EXCL", "/MD;/MDd;/D;-D;-I;-L");
|
||
|
|
} else if msvc_target && env::var("VSINSTALLDIR").is_ok() {
|
||
|
|
leptonica_config.generator("NMake Makefiles");
|
||
|
|
}
|
||
|
|
leptonica_config.define("CMAKE_CL_SHOWINCLUDES_PREFIX", "");
|
||
|
|
}
|
||
|
|
|
||
|
|
if env::var("CI").is_err() && env::var("RUSTC_WRAPPER").unwrap_or_default() == "sccache" {
|
||
|
|
leptonica_config.env("CC", "sccache cc").env("CXX", "sccache c++");
|
||
|
|
}
|
||
|
|
|
||
|
|
let leptonica_install_dir_cmake = normalize_cmake_path(&leptonica_install_dir);
|
||
|
|
|
||
|
|
leptonica_config
|
||
|
|
.define("CMAKE_POLICY_VERSION_MINIMUM", "3.5")
|
||
|
|
.define("CMAKE_BUILD_TYPE", "Release")
|
||
|
|
.define("BUILD_PROG", "OFF")
|
||
|
|
.define("BUILD_SHARED_LIBS", "OFF")
|
||
|
|
.define("ENABLE_ZLIB", "OFF")
|
||
|
|
.define("ENABLE_PNG", "OFF")
|
||
|
|
.define("ENABLE_JPEG", "OFF")
|
||
|
|
.define("ENABLE_TIFF", "OFF")
|
||
|
|
.define("ENABLE_WEBP", "OFF")
|
||
|
|
.define("ENABLE_OPENJPEG", "OFF")
|
||
|
|
.define("ENABLE_GIF", "OFF")
|
||
|
|
.define("NO_CONSOLE_IO", "ON")
|
||
|
|
.define("CMAKE_CXX_FLAGS", &cmake_cxx_flags)
|
||
|
|
.define("CMAKE_C_FLAGS", &cmake_c_flags)
|
||
|
|
.define("MINIMUM_SEVERITY", "L_SEVERITY_NONE")
|
||
|
|
.define("SW_BUILD", "OFF")
|
||
|
|
.define("HAVE_LIBZ", "0")
|
||
|
|
.define("ENABLE_LTO", "OFF")
|
||
|
|
.define("CMAKE_INSTALL_PREFIX", &leptonica_install_dir_cmake);
|
||
|
|
|
||
|
|
if windows_target {
|
||
|
|
if msvc_target {
|
||
|
|
leptonica_config
|
||
|
|
.define("CMAKE_C_FLAGS_RELEASE", "/MD /O2")
|
||
|
|
.define("CMAKE_C_FLAGS_DEBUG", "/MDd /Od");
|
||
|
|
} else if mingw_target {
|
||
|
|
leptonica_config
|
||
|
|
.define("CMAKE_C_FLAGS_RELEASE", "-O2 -DNDEBUG")
|
||
|
|
.define("CMAKE_C_FLAGS_DEBUG", "-O0 -g");
|
||
|
|
} else {
|
||
|
|
leptonica_config
|
||
|
|
.define("CMAKE_C_FLAGS_RELEASE", "-O2")
|
||
|
|
.define("CMAKE_C_FLAGS_DEBUG", "-O0 -g");
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
for (key, value) in &additional_defines {
|
||
|
|
leptonica_config.define(key, value);
|
||
|
|
}
|
||
|
|
|
||
|
|
leptonica_config.build();
|
||
|
|
},
|
||
|
|
);
|
||
|
|
|
||
|
|
let leptonica_include_dir = leptonica_install_dir.join("include");
|
||
|
|
let leptonica_lib_dir = leptonica_install_dir.join("lib");
|
||
|
|
let tesseract_install_dir = out_dir.join("tesseract");
|
||
|
|
let tesseract_cache_dir = cache_dir.join("tesseract");
|
||
|
|
let tessdata_prefix = project_dir.clone();
|
||
|
|
|
||
|
|
let leptonica_install_dir_cmake = normalize_cmake_path(&leptonica_install_dir);
|
||
|
|
// Leptonica_DIR must point to the directory containing LeptonicaConfig.cmake,
|
||
|
|
// not the install prefix. On Windows with cross-compilation toolchains,
|
||
|
|
// CMAKE_PREFIX_PATH search doesn't find it automatically.
|
||
|
|
let leptonica_cmake_dir = leptonica_install_dir.join("lib/cmake/leptonica");
|
||
|
|
let leptonica_cmake_dir_cmake = normalize_cmake_path(&leptonica_cmake_dir);
|
||
|
|
let leptonica_include_dir_cmake = normalize_cmake_path(&leptonica_include_dir);
|
||
|
|
let leptonica_lib_dir_cmake = normalize_cmake_path(&leptonica_lib_dir);
|
||
|
|
let tesseract_install_dir_cmake = normalize_cmake_path(&tesseract_install_dir);
|
||
|
|
let tessdata_prefix_cmake = normalize_cmake_path(&tessdata_prefix);
|
||
|
|
|
||
|
|
let tesseract_link_name =
|
||
|
|
build_or_use_cached("tesseract", &tesseract_cache_dir, &tesseract_install_dir, || {
|
||
|
|
let cmakelists_path = tesseract_dir.join("CMakeLists.txt");
|
||
|
|
let cmakelists = std::fs::read_to_string(&cmakelists_path)
|
||
|
|
.expect("Failed to read CMakeLists.txt")
|
||
|
|
.replace("set(HAVE_TIFFIO_H ON)", "")
|
||
|
|
// Remove the tesseract CLI executable target — it uses try/catch which is
|
||
|
|
// incompatible with -fno-exceptions. We only need the library (libtesseract).
|
||
|
|
.replace(
|
||
|
|
"add_executable(tesseract src/tesseract.cpp)\n\
|
||
|
|
target_link_libraries(tesseract libtesseract)\n\
|
||
|
|
if(HAVE_TIFFIO_H AND WIN32)\n\
|
||
|
|
\x20 target_link_libraries(tesseract ${TIFF_LIBRARIES})\n\
|
||
|
|
endif()\n\
|
||
|
|
\n\
|
||
|
|
if(OPENMP_BUILD AND UNIX)\n\
|
||
|
|
\x20 target_link_libraries(tesseract pthread)\n\
|
||
|
|
endif()",
|
||
|
|
"",
|
||
|
|
)
|
||
|
|
.replace("install(TARGETS tesseract DESTINATION bin)", "")
|
||
|
|
.replace(
|
||
|
|
"if (MSVC)\n\
|
||
|
|
\x20 install(FILES $<TARGET_PDB_FILE:${PROJECT_NAME}> DESTINATION bin OPTIONAL)\n\
|
||
|
|
endif()",
|
||
|
|
"",
|
||
|
|
);
|
||
|
|
|
||
|
|
// NDK r25+ no longer ships CpuFeaturesNdkCompatConfig.cmake.
|
||
|
|
// Strip the find_package block so the build doesn't abort.
|
||
|
|
let cmakelists = if android_target {
|
||
|
|
cmakelists.replace(
|
||
|
|
"if(ANDROID)\n\
|
||
|
|
\x20 add_definitions(-DANDROID)\n\
|
||
|
|
\x20 find_package(CpuFeaturesNdkCompat REQUIRED)\n\
|
||
|
|
\x20 target_include_directories(\n\
|
||
|
|
\x20\x20\x20 libtesseract\n\
|
||
|
|
\x20\x20\x20 PRIVATE \"${CpuFeaturesNdkCompat_DIR}/../../../include/ndk_compat\")\n\
|
||
|
|
\x20 target_link_libraries(libtesseract PRIVATE CpuFeatures::ndk_compat)\n\
|
||
|
|
endif()",
|
||
|
|
"if(ANDROID)\n\
|
||
|
|
\x20 add_definitions(-DANDROID)\n\
|
||
|
|
endif()",
|
||
|
|
)
|
||
|
|
} else {
|
||
|
|
cmakelists
|
||
|
|
};
|
||
|
|
|
||
|
|
std::fs::write(&cmakelists_path, cmakelists).expect("Failed to write CMakeLists.txt");
|
||
|
|
|
||
|
|
let mut tesseract_config = Config::new(&tesseract_dir);
|
||
|
|
if windows_target {
|
||
|
|
if mingw_target {
|
||
|
|
tesseract_config.generator("Unix Makefiles");
|
||
|
|
tesseract_config.define("CMAKE_MAKE_PROGRAM", "mingw32-make");
|
||
|
|
tesseract_config.define("MSYS2_ARG_CONV_EXCL", "/MD;/MDd;/D;-D;-I;-L");
|
||
|
|
} else if msvc_target && env::var("VSINSTALLDIR").is_ok() {
|
||
|
|
tesseract_config.generator("NMake Makefiles");
|
||
|
|
}
|
||
|
|
tesseract_config.define("CMAKE_CL_SHOWINCLUDES_PREFIX", "");
|
||
|
|
}
|
||
|
|
|
||
|
|
if env::var("CI").is_err() && env::var("RUSTC_WRAPPER").unwrap_or_default() == "sccache" {
|
||
|
|
tesseract_config.env("CC", "sccache cc").env("CXX", "sccache c++");
|
||
|
|
}
|
||
|
|
tesseract_config
|
||
|
|
.define("CMAKE_POLICY_VERSION_MINIMUM", "3.5")
|
||
|
|
.define("CMAKE_BUILD_TYPE", "Release")
|
||
|
|
.define("BUILD_TRAINING_TOOLS", "OFF")
|
||
|
|
.define("BUILD_SHARED_LIBS", "OFF")
|
||
|
|
.define("DISABLE_ARCHIVE", "ON")
|
||
|
|
.define("DISABLE_CURL", "ON")
|
||
|
|
.define("DISABLE_OPENCL", "ON")
|
||
|
|
.define("Leptonica_DIR", &leptonica_cmake_dir_cmake)
|
||
|
|
.define("LEPTONICA_INCLUDE_DIR", &leptonica_include_dir_cmake)
|
||
|
|
.define("LEPTONICA_LIBRARY", &leptonica_lib_dir_cmake)
|
||
|
|
.define("CMAKE_PREFIX_PATH", &leptonica_install_dir_cmake)
|
||
|
|
.define("CMAKE_INSTALL_PREFIX", &tesseract_install_dir_cmake)
|
||
|
|
.define("TESSDATA_PREFIX", &tessdata_prefix_cmake)
|
||
|
|
.define("DISABLE_TIFF", "ON")
|
||
|
|
.define("DISABLE_PNG", "ON")
|
||
|
|
.define("DISABLE_JPEG", "ON")
|
||
|
|
.define("DISABLE_WEBP", "ON")
|
||
|
|
.define("DISABLE_OPENJPEG", "ON")
|
||
|
|
.define("DISABLE_ZLIB", "ON")
|
||
|
|
.define("DISABLE_LIBXML2", "ON")
|
||
|
|
.define("DISABLE_LIBICU", "ON")
|
||
|
|
.define("DISABLE_LZMA", "ON")
|
||
|
|
.define("DISABLE_GIF", "ON")
|
||
|
|
.define("DISABLE_DEBUG_MESSAGES", "ON")
|
||
|
|
.define("debug_file", "/dev/null")
|
||
|
|
.define("HAVE_LIBARCHIVE", "OFF")
|
||
|
|
.define("HAVE_LIBCURL", "OFF")
|
||
|
|
.define("HAVE_TIFFIO_H", "OFF")
|
||
|
|
.define("GRAPHICS_DISABLED", "ON")
|
||
|
|
.define("DISABLED_LEGACY_ENGINE", "OFF")
|
||
|
|
.define("USE_OPENCL", "OFF")
|
||
|
|
.define("OPENMP_BUILD", "OFF")
|
||
|
|
.define("BUILD_TESTS", "OFF")
|
||
|
|
.define("ENABLE_LTO", "OFF")
|
||
|
|
.define("BUILD_PROG", "OFF")
|
||
|
|
.define("BUILD_TESSERACT_BINARY", "OFF")
|
||
|
|
.define("SW_BUILD", "OFF")
|
||
|
|
.define("LEPT_TIFF_RESULT", "FALSE")
|
||
|
|
.define("INSTALL_CONFIGS", "ON")
|
||
|
|
.define("USE_SYSTEM_ICU", "ON")
|
||
|
|
.define("CMAKE_CXX_FLAGS", &cmake_cxx_flags)
|
||
|
|
.define("CMAKE_C_FLAGS", &cmake_c_flags);
|
||
|
|
|
||
|
|
// zig's clang (14+) requires an explicit `evex512` target
|
||
|
|
// feature for AVX512 intrinsics (`_mm512_*`), but tesseract's
|
||
|
|
// CMakeLists only passes `-mavx512f`. The resulting build
|
||
|
|
// fails with `always_inline function requires target feature
|
||
|
|
// 'evex512'`. Disable tesseract's AVX512 codepath under
|
||
|
|
// zigbuild — AVX2/SSE dispatch remains active.
|
||
|
|
if is_zigbuild() {
|
||
|
|
tesseract_config.define("HAVE_AVX512F", "OFF");
|
||
|
|
}
|
||
|
|
|
||
|
|
for (key, value) in &additional_defines {
|
||
|
|
tesseract_config.define(key, value);
|
||
|
|
}
|
||
|
|
|
||
|
|
tesseract_config.build();
|
||
|
|
});
|
||
|
|
|
||
|
|
// Bundle eng.traineddata (tessdata_fast, ~4MB) so English OCR works out of the box.
|
||
|
|
// Tesseract looks for traineddata at {TESSDATA_PREFIX}/tessdata/{lang}.traineddata.
|
||
|
|
let bundled_tessdata_dir = tessdata_prefix.join("tessdata");
|
||
|
|
let eng_traineddata = bundled_tessdata_dir.join("eng.traineddata");
|
||
|
|
if !eng_traineddata.exists() {
|
||
|
|
fs::create_dir_all(&bundled_tessdata_dir).expect("Failed to create tessdata directory");
|
||
|
|
download_file_with_fallback(
|
||
|
|
&[
|
||
|
|
"https://github.com/tesseract-ocr/tessdata_fast/raw/main/eng.traineddata",
|
||
|
|
"https://raw.githubusercontent.com/tesseract-ocr/tessdata_fast/main/eng.traineddata",
|
||
|
|
],
|
||
|
|
&eng_traineddata,
|
||
|
|
"eng.traineddata",
|
||
|
|
);
|
||
|
|
}
|
||
|
|
println!("cargo:rustc-env=TESSDATA_PREFIX_BUNDLED={}", tessdata_prefix.display());
|
||
|
|
eprintln!("Bundled tessdata dir: {:?}", bundled_tessdata_dir);
|
||
|
|
|
||
|
|
println!("cargo:rerun-if-changed=build.rs");
|
||
|
|
println!("cargo:rerun-if-changed={}", third_party_dir.display());
|
||
|
|
println!("cargo:rerun-if-changed={}", leptonica_dir.display());
|
||
|
|
println!("cargo:rerun-if-changed={}", tesseract_dir.display());
|
||
|
|
|
||
|
|
println!("cargo:rustc-link-search=native={}", leptonica_lib_dir.display());
|
||
|
|
println!(
|
||
|
|
"cargo:rustc-link-search=native={}",
|
||
|
|
tesseract_install_dir.join("lib").display()
|
||
|
|
);
|
||
|
|
|
||
|
|
// Link libraries in the correct order for static linking:
|
||
|
|
// 1. tesseract first (depends on leptonica and C++ stdlib)
|
||
|
|
// 2. leptonica (depends on C++ stdlib)
|
||
|
|
// 3. C++ standard library and system libraries (via set_os_specific_link_flags)
|
||
|
|
//
|
||
|
|
// IMPORTANT: For static linking, the linker resolves symbols in order.
|
||
|
|
// Libraries must be listed BEFORE the libraries they depend on.
|
||
|
|
// The C++ stdlib must come LAST because both tesseract and leptonica
|
||
|
|
// depend on it for symbols like operator new, operator delete, etc.
|
||
|
|
#[cfg(feature = "dynamic-linking")]
|
||
|
|
let link_type = "dylib";
|
||
|
|
#[cfg(not(feature = "dynamic-linking"))]
|
||
|
|
let link_type = "static";
|
||
|
|
|
||
|
|
println!("cargo:rustc-link-lib={}={}", link_type, tesseract_link_name);
|
||
|
|
println!(
|
||
|
|
"cargo:warning=Linking with tesseract ({} linking): {}",
|
||
|
|
link_type, tesseract_link_name
|
||
|
|
);
|
||
|
|
println!("cargo:rustc-link-lib={}={}", link_type, leptonica_link_name);
|
||
|
|
println!(
|
||
|
|
"cargo:warning=Linking with leptonica ({} linking): {}",
|
||
|
|
link_type, leptonica_link_name
|
||
|
|
);
|
||
|
|
|
||
|
|
// Link C++ standard library and system libraries AFTER tesseract and leptonica.
|
||
|
|
// This is critical for static linking on Linux (especially aarch64) where
|
||
|
|
// tesseract's C++ code needs symbols like operator new/delete from libstdc++.
|
||
|
|
set_os_specific_link_flags();
|
||
|
|
|
||
|
|
eprintln!("Leptonica include dir: {:?}", leptonica_include_dir);
|
||
|
|
eprintln!("Leptonica lib dir: {:?}", leptonica_lib_dir);
|
||
|
|
eprintln!("Tesseract install dir: {:?}", tesseract_install_dir);
|
||
|
|
eprintln!("Tessdata dir: {:?}", tessdata_prefix);
|
||
|
|
}
|
||
|
|
|
||
|
|
fn get_os_specific_config() -> (String, String, Vec<(String, String)>) {
|
||
|
|
let mut cmake_cxx_flags = String::new();
|
||
|
|
let mut cmake_c_flags = String::new();
|
||
|
|
let mut additional_defines = Vec::new();
|
||
|
|
let target = target_triple();
|
||
|
|
let target_macos = is_macos_target(&target);
|
||
|
|
let target_linux = is_linux_target(&target);
|
||
|
|
let target_windows = is_windows_target(&target);
|
||
|
|
let target_msvc = is_msvc_target(&target);
|
||
|
|
let target_mingw = is_mingw_target(&target);
|
||
|
|
let target_musl = target.contains("musl");
|
||
|
|
|
||
|
|
if target_macos {
|
||
|
|
cmake_cxx_flags.push_str("-stdlib=libc++ ");
|
||
|
|
cmake_cxx_flags.push_str("-std=c++17 ");
|
||
|
|
cmake_cxx_flags.push_str("-fno-exceptions ");
|
||
|
|
} else if is_android_target(&target) {
|
||
|
|
cmake_c_flags.push_str("-std=gnu11 ");
|
||
|
|
cmake_cxx_flags.push_str("-std=c++17 ");
|
||
|
|
cmake_cxx_flags.push_str("-fno-exceptions ");
|
||
|
|
|
||
|
|
let abi = android_abi(&target);
|
||
|
|
let api: u32 = 21;
|
||
|
|
additional_defines.push(("ANDROID_ABI".to_string(), abi.to_string()));
|
||
|
|
additional_defines.push(("ANDROID_PLATFORM".to_string(), format!("android-{api}")));
|
||
|
|
|
||
|
|
// cmake-rs sets CMAKE_C_COMPILER from the NDK but not CMAKE_ANDROID_NDK
|
||
|
|
// (needed for CMake's Android platform detection) or CMAKE_CXX_COMPILER.
|
||
|
|
let ndk_home = env::var("ANDROID_NDK_HOME")
|
||
|
|
.or_else(|_| env::var("ANDROID_NDK"))
|
||
|
|
.or_else(|_| env::var("NDK_HOME"))
|
||
|
|
.ok();
|
||
|
|
if let Some(ref ndk) = ndk_home {
|
||
|
|
additional_defines.push(("CMAKE_ANDROID_NDK".to_string(), ndk.clone()));
|
||
|
|
let cxx = ndk_clangxx(ndk, abi, api).unwrap_or_else(|| resolve_cxx_compiler(&target, "clang++"));
|
||
|
|
additional_defines.push(("CMAKE_CXX_COMPILER".to_string(), cxx));
|
||
|
|
} else {
|
||
|
|
let cxx = resolve_cxx_compiler(&target, "clang++");
|
||
|
|
additional_defines.push(("CMAKE_CXX_COMPILER".to_string(), cxx));
|
||
|
|
}
|
||
|
|
|
||
|
|
// Force CMake to search only inside the NDK sysroot / CMAKE_FIND_ROOT_PATH.
|
||
|
|
// Without ONLY mode, CMake falls back to host Homebrew paths (e.g.
|
||
|
|
// /opt/homebrew/Cellar/leptonica) and picks up the wrong architecture.
|
||
|
|
additional_defines.push(("CMAKE_FIND_ROOT_PATH_MODE_INCLUDE".to_string(), "ONLY".to_string()));
|
||
|
|
additional_defines.push(("CMAKE_FIND_ROOT_PATH_MODE_LIBRARY".to_string(), "ONLY".to_string()));
|
||
|
|
// Programs (e.g. cmake tools, pkg-config) must come from the host.
|
||
|
|
additional_defines.push(("CMAKE_FIND_ROOT_PATH_MODE_PROGRAM".to_string(), "NEVER".to_string()));
|
||
|
|
// Belt-and-suspenders: explicitly ignore host-only include/lib trees.
|
||
|
|
additional_defines.push((
|
||
|
|
"CMAKE_IGNORE_PATH".to_string(),
|
||
|
|
"/opt/homebrew/Cellar;/opt/homebrew/include;/opt/homebrew/lib;/usr/local/include;/usr/local/lib"
|
||
|
|
.to_string(),
|
||
|
|
));
|
||
|
|
} else if target_linux {
|
||
|
|
// Prevent GCC 14+ from emitting C23-versioned glibc symbols (__isoc23_strtoll etc.)
|
||
|
|
// that require glibc >= 2.38. Force C11 mode for C code.
|
||
|
|
cmake_c_flags.push_str("-std=gnu11 ");
|
||
|
|
cmake_cxx_flags.push_str("-std=gnu++17 ");
|
||
|
|
cmake_cxx_flags.push_str("-fno-exceptions ");
|
||
|
|
if target_musl {
|
||
|
|
// For musl: use g++ with musl-gcc specs (avoids libc++/musl locale
|
||
|
|
// incompatibilities). The wrapper redirects C headers to musl while
|
||
|
|
// keeping libstdc++ intact.
|
||
|
|
let cxx_compiler =
|
||
|
|
create_musl_cxx_wrapper(&target).unwrap_or_else(|| resolve_cxx_compiler(&target, "g++"));
|
||
|
|
additional_defines.push(("CMAKE_CXX_COMPILER".to_string(), cxx_compiler));
|
||
|
|
} else if env::var("CC").map(|cc| cc.contains("clang")).unwrap_or(false) {
|
||
|
|
cmake_cxx_flags.push_str("-stdlib=libc++ ");
|
||
|
|
let cxx_compiler = resolve_cxx_compiler(&target, "clang++");
|
||
|
|
additional_defines.push(("CMAKE_CXX_COMPILER".to_string(), cxx_compiler));
|
||
|
|
} else {
|
||
|
|
let cxx_compiler = resolve_cxx_compiler(&target, "g++");
|
||
|
|
additional_defines.push(("CMAKE_CXX_COMPILER".to_string(), cxx_compiler));
|
||
|
|
}
|
||
|
|
} else if target_windows {
|
||
|
|
if target_msvc {
|
||
|
|
cmake_cxx_flags.push_str("/MP /std:c++17 /DTESSERACT_STATIC ");
|
||
|
|
additional_defines.push(("CMAKE_C_FLAGS_RELEASE".to_string(), "/MD /O2".to_string()));
|
||
|
|
additional_defines.push(("CMAKE_C_FLAGS_DEBUG".to_string(), "/MDd /Od".to_string()));
|
||
|
|
additional_defines.push((
|
||
|
|
"CMAKE_CXX_FLAGS_RELEASE".to_string(),
|
||
|
|
"/MD /O2 /DTESSERACT_STATIC".to_string(),
|
||
|
|
));
|
||
|
|
additional_defines.push((
|
||
|
|
"CMAKE_CXX_FLAGS_DEBUG".to_string(),
|
||
|
|
"/MDd /Od /DTESSERACT_STATIC".to_string(),
|
||
|
|
));
|
||
|
|
additional_defines.push(("CMAKE_MSVC_RUNTIME_LIBRARY".to_string(), "MultiThreadedDLL".to_string()));
|
||
|
|
} else if target_mingw {
|
||
|
|
cmake_cxx_flags.push_str("-std=c++17 -DTESSERACT_STATIC -fno-exceptions ");
|
||
|
|
additional_defines.push(("CMAKE_C_FLAGS_RELEASE".to_string(), "-O2 -DNDEBUG".to_string()));
|
||
|
|
additional_defines.push(("CMAKE_C_FLAGS_DEBUG".to_string(), "-O0 -g".to_string()));
|
||
|
|
// Use absolute paths for MinGW compilers to prevent cmake from
|
||
|
|
// falling back to MSVC cl.exe on Windows CI runners where both
|
||
|
|
// toolchains are present.
|
||
|
|
let gcc_path = resolve_mingw_compiler("gcc");
|
||
|
|
let gxx_path = resolve_mingw_compiler("g++");
|
||
|
|
additional_defines.push(("CMAKE_C_COMPILER".to_string(), gcc_path));
|
||
|
|
additional_defines.push(("CMAKE_CXX_COMPILER".to_string(), gxx_path));
|
||
|
|
additional_defines.push(("CMAKE_SYSTEM_NAME".to_string(), "Windows".to_string()));
|
||
|
|
additional_defines.push((
|
||
|
|
"CMAKE_CXX_FLAGS_RELEASE".to_string(),
|
||
|
|
"-O2 -DNDEBUG -DTESSERACT_STATIC".to_string(),
|
||
|
|
));
|
||
|
|
additional_defines.push((
|
||
|
|
"CMAKE_CXX_FLAGS_DEBUG".to_string(),
|
||
|
|
"-O0 -g -DTESSERACT_STATIC".to_string(),
|
||
|
|
));
|
||
|
|
} else {
|
||
|
|
cmake_cxx_flags.push_str("-std=c++17 -DTESSERACT_STATIC ");
|
||
|
|
additional_defines.push(("CMAKE_C_FLAGS_RELEASE".to_string(), "-O2 -DNDEBUG".to_string()));
|
||
|
|
additional_defines.push(("CMAKE_C_FLAGS_DEBUG".to_string(), "-O0 -g".to_string()));
|
||
|
|
additional_defines.push((
|
||
|
|
"CMAKE_CXX_FLAGS_RELEASE".to_string(),
|
||
|
|
"-O2 -DNDEBUG -DTESSERACT_STATIC".to_string(),
|
||
|
|
));
|
||
|
|
additional_defines.push((
|
||
|
|
"CMAKE_CXX_FLAGS_DEBUG".to_string(),
|
||
|
|
"-O0 -g -DTESSERACT_STATIC".to_string(),
|
||
|
|
));
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
cmake_cxx_flags.push_str("-DUSE_STD_NAMESPACE ");
|
||
|
|
additional_defines.push(("CMAKE_POSITION_INDEPENDENT_CODE".to_string(), "ON".to_string()));
|
||
|
|
|
||
|
|
if target_windows && target_msvc {
|
||
|
|
cmake_cxx_flags.push_str("/permissive- ");
|
||
|
|
additional_defines.push(("CMAKE_EXE_LINKER_FLAGS".to_string(), "/INCREMENTAL:NO".to_string()));
|
||
|
|
additional_defines.push(("CMAKE_SHARED_LINKER_FLAGS".to_string(), "/INCREMENTAL:NO".to_string()));
|
||
|
|
additional_defines.push(("CMAKE_MODULE_LINKER_FLAGS".to_string(), "/INCREMENTAL:NO".to_string()));
|
||
|
|
}
|
||
|
|
|
||
|
|
(cmake_cxx_flags, cmake_c_flags, additional_defines)
|
||
|
|
}
|
||
|
|
|
||
|
|
fn set_os_specific_link_flags() {
|
||
|
|
let target = target_triple();
|
||
|
|
let target_macos = is_macos_target(&target);
|
||
|
|
let target_linux = is_linux_target(&target);
|
||
|
|
let target_windows = is_windows_target(&target);
|
||
|
|
let target_mingw = is_mingw_target(&target);
|
||
|
|
let target_musl = target.contains("musl");
|
||
|
|
|
||
|
|
if target_macos {
|
||
|
|
println!("cargo:rustc-link-lib=c++");
|
||
|
|
} else if is_android_target(&target) {
|
||
|
|
// NDK toolchain handles C++ runtime linkage; link against log for Android logging.
|
||
|
|
println!("cargo:rustc-link-lib=c++_static");
|
||
|
|
println!("cargo:rustc-link-lib=log");
|
||
|
|
} else if target_linux {
|
||
|
|
if target_musl {
|
||
|
|
// musl builds: statically link libstdc++ for fully portable binaries
|
||
|
|
// Add GCC library path so the linker can find libstdc++.a
|
||
|
|
if let Ok(output) = std::process::Command::new("gcc")
|
||
|
|
.arg("--print-file-name=libstdc++.a")
|
||
|
|
.output()
|
||
|
|
{
|
||
|
|
let path = String::from_utf8_lossy(&output.stdout);
|
||
|
|
if let Some(parent) = std::path::Path::new(path.trim()).parent() {
|
||
|
|
println!("cargo:rustc-link-search=native={}", parent.display());
|
||
|
|
}
|
||
|
|
}
|
||
|
|
println!("cargo:rustc-link-lib=static=stdc++");
|
||
|
|
} else if env::var("CC").map(|cc| cc.contains("clang")).unwrap_or(false) {
|
||
|
|
println!("cargo:rustc-link-lib=c++");
|
||
|
|
} else {
|
||
|
|
println!("cargo:rustc-link-lib=stdc++");
|
||
|
|
// zig's bundled libstdc++ has `std::filesystem` inline; no
|
||
|
|
// standalone libstdc++fs ships with the toolchain, so emitting
|
||
|
|
// `-lstdc++fs` makes zld fail with `unable to find dynamic
|
||
|
|
// system library 'stdc++fs'`. Skip it under zigbuild.
|
||
|
|
if !is_zigbuild() {
|
||
|
|
println!("cargo:rustc-link-lib=stdc++fs");
|
||
|
|
}
|
||
|
|
}
|
||
|
|
println!("cargo:rustc-link-lib=pthread");
|
||
|
|
println!("cargo:rustc-link-lib=m");
|
||
|
|
if !target_musl {
|
||
|
|
println!("cargo:rustc-link-lib=dl");
|
||
|
|
}
|
||
|
|
} else if target_windows {
|
||
|
|
if target_mingw {
|
||
|
|
println!("cargo:rustc-link-lib=stdc++");
|
||
|
|
}
|
||
|
|
println!("cargo:rustc-link-lib=user32");
|
||
|
|
println!("cargo:rustc-link-lib=gdi32");
|
||
|
|
println!("cargo:rustc-link-lib=ws2_32");
|
||
|
|
println!("cargo:rustc-link-lib=advapi32");
|
||
|
|
println!("cargo:rustc-link-lib=shell32");
|
||
|
|
}
|
||
|
|
|
||
|
|
println!("cargo:rustc-link-search=native={}", env::var("OUT_DIR").unwrap());
|
||
|
|
}
|
||
|
|
|
||
|
|
fn download_and_extract(target_dir: &Path, url: &str, name: &str) -> PathBuf {
|
||
|
|
use zip::ZipArchive;
|
||
|
|
|
||
|
|
fs::create_dir_all(target_dir).expect("Failed to create target directory");
|
||
|
|
|
||
|
|
let client = reqwest::blocking::Client::builder()
|
||
|
|
.timeout(std::time::Duration::from_secs(300))
|
||
|
|
.http1_only()
|
||
|
|
.build()
|
||
|
|
.expect("Failed to create HTTP client");
|
||
|
|
|
||
|
|
eprintln!("Downloading {} from {}", name, url);
|
||
|
|
let max_attempts = 5;
|
||
|
|
let mut content = None;
|
||
|
|
|
||
|
|
for attempt in 1..=max_attempts {
|
||
|
|
let err_msg = match client.get(url).send() {
|
||
|
|
Ok(resp) => {
|
||
|
|
if resp.status().is_success() {
|
||
|
|
match resp.bytes() {
|
||
|
|
Ok(bytes) => {
|
||
|
|
content = Some(bytes.to_vec());
|
||
|
|
break;
|
||
|
|
}
|
||
|
|
Err(err) => format!("Failed to read response: {}", err),
|
||
|
|
}
|
||
|
|
} else {
|
||
|
|
format!("HTTP {}", resp.status().as_u16())
|
||
|
|
}
|
||
|
|
}
|
||
|
|
Err(err) => err.to_string(),
|
||
|
|
};
|
||
|
|
|
||
|
|
if attempt == max_attempts {
|
||
|
|
panic!(
|
||
|
|
"Failed to download {} after {} attempts: {}",
|
||
|
|
name, max_attempts, err_msg
|
||
|
|
);
|
||
|
|
}
|
||
|
|
|
||
|
|
let backoff = 2u64.pow((attempt - 1).min(4));
|
||
|
|
println!(
|
||
|
|
"cargo:warning=Download attempt {}/{} for {} failed ({}). Retrying in {}s...",
|
||
|
|
attempt, max_attempts, name, err_msg, backoff
|
||
|
|
);
|
||
|
|
std::thread::sleep(std::time::Duration::from_secs(backoff));
|
||
|
|
}
|
||
|
|
|
||
|
|
let content = content.expect("unreachable: download loop must either succeed or panic");
|
||
|
|
|
||
|
|
eprintln!("Downloaded {} bytes for {}", content.len(), name);
|
||
|
|
|
||
|
|
let temp_file = target_dir.join(format!("{}.zip", name));
|
||
|
|
fs::write(&temp_file, content).expect("Failed to write archive to file");
|
||
|
|
|
||
|
|
let extract_dir = target_dir.join(name);
|
||
|
|
if extract_dir.exists() {
|
||
|
|
fs::remove_dir_all(&extract_dir).expect("Failed to remove existing directory");
|
||
|
|
}
|
||
|
|
fs::create_dir_all(&extract_dir).expect("Failed to create extraction directory");
|
||
|
|
|
||
|
|
let mut archive = ZipArchive::new(fs::File::open(&temp_file).unwrap()).unwrap();
|
||
|
|
|
||
|
|
for i in 0..archive.len() {
|
||
|
|
let mut file = archive.by_index(i).unwrap();
|
||
|
|
let file_path = file.mangled_name();
|
||
|
|
let file_path = file_path.to_str().unwrap();
|
||
|
|
|
||
|
|
let path = Path::new(file_path);
|
||
|
|
let path = path.strip_prefix(path.components().next().unwrap()).unwrap();
|
||
|
|
|
||
|
|
if path.as_os_str().is_empty() {
|
||
|
|
continue;
|
||
|
|
}
|
||
|
|
|
||
|
|
let target_path = extract_dir.join(path);
|
||
|
|
|
||
|
|
if file.is_dir() {
|
||
|
|
fs::create_dir_all(target_path).unwrap();
|
||
|
|
} else {
|
||
|
|
if let Some(parent) = target_path.parent() {
|
||
|
|
fs::create_dir_all(parent).unwrap();
|
||
|
|
}
|
||
|
|
let mut outfile = fs::File::create(target_path).unwrap();
|
||
|
|
std::io::copy(&mut file, &mut outfile).unwrap();
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
fs::remove_file(temp_file).expect("Failed to remove temporary zip file");
|
||
|
|
|
||
|
|
extract_dir
|
||
|
|
}
|
||
|
|
|
||
|
|
/// Download a single file to a destination path with retries.
|
||
|
|
/// Download a single file, trying each URL in order. Each URL gets up to
|
||
|
|
/// `max_attempts` retries with exponential backoff before falling through
|
||
|
|
/// to the next URL.
|
||
|
|
fn download_file_with_fallback(urls: &[&str], dest: &Path, label: &str) {
|
||
|
|
let client = reqwest::blocking::Client::builder()
|
||
|
|
.timeout(std::time::Duration::from_secs(300))
|
||
|
|
.http1_only()
|
||
|
|
.build()
|
||
|
|
.expect("Failed to create HTTP client");
|
||
|
|
|
||
|
|
let max_attempts: u32 = 5;
|
||
|
|
let mut last_err = String::new();
|
||
|
|
|
||
|
|
for url in urls {
|
||
|
|
eprintln!("Downloading {} from {}", label, url);
|
||
|
|
|
||
|
|
for attempt in 1..=max_attempts {
|
||
|
|
let err_msg = match client.get(*url).send() {
|
||
|
|
Ok(resp) => {
|
||
|
|
if resp.status().is_success() {
|
||
|
|
match resp.bytes() {
|
||
|
|
Ok(bytes) => {
|
||
|
|
fs::write(dest, &bytes).expect("Failed to write downloaded file");
|
||
|
|
eprintln!("Downloaded {} ({} bytes)", label, bytes.len());
|
||
|
|
return;
|
||
|
|
}
|
||
|
|
Err(err) => format!("Failed to read response: {}", err),
|
||
|
|
}
|
||
|
|
} else {
|
||
|
|
format!("HTTP {}", resp.status().as_u16())
|
||
|
|
}
|
||
|
|
}
|
||
|
|
Err(err) => err.to_string(),
|
||
|
|
};
|
||
|
|
|
||
|
|
last_err = err_msg.clone();
|
||
|
|
|
||
|
|
if attempt == max_attempts {
|
||
|
|
println!(
|
||
|
|
"cargo:warning=All {} attempts for {} exhausted on URL {}",
|
||
|
|
max_attempts, label, url
|
||
|
|
);
|
||
|
|
break;
|
||
|
|
}
|
||
|
|
|
||
|
|
let backoff = 2u64.pow((attempt - 1).min(4));
|
||
|
|
println!(
|
||
|
|
"cargo:warning=Download attempt {}/{} for {} failed ({}). Retrying in {}s...",
|
||
|
|
attempt, max_attempts, label, err_msg, backoff
|
||
|
|
);
|
||
|
|
std::thread::sleep(std::time::Duration::from_secs(backoff));
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
panic!(
|
||
|
|
"Failed to download {} after trying {} URL(s): {}",
|
||
|
|
label,
|
||
|
|
urls.len(),
|
||
|
|
last_err
|
||
|
|
);
|
||
|
|
}
|
||
|
|
|
||
|
|
/// Renders `path` as a string with every backslash turned into a forward slash.
///
/// Non-UTF-8 sequences are replaced lossily.
fn normalize_cmake_path(path: &Path) -> String {
    path.to_string_lossy()
        .chars()
        .map(|ch| if ch == '\\' { '/' } else { ch })
        .collect()
}
|
||
|
|
|
||
|
|
/// Apply the WASM patch to Tesseract source. Uses `git apply` if available, falls back to manual application.
|
||
|
|
fn apply_tesseract_wasm_patch(tesseract_dir: &Path) {
|
||
|
|
let patch_file = Path::new(env!("CARGO_MANIFEST_DIR")).join("patches/tesseract.diff");
|
||
|
|
if !patch_file.exists() {
|
||
|
|
println!(
|
||
|
|
"cargo:warning=Tesseract WASM patch not found at {:?}, skipping",
|
||
|
|
patch_file
|
||
|
|
);
|
||
|
|
return;
|
||
|
|
}
|
||
|
|
|
||
|
|
eprintln!("Applying tesseract WASM patch from {:?}", patch_file);
|
||
|
|
|
||
|
|
// Normalize paths to forward slashes for cross-platform compatibility.
|
||
|
|
// On Windows, backslash paths cause git apply and patch to fail.
|
||
|
|
let dir_str = normalize_cmake_path(tesseract_dir);
|
||
|
|
let patch_str = normalize_cmake_path(&patch_file);
|
||
|
|
|
||
|
|
// Try git apply first
|
||
|
|
let result = std::process::Command::new("git")
|
||
|
|
.args(["apply", "--ignore-whitespace", "--directory"])
|
||
|
|
.arg(&dir_str)
|
||
|
|
.arg(&patch_str)
|
||
|
|
.output();
|
||
|
|
|
||
|
|
let patch_applied = match result {
|
||
|
|
Ok(output) if output.status.success() => {
|
||
|
|
eprintln!("Successfully applied tesseract WASM patch via git apply");
|
||
|
|
true
|
||
|
|
}
|
||
|
|
_ => {
|
||
|
|
eprintln!("git apply failed, trying patch command...");
|
||
|
|
// Try patch command
|
||
|
|
let result = std::process::Command::new("patch")
|
||
|
|
.args(["--force", "-p1", "-d"])
|
||
|
|
.arg(&dir_str)
|
||
|
|
.arg("-i")
|
||
|
|
.arg(&patch_str)
|
||
|
|
.output();
|
||
|
|
|
||
|
|
match result {
|
||
|
|
Ok(output) if output.status.success() => {
|
||
|
|
eprintln!("Successfully applied tesseract WASM patch via patch command");
|
||
|
|
true
|
||
|
|
}
|
||
|
|
Ok(output) => {
|
||
|
|
let stderr = String::from_utf8_lossy(&output.stderr);
|
||
|
|
let stdout = String::from_utf8_lossy(&output.stdout);
|
||
|
|
println!(
|
||
|
|
"cargo:warning=Patch command failed, will apply programmatic fixups.\
|
||
|
|
\nstderr: {}\nstdout: {}",
|
||
|
|
stderr, stdout
|
||
|
|
);
|
||
|
|
false
|
||
|
|
}
|
||
|
|
Err(e) => {
|
||
|
|
println!(
|
||
|
|
"cargo:warning=patch command not available ({}), will apply programmatic fixups",
|
||
|
|
e
|
||
|
|
);
|
||
|
|
false
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
};
|
||
|
|
|
||
|
|
// When the diff patch fails (or partially applies), apply all necessary
|
||
|
|
// modifications programmatically. These fixups are idempotent — safe to
|
||
|
|
// run even if the diff patch already applied some changes.
|
||
|
|
if !patch_applied {
|
||
|
|
apply_wasm_source_fixups(tesseract_dir);
|
||
|
|
}
|
||
|
|
|
||
|
|
// Tesseract 5.5.2 moved source lists to cmake/SourceLists.cmake.
|
||
|
|
// The diff patch modifies CMakeLists.txt but the viewer/renderer sources
|
||
|
|
// are now defined in SourceLists.cmake. Fix them programmatically.
|
||
|
|
let source_lists = tesseract_dir.join("cmake/SourceLists.cmake");
|
||
|
|
if source_lists.exists() {
|
||
|
|
eprintln!("Patching cmake/SourceLists.cmake for WASM compatibility");
|
||
|
|
let content = fs::read_to_string(&source_lists).expect("Failed to read cmake/SourceLists.cmake");
|
||
|
|
|
||
|
|
let mut patched = content;
|
||
|
|
|
||
|
|
// Remove viewer from TESSERACT_SRC_CORE
|
||
|
|
patched = patched.replace(" ${TESSERACT_SRC_VIEWER}\n", "");
|
||
|
|
|
||
|
|
// Strip API sources down to baseapi.cpp and hocrrenderer.cpp
|
||
|
|
// Replace the entire TESSERACT_SRC_API block
|
||
|
|
if let Some(start) = patched.find("set(TESSERACT_SRC_API\n")
|
||
|
|
&& let Some(end) = patched[start..].find(")\n")
|
||
|
|
{
|
||
|
|
let replacement = "set(TESSERACT_SRC_API\n src/api/baseapi.cpp\n src/api/hocrrenderer.cpp\n)\n";
|
||
|
|
patched = format!("{}{}{}", &patched[..start], replacement, &patched[start + end + 2..]);
|
||
|
|
}
|
||
|
|
|
||
|
|
fs::write(&source_lists, patched).expect("Failed to write patched cmake/SourceLists.cmake");
|
||
|
|
eprintln!("Successfully patched cmake/SourceLists.cmake");
|
||
|
|
}
|
||
|
|
|
||
|
|
// Remove the tesseract CLI binary target from CMakeLists.txt
|
||
|
|
// In 5.5.2, the patch's BUILD_TESSERACT_BINARY guard may not apply cleanly
|
||
|
|
let cmakelists = tesseract_dir.join("CMakeLists.txt");
|
||
|
|
if cmakelists.exists() {
|
||
|
|
let content = fs::read_to_string(&cmakelists).expect("Failed to read CMakeLists.txt");
|
||
|
|
let mut patched = content;
|
||
|
|
|
||
|
|
// Comment out the tesseract executable build
|
||
|
|
patched = patched.replace(
|
||
|
|
"add_executable(tesseract src/tesseract.cpp)",
|
||
|
|
"# WASM: disabled tesseract binary\n# add_executable(tesseract src/tesseract.cpp)",
|
||
|
|
);
|
||
|
|
patched = patched.replace(
|
||
|
|
"target_link_libraries(tesseract libtesseract)",
|
||
|
|
"# target_link_libraries(tesseract libtesseract)",
|
||
|
|
);
|
||
|
|
patched = patched.replace(
|
||
|
|
"target_link_libraries(tesseract pthread)",
|
||
|
|
"# target_link_libraries(tesseract pthread)",
|
||
|
|
);
|
||
|
|
patched = patched.replace(
|
||
|
|
"install(TARGETS tesseract DESTINATION bin)",
|
||
|
|
"# install(TARGETS tesseract DESTINATION bin)",
|
||
|
|
);
|
||
|
|
patched = patched.replace(
|
||
|
|
"if (MSVC)\n\
|
||
|
|
\x20 install(FILES $<TARGET_PDB_FILE:${PROJECT_NAME}> DESTINATION bin OPTIONAL)\n\
|
||
|
|
endif()",
|
||
|
|
"# WASM: disabled MSVC PDB install\n\
|
||
|
|
# if (MSVC)\n\
|
||
|
|
# install(FILES $<TARGET_PDB_FILE:${PROJECT_NAME}> DESTINATION bin OPTIONAL)\n\
|
||
|
|
# endif()",
|
||
|
|
);
|
||
|
|
|
||
|
|
fs::write(&cmakelists, patched).expect("Failed to write patched CMakeLists.txt");
|
||
|
|
eprintln!("Disabled tesseract binary build in CMakeLists.txt");
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
/// Apply C++ source fixups programmatically when the diff patch fails.
///
/// These are string-replacement versions of the changes in
/// `patches/tesseract.diff`. Each listed file under `tesseract_dir` is patched
/// only if it exists, and is written back only when its content actually
/// changes, so a second run over a patched tree modifies nothing.
///
/// # Panics
///
/// Panics if an existing file cannot be read or written.
fn apply_wasm_source_fixups(tesseract_dir: &Path) {
    /// Runs `transform` over the file at `root/rel_path` (if it exists) and
    /// writes the result back only when it differs from the current content.
    /// Logs `Patched <file name>: <summary>` after a write.
    fn patch_source<F>(root: &Path, rel_path: &str, summary: &str, transform: F)
    where
        F: FnOnce(&str) -> String,
    {
        let path = root.join(rel_path);
        if !path.exists() {
            return;
        }
        let file_name = rel_path.rsplit('/').next().unwrap_or(rel_path);
        let original = fs::read_to_string(&path)
            .unwrap_or_else(|err| panic!("Failed to read {}: {:?}", file_name, err));
        let patched = transform(&original);
        if patched == original {
            return;
        }
        fs::write(&path, patched).unwrap_or_else(|err| panic!("Failed to write {}: {:?}", file_name, err));
        eprintln!("Patched {}: {}", file_name, summary);
    }

    eprintln!("Applying programmatic C++ source fixups for WASM");

    // 1. simddetect.cpp: Guard CPUID detection with !defined(__wasm__)
    patch_source(
        tesseract_dir,
        "src/arch/simddetect.cpp",
        "added __wasm__ guard for CPUID",
        |text| {
            if text.contains("#if !defined(__wasm__)") {
                return text.to_owned();
            }
            text.replace(
                "#if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_FMA) || defined(HAVE_SSE4_1)\n\
                 // See https://en.wikipedia.org/wiki/CPUID.\n\
                 # define HAS_CPUID\n\
                 #endif",
                "#if !defined(__wasm__)\n\
                 #if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_FMA) || defined(HAVE_SSE4_1)\n\
                 // See https://en.wikipedia.org/wiki/CPUID.\n\
                 # define HAS_CPUID\n\
                 #endif\n\
                 #endif",
            )
        },
    );

    // 2. pageiterator.cpp: Fix orientation null vector check
    patch_source(
        tesseract_dir,
        "src/ccmain/pageiterator.cpp",
        "fixed orientation null vector check",
        |text| {
            let needs_fix = text.contains("if (up_in_image.y() > 0.0F) {")
                && !text.contains("if (up_in_image.y() >= 0.0F) {");
            if needs_fix {
                text.replace("if (up_in_image.y() > 0.0F) {", "if (up_in_image.y() >= 0.0F) {")
            } else {
                text.to_owned()
            }
        },
    );

    // 3. tesseractclass.h: Convert pixa_debug_ to unique_ptr
    patch_source(
        tesseract_dir,
        "src/ccmain/tesseractclass.h",
        "pixa_debug_ -> unique_ptr",
        |text| text.replace("DebugPixa pixa_debug_;", "std::unique_ptr<DebugPixa> pixa_debug_;"),
    );

    // 4. tesseractclass.cpp: Update pixa_debug_ usage for unique_ptr
    patch_source(
        tesseract_dir,
        "src/ccmain/tesseractclass.cpp",
        "updated pixa_debug_ for unique_ptr",
        |text| {
            if !text.contains("pixa_debug_.WritePDF") {
                return text.to_owned();
            }
            // Clear() method: guard WritePDF with null check
            text.replace(
                " std::string debug_name = imagebasename + \"_debug.pdf\";\n pixa_debug_.WritePDF(debug_name.c_str());",
                " if (pixa_debug_) {\n std::string debug_name = imagebasename + \"_debug.pdf\";\n pixa_debug_->WritePDF(debug_name.c_str());\n }",
            )
            // Split methods: &pixa_debug_ -> pixa_debug_.get()
            .replace("&pixa_debug_)", "pixa_debug_.get())")
        },
    );

    // 5. pagesegmain.cpp: Update pixa_debug_ usage for unique_ptr
    patch_source(
        tesseract_dir,
        "src/ccmain/pagesegmain.cpp",
        "updated pixa_debug_ for unique_ptr",
        |text| {
            if !(text.contains("pixa_debug_.AddPix") || text.contains("&pixa_debug_")) {
                return text.to_owned();
            }
            // pixa_debug_.AddPix -> pixa_debug_->AddPix
            text.replace("pixa_debug_.AddPix(", "pixa_debug_->AddPix(")
                // Add null checks for dump_pageseg_images blocks
                .replace(
                    "if (tessedit_dump_pageseg_images) {\n pixa_debug_->AddPix(",
                    "if (tessedit_dump_pageseg_images && pixa_debug_) {\n pixa_debug_->AddPix(",
                )
                // &pixa_debug_ -> pixa_debug_.get()
                .replace("&pixa_debug_", "pixa_debug_.get()")
        },
    );

    // 6. CMakeLists.txt: Remove opencl and viewer source globs, strip API sources
    patch_source(
        tesseract_dir,
        "CMakeLists.txt",
        "removed unnecessary sources for WASM",
        |text| {
            // The opencl/viewer globs plus every API source except
            // baseapi.cpp and hocrrenderer.cpp.
            const REMOVED_LINES: [&str; 8] = [
                " src/opencl/*.cpp\n",
                " src/viewer/*.cpp\n",
                " src/api/capi.cpp\n",
                " src/api/renderer.cpp\n",
                " src/api/altorenderer.cpp\n",
                " src/api/lstmboxrenderer.cpp\n",
                " src/api/pdfrenderer.cpp\n",
                " src/api/wordstrboxrenderer.cpp\n",
            ];
            REMOVED_LINES
                .iter()
                .fold(text.to_owned(), |acc, line| acc.replace(line, ""))
        },
    );

    eprintln!("Programmatic C++ source fixups complete");
}
|
||
|
|
|
||
|
|
/// Install no-op threading stubs into a Tesseract source tree for WASM builds.
///
/// This function:
/// 1. Writes `src/wasm_noop_mutex.h`. When `TESSERACT_WASM_NOOP_MUTEX` is
///    defined the header maps the `TESSERACT_MUTEX_TYPE`, `TESSERACT_LOCK_GUARD`,
///    `TESSERACT_THREAD_TYPE` and `TESSERACT_THIS_THREAD` macros to no-op stubs;
///    otherwise it includes `<mutex>`/`<thread>` and maps them to the `std::`
///    types.
/// 2. Rewrites a fixed list of Tesseract sources to include that header and to
///    use the macros instead of the `std::` names.
///
/// The header and the sources are written only when their content changes, so
/// repeated runs leave an already-patched tree untouched. Listed sources that
/// are missing are skipped; sources that cannot be read are skipped with a
/// `cargo:warning`.
///
/// # Panics
///
/// Panics if the header or a patched source file cannot be written.
fn apply_wasm_noop_mutex_patch(tesseract_dir: &Path) {
    let noop_header = tesseract_dir.join("src/wasm_noop_mutex.h");
    let header_content = r#"// No-op threading primitives for single-threaded WASM builds.
// Replaces std::mutex, std::lock_guard, std::thread, std::this_thread
// to avoid dependency on <mutex>/<thread> which are unavailable in
// the non-threaded wasm32-wasi sysroot.
#ifndef TESSERACT_WASM_NOOP_MUTEX_H_
#define TESSERACT_WASM_NOOP_MUTEX_H_

#ifdef TESSERACT_WASM_NOOP_MUTEX

namespace wasm_noop {

struct mutex {
void lock() {}
void unlock() {}
bool try_lock() { return true; }
};

template <typename M>
struct lock_guard {
explicit lock_guard(M&) {}
~lock_guard() = default;
lock_guard(const lock_guard&) = delete;
lock_guard& operator=(const lock_guard&) = delete;
};

// No-op thread: single-threaded WASM never spawns threads.
// The callable is invoked synchronously in the constructor.
struct thread {
thread() = default;
template <typename F, typename... Args>
explicit thread(F&& f, Args&&... args) {
// Execute synchronously — no real thread in WASM.
f(static_cast<Args&&>(args)...);
}
bool joinable() const { return false; }
void join() {}
void detach() {}
};

namespace this_thread {
inline void yield() {}
} // namespace this_thread

} // namespace wasm_noop

#define TESSERACT_MUTEX_TYPE wasm_noop::mutex
#define TESSERACT_LOCK_GUARD wasm_noop::lock_guard
#define TESSERACT_THREAD_TYPE wasm_noop::thread
#define TESSERACT_THIS_THREAD wasm_noop::this_thread

#else

#include <mutex>
#include <thread>
#define TESSERACT_MUTEX_TYPE std::mutex
#define TESSERACT_LOCK_GUARD std::lock_guard
#define TESSERACT_THREAD_TYPE std::thread
#define TESSERACT_THIS_THREAD std::this_thread

#endif // TESSERACT_WASM_NOOP_MUTEX
#endif // TESSERACT_WASM_NOOP_MUTEX_H_
"#;

    // Leave an up-to-date header alone so its mtime does not change.
    let header_is_current = fs::read_to_string(&noop_header)
        .map(|existing| existing == header_content)
        .unwrap_or(false);
    if !header_is_current {
        fs::write(&noop_header, header_content).expect("Failed to write wasm_noop_mutex.h");
        eprintln!("Wrote wasm_noop_mutex.h for WASM no-op threading stubs");
    }

    // Patch source files to use the no-op header
    let files_to_patch = [
        "src/lstm/networkscratch.h",
        "src/ccstruct/imagedata.h",
        "src/ccstruct/imagedata.cpp",
        "src/ccutil/object_cache.h",
        "src/classify/intfx.cpp",
    ];

    for rel_path in &files_to_patch {
        let file_path = tesseract_dir.join(rel_path);
        if !file_path.exists() {
            eprintln!("Skipping {}: file not found", rel_path);
            continue;
        }

        let content = match fs::read_to_string(&file_path) {
            Ok(text) => text,
            Err(err) => {
                // Surface the problem instead of silently treating the file as empty.
                println!("cargo:warning=Skipping {}: failed to read ({})", rel_path, err);
                continue;
            }
        };

        // Order matters: `std::mutex` is rewritten first, so the lock_guard
        // pattern must match the already-rewritten template argument.
        // `std::thread` is not a substring of `std::this_thread`, so the two
        // thread replacements cannot interfere with each other.
        let patched = content
            // Replace threading headers with our no-op header
            .replace("#include <mutex>", "#include \"wasm_noop_mutex.h\"")
            .replace("#include <thread>", "#include \"wasm_noop_mutex.h\"")
            .replace("std::mutex", "TESSERACT_MUTEX_TYPE")
            .replace("std::lock_guard<TESSERACT_MUTEX_TYPE>", "TESSERACT_LOCK_GUARD<TESSERACT_MUTEX_TYPE>")
            .replace("std::thread", "TESSERACT_THREAD_TYPE")
            .replace("std::this_thread", "TESSERACT_THIS_THREAD");

        if patched != content {
            fs::write(&file_path, patched).unwrap_or_else(|_| panic!("Failed to patch {}", rel_path));
            eprintln!("Patched {} for WASM no-op threading", rel_path);
        }
    }
}
|
||
|
|
|
||
|
|
/// Deletes `cache_dir` together with everything stored beneath it.
///
/// The path is always logged first. When the directory does not exist the
/// function returns without touching the filesystem.
///
/// # Panics
///
/// Panics if the directory exists but cannot be removed.
fn clean_cache(cache_dir: &Path) {
    println!("Cleaning cache directory: {:?}", cache_dir);

    // Nothing to clean: bail out early instead of nesting the removal.
    if !cache_dir.exists() {
        return;
    }

    fs::remove_dir_all(cache_dir).expect("Failed to remove cache directory");
}
|
||
|
|
|
||
|
|
fn build_leptonica_wasm(leptonica_src: &Path, leptonica_install: &Path, wasi_sdk_dir: &Path) {
|
||
|
|
let toolchain_file = find_wasi_toolchain(wasi_sdk_dir);
|
||
|
|
let sysroot = wasi_sdk_dir.join("share/wasi-sysroot");
|
||
|
|
let clang = wasi_sdk_dir.join("bin/clang");
|
||
|
|
|
||
|
|
let mut config = Config::new(leptonica_src);
|
||
|
|
|
||
|
|
config.target("wasm32-wasi");
|
||
|
|
// On Windows, the default Visual Studio generator ignores CMAKE_C_COMPILER
|
||
|
|
// and uses cl.exe, which doesn't understand GCC/Clang flags (-fPIC, -Wno-*, etc.).
|
||
|
|
// Force Ninja to ensure the WASI SDK clang is actually used.
|
||
|
|
if cfg!(target_os = "windows") {
|
||
|
|
config.generator("Ninja");
|
||
|
|
}
|
||
|
|
// Normalize all paths to forward slashes for CMake on Windows.
|
||
|
|
// Backslash paths (e.g. C:\hostedtoolcache\...) cause CMake "Invalid character escape"
|
||
|
|
// errors when written to CMakeCCompiler.cmake cache files.
|
||
|
|
config.define("CMAKE_TOOLCHAIN_FILE", normalize_cmake_path(&toolchain_file));
|
||
|
|
config.define("CMAKE_SYSROOT", normalize_cmake_path(&sysroot));
|
||
|
|
config.define("CMAKE_C_COMPILER", normalize_cmake_path(&clang));
|
||
|
|
|
||
|
|
config
|
||
|
|
.define("CMAKE_BUILD_TYPE", "Release")
|
||
|
|
.define("CMAKE_POLICY_VERSION_MINIMUM", "3.5")
|
||
|
|
// Skip executable linking in CMake try-compile checks (cross-compilation).
|
||
|
|
// On Windows, the host MSVC compiler may be used for try-compile, and it
|
||
|
|
// does not understand GCC/Clang flags like -Wno-implicit-function-declaration.
|
||
|
|
.define("CMAKE_TRY_COMPILE_TARGET_TYPE", "STATIC_LIBRARY")
|
||
|
|
.define("LIBWEBP_SUPPORT", "OFF")
|
||
|
|
.define("OPENJPEG_SUPPORT", "OFF")
|
||
|
|
.define("ENABLE_ZLIB", "OFF")
|
||
|
|
.define("ENABLE_PNG", "OFF")
|
||
|
|
.define("ENABLE_JPEG", "OFF")
|
||
|
|
.define("ENABLE_TIFF", "OFF")
|
||
|
|
.define("ENABLE_WEBP", "OFF")
|
||
|
|
.define("ENABLE_OPENJPEG", "OFF")
|
||
|
|
.define("ENABLE_GIF", "OFF")
|
||
|
|
.define("BUILD_PROG", "OFF")
|
||
|
|
.define("BUILD_SHARED_LIBS", "OFF")
|
||
|
|
.define("NO_CONSOLE_IO", "ON")
|
||
|
|
.define("HAVE_LIBZ", "0")
|
||
|
|
.define("ENABLE_LTO", "OFF")
|
||
|
|
// Disable LTO in compiler flags to avoid LLVM bitcode version mismatch with Rust's linker.
|
||
|
|
// Enable WASI emulated process clocks for getrusage() support.
|
||
|
|
// Suppress implicit-function-declaration errors for POSIX functions not in WASI
|
||
|
|
// (e.g., mkstemp — WASI has no temp directories). These code paths are never reached
|
||
|
|
// in WASM since OCR is fully in-memory.
|
||
|
|
.define("CMAKE_C_FLAGS", "-fPIC -Os -fno-lto -fno-exceptions -D_WASI_EMULATED_PROCESS_CLOCKS -D_WASI_EMULATED_SIGNAL -Wno-implicit-function-declaration")
|
||
|
|
.define("CMAKE_INSTALL_PREFIX", normalize_cmake_path(leptonica_install));
|
||
|
|
|
||
|
|
config.build();
|
||
|
|
}
|
||
|
|
|
||
|
|
fn build_wasm() {
|
||
|
|
eprintln!("Building for WASM target with WASI SDK");
|
||
|
|
|
||
|
|
let custom_out_dir = prepare_out_dir();
|
||
|
|
let cache_dir = custom_out_dir.join("cache");
|
||
|
|
fs::create_dir_all(&cache_dir).expect("Failed to create cache directory");
|
||
|
|
|
||
|
|
let project_dir = custom_out_dir.clone();
|
||
|
|
let third_party_dir = project_dir.join("third_party");
|
||
|
|
|
||
|
|
eprintln!("Looking for WASI SDK...");
|
||
|
|
let wasi_sdk_dir = match find_wasi_sdk() {
|
||
|
|
Ok(path) => {
|
||
|
|
eprintln!("Found WASI SDK at: {}", path.display());
|
||
|
|
path
|
||
|
|
}
|
||
|
|
Err(err) => {
|
||
|
|
panic!(
|
||
|
|
"{}
|
||
|
|
|
||
|
|
Installation instructions:
|
||
|
|
Download from: https://github.com/WebAssembly/wasi-sdk/releases
|
||
|
|
Extract to ~/wasi-sdk or /opt/wasi-sdk
|
||
|
|
Set WASI_SDK_PATH environment variable to the extracted directory",
|
||
|
|
err
|
||
|
|
);
|
||
|
|
}
|
||
|
|
};
|
||
|
|
|
||
|
|
let leptonica_dir = if third_party_dir.join("leptonica").exists() {
|
||
|
|
eprintln!("Using existing leptonica source");
|
||
|
|
third_party_dir.join("leptonica")
|
||
|
|
} else {
|
||
|
|
fs::create_dir_all(&third_party_dir).expect("Failed to create third_party directory");
|
||
|
|
download_and_extract(&third_party_dir, &leptonica_url(), "leptonica")
|
||
|
|
};
|
||
|
|
|
||
|
|
let tesseract_dir = if third_party_dir.join("tesseract").exists() {
|
||
|
|
eprintln!("Using existing tesseract source");
|
||
|
|
third_party_dir.join("tesseract")
|
||
|
|
} else {
|
||
|
|
fs::create_dir_all(&third_party_dir).expect("Failed to create third_party directory");
|
||
|
|
let dir = download_and_extract(&third_party_dir, &tesseract_url(), "tesseract");
|
||
|
|
// Apply WASM patches to tesseract source
|
||
|
|
apply_tesseract_wasm_patch(&dir);
|
||
|
|
apply_wasm_noop_mutex_patch(&dir);
|
||
|
|
dir
|
||
|
|
};
|
||
|
|
|
||
|
|
let leptonica_install_dir = custom_out_dir.join("leptonica");
|
||
|
|
let leptonica_cache_dir = cache_dir.join("leptonica");
|
||
|
|
|
||
|
|
let _leptonica_link_name =
|
||
|
|
build_or_use_cached("leptonica", &leptonica_cache_dir, &leptonica_install_dir, || {
|
||
|
|
eprintln!("Building Leptonica for WASM...");
|
||
|
|
build_leptonica_wasm(&leptonica_dir, &leptonica_install_dir, &wasi_sdk_dir);
|
||
|
|
});
|
||
|
|
|
||
|
|
let tesseract_install_dir = custom_out_dir.join("tesseract");
|
||
|
|
let tesseract_cache_dir = cache_dir.join("tesseract");
|
||
|
|
|
||
|
|
let _tesseract_link_name =
|
||
|
|
build_or_use_cached("tesseract", &tesseract_cache_dir, &tesseract_install_dir, || {
|
||
|
|
eprintln!("Building Tesseract for WASM (SIMD enabled)...");
|
||
|
|
build_tesseract_wasm(
|
||
|
|
&tesseract_dir,
|
||
|
|
&tesseract_install_dir,
|
||
|
|
&leptonica_install_dir,
|
||
|
|
&wasi_sdk_dir,
|
||
|
|
true,
|
||
|
|
);
|
||
|
|
});
|
||
|
|
|
||
|
|
let leptonica_lib_dir = leptonica_install_dir.join("lib");
|
||
|
|
let tesseract_lib_dir = tesseract_install_dir.join("lib");
|
||
|
|
|
||
|
|
println!("cargo:rustc-link-search=native={}", leptonica_lib_dir.display());
|
||
|
|
println!("cargo:rustc-link-search=native={}", tesseract_lib_dir.display());
|
||
|
|
|
||
|
|
println!("cargo:rustc-link-lib=static=tesseract");
|
||
|
|
println!("cargo:rustc-link-lib=static=leptonica");
|
||
|
|
|
||
|
|
// Link WASI SDK sysroot libraries for C/C++ standard library symbols.
|
||
|
|
// Use wasm32-wasi (non-threaded) for both C and C++.
|
||
|
|
// Tesseract's mutex usage is handled by no-op stubs, so we don't need the
|
||
|
|
// threaded libc++ (which generates memory.atomic.wait32 that deadlocks in WASM).
|
||
|
|
let sysroot_lib = wasi_sdk_dir.join("share/wasi-sysroot/lib/wasm32-wasi");
|
||
|
|
eprintln!("Linking WASI SDK sysroot from: {}", sysroot_lib.display());
|
||
|
|
|
||
|
|
println!("cargo:rustc-link-search=native={}", sysroot_lib.display());
|
||
|
|
// WASI SDK v33+ moved libc++.a/libc++abi.a to the noeh subdirectory
|
||
|
|
let sysroot_lib_noeh = sysroot_lib.join("noeh");
|
||
|
|
if sysroot_lib_noeh.exists() {
|
||
|
|
println!("cargo:rustc-link-search=native={}", sysroot_lib_noeh.display());
|
||
|
|
}
|
||
|
|
// C++ libs from non-threaded sysroot (no atomic operations)
|
||
|
|
println!("cargo:rustc-link-lib=static=c++");
|
||
|
|
println!("cargo:rustc-link-lib=static=c++abi");
|
||
|
|
println!("cargo:rustc-link-lib=static=c");
|
||
|
|
// WASI emulation libraries for POSIX functions used by Leptonica/Tesseract
|
||
|
|
println!("cargo:rustc-link-lib=static=wasi-emulated-process-clocks");
|
||
|
|
println!("cargo:rustc-link-lib=static=wasi-emulated-signal");
|
||
|
|
|
||
|
|
// Link compiler-rt builtins
|
||
|
|
if let Some(rt_dir) = find_wasi_compiler_rt(&wasi_sdk_dir) {
|
||
|
|
eprintln!("Linking compiler-rt from: {}", rt_dir.display());
|
||
|
|
println!("cargo:rustc-link-search=native={}", rt_dir.display());
|
||
|
|
println!("cargo:rustc-link-lib=static=clang_rt.builtins-wasm32");
|
||
|
|
} else {
|
||
|
|
eprintln!("compiler-rt builtins not found in WASI SDK, some symbols may be unresolved");
|
||
|
|
}
|
||
|
|
|
||
|
|
// Bundle eng.traineddata for the optional `bundle-tessdata-eng` feature.
|
||
|
|
// Tesseract on WASM has no filesystem, so the kreuzberg-tesseract crate
|
||
|
|
// ships the language data as a `&'static [u8]` via include_bytes! when
|
||
|
|
// this feature is on. We always populate the path so include_bytes!
|
||
|
|
// resolves at compile time.
|
||
|
|
let bundled_tessdata_dir = project_dir.join("tessdata");
|
||
|
|
let eng_traineddata = bundled_tessdata_dir.join("eng.traineddata");
|
||
|
|
if !eng_traineddata.exists() {
|
||
|
|
fs::create_dir_all(&bundled_tessdata_dir).expect("Failed to create tessdata directory");
|
||
|
|
download_file_with_fallback(
|
||
|
|
&[
|
||
|
|
"https://github.com/tesseract-ocr/tessdata_fast/raw/main/eng.traineddata",
|
||
|
|
"https://raw.githubusercontent.com/tesseract-ocr/tessdata_fast/main/eng.traineddata",
|
||
|
|
],
|
||
|
|
&eng_traineddata,
|
||
|
|
"eng.traineddata",
|
||
|
|
);
|
||
|
|
}
|
||
|
|
println!("cargo:rustc-env=TESSDATA_PREFIX_BUNDLED={}", project_dir.display());
|
||
|
|
|
||
|
|
eprintln!("WASM build completed successfully!");
|
||
|
|
eprintln!("Leptonica install dir: {:?}", leptonica_install_dir);
|
||
|
|
eprintln!("Tesseract install dir: {:?}", tesseract_install_dir);
|
||
|
|
}
|
||
|
|
|
||
|
|
fn build_tesseract_wasm(
|
||
|
|
src_dir: &Path,
|
||
|
|
tesseract_install: &Path,
|
||
|
|
leptonica_install: &Path,
|
||
|
|
wasi_sdk_dir: &Path,
|
||
|
|
enable_simd: bool,
|
||
|
|
) {
|
||
|
|
// Use the non-threaded WASI toolchain for Tesseract.
|
||
|
|
// Tesseract's std::mutex usage is replaced by no-op stubs via apply_wasm_noop_mutex_patch(),
|
||
|
|
// so we don't need the threaded libc++ (which generates memory.atomic.wait32 instructions
|
||
|
|
// that deadlock in single-threaded WASM environments without SharedArrayBuffer).
|
||
|
|
let toolchain_file = find_wasi_toolchain(wasi_sdk_dir);
|
||
|
|
let sysroot = wasi_sdk_dir.join("share/wasi-sysroot");
|
||
|
|
let clang = wasi_sdk_dir.join("bin/clang");
|
||
|
|
let clangxx = wasi_sdk_dir.join("bin/clang++");
|
||
|
|
|
||
|
|
let mut config = Config::new(src_dir);
|
||
|
|
|
||
|
|
// Use wasm32-wasi (non-threaded) - no atomic operations emitted
|
||
|
|
config.target("wasm32-wasi");
|
||
|
|
// On Windows, the default Visual Studio generator ignores CMAKE_C_COMPILER
|
||
|
|
// and uses cl.exe, which doesn't understand GCC/Clang flags (-fPIC, -Wno-*, etc.).
|
||
|
|
// Force Ninja to ensure the WASI SDK clang is actually used.
|
||
|
|
if cfg!(target_os = "windows") {
|
||
|
|
config.generator("Ninja");
|
||
|
|
}
|
||
|
|
// Normalize all paths to forward slashes for CMake on Windows.
|
||
|
|
// Backslash paths (e.g. C:\hostedtoolcache\...) cause CMake "Invalid character escape"
|
||
|
|
// errors when written to CMakeCCompiler.cmake cache files.
|
||
|
|
config.define("CMAKE_TOOLCHAIN_FILE", normalize_cmake_path(&toolchain_file));
|
||
|
|
config.define("CMAKE_SYSROOT", normalize_cmake_path(&sysroot));
|
||
|
|
config.define("CMAKE_C_COMPILER", normalize_cmake_path(&clang));
|
||
|
|
config.define("CMAKE_CXX_COMPILER", normalize_cmake_path(&clangxx));
|
||
|
|
config.define("WASI_SDK_PREFIX", normalize_cmake_path(wasi_sdk_dir));
|
||
|
|
|
||
|
|
let leptonica_lib_dir = leptonica_install.join("lib");
|
||
|
|
let leptonica_include_dir = leptonica_install.join("include");
|
||
|
|
|
||
|
|
// Leptonica_DIR must point to the directory containing LeptonicaConfig.cmake,
|
||
|
|
// not the install prefix. On Windows with WASI toolchain, CMAKE_PREFIX_PATH
|
||
|
|
// search doesn't find it automatically because the toolchain overrides search paths.
|
||
|
|
let leptonica_cmake_dir = leptonica_install.join("lib/cmake/leptonica");
|
||
|
|
config.define("Leptonica_DIR", normalize_cmake_path(&leptonica_cmake_dir));
|
||
|
|
config.define("CMAKE_PREFIX_PATH", normalize_cmake_path(leptonica_install));
|
||
|
|
// Help the linker find leptonica during try_compile checks
|
||
|
|
config.define(
|
||
|
|
"CMAKE_EXE_LINKER_FLAGS",
|
||
|
|
format!("-L{}", normalize_cmake_path(&leptonica_lib_dir)),
|
||
|
|
);
|
||
|
|
|
||
|
|
// TESSERACT_WASM_NOOP_MUTEX: Replace std::mutex with no-op stubs in WASM builds.
|
||
|
|
// The wasm32-wasi-threads libc++ provides std::mutex that uses memory.atomic.wait32,
|
||
|
|
// which deadlocks in single-threaded WASM environments (no SharedArrayBuffer).
|
||
|
|
let noop_mutex_include = src_dir.join("src");
|
||
|
|
let mut cxx_flags = String::from(
|
||
|
|
"-DTESSERACT_IMAGEDATA_AS_PIX -DTESSERACT_WASM_NOOP_MUTEX -fno-exceptions -D_WASI_EMULATED_PROCESS_CLOCKS -D_WASI_EMULATED_SIGNAL ",
|
||
|
|
);
|
||
|
|
if enable_simd {
|
||
|
|
cxx_flags.push_str("-msimd128 ");
|
||
|
|
}
|
||
|
|
cxx_flags.push_str(&format!(
|
||
|
|
"-fPIC -Os -fno-lto -I{} -I{}",
|
||
|
|
normalize_cmake_path(&leptonica_include_dir),
|
||
|
|
normalize_cmake_path(&noop_mutex_include)
|
||
|
|
));
|
||
|
|
|
||
|
|
let c_flags = format!(
|
||
|
|
"-fPIC -Os -fno-lto -fno-exceptions -D_WASI_EMULATED_PROCESS_CLOCKS -D_WASI_EMULATED_SIGNAL -I{}",
|
||
|
|
normalize_cmake_path(&leptonica_include_dir)
|
||
|
|
);
|
||
|
|
|
||
|
|
config
|
||
|
|
.define("CMAKE_BUILD_TYPE", "Release")
|
||
|
|
.define("CMAKE_POLICY_VERSION_MINIMUM", "3.5")
|
||
|
|
// Skip executable linking in CMake try-compile checks (cross-compilation).
|
||
|
|
// On Windows, the host MSVC compiler may be used for try-compile, and it
|
||
|
|
// does not understand GCC/Clang flags passed via CMAKE_C_FLAGS/CMAKE_CXX_FLAGS.
|
||
|
|
.define("CMAKE_TRY_COMPILE_TARGET_TYPE", "STATIC_LIBRARY")
|
||
|
|
// Cross-compilation: provide try_run results since we can't execute WASM binaries
|
||
|
|
.define("LEPT_TIFF_RESULT", "1")
|
||
|
|
.define("LEPT_TIFF_RESULT__TRYRUN_OUTPUT", "")
|
||
|
|
.define("BUILD_TESSERACT_BINARY", "OFF")
|
||
|
|
.define("BUILD_TRAINING_TOOLS", "OFF")
|
||
|
|
.define("INSTALL_CONFIGS", "ON")
|
||
|
|
.define("BUILD_TESTS", "OFF")
|
||
|
|
.define("BUILD_PROG", "OFF")
|
||
|
|
.define("SYNTAX_LOG", "OFF")
|
||
|
|
.define("DISABLE_ARCHIVE", "ON")
|
||
|
|
.define("DISABLE_CURL", "ON")
|
||
|
|
.define("DISABLE_OPENCL", "ON")
|
||
|
|
.define("DISABLE_TIFF", "ON")
|
||
|
|
.define("DISABLE_PNG", "ON")
|
||
|
|
.define("DISABLE_JPEG", "ON")
|
||
|
|
.define("DISABLE_WEBP", "ON")
|
||
|
|
.define("DISABLE_OPENJPEG", "ON")
|
||
|
|
.define("DISABLE_ZLIB", "ON")
|
||
|
|
.define("DISABLE_LIBXML2", "ON")
|
||
|
|
.define("DISABLE_LIBICU", "ON")
|
||
|
|
.define("DISABLE_LZMA", "ON")
|
||
|
|
.define("DISABLE_GIF", "ON")
|
||
|
|
.define("DISABLE_DEBUG_MESSAGES", "ON")
|
||
|
|
.define("GRAPHICS_DISABLED", "ON")
|
||
|
|
.define("USE_OPENCL", "OFF")
|
||
|
|
.define("OPENMP_BUILD", "OFF")
|
||
|
|
.define("ENABLE_LTO", "OFF")
|
||
|
|
// For WASM, disable x86-specific SIMD detection (cpuid.h).
|
||
|
|
// WASM SIMD is enabled via -msimd128 compiler flag instead.
|
||
|
|
.define("HAVE_SSE4_1", "OFF")
|
||
|
|
.define("HAVE_AVX", "OFF")
|
||
|
|
.define("HAVE_AVX2", "OFF")
|
||
|
|
.define("HAVE_AVX512F", "OFF")
|
||
|
|
.define("HAVE_FMA", "OFF")
|
||
|
|
.define("CMAKE_INSTALL_PREFIX", normalize_cmake_path(tesseract_install))
|
||
|
|
.define("CMAKE_CXX_FLAGS", &cxx_flags)
|
||
|
|
.define("CMAKE_C_FLAGS", &c_flags);
|
||
|
|
|
||
|
|
config.build();
|
||
|
|
}
|
||
|
|
|
||
|
|
fn build_or_use_cached<F>(name: &str, cache_dir: &Path, install_dir: &Path, build_fn: F) -> String
|
||
|
|
where
|
||
|
|
F: FnOnce(),
|
||
|
|
{
|
||
|
|
let target_env = env::var("CARGO_CFG_TARGET_ENV").unwrap_or_default();
|
||
|
|
let target_triple = env::var("TARGET")
|
||
|
|
.unwrap_or_else(|_| env::var("CARGO_CFG_TARGET_ARCH").unwrap_or_else(|_| "unknown".to_string()));
|
||
|
|
let is_windows = target_triple.contains("windows");
|
||
|
|
let is_windows_gnu = is_windows && target_env == "gnu";
|
||
|
|
|
||
|
|
let lib_name = if is_windows && !is_windows_gnu {
|
||
|
|
format!("{}.lib", name)
|
||
|
|
} else {
|
||
|
|
format!("lib{}.a", name)
|
||
|
|
};
|
||
|
|
|
||
|
|
let cached_path = cache_dir.join(&lib_name);
|
||
|
|
let marker_path = cache_dir.join(format!("{}.target", name));
|
||
|
|
let out_path = install_dir.join("lib").join(&lib_name);
|
||
|
|
|
||
|
|
let possible_lib_names: Vec<String> = if is_windows {
|
||
|
|
let mut base = match name {
|
||
|
|
"leptonica" => vec![
|
||
|
|
"leptonica.lib".to_string(),
|
||
|
|
"libleptonica.lib".to_string(),
|
||
|
|
"leptonica-static.lib".to_string(),
|
||
|
|
format!("leptonica-{}.lib", LEPTONICA_VERSION),
|
||
|
|
"leptonica-1.86.0.lib".to_string(),
|
||
|
|
"leptonica-1.84.1.lib".to_string(),
|
||
|
|
"leptonicad.lib".to_string(),
|
||
|
|
"libleptonica_d.lib".to_string(),
|
||
|
|
format!("leptonica-{}d.lib", LEPTONICA_VERSION),
|
||
|
|
"leptonica-1.86.0d.lib".to_string(),
|
||
|
|
"leptonica-1.84.1d.lib".to_string(),
|
||
|
|
],
|
||
|
|
"tesseract" => vec![
|
||
|
|
"tesseract.lib".to_string(),
|
||
|
|
"libtesseract.lib".to_string(),
|
||
|
|
"tesseract-static.lib".to_string(),
|
||
|
|
"tesseract53.lib".to_string(),
|
||
|
|
"tesseract54.lib".to_string(),
|
||
|
|
"tesseract55.lib".to_string(),
|
||
|
|
"tesseractd.lib".to_string(),
|
||
|
|
"libtesseract_d.lib".to_string(),
|
||
|
|
"tesseract53d.lib".to_string(),
|
||
|
|
"tesseract54d.lib".to_string(),
|
||
|
|
"tesseract55d.lib".to_string(),
|
||
|
|
],
|
||
|
|
_ => vec![format!("{}.lib", name)],
|
||
|
|
};
|
||
|
|
|
||
|
|
if is_windows_gnu {
|
||
|
|
match name {
|
||
|
|
"leptonica" => {
|
||
|
|
base.push(format!("libleptonica-{}.a", LEPTONICA_VERSION));
|
||
|
|
base.push("libleptonica.a".to_string());
|
||
|
|
}
|
||
|
|
"tesseract" => {
|
||
|
|
base.push(format!("libtesseract{}.a", TESSERACT_VERSION.replace('.', "")));
|
||
|
|
base.push("libtesseract.a".to_string());
|
||
|
|
base.push("libtesseract55.a".to_string());
|
||
|
|
}
|
||
|
|
_ => {
|
||
|
|
base.push(format!("lib{}.a", name));
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
base
|
||
|
|
} else {
|
||
|
|
vec![format!("lib{}.a", name)]
|
||
|
|
};
|
||
|
|
|
||
|
|
fs::create_dir_all(cache_dir).expect("Failed to create cache directory");
|
||
|
|
fs::create_dir_all(out_path.parent().unwrap()).expect("Failed to create output directory");
|
||
|
|
|
||
|
|
let candidate_lib_dirs = [
|
||
|
|
install_dir.join("lib"),
|
||
|
|
install_dir.join("lib64"),
|
||
|
|
install_dir.join("lib").join("tesseract"),
|
||
|
|
];
|
||
|
|
|
||
|
|
let cache_valid = cached_path.exists()
|
||
|
|
&& {
|
||
|
|
match fs::read_to_string(&marker_path) {
|
||
|
|
Ok(cached_target) => {
|
||
|
|
let valid = cached_target.trim() == target_triple;
|
||
|
|
if !valid {
|
||
|
|
println!(
|
||
|
|
"cargo:warning=Cached {} library is for wrong architecture (cached: {}, current: {}), rebuilding",
|
||
|
|
name,
|
||
|
|
cached_target.trim(),
|
||
|
|
target_triple
|
||
|
|
);
|
||
|
|
let _ = fs::remove_file(&cached_path);
|
||
|
|
let _ = fs::remove_file(&marker_path);
|
||
|
|
}
|
||
|
|
valid
|
||
|
|
}
|
||
|
|
Err(_) => {
|
||
|
|
println!(
|
||
|
|
"cargo:warning=Cached {} library missing target marker, rebuilding",
|
||
|
|
name
|
||
|
|
);
|
||
|
|
let _ = fs::remove_file(&cached_path);
|
||
|
|
false
|
||
|
|
}
|
||
|
|
}
|
||
|
|
};
|
||
|
|
|
||
|
|
let link_name_to_use = if cache_valid {
|
||
|
|
eprintln!("Using cached {} library for {}", name, target_triple);
|
||
|
|
if let Err(e) = fs::copy(&cached_path, &out_path) {
|
||
|
|
eprintln!("Failed to copy cached library: {}", e);
|
||
|
|
build_fn();
|
||
|
|
}
|
||
|
|
name.to_string()
|
||
|
|
} else {
|
||
|
|
println!("Building {} library", name);
|
||
|
|
build_fn();
|
||
|
|
|
||
|
|
let mut found_lib_name = None;
|
||
|
|
'search: for lib_name in &possible_lib_names {
|
||
|
|
for dir in &candidate_lib_dirs {
|
||
|
|
let lib_path = dir.join(lib_name);
|
||
|
|
if lib_path.exists() {
|
||
|
|
eprintln!("Found {} library at: {}", name, lib_path.display());
|
||
|
|
let link_name = if lib_name.ends_with(".lib") {
|
||
|
|
lib_name.strip_suffix(".lib").unwrap_or(lib_name).to_string()
|
||
|
|
} else if lib_name.ends_with(".a") {
|
||
|
|
lib_name
|
||
|
|
.strip_prefix("lib")
|
||
|
|
.and_then(|s| s.strip_suffix(".a"))
|
||
|
|
.unwrap_or(lib_name)
|
||
|
|
.to_string()
|
||
|
|
} else {
|
||
|
|
lib_name.to_string()
|
||
|
|
};
|
||
|
|
found_lib_name = Some((lib_path, link_name));
|
||
|
|
break 'search;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
if let Some((lib_path, link_name)) = found_lib_name {
|
||
|
|
if out_path.exists() {
|
||
|
|
println!(
|
||
|
|
"cargo:warning=Library already available at expected location: {}",
|
||
|
|
out_path.display()
|
||
|
|
);
|
||
|
|
} else if let Err(e) = fs::copy(&lib_path, &out_path) {
|
||
|
|
eprintln!("Failed to copy library to standard location: {}", e);
|
||
|
|
}
|
||
|
|
if let Err(e) = fs::copy(&lib_path, &cached_path) {
|
||
|
|
eprintln!("Failed to cache library: {}", e);
|
||
|
|
} else if let Err(e) = fs::write(&marker_path, &target_triple) {
|
||
|
|
eprintln!("Failed to write cache marker: {}", e);
|
||
|
|
} else {
|
||
|
|
eprintln!("Cached {} library for {}", name, target_triple);
|
||
|
|
}
|
||
|
|
link_name
|
||
|
|
} else {
|
||
|
|
println!(
|
||
|
|
"cargo:warning=Library {} not found! Searched for: {:?}",
|
||
|
|
name, possible_lib_names
|
||
|
|
);
|
||
|
|
for dir in &candidate_lib_dirs {
|
||
|
|
eprintln!("Checked directory: {}", dir.display());
|
||
|
|
if let Ok(entries) = fs::read_dir(dir) {
|
||
|
|
eprintln!("Files in {}:", dir.display());
|
||
|
|
for entry in entries.flatten() {
|
||
|
|
eprintln!(" - {}", entry.file_name().to_string_lossy());
|
||
|
|
}
|
||
|
|
} else {
|
||
|
|
eprintln!("Directory not accessible: {}", dir.display());
|
||
|
|
}
|
||
|
|
}
|
||
|
|
name.to_string()
|
||
|
|
}
|
||
|
|
};
|
||
|
|
|
||
|
|
for dir in candidate_lib_dirs.iter().filter(|d| d.exists()) {
|
||
|
|
println!("cargo:rustc-link-search=native={}", dir.display());
|
||
|
|
}
|
||
|
|
|
||
|
|
// Return the link name instead of outputting the link directive here
|
||
|
|
// This allows the caller to control the linking order
|
||
|
|
link_name_to_use
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
/// Build-script entry point.
///
/// With the `build-tesseract` or `build-tesseract-wasm` feature enabled, the
/// bundled build in `build_tesseract` is run. With `dynamic-linking` enabled and
/// `build-tesseract` disabled, the system `tesseract` and `leptonica` libraries
/// are linked dynamically instead.
fn main() {
    #[cfg(any(feature = "build-tesseract", feature = "build-tesseract-wasm"))]
    {
        build_tesseract::build();
    }

    #[cfg(all(feature = "dynamic-linking", not(feature = "build-tesseract")))]
    {
        eprintln!("Using dynamic linking with system-installed Tesseract libraries");
        for system_lib in ["tesseract", "leptonica"].iter() {
            println!("cargo:rustc-link-lib=dylib={}", system_lib);
        }
    }
}
|