This commit is contained in:
499
crates/kreuzberg/Cargo.toml
Normal file
499
crates/kreuzberg/Cargo.toml
Normal file
@@ -0,0 +1,499 @@
|
||||
[package]
|
||||
name = "kreuzberg"
|
||||
version.workspace = true
|
||||
edition.workspace = true
|
||||
rust-version.workspace = true
|
||||
authors.workspace = true
|
||||
description = "High-performance document intelligence library for Rust. Extract text, metadata, and structured data from PDFs, Office documents, images, and 90+ formats and 300+ programming languages via tree-sitter code intelligence with async/sync APIs."
|
||||
license.workspace = true
|
||||
repository = "https://github.com/kreuzberg-dev/kreuzberg"
|
||||
homepage = "https://kreuzberg.dev"
|
||||
documentation = "https://docs.rs/kreuzberg"
|
||||
keywords = ["document", "extraction", "pdf", "ocr", "parser"]
|
||||
categories = ["parser-implementations", "text-processing"]
|
||||
readme = "README.md"
|
||||
|
||||
[package.metadata.cargo-machete]
|
||||
ignored = [
|
||||
"liter-llm",
|
||||
"opentelemetry_sdk",
|
||||
"tracing-opentelemetry",
|
||||
"kamadak-exif",
|
||||
# icu_provider is feature-gated and consumed transitively via the `chunking` feature;
|
||||
# cargo-machete cannot resolve `dep:icu_provider` references in feature flag chains.
|
||||
"icu_provider",
|
||||
"serde_toon_format",
|
||||
]
|
||||
|
||||
[lib]
|
||||
crate-type = ["rlib"]
|
||||
|
||||
[features]
|
||||
default = ["tokio-runtime", "simd-utf8"]
|
||||
|
||||
# Build-time only: enables pprof flamegraph profiling. No #[cfg] gates —
|
||||
# activates the pprof dependency (non-Windows, non-WASM only).
|
||||
profiling = ["dep:pprof"]
|
||||
|
||||
pool-metrics = []
|
||||
|
||||
simd-utf8 = ["dep:simdutf8"]
|
||||
|
||||
tokio-runtime = ["dep:tokio"]
|
||||
|
||||
# PDF extraction is currently backed by pdf_oxide. Keep this as the canonical
|
||||
# `pdf` feature until/unless a second PDF backend is introduced.
|
||||
pdf = ["dep:pdf_oxide", "dep:lopdf", "dep:image", "dep:flate2", "html"]
|
||||
excel = ["dep:calamine", "tokio-runtime"]
|
||||
excel-wasm = ["dep:calamine"]
|
||||
office = [
|
||||
"dep:cfb",
|
||||
"dep:roxmltree",
|
||||
"dep:zip",
|
||||
"dep:quick-xml",
|
||||
"dep:biblatex",
|
||||
"dep:biblib",
|
||||
"dep:org",
|
||||
"dep:dbase",
|
||||
"html",
|
||||
]
|
||||
hwp = ["dep:cfb", "dep:flate2"]
|
||||
hwpx = ["dep:unhwp", "dep:zip"]
|
||||
iwork = ["dep:zip", "dep:snap"]
|
||||
email = [
|
||||
"dep:mail-parser",
|
||||
"dep:cfb",
|
||||
"dep:outlook-pst",
|
||||
"dep:tempfile",
|
||||
"dep:chrono",
|
||||
"dep:chardetng",
|
||||
]
|
||||
html = ["dep:html-to-markdown-rs", "dep:v_htmlescape"]
|
||||
xml = ["dep:quick-xml", "dep:roxmltree"]
|
||||
archives = ["dep:zip", "dep:tar", "dep:sevenz-rust2", "dep:flate2"]
|
||||
mdx = []
|
||||
|
||||
liter-llm = ["dep:liter-llm", "dep:minijinja"]
|
||||
|
||||
tree-sitter = ["dep:tree-sitter-language-pack"]
|
||||
tree-sitter-wasm = ["tree-sitter"]
|
||||
|
||||
ocr = [
|
||||
"dep:kreuzberg-tesseract",
|
||||
"dep:image",
|
||||
"dep:tiff",
|
||||
"dep:fast_image_resize",
|
||||
"dep:kamadak-exif",
|
||||
"dep:hayro-jpeg2000",
|
||||
"dep:hayro-jbig2",
|
||||
"html",
|
||||
]
|
||||
# WASM OCR: Minimal Tesseract backend without html-to-markdown (which has WASI imports)
|
||||
# Includes only the core OCR processing dependencies needed for Tesseract on WASM
|
||||
ocr-wasm = [
|
||||
"dep:kreuzberg-tesseract",
|
||||
"dep:image",
|
||||
"dep:hayro-jpeg2000",
|
||||
"dep:hayro-jbig2",
|
||||
]
|
||||
# PaddleOCR type definitions only — pure-Rust config structs, no ORT dependency.
|
||||
# Exposes PaddleOcrConfig, PaddleLanguage, ModelPaths. Safe on Android/WASM.
|
||||
paddle-ocr-types = []
|
||||
# PaddleOCR via ONNX Runtime
|
||||
# Requires 'ocr' feature for shared conversion utilities and table reconstruction
|
||||
paddle-ocr = [
|
||||
"paddle-ocr-types",
|
||||
"dep:kreuzberg-paddle-ocr",
|
||||
"dep:sha2",
|
||||
"dep:image",
|
||||
"dep:hf-hub",
|
||||
"dep:ureq",
|
||||
"dep:ort",
|
||||
"dep:ndarray",
|
||||
"html",
|
||||
"tokio-runtime",
|
||||
"ocr",
|
||||
"auto-rotate",
|
||||
]
|
||||
# Bundle ORT binaries via the official Microsoft release (includes CoreML on macOS, CUDA on Linux).
|
||||
# When active, no system ORT library is required and ort_discovery is skipped.
|
||||
ort-bundled = ["ort/download-binaries", "ort/tls-native"]
|
||||
# Build-time only: configures ORT to load ONNX Runtime dynamically at runtime.
|
||||
# No #[cfg] gates — passed through to the `ort` crate.
|
||||
ort-dynamic = ["ort/load-dynamic"]
|
||||
# Document orientation detection using PP-LCNet for auto_rotate
|
||||
auto-rotate = [
|
||||
"auto-rotate-types",
|
||||
"dep:ort",
|
||||
"dep:ndarray",
|
||||
"dep:hf-hub",
|
||||
"dep:sha2",
|
||||
"dep:image",
|
||||
"dep:ureq",
|
||||
"ort-bundled",
|
||||
]
|
||||
# Layout detection via ONNX Runtime (YOLO + RT-DETR)
|
||||
layout-detection = [
|
||||
"layout-types",
|
||||
"dep:ort",
|
||||
"dep:ndarray",
|
||||
"dep:hf-hub",
|
||||
"dep:sha2",
|
||||
"dep:image",
|
||||
"tokio-runtime",
|
||||
"ort-bundled",
|
||||
]
|
||||
language-detection = ["dep:whatlang"]
|
||||
# layout-types: pure-Rust layout struct definitions — no ORT dependency.
|
||||
# Included by layout-detection; also usable standalone on WASM/Android targets.
|
||||
layout-types = []
|
||||
# auto-rotate-types: pure-Rust OrientationResult struct — no ORT dependency.
|
||||
# Included by auto-rotate; also usable standalone on WASM/Android targets.
|
||||
auto-rotate-types = []
|
||||
chunking = [
|
||||
"dep:auto_enums",
|
||||
"dep:either",
|
||||
"dep:icu_provider",
|
||||
"dep:icu_segmenter",
|
||||
"dep:itertools",
|
||||
"dep:strum",
|
||||
]
|
||||
chunking-tokenizers = ["chunking", "dep:tokenizers"]
|
||||
# embedding-presets: static preset metadata (WASM-safe, no ORT)
|
||||
embedding-presets = []
|
||||
# embeddings: full embedding generation with ONNX Runtime (ORT-dependent, WASM-incompatible)
|
||||
# NOTE: embedding-presets is a subset of embeddings (no ORT); always implied by embeddings.
|
||||
embeddings = [
|
||||
"dep:hf-hub",
|
||||
"dep:ort",
|
||||
"dep:ndarray",
|
||||
"dep:tokenizers",
|
||||
"chunking",
|
||||
"tokio-runtime",
|
||||
"ort-bundled",
|
||||
"embedding-presets",
|
||||
]
|
||||
stopwords = []
|
||||
quality = ["dep:unicode-normalization", "dep:chardetng", "stopwords"]
|
||||
|
||||
keywords-yake = ["stopwords"]
|
||||
keywords-rake = ["dep:rake", "stopwords"]
|
||||
keywords = ["keywords-yake", "keywords-rake"]
|
||||
|
||||
diff = ["dep:similar"]
|
||||
|
||||
tower-service = ["dep:tower", "tokio-runtime"]
|
||||
|
||||
api-types = []
|
||||
api = [
|
||||
"api-types",
|
||||
"tower-service",
|
||||
"dep:axum",
|
||||
"dep:chrono",
|
||||
"dep:moka",
|
||||
"dep:tower-http",
|
||||
"dep:utoipa",
|
||||
"dep:uuid",
|
||||
"tokio-runtime",
|
||||
"chunking",
|
||||
]
|
||||
mcp = ["tower-service", "dep:rmcp", "tokio-runtime"]
|
||||
mcp-http = ["mcp", "api"]
|
||||
|
||||
otel = ["dep:opentelemetry", "dep:opentelemetry_sdk", "dep:tracing-opentelemetry"]
|
||||
|
||||
# no-ort-target: shared base for targets that cannot use ONNX Runtime (Android x86_64 emulator,
|
||||
# WASM). Pure-Rust only — no ORT, no native-only C++ deps. Used as a foundation for both
|
||||
# wasm-target and android-target to avoid duplication.
|
||||
#
|
||||
# liter-llm is intentionally retained here: hosted LLM calls do not require ORT
|
||||
# and the dependency has a wasm-http backend. Runtime cfg gates still disable the
|
||||
# wasm LLM paths until the browser/runtime integration is wired; see the TODOs
|
||||
# next to those gates.
|
||||
no-ort-target = [
|
||||
"pdf",
|
||||
"html",
|
||||
"xml",
|
||||
"email",
|
||||
"language-detection",
|
||||
"chunking",
|
||||
"chunking-tokenizers",
|
||||
"quality",
|
||||
"keywords",
|
||||
"office",
|
||||
"iwork",
|
||||
"hwp",
|
||||
"hwpx",
|
||||
"mdx",
|
||||
"archives",
|
||||
"liter-llm",
|
||||
"stopwords",
|
||||
"embedding-presets",
|
||||
"layout-types",
|
||||
"auto-rotate-types",
|
||||
"paddle-ocr-types",
|
||||
]
|
||||
# wasm-target: no-ort-target plus WASM-specific overrides (excel-wasm, tree-sitter-wasm, ocr-wasm).
|
||||
wasm-target = ["no-ort-target", "excel-wasm", "tree-sitter-wasm", "ocr-wasm"]
|
||||
# android-target: no-ort-target plus native-Linux variants that work on Android ABI
|
||||
# (excel with tokio, native tree-sitter, Tesseract OCR, and API/MCP transport).
|
||||
# Excludes ORT-dependent features (paddle-ocr, layout-detection, embeddings, auto-rotate).
|
||||
android-target = ["no-ort-target", "excel", "tree-sitter", "ocr", "api", "mcp"]
|
||||
# Mobile deployment preset — formats + analysis + Tesseract OCR + code intelligence + API types.
|
||||
# Excludes ORT-dependent features (paddle-ocr, layout-detection, embeddings, auto-rotate)
|
||||
# and server transport infrastructure (mcp, otel, liter-llm).
|
||||
mobile = ["formats", "analysis", "ocr", "tree-sitter", "api-types", "tokio-runtime"]
|
||||
# WASM only: enables wasm-bindgen-rayon for multi-threaded WASM builds.
|
||||
# No #[cfg] gates — activates the wasm-bindgen-rayon dependency.
|
||||
wasm-threads = ["dep:wasm-bindgen-rayon"]
|
||||
# --- Aggregate features ---
|
||||
|
||||
# Document format extractors only — no infrastructure or text analysis.
|
||||
formats = [
|
||||
"pdf",
|
||||
"excel",
|
||||
"office",
|
||||
"hwp",
|
||||
"hwpx",
|
||||
"iwork",
|
||||
"email",
|
||||
"html",
|
||||
"xml",
|
||||
"archives",
|
||||
"mdx",
|
||||
]
|
||||
# Text processing and analysis capabilities.
|
||||
analysis = ["language-detection", "chunking", "quality", "keywords", "diff"]
|
||||
# Network services and observability infrastructure.
|
||||
services = ["api", "mcp", "otel"]
|
||||
# Everything — all formats, analysis, services, OCR, ML, and code intelligence.
|
||||
full = [
|
||||
"formats",
|
||||
"analysis",
|
||||
"services",
|
||||
"ocr",
|
||||
"paddle-ocr",
|
||||
"layout-detection",
|
||||
"embeddings",
|
||||
"chunking-tokenizers",
|
||||
"tree-sitter",
|
||||
"liter-llm",
|
||||
"tokio-runtime",
|
||||
"diff",
|
||||
]
|
||||
# Server deployment preset — formats + analysis + services + OCR (Tesseract and PaddleOCR) + layout detection.
|
||||
server = ["formats", "analysis", "services", "ocr", "paddle-ocr", "layout-detection"]
|
||||
# CLI-only features (thin gate for kreuzberg-cli).
|
||||
cli = ["services", "chunking"]
|
||||
|
||||
[dependencies]
|
||||
ahash = { workspace = true }
|
||||
async-trait = { workspace = true }
|
||||
auto_enums = { version = "0.8", optional = true }
|
||||
axum = { version = "0.8", features = ["macros", "json", "multipart"], optional = true }
|
||||
base64 = { workspace = true }
|
||||
biblatex = { version = "0.11", optional = true }
|
||||
biblib = { version = "0.4", default-features = false, features = [
|
||||
"ris",
|
||||
"pubmed",
|
||||
"xml",
|
||||
"regex",
|
||||
], optional = true }
|
||||
bitvec = "1.0"
|
||||
blake3 = { workspace = true }
|
||||
bytes = { workspace = true }
|
||||
calamine = { version = "0.35.0", features = ["dates"], optional = true }
|
||||
cfb = { workspace = true, optional = true }
|
||||
chardetng = { version = "1.0.0", optional = true }
|
||||
chrono = { workspace = true, optional = true }
|
||||
comrak = { workspace = true }
|
||||
dbase = { workspace = true, optional = true }
|
||||
dirs = "6"
|
||||
either = { version = "1", optional = true }
|
||||
encoding_rs = { version = "0.8.35" }
|
||||
fast_image_resize = { version = "6.0.0", optional = true }
|
||||
flate2 = { version = "1.1", optional = true }
|
||||
hayro-jbig2 = { version = "0.3", default-features = false, features = ["std"], optional = true }
|
||||
hayro-jpeg2000 = { version = "0.3", default-features = false, features = [
|
||||
"std",
|
||||
"simd",
|
||||
], optional = true }
|
||||
hex = { workspace = true }
|
||||
html-to-markdown-rs = { workspace = true, features = [
|
||||
"inline-images",
|
||||
"metadata",
|
||||
], optional = true }
|
||||
icu_provider = { version = "2", features = ["sync"], optional = true }
|
||||
icu_segmenter = { version = "2", optional = true }
|
||||
image = { workspace = true, default-features = false, features = [
|
||||
"png",
|
||||
"jpeg",
|
||||
"webp",
|
||||
"bmp",
|
||||
"tiff",
|
||||
"gif",
|
||||
"pnm",
|
||||
"rayon",
|
||||
], optional = true }
|
||||
indexmap = "2.14.0"
|
||||
infer = "0.19.0"
|
||||
itertools = { workspace = true, optional = true }
|
||||
jotdown = "0.10"
|
||||
kamadak-exif = { version = "0.6.1", optional = true }
|
||||
|
||||
kreuzberg-tesseract = { path = "../kreuzberg-tesseract", version = "5.0.0-rc.3", optional = true }
|
||||
libc = { workspace = true }
|
||||
# liter-llm is declared in the platform-conditional dependencies block below — it must be
|
||||
# excluded on Windows because it pulls reqwest/rustls -> aws-lc-rs -> aws-lc-sys, which fails
|
||||
# to build on Windows MSVC.
|
||||
log = { workspace = true }
|
||||
lopdf = { version = "0.40.0", optional = true }
|
||||
mail-parser = { version = "0.11.3", optional = true }
|
||||
memchr = "2.8.1"
|
||||
memmap2 = { workspace = true }
|
||||
mime_guess = "2.0"
|
||||
minijinja = { workspace = true, optional = true }
|
||||
moka = { version = "0.12", features = ["sync"], optional = true }
|
||||
ndarray = { version = "0.17", optional = true }
|
||||
num_cpus = { workspace = true }
|
||||
once_cell = { workspace = true }
|
||||
opentelemetry = { version = "0.32", features = ["trace"], optional = true }
|
||||
opentelemetry_sdk = { version = "0.32", features = ["rt-tokio"], optional = true }
|
||||
org = { version = "0.3", optional = true }
|
||||
ort = { version = "2.0.0-rc.12", default-features = false, features = [
|
||||
"std",
|
||||
"ndarray",
|
||||
"api-18",
|
||||
], optional = true }
|
||||
outlook-pst = { version = "1.2.0", optional = true }
|
||||
parking_lot = { workspace = true }
|
||||
pastey = "0.2"
|
||||
pdf_oxide = { workspace = true, features = ["rendering"], optional = true }
|
||||
pulldown-cmark = { version = "0.13" }
|
||||
quick-xml = { version = "0.40.1", features = ["serialize"], optional = true }
|
||||
rake = { version = "0.3.6", optional = true }
|
||||
rayon = { workspace = true }
|
||||
regex = "1.12.3"
|
||||
rmcp = { version = "1.7.0", features = [
|
||||
"server",
|
||||
"macros",
|
||||
"base64",
|
||||
"transport-io",
|
||||
"transport-streamable-http-server",
|
||||
"server-side-http",
|
||||
], optional = true }
|
||||
rmp-serde = "1.3"
|
||||
|
||||
roxmltree = { version = "0.21.1", optional = true }
|
||||
serde = { workspace = true }
|
||||
serde_json = { workspace = true }
|
||||
serde_toon_format = { workspace = true }
|
||||
serde_yaml_ng = "0.10.0"
|
||||
sevenz-rust2 = { version = "0.20.2", optional = true }
|
||||
sha2 = { version = "0.11", optional = true }
|
||||
simdutf8 = { version = "0.1", optional = true }
|
||||
similar = { workspace = true, optional = true }
|
||||
snap = { version = "1.1", optional = true }
|
||||
strum = { version = "0.28", features = ["derive"], optional = true }
|
||||
tar = { version = "^0.4", optional = true }
|
||||
tempfile = { workspace = true, optional = true }
|
||||
thiserror = { workspace = true }
|
||||
tiff = { version = "0.11", optional = true }
|
||||
tokenizers = { version = "0.23", optional = true, default-features = false, features = [
|
||||
"http",
|
||||
"fancy-regex",
|
||||
] }
|
||||
tokio = { workspace = true, optional = true }
|
||||
toml = { workspace = true }
|
||||
tower = { version = "0.5", features = ["timeout", "limit", "util"], optional = true }
|
||||
tower-http = { version = "0.6", features = [
|
||||
"cors",
|
||||
"trace",
|
||||
"limit",
|
||||
"catch-panic",
|
||||
"request-id",
|
||||
"sensitive-headers",
|
||||
"compression-full",
|
||||
], optional = true }
|
||||
tracing = { workspace = true }
|
||||
tracing-opentelemetry = { version = "0.33", optional = true }
|
||||
unhwp = { version = "0.3.2", default-features = false, features = ["hwpx"], optional = true }
|
||||
unicode-normalization = { version = "0.1.25", optional = true }
|
||||
urlencoding = "2"
|
||||
utoipa = { version = "5.5", features = ["axum_extras"], optional = true }
|
||||
uuid = { version = "1", features = ["v4"], optional = true }
|
||||
v_htmlescape = { version = "0.17", optional = true }
|
||||
whatlang = { version = "0.18.0", optional = true }
|
||||
# NOTE(review): this requirement has no upper bound, whereas [dev-dependencies] caps zip at
# `>=7.0.0, <8.7.0` — confirm whether the same upper bound should apply to the published dependency.
zip = { version = ">=7.0.0", optional = true, default-features = false, features = [
|
||||
"deflate-flate2",
|
||||
] }
|
||||
|
||||
[target.'cfg(all(not(target_os = "windows"), not(target_arch = "wasm32")))'.dependencies]
|
||||
hf-hub = { version = "0.5", default-features = false, features = ["ureq"], optional = true }
|
||||
# PaddleOCR via ONNX Runtime - not available on WASM (vendored from paddle-ocr-rs)
|
||||
kreuzberg-paddle-ocr = { path = "../kreuzberg-paddle-ocr", version = "5.0.0-rc.3", optional = true }
|
||||
# liter-llm pulls reqwest/rustls -> aws-lc-rs -> aws-lc-sys. aws-lc-sys 0.40 fails to build on
|
||||
# Windows MSVC (stdalign feature detection treats `-WX` warnings as errors). Until upstream ships
|
||||
# a fix, restrict liter-llm to non-Windows targets so Windows FFI/CLI builds don't pull aws-lc-sys.
|
||||
liter-llm = { workspace = true, optional = true, features = ["native-http"] }
|
||||
pprof = { version = "0.15.0", features = ["flamegraph"], optional = true }
|
||||
# Force ureq (transitive dep via hf-hub) to use rustls on non-Windows
|
||||
ureq = { version = "3.3", default-features = false, features = ["rustls", "json"], optional = true }
|
||||
|
||||
# Use native-tls on Windows to avoid aws-lc-sys CMake build issues with MinGW
|
||||
[target.'cfg(all(target_os = "windows", not(target_arch = "wasm32")))'.dependencies]
|
||||
hf-hub = { version = "0.5", default-features = false, features = ["ureq"], optional = true }
|
||||
# PaddleOCR via ONNX Runtime - not available on WASM (vendored from paddle-ocr-rs)
|
||||
kreuzberg-paddle-ocr = { path = "../kreuzberg-paddle-ocr", version = "5.0.0-rc.3", optional = true }
|
||||
# Force ureq (transitive dep via hf-hub) to use native-tls on Windows
|
||||
ureq = { version = "3.3", default-features = false, features = [
|
||||
"native-tls",
|
||||
"json",
|
||||
], optional = true }
|
||||
|
||||
[target.'cfg(not(target_arch = "wasm32"))'.dependencies.tree-sitter-language-pack]
|
||||
workspace = true
|
||||
features = ["dynamic-loading", "download", "serde"]
|
||||
optional = true
|
||||
|
||||
[target.'cfg(target_arch = "wasm32")'.dependencies]
|
||||
# Override getrandom to enable wasm_js feature for WASM targets
|
||||
# This is needed because ring/rustls (via ureq) depend on getrandom without wasm_js feature
|
||||
getrandom = { version = "0.4", features = ["wasm_js"] }
|
||||
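# On wasm32, swap kreuzberg-tesseract from its native default ("static-linking" -> "build-tesseract")
|
||||
# to the WASI-SDK build path ("build-tesseract-wasm") and bundle eng.traineddata so OCR works
|
||||
# in-memory with no filesystem.
# NOTE(review): kreuzberg-tesseract is also declared unconditionally in [dependencies] without
# `default-features = false`; Cargo unifies features across declarations that apply to the same
# target, so the native default features may still be enabled on wasm32 — confirm.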
|
||||
kreuzberg-tesseract = { path = "../kreuzberg-tesseract", version = "5.0.0-rc.3", default-features = false, features = [
|
||||
"build-tesseract-wasm",
|
||||
"bundle-tessdata-eng",
|
||||
], optional = true }
|
||||
liter-llm = { workspace = true, optional = true, features = ["wasm-http"] }
|
||||
tree-sitter-language-pack = { workspace = true, default-features = false, features = [
|
||||
"serde",
|
||||
], optional = true }
|
||||
wasm-bindgen-rayon = { version = "1.3", optional = true }
|
||||
|
||||
[build-dependencies]
|
||||
|
||||
[dev-dependencies]
|
||||
anyhow = { workspace = true }
|
||||
criterion = { workspace = true }
|
||||
dotenvy = "0.15"
|
||||
filetime = "0.2"
|
||||
image = { workspace = true, default-features = false, features = ["png"] }
|
||||
jsonschema = "0.46"
|
||||
serial_test = "3.4.0"
|
||||
tar = "^0.4"
|
||||
tempfile = { workspace = true }
|
||||
tokio = { workspace = true, features = ["macros", "time"] }
|
||||
tokio-test = "0.4"
|
||||
tracing-subscriber = { version = "0.3", features = ["env-filter"] }
|
||||
zip = { version = ">=7.0.0, <8.7.0", default-features = false, features = ["deflate-flate2"] }
|
||||
|
||||
[[bench]]
|
||||
name = "text_quality"
|
||||
harness = false
|
||||
required-features = ["quality"]
|
||||
|
||||
[lints]
|
||||
workspace = true
|
||||
Reference in New Issue
Block a user