500 lines
17 KiB
TOML
500 lines
17 KiB
TOML
# Package metadata for the `kreuzberg` crate. Keys written as
# `<key>.workspace = true` inherit their value from the workspace manifest.
[package]
name = "kreuzberg"
version.workspace = true
authors.workspace = true
categories = ["parser-implementations", "text-processing"]
documentation = "https://docs.rs/kreuzberg"
edition.workspace = true
homepage = "https://kreuzberg.dev"
keywords = ["document", "extraction", "pdf", "ocr", "parser"]
license.workspace = true
readme = "README.md"
repository = "https://github.com/kreuzberg-dev/kreuzberg"
rust-version.workspace = true
description = "High-performance document intelligence library for Rust. Extract text, metadata, and structured data from PDFs, Office documents, images, and 90+ formats and 300+ programming languages via tree-sitter code intelligence with async/sync APIs."
|
|
|
|
# Dependencies that cargo-machete is told to skip when reporting unused crates.
[package.metadata.cargo-machete]
ignored = [
    "liter-llm",
    "opentelemetry_sdk",
    "tracing-opentelemetry",
    "kamadak-exif",
    # icu_provider is an optional dependency that is only switched on through the
    # `dep:icu_provider` entry of the `chunking` feature; cargo-machete cannot
    # resolve `dep:icu_provider` references in feature flag chains.
    "icu_provider",
    "serde_toon_format",
]
|
|
|
|
# The library target is built as a plain Rust library (rlib) only.
[lib]
crate-type = ["rlib"]
|
|
|
|
[features]
# Features active unless the consumer disables default features.
default = ["simd-utf8", "tokio-runtime"]

# profiling: build-time only. Activates the optional pprof dependency for
# flamegraph profiling (declared for non-Windows, non-WASM targets only); there
# are no #[cfg] gates on this feature.
profiling = ["dep:pprof"]

pool-metrics = []

simd-utf8 = ["dep:simdutf8"]

tokio-runtime = ["dep:tokio"]
|
|
|
|
# `pdf` is the canonical PDF feature and is currently backed by pdf_oxide. Keep
# it that way until/unless a second PDF backend is introduced.
pdf = [
    "dep:flate2",
    "dep:image",
    "dep:lopdf",
    "dep:pdf_oxide",
    "html",
]
excel = ["dep:calamine", "tokio-runtime"]
excel-wasm = ["dep:calamine"]
office = [
    "dep:biblatex",
    "dep:biblib",
    "dep:cfb",
    "dep:dbase",
    "dep:org",
    "dep:quick-xml",
    "dep:roxmltree",
    "dep:zip",
    "html",
]
hwp = ["dep:cfb", "dep:flate2"]
hwpx = ["dep:unhwp", "dep:zip"]
iwork = ["dep:snap", "dep:zip"]
email = [
    "dep:cfb",
    "dep:chardetng",
    "dep:chrono",
    "dep:mail-parser",
    "dep:outlook-pst",
    "dep:tempfile",
]
html = ["dep:html-to-markdown-rs", "dep:v_htmlescape"]
xml = ["dep:quick-xml", "dep:roxmltree"]
archives = ["dep:flate2", "dep:sevenz-rust2", "dep:tar", "dep:zip"]
mdx = []
|
|
|
|
# liter-llm: activates the optional liter-llm and minijinja dependencies.
liter-llm = ["dep:minijinja", "dep:liter-llm"]

# tree-sitter: activates the optional tree-sitter-language-pack dependency.
tree-sitter = ["dep:tree-sitter-language-pack"]
# tree-sitter-wasm: forwards to `tree-sitter`; the wasm32-specific dependency
# settings live in the wasm32 target table.
tree-sitter-wasm = ["tree-sitter"]
|
|
|
|
# ocr: OCR through the kreuzberg-tesseract backend, plus the `html` feature.
ocr = [
    "dep:fast_image_resize",
    "dep:hayro-jbig2",
    "dep:hayro-jpeg2000",
    "dep:image",
    "dep:kamadak-exif",
    "dep:kreuzberg-tesseract",
    "dep:tiff",
    "html",
]
# ocr-wasm: minimal Tesseract backend for WASM. Leaves out html-to-markdown
# (which has WASI imports) and keeps only the core OCR processing dependencies
# that Tesseract needs on WASM.
ocr-wasm = [
    "dep:hayro-jbig2",
    "dep:hayro-jpeg2000",
    "dep:image",
    "dep:kreuzberg-tesseract",
]
|
|
# paddle-ocr-types: PaddleOCR type definitions only — pure-Rust config structs
# with no ORT dependency. Exposes PaddleOcrConfig, PaddleLanguage, ModelPaths and
# is safe on Android/WASM.
paddle-ocr-types = []
# paddle-ocr: PaddleOCR via ONNX Runtime. Requires the `ocr` feature for shared
# conversion utilities and table reconstruction.
paddle-ocr = [
    "dep:hf-hub",
    "dep:image",
    "dep:kreuzberg-paddle-ocr",
    "dep:ndarray",
    "dep:ort",
    "dep:sha2",
    "dep:ureq",
    "auto-rotate",
    "html",
    "ocr",
    "paddle-ocr-types",
    "tokio-runtime",
]
|
|
# ort-bundled: bundle ORT binaries via the official Microsoft release (includes
# CoreML on macOS, CUDA on Linux). When active, no system ORT library is required
# and ort_discovery is skipped.
ort-bundled = ["ort/download-binaries", "ort/tls-native"]
# ort-dynamic: build-time only. Passed through to the `ort` crate so that ONNX
# Runtime is loaded dynamically at runtime; no #[cfg] gates use it.
ort-dynamic = ["ort/load-dynamic"]
|
|
# auto-rotate: document orientation detection using PP-LCNet for auto_rotate.
# Activates the optional ort dependency and the `ort-bundled` feature.
auto-rotate = [
    "auto-rotate-types",
    "dep:ort",
    "dep:ndarray",
    "dep:hf-hub",
    "dep:sha2",
    "dep:image",
    "dep:ureq",
    "ort-bundled",
]
# layout-detection: layout detection via ONNX Runtime (YOLO + RT-DETR).
# Activates the optional ort dependency and the `ort-bundled` feature.
layout-detection = [
    "layout-types",
    "dep:ort",
    "dep:ndarray",
    "dep:hf-hub",
    "dep:sha2",
    "dep:image",
    "tokio-runtime",
    "ort-bundled",
]
|
|
# language-detection: activates the optional whatlang dependency.
language-detection = ["dep:whatlang"]
# layout-types: pure-Rust layout struct definitions with no ORT dependency.
# Pulled in by layout-detection, and usable on its own on WASM/Android targets.
layout-types = []
# auto-rotate-types: pure-Rust OrientationResult struct with no ORT dependency.
# Pulled in by auto-rotate, and usable on its own on WASM/Android targets.
auto-rotate-types = []
|
|
# chunking: activates the optional crates listed below (including the ICU
# provider/segmenter pair).
chunking = [
    "dep:auto_enums",
    "dep:either",
    "dep:icu_provider",
    "dep:icu_segmenter",
    "dep:itertools",
    "dep:strum",
]
# chunking-tokenizers: `chunking` plus the optional tokenizers dependency.
chunking-tokenizers = ["dep:tokenizers", "chunking"]
# embedding-presets: static preset metadata (WASM-safe, no ORT).
embedding-presets = []
# embeddings: full embedding generation with ONNX Runtime (ORT-dependent and
# therefore WASM-incompatible).
# NOTE: embedding-presets is the ORT-free subset of embeddings and is always
# implied by it.
embeddings = [
    "dep:hf-hub",
    "dep:ndarray",
    "dep:ort",
    "dep:tokenizers",
    "chunking",
    "embedding-presets",
    "ort-bundled",
    "tokio-runtime",
]
|
|
stopwords = []
quality = ["dep:chardetng", "dep:unicode-normalization", "stopwords"]

# `keywords` enables both keyword-extraction variants; each one depends on
# `stopwords`.
keywords-yake = ["stopwords"]
keywords-rake = ["dep:rake", "stopwords"]
keywords = ["keywords-rake", "keywords-yake"]
|
|
|
|
diff = ["dep:similar"]

tower-service = ["dep:tower", "tokio-runtime"]

# api-types carries no dependencies of its own; `api` builds on it and on
# `tower-service`.
api-types = []
api = [
    "dep:axum",
    "dep:chrono",
    "dep:moka",
    "dep:tower-http",
    "dep:utoipa",
    "dep:uuid",
    "api-types",
    "chunking",
    "tokio-runtime",
    "tower-service",
]
mcp = ["dep:rmcp", "tokio-runtime", "tower-service"]
mcp-http = ["api", "mcp"]

otel = [
    "dep:opentelemetry",
    "dep:opentelemetry_sdk",
    "dep:tracing-opentelemetry",
]
|
|
|
|
# no-ort-target: shared base for targets that cannot use ONNX Runtime (Android
# x86_64 emulator, WASM). Pure-Rust only — no ORT, no native-only C++ deps. Both
# wasm-target and android-target build on it to avoid duplication.
#
# liter-llm is intentionally retained here: hosted LLM calls do not require ORT
# and the dependency has a wasm-http backend. Runtime cfg gates still disable the
# wasm LLM paths until the browser/runtime integration is wired; see the TODOs
# next to those gates.
no-ort-target = [
    # Document formats
    "archives",
    "email",
    "html",
    "hwp",
    "hwpx",
    "iwork",
    "mdx",
    "office",
    "pdf",
    "xml",
    # Text analysis
    "chunking",
    "chunking-tokenizers",
    "keywords",
    "language-detection",
    "quality",
    "stopwords",
    # Hosted LLM access
    "liter-llm",
    # ORT-free type/preset definitions
    "auto-rotate-types",
    "embedding-presets",
    "layout-types",
    "paddle-ocr-types",
]
# wasm-target: no-ort-target plus the WASM-specific variants (excel-wasm,
# tree-sitter-wasm, ocr-wasm).
wasm-target = ["excel-wasm", "no-ort-target", "ocr-wasm", "tree-sitter-wasm"]
# android-target: no-ort-target plus the native-Linux variants that work on the
# Android ABI (excel with tokio, native tree-sitter, Tesseract OCR, and API/MCP
# transport). Excludes ORT-dependent features (paddle-ocr, layout-detection,
# embeddings, auto-rotate).
android-target = [
    "api",
    "excel",
    "mcp",
    "no-ort-target",
    "ocr",
    "tree-sitter",
]
# mobile: mobile deployment preset — formats + analysis + Tesseract OCR + code
# intelligence + API types. Excludes ORT-dependent features (paddle-ocr,
# layout-detection, embeddings, auto-rotate) and server transport infrastructure
# (mcp, otel, liter-llm).
mobile = [
    "analysis",
    "api-types",
    "formats",
    "ocr",
    "tokio-runtime",
    "tree-sitter",
]
# wasm-threads: WASM only. Activates the wasm-bindgen-rayon dependency for
# multi-threaded WASM builds; there are no #[cfg] gates on this feature.
wasm-threads = ["dep:wasm-bindgen-rayon"]
|
|
# Aggregate features: bundles composed from the individual features above.

# formats: document format extractors only — no infrastructure or text analysis.
formats = [
    "archives",
    "email",
    "excel",
    "html",
    "hwp",
    "hwpx",
    "iwork",
    "mdx",
    "office",
    "pdf",
    "xml",
]
# analysis: text processing and analysis capabilities.
analysis = ["chunking", "diff", "keywords", "language-detection", "quality"]
# services: network services and observability infrastructure.
services = ["api", "mcp", "otel"]
# full: everything — all formats, analysis, services, OCR, ML, and code
# intelligence.
full = [
    "analysis",
    "chunking-tokenizers",
    "diff",
    "embeddings",
    "formats",
    "layout-detection",
    "liter-llm",
    "ocr",
    "paddle-ocr",
    "services",
    "tokio-runtime",
    "tree-sitter",
]
# server: server deployment preset — formats + analysis + services + OCR
# (including paddle-ocr and layout-detection).
server = [
    "analysis",
    "formats",
    "layout-detection",
    "ocr",
    "paddle-ocr",
    "services",
]
# cli: CLI-only features (thin gate for kreuzberg-cli).
cli = ["chunking", "services"]
|
|
|
|
# Runtime dependencies. Entries marked `optional = true` are only pulled in when
# a feature in [features] enables them.
[dependencies]
ahash.workspace = true
async-trait.workspace = true
auto_enums = { version = "0.8", optional = true }
axum = { version = "0.8", features = ["json", "macros", "multipart"], optional = true }
base64.workspace = true
biblatex = { version = "0.11", optional = true }
biblib = { version = "0.4", default-features = false, features = [
    "pubmed",
    "regex",
    "ris",
    "xml",
], optional = true }
bitvec = "1.0"
blake3.workspace = true
bytes.workspace = true
calamine = { version = "0.35.0", features = ["dates"], optional = true }
cfb = { workspace = true, optional = true }
chardetng = { version = "1.0.0", optional = true }
chrono = { workspace = true, optional = true }
comrak.workspace = true
dbase = { workspace = true, optional = true }
dirs = "6"
either = { version = "1", optional = true }
encoding_rs = "0.8.35"
fast_image_resize = { version = "6.0.0", optional = true }
flate2 = { version = "1.1", optional = true }
hayro-jbig2 = { version = "0.3", default-features = false, features = ["std"], optional = true }
hayro-jpeg2000 = { version = "0.3", default-features = false, features = ["simd", "std"], optional = true }
hex.workspace = true
html-to-markdown-rs = { workspace = true, features = ["inline-images", "metadata"], optional = true }
icu_provider = { version = "2", features = ["sync"], optional = true }
icu_segmenter = { version = "2", optional = true }
image = { workspace = true, default-features = false, features = [
    "bmp",
    "gif",
    "jpeg",
    "png",
    "pnm",
    "rayon",
    "tiff",
    "webp",
], optional = true }
indexmap = "2.14.0"
infer = "0.19.0"
itertools = { workspace = true, optional = true }
jotdown = "0.10"
kamadak-exif = { version = "0.6.1", optional = true }
|
|
|
|
kreuzberg-tesseract = { path = "../kreuzberg-tesseract", version = "5.0.0-rc.3", optional = true }
libc.workspace = true
# liter-llm is deliberately absent from this table: it is declared in the
# platform-conditional dependency tables below because it must be excluded on
# Windows, where its reqwest/rustls -> aws-lc-rs -> aws-lc-sys chain fails to
# build on Windows MSVC.
log.workspace = true
lopdf = { version = "0.40.0", optional = true }
mail-parser = { version = "0.11.3", optional = true }
memchr = "2.8.1"
memmap2.workspace = true
mime_guess = "2.0"
minijinja = { workspace = true, optional = true }
moka = { version = "0.12", features = ["sync"], optional = true }
ndarray = { version = "0.17", optional = true }
num_cpus.workspace = true
once_cell.workspace = true
opentelemetry = { version = "0.32", features = ["trace"], optional = true }
opentelemetry_sdk = { version = "0.32", features = ["rt-tokio"], optional = true }
org = { version = "0.3", optional = true }
ort = { version = "2.0.0-rc.12", default-features = false, features = ["api-18", "ndarray", "std"], optional = true }
outlook-pst = { version = "1.2.0", optional = true }
parking_lot.workspace = true
pastey = "0.2"
pdf_oxide = { workspace = true, features = ["rendering"], optional = true }
pulldown-cmark = "0.13"
quick-xml = { version = "0.40.1", features = ["serialize"], optional = true }
rake = { version = "0.3.6", optional = true }
rayon.workspace = true
regex = "1.12.3"
rmcp = { version = "1.7.0", features = [
    "base64",
    "macros",
    "server",
    "server-side-http",
    "transport-io",
    "transport-streamable-http-server",
], optional = true }
rmp-serde = "1.3"
|
|
|
|
roxmltree = { version = "0.21.1", optional = true }
serde.workspace = true
serde_json.workspace = true
serde_toon_format.workspace = true
serde_yaml_ng = "0.10.0"
sevenz-rust2 = { version = "0.20.2", optional = true }
sha2 = { version = "0.11", optional = true }
simdutf8 = { version = "0.1", optional = true }
similar = { workspace = true, optional = true }
snap = { version = "1.1", optional = true }
strum = { version = "0.28", features = ["derive"], optional = true }
tar = { version = "^0.4", optional = true }
tempfile = { workspace = true, optional = true }
thiserror.workspace = true
tiff = { version = "0.11", optional = true }
tokenizers = { version = "0.23", default-features = false, features = ["fancy-regex", "http"], optional = true }
tokio = { workspace = true, optional = true }
toml.workspace = true
tower = { version = "0.5", features = ["limit", "timeout", "util"], optional = true }
tower-http = { version = "0.6", features = [
    "catch-panic",
    "compression-full",
    "cors",
    "limit",
    "request-id",
    "sensitive-headers",
    "trace",
], optional = true }
tracing.workspace = true
tracing-opentelemetry = { version = "0.33", optional = true }
unhwp = { version = "0.3.2", default-features = false, features = ["hwpx"], optional = true }
unicode-normalization = { version = "0.1.25", optional = true }
urlencoding = "2"
utoipa = { version = "5.5", features = ["axum_extras"], optional = true }
uuid = { version = "1", features = ["v4"], optional = true }
v_htmlescape = { version = "0.17", optional = true }
whatlang = { version = "0.18.0", optional = true }
|
|
# The upper bound mirrors the `zip` entry in [dev-dependencies]
# (">=7.0.0, <8.7.0"), so consumers resolve only versions this crate's own tests
# run against. An open-ended ">=7.0.0" would also admit future releases,
# including new major versions, that have never been exercised here.
zip = { version = ">=7.0.0, <8.7.0", default-features = false, features = ["deflate-flate2"], optional = true }
|
|
|
|
# Dependencies for native targets other than Windows (wasm32 excluded).
[target.'cfg(all(not(target_os = "windows"), not(target_arch = "wasm32")))'.dependencies]
hf-hub = { version = "0.5", default-features = false, features = ["ureq"], optional = true }
# PaddleOCR via ONNX Runtime (vendored from paddle-ocr-rs); not available on WASM.
kreuzberg-paddle-ocr = { path = "../kreuzberg-paddle-ocr", version = "5.0.0-rc.3", optional = true }
# liter-llm pulls reqwest/rustls -> aws-lc-rs -> aws-lc-sys, and aws-lc-sys 0.40
# fails to build on Windows MSVC (stdalign feature detection treats `-WX`
# warnings as errors). Until upstream ships a fix, liter-llm is restricted to
# non-Windows targets so Windows FFI/CLI builds don't pull aws-lc-sys.
liter-llm = { workspace = true, features = ["native-http"], optional = true }
pprof = { version = "0.15.0", features = ["flamegraph"], optional = true }
# Force ureq (transitive dep via hf-hub) to use rustls on non-Windows.
ureq = { version = "3.3", default-features = false, features = ["json", "rustls"], optional = true }
|
|
|
|
# Windows (non-wasm32) dependencies. native-tls is used here to avoid aws-lc-sys
# CMake build issues with MinGW.
[target.'cfg(all(target_os = "windows", not(target_arch = "wasm32")))'.dependencies]
hf-hub = { version = "0.5", default-features = false, features = ["ureq"], optional = true }
# PaddleOCR via ONNX Runtime (vendored from paddle-ocr-rs); not available on WASM.
kreuzberg-paddle-ocr = { path = "../kreuzberg-paddle-ocr", version = "5.0.0-rc.3", optional = true }
# Force ureq (transitive dep via hf-hub) to use native-tls on Windows.
ureq = { version = "3.3", default-features = false, features = ["json", "native-tls"], optional = true }
|
|
|
|
# tree-sitter-language-pack as configured for every target except wasm32.
[target.'cfg(not(target_arch = "wasm32"))'.dependencies]
tree-sitter-language-pack = { workspace = true, features = ["download", "dynamic-loading", "serde"], optional = true }
|
|
|
|
# wasm32-only dependencies and overrides.
[target.'cfg(target_arch = "wasm32")'.dependencies]
# getrandom is overridden to enable its wasm_js feature for WASM targets. This is
# needed because ring/rustls (via ureq) depend on getrandom without wasm_js.
getrandom = { version = "0.4", features = ["wasm_js"] }
# On wasm32, kreuzberg-tesseract is meant to swap its native default
# ("static-linking" -> "build-tesseract") for the WASI-SDK build path
# ("build-tesseract-wasm") and to bundle eng.traineddata so OCR works in-memory
# with no filesystem.
# NOTE(review): kreuzberg-tesseract is also declared in [dependencies] without
# `default-features = false`, and Cargo unifies features across declarations —
# confirm the native defaults are really off for wasm32 builds.
kreuzberg-tesseract = { path = "../kreuzberg-tesseract", version = "5.0.0-rc.3", default-features = false, features = [
    "build-tesseract-wasm",
    "bundle-tessdata-eng",
], optional = true }
liter-llm = { workspace = true, features = ["wasm-http"], optional = true }
tree-sitter-language-pack = { workspace = true, default-features = false, features = ["serde"], optional = true }
wasm-bindgen-rayon = { version = "1.3", optional = true }
|
|
|
|
# Currently empty: no build-script dependencies are declared.
[build-dependencies]
|
|
|
|
# Dependencies used only by tests, examples and benchmarks.
[dev-dependencies]
anyhow.workspace = true
criterion.workspace = true
dotenvy = "0.15"
filetime = "0.2"
image = { workspace = true, default-features = false, features = ["png"] }
jsonschema = "0.46"
serial_test = "3.4.0"
tar = "^0.4"
tempfile.workspace = true
tokio = { workspace = true, features = ["macros", "time"] }
tokio-test = "0.4"
tracing-subscriber = { version = "0.3", features = ["env-filter"] }
zip = { version = ">=7.0.0, <8.7.0", default-features = false, features = ["deflate-flate2"] }
|
|
|
|
# `text_quality` benchmark: opts out of the default bench harness
# (`harness = false`) and is only built when the `quality` feature is enabled.
[[bench]]
name = "text_quality"
required-features = ["quality"]
harness = false
|
|
|
|
# Lint configuration is inherited from the workspace manifest.
[lints]
workspace = true
|