Files
fil/crates/kreuzberg/Cargo.toml
Henrik Jess Nielsen b4c07d3693
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s
Nomad changes
2026-06-01 23:40:55 +02:00

500 lines
17 KiB
TOML

# Crate metadata. Version, edition, MSRV, authors, and license are inherited from the
# workspace root; everything else is specific to this crate.
[package]
name = "kreuzberg"
version.workspace = true
authors.workspace = true
categories = ["parser-implementations", "text-processing"]
documentation = "https://docs.rs/kreuzberg"
edition.workspace = true
homepage = "https://kreuzberg.dev"
keywords = ["document", "extraction", "pdf", "ocr", "parser"]
license.workspace = true
readme = "README.md"
repository = "https://github.com/kreuzberg-dev/kreuzberg"
rust-version.workspace = true
description = "High-performance document intelligence library for Rust. Extract text, metadata, and structured data from PDFs, Office documents, images, and 90+ formats and 300+ programming languages via tree-sitter code intelligence with async/sync APIs."
# Dependencies that cargo-machete must not report as unused.
[package.metadata.cargo-machete]
ignored = [
    "liter-llm",
    "opentelemetry_sdk",
    "tracing-opentelemetry",
    "kamadak-exif",
    # icu_provider is optional and only switched on through the `dep:icu_provider`
    # entry of the `chunking` feature; cargo-machete cannot resolve `dep:icu_provider`
    # references in feature flag chains, so it has to be listed here.
    "icu_provider",
    "serde_toon_format",
]
# The library target is built as an `rlib` only.
[lib]
crate-type = ["rlib"]
[features]
default = ["tokio-runtime", "simd-utf8"]
# profiling: build-time only. Pulls in the optional pprof dependency for flamegraph
# profiling; there are no #[cfg] gates behind it. pprof is declared for non-Windows,
# non-WASM targets only.
profiling = ["dep:pprof"]
# Flag-only feature: activates no optional dependencies.
pool-metrics = []
simd-utf8 = ["dep:simdutf8"]
tokio-runtime = ["dep:tokio"]
# pdf: currently backed by pdf_oxide. This stays the canonical `pdf` feature
# until/unless a second PDF backend is introduced.
pdf = ["dep:pdf_oxide", "dep:lopdf", "dep:image", "dep:flate2", "html"]
# excel adds the tokio runtime on top of calamine; excel-wasm is calamine alone.
excel = ["dep:calamine", "tokio-runtime"]
excel-wasm = ["dep:calamine"]
office = [
    "dep:cfb",
    "dep:roxmltree",
    "dep:zip",
    "dep:quick-xml",
    "dep:biblatex",
    "dep:biblib",
    "dep:org",
    "dep:dbase",
    "html",
]
hwp = ["dep:cfb", "dep:flate2"]
hwpx = ["dep:unhwp", "dep:zip"]
iwork = ["dep:zip", "dep:snap"]
email = [
    "dep:mail-parser",
    "dep:cfb",
    "dep:outlook-pst",
    "dep:tempfile",
    "dep:chrono",
    "dep:chardetng",
]
html = ["dep:html-to-markdown-rs", "dep:v_htmlescape"]
xml = ["dep:quick-xml", "dep:roxmltree"]
archives = ["dep:zip", "dep:tar", "dep:sevenz-rust2", "dep:flate2"]
# Flag-only feature: activates no optional dependencies.
mdx = []
liter-llm = ["dep:liter-llm", "dep:minijinja"]
tree-sitter = ["dep:tree-sitter-language-pack"]
# tree-sitter-wasm enables nothing beyond `tree-sitter` itself.
tree-sitter-wasm = ["tree-sitter"]
# ocr: Tesseract backend together with the image, TIFF, resize, EXIF, JPEG 2000 and
# JBIG2 support crates, plus the `html` feature.
ocr = [
    "dep:kreuzberg-tesseract",
    "dep:image",
    "dep:tiff",
    "dep:fast_image_resize",
    "dep:kamadak-exif",
    "dep:hayro-jpeg2000",
    "dep:hayro-jbig2",
    "html",
]
# ocr-wasm: minimal Tesseract backend for WASM. It leaves out html-to-markdown (which has
# WASI imports) and keeps only the core OCR processing dependencies Tesseract needs on WASM.
ocr-wasm = [
    "dep:kreuzberg-tesseract",
    "dep:image",
    "dep:hayro-jpeg2000",
    "dep:hayro-jbig2",
]
# paddle-ocr-types: PaddleOCR type definitions only — pure-Rust config structs with no ORT
# dependency. Exposes PaddleOcrConfig, PaddleLanguage, ModelPaths. Safe on Android/WASM.
paddle-ocr-types = []
# paddle-ocr: PaddleOCR via ONNX Runtime.
# Needs the `ocr` feature for shared conversion utilities and table reconstruction.
paddle-ocr = [
    "paddle-ocr-types",
    "dep:kreuzberg-paddle-ocr",
    "dep:sha2",
    "dep:image",
    "dep:hf-hub",
    "dep:ureq",
    "dep:ort",
    "dep:ndarray",
    "html",
    "tokio-runtime",
    "ocr",
    "auto-rotate",
]
# ort-bundled: bundle ORT binaries from the official Microsoft release (includes CoreML on
# macOS, CUDA on Linux). With this active no system ORT library is required and
# ort_discovery is skipped.
ort-bundled = ["ort/download-binaries", "ort/tls-native"]
# ort-dynamic: build-time only. Configures ORT to load ONNX Runtime dynamically at runtime.
# There are no #[cfg] gates behind it — it is passed straight through to the `ort` crate.
ort-dynamic = ["ort/load-dynamic"]
# auto-rotate: document orientation detection using PP-LCNet for auto_rotate.
auto-rotate = [
    "auto-rotate-types",
    "dep:ort",
    "dep:ndarray",
    "dep:hf-hub",
    "dep:sha2",
    "dep:image",
    "dep:ureq",
    "ort-bundled",
]
# layout-detection: layout detection via ONNX Runtime (YOLO + RT-DETR).
layout-detection = [
    "layout-types",
    "dep:ort",
    "dep:ndarray",
    "dep:hf-hub",
    "dep:sha2",
    "dep:image",
    "tokio-runtime",
    "ort-bundled",
]
language-detection = ["dep:whatlang"]
# layout-types: pure-Rust layout struct definitions with no ORT dependency.
# Pulled in by layout-detection; also usable on its own on WASM/Android targets.
layout-types = []
# auto-rotate-types: pure-Rust OrientationResult struct with no ORT dependency.
# Pulled in by auto-rotate; also usable on its own on WASM/Android targets.
auto-rotate-types = []
chunking = [
    "dep:auto_enums",
    "dep:either",
    "dep:icu_provider",
    "dep:icu_segmenter",
    "dep:itertools",
    "dep:strum",
]
# chunking-tokenizers: `chunking` plus the optional tokenizers dependency.
chunking-tokenizers = ["chunking", "dep:tokenizers"]
# embedding-presets: static preset metadata (WASM-safe, no ORT).
embedding-presets = []
# embeddings: full embedding generation with ONNX Runtime (ORT-dependent, WASM-incompatible).
# NOTE: embedding-presets is the ORT-free subset of embeddings and is always implied by it.
embeddings = [
    "dep:hf-hub",
    "dep:ort",
    "dep:ndarray",
    "dep:tokenizers",
    "chunking",
    "tokio-runtime",
    "ort-bundled",
    "embedding-presets",
]
# Flag-only feature: activates no optional dependencies.
stopwords = []
quality = ["dep:unicode-normalization", "dep:chardetng", "stopwords"]
keywords-yake = ["stopwords"]
keywords-rake = ["dep:rake", "stopwords"]
# keywords: umbrella that turns on both keywords-yake and keywords-rake.
keywords = ["keywords-yake", "keywords-rake"]
diff = ["dep:similar"]
tower-service = ["dep:tower", "tokio-runtime"]
# Flag-only feature: activates no optional dependencies.
api-types = []
api = [
    "api-types",
    "tower-service",
    "dep:axum",
    "dep:chrono",
    "dep:moka",
    "dep:tower-http",
    "dep:utoipa",
    "dep:uuid",
    "tokio-runtime",
    "chunking",
]
mcp = ["tower-service", "dep:rmcp", "tokio-runtime"]
# mcp-http: enables both `mcp` and `api`.
mcp-http = ["mcp", "api"]
otel = [
    "dep:opentelemetry",
    "dep:opentelemetry_sdk",
    "dep:tracing-opentelemetry",
]
# no-ort-target: shared base for targets that cannot use ONNX Runtime (Android x86_64
# emulator, WASM). Pure-Rust only — no ORT and no native-only C++ deps. Both wasm-target
# and android-target build on it so the list is not duplicated.
#
# liter-llm is kept here on purpose: hosted LLM calls do not require ORT and the
# dependency has a wasm-http backend. Runtime cfg gates still disable the wasm LLM paths
# until the browser/runtime integration is wired; see the TODOs next to those gates.
no-ort-target = [
    "pdf",
    "html",
    "xml",
    "email",
    "language-detection",
    "chunking",
    "chunking-tokenizers",
    "quality",
    "keywords",
    "office",
    "iwork",
    "hwp",
    "hwpx",
    "mdx",
    "archives",
    "liter-llm",
    "stopwords",
    "embedding-presets",
    "layout-types",
    "auto-rotate-types",
    "paddle-ocr-types",
]
# wasm-target: no-ort-target plus the WASM-specific variants
# (excel-wasm, tree-sitter-wasm, ocr-wasm).
wasm-target = ["no-ort-target", "excel-wasm", "tree-sitter-wasm", "ocr-wasm"]
# android-target: no-ort-target plus the native-Linux variants that work on the Android ABI
# (excel with tokio, native tree-sitter, Tesseract OCR, and API/MCP transport).
# ORT-dependent features (paddle-ocr, layout-detection, embeddings, auto-rotate) are left out.
android-target = ["no-ort-target", "excel", "tree-sitter", "ocr", "api", "mcp"]
# mobile: mobile deployment preset — formats + analysis + Tesseract OCR + code intelligence
# + API types. Leaves out ORT-dependent features (paddle-ocr, layout-detection, embeddings,
# auto-rotate) and server transport infrastructure (mcp, otel, liter-llm).
mobile = [
    "formats",
    "analysis",
    "ocr",
    "tree-sitter",
    "api-types",
    "tokio-runtime",
]
# wasm-threads: WASM only. Activates the wasm-bindgen-rayon dependency for multi-threaded
# WASM builds; there are no #[cfg] gates behind it.
wasm-threads = ["dep:wasm-bindgen-rayon"]
# --- Aggregate features ---
# formats: document format extractors only — no infrastructure or text analysis.
formats = [
    "pdf",
    "excel",
    "office",
    "hwp",
    "hwpx",
    "iwork",
    "email",
    "html",
    "xml",
    "archives",
    "mdx",
]
# analysis: text processing and analysis capabilities.
analysis = [
    "language-detection",
    "chunking",
    "quality",
    "keywords",
    "diff",
]
# services: network services and observability infrastructure.
services = ["api", "mcp", "otel"]
# full: everything — all formats, analysis, services, OCR, ML, and code intelligence.
full = [
    "formats",
    "analysis",
    "services",
    "ocr",
    "paddle-ocr",
    "layout-detection",
    "embeddings",
    "chunking-tokenizers",
    "tree-sitter",
    "liter-llm",
    "tokio-runtime",
    "diff",
]
# server: server deployment preset — formats + analysis + services + OCR
# (both `ocr` and `paddle-ocr`) + layout detection.
server = [
    "formats",
    "analysis",
    "services",
    "ocr",
    "paddle-ocr",
    "layout-detection",
]
# cli: CLI-only features (thin gate for kreuzberg-cli).
cli = ["services", "chunking"]
# Target-independent dependencies. Entries marked `optional = true` are only built when a
# feature in [features] names them via `dep:<name>`.
[dependencies]
ahash = { workspace = true }
async-trait = { workspace = true }
auto_enums = { version = "0.8", optional = true }
axum = { version = "0.8", features = ["macros", "json", "multipart"], optional = true }
base64 = { workspace = true }
biblatex = { version = "0.11", optional = true }
biblib = { version = "0.4", default-features = false, features = [
    "ris",
    "pubmed",
    "xml",
    "regex",
], optional = true }
bitvec = "1.0"
blake3 = { workspace = true }
bytes = { workspace = true }
calamine = { version = "0.35.0", features = ["dates"], optional = true }
cfb = { workspace = true, optional = true }
chardetng = { version = "1.0.0", optional = true }
chrono = { workspace = true, optional = true }
comrak = { workspace = true }
dbase = { workspace = true, optional = true }
dirs = "6"
either = { version = "1", optional = true }
encoding_rs = { version = "0.8.35" }
fast_image_resize = { version = "6.0.0", optional = true }
flate2 = { version = "1.1", optional = true }
hayro-jbig2 = { version = "0.3", default-features = false, features = ["std"], optional = true }
hayro-jpeg2000 = { version = "0.3", default-features = false, features = [
    "std",
    "simd",
], optional = true }
hex = { workspace = true }
html-to-markdown-rs = { workspace = true, features = [
    "inline-images",
    "metadata",
], optional = true }
icu_provider = { version = "2", features = ["sync"], optional = true }
icu_segmenter = { version = "2", optional = true }
image = { workspace = true, default-features = false, features = [
    "png",
    "jpeg",
    "webp",
    "bmp",
    "tiff",
    "gif",
    "pnm",
    "rayon",
], optional = true }
indexmap = "2.14.0"
infer = "0.19.0"
itertools = { workspace = true, optional = true }
jotdown = "0.10"
kamadak-exif = { version = "0.6.1", optional = true }
kreuzberg-tesseract = { path = "../kreuzberg-tesseract", version = "5.0.0-rc.3", optional = true }
libc = { workspace = true }
# liter-llm is not listed here: it lives in the platform-conditional dependency tables
# further down because it has to be excluded on Windows, where it pulls
# reqwest/rustls -> aws-lc-rs -> aws-lc-sys, which fails to build on Windows MSVC.
log = { workspace = true }
lopdf = { version = "0.40.0", optional = true }
mail-parser = { version = "0.11.3", optional = true }
memchr = "2.8.1"
memmap2 = { workspace = true }
mime_guess = "2.0"
minijinja = { workspace = true, optional = true }
moka = { version = "0.12", features = ["sync"], optional = true }
ndarray = { version = "0.17", optional = true }
num_cpus = { workspace = true }
once_cell = { workspace = true }
opentelemetry = { version = "0.32", features = ["trace"], optional = true }
opentelemetry_sdk = { version = "0.32", features = ["rt-tokio"], optional = true }
org = { version = "0.3", optional = true }
ort = { version = "2.0.0-rc.12", default-features = false, features = [
    "std",
    "ndarray",
    "api-18",
], optional = true }
outlook-pst = { version = "1.2.0", optional = true }
parking_lot = { workspace = true }
pastey = "0.2"
pdf_oxide = { workspace = true, features = ["rendering"], optional = true }
pulldown-cmark = { version = "0.13" }
quick-xml = { version = "0.40.1", features = ["serialize"], optional = true }
rake = { version = "0.3.6", optional = true }
rayon = { workspace = true }
regex = "1.12.3"
rmcp = { version = "1.7.0", features = [
    "server",
    "macros",
    "base64",
    "transport-io",
    "transport-streamable-http-server",
    "server-side-http",
], optional = true }
rmp-serde = "1.3"
roxmltree = { version = "0.21.1", optional = true }
serde = { workspace = true }
serde_json = { workspace = true }
serde_toon_format = { workspace = true }
serde_yaml_ng = "0.10.0"
sevenz-rust2 = { version = "0.20.2", optional = true }
sha2 = { version = "0.11", optional = true }
simdutf8 = { version = "0.1", optional = true }
similar = { workspace = true, optional = true }
snap = { version = "1.1", optional = true }
strum = { version = "0.28", features = ["derive"], optional = true }
tar = { version = "^0.4", optional = true }
tempfile = { workspace = true, optional = true }
thiserror = { workspace = true }
tiff = { version = "0.11", optional = true }
tokenizers = { version = "0.23", optional = true, default-features = false, features = [
    "http",
    "fancy-regex",
] }
tokio = { workspace = true, optional = true }
toml = { workspace = true }
tower = { version = "0.5", features = ["timeout", "limit", "util"], optional = true }
tower-http = { version = "0.6", features = [
    "cors",
    "trace",
    "limit",
    "catch-panic",
    "request-id",
    "sensitive-headers",
    "compression-full",
], optional = true }
tracing = { workspace = true }
tracing-opentelemetry = { version = "0.33", optional = true }
unhwp = { version = "0.3.2", default-features = false, features = ["hwpx"], optional = true }
unicode-normalization = { version = "0.1.25", optional = true }
urlencoding = "2"
utoipa = { version = "5.5", features = ["axum_extras"], optional = true }
uuid = { version = "1", features = ["v4"], optional = true }
v_htmlescape = { version = "0.17", optional = true }
whatlang = { version = "0.18.0", optional = true }
# Bounded to the same range the [dev-dependencies] `zip` entry uses. An open-ended
# `>=7.0.0` would let downstream resolvers select any later release, including future
# major versions, that this crate's own test builds never exercise.
# NOTE(review): the reason for the 8.7.0 ceiling is not recorded in this file — confirm
# before relaxing either bound.
zip = { version = ">=7.0.0, <8.7.0", optional = true, default-features = false, features = [
    "deflate-flate2",
] }
# Dependencies for native targets other than Windows (neither Windows nor wasm32).
[target.'cfg(all(not(target_os = "windows"), not(target_arch = "wasm32")))'.dependencies]
hf-hub = { version = "0.5", default-features = false, features = ["ureq"], optional = true }
# PaddleOCR via ONNX Runtime - not available on WASM (vendored from paddle-ocr-rs)
kreuzberg-paddle-ocr = { path = "../kreuzberg-paddle-ocr", version = "5.0.0-rc.3", optional = true }
# liter-llm pulls reqwest/rustls -> aws-lc-rs -> aws-lc-sys. aws-lc-sys 0.40 fails to build on
# Windows MSVC (stdalign feature detection treats `-WX` warnings as errors). Until upstream ships
# a fix, restrict liter-llm to non-Windows targets so Windows FFI/CLI builds don't pull aws-lc-sys.
liter-llm = { workspace = true, optional = true, features = ["native-http"] }
# Backs the `profiling` feature; declared only in this table, so it is absent on
# Windows and wasm32.
pprof = { version = "0.15.0", features = ["flamegraph"], optional = true }
# Force ureq (transitive dep via hf-hub) to use rustls on non-Windows
ureq = { version = "3.3", default-features = false, features = ["rustls", "json"], optional = true }
# Windows (non-wasm32) dependencies. native-tls is used here to avoid aws-lc-sys CMake
# build issues with MinGW.
[target.'cfg(all(target_os = "windows", not(target_arch = "wasm32")))'.dependencies]
hf-hub = { version = "0.5", default-features = false, features = ["ureq"], optional = true }
# PaddleOCR via ONNX Runtime — not available on WASM (vendored from paddle-ocr-rs).
kreuzberg-paddle-ocr = { path = "../kreuzberg-paddle-ocr", version = "5.0.0-rc.3", optional = true }
# Make ureq (a transitive dep via hf-hub) use native-tls on Windows.
ureq = { version = "3.3", default-features = false, features = [
    "native-tls",
    "json",
], optional = true }
# tree-sitter-language-pack for every non-wasm32 target: workspace version with the
# `dynamic-loading`, `download`, and `serde` features enabled. Optional — activated by the
# `tree-sitter` feature.
[target.'cfg(not(target_arch = "wasm32"))'.dependencies.tree-sitter-language-pack]
workspace = true
features = ["dynamic-loading", "download", "serde"]
optional = true
# wasm32-only dependencies and wasm32 variants of crates that are also used natively.
[target.'cfg(target_arch = "wasm32")'.dependencies]
# Override getrandom so its wasm_js feature is enabled on WASM targets. This is needed
# because ring/rustls (via ureq) depend on getrandom without the wasm_js feature.
getrandom = { version = "0.4", features = ["wasm_js"] }
# On wasm32, kreuzberg-tesseract is switched from its native default
# ("static-linking" -> "build-tesseract") to the WASI-SDK build path ("build-tesseract-wasm"),
# and eng.traineddata is bundled so OCR works in-memory with no filesystem.
kreuzberg-tesseract = { path = "../kreuzberg-tesseract", version = "5.0.0-rc.3", default-features = false, features = [
    "build-tesseract-wasm",
    "bundle-tessdata-eng",
], optional = true }
liter-llm = { workspace = true, optional = true, features = ["wasm-http"] }
tree-sitter-language-pack = { workspace = true, default-features = false, features = [
    "serde",
], optional = true }
wasm-bindgen-rayon = { version = "1.3", optional = true }
# No build-time dependencies are declared.
[build-dependencies]
# Dependencies used only when building this crate's tests, examples, and benchmarks.
[dev-dependencies]
anyhow = { workspace = true }
criterion = { workspace = true }
dotenvy = "0.15"
filetime = "0.2"
image = { workspace = true, default-features = false, features = ["png"] }
jsonschema = "0.46"
serial_test = "3.4.0"
tar = "^0.4"
tempfile = { workspace = true }
tokio = { workspace = true, features = ["macros", "time"] }
tokio-test = "0.4"
tracing-subscriber = { version = "0.3", features = ["env-filter"] }
# zip is capped below 8.7.0 here. NOTE(review): the reason for the cap is not recorded
# in this file — confirm before relaxing it.
zip = { version = ">=7.0.0, <8.7.0", default-features = false, features = ["deflate-flate2"] }
# `text_quality` benchmark. `harness = false` turns off the built-in libtest bench harness,
# so the bench supplies its own entry point (presumably via Criterion from
# [dev-dependencies] — confirm). It is only built when the `quality` feature is enabled.
[[bench]]
name = "text_quality"
harness = false
required-features = ["quality"]
# Lint configuration is inherited from the workspace root.
[lints]
workspace = true