[package]
name = "kreuzberg"
version.workspace = true
edition.workspace = true
rust-version.workspace = true
authors.workspace = true
description = "High-performance document intelligence library for Rust. Extract text, metadata, and structured data from PDFs, Office documents, images, and 90+ formats and 300+ programming languages via tree-sitter code intelligence with async/sync APIs."
license.workspace = true
repository = "https://github.com/kreuzberg-dev/kreuzberg"
homepage = "https://kreuzberg.dev"
documentation = "https://docs.rs/kreuzberg"
keywords = ["document", "extraction", "pdf", "ocr", "parser"]
categories = ["parser-implementations", "text-processing"]
readme = "README.md"

# Dependencies that cargo-machete should not report as unused.
[package.metadata.cargo-machete]
ignored = [
    "liter-llm",
    "opentelemetry_sdk",
    "tracing-opentelemetry",
    "kamadak-exif",
    # icu_provider is feature-gated: it is enabled via `dep:icu_provider` in the
    # `chunking` feature (alongside icu_segmenter). cargo-machete cannot resolve
    # `dep:icu_provider` references in feature flag chains.
    "icu_provider",
    "serde_toon_format",
]

[lib]
crate-type = ["rlib"]

[features]
default = ["tokio-runtime", "simd-utf8"]
# Build-time only: enables pprof flamegraph profiling. No #[cfg] gates —
# activates the pprof dependency (non-Windows, non-WASM only).
profiling = ["dep:pprof"]
pool-metrics = []
simd-utf8 = ["dep:simdutf8"]
tokio-runtime = ["dep:tokio"]
# PDF extraction is currently backed by pdf_oxide. Keep this as the canonical
# `pdf` feature until/unless a second PDF backend is introduced.
pdf = ["dep:pdf_oxide", "dep:lopdf", "dep:image", "dep:flate2", "html"]
excel = ["dep:calamine", "tokio-runtime"]
excel-wasm = ["dep:calamine"]
office = [
    "dep:cfb",
    "dep:roxmltree",
    "dep:zip",
    "dep:quick-xml",
    "dep:biblatex",
    "dep:biblib",
    "dep:org",
    "dep:dbase",
    "html",
]
hwp = ["dep:cfb", "dep:flate2"]
hwpx = ["dep:unhwp", "dep:zip"]
iwork = ["dep:zip", "dep:snap"]
email = [
    "dep:mail-parser",
    "dep:cfb",
    "dep:outlook-pst",
    "dep:tempfile",
    "dep:chrono",
    "dep:chardetng",
]
html = ["dep:html-to-markdown-rs", "dep:v_htmlescape"]
xml = ["dep:quick-xml", "dep:roxmltree"]
archives = ["dep:zip", "dep:tar", "dep:sevenz-rust2", "dep:flate2"]
mdx = []
liter-llm = ["dep:liter-llm", "dep:minijinja"]
tree-sitter = ["dep:tree-sitter-language-pack"]
tree-sitter-wasm = ["tree-sitter"]
ocr = [
    "dep:kreuzberg-tesseract",
    "dep:image",
    "dep:tiff",
    "dep:fast_image_resize",
    "dep:kamadak-exif",
    "dep:hayro-jpeg2000",
    "dep:hayro-jbig2",
    "html",
]

# WASM OCR: minimal Tesseract backend without html-to-markdown (which has WASI imports).
# Includes only the core OCR processing dependencies needed for Tesseract on WASM.
ocr-wasm = [
    "dep:kreuzberg-tesseract",
    "dep:image",
    "dep:hayro-jpeg2000",
    "dep:hayro-jbig2",
]

# PaddleOCR type definitions only — pure-Rust config structs, no ORT dependency.
# Exposes PaddleOcrConfig, PaddleLanguage, ModelPaths. Safe on Android/WASM.
paddle-ocr-types = []

# PaddleOCR via ONNX Runtime.
# Requires the 'ocr' feature for shared conversion utilities and table reconstruction.
paddle-ocr = [
    "paddle-ocr-types",
    "dep:kreuzberg-paddle-ocr",
    "dep:sha2",
    "dep:image",
    "dep:hf-hub",
    "dep:ureq",
    "dep:ort",
    "dep:ndarray",
    "html",
    "tokio-runtime",
    "ocr",
    "auto-rotate",
]

# Bundle ORT binaries via the official Microsoft release (includes CoreML on macOS, CUDA on Linux).
# When active, no system ORT library is required and ort_discovery is skipped.
ort-bundled = ["ort/download-binaries", "ort/tls-native"]

# Build-time only: configures ORT to load ONNX Runtime dynamically at runtime.
# No #[cfg] gates — passed through to the `ort` crate.
ort-dynamic = ["ort/load-dynamic"]

# Document orientation detection using PP-LCNet for auto_rotate
auto-rotate = [
    "auto-rotate-types",
    "dep:ort",
    "dep:ndarray",
    "dep:hf-hub",
    "dep:sha2",
    "dep:image",
    "dep:ureq",
    "ort-bundled",
]

# Layout detection via ONNX Runtime (YOLO + RT-DETR)
layout-detection = [
    "layout-types",
    "dep:ort",
    "dep:ndarray",
    "dep:hf-hub",
    "dep:sha2",
    "dep:image",
    "tokio-runtime",
    "ort-bundled",
]

language-detection = ["dep:whatlang"]

# layout-types: pure-Rust layout struct definitions — no ORT dependency.
# Included by layout-detection; also usable standalone on WASM/Android targets.
layout-types = []

# auto-rotate-types: pure-Rust OrientationResult struct — no ORT dependency.
# Included by auto-rotate; also usable standalone on WASM/Android targets.
auto-rotate-types = []

chunking = [
    "dep:auto_enums",
    "dep:either",
    "dep:icu_provider",
    "dep:icu_segmenter",
    "dep:itertools",
    "dep:strum",
]
chunking-tokenizers = ["chunking", "dep:tokenizers"]

# embedding-presets: static preset metadata (WASM-safe, no ORT)
embedding-presets = []

# embeddings: full embedding generation with ONNX Runtime (ORT-dependent, WASM-incompatible)
# NOTE: embedding-presets is a subset of embeddings (no ORT); always implied by embeddings.
embeddings = [
    "dep:hf-hub",
    "dep:ort",
    "dep:ndarray",
    "dep:tokenizers",
    "chunking",
    "tokio-runtime",
    "ort-bundled",
    "embedding-presets",
]

stopwords = []
quality = ["dep:unicode-normalization", "dep:chardetng", "stopwords"]
keywords-yake = ["stopwords"]
keywords-rake = ["dep:rake", "stopwords"]
keywords = ["keywords-yake", "keywords-rake"]
diff = ["dep:similar"]

tower-service = ["dep:tower", "tokio-runtime"]
api-types = []
api = [
    "api-types",
    "tower-service",
    "dep:axum",
    "dep:chrono",
    "dep:moka",
    "dep:tower-http",
    "dep:utoipa",
    "dep:uuid",
    "tokio-runtime",
    "chunking",
]
mcp = ["tower-service", "dep:rmcp", "tokio-runtime"]
mcp-http = ["mcp", "api"]
otel = ["dep:opentelemetry", "dep:opentelemetry_sdk", "dep:tracing-opentelemetry"]

# no-ort-target: shared base for targets that cannot use ONNX Runtime (Android x86_64 emulator,
# WASM). Pure-Rust only — no ORT, no native-only C++ deps. Used as a foundation for both
# wasm-target and android-target to avoid duplication.
#
# liter-llm is intentionally retained here: hosted LLM calls do not require ORT
# and the dependency has a wasm-http backend. Runtime cfg gates still disable the
# wasm LLM paths until the browser/runtime integration is wired; see the TODOs
# next to those gates.
no-ort-target = [
    "pdf",
    "html",
    "xml",
    "email",
    "language-detection",
    "chunking",
    "chunking-tokenizers",
    "quality",
    "keywords",
    "office",
    "iwork",
    "hwp",
    "hwpx",
    "mdx",
    "archives",
    "liter-llm",
    "stopwords",
    "embedding-presets",
    "layout-types",
    "auto-rotate-types",
    "paddle-ocr-types",
]

# wasm-target: no-ort-target plus WASM-specific overrides (excel-wasm, tree-sitter-wasm, ocr-wasm).
wasm-target = ["no-ort-target", "excel-wasm", "tree-sitter-wasm", "ocr-wasm"]

# android-target: no-ort-target plus native-Linux variants that work on Android ABI
# (excel with tokio, native tree-sitter, Tesseract OCR, and API/MCP transport).
# Excludes ORT-dependent features (paddle-ocr, layout-detection, embeddings, auto-rotate).
android-target = ["no-ort-target", "excel", "tree-sitter", "ocr", "api", "mcp"]

# Mobile deployment preset — formats + analysis + Tesseract OCR + code intelligence + API types.
# Excludes ORT-dependent features (paddle-ocr, layout-detection, embeddings, auto-rotate)
# and server transport infrastructure (mcp, otel, liter-llm).
mobile = ["formats", "analysis", "ocr", "tree-sitter", "api-types", "tokio-runtime"]

# WASM only: enables wasm-bindgen-rayon for multi-threaded WASM builds.
# No #[cfg] gates — activates the wasm-bindgen-rayon dependency.
wasm-threads = ["dep:wasm-bindgen-rayon"]

# --- Aggregate features ---

# Document format extractors only — no infrastructure or text analysis.
formats = [
    "pdf",
    "excel",
    "office",
    "hwp",
    "hwpx",
    "iwork",
    "email",
    "html",
    "xml",
    "archives",
    "mdx",
]

# Text processing and analysis capabilities.
analysis = ["language-detection", "chunking", "quality", "keywords", "diff"]

# Network services and observability infrastructure.
services = ["api", "mcp", "otel"]

# Everything — all formats, analysis, services, OCR, ML, and code intelligence.
full = [
    "formats",
    "analysis",
    "services",
    "ocr",
    "paddle-ocr",
    "layout-detection",
    "embeddings",
    "chunking-tokenizers",
    "tree-sitter",
    "liter-llm",
    "tokio-runtime",
    "diff",
]

# Server deployment preset — formats + analysis + services + OCR.
server = ["formats", "analysis", "services", "ocr", "paddle-ocr", "layout-detection"]

# CLI-only features (thin gate for kreuzberg-cli).
cli = ["services", "chunking"]

[dependencies]
ahash = { workspace = true }
async-trait = { workspace = true }
auto_enums = { version = "0.8", optional = true }
axum = { version = "0.8", features = ["macros", "json", "multipart"], optional = true }
base64 = { workspace = true }
biblatex = { version = "0.11", optional = true }
biblib = { version = "0.4", default-features = false, features = [
    "ris",
    "pubmed",
    "xml",
    "regex",
], optional = true }
bitvec = "1.0"
blake3 = { workspace = true }
bytes = { workspace = true }
calamine = { version = "0.35.0", features = ["dates"], optional = true }
cfb = { workspace = true, optional = true }
chardetng = { version = "1.0.0", optional = true }
chrono = { workspace = true, optional = true }
comrak = { workspace = true }
dbase = { workspace = true, optional = true }
dirs = "6"
either = { version = "1", optional = true }
encoding_rs = { version = "0.8.35" }
fast_image_resize = { version = "6.0.0", optional = true }
flate2 = { version = "1.1", optional = true }
hayro-jbig2 = { version = "0.3", default-features = false, features = ["std"], optional = true }
hayro-jpeg2000 = { version = "0.3", default-features = false, features = [
    "std",
    "simd",
], optional = true }
hex = { workspace = true }
html-to-markdown-rs = { workspace = true, features = [
    "inline-images",
    "metadata",
], optional = true }
icu_provider = { version = "2", features = ["sync"], optional = true }
icu_segmenter = { version = "2", optional = true }
image = { workspace = true, default-features = false, features = [
    "png",
    "jpeg",
    "webp",
    "bmp",
    "tiff",
    "gif",
    "pnm",
    "rayon",
], optional = true }
indexmap = "2.14.0"
infer = "0.19.0"
itertools = { workspace = true, optional = true }
jotdown = "0.10"
kamadak-exif = { version = "0.6.1", optional = true }
kreuzberg-tesseract = { path = "../kreuzberg-tesseract", version = "5.0.0-rc.3", optional = true }
libc = { workspace = true }
# liter-llm is declared in the platform-conditional dependencies block below — it must be
# excluded on Windows because it pulls reqwest/rustls -> aws-lc-rs -> aws-lc-sys, which fails
# to build on Windows MSVC.
log = { workspace = true }
lopdf = { version = "0.40.0", optional = true }
mail-parser = { version = "0.11.3", optional = true }
memchr = "2.8.1"
memmap2 = { workspace = true }
mime_guess = "2.0"
minijinja = { workspace = true, optional = true }
moka = { version = "0.12", features = ["sync"], optional = true }
ndarray = { version = "0.17", optional = true }
num_cpus = { workspace = true }
once_cell = { workspace = true }
opentelemetry = { version = "0.32", features = ["trace"], optional = true }
opentelemetry_sdk = { version = "0.32", features = ["rt-tokio"], optional = true }
org = { version = "0.3", optional = true }
ort = { version = "2.0.0-rc.12", default-features = false, features = [
    "std",
    "ndarray",
    "api-18",
], optional = true }
outlook-pst = { version = "1.2.0", optional = true }
parking_lot = { workspace = true }
pastey = "0.2"
pdf_oxide = { workspace = true, features = ["rendering"], optional = true }
pulldown-cmark = { version = "0.13" }
quick-xml = { version = "0.40.1", features = ["serialize"], optional = true }
rake = { version = "0.3.6", optional = true }
rayon = { workspace = true }
regex = "1.12.3"
rmcp = { version = "1.7.0", features = [
    "server",
    "macros",
    "base64",
    "transport-io",
    "transport-streamable-http-server",
    "server-side-http",
], optional = true }
rmp-serde = "1.3"
roxmltree = { version = "0.21.1", optional = true }
serde = { workspace = true }
serde_json = { workspace = true }
serde_toon_format = { workspace = true }
serde_yaml_ng = "0.10.0"
sevenz-rust2 = { version = "0.20.2", optional = true }
sha2 = { version = "0.11", optional = true }
simdutf8 = { version = "0.1", optional = true }
similar = { workspace = true, optional = true }
snap = { version = "1.1", optional = true }
strum = { version = "0.28", features = ["derive"], optional = true }
tar = { version = "^0.4", optional = true }
tempfile = { workspace = true, optional = true }
thiserror = { workspace = true }
tiff = { version = "0.11", optional = true }
tokenizers = { version = "0.23", optional = true, default-features = false, features = [
    "http",
    "fancy-regex",
] }
tokio = { workspace = true, optional = true }
toml = { workspace = true }
tower = { version = "0.5", features = ["timeout", "limit", "util"], optional = true }
tower-http = { version = "0.6", features = [
    "cors",
    "trace",
    "limit",
    "catch-panic",
    "request-id",
    "sensitive-headers",
    "compression-full",
], optional = true }
tracing = { workspace = true }
tracing-opentelemetry = { version = "0.33", optional = true }
unhwp = { version = "0.3.2", default-features = false, features = ["hwpx"], optional = true }
unicode-normalization = { version = "0.1.25", optional = true }
urlencoding = "2"
utoipa = { version = "5.5", features = ["axum_extras"], optional = true }
uuid = { version = "1", features = ["v4"], optional = true }
v_htmlescape = { version = "0.17", optional = true }
whatlang = { version = "0.18.0", optional = true }
# Upper bound matches the zip requirement in [dev-dependencies]. An open-ended `>=` range
# would also accept future SemVer-incompatible major releases.
zip = { version = ">=7.0.0, <8.7.0", optional = true, default-features = false, features = [
    "deflate-flate2",
] }

[target.'cfg(all(not(target_os = "windows"), not(target_arch = "wasm32")))'.dependencies]
hf-hub = { version = "0.5", default-features = false, features = ["ureq"], optional = true }
# PaddleOCR via ONNX Runtime - not available on WASM (vendored from paddle-ocr-rs)
kreuzberg-paddle-ocr = { path = "../kreuzberg-paddle-ocr", version = "5.0.0-rc.3", optional = true }
# liter-llm pulls reqwest/rustls -> aws-lc-rs -> aws-lc-sys. aws-lc-sys 0.40 fails to build on
# Windows MSVC (stdalign feature detection treats `-WX` warnings as errors). Until upstream ships
# a fix, restrict liter-llm to non-Windows targets so Windows FFI/CLI builds don't pull aws-lc-sys.
liter-llm = { workspace = true, optional = true, features = ["native-http"] }
pprof = { version = "0.15.0", features = ["flamegraph"], optional = true }
# Force ureq (transitive dep via hf-hub) to use rustls on non-Windows
ureq = { version = "3.3", default-features = false, features = ["rustls", "json"], optional = true }

# Use native-tls on Windows to avoid aws-lc-sys CMake build issues with MinGW
[target.'cfg(all(target_os = "windows", not(target_arch = "wasm32")))'.dependencies]
hf-hub = { version = "0.5", default-features = false, features = ["ureq"], optional = true }
# PaddleOCR via ONNX Runtime - not available on WASM (vendored from paddle-ocr-rs)
kreuzberg-paddle-ocr = { path = "../kreuzberg-paddle-ocr", version = "5.0.0-rc.3", optional = true }
# Force ureq (transitive dep via hf-hub) to use native-tls on Windows
ureq = { version = "3.3", default-features = false, features = [
    "native-tls",
    "json",
], optional = true }

[target.'cfg(not(target_arch = "wasm32"))'.dependencies.tree-sitter-language-pack]
workspace = true
features = ["dynamic-loading", "download", "serde"]
optional = true

[target.'cfg(target_arch = "wasm32")'.dependencies]
# Override getrandom to enable wasm_js feature for WASM targets
# This is needed because ring/rustls (via ureq) depend on getrandom without wasm_js feature
getrandom = { version = "0.4", features = ["wasm_js"] }
# On wasm32, swap kreuzberg-tesseract from its native default ("static-linking" -> "build-tesseract")
# to the WASI-SDK build path ("build-tesseract-wasm") and bundle eng.traineddata so OCR works
# in-memory with no filesystem.
kreuzberg-tesseract = { path = "../kreuzberg-tesseract", version = "5.0.0-rc.3", default-features = false, features = [
    "build-tesseract-wasm",
    "bundle-tessdata-eng",
], optional = true }
liter-llm = { workspace = true, optional = true, features = ["wasm-http"] }
tree-sitter-language-pack = { workspace = true, default-features = false, features = [
    "serde",
], optional = true }
wasm-bindgen-rayon = { version = "1.3", optional = true }

[build-dependencies]

[dev-dependencies]
anyhow = { workspace = true }
criterion = { workspace = true }
dotenvy = "0.15"
filetime = "0.2"
image = { workspace = true, default-features = false, features = ["png"] }
jsonschema = "0.46"
serial_test = "3.4.0"
tar = "^0.4"
tempfile = { workspace = true }
tokio = { workspace = true, features = ["macros", "time"] }
tokio-test = "0.4"
tracing-subscriber = { version = "0.3", features = ["env-filter"] }
zip = { version = ">=7.0.0, <8.7.0", default-features = false, features = ["deflate-flate2"] }

# Benchmark target; only built when the `quality` feature is enabled.
[[bench]]
name = "text_quality"
harness = false
required-features = ["quality"]

[lints]
workspace = true