# =============================================================================
# Builder Stage - Build Rust binary with all dependencies
# =============================================================================
FROM rust:1.91-trixie AS builder

WORKDIR /build

# Install build dependencies (kept in alphabetical order for easy diffs).
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        clang \
        cmake \
        curl \
        file \
        g++ \
        libleptonica-dev \
        libssl-dev \
        libtesseract-dev \
        pkg-config \
    && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

# Set onnxruntime version (can be overridden via build-arg).
# TARGETARCH is the automatic BuildKit platform argument; it is empty when the
# builder does not supply it, in which case the x64 archive is selected below.
ARG ONNXRUNTIME_VERSION=1.24.2
ARG TARGETARCH
ENV ONNXRUNTIME_VERSION=${ONNXRUNTIME_VERSION}

# Download and extract ONNX Runtime.
# Only "arm64" is mapped to the aarch64 archive; every other value (including
# an unset TARGETARCH) falls back to x64.
# NOTE(review): the archive is not checksum-verified; consider pinning a
# sha256 per architecture alongside ONNXRUNTIME_VERSION.
RUN mkdir -p /build/onnxruntime && \
    if [ "$TARGETARCH" = "arm64" ]; then \
        ORT_ARCH="aarch64"; \
    else \
        ORT_ARCH="x64"; \
    fi && \
    curl -fL "https://github.com/microsoft/onnxruntime/releases/download/v${ONNXRUNTIME_VERSION}/onnxruntime-linux-${ORT_ARCH}-${ONNXRUNTIME_VERSION}.tgz" \
        -o /build/onnxruntime.tgz && \
    tar -xzf /build/onnxruntime.tgz -C /build/onnxruntime --strip-components=1 && \
    rm /build/onnxruntime.tgz

# Copy workspace manifests and crates
COPY Cargo.toml Cargo.lock ./
COPY crates/kreuzberg/ crates/kreuzberg/
COPY crates/kreuzberg-cli/ crates/kreuzberg-cli/
COPY crates/kreuzberg-tesseract/ crates/kreuzberg-tesseract/
COPY crates/kreuzberg-paddle-ocr/ crates/kreuzberg-paddle-ocr/

# Remove workspace members that aren't included (Ruby, Node, Python, PHP, Elixir, tools, e2e).
# The range expression deletes everything from the
# [profile.release.package.kreuzberg-wasm] header through the end of Cargo.toml,
# so that section must remain the last one in the manifest.
RUN sed -i '/kreuzberg-py/d; /kreuzberg_rb/d; /kreuzberg-node/d; /kreuzberg-ffi/d; /kreuzberg-php/d; /kreuzberg_rustler/d; /kreuzberg_nif/d; /packages\/dart\/rust/d; /packages\/swift\/rust/d; /"crates\/kreuzberg-wasm"/d; /^\[profile\.release\.package\.kreuzberg-wasm\]$/,$d; /benchmark-harness/d; /e2e-generator/d; /snippet-runner/d; /e2e\/rust/d' Cargo.toml

# Build release binary with server features (api + full format support).
# /build/target is a cache mount and is not part of the resulting layer, so the
# binary is copied out of it within the same RUN before being stripped.
RUN --mount=type=cache,target=/usr/local/cargo/registry \
    --mount=type=cache,target=/usr/local/cargo/git \
    --mount=type=cache,target=/build/target \
    cargo build --release --package kreuzberg-cli --features all && \
    cp target/release/kreuzberg /build/kreuzberg && \
    strip /build/kreuzberg

# =============================================================================
# Runtime Stage - Minimal runtime environment
# =============================================================================
FROM debian:trixie-slim

# OCI labels for container metadata
LABEL org.opencontainers.image.source="https://github.com/kreuzberg-dev/kreuzberg" \
      org.opencontainers.image.description="Kreuzberg document intelligence - full variant" \
      org.opencontainers.image.licenses="MIT"

WORKDIR /app

# Install runtime dependencies (base libraries first, then Tesseract and its
# language packs; each group in alphabetical order).
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        ca-certificates \
        curl \
        fontconfig \
        libssl3 \
        tesseract-ocr \
        tesseract-ocr-ara \
        tesseract-ocr-chi-sim \
        tesseract-ocr-chi-tra \
        tesseract-ocr-deu \
        tesseract-ocr-eng \
        tesseract-ocr-fra \
        tesseract-ocr-hin \
        tesseract-ocr-ita \
        tesseract-ocr-jpn \
        tesseract-ocr-osd \
        tesseract-ocr-por \
        tesseract-ocr-rus \
        tesseract-ocr-spa \
    && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*

# Copy ONNX Runtime libraries from builder and refresh the linker cache
COPY --from=builder /build/onnxruntime/lib/libonnxruntime.so* /usr/local/lib/
RUN ldconfig

# Copy binary from builder. The mode is set during the copy so no extra layer
# is needed just to mark the binary executable.
COPY --chmod=755 --from=builder /build/kreuzberg /usr/local/bin/kreuzberg

# Create the non-root user together with every directory it needs to own:
#   /app/.kreuzberg              - cache root (KREUZBERG_CACHE_DIR)
#   /app/.kreuzberg/huggingface  - Hugging Face cache (HF_HOME)
#   /app/.config/kreuzberg       - config directory for volume mounts
#   /etc/kreuzberg               - config directory for volume mounts
# Everything under the user's home (/app) is owned by kreuzberg.
RUN groupadd -r kreuzberg && \
    useradd -r -g kreuzberg -d /app -s /sbin/nologin kreuzberg && \
    mkdir -p \
        /app/.config/kreuzberg \
        /app/.kreuzberg/huggingface \
        /etc/kreuzberg \
    && \
    chown -R kreuzberg:kreuzberg /app /etc/kreuzberg

# Pre-download all models (PaddleOCR + layout detection) using kreuzberg cache warm.
# The ENV block below is not in effect yet, so the cache locations are passed
# inline. HF_HOME is given the same value the runtime uses so that anything the
# warm-up fetches through the Hugging Face cache lands where the runtime looks.
# The step runs as root; ownership is handed to the kreuzberg user afterwards.
RUN KREUZBERG_CACHE_DIR=/app/.kreuzberg \
    HF_HOME=/app/.kreuzberg/huggingface \
    /usr/local/bin/kreuzberg cache warm --cache-dir /app/.kreuzberg --format json && \
    chown -R kreuzberg:kreuzberg /app/.kreuzberg && \
    echo "All models ready (PaddleOCR + layout detection)"

# Ensure read permissions on tessdata files for non-root user.
# Tessdata is installed in version-specific directory (e.g., tesseract-ocr/5 or tesseract-ocr/4).
# `a+rX` grants read to everyone and execute only on directories (or files that
# are already executable), so the .traineddata files are not marked executable.
# The chmod is deliberately best-effort: failures are ignored.
RUN set -eux; \
    echo "Setting up tessdata permissions..."; \
    for dir in /usr/share/tesseract-ocr/*/tessdata /usr/share/tesseract-ocr/tessdata /usr/share/tessdata; do \
        if [ -d "$dir" ]; then \
            chmod -R a+rX "$dir" 2>/dev/null || true; \
            if [ -f "$dir/eng.traineddata" ]; then \
                echo "✓ Found tessdata with eng.traineddata at: $dir"; \
            fi; \
        fi; \
    done; \
    echo "✓ Tessdata permissions configured"

# Environment configuration.
# RUST_LOG=info is the right default here. Third-party transport crates
# (ureq, rustls, hyper_util, tower_http, hf_hub) are pre-suppressed by the
# kreuzberg-cli subscriber defaults in logging.rs, so they won't emit DEBUG
# even at the "info" root level. HuggingFace model downloads happen once at
# image build time via `cache warm`; runtime /extract requests hit the HF
# disk cache under HF_HOME and generate no network traffic.
ENV KREUZBERG_CACHE_DIR=/app/.kreuzberg \
    HF_HOME=/app/.kreuzberg/huggingface \
    RUST_LOG=info \
    LD_LIBRARY_PATH=/usr/local/lib:/usr/lib:/lib

USER kreuzberg

EXPOSE 8000

# Health check.
# This only verifies that the binary starts and exits successfully; it does not
# probe the HTTP API on port 8000.
# NOTE(review): if the image is only ever run in server mode, an HTTP probe
# would detect a wedged server; confirm before changing, since the entrypoint
# can also be used for CLI or MCP mode.
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
    CMD ["/usr/local/bin/kreuzberg", "--version"]

# Set kreuzberg as entrypoint for flexible command usage.
# Default: Start API server (can be overridden for CLI or MCP mode)
ENTRYPOINT ["kreuzberg"]
CMD ["serve", "--host", "0.0.0.0", "--port", "8000"]