Files
fil/docker/Dockerfile.core

163 lines
6.2 KiB
Core
Raw Permalink Normal View History

2026-06-01 23:40:55 +02:00
# =============================================================================
# Builder stage: compiles the Rust binary together with its native dependencies
# =============================================================================
FROM rust:1.91-trixie AS builder
WORKDIR /build
# Build-time packages, one per line and sorted alphabetically to keep diffs
# small. The apt package lists are deleted in the same RUN so they are not
# stored in this layer.
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        clang \
        cmake \
        curl \
        file \
        g++ \
        libleptonica-dev \
        libssl-dev \
        libtesseract-dev \
        pkg-config \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*
# ONNX Runtime release to download (can be overridden via build-arg)
ARG ONNXRUNTIME_VERSION=1.24.2
# Target architecture of the image being built (e.g. amd64, arm64);
# populated automatically by BuildKit
ARG TARGETARCH
# Keep the version available as an environment variable for the rest of this stage
ENV ONNXRUNTIME_VERSION=${ONNXRUNTIME_VERSION}
# Download and extract ONNX Runtime into /build/onnxruntime.
# The release archives are named by ONNX Runtime's own architecture labels, so
# TARGETARCH is mapped explicitly: amd64 -> x64, arm64 -> aarch64. An empty
# TARGETARCH keeps the previous default of x64. Any other architecture aborts
# the build instead of silently downloading an x64 library that cannot be used
# on that platform.
RUN set -eu; \
    case "${TARGETARCH:-amd64}" in \
        amd64) ORT_ARCH="x64" ;; \
        arm64) ORT_ARCH="aarch64" ;; \
        *) echo "Unsupported TARGETARCH for ONNX Runtime download: ${TARGETARCH}" >&2; exit 1 ;; \
    esac; \
    mkdir -p /build/onnxruntime; \
    curl -fL "https://github.com/microsoft/onnxruntime/releases/download/v${ONNXRUNTIME_VERSION}/onnxruntime-linux-${ORT_ARCH}-${ONNXRUNTIME_VERSION}.tgz" \
        -o /build/onnxruntime.tgz; \
    tar -xzf /build/onnxruntime.tgz -C /build/onnxruntime --strip-components=1; \
    rm /build/onnxruntime.tgz
# Bring in the workspace manifests and only the crates this image builds
COPY Cargo.toml Cargo.lock ./
COPY crates/kreuzberg/ crates/kreuzberg/
COPY crates/kreuzberg-cli/ crates/kreuzberg-cli/
COPY crates/kreuzberg-tesseract/ crates/kreuzberg-tesseract/
COPY crates/kreuzberg-paddle-ocr/ crates/kreuzberg-paddle-ocr/
# Prune Cargo.toml so it no longer mentions workspace members that were not
# copied above (Ruby, Node, Python, PHP, tools, e2e). Each -e is one sed
# command, applied in the listed order. The range command deletes everything
# from the [profile.release.package.kreuzberg-wasm] header to the end of the
# file; every other command deletes the individual lines matching its pattern.
RUN sed -i \
    -e '/kreuzberg-py/d' \
    -e '/kreuzberg_rb/d' \
    -e '/kreuzberg-node/d' \
    -e '/kreuzberg-ffi/d' \
    -e '/kreuzberg-php/d' \
    -e '/kreuzberg_rustler/d' \
    -e '/kreuzberg_nif/d' \
    -e '/packages\/dart\/rust/d' \
    -e '/packages\/swift\/rust/d' \
    -e '/"crates\/kreuzberg-wasm"/d' \
    -e '/^\[profile\.release\.package\.kreuzberg-wasm\]$/,$d' \
    -e '/benchmark-harness/d' \
    -e '/e2e-generator/d' \
    -e '/snippet-runner/d' \
    -e '/e2e\/rust/d' \
    Cargo.toml
# Compile the release CLI binary with the "all" feature set.
# The cargo registry, git checkouts and target directory are BuildKit cache
# mounts, so they are reused across builds and are not part of the layer. The
# binary is therefore copied out of the cache-mounted target directory (and
# stripped) within the same RUN.
RUN --mount=type=cache,target=/usr/local/cargo/registry \
    --mount=type=cache,target=/usr/local/cargo/git \
    --mount=type=cache,target=/build/target \
    set -e; \
    cargo build --release --package kreuzberg-cli --features all; \
    cp /build/target/release/kreuzberg /build/kreuzberg; \
    strip /build/kreuzberg
# =============================================================================
# Runtime stage: minimal runtime environment on Debian slim
# =============================================================================
FROM debian:trixie-slim
# OCI image metadata, declared in a single LABEL instruction
LABEL org.opencontainers.image.source="https://github.com/kreuzberg-dev/kreuzberg" \
      org.opencontainers.image.description="Kreuzberg document intelligence - core variant" \
      org.opencontainers.image.licenses="MIT"
WORKDIR /app
# Target architecture supplied by BuildKit.
# NOTE(review): nothing visible in this stage reads TARGETARCH - confirm it is still needed.
ARG TARGETARCH
# Runtime packages for the core variant: CA certificates, curl, and Tesseract
# with its language packs (sorted alphabetically). apt caches and temporary
# directories are cleared in the same RUN so they are not stored in this layer.
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        ca-certificates \
        curl \
        tesseract-ocr \
        tesseract-ocr-ara \
        tesseract-ocr-chi-sim \
        tesseract-ocr-chi-tra \
        tesseract-ocr-deu \
        tesseract-ocr-eng \
        tesseract-ocr-fra \
        tesseract-ocr-hin \
        tesseract-ocr-ita \
        tesseract-ocr-jpn \
        tesseract-ocr-osd \
        tesseract-ocr-por \
        tesseract-ocr-rus \
        tesseract-ocr-spa \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
# Copy the ONNX Runtime shared libraries from the builder into /usr/local/lib,
# then refresh the dynamic linker cache so they can be resolved at runtime
COPY --from=builder /build/onnxruntime/lib/libonnxruntime.so* /usr/local/lib/
RUN ldconfig
# Copy the stripped binary from the builder. --chmod sets the mode as part of
# the COPY itself, so no follow-up `RUN chmod +x` layer is needed.
COPY --from=builder --chmod=0755 /build/kreuzberg /usr/local/bin/kreuzberg
# Create the non-root service user together with every directory it must own:
#   /app/.kreuzberg              cache root
#   /app/.kreuzberg/huggingface  Hugging Face cache for embeddings models
#   /app/.kreuzberg/paddle-ocr   PaddleOCR model cache (models downloaded on
#                                demand if paddle-ocr feature used)
#   /app/.config/kreuzberg       config directory for volume mounts
#   /etc/kreuzberg               config directory for volume mounts
# Everything happens in one layer, and the single recursive chown runs after
# all directories exist. Intermediate directories such as /app/.config are
# therefore owned by the user as well, instead of being left root-owned.
RUN groupadd -r kreuzberg && \
    useradd -r -g kreuzberg -d /app -s /sbin/nologin kreuzberg && \
    mkdir -p \
        /app/.config/kreuzberg \
        /app/.kreuzberg/huggingface \
        /app/.kreuzberg/paddle-ocr \
        /etc/kreuzberg && \
    chown -R kreuzberg:kreuzberg /app /etc/kreuzberg
# Make the tessdata files readable by the non-root kreuzberg user.
# Tessdata may be installed in a version-specific directory (e.g.
# tesseract-ocr/5 or tesseract-ocr/4), so every candidate location is checked
# and the ones that do not exist are skipped.
# `a+rX` grants read to everyone and adds execute (search) permission only to
# directories and to files that are already executable, so the .traineddata
# files are not marked executable.
# The chmod stays best-effort: a failure is ignored and does not abort the build.
RUN set -eux; \
    echo "Setting up tessdata permissions..."; \
    for dir in /usr/share/tesseract-ocr/*/tessdata /usr/share/tesseract-ocr/tessdata /usr/share/tessdata; do \
        if [ -d "$dir" ]; then \
            chmod -R a+rX "$dir" 2>/dev/null || true; \
            if [ -f "$dir/eng.traineddata" ]; then \
                echo "✓ Found tessdata with eng.traineddata at: $dir"; \
            fi; \
        fi; \
    done; \
    echo "✓ Tessdata permissions configured"
# Runtime environment (keys sorted alphabetically).
# - HF_HOME and KREUZBERG_CACHE_DIR point at the cache directories under
#   /app/.kreuzberg.
# - LD_LIBRARY_PATH lists /usr/local/lib first; that is where the ONNX Runtime
#   libraries are copied.
# - RUST_LOG=info is the default log level. Per the project's note, third-party
#   transport crates (ureq, rustls, hyper_util, tower_http, hf_hub) are
#   pre-suppressed by the kreuzberg-cli subscriber defaults in logging.rs, so
#   they should not emit DEBUG output at the "info" root level.
# NOTE(review): no `cache warm` step is visible in this Dockerfile, so confirm
# whether HuggingFace models are pre-fetched at image build time or downloaded
# on first use into HF_HOME.
ENV HF_HOME=/app/.kreuzberg/huggingface \
    KREUZBERG_CACHE_DIR=/app/.kreuzberg \
    LD_LIBRARY_PATH=/usr/local/lib:/usr/lib:/lib \
    RUST_LOG=info
# Drop root privileges for everything that runs from here on
USER kreuzberg
# Port used by the default `serve` command
EXPOSE 8000
# Health check: runs `kreuzberg --version` every 30s (10s timeout, 5s start
# period, 3 retries).
# NOTE(review): this only verifies that the binary can execute; it does not
# check that the API server on port 8000 is answering requests. Confirm this
# mode-agnostic probe is intended, given the image can also run in CLI or MCP mode.
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
CMD ["/usr/local/bin/kreuzberg", "--version"]
# Set kreuzberg as entrypoint for flexible command usage: arguments passed to
# `docker run` replace the default CMD below and are handed to the binary.
# Default: Start API server (can be overridden for CLI or MCP mode)
ENTRYPOINT ["kreuzberg"]
CMD ["serve", "--host", "0.0.0.0", "--port", "8000"]