163 lines
6.2 KiB
Docker
163 lines
6.2 KiB
Docker
|
|
# =============================================================================
# Builder stage: compiles the kreuzberg Rust binary and gathers native libraries
# =============================================================================
FROM rust:1.91-trixie AS builder

# All build work in this stage happens under /build.
WORKDIR /build
|
||
|
|
|
||
|
|
# Toolchain and development headers required by the build (packages listed
# alphabetically). The apt package lists are removed in the same layer so they
# are never stored in it.
RUN apt-get update \
 && apt-get install -y --no-install-recommends \
        clang \
        cmake \
        curl \
        file \
        g++ \
        libleptonica-dev \
        libssl-dev \
        libtesseract-dev \
        pkg-config \
 && apt-get clean \
 && rm -rf /var/lib/apt/lists/*
|
||
|
|
|
||
|
|
# ONNX Runtime release to bundle; override with
# `--build-arg ONNXRUNTIME_VERSION=<version>`. The value is also exported into
# this stage's environment.
# TARGETARCH is the automatic platform build argument for the image being built.
ARG TARGETARCH
ARG ONNXRUNTIME_VERSION=1.24.2
ENV ONNXRUNTIME_VERSION=${ONNXRUNTIME_VERSION}

# Download the prebuilt ONNX Runtime tarball for the target architecture
# (aarch64 for arm64, x64 for everything else), unpack it into
# /build/onnxruntime without its top-level directory, then delete the archive.
RUN case "${TARGETARCH}" in \
        arm64) ORT_ARCH="aarch64" ;; \
        *)     ORT_ARCH="x64" ;; \
    esac && \
    mkdir -p /build/onnxruntime && \
    curl -fL -o /build/onnxruntime.tgz \
        "https://github.com/microsoft/onnxruntime/releases/download/v${ONNXRUNTIME_VERSION}/onnxruntime-linux-${ORT_ARCH}-${ONNXRUNTIME_VERSION}.tgz" && \
    tar -xzf /build/onnxruntime.tgz -C /build/onnxruntime --strip-components=1 && \
    rm /build/onnxruntime.tgz
|
||
|
|
|
||
|
|
# Bring in the workspace manifest and lockfile, followed by only the crates
# that this image builds.
COPY Cargo.toml Cargo.lock ./
COPY crates/kreuzberg/ crates/kreuzberg/
COPY crates/kreuzberg-cli/ crates/kreuzberg-cli/
COPY crates/kreuzberg-tesseract/ crates/kreuzberg-tesseract/
COPY crates/kreuzberg-paddle-ocr/ crates/kreuzberg-paddle-ocr/
|
||
|
|
|
||
|
|
# Only a subset of the workspace was copied into this stage, so delete every
# Cargo.toml line that refers to a member that is absent here: language
# bindings (Python, Ruby, Node, PHP, Elixir, Dart, Swift), FFI, WASM, and the
# benchmark / e2e / snippet tooling. The range expression additionally removes
# everything from the [profile.release.package.kreuzberg-wasm] header through
# the end of the file.
RUN sed -i \
        -e '/kreuzberg-py/d' \
        -e '/kreuzberg_rb/d' \
        -e '/kreuzberg-node/d' \
        -e '/kreuzberg-ffi/d' \
        -e '/kreuzberg-php/d' \
        -e '/kreuzberg_rustler/d' \
        -e '/kreuzberg_nif/d' \
        -e '/packages\/dart\/rust/d' \
        -e '/packages\/swift\/rust/d' \
        -e '/"crates\/kreuzberg-wasm"/d' \
        -e '/^\[profile\.release\.package\.kreuzberg-wasm\]$/,$d' \
        -e '/benchmark-harness/d' \
        -e '/e2e-generator/d' \
        -e '/snippet-runner/d' \
        -e '/e2e\/rust/d' \
        Cargo.toml
|
||
|
|
|
||
|
|
# Compile the kreuzberg-cli package in release mode with the `all` feature set.
# The cargo registry, cargo git checkouts and the target directory are cache
# mounts: they persist between builds but are not part of the resulting layer,
# so the finished binary is copied out of the mounted target directory before
# it is stripped.
RUN --mount=type=cache,target=/usr/local/cargo/registry \
    --mount=type=cache,target=/usr/local/cargo/git \
    --mount=type=cache,target=/build/target \
    cargo build --release -p kreuzberg-cli --features all \
 && cp /build/target/release/kreuzberg /build/kreuzberg \
 && strip /build/kreuzberg
|
||
|
|
|
||
|
|
# =============================================================================
# Runtime stage: slim Debian image holding only what is needed at run time
# =============================================================================
FROM debian:trixie-slim

# OCI image metadata.
LABEL org.opencontainers.image.source="https://github.com/kreuzberg-dev/kreuzberg" \
      org.opencontainers.image.description="Kreuzberg document intelligence - full variant" \
      org.opencontainers.image.licenses="MIT"

WORKDIR /app
|
||
|
|
|
||
|
|
# Runtime packages (listed alphabetically): TLS roots and libssl, curl,
# fontconfig, and Tesseract with its OSD data plus the bundled language packs.
# Package lists and temp directories are cleared in the same layer.
RUN apt-get update \
 && apt-get install -y --no-install-recommends \
        ca-certificates \
        curl \
        fontconfig \
        libssl3 \
        tesseract-ocr \
        tesseract-ocr-ara \
        tesseract-ocr-chi-sim \
        tesseract-ocr-chi-tra \
        tesseract-ocr-deu \
        tesseract-ocr-eng \
        tesseract-ocr-fra \
        tesseract-ocr-hin \
        tesseract-ocr-ita \
        tesseract-ocr-jpn \
        tesseract-ocr-osd \
        tesseract-ocr-por \
        tesseract-ocr-rus \
        tesseract-ocr-spa \
 && apt-get clean \
 && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
|
||
|
|
|
||
|
|
# Copy the ONNX Runtime shared libraries from the builder stage, then refresh
# the dynamic linker cache so they are resolvable from /usr/local/lib.
COPY --from=builder /build/onnxruntime/lib/libonnxruntime.so* /usr/local/lib/
RUN ldconfig

# Copy the kreuzberg binary from the builder stage. The executable mode is set
# by COPY itself instead of a follow-up `RUN chmod`, which would add another
# layer containing a second copy of the binary.
COPY --from=builder --chmod=0755 /build/kreuzberg /usr/local/bin/kreuzberg
|
||
|
|
|
||
|
|
# Create the non-root system account (home /app, no login shell) together with
# every directory it needs:
#   /app/.kreuzberg              - kreuzberg cache directory
#   /app/.kreuzberg/huggingface  - Hugging Face cache for embeddings models
#   /app/.config/kreuzberg       - config directory for volume mounts
#   /etc/kreuzberg               - config directory for volume mounts
# Ownership is applied once, recursively, after all directories exist, so that
# intermediate paths such as /app/.config also belong to the kreuzberg user
# rather than staying root-owned inside its own home directory.
RUN groupadd -r kreuzberg && \
    useradd -r -g kreuzberg -d /app -s /sbin/nologin kreuzberg && \
    mkdir -p \
        /app/.config/kreuzberg \
        /app/.kreuzberg/huggingface \
        /etc/kreuzberg && \
    chown -R kreuzberg:kreuzberg /app /etc/kreuzberg
|
||
|
|
|
||
|
|
# Pre-download all models (PaddleOCR + layout detection) at build time using
# `kreuzberg cache warm`.
# HF_HOME is set for this step to the same path the runtime ENV uses, so that
# anything fetched through the Hugging Face cache is written where the running
# container will look for it instead of the build user's default location.
# NOTE(review): confirm whether `cache warm` honours HF_HOME or relies solely on
# --cache-dir; setting it is harmless in the latter case.
# The step runs as root, so the cache tree is handed to the kreuzberg user
# afterwards.
RUN KREUZBERG_CACHE_DIR=/app/.kreuzberg \
    HF_HOME=/app/.kreuzberg/huggingface \
    /usr/local/bin/kreuzberg cache warm --cache-dir /app/.kreuzberg --format json && \
    chown -R kreuzberg:kreuzberg /app/.kreuzberg && \
    echo "All models ready (PaddleOCR + layout detection)"
|
||
|
|
|
||
|
|
# Ensure the non-root kreuzberg user can read the Tesseract language data.
# Tessdata is installed in a version-specific directory (e.g. tesseract-ocr/5
# or tesseract-ocr/4), so every known location is checked and missing ones are
# skipped.
# `a+rX` grants read access to everyone but adds the execute/search bit only to
# directories (and to files that are already executable). Unlike `a+rx` it does
# not mark the traineddata files executable, and files whose mode is already
# correct are not modified, so they are not duplicated into this layer.
# chmod failures are deliberately ignored (best effort).
RUN set -eux; \
    echo "Setting up tessdata permissions..."; \
    for dir in /usr/share/tesseract-ocr/*/tessdata /usr/share/tesseract-ocr/tessdata /usr/share/tessdata; do \
        if [ -d "$dir" ]; then \
            chmod -R a+rX "$dir" 2>/dev/null || true; \
            if [ -f "$dir/eng.traineddata" ]; then \
                echo "✓ Found tessdata with eng.traineddata at: $dir"; \
            fi; \
        fi; \
    done; \
    echo "✓ Tessdata permissions configured"
|
||
|
|
|
||
|
|
# Runtime environment.
#
# KREUZBERG_CACHE_DIR / HF_HOME
#   Locations of the kreuzberg cache and the Hugging Face cache. HuggingFace
#   model downloads happen once, at image build time, via `cache warm`; runtime
#   /extract requests are served from the HF disk cache under HF_HOME and
#   generate no network traffic.
#
# RUST_LOG=info
#   The right default for this image. Third-party transport crates (ureq,
#   rustls, hyper_util, tower_http, hf_hub) are pre-suppressed by the
#   kreuzberg-cli subscriber defaults in logging.rs, so they won't emit DEBUG
#   output even with "info" as the root level.
#
# LD_LIBRARY_PATH
#   Library search path; /usr/local/lib comes first.
ENV HF_HOME=/app/.kreuzberg/huggingface \
    KREUZBERG_CACHE_DIR=/app/.kreuzberg \
    LD_LIBRARY_PATH=/usr/local/lib:/usr/lib:/lib \
    RUST_LOG=info
|
||
|
|
|
||
|
|
# Run everything from here on as the unprivileged kreuzberg user.
USER kreuzberg

# Port used by the default server command below.
EXPOSE 8000

# Health check: invokes the binary with --version. This confirms that the
# executable starts; it does not probe the HTTP server.
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
    CMD ["/usr/local/bin/kreuzberg", "--version"]

# kreuzberg is the entrypoint, so the arguments can be swapped freely.
# Default: start the API server (override the command for CLI or MCP mode).
ENTRYPOINT ["kreuzberg"]
CMD ["serve", "--host", "0.0.0.0", "--port", "8000"]
|