Nomad changes
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s

This commit is contained in:
Henrik Jess Nielsen
2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions

75
docker/Dockerfile.cli Normal file
View File

@@ -0,0 +1,75 @@
# =============================================================================
# Minimal CLI-only Docker image for Kreuzberg.
#
# Builds a musl-linked binary on Alpine (dynamically linked: crt-static is
# disabled so shared libraries such as ONNX Runtime can be loaded), then copies
# it into a minimal Alpine image for shell access and volume mounts.
#
# Usage:
# docker build -f docker/Dockerfile.cli -t kreuzberg-cli .
# docker run -v $(pwd):/data kreuzberg-cli extract /data/document.pdf
# =============================================================================
# Stage 1: Build the musl-linked CLI binary on Alpine.
# The result is dynamically linked (crt-static is disabled below), so the
# runtime stage must provide the shared libraries it needs.
FROM alpine:3.21 AS builder
ARG RUST_TOOLCHAIN=nightly-2026-03-10
WORKDIR /build
# Install build dependencies + ONNX Runtime from Alpine edge (musl-native).
# ort-sys checks ORT_LIB_LOCATION before attempting download, so this overrides
# the download-binaries feature transparently — no Cargo feature changes needed.
# Edge repos needed because onnxruntime depends on abseil-cpp/protobuf from edge/main.
RUN apk add --no-cache \
        curl gcc g++ musl-dev cmake make pkgconf \
        openssl-dev openssl-libs-static \
        perl linux-headers git file && \
    apk add --no-cache onnxruntime-dev \
        --repository=https://dl-cdn.alpinelinux.org/alpine/edge/community \
        --repository=https://dl-cdn.alpinelinux.org/alpine/edge/main
# Install Rust via rustup.
# pipefail makes this step fail if the download itself fails; without it the
# pipeline's status is that of `sh`, which exits 0 on empty input and would
# leave the image without a toolchain until a later step breaks.
RUN set -o pipefail && \
    curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs \
    | sh -s -- -y --default-toolchain "${RUST_TOOLCHAIN}" --component rust-src
ENV PATH="/root/.cargo/bin:${PATH}"
# Allow dynamic loading (dlopen) on musl targets by disabling crt-static.
ENV RUSTFLAGS="-C target-feature=-crt-static"
# Point ort-sys to Alpine's system ORT library instead of downloading prebuilt binaries.
ENV ORT_LIB_LOCATION=/usr/lib
ENV ORT_PREFER_DYNAMIC_LINK=1
# Copy workspace manifests and only the crates the CLI build needs.
COPY Cargo.toml Cargo.lock ./
COPY crates/kreuzberg/ crates/kreuzberg/
COPY crates/kreuzberg-cli/ crates/kreuzberg-cli/
COPY crates/kreuzberg-tesseract/ crates/kreuzberg-tesseract/
COPY crates/kreuzberg-paddle-ocr/ crates/kreuzberg-paddle-ocr/
# Remove workspace members that were not copied into the build context, and
# drop everything from the kreuzberg-wasm release-profile section to the end
# of the manifest. The wasm pattern uses a plain `"` (as the other Dockerfiles
# in this directory do) rather than the undefined BRE escape `\"`.
RUN sed -i '/kreuzberg-py/d; /kreuzberg_rb/d; /kreuzberg-node/d; /kreuzberg-ffi/d; /kreuzberg-php/d; /kreuzberg_rustler/d; /kreuzberg_nif/d; /packages\/dart\/rust/d; /packages\/swift\/rust/d; /"crates\/kreuzberg-wasm"/d; /^\[profile\.release\.package\.kreuzberg-wasm\]$/,$d; /benchmark-harness/d; /e2e-generator/d; /snippet-runner/d; /e2e\/rust/d' Cargo.toml
# Build the CLI, copy it out of target/, and strip symbols to shrink it.
RUN cargo build --release --package kreuzberg-cli --features all && \
    cp target/release/kreuzberg /build/kreuzberg && \
    strip /build/kreuzberg
# Verify the binary was built successfully and log its shared-library needs.
RUN file /build/kreuzberg && \
    echo "=== Dynamic dependencies ===" && \
    readelf -d /build/kreuzberg 2>/dev/null | grep NEEDED || echo "No external dependencies"
# =============================================================================
# Stage 2: Minimal runtime image
# =============================================================================
FROM alpine:3.21
# Shared libraries the dynamically linked binary loads at run time:
# the C++/GCC runtimes from the release repos, ONNX Runtime from edge.
RUN apk add --no-cache \
        libgcc \
        libstdc++ \
    && apk add --no-cache \
        --repository=https://dl-cdn.alpinelinux.org/alpine/edge/community \
        --repository=https://dl-cdn.alpinelinux.org/alpine/edge/main \
        onnxruntime
# Only the stripped binary is taken from the builder stage.
COPY --from=builder /build/kreuzberg /usr/local/bin/kreuzberg
# Arguments given to `docker run` are passed straight to the CLI.
ENTRYPOINT ["kreuzberg"]

162
docker/Dockerfile.core Normal file
View File

@@ -0,0 +1,162 @@
# =============================================================================
# Builder Stage - Build Rust binary with all dependencies
# =============================================================================
FROM rust:1.91-trixie AS builder
WORKDIR /build
# Toolchain and development headers needed to compile the workspace
# (one package per line, sorted, to keep diffs small).
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        clang \
        cmake \
        curl \
        file \
        g++ \
        libleptonica-dev \
        libssl-dev \
        libtesseract-dev \
        pkg-config \
    && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*
# ONNX Runtime release to fetch (override with --build-arg ONNXRUNTIME_VERSION=...).
ARG ONNXRUNTIME_VERSION=1.24.2
# Populated automatically by BuildKit with the target architecture.
ARG TARGETARCH
ENV ONNXRUNTIME_VERSION=${ONNXRUNTIME_VERSION}
# Fetch the ONNX Runtime release tarball and unpack it into /build/onnxruntime.
# arm64 maps to the "aarch64" asset; every other architecture uses "x64".
RUN mkdir -p /build/onnxruntime && \
    case "$TARGETARCH" in \
        arm64) ORT_ARCH="aarch64" ;; \
        *)     ORT_ARCH="x64" ;; \
    esac && \
    curl -fL "https://github.com/microsoft/onnxruntime/releases/download/v${ONNXRUNTIME_VERSION}/onnxruntime-linux-${ORT_ARCH}-${ONNXRUNTIME_VERSION}.tgz" \
        -o /build/onnxruntime.tgz && \
    tar -xzf /build/onnxruntime.tgz -C /build/onnxruntime --strip-components=1 && \
    rm /build/onnxruntime.tgz
# Workspace manifests first, then only the crates this image builds.
COPY Cargo.toml Cargo.lock ./
COPY crates/kreuzberg/ crates/kreuzberg/
COPY crates/kreuzberg-cli/ crates/kreuzberg-cli/
COPY crates/kreuzberg-tesseract/ crates/kreuzberg-tesseract/
COPY crates/kreuzberg-paddle-ocr/ crates/kreuzberg-paddle-ocr/
# Strip workspace members that were not copied (language bindings, tools, e2e)
# and cut the manifest from the kreuzberg-wasm release-profile section onward.
RUN sed -i '/kreuzberg-py/d; /kreuzberg_rb/d; /kreuzberg-node/d; /kreuzberg-ffi/d; /kreuzberg-php/d; /kreuzberg_rustler/d; /kreuzberg_nif/d; /packages\/dart\/rust/d; /packages\/swift\/rust/d; /"crates\/kreuzberg-wasm"/d; /^\[profile\.release\.package\.kreuzberg-wasm\]$/,$d; /benchmark-harness/d; /e2e-generator/d; /snippet-runner/d; /e2e\/rust/d' Cargo.toml
# Release build of the CLI with all features. The cargo registry, git checkouts
# and target/ live in BuildKit cache mounts, so the finished binary is copied
# out to /build within the same RUN before the mounts disappear.
RUN --mount=type=cache,target=/usr/local/cargo/registry \
    --mount=type=cache,target=/usr/local/cargo/git \
    --mount=type=cache,target=/build/target \
    cargo build --release --package kreuzberg-cli --features all && \
    cp target/release/kreuzberg /build/kreuzberg && \
    strip /build/kreuzberg
# =============================================================================
# Runtime Stage - Minimal runtime environment
# =============================================================================
FROM debian:trixie-slim
# OCI labels for container metadata
LABEL org.opencontainers.image.source="https://github.com/kreuzberg-dev/kreuzberg"
LABEL org.opencontainers.image.description="Kreuzberg document intelligence - core variant"
LABEL org.opencontainers.image.licenses="MIT"
WORKDIR /app
# Install runtime dependencies: CA certificates, curl, and Tesseract with its
# OSD data and 12 language packs. Nothing is downloaded here beyond apt
# packages, so no architecture argument is needed in this stage.
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        ca-certificates \
        curl \
        tesseract-ocr \
        tesseract-ocr-eng \
        tesseract-ocr-osd \
        tesseract-ocr-spa \
        tesseract-ocr-fra \
        tesseract-ocr-deu \
        tesseract-ocr-ita \
        tesseract-ocr-por \
        tesseract-ocr-chi-sim \
        tesseract-ocr-chi-tra \
        tesseract-ocr-jpn \
        tesseract-ocr-ara \
        tesseract-ocr-rus \
        tesseract-ocr-hin \
    && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
# Copy ONNX Runtime libraries from builder and refresh the linker cache
COPY --from=builder /build/onnxruntime/lib/libonnxruntime.so* /usr/local/lib/
RUN ldconfig
# Copy binary from builder. The mode is set on the COPY itself instead of in a
# follow-up `RUN chmod`, which would add an extra layer for the same file.
COPY --from=builder --chmod=0755 /build/kreuzberg /usr/local/bin/kreuzberg
# Create non-root user
RUN groupadd -r kreuzberg && \
    useradd -r -g kreuzberg -d /app -s /sbin/nologin kreuzberg && \
    mkdir -p /app/.kreuzberg && \
    chown -R kreuzberg:kreuzberg /app
# Create config directories for volume mounts
RUN mkdir -p /etc/kreuzberg /app/.config/kreuzberg && \
    chown -R kreuzberg:kreuzberg /etc/kreuzberg /app/.config/kreuzberg
# Create Hugging Face cache directory for embeddings models
RUN mkdir -p /app/.kreuzberg/huggingface && \
    chown -R kreuzberg:kreuzberg /app/.kreuzberg/huggingface
# Create PaddleOCR model cache directory (models downloaded on demand if paddle-ocr feature used)
RUN mkdir -p /app/.kreuzberg/paddle-ocr && \
    chown -R kreuzberg:kreuzberg /app/.kreuzberg/paddle-ocr
# Ensure read permissions on tessdata files for non-root user
# Tessdata is installed in version-specific directory (e.g., tesseract-ocr/5 or tesseract-ocr/4)
# Make all tessdata directories readable by the non-root kreuzberg user
RUN set -eux; \
    echo "Setting up tessdata permissions..."; \
    for dir in /usr/share/tesseract-ocr/*/tessdata /usr/share/tesseract-ocr/tessdata /usr/share/tessdata; do \
        if [ -d "$dir" ]; then \
            chmod -R a+rx "$dir" 2>/dev/null || true; \
            if [ -f "$dir/eng.traineddata" ]; then \
                echo "✓ Found tessdata with eng.traineddata at: $dir"; \
            fi; \
        fi; \
    done; \
    echo "✓ Tessdata permissions configured"
# Environment configuration.
# RUST_LOG=info is the right default here. Third-party transport crates
# (ureq, rustls, hyper_util, tower_http, hf_hub) are pre-suppressed by the
# kreuzberg-cli subscriber defaults in logging.rs, so they won't emit DEBUG
# even at the "info" root level.
# Unlike the full variant, this image runs no `cache warm` step, so no models
# are baked in at build time; the cache directories created above start empty.
# NOTE(review): models are presumably fetched on first use into these
# directories — confirm against the CLI's cache behaviour.
ENV KREUZBERG_CACHE_DIR=/app/.kreuzberg \
    HF_HOME=/app/.kreuzberg/huggingface \
    RUST_LOG=info \
    LD_LIBRARY_PATH=/usr/local/lib:/usr/lib:/lib
USER kreuzberg
EXPOSE 8000
# Health check. `--version` only proves the binary and its shared libraries
# load; it does not probe the HTTP server, and it keeps passing when the
# command is overridden for CLI or MCP use.
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
    CMD ["/usr/local/bin/kreuzberg", "--version"]
# Set kreuzberg as entrypoint for flexible command usage
# Default: Start API server (can be overridden for CLI or MCP mode)
ENTRYPOINT ["kreuzberg"]
CMD ["serve", "--host", "0.0.0.0", "--port", "8000"]

162
docker/Dockerfile.full Normal file
View File

@@ -0,0 +1,162 @@
# =============================================================================
# Builder Stage - Build Rust binary with all dependencies
# =============================================================================
FROM rust:1.91-trixie AS builder
WORKDIR /build
# Toolchain and development headers needed to compile the workspace
# (one package per line, sorted, to keep diffs small).
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        clang \
        cmake \
        curl \
        file \
        g++ \
        libleptonica-dev \
        libssl-dev \
        libtesseract-dev \
        pkg-config \
    && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*
# ONNX Runtime release to fetch (override with --build-arg ONNXRUNTIME_VERSION=...).
ARG ONNXRUNTIME_VERSION=1.24.2
# Populated automatically by BuildKit with the target architecture.
ARG TARGETARCH
ENV ONNXRUNTIME_VERSION=${ONNXRUNTIME_VERSION}
# Fetch the ONNX Runtime release tarball and unpack it into /build/onnxruntime.
# arm64 maps to the "aarch64" asset; every other architecture uses "x64".
RUN mkdir -p /build/onnxruntime && \
    case "$TARGETARCH" in \
        arm64) ORT_ARCH="aarch64" ;; \
        *)     ORT_ARCH="x64" ;; \
    esac && \
    curl -fL "https://github.com/microsoft/onnxruntime/releases/download/v${ONNXRUNTIME_VERSION}/onnxruntime-linux-${ORT_ARCH}-${ONNXRUNTIME_VERSION}.tgz" \
        -o /build/onnxruntime.tgz && \
    tar -xzf /build/onnxruntime.tgz -C /build/onnxruntime --strip-components=1 && \
    rm /build/onnxruntime.tgz
# Workspace manifests first, then only the crates this image builds.
COPY Cargo.toml Cargo.lock ./
COPY crates/kreuzberg/ crates/kreuzberg/
COPY crates/kreuzberg-cli/ crates/kreuzberg-cli/
COPY crates/kreuzberg-tesseract/ crates/kreuzberg-tesseract/
COPY crates/kreuzberg-paddle-ocr/ crates/kreuzberg-paddle-ocr/
# Strip workspace members that were not copied (Ruby, Node, Python, PHP, Elixir,
# tools, e2e) and cut the manifest from the kreuzberg-wasm release-profile
# section onward.
RUN sed -i '/kreuzberg-py/d; /kreuzberg_rb/d; /kreuzberg-node/d; /kreuzberg-ffi/d; /kreuzberg-php/d; /kreuzberg_rustler/d; /kreuzberg_nif/d; /packages\/dart\/rust/d; /packages\/swift\/rust/d; /"crates\/kreuzberg-wasm"/d; /^\[profile\.release\.package\.kreuzberg-wasm\]$/,$d; /benchmark-harness/d; /e2e-generator/d; /snippet-runner/d; /e2e\/rust/d' Cargo.toml
# Release build of the CLI with all features. The cargo registry, git checkouts
# and target/ live in BuildKit cache mounts, so the finished binary is copied
# out to /build within the same RUN before the mounts disappear.
RUN --mount=type=cache,target=/usr/local/cargo/registry \
    --mount=type=cache,target=/usr/local/cargo/git \
    --mount=type=cache,target=/build/target \
    cargo build --release --package kreuzberg-cli --features all && \
    cp target/release/kreuzberg /build/kreuzberg && \
    strip /build/kreuzberg
# =============================================================================
# Runtime Stage - Minimal runtime environment
# =============================================================================
FROM debian:trixie-slim
# OCI labels for container metadata
LABEL org.opencontainers.image.source="https://github.com/kreuzberg-dev/kreuzberg"
LABEL org.opencontainers.image.description="Kreuzberg document intelligence - full variant"
LABEL org.opencontainers.image.licenses="MIT"
WORKDIR /app
# Install runtime dependencies: CA certificates, curl, Tesseract with its OSD
# data and 12 language packs, plus fontconfig and libssl3.
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        ca-certificates \
        curl \
        tesseract-ocr \
        tesseract-ocr-eng \
        tesseract-ocr-osd \
        tesseract-ocr-spa \
        tesseract-ocr-fra \
        tesseract-ocr-deu \
        tesseract-ocr-ita \
        tesseract-ocr-por \
        tesseract-ocr-chi-sim \
        tesseract-ocr-chi-tra \
        tesseract-ocr-jpn \
        tesseract-ocr-ara \
        tesseract-ocr-rus \
        tesseract-ocr-hin \
        fontconfig \
        libssl3 \
    && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
# Copy ONNX Runtime libraries from builder and refresh the linker cache
COPY --from=builder /build/onnxruntime/lib/libonnxruntime.so* /usr/local/lib/
RUN ldconfig
# Copy binary from builder. The mode is set on the COPY itself instead of in a
# follow-up `RUN chmod`, which would add an extra layer for the same file.
COPY --from=builder --chmod=0755 /build/kreuzberg /usr/local/bin/kreuzberg
# Create non-root user
RUN groupadd -r kreuzberg && \
    useradd -r -g kreuzberg -d /app -s /sbin/nologin kreuzberg && \
    mkdir -p /app/.kreuzberg && \
    chown -R kreuzberg:kreuzberg /app
# Create config directories for volume mounts
RUN mkdir -p /etc/kreuzberg /app/.config/kreuzberg && \
    chown -R kreuzberg:kreuzberg /etc/kreuzberg /app/.config/kreuzberg
# Create Hugging Face cache directory for embeddings models
RUN mkdir -p /app/.kreuzberg/huggingface && \
    chown -R kreuzberg:kreuzberg /app/.kreuzberg/huggingface
# Pre-download all models (PaddleOCR + layout detection) using kreuzberg cache warm.
# This step runs as root before the ENV block below takes effect, so the cache
# locations are passed explicitly and match the runtime values. Anything the
# tool stores relative to HF_HOME then lands where the non-root runtime user
# will look for it, and the chown afterwards hands the whole cache to that user.
RUN KREUZBERG_CACHE_DIR=/app/.kreuzberg \
    HF_HOME=/app/.kreuzberg/huggingface \
    /usr/local/bin/kreuzberg cache warm --cache-dir /app/.kreuzberg --format json && \
    chown -R kreuzberg:kreuzberg /app/.kreuzberg && \
    echo "All models ready (PaddleOCR + layout detection)"
# Ensure read permissions on tessdata files for non-root user
# Tessdata is installed in version-specific directory (e.g., tesseract-ocr/5 or tesseract-ocr/4)
# Make all tessdata directories readable by the non-root kreuzberg user
RUN set -eux; \
    echo "Setting up tessdata permissions..."; \
    for dir in /usr/share/tesseract-ocr/*/tessdata /usr/share/tesseract-ocr/tessdata /usr/share/tessdata; do \
        if [ -d "$dir" ]; then \
            chmod -R a+rx "$dir" 2>/dev/null || true; \
            if [ -f "$dir/eng.traineddata" ]; then \
                echo "✓ Found tessdata with eng.traineddata at: $dir"; \
            fi; \
        fi; \
    done; \
    echo "✓ Tessdata permissions configured"
# Environment configuration.
# RUST_LOG=info is the right default here. Third-party transport crates
# (ureq, rustls, hyper_util, tower_http, hf_hub) are pre-suppressed by the
# kreuzberg-cli subscriber defaults in logging.rs, so they won't emit DEBUG
# even at the "info" root level. HuggingFace model downloads happen once at
# image build time via `cache warm`; runtime /extract requests hit the HF
# disk cache under HF_HOME and generate no network traffic.
ENV KREUZBERG_CACHE_DIR=/app/.kreuzberg \
    HF_HOME=/app/.kreuzberg/huggingface \
    RUST_LOG=info \
    LD_LIBRARY_PATH=/usr/local/lib:/usr/lib:/lib
USER kreuzberg
EXPOSE 8000
# Health check. `--version` only proves the binary and its shared libraries
# load; it does not probe the HTTP server, and it keeps passing when the
# command is overridden for CLI or MCP use.
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
    CMD ["/usr/local/bin/kreuzberg", "--version"]
# Set kreuzberg as entrypoint for flexible command usage
# Default: Start API server (can be overridden for CLI or MCP mode)
ENTRYPOINT ["kreuzberg"]
CMD ["serve", "--host", "0.0.0.0", "--port", "8000"]

View File

@@ -0,0 +1,142 @@
# =============================================================================
# Alpine-based builder for musl CLI binaries.
#
# Usage:
# docker build -f docker/Dockerfile.musl-build \
# --output type=local,dest=./dist .
#
# The only build argument this file declares is RUST_TOOLCHAIN
# (e.g. --build-arg RUST_TOOLCHAIN=<toolchain>).
#
# Produces: dist/kreuzberg (launcher script), dist/kreuzberg.bin (the binary)
# and dist/lib/ (musl loader + runtime libraries)
#
# Runtime libraries (musl libc, libstdc++, libgcc) are bundled alongside
# the binary for portability across Linux distros.
# =============================================================================
# Builder stage: compile the CLI against musl on Alpine.
FROM alpine:3.21 AS builder
ARG RUST_TOOLCHAIN=nightly-2026-03-10
WORKDIR /build
# Install build dependencies — Alpine's g++ and libstdc++ are musl-native,
# so tesseract C++ compilation works without glibc conflicts.
# onnxruntime-dev from edge provides musl-native ORT for linking.
# patchelf is needed later to set the binary's RPATH.
RUN apk add --no-cache \
        curl gcc g++ musl-dev cmake make pkgconf \
        openssl-dev openssl-libs-static \
        perl linux-headers git file patchelf && \
    apk add --no-cache onnxruntime-dev \
        --repository=https://dl-cdn.alpinelinux.org/alpine/edge/community \
        --repository=https://dl-cdn.alpinelinux.org/alpine/edge/main
# Install Rust via rustup (Alpine's packaged Rust may be too old / not nightly).
# pipefail makes this step fail if the download itself fails; without it the
# pipeline's status is that of `sh`, which exits 0 on empty input.
RUN set -o pipefail && \
    curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs \
    | sh -s -- -y --default-toolchain "${RUST_TOOLCHAIN}" --component rust-src && \
    echo "Rust host: $(~/.cargo/bin/rustc -vV | grep host)" && \
    echo "Default target: $(~/.cargo/bin/rustc --print cfg | grep target)"
ENV PATH="/root/.cargo/bin:${PATH}"
# Disable crt-static so the binary can dlopen shared libraries at runtime.
ENV RUSTFLAGS="-C target-feature=-crt-static"
# Point ort-sys to Alpine's system ORT library instead of downloading prebuilt binaries.
# ort-sys checks ORT_LIB_LOCATION before attempting download (build/main.rs line 45).
ENV ORT_LIB_LOCATION=/usr/lib
ENV ORT_PREFER_DYNAMIC_LINK=1
# Copy workspace manifests and only the crates the CLI build needs.
COPY Cargo.toml Cargo.lock ./
COPY crates/kreuzberg/ crates/kreuzberg/
COPY crates/kreuzberg-cli/ crates/kreuzberg-cli/
COPY crates/kreuzberg-tesseract/ crates/kreuzberg-tesseract/
COPY crates/kreuzberg-paddle-ocr/ crates/kreuzberg-paddle-ocr/
# Remove workspace members that were not copied, and drop everything from the
# kreuzberg-wasm release-profile section to the end of the manifest.
RUN sed -i '/kreuzberg-py/d; /kreuzberg_rb/d; /kreuzberg-node/d; /kreuzberg-ffi/d; /kreuzberg-php/d; /kreuzberg_rustler/d; /kreuzberg_nif/d; /packages\/dart\/rust/d; /packages\/swift\/rust/d; /"crates\/kreuzberg-wasm"/d; /^\[profile\.release\.package\.kreuzberg-wasm\]$/,$d; /benchmark-harness/d; /e2e-generator/d; /snippet-runner/d; /e2e\/rust/d' Cargo.toml
# Build the CLI, copy it out of target/, and strip symbols to shrink it.
RUN cargo build --release --package kreuzberg-cli --features all && \
    cp target/release/kreuzberg /build/kreuzberg && \
    strip /build/kreuzberg
# Set RPATH so the binary finds shared libs relative to itself
RUN patchelf --set-rpath '$ORIGIN/lib' /build/kreuzberg
# Collect runtime libraries.
#
# The launcher (below) invokes the bundled musl loader with
# `--library-path lib/`, and the tarball is meant to run on hosts that have no
# musl libraries of their own. The bundle must therefore be self-contained:
# every transitive dependency of the binary and of every shipped .so has to
# live in /build/lib/ too, otherwise the loader prints "Error loading shared
# library X: No such file or directory" at startup (issue #991).
#
# Strategy:
# 1. Copy the well-known runtime bits (musl loader, libstdc++, libgcc, ORT).
# 2. Recursively list the dependencies of the binary and of every .so in the
#    bundle, and copy any host lib they resolve to that isn't already present.
# 3. Smoke-test the loader against the binary and each shipped .so so the
#    build FAILS if anything is still missing — better to break the image than
#    to ship a tarball that crashes on first invocation.
#
# At this point /build/kreuzberg is still the real ELF binary; it is renamed
# to kreuzberg.bin by the next step.
RUN set -eu; \
    mkdir -p /build/lib; \
    cp /usr/lib/libstdc++.so.6 /build/lib/; \
    cp /usr/lib/libgcc_s.so.1 /build/lib/; \
    # Bundle ONNX Runtime for embeddings/layout-detection at runtime
    # (best effort: a missing library is caught by the smoke test below if
    # anything actually depends on it).
    cp /usr/lib/libonnxruntime.so* /build/lib/ 2>/dev/null || true; \
    # Copy the musl dynamic linker/libc. This one is mandatory — the launcher
    # script executes it — so a failed copy must abort the build.
    cp /lib/ld-musl-*.so.1 /build/lib/; \
    LOADER="$(ls /build/lib/ld-musl-*.so.1 | head -n1)"; \
    [ -x "$LOADER" ] || { echo "FAIL: musl loader missing from /build/lib" >&2; exit 1; }; \
    # Recursively resolve transitive deps via the loader's --list mode
    # (resolves against system paths). Re-walk until no new libraries are
    # pulled in to handle multi-level chains (libonnxruntime →
    # libprotobuf-lite → libabsl_* → ...). The binary is walked as well, so
    # its direct dependencies are bundled, not only those of the seed libs.
    while :; do \
        before=$(ls /build/lib | wc -l); \
        for obj in /build/kreuzberg /build/lib/*.so*; do \
            # Skip the loader itself; listing it is meaningless.
            case "$obj" in *ld-musl-*) continue ;; esac; \
            "$LOADER" --list "$obj" 2>/dev/null \
                | awk '/=>/ { print $3 }' \
                | grep -E '^/' \
                | while read -r dep; do \
                    base="$(basename "$dep")"; \
                    if [ ! -e "/build/lib/$base" ]; then \
                        cp -L "$dep" /build/lib/; \
                    fi; \
                done; \
        done; \
        after=$(ls /build/lib | wc -l); \
        [ "$before" = "$after" ] && break; \
    done; \
    # Verify the bundle is self-contained. An object fails when the loader
    # reports an error for it, or when one of its dependencies resolves to a
    # path outside /build/lib/ (i.e. the build host, not the bundle, supplied
    # it). The loader's own entry is ignored.
    for obj in /build/kreuzberg /build/lib/*.so*; do \
        case "$obj" in *ld-musl-*) continue ;; esac; \
        problems="$("$LOADER" --library-path /build/lib --list "$obj" 2>&1 \
            | awk '/Error loading shared library|not found/ { print; next } \
                   /=>/ && $3 ~ /^\// && $3 !~ /^\/build\/lib\// && $3 !~ /ld-musl-/ { print }')"; \
        if [ -n "$problems" ]; then \
            echo "FAIL: $obj has unresolved dependencies inside the bundle:" >&2; \
            printf '%s\n' "$problems" >&2; \
            exit 1; \
        fi; \
    done; \
    echo "OK: every bundled library resolves inside /build/lib/"
# Replace /build/kreuzberg with a small POSIX-sh launcher and keep the real
# ELF next to it as kreuzberg.bin. The launcher locates its own directory and
# executes the bundled musl loader from lib/, pointing it at lib/ for shared
# libraries, so the program starts on any Linux distro (glibc or musl) without
# relying on system libraries.
RUN set -e; \
    LOADER_FILE="$(basename /lib/ld-musl-*.so.1)"; \
    mv /build/kreuzberg /build/kreuzberg.bin; \
    printf '#!/bin/sh\nSCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"\nexec "$SCRIPT_DIR/lib/%s" --library-path "$SCRIPT_DIR/lib" "$SCRIPT_DIR/kreuzberg.bin" "$@"\n' \
        "$LOADER_FILE" > /build/kreuzberg; \
    chmod +x /build/kreuzberg
# Verify the build outputs.
# By now /build/kreuzberg is the shell launcher and /build/kreuzberg.bin is the
# ELF binary, so `file` reports on both and `readelf` inspects the .bin —
# pointing readelf at the launcher would always fall through to
# "No dependencies". The readelf/grep fallback is grouped so it cannot mask a
# failure of the preceding commands.
RUN file /build/kreuzberg /build/kreuzberg.bin && \
    echo "=== Dynamic dependencies ===" && \
    { readelf -d /build/kreuzberg.bin 2>/dev/null | grep -E "NEEDED|RPATH|RUNPATH" || echo "No dependencies"; } && \
    echo "=== Bundled libraries ===" && \
    ls -la /build/lib/
# =============================================================================
# Output stage — exported with `--output type=local`; holds nothing but the
# launcher, the real binary, and the bundled runtime libraries.
# =============================================================================
FROM scratch
# Bundled loader and shared libraries (what the launcher's lib/ points at).
COPY --from=builder /build/lib/ /lib/
# The ELF binary the launcher executes.
COPY --from=builder /build/kreuzberg.bin /kreuzberg.bin
# The launcher script users invoke.
COPY --from=builder /build/kreuzberg /kreuzberg

View File

@@ -0,0 +1,65 @@
# =============================================================================
# Alpine-based builder for musl-linked FFI shared library.
#
# Usage:
# docker build -f docker/Dockerfile.musl-ffi \
# --output type=local,dest=./dist .
#
# Produces libkreuzberg_ffi.so at dist/libkreuzberg_ffi.so
# =============================================================================
FROM alpine:3.21 AS builder
ARG RUST_TOOLCHAIN=nightly-2026-03-10
WORKDIR /build
# Install build dependencies — Alpine's g++ and libstdc++ are musl-native,
# so tesseract C++ compilation works without glibc conflicts.
RUN apk add --no-cache \
        curl gcc g++ musl-dev cmake make pkgconf \
        openssl-dev openssl-libs-static \
        perl linux-headers git file && \
    apk add --no-cache onnxruntime-dev \
        --repository=https://dl-cdn.alpinelinux.org/alpine/edge/community \
        --repository=https://dl-cdn.alpinelinux.org/alpine/edge/main
# Install Rust via rustup (Alpine's packaged Rust may be too old / not nightly).
# pipefail makes this step fail if the download itself fails; without it the
# pipeline's status is that of `sh`, which exits 0 on empty input.
RUN set -o pipefail && \
    curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs \
    | sh -s -- -y --default-toolchain "${RUST_TOOLCHAIN}" --component rust-src && \
    echo "Rust host: $(~/.cargo/bin/rustc -vV | grep host)" && \
    echo "Default target: $(~/.cargo/bin/rustc --print cfg | grep target)"
ENV PATH="/root/.cargo/bin:${PATH}"
# Point ort-sys to Alpine's system ORT library instead of downloading prebuilt binaries.
ENV ORT_LIB_LOCATION=/usr/lib
ENV ORT_PREFER_DYNAMIC_LINK=1
ENV ORT_SKIP_DOWNLOAD=1
ENV ORT_STRATEGY=system
# Allow cdylib output on musl targets (default is +crt-static which blocks shared libs)
ENV RUSTFLAGS="-C target-feature=-crt-static"
# Copy workspace manifests and only the crates the FFI build needs.
COPY Cargo.toml Cargo.lock ./
COPY crates/kreuzberg/ crates/kreuzberg/
COPY crates/kreuzberg-ffi/ crates/kreuzberg-ffi/
COPY crates/kreuzberg-tesseract/ crates/kreuzberg-tesseract/
COPY crates/kreuzberg-paddle-ocr/ crates/kreuzberg-paddle-ocr/
# Remove workspace members that were not copied (here the CLI is dropped and
# the FFI crate kept), and drop everything from the kreuzberg-wasm
# release-profile section to the end of the manifest.
RUN sed -i '/kreuzberg-py/d; /kreuzberg_rb/d; /kreuzberg-node/d; /kreuzberg-cli/d; /kreuzberg-php/d; /kreuzberg_rustler/d; /kreuzberg_nif/d; /packages\/dart\/rust/d; /packages\/swift\/rust/d; /"crates\/kreuzberg-wasm"/d; /^\[profile\.release\.package\.kreuzberg-wasm\]$/,$d; /benchmark-harness/d; /e2e-generator/d; /snippet-runner/d; /e2e\/rust/d' Cargo.toml
# Build the FFI shared library, copy it out of target/, and strip it.
RUN cargo build --release --package kreuzberg-ffi && \
    cp target/release/libkreuzberg_ffi.so /build/libkreuzberg_ffi.so && \
    strip /build/libkreuzberg_ffi.so
# Verify the library and log the shared libraries it needs.
RUN file /build/libkreuzberg_ffi.so && \
    echo "=== Dynamic dependencies ===" && \
    readelf -d /build/libkreuzberg_ffi.so 2>/dev/null | grep NEEDED || echo "No dynamic dependencies (fully static)"
# =============================================================================
# Output stage — just the shared library
# =============================================================================
FROM scratch
COPY --from=builder /build/libkreuzberg_ffi.so /libkreuzberg_ffi.so

View File

@@ -0,0 +1,67 @@
# =============================================================================
# Alpine-based builder for musl-linked Elixir Rustler NIF.
#
# Usage:
# docker build -f docker/Dockerfile.musl-rustler \
# --output type=local,dest=./dist .
#
# Produces libkreuzberg_nif.so at dist/libkreuzberg_nif.so
# =============================================================================
FROM alpine:3.21 AS builder
ARG RUST_TOOLCHAIN=nightly-2026-03-10
WORKDIR /build
# Install build dependencies — Alpine's g++ and libstdc++ are musl-native,
# so tesseract C++ compilation works without glibc conflicts.
RUN apk add --no-cache \
        curl gcc g++ musl-dev cmake make pkgconf \
        openssl-dev openssl-libs-static \
        perl linux-headers git file && \
    apk add --no-cache onnxruntime-dev \
        --repository=https://dl-cdn.alpinelinux.org/alpine/edge/community \
        --repository=https://dl-cdn.alpinelinux.org/alpine/edge/main
# Install Rust via rustup (Alpine's packaged Rust may be too old / not nightly).
# pipefail makes this step fail if the download itself fails; without it the
# pipeline's status is that of `sh`, which exits 0 on empty input.
RUN set -o pipefail && \
    curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs \
    | sh -s -- -y --default-toolchain "${RUST_TOOLCHAIN}" --component rust-src && \
    echo "Rust host: $(~/.cargo/bin/rustc -vV | grep host)" && \
    echo "Default target: $(~/.cargo/bin/rustc --print cfg | grep target)"
ENV PATH="/root/.cargo/bin:${PATH}"
# Point ort-sys to Alpine's system ORT library instead of downloading prebuilt binaries.
ENV ORT_LIB_LOCATION=/usr/lib
ENV ORT_PREFER_DYNAMIC_LINK=1
ENV ORT_SKIP_DOWNLOAD=1
ENV ORT_STRATEGY=system
# Allow cdylib output on musl targets (default is +crt-static which blocks shared libs)
ENV RUSTFLAGS="-C target-feature=-crt-static"
# Copy workspace manifests, the core crates, and the NIF crate itself.
COPY Cargo.toml Cargo.lock ./
COPY crates/kreuzberg/ crates/kreuzberg/
COPY crates/kreuzberg-tesseract/ crates/kreuzberg-tesseract/
COPY crates/kreuzberg-paddle-ocr/ crates/kreuzberg-paddle-ocr/
COPY packages/elixir/native/kreuzberg_nif/ packages/elixir/native/kreuzberg_nif/
# Remove workspace members that were not copied, and drop everything from the
# kreuzberg-wasm release-profile section to the end of the manifest. Unlike
# the other Dockerfiles, lines mentioning kreuzberg_rustler / kreuzberg_nif
# are left in place.
RUN sed -i '/kreuzberg-py/d; /kreuzberg_rb/d; /kreuzberg-node/d; /kreuzberg-cli/d; /kreuzberg-ffi/d; /kreuzberg-php/d; /packages\/dart\/rust/d; /packages\/swift\/rust/d; /"crates\/kreuzberg-wasm"/d; /^\[profile\.release\.package\.kreuzberg-wasm\]$/,$d; /benchmark-harness/d; /e2e-generator/d; /snippet-runner/d; /e2e\/rust/d' Cargo.toml
# Build the Rustler NIF shared library (the crate is excluded from the workspace,
# so we build it directly from its package directory). The `cp` source is
# relative to that package directory, i.e. the crate's own target/ folder.
RUN cd packages/elixir/native/kreuzberg_nif && \
    cargo build --release && \
    cp target/release/libkreuzberg_nif.so /build/libkreuzberg_nif.so && \
    strip /build/libkreuzberg_nif.so
# Verify the library and log the shared libraries it needs.
RUN file /build/libkreuzberg_nif.so && \
    echo "=== Dynamic dependencies ===" && \
    readelf -d /build/libkreuzberg_nif.so 2>/dev/null | grep NEEDED || echo "No dynamic dependencies (fully static)"
# =============================================================================
# Output stage — just the shared library
# =============================================================================
FROM scratch
COPY --from=builder /build/libkreuzberg_nif.so /libkreuzberg_nif.so

137
docker/README.md Normal file
View File

@@ -0,0 +1,137 @@
# Kreuzberg Docker Images
This directory contains Dockerfile variants for building Kreuzberg Docker images with different feature sets.
## Base Image
Both variants use **Debian 13 (Trixie) slim** - the latest stable Debian release for optimal package availability and security updates.
## Image Variants
### 1. Core Image (`Dockerfile.core`)
**Size:** ~1.0-1.3GB
**Base:** debian:trixie-slim
**Features:** PDF, DOCX, PPTX, images, HTML, XML, text, Excel, email, academic formats (LaTeX, EPUB, etc.)
**OCR:** Tesseract (12 languages)
**Legacy Office:** Native OLE/CFB parsing support
**When to use:**
- Production deployments where image size matters
- Cloud environments with size/bandwidth constraints
- Kubernetes deployments with frequent pod scaling
- All use cases (both images have equivalent legacy Office support)
**Build command:**
```bash
docker build -f docker/Dockerfile.core -t kreuzberg:core .
```
### 2. Full Image (`Dockerfile.full`)
**Size:** ~1.0-1.3GB
**Base:** debian:trixie-slim
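**Features:** All core features with native legacy Office format support, plus PaddleOCR and layout-detection models pre-downloaded at image build time (`kreuzberg cache warm`)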
**OCR:** Tesseract (12 languages)
**Legacy Office:** Native OLE/CFB parsing for .doc, .ppt, .xls
**When to use:**
- Complete document intelligence pipeline with all optional dependencies
- Development and testing environments
- When you want maximum feature completeness
**Build command:**
```bash
docker build -f docker/Dockerfile.full -t kreuzberg:full .
```
## Size Comparison
| Component | Core | Full | Difference |
| -------------------- | -------------- | -------------- | ----------------- |
| Base (trixie-slim) | ~120MB | ~120MB | - |
| Tesseract + 12 langs | ~250MB | ~250MB | - |
| Rust binary | ~80MB | ~80MB | - |
| System libraries | ~100MB | ~100MB | - |
| **Total (approx)** | **~1.0-1.3GB** | **~1.0-1.3GB** | **- (same size)** |
## Default Image
The root `Dockerfile` is a symlink to `Dockerfile.full` for backward compatibility and complete feature support by default.
## Multi-Architecture Support
Both images support:
- `linux/amd64` (x86_64)
- `linux/arm64` (aarch64)
Both architectures use the same pure-Rust PDF library — no architecture-specific binaries needed.
## Usage Modes
All images support three execution modes via ENTRYPOINT:
### 1. API Server (default)
```bash
docker run -p 8000:8000 kreuzberg:core
# or override host/port:
docker run -p 8000:8000 kreuzberg:core serve --host 0.0.0.0 --port 8000
```
### 2. CLI Mode
```bash
docker run -v $(pwd):/data kreuzberg:core extract /data/document.pdf
docker run -v $(pwd):/data kreuzberg:core detect /data/file.bin
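# A bare /data/*.pdf would be expanded by the host shell, not inside the
# container, so run the command through a shell in the container:
docker run -v $(pwd):/data --entrypoint sh kreuzberg:core -c 'kreuzberg batch /data/*.pdf'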
```
### 3. MCP Server Mode
```bash
docker run kreuzberg:core mcp
```
## Testing
Test scripts are provided to verify both image variants:
```bash
# Test core image
IMAGE_NAME=kreuzberg:core ./scripts/test_docker.sh
# Test full image
IMAGE_NAME=kreuzberg:full ./scripts/test_docker.sh
```
## GitHub Actions
The `.github/workflows/publish-docker.yaml` workflow builds and publishes both variants to GitHub Container Registry:
- `ghcr.io/kreuzberg-dev/kreuzberg:VERSION-core` - Core image (minimal runtime)
- `ghcr.io/kreuzberg-dev/kreuzberg:core` - Latest core image
- `ghcr.io/kreuzberg-dev/kreuzberg:VERSION` - Full image (all optional dependencies)
- `ghcr.io/kreuzberg-dev/kreuzberg:latest` - Latest full image
For local development, use the local tags shown in the build commands above.
## Recommendations
**Choose Core if:**
- ✅ Minimal runtime setup
- ✅ Standard document processing needs
- ✅ Cloud deployments with cost constraints
- ✅ Kubernetes or container orchestration
**Choose Full if:**
- ✅ Want maximum optional dependencies preinstalled
- ✅ Development and testing environments
- ✅ "Batteries included" experience preferred

206
docker/test-tessdata.sh Executable file
View File

@@ -0,0 +1,206 @@
#!/usr/bin/env bash
#
# Test script to verify tessdata configuration in Docker images
# This script tests both Dockerfile.core and Dockerfile.full
#
# Strict mode: abort on command failure, on use of an unset variable, and on a
# failure anywhere inside a pipeline.
set -euo pipefail

# Absolute path of the directory holding this script, and of its parent
# directory (used as the Docker build context / project root).
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"

# ANSI color sequences, stored as literal backslash escapes; they are only
# turned into real escape characters when printed with escape processing.
NC='\033[0m' # No Color
YELLOW='\033[1;33m'
GREEN='\033[0;32m'
RED='\033[0;31m'

# Banner, followed by one blank line.
printf '%b\n' "${YELLOW}Testing Kreuzberg Docker tessdata configuration...${NC}\n"
# Test 1: Check if tessdata path discovery logic works
# Statically inspect a Dockerfile for the expected tessdata setup.
#
# Arguments:
#   $1 - label printed in the test heading
#   $2 - path of the Dockerfile to inspect
# Returns:
#   0 when all four checks pass, 1 at the first check that fails.
test_tessdata_discovery() {
  local test_name="$1"
  local dockerfile="$2"

  echo -e "${YELLOW}Test: $test_name${NC}"

  # 1. The setup step is recognized by its marker text.
  if ! grep -q "Setting up tessdata permissions" "$dockerfile"; then
    echo -e "${RED}✗ Tessdata setup code NOT found in $dockerfile${NC}"
    return 1
  fi
  echo -e "${GREEN}✓ Tessdata setup code found in $dockerfile${NC}"

  # 2. A hardcoded TESSDATA_PREFIX must NOT be present.
  if grep -q "TESSDATA_PREFIX=/usr/share/tesseract-ocr/5/tessdata" "$dockerfile"; then
    echo -e "${RED}✗ TESSDATA_PREFIX is still hardcoded in $dockerfile (should be removed)${NC}"
    return 1
  fi
  echo -e "${GREEN}✓ TESSDATA_PREFIX is not hardcoded (correct)${NC}"

  # 3. Permissions must be granted with a recursive chmod.
  if ! grep -q "chmod -R a+rx" "$dockerfile"; then
    echo -e "${RED}✗ Chmod command NOT found in $dockerfile${NC}"
    return 1
  fi
  echo -e "${GREEN}✓ Chmod command found to set permissions${NC}"

  # 4. The version-wildcard tessdata path (literal `*`) must be present.
  if ! grep -q "/usr/share/tesseract-ocr/\*/tessdata" "$dockerfile"; then
    echo -e "${RED}✗ Multiple tessdata paths NOT found${NC}"
    return 1
  fi
  echo -e "${GREEN}✓ Multiple tessdata paths checked in Dockerfile${NC}"

  echo ""
  return 0
}
# Test 2: Verify Dockerfile syntax
# Validate a Dockerfile, using Docker when it is available.
#
# Arguments (note: Dockerfile path comes FIRST here, unlike
# test_tessdata_discovery):
#   $1 - path of the Dockerfile to validate
#   $2 - human-readable name printed in the test heading
# Globals:
#   PROJECT_ROOT - passed to Docker as the build context
# Returns:
#   0 in every case when Docker is installed (a failed check only warns);
#   1 only when Docker is absent and the basic structure check fails.
test_dockerfile_syntax() {
  local dockerfile="$1"
  local test_name="$2"

  echo -e "${YELLOW}Test: Verify $test_name syntax${NC}"

  if command -v docker &>/dev/null; then
    # `docker build` has no `--dry-run` flag, so that invocation always failed
    # and nothing was ever validated. `--check` evaluates the Dockerfile
    # without executing the build. It needs a BuildKit/Buildx version that
    # supports build checks; on older installs the flag is rejected and we
    # fall through to the non-fatal warning below.
    if docker build --check -f "$dockerfile" "$PROJECT_ROOT" &>/dev/null; then
      echo -e "${GREEN}✓ Dockerfile syntax is valid${NC}"
    else
      # Deliberately non-fatal: the failure may come from the environment
      # rather than from the Dockerfile itself.
      echo -e "${YELLOW}! Dockerfile syntax check failed (may be due to missing Docker or build prerequisites)${NC}"
    fi
  else
    # Basic syntax check without Docker: require at least one FROM and one ENV
    # instruction at the start of a line.
    if grep -q "^FROM " "$dockerfile" && grep -q "^ENV " "$dockerfile"; then
      echo -e "${GREEN}✓ Basic Dockerfile structure looks valid${NC}"
    else
      echo -e "${RED}✗ Dockerfile structure is invalid${NC}"
      return 1
    fi
  fi

  echo ""
  return 0
}
# Test 3: Check that non-root user permissions are set
# Check that a Dockerfile switches to the non-root `kreuzberg` user and hands
# directory ownership to it.
#
# Arguments:
#   $1 - path of the Dockerfile to inspect
#   $2 - human-readable name printed in the test heading
# Returns:
#   0 when both the USER and chown lines are found, 1 otherwise.
test_user_permissions() {
  local dockerfile="$1"
  local test_name="$2"

  echo -e "${YELLOW}Test: User permissions in $test_name${NC}"

  # Fail fast if the image never drops to the kreuzberg user.
  if ! grep -q "USER kreuzberg" "$dockerfile"; then
    echo -e "${RED}✗ Non-root user NOT found${NC}"
    return 1
  fi
  echo -e "${GREEN}✓ Non-root 'kreuzberg' user is set${NC}"

  # Fail fast if ownership is never transferred to that user.
  if ! grep -q "chown -R kreuzberg:kreuzberg" "$dockerfile"; then
    echo -e "${RED}✗ Directory ownership NOT set for kreuzberg user${NC}"
    return 1
  fi
  echo -e "${GREEN}✓ Directory ownership set to kreuzberg user${NC}"

  echo ""
  return 0
}
# Test 4: Verify no version-specific paths remain
# Check a Dockerfile for version-pinned tesseract tessdata paths.
#
# Arguments:
#   $1 - path of the Dockerfile to inspect
#   $2 - human-readable name printed in the test heading
# Returns:
#   1 when a `tesseract-ocr/5/tessdata` path appears on a line that does not
#   also contain the wildcard form; 0 otherwise. A `tesseract-ocr/4/tessdata`
#   hit only produces a warning and never fails the test.
test_no_hardcoded_versions() {
  local dockerfile="$1"
  local test_name="$2"
  # Lines containing the wildcard form (literal `*`) are excluded from the hits.
  local wildcard_form="tesseract-ocr/\*/tessdata"

  echo -e "${YELLOW}Test: No hardcoded version paths in $test_name${NC}"

  # Output is discarded with a redirect rather than `grep -q`: an early exit of
  # the last grep could kill the first one with SIGPIPE, which `pipefail`
  # would report as a failed pipeline.
  if grep "tesseract-ocr/5/tessdata" "$dockerfile" | grep -v "$wildcard_form" >/dev/null; then
    echo -e "${RED}✗ Hardcoded tesseract-ocr/5 version found${NC}"
    return 1
  fi
  echo -e "${GREEN}✓ No hardcoded tesseract-ocr/5 version${NC}"

  # Version 4 paths are tolerated: report them, but keep going.
  if grep "tesseract-ocr/4/tessdata" "$dockerfile" | grep -v "$wildcard_form" >/dev/null; then
    echo -e "${YELLOW}! Hardcoded tesseract-ocr/4 version found (but it's in the loop, so OK)${NC}"
  else
    echo -e "${GREEN}✓ Version paths are in dynamic loop${NC}"
  fi

  echo ""
  return 0
}
# Run all tests
# Run every check against one Dockerfile and print a pass/fail tally.
#
# Arguments:
#   $1 - path of the Dockerfile to test
#   $2 - human-readable name printed in the banner
# Returns:
#   The number of failed checks (0-4) as the exit status.
run_tests() {
  local dockerfile="$1"
  local test_name="$2"
  local passed=0
  local failed=0

  echo -e "${YELLOW}========================================${NC}"
  echo -e "${YELLOW}Testing: $test_name${NC}"
  echo -e "${YELLOW}File: $dockerfile${NC}"
  echo -e "${YELLOW}========================================\n${NC}"

  # Counters use plain arithmetic assignment instead of `((var++))`: the
  # post-increment form evaluates to the OLD value, so it returns status 1
  # when the counter is 0 and would abort the script under `set -e` if this
  # function were ever called outside a conditional.

  # Argument order differs per check: discovery takes (label, dockerfile),
  # the others take (dockerfile, name).
  if test_tessdata_discovery "Tessdata discovery logic" "$dockerfile"; then
    passed=$((passed + 1))
  else
    failed=$((failed + 1))
  fi

  if test_dockerfile_syntax "$dockerfile" "$test_name"; then
    passed=$((passed + 1))
  else
    failed=$((failed + 1))
  fi

  if test_user_permissions "$dockerfile" "$test_name"; then
    passed=$((passed + 1))
  else
    failed=$((failed + 1))
  fi

  if test_no_hardcoded_versions "$dockerfile" "$test_name"; then
    passed=$((passed + 1))
  else
    failed=$((failed + 1))
  fi

  echo -e "${YELLOW}----------------------------------------${NC}"
  echo -e "Results: ${GREEN}$passed passed${NC}, ${RED}$failed failed${NC}"
  echo -e "${YELLOW}========================================\n${NC}"

  return "$failed"
}
# Main execution
total_failed=0

# run_tests reports its failure count through its exit status. Capture it with
# `||`: on the right-hand side `$?` still holds that status. Inside
# `if ! run_tests ...; then`, `$?` is the status of the negation (always 0),
# so failures were never counted and the script always reported success.

# Test Dockerfile.core
run_tests "$SCRIPT_DIR/Dockerfile.core" "Dockerfile.core" || total_failed=$((total_failed + $?))

# Test Dockerfile.full
run_tests "$SCRIPT_DIR/Dockerfile.full" "Dockerfile.full" || total_failed=$((total_failed + $?))

# Summary: exit 0 only when no check failed in either Dockerfile.
echo -e "${YELLOW}========================================${NC}"
if [ "$total_failed" -eq 0 ]; then
  echo -e "${GREEN}✓ All tests passed!${NC}"
  echo -e "${GREEN}Tessdata configuration is properly set up.${NC}"
  exit 0
else
  echo -e "${RED}✗ Some tests failed (total failures: $total_failed)${NC}"
  echo -e "${RED}Please review the Dockerfile changes.${NC}"
  exit 1
fi