This commit is contained in:
75
docker/Dockerfile.cli
Normal file
75
docker/Dockerfile.cli
Normal file
@@ -0,0 +1,75 @@
|
||||
# =============================================================================
# Minimal CLI-only Docker image for Kreuzberg.
#
# Builds a dynamically linked musl binary (crt-static is disabled so shared
# libraries can be loaded at runtime), then copies it into a minimal Alpine
# image that provides shell access, volume mounts, and the shared libraries
# the binary needs (libstdc++, libgcc, onnxruntime).
#
# Usage:
#   docker build -f docker/Dockerfile.cli -t kreuzberg-cli .
#   docker run -v $(pwd):/data kreuzberg-cli extract /data/document.pdf
# =============================================================================
|
||||
|
||||
# Stage 1: Build the static binary using the musl builder
|
||||
FROM alpine:3.21 AS builder
|
||||
|
||||
ARG RUST_TOOLCHAIN=nightly-2026-03-10
|
||||
|
||||
WORKDIR /build
|
||||
|
||||
# Install build dependencies + ONNX Runtime from Alpine edge (musl-native).
|
||||
# ort-sys checks ORT_LIB_LOCATION before attempting download, so this overrides
|
||||
# the download-binaries feature transparently — no Cargo feature changes needed.
|
||||
# Edge repos needed because onnxruntime depends on abseil-cpp/protobuf from edge/main.
|
||||
RUN apk add --no-cache \
|
||||
curl gcc g++ musl-dev cmake make pkgconf \
|
||||
openssl-dev openssl-libs-static \
|
||||
perl linux-headers git file && \
|
||||
apk add --no-cache onnxruntime-dev \
|
||||
--repository=https://dl-cdn.alpinelinux.org/alpine/edge/community \
|
||||
--repository=https://dl-cdn.alpinelinux.org/alpine/edge/main
|
||||
|
||||
# Install Rust via rustup.
# `set -o pipefail` makes this step fail when the curl download fails; without
# it the pipeline's status is that of `sh` alone, which exits 0 on empty input
# and would leave a "successful" layer with no Rust toolchain in it.
RUN set -o pipefail && \
    curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs \
    | sh -s -- -y --default-toolchain "${RUST_TOOLCHAIN}" --component rust-src
ENV PATH="/root/.cargo/bin:${PATH}"
|
||||
|
||||
# Allow dynamic loading (dlopen) on musl targets by disabling crt-static.
|
||||
ENV RUSTFLAGS="-C target-feature=-crt-static"
|
||||
|
||||
# Point ort-sys to Alpine's system ORT library instead of downloading prebuilt binaries.
|
||||
ENV ORT_LIB_LOCATION=/usr/lib
|
||||
ENV ORT_PREFER_DYNAMIC_LINK=1
|
||||
|
||||
# Copy workspace manifests and crates
|
||||
COPY Cargo.toml Cargo.lock ./
|
||||
COPY crates/kreuzberg/ crates/kreuzberg/
|
||||
COPY crates/kreuzberg-cli/ crates/kreuzberg-cli/
|
||||
COPY crates/kreuzberg-tesseract/ crates/kreuzberg-tesseract/
|
||||
COPY crates/kreuzberg-paddle-ocr/ crates/kreuzberg-paddle-ocr/
|
||||
|
||||
# Remove workspace members that aren't included
|
||||
RUN sed -i '/kreuzberg-py/d; /kreuzberg_rb/d; /kreuzberg-node/d; /kreuzberg-ffi/d; /kreuzberg-php/d; /kreuzberg_rustler/d; /kreuzberg_nif/d; /packages\/dart\/rust/d; /packages\/swift\/rust/d; /\"crates\/kreuzberg-wasm\"/d; /^\[profile\.release\.package\.kreuzberg-wasm\]$/,$d; /benchmark-harness/d; /e2e-generator/d; /snippet-runner/d; /e2e\/rust/d' Cargo.toml
|
||||
|
||||
RUN cargo build --release --package kreuzberg-cli --features all && \
|
||||
cp target/release/kreuzberg /build/kreuzberg && \
|
||||
strip /build/kreuzberg
|
||||
|
||||
# Verify the binary was built successfully
|
||||
RUN file /build/kreuzberg && \
|
||||
echo "=== Dynamic dependencies ===" && \
|
||||
readelf -d /build/kreuzberg 2>/dev/null | grep NEEDED || echo "No external dependencies"
|
||||
|
||||
# =============================================================================
|
||||
# Stage 2: Minimal runtime image
|
||||
# =============================================================================
|
||||
FROM alpine:3.21
|
||||
|
||||
# Install runtime dependencies needed by dynamically linked binary
|
||||
RUN apk add --no-cache libstdc++ libgcc && \
|
||||
apk add --no-cache onnxruntime \
|
||||
--repository=https://dl-cdn.alpinelinux.org/alpine/edge/community \
|
||||
--repository=https://dl-cdn.alpinelinux.org/alpine/edge/main
|
||||
|
||||
COPY --from=builder /build/kreuzberg /usr/local/bin/kreuzberg
|
||||
|
||||
ENTRYPOINT ["kreuzberg"]
|
||||
162
docker/Dockerfile.core
Normal file
162
docker/Dockerfile.core
Normal file
@@ -0,0 +1,162 @@
|
||||
# =============================================================================
|
||||
# Builder Stage - Build Rust binary with all dependencies
|
||||
# =============================================================================
|
||||
FROM rust:1.91-trixie AS builder
|
||||
|
||||
WORKDIR /build
|
||||
|
||||
# Install build dependencies
|
||||
RUN apt-get update && \
|
||||
apt-get install -y --no-install-recommends \
|
||||
cmake \
|
||||
g++ \
|
||||
pkg-config \
|
||||
libssl-dev \
|
||||
libleptonica-dev \
|
||||
libtesseract-dev \
|
||||
clang \
|
||||
curl \
|
||||
file \
|
||||
&& \
|
||||
apt-get clean && \
|
||||
rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# Set onnxruntime version (can be overridden via build-arg)
|
||||
ARG ONNXRUNTIME_VERSION=1.24.2
|
||||
ARG TARGETARCH
|
||||
ENV ONNXRUNTIME_VERSION=${ONNXRUNTIME_VERSION}
|
||||
|
||||
# Download and extract ONNX Runtime.
# TARGETARCH is supplied by BuildKit (amd64, arm64, ...). Map it to the arch
# name used in the upstream release tarballs and fail fast on anything else,
# instead of silently downloading the x64 build for an unsupported platform.
# An unset TARGETARCH keeps the previous behavior and selects x64.
RUN mkdir -p /build/onnxruntime && \
    case "${TARGETARCH:-amd64}" in \
        arm64|aarch64) ORT_ARCH="aarch64" ;; \
        amd64|x86_64|x64) ORT_ARCH="x64" ;; \
        *) echo "Unsupported TARGETARCH for ONNX Runtime: ${TARGETARCH}" >&2; exit 1 ;; \
    esac && \
    curl -fL "https://github.com/microsoft/onnxruntime/releases/download/v${ONNXRUNTIME_VERSION}/onnxruntime-linux-${ORT_ARCH}-${ONNXRUNTIME_VERSION}.tgz" \
        -o /build/onnxruntime.tgz && \
    tar -xzf /build/onnxruntime.tgz -C /build/onnxruntime --strip-components=1 && \
    rm /build/onnxruntime.tgz
|
||||
|
||||
# Copy workspace manifests and crates
|
||||
COPY Cargo.toml Cargo.lock ./
|
||||
COPY crates/kreuzberg/ crates/kreuzberg/
|
||||
COPY crates/kreuzberg-cli/ crates/kreuzberg-cli/
|
||||
COPY crates/kreuzberg-tesseract/ crates/kreuzberg-tesseract/
|
||||
COPY crates/kreuzberg-paddle-ocr/ crates/kreuzberg-paddle-ocr/
|
||||
|
||||
# Remove workspace members that aren't included (Ruby, Node, Python, PHP, tools, e2e)
|
||||
RUN sed -i '/kreuzberg-py/d; /kreuzberg_rb/d; /kreuzberg-node/d; /kreuzberg-ffi/d; /kreuzberg-php/d; /kreuzberg_rustler/d; /kreuzberg_nif/d; /packages\/dart\/rust/d; /packages\/swift\/rust/d; /"crates\/kreuzberg-wasm"/d; /^\[profile\.release\.package\.kreuzberg-wasm\]$/,$d; /benchmark-harness/d; /e2e-generator/d; /snippet-runner/d; /e2e\/rust/d' Cargo.toml
|
||||
|
||||
# Build release binary with server features (api + full format support)
|
||||
RUN --mount=type=cache,target=/usr/local/cargo/registry \
|
||||
--mount=type=cache,target=/usr/local/cargo/git \
|
||||
--mount=type=cache,target=/build/target \
|
||||
cargo build --release --package kreuzberg-cli --features all && \
|
||||
cp target/release/kreuzberg /build/kreuzberg && \
|
||||
strip /build/kreuzberg
|
||||
|
||||
# =============================================================================
|
||||
# Runtime Stage - Minimal runtime environment
|
||||
# =============================================================================
|
||||
FROM debian:trixie-slim
|
||||
|
||||
# OCI labels for container metadata
|
||||
LABEL org.opencontainers.image.source="https://github.com/kreuzberg-dev/kreuzberg"
|
||||
LABEL org.opencontainers.image.description="Kreuzberg document intelligence - core variant"
|
||||
LABEL org.opencontainers.image.licenses="MIT"
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
# Download and install dependencies (Core version)
|
||||
ARG TARGETARCH
|
||||
|
||||
# Install runtime dependencies and download binaries
|
||||
RUN apt-get update && \
|
||||
apt-get install -y --no-install-recommends \
|
||||
ca-certificates \
|
||||
curl \
|
||||
tesseract-ocr \
|
||||
tesseract-ocr-eng \
|
||||
tesseract-ocr-osd \
|
||||
tesseract-ocr-spa \
|
||||
tesseract-ocr-fra \
|
||||
tesseract-ocr-deu \
|
||||
tesseract-ocr-ita \
|
||||
tesseract-ocr-por \
|
||||
tesseract-ocr-chi-sim \
|
||||
tesseract-ocr-chi-tra \
|
||||
tesseract-ocr-jpn \
|
||||
tesseract-ocr-ara \
|
||||
tesseract-ocr-rus \
|
||||
tesseract-ocr-hin \
|
||||
&& \
|
||||
# Clean up
|
||||
apt-get clean && \
|
||||
rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
|
||||
|
||||
# Copy ONNX Runtime libraries from builder
|
||||
COPY --from=builder /build/onnxruntime/lib/libonnxruntime.so* /usr/local/lib/
|
||||
RUN ldconfig
|
||||
|
||||
# Copy binary from builder, setting the executable bit in the same layer.
# A separate `RUN chmod` step would add another layer just to touch the mode
# of a file that COPY can set directly (BuildKit is already required by the
# cache mounts in the builder stage).
COPY --from=builder --chmod=755 /build/kreuzberg /usr/local/bin/kreuzberg
|
||||
|
||||
# Create non-root user
|
||||
RUN groupadd -r kreuzberg && \
|
||||
useradd -r -g kreuzberg -d /app -s /sbin/nologin kreuzberg && \
|
||||
mkdir -p /app/.kreuzberg && \
|
||||
chown -R kreuzberg:kreuzberg /app
|
||||
|
||||
# Create config directories for volume mounts
|
||||
RUN mkdir -p /etc/kreuzberg /app/.config/kreuzberg && \
|
||||
chown -R kreuzberg:kreuzberg /etc/kreuzberg /app/.config/kreuzberg
|
||||
|
||||
# Create Hugging Face cache directory for embeddings models
|
||||
RUN mkdir -p /app/.kreuzberg/huggingface && \
|
||||
chown -R kreuzberg:kreuzberg /app/.kreuzberg/huggingface
|
||||
|
||||
# Create PaddleOCR model cache directory (models downloaded on demand if paddle-ocr feature used)
|
||||
RUN mkdir -p /app/.kreuzberg/paddle-ocr && \
|
||||
chown -R kreuzberg:kreuzberg /app/.kreuzberg/paddle-ocr
|
||||
|
||||
# Ensure read permissions on tessdata files for non-root user
|
||||
# Tessdata is installed in version-specific directory (e.g., tesseract-ocr/5 or tesseract-ocr/4)
|
||||
# Make all tessdata directories readable by the non-root kreuzberg user
|
||||
RUN set -eux; \
|
||||
echo "Setting up tessdata permissions..."; \
|
||||
for dir in /usr/share/tesseract-ocr/*/tessdata /usr/share/tesseract-ocr/tessdata /usr/share/tessdata; do \
|
||||
if [ -d "$dir" ]; then \
|
||||
chmod -R a+rx "$dir" 2>/dev/null || true; \
|
||||
if [ -f "$dir/eng.traineddata" ]; then \
|
||||
echo "✓ Found tessdata with eng.traineddata at: $dir"; \
|
||||
fi; \
|
||||
fi; \
|
||||
done; \
|
||||
echo "✓ Tessdata permissions configured"
|
||||
|
||||
# Environment configuration.
# RUST_LOG=info is the right default here. Third-party transport crates
# (ureq, rustls, hyper_util, tower_http, hf_hub) are pre-suppressed by the
# kreuzberg-cli subscriber defaults in logging.rs, so they won't emit DEBUG
# even at the "info" root level. This core image does not run `cache warm`
# at build time: models are downloaded on demand and stored under
# KREUZBERG_CACHE_DIR / HF_HOME, the cache directories created above.
|
||||
ENV KREUZBERG_CACHE_DIR=/app/.kreuzberg \
|
||||
HF_HOME=/app/.kreuzberg/huggingface \
|
||||
RUST_LOG=info \
|
||||
LD_LIBRARY_PATH=/usr/local/lib:/usr/lib:/lib
|
||||
|
||||
USER kreuzberg
|
||||
|
||||
EXPOSE 8000
|
||||
|
||||
# Health check
|
||||
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
|
||||
CMD ["/usr/local/bin/kreuzberg", "--version"]
|
||||
|
||||
# Set kreuzberg as entrypoint for flexible command usage
|
||||
# Default: Start API server (can be overridden for CLI or MCP mode)
|
||||
ENTRYPOINT ["kreuzberg"]
|
||||
CMD ["serve", "--host", "0.0.0.0", "--port", "8000"]
|
||||
162
docker/Dockerfile.full
Normal file
162
docker/Dockerfile.full
Normal file
@@ -0,0 +1,162 @@
|
||||
# =============================================================================
|
||||
# Builder Stage - Build Rust binary with all dependencies
|
||||
# =============================================================================
|
||||
FROM rust:1.91-trixie AS builder
|
||||
|
||||
WORKDIR /build
|
||||
|
||||
# Install build dependencies
|
||||
RUN apt-get update && \
|
||||
apt-get install -y --no-install-recommends \
|
||||
cmake \
|
||||
g++ \
|
||||
pkg-config \
|
||||
libssl-dev \
|
||||
libleptonica-dev \
|
||||
libtesseract-dev \
|
||||
clang \
|
||||
curl \
|
||||
file \
|
||||
&& \
|
||||
apt-get clean && \
|
||||
rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# Set onnxruntime version (can be overridden via build-arg)
|
||||
ARG ONNXRUNTIME_VERSION=1.24.2
|
||||
ARG TARGETARCH
|
||||
ENV ONNXRUNTIME_VERSION=${ONNXRUNTIME_VERSION}
|
||||
|
||||
# Download and extract ONNX Runtime.
# TARGETARCH is supplied by BuildKit (amd64, arm64, ...). Map it to the arch
# name used in the upstream release tarballs and fail fast on anything else,
# instead of silently downloading the x64 build for an unsupported platform.
# An unset TARGETARCH keeps the previous behavior and selects x64.
RUN mkdir -p /build/onnxruntime && \
    case "${TARGETARCH:-amd64}" in \
        arm64|aarch64) ORT_ARCH="aarch64" ;; \
        amd64|x86_64|x64) ORT_ARCH="x64" ;; \
        *) echo "Unsupported TARGETARCH for ONNX Runtime: ${TARGETARCH}" >&2; exit 1 ;; \
    esac && \
    curl -fL "https://github.com/microsoft/onnxruntime/releases/download/v${ONNXRUNTIME_VERSION}/onnxruntime-linux-${ORT_ARCH}-${ONNXRUNTIME_VERSION}.tgz" \
        -o /build/onnxruntime.tgz && \
    tar -xzf /build/onnxruntime.tgz -C /build/onnxruntime --strip-components=1 && \
    rm /build/onnxruntime.tgz
|
||||
|
||||
# Copy workspace manifests and crates
|
||||
COPY Cargo.toml Cargo.lock ./
|
||||
COPY crates/kreuzberg/ crates/kreuzberg/
|
||||
COPY crates/kreuzberg-cli/ crates/kreuzberg-cli/
|
||||
COPY crates/kreuzberg-tesseract/ crates/kreuzberg-tesseract/
|
||||
COPY crates/kreuzberg-paddle-ocr/ crates/kreuzberg-paddle-ocr/
|
||||
|
||||
# Remove workspace members that aren't included (Ruby, Node, Python, PHP, Elixir, tools, e2e)
|
||||
RUN sed -i '/kreuzberg-py/d; /kreuzberg_rb/d; /kreuzberg-node/d; /kreuzberg-ffi/d; /kreuzberg-php/d; /kreuzberg_rustler/d; /kreuzberg_nif/d; /packages\/dart\/rust/d; /packages\/swift\/rust/d; /"crates\/kreuzberg-wasm"/d; /^\[profile\.release\.package\.kreuzberg-wasm\]$/,$d; /benchmark-harness/d; /e2e-generator/d; /snippet-runner/d; /e2e\/rust/d' Cargo.toml
|
||||
|
||||
# Build release binary with server features (api + full format support)
|
||||
RUN --mount=type=cache,target=/usr/local/cargo/registry \
|
||||
--mount=type=cache,target=/usr/local/cargo/git \
|
||||
--mount=type=cache,target=/build/target \
|
||||
cargo build --release --package kreuzberg-cli --features all && \
|
||||
cp target/release/kreuzberg /build/kreuzberg && \
|
||||
strip /build/kreuzberg
|
||||
|
||||
# =============================================================================
|
||||
# Runtime Stage - Minimal runtime environment
|
||||
# =============================================================================
|
||||
FROM debian:trixie-slim
|
||||
|
||||
# OCI labels for container metadata
|
||||
LABEL org.opencontainers.image.source="https://github.com/kreuzberg-dev/kreuzberg"
|
||||
LABEL org.opencontainers.image.description="Kreuzberg document intelligence - full variant"
|
||||
LABEL org.opencontainers.image.licenses="MIT"
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
# Install runtime dependencies
|
||||
RUN apt-get update && \
|
||||
apt-get install -y --no-install-recommends \
|
||||
ca-certificates \
|
||||
curl \
|
||||
tesseract-ocr \
|
||||
tesseract-ocr-eng \
|
||||
tesseract-ocr-osd \
|
||||
tesseract-ocr-spa \
|
||||
tesseract-ocr-fra \
|
||||
tesseract-ocr-deu \
|
||||
tesseract-ocr-ita \
|
||||
tesseract-ocr-por \
|
||||
tesseract-ocr-chi-sim \
|
||||
tesseract-ocr-chi-tra \
|
||||
tesseract-ocr-jpn \
|
||||
tesseract-ocr-ara \
|
||||
tesseract-ocr-rus \
|
||||
tesseract-ocr-hin \
|
||||
fontconfig \
|
||||
libssl3 \
|
||||
&& \
|
||||
apt-get clean && \
|
||||
rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
|
||||
|
||||
# Copy ONNX Runtime libraries from builder
|
||||
COPY --from=builder /build/onnxruntime/lib/libonnxruntime.so* /usr/local/lib/
|
||||
RUN ldconfig
|
||||
|
||||
# Copy binary from builder, setting the executable bit in the same layer.
# A separate `RUN chmod` step would add another layer just to touch the mode
# of a file that COPY can set directly (BuildKit is already required by the
# cache mounts in the builder stage).
COPY --from=builder --chmod=755 /build/kreuzberg /usr/local/bin/kreuzberg
|
||||
|
||||
# Create non-root user
|
||||
RUN groupadd -r kreuzberg && \
|
||||
useradd -r -g kreuzberg -d /app -s /sbin/nologin kreuzberg && \
|
||||
mkdir -p /app/.kreuzberg && \
|
||||
chown -R kreuzberg:kreuzberg /app
|
||||
|
||||
# Create config directories for volume mounts
|
||||
RUN mkdir -p /etc/kreuzberg /app/.config/kreuzberg && \
|
||||
chown -R kreuzberg:kreuzberg /etc/kreuzberg /app/.config/kreuzberg
|
||||
|
||||
# Create Hugging Face cache directory for embeddings models
|
||||
RUN mkdir -p /app/.kreuzberg/huggingface && \
|
||||
chown -R kreuzberg:kreuzberg /app/.kreuzberg/huggingface
|
||||
|
||||
# Pre-download all models (PaddleOCR + layout detection) using kreuzberg cache warm
|
||||
RUN KREUZBERG_CACHE_DIR=/app/.kreuzberg \
|
||||
/usr/local/bin/kreuzberg cache warm --cache-dir /app/.kreuzberg --format json && \
|
||||
chown -R kreuzberg:kreuzberg /app/.kreuzberg && \
|
||||
echo "All models ready (PaddleOCR + layout detection)"
|
||||
|
||||
# Ensure read permissions on tessdata files for non-root user
|
||||
# Tessdata is installed in version-specific directory (e.g., tesseract-ocr/5 or tesseract-ocr/4)
|
||||
# Make all tessdata directories readable by the non-root kreuzberg user
|
||||
RUN set -eux; \
|
||||
echo "Setting up tessdata permissions..."; \
|
||||
for dir in /usr/share/tesseract-ocr/*/tessdata /usr/share/tesseract-ocr/tessdata /usr/share/tessdata; do \
|
||||
if [ -d "$dir" ]; then \
|
||||
chmod -R a+rx "$dir" 2>/dev/null || true; \
|
||||
if [ -f "$dir/eng.traineddata" ]; then \
|
||||
echo "✓ Found tessdata with eng.traineddata at: $dir"; \
|
||||
fi; \
|
||||
fi; \
|
||||
done; \
|
||||
echo "✓ Tessdata permissions configured"
|
||||
|
||||
# Environment configuration.
|
||||
# RUST_LOG=info is the right default here. Third-party transport crates
|
||||
# (ureq, rustls, hyper_util, tower_http, hf_hub) are pre-suppressed by the
|
||||
# kreuzberg-cli subscriber defaults in logging.rs, so they won't emit DEBUG
|
||||
# even at the "info" root level. HuggingFace model downloads happen once at
|
||||
# image build time via `cache warm`; runtime /extract requests hit the HF
|
||||
# disk cache under HF_HOME and generate no network traffic.
|
||||
ENV KREUZBERG_CACHE_DIR=/app/.kreuzberg \
|
||||
HF_HOME=/app/.kreuzberg/huggingface \
|
||||
RUST_LOG=info \
|
||||
LD_LIBRARY_PATH=/usr/local/lib:/usr/lib:/lib
|
||||
|
||||
USER kreuzberg
|
||||
|
||||
EXPOSE 8000
|
||||
|
||||
# Health check
|
||||
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
|
||||
CMD ["/usr/local/bin/kreuzberg", "--version"]
|
||||
|
||||
# Set kreuzberg as entrypoint for flexible command usage
|
||||
# Default: Start API server (can be overridden for CLI or MCP mode)
|
||||
ENTRYPOINT ["kreuzberg"]
|
||||
CMD ["serve", "--host", "0.0.0.0", "--port", "8000"]
|
||||
142
docker/Dockerfile.musl-build
Normal file
142
docker/Dockerfile.musl-build
Normal file
@@ -0,0 +1,142 @@
|
||||
# =============================================================================
# Alpine-based builder for musl CLI binaries.
#
# Usage:
#   docker build -f docker/Dockerfile.musl-build \
#     --output type=local,dest=./dist .
#
# This file declares no TARGETARCH build argument, so the architecture follows
# the build platform; pass `--platform` to build for a different one.
#
# Produces: dist/kreuzberg (launcher script), dist/kreuzberg.bin (binary)
# and dist/lib/ (runtime libraries)
#
# Runtime libraries (musl libc, libstdc++, libgcc) are bundled alongside
# the binary for portability across Linux distros.
# =============================================================================
|
||||
FROM alpine:3.21 AS builder
|
||||
|
||||
ARG RUST_TOOLCHAIN=nightly-2026-03-10
|
||||
|
||||
WORKDIR /build
|
||||
|
||||
# Install build dependencies — Alpine's g++ and libstdc++ are musl-native,
|
||||
# so tesseract C++ compilation works without glibc conflicts.
|
||||
# onnxruntime-dev from edge provides musl-native ORT for linking.
|
||||
RUN apk add --no-cache \
|
||||
curl gcc g++ musl-dev cmake make pkgconf \
|
||||
openssl-dev openssl-libs-static \
|
||||
perl linux-headers git file patchelf && \
|
||||
apk add --no-cache onnxruntime-dev \
|
||||
--repository=https://dl-cdn.alpinelinux.org/alpine/edge/community \
|
||||
--repository=https://dl-cdn.alpinelinux.org/alpine/edge/main
|
||||
|
||||
# Install Rust via rustup (Alpine's packaged Rust may be too old / not nightly).
# `set -o pipefail` makes this step fail when the curl download fails; without
# it the pipeline's status is that of `sh` alone, which exits 0 on empty input.
# The two echo lines only log the installed host triple and target cfg.
RUN set -o pipefail && \
    curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs \
    | sh -s -- -y --default-toolchain "${RUST_TOOLCHAIN}" --component rust-src && \
    echo "Rust host: $(~/.cargo/bin/rustc -vV | grep host)" && \
    echo "Default target: $(~/.cargo/bin/rustc --print cfg | grep target)"
ENV PATH="/root/.cargo/bin:${PATH}"
|
||||
|
||||
# Disable crt-static so the binary can dlopen shared libraries at runtime.
|
||||
ENV RUSTFLAGS="-C target-feature=-crt-static"
|
||||
|
||||
# Point ort-sys to Alpine's system ORT library instead of downloading prebuilt binaries.
|
||||
# ort-sys checks ORT_LIB_LOCATION before attempting download (build/main.rs line 45).
|
||||
ENV ORT_LIB_LOCATION=/usr/lib
|
||||
ENV ORT_PREFER_DYNAMIC_LINK=1
|
||||
|
||||
# Copy workspace manifests and crates
|
||||
COPY Cargo.toml Cargo.lock ./
|
||||
COPY crates/kreuzberg/ crates/kreuzberg/
|
||||
COPY crates/kreuzberg-cli/ crates/kreuzberg-cli/
|
||||
COPY crates/kreuzberg-tesseract/ crates/kreuzberg-tesseract/
|
||||
COPY crates/kreuzberg-paddle-ocr/ crates/kreuzberg-paddle-ocr/
|
||||
|
||||
# Remove workspace members that aren't included
|
||||
RUN sed -i '/kreuzberg-py/d; /kreuzberg_rb/d; /kreuzberg-node/d; /kreuzberg-ffi/d; /kreuzberg-php/d; /kreuzberg_rustler/d; /kreuzberg_nif/d; /packages\/dart\/rust/d; /packages\/swift\/rust/d; /"crates\/kreuzberg-wasm"/d; /^\[profile\.release\.package\.kreuzberg-wasm\]$/,$d; /benchmark-harness/d; /e2e-generator/d; /snippet-runner/d; /e2e\/rust/d' Cargo.toml
|
||||
|
||||
RUN cargo build --release --package kreuzberg-cli --features all && \
|
||||
cp target/release/kreuzberg /build/kreuzberg && \
|
||||
strip /build/kreuzberg
|
||||
|
||||
# Set RPATH so the binary finds shared libs relative to itself
|
||||
RUN patchelf --set-rpath '$ORIGIN/lib' /build/kreuzberg
|
||||
|
||||
# Collect runtime libraries.
#
# The launcher (below) invokes the musl loader with `--library-path lib/`,
# which REPLACES the loader's search path. The bundle must therefore be
# self-contained: every transitive dependency of every shipped .so has to
# live in /build/lib/ too, otherwise the loader prints "Error loading shared
# library X: No such file or directory" at startup (issue #991).
#
# Strategy:
#   1. Copy the well-known runtime bits (musl loader, libstdc++, libgcc, ORT).
#      The loader copy is mandatory: every later step and the launcher itself
#      depend on it, so a missing loader fails the build here.
#   2. Recursively `ldd`-walk every .so in the bundle and copy any host lib
#      they resolve to that isn't already present.
#   3. Smoke-test the loader against each shipped .so so the build FAILS if
#      anything is still missing — better to break the image than to ship a
#      tarball that crashes on first invocation. The check matches both
#      "not found" (unresolved symbols) and the loader's "Error loading shared
#      library" message quoted above.
RUN set -eu; \
    mkdir -p /build/lib; \
    cp /usr/lib/libstdc++.so.6 /build/lib/; \
    cp /usr/lib/libgcc_s.so.1 /build/lib/; \
    # Bundle ONNX Runtime for embeddings/layout-detection at runtime (best-effort).
    cp /usr/lib/libonnxruntime.so* /build/lib/ 2>/dev/null || true; \
    # Copy the musl dynamic linker/libc; fail loudly if it is absent.
    cp /lib/ld-musl-*.so.1 /build/lib/; \
    LOADER="$(ls /build/lib/ld-musl-*.so.1 | head -n1)"; \
    if [ ! -x "$LOADER" ]; then \
        echo "FAIL: musl loader was not copied into /build/lib/" >&2; \
        exit 1; \
    fi; \
    # Recursively resolve transitive deps of everything in /build/lib via ldd
    # (alpine's musl ldd resolves against system paths). Re-walk until no new
    # libraries are pulled in to handle multi-level chains (libonnxruntime ->
    # libprotobuf-lite -> libabsl_* -> ...).
    while :; do \
        before=$(ls /build/lib | wc -l); \
        for so in /build/lib/*.so*; do \
            # Skip the loader itself; ldd against it is meaningless.
            case "$so" in *ld-musl-*) continue ;; esac; \
            "$LOADER" --list "$so" 2>/dev/null \
                | awk '/=>/ { print $3 }' \
                | grep -E '^/' \
                | while read -r dep; do \
                    base="$(basename "$dep")"; \
                    if [ ! -e "/build/lib/$base" ]; then \
                        cp -L "$dep" /build/lib/; \
                    fi; \
                done; \
        done; \
        after=$(ls /build/lib | wc -l); \
        [ "$before" = "$after" ] && break; \
    done; \
    # Verify nothing in the bundle has unresolved deps when constrained to lib/.
    for so in /build/lib/*.so*; do \
        case "$so" in *ld-musl-*) continue ;; esac; \
        if "$LOADER" --library-path /build/lib --list "$so" 2>&1 \
            | grep -Eq 'not found|Error loading shared library'; then \
            echo "FAIL: $so has unresolved dependencies inside the bundle:" >&2; \
            "$LOADER" --library-path /build/lib --list "$so" >&2; \
            exit 1; \
        fi; \
    done; \
    echo "OK: every bundled library resolves inside /build/lib/"
|
||||
|
||||
# Rename the actual binary and create a wrapper script that invokes it
|
||||
# via the bundled musl interpreter. This makes the binary work on ANY
|
||||
# Linux distro (glibc or musl) without system dependencies.
|
||||
RUN mv /build/kreuzberg /build/kreuzberg.bin && \
|
||||
INTERP_NAME=$(basename /lib/ld-musl-*.so.1) && \
|
||||
printf '#!/bin/sh\nSCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"\nexec "$SCRIPT_DIR/lib/%s" --library-path "$SCRIPT_DIR/lib" "$SCRIPT_DIR/kreuzberg.bin" "$@"\n' \
|
||||
"$INTERP_NAME" > /build/kreuzberg && \
|
||||
chmod +x /build/kreuzberg
|
||||
|
||||
# Verify the build output.
# /build/kreuzberg is now the shell launcher created in the previous step, so
# ELF inspection must target /build/kreuzberg.bin; running readelf on the
# launcher script finds nothing and would always print the fallback message.
# The braces make the `||` fallback apply to the readelf/grep pipeline only.
RUN file /build/kreuzberg /build/kreuzberg.bin && \
    echo "=== Dynamic dependencies ===" && \
    { readelf -d /build/kreuzberg.bin 2>/dev/null | grep -E "NEEDED|RPATH|RUNPATH" || echo "No dependencies"; } && \
    echo "=== Bundled libraries ===" && \
    ls -la /build/lib/
|
||||
|
||||
# =============================================================================
|
||||
# Output stage — binary + bundled runtime libraries
|
||||
# =============================================================================
|
||||
FROM scratch
|
||||
COPY --from=builder /build/kreuzberg /kreuzberg
|
||||
COPY --from=builder /build/kreuzberg.bin /kreuzberg.bin
|
||||
COPY --from=builder /build/lib/ /lib/
|
||||
65
docker/Dockerfile.musl-ffi
Normal file
65
docker/Dockerfile.musl-ffi
Normal file
@@ -0,0 +1,65 @@
|
||||
# =============================================================================
|
||||
# Alpine-based builder for musl-linked FFI shared library.
|
||||
#
|
||||
# Usage:
|
||||
# docker build -f docker/Dockerfile.musl-ffi \
|
||||
# --output type=local,dest=./dist .
|
||||
#
|
||||
# Produces libkreuzberg_ffi.so at dist/libkreuzberg_ffi.so
|
||||
# =============================================================================
|
||||
FROM alpine:3.21 AS builder
|
||||
|
||||
ARG RUST_TOOLCHAIN=nightly-2026-03-10
|
||||
|
||||
WORKDIR /build
|
||||
|
||||
# Install build dependencies — Alpine's g++ and libstdc++ are musl-native,
|
||||
# so tesseract C++ compilation works without glibc conflicts.
|
||||
RUN apk add --no-cache \
|
||||
curl gcc g++ musl-dev cmake make pkgconf \
|
||||
openssl-dev openssl-libs-static \
|
||||
perl linux-headers git file && \
|
||||
apk add --no-cache onnxruntime-dev \
|
||||
--repository=https://dl-cdn.alpinelinux.org/alpine/edge/community \
|
||||
--repository=https://dl-cdn.alpinelinux.org/alpine/edge/main
|
||||
|
||||
# Install Rust via rustup (Alpine's packaged Rust may be too old / not nightly).
# `set -o pipefail` makes this step fail when the curl download fails; without
# it the pipeline's status is that of `sh` alone, which exits 0 on empty input.
# The two echo lines only log the installed host triple and target cfg.
RUN set -o pipefail && \
    curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs \
    | sh -s -- -y --default-toolchain "${RUST_TOOLCHAIN}" --component rust-src && \
    echo "Rust host: $(~/.cargo/bin/rustc -vV | grep host)" && \
    echo "Default target: $(~/.cargo/bin/rustc --print cfg | grep target)"
ENV PATH="/root/.cargo/bin:${PATH}"
|
||||
|
||||
# Point ort-sys to Alpine's system ORT library instead of downloading prebuilt binaries.
|
||||
ENV ORT_LIB_LOCATION=/usr/lib
|
||||
ENV ORT_PREFER_DYNAMIC_LINK=1
|
||||
ENV ORT_SKIP_DOWNLOAD=1
|
||||
ENV ORT_STRATEGY=system
|
||||
# Allow cdylib output on musl targets (default is +crt-static which blocks shared libs)
|
||||
ENV RUSTFLAGS="-C target-feature=-crt-static"
|
||||
|
||||
# Copy workspace manifests and crates
|
||||
COPY Cargo.toml Cargo.lock ./
|
||||
COPY crates/kreuzberg/ crates/kreuzberg/
|
||||
COPY crates/kreuzberg-ffi/ crates/kreuzberg-ffi/
|
||||
COPY crates/kreuzberg-tesseract/ crates/kreuzberg-tesseract/
|
||||
COPY crates/kreuzberg-paddle-ocr/ crates/kreuzberg-paddle-ocr/
|
||||
|
||||
# Remove workspace members that aren't included
|
||||
RUN sed -i '/kreuzberg-py/d; /kreuzberg_rb/d; /kreuzberg-node/d; /kreuzberg-cli/d; /kreuzberg-php/d; /kreuzberg_rustler/d; /kreuzberg_nif/d; /packages\/dart\/rust/d; /packages\/swift\/rust/d; /"crates\/kreuzberg-wasm"/d; /^\[profile\.release\.package\.kreuzberg-wasm\]$/,$d; /benchmark-harness/d; /e2e-generator/d; /snippet-runner/d; /e2e\/rust/d' Cargo.toml
|
||||
|
||||
# Build the FFI shared library
|
||||
RUN cargo build --release --package kreuzberg-ffi && \
|
||||
cp target/release/libkreuzberg_ffi.so /build/libkreuzberg_ffi.so && \
|
||||
strip /build/libkreuzberg_ffi.so
|
||||
|
||||
# Verify the library
|
||||
RUN file /build/libkreuzberg_ffi.so && \
|
||||
echo "=== Dynamic dependencies ===" && \
|
||||
readelf -d /build/libkreuzberg_ffi.so 2>/dev/null | grep NEEDED || echo "No dynamic dependencies (fully static)"
|
||||
|
||||
# =============================================================================
|
||||
# Output stage — just the shared library
|
||||
# =============================================================================
|
||||
FROM scratch
|
||||
COPY --from=builder /build/libkreuzberg_ffi.so /libkreuzberg_ffi.so
|
||||
67
docker/Dockerfile.musl-rustler
Normal file
67
docker/Dockerfile.musl-rustler
Normal file
@@ -0,0 +1,67 @@
|
||||
# =============================================================================
|
||||
# Alpine-based builder for musl-linked Elixir Rustler NIF.
|
||||
#
|
||||
# Usage:
|
||||
# docker build -f docker/Dockerfile.musl-rustler \
|
||||
# --output type=local,dest=./dist .
|
||||
#
|
||||
# Produces libkreuzberg_nif.so at dist/libkreuzberg_nif.so
|
||||
# =============================================================================
|
||||
FROM alpine:3.21 AS builder
|
||||
|
||||
ARG RUST_TOOLCHAIN=nightly-2026-03-10
|
||||
|
||||
WORKDIR /build
|
||||
|
||||
# Install build dependencies — Alpine's g++ and libstdc++ are musl-native,
|
||||
# so tesseract C++ compilation works without glibc conflicts.
|
||||
RUN apk add --no-cache \
|
||||
curl gcc g++ musl-dev cmake make pkgconf \
|
||||
openssl-dev openssl-libs-static \
|
||||
perl linux-headers git file && \
|
||||
apk add --no-cache onnxruntime-dev \
|
||||
--repository=https://dl-cdn.alpinelinux.org/alpine/edge/community \
|
||||
--repository=https://dl-cdn.alpinelinux.org/alpine/edge/main
|
||||
|
||||
# Install Rust via rustup (Alpine's packaged Rust may be too old / not nightly).
# `set -o pipefail` makes this step fail when the curl download fails; without
# it the pipeline's status is that of `sh` alone, which exits 0 on empty input.
# The two echo lines only log the installed host triple and target cfg.
RUN set -o pipefail && \
    curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs \
    | sh -s -- -y --default-toolchain "${RUST_TOOLCHAIN}" --component rust-src && \
    echo "Rust host: $(~/.cargo/bin/rustc -vV | grep host)" && \
    echo "Default target: $(~/.cargo/bin/rustc --print cfg | grep target)"
ENV PATH="/root/.cargo/bin:${PATH}"
|
||||
|
||||
# Point ort-sys to Alpine's system ORT library instead of downloading prebuilt binaries.
|
||||
ENV ORT_LIB_LOCATION=/usr/lib
|
||||
ENV ORT_PREFER_DYNAMIC_LINK=1
|
||||
ENV ORT_SKIP_DOWNLOAD=1
|
||||
ENV ORT_STRATEGY=system
|
||||
# Allow cdylib output on musl targets (default is +crt-static which blocks shared libs)
|
||||
ENV RUSTFLAGS="-C target-feature=-crt-static"
|
||||
|
||||
# Copy workspace manifests and crates
|
||||
COPY Cargo.toml Cargo.lock ./
|
||||
COPY crates/kreuzberg/ crates/kreuzberg/
|
||||
COPY crates/kreuzberg-tesseract/ crates/kreuzberg-tesseract/
|
||||
COPY crates/kreuzberg-paddle-ocr/ crates/kreuzberg-paddle-ocr/
|
||||
COPY packages/elixir/native/kreuzberg_nif/ packages/elixir/native/kreuzberg_nif/
|
||||
|
||||
# Remove workspace members that aren't included
|
||||
RUN sed -i '/kreuzberg-py/d; /kreuzberg_rb/d; /kreuzberg-node/d; /kreuzberg-cli/d; /kreuzberg-ffi/d; /kreuzberg-php/d; /packages\/dart\/rust/d; /packages\/swift\/rust/d; /"crates\/kreuzberg-wasm"/d; /^\[profile\.release\.package\.kreuzberg-wasm\]$/,$d; /benchmark-harness/d; /e2e-generator/d; /snippet-runner/d; /e2e\/rust/d' Cargo.toml
|
||||
|
||||
# Build the Rustler NIF shared library (the crate is excluded from the workspace,
|
||||
# so we build it directly from its package directory).
|
||||
RUN cd packages/elixir/native/kreuzberg_nif && \
|
||||
cargo build --release && \
|
||||
cp target/release/libkreuzberg_nif.so /build/libkreuzberg_nif.so && \
|
||||
strip /build/libkreuzberg_nif.so
|
||||
|
||||
# Verify the library
|
||||
RUN file /build/libkreuzberg_nif.so && \
|
||||
echo "=== Dynamic dependencies ===" && \
|
||||
readelf -d /build/libkreuzberg_nif.so 2>/dev/null | grep NEEDED || echo "No dynamic dependencies (fully static)"
|
||||
|
||||
# =============================================================================
|
||||
# Output stage — just the shared library
|
||||
# =============================================================================
|
||||
FROM scratch
|
||||
COPY --from=builder /build/libkreuzberg_nif.so /libkreuzberg_nif.so
|
||||
137
docker/README.md
Normal file
137
docker/README.md
Normal file
@@ -0,0 +1,137 @@
|
||||
# Kreuzberg Docker Images
|
||||
|
||||
This directory contains Dockerfile variants for building Kreuzberg Docker images with different feature sets.
|
||||
|
||||
## Base Image
|
||||
|
||||
Both variants use **Debian 13 (Trixie) slim** - the latest stable Debian release for optimal package availability and security updates.
|
||||
|
||||
## Image Variants
|
||||
|
||||
### 1. Core Image (`Dockerfile.core`)
|
||||
|
||||
**Size:** ~1.0-1.3GB
|
||||
**Base:** debian:trixie-slim
|
||||
**Features:** PDF, DOCX, PPTX, images, HTML, XML, text, Excel, email, academic formats (LaTeX, EPUB, etc.)
|
||||
**OCR:** Tesseract (12 languages)
|
||||
**Legacy Office:** Native OLE/CFB parsing support
|
||||
|
||||
**When to use:**
|
||||
|
||||
- Production deployments where image size matters
|
||||
- Cloud environments with size/bandwidth constraints
|
||||
- Kubernetes deployments with frequent pod scaling
|
||||
- All use cases (both images have equivalent legacy Office support)
|
||||
|
||||
**Build command:**
|
||||
|
||||
```bash
|
||||
docker build -f docker/Dockerfile.core -t kreuzberg:core .
|
||||
```
|
||||
|
||||
### 2. Full Image (`Dockerfile.full`)
|
||||
|
||||
**Size:** ~1.0-1.3GB
|
||||
**Base:** debian:trixie-slim
|
||||
**Features:** All core features with native legacy Office format support
|
||||
**OCR:** Tesseract (12 languages)
|
||||
**Legacy Office:** Native OLE/CFB parsing for .doc, .ppt, .xls
|
||||
|
||||
**When to use:**
|
||||
|
||||
- Complete document intelligence pipeline with all optional dependencies
|
||||
- Development and testing environments
|
||||
- When you want maximum feature completeness
|
||||
|
||||
**Build command:**
|
||||
|
||||
```bash
|
||||
docker build -f docker/Dockerfile.full -t kreuzberg:full .
|
||||
```
|
||||
|
||||
## Size Comparison
|
||||
|
||||
| Component | Core | Full | Difference |
|
||||
| -------------------- | -------------- | -------------- | ----------------- |
|
||||
| Base (trixie-slim) | ~120MB | ~120MB | - |
|
||||
| Tesseract + 12 langs | ~250MB | ~250MB | - |
|
||||
| Rust binary | ~80MB | ~80MB | - |
|
||||
| System libraries | ~100MB | ~100MB | - |
|
||||
| **Total (approx)** | **~1.0-1.3GB** | **~1.0-1.3GB** | **- (same size)** |
|
||||
|
||||
## Default Image
|
||||
|
||||
The root `Dockerfile` is a symlink to `Dockerfile.full` for backward compatibility and complete feature support by default.
|
||||
|
||||
## Multi-Architecture Support
|
||||
|
||||
Both images support:
|
||||
|
||||
- `linux/amd64` (x86_64)
|
||||
- `linux/arm64` (aarch64)
|
||||
|
||||
Both architectures use the same pure-Rust PDF library — no architecture-specific binaries needed.
|
||||
|
||||
## Usage Modes
|
||||
|
||||
All images support three execution modes via ENTRYPOINT:
|
||||
|
||||
### 1. API Server (default)
|
||||
|
||||
```bash
|
||||
docker run -p 8000:8000 kreuzberg:core
|
||||
# or override host/port:
|
||||
docker run -p 8000:8000 kreuzberg:core serve --host 0.0.0.0 --port 8000
|
||||
```
|
||||
|
||||
### 2. CLI Mode
|
||||
|
||||
```bash
|
||||
docker run -v $(pwd):/data kreuzberg:core extract /data/document.pdf
|
||||
docker run -v $(pwd):/data kreuzberg:core detect /data/file.bin
|
||||
docker run -v $(pwd):/data kreuzberg:core batch /data/*.pdf
|
||||
```
|
||||
|
||||
### 3. MCP Server Mode
|
||||
|
||||
```bash
|
||||
docker run kreuzberg:core mcp
|
||||
```
|
||||
|
||||
## Testing
|
||||
|
||||
Test scripts are provided to verify both image variants:
|
||||
|
||||
```bash
|
||||
# Test core image
|
||||
IMAGE_NAME=kreuzberg:core ./scripts/test_docker.sh
|
||||
|
||||
# Test full image
|
||||
IMAGE_NAME=kreuzberg:full ./scripts/test_docker.sh
|
||||
```
|
||||
|
||||
## GitHub Actions
|
||||
|
||||
The `.github/workflows/publish-docker.yaml` workflow builds and publishes both variants to GitHub Container Registry:
|
||||
|
||||
- `ghcr.io/kreuzberg-dev/kreuzberg:VERSION-core` - Core image (minimal runtime)
|
||||
- `ghcr.io/kreuzberg-dev/kreuzberg:core` - Latest core image
|
||||
- `ghcr.io/kreuzberg-dev/kreuzberg:VERSION` - Full image (all optional dependencies)
|
||||
- `ghcr.io/kreuzberg-dev/kreuzberg:latest` - Latest full image
|
||||
|
||||
For local development, use the local tags shown in the build commands above.
|
||||
|
||||
## Recommendations
|
||||
|
||||
**Choose Core if:**
|
||||
|
||||
- ✅ Minimal runtime setup
|
||||
- ✅ Standard document processing needs
|
||||
- ✅ Cloud deployments with cost constraints
|
||||
- ✅ Kubernetes or container orchestration
|
||||
|
||||
**Choose Full if:**
|
||||
|
||||
- ✅ Want maximum optional dependencies preinstalled
|
||||
- ✅ Development and testing environments
|
||||
- ✅ "Batteries included" experience preferred
|
||||
206
docker/test-tessdata.sh
Executable file
206
docker/test-tessdata.sh
Executable file
@@ -0,0 +1,206 @@
|
||||
#!/usr/bin/env bash
|
||||
#
|
||||
# Test script to verify tessdata configuration in Docker images
|
||||
# This script tests both Dockerfile.core and Dockerfile.full
|
||||
#
|
||||
|
||||
# Strict mode: stop on errors, on unset variables, and on a failure anywhere
# in a pipeline.
set -o errexit -o nounset -o pipefail

# Absolute directory containing this script, and its parent directory.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(dirname "$SCRIPT_DIR")"

# ANSI colour escapes, stored unexpanded; every consumer prints them through
# `echo -e` (or printf %b), which interprets the backslash sequences.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m' # reset / no colour

# Banner followed by one blank line.
printf '%b\n\n' "${YELLOW}Testing Kreuzberg Docker tessdata configuration...${NC}"
|
||||
|
||||
# Test 1: static checks on the tessdata setup section of a Dockerfile.
#
# Arguments (label first, unlike the other test_* helpers in this script):
#   $1 - label printed in the test header
#   $2 - path of the Dockerfile to inspect
# Reads the colour globals RED, GREEN, YELLOW and NC.
# Returns 0 when all four checks pass; returns 1 at the first failing check.
test_tessdata_discovery() {
  local label="$1"
  local target="$2"

  echo -e "${YELLOW}Test: $label${NC}"

  # 1. The tessdata setup section must be present.
  if ! grep -q "Setting up tessdata permissions" "$target"; then
    echo -e "${RED}✗ Tessdata setup code NOT found in $target${NC}"
    return 1
  fi
  echo -e "${GREEN}✓ Tessdata setup code found in $target${NC}"

  # 2. TESSDATA_PREFIX must not be pinned to the version-5 directory.
  if grep -q "TESSDATA_PREFIX=/usr/share/tesseract-ocr/5/tessdata" "$target"; then
    echo -e "${RED}✗ TESSDATA_PREFIX is still hardcoded in $target (should be removed)${NC}"
    return 1
  fi
  echo -e "${GREEN}✓ TESSDATA_PREFIX is not hardcoded (correct)${NC}"

  # 3. Permissions must be opened up with chmod.
  if ! grep -q "chmod -R a+rx" "$target"; then
    echo -e "${RED}✗ Chmod command NOT found in $target${NC}"
    return 1
  fi
  echo -e "${GREEN}✓ Chmod command found to set permissions${NC}"

  # 4. The wildcard (version-independent) tessdata path must be referenced.
  if ! grep -q "/usr/share/tesseract-ocr/\*/tessdata" "$target"; then
    echo -e "${RED}✗ Multiple tessdata paths NOT found${NC}"
    return 1
  fi
  echo -e "${GREEN}✓ Multiple tessdata paths checked in Dockerfile${NC}"

  echo ""
  return 0
}
|
||||
|
||||
# Test 2: Verify Dockerfile syntax
#
# Arguments:
#   $1 - path of the Dockerfile to check
#   $2 - label printed in the test header
# Reads the globals PROJECT_ROOT (used as build context) and the colour
# variables RED, GREEN, YELLOW and NC.
# Returns 1 only when Docker is not installed and the file lacks a FROM or an
# ENV line; a failed Docker check is reported as a warning and returns 0.
test_dockerfile_syntax() {
  local dockerfile="$1"
  local test_name="$2"

  echo -e "${YELLOW}Test: Verify $test_name syntax${NC}"

  if command -v docker &>/dev/null; then
    # `docker build` has no --dry-run flag, so that invocation always failed
    # and nothing was ever validated. `--check` runs the BuildKit build checks
    # against the Dockerfile without executing the build.
    if docker build --check -f "$dockerfile" "$PROJECT_ROOT" &>/dev/null; then
      echo -e "${GREEN}✓ Dockerfile syntax is valid${NC}"
    else
      # Deliberately non-fatal: Docker/Buildx releases that predate --check
      # reject the flag, which is indistinguishable here from a real finding.
      echo -e "${YELLOW}! Dockerfile syntax check failed (may be due to missing Docker or build prerequisites)${NC}"
    fi
  else
    # Basic structural check without Docker: require a FROM and an ENV line.
    if grep -q "^FROM " "$dockerfile" && grep -q "^ENV " "$dockerfile"; then
      echo -e "${GREEN}✓ Basic Dockerfile structure looks valid${NC}"
    else
      echo -e "${RED}✗ Dockerfile structure is invalid${NC}"
      return 1
    fi
  fi

  echo ""
  return 0
}
|
||||
|
||||
# Test 3: confirm the Dockerfile switches to the non-root `kreuzberg` user and
# hands directory ownership to it.
#
# Arguments:
#   $1 - path of the Dockerfile to inspect
#   $2 - label printed in the test header
# Reads the colour globals RED, GREEN, YELLOW and NC.
# Returns 0 when both checks pass; returns 1 at the first failing check.
test_user_permissions() {
  local target="$1"
  local label="$2"

  echo -e "${YELLOW}Test: User permissions in $label${NC}"

  # A `USER kreuzberg` instruction must exist.
  if ! grep -q "USER kreuzberg" "$target"; then
    echo -e "${RED}✗ Non-root user NOT found${NC}"
    return 1
  fi
  echo -e "${GREEN}✓ Non-root 'kreuzberg' user is set${NC}"

  # Ownership must be transferred with a recursive chown.
  if ! grep -q "chown -R kreuzberg:kreuzberg" "$target"; then
    echo -e "${RED}✗ Directory ownership NOT set for kreuzberg user${NC}"
    return 1
  fi
  echo -e "${GREEN}✓ Directory ownership set to kreuzberg user${NC}"

  echo ""
  return 0
}
|
||||
|
||||
# Test 4: look for version-specific tessdata paths in a Dockerfile.
#
# Arguments:
#   $1 - path of the Dockerfile to inspect
#   $2 - label printed in the test header
# Reads the colour globals RED, GREEN, YELLOW and NC.
# Returns 1 when a line mentions tesseract-ocr/5/tessdata without also
# mentioning the wildcard path; a tesseract-ocr/4 hit only prints a note.
# Returns 0 otherwise.
test_no_hardcoded_versions() {
  local target="$1"
  local label="$2"
  # grep pattern for the literal wildcard path (the backslash escapes `*`).
  local wildcard_path="tesseract-ocr/\*/tessdata"

  echo -e "${YELLOW}Test: No hardcoded version paths in $label${NC}"

  # Output is discarded with a redirect rather than `grep -q`, so the first
  # grep is never cut short by an early-exiting reader.
  if grep "tesseract-ocr/5/tessdata" "$target" | grep -v "$wildcard_path" >/dev/null; then
    echo -e "${RED}✗ Hardcoded tesseract-ocr/5 version found${NC}"
    return 1
  fi
  echo -e "${GREEN}✓ No hardcoded tesseract-ocr/5 version${NC}"

  # Informational only: neither outcome changes the return value.
  if grep "tesseract-ocr/4/tessdata" "$target" | grep -v "$wildcard_path" >/dev/null; then
    echo -e "${YELLOW}! Hardcoded tesseract-ocr/4 version found (but it's in the loop, so OK)${NC}"
  else
    echo -e "${GREEN}✓ Version paths are in dynamic loop${NC}"
  fi

  echo ""
  return 0
}
|
||||
|
||||
# Run all tests against one Dockerfile and print a pass/fail summary.
#
# Arguments:
#   $1 - path of the Dockerfile to test
#   $2 - label used in the headers
# Reads the colour globals RED, GREEN, YELLOW and NC.
# Returns the number of failed tests (0 when everything passed).
#
# Counters use plain arithmetic assignment. `((passed++))` evaluates to the
# pre-increment value, so it returns status 1 when the counter is 0 and would
# abort the script under `set -e` whenever this function is called outside a
# conditional context.
run_tests() {
  local dockerfile="$1"
  local test_name="$2"
  local passed=0
  local failed=0

  echo -e "${YELLOW}========================================${NC}"
  echo -e "${YELLOW}Testing: $test_name${NC}"
  echo -e "${YELLOW}File: $dockerfile${NC}"
  echo -e "${YELLOW}========================================\n${NC}"

  # test_tessdata_discovery takes (label, dockerfile); the others take
  # (dockerfile, label).
  if test_tessdata_discovery "Tessdata discovery logic" "$dockerfile"; then
    passed=$((passed + 1))
  else
    failed=$((failed + 1))
  fi

  if test_dockerfile_syntax "$dockerfile" "$test_name"; then
    passed=$((passed + 1))
  else
    failed=$((failed + 1))
  fi

  if test_user_permissions "$dockerfile" "$test_name"; then
    passed=$((passed + 1))
  else
    failed=$((failed + 1))
  fi

  if test_no_hardcoded_versions "$dockerfile" "$test_name"; then
    passed=$((passed + 1))
  else
    failed=$((failed + 1))
  fi

  echo -e "${YELLOW}----------------------------------------${NC}"
  echo -e "Results: ${GREEN}$passed passed${NC}, ${RED}$failed failed${NC}"
  echo -e "${YELLOW}========================================\n${NC}"

  return "$failed"
}
|
||||
|
||||
# Main execution
#
# run_tests returns its number of failed checks. That count is captured with
# `cmd || ...`, where $? still holds cmd's own exit status. Inside
# `if ! cmd; then`, $? is the status of the negation (always 0 in the then
# branch), so failures were never added and the script always reported success.
total_failed=0

# Test Dockerfile.core
run_tests "$SCRIPT_DIR/Dockerfile.core" "Dockerfile.core" || total_failed=$((total_failed + $?))

# Test Dockerfile.full
run_tests "$SCRIPT_DIR/Dockerfile.full" "Dockerfile.full" || total_failed=$((total_failed + $?))

# Summary: exit 0 only when no check failed in either Dockerfile.
echo -e "${YELLOW}========================================${NC}"
if [ "$total_failed" -eq 0 ]; then
  echo -e "${GREEN}✓ All tests passed!${NC}"
  echo -e "${GREEN}Tessdata configuration is properly set up.${NC}"
  exit 0
else
  echo -e "${RED}✗ Some tests failed (total failures: $total_failed)${NC}"
  echo -e "${RED}Please review the Dockerfile changes.${NC}"
  exit 1
fi
|
||||
Reference in New Issue
Block a user