Shrink Docker image size by using COPY --chown instead of a recursive chown layer
Change-Id: I778c3993a37deef1d2eaa3d2b401d796ce8662f7
diff --git a/Dockerfile b/Dockerfile
index 246ef83..0efc8c5 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -32,16 +32,19 @@
&& rm -rf /var/lib/apt/lists/* \
&& apt-get clean
-# Copy virtual environment from builder
-COPY --from=builder /app/venv /app/venv
+# Create the non-root user before any COPY so that --chown=appuser:appuser can resolve the name
+RUN groupadd -r appuser && useradd -r -g appuser appuser
-# Copy application code
-COPY lib /app/lib
-COPY systems /app/systems
-COPY my_utils /app/my_utils
-COPY docker-entrypoint.sh /docker-entrypoint.sh
-COPY download_with_progress.py /app/download_with_progress.py
-COPY list_spacy_models.py /app/list_spacy_models.py
+# Copy the virtual environment from the builder stage, owned by appuser from the start
+COPY --from=builder --chown=appuser:appuser /app/venv /app/venv
+
+# Copy application code owned by appuser (replaces the former recursive chown step)
+COPY --chown=appuser:appuser lib /app/lib
+COPY --chown=appuser:appuser systems /app/systems
+COPY --chown=appuser:appuser my_utils /app/my_utils
+COPY --chown=appuser:appuser download_with_progress.py /app/download_with_progress.py
+COPY --chown=appuser:appuser list_spacy_models.py /app/list_spacy_models.py
+COPY --chown=appuser:appuser docker-entrypoint.sh /docker-entrypoint.sh
# Set environment variables
ENV VIRTUAL_ENV=/app/venv
@@ -58,24 +61,19 @@
ENV SPACY_CHUNK_SIZE="20000"
WORKDIR /app
-RUN mkdir -p "/app/logs" "/app/tmp" "/local/models"
+
+# Create runtime directories owned by appuser and make the entrypoint and helper scripts executable
+RUN mkdir -p "/app/logs" "/app/tmp" "/local/models" && \
+ chown -R appuser:appuser "/app/logs" "/app/tmp" "/local/models" && \
+ chmod +x /docker-entrypoint.sh && \
+ chmod +x /app/download_with_progress.py && \
+ chmod +x /app/list_spacy_models.py
# Set temp directories to use app directory instead of system /tmp
ENV TMPDIR="/app/tmp"
ENV TEMP="/app/tmp"
ENV TMP="/app/tmp"
-# Add non-root user
-RUN groupadd -r appuser && useradd -r -g appuser appuser
-
-# Make entrypoint executable and set permissions
-RUN chmod +x /docker-entrypoint.sh && \
- chmod +x /app/download_with_progress.py && \
- chmod +x /app/list_spacy_models.py
-
-# Change ownership of app directories to appuser
-RUN chown -R appuser:appuser /app /local /docker-entrypoint.sh
-
# Switch to non-root user
USER appuser
diff --git a/Dockerfile.slim b/Dockerfile.slim
new file mode 100644
index 0000000..c01ad04
--- /dev/null
+++ b/Dockerfile.slim
@@ -0,0 +1,80 @@
+# Slim version without GermaLemma (saves ~180MB)
+# Multi-stage Docker build for size optimization
+FROM python:3.12-slim-bookworm AS builder
+
+# Install build dependencies
+RUN apt-get update && apt-get install -y \
+ gcc \
+ g++ \
+ && rm -rf /var/lib/apt/lists/*
+
+# Set environment variables (PYTHONPATH="." adds the working directory to the import path)
+ENV PIP_CACHE_DIR="/tmp/.cache/pip" \
+ PYTHONPATH="."
+ENV VIRTUAL_ENV=/app/venv
+ENV PATH="$VIRTUAL_ENV/bin:$PATH"
+
+# Set the working directory
+WORKDIR /app
+
+# Install Python dependencies WITHOUT germalemma
+RUN python -m venv venv
+RUN venv/bin/pip install --upgrade pip wheel thinc spacy
+
+# Production stage
+FROM python:3.12-slim-bookworm AS production
+
+# Install minimal runtime dependencies
+RUN apt-get update && apt-get install -y \
+ wget \
+ coreutils \
+ && rm -rf /var/lib/apt/lists/* \
+ && apt-get clean
+
+# Create the non-root user before any COPY so that --chown=appuser:appuser can resolve the name
+RUN groupadd -r appuser && useradd -r -g appuser appuser
+
+# Copy the virtual environment from the builder stage, owned by appuser from the start
+COPY --from=builder --chown=appuser:appuser /app/venv /app/venv
+
+# Copy application code owned by appuser (no separate recursive chown layer)
+COPY --chown=appuser:appuser lib /app/lib
+COPY --chown=appuser:appuser systems /app/systems
+COPY --chown=appuser:appuser my_utils /app/my_utils
+COPY --chown=appuser:appuser download_with_progress.py /app/download_with_progress.py
+COPY --chown=appuser:appuser list_spacy_models.py /app/list_spacy_models.py
+COPY --chown=appuser:appuser docker-entrypoint.sh /docker-entrypoint.sh
+
+# Set environment variables (PYTHONPATH="." adds the working directory to the import path)
+ENV VIRTUAL_ENV=/app/venv
+ENV PATH="$VIRTUAL_ENV/bin:$PATH"
+ENV PYTHONPATH="."
+
+# spaCy processing configuration - GermaLemma disabled by default
+ENV SPACY_USE_DEPENDENCIES="True"
+ENV SPACY_USE_GERMALEMMA="False"
+ENV SPACY_PARSE_TIMEOUT="30"
+ENV SPACY_MAX_SENTENCE_LENGTH="500"
+ENV SPACY_N_PROCESS="10"
+ENV SPACY_BATCH_SIZE="2000"
+ENV SPACY_CHUNK_SIZE="20000"
+
+WORKDIR /app
+
+# Create runtime directories owned by appuser and make the entrypoint and helper scripts executable
+RUN mkdir -p "/app/logs" "/app/tmp" "/local/models" && \
+ chown -R appuser:appuser "/app/logs" "/app/tmp" "/local/models" && \
+ chmod +x /docker-entrypoint.sh && \
+ chmod +x /app/download_with_progress.py && \
+ chmod +x /app/list_spacy_models.py
+
+# Set temp directories to use app directory instead of system /tmp
+ENV TMPDIR="/app/tmp"
+ENV TEMP="/app/tmp"
+ENV TMP="/app/tmp"
+
+# Switch to non-root user
+USER appuser
+
+# Define the entry point
+ENTRYPOINT ["/docker-entrypoint.sh"]
diff --git a/Dockerfile.with-models b/Dockerfile.with-models
index 9d71974..96a0775 100644
--- a/Dockerfile.with-models
+++ b/Dockerfile.with-models
@@ -55,19 +55,22 @@
&& rm -rf /var/lib/apt/lists/* \
&& apt-get clean
-# Copy virtual environment from builder
-COPY --from=builder /app/venv /app/venv
+# Create the non-root user before any COPY so that --chown=appuser:appuser can resolve the name
+RUN groupadd -r appuser && useradd -r -g appuser appuser
-# Copy pre-downloaded models
-COPY --from=builder /local/models /local/models
+# Copy the virtual environment from the builder stage, owned by appuser from the start
+COPY --from=builder --chown=appuser:appuser /app/venv /app/venv
-# Copy application code
-COPY lib /app/lib
-COPY systems /app/systems
-COPY my_utils /app/my_utils
-COPY docker-entrypoint.sh /docker-entrypoint.sh
-COPY download_with_progress.py /app/download_with_progress.py
-COPY list_spacy_models.py /app/list_spacy_models.py
+# Copy the pre-downloaded models from the builder stage, owned by appuser
+COPY --from=builder --chown=appuser:appuser /local/models /local/models
+
+# Copy application code owned by appuser (replaces the former recursive chown step)
+COPY --chown=appuser:appuser lib /app/lib
+COPY --chown=appuser:appuser systems /app/systems
+COPY --chown=appuser:appuser my_utils /app/my_utils
+COPY --chown=appuser:appuser download_with_progress.py /app/download_with_progress.py
+COPY --chown=appuser:appuser list_spacy_models.py /app/list_spacy_models.py
+COPY --chown=appuser:appuser docker-entrypoint.sh /docker-entrypoint.sh
# Set environment variables
ENV VIRTUAL_ENV=/app/venv
@@ -84,24 +87,19 @@
ENV SPACY_CHUNK_SIZE="20000"
WORKDIR /app
-RUN mkdir -p "/app/logs" "/app/tmp"
+
+# Create runtime directories owned by appuser and make the entrypoint and helper scripts executable
+RUN mkdir -p "/app/logs" "/app/tmp" && \
+ chown -R appuser:appuser "/app/logs" "/app/tmp" && \
+ chmod +x /docker-entrypoint.sh && \
+ chmod +x /app/download_with_progress.py && \
+ chmod +x /app/list_spacy_models.py
# Set temp directories to use app directory instead of system /tmp
ENV TMPDIR="/app/tmp"
ENV TEMP="/app/tmp"
ENV TMP="/app/tmp"
-# Add non-root user
-RUN groupadd -r appuser && useradd -r -g appuser appuser
-
-# Make entrypoint executable and set permissions
-RUN chmod +x /docker-entrypoint.sh && \
- chmod +x /app/download_with_progress.py && \
- chmod +x /app/list_spacy_models.py
-
-# Change ownership of app directories to appuser
-RUN chown -R appuser:appuser /app /local /docker-entrypoint.sh
-
# Switch to non-root user
USER appuser
diff --git a/Makefile b/Makefile
index 5d1b22d..9a54a7e 100644
--- a/Makefile
+++ b/Makefile
@@ -1,8 +1,12 @@
-.PHONY: build build-with-models preload-models run test clean
+.PHONY: build build-slim build-with-models preload-models run test clean
build:
docker build -t korap/conllu-spacy:latest .
+build-slim:
+ docker build -f Dockerfile.slim -t korap/conllu-spacy:slim .
+ @echo "Slim build complete (without GermaLemma, saves ~180MB)"
+
build-with-models:
docker build -f Dockerfile.with-models -t korap/conllu-spacy:with-models .
diff --git a/systems/parse_spacy_pipe.py b/systems/parse_spacy_pipe.py
index 0abfb33..8736669 100644
--- a/systems/parse_spacy_pipe.py
+++ b/systems/parse_spacy_pipe.py
@@ -5,7 +5,14 @@
import logging, sys, time, signal
from lib.CoNLL_Annotation import get_token_type
import my_utils.file_utils as fu
-from germalemma import GermaLemma
+
+# GermaLemma is an optional dependency: record whether it imported and fall back to None if not
+try:
+ from germalemma import GermaLemma
+ GERMALEMMA_AVAILABLE = True
+except ImportError:
+ GERMALEMMA_AVAILABLE = False
+ GermaLemma = None
# Dependency parsing safety limits
DEFAULT_PARSE_TIMEOUT = 0.5 # seconds per sentence
@@ -236,21 +243,31 @@
spacy_de = spacy.load(args.spacy_model, disable=disabled_components)
spacy_de.tokenizer = WhitespaceTokenizer(spacy_de.vocab) # We won't re-tokenize to respect how the source CoNLL are tokenized!
-
+
# Increase max_length to handle very long sentences (especially when parser is disabled)
spacy_de.max_length = 10000000 # 10M characters
-
- lemmatizer = GermaLemma()
+
+ # Create the GermaLemma lemmatizer only when it is both requested and importable; otherwise lemmatizer stays None
+ lemmatizer = None
+ if args.use_germalemma == "True":
+ if GERMALEMMA_AVAILABLE:
+ lemmatizer = GermaLemma()
+ else:
+ logger.warning("GermaLemma requested but not available. Using spaCy lemmatizer instead.")
+ args.use_germalemma = "False"
# Log version information
logger.info(f"spaCy version: {spacy.__version__}")
logger.info(f"spaCy model: {args.spacy_model}")
logger.info(f"spaCy model version: {spacy_de.meta.get('version', 'unknown')}")
- try:
- import germalemma
- logger.info(f"GermaLemma version: {germalemma.__version__}")
- except AttributeError:
- logger.info("GermaLemma version: unknown (no __version__ attribute)")
+ if GERMALEMMA_AVAILABLE:
+ try:
+ import germalemma
+ logger.info(f"GermaLemma version: {germalemma.__version__}")
+ except AttributeError:
+ logger.info("GermaLemma version: unknown (no __version__ attribute)")
+ else:
+ logger.info("GermaLemma: not installed")
# Parse timeout and sentence length limits from environment variables
parse_timeout = float(os.getenv("SPACY_PARSE_TIMEOUT", str(DEFAULT_PARSE_TIMEOUT)))