Shrink docker size by avoiding chown

Change-Id: I778c3993a37deef1d2eaa3d2b401d796ce8662f7
diff --git a/Dockerfile b/Dockerfile
index 246ef83..0efc8c5 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -32,16 +32,19 @@
     && rm -rf /var/lib/apt/lists/* \
     && apt-get clean
 
-# Copy virtual environment from builder
-COPY --from=builder /app/venv /app/venv
+# Add non-root user FIRST: the COPY --chown=appuser:appuser steps below need the user to exist
+RUN groupadd -r appuser && useradd -r -g appuser appuser
 
-# Copy application code
-COPY lib /app/lib
-COPY systems /app/systems
-COPY my_utils /app/my_utils
-COPY docker-entrypoint.sh /docker-entrypoint.sh
-COPY download_with_progress.py /app/download_with_progress.py
-COPY list_spacy_models.py /app/list_spacy_models.py
+# Copy virtual environment from builder and set ownership immediately
+COPY --from=builder --chown=appuser:appuser /app/venv /app/venv
+
+# Copy application code with correct ownership
+COPY --chown=appuser:appuser lib /app/lib
+COPY --chown=appuser:appuser systems /app/systems
+COPY --chown=appuser:appuser my_utils /app/my_utils
+COPY --chown=appuser:appuser download_with_progress.py /app/download_with_progress.py
+COPY --chown=appuser:appuser list_spacy_models.py /app/list_spacy_models.py
+COPY --chown=appuser:appuser docker-entrypoint.sh /docker-entrypoint.sh
 
 # Set environment variables
 ENV VIRTUAL_ENV=/app/venv
@@ -58,24 +61,19 @@
 ENV SPACY_CHUNK_SIZE="20000"
 
 WORKDIR /app
-RUN mkdir -p "/app/logs" "/app/tmp" "/local/models"
+
+# Create runtime directories owned by appuser and make the scripts executable
+RUN mkdir -p "/app/logs" "/app/tmp" "/local/models" && \
+    chown -R appuser:appuser "/app/logs" "/app/tmp" "/local/models" && \
+    chmod +x /docker-entrypoint.sh && \
+    chmod +x /app/download_with_progress.py && \
+    chmod +x /app/list_spacy_models.py
 
 # Set temp directories to use app directory instead of system /tmp
 ENV TMPDIR="/app/tmp"
 ENV TEMP="/app/tmp"
 ENV TMP="/app/tmp"
 
-# Add non-root user
-RUN groupadd -r appuser && useradd -r -g appuser appuser
-
-# Make entrypoint executable and set permissions
-RUN chmod +x /docker-entrypoint.sh && \
-    chmod +x /app/download_with_progress.py && \
-    chmod +x /app/list_spacy_models.py
-
-# Change ownership of app directories to appuser
-RUN chown -R appuser:appuser /app /local /docker-entrypoint.sh
-
 # Switch to non-root user
 USER appuser
 
diff --git a/Dockerfile.slim b/Dockerfile.slim
new file mode 100644
index 0000000..c01ad04
--- /dev/null
+++ b/Dockerfile.slim
@@ -0,0 +1,80 @@
+# Slim version without GermaLemma (saves ~180MB)
+# Multi-stage Docker build for size optimization
+FROM python:3.12-slim-bookworm AS builder
+
+# Install build dependencies (C/C++ compilers)
+RUN apt-get update && apt-get install -y \
+    gcc \
+    g++ \
+    && rm -rf /var/lib/apt/lists/*
+
+# Set environment variables (PYTHONPATH "." resolves against the working directory)
+ENV PIP_CACHE_DIR="/tmp/.cache/pip" \
+    PYTHONPATH="."
+ENV VIRTUAL_ENV=/app/venv
+ENV PATH="$VIRTUAL_ENV/bin:$PATH"
+
+# Set the working directory; the venv below is created relative to it (/app/venv)
+WORKDIR /app
+
+# Install Python dependencies WITHOUT germalemma
+RUN python -m venv venv
+RUN venv/bin/pip install --upgrade pip wheel thinc spacy
+
+# Production stage: runtime image that receives only the prepared venv from the builder
+FROM python:3.12-slim-bookworm AS production
+
+# Install minimal runtime dependencies
+RUN apt-get update && apt-get install -y \
+    wget \
+    coreutils \
+    && rm -rf /var/lib/apt/lists/* \
+    && apt-get clean
+
+# Add non-root user FIRST: the COPY --chown=appuser:appuser steps below need the user to exist
+RUN groupadd -r appuser && useradd -r -g appuser appuser
+
+# Copy virtual environment from builder and set ownership immediately
+COPY --from=builder --chown=appuser:appuser /app/venv /app/venv
+
+# Copy application code with correct ownership
+COPY --chown=appuser:appuser lib /app/lib
+COPY --chown=appuser:appuser systems /app/systems
+COPY --chown=appuser:appuser my_utils /app/my_utils
+COPY --chown=appuser:appuser download_with_progress.py /app/download_with_progress.py
+COPY --chown=appuser:appuser list_spacy_models.py /app/list_spacy_models.py
+COPY --chown=appuser:appuser docker-entrypoint.sh /docker-entrypoint.sh
+
+# Put the venv first on PATH; PYTHONPATH "." resolves against the working directory (/app)
+ENV VIRTUAL_ENV=/app/venv
+ENV PATH="$VIRTUAL_ENV/bin:$PATH"
+ENV PYTHONPATH="."
+
+# spaCy processing configuration - GermaLemma disabled by default
+ENV SPACY_USE_DEPENDENCIES="True"
+ENV SPACY_USE_GERMALEMMA="False"
+ENV SPACY_PARSE_TIMEOUT="30"
+ENV SPACY_MAX_SENTENCE_LENGTH="500"
+ENV SPACY_N_PROCESS="10"
+ENV SPACY_BATCH_SIZE="2000"
+ENV SPACY_CHUNK_SIZE="20000"
+
+WORKDIR /app
+
+# Create runtime directories owned by appuser and make the scripts executable
+RUN mkdir -p "/app/logs" "/app/tmp" "/local/models" && \
+    chown -R appuser:appuser "/app/logs" "/app/tmp" "/local/models" && \
+    chmod +x /docker-entrypoint.sh && \
+    chmod +x /app/download_with_progress.py && \
+    chmod +x /app/list_spacy_models.py
+
+# Set temp directories to use app directory instead of system /tmp
+ENV TMPDIR="/app/tmp"
+ENV TEMP="/app/tmp"
+ENV TMP="/app/tmp"
+
+# Switch to non-root user
+USER appuser
+
+# Define the entry point
+ENTRYPOINT ["/docker-entrypoint.sh"]
diff --git a/Dockerfile.with-models b/Dockerfile.with-models
index 9d71974..96a0775 100644
--- a/Dockerfile.with-models
+++ b/Dockerfile.with-models
@@ -55,19 +55,22 @@
     && rm -rf /var/lib/apt/lists/* \
     && apt-get clean
 
-# Copy virtual environment from builder
-COPY --from=builder /app/venv /app/venv
+# Add non-root user FIRST: the COPY --chown=appuser:appuser steps below need the user to exist
+RUN groupadd -r appuser && useradd -r -g appuser appuser
 
-# Copy pre-downloaded models
-COPY --from=builder /local/models /local/models
+# Copy virtual environment from builder and set ownership immediately
+COPY --from=builder --chown=appuser:appuser /app/venv /app/venv
 
-# Copy application code
-COPY lib /app/lib
-COPY systems /app/systems
-COPY my_utils /app/my_utils
-COPY docker-entrypoint.sh /docker-entrypoint.sh
-COPY download_with_progress.py /app/download_with_progress.py
-COPY list_spacy_models.py /app/list_spacy_models.py
+# Copy pre-downloaded models with correct ownership
+COPY --from=builder --chown=appuser:appuser /local/models /local/models
+
+# Copy application code with correct ownership
+COPY --chown=appuser:appuser lib /app/lib
+COPY --chown=appuser:appuser systems /app/systems
+COPY --chown=appuser:appuser my_utils /app/my_utils
+COPY --chown=appuser:appuser download_with_progress.py /app/download_with_progress.py
+COPY --chown=appuser:appuser list_spacy_models.py /app/list_spacy_models.py
+COPY --chown=appuser:appuser docker-entrypoint.sh /docker-entrypoint.sh
 
 # Set environment variables
 ENV VIRTUAL_ENV=/app/venv
@@ -84,24 +87,19 @@
 ENV SPACY_CHUNK_SIZE="20000"
 
 WORKDIR /app
-RUN mkdir -p "/app/logs" "/app/tmp"
+
+# Create runtime directories owned by appuser and make the scripts executable
+RUN mkdir -p "/app/logs" "/app/tmp" && \
+    chown -R appuser:appuser "/app/logs" "/app/tmp" && \
+    chmod +x /docker-entrypoint.sh && \
+    chmod +x /app/download_with_progress.py && \
+    chmod +x /app/list_spacy_models.py
 
 # Set temp directories to use app directory instead of system /tmp
 ENV TMPDIR="/app/tmp"
 ENV TEMP="/app/tmp"
 ENV TMP="/app/tmp"
 
-# Add non-root user
-RUN groupadd -r appuser && useradd -r -g appuser appuser
-
-# Make entrypoint executable and set permissions
-RUN chmod +x /docker-entrypoint.sh && \
-    chmod +x /app/download_with_progress.py && \
-    chmod +x /app/list_spacy_models.py
-
-# Change ownership of app directories to appuser
-RUN chown -R appuser:appuser /app /local /docker-entrypoint.sh
-
 # Switch to non-root user
 USER appuser
 
diff --git a/Makefile b/Makefile
index 5d1b22d..9a54a7e 100644
--- a/Makefile
+++ b/Makefile
@@ -1,8 +1,12 @@
-.PHONY: build build-with-models preload-models run test clean
+.PHONY: build build-slim build-with-models preload-models run test clean
 
 build:
 	docker build -t korap/conllu-spacy:latest .
 
+build-slim:
+	docker build -f Dockerfile.slim -t korap/conllu-spacy:slim .
+	@echo "Slim build complete (without GermaLemma, saves ~180MB)"
+
 build-with-models:
 	docker build -f Dockerfile.with-models -t korap/conllu-spacy:with-models .
 
diff --git a/systems/parse_spacy_pipe.py b/systems/parse_spacy_pipe.py
index 0abfb33..8736669 100644
--- a/systems/parse_spacy_pipe.py
+++ b/systems/parse_spacy_pipe.py
@@ -5,7 +5,14 @@
 import logging, sys, time, signal
 from lib.CoNLL_Annotation import get_token_type
 import my_utils.file_utils as fu
-from germalemma import GermaLemma
+
+# GermaLemma is optional (e.g. absent from the slim image); record whether it can be imported
+try:
+    from germalemma import GermaLemma
+    GERMALEMMA_AVAILABLE = True
+except ImportError:
+    GERMALEMMA_AVAILABLE = False
+    GermaLemma = None
 
 # Dependency parsing safety limits
 DEFAULT_PARSE_TIMEOUT = 0.5  # seconds per sentence
@@ -236,21 +243,31 @@
 	
 	spacy_de = spacy.load(args.spacy_model, disable=disabled_components)
 	spacy_de.tokenizer = WhitespaceTokenizer(spacy_de.vocab) # We won't re-tokenize to respect how the source CoNLL are tokenized!
-	
+
 	# Increase max_length to handle very long sentences (especially when parser is disabled)
 	spacy_de.max_length = 10000000  # 10M characters
-	
-	lemmatizer = GermaLemma()
+
+	# Initialize GermaLemma if available and requested
+	lemmatizer = None
+	if args.use_germalemma == "True":
+		if GERMALEMMA_AVAILABLE:
+			lemmatizer = GermaLemma()
+		else:
+			logger.warning("GermaLemma requested but not available. Using spaCy lemmatizer instead.")
+			args.use_germalemma = "False"
 	
 	# Log version information
 	logger.info(f"spaCy version: {spacy.__version__}")
 	logger.info(f"spaCy model: {args.spacy_model}")
 	logger.info(f"spaCy model version: {spacy_de.meta.get('version', 'unknown')}")
-	try:
-		import germalemma
-		logger.info(f"GermaLemma version: {germalemma.__version__}")
-	except AttributeError:
-		logger.info("GermaLemma version: unknown (no __version__ attribute)")
+	if GERMALEMMA_AVAILABLE:
+		try:
+			import germalemma
+			logger.info(f"GermaLemma version: {germalemma.__version__}")
+		except AttributeError:
+			logger.info("GermaLemma version: unknown (no __version__ attribute)")
+	else:
+		logger.info("GermaLemma: not installed")
 	
 	# Parse timeout and sentence length limits from environment variables
 	parse_timeout = float(os.getenv("SPACY_PARSE_TIMEOUT", str(DEFAULT_PARSE_TIMEOUT)))