blob: 96a0775acb7dc227c74ed559c4c5a85245676461 [file] [log] [blame]
# Dockerfile with pre-installed models
# Build: docker build -f Dockerfile.with-models -t korap/conllu-spacy:with-models .
# Multi-stage Docker build for size optimization
FROM python:3.12-slim-bookworm AS builder
# Install the C/C++ toolchain (presumably for Python packages that have to be
# compiled from source during `pip install` -- verify against requirements.txt).
# --no-install-recommends stops apt from pulling in optional packages;
# libc6-dev is therefore listed explicitly so the C library headers stay
# available to the compilers. The package index is removed in the same layer.
RUN apt-get update && apt-get install -y --no-install-recommends \
        g++ \
        gcc \
        libc6-dev \
    && rm -rf /var/lib/apt/lists/*
# Build-stage environment.
# PIP_CACHE_DIR points pip's download cache at /tmp/.cache/pip.
# PYTHONPATH adds the current working directory to Python's module search
# path. A plain "." is used on purpose: the string "PYTHONPATH:." is not a
# variable expansion and would add a directory literally named "PYTHONPATH".
ENV PIP_CACHE_DIR="/tmp/.cache/pip" \
    PYTHONPATH="."
# Location of the virtual environment; its bin/ directory is placed first on
# PATH so its executables take precedence once the venv exists.
ENV VIRTUAL_ENV=/app/venv
ENV PATH="$VIRTUAL_ENV/bin:$PATH"
# Work inside /app and bring in only the dependency manifest.
WORKDIR /app
COPY requirements.txt ./
# Create the virtual environment at $VIRTUAL_ENV (/app/venv), upgrade its pip,
# then install the pinned project requirements into it.
RUN python -m venv "$VIRTUAL_ENV" \
    && "$VIRTUAL_ENV/bin/pip" install --upgrade pip \
    && "$VIRTUAL_ENV/bin/pip" install -r requirements.txt
# Pre-install the default spaCy model (de_core_news_lg):
#   1. create the /local/models target directory,
#   2. download the model package into the virtual environment,
#   3. move the installed package directory out of site-packages into
#      /local/models, from where the production stage copies it.
RUN mkdir -p /local/models \
    && venv/bin/python -m spacy download de_core_news_lg --no-cache-dir \
    && SITE_PACKAGES="$(venv/bin/python -c "import site; print(site.getsitepackages()[0])")" \
    && mv "${SITE_PACKAGES}/de_core_news_lg" /local/models/de_core_news_lg
# Optional additional models.
# Uncomment to include the medium model:
# RUN venv/bin/python -m spacy download de_core_news_md --no-cache-dir && \
# MODEL_PATH=$(venv/bin/python -c "import site; print(site.getsitepackages()[0] + '/de_core_news_md')") && \
# mv "$MODEL_PATH" /local/models/de_core_news_md
# Uncomment to include the small model:
# RUN venv/bin/python -m spacy download de_core_news_sm --no-cache-dir && \
# MODEL_PATH=$(venv/bin/python -c "import site; print(site.getsitepackages()[0] + '/de_core_news_sm')") && \
# mv "$MODEL_PATH" /local/models/de_core_news_sm
# Production stage
FROM python:3.12-slim-bookworm AS production
# Install minimal runtime dependencies.
# --no-install-recommends keeps optional packages out of the final image.
# ca-certificates is only a recommended package of wget, so it is listed
# explicitly to keep HTTPS certificate verification available.
# The apt caches and package index are removed in the same layer.
RUN apt-get update && apt-get install -y --no-install-recommends \
        ca-certificates \
        wget \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*
# Create the unprivileged system account up front so the COPY --chown
# instructions that follow can refer to it by name.
RUN groupadd --system appuser \
    && useradd --system --gid appuser appuser
# Artifacts produced by the builder stage, owned by appuser from the start:
# the virtual environment ...
COPY --from=builder --chown=appuser:appuser /app/venv /app/venv
# ... and the pre-downloaded spaCy models.
COPY --from=builder --chown=appuser:appuser /local/models /local/models
# Application packages from the build context.
COPY --chown=appuser:appuser lib /app/lib
COPY --chown=appuser:appuser systems /app/systems
COPY --chown=appuser:appuser my_utils /app/my_utils
# Helper scripts placed directly under /app.
COPY --chown=appuser:appuser download_with_progress.py list_spacy_models.py /app/
# Container entry point script, installed at the filesystem root.
COPY --chown=appuser:appuser docker-entrypoint.sh /docker-entrypoint.sh
# Runtime environment.
# The virtual environment's bin/ directory goes first on PATH so its python
# and scripts are the ones that run.
ENV VIRTUAL_ENV=/app/venv
ENV PATH="$VIRTUAL_ENV/bin:$PATH"
# Add the current working directory to Python's module search path. A plain
# "." is used on purpose: the string "PYTHONPATH:." is not a variable
# expansion and would add a directory literally named "PYTHONPATH".
ENV PYTHONPATH="."
# spaCy processing configuration defaults (presumably read by the application
# code -- verify against the entrypoint); override with `docker run -e ...`.
ENV SPACY_USE_DEPENDENCIES="True" \
    SPACY_USE_GERMALEMMA="True" \
    SPACY_PARSE_TIMEOUT="30" \
    SPACY_MAX_SENTENCE_LENGTH="500" \
    SPACY_N_PROCESS="10" \
    SPACY_BATCH_SIZE="2000" \
    SPACY_CHUNK_SIZE="20000"
WORKDIR /app
# Prepare writable log/temp directories for appuser and mark the entry point
# and helper scripts as executable (done while still running as root).
RUN mkdir -p "/app/logs" "/app/tmp" \
    && chown -R appuser:appuser "/app/logs" "/app/tmp" \
    && chmod +x \
        /docker-entrypoint.sh \
        /app/download_with_progress.py \
        /app/list_spacy_models.py
# Point the common temp-directory variables at /app/tmp rather than the
# system-wide /tmp.
ENV TMPDIR="/app/tmp" \
    TEMP="/app/tmp" \
    TMP="/app/tmp"
# Drop root privileges for everything that runs in the container.
USER appuser
# Exec-form entry point: the script is started directly, without a shell wrapper.
ENTRYPOINT ["/docker-entrypoint.sh"]