Initial import
Change-Id: I6315233ee1bfbdf7cc985cb336d0df7a10274189
diff --git a/docker-entrypoint.sh b/docker-entrypoint.sh
new file mode 100755
index 0000000..1ada4a7
--- /dev/null
+++ b/docker-entrypoint.sh
@@ -0,0 +1,158 @@
+#!/bin/bash
+# Entrypoint: makes a spaCy model available, then runs the tagging pipeline.
+set -o pipefail # a pipeline now fails if any of its stages fails
+
+# Default values; overridden by the -m, -d and -g options respectively.
+model="de_core_news_lg"
+use_dependencies="True"
+use_germalemma="True"
+
+usage() { # Print the option summary on stdout, then exit the script with status 1.
+  printf '%s\n' "Usage: $0 [-h] [-m MODEL] [-L] [-d] [-g]" \
+    " -h Display this help message" \
+    " -m MODEL Specify spaCy model (default: $model)" \
+    " -L List available/installed models" \
+    " -d Disable dependency parsing (faster processing)" \
+    " -g Disable GermaLemma (use spaCy lemmatizer only)"
+  exit 1
+}
+
+# Parse command line options
+while getopts "hm:Ldg" opt; do
+ case $opt in
+ h)
+ usage
+ ;;
+ m)
+ model="$OPTARG"
+ ;;
+ L)
+ python -m spacy info 2>/dev/null || echo "No models installed"
+ exit 0
+ ;;
+ d)
+ use_dependencies="False"
+ ;;
+ g)
+ use_germalemma="False"
+ ;;
+ \?)
+ echo "Invalid option: -$OPTARG" >&2
+ usage
+ ;;
+ :)
+ echo "Option -$OPTARG requires an argument" >&2
+ usage
+ ;;
+ esac
+done
+
+if [ $OPTIND -le $# ]; then
+ usage
+fi
+
+MODEL_DIR=/local/models
+MODEL_PATH="${MODEL_DIR}/${model}"
+
+# Make sure the model directory is present before it is inspected or written.
+mkdir -p "${MODEL_DIR}"
+
+# Function to check if model is installed and usable
+is_model_installed() {
+ local model_name="$1"
+ # Check if model is installed in the venv
+ python -c "import spacy; spacy.load('$model_name')" 2>/dev/null
+ return $?
+}
+
+# Function to check if preloaded model exists and is valid
+has_preloaded_model() {
+ local model_path="$1"
+ # Check for config.cfg which indicates a valid spaCy model
+ if [ -f "$model_path/config.cfg" ]; then
+ return 0
+ fi
+ return 1
+}
+
+# Function to install model
+install_model() {
+ local model_name="$1"
+
+ # Check if model exists in /local/models - if so, we'll use absolute path
+ if has_preloaded_model "$MODEL_PATH"; then
+ echo "Found preloaded model in $MODEL_PATH" >&2
+ echo "Will use absolute path to avoid download" >&2
+ return 0
+ fi
+
+ # Check if already installed in venv
+ if is_model_installed "$model_name"; then
+ echo "Model $model_name already installed in venv" >&2
+ return 0
+ fi
+
+ # Try to download model to /local/models if writable
+ if [ -w "$MODEL_DIR" ]; then
+ # Download and install to /local/models with progress
+ if python /app/download_with_progress.py "$model_name" 2>&1 | tee /tmp/spacy_download.log >&2; then
+ # Try to move the installed model to /local/models for persistence
+ SITE_PACKAGES=$(python -c "import site; print(site.getsitepackages()[0])")
+ INSTALLED_MODEL="$SITE_PACKAGES/$model_name"
+
+ if [ -d "$INSTALLED_MODEL" ]; then
+ echo "Moving model to $MODEL_PATH for persistence..." >&2
+ mv "$INSTALLED_MODEL" "$MODEL_PATH" 2>/dev/null || true
+ # Create symlink back
+ ln -sf "$MODEL_PATH" "$INSTALLED_MODEL" 2>/dev/null || true
+ echo "Model saved to $MODEL_PATH" >&2
+ fi
+ return 0
+ else
+ echo "Failed to download model $model_name" >&2
+ return 1
+ fi
+ else
+ # MODEL_DIR not writable, install to venv (ephemeral)
+ echo "Cannot write to $MODEL_DIR, installing to venv (ephemeral)" >&2
+ if python /app/download_with_progress.py "$model_name" 2>&1 | tee /tmp/spacy_download.log >&2; then
+ return 0
+ else
+ echo "Failed to download model $model_name" >&2
+ return 1
+ fi
+ fi
+}
+
+# Abort early when the model can be neither found nor installed.
+install_model "$model" || {
+  echo "ERROR: Could not install model $model, aborting." >&2
+  exit 1
+}
+
+# Pick what is handed to the pipeline: the absolute path of a preloaded model
+# when one exists, the plain model name otherwise.
+if ! has_preloaded_model "$MODEL_PATH"; then
+  MODEL_TO_USE="$model"
+  echo "Using installed model: $MODEL_TO_USE" >&2
+else
+  MODEL_TO_USE="$MODEL_PATH"
+  echo "Using preloaded model at: $MODEL_TO_USE" >&2
+fi
+
+# Hand the option flags to the Python script through its environment.
+export SPACY_USE_DEPENDENCIES="$use_dependencies"
+export SPACY_USE_GERMALEMMA="$use_germalemma"
+
+# Log the effective configuration on stderr.
+echo "Configuration:" >&2
+echo " Model: $MODEL_TO_USE" >&2
+echo " Use dependencies: $use_dependencies" >&2
+echo " Use GermaLemma: $use_germalemma" >&2
+
+# Run the spaCy tagging pipeline; exec lets it replace this shell and receive signals directly.
+exec python /app/systems/parse_spacy_pipe.py \
+  --spacy_model "$MODEL_TO_USE" \
+  --corpus_name "stdin" \
+  --gld_token_type "CoNLLUP_Token" \
+  --comment_str "#"