#!/bin/bash
#
# Makes sure the requested spaCy model is available (preloaded under
# /local/models, installed in the Python environment, or downloaded on
# demand) and then runs /app/systems/parse_spacy_pipe.py with it.
#
# Run with -h for the list of supported options.

# A pipeline fails when any of its stages fails, not only the last one.
set -o pipefail

# Default values (each can be changed by a command line option)
model="de_core_news_lg"   # -m MODEL
use_dependencies="True"   # -d sets this to "False"
use_germalemma="True"     # -g sets this to "False"

#######################################
# Print the command line help and terminate the script.
# Globals:   model (read, shown as the default of -m)
# Arguments: $1 - exit status to terminate with (optional, default 1)
# Outputs:   the help text; on stdout when the status is 0, otherwise on
#            stderr so that it cannot end up in redirected program output
# Returns:   never returns, always exits
#######################################
usage() {
  local status="${1:-1}"
  local stream=2

  if [ "$status" = "0" ]; then
    stream=1
  fi

  {
    echo "Usage: $0 [-h] [-m MODEL] [-L] [-V] [-d] [-g]"
    echo " -h Display this help message"
    echo " -m MODEL Specify spaCy model (default: $model)"
    echo " -L List available/installed models"
    echo " -V Display spaCy version information"
    echo " -d Disable dependency parsing (faster processing)"
    echo " -g Disable GermaLemma (use spaCy lemmatizer only)"
  } >&"$stream"

  exit "$status"
}

#######################################
# Report the models that can be used (implements -L).
# Outputs: installed model names and valid directories below /local/models
#          on stderr, followed by whatever /app/list_spacy_models.py prints.
#######################################
list_models() {
  local local_models_dir="/local/models"
  local installed pkg entry

  echo "=== Installed Models ===" >&2

  # List models installed in venv: distributions whose key ends in a model
  # size suffix and does not start with "spacy". Errors are discarded, so a
  # failing interpreter is reported as "no models installed".
  # NOTE(review): pkg_resources is deprecated by setuptools in favour of
  # importlib.metadata - confirm it is still shipped in the image.
  installed=$(python -c "import spacy; import pkg_resources; print('\n'.join([pkg.key for pkg in pkg_resources.working_set if pkg.key.endswith(('-sm', '-md', '-lg', '-trf')) and not pkg.key.startswith('spacy')]))" 2>/dev/null)

  if [ -n "$installed" ]; then
    while IFS= read -r pkg; do
      # Convert package name to model name (e.g., de-core-news-lg -> de_core_news_lg)
      echo " ${pkg//-/_}" >&2
    done <<< "$installed"
  else
    echo " No models installed in venv" >&2
  fi

  # Check for models in /local/models
  if [ -d "$local_models_dir" ] && [ -n "$(ls -A "$local_models_dir" 2>/dev/null)" ]; then
    echo "" >&2
    echo "Models in /local/models:" >&2
    # Glob instead of parsing ls output; only directories that contain a
    # config.cfg are listed.
    for entry in "$local_models_dir"/*; do
      if [ -f "$entry/config.cfg" ]; then
        echo " ${entry##*/}" >&2
      fi
    done
  fi

  echo "" >&2

  # Show available models list
  python /app/list_spacy_models.py
}

#######################################
# Print version information on stderr (implements -V).
#######################################
show_version_info() {
  echo "=== Version Information ===" >&2
  echo "conllu-spacy-docker version: 3.8.11-1" >&2
  python -c "import spacy; print(f'spaCy version: {spacy.__version__}')" >&2

  # Check for GermaLemma
  python -c "try:
    import germalemma
    try:
        print(f'GermaLemma version: {germalemma.__version__}')
    except AttributeError:
        print('GermaLemma: installed (version unknown)')
except ImportError:
    print('GermaLemma: not installed')" >&2

  # Show Python version
  python -c "import sys; print(f'Python version: {sys.version.split()[0]}')" >&2
}

#######################################
# Parse command line options.
# Globals:   model, use_dependencies, use_germalemma (written)
#            OPTIND (left pointing behind the last parsed option)
# Arguments: the script's command line ("$@")
# Returns:   0 when parsing succeeded; exits the script for -h, -L, -V and
#            for invalid usage
#######################################
parse_options() {
  local opt

  # The leading ':' selects getopts' silent mode: OPTARG then holds the
  # offending option letter and a missing argument is reported as ':'
  # instead of being folded into '?'.
  while getopts ":hm:LVdg" opt; do
    case "$opt" in
      h)
        # Help was requested explicitly, so ask for a success status.
        usage 0
        ;;
      m)
        model="$OPTARG"
        ;;
      L)
        list_models
        exit 0
        ;;
      V)
        show_version_info
        exit 0
        ;;
      d)
        use_dependencies="False"
        ;;
      g)
        use_germalemma="False"
        ;;
      \?)
        echo "Invalid option: -$OPTARG" >&2
        usage
        ;;
      :)
        echo "Option -$OPTARG requires an argument" >&2
        usage
        ;;
    esac
  done

  # Anything left after the options is a positional argument, which this
  # script does not accept.
  if (( OPTIND <= $# )); then
    usage
  fi
}

parse_options "$@"

# Directory in which models are kept, and the location of the selected model
# inside it.
MODEL_DIR="/local/models"
MODEL_PATH="$MODEL_DIR/$model"

# Ensure MODEL_DIR exists
mkdir -p "$MODEL_DIR"

#######################################
# Function to check if model is installed and usable.
# Arguments: $1 - model name handed to spacy.load()
# Outputs:   nothing; the interpreter's stderr is discarded
# Returns:   the exit status of the Python interpreter (0 when spacy.load()
#            succeeded)
#######################################
is_model_installed() {
  local model_name="$1"
  # The name is passed as a separate argument and read from sys.argv rather
  # than pasted into the Python source, so quotes or other special
  # characters in it cannot change the code that is executed.
  python -c 'import sys, spacy; spacy.load(sys.argv[1])' "$model_name" 2>/dev/null
}

#######################################
# Function to check if preloaded model exists and is valid.
# Arguments: $1 - directory expected to hold the model
# Returns:   0 when $1/config.cfg is a regular file (the marker used here for
#            a valid spaCy model), 1 otherwise
#######################################
has_preloaded_model() {
  local model_path="$1"
  # An empty path is rejected explicitly; it would otherwise test
  # /config.cfg in the filesystem root.
  [[ -n "$model_path" && -f "$model_path/config.cfg" ]]
}

#######################################
# Run the downloader for one model, mirroring its combined output to stderr
# and to /tmp/spacy_download.log.
# Arguments: $1 - model name
# Returns:   the downloader's exit status
#######################################
_download_model() {
  local model_name="$1"

  python /app/download_with_progress.py "$model_name" 2>&1 \
    | tee /tmp/spacy_download.log >&2
  # Use the downloader's own status so the result does not depend on tee or
  # on `set -o pipefail` being active.
  return "${PIPESTATUS[0]}"
}

#######################################
# Copy a model package that was just installed into site-packages to a
# directory of its own, so that it can be loaded from there later.
# Arguments: $1 - model name, $2 - target directory
# Returns:   0 always; persisting is best effort because the installed
#            package stays usable either way
#######################################
_persist_model() {
  local model_name="$1"
  local target_dir="$2"
  local site_packages package_dir versioned_dir

  site_packages=$(python -c "import site; print(site.getsitepackages()[0])")
  package_dir="$site_packages/$model_name"

  if [ ! -d "$package_dir" ]; then
    return 0
  fi

  echo "Extracting model to $target_dir for persistence..." >&2

  # Find the actual model directory (e.g., de_core_news_lg-3.8.0)
  versioned_dir=$(find "$package_dir" -maxdepth 1 -type d -name "${model_name}-*" | head -n 1)

  if [ -n "$versioned_dir" ] && [ -f "$versioned_dir/config.cfg" ]; then
    # Copy the versioned model directory contents to the target; "/." also
    # picks up hidden files, which "/*" would skip.
    mkdir -p "$target_dir"
    if cp -r "$versioned_dir"/. "$target_dir/"; then
      # Set permissions so user can modify the model files
      chmod -R a+rwX "$target_dir" 2>/dev/null || true
      echo "Model extracted to $target_dir" >&2
    else
      echo "Warning: Could not copy model to $target_dir, using installed package" >&2
      # Remove the marker file so that a partial copy is never mistaken for
      # a valid preloaded model on a later run.
      rm -f -- "$target_dir/config.cfg"
    fi
  else
    # Fallback: keep a copy of the whole package. It is copied rather than
    # moved because moving it out of site-packages would make the model
    # that was just installed impossible to import.
    echo "Warning: Could not find versioned model directory, copying package as-is" >&2
    cp -r "$package_dir" "$target_dir" 2>/dev/null || true
    chmod -R a+rwX "$target_dir" 2>/dev/null || true
  fi

  return 0
}

#######################################
# Function to install model.
# Nothing is downloaded when the model is already preloaded below MODEL_DIR
# or can be loaded from the Python environment.
# Globals:   MODEL_DIR (read)
# Arguments: $1 - model name
# Outputs:   progress and diagnostics on stderr
# Returns:   0 when the model is available afterwards, 1 when the download
#            failed
#######################################
install_model() {
  local model_name="$1"
  local model_path="$MODEL_DIR/$model_name"
  local persist=1

  # Check if model exists in MODEL_DIR - if so, the absolute path is used
  if has_preloaded_model "$model_path"; then
    echo "Found preloaded model in $model_path" >&2
    echo "Will use absolute path to avoid download" >&2
    return 0
  fi

  # Check if already installed in venv
  if is_model_installed "$model_name"; then
    echo "Model $model_name already installed in venv" >&2
    return 0
  fi

  if [ ! -w "$MODEL_DIR" ]; then
    # MODEL_DIR not writable, install to venv only (ephemeral)
    persist=0
    echo "Cannot write to $MODEL_DIR, installing to venv (ephemeral)" >&2
  fi

  if ! _download_model "$model_name"; then
    echo "Failed to download model $model_name" >&2
    return 1
  fi

  if (( persist )); then
    _persist_model "$model_name" "$model_path"
  fi
  return 0
}

# Install or verify model; nothing can be tagged without one.
if ! install_model "$model"; then
  echo "ERROR: Could not install model $model, aborting." >&2
  exit 1
fi

# Determine which model path to use
# If preloaded model exists, use absolute path; otherwise use model name
if has_preloaded_model "$MODEL_PATH"; then
  MODEL_TO_USE="$MODEL_PATH"
  echo "Using preloaded model at: $MODEL_TO_USE" >&2
else
  MODEL_TO_USE="$model"
  echo "Using installed model: $MODEL_TO_USE" >&2
fi

# Set environment variables for the Python script
export SPACY_USE_DEPENDENCIES="$use_dependencies"
export SPACY_USE_GERMALEMMA="$use_germalemma"

# Log configuration (stderr only)
echo "Configuration:" >&2
echo " Model: $MODEL_TO_USE" >&2
echo " Use dependencies: $use_dependencies" >&2
echo " Use GermaLemma: $use_germalemma" >&2

# Run the spaCy tagging pipeline; as the last command, its exit status
# becomes the exit status of this script.
python /app/systems/parse_spacy_pipe.py \
  --spacy_model "$MODEL_TO_USE" \
  --corpus_name "stdin" \
  --gld_token_type "CoNLLUP_Token" \
  --comment_str "#"