blob: 639e371412680c837abcc949ea7d54c1724ba301 [file] [log] [blame]
#!/bin/bash
#
# Wrapper around /app/systems/parse_spacy_pipe.py: parses the command line,
# makes sure the requested spaCy model is available and then starts the
# tagging pipeline with it.

# Let a pipeline report failure when any of its stages fails.
set -o pipefail

# Defaults; -m, -d and -g on the command line override them.
model='de_core_news_lg'
use_dependencies='True'
use_germalemma='True'
9
#######################################
# Print the command line synopsis and terminate the script.
# Globals:   model (read) - shown as the default for -m
# Arguments: none
# Outputs:   help text on stdout
# Returns:   never returns; exits the shell with status 1
#######################################
usage() {
  printf '%s\n' \
    "Usage: $0 [-h] [-m MODEL] [-L] [-V] [-d] [-g]" \
    " -h Display this help message" \
    " -m MODEL Specify spaCy model (default: $model)" \
    " -L List available/installed models" \
    " -V Display spaCy version information" \
    " -d Disable dependency parsing (faster processing)" \
    " -g Disable GermaLemma (use spaCy lemmatizer only)"
  exit 1
}
20
# Parse command line options.
# The leading ':' in the option string switches getopts to silent error
# reporting: a missing option argument is then returned as ':' and an unknown
# option as '?', each with the offending letter in OPTARG, which is what the
# two diagnostic arms at the end of the case statement rely on.
while getopts ":hm:LVdg" opt; do
  case "$opt" in
    h)
      # usage terminates the shell with status 1; run it in a subshell so an
      # explicit request for help still finishes with status 0.
      (usage)
      exit 0
      ;;
    m)
      model="$OPTARG"
      ;;
    L)
      echo "=== Installed Models ===" >&2

      # List models installed in venv: distribution names ending in a spaCy
      # size suffix, excluding packages whose name starts with "spacy".
      # NOTE(review): pkg_resources is deprecated by setuptools; because
      # stderr is discarded, an import failure here looks exactly like
      # "no models installed" - consider importlib.metadata.
      installed_pkgs=$(python -c "import spacy; import pkg_resources; print('\n'.join([pkg.key for pkg in pkg_resources.working_set if pkg.key.endswith(('-sm', '-md', '-lg', '-trf')) and not pkg.key.startswith('spacy')]))" 2>/dev/null)

      if [ -n "$installed_pkgs" ]; then
        # A dedicated loop variable keeps the global "model" untouched.
        while IFS= read -r pkg; do
          # Convert package name to model name (e.g., de-core-news-lg -> de_core_news_lg)
          echo " ${pkg//-/_}" >&2
        done <<<"$installed_pkgs"
      else
        echo " No models installed in venv" >&2
      fi

      # Check for models in /local/models
      if [ -d "/local/models" ] && [ -n "$(ls -A /local/models 2>/dev/null)" ]; then
        echo "" >&2
        echo "Models in /local/models:" >&2
        # Glob instead of parsing ls output so unusual names survive intact;
        # only directories holding a config.cfg are reported.
        for entry in /local/models/*; do
          if [ -f "$entry/config.cfg" ]; then
            echo " ${entry##*/}" >&2
          fi
        done
      fi

      echo "" >&2

      # Show available models list
      python /app/list_spacy_models.py
      exit 0
      ;;
    V)
      echo "=== spaCy Version Information ===" >&2
      python -c "import spacy; print(f'spaCy version: {spacy.__version__}')" >&2

      # Check for GermaLemma. The Python code sits inside a shell string, so
      # its indentation is significant and starts at column 0.
      python -c "try:
    import germalemma
    try:
        print(f'GermaLemma version: {germalemma.__version__}')
    except AttributeError:
        print('GermaLemma: installed (version unknown)')
except ImportError:
    print('GermaLemma: not installed')" >&2

      # Show Python version
      python -c "import sys; print(f'Python version: {sys.version.split()[0]}')" >&2

      exit 0
      ;;
    d)
      use_dependencies="False"
      ;;
    g)
      use_germalemma="False"
      ;;
    \?)
      echo "Invalid option: -$OPTARG" >&2
      usage
      ;;
    :)
      echo "Option -$OPTARG requires an argument" >&2
      usage
      ;;
  esac
done
98
# Anything left over after option parsing is an unexpected positional
# argument: show the help text and stop.
if (( OPTIND <= $# )); then
  usage
fi

# Directory holding persisted models, and the location of the selected one.
MODEL_DIR='/local/models'
MODEL_PATH="${MODEL_DIR}/${model}"

# Ensure MODEL_DIR exists
mkdir -p "${MODEL_DIR}"
108
#######################################
# Check whether a model can be loaded by spaCy in the current environment.
# Arguments: $1 - model name handed to spacy.load()
# Outputs:   nothing; the interpreter's error output is discarded
# Returns:   exit status of the Python interpreter - 0 if spacy.load()
#            succeeded, non-zero otherwise
#######################################
is_model_installed() {
  local model_name="$1"
  # Pass the name as an argument rather than pasting it into the Python
  # source, so quotes or backslashes in it cannot change the code that runs.
  python -c 'import sys, spacy; spacy.load(sys.argv[1])' "$model_name" 2>/dev/null
}
116
#######################################
# Tell whether a directory looks like a usable spaCy model.
# Arguments: $1 - directory to inspect
# Returns:   0 if a regular file config.cfg exists directly inside it,
#            1 otherwise
#######################################
has_preloaded_model() {
  local candidate_dir="$1"
  # config.cfg is the marker used to recognise a valid spaCy model directory.
  [ -f "${candidate_dir}/config.cfg" ] || return 1
  return 0
}
126
#######################################
# Run /app/download_with_progress.py for one model, mirroring its combined
# output to stderr and to /tmp/spacy_download.log.
# Arguments: $1 - model name
# Returns:   0 only if both the script and the log writer succeeded. The
#            stage statuses come from PIPESTATUS, so the result does not
#            depend on `set -o pipefail` being active.
#######################################
_download_model() {
  local model_name="$1"
  python /app/download_with_progress.py "$model_name" 2>&1 \
    | tee /tmp/spacy_download.log >&2
  local -a stage_status=("${PIPESTATUS[@]}")
  (( stage_status[0] == 0 && stage_status[1] == 0 ))
}

#######################################
# Copy a freshly installed model out of site-packages into its persistent
# directory. Best effort: problems are reported as warnings only, because
# the model stays loadable from the venv.
# Arguments: $1 - model name
#            $2 - destination directory for the flattened model
# Returns:   always 0
#######################################
_persist_model() {
  local model_name="$1"
  local model_path="$2"
  local site_packages installed_pkg candidate versioned_dir=""

  site_packages=$(python -c "import site; print(site.getsitepackages()[0])")
  installed_pkg="$site_packages/$model_name"
  # Nothing to extract if the package directory is not where we expect it.
  [ -d "$installed_pkg" ] || return 0

  echo "Extracting model to $model_path for persistence..." >&2

  # Find the actual model directory (e.g., de_core_news_lg-3.8.0): the first
  # versioned subdirectory that carries a config.cfg.
  for candidate in "$installed_pkg/$model_name"-*/; do
    if [ -f "${candidate}config.cfg" ]; then
      versioned_dir="${candidate%/}"
      break
    fi
  done

  if [ -z "$versioned_dir" ]; then
    # Moving the package out of site-packages would leave spaCy unable to
    # import it, so the installation is left where it is.
    echo "Warning: Could not find versioned model directory, keeping model in venv only" >&2
    return 0
  fi

  if mkdir -p "$model_path" && cp -r "$versioned_dir"/* "$model_path/"; then
    # Set permissions so user can modify the model files
    chmod -R a+rwX "$model_path" 2>/dev/null || true
    echo "Model extracted to $model_path" >&2
  else
    echo "Warning: Could not copy model to $model_path, keeping model in venv only" >&2
    # Drop the marker file so an incomplete copy is never mistaken for a
    # valid preloaded model.
    rm -f -- "$model_path/config.cfg"
  fi
  return 0
}

#######################################
# Make sure a spaCy model is available, downloading it if necessary.
# Order of checks: preloaded copy under MODEL_DIR, then an installation that
# spaCy can already load, then a download via
# /app/download_with_progress.py (presumably installs into the venv's
# site-packages - confirm against that script). After a successful download
# the model is copied to MODEL_DIR when that directory is writable.
# Globals:   MODEL_DIR (read)
# Arguments: $1 - model name
# Outputs:   progress and diagnostics on stderr
# Returns:   0 if the model is available, 1 if the download failed
#######################################
install_model() {
  local model_name="$1"
  # Derived from the argument so the check always matches the model asked for.
  local model_path="$MODEL_DIR/$model_name"
  local persist="no"

  # Check if model exists in MODEL_DIR - if so, the absolute path is used
  if has_preloaded_model "$model_path"; then
    echo "Found preloaded model in $model_path" >&2
    echo "Will use absolute path to avoid download" >&2
    return 0
  fi

  # Check if already installed in venv
  if is_model_installed "$model_name"; then
    echo "Model $model_name already installed in venv" >&2
    return 0
  fi

  if [ -w "$MODEL_DIR" ]; then
    persist="yes"
  else
    # MODEL_DIR not writable, install to venv (ephemeral)
    echo "Cannot write to $MODEL_DIR, installing to venv (ephemeral)" >&2
  fi

  if ! _download_model "$model_name"; then
    echo "Failed to download model $model_name" >&2
    return 1
  fi

  if [ "$persist" = "yes" ]; then
    _persist_model "$model_name" "$model_path"
  fi
  return 0
}
188
# Install or verify model; without a usable model there is nothing to run.
install_model "$model" || {
  echo "ERROR: Could not install model $model, aborting." >&2
  exit 1
}

# Decide what to hand to spaCy: the absolute path of a preloaded model when
# one exists, otherwise the plain model name.
if ! has_preloaded_model "$MODEL_PATH"; then
  MODEL_TO_USE="$model"
  echo "Using installed model: $MODEL_TO_USE" >&2
else
  MODEL_TO_USE="$MODEL_PATH"
  echo "Using preloaded model at: $MODEL_TO_USE" >&2
fi

# The Python script receives these two switches through its environment.
export SPACY_USE_DEPENDENCIES="$use_dependencies" \
  SPACY_USE_GERMALEMMA="$use_germalemma"

# Log configuration
printf '%s\n' \
  "Configuration:" \
  " Model: $MODEL_TO_USE" \
  " Use dependencies: $use_dependencies" \
  " Use GermaLemma: $use_germalemma" >&2

# Run the spaCy tagging pipeline; its exit status becomes the script's.
pipeline_args=(
  --spacy_model "$MODEL_TO_USE"
  --corpus_name "stdin"
  --gld_token_type "CoNLLUP_Token"
  --comment_str "#"
)
python /app/systems/parse_spacy_pipe.py "${pipeline_args[@]}"