blob: 0e455145ae183a7d288966f798d432f6505728f8 [file] [log] [blame]
#!/bin/bash
#
# Entry point for the spaCy tagging pipeline: parses the command-line flags,
# makes sure the requested spaCy model is available, then runs the parser.

# A pipeline fails as soon as any of its stages fails.
set -o pipefail

# Built-in settings; the -m, -d and -g flags replace them.
model='de_core_news_lg'
use_dependencies='True'
use_germalemma='True'
9
#######################################
# Print the command-line help and terminate the script.
# Globals:   model (read) - shown as the default value for -m
# Arguments: $1 - exit status to terminate with (optional, default: 1)
# Outputs:   help text on stdout when the status is 0, otherwise on stderr,
#            so that an error never mixes help text into piped output
# Returns:   does not return; exits with the requested status
#######################################
usage() {
  local status="${1:-1}"
  local fd=2

  # Only an explicitly successful exit sends the help to stdout.
  if [ "$status" -eq 0 ]; then
    fd=1
  fi

  {
    echo "Usage: $0 [-h] [-m MODEL] [-L] [-d] [-g]"
    echo " -h Display this help message"
    echo " -m MODEL Specify spaCy model (default: $model)"
    echo " -L List available/installed models"
    echo " -d Disable dependency parsing (faster processing)"
    echo " -g Disable GermaLemma (use spaCy lemmatizer only)"
  } >&"$fd"

  exit "$status"
}
19
#######################################
# Report the spaCy models that are installed or available.
# Arguments: none
# Outputs:   installed packages and the valid model directories found in
#            /local/models on stderr; the output of
#            /app/list_spacy_models.py on its own streams
# Returns:   exit status of /app/list_spacy_models.py
#######################################
list_models() {
  local installed pkg cfg dir

  echo "=== Installed Models ===" >&2

  # List models installed in venv
  installed=$(python -c "import spacy; import pkg_resources; print('\n'.join([pkg.key for pkg in pkg_resources.working_set if pkg.key.endswith(('-sm', '-md', '-lg', '-trf')) and not pkg.key.startswith('spacy')]))" 2>/dev/null)

  if [ -n "$installed" ]; then
    while IFS= read -r pkg; do
      # Convert package name to model name (e.g., de-core-news-lg -> de_core_news_lg)
      echo " ${pkg//-/_}" >&2
    done <<<"$installed"
  else
    echo " No models installed in venv" >&2
  fi

  # Check for models in /local/models
  if [ -d "/local/models" ] && [ -n "$(ls -A /local/models 2>/dev/null)" ]; then
    echo "" >&2
    echo "Models in /local/models:" >&2
    # Glob instead of parsing ls; only entries holding a config.cfg are shown.
    for cfg in /local/models/*/config.cfg; do
      [ -f "$cfg" ] || continue
      dir="${cfg%/config.cfg}"
      echo " ${dir##*/}" >&2
    done
  fi

  echo "" >&2

  # Show available models list
  python /app/list_spacy_models.py
}

#######################################
# Parse the command-line flags and update the option globals.
# Globals:   model, use_dependencies, use_germalemma (written)
# Arguments: the script's positional parameters
# Outputs:   diagnostics on stderr; the model listings for -L
# Returns:   0 after successful parsing; exits through usage on -h, on an
#            invalid flag, on a missing flag argument or on leftover
#            positional arguments, and exits with 0 after -L
#######################################
parse_args() {
  local opt
  local OPTIND=1

  # The leading ':' selects silent error reporting: getopts then stores the
  # offending flag in OPTARG and yields ':' for a missing argument, which is
  # what the two error branches below rely on.
  while getopts ":hm:Ldg" opt; do
    case "$opt" in
      h)
        # Help was asked for explicitly, so request a success status.
        usage 0
        ;;
      m)
        model="$OPTARG"
        ;;
      L)
        list_models
        exit 0
        ;;
      d)
        use_dependencies="False"
        ;;
      g)
        use_germalemma="False"
        ;;
      \?)
        echo "Invalid option: -$OPTARG" >&2
        usage
        ;;
      :)
        echo "Option -$OPTARG requires an argument" >&2
        usage
        ;;
    esac
  done

  # Anything left after the flags is unexpected: the script takes no operands.
  if [ "$OPTIND" -le "$#" ]; then
    usage
  fi
}

# Parse command line options
parse_args "$@"
82
# Directory that holds model directories, and the path of the selected model
# inside it.
MODEL_DIR='/local/models'
MODEL_PATH="${MODEL_DIR}/${model}"

# Create the model directory if it is missing (no-op when it already exists).
mkdir -p "${MODEL_DIR}"
88
#######################################
# Check whether spaCy can load the given model.
# Arguments: $1 - model name handed to spacy.load
# Outputs:   none; stdout and stderr of the probe are discarded
# Returns:   exit status of the Python probe (0 when the model loads)
#######################################
is_model_installed() {
  local model_name="$1"
  # The name travels as an argument instead of being spliced into the Python
  # source, so quotes or other special characters cannot change the code.
  python -c 'import sys, spacy; spacy.load(sys.argv[1])' "$model_name" >/dev/null 2>&1
}
96
# Tell whether a directory holds a spaCy model, recognised by its config.cfg.
# Arguments: $1 - directory to inspect
# Returns:   0 if $1/config.cfg is a regular file, 1 otherwise
has_preloaded_model() {
  [ -f "$1/config.cfg" ] && return 0
  return 1
}
106
#######################################
# Run the bundled downloader for one model.
# Arguments: $1 - model name
# Outputs:   downloader output on stderr and in /tmp/spacy_download.log
# Returns:   0 on success, 1 when the downloader fails
#######################################
_download_model() {
  python /app/download_with_progress.py "$1" 2>&1 | tee /tmp/spacy_download.log >&2
  # Judge success by the downloader itself, so the result depends neither on
  # tee nor on the pipefail setting of the caller.
  if [ "${PIPESTATUS[0]}" -ne 0 ]; then
    echo "Failed to download model $1" >&2
    return 1
  fi
  return 0
}

#######################################
# Make sure the requested spaCy model is available, downloading it if needed.
# A model already present under MODEL_DIR or loadable from the venv is used
# as is. Otherwise it is downloaded and, when MODEL_DIR is writable, copied
# there so that it persists.
# Globals:   MODEL_DIR (read)
# Arguments: $1 - model name
# Outputs:   progress and diagnostics on stderr
# Returns:   0 when the model is available, 1 when the download failed
#######################################
install_model() {
  local model_name="$1"
  local model_path="$MODEL_DIR/$model_name"
  local site_packages installed_model versioned_dir

  # Check if model exists in MODEL_DIR - if so, the absolute path is used
  if has_preloaded_model "$model_path"; then
    echo "Found preloaded model in $model_path" >&2
    echo "Will use absolute path to avoid download" >&2
    return 0
  fi

  # Check if already installed in venv
  if is_model_installed "$model_name"; then
    echo "Model $model_name already installed in venv" >&2
    return 0
  fi

  # MODEL_DIR not writable: install to venv only (ephemeral)
  if [ ! -w "$MODEL_DIR" ]; then
    echo "Cannot write to $MODEL_DIR, installing to venv (ephemeral)" >&2
    _download_model "$model_name"
    return
  fi

  _download_model "$model_name" || return 1

  # Extract and flatten the model structure for persistence
  site_packages=$(python -c "import site; print(site.getsitepackages()[0])")
  installed_model="$site_packages/$model_name"
  if [ ! -d "$installed_model" ]; then
    return 0
  fi

  echo "Extracting model to $model_path for persistence..." >&2

  # Find the actual model directory (e.g., de_core_news_lg-3.8.0)
  versioned_dir=$(find "$installed_model" -maxdepth 1 -type d -name "${model_name}-*" | head -n 1)

  if [ -n "$versioned_dir" ] && [ -f "$versioned_dir/config.cfg" ]; then
    # Copy the versioned directory's contents, dotfiles included ("/." form).
    if mkdir -p "$model_path" && cp -r "$versioned_dir"/. "$model_path/"; then
      # Set permissions so user can modify the model files
      chmod -R a+rwX "$model_path" 2>/dev/null || true
      echo "Model extracted to $model_path" >&2
    else
      echo "Warning: Could not copy model to $model_path" >&2
    fi
  else
    # Fallback: copy the whole package. It is copied, not moved, so the
    # package stays importable from the venv for the rest of this run.
    echo "Warning: Could not find versioned model directory, copying package as-is" >&2
    cp -r "$installed_model" "$model_path" 2>/dev/null || true
    chmod -R a+rwX "$model_path" 2>/dev/null || true
  fi
  return 0
}
168
# Make the model available, or stop here.
install_model "$model" || {
  echo "ERROR: Could not install model $model, aborting." >&2
  exit 1
}

# Pick what is handed to the parser: the absolute path when MODEL_PATH holds
# a model, the plain model name otherwise.
if ! has_preloaded_model "$MODEL_PATH"; then
  MODEL_TO_USE="$model"
  echo "Using installed model: $MODEL_TO_USE" >&2
else
  MODEL_TO_USE="$MODEL_PATH"
  echo "Using preloaded model at: $MODEL_TO_USE" >&2
fi

# Hand the feature switches to the Python script through the environment.
export SPACY_USE_DEPENDENCIES="$use_dependencies" SPACY_USE_GERMALEMMA="$use_germalemma"

# Log the effective configuration on stderr.
{
  echo "Configuration:"
  echo " Model: $MODEL_TO_USE"
  echo " Use dependencies: $use_dependencies"
  echo " Use GermaLemma: $use_germalemma"
} >&2

# Run the spaCy tagging pipeline; its exit status becomes the script's.
python /app/systems/parse_spacy_pipe.py \
  --spacy_model "$MODEL_TO_USE" --corpus_name "stdin" \
  --gld_token_type "CoNLLUP_Token" --comment_str "#"
200 --comment_str "#"