blob: 5fb9b5eb225a928235950ae4fd4443172e79ebaa [file] [log] [blame]
#!/bin/bash

# Make a pipeline report failure when any of its stages fails, not only the
# last one.
set -o pipefail

# Default values; the command-line flags handled further down may override them.
model="de_core_news_lg"   # spaCy model name (-m)
use_dependencies="True"   # set to "False" by -d
use_germalemma="True"     # set to "False" by -g
9
#######################################
# Print the command-line help and terminate the script.
# Globals:   model (read) - shown as the default for -m
# Arguments: $1 - exit status to terminate with (optional, default 1)
# Outputs:   the help text, on stdout when the status is 0 and on stderr
#            otherwise
# Returns:   never returns; exits with the requested status
#######################################
usage() {
  local status="${1:-1}"
  local fd=1

  # With a failure status the help text is a diagnostic, so it goes to stderr.
  if [ "$status" -ne 0 ]; then
    fd=2
  fi

  {
    echo "Usage: $0 [-h] [-m MODEL] [-L] [-d] [-g]"
    echo " -h Display this help message"
    echo " -m MODEL Specify spaCy model (default: $model)"
    echo " -L List available/installed models"
    echo " -d Disable dependency parsing (faster processing)"
    echo " -g Disable GermaLemma (use spaCy lemmatizer only)"
  } 1>&"$fd"

  exit "$status"
}
19
# Parse command line options.
# The leading ':' in the option string selects getopts' silent error reporting.
# Only in that mode does OPTARG hold the offending option letter in the '?'
# arm, and only in that mode is a missing option argument reported as ':'.
while getopts ":hm:Ldg" opt; do
  case "$opt" in
    h)
      # A help request is not an error. usage terminates the shell it runs
      # in, so it is called in a subshell and the script then exits 0 itself.
      ( usage 0 )
      exit 0
      ;;
    m)
      model="$OPTARG"
      ;;
    L)
      python -m spacy info 2>/dev/null || echo "No models installed"
      exit 0
      ;;
    d)
      use_dependencies="False"
      ;;
    g)
      use_germalemma="False"
      ;;
    \?)
      echo "Invalid option: -$OPTARG" >&2
      usage
      ;;
    :)
      echo "Option -$OPTARG requires an argument" >&2
      usage
      ;;
  esac
done

# Any word left over after the options is rejected with the usage message.
if [ "$OPTIND" -le "$#" ]; then
  usage
fi
53
# Directory that holds preloaded models, and the selected model's place in it.
MODEL_DIR="/local/models"
MODEL_PATH="$MODEL_DIR/$model"

# Ensure MODEL_DIR exists. A failure here is reported but is not fatal: the
# install step checks whether the directory is writable and otherwise installs
# into the venv.
if ! mkdir -p "$MODEL_DIR"; then
  echo "Warning: could not create $MODEL_DIR" >&2
fi
59
#######################################
# Check whether a spaCy model can be loaded by the `python` found on PATH.
# Arguments: $1 - model name handed to spacy.load
# Outputs:   nothing; stderr of the probe is discarded
# Returns:   exit status of the Python probe (0 when spacy.load succeeded)
#######################################
is_model_installed() {
  local model_name="$1"
  # The name is passed as an argument and read from sys.argv instead of being
  # pasted into the Python source, so quotes or backslashes in it cannot
  # change the program that gets executed.
  python -c 'import sys, spacy; spacy.load(sys.argv[1])' "$model_name" 2>/dev/null
}
67
#######################################
# Tell whether a directory holds a preloaded spaCy model. The presence of a
# regular file named config.cfg is taken as the marker of a valid model.
# Arguments: $1 - directory to inspect
# Returns:   0 if $1 is non-empty and contains config.cfg, 1 otherwise
#######################################
has_preloaded_model() {
  local model_path="$1"
  # An empty path would otherwise probe "/config.cfg" at the filesystem root.
  [ -n "$model_path" ] && [ -f "$model_path/config.cfg" ]
}
77
#######################################
# Run the model downloader, mirroring its combined stdout/stderr to this
# script's stderr and to /tmp/spacy_download.log.
# Arguments: $1 - model name
# Returns:   0 only if both the downloader and tee succeeded. The stage
#            statuses are read from PIPESTATUS, so the result does not depend
#            on the caller's `pipefail` setting.
#######################################
_download_model() {
  local -a stage_status
  python /app/download_with_progress.py "$1" 2>&1 \
    | tee /tmp/spacy_download.log >&2
  stage_status=("${PIPESTATUS[@]}")
  [ "${stage_status[0]}" -eq 0 ] && [ "${stage_status[1]}" -eq 0 ]
}

#######################################
# Copy a model package found in site-packages into a target directory.
# The package stays in site-packages, so it remains loadable by name even
# when the copy fails or produces no usable model.
# Arguments: $1 - model name, $2 - destination directory
# Returns:   0 if files were copied to the destination, 1 otherwise
#######################################
_persist_model() {
  local model_name="$1"
  local dest="$2"
  local site_packages installed_model versioned_dir src

  if ! site_packages=$(python -c "import site; print(site.getsitepackages()[0])"); then
    echo "Warning: could not determine site-packages directory" >&2
    return 1
  fi

  installed_model="$site_packages/$model_name"
  # Nothing to copy when the package directory is absent.
  [ -d "$installed_model" ] || return 1

  echo "Extracting model to $dest for persistence..." >&2

  # Find the actual model directory (e.g., de_core_news_lg-3.8.0)
  versioned_dir=$(find "$installed_model" -maxdepth 1 -type d -name "${model_name}-*" | head -1)

  if [ -n "$versioned_dir" ] && [ -f "$versioned_dir/config.cfg" ]; then
    src="$versioned_dir"
  else
    # Fallback: copy the whole package. It is copied rather than moved so the
    # installation in site-packages is left intact.
    echo "Warning: Could not find versioned model directory, copying package as-is" >&2
    src="$installed_model"
  fi

  # "src/." copies the directory contents including dotfiles.
  if ! mkdir -p "$dest" || ! cp -r "$src"/. "$dest"/; then
    echo "Warning: could not copy model to $dest" >&2
    return 1
  fi

  # Best effort: open up permissions so any user can modify the model files.
  chmod -R a+rwX "$dest" 2>/dev/null || true

  if [ "$src" = "$versioned_dir" ]; then
    echo "Model extracted to $dest" >&2
  fi
  return 0
}

#######################################
# Make sure a spaCy model is available, downloading it when necessary.
# Order of checks: preloaded copy under MODEL_DIR, then a model loadable from
# the venv, then a download. After a download the model is also copied to
# MODEL_DIR/<name> when MODEL_DIR is writable.
# Globals:   MODEL_DIR (read)
# Arguments: $1 - model name
# Outputs:   progress and diagnostics on stderr
# Returns:   0 if the model is available, 1 if the download failed
#######################################
install_model() {
  local model_name="$1"
  # Derived from the argument so the function honours the name it is given.
  local target="$MODEL_DIR/$model_name"
  local can_persist=0

  # Check if model exists in MODEL_DIR - if so, the absolute path gets used
  if has_preloaded_model "$target"; then
    echo "Found preloaded model in $target" >&2
    echo "Will use absolute path to avoid download" >&2
    return 0
  fi

  # Check if already installed in venv
  if is_model_installed "$model_name"; then
    echo "Model $model_name already installed in venv" >&2
    return 0
  fi

  if [ -w "$MODEL_DIR" ]; then
    can_persist=1
  else
    echo "Cannot write to $MODEL_DIR, installing to venv (ephemeral)" >&2
  fi

  if ! _download_model "$model_name"; then
    echo "Failed to download model $model_name" >&2
    return 1
  fi

  # Persisting is best effort: the download itself succeeded, so the model is
  # usable from the venv even if the copy does not work out.
  if [ "$can_persist" -eq 1 ]; then
    _persist_model "$model_name" "$target" || true
  fi
  return 0
}
139
# Install or verify model; nothing can run without it.
if ! install_model "$model"; then
  echo "ERROR: Could not install model $model, aborting." >&2
  exit 1
fi

# Determine which model reference to hand to the pipeline:
# the absolute path if a preloaded model exists, otherwise the model name.
if has_preloaded_model "$MODEL_PATH"; then
  MODEL_TO_USE="$MODEL_PATH"
  echo "Using preloaded model at: $MODEL_TO_USE" >&2
else
  MODEL_TO_USE="$model"
  echo "Using installed model: $MODEL_TO_USE" >&2
fi

# Set environment variables for the Python script
export SPACY_USE_DEPENDENCIES="$use_dependencies"
export SPACY_USE_GERMALEMMA="$use_germalemma"

# Log configuration (stderr only; stdout is left to the pipeline itself)
{
  echo "Configuration:"
  echo " Model: $MODEL_TO_USE"
  echo " Use dependencies: $use_dependencies"
  echo " Use GermaLemma: $use_germalemma"
} >&2

# Run the spaCy tagging pipeline. exec replaces this shell with the Python
# process, so its exit status becomes the script's and signals sent to the
# script reach it directly.
exec python /app/systems/parse_spacy_pipe.py \
  --spacy_model "$MODEL_TO_USE" \
  --corpus_name "stdin" \
  --gld_token_type "CoNLLUP_Token" \
  --comment_str "#"