#!/bin/bash
#
# Entry point for the spaCy tagging pipeline: parses the command line
# options, makes sure the requested spaCy model is available and then
# runs /app/systems/parse_spacy_pipe.py with it.

# A pipeline fails as soon as any of its stages fails.
set -o pipefail

# Defaults; each of them can be overridden by a command line option.
model=de_core_news_lg
use_dependencies=True
use_germalemma=True
| 9 | |
# Print the command line help and terminate the script.
# Globals:   model (read) - shown as the default value of -m
# Arguments: $1 - exit status to terminate with (optional, default: 1)
# Outputs:   the help text; on stdout when the exit status is 0,
#            otherwise on stderr so that an error never ends up in
#            the data written to stdout
usage() {
  local status="${1:-1}"
  local fd=2

  if [[ "$status" == 0 ]]; then
    fd=1
  fi

  {
    echo "Usage: $0 [-h] [-m MODEL] [-L] [-d] [-g]"
    echo " -h Display this help message"
    echo " -m MODEL Specify spaCy model (default: $model)"
    echo " -L List available/installed models"
    echo " -d Disable dependency parsing (faster processing)"
    echo " -g Disable GermaLemma (use spaCy lemmatizer only)"
  } >&"$fd"

  exit "$status"
}
| 19 | |
# Print the names of the spaCy model packages installed in the Python
# environment to stdout, one per line, in model-name form
# (e.g. de_core_news_lg). Uses importlib.metadata instead of the
# deprecated pkg_resources API. Errors of the Python helper are
# discarded, in which case nothing is printed.
list_installed_models() {
  python -c '
from importlib.metadata import distributions

suffixes = ("_sm", "_md", "_lg", "_trf")
names = set()
for dist in distributions():
    name = (dist.metadata["Name"] or "").lower().replace("-", "_")
    if name.endswith(suffixes) and not name.startswith("spacy"):
        names.add(name)
print("\n".join(sorted(names)))
' 2>/dev/null
}

# Handler for -L: report the installed models and the models found in
# /local/models on stderr, then print the list produced by
# /app/list_spacy_models.py.
show_models() {
  local models_dir="/local/models"
  local installed name cfg dir

  echo "=== Installed Models ===" >&2

  # List models installed in venv
  installed=$(list_installed_models)
  if [[ -n "$installed" ]]; then
    while IFS= read -r name; do
      echo " $name" >&2
    done <<< "$installed"
  else
    echo " No models installed in venv" >&2
  fi

  # Check for models in /local/models; only directories that contain a
  # config.cfg are listed
  if [[ -d "$models_dir" && -n "$(ls -A "$models_dir" 2>/dev/null)" ]]; then
    echo "" >&2
    echo "Models in /local/models:" >&2
    for cfg in "$models_dir"/*/config.cfg; do
      # An unmatched glob stays literal, so skip anything that is not a file
      [[ -f "$cfg" ]] || continue
      dir="${cfg%/config.cfg}"
      echo " ${dir##*/}" >&2
    done
  fi

  echo "" >&2

  # Show available models list
  python /app/list_spacy_models.py
}

# Parse command line options
# Globals:   model, use_dependencies, use_germalemma (written)
# Arguments: the command line arguments of the script
# Exits via usage() on -h, on invalid options and on surplus positional
# arguments; exits with status 0 after handling -L.
parse_options() {
  local opt
  local OPTIND=1

  # The leading ':' selects getopts' silent error reporting: OPTARG then
  # holds the offending option character in the '?' and ':' cases.
  while getopts ":hm:Ldg" opt; do
    case "$opt" in
      h)
        # Help was requested explicitly, so request a success status
        # (a usage() that ignores the argument keeps its own status).
        usage 0
        ;;
      m)
        model="$OPTARG"
        ;;
      L)
        show_models
        exit 0
        ;;
      d)
        use_dependencies="False"
        ;;
      g)
        use_germalemma="False"
        ;;
      \?)
        echo "Invalid option: -$OPTARG" >&2
        usage
        ;;
      :)
        echo "Option -$OPTARG requires an argument" >&2
        usage
        ;;
    esac
  done

  # Positional arguments are not supported
  if (( OPTIND <= $# )); then
    usage
  fi
}

parse_options "$@"
| 82 | |
# Location of the preloaded / persisted models and the path of the
# selected model inside of it
MODEL_DIR=/local/models
MODEL_PATH="${MODEL_DIR}/${model}"

# Create the model directory (and missing parents) if it is not there yet
mkdir -p "${MODEL_DIR}"
| 88 | |
# Function to check if model is installed and usable
# Arguments: $1 - name of the model to load
# Returns:   the exit status of python, i.e. 0 when spacy.load()
#            succeeded for the given name
is_model_installed() {
  local model_name="$1"
  # Check if model is installed in the venv. The name is handed over as
  # an argument instead of being pasted into the Python source, so
  # quotes or other special characters in it cannot alter the code.
  python -c 'import sys, spacy; spacy.load(sys.argv[1])' "$model_name" 2>/dev/null
}
| 96 | |
# Tell whether a directory holds a preloaded model.
# Arguments: $1 - directory to inspect
# Returns:   0 if the directory contains a regular file config.cfg
#            (taken as the marker of a valid spaCy model), 1 otherwise
has_preloaded_model() {
  local candidate_dir="$1"
  [[ -f "${candidate_dir}/config.cfg" ]]
}
| 106 | |
# Run the download helper for a model. Its output (stdout and stderr) is
# shown on stderr and copied to /tmp/spacy_download.log.
# Arguments: $1 - model name
# Returns:   the exit status of the download helper itself, independent
#            of the pipefail setting and of whether the log was writable
_download_model() {
  python /app/download_with_progress.py "$1" 2>&1 | tee /tmp/spacy_download.log >&2
  return "${PIPESTATUS[0]}"
}

# Copy a freshly installed model package from site-packages into a
# directory of its own for persistence. Best effort: the package in the
# venv is never touched, so the model stays loadable by name whatever
# happens here.
# Arguments: $1 - model name, $2 - target directory
# Returns:   always 0
_persist_model() {
  local model_name="$1"
  local model_path="$2"
  local site_packages installed_pkg candidate
  local versioned_dir=""

  site_packages=$(python -c "import site; print(site.getsitepackages()[0])") || return 0
  installed_pkg="$site_packages/$model_name"
  [[ -d "$installed_pkg" ]] || return 0

  echo "Extracting model to $model_path for persistence..." >&2

  # Find the actual model directory (e.g., de_core_news_lg-3.8.0)
  for candidate in "$installed_pkg/${model_name}-"*/; do
    if [[ -f "${candidate}config.cfg" ]]; then
      versioned_dir="${candidate%/}"
      break
    fi
  done

  if [[ -z "$versioned_dir" ]]; then
    # Moving the package away would break loading the model by name and
    # would not yield a loadable model directory either, so leave it.
    echo "Warning: Could not find versioned model directory, keeping model in venv only (not persisted)" >&2
    return 0
  fi

  # "dir/." copies the directory contents including hidden files
  if mkdir -p "$model_path" && cp -r "$versioned_dir"/. "$model_path"/; then
    # Set permissions so user can modify the model files
    chmod -R a+rwX "$model_path" 2>/dev/null || true
    echo "Model extracted to $model_path" >&2
  else
    echo "Warning: Could not copy model to $model_path, keeping model in venv only (not persisted)" >&2
    # Do not leave a partial copy behind that could be mistaken for a
    # valid preloaded model
    rm -rf -- "${model_path:?}"
  fi
  return 0
}

# Function to install model
# Globals:   MODEL_DIR (read) - directory holding preloaded/persisted models
# Arguments: $1 - model name
# Outputs:   progress and diagnostics on stderr
# Returns:   0 if the model is available afterwards (preloaded, already
#            installed or downloaded), 1 if the download failed
install_model() {
  local model_name="$1"
  local model_path="$MODEL_DIR/$model_name"

  # Check if model exists in /local/models - if so, we'll use absolute path
  if has_preloaded_model "$model_path"; then
    echo "Found preloaded model in $model_path" >&2
    echo "Will use absolute path to avoid download" >&2
    return 0
  fi

  # Check if already installed in venv
  if is_model_installed "$model_name"; then
    echo "Model $model_name already installed in venv" >&2
    return 0
  fi

  if [[ ! -w "$MODEL_DIR" ]]; then
    # MODEL_DIR not writable, install to venv (ephemeral)
    echo "Cannot write to $MODEL_DIR, installing to venv (ephemeral)" >&2
  fi

  if ! _download_model "$model_name"; then
    echo "Failed to download model $model_name" >&2
    return 1
  fi

  # Extract and flatten the model structure for persistence
  if [[ -w "$MODEL_DIR" ]]; then
    _persist_model "$model_name" "$model_path"
  fi
  return 0
}
| 168 | |
# Make sure the requested model is available; give up otherwise
install_model "$model" || {
  echo "ERROR: Could not install model $model, aborting." >&2
  exit 1
}

# Pick what is handed to spaCy: the plain model name, or the absolute
# path when a preloaded model exists in MODEL_PATH
if ! has_preloaded_model "$MODEL_PATH"; then
  MODEL_TO_USE="$model"
  echo "Using installed model: $MODEL_TO_USE" >&2
else
  MODEL_TO_USE="$MODEL_PATH"
  echo "Using preloaded model at: $MODEL_TO_USE" >&2
fi

# The Python script reads these settings from its environment
export SPACY_USE_DEPENDENCIES="$use_dependencies" SPACY_USE_GERMALEMMA="$use_germalemma"

# Report the effective configuration on stderr
printf '%s\n' \
  "Configuration:" \
  " Model: $MODEL_TO_USE" \
  " Use dependencies: $use_dependencies" \
  " Use GermaLemma: $use_germalemma" >&2

# Start the spaCy tagging pipeline; its exit status becomes the exit
# status of the script
pipeline_args=(
  --spacy_model "$MODEL_TO_USE"
  --corpus_name "stdin"
  --gld_token_type "CoNLLUP_Token"
  --comment_str "#"
)
python /app/systems/parse_spacy_pipe.py "${pipeline_args[@]}"