#!/bin/bash

# A pipeline fails as soon as any of its stages fails, not only the last one.
set -o pipefail

# Defaults; each can be overridden by a command-line option (-m, -d, -g).
model=de_core_news_lg
use_dependencies=True
use_germalemma=True
| 9 | |
#######################################
# Print the command-line help text and terminate the script.
# Globals:   model (read) - shown as the default model name
# Arguments: $1 - exit status to terminate with (optional, default 1)
# Outputs:   help text on stdout
# Returns:   never returns; exits with the requested status
#######################################
usage() {
  local status="${1:-1}"

  cat <<EOF
Usage: $0 [-h] [-m MODEL] [-L] [-V] [-d] [-g]
 -h Display this help message
 -m MODEL Specify spaCy model (default: $model)
 -L List available/installed models
 -V Display spaCy version information
 -d Disable dependency parsing (faster processing)
 -g Disable GermaLemma (use spaCy lemmatizer only)
EOF
  exit "$status"
}
| 20 | |
#######################################
# Parse command line options.
# Globals:   model, use_dependencies, use_germalemma (written)
#            OPTIND, OPTARG (used by getopts)
# Arguments: the script's command-line arguments ("$@")
# Outputs:   diagnostics and the -L / -V reports on stderr; the list printed
#            by /app/list_spacy_models.py goes wherever that script writes
# Returns:   0 when parsing succeeded; the script exits directly for
#            -h, -L, -V and for any usage error
#######################################
parse_options() {
  local opt installed pkg cfg model_dir

  # The leading ':' selects getopts' silent mode: an unknown option is
  # reported as '?' with the offending letter in OPTARG and a missing
  # argument as ':'. Without it bash prints its own message, leaves OPTARG
  # unset for unknown options and never yields ':'.
  while getopts ":hm:LVdg" opt; do
    case "$opt" in
      h)
        # Help was requested explicitly; pass a success status to usage.
        usage 0
        ;;
      m)
        model="$OPTARG"
        ;;
      L)
        echo "=== Installed Models ===" >&2

        # List models installed in venv
        installed=$(python -c "import spacy; import pkg_resources; print('\n'.join([pkg.key for pkg in pkg_resources.working_set if pkg.key.endswith(('-sm', '-md', '-lg', '-trf')) and not pkg.key.startswith('spacy')]))" 2>/dev/null)

        if [ -n "$installed" ]; then
          while IFS= read -r pkg; do
            # Convert package name to model name
            # (e.g., de-core-news-lg -> de_core_news_lg)
            echo " ${pkg//-/_}" >&2
          done <<<"$installed"
        else
          echo " No models installed in venv" >&2
        fi

        # Check for models in /local/models
        if [ -d "/local/models" ] && [ "$(ls -A /local/models 2>/dev/null)" ]; then
          echo "" >&2
          echo "Models in /local/models:" >&2
          # Glob instead of parsing ls output; an unmatched pattern stays
          # literal and is skipped by the -f test.
          for cfg in /local/models/*/config.cfg; do
            [ -f "$cfg" ] || continue
            model_dir="${cfg%/config.cfg}"
            echo " ${model_dir##*/}" >&2
          done
        fi

        echo "" >&2

        # Show available models list
        python /app/list_spacy_models.py
        exit 0
        ;;
      V)
        echo "=== Version Information ===" >&2
        echo "conllu-spacy-docker version: 3.8.11-1" >&2
        python -c "import spacy; print(f'spaCy version: {spacy.__version__}')" >&2

        # Check for GermaLemma
        python -c "try:
    import germalemma
    try:
        print(f'GermaLemma version: {germalemma.__version__}')
    except AttributeError:
        print('GermaLemma: installed (version unknown)')
except ImportError:
    print('GermaLemma: not installed')" >&2

        # Show Python version
        python -c "import sys; print(f'Python version: {sys.version.split()[0]}')" >&2

        exit 0
        ;;
      d)
        use_dependencies="False"
        ;;
      g)
        use_germalemma="False"
        ;;
      \?)
        echo "Invalid option: -$OPTARG" >&2
        # Usage text for an error goes to stderr like every other diagnostic.
        usage >&2
        ;;
      :)
        echo "Option -$OPTARG requires an argument" >&2
        usage >&2
        ;;
    esac
  done

  # Positional arguments are not accepted: anything left after the options
  # is a usage error.
  if [ "$OPTIND" -le "$#" ]; then
    usage >&2
  fi
}

parse_options "$@"
| 103 | |
# Directory holding preloaded/persisted models, and the location of the
# selected model inside it.
MODEL_DIR=/local/models
MODEL_PATH="${MODEL_DIR}/${model}"

# Create the model directory if it is missing (no-op when it already exists)
mkdir -p "${MODEL_DIR}"
| 109 | |
#######################################
# Check whether a spaCy model can be loaded by the Python environment.
# Arguments: $1 - model name handed to spacy.load
# Outputs:   nothing; Python's stderr is discarded
# Returns:   the exit status of the Python interpreter
#            (0 when spacy.load succeeded, non-zero otherwise)
#######################################
is_model_installed() {
  local model_name="$1"
  # The name is passed as sys.argv[1] instead of being spliced into the
  # Python source, so quotes or other special characters in a user-supplied
  # model name cannot break or inject code.
  python -c 'import sys, spacy; spacy.load(sys.argv[1])' "$model_name" 2>/dev/null
}
| 117 | |
#######################################
# Tell whether a directory holds a preloaded spaCy model.
# Arguments: $1 - directory to inspect
# Returns:   0 if <dir>/config.cfg is a regular file (the marker used here
#            for a valid spaCy model), 1 otherwise
#######################################
has_preloaded_model() {
  local candidate_dir="$1"
  [ -f "${candidate_dir}/config.cfg" ]
}
| 127 | |
#######################################
# Run the model downloader, mirroring its combined output to stderr and to
# /tmp/spacy_download.log.
# Arguments: $1 - model name
# Returns:   the downloader's own exit status (taken from PIPESTATUS, so the
#            result neither depends on `pipefail` nor on tee succeeding)
#######################################
_download_model() {
  python /app/download_with_progress.py "$1" 2>&1 | tee /tmp/spacy_download.log >&2
  return "${PIPESTATUS[0]}"
}

#######################################
# Best-effort copy of a model installed in site-packages into the persistent
# model directory. Never fails the installation: the model stays usable
# from the venv whatever happens here.
# Arguments: $1 - model name, $2 - destination directory
# Outputs:   progress and warnings on stderr
# Returns:   0 always
#######################################
_persist_model() {
  local model_name="$1" model_path="$2"
  local site_packages installed_model versioned_dir

  site_packages=$(python -c "import site; print(site.getsitepackages()[0])") || return 0
  installed_model="$site_packages/$model_name"
  [ -d "$installed_model" ] || return 0

  echo "Extracting model to $model_path for persistence..." >&2

  # Find the actual model directory (e.g., de_core_news_lg-3.8.0)
  versioned_dir=$(find "$installed_model" -maxdepth 1 -type d -name "${model_name}-*" | head -1)

  if [ -n "$versioned_dir" ] && [ -f "$versioned_dir/config.cfg" ]; then
    # Flatten: copy the versioned directory's contents into model_path.
    if mkdir -p "$model_path" && cp -r "$versioned_dir"/. "$model_path/"; then
      # Set permissions so user can modify the model files
      chmod -R a+rwX "$model_path" 2>/dev/null || true
      echo "Model extracted to $model_path" >&2
    else
      echo "Warning: Could not copy model to $model_path" >&2
    fi
  else
    # Fallback: copy the whole package. It is copied rather than moved so
    # that the package stays in site-packages and can still be loaded by
    # name when the copy does not form a valid preloaded model.
    echo "Warning: Could not find versioned model directory, copying package as-is" >&2
    cp -r "$installed_model" "$model_path" 2>/dev/null || true
    chmod -R a+rwX "$model_path" 2>/dev/null || true
  fi
  return 0
}

#######################################
# Make sure a spaCy model is available, downloading it if necessary.
# Order of checks: preloaded model under MODEL_DIR, model already loadable
# from the venv, then download (persisted to MODEL_DIR when it is writable).
# Globals:   MODEL_DIR (read)
# Arguments: $1 - model name
# Outputs:   progress and errors on stderr
# Returns:   0 if the model is available, 1 if the download failed
#######################################
install_model() {
  local model_name="$1"
  # Derived from the argument so the function checks the model it was asked
  # about; for the script's own call this equals MODEL_PATH.
  local model_path="$MODEL_DIR/$model_name"
  local persist="yes"

  # Check if model exists in /local/models - if so, we'll use absolute path
  if has_preloaded_model "$model_path"; then
    echo "Found preloaded model in $model_path" >&2
    echo "Will use absolute path to avoid download" >&2
    return 0
  fi

  # Check if already installed in venv
  if is_model_installed "$model_name"; then
    echo "Model $model_name already installed in venv" >&2
    return 0
  fi

  if [ ! -w "$MODEL_DIR" ]; then
    # MODEL_DIR not writable, install to venv (ephemeral)
    echo "Cannot write to $MODEL_DIR, installing to venv (ephemeral)" >&2
    persist="no"
  fi

  if ! _download_model "$model_name"; then
    echo "Failed to download model $model_name" >&2
    return 1
  fi

  if [ "$persist" = "yes" ]; then
    _persist_model "$model_name" "$model_path"
  fi
  return 0
}
| 189 | |
# Make sure the requested model is available before doing anything else.
install_model "$model" || {
  echo "ERROR: Could not install model $model, aborting." >&2
  exit 1
}

# Pick what to hand to spaCy: the absolute path of a preloaded model when
# one exists under MODEL_PATH, otherwise the plain model name.
if ! has_preloaded_model "$MODEL_PATH"; then
  MODEL_TO_USE="$model"
  echo "Using installed model: $MODEL_TO_USE" >&2
else
  MODEL_TO_USE="$MODEL_PATH"
  echo "Using preloaded model at: $MODEL_TO_USE" >&2
fi

# Hand the option flags to the Python script through its environment.
export SPACY_USE_DEPENDENCIES="$use_dependencies" \
  SPACY_USE_GERMALEMMA="$use_germalemma"

# Log the effective configuration on stderr.
{
  echo "Configuration:"
  echo " Model: $MODEL_TO_USE"
  echo " Use dependencies: $use_dependencies"
  echo " Use GermaLemma: $use_germalemma"
} >&2

# Run the spaCy tagging pipeline; its exit status becomes the script's.
pipeline_args=(
  --spacy_model "$MODEL_TO_USE"
  --corpus_name "stdin"
  --gld_token_type "CoNLLUP_Token"
  --comment_str "#"
)
python /app/systems/parse_spacy_pipe.py "${pipeline_args[@]}"