#!/bin/bash

# Make a pipeline fail when any of its stages fails, not only the last one.
set -o pipefail

# Defaults; each can be overridden by a command line option.
model='de_core_news_lg'    # spaCy model name
use_dependencies='True'    # exported later as SPACY_USE_DEPENDENCIES
use_germalemma='True'      # exported later as SPACY_USE_GERMALEMMA
| 9 | |
# Print the command line synopsis to stdout and terminate the script.
# Globals:   model (read) - shown as the default for -m
# Arguments: none
# Returns:   never returns; exits with status 1
usage() {
  printf '%s\n' \
    "Usage: $0 [-h] [-m MODEL] [-L] [-d] [-g]" \
    " -h Display this help message" \
    " -m MODEL Specify spaCy model (default: $model)" \
    " -L List available/installed models" \
    " -d Disable dependency parsing (faster processing)" \
    " -g Disable GermaLemma (use spaCy lemmatizer only)"
  exit 1
}
| 19 | |
# Parse command line options.
# Globals:   model, use_dependencies, use_germalemma (written)
# Arguments: the script's command line ("$@")
# Returns:   0 when parsing succeeded; exits via usage on invalid input,
#            exits 0 after listing models for -L
parse_options() {
  # Keep getopts state local so the function can be called more than once.
  local OPTIND=1 opt

  # The leading ':' selects getopts' silent error reporting: an unknown
  # option yields opt='?' and a missing argument yields opt=':', both with
  # the offending letter in OPTARG. Without it OPTARG is unset in the '?'
  # arm and the ':' arm can never be reached.
  while getopts ":hm:Ldg" opt; do
    case "$opt" in
      h)
        usage
        ;;
      m)
        model="$OPTARG"
        ;;
      L)
        # stderr is discarded on purpose; any failure is reported as
        # "No models installed".
        python -m spacy info 2>/dev/null || echo "No models installed"
        exit 0
        ;;
      d)
        use_dependencies="False"
        ;;
      g)
        use_germalemma="False"
        ;;
      \?)
        echo "Invalid option: -$OPTARG" >&2
        usage
        ;;
      :)
        echo "Option -$OPTARG requires an argument" >&2
        usage
        ;;
    esac
  done

  # Positional arguments are not accepted: anything left over is an error.
  if (( OPTIND <= $# )); then
    usage
  fi
}

parse_options "$@"
| 53 | |
# Directory holding persistent models, and the path of the selected model
# inside it.
MODEL_DIR='/local/models'
MODEL_PATH="${MODEL_DIR}/${model}"

# Create the model directory up front (no error if it already exists).
mkdir -p -- "$MODEL_DIR"
| 59 | |
# Check whether a spaCy model can be loaded by the `python` on PATH.
# Arguments: $1 - model name handed to spacy.load()
# Returns:   exit status of the Python probe (0 when the model loaded)
is_model_installed() {
  local model_name="$1"
  # The name is passed as an argument (sys.argv[1]) instead of being spliced
  # into the Python source, so quotes or other special characters in it can
  # neither break the snippet nor inject code.
  # stderr is discarded on purpose: a failed load is the expected
  # "not installed" answer, not an error worth showing.
  python -c 'import sys, spacy; spacy.load(sys.argv[1])' "$model_name" 2>/dev/null
}
| 67 | |
# Tell whether a directory contains a preloaded spaCy model.
# The presence of a regular file config.cfg is used as the marker.
# Arguments: $1 - directory to inspect
# Returns:   0 if $1/config.cfg exists, 1 otherwise
has_preloaded_model() {
  local candidate_dir="$1"
  [[ -f "${candidate_dir}/config.cfg" ]]
}
| 77 | |
# Download a model with the progress helper, mirroring its combined output
# to stderr and to /tmp/spacy_download.log.
# Arguments: $1 - model name
# Returns:   exit status of the download script itself
_download_model() {
  python /app/download_with_progress.py "$1" 2>&1 | tee /tmp/spacy_download.log >&2
  # Report the downloader's own status, so the result neither depends on
  # `set -o pipefail` being active nor on tee being able to write the log.
  return "${PIPESTATUS[0]}"
}

# Best effort: move a freshly installed model from site-packages to
# $MODEL_PATH and leave a symlink behind in its place.
# Globals:   MODEL_PATH (read)
# Arguments: $1 - model name
# Returns:   always 0; problems are reported as warnings on stderr
_persist_model() {
  local model_name="$1"
  local site_packages installed_model

  if ! site_packages=$(python -c "import site; print(site.getsitepackages()[0])"); then
    echo "Warning: could not determine site-packages, model stays in venv" >&2
    return 0
  fi
  installed_model="$site_packages/$model_name"

  # Nothing to move if the package directory is not where we expect it.
  [[ -d "$installed_model" ]] || return 0

  echo "Moving model to $MODEL_PATH for persistence..." >&2
  if mv "$installed_model" "$MODEL_PATH" 2>/dev/null; then
    # Link back only after a successful move. Otherwise ln would create a
    # stray link inside the still-existing package directory.
    if ! ln -sf "$MODEL_PATH" "$installed_model" 2>/dev/null; then
      echo "Warning: could not link $installed_model to $MODEL_PATH" >&2
    fi
    echo "Model saved to $MODEL_PATH" >&2
  else
    echo "Warning: could not move model to $MODEL_PATH, keeping it in venv" >&2
  fi
  return 0
}

# Make sure the requested model is available, downloading it if necessary.
# Order of checks: preloaded copy at $MODEL_PATH, then a model loadable from
# the Python environment, then a download.
# Globals:   MODEL_DIR, MODEL_PATH (read)
# Arguments: $1 - model name
# Returns:   0 if the model is available, 1 if the download failed
install_model() {
  local model_name="$1"
  local persist=true

  # A preloaded model is later used through its absolute path.
  if has_preloaded_model "$MODEL_PATH"; then
    echo "Found preloaded model in $MODEL_PATH" >&2
    echo "Will use absolute path to avoid download" >&2
    return 0
  fi

  if is_model_installed "$model_name"; then
    echo "Model $model_name already installed in venv" >&2
    return 0
  fi

  # Without write access the download can only live in the venv.
  if [[ ! -w "$MODEL_DIR" ]]; then
    persist=false
    echo "Cannot write to $MODEL_DIR, installing to venv (ephemeral)" >&2
  fi

  if ! _download_model "$model_name"; then
    echo "Failed to download model $model_name" >&2
    return 1
  fi

  if [[ "$persist" == true ]]; then
    _persist_model "$model_name"
  fi
  return 0
}
| 126 | |
# Make sure the requested model is available before doing any work.
install_model "$model" || {
  echo "ERROR: Could not install model $model, aborting." >&2
  exit 1
}

# Pick what is handed to spaCy: the absolute path of a preloaded model when
# one exists, otherwise the plain model name.
if ! has_preloaded_model "$MODEL_PATH"; then
  MODEL_TO_USE="$model"
  echo "Using installed model: $MODEL_TO_USE" >&2
else
  MODEL_TO_USE="$MODEL_PATH"
  echo "Using preloaded model at: $MODEL_TO_USE" >&2
fi

# The Python script reads its feature switches from the environment.
export SPACY_USE_DEPENDENCIES="$use_dependencies"
export SPACY_USE_GERMALEMMA="$use_germalemma"

# Log the effective configuration to stderr.
{
  echo "Configuration:"
  echo " Model: $MODEL_TO_USE"
  echo " Use dependencies: $use_dependencies"
  echo " Use GermaLemma: $use_germalemma"
} >&2

# Run the spaCy tagging pipeline; its exit status becomes the script's.
pipeline_args=(
  --spacy_model "$MODEL_TO_USE"
  --corpus_name "stdin"
  --gld_token_type "CoNLLUP_Token"
  --comment_str "#"
)
python /app/systems/parse_spacy_pipe.py "${pipeline_args[@]}"