blob: 5fb9b5eb225a928235950ae4fd4443172e79ebaa [file] [log] [blame]
#!/bin/bash
#
# Wrapper that makes sure a spaCy model is available and then starts
# /app/systems/parse_spacy_pipe.py with it.

# A pipeline counts as failed when any of its stages fails, not only the last.
set -o pipefail

# Option defaults; each can be overridden from the command line
# (-m replaces the model, -d and -g switch the two flags to "False").
model=de_core_news_lg
use_dependencies=True
use_germalemma=True
# Print the command line synopsis to stdout and terminate the script.
# Globals:   model (read) - shown as the default for -m
# Arguments: none
# Exits:     always with status 1
usage() {
  printf '%s\n' \
    "Usage: $0 [-h] [-m MODEL] [-L] [-d] [-g]" \
    " -h Display this help message" \
    " -m MODEL Specify spaCy model (default: $model)" \
    " -L List available/installed models" \
    " -d Disable dependency parsing (faster processing)" \
    " -g Disable GermaLemma (use spaCy lemmatizer only)"
  exit 1
}
# Parse command line options.
# The leading ':' in the optstring puts getopts into silent mode:
#   - an unknown option sets opt='?' and stores the offending letter in OPTARG
#   - a missing argument sets opt=':' and stores the option letter in OPTARG
# Without it the ':' branch below can never run and OPTARG is empty in the
# '?' branch, so the error messages would not name the option.
while getopts ":hm:Ldg" opt; do
  case "$opt" in
    h)
      usage
      ;;
    m)
      model="$OPTARG"
      ;;
    L)
      # List what spaCy knows about, then stop; nothing else is processed.
      python -m spacy info 2>/dev/null || echo "No models installed"
      exit 0
      ;;
    d)
      use_dependencies="False"
      ;;
    g)
      use_germalemma="False"
      ;;
    \?)
      echo "Invalid option: -$OPTARG" >&2
      usage
      ;;
    :)
      echo "Option -$OPTARG requires an argument" >&2
      usage
      ;;
  esac
done
# Positional arguments are not accepted: anything left over after the
# options is treated as a usage error.
if [ "$OPTIND" -le "$#" ]; then
  usage
fi
# Directory holding preloaded/extracted models, and the location of the
# selected model inside it.
MODEL_DIR=/local/models
MODEL_PATH="${MODEL_DIR}/${model}"
# Create the directory up front. The result is deliberately not checked
# here; install_model tests whether the directory is writable before it
# tries to store anything in it.
mkdir -p -- "${MODEL_DIR}"
# Check whether a spaCy model can be loaded by the current Python environment.
# Arguments: $1 - model name (or path) handed to spacy.load
# Returns:   the exit status of python: 0 if spacy.load succeeded,
#            non-zero otherwise. Python's stderr is discarded.
is_model_installed() {
  local model_name="$1"
  # The name is passed as an argument rather than pasted into the Python
  # source, so quotes or other special characters in it can neither break
  # the snippet nor be executed as code.
  python -c 'import sys, spacy; spacy.load(sys.argv[1])' "$model_name" 2>/dev/null
}
# Tell whether a directory looks like an unpacked spaCy model.
# Arguments: $1 - directory to inspect
# Returns:   0 if a regular file config.cfg exists directly inside it,
#            1 otherwise
has_preloaded_model() {
  local candidate_dir=$1
  # config.cfg is the marker used to recognise a valid model directory
  [[ -f "${candidate_dir}/config.cfg" ]] || return 1
  return 0
}
# Run the download helper for a model, mirroring its combined output to
# stderr and to a log file.
# Globals:   SPACY_DOWNLOAD_LOG (read, optional) - log file location,
#            defaults to /tmp/spacy_download.log
# Arguments: $1 - model name
# Returns:   the downloader's status if it failed, otherwise tee's status.
#            PIPESTATUS is inspected so the result does not depend on the
#            caller having enabled 'set -o pipefail'.
_download_model() {
  local name="$1"
  local log_file="${SPACY_DOWNLOAD_LOG:-/tmp/spacy_download.log}"
  local -a stage_status
  python /app/download_with_progress.py "$name" 2>&1 | tee "$log_file" >&2
  stage_status=("${PIPESTATUS[@]}")
  if [ "${stage_status[0]}" -ne 0 ]; then
    return "${stage_status[0]}"
  fi
  return "${stage_status[1]}"
}

# Copy a freshly installed model package from site-packages to MODEL_PATH
# so it is available under MODEL_DIR afterwards. Best effort: problems are
# reported on stderr but never make the installation fail, because the
# model stays usable from the Python environment.
# Globals:   MODEL_PATH (read)
# Arguments: $1 - model name
# Returns:   always 0
_persist_model() {
  local model_name="$1"
  local site_packages installed_model versioned_dir

  if ! site_packages=$(python -c "import site; print(site.getsitepackages()[0])"); then
    echo "Warning: could not determine site-packages, model not persisted" >&2
    return 0
  fi
  installed_model="$site_packages/$model_name"
  if [ ! -d "$installed_model" ]; then
    return 0
  fi

  echo "Extracting model to $MODEL_PATH for persistence..." >&2
  # Find the actual model directory (e.g., de_core_news_lg-3.8.0)
  versioned_dir=$(find "$installed_model" -maxdepth 1 -type d -name "${model_name}-*" | head -n 1)
  if [ -n "$versioned_dir" ] && [ -f "$versioned_dir/config.cfg" ]; then
    # Flatten: the contents of the versioned directory become MODEL_PATH.
    if mkdir -p "$MODEL_PATH" && cp -r "$versioned_dir"/. "$MODEL_PATH"/; then
      # Set permissions so user can modify the model files
      chmod -R a+rwX "$MODEL_PATH" 2>/dev/null || true
      echo "Model extracted to $MODEL_PATH" >&2
    else
      # A partial copy must not pass for a valid preloaded model, and
      # config.cfg is the file that marks a directory as one.
      rm -f -- "$MODEL_PATH/config.cfg"
      echo "Warning: Could not copy model to $MODEL_PATH, keeping venv install only" >&2
    fi
  else
    # Fallback: copy the whole package. It is copied rather than moved so
    # the installation in site-packages stays loadable by model name.
    echo "Warning: Could not find versioned model directory, copying package as-is" >&2
    cp -r "$installed_model" "$MODEL_PATH" 2>/dev/null || true
    chmod -R a+rwX "$MODEL_PATH" 2>/dev/null || true
  fi
  return 0
}

# Make sure the requested model is available, downloading it if necessary.
# Order of checks: preloaded copy at MODEL_PATH, then a model loadable from
# the Python environment, then a download. After a successful download the
# model is additionally copied to MODEL_PATH when MODEL_DIR is writable.
# Globals:   MODEL_DIR, MODEL_PATH (read)
# Arguments: $1 - model name
# Outputs:   progress and diagnostics on stderr
# Returns:   0 if the model is available, 1 if the download failed
install_model() {
  local model_name="$1"
  local can_persist=0

  # Check if model exists in /local/models - if so, we'll use absolute path
  if has_preloaded_model "$MODEL_PATH"; then
    echo "Found preloaded model in $MODEL_PATH" >&2
    echo "Will use absolute path to avoid download" >&2
    return 0
  fi

  # Check if already installed in venv
  if is_model_installed "$model_name"; then
    echo "Model $model_name already installed in venv" >&2
    return 0
  fi

  if [ -w "$MODEL_DIR" ]; then
    can_persist=1
  else
    # MODEL_DIR not writable, install to venv (ephemeral)
    echo "Cannot write to $MODEL_DIR, installing to venv (ephemeral)" >&2
  fi

  if ! _download_model "$model_name"; then
    echo "Failed to download model $model_name" >&2
    return 1
  fi

  if [ "$can_persist" -eq 1 ]; then
    _persist_model "$model_name"
  fi
  return 0
}
# Nothing can run without the model, so give up if it cannot be provided.
install_model "$model" || {
  echo "ERROR: Could not install model $model, aborting." >&2
  exit 1
}

# Pick what is handed to spaCy: the directory under MODEL_DIR when a valid
# copy exists there, the plain model name otherwise.
if ! has_preloaded_model "$MODEL_PATH"; then
  MODEL_TO_USE="$model"
  echo "Using installed model: $MODEL_TO_USE" >&2
else
  MODEL_TO_USE="$MODEL_PATH"
  echo "Using preloaded model at: $MODEL_TO_USE" >&2
fi

# The two switches are exported as environment variables for the Python
# script rather than passed as arguments.
export SPACY_USE_DEPENDENCIES="$use_dependencies" \
  SPACY_USE_GERMALEMMA="$use_germalemma"

# Report the effective configuration on stderr.
{
  echo "Configuration:"
  echo " Model: $MODEL_TO_USE"
  echo " Use dependencies: $use_dependencies"
  echo " Use GermaLemma: $use_germalemma"
} >&2

# Start the spaCy tagging pipeline; its exit status becomes the script's.
pipeline_args=(
  --spacy_model "$MODEL_TO_USE"
  --corpus_name "stdin"
  --gld_token_type "CoNLLUP_Token"
  --comment_str "#"
)
python /app/systems/parse_spacy_pipe.py "${pipeline_args[@]}"