Actually list available models with -L
Change-Id: I25c1d38b86dd8f208c749327f80575309b3b0d7d
diff --git a/Dockerfile b/Dockerfile
index 0aa6026..246ef83 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -41,6 +41,7 @@
COPY my_utils /app/my_utils
COPY docker-entrypoint.sh /docker-entrypoint.sh
COPY download_with_progress.py /app/download_with_progress.py
+COPY list_spacy_models.py /app/list_spacy_models.py
# Set environment variables
ENV VIRTUAL_ENV=/app/venv
@@ -69,7 +70,7 @@
# Make entrypoint executable and set permissions
RUN chmod +x /docker-entrypoint.sh && \
- chmod +x /app/download_with_progress.py
+    chmod +x /app/download_with_progress.py /app/list_spacy_models.py
# Change ownership of app directories to appuser
RUN chown -R appuser:appuser /app /local /docker-entrypoint.sh
diff --git a/Dockerfile.with-models b/Dockerfile.with-models
index f6f64c5..9d71974 100644
--- a/Dockerfile.with-models
+++ b/Dockerfile.with-models
@@ -67,6 +67,7 @@
COPY my_utils /app/my_utils
COPY docker-entrypoint.sh /docker-entrypoint.sh
COPY download_with_progress.py /app/download_with_progress.py
+COPY list_spacy_models.py /app/list_spacy_models.py
# Set environment variables
ENV VIRTUAL_ENV=/app/venv
@@ -95,7 +96,7 @@
# Make entrypoint executable and set permissions
RUN chmod +x /docker-entrypoint.sh && \
- chmod +x /app/download_with_progress.py
+    chmod +x /app/download_with_progress.py /app/list_spacy_models.py
# Change ownership of app directories to appuser
RUN chown -R appuser:appuser /app /local /docker-entrypoint.sh
diff --git a/docker-entrypoint.sh b/docker-entrypoint.sh
index 5fb9b5e..0e45514 100755
--- a/docker-entrypoint.sh
+++ b/docker-entrypoint.sh
@@ -27,7 +27,58 @@
model="$OPTARG"
;;
L)
- python -m spacy info 2>/dev/null || echo "No models installed"
+    echo "=== Installed Models ===" >&2
+
+    # List the spaCy model packages installed in the venv, already
+    # normalised to model names (de-core-news-lg -> de_core_news_lg).
+    # importlib.metadata is tried first because pkg_resources is deprecated
+    # and missing from venvs created without setuptools; stderr is discarded
+    # so a failed lookup degrades to the "No models installed" message.
+    INSTALLED=$(python -c "
+try:
+    from importlib.metadata import distributions
+    names = [dist.metadata.get('Name') for dist in distributions()]
+except ImportError:
+    import pkg_resources
+    names = [dist.project_name for dist in pkg_resources.working_set]
+found = set()
+for name in names:
+    norm = (name or '').lower().replace('-', '_')
+    if norm.endswith(('_sm', '_md', '_lg', '_trf')) and not norm.startswith('spacy'):
+        found.add(norm)
+for norm in sorted(found):
+    print(norm)
+" 2>/dev/null)
+
+    if [ -n "$INSTALLED" ]; then
+        # read -r keeps each name verbatim; the loop variable is not called
+        # "model" so it cannot shadow the script's own model variable.
+        printf '%s\n' "$INSTALLED" | while IFS= read -r installed_model; do
+            echo " $installed_model" >&2
+        done
+    else
+        echo " No models installed in venv" >&2
+    fi
+
+    # List model directories under /local/models (recognised by config.cfg).
+    # Globbing avoids parsing ls output, and the heading is printed only
+    # once a real model directory has been found.
+    found_local=0
+    for cfg in /local/models/*/config.cfg; do
+        [ -f "$cfg" ] || continue
+        if [ "$found_local" -eq 0 ]; then
+            echo "" >&2
+            echo "Models in /local/models:" >&2
+            found_local=1
+        fi
+        model_dir=${cfg%/config.cfg}
+        echo " ${model_dir##*/}" >&2
+    done
+
+    echo "" >&2
+
+    # Show available models list
+    python /app/list_spacy_models.py
exit 0
;;
d)
diff --git a/list_spacy_models.py b/list_spacy_models.py
new file mode 100755
index 0000000..41daabb
--- /dev/null
+++ b/list_spacy_models.py
@@ -0,0 +1,149 @@
+#!/usr/bin/env python3
+"""
+Display available spaCy models.
+
+Uses a curated list of current models since spacy.io/models is
+JavaScript-rendered. By default all output is written to stderr.
+"""
+import sys
+
+# Model marked "(default)" in the listing.
+DEFAULT_MODEL = "de_core_news_lg"
+
+# Languages listed first, one model per line, in this order.
+PRIORITY_LANGUAGES = ('de', 'en', 'fr', 'es', 'it', 'pt', 'nl', 'pl', 'ru', 'zh', 'ja')
+
+# Estimated download size shown for each model-name suffix.
+SIZE_BY_SUFFIX = {
+    '_sm': "~15MB",
+    '_md': "~100MB",
+    '_lg': "~560MB",
+    '_trf': "~500MB (transformer)",
+}
+
+# Display names for the language codes that prefix the model names.
+LANGUAGE_NAMES = {
+    'ca': 'Catalan',
+    'zh': 'Chinese',
+    'hr': 'Croatian',
+    'da': 'Danish',
+    'nl': 'Dutch',
+    'en': 'English',
+    'fi': 'Finnish',
+    'fr': 'French',
+    'de': 'German',
+    'el': 'Greek',
+    'it': 'Italian',
+    'ja': 'Japanese',
+    'ko': 'Korean',
+    'lt': 'Lithuanian',
+    'mk': 'Macedonian',
+    'nb': 'Norwegian Bokmål',
+    'pl': 'Polish',
+    'pt': 'Portuguese',
+    'ro': 'Romanian',
+    'ru': 'Russian',
+    'sl': 'Slovenian',
+    'es': 'Spanish',
+    'sv': 'Swedish',
+    'uk': 'Ukrainian',
+}
+
+
+def get_models():
+    """Return the curated catalogue of spaCy models.
+
+    Returns:
+        dict: language code -> list of model package names.
+    """
+    # Curated list of spaCy models (updated 2025-01)
+    # Based on https://spacy.io/models and https://github.com/explosion/spacy-models
+    return {
+        'ca': ['ca_core_news_sm', 'ca_core_news_md', 'ca_core_news_lg', 'ca_core_news_trf'],
+        'zh': ['zh_core_web_sm', 'zh_core_web_md', 'zh_core_web_lg', 'zh_core_web_trf'],
+        'hr': ['hr_core_news_sm', 'hr_core_news_md', 'hr_core_news_lg'],
+        'da': ['da_core_news_sm', 'da_core_news_md', 'da_core_news_lg', 'da_core_news_trf'],
+        'nl': ['nl_core_news_sm', 'nl_core_news_md', 'nl_core_news_lg'],
+        'en': ['en_core_web_sm', 'en_core_web_md', 'en_core_web_lg', 'en_core_web_trf'],
+        'fi': ['fi_core_news_sm', 'fi_core_news_md', 'fi_core_news_lg'],
+        'fr': ['fr_core_news_sm', 'fr_core_news_md', 'fr_core_news_lg', 'fr_dep_news_trf'],
+        'de': ['de_core_news_sm', 'de_core_news_md', 'de_core_news_lg', 'de_dep_news_trf'],
+        'el': ['el_core_news_sm', 'el_core_news_md', 'el_core_news_lg'],
+        'it': ['it_core_news_sm', 'it_core_news_md', 'it_core_news_lg'],
+        'ja': ['ja_core_news_sm', 'ja_core_news_md', 'ja_core_news_lg', 'ja_core_news_trf'],
+        'ko': ['ko_core_news_sm', 'ko_core_news_md', 'ko_core_news_lg'],
+        'lt': ['lt_core_news_sm', 'lt_core_news_md', 'lt_core_news_lg'],
+        'mk': ['mk_core_news_sm', 'mk_core_news_md', 'mk_core_news_lg'],
+        'nb': ['nb_core_news_sm', 'nb_core_news_md', 'nb_core_news_lg'],
+        'pl': ['pl_core_news_sm', 'pl_core_news_md', 'pl_core_news_lg'],
+        'pt': ['pt_core_news_sm', 'pt_core_news_md', 'pt_core_news_lg'],
+        'ro': ['ro_core_news_sm', 'ro_core_news_md', 'ro_core_news_lg'],
+        'ru': ['ru_core_news_sm', 'ru_core_news_md', 'ru_core_news_lg'],
+        'sl': ['sl_core_news_sm', 'sl_core_news_md', 'sl_core_news_lg', 'sl_core_news_trf'],
+        'es': ['es_core_news_sm', 'es_core_news_md', 'es_core_news_lg', 'es_dep_news_trf'],
+        'sv': ['sv_core_news_sm', 'sv_core_news_md', 'sv_core_news_lg'],
+        'uk': ['uk_core_news_sm', 'uk_core_news_md', 'uk_core_news_lg', 'uk_core_news_trf'],
+    }
+
+
+def get_language_name(code):
+    """Return the display name for a language code.
+
+    Unknown codes fall back to the upper-cased code itself.
+    """
+    return LANGUAGE_NAMES.get(code, code.upper())
+
+
+def _estimate_size(model):
+    """Return the size estimate for *model*, or "" for an unknown suffix."""
+    for suffix, size in SIZE_BY_SUFFIX.items():
+        if model.endswith(suffix):
+            return size
+    return ""
+
+
+def display_models(by_language, file=None):
+    """Print the models grouped by language.
+
+    Priority languages come first with one line per model (name, size
+    estimate, default marker); all other languages are condensed to one
+    line that lists only the model-size suffixes.
+
+    Args:
+        by_language: mapping of language code -> list of model names.
+        file: stream to write to; defaults to sys.stderr.
+    """
+    out = sys.stderr if file is None else file
+
+    for lang_code in PRIORITY_LANGUAGES:
+        if lang_code not in by_language:
+            continue
+        print(f"\n{get_language_name(lang_code)}:", file=out)
+        for model in sorted(by_language[lang_code]):
+            default = " (default)" if model == DEFAULT_MODEL else ""
+            print(f" {model:30} {_estimate_size(model)}{default}", file=out)
+
+    remaining = sorted(code for code in by_language if code not in PRIORITY_LANGUAGES)
+    if remaining:
+        print("\nOther languages:", file=out)
+        for lang_code in remaining:
+            # Only the trailing size suffix (sm/md/lg/trf) is shown here.
+            sizes = ", ".join(m.split('_')[-1] for m in sorted(by_language[lang_code]))
+            print(f" {get_language_name(lang_code)}: {sizes}", file=out)
+
+
+def main():
+    """Print the catalogue, a summary line and a usage hint to stderr."""
+    print("=== Available spaCy Models ===\n", file=sys.stderr)
+
+    by_language = get_models()
+    display_models(by_language)
+
+    total = sum(len(models) for models in by_language.values())
+    print(f"\n\nTotal: {total} models across {len(by_language)} languages", file=sys.stderr)
+    print("\nFor complete details and latest updates, visit: https://spacy.io/models", file=sys.stderr)
+    print("\nUsage: docker run --rm -i korap/conllu-spacy -m MODEL_NAME < input.conllu", file=sys.stderr)
+
+
+if __name__ == "__main__":
+    main()