blob: 41daabb337846825156b3e16e767de16c537a566 [file] [log] [blame]
#!/usr/bin/env python3
"""
Display available spaCy models
Uses a curated list of current models since spacy.io/models is JavaScript-rendered
"""
import sys
def get_models():
    """Return the curated spaCy model catalogue, keyed by language code.

    Every language ships the three standard pipelines (``sm``, ``md``,
    ``lg``); some additionally ship a transformer pipeline, which is
    appended as the last entry of that language's list.

    Returns:
        A new dict mapping each language code to a new list of model
        package names.
    """
    # Hand-curated catalogue (updated 2025-01), based on
    # https://spacy.io/models and https://github.com/explosion/spacy-models
    # Row layout: (language code, training genre, transformer package or None).
    # The transformer package is spelled out in full because its name does
    # not always follow the "core" pattern (see French).
    catalog = (
        ('ca', 'news', 'ca_core_news_trf'),
        ('zh', 'web', 'zh_core_web_trf'),
        ('hr', 'news', None),
        ('da', 'news', 'da_core_news_trf'),
        ('nl', 'news', None),
        ('en', 'web', 'en_core_web_trf'),
        ('fi', 'news', None),
        ('fr', 'news', 'fr_dep_news_trf'),
        ('de', 'news', None),
        ('el', 'news', None),
        ('it', 'news', None),
        ('ja', 'news', 'ja_core_news_trf'),
        ('ko', 'news', None),
        ('lt', 'news', None),
        ('mk', 'news', None),
        ('nb', 'news', None),
        ('pl', 'news', None),
        ('pt', 'news', None),
        ('ro', 'news', None),
        ('ru', 'news', None),
        ('es', 'news', None),
        ('sv', 'news', None),
        ('uk', 'news', 'uk_core_news_trf'),
    )

    models = {}
    for code, genre, trf_package in catalog:
        packages = [f'{code}_core_{genre}_{size}' for size in ('sm', 'md', 'lg')]
        if trf_package is not None:
            packages.append(trf_package)
        models[code] = packages
    return models
def get_language_name(code):
    """Translate a language code into its English display name.

    Args:
        code: Language code string such as ``'de'`` or ``'nb'``.

    Returns:
        The display name for a known code; otherwise the code itself,
        upper-cased.
    """
    known_names = dict((
        ('ca', 'Catalan'),
        ('zh', 'Chinese'),
        ('hr', 'Croatian'),
        ('da', 'Danish'),
        ('nl', 'Dutch'),
        ('en', 'English'),
        ('fi', 'Finnish'),
        ('fr', 'French'),
        ('de', 'German'),
        ('el', 'Greek'),
        ('it', 'Italian'),
        ('ja', 'Japanese'),
        ('ko', 'Korean'),
        ('lt', 'Lithuanian'),
        ('mk', 'Macedonian'),
        ('nb', 'Norwegian Bokmål'),
        ('pl', 'Polish'),
        ('pt', 'Portuguese'),
        ('ro', 'Romanian'),
        ('ru', 'Russian'),
        ('es', 'Spanish'),
        ('sv', 'Swedish'),
        ('uk', 'Ukrainian'),
    ))
    # Fallback for codes without a display name: the code itself, upper-cased.
    fallback = code.upper()
    return known_names.get(code, fallback)
def display_models(by_language):
    """Print the model catalogue to stderr, grouped by language.

    Languages on the priority list come first, in that fixed order, with
    one line per model (sorted by name) showing an estimated size. Every
    other language follows in a compact "Other languages" section, sorted
    by language code, listing only the last underscore-separated part of
    each model name.

    Args:
        by_language: Mapping of language code to a list of model names.
    """
    # Languages that get the detailed listing, in display order.
    featured = ['de', 'en', 'fr', 'es', 'it', 'pt', 'nl', 'pl', 'ru', 'zh', 'ja']
    # Size estimates keyed by model-name suffix; unknown suffixes get no label.
    size_estimates = (
        ('_sm', "~15MB"),
        ('_md', "~100MB"),
        ('_lg', "~560MB"),
        ('_trf', "~500MB (transformer)"),
    )

    for code in featured:
        if code not in by_language:
            continue
        heading = get_language_name(code)
        print(f"\n{heading}:", file=sys.stderr)
        for name in sorted(by_language[code]):
            size = next(
                (label for suffix, label in size_estimates if name.endswith(suffix)),
                "",
            )
            marker = ""
            if name == "de_core_news_lg":
                marker = " (default)"
            print(f" {name:30} {size}{marker}", file=sys.stderr)

    # Everything not featured above is summarised on one line per language.
    others = sorted(code for code in by_language.keys() if code not in featured)
    if not others:
        return
    print("\nOther languages:", file=sys.stderr)
    for code in others:
        label = get_language_name(code)
        variants = ", ".join(
            name.rpartition('_')[2] for name in sorted(by_language[code])
        )
        print(f" {label}: {variants}", file=sys.stderr)
def main():
    """Print the model overview, the totals and a usage hint to stderr."""
    print("=== Available spaCy Models ===\n", file=sys.stderr)

    catalog = get_models()
    display_models(catalog)

    # Summary line: number of model packages and number of languages.
    model_count = sum(map(len, catalog.values()))
    language_count = len(catalog)
    print(
        f"\n\nTotal: {model_count} models across {language_count} languages",
        file=sys.stderr,
    )
    print(
        "\nFor complete details and latest updates, visit: https://spacy.io/models",
        file=sys.stderr,
    )
    print(
        "\nUsage: docker run --rm -i korap/conllu-spacy -m MODEL_NAME < input.conllu",
        file=sys.stderr,
    )
# Print the listing only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()