| Marc Kupietz | 68a1813 | 2025-11-29 11:17:06 +0100 | [diff] [blame] | 1 | #!/usr/bin/env python3 |
| 2 | """ |
| 3 | Display available spaCy models |
| 4 | Uses a curated list of current models since spacy.io/models is JavaScript-rendered |
| 5 | """ |
| 6 | import sys |
| 7 | |
def get_models():
    """Return the curated spaCy model names, keyed by language code.

    Each listed language has the three standard sizes (sm, md, lg) of a
    single pipeline family.  Languages that also have a transformer
    pipeline name it in full, because it is not always part of the same
    family (for example ``fr_dep_news_trf``).
    """
    # Curated catalogue of spaCy models (updated 2025-01), based on
    # https://spacy.io/models and https://github.com/explosion/spacy-models
    # Columns: language code, pipeline family, transformer model (or None).
    catalogue = (
        ('ca', 'core_news', 'ca_core_news_trf'),
        ('zh', 'core_web', 'zh_core_web_trf'),
        ('hr', 'core_news', None),
        ('da', 'core_news', 'da_core_news_trf'),
        ('nl', 'core_news', None),
        ('en', 'core_web', 'en_core_web_trf'),
        ('fi', 'core_news', None),
        ('fr', 'core_news', 'fr_dep_news_trf'),
        ('de', 'core_news', None),
        ('el', 'core_news', None),
        ('it', 'core_news', None),
        ('ja', 'core_news', 'ja_core_news_trf'),
        ('ko', 'core_news', None),
        ('lt', 'core_news', None),
        ('mk', 'core_news', None),
        ('nb', 'core_news', None),
        ('pl', 'core_news', None),
        ('pt', 'core_news', None),
        ('ro', 'core_news', None),
        ('ru', 'core_news', None),
        ('es', 'core_news', None),
        ('sv', 'core_news', None),
        ('uk', 'core_news', 'uk_core_news_trf'),
    )
    standard_sizes = ('sm', 'md', 'lg')

    models_by_language = {}
    for lang, family, transformer in catalogue:
        names = [f'{lang}_{family}_{size}' for size in standard_sizes]
        if transformer is not None:
            names.append(transformer)
        models_by_language[lang] = names
    return models_by_language
| 37 | |
def get_language_name(code):
    """Translate a language code into its English display name.

    Codes that are not in the table are returned upper-cased instead.
    """
    known_names = dict((
        ('ca', 'Catalan'),
        ('zh', 'Chinese'),
        ('hr', 'Croatian'),
        ('da', 'Danish'),
        ('nl', 'Dutch'),
        ('en', 'English'),
        ('fi', 'Finnish'),
        ('fr', 'French'),
        ('de', 'German'),
        ('el', 'Greek'),
        ('it', 'Italian'),
        ('ja', 'Japanese'),
        ('ko', 'Korean'),
        ('lt', 'Lithuanian'),
        ('mk', 'Macedonian'),
        ('nb', 'Norwegian Bokmål'),
        ('pl', 'Polish'),
        ('pt', 'Portuguese'),
        ('ro', 'Romanian'),
        ('ru', 'Russian'),
        ('es', 'Spanish'),
        ('sv', 'Swedish'),
        ('uk', 'Ukrainian'),
    ))
    # Computed up front, before the lookup, so the fallback is evaluated
    # for every call regardless of whether the code is known.
    fallback = code.upper()
    if code in known_names:
        return known_names[code]
    return fallback
| 66 | |
def _estimate_model_size(model):
    """Return a rough size label for *model*, chosen by its name suffix.

    Returns an empty string when the suffix is not one of the known sizes.
    """
    size_by_suffix = {
        '_sm': "~15MB",
        '_md': "~100MB",
        '_lg': "~560MB",
        '_trf': "~500MB (transformer)",
    }
    for suffix, label in size_by_suffix.items():
        if model.endswith(suffix):
            return label
    return ""


def display_models(by_language, *, default_model="de_core_news_lg"):
    """Print the models grouped by language to stderr.

    A fixed set of priority languages is listed first, one model per line
    with an estimated size.  All other languages follow in a compact
    "Other languages" section that shows only the size suffixes.

    Args:
        by_language: Mapping of language code to a collection of model names.
        default_model: Model name to flag with " (default)" in the listing.

    Returns:
        None.  All output goes to ``sys.stderr``.
        NOTE(review): presumably stdout is kept free for the tool's data
        stream -- confirm.
    """
    # Priority languages, in the order they are shown.
    priority = ['de', 'en', 'fr', 'es', 'it', 'pt', 'nl', 'pl', 'ru', 'zh', 'ja']

    for lang_code in priority:
        if lang_code not in by_language:
            continue
        lang_name = get_language_name(lang_code)
        print(f"\n{lang_name}:", file=sys.stderr)
        for model in sorted(by_language[lang_code]):
            size = _estimate_model_size(model)
            default = " (default)" if model == default_model else ""
            print(f" {model:30} {size}{default}", file=sys.stderr)

    # Everything that was not covered above, in alphabetical order of code.
    priority_codes = set(priority)
    remaining = sorted(code for code in by_language if code not in priority_codes)
    if remaining:
        print("\nOther languages:", file=sys.stderr)
        for lang_code in remaining:
            lang_name = get_language_name(lang_code)
            # Only the trailing size suffix (sm, md, lg, trf) is shown here.
            models = ", ".join(m.split('_')[-1] for m in sorted(by_language[lang_code]))
            print(f" {lang_name}: {models}", file=sys.stderr)
| 101 | |
def main():
    """Print the model overview, the totals and a usage hint to stderr."""
    print("=== Available spaCy Models ===\n", file=sys.stderr)

    catalogue = get_models()
    display_models(catalogue)

    # Totals for the summary line.
    model_count = 0
    for names in catalogue.values():
        model_count += len(names)
    language_count = len(catalogue)

    print(f"\n\nTotal: {model_count} models across {language_count} languages", file=sys.stderr)
    print("\nFor complete details and latest updates, visit: https://spacy.io/models", file=sys.stderr)
    print("\nUsage: docker run --rm -i korap/conllu-spacy -m MODEL_NAME < input.conllu", file=sys.stderr)


if __name__ == "__main__":
    main()