#!/usr/bin/env python3
"""
Display available spaCy models.

Uses a curated list of current models, since spacy.io/models is JavaScript-rendered.
"""
6import sys
7
def get_models():
    """Return the curated catalogue of spaCy pipelines, keyed by language code.

    The catalogue is hard-coded (last updated 2025-01) and based on
    https://spacy.io/models and https://github.com/explosion/spacy-models.

    Returns:
        dict: maps a language code such as ``'de'`` to a list of pipeline
        package names. Every language lists its ``sm``, ``md`` and ``lg``
        pipelines, in that order, followed by a transformer pipeline where
        the catalogue has one. A new dict with new lists is built per call.
    """
    standard_sizes = ("sm", "md", "lg")

    # One row per language: (code, pipeline family, transformer package or None).
    # The transformer name is spelled out in full because it does not always
    # share the family of the other pipelines (French uses ``fr_dep_news_trf``).
    catalogue = (
        ("ca", "core_news", "ca_core_news_trf"),
        ("zh", "core_web", "zh_core_web_trf"),
        ("hr", "core_news", None),
        ("da", "core_news", "da_core_news_trf"),
        ("nl", "core_news", None),
        ("en", "core_web", "en_core_web_trf"),
        ("fi", "core_news", None),
        ("fr", "core_news", "fr_dep_news_trf"),
        ("de", "core_news", None),
        ("el", "core_news", None),
        ("it", "core_news", None),
        ("ja", "core_news", "ja_core_news_trf"),
        ("ko", "core_news", None),
        ("lt", "core_news", None),
        ("mk", "core_news", None),
        ("nb", "core_news", None),
        ("pl", "core_news", None),
        ("pt", "core_news", None),
        ("ro", "core_news", None),
        ("ru", "core_news", None),
        ("es", "core_news", None),
        ("sv", "core_news", None),
        ("uk", "core_news", "uk_core_news_trf"),
    )

    models_by_language = {}
    for code, family, transformer in catalogue:
        packages = [f"{code}_{family}_{size}" for size in standard_sizes]
        if transformer is not None:
            packages.append(transformer)
        models_by_language[code] = packages
    return models_by_language
37
def get_language_name(code):
    """Translate a language code into its English display name.

    Args:
        code: language code string, e.g. ``'de'``.

    Returns:
        str: the display name for a known code; for any other code, the code
        itself converted to upper case.
    """
    # Evaluated unconditionally, just as a ``dict.get`` default argument would be.
    fallback = code.upper()

    display_names = dict((
        ("ca", "Catalan"),
        ("zh", "Chinese"),
        ("hr", "Croatian"),
        ("da", "Danish"),
        ("nl", "Dutch"),
        ("en", "English"),
        ("fi", "Finnish"),
        ("fr", "French"),
        ("de", "German"),
        ("el", "Greek"),
        ("it", "Italian"),
        ("ja", "Japanese"),
        ("ko", "Korean"),
        ("lt", "Lithuanian"),
        ("mk", "Macedonian"),
        ("nb", "Norwegian Bokmål"),
        ("pl", "Polish"),
        ("pt", "Portuguese"),
        ("ro", "Romanian"),
        ("ru", "Russian"),
        ("es", "Spanish"),
        ("sv", "Swedish"),
        ("uk", "Ukrainian"),
    ))

    if code in display_names:
        return display_names[code]
    return fallback
66
def display_models(by_language):
    """Print the model catalogue to stderr, grouped by language.

    Languages on the priority list come first, in priority order, each with
    one line per model (sorted by name) showing a size estimate derived from
    the name suffix; ``de_core_news_lg`` is tagged as the default. All other
    languages are then summarised under "Other languages", sorted by code,
    one line each, listing only the last ``_``-separated part of each model
    name.

    Args:
        by_language: mapping of language code to a list of model names.
    """
    # Languages listed in detail, in this order, before everything else.
    featured = ['de', 'en', 'fr', 'es', 'it', 'pt', 'nl', 'pl', 'ru', 'zh', 'ja']

    # Rough download size per name suffix; the first matching suffix wins.
    size_by_suffix = (
        ('_sm', "~15MB"),
        ('_md', "~100MB"),
        ('_lg', "~560MB"),
        ('_trf', "~500MB (transformer)"),
    )

    for lang_code in featured:
        if lang_code not in by_language:
            continue
        heading = get_language_name(lang_code)
        print(f"\n{heading}:", file=sys.stderr)
        for model in sorted(by_language[lang_code]):
            # Unknown suffixes get no size estimate at all.
            size = next(
                (label for suffix, label in size_by_suffix if model.endswith(suffix)),
                "",
            )
            marker = " (default)" if model == "de_core_news_lg" else ""
            print(f" {model:30} {size}{marker}", file=sys.stderr)

    # Everything not featured above is condensed to one line per language.
    others = sorted(code for code in by_language if code not in featured)
    if not others:
        return

    print("\nOther languages:", file=sys.stderr)
    for lang_code in others:
        heading = get_language_name(lang_code)
        suffixes = [name.split('_')[-1] for name in sorted(by_language[lang_code])]
        summary = ", ".join(suffixes)
        print(f" {heading}: {summary}", file=sys.stderr)
101
def main():
    """Print the model listing, a totals line and a usage hint to stderr."""
    def emit(text):
        # All output goes to stderr; stdout is never written to here.
        print(text, file=sys.stderr)

    emit("=== Available spaCy Models ===\n")

    catalogue = get_models()
    display_models(catalogue)

    model_total = sum(map(len, catalogue.values()))
    language_total = len(catalogue)
    emit(f"\n\nTotal: {model_total} models across {language_total} languages")
    emit("\nFor complete details and latest updates, visit: https://spacy.io/models")
    emit("\nUsage: docker run --rm -i korap/conllu-spacy -m MODEL_NAME < input.conllu")
111
# Script entry point: print the listing only when this file is run directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()