from sys import stdin
import argparse
import os
import spacy
from spacy.tokens import Doc
import logging, sys, time
from lib.CoNLL_Annotation import get_token_type
import my_utils.file_utils as fu
from germalemma import GermaLemma

def format_morphological_features(token):
    """
    Extract and format morphological features from a spaCy token for CoNLL-U output.

    Args:
        token: spaCy token object

    Returns:
        str: Formatted feature string for the CoNLL-U FEATS column (the 6th column).
             Returns "_" if no features are available.
    """
    if not hasattr(token, 'morph') or not token.morph:
        return "_"

    morph_dict = token.morph.to_dict()
    if not morph_dict:
        return "_"

    # CoNLL-U FEATS format: Feature=Value|Feature2=Value2
    features = []
    for feature, value in sorted(morph_dict.items()):
        features.append(f"{feature}={value}")

    return "|".join(features)

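# Illustrative example for format_morphological_features() (assumed values, not
# from a real run): a token whose analysis is
# {"Case": "Nom", "Gender": "Masc", "Number": "Sing"} is rendered as
# "Case=Nom|Gender=Masc|Number=Sing"; an empty analysis yields "_". Features are
# sorted alphabetically by name, matching the CoNLL-U convention for FEATS.
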
def format_dependency_relations(doc):
    """
    Extract and format dependency relations from a spaCy doc for CoNLL-U output.

    Args:
        doc: spaCy Doc object

    Returns:
        list: List of tuples (head_id, deprel) for each token
    """
    dependencies = []
    for token in doc:
        # HEAD column: 1-based index of the head token (0 for the root)
        if token.dep_ == "ROOT":
            head_id = 0
        else:
            # token.head.i is the 0-based index of the head within the doc,
            # so the 1-based CoNLL-U HEAD is simply token.head.i + 1
            head_id = token.head.i + 1

        # DEPREL column: dependency relation label
        deprel = token.dep_ if token.dep_ else "_"

        dependencies.append((head_id, deprel))

    return dependencies

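# Illustrative example for format_dependency_relations() (assumed parse, not from
# a real run): for the two-token sentence "Komm !", where "Komm" is the root and
# "!" attaches to it as punctuation, the function would return something like
# [(0, "ROOT"), (1, "punct")].
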
class WhitespaceTokenizer(object):
    def __init__(self, vocab):
        self.vocab = vocab

    def __call__(self, text):
        words = text.split(' ')
        # All tokens 'own' a subsequent space character in this tokenizer
        spaces = [True] * len(words)
        return Doc(self.vocab, words=words, spaces=spaces)

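# Illustrative example for WhitespaceTokenizer (assumed input): the pre-tokenized
# sentence "Das ist ein Test ." becomes a Doc of exactly five tokens, one per
# space-separated form, so token indices stay aligned with the source CoNLL rows.
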
def get_conll_str(anno_obj, spacy_doc, use_germalemma, use_dependencies):
    # First lines are comments (metadata).
    conll_lines = anno_obj.metadata  # Then we want: [ID, FORM, LEMMA, UPOS, XPOS, FEATS, HEAD, DEPREL, DEPS, MISC]

    # Get dependency relations if enabled
    dependencies = format_dependency_relations(spacy_doc) if use_dependencies == "True" else None

    for ix, token in enumerate(spacy_doc):
        morph_features = format_morphological_features(token)

        # Get HEAD and DEPREL columns
        if dependencies:
            head_id, deprel = dependencies[ix]
        else:
            head_id, deprel = "_", "_"

        if use_germalemma == "True":
            content = (str(ix+1), token.text, find_germalemma(token.text, token.tag_, token.lemma_), token.pos_, token.tag_, morph_features, str(head_id), deprel, "_", "_")
        else:
            content = (str(ix+1), token.text, token.lemma_, token.pos_, token.tag_, morph_features, str(head_id), deprel, "_", "_")  # Pure spaCy!
        conll_lines.append("\t".join(content))
    return "\n".join(conll_lines)

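# Illustrative example of one emitted row (assumed values, tabs shown as spaces):
#   1   Häuser   Haus   NOUN   NN   Case=Nom|Gender=Neut|Number=Plur   0   ROOT   _   _
# i.e. the ten CoNLL-U columns ID, FORM, LEMMA, UPOS, XPOS, FEATS, HEAD, DEPREL,
# DEPS and MISC, with DEPS and MISC always left underspecified ("_").
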
def find_germalemma(word, pos, spacy_lemma):
    simplify_pos = {"ADJA": "ADJ", "ADJD": "ADJ",
                    "NA": "N", "NE": "N", "NN": "N",
                    "ADV": "ADV", "PAV": "ADV", "PROAV": "ADV", "PAVREL": "ADV", "PWAV": "ADV", "PWAVREL": "ADV",
                    "VAFIN": "V", "VAIMP": "V", "VAINF": "V", "VAPP": "V", "VMFIN": "V", "VMINF": "V",
                    "VMPP": "V", "VVFIN": "V", "VVIMP": "V", "VVINF": "V", "VVIZU": "V", "VVPP": "V"
                    }
    # simplify_pos = {"VERB": "V", "ADV": "ADV", "ADJ": "ADJ", "NOUN": "N", "PROPN": "N"}
    try:
        return lemmatizer.find_lemma(word, simplify_pos.get(pos, "UNK"))
    except Exception:
        # GermaLemma raises for POS classes it does not handle (e.g. our "UNK"
        # fallback), in which case we keep the lemma spaCy already assigned.
        return spacy_lemma

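# Illustrative example for find_germalemma() (assumed lookup results): with the
# STTS tag "NN" mapped to GermaLemma's "N" class, a call like
# find_germalemma("Feinstaubbelastungen", "NN", "...") should yield
# "Feinstaubbelastung"; for a tag outside the mapping (e.g. "ART") the spaCy
# lemma passed in is returned unchanged.
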
if __name__ == "__main__":
    """
    --- Example Real Data TEST ---

    cat /export/netapp/kupietz/N-GRAMM-STUDIE/conllu/zca18.conllu | python systems/parse_spacy_pipe.py \
        --corpus_name DeReKo_zca18 --comment_str "#" > output_zca18.conll
    """

    parser = argparse.ArgumentParser()
    parser.add_argument("-n", "--corpus_name", help="Corpus name", default="Corpus")
    parser.add_argument("-sm", "--spacy_model", help="spaCy model containing the pipeline to tag", default="de_core_news_lg")
    parser.add_argument("-gtt", "--gld_token_type", help="CoNLL format of the gold data", default="CoNLLUP_Token")
    parser.add_argument("-ugl", "--use_germalemma", help="Use GermaLemma lemmatizer on top of spaCy", default="True")
    parser.add_argument("-udp", "--use_dependencies", help="Include dependency parsing (adds HEAD/DEPREL columns; set to False for faster processing)", default="True")
    parser.add_argument("-c", "--comment_str", help="Comment marker used inside the CoNLL file", default="#")
    args = parser.parse_args()

    file_has_next, chunk_ix = True, 0
    CHUNK_SIZE = 20000
    SPACY_BATCH = 2000
    SPACY_PROC = 10

    # =====================================================================================
    # LOGGING INFO ...
    # =====================================================================================
    logger = logging.getLogger(__name__)
    console_hdlr = logging.StreamHandler(sys.stderr)
    file_hdlr = logging.FileHandler(filename=f"logs/Parse_{args.corpus_name}.SpaCy.log")
    logging.basicConfig(level=logging.INFO, handlers=[console_hdlr, file_hdlr])

    # Override with environment variables if set (useful for Docker)
    if os.getenv("SPACY_USE_DEPENDENCIES") is not None:
        args.use_dependencies = os.getenv("SPACY_USE_DEPENDENCIES", "True")
        logger.info(f"Using SPACY_USE_DEPENDENCIES environment variable: {args.use_dependencies}")

    if os.getenv("SPACY_USE_GERMALEMMA") is not None:
        args.use_germalemma = os.getenv("SPACY_USE_GERMALEMMA", "True")
        logger.info(f"Using SPACY_USE_GERMALEMMA environment variable: {args.use_germalemma}")

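    # Illustrative invocation using the environment overrides (assumed file names):
    #   SPACY_USE_DEPENDENCIES=False SPACY_USE_GERMALEMMA=True \
    #       python systems/parse_spacy_pipe.py -n MyCorpus < input.conllu > output.conll
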
    logger.info(f"Chunking {args.corpus_name} corpus in chunks of {CHUNK_SIZE} sentences")

    # =====================================================================================
    # POS TAG DOCUMENTS
    # =====================================================================================
    # Configure which components to disable based on the dependency parsing option
    disabled_components = ["ner"]
    if args.use_dependencies != "True":
        disabled_components.append("parser")
        logger.info("Dependency parsing disabled for faster processing")
    else:
        logger.info("Dependency parsing enabled (slower but includes HEAD/DEPREL)")

    spacy_de = spacy.load(args.spacy_model, disable=disabled_components)
    spacy_de.tokenizer = WhitespaceTokenizer(spacy_de.vocab)  # We won't re-tokenize, to respect how the source CoNLL files are tokenized!
    lemmatizer = GermaLemma()

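    # Note on the disabled components (assuming a standard spaCy 3 German pipeline
    # such as de_core_news_lg): tagging, morphology and lemmatization still run;
    # only "ner" (never needed here) and, optionally, "parser" are skipped, and
    # the parser is what dominates the runtime when dependencies are enabled.
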
    start = time.time()
    total_processed_sents = 0

    while file_has_next:
        annos, file_has_next = fu.get_file_annos_chunk(stdin, chunk_size=CHUNK_SIZE, token_class=get_token_type(args.gld_token_type), comment_str=args.comment_str, our_foundry="spacy")
        if len(annos) == 0: break
        total_processed_sents += len(annos)
        logger.info(f"Already processed {total_processed_sents} sentences...")
        sents = [a.get_sentence() for a in annos]
        for ix, doc in enumerate(spacy_de.pipe(sents, batch_size=SPACY_BATCH, n_process=SPACY_PROC)):
            conll_str = get_conll_str(annos[ix], doc, use_germalemma=args.use_germalemma, use_dependencies=args.use_dependencies)
            print(conll_str + "\n")  # print() appends a second newline: a blank line separates sentences in CoNLL

    end = time.time()
    logger.info(f"Processing {args.corpus_name} took {(end - start)} seconds!")