blob: 69bf4a7e7eda8bc53309935a6ebbeb1f9f0aace5 [file] [log] [blame]
from collections import defaultdict, OrderedDict
import re
3
# CoNLL-U Format - https://universaldependencies.org/format.html
5
6
def get_token_type(type_str):
    """Map a token-class name to the token class of the same name.

    Args:
        type_str: One of "CoNLL09_Token", "RNNTagger_Token", "CoNLLUP_Token"
            or "TigerNew_Token".

    Returns:
        The class object whose name equals ``type_str``.

    Raises:
        NotImplementedError: If ``type_str`` is none of the names above.
    """
    # Guard-clause form: each supported name returns immediately.
    if type_str == "CoNLL09_Token":
        return CoNLL09_Token
    if type_str == "RNNTagger_Token":
        return RNNTagger_Token
    if type_str == "CoNLLUP_Token":
        return CoNLLUP_Token
    if type_str == "TigerNew_Token":
        return TigerNew_Token
    raise NotImplementedError(f"I don't know what to do with {type_str} token type!")
18
19
class TigerNew_Token():
    """Token built from a whitespace-separated ``FORM XPOS`` line.

    Only the word form and the POS tag come from the input; every other
    column is filled with the "_" placeholder.
    """

    def __init__(self, raw_line, word_ix):
        """Parse ``raw_line``; ``word_ix`` is the 0-based position in the sentence.

        Raises:
            IndexError: If the line has fewer than two whitespace-separated fields.
        """
        fields = raw_line.split()  # [FORM, XPOS]
        form, xpos = fields[0], fields[1]
        self.info = fields
        self.id = word_ix + 1  # 1-based ID as in the CoNLL file
        self.position = word_ix  # 0-based position in sentence
        self.word = form
        self.pos_tag = xpos
        # Columns that the two-field input does not provide.
        self.lemma = self.pos_universal = self.detail_tag = "_"
        self.head = self.dep_tag = self.blank = self.auto_score = "_"

    def get_info(self):
        """Return the ten output columns as a list of strings."""
        columns = [str(self.id), self.word, self.lemma, self.pos_universal]
        columns += [self.pos_tag, self.detail_tag, str(self.head)]
        columns += [self.dep_tag, self.blank, self.auto_score]
        return columns

    def get_conllU_line(self, separator="\t"):
        """Return the ten columns joined by ``separator``."""
        return separator.join(self.get_info())
43
44
class RNNTagger_Token():
    """Token built from a whitespace-separated ``FORM XPOS.FEATS LEMMA`` line.

    The second field combines the POS tag and dot-separated morphological
    features (e.g. ``NN.Gen.Sg.Fem``); it is split into ``pos_tag`` and a
    pipe-joined ``detail_tag``. Columns that the input does not provide are
    filled with the "_" placeholder.
    """

    def __init__(self, raw_line, word_ix):
        """Parse ``raw_line``; ``word_ix`` is the 0-based position in the sentence.

        Raises:
            IndexError: If the line has fewer than three whitespace-separated fields.
        """
        info = raw_line.split()  # [FORM, XPOS.FEATS, LEMMA]
        self.info = info
        self.id = word_ix + 1  # 1-based ID as in the CoNLL file
        self.position = word_ix  # 0-based position in sentence
        self.word = info[0]
        self.lemma = info[2]
        self.pos_universal = "_"
        self.pos_tag, self.detail_tag = self._process_tag(info[1])  # 'NN.Gen.Sg.Fem'
        self.head = "_"
        self.dep_tag = "_"
        self.blank = "_"
        self.auto_score = "_"

    def _process_tag(self, tag):
        """Split ``TAG.Feat1.Feat2`` into ``(TAG, "Feat1|Feat2")``.

        A tag is returned unchanged, paired with "_", when it carries no
        features. This includes tags in which the dot is part of the tag
        itself (e.g. the punctuation tags ``$.`` or ``.``): splitting those
        would produce an empty POS tag or an empty feature column and corrupt
        the emitted line.
        """
        pos, _, feature_str = tag.partition(".")
        # Drop empty pieces so that stray dots never yield empty features.
        features = [feat for feat in feature_str.split(".") if feat]
        if not pos or not features:
            return tag, "_"
        return pos, "|".join(features)

    def get_info(self):
        """Return the ten output columns as a list of strings."""
        return [str(self.id), self.word, self.lemma, self.pos_universal, self.pos_tag, self.detail_tag,
                str(self.head), self.dep_tag, self.blank, self.auto_score]

    def get_conllU_line(self, separator="\t"):
        """Return the ten columns joined by ``separator``."""
        info = self.get_info()
        return separator.join(info)
72
73
class CoNLLUP_Token():
    """Token built from a whitespace-separated ten-column line.

    Column order, as read by ``__init__``:
    ``[ID, FORM, LEMMA, UPOS, XPOS, FEATS, HEAD, DEPREL, DEPS, MISC]``
    e.g. ``[11, Prügel, Prügel, NN, NN, _, _, _, _, 1.000000]``.
    """

    def __init__(self, raw_line, word_ix):
        """Parse ``raw_line``; ``word_ix`` is the 0-based position in the sentence.

        Raises:
            IndexError: If the line has fewer than ten whitespace-separated fields.
        """
        info = raw_line.split()
        self.info = info
        self.id = info[0]  # 1-based ID as in the CoNLL file
        self.position = word_ix  # 0-based position in sentence
        self.word = info[1]
        self.lemma = info[2]
        self.pos_universal = info[3]
        # 'XPOS=NE|Case=Nom|Gender=Masc|Number=Sing' TODO: Reuse MorphInfo in the self.detail_tag
        self.pos_tag = self._process_tag(info[4])
        self.detail_tag = info[5]
        self.head = info[6]
        self.dep_tag = info[7]
        self.blank = info[8]  # ninth column (DEPS in the column list above)
        self.auto_score = info[9]

    def _process_tag(self, tag):
        """Reduce a ``KEY=VALUE|KEY=VALUE...`` XPOS field to its first value.

        ``XPOS=NE|Case=Nom|...`` (only produced by Turku, per the original
        note) yields ``NE``. Tags without "|" are returned unchanged. Tags
        that contain "|" but whose first component has no "=" (for example a
        pipe-separated tag such as ``NN|NEU|SIN``) are also returned
        unchanged instead of raising ``IndexError``.
        """
        if tag == "_" or "|" not in tag:
            return tag
        first_pair = tag.split("|", 1)[0].split("=")
        if len(first_pair) < 2:
            # Not key=value notation: keep the tag as it is.
            return tag
        return first_pair[1]

    def get_info(self):
        """Return the ten output columns as a list of strings."""
        return [str(self.id), self.word, self.lemma, self.pos_universal, self.pos_tag, self.detail_tag,
                str(self.head), self.dep_tag, self.blank, self.auto_score]

    def get_conllU_line(self, separator="\t"):
        """Return the ten columns joined by ``separator``."""
        info = self.get_info()
        return separator.join(info)
106
107
108
class CoNLL09_Token():
    """Token built from a whitespace-separated CoNLL-09 style line.

    Example input columns:
    ['1', 'Frau', 'Frau', 'Frau', 'NN', 'NN', '_', 'nom|sg|fem', '5', '5', 'CJ', 'CJ', '_', '_', 'AM-DIS', '_']
    """

    def __init__(self, raw_line, word_ix):
        """Parse ``raw_line``; ``word_ix`` is the 0-based position in the sentence.

        Raises:
            IndexError: If a column read here is missing from the line.
        """
        cols = raw_line.split()
        self.info = cols
        self.id = cols[0]  # 1-based ID as in the CoNLL file
        self.position = word_ix  # 0-based position in sentence
        self.word = cols[1]
        self.lemma = cols[2]
        self.pos_universal = "_"  # _convert_to_universal(self.pos_tag, self.lemma)
        self.pos_tag = cols[4]
        self.head = cols[8]
        self.dep_tag = cols[10]
        self.detail_tag = "_"
        # Column 13 (index 12) is "Y" for predicates.
        self.is_pred = cols[12] == "Y"
        if not self.is_pred:
            self.pred_sense = None
            self.pred_sense_id = ""
        else:
            self.pred_sense = cols[13].strip("[]")
            self.pred_sense_id = f"{self.position}##{self.pred_sense}"
        # Everything after the 14th column is kept as labels (empty when absent).
        self.labels = cols[14:]

    def get_conllU_line(self, separator="\t"):
        """Return the token as ten CoNLL-U columns joined by ``separator``.

        Column order: [ID, FORM, LEMMA, UPOS, XPOS, FEATS, HEAD, DEPREL, DEPS, MISC];
        the last two are always "_".
        """
        columns = [str(self.id), self.word, self.lemma, self.pos_universal, self.pos_tag]
        columns.extend([self.detail_tag, self.head, self.dep_tag, "_", "_"])
        return separator.join(columns)

    def get_conll09_line(self, delim="\t"):
        """Return the token as a CoNLL-09 style line joined by ``delim``.

        Target shape:
        1 Frau Frau Frau NN NN _ nom|sg|fem 5 5 CJ CJ _ _ AM-DIS _
        10 fall fall fall VB VB _ _ 8 8 VC VC Y fall.01 _ _ _ _ _
        """
        if self.is_pred:
            pred_cols = ["Y", self.pred_sense]
        else:
            pred_cols = ["_", "_"]
        columns = [self.id, self.word, self.lemma, self.lemma, self.pos_tag, self.pos_tag, "_", self.detail_tag]
        columns += [self.head, self.head, self.dep_tag, self.dep_tag]
        columns += pred_cols
        columns += self.labels
        return delim.join(columns)
151
152
153
################################# GETTING SENTENCE ANNOTATIONS ####################################
class AnnotatedSentence():
    """Container for one sentence: its metadata lines and its token objects."""

    def __init__(self):
        self.metadata = []
        self.tokens = []

    def get_words(self):
        """Return the ``word`` attribute of every token, in order."""
        words = []
        for token in self.tokens:
            words.append(token.word)
        return words

    def get_sentence(self):
        """Return the token words joined by single spaces."""
        return " ".join(token.word for token in self.tokens)

    def get_pos_tags(self, universal=False):
        """Return ``pos_universal`` of every token if ``universal``, else ``pos_tag``."""
        attribute = "pos_universal" if universal else "pos_tag"
        return [getattr(token, attribute) for token in self.tokens]
171
172
def get_annotation(raw_lines, raw_meta, token_class):
    """Build an ``AnnotatedSentence`` from raw token lines and metadata lines.

    Args:
        raw_lines: Token lines of one sentence, in order.
        raw_meta: Metadata lines; newline characters are stripped from both ends.
        token_class: Callable invoked as ``token_class(line, position)`` with
            the 0-based position of the line within ``raw_lines``.

    Returns:
        The populated ``AnnotatedSentence``.
    """
    sentence = AnnotatedSentence()
    sentence.metadata = [meta.strip("\n") for meta in raw_meta]
    sentence.tokens.extend(
        token_class(line, position) for position, line in enumerate(raw_lines)
    )
    return sentence
183
184
def read_conll(line_generator, chunk_size, token_class=CoNLLUP_Token, comment_str="###C:", our_foundry="spacy"):
    """Read up to ``chunk_size`` sentences from an iterable of lines.

    Lines starting with ``comment_str`` are collected as metadata of the
    sentence that follows them; their ``foundry = ...`` value and the tail of
    their ``filename = ...`` path are rewritten to ``our_foundry``. A line
    without any non-whitespace content ends the current sentence.

    Args:
        line_generator: Iterable yielding text lines. It is only advanced as
            far as needed, so the same iterator can be passed again to read
            the next chunk.
        chunk_size: Stop after this many sentences when > 0; values <= 0 read
            until the input is exhausted.
        token_class: Class used to build one token per line.
        comment_str: Prefix that marks metadata lines.
        our_foundry: Foundry name written into the metadata lines. It is
            inserted literally (no regex escape processing).

    Returns:
        Tuple ``(annotated_sentences, n_sents)``.
    """
    foundry_re = re.compile(r"(foundry\s*=\s*).*")
    # NOTE(review): the pattern contains a literal space after ``\s*`` - kept
    # exactly as found; confirm that it is intended.
    filename_re = re.compile(r"(filename\s*=\s* .[^/]*/[^/]+/[^/]+/).*")

    # Replacement callables keep ``our_foundry`` literal: a leading digit or a
    # backslash in it can no longer be misread as part of a group reference.
    def _with_foundry(match):
        return match.group(1) + our_foundry

    def _with_morpho_path(match):
        return match.group(1) + our_foundry + "/morpho.xml"

    n_sents = 0
    annotated_sentences, buffer_meta, buffer_lst = [], [], []
    for line in line_generator:
        if line.startswith(comment_str):
            line = foundry_re.sub(_with_foundry, line)
            line = filename_re.sub(_with_morpho_path, line)
            buffer_meta.append(line)
            continue
        if line.split():
            buffer_lst.append(line)
            continue
        # Blank line: close the current sentence.
        annotated_sentences.append(get_annotation(buffer_lst, buffer_meta, token_class))
        n_sents += 1
        buffer_lst, buffer_meta = [], []
        if chunk_size > 0 and n_sents == chunk_size:
            break
    # Input ended without a trailing blank line: do not lose the last sentence.
    # (After a chunk-size break the buffers are empty, so this is a no-op.)
    if buffer_lst:
        annotated_sentences.append(get_annotation(buffer_lst, buffer_meta, token_class))
        n_sents += 1
    # logger.info("Read {} Sentences!".format(n_sents))
    return annotated_sentences, n_sents
204
205
def read_conll_generator(filepath, token_class=CoNLLUP_Token, sent_sep=None, comment_str="###C:"):
    """Lazily yield one ``AnnotatedSentence`` per sentence found in ``filepath``.

    A sentence ends at a line without non-whitespace content or, when
    ``sent_sep`` is given, at a line containing ``sent_sep`` (that line is
    not added to any sentence). Lines starting with ``comment_str`` are
    skipped, so the yielded sentences carry no metadata.

    Args:
        filepath: Path of the file to read; it is decoded as UTF-8.
        token_class: Class used to build one token per line.
        sent_sep: Optional marker text whose presence in a line ends the sentence.
        comment_str: Prefix that marks lines to skip.

    Yields:
        One ``AnnotatedSentence`` per sentence, including the last one when
        the file does not end with a blank line.
    """
    buffer_lst = []
    sentence_finished = False
    # Iterate the file object directly so that lines are streamed instead of
    # loading the whole file before the first sentence is yielded.
    with open(filepath, encoding="utf-8") as f:
        for line in f:
            if sent_sep and sent_sep in line:
                sentence_finished = True
            if line.startswith(comment_str):
                continue
            if line.split() and not sentence_finished:
                buffer_lst.append(line)
            else:
                ann = get_annotation(buffer_lst, [], token_class)
                buffer_lst = []
                sentence_finished = False
                yield ann
    # File ended without a terminating blank/separator line: emit what is buffered.
    if buffer_lst:
        yield get_annotation(buffer_lst, [], token_class)