# blob: 10b3f5460459dafb750fd5df8278f780907a1fde
dazacd9cf012020-08-31 17:19:11 +02001from collections import defaultdict, OrderedDict
2import re
3
# CoNLL-U Format - https://universaldependencies.org/format.html
5
6
class CoNLLUP_Token:
    """A single token row of a CoNLL-U style annotation file.

    The row is split into columns and the first ten are exposed as
    attributes, in the order
    [ID, FORM, LEMMA, UPOS, XPOS, FEATS, HEAD, DEPREL, DEPS, MISC], e.g.
    [11, Prügel, Prügel, NN, NN, _, _, _, _, 1.000000].
    """

    def __init__(self, raw_line, word_ix):
        """Parse one token row.

        Args:
            raw_line: The text of one token row. Columns are split on tabs
                when the row contains a tab, so that values with inner
                spaces (e.g. a FORM like "New York") stay in one column.
                Rows without any tab are split on runs of whitespace.
            word_ix: 0-based position of the token in its sentence.

        Raises:
            IndexError: If the row has fewer than ten columns.
            ValueError: If the ID column or ``word_ix`` cannot be converted
                with ``int()``.
        """
        line = raw_line.strip()
        if "\t" in line:
            # Tab-separated row: only tabs delimit columns; padding around
            # each value is trimmed but inner spaces are preserved.
            info = [field.strip() for field in line.split("\t")]
        else:
            # No tab present: fall back to plain whitespace-separated columns.
            info = line.split()
        self.info = info
        self.id = int(info[0])  # 1-based ID as in the CoNLL file
        self.position = int(word_ix)  # 0-based position in sentence
        self.word = info[1]
        self.lemma = info[2]
        self.pos_universal = info[3]
        self.pos_tag = info[4]
        self.detail_tag = info[5]
        self.head = info[6]  # kept as the raw string (may be "_")
        self.dep_tag = info[7]
        self.blank = info[8]  # ninth column (DEPS in the column list above)
        self.auto_score = info[9]  # tenth column (MISC in the column list above)

    def get_info(self):
        """Return the ten column values as a list of strings, in file order."""
        return [str(self.id), self.word, self.lemma, self.pos_universal,
                self.pos_tag, self.detail_tag, str(self.head), self.dep_tag,
                self.blank, self.auto_score]

    def get_conllU_line(self, separator="\t"):
        """Return the ten columns joined by ``separator`` (tab by default)."""
        return separator.join(self.get_info())
32
33
################################# GETTING SENTENCE ANNOTATIONS ####################################
class AnnotatedSentence:
    """Holds the metadata lines and the token objects of one sentence."""

    def __init__(self):
        """Start with no tokens and no metadata."""
        self.tokens = []
        self.metadata = []

    def _collect(self, attribute):
        """Return the value of ``attribute`` for every token, in order."""
        return [getattr(token, attribute) for token in self.tokens]

    def get_words(self):
        """Return the ``word`` of each token as a list."""
        return self._collect("word")

    def get_sentence(self):
        """Return the token words joined by single spaces."""
        return " ".join(token.word for token in self.tokens)

    def get_pos_tags(self, universal=False):
        """Return one POS tag per token.

        Reads ``pos_universal`` when ``universal`` is truthy, otherwise
        ``pos_tag``.
        """
        attribute = "pos_universal" if universal else "pos_tag"
        return self._collect(attribute)
51
52
def get_annotation(raw_lines, raw_meta, token_class=CoNLLUP_Token):
    """Build an AnnotatedSentence from raw token rows and metadata lines.

    Args:
        raw_lines: Iterable of token rows; each is passed to ``token_class``.
        raw_meta: Iterable of metadata lines; leading and trailing newline
            characters are stripped from each one.
        token_class: Callable invoked as ``token_class(line, position)`` to
            build a token; defaults to CoNLLUP_Token.

    Returns:
        The populated AnnotatedSentence.
    """
    sentence = AnnotatedSentence()
    sentence.metadata = [meta_line.strip("\n") for meta_line in raw_meta]
    for raw_line in raw_lines:
        # The position only advances for tokens that are kept, so it is
        # always the number of tokens stored so far.
        token = token_class(raw_line, len(sentence.tokens))
        if not token:
            continue
        sentence.tokens.append(token)
    return sentence
63 return ann