blob: 972937565dee2680f33f114a76ee541d1e90f0ef [file] [log] [blame]
dazacd9cf012020-08-31 17:19:11 +02001from collections import defaultdict, OrderedDict
2import re
3
4# CoNLL-U Format - https://universaldependencies.org/format.html
5
6
class CoNLLUP_Token():
    """One token row of a CoNLL-U file.

    Format reference: https://universaldependencies.org/format.html
    """

    def __init__(self, raw_line, word_ix):
        # Column layout: [ID, FORM, LEMMA, UPOS, XPOS, FEATS, HEAD, DEPREL, DEPS, MISC]
        # Example row:   [11, Prügel, Prügel, NN, NN, _, _, _, _, 1.000000]
        fields = raw_line.split()
        self.info = fields
        self.id = fields[0]             # 1-based ID as given in the CoNLL file
        self.position = word_ix         # 0-based position within the sentence
        self.word = fields[1]
        self.lemma = fields[2]
        self.pos_universal = fields[3]
        self.pos_tag = fields[4]
        self.detail_tag = fields[5]
        self.head = fields[6]
        self.dep_tag = fields[7]
        self.blank = fields[8]          # meaning unclear from the data — TODO confirm
        self.auto_score = fields[9]

    def get_info(self):
        """Return the ten CoNLL-U columns for this token, all as strings."""
        return [str(self.id), self.word, self.lemma, self.pos_universal,
                self.pos_tag, self.detail_tag, str(self.head), self.dep_tag,
                self.blank, self.auto_score]

    def get_conllU_line(self, separator="\t"):
        """Serialize the token as one CoNLL-U line joined by *separator*."""
        return separator.join(self.get_info())
32
33
daza972aabc2020-09-01 16:41:30 +020034
class CoNLL09_Token():
    """One token row of a CoNLL-2009 file."""

    def __init__(self, raw_line, word_ix):
        fields = raw_line.split()
        # Example row:
        # ['1', 'Frau', 'Frau', 'Frau', 'NN', 'NN', '_', 'nom|sg|fem', '5', '5', 'CJ', 'CJ', '_', '_', 'AM-DIS', '_']
        self.info = fields
        self.id = fields[0]             # 1-based ID as given in the CoNLL file
        self.position = word_ix         # 0-based position within the sentence
        self.word = fields[1]
        self.lemma = fields[2]
        self.pos_tag = fields[4]
        self.pos_universal = "_"  # _convert_to_universal(self.pos_tag, self.lemma)
        self.head = fields[8]
        self.dep_tag = fields[10]
        self.detail_tag = "_"
        # Column 13 flags whether this token is a predicate ("Y") or not.
        self.is_pred = fields[12] == "Y"
        if self.is_pred:
            self.pred_sense = fields[13].strip("[]")
            self.pred_sense_id = str(self.position) + "##" + self.pred_sense
        else:
            self.pred_sense = None
            self.pred_sense_id = ""
        # Any trailing columns carry per-predicate argument labels.
        self.labels = fields[14:] if len(fields) > 14 else []

    def get_conllU_line(self, separator="\t"):
        """Serialize as a CoNLL-U style line.

        Target column order: [ID, FORM, LEMMA, UPOS, XPOS, FEATS, HEAD, DEPREL, DEPS, MISC]
        """
        columns = [str(self.id), self.word, self.lemma, self.pos_tag,
                   self.pos_universal, self.detail_tag, self.head,
                   self.dep_tag, "_", "_"]
        return separator.join(columns)

    def get_conll09_line(self, delim="\t"):
        """Serialize back to CoNLL-2009: 14 fixed columns plus argument labels.

        Examples of the target layout:
          1  Frau Frau Frau NN NN _ nom|sg|fem 5 5 CJ CJ _ _ AM-DIS _
          10 fall fall fall VB VB _ _ 8 8 VC VC Y fall.01 _ _ _ _ _
        """
        pred_flag = "Y" if self.is_pred else "_"
        sense = self.pred_sense if self.is_pred else "_"
        columns = [self.id, self.word, self.lemma, self.lemma,
                   self.pos_tag, self.pos_tag, "_", self.detail_tag,
                   self.head, self.head, self.dep_tag, self.dep_tag,
                   pred_flag, sense]
        return delim.join(columns + self.labels)
77
78
79
dazacd9cf012020-08-31 17:19:11 +020080################################# GETTING SENTENCE ANNOTATIONS ####################################
class AnnotatedSentence():
    """A sentence: its token objects plus any metadata comment lines."""

    def __init__(self):
        self.metadata = []  # raw metadata lines (newline-stripped by the caller)
        self.tokens = []    # token objects, in sentence order

    def get_words(self):
        """Surface forms of all tokens, in order."""
        return [t.word for t in self.tokens]

    def get_sentence(self):
        """The sentence text as a single space-joined string."""
        return " ".join(self.get_words())

    def get_pos_tags(self, universal=False):
        """Per-token POS tags; universal (UPOS) tags when *universal* is True."""
        attr = "pos_universal" if universal else "pos_tag"
        return [getattr(t, attr) for t in self.tokens]
97
98
daza972aabc2020-09-01 16:41:30 +020099def get_annotation(raw_lines, raw_meta, token_class):
dazacd9cf012020-08-31 17:19:11 +0200100 ann = AnnotatedSentence()
101 ann.metadata = [m.strip("\n") for m in raw_meta]
102 # Annotate the predicates and senses
103 real_index = 0
104 for i, line in enumerate(raw_lines):
105 tok = token_class(line, real_index)
daza972aabc2020-09-01 16:41:30 +0200106 ann.tokens.append(tok)
dazacd9cf012020-08-31 17:19:11 +0200107 real_index += 1
108 return ann
daza972aabc2020-09-01 16:41:30 +0200109
110
def read_conll(line_generator, chunk_size, token_class=CoNLLUP_Token):
    """Read sentences from an iterable of CoNLL lines.

    line_generator -- iterable yielding raw file lines; blank lines separate
                      sentences, lines starting with "###C:" are metadata
    chunk_size     -- stop after this many sentences (<= 0 means read all)
    token_class    -- token constructor passed through to get_annotation

    Returns (annotated_sentences, n_sents).
    """
    n_sents = 0
    annotated_sentences, buffer_meta, buffer_lst = [], [], []
    for line in line_generator:
        if line.startswith("###C:"):
            buffer_meta.append(line)
            continue
        if line.split():
            buffer_lst.append(line)
        else:
            # Blank line: the buffered rows form one complete sentence.
            ann = get_annotation(buffer_lst, buffer_meta, token_class)
            n_sents += 1
            buffer_lst, buffer_meta = [], []
            annotated_sentences.append(ann)
        if chunk_size > 0 and n_sents == chunk_size: break
    # BUGFIX: flush the final sentence when the input does not end with a
    # blank line — the original silently dropped it.
    if buffer_lst and not (chunk_size > 0 and n_sents == chunk_size):
        annotated_sentences.append(get_annotation(buffer_lst, buffer_meta, token_class))
        n_sents += 1
    # logger.info("Read {} Sentences!".format(n_sents))
    return annotated_sentences, n_sents
128
129
def read_conll_generator(filepath, token_class=CoNLLUP_Token):
    """Lazily yield AnnotatedSentence objects from a CoNLL file.

    filepath    -- path of the CoNLL file to read
    token_class -- token constructor passed through to get_annotation

    NOTE(review): metadata lines ("###C:") are deliberately skipped here, so
    every yielded sentence has empty metadata — confirm this is intended.
    """
    buffer_meta, buffer_lst = [], []
    with open(filepath) as f:
        # Iterate the file object directly instead of f.readlines(), which
        # materialized the whole file in memory.
        for line in f:
            if line.startswith("###C:"):
                continue
            if line.split():
                buffer_lst.append(line)
            else:
                # Blank line: the buffered rows form one complete sentence.
                ann = get_annotation(buffer_lst, buffer_meta, token_class)
                buffer_lst, buffer_meta = [], []
                yield ann
        # BUGFIX: flush the final sentence when the file does not end with a
        # blank line — the original silently dropped it.
        if buffer_lst:
            yield get_annotation(buffer_lst, buffer_meta, token_class)
141 yield ann