daza | cd9cf01 | 2020-08-31 17:19:11 +0200 | [diff] [blame^] | 1 | from collections import defaultdict, OrderedDict |
| 2 | import re |
| 3 | |
| 4 | # CoNLL-U Format - https://universaldependencies.org/format.html |
| 5 | |
| 6 | |
class CoNLLUP_Token():
    """A single token row of a CoNLL-U style sentence.

    The ten columns are, in order:
    [ID, FORM, LEMMA, UPOS, XPOS, FEATS, HEAD, DEPREL, DEPS, MISC]
    e.g. [11, Prügel, Prügel, NN, NN, _, _, _, _, 1.000000]
    """

    # Number of columns that __init__ reads from every row.
    N_COLUMNS = 10

    def __init__(self, raw_line, word_ix):
        """Parse one token line.

        Args:
            raw_line: The text of one token row. Columns are split on TAB when
                the line contains one, so FORM/LEMMA values with inner spaces
                stay in a single column; lines without any TAB are split on
                arbitrary whitespace instead.
            word_ix: 0-based position of the token inside its sentence.

        Raises:
            IndexError: If the row has fewer than ten columns.
            ValueError: If the ID column or ``word_ix`` is not a plain integer.
        """
        if "\t" in raw_line:
            # Tab-separated row: keep spaces inside a field, trim only the
            # whitespace (incl. the line terminator) around each field.
            info = [field.strip() for field in raw_line.split("\t")]
        else:
            # Whitespace-separated input is still accepted.
            info = raw_line.split()
        if len(info) < self.N_COLUMNS:
            raise IndexError(
                "Expected at least %d columns but found %d in line: %r"
                % (self.N_COLUMNS, len(info), raw_line)
            )
        self.info = info
        self.id = int(info[0])  # 1-based ID as in the CoNLL file
        self.position = int(word_ix)  # 0-based position in sentence
        self.word = info[1]
        self.lemma = info[2]
        self.pos_universal = info[3]
        self.pos_tag = info[4]
        self.detail_tag = info[5]
        self.head = info[6]
        self.dep_tag = info[7]
        self.blank = info[8]  # DEPS column according to the column order above
        self.auto_score = info[9]  # MISC column; holds a score in the example row

    def get_info(self):
        """Return the ten column values as a list of strings, in file order."""
        return [str(self.id), self.word, self.lemma, self.pos_universal, self.pos_tag, self.detail_tag,
                str(self.head), self.dep_tag, self.blank, self.auto_score]

    def get_conllU_line(self, separator="\t"):
        """Serialize the token back to one line, joining columns with ``separator``."""
        info = self.get_info()
        return separator.join(info)
| 32 | |
| 33 | |
| 34 | ################################# GETTING SENTENCE ANNOTATIONS #################################### |
class AnnotatedSentence():
    """Container for one sentence: its metadata lines and its token objects."""

    def __init__(self):
        self.metadata = []
        self.tokens = []

    def get_words(self):
        """Return the surface form (``word``) of every token, in order."""
        words = []
        for token in self.tokens:
            words.append(token.word)
        return words

    def get_sentence(self):
        """Return the token surface forms joined by single spaces."""
        return " ".join(token.word for token in self.tokens)

    def get_pos_tags(self, universal=False):
        """Return one POS tag per token.

        Reads ``pos_universal`` from each token when ``universal`` is truthy,
        otherwise ``pos_tag``.
        """
        attribute = "pos_universal" if universal else "pos_tag"
        return [getattr(token, attribute) for token in self.tokens]
| 51 | |
| 52 | |
def get_annotation(raw_lines, raw_meta, token_class=CoNLLUP_Token):
    """Build an AnnotatedSentence from raw token lines and metadata lines.

    Args:
        raw_lines: Iterable of token rows, one string per token. Blank or
            whitespace-only lines are skipped instead of being handed to the
            token constructor.
        raw_meta: Iterable of metadata strings; newline characters are
            stripped from both ends of each entry.
        token_class: Callable invoked as ``token_class(line, position)`` for
            each row, where ``position`` is the 0-based index the token will
            have in the sentence.

    Returns:
        The populated AnnotatedSentence.
    """
    ann = AnnotatedSentence()
    ann.metadata = [m.strip("\n") for m in raw_meta]
    for line in raw_lines:
        if not line.strip():
            # Nothing to parse on an empty row (e.g. a trailing newline).
            continue
        # The next position is always the number of tokens kept so far.
        tok = token_class(line, len(ann.tokens))
        # Kept so that a custom token_class can opt out by being falsy.
        if tok:
            ann.tokens.append(tok)
    return ann