| from collections import defaultdict, OrderedDict |
| import re |
| |
| # CoNLL-U Format - https://universaldependencies.org/format.html |
| |
| |
class CoNLLUP_Token():
    """A single token row of a CoNLL-U style sentence.

    The ten columns are, in order:
    [ID, FORM, LEMMA, UPOS, XPOS, FEATS, HEAD, DEPREL, DEPS, MISC]
    e.g. [11, Prügel, Prügel, NN, NN, _, _, _, _, 1.000000]
    """

    def __init__(self, raw_line, word_ix):
        """Parse one token line into its columns.

        Args:
            raw_line: One token line. If it contains a tab it is split on
                tabs only, so that fields (e.g. FORM / LEMMA) may contain
                spaces. Lines without any tab are split on arbitrary
                whitespace, as before.
            word_ix: 0-based position of the token inside its sentence.

        Raises:
            IndexError: If the line has fewer than ten columns.
            ValueError: If the ID column or ``word_ix`` cannot be converted
                with ``int()``.
        """
        if "\t" in raw_line:
            # Tab-separated input: only drop the line terminator so that
            # spaces inside a field and empty columns are preserved.
            info = raw_line.rstrip("\r\n").split("\t")
        else:
            # Legacy whitespace-separated input.
            info = raw_line.split()
        self.info = info
        self.id = int(info[0])  # 1-based ID as in the CoNLL file
        self.position = int(word_ix)  # 0-based position in sentence
        self.word = info[1]  # FORM
        self.lemma = info[2]  # LEMMA
        self.pos_universal = info[3]  # UPOS
        self.pos_tag = info[4]  # XPOS
        self.detail_tag = info[5]  # FEATS
        self.head = info[6]  # HEAD (kept as the raw string)
        self.dep_tag = info[7]  # DEPREL
        self.blank = info[8]  # DEPS column; attribute name kept for callers
        self.auto_score = info[9]  # MISC column (a score in the example above)

    def get_info(self):
        """Return the ten column values as a list of strings, in file order."""
        return [
            str(self.id),
            self.word,
            self.lemma,
            self.pos_universal,
            self.pos_tag,
            self.detail_tag,
            str(self.head),
            self.dep_tag,
            self.blank,
            self.auto_score,
        ]

    def get_conllU_line(self, separator="\t"):
        """Return the token as one line, columns joined by ``separator``."""
        return separator.join(self.get_info())
| |
| |
| ################################# GETTING SENTENCE ANNOTATIONS #################################### |
class AnnotatedSentence():
    """Container for one sentence: its metadata lines and its token objects."""

    def __init__(self):
        """Start with no metadata and no tokens; both are filled in by callers."""
        self.tokens = []
        self.metadata = []

    def get_words(self):
        """Return the ``word`` attribute of every token, in sentence order."""
        words = []
        for token in self.tokens:
            words.append(token.word)
        return words

    def get_sentence(self):
        """Return the token words joined by single spaces."""
        return " ".join(token.word for token in self.tokens)

    def get_pos_tags(self, universal=False):
        """Return one POS tag per token.

        With ``universal`` truthy the ``pos_universal`` attribute is read,
        otherwise ``pos_tag``.
        """
        attr_name = "pos_universal" if universal else "pos_tag"
        return [getattr(token, attr_name) for token in self.tokens]
| |
| |
def get_annotation(raw_lines, raw_meta, token_class=CoNLLUP_Token):
    """Build an AnnotatedSentence from raw token lines and metadata lines.

    Args:
        raw_lines: Iterable of token lines; each one is passed to
            ``token_class`` together with its 0-based position.
        raw_meta: Iterable of metadata strings; newline characters are
            stripped from both ends of each entry.
        token_class: Callable invoked as ``token_class(line, position)``.

    Returns:
        The populated AnnotatedSentence. Objects that evaluate as falsy are
        not stored and do not consume a position.
    """
    sentence = AnnotatedSentence()
    sentence.metadata = [meta.strip("\n") for meta in raw_meta]
    for raw in raw_lines:
        # The next position equals the number of tokens kept so far.
        candidate = token_class(raw, len(sentence.tokens))
        if not candidate:
            continue
        sentence.tokens.append(candidate)
    return sentence