blob: 10b3f5460459dafb750fd5df8278f780907a1fde [file] [log] [blame]
from collections import defaultdict, OrderedDict
import re
# CoNLL-U Format - https://universaldependencies.org/format.html
class CoNLLUP_Token:
    """One token line of a CoNLL-U (Plus) file, split into its columns.

    The first ten columns are read in the standard CoNLL-U order:
    ID, FORM, LEMMA, UPOS, XPOS, FEATS, HEAD, DEPREL, DEPS, MISC.
    Any additional columns are kept untouched in ``self.info``.
    """

    # Number of leading columns that __init__ reads from every line.
    _MIN_COLUMNS = 10

    def __init__(self, raw_line, word_ix):
        """Parse ``raw_line`` into token attributes.

        Args:
            raw_line: A single token line. Tab-separated lines are split on
                tabs so that fields containing spaces stay intact; lines
                without tabs are split on arbitrary whitespace.
            word_ix: 0-based position of the token inside its sentence.

        Raises:
            IndexError: If the line has fewer than ten columns.
            ValueError: If the ID column or ``word_ix`` is not an integer
                (e.g. multiword ranges such as ``1-2``).
        """
        info = self._split_columns(raw_line)
        if len(info) < self._MIN_COLUMNS:
            # Same exception type as a bare out-of-range index, but with
            # enough context to locate the offending line.
            raise IndexError(
                "expected at least %d CoNLL-U columns, got %d: %r"
                % (self._MIN_COLUMNS, len(info), raw_line)
            )
        # [ID, FORM, LEMMA, UPOS, XPOS, FEATS, HEAD, DEPREL, DEPS, MISC]
        # [11, Prügel, Prügel, NN, NN, _, _, _, _, 1.000000]
        self.info = info
        self.id = int(info[0])  # 1-based ID as in the CoNLL file
        self.position = int(word_ix)  # 0-based position in sentence
        self.word = info[1]
        self.lemma = info[2]
        self.pos_universal = info[3]
        self.pos_tag = info[4]
        self.detail_tag = info[5]
        self.head = info[6]
        self.dep_tag = info[7]
        self.blank = info[8]  # DEPS column (index 8 in the list above)
        self.auto_score = info[9]  # MISC column (index 9 in the list above)

    @classmethod
    def _split_columns(cls, raw_line):
        """Split a raw line into column values.

        Tab splitting is used only when it yields enough columns; otherwise
        the line is split on any whitespace, which keeps space-separated
        input working.
        """
        if "\t" in raw_line:
            fields = [field.strip() for field in raw_line.split("\t")]
            if len(fields) >= cls._MIN_COLUMNS:
                return fields
        return raw_line.split()

    def get_info(self):
        """Return the ten standard column values as a list of strings."""
        return [
            str(self.id),
            self.word,
            self.lemma,
            self.pos_universal,
            self.pos_tag,
            self.detail_tag,
            str(self.head),
            self.dep_tag,
            self.blank,
            self.auto_score,
        ]

    def get_conllU_line(self, separator="\t"):
        """Return the ten standard columns joined by ``separator``."""
        return separator.join(self.get_info())
################################# GETTING SENTENCE ANNOTATIONS ####################################
class AnnotatedSentence():
    """Container for one sentence: its metadata lines and its tokens."""

    def __init__(self):
        """Start with no metadata and no tokens."""
        self.tokens = []
        self.metadata = []

    def get_words(self):
        """Return the ``word`` attribute of every token, in order."""
        words = []
        for token in self.tokens:
            words.append(token.word)
        return words

    def get_sentence(self):
        """Return the token words joined by single spaces."""
        return " ".join(token.word for token in self.tokens)

    def get_pos_tags(self, universal=False):
        """Return one POS tag per token.

        Reads ``pos_universal`` from each token when ``universal`` is truthy,
        otherwise ``pos_tag``.
        """
        attribute = "pos_universal" if universal else "pos_tag"
        return [getattr(token, attribute) for token in self.tokens]
def get_annotation(raw_lines, raw_meta, token_class=CoNLLUP_Token):
    """Build an AnnotatedSentence from raw token lines and metadata lines.

    Each metadata line has leading/trailing newline characters removed.
    Each token line is passed to ``token_class`` together with the 0-based
    position it will occupy; only truthy results are kept, and the position
    advances only for kept tokens.
    """
    sentence = AnnotatedSentence()
    sentence.metadata = [meta_line.strip("\n") for meta_line in raw_meta]
    next_position = 0
    for raw_line in raw_lines:
        token = token_class(raw_line, next_position)
        if not token:
            # A falsy token is skipped and does not consume a position.
            continue
        sentence.tokens.append(token)
        next_position += 1
    return sentence