# blob: 1a8ddf0ae652082529c5a10126b2ebac58ffe976 [file] [log] [blame]
from collections import defaultdict, OrderedDict
import re
# CoNLL-U Format - https://universaldependencies.org/format.html
def get_token_type(type_str):
    """Map a token-format name onto the token class that parses it.

    Args:
        type_str: One of "CoNLL09_Token", "RNNTagger_Token",
            "CoNLLUP_Token" or "TigerNew_Token".

    Returns:
        The class object (not an instance) carrying that name.

    Raises:
        NotImplementedError: If ``type_str`` is none of the names above.
    """
    # Guard clauses: each class is only looked up when its name matches.
    if type_str == "CoNLL09_Token":
        return CoNLL09_Token
    if type_str == "RNNTagger_Token":
        return RNNTagger_Token
    if type_str == "CoNLLUP_Token":
        return CoNLLUP_Token
    if type_str == "TigerNew_Token":
        return TigerNew_Token
    raise NotImplementedError(f"I don't know what to do with {type_str} token type!")
class TigerNew_Token():
    """One token parsed from a whitespace-separated ``FORM XPOS`` line.

    Only the word form and the POS tag are taken from the input; every
    other CoNLL-U column is filled with the "_" placeholder.
    """

    def __init__(self, raw_line, word_ix):
        """Parse ``raw_line``; ``word_ix`` is the 0-based sentence position."""
        columns = raw_line.split()  # [FORM, XPOS]
        self.info = columns
        self.position = word_ix  # 0-based position in sentence
        self.id = word_ix + 1  # 1-based ID as in the CoNLL file
        # The two values actually present in the input line.
        self.word = columns[0]
        self.pos_tag = columns[1]
        # Everything else is unknown for this format.
        self.lemma = "_"
        self.pos_universal = "_"
        self.detail_tag = "_"
        self.head = "_"
        self.dep_tag = "_"
        self.blank = "_"
        self.auto_score = "_"

    def get_info(self):
        """Return the ten CoNLL-U columns of this token as a list of strings."""
        token_id = str(self.id)
        head = str(self.head)
        return [
            token_id,
            self.word,
            self.lemma,
            self.pos_universal,
            self.pos_tag,
            self.detail_tag,
            head,
            self.dep_tag,
            self.blank,
            self.auto_score,
        ]

    def get_conllU_line(self, separator="\t"):
        """Return the columns from ``get_info`` joined with ``separator``."""
        return separator.join(self.get_info())
class RNNTagger_Token():
    """One token parsed from a whitespace-separated ``FORM XPOS.FEATS LEMMA`` line.

    The dotted tag (e.g. 'NN.Gen.Sg.Fem') is split into a POS tag and a
    '|'-joined feature string; columns the format does not provide are
    filled with the "_" placeholder.
    """

    def __init__(self, raw_line, word_ix):
        """Parse ``raw_line``; ``word_ix`` is the 0-based sentence position.

        Raises:
            IndexError: If the line has fewer than three columns.
        """
        info = raw_line.split()  # [FORM, XPOS.FEATS, LEMMA]
        self.info = info
        self.id = word_ix + 1  # 1-based ID as in the CoNLL file
        self.position = word_ix  # 0-based position in sentence
        self.word = info[0]
        self.lemma = info[2]
        self.pos_universal = "_"
        self.pos_tag, self.detail_tag = self._process_tag(info[1])  # 'NN.Gen.Sg.Fem'
        self.head = "_"
        self.dep_tag = "_"
        self.blank = "_"
        self.auto_score = "_"

    def _process_tag(self, tag):
        """Split a dotted tag into ``(pos_tag, features)``.

        'NN.Gen.Sg.Fem' becomes ('NN', 'Gen|Sg|Fem'). A tag without any
        dot, or the placeholder "_", is returned unchanged with "_" as
        features. A tag whose dot is not followed by any feature (such as
        '$.') is also kept whole: splitting it would truncate the POS tag
        and emit an empty feature column.
        """
        if tag == "_" or "." not in tag:
            return tag, "_"
        pos, _, rest = tag.partition(".")
        # Drop empty pieces so stray dots never produce empty features.
        features = [piece for piece in rest.split(".") if piece]
        if not pos or not features:
            # The dot belongs to the tag itself rather than separating features.
            return tag, "_"
        return pos, "|".join(features)

    def get_info(self):
        """Return the ten CoNLL-U columns of this token as a list of strings."""
        return [str(self.id), self.word, self.lemma, self.pos_universal, self.pos_tag, self.detail_tag,
                str(self.head), self.dep_tag, self.blank, self.auto_score]

    def get_conllU_line(self, separator="\t"):
        """Return the columns from ``get_info`` joined with ``separator``."""
        info = self.get_info()
        return separator.join(info)
class CoNLLUP_Token():
    """One token parsed from a whitespace-separated ten-column CoNLL-U line.

    Column layout: [ID, FORM, LEMMA, UPOS, XPOS, FEATS, HEAD, DEPREL, DEPS, MISC],
    e.g. [11, Prügel, Prügel, NN, NN, _, _, _, _, 1.000000].
    """

    def __init__(self, raw_line, word_ix):
        """Parse ``raw_line``; ``word_ix`` is the 0-based sentence position."""
        columns = raw_line.split()
        self.info = columns
        self.position = word_ix  # 0-based position in sentence
        (
            self.id,  # 1-based ID, kept as the string found in the file
            self.word,
            self.lemma,
            self.pos_universal,
        ) = columns[0], columns[1], columns[2], columns[3]
        # XPOS may look like 'XPOS=NE|Case=Nom|Gender=Masc|Number=Sing'.
        # TODO: Reuse MorphInfo in the self.detail_tag
        self.pos_tag = self._process_tag(columns[4])
        self.detail_tag = columns[5]
        self.head = columns[6]
        self.dep_tag = columns[7]
        self.blank = columns[8]  # ninth column (DEPS in the layout above)
        self.auto_score = columns[9]

    def _process_tag(self, tag):
        """Return the POS tag hidden in a 'key=value|key=value' XPOS string.

        Tags without a '|' (including "_") are returned untouched; otherwise
        the value of the first 'key=value' pair is returned.
        """
        # The XPOS=NE|Case=Nom... form is only for Turku!
        if tag != "_" and "|" in tag:
            first_pair = tag.split("|")[0]
            return first_pair.split("=")[1]
        return tag

    def get_info(self):
        """Return the ten CoNLL-U columns of this token as a list of strings."""
        token_id = str(self.id)
        head = str(self.head)
        return [
            token_id,
            self.word,
            self.lemma,
            self.pos_universal,
            self.pos_tag,
            self.detail_tag,
            head,
            self.dep_tag,
            self.blank,
            self.auto_score,
        ]

    def get_conllU_line(self, separator="\t"):
        """Return the columns from ``get_info`` joined with ``separator``."""
        return separator.join(self.get_info())
class CoNLL09_Token():
    """One token parsed from a whitespace-separated CoNLL-2009 line.

    Example columns:
    ['1', 'Frau', 'Frau', 'Frau', 'NN', 'NN', '_', 'nom|sg|fem', '5', '5',
     'CJ', 'CJ', '_', '_', 'AM-DIS', '_']
    """

    def __init__(self, raw_line, word_ix):
        """Parse ``raw_line``; ``word_ix`` is the 0-based sentence position."""
        cols = raw_line.split()
        self.info = cols
        self.id = cols[0]  # 1-based ID as in the CoNLL file
        self.position = word_ix  # 0-based position in sentence
        self.word = cols[1]
        self.lemma = cols[2]
        self.pos_universal = "_"  # no universal tag is derived here
        self.pos_tag = cols[4]
        self.head = cols[8]
        self.dep_tag = cols[10]
        self.detail_tag = "_"
        # Column 12 flags predicates with "Y"; column 13 carries the sense.
        self.is_pred = cols[12] == "Y"
        if not self.is_pred:
            self.pred_sense = None
            self.pred_sense_id = ""
        else:
            self.pred_sense = cols[13].strip("[]")
            self.pred_sense_id = "##".join((str(self.position), self.pred_sense))
        # Everything from column 14 onwards; empty list when there is none.
        self.labels = cols[14:]

    def get_conllU_line(self, separator="\t"):
        """Return the token as one CoNLL-U line.

        Layout: [ID, FORM, LEMMA, UPOS, XPOS, FEATS, HEAD, DEPREL, DEPS, MISC],
        with DEPS and MISC always written as "_".
        """
        columns = [
            str(self.id),
            self.word,
            self.lemma,
            self.pos_universal,
            self.pos_tag,
            self.detail_tag,
            self.head,
            self.dep_tag,
            "_",
            "_",
        ]
        return separator.join(columns)

    def get_conll09_line(self, delim="\t"):
        """Return the token as one CoNLL-2009 line.

        Target shape:
        1 Frau Frau Frau NN NN _ nom|sg|fem 5 5 CJ CJ _ _ AM-DIS _
        10 fall fall fall VB VB _ _ 8 8 VC VC Y fall.01 _ _ _ _ _
        """
        if self.is_pred:
            pred_flag, sense = "Y", self.pred_sense
        else:
            pred_flag = sense = "_"
        # Lemma, POS, head and relation are each written twice in a row.
        fixed = [
            self.id, self.word, self.lemma, self.lemma,
            self.pos_tag, self.pos_tag, "_", self.detail_tag,
            self.head, self.head, self.dep_tag, self.dep_tag,
            pred_flag, sense,
        ]
        return delim.join(fixed + self.labels)
################################# GETTING SENTENCE ANNOTATIONS ####################################
class AnnotatedSentence():
    """A sentence: its metadata lines plus the ordered list of token objects."""

    def __init__(self):
        """Start with no metadata and no tokens."""
        self.metadata = []
        self.tokens = []

    def get_words(self):
        """Return the ``word`` attribute of every token, in order."""
        words = []
        for token in self.tokens:
            words.append(token.word)
        return words

    def get_sentence(self):
        """Return all token words joined by single spaces."""
        return " ".join(token.word for token in self.tokens)

    def get_pos_tags(self, universal=False):
        """Return one POS tag per token.

        With ``universal`` truthy the ``pos_universal`` attribute is used,
        otherwise ``pos_tag``.
        """
        attribute = "pos_universal" if universal else "pos_tag"
        return [getattr(token, attribute) for token in self.tokens]
def get_annotation(raw_lines, raw_meta, token_class):
    """Build an ``AnnotatedSentence`` from raw token lines and metadata lines.

    Args:
        raw_lines: Iterable of raw token lines, one per token, in order.
        raw_meta: Iterable of metadata lines; newline characters are
            stripped from both ends of each.
        token_class: Callable invoked as ``token_class(line, position)``
            with the 0-based position of the line within ``raw_lines``.

    Returns:
        The populated ``AnnotatedSentence``.
    """
    sentence = AnnotatedSentence()
    sentence.metadata = [meta.strip("\n") for meta in raw_meta]
    for position, raw_line in enumerate(raw_lines):
        sentence.tokens.append(token_class(raw_line, position))
    return sentence
def read_conll(line_generator, chunk_size, token_class=CoNLLUP_Token, comment_str="###C:", our_foundry="spacy"):
    """Read up to ``chunk_size`` sentences from an iterable of lines.

    Lines starting with ``comment_str`` are collected as metadata of the
    next sentence, after rewriting their ``foundry = ...`` value and the
    tail of their ``filename = ...`` path to point at ``our_foundry``.
    Lines with any non-whitespace content are token lines. A
    whitespace-only line closes the current sentence (so consecutive blank
    lines produce sentences without tokens).

    Args:
        line_generator: Iterable of text lines; it is consumed only as far
            as needed, so the same iterator can be passed again to read the
            next chunk.
        chunk_size: Stop after this many sentences; values <= 0 read
            until the input is exhausted.
        token_class: Class used to parse each token line.
        comment_str: Prefix marking metadata lines.
        our_foundry: Foundry name written into the metadata lines.

    Returns:
        Tuple ``(annotated_sentences, n_sents)``.
    """
    n_sents = 0
    annotated_sentences, buffer_meta, buffer_lst = [], [], []
    for line in line_generator:
        if line.startswith(comment_str):
            # Function replacements insert our_foundry literally; a template
            # such as r"\1" + "1x" would be read as a reference to group 11.
            line = re.sub(r'(foundry\s*=\s*).*', lambda m: m.group(1) + our_foundry, line)
            line = re.sub(r'(filename\s*=\s* .[^/]*/[^/]+/[^/]+/).*',
                          lambda m: m.group(1) + our_foundry + "/morpho.xml", line)
            buffer_meta.append(line)
            continue
        if line.split():
            buffer_lst.append(line)
            continue
        # Whitespace-only line: the buffered sentence is complete.
        annotated_sentences.append(get_annotation(buffer_lst, buffer_meta, token_class))
        n_sents += 1
        buffer_lst, buffer_meta = [], []
        if chunk_size > 0 and n_sents == chunk_size:
            break
    if buffer_lst:
        # Input ended without a closing blank line: keep the last sentence
        # instead of silently dropping it. (After a chunk-size break the
        # buffer is always empty, so this never exceeds chunk_size.)
        annotated_sentences.append(get_annotation(buffer_lst, buffer_meta, token_class))
        n_sents += 1
    # logger.info("Read {} Sentences!".format(n_sents))
    return annotated_sentences, n_sents
def read_conll_generator(filepath, token_class=CoNLLUP_Token, sent_sep=None, comment_str="###C:"):
    """Yield one annotated sentence at a time from the file at ``filepath``.

    Lines starting with ``comment_str`` are skipped. A sentence ends at a
    whitespace-only line or, when ``sent_sep`` is given, at a line
    containing ``sent_sep`` (that line itself is not added to the
    sentence). Metadata is never collected here, so every yielded
    sentence has empty metadata.

    Args:
        filepath: Path of the file to read.
        token_class: Class used to parse each token line.
        sent_sep: Optional substring marking a sentence boundary line.
        comment_str: Prefix marking lines to ignore.

    Yields:
        The object returned by ``get_annotation`` for each sentence.
    """
    buffer_meta, buffer_lst = [], []
    sentence_finished = False
    with open(filepath) as f:
        # Iterate the file lazily instead of loading every line up front.
        for line in f:
            # The separator check deliberately runs before the comment
            # check, so a separator found on a comment line still closes
            # the sentence at the next non-comment line.
            if sent_sep and sent_sep in line:
                sentence_finished = True
            if line.startswith(comment_str):
                continue
            if line.split() and not sentence_finished:
                buffer_lst.append(line)
            else:
                ann = get_annotation(buffer_lst, buffer_meta, token_class)
                buffer_lst, buffer_meta = [], []
                sentence_finished = False
                yield ann
        if buffer_lst:
            # File ended without a closing blank line or separator: emit
            # the last sentence instead of silently dropping it.
            yield get_annotation(buffer_lst, buffer_meta, token_class)