Added Parsing and Evaluation of Lemmas using Tiger Corpus
diff --git a/DeReKo/CoNLL_Annotation.py b/DeReKo/CoNLL_Annotation.py
index 10b3f54..9729375 100644
--- a/DeReKo/CoNLL_Annotation.py
+++ b/DeReKo/CoNLL_Annotation.py
@@ -10,8 +10,8 @@
# [ID, FORM, LEMMA, UPOS, XPOS, FEATS, HEAD, DEPREL, DEPS, MISC]
# [11, Prügel, Prügel, NN, NN, _, _, _, _, 1.000000]
self.info = info
- self.id = int(info[0]) # 1-based ID as in the CoNLL file
- self.position = int(word_ix) # 0-based position in sentence
+ self.id = info[0] # 1-based ID as in the CoNLL file
+ self.position = word_ix # 0-based position in sentence
self.word = info[1]
self.lemma = info[2]
self.pos_universal = info[3]
@@ -31,6 +31,52 @@
return separator.join(info)
+
+class CoNLL09_Token():
+    """One token line of a CoNLL-2009 style file.
+
+    The line is split on whitespace and the columns are read by position:
+    0 = ID, 1 = FORM, 2 = LEMMA, 4 = POS, 8 = HEAD, 10 = DEPREL,
+    12 = FILLPRED, 13 = PRED, 14.. = argument labels.
+    The FILLPRED/PRED/argument columns are optional: a line that stops after
+    column 11 is treated as a non-predicate token without labels.
+    """
+
+    def __init__(self, raw_line, word_ix):
+        """Parse ``raw_line``; ``word_ix`` is stored unchanged as ``position``.
+
+        Raises IndexError if the line has fewer than 11 columns, or if it is
+        marked as a predicate ("Y" in column 12) but has no sense column.
+        """
+        info = raw_line.split()
+        # Example of a full line:
+        # ['1', 'Frau', 'Frau', 'Frau', 'NN', 'NN', '_', 'nom|sg|fem', '5', '5', 'CJ', 'CJ', '_', '_', 'AM-DIS', '_']
+        self.info = info
+        self.id = info[0]  # ID exactly as written in the CoNLL file (kept as a string)
+        self.position = word_ix  # position in the sentence as given by the caller
+        self.word = info[1]
+        self.lemma = info[2]
+        self.pos_tag = info[4]
+        self.pos_universal = "_"  # placeholder; no universal tag is derived here
+        self.head = info[8]
+        self.dep_tag = info[10]
+        # NOTE(review): column 7 ('nom|sg|fem' in the example above) is never read,
+        # so both output methods emit "_" for the features - confirm this is intended.
+        self.detail_tag = "_"
+        # Guard the length so that files without predicate columns still parse.
+        self.is_pred = len(info) > 12 and info[12] == "Y"
+        if self.is_pred:
+            self.pred_sense = info[13].strip("[]")
+            self.pred_sense_id = str(self.position) + "##" + self.pred_sense
+        else:
+            self.pred_sense = None
+            self.pred_sense_id = ""
+        # Slicing past the end yields [], which covers lines without label columns.
+        self.labels = info[14:]
+
+    def get_conllU_line(self, separator="\t"):
+        """Return the token as one 10-column line joined by ``separator``."""
+        # We want: [ID, FORM, LEMMA, UPOS, XPOS, FEATS, HEAD, DEPREL, DEPS, MISC]
+        # NOTE(review): pos_tag is written to the 4th slot and pos_universal ("_")
+        # to the 5th, which is the reverse of the UPOS, XPOS order named above.
+        # Left unchanged because existing consumers may rely on it - confirm.
+        tok_id = str(self.id)
+        conllUinfo = [tok_id, self.word, self.lemma, self.pos_tag, self.pos_universal,
+                      self.detail_tag, self.head, self.dep_tag, "_", "_"]
+        return separator.join(conllUinfo)
+
+    def get_conll09_line(self, delim="\t"):
+        """Return the token as a CoNLL-2009 style line joined by ``delim``.
+
+        Lemma, POS, head and dependency label are each written twice, into the
+        gold column and its predicted counterpart. The argument labels read at
+        construction time are appended unchanged.
+        """
+        # We want:
+        # 1 Frau Frau Frau NN NN _ nom|sg|fem 5 5 CJ CJ _ _ AM-DIS _
+        # 10 fall fall fall VB VB _ _ 8 8 VC VC Y fall.01 _ _ _ _ _
+        is_pred_str = "Y" if self.is_pred else "_"
+        sense_str = self.pred_sense if self.is_pred else "_"
+        info = [str(self.id), self.word, self.lemma, self.lemma, self.pos_tag, self.pos_tag,
+                "_", self.detail_tag, self.head, self.head, self.dep_tag, self.dep_tag,
+                is_pred_str, sense_str] + self.labels
+        return delim.join(info)
+
+
+
################################# GETTING SENTENCE ANNOTATIONS ####################################
class AnnotatedSentence():
def __init__(self):
@@ -50,14 +96,46 @@
return [tok.pos_tag for tok in self.tokens]
-def get_annotation(raw_lines, raw_meta, token_class=CoNLLUP_Token):
+def get_annotation(raw_lines, raw_meta, token_class):
+    # Builds one AnnotatedSentence: its metadata is raw_meta with newline characters
+    # stripped from both ends of each entry, and its tokens are token_class(line, index)
+    # for every entry of raw_lines, where index is a counter starting at 0.
+    # token_class has no default in this version, so callers must pass it explicitly.
ann = AnnotatedSentence()
ann.metadata = [m.strip("\n") for m in raw_meta]
# Annotate the predicates and senses
real_index = 0
for i, line in enumerate(raw_lines):
tok = token_class(line, real_index)
- if tok:
- ann.tokens.append(tok)
+ ann.tokens.append(tok)
real_index += 1
return ann
+
+
+def read_conll(line_generator, chunk_size, token_class=CoNLLUP_Token):
+    """Read up to ``chunk_size`` sentences from an iterable of CoNLL lines.
+
+    Lines starting with "###C:" are collected as metadata of the sentence that
+    follows them. A whitespace-only line closes the current sentence.
+
+    Args:
+        line_generator: iterable yielding the lines of the file one by one.
+        chunk_size: stop after this many sentences; a value <= 0 reads everything.
+        token_class: class used to parse each token line (passed to get_annotation).
+
+    Returns:
+        A tuple ``(annotated_sentences, n_sents)``.
+    """
+    n_sents = 0
+    annotated_sentences, buffer_meta, buffer_lst = [], [], []
+    for line in line_generator:
+        if line.startswith("###C:"):
+            buffer_meta.append(line)
+            continue
+        if line.split():
+            buffer_lst.append(line)
+            continue
+        # Blank line: nothing buffered means a stray separator (e.g. two blank
+        # lines in a row), which must not be counted as an empty sentence.
+        if not buffer_lst and not buffer_meta:
+            continue
+        annotated_sentences.append(get_annotation(buffer_lst, buffer_meta, token_class))
+        n_sents += 1
+        buffer_lst, buffer_meta = [], []
+        if chunk_size > 0 and n_sents == chunk_size:
+            break
+    # Input that does not end with a blank line leaves the last sentence in the
+    # buffer; emit it instead of silently dropping it. After a chunk-size break
+    # the buffer is empty, so this never exceeds chunk_size.
+    if buffer_lst:
+        annotated_sentences.append(get_annotation(buffer_lst, buffer_meta, token_class))
+        n_sents += 1
+    return annotated_sentences, n_sents
+
+
+def read_conll_generator(filepath, token_class=CoNLLUP_Token):
+    """Lazily yield one annotated sentence at a time from the file at ``filepath``.
+
+    Lines starting with "###C:" are attached as metadata to the sentence that
+    follows them, a whitespace-only line closes the current sentence, and a
+    final sentence without a trailing blank line is yielded as well.
+
+    Args:
+        filepath: path of the CoNLL file to read.
+        token_class: class used to parse each token line (passed to get_annotation).
+
+    Yields:
+        The object returned by get_annotation for each sentence.
+    """
+    buffer_meta, buffer_lst = [], []
+    with open(filepath) as f:
+        # Iterate the file object directly so only one line is held at a time;
+        # readlines() would load the whole corpus before yielding anything.
+        for line in f:
+            if line.startswith("###C:"):
+                buffer_meta.append(line)
+                continue
+            if line.split():
+                buffer_lst.append(line)
+                continue
+            # Stray blank line with nothing buffered: not a sentence.
+            if not buffer_lst and not buffer_meta:
+                continue
+            ann = get_annotation(buffer_lst, buffer_meta, token_class)
+            buffer_lst, buffer_meta = [], []
+            yield ann
+    # Flush the last sentence when the file does not end with a blank line.
+    if buffer_lst:
+        yield get_annotation(buffer_lst, buffer_meta, token_class)
\ No newline at end of file