Added Parsing and Evaluation of Lemmas using Tiger Corpus

commit: e94ab18717c5ff0982f408f5bc709948b9e2ac88 [log] [tgz]
author: daza <daza@uni-heidelberg.de> Tue Sep 01 16:41:30 2020 +0200
committer: daza <daza@uni-heidelberg.de> Tue Sep 01 16:41:30 2020 +0200
tree: ff91c37e32499c2f2c51ae4ac7d7001a56f53a7e
parent: a3c8c8b7ca8a582b485df56441f4a4794e25a36e [diff] [blame]
diff --git a/DeReKo/CoNLL_Annotation.py b/DeReKo/CoNLL_Annotation.py
index 10b3f54..9729375 100644
--- a/DeReKo/CoNLL_Annotation.py
+++ b/DeReKo/CoNLL_Annotation.py

@@ -10,8 +10,8 @@
         # [ID, FORM, LEMMA, UPOS, XPOS, FEATS, HEAD, DEPREL, DEPS, MISC]
         # [11, Prügel, Prügel, NN, NN, _, _, _,	_, 1.000000]
         self.info = info
-        self.id = int(info[0]) # 1-based ID as in the CoNLL file
-        self.position = int(word_ix) # 0-based position in sentence
+        self.id = info[0] # 1-based ID as in the CoNLL file
+        self.position = word_ix # 0-based position in sentence
         self.word = info[1]
         self.lemma = info[2]
         self.pos_universal = info[3]
@@ -31,6 +31,52 @@
         return separator.join(info)
 
 
+
+class CoNLL09_Token():  # One token row of a CoNLL09-style file, split into named fields
+    def __init__(self, raw_line, word_ix):  # raw_line: one token line; word_ix: stored unchanged as self.position
+        info = raw_line.split()
+        # Sample of `info` for one line (16 whitespace-separated columns):
+        # ['1', 'Frau', 'Frau', 'Frau', 'NN', 'NN', '_', 'nom|sg|fem', '5', '5', 'CJ', 'CJ', '_', '_', 'AM-DIS', '_']
+        self.info = info
+        self.id = info[0] # 1-based ID as in the CoNLL file (kept as a string)
+        self.position = word_ix # 0-based position in sentence
+        self.word = info[1]
+        self.lemma = info[2]
+        self.pos_tag = info[4]  # info[3] and info[5] are not read
+        self.pos_universal = "_" # placeholder: no universal POS is derived here
+        self.head = info[8]
+        self.dep_tag = info[10]
+        self.detail_tag = "_"  # placeholder: info[7] ('nom|sg|fem' in the sample) is not read
+        self.is_pred = True if info[12] == "Y" else False  # "Y" in info[12] flags a predicate; shorter lines raise IndexError
+        if self.is_pred:
+            self.pred_sense = info[13].strip("[]")  # drops leading/trailing '[' and ']' characters
+            self.pred_sense_id = str(self.position) + "##" + self.pred_sense  # "<position>##<sense>"
+        else:
+            self.pred_sense = None
+            self.pred_sense_id = ""
+        if len(info) > 14:  # any columns after the sense column are kept verbatim as labels
+            self.labels = info[14:]
+        else:
+            self.labels = []
+
+    def get_conllU_line(self, separator="\t"):  # Serialize the token as one 10-column line joined by `separator`
+        # Target columns: [ID, FORM, LEMMA, UPOS, XPOS, FEATS, HEAD, DEPREL, DEPS, MISC]; DEPS and MISC are always "_"
+        tok_id = str(self.id)  # NOTE(review): pos_tag fills the UPOS slot and pos_universal the XPOS slot below - confirm order
+        conllUinfo = [tok_id, self.word, self.lemma, self.pos_tag, self.pos_universal, self.detail_tag, self.head, self.dep_tag, "_", "_"]
+        return separator.join(conllUinfo)
+
+    def get_conll09_line(self, delim="\t"):  # Serialize the token back to a CoNLL09-style line joined by `delim`
+        # Target layout, two sample lines:
+        # 1 Frau Frau Frau NN NN _ nom|sg|fem 5 5 CJ CJ _ _ AM-DIS _
+        # 10	fall	fall	fall	VB	VB	_	_	8	8	VC	VC	Y	fall.01	_	_	_	_	_
+        is_pred_str = "Y" if self.is_pred else "_"
+        sense_str = self.pred_sense if self.is_pred else "_"
+        info = [self.id, self.word, self.lemma, self.lemma, self.pos_tag, self.pos_tag, "_", self.detail_tag,
+                self.head, self.head, self.dep_tag, self.dep_tag, is_pred_str, sense_str] + self.labels  # lemma, pos_tag, head, dep_tag each fill two adjacent columns
+        return delim.join(info)
+
+
+
 ################################# GETTING SENTENCE ANNOTATIONS ####################################
 class AnnotatedSentence():
     def __init__(self):
@@ -50,14 +96,46 @@
             return [tok.pos_tag for tok in self.tokens]
 
 
-def get_annotation(raw_lines, raw_meta, token_class=CoNLLUP_Token):
+def get_annotation(raw_lines, raw_meta, token_class):  # Build one AnnotatedSentence from token lines and metadata lines; token_class is now required
     ann = AnnotatedSentence()
     ann.metadata = [m.strip("\n") for m in raw_meta]
     # Annotate the predicates and senses
     real_index = 0
     for i, line in enumerate(raw_lines):
         tok = token_class(line, real_index)
-        if tok:
-            ann.tokens.append(tok)
+        ann.tokens.append(tok)  # every constructed token is kept, so real_index always equals i
         real_index += 1
     return ann
+
+
+def read_conll(line_generator, chunk_size, token_class=CoNLLUP_Token):  # Group lines into sentences; returns (annotated_sentences, n_sents)
+    n_sents = 0
+    annotated_sentences, buffer_meta, buffer_lst = [], [], []
+    for i, line in enumerate(line_generator):
+        if line.startswith("###C:"):  # metadata line: collected for the sentence currently being buffered
+            buffer_meta.append(line) 
+            continue
+        if len(line.split()) > 0:  # non-blank line: buffered as a token row
+            buffer_lst.append(line)
+        else:  # blank line closes a sentence; consecutive blank lines each produce a token-less sentence
+            ann = get_annotation(buffer_lst, buffer_meta, token_class)
+            n_sents += 1
+            buffer_lst, buffer_meta = [], []
+            annotated_sentences.append(ann)
+        if chunk_size > 0 and n_sents == chunk_size: break  # chunk_size <= 0 disables the limit
+    # NOTE(review): lines still buffered when the input ends without a blank line are never emitted - confirm intended
+    return annotated_sentences, n_sents
+
+    
+def read_conll_generator(filepath, token_class=CoNLLUP_Token):  # Yield one annotation per blank-line-terminated sentence in the file
+    buffer_meta, buffer_lst = [], []
+    with open(filepath) as f:
+        for i, line in enumerate(f.readlines()):  # readlines() loads the whole file before iteration starts
+            if line.startswith("###C:"):
+                continue  # metadata lines are skipped, so buffer_meta is always empty in this function
+            if len(line.split()) > 0:  # non-blank line: buffered as a token row
+                buffer_lst.append(line)
+            else:  # blank line closes the current sentence
+                ann = get_annotation(buffer_lst, buffer_meta, token_class)
+                buffer_lst, buffer_meta = [], []
+                yield ann  # NOTE(review): a final sentence without a trailing blank line is never yielded - confirm intended
\ No newline at end of file
commit	e94ab18717c5ff0982f408f5bc709948b9e2ac88	[log] [tgz]
author	daza <daza@uni-heidelberg.de>	Tue Sep 01 16:41:30 2020 +0200
committer	daza <daza@uni-heidelberg.de>	Tue Sep 01 16:41:30 2020 +0200
tree	ff91c37e32499c2f2c51ae4ac7d7001a56f53a7e
parent	a3c8c8b7ca8a582b485df56441f4a4794e25a36e [diff] [blame]