Work with Turku Client-Server schema

commit: a3c8c8b7ca8a582b485df56441f4a4794e25a36e [log] [tgz]
author: daza <daza@uni-heidelberg.de> Mon Aug 31 17:19:11 2020 +0200
committer: daza <daza@uni-heidelberg.de> Mon Aug 31 17:19:11 2020 +0200
tree: afdc8c4565a9b8303dd4e05e1ac3668a9f647a16
parent: 49b14c03af4d39eb7d530c5a35ee3cc14dbb3377 [diff] [blame]
diff --git a/DeReKo/CoNLL_Annotation.py b/DeReKo/CoNLL_Annotation.py
new file mode 100644
index 0000000..10b3f54
--- /dev/null
+++ b/DeReKo/CoNLL_Annotation.py

@@ -0,0 +1,63 @@
+from collections import defaultdict, OrderedDict
+import re
+
+# CoNLL-U Format - https://universaldependencies.org/format.html
+
+
+class CoNLLUP_Token():
+    """One token row of a CoNLL-U style annotation file.
+
+    The ten columns of the row are exposed as attributes, and ``info`` keeps
+    the list of column strings as they were split from the line.
+    """
+
+    def __init__(self, raw_line, word_ix):
+        """Parse a single token line.
+
+        Args:
+            raw_line: Text of one token row. When the line contains a tab it
+                is split on tabs only; otherwise it is split on any
+                whitespace.
+            word_ix: 0-based position of the token in its sentence.
+
+        Raises:
+            IndexError: If the line yields fewer than ten columns.
+            ValueError: If the ID column cannot be converted to an integer.
+        """
+        # CoNLL-U columns are tab-separated and FORM/LEMMA may contain
+        # spaces, so splitting such a row on any whitespace would shift the
+        # columns. Lines without a tab keep the whitespace split.
+        if "\t" in raw_line:
+            info = [column.strip() for column in raw_line.split("\t")]
+        else:
+            info = raw_line.split()
+        # [ID, FORM, LEMMA, UPOS, XPOS, FEATS, HEAD, DEPREL, DEPS, MISC]
+        # [11, Prügel, Prügel, NN, NN, _, _, _, _, 1.000000]
+        self.info = info
+        self.id = int(info[0])  # 1-based ID as in the CoNLL file
+        self.position = int(word_ix)  # 0-based position in sentence
+        self.word = info[1]
+        self.lemma = info[2]
+        self.pos_universal = info[3]
+        self.pos_tag = info[4]
+        self.detail_tag = info[5]
+        self.head = info[6]  # kept as the raw column string
+        self.dep_tag = info[7]
+        self.blank = info[8]  # ninth column (DEPS slot in the list above)
+        self.auto_score = info[9]  # tenth column (MISC slot in the list above)
+
+    def get_info(self):
+        """Return the ten column values as a list, in column order.
+
+        ID and HEAD are passed through ``str``; the other values are returned
+        as stored on the instance.
+        """
+        return [str(self.id), self.word, self.lemma, self.pos_universal, self.pos_tag, self.detail_tag,
+                str(self.head), self.dep_tag, self.blank, self.auto_score]
+
+    def get_conllU_line(self, separator="\t"):
+        """Return the columns joined by ``separator``, without a trailing newline."""
+        return separator.join(self.get_info())
+
+
+################################# GETTING SENTENCE ANNOTATIONS ####################################
+class AnnotatedSentence():
+    """Container for the metadata lines and token objects of one sentence."""
+
+    def __init__(self):
+        """Start with no metadata and no tokens; callers fill both lists."""
+        self.metadata, self.tokens = [], []
+
+    def get_words(self):
+        """Return the ``word`` attribute of every token, in order."""
+        words = []
+        for token in self.tokens:
+            words.append(token.word)
+        return words
+
+    def get_sentence(self):
+        """Return the token words joined by single spaces."""
+        return " ".join(token.word for token in self.tokens)
+
+    def get_pos_tags(self, universal=False):
+        """Return one POS tag per token.
+
+        Reads ``pos_universal`` from each token when ``universal`` is truthy,
+        and ``pos_tag`` otherwise.
+        """
+        attribute = "pos_universal" if universal else "pos_tag"
+        return [getattr(token, attribute) for token in self.tokens]
+
+
+def get_annotation(raw_lines, raw_meta, token_class=CoNLLUP_Token):
+    """Build an AnnotatedSentence from the raw lines of one sentence.
+
+    Args:
+        raw_lines: Iterable of token lines, one token per line. Blank or
+            whitespace-only strings are skipped and do not consume a
+            position.
+        raw_meta: Iterable of metadata lines; newline characters are stripped
+            from both ends of each.
+        token_class: Callable invoked as ``token_class(line, position)`` to
+            build each token, where ``position`` is the 0-based index of the
+            line among the non-blank lines.
+
+    Returns:
+        An AnnotatedSentence holding the stripped metadata and every truthy
+        token, in input order.
+    """
+    ann = AnnotatedSentence()
+    ann.metadata = [m.strip("\n") for m in raw_meta]
+    position = 0
+    for line in raw_lines:
+        # A blank line (e.g. a sentence separator) carries no token; the
+        # default token class would raise IndexError on it, so skip it.
+        if isinstance(line, str) and not line.strip():
+            continue
+        tok = token_class(line, position)
+        # Only truthy tokens are stored; instances of the default class are
+        # always truthy, so this only matters for a custom token_class.
+        if tok:
+            ann.tokens.append(tok)
+        position += 1
+    return ann
commit	a3c8c8b7ca8a582b485df56441f4a4794e25a36e	[log] [tgz]
author	daza <daza@uni-heidelberg.de>	Mon Aug 31 17:19:11 2020 +0200
committer	daza <daza@uni-heidelberg.de>	Mon Aug 31 17:19:11 2020 +0200
tree	afdc8c4565a9b8303dd4e05e1ac3668a9f647a16
parent	49b14c03af4d39eb7d530c5a35ee3cc14dbb3377 [diff] [blame]