Work with Turku Client-Server schema
diff --git a/DeReKo/CoNLL_Annotation.py b/DeReKo/CoNLL_Annotation.py
new file mode 100644
index 0000000..10b3f54
--- /dev/null
+++ b/DeReKo/CoNLL_Annotation.py
@@ -0,0 +1,63 @@
+from collections import defaultdict, OrderedDict
+import re
+
+# CoNLL-U Format - https://universaldependencies.org/format.html
+
+
+class CoNLLUP_Token():
+    """One token row of a CoNLL-U style file; every column except the ID is kept as a string."""
+    def __init__(self, raw_line, word_ix):
+        """Parse ``raw_line`` (one token row); ``word_ix`` is its 0-based position in the sentence."""
+        # [ID, FORM, LEMMA, UPOS, XPOS, FEATS, HEAD, DEPREL, DEPS, MISC]
+        # [11, Prügel, Prügel, NN, NN, _, _, _, _, 1.000000]
+        self.info = info = raw_line.rstrip("\r\n").split("\t") if "\t" in raw_line else raw_line.split()
+        self.id = int(info[0])  # 1-based ID as in the CoNLL file
+        self.position = int(word_ix)  # 0-based position in sentence
+        self.word = info[1]  # FORM; keeps inner spaces when the row is tab-separated
+        self.lemma = info[2]  # LEMMA; same tab-vs-whitespace rule as FORM
+        self.pos_universal = info[3]  # UPOS
+        self.pos_tag = info[4]  # XPOS
+        self.detail_tag = info[5]  # FEATS
+        self.head = info[6]  # HEAD, left as the raw string ("_" in the sample row)
+        self.dep_tag = info[7]  # DEPREL
+        self.blank = info[8]  # DEPS per the column layout above ("_" in the sample row)
+        self.auto_score = info[9]  # 10th column (MISC slot); the sample row holds a score
+
+    def get_info(self):  # -> the ten column values as strings, in file order
+        return [str(self.id), self.word, self.lemma, self.pos_universal, self.pos_tag, self.detail_tag,
+                str(self.head), self.dep_tag, self.blank, self.auto_score]
+
+    def get_conllU_line(self, separator="\t"):  # -> the columns joined by ``separator``
+        return separator.join(self.get_info())
+
+
+################################# GETTING SENTENCE ANNOTATIONS ####################################
+class AnnotatedSentence():
+    """A sentence held as a list of metadata lines plus an ordered list of token objects."""
+    def __init__(self):
+        self.metadata, self.tokens = [], []
+
+    def get_words(self):
+        """Return the ``word`` attribute of every token, in sentence order."""
+        return [token.word for token in self.tokens]
+
+    def get_sentence(self):
+        """Return the token words joined by single spaces ("" when there are no tokens)."""
+        return " ".join(self.get_words())
+
+    def get_pos_tags(self, universal=False):
+        """Return each token's ``pos_universal`` when ``universal`` is truthy, else its ``pos_tag``."""
+        return [getattr(token, "pos_universal" if universal else "pos_tag") for token in self.tokens]
+
+
+def get_annotation(raw_lines, raw_meta, token_class=CoNLLUP_Token):
+    """Build an AnnotatedSentence from token rows (``raw_lines``) and metadata lines
+    (``raw_meta``, newline-stripped); each row is parsed via ``token_class(line, position)``."""
+    ann = AnnotatedSentence()
+    ann.metadata = [m.strip("\n") for m in raw_meta]
+    for line in raw_lines:
+        # Blank rows yield no token; position counts only tokens kept so far (0-based, dense).
+        tok = token_class(line, len(ann.tokens)) if line.strip() else None
+        if tok:
+            ann.tokens.append(tok)
+    return ann