Added lib folder

Change-Id: Ifb0c8f44a1648bb22f636520dbf3678daf0c1c87
diff --git a/lib/CoNLL_Annotation.py b/lib/CoNLL_Annotation.py
new file mode 100644
index 0000000..02fae7d
--- /dev/null
+++ b/lib/CoNLL_Annotation.py
@@ -0,0 +1,218 @@
+from collections import defaultdict, OrderedDict
+import re
+
+# CoNLL-U Format - https://universaldependencies.org/format.html
+
+
+def get_token_type(type_str):
+    if type_str =="CoNLL09_Token":
+        return CoNLL09_Token
+    elif type_str == "RNNTagger_Token":
+        return RNNTagger_Token
+    elif type_str == "CoNLLUP_Token":
+        return CoNLLUP_Token
+    elif type_str == "TigerNew_Token":
+        return TigerNew_Token
+    else:
+        raise NotImplementedError(f"I don't know what to do with {type_str} token type!")
+
+
+class TigerNew_Token():
+    def __init__(self, raw_line, word_ix):
+        info = raw_line.split() # [FORM, XPOS]
+        self.info = info
+        self.id = word_ix + 1 # 1-based ID as in the CoNLL file
+        self.position = word_ix # 0-based position in sentence
+        self.word = info[0]
+        self.lemma = "_"
+        self.pos_universal = "_"
+        self.pos_tag = info[1]
+        self.detail_tag = "_"
+        self.head = "_"
+        self.dep_tag = "_"
+        self.blank = "_"
+        self.auto_score = "_"
+        
+    def get_info(self):
+        return [str(self.id), self.word, self.lemma, self.pos_universal, self.pos_tag, self.detail_tag,
+                str(self.head), self.dep_tag, self.blank, self.auto_score]
+
+    def get_conllU_line(self, separator="\t"):
+        info = self.get_info()
+        return separator.join(info)
+
+
+class RNNTagger_Token():
+    def __init__(self, raw_line, word_ix):
+        info = raw_line.split() # [FORM, XPOS.FEATS, LEMMA]
+        self.info = info
+        self.id = word_ix + 1 # 1-based ID as in the CoNLL file
+        self.position = word_ix # 0-based position in sentence
+        self.word = info[0]
+        self.lemma = info[2]
+        self.pos_universal = "_"
+        self.pos_tag, self.detail_tag = self._process_tag(info[1]) # 'NN.Gen.Sg.Fem'
+        self.head = "_"
+        self.dep_tag = "_"
+        self.blank = "_"
+        self.auto_score = "_"
+        
+    def _process_tag(self, tag):
+        if tag == "_" or "." not in tag: return tag, "_"
+        info = tag.split(".")
+        return info[0], "|".join(info[1:])
+        
+    def get_info(self):
+        return [str(self.id), self.word, self.lemma, self.pos_universal, self.pos_tag, self.detail_tag,
+                str(self.head), self.dep_tag, self.blank, self.auto_score]
+
+    def get_conllU_line(self, separator="\t"):
+        info = self.get_info()
+        return separator.join(info)
+
+
+class CoNLLUP_Token():
+    def __init__(self, raw_line, word_ix):
+        info = raw_line.split()
+        # print(info)
+        # [ID, FORM, LEMMA, UPOS, XPOS, FEATS, HEAD, DEPREL, DEPS, MISC]
+        # [11, Prügel, Prügel, NN, NN, _, _, _,	_, 1.000000]
+        self.info = info
+        self.id = info[0] # 1-based ID as in the CoNLL file
+        self.position = word_ix # 0-based position in sentence
+        self.word = info[1]
+        self.lemma = info[2]
+        self.pos_universal = info[3]
+        self.pos_tag = self._process_tag(info[4]) # 'XPOS=NE|Case=Nom|Gender=Masc|Number=Sing' TODO: Reuse MorphInfo in the self.detail_tag
+        self.detail_tag = info[5]
+        self.head = info[6]
+        self.dep_tag = info[7]
+        self.blank = info[8] # ???
+        self.auto_score = info[9]
+        
+    def _process_tag(self, tag):
+        if tag == "_" or "|" not in tag: return tag # The XPOS=NE|Case=Nom... is only for Turku!
+        info = tag.split("|")
+        info = [x.split("=") for x in info]
+        return info[0][1]
+        
+    def get_info(self):
+        return [str(self.id), self.word, self.lemma, self.pos_universal, self.pos_tag, self.detail_tag,
+                str(self.head), self.dep_tag, self.blank, self.auto_score]
+
+    def get_conllU_line(self, separator="\t"):
+        info = self.get_info()
+        return separator.join(info)
+
+
+
+class CoNLL09_Token():
+    def __init__(self, raw_line, word_ix):
+        info = raw_line.split()
+        # print(info)
+        # # ['1', 'Frau', 'Frau', 'Frau', 'NN', 'NN', '_', 'nom|sg|fem', '5', '5', 'CJ', 'CJ', '_', '_', 'AM-DIS', '_']
+        self.info = info
+        self.id = info[0] # 1-based ID as in the CoNLL file
+        self.position = word_ix # 0-based position in sentence
+        self.word = info[1]
+        self.lemma = info[2]
+        self.pos_universal = "_" # _convert_to_universal(self.pos_tag, self.lemma)
+        self.pos_tag = info[4]
+        self.head = info[8]
+        self.dep_tag = info[10]
+        self.detail_tag = "_"
+        self.is_pred = True if info[12] == "Y" else False
+        if self.is_pred:
+            self.pred_sense = info[13].strip("[]")
+            self.pred_sense_id = str(self.position) + "##" + self.pred_sense
+        else:
+            self.pred_sense = None
+            self.pred_sense_id = ""
+        if len(info) > 14:
+            self.labels = info[14:]
+        else:
+            self.labels = []
+
+    def get_conllU_line(self, separator="\t"):
+        # We want: [ID, FORM, LEMMA, UPOS, XPOS, FEATS, HEAD, DEPREL, DEPS, MISC]
+        tok_id = str(self.id) #.split("_")[0]
+        conllUinfo = [tok_id, self.word, self.lemma, self.pos_universal, self.pos_tag, self.detail_tag, self.head, self.dep_tag, "_", "_"]
+        return separator.join(conllUinfo)
+
+    def get_conll09_line(self, delim="\t"):
+        # We want:
+        # 1 Frau Frau Frau NN NN _ nom|sg|fem 5 5 CJ CJ _ _ AM-DIS _
+        # 10	fall	fall	fall	VB	VB	_	_	8	8	VC	VC	Y	fall.01	_	_	_	_	_
+        is_pred_str = "Y" if self.is_pred else "_"
+        sense_str = self.pred_sense if self.is_pred else "_"
+        info = [self.id, self.word, self.lemma, self.lemma, self.pos_tag, self.pos_tag, "_", self.detail_tag,
+                self.head, self.head, self.dep_tag, self.dep_tag, is_pred_str, sense_str] + self.labels
+        return delim.join(info)
+
+
+
+################################# GETTING SENTENCE ANNOTATIONS ####################################
+class AnnotatedSentence():
+    def __init__(self):
+        self.metadata = []
+        self.tokens = []
+
+    def get_words(self):
+        return [tok.word for tok in self.tokens]
+
+    def get_sentence(self):
+        return " ".join([tok.word for tok in self.tokens])
+
+    def get_pos_tags(self, universal=False):
+        if universal:
+            return [tok.pos_universal for tok in self.tokens]
+        else:
+            return [tok.pos_tag for tok in self.tokens]
+
+
+def get_annotation(raw_lines, raw_meta, token_class):
+    ann = AnnotatedSentence()
+    ann.metadata = [m.strip("\n") for m in raw_meta]
+    # Annotate the predicates and senses
+    real_index = 0
+    for i, line in enumerate(raw_lines):
+        tok = token_class(line, real_index)
+        ann.tokens.append(tok)
+        real_index += 1
+    return ann
+
+
+def read_conll(line_generator, chunk_size, token_class=CoNLLUP_Token, comment_str="###C:"):
+    n_sents = 0
+    annotated_sentences, buffer_meta, buffer_lst = [], [], []
+    for i, line in enumerate(line_generator):
+        if line.startswith(comment_str):
+            buffer_meta.append(line) 
+            continue
+        if len(line.split()) > 0:
+            buffer_lst.append(line)
+        else:
+            ann = get_annotation(buffer_lst, buffer_meta, token_class)
+            n_sents += 1
+            buffer_lst, buffer_meta = [], []
+            annotated_sentences.append(ann)
+        if chunk_size > 0 and n_sents == chunk_size: break
+    # logger.info("Read {} Sentences!".format(n_sents))
+    return annotated_sentences, n_sents
+
+    
+def read_conll_generator(filepath, token_class=CoNLLUP_Token, sent_sep=None, comment_str="###C:"):
+    buffer_meta, buffer_lst = [], []
+    sentence_finished = False
+    with open(filepath) as f:
+        for i, line in enumerate(f.readlines()):
+            if sent_sep and sent_sep in line: sentence_finished = True
+            if line.startswith(comment_str):
+                continue
+            if len(line.split()) > 0 and not sentence_finished:
+                buffer_lst.append(line)
+            else:
+                ann = get_annotation(buffer_lst, buffer_meta, token_class)
+                buffer_lst, buffer_meta = [], []
+                sentence_finished = False
+                yield ann
\ No newline at end of file
diff --git a/lib/German_STTS_Tasget.tsv b/lib/German_STTS_Tasget.tsv
new file mode 100644
index 0000000..04b424d
--- /dev/null
+++ b/lib/German_STTS_Tasget.tsv
@@ -0,0 +1,55 @@
+Tag	Description	Example
+ADJA	attributive adjective (including participles used adjectivally)	das große Haus die versunkene Glocke
+ADJD	predicate adjective; adjective used adverbially	der Vogel ist blau er fährt schnell
+ADV	adverb (never used as attributive adjective)	sie kommt bald
+APPR	preposition left hand part of double preposition	auf dem Tisch an der Straße entlang
+APPRART	preposition with fused article	am Tag
+APPO	postposition	meiner Meinung nach
+APZR	right hand part of double preposition	an der Straße entlang
+ART	article (definite or indefinite)	die Tante; eine Tante
+CARD	cardinal number (words or figures); also declined	zwei; 526; dreier
+FM	foreign words (actual part of speech in original language may be appended, e.g. FMADV/ FM-NN)	semper fidem
+ITJ	interjection	Ach!
+KON	co-ordinating conjunction	oder ich bezahle nicht
+KOKOM	comparative conjunction or particle	er arbeitet als Straßenfeger, so gut wie du
+KOUI	preposition used to introduce infinitive clause	um den König zu töten
+KOUS	subordinating conjunction	weil er sie gesehen hat
+NA	adjective used as noun	der Gesandte
+NE	names and other proper nouns	Moskau
+NN	noun (but not adjectives used as nouns)	der Abend
+PAV [PROAV]	pronominal adverb	sie spielt damit
+PAVREL	pronominal adverb used as relative	die Puppe, damit sie spielt
+PDAT	demonstrative determiner	dieser Mann war schlecht
+PDS	demonstrative pronoun	dieser war schlecht
+PIAT	indefinite determiner (whether occurring on its own or in conjunction with another determiner)	einige Wochen, viele solche Bemerkungen
+PIS	indefinite pronoun	sie hat viele gesehen
+PPER	personal pronoun	sie liebt mich
+PRF	reflexive pronoun	ich wasche mich, sie wäscht sich
+PPOSS	possessive pronoun	das ist meins
+PPOSAT	possessive determiner	mein Buch, das ist der meine/meinige
+PRELAT	relative depending on a noun	der Mann, dessen Lied ich singe […], welchen Begriff ich nicht verstehe
+PRELS	relative pronoun (i.e. forms of der or welcher)	der Herr, der gerade kommt; der Herr, welcher nun kommt
+PTKA	particle with adjective or adverb	am besten, zu schnell, aufs herzlichste
+PTKANT	answer particle	ja, nein
+PTKNEG	negative particle	nicht
+PTKREL	indeclinable relative particle	so
+PTKVZ	separable prefix	sie kommt an
+PTKZU	infinitive particle	zu
+PWS	interrogative pronoun	wer kommt?
+PWAT	interrogative determiner	welche Farbe?
+PWAV	interrogative adverb	wann kommst du?
+PWAVREL	interrogative adverb used as relative	der Zaun, worüber sie springt
+PWREL	interrogative pronoun used as relative	etwas, was er sieht
+TRUNC	truncated form of compound	Vor- und Nachteile
+VAFIN	finite auxiliary verb	sie ist gekommen
+VAIMP	imperative of auxiliary	sei still!
+VAINF	infinitive of auxiliary	er wird es gesehen haben
+VAPP	past participle of auxiliary	sie ist es gewesen
+VMFIN	finite modal verb	sie will kommen
+VMINF	infinitive of modal	er hat es sehen müssen
+VMPP	past participle of auxiliary	sie hat es gekonnt
+VVFIN	finite full verb	sie ist gekommen
+VVIMP	imperative of full verb	bleibt da!
+VVINF	infinitive of full verb	er wird es sehen
+VVIZU	infinitive with incorporated zu	sie versprach aufzuhören
+VVPP	past participle of full verb	sie ist gekommen
\ No newline at end of file
diff --git a/lib/__init__.py b/lib/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/lib/__init__.py