DeReKo/CoNLL_Annotation.py - KorAP/sota-pos-lemmatizers - Gitiles

 from collections import defaultdict, OrderedDict
 import re

 # CoNLL-U Format - https://universaldependencies.org/format.html


 class CoNLLUP_Token:
     """A single token line of a CoNLL-U style annotated sentence.

     Columns are read in the order
     [ID, FORM, LEMMA, UPOS, XPOS, FEATS, HEAD, DEPREL, DEPS, MISC], e.g.
     [11, Prügel, Prügel, NN, NN, _, _, _, _, 1.000000].
     Only ``id`` and ``position`` are converted to ``int``; every other
     column is stored as the string found in the line.
     """

     # Number of leading columns that are consumed from each token line.
     _NUM_COLUMNS = 10

     def __init__(self, raw_line, word_ix):
         """Parse one token line.

         Args:
             raw_line: The token line, either tab-delimited or separated by
                 arbitrary whitespace. Surrounding whitespace is ignored.
             word_ix: 0-based position of the token inside its sentence.

         Raises:
             IndexError: If the line holds fewer than ten columns.
             ValueError: If the ID column (or ``word_ix``) cannot be
                 converted with ``int()``.
         """
         # CoNLL-U separates columns with tabs and allows spaces inside FORM
         # and LEMMA, so a blind whitespace split would shift the columns of
         # tokens such as "New York". Prefer the tab split and only fall back
         # to whitespace splitting when the line is not tab-delimited.
         info = [field.strip() for field in raw_line.strip().split("\t")]
         if len(info) < self._NUM_COLUMNS:
             info = raw_line.split()
         if len(info) < self._NUM_COLUMNS:
             # Same exception type that plain list indexing would raise, but
             # with a message that identifies the offending line.
             raise IndexError(
                 "expected at least {} columns, got {}: {!r}".format(
                     self._NUM_COLUMNS, len(info), raw_line))

         self.info = info
         self.id = int(info[0])  # 1-based ID as in the CoNLL file
         self.position = int(word_ix)  # 0-based position in sentence
         self.word = info[1]  # FORM
         self.lemma = info[2]  # LEMMA
         self.pos_universal = info[3]  # UPOS
         self.pos_tag = info[4]  # XPOS
         self.detail_tag = info[5]  # FEATS
         self.head = info[6]  # HEAD (kept as a string)
         self.dep_tag = info[7]  # DEPREL
         self.blank = info[8]  # DEPS column
         self.auto_score = info[9]  # MISC column (a score in the example above)

     def get_info(self):
         """Return the ten column values as a list of strings, in file order."""
         return [
             str(self.id),
             self.word,
             self.lemma,
             self.pos_universal,
             self.pos_tag,
             self.detail_tag,
             str(self.head),
             self.dep_tag,
             self.blank,
             self.auto_score,
         ]

     def get_conllU_line(self, separator="\t"):
         """Return the token as one line, columns joined by ``separator``."""
         return separator.join(self.get_info())


 ################################# GETTING SENTENCE ANNOTATIONS ####################################
 class AnnotatedSentence:
     """Holds the token objects and metadata lines of one sentence."""

     def __init__(self):
         # Both lists start out empty.
         self.tokens = []
         self.metadata = []

     def get_words(self):
         """Return the ``word`` attribute of every token, in order."""
         return [token.word for token in self.tokens]

     def get_sentence(self):
         """Return the token words joined by single spaces."""
         return " ".join(token.word for token in self.tokens)

     def get_pos_tags(self, universal=False):
         """Return one POS tag per token.

         When ``universal`` is truthy the ``pos_universal`` attribute of each
         token is read, otherwise its ``pos_tag`` attribute.
         """
         tag_attr = "pos_universal" if universal else "pos_tag"
         return [getattr(token, tag_attr) for token in self.tokens]


 def get_annotation(raw_lines, raw_meta, token_class=CoNLLUP_Token):
     """Build an ``AnnotatedSentence`` from token lines and metadata lines.

     Args:
         raw_lines: Iterable of token lines; each one is handed to
             ``token_class`` together with its 0-based position.
         raw_meta: Iterable of metadata strings; newline characters are
             stripped from both ends of each entry.
         token_class: Callable invoked as ``token_class(line, position)``.

     Returns:
         The populated ``AnnotatedSentence``.
     """
     ann = AnnotatedSentence()
     ann.metadata = [m.strip("\n") for m in raw_meta]
     # The enumerate index is the token position; a falsy result from
     # ``token_class`` is skipped but still consumes its position.
     for position, line in enumerate(raw_lines):
         tok = token_class(line, position)
         if tok:
             ann.tokens.append(tok)
     return ann
	from collections import defaultdict, OrderedDict
	import re

	# CoNLL-U Format - https://universaldependencies.org/format.html


	class CoNLLUP_Token:
	    """A single token line of a CoNLL-U style annotated sentence.

	    Columns are read in the order
	    [ID, FORM, LEMMA, UPOS, XPOS, FEATS, HEAD, DEPREL, DEPS, MISC], e.g.
	    [11, Prügel, Prügel, NN, NN, _, _, _, _, 1.000000].
	    Only ``id`` and ``position`` are converted to ``int``; every other
	    column is stored as the string found in the line.
	    """

	    # Number of leading columns that are consumed from each token line.
	    _NUM_COLUMNS = 10

	    def __init__(self, raw_line, word_ix):
	        """Parse one token line.

	        Args:
	            raw_line: The token line, either tab-delimited or separated by
	                arbitrary whitespace. Surrounding whitespace is ignored.
	            word_ix: 0-based position of the token inside its sentence.

	        Raises:
	            IndexError: If the line holds fewer than ten columns.
	            ValueError: If the ID column (or ``word_ix``) cannot be
	                converted with ``int()``.
	        """
	        # CoNLL-U separates columns with tabs and allows spaces inside FORM
	        # and LEMMA, so a blind whitespace split would shift the columns of
	        # tokens such as "New York". Prefer the tab split and only fall back
	        # to whitespace splitting when the line is not tab-delimited.
	        info = [field.strip() for field in raw_line.strip().split("\t")]
	        if len(info) < self._NUM_COLUMNS:
	            info = raw_line.split()
	        if len(info) < self._NUM_COLUMNS:
	            # Same exception type that plain list indexing would raise, but
	            # with a message that identifies the offending line.
	            raise IndexError(
	                "expected at least {} columns, got {}: {!r}".format(
	                    self._NUM_COLUMNS, len(info), raw_line))

	        self.info = info
	        self.id = int(info[0])  # 1-based ID as in the CoNLL file
	        self.position = int(word_ix)  # 0-based position in sentence
	        self.word = info[1]  # FORM
	        self.lemma = info[2]  # LEMMA
	        self.pos_universal = info[3]  # UPOS
	        self.pos_tag = info[4]  # XPOS
	        self.detail_tag = info[5]  # FEATS
	        self.head = info[6]  # HEAD (kept as a string)
	        self.dep_tag = info[7]  # DEPREL
	        self.blank = info[8]  # DEPS column
	        self.auto_score = info[9]  # MISC column (a score in the example above)

	    def get_info(self):
	        """Return the ten column values as a list of strings, in file order."""
	        return [
	            str(self.id),
	            self.word,
	            self.lemma,
	            self.pos_universal,
	            self.pos_tag,
	            self.detail_tag,
	            str(self.head),
	            self.dep_tag,
	            self.blank,
	            self.auto_score,
	        ]

	    def get_conllU_line(self, separator="\t"):
	        """Return the token as one line, columns joined by ``separator``."""
	        return separator.join(self.get_info())


	################################# GETTING SENTENCE ANNOTATIONS ####################################
	class AnnotatedSentence:
	    """Holds the token objects and metadata lines of one sentence."""

	    def __init__(self):
	        # Both lists start out empty.
	        self.metadata = []
	        self.tokens = []

	    def get_words(self):
	        """Return the ``word`` attribute of every token, in order."""
	        return [tok.word for tok in self.tokens]

	    def get_sentence(self):
	        """Return the token words joined by single spaces."""
	        return " ".join(tok.word for tok in self.tokens)

	    def get_pos_tags(self, universal=False):
	        """Return one POS tag per token.

	        When ``universal`` is truthy the ``pos_universal`` attribute of each
	        token is read, otherwise its ``pos_tag`` attribute.
	        """
	        if universal:
	            return [tok.pos_universal for tok in self.tokens]
	        return [tok.pos_tag for tok in self.tokens]


	def get_annotation(raw_lines, raw_meta, token_class=CoNLLUP_Token):
	    """Build an ``AnnotatedSentence`` from token lines and metadata lines.

	    Args:
	        raw_lines: Iterable of token lines; each one is handed to
	            ``token_class`` together with its 0-based position.
	        raw_meta: Iterable of metadata strings; newline characters are
	            stripped from both ends of each entry.
	        token_class: Callable invoked as ``token_class(line, position)``.

	    Returns:
	        The populated ``AnnotatedSentence``.
	    """
	    ann = AnnotatedSentence()
	    ann.metadata = [m.strip("\n") for m in raw_meta]
	    # The enumerate index is the token position; a falsy result from
	    # ``token_class`` is skipped but still consumes its position.
	    for position, line in enumerate(raw_lines):
	        tok = token_class(line, position)
	        if tok:
	            ann.tokens.append(tok)
	    return ann