import my_utils.file_utils as fu
from lib.CoNLL_Annotation import read_conll, CoNLLUP_Token 
from collections import Counter
from germalemma import GermaLemma

# Input corpus: TIGER training split (new orthography) as parsed by spaCy, CoNLL-U format.
SPACY_NEW = "/home/daza/datasets/TIGER_conll/Tiger.NewOrth.train.spacy_parsed.conllu"
# Text file with one integer sentence index per line (the sentences to inspect).
CASES = "/home/daza/datasets/TIGER_conll/NewOrthProblems_Indices.train.txt"

# NOTE(review): this mapping lives under /vol/netapp/... while the other paths
# use /home/... -- presumably both mounts exist on the target machine; confirm.
orth_dict = fu.file_to_dict("/vol/netapp/daza/datasets/TIGER_conll/TigerOrthMapping.train.json")
# Inverted view of orth_dict (value -> key). Presumably orth_dict maps old
# spellings to new ones, as the name `new_to_old` suggests. If several keys
# share the same value, the last one encountered wins.
new_to_old = {new_form: old_form for old_form, new_form in orth_dict.items()}


if __name__ == "__main__":
	# Collect (old_word, new_word, old_lemma, new_lemma) tuples for every token
	# in the selected sentences whose surface form has an entry in `new_to_old`,
	# then write the tuple frequencies to a TSV file.
	line_generator = fu.file_generator(SPACY_NEW)
	# read_conll returns a pair; only the first element (the sentences) is used here.
	conll_sents, _ = read_conll(line_generator, chunk_size=60000, token_class=CoNLLUP_Token, comment_str="#")

	# Sentence indices to inspect, one integer per line. A set gives O(1)
	# membership tests in the loop below; blank lines are skipped so a stray
	# empty line does not crash int(). The `with` block closes the file handle.
	with open(CASES, encoding="utf-8") as cases_file:
		special_cases = {int(line) for line in cases_file if line.strip()}
	checked_cases = []

	lemmatizer = GermaLemma()

	for ix, sent in enumerate(conll_sents):
		if ix not in special_cases:
			continue
		for tok in sent.tokens:
			old_word_change = new_to_old.get(tok.word)
			if not old_word_change:
				# Token has no (non-empty) counterpart in the inverted mapping.
				continue
			try:
				old_lemma = lemmatizer.find_lemma(old_word_change, tok.pos_tag)
			except Exception:
				# Best-effort fallback: the lemmatizer could not handle this
				# word/POS pair (GermaLemma reportedly raises ValueError for
				# unsupported POS tags -- confirm). `Exception` rather than a
				# bare `except` so Ctrl-C / SystemExit still stop the script.
				old_lemma = f"UNK_{tok.pos_tag}"
			checked_cases.append((old_word_change, tok.word, old_lemma, tok.lemma))

	print(f"Cases checked: {len(checked_cases)}")
	# List of (tuple, count) pairs ordered from most to least frequent.
	case_count = Counter(checked_cases).most_common()
	fu.counter_to_file(case_count, "/home/daza/datasets/TIGER_conll/TigerLemmas_Old_New.tsv")
	