import argparse, time, json
import my_utils.file_utils as fu
from lib.CoNLL_Annotation import get_token_type

# Maximum number of sentences requested from the reader per chunk.
CHUNK_SIZE = 60000


def anno_to_spacy_doc(anno, doc_id):
	"""Convert one sentence annotation into a JSON-serializable document dict.

	Args:
		anno: Object exposing ``tokens`` (an iterable of objects that have
			``word`` and ``pos_tag`` attributes) and ``metadata``.
			NOTE(review): ``metadata`` is stored as-is, so it must be
			JSON-serializable -- confirm against the annotation class.
		doc_id: Value stored under the document's ``"id"`` key.

	Returns:
		A ``(doc, plain_text)`` tuple. ``doc`` is the nested
		id / meta / paragraphs / sentences / tokens dict that is written to the
		JSON output; ``plain_text`` is the sentence with its tokens joined by
		single spaces (the same string stored under ``"raw"``).
	"""
	# Token ids are positions inside the sentence, starting at 0.
	token_objs = [
		{"id": ix, "orth": tok.word, "tag": tok.pos_tag}
		for ix, tok in enumerate(anno.tokens)
	]
	# Derived from token_objs so that anno.tokens is traversed only once.
	plain_text_str = " ".join(tok_obj["orth"] for tok_obj in token_objs)
	doc = {
		"id": doc_id,
		"meta": anno.metadata,
		"paragraphs": [{
			"raw": plain_text_str,
			"sentences": [{
				"tokens": token_objs
			}]
		}]
	}
	return doc, plain_text_str


def json_array_chunk(docs, is_first_chunk):
	"""Serialize ``docs`` as one fragment of a JSON array that is written piecewise.

	The first fragment opens the array with ``[``; every later fragment starts
	with the ``", "`` item separator instead. The caller appends the closing
	``]`` after the last fragment. Concatenating all fragments plus ``]`` gives
	exactly ``json.dumps(all_docs)``, so the output stays a single valid JSON
	array no matter how many chunks the input was split into.

	Args:
		docs: List of JSON-serializable documents belonging to this chunk.
		is_first_chunk: True only for the first fragment written to the file.

	Returns:
		The string fragment to append to the output file.
	"""
	prefix = "[" if is_first_chunk else ", "
	return prefix + ", ".join(json.dumps(doc) for doc in docs)


def main():
	r"""Convert a CoNLL corpus into a JSON document list and, optionally, plain text.

	The input is read in chunks of at most CHUNK_SIZE sentences. All sentences
	end up in ONE JSON array in the output file, with document ids that keep
	counting across chunks. If the input yields no sentences, the output file
	is left empty. Both output files are written as UTF-8.

	--- TIGER NEW Orthography ---
		python DeReKo/spacy_train/conll2spacy.py --corpus_name TigerNew --gld_token_type CoNLLUP_Token \
			-i /home/daza/datasets/TIGER_conll/data_splits/train/Tiger.NewOrth.train.conll \
			-o DeReKo/spacy_train/Tiger.NewOrth.train.json \
			-t DeReKo/spacy_train/Tiger.NewOrth.train.txt

		python DeReKo/spacy_train/conll2spacy.py --corpus_name TigerNew --gld_token_type CoNLLUP_Token \
			-i /home/daza/datasets/TIGER_conll/data_splits/test/Tiger.NewOrth.test.conll \
			-o DeReKo/spacy_train/Tiger.NewOrth.test.json \
			-t DeReKo/spacy_train/Tiger.NewOrth.test.txt

	--- TIGER NEW + OLD Orthography ---
		cat Tiger.OldOrth.train.conll Tiger.NewOrth.train.conll > Tiger.ALL.Orth.train.conll
		cat Tiger.OldOrth.test.conll Tiger.NewOrth.test.conll > Tiger.ALL.Orth.test.conll

		python DeReKo/spacy_train/conll2spacy.py --corpus_name TigerALL --gld_token_type CoNLLUP_Token \
			-i /home/daza/datasets/TIGER_conll/data_splits/train/Tiger.ALL.Orth.train.conll \
			-o /home/daza/datasets/TIGER_conll/data_splits/train/Tiger.ALL.Orth.train.json \
			-t /home/daza/datasets/TIGER_conll/data_splits/train/Tiger.ALL.Orth.train.txt

		python DeReKo/spacy_train/conll2spacy.py --corpus_name TigerALL --gld_token_type CoNLLUP_Token \
			-i /home/daza/datasets/TIGER_conll/data_splits/test/Tiger.ALL.Orth.test.conll \
			-o /home/daza/datasets/TIGER_conll/data_splits/test/Tiger.ALL.Orth.test.json \
			-t /home/daza/datasets/TIGER_conll/data_splits/test/Tiger.ALL.Orth.test.txt
	"""
	parser = argparse.ArgumentParser()
	parser.add_argument("-i", "--input_file", help="Input Corpus", required=True)
	parser.add_argument("-n", "--corpus_name", help="Corpus Name", default="Corpus")
	parser.add_argument("-o", "--output_file", help="File where the Predictions will be saved", required=True)
	parser.add_argument("-t", "--text_file", help="Output Plain Text File", default=None)
	parser.add_argument("-gtt", "--gld_token_type", help="CoNLL Format of the Gold Data", default="CoNLL09_Token")
	parser.add_argument("-c", "--comment_str", help="CoNLL Format of comentaries inside the file", default="#")
	args = parser.parse_args()

	# Explicit UTF-8 so the plain-text output does not depend on the locale.
	with open(args.output_file, "w", encoding="utf-8") as write_out:
		# The plain-text file is optional; None means "do not write it".
		write_plain = open(args.text_file, "w", encoding="utf-8") if args.text_file else None
		try:
			# Inputs ending in ".gz" are passed through fu.expand_file first.
			if args.input_file.endswith(".gz"):
				in_file = fu.expand_file(args.input_file)
			else:
				in_file = args.input_file

			start = time.time()
			total_processed_sents = 0
			file_has_next = True
			line_generator = fu.file_generator(in_file)
			while file_has_next:
				annos, file_has_next = fu.get_file_annos_chunk(
					line_generator,
					chunk_size=CHUNK_SIZE,
					token_class=get_token_type(args.gld_token_type),
					comment_str=args.comment_str,
				)
				if len(annos) == 0:
					break
				# Ids continue where the previous chunk stopped, so they are
				# unique within the output file.
				first_doc_id = total_processed_sents
				total_processed_sents += len(annos)
				print(f"Already processed {total_processed_sents} sentences...")
				spacy_docs = []
				for doc_id, anno in enumerate(annos, start=first_doc_id):
					sent_obj, plain_text_str = anno_to_spacy_doc(anno, doc_id)
					spacy_docs.append(sent_obj)
					if write_plain is not None:
						write_plain.write(plain_text_str + "\n")
				# Append this chunk to the single JSON array instead of dumping
				# one independent array per chunk (which is not valid JSON).
				write_out.write(json_array_chunk(spacy_docs, is_first_chunk=(first_doc_id == 0)))
			# Close the array only if it was ever opened.
			if total_processed_sents > 0:
				write_out.write("]")
			end = time.time()
			print(f"Processing {args.corpus_name} took {(end - start)} seconds!")
		finally:
			if write_plain is not None:
				write_plain.close()


if __name__ == "__main__":
	main()