DeReKo/spacy_train/conll2spacy.py - KorAP/sota-pos-lemmatizers - Gitiles

 import argparse, time, json
 import my_utils.file_utils as fu
 from lib.CoNLL_Annotation import get_token_type

 if __name__ == "__main__":
 	# Command-line converter: reads a CoNLL-annotated corpus chunk by chunk and
 	# writes a single JSON array with one entry per sentence
 	# (id / meta / paragraphs -> raw + sentences -> tokens). When --text_file is
 	# given, each sentence is also written there as one space-joined line.
 	#
 	# --- TIGER New Orthography ---
 	#   python DeReKo/spacy_train/conll2spacy.py --corpus_name TigerNew --gld_token_type CoNLLUP_Token \
 	#       -i /home/daza/datasets/TIGER_conll/data_splits/train/Tiger.NewOrth.train.conll \
 	#       -o DeReKo/spacy_train/Tiger.NewOrth.train.json \
 	#       -t DeReKo/spacy_train/Tiger.NewOrth.train.txt
 	#
 	#   python DeReKo/spacy_train/conll2spacy.py --corpus_name TigerNew --gld_token_type CoNLLUP_Token \
 	#       -i /home/daza/datasets/TIGER_conll/data_splits/test/Tiger.NewOrth.test.conll \
 	#       -o DeReKo/spacy_train/Tiger.NewOrth.test.json \
 	#       -t DeReKo/spacy_train/Tiger.NewOrth.test.txt

 	# Local import: only this entry point needs it.
 	from contextlib import ExitStack

 	parser = argparse.ArgumentParser()
 	parser.add_argument("-i", "--input_file", help="Input Corpus", required=True)
 	parser.add_argument("-n", "--corpus_name", help="Corpus Name", default="Corpus")
 	parser.add_argument("-o", "--output_file", help="File where the Predictions will be saved", required=True)
 	parser.add_argument("-t", "--text_file", help="Output Plain Text File", default=None)
 	parser.add_argument("-gtt", "--gld_token_type", help="CoNLL Format of the Gold Data", default="CoNLL09_Token")
 	parser.add_argument("-c", "--comment_str", help="CoNLL Format of comentaries inside the file", default="#")
 	args = parser.parse_args()

 	# Maximum number of sentences requested from the reader per iteration.
 	CHUNK_SIZE = 60000

 	# ExitStack closes (and therefore flushes) every opened output file, even
 	# when the conversion aborts with an exception.
 	with ExitStack() as stack:
 		write_out = stack.enter_context(open(args.output_file, "w"))
 		# The plain-text dump is optional; None means "do not write it".
 		write_plain = stack.enter_context(open(args.text_file, "w")) if args.text_file else None

 		# Inputs ending in ".gz" are first handed to fu.expand_file.
 		if args.input_file.endswith(".gz"):
 			in_file = fu.expand_file(args.input_file)
 		else:
 			in_file = args.input_file

 		start = time.time()
 		total_processed_sents = 0
 		file_has_next = True
 		# The token class does not change between chunks, so resolve it once.
 		token_class = get_token_type(args.gld_token_type)
 		line_generator = fu.file_generator(in_file)
 		while file_has_next:
 			annos, file_has_next = fu.get_file_annos_chunk(line_generator, chunk_size=CHUNK_SIZE, token_class=token_class, comment_str=args.comment_str)
 			if len(annos) == 0: break
 			# Ids continue across chunks so they stay unique inside the single
 			# output array (identical to the old numbering for one chunk).
 			first_id = total_processed_sents
 			total_processed_sents += len(annos)
 			print(f"Already processed {total_processed_sents} sentences...")
 			spacy_docs = []
 			for anno_id, anno in enumerate(annos, start=first_id):
 				token_objs = [{"id": ix, "orth": tok.word, "tag": tok.pos_tag} for ix, tok in enumerate(anno.tokens)]
 				# Reuse the collected word forms so anno.tokens is traversed only once.
 				plain_text_str = " ".join(entry["orth"] for entry in token_objs)
 				spacy_docs.append({
 					"id": anno_id,
 					"meta": anno.metadata,
 					"paragraphs": [{
 						"raw": plain_text_str,
 						"sentences": [{
 							"tokens": token_objs
 						}]
 					}]
 				})
 				if write_plain is not None:
 					write_plain.write(plain_text_str + "\n")
 			# json.dumps renders a list as "[a, b]". Dropping the outer brackets
 			# lets every chunk be spliced into ONE array; dumping each chunk as
 			# its own array would yield "[...][...]", which is not valid JSON.
 			chunk_json = json.dumps(spacy_docs)[1:-1]
 			write_out.write(("[" if first_id == 0 else ", ") + chunk_json)
 		# Close the array only if it was opened; an empty input still yields an
 		# empty output file.
 		if total_processed_sents > 0:
 			write_out.write("]")
 	end = time.time()
 	print(f"Processing {args.corpus_name} took {(end - start)} seconds!")
	import argparse, time, json
	import my_utils.file_utils as fu
	from lib.CoNLL_Annotation import get_token_type

	if __name__ == "__main__":
		# Command-line converter: reads a CoNLL-annotated corpus chunk by chunk and
		# writes a single JSON array with one entry per sentence
		# (id / meta / paragraphs -> raw + sentences -> tokens). When --text_file is
		# given, each sentence is also written there as one space-joined line.
		#
		# --- TIGER New Orthography ---
		#   python DeReKo/spacy_train/conll2spacy.py --corpus_name TigerNew --gld_token_type CoNLLUP_Token \
		#       -i /home/daza/datasets/TIGER_conll/data_splits/train/Tiger.NewOrth.train.conll \
		#       -o DeReKo/spacy_train/Tiger.NewOrth.train.json \
		#       -t DeReKo/spacy_train/Tiger.NewOrth.train.txt
		#
		#   python DeReKo/spacy_train/conll2spacy.py --corpus_name TigerNew --gld_token_type CoNLLUP_Token \
		#       -i /home/daza/datasets/TIGER_conll/data_splits/test/Tiger.NewOrth.test.conll \
		#       -o DeReKo/spacy_train/Tiger.NewOrth.test.json \
		#       -t DeReKo/spacy_train/Tiger.NewOrth.test.txt

		# Local import: only this entry point needs it.
		from contextlib import ExitStack

		parser = argparse.ArgumentParser()
		parser.add_argument("-i", "--input_file", help="Input Corpus", required=True)
		parser.add_argument("-n", "--corpus_name", help="Corpus Name", default="Corpus")
		parser.add_argument("-o", "--output_file", help="File where the Predictions will be saved", required=True)
		parser.add_argument("-t", "--text_file", help="Output Plain Text File", default=None)
		parser.add_argument("-gtt", "--gld_token_type", help="CoNLL Format of the Gold Data", default="CoNLL09_Token")
		parser.add_argument("-c", "--comment_str", help="CoNLL Format of comentaries inside the file", default="#")
		args = parser.parse_args()

		# Maximum number of sentences requested from the reader per iteration.
		CHUNK_SIZE = 60000

		# ExitStack closes (and therefore flushes) every opened output file, even
		# when the conversion aborts with an exception.
		with ExitStack() as stack:
			write_out = stack.enter_context(open(args.output_file, "w"))
			# The plain-text dump is optional; None means "do not write it".
			write_plain = stack.enter_context(open(args.text_file, "w")) if args.text_file else None

			# Inputs ending in ".gz" are first handed to fu.expand_file.
			if args.input_file.endswith(".gz"):
				in_file = fu.expand_file(args.input_file)
			else:
				in_file = args.input_file

			start = time.time()
			total_processed_sents = 0
			file_has_next = True
			# The token class does not change between chunks, so resolve it once.
			token_class = get_token_type(args.gld_token_type)
			line_generator = fu.file_generator(in_file)
			while file_has_next:
				annos, file_has_next = fu.get_file_annos_chunk(line_generator, chunk_size=CHUNK_SIZE, token_class=token_class, comment_str=args.comment_str)
				if len(annos) == 0: break
				# Ids continue across chunks so they stay unique inside the single
				# output array (identical to the old numbering for one chunk).
				first_id = total_processed_sents
				total_processed_sents += len(annos)
				print(f"Already processed {total_processed_sents} sentences...")
				spacy_docs = []
				for anno_id, anno in enumerate(annos, start=first_id):
					token_objs = [{"id": ix, "orth": tok.word, "tag": tok.pos_tag} for ix, tok in enumerate(anno.tokens)]
					# Reuse the collected word forms so anno.tokens is traversed only once.
					plain_text_str = " ".join(entry["orth"] for entry in token_objs)
					spacy_docs.append({
						"id": anno_id,
						"meta": anno.metadata,
						"paragraphs": [{
							"raw": plain_text_str,
							"sentences": [{
								"tokens": token_objs
							}]
						}]
					})
					if write_plain is not None:
						write_plain.write(plain_text_str + "\n")
				# json.dumps renders a list as "[a, b]". Dropping the outer brackets
				# lets every chunk be spliced into ONE array; dumping each chunk as
				# its own array would yield "[...][...]", which is not valid JSON.
				chunk_json = json.dumps(spacy_docs)[1:-1]
				write_out.write(("[" if first_id == 0 else ", ") + chunk_json)
			# Close the array only if it was opened; an empty input still yields an
			# empty output file.
			if total_processed_sents > 0:
				write_out.write("]")
		end = time.time()
		print(f"Processing {args.corpus_name} took {(end - start)} seconds!")