Blame - DeReKo/spacy_train/conll2spacy.py - KorAP/sota-pos-lemmatizers

blob: f0d0d4cff067ef5d743700bd34cd00e9f1e0b3f8 [file] [log] [blame]

r"""Convert a CoNLL-annotated corpus into a spaCy-style JSON training file.

The input corpus is read in chunks of ``CHUNK_SIZE`` sentences. Every sentence
becomes one JSON object holding a single paragraph with a single sentence, and
all objects are written as ONE JSON array to ``--output_file``. When
``--text_file`` is given, the space-joined tokens of every sentence are also
written to it, one sentence per line.

Usage examples:

--- TIGER New Orthography ---
python DeReKo/spacy_train/conll2spacy.py --corpus_name TigerNew --gld_token_type CoNLLUP_Token \
    -i /home/daza/datasets/TIGER_conll/data_splits/train/Tiger.NewOrth.train.conll \
    -o DeReKo/spacy_train/Tiger.NewOrth.train.json \
    -t DeReKo/spacy_train/Tiger.NewOrth.train.txt

python DeReKo/spacy_train/conll2spacy.py --corpus_name TigerNew --gld_token_type CoNLLUP_Token \
    -i /home/daza/datasets/TIGER_conll/data_splits/test/Tiger.NewOrth.test.conll \
    -o DeReKo/spacy_train/Tiger.NewOrth.test.json \
    -t DeReKo/spacy_train/Tiger.NewOrth.test.txt
"""
import argparse
import json
import time
from contextlib import ExitStack

import my_utils.file_utils as fu
from lib.CoNLL_Annotation import get_token_type

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--input_file", help="Input Corpus", required=True)
    parser.add_argument("-n", "--corpus_name", help="Corpus Name", default="Corpus")
    parser.add_argument("-o", "--output_file", help="File where the Predictions will be saved", required=True)
    parser.add_argument("-t", "--text_file", help="Output Plain Text File", default=None)
    parser.add_argument("-gtt", "--gld_token_type", help="CoNLL Format of the Gold Data", default="CoNLL09_Token")
    parser.add_argument("-c", "--comment_str", help="CoNLL Format of comentaries inside the file", default="#")
    args = parser.parse_args()

    # Number of sentences requested from the reader per chunk.
    CHUNK_SIZE = 60000

    start = time.time()
    total_processed_sents = 0

    # ExitStack closes both output files (the text file is optional) even if
    # reading or serialising fails half-way through.
    # NOTE(review): files are opened with the platform default encoding, as
    # before — confirm whether an explicit encoding="utf-8" is wanted.
    with ExitStack() as stack:
        write_out = stack.enter_context(open(args.output_file, "w"))
        write_plain = stack.enter_context(open(args.text_file, "w")) if args.text_file else None

        # Inputs whose name ends in ".gz" go through the project's expand helper first.
        if args.input_file.endswith(".gz"):
            in_file = fu.expand_file(args.input_file)
        else:
            in_file = args.input_file

        # The token class does not change between chunks, so resolve it once.
        token_class = get_token_type(args.gld_token_type)
        line_generator = fu.file_generator(in_file)

        # The JSON array is emitted incrementally: "[" before the first
        # document, ", " between documents and "]" at the very end. For a
        # single chunk this is byte-identical to json.dumps(list_of_docs);
        # for several chunks it yields one valid array instead of several
        # concatenated arrays (which is not valid JSON).
        wrote_any_doc = False
        file_has_next = True
        while file_has_next:
            annos, file_has_next = fu.get_file_annos_chunk(
                line_generator,
                chunk_size=CHUNK_SIZE,
                token_class=token_class,
                comment_str=args.comment_str,
            )
            if len(annos) == 0:
                break

            # Ids keep counting across chunks so they stay unique in the file.
            first_doc_id = total_processed_sents
            total_processed_sents += len(annos)
            print(f"Already processed {total_processed_sents} sentences...")

            for anno_id, anno in enumerate(annos, start=first_doc_id):
                token_objs = [
                    {"id": ix, "orth": tok.word, "tag": tok.pos_tag}
                    for ix, tok in enumerate(anno.tokens)
                ]
                plain_text_str = " ".join(tok_obj["orth"] for tok_obj in token_objs)
                sent_obj = {
                    "id": anno_id,
                    "meta": anno.metadata,
                    "paragraphs": [{
                        "raw": plain_text_str,
                        "sentences": [{
                            "tokens": token_objs
                        }]
                    }]
                }
                write_out.write(", " if wrote_any_doc else "[")
                write_out.write(json.dumps(sent_obj))
                wrote_any_doc = True
                if write_plain is not None:
                    write_plain.write(plain_text_str + "\n")

        # An input without any sentence leaves the output file empty, as before.
        if wrote_any_doc:
            write_out.write("]")

    end = time.time()
    print(f"Processing {args.corpus_name} took {(end - start)} seconds!")