Blame - my_utils/conll_to_tok.py - KorAP/sota-pos-lemmatizers

blob: 9d7754313dfa2dc7d6968bdfef9349e2ef88c51e [file] [log] [blame]

daza	ff42f63	2020-10-08 14:46:32 +0200	[diff] [blame^]	1	import argparse
				2	from lib.CoNLL_Annotation import read_conll_generator, CoNLL09_Token
				3
				4	# TODO: Parallelize this for HUGE Files: All sentences can be processed independently
				5
				6	if __name__ == "__main__":
				7	"""
				8	EXAMPLE:
				9	For TreeTagger:
				10	python my_utils/conll_to_tok.py -s /vol/netapp/daza/datasets/TIGER_conll/tiger_release_aug07.corrected.16012013.conll09 -ss "</S>"
				11
				12	For RNNTagger
				13	python my_utils/conll_to_tok.py -s /vol/netapp/daza/datasets/TIGER_conll/tiger_release_aug07.corrected.16012013.conll09
				14	"""
				15
				16	parser = argparse.ArgumentParser()
				17	parser.add_argument("-s", "--src_file", help="CoNLLU File to Convert into the .tok input for RNNTagger/TreeTagger", required=True)
				18	parser.add_argument("-ss", "--sent_sep", help="Special separator to distinguish sentence boundaries", default="")
				19	args = parser.parse_args()
				20
				21	output_file = open(f"{args.src_file}.tok","w")
				22
				23	for conll_obj in read_conll_generator(args.src_file, token_class=CoNLL09_Token):
				24	for tok in conll_obj.tokens:
				25	output_file.write(tok.word+"\n")
				26	output_file.write(args.sent_sep+"\n")