import argparse
from lib.CoNLL_Annotation import read_conll_generator, get_token_type

# TODO: Parallelize this for HUGE Files: All sentences can be processed independently
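# A possible shape for that parallelization (a sketch under assumptions, not wired
# into the script below): each sentence is rendered independently in a
# multiprocessing.Pool while the parent process streams the chunks to disk in order.
# The helpers sentence_to_tok/convert_parallel, the n_workers/chunksize values, and
# the assumption that the token objects are picklable are illustrative only.
from functools import partial
from multiprocessing import Pool


def sentence_to_tok(conll_obj, sent_sep=""):
    # One line per token, followed by the (possibly empty) sentence separator line.
    return "\n".join(tok.word for tok in conll_obj.tokens) + "\n" + sent_sep + "\n"


def convert_parallel(src_file, output_path, token_type, comment_str="# ", sent_sep="", n_workers=4):
    # Sentences are independent, so worker processes can render them while the
    # parent writes the resulting chunks to disk in their original order.
    sentences = read_conll_generator(src_file, token_class=get_token_type(token_type), comment_str=comment_str)
    with Pool(n_workers) as pool, open(output_path, "w") as out_file:
        for chunk in pool.imap(partial(sentence_to_tok, sent_sep=sent_sep), sentences, chunksize=500):
            out_file.write(chunk)
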
if __name__ == "__main__":
    """
    EXAMPLES:

    For TreeTagger:
        python my_utils/conll_to_tok.py \
            -s /home/daza/datasets/TIGER_conll/tiger_release_aug07.corrected.16012013.conll09 \
            -ss "</S>" \
            --token_type CoNLL09_Token

    *** GERMAN UNIVERSAL DEPS TEST ***
        python my_utils/conll_to_tok.py \
            -s /home/daza/datasets/ud-treebanks-v2.2/UD_German-GSD/de_gsd-ud-test.conllu \
            -ss "</S>" \
            --token_type CoNLLUP_Token

    *** TIGER TEST NEW ORTH ***
        python my_utils/conll_to_tok.py \
            -s /home/daza/datasets/TIGER_conll/data_splits/test/Tiger.NewOrth.test.conll \
            -ss "</S>" \
            --token_type CoNLLUP_Token

    For RNNTagger:
        python my_utils/conll_to_tok.py \
            -s /home/daza/datasets/TIGER_conll/tiger_release_aug07.corrected.16012013.conll09 \
            --token_type CoNLL09_Token

        python my_utils/conll_to_tok.py \
            -s /home/daza/datasets/ud-treebanks-v2.2/UD_German-GSD/de_gsd-ud-test.conllu \
            --token_type CoNLLUP_Token
    """

    parser = argparse.ArgumentParser()
    parser.add_argument("-s", "--src_file", help="CoNLL file (CoNLL-09 or CoNLL-U) to convert into the .tok input for RNNTagger/TreeTagger", required=True)
    parser.add_argument("-o", "--output_file", help="Output path prefix for the converted corpus (defaults to the source file path)", default=None)
    parser.add_argument("-t", "--token_type", help="Token type of the INPUT file (e.g. CoNLL09_Token or CoNLLUP_Token)", required=True)
    parser.add_argument("-ss", "--sent_sep", help="Special separator written after each sentence to mark sentence boundaries (e.g. </S>)", default="")
    parser.add_argument("-c", "--com_str", help="Skip a line if it starts with this string (comment marker)", default="# ")
    args = parser.parse_args()

    # Use the source path as the output prefix unless an explicit output file was given
    output_prefix = args.output_file if args.output_file else args.src_file

    # Files written with an explicit sentence separator get a ".sep.tok" suffix
    if args.sent_sep == "":
        output_file = open(f"{output_prefix}.tok", "w")
    else:
        output_file = open(f"{output_prefix}.sep.tok", "w")

    # One token per line; each sentence is followed by the separator line
    # (an empty line when no separator was given)
    for conll_obj in read_conll_generator(args.src_file, token_class=get_token_type(args.token_type), comment_str=args.com_str):
        for tok in conll_obj.tokens:
            output_file.write(tok.word + "\n")
        output_file.write(args.sent_sep + "\n")

    output_file.close()