Blame - my_utils/conll_to_tok.py - KorAP/sota-pos-lemmatizers

blob: 8402e57bcf784027e3931c750a790aa186bd22ae [file] [log] [blame]

daza	ff42f63	2020-10-08 14:46:32 +0200	[diff] [blame]	1	import argparse
daza	e3bc92e	2020-11-04 11:06:26 +0100	[diff] [blame]	2	from lib.CoNLL_Annotation import read_conll_generator, get_token_type
daza	ff42f63	2020-10-08 14:46:32 +0200	[diff] [blame]	3
				4	# TODO: Parallelize this for HUGE Files: All sentences can be processed independently
				5
				6	if __name__ == "__main__":
				7	"""
				8	EXAMPLE:
				9	For TreeTagger:
daza	e3bc92e	2020-11-04 11:06:26 +0100	[diff] [blame]	10	python my_utils/conll_to_tok.py \
				11	-s /home/daza/datasets/TIGER_conll/tiger_release_aug07.corrected.16012013.conll09 \
				12	-ss "</S>" \
				13	--token_type CoNLL09_Token
daza	fb308a2	2021-01-27 16:20:08 +0100	[diff] [blame]	14
daza	d7d7075	2021-01-12 18:17:49 +0100	[diff] [blame]	15	* GERMAN UNIVERSAL DEPS TEST *
daza	e3bc92e	2020-11-04 11:06:26 +0100	[diff] [blame]	16	python my_utils/conll_to_tok.py \
				17	-s /home/daza/datasets/ud-treebanks-v2.2/UD_German-GSD/de_gsd-ud-test.conllu \
				18	-ss "</S>" \
				19	--token_type CoNLLUP_Token
daza	fb308a2	2021-01-27 16:20:08 +0100	[diff] [blame]	20
daza	d7d7075	2021-01-12 18:17:49 +0100	[diff] [blame]	21	* TIGER TEST NEW ORTH *
				22
				23	python my_utils/conll_to_tok.py \
				24	-s /home/daza/datasets/TIGER_conll/data_splits/test/Tiger.NewOrth.test.conll \
				25	-ss "</S>" \
				26	--token_type CoNLLUP_Token
daza	d7d7075	2021-01-12 18:17:49 +0100	[diff] [blame]	27
daza	ff42f63	2020-10-08 14:46:32 +0200	[diff] [blame]	28	For RNNTagger
daza	e3bc92e	2020-11-04 11:06:26 +0100	[diff] [blame]	29	python my_utils/conll_to_tok.py \
				30	-s /home/daza/datasets/TIGER_conll/tiger_release_aug07.corrected.16012013.conll09 \
				31	--token_type CoNLL09_Token
				32
				33	python my_utils/conll_to_tok.py \
				34	-s /home/daza/datasets/ud-treebanks-v2.2/UD_German-GSD/de_gsd-ud-test.conllu \
				35	--token_type CoNLLUP_Token
daza	ff42f63	2020-10-08 14:46:32 +0200	[diff] [blame]	36	"""
				37
				38	parser = argparse.ArgumentParser()
				39	parser.add_argument("-s", "--src_file", help="CoNLLU File to Convert into the .tok input for RNNTagger/TreeTagger", required=True)
daza	fb308a2	2021-01-27 16:20:08 +0100	[diff] [blame]	40	parser.add_argument("-o", "--output_file", help="Output Formatted Corpus", default=None)
daza	e3bc92e	2020-11-04 11:06:26 +0100	[diff] [blame]	41	parser.add_argument("-t", "--token_type", help="Type of Token of the INPUT file", required=True)
daza	ff42f63	2020-10-08 14:46:32 +0200	[diff] [blame]	42	parser.add_argument("-ss", "--sent_sep", help="Special separator to distinguish sentence boundaries", default="")
daza	e3bc92e	2020-11-04 11:06:26 +0100	[diff] [blame]	43	parser.add_argument("-c", "--com_str", help="Skip line if it starts with this string (comment market)", default="# ")
daza	ff42f63	2020-10-08 14:46:32 +0200	[diff] [blame]	44	args = parser.parse_args()
				45
daza	fb308a2	2021-01-27 16:20:08 +0100	[diff] [blame]	46	output_prefix = args.src_file if not args.output_file else args.output_file
				47
daza	d7d7075	2021-01-12 18:17:49 +0100	[diff] [blame]	48	if args.sent_sep == "":
daza	fb308a2	2021-01-27 16:20:08 +0100	[diff] [blame]	49	output_file = open(f"{output_prefix}.tok","w")
daza	d7d7075	2021-01-12 18:17:49 +0100	[diff] [blame]	50	else:
daza	fb308a2	2021-01-27 16:20:08 +0100	[diff] [blame]	51	output_file = open(f"{output_prefix}.sep.tok","w")
daza	ff42f63	2020-10-08 14:46:32 +0200	[diff] [blame]	52
daza	e3bc92e	2020-11-04 11:06:26 +0100	[diff] [blame]	53	for conll_obj in read_conll_generator(args.src_file, token_class=get_token_type(args.token_type), comment_str=args.com_str):
daza	ff42f63	2020-10-08 14:46:32 +0200	[diff] [blame]	54	for tok in conll_obj.tokens:
				55	output_file.write(tok.word+"\n")
				56	output_file.write(args.sent_sep+"\n")