import argparse
from lib.CoNLL_Annotation import read_conll_generator, get_token_type

# TODO: Parallelize this for HUGE Files: All sentences can be processed independently
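# A possible shape for that parallelization (a sketch under assumptions, not wired
# into the script below): each sentence is rendered independently in a
# multiprocessing.Pool while the parent process streams the chunks to disk in order.
# The helpers sentence_to_tok/convert_parallel, the n_workers/chunksize values, and
# the assumption that the token objects are picklable are illustrative only.
from functools import partial
from multiprocessing import Pool


def sentence_to_tok(conll_obj, sent_sep=""):
    # One line per token, followed by the (possibly empty) sentence separator line.
    return "\n".join(tok.word for tok in conll_obj.tokens) + "\n" + sent_sep + "\n"


def convert_parallel(src_file, output_path, token_type, comment_str="# ", sent_sep="", n_workers=4):
    # Sentences are independent, so worker processes can render them while the
    # parent writes the resulting chunks to disk in their original order.
    sentences = read_conll_generator(src_file, token_class=get_token_type(token_type), comment_str=comment_str)
    with Pool(n_workers) as pool, open(output_path, "w") as out_file:
        for chunk in pool.imap(partial(sentence_to_tok, sent_sep=sent_sep), sentences, chunksize=500):
            out_file.write(chunk)
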
if __name__ == "__main__":
    """
    EXAMPLES:

    For TreeTagger:
        python my_utils/conll_to_tok.py \
            -s /home/daza/datasets/TIGER_conll/tiger_release_aug07.corrected.16012013.conll09 \
            -ss "</S>" \
            --token_type CoNLL09_Token

    *** GERMAN UNIVERSAL DEPS TEST ***
        python my_utils/conll_to_tok.py \
            -s /home/daza/datasets/ud-treebanks-v2.2/UD_German-GSD/de_gsd-ud-test.conllu \
            -ss "</S>" \
            --token_type CoNLLUP_Token

    *** TIGER TEST NEW ORTH ***
        python my_utils/conll_to_tok.py \
            -s /home/daza/datasets/TIGER_conll/data_splits/test/Tiger.NewOrth.test.conll \
            -ss "</S>" \
            --token_type CoNLLUP_Token

    For RNNTagger:
        python my_utils/conll_to_tok.py \
            -s /home/daza/datasets/TIGER_conll/tiger_release_aug07.corrected.16012013.conll09 \
            --token_type CoNLL09_Token

        python my_utils/conll_to_tok.py \
            -s /home/daza/datasets/ud-treebanks-v2.2/UD_German-GSD/de_gsd-ud-test.conllu \
            --token_type CoNLLUP_Token
    """

    parser = argparse.ArgumentParser()
    parser.add_argument("-s", "--src_file", help="CoNLL file (CoNLL-09 or CoNLL-U) to convert into the .tok input for RNNTagger/TreeTagger", required=True)
    parser.add_argument("-o", "--output_file", help="Output path prefix for the converted corpus (defaults to the source file path)", default=None)
    parser.add_argument("-t", "--token_type", help="Token type of the INPUT file (e.g. CoNLL09_Token or CoNLLUP_Token)", required=True)
    parser.add_argument("-ss", "--sent_sep", help="Special separator written after each sentence to mark sentence boundaries (e.g. </S>)", default="")
    parser.add_argument("-c", "--com_str", help="Skip a line if it starts with this string (comment marker)", default="# ")
    args = parser.parse_args()

    # Use the source path as the output prefix unless an explicit output file was given
    output_prefix = args.output_file if args.output_file else args.src_file

    # Files written with an explicit sentence separator get a ".sep.tok" suffix
    if args.sent_sep == "":
        output_file = open(f"{output_prefix}.tok", "w")
    else:
        output_file = open(f"{output_prefix}.sep.tok", "w")

    # One token per line; each sentence is followed by the separator line
    # (an empty line when no separator was given)
    for conll_obj in read_conll_generator(args.src_file, token_class=get_token_type(args.token_type), comment_str=args.com_str):
        for tok in conll_obj.tokens:
            output_file.write(tok.word + "\n")
        output_file.write(args.sent_sep + "\n")

    output_file.close()