daza | ff42f63 | 2020-10-08 14:46:32 +0200 | [diff] [blame^] | 1 | import argparse |
| 2 | from lib.CoNLL_Annotation import read_conll_generator, CoNLL09_Token |
| 3 | |
| 4 | # TODO: Parallelize this for HUGE Files: All sentences can be processed independently |
| 5 | |
def write_tokens(sentences, output_file, sent_sep=""):
    """Write one token per line, closing every sentence with a separator line.

    Args:
        sentences: Iterable of sentence objects exposing a ``tokens``
            iterable whose items have a ``word`` string attribute.
        output_file: Writable text stream (anything with ``write``).
        sent_sep: Text written on its own line after each sentence. The
            default empty string yields a blank line between sentences.
    """
    for sentence in sentences:
        for tok in sentence.tokens:
            output_file.write(tok.word + "\n")
        # The separator line is emitted even when the sentence has no tokens.
        output_file.write(sent_sep + "\n")


def main():
    """Convert a CoNLL file into a one-token-per-line ``.tok`` file.

    The result is written next to the input as ``<src_file>.tok``.

    EXAMPLE:
        For TreeTagger:
            python my_utils/conll_to_tok.py -s /vol/netapp/daza/datasets/TIGER_conll/tiger_release_aug07.corrected.16012013.conll09 -ss "</S>"

        For RNNTagger
            python my_utils/conll_to_tok.py -s /vol/netapp/daza/datasets/TIGER_conll/tiger_release_aug07.corrected.16012013.conll09
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-s", "--src_file", help="CoNLLU File to Convert into the .tok input for RNNTagger/TreeTagger", required=True)
    parser.add_argument("-ss", "--sent_sep", help="Special separator to distinguish sentence boundaries", default="")
    args = parser.parse_args()

    # The context manager guarantees the output is flushed and closed, even
    # if reading or parsing the source file fails part-way through.
    # NOTE(review): the platform default encoding is kept on purpose, since
    # the encoding used by read_conll_generator is not visible here — confirm
    # before pinning one.
    with open(f"{args.src_file}.tok", "w") as output_file:
        sentences = read_conll_generator(args.src_file, token_class=CoNLL09_Token)
        write_tokens(sentences, output_file, args.sent_sep)


if __name__ == "__main__":
    main()