daza | ff42f63 | 2020-10-08 14:46:32 +0200 | [diff] [blame] | 1 | import argparse |
daza | e3bc92e | 2020-11-04 11:06:26 +0100 | [diff] [blame] | 2 | from lib.CoNLL_Annotation import read_conll_generator, get_token_type |
daza | ff42f63 | 2020-10-08 14:46:32 +0200 | [diff] [blame] | 3 | |
| 4 | # TODO: Parallelize this for HUGE Files: All sentences can be processed independently |
| 5 | |
if __name__ == "__main__":
    """
    Convert a CoNLL-style file into a one-token-per-line ``.tok`` file.

    Every token's word form is written on its own line; after each sentence
    a line holding the sentence separator (an empty line by default) is
    written. The result is stored next to the source file (or under the
    prefix given with ``-o``) with the suffix ``.tok``, or ``.sep.tok`` when
    an explicit sentence separator is supplied.

    EXAMPLE:
    For TreeTagger:
        python my_utils/conll_to_tok.py \
            -s /home/daza/datasets/TIGER_conll/tiger_release_aug07.corrected.16012013.conll09 \
            -ss "</S>" \
            --token_type CoNLL09_Token

    *** GERMAN UNIVERSAL DEPS TEST ***
        python my_utils/conll_to_tok.py \
            -s /home/daza/datasets/ud-treebanks-v2.2/UD_German-GSD/de_gsd-ud-test.conllu \
            -ss "</S>" \
            --token_type CoNLLUP_Token

    *** TIGER TEST NEW ORTH ***

        python my_utils/conll_to_tok.py \
            -s /home/daza/datasets/TIGER_conll/data_splits/test/Tiger.NewOrth.test.conll \
            -ss "</S>" \
            --token_type CoNLLUP_Token

    For RNNTagger
        python my_utils/conll_to_tok.py \
            -s /home/daza/datasets/TIGER_conll/tiger_release_aug07.corrected.16012013.conll09 \
            --token_type CoNLL09_Token

        python my_utils/conll_to_tok.py \
            -s /home/daza/datasets/ud-treebanks-v2.2/UD_German-GSD/de_gsd-ud-test.conllu \
            --token_type CoNLLUP_Token
    """

    parser = argparse.ArgumentParser()
    parser.add_argument("-s", "--src_file", help="CoNLLU File to Convert into the .tok input for RNNTagger/TreeTagger", required=True)
    parser.add_argument("-o", "--output_file", help="Output Formatted Corpus", default=None)
    parser.add_argument("-t", "--token_type", help="Type of Token of the INPUT file", required=True)
    parser.add_argument("-ss", "--sent_sep", help="Special separator to distinguish sentence boundaries", default="")
    parser.add_argument("-c", "--com_str", help="Skip line if it starts with this string (comment market)", default="# ")
    args = parser.parse_args()

    # Fall back to the source path as output prefix when -o is missing/empty.
    output_prefix = args.output_file or args.src_file

    # An explicit separator yields a differently named file so both variants
    # of the same corpus can coexist on disk.
    suffix = ".tok" if args.sent_sep == "" else ".sep.tok"
    output_path = f"{output_prefix}{suffix}"

    sentences = read_conll_generator(
        args.src_file,
        token_class=get_token_type(args.token_type),
        comment_str=args.com_str,
    )

    # The context manager guarantees the file is flushed and closed even if
    # the reader raises part-way through the corpus.
    with open(output_path, "w") as output_file:
        for conll_obj in sentences:
            for tok in conll_obj.tokens:
                output_file.write(tok.word + "\n")
            # Sentence boundary: the separator (possibly empty) on its own line.
            output_file.write(args.sent_sep + "\n")