blob: 2dbe2ed95e85a0b26f38b30850954b55037455a8 [file] [log] [blame]
dazaff42f632020-10-08 14:46:32 +02001import argparse
dazae3bc92e2020-11-04 11:06:26 +01002from lib.CoNLL_Annotation import read_conll_generator, get_token_type
dazaff42f632020-10-08 14:46:32 +02003
# TODO: Parallelize this for HUGE Files: All sentences can be processed independently
5
if __name__ == "__main__":
    """
    Convert a CoNLL-style file into a one-token-per-line .tok file.

    Every token's word form is written on its own line. After each sentence
    one extra line is written containing the sentence separator (an empty
    line when no separator is given).

    The output is written next to the source file:
        <src_file>.tok      when --sent_sep is empty (the default)
        <src_file>.sep.tok  when a sentence separator is given

    EXAMPLE:
    For TreeTagger:
        python my_utils/conll_to_tok.py \
        -s /home/daza/datasets/TIGER_conll/tiger_release_aug07.corrected.16012013.conll09 \
        -ss "</S>" \
        --token_type CoNLL09_Token

    *** GERMAN UNIVERSAL DEPS TEST ***

        python my_utils/conll_to_tok.py \
        -s /home/daza/datasets/ud-treebanks-v2.2/UD_German-GSD/de_gsd-ud-test.conllu \
        -ss "</S>" \
        --token_type CoNLLUP_Token

    *** TIGER TEST NEW ORTH ***

        python my_utils/conll_to_tok.py \
        -s /home/daza/datasets/TIGER_conll/data_splits/test/Tiger.NewOrth.test.conll \
        -ss "</S>" \
        --token_type CoNLLUP_Token


    For RNNTagger
        python my_utils/conll_to_tok.py \
        -s /home/daza/datasets/TIGER_conll/tiger_release_aug07.corrected.16012013.conll09 \
        --token_type CoNLL09_Token

        python my_utils/conll_to_tok.py \
        -s /home/daza/datasets/ud-treebanks-v2.2/UD_German-GSD/de_gsd-ud-test.conllu \
        --token_type CoNLLUP_Token
    """

    parser = argparse.ArgumentParser()
    parser.add_argument("-s", "--src_file", help="CoNLLU File to Convert into the .tok input for RNNTagger/TreeTagger", required=True)
    parser.add_argument("-t", "--token_type", help="Type of Token of the INPUT file", required=True)
    parser.add_argument("-ss", "--sent_sep", help="Special separator to distinguish sentence boundaries", default="")
    parser.add_argument("-c", "--com_str", help="Skip line if it starts with this string (comment market)", default="# ")
    args = parser.parse_args()

    # The output name records whether explicit sentence separators were
    # requested, so both variants can coexist next to the same source file.
    suffix = ".tok" if args.sent_sep == "" else ".sep.tok"
    output_path = f"{args.src_file}{suffix}"

    sentences = read_conll_generator(
        args.src_file,
        token_class=get_token_type(args.token_type),
        comment_str=args.com_str,
    )

    # The context manager guarantees the file is flushed and closed even if
    # reading the source or writing a token raises part-way through.
    with open(output_path, "w") as output_file:
        for conll_obj in sentences:
            for tok in conll_obj.tokens:
                output_file.write(tok.word + "\n")
            # One separator line per sentence (an empty line by default).
            output_file.write(args.sent_sep + "\n")