Stable testing across datasets

Generalize my_utils/conll_to_tok.py so the same script converts both CoNLL-09 (TIGER) and CoNLL-U (UD) files into the .tok input for TreeTagger/RNNTagger: the token class is now selected with a --token_type flag, and comment lines can be skipped via --com_str.
diff --git a/my_utils/conll_to_tok.py b/my_utils/conll_to_tok.py
index 9d77543..d5656e8 100644
--- a/my_utils/conll_to_tok.py
+++ b/my_utils/conll_to_tok.py
@@ -1,5 +1,5 @@
import argparse
-from lib.CoNLL_Annotation import read_conll_generator, CoNLL09_Token
+from lib.CoNLL_Annotation import read_conll_generator, get_token_type
# TODO: Parallelize this for HUGE Files: All sentences can be processed independently
@@ -7,20 +7,36 @@
"""
EXAMPLE:
For TreeTagger:
- python my_utils/conll_to_tok.py -s /vol/netapp/daza/datasets/TIGER_conll/tiger_release_aug07.corrected.16012013.conll09 -ss "</S>"
+ python my_utils/conll_to_tok.py \
+ -s /home/daza/datasets/TIGER_conll/tiger_release_aug07.corrected.16012013.conll09 \
+ -ss "</S>" \
+ --token_type CoNLL09_Token
+
+ python my_utils/conll_to_tok.py \
+ -s /home/daza/datasets/ud-treebanks-v2.2/UD_German-GSD/de_gsd-ud-test.conllu \
+ -ss "</S>" \
+ --token_type CoNLLUP_Token
For RNNTagger
- python my_utils/conll_to_tok.py -s /vol/netapp/daza/datasets/TIGER_conll/tiger_release_aug07.corrected.16012013.conll09
+ python my_utils/conll_to_tok.py \
+ -s /home/daza/datasets/TIGER_conll/tiger_release_aug07.corrected.16012013.conll09 \
+ --token_type CoNLL09_Token
+
+ python my_utils/conll_to_tok.py \
+ -s /home/daza/datasets/ud-treebanks-v2.2/UD_German-GSD/de_gsd-ud-test.conllu \
+ --token_type CoNLLUP_Token
"""
parser = argparse.ArgumentParser()
parser.add_argument("-s", "--src_file", help="CoNLLU File to Convert into the .tok input for RNNTagger/TreeTagger", required=True)
+	parser.add_argument("-t", "--token_type", help="Token type of the INPUT file (CoNLL09_Token or CoNLLUP_Token)", required=True)
parser.add_argument("-ss", "--sent_sep", help="Special separator to distinguish sentence boundaries", default="")
+	parser.add_argument("-c", "--com_str", help="Skip line if it starts with this string (comment marker)", default="# ")
args = parser.parse_args()
output_file = open(f"{args.src_file}.tok","w")
- for conll_obj in read_conll_generator(args.src_file, token_class=CoNLL09_Token):
+ for conll_obj in read_conll_generator(args.src_file, token_class=get_token_type(args.token_type), comment_str=args.com_str):
for tok in conll_obj.tokens:
output_file.write(tok.word+"\n")
output_file.write(args.sent_sep+"\n")
\ No newline at end of file
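
The patch assumes that lib/CoNLL_Annotation.py provides get_token_type and that read_conll_generator accepts a comment_str argument; neither is shown in this diff. Below is a minimal, hypothetical sketch of how those helpers could look, assuming an AnnotatedSentence container and token classes reduced to the single .word attribute this script needs (names, signatures and column handling are illustrative assumptions, not the repository's actual implementation):

# Hypothetical sketch of the helpers used above (the real code lives in lib/CoNLL_Annotation.py).

class CoNLL09_Token:
    """Assumed minimal token class: column 1 of a CoNLL-09 line holds the word form."""
    def __init__(self, raw_line, position):
        self.position = position
        self.word = raw_line.split("\t")[1]


class CoNLLUP_Token:
    """Assumed minimal token class: column 1 of a CoNLL-U line also holds the word form."""
    def __init__(self, raw_line, position):
        self.position = position
        self.word = raw_line.split("\t")[1]


class AnnotatedSentence:
    """Assumed lightweight container exposing the .tokens attribute used by conll_to_tok.py."""
    def __init__(self, tokens):
        self.tokens = tokens


def get_token_type(type_name):
    """Map the --token_type string to the matching token class."""
    token_types = {"CoNLL09_Token": CoNLL09_Token, "CoNLLUP_Token": CoNLLUP_Token}
    if type_name not in token_types:
        raise ValueError(f"Unknown token type: {type_name!r}. Expected one of {list(token_types)}")
    return token_types[type_name]


def read_conll_generator(filename, token_class, comment_str="# "):
    """Yield one AnnotatedSentence per block, skipping lines that start with comment_str."""
    buffer = []
    with open(filename, encoding="utf-8") as in_file:
        for line in in_file:
            if line.startswith(comment_str):
                continue                      # e.g. '# sent_id = ...' metadata in CoNLL-U
            if not line.strip():              # blank line marks a sentence boundary
                if buffer:
                    yield AnnotatedSentence([token_class(raw, i) for i, raw in enumerate(buffer)])
                    buffer = []
            else:
                buffer.append(line.rstrip("\n"))
        if buffer:                            # flush the last sentence if there is no trailing blank line
            yield AnnotatedSentence([token_class(raw, i) for i, raw in enumerate(buffer)])

Whatever the real implementation looks like, the resulting .tok file has one word form per line, with args.sent_sep (e.g. "</S>" for TreeTagger) written on its own line after each sentence.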