Stable tested version
diff --git a/my_utils/conll_to_tok.py b/my_utils/conll_to_tok.py
index 2dbe2ed..8402e57 100644
--- a/my_utils/conll_to_tok.py
+++ b/my_utils/conll_to_tok.py
@@ -11,21 +11,19 @@
-s /home/daza/datasets/TIGER_conll/tiger_release_aug07.corrected.16012013.conll09 \
-ss "</S>" \
--token_type CoNLL09_Token
-
+
*** GERMAN UNIVERSAL DEPS TEST ***
-
python my_utils/conll_to_tok.py \
-s /home/daza/datasets/ud-treebanks-v2.2/UD_German-GSD/de_gsd-ud-test.conllu \
-ss "</S>" \
--token_type CoNLLUP_Token
-
+
*** TIGER TEST NEW ORTH ***
python my_utils/conll_to_tok.py \
-s /home/daza/datasets/TIGER_conll/data_splits/test/Tiger.NewOrth.test.conll \
-ss "</S>" \
--token_type CoNLLUP_Token
-
For RNNTagger
python my_utils/conll_to_tok.py \
@@ -39,15 +37,18 @@
parser = argparse.ArgumentParser()
parser.add_argument("-s", "--src_file", help="CoNLLU File to Convert into the .tok input for RNNTagger/TreeTagger", required=True)
+ parser.add_argument("-o", "--output_file", help="Output Formatted Corpus", default=None)
parser.add_argument("-t", "--token_type", help="Type of Token of the INPUT file", required=True)
parser.add_argument("-ss", "--sent_sep", help="Special separator to distinguish sentence boundaries", default="")
parser.add_argument("-c", "--com_str", help="Skip line if it starts with this string (comment market)", default="# ")
args = parser.parse_args()
+ output_prefix = args.src_file if not args.output_file else args.output_file
+
if args.sent_sep == "":
- output_file = open(f"{args.src_file}.tok","w")
+ output_file = open(f"{output_prefix}.tok","w")
else:
- output_file = open(f"{args.src_file}.sep.tok","w")
+ output_file = open(f"{output_prefix}.sep.tok","w")
for conll_obj in read_conll_generator(args.src_file, token_class=get_token_type(args.token_type), comment_str=args.com_str):
for tok in conll_obj.tokens:
diff --git a/my_utils/file_utils.py b/my_utils/file_utils.py
index 1865fc5..4dbe1b4 100644
--- a/my_utils/file_utils.py
+++ b/my_utils/file_utils.py
@@ -22,6 +22,7 @@
with open(out_path, "w", encoding='utf8') as out:
json.dump(my_dict, fp=out, ensure_ascii=False)
+
def file_to_dict(file_path):
d = {}
with open(file_path) as f:
@@ -36,7 +37,6 @@
out.write(tok.get_conllU_line()+"\n")
out.write("\n")
-
def file_generator(file_path):
with open(file_path, "r") as data_file:
logger.info("Reading instances from lines in file at: %s", file_path)
@@ -121,4 +121,4 @@
raise Exception
return fixed_filename
else:
- return fname
\ No newline at end of file
+ return fname