Stable tested version

commit: fb308a2618feb359ea06335c4d4cf01a2906e2a9 [log] [tgz]
author: daza <daza@uni-heidelberg.de> Wed Jan 27 16:20:08 2021 +0100
committer: daza <daza@uni-heidelberg.de> Wed Jan 27 16:20:08 2021 +0100
tree: 48df297112112a986d387ba417957856f21e4a94
parent: d7d707559a6ae5568b76ea2533a8ab382a42e6b4 [diff]
parent: 54e072e61c24a3ce12a7f47e17e9d8d0d1583236 [diff]
diff --git a/my_utils/conll_to_tok.py b/my_utils/conll_to_tok.py
index 2dbe2ed..8402e57 100644
--- a/my_utils/conll_to_tok.py
+++ b/my_utils/conll_to_tok.py

@@ -11,21 +11,19 @@
 				-s /home/daza/datasets/TIGER_conll/tiger_release_aug07.corrected.16012013.conll09 \
 				-ss "</S>" \
 				--token_type CoNLL09_Token
-				
+
 			*** GERMAN UNIVERSAL DEPS TEST ***
-			
 			python my_utils/conll_to_tok.py \
 				-s /home/daza/datasets/ud-treebanks-v2.2/UD_German-GSD/de_gsd-ud-test.conllu \
 				-ss "</S>" \
 				--token_type CoNLLUP_Token
-			
+
 			*** TIGER TEST NEW ORTH ***
 			
 			python my_utils/conll_to_tok.py \
 			-s /home/daza/datasets/TIGER_conll/data_splits/test/Tiger.NewOrth.test.conll \
 			-ss "</S>" \
 			--token_type CoNLLUP_Token
-			
 		
 		For RNNTagger
 			python my_utils/conll_to_tok.py \
@@ -39,15 +37,18 @@
 	
 	parser = argparse.ArgumentParser()
 	parser.add_argument("-s", "--src_file", help="CoNLLU File to Convert into the .tok input for RNNTagger/TreeTagger", required=True)
+	parser.add_argument("-o", "--output_file", help="Output Formatted Corpus", default=None)
 	parser.add_argument("-t", "--token_type", help="Type of Token of the INPUT file", required=True)
 	parser.add_argument("-ss", "--sent_sep", help="Special separator to distinguish sentence boundaries", default="")
 	parser.add_argument("-c", "--com_str", help="Skip line if it starts with this string (comment market)", default="# ")
 	args = parser.parse_args()
 	
+	output_prefix = args.src_file if not args.output_file else args.output_file
+	
 	if args.sent_sep == "":
-		output_file = open(f"{args.src_file}.tok","w")
+		output_file = open(f"{output_prefix}.tok","w")
 	else:
-		output_file = open(f"{args.src_file}.sep.tok","w")
+		output_file = open(f"{output_prefix}.sep.tok","w")
 	
 	for conll_obj in read_conll_generator(args.src_file, token_class=get_token_type(args.token_type), comment_str=args.com_str):
 		for tok in conll_obj.tokens:

diff --git a/my_utils/file_utils.py b/my_utils/file_utils.py
index 1865fc5..4dbe1b4 100644
--- a/my_utils/file_utils.py
+++ b/my_utils/file_utils.py

@@ -22,6 +22,7 @@
     with open(out_path, "w", encoding='utf8') as out:
         json.dump(my_dict, fp=out, ensure_ascii=False)
 
+
 def file_to_dict(file_path):
     d = {}
     with open(file_path) as f:
@@ -36,7 +37,6 @@
                 out.write(tok.get_conllU_line()+"\n")
             out.write("\n")
 
-
 def file_generator(file_path):
     with open(file_path, "r") as data_file:
         logger.info("Reading instances from lines in file at: %s", file_path)
@@ -121,4 +121,4 @@
             raise Exception    
         return fixed_filename
     else:
-        return fname
\ No newline at end of file
+        return fname
commit	fb308a2618feb359ea06335c4d4cf01a2906e2a9	[log] [tgz]
author	daza <daza@uni-heidelberg.de>	Wed Jan 27 16:20:08 2021 +0100
committer	daza <daza@uni-heidelberg.de>	Wed Jan 27 16:20:08 2021 +0100
tree	48df297112112a986d387ba417957856f21e4a94
parent	d7d707559a6ae5568b76ea2533a8ab382a42e6b4 [diff]
parent	54e072e61c24a3ce12a7f47e17e9d8d0d1583236 [diff]