Stable testing across datasets
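This change parameterizes the token class so the same conversion and chunking utilities work across datasets (TIGER CoNLL-09 and UD CoNLL-U). The new import `get_token_type` lives in `lib/CoNLL_Annotation.py` and is not part of this diff; a minimal sketch of what such a resolver presumably looks like, assuming that module defines the `CoNLL09_Token` and `CoNLLUP_Token` classes named in the usage examples below:

    # Hypothetical sketch; the real get_token_type is defined in
    # lib/CoNLL_Annotation.py, outside this diff.
    def get_token_type(type_name):
        # Map the --token_type string to the matching token class.
        token_types = {
            "CoNLL09_Token": CoNLL09_Token,
            "CoNLLUP_Token": CoNLLUP_Token,
        }
        if type_name not in token_types:
            raise KeyError(f"Unknown token type: {type_name}")
        return token_types[type_name]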
diff --git a/my_utils/conll_to_tok.py b/my_utils/conll_to_tok.py
index 9d77543..d5656e8 100644
--- a/my_utils/conll_to_tok.py
+++ b/my_utils/conll_to_tok.py
@@ -1,5 +1,5 @@
import argparse
-from lib.CoNLL_Annotation import read_conll_generator, CoNLL09_Token
+from lib.CoNLL_Annotation import read_conll_generator, get_token_type
# TODO: Parallelize this for HUGE Files: All sentences can be processed independently
@@ -7,20 +7,36 @@
"""
EXAMPLE:
For TreeTagger:
- python my_utils/conll_to_tok.py -s /vol/netapp/daza/datasets/TIGER_conll/tiger_release_aug07.corrected.16012013.conll09 -ss "</S>"
+ python my_utils/conll_to_tok.py \
+ -s /home/daza/datasets/TIGER_conll/tiger_release_aug07.corrected.16012013.conll09 \
+ -ss "</S>" \
+ --token_type CoNLL09_Token
+
+ python my_utils/conll_to_tok.py \
+ -s /home/daza/datasets/ud-treebanks-v2.2/UD_German-GSD/de_gsd-ud-test.conllu \
+ -ss "</S>" \
+ --token_type CoNLLUP_Token
For RNNTagger
- python my_utils/conll_to_tok.py -s /vol/netapp/daza/datasets/TIGER_conll/tiger_release_aug07.corrected.16012013.conll09
+ python my_utils/conll_to_tok.py \
+ -s /home/daza/datasets/TIGER_conll/tiger_release_aug07.corrected.16012013.conll09 \
+ --token_type CoNLL09_Token
+
+ python my_utils/conll_to_tok.py \
+ -s /home/daza/datasets/ud-treebanks-v2.2/UD_German-GSD/de_gsd-ud-test.conllu \
+ --token_type CoNLLUP_Token
"""
parser = argparse.ArgumentParser()
parser.add_argument("-s", "--src_file", help="CoNLLU File to Convert into the .tok input for RNNTagger/TreeTagger", required=True)
+ parser.add_argument("-t", "--token_type", help="Type of Token of the INPUT file", required=True)
parser.add_argument("-ss", "--sent_sep", help="Special separator to distinguish sentence boundaries", default="")
+ parser.add_argument("-c", "--com_str", help="Skip line if it starts with this string (comment market)", default="# ")
args = parser.parse_args()
output_file = open(f"{args.src_file}.tok","w")
- for conll_obj in read_conll_generator(args.src_file, token_class=CoNLL09_Token):
+ for conll_obj in read_conll_generator(args.src_file, token_class=get_token_type(args.token_type), comment_str=args.com_str):
for tok in conll_obj.tokens:
output_file.write(tok.word+"\n")
output_file.write(args.sent_sep+"\n")
\ No newline at end of file
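The emitted `.tok` file holds one token per line, with the `-ss` string written on its own line after each sentence (an empty line when `-ss` is omitted, as in the RNNTagger examples). A hypothetical helper, not part of this change, that splits such a file back into sentences under the TreeTagger-style `</S>` separator:

    # Hypothetical round-trip helper for the .tok files produced above.
    def read_tok_sentences(tok_path, sent_sep="</S>"):
        sentences, current = [], []
        with open(tok_path) as f:
            for line in f:
                token = line.rstrip("\n")
                if token == sent_sep:
                    if current:  # separator line closes the current sentence
                        sentences.append(current)
                    current = []
                else:
                    current.append(token)
        if current:  # tolerate a missing trailing separator
            sentences.append(current)
        return sentences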
diff --git a/my_utils/file_utils.py b/my_utils/file_utils.py
index baa6eb6..e63ddca 100644
--- a/my_utils/file_utils.py
+++ b/my_utils/file_utils.py
@@ -1,9 +1,21 @@
-import requests, logging
+import requests, logging, json
from lib.CoNLL_Annotation import read_conll, read_conll_generator
logger = logging.getLogger(__name__)
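+# JSON helpers: persist a dict to disk and load it back between runs.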
+def dict_to_file(my_dict, out_path):
+ with open(out_path, "w") as out:
+ json.dump(my_dict, out)
+
+def file_to_dict(file_path):
+ d = {}
+ with open(file_path) as f:
+ d = json.load(f)
+ return d
+
+
def file_generator(file_path):
with open(file_path, "r") as data_file:
logger.info("Reading instances from lines in file at: %s", file_path)
@@ -12,9 +24,9 @@
yield line
-def get_file_text_chunk(line_generator, chunk_size, token_class):
+def get_file_text_chunk(line_generator, chunk_size, token_class, comment_str="###C:"):
file_has_next = True
- chunk, n_sents = read_conll(line_generator, chunk_size, token_class)
+ chunk, n_sents = read_conll(line_generator, chunk_size, token_class, comment_str=comment_str)
if n_sents == 0: file_has_next = False
sents, gld, meta = [], [], []
for anno in chunk:
@@ -24,10 +36,11 @@
return sents, gld, file_has_next
-def get_file_chunk(line_generator, chunk_size, token_class):
+def get_file_chunk(line_generator, chunk_size, token_class, comment_str="###C:"):
file_has_next = True
- chunk, n_sents = read_conll(line_generator, chunk_size, token_class)
- if n_sents == 0: file_has_next = False
+ chunk, n_sents = read_conll(line_generator, chunk_size, token_class, comment_str=comment_str)
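+ # n_sents < chunk_size means the file was exhausted mid-chunk, so this is the last chunk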
+ if n_sents < chunk_size: file_has_next = False
raw_text = ""
for anno in chunk:
if len(anno.metadata) > 0:
@@ -41,8 +54,8 @@
def turku_parse_file(raw_text, filename, chunk_ix):
- f = filename.split(".")[0]
- out_file_str = f"{f}.parsed.{chunk_ix}.conllu"
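+ # filename.split(".")[0] truncated paths with extra dots (e.g. ud-treebanks-v2.2); keep the full name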
+ out_file_str = f"{filename}.parsed.{chunk_ix}.conllu"
# For each file make a request to obtain the parse back
logger.info(f"Sending Request {chunk_ix} to Parser Server...")
response = requests.post("http://localhost:7689/", data=raw_text.encode('utf-8'))