Stable testing across datasets
diff --git a/my_utils/file_utils.py b/my_utils/file_utils.py
index baa6eb6..e63ddca 100644
--- a/my_utils/file_utils.py
+++ b/my_utils/file_utils.py
@@ -1,9 +1,20 @@
-import requests, logging
+import requests, logging, json
from lib.CoNLL_Annotation import read_conll, read_conll_generator
logger = logging.getLogger(__name__)
+def dict_to_file(my_dict, out_path):
+    """Serialize ``my_dict`` as JSON and write it to ``out_path``.
+
+    Args:
+        my_dict: JSON-serializable object (typically a dict) to store.
+        out_path: Destination file path; an existing file is overwritten.
+
+    Raises:
+        TypeError: If ``my_dict`` contains values ``json`` cannot serialize.
+        OSError: If ``out_path`` cannot be opened for writing.
+    """
+    with open(out_path, "w", encoding="utf-8") as out:
+        # json.dump streams straight into the file handle: it needs the
+        # handle as its second argument and returns None, so its result
+        # must not be fed to out.write().
+        json.dump(my_dict, out)
+
+def file_to_dict(file_path):
+    """Load the JSON document stored at ``file_path`` and return it.
+
+    Args:
+        file_path: Path to a JSON file.
+
+    Returns:
+        The decoded JSON value (a dict when the file holds a JSON object).
+
+    Raises:
+        OSError: If ``file_path`` cannot be opened for reading.
+        json.JSONDecodeError: If the file content is not valid JSON.
+    """
+    with open(file_path, encoding="utf-8") as f:
+        # Parsing lives in the json module; file objects have no load().
+        return json.load(f)
+
+
def file_generator(file_path):
with open(file_path, "r") as data_file:
logger.info("Reading instances from lines in file at: %s", file_path)
@@ -12,9 +23,9 @@
yield line
-def get_file_text_chunk(line_generator, chunk_size, token_class):
+def get_file_text_chunk(line_generator, chunk_size, token_class, comment_str="###C:"):
file_has_next = True
- chunk, n_sents = read_conll(line_generator, chunk_size, token_class)
+ chunk, n_sents = read_conll(line_generator, chunk_size, token_class, comment_str=comment_str)
if n_sents == 0: file_has_next = False
sents, gld, meta = [], [], []
for anno in chunk:
@@ -24,10 +35,10 @@
return sents, gld, file_has_next
-def get_file_chunk(line_generator, chunk_size, token_class):
+def get_file_chunk(line_generator, chunk_size, token_class, comment_str="###C:"):
file_has_next = True
- chunk, n_sents = read_conll(line_generator, chunk_size, token_class)
- if n_sents == 0: file_has_next = False
+ chunk, n_sents = read_conll(line_generator, chunk_size, token_class, comment_str=comment_str)
+ if n_sents < chunk_size: file_has_next = False
raw_text = ""
for anno in chunk:
if len(anno.metadata) > 0:
@@ -41,8 +52,7 @@
def turku_parse_file(raw_text, filename, chunk_ix):
- f = filename.split(".")[0]
- out_file_str = f"{f}.parsed.{chunk_ix}.conllu"
+ out_file_str = f"{filename}.parsed.{chunk_ix}.conllu"
# For each file make a request to obtain the parse back
logger.info(f"Sending Request {chunk_ix} to Parser Server...")
response = requests.post("http://localhost:7689/", data=raw_text.encode('utf-8'))