Added training examples for SpaCy3
diff --git a/my_utils/file_utils.py b/my_utils/file_utils.py
index e63ddca..1865fc5 100644
--- a/my_utils/file_utils.py
+++ b/my_utils/file_utils.py
@@ -1,20 +1,42 @@
import requests, logging, json
+import subprocess, time
+import glob, logging
+import os.path, sys
from lib.CoNLL_Annotation import read_conll, read_conll_generator
logger = logging.getLogger(__name__)
-def dict_to_file(my_dict, out_path):
+def list_to_file(my_list, out_path):
with open(out_path, "w") as out:
- out.write(json.dump(my_dict))
+ for item_str in my_list:
+ out.write(f"{item_str}\n")
+
+def counter_to_file(my_counter, out_path):
+ with open(out_path, "w") as out:
+ for item, count in my_counter:
+ item_str = "\t".join(item)
+ out.write(f"{item_str}\t{count}\n")
+
+def dict_to_file(my_dict, out_path):
+ with open(out_path, "w", encoding='utf8') as out:
+ json.dump(my_dict, fp=out, ensure_ascii=False)
def file_to_dict(file_path):
d = {}
with open(file_path) as f:
- d = f.load(f)
+ d = json.load(f)
return d
+def write_conll_file(conll_objs, out_path):
+ with open(out_path, "w", encoding='utf8') as out:
+ for obj in conll_objs:
+ for tok in obj.tokens:
+ out.write(tok.get_conllU_line()+"\n")
+ out.write("\n")
+
+
def file_generator(file_path):
with open(file_path, "r") as data_file:
logger.info("Reading instances from lines in file at: %s", file_path)
@@ -23,7 +45,16 @@
yield line
+def get_file_annos_chunk(line_generator, chunk_size, token_class, comment_str="###C:"):
+ file_has_next = True
+ chunk, n_sents = read_conll(line_generator, chunk_size, token_class, comment_str=comment_str)
+ if n_sents == 0: file_has_next = False
+ sents, gld, meta = [], [], []
+ return chunk, file_has_next
+
+
def get_file_text_chunk(line_generator, chunk_size, token_class, comment_str="###C:"):
+ """ Same as get_file_annos_chunk but directly get (text, labels) pairs"""
file_has_next = True
chunk, n_sents = read_conll(line_generator, chunk_size, token_class, comment_str=comment_str)
if n_sents == 0: file_has_next = False
@@ -64,3 +95,30 @@
fout = open(fname, "w")
fout.write(response_str)
fout.close()
+
+
+def expand_file(f, substitute_comment=False):
+ # Expand the .gz file
+ fname = f[:-3]
+ if not os.path.isfile(fname):
+ p = subprocess.call(f"gunzip -c {f} > {fname}", shell=True)
+ if p == 0:
+ logger.info("Successfully uncompressed file")
+ else:
+ logger.info(f"Couldn't expand file {f}")
+ raise Exception
+ else:
+ logger.info(f"File {fname} is already uncompressed. Skipping this step...")
+
+ # Substitute the Commentary Lines on the Expanded file
+ if substitute_comment:
+ fixed_filename = f"{fname}.fixed"
+ p = subprocess.call(f"sed 's/^# /###C: /g' {fname}", shell=True, stdout=open(fixed_filename, "w")) # stdout=subprocess.PIPE
+ if p == 0:
+ logger.info("Successfully fixed comments on file")
+ else:
+ logger.info(f"Something went wrong when substituting commentaries")
+ raise Exception
+ return fixed_filename
+ else:
+ return fname
\ No newline at end of file