Added training examples for SpaCy3
diff --git a/my_utils/file_utils.py b/my_utils/file_utils.py
index e63ddca..1865fc5 100644
--- a/my_utils/file_utils.py
+++ b/my_utils/file_utils.py
@@ -1,20 +1,42 @@
import requests, logging, json
+import subprocess
+import os.path
from lib.CoNLL_Annotation import read_conll, read_conll_generator
logger = logging.getLogger(__name__)
-def dict_to_file(my_dict, out_path):
+def list_to_file(my_list, out_path):
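+    """Write each item of my_list to out_path, one item per line."""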
with open(out_path, "w") as out:
- out.write(json.dump(my_dict))
+ for item_str in my_list:
+ out.write(f"{item_str}\n")
+
+def counter_to_file(my_counter, out_path):
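+    """Write (tuple, count) pairs, as produced by Counter.most_common(),
+    to out_path as tab-separated lines."""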
+ with open(out_path, "w") as out:
+ for item, count in my_counter:
+ item_str = "\t".join(item)
+ out.write(f"{item_str}\t{count}\n")
+
+def dict_to_file(my_dict, out_path):
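+    """Dump my_dict to out_path as UTF-8 JSON."""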
+ with open(out_path, "w", encoding='utf8') as out:
+ json.dump(my_dict, fp=out, ensure_ascii=False)
def file_to_dict(file_path):
d = {}
with open(file_path) as f:
- d = f.load(f)
+ d = json.load(f)
return d
+def write_conll_file(conll_objs, out_path):
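+    """Serialize annotation objects to out_path in CoNLL-U format,
+    one token per line and a blank line between sentences."""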
+ with open(out_path, "w", encoding='utf8') as out:
+ for obj in conll_objs:
+ for tok in obj.tokens:
+ out.write(tok.get_conllU_line()+"\n")
+ out.write("\n")
+
+
def file_generator(file_path):
with open(file_path, "r") as data_file:
logger.info("Reading instances from lines in file at: %s", file_path)
@@ -23,7 +45,16 @@
yield line
+def get_file_annos_chunk(line_generator, chunk_size, token_class, comment_str="###C:"):
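+    """Read up to chunk_size annotated sentences from line_generator;
+    returns (chunk, file_has_next)."""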
+ file_has_next = True
+ chunk, n_sents = read_conll(line_generator, chunk_size, token_class, comment_str=comment_str)
+ if n_sents == 0: file_has_next = False
+ return chunk, file_has_next
+
+
def get_file_text_chunk(line_generator, chunk_size, token_class, comment_str="###C:"):
+    """Same as get_file_annos_chunk, but returns (text, labels) pairs directly."""
file_has_next = True
chunk, n_sents = read_conll(line_generator, chunk_size, token_class, comment_str=comment_str)
if n_sents == 0: file_has_next = False
@@ -64,3 +95,30 @@
fout = open(fname, "w")
fout.write(response_str)
fout.close()
+
+
+def expand_file(f, substitute_comment=False):
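+    """Uncompress the .gz file f next to itself and, if substitute_comment is
+    True, rewrite leading '# ' comment markers as '###C: '. Returns the path
+    of the file to use."""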
+ # Expand the .gz file
+ fname = f[:-3]
+ if not os.path.isfile(fname):
+ p = subprocess.call(f"gunzip -c {f} > {fname}", shell=True)
+ if p == 0:
+ logger.info("Successfully uncompressed file")
+ else:
+            logger.error(f"Couldn't expand file {f}")
+            raise Exception(f"Couldn't expand file {f}")
+ else:
+ logger.info(f"File {fname} is already uncompressed. Skipping this step...")
+
+    # Substitute the comment lines in the expanded file
+    if substitute_comment:
+        fixed_filename = f"{fname}.fixed"
+        with open(fixed_filename, "w") as fixed_out:
+            p = subprocess.call(f"sed 's/^# /###C: /g' {fname}", shell=True, stdout=fixed_out)
+ if p == 0:
+ logger.info("Successfully fixed comments on file")
+ else:
+            logger.error("Something went wrong when substituting commentaries")
+            raise Exception("Something went wrong when substituting commentaries")
+ return fixed_filename
+ else:
+ return fname
\ No newline at end of file
diff --git a/my_utils/make_tiger_new_orth.py b/my_utils/make_tiger_new_orth.py
new file mode 100644
index 0000000..8886f74
--- /dev/null
+++ b/my_utils/make_tiger_new_orth.py
@@ -0,0 +1,78 @@
+import my_utils.file_utils as fu
+from lib.CoNLL_Annotation import read_conll, CoNLL09_Token, TigerNew_Token
+from collections import Counter
+
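+# Pair each sentence of the original TIGER release with its counterpart in the
+# new-orthography version, collect per-token orthography changes, and write
+# train/test splits plus an old->new orthography mapping.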
+ORIGINAL_TIGER = "/home/daza/datasets/TIGER_conll/tiger_release_aug07.corrected.16012013.conll09"
+NEW_ORTH = "/vol/work/kupietz/Tiger_2_2/data/german/tiger/train/german_tiger_new_orthography.csv"
+
+
+def get_confident_mapping(common_counts):
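+    """Collapse (old, new) token-pair counts into an old->new mapping, keeping
+    for each old form the new form it maps to most frequently."""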
+ mapping = {}
+ for (old, new), cnt in common_counts:
+ if old not in mapping:
+ mapping[old] = (new, cnt)
+ else:
+ prev_tok, prev_cnt = mapping[old]
+ if cnt > prev_cnt:
+ mapping[old] = (new, cnt)
+ return {k: v[0] for k,v in mapping.items()}
+
+
+def check_orthography(s_old,s_new):
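+    """Compare two aligned sentences token by token and record pairs that
+    differ only in orthography (same lowercased first letter) in the
+    module-level token_changes list. Returns True if no token changed."""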
+ global total_tokens
+ identical_sents = True
+ words_old = s_old.get_words()
+ words_new = s_new.get_words()
+ assert len(words_old) == len(words_new)
+ total_tokens += len(words_old)
+ for w1,w2 in zip(words_old, words_new):
+ if w1 != w2 and w1[0].lower() == w2[0].lower():
+ token_changes.append((w1, w2))
+ identical_sents = False
+ return identical_sents
+
+
+if __name__ == "__main__":
+ line_generator = fu.file_generator(ORIGINAL_TIGER)
+ original_sents, _ = read_conll(line_generator, chunk_size=60000, token_class=CoNLL09_Token, comment_str="#")
+
+ line_generator = fu.file_generator(NEW_ORTH)
+ new_orth_sents, _ = read_conll(line_generator, chunk_size=60000, token_class=TigerNew_Token, comment_str="#")
+
+ new_ix = 0
+ train_tiger, test_tiger = [], []
+ problematic_sents, token_changes = [], []
+ total_tokens = 0
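+    # Assumes the new-orthography file lists a subset of the original sentences
+    # in the same order: a matching token count pairs the sentences, otherwise
+    # the original sentence has no counterpart and goes to the test split.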
+ for i, s1 in enumerate(original_sents):
+ s2 = new_orth_sents[new_ix]
+ print(f"--- {new_ix} ---\n{s1.get_sentence()}\n{s2.get_sentence()}\n\n")
+ if len(s1.get_words()) == len(s2.get_words()):
+ train_tiger.append((s1,s2))
+ identical_sents = check_orthography(s1,s2)
+ if not identical_sents:
+ problematic_sents.append(new_ix)
+ new_ix += 1
+ else:
+ test_tiger.append(s1)
+
+ # Print Stats
+    print(f"Paired (train) sentences: {len(train_tiger)}")
+    print(f"Unpaired (test) sentences: {len(test_tiger)}")
+    print(f"New-orthography sentences: {len(new_orth_sents)}")
+    print(f"{len(problematic_sents)}/{len(train_tiger)} ({len(problematic_sents)*100/len(train_tiger):.2f}%) of sentences show orthography changes.")
+    print(f"{len(token_changes)}/{total_tokens} ({len(token_changes)*100/total_tokens:.2f}%) of tokens show orthography changes.")
+ # Save Files
+    tiger_path = "/home/daza/datasets/TIGER_conll"
+ new_cases = Counter(token_changes).most_common()
+ case_mapping = get_confident_mapping(new_cases)
+ # Stats
+ fu.counter_to_file(new_cases, f"{tiger_path}/TigerTokensChangeOrth.train.tsv")
+ fu.dict_to_file(case_mapping, f"{tiger_path}/TigerOrthMapping.train.json")
+ fu.list_to_file(problematic_sents, f"{tiger_path}/NewOrthProblems_Indices.train.txt")
+ # Train/Test Splits
+ old_train, new_train = zip(*train_tiger)
+ fu.write_conll_file(old_train, out_path=f"{tiger_path}/Tiger.OldOrth.train.conll")
+ fu.write_conll_file(new_train, out_path=f"{tiger_path}/Tiger.NewOrth.train.conll")
+ fu.write_conll_file(test_tiger, out_path=f"{tiger_path}/Tiger.OldOrth.test.conll")
+
+