Added training examples for SpaCy3
diff --git a/my_utils/file_utils.py b/my_utils/file_utils.py
index e63ddca..1865fc5 100644
--- a/my_utils/file_utils.py
+++ b/my_utils/file_utils.py
@@ -1,20 +1,42 @@
import requests, logging, json
+import subprocess
+import os.path
from lib.CoNLL_Annotation import read_conll, read_conll_generator
logger = logging.getLogger(__name__)
-def dict_to_file(my_dict, out_path):
+def list_to_file(my_list, out_path):
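+    """Write each item of my_list to out_path, one item per line."""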
with open(out_path, "w") as out:
- out.write(json.dump(my_dict))
+ for item_str in my_list:
+ out.write(f"{item_str}\n")
+
+def counter_to_file(my_counter, out_path):
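+    """Write (tuple, count) pairs, as produced by Counter.most_common(),
+    to out_path as tab-separated lines."""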
+ with open(out_path, "w") as out:
+ for item, count in my_counter:
+ item_str = "\t".join(item)
+ out.write(f"{item_str}\t{count}\n")
+
+def dict_to_file(my_dict, out_path):
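+    """Dump my_dict to out_path as UTF-8 JSON."""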
+ with open(out_path, "w", encoding='utf8') as out:
+ json.dump(my_dict, fp=out, ensure_ascii=False)
def file_to_dict(file_path):
d = {}
with open(file_path) as f:
- d = f.load(f)
+ d = json.load(f)
return d
+def write_conll_file(conll_objs, out_path):
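+    """Serialize annotation objects to out_path in CoNLL-U format,
+    one token per line and a blank line between sentences."""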
+ with open(out_path, "w", encoding='utf8') as out:
+ for obj in conll_objs:
+ for tok in obj.tokens:
+ out.write(tok.get_conllU_line()+"\n")
+ out.write("\n")
+
+
def file_generator(file_path):
with open(file_path, "r") as data_file:
logger.info("Reading instances from lines in file at: %s", file_path)
@@ -23,7 +45,16 @@
yield line
+def get_file_annos_chunk(line_generator, chunk_size, token_class, comment_str="###C:"):
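+    """Read up to chunk_size annotated sentences from line_generator;
+    returns (chunk, file_has_next)."""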
+ file_has_next = True
+ chunk, n_sents = read_conll(line_generator, chunk_size, token_class, comment_str=comment_str)
+ if n_sents == 0: file_has_next = False
+ return chunk, file_has_next
+
+
def get_file_text_chunk(line_generator, chunk_size, token_class, comment_str="###C:"):
+    """Same as get_file_annos_chunk, but returns (text, labels) pairs directly."""
file_has_next = True
chunk, n_sents = read_conll(line_generator, chunk_size, token_class, comment_str=comment_str)
if n_sents == 0: file_has_next = False
@@ -64,3 +95,30 @@
fout = open(fname, "w")
fout.write(response_str)
fout.close()
+
+
+def expand_file(f, substitute_comment=False):
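+    """Uncompress the .gz file f next to itself and, if substitute_comment is
+    True, rewrite leading '# ' comment markers as '###C: '. Returns the path
+    of the file to use."""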
+ # Expand the .gz file
+ fname = f[:-3]
+ if not os.path.isfile(fname):
+ p = subprocess.call(f"gunzip -c {f} > {fname}", shell=True)
+ if p == 0:
+ logger.info("Successfully uncompressed file")
+ else:
+            logger.error(f"Couldn't expand file {f}")
+            raise Exception(f"Couldn't expand file {f}")
+ else:
+ logger.info(f"File {fname} is already uncompressed. Skipping this step...")
+
+    # Substitute the comment lines in the expanded file
+    if substitute_comment:
+        fixed_filename = f"{fname}.fixed"
+        with open(fixed_filename, "w") as fixed_out:
+            p = subprocess.call(f"sed 's/^# /###C: /g' {fname}", shell=True, stdout=fixed_out)
+ if p == 0:
+ logger.info("Successfully fixed comments on file")
+ else:
+            logger.error("Something went wrong when substituting commentaries")
+            raise Exception("Something went wrong when substituting commentaries")
+ return fixed_filename
+ else:
+ return fname
\ No newline at end of file
diff --git a/my_utils/make_tiger_new_orth.py b/my_utils/make_tiger_new_orth.py
new file mode 100644
index 0000000..8886f74
--- /dev/null
+++ b/my_utils/make_tiger_new_orth.py
@@ -0,0 +1,78 @@
+import my_utils.file_utils as fu
+from lib.CoNLL_Annotation import read_conll, CoNLL09_Token, TigerNew_Token
+from collections import Counter
+
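+# Pair each sentence of the original TIGER release with its counterpart in the
+# new-orthography version, collect per-token orthography changes, and write
+# train/test splits plus an old->new orthography mapping.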
+ORIGINAL_TIGER = "/home/daza/datasets/TIGER_conll/tiger_release_aug07.corrected.16012013.conll09"
+NEW_ORTH = "/vol/work/kupietz/Tiger_2_2/data/german/tiger/train/german_tiger_new_orthography.csv"
+
+
+def get_confident_mapping(common_counts):
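+    """Collapse (old, new) token-pair counts into an old->new mapping, keeping
+    for each old form the new form it maps to most frequently."""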
+ mapping = {}
+ for (old, new), cnt in common_counts:
+ if old not in mapping:
+ mapping[old] = (new, cnt)
+ else:
+ prev_tok, prev_cnt = mapping[old]
+ if cnt > prev_cnt:
+ mapping[old] = (new, cnt)
+ return {k: v[0] for k,v in mapping.items()}
+
+
+def check_orthography(s_old,s_new):
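+    """Compare two aligned sentences token by token and record pairs that
+    differ only in orthography (same lowercased first letter) in the
+    module-level token_changes list. Returns True if no token changed."""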
+ global total_tokens
+ identical_sents = True
+ words_old = s_old.get_words()
+ words_new = s_new.get_words()
+ assert len(words_old) == len(words_new)
+ total_tokens += len(words_old)
+ for w1,w2 in zip(words_old, words_new):
+ if w1 != w2 and w1[0].lower() == w2[0].lower():
+ token_changes.append((w1, w2))
+ identical_sents = False
+ return identical_sents
+
+
+if __name__ == "__main__":
+ line_generator = fu.file_generator(ORIGINAL_TIGER)
+ original_sents, _ = read_conll(line_generator, chunk_size=60000, token_class=CoNLL09_Token, comment_str="#")
+
+ line_generator = fu.file_generator(NEW_ORTH)
+ new_orth_sents, _ = read_conll(line_generator, chunk_size=60000, token_class=TigerNew_Token, comment_str="#")
+
+ new_ix = 0
+ train_tiger, test_tiger = [], []
+ problematic_sents, token_changes = [], []
+ total_tokens = 0
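+    # Assumes the new-orthography file lists a subset of the original sentences
+    # in the same order: a matching token count pairs the sentences, otherwise
+    # the original sentence has no counterpart and goes to the test split.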
+ for i, s1 in enumerate(original_sents):
+ s2 = new_orth_sents[new_ix]
+ print(f"--- {new_ix} ---\n{s1.get_sentence()}\n{s2.get_sentence()}\n\n")
+ if len(s1.get_words()) == len(s2.get_words()):
+ train_tiger.append((s1,s2))
+ identical_sents = check_orthography(s1,s2)
+ if not identical_sents:
+ problematic_sents.append(new_ix)
+ new_ix += 1
+ else:
+ test_tiger.append(s1)
+
+ # Print Stats
+    print(f"Paired (train) sentences: {len(train_tiger)}")
+    print(f"Unpaired (test) sentences: {len(test_tiger)}")
+    print(f"New-orthography sentences: {len(new_orth_sents)}")
+    print(f"{len(problematic_sents)}/{len(train_tiger)} ({len(problematic_sents)*100/len(train_tiger):.2f}%) of sentences show orthography changes.")
+    print(f"{len(token_changes)}/{total_tokens} ({len(token_changes)*100/total_tokens:.2f}%) of tokens show orthography changes.")
+ # Save Files
+    tiger_path = "/home/daza/datasets/TIGER_conll"
+ new_cases = Counter(token_changes).most_common()
+ case_mapping = get_confident_mapping(new_cases)
+ # Stats
+ fu.counter_to_file(new_cases, f"{tiger_path}/TigerTokensChangeOrth.train.tsv")
+ fu.dict_to_file(case_mapping, f"{tiger_path}/TigerOrthMapping.train.json")
+ fu.list_to_file(problematic_sents, f"{tiger_path}/NewOrthProblems_Indices.train.txt")
+ # Train/Test Splits
+ old_train, new_train = zip(*train_tiger)
+ fu.write_conll_file(old_train, out_path=f"{tiger_path}/Tiger.OldOrth.train.conll")
+ fu.write_conll_file(new_train, out_path=f"{tiger_path}/Tiger.NewOrth.train.conll")
+ fu.write_conll_file(test_tiger, out_path=f"{tiger_path}/Tiger.OldOrth.test.conll")
+
+