upgrading repo to latest version
diff --git a/my_utils/clean_dereko_vectors.py b/my_utils/clean_dereko_vectors.py
new file mode 100644
index 0000000..934ce27
--- /dev/null
+++ b/my_utils/clean_dereko_vectors.py
@@ -0,0 +1,16 @@
+import gzip
+
+# Convert the gzipped DeReKo embedding vectors to a plain-text file,
+# skipping any line that cannot be decoded as UTF-8.
+ORIG_DEREKO_VECS = "/export/netapp/kupietz/embeddings/dereko-2020-ii-alpha.all.txt.gz"
+OUT_DEREKO_VECS = "/home/daza/ids-projects/DeReKo/spacy_train/dereko-2020-ii-alpha.all.txt"
+
+skipped_vecs = 0
+with gzip.open(ORIG_DEREKO_VECS, "r") as gz_vecs, open(OUT_DEREKO_VECS, "w") as txt_vec:
+    for line in gz_vecs:
+        try:
+            txt_vec.write(line.decode("utf-8"))
+        except UnicodeDecodeError:
+            skipped_vecs += 1
+
+print(f"Skipped {skipped_vecs} vector lines that were not valid UTF-8")
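
A minimal sketch of how the resulting plain-text file could be inspected afterwards, assuming it follows the usual word2vec text layout (one token followed by its float components per line); load_vectors, the limit parameter, and the header handling are illustrative assumptions, not part of the repo:

CLEAN_VECS = "/home/daza/ids-projects/DeReKo/spacy_train/dereko-2020-ii-alpha.all.txt"

def load_vectors(path, limit=1000):
    # Collect up to `limit` vectors, assuming "token v1 v2 ... vN" per line
    vectors = {}
    with open(path, encoding="utf-8") as f:
        for i, line in enumerate(f):
            if i >= limit:
                break
            parts = line.rstrip().split(" ")
            if len(parts) < 3:
                continue  # skip an optional "rows dims" header or malformed lines
            vectors[parts[0]] = [float(x) for x in parts[1:]]
    return vectors

vecs = load_vectors(CLEAN_VECS)
print(f"Loaded {len(vecs)} vectors")
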
diff --git a/my_utils/conll_to_tok.py b/my_utils/conll_to_tok.py
index d5656e8..2dbe2ed 100644
--- a/my_utils/conll_to_tok.py
+++ b/my_utils/conll_to_tok.py
@@ -12,11 +12,21 @@
-ss "</S>" \
--token_type CoNLL09_Token
+ *** GERMAN UNIVERSAL DEPS TEST ***
+
python my_utils/conll_to_tok.py \
-s /home/daza/datasets/ud-treebanks-v2.2/UD_German-GSD/de_gsd-ud-test.conllu \
-ss "</S>" \
--token_type CoNLLUP_Token
+ *** TIGER TEST NEW ORTH ***
+
+ python my_utils/conll_to_tok.py \
+ -s /home/daza/datasets/TIGER_conll/data_splits/test/Tiger.NewOrth.test.conll \
+ -ss "</S>" \
+ --token_type CoNLLUP_Token
+
+
For RNNTagger
python my_utils/conll_to_tok.py \
-s /home/daza/datasets/TIGER_conll/tiger_release_aug07.corrected.16012013.conll09 \
@@ -34,7 +44,11 @@
 parser.add_argument("-c", "--com_str", help="Skip line if it starts with this string (comment marker)", default="# ")
 args = parser.parse_args()
- output_file = open(f"{args.src_file}.tok","w")
+ # Write to ".sep.tok" when a sentence separator will be appended, plain ".tok" otherwise
+ if args.sent_sep == "":
+     output_file = open(f"{args.src_file}.tok", "w")
+ else:
+     output_file = open(f"{args.src_file}.sep.tok", "w")
for conll_obj in read_conll_generator(args.src_file, token_class=get_token_type(args.token_type), comment_str=args.com_str):
for tok in conll_obj.tokens:
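
For context, a stand-alone sketch of the conversion this script performs, assuming the .tok output holds one sentence of space-separated word forms per line with the optional separator appended; conllu_to_tok and its argument names are illustrative and do not replace the repo's read_conll_generator pipeline:

def conllu_to_tok(src_path, out_path, sent_sep="", comment_str="# "):
    # One sentence of space-separated word forms per line, with an optional
    # separator such as "</S>" appended after each sentence.
    suffix = f" {sent_sep}" if sent_sep else ""
    with open(src_path, encoding="utf-8") as src, open(out_path, "w", encoding="utf-8") as out:
        sentence = []
        for line in src:
            line = line.rstrip("\n")
            if not line.strip():                       # blank line = sentence boundary
                if sentence:
                    out.write(" ".join(sentence) + suffix + "\n")
                    sentence = []
            elif not line.startswith(comment_str):
                sentence.append(line.split("\t")[1])   # CoNLL-U column 2 is the word form
        if sentence:                                   # handle a file without a trailing blank line
            out.write(" ".join(sentence) + suffix + "\n")

conllu_to_tok("de_gsd-ud-test.conllu", "de_gsd-ud-test.conllu.sep.tok", sent_sep="</S>")
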
diff --git a/my_utils/make_new_orth_silver_lemmas.py b/my_utils/make_new_orth_silver_lemmas.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/my_utils/make_new_orth_silver_lemmas.py
diff --git a/my_utils/make_tiger_new_orth.py b/my_utils/make_tiger_new_orth.py
index 8886f74..7bb42f4 100644
--- a/my_utils/make_tiger_new_orth.py
+++ b/my_utils/make_tiger_new_orth.py
@@ -2,7 +2,7 @@
from lib.CoNLL_Annotation import read_conll, CoNLL09_Token, TigerNew_Token
from collections import Counter
-ORIGINAL_TIGER = "/home/daza/datasets/TIGER_conll/tiger_release_aug07.corrected.16012013.conll09"
+ORIGINAL_TIGER = "/home/daza/datasets/TIGER_conll/TIGER_original_data/tiger_release_aug07.corrected.16012013.conll09"
NEW_ORTH = "/vol/work/kupietz/Tiger_2_2/data/german/tiger/train/german_tiger_new_orthography.csv"
@@ -62,17 +62,17 @@
print(f"{len(problematic_sents)}/{len(train_tiger)} ({len(problematic_sents)*100/len(train_tiger)}%) of sentences have change of orthography.")
print(f"{len(token_changes)}/{total_tokens} ({len(token_changes)*100/total_tokens}%) of tokens have change of orthography.")
# Save Files
- tiger_path = "/home/daza/datasets/TIGER_conll/"
+ save_path = "/home/daza/datasets/TIGER_conll"
new_cases = Counter(token_changes).most_common()
case_mapping = get_confident_mapping(new_cases)
# Stats
- fu.counter_to_file(new_cases, f"{tiger_path}/TigerTokensChangeOrth.train.tsv")
- fu.dict_to_file(case_mapping, f"{tiger_path}/TigerOrthMapping.train.json")
- fu.list_to_file(problematic_sents, f"{tiger_path}/NewOrthProblems_Indices.train.txt")
+ fu.counter_to_file(new_cases, f"{save_path}/TigerTokensChangeOrth.train.tsv")
+ fu.dict_to_file(case_mapping, f"{save_path}/TigerOrthMapping.train.json")
+ fu.list_to_file(problematic_sents, f"{save_path}/NewOrthProblems_Indices.train.txt")
# Train/Test Splits
old_train, new_train = zip(*train_tiger)
- fu.write_conll_file(old_train, out_path=f"{tiger_path}/Tiger.OldOrth.train.conll")
- fu.write_conll_file(new_train, out_path=f"{tiger_path}/Tiger.NewOrth.train.conll")
- fu.write_conll_file(test_tiger, out_path=f"{tiger_path}/Tiger.OldOrth.test.conll")
+ fu.write_conll_file(old_train, out_path=f"{save_path}/Tiger.OldOrth.train.conll")
+ fu.write_conll_file(new_train, out_path=f"{save_path}/Tiger.NewOrth.train.conll")
+ fu.write_conll_file(test_tiger, out_path=f"{save_path}/Tiger.OldOrth.test.conll")
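
A hedged sketch of how the saved orthography mapping could be applied downstream, assuming TigerOrthMapping.train.json is a flat old-form-to-new-form dictionary (which is how case_mapping is built and saved above); the sample tokens are only illustrative:

import json

MAPPING_JSON = "/home/daza/datasets/TIGER_conll/TigerOrthMapping.train.json"

with open(MAPPING_JSON, encoding="utf-8") as f:
    orth_mapping = json.load(f)

def to_new_orth(token_forms, mapping):
    # Replace a form only when the mapping holds a confident new spelling for it
    return [mapping.get(form, form) for form in token_forms]

print(to_new_orth(["daß", "muß", "Schloß"], orth_mapping))
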