upgrading repo to latest version
diff --git a/my_utils/clean_dereko_vectors.py b/my_utils/clean_dereko_vectors.py
new file mode 100644
index 0000000..934ce27
--- /dev/null
+++ b/my_utils/clean_dereko_vectors.py
@@ -0,0 +1,16 @@
+import gzip
+
+# Convert the gzipped DeReKo embedding vectors to a plain-text file,
+# skipping any line that cannot be decoded as UTF-8.
+ORIG_DEREKO_VECS = "/export/netapp/kupietz/embeddings/dereko-2020-ii-alpha.all.txt.gz"
+OUT_DEREKO_VECS = "/home/daza/ids-projects/DeReKo/spacy_train/dereko-2020-ii-alpha.all.txt"
+
+skipped_vecs = 0
+with gzip.open(ORIG_DEREKO_VECS, "r") as gz_vecs, open(OUT_DEREKO_VECS, "w") as txt_vec:
+    for line in gz_vecs:
+        try:
+            txt_vec.write(line.decode("utf-8"))
+        except UnicodeDecodeError:
+            skipped_vecs += 1
+
+print(f"Skipped {skipped_vecs} vector lines that were not valid UTF-8")
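
A minimal sketch of how the resulting plain-text file could be inspected afterwards, assuming it follows the usual word2vec text layout (one token followed by its float components per line); load_vectors, the limit parameter, and the header handling are illustrative assumptions, not part of the repo:

CLEAN_VECS = "/home/daza/ids-projects/DeReKo/spacy_train/dereko-2020-ii-alpha.all.txt"

def load_vectors(path, limit=1000):
    # Collect up to `limit` vectors, assuming "token v1 v2 ... vN" per line
    vectors = {}
    with open(path, encoding="utf-8") as f:
        for i, line in enumerate(f):
            if i >= limit:
                break
            parts = line.rstrip().split(" ")
            if len(parts) < 3:
                continue  # skip an optional "rows dims" header or malformed lines
            vectors[parts[0]] = [float(x) for x in parts[1:]]
    return vectors

vecs = load_vectors(CLEAN_VECS)
print(f"Loaded {len(vecs)} vectors")
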
diff --git a/my_utils/conll_to_tok.py b/my_utils/conll_to_tok.py
index d5656e8..2dbe2ed 100644
--- a/my_utils/conll_to_tok.py
+++ b/my_utils/conll_to_tok.py
@@ -12,11 +12,21 @@
-ss "</S>" \
--token_type CoNLL09_Token
+ *** GERMAN UNIVERSAL DEPS TEST ***
+
python my_utils/conll_to_tok.py \
-s /home/daza/datasets/ud-treebanks-v2.2/UD_German-GSD/de_gsd-ud-test.conllu \
-ss "</S>" \
--token_type CoNLLUP_Token
+ *** TIGER TEST NEW ORTH ***
+
+ python my_utils/conll_to_tok.py \
+ -s /home/daza/datasets/TIGER_conll/data_splits/test/Tiger.NewOrth.test.conll \
+ -ss "</S>" \
+ --token_type CoNLLUP_Token
+
+
For RNNTagger
python my_utils/conll_to_tok.py \
-s /home/daza/datasets/TIGER_conll/tiger_release_aug07.corrected.16012013.conll09 \
@@ -34,7 +44,11 @@
 parser.add_argument("-c", "--com_str", help="Skip line if it starts with this string (comment marker)", default="# ")
 args = parser.parse_args()
- output_file = open(f"{args.src_file}.tok","w")
+ # Write to ".sep.tok" when a sentence separator will be appended, plain ".tok" otherwise
+ if args.sent_sep == "":
+     output_file = open(f"{args.src_file}.tok", "w")
+ else:
+     output_file = open(f"{args.src_file}.sep.tok", "w")
for conll_obj in read_conll_generator(args.src_file, token_class=get_token_type(args.token_type), comment_str=args.com_str):
for tok in conll_obj.tokens:
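
For context, a stand-alone sketch of the conversion this script performs, assuming the .tok output holds one sentence of space-separated word forms per line with the optional separator appended; conllu_to_tok and its argument names are illustrative and do not replace the repo's read_conll_generator pipeline:

def conllu_to_tok(src_path, out_path, sent_sep="", comment_str="# "):
    # One sentence of space-separated word forms per line, with an optional
    # separator such as "</S>" appended after each sentence.
    suffix = f" {sent_sep}" if sent_sep else ""
    with open(src_path, encoding="utf-8") as src, open(out_path, "w", encoding="utf-8") as out:
        sentence = []
        for line in src:
            line = line.rstrip("\n")
            if not line.strip():                       # blank line = sentence boundary
                if sentence:
                    out.write(" ".join(sentence) + suffix + "\n")
                    sentence = []
            elif not line.startswith(comment_str):
                sentence.append(line.split("\t")[1])   # CoNLL-U column 2 is the word form
        if sentence:                                   # handle a file without a trailing blank line
            out.write(" ".join(sentence) + suffix + "\n")

conllu_to_tok("de_gsd-ud-test.conllu", "de_gsd-ud-test.conllu.sep.tok", sent_sep="</S>")
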
diff --git a/my_utils/make_new_orth_silver_lemmas.py b/my_utils/make_new_orth_silver_lemmas.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/my_utils/make_new_orth_silver_lemmas.py
diff --git a/my_utils/make_tiger_new_orth.py b/my_utils/make_tiger_new_orth.py
index 8886f74..7bb42f4 100644
--- a/my_utils/make_tiger_new_orth.py
+++ b/my_utils/make_tiger_new_orth.py
@@ -2,7 +2,7 @@
from lib.CoNLL_Annotation import read_conll, CoNLL09_Token, TigerNew_Token
from collections import Counter
-ORIGINAL_TIGER = "/home/daza/datasets/TIGER_conll/tiger_release_aug07.corrected.16012013.conll09"
+ORIGINAL_TIGER = "/home/daza/datasets/TIGER_conll/TIGER_original_data/tiger_release_aug07.corrected.16012013.conll09"
NEW_ORTH = "/vol/work/kupietz/Tiger_2_2/data/german/tiger/train/german_tiger_new_orthography.csv"
@@ -62,17 +62,17 @@
print(f"{len(problematic_sents)}/{len(train_tiger)} ({len(problematic_sents)*100/len(train_tiger)}%) of sentences have change of orthography.")
print(f"{len(token_changes)}/{total_tokens} ({len(token_changes)*100/total_tokens}%) of tokens have change of orthography.")
# Save Files
- tiger_path = "/home/daza/datasets/TIGER_conll/"
+ save_path = "/home/daza/datasets/TIGER_conll"
new_cases = Counter(token_changes).most_common()
case_mapping = get_confident_mapping(new_cases)
# Stats
- fu.counter_to_file(new_cases, f"{tiger_path}/TigerTokensChangeOrth.train.tsv")
- fu.dict_to_file(case_mapping, f"{tiger_path}/TigerOrthMapping.train.json")
- fu.list_to_file(problematic_sents, f"{tiger_path}/NewOrthProblems_Indices.train.txt")
+ fu.counter_to_file(new_cases, f"{save_path}/TigerTokensChangeOrth.train.tsv")
+ fu.dict_to_file(case_mapping, f"{save_path}/TigerOrthMapping.train.json")
+ fu.list_to_file(problematic_sents, f"{save_path}/NewOrthProblems_Indices.train.txt")
# Train/Test Splits
old_train, new_train = zip(*train_tiger)
- fu.write_conll_file(old_train, out_path=f"{tiger_path}/Tiger.OldOrth.train.conll")
- fu.write_conll_file(new_train, out_path=f"{tiger_path}/Tiger.NewOrth.train.conll")
- fu.write_conll_file(test_tiger, out_path=f"{tiger_path}/Tiger.OldOrth.test.conll")
+ fu.write_conll_file(old_train, out_path=f"{save_path}/Tiger.OldOrth.train.conll")
+ fu.write_conll_file(new_train, out_path=f"{save_path}/Tiger.NewOrth.train.conll")
+ fu.write_conll_file(test_tiger, out_path=f"{save_path}/Tiger.OldOrth.test.conll")
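
A hedged sketch of how the saved orthography mapping could be applied downstream, assuming TigerOrthMapping.train.json is a flat old-form-to-new-form dictionary (which is how case_mapping is built and saved above); the sample tokens are only illustrative:

import json

MAPPING_JSON = "/home/daza/datasets/TIGER_conll/TigerOrthMapping.train.json"

with open(MAPPING_JSON, encoding="utf-8") as f:
    orth_mapping = json.load(f)

def to_new_orth(token_forms, mapping):
    # Replace a form only when the mapping holds a confident new spelling for it
    return [mapping.get(form, form) for form in token_forms]

print(to_new_orth(["daß", "muß", "Schloß"], orth_mapping))
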