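Upgrade repo to latest version: add TIGER all-orthography (old + new) spaCy training configs, rename the existing configs to *_newOrth, and drop the binary .spacy corpora from the repo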
diff --git a/DeReKo/spacy_train/basic_config_allOrth.cfg b/DeReKo/spacy_train/basic_config_allOrth.cfg
new file mode 100644
index 0000000..2b27744
--- /dev/null
+++ b/DeReKo/spacy_train/basic_config_allOrth.cfg
@@ -0,0 +1,81 @@
+[paths]
+train = "/vol/netapp/daza/datasets/TIGER_conll/data_splits/train/Tiger.ALL.Orth.train.spacy"
+dev = "/vol/netapp/daza/datasets/TIGER_conll/data_splits/test/Tiger.NewOrth.test.spacy"
+
+[system]
+gpu_allocator = "pytorch"
+
+
+[nlp]
+lang = "de"
+pipeline = ["transformer", "tagger"]
+tokenizer = {"@tokenizers": "spacy.Tokenizer.v1"}
+
+[components]
+
+[components.transformer]
+factory = "transformer"
+
+[components.transformer.model]
+@architectures = "spacy-transformers.TransformerModel.v1"
+name = "bert-base-german-cased"
+tokenizer_config = {"use_fast": true}
+
+[components.transformer.model.get_spans]
+@span_getters = "spacy-transformers.strided_spans.v1"
+window = 128
+stride = 96
+
+
+[components.tagger]
+factory = "tagger"
+
+[components.tagger.model]
+@architectures = "spacy.Tagger.v1"
+nO = null
+
+[components.tagger.model.tok2vec]
+@architectures = "spacy-transformers.TransformerListener.v1"
+grad_factor = 1.0
+
+[components.tagger.model.tok2vec.pooling]
+@layers = "reduce_mean.v1"
+
+
+
+[corpora]
+
+[corpora.train]
+@readers = "spacy.Corpus.v1"
+path = ${paths.train}
+max_length = 500
+
+[corpora.dev]
+@readers = "spacy.Corpus.v1"
+path = ${paths.dev}
+max_length = 0
+
+[training]
+accumulate_gradient = 3
+dev_corpus = "corpora.dev"
+train_corpus = "corpora.train"
+
+[training.optimizer]
+@optimizers = "Adam.v1"
+
+[training.optimizer.learn_rate]
+@schedules = "warmup_linear.v1"
+warmup_steps = 250
+total_steps = 20000
+initial_rate = 5e-5
+
+
+
+[training.batcher]
+@batchers = "spacy.batch_by_padded.v1"
+discard_oversize = true
+size = 2000
+buffer = 256
+
+[initialize]
+vectors = null
diff --git a/DeReKo/spacy_train/basic_config.cfg b/DeReKo/spacy_train/basic_config_newOrth.cfg
similarity index 100%
rename from DeReKo/spacy_train/basic_config.cfg
rename to DeReKo/spacy_train/basic_config_newOrth.cfg
diff --git a/DeReKo/spacy_train/config_allOrth.cfg b/DeReKo/spacy_train/config_allOrth.cfg
new file mode 100644
index 0000000..cf3b09b
--- /dev/null
+++ b/DeReKo/spacy_train/config_allOrth.cfg
@@ -0,0 +1,123 @@
+[paths]
+train = "/vol/netapp/daza/datasets/TIGER_conll/data_splits/train/Tiger.ALL.Orth.train.spacy"
+dev = "/vol/netapp/daza/datasets/TIGER_conll/data_splits/test/Tiger.NewOrth.test.spacy"
+vectors = null
+init_tok2vec = null
+
+[system]
+gpu_allocator = "pytorch"
+seed = 0
+
+[nlp]
+lang = "de"
+pipeline = ["transformer","tagger"]
+tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}
+disabled = []
+before_creation = null
+after_creation = null
+after_pipeline_creation = null
+
+[components]
+
+[components.tagger]
+factory = "tagger"
+
+[components.tagger.model]
+@architectures = "spacy.Tagger.v1"
+nO = null
+
+[components.tagger.model.tok2vec]
+@architectures = "spacy-transformers.TransformerListener.v1"
+grad_factor = 1.0
+pooling = {"@layers":"reduce_mean.v1"}
+
+[components.transformer]
+factory = "transformer"
+max_batch_items = 4096
+set_extra_annotations = {"@annotation_setters":"spacy-transformers.null_annotation_setter.v1"}
+
+[components.transformer.model]
+@architectures = "spacy-transformers.TransformerModel.v1"
+name = "bert-base-german-cased"
+
+[components.transformer.model.get_spans]
+@span_getters = "spacy-transformers.strided_spans.v1"
+window = 128
+stride = 96
+
+[components.transformer.model.tokenizer_config]
+use_fast = true
+
+[corpora]
+
+[corpora.dev]
+@readers = "spacy.Corpus.v1"
+path = ${paths.dev}
+max_length = 0
+gold_preproc = false
+limit = 0
+augmenter = null
+
+[corpora.train]
+@readers = "spacy.Corpus.v1"
+path = ${paths.train}
+max_length = 500
+gold_preproc = false
+limit = 0
+augmenter = null
+
+[training]
+accumulate_gradient = 3
+dev_corpus = "corpora.dev"
+train_corpus = "corpora.train"
+seed = ${system.seed}
+gpu_allocator = ${system.gpu_allocator}
+dropout = 0.1
+patience = 1600
+max_epochs = 0
+max_steps = 20000
+eval_frequency = 200
+frozen_components = []
+before_to_disk = null
+
+[training.batcher]
+@batchers = "spacy.batch_by_padded.v1"
+discard_oversize = true
+size = 2000
+buffer = 256
+get_length = null
+
+[training.logger]
+@loggers = "spacy.ConsoleLogger.v1"
+progress_bar = false
+
+[training.optimizer]
+@optimizers = "Adam.v1"
+beta1 = 0.9
+beta2 = 0.999
+L2_is_weight_decay = true
+L2 = 0.01
+grad_clip = 1.0
+use_averages = false
+eps = 0.00000001
+
+[training.optimizer.learn_rate]
+@schedules = "warmup_linear.v1"
+warmup_steps = 250
+total_steps = 20000
+initial_rate = 0.00005
+
+[training.score_weights]
+tag_acc = 1.0
+
+[pretraining]
+
+[initialize]
+vectors = ${paths.vectors}
+init_tok2vec = ${paths.init_tok2vec}
+vocab_data = null
+lookups = null
+
+[initialize.components]
+
+[initialize.tokenizer]
diff --git a/DeReKo/spacy_train/config.cfg b/DeReKo/spacy_train/config_newOrth.cfg
similarity index 100%
rename from DeReKo/spacy_train/config.cfg
rename to DeReKo/spacy_train/config_newOrth.cfg
diff --git a/DeReKo/spacy_train/conll2spacy.py b/DeReKo/spacy_train/conll2spacy.py
index f0d0d4c..9be14ab 100644
--- a/DeReKo/spacy_train/conll2spacy.py
+++ b/DeReKo/spacy_train/conll2spacy.py
@@ -4,17 +4,32 @@
if __name__ == "__main__":
"""
- --- TIGER New Orthography ---
- python DeReKo/spacy_train/conll2spacy.py --corpus_name TigerNew --gld_token_type CoNLLUP_Token \
- -i /home/daza/datasets/TIGER_conll/data_splits/train/Tiger.NewOrth.train.conll \
- -o DeReKo/spacy_train/Tiger.NewOrth.train.json \
- -t DeReKo/spacy_train/Tiger.NewOrth.train.txt
-
- python DeReKo/spacy_train/conll2spacy.py --corpus_name TigerNew --gld_token_type CoNLLUP_Token \
+
+ --- TIGER NEW Orthography ---
+ python DeReKo/spacy_train/conll2spacy.py --corpus_name TigerNew --gld_token_type CoNLLUP_Token \
+ -i /home/daza/datasets/TIGER_conll/data_splits/train/Tiger.NewOrth.train.conll \
+ -o DeReKo/spacy_train/Tiger.NewOrth.train.json \
+ -t DeReKo/spacy_train/Tiger.NewOrth.train.txt
+
+ python DeReKo/spacy_train/conll2spacy.py --corpus_name TigerNew --gld_token_type CoNLLUP_Token \
-i /home/daza/datasets/TIGER_conll/data_splits/test/Tiger.NewOrth.test.conll \
-o DeReKo/spacy_train/Tiger.NewOrth.test.json \
-t DeReKo/spacy_train/Tiger.NewOrth.test.txt
+ --- TIGER NEW + OLD Orthography ---
+ cat Tiger.OldOrth.train.conll Tiger.NewOrth.train.conll > Tiger.ALL.Orth.train.conll
+ cat Tiger.OldOrth.test.conll Tiger.NewOrth.test.conll > Tiger.ALL.Orth.test.conll
+
+ python DeReKo/spacy_train/conll2spacy.py --corpus_name TigerALL --gld_token_type CoNLLUP_Token \
+ -i /home/daza/datasets/TIGER_conll/data_splits/train/Tiger.ALL.Orth.train.conll \
+ -o /home/daza/datasets/TIGER_conll/data_splits/train/Tiger.ALL.Orth.train.json \
+ -t /home/daza/datasets/TIGER_conll/data_splits/train/Tiger.ALL.Orth.train.txt
+
+ python DeReKo/spacy_train/conll2spacy.py --corpus_name TigerALL --gld_token_type CoNLLUP_Token \
+ -i /home/daza/datasets/TIGER_conll/data_splits/test/Tiger.ALL.Orth.test.conll \
+ -o /home/daza/datasets/TIGER_conll/data_splits/test/Tiger.ALL.Orth.test.json \
+ -t /home/daza/datasets/TIGER_conll/data_splits/test/Tiger.ALL.Orth.test.txt
+
"""
parser = argparse.ArgumentParser()
diff --git a/DeReKo/spacy_train/custom_spacy_dereko.py b/DeReKo/spacy_train/custom_spacy_dereko.py
index a4674dd..4b35282 100644
--- a/DeReKo/spacy_train/custom_spacy_dereko.py
+++ b/DeReKo/spacy_train/custom_spacy_dereko.py
@@ -1,11 +1,11 @@
import spacy
+
+ORIG_DEREKO_VECS = "/export/netapp/kupietz/embeddings/dereko-2020-ii-alpha.all.txt.gz"
-#ORIG_DEREKO_VECS = "/export/netapp/kupietz/embeddings/dereko-2020-ii-alpha.all.txt.gz"
+# Made with command (2.x): python -m spacy init-model de de_fastext_vectors --vectors-loc dereko_vectors/cc.de.300.vec.gz
+# Made with command (3.x): python -m spacy init vectors de dereko_vectors/dereko-2020-ii-alpha.all.txt dereko_vectors/ --name de_dereko_2020
+VECTORS = "lib/dereko_vectors"
-# Made with command: python -m spacy init-model de de_fastext_vectors --vectors-loc dereko_vectors/cc.de.300.vec.gz
-SPACY_FAST_DE = "../../lib/de_fastext_vectors"
-
-VECTORS = SPACY_FAST_DE
nlp_dereko = spacy.load(VECTORS)
doc1 = nlp_dereko("`` Ross Perot wäre vielleicht ein prächtiger Diktator ''")
diff --git a/DeReKo/spacy_train/spacy_bin_corpora/Tiger.NewOrth.test.spacy b/DeReKo/spacy_train/spacy_bin_corpora/Tiger.NewOrth.test.spacy
deleted file mode 100644
index de11ccc..0000000
--- a/DeReKo/spacy_train/spacy_bin_corpora/Tiger.NewOrth.test.spacy
+++ /dev/null
Binary files differ
diff --git a/DeReKo/spacy_train/spacy_bin_corpora/Tiger.NewOrth.train.spacy b/DeReKo/spacy_train/spacy_bin_corpora/Tiger.NewOrth.train.spacy
deleted file mode 100644
index 857239d..0000000
--- a/DeReKo/spacy_train/spacy_bin_corpora/Tiger.NewOrth.train.spacy
+++ /dev/null
Binary files differ