Added training examples for SpaCy3

commit: 8534747e42fc3043891197994809c2af962bbaf7 [log] [tgz]
author: daza <daza@uni-heidelberg.de> Mon Nov 23 18:43:33 2020 +0100
committer: daza <daza@uni-heidelberg.de> Mon Nov 23 18:43:33 2020 +0100
tree: 7cecac06979ebfbe07b167076fef5b24eeaed75c
parent: e3bc92ec638020eddd0d0ce53cb73b0763dee76c [diff]
diff --git a/DeReKo/spacy_train/basic_config.cfg b/DeReKo/spacy_train/basic_config.cfg
new file mode 100644
index 0000000..e35d4e3
--- /dev/null
+++ b/DeReKo/spacy_train/basic_config.cfg

@@ -0,0 +1,81 @@
+[paths]
+train = "/home/daza/ids-projects/DeReKo/spacy_train/spacy_bin_corpora/Tiger.NewOrth.train.spacy"
+dev = "/home/daza/ids-projects/DeReKo/spacy_train/spacy_bin_corpora/Tiger.NewOrth.test.spacy"
+
+[system]
+gpu_allocator = "pytorch"
+
+
+[nlp]
+lang = "de"
+pipeline = ["transformer", "tagger"]
+tokenizer = {"@tokenizers": "spacy.Tokenizer.v1"}
+
+[components]
+
+[components.transformer]
+factory = "transformer"
+
+[components.transformer.model]
+@architectures = "spacy-transformers.TransformerModel.v1"
+name = "bert-base-german-cased"
+tokenizer_config = {"use_fast": true}
+
+[components.transformer.model.get_spans]
+@span_getters = "spacy-transformers.strided_spans.v1"
+window = 128
+stride = 96
+
+
+[components.tagger]
+factory = "tagger"
+
+[components.tagger.model]
+@architectures = "spacy.Tagger.v1"
+nO = null
+
+[components.tagger.model.tok2vec]
+@architectures = "spacy-transformers.TransformerListener.v1"
+grad_factor = 1.0
+
+[components.tagger.model.tok2vec.pooling]
+@layers = "reduce_mean.v1"
+
+
+
+[corpora]
+
+[corpora.train]
+@readers = "spacy.Corpus.v1"
+path = ${paths.train}
+max_length = 500
+
+[corpora.dev]
+@readers = "spacy.Corpus.v1"
+path = ${paths.dev}
+max_length = 0
+
+[training]
+accumulate_gradient = 3
+dev_corpus = "corpora.dev"
+train_corpus = "corpora.train"
+
+[training.optimizer]
+@optimizers = "Adam.v1"
+
+[training.optimizer.learn_rate]
+@schedules = "warmup_linear.v1"
+warmup_steps = 250
+total_steps = 20000
+initial_rate = 5e-5
+
+
+
+[training.batcher]
+@batchers = "spacy.batch_by_padded.v1"
+discard_oversize = true
+size = 2000
+buffer = 256
+
+[initialize]
+vectors = null

diff --git a/DeReKo/spacy_train/config.cfg b/DeReKo/spacy_train/config.cfg
new file mode 100644
index 0000000..d701932
--- /dev/null
+++ b/DeReKo/spacy_train/config.cfg

@@ -0,0 +1,123 @@
+[paths]
+train = "/home/daza/ids-projects/DeReKo/spacy_train/spacy_bin_corpora/Tiger.NewOrth.train.spacy"
+dev = "/home/daza/ids-projects/DeReKo/spacy_train/spacy_bin_corpora/Tiger.NewOrth.test.spacy"
+vectors = null
+init_tok2vec = null
+
+[system]
+gpu_allocator = "pytorch"
+seed = 0
+
+[nlp]
+lang = "de"
+pipeline = ["transformer","tagger"]
+tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}
+disabled = []
+before_creation = null
+after_creation = null
+after_pipeline_creation = null
+
+[components]
+
+[components.tagger]
+factory = "tagger"
+
+[components.tagger.model]
+@architectures = "spacy.Tagger.v1"
+nO = null
+
+[components.tagger.model.tok2vec]
+@architectures = "spacy-transformers.TransformerListener.v1"
+grad_factor = 1.0
+pooling = {"@layers":"reduce_mean.v1"}
+
+[components.transformer]
+factory = "transformer"
+max_batch_items = 4096
+set_extra_annotations = {"@annotation_setters":"spacy-transformers.null_annotation_setter.v1"}
+
+[components.transformer.model]
+@architectures = "spacy-transformers.TransformerModel.v1"
+name = "bert-base-german-cased"
+
+[components.transformer.model.get_spans]
+@span_getters = "spacy-transformers.strided_spans.v1"
+window = 128
+stride = 96
+
+[components.transformer.model.tokenizer_config]
+use_fast = true
+
+[corpora]
+
+[corpora.dev]
+@readers = "spacy.Corpus.v1"
+path = ${paths.dev}
+max_length = 0
+gold_preproc = false
+limit = 0
+augmenter = null
+
+[corpora.train]
+@readers = "spacy.Corpus.v1"
+path = ${paths.train}
+max_length = 500
+gold_preproc = false
+limit = 0
+augmenter = null
+
+[training]
+accumulate_gradient = 3
+dev_corpus = "corpora.dev"
+train_corpus = "corpora.train"
+seed = ${system.seed}
+gpu_allocator = ${system.gpu_allocator}
+dropout = 0.1
+patience = 1600
+max_epochs = 0
+max_steps = 20000
+eval_frequency = 200
+frozen_components = []
+before_to_disk = null
+
+[training.batcher]
+@batchers = "spacy.batch_by_padded.v1"
+discard_oversize = true
+size = 2000
+buffer = 256
+get_length = null
+
+[training.logger]
+@loggers = "spacy.ConsoleLogger.v1"
+progress_bar = false
+
+[training.optimizer]
+@optimizers = "Adam.v1"
+beta1 = 0.9
+beta2 = 0.999
+L2_is_weight_decay = true
+L2 = 0.01
+grad_clip = 1.0
+use_averages = false
+eps = 0.00000001
+
+[training.optimizer.learn_rate]
+@schedules = "warmup_linear.v1"
+warmup_steps = 250
+total_steps = 20000
+initial_rate = 0.00005
+
+[training.score_weights]
+tag_acc = 1.0
+
+[pretraining]
+
+[initialize]
+vectors = null
+init_tok2vec = ${paths.init_tok2vec}
+vocab_data = null
+lookups = null
+
+[initialize.components]
+
+[initialize.tokenizer]
\ No newline at end of file

diff --git a/DeReKo/spacy_train/conll2spacy.py b/DeReKo/spacy_train/conll2spacy.py
new file mode 100644
index 0000000..f0d0d4c
--- /dev/null
+++ b/DeReKo/spacy_train/conll2spacy.py

@@ -0,0 +1,70 @@
+import argparse, time, json
+import my_utils.file_utils as fu
+from lib.CoNLL_Annotation import get_token_type
+
+if __name__ == "__main__":
+	"""
+		--- TIGER New Orthography ---
+			python DeReKo/spacy_train/conll2spacy.py --corpus_name TigerNew --gld_token_type CoNLLUP_Token \
+				-i /home/daza/datasets/TIGER_conll/data_splits/train/Tiger.NewOrth.train.conll \
+				-o DeReKo/spacy_train/Tiger.NewOrth.train.json \
+				-t DeReKo/spacy_train/Tiger.NewOrth.train.txt
+			
+			python DeReKo/spacy_train/conll2spacy.py --corpus_name TigerNew --gld_token_type CoNLLUP_Token \
+			-i /home/daza/datasets/TIGER_conll/data_splits/test/Tiger.NewOrth.test.conll \
+			-o DeReKo/spacy_train/Tiger.NewOrth.test.json \
+			-t DeReKo/spacy_train/Tiger.NewOrth.test.txt
+			
+	"""
+	
+	parser = argparse.ArgumentParser()
+	parser.add_argument("-i", "--input_file", help="Input Corpus", required=True)
+	parser.add_argument("-n", "--corpus_name", help="Corpus Name", default="Corpus")
+	parser.add_argument("-o", "--output_file", help="File where the Predictions will be saved", required=True)
+	parser.add_argument("-t", "--text_file", help="Output Plain Text File", default=None)
+	parser.add_argument("-gtt", "--gld_token_type", help="CoNLL Format of the Gold Data", default="CoNLL09_Token")
+	parser.add_argument("-c", "--comment_str", help="CoNLL Format of comentaries inside the file", default="#")
+	args = parser.parse_args()
+	
+	file_has_next, chunk_ix = True, 0
+	CHUNK_SIZE = 60000
+	
+	write_out = open(args.output_file, "w")
+	if args.text_file: write_plain = open(args.text_file, "w")
+	
+	if ".gz" == args.input_file[-3:]:
+		in_file = fu.expand_file(args.input_file)
+	else:
+		in_file = args.input_file
+	
+	start = time.time()
+	total_processed_sents = 0
+	line_generator = fu.file_generator(in_file)
+	while file_has_next:
+		annos, file_has_next = fu.get_file_annos_chunk(line_generator, chunk_size=CHUNK_SIZE, 		token_class=get_token_type(args.gld_token_type), comment_str=args.comment_str)
+		if len(annos) == 0: break
+		total_processed_sents += len(annos)
+		print(f"Already processed {total_processed_sents} sentences...")
+		spacy_docs = []
+		for anno_id, anno in enumerate(annos):
+			plain_text, token_objs = [], []
+			for ix, tok in enumerate(anno.tokens):
+				token_objs.append({"id": ix, "orth":tok.word, "tag": tok.pos_tag})
+				plain_text.append(tok.word)
+			plain_text_str = " ".join(plain_text)
+			sent_obj = {
+				"id": anno_id,
+				"meta": anno.metadata,
+				"paragraphs": [{
+					"raw": plain_text_str,
+					"sentences": [{
+						"tokens": token_objs
+					}]	
+				}]
+			}
+			spacy_docs.append(sent_obj)
+			if args.text_file:
+				write_plain.write(plain_text_str + "\n")
+		write_out.write(json.dumps(spacy_docs))
+	end = time.time()
+	print(f"Processing {args.corpus_name} took {(end - start)} seconds!")
\ No newline at end of file

diff --git a/DeReKo/spacy_train/custom_spacy_dereko.py b/DeReKo/spacy_train/custom_spacy_dereko.py
new file mode 100644
index 0000000..a4674dd
--- /dev/null
+++ b/DeReKo/spacy_train/custom_spacy_dereko.py

@@ -0,0 +1,17 @@
+import spacy
+
+#ORIG_DEREKO_VECS = "/export/netapp/kupietz/embeddings/dereko-2020-ii-alpha.all.txt.gz"
+
+# Made with command: python -m spacy init-model de de_fastext_vectors --vectors-loc dereko_vectors/cc.de.300.vec.gz
+SPACY_FAST_DE = "../../lib/de_fastext_vectors" 
+# NOTE(review): relative path -- presumably resolved against the current working directory; confirm.
+VECTORS = SPACY_FAST_DE
+# Load the model selected above, then print how similar the first sentence is to each of the other two.
+nlp_dereko = spacy.load(VECTORS)
+doc1 = nlp_dereko("`` Ross Perot wäre vielleicht ein prächtiger Diktator ''")
+doc2 = nlp_dereko("Konzernchefs lehnen den Milliardär als US-Präsidenten ab")
+doc3 = nlp_dereko("Texaner gibt nur vage Auskunft über seine Wirtschaftspolitik")
+print(doc1.similarity(doc2))
+print(doc1.similarity(doc3))
+
+

diff --git a/DeReKo/spacy_train/custom_spacy_tagger_2x.py b/DeReKo/spacy_train/custom_spacy_tagger_2x.py
new file mode 100644
index 0000000..5e529b5
--- /dev/null
+++ b/DeReKo/spacy_train/custom_spacy_tagger_2x.py

@@ -0,0 +1,100 @@
+#!/usr/bin/env python
+# coding: utf8
+"""
+A simple example for training a part-of-speech tagger with a custom tag map.
+To allow us to update the tag map with our custom one, this example starts off
+with a blank Language class and modifies its defaults. For more details, see
+the documentation:
+* Training: https://spacy.io/usage/training
+* POS Tagging: https://spacy.io/usage/linguistic-features#pos-tagging
+Compatible with: spaCy v2.0.0+
+Last tested with: v2.1.0
+"""
+from __future__ import unicode_literals, print_function
+
+import plac
+import random
+from pathlib import Path
+import spacy
+from spacy.util import minibatch, compounding
+
+
+# You need to define a mapping from your data's part-of-speech tag names to the
+# Universal Part-of-Speech tag set, as spaCy includes an enum of these tags.
+# See here for the Universal Tag Set:
+# http://universaldependencies.github.io/docs/u/pos/index.html
+# You may also specify morphological features for your tags, from the universal
+# scheme.
+TAG_MAP = {"N": {"pos": "NOUN"}, "V": {"pos": "VERB"}, "J": {"pos": "ADJ"}}
+
+# Usually you'll read this in, of course. Data formats vary. Ensure your
+# strings are unicode and that the number of tags assigned matches spaCy's
+# tokenization. If not, you can always add a 'words' key to the annotations
+# that specifies the gold-standard tokenization, e.g.:
+# ("Eatblueham", {'words': ['Eat', 'blue', 'ham'], 'tags': ['V', 'J', 'N']})
+TRAIN_DATA = [
+	("I like green eggs", {"tags": ["N", "V", "J", "N"]}),
+	("Eat blue ham", {"tags": ["V", "J", "N"]}),
+]
+
+
+@plac.annotations(
+	lang=("ISO Code of language to use", "option", "l", str),
+	output_dir=("Optional output directory", "option", "o", Path),
+	n_iter=("Number of training iterations", "option", "n", int),
+)
+def main(lang="en", output_dir=None, n_iter=25):
+	"""Create a new model, set up the pipeline and train the tagger. In order to
+	train the tagger with a custom tag map, we're creating a new Language
+	instance with a custom vocab.
+	"""
+	nlp = spacy.blank(lang)
+	# add the tagger to the pipeline
+	# nlp.create_pipe works for built-ins that are registered with spaCy
+	tagger = nlp.create_pipe("tagger")
+	# Add the tags. This needs to be done before you start training.
+	for tag, values in TAG_MAP.items():
+		tagger.add_label(tag, values)
+	nlp.add_pipe(tagger)
+
+	optimizer = nlp.begin_training()
+	for i in range(n_iter):
+		random.shuffle(TRAIN_DATA)
+		losses = {}
+		# batch up the examples using spaCy's minibatch
+		batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
+		for batch in batches:
+			texts, annotations = zip(*batch)
+			nlp.update(texts, annotations, sgd=optimizer, losses=losses)
+		print("Losses", losses)
+
+	# test the trained model
+	test_text = "I like blue eggs"
+	doc = nlp(test_text)
+	print("Tags", [(t.text, t.tag_, t.pos_) for t in doc])
+
+	# save model to output directory
+	if output_dir is not None:
+		output_dir = Path(output_dir)
+		if not output_dir.exists():
+			output_dir.mkdir()
+		nlp.to_disk(output_dir)
+		print("Saved model to", output_dir)
+
+		# test the save model
+		print("Loading from", output_dir)
+		nlp2 = spacy.load(output_dir)
+		doc = nlp2(test_text)
+		print("Tags", [(t.text, t.tag_, t.pos_) for t in doc])
+
+
+if __name__ == "__main__":
+	plac.call(main)  # plac maps main()'s annotated parameters to command-line options
+
+	# Expected "Tags" output for the test sentence "I like blue eggs":
+	# [
+	#   ('I', 'N', 'NOUN'),
+	#   ('like', 'V', 'VERB'),
+	#   ('blue', 'J', 'ADJ'),
+	#   ('eggs', 'N', 'NOUN')
+	# ]
\ No newline at end of file

diff --git a/DeReKo/spacy_train/custom_spacy_tagger_3x.py b/DeReKo/spacy_train/custom_spacy_tagger_3x.py
new file mode 100644
index 0000000..45a76c9
--- /dev/null
+++ b/DeReKo/spacy_train/custom_spacy_tagger_3x.py

@@ -0,0 +1,4 @@
+import spacy
+nlp = spacy.load("de_dep_news_trf")  # load the pipeline named "de_dep_news_trf" (presumably an installed German spaCy model -- confirm)
+doc = nlp("Das ist ein Satz.")  # run the loaded pipeline on one sample sentence
+print(doc)  # print the resulting Doc
\ No newline at end of file

diff --git a/DeReKo/spacy_train/spacy_bin_corpora/Tiger.NewOrth.test.spacy b/DeReKo/spacy_train/spacy_bin_corpora/Tiger.NewOrth.test.spacy
new file mode 100644
index 0000000..de11ccc
--- /dev/null
+++ b/DeReKo/spacy_train/spacy_bin_corpora/Tiger.NewOrth.test.spacy
Binary files differ

diff --git a/DeReKo/spacy_train/spacy_bin_corpora/Tiger.NewOrth.train.spacy b/DeReKo/spacy_train/spacy_bin_corpora/Tiger.NewOrth.train.spacy
new file mode 100644
index 0000000..857239d
--- /dev/null
+++ b/DeReKo/spacy_train/spacy_bin_corpora/Tiger.NewOrth.train.spacy
Binary files differ
commit	8534747e42fc3043891197994809c2af962bbaf7	[log] [tgz]
author	daza <daza@uni-heidelberg.de>	Mon Nov 23 18:43:33 2020 +0100
committer	daza <daza@uni-heidelberg.de>	Mon Nov 23 18:43:33 2020 +0100
tree	7cecac06979ebfbe07b167076fef5b24eeaed75c
parent	e3bc92ec638020eddd0d0ce53cb73b0763dee76c [diff]