Added training examples for SpaCy3

commit: 8534747e42fc3043891197994809c2af962bbaf7 [log] [tgz]
author: daza <daza@uni-heidelberg.de> Mon Nov 23 18:43:33 2020 +0100
committer: daza <daza@uni-heidelberg.de> Mon Nov 23 18:43:33 2020 +0100
tree: 7cecac06979ebfbe07b167076fef5b24eeaed75c
parent: e3bc92ec638020eddd0d0ce53cb73b0763dee76c [diff]
diff --git a/.gitignore b/.gitignore
index cdb2786..e6a1dca 100644
--- a/.gitignore
+++ b/.gitignore

@@ -107,6 +107,7 @@
 .venv
 env/
 venv/
+venv-*
 ENV/
 env.bak/
 venv.bak/

diff --git a/DeReKo/spacy_train/basic_config.cfg b/DeReKo/spacy_train/basic_config.cfg
new file mode 100644
index 0000000..e35d4e3
--- /dev/null
+++ b/DeReKo/spacy_train/basic_config.cfg

@@ -0,0 +1,81 @@
+[paths]
+train = "/home/daza/ids-projects/DeReKo/spacy_train/spacy_bin_corpora/Tiger.NewOrth.train.spacy"
+dev = "/home/daza/ids-projects/DeReKo/spacy_train/spacy_bin_corpora/Tiger.NewOrth.test.spacy"
+
+[system]
+gpu_allocator = "pytorch"
+
+
+[nlp]
+lang = "de"
+pipeline = ["transformer", "tagger"]
+tokenizer = {"@tokenizers": "spacy.Tokenizer.v1"}
+
+[components]
+
+[components.transformer]
+factory = "transformer"
+
+[components.transformer.model]
+@architectures = "spacy-transformers.TransformerModel.v1"
+name = "bert-base-german-cased"
+tokenizer_config = {"use_fast": true}
+
+[components.transformer.model.get_spans]
+@span_getters = "spacy-transformers.strided_spans.v1"
+window = 128
+stride = 96
+
+
+[components.tagger]
+factory = "tagger"
+
+[components.tagger.model]
+@architectures = "spacy.Tagger.v1"
+nO = null
+
+[components.tagger.model.tok2vec]
+@architectures = "spacy-transformers.TransformerListener.v1"
+grad_factor = 1.0
+
+[components.tagger.model.tok2vec.pooling]
+@layers = "reduce_mean.v1"
+
+
+
+[corpora]
+
+[corpora.train]
+@readers = "spacy.Corpus.v1"
+path = ${paths.train}
+max_length = 500
+
+[corpora.dev]
+@readers = "spacy.Corpus.v1"
+path = ${paths.dev}
+max_length = 0
+
+[training]
+accumulate_gradient = 3
+dev_corpus = "corpora.dev"
+train_corpus = "corpora.train"
+
+[training.optimizer]
+@optimizers = "Adam.v1"
+
+[training.optimizer.learn_rate]
+@schedules = "warmup_linear.v1"
+warmup_steps = 250
+total_steps = 20000
+initial_rate = 5e-5
+
+
+
+[training.batcher]
+@batchers = "spacy.batch_by_padded.v1"
+discard_oversize = true
+size = 2000
+buffer = 256
+
+[initialize]
+vectors = null

diff --git a/DeReKo/spacy_train/config.cfg b/DeReKo/spacy_train/config.cfg
new file mode 100644
index 0000000..d701932
--- /dev/null
+++ b/DeReKo/spacy_train/config.cfg

@@ -0,0 +1,123 @@
+[paths]
+train = "/home/daza/ids-projects/DeReKo/spacy_train/spacy_bin_corpora/Tiger.NewOrth.train.spacy"
+dev = "/home/daza/ids-projects/DeReKo/spacy_train/spacy_bin_corpora/Tiger.NewOrth.test.spacy"
+vectors = null
+init_tok2vec = null
+
+[system]
+gpu_allocator = "pytorch"
+seed = 0
+
+[nlp]
+lang = "de"
+pipeline = ["transformer","tagger"]
+tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}
+disabled = []
+before_creation = null
+after_creation = null
+after_pipeline_creation = null
+
+[components]
+
+[components.tagger]
+factory = "tagger"
+
+[components.tagger.model]
+@architectures = "spacy.Tagger.v1"
+nO = null
+
+[components.tagger.model.tok2vec]
+@architectures = "spacy-transformers.TransformerListener.v1"
+grad_factor = 1.0
+pooling = {"@layers":"reduce_mean.v1"}
+
+[components.transformer]
+factory = "transformer"
+max_batch_items = 4096
+set_extra_annotations = {"@annotation_setters":"spacy-transformers.null_annotation_setter.v1"}
+
+[components.transformer.model]
+@architectures = "spacy-transformers.TransformerModel.v1"
+name = "bert-base-german-cased"
+
+[components.transformer.model.get_spans]
+@span_getters = "spacy-transformers.strided_spans.v1"
+window = 128
+stride = 96
+
+[components.transformer.model.tokenizer_config]
+use_fast = true
+
+[corpora]
+
+[corpora.dev]
+@readers = "spacy.Corpus.v1"
+path = ${paths.dev}
+max_length = 0
+gold_preproc = false
+limit = 0
+augmenter = null
+
+[corpora.train]
+@readers = "spacy.Corpus.v1"
+path = ${paths.train}
+max_length = 500
+gold_preproc = false
+limit = 0
+augmenter = null
+
+[training]
+accumulate_gradient = 3
+dev_corpus = "corpora.dev"
+train_corpus = "corpora.train"
+seed = ${system.seed}
+gpu_allocator = ${system.gpu_allocator}
+dropout = 0.1
+patience = 1600
+max_epochs = 0
+max_steps = 20000
+eval_frequency = 200
+frozen_components = []
+before_to_disk = null
+
+[training.batcher]
+@batchers = "spacy.batch_by_padded.v1"
+discard_oversize = true
+size = 2000
+buffer = 256
+get_length = null
+
+[training.logger]
+@loggers = "spacy.ConsoleLogger.v1"
+progress_bar = false
+
+[training.optimizer]
+@optimizers = "Adam.v1"
+beta1 = 0.9
+beta2 = 0.999
+L2_is_weight_decay = true
+L2 = 0.01
+grad_clip = 1.0
+use_averages = false
+eps = 0.00000001
+
+[training.optimizer.learn_rate]
+@schedules = "warmup_linear.v1"
+warmup_steps = 250
+total_steps = 20000
+initial_rate = 0.00005
+
+[training.score_weights]
+tag_acc = 1.0
+
+[pretraining]
+
+[initialize]
+vectors = null
+init_tok2vec = ${paths.init_tok2vec}
+vocab_data = null
+lookups = null
+
+[initialize.components]
+
+[initialize.tokenizer]
\ No newline at end of file

diff --git a/DeReKo/spacy_train/conll2spacy.py b/DeReKo/spacy_train/conll2spacy.py
new file mode 100644
index 0000000..f0d0d4c
--- /dev/null
+++ b/DeReKo/spacy_train/conll2spacy.py

@@ -0,0 +1,70 @@
+import argparse, time, json
+import my_utils.file_utils as fu
+from lib.CoNLL_Annotation import get_token_type
+
+if __name__ == "__main__":
+	"""
+		Convert a CoNLL corpus into a spaCy-style JSON training file (one document per sentence)
+		and, optionally, a plain-text file holding the whitespace-joined tokens of every sentence.
+
+		--- TIGER New Orthography ---
+			python DeReKo/spacy_train/conll2spacy.py --corpus_name TigerNew --gld_token_type CoNLLUP_Token \
+				-i /home/daza/datasets/TIGER_conll/data_splits/train/Tiger.NewOrth.train.conll \
+				-o DeReKo/spacy_train/Tiger.NewOrth.train.json \
+				-t DeReKo/spacy_train/Tiger.NewOrth.train.txt
+			
+			python DeReKo/spacy_train/conll2spacy.py --corpus_name TigerNew --gld_token_type CoNLLUP_Token \
+			-i /home/daza/datasets/TIGER_conll/data_splits/test/Tiger.NewOrth.test.conll \
+			-o DeReKo/spacy_train/Tiger.NewOrth.test.json \
+			-t DeReKo/spacy_train/Tiger.NewOrth.test.txt
+			
+	"""
+	
+	parser = argparse.ArgumentParser()
+	parser.add_argument("-i", "--input_file", help="Input Corpus", required=True)
+	parser.add_argument("-n", "--corpus_name", help="Corpus Name", default="Corpus")
+	parser.add_argument("-o", "--output_file", help="File where the Predictions will be saved", required=True)
+	parser.add_argument("-t", "--text_file", help="Output Plain Text File", default=None)
+	parser.add_argument("-gtt", "--gld_token_type", help="CoNLL Format of the Gold Data", default="CoNLL09_Token")
+	parser.add_argument("-c", "--comment_str", help="CoNLL Format of comentaries inside the file", default="#")
+	args = parser.parse_args()
+	
+	# Number of sentences requested from the reader per iteration.
+	CHUNK_SIZE = 60000
+	
+	# The output file is opened first so that an unwritable path fails before any parsing starts.
+	with open(args.output_file, "w") as write_out:
+		# The plain-text dump is optional, hence the explicit close in `finally`.
+		write_plain = open(args.text_file, "w") if args.text_file else None
+		try:
+			# Gzipped corpora are expanded on disk first; everything else is read as-is.
+			if ".gz" == args.input_file[-3:]:
+				in_file = fu.expand_file(args.input_file)
+			else:
+				in_file = args.input_file
+			
+			start = time.time()
+			total_processed_sents = 0
+			# Documents of ALL chunks are collected and serialized once at the end: writing one
+			# JSON array per chunk would produce "[...][...]", which is not valid JSON.
+			spacy_docs = []
+			file_has_next = True
+			line_generator = fu.file_generator(in_file)
+			while file_has_next:
+				annos, file_has_next = fu.get_file_annos_chunk(
+					line_generator,
+					chunk_size=CHUNK_SIZE,
+					token_class=get_token_type(args.gld_token_type),
+					comment_str=args.comment_str,
+				)
+				if len(annos) == 0: break
+				total_processed_sents += len(annos)
+				print(f"Already processed {total_processed_sents} sentences...")
+				for anno in annos:
+					token_objs = [
+						{"id": ix, "orth": tok.word, "tag": tok.pos_tag}
+						for ix, tok in enumerate(anno.tokens)
+					]
+					plain_text_str = " ".join(tok.word for tok in anno.tokens)
+					spacy_docs.append({
+						# Running index over the whole corpus (equals the per-chunk index for the first chunk).
+						"id": len(spacy_docs),
+						"meta": anno.metadata,
+						"paragraphs": [{
+							"raw": plain_text_str,
+							"sentences": [{
+								"tokens": token_objs
+							}]
+						}]
+					})
+					if write_plain is not None:
+						write_plain.write(plain_text_str + "\n")
+			json.dump(spacy_docs, write_out)
+		finally:
+			if write_plain is not None:
+				write_plain.close()
+	end = time.time()
+	print(f"Processing {args.corpus_name} took {(end - start)} seconds!")
\ No newline at end of file

diff --git a/DeReKo/spacy_train/custom_spacy_dereko.py b/DeReKo/spacy_train/custom_spacy_dereko.py
new file mode 100644
index 0000000..a4674dd
--- /dev/null
+++ b/DeReKo/spacy_train/custom_spacy_dereko.py

@@ -0,0 +1,17 @@
+import spacy
+
+#ORIG_DEREKO_VECS = "/export/netapp/kupietz/embeddings/dereko-2020-ii-alpha.all.txt.gz"
+
+# Vector model built beforehand with:
+#   python -m spacy init-model de de_fastext_vectors --vectors-loc dereko_vectors/cc.de.300.vec.gz
+SPACY_FAST_DE = "../../lib/de_fastext_vectors"
+
+VECTORS = SPACY_FAST_DE
+
+# Load the vector model, parse three sentences and print how similar the
+# first one is to each of the other two.
+nlp_dereko = spacy.load(VECTORS)
+doc1, doc2, doc3 = (
+	nlp_dereko(sentence)
+	for sentence in (
+		"`` Ross Perot wäre vielleicht ein prächtiger Diktator ''",
+		"Konzernchefs lehnen den Milliardär als US-Präsidenten ab",
+		"Texaner gibt nur vage Auskunft über seine Wirtschaftspolitik",
+	)
+)
+for other_doc in (doc2, doc3):
+	print(doc1.similarity(other_doc))
+
+

diff --git a/DeReKo/spacy_train/custom_spacy_tagger_2x.py b/DeReKo/spacy_train/custom_spacy_tagger_2x.py
new file mode 100644
index 0000000..5e529b5
--- /dev/null
+++ b/DeReKo/spacy_train/custom_spacy_tagger_2x.py

@@ -0,0 +1,100 @@
+#!/usr/bin/env python
+# coding: utf8
+"""
+A simple example for training a part-of-speech tagger with a custom tag map.
+To allow us to update the tag map with our custom one, this example starts off
+with a blank Language class and modifies its defaults. For more details, see
+the documentation:
+* Training: https://spacy.io/usage/training
+* POS Tagging: https://spacy.io/usage/linguistic-features#pos-tagging
+Compatible with: spaCy v2.0.0+
+Last tested with: v2.1.0
+"""
+from __future__ import unicode_literals, print_function
+
+import plac
+import random
+from pathlib import Path
+import spacy
+from spacy.util import minibatch, compounding
+
+
+# You need to define a mapping from your data's part-of-speech tag names to the
+# Universal Part-of-Speech tag set, as spaCy includes an enum of these tags.
+# See here for the Universal Tag Set:
+# http://universaldependencies.github.io/docs/u/pos/index.html
+# You may also specify morphological features for your tags, from the universal
+# scheme.
+TAG_MAP = {"N": {"pos": "NOUN"}, "V": {"pos": "VERB"}, "J": {"pos": "ADJ"}}
+
+# Usually you'll read this in, of course. Data formats vary. Ensure your
+# strings are unicode and that the number of tags assigned matches spaCy's
+# tokenization. If not, you can always add a 'words' key to the annotations
+# that specifies the gold-standard tokenization, e.g.:
+# ("Eatblueham", {'words': ['Eat', 'blue', 'ham'], 'tags': ['V', 'J', 'N']})
+TRAIN_DATA = [
+	("I like green eggs", {"tags": ["N", "V", "J", "N"]}),
+	("Eat blue ham", {"tags": ["V", "J", "N"]}),
+]
+
+
+@plac.annotations(
+	lang=("ISO Code of language to use", "option", "l", str),
+	output_dir=("Optional output directory", "option", "o", Path),
+	n_iter=("Number of training iterations", "option", "n", int),
+)
+def main(lang="en", output_dir=None, n_iter=25):
+	"""Train a tagger on TRAIN_DATA using the custom TAG_MAP and try it out.
+
+	A blank pipeline for ``lang`` gets a fresh tagger whose labels come from
+	TAG_MAP, is trained for ``n_iter`` passes over TRAIN_DATA and is then run
+	on a short test sentence. When ``output_dir`` is given, the pipeline is
+	also saved there, loaded back and run on the same sentence again.
+	"""
+	def tag_triples(parsed):
+		# (text, fine-grained tag, coarse POS) for every token of a parsed doc
+		return [(t.text, t.tag_, t.pos_) for t in parsed]
+
+	nlp = spacy.blank(lang)
+	# nlp.create_pipe works for built-ins that are registered with spaCy
+	tagger = nlp.create_pipe("tagger")
+	# Labels have to be registered before training starts.
+	for label, attrs in TAG_MAP.items():
+		tagger.add_label(label, attrs)
+	nlp.add_pipe(tagger)
+
+	optimizer = nlp.begin_training()
+	for _ in range(n_iter):
+		random.shuffle(TRAIN_DATA)
+		losses = {}
+		# spaCy's minibatch helper, with a freshly created compounding size schedule each pass
+		for batch in minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001)):
+			texts, annotations = zip(*batch)
+			nlp.update(texts, annotations, sgd=optimizer, losses=losses)
+		print("Losses", losses)
+
+	# Try the freshly trained pipeline.
+	test_text = "I like blue eggs"
+	print("Tags", tag_triples(nlp(test_text)))
+
+	# Nothing left to do unless the model should be persisted.
+	if output_dir is None:
+		return
+
+	output_dir = Path(output_dir)
+	if not output_dir.exists():
+		output_dir.mkdir()
+	nlp.to_disk(output_dir)
+	print("Saved model to", output_dir)
+
+	# Round trip: reload the saved pipeline and tag the same sentence.
+	print("Loading from", output_dir)
+	nlp2 = spacy.load(output_dir)
+	print("Tags", tag_triples(nlp2(test_text)))
+
+
+if __name__ == "__main__":
+	plac.call(main)
+
+	# Expected output:
+	# [
+	#   ('I', 'N', 'NOUN'),
+	#   ('like', 'V', 'VERB'),
+	#   ('blue', 'J', 'ADJ'),
+	#   ('eggs', 'N', 'NOUN')
+	# ]
\ No newline at end of file

diff --git a/DeReKo/spacy_train/custom_spacy_tagger_3x.py b/DeReKo/spacy_train/custom_spacy_tagger_3x.py
new file mode 100644
index 0000000..45a76c9
--- /dev/null
+++ b/DeReKo/spacy_train/custom_spacy_tagger_3x.py

@@ -0,0 +1,4 @@
+import spacy
+# Smoke test: load the "de_dep_news_trf" pipeline, run it on a single German
+# sentence and print the resulting Doc.
+nlp = spacy.load("de_dep_news_trf")
+doc = nlp("Das ist ein Satz.")
+print(doc)
\ No newline at end of file

diff --git a/DeReKo/spacy_train/spacy_bin_corpora/Tiger.NewOrth.test.spacy b/DeReKo/spacy_train/spacy_bin_corpora/Tiger.NewOrth.test.spacy
new file mode 100644
index 0000000..de11ccc
--- /dev/null
+++ b/DeReKo/spacy_train/spacy_bin_corpora/Tiger.NewOrth.test.spacy
Binary files differ

diff --git a/DeReKo/spacy_train/spacy_bin_corpora/Tiger.NewOrth.train.spacy b/DeReKo/spacy_train/spacy_bin_corpora/Tiger.NewOrth.train.spacy
new file mode 100644
index 0000000..857239d
--- /dev/null
+++ b/DeReKo/spacy_train/spacy_bin_corpora/Tiger.NewOrth.train.spacy
Binary files differ

diff --git a/my_utils/file_utils.py b/my_utils/file_utils.py
index e63ddca..1865fc5 100644
--- a/my_utils/file_utils.py
+++ b/my_utils/file_utils.py

@@ -1,20 +1,42 @@
 import requests, logging, json
+import subprocess, time
+import glob, logging
+import os.path, sys
 from lib.CoNLL_Annotation import read_conll, read_conll_generator
 
 logger = logging.getLogger(__name__)
 
 
-def dict_to_file(my_dict, out_path):
+def list_to_file(my_list, out_path):
     with open(out_path, "w") as out:
-        out.write(json.dump(my_dict))
+        for item_str in my_list:
+            out.write(f"{item_str}\n")
+
+def counter_to_file(my_counter, out_path):
+    """Write (item, count) pairs as tab-separated lines.
+
+    Args:
+        my_counter: iterable of ``(item, count)`` pairs, e.g. the result of
+            ``Counter.most_common()``; every ``item`` must be an iterable of
+            strings, whose parts become the leading columns.
+        out_path: path of the file to (over)write.
+    """
+    # Explicit UTF-8, like dict_to_file and write_conll_file, so non-ASCII
+    # tokens are written identically regardless of the locale.
+    with open(out_path, "w", encoding='utf8') as out:
+        for item, count in my_counter:
+            item_str = "\t".join(item)
+            out.write(f"{item_str}\t{count}\n")
+
+def dict_to_file(my_dict, out_path):
+    """Serialize ``my_dict`` as JSON into ``out_path`` (UTF-8, non-ASCII characters kept verbatim)."""
+    with open(out_path, "w", encoding='utf8') as out:
+        out.write(json.dumps(my_dict, ensure_ascii=False))
 
 def file_to_dict(file_path):
     d = {}
     with open(file_path) as f:
-        d = f.load(f)
+        d = json.load(f)
     return d  
 
 
+def write_conll_file(conll_objs, out_path):
+    """Dump sentences to ``out_path``: one ``get_conllU_line()`` per token, a blank line after each sentence.
+
+    Every object in ``conll_objs`` must expose a ``tokens`` iterable whose
+    elements provide ``get_conllU_line()``.
+    """
+    with open(out_path, "w", encoding='utf8') as out:
+        for sentence in conll_objs:
+            out.writelines(token.get_conllU_line() + "\n" for token in sentence.tokens)
+            # Empty line marks the sentence boundary.
+            out.write("\n")
+
+
 def file_generator(file_path):
     with open(file_path, "r") as data_file:
         logger.info("Reading instances from lines in file at: %s", file_path)
@@ -23,7 +45,16 @@
             yield line
 
 
+def get_file_annos_chunk(line_generator, chunk_size, token_class, comment_str="###C:"):
+    """Read the next chunk of annotations and report whether reading should continue.
+
+    Thin wrapper around ``read_conll`` for chunked processing loops.
+
+    Args:
+        line_generator: iterator over the lines of a CoNLL file (e.g. ``file_generator``).
+        chunk_size: forwarded to ``read_conll`` (presumably the number of
+            sentences per chunk -- verify against ``read_conll``).
+        token_class: forwarded to ``read_conll``.
+        comment_str: forwarded to ``read_conll`` as ``comment_str``.
+
+    Returns:
+        ``(chunk, file_has_next)``: the first value returned by ``read_conll``
+        and a flag that is False once ``read_conll`` reports zero sentences.
+    """
+    chunk, n_sents = read_conll(line_generator, chunk_size, token_class, comment_str=comment_str)
+    return chunk, n_sents != 0
+
+
 def get_file_text_chunk(line_generator, chunk_size, token_class, comment_str="###C:"):
+    """ Same as get_file_annos_chunk but directly get (text, labels) pairs"""
     file_has_next = True
     chunk, n_sents = read_conll(line_generator, chunk_size, token_class, comment_str=comment_str)
     if n_sents == 0: file_has_next = False
@@ -64,3 +95,30 @@
     fout = open(fname, "w")
     fout.write(response_str)
     fout.close()
+
+
+def expand_file(f, substitute_comment=False):
+    """Uncompress a gzipped file next to the original and optionally rewrite its comment lines.
+
+    The uncompressed file gets the name of ``f`` without its last three
+    characters (the ``.gz`` suffix); if that file already exists, the
+    decompression step is skipped. Decompression and comment substitution are
+    done in-process instead of through ``gunzip``/``sed`` shell commands, so
+    paths containing spaces or shell metacharacters are handled safely and no
+    file handle is left open.
+
+    Args:
+        f: path of the gzipped input file.
+        substitute_comment: when True, additionally write ``<name>.fixed`` in
+            which every line starting with ``"# "`` starts with ``"###C: "`` instead.
+
+    Returns:
+        Path of the ``.fixed`` file when ``substitute_comment`` is True,
+        otherwise path of the uncompressed file.
+
+    Raises:
+        RuntimeError: if decompression or comment substitution fails.
+    """
+    import gzip
+    import shutil
+    import zlib
+
+    # Expand the .gz file
+    fname = f[:-3]
+    if not os.path.isfile(fname):
+        try:
+            with gzip.open(f, "rb") as src, open(fname, "wb") as dst:
+                shutil.copyfileobj(src, dst)
+        except (OSError, EOFError, zlib.error) as err:
+            logger.info(f"Couldn't expand file {f}")
+            # Remove the partial output, otherwise the next call would treat it as already expanded.
+            if os.path.isfile(fname):
+                os.remove(fname)
+            raise RuntimeError(f"Couldn't expand file {f}") from err
+        logger.info("Successfully uncompressed file")
+    else:
+        logger.info(f"File {fname} is already uncompressed. Skipping this step...")
+
+    if not substitute_comment:
+        return fname
+
+    # Substitute the Commentary Lines on the Expanded file
+    fixed_filename = f"{fname}.fixed"
+    try:
+        # Bytes in, bytes out: only the two-byte line prefix is touched, so no decoding is needed.
+        with open(fname, "rb") as src, open(fixed_filename, "wb") as dst:
+            for line in src:
+                if line.startswith(b"# "):
+                    line = b"###C: " + line[2:]
+                dst.write(line)
+    except OSError as err:
+        logger.info(f"Something went wrong when substituting commentaries")
+        raise RuntimeError("Something went wrong when substituting commentaries") from err
+    logger.info("Successfully fixed comments on file")
+    return fixed_filename
\ No newline at end of file

diff --git a/my_utils/make_tiger_new_orth.py b/my_utils/make_tiger_new_orth.py
new file mode 100644
index 0000000..8886f74
--- /dev/null
+++ b/my_utils/make_tiger_new_orth.py

@@ -0,0 +1,78 @@
+import my_utils.file_utils as fu
+from lib.CoNLL_Annotation import read_conll, CoNLL09_Token, TigerNew_Token 
+from collections import Counter
+
+ORIGINAL_TIGER = "/home/daza/datasets/TIGER_conll/tiger_release_aug07.corrected.16012013.conll09"
+NEW_ORTH = "/vol/work/kupietz/Tiger_2_2/data/german/tiger/train/german_tiger_new_orthography.csv"
+
+
+def get_confident_mapping(common_counts):
+	"""Pick, for every old token, the new token it was most often paired with.
+
+	``common_counts`` is an iterable of ``((old, new), count)`` entries. When
+	several candidates for the same old token share the highest count, the one
+	seen first is kept. Returns a plain ``{old: new}`` dict.
+	"""
+	best = {}
+	for (old, new), cnt in common_counts:
+		current = best.get(old)
+		# Only a strictly larger count replaces an earlier candidate.
+		if current is None or cnt > current[1]:
+			best[old] = (new, cnt)
+	return {old: new for old, (new, _) in best.items()}
+		
+
+def check_orthography(s_old,s_new):
+	"""Record the spelling differences between two aligned sentences.
+
+	Adds the sentence length to the module-level ``total_tokens`` counter and
+	appends every (old, new) word pair that differs while starting with the
+	same letter (compared case-insensitively) to the module-level
+	``token_changes`` list.
+
+	Returns True when no such pair was found, False otherwise. Raises
+	AssertionError when the two sentences have different numbers of words.
+	"""
+	global total_tokens
+	old_words, new_words = s_old.get_words(), s_new.get_words()
+	assert len(old_words) == len(new_words)
+	total_tokens += len(old_words)
+	changed_pairs = [
+		(old, new)
+		for old, new in zip(old_words, new_words)
+		if old != new and old[0].lower() == new[0].lower()
+	]
+	token_changes.extend(changed_pairs)
+	return not changed_pairs
+
+
+if __name__ == "__main__":
+	# Align the original TIGER sentences with the new-orthography file, collect
+	# statistics about spelling changes and write the resulting train/test files.
+	line_generator = fu.file_generator(ORIGINAL_TIGER)
+	original_sents, _ = read_conll(line_generator, chunk_size=60000, token_class=CoNLL09_Token, comment_str="#")
+	
+	line_generator = fu.file_generator(NEW_ORTH)
+	new_orth_sents, _ = read_conll(line_generator, chunk_size=60000, token_class=TigerNew_Token, comment_str="#")
+	
+	# new_ix points at the next not-yet-aligned new-orthography sentence.
+	new_ix = 0
+	train_tiger, test_tiger = [], []
+	# token_changes and total_tokens are module globals updated by check_orthography().
+	problematic_sents, token_changes = [], []
+	total_tokens = 0
+	for s1 in original_sents:
+		if new_ix >= len(new_orth_sents):
+			# Every new-orthography sentence is already aligned: the remaining originals
+			# have no counterpart (indexing here used to raise IndexError).
+			test_tiger.append(s1)
+			continue
+		s2 = new_orth_sents[new_ix]
+		print(f"--- {new_ix} ---\n{s1.get_sentence()}\n{s2.get_sentence()}\n\n")
+		if len(s1.get_words()) == len(s2.get_words()):
+			train_tiger.append((s1,s2))
+			identical_sents = check_orthography(s1,s2)
+			if not identical_sents: 
+				problematic_sents.append(new_ix)
+			new_ix += 1
+		else:
+			# Different token count: s1 goes to the test split and the same
+			# new-orthography sentence is tried against the next original one.
+			test_tiger.append(s1)
+	
+	# Print Stats
+	print(len(train_tiger))
+	print(len(test_tiger))
+	print(len(new_orth_sents))
+	print(f"{len(problematic_sents)}/{len(train_tiger)} ({len(problematic_sents)*100/len(train_tiger)}%) of sentences have change of orthography.")
+	print(f"{len(token_changes)}/{total_tokens} ({len(token_changes)*100/total_tokens}%) of tokens have change of orthography.")
+	# Save Files
+	tiger_path = "/home/daza/datasets/TIGER_conll/"
+	new_cases = Counter(token_changes).most_common()
+	case_mapping = get_confident_mapping(new_cases)
+	# Stats
+	fu.counter_to_file(new_cases, f"{tiger_path}/TigerTokensChangeOrth.train.tsv")
+	fu.dict_to_file(case_mapping, f"{tiger_path}/TigerOrthMapping.train.json")
+	fu.list_to_file(problematic_sents, f"{tiger_path}/NewOrthProblems_Indices.train.txt")
+	# Train/Test Splits
+	old_train, new_train = zip(*train_tiger)
+	fu.write_conll_file(old_train, out_path=f"{tiger_path}/Tiger.OldOrth.train.conll")
+	fu.write_conll_file(new_train, out_path=f"{tiger_path}/Tiger.NewOrth.train.conll")
+	fu.write_conll_file(test_tiger, out_path=f"{tiger_path}/Tiger.OldOrth.test.conll")
+	
+		

diff --git a/systems/eval_old_vs_new_tiger.py b/systems/eval_old_vs_new_tiger.py
new file mode 100644
index 0000000..38dc597
--- /dev/null
+++ b/systems/eval_old_vs_new_tiger.py

@@ -0,0 +1,35 @@
+import my_utils.file_utils as fu
+from lib.CoNLL_Annotation import read_conll, CoNLLUP_Token 
+from collections import Counter
+from germalemma import GermaLemma
+
+SPACY_NEW = "/home/daza/datasets/TIGER_conll/Tiger.NewOrth.train.spacy_parsed.conllu"
+CASES = "/home/daza/datasets/TIGER_conll/NewOrthProblems_Indices.train.txt"
+
+orth_dict = fu.file_to_dict("/vol/netapp/daza/datasets/TIGER_conll/TigerOrthMapping.train.json")
+new_to_old = {v:k for k,v in orth_dict.items()}
+
+
+if __name__ == "__main__":
+	# For every sentence listed in CASES, pair each token that has an old-orthography
+	# spelling with the GermaLemma lemma of that old spelling and with the lemma found
+	# in the parsed file, then write the counted combinations to a TSV file.
+	line_generator = fu.file_generator(SPACY_NEW)
+	conll_sents, _ = read_conll(line_generator, chunk_size=60000, token_class=CoNLLUP_Token, comment_str="#")
+	# A set gives constant-time membership tests in the loop below; the file is closed right after reading.
+	with open(CASES) as cases_file:
+		special_cases = {int(line) for line in cases_file.read().splitlines()}
+	checked_cases = []
+	
+	lemmatizer = GermaLemma()
+	
+	for ix, sent in enumerate(conll_sents):
+		if ix not in special_cases:
+			continue
+		for tok in sent.tokens:
+			old_word_change = new_to_old.get(tok.word)
+			if not old_word_change:
+				continue
+			try:
+				old_lemma = lemmatizer.find_lemma(old_word_change, tok.pos_tag)
+			except Exception:
+				# Best effort: any lemmatizer failure is recorded as unknown. Unlike a bare
+				# "except:", this still lets KeyboardInterrupt/SystemExit through.
+				old_lemma = f"UNK_{tok.pos_tag}"
+			checked_cases.append((old_word_change, tok.word, old_lemma, tok.lemma))
+	
+	print(f"Cases checked: {len(checked_cases)}")
+	case_count = Counter(checked_cases).most_common()
+	fu.counter_to_file(case_count, "/home/daza/datasets/TIGER_conll/TigerLemmas_Old_New.tsv")
+	
\ No newline at end of file

diff --git a/systems/parse_spacy.py b/systems/parse_spacy.py
index 75a1eb5..1a32b67 100644
--- a/systems/parse_spacy.py
+++ b/systems/parse_spacy.py

@@ -18,8 +18,9 @@
 		return Doc(self.vocab, words=words, spaces=spaces)
 
 
-def get_conll_str(spacy_doc, use_germalemma):
-	conll_lines = [] # We want: [ID, FORM, LEMMA, UPOS, XPOS, FEATS, HEAD, DEPREL, DEPS, MISC]
+def get_conll_str(anno_obj, spacy_doc, use_germalemma):
+	#  First lines are comments. (metadata)
+	conll_lines = anno_obj.metadata # Then we want: [ID, FORM, LEMMA, UPOS, XPOS, FEATS, HEAD, DEPREL, DEPS, MISC]
 	for ix, token in enumerate(spacy_doc):
 		if use_germalemma == "True":
 			content = (str(ix), token.text, find_germalemma(token.text, token.tag_, token.lemma_), token.pos_, token.tag_, "_", "_", "_", "_", "_")
@@ -49,15 +50,30 @@
 if __name__ == "__main__":
 	"""
 		EXAMPLE:
+		--- TIGER Classic Orthography ---
 			python systems/parse_spacy.py --corpus_name Tiger \
 				-i /home/daza/datasets/TIGER_conll/tiger_release_aug07.corrected.16012013.conll09 \
 				-o /home/daza/datasets/TIGER_conll/tiger_spacy_parsed.conllu \
 				-t /home/daza/datasets/TIGER_conll/tiger_all.txt
-			
+		
+		--- TIGER New Orthography ---
+			python systems/parse_spacy.py --corpus_name TigerNew --gld_token_type CoNLLUP_Token \
+				-i /home/daza/datasets/TIGER_conll/Tiger.NewOrth.train.conll \
+				-o /home/daza/datasets/TIGER_conll/Tiger.NewOrth.train.spacy_parsed.conllu \
+				-t /home/daza/datasets/TIGER_conll/Tiger.NewOrth.train.txt
+		
+		--- German GSD Universal Deps ---
 			python systems/parse_spacy.py --corpus_name DE_GSD --gld_token_type CoNLLUP_Token \
 				-i /home/daza/datasets/ud-treebanks-v2.2/UD_German-GSD/de_gsd-ud-test.conllu \
 				-o /home/daza/datasets/ud-treebanks-v2.2/UD_German-GSD/de_gsd-ud-test.parsed.germalemma.conllu \
-				-t/home/daza/datasets/ud-treebanks-v2.2/UD_German-GSD/de_gsd-ud-test.txt
+				-t /home/daza/datasets/ud-treebanks-v2.2/UD_German-GSD/de_gsd-ud-test.txt
+				
+			
+		--- Real Data TEST  ---
+		time python systems/parse_spacy.py --corpus_name DeReKo_a00 --gld_token_type CoNLLUP_Token --comment_str "#" \
+			-i /export/netapp/kupietz/N-GRAMM-STUDIE/conllu/a00.conllu.gz \
+			-o /export/netapp/kupietz/N-GRAMM-STUDIE/conllu/0_SpaCyParsed/a00.spacy.gl.conllu
+			
 	"""
 	
 	parser = argparse.ArgumentParser()
@@ -71,7 +87,9 @@
 	args = parser.parse_args()
 	
 	file_has_next, chunk_ix = True, 0
-	CHUNK_SIZE = 10000
+	CHUNK_SIZE = 100000
+	SPACY_BATCH = 10000
+	SPACY_PROC = 50
 	
 	# =====================================================================================
 	#                    LOGGING INFO ...
@@ -91,16 +109,22 @@
 	lemmatizer = GermaLemma()
 	if args.text_file: write_plain = open(args.text_file, "w")
 	
+	if ".gz" == args.input_file[-3:]:
+		in_file = fu.expand_file(args.input_file)
+	else:
+		in_file = args.input_file
+	
 	start = time.time()
 	total_processed_sents = 0
-	line_generator = fu.file_generator(args.input_file)
+	line_generator = fu.file_generator(in_file)
 	while file_has_next:
-		sents, gld, file_has_next = fu.get_file_text_chunk(line_generator, chunk_size=CHUNK_SIZE, token_class=get_token_type(args.gld_token_type), comment_str=args.comment_str)
-		if len(sents) == 0: break
-		total_processed_sents += len(sents)
+		annos, file_has_next = fu.get_file_annos_chunk(line_generator, chunk_size=CHUNK_SIZE, 		token_class=get_token_type(args.gld_token_type), comment_str=args.comment_str)
+		if len(annos) == 0: break
+		total_processed_sents += len(annos)
 		logger.info(f"Already processed {total_processed_sents} sentences...")
-		for doc in spacy_de.pipe(sents, batch_size=1000, n_process=10):
-			conll_str = get_conll_str(doc, use_germalemma=args.use_germalemma)
+		sents = [a.get_sentence() for a in annos]
+		for ix, doc in enumerate(spacy_de.pipe(sents, batch_size=SPACY_BATCH, n_process=SPACY_PROC)):
+			conll_str = get_conll_str(annos[ix], doc, use_germalemma=args.use_germalemma)
 			write_out.write(conll_str)
 			write_out.write("\n\n")
 			if args.text_file:
commit	8534747e42fc3043891197994809c2af962bbaf7	[log] [tgz]
author	daza <daza@uni-heidelberg.de>	Mon Nov 23 18:43:33 2020 +0100
committer	daza <daza@uni-heidelberg.de>	Mon Nov 23 18:43:33 2020 +0100
tree	7cecac06979ebfbe07b167076fef5b24eeaed75c
parent	e3bc92ec638020eddd0d0ce53cb73b0763dee76c [diff]