Added training examples for SpaCy3
diff --git a/.gitignore b/.gitignore
index cdb2786..e6a1dca 100644
--- a/.gitignore
+++ b/.gitignore
@@ -107,6 +107,7 @@
.venv
env/
venv/
+venv-*
ENV/
env.bak/
venv.bak/
diff --git a/DeReKo/spacy_train/basic_config.cfg b/DeReKo/spacy_train/basic_config.cfg
new file mode 100644
index 0000000..e35d4e3
--- /dev/null
+++ b/DeReKo/spacy_train/basic_config.cfg
@@ -0,0 +1,81 @@
+[paths]
+train = "/home/daza/ids-projects/DeReKo/spacy_train/spacy_bin_corpora/Tiger.NewOrth.train.spacy"
+dev = "/home/daza/ids-projects/DeReKo/spacy_train/spacy_bin_corpora/Tiger.NewOrth.test.spacy"
+
+[system]
+gpu_allocator = "pytorch"
+
+
+[nlp]
+lang = "de"
+pipeline = ["transformer", "tagger"]
+tokenizer = {"@tokenizers": "spacy.Tokenizer.v1"}
+
+[components]
+
+[components.transformer]
+factory = "transformer"
+
+[components.transformer.model]
+@architectures = "spacy-transformers.TransformerModel.v1"
+name = "bert-base-german-cased"
+tokenizer_config = {"use_fast": true}
+
+[components.transformer.model.get_spans]
+@span_getters = "spacy-transformers.strided_spans.v1"
+window = 128
+stride = 96
+
+
+[components.tagger]
+factory = "tagger"
+
+[components.tagger.model]
+@architectures = "spacy.Tagger.v1"
+nO = null
+
+[components.tagger.model.tok2vec]
+@architectures = "spacy-transformers.TransformerListener.v1"
+grad_factor = 1.0
+
+[components.tagger.model.tok2vec.pooling]
+@layers = "reduce_mean.v1"
+
+
+
+[corpora]
+
+[corpora.train]
+@readers = "spacy.Corpus.v1"
+path = ${paths.train}
+max_length = 500
+
+[corpora.dev]
+@readers = "spacy.Corpus.v1"
+path = ${paths.dev}
+max_length = 0
+
+[training]
+accumulate_gradient = 3
+dev_corpus = "corpora.dev"
+train_corpus = "corpora.train"
+
+[training.optimizer]
+@optimizers = "Adam.v1"
+
+[training.optimizer.learn_rate]
+@schedules = "warmup_linear.v1"
+warmup_steps = 250
+total_steps = 20000
+initial_rate = 5e-5
+
+
+
+[training.batcher]
+@batchers = "spacy.batch_by_padded.v1"
+discard_oversize = true
+size = 2000
+buffer = 256
+
+[initialize]
+vectors = null
diff --git a/DeReKo/spacy_train/config.cfg b/DeReKo/spacy_train/config.cfg
new file mode 100644
index 0000000..d701932
--- /dev/null
+++ b/DeReKo/spacy_train/config.cfg
@@ -0,0 +1,123 @@
+[paths]
+train = "/home/daza/ids-projects/DeReKo/spacy_train/spacy_bin_corpora/Tiger.NewOrth.train.spacy"
+dev = "/home/daza/ids-projects/DeReKo/spacy_train/spacy_bin_corpora/Tiger.NewOrth.test.spacy"
+vectors = null
+init_tok2vec = null
+
+[system]
+gpu_allocator = "pytorch"
+seed = 0
+
+[nlp]
+lang = "de"
+pipeline = ["transformer","tagger"]
+tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}
+disabled = []
+before_creation = null
+after_creation = null
+after_pipeline_creation = null
+
+[components]
+
+[components.tagger]
+factory = "tagger"
+
+[components.tagger.model]
+@architectures = "spacy.Tagger.v1"
+nO = null
+
+[components.tagger.model.tok2vec]
+@architectures = "spacy-transformers.TransformerListener.v1"
+grad_factor = 1.0
+pooling = {"@layers":"reduce_mean.v1"}
+
+[components.transformer]
+factory = "transformer"
+max_batch_items = 4096
+set_extra_annotations = {"@annotation_setters":"spacy-transformers.null_annotation_setter.v1"}
+
+[components.transformer.model]
+@architectures = "spacy-transformers.TransformerModel.v1"
+name = "bert-base-german-cased"
+
+[components.transformer.model.get_spans]
+@span_getters = "spacy-transformers.strided_spans.v1"
+window = 128
+stride = 96
+
+[components.transformer.model.tokenizer_config]
+use_fast = true
+
+[corpora]
+
+[corpora.dev]
+@readers = "spacy.Corpus.v1"
+path = ${paths.dev}
+max_length = 0
+gold_preproc = false
+limit = 0
+augmenter = null
+
+[corpora.train]
+@readers = "spacy.Corpus.v1"
+path = ${paths.train}
+max_length = 500
+gold_preproc = false
+limit = 0
+augmenter = null
+
+[training]
+accumulate_gradient = 3
+dev_corpus = "corpora.dev"
+train_corpus = "corpora.train"
+seed = ${system.seed}
+gpu_allocator = ${system.gpu_allocator}
+dropout = 0.1
+patience = 1600
+max_epochs = 0
+max_steps = 20000
+eval_frequency = 200
+frozen_components = []
+before_to_disk = null
+
+[training.batcher]
+@batchers = "spacy.batch_by_padded.v1"
+discard_oversize = true
+size = 2000
+buffer = 256
+get_length = null
+
+[training.logger]
+@loggers = "spacy.ConsoleLogger.v1"
+progress_bar = false
+
+[training.optimizer]
+@optimizers = "Adam.v1"
+beta1 = 0.9
+beta2 = 0.999
+L2_is_weight_decay = true
+L2 = 0.01
+grad_clip = 1.0
+use_averages = false
+eps = 0.00000001
+
+[training.optimizer.learn_rate]
+@schedules = "warmup_linear.v1"
+warmup_steps = 250
+total_steps = 20000
+initial_rate = 0.00005
+
+[training.score_weights]
+tag_acc = 1.0
+
+[pretraining]
+
+[initialize]
+vectors = null
+init_tok2vec = ${paths.init_tok2vec}
+vocab_data = null
+lookups = null
+
+[initialize.components]
+
+[initialize.tokenizer]
\ No newline at end of file
diff --git a/DeReKo/spacy_train/conll2spacy.py b/DeReKo/spacy_train/conll2spacy.py
new file mode 100644
index 0000000..f0d0d4c
--- /dev/null
+++ b/DeReKo/spacy_train/conll2spacy.py
@@ -0,0 +1,70 @@
+import argparse, time, json
+import my_utils.file_utils as fu
+from lib.CoNLL_Annotation import get_token_type
+
+if __name__ == "__main__":
+    """
+    Convert a CoNLL corpus into a spaCy JSON training file (one document per sentence, tags only).
+
+    --- TIGER New Orthography ---
+    python DeReKo/spacy_train/conll2spacy.py --corpus_name TigerNew --gld_token_type CoNLLUP_Token \
+        -i /home/daza/datasets/TIGER_conll/data_splits/train/Tiger.NewOrth.train.conll \
+        -o DeReKo/spacy_train/Tiger.NewOrth.train.json \
+        -t DeReKo/spacy_train/Tiger.NewOrth.train.txt
+
+    python DeReKo/spacy_train/conll2spacy.py --corpus_name TigerNew --gld_token_type CoNLLUP_Token \
+        -i /home/daza/datasets/TIGER_conll/data_splits/test/Tiger.NewOrth.test.conll \
+        -o DeReKo/spacy_train/Tiger.NewOrth.test.json \
+        -t DeReKo/spacy_train/Tiger.NewOrth.test.txt
+
+    """
+    from contextlib import ExitStack
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument("-i", "--input_file", help="Input Corpus", required=True)
+    parser.add_argument("-n", "--corpus_name", help="Corpus Name", default="Corpus")
+    parser.add_argument("-o", "--output_file", help="File where the spaCy JSON will be saved", required=True)
+    parser.add_argument("-t", "--text_file", help="Output Plain Text File", default=None)
+    parser.add_argument("-gtt", "--gld_token_type", help="CoNLL Format of the Gold Data", default="CoNLL09_Token")
+    parser.add_argument("-c", "--comment_str", help="Prefix of the comment lines inside the file", default="#")
+    args = parser.parse_args()
+
+    CHUNK_SIZE = 60000  # chunk_size handed to get_file_annos_chunk on every read
+    token_class = get_token_type(args.gld_token_type)
+
+    if args.input_file.endswith(".gz"):
+        in_file = fu.expand_file(args.input_file)
+    else:
+        in_file = args.input_file
+
+    start = time.time()
+    total_processed_sents = 0
+    line_generator = fu.file_generator(in_file)
+    # ExitStack closes both outputs (the plain-text one is optional) even if conversion fails.
+    with ExitStack() as stack:
+        write_out = stack.enter_context(open(args.output_file, "w", encoding="utf8"))
+        write_plain = stack.enter_context(open(args.text_file, "w", encoding="utf8")) if args.text_file else None
+        # Stream the documents into ONE JSON array; dumping a separate array per chunk
+        # would leave an unparseable file as soon as the corpus spans several chunks.
+        write_out.write("[")
+        file_has_next = True
+        while file_has_next:
+            annos, file_has_next = fu.get_file_annos_chunk(line_generator, chunk_size=CHUNK_SIZE, token_class=token_class, comment_str=args.comment_str)
+            if len(annos) == 0: break
+            print(f"Already processed {total_processed_sents + len(annos)} sentences...")
+            for anno in annos:
+                token_objs = [{"id": ix, "orth": tok.word, "tag": tok.pos_tag} for ix, tok in enumerate(anno.tokens)]
+                plain_text_str = " ".join(tok.word for tok in anno.tokens)
+                sent_obj = {
+                    "id": total_processed_sents,  # running id, unique across chunks
+                    "meta": anno.metadata,
+                    "paragraphs": [{"raw": plain_text_str, "sentences": [{"tokens": token_objs}]}],
+                }
+                # ", " matches json.dumps' own item separator for a list.
+                write_out.write((", " if total_processed_sents else "") + json.dumps(sent_obj))
+                total_processed_sents += 1
+                if write_plain is not None:
+                    write_plain.write(plain_text_str + "\n")
+        write_out.write("]")
+    end = time.time()
+    print(f"Processing {args.corpus_name} took {(end - start)} seconds!")
\ No newline at end of file
diff --git a/DeReKo/spacy_train/custom_spacy_dereko.py b/DeReKo/spacy_train/custom_spacy_dereko.py
new file mode 100644
index 0000000..a4674dd
--- /dev/null
+++ b/DeReKo/spacy_train/custom_spacy_dereko.py
@@ -0,0 +1,17 @@
+import spacy
+
+# Disabled alternative: ORIG_DEREKO_VECS = "/export/netapp/kupietz/embeddings/dereko-2020-ii-alpha.all.txt.gz"
+
+# Made with command: python -m spacy init-model de de_fastext_vectors --vectors-loc dereko_vectors/cc.de.300.vec.gz
+SPACY_FAST_DE = "../../lib/de_fastext_vectors"
+
+VECTORS = SPACY_FAST_DE  # the model path passed to spacy.load below
+
+nlp_dereko = spacy.load(VECTORS)
+doc1 = nlp_dereko("`` Ross Perot wäre vielleicht ein prächtiger Diktator ''")  # reference sentence, compared against doc2 and doc3
+doc2 = nlp_dereko("Konzernchefs lehnen den Milliardär als US-Präsidenten ab")
+doc3 = nlp_dereko("Texaner gibt nur vage Auskunft über seine Wirtschaftspolitik")
+print(doc1.similarity(doc2))  # similarity of the reference sentence to doc2
+print(doc1.similarity(doc3))  # similarity of the reference sentence to doc3
+
+
diff --git a/DeReKo/spacy_train/custom_spacy_tagger_2x.py b/DeReKo/spacy_train/custom_spacy_tagger_2x.py
new file mode 100644
index 0000000..5e529b5
--- /dev/null
+++ b/DeReKo/spacy_train/custom_spacy_tagger_2x.py
@@ -0,0 +1,100 @@
+#!/usr/bin/env python
+# coding: utf8
+"""
+A simple example for training a part-of-speech tagger with a custom tag map.
+To allow us to update the tag map with our custom one, this example starts off
+with a blank Language class and modifies its defaults. For more details, see
+the documentation:
+* Training: https://spacy.io/usage/training
+* POS Tagging: https://spacy.io/usage/linguistic-features#pos-tagging
+Compatible with: spaCy v2.0.0+
+Last tested with: v2.1.0
+"""
+from __future__ import unicode_literals, print_function
+
+import plac
+import random
+from pathlib import Path
+import spacy
+from spacy.util import minibatch, compounding
+
+
+# You need to define a mapping from your data's part-of-speech tag names to the
+# Universal Part-of-Speech tag set, as spaCy includes an enum of these tags.
+# See here for the Universal Tag Set:
+# http://universaldependencies.github.io/docs/u/pos/index.html
+# You may also specify morphological features for your tags, from the universal
+# scheme.
+TAG_MAP = {"N": {"pos": "NOUN"}, "V": {"pos": "VERB"}, "J": {"pos": "ADJ"}}
+
+# Usually you'll read this in, of course. Data formats vary. Ensure your
+# strings are unicode and that the number of tags assigned matches spaCy's
+# tokenization. If not, you can always add a 'words' key to the annotations
+# that specifies the gold-standard tokenization, e.g.:
+# ("Eatblueham", {'words': ['Eat', 'blue', 'ham'], 'tags': ['V', 'J', 'N']})
+TRAIN_DATA = [
+ ("I like green eggs", {"tags": ["N", "V", "J", "N"]}),
+ ("Eat blue ham", {"tags": ["V", "J", "N"]}),
+]
+
+
+@plac.annotations(
+    lang=("ISO Code of language to use", "option", "l", str),
+    output_dir=("Optional output directory", "option", "o", Path),
+    n_iter=("Number of training iterations", "option", "n", int),
+)
+def main(lang="en", output_dir=None, n_iter=25):
+    """Train a tagger on TRAIN_DATA with the custom TAG_MAP, starting from a
+    blank `lang` pipeline, for `n_iter` passes over the data. If `output_dir`
+    is given, the model is saved there and reloaded once as a sanity check.
+    """
+    nlp = spacy.blank(lang)
+    # add the tagger to the pipeline
+    # nlp.create_pipe works for built-ins that are registered with spaCy
+    tagger = nlp.create_pipe("tagger")
+    # Add the tags. This needs to be done before you start training.
+    for tag, values in TAG_MAP.items():
+        tagger.add_label(tag, values)
+    nlp.add_pipe(tagger)
+
+    optimizer = nlp.begin_training()
+    for _ in range(n_iter):
+        random.shuffle(TRAIN_DATA)
+        losses = {}
+        # batch up the examples using spaCy's minibatch
+        batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
+        for batch in batches:
+            texts, annotations = zip(*batch)
+            nlp.update(texts, annotations, sgd=optimizer, losses=losses)
+        print("Losses", losses)
+
+    # test the trained model
+    test_text = "I like blue eggs"
+    doc = nlp(test_text)
+    print("Tags", [(t.text, t.tag_, t.pos_) for t in doc])
+
+    # save model to output directory
+    if output_dir is not None:
+        output_dir = Path(output_dir)
+        # parents/exist_ok: nested or pre-existing output paths must not abort the save
+        output_dir.mkdir(parents=True, exist_ok=True)
+        nlp.to_disk(output_dir)
+        print("Saved model to", output_dir)
+
+        # test the saved model
+        print("Loading from", output_dir)
+        nlp2 = spacy.load(output_dir)
+        doc = nlp2(test_text)
+        print("Tags", [(t.text, t.tag_, t.pos_) for t in doc])
+
+
+if __name__ == "__main__":
+ plac.call(main)
+
+ # Expected output:
+ # [
+ # ('I', 'N', 'NOUN'),
+ # ('like', 'V', 'VERB'),
+ # ('blue', 'J', 'ADJ'),
+ # ('eggs', 'N', 'NOUN')
+ # ]
\ No newline at end of file
diff --git a/DeReKo/spacy_train/custom_spacy_tagger_3x.py b/DeReKo/spacy_train/custom_spacy_tagger_3x.py
new file mode 100644
index 0000000..45a76c9
--- /dev/null
+++ b/DeReKo/spacy_train/custom_spacy_tagger_3x.py
@@ -0,0 +1,4 @@
+import spacy
+nlp = spacy.load("de_dep_news_trf")  # load the pipeline named "de_dep_news_trf"
+doc = nlp("Das ist ein Satz.")  # run it on a single German sentence
+print(doc)  # print the result
\ No newline at end of file
diff --git a/DeReKo/spacy_train/spacy_bin_corpora/Tiger.NewOrth.test.spacy b/DeReKo/spacy_train/spacy_bin_corpora/Tiger.NewOrth.test.spacy
new file mode 100644
index 0000000..de11ccc
--- /dev/null
+++ b/DeReKo/spacy_train/spacy_bin_corpora/Tiger.NewOrth.test.spacy
Binary files differ
diff --git a/DeReKo/spacy_train/spacy_bin_corpora/Tiger.NewOrth.train.spacy b/DeReKo/spacy_train/spacy_bin_corpora/Tiger.NewOrth.train.spacy
new file mode 100644
index 0000000..857239d
--- /dev/null
+++ b/DeReKo/spacy_train/spacy_bin_corpora/Tiger.NewOrth.train.spacy
Binary files differ
diff --git a/my_utils/file_utils.py b/my_utils/file_utils.py
index e63ddca..1865fc5 100644
--- a/my_utils/file_utils.py
+++ b/my_utils/file_utils.py
@@ -1,20 +1,42 @@
import requests, logging, json
+import subprocess, time
+import glob, logging
+import os.path, sys
from lib.CoNLL_Annotation import read_conll, read_conll_generator
logger = logging.getLogger(__name__)
-def dict_to_file(my_dict, out_path):
+def list_to_file(my_list, out_path):  # write every item of my_list to out_path, one item per line
 with open(out_path, "w") as out:
- out.write(json.dump(my_dict))
+ for item_str in my_list:
+ out.write(f"{item_str}\n")  # f-string formatting, so non-string items are written via their str() form
+
+def counter_to_file(my_counter, out_path):
+    """Write one TSV row per (item, count) pair of `my_counter`: the item's fields, then the count."""
+    with open(out_path, "w", encoding='utf8') as out:  # explicit UTF-8, like dict_to_file/write_conll_file
+        for item, count in my_counter:
+            out.write("\t".join(item) + f"\t{count}\n")
+
+def dict_to_file(my_dict, out_path):  # dump my_dict as UTF-8 JSON, non-ASCII characters kept verbatim
+    with open(out_path, mode="w", encoding="utf8") as handle:
+        handle.write(json.dumps(my_dict, ensure_ascii=False))
 def file_to_dict(file_path):
 d = {}
 with open(file_path) as f:
- d = f.load(f)
+ d = json.load(f)  # parse the whole file as JSON
 return d
+def write_conll_file(conll_objs, out_path):
+    """Write every token's CoNLL-U line to `out_path`, with a blank line closing each sentence."""
+    with open(out_path, "w", encoding='utf8') as out:
+        for sentence in conll_objs:
+            out.writelines(tok.get_conllU_line() + "\n" for tok in sentence.tokens)
+            out.write("\n")
+
+
def file_generator(file_path):
with open(file_path, "r") as data_file:
logger.info("Reading instances from lines in file at: %s", file_path)
@@ -23,7 +45,16 @@
yield line
+def get_file_annos_chunk(line_generator, chunk_size, token_class, comment_str="###C:"):
+    """Read the next chunk via read_conll and return (annotations, file_has_next)."""
+    chunk, n_sents = read_conll(line_generator, chunk_size, token_class, comment_str=comment_str)
+    # read_conll reporting zero sentences is taken as "nothing left to read".
+    file_has_next = n_sents != 0
+    return chunk, file_has_next
+
+
def get_file_text_chunk(line_generator, chunk_size, token_class, comment_str="###C:"):
+ """ Same as get_file_annos_chunk but directly get (text, labels) pairs"""
file_has_next = True
chunk, n_sents = read_conll(line_generator, chunk_size, token_class, comment_str=comment_str)
if n_sents == 0: file_has_next = False
@@ -64,3 +95,30 @@
fout = open(fname, "w")
fout.write(response_str)
fout.close()
+
+
+def expand_file(f, substitute_comment=False):
+    """Gunzip `f` to the same path minus ".gz" (skipped if present); return the expanded path.
+    With `substitute_comment`, write `<expanded>.fixed` where a line-initial "# " becomes
+    "###C: " and return that path instead. Raises RuntimeError when gunzip/sed fails."""
+    fname = f[:-3]
+    if not os.path.isfile(fname):
+        with open(fname, "wb") as out:  # argument list, no shell: safe for paths with spaces
+            p = subprocess.call(["gunzip", "-c", f], stdout=out)
+        if p != 0:
+            os.remove(fname)  # a partial file would make the next run skip expansion
+            logger.error(f"Couldn't expand file {f}")
+            raise RuntimeError(f"Couldn't expand file {f}")
+        logger.info("Successfully uncompressed file")
+    else:
+        logger.info(f"File {fname} is already uncompressed. Skipping this step...")
+    if not substitute_comment:
+        return fname
+    fixed_filename = f"{fname}.fixed"
+    with open(fixed_filename, "wb") as out:  # handle is closed again, unlike a bare open() passed as stdout
+        p = subprocess.call(["sed", "s/^# /###C: /g", fname], stdout=out)
+    if p != 0:
+        logger.error("Something went wrong when substituting commentaries")
+        raise RuntimeError("Something went wrong when substituting commentaries")
+    logger.info("Successfully fixed comments on file")
+    return fixed_filename
\ No newline at end of file
diff --git a/my_utils/make_tiger_new_orth.py b/my_utils/make_tiger_new_orth.py
new file mode 100644
index 0000000..8886f74
--- /dev/null
+++ b/my_utils/make_tiger_new_orth.py
@@ -0,0 +1,78 @@
+import my_utils.file_utils as fu
+from lib.CoNLL_Annotation import read_conll, CoNLL09_Token, TigerNew_Token
+from collections import Counter
+
+ORIGINAL_TIGER = "/home/daza/datasets/TIGER_conll/tiger_release_aug07.corrected.16012013.conll09"
+NEW_ORTH = "/vol/work/kupietz/Tiger_2_2/data/german/tiger/train/german_tiger_new_orthography.csv"
+
+
+def get_confident_mapping(common_counts):
+    """Map each old token to the new spelling with the highest count.
+
+    `common_counts` holds ((old, new), count) pairs; on equal counts the first pair seen wins.
+    """
+    best = {}
+    for (old, new), cnt in common_counts:
+        if old not in best or cnt > best[old][1]:
+            best[old] = (new, cnt)
+    return {old: new for old, (new, _) in best.items()}
+
+
+def check_orthography(s_old,s_new):
+    """Append differing token pairs whose lower-cased first characters match to the global token_changes; return True if none."""
+    global total_tokens
+    old_words, new_words = s_old.get_words(), s_new.get_words()
+    assert len(old_words) == len(new_words)
+    total_tokens += len(old_words)
+    unchanged = True
+    for old, new in zip(old_words, new_words):
+        if old == new or old[0].lower() != new[0].lower(): continue
+        token_changes.append((old, new))
+        unchanged = False
+    return unchanged
+
+
+if __name__ == "__main__":  # pair old/new orthography sentences by token count and write the train/test splits
+    line_generator = fu.file_generator(ORIGINAL_TIGER)
+    original_sents, _ = read_conll(line_generator, chunk_size=60000, token_class=CoNLL09_Token, comment_str="#")
+
+    line_generator = fu.file_generator(NEW_ORTH)
+    new_orth_sents, _ = read_conll(line_generator, chunk_size=60000, token_class=TigerNew_Token, comment_str="#")
+
+    new_ix = 0
+    train_tiger, test_tiger = [], []
+    problematic_sents, token_changes = [], []
+    total_tokens = 0
+    for s1 in original_sents:
+        if new_ix >= len(new_orth_sents):
+            test_tiger.append(s1)  # new-orthography file exhausted: the rest has no counterpart
+            continue
+        s2 = new_orth_sents[new_ix]
+        print(f"--- {new_ix} ---\n{s1.get_sentence()}\n{s2.get_sentence()}\n\n")
+        if len(s1.get_words()) == len(s2.get_words()):
+            train_tiger.append((s1,s2))
+            if not check_orthography(s1,s2):
+                problematic_sents.append(new_ix)
+            new_ix += 1  # only a matched pair consumes a new-orthography sentence
+        else:
+            test_tiger.append(s1)
+
+    # Print Stats
+    print(len(train_tiger), len(test_tiger), len(new_orth_sents), sep="\n")
+    print(f"{len(problematic_sents)}/{len(train_tiger)} ({len(problematic_sents)*100/len(train_tiger)}%) of sentences have change of orthography.")
+    print(f"{len(token_changes)}/{total_tokens} ({len(token_changes)*100/total_tokens}%) of tokens have change of orthography.")
+    # Save Files
+    tiger_path = "/home/daza/datasets/TIGER_conll/"
+    new_cases = Counter(token_changes).most_common()
+    case_mapping = get_confident_mapping(new_cases)
+    # Stats
+    fu.counter_to_file(new_cases, f"{tiger_path}/TigerTokensChangeOrth.train.tsv")
+    fu.dict_to_file(case_mapping, f"{tiger_path}/TigerOrthMapping.train.json")
+    fu.list_to_file(problematic_sents, f"{tiger_path}/NewOrthProblems_Indices.train.txt")
+    # Train/Test Splits
+    old_train, new_train = zip(*train_tiger)
+    fu.write_conll_file(old_train, out_path=f"{tiger_path}/Tiger.OldOrth.train.conll")
+    fu.write_conll_file(new_train, out_path=f"{tiger_path}/Tiger.NewOrth.train.conll")
+    fu.write_conll_file(test_tiger, out_path=f"{tiger_path}/Tiger.OldOrth.test.conll")
+
+
diff --git a/systems/eval_old_vs_new_tiger.py b/systems/eval_old_vs_new_tiger.py
new file mode 100644
index 0000000..38dc597
--- /dev/null
+++ b/systems/eval_old_vs_new_tiger.py
@@ -0,0 +1,35 @@
+import my_utils.file_utils as fu
+from lib.CoNLL_Annotation import read_conll, CoNLLUP_Token
+from collections import Counter
+from germalemma import GermaLemma
+
+SPACY_NEW = "/home/daza/datasets/TIGER_conll/Tiger.NewOrth.train.spacy_parsed.conllu"  # CoNLL input read with read_conll in __main__
+CASES = "/home/daza/datasets/TIGER_conll/NewOrthProblems_Indices.train.txt"  # one integer sentence index per line (parsed with int() in __main__)
+
+orth_dict = fu.file_to_dict("/vol/netapp/daza/datasets/TIGER_conll/TigerOrthMapping.train.json")  # loaded at import time; presumably old -> new spellings (TODO confirm)
+new_to_old = {v:k for k,v in orth_dict.items()}  # inverted mapping; when values repeat, the last key seen wins
+
+
+if __name__ == "__main__":  # collect (old word, new word, old lemma, parsed lemma) tuples for the flagged sentences
+    line_generator = fu.file_generator(SPACY_NEW)
+    conll_sents, _ = read_conll(line_generator, chunk_size=60000, token_class=CoNLLUP_Token, comment_str="#")
+    with open(CASES) as cases_file:  # closed deterministically instead of leaking the handle
+        special_cases = {int(line) for line in cases_file.read().splitlines()}  # set: O(1) lookups in the loop
+    checked_cases = []
+    lemmatizer = GermaLemma()
+
+    for ix, sent in enumerate(conll_sents):
+        if ix not in special_cases: continue
+        for tok in sent.tokens:
+            old_word_change = new_to_old.get(tok.word)
+            if not old_word_change: continue
+            try:
+                old_lemma = lemmatizer.find_lemma(old_word_change, tok.pos_tag)
+            except Exception:  # not a bare except: Ctrl-C / SystemExit must still stop the script
+                old_lemma = f"UNK_{tok.pos_tag}"
+            checked_cases.append((old_word_change, tok.word, old_lemma, tok.lemma))
+
+    print(f"Cases checked: {len(checked_cases)}")
+    case_count = Counter(checked_cases).most_common()
+    fu.counter_to_file(case_count, "/home/daza/datasets/TIGER_conll/TigerLemmas_Old_New.tsv")
+
\ No newline at end of file
diff --git a/systems/parse_spacy.py b/systems/parse_spacy.py
index 75a1eb5..1a32b67 100644
--- a/systems/parse_spacy.py
+++ b/systems/parse_spacy.py
@@ -18,8 +18,9 @@
return Doc(self.vocab, words=words, spaces=spaces)
-def get_conll_str(spacy_doc, use_germalemma):
- conll_lines = [] # We want: [ID, FORM, LEMMA, UPOS, XPOS, FEATS, HEAD, DEPREL, DEPS, MISC]
+def get_conll_str(anno_obj, spacy_doc, use_germalemma):
+ # First lines are comments. (metadata)
+ conll_lines = anno_obj.metadata # Then we want: [ID, FORM, LEMMA, UPOS, XPOS, FEATS, HEAD, DEPREL, DEPS, MISC]
for ix, token in enumerate(spacy_doc):
if use_germalemma == "True":
content = (str(ix), token.text, find_germalemma(token.text, token.tag_, token.lemma_), token.pos_, token.tag_, "_", "_", "_", "_", "_")
@@ -49,15 +50,30 @@
if __name__ == "__main__":
"""
EXAMPLE:
+ --- TIGER Classic Orthography ---
python systems/parse_spacy.py --corpus_name Tiger \
-i /home/daza/datasets/TIGER_conll/tiger_release_aug07.corrected.16012013.conll09 \
-o /home/daza/datasets/TIGER_conll/tiger_spacy_parsed.conllu \
-t /home/daza/datasets/TIGER_conll/tiger_all.txt
-
+
+ --- TIGER New Orthography ---
+ python systems/parse_spacy.py --corpus_name TigerNew --gld_token_type CoNLLUP_Token \
+ -i /home/daza/datasets/TIGER_conll/Tiger.NewOrth.train.conll \
+ -o /home/daza/datasets/TIGER_conll/Tiger.NewOrth.train.spacy_parsed.conllu \
+ -t /home/daza/datasets/TIGER_conll/Tiger.NewOrth.train.txt
+
+ --- German GSD Universal Deps ---
python systems/parse_spacy.py --corpus_name DE_GSD --gld_token_type CoNLLUP_Token \
-i /home/daza/datasets/ud-treebanks-v2.2/UD_German-GSD/de_gsd-ud-test.conllu \
-o /home/daza/datasets/ud-treebanks-v2.2/UD_German-GSD/de_gsd-ud-test.parsed.germalemma.conllu \
- -t/home/daza/datasets/ud-treebanks-v2.2/UD_German-GSD/de_gsd-ud-test.txt
+ -t /home/daza/datasets/ud-treebanks-v2.2/UD_German-GSD/de_gsd-ud-test.txt
+
+
+ --- Real Data TEST ---
+ time python systems/parse_spacy.py --corpus_name DeReKo_a00 --gld_token_type CoNLLUP_Token --comment_str "#" \
+ -i /export/netapp/kupietz/N-GRAMM-STUDIE/conllu/a00.conllu.gz \
+ -o /export/netapp/kupietz/N-GRAMM-STUDIE/conllu/0_SpaCyParsed/a00.spacy.gl.conllu
+
"""
parser = argparse.ArgumentParser()
@@ -71,7 +87,9 @@
args = parser.parse_args()
file_has_next, chunk_ix = True, 0
- CHUNK_SIZE = 10000
+ CHUNK_SIZE = 100000
+ SPACY_BATCH = 10000
+ SPACY_PROC = 50
# =====================================================================================
# LOGGING INFO ...
@@ -91,16 +109,22 @@
lemmatizer = GermaLemma()
if args.text_file: write_plain = open(args.text_file, "w")
+ if ".gz" == args.input_file[-3:]:
+ in_file = fu.expand_file(args.input_file)
+ else:
+ in_file = args.input_file
+
start = time.time()
total_processed_sents = 0
- line_generator = fu.file_generator(args.input_file)
+ line_generator = fu.file_generator(in_file)
while file_has_next:
- sents, gld, file_has_next = fu.get_file_text_chunk(line_generator, chunk_size=CHUNK_SIZE, token_class=get_token_type(args.gld_token_type), comment_str=args.comment_str)
- if len(sents) == 0: break
- total_processed_sents += len(sents)
+ annos, file_has_next = fu.get_file_annos_chunk(line_generator, chunk_size=CHUNK_SIZE, token_class=get_token_type(args.gld_token_type), comment_str=args.comment_str)
+ if len(annos) == 0: break
+ total_processed_sents += len(annos)
logger.info(f"Already processed {total_processed_sents} sentences...")
- for doc in spacy_de.pipe(sents, batch_size=1000, n_process=10):
- conll_str = get_conll_str(doc, use_germalemma=args.use_germalemma)
+ sents = [a.get_sentence() for a in annos]
+ for ix, doc in enumerate(spacy_de.pipe(sents, batch_size=SPACY_BATCH, n_process=SPACY_PROC)):
+ conll_str = get_conll_str(annos[ix], doc, use_germalemma=args.use_germalemma)
write_out.write(conll_str)
write_out.write("\n\n")
if args.text_file: