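Upgrade parse_spacy.py: add --spacy_model option, default --gld_token_type to CoNLLUP_Token, reduce chunk/batch/process sizes, and add test-split usage examples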
diff --git a/systems/parse_spacy.py b/systems/parse_spacy.py
index 1a32b67..3c56233 100644
--- a/systems/parse_spacy.py
+++ b/systems/parse_spacy.py
@@ -51,26 +51,34 @@
"""
EXAMPLE:
--- TIGER Classic Orthography ---
- python systems/parse_spacy.py --corpus_name Tiger \
+ python systems/parse_spacy.py --corpus_name Tiger --gld_token_type CoNLL09_Token \
-i /home/daza/datasets/TIGER_conll/tiger_release_aug07.corrected.16012013.conll09 \
-o /home/daza/datasets/TIGER_conll/tiger_spacy_parsed.conllu \
-t /home/daza/datasets/TIGER_conll/tiger_all.txt
+
+ python systems/parse_spacy.py --corpus_name TigerOld_test \
+ -i /home/daza/datasets/TIGER_conll/data_splits/test/Tiger.OldOrth.test.conll \
+ -o /home/daza/datasets/TIGER_conll/tiger_spacy_parsed.test.conllu
--- TIGER New Orthography ---
- python systems/parse_spacy.py --corpus_name TigerNew --gld_token_type CoNLLUP_Token \
+ python systems/parse_spacy.py --corpus_name TigerNew \
-i /home/daza/datasets/TIGER_conll/Tiger.NewOrth.train.conll \
-o /home/daza/datasets/TIGER_conll/Tiger.NewOrth.train.spacy_parsed.conllu \
-t /home/daza/datasets/TIGER_conll/Tiger.NewOrth.train.txt
+
+ python systems/parse_spacy.py --corpus_name TigerNew_test \
+ -i /home/daza/datasets/TIGER_conll/data_splits/test/Tiger.NewOrth.test.conll \
+ -o /home/daza/datasets/TIGER_conll/Tiger.NewOrth.test.spacy_parsed.conllu
--- German GSD Universal Deps ---
- python systems/parse_spacy.py --corpus_name DE_GSD --gld_token_type CoNLLUP_Token \
+ python systems/parse_spacy.py --corpus_name DE_GSD \
-i /home/daza/datasets/ud-treebanks-v2.2/UD_German-GSD/de_gsd-ud-test.conllu \
-o /home/daza/datasets/ud-treebanks-v2.2/UD_German-GSD/de_gsd-ud-test.parsed.germalemma.conllu \
-t /home/daza/datasets/ud-treebanks-v2.2/UD_German-GSD/de_gsd-ud-test.txt
--- Real Data TEST ---
- time python systems/parse_spacy.py --corpus_name DeReKo_a00 --gld_token_type CoNLLUP_Token --comment_str "#" \
+ time python systems/parse_spacy.py --corpus_name DeReKo_a00 --comment_str "#" \
-i /export/netapp/kupietz/N-GRAMM-STUDIE/conllu/a00.conllu.gz \
-o /export/netapp/kupietz/N-GRAMM-STUDIE/conllu/0_SpaCyParsed/a00.spacy.gl.conllu
@@ -81,15 +89,16 @@
parser.add_argument("-n", "--corpus_name", help="Corpus Name", default="Corpus")
parser.add_argument("-o", "--output_file", help="File where the Predictions will be saved", required=True)
parser.add_argument("-t", "--text_file", help="Output Plain Text File", default=None)
- parser.add_argument("-gtt", "--gld_token_type", help="CoNLL Format of the Gold Data", default="CoNLL09_Token")
+ parser.add_argument("-sm", "--spacy_model", help="Spacy model containing the pipeline to tag", default="de_core_news_lg")
+ parser.add_argument("-gtt", "--gld_token_type", help="CoNLL Format of the Gold Data", default="CoNLLUP_Token")
parser.add_argument("-ugl", "--use_germalemma", help="Use Germalemma lemmatizer on top of SpaCy", default="True")
parser.add_argument("-c", "--comment_str", help="CoNLL Format of comentaries inside the file", default="#")
args = parser.parse_args()
file_has_next, chunk_ix = True, 0
- CHUNK_SIZE = 100000
- SPACY_BATCH = 10000
- SPACY_PROC = 50
+ CHUNK_SIZE = 20000
+ SPACY_BATCH = 2000
+ SPACY_PROC = 10
# =====================================================================================
# LOGGING INFO ...
@@ -103,7 +112,7 @@
# =====================================================================================
# POS TAG DOCUMENTS
# =====================================================================================
- spacy_de = spacy.load("de_core_news_lg", disable=["ner", "parser"])
+ spacy_de = spacy.load(args.spacy_model, disable=["ner", "parser"])
spacy_de.tokenizer = WhitespaceTokenizer(spacy_de.vocab) # We won't re-tokenize to respect how the source CoNLL are tokenized!
write_out = open(args.output_file, "w")
lemmatizer = GermaLemma()