upgrading repo to latest version

commit: d7d707559a6ae5568b76ea2533a8ab382a42e6b4 [log] [tgz]
author: daza <daza@uni-heidelberg.de> Tue Jan 12 18:17:49 2021 +0100
committer: daza <daza@uni-heidelberg.de> Tue Jan 12 18:17:49 2021 +0100
tree: fecbb1a3e4ccbf14f3703d357f16b4bbd6e438f7
parent: a63ab50a0c9bb7cd7da32c725f34f1e2752e5be6 [diff] [blame]
diff --git a/systems/parse_spacy.py b/systems/parse_spacy.py
index 1a32b67..3c56233 100644
--- a/systems/parse_spacy.py
+++ b/systems/parse_spacy.py

@@ -51,26 +51,34 @@
 	"""
 		EXAMPLE:
 		--- TIGER Classic Orthography ---
-			python systems/parse_spacy.py --corpus_name Tiger \
+			python systems/parse_spacy.py --corpus_name Tiger --gld_token_type CoNLL09_Token \
 				-i /home/daza/datasets/TIGER_conll/tiger_release_aug07.corrected.16012013.conll09 \
 				-o /home/daza/datasets/TIGER_conll/tiger_spacy_parsed.conllu \
 				-t /home/daza/datasets/TIGER_conll/tiger_all.txt
+			
+			python systems/parse_spacy.py --corpus_name TigerOld_test \
+			-i /home/daza/datasets/TIGER_conll/data_splits/test/Tiger.OldOrth.test.conll \
+			-o /home/daza/datasets/TIGER_conll/tiger_spacy_parsed.test.conllu
 		
 		--- TIGER New Orthography ---
-			python systems/parse_spacy.py --corpus_name TigerNew --gld_token_type CoNLLUP_Token \
+			python systems/parse_spacy.py --corpus_name TigerNew \
 				-i /home/daza/datasets/TIGER_conll/Tiger.NewOrth.train.conll \
 				-o /home/daza/datasets/TIGER_conll/Tiger.NewOrth.train.spacy_parsed.conllu \
 				-t /home/daza/datasets/TIGER_conll/Tiger.NewOrth.train.txt
+			
+			python systems/parse_spacy.py --corpus_name TigerNew_test \
+			-i /home/daza/datasets/TIGER_conll/data_splits/test/Tiger.NewOrth.test.conll \
+			-o /home/daza/datasets/TIGER_conll/Tiger.NewOrth.test.spacy_parsed.conllu
 		
 		--- German GSD Universal Deps ---
-			python systems/parse_spacy.py --corpus_name DE_GSD --gld_token_type CoNLLUP_Token \
+			python systems/parse_spacy.py --corpus_name DE_GSD \
 				-i /home/daza/datasets/ud-treebanks-v2.2/UD_German-GSD/de_gsd-ud-test.conllu \
 				-o /home/daza/datasets/ud-treebanks-v2.2/UD_German-GSD/de_gsd-ud-test.parsed.germalemma.conllu \
 				-t /home/daza/datasets/ud-treebanks-v2.2/UD_German-GSD/de_gsd-ud-test.txt
 				
 			
 		--- Real Data TEST  ---
-		time python systems/parse_spacy.py --corpus_name DeReKo_a00 --gld_token_type CoNLLUP_Token --comment_str "#" \
+		time python systems/parse_spacy.py --corpus_name DeReKo_a00 --comment_str "#" \
 			-i /export/netapp/kupietz/N-GRAMM-STUDIE/conllu/a00.conllu.gz \
 			-o /export/netapp/kupietz/N-GRAMM-STUDIE/conllu/0_SpaCyParsed/a00.spacy.gl.conllu
 			
@@ -81,15 +89,16 @@
 	parser.add_argument("-n", "--corpus_name", help="Corpus Name", default="Corpus")
 	parser.add_argument("-o", "--output_file", help="File where the Predictions will be saved", required=True)
 	parser.add_argument("-t", "--text_file", help="Output Plain Text File", default=None)
-	parser.add_argument("-gtt", "--gld_token_type", help="CoNLL Format of the Gold Data", default="CoNLL09_Token")
+	parser.add_argument("-sm", "--spacy_model", help="Spacy model containing the pipeline to tag", default="de_core_news_lg")
+	parser.add_argument("-gtt", "--gld_token_type", help="CoNLL Format of the Gold Data", default="CoNLLUP_Token")
 	parser.add_argument("-ugl", "--use_germalemma", help="Use Germalemma lemmatizer on top of SpaCy", default="True")
 	parser.add_argument("-c", "--comment_str", help="CoNLL Format of comentaries inside the file", default="#")
 	args = parser.parse_args()
 	
 	file_has_next, chunk_ix = True, 0
-	CHUNK_SIZE = 100000
-	SPACY_BATCH = 10000
-	SPACY_PROC = 50
+	CHUNK_SIZE = 20000
+	SPACY_BATCH = 2000
+	SPACY_PROC = 10
 	
 	# =====================================================================================
 	#                    LOGGING INFO ...
@@ -103,7 +112,7 @@
 	# =====================================================================================
 	#                    POS TAG DOCUMENTS
 	# =====================================================================================
-	spacy_de = spacy.load("de_core_news_lg", disable=["ner", "parser"])
+	spacy_de = spacy.load(args.spacy_model, disable=["ner", "parser"])
 	spacy_de.tokenizer = WhitespaceTokenizer(spacy_de.vocab) # We won't re-tokenize to respect how the source CoNLL are tokenized!
 	write_out = open(args.output_file, "w")
 	lemmatizer = GermaLemma()
commit	d7d707559a6ae5568b76ea2533a8ab382a42e6b4	[log] [tgz]
author	daza <daza@uni-heidelberg.de>	Tue Jan 12 18:17:49 2021 +0100
committer	daza <daza@uni-heidelberg.de>	Tue Jan 12 18:17:49 2021 +0100
tree	fecbb1a3e4ccbf14f3703d357f16b4bbd6e438f7
parent	a63ab50a0c9bb7cd7da32c725f34f1e2752e5be6 [diff] [blame]