Added training examples for SpaCy3

commit: 8534747e42fc3043891197994809c2af962bbaf7 [log] [tgz]
author: daza <daza@uni-heidelberg.de> Mon Nov 23 18:43:33 2020 +0100
committer: daza <daza@uni-heidelberg.de> Mon Nov 23 18:43:33 2020 +0100
tree: 7cecac06979ebfbe07b167076fef5b24eeaed75c
parent: e3bc92ec638020eddd0d0ce53cb73b0763dee76c [diff] [blame]
diff --git a/DeReKo/spacy_train/conll2spacy.py b/DeReKo/spacy_train/conll2spacy.py
new file mode 100644
index 0000000..f0d0d4c
--- /dev/null
+++ b/DeReKo/spacy_train/conll2spacy.py

@@ -0,0 +1,70 @@
+import argparse, time, json
+import my_utils.file_utils as fu
+from lib.CoNLL_Annotation import get_token_type
+
+if __name__ == "__main__":
+	"""
+		--- TIGER New Orthography ---
+			python DeReKo/spacy_train/conll2spacy.py --corpus_name TigerNew --gld_token_type CoNLLUP_Token \
+				-i /home/daza/datasets/TIGER_conll/data_splits/train/Tiger.NewOrth.train.conll \
+				-o DeReKo/spacy_train/Tiger.NewOrth.train.json \
+				-t DeReKo/spacy_train/Tiger.NewOrth.train.txt
+			
+			python DeReKo/spacy_train/conll2spacy.py --corpus_name TigerNew --gld_token_type CoNLLUP_Token \
+			-i /home/daza/datasets/TIGER_conll/data_splits/test/Tiger.NewOrth.test.conll \
+			-o DeReKo/spacy_train/Tiger.NewOrth.test.json \
+			-t DeReKo/spacy_train/Tiger.NewOrth.test.txt
+			
+	"""
+	
+	parser = argparse.ArgumentParser()
+	parser.add_argument("-i", "--input_file", help="Input Corpus", required=True)
+	parser.add_argument("-n", "--corpus_name", help="Corpus Name", default="Corpus")
+	parser.add_argument("-o", "--output_file", help="File where the Predictions will be saved", required=True)
+	parser.add_argument("-t", "--text_file", help="Output Plain Text File", default=None)
+	parser.add_argument("-gtt", "--gld_token_type", help="CoNLL Format of the Gold Data", default="CoNLL09_Token")
+	parser.add_argument("-c", "--comment_str", help="CoNLL Format of comentaries inside the file", default="#")
+	args = parser.parse_args()
+	
+	file_has_next, chunk_ix = True, 0
+	CHUNK_SIZE = 60000
+	
+	write_out = open(args.output_file, "w")
+	if args.text_file: write_plain = open(args.text_file, "w")
+	
+	if ".gz" == args.input_file[-3:]:
+		in_file = fu.expand_file(args.input_file)
+	else:
+		in_file = args.input_file
+	
+	start = time.time()
+	total_processed_sents = 0
+	line_generator = fu.file_generator(in_file)
+	while file_has_next:
+		annos, file_has_next = fu.get_file_annos_chunk(line_generator, chunk_size=CHUNK_SIZE, 		token_class=get_token_type(args.gld_token_type), comment_str=args.comment_str)
+		if len(annos) == 0: break
+		total_processed_sents += len(annos)
+		print(f"Already processed {total_processed_sents} sentences...")
+		spacy_docs = []
+		for anno_id, anno in enumerate(annos):
+			plain_text, token_objs = [], []
+			for ix, tok in enumerate(anno.tokens):
+				token_objs.append({"id": ix, "orth":tok.word, "tag": tok.pos_tag})
+				plain_text.append(tok.word)
+			plain_text_str = " ".join(plain_text)
+			sent_obj = {
+				"id": anno_id,
+				"meta": anno.metadata,
+				"paragraphs": [{
+					"raw": plain_text_str,
+					"sentences": [{
+						"tokens": token_objs
+					}]	
+				}]
+			}
+			spacy_docs.append(sent_obj)
+			if args.text_file:
+				write_plain.write(plain_text_str + "\n")
+		write_out.write(json.dumps(spacy_docs))
+	end = time.time()
+	print(f"Processing {args.corpus_name} took {(end - start)} seconds!")
\ No newline at end of file
commit	8534747e42fc3043891197994809c2af962bbaf7	[log] [tgz]
author	daza <daza@uni-heidelberg.de>	Mon Nov 23 18:43:33 2020 +0100
committer	daza <daza@uni-heidelberg.de>	Mon Nov 23 18:43:33 2020 +0100
tree	7cecac06979ebfbe07b167076fef5b24eeaed75c
parent	e3bc92ec638020eddd0d0ce53cb73b0763dee76c [diff] [blame]