Added training examples for SpaCy3
diff --git a/DeReKo/spacy_train/conll2spacy.py b/DeReKo/spacy_train/conll2spacy.py
new file mode 100644
index 0000000..f0d0d4c
--- /dev/null
+++ b/DeReKo/spacy_train/conll2spacy.py
@@ -0,0 +1,113 @@
+r"""Convert a CoNLL-style corpus into spaCy JSON training data.
+
+Each input sentence becomes one document holding a single paragraph with a
+single sentence; every token keeps only its index, surface form and POS tag.
+When a text file is requested, the sentences are also written there as plain
+text, one per line.
+
+--- TIGER New Orthography ---
+python DeReKo/spacy_train/conll2spacy.py --corpus_name TigerNew --gld_token_type CoNLLUP_Token \
+    -i /home/daza/datasets/TIGER_conll/data_splits/train/Tiger.NewOrth.train.conll \
+    -o DeReKo/spacy_train/Tiger.NewOrth.train.json \
+    -t DeReKo/spacy_train/Tiger.NewOrth.train.txt
+
+python DeReKo/spacy_train/conll2spacy.py --corpus_name TigerNew --gld_token_type CoNLLUP_Token \
+    -i /home/daza/datasets/TIGER_conll/data_splits/test/Tiger.NewOrth.test.conll \
+    -o DeReKo/spacy_train/Tiger.NewOrth.test.json \
+    -t DeReKo/spacy_train/Tiger.NewOrth.test.txt
+"""
+
+import argparse
+import json
+import time
+
+import my_utils.file_utils as fu
+from lib.CoNLL_Annotation import get_token_type
+
+# Number of sentences requested from the reader per chunk.
+CHUNK_SIZE = 60000
+
+
+def build_spacy_doc(doc_id, anno):
+    """Build the JSON-serializable document for one annotated sentence.
+
+    Args:
+        doc_id: Value stored under the document's "id" key.
+        anno: Sentence annotation exposing ``tokens`` (each with ``word`` and
+            ``pos_tag``) and ``metadata``.
+
+    Returns:
+        A ``(doc, raw_text)`` tuple, where ``raw_text`` is the sentence's
+        words joined by single spaces.
+    """
+    raw_text = " ".join(tok.word for tok in anno.tokens)
+    token_objs = [
+        {"id": ix, "orth": tok.word, "tag": tok.pos_tag}
+        for ix, tok in enumerate(anno.tokens)
+    ]
+    doc = {
+        "id": doc_id,
+        "meta": anno.metadata,
+        "paragraphs": [{"raw": raw_text, "sentences": [{"tokens": token_objs}]}],
+    }
+    return doc, raw_text
+
+
+def main():
+    """Parse the command line and convert the input corpus."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument("-i", "--input_file", help="Input Corpus", required=True)
+    parser.add_argument("-n", "--corpus_name", help="Corpus Name", default="Corpus")
+    parser.add_argument("-o", "--output_file", help="File where the Predictions will be saved", required=True)
+    parser.add_argument("-t", "--text_file", help="Output Plain Text File", default=None)
+    parser.add_argument("-gtt", "--gld_token_type", help="CoNLL Format of the Gold Data", default="CoNLL09_Token")
+    parser.add_argument("-c", "--comment_str", help="CoNLL Format of comentaries inside the file", default="#")
+    args = parser.parse_args()
+
+    if args.input_file.endswith(".gz"):
+        in_file = fu.expand_file(args.input_file)
+    else:
+        in_file = args.input_file
+
+    token_class = get_token_type(args.gld_token_type)
+    start = time.time()
+    total_processed_sents = 0
+    # Explicit UTF-8 so non-ASCII tokens never depend on the platform locale.
+    write_plain = open(args.text_file, "w", encoding="utf-8") if args.text_file else None
+    try:
+        with open(args.output_file, "w", encoding="utf-8") as write_out:
+            line_generator = fu.file_generator(in_file)
+            # Stream ONE JSON array across all chunks: dumping every chunk as
+            # its own list would yield "[...][...]", which is not valid JSON.
+            write_out.write("[")
+            file_has_next = True
+            while file_has_next:
+                annos, file_has_next = fu.get_file_annos_chunk(
+                    line_generator,
+                    chunk_size=CHUNK_SIZE,
+                    token_class=token_class,
+                    comment_str=args.comment_str,
+                )
+                if len(annos) == 0:
+                    break
+                for anno in annos:
+                    # The running count doubles as the id, so ids stay unique across chunks.
+                    doc, raw_text = build_spacy_doc(total_processed_sents, anno)
+                    if total_processed_sents > 0:
+                        write_out.write(", ")
+                    write_out.write(json.dumps(doc))
+                    if write_plain is not None:
+                        write_plain.write(raw_text + "\n")
+                    total_processed_sents += 1
+                print(f"Already processed {total_processed_sents} sentences...")
+            write_out.write("]")
+    finally:
+        if write_plain is not None:
+            write_plain.close()
+
+    end = time.time()
+    print(f"Processing {args.corpus_name} took {(end - start)} seconds!")
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file