import argparse
import contextlib
import json
import time

import my_utils.file_utils as fu
from lib.CoNLL_Annotation import get_token_type

# Number of sentences requested from the input file per batch.
CHUNK_SIZE = 60000


def sentence_to_spacy_doc(doc_id, anno):
    """Build the spaCy-JSON document dict for one annotated sentence.

    Args:
        doc_id: Integer stored under the ``"id"`` key of the document.
        anno: Annotation object exposing ``tokens`` (an iterable of objects
            with ``word`` and ``pos_tag`` attributes) and ``metadata``.

    Returns:
        A dict with one paragraph holding one sentence. The paragraph's
        ``"raw"`` text is the token words joined by single spaces.
    """
    # Single pass over anno.tokens, in case it is a one-shot iterable.
    token_objs = [
        {"id": ix, "orth": tok.word, "tag": tok.pos_tag}
        for ix, tok in enumerate(anno.tokens)
    ]
    raw_text = " ".join(entry["orth"] for entry in token_objs)
    return {
        "id": doc_id,
        "meta": anno.metadata,
        "paragraphs": [{
            "raw": raw_text,
            "sentences": [{
                "tokens": token_objs
            }]
        }]
    }


def main():
    r"""Convert a CoNLL corpus into a spaCy JSON file (plus optional plain text).

    The input is read in batches of ``CHUNK_SIZE`` sentences. All sentences
    are written to the output file as ONE JSON array, streamed document by
    document, so inputs larger than a single batch still yield valid JSON.
    Document ids run continuously over the whole corpus. If the input has no
    sentences, nothing is written and the output file stays empty.

    Usage examples:

    --- TIGER NEW Orthography ---
    python DeReKo/spacy_train/conll2spacy.py --corpus_name TigerNew --gld_token_type CoNLLUP_Token \
        -i /home/daza/datasets/TIGER_conll/data_splits/train/Tiger.NewOrth.train.conll \
        -o DeReKo/spacy_train/Tiger.NewOrth.train.json \
        -t DeReKo/spacy_train/Tiger.NewOrth.train.txt

    python DeReKo/spacy_train/conll2spacy.py --corpus_name TigerNew --gld_token_type CoNLLUP_Token \
        -i /home/daza/datasets/TIGER_conll/data_splits/test/Tiger.NewOrth.test.conll \
        -o DeReKo/spacy_train/Tiger.NewOrth.test.json \
        -t DeReKo/spacy_train/Tiger.NewOrth.test.txt

    --- TIGER NEW + OLD Orthography ---
    cat Tiger.OldOrth.train.conll Tiger.NewOrth.train.conll > Tiger.ALL.Orth.train.conll
    cat Tiger.OldOrth.test.conll Tiger.NewOrth.test.conll > Tiger.ALL.Orth.test.conll

    python DeReKo/spacy_train/conll2spacy.py --corpus_name TigerALL --gld_token_type CoNLLUP_Token \
        -i /home/daza/datasets/TIGER_conll/data_splits/train/Tiger.ALL.Orth.train.conll \
        -o /home/daza/datasets/TIGER_conll/data_splits/train/Tiger.ALL.Orth.train.json \
        -t /home/daza/datasets/TIGER_conll/data_splits/train/Tiger.ALL.Orth.train.txt

    python DeReKo/spacy_train/conll2spacy.py --corpus_name TigerALL --gld_token_type CoNLLUP_Token \
        -i /home/daza/datasets/TIGER_conll/data_splits/test/Tiger.ALL.Orth.test.conll \
        -o /home/daza/datasets/TIGER_conll/data_splits/test/Tiger.ALL.Orth.test.json \
        -t /home/daza/datasets/TIGER_conll/data_splits/test/Tiger.ALL.Orth.test.txt
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--input_file", help="Input Corpus", required=True)
    parser.add_argument("-n", "--corpus_name", help="Corpus Name", default="Corpus")
    parser.add_argument("-o", "--output_file", help="File where the Predictions will be saved", required=True)
    parser.add_argument("-t", "--text_file", help="Output Plain Text File", default=None)
    parser.add_argument("-gtt", "--gld_token_type", help="CoNLL Format of the Gold Data", default="CoNLL09_Token")
    parser.add_argument("-c", "--comment_str", help="CoNLL Format of comentaries inside the file", default="#")
    args = parser.parse_args()

    # ExitStack closes (and flushes) every opened output file, also on error,
    # while letting the plain-text output remain optional.
    with contextlib.ExitStack() as stack:
        json_out = stack.enter_context(open(args.output_file, "w"))
        text_out = None
        if args.text_file:
            text_out = stack.enter_context(open(args.text_file, "w"))

        # Gzipped inputs are expanded first; anything else is read as given.
        if args.input_file.endswith(".gz"):
            in_file = fu.expand_file(args.input_file)
        else:
            in_file = args.input_file

        start = time.time()
        # The token class does not change between batches: resolve it once.
        token_class = get_token_type(args.gld_token_type)
        line_generator = fu.file_generator(in_file)

        total_processed_sents = 0
        docs_written = 0
        file_has_next = True
        while file_has_next:
            annos, file_has_next = fu.get_file_annos_chunk(
                line_generator,
                chunk_size=CHUNK_SIZE,
                token_class=token_class,
                comment_str=args.comment_str,
            )
            if not annos:
                break
            total_processed_sents += len(annos)
            print(f"Already processed {total_processed_sents} sentences...")
            for anno in annos:
                doc = sentence_to_spacy_doc(docs_written, anno)
                # "[" before the first document and ", " before every later
                # one reproduces json.dumps(list_of_docs) byte for byte while
                # keeping a single array across all batches.
                json_out.write(("[" if docs_written == 0 else ", ") + json.dumps(doc))
                docs_written += 1
                if text_out is not None:
                    text_out.write(doc["paragraphs"][0]["raw"] + "\n")

        # Close the array only if one was opened, so an empty corpus still
        # produces an empty output file.
        if docs_written:
            json_out.write("]")

        end = time.time()

    print(f"Processing {args.corpus_name} took {(end - start)} seconds!")


if __name__ == "__main__":
    main()