import argparse
import contextlib
import json
import time

import my_utils.file_utils as fu
from lib.CoNLL_Annotation import get_token_type

# Number of sentences requested from the annotation reader per chunk.
CHUNK_SIZE = 60000


def build_spacy_doc(anno, doc_id):
    """Build the JSON-serializable document dict for one annotated sentence.

    Args:
        anno: Sentence annotation exposing ``tokens`` (an iterable of objects
            with ``word`` and ``pos_tag`` attributes) and ``metadata``.
        doc_id: Value stored under the document's ``"id"`` key.

    Returns:
        A ``(doc, raw_text)`` tuple. ``doc`` is the nested
        id / meta / paragraphs / sentences / tokens dict (presumably spaCy's
        JSON training layout -- confirm against the consumer) and
        ``raw_text`` is the sentence's words joined by single spaces.
    """
    token_objs = [
        {"id": ix, "orth": tok.word, "tag": tok.pos_tag}
        for ix, tok in enumerate(anno.tokens)
    ]
    raw_text = " ".join(tok.word for tok in anno.tokens)
    doc = {
        "id": doc_id,
        "meta": anno.metadata,
        "paragraphs": [{
            "raw": raw_text,
            "sentences": [{
                "tokens": token_objs
            }]
        }]
    }
    return doc, raw_text


def main():
    r"""Convert a CoNLL corpus into a JSON document list plus optional plain text.

    The input is read in chunks of ``CHUNK_SIZE`` sentences. All documents are
    written to the output file as ONE JSON array (also when the corpus spans
    several chunks), and document ids keep counting across chunks. When a
    text file is given, each sentence is additionally written to it as one
    space-joined line.

    Usage examples:

        --- TIGER NEW Orthography ---
        python DeReKo/spacy_train/conll2spacy.py --corpus_name TigerNew --gld_token_type CoNLLUP_Token \
            -i /home/daza/datasets/TIGER_conll/data_splits/train/Tiger.NewOrth.train.conll \
            -o DeReKo/spacy_train/Tiger.NewOrth.train.json \
            -t DeReKo/spacy_train/Tiger.NewOrth.train.txt

        python DeReKo/spacy_train/conll2spacy.py --corpus_name TigerNew --gld_token_type CoNLLUP_Token \
            -i /home/daza/datasets/TIGER_conll/data_splits/test/Tiger.NewOrth.test.conll \
            -o DeReKo/spacy_train/Tiger.NewOrth.test.json \
            -t DeReKo/spacy_train/Tiger.NewOrth.test.txt

        --- TIGER NEW + OLD Orthography ---
        cat Tiger.OldOrth.train.conll Tiger.NewOrth.train.conll > Tiger.ALL.Orth.train.conll
        cat Tiger.OldOrth.test.conll Tiger.NewOrth.test.conll > Tiger.ALL.Orth.test.conll

        python DeReKo/spacy_train/conll2spacy.py --corpus_name TigerALL --gld_token_type CoNLLUP_Token \
            -i /home/daza/datasets/TIGER_conll/data_splits/train/Tiger.ALL.Orth.train.conll \
            -o /home/daza/datasets/TIGER_conll/data_splits/train/Tiger.ALL.Orth.train.json \
            -t /home/daza/datasets/TIGER_conll/data_splits/train/Tiger.ALL.Orth.train.txt

        python DeReKo/spacy_train/conll2spacy.py --corpus_name TigerALL --gld_token_type CoNLLUP_Token \
            -i /home/daza/datasets/TIGER_conll/data_splits/test/Tiger.ALL.Orth.test.conll \
            -o /home/daza/datasets/TIGER_conll/data_splits/test/Tiger.ALL.Orth.test.json \
            -t /home/daza/datasets/TIGER_conll/data_splits/test/Tiger.ALL.Orth.test.txt
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--input_file", help="Input Corpus", required=True)
    parser.add_argument("-n", "--corpus_name", help="Corpus Name", default="Corpus")
    parser.add_argument("-o", "--output_file", help="File where the Predictions will be saved", required=True)
    parser.add_argument("-t", "--text_file", help="Output Plain Text File", default=None)
    parser.add_argument("-gtt", "--gld_token_type", help="CoNLL Format of the Gold Data", default="CoNLL09_Token")
    parser.add_argument("-c", "--comment_str", help="CoNLL Format of comentaries inside the file", default="#")
    args = parser.parse_args()

    # The ExitStack closes (and therefore flushes) both outputs even if the
    # conversion fails half-way; the plain-text file is only opened on request.
    with contextlib.ExitStack() as stack:
        write_out = stack.enter_context(open(args.output_file, "w"))
        write_plain = None
        if args.text_file:
            write_plain = stack.enter_context(open(args.text_file, "w"))

        # Gzipped corpora go through the project's expand helper first.
        if args.input_file.endswith(".gz"):
            in_file = fu.expand_file(args.input_file)
        else:
            in_file = args.input_file

        # Loop-invariant: resolve the token class once instead of per chunk.
        token_class = get_token_type(args.gld_token_type)

        start = time.time()
        total_processed_sents = 0
        line_generator = fu.file_generator(in_file)

        # Stream a single JSON array. Dumping one array per chunk would
        # concatenate "[...][...]", which is not a valid JSON document.
        write_out.write("[")
        file_has_next = True
        while file_has_next:
            annos, file_has_next = fu.get_file_annos_chunk(
                line_generator,
                chunk_size=CHUNK_SIZE,
                token_class=token_class,
                comment_str=args.comment_str,
            )
            if len(annos) == 0:
                break
            # Ids continue from the previous chunk so they stay unique.
            first_id = total_processed_sents
            total_processed_sents += len(annos)
            print(f"Already processed {total_processed_sents} sentences...")

            serialized, plain_lines = [], []
            for doc_id, anno in enumerate(annos, start=first_id):
                doc, raw_text = build_spacy_doc(anno, doc_id)
                serialized.append(json.dumps(doc))
                plain_lines.append(raw_text + "\n")

            # ", " is json.dumps' default item separator, so a single-chunk
            # corpus yields exactly the bytes json.dumps(list_of_docs) would.
            if first_id:
                write_out.write(", ")
            write_out.write(", ".join(serialized))
            if write_plain is not None:
                write_plain.writelines(plain_lines)
        write_out.write("]")

    end = time.time()
    print(f"Processing {args.corpus_name} took {(end - start)} seconds!")


if __name__ == "__main__":
    main()