import argparse, time, json
import my_utils.file_utils as fu
from lib.CoNLL_Annotation import get_token_type

if __name__ == "__main__":
    """
    --- TIGER NEW Orthography ---
    python DeReKo/spacy_train/conll2spacy.py --corpus_name TigerNew --gld_token_type CoNLLUP_Token \
        -i /home/daza/datasets/TIGER_conll/data_splits/train/Tiger.NewOrth.train.conll \
        -o DeReKo/spacy_train/Tiger.NewOrth.train.json \
        -t DeReKo/spacy_train/Tiger.NewOrth.train.txt

    python DeReKo/spacy_train/conll2spacy.py --corpus_name TigerNew --gld_token_type CoNLLUP_Token \
        -i /home/daza/datasets/TIGER_conll/data_splits/test/Tiger.NewOrth.test.conll \
        -o DeReKo/spacy_train/Tiger.NewOrth.test.json \
        -t DeReKo/spacy_train/Tiger.NewOrth.test.txt

    --- TIGER NEW + OLD Orthography ---
    cat Tiger.OldOrth.train.conll Tiger.NewOrth.train.conll > Tiger.ALL.Orth.train.conll
    cat Tiger.OldOrth.test.conll Tiger.NewOrth.test.conll > Tiger.ALL.Orth.test.conll

    python DeReKo/spacy_train/conll2spacy.py --corpus_name TigerALL --gld_token_type CoNLLUP_Token \
        -i /home/daza/datasets/TIGER_conll/data_splits/train/Tiger.ALL.Orth.train.conll \
        -o /home/daza/datasets/TIGER_conll/data_splits/train/Tiger.ALL.Orth.train.json \
        -t /home/daza/datasets/TIGER_conll/data_splits/train/Tiger.ALL.Orth.train.txt

    python DeReKo/spacy_train/conll2spacy.py --corpus_name TigerALL --gld_token_type CoNLLUP_Token \
        -i /home/daza/datasets/TIGER_conll/data_splits/test/Tiger.ALL.Orth.test.conll \
        -o /home/daza/datasets/TIGER_conll/data_splits/test/Tiger.ALL.Orth.test.json \
        -t /home/daza/datasets/TIGER_conll/data_splits/test/Tiger.ALL.Orth.test.txt
    """
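
    # The output targets the spaCy v2-style JSON training format: one JSON
    # array of documents, each holding "paragraphs" -> "sentences" -> token
    # dicts, exactly as built in sent_obj below. A minimal sketch of one
    # emitted document (token values are made-up illustration; TIGER uses
    # STTS part-of-speech tags):
    #
    # {"id": 0,
    #  "meta": {...},
    #  "paragraphs": [{"raw": "Der Hund schläft",
    #                  "sentences": [{"tokens": [
    #                      {"id": 0, "orth": "Der", "tag": "ART"},
    #                      {"id": 1, "orth": "Hund", "tag": "NN"},
    #                      {"id": 2, "orth": "schläft", "tag": "VVFIN"}]}]}]}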

    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--input_file", help="Input Corpus", required=True)
    parser.add_argument("-n", "--corpus_name", help="Corpus Name", default="Corpus")
    parser.add_argument("-o", "--output_file", help="File where the converted spaCy JSON will be saved", required=True)
    parser.add_argument("-t", "--text_file", help="Output Plain Text File", default=None)
    parser.add_argument("-gtt", "--gld_token_type", help="CoNLL Format of the Gold Data", default="CoNLL09_Token")
    parser.add_argument("-c", "--comment_str", help="Comment marker used inside the CoNLL file", default="#")
    args = parser.parse_args()
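
    # Known token classes for --gld_token_type include CoNLL09_Token (the
    # default) and CoNLLUP_Token (used in the TIGER examples above); the
    # string is resolved by lib.CoNLL_Annotation.get_token_type below.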

    file_has_next = True
    CHUNK_SIZE = 60000
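    # The my_utils.file_utils helpers are defined elsewhere in the project:
    # file_generator presumably yields the input line by line, and
    # get_file_annos_chunk collects up to CHUNK_SIZE annotated sentences per
    # call, so arbitrarily large corpora never sit in memory as a whole.
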
    write_out = open(args.output_file, "w")
    write_plain = open(args.text_file, "w") if args.text_file else None

    # Transparently expand gzipped corpora before streaming them.
    if args.input_file.endswith(".gz"):
        in_file = fu.expand_file(args.input_file)
    else:
        in_file = args.input_file

    start = time.time()
    total_processed_sents = 0
    line_generator = fu.file_generator(in_file)
    spacy_docs = []  # accumulated across chunks so the output is a single valid JSON array
    while file_has_next:
        annos, file_has_next = fu.get_file_annos_chunk(line_generator, chunk_size=CHUNK_SIZE,
                                                       token_class=get_token_type(args.gld_token_type),
                                                       comment_str=args.comment_str)
        if len(annos) == 0:
            break
        total_processed_sents += len(annos)
        print(f"Processed {total_processed_sents} sentences so far...")
        # Offset ids by the number of docs already collected so that document
        # ids stay unique across chunks.
        for anno_id, anno in enumerate(annos, start=len(spacy_docs)):
            plain_text, token_objs = [], []
            for ix, tok in enumerate(anno.tokens):
                token_objs.append({"id": ix, "orth": tok.word, "tag": tok.pos_tag})
                plain_text.append(tok.word)
            plain_text_str = " ".join(plain_text)
            sent_obj = {
                "id": anno_id,
                "meta": anno.metadata,
                "paragraphs": [{
                    "raw": plain_text_str,
                    "sentences": [{
                        "tokens": token_objs
                    }]
                }]
            }
            spacy_docs.append(sent_obj)
            if args.text_file:
                write_plain.write(plain_text_str + "\n")

    # Dump once, after the loop: calling json.dumps() per chunk would
    # concatenate several JSON arrays into one file, which is not valid JSON.
    write_out.write(json.dumps(spacy_docs))
    write_out.close()
    if write_plain:
        write_plain.close()

    end = time.time()
    print(f"Processing {args.corpus_name} took {end - start:.2f} seconds!")
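
# A typical downstream call, assuming spaCy v2.x (hypothetical output path;
# note that spaCy v3 dropped this JSON format in favor of binary .spacy files):
#
#   python -m spacy train de ./tagger_model \
#       DeReKo/spacy_train/Tiger.NewOrth.train.json \
#       DeReKo/spacy_train/Tiger.NewOrth.test.json \
#       --pipeline tagger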