blob: e63ddca1a22a49334caecb8c3044c4f5f7c0dda9 [file] [log] [blame]
daza54e072e2020-11-04 11:06:26 +01001import requests, logging, json
daza5cb357d2020-10-06 12:03:12 +02002from lib.CoNLL_Annotation import read_conll, read_conll_generator
dazae94ab182020-09-01 16:41:30 +02003
# Module-level logger, named after this module so that the application's
# logging configuration can filter or route its messages.
logger = logging.getLogger(__name__)
5
6
def dict_to_file(my_dict, out_path):
    """Serialize ``my_dict`` as JSON into the file at ``out_path``.

    The file is created, or truncated if it already exists.

    Args:
        my_dict: A JSON-serializable object (typically a dict).
        out_path: Path of the file to write.

    Raises:
        TypeError: If ``my_dict`` contains values that ``json`` cannot
            serialize.
        OSError: If the file cannot be opened for writing.
    """
    with open(out_path, "w", encoding="utf-8") as out:
        # json.dump streams straight into the file object; it requires the
        # file as its second argument and returns None, so its result must
        # not be handed to out.write().
        json.dump(my_dict, out)
10
def file_to_dict(file_path):
    """Load the JSON document stored at ``file_path`` and return it.

    Args:
        file_path: Path of a file containing a JSON document.

    Returns:
        The decoded JSON value (a dict when the document's top level is a
        JSON object).

    Raises:
        json.JSONDecodeError: If the file does not contain valid JSON.
        OSError: If the file cannot be opened for reading.
    """
    with open(file_path, encoding="utf-8") as f:
        # The parser lives in the json module; file objects have no
        # load() method of their own.
        return json.load(f)
16
17
def file_generator(file_path):
    """Lazily yield the lines of the text file at ``file_path``.

    Lines are yielded exactly as read, including their trailing newline.
    The file is opened (and the log message emitted) only when the first
    item is requested, and it is closed once the generator finishes.

    Args:
        file_path: Path of the text file to read.

    Yields:
        Each non-empty line of the file, in order.
    """
    with open(file_path, "r") as data_file:
        logger.info("Reading instances from lines in file at: %s", file_path)
        # filter(None, ...) drops only falsy (empty-string) items; a blank
        # line still carries its "\n" and is therefore passed through.
        yield from filter(None, data_file)
24
25
def get_file_text_chunk(line_generator, chunk_size, token_class, comment_str="###C:"):
    """Read the next chunk of annotations and return sentences and tags.

    The reading itself is delegated to ``read_conll``, which is given
    ``line_generator``, ``chunk_size``, ``token_class`` and ``comment_str``
    unchanged and returns the annotations plus how many were read.

    Args:
        line_generator: Source of input lines, forwarded to ``read_conll``.
        chunk_size: Chunk size forwarded to ``read_conll``.
        token_class: Token class forwarded to ``read_conll``.
        comment_str: Comment-line marker forwarded to ``read_conll``.

    Returns:
        A tuple ``(sents, gld, file_has_next)`` where ``sents`` holds
        ``anno.get_sentence()`` and ``gld`` holds ``anno.get_pos_tags()``
        for every annotation in the chunk (presumably the gold tags --
        confirm against callers), and ``file_has_next`` is ``False`` only
        when ``read_conll`` reported zero sentences.
    """
    chunk, n_sents = read_conll(line_generator, chunk_size, token_class, comment_str=comment_str)
    sents, gld = [], []
    # Single pass on purpose: `chunk` is not assumed to be re-iterable.
    for anno in chunk:
        sents.append(anno.get_sentence())
        gld.append(anno.get_pos_tags())
    # A short-but-non-empty chunk still reports True here; the caller only
    # learns the input is exhausted on the following (empty) read.
    file_has_next = n_sents != 0
    return sents, gld, file_has_next
36
37
def get_file_chunk(line_generator, chunk_size, token_class, comment_str="###C:"):
    """Read the next chunk of annotations and render it as one text block.

    The reading itself is delegated to ``read_conll``. Each annotation is
    rendered as its metadata lines (or a single empty line when it has
    none), followed by one ``get_conllU_line()`` line per token, followed
    by an empty line.

    Args:
        line_generator: Source of input lines, forwarded to ``read_conll``.
        chunk_size: Chunk size forwarded to ``read_conll``; also the
            threshold used to decide whether more input may follow.
        token_class: Token class forwarded to ``read_conll``.
        comment_str: Comment-line marker forwarded to ``read_conll``.

    Returns:
        A tuple ``(raw_text, file_has_next, n_sents)``. ``file_has_next``
        is ``False`` as soon as fewer than ``chunk_size`` sentences were
        read.
    """
    chunk, n_sents = read_conll(line_generator, chunk_size, token_class, comment_str=comment_str)

    pieces = []
    for anno in chunk:
        # Header: the metadata lines, or an empty line when there are none.
        header = "\n".join(anno.metadata) + "\n" if len(anno.metadata) > 0 else "\n"
        pieces.append(header)
        pieces.extend(tok.get_conllU_line() + "\n" for tok in anno.tokens)
        # Empty line closing this annotation.
        pieces.append("\n")

    file_has_next = n_sents >= chunk_size
    return "".join(pieces), file_has_next, n_sents
52
53
def turku_parse_file(raw_text, filename, chunk_ix, parser_url="http://localhost:7689/"):
    """POST one chunk of text to the parser server and save the reply.

    The response body is written to ``{filename}.parsed.{chunk_ix}.conllu``
    via ``response_to_file``.

    Args:
        raw_text: Text to send; it is UTF-8 encoded for the request body.
        filename: Prefix of the output file name.
        chunk_ix: Index of this chunk; used in the log message and in the
            output file name.
        parser_url: URL of the parser server. Defaults to the local
            endpoint that was previously hard-coded.

    Raises:
        requests.RequestException: If the HTTP request itself fails
            (e.g. the server is not reachable).
    """
    out_file_str = f"{filename}.parsed.{chunk_ix}.conllu"
    # For each chunk make a request to obtain the parse back
    logger.info("Sending Request %s to Parser Server...", chunk_ix)
    response = requests.post(parser_url, data=raw_text.encode("utf-8"))
    if not response.ok:
        # The body is still written below (unchanged behaviour), but an
        # error page saved as .conllu should not go unnoticed.
        logger.warning(
            "Parser Server answered request %s with HTTP status %s",
            chunk_ix,
            response.status_code,
        )
    response_to_file(response.text, out_file_str)
60
61
62
def response_to_file(response_str, fname):
    """Write ``response_str`` to the file ``fname``.

    The file is created, or truncated if it already exists.

    Args:
        response_str: Text to write.
        fname: Path of the output file.

    Raises:
        OSError: If the file cannot be opened or written.
    """
    # The context manager closes the handle even if write() raises. UTF-8
    # is explicit so non-ASCII text does not depend on the platform's
    # default encoding.
    with open(fname, "w", encoding="utf-8") as fout:
        fout.write(response_str)
66 fout.close()