daza | e3bc92e | 2020-11-04 11:06:26 +0100 | [diff] [blame^] | 1 | import requests, logging, json |
daza | 0498a6a | 2020-10-06 12:03:12 +0200 | [diff] [blame] | 2 | from lib.CoNLL_Annotation import read_conll, read_conll_generator |
daza | 972aabc | 2020-09-01 16:41:30 +0200 | [diff] [blame] | 3 | |
| 4 | logger = logging.getLogger(__name__) |
| 5 | |
| 6 | |
def dict_to_file(my_dict, out_path):
    """Serialize my_dict as JSON into the file at out_path (overwrites).

    Fix: json.dump(obj) has no file argument and returns None, so the
    original out.write(json.dump(my_dict)) could never produce JSON on
    disk. json.dump writes directly to the open file handle.
    """
    with open(out_path, "w") as out:
        json.dump(my_dict, out)
| 10 | |
def file_to_dict(file_path):
    """Load and return the JSON content of file_path as a dict.

    Fix: the original called f.load(f) — file objects have no .load();
    this was clearly meant to be json.load(f).
    """
    d = {}
    with open(file_path) as f:
        d = json.load(f)
    return d
| 16 | |
| 17 | |
def file_generator(file_path):
    """Lazily yield each non-empty line of the file at file_path."""
    with open(file_path, "r") as data_file:
        logger.info("Reading instances from lines in file at: %s", file_path)
        for line in data_file:
            if line:
                yield line
| 24 | |
| 25 | |
def get_file_text_chunk(line_generator, chunk_size, token_class, comment_str="###C:"):
    """Read up to chunk_size annotated sentences from line_generator.

    Returns (sentences, gold_pos_tags, file_has_next) where file_has_next
    is False only when read_conll produced zero sentences.
    """
    chunk, n_sents = read_conll(line_generator, chunk_size, token_class, comment_str=comment_str)
    file_has_next = n_sents != 0
    sents, gld, meta = [], [], []
    for anno in chunk:
        if len(anno.metadata) > 0:
            meta.append("\n".join(anno.metadata))
        sents.append(anno.get_sentence())
        gld.append(anno.get_pos_tags())
    # NOTE(review): meta is accumulated but never returned — kept as-is
    # to preserve the original behavior.
    return sents, gld, file_has_next
| 36 | |
| 37 | |
def get_file_chunk(line_generator, chunk_size, token_class, comment_str="###C:"):
    """Read up to chunk_size sentences and render them back as CoNLL-U text.

    Returns (raw_text, file_has_next, n_sents); file_has_next is False when
    read_conll returned fewer sentences than requested (i.e. EOF reached).
    """
    chunk, n_sents = read_conll(line_generator, chunk_size, token_class, comment_str=comment_str)
    file_has_next = not (n_sents < chunk_size)
    pieces = []
    for anno in chunk:
        # Metadata lines (comments) precede the token block; a lone "\n"
        # keeps the sentence separation when there is no metadata.
        if len(anno.metadata) > 0:
            pieces.append("\n".join(anno.metadata) + "\n")
        else:
            pieces.append("\n")
        for tok in anno.tokens:
            pieces.append(tok.get_conllU_line() + "\n")
        pieces.append("\n")
    return "".join(pieces), file_has_next, n_sents
| 52 | |
| 53 | |
def turku_parse_file(raw_text, filename, chunk_ix):
    """POST raw CoNLL-U text to the local Turku parser and save the parse.

    Fix: the filename parameter was unused and the output name was the
    hard-coded literal "(unknown)"; the output file is now derived from
    the source filename, as the parameter clearly intends.

    Args:
        raw_text: CoNLL-U formatted text to parse.
        filename: source file name used to derive the output file name.
        chunk_ix: index of this chunk, embedded in the output file name.
    """
    out_file_str = f"{filename}.parsed.{chunk_ix}.conllu"
    # For each file make a request to obtain the parse back
    logger.info(f"Sending Request {chunk_ix} to Parser Server...")
    response = requests.post("http://localhost:7689/", data=raw_text.encode('utf-8'))
    response_to_file(response.text, out_file_str)
| 60 | |
| 61 | |
| 62 | |
def response_to_file(response_str, fname):
    """Write response_str to the file fname, overwriting any existing content.

    Fix: use a context manager so the file handle is closed even if the
    write raises, instead of the bare open/write/close sequence.
    """
    with open(fname, "w") as fout:
        fout.write(response_str)