daza | e3bc92e | 2020-11-04 11:06:26 +0100 | [diff] [blame] | 1 | import requests, logging, json |
daza | 8534747 | 2020-11-23 18:43:33 +0100 | [diff] [blame] | 2 | import subprocess, time |
| 3 | import glob, logging |
| 4 | import os.path, sys |
daza | 0498a6a | 2020-10-06 12:03:12 +0200 | [diff] [blame] | 5 | from lib.CoNLL_Annotation import read_conll, read_conll_generator |
daza | 972aabc | 2020-09-01 16:41:30 +0200 | [diff] [blame] | 6 | |
| 7 | logger = logging.getLogger(__name__) |
| 8 | |
| 9 | |
daza | 8534747 | 2020-11-23 18:43:33 +0100 | [diff] [blame] | 10 | def list_to_file(my_list, out_path): |
daza | e3bc92e | 2020-11-04 11:06:26 +0100 | [diff] [blame] | 11 | with open(out_path, "w") as out: |
daza | 8534747 | 2020-11-23 18:43:33 +0100 | [diff] [blame] | 12 | for item_str in my_list: |
| 13 | out.write(f"{item_str}\n") |
| 14 | |
def counter_to_file(my_counter, out_path):
    """Write (item, count) pairs to *out_path* as tab-separated lines.

    *my_counter* must yield pairs — e.g. the output of Counter.most_common()
    — where each item is itself an iterable of strings.
    """
    with open(out_path, "w") as out:
        for item, count in my_counter:
            joined = "\t".join(item)
            out.write(f"{joined}\t{count}\n")
| 20 | |
def dict_to_file(my_dict, out_path):
    """Serialize *my_dict* as JSON to *out_path*, keeping non-ASCII characters unescaped."""
    with open(out_path, "w", encoding='utf8') as out:
        json.dump(my_dict, out, ensure_ascii=False)
daza | e3bc92e | 2020-11-04 11:06:26 +0100 | [diff] [blame] | 24 | |
daza | 54e072e | 2020-11-04 11:06:26 +0100 | [diff] [blame] | 25 | |
daza | e3bc92e | 2020-11-04 11:06:26 +0100 | [diff] [blame] | 26 | def file_to_dict(file_path): |
| 27 | d = {} |
| 28 | with open(file_path) as f: |
daza | 8534747 | 2020-11-23 18:43:33 +0100 | [diff] [blame] | 29 | d = json.load(f) |
daza | e3bc92e | 2020-11-04 11:06:26 +0100 | [diff] [blame] | 30 | return d |
| 31 | |
| 32 | |
daza | 8534747 | 2020-11-23 18:43:33 +0100 | [diff] [blame] | 33 | def write_conll_file(conll_objs, out_path): |
| 34 | with open(out_path, "w", encoding='utf8') as out: |
| 35 | for obj in conll_objs: |
| 36 | for tok in obj.tokens: |
| 37 | out.write(tok.get_conllU_line()+"\n") |
| 38 | out.write("\n") |
| 39 | |
daza | 972aabc | 2020-09-01 16:41:30 +0200 | [diff] [blame] | 40 | def file_generator(file_path): |
| 41 | with open(file_path, "r") as data_file: |
| 42 | logger.info("Reading instances from lines in file at: %s", file_path) |
| 43 | for line in data_file: |
| 44 | if not line: continue |
| 45 | yield line |
| 46 | |
| 47 | |
Marc Kupietz | a01314f | 2021-02-11 17:02:08 +0100 | [diff] [blame^] | 48 | def get_file_annos_chunk(line_generator, chunk_size, token_class, comment_str="###C:", our_foundry="spacy"): |
daza | 8534747 | 2020-11-23 18:43:33 +0100 | [diff] [blame] | 49 | file_has_next = True |
Marc Kupietz | a01314f | 2021-02-11 17:02:08 +0100 | [diff] [blame^] | 50 | chunk, n_sents = read_conll(line_generator, chunk_size, token_class, comment_str=comment_str, our_foundry=our_foundry) |
daza | 8534747 | 2020-11-23 18:43:33 +0100 | [diff] [blame] | 51 | if n_sents == 0: file_has_next = False |
| 52 | sents, gld, meta = [], [], [] |
| 53 | return chunk, file_has_next |
| 54 | |
| 55 | |
daza | e3bc92e | 2020-11-04 11:06:26 +0100 | [diff] [blame] | 56 | def get_file_text_chunk(line_generator, chunk_size, token_class, comment_str="###C:"): |
daza | 8534747 | 2020-11-23 18:43:33 +0100 | [diff] [blame] | 57 | """ Same as get_file_annos_chunk but directly get (text, labels) pairs""" |
daza | ff42f63 | 2020-10-08 14:46:32 +0200 | [diff] [blame] | 58 | file_has_next = True |
daza | e3bc92e | 2020-11-04 11:06:26 +0100 | [diff] [blame] | 59 | chunk, n_sents = read_conll(line_generator, chunk_size, token_class, comment_str=comment_str) |
daza | ff42f63 | 2020-10-08 14:46:32 +0200 | [diff] [blame] | 60 | if n_sents == 0: file_has_next = False |
| 61 | sents, gld, meta = [], [], [] |
| 62 | for anno in chunk: |
| 63 | if len(anno.metadata) > 0: meta.append("\n".join(anno.metadata)) |
| 64 | sents.append(anno.get_sentence()) |
| 65 | gld.append(anno.get_pos_tags()) |
| 66 | return sents, gld, file_has_next |
| 67 | |
| 68 | |
daza | e3bc92e | 2020-11-04 11:06:26 +0100 | [diff] [blame] | 69 | def get_file_chunk(line_generator, chunk_size, token_class, comment_str="###C:"): |
daza | 972aabc | 2020-09-01 16:41:30 +0200 | [diff] [blame] | 70 | file_has_next = True |
daza | e3bc92e | 2020-11-04 11:06:26 +0100 | [diff] [blame] | 71 | chunk, n_sents = read_conll(line_generator, chunk_size, token_class, comment_str=comment_str) |
| 72 | if n_sents < chunk_size: file_has_next = False |
daza | 972aabc | 2020-09-01 16:41:30 +0200 | [diff] [blame] | 73 | raw_text = "" |
| 74 | for anno in chunk: |
| 75 | if len(anno.metadata) > 0: |
| 76 | raw_text += "\n".join(anno.metadata) + "\n" |
| 77 | else: |
| 78 | raw_text += "\n" |
| 79 | for tok in anno.tokens: |
| 80 | raw_text += tok.get_conllU_line() + "\n" |
| 81 | raw_text += "\n" |
| 82 | return raw_text, file_has_next, n_sents |
| 83 | |
| 84 | |
def turku_parse_file(raw_text, filename, chunk_ix):
    """POST *raw_text* to the local Turku parser server and save the response.

    The parsed chunk is written to '<filename>.parsed.<chunk_ix>.conllu'.
    """
    # BUG FIX: the `filename` parameter was previously unused and the output
    # name contained a hard-coded placeholder, so chunks from different input
    # files would overwrite each other.
    out_file_str = f"{filename}.parsed.{chunk_ix}.conllu"
    # For each file make a request to obtain the parse back
    logger.info(f"Sending Request {chunk_ix} to Parser Server...")
    response = requests.post("http://localhost:7689/", data=raw_text.encode('utf-8'))
    response_to_file(response.text, out_file_str)
| 91 | |
| 92 | |
| 93 | |
def response_to_file(response_str, fname):
    """Write *response_str* to *fname*, overwriting any existing content."""
    # Use a context manager so the handle is closed even if write() raises
    # (the original used a bare open/write/close sequence). utf8 is made
    # explicit to match the other writers in this module.
    with open(fname, "w", encoding='utf8') as fout:
        fout.write(response_str)
daza | 8534747 | 2020-11-23 18:43:33 +0100 | [diff] [blame] | 98 | |
| 99 | |
def expand_file(f, substitute_comment=False):
    """Uncompress a .gz file next to the original and optionally fix comments.

    Parameters
    ----------
    f : str
        Path to a gzip-compressed file; assumed to end in ".gz".
    substitute_comment : bool
        When True, additionally rewrite leading "# " comment markers to
        "###C: " and return the path of the fixed copy.

    Returns the path of the uncompressed (and possibly comment-fixed) file.
    Raises Exception if either shell step exits non-zero.
    """
    # Drop the trailing ".gz" to obtain the uncompressed file name.
    fname = f[:-3]
    if not os.path.isfile(fname):
        # NOTE(review): the path is interpolated into a shell command, so
        # file names containing spaces or shell metacharacters will break
        # (or worse) — only call this with trusted paths.
        p = subprocess.call(f"gunzip -c {f} > {fname}", shell=True)
        if p == 0:
            logger.info("Successfully uncompressed file")
        else:
            logger.info(f"Couldn't expand file {f}")
            raise Exception(f"gunzip failed with exit code {p} for {f}")
    else:
        logger.info(f"File {fname} is already uncompressed. Skipping this step...")

    # Substitute the commentary lines on the expanded file.
    if substitute_comment:
        fixed_filename = f"{fname}.fixed"
        # Open stdout via a context manager: the original passed
        # `stdout=open(...)` inline and leaked the file handle.
        with open(fixed_filename, "w") as fixed_out:
            p = subprocess.call(f"sed 's/^# /###C: /g' {fname}", shell=True, stdout=fixed_out)
        if p == 0:
            logger.info("Successfully fixed comments on file")
        else:
            logger.info("Something went wrong when substituting commentaries")
            raise Exception(f"sed substitution failed with exit code {p} for {fname}")
        return fixed_filename
    else:
        return fname