blob: 1865fc53d5cb8faefde844adb294addbfed9e51a [file] [log] [blame]
dazae3bc92e2020-11-04 11:06:26 +01001import requests, logging, json
daza85347472020-11-23 18:43:33 +01002import subprocess, time
3import glob, logging
4import os.path, sys
daza0498a6a2020-10-06 12:03:12 +02005from lib.CoNLL_Annotation import read_conll, read_conll_generator
daza972aabc2020-09-01 16:41:30 +02006
7logger = logging.getLogger(__name__)
8
9
def list_to_file(my_list, out_path):
    """Write every element of my_list to out_path, one element per line."""
    with open(out_path, "w") as out:
        out.writelines(f"{item_str}\n" for item_str in my_list)
14
def counter_to_file(my_counter, out_path):
    """Write (item, count) pairs to out_path, one pair per line, tab-separated.

    Accepts either a mapping such as collections.Counter (iterated via
    .items()) or an already materialized sequence of (item, count) pairs,
    e.g. the result of Counter.most_common().  Previously a raw Counter
    would be iterated directly, which yields only keys and fails to unpack.

    Each item must itself be an iterable of strings (e.g. an n-gram
    tuple); its parts are joined with tabs before the count is appended.
    """
    pairs = my_counter.items() if hasattr(my_counter, "items") else my_counter
    with open(out_path, "w") as out:
        for item, count in pairs:
            item_str = "\t".join(item)
            out.write(f"{item_str}\t{count}\n")
20
def dict_to_file(my_dict, out_path):
    """Serialize my_dict to out_path as JSON, keeping non-ASCII characters verbatim."""
    serialized = json.dumps(my_dict, ensure_ascii=False)
    with open(out_path, "w", encoding='utf8') as out:
        out.write(serialized)
dazae3bc92e2020-11-04 11:06:26 +010024
def file_to_dict(file_path):
    """Load and return the JSON object stored at file_path.

    Opens with UTF-8 explicitly to mirror dict_to_file, which writes
    UTF-8; relying on the platform default encoding could mis-decode
    non-ASCII payloads on some systems.  The redundant `d = {}`
    pre-initialization was removed.
    """
    with open(file_path, encoding='utf8') as f:
        return json.load(f)
30
31
def write_conll_file(conll_objs, out_path):
    """Write annotation objects to out_path in CoNLL-U format.

    Each object contributes one line per token (via tok.get_conllU_line())
    followed by a blank separator line after every sentence.
    """
    with open(out_path, "w", encoding='utf8') as out:
        for obj in conll_objs:
            out.writelines(tok.get_conllU_line() + "\n" for tok in obj.tokens)
            out.write("\n")
38
39
def file_generator(file_path):
    """Lazily yield the lines of file_path (newlines kept), skipping empty strings."""
    with open(file_path, "r") as data_file:
        logger.info("Reading instances from lines in file at: %s", file_path)
        for line in data_file:
            # Lines read from a file retain their trailing newline, so this
            # guard only filters a truly empty string.
            if line:
                yield line
46
47
def get_file_annos_chunk(line_generator, chunk_size, token_class, comment_str="###C:"):
    """Read up to chunk_size annotated sentences from line_generator.

    Args:
        line_generator: iterator of raw CoNLL lines (e.g. file_generator output).
        chunk_size: maximum number of sentences to pull per call.
        token_class: token type passed through to read_conll.
        comment_str: prefix that marks comment/metadata lines.

    Returns:
        (chunk, file_has_next): the parsed annotation objects, and False
        once read_conll produced zero sentences (generator exhausted).
        NOTE: unlike get_file_chunk, exhaustion is only detected on an
        empty read, so the final partial chunk still reports True.
    """
    # Removed dead assignment (`sents, gld, meta = [], [], []`) that was
    # never used by this function.
    chunk, n_sents = read_conll(line_generator, chunk_size, token_class, comment_str=comment_str)
    return chunk, n_sents > 0
54
55
def get_file_text_chunk(line_generator, chunk_size, token_class, comment_str="###C:"):
    """Same as get_file_annos_chunk but directly get (text, labels) pairs."""
    chunk, n_sents = read_conll(line_generator, chunk_size, token_class, comment_str=comment_str)
    sents, gld, meta = [], [], []
    for anno in chunk:
        if anno.metadata:
            meta.append("\n".join(anno.metadata))
        sents.append(anno.get_sentence())
        gld.append(anno.get_pos_tags())
    # meta is collected here (as in the original) but is not part of the
    # return value; only sentences and gold tags are handed back.
    return sents, gld, n_sents != 0
67
68
def get_file_chunk(line_generator, chunk_size, token_class, comment_str="###C:"):
    """Read up to chunk_size sentences and render them back as raw CoNLL-U text.

    Args:
        line_generator: iterator of raw CoNLL lines.
        chunk_size: maximum number of sentences per chunk.
        token_class: token type passed through to read_conll.
        comment_str: prefix that marks comment/metadata lines.

    Returns:
        (raw_text, file_has_next, n_sents): the serialized chunk, whether a
        full chunk was read (a short chunk means the input is exhausted),
        and the number of sentences actually read.
    """
    chunk, n_sents = read_conll(line_generator, chunk_size, token_class, comment_str=comment_str)
    file_has_next = n_sents >= chunk_size
    # Accumulate pieces in a list and join once: repeated `raw_text += ...`
    # is quadratic in the size of the chunk.
    parts = []
    for anno in chunk:
        if len(anno.metadata) > 0:
            parts.append("\n".join(anno.metadata) + "\n")
        else:
            parts.append("\n")
        for tok in anno.tokens:
            parts.append(tok.get_conllU_line() + "\n")
        parts.append("\n")
    return "".join(parts), file_has_next, n_sents
83
84
def turku_parse_file(raw_text, filename, chunk_ix):
    """POST raw_text to the local Turku parser server and save the parsed chunk.

    The server response is written to "<filename>.parsed.<chunk_ix>.conllu".

    Bug fix: the `filename` argument was previously ignored and a literal
    placeholder was interpolated into the output name, so chunks of
    different input files collided on the same output paths.
    """
    out_file_str = f"{filename}.parsed.{chunk_ix}.conllu"
    # For each file make a request to obtain the parse back
    logger.info(f"Sending Request {chunk_ix} to Parser Server...")
    response = requests.post("http://localhost:7689/", data=raw_text.encode('utf-8'))
    response_to_file(response.text, out_file_str)
91
92
93
def response_to_file(response_str, fname):
    """Write response_str to the file fname.

    Uses a context manager instead of the previous open/write/close triple
    so the file handle is released even if write() raises.
    """
    with open(fname, "w") as fout:
        fout.write(response_str)
daza85347472020-11-23 18:43:33 +010098
99
def expand_file(f, substitute_comment=False):
    """Uncompress a .gz file and optionally rewrite its comment markers.

    Args:
        f: path to a gzip-compressed file (must end in ".gz"; the output
           name is f with that suffix stripped).
        substitute_comment: when True, rewrite leading "# " comment markers
            to "###C: " via sed and return the path of the ".fixed" copy.

    Returns:
        Path of the uncompressed file, or of the ".fixed" copy when
        substitute_comment is True.

    Raises:
        Exception: if gunzip or sed exits with a non-zero status.
    """
    # Expand the .gz file (strip the trailing ".gz" for the output name)
    fname = f[:-3]
    if not os.path.isfile(fname):
        # NOTE(review): shell=True with an interpolated path is shell-injection
        # prone; acceptable only while paths are fully trusted.
        p = subprocess.call(f"gunzip -c {f} > {fname}", shell=True)
        if p == 0:
            logger.info("Successfully uncompressed file")
        else:
            logger.info(f"Couldn't expand file {f}")
            # Previously a bare `raise Exception` with no message
            raise Exception(f"gunzip failed for {f} (exit code {p})")
    else:
        logger.info(f"File {fname} is already uncompressed. Skipping this step...")

    # Substitute the Commentary Lines on the Expanded file
    if substitute_comment:
        fixed_filename = f"{fname}.fixed"
        # Context manager closes the stdout handle deterministically; the
        # previous inline open() leaked the file object.
        with open(fixed_filename, "w") as fixed_out:
            p = subprocess.call(f"sed 's/^# /###C: /g' {fname}", shell=True, stdout=fixed_out)
        if p == 0:
            logger.info("Successfully fixed comments on file")
        else:
            logger.info(f"Something went wrong when substituting commentaries")
            raise Exception(f"sed comment substitution failed for {fname} (exit code {p})")
        return fixed_filename
    else:
        return fname