daza | e3bc92e | 2020-11-04 11:06:26 +0100 | [diff] [blame^] | 1 | import requests, logging, json |
daza | 0498a6a | 2020-10-06 12:03:12 +0200 | [diff] [blame] | 2 | from lib.CoNLL_Annotation import read_conll, read_conll_generator |
daza | 972aabc | 2020-09-01 16:41:30 +0200 | [diff] [blame] | 3 | |
| 4 | logger = logging.getLogger(__name__) |
| 5 | |
| 6 | |
def dict_to_file(my_dict, out_path):
    """Serialize my_dict as JSON into the file at out_path (overwrites).

    Fix: json.dump(obj) has no file argument and returns None, so the
    original out.write(json.dump(my_dict)) could never produce JSON on
    disk. json.dump writes directly to the open file handle.
    """
    with open(out_path, "w") as out:
        json.dump(my_dict, out)
| 10 | |
def file_to_dict(file_path):
    """Load and return the JSON content of file_path as a dict.

    Fix: the original called f.load(f) — file objects have no .load();
    this was clearly meant to be json.load(f).
    """
    d = {}
    with open(file_path) as f:
        d = json.load(f)
    return d
| 16 | |
| 17 | |
def file_generator(file_path):
    """Lazily yield each non-empty line of the file at file_path."""
    with open(file_path, "r") as data_file:
        logger.info("Reading instances from lines in file at: %s", file_path)
        for line in data_file:
            if line:
                yield line
| 24 | |
| 25 | |
def get_file_text_chunk(line_generator, chunk_size, token_class, comment_str="###C:"):
    """Read up to chunk_size annotated sentences from line_generator.

    Returns (sentences, gold_pos_tags, file_has_next) where file_has_next
    is False only when read_conll produced zero sentences.
    """
    chunk, n_sents = read_conll(line_generator, chunk_size, token_class, comment_str=comment_str)
    file_has_next = n_sents != 0
    sents, gld, meta = [], [], []
    for anno in chunk:
        if len(anno.metadata) > 0:
            meta.append("\n".join(anno.metadata))
        sents.append(anno.get_sentence())
        gld.append(anno.get_pos_tags())
    # NOTE(review): meta is accumulated but never returned — kept as-is
    # to preserve the original behavior.
    return sents, gld, file_has_next
| 36 | |
| 37 | |
def get_file_chunk(line_generator, chunk_size, token_class, comment_str="###C:"):
    """Read up to chunk_size sentences and render them back as CoNLL-U text.

    Returns (raw_text, file_has_next, n_sents); file_has_next is False when
    read_conll returned fewer sentences than requested (i.e. EOF reached).
    """
    chunk, n_sents = read_conll(line_generator, chunk_size, token_class, comment_str=comment_str)
    file_has_next = not (n_sents < chunk_size)
    pieces = []
    for anno in chunk:
        # Metadata lines (comments) precede the token block; a lone "\n"
        # keeps the sentence separation when there is no metadata.
        if len(anno.metadata) > 0:
            pieces.append("\n".join(anno.metadata) + "\n")
        else:
            pieces.append("\n")
        for tok in anno.tokens:
            pieces.append(tok.get_conllU_line() + "\n")
        pieces.append("\n")
    return "".join(pieces), file_has_next, n_sents
| 52 | |
| 53 | |
def turku_parse_file(raw_text, filename, chunk_ix):
    """POST raw CoNLL-U text to the local Turku parser and save the parse.

    Fix: the filename parameter was unused and the output name was the
    hard-coded literal "(unknown)"; the output file is now derived from
    the source filename, as the parameter clearly intends.

    Args:
        raw_text: CoNLL-U formatted text to parse.
        filename: source file name used to derive the output file name.
        chunk_ix: index of this chunk, embedded in the output file name.
    """
    out_file_str = f"{filename}.parsed.{chunk_ix}.conllu"
    # For each file make a request to obtain the parse back
    logger.info(f"Sending Request {chunk_ix} to Parser Server...")
    response = requests.post("http://localhost:7689/", data=raw_text.encode('utf-8'))
    response_to_file(response.text, out_file_str)
| 60 | |
| 61 | |
| 62 | |
def response_to_file(response_str, fname):
    """Write response_str to the file fname, overwriting any existing content.

    Fix: use a context manager so the file handle is closed even if the
    write raises, instead of the bare open/write/close sequence.
    """
    with open(fname, "w") as fout:
        fout.write(response_str)