Blame - my_utils/file_utils.py - KorAP/sota-pos-lemmatizers

blob: 1865fc53d5cb8faefde844adb294addbfed9e51a [file] [log] [blame]

daza	e3bc92e	2020-11-04 11:06:26 +0100	[diff] [blame]	1	import requests, logging, json
daza	8534747	2020-11-23 18:43:33 +0100	[diff] [blame^]	2	import subprocess, time
				3	import glob, logging
				4	import os.path, sys
daza	0498a6a	2020-10-06 12:03:12 +0200	[diff] [blame]	5	from lib.CoNLL_Annotation import read_conll, read_conll_generator
daza	972aabc	2020-09-01 16:41:30 +0200	[diff] [blame]	6
				7	logger = logging.getLogger(__name__)
				8
				9
def list_to_file(my_list, out_path):
    """Write the items of ``my_list`` to ``out_path``, one item per line.

    Each item is formatted through an f-string (so non-string items are
    written using their ``str()`` form) and followed by a newline. An existing
    file at ``out_path`` is overwritten.

    The file is written as UTF-8, matching the other writers in this module,
    so non-ASCII text does not depend on the platform's locale encoding.
    """
    with open(out_path, "w", encoding='utf8') as out:
        out.writelines(f"{item_str}\n" for item_str in my_list)
				14
				15	def counter_to_file(my_counter, out_path):
				16	with open(out_path, "w") as out:
				17	for item, count in my_counter:
				18	item_str = "\t".join(item)
				19	out.write(f"{item_str}\t{count}\n")
				20
				21	def dict_to_file(my_dict, out_path):
				22	with open(out_path, "w", encoding='utf8') as out:
				23	json.dump(my_dict, fp=out, ensure_ascii=False)
daza	e3bc92e	2020-11-04 11:06:26 +0100	[diff] [blame]	24
				25	def file_to_dict(file_path):
				26	d = {}
				27	with open(file_path) as f:
daza	8534747	2020-11-23 18:43:33 +0100	[diff] [blame^]	28	d = json.load(f)
daza	e3bc92e	2020-11-04 11:06:26 +0100	[diff] [blame]	29	return d
				30
				31
def write_conll_file(conll_objs, out_path):
    """Dump annotated sentences to ``out_path`` as UTF-8 text.

    For every object in ``conll_objs`` the result of ``get_conllU_line()`` of
    each entry in its ``tokens`` is written on its own line, followed by one
    empty line that terminates the sentence.
    """
    with open(out_path, "w", encoding='utf8') as sink:
        for sentence in conll_objs:
            sink.writelines(token.get_conllU_line() + "\n" for token in sentence.tokens)
            # Blank line separates consecutive sentences
            sink.write("\n")
				38
				39
def file_generator(file_path):
    """Lazily yield the lines of the text file at ``file_path``.

    Lines are yielded exactly as read, including their trailing newline, and
    blank lines are yielded too. The file is opened (as UTF-8, the encoding
    the writers in this module use) when the first line is requested and is
    closed once the generator is exhausted or closed.
    """
    with open(file_path, "r", encoding='utf8') as data_file:
        logger.info("Reading instances from lines in file at: %s", file_path)
        # File iteration never produces an empty string (a blank line is
        # "\n"), so no emptiness filter is needed here.
        yield from data_file
				46
				47
def get_file_annos_chunk(line_generator, chunk_size, token_class, comment_str="###C:"):
    """Read the next chunk of annotations from ``line_generator``.

    All arguments are forwarded unchanged to ``read_conll``, which returns the
    chunk together with a sentence count.

    Returns:
        ``(chunk, file_has_next)`` where ``file_has_next`` is ``False`` only
        when ``read_conll`` reported zero sentences for this call.
    """
    chunk, n_sents = read_conll(line_generator, chunk_size, token_class, comment_str=comment_str)
    # A read that produced no sentences is the end-of-input signal
    file_has_next = n_sents != 0
    return chunk, file_has_next
				54
				55
def get_file_text_chunk(line_generator, chunk_size, token_class, comment_str="###C:"):
    """Same as get_file_annos_chunk but directly get (text, labels) pairs.

    All arguments are forwarded unchanged to ``read_conll``. For every
    annotation in the returned chunk, ``get_sentence()`` and
    ``get_pos_tags()`` are collected into two parallel lists.

    Returns:
        ``(sents, gld, file_has_next)`` where ``file_has_next`` is ``False``
        only when ``read_conll`` reported zero sentences for this call.
    """
    chunk, n_sents = read_conll(line_generator, chunk_size, token_class, comment_str=comment_str)
    # A read that produced no sentences is the end-of-input signal
    file_has_next = n_sents != 0
    sents, gld = [], []
    # Single pass over the chunk so it also works if the chunk is one-shot
    for anno in chunk:
        sents.append(anno.get_sentence())
        gld.append(anno.get_pos_tags())
    return sents, gld, file_has_next
				67
				68
def get_file_chunk(line_generator, chunk_size, token_class, comment_str="###C:"):
    """Read the next chunk of annotations and render it back as raw text.

    All arguments are forwarded unchanged to ``read_conll``. Each annotation
    is rendered as its metadata lines (or a single empty line when it has no
    metadata), followed by one ``get_conllU_line()`` line per token and a
    terminating empty line.

    Returns:
        ``(raw_text, file_has_next, n_sents)``. Unlike the other chunk
        readers in this module, ``file_has_next`` is ``False`` as soon as
        fewer than ``chunk_size`` sentences were read.
    """
    chunk, n_sents = read_conll(line_generator, chunk_size, token_class, comment_str=comment_str)
    file_has_next = not n_sents < chunk_size
    # Collect pieces and join once instead of growing a string with +=
    pieces = []
    for anno in chunk:
        if len(anno.metadata) > 0:
            pieces.append("\n".join(anno.metadata) + "\n")
        else:
            pieces.append("\n")
        pieces.extend(tok.get_conllU_line() + "\n" for tok in anno.tokens)
        pieces.append("\n")
    raw_text = "".join(pieces)
    return raw_text, file_has_next, n_sents
				83
				84
				85	def turku_parse_file(raw_text, filename, chunk_ix):
daza	e3bc92e	2020-11-04 11:06:26 +0100	[diff] [blame]	86	out_file_str = f"{filename}.parsed.{chunk_ix}.conllu"
daza	972aabc	2020-09-01 16:41:30 +0200	[diff] [blame]	87	# For each file make a request to obtain the parse back
				88	logger.info(f"Sending Request {chunk_ix} to Parser Server...")
				89	response = requests.post("http://localhost:7689/", data=raw_text.encode('utf-8'))
				90	response_to_file(response.text, out_file_str)
				91
				92
				93
				94	def response_to_file(response_str, fname):
				95	fout = open(fname, "w")
				96	fout.write(response_str)
				97	fout.close()
daza	8534747	2020-11-23 18:43:33 +0100	[diff] [blame^]	98
				99
				100	def expand_file(f, substitute_comment=False):
				101	# Expand the .gz file
				102	fname = f[:-3]
				103	if not os.path.isfile(fname):
				104	p = subprocess.call(f"gunzip -c {f} > {fname}", shell=True)
				105	if p == 0:
				106	logger.info("Successfully uncompressed file")
				107	else:
				108	logger.info(f"Couldn't expand file {f}")
				109	raise Exception
				110	else:
				111	logger.info(f"File {fname} is already uncompressed. Skipping this step...")
				112
				113	# Substitute the Commentary Lines on the Expanded file
				114	if substitute_comment:
				115	fixed_filename = f"{fname}.fixed"
				116	p = subprocess.call(f"sed 's/^# /###C: /g' {fname}", shell=True, stdout=open(fixed_filename, "w")) # stdout=subprocess.PIPE
				117	if p == 0:
				118	logger.info("Successfully fixed comments on file")
				119	else:
				120	logger.info(f"Something went wrong when substituting commentaries")
				121	raise Exception
				122	return fixed_filename
				123	else:
				124	return fname