daza | cd9cf01 | 2020-08-31 17:19:11 +0200 | [diff] [blame] | 1 | # TODO: write a client to make multiple requests to the server! |
import glob
import json
import logging
import os.path
import shlex
import subprocess
import sys
import time

from my_utils.file_utils import *
from lib.CoNLL_Annotation import CoNLLUP_Token
daza | cd9cf01 | 2020-08-31 17:19:11 +0200 | [diff] [blame] | 7 | |
| 8 | # TODO: Add logging instead of Prints! |
| 9 | |
# Root directory of the DeReKo N-gram study corpus, stored as gzipped CoNLL-U files.
DEREKO_DIR = "/export/netapp/kupietz/N-GRAMM-STUDIE/conllu/"
| 11 | |
def get_filenames(data_dir):
    """Return the sorted list of paths of all *.conllu.gz files directly inside data_dir.

    Args:
        data_dir: directory to scan (not recursive).

    Returns:
        Alphabetically sorted list of matching file paths.
    """
    # The original loop also split each path on "/" into an unused local; dropped.
    return sorted(glob.iglob(f'{data_dir}/*.conllu.gz', recursive=False))
| 18 | |
| 19 | |
def expand_file(f):
    """Uncompress a gzipped CoNLL-U file and rewrite its comment lines.

    Two steps:
      1. gunzip ``f`` into the same path minus its 3-character ".gz" suffix
         (skipped when the uncompressed file already exists).
      2. Prefix every CoNLL-U comment line ("# ...") with "###C: " and write
         the result to "<uncompressed-name>.fixed".

    Args:
        f: path to a compressed file; the last 3 characters are assumed to be ".gz".

    Returns:
        Path of the fixed file ("<uncompressed-name>.fixed").

    Raises:
        Exception: if gunzip or sed exits with a non-zero status.
    """
    fname = f[:-3]  # strip the trailing ".gz"
    if not os.path.isfile(fname):
        # shlex.quote guards against spaces/shell metacharacters in the paths.
        status = subprocess.call(f"gunzip -c {shlex.quote(f)} > {shlex.quote(fname)}", shell=True)
        if status == 0:
            logger.info("Successfully uncompressed file")
        else:
            logger.info(f"Couldn't expand file {f}")
            raise Exception(f"gunzip failed for {f} (exit status {status})")
    else:
        logger.info(f"File {fname} is already uncompressed. Skipping this step...")

    # Substitute the commentary lines on the expanded file.
    fixed_filename = f"{fname}.fixed"
    # `with` ensures the output handle is closed (the original leaked it).
    with open(fixed_filename, "w") as out:
        status = subprocess.call(f"sed 's/^# /###C: /g' {shlex.quote(fname)}", shell=True, stdout=out)
    if status == 0:
        logger.info("Successfully fixed comments on file")
    else:
        logger.info(f"Something went wrong when substituting commentaries")
        raise Exception(f"sed comment substitution failed for {fname} (exit status {status})")
    return fixed_filename
| 42 | |
| 43 | |
| 44 | |
daza | cd9cf01 | 2020-08-31 17:19:11 +0200 | [diff] [blame] | 45 | if __name__ == "__main__": |
daza | 972aabc | 2020-09-01 16:41:30 +0200 | [diff] [blame] | 46 | conll_files = get_filenames(DEREKO_DIR)[:1] # This is for Development Purposes only process the first [at most] 2 files |
| 47 | #print(conll_files) |
| 48 | #conll_files = ["tutorial_examples/mini_test_raw.conllu.gz"] |
daza | cd9cf01 | 2020-08-31 17:19:11 +0200 | [diff] [blame] | 49 | file_has_next, chunk_ix = True, 0 |
| 50 | CHUNK_SIZE = 20000 |
| 51 | |
| 52 | # ===================================================================================== |
| 53 | # LOGGING INFO ... |
| 54 | # ===================================================================================== |
| 55 | logger = logging.getLogger(__name__) |
| 56 | console_hdlr = logging.StreamHandler(sys.stdout) |
| 57 | file_hdlr = logging.FileHandler(filename=f"ParseTests.log") |
| 58 | logging.basicConfig(level=logging.INFO, handlers=[console_hdlr, file_hdlr]) |
| 59 | logger.info("Start Logging") |
| 60 | logger.info(f"Chunking in Files of {CHUNK_SIZE} Sentences") |
| 61 | |
| 62 | # ===================================================================================== |
| 63 | # PROCESS (PARSE) ALL FILES FOUND ... |
| 64 | # ===================================================================================== |
| 65 | for f in conll_files: |
| 66 | start = time.time() |
| 67 | text_filename = expand_file(f) |
daza | 972aabc | 2020-09-01 16:41:30 +0200 | [diff] [blame] | 68 | line_generator = file_generator(text_filename) |
daza | cd9cf01 | 2020-08-31 17:19:11 +0200 | [diff] [blame] | 69 | total_processed_sents = 0 |
| 70 | while file_has_next: |
daza | 972aabc | 2020-09-01 16:41:30 +0200 | [diff] [blame] | 71 | raw_text, file_has_next, n_sents = get_file_chunk(line_generator, chunk_size=CHUNK_SIZE, token_class=CoNLLUP_Token) |
daza | cd9cf01 | 2020-08-31 17:19:11 +0200 | [diff] [blame] | 72 | total_processed_sents += n_sents |
| 73 | if len(raw_text) > 0: |
| 74 | turku_parse_file(raw_text, text_filename, chunk_ix) |
| 75 | now = time.time() |
| 76 | elapsed = (now - start) |
| 77 | logger.info(f"Time Elapsed: {elapsed}. Processed {total_processed_sents}. [{total_processed_sents/elapsed} Sents/sec]\n") # Toks/Sec??? |
| 78 | chunk_ix += 1 |
| 79 | end = time.time() |
| 80 | logger.info(f"Processing File {f} took {(end - start)} seconds!") |
| 81 | |
| 82 | |