# Client that streams gzipped CoNLL-U files in chunks to a local parser server
# and writes each parsed chunk back to disk.
import subprocess, time
import requests, glob, logging
import os.path, sys
from CoNLL_Annotation import get_annotation, CoNLLUP_Token

DEREKO_DIR = "/export/netapp/kupietz/N-GRAMM-STUDIE/conllu/"

def get_filenames(data_dir):
    # Collect all gzipped CoNLL-U files in the data directory, in stable sorted order
    return sorted(glob.iglob(f"{data_dir}/*.conllu.gz", recursive=False))


def expand_file(f):
    # Expand the .gz file; dropping the trailing ".gz" yields the .conllu filename
    fname = f[:-3]
    if not os.path.isfile(fname):
        p = subprocess.call(f"gunzip -c {f} > {fname}", shell=True)
        if p == 0:
            logger.info(f"Successfully uncompressed file {f}")
        else:
            logger.error(f"Couldn't expand file {f}")
            raise Exception(f"gunzip failed for {f}")
    else:
        logger.info(f"File {fname} is already uncompressed. Skipping this step...")

    # Rewrite the comment lines ("# ...") of the expanded file as "###C: ..." so that
    # downstream code can recognize the original metadata lines
    fixed_filename = f"{fname}.fixed"
    p = subprocess.call(f"sed 's/^# /###C: /g' {fname}", shell=True, stdout=open(fixed_filename, "w"))
    if p == 0:
        logger.info("Successfully fixed comments on file")
    else:
        logger.error("Something went wrong when substituting comment lines")
        raise Exception(f"sed failed for {fname}")
    return fixed_filename
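
# A pure-Python alternative to expand_file (a sketch, not used by the pipeline below):
# it avoids the gunzip and sed subprocesses by decompressing with the gzip module and
# rewriting the comment lines in Python. The function name is hypothetical.
def expand_file_pure_python(f):
    import gzip
    fixed_filename = f"{f[:-3]}.fixed"
    with gzip.open(f, "rt", encoding="utf-8") as fin, open(fixed_filename, "w", encoding="utf-8") as fout:
        for line in fin:
            # Same substitution as the sed call above: "# ..." -> "###C: ..."
            if line.startswith("# "):
                line = "###C: " + line[2:]
            fout.write(line)
    return fixed_filename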


def _file_generator(file_path):
    # Yield the file line by line; blank lines are kept, since read_conll
    # uses them as sentence separators
    with open(file_path, "r") as data_file:
        logger.info("Reading instances from lines in file at: %s", file_path)
        for line in data_file:
            yield line


def read_conll(line_generator, chunk_size):
    n_sents = 0
    annotated_sentences, buffer_meta, buffer_lst = [], [], []
    for line in line_generator:
        if line.startswith("###C:"):
            buffer_meta.append(line)
            continue
        if len(line.split()) > 0:
            buffer_lst.append(line)
        else:
            # A blank line closes the current sentence
            annotated_sentences.append(get_annotation(buffer_lst, buffer_meta))
            n_sents += 1
            buffer_lst, buffer_meta = [], []
            # Check the limit only after completing a sentence, so no line of the
            # next sentence is read and silently dropped at the chunk boundary
            if n_sents == chunk_size:
                break
    # Flush a trailing sentence if the file does not end with a blank line
    if buffer_lst:
        annotated_sentences.append(get_annotation(buffer_lst, buffer_meta))
        n_sents += 1
    return annotated_sentences, n_sents
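
# For illustration, the tab-separated input that read_conll consumes looks like this
# (contents invented for illustration; expand_file has already rewritten "# ..."
# comments as "###C: ..."):
#
#   ###C: some metadata line
#   1	Der	der	DET	...
#   2	Hund	Hund	NOUN	...
#   (blank line)
#
# Each blank line closes one sentence, so chunk_size counts whole sentences.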


def get_file_chunk(line_generator, chunk_size):
    chunk, n_sents = read_conll(line_generator, chunk_size)
    file_has_next = n_sents > 0
    # Serialize the chunk back into CoNLL-U text, one blank line between sentences
    parts = []
    for anno in chunk:
        parts.append("\n".join(anno.metadata) + "\n")
        for tok in anno.tokens:
            parts.append(tok.get_conllU_line() + "\n")
        parts.append("\n")
    return "".join(parts), file_has_next, n_sents
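
# A generator-based alternative (a sketch, not wired into the main loop below):
# yielding chunks directly would replace the manual file_has_next bookkeeping
# with a plain for-loop. The function name is hypothetical.
def iter_file_chunks(line_generator, chunk_size):
    while True:
        raw_text, file_has_next, n_sents = get_file_chunk(line_generator, chunk_size)
        if not file_has_next:
            return
        yield raw_text, n_sents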


def turku_parse_file(raw_text, filename, chunk_ix):
    f = filename.split(".")[0]  # strip all extensions (assumes no dots in the directory names)
    out_file_str = f"{f}.parsed.{chunk_ix}.conllu"
    # For each chunk, make a request to the parser server and write the parse to disk
    logger.info(f"Sending Request {chunk_ix} to Parser Server...")
    response = requests.post("http://localhost:7689/", data=raw_text.encode('utf-8'))
    response.raise_for_status()  # fail loudly instead of writing an HTML error page as output
    response_to_file(response.text, out_file_str)
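
# A retry wrapper for the POST above (a sketch; the helper name, retry count, and
# wait time are assumptions, not part of the original pipeline). Useful when the
# parser server is briefly unavailable during a long corpus run.
def _post_with_retries(url, payload, retries=3, wait_secs=5):
    for attempt in range(1, retries + 1):
        try:
            response = requests.post(url, data=payload)
            response.raise_for_status()  # treat HTTP error codes as failures, too
            return response
        except requests.RequestException as e:
            logger.warning(f"Request failed (attempt {attempt}/{retries}): {e}")
            if attempt == retries:
                raise
            time.sleep(wait_secs)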


def response_to_file(response_str, fname):
    # Write the parser's response verbatim; the with-block guarantees the file is closed
    with open(fname, "w") as fout:
        fout.write(response_str)


if __name__ == "__main__":
    # =====================================================================================
    # LOGGING INFO ...
    # =====================================================================================
    logger = logging.getLogger(__name__)
    console_hdlr = logging.StreamHandler(sys.stdout)
    file_hdlr = logging.FileHandler(filename="ParseTests.log")
    logging.basicConfig(level=logging.INFO, handlers=[console_hdlr, file_hdlr])
    logger.info("Start Logging")

    CHUNK_SIZE = 20000
    logger.info(f"Chunking in Files of {CHUNK_SIZE} Sentences")

    conll_files = get_filenames(DEREKO_DIR)[:1]  # For development purposes, process only the first file
    # conll_files = ["tutorial_examples/mini_test_raw.conllu.gz"]
    logger.info(f"Files to process: {conll_files}")

    # =====================================================================================
    # PROCESS (PARSE) ALL FILES FOUND ...
    # =====================================================================================
    for f in conll_files:
        start = time.time()
        text_filename = expand_file(f)
        line_generator = _file_generator(text_filename)
        total_processed_sents = 0
        file_has_next, chunk_ix = True, 0  # reset per file; otherwise only the first file would enter the while-loop
        while file_has_next:
            raw_text, file_has_next, n_sents = get_file_chunk(line_generator, chunk_size=CHUNK_SIZE)
            total_processed_sents += n_sents
            if len(raw_text) > 0:
                turku_parse_file(raw_text, text_filename, chunk_ix)
                elapsed = time.time() - start
                logger.info(f"Time Elapsed: {elapsed:.2f}s. Processed {total_processed_sents} sentences. [{total_processed_sents/elapsed:.2f} Sents/sec]\n")
            chunk_ix += 1
        end = time.time()
        logger.info(f"Processing File {f} took {(end - start):.2f} seconds!")