blob: 0eba13779772b7360ab62b36511d7213ec40603e [file] [log] [blame]
# TODO: write a client to make multiple requests to the server!
import subprocess, json, time
import glob, logging
import os.path, sys
from my_utils.file_utils import *
from lib.CoNLL_Annotation import CoNLLUP_Token
# NOTE: logging is used throughout; it is configured in the __main__ block below.
DEREKO_DIR = "/export/netapp/kupietz/N-GRAMM-STUDIE/conllu/"
def get_filenames(data_dir):
    """Return the sorted paths of all ``*.conllu.gz`` files directly in *data_dir*.

    Args:
        data_dir: Directory to scan (non-recursive).

    Returns:
        Alphabetically sorted list of matching file paths.
    """
    # glob.iglob yields paths lazily; sorted() materializes and orders them.
    return sorted(glob.iglob(f'{data_dir}/*.conllu.gz', recursive=False))
def expand_file(f):
    """Uncompress a ``*.conllu.gz`` file and rewrite its CoNLL comment lines.

    Gunzips *f* next to itself (dropping the trailing ``.gz``) unless the
    uncompressed file already exists, then writes ``<fname>.fixed`` in which
    every leading comment marker ``"# "`` is replaced by ``"###C: "``.

    Args:
        f: Path to a ``*.conllu.gz`` file.

    Returns:
        Path of the comment-substituted ``.fixed`` file.

    Raises:
        Exception: If the gunzip or sed subprocess exits non-zero.
    """
    # Expand the .gz file; strip the trailing ".gz" (3 chars) to get the target name.
    fname = f[:-3]
    if not os.path.isfile(fname):
        # NOTE(review): shell=True with an interpolated path is unsafe if
        # filenames ever contain spaces or shell metacharacters — consider
        # subprocess.run([...], shell=False) with a file redirect in Python.
        p = subprocess.call(f"gunzip -c {f} > {fname}", shell=True)
        if p == 0:
            logger.info("Successfully uncompressed file")
        else:
            logger.error(f"Couldn't expand file {f}")
            raise Exception(f"gunzip failed for {f} (exit code {p})")
    else:
        logger.info(f"File {fname} is already uncompressed. Skipping this step...")
    # Substitute the commentary lines on the expanded file.
    fixed_filename = f"{fname}.fixed"
    # Use a context manager so the output handle is closed even on failure
    # (the original leaked the open() passed as stdout).
    with open(fixed_filename, "w") as out_f:
        p = subprocess.call(f"sed 's/^# /###C: /g' {fname}", shell=True, stdout=out_f)
    if p == 0:
        logger.info("Successfully fixed comments on file")
    else:
        logger.error(f"Something went wrong when substituting commentaries")
        raise Exception(f"sed comment substitution failed for {fname} (exit code {p})")
    return fixed_filename
if __name__ == "__main__":
    # Development mode: only the first file found is processed.
    conll_files = get_filenames(DEREKO_DIR)[:1]
    #conll_files = ["tutorial_examples/mini_test_raw.conllu.gz"]
    CHUNK_SIZE = 20000  # sentences per chunk handed to the parser
    # =====================================================================================
    # LOGGING INFO ...
    # =====================================================================================
    logger = logging.getLogger(__name__)
    console_hdlr = logging.StreamHandler(sys.stdout)
    file_hdlr = logging.FileHandler(filename="ParseTests.log")
    logging.basicConfig(level=logging.INFO, handlers=[console_hdlr, file_hdlr])
    logger.info("Start Logging")
    logger.info(f"Chunking in Files of {CHUNK_SIZE} Sentences")
    # =====================================================================================
    # PROCESS (PARSE) ALL FILES FOUND ...
    # =====================================================================================
    for f in conll_files:
        start = time.time()
        text_filename = expand_file(f)
        line_generator = file_generator(text_filename)
        total_processed_sents = 0
        # BUGFIX: reset per file. These were previously initialized once before
        # the loop, so once the first file was exhausted (file_has_next False),
        # every subsequent file's while-loop would be skipped entirely.
        file_has_next, chunk_ix = True, 0
        while file_has_next:
            raw_text, file_has_next, n_sents = get_file_chunk(line_generator, chunk_size=CHUNK_SIZE, token_class=CoNLLUP_Token)
            total_processed_sents += n_sents
            if len(raw_text) > 0:
                turku_parse_file(raw_text, text_filename, chunk_ix)
                now = time.time()
                elapsed = (now - start)
                logger.info(f"Time Elapsed: {elapsed}. Processed {total_processed_sents}. [{total_processed_sents/elapsed} Sents/sec]\n")  # Toks/Sec???
            chunk_ix += 1
        end = time.time()
        logger.info(f"Processing File {f} took {(end - start)} seconds!")