blob: 0eba13779772b7360ab62b36511d7213ec40603e [file] [log] [blame]
# TODO: write a client to make multiple requests to the server!
import subprocess, json, time
import glob, logging
import os.path, sys
from my_utils.file_utils import *
from lib.CoNLL_Annotation import CoNLLUP_Token
# NOTE: logging is used throughout; it is configured in the __main__ block below.
DEREKO_DIR = "/export/netapp/kupietz/N-GRAMM-STUDIE/conllu/"
def get_filenames(data_dir):
    """Return the sorted paths of all ``*.conllu.gz`` files directly in *data_dir*.

    Args:
        data_dir: Directory to scan (non-recursive).

    Returns:
        Alphabetically sorted list of matching file paths.
    """
    # glob.iglob yields paths lazily; sorted() materializes and orders them.
    return sorted(glob.iglob(f'{data_dir}/*.conllu.gz', recursive=False))
def expand_file(f):
    """Uncompress a ``*.conllu.gz`` file and rewrite its CoNLL comment lines.

    Gunzips *f* next to itself (dropping the trailing ``.gz``) unless the
    uncompressed file already exists, then writes ``<fname>.fixed`` in which
    every leading comment marker ``"# "`` is replaced by ``"###C: "``.

    Args:
        f: Path to a ``*.conllu.gz`` file.

    Returns:
        Path of the comment-substituted ``.fixed`` file.

    Raises:
        Exception: If the gunzip or sed subprocess exits non-zero.
    """
    # Expand the .gz file; strip the trailing ".gz" (3 chars) to get the target name.
    fname = f[:-3]
    if not os.path.isfile(fname):
        # NOTE(review): shell=True with an interpolated path is unsafe if
        # filenames ever contain spaces or shell metacharacters — consider
        # subprocess.run([...], shell=False) with a file redirect in Python.
        p = subprocess.call(f"gunzip -c {f} > {fname}", shell=True)
        if p == 0:
            logger.info("Successfully uncompressed file")
        else:
            logger.error(f"Couldn't expand file {f}")
            raise Exception(f"gunzip failed for {f} (exit code {p})")
    else:
        logger.info(f"File {fname} is already uncompressed. Skipping this step...")
    # Substitute the commentary lines on the expanded file.
    fixed_filename = f"{fname}.fixed"
    # Use a context manager so the output handle is closed even on failure
    # (the original leaked the open() passed as stdout).
    with open(fixed_filename, "w") as out_f:
        p = subprocess.call(f"sed 's/^# /###C: /g' {fname}", shell=True, stdout=out_f)
    if p == 0:
        logger.info("Successfully fixed comments on file")
    else:
        logger.error(f"Something went wrong when substituting commentaries")
        raise Exception(f"sed comment substitution failed for {fname} (exit code {p})")
    return fixed_filename
if __name__ == "__main__":
    # Development mode: only the first file found is processed.
    conll_files = get_filenames(DEREKO_DIR)[:1]
    #conll_files = ["tutorial_examples/mini_test_raw.conllu.gz"]
    CHUNK_SIZE = 20000  # sentences per chunk handed to the parser
    # =====================================================================================
    # LOGGING INFO ...
    # =====================================================================================
    logger = logging.getLogger(__name__)
    console_hdlr = logging.StreamHandler(sys.stdout)
    file_hdlr = logging.FileHandler(filename="ParseTests.log")
    logging.basicConfig(level=logging.INFO, handlers=[console_hdlr, file_hdlr])
    logger.info("Start Logging")
    logger.info(f"Chunking in Files of {CHUNK_SIZE} Sentences")
    # =====================================================================================
    # PROCESS (PARSE) ALL FILES FOUND ...
    # =====================================================================================
    for f in conll_files:
        start = time.time()
        text_filename = expand_file(f)
        line_generator = file_generator(text_filename)
        total_processed_sents = 0
        # BUGFIX: reset per file. These were previously initialized once before
        # the loop, so once the first file was exhausted (file_has_next False),
        # every subsequent file's while-loop would be skipped entirely.
        file_has_next, chunk_ix = True, 0
        while file_has_next:
            raw_text, file_has_next, n_sents = get_file_chunk(line_generator, chunk_size=CHUNK_SIZE, token_class=CoNLLUP_Token)
            total_processed_sents += n_sents
            if len(raw_text) > 0:
                turku_parse_file(raw_text, text_filename, chunk_ix)
                now = time.time()
                elapsed = (now - start)
                logger.info(f"Time Elapsed: {elapsed}. Processed {total_processed_sents}. [{total_processed_sents/elapsed} Sents/sec]\n")  # Toks/Sec???
            chunk_ix += 1
        end = time.time()
        logger.info(f"Processing File {f} took {(end - start)} seconds!")