| # TODO: write a client to make multiple requests to the server! |
| import subprocess, json, time |
| import glob, logging |
| import os.path, sys |
| from my_utils.file_utils import * |
| from lib.CoNLL_Annotation import CoNLLUP_Token |
| |
| # TODO: Add logging instead of Prints! |
| |
| # Input directory handed to get_filenames() by the __main__ section, which collects its *.conllu.gz files. |
| DEREKO_DIR = "/export/netapp/kupietz/N-GRAMM-STUDIE/conllu/" |
| |
def get_filenames(data_dir):
    """Return the sorted paths of the ``*.conllu.gz`` files inside *data_dir*.

    Only files located directly in *data_dir* are matched; sub-directories
    are not searched.

    Args:
        data_dir: Directory to search for gzipped CoNLL-U files.

    Returns:
        A list of matching file paths in ascending lexicographic order;
        empty when the directory does not exist or contains no match.
    """
    # iglob is lazy; sorted() consumes it and builds the list in one step.
    return sorted(glob.iglob(f'{data_dir}/*.conllu.gz'))
| |
| |
def _run_to_file(cmd, out_path):
    """Run *cmd* (an argument list) with stdout written to *out_path*.

    Returns the command's exit status. The output file is removed when the
    command exits non-zero or cannot be started, so a failed run never
    leaves a truncated file behind.
    """
    status = None
    try:
        with open(out_path, "wb") as out:
            status = subprocess.call(cmd, stdout=out)
    finally:
        if status != 0 and os.path.exists(out_path):
            os.remove(out_path)
    return status


def expand_file(f):
    """Uncompress a gzipped file and rewrite its ``# `` comment lines.

    Two files are produced next to *f*:

    * ``<f without its last three characters>``: the output of
      ``gunzip -c``. This step is skipped when that file already exists.
    * ``<uncompressed name>.fixed``: the uncompressed file with every
      line-initial ``"# "`` replaced by ``"###C: "`` (done with ``sed``).
      This file is regenerated on every call.

    The external commands are run without a shell, so paths containing
    spaces or shell metacharacters are passed through literally.

    Args:
        f: Path to the compressed file. The name is assumed to end in
            ``.gz``; the last three characters are stripped unconditionally.

    Returns:
        The path of the ``.fixed`` file.

    Raises:
        RuntimeError: If ``gunzip`` or ``sed`` exits with a non-zero status.
        OSError: If a command cannot be started or a file cannot be opened.
    """
    # Looked up here rather than read from a module global so the function
    # also works when this module is imported instead of run as a script.
    log = logging.getLogger(__name__)

    # Expand the .gz file
    fname = f[:-3]
    if os.path.isfile(fname):
        log.info(f"File {fname} is already uncompressed. Skipping this step...")
    elif _run_to_file(["gunzip", "-c", f], fname) == 0:
        log.info("Successfully uncompressed file")
    else:
        log.error(f"Couldn't expand file {f}")
        raise RuntimeError(f"Couldn't expand file {f}")

    # Substitute the commentary lines on the expanded file
    fixed_filename = f"{fname}.fixed"
    if _run_to_file(["sed", "s/^# /###C: /g", fname], fixed_filename) != 0:
        log.error("Something went wrong when substituting commentaries")
        raise RuntimeError(f"Couldn't substitute commentaries in file {fname}")
    log.info("Successfully fixed comments on file")
    return fixed_filename
| |
| |
| |
if __name__ == "__main__":
    # Development limit: only the first file found is processed.
    conll_files = get_filenames(DEREKO_DIR)[:1]
    # Alternative small input for local testing:
    # conll_files = ["tutorial_examples/mini_test_raw.conllu.gz"]
    chunk_ix = 0
    CHUNK_SIZE = 20000

    # =====================================================================================
    # LOGGING INFO ...
    # =====================================================================================
    # `logger` stays a module-level name because expand_file() may rely on it.
    logger = logging.getLogger(__name__)
    console_hdlr = logging.StreamHandler(sys.stdout)
    file_hdlr = logging.FileHandler(filename="ParseTests.log")
    logging.basicConfig(level=logging.INFO, handlers=[console_hdlr, file_hdlr])
    logger.info("Start Logging")
    logger.info(f"Chunking in Files of {CHUNK_SIZE} Sentences")

    # =====================================================================================
    # PROCESS (PARSE) ALL FILES FOUND ...
    # =====================================================================================
    # file_generator, get_file_chunk and turku_parse_file are not defined in
    # this file -- presumably provided by the star import from
    # my_utils.file_utils; verify.
    for f in conll_files:
        start = time.time()
        text_filename = expand_file(f)
        line_generator = file_generator(text_filename)
        total_processed_sents = 0
        # Must be reset for every file: the flag is False once the previous
        # file is exhausted, and the chunk loop would otherwise be skipped.
        file_has_next = True
        while file_has_next:
            raw_text, file_has_next, n_sents = get_file_chunk(
                line_generator, chunk_size=CHUNK_SIZE, token_class=CoNLLUP_Token
            )
            total_processed_sents += n_sents
            if len(raw_text) > 0:
                turku_parse_file(raw_text, text_filename, chunk_ix)
                elapsed = time.time() - start
                # Guard the throughput figure against a zero time delta.
                sents_per_sec = total_processed_sents / elapsed if elapsed > 0 else 0.0
                logger.info(f"Time Elapsed: {elapsed}. Processed {total_processed_sents}. [{sents_per_sec} Sents/sec]\n")  # Toks/Sec???
            # chunk_ix is not reset between files; it keeps counting across them.
            chunk_ix += 1
        end = time.time()
        logger.info(f"Processing File {f} took {(end - start)} seconds!")
| |
| |