daza | cd9cf01 | 2020-08-31 17:19:11 +0200 | [diff] [blame] | 1 | # TODO: write a client to make multiple requests to the server! |
| 2 | import subprocess, json, time |
daza | 972aabc | 2020-09-01 16:41:30 +0200 | [diff] [blame^] | 3 | import glob, logging |
daza | cd9cf01 | 2020-08-31 17:19:11 +0200 | [diff] [blame] | 4 | import os.path, sys |
daza | 972aabc | 2020-09-01 16:41:30 +0200 | [diff] [blame^] | 5 | from my_utils import * |
| 6 | from CoNLL_Annotation import CoNLLUP_Token |
daza | cd9cf01 | 2020-08-31 17:19:11 +0200 | [diff] [blame] | 7 | |
| 8 | # TODO: Add logging instead of Prints! |
| 9 | |
| 10 | DEREKO_DIR = "/export/netapp/kupietz/N-GRAMM-STUDIE/conllu/" |
| 11 | |
def get_filenames(data_dir):
    """Return the sorted paths of the ``*.conllu.gz`` files directly inside *data_dir*.

    Args:
        data_dir: Directory to search. Sub-directories are not searched.

    Returns:
        A lexicographically sorted list of paths, each formed as
        ``{data_dir}/<file name>``. The list is empty when nothing matches
        or the directory does not exist.
    """
    # The glob pattern is built exactly as before so the returned path strings
    # are unchanged; sorted() consumes the iterator and returns a list.
    return sorted(glob.iglob(f'{data_dir}/*.conllu.gz', recursive=False))
| 18 | |
| 19 | |
def expand_file(f):
    """Uncompress a gzipped file and write a copy with rewritten comment lines.

    Two steps are performed with the external ``gunzip`` and ``sed`` tools:

    1. ``f`` is uncompressed next to itself, under the name obtained by
       dropping the last three characters of ``f`` (the ``.gz`` suffix).
       This step is skipped when that file already exists.
    2. Every line of the uncompressed file that starts with ``"# "`` gets
       that prefix replaced by ``"###C: "``; the result is written to
       ``<uncompressed name>.fixed`` (always regenerated).

    Args:
        f: Path of the ``.gz`` file to expand.

    Returns:
        The path of the ``.fixed`` file.

    Raises:
        RuntimeError: If ``gunzip`` or ``sed`` exits with a non-zero status.
        OSError: If an output file cannot be opened or a tool cannot be run.
    """
    # Look the logger up here instead of relying on a module global that is
    # only created when the file is run as a script.
    log = logging.getLogger(__name__)

    # Expand the .gz file
    fname = f[:-3]
    if not os.path.isfile(fname):
        expanded_ok = False
        try:
            # Argument list + explicit stdout: no shell is involved, so paths
            # containing spaces or shell metacharacters are handled safely.
            with open(fname, "wb") as expanded:
                expanded_ok = subprocess.call(["gunzip", "-c", f], stdout=expanded) == 0
        finally:
            # A failed run must not leave a partial file behind, otherwise the
            # next call would treat it as "already uncompressed".
            if not expanded_ok and os.path.isfile(fname):
                os.remove(fname)
        if not expanded_ok:
            log.error(f"Couldn't expand file {f}")
            raise RuntimeError(f"Couldn't expand file {f}")
        log.info("Successfully uncompressed file")
    else:
        log.info(f"File {fname} is already uncompressed. Skipping this step...")

    # Substitute the Commentary Lines on the Expanded file
    fixed_filename = f"{fname}.fixed"
    with open(fixed_filename, "wb") as fixed:
        p = subprocess.call(["sed", "s/^# /###C: /g", fname], stdout=fixed)
    if p != 0:
        log.error("Something went wrong when substituting commentaries")
        raise RuntimeError(f"Something went wrong when substituting commentaries in {fname}")
    log.info("Successfully fixed comments on file")
    return fixed_filename
| 42 | |
| 43 | |
| 44 | |
if __name__ == "__main__":
    # For development purposes only the first file found is processed.
    conll_files = get_filenames(DEREKO_DIR)[:1]
    #print(conll_files)
    #conll_files = ["tutorial_examples/mini_test_raw.conllu.gz"]
    CHUNK_SIZE = 20000

    # =====================================================================================
    #                    LOGGING INFO ...
    # =====================================================================================
    # `logger` is kept as a module-level name because other code in this file
    # refers to it as a global.
    logger = logging.getLogger(__name__)
    console_hdlr = logging.StreamHandler(sys.stdout)
    file_hdlr = logging.FileHandler(filename="ParseTests.log")
    logging.basicConfig(level=logging.INFO, handlers=[console_hdlr, file_hdlr])
    logger.info("Start Logging")
    logger.info(f"Chunking in Files of {CHUNK_SIZE} Sentences")

    # =====================================================================================
    #                    PROCESS (PARSE) ALL FILES FOUND ...
    # =====================================================================================
    for f in conll_files:
        start = time.time()
        text_filename = expand_file(f)
        line_generator = file_generator(text_filename)
        total_processed_sents = 0
        # Reset the per-file state on every iteration. Previously these were set
        # once before the loop, so `file_has_next` stayed False after the first
        # file and every following file was skipped.
        # NOTE(review): chunk_ix now restarts at 0 for each file; this assumes
        # turku_parse_file keys its output on (text_filename, chunk_ix) - confirm.
        file_has_next, chunk_ix = True, 0
        while file_has_next:
            raw_text, file_has_next, n_sents = get_file_chunk(line_generator, chunk_size=CHUNK_SIZE, token_class=CoNLLUP_Token)
            total_processed_sents += n_sents
            if len(raw_text) > 0:
                turku_parse_file(raw_text, text_filename, chunk_ix)
                now = time.time()
                elapsed = (now - start)
                logger.info(f"Time Elapsed: {elapsed}. Processed {total_processed_sents}. [{total_processed_sents/elapsed} Sents/sec]\n") # Toks/Sec???
            chunk_ix += 1
        end = time.time()
        logger.info(f"Processing File {f} took {(end - start)} seconds!")
| 81 | |
| 82 | |