# Client that streams gzipped CoNLL-U files in chunks to a local parser server
# and writes each parsed chunk back to disk.
import subprocess, time
import requests, glob, logging
import os.path, sys
from CoNLL_Annotation import get_annotation, CoNLLUP_Token

DEREKO_DIR = "/export/netapp/kupietz/N-GRAMM-STUDIE/conllu/"

def get_filenames(data_dir):
    # Collect all gzipped CoNLL-U files in the data directory, in stable sorted order
    return sorted(glob.iglob(f"{data_dir}/*.conllu.gz", recursive=False))


def expand_file(f):
    # Expand the .gz file; dropping the trailing ".gz" yields the .conllu filename
    fname = f[:-3]
    if not os.path.isfile(fname):
        p = subprocess.call(f"gunzip -c {f} > {fname}", shell=True)
        if p == 0:
            logger.info(f"Successfully uncompressed file {f}")
        else:
            logger.error(f"Couldn't expand file {f}")
            raise Exception(f"gunzip failed for {f}")
    else:
        logger.info(f"File {fname} is already uncompressed. Skipping this step...")

    # Rewrite the comment lines ("# ...") of the expanded file as "###C: ..." so that
    # downstream code can recognize the original metadata lines
    fixed_filename = f"{fname}.fixed"
    p = subprocess.call(f"sed 's/^# /###C: /g' {fname}", shell=True, stdout=open(fixed_filename, "w"))
    if p == 0:
        logger.info("Successfully fixed comments on file")
    else:
        logger.error("Something went wrong when substituting comment lines")
        raise Exception(f"sed failed for {fname}")
    return fixed_filename
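
# A pure-Python alternative to expand_file (a sketch, not used by the pipeline below):
# it avoids the gunzip and sed subprocesses by decompressing with the gzip module and
# rewriting the comment lines in Python. The function name is hypothetical.
def expand_file_pure_python(f):
    import gzip
    fixed_filename = f"{f[:-3]}.fixed"
    with gzip.open(f, "rt", encoding="utf-8") as fin, open(fixed_filename, "w", encoding="utf-8") as fout:
        for line in fin:
            # Same substitution as the sed call above: "# ..." -> "###C: ..."
            if line.startswith("# "):
                line = "###C: " + line[2:]
            fout.write(line)
    return fixed_filename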


def _file_generator(file_path):
    # Yield the file line by line; blank lines are kept, since read_conll
    # uses them as sentence separators
    with open(file_path, "r") as data_file:
        logger.info("Reading instances from lines in file at: %s", file_path)
        for line in data_file:
            yield line


def read_conll(line_generator, chunk_size):
    n_sents = 0
    annotated_sentences, buffer_meta, buffer_lst = [], [], []
    for line in line_generator:
        if line.startswith("###C:"):
            buffer_meta.append(line)
            continue
        if len(line.split()) > 0:
            buffer_lst.append(line)
        else:
            # A blank line closes the current sentence
            annotated_sentences.append(get_annotation(buffer_lst, buffer_meta))
            n_sents += 1
            buffer_lst, buffer_meta = [], []
            # Check the limit only after completing a sentence, so no line of the
            # next sentence is read and silently dropped at the chunk boundary
            if n_sents == chunk_size:
                break
    # Flush a trailing sentence if the file does not end with a blank line
    if buffer_lst:
        annotated_sentences.append(get_annotation(buffer_lst, buffer_meta))
        n_sents += 1
    return annotated_sentences, n_sents
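
# For illustration, the tab-separated input that read_conll consumes looks like this
# (contents invented for illustration; expand_file has already rewritten "# ..."
# comments as "###C: ..."):
#
#   ###C: some metadata line
#   1	Der	der	DET	...
#   2	Hund	Hund	NOUN	...
#   (blank line)
#
# Each blank line closes one sentence, so chunk_size counts whole sentences.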


def get_file_chunk(line_generator, chunk_size):
    chunk, n_sents = read_conll(line_generator, chunk_size)
    file_has_next = n_sents > 0
    # Serialize the chunk back into CoNLL-U text, one blank line between sentences
    parts = []
    for anno in chunk:
        parts.append("\n".join(anno.metadata) + "\n")
        for tok in anno.tokens:
            parts.append(tok.get_conllU_line() + "\n")
        parts.append("\n")
    return "".join(parts), file_has_next, n_sents
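
# A generator-based alternative (a sketch, not wired into the main loop below):
# yielding chunks directly would replace the manual file_has_next bookkeeping
# with a plain for-loop. The function name is hypothetical.
def iter_file_chunks(line_generator, chunk_size):
    while True:
        raw_text, file_has_next, n_sents = get_file_chunk(line_generator, chunk_size)
        if not file_has_next:
            return
        yield raw_text, n_sents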


def turku_parse_file(raw_text, filename, chunk_ix):
    f = filename.split(".")[0]  # strip all extensions (assumes no dots in the directory names)
    out_file_str = f"{f}.parsed.{chunk_ix}.conllu"
    # For each chunk, make a request to the parser server and write the parse to disk
    logger.info(f"Sending Request {chunk_ix} to Parser Server...")
    response = requests.post("http://localhost:7689/", data=raw_text.encode('utf-8'))
    response.raise_for_status()  # fail loudly instead of writing an HTML error page as output
    response_to_file(response.text, out_file_str)
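
# A retry wrapper for the POST above (a sketch; the helper name, retry count, and
# wait time are assumptions, not part of the original pipeline). Useful when the
# parser server is briefly unavailable during a long corpus run.
def _post_with_retries(url, payload, retries=3, wait_secs=5):
    for attempt in range(1, retries + 1):
        try:
            response = requests.post(url, data=payload)
            response.raise_for_status()  # treat HTTP error codes as failures, too
            return response
        except requests.RequestException as e:
            logger.warning(f"Request failed (attempt {attempt}/{retries}): {e}")
            if attempt == retries:
                raise
            time.sleep(wait_secs)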


def response_to_file(response_str, fname):
    # Write the parser's response verbatim; the with-block guarantees the file is closed
    with open(fname, "w") as fout:
        fout.write(response_str)


if __name__ == "__main__":
    # =====================================================================================
    # LOGGING INFO ...
    # =====================================================================================
    logger = logging.getLogger(__name__)
    console_hdlr = logging.StreamHandler(sys.stdout)
    file_hdlr = logging.FileHandler(filename="ParseTests.log")
    logging.basicConfig(level=logging.INFO, handlers=[console_hdlr, file_hdlr])
    logger.info("Start Logging")

    CHUNK_SIZE = 20000
    logger.info(f"Chunking in Files of {CHUNK_SIZE} Sentences")

    conll_files = get_filenames(DEREKO_DIR)[:1]  # For development purposes, process only the first file
    # conll_files = ["tutorial_examples/mini_test_raw.conllu.gz"]
    logger.info(f"Files to process: {conll_files}")

    # =====================================================================================
    # PROCESS (PARSE) ALL FILES FOUND ...
    # =====================================================================================
    for f in conll_files:
        start = time.time()
        text_filename = expand_file(f)
        line_generator = _file_generator(text_filename)
        total_processed_sents = 0
        file_has_next, chunk_ix = True, 0  # reset per file; otherwise only the first file would enter the while-loop
        while file_has_next:
            raw_text, file_has_next, n_sents = get_file_chunk(line_generator, chunk_size=CHUNK_SIZE)
            total_processed_sents += n_sents
            if len(raw_text) > 0:
                turku_parse_file(raw_text, text_filename, chunk_ix)
                elapsed = time.time() - start
                logger.info(f"Time Elapsed: {elapsed:.2f}s. Processed {total_processed_sents} sentences. [{total_processed_sents/elapsed:.2f} Sents/sec]\n")
            chunk_ix += 1
        end = time.time()
        logger.info(f"Processing File {f} took {(end - start):.2f} seconds!")