daza | e94ab18 | 2020-09-01 16:41:30 +0200 | [diff] [blame] | 1 | # TODO: write a client to make multiple requests to the server! |
| 2 | import subprocess, json, time |
| 3 | import requests, glob, logging |
| 4 | import os.path, sys |
daza | d140380 | 2020-10-08 14:46:32 +0200 | [diff] [blame^] | 5 | from lib.CoNLL_Annotation import CoNLL09_Token |
| 6 | import my_utils.file_utils as fu |
daza | e94ab18 | 2020-09-01 16:41:30 +0200 | [diff] [blame] | 7 | |
| 8 | |
# Path of the TIGER corpus file (CoNLL-09 format, judging by the file name and
# the CoNLL09_Token class used to read it) that this script processes.
TIGER_CORPUS = "/home/daza/datasets/TIGER_conll/tiger_release_aug07.corrected.16012013.conll09"


if __name__ == "__main__":
    # Script entry point: read the TIGER corpus chunk by chunk, pass every
    # non-empty chunk to fu.turku_parse_file and log the running throughput.
    CHUNK_SIZE = 10000  # sentences requested per chunk
    MAX_CHUNKS = 10  # stop once this many chunks have been handed to the parser
    LOG_PATH = "logs/Parse_Tiger_Turku.log"

    # =====================================================================================
    #                    LOGGING INFO ...
    # =====================================================================================
    # logging.FileHandler does not create missing parent directories, so make
    # sure the log directory exists before the handler opens the file.
    os.makedirs(os.path.dirname(LOG_PATH), exist_ok=True)
    logger = logging.getLogger(__name__)
    console_hdlr = logging.StreamHandler(sys.stdout)
    file_hdlr = logging.FileHandler(filename=LOG_PATH)
    logging.basicConfig(level=logging.INFO, handlers=[console_hdlr, file_hdlr])
    logger.info(f"Chunking TIGER Corpus in chunks of {CHUNK_SIZE} Sentences")

    # =====================================================================================
    #                    PROCESS (PARSE) TIGER Corpus ...
    # =====================================================================================
    # perf_counter is monotonic, so the elapsed time cannot go backwards if the
    # system clock is adjusted while the (long-running) parse is in progress.
    start = time.perf_counter()
    file_has_next, chunk_ix = True, 0
    total_processed_sents = 0
    line_generator = fu.file_generator(TIGER_CORPUS)
    while file_has_next and chunk_ix < MAX_CHUNKS:
        # get_file_chunk yields the chunk text, a flag telling whether more
        # input remains, and the number of sentences in the chunk.
        raw_text, file_has_next, n_sents = fu.get_file_chunk(
            line_generator, chunk_size=CHUNK_SIZE, token_class=CoNLL09_Token
        )
        total_processed_sents += n_sents
        if not raw_text:
            # Empty chunk: nothing to parse, and it does not count towards MAX_CHUNKS.
            continue
        # NOTE(review): turku_parse_file lives in my_utils.file_utils; presumably
        # it submits the chunk to the Turku parser and stores the result — confirm.
        fu.turku_parse_file(raw_text, TIGER_CORPUS, chunk_ix)
        elapsed = time.perf_counter() - start
        # Guard the division so a zero elapsed time cannot abort the run.
        sents_per_sec = total_processed_sents / elapsed if elapsed > 0 else float("inf")
        logger.info(f"Time Elapsed: {elapsed}. Processed {total_processed_sents}. [{sents_per_sec} Sents/sec]\n")  # Toks/Sec???
        chunk_ix += 1
    end = time.perf_counter()
    logger.info(f"Processing File {TIGER_CORPUS} took {(end - start)} seconds!")
| 43 | |
| 44 | |