# TODO: write a client to make multiple requests to the server!
import subprocess, json, time
import glob, logging
import os.path, sys
from my_utils.file_utils import *
from lib.CoNLL_Annotation import CoNLLUP_Token
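# NOTE: the wildcard import above is assumed to provide the helpers used below
# (file_generator, get_file_chunk, turku_parse_file); only CoNLLUP_Token is
# imported explicitly.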

# TODO: Add logging instead of Prints!

# Directory holding the DeReKo .conllu.gz archives to be parsed
DEREKO_DIR = "/export/netapp/kupietz/N-GRAMM-STUDIE/conllu/"

def get_filenames(data_dir):
    filenames = []
    for filepath in glob.iglob(f'{data_dir}/*.conllu.gz', recursive=False):
        filenames.append(filepath)
    return sorted(filenames)


def expand_file(f):
    # Expand the .gz file (logger is the module-level logger configured in __main__)
    fname = f[:-3]
    if not os.path.isfile(fname):
        p = subprocess.call(f"gunzip -c {f} > {fname}", shell=True)
        if p == 0:
            logger.info("Successfully uncompressed file")
        else:
            logger.error(f"Couldn't expand file {f}")
            raise Exception(f"Couldn't expand file {f}")
    else:
        logger.info(f"File {fname} is already uncompressed. Skipping this step...")

    # Substitute the comment lines ('# ...' -> '###C: ...') in the expanded file
    fixed_filename = f"{fname}.fixed"
    p = subprocess.call(f"sed 's/^# /###C: /g' {fname}", shell=True, stdout=open(fixed_filename, "w"))  # stdout=subprocess.PIPE
    if p == 0:
        logger.info("Successfully fixed comments on file")
    else:
        logger.error("Something went wrong when substituting the comment lines")
        raise Exception(f"Could not substitute comment lines in {fname}")
    return fixed_filename


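# The two shell calls in expand_file() (gunzip + sed) could also be expressed in
# pure Python. The sketch below is only an illustration of what that pipeline
# does, assuming the input is a plain gzip file; the script itself does not call it.
def _expand_file_pure_python_sketch(f):
    import gzip
    fname = f[:-3]  # strip the ".gz" suffix
    fixed_filename = f"{fname}.fixed"
    with gzip.open(f, "rt", encoding="utf-8") as gz_in, \
         open(fixed_filename, "w", encoding="utf-8") as out:
        for line in gz_in:
            # Same substitution as the sed call: turn "# ..." comments into "###C: ..."
            if line.startswith("# "):
                line = "###C: " + line[2:]
            out.write(line)
    return fixed_filename

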
if __name__ == "__main__":
    conll_files = get_filenames(DEREKO_DIR)[:1]  # Development only: process just the first file
    # print(conll_files)
    # conll_files = ["tutorial_examples/mini_test_raw.conllu.gz"]
    CHUNK_SIZE = 20000

    # =====================================================================================
    # LOGGING INFO ...
    # =====================================================================================
    logger = logging.getLogger(__name__)
    console_hdlr = logging.StreamHandler(sys.stdout)
    file_hdlr = logging.FileHandler(filename="ParseTests.log")
    logging.basicConfig(level=logging.INFO, handlers=[console_hdlr, file_hdlr])
    logger.info("Start Logging")
    logger.info(f"Chunking files into chunks of {CHUNK_SIZE} sentences")

    # =====================================================================================
    # PROCESS (PARSE) ALL FILES FOUND ...
    # =====================================================================================
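    # The loop below streams each corpus file in chunks: file_generator is
    # assumed to yield the fixed file line by line, get_file_chunk to collect up
    # to CHUNK_SIZE sentences and report whether more input remains, and
    # turku_parse_file to hand one chunk over to the parser. This description is
    # inferred from the call sites; the helpers presumably come from
    # my_utils.file_utils.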
    for f in conll_files:
        start = time.time()
        text_filename = expand_file(f)
        line_generator = file_generator(text_filename)
        # Reset per file, otherwise only the first file would be chunked
        file_has_next, chunk_ix = True, 0
        total_processed_sents = 0
        while file_has_next:
            raw_text, file_has_next, n_sents = get_file_chunk(line_generator, chunk_size=CHUNK_SIZE, token_class=CoNLLUP_Token)
            total_processed_sents += n_sents
            if len(raw_text) > 0:
                turku_parse_file(raw_text, text_filename, chunk_ix)
                now = time.time()
                elapsed = (now - start)
                logger.info(f"Time Elapsed: {elapsed:.2f} s. Processed {total_processed_sents} sentences. [{total_processed_sents/elapsed:.2f} sents/sec]\n")  # TODO: also report tokens/sec?
            chunk_ix += 1
        end = time.time()
        logger.info(f"Processing File {f} took {(end - start):.2f} seconds!")