Work with Turku Client-Server schema
diff --git a/DeReKo/turku_client_parser.py b/DeReKo/turku_client_parser.py
new file mode 100644
index 0000000..188f9fc
--- /dev/null
+++ b/DeReKo/turku_client_parser.py
@@ -0,0 +1,138 @@
+# TODO: write a client to make multiple requests to the server!
+import subprocess, time
+import requests, glob, logging
+import os.path, sys
+from CoNLL_Annotation import get_annotation, CoNLLUP_Token
+
+# Module-level logger so the helper functions below can log even when this
+# module is imported rather than run as a script
+logger = logging.getLogger(__name__)
+
+DEREKO_DIR = "/export/netapp/kupietz/N-GRAMM-STUDIE/conllu/"
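+
+# A usage sketch (an assumption about deployment, not part of this patch): with
+# a Turku parser server already listening on localhost:7689 (hard-coded below),
+# run
+#
+#   python turku_client_parser.py
+#
+# and each <name>.conllu.gz under DEREKO_DIR is expanded, its comment lines
+# rewritten, and its parses written in chunks to <name>.parsed.<chunk_ix>.conllu.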
+
+def get_filenames(data_dir):
+    # Collect and sort all .conllu.gz files directly inside data_dir
+    return sorted(glob.iglob(f'{data_dir}/*.conllu.gz', recursive=False))
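+# For illustration only (hypothetical paths):
+#   get_filenames(DEREKO_DIR) -> [".../00.conllu.gz", ".../01.conllu.gz", ...]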
+
+
+def expand_file(f):
+    # Expand the .gz file
+    fname = f[:-3]
+    if not os.path.isfile(fname):
+        p = subprocess.call(f"gunzip -c {f} > {fname}", shell=True)
+        if p == 0:
+            logger.info(f"Successfully uncompressed file {f}")
+        else:
+            logger.error(f"Couldn't expand file {f}")
+            raise Exception(f"Couldn't expand file {f}")
+    else:
+        logger.info(f"File {fname} is already uncompressed. Skipping this step...")
+
+    # Rewrite the comment lines of the expanded file so they survive parsing
+    fixed_filename = f"{fname}.fixed"
+    with open(fixed_filename, "w") as fixed_file:
+        p = subprocess.call(f"sed 's/^# /###C: /g' {fname}", shell=True, stdout=fixed_file)
+    if p == 0:
+        logger.info("Successfully fixed comments on file")
+    else:
+        logger.error("Something went wrong when substituting comment lines")
+        raise Exception(f"Couldn't fix comment lines in file {fname}")
+    return fixed_filename
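+# For illustration: the sed call above rewrites CoNLL-U comment lines into the
+# format that read_conll() below recognizes as sentence metadata, e.g.
+#
+#   # text = Der Hund bellt.   -->   ###C: text = Der Hund bellt.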
+
+
+
+def _file_generator(file_path):
+    # Yield the file line by line (lines keep their trailing newline)
+    with open(file_path, "r") as data_file:
+        logger.info("Reading instances from lines in file at: %s", file_path)
+        for line in data_file:
+            yield line
+
+
+def read_conll(line_generator, chunk_size):
+    # Read up to chunk_size sentences; ###C: lines are buffered as metadata
+    n_sents = 0
+    annotated_sentences, buffer_meta, buffer_lst = [], [], []
+    for line in line_generator:
+        if line.startswith("###C:"):
+            buffer_meta.append(line)
+            continue
+        if len(line.split()) > 0:
+            buffer_lst.append(line)
+        else:
+            # A blank line closes the current sentence
+            ann = get_annotation(buffer_lst, buffer_meta)
+            n_sents += 1
+            buffer_lst, buffer_meta = [], []
+            annotated_sentences.append(ann)
+            # Stop right after completing a sentence, so no line of the next
+            # sentence is consumed and lost at the chunk boundary
+            if n_sents == chunk_size: break
+    if len(buffer_lst) > 0:
+        # Flush a final sentence that is not followed by a blank line
+        annotated_sentences.append(get_annotation(buffer_lst, buffer_meta))
+        n_sents += 1
+    return annotated_sentences, n_sents
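+# A minimal sketch of the input read_conll() expects (tab-separated CoNLL-U
+# token lines, sentences separated by a blank line, metadata marked ###C:):
+#
+#   ###C: text = Der Hund bellt.
+#   1   Der     ...
+#   2   Hund    ...
+#   3   bellt   ...
+#   4   .       ...
+#   (blank line)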
+
+
+
+def get_file_chunk(line_generator, chunk_size):
+    # Serialize one chunk of sentences back into raw CoNLL-U text
+    chunk, n_sents = read_conll(line_generator, chunk_size)
+    file_has_next = n_sents > 0
+    parts = []
+    for anno in chunk:
+        parts.append("\n".join(anno.metadata) + "\n")
+        for tok in anno.tokens:
+            parts.append(tok.get_conllU_line() + "\n")
+        parts.append("\n")
+    raw_text = "".join(parts)
+    return raw_text, file_has_next, n_sents
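+# Note: once the line generator is exhausted, read_conll() returns zero
+# sentences, so file_has_next flips to False and the chunking loop in
+# __main__ terminates.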
+
+
+def turku_parse_file(raw_text, filename, chunk_ix):
+    basename = filename.split(".")[0]
+    out_file_str = f"{basename}.parsed.{chunk_ix}.conllu"
+    # For each chunk, request a parse from the server and save it to disk
+    logger.info(f"Sending Request {chunk_ix} to Parser Server...")
+    response = requests.post("http://localhost:7689/", data=raw_text.encode('utf-8'))
+    response_to_file(response.text, out_file_str)
+
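+# The server protocol is inferred from this script (an assumption, not a spec):
+# raw CoNLL-U text goes in the POST body and parsed CoNLL-U comes back in the
+# response body, so an equivalent manual request would be e.g.
+#
+#   curl -X POST --data-binary @chunk.conllu http://localhost:7689/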
+
+
+def response_to_file(response_str, fname):
+    with open(fname, "w") as fout:
+        fout.write(response_str)
+
+
+if __name__ == "__main__":
+    # =====================================================================================
+    #                    LOGGING INFO ...
+    # =====================================================================================
+    console_hdlr = logging.StreamHandler(sys.stdout)
+    file_hdlr = logging.FileHandler(filename="ParseTests.log")
+    logging.basicConfig(level=logging.INFO, handlers=[console_hdlr, file_hdlr])
+    logger.info("Start Logging")
+
+    CHUNK_SIZE = 20000
+    logger.info(f"Chunking in Files of {CHUNK_SIZE} Sentences")
+
+    conll_files = get_filenames(DEREKO_DIR)[:1]  # For development purposes, only process the first file
+    logger.info(f"Files to process: {conll_files}")
+    # conll_files = ["tutorial_examples/mini_test_raw.conllu.gz"]
+
+    # =====================================================================================
+    #                    PROCESS (PARSE) ALL FILES FOUND ...
+    # =====================================================================================
+    for f in conll_files:
+        start = time.time()
+        text_filename = expand_file(f)
+        line_generator = _file_generator(text_filename)
+        total_processed_sents = 0
+        # Reset per file so every file is processed from its first chunk
+        file_has_next, chunk_ix = True, 0
+        while file_has_next:
+            raw_text, file_has_next, n_sents = get_file_chunk(line_generator, chunk_size=CHUNK_SIZE)
+            total_processed_sents += n_sents
+            if len(raw_text) > 0:
+                turku_parse_file(raw_text, text_filename, chunk_ix)
+                elapsed = time.time() - start
+                logger.info(f"Time Elapsed: {elapsed:.2f}s. Processed {total_processed_sents} Sentences. [{total_processed_sents/elapsed:.2f} Sents/sec]\n")  # Toks/Sec???
+            chunk_ix += 1
+        end = time.time()
+        logger.info(f"Processing File {f} took {(end - start):.2f} seconds!")