Work with Turku Client-Server schema
diff --git a/DeReKo/turku_client_parser.py b/DeReKo/turku_client_parser.py
new file mode 100644
index 0000000..188f9fc
--- /dev/null
+++ b/DeReKo/turku_client_parser.py
@@ -0,0 +1,138 @@
+# TODO: write a client to make multiple requests to the server!
+import subprocess, json, time
+import requests, glob, logging
+import os.path, sys
+from CoNLL_Annotation import get_annotation, CoNLLUP_Token
+
+# TODO: Add logging instead of Prints!
+
+DEREKO_DIR = "/export/netapp/kupietz/N-GRAMM-STUDIE/conllu/"
+
def get_filenames(data_dir):
    """Return the sorted list of *.conllu.gz file paths directly inside *data_dir*.

    Parameters
    ----------
    data_dir : str
        Directory to scan (non-recursive).

    Returns
    -------
    list[str]
        Full file paths, sorted lexicographically.
    """
    # The original loop also computed the basename (`fname`) but never used it;
    # sorting the glob iterator directly is all that is needed.
    return sorted(glob.iglob(f'{data_dir}/*.conllu.gz', recursive=False))
+
+
def expand_file(f):
    """Uncompress a .gz CoNLL-U file and rewrite its comment lines.

    The gzip file *f* is expanded next to itself (".gz" suffix stripped),
    unless the expanded file already exists.  Comment lines starting with
    "# " are then rewritten as "###C: " so downstream chunking can recognize
    them, producing "<expanded>.fixed".

    Parameters
    ----------
    f : str
        Path to a *.gz file.

    Returns
    -------
    str
        Path of the fixed (comment-rewritten) file.

    Raises
    ------
    Exception
        If either shell step exits with a non-zero status.
    """
    # Expand the .gz file ("[:-3]" strips the ".gz" suffix).
    fname = f[:-3]
    if not os.path.isfile(fname):
        p = subprocess.call(f"gunzip -c {f} > {fname}", shell=True)
        if p == 0:
            logger.info("Successfully uncompressed file")
        else:
            logger.error(f"Couldn't expand file {f}")
            raise Exception(f"gunzip failed for {f} (exit code {p})")
    else:
        logger.info(f"File {fname} is already uncompressed. Skipping this step...")

    # Substitute the commentary lines ("# " -> "###C: ") on the expanded file.
    fixed_filename = f"{fname}.fixed"
    # Context manager closes the stdout handle even on failure — the original
    # opened it inline and never closed it (leaked file descriptor).
    with open(fixed_filename, "w") as fixed_out:
        p = subprocess.call(f"sed 's/^# /###C: /g' {fname}", shell=True, stdout=fixed_out)
    if p == 0:
        logger.info("Successfully fixed comments on file")
    else:
        logger.error("Something went wrong when substituting commentaries")
        raise Exception(f"sed comment substitution failed for {fname} (exit code {p})")
    return fixed_filename
+
+
+
def _file_generator(file_path):
    """Lazily yield the lines of *file_path*, skipping any falsy (empty) ones."""
    with open(file_path, "r") as data_file:
        logger.info("Reading instances from lines in file at: %s", file_path)
        for current_line in data_file:
            if current_line:
                yield current_line
+
+
def read_conll(line_generator, chunk_size):
    """Read up to *chunk_size* sentences from a stream of CoNLL-U lines.

    Comment lines ("###C: ...") are buffered as metadata for the following
    sentence; token lines accumulate until a blank line closes the sentence,
    at which point ``get_annotation`` turns both buffers into one annotated
    sentence.

    Parameters
    ----------
    line_generator : Iterator[str]
        Lines of a (comment-fixed) CoNLL-U file; state persists across calls.
    chunk_size : int
        Maximum number of sentences to read in this call.

    Returns
    -------
    tuple[list, int]
        (annotated_sentences, n_sents) where n_sents == len(annotated_sentences).
    """
    n_sents = 0
    annotated_sentences, buffer_meta, buffer_lst = [], [], []
    for line in line_generator:
        if line.startswith("###C:"):
            buffer_meta.append(line)
            continue
        if len(line.split()) > 0:
            buffer_lst.append(line)
        else:
            ann = get_annotation(buffer_lst, buffer_meta)
            n_sents += 1
            buffer_lst, buffer_meta = [], []
            annotated_sentences.append(ann)
            # Stop AFTER completing a sentence.  The original checked the
            # limit at the top of the loop, which had already pulled the next
            # line off the generator — silently dropping one line at every
            # chunk boundary.
            if n_sents == chunk_size:
                break
    return annotated_sentences, n_sents
+
+
+
def get_file_chunk(line_generator, chunk_size):
    """Read one chunk of sentences and serialize it back to raw CoNLL-U text.

    Parameters
    ----------
    line_generator : Iterator[str]
        Shared line stream; consumed incrementally across calls.
    chunk_size : int
        Maximum sentences per chunk.

    Returns
    -------
    tuple[str, bool, int]
        (raw_text, file_has_next, n_sents); ``file_has_next`` is False once
        the generator yields no further sentences.
    """
    chunk, n_sents = read_conll(line_generator, chunk_size)
    file_has_next = n_sents > 0
    # Accumulate pieces and join once: repeated string `+=` over a 20k-sentence
    # chunk is quadratic.
    parts = []
    for anno in chunk:
        parts.append("\n".join(anno.metadata) + "\n")
        for tok in anno.tokens:
            parts.append(tok.get_conllU_line() + "\n")
        parts.append("\n")
    return "".join(parts), file_has_next, n_sents
+
+
def turku_parse_file(raw_text, filename, chunk_ix):
    """POST one chunk of raw CoNLL-U text to the local Turku parser server and
    save the parsed response as "<stem>.parsed.<chunk_ix>.conllu"."""
    # NOTE(review): split(".")[0] keeps everything before the FIRST dot in the
    # whole path — assumes directory names contain no dots; verify with callers.
    stem = filename.split(".")[0]
    out_file_str = f"{stem}.parsed.{chunk_ix}.conllu"
    # One request per chunk; the server returns the parsed CoNLL-U as text.
    logger.info(f"Sending Request {chunk_ix} to Parser Server...")
    server_response = requests.post("http://localhost:7689/", data=raw_text.encode('utf-8'))
    response_to_file(server_response.text, out_file_str)
+
+
+
def response_to_file(response_str, fname):
    """Write *response_str* to *fname*, truncating any existing content.

    Uses a context manager so the handle is closed (and buffers flushed)
    even if the write raises — the original relied on implicit cleanup of
    a manually opened file object.
    """
    with open(fname, "w") as fout:
        fout.write(response_str)
+
+
if __name__ == "__main__":
    # Development: only process the first file for now (slice matches the code,
    # unlike the original comment which claimed "at most 5").
    conll_files = get_filenames(DEREKO_DIR)[:1]
    print(conll_files)
    # conll_files = ["tutorial_examples/mini_test_raw.conllu.gz"]
    CHUNK_SIZE = 20000

    # =====================================================================================
    # LOGGING INFO ...
    # =====================================================================================
    logger = logging.getLogger(__name__)
    console_hdlr = logging.StreamHandler(sys.stdout)
    file_hdlr = logging.FileHandler(filename="ParseTests.log")
    logging.basicConfig(level=logging.INFO, handlers=[console_hdlr, file_hdlr])
    logger.info("Start Logging")
    logger.info(f"Chunking in Files of {CHUNK_SIZE} Sentences")

    # =====================================================================================
    # PROCESS (PARSE) ALL FILES FOUND ...
    # =====================================================================================
    for f in conll_files:
        start = time.time()
        text_filename = expand_file(f)
        line_generator = _file_generator(text_filename)
        total_processed_sents = 0
        # Reset per file: the original initialized these once before the loop,
        # so after the first file completed, file_has_next stayed False and
        # every remaining file would have been skipped entirely.
        file_has_next, chunk_ix = True, 0
        while file_has_next:
            raw_text, file_has_next, n_sents = get_file_chunk(line_generator, chunk_size=CHUNK_SIZE)
            total_processed_sents += n_sents
            if len(raw_text) > 0:
                turku_parse_file(raw_text, text_filename, chunk_ix)
                now = time.time()
                elapsed = (now - start)
                logger.info(f"Time Elapsed: {elapsed}. Processed {total_processed_sents}. [{total_processed_sents/elapsed} Sents/sec]\n")  # Toks/Sec???
                chunk_ix += 1
        end = time.time()
        logger.info(f"Processing File {f} took {(end - start)} seconds!")
+
\ No newline at end of file