Added parsing and evaluation of lemmas using the TIGER corpus
diff --git a/DeReKo/turku_client_parser.py b/DeReKo/turku_client_parser.py
index 188f9fc..6b09426 100644
--- a/DeReKo/turku_client_parser.py
+++ b/DeReKo/turku_client_parser.py
@@ -1,8 +1,9 @@
 # TODO: write a client to make multiple requests to the server!
 import subprocess, json, time
-import requests, glob, logging
+import glob, logging
 import os.path, sys
-from CoNLL_Annotation import get_annotation, CoNLLUP_Token
+from my_utils import *
+from CoNLL_Annotation import CoNLLUP_Token
 
 # TODO: Add logging instead of Prints!
 
@@ -41,67 +42,10 @@
 
 
 
-def _file_generator(file_path):
-    with open(file_path, "r") as data_file:
-        logger.info("Reading instances from lines in file at: %s", file_path)
-        for line in data_file:
-            if not line: continue
-            yield line
-
-
-def read_conll(line_generator, chunk_size):
-    n_sents = 0
-    annotated_sentences, buffer_meta, buffer_lst = [], [], []
-    for i, line in enumerate(line_generator):
-        if n_sents == chunk_size: break
-        if line.startswith("###C:"):
-            buffer_meta.append(line) 
-            continue
-        if len(line.split()) > 0:
-            buffer_lst.append(line)
-        else:
-            ann = get_annotation(buffer_lst, buffer_meta)
-            n_sents += 1
-            buffer_lst, buffer_meta = [], []
-            annotated_sentences.append(ann)
-    # logger.info("Read {} Sentences!".format(n_sents))
-    return annotated_sentences, n_sents
-
-
-
-def get_file_chunk(line_generator, chunk_size):
-    file_has_next = True
-    chunk, n_sents = read_conll(line_generator, chunk_size)
-    if n_sents == 0: file_has_next = False
-    raw_text = ""
-    for anno in chunk:
-        raw_text += "\n".join(anno.metadata) + "\n"
-        for tok in anno.tokens:
-            raw_text += tok.get_conllU_line() + "\n"
-        raw_text += "\n"
-    return raw_text, file_has_next, n_sents
-
-
-def turku_parse_file(raw_text, filename, chunk_ix):
-    f = filename.split(".")[0]
-    out_file_str = f"{f}.parsed.{chunk_ix}.conllu"
-    # For each file make a request to obtain the parse back
-    logger.info(f"Sending Request {chunk_ix} to Parser Server...")
-    response = requests.post("http://localhost:7689/", data=raw_text.encode('utf-8'))
-    response_to_file(response.text, out_file_str)
-
-
-
-def response_to_file(response_str, fname):
-    fout = open(fname, "w")
-    fout.write(response_str)
-    fout.close()
-
-
 if __name__ == "__main__":
-    conll_files = get_filenames(DEREKO_DIR)[:1] # This is for Development Purposes only process the first [at most] 5 files
-    print(conll_files)
-    # conll_files = ["tutorial_examples/mini_test_raw.conllu.gz"]
+    conll_files = get_filenames(DEREKO_DIR)[:1] # Development only: process just the first file
+    #print(conll_files)
+    #conll_files = ["tutorial_examples/mini_test_raw.conllu.gz"]
     file_has_next, chunk_ix = True, 0
     CHUNK_SIZE = 20000
     
@@ -121,10 +65,10 @@
     for f in conll_files:
         start = time.time()
         text_filename = expand_file(f)
-        line_generator = _file_generator(text_filename)
+        line_generator = file_generator(text_filename)
         total_processed_sents = 0
         while file_has_next:
-            raw_text, file_has_next, n_sents = get_file_chunk(line_generator, chunk_size=CHUNK_SIZE)
+            raw_text, file_has_next, n_sents = get_file_chunk(line_generator, chunk_size=CHUNK_SIZE, token_class=CoNLLUP_Token)
             total_processed_sents += n_sents
             if len(raw_text) > 0:
                 turku_parse_file(raw_text, text_filename, chunk_ix)
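
Note: the deleted helpers are now pulled in via "from my_utils import *" and called with the new names and signatures visible above (file_generator without the leading underscore, get_file_chunk with a token_class argument). Below is a minimal sketch of how they might look in my_utils.py, reconstructed from the removed code; the exact home of get_annotation and its token_class parameter are assumptions not confirmed by this diff.

# my_utils.py -- sketch reconstructed from the removed helpers (not the actual file)
import logging
import requests
from CoNLL_Annotation import get_annotation  # ASSUMPTION: location and 3-arg signature

logger = logging.getLogger(__name__)


def file_generator(file_path):
    # Yield the file line by line so large DeReKo files are never fully in memory.
    with open(file_path, "r") as data_file:
        logger.info("Reading instances from lines in file at: %s", file_path)
        for line in data_file:
            yield line


def read_conll(line_generator, chunk_size, token_class):
    # Collect up to chunk_size sentences. "###C:" lines carry sentence metadata,
    # token lines are buffered, and a blank line closes the current sentence.
    n_sents = 0
    annotated_sentences, buffer_meta, buffer_lst = [], [], []
    for line in line_generator:
        if n_sents == chunk_size:
            break
        if line.startswith("###C:"):
            buffer_meta.append(line)
            continue
        if len(line.split()) > 0:
            buffer_lst.append(line)
        else:
            # ASSUMPTION: get_annotation now takes the token class as a parameter.
            ann = get_annotation(buffer_lst, buffer_meta, token_class)
            n_sents += 1
            buffer_lst, buffer_meta = [], []
            annotated_sentences.append(ann)
    return annotated_sentences, n_sents


def get_file_chunk(line_generator, chunk_size, token_class):
    # Serialize one chunk back into CoNLL-U text; zero sentences signals end of file.
    chunk, n_sents = read_conll(line_generator, chunk_size, token_class)
    file_has_next = n_sents > 0
    raw_text = ""
    for anno in chunk:
        raw_text += "\n".join(anno.metadata) + "\n"
        for tok in anno.tokens:
            raw_text += tok.get_conllU_line() + "\n"
        raw_text += "\n"  # blank line terminates each sentence
    return raw_text, file_has_next, n_sents


def turku_parse_file(raw_text, filename, chunk_ix):
    # POST one chunk to the local Turku parser server and save the parsed response.
    out_file_str = f"{filename.split('.')[0]}.parsed.{chunk_ix}.conllu"
    logger.info("Sending request %d to parser server...", chunk_ix)
    response = requests.post("http://localhost:7689/", data=raw_text.encode("utf-8"))
    with open(out_file_str, "w") as fout:
        fout.write(response.text)

Moving the I/O and chunking helpers into my_utils and threading token_class through get_file_chunk lets the same request loop parse other annotation schemes by passing a different token class from CoNLL_Annotation, as the __main__ block now does with CoNLLUP_Token.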