Added Parsing and Evaluation of Lemmas using Tiger Corpus
diff --git a/DeReKo/tiger_turku_parse.py b/DeReKo/tiger_turku_parse.py
new file mode 100644
index 0000000..6ccb64a
--- /dev/null
+++ b/DeReKo/tiger_turku_parse.py
@@ -0,0 +1,44 @@
+# TODO: write a client to make multiple requests to the server!
+import subprocess, json, time
+import requests, glob, logging
+import os.path, sys
+from CoNLL_Annotation import CoNLL09_Token
+from my_utils import *
+
+# Absolute, machine-specific path of the corpus file handed to file_generator and turku_parse_file below (CoNLL-09 format going by the extension — TODO confirm).
+TIGER_CORPUS = "/home/daza/datasets/TIGER_conll/tiger_release_aug07.corrected.16012013.conll09"
+
+
+if __name__ == "__main__":  # Script entry: hand the TIGER corpus to turku_parse_file chunk by chunk and log throughput.
+ file_has_next, chunk_ix = True, 0  # loop flag (refreshed by every get_file_chunk call) and running chunk index
+ CHUNK_SIZE = 10000  # passed as chunk_size to get_file_chunk; the log message below words it as sentences per chunk
+
+ # =====================================================================================
+ # LOGGING: INFO-level records are sent to both stdout and the file ParseTests.log
+ # =====================================================================================
+ logger = logging.getLogger(__name__)
+ console_hdlr = logging.StreamHandler(sys.stdout)
+ file_hdlr = logging.FileHandler(filename=f"ParseTests.log")  # relative path: created in the current working directory; the f-prefix is redundant (no placeholders)
+ logging.basicConfig(level=logging.INFO, handlers=[console_hdlr, file_hdlr])  # configures the root logger with both handlers
+ logger.info(f"Chunking TIGER Corpus in chunks of {CHUNK_SIZE} Sentences")
+ # NOTE(review): the nesting of the loop body below cannot be read from the leading whitespace of this patch text — confirm which statements belong to the `if`.
+ # =====================================================================================
+ # PROCESS (PARSE) TIGER Corpus: pull chunks from the file and pass each non-empty one to turku_parse_file
+ # =====================================================================================
+ start = time.time()  # wall-clock reference for all elapsed-time figures below
+ total_processed_sents = 0  # running sum of the n_sents values returned by get_file_chunk
+ line_generator = file_generator(TIGER_CORPUS)  # file_generator is not defined in this file — presumably supplied by the star import from my_utils
+ while file_has_next:
+ raw_text, file_has_next, n_sents = get_file_chunk(line_generator, chunk_size=CHUNK_SIZE, token_class=CoNLL09_Token)  # helper not shown; judging by the names: chunk text, more-input flag, sentence count
+ total_processed_sents += n_sents
+ if len(raw_text) > 0:  # an empty chunk is not sent to the parser
+ turku_parse_file(raw_text, TIGER_CORPUS, chunk_ix)  # helper not shown; receives the chunk, the corpus path and the chunk index
+ now = time.time()
+ elapsed = (now - start)  # seconds since `start`, i.e. cumulative rather than per chunk
+ logger.info(f"Time Elapsed: {elapsed}. Processed {total_processed_sents}. [{total_processed_sents/elapsed} Sents/sec]\n") # Rate is sentences/sec (open question from the author: Toks/Sec???); raises ZeroDivisionError if elapsed is 0.
+ chunk_ix += 1
+ if chunk_ix == 10: break  # hard cap: stop after 10 chunks even if file_has_next is still true
+ end = time.time()
+ logger.info(f"Processing File {TIGER_CORPUS} took {(end - start)} seconds!")  # NOTE(review): worded as the whole file even when the 10-chunk cap ended the loop early
+
+
\ No newline at end of file