Added Parsing and Evaluation of Lemmas using Tiger Corpus
diff --git a/DeReKo/turku_client_parser.py b/DeReKo/turku_client_parser.py
index 188f9fc..6b09426 100644
--- a/DeReKo/turku_client_parser.py
+++ b/DeReKo/turku_client_parser.py
@@ -1,8 +1,9 @@
# TODO: write a client to make multiple requests to the server!
import subprocess, json, time
-import requests, glob, logging
+import glob, logging
import os.path, sys
-from CoNLL_Annotation import get_annotation, CoNLLUP_Token
+from my_utils import *
+from CoNLL_Annotation import CoNLLUP_Token
# TODO: Add logging instead of Prints!
@@ -41,67 +42,10 @@
-def _file_generator(file_path):
- with open(file_path, "r") as data_file:
- logger.info("Reading instances from lines in file at: %s", file_path)
- for line in data_file:
- if not line: continue
- yield line
-
-
-def read_conll(line_generator, chunk_size):
- n_sents = 0
- annotated_sentences, buffer_meta, buffer_lst = [], [], []
- for i, line in enumerate(line_generator):
- if n_sents == chunk_size: break
- if line.startswith("###C:"):
- buffer_meta.append(line)
- continue
- if len(line.split()) > 0:
- buffer_lst.append(line)
- else:
- ann = get_annotation(buffer_lst, buffer_meta)
- n_sents += 1
- buffer_lst, buffer_meta = [], []
- annotated_sentences.append(ann)
- # logger.info("Read {} Sentences!".format(n_sents))
- return annotated_sentences, n_sents
-
-
-
-def get_file_chunk(line_generator, chunk_size):
- file_has_next = True
- chunk, n_sents = read_conll(line_generator, chunk_size)
- if n_sents == 0: file_has_next = False
- raw_text = ""
- for anno in chunk:
- raw_text += "\n".join(anno.metadata) + "\n"
- for tok in anno.tokens:
- raw_text += tok.get_conllU_line() + "\n"
- raw_text += "\n"
- return raw_text, file_has_next, n_sents
-
-
-def turku_parse_file(raw_text, filename, chunk_ix):
- f = filename.split(".")[0]
- out_file_str = f"{f}.parsed.{chunk_ix}.conllu"
- # For each file make a request to obtain the parse back
- logger.info(f"Sending Request {chunk_ix} to Parser Server...")
- response = requests.post("http://localhost:7689/", data=raw_text.encode('utf-8'))
- response_to_file(response.text, out_file_str)
-
-
-
-def response_to_file(response_str, fname):
- fout = open(fname, "w")
- fout.write(response_str)
- fout.close()
-
-
if __name__ == "__main__":
- conll_files = get_filenames(DEREKO_DIR)[:1] # This is for Development Purposes only process the first [at most] 5 files
- print(conll_files)
- # conll_files = ["tutorial_examples/mini_test_raw.conllu.gz"]
+ conll_files = get_filenames(DEREKO_DIR)[:1] # For development purposes only: process just the first file
+ #print(conll_files)
+ #conll_files = ["tutorial_examples/mini_test_raw.conllu.gz"]
file_has_next, chunk_ix = True, 0
CHUNK_SIZE = 20000
@@ -121,10 +65,10 @@
for f in conll_files:
start = time.time()
text_filename = expand_file(f)
- line_generator = _file_generator(text_filename)
+ line_generator = file_generator(text_filename)
total_processed_sents = 0
while file_has_next:
- raw_text, file_has_next, n_sents = get_file_chunk(line_generator, chunk_size=CHUNK_SIZE)
+ raw_text, file_has_next, n_sents = get_file_chunk(line_generator, chunk_size=CHUNK_SIZE, token_class=CoNLLUP_Token)
total_processed_sents += n_sents
if len(raw_text) > 0:
turku_parse_file(raw_text, text_filename, chunk_ix)
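
Note for reviewers: the helpers deleted above now come in via "from my_utils import *". Below is a minimal sketch of what this script assumes my_utils provides, reconstructed from the deleted function bodies; the new token_class parameter is inferred from the updated get_file_chunk() call site, and the actual module (which must also provide get_filenames() and expand_file(), not sketched here) may differ.

    # my_utils.py (sketch only; reconstructed from the functions deleted above)
    import requests, logging
    from CoNLL_Annotation import get_annotation

    logger = logging.getLogger(__name__)

    def file_generator(file_path):
        # Stream the corpus line by line so large files never sit fully in memory
        with open(file_path, "r") as data_file:
            logger.info("Reading instances from lines in file at: %s", file_path)
            for line in data_file:
                yield line

    def read_conll(line_generator, chunk_size, token_class):
        # Accumulate up to chunk_size sentences; "###C:" lines carry metadata
        n_sents = 0
        annotated_sentences, buffer_meta, buffer_lst = [], [], []
        for line in line_generator:
            if n_sents == chunk_size: break
            if line.startswith("###C:"):
                buffer_meta.append(line)
            elif len(line.split()) > 0:
                buffer_lst.append(line)
            else:
                # Assumption: get_annotation() now takes the token class so the
                # same reader can build different token types per corpus.
                ann = get_annotation(buffer_lst, buffer_meta, token_class)
                n_sents += 1
                buffer_lst, buffer_meta = [], []
                annotated_sentences.append(ann)
        return annotated_sentences, n_sents

    def get_file_chunk(line_generator, chunk_size, token_class):
        # Re-serialize one chunk as raw CoNLL-U text for the parser server
        chunk, n_sents = read_conll(line_generator, chunk_size, token_class)
        file_has_next = n_sents > 0
        raw_text = ""
        for anno in chunk:
            raw_text += "\n".join(anno.metadata) + "\n"
            for tok in anno.tokens:
                raw_text += tok.get_conllU_line() + "\n"
            raw_text += "\n"
        return raw_text, file_has_next, n_sents

    def turku_parse_file(raw_text, filename, chunk_ix):
        # POST the chunk to the local Turku parser and save the parsed response
        out_file_str = f"{filename.split('.')[0]}.parsed.{chunk_ix}.conllu"
        logger.info(f"Sending Request {chunk_ix} to Parser Server...")
        response = requests.post("http://localhost:7689/", data=raw_text.encode("utf-8"))
        with open(out_file_str, "w") as fout:
            fout.write(response.text)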