Work with Turku Client-Server schema
diff --git a/DeReKo/CoNLL_Annotation.py b/DeReKo/CoNLL_Annotation.py
new file mode 100644
index 0000000..10b3f54
--- /dev/null
+++ b/DeReKo/CoNLL_Annotation.py
@@ -0,0 +1,63 @@
+from collections import defaultdict, OrderedDict
+import re
+
+# CoNLL-U Format - https://universaldependencies.org/format.html
+
+
+class CoNLLUP_Token():  # One token row of a CoNLL-U style file, split into its ten columns.
+ def __init__(self, raw_line, word_ix):  # raw_line: one token line (str); word_ix: position of the token in its sentence
+ info = raw_line.split()  # NOTE(review): splits on any whitespace run, so a column containing a space would shift every later column
+ # [ID, FORM, LEMMA, UPOS, XPOS, FEATS, HEAD, DEPREL, DEPS, MISC]
+ # [11, Prügel, Prügel, NN, NN, _, _, _, _, 1.000000]
+ self.info = info  # raw column list; fewer than ten columns raises IndexError below
+ self.id = int(info[0]) # 1-based ID as in the CoNLL file; NOTE(review): int() raises ValueError on non-integer IDs (CoNLL-U also allows ranges like 1-2) - confirm the input never has them
+ self.position = int(word_ix) # 0-based position in sentence
+ self.word = info[1]
+ self.lemma = info[2]
+ self.pos_universal = info[3]
+ self.pos_tag = info[4]
+ self.detail_tag = info[5]
+ self.head = info[6]  # kept as a string, not converted to int
+ self.dep_tag = info[7]
+ self.blank = info[8] # ninth column, DEPS in the column list above
+ self.auto_score = info[9]  # tenth column (MISC in the list above); the sample row holds a score such as 1.000000 here
+
+ def get_info(self):  # Return the ten columns as a list of strings, in file order.
+ return [str(self.id), self.word, self.lemma, self.pos_universal, self.pos_tag, self.detail_tag,
+ str(self.head), self.dep_tag, self.blank, self.auto_score]
+
+ def get_conllU_line(self, separator="\t"):  # Serialise the token as one line, columns joined by `separator` (tab by default).
+ info = self.get_info()
+ return separator.join(info)
+
+
+################################# GETTING SENTENCE ANNOTATIONS ####################################
+class AnnotatedSentence():  # Container for one sentence: its metadata lines plus its token objects.
+ def __init__(self):
+ self.metadata = []  # metadata/comment lines belonging to the sentence (filled by get_annotation)
+ self.tokens = []  # token objects in sentence order (filled by get_annotation)
+
+ def get_words(self):  # Return the surface form (`word`) of every token, in order.
+ return [tok.word for tok in self.tokens]
+
+ def get_sentence(self):  # Return the token surface forms joined by single spaces.
+ return " ".join([tok.word for tok in self.tokens])
+
+ def get_pos_tags(self, universal=False):  # Return one POS tag per token; universal=True selects `pos_universal`.
+ if universal:
+ return [tok.pos_universal for tok in self.tokens]
+ else:
+ return [tok.pos_tag for tok in self.tokens]  # default: the `pos_tag` attribute (XPOS column)
+
+
+def get_annotation(raw_lines, raw_meta, token_class=CoNLLUP_Token):  # Build an AnnotatedSentence from token lines (raw_lines) and metadata lines (raw_meta).
+ ann = AnnotatedSentence()
+ ann.metadata = [m.strip("\n") for m in raw_meta]  # remove newline characters from both ends of each metadata line
+ # Create one token object per raw line, in input order
+ real_index = 0  # 0-based position handed to each token; advances only when a token is appended
+ for i, line in enumerate(raw_lines):  # NOTE(review): `i` is never used
+ tok = token_class(line, real_index)
+ if tok:  # NOTE(review): CoNLLUP_Token defines neither __bool__ nor __len__, so this is always true for the default class
+ ann.tokens.append(tok)
+ real_index += 1
+ return ann
diff --git a/DeReKo/ParseTests.log b/DeReKo/ParseTests.log
new file mode 100644
index 0000000..c64ba0b
--- /dev/null
+++ b/DeReKo/ParseTests.log
@@ -0,0 +1,282 @@
+INFO:__main__:Start Logging
+INFO:__main__:Chunking in Files of 20000 Sentences
+INFO:__main__:File /export/netapp/kupietz/N-GRAMM-STUDIE/conllu/a00.conllu is already uncompressed. Skipping this step...
+INFO:__main__:Successfully fixed comments on file
+INFO:__main__:Reading instances from lines in file at: /export/netapp/kupietz/N-GRAMM-STUDIE/conllu/a00.conllu.fixed
+INFO:__main__:Sending Request 0 to Parser Server...
+INFO:__main__:Time Elapsed: 98.19491934776306. Processed 20000. [203.67652555595905 Sents/sec]
+
+INFO:__main__:Sending Request 1 to Parser Server...
+INFO:__main__:Time Elapsed: 178.8947925567627. Processed 40000. [223.5951054154253 Sents/sec]
+
+INFO:__main__:Sending Request 2 to Parser Server...
+INFO:__main__:Time Elapsed: 258.42775654792786. Processed 60000. [232.1732030702841 Sents/sec]
+
+INFO:__main__:Sending Request 3 to Parser Server...
+INFO:__main__:Time Elapsed: 337.31846141815186. Processed 80000. [237.16460600366958 Sents/sec]
+
+INFO:__main__:Sending Request 4 to Parser Server...
+INFO:__main__:Time Elapsed: 420.1129512786865. Processed 100000. [238.03122397353542 Sents/sec]
+
+INFO:__main__:Sending Request 5 to Parser Server...
+INFO:__main__:Time Elapsed: 499.3560426235199. Processed 120000. [240.30949814794118 Sents/sec]
+
+INFO:__main__:Sending Request 6 to Parser Server...
+INFO:__main__:Time Elapsed: 580.2616112232208. Processed 140000. [241.27048436803 Sents/sec]
+
+INFO:__main__:Sending Request 7 to Parser Server...
+INFO:__main__:Time Elapsed: 664.2827656269073. Processed 160000. [240.86128419875877 Sents/sec]
+
+INFO:__main__:Sending Request 8 to Parser Server...
+INFO:__main__:Time Elapsed: 744.8023872375488. Processed 180000. [241.67484299777146 Sents/sec]
+
+INFO:__main__:Sending Request 9 to Parser Server...
+INFO:__main__:Time Elapsed: 825.8918900489807. Processed 200000. [242.16244572656927 Sents/sec]
+
+INFO:__main__:Sending Request 10 to Parser Server...
+INFO:__main__:Time Elapsed: 903.7794210910797. Processed 220000. [243.42222766524927 Sents/sec]
+
+INFO:__main__:Sending Request 11 to Parser Server...
+INFO:__main__:Time Elapsed: 983.4670946598053. Processed 240000. [244.03460095736023 Sents/sec]
+
+INFO:__main__:Sending Request 12 to Parser Server...
+INFO:__main__:Time Elapsed: 1064.4692878723145. Processed 260000. [244.25317194420325 Sents/sec]
+
+INFO:__main__:Sending Request 13 to Parser Server...
+INFO:__main__:Time Elapsed: 1148.1249577999115. Processed 280000. [243.8758935582661 Sents/sec]
+
+INFO:__main__:Sending Request 14 to Parser Server...
+INFO:__main__:Time Elapsed: 1231.3249888420105. Processed 300000. [243.63998352874535 Sents/sec]
+
+INFO:__main__:Sending Request 15 to Parser Server...
+INFO:__main__:Time Elapsed: 1313.075716972351. Processed 320000. [243.70262572355384 Sents/sec]
+
+INFO:__main__:Sending Request 16 to Parser Server...
+INFO:__main__:Time Elapsed: 1394.967396736145. Processed 340000. [243.73329498274305 Sents/sec]
+
+INFO:__main__:Sending Request 17 to Parser Server...
+INFO:__main__:Time Elapsed: 1476.8235352039337. Processed 360000. [243.76642937931499 Sents/sec]
+
+INFO:__main__:Sending Request 18 to Parser Server...
+INFO:__main__:Time Elapsed: 1557.1394641399384. Processed 380000. [244.03722900304697 Sents/sec]
+
+INFO:__main__:Sending Request 19 to Parser Server...
+INFO:__main__:Time Elapsed: 1641.3248417377472. Processed 400000. [243.70556627687506 Sents/sec]
+
+INFO:__main__:Sending Request 20 to Parser Server...
+INFO:__main__:Time Elapsed: 1721.7012026309967. Processed 420000. [243.944767743777 Sents/sec]
+
+INFO:__main__:Sending Request 21 to Parser Server...
+INFO:__main__:Time Elapsed: 1804.4057919979095. Processed 440000. [243.84758791580612 Sents/sec]
+
+INFO:__main__:Sending Request 22 to Parser Server...
+INFO:__main__:Time Elapsed: 1887.0067677497864. Processed 460000. [243.77231065712593 Sents/sec]
+
+INFO:__main__:Sending Request 23 to Parser Server...
+INFO:__main__:Time Elapsed: 2015.9566264152527. Processed 480000. [238.10036074711073 Sents/sec]
+
+INFO:__main__:Sending Request 24 to Parser Server...
+INFO:__main__:Time Elapsed: 2139.8701055049896. Processed 500000. [233.6590425342685 Sents/sec]
+
+INFO:__main__:Sending Request 25 to Parser Server...
+INFO:__main__:Time Elapsed: 2273.1346604824066. Processed 520000. [228.75899481012056 Sents/sec]
+
+INFO:__main__:Sending Request 26 to Parser Server...
+INFO:__main__:Time Elapsed: 2404.8847119808197. Processed 540000. [224.54298840596846 Sents/sec]
+
+INFO:__main__:Sending Request 27 to Parser Server...
+INFO:__main__:Time Elapsed: 2530.600071668625. Processed 560000. [221.29138707830182 Sents/sec]
+
+INFO:__main__:Sending Request 28 to Parser Server...
+INFO:__main__:Time Elapsed: 2656.4930505752563. Processed 580000. [218.33296340617287 Sents/sec]
+
+INFO:__main__:Sending Request 29 to Parser Server...
+INFO:__main__:Time Elapsed: 2781.2974302768707. Processed 600000. [215.72665816624712 Sents/sec]
+
+INFO:__main__:Sending Request 30 to Parser Server...
+INFO:__main__:Time Elapsed: 2907.4323580265045. Processed 620000. [213.24657761628586 Sents/sec]
+
+INFO:__main__:Sending Request 31 to Parser Server...
+INFO:__main__:Time Elapsed: 3030.6001527309418. Processed 640000. [211.17929378551693 Sents/sec]
+
+INFO:__main__:Sending Request 32 to Parser Server...
+INFO:__main__:Time Elapsed: 3155.522436141968. Processed 660000. [209.15712480464404 Sents/sec]
+
+INFO:__main__:Sending Request 33 to Parser Server...
+INFO:__main__:Time Elapsed: 3279.3324024677277. Processed 680000. [207.35927821415535 Sents/sec]
+
+INFO:__main__:Sending Request 34 to Parser Server...
+INFO:__main__:Time Elapsed: 3403.92253446579. Processed 700000. [205.6451029399991 Sents/sec]
+
+INFO:__main__:Sending Request 35 to Parser Server...
+INFO:__main__:Time Elapsed: 3523.8551886081696. Processed 720000. [204.3216765341544 Sents/sec]
+
+INFO:__main__:Sending Request 36 to Parser Server...
+INFO:__main__:Time Elapsed: 3649.4880363941193. Processed 740000. [202.76816710191434 Sents/sec]
+
+INFO:__main__:Sending Request 37 to Parser Server...
+INFO:__main__:Time Elapsed: 3769.89262509346. Processed 760000. [201.59725370988747 Sents/sec]
+
+INFO:__main__:Sending Request 38 to Parser Server...
+INFO:__main__:Time Elapsed: 3896.9183082580566. Processed 780000. [200.1581604487532 Sents/sec]
+
+INFO:__main__:Sending Request 39 to Parser Server...
+INFO:__main__:Time Elapsed: 4012.452913761139. Processed 800000. [199.3792867341356 Sents/sec]
+
+INFO:__main__:Sending Request 40 to Parser Server...
+INFO:__main__:Time Elapsed: 4123.85481595993. Processed 820000. [198.84308167845248 Sents/sec]
+
+INFO:__main__:Sending Request 41 to Parser Server...
+INFO:__main__:Time Elapsed: 4244.3920249938965. Processed 840000. [197.9082033548039 Sents/sec]
+
+INFO:__main__:Sending Request 42 to Parser Server...
+INFO:__main__:Time Elapsed: 4366.287281751633. Processed 860000. [196.96367749191987 Sents/sec]
+
+INFO:__main__:Sending Request 43 to Parser Server...
+INFO:__main__:Time Elapsed: 4488.923060178757. Processed 880000. [196.0381116367267 Sents/sec]
+
+INFO:__main__:Sending Request 44 to Parser Server...
+INFO:__main__:Time Elapsed: 4610.694122076035. Processed 900000. [195.19837494549768 Sents/sec]
+
+INFO:__main__:Sending Request 45 to Parser Server...
+INFO:__main__:Time Elapsed: 4731.4308161735535. Processed 920000. [194.44435219366284 Sents/sec]
+
+INFO:__main__:Sending Request 46 to Parser Server...
+INFO:__main__:Time Elapsed: 4855.6209988594055. Processed 940000. [193.59006813357297 Sents/sec]
+
+INFO:__main__:Sending Request 47 to Parser Server...
+INFO:__main__:Time Elapsed: 4980.613188266754. Processed 960000. [192.74735132243396 Sents/sec]
+
+INFO:__main__:Sending Request 48 to Parser Server...
+INFO:__main__:Time Elapsed: 5098.693638086319. Processed 980000. [192.20609622032933 Sents/sec]
+
+INFO:__main__:Sending Request 49 to Parser Server...
+INFO:__main__:Time Elapsed: 5221.365651845932. Processed 1000000. [191.5207757277956 Sents/sec]
+
+INFO:__main__:Sending Request 50 to Parser Server...
+INFO:__main__:Time Elapsed: 5339.519736289978. Processed 1020000. [191.02841648240064 Sents/sec]
+
+INFO:__main__:Sending Request 51 to Parser Server...
+INFO:__main__:Time Elapsed: 5461.673362731934. Processed 1040000. [190.41783184921024 Sents/sec]
+
+INFO:__main__:Sending Request 52 to Parser Server...
+INFO:__main__:Time Elapsed: 5584.106055259705. Processed 1060000. [189.82447494914237 Sents/sec]
+
+INFO:__main__:Sending Request 53 to Parser Server...
+INFO:__main__:Time Elapsed: 5703.546013832092. Processed 1080000. [189.35588445868797 Sents/sec]
+
+INFO:__main__:Sending Request 54 to Parser Server...
+INFO:__main__:Time Elapsed: 5821.955971479416. Processed 1100000. [188.93993795017988 Sents/sec]
+
+INFO:__main__:Sending Request 55 to Parser Server...
+INFO:__main__:Time Elapsed: 5940.822002887726. Processed 1120000. [188.52609949525308 Sents/sec]
+
+INFO:__main__:Sending Request 56 to Parser Server...
+INFO:__main__:Time Elapsed: 6057.515177726746. Processed 1140000. [188.19597913542782 Sents/sec]
+
+INFO:__main__:Sending Request 57 to Parser Server...
+INFO:__main__:Time Elapsed: 6175.445195436478. Processed 1160000. [187.84070836823477 Sents/sec]
+
+INFO:__main__:Sending Request 58 to Parser Server...
+INFO:__main__:Time Elapsed: 6293.141885995865. Processed 1180000. [187.50570404679024 Sents/sec]
+
+INFO:__main__:Sending Request 59 to Parser Server...
+INFO:__main__:Time Elapsed: 6407.568333148956. Processed 1200000. [187.27853338557657 Sents/sec]
+
+INFO:__main__:Sending Request 60 to Parser Server...
+INFO:__main__:Time Elapsed: 6519.235458374023. Processed 1220000. [187.13850846312013 Sents/sec]
+
+INFO:__main__:Sending Request 61 to Parser Server...
+INFO:__main__:Time Elapsed: 6633.7648096084595. Processed 1240000. [186.92251467883858 Sents/sec]
+
+INFO:__main__:Sending Request 62 to Parser Server...
+INFO:__main__:Time Elapsed: 6748.994223117828. Processed 1260000. [186.69448488843403 Sents/sec]
+
+INFO:__main__:Sending Request 63 to Parser Server...
+INFO:__main__:Time Elapsed: 6866.852972269058. Processed 1280000. [186.4027095336281 Sents/sec]
+
+INFO:__main__:Sending Request 64 to Parser Server...
+INFO:__main__:Time Elapsed: 6990.777011871338. Processed 1300000. [185.95930005955194 Sents/sec]
+
+INFO:__main__:Sending Request 65 to Parser Server...
+INFO:__main__:Time Elapsed: 7105.517110586166. Processed 1320000. [185.77113804052289 Sents/sec]
+
+INFO:__main__:Sending Request 66 to Parser Server...
+INFO:__main__:Time Elapsed: 7225.482071638107. Processed 1340000. [185.45475398241564 Sents/sec]
+
+INFO:__main__:Sending Request 67 to Parser Server...
+INFO:__main__:Time Elapsed: 7337.8775935173035. Processed 1360000. [185.33969566370268 Sents/sec]
+
+INFO:__main__:Sending Request 68 to Parser Server...
+INFO:__main__:Time Elapsed: 7455.035196065903. Processed 1380000. [185.10979005548356 Sents/sec]
+
+INFO:__main__:Sending Request 69 to Parser Server...
+INFO:__main__:Time Elapsed: 7578.5934200286865. Processed 1400000. [184.7308494344193 Sents/sec]
+
+INFO:__main__:Sending Request 70 to Parser Server...
+INFO:__main__:Time Elapsed: 7692.598174333572. Processed 1420000. [184.59302927557604 Sents/sec]
+
+INFO:__main__:Sending Request 71 to Parser Server...
+INFO:__main__:Time Elapsed: 7809.916503667831. Processed 1440000. [184.38097248846665 Sents/sec]
+
+INFO:__main__:Sending Request 72 to Parser Server...
+INFO:__main__:Time Elapsed: 7931.408297300339. Processed 1460000. [184.07828033477347 Sents/sec]
+
+INFO:__main__:Sending Request 73 to Parser Server...
+INFO:__main__:Time Elapsed: 8043.090455055237. Processed 1480000. [184.00887174777347 Sents/sec]
+
+INFO:__main__:Sending Request 74 to Parser Server...
+INFO:__main__:Time Elapsed: 8154.90810251236. Processed 1500000. [183.93830821194427 Sents/sec]
+
+INFO:__main__:Sending Request 75 to Parser Server...
+INFO:__main__:Time Elapsed: 8268.902240037918. Processed 1520000. [183.821256543605 Sents/sec]
+
+INFO:__main__:Sending Request 76 to Parser Server...
+INFO:__main__:Time Elapsed: 8386.647283315659. Processed 1540000. [183.6252256683867 Sents/sec]
+
+INFO:__main__:Sending Request 77 to Parser Server...
+INFO:__main__:Time Elapsed: 8499.714630842209. Processed 1560000. [183.535573575536 Sents/sec]
+
+INFO:__main__:Sending Request 78 to Parser Server...
+INFO:__main__:Time Elapsed: 8609.53560590744. Processed 1580000. [183.51744766766302 Sents/sec]
+
+INFO:__main__:Sending Request 79 to Parser Server...
+INFO:__main__:Time Elapsed: 8720.773554086685. Processed 1600000. [183.46996285096935 Sents/sec]
+
+INFO:__main__:Sending Request 80 to Parser Server...
+INFO:__main__:Time Elapsed: 8831.752426862717. Processed 1620000. [183.42905481279087 Sents/sec]
+
+INFO:__main__:Sending Request 81 to Parser Server...
+INFO:__main__:Time Elapsed: 8940.69381070137. Processed 1640000. [183.43095454594783 Sents/sec]
+
+INFO:__main__:Sending Request 82 to Parser Server...
+INFO:__main__:Time Elapsed: 9054.10047864914. Processed 1660000. [183.34234349558156 Sents/sec]
+
+INFO:__main__:Sending Request 83 to Parser Server...
+INFO:__main__:Time Elapsed: 9169.408634662628. Processed 1680000. [183.21792243495238 Sents/sec]
+
+INFO:__main__:Sending Request 84 to Parser Server...
+INFO:__main__:Time Elapsed: 9278.584917545319. Processed 1700000. [183.21759353469827 Sents/sec]
+
+INFO:__main__:Sending Request 85 to Parser Server...
+INFO:__main__:Time Elapsed: 9385.278182506561. Processed 1720000. [183.26574519719065 Sents/sec]
+
+INFO:__main__:Sending Request 86 to Parser Server...
+INFO:__main__:Time Elapsed: 9499.239807844162. Processed 1740000. [183.17255224604023 Sents/sec]
+
+INFO:__main__:Sending Request 87 to Parser Server...
+INFO:__main__:Time Elapsed: 9611.59985089302. Processed 1760000. [183.1120757525582 Sents/sec]
+
+INFO:__main__:Sending Request 88 to Parser Server...
+INFO:__main__:Time Elapsed: 9722.264467716217. Processed 1780000. [183.0849187358227 Sents/sec]
+
+INFO:__main__:Sending Request 89 to Parser Server...
+INFO:__main__:Time Elapsed: 9835.523452997208. Processed 1800000. [183.01008671292232 Sents/sec]
+
+INFO:__main__:Sending Request 90 to Parser Server...
+INFO:__main__:Time Elapsed: 9946.680324554443. Processed 1820000. [182.97562006764562 Sents/sec]
+
+INFO:__main__:Sending Request 91 to Parser Server...
+INFO:__main__:Time Elapsed: 10007.997486114502. Processed 1830665. [182.92020981619333 Sents/sec]
+
+INFO:__main__:Processing File /export/netapp/kupietz/N-GRAMM-STUDIE/conllu/a00.conllu.gz took 10007.997836351395 seconds!
diff --git a/DeReKo/__pycache__/CoNLL_Annotation.cpython-36.pyc b/DeReKo/__pycache__/CoNLL_Annotation.cpython-36.pyc
new file mode 100644
index 0000000..16cb825
--- /dev/null
+++ b/DeReKo/__pycache__/CoNLL_Annotation.cpython-36.pyc
Binary files differ
diff --git a/DeReKo/turku_client_parser.py b/DeReKo/turku_client_parser.py
new file mode 100644
index 0000000..188f9fc
--- /dev/null
+++ b/DeReKo/turku_client_parser.py
@@ -0,0 +1,138 @@
+# TODO: write a client to make multiple requests to the server!
+import subprocess, json, time
+import requests, glob, logging
+import os.path, sys
+from CoNLL_Annotation import get_annotation, CoNLLUP_Token
+
+# TODO: Add logging instead of Prints!
+
+DEREKO_DIR = "/export/netapp/kupietz/N-GRAMM-STUDIE/conllu/"  # input directory passed to get_filenames() by the script body
+
+def get_filenames(data_dir):  # Return the sorted paths of all *.conllu.gz files directly inside data_dir.
+ filenames = []
+ for filepath in glob.iglob(f'{data_dir}/*.conllu.gz', recursive=False):
+ fname = filepath.split("/")[-1]  # NOTE(review): computed but never used
+ filenames.append(filepath)
+ return sorted(filenames)
+
+
+def expand_file(f):  # Decompress f (a path ending in ".gz") beside itself, write a "<name>.fixed" copy with rewritten comment lines, return that path.
+ # Expand the .gz file; the target name is f without its last three characters
+ fname = f[:-3]
+ if not os.path.isfile(fname):
+ p = subprocess.call(f"gunzip -c {f} > {fname}", shell=True)  # NOTE(review): paths are interpolated unquoted into a shell command
+ if p == 0:
+ logger.info("Successfully uncompressed file")
+ else:
+ logger.info(f"Couldn't expand file {f}")
+ raise Exception  # bare Exception with no message; the log line above carries the detail
+ else:
+ logger.info(f"File {fname} is already uncompressed. Skipping this step...")
+
+ # Rewrite every line starting with "# " so that it starts with "###C: " instead (runs on every call, even if the .fixed file exists)
+ fixed_filename = f"{fname}.fixed"
+ p = subprocess.call(f"sed 's/^# /###C: /g' {fname}", shell=True, stdout=open(fixed_filename, "w")) # stdout=subprocess.PIPE; NOTE(review): the file object passed as stdout is never closed explicitly
+ if p == 0:
+ logger.info("Successfully fixed comments on file")
+ else:
+ logger.info(f"Something went wrong when substituting commentaries")
+ raise Exception
+ return fixed_filename
+
+
+
+def _file_generator(file_path):  # Lazily yield the lines of the text file at file_path, one at a time.
+ with open(file_path, "r") as data_file:  # NOTE(review): no explicit encoding, so the platform default is used
+ logger.info("Reading instances from lines in file at: %s", file_path)
+ for line in data_file:
+ if not line: continue  # NOTE(review): lines produced by file iteration are never empty strings, so this guard appears to be a no-op
+ yield line
+
+
+def read_conll(line_generator, chunk_size):  # Read up to chunk_size sentences from line_generator; returns (list of get_annotation results, sentence count).
+ n_sents = 0
+ annotated_sentences, buffer_meta, buffer_lst = [], [], []  # finished sentences, pending metadata lines, pending token lines
+ for i, line in enumerate(line_generator):  # NOTE(review): `i` is never used
+ if n_sents == chunk_size: break  # NOTE(review): `line` was already taken from the generator here and is discarded, so one input line is lost at each full-chunk boundary
+ if line.startswith("###C:"):  # comment lines as rewritten by expand_file
+ buffer_meta.append(line)
+ continue
+ if len(line.split()) > 0:  # non-blank line: belongs to the current sentence
+ buffer_lst.append(line)
+ else:
+ ann = get_annotation(buffer_lst, buffer_meta)  # blank line closes the sentence; NOTE(review): also runs when buffer_lst is empty (consecutive blank lines)
+ n_sents += 1
+ buffer_lst, buffer_meta = [], []
+ annotated_sentences.append(ann)
+ # logger.info("Read {} Sentences!".format(n_sents))
+ return annotated_sentences, n_sents  # NOTE(review): lines still buffered at end of input (no trailing blank line) are never emitted
+
+
+
+def get_file_chunk(line_generator, chunk_size):  # Read one chunk via read_conll and re-serialise it; returns (text, file_has_next, n_sents).
+ file_has_next = True
+ chunk, n_sents = read_conll(line_generator, chunk_size)
+ if n_sents == 0: file_has_next = False  # an empty chunk is the only end-of-input signal
+ raw_text = ""  # NOTE(review): built by repeated += below; a list with "".join would avoid repeated copying
+ for anno in chunk:
+ raw_text += "\n".join(anno.metadata) + "\n"  # metadata lines first, one per line
+ for tok in anno.tokens:
+ raw_text += tok.get_conllU_line() + "\n"  # token columns joined with the default separator (tab)
+ raw_text += "\n"  # blank line terminates the sentence
+ return raw_text, file_has_next, n_sents
+
+
+def turku_parse_file(raw_text, filename, chunk_ix):  # POST raw_text to the parser server and save the reply as "<prefix>.parsed.<chunk_ix>.conllu".
+ f = filename.split(".")[0]  # prefix = everything before the first "." in the whole path; NOTE(review): a dot in a directory name would truncate it
+ out_file_str = f"{f}.parsed.{chunk_ix}.conllu"
+ # For each file make a request to obtain the parse back
+ logger.info(f"Sending Request {chunk_ix} to Parser Server...")
+ response = requests.post("http://localhost:7689/", data=raw_text.encode('utf-8'))  # NOTE(review): no timeout and no status-code check before the body is written out
+ response_to_file(response.text, out_file_str)
+
+
+
+def response_to_file(response_str, fname):  # Write response_str to the file fname, replacing any existing content.
+ fout = open(fname, "w")  # NOTE(review): no explicit encoding; not closed if write() raises (no with-block)
+ fout.write(response_str)
+ fout.close()
+
+
+if __name__ == "__main__":
+ conll_files = get_filenames(DEREKO_DIR)[:1] # Development only: the [:1] slice restricts the run to the first file found
+ print(conll_files)
+ # conll_files = ["tutorial_examples/mini_test_raw.conllu.gz"]
+ file_has_next, chunk_ix = True, 0  # NOTE(review): set once, outside the per-file loop; file_has_next stays False after the first file and chunk_ix is never reset
+ CHUNK_SIZE = 20000  # sentences per request sent to the parser server
+
+ # =====================================================================================
+ # LOGGING INFO ...
+ # =====================================================================================
+ logger = logging.getLogger(__name__)  # global used by the functions above; NOTE(review): only defined when the file runs as a script
+ console_hdlr = logging.StreamHandler(sys.stdout)
+ file_hdlr = logging.FileHandler(filename=f"ParseTests.log")  # f-string has no placeholders; path is relative to the working directory
+ logging.basicConfig(level=logging.INFO, handlers=[console_hdlr, file_hdlr])
+ logger.info("Start Logging")
+ logger.info(f"Chunking in Files of {CHUNK_SIZE} Sentences")
+
+ # =====================================================================================
+ # PROCESS (PARSE) ALL FILES FOUND ...
+ # =====================================================================================
+ for f in conll_files:
+ start = time.time()
+ text_filename = expand_file(f)  # path of the uncompressed, comment-rewritten copy
+ line_generator = _file_generator(text_filename)
+ total_processed_sents = 0
+ while file_has_next:  # get_file_chunk turns this False once a chunk comes back with zero sentences
+ raw_text, file_has_next, n_sents = get_file_chunk(line_generator, chunk_size=CHUNK_SIZE)
+ total_processed_sents += n_sents
+ if len(raw_text) > 0:
+ turku_parse_file(raw_text, text_filename, chunk_ix)
+ now = time.time()
+ elapsed = (now - start)  # seconds since this file started, so the rate below is cumulative
+ logger.info(f"Time Elapsed: {elapsed}. Processed {total_processed_sents}. [{total_processed_sents/elapsed} Sents/sec]\n") # Toks/Sec???
+ chunk_ix += 1
+ end = time.time()
+ logger.info(f"Processing File {f} took {(end - start)} seconds!")
+
+
\ No newline at end of file
diff --git a/DeReKo/tutorial_examples/fileCount.txt b/DeReKo/tutorial_examples/fileCount.txt
index 49055f7..71de4b6 100644
--- a/DeReKo/tutorial_examples/fileCount.txt
+++ b/DeReKo/tutorial_examples/fileCount.txt
@@ -326,3 +326,5 @@
spk 1
wkb 1
wkd 1
+
+TOTAL: 3173 Files!
\ No newline at end of file