upgrading repo to latest version
diff --git a/DeReKo/explore_dereko.py b/DeReKo/explore_dereko.py
new file mode 100644
index 0000000..c5ddc49
--- /dev/null
+++ b/DeReKo/explore_dereko.py
@@ -0,0 +1,23 @@
+import glob,re
+from collections import defaultdict
+
+DEREKO_DIR = "/export/netapp/kupietz/N-GRAMM-STUDIE/conllu/"
+
+def get_filenames(data_dir):
+    """Return the sorted base names of the ``*.conllu.gz`` files found directly in *data_dir*."""
+    pattern = f'{data_dir}/*.conllu.gz'
+    # Keep only the last "/"-separated component of every matched path.
+    basenames = [path.split("/")[-1] for path in glob.iglob(pattern, recursive=False)]
+    return sorted(basenames)
+
+if __name__ == "__main__":
+    file_groups = defaultdict(list)  # prefix -> file names sharing that prefix
+    for fn in get_filenames(DEREKO_DIR):
+        stem = fn.split(".")[0]
+        # Group by the first run of non-digits in the stem; use the whole stem if it has none.
+        match = re.search(r"\D+", stem)
+        file_groups[match.group(0) if match else stem].append(fn)
+        print(stem)
+
+    #for group,files in sorted(file_groups.items(), key=lambda x: len(x[1]), reverse=True):
+    #    print(group, len(files))
\ No newline at end of file