Print file counts grouped by filename prefix
diff --git a/DeReKo/explore_dereko.py b/DeReKo/explore_dereko.py
index fa14a6c..ee56ee9 100644
--- a/DeReKo/explore_dereko.py
+++ b/DeReKo/explore_dereko.py
@@ -1,8 +1,20 @@
import glob
+from collections import defaultdict
-DEREKO_DIR = "/export/netapp/kupietz/N-GRAMM-STUDIE/conllu"
+DEREKO_DIR = "/export/netapp/kupietz/N-GRAMM-STUDIE/conllu/"  # directory scanned for *.conllu.gz files
+def get_filenames(data_dir):
+    """Return the sorted base names of the ``*.conllu.gz`` files in *data_dir*."""
+    import os  # function-scope import keeps this change self-contained
+    # os.path.join does not double the separator when data_dir ends with "/".
+    pattern = os.path.join(data_dir, "*.conllu.gz")
+    return sorted(os.path.basename(path) for path in glob.iglob(pattern))
if __name__ == "__main__":
- for filepath in glob.iglob(f'{DEREKO_DIR}/*.conllu.gz', recursive=False):
- print(filepath)
\ No newline at end of file
+    filenames = get_filenames(DEREKO_DIR)
+    file_groups = defaultdict(list)
+    # Bucket each file name under the text before its first ".".
+    for name in filenames:
+        file_groups[name.partition(".")[0]].append(name)
+    for prefix, members in file_groups.items():
+        print(prefix, len(members))
\ No newline at end of file