import glob
import os
import re
from collections import defaultdict

# Directory that the script entry point scans for ``*.conllu.gz`` files.
DEREKO_DIR = "/export/netapp/kupietz/N-GRAMM-STUDIE/conllu/"

def get_filenames(data_dir):
    """Return the sorted base names of all ``*.conllu.gz`` files in *data_dir*.

    Only direct children of the directory are considered; sub-directories
    are not searched.

    Args:
        data_dir: Path of the directory to scan. It is taken literally:
            glob metacharacters in it (``[``, ``*``, ``?``) are escaped
            instead of being interpreted as a pattern.

    Returns:
        A lexicographically sorted list of file names without their
        directory part; empty when nothing matches.
    """
    # Escape the directory part so that a name such as "run[1]" is not read
    # as a character class, which would silently yield no matches.
    pattern = os.path.join(glob.escape(str(data_dir)), "*.conllu.gz")
    # os.path.basename honours the platform separator, unlike split("/").
    return sorted(os.path.basename(path) for path in glob.iglob(pattern))

# Matches a run of non-digit characters; compiled once for all file names.
_NON_DIGITS = re.compile(r"\D+")


def group_by_prefix(filenames):
    """Group file names by the first non-digit run of their stem.

    The stem is the part of a name before its first ``"."``. The group key
    is the first run of non-digit characters found in that stem, e.g.
    ``"abc12.conllu.gz"`` is filed under ``"abc"``.

    Args:
        filenames: Iterable of file names (without directory part).

    Returns:
        A dict mapping each key to the list of file names filed under it,
        in the order the names were supplied.
    """
    groups = defaultdict(list)
    for name in filenames:
        stem = name.split(".", 1)[0]
        found = _NON_DIGITS.search(stem)
        # A stem without any non-digit character (e.g. "2020") has no such
        # run; file it under the stem itself rather than raising IndexError.
        key = found.group() if found else stem
        groups[key].append(name)
    return dict(groups)


def main():
    """Print every prefix group of DEREKO_DIR with its file count."""
    groups = group_by_prefix(get_filenames(DEREKO_DIR))
    # Largest groups first; sorted() is stable, so groups of equal size keep
    # the order in which their prefix was first seen.
    by_size = sorted(groups.items(), key=lambda item: len(item[1]), reverse=True)
    for group, files in by_size:
        print(group, len(files))


if __name__ == "__main__":
    main()