José Angel Daza | 6a71c2b | 2020-08-14 13:43:22 +0200 | [diff] [blame] | 1 | import glob,re |
José Angel Daza | 4f781c0 | 2020-08-14 12:06:38 +0200 | [diff] [blame] | 2 | from collections import defaultdict |
José Angel Daza | f3f13a7 | 2020-08-14 11:50:57 +0200 | [diff] [blame] | 3 | |
# Directory that is scanned (non-recursively) for *.conllu.gz files when this
# module is run as a script.
DEREKO_DIR = "/export/netapp/kupietz/N-GRAMM-STUDIE/conllu/"
José Angel Daza | f3f13a7 | 2020-08-14 11:50:57 +0200 | [diff] [blame] | 5 | |
def get_filenames(data_dir):
    """Return the sorted base names of all ``*.conllu.gz`` files in *data_dir*.

    Only the directory itself is searched, not its sub-directories.

    Args:
        data_dir: Path of the directory to scan. It is taken literally, so
            characters such as ``[`` or ``*`` in the path are not treated as
            wildcards.

    Returns:
        A list of file names (without the directory part) in ascending
        order; empty if the directory does not exist or has no matches.
    """
    import os  # local import: the module header only brings in glob and re

    # Escape the directory part so glob metacharacters in the path cannot
    # change which files are matched; only the trailing "*" is a wildcard.
    pattern = f'{glob.escape(data_dir)}/*.conllu.gz'
    return sorted(os.path.basename(path) for path in glob.iglob(pattern))
José Angel Daza | f3f13a7 | 2020-08-14 11:50:57 +0200 | [diff] [blame] | 12 | |
if __name__ == "__main__":
    # Group the corpus files by the first run of non-digit characters in the
    # part of the file name before the first ".", then report how many files
    # each group contains.

    # Raw string: "\D" inside a plain literal is an invalid escape sequence.
    # Compiled once because it is applied to every file name.
    non_digit_run = re.compile(r"\D+")

    file_groups = defaultdict(list)
    for fn in get_filenames(DEREKO_DIR):
        stem = fn.split(".")[0]
        match = non_digit_run.search(stem)
        # A purely numeric stem has no non-digit run; group it under the stem
        # itself instead of failing with an IndexError.
        prefix = match.group(0) if match else stem
        file_groups[prefix].append(fn)

    # Largest groups first; groups of equal size keep their insertion order.
    for group, files in sorted(file_groups.items(), key=lambda x: len(x[1]), reverse=True):
        print(group, len(files))