import glob
import os
from collections import defaultdict
José Angel Daza | c428da9 | 2020-08-14 11:50:57 +0200 | [diff] [blame] | 3 | |
# Hard-coded absolute path of the directory that is scanned for
# `*.conllu.gz` files when this module is run as a script.
DEREKO_DIR = "/export/netapp/kupietz/N-GRAMM-STUDIE/conllu/"
José Angel Daza | c428da9 | 2020-08-14 11:50:57 +0200 | [diff] [blame] | 5 | |
def get_filenames(data_dir):
    """Return the sorted base names of the ``*.conllu.gz`` files in *data_dir*.

    Only direct children of the directory are considered; subdirectories are
    not searched.

    Args:
        data_dir: Directory to scan. It is converted with ``str()`` and
            treated as a literal path: glob metacharacters in it
            (``*``, ``?``, ``[``) are escaped rather than interpreted.

    Returns:
        A list of file names without their directory part, sorted in
        ascending order. The list is empty when nothing matches, which
        includes the case where the directory does not exist.
    """
    # Escape the directory part so that only the file-name part acts as a
    # pattern; otherwise a directory such as "run[1]" would match nothing.
    pattern = f"{glob.escape(str(data_dir))}/*.conllu.gz"
    # os.path.basename copes with whatever separator glob puts in its results,
    # unlike splitting on a literal "/".
    return sorted(os.path.basename(path) for path in glob.iglob(pattern))
José Angel Daza | c428da9 | 2020-08-14 11:50:57 +0200 | [diff] [blame] | 12 | |
if __name__ == "__main__":
    # Bucket the file names by the part that precedes their first dot.
    groups_by_prefix = defaultdict(list)
    for name in get_filenames(DEREKO_DIR):
        key, _, _ = name.partition(".")
        groups_by_prefix[key].append(name)
    # Report every prefix together with the number of files sharing it.
    for key, members in groups_by_prefix.items():
        print(key, len(members))