blob: eabb2fc8d8d639b3100f06c5d1b877abdced35ac [file] [log] [blame]
import glob
import os
import re
from collections import defaultdict
José Angel Dazaf3f13a72020-08-14 11:50:57 +02003
# Directory that is scanned for ``*.conllu.gz`` files when this module is run
# as a script.
DEREKO_DIR = "/export/netapp/kupietz/N-GRAMM-STUDIE/conllu/"
José Angel Dazaf3f13a72020-08-14 11:50:57 +02005
def get_filenames(data_dir):
    """Return the sorted base names of the ``*.conllu.gz`` files in *data_dir*.

    Only *data_dir* itself is searched; sub-directories are not descended into.

    Args:
        data_dir: Path of the directory to scan.

    Returns:
        A sorted list of file names with the directory part removed. The list
        is empty when nothing matches.
    """
    matches = glob.iglob(f'{data_dir}/*.conllu.gz')
    # os.path.basename understands the platform's path separator, whereas
    # splitting on "/" only works when glob returns forward-slash paths.
    return sorted(os.path.basename(path) for path in matches)
José Angel Dazaf3f13a72020-08-14 11:50:57 +020012
if __name__ == "__main__":
    # Group the files found in DEREKO_DIR by the first run of non-digit
    # characters in their name (the part before the first "."), then report
    # how many files each group contains.
    non_digit_run = re.compile(r"\D+")
    file_groups = defaultdict(list)
    for fn in get_filenames(DEREKO_DIR):
        stem = fn.split(".")[0]
        found = non_digit_run.search(stem)
        # A stem made only of digits has no non-digit run; group such a file
        # under its own stem instead of failing with an IndexError.
        prefix = found.group(0) if found else stem
        file_groups[prefix].append(fn)
        print(stem)

    # Largest groups first; groups of equal size keep their insertion order.
    for group, files in sorted(file_groups.items(), key=lambda item: len(item[1]), reverse=True):
        print(group, len(files))