import glob
import os
import re
from collections import defaultdict

# Directory scanned for ``*.conllu.gz`` files when this module is run as a script.
DEREKO_DIR = "/export/netapp/kupietz/N-GRAMM-STUDIE/conllu/"

def get_filenames(data_dir):
    """Return the sorted base names of the ``*.conllu.gz`` files in *data_dir*.

    Only *data_dir* itself is searched; subdirectories are not descended
    into.  The directory part is stripped from every match, so the result
    contains bare file names.

    Args:
        data_dir: Path of the directory to scan, with or without a trailing
            path separator.  An empty string means the current directory.
            The path is passed to ``glob`` as part of the pattern, so glob
            metacharacters in it are interpreted as wildcards.

    Returns:
        A list of file names in ascending order; empty when nothing matches.
    """
    # os.path.join copes with a trailing separator and an empty data_dir,
    # where plain string formatting would yield "//" or a root-level pattern.
    pattern = os.path.join(data_dir, "*.conllu.gz")
    return sorted(os.path.basename(path) for path in glob.iglob(pattern))

# First run of non-digit characters; compiled once instead of per file name.
_NON_DIGIT_RUN = re.compile(r"\D+")


def name_prefix(filename):
    """Return the grouping prefix of *filename*.

    The stem is everything before the first ``"."``; the prefix is the first
    run of non-digit characters found in that stem.

    Args:
        filename: Bare file name, e.g. ``"abc12.conllu.gz"``.

    Returns:
        The first non-digit run of the stem (``"abc"`` for the example
        above), or the stem itself when it contains no non-digit character,
        so that purely numeric names are grouped rather than causing a crash.
    """
    stem = filename.split(".")[0]
    match = _NON_DIGIT_RUN.search(stem)
    return match.group(0) if match else stem


if __name__ == "__main__":
    # Map each prefix to the file names that share it.
    file_groups = defaultdict(list)
    for fn in get_filenames(DEREKO_DIR):
        file_groups[name_prefix(fn)].append(fn)
        print(fn.split(".")[0])

    # Report the groups, largest first; sorted() is stable, so groups of
    # equal size keep the order in which their prefix was first seen.
    for group, files in sorted(file_groups.items(), key=lambda item: len(item[1]), reverse=True):
        print(group, len(files))