# blob: eabb2fc8d8d639b3100f06c5d1b877abdced35ac [file] [log] [blame]
import glob
import os
import re
from collections import defaultdict
# Directory that is scanned for ``*.conllu.gz`` files when this file is run as a script.
DEREKO_DIR = "/export/netapp/kupietz/N-GRAMM-STUDIE/conllu/"
def get_filenames(data_dir):
    """Return the sorted base names of the ``*.conllu.gz`` files in *data_dir*.

    Only direct children of *data_dir* are considered; subdirectories are not
    searched.

    Args:
        data_dir: Path of the directory to scan. A trailing path separator is
            allowed.

    Returns:
        A lexicographically sorted list of file names without their directory
        part. The list is empty if nothing matches (including when the
        directory does not exist).
    """
    pattern = os.path.join(data_dir, "*.conllu.gz")
    # os.path.basename splits on the platform's path separator; a hard-coded
    # split("/") returns the whole path wherever glob yields backslashes.
    return sorted(os.path.basename(path) for path in glob.iglob(pattern))
if __name__ == "__main__":
    # Group the files found in DEREKO_DIR by a prefix taken from each file
    # name: the first run of non-digit characters in the part before the
    # first ".". Then print every group with its number of files.

    # Raw string: in a normal string literal "\D" is an invalid escape
    # sequence, which newer Python versions warn about.
    non_digit_run = re.compile(r"\D+")

    file_groups = defaultdict(list)
    filenames = get_filenames(DEREKO_DIR)
    for fn in filenames:
        stem = fn.split(".")[0]
        found = non_digit_run.search(stem)
        # A stem without any non-digit character (e.g. "2017") has no such
        # run; group the file under the stem itself instead of failing with
        # an IndexError.
        prefix = found.group(0) if found else stem
        file_groups[prefix].append(fn)
        print(stem)

    # Largest groups first; sorted() is stable, so groups of equal size keep
    # the order in which they were first encountered.
    by_size = sorted(
        file_groups.items(),
        key=lambda item: len(item[1]),
        reverse=True,
    )
    for group, files in by_size:
        print(group, len(files))