#!/bin/bash
#
# Converts the corpus archives listed in 20cbt.corpus_files.lst to word2vec
# input files and bundles them into one shuffled, bzip2-compressed TSV file.
#
# Step 1: for every path in the list, derive "<basename without .zip>.w2v" in
#         the current directory. If that file is missing, emit one
#         korapxml2conllu command line for it; the emitted command lines are
#         executed by parallel, which reads them from stdin.
# Step 2: if the archive is missing or older than at least one *.w2v file,
#         rebuild it from all *.w2v files and write a per-corpus line-count
#         table to "<archive name without .tsv.bz2>.contents.txt".
#
# External tools used: korapxml2conllu, parallel, pv, grep -P, shuf, pbzip2,
# pbzcat, perl, corpussigle2title.

DESTNAME=20CBT.tsv.bz2
CORPUS_LIST=20cbt.corpus_files.lst

# --- Step 1: generate the missing .w2v files --------------------------------
# Default IFS is kept on purpose so leading/trailing blanks are trimmed from
# each list entry; -r keeps backslashes literal. The "|| [[ -n ... ]]" part
# makes sure a final line without a trailing newline is still processed.
while read -r line || [[ -n "$line" ]]; do
  # Skip blank lines instead of generating a command for an empty path.
  [[ -n "$line" ]] || continue

  dest=$(basename -- "$line")
  dest=${dest%.zip}.w2v

  if [[ -e "$dest" ]]; then
    echo "Skipping $dest" >&2
  else
    echo "reading: ${line}" >&2
    # %q shell-quotes both paths so that names containing blanks or glob
    # characters survive the shell that parallel spawns for each command.
    # For plain paths the emitted line is unchanged.
    printf "korapxml2conllu -m '<textSigle>([^<.]+)' -m '<creatDate>([^<]{4,7})' --word2vec %q > %q\n" "$line" "$dest"
  fi
done < "$CORPUS_LIST" | parallel

# --- Step 2: rebuild the archive when it is stale ---------------------------
# Collect the inputs with nullglob so that "no .w2v files" yields an empty
# array rather than the literal pattern "*.w2v".
shopt -s nullglob
w2v_files=(*.w2v)
shopt -u nullglob

# "find ... -newer" cannot be used as the condition directly: its exit status
# is 0 even when nothing matches, and non-zero when the reference file does
# not exist yet. "-nt" is true when the left file is newer than the right one
# or when the right one does not exist, which covers the very first build.
rebuild=0
for f in "${w2v_files[@]}"; do
  if [[ -f "$f" && "$f" -nt "$DESTNAME" ]]; then
    rebuild=1
    break
  fi
done

if (( rebuild )); then
  # Keep only lines of the form "<ID>\t<number>\t<text...>", shuffle them,
  # compress, store the archive via tee, and feed the same compressed stream
  # back through pbzcat so perl can count lines per prefix up to the first
  # "/" and look up each prefix with corpussigle2title.
  pv "${w2v_files[@]}" \
    | grep -P '^[A-Z/0-9]+\t[0-9.]+\t[^\t]+' \
    | shuf \
    | pbzip2 -m1000 -c \
    | tee "$DESTNAME" \
    | pbzcat \
    | perl -wlne 'if(/^([^\/]+)/) { $a{$1}++ }; END { print "corpus ID\tcorpus title\tsentence count\n"; foreach $sigle (sort keys %a) {$t=`corpussigle2title $sigle`; chomp $t; print "$t\t$a{$sigle}"}}' > "${DESTNAME%.tsv.bz2}.contents.txt"
fi