blob: 3a714841d6d729ff696d0d217435406926ca2de8 [file] [log] [blame]
#!/bin/bash
# Name of the combined output archive written by the aggregation step.
DESTNAME=20CBT.tsv.bz2

# Print one korapxml2conllu command line per entry of 20cbt.corpus_files.lst
# and pipe those command lines to parallel, which executes them.
# An entry is skipped when its target (<basename without .zip>.w2v in the
# current directory) already exists.
#
# read -r keeps backslashes literal (leading/trailing whitespace is still
# trimmed), and the "|| [[ -n ... ]]" clause keeps a final line that lacks a
# trailing newline from being dropped.
while read -r line || [[ -n "$line" ]]; do
  # A blank line would otherwise produce a bogus ".w2v" target.
  [[ -n "$line" ]] || continue

  dest=$(basename -- "$line")
  dest=${dest%.zip}.w2v

  if [[ -e "$dest" ]]; then
    # Use >&2 instead of "> /dev/stderr": re-opening /dev/stderr truncates
    # the log when stderr has been redirected to a regular file.
    echo "Skipping $dest" >&2
  else
    echo "reading: ${line}" >&2
    # The emitted line is re-parsed by a shell when parallel runs it, so both
    # paths are escaped with %q; plain file names are emitted unchanged.
    # NOTE(review): the two -m patterns presumably select the text sigle and
    # creation date metadata fields - confirm against korapxml2conllu docs.
    printf "korapxml2conllu -m '<textSigle>([^<.]+)' -m '<creatDate>([^<]{4,7})' --word2vec %q > %q\n" \
      "$line" "$dest"
  fi
done < 20cbt.corpus_files.lst | parallel
# Rebuild $DESTNAME and its contents table when the archive is missing or when
# at least one *.w2v file in the current directory is newer than it.
#
# The decision is based on whether find *prints* a match, not on its exit
# status: find exits 0 even when nothing matches, and exits non-zero when the
# -newer reference file does not exist (which used to suppress the very first
# build). -print -quit stops after the first hit.
newer_than=()
if [[ -e "$DESTNAME" ]]; then
  newer_than=(-newer "$DESTNAME")
fi

if [[ -n "$(find . -maxdepth 1 -name '*.w2v' -type f "${newer_than[@]}" -print -quit)" ]]; then
  # Keep only lines of the form <ID of A-Z, /, 0-9> TAB <number> TAB <text>,
  # shuffle them, compress, and store the result in $DESTNAME. The compressed
  # stream is decompressed again on the fly so that perl can count lines per
  # leading ID component (everything before the first "/") and look up a title
  # for each one with corpussigle2title.
  pv *.w2v \
    | grep -P '^[A-Z/0-9]+\t[0-9.]+\t[^\t]+' \
    | shuf \
    | pbzip2 -m1000 -c \
    | tee "$DESTNAME" \
    | pbzcat \
    | perl -wlne 'if(/^([^\/]+)/) { $a{$1}++ }; END { print "corpus ID\tcorpus title\tsentence count\n"; foreach $sigle (sort keys %a) {$t=`corpussigle2title $sigle`; chomp $t; print "$t\t$a{$sigle}"}}' \
      > "${DESTNAME%%.tsv.bz2}.contents.txt"
fi