#!/bin/bash
#
# Convert every archive listed in 20cbt.corpus_files.lst to a .w2v file with
# korapxml2conllu (jobs run through `parallel`), then merge all .w2v files in
# the current directory into one shuffled, bzip2-compressed TSV and write a
# per-corpus line-count summary next to it.
#
# External tools used: parallel, korapxml2conllu, pv, grep -P, shuf, pbzip2,
# pbzcat, perl, corpussigle2title.

DESTNAME=20CBT.tsv.bz2
CORPUS_LIST=20cbt.corpus_files.lst

#######################################
# Turn a list of input paths into shell commands for `parallel`.
# An input whose .w2v output already exists in the current directory is
# skipped, so an interrupted run can be resumed.
# Inputs:   stdin - one input path per line (blank lines are ignored)
# Outputs:  stdout - one korapxml2conllu command line per pending input
#           stderr - progress messages
#######################################
emit_conversion_jobs() {
  local line dest
  # `|| [[ -n $line ]]` keeps a final line that lacks a trailing newline.
  while IFS= read -r line || [[ -n "$line" ]]; do
    [[ -n "$line" ]] || continue
    dest=$(basename -- "$line")
    dest=${dest%.zip}.w2v
    if [[ -e "$dest" ]]; then
      printf 'Skipping %s\n' "$dest" >&2
    else
      printf 'reading: %s\n' "$line" >&2
      # %q shell-quotes the paths: the emitted line is re-parsed by the shell
      # that `parallel` starts, so names with spaces must survive that.
      printf "korapxml2conllu -m '<textSigle>([^<.]+)' -m '<creatDate>([^<]{4,7})' --word2vec %q > %q\n" \
        "$line" "$dest"
    fi
  done
}

#######################################
# Decide whether the archive has to be (re)built.
# Arguments: $1 - path of the archive
# Returns:   0 if at least one ./*.w2v file exists and the archive is missing
#            or older than one of them; non-zero otherwise
#######################################
archive_is_stale() {
  local archive=$1
  local hit
  local -a criteria=(-maxdepth 1 -name '*.w2v' -type f)
  # -newer fails outright when its reference file is absent, so the test is
  # only added once the archive exists; a missing archive counts as stale.
  if [[ -e "$archive" ]]; then
    criteria+=(-newer "$archive")
  fi
  # find exits 0 even when nothing matches, so check its output instead of
  # its exit status. -quit stops at the first match.
  hit=$(find . "${criteria[@]}" -print -quit)
  [[ -n "$hit" ]]
}

#######################################
# Merge all ./*.w2v files into the archive and write the contents summary.
# Only lines of the form <[A-Z/0-9]+> TAB <[0-9.]+> TAB <text> are kept; they
# are shuffled and compressed. The compressed stream is decompressed again on
# the fly so perl can count lines per prefix (the part before the first "/")
# without a second pass over the input files.
# Arguments: $1 - path of the archive (expected to end in .tsv.bz2)
# Outputs:   $1 and <$1 without .tsv.bz2>.contents.txt
#######################################
build_archive() {
  local archive=$1
  local contents=${archive%.tsv.bz2}.contents.txt
  # NOTE(review): the summary header names three columns while the perl code
  # prints the corpussigle2title output plus the count -- presumably that tool
  # emits "ID<TAB>title"; confirm.
  pv ./*.w2v \
    | grep -P '^[A-Z/0-9]+\t[0-9.]+\t[^\t]+' \
    | shuf \
    | pbzip2 -m1000 -c \
    | tee "$archive" \
    | pbzcat \
    | perl -wlne 'if(/^([^\/]+)/) { $a{$1}++ }; END { print "corpus ID\tcorpus title\tsentence count\n"; foreach $sigle (sort keys %a) {$t=`corpussigle2title $sigle`; chomp $t; print "$t\t$a{$sigle}"}}' > "$contents"
}

emit_conversion_jobs < "$CORPUS_LIST" | parallel

if archive_is_stale "$DESTNAME"; then
  build_archive "$DESTNAME"
fi
| |