#!/bin/bash
#
# Builds the shuffled, bzip2-compressed corpus file $DESTNAME from the KorAP-XML
# zip archives listed in $CORPUS_LIST.
#
# Step 1: for every archive in the list that has no matching *.w2v file in the
#         current directory yet, run korapxml2conllu (via GNU parallel) to
#         create it.
# Step 2: if $DESTNAME is missing or older than any *.w2v file, concatenate,
#         filter, shuffle and compress all *.w2v files into $DESTNAME, and
#         write a per-corpus line count to <name>.contents.txt.
#
# Requires: parallel, korapxml2conllu, pv, grep -P, shuf, pbzip2/pbzcat, perl,
#           corpussigle2title, GNU find.

DESTNAME=20CBT.tsv.bz2
CORPUS_LIST=20cbt.corpus_files.lst

# --- Step 1: emit one conversion command per missing .w2v file ---------------
# The loop only *prints* the commands; GNU parallel reads them from the pipe
# and executes them concurrently. Progress messages go to stderr so they do not
# end up in the command stream.
while IFS= read -r line || [[ -n "$line" ]]; do
  # A blank line would otherwise yield a bogus ".w2v" job with no input file.
  [[ -n "$line" ]] || continue

  dest=$(basename -- "$line")
  dest=${dest%.zip}.w2v

  if [[ -e "$dest" ]]; then
    echo "Skipping $dest" >&2
  else
    echo "reading: ${line}" >&2
    # %q shell-quotes the paths so names containing spaces or metacharacters
    # survive re-parsing by the shell that parallel spawns.
    printf "korapxml2conllu -m '<textSigle>([^<.]+)' -m '<creatDate>([^<]{4,7})' --word2vec %q > %q\n" \
      "$line" "$dest"
  fi
done < "$CORPUS_LIST" | parallel

# --- Step 2: (re)build the combined archive when it is out of date -----------
# Collect the inputs with nullglob so an empty directory yields an empty array
# instead of the literal string "*.w2v".
shopt -s nullglob
w2v_files=(*.w2v)
shopt -u nullglob

if (( ${#w2v_files[@]} == 0 )); then
  echo "No .w2v files found; not building $DESTNAME" >&2
# find exits 0 whether or not anything matched, and fails outright when the
# -newer reference file does not exist, so its exit status cannot be used as
# the condition. Handle the missing archive explicitly and test find's output.
elif [[ ! -e "$DESTNAME" ]] \
  || [[ -n "$(find . -maxdepth 1 -name '*.w2v' -type f -newer "$DESTNAME" -print -quit)" ]]; then
  # Keep only lines of the form <ID>\t<number>\t<text>, shuffle them, compress,
  # and store the result in $DESTNAME. The compressed stream is tee'd straight
  # back through pbzcat so perl can count lines per corpus sigle (the part of
  # the first field before the first "/") without re-reading the file.
  # NOTE(review): the header calls these counts "sentence count", so each line
  # is presumably one sentence - confirm against korapxml2conllu --word2vec.
  pv "${w2v_files[@]}" \
    | grep -P '^[A-Z/0-9]+\t[0-9.]+\t[^\t]+' \
    | shuf \
    | pbzip2 -m1000 -c \
    | tee "$DESTNAME" \
    | pbzcat \
    | perl -wlne 'if(/^([^\/]+)/) { $a{$1}++ }; END { print "corpus ID\tcorpus title\tsentence count\n"; foreach $sigle (sort keys %a) {$t=`corpussigle2title $sigle`; chomp $t; print "$t\t$a{$sigle}"}}' \
      > "${DESTNAME%.tsv.bz2}.contents.txt"
fi