Add more missing content
Change-Id: Ie379c8f918119351486e31745ec4f469818b9a90
diff --git a/extract-shuffled-sentences.sh b/extract-shuffled-sentences.sh
new file mode 100755
index 0000000..3a71484
--- /dev/null
+++ b/extract-shuffled-sentences.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+
+DESTNAME=20CBT.tsv.bz2
+while read line; do
+ dest=$(basename $line)
+ dest=${dest%%.zip}.w2v
+ if [ -e $dest ]; then
+ echo "Skipping $dest" > /dev/stderr
+ else
+ echo "reading: ${line}" > /dev/stderr
+ echo "korapxml2conllu -m '<textSigle>([^<.]+)' -m '<creatDate>([^<]{4,7})' --word2vec $line > $dest"
+ fi
+done < 20cbt.corpus_files.lst | parallel
+
+if find . -maxdepth 1 -name "*.w2v" -type f -newer $DESTNAME; then
+ pv *.w2v | grep -P '^[A-Z/0-9]+\t[0-9.]+\t[^\t]+' | shuf | pbzip2 -m1000 -c | tee $DESTNAME | \
+ pbzcat | perl -wlne 'if(/^([^\/]+)/) { $a{$1}++ }; END { print "corpus ID\tcorpus title\tsentence count\n"; foreach $sigle (sort keys %a) {$t=`corpussigle2title $sigle`; chomp $t; print "$t\t$a{$sigle}"}}' > ${DESTNAME%%.tsv.bz2}.contents.txt
+fi
+