Add more missing content

Change-Id: Ie379c8f918119351486e31745ec4f469818b9a90
diff --git a/extract-shuffled-sentences.sh b/extract-shuffled-sentences.sh
new file mode 100755
index 0000000..3a71484
--- /dev/null
+++ b/extract-shuffled-sentences.sh
@@ -0,0 +1,43 @@
+#!/bin/bash
+#
+# Convert every archive listed in 20cbt.corpus_files.lst to a word2vec-format
+# file (<name>.w2v in the current directory) with korapxml2conllu, then pack
+# the filtered, shuffled lines of all *.w2v files into $DESTNAME and write a
+# per-corpus sentence-count table to <DESTNAME minus .tsv.bz2>.contents.txt.
+#
+# External tools used: parallel, korapxml2conllu, pv, grep -P, shuf, pbzip2,
+# pbzcat, perl, corpussigle2title.
+
+DESTNAME=20CBT.tsv.bz2
+
+# Step 1: print one conversion command per archive that has no .w2v output yet
+# and hand the commands to parallel for execution.
+while IFS= read -r line; do
+  [[ -n $line ]] || continue # ignore blank lines in the list
+  dest=$(basename -- "$line")
+  dest=${dest%.zip}.w2v
+  if [[ -e $dest ]]; then
+    echo "Skipping $dest" >&2
+  else
+    echo "reading: ${line}" >&2
+    # %q shell-quotes both paths so names containing spaces or shell
+    # metacharacters survive being re-parsed as a command line.
+    printf "korapxml2conllu -m '<textSigle>([^<.]+)' -m '<creatDate>([^<]{4,7})' --word2vec %q > %q\n" "$line" "$dest"
+  fi
+done < 20cbt.corpus_files.lst | parallel
+
+# Step 2: (re)build the archive when it is missing or older than at least one
+# .w2v file. find exits 0 even when nothing matches and fails when the -newer
+# reference file does not exist, so its exit status cannot be the condition;
+# test for the archive explicitly and for non-empty find output instead.
+w2v_files=(./*.w2v)
+if [[ -e ${w2v_files[0]} ]] && {
+  [[ ! -e $DESTNAME ]] ||
+    [[ -n $(find . -maxdepth 1 -name '*.w2v' -type f -newer "$DESTNAME" -print -quit) ]]
+}; then
+  # Keep only lines whose first three tab-separated fields match the pattern,
+  # shuffle and compress them into $DESTNAME; tee passes the same stream on to
+  # pbzcat so perl can count lines per prefix before the first "/" and look up
+  # a title for each prefix via corpussigle2title.
+  pv "${w2v_files[@]}" | grep -P '^[A-Z/0-9]+\t[0-9.]+\t[^\t]+' | shuf | pbzip2 -m1000 -c | tee "$DESTNAME" |
+    pbzcat | perl -wlne 'if(/^([^\/]+)/) { $a{$1}++ }; END { print "corpus ID\tcorpus title\tsentence count\n"; foreach $sigle (sort keys %a) {$t=`corpussigle2title $sigle`; chomp $t; print "$t\t$a{$sigle}"}}' > "${DESTNAME%.tsv.bz2}.contents.txt"
+fi