Add TEIFORMAT flag to distinguish between TEI and I5 output
diff --git a/klk2eureco.sh b/klk2eureco.sh
index f293edc..027f06b 100755
--- a/klk2eureco.sh
+++ b/klk2eureco.sh
@@ -8,9 +8,11 @@
DAT="$CORPUS/data" # input vrt files of klk-fi by year
XML="$CORPUS/XML" # output zipped vrtxml files of selection of newspapers
-#XML="$TEST/XML" # output zipped vrtxml files of selection of newspapers
-#TEI="$CORPUS/TEI" # output zipped tei files of the selection
-TEI="$TEST/TEI" # output zipped tei files of the selection
+TEI="$CORPUS/TEI" # output zipped tei files of the selection
+#TEI="$TEST/TEI" # output zipped tei files of the selection
+
+#TEIFORMAT="tei";
+TEIFORMAT="i5"; # output format flag ("i5" or "tei"): used as the output file suffix and to choose the xmllint validation mode
mkdir -p $TEST
mkdir -p $XML
@@ -33,22 +35,28 @@
# Generate corpus files by year and source in vrt
# for VRT in "$DAT/*.vrt"
-for VRT in "$DAT/klk_fi_v2_2021.vrt" # to be applied to yearly files
+#for VRT in "$DAT/klk_fi_v2_2021.vrt" # to be applied to yearly files
+# for VRT in "$DAT/klk_fi_v2_2020.vrt"
+for VRT in "$DAT/klk_fi_v2_2018.vrt" "$DAT/klk_fi_v2_2019.vrt"
do # (
BASENAME=`basename $VRT .vrt`
YY=`echo $BASENAME | gawk 'BEGIN {FS="_"} {print $4}'`
- #echo "Generating proper XML files from $VRT in $XML/$YY by source..."
- #gawk -v OUTDIR="$XML" -v YEAR="$YY" -f vrt2xml.awk $VRT # will generate corpus files for different sources in the YEAR dir
+ echo "Generating proper XML files from $VRT in $XML/$YY by source..."
+ gawk -v OUTDIR="$XML" -v YEAR="$YY" -f vrt2xml.awk $VRT # will generate corpus files for different sources in the YEAR dir
+done # end of the loop over the yearly VRT input files
+exit # NOTE(review): unconditional exit - the well-formedness check and TEI/I5 generation below are not executed while this line is present
+
+
# Checking Wellformedness of the XML and generating TEI
- # for s in $XML/*/
- for s in $XML/Suomen_Kuvalehti/
+ for s in $XML/*/
+ # for s in $XML/Suomen_Kuvalehti/
# for s in $XML/Lansi-Savo/
# for s in $XML/Helsingin_Sanomat/
# for s in $XML/Aamulehti/ $XML/Etela-Suomen_Sanomat/ $XML/Hameen_Sanomat/ $XML/Helsingin_Sanomat/ $XML/Ilkka-Pohjalainen/ $XML/Ilta-Sanomat/ $XML/Kaleva/ $XML/Keskipohjanmaa/ $XML/Satakunnan_Kansa/ $XML/Savon_Sanomat/ $XML/Turun_Sanomat/
- do # ( # threading
+ do ( # threading: the body for each source directory runs in its own subshell
SOURCE=`basename $s`
x=$s/$SOURCE$YY.xml
@@ -61,8 +69,7 @@
BASENAME=`basename $x .xml`
mkdir -p $TEI/$SOURCE
- t="$TEI/$SOURCE/$BASENAME.tei.xml"
- t0="$TEI/$SOURCE/$BASENAME.tei.0.xml"
+ t="$TEI/$SOURCE/$BASENAME.$TEIFORMAT.xml" # output file; the suffix carries the selected format (.i5.xml or .tei.xml)
echo " generating $t using vrt2tei.pl, and prettifying..."
./vrt2tei.pl $x |
@@ -72,7 +79,11 @@
ls -l $t
echo " validating..."
- xmllint --stream --noout --dtdvalid $TEIDTD $t # scheint so zu funktioneren - nicht kombinieren mit --format!
+ if [ "$TEIFORMAT" = "i5" ] ; then # POSIX string comparison; "==" inside [ ] is a bashism
+   xmllint --stream --noout --valid "$t" # I5: validate against the DTD declared in the document itself
+ else
+   xmllint --stream --noout --dtdvalid "$TEIDTD" "$t" # TEI: validate against $TEIDTD; seems to work this way - do not combine with --format!
+ fi
# echo " zipping $x..."
# gzip -f $x
@@ -80,7 +91,7 @@
# echo " zipping $t..."
# gzip -f $t
- echo # ) &
+ echo ) & # ")" closes the per-source subshell and "&" runs it in the background; NOTE(review): with "wait" commented out below, the script may return before these jobs finish - confirm this is intended
done
# wait # ) & # wait does not seem to make sense if nothing follows