#!/bin/bash
#
# Configuration and output-directory setup for the klk-fi VRT -> XML -> TEI/I5
# conversion performed by the processing loop further down in this script.
# (The script's own log messages refer to it as klk2eureco.sh.)

EURECO="/scratch/project_2010889/eureco"
TEST="/scratch/project_2010889/TEST"

CORPUS="$EURECO/klk-fi-v2-vrt"
TEIDTD="$EURECO/tei/tei_all.dtd"   # DTD used to validate output that is not in i5 format

DAT="$CORPUS/data"   # input vrt files of klk-fi by year
XML="$CORPUS/XML"    # output zipped vrtxml files of selection of newspapers


TEI="$CORPUS/TEI"    # output zipped tei files of the selection
#TEI="$TEST/TEI"     # output zipped tei files of the selection

# Output flavour handed to vrt2tei.pl via -t; also selects the validation mode.
#TEIFORMAT="tei";
TEIFORMAT="i5";

# Optional extra flag for vrt2tei.pl; leave empty to pass no flag.
MASK="";
#MASK="-m";

# Suppress Perl's warnings about locale settings. Perl reads this switch from
# the environment variable PERL_BADLANG, so it has to be exported; a plain,
# unexported shell variable is never seen by the perl process.
export PERL_BADLANG=0

# Create the working and output directories if they do not exist yet.
mkdir -p "$TEST"
mkdir -p "$XML"
mkdir -p "$TEI"



# Running the iterations of a bash loop in parallel ("multiple threads"):

## for stuff in things
## do
##   ( something
##     with
##     stuff ) &
## done
## wait   # for all the "something with stuff" background jobs to finish



# Generate corpus files by year and source in vrt

# Convert each selected yearly VRT file into per-source XML files, then check,
# convert and validate every per-source file of that year, one background job
# per source directory.
#
# Reads:  DAT, XML, TEI, TEIDTD, TEIFORMAT, MASK (set earlier in this script).
# Calls:  gawk, perl, xmllint, xml_pp; vrt2xml.awk, vrt2tei.pl and
#         rearrange-idsDoc.awk are looked up relative to the current directory.
#
# Alternative selections of input files:
# for VRT in "$DAT"/*.vrt   # loop to be applied to yearly files
# for VRT in "$DAT/klk_fi_v2_2015.vrt"
# for VRT in "$DAT/klk_fi_v2_2019.vrt" "$DAT/klk_fi_v2_2020.vrt" "$DAT/klk_fi_v2_2021.vrt"
for VRT in "$DAT"/klk_fi_v2_201[0-3].vrt; do

  # An unmatched glob is handed over literally; do not pass it on to gawk.
  if [[ ! -f "$VRT" ]]; then
    echo "Warning: klk2eureco.sh: File $VRT does not exist; skipping"
    continue
  fi

  BASENAME=$(basename "$VRT" .vrt)
  # The year is the 4th "_"-separated field of the file name (klk_fi_v2_<year>).
  YEAR=$(gawk 'BEGIN {FS="_"} {print $4}' <<<"$BASENAME")

  echo "Generating proper XML files from $VRT in $XML/$YEAR by source..."
  gawk -v OUTDIR="$XML" -v YEAR="$YEAR" -f vrt2xml.awk "$VRT"

  #--------------------------------------------------------
  # Checking Wellformedness of the XML and generating TEI
  #--------------------------------------------------------

  for s in "$XML"/*/; do
    # for s in "$XML"/Suomen_Kuvalehti/
    ( # threading: each source runs in its own background subshell, so
      # `exit` below ends only the job for this source, not the script.
      SOURCE=$(basename "$s")
      x="${s%/}/$SOURCE$YEAR.xml"

      # Some checks of the XML beforehand:

      if [[ ! -f "$x" ]]; then
        echo "Warning: klk2eureco.sh: File $x does not exist; skipping"
        exit 0
      fi

      if ! grep -q "<text " "$x"; then
        echo "Warning: klk2eureco.sh: File $x does not contain any <text>s ; skipping"
        exit 0
      fi

      echo " checking wellformedness of $x"
      xmllint --stream --noout "$x"
      R=$?   # capture the status right away; any later command overwrites $?
      if (( R != 0 )); then
        echo "Error: xmllint error with error return code $R" >&2
        exit "$R"
      fi

      #--------------------
      # calling vrt2tei.pl
      #--------------------

      BASENAME=$(basename "$x" .xml)
      mkdir -p "$TEI/$SOURCE"
      t="$TEI/$SOURCE/$BASENAME.$TEIFORMAT.xml"

      echo " generating $t using vrt2tei.pl, and prettifying..."
      # ${MASK:+...} passes the flag only when MASK is non-empty, so an empty
      # MASK does not turn into an empty argument.
      # xml_pp works but takes ages; xmllint --format runs out of memory, and
      # combining it with --stream does not work.
      # rearrange-idsDoc.awk rearranges the start and end tag of idsDoc.
      perl vrt2tei.pl -t "$TEIFORMAT" ${MASK:+"$MASK"} "$x" \
        | xml_pp \
        | gawk -f rearrange-idsDoc.awk > "$t"
      ls -l "$t"

      echo " validating $t..."
      if [[ "$TEIFORMAT" == "i5" ]]; then
        xmllint --stream --noout --valid "$t"
      else
        # Seems to work this way - do not combine with --format!
        xmllint --stream --noout --dtdvalid "$TEIDTD" "$t"
      fi

      # echo " zipping $x..."
      # gzip -f "$x"

      # echo " zipping $t..."
      # gzip -f "$t"

      echo
    ) &
  done
  wait # for all per-source jobs of this year; seems to be needed for the script to terminate

done
# wait
121
122
123