#!/bin/bash
#
# klk2eureco.sh - settings for converting the klk-fi-v2 VRT corpus into
# XML and TEI/I5 files. All paths are site-specific absolute paths.

EURECO="/scratch/project_2010889/eureco"
TEST="/scratch/project_2010889/TEST"

CORPUS="$EURECO/klk-fi-v2-vrt"
TEIDTD="$EURECO/tei/tei_all.dtd"

DAT="$CORPUS/data"  # input vrt files of klk-fi by year
XML="$CORPUS/XML"   # output zipped vrtxml files of selection of newspapers


TEI="$CORPUS/TEI"   # output zipped tei files of the selection
#TEI="$TEST/TEI"    # output zipped tei files of the selection

# Output flavour passed to vrt2tei.pl via -t ("tei" or "i5").
TEIFORMAT="tei"
#TEIFORMAT="i5"

# Extra option passed to vrt2tei.pl; leave empty to pass nothing.
MASK=""
#MASK="-m"

# Suppress Perl's warnings about locale-setting failures. Perl reads the
# variable PERL_BADLANG from its environment, so it has to be exported;
# the former plain assignment PERL_BAD=0 was never seen by perl.
export PERL_BADLANG=0

# Make sure the output directories exist before anything is written.
mkdir -p -- "$TEST" "$XML" "$TEI"



# Pattern for running the iterations of a loop in parallel in bash
# (each iteration runs in its own background subshell):

##  for stuff in things
##  do
##  ( something
##    with
##    stuff ) &
##  done
##  wait # for all the something with stuff



# Generate corpus files by year and source in vrt

# Main conversion loop. For each yearly VRT file and each per-source XML
# file belonging to it:
#   1. skip the source if its XML file is missing or contains no <text>,
#   2. check the XML file for well-formedness with xmllint,
#   3. convert it with vrt2tei.pl, pretty-print it and write $t,
#   4. validate $t (against its own DTD for i5, against $TEIDTD otherwise).
# Reads the settings DAT, XML, TEI, TEIDTD, TEIFORMAT and MASK.
# A problem with one source skips that source only ("continue"), so the
# remaining sources are still processed once the loops are widened again.

#for VRT in "$DAT"/*.vrt    # loop to be applied to yearly files
for VRT in "$DAT/klk_fi_v2_2021.vrt"; do # (

  BASENAME=$(basename -- "$VRT" .vrt)
  # The year is the fourth "_"-separated field, e.g. klk_fi_v2_2021 -> 2021.
  YEAR=$(gawk -F_ '{print $4}' <<<"$BASENAME")

  ##TMP echo "Generating proper XML files from $VRT in $XML/$YEAR by source..."
  ##TMP gawk -v OUTDIR="$XML" -v YEAR="$YEAR" -f vrt2xml.awk $VRT

  #--------------------------------------------------------
  # Checking Wellformedness of the XML and generating TEI
  #--------------------------------------------------------

  ##TMP for s in $XML/*/
  for s in "$XML/Suomen_Kuvalehti/"; do # ( # threading
    SOURCE=$(basename -- "$s")
    # Strip the trailing slash of the directory to avoid "//" in the path.
    x="${s%/}/${SOURCE}${YEAR}.xml"

    #---------------------------------------------------------------
    # Some existence checks and removal of empty files beforehand:
    #---------------------------------------------------------------

    if [[ ! -f "$x" ]]; then
      echo "Warning: klk2eureco.sh: File $x does not exist; skipping"
      continue
    fi

    if ! grep -q -e "<text " -- "$x"; then
      #echo "Warning: klk2eureco.sh: File $x does not contain any <text>s, skipping"
      echo "Warning: klk2eureco.sh: File $x does not contain any <text>s,removing"
      rm -fv -- "$x"
      continue
    fi

    #---------------------------------
    # checking well-formedness of $x
    #---------------------------------

    echo " checking wellformedness of $x"
    xmllint --stream --noout "$x"
    R=$? # exit status of xmllint; non-zero means the check failed
    if (( R != 0 )); then
      echo "Error: xmllint error with error return code $R" >&2
      continue
    fi

    #--------------------
    # calling vrt2tei.pl
    #--------------------

    BASENAME=$(basename -- "$x" .xml)
    mkdir -p -- "$TEI/$SOURCE"
    t="$TEI/$SOURCE/$BASENAME.$TEIFORMAT.xml"

    echo " generating $t using vrt2tei.pl, and prettifying..."
    # $MASK is deliberately unquoted: when it is empty no argument is passed.
    # shellcheck disable=SC2086
    perl vrt2tei.pl -t "$TEIFORMAT" $MASK "$x" |
      xml_pp |                            # xml_pp works but takes ages;
      gawk -f rearrange-idsDoc.awk >"$t"  # rearranging the start and end tag of idsDoc
    # xmllint --format runs out of memory, and using --stream with it does not work
    ls -l "$t"

    # if $t is empty, remove it; there is nothing left to validate then:
    if [[ ! -s "$t" ]]; then
      echo "Warning: klk2eureco.sh: File $t is empty, removing" >&2
      rm -f -- "$t"
      continue
    fi

    # validating $t:
    echo " validating $t..."
    if [[ "$TEIFORMAT" == "i5" ]]; then
      xmllint --stream --noout --valid "$t"
    else
      # this seems to work - do not combine with --format!
      xmllint --stream --noout --dtdvalid "$TEIDTD" "$t"
    fi

###TMP    # echo " zipping $x..."
###TMP    # gzip -f $x
###TMP
###TMP    # echo " zipping $t..."
###TMP    # gzip -f $t

    echo # ) &
  done
  wait # ) &  # wait seems to have to be there for it to terminate

done
# wait
138