blob: d8d6d882548c14c27aa4de50dd91038b52a94735 [file] [log] [blame]
# vrt2xml.awk
# To be called once for each yearly VRT file of KLK, e.g. klk_fi_v2_2021.vrt
3
# Set-up: create the output directories, consume the VRT header line and
# start one XML output file per selected newspaper title.
BEGIN {
    # OUTDIR (output root) and YEAR (file-name suffix) are read in this BEGIN
    # block, so they have to be assigned before it runs (e.g. with -v).

    OFS = "\t";

    make_dir(OUTDIR);

    # The first input record is read here, so the main rules never see it.
    # It is expected to be the VRT positional-attributes comment.
    getline;
    if ($0 !~ /^\s*<!-- #vrt positional-attributes/) {
        # Diagnostics go to stderr, like every other message of this script.
        print "ERROR: Something went wrong when looking for positional attributes in first line" > "/dev/stderr";
    }
    else {
        PATTS = $0;
    }

    # Titles (values of publ_title) that are exported; value 1 marks a
    # selected title.
    CONTAIN["Aamulehti"] = 1;
    CONTAIN["Etelä-Suomen Sanomat"] = 1;
    CONTAIN["Hämeen Sanomat"] = 1;
    CONTAIN["Helsingin Sanomat"] = 1;
    CONTAIN["Ilkka-Pohjalainen"] = 1;
    CONTAIN["Ilta-Sanomat"] = 1;
    CONTAIN["Iltalehti"] = 1;
    CONTAIN["Kaleva"] = 1;
    CONTAIN["Karjalainen"] = 1;
    CONTAIN["Kauppalehti"] = 1;
    CONTAIN["Keskipohjanmaa"] = 1;
    CONTAIN["Keskisuomalainen"] = 1;
    CONTAIN["Kouvolan Sanomat"] = 1;
    CONTAIN["Kymen Sanomat"] = 1;
    CONTAIN["Länsi-Savo"] = 1;
    CONTAIN["Lapin Kansa"] = 1;
    CONTAIN["Satakunnan Kansa"] = 1;
    CONTAIN["Savon Sanomat"] = 1;
    CONTAIN["Suomen Kuvalehti"] = 1;
    CONTAIN["Turun Sanomat"] = 1;

    # Start all outfiles according to the selected titles in CONTAIN:
    # OUTDIR/<title>/<title><YEAR>.xml, with <title> passed through simplify().
    for (c in CONTAIN) {
        SIMPLE = simplify(c);

        make_dir(OUTDIR "/" SIMPLE);

        OUTFILE = OUTDIR "/" SIMPLE "/" SIMPLE YEAR ".xml";

        # ">" truncates a file left over from an earlier run; the stream then
        # stays open, so the later ">>" writes continue in the same file.
        printf("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n") > OUTFILE;
        print PATTS >> OUTFILE;
        printf("<texts>\n") >> OUTFILE;
        print "Starting outfile " OUTFILE > "/dev/stderr";
    }
}

# Return s wrapped in single quotes so that the shell treats it as one
# literal word; embedded single quotes are rewritten as '\''.
function shell_quote(s)
{
    gsub(/'/, "'\\''", s);
    return "'" s "'";
}

# Create directory dir (and missing parents) via the shell.
# Returns the exit status reported by system().
function make_dir(dir)
{
    return system("mkdir -p -- " shell_quote(dir));
}
54
55
# Opening <text ...> tag: extract its metadata attributes, derive the output
# file for the text's publication title and copy the tag if that title is
# selected in CONTAIN.
/^\s*<text/ {

# Attributes of a <text> element (USE = extracted below):
# USE 01 binding_id="2246025"
# USE 02 date="2021-01-15"
#     03 datefrom="20210115"
#     04 dateto="20210115"
#     05 elec_date="_"
#     06 file=""
# USE 07 filename_metadata="finclarin_siirto_k2021/0039-5552/2021/alto/2246025_0039-5552_2021-01-15_SK0221_mets.xml"
# USE 08 filename_orig="finclarin_siirto_k2021/0039-5552/2021/alto/2246025_0039-5552_2021-01-15_SK0221_page-2021011502210030301.xml"
# USE 09 id="t-bcd0f3fa-bbd3dac4"
#     10 img_url=""
# USE 11 issue_date="15.01.2021"
# USE 12 issue_no="SK0221"
# USE 13 issue_title="Suomen Kuvalehti"
# USE 14 label="Suomen Kuvalehti no. SK0221 15.01.2021"
# USE 16 language="fi"
# USE 17 page_id="p1"
# USE 18 page_no="None"
#     19 part_name="_"
#     20 publ_id="0039-5552"
#     21 publ_part=""
# USE 22 publ_title="Suomen Kuvalehti"
# USE 23 publ_type="aikakausi"
# USE 24 sentcount="70"
#     25 sum_lang="|xxx:44|fin:23|eng:3|"
#     26 timefrom="000000"
#     27 timeto="235959"
# USE 28 tokencount="304"
#     29 version_added="KLK-fi-2021">

    # Each attribute is looked up by its full name. A pattern with a greedy
    # ".+" prefix would pick the last attribute whose name merely ends in the
    # wanted name (issue_date for date, publ_id for id).
    BID          = text_attr($0, "binding_id");
    DATE         = text_attr($0, "date");
    METAFILENAME = text_attr($0, "filename_metadata");
    ORIGFILENAME = text_attr($0, "filename_orig");
    ID           = text_attr($0, "id");
    ISSUEDATE    = text_attr($0, "issue_date");
    ISSUENO      = text_attr($0, "issue_no");
    ISSUETITLE   = text_attr($0, "issue_title");
    LABEL        = text_attr($0, "label");
    LANGUAGE     = text_attr($0, "language");
    PAGEID       = text_attr($0, "page_id");
    PAGENO       = text_attr($0, "page_no");
    PUBLTITLE    = text_attr($0, "publ_title");
    PUBLTYPE     = text_attr($0, "publ_type");
    SENTCOUNT    = text_attr($0, "sentcount");
    TOKENCOUNT   = text_attr($0, "tokencount");

#   print BID, DATE, METAFILENAME, ORIGFILENAME, ID, ISSUEDATE, ISSUENO, ISSUETITLE, LABEL, LANGUAGE, PAGEID, PAGENO, PUBLTITLE, PUBLTYPE, SENTCOUNT, TOKENCOUNT

    SIMPLETITLE = simplify(PUBLTITLE);
    OUTFILE = OUTDIR "/" SIMPLETITLE "/" SIMPLETITLE YEAR ".xml";

    # Continue the OUTFILE if the PUBLTITLE of this <text> is in CONTAIN.
    # ">>" is used like in all other rules; the file was already started
    # (and truncated) in BEGIN.
    if (CONTAIN[PUBLTITLE]) { print $0 >> OUTFILE; }
}

# Return the value of attribute `name` in the start tag `tag`, or "" when the
# tag has no such attribute. The name must be preceded by whitespace, so a
# longer attribute name ending in `name` is never matched. `m` is a local.
function text_attr(tag, name,    m)
{
    if (match(tag, "[[:space:]]" name "=\"([^\"]*)\"", m))
        return m[1];
    return "";
}
113
# Opening <paragraph ...> tag, e.g.
#   <paragraph id="p-bcd0f3fa-bbd3dac4-9a6ee1a8" sum_lang="|xxx:1|">
# Records the paragraph id and language summary, then copies the tag to the
# current OUTFILE when the enclosing text's title is selected.
/^\s*<paragraph/ {
    SUMLANG = gensub(/^.+sum_lang=\"([^\"]+?)\".+$/, "\\1", "1", $0);
    PID     = gensub(/^.+id=\"([^\"]+?)\".+$/, "\\1", "1", $0);

    if (CONTAIN[PUBLTITLE])
        print >> OUTFILE;
}
121
122
123
# Opening <sentence ...> tag, e.g.
#   <sentence id="s-bcd0f3fa-bbd3dac4-5c959030" lang="xxx" lang_conf="0.1745286">
# Records sentence id, language and language confidence, then copies the tag
# to the current OUTFILE when the enclosing text's title is selected.
/^\s*<sentence/ {
    LCONF = gensub(/^.+lang_conf=\"([^\"]+?)\".+$/, "\\1", "1", $0);
    LANG  = gensub(/^.+lang=\"([^\"]+?)\".+$/, "\\1", "1", $0);
    SID   = gensub(/^.+id=\"([^\"]+?)\".+$/, "\\1", "1", $0);

    if (CONTAIN[PUBLTITLE])
        print >> OUTFILE;
}
132
133
# Warn on stderr about any line that starts with a tag other than a comment
# or an opening/closing text, paragraph or sentence tag. The check is anchored
# at the start of the line, so a known tag name occurring later in the line
# cannot hide an unknown leading element. Each warning ends with a newline so
# that consecutive warnings do not run together.
($0 ~ /^\s*</ && $0 !~ /^\s*<\/?(!--|text|paragraph|sentence)/) {
    printf("WARNING: unknown element in line %s\n%s\n", NR, $0) > "/dev/stderr";
}
135
136
# Any line that does not start with a tag (after optional whitespace) is
# copied unchanged to the current OUTFILE when the enclosing text's title
# is selected.
!/^\s*</ {
    if (CONTAIN[PUBLTITLE])
        print >> OUTFILE;
}
140
# Closing </sentence>, </paragraph> and </text> tags are copied to the current
# OUTFILE when the enclosing text's title is selected. Leading whitespace is
# allowed (\s*), consistent with the rules for the opening tags; a literal
# "s*" here would silently drop indented closing tags.
/^\s*<\/(sentence|paragraph|text)>/ {
    if (CONTAIN[PUBLTITLE]) { print $0 >> OUTFILE; }
}
147
148
149
150
# Finish every output file that was started for a selected title by
# appending the closing </texts> tag.
END {
    for (c in CONTAIN) {
        # Only entries explicitly set to 1 are selected titles; entries that
        # merely came into existence through a lookup are skipped.
        if (CONTAIN[c] != 1)
            continue;

        OUTFILE = OUTDIR "/" simplify(c) "/" simplify(c) YEAR ".xml";
        printf("</texts>\n") >> OUTFILE;
    }
}
161
162
# Turn a title into a file-system friendly name: every run of whitespace
# becomes one underscore and the umlauts ü/ä/ö/Ü/Ä/Ö are replaced by their
# base letters. Returns the converted string; the extra parameters are locals.
function simplify(str,    table, count, k)
{
    gsub(/\s+/, "_", str);

    # Pairs of "umlaut replacement", applied one after the other.
    count = split("ü u ä a ö o Ü U Ä A Ö O", table, " ");
    for (k = 1; k < count; k += 2) {
        gsub(table[k], table[k + 1], str);
    }

    return str;
}
174
175
176
177