blob: b52c0524a73094b1715d74916abbcb224a0594fc [file] [log] [blame]
Harald Lüngen984b2c62024-08-27 16:04:59 +03001# vrt2xml.awk
2# to be called for each yearly vrt file of KLK e.g. klk_fi_v2_2021.vrt
3
# BEGIN: set separators and defaults, read the positional-attributes line
# from the main input, load the list of sources to extract from SOURCESFILE,
# and start one output XML file per selected title.
BEGIN {

    #------------------------
    # Field separators
    #------------------------

    FS = "\t";
    OFS = "\t";

    #-------------------
    # Global variables
    #-------------------

    SEENID = "";    # publ_id of the <text> most recently written to an outfile

    # Parameters to be given from command line:
    #    OUTDIR
    #    YEAR
    #    SOURCESFILE

    #----------------------------------
    # Defaults for Parameters
    #----------------------------------

    if (!OUTDIR) {
        OUTDIR = "XMLOUTDIR";
        printf("Using %s as default output directory \n", OUTDIR)
    }

    if (!YEAR) {
        YEAR = "2021";
        printf("Extracting data from source for default year %s\n", YEAR)
    }

    if (!SOURCESFILE) {
        SOURCESFILE = "sources_klk_fi_v2_2021_4eureco.csv";
        printf("Using default sources file %s in this directory\n", SOURCESFILE)
    }

    # Quote the path so that spaces or shell metacharacters cannot break
    # (or hijack) the mkdir command line.
    system("mkdir -p " shell_quote(OUTDIR));

    #------------------------------------------------------------
    # Get positional attributes from first line of input file
    #------------------------------------------------------------
    getline;
    if ($0 !~ /^\s*<!-- #vrt positional-attributes/) {
        # Diagnostics belong on stderr, like all other messages of this script.
        print "ERROR: Something went wrong when looking for positional attributes in first line" > "/dev/stderr";
    }
    else {
        PATTS = $0;
    }

    #--------------------------------------------------------------------
    # get list of sources to be extracted from separate sources file
    #--------------------------------------------------------------------

    # getline returns -1 on a read error, which is "true" in a bare
    # while(getline < file) and would loop forever; test explicitly.
    if ((getline < SOURCESFILE) < 0) {    # also gets rid of first line of sources file
        printf("ERROR: cannot read sources file %s\n", SOURCESFILE) > "/dev/stderr";
        exit 1;
    }

    while ((getline < SOURCESFILE) > 0) {

        # remove all leading and trailing spaces from relevant fields:
        e = trim($1);    # e is extract flag
        p = trim($3);    # p is publ_id
        t = trim($4);    # t is title string

        #printf("e: \t|%s|\n", e) >> "/dev/stderr";
        #printf("p: \t|%s|\n", p) >> "/dev/stderr";
        #printf("t: \t|%s|\n", t) >> "/dev/stderr";

        # set facts (arrays):
        if (e == 1) {    # i.e. where extract flag is 1

            TITLE[t] = e;      # e.g. TITLE["Aamulehti"] = 1;
            EXTRACT[p] = t;    # e.g. EXTRACT["0355-6913"] ="Aamulehti";

            # special case publ_id starting in 'fk' according to end user notes:
            if (p ~ /^fk/) {
                P = toupper(p);
                EXTRACT[P] = t;    # add fact for upper case version of publ_id
            }
        }
    }
    close(SOURCESFILE);

    for (t in EXTRACT) {
        printf("Extract: %s %s\n", t, EXTRACT[t]) >> "/dev/stderr";
    }

    # start all outfiles according to selected titles in EXTRACT:
    for (c in TITLE) {
        system("mkdir -p " shell_quote(OUTDIR "/" simplify(c)));

        OUTFILE = OUTDIR "/" simplify(c) "/" simplify(c) YEAR ".xml";

        # The first write truncates an existing file; the following ones append.
        printf("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n") > OUTFILE;
        print PATTS >> OUTFILE;
        printf("<texts>\n") >> OUTFILE;
        print "Starting outfile " OUTFILE > "/dev/stderr";
    }
} # END BEGIN

# trim: return s without leading and trailing whitespace.
function trim(s)
{
    gsub(/^[[:space:]]+|[[:space:]]+$/, "", s);
    return s;
}

# shell_quote: return s wrapped in single quotes for use as one shell word;
# embedded single quotes are rewritten as '\'' .
function shell_quote(s)
{
    gsub(/'/, "'\\''", s);
    return "'" s "'";
}
Harald Lungen7a0e6742024-08-23 09:32:02 +0300108
109
Harald Lüngenca9b17f2024-11-26 15:00:45 +0200110#---------------------------------
111# M A I N
112#---------------------------------
113
# <text ...> start tag: read its attributes into globals, select the output
# file from publ_id, and copy the tag if the source is selected in EXTRACT.
/^\s*<text/ {

    # Example attributes of a <text> tag (numbering as in the original notes;
    # USE marks the attributes that are read below):
    #
    # USE 01 binding_id="2246025"
    # USE 02 date="2021-01-15"
    #     03 datefrom="20210115"
    #     04 dateto="20210115"
    #     05 elec_date="_"
    #     06 file=""
    # USE 07 filename_metadata="finclarin_siirto_k2021/0039-5552/2021/alto/2246025_0039-5552_2021-01-15_SK0221_mets.xml"
    # USE 08 filename_orig="finclarin_siirto_k2021/0039-5552/2021/alto/2246025_0039-5552_2021-01-15_SK0221_page-2021011502210030301.xml"
    # USE 09 id="t-bcd0f3fa-bbd3dac4"
    #     10 img_url=""
    # USE 11 issue_date="15.01.2021"
    # USE 12 issue_no="SK0221"
    # USE 13 issue_title="Suomen Kuvalehti"
    # USE 14 label="Suomen Kuvalehti no. SK0221 15.01.2021"
    # USE 16 language="fi"
    # USE 17 page_id="p1"
    # USE 18 page_no="None"
    #     19 part_name="_"
    # USE 20 publ_id="0039-5552"
    #     21 publ_part=""
    # USE 22 publ_title="Suomen Kuvalehti"
    # USE 23 publ_type="aikakausi"
    # USE 24 sentcount="70"
    #     25 sum_lang="|xxx:44|fin:23|eng:3|"
    #     26 timefrom="000000"
    #     27 timeto="235959"
    # USE 28 tokencount="304"
    #     29 version_added="KLK-fi-2021">

    # vrt_attr() matches the complete attribute name, so date= is not
    # confused with issue_date= and id= is not confused with publ_id=.
    BID          = vrt_attr($0, "binding_id");
    DATE         = vrt_attr($0, "date");
    METAFILENAME = vrt_attr($0, "filename_metadata");
    ORIGFILENAME = vrt_attr($0, "filename_orig");
    ID           = vrt_attr($0, "id");
    ISSUEDATE    = vrt_attr($0, "issue_date");
    ISSUENO      = vrt_attr($0, "issue_no");
    ISSUETITLE   = vrt_attr($0, "issue_title");
    LABEL        = vrt_attr($0, "label");
    LANGUAGE     = vrt_attr($0, "language");
    PAGEID       = vrt_attr($0, "page_id");
    PAGENO       = vrt_attr($0, "page_no");
    PUBLID       = vrt_attr($0, "publ_id");
    PUBLTITLE    = vrt_attr($0, "publ_title");
    PUBLTYPE     = vrt_attr($0, "publ_type");
    SENTCOUNT    = vrt_attr($0, "sentcount");
    TOKENCOUNT   = vrt_attr($0, "tokencount");

    # print BID, DATE, METAFILENAME, ORIGFILENAME, ID, ISSUEDATE, ISSUENO, ISSUETITLE, LABEL, LANGUAGE, PAGEID, PAGENO, PUBLTITLE, PUBLTYPE, SENTCOUNT, TOKENCOUNT

    # continue OUTFILEs if publ_id of <text> is in EXTRACT.
    # The "in" test avoids creating an empty EXTRACT entry for every
    # publ_id that is not selected.
    if (PUBLID != "" && (PUBLID in EXTRACT) && EXTRACT[PUBLID]) {
        OUTFILE = OUTDIR "/" simplify(EXTRACT[PUBLID]) "/" simplify(EXTRACT[PUBLID]) YEAR ".xml";

        if (PUBLID != SEENID) {
            printf("Extracting <text>s for %s\n", EXTRACT[PUBLID]) >> "/dev/stderr";
        }
        print $0 >> OUTFILE;
        SEENID = PUBLID;
    }
}

# vrt_attr: return the value of attribute `name` in the tag `line`,
# or "" if the attribute is absent. The attribute must be preceded by
# whitespace so that only the complete attribute name matches.
# `m` is a local variable.
function vrt_attr(line, name,    m)
{
    if (match(line, "[[:space:]]" name "=\"([^\"]*)\"", m))
        return m[1];
    return "";
}
178
# <paragraph ...> start tag, e.g.
#   <paragraph id="p-bcd0f3fa-bbd3dac4-9a6ee1a8" sum_lang="|xxx:1|">
/^\s*<paragraph/ {

    # Keep the tag's attributes in globals.
    SUMLANG = gensub(/^.+sum_lang=\"([^\"]+?)\".+$/, "\\1", "1", $0);
    PID     = gensub(/^.+id=\"([^\"]+?)\".+$/, "\\1", "1", $0);

    # Copy the line only when the current publ_id is selected in EXTRACT.
    if (EXTRACT[PUBLID])
        print >> OUTFILE;
}
186
187
188
# <sentence ...> start tag, e.g.
#   <sentence id="s-bcd0f3fa-bbd3dac4-5c959030" lang="xxx" lang_conf="0.1745286">
/^\s*<sentence/ {

    # Keep the tag's attributes in globals.
    LCONF = gensub(/^.+lang_conf=\"([^\"]+?)\".+$/, "\\1", "1", $0);
    LANG  = gensub(/^.+lang=\"([^\"]+?)\".+$/, "\\1", "1", $0);
    SID   = gensub(/^.+id=\"([^\"]+?)\".+$/, "\\1", "1", $0);

    # Copy the line only when the current publ_id is selected in EXTRACT.
    if (EXTRACT[PUBLID])
        print >> OUTFILE;
}
197
198
# Any tag line that is not a comment, <text>, <paragraph> or <sentence>
# (opening or closing) is reported on stderr together with its line number.
($0 ~ /^\s*</ && $0 !~ /^.*<(\/?)(\!\-\-|text|paragraph|sentence)/) {
    # Terminate the echoed line with a newline so that successive
    # messages on stderr do not run together.
    printf("WARNING: unknown element in line %s\n%s\n", NR, $0) > "/dev/stderr";
}
200
201
# Lines that do not start with a tag are copied to the current outfile
# when the current publ_id is selected in EXTRACT.
!/^\s*</ && EXTRACT[PUBLID] {
    print >> OUTFILE;
}
205
# Closing </sentence> and </paragraph> tags are copied to the current outfile
# when the current publ_id is selected in EXTRACT.
# The pattern uses \s* (optional leading whitespace) like the opening-tag
# rules; a bare s* would only allow leading letters "s" and would drop
# indented closing tags.
/^\s*<\/(sentence|paragraph)>/ {
    if (EXTRACT[PUBLID]) { print $0 >> OUTFILE; }
}
# Closing </text> tags are copied to the current outfile when the current
# publ_id is selected in EXTRACT.
# The pattern uses \s* (optional leading whitespace) like the opening-tag
# rules; a bare s* would drop indented closing tags.
/^\s*<\/text>/ {
    if (EXTRACT[PUBLID]) { print $0 >> OUTFILE; }
}
212
213
214
215
# END: append the closing </texts> element to the outfile of every title
# in TITLE and report each finished file on stderr.
END {
    for (title in TITLE) {
        stem = simplify(title);
        OUTFILE = OUTDIR "/" stem "/" stem YEAR ".xml";

        printf("</texts>\n") >> OUTFILE;
        printf("Wrote %s\n", OUTFILE) >> "/dev/stderr";
    }
}
224
225
# simplify: turn a title into a simpler name by replacing each run of
# whitespace with "_" and the umlauts ü ä ö Ü Ä Ö with u a o U A O.
# `from`, `to`, `n` and `i` are local variables; callers pass only `str`.
function simplify(str,    from, to, n, i)
{
    gsub(/\s+/, "_", str);

    # Parallel lists: from[i] is replaced by to[i].
    n = split("ü ä ö Ü Ä Ö", from, " ");
    split("u a o U A O", to, " ");

    for (i = 1; i <= n; i++)
        gsub(from[i], to[i], str);

    return str;
}
237
238
239
240