blob: 451e2254199c349afb6350a0ad76c3b800e3b427 [file] [log] [blame]
# Initialisation: create the output directory, register the publication
# titles that are to be extracted, and start one XML file per title.
BEGIN {
    OFS = "\t";

    # Directory that receives one XML file per registered publication.
    OUTDIR = "XML";
    system("mkdir -p \"" OUTDIR "\"");

    # Whitelist of publication titles (compared against publ_title="...").
    # Texts whose title is not listed here are not written anywhere.
    CONTAIN["Aamulehti"]            = 1;
    CONTAIN["Etelä-Suomen Sanomat"] = 1;
    CONTAIN["Forssan Lehti"]        = 1;
    CONTAIN["Hämeen Sanomat"]       = 1;
    CONTAIN["Helsingin Sanomat"]    = 1;
    CONTAIN["Ilkka-Pohjalainen"]    = 1;
    CONTAIN["Ilta-Sanomat"]         = 1;
    CONTAIN["Iltalehti"]            = 1;
    CONTAIN["Kaleva"]               = 1;
    CONTAIN["Karjalainen"]          = 1;
    CONTAIN["Kauppalehti"]          = 1;
    CONTAIN["Keskipohjanmaa"]       = 1;
    CONTAIN["Keskisuomalainen"]     = 1;
    CONTAIN["Kouvolan Sanomat"]     = 1;
    CONTAIN["Kymen Sanomat"]        = 1;
    CONTAIN["Länsi-Savo"]           = 1;
    CONTAIN["Lapin Kansa"]          = 1;
    CONTAIN["Satakunnan Kansa"]     = 1;
    CONTAIN["Savon Sanomat"]        = 1;
    CONTAIN["Suomen Kuvalehti"]     = 1;
    CONTAIN["Turun Sanomat"]        = 1;

    # Open every output file (truncating any previous content), write the
    # XML declaration and the opening root element, and report the file
    # name on stderr.
    for (publ in CONTAIN) {
        OUTFILE = OUTDIR "/" simplify(publ) ".xml";

        printf("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n") > OUTFILE;
        printf("<texts>\n") >> OUTFILE;

        print OUTFILE > "/dev/stderr";
    }
}
37
38
39
# Lines that open an XML comment (the "positional attributes" header) are
# echoed to standard output; they are not written to any per-title file.
/^\s*<\!\-\-/ {
    print $0;
}
41
# Return the value of attribute NAME inside the start tag TAG, or "" when
# the attribute is not present.
#
# The attribute name must be preceded by whitespace, so that a short name
# such as "id" or "date" is not found inside a longer one ("binding_id",
# "page_id", "publ_id", "issue_date", "elec_date").
#
#   tag   the complete start-tag line
#   name  attribute name (letters, digits, underscore)
#   m     local scratch array for match(); callers do not pass it
function xml_attr(tag, name,    m)
{
    if (match(tag, "[[:space:]]" name "=\"([^\"]*)\"", m))
        return m[1];
    return "";
}

# <text ...> start tag: read the metadata attributes into global variables
# and select the output file belonging to the text's publication title.
#
# Attribute inventory of a <text> tag (numbers give the position in the tag,
# "USE" marks the attributes that are extracted below):
#   USE 01 binding_id="2246025"
#   USE 02 date="2021-01-15"
#       03 datefrom="20210115"
#       04 dateto="20210115"
#       05 elec_date="_"
#       06 file=""
#   USE 07 filename_metadata="finclarin_siirto_k2021/0039-5552/2021/alto/2246025_0039-5552_2021-01-15_SK0221_mets.xml"
#   USE 08 filename_orig="finclarin_siirto_k2021/0039-5552/2021/alto/2246025_0039-5552_2021-01-15_SK0221_page-2021011502210030301.xml"
#   USE 09 id="t-bcd0f3fa-bbd3dac4"
#       10 img_url=""
#   USE 11 issue_date="15.01.2021"
#   USE 12 issue_no="SK0221"
#   USE 13 issue_title="Suomen Kuvalehti"
#   USE 14 label="Suomen Kuvalehti no. SK0221 15.01.2021"
#   USE 16 language="fi"
#   USE 17 page_id="p1"
#   USE 18 page_no="None"
#       19 part_name="_"
#       20 publ_id="0039-5552"
#       21 publ_part=""
#   USE 22 publ_title="Suomen Kuvalehti"
#   USE 23 publ_type="aikakausi"
#   USE 24 sentcount="70"
#       25 sum_lang="|xxx:44|fin:23|eng:3|"
#       26 timefrom="000000"
#       27 timeto="235959"
#   USE 28 tokencount="304"
#       29 version_added="KLK-fi-2021">
/^\s*<text/ {
    BID          = xml_attr($0, "binding_id");
    DATE         = xml_attr($0, "date");
    METAFILENAME = xml_attr($0, "filename_metadata");
    ORIGFILENAME = xml_attr($0, "filename_orig");
    ID           = xml_attr($0, "id");
    ISSUEDATE    = xml_attr($0, "issue_date");
    ISSUENO      = xml_attr($0, "issue_no");
    ISSUETITLE   = xml_attr($0, "issue_title");
    LABEL        = xml_attr($0, "label");
    LANGUAGE     = xml_attr($0, "language");
    PAGEID       = xml_attr($0, "page_id");
    PAGENO       = xml_attr($0, "page_no");
    PUBLTITLE    = xml_attr($0, "publ_title");
    PUBLTYPE     = xml_attr($0, "publ_type");
    SENTCOUNT    = xml_attr($0, "sentcount");
    TOKENCOUNT   = xml_attr($0, "tokencount");

    # Debugging aid:
    # print BID, DATE, METAFILENAME, ORIGFILENAME, ID, ISSUEDATE, ISSUENO, ISSUETITLE, LABEL, LANGUAGE, PAGEID, PAGENO, PUBLTITLE, PUBLTYPE, SENTCOUNT, TOKENCOUNT

    # The following rules write to OUTFILE until the next <text> tag.
    OUTFILE = OUTDIR "/" simplify(PUBLTITLE) ".xml";

    # Only whitelisted publications are copied. The file was already
    # opened in BEGIN, so ">" continues the open stream and does not
    # truncate it.
    if (CONTAIN[PUBLTITLE]) {
        print $0 > OUTFILE;
    }
}
96
# <paragraph ...> start tag, e.g.
#   <paragraph id="p-bcd0f3fa-bbd3dac4-9a6ee1a8" sum_lang="|xxx:1|">
# Stores the paragraph id and language summary in globals and copies the
# tag to the current output file when the current publication is selected.
/^\s*<paragraph/ {
    PID     = gensub(/^.+id=\"([^\"]+?)\".+$/, "\\1", "1", $0);
    SUMLANG = gensub(/^.+sum_lang=\"([^\"]+?)\".+$/, "\\1", "1", $0);

    if (CONTAIN[PUBLTITLE])
        print >> OUTFILE;
}
104
105
106
# <sentence ...> start tag, e.g.
#   <sentence id="s-bcd0f3fa-bbd3dac4-5c959030" lang="xxx" lang_conf="0.1745286">
# Stores the sentence id, language and language confidence in globals and
# copies the tag to the current output file when the current publication
# is selected.
/^\s*<sentence/ {
    SID   = gensub(/^.+id=\"([^\"]+?)\".+$/, "\\1", "1", $0);
    LANG  = gensub(/^.+lang=\"([^\"]+?)\".+$/, "\\1", "1", $0);
    LCONF = gensub(/^.+lang_conf=\"([^\"]+?)\".+$/, "\\1", "1", $0);

    if (CONTAIN[PUBLTITLE])
        print >> OUTFILE;
}
115
116
# Report markup lines whose element is none of: comment, text, paragraph,
# sentence (opening or closing). The warning goes to stderr and shows the
# input line number followed by the offending line; the trailing newline
# keeps the echoed line from running into the next message.
($0 ~ /^\s*</ && $0 !~ /^.*<(\/?)(\!\-\-|text|paragraph|sentence)/) {
    printf("WARNING: unknown element in line %s\n%s\n", NR, $0) > "/dev/stderr";
}
118
119
# Every line that does not start (after optional whitespace) with "<" is a
# content line; it is copied to the current output file when the current
# publication is selected.
!/^\s*</ {
    if (CONTAIN[PUBLTITLE])
        print >> OUTFILE;
}
123
# Closing </sentence> and </paragraph> tags are copied to the current
# output file when the current publication is selected. Leading whitespace
# is allowed (\s*), as in the rules for the opening tags.
/^\s*<\/(sentence|paragraph)>/ {
    if (CONTAIN[PUBLTITLE]) { print $0 >> OUTFILE; }
}
# The closing </text> tag is copied to the current output file when the
# current publication is selected. Leading whitespace is allowed (\s*), as
# in the rule for the opening <text> tag.
/^\s*<\/text>/ {
    if (CONTAIN[PUBLTITLE]) { print $0 >> OUTFILE; }
}
130
131
132
133
# Finalisation: append the closing root element to every output file.
# Only entries whose value is 1 are registered publications; entries that
# were merely created by looking up an unknown title are skipped.
END {
    for (c in CONTAIN) {
        if (CONTAIN[c] != 1)
            continue;

        OUTFILE = OUTDIR "/" simplify(c) ".xml";
        printf("</texts>\n") >> OUTFILE;
    }
}
144
145
# Turn a publication title into a file-name-friendly string:
# the umlaut letters ä, ö, ü (both cases) are replaced by their base
# letters, and every run of whitespace becomes a single underscore.
#
#   str      title to convert (the caller's value is not modified)
#   returns  the converted string
function simplify(str)
{
    # lower-case umlauts
    gsub(/ä/, "a", str);
    gsub(/ö/, "o", str);
    gsub(/ü/, "u", str);

    # upper-case umlauts
    gsub(/Ä/, "A", str);
    gsub(/Ö/, "O", str);
    gsub(/Ü/, "U", str);

    # whitespace runs -> "_"
    gsub(/\s+/, "_", str);

    return str;
}
157
158
159
160