preprocessing to generate proper xml
diff --git a/vrt2xml.awk b/vrt2xml.awk
new file mode 100644
index 0000000..451e225
--- /dev/null
+++ b/vrt2xml.awk
@@ -0,0 +1,160 @@
+BEGIN {
+    # Output fields are tab-separated; one XML file per publication is written below OUTDIR.
+    OFS = "\t";
+    OUTDIR = "XML";
+    system("mkdir -p \"" OUTDIR "\"");
+    # Publication titles to keep (compared against publ_title of each <text> tag).
+    CONTAIN["Aamulehti"] = 1;
+    CONTAIN["Etelä-Suomen Sanomat"] = 1;
+    CONTAIN["Forssan Lehti"] = 1;
+    CONTAIN["Hämeen Sanomat"] = 1;
+    CONTAIN["Helsingin Sanomat"] = 1;
+    CONTAIN["Ilkka-Pohjalainen"] = 1;
+    CONTAIN["Ilta-Sanomat"] = 1;
+    CONTAIN["Iltalehti"] = 1;
+    CONTAIN["Kaleva"] = 1;
+    CONTAIN["Karjalainen"] = 1;
+    CONTAIN["Kauppalehti"] = 1;
+    CONTAIN["Keskipohjanmaa"] = 1;
+    CONTAIN["Keskisuomalainen"] = 1;
+    CONTAIN["Kouvolan Sanomat"] = 1;
+    CONTAIN["Kymen Sanomat"] = 1;
+    CONTAIN["Länsi-Savo"] = 1;
+    CONTAIN["Lapin Kansa"] = 1;
+    CONTAIN["Satakunnan Kansa"] = 1;
+    CONTAIN["Savon Sanomat"] = 1;
+    CONTAIN["Suomen Kuvalehti"] = 1;
+    CONTAIN["Turun Sanomat"] = 1;
+    # Start each output file with the XML declaration and the opening root
+    # element, and report the file name on stderr.
+    for (title in CONTAIN) {
+        OUTFILE = OUTDIR "/" simplify(title) ".xml";
+        print "<?xml version=\"1.0\" encoding=\"UTF-8\"?>" > OUTFILE;
+        print "<texts>" >> OUTFILE;
+        print OUTFILE > "/dev/stderr";
+    }
+}
+
+
+
+# XML comment lines (noted in the original as the positional attributes) are echoed to stdout.
+/^\s*<\!\-\-/ { print $0; }
+# Opening <text> tag: read the metadata attributes marked USE below (the others
+# are listed for reference only), pick the output file, and copy the tag if kept.
+/^\s*<text/ {
+    # USE 01 binding_id="2246025"
+    BID = text_attr($0, "binding_id");
+    # USE 02 date="2021-01-15"
+    DATE = text_attr($0, "date");  # whole-name match: not elec_date/issue_date
+    #     03 datefrom="20210115"
+    #     04 dateto="20210115"
+    #     05 elec_date="_"
+    #     06 file=""
+    # USE 07 filename_metadata="finclarin_siirto_k2021/0039-5552/2021/alto/2246025_0039-5552_2021-01-15_SK0221_mets.xml"
+    METAFILENAME = text_attr($0, "filename_metadata");
+    # USE 08 filename_orig="finclarin_siirto_k2021/0039-5552/2021/alto/2246025_0039-5552_2021-01-15_SK0221_page-2021011502210030301.xml"
+    ORIGFILENAME = text_attr($0, "filename_orig");
+    # USE 09 id="t-bcd0f3fa-bbd3dac4"
+    ID = text_attr($0, "id");  # whole-name match: not binding_id/page_id/publ_id
+    #     10 img_url=""
+    # USE 11 issue_date="15.01.2021"
+    ISSUEDATE = text_attr($0, "issue_date");
+    # USE 12 issue_no="SK0221"
+    ISSUENO = text_attr($0, "issue_no");
+    # USE 13 issue_title="Suomen Kuvalehti"
+    ISSUETITLE = text_attr($0, "issue_title");
+    # USE 14 label="Suomen Kuvalehti no. SK0221 15.01.2021"
+    LABEL = text_attr($0, "label");
+    # USE 16 language="fi"
+    LANGUAGE = text_attr($0, "language");
+    # USE 17 page_id="p1"
+    PAGEID = text_attr($0, "page_id");
+    # USE 18 page_no="None"
+    PAGENO = text_attr($0, "page_no");
+    #     19 part_name="_"
+    #     20 publ_id="0039-5552"
+    #     21 publ_part=""
+    # USE 22 publ_title="Suomen Kuvalehti"
+    PUBLTITLE = text_attr($0, "publ_title");
+    # USE 23 publ_type="aikakausi"
+    PUBLTYPE = text_attr($0, "publ_type");
+    # USE 24 sentcount="70"
+    SENTCOUNT = text_attr($0, "sentcount");
+    #     25 sum_lang="|xxx:44|fin:23|eng:3|"
+    #     26 timefrom="000000"
+    #     27 timeto="235959"
+    # USE 28 tokencount="304"
+    TOKENCOUNT = text_attr($0, "tokencount");
+    #     29 version_added="KLK-fi-2021">
+    OUTFILE = OUTDIR "/" simplify(PUBLTITLE) ".xml";
+    if (CONTAIN[PUBLTITLE]) { print $0 > OUTFILE; }
+}
+# Value of attribute `name` in tag `line`; "" if absent. The name must follow whitespace.
+function text_attr(line, name,    m) {
+    return match(line, "(^|[[:space:]])" name "=\"([^\"]*)\"", m) ? m[2] : "";
+}
+
+# Opening <paragraph> tag: store its attributes; copy the tag if the publication is kept.
+/^\s*<paragraph/ {
+    # e.g. <paragraph id="p-bcd0f3fa-bbd3dac4-9a6ee1a8" sum_lang="|xxx:1|">
+    SUMLANG = gensub(/^.+sum_lang=\"([^\"]+?)\".+$/, "\\1", "1", $0);
+    PID = gensub(/^.+id=\"([^\"]+?)\".+$/, "\\1", "1", $0);
+    if (CONTAIN[PUBLTITLE]) print >> OUTFILE;
+}
+
+
+
+# Opening <sentence> tag: store id, lang and lang_conf; copy the tag if the publication is kept.
+/^\s*<sentence/ {
+    # e.g. <sentence id="s-bcd0f3fa-bbd3dac4-5c959030" lang="xxx" lang_conf="0.1745286">
+    LCONF = gensub(/^.+lang_conf=\"([^\"]+?)\".+$/, "\\1", "1", $0);
+    LANG = gensub(/^.+lang=\"([^\"]+?)\".+$/, "\\1", "1", $0);
+    SID = gensub(/^.+id=\"([^\"]+?)\".+$/, "\\1", "1", $0);
+    if (CONTAIN[PUBLTITLE]) print >> OUTFILE;
+}
+
+
+# Warn on stderr about tag lines that are neither comments nor text/paragraph/sentence tags; each report ends with a newline.
+($0 ~ /^\s*</ && $0 !~ /^.*<(\/?)(\!\-\-|text|paragraph|sentence)/) {printf("WARNING: unknown element in line %s\n%s\n", NR, $0) > "/dev/stderr";}
+
+# Lines that do not start with a tag are copied to the current output file if the publication is kept.
+!/^\s*</ {
+    if (CONTAIN[PUBLTITLE]) print >> OUTFILE;
+}
+# Closing </sentence> and </paragraph> tags are copied for kept publications. Leading
+# whitespace is allowed (\s*, not a literal "s*"), matching the opening-tag rules.
+/^\s*<\/(sentence|paragraph)>/ { if (CONTAIN[PUBLTITLE]) { print $0 >> OUTFILE; } }
+# Closing </text> tag, copied for kept publications; leading whitespace allowed (\s*, not a literal "s*").
+/^\s*<\/text>/ {
+    if (CONTAIN[PUBLTITLE]) { print $0 >> OUTFILE; }
+}
+
+
+
+# After all input: append the closing root element to the file of every
+# publication whose CONTAIN entry is 1.
+END {
+    for (title in CONTAIN) {
+        # Skip entries whose value is not 1 (e.g. ones created only by a CONTAIN[...] lookup).
+        if (CONTAIN[title] != 1) continue;
+        OUTFILE = OUTDIR "/" simplify(title) ".xml";
+        print "</texts>" >> OUTFILE;
+    }
+}
+
+
+# Turn a title into a file-name stem: the umlauts ä/ö/ü (either case) are
+# replaced by their base letters and every run of whitespace becomes "_".
+function simplify(str)
+{
+    gsub(/Ä/, "A", str);
+    gsub(/Ö/, "O", str);
+    gsub(/Ü/, "U", str);
+    gsub(/ä/, "a", str);
+    gsub(/ö/, "o", str);
+    gsub(/ü/, "u", str);
+    gsub(/\s+/, "_", str);
+    return str;
+}
+
+