preprocessing to generate proper xml
diff --git a/vrt2xml.awk b/vrt2xml.awk
new file mode 100644
index 0000000..451e225
--- /dev/null
+++ b/vrt2xml.awk
@@ -0,0 +1,160 @@
+BEGIN {
+    # Tab-separated output fields.
+    OFS = "\t";
+    # All per-publication XML files are written below OUTDIR.
+    OUTDIR = "XML";
+    system("mkdir -p \"" OUTDIR "\"");
+    # Publication titles to export; the value 1 marks a title as wanted.
+    CONTAIN["Aamulehti"]            = 1;
+    CONTAIN["Etelä-Suomen Sanomat"] = 1;
+    CONTAIN["Forssan Lehti"]        = 1;
+    CONTAIN["Hämeen Sanomat"]       = 1;
+    CONTAIN["Helsingin Sanomat"]    = 1;
+    CONTAIN["Ilkka-Pohjalainen"]    = 1;
+    CONTAIN["Ilta-Sanomat"]         = 1;
+    CONTAIN["Iltalehti"]            = 1;
+    CONTAIN["Kaleva"]               = 1;
+    CONTAIN["Karjalainen"]          = 1;
+    CONTAIN["Kauppalehti"]          = 1;
+    CONTAIN["Keskipohjanmaa"]       = 1;
+    CONTAIN["Keskisuomalainen"]     = 1;
+    CONTAIN["Kouvolan Sanomat"]     = 1;
+    CONTAIN["Kymen Sanomat"]        = 1;
+    CONTAIN["Länsi-Savo"]           = 1;
+    CONTAIN["Lapin Kansa"]          = 1;
+    CONTAIN["Satakunnan Kansa"]     = 1;
+    CONTAIN["Savon Sanomat"]        = 1;
+    CONTAIN["Suomen Kuvalehti"]     = 1;
+    CONTAIN["Turun Sanomat"]        = 1;
+    # Give each output file its XML declaration and opening root element.
+    for (title in CONTAIN) {
+        OUTFILE = sprintf("%s/%s.xml", OUTDIR, simplify(title));
+        printf("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n") >  OUTFILE;
+        printf("<texts>\n") >> OUTFILE;
+        print OUTFILE > "/dev/stderr";
+    }
+}
+
+
+
+/^\s*<\!\-\-/ { print $0; }  # comment lines (positional attributes) are echoed to stdout
+
+# attr(line, name): value of the XML attribute NAME in LINE, "" if absent.
+# NAME must be preceded by a blank, so "id" cannot match inside
+# "binding_id"/"page_id"/"publ_id", nor "date" inside "issue_date".
+function attr(line, name,    m) {
+    return match(line, "[ \t]" name "=\"([^\"]*)\"", m) ? m[1] : "";
+}
+
+# <text ...>: start of a document.  Store the attributes we care about in
+# globals and pick the output file used until the next <text> line.
+# The numbered comments show the attributes of one sample <text> line.
+/^\s*<text/ {
+# USE 01 binding_id="2246025"
+    BID = attr($0, "binding_id");
+# USE 02 date="2021-01-15"
+    DATE = attr($0, "date");
+# 03 datefrom="20210115"  04 dateto="20210115"  05 elec_date="_"  06 file=""
+# USE 07 filename_metadata="finclarin_siirto_k2021/0039-5552/2021/alto/2246025_0039-5552_2021-01-15_SK0221_mets.xml"
+    METAFILENAME = attr($0, "filename_metadata");
+# USE 08 filename_orig="finclarin_siirto_k2021/0039-5552/2021/alto/2246025_0039-5552_2021-01-15_SK0221_page-2021011502210030301.xml
+    ORIGFILENAME = attr($0, "filename_orig");
+# USE 09 id="t-bcd0f3fa-bbd3dac4"
+    ID = attr($0, "id");
+# 10 img_url=""
+# USE 11 issue_date="15.01.2021"
+    ISSUEDATE = attr($0, "issue_date");
+# USE 12 issue_no="SK0221"
+    ISSUENO = attr($0, "issue_no");
+# USE 13 issue_title="Suomen Kuvalehti"
+    ISSUETITLE = attr($0, "issue_title");
+# USE 14 label="Suomen Kuvalehti no. SK0221 15.01.2021"
+    LABEL = attr($0, "label");
+# USE 16 language="fi"
+    LANGUAGE = attr($0, "language");
+# USE 17 page_id="p1"
+    PAGEID = attr($0, "page_id");
+# USE 18 page_no="None"
+    PAGENO = attr($0, "page_no");
+# 19 part_name="_"  20 publ_id="0039-5552"  21 publ_part=""
+# USE 22 publ_title="Suomen Kuvalehti"
+    PUBLTITLE = attr($0, "publ_title");
+# USE 23 publ_type="aikakausi"
+    PUBLTYPE = attr($0, "publ_type");
+# USE 24 sentcount="70"
+    SENTCOUNT = attr($0, "sentcount");
+# 25 sum_lang="|xxx:44|fin:23|eng:3|"  26 timefrom="000000"  27 timeto="235959"
+# USE 28 tokencount="304"
+    TOKENCOUNT = attr($0, "tokencount");
+# 29 version_added="KLK-fi-2021">
+
+    # PUBLTITLE and OUTFILE stay set for the rules handling the lines below.
+    OUTFILE = OUTDIR "/" simplify(PUBLTITLE) ".xml";
+    # '>>' as in the other rules; the file was already opened in BEGIN.
+    if (CONTAIN[PUBLTITLE]) { print $0 >> OUTFILE; }
+}
+
+/^\s*<paragraph/ {
+    # e.g. <paragraph id="p-bcd0f3fa-bbd3dac4-9a6ee1a8" sum_lang="|xxx:1|">
+    # Keep the language summary and paragraph id in globals, then copy the tag.
+    SUMLANG = gensub(/^.+sum_lang=\"([^\"]+?)\".+$/, "\\1", "1", $0);
+    PID     = gensub(/^.+id=\"([^\"]+?)\".+$/,       "\\1", "1", $0);
+    if (CONTAIN[PUBLTITLE]) print >> OUTFILE;
+}
+
+
+
+/^\s*<sentence/ {
+    # e.g. <sentence id="s-bcd0f3fa-bbd3dac4-5c959030" lang="xxx" lang_conf="0.1745286">
+    # Keep language confidence, language and id in globals, then copy the tag.
+    LCONF = gensub(/^.+lang_conf=\"([^\"]+?)\".+$/, "\\1", "1", $0);
+    LANG  = gensub(/^.+lang=\"([^\"]+?)\".+$/,      "\\1", "1", $0);
+    SID   = gensub(/^.+id=\"([^\"]+?)\".+$/,        "\\1", "1", $0);
+    if (CONTAIN[PUBLTITLE]) print >> OUTFILE;
+}
+
+# Any other tag is reported on stderr, one warning per line; it is not copied to any output file.
+($0 ~ /^\s*</ && $0 !~ /^.*<(\/?)(\!\-\-|text|paragraph|sentence)/) {printf("WARNING: unknown element in line %s\n%s\n", NR, $0) > "/dev/stderr";}
+
+
+!/^\s*</ {    # every line that does not start with a tag
+    if (CONTAIN[PUBLTITLE]) print >> OUTFILE;
+}
+
+# Closing tags are copied to the current output file like their opening
+# counterparts.  \s* tolerates leading whitespace, the same way the
+# opening-tag rules do, so indented closing tags are not dropped.
+/^\s*<\/(sentence|paragraph|text)>/ {
+    if(CONTAIN[PUBLTITLE]){print $0 >> OUTFILE; }
+}
+
+
+
+
+END {
+    # Terminate the root element of every whitelisted file.  Entries that
+    # were only created as a side effect of a CONTAIN[...] lookup have an
+    # empty value and are skipped.
+    for (title in CONTAIN) {
+        if (CONTAIN[title] != 1) continue;
+        OUTFILE = OUTDIR "/" simplify(title) ".xml";
+        printf("</texts>\n") >> OUTFILE;
+    }
+}
+
+
+# simplify(str): file-name form of STR: whitespace runs -> "_", umlauts -> base letter.
+function simplify(str) {
+    gsub(/\s+/, "_", str);
+    gsub(/ä/, "a", str);
+    gsub(/Ä/, "A", str);
+    gsub(/ö/, "o", str);
+    gsub(/Ö/, "O", str);
+    gsub(/ü/, "u", str);
+    gsub(/Ü/, "U", str);
+    return str;
+}
+
+
+
+