auxiliary scripts
diff --git a/textmeta.awk b/textmeta.awk
new file mode 100644
index 0000000..ab0789f
--- /dev/null
+++ b/textmeta.awk
@@ -0,0 +1,59 @@
+# Tab-separated output; the XML declaration must end in "?>" (it lacked the "?").
+BEGIN {
+ OFS = "\t"
+ printf("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<texts>\n")  # <texts> is not closed by this script
+}
+
+# Return the value of attribute NAME on the current record, or "" when the
+# attribute is absent.  NAME must follow whitespace: the earlier greedy
+# "^.+NAME=" gensub() patterns latched onto the LAST attribute whose name ends
+# in NAME, so with the attribute order noted below "id" returned publ_id and
+# "date" returned issue_date; a missing attribute returned the whole line.
+# Needs gawk (match() with an array argument), as the gensub() calls did.
+function attr(name,    m) {
+ return match($0, "[[:space:]]" name "=\"([^\"]*)\"", m) ? m[1] : ""
+}
+
+# For each line containing "<text", print one tab-separated row of selected
+# attributes.  The numbered notes show the attributes seen in the input with a
+# sample value; "*" marks the ones that are extracted.
+/<text/ {
+ BID          = attr("binding_id")    # * 01 binding_id="2246025"
+ DATE         = attr("date")          # * 02 date="2021-01-15"
+#   03 datefrom="20210115"
+#   04 dateto="20210115"
+#   05 elec_date="_"
+#   06 file=""
+# * 07 filename_metadata="finclarin_siirto_k2021/0039-5552/2021/alto/2246025_0039-5552_2021-01-15_SK0221_mets.xml"
+ METAFILENAME = attr("filename_metadata")
+# * 08 filename_orig="finclarin_siirto_k2021/0039-5552/2021/alto/2246025_0039-5552_2021-01-15_SK0221_page-2021011502210030301.xml"
+ ORIGFILENAME = attr("filename_orig")
+ ID           = attr("id")            # * 09 id="t-bcd0f3fa-bbd3dac4"
+#   10 img_url=""
+ ISSUEDATE    = attr("issue_date")    # * 11 issue_date="15.01.2021"
+ ISSUENO      = attr("issue_no")      # * 12 issue_no="SK0221"
+ ISSUETITLE   = attr("issue_title")   # * 13 issue_title="Suomen Kuvalehti"
+ LABEL        = attr("label")         # * 14 label="Suomen Kuvalehti no. SK0221 15.01.2021"
+ LANGUAGE     = attr("language")      # * 16 language="fi"
+ PAGEID       = attr("page_id")       # * 17 page_id="p1"
+ PAGENO       = attr("page_no")       # * 18 page_no="None"
+#   19 part_name="_"
+#   20 publ_id="0039-5552"
+#   21 publ_part=""
+ PUBLTITLE    = attr("publ_title")    # * 22 publ_title="Suomen Kuvalehti"
+ PUBLTYPE     = attr("publ_type")     # * 23 publ_type="aikakausi"
+ SENTCOUNT    = attr("sentcount")     # * 24 sentcount="70"
+#   25 sum_lang="|xxx:44|fin:23|eng:3|"
+#   26 timefrom="000000"
+#   27 timeto="235959"
+ TOKENCOUNT   = attr("tokencount")    # * 28 tokencount="304"
+#   29 version_added="KLK-fi-2021">
+
+# Emit the row; fields are joined with OFS, which BEGIN sets to a tab.
+# Output columns, in this order:
+ print BID, DATE, METAFILENAME, ORIGFILENAME, ID, ISSUEDATE, ISSUENO,
+       ISSUETITLE, LABEL, LANGUAGE, PAGEID, PAGENO, PUBLTITLE, PUBLTYPE,
+       SENTCOUNT, TOKENCOUNT
+}
+
+