auxiliary scripts
diff --git a/sources.awk b/sources.awk
new file mode 100644
index 0000000..1c38420
--- /dev/null
+++ b/sources.awk
@@ -0,0 +1,17 @@
+    # Per "<text" line: count texts and sum tokens per publ_title; keep its type and language.
+    # match() gives "" for a missing attribute (gensub() would hand back the whole line).
+    /<text/ {
+	pubTitle = match($0, /(^|[[:space:]])publ_title="([^"]*)"/, m) ? m[2] : "";
+	pubType  = match($0, /(^|[[:space:]])publ_type="([^"]*)"/,  m) ? m[2] : "";
+	language = match($0, /(^|[[:space:]])language="([^"]*)"/,   m) ? m[2] : "";
+	tokenc   = match($0, /(^|[[:space:]])tokencount="([^"]*)"/, m) ? m[2] : "";
+	pubTitles[pubTitle]++;                     # number of texts seen for this title
+	pubTypes[pubTitle] = pubType; languages[pubTitle] = language;   # last value seen wins
+	tokens[pubTitle] += tokenc;                # "" counts as 0
+    }
+
+    END {   # one row per title: title, text count, token sum, type, language (order unspecified)
+	OFS = "\t";   # plain tabs; a literal "\t" print item would also get OFS (a space) on both sides
+	for (pubTitle in pubTitles)
+	    print pubTitle, pubTitles[pubTitle], tokens[pubTitle], pubTypes[pubTitle], languages[pubTitle];
+    }
diff --git a/textmeta.awk b/textmeta.awk
new file mode 100644
index 0000000..ab0789f
--- /dev/null
+++ b/textmeta.awk
@@ -0,0 +1,59 @@
+BEGIN {   # tab-separated output rows; emit the XML declaration and the opening <texts> tag
+    OFS = "\t"
+    printf("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n")   # an XML declaration must end with "?>"
+    printf("<texts>\n")   # NOTE(review): nothing visible here ever prints </texts> -- confirm intent
+}
+    
+# Return the value of attribute NAME on the current line, or "" when it is absent.
+# NAME must follow whitespace (or start the line): in the sample tag below, a greedy
+# "^.+id=" prefix lands on publ_id (the last name ending in "id"), and "^.+date="
+# on issue_date, so ID and DATE would otherwise carry the wrong attribute's value.
+function text_attr(name,    m) {
+    return match($0, "(^|[[:space:]])" name "=\"([^\"]*)\"", m) ? m[2] : ""
+}
+
+# For each line containing "<text", print 16 of its attributes as one OFS-joined row.
+# Numbered comments show a sample tag's attributes in order; only assigned ones are printed.
+/<text/ {
+# 01 binding_id="2246025"
+    BID = text_attr("binding_id")
+# 02 date="2021-01-15"
+    DATE = text_attr("date")
+# 03 datefrom="20210115"  04 dateto="20210115"  05 elec_date="_"  06 file=""  (not extracted)
+# 07 filename_metadata="finclarin_siirto_k2021/0039-5552/2021/alto/2246025_0039-5552_2021-01-15_SK0221_mets.xml"
+    METAFILENAME = text_attr("filename_metadata")
+# 08 filename_orig="finclarin_siirto_k2021/0039-5552/2021/alto/2246025_0039-5552_2021-01-15_SK0221_page-2021011502210030301.xml"
+    ORIGFILENAME = text_attr("filename_orig")
+# 09 id="t-bcd0f3fa-bbd3dac4"
+    ID = text_attr("id")
+# 10 img_url=""  (not extracted)
+# 11 issue_date="15.01.2021"
+    ISSUEDATE = text_attr("issue_date")
+# 12 issue_no="SK0221"
+    ISSUENO = text_attr("issue_no")
+# 13 issue_title="Suomen Kuvalehti"
+    ISSUETITLE = text_attr("issue_title")
+# 14 label="Suomen Kuvalehti no. SK0221 15.01.2021"
+    LABEL = text_attr("label")
+# 16 language="fi"
+    LANGUAGE = text_attr("language")
+# 17 page_id="p1"
+    PAGEID = text_attr("page_id")
+# 18 page_no="None"
+    PAGENO = text_attr("page_no")
+# 19 part_name="_"  20 publ_id="0039-5552"  21 publ_part=""  (not extracted)
+# 22 publ_title="Suomen Kuvalehti"
+    PUBLTITLE = text_attr("publ_title")
+# 23 publ_type="aikakausi"
+    PUBLTYPE = text_attr("publ_type")
+# 24 sentcount="70"
+    SENTCOUNT = text_attr("sentcount")
+# 25 sum_lang="|xxx:44|fin:23|eng:3|"  26 timefrom="000000"  27 timeto="235959"  (not extracted)
+# 28 tokencount="304"
+    TOKENCOUNT = text_attr("tokencount")
+# 29 version_added="KLK-fi-2021">  (not extracted)
+
+    print BID, DATE, METAFILENAME, ORIGFILENAME, ID, ISSUEDATE, ISSUENO, ISSUETITLE, LABEL, LANGUAGE, PAGEID, PAGENO, PUBLTITLE, PUBLTYPE, SENTCOUNT, TOKENCOUNT
+}
+
+