use PUBLID to extract
diff --git a/vrt2xml.awk b/vrt2xml.awk
index d8d6d88..b52c052 100644
--- a/vrt2xml.awk
+++ b/vrt2xml.awk
@@ -2,12 +2,52 @@
 # to be called for each yearly vrt file of KLK e.g. klk_fi_v2_2021.vrt
 
 BEGIN       {
-    # getting OUTDIR and YEAR as parameters
-    
+
+    #------------------------
+    # Field separators
+    #------------------------
+
+    FS="\t";
     OFS="\t";
 
+    #-------------------
+    # Global variables
+    #-------------------
+    
+    SEENID="";
+
+    
+
+    # Parameters to be given from command line:
+    # OUTDIR
+    # YEAR
+    # SOURCESFILE
+
+    #----------------------------------
+    # Defaults for Parameters
+    #----------------------------------
+    
+    if(!OUTDIR){
+	OUTDIR="XMLOUTDIR";
+	printf("Using %s as default output directory \n", OUTDIR)
+    }
+
+    if(!YEAR){
+	YEAR="2021";
+	printf("Extracting data from source for default year %s\n", YEAR)
+    }
+
+    if(!SOURCESFILE){
+	SOURCESFILE="sources_klk_fi_v2_2021_4eureco.csv";
+	printf("Using default sources file %s in this directory\n", SOURCESFILE)
+    }
+    
+
     system("mkdir -p " OUTDIR);
 
+    #------------------------------------------------------------
+    # Get positional attributes from first line of input file
+    #------------------------------------------------------------
     getline;
     if($0 !~ /^\s*<!\-\- #vrt positional-attributes/) {
 	print "ERROR: Something went wrong when looking for positional attributes in first line"
@@ -15,32 +55,46 @@
     else {
 	PATTS = $0;
     }
-    
-    CONTAIN["Aamulehti"]		 = 1; 
-    CONTAIN["Etelä-Suomen Sanomat"]	 = 1; 
-    CONTAIN["Hämeen Sanomat"]		 = 1; 
-    CONTAIN["Helsingin Sanomat"]	 = 1; 
-    CONTAIN["Ilkka-Pohjalainen"]	 = 1; 
-    CONTAIN["Ilta-Sanomat"]		 = 1; 
-    CONTAIN["Iltalehti"]		 = 1; 
-    CONTAIN["Kaleva"]		         = 1; 
-    CONTAIN["Karjalainen"]		 = 1; 
-    CONTAIN["Kauppalehti"]		 = 1; 
-    CONTAIN["Keskipohjanmaa"]		 = 1; 
-    CONTAIN["Keskisuomalainen"]	         = 1; 
-    CONTAIN["Kouvolan Sanomat"]	         = 1; 
-    CONTAIN["Kymen Sanomat"]		 = 1; 
-    CONTAIN["Länsi-Savo"]		 = 1; 
-    CONTAIN["Lapin Kansa"]		 = 1; 
-    CONTAIN["Satakunnan Kansa"]		 = 1; 
-    CONTAIN["Savon Sanomat"]		 = 1; 
-    CONTAIN["Suomen Kuvalehti"]	         = 1; 
-    CONTAIN["Turun Sanomat"]		 = 1;
-    
 
-    # start all outfiles according to selected titles in CONTAIN:
-    for(c in CONTAIN){
+    #--------------------------------------------------------------------
+    # get list of sources to be extracted from separate sources file
+    #--------------------------------------------------------------------
+    
+    # Skip the header line; abort if the sources file cannot be read at all.
+    if((getline < SOURCESFILE) <= 0){
+	printf("ERROR: cannot read sources file %s\n", SOURCESFILE) >> "/dev/stderr";
+	exit 1;
+    }
+
+    # getline returns -1 on a read error, which is true in a bare while()
+    # condition and would loop forever; only a result > 0 means a record was read.
+    while((getline < SOURCESFILE) > 0){
+
+	# remove all leading and trailing spaces from relevant fields:
+	e = $1; gsub(/^\s+|\s+$/, "", e); # e is extract flag
+	p = $3; gsub(/^\s+|\s+$/, "", p); # p is publ_id
+	t = $4; gsub(/^\s+|\s+$/, "", t); # t is title string
+
+	#printf("e: \t|%s|\n", e) >> "/dev/stderr";
+	#printf("p: \t|%s|\n", p) >> "/dev/stderr";
+	#printf("t: \t|%s|\n", t) >> "/dev/stderr";
+
+	# set facts (arrays):
+	if(e == 1){            # i.e. where extract flag is 1
+	    TITLE[t]   = e;    # e.g. TITLE["Aamulehti"]   = 1; 
+	    EXTRACT[p] = t;    # e.g. EXTRACT["0355-6913"] ="Aamulehti";
+	    # special case publ_id starting in 'fk' according to end user notes:
+	    if(p ~ /^fk/) EXTRACT[toupper(p)] = t;  # also match upper-case publ_id
+	}
+    }
+    close(SOURCESFILE);
+
+    for(t in EXTRACT){
+	printf("Extract: %s %s\n", t, EXTRACT[t]) >> "/dev/stderr";
+    }
+    
+    # start one outfile per selected title (the keys of TITLE):
+    for(c in TITLE){
 	system("mkdir -p " OUTDIR "/" simplify(c));
 	
 	OUTFILE = OUTDIR "/" simplify(c) "/" simplify(c) YEAR ".xml";
@@ -50,9 +104,13 @@
 	printf("<texts>\n")                                    >> OUTFILE;
 	print "Starting outfile " OUTFILE                      > "/dev/stderr";
     }
-}
+} # END BEGIN
 
 
+#---------------------------------
+# M A I N 
+#---------------------------------
+
 /^\s*<text/ {
     
 # USE 01 binding_id="2246025"
@@ -97,6 +155,7 @@
     LANGUAGE     = gensub(/^.+language=\"([^\"]+?)\".+$/,          "\\1", "1", $0);
     PAGEID       = gensub(/^.+page_id=\"([^\"]+?)\".+$/,           "\\1", "1", $0);
     PAGENO       = gensub(/^.+page_no=\"([^\"]+?)\".+$/,           "\\1", "1", $0);
+    PUBLID       = gensub(/^.+publ_id=\"([^\"]+?)\".+$/,        "\\1", "1", $0);
     PUBLTITLE    = gensub(/^.+publ_title=\"([^\"]+?)\".+$/,        "\\1", "1", $0);
     PUBLTYPE     = gensub(/^.+publ_type=\"([^\"]+?)\".+$/,         "\\1", "1", $0);
     SENTCOUNT    = gensub(/^.+sentcount=\"([^\"]+?)\".+$/,         "\\1", "1", $0);
@@ -104,19 +163,25 @@
 
     
 #   print BID, DATE, METAFILENAME, ORIGFILENAME, ID, ISSUEDATE, ISSUENO, ISSUETITLE, LABEL, LANGUAGE, PAGEID, PAGENO, PUBLTITLE, PUBLTYPE, SENTCOUNT, TOKENCOUNT
-    
-    OUTFILE = OUTDIR "/" simplify(PUBLTITLE) "/" simplify(PUBLTITLE) YEAR ".xml" ;
 
-    # continue OUTFILEs if PUBTITLE of <text> is in CONTAIN: 
-    if(CONTAIN[PUBLTITLE]){print $0 > OUTFILE; }
+    OUTFILE = OUTDIR "/" simplify(EXTRACT[PUBLID]) "/" simplify(EXTRACT[PUBLID]) YEAR ".xml" ;
+
+    # continue OUTFILEs if the publ_id of <text> is a key of EXTRACT:
+    if(EXTRACT[PUBLID]){
+	if(PUBLID != SEENID){
+	    printf("Extracting <text>s for %s\n", EXTRACT[PUBLID]) >> "/dev/stderr";
+	}
+	print $0 > OUTFILE;
+	SEENID = PUBLID;
+    }
 }
 
 /^\s*<paragraph/ {
     ## <paragraph id="p-bcd0f3fa-bbd3dac4-9a6ee1a8" sum_lang="|xxx:1|">
     PID     = gensub(/^.+id=\"([^\"]+?)\".+$/,       "\\1", "1", $0);
-    SUMLANG = gensub(/^.+sum_lang=\"([^\"]+?)\".+$/, "\\1", "1", $0);
+      SUMLANG = gensub(/^.+sum_lang=\"([^\"]+?)\".+$/, "\\1", "1", $0);
 
-    if(CONTAIN[PUBLTITLE]){print $0 >> OUTFILE; }
+    if(EXTRACT[PUBLID]){print $0 >> OUTFILE; }
 }
 
 
@@ -127,7 +192,7 @@
     LANG  = gensub(/^.+lang=\"([^\"]+?)\".+$/,      "\\1", "1", $0);  
     LCONF = gensub(/^.+lang_conf=\"([^\"]+?)\".+$/, "\\1", "1", $0);  
     
-    if(CONTAIN[PUBLTITLE]){print $0 >> OUTFILE; }
+    if(EXTRACT[PUBLID]){print $0 >> OUTFILE; }
 }
 
 
@@ -135,14 +200,14 @@
 
 
 $0 !~ /^\s*</ {
-    if(CONTAIN[PUBLTITLE]){print $0 >> OUTFILE; }
+    if(EXTRACT[PUBLID]){print $0 >> OUTFILE; }
 }
 
 /^s*<\/(sentence|paragraph)>/ {
-    if(CONTAIN[PUBLTITLE]){print $0 >> OUTFILE; }
+    if(EXTRACT[PUBLID]){print $0 >> OUTFILE; }
 }
 /^s*<\/text>/ {
-    if(CONTAIN[PUBLTITLE]){print $0 >> OUTFILE; }
+    if(EXTRACT[PUBLID]){print $0 >> OUTFILE; }
     }
 
 
@@ -150,13 +215,11 @@
 
 END {
     
-    for(c in CONTAIN){
-	if(CONTAIN[c] == 1){
-	    OUTFILE = OUTDIR "/" simplify(c) "/" simplify(c) YEAR ".xml";   
-	    printf("</texts>\n") >> OUTFILE;
-	}
+    for(c in TITLE){
+	OUTFILE = OUTDIR "/" simplify(c) "/" simplify(c) YEAR ".xml";   
+	printf("</texts>\n") >> OUTFILE;
+	printf("Wrote %s\n", OUTFILE) >> "/dev/stderr";
     }
-    
 }