# vrt2xml.awk
# Run once per yearly KLK VRT file (e.g. klk_fi_v2_2021.vrt).
# Requires GNU awk: the script uses gensub() and the \s regex escape.

# BEGIN: one-time setup, run before the main input loop.
#
# Parameters (must be set before BEGIN runs, e.g. with -v; a default is
# used and announced when a parameter is empty or 0):
#   OUTDIR       directory under which one subdirectory per title is created
#   YEAR         year string appended to every output file name
#   SOURCESFILE  tab-separated sources table; the first line is skipped,
#                column 1 = extract flag, column 3 = publ_id, column 4 = title
#
# Globals set here:
#   SEENID   publ_id of the most recently extracted <text> (starts empty)
#   PATTS    the positional-attributes comment taken from the first input line
#   EXTRACT  publ_id -> title, for every source whose extract flag is 1
#   TITLE    title   -> extract flag, one entry per selected title
BEGIN       {

    #------------------------
    # Field separators
    #------------------------

    FS="\t";
    OFS="\t";

    #-------------------
    # Global variables
    #-------------------

    SEENID="";

    #----------------------------------
    # Defaults for Parameters
    #----------------------------------

    if(!OUTDIR){
        OUTDIR="XMLOUTDIR";
        printf("Using %s as default output directory \n", OUTDIR)
    }

    if(!YEAR){
        YEAR="2021";
        printf("Extracting data from source for default year %s\n", YEAR)
    }

    if(!SOURCESFILE){
        SOURCESFILE="sources_klk_fi_v2_2021_4eureco.csv";
        printf("Using default sources file %s in this directory\n", SOURCESFILE)
    }

    # The path is shell-quoted so that spaces or shell metacharacters in
    # OUTDIR cannot split or alter the mkdir command.
    system("mkdir -p " shell_quote(OUTDIR));

    #------------------------------------------------------------
    # Get positional attributes from first line of input file
    #------------------------------------------------------------
    # This consumes the first input record; the main rules start at line 2.
    getline;
    if($0 !~ /^\s*<!\-\- #vrt positional-attributes/) {
        # Diagnostics go to stderr; processing continues with PATTS unset.
        print "ERROR: Something went wrong when looking for positional attributes in first line" > "/dev/stderr";
    }
    else {
        PATTS = $0;
    }

    #--------------------------------------------------------------------
    # get list of sources to be extracted from separate sources file
    #--------------------------------------------------------------------

    # getline returns -1 when the file cannot be read. That value is truthy,
    # so it must be tested explicitly or the loop below would never end.
    if((getline < SOURCESFILE) < 0){
        printf("ERROR: cannot read sources file %s\n", SOURCESFILE) > "/dev/stderr";
        exit 1;
    }
    # The getline above has already discarded the header line.
    while((getline < SOURCESFILE) > 0){

        # remove all leading and trailing spaces from relevant fields:
        e = $1; gsub(/^\s+?/, "", e); gsub(/\s+$/, "", e); # e is extract flag
        p = $3; gsub(/^\s+?/, "", p); gsub(/\s+$/, "", p); # p is publ_id
        t = $4; gsub(/^\s+?/, "", t); gsub(/\s+$/, "", t); # t is title string

        # set facts (arrays):
        if(e == 1){            # i.e. where extract flag is 1

            TITLE[t]   = e;    # e.g. TITLE["Aamulehti"]   = 1;
            EXTRACT[p] = t;    # e.g. EXTRACT["0355-6913"] ="Aamulehti";

            # special case publ_id starting in 'fk' according to end user notes:
            if(p ~ /^fk/){
                P = toupper(p);
                EXTRACT[P] = t;  # add fact for upper case version of publ_id
            }
        }
    }
    close(SOURCESFILE);

    # Log the selection so the run can be checked against the sources file.
    for(t in EXTRACT){
        printf("Extract: %s %s\n", t, EXTRACT[t]) >> "/dev/stderr";
    }

    # start all outfiles according to selected titles in EXTRACT:
    for(c in TITLE){
        system("mkdir -p " shell_quote(OUTDIR "/" simplify(c)));

        OUTFILE = OUTDIR "/" simplify(c) "/" simplify(c) YEAR ".xml";

        printf("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n") >  OUTFILE;
        print PATTS                                            >> OUTFILE;
        printf("<texts>\n")                                    >> OUTFILE;
        print "Starting outfile " OUTFILE                      > "/dev/stderr";
    }
} # END BEGIN

# shell_quote: return str wrapped in single quotes so that it is passed to
# the shell as exactly one word; embedded single quotes become '\''.
function shell_quote(str)
{
    gsub(/'/, "'\\''", str);
    return "'" str "'";
}


#---------------------------------
# M A I N 
#---------------------------------

# <text ...> start tag: parse its attributes into globals and, when the
# text's publ_id is selected in EXTRACT, copy the tag line to that title's
# output file. PUBLID and OUTFILE set here are used by the rules that handle
# the lines following this tag.
#
# Example attributes (USE marks the ones parsed below, plus publ_id):
# USE 01 binding_id="2246025"
# USE 02 date="2021-01-15"
#     03 datefrom="20210115"
#     04 dateto="20210115"
#     05 elec_date="_"
#     06 file=""
# USE 07 filename_metadata="finclarin_siirto_k2021/0039-5552/2021/alto/2246025_0039-5552_2021-01-15_SK0221_mets.xml"
# USE 08 filename_orig="finclarin_siirto_k2021/0039-5552/2021/alto/2246025_0039-5552_2021-01-15_SK0221_page-2021011502210030301.xml"
# USE 09 id="t-bcd0f3fa-bbd3dac4"
#     10 img_url=""
# USE 11 issue_date="15.01.2021"
# USE 12 issue_no="SK0221"
# USE 13 issue_title="Suomen Kuvalehti"
# USE 14 label="Suomen Kuvalehti no. SK0221 15.01.2021"
# USE 16 language="fi"
# USE 17 page_id="p1"
# USE 18 page_no="None"
#     19 part_name="_"
# USE 20 publ_id="0039-5552"
#     21 publ_part=""
# USE 22 publ_title="Suomen Kuvalehti"
# USE 23 publ_type="aikakausi"
# USE 24 sentcount="70"
#     25 sum_lang="|xxx:44|fin:23|eng:3|"
#     26 timefrom="000000"
#     27 timeto="235959"
# USE 28 tokencount="304"
#     29 version_added="KLK-fi-2021">
/^\s*<text/ {

    # Every pattern requires whitespace directly before the attribute name.
    # Without it the greedy prefix selects the LAST attribute whose name merely
    # ends in the wanted name, e.g. publ_id= for id= or issue_date= for date=.
    # When an attribute is absent, gensub() returns the whole line unchanged.
    BID          = gensub(/^.+\sbinding_id=\"([^\"]*)\".+$/,        "\\1", "1", $0);
    DATE         = gensub(/^.+\sdate=\"([^\"]*)\".+$/,              "\\1", "1", $0);
    METAFILENAME = gensub(/^.+\sfilename_metadata=\"([^\"]*)\".+$/, "\\1", "1", $0);
    ORIGFILENAME = gensub(/^.+\sfilename_orig=\"([^\"]*)\".+$/,     "\\1", "1", $0);
    ID           = gensub(/^.+\sid=\"([^\"]*)\".+$/,                "\\1", "1", $0);
    ISSUEDATE    = gensub(/^.+\sissue_date=\"([^\"]*)\".+$/,        "\\1", "1", $0);
    ISSUENO      = gensub(/^.+\sissue_no=\"([^\"]*)\".+$/,          "\\1", "1", $0);
    ISSUETITLE   = gensub(/^.+\sissue_title=\"([^\"]*)\".+$/,       "\\1", "1", $0);
    LABEL        = gensub(/^.+\slabel=\"([^\"]*)\".+$/,             "\\1", "1", $0);
    LANGUAGE     = gensub(/^.+\slanguage=\"([^\"]*)\".+$/,          "\\1", "1", $0);
    PAGEID       = gensub(/^.+\spage_id=\"([^\"]*)\".+$/,           "\\1", "1", $0);
    PAGENO       = gensub(/^.+\spage_no=\"([^\"]*)\".+$/,           "\\1", "1", $0);
    PUBLID       = gensub(/^.+\spubl_id=\"([^\"]*)\".+$/,           "\\1", "1", $0);
    PUBLTITLE    = gensub(/^.+\spubl_title=\"([^\"]*)\".+$/,        "\\1", "1", $0);
    PUBLTYPE     = gensub(/^.+\spubl_type=\"([^\"]*)\".+$/,         "\\1", "1", $0);
    SENTCOUNT    = gensub(/^.+\ssentcount=\"([^\"]*)\".+$/,         "\\1", "1", $0);
    TOKENCOUNT   = gensub(/^.+\stokencount=\"([^\"]*)\".+$/,        "\\1", "1", $0);

    # Continue the title's OUTFILE if the PUBLID of this <text> is in EXTRACT.
    if(EXTRACT[PUBLID]){
        # Only computed for selected texts; the other rules write to OUTFILE
        # solely under the same EXTRACT[PUBLID] guard.
        OUTFILE = OUTDIR "/" simplify(EXTRACT[PUBLID]) "/" simplify(EXTRACT[PUBLID]) YEAR ".xml";

        # Announce a title only when the publ_id differs from the one
        # extracted last, not for every single <text>.
        if(PUBLID != SEENID){
            printf("Extracting <text>s for %s\n", EXTRACT[PUBLID]) >> "/dev/stderr";
        }
        # Append, like all other rules writing to OUTFILE.
        print $0 >> OUTFILE;
        SEENID = PUBLID;
    }
}

# <paragraph ...> start tag, e.g.
#   <paragraph id="p-bcd0f3fa-bbd3dac4-9a6ee1a8" sum_lang="|xxx:1|">
# The line is copied to OUTFILE when the current <text> is being extracted.
# Its id and sum_lang values are also stored in PID and SUMLANG; no rule
# visible in this file reads them.
/^\s*<paragraph/ {
    if (EXTRACT[PUBLID]) {
        print >> OUTFILE;
    }

    SUMLANG = gensub(/^.+sum_lang=\"([^\"]+?)\".+$/, "\\1", "1", $0);
    PID     = gensub(/^.+id=\"([^\"]+?)\".+$/,       "\\1", "1", $0);
}



# <sentence ...> start tag, e.g.
#   <sentence id="s-bcd0f3fa-bbd3dac4-5c959030" lang="xxx" lang_conf="0.1745286">
# The line is copied to OUTFILE when the current <text> is being extracted.
# Its id, lang and lang_conf values are also stored in SID, LANG and LCONF;
# no rule visible in this file reads them.
/^\s*<sentence/ {
    if (EXTRACT[PUBLID]) {
        print >> OUTFILE;
    }

    LCONF = gensub(/^.+lang_conf=\"([^\"]+?)\".+$/, "\\1", "1", $0);
    LANG  = gensub(/^.+lang=\"([^\"]+?)\".+$/,      "\\1", "1", $0);
    SID   = gensub(/^.+id=\"([^\"]+?)\".+$/,        "\\1", "1", $0);
}


# Report on stderr any tag line that contains none of: a comment, <text>,
# <paragraph> or <sentence> (opening or closing). No rule in this file copies
# such a line to an output file.
($0 ~ /^\s*</ && $0 !~ /^.*<(\/?)(\!\-\-|text|paragraph|sentence)/) {
    # The trailing newline keeps the offending line from running into the
    # next message written to stderr.
    printf("WARNING: unknown element in line %s\n%s\n", NR, $0) > "/dev/stderr";
}


# Lines that are not tags (they do not start with optional whitespace
# followed by '<') are copied to OUTFILE while the current <text> is being
# extracted.
!/^\s*</ && EXTRACT[PUBLID] {
    print >> OUTFILE;
}

# </sentence> and </paragraph> end tags: copied to OUTFILE while the current
# <text> is being extracted. Leading whitespace is allowed (\s*), as it is
# for the corresponding start-tag rules.
/^\s*<\/(sentence|paragraph)>/ {
    if(EXTRACT[PUBLID]){print $0 >> OUTFILE; }
}
# </text> end tag: copied to OUTFILE while the current <text> is being
# extracted. Leading whitespace is allowed (\s*), as it is for the <text>
# start-tag rule.
/^\s*<\/text>/ {
    if(EXTRACT[PUBLID]){print $0 >> OUTFILE; }
}




# END: finish every per-title output file by appending the closing </texts>
# tag, and report each finished file on stderr.
END {
    for (title in TITLE) {
        # Same path scheme as used when the files were started:
        # OUTDIR/<simplified title>/<simplified title><YEAR>.xml
        stem    = simplify(title);
        OUTFILE = OUTDIR "/" stem "/" stem YEAR ".xml";

        printf("</texts>\n")          >> OUTFILE;
        printf("Wrote %s\n", OUTFILE) >> "/dev/stderr";
    }
}


# simplify: turn a title into a string used for directory and file names.
#   - every run of whitespace becomes a single underscore
#   - the umlauts ü ä ö Ü Ä Ö are replaced by u a o U A O
# All other characters are left unchanged.
#
# str      the title string
# returns  the simplified copy (the caller's variable is not modified)
function simplify(str)
{
    str = gensub(/\s+/, "_", "g", str);

    # lower-case umlauts
    str = gensub(/ü/, "u", "g", str);
    str = gensub(/ä/, "a", "g", str);
    str = gensub(/ö/, "o", "g", str);

    # upper-case umlauts
    str = gensub(/Ü/, "U", "g", str);
    str = gensub(/Ä/, "A", "g", str);
    str = gensub(/Ö/, "O", "g", str);

    return str;
}




