# blob: b52c0524a73094b1715d74916abbcb224a0594fc [file] [log] [blame]
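# vrt2xml.awk
# Run once for each yearly VRT file of KLK, e.g. klk_fi_v2_2021.vrt.
# Requires GNU awk (gensub). Parameters must be passed with -v so that they
# are already set when the BEGIN block runs, e.g.:
#   gawk -v OUTDIR=out -v YEAR=2021 -v SOURCESFILE=sources.csv -f vrt2xml.awk klk_fi_v2_2021.vrt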
BEGIN {
    # Startup: apply parameter defaults, create the output directory, read the
    # positional-attributes comment from the first input line, load the list of
    # publications to extract from SOURCESFILE, and start one XML output file
    # per selected title.

    #------------------------
    # Field separators
    #------------------------
    FS = "\t";
    OFS = "\t";

    #-------------------
    # Global variables
    #-------------------
    SEENID = "";

    # Parameters to be given from command line:
    #   OUTDIR       directory below which the per-title output is written
    #   YEAR         year string appended to every output file name
    #   SOURCESFILE  tab-separated list of sources (first line is skipped)
    #----------------------------------
    # Defaults for Parameters
    #----------------------------------
    if (!OUTDIR) {
        OUTDIR = "XMLOUTDIR";
        printf("Using %s as default output directory \n", OUTDIR)
    }
    if (!YEAR) {
        YEAR = "2021";
        printf("Extracting data from source for default year %s\n", YEAR)
    }
    if (!SOURCESFILE) {
        SOURCESFILE = "sources_klk_fi_v2_2021_4eureco.csv";
        printf("Using default sources file %s in this directory\n", SOURCESFILE)
    }
    # NOTE(review): OUTDIR (and the titles further down) are handed to the
    # shell unquoted; values containing shell metacharacters will misbehave.
    system("mkdir -p " OUTDIR);

    #------------------------------------------------------------
    # Get positional attributes from first line of input file
    #------------------------------------------------------------
    getline;
    if ($0 !~ /^\s*<!\-\- #vrt positional-attributes/) {
        print "ERROR: Something went wrong when looking for positional attributes in first line"
    }
    else {
        PATTS = $0;
    }

    #--------------------------------------------------------------------
    # get list of sources to be extracted from separate sources file
    #--------------------------------------------------------------------
    # getline returns -1 when the file cannot be read. A bare
    # "while (getline < file)" treats -1 as true and never terminates, so
    # the return value is compared explicitly here and in the loop below.
    if ((getline < SOURCESFILE) < 0) {   # also gets rid of first line of sources file
        printf("ERROR: cannot read sources file %s\n", SOURCESFILE) > "/dev/stderr";
        exit 1;
    }
    while ((getline < SOURCESFILE) > 0) {
        # remove all leading and trailing spaces from relevant fields:
        e = $1; gsub(/^\s+?/, "", e); gsub(/\s+$/, "", e); # e is extract flag
        p = $3; gsub(/^\s+?/, "", p); gsub(/\s+$/, "", p); # p is publ_id
        t = $4; gsub(/^\s+?/, "", t); gsub(/\s+$/, "", t); # t is title string
        #printf("e: \t|%s|\n", e) >> "/dev/stderr";
        #printf("p: \t|%s|\n", p) >> "/dev/stderr";
        #printf("t: \t|%s|\n", t) >> "/dev/stderr";
        # set facts (arrays):
        if (e == 1) {           # i.e. where extract flag is 1
            TITLE[t] = e;       # e.g. TITLE["Aamulehti"] = 1;
            EXTRACT[p] = t;     # e.g. EXTRACT["0355-6913"] ="Aamulehti";
            # special case publ_id starting in 'fk' according to end user notes:
            if (p ~ /^fk/) {
                P = toupper(p);
                EXTRACT[P] = t; # add fact for upper case version of publ_id
            }
        }
    }
    close(SOURCESFILE);

    # Report the publ_id -> title mapping that will be extracted.
    for (t in EXTRACT) {
        printf("Extract: %s %s\n", t, EXTRACT[t]) >> "/dev/stderr";
    }

    # start all outfiles according to selected titles in EXTRACT:
    for (c in TITLE) {
        system("mkdir -p " OUTDIR "/" simplify(c));
        OUTFILE = OUTDIR "/" simplify(c) "/" simplify(c) YEAR ".xml";
        # ">" truncates the file on this first write; later writes append.
        printf("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n") > OUTFILE;
        print PATTS >> OUTFILE;
        printf("<texts>\n") >> OUTFILE;
        print "Starting outfile " OUTFILE > "/dev/stderr";
    }
} # END BEGIN
#---------------------------------
# M A I N
#---------------------------------
/^\s*<text/ {
    # Opening <text ...> tag: pull the attributes out of the tag and, when the
    # text's publ_id is one of the selected sources, copy the tag to that
    # title's output file. PUBLID and OUTFILE stay set for the following rules.
    #
    # Attributes of a <text> element (USE marks the ones of interest):
    # USE 01 binding_id="2246025"
    # USE 02 date="2021-01-15"
    # 03 datefrom="20210115"
    # 04 dateto="20210115"
    # 05 elec_date="_"
    # 06 file=""
    # USE 07 filename_metadata="finclarin_siirto_k2021/0039-5552/2021/alto/2246025_0039-5552_2021-01-15_SK0221_mets.xml"
    # USE 08 filename_orig="finclarin_siirto_k2021/0039-5552/2021/alto/2246025_0039-5552_2021-01-15_SK0221_page-2021011502210030301.xml
    # USE 09 id="t-bcd0f3fa-bbd3dac4"
    # 10 img_url=""
    # USE 11 issue_date="15.01.2021"
    # USE 12 issue_no="SK0221"
    # USE 13 issue_title="Suomen Kuvalehti"
    # USE 14 label="Suomen Kuvalehti no. SK0221 15.01.2021"
    # USE 16 language="fi"
    # USE 17 page_id="p1"
    # USE 18 page_no="None"
    # 19 part_name="_"
    # 20 publ_id="0039-5552"
    # 21 publ_part=""
    # USE 22 publ_title="Suomen Kuvalehti"
    # USE 23 publ_type="aikakausi"
    # USE 24 sentcount="70"
    # 25 sum_lang="|xxx:44|fin:23|eng:3|"
    # 26 timefrom="000000"
    # 27 timeto="235959"
    # USE 28 tokencount="304"
    # 29 version_added="KLK-fi-2021">
    #
    # If an attribute is missing, gensub() finds no match and returns the
    # whole line unchanged.
    BID = gensub(/^.+binding_id=\"([^\"]+?)\".+$/, "\\1", "1", $0);
    # The leading ".+" is greedy, so a bare date=" would match the LAST
    # attribute ending in date=" (issue_date). The preceding space pins the
    # match to the attribute that is really called "date".
    DATE = gensub(/^.+ date=\"([^\"]+?)\".+$/, "\\1", "1", $0);
    METAFILENAME = gensub(/^.+filename_metadata=\"([^\"]+?)\".+$/, "\\1", "1", $0);
    ORIGFILENAME = gensub(/^.+filename_orig=\"([^\"]+?)\".+$/, "\\1", "1", $0);
    # Same reasoning as for DATE: without the space this would pick up publ_id.
    ID = gensub(/^.+ id=\"([^\"]+?)\".+$/, "\\1", "1", $0);
    ISSUEDATE = gensub(/^.+issue_date=\"([^\"]+?)\".+$/, "\\1", "1", $0);
    ISSUENO = gensub(/^.+issue_no=\"([^\"]+?)\".+$/, "\\1", "1", $0);
    ISSUETITLE = gensub(/^.+issue_title=\"([^\"]+?)\".+$/, "\\1", "1", $0);
    LABEL = gensub(/^.+label=\"([^\"]+?)\".+$/, "\\1", "1", $0);
    LANGUAGE = gensub(/^.+language=\"([^\"]+?)\".+$/, "\\1", "1", $0);
    PAGEID = gensub(/^.+page_id=\"([^\"]+?)\".+$/, "\\1", "1", $0);
    PAGENO = gensub(/^.+page_no=\"([^\"]+?)\".+$/, "\\1", "1", $0);
    PUBLID = gensub(/^.+publ_id=\"([^\"]+?)\".+$/, "\\1", "1", $0);
    PUBLTITLE = gensub(/^.+publ_title=\"([^\"]+?)\".+$/, "\\1", "1", $0);
    PUBLTYPE = gensub(/^.+publ_type=\"([^\"]+?)\".+$/, "\\1", "1", $0);
    SENTCOUNT = gensub(/^.+sentcount=\"([^\"]+?)\".+$/, "\\1", "1", $0);
    TOKENCOUNT = gensub(/^.+tokencount=\"([^\"]+?)\".+$/, "\\1", "1", $0);
    # print BID, DATE, METAFILENAME, ORIGFILENAME, ID, ISSUEDATE, ISSUENO, ISSUETITLE, LABEL, LANGUAGE, PAGEID, PAGENO, PUBLTITLE, PUBLTYPE, SENTCOUNT, TOKENCOUNT

    # continue OUTFILEs if publ_id of <text> is in EXTRACT:
    if (EXTRACT[PUBLID]) {
        # The output path is derived from the title mapped to this publ_id;
        # it is only built for selected publications.
        OUTFILE = OUTDIR "/" simplify(EXTRACT[PUBLID]) "/" simplify(EXTRACT[PUBLID]) YEAR ".xml";
        if (PUBLID != SEENID) {
            # Log only when the publication changes from the previous extracted <text>.
            printf("Extracting <text>s for %s\n", EXTRACT[PUBLID]) >> "/dev/stderr";
        }
        # Append, like every other rule: the file was started in BEGIN and
        # must never be truncated here.
        print $0 >> OUTFILE;
        SEENID = PUBLID;
    }
}
/^\s*<paragraph/ {
    # Opening <paragraph ...> tag, e.g.
    #   <paragraph id="p-bcd0f3fa-bbd3dac4-9a6ee1a8" sum_lang="|xxx:1|">
    # Copy it to the current output file when the enclosing <text> belongs
    # to a selected publication.
    if (EXTRACT[PUBLID]) {
        print >> OUTFILE;
    }
    # Keep the paragraph attributes in globals (gensub leaves $0 untouched).
    SUMLANG = gensub(/^.+sum_lang=\"([^\"]+?)\".+$/, "\\1", "1", $0);
    PID = gensub(/^.+id=\"([^\"]+?)\".+$/, "\\1", "1", $0);
}
/^\s*<sentence/ {
    # Opening <sentence ...> tag, e.g.
    #   <sentence id="s-bcd0f3fa-bbd3dac4-5c959030" lang="xxx" lang_conf="0.1745286">
    # Copy it to the current output file when the enclosing <text> belongs
    # to a selected publication.
    if (EXTRACT[PUBLID]) {
        print >> OUTFILE;
    }
    # Keep the sentence attributes in globals (gensub leaves $0 untouched).
    LCONF = gensub(/^.+lang_conf=\"([^\"]+?)\".+$/, "\\1", "1", $0);
    LANG = gensub(/^.+lang=\"([^\"]+?)\".+$/, "\\1", "1", $0);
    SID = gensub(/^.+id=\"([^\"]+?)\".+$/, "\\1", "1", $0);
}
# Report any tag line that is not a comment, <text>, <paragraph> or <sentence>
# (opening or closing). The line itself is echoed after the warning, and the
# message ends with a newline so it does not run into the next stderr output.
($0 ~ /^\s*</ && $0 !~ /^.*<(\/?)(\!\-\-|text|paragraph|sentence)/) {
    printf("WARNING: unknown element in line %s\n%s\n", NR, $0) > "/dev/stderr";
}
# Any line that does not start with a tag is copied to the current output
# file while the enclosing <text> belongs to a selected publication.
!/^\s*</ && EXTRACT[PUBLID] {
    print >> OUTFILE;
}
# Closing </sentence> and </paragraph> tags: copy them to the current output
# file while the enclosing <text> belongs to a selected publication.
# The pattern allows leading whitespace (\s*), like the opening-tag rules.
/^\s*<\/(sentence|paragraph)>/ {
    if (EXTRACT[PUBLID]) { print $0 >> OUTFILE; }
}
# Closing </text> tag: copy it to the current output file while the enclosing
# <text> belongs to a selected publication.
# The pattern allows leading whitespace (\s*), like the opening-tag rules.
/^\s*<\/text>/ {
    if (EXTRACT[PUBLID]) { print $0 >> OUTFILE; }
}
END {
    # Finish every per-title document: append the closing </texts> root tag
    # to each output file and report the file name on stderr.
    for (title in TITLE) {
        stem = simplify(title);
        OUTFILE = OUTDIR "/" stem "/" stem YEAR ".xml";
        printf("</texts>\n") >> OUTFILE;
        printf("Wrote %s\n", OUTFILE) >> "/dev/stderr";
    }
}
# simplify(str)
#   Returns a copy of str with every run of whitespace replaced by a single
#   underscore and the umlauts ä/ö/ü (both cases) replaced by a/o/u.
#   Used to turn a title into a directory / file name component.
function simplify(str)
{
    # whitespace runs -> "_"
    gsub(/\s+/, "_", str);

    # lower-case umlauts
    gsub(/ä/, "a", str);
    gsub(/ö/, "o", str);
    gsub(/ü/, "u", str);

    # upper-case umlauts
    gsub(/Ä/, "A", str);
    gsub(/Ö/, "O", str);
    gsub(/Ü/, "U", str);

    return str;
}