use PUBLID to extract
diff --git a/vrt2xml.awk b/vrt2xml.awk
index d8d6d88..b52c052 100644
--- a/vrt2xml.awk
+++ b/vrt2xml.awk
@@ -2,12 +2,52 @@
# to be called for each yearly vrt file of KLK e.g. klk_fi_v2_2021.vrt
BEGIN {
- # getting OUTDIR and YEAR as parameters
-
+
+ #------------------------
+ # Field separators
+ #------------------------
+
+ FS="\t";
OFS="\t";
+ #-------------------
+ # Global variables
+ #-------------------
+
+ SEENID="";
+
+
+
+ # Parameters to be given from command line:
+ # OUTDIR
+ # YEAR
+ # SOURCESFILE
+
+ #----------------------------------
+ # Defaults for Parameters
+ #----------------------------------
+
+ if(!OUTDIR){
+ OUTDIR="XMLOUTDIR";
+ printf("Using %s as default output directory \n", OUTDIR)
+ }
+
+ if(!YEAR){
+ YEAR="2021";
+ printf("Extracting data from source for default year %s\n", YEAR)
+ }
+
+ if(!SOURCESFILE){
+ SOURCESFILE="sources_klk_fi_v2_2021_4eureco.csv";
+ printf("Using default sources file %s in this directory\n", SOURCESFILE)
+ }
+
+
system("mkdir -p " OUTDIR);
+ #------------------------------------------------------------
+ # Get positional attributes from first line of input file
+ #------------------------------------------------------------
getline;
if($0 !~ /^\s*<!\-\- #vrt positional-attributes/) {
print "ERROR: Something went wrong when looking for positional attributes in first line"
@@ -15,32 +55,46 @@
else {
PATTS = $0;
}
-
- CONTAIN["Aamulehti"] = 1;
- CONTAIN["Etelä-Suomen Sanomat"] = 1;
- CONTAIN["Hämeen Sanomat"] = 1;
- CONTAIN["Helsingin Sanomat"] = 1;
- CONTAIN["Ilkka-Pohjalainen"] = 1;
- CONTAIN["Ilta-Sanomat"] = 1;
- CONTAIN["Iltalehti"] = 1;
- CONTAIN["Kaleva"] = 1;
- CONTAIN["Karjalainen"] = 1;
- CONTAIN["Kauppalehti"] = 1;
- CONTAIN["Keskipohjanmaa"] = 1;
- CONTAIN["Keskisuomalainen"] = 1;
- CONTAIN["Kouvolan Sanomat"] = 1;
- CONTAIN["Kymen Sanomat"] = 1;
- CONTAIN["Länsi-Savo"] = 1;
- CONTAIN["Lapin Kansa"] = 1;
- CONTAIN["Satakunnan Kansa"] = 1;
- CONTAIN["Savon Sanomat"] = 1;
- CONTAIN["Suomen Kuvalehti"] = 1;
- CONTAIN["Turun Sanomat"] = 1;
-
- # start all outfiles according to selected titles in CONTAIN:
- for(c in CONTAIN){
+ #--------------------------------------------------------------------
+ # get list of sources to be extracted from separate sources file
+ #--------------------------------------------------------------------
+
+ # Skip the header line of the sources file; stop if it cannot be read at all.
+ if((getline < SOURCESFILE) <= 0){
+     print "ERROR: cannot read sources file " SOURCESFILE > "/dev/stderr";
+     exit 1;
+ }
+ # Fields are split on FS ("\t"): $1 = extract flag, $3 = publ_id, $4 = title.
+ # Test for > 0: getline returns -1 on a read error, which a bare test treats as true.
+ while((getline < SOURCESFILE) > 0){
+     # remove all leading and trailing whitespace from the relevant fields:
+     e = $1; gsub(/^\s+/, "", e); gsub(/\s+$/, "", e); # e is extract flag
+     p = $3; gsub(/^\s+/, "", p); gsub(/\s+$/, "", p); # p is publ_id
+     t = $4; gsub(/^\s+/, "", t); gsub(/\s+$/, "", t); # t is title string
+
+     #printf("e: \t|%s|\n", e) >> "/dev/stderr";
+     #printf("p: \t|%s|\n", p) >> "/dev/stderr";
+     #printf("t: \t|%s|\n", t) >> "/dev/stderr";
+     if(e == 1){ # only rows whose extract flag is 1 are recorded
+         TITLE[t] = e; # e.g. TITLE["Aamulehti"] = 1;
+         EXTRACT[p] = t; # e.g. EXTRACT["0355-6913"] ="Aamulehti";
+         # special case publ_id starting in 'fk' according to end user notes:
+         if(p ~ /^fk/){
+             P = toupper(p);
+             EXTRACT[P] = t; # add fact for upper case version of publ_id
+         }
+     }
+ }
+ close(SOURCESFILE); # the sources file is not read again in this block
+
+ for(t in EXTRACT){
+ printf("Extract: %s %s\n", t, EXTRACT[t]) >> "/dev/stderr";
+ }
+
+ # start all outfiles according to the selected titles in TITLE:
+ for(c in TITLE){
system("mkdir -p " OUTDIR "/" simplify(c));
OUTFILE = OUTDIR "/" simplify(c) "/" simplify(c) YEAR ".xml";
@@ -50,9 +104,13 @@
printf("<texts>\n") >> OUTFILE;
print "Starting outfile " OUTFILE > "/dev/stderr";
}
-}
+} # END BEGIN
+#---------------------------------
+# M A I N
+#---------------------------------
+
/^\s*<text/ {
# USE 01 binding_id="2246025"
@@ -97,6 +155,7 @@
LANGUAGE = gensub(/^.+language=\"([^\"]+?)\".+$/, "\\1", "1", $0);
PAGEID = gensub(/^.+page_id=\"([^\"]+?)\".+$/, "\\1", "1", $0);
PAGENO = gensub(/^.+page_no=\"([^\"]+?)\".+$/, "\\1", "1", $0);
+ PUBLID = gensub(/^.+publ_id=\"([^\"]+?)\".+$/, "\\1", "1", $0);
PUBLTITLE = gensub(/^.+publ_title=\"([^\"]+?)\".+$/, "\\1", "1", $0);
PUBLTYPE = gensub(/^.+publ_type=\"([^\"]+?)\".+$/, "\\1", "1", $0);
SENTCOUNT = gensub(/^.+sentcount=\"([^\"]+?)\".+$/, "\\1", "1", $0);
@@ -104,19 +163,25 @@
# print BID, DATE, METAFILENAME, ORIGFILENAME, ID, ISSUEDATE, ISSUENO, ISSUETITLE, LABEL, LANGUAGE, PAGEID, PAGENO, PUBLTITLE, PUBLTYPE, SENTCOUNT, TOKENCOUNT
-
- OUTFILE = OUTDIR "/" simplify(PUBLTITLE) "/" simplify(PUBLTITLE) YEAR ".xml" ;
- # continue OUTFILEs if PUBTITLE of <text> is in CONTAIN:
- if(CONTAIN[PUBLTITLE]){print $0 > OUTFILE; }
+ OUTFILE = OUTDIR "/" simplify(EXTRACT[PUBLID]) "/" simplify(EXTRACT[PUBLID]) YEAR ".xml" ;
+
+ # continue OUTFILEs if the PUBLID of this <text> is a key in EXTRACT:
+ if(EXTRACT[PUBLID]){
+ if(PUBLID != SEENID){
+ printf("Extracting <text>s for %s\n", EXTRACT[PUBLID]) >> "/dev/stderr";
+ }
+ print $0 > OUTFILE;
+ SEENID = PUBLID;
+ }
}
/^\s*<paragraph/ {
## <paragraph id="p-bcd0f3fa-bbd3dac4-9a6ee1a8" sum_lang="|xxx:1|">
PID = gensub(/^.+id=\"([^\"]+?)\".+$/, "\\1", "1", $0);
- SUMLANG = gensub(/^.+sum_lang=\"([^\"]+?)\".+$/, "\\1", "1", $0);
+ SUMLANG = gensub(/^.+sum_lang=\"([^\"]+?)\".+$/, "\\1", "1", $0);
- if(CONTAIN[PUBLTITLE]){print $0 >> OUTFILE; }
+ if(EXTRACT[PUBLID]){print $0 >> OUTFILE; }
}
@@ -127,7 +192,7 @@
LANG = gensub(/^.+lang=\"([^\"]+?)\".+$/, "\\1", "1", $0);
LCONF = gensub(/^.+lang_conf=\"([^\"]+?)\".+$/, "\\1", "1", $0);
- if(CONTAIN[PUBLTITLE]){print $0 >> OUTFILE; }
+ if(EXTRACT[PUBLID]){print $0 >> OUTFILE; }
}
@@ -135,14 +200,14 @@
$0 !~ /^\s*</ {
- if(CONTAIN[PUBLTITLE]){print $0 >> OUTFILE; }
+ if(EXTRACT[PUBLID]){print $0 >> OUTFILE; }
}
/^s*<\/(sentence|paragraph)>/ {
- if(CONTAIN[PUBLTITLE]){print $0 >> OUTFILE; }
+ if(EXTRACT[PUBLID]){print $0 >> OUTFILE; }
}
/^s*<\/text>/ {
- if(CONTAIN[PUBLTITLE]){print $0 >> OUTFILE; }
+ if(EXTRACT[PUBLID]){print $0 >> OUTFILE; }
}
@@ -150,13 +215,11 @@
END {
- for(c in CONTAIN){
- if(CONTAIN[c] == 1){
- OUTFILE = OUTDIR "/" simplify(c) "/" simplify(c) YEAR ".xml";
- printf("</texts>\n") >> OUTFILE;
- }
+ for(c in TITLE){
+ OUTFILE = OUTDIR "/" simplify(c) "/" simplify(c) YEAR ".xml";
+ printf("</texts>\n") >> OUTFILE;
+ printf("Wrote %s\n", OUTFILE) >> "/dev/stderr";
}
-
}