reorganise according to source; add comment line with VRT positional attributes
diff --git a/vrt2xml.awk b/vrt2xml.awk
index 8f37955..d8d6d88 100644
--- a/vrt2xml.awk
+++ b/vrt2xml.awk
@@ -1,9 +1,21 @@
+# vrt2xml.awk
+# to be called for each yearly vrt file of KLK e.g. klk_fi_v2_2021.vrt
+
BEGIN {
+ # OUTDIR and YEAR are expected as parameters; they are used in BEGIN, so pass them with -v (e.g. -v OUTDIR=XML -v YEAR=2021)
+
OFS="\t";
- OUTDIR="XML";
- system("mkdir -p \"" OUTDIR "\"");
-
+ system("mkdir -p " OUTDIR);
+
+ getline;
+ if($0 !~ /^\s*<!\-\- #vrt positional-attributes/) {
+ print "ERROR: Something went wrong when looking for positional attributes in first line"
+ }
+ else {
+ PATTS = $0;
+ }
+
CONTAIN["Aamulehti"] = 1;
CONTAIN["Etelä-Suomen Sanomat"] = 1;
CONTAIN["Hämeen Sanomat"] = 1;
@@ -26,70 +38,76 @@
CONTAIN["Turun Sanomat"] = 1;
+ # start all outfiles according to selected titles in CONTAIN:
for(c in CONTAIN){
- OUTFILE = OUTDIR "/" simplify(c) ".xml";
+
+ system("mkdir -p " OUTDIR "/" simplify(c));
+
+ OUTFILE = OUTDIR "/" simplify(c) "/" simplify(c) YEAR ".xml";
+
printf("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n") > OUTFILE;
+ print PATTS >> OUTFILE;
printf("<texts>\n") >> OUTFILE;
- print OUTFILE > "/dev/stderr";
+ print "Starting outfile " OUTFILE > "/dev/stderr";
}
}
-
-/^\s*<\!\-\-/ { print; } # positional attributes
-
/^\s*<text/ {
# USE 01 binding_id="2246025"
- BID = gensub(/^.+binding_id=\"([^\"]+?)\".+$/, "\\1", "1", $0);
# USE 02 date="2021-01-15"
- DATE = gensub(/^.+date=\"([^\"]+?)\".+$/, "\\1", "1", $0);
-# 03 datefrom="20210115"
-# 04 dateto="20210115"
-# 05 elec_date="_"
-# 06 file=""
+# 03 datefrom="20210115"
+# 04 dateto="20210115"
+# 05 elec_date="_"
+# 06 file=""
# USE 07 filename_metadata="finclarin_siirto_k2021/0039-5552/2021/alto/2246025_0039-5552_2021-01-15_SK0221_mets.xml"
- METAFILENAME = gensub(/^.+filename_metadata=\"([^\"]+?)\".+$/, "\\1", "1", $0);
# USE 08 filename_orig="finclarin_siirto_k2021/0039-5552/2021/alto/2246025_0039-5552_2021-01-15_SK0221_page-2021011502210030301.xml
- ORIGFILENAME = gensub(/^.+filename_orig=\"([^\"]+?)\".+$/, "\\1", "1", $0);
# USE 09 id="t-bcd0f3fa-bbd3dac4"
- ID = gensub(/^.+id=\"([^\"]+?)\".+$/, "\\1", "1", $0);
-# 10 img_url=""
+# 10 img_url=""
# USE 11 issue_date="15.01.2021"
- ISSUEDATE = gensub(/^.+issue_date=\"([^\"]+?)\".+$/, "\\1", "1", $0);
# USE 12 issue_no="SK0221"
- ISSUENO = gensub(/^.+issue_no=\"([^\"]+?)\".+$/, "\\1", "1", $0);
# USE 13 issue_title="Suomen Kuvalehti"
- ISSUETITLE = gensub(/^.+issue_title=\"([^\"]+?)\".+$/, "\\1", "1", $0);
# USE 14 label="Suomen Kuvalehti no. SK0221 15.01.2021"
- LABEL = gensub(/^.+label=\"([^\"]+?)\".+$/, "\\1", "1", $0);
# USE 16 language="fi"
- LANGUAGE = gensub(/^.+language=\"([^\"]+?)\".+$/, "\\1", "1", $0);
# USE 17 page_id="p1"
- PAGEID = gensub(/^.+page_id=\"([^\"]+?)\".+$/, "\\1", "1", $0);
# USE 18 page_no="None"
- PAGENO = gensub(/^.+page_no=\"([^\"]+?)\".+$/, "\\1", "1", $0);
-# 19 part_name="_"
-# 20 publ_id="0039-5552"
-# 21 publ_part=""
+# 19 part_name="_"
+# 20 publ_id="0039-5552"
+# 21 publ_part=""
# USE 22 publ_title="Suomen Kuvalehti"
- PUBLTITLE = gensub(/^.+publ_title=\"([^\"]+?)\".+$/, "\\1", "1", $0);
# USE 23 publ_type="aikakausi"
- PUBLTYPE = gensub(/^.+publ_type=\"([^\"]+?)\".+$/, "\\1", "1", $0);
# USE 24 sentcount="70"
- SENTCOUNT = gensub(/^.+sentcount=\"([^\"]+?)\".+$/, "\\1", "1", $0);
-# 25 sum_lang="|xxx:44|fin:23|eng:3|"
-# 26 timefrom="000000"
-# 27 timeto="235959"
+# 25 sum_lang="|xxx:44|fin:23|eng:3|"
+# 26 timefrom="000000"
+# 27 timeto="235959"
# USE 28 tokencount="304"
- TOKENCOUNT = gensub(/^.+tokencount=\"([^\"]+?)\".+$/, "\\1", "1", $0);
-# 29 version_added="KLK-fi-2021">
+# 29 version_added="KLK-fi-2021">
-# print BID, DATE, METAFILENAME, ORIGFILENAME, ID, ISSUEDATE, ISSUENO, ISSUETITLE, LABEL, LANGUAGE, PAGEID, PAGENO, PUBLTITLE, PUBLTYPE, SENTCOUNT, TOKENCOUNT
+ BID = gensub(/^.+binding_id=\"([^\"]+?)\".+$/, "\\1", "1", $0);
+ DATE = gensub(/^.+date=\"([^\"]+?)\".+$/, "\\1", "1", $0);
+ METAFILENAME = gensub(/^.+filename_metadata=\"([^\"]+?)\".+$/, "\\1", "1", $0);
+ ORIGFILENAME = gensub(/^.+filename_orig=\"([^\"]+?)\".+$/, "\\1", "1", $0);
+ ID = gensub(/^.+id=\"([^\"]+?)\".+$/, "\\1", "1", $0);
+ ISSUEDATE = gensub(/^.+issue_date=\"([^\"]+?)\".+$/, "\\1", "1", $0);
+ ISSUENO = gensub(/^.+issue_no=\"([^\"]+?)\".+$/, "\\1", "1", $0);
+ ISSUETITLE = gensub(/^.+issue_title=\"([^\"]+?)\".+$/, "\\1", "1", $0);
+ LABEL = gensub(/^.+label=\"([^\"]+?)\".+$/, "\\1", "1", $0);
+ LANGUAGE = gensub(/^.+language=\"([^\"]+?)\".+$/, "\\1", "1", $0);
+ PAGEID = gensub(/^.+page_id=\"([^\"]+?)\".+$/, "\\1", "1", $0);
+ PAGENO = gensub(/^.+page_no=\"([^\"]+?)\".+$/, "\\1", "1", $0);
+ PUBLTITLE = gensub(/^.+publ_title=\"([^\"]+?)\".+$/, "\\1", "1", $0);
+ PUBLTYPE = gensub(/^.+publ_type=\"([^\"]+?)\".+$/, "\\1", "1", $0);
+ SENTCOUNT = gensub(/^.+sentcount=\"([^\"]+?)\".+$/, "\\1", "1", $0);
+ TOKENCOUNT = gensub(/^.+tokencount=\"([^\"]+?)\".+$/, "\\1", "1", $0);
+
- OUTFILE = OUTDIR "/" simplify(PUBLTITLE) ".xml" ;
+# print BID, DATE, METAFILENAME, ORIGFILENAME, ID, ISSUEDATE, ISSUENO, ISSUETITLE, LABEL, LANGUAGE, PAGEID, PAGENO, PUBLTITLE, PUBLTYPE, SENTCOUNT, TOKENCOUNT
+ OUTFILE = OUTDIR "/" simplify(PUBLTITLE) "/" simplify(PUBLTITLE) YEAR ".xml" ;
+
+ # continue writing to OUTFILE if PUBLTITLE of <text> is in CONTAIN:
if(CONTAIN[PUBLTITLE]){print $0 > OUTFILE; }
}
@@ -134,7 +152,7 @@
for(c in CONTAIN){
if(CONTAIN[c] == 1){
- OUTFILE = OUTDIR "/" simplify(c) ".xml";
+ OUTFILE = OUTDIR "/" simplify(c) "/" simplify(c) YEAR ".xml";
printf("</texts>\n") >> OUTFILE;
}
}