| # vrt2xml.awk |
| # To be run on each yearly VRT file of KLK, e.g. klk_fi_v2_2021.vrt |
| |
| BEGIN { |
| |
|     #------------------------ |
|     # Field separators |
|     #------------------------ |
|     # Tab is used for every record read by this script, i.e. both the |
|     # VRT input and the sources file. |
|     FS = "\t"; |
|     OFS = "\t"; |
| |
|     #------------------- |
|     # Global variables |
|     #------------------- |
|     # publ_id of the most recently extracted <text>; lets the <text> rule |
|     # log a publication only when it changes. |
|     SEENID = ""; |
| |
|     #---------------------------------- |
|     # Parameters and their defaults |
|     #---------------------------------- |
|     # To be given from the command line (they are read here in BEGIN, |
|     # so they have to be assigned with -v): |
|     #   OUTDIR       root directory for the generated XML files |
|     #   YEAR         year string appended to every output file name |
|     #   SOURCESFILE  tab-separated list of sources with an extract flag |
| |
|     if (!OUTDIR) { |
|         OUTDIR = "XMLOUTDIR"; |
|         printf("Using %s as default output directory \n", OUTDIR) |
|     } |
| |
|     if (!YEAR) { |
|         YEAR = "2021"; |
|         printf("Extracting data from source for default year %s\n", YEAR) |
|     } |
| |
|     if (!SOURCESFILE) { |
|         SOURCESFILE = "sources_klk_fi_v2_2021_4eureco.csv"; |
|         printf("Using default sources file %s in this directory\n", SOURCESFILE) |
|     } |
| |
|     # The path is quoted so that spaces or shell metacharacters in OUTDIR |
|     # cannot split or alter the command. |
|     system("mkdir -p " shellquote(OUTDIR)); |
| |
|     #------------------------------------------------------------ |
|     # Get positional attributes from first line of input file |
|     #------------------------------------------------------------ |
|     # This getline consumes the first record of the main input, so the |
|     # pattern rules below never see it. |
|     getline; |
|     if ($0 !~ /^\s*<!-- #vrt positional-attributes/) { |
|         print "ERROR: Something went wrong when looking for positional attributes in first line" > "/dev/stderr"; |
|     } |
|     else { |
|         PATTS = $0; |
|     } |
| |
|     #-------------------------------------------------------------------- |
|     # Get list of sources to be extracted from separate sources file |
|     #-------------------------------------------------------------------- |
| |
|     # Skip the header line. getline returns -1 when the file cannot be |
|     # read; without this check the loop below would spin forever because |
|     # -1 counts as true. TITLE is still empty at this point, so the END |
|     # rule has nothing to finalize. |
|     if ((getline < SOURCESFILE) < 0) { |
|         printf("ERROR: cannot read sources file %s\n", SOURCESFILE) > "/dev/stderr"; |
|         exit 1; |
|     } |
| |
|     while ((getline < SOURCESFILE) > 0) { |
| |
|         # Strip leading and trailing whitespace from the relevant fields: |
|         e = $1; gsub(/^\s+|\s+$/, "", e);   # e is extract flag |
|         p = $3; gsub(/^\s+|\s+$/, "", p);   # p is publ_id |
|         t = $4; gsub(/^\s+|\s+$/, "", t);   # t is title string |
| |
|         # Set facts (arrays) for rows whose extract flag is 1: |
|         if (e == 1) { |
|             TITLE[t] = e;       # e.g. TITLE["Aamulehti"] = 1; |
|             EXTRACT[p] = t;     # e.g. EXTRACT["0355-6913"] = "Aamulehti"; |
| |
|             # Special case according to end user notes: a publ_id starting |
|             # in 'fk' is also registered in its upper-case form. |
|             if (p ~ /^fk/) { |
|                 P = toupper(p); |
|                 EXTRACT[P] = t; |
|             } |
|         } |
|     } |
|     close(SOURCESFILE); |
| |
|     # Log the selected publ_id -> title mapping. |
|     for (t in EXTRACT) { |
|         printf("Extract: %s %s\n", t, EXTRACT[t]) >> "/dev/stderr"; |
|     } |
| |
|     # Start one output file per selected title. The files are deliberately |
|     # left open: the main rules and END keep writing to the same streams. |
|     for (c in TITLE) { |
|         system("mkdir -p " shellquote(OUTDIR "/" simplify(c))); |
| |
|         OUTFILE = OUTDIR "/" simplify(c) "/" simplify(c) YEAR ".xml"; |
| |
|         printf("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n") > OUTFILE; |
|         print PATTS >> OUTFILE; |
|         printf("<texts>\n") >> OUTFILE; |
|         print "Starting outfile " OUTFILE > "/dev/stderr"; |
|     } |
| } # END BEGIN |
| |
| # shellquote(s): return s wrapped in single quotes for use as one shell |
| # word; embedded single quotes are rewritten as '\'' . |
| function shellquote(s) |
| { |
|     gsub(/'/, "'\\''", s); |
|     return "'" s "'"; |
| } |
| |
| |
| #--------------------------------- |
| # M A I N |
| #--------------------------------- |
| |
| /^\s*<text/ { |
| |
|     # Sample attributes of a <text> start tag; those marked USE are |
|     # copied into global variables below: |
|     # USE  binding_id="2246025" |
|     # USE  date="2021-01-15" |
|     #      datefrom="20210115" |
|     #      dateto="20210115" |
|     #      elec_date="_" |
|     #      file="" |
|     # USE  filename_metadata="finclarin_siirto_k2021/0039-5552/2021/alto/2246025_0039-5552_2021-01-15_SK0221_mets.xml" |
|     # USE  filename_orig="finclarin_siirto_k2021/0039-5552/2021/alto/2246025_0039-5552_2021-01-15_SK0221_page-2021011502210030301.xml" |
|     # USE  id="t-bcd0f3fa-bbd3dac4" |
|     #      img_url="" |
|     # USE  issue_date="15.01.2021" |
|     # USE  issue_no="SK0221" |
|     # USE  issue_title="Suomen Kuvalehti" |
|     # USE  label="Suomen Kuvalehti no. SK0221 15.01.2021" |
|     # USE  language="fi" |
|     # USE  page_id="p1" |
|     # USE  page_no="None" |
|     #      part_name="_" |
|     # USE  publ_id="0039-5552" |
|     #      publ_part="" |
|     # USE  publ_title="Suomen Kuvalehti" |
|     # USE  publ_type="aikakausi" |
|     # USE  sentcount="70" |
|     #      sum_lang="|xxx:44|fin:23|eng:3|" |
|     #      timefrom="000000" |
|     #      timeto="235959" |
|     # USE  tokencount="304" |
|     #      version_added="KLK-fi-2021"> |
| |
|     BID          = vrt_attr($0, "binding_id"); |
|     DATE         = vrt_attr($0, "date"); |
|     METAFILENAME = vrt_attr($0, "filename_metadata"); |
|     ORIGFILENAME = vrt_attr($0, "filename_orig"); |
|     ID           = vrt_attr($0, "id"); |
|     ISSUEDATE    = vrt_attr($0, "issue_date"); |
|     ISSUENO      = vrt_attr($0, "issue_no"); |
|     ISSUETITLE   = vrt_attr($0, "issue_title"); |
|     LABEL        = vrt_attr($0, "label"); |
|     LANGUAGE     = vrt_attr($0, "language"); |
|     PAGEID       = vrt_attr($0, "page_id"); |
|     PAGENO       = vrt_attr($0, "page_no"); |
|     PUBLID       = vrt_attr($0, "publ_id"); |
|     PUBLTITLE    = vrt_attr($0, "publ_title"); |
|     PUBLTYPE     = vrt_attr($0, "publ_type"); |
|     SENTCOUNT    = vrt_attr($0, "sentcount"); |
|     TOKENCOUNT   = vrt_attr($0, "tokencount"); |
| |
|     # Continue the output file if the publ_id of this <text> is in EXTRACT. |
|     # PUBLID stays set afterwards; the rules for the lines inside the text |
|     # test EXTRACT[PUBLID] and write to OUTFILE as well. |
|     if (EXTRACT[PUBLID]) { |
|         OUTFILE = OUTDIR "/" simplify(EXTRACT[PUBLID]) "/" simplify(EXTRACT[PUBLID]) YEAR ".xml"; |
| |
|         # Log only when the publication differs from the previous extracted one. |
|         if (PUBLID != SEENID) { |
|             printf("Extracting <text>s for %s\n", EXTRACT[PUBLID]) >> "/dev/stderr"; |
|         } |
|         # Append: the file already holds the header written in BEGIN. |
|         print $0 >> OUTFILE; |
|         SEENID = PUBLID; |
|     } |
| } |
| |
| # vrt_attr(line, name): return the value of attribute name="..." in a tag |
| # line, or "" when the attribute is absent or empty. The name has to be |
| # preceded by whitespace, so that "date" is not found inside "issue_date" |
| # nor "id" inside "publ_id". |
| function vrt_attr(line, name) |
| { |
|     if (!match(line, "[ \t]" name "=\"[^\"]*\"")) { |
|         return ""; |
|     } |
|     # The match is: 1 whitespace char, the name, '=', '"', the value, '"'. |
|     return substr(line, RSTART + length(name) + 3, RLENGTH - length(name) - 4); |
| } |
| |
| /^\s*<paragraph/ { |
|     # Sample input line: |
|     #   <paragraph id="p-bcd0f3fa-bbd3dac4-9a6ee1a8" sum_lang="|xxx:1|"> |
|     # The id and sum_lang attribute values are kept in globals. |
|     PID     = gensub(/^.+id=\"([^\"]+?)\".+$/, "\\1", "1", $0); |
|     SUMLANG = gensub(/^.+sum_lang=\"([^\"]+?)\".+$/, "\\1", "1", $0); |
| |
|     # Copy the tag to the current output file only if the publ_id set by |
|     # the most recent <text> line is selected in EXTRACT. |
|     if (EXTRACT[PUBLID]) { |
|         print >> OUTFILE; |
|     } |
| } |
| |
| |
| |
| /^\s*<sentence/ { |
|     # Sample input line: |
|     #   <sentence id="s-bcd0f3fa-bbd3dac4-5c959030" lang="xxx" lang_conf="0.1745286"> |
|     # The id, lang and lang_conf attribute values are kept in globals. |
|     SID   = gensub(/^.+id=\"([^\"]+?)\".+$/, "\\1", "1", $0); |
|     LANG  = gensub(/^.+lang=\"([^\"]+?)\".+$/, "\\1", "1", $0); |
|     LCONF = gensub(/^.+lang_conf=\"([^\"]+?)\".+$/, "\\1", "1", $0); |
| |
|     # Copy the tag to the current output file only if the publ_id set by |
|     # the most recent <text> line is selected in EXTRACT. |
|     if (EXTRACT[PUBLID]) { |
|         print >> OUTFILE; |
|     } |
| } |
| |
| |
| # Warn about any tag line that is not a comment or a text, paragraph or |
| # sentence start/end tag. The tag name is tested right after the leading |
| # "<" of the line, and the message ends in a newline so that it does not |
| # run into the next line written to stderr. |
| /^\s*</ && !/^\s*<\/?(!--|text|paragraph|sentence)/ { |
|     printf("WARNING: unknown element in line %s\n%s\n", NR, $0) > "/dev/stderr"; |
| } |
| |
| |
| # Lines that do not start with a tag: copy them to the current output |
| # file if the publ_id set by the most recent <text> line is in EXTRACT. |
| !/^\s*</ { |
|     if (EXTRACT[PUBLID]) { |
|         print >> OUTFILE; |
|     } |
| } |
| |
| # Closing </sentence> and </paragraph> tags: copied like their start tags. |
| # Leading whitespace is allowed (\s*), matching the start-tag rules. |
| /^\s*<\/(sentence|paragraph)>/ { |
|     if (EXTRACT[PUBLID]) { print $0 >> OUTFILE; } |
| } |
| # Closing </text> tag: copied if the enclosing text is being extracted. |
| # Leading whitespace is allowed (\s*), matching the <text> start-tag rule. |
| /^\s*<\/text>/ { |
|     if (EXTRACT[PUBLID]) { print $0 >> OUTFILE; } |
| } |
| |
| |
| |
| |
| END { |
|     # Finish every output document started in BEGIN: append the closing |
|     # </texts> tag and report the file name on stderr. |
|     for (title in TITLE) { |
|         stem = simplify(title); |
|         OUTFILE = OUTDIR "/" stem "/" stem YEAR ".xml"; |
| |
|         printf("</texts>\n") >> OUTFILE; |
|         printf("Wrote %s\n", OUTFILE) >> "/dev/stderr"; |
|     } |
| } |
| |
| |
| # simplify(str): derive the name used for output directories and files. |
| # Every run of whitespace becomes one "_" and the umlaut vowels |
| # ä, ö, ü (both cases) are replaced by a, o, u. |
| # Returns the converted copy; the caller's value is not modified. |
| function simplify(str) |
| { |
|     # whitespace runs -> single underscore |
|     gsub(/\s+/, "_", str); |
| |
|     # lower-case umlauts |
|     gsub(/ä/, "a", str); |
|     gsub(/ö/, "o", str); |
|     gsub(/ü/, "u", str); |
| |
|     # upper-case umlauts |
|     gsub(/Ä/, "A", str); |
|     gsub(/Ö/, "O", str); |
|     gsub(/Ü/, "U", str); |
| |
|     return str; |
| } |
| |
| |
| |
| |