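| # vrt2xml.awk |
| # To be called once for each yearly VRT file of KLK, e.g. klk_fi_v2_2021.vrt. |
| # Requires gawk (gensub). OUTDIR and YEAR are used in the BEGIN block, so they |
| # must be passed with -v, e.g.: |
| #   gawk -v OUTDIR=out -v YEAR=2021 -f vrt2xml.awk klk_fi_v2_2021.vrt |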
| |
| # Initialisation: reads the header line of the VRT file, defines the selected |
| # publication titles and starts one XML outfile per selected title. |
| # OUTDIR and YEAR are expected as parameters; they are used here, before any |
| # input is processed, so they have to be assigned with -v. |
| BEGIN { |
|     OFS = "\t"; |
| |
|     # Single-quote OUTDIR for the shell (escaping embedded single quotes) so |
|     # that directories containing spaces or quotes are created correctly. |
|     SHELL_OUTDIR = OUTDIR; |
|     gsub(/'/, "'\\''", SHELL_OUTDIR); |
|     system("mkdir -p '" SHELL_OUTDIR "'"); |
| |
|     # The first input line must be the positional-attributes comment; it is |
|     # copied into every outfile. This getline consumes that line, so the |
|     # main rules never see it. On failure PATTS stays empty and processing |
|     # continues, but the error is reported on stderr. |
|     if ((getline) > 0 && $0 ~ /^\s*<!-- #vrt positional-attributes/) { |
|         PATTS = $0; |
|     } |
|     else { |
|         print "ERROR: Something went wrong when looking for positional attributes in first line" > "/dev/stderr"; |
|     } |
| |
|     # Selected publication titles (compared against publ_title of each <text>). |
|     CONTAIN["Aamulehti"] = 1; |
|     CONTAIN["Etelä-Suomen Sanomat"] = 1; |
|     CONTAIN["Hämeen Sanomat"] = 1; |
|     CONTAIN["Helsingin Sanomat"] = 1; |
|     CONTAIN["Ilkka-Pohjalainen"] = 1; |
|     CONTAIN["Ilta-Sanomat"] = 1; |
|     CONTAIN["Iltalehti"] = 1; |
|     CONTAIN["Kaleva"] = 1; |
|     CONTAIN["Karjalainen"] = 1; |
|     CONTAIN["Kauppalehti"] = 1; |
|     CONTAIN["Keskipohjanmaa"] = 1; |
|     CONTAIN["Keskisuomalainen"] = 1; |
|     CONTAIN["Kouvolan Sanomat"] = 1; |
|     CONTAIN["Kymen Sanomat"] = 1; |
|     CONTAIN["Länsi-Savo"] = 1; |
|     CONTAIN["Lapin Kansa"] = 1; |
|     CONTAIN["Satakunnan Kansa"] = 1; |
|     CONTAIN["Savon Sanomat"] = 1; |
|     CONTAIN["Suomen Kuvalehti"] = 1; |
|     CONTAIN["Turun Sanomat"] = 1; |
| |
|     # Start all outfiles according to the selected titles in CONTAIN: |
|     # OUTDIR/<title>/<title><YEAR>.xml, with <title> passed through simplify(). |
|     for (c in CONTAIN) { |
|         SLUG = simplify(c); |
| |
|         system("mkdir -p '" SHELL_OUTDIR "/" SLUG "'"); |
| |
|         OUTFILE = OUTDIR "/" SLUG "/" SLUG YEAR ".xml"; |
| |
|         # The first write uses ">" so that an outfile left over from an |
|         # earlier run is truncated; all later writes append. |
|         printf("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n") > OUTFILE; |
|         print PATTS >> OUTFILE; |
|         printf("<texts>\n") >> OUTFILE; |
|         print "Starting outfile " OUTFILE > "/dev/stderr"; |
|     } |
| } |
| |
| |
| # Opening <text ...> tag: extracts the attributes marked USE below into global |
| # variables, derives the outfile of the text from its publ_title and copies the |
| # tag line to that outfile if the title is one of the selected titles. |
| /^\s*<text/ { |
| |
|     # Example attributes of a <text> tag (USE = extracted below): |
|     # USE 01 binding_id="2246025" |
|     # USE 02 date="2021-01-15" |
|     #     03 datefrom="20210115" |
|     #     04 dateto="20210115" |
|     #     05 elec_date="_" |
|     #     06 file="" |
|     # USE 07 filename_metadata="finclarin_siirto_k2021/0039-5552/2021/alto/2246025_0039-5552_2021-01-15_SK0221_mets.xml" |
|     # USE 08 filename_orig="finclarin_siirto_k2021/0039-5552/2021/alto/2246025_0039-5552_2021-01-15_SK0221_page-2021011502210030301.xml" |
|     # USE 09 id="t-bcd0f3fa-bbd3dac4" |
|     #     10 img_url="" |
|     # USE 11 issue_date="15.01.2021" |
|     # USE 12 issue_no="SK0221" |
|     # USE 13 issue_title="Suomen Kuvalehti" |
|     # USE 14 label="Suomen Kuvalehti no. SK0221 15.01.2021" |
|     # USE 16 language="fi" |
|     # USE 17 page_id="p1" |
|     # USE 18 page_no="None" |
|     #     19 part_name="_" |
|     #     20 publ_id="0039-5552" |
|     #     21 publ_part="" |
|     # USE 22 publ_title="Suomen Kuvalehti" |
|     # USE 23 publ_type="aikakausi" |
|     # USE 24 sentcount="70" |
|     #     25 sum_lang="|xxx:44|fin:23|eng:3|" |
|     #     26 timefrom="000000" |
|     #     27 timeto="235959" |
|     # USE 28 tokencount="304" |
|     #     29 version_added="KLK-fi-2021"> |
| |
|     # Attribute names are matched as whole names (see vrt_attr), so that e.g. |
|     # "date" yields date="..." and not issue_date="...", and "id" yields |
|     # id="..." and not publ_id="...". |
|     BID          = vrt_attr($0, "binding_id"); |
|     DATE         = vrt_attr($0, "date"); |
|     METAFILENAME = vrt_attr($0, "filename_metadata"); |
|     ORIGFILENAME = vrt_attr($0, "filename_orig"); |
|     ID           = vrt_attr($0, "id"); |
|     ISSUEDATE    = vrt_attr($0, "issue_date"); |
|     ISSUENO      = vrt_attr($0, "issue_no"); |
|     ISSUETITLE   = vrt_attr($0, "issue_title"); |
|     LABEL        = vrt_attr($0, "label"); |
|     LANGUAGE     = vrt_attr($0, "language"); |
|     PAGEID       = vrt_attr($0, "page_id"); |
|     PAGENO       = vrt_attr($0, "page_no"); |
|     PUBLTITLE    = vrt_attr($0, "publ_title"); |
|     PUBLTYPE     = vrt_attr($0, "publ_type"); |
|     SENTCOUNT    = vrt_attr($0, "sentcount"); |
|     TOKENCOUNT   = vrt_attr($0, "tokencount"); |
| |
|     # print BID, DATE, METAFILENAME, ORIGFILENAME, ID, ISSUEDATE, ISSUENO, ISSUETITLE, LABEL, LANGUAGE, PAGEID, PAGENO, PUBLTITLE, PUBLTYPE, SENTCOUNT, TOKENCOUNT |
| |
|     OUTFILE = OUTDIR "/" simplify(PUBLTITLE) "/" simplify(PUBLTITLE) YEAR ".xml"; |
| |
|     # Continue the outfile if publ_title of this <text> is in CONTAIN. |
|     # Appending (">>") matches the other rules; the file itself was started |
|     # in BEGIN. Note that testing CONTAIN[PUBLTITLE] creates an empty entry |
|     # for titles that are not selected; END therefore checks for the value 1. |
|     if (CONTAIN[PUBLTITLE]) { print $0 >> OUTFILE; } |
| } |
| |
| # vrt_attr(tag, name): returns the value of attribute `name` in the tag line |
| # `tag`, or "" if the attribute is missing. The name has to be preceded by |
| # whitespace, so a shorter name never matches the tail of a longer one. |
| # prefix_len is a local variable, not an argument. Sets RSTART and RLENGTH. |
| function vrt_attr(tag, name,    prefix_len) { |
|     if (!match(tag, "[ \t]" name "=\"[^\"]*\"")) { |
|         return ""; |
|     } |
|     prefix_len = length(name) + 3;    # whitespace + name + =" |
|     return substr(tag, RSTART + prefix_len, RLENGTH - prefix_len - 1); |
| } |
| |
| # Opening <paragraph ...> tag: stores its id and sum_lang attributes in PID and |
| # SUMLANG and copies the line to the outfile of the current text if the title |
| # of that text is selected. |
| # Example: <paragraph id="p-bcd0f3fa-bbd3dac4-9a6ee1a8" sum_lang="|xxx:1|"> |
| $0 ~ /^\s*<paragraph/ { |
|     PID     = gensub(/^.+id=\"([^\"]+?)\".+$/, "\\1", 1, $0); |
|     SUMLANG = gensub(/^.+sum_lang=\"([^\"]+?)\".+$/, "\\1", 1, $0); |
| |
|     if (CONTAIN[PUBLTITLE]) { |
|         print >> OUTFILE; |
|     } |
| } |
| |
| |
| |
| # Opening <sentence ...> tag: stores its id, lang and lang_conf attributes in |
| # SID, LANG and LCONF and copies the line to the outfile of the current text |
| # if the title of that text is selected. |
| # Example: <sentence id="s-bcd0f3fa-bbd3dac4-5c959030" lang="xxx" lang_conf="0.1745286"> |
| $0 ~ /^\s*<sentence/ { |
|     SID   = gensub(/^.+id=\"([^\"]+?)\".+$/, "\\1", 1, $0); |
|     LANG  = gensub(/^.+lang=\"([^\"]+?)\".+$/, "\\1", 1, $0); |
|     LCONF = gensub(/^.+lang_conf=\"([^\"]+?)\".+$/, "\\1", 1, $0); |
| |
|     if (CONTAIN[PUBLTITLE]) { |
|         print >> OUTFILE; |
|     } |
| } |
| |
| |
| # Warn on stderr about tag lines that start with anything other than a comment |
| # or an opening/closing text, paragraph or sentence tag. The check is anchored |
| # at the start of the line, so a known tag name further to the right does not |
| # hide an unknown element, and the message ends with a newline. |
| ($0 ~ /^\s*</ && $0 !~ /^\s*<\/?(!--|text|paragraph|sentence)/) { |
|     printf("WARNING: unknown element in line %s\n%s\n", NR, $0) > "/dev/stderr"; |
| } |
| |
| |
| # Lines that do not start with a tag (the token lines): copy them to the |
| # outfile of the current text if the title of that text is selected. |
| !/^\s*</ { |
|     if (CONTAIN[PUBLTITLE]) { |
|         print >> OUTFILE; |
|     } |
| } |
| |
| # Closing </sentence> and </paragraph> tags: copy them to the outfile of the |
| # current text if its title is selected. Leading whitespace is allowed, as in |
| # the rules for the opening tags. |
| /^\s*<\/(sentence|paragraph)>/ { |
|     if (CONTAIN[PUBLTITLE]) { print $0 >> OUTFILE; } |
| } |
| # Closing </text> tag: copy it to the outfile of the current text if its title |
| # is selected. Leading whitespace is allowed, as in the rule for <text>. |
| /^\s*<\/text>/ { |
|     if (CONTAIN[PUBLTITLE]) { print $0 >> OUTFILE; } |
| } |
| |
| |
| |
| |
| # After the last input line: append the closing </texts> tag to the outfile of |
| # every selected title. Only entries with the value 1 are selected titles; the |
| # CONTAIN[PUBLTITLE] tests in the rules above leave empty entries for all other |
| # titles that occurred in the input. |
| END { |
|     for (c in CONTAIN) { |
|         if (CONTAIN[c] != 1) { |
|             continue; |
|         } |
|         OUTFILE = OUTDIR "/" simplify(c) "/" simplify(c) YEAR ".xml"; |
|         printf("</texts>\n") >> OUTFILE; |
|     } |
| } |
| |
| |
| # simplify(str): returns a copy of str in which the umlauts ä, ö, ü, Ä, Ö, Ü are |
| # replaced by a, o, u, A, O, U and every run of whitespace by one underscore. |
| # Used to build directory and file names from publication titles. |
| function simplify(str) |
| { |
|     # upper-case umlauts |
|     gsub(/Ä/, "A", str); |
|     gsub(/Ö/, "O", str); |
|     gsub(/Ü/, "U", str); |
| |
|     # lower-case umlauts |
|     gsub(/ä/, "a", str); |
|     gsub(/ö/, "o", str); |
|     gsub(/ü/, "u", str); |
| |
|     # whitespace runs |
|     gsub(/\s+/, "_", str); |
| |
|     return str; |
| } |
| |
| |
| |
| |