| # Runs once before any input is read: sets the output field separator and |
| # writes the XML declaration followed by the opening <texts> tag. |
| BEGIN { |
|     # Fields of comma-separated "print" statements are joined with a tab. |
|     OFS = "\t" |
|     # An XML declaration must be closed with "?>"; the previous string ended |
|     # in a bare ">", which is not well-formed XML. |
|     printf("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n") |
|     # NOTE(review): no closing </texts> is printed in the part of the script |
|     # visible here -- confirm that an END rule elsewhere emits it. |
|     printf("<texts>\n") |
| } |
| |
| # For every input line containing "<text", extract selected attribute values |
| # from the tag and print them as one record whose fields are joined by OFS. |
| # |
| # The per-attribute comments below show example values from a sample tag, |
| # with the numbering used by the original author.  The "Ü" prefix seems to |
| # mark attributes chosen for extraction (page_no is extracted without the |
| # mark) -- TODO confirm. |
| # |
| # Each value is looked up by its complete attribute name.  The earlier |
| # gensub(/^.+NAME="..."/) form was greedy, so a name that is also the tail of |
| # a later attribute picked up that later attribute instead: "id" returned |
| # the value of publ_id and "date" returned the value of issue_date. |
| /<text/ { |
|     # Ü 01 binding_id="2246025" |
|     BID = text_attr($0, "binding_id"); |
|     # Ü 02 date="2021-01-15" |
|     DATE = text_attr($0, "date"); |
|     # 03 datefrom="20210115" |
|     # 04 dateto="20210115" |
|     # 05 elec_date="_" |
|     # 06 file="" |
|     # Ü 07 filename_metadata="finclarin_siirto_k2021/0039-5552/2021/alto/2246025_0039-5552_2021-01-15_SK0221_mets.xml" |
|     METAFILENAME = text_attr($0, "filename_metadata"); |
|     # Ü 08 filename_orig="finclarin_siirto_k2021/0039-5552/2021/alto/2246025_0039-5552_2021-01-15_SK0221_page-2021011502210030301.xml" |
|     ORIGFILENAME = text_attr($0, "filename_orig"); |
|     # Ü 09 id="t-bcd0f3fa-bbd3dac4" |
|     ID = text_attr($0, "id"); |
|     # 10 img_url="" |
|     # Ü 11 issue_date="15.01.2021" |
|     ISSUEDATE = text_attr($0, "issue_date"); |
|     # Ü 12 issue_no="SK0221" |
|     ISSUENO = text_attr($0, "issue_no"); |
|     # Ü 13 issue_title="Suomen Kuvalehti" |
|     ISSUETITLE = text_attr($0, "issue_title"); |
|     # Ü 14 label="Suomen Kuvalehti no. SK0221 15.01.2021" |
|     LABEL = text_attr($0, "label"); |
|     # Ü 16 language="fi" |
|     LANGUAGE = text_attr($0, "language"); |
|     # Ü 17 page_id="p1" |
|     PAGEID = text_attr($0, "page_id"); |
|     # 18 page_no="None" |
|     PAGENO = text_attr($0, "page_no"); |
|     # 19 part_name="_" |
|     # 20 publ_id="0039-5552" |
|     # 21 publ_part="" |
|     # Ü 22 publ_title="Suomen Kuvalehti" |
|     PUBLTITLE = text_attr($0, "publ_title"); |
|     # Ü 23 publ_type="aikakausi" |
|     PUBLTYPE = text_attr($0, "publ_type"); |
|     # Ü 24 sentcount="70" |
|     SENTCOUNT = text_attr($0, "sentcount"); |
|     # 25 sum_lang="|xxx:44|fin:23|eng:3|" |
|     # 26 timefrom="000000" |
|     # 27 timeto="235959" |
|     # Ü 28 tokencount="304" |
|     TOKENCOUNT = text_attr($0, "tokencount"); |
|     # 29 version_added="KLK-fi-2021"> |
| |
|     print BID, DATE, METAFILENAME, ORIGFILENAME, ID, ISSUEDATE, ISSUENO, ISSUETITLE, LABEL, LANGUAGE, PAGEID, PAGENO, PUBLTITLE, PUBLTYPE, SENTCOUNT, TOKENCOUNT |
| } |
| |
| # text_attr(line, name) |
| #   line  text of a tag such as <text a="1" b="2"> |
| #   name  attribute name, used verbatim inside a regular expression (the |
| #         names passed above contain only letters and underscores) |
| # Returns the text between the double quotes of the first attribute called |
| # exactly NAME, or "" when the line has no such attribute.  (The gensub() |
| # form used previously returned the whole input line in that case.) |
| # "m" is a local scratch array; it needs gawk's three-argument match(). |
| function text_attr(line, name,    m) { |
|     # The whitespace in front of NAME ties the match to a complete attribute |
|     # name, so "id" cannot match inside binding_id, page_id or publ_id. |
|     if (match(line, "[[:space:]]" name "=\"([^\"]*)\"", m)) |
|         return m[1] |
|     return "" |
| } |
| |
| |