# Start-up: make `print a, b, ...` emit tab-separated fields and write the
# document header before any input line is read.
BEGIN {
    OFS = "\t"
    # An XML declaration must be terminated by "?>"; the previous header
    # ended in a bare ">" and was therefore not a valid declaration.
    print "<?xml version=\"1.0\" encoding=\"UTF-8\"?>"
    # NOTE(review): no matching "</texts>" is written anywhere in the part
    # of the script visible here -- confirm whether an END rule exists.
    print "<texts>"
}
| 6 | |
# Return the value of attribute `name` in the start tag held in `tag`,
# or "" when the tag has no such attribute.
#
# The attribute name must be preceded by whitespace, so asking for "id"
# does not pick up "binding_id", "page_id" or "publ_id", and asking for
# "date" does not pick up "elec_date" or "issue_date".  `hit` is a local
# scratch array (gawk's three-argument match() stores the parenthesised
# group in hit[1]).
function text_attr(tag, name,    hit) {
    if (match(tag, "[[:space:]]" name "=\"([^\"]*)\"", hit))
        return hit[1]
    return ""
}

# For every input line containing "<text", pull selected attributes out of
# the tag and print them as one OFS-separated row.
#
# Example attribute values, taken from the notes that accompanied the
# original code (numbers are the attribute positions given there):
#   01 binding_id="2246025"            02 date="2021-01-15"
#   03 datefrom="20210115"             04 dateto="20210115"
#   05 elec_date="_"                   06 file=""
#   07 filename_metadata="finclarin_siirto_k2021/0039-5552/2021/alto/2246025_0039-5552_2021-01-15_SK0221_mets.xml"
#   08 filename_orig="finclarin_siirto_k2021/0039-5552/2021/alto/2246025_0039-5552_2021-01-15_SK0221_page-2021011502210030301.xml"
#   09 id="t-bcd0f3fa-bbd3dac4"        10 img_url=""
#   11 issue_date="15.01.2021"         12 issue_no="SK0221"
#   13 issue_title="Suomen Kuvalehti"  14 label="Suomen Kuvalehti no. SK0221 15.01.2021"
#   16 language="fi"                   17 page_id="p1"
#   18 page_no="None"                  19 part_name="_"
#   20 publ_id="0039-5552"             21 publ_part=""
#   22 publ_title="Suomen Kuvalehti"   23 publ_type="aikakausi"
#   24 sentcount="70"                  25 sum_lang="|xxx:44|fin:23|eng:3|"
#   26 timefrom="000000"               27 timeto="235959"
#   28 tokencount="304"                29 version_added="KLK-fi-2021"
#
# Only the attributes assigned below are written out; the rest are ignored.
/<text/ {
    BID          = text_attr($0, "binding_id")
    DATE         = text_attr($0, "date")
    METAFILENAME = text_attr($0, "filename_metadata")
    ORIGFILENAME = text_attr($0, "filename_orig")
    ID           = text_attr($0, "id")
    ISSUEDATE    = text_attr($0, "issue_date")
    ISSUENO      = text_attr($0, "issue_no")
    ISSUETITLE   = text_attr($0, "issue_title")
    LABEL        = text_attr($0, "label")
    LANGUAGE     = text_attr($0, "language")
    PAGEID       = text_attr($0, "page_id")
    PAGENO       = text_attr($0, "page_no")
    PUBLTITLE    = text_attr($0, "publ_title")
    PUBLTYPE     = text_attr($0, "publ_type")
    SENTCOUNT    = text_attr($0, "sentcount")
    TOKENCOUNT   = text_attr($0, "tokencount")

    # Column order is part of the output format; keep it stable.
    print BID, DATE, METAFILENAME, ORIGFILENAME, ID, ISSUEDATE, ISSUENO, ISSUETITLE, LABEL, LANGUAGE, PAGEID, PAGENO, PUBLTITLE, PUBLTYPE, SENTCOUNT, TOKENCOUNT
}
| 58 | |
| 59 | |