blob: ab0789f50b370c0eeee19f1973532d1120b16932 [file] [log] [blame]
# Runs once before any input is read: selects tab as the output field
# separator and writes the XML prolog followed by the opening <texts> tag.
# NOTE(review): nothing visible in this script emits a matching </texts>,
# and the per-record output is tab-separated rather than XML -- confirm the
# intended output format.
BEGIN {
    OFS = "\t"
    # An XML declaration is a processing-instruction-like construct and must
    # be terminated with "?>"; the bare ">" used previously is not well-formed.
    print "<?xml version=\"1.0\" encoding=\"UTF-8\"?>"
    print "<texts>"
}
6
# Return the value of the attribute NAME (written as NAME="value") in LINE,
# or the empty string when LINE carries no such attribute.
#
# The attribute name must be preceded by whitespace (or start the line), so
# asking for "id" does not pick up "binding_id", "page_id" or "publ_id", and
# asking for "date" does not pick up "elec_date" or "issue_date".
# "m" is a local scratch array (gawk three-argument match()).
function text_attr(line, name,    m) {
    if (match(line, "(^|[[:space:]])" name "=\"([^\"]*)\"", m))
        return m[2]
    return ""
}

# For every input line containing "<text", pull selected attributes out of
# the tag and print them as one OFS-separated record.
#
# Attributes of a sample <text> tag (index, name, sample value); "*" marks
# the ones extracted below:
#   * 01 binding_id="2246025"
#   * 02 date="2021-01-15"
#     03 datefrom="20210115"
#     04 dateto="20210115"
#     05 elec_date="_"
#     06 file=""
#   * 07 filename_metadata="finclarin_siirto_k2021/0039-5552/2021/alto/2246025_0039-5552_2021-01-15_SK0221_mets.xml"
#   * 08 filename_orig="finclarin_siirto_k2021/0039-5552/2021/alto/2246025_0039-5552_2021-01-15_SK0221_page-2021011502210030301.xml"
#   * 09 id="t-bcd0f3fa-bbd3dac4"
#     10 img_url=""
#   * 11 issue_date="15.01.2021"
#   * 12 issue_no="SK0221"
#   * 13 issue_title="Suomen Kuvalehti"
#   * 14 label="Suomen Kuvalehti no. SK0221 15.01.2021"
#   * 16 language="fi"
#   * 17 page_id="p1"
#   * 18 page_no="None"
#     19 part_name="_"
#     20 publ_id="0039-5552"
#     21 publ_part=""
#   * 22 publ_title="Suomen Kuvalehti"
#   * 23 publ_type="aikakausi"
#   * 24 sentcount="70"
#     25 sum_lang="|xxx:44|fin:23|eng:3|"
#     26 timefrom="000000"
#     27 timeto="235959"
#   * 28 tokencount="304"
#     29 version_added="KLK-fi-2021">
/<text/ {
    BID          = text_attr($0, "binding_id")
    DATE         = text_attr($0, "date")
    METAFILENAME = text_attr($0, "filename_metadata")
    ORIGFILENAME = text_attr($0, "filename_orig")
    ID           = text_attr($0, "id")
    ISSUEDATE    = text_attr($0, "issue_date")
    ISSUENO      = text_attr($0, "issue_no")
    ISSUETITLE   = text_attr($0, "issue_title")
    LABEL        = text_attr($0, "label")
    LANGUAGE     = text_attr($0, "language")
    PAGEID       = text_attr($0, "page_id")
    PAGENO       = text_attr($0, "page_no")
    PUBLTITLE    = text_attr($0, "publ_title")
    PUBLTYPE     = text_attr($0, "publ_type")
    SENTCOUNT    = text_attr($0, "sentcount")
    TOKENCOUNT   = text_attr($0, "tokencount")

    # Column order is part of the output format; keep it stable.
    print BID, DATE, METAFILENAME, ORIGFILENAME, ID, ISSUEDATE, ISSUENO, ISSUETITLE, LABEL, LANGUAGE, PAGEID, PAGENO, PUBLTITLE, PUBLTYPE, SENTCOUNT, TOKENCOUNT
}
58
59