blob: ab0789f50b370c0eeee19f1973532d1120b16932 [file] [log] [blame]
# Runs once before any input is read: switches the output field separator
# to a tab and emits the XML declaration followed by the opening <texts> tag.
# NOTE(review): no closing </texts> is written in the part of the script
# visible here -- confirm that an END rule (or the caller) emits it.
BEGIN {
    OFS = "\t"
    # An XML declaration must be terminated by "?>"; a bare ">" is malformed.
    # print (rather than printf) is used so the text is never interpreted as
    # a format string; the default ORS supplies the trailing newline.
    print "<?xml version=\"1.0\" encoding=\"UTF-8\"?>"
    print "<texts>"
}
# text_attr(line, name)
#   Returns the value of the attribute `name` found in `line`, or "" when the
#   attribute is absent or its value is not terminated by a double quote.
#   The attribute name has to be preceded by whitespace, so a short name such
#   as "id" or "date" is never matched inside a longer one ("binding_id",
#   "page_id", "publ_id", "issue_date", "elec_date").
#   `rest` and `close_at` are locals.
function text_attr(line, name,    rest, close_at) {
    if (!match(line, "[[:space:]]" name "=\"")) {
        return ""
    }
    # Everything after the opening quote of the value.
    rest = substr(line, RSTART + RLENGTH)
    close_at = index(rest, "\"")
    if (close_at == 0) {
        return ""
    }
    return substr(rest, 1, close_at - 1)
}

# For every line containing "<text", extract a fixed subset of the tag's
# attributes and print them as one OFS-separated record.
#
# Attributes of a sample <text> tag, in order of appearance (those marked
# with "*" are extracted):
#   * 01 binding_id="2246025"
#   * 02 date="2021-01-15"
#     03 datefrom="20210115"
#     04 dateto="20210115"
#     05 elec_date="_"
#     06 file=""
#   * 07 filename_metadata="finclarin_siirto_k2021/0039-5552/2021/alto/2246025_0039-5552_2021-01-15_SK0221_mets.xml"
#   * 08 filename_orig="finclarin_siirto_k2021/0039-5552/2021/alto/2246025_0039-5552_2021-01-15_SK0221_page-2021011502210030301.xml"
#   * 09 id="t-bcd0f3fa-bbd3dac4"
#     10 img_url=""
#   * 11 issue_date="15.01.2021"
#   * 12 issue_no="SK0221"
#   * 13 issue_title="Suomen Kuvalehti"
#   * 14 label="Suomen Kuvalehti no. SK0221 15.01.2021"
#   * 16 language="fi"
#   * 17 page_id="p1"
#   * 18 page_no="None"
#     19 part_name="_"
#     20 publ_id="0039-5552"
#     21 publ_part=""
#   * 22 publ_title="Suomen Kuvalehti"
#   * 23 publ_type="aikakausi"
#   * 24 sentcount="70"
#     25 sum_lang="|xxx:44|fin:23|eng:3|"
#     26 timefrom="000000"
#     27 timeto="235959"
#   * 28 tokencount="304"
#     29 version_added="KLK-fi-2021">
#
# The values are stored in the same global variables as before, in case
# other rules of the script read them.
/<text/ {
    BID          = text_attr($0, "binding_id")
    DATE         = text_attr($0, "date")
    METAFILENAME = text_attr($0, "filename_metadata")
    ORIGFILENAME = text_attr($0, "filename_orig")
    ID           = text_attr($0, "id")
    ISSUEDATE    = text_attr($0, "issue_date")
    ISSUENO      = text_attr($0, "issue_no")
    ISSUETITLE   = text_attr($0, "issue_title")
    LABEL        = text_attr($0, "label")
    LANGUAGE     = text_attr($0, "language")
    PAGEID       = text_attr($0, "page_id")
    PAGENO       = text_attr($0, "page_no")
    PUBLTITLE    = text_attr($0, "publ_title")
    PUBLTYPE     = text_attr($0, "publ_type")
    SENTCOUNT    = text_attr($0, "sentcount")
    TOKENCOUNT   = text_attr($0, "tokencount")

    print BID, DATE, METAFILENAME, ORIGFILENAME, ID, ISSUEDATE, ISSUENO, ISSUETITLE, LABEL, LANGUAGE, PAGEID, PAGENO, PUBLTITLE, PUBLTYPE, SENTCOUNT, TOKENCOUNT
}