Blame - vrt2tei.pl - EuReCo/kielipankki4eureco

blob: dd1cffa45931a868048bb54e5c82d6b993ac29a3 [file] [log] [blame]

Harald Lüngen	9d4e046	2024-08-23 09:34:22 +0300	[diff] [blame]	1	#! /usr/bin/perl -w
				2
				3
				4	###########################################################################################################################################################
				5	# vrt2tei.pl
				6	# eureco
				7	# leibniz-institut fuer deutsche sprache / csc finland esbo
				8	# august 2024
				9	#
				10	#
				11	# using XML::Twig, see http://www.xmltwig.org/, https://metacpan.org/pod/XML::Twig
				12	#
Harald Lüngen	ccd8490	2024-08-27 16:03:47 +0300	[diff] [blame]	13	# usage: see the usage_message function below
Harald Lüngen	caab080	2024-08-23 17:28:22 +0300	[diff] [blame]	14	# Usage: ./vrt2tei.pl <vrtxmlfile.xml> > <outfile>
Harald Lüngen	9d4e046	2024-08-23 09:34:22 +0300	[diff] [blame]	15	# <vrtxmlfile>: xml-ised vrt file
				16	#
				17	#
				18	# TODO:
				19	# 1 insert dtd spec, or ref to TEI
				20
Harald Lüngen	db5e6e7	2024-09-04 17:41:18 +0300	[diff] [blame]	21	# 3a remove the vrt positional attribute comment line / all comment lines
Harald Lüngen	9d4e046	2024-08-23 09:34:22 +0300	[diff] [blame]	22	# 3b add @head and @deprel to I5 sowie auch @msd
				23	# 3c bearbeitung von @head und @deprel in tei2korapxml durch Nils?
				24	# 3d build 30 billion corpus
				25
				26	# 4a take care of IDs
				27	# 4b see to the values of @xml:lang
				28	# 5 abfangen von unerwarteten elementen dh andere als <sentence> und <paragraph>
				29	# 5a wort reihenfolge nochmal checken
				30	# 6 checks and balances
Harald Lüngen	9d4e046	2024-08-23 09:34:22 +0300	[diff] [blame]	31	# 7 How to encode Kielipankki and National Library of Finland? in teiCorpus Header
				32	# 8 construct <idsDoc>s for the months (or go for TEI)
				33	# 9 parallelisation in bash and application on sub corpora of KLK
				34	# 10 re-implementation of the gawk code in the perl script
				35	# 12 re-implement creation of text header from xml file in another twig / parametrize TEI vs I5
				36
				37
				38
				39	#remember
				40	#formatted.xml:105613: element w: validity error : No declaration for attribute deprel of element w
				41	#formatted.xml:105613: element w: validity error : No declaration for attribute head of element w
				42
				43
				44	#
				45	#
				46	############################################################################################################################################################
				47
				48
				49	use strict;
				50	use warnings;
				51
				52	use XML::Twig;
				53	use XML::Generator ':pretty'; # apparently no effect when using flush();
				54
				55
				56	use locale; # diese drei Zeilen, damit \b im regex nicht Umlaute und ß matcht.
				57	use POSIX qw(locale_h); # to be able to use setlocale()
				58	#setlocale(LC_ALL,'de_DE');
				59	setlocale(LC_ALL, "fi_FI");
				60	use utf8;
				61	use open qw( :std :encoding(UTF-8) );
				62
				63	use Time::Piece;
				64	use Tie::IxHash;
				65
Harald Lüngen	db5e6e7	2024-09-04 17:41:18 +0300	[diff] [blame]	66
				67
Harald Lüngen	9d4e046	2024-08-23 09:34:22 +0300	[diff] [blame]	68	#----------------------
				69	# check file arguments:
				70	#----------------------
				71
				72	# arg0 infile: vrt-xml
Harald Lüngen	9d4e046	2024-08-23 09:34:22 +0300	[diff] [blame]	73
# Exactly one argument is expected: the VRT-XML input file. The converted
# document is written to standard output, so a second argument is rejected.
# Counting @ARGV (instead of testing the truth of $ARGV[0] / $ARGV[1]) also
# handles arguments such as "0" that are false in boolean context.
usage_message() unless @ARGV == 1;
Harald Lüngen	9d4e046	2024-08-23 09:34:22 +0300	[diff] [blame]	76
				77
				78	####################
				79	# GLOBAL VARIABLES
				80	####################
				81
				82	my $encoding = "UTF-8";
				83	#my $encoding = "iso-8859-1"; # dieses $encoding ist NUR fuer das output s.u. twig funktion
Harald Lüngen	caab080	2024-08-23 17:28:22 +0300	[diff] [blame]	84	my $textcounter = 0;
Harald Lüngen	9d4e046	2024-08-23 09:34:22 +0300	[diff] [blame]	85
				86
Harald Lüngen	db5e6e7	2024-09-04 17:41:18 +0300	[diff] [blame]	87
				88	my $twig="";
				89	my $teiCorpusHeaderDoc="";
				90
				91
				92	#------------------------------------------------------------------
				93	# read corpusHeaderSkeleton document and get header out of it
				94	#------------------------------------------------------------------
				95
				96	my $teiCorpusHeaderDocTwig = new XML::Twig(
				97	keep_spaces => 1,
				98	keep_atts_order => 1,
				99	comments => 'drop',
				100	);
				101
				102
				103	$teiCorpusHeaderDocTwig->parsefile("teiCorpusHeaderSkeleton.tei.xml");
				104	my $corpusHeader = $teiCorpusHeaderDocTwig->root; # getting the teiHeader for corpus out of the teiCorpusHeaderSkeleton document
				105
				106
				107	#------------------------------------------------------------------
				108	# read textHeaderSkeleton document adn get header out of it
				109	#------------------------------------------------------------------
				110
				111	my $teiTextHeaderDocTwig = new XML::Twig(
				112	keep_spaces => 1,
				113	keep_atts_order => 1,
				114	comments => 'drop',
				115	);
				116
				117	$teiTextHeaderDocTwig->parsefile("teiTextHeaderSkeleton.tei.xml");
				118	my $textHeader = $teiTextHeaderDocTwig->root; # getting the teiHeader for corpus out of the teiTextHeaderSkeleton document
				119
				120
				121	#----------------------------------
				122	# read input VRT-XML document
				123	#----------------------------------
				124
				125	open(my $IN, "< $ARGV[0]") \|\| die("$0: cannot open file for reading: $ARGV[0]"); # open input file and initialise filehandel, actually does not seem to be needed
				126	# as parsefile() (s.b.) is applied to the filename
				127
				128	#-----------------------------------------------------
				129	# global variables pertaining to the original corpus
				130	#-----------------------------------------------------
				131
				132	my $kielipankkiCorpus = "klk-fi-v2-vrt";
				133
				134
				135
				136
Harald Lüngen	9d4e046	2024-08-23 09:34:22 +0300	[diff] [blame]	137	#####################
				138	# M A I N
				139	#####################
				140
Harald Lüngen	db5e6e7	2024-09-04 17:41:18 +0300	[diff] [blame]	141	#-------------------------------------------------------------------------------------------------------------
				142	# start twig for input and call start tag handler for root and twig handler for each <text> in the VRT
				143	#-------------------------------------------------------------------------------------------------------------
Harald Lüngen	9d4e046	2024-08-23 09:34:22 +0300	[diff] [blame]	144
Harald Lüngen	9d4e046	2024-08-23 09:34:22 +0300	[diff] [blame]	145
				146	$twig = new XML::Twig(
				147	keep_spaces => 1, # dadurch auch whitespaces an ehemeligen elementgrenzen im output
				148	keep_atts_order => 1, # requires Tie::IxHash
Harald Lüngen	db5e6e7	2024-09-04 17:41:18 +0300	[diff] [blame]	149	comments => 'drop',
Harald Lüngen	9d4e046	2024-08-23 09:34:22 +0300	[diff] [blame]	150	start_tag_handlers => {
Harald Lüngen	db5e6e7	2024-09-04 17:41:18 +0300	[diff] [blame]	151	texts => sub{root(@_, $corpusHeader)}
Harald Lüngen	9d4e046	2024-08-23 09:34:22 +0300	[diff] [blame]	152	},
				153	twig_handlers => {
Harald Lüngen	db5e6e7	2024-09-04 17:41:18 +0300	[diff] [blame]	154	# text => \&text
				155	text => sub{text(@_, $textHeader)}
Harald Lüngen	9d4e046	2024-08-23 09:34:22 +0300	[diff] [blame]	156	},
				157	# dtd_handlers => { # ToDo for I5
				158	# \&set_dtd;
				159	# }
Harald Lüngen	a20e69d	2024-08-29 13:33:08 +0300	[diff] [blame]	160
Harald Lüngen	9d4e046	2024-08-23 09:34:22 +0300	[diff] [blame]	161	output_encoding => $encoding,
				162	);
				163
				164	$twig->parsefile($ARGV[0]);
				165
Harald Lüngen	db5e6e7	2024-09-04 17:41:18 +0300	[diff] [blame]	166
				167
Harald Lüngen	9d4e046	2024-08-23 09:34:22 +0300	[diff] [blame]	168
				169
				170	###########
				171	# END MAIN
				172	###########
				173
				174
				175
				176
				177	##############################
				178	# S U B R O U T I N E S
				179	##############################
				180
				181	# sub set_dtd [
				182	# my $twig, $dtd = @_;
				183	# my $internal = qq\|\nPUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"\n"DTD/xhtml1-strict.dtd"\|;
				184	#
				185	# $twig->twig_doctype('html', undef, undef, $internal);
				186	# }
				187
				188
				189
				190	sub root {
Harald Lüngen	db5e6e7	2024-09-04 17:41:18 +0300	[diff] [blame]	191	my ($twig, $root, $corpusHeader) =@_;
Harald Lüngen	9d4e046	2024-08-23 09:34:22 +0300	[diff] [blame]	192
				193	$root->set_gi('teiCorpus');
				194	$root->set_att("xmlns", 'http://www.tei-c.org/ns/1.0');
				195
Harald Lüngen	db5e6e7	2024-09-04 17:41:18 +0300	[diff] [blame]	196	&insertCorpusHeader($root, $corpusHeader);
Harald Lüngen	9d4e046	2024-08-23 09:34:22 +0300	[diff] [blame]	197	}
				198
				199
Harald Lüngen	db5e6e7	2024-09-04 17:41:18 +0300	[diff] [blame]	200
# Fill the corpus header skeleton and insert it as first child of the root.
#
# Arguments:
#   $root         - the root element of the output document
#   $corpusHeader - the corpus header element taken from the skeleton
#
# The source name and the year are derived from the base name of the input
# file ($ARGV[0]), which is expected to look like <source><YYYY>.xml. The
# file-level $kielipankkiCorpus supplies the corpus name.
# Returns whatever paste() returns.
sub insertCorpusHeader {
    my ($root, $corpusHeader) = @_;

    #---------------------------------------------------------------------------
    # get some metadata for the current output corpus based on source and year
    #---------------------------------------------------------------------------

    # last path component of the input file name
    my $source = (split m{/}, $ARGV[0])[-1];

    # Only trust the capture group when the substitution really matched;
    # otherwise $1 would be undefined or left over from an earlier match.
    my $yy = '';
    if ($source =~ s/([0-9][0-9][0-9][0-9])\.xml$//) {
        $yy = $1;
    }
    else {
        warn "$0: cannot derive a year from file name '$source' (expected <source><YYYY>.xml)\n";
    }

    #-----------------------
    # set corpus header
    #-----------------------

    set_title($corpusHeader, $source, $yy, $kielipankkiCorpus);
    set_sourceDesc($corpusHeader, $source, $yy, $kielipankkiCorpus);

    return $corpusHeader->paste("first_child", $root);
}
				231
Harald Lüngen	db5e6e7	2024-09-04 17:41:18 +0300	[diff] [blame]	232
Harald Lüngen	9d4e046	2024-08-23 09:34:22 +0300	[diff] [blame]	233	#----------------------------
				234	# handler &text for <text>
				235	#----------------------------
				236
				237	sub text {
Harald Lüngen	db5e6e7	2024-09-04 17:41:18 +0300	[diff] [blame]	238	my ($twig, $text, $textHeader) = @_;
Harald Lüngen	9d4e046	2024-08-23 09:34:22 +0300	[diff] [blame]	239
Harald Lüngen	caab080	2024-08-23 17:28:22 +0300	[diff] [blame]	240	$textcounter++; # global variable
				241
				242	# ToDo: catch all other, unexpected children of root
Harald Lüngen	9d4e046	2024-08-23 09:34:22 +0300	[diff] [blame]	243
				244	#--------------------------------------------------------------------------
				245	# Get text metadata (attributes of <text>) and create teiHeader for <text>
				246	#--------------------------------------------------------------------------
				247
				248	my $textattsref = $text->atts(); # $textattsref is now a reference to a hash and should be used with '->'
				249
Harald Lüngen	db5e6e7	2024-09-04 17:41:18 +0300	[diff] [blame]	250
				251	&createTextHeader($text, $textattsref, $textHeader);
Harald Lüngen	9d4e046	2024-08-23 09:34:22 +0300	[diff] [blame]	252
				253	#--------------------------
				254	# create <TEI> from <text>
				255	#--------------------------
				256
				257	# set vrt <text> to <TEI> and delete all attributes after they were were saved above
				258	$text->del_atts;
				259	$text->set_gi("TEI");
Harald Lüngen	caab080	2024-08-23 17:28:22 +0300	[diff] [blame]	260
Harald Lüngen	9d4e046	2024-08-23 09:34:22 +0300	[diff] [blame]	261	#------------------------------------------------------------------
				262	# create the <tei:text>, <body>, <div> elements inside <TEI>
				263	#------------------------------------------------------------------
				264
				265	my $ttext_element = XML::Twig::Elt->new('text');
				266	my $body_element = XML::Twig::Elt->new('body');
				267	my $div_element = XML::Twig::Elt->new('div');
				268
				269	# set atts
Harald Lüngen	caab080	2024-08-23 17:28:22 +0300	[diff] [blame]	270	$div_element ->set_att("type", "page"); # ToDo: this is specific to KLK
				271	$ttext_element->set_att("xml:lang", 'fi'); # as in ICC-NOR
Harald Lüngen	9d4e046	2024-08-23 09:34:22 +0300	[diff] [blame]	272
				273	# paste
				274	$ttext_element->paste('last_child', $text);
				275	$body_element ->paste('last_child', $ttext_element);
				276	$div_element ->paste('last_child', $body_element);
				277
				278
				279	#-------------------------------
				280	# create <p> from <paragraph>
				281	#-------------------------------
				282
				283	my @paragraphs = $text->children( 'paragraph');
				284
				285	foreach my $paragraph (@paragraphs) {
				286
				287	&setP($paragraph);
				288
				289	$paragraph->move('last_child', $div_element);
				290
Harald Lüngen	9d4e046	2024-08-23 09:34:22 +0300	[diff] [blame]	291	#------------------------------
				292	# create <s> from <sentence>
				293	#------------------------------
				294
				295	my @sentences = $paragraph->children('sentence');
				296	foreach my $sentence (@sentences) {
				297
				298	&setS($sentence);
				299
				300
				301	#--------------------------------------
				302	# create <w> (word) from each $line
				303	#--------------------------------------
				304
				305	my @lines = split(/\n+/, $sentence->xml_text);
				306	$sentence->set_text("\n");
				307
				308	for my $line (@lines){ # Todo: Reihenfolge checken
				309	if($line ne "" ){
				310	my $w_element = XML::Twig::Elt->new('w');
				311	&createW($w_element, $line);
				312	$w_element->paste('last_child', $sentence);
				313	}
Harald Lüngen	caab080	2024-08-23 17:28:22 +0300	[diff] [blame]	314	} # end words
				315	} # end sentences
				316	} # end paragraphs
Harald Lüngen	9d4e046	2024-08-23 09:34:22 +0300	[diff] [blame]	317
Harald Lüngen	db5e6e7	2024-09-04 17:41:18 +0300	[diff] [blame]	318	# $twig->set_pretty_print( 'record');
Harald Lüngen	a20e69d	2024-08-29 13:33:08 +0300	[diff] [blame]	319	# $twig->flush($OUT);
				320	$twig->flush("/dev/stdout");
				321	}
Harald Lüngen	9d4e046	2024-08-23 09:34:22 +0300	[diff] [blame]	322
				323	sub createTextHeader{
Harald Lüngen	db5e6e7	2024-09-04 17:41:18 +0300	[diff] [blame]	324	my ($text, $textattsref, $textHeader) = @_;
Harald Lüngen	9d4e046	2024-08-23 09:34:22 +0300	[diff] [blame]	325
				326	# USE 01 binding_id="2246025"
				327	# USE 02 date="2021-01-15"
				328	# 03 datefrom="20210115"
				329	# 04 dateto="20210115"
				330	# 05 elec_date="_"
				331	# 06 file=""
				332	# USE 07 filename_metadata="finclarin_siirto_k2021/0039-5552/2021/alto/2246025_0039-5552_2021-01-15_SK0221_mets.xml"
				333	# USE 08 filename_orig ="finclarin_siirto_k2021/0039-5552/2021/alto/2246025_0039-5552_2021-01-15_SK0221_page-2021011502210030301.xml
				334	# USE 09 id="t-bcd0f3fa-bbd3dac4"
				335	# 10 img_url=""
				336	# USE 11 issue_date="15.01.2021"
				337	# USE 12 issue_no="SK0221"
				338	# USE 13 issue_title="Suomen Kuvalehti"
				339	# USE 14 label="Suomen Kuvalehti no. SK0221 15.01.2021"
				340	# USE 16 language="fi"
				341	# USE 17 page_id="p1"
				342	# USE 18 page_no="None"
				343	# 19 part_name="_"
				344	# 20 publ_id="0039-5552"
				345	# 21 publ_part=""
				346	# USE 22 publ_title="Suomen Kuvalehti"
				347	# USE 23 publ_type="aikakausi"
				348	# USE 24 sentcount="70"
				349	# USE 25 sum_lang="\|xxx:44\|fin:23\|eng:3\|"
				350	# 26 timefrom="000000"
				351	# 27 timeto="235959"
				352	# USE 28 tokencount="304"
				353	# 29 version_added="KLK-fi-2021">
				354
				355
				356	my $BID = $textattsref->{'binding_id'};
				357	my $DATE = $textattsref->{'date'};
				358	my $METAFILENAME = $textattsref->{'filename_metadata'};
				359	my $ORIGFILENAME = $textattsref->{'filename_orig'};
				360	my $ID = $textattsref->{'id'};
				361	my $ISSUEDATE = $textattsref->{'issue_date'};
				362	my $ISSUENO = $textattsref->{'issue_no'};
				363	my $ISSUETITLE = $textattsref->{'issue_title'};
				364	my $LABEL = $textattsref->{'label'};
				365	my $LANGUAGE = $textattsref->{'language'};
				366	my $PAGEID = $textattsref->{'page_id'};
				367	my $PAGENO = $textattsref->{'page_no'};
				368	my $PUBLTITLE = $textattsref->{'publ_title'};
				369	my $PUBLTYPE = $textattsref->{'publ_type'};
				370	my $SENTCOUNT = $textattsref->{'sentcount'};
				371	my $SUMLANG = $textattsref->{'sum_lang'};
				372	my $TOKENCOUNT = $textattsref->{'tokencount'};
				373
				374
				375	#-----------------------------
				376	# Derived Metadata variables
				377	#-----------------------------
				378
				379	my @datearray = split("-", $DATE);
				380	my @langarray = split("\|", $SUMLANG);
				381	my @namearray = split(/[\.\/]/, $ORIGFILENAME); # use $namearray[4] as ID for the page
				382
				383
Harald Lüngen	db5e6e7	2024-09-04 17:41:18 +0300	[diff] [blame]	384
Harald Lüngen	9d4e046	2024-08-23 09:34:22 +0300	[diff] [blame]	385	#-----------------------------------------------------------------------
Harald Lüngen	db5e6e7	2024-09-04 17:41:18 +0300	[diff] [blame]	386	# CREATE text-teiHeader ACCORDING TO THE SKELETON in $textHeader
Harald Lüngen	9d4e046	2024-08-23 09:34:22 +0300	[diff] [blame]	387	#-----------------------------------------------------------------------
				388
Harald Lüngen	9d4e046	2024-08-23 09:34:22 +0300	[diff] [blame]	389
Harald Lüngen	db5e6e7	2024-09-04 17:41:18 +0300	[diff] [blame]	390	$textHeader->paste('first_child', $text);
				391
				392	#-----------------------------------------------
				393	# <teiHeader>
				394	# <fileDesc n="[EuReCo-KLK-FIN_$ID]">
				395	# <titleStmt>
				396	# <title>[$LABEL, page $PAGENO]</title>
				397
				398	$textHeader->first_child("fileDesc")->first_child("titleStmt")->first_child("title")
				399	->set_text($LABEL . ", Text #" . $textcounter); # Case KLK
				400
				401	#-----------------------------------------------
				402	# <fileDesc>
				403	# <sourceDesc>
				404	# <biblStruct>
				405	# <analytic>
				406	# <title type="main">[$LABEL, page $PAGENO]</title>
				407	# <date>[$DATE]</date>
				408	# <date type="year">TODO</date>
				409	# <date type="month">TODO</date>
				410	# <date type="day">TODO</date>
				411	# <idno type="PAGEID">$PAGEID</idno>
				412	# <idno type="BINDINGID">$BID</idno>
				413	# <idno type="ID">$ID</idno>
				414	# <idno type="KIELIPANKKI_METAFILENAME">$METAFILENAME</idno>
				415	# <idno type="KIELIPANKKI_ORIGFILENAME">$ORIGFILENAME</idno>
				416	# <textLang>$LANGUAGE</textLang>
				417	# </analytic>
				418
				419	my $analytic = $textHeader->get_xpath("./fileDesc/sourceDesc/biblStruct/analytic", 0);
				420
				421	$analytic->first_child("title") ->set_text($LABEL . ", Text #" . $textcounter); # Case KLK
				422	$analytic->get_xpath('./date[@type="year"]', 0) ->set_text($datearray[0]);
				423	$analytic->get_xpath('./date[@type="month"]', 0) ->set_text($datearray[1]);
				424	$analytic->get_xpath('./date[@type="day"]', 0) ->set_text($datearray[2]);
				425	$analytic->get_xpath('./idno[@type="PAGEID"]', 0) ->set_text($PAGEID);
				426	$analytic->get_xpath('./idno[@type="BINDINGID"]', 0) ->set_text($BID);
				427	$analytic->get_xpath('./idno[@type="ID"]', 0) ->set_text($ID);
				428	$analytic->get_xpath('./idno[@type="KIELIPANKKI_METAFILENAME"]', 0) ->set_text($METAFILENAME);
				429	$analytic->get_xpath('./idno[@type="KIELIPANKKI_ORIGFILENAME"]', 0) ->set_text($ORIGFILENAME);
				430	$analytic->first_child('textLang') ->set_text($LANGUAGE);
Harald Lüngen	9d4e046	2024-08-23 09:34:22 +0300	[diff] [blame]	431
Harald Lüngen	db5e6e7	2024-09-04 17:41:18 +0300	[diff] [blame]	432	# <monogr>
				433	# <title>$PUBLTITLE</title>
				434	# <imprint>
				435	# <pubPlace>TODO</pubPlace>
				436	# <publisher>TODO</publisher>
				437	# </imprint>
				438	# <biblScope unit="ISSUETITLE"/>
				439	# <biblScope unit="ISSUENO"/>
				440	# <biblScope unit="ISSUEDATE"/>
				441	# <biblScope unit="pp">$PAGENO</biblScope>
				442	# <monogr>
Harald Lüngen	9d4e046	2024-08-23 09:34:22 +0300	[diff] [blame]	443
Harald Lüngen	db5e6e7	2024-09-04 17:41:18 +0300	[diff] [blame]	444	my $monogr = $textHeader->get_xpath("./fileDesc/sourceDesc/biblStruct/monogr", 0);
Harald Lüngen	9d4e046	2024-08-23 09:34:22 +0300	[diff] [blame]	445
Harald Lüngen	9d4e046	2024-08-23 09:34:22 +0300	[diff] [blame]	446
Harald Lüngen	9d4e046	2024-08-23 09:34:22 +0300	[diff] [blame]	447
Harald Lüngen	9d4e046	2024-08-23 09:34:22 +0300	[diff] [blame]	448
Harald Lüngen	9d4e046	2024-08-23 09:34:22 +0300	[diff] [blame]	449
Harald Lüngen	db5e6e7	2024-09-04 17:41:18 +0300	[diff] [blame]	450	##TMP # create <teiHeader> inside <TEI>
				451	##TMP my $teiHeader = XML::Twig::Elt->new('teiHeader');
				452	##TMP # $teiHeader->paste('first_child', $text);
				453	##TMP
				454	##TMP ## insert_new_elt is a combo of new and paste, cf. xml::twig docu:
				455	##TMP ## insert_new_elt ($opt_position, $gi, $opt_atts_hashref, @opt_content)
				456	##TMP
				457	##TMP my $fileDesc = $teiHeader->insert_new_elt('fileDesc' => {n => "EuReCo_KLK-fi_" . $namearray[4]});
				458	##TMP my $encodingDesc = $teiHeader->insert_new_elt("last_child", 'encodingDesc');
				459	##TMP my $profileDesc = $teiHeader->insert_new_elt("last_child", 'profileDesc');
				460	##TMP my $revisionDesc = $teiHeader->insert_new_elt("last_child", 'revisionDesc');
				461	##TMP
				462	##TMP #---------------------
				463	##TMP # fileDesc/titleStmt
				464	##TMP #---------------------
				465	##TMP my $titleStmt = $fileDesc ->insert_new_elt('titleStmt');
				466	##TMP my $title = $titleStmt->insert_new_elt("last_child", 'title');
				467	##TMP my $respStmt = $titleStmt->insert_new_elt("last_child", 'respStmt');
				468	##TMP my $resp = $respStmt ->insert_new_elt("last_child", 'resp');
				469	##TMP my $name = $respStmt ->insert_new_elt("last_child", 'name');
				470	##TMP
				471	##TMP # set texts for titleStmt
				472	##TMP # $title->set_text($LABEL . ", page " . $PAGENO); # Achtung - PAGENO scheint meist "None" zu sein
				473	##TMP $title->set_text($LABEL . ", Text #" . $textcounter); # at least for Suomen Kuvalehti
				474	##TMP $resp ->set_text("compiled by EuReCo");
				475	##TMP $name ->set_text("EuReCo: HL");
				476	##TMP
				477	##TMP #--------------------------
				478	##TMP # fileDesc/publicationStmt
				479	##TMP #--------------------------
				480	##TMP my $publicationStmt = $fileDesc ->insert_new_elt("last_child", 'publicationStmt');
				481	##TMP my $distributor = $publicationStmt->insert_new_elt("last_child", 'distributor');
				482	##TMP my $note = $distributor ->insert_new_elt("last_child", 'note');
				483	##TMP my $availability = $publicationStmt->insert_new_elt("last_child", 'availability');
				484	##TMP my $licence = $availability ->insert_new_elt("last_child", 'licence');
				485	##TMP
				486	##TMP # set texts for publicationStmt
				487	##TMP $note ->set_text("NOT FOR DISTRIBUTION - to be used locally in EuReCo");
				488	##TMP $licence->set_text("CLARIN_RES"); # TODO: Ausfuherlichere Licence info in KLK Metadata Record
				489	##TMP
				490	##TMP #------------------------------
				491	##TMP # fileDesc/sourceDesc/biblStruct
				492	##TMP #------------------------------
				493	##TMP my $sourceDesc = $fileDesc ->insert_new_elt("last_child", 'sourceDesc');
				494	##TMP my $biblStruct = $sourceDesc->insert_new_elt("last_child", 'biblStruct');
				495	##TMP
				496	##TMP # fileDesc/sourceDesc/biblStruct/analytic
				497	##TMP my $analytic = $biblStruct->insert_new_elt("last_child", 'analytic');
				498	##TMP my $analytic_title = $analytic->insert_new_elt("last_child", 'title' => {type => "main"} );
				499	##TMP # my $analytic_date = $analytic->insert_new_elt("last_child", 'date');
				500	##TMP my $analytic_date_year = $analytic->insert_new_elt("last_child", 'date' => {type => "year"});
				501	##TMP my $analytic_date_month = $analytic->insert_new_elt("last_child", 'date' => {type => "month"});
				502	##TMP my $analytic_date_day = $analytic->insert_new_elt("last_child", 'date' => {type => "day"});
				503	##TMP my $analytic_idno_pageid = $analytic->insert_new_elt("last_child", 'idno' => {type => "PAGEID"});
				504	##TMP my $analytic_idno_bindingid = $analytic->insert_new_elt("last_child", 'idno' => {type => "BINDINGID"});
				505	##TMP my $analytic_idno_id = $analytic->insert_new_elt("last_child", 'idno' => {type => "ID"});
				506	##TMP my $analytic_idno_metafile = $analytic->insert_new_elt("last_child", 'idno' => {type => "KIELIPANKKI_METAFILENAME"});
				507	##TMP my $analytic_idno_origfile = $analytic->insert_new_elt("last_child", 'idno' => {type => "KIELIPANKKI_ORIGFILENAME"});
				508	##TMP my $analytic_textlang = $analytic->insert_new_elt("last_child", 'textLang');
				509	##TMP
				510	##TMP # set texts for analytic
				511	##TMP # $analytic_title ->set_text($LABEL . ", page " . $PAGENO); # Achtung $PAGENO scheint meist "None zu sein"
				512	##TMP $analytic_title ->set_text($LABEL . ", Text #" . $textcounter); # Achtung $PAGENO scheint meist "None zu sein"
				513	##TMP # $analytic_date ->set_text($DATE);
				514	##TMP $analytic_date_year ->set_text($datearray[0]);
				515	##TMP $analytic_date_month ->set_text($datearray[1]);
				516	##TMP $analytic_date_day ->set_text($datearray[2]);
				517	##TMP $analytic_idno_pageid ->set_text($PAGEID);
				518	##TMP $analytic_idno_bindingid->set_text($BID);
				519	##TMP $analytic_idno_id ->set_text($ID);
				520	##TMP $analytic_idno_metafile ->set_text($METAFILENAME);
				521	##TMP $analytic_idno_origfile ->set_text($ORIGFILENAME);
				522	##TMP $analytic_textlang ->set_text($LANGUAGE);
				523	##TMP
				524	##TMP #-------------------------------------
				525	##TMP # fileDesc/sourceDesc/biblStruct/monogr
				526	##TMP #-------------------------------------
				527	##TMP my $monogr = $biblStruct->insert_new_elt("last_child", 'monogr');
				528	##TMP my $monogr_title = $monogr ->insert_new_elt("last_child", 'title');
				529	##TMP my $imprint = $monogr ->insert_new_elt("last_child", 'imprint'); # imprint is needed for valididty
				530	##TMP my $pubPlace = $imprint ->insert_new_elt("last_child", 'pubPlace'); # imprint is needed for validity
				531	##TMP my $publisher = $imprint ->insert_new_elt("last_child", 'publisher'); # imprint is needed for validity
				532	##TMP my $biblScope_issuetitle = $monogr ->insert_new_elt("last_child", 'biblScope' => {unit => 'ISSUETITLE'} );
				533	##TMP my $biblScope_issueno = $monogr ->insert_new_elt("last_child", 'biblScope' => {unit => 'ISSUENO'} );
				534	##TMP my $biblScope_issuedate = $monogr ->insert_new_elt("last_child", 'biblScope' => {unit => 'ISSUEDATE'} );
				535	##TMP my $biblScope_pp = $monogr ->insert_new_elt("last_child", 'biblScope' => {unit => 'PAGENO'} ); # Achtung PAGENO ist meist "None" ?
				536	##TMP
				537	##TMP # set texts for monogr
				538	##TMP $monogr_title ->set_text($PUBLTITLE);
				539	##TMP $pubPlace ->set_text("TODO");
				540	##TMP $pubPlace ->set_att("key",'FI');
				541	##TMP $publisher ->set_text("TODO");
				542	##TMP $biblScope_issuetitle->set_text($ISSUETITLE);
				543	##TMP $biblScope_issueno ->set_text($ISSUENO);
				544	##TMP $biblScope_issuedate ->set_text($ISSUEDATE);
				545	##TMP $biblScope_pp ->set_text($PAGENO);
				546	##TMP
				547	##TMP #---------------
				548	##TMP # encodingDesc
				549	##TMP #---------------
				550	##TMP my $tagsDecl = $encodingDesc->insert_new_elt("last_child", 'tagsDecl');
				551	##TMP my $namespace = $tagsDecl ->insert_new_elt("last_child", 'namespace' => {name => 'http://www.tei-c.org/ns/1.0'});
				552	##TMP my $tagUsage_s = $namespace ->insert_new_elt("last_child", 'tagUsage' => {gi => 's', occurs => $SENTCOUNT});
				553	##TMP my $tagUsage_w = $namespace ->insert_new_elt("last_child", 'tagUsage' => {gi => 'w', occurs => $TOKENCOUNT});
				554	##TMP
				555	##TMP #-------------
				556	##TMP # profileDesc
				557	##TMP #-------------
				558	##TMP my $langUsage = $profileDesc ->insert_new_elt("last_child", 'langUsage');
				559	##TMP my $language = $langUsage ->insert_new_elt("last_child", 'language' => {ident => $LANGUAGE, usage => $SUMLANG});
				560	##TMP # Achtung in @usage muss eigt. ein integer; am besten inhalt von SUMLANG aufdroeseln und mehrere <language> machen
				561	##TMP my $textClass = $profileDesc ->insert_new_elt("last_child", 'textClass');
				562	##TMP my $classCode_fi = $textClass ->insert_new_elt("last_child", 'classCode' => {scheme => "KLK_PUBLTYPE"});
				563	##TMP # my $classCode_en = $textClass ->insert_new_elt("last_child", 'classCode' => {scheme => "KLK_PUBLTYPE_MAPPED"});
				564	##TMP
				565	##TMP #---------------------------
				566	##TMP # set texts for profileDesc
				567	##TMP #---------------------------
				568	##TMP $classCode_fi ->set_text($PUBLTYPE);
				569	##TMP # $classCode_en->set_text($PUBLTYPETRANSL);
				570	##TMP
				571	##TMP #---------------
				572	##TMP # revisionDesc
				573	##TMP #---------------
				574	##TMP my $change = $revisionDesc ->insert_new_elt("last_child", 'change' => {when => localtime->ymd('-'), who => 'HL' });
				575	##TMP
				576	##TMP # set texts for revisionDesc
				577	##TMP $change->set_text("TEI version for EuReCo");
Harald Lüngen	9d4e046	2024-08-23 09:34:22 +0300	[diff] [blame]	578
Harald Lüngen	9d4e046	2024-08-23 09:34:22 +0300	[diff] [blame]	579
Harald Lüngen	9d4e046	2024-08-23 09:34:22 +0300	[diff] [blame]	580
Harald Lüngen	9d4e046	2024-08-23 09:34:22 +0300	[diff] [blame]	581
				582
Harald Lüngen	db5e6e7	2024-09-04 17:41:18 +0300	[diff] [blame]	583
				584
				585
Harald Lüngen	9d4e046	2024-08-23 09:34:22 +0300	[diff] [blame]	586	###################################
				587	# END OF CREATING TEIHEADER
				588	###################################
				589
				590	}
				591
				592	sub setP {
				593	my ($paragraph) = @_;
				594
				595	$paragraph->set_gi('p');
				596
				597	# <paragraph id="p-bcd0f3fa-bbd3dac4-815ead7a" sum_lang="\|fin:1\|">
				598	# atts of <paragraph>:
				599	# @id USE
				600	# @sum_lang USE: put in xml:lang and prefix the value with "x-" for private value
				601
				602	$paragraph->set_att("xml:lang", "x-" . $paragraph->att("sum_lang"));
				603	$paragraph->del_att("sum_lang");
Harald Lüngen	a20e69d	2024-08-29 13:33:08 +0300	[diff] [blame]	604	# $paragraph->change_att_name('id', 'xml:id');
				605	$paragraph->del_att("id"); # diese id ist auch nicht eindeutig!!
Harald Lüngen	9d4e046	2024-08-23 09:34:22 +0300	[diff] [blame]	606	}
				607	sub setS {
				608	my ($sentence) = @_;
				609
				610	$sentence->set_gi('s');
				611
				612	# the atts of <sentence>:
				613	# USE 1 @id="s-bcd0f3fa-bbd3dac4-f7429090"
				614	# USE 2 @lang="fin" -> xml:lang
				615	# ? 3 @lang_conf="0.6734853"> -> ToDo @cert ?
				616
				617	# set attrs of <s>
				618	$sentence->set_att("xml:lang", $sentence->att("lang")); # ToDo: convert the value / introduce a hash for lookup (input values: "fin", "xxx", ....)
Harald Lüngen	faf8d48	2024-08-27 21:19:47 +0300	[diff] [blame]	619	# $sentence->change_att_name('id', 'xml:id'); # nicht eindeutig
				620	$sentence->del_att('id');
Harald Lüngen	9d4e046	2024-08-23 09:34:22 +0300	[diff] [blame]	621	$sentence->del_att("lang"); # replaced by xml:lang
				622	$sentence->del_att("lang_conf"); # for the time being
				623
				624	}
				625
				626	sub createW {
				627	my ($w_element, $line) = @_;
				628
				629	#---------------------------
				630	# Get the tags (=columns)
				631	#---------------------------
				632
				633	my @tags = split(/\t/, $line);
				634
				635	# set content of <w> i.e. the token
				636	$w_element->set_text($tags[0]);
				637
				638	# vrt positional-attributes in corpus KLK:
				639	# USE [0] word
				640	# USE [1] ref (id for reference of dephead)
				641	# USE [2] lemma
				642	# ? [3] lemmacomp (lemma with compound info - could go in @norm, as tag abuse?)
				643	# USE [4] pos
				644	# USE [5] msd
				645	# USE [6] dephead
				646	# USE [7] deprel
				647	# [8] content (ocr-process)
				648	# [9] vpos (ocr-process)
				649	# [10] ocr (ocr-process)
				650	# [11] cc (ocr-process)
				651	# [12] hyph (ocr-process)
				652	# [13] style (ocr-process)
				653	# [14] lex (korp semantic disambiguation from G"oteborg)
				654
				655	# set the attributes of <w>:
				656	$w_element->set_att("n", $tags[1]);
				657	# $w_element->set_att("id", "w_" . $namearray[4] . $sentence->att("xml:id") . "_" . $tags[1]);
				658	# so zusammengebaute ID ist auch nicht eindeutig...
Harald Lüngen	faf8d48	2024-08-27 21:19:47 +0300	[diff] [blame]	659	$w_element->del_att("id");
Harald Lüngen	9d4e046	2024-08-23 09:34:22 +0300	[diff] [blame]	660	$w_element->set_att("lemma", $tags[2]);
				661	# $w_element->set_att("norm", $tags[3]); # tag abuse of @norm
				662	$w_element->set_att("pos", $tags[4]);
				663	$w_element->set_att("msd", $tags[5]);
Harald Lüngen	ccd8490	2024-08-27 16:03:47 +0300	[diff] [blame]	664	#TMP $w_element->set_att("head", $tags[6]);
				665	#TMP $w_element->set_att("deprel", $tags[7]);
Harald Lüngen	9d4e046	2024-08-23 09:34:22 +0300	[diff] [blame]	666
				667	}
				668
Harald Lüngen	db5e6e7	2024-09-04 17:41:18 +0300	[diff] [blame]	669
				670	sub set_title{
				671	my ($corpusHeader, $source, $yy, $kielipankkiCorpus) = @_;
				672
				673	my $cTitleString = $source . " " . $yy . ", from ". $kielipankkiCorpus . " for EuReCo";
				674
				675	#<teiHeader>
				676	# <fileDesc>
				677	# <titleStmt>
				678	# <title>[Aamulehti2021] from [klk-fi-v2-vrt for] EuReCo</title>
				679	# </titleStmt>
				680	# <!-- ... -->
				681	# </fileDesc>
				682	#</teiHeader>
				683
				684	my $cTitleNode = $corpusHeader->first_child("fileDesc")->first_child("titleStmt")->first_child("title");
				685
				686	$cTitleNode->set_text($cTitleString);
				687
				688	}
				689
				690	sub set_sourceDesc{
				691	my ($corpusHeader, $source, $yy, $kielipankkiCorpus) = @_;
				692
				693	my $cBiblString = $source . " " . $yy . ", from ". $kielipankkiCorpus . " for EuReCo";
				694
				695	#<teiHeader>
				696	# <fileDesc>
				697	# <!-- ... -->
				698	# <sourceDesc>
				699	# <bibl>[Aamulehti2021] from [klk-fi-v2-vrt]</bibl>
				700	# </sourceDesc>
				701	# <!-- ... -->
				702	# </fileDesc>
				703	#</teiHeader>
				704
				705	my $cBiblNode = $corpusHeader->first_child("fileDesc")->first_child("sourceDesc")->first_child("bibl");
				706
				707	$cBiblNode->set_text($cBiblString);
				708	}
				709
				710
				711
				712
				713
				714
				715
Harald Lüngen	9d4e046	2024-08-23 09:34:22 +0300	[diff] [blame]	716	#################
				717	## usage_message
				718	#################
				719
				720
				721	sub usage_message {
Harald Lüngen	a7e9162	2024-08-23 17:33:11 +0300	[diff] [blame]	722	print " Usage: ./vrt2tei.pl <file.vrt.xml> <outfile>\n";
Harald Lüngen	9d4e046	2024-08-23 09:34:22 +0300	[diff] [blame]	723	print " <file.vrt.xml> is a VRT file converted to proper XML\n";
				724	exit;
				725	}
				726
				727