#! /usr/bin/perl -w


####################################################################################################
# vrt2tei.pl
# eureco
# leibniz-institut fuer deutsche sprache / csc finland esbo
# august 2024
#
#
# using XML::Twig, see http://www.xmltwig.org/, https://metacpan.org/pod/XML::Twig
#
# usage: see the usage_message function below
# Usage: ./vrt2tei.pl <vrtxmlfile.xml> > <outfile>
# <vrtxmlfile>: xml-ised vrt file (the TEI output is written to STDOUT)
#
#
# TODO:
# 1 insert dtd spec, or ref to TEI

# 3a remove the vrt positional attribute comment line / all comment lines
# 3b add @head and @deprel to I5, as well as @msd
# 3c handling of @head and @deprel in tei2korapxml by Nils?
# 3d build 30 billion corpus

# 4a take care of IDs
# 4b see to the values of @xml:lang
# 5 catch unexpected elements, i.e. elements other than <sentence> and <paragraph>
# 5a re-check the word order
# 6 checks and balances
# 7 How to encode Kielipankki and National Library of Finland? in teiCorpus Header
# 8 construct <idsDoc>s for the months (or go for TEI)
# 9 parallelisation in bash and application on sub corpora of KLK
# 10 re-implementation of the gawk code in the perl script
# 12 re-implement creation of text header from xml file in another twig / parametrize TEI vs I5



# remember (validation errors seen so far):
# formatted.xml:105613: element w: validity error : No declaration for attribute deprel of element w
# formatted.xml:105613: element w: validity error : No declaration for attribute head of element w


#
#
####################################################################################################
47
48
49use strict;
50use warnings;
51
52use XML::Twig;
53use XML::Generator ':pretty'; # apparently no effect when using flush();
54
55
56use locale; # diese drei Zeilen, damit \b im regex nicht Umlaute und ß matcht.
57use POSIX qw(locale_h); # to be able to use setlocale()
58#setlocale(LC_ALL,'de_DE');
59setlocale(LC_ALL, "fi_FI");
60use utf8;
61use open qw( :std :encoding(UTF-8) );
62
63use Time::Piece;
64use Tie::IxHash;
65
Harald Lüngendb5e6e72024-09-04 17:41:18 +030066
67
#----------------------
# check file arguments:
#----------------------

# arg0 infile: vrt-xml

# Exactly one non-empty argument is accepted: the XML-ised VRT input file.
# Counting the arguments (instead of testing their truth value) also accepts
# a file that is literally named "0" and rejects an empty extra argument.
usage_message() if @ARGV != 1 || $ARGV[0] eq '';
Harald Lüngen9d4e0462024-08-23 09:34:22 +030076
77
####################
# GLOBAL VARIABLES
####################

# Encoding of the generated output ONLY; it is handed to the input twig as
# its output_encoding option.
my $encoding = 'UTF-8';
#my $encoding = "iso-8859-1";

# Running number of the <text> elements handled so far; incremented by the
# <text> handler and used in the titles written into the text headers.
my $textcounter = 0;

# Declared here and initialised with the empty string; $twig is assigned the
# input twig object in the main section.
my $twig               = q{};
my $teiCorpusHeaderDoc = q{};
90
91
#------------------------------------------------------------------
# read corpusHeaderSkeleton document and get header out of it
#------------------------------------------------------------------

# Both skeleton files are looked up relative to the current working directory.
# The twig objects are kept in file-level variables for the whole run, next to
# the root elements taken from them.
my $teiCorpusHeaderDocTwig = XML::Twig->new(
    keep_spaces     => 1,
    keep_atts_order => 1,
    comments        => 'drop',
);

$teiCorpusHeaderDocTwig->parsefile("teiCorpusHeaderSkeleton.tei.xml");

# the root element of the teiCorpusHeaderSkeleton document is the teiHeader for the corpus
my $corpusHeader = $teiCorpusHeaderDocTwig->root;


#------------------------------------------------------------------
# read textHeaderSkeleton document and get header out of it
#------------------------------------------------------------------

my $teiTextHeaderDocTwig = XML::Twig->new(
    keep_spaces     => 1,
    keep_atts_order => 1,
    comments        => 'drop',
);

$teiTextHeaderDocTwig->parsefile("teiTextHeaderSkeleton.tei.xml");

# the root element of the teiTextHeaderSkeleton document is the teiHeader for a text
my $textHeader = $teiTextHeaderDocTwig->root;
119
120
#----------------------------------
# read input VRT-XML document
#----------------------------------

# The handle is only used to fail early, with the system error message, when
# the input file cannot be read; the actual reading is done by parsefile(),
# which is applied to the file name. Three-argument open keeps special
# characters in the file name from being interpreted as part of the mode.
open(my $IN, '<', $ARGV[0])
    or die("$0: cannot open file for reading: $ARGV[0]: $!\n");
close($IN);

#-----------------------------------------------------
# global variables pertaining to the original corpus
#-----------------------------------------------------

my $kielipankkiCorpus = "klk-fi-v2-vrt";
133
134
135
136
#####################
# M A I N
#####################

#-------------------------------------------------------------------------------------------------------------
# start twig for input and call start tag handler for root and twig handler for each <text> in the VRT
#-------------------------------------------------------------------------------------------------------------

$twig = XML::Twig->new(
    keep_spaces     => 1,    # also keeps the whitespace at former element boundaries in the output
    keep_atts_order => 1,    # requires Tie::IxHash
    comments        => 'drop',
    start_tag_handlers => {
        # start tag of the VRT root element <texts>
        texts => sub { root(@_, $corpusHeader) },
    },
    twig_handlers => {
#       text => \&text
        # every completely parsed <text>
        text => sub { text(@_, $textHeader) },
    },
    # dtd_handlers => {       # ToDo for I5
    #     \&set_dtd;
    # }

    output_encoding => $encoding,
);

$twig->parsefile($ARGV[0]);

# The <text> handler flushes after every text. Whatever is still pending when
# the parse is over (at least the end tag of the root element) has to be
# flushed explicitly, otherwise the output is not a complete document.
$twig->flush(\*STDOUT);


###########
# END MAIN
###########
173
174
175
176
177##############################
178# S U B R O U T I N E S
179##############################
180
181# sub set_dtd [
182# my $twig, $dtd = @_;
183# my $internal = qq|\nPUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"\n"DTD/xhtml1-strict.dtd"|;
184#
185# $twig->twig_doctype('html', undef, undef, $internal);
186# }
187
188
189
# Start-tag handler for the VRT root element (registered for <texts>).
# Turns the root into <teiCorpus> in the TEI namespace and has the corpus
# header inserted into it.
#
#   $twig         - the twig object (not used here)
#   $root         - the root element whose start tag was just parsed
#   $corpusHeader - corpus teiHeader element, passed on to insertCorpusHeader
sub root {
    my ($twig, $root, $corpusHeader) = @_;

    # rename the element and declare the TEI namespace on it
    $root->set_gi('teiCorpus');
    $root->set_att(xmlns => 'http://www.tei-c.org/ns/1.0');

    insertCorpusHeader($root, $corpusHeader);
}
198
199
Harald Lüngendb5e6e72024-09-04 17:41:18 +0300200
# Fills title and source description of the corpus teiHeader and pastes the
# header as first child of the root element.
#
#   $root         - the root element of the output document
#   $corpusHeader - corpus teiHeader element taken from the skeleton document
#
# Source name and year are derived from the name of the input file
# ($ARGV[0]), which is expected to look like <source><yyyy>.xml. If it does
# not, a warning is printed, the whole file name is used as source and the
# year is left empty.
#
# Returns whatever paste() returns.
sub insertCorpusHeader{
    my ($root, $corpusHeader) = @_;

    #---------------------------------------------------------------------------
    # get some metadata for the current output corpus based on source and year
    #---------------------------------------------------------------------------

    # last path component of the input file name
    my $basename = (split m{/}, $ARGV[0])[-1];

    # The captures are only read inside the branch of the successful match,
    # so a stale $1 from an earlier match can never end up in the header.
    my ($source, $yy) = ($basename, '');
    if ($basename =~ m{\A (.*) ([0-9][0-9][0-9][0-9]) \.xml $}xs) {
        ($source, $yy) = ($1, $2);
    }
    else {
        warn "$0: cannot derive source and year from file name '$basename'\n";
    }

    #-----------------------
    # set corpus header
    #-----------------------

    # to do: also get name of corpus (klk-fi-v2-vrt)
    set_title(     $corpusHeader, $source, $yy, $kielipankkiCorpus);
    set_sourceDesc($corpusHeader, $source, $yy, $kielipankkiCorpus);

    return $corpusHeader->paste("first_child", $root);
}
231
Harald Lüngendb5e6e72024-09-04 17:41:18 +0300232
#----------------------------
# handler &text for <text>
#----------------------------

# Twig handler for every completely parsed VRT <text> element.
#
# Converts the element in place into a <TEI> element:
#   - a teiHeader is created from the attributes of <text> (createTextHeader),
#   - <text xml:lang="fi"><body><div type="page"> is added inside <TEI>,
#   - every <paragraph> child becomes a <p> inside that <div>,
#   - every <sentence> child of a paragraph becomes an <s>,
#   - every non-blank line of a sentence becomes a <w>.
# Finally the twig is flushed to STDOUT.
#
#   $twig       - the twig object driving the parse
#   $text       - the <text> element
#   $textHeader - text teiHeader skeleton element, passed on to createTextHeader
sub text {
    my ($twig, $text, $textHeader) = @_;

    $textcounter++;    # file-level variable

    # ToDo: catch all other, unexpected children of root

    #--------------------------------------------------------------------------
    # Get text metadata (attributes of <text>) and create teiHeader for <text>
    #--------------------------------------------------------------------------

    my $textattsref = $text->atts();    # a reference to a hash, to be used with '->'

    createTextHeader($text, $textattsref, $textHeader);

    #--------------------------
    # create <TEI> from <text>
    #--------------------------

    # turn vrt <text> into <TEI> and delete all attributes, now that they were used above
    $text->del_atts;
    $text->set_gi("TEI");

    #------------------------------------------------------------------
    # create the <tei:text>, <body>, <div> elements inside <TEI>
    #------------------------------------------------------------------

    my $ttext_element = XML::Twig::Elt->new('text');
    my $body_element  = XML::Twig::Elt->new('body');
    my $div_element   = XML::Twig::Elt->new('div');

    # set atts
    $div_element  ->set_att("type", "page");       # ToDo: this is specific to KLK
    $ttext_element->set_att("xml:lang", 'fi');     # as in ICC-NOR

    # paste
    $ttext_element->paste('last_child', $text);
    $body_element ->paste('last_child', $ttext_element);
    $div_element  ->paste('last_child', $body_element);

    #-------------------------------
    # create <p> from <paragraph>
    #-------------------------------

    # The list of paragraphs is built once, before the loop starts moving them.
    foreach my $paragraph ($text->children('paragraph')) {

        setP($paragraph);

        $paragraph->move('last_child', $div_element);

        #------------------------------
        # create <s> from <sentence>
        #------------------------------

        foreach my $sentence ($paragraph->children('sentence')) {

            setS($sentence);

            #--------------------------------------
            # create <w> (word) from each $line
            #--------------------------------------

            # text() returns the unescaped character data. xml_text() would
            # return it with '&', '<' ... already escaped, and that would be
            # escaped a second time when the new <w> elements are printed.
            my @lines = split(/\n+/, $sentence->text);
            $sentence->set_text("\n");

            for my $line (@lines) {    # Todo: check the order
                # skip empty and whitespace-only lines, they are not tokens
                next unless $line =~ /\S/;

                my $w_element = XML::Twig::Elt->new('w');
                createW($w_element, $line);
                $w_element->paste('last_child', $sentence);
            } # end words
        } # end sentences
    } # end paragraphs

    # $twig->set_pretty_print( 'record');
    # flush() takes an optional filehandle, not a file name
    $twig->flush(\*STDOUT);
}
Harald Lüngen9d4e0462024-08-23 09:34:22 +0300322
# Creates the teiHeader of one text from the header skeleton and pastes it
# as first child of the text element.
#
#   $text        - the element that receives the header
#   $textattsref - reference to the hash of the attributes of the VRT <text>
#   $textHeader  - text teiHeader skeleton element; it is only copied, never
#                  modified or moved
#
# Dies with a message naming the path when the skeleton lacks one of the
# elements that are filled here. Returns nothing.
sub createTextHeader{
    my ($text, $textattsref, $textHeader) = @_;

    # Attributes of <text> in KLK (USE = taken over into the header):
    # USE 01 binding_id="2246025"
    # USE 02 date="2021-01-15"
    #     03 datefrom="20210115"
    #     04 dateto="20210115"
    #     05 elec_date="_"
    #     06 file=""
    # USE 07 filename_metadata="finclarin_siirto_k2021/0039-5552/2021/alto/2246025_0039-5552_2021-01-15_SK0221_mets.xml"
    # USE 08 filename_orig ="finclarin_siirto_k2021/0039-5552/2021/alto/2246025_0039-5552_2021-01-15_SK0221_page-2021011502210030301.xml
    # USE 09 id="t-bcd0f3fa-bbd3dac4"
    #     10 img_url=""
    # USE 11 issue_date="15.01.2021"
    # USE 12 issue_no="SK0221"
    # USE 13 issue_title="Suomen Kuvalehti"
    # USE 14 label="Suomen Kuvalehti no. SK0221 15.01.2021"
    # USE 16 language="fi"
    # USE 17 page_id="p1"
    # USE 18 page_no="None"
    #     19 part_name="_"
    #     20 publ_id="0039-5552"
    #     21 publ_part=""
    # USE 22 publ_title="Suomen Kuvalehti"
    # USE 23 publ_type="aikakausi"
    # USE 24 sentcount="70"
    # USE 25 sum_lang="|xxx:44|fin:23|eng:3|"
    #     26 timefrom="000000"
    #     27 timeto="235959"
    # USE 28 tokencount="304"
    #     29 version_added="KLK-fi-2021">

    # A missing attribute is treated as the empty string. Not all of these
    # values are written to the header yet; the others are kept for the parts
    # that are still to be filled (see the ##TMP reference code below).
    my $BID          = $textattsref->{'binding_id'}        // '';
    my $DATE         = $textattsref->{'date'}              // '';
    my $METAFILENAME = $textattsref->{'filename_metadata'} // '';
    my $ORIGFILENAME = $textattsref->{'filename_orig'}     // '';
    my $ID           = $textattsref->{'id'}                // '';
    my $ISSUEDATE    = $textattsref->{'issue_date'}        // '';
    my $ISSUENO      = $textattsref->{'issue_no'}          // '';
    my $ISSUETITLE   = $textattsref->{'issue_title'}       // '';
    my $LABEL        = $textattsref->{'label'}             // '';
    my $LANGUAGE     = $textattsref->{'language'}          // '';
    my $PAGEID       = $textattsref->{'page_id'}           // '';
    my $PAGENO       = $textattsref->{'page_no'}           // '';
    my $PUBLTITLE    = $textattsref->{'publ_title'}        // '';
    my $PUBLTYPE     = $textattsref->{'publ_type'}         // '';
    my $SENTCOUNT    = $textattsref->{'sentcount'}         // '';
    my $SUMLANG      = $textattsref->{'sum_lang'}          // '';
    my $TOKENCOUNT   = $textattsref->{'tokencount'}        // '';

    #-----------------------------
    # Derived Metadata variables
    #-----------------------------

    my @datearray = split(/-/, $DATE);                       # (year, month, day)
    # '|' has to be escaped: as a bare pattern it is an empty alternation and
    # would split the value into single characters
    my @langarray = grep { length } split(/\|/, $SUMLANG);   # e.g. ("xxx:44", "fin:23", "eng:3")
    my @namearray = split(/[\.\/]/, $ORIGFILENAME);          # use $namearray[4] as ID for the page

    #-----------------------------------------------------------------------
    # CREATE text-teiHeader ACCORDING TO THE SKELETON in $textHeader
    #-----------------------------------------------------------------------

    # Every text gets its own deep copy of the skeleton: paste() refuses an
    # element that already belongs to a tree, and the skeleton itself stays
    # untouched for the following texts.
    my $header = $textHeader->copy;
    $header->paste('first_child', $text);

    # Sets the text of the first element matching $xpath below $context.
    my $set_text_at = sub {
        my ($context, $xpath, $value) = @_;
        my $node = $context->get_xpath($xpath, 0);
        defined $node
            or die "$0: text header skeleton has no element matching '$xpath'\n";
        $node->set_text($value);
        return;
    };

    my $title = $LABEL . ", Text #" . $textcounter;    # Case KLK

    #-----------------------------------------------
    # <teiHeader>
    #   <fileDesc n="[EuReCo-KLK-FIN_$ID]">
    #     <titleStmt>
    #       <title>[$LABEL, page $PAGENO]</title>

    $set_text_at->($header, './fileDesc/titleStmt/title', $title);

    #-----------------------------------------------
    # <fileDesc>
    #   <sourceDesc>
    #     <biblStruct>
    #       <analytic>
    #         <title type="main">[$LABEL, page $PAGENO]</title>
    #         <date>[$DATE]</date>
    #         <date type="year">TODO</date>
    #         <date type="month">TODO</date>
    #         <date type="day">TODO</date>
    #         <idno type="PAGEID">$PAGEID</idno>
    #         <idno type="BINDINGID">$BID</idno>
    #         <idno type="ID">$ID</idno>
    #         <idno type="KIELIPANKKI_METAFILENAME">$METAFILENAME</idno>
    #         <idno type="KIELIPANKKI_ORIGFILENAME">$ORIGFILENAME</idno>
    #         <textLang>$LANGUAGE</textLang>
    #       </analytic>

    my $analytic = $header->get_xpath("./fileDesc/sourceDesc/biblStruct/analytic", 0);
    defined $analytic
        or die "$0: text header skeleton has no element matching './fileDesc/sourceDesc/biblStruct/analytic'\n";

    $set_text_at->($analytic, './title',                                  $title);
    $set_text_at->($analytic, './date[@type="year"]',                     $datearray[0] // '');
    $set_text_at->($analytic, './date[@type="month"]',                    $datearray[1] // '');
    $set_text_at->($analytic, './date[@type="day"]',                      $datearray[2] // '');
    $set_text_at->($analytic, './idno[@type="PAGEID"]',                   $PAGEID);
    $set_text_at->($analytic, './idno[@type="BINDINGID"]',                $BID);
    $set_text_at->($analytic, './idno[@type="ID"]',                       $ID);
    $set_text_at->($analytic, './idno[@type="KIELIPANKKI_METAFILENAME"]', $METAFILENAME);
    $set_text_at->($analytic, './idno[@type="KIELIPANKKI_ORIGFILENAME"]', $ORIGFILENAME);
    $set_text_at->($analytic, './textLang',                               $LANGUAGE);

    # TODO: ./fileDesc/sourceDesc/biblStruct/monogr is not filled yet:
    #       <monogr>
    #         <title>$PUBLTITLE</title>
    #         <imprint>
    #           <pubPlace>TODO</pubPlace>
    #           <publisher>TODO</publisher>
    #         </imprint>
    #         <biblScope unit="ISSUETITLE"/>
    #         <biblScope unit="ISSUENO"/>
    #         <biblScope unit="ISSUEDATE"/>
    #         <biblScope unit="pp">$PAGENO</biblScope>
    #       <monogr>

    # Former construction of the complete header in code, temporarily disabled
    # and kept as reference for the parts that are not filled from the skeleton yet:

##TMP # create <teiHeader> inside <TEI>
##TMP my $teiHeader = XML::Twig::Elt->new('teiHeader');
##TMP # $teiHeader->paste('first_child', $text);
##TMP
##TMP ## insert_new_elt is a combo of new and paste, cf. xml::twig docu:
##TMP ## insert_new_elt ($opt_position, $gi, $opt_atts_hashref, @opt_content)
##TMP
##TMP my $fileDesc = $teiHeader->insert_new_elt('fileDesc' => {n => "EuReCo_KLK-fi_" . $namearray[4]});
##TMP my $encodingDesc = $teiHeader->insert_new_elt("last_child", 'encodingDesc');
##TMP my $profileDesc = $teiHeader->insert_new_elt("last_child", 'profileDesc');
##TMP my $revisionDesc = $teiHeader->insert_new_elt("last_child", 'revisionDesc');
##TMP
##TMP #---------------------
##TMP # fileDesc/titleStmt
##TMP #---------------------
##TMP my $titleStmt = $fileDesc ->insert_new_elt('titleStmt');
##TMP my $title = $titleStmt->insert_new_elt("last_child", 'title');
##TMP my $respStmt = $titleStmt->insert_new_elt("last_child", 'respStmt');
##TMP my $resp = $respStmt ->insert_new_elt("last_child", 'resp');
##TMP my $name = $respStmt ->insert_new_elt("last_child", 'name');
##TMP
##TMP # set texts for titleStmt
##TMP # $title->set_text($LABEL . ", page " . $PAGENO); # caution - PAGENO mostly seems to be "None"
##TMP $title->set_text($LABEL . ", Text #" . $textcounter); # at least for Suomen Kuvalehti
##TMP $resp ->set_text("compiled by EuReCo");
##TMP $name ->set_text("EuReCo: HL");
##TMP
##TMP #--------------------------
##TMP # fileDesc/publicationStmt
##TMP #--------------------------
##TMP my $publicationStmt = $fileDesc ->insert_new_elt("last_child", 'publicationStmt');
##TMP my $distributor = $publicationStmt->insert_new_elt("last_child", 'distributor');
##TMP my $note = $distributor ->insert_new_elt("last_child", 'note');
##TMP my $availability = $publicationStmt->insert_new_elt("last_child", 'availability');
##TMP my $licence = $availability ->insert_new_elt("last_child", 'licence');
##TMP
##TMP # set texts for publicationStmt
##TMP $note ->set_text("NOT FOR DISTRIBUTION - to be used locally in EuReCo");
##TMP $licence->set_text("CLARIN_RES"); # TODO: more detailed licence info in the KLK metadata record
##TMP
##TMP #------------------------------
##TMP # fileDesc/sourceDesc/biblStruct
##TMP #------------------------------
##TMP my $sourceDesc = $fileDesc ->insert_new_elt("last_child", 'sourceDesc');
##TMP my $biblStruct = $sourceDesc->insert_new_elt("last_child", 'biblStruct');
##TMP
##TMP # fileDesc/sourceDesc/biblStruct/analytic
##TMP my $analytic = $biblStruct->insert_new_elt("last_child", 'analytic');
##TMP my $analytic_title = $analytic->insert_new_elt("last_child", 'title' => {type => "main"} );
##TMP # my $analytic_date = $analytic->insert_new_elt("last_child", 'date');
##TMP my $analytic_date_year = $analytic->insert_new_elt("last_child", 'date' => {type => "year"});
##TMP my $analytic_date_month = $analytic->insert_new_elt("last_child", 'date' => {type => "month"});
##TMP my $analytic_date_day = $analytic->insert_new_elt("last_child", 'date' => {type => "day"});
##TMP my $analytic_idno_pageid = $analytic->insert_new_elt("last_child", 'idno' => {type => "PAGEID"});
##TMP my $analytic_idno_bindingid = $analytic->insert_new_elt("last_child", 'idno' => {type => "BINDINGID"});
##TMP my $analytic_idno_id = $analytic->insert_new_elt("last_child", 'idno' => {type => "ID"});
##TMP my $analytic_idno_metafile = $analytic->insert_new_elt("last_child", 'idno' => {type => "KIELIPANKKI_METAFILENAME"});
##TMP my $analytic_idno_origfile = $analytic->insert_new_elt("last_child", 'idno' => {type => "KIELIPANKKI_ORIGFILENAME"});
##TMP my $analytic_textlang = $analytic->insert_new_elt("last_child", 'textLang');
##TMP
##TMP # set texts for analytic
##TMP # $analytic_title ->set_text($LABEL . ", page " . $PAGENO); # caution: $PAGENO mostly seems to be "None"
##TMP $analytic_title ->set_text($LABEL . ", Text #" . $textcounter); # caution: $PAGENO mostly seems to be "None"
##TMP # $analytic_date ->set_text($DATE);
##TMP $analytic_date_year ->set_text($datearray[0]);
##TMP $analytic_date_month ->set_text($datearray[1]);
##TMP $analytic_date_day ->set_text($datearray[2]);
##TMP $analytic_idno_pageid ->set_text($PAGEID);
##TMP $analytic_idno_bindingid->set_text($BID);
##TMP $analytic_idno_id ->set_text($ID);
##TMP $analytic_idno_metafile ->set_text($METAFILENAME);
##TMP $analytic_idno_origfile ->set_text($ORIGFILENAME);
##TMP $analytic_textlang ->set_text($LANGUAGE);
##TMP
##TMP #-------------------------------------
##TMP # fileDesc/sourceDesc/biblStruct/monogr
##TMP #-------------------------------------
##TMP my $monogr = $biblStruct->insert_new_elt("last_child", 'monogr');
##TMP my $monogr_title = $monogr ->insert_new_elt("last_child", 'title');
##TMP my $imprint = $monogr ->insert_new_elt("last_child", 'imprint'); # imprint is needed for validity
##TMP my $pubPlace = $imprint ->insert_new_elt("last_child", 'pubPlace'); # imprint is needed for validity
##TMP my $publisher = $imprint ->insert_new_elt("last_child", 'publisher'); # imprint is needed for validity
##TMP my $biblScope_issuetitle = $monogr ->insert_new_elt("last_child", 'biblScope' => {unit => 'ISSUETITLE'} );
##TMP my $biblScope_issueno = $monogr ->insert_new_elt("last_child", 'biblScope' => {unit => 'ISSUENO'} );
##TMP my $biblScope_issuedate = $monogr ->insert_new_elt("last_child", 'biblScope' => {unit => 'ISSUEDATE'} );
##TMP my $biblScope_pp = $monogr ->insert_new_elt("last_child", 'biblScope' => {unit => 'PAGENO'} ); # caution: PAGENO is mostly "None" ?
##TMP
##TMP # set texts for monogr
##TMP $monogr_title ->set_text($PUBLTITLE);
##TMP $pubPlace ->set_text("TODO");
##TMP $pubPlace ->set_att("key",'FI');
##TMP $publisher ->set_text("TODO");
##TMP $biblScope_issuetitle->set_text($ISSUETITLE);
##TMP $biblScope_issueno ->set_text($ISSUENO);
##TMP $biblScope_issuedate ->set_text($ISSUEDATE);
##TMP $biblScope_pp ->set_text($PAGENO);
##TMP
##TMP #---------------
##TMP # encodingDesc
##TMP #---------------
##TMP my $tagsDecl = $encodingDesc->insert_new_elt("last_child", 'tagsDecl');
##TMP my $namespace = $tagsDecl ->insert_new_elt("last_child", 'namespace' => {name => 'http://www.tei-c.org/ns/1.0'});
##TMP my $tagUsage_s = $namespace ->insert_new_elt("last_child", 'tagUsage' => {gi => 's', occurs => $SENTCOUNT});
##TMP my $tagUsage_w = $namespace ->insert_new_elt("last_child", 'tagUsage' => {gi => 'w', occurs => $TOKENCOUNT});
##TMP
##TMP #-------------
##TMP # profileDesc
##TMP #-------------
##TMP my $langUsage = $profileDesc ->insert_new_elt("last_child", 'langUsage');
##TMP my $language = $langUsage ->insert_new_elt("last_child", 'language' => {ident => $LANGUAGE, usage => $SUMLANG});
##TMP # caution: @usage should actually hold an integer; best to split up the content of SUMLANG and create several <language>
##TMP my $textClass = $profileDesc ->insert_new_elt("last_child", 'textClass');
##TMP my $classCode_fi = $textClass ->insert_new_elt("last_child", 'classCode' => {scheme => "KLK_PUBLTYPE"});
##TMP # my $classCode_en = $textClass ->insert_new_elt("last_child", 'classCode' => {scheme => "KLK_PUBLTYPE_MAPPED"});
##TMP
##TMP #---------------------------
##TMP # set texts for profileDesc
##TMP #---------------------------
##TMP $classCode_fi ->set_text($PUBLTYPE);
##TMP # $classCode_en->set_text($PUBLTYPETRANSL);
##TMP
##TMP #---------------
##TMP # revisionDesc
##TMP #---------------
##TMP my $change = $revisionDesc ->insert_new_elt("last_child", 'change' => {when => localtime->ymd('-'), who => 'HL' });
##TMP
##TMP # set texts for revisionDesc
##TMP $change->set_text("TEI version for EuReCo");

    ###################################
    # END OF CREATING TEIHEADER
    ###################################

    return;
}
591
# Turns a VRT <paragraph> element into a TEI <p>, in place.
#
#   $paragraph - the <paragraph> element, e.g.
#                <paragraph id="p-bcd0f3fa-bbd3dac4-815ead7a" sum_lang="|fin:1|">
#
# Attributes of <paragraph>:
#   @sum_lang  its value, prefixed with "x-" (private value), goes into @xml:lang
#   @id        dropped, because this id is not unique either
sub setP {
    my ($paragraph) = @_;

    $paragraph->set_gi('p');

    my $private_lang = "x-" . $paragraph->att("sum_lang");
    $paragraph->set_att("xml:lang", $private_lang);

    # $paragraph->change_att_name('id', 'xml:id');
    $paragraph->del_att("sum_lang", "id");
}
# Turns a VRT <sentence> element into a TEI <s>, in place.
#
#   $sentence - the <sentence> element
#
# Attributes of <sentence>:
#   1 @id="s-bcd0f3fa-bbd3dac4-f7429090"  dropped, not unique
#   2 @lang="fin"                         value goes into @xml:lang
#   3 @lang_conf="0.6734853"              dropped for the time being (ToDo: @cert ?)
sub setS {
    my ($sentence) = @_;

    $sentence->set_gi('s');

    # ToDo: convert the value / introduce a hash for lookup (input values: "fin", "xxx", ....)
    my $lang = $sentence->att("lang");
    $sentence->set_att("xml:lang", $lang);

    # $sentence->change_att_name('id', 'xml:id'); # not unique
    # @lang is replaced by @xml:lang
    $sentence->del_att('id', "lang", "lang_conf");
}
625
# Fills a <w> element from one token line of the VRT.
#
#   $w_element - the <w> element to fill
#   $line      - one token line; the positional attributes are separated by tabs
#
# vrt positional-attributes in corpus KLK (USE = taken over into <w>):
#   USE [0]  word       content of <w>
#   USE [1]  ref        (id for reference of dephead) -> @n
#   USE [2]  lemma      -> @lemma
#   ?   [3]  lemmacomp  (lemma with compound info - could go in @norm, as tag abuse?)
#   USE [4]  pos        -> @pos
#   USE [5]  msd        -> @msd
#   USE [6]  dephead    (@head, currently disabled)
#   USE [7]  deprel     (@deprel, currently disabled)
#       [8]  content    (ocr-process)
#       [9]  vpos       (ocr-process)
#       [10] ocr        (ocr-process)
#       [11] cc         (ocr-process)
#       [12] hyph       (ocr-process)
#       [13] style      (ocr-process)
#       [14] lex        (korp semantic disambiguation from G"oteborg)
sub createW {
    my ($w_element, $line) = @_;

    # the columns of the line
    my @tags = split(/\t/, $line);
    my ($word, $ref, $lemma, $lemmacomp, $pos, $msd) = @tags;

    # the token itself
    $w_element->set_text($word);

    # An ID composed like
    #   "w_" . $namearray[4] . $sentence->att("xml:id") . "_" . $tags[1]
    # would not be unique either, so <w> carries no id at all.
    $w_element->del_att("id");

    # the attributes of <w>
    $w_element->set_att("n", $ref);
    $w_element->set_att("lemma", $lemma);
    # $w_element->set_att("norm", $tags[3]); # tag abuse of @norm
    $w_element->set_att("pos", $pos);
#TMP $w_element->set_att("head", $tags[6]);
#TMP $w_element->set_att("deprel", $tags[7]);
    $w_element->set_att("msd", $msd);
}
668
Harald Lüngendb5e6e72024-09-04 17:41:18 +0300669
# Writes the corpus title into fileDesc/titleStmt/title of the corpus header.
#
#   $corpusHeader      - the corpus teiHeader element
#   $source            - name of the source
#   $yy                - year
#   $kielipankkiCorpus - name of the Kielipankki corpus the data is taken from
#
# Target in the header:
#   <teiHeader>
#     <fileDesc>
#       <titleStmt>
#         <title>[Aamulehti2021] from [klk-fi-v2-vrt for] EuReCo</title>
#       </titleStmt>
#       <!-- ... -->
#     </fileDesc>
#   </teiHeader>
sub set_title{
    my ($corpusHeader, $source, $yy, $kielipankkiCorpus) = @_;

    # walk down from the header to the title element
    my $node = $corpusHeader;
    $node = $node->first_child($_) for qw(fileDesc titleStmt title);

    $node->set_text("$source $yy, from $kielipankkiCorpus for EuReCo");
}
689
# Writes the bibliographic line into fileDesc/sourceDesc/bibl of the corpus header.
#
#   $corpusHeader      - the corpus teiHeader element
#   $source            - name of the source
#   $yy                - year
#   $kielipankkiCorpus - name of the Kielipankki corpus the data is taken from
#
# Target in the header:
#   <teiHeader>
#     <fileDesc>
#       <!-- ... -->
#       <sourceDesc>
#         <bibl>[Aamulehti2021] from [klk-fi-v2-vrt]</bibl>
#       </sourceDesc>
#       <!-- ... -->
#     </fileDesc>
#   </teiHeader>
sub set_sourceDesc{
    my ($corpusHeader, $source, $yy, $kielipankkiCorpus) = @_;

    my $fileDesc   = $corpusHeader->first_child("fileDesc");
    my $sourceDesc = $fileDesc->first_child("sourceDesc");
    my $bibl       = $sourceDesc->first_child("bibl");

    $bibl->set_text($source . " " . $yy . ", from " . $kielipankkiCorpus . " for EuReCo");
}
709
710
711
712
713
714
715
#################
## usage_message
#################

# Prints the usage message to STDERR and terminates the script with exit
# status 1.
#
# STDERR keeps the message out of redirected output, and the non-zero status
# lets calling scripts notice the failed call. The script takes exactly one
# argument and writes to STDOUT, so no <outfile> argument is advertised.
sub usage_message {
    print STDERR " Usage: ./vrt2tei.pl <file.vrt.xml> > <outfile>\n";
    print STDERR " <file.vrt.xml> is a VRT file converted to proper XML\n";
    print STDERR " the TEI output is written to STDOUT\n";
    exit 1;
}
726
727