blob: 6c280baeb237aab44a557b3489fd50c874fba5bd [file] [log] [blame]
Harald Lüngen9d4e0462024-08-23 09:34:22 +03001#! /usr/bin/perl -w
2
3
4###########################################################################################################################################################
5# vrt2tei.pl
6# eureco
7# leibniz-institut fuer deutsche sprache / csc finland esbo
8# august 2024
9#
10#
# using XML::Twig, see http://www.xmltwig.org/, https://metacpan.org/pod/XML::Twig
12#
# usage: see the usage_message function below
# Usage: ./vrt2tei.pl <vrtxmlfile.xml> > <outfile>   (the result is written to standard output)
Harald Lüngen9d4e0462024-08-23 09:34:22 +030015# <vrtxmlfile>: xml-ised vrt file
16#
17#
18# TODO:
19# 1 insert dtd spec, or ref to TEI
20
Harald Lüngendb5e6e72024-09-04 17:41:18 +030021# 3a remove the vrt positional attribute comment line / all comment lines
Harald Lüngen9d4e0462024-08-23 09:34:22 +030022# 3b add @head and @deprel to I5 sowie auch @msd
23# 3c bearbeitung von @head und @deprel in tei2korapxml durch Nils?
24# 3d build 30 billion corpus
25
26# 4a take care of IDs
27# 4b see to the values of @xml:lang
28# 5 abfangen von unerwarteten elementen dh andere als <sentence> und <paragraph>
29# 5a wort reihenfolge nochmal checken
30# 6 checks and balances
Harald Lüngen9d4e0462024-08-23 09:34:22 +030031# 7 How to encode Kielipankki and National Library of Finland? in teiCorpus Header
32# 8 construct <idsDoc>s for the months (or go for TEI)
33# 9 parallelisation in bash and application on sub corpora of KLK
34# 10 re-implementation of the gawk code in the perl script
35# 12 re-implement creation of text header from xml file in another twig / parametrize TEI vs I5
36
37
38
39#remember
40#formatted.xml:105613: element w: validity error : No declaration for attribute deprel of element w
41#formatted.xml:105613: element w: validity error : No declaration for attribute head of element w
42
43
44#
45#
46############################################################################################################################################################
47
48
49use strict;
50use warnings;
Harald Lüngen2551a952024-09-15 08:08:35 +030051use diagnostics;
Harald Lüngen9d4e0462024-08-23 09:34:22 +030052
Harald Lüngen2551a952024-09-15 08:08:35 +030053use Getopt::Std;
Harald Lüngen9d4e0462024-08-23 09:34:22 +030054use XML::Twig;
55use XML::Generator ':pretty'; # apparently no effect when using flush();
56
57
58use locale; # diese drei Zeilen, damit \b im regex nicht Umlaute und ß matcht.
59use POSIX qw(locale_h); # to be able to use setlocale()
60#setlocale(LC_ALL,'de_DE');
61setlocale(LC_ALL, "fi_FI");
62use utf8;
63use open qw( :std :encoding(UTF-8) );
64
65use Time::Piece;
66use Tie::IxHash;
67
Harald Lüngendb5e6e72024-09-04 17:41:18 +030068
69
Harald Lüngen9d4e0462024-08-23 09:34:22 +030070#----------------------
71# check file arguments:
72#----------------------
73
74# arg0 infile: vrt-xml
Harald Lüngen9d4e0462024-08-23 09:34:22 +030075
# Exactly one argument is accepted: the VRT-XML input file.
# The number of arguments is checked (rather than the truth value of
# $ARGV[0] / $ARGV[1]) so that arguments such as "0" or "" are judged correctly.
usage_message() unless @ARGV == 1;
Harald Lüngen9d4e0462024-08-23 09:34:22 +030078
79
Harald Lüngen2551a952024-09-15 08:08:35 +030080#--------------------------
81# get options / auxiliary files
82#--------------------------
83
84
85
86
87
Harald Lüngen9d4e0462024-08-23 09:34:22 +030088####################
89# GLOBAL VARIABLES
90####################
91
# Encoding of the OUTPUT only; it is handed to XML::Twig as output_encoding
# when the twig for the input document is created.
my $encoding = "UTF-8";

# Target format of the conversion: "TEI" or "I5".
#my $TEIFORMAT = "TEI";
my $TEIFORMAT = "I5";

# Number of <text> elements handled so far; appended to the text titles.
my $textcounter = 0;

# Source metadata tables, filled from $sourcescsvfile.
# Each table is keyed by the full title and by the simple title of a source.
our %corpussigles  = ();
our %srcfullnames  = ();
our %srcpubplaces  = ();
our %srcpublishers = ();
our %srctexttypes  = ();
our %srctextlangs  = ();

# Next running document number, counted by the month as in derekox.
my %doccounter = (
  "01" => 1,
  "02" => 1,
  "03" => 1,
  "04" => 1,
  "05" => 1,
  "06" => 1,
  "07" => 1,
  "08" => 1,
  "09" => 1,
  "10" => 1,
  "11" => 1,
  "12" => 1,
);

# Manually prepared table with the metadata of the sources.
my $sourcescsvfile = "sources_klk_fi_v2_2021_4eureco.csv";

# Skeleton documents the corpus header and the text headers are taken from.
my $corpheaderfile = "teiCorpusHeaderSkeleton.tei.xml";
my $textheaderfile = "teiTextHeaderSkeleton.tei.xml";
if ($TEIFORMAT eq "I5") {
  # $corpheaderfile = "i5CorpusHeaderSkeleton.i5.xml";
  $textheaderfile = "i5TextHeaderSkeleton.i5.xml";
}

# Twig of the input document; assigned in the MAIN section.
my $twig;

# global variables pertaining to the original corpus :
my $kielipankkiCorpus = "klk-fi-v2-vrt";

# Month number => month abbreviation used in the text sigles.
my %months = (
  "01" => "JAN",
  "02" => "FEB",
  "03" => "MAR",
  "04" => "APR",
  "05" => "MAY",
  "06" => "JUN",
  "07" => "JUL",
  "08" => "AUG",
  "09" => "SEP",
  "10" => "OCT",
  "11" => "NOV",
  "12" => "DEC",
);

# KLK publication type (@publ_type) => German text type label.
my %mapping = (
  "aikakausi"   => "Zeitschrift",
  "sanomalehti" => "Zeitung",
);
157
158
159
Harald Lüngendb5e6e72024-09-04 17:41:18 +0300160#------------------------------------------------------------------
161# read corpusHeaderSkeleton document and get header out of it
162#------------------------------------------------------------------
163
my $teiCorpusHeaderDocTwig = XML::Twig->new(
  keep_spaces     => 1,
  keep_atts_order => 1,
  comments        => 'drop',
);

$teiCorpusHeaderDocTwig->parsefile($corpheaderfile);
# the root of the skeleton document is the header used for the corpus
my $corpusHeader = $teiCorpusHeaderDocTwig->root;


#------------------------------------------------------------------
# read textHeaderSkeleton document and get header out of it
#------------------------------------------------------------------

my $teiTextHeaderDocTwig = XML::Twig->new(
  keep_spaces     => 1,
  keep_atts_order => 1,
  comments        => 'drop',
);

$teiTextHeaderDocTwig->parsefile($textheaderfile);
# the root of the skeleton document is the header used (as a copy) for each text
my $textHeader = $teiTextHeaderDocTwig->root;


# <idsDoc> element; for I5 it is given an (still empty) header of the shape
#   idsHeader/fileDesc/titleStmt/(dokumentSigle, d.title)
my $idsDoc = XML::Twig::Elt->new('idsDoc');
if ($TEIFORMAT eq "I5") {

  my $idsDocHeader = XML::Twig::Elt->new('idsHeader');
  my $docFileDesc  = XML::Twig::Elt->new('fileDesc');
  my $docTitleStmt = XML::Twig::Elt->new('titleStmt');
  my $dtitle       = XML::Twig::Elt->new('d.title');
  my $docSigle     = XML::Twig::Elt->new('dokumentSigle');

  $docSigle    ->paste("first_child", $docTitleStmt);
  $dtitle      ->paste("last_child",  $docTitleStmt);
  $docTitleStmt->paste("last_child",  $docFileDesc);
  $docFileDesc ->paste("last_child",  $idsDocHeader);
  $idsDocHeader->paste("last_child",  $idsDoc);

  # ToDo set dummy dtitle and docSigle
}
206
207
208
209
Harald Lüngendb5e6e72024-09-04 17:41:18 +0300210#----------------------------------
211# read input VRT-XML document
212#----------------------------------
213
# Fail early if the input file cannot be read. The handle itself is not needed
# afterwards because parsefile() (see below) is applied to the file name.
open(my $IN, '<', $ARGV[0]) or die("$0: cannot open file for reading: $ARGV[0]: $!");
close($IN);

#-------------------------------------------------------------------------------------------
# read source metadata file (prepared manually => ultimately read the info from CMDI File?)
#-------------------------------------------------------------------------------------------

open(my $SOURCES, '<', $sourcescsvfile) or die("$0: cannot open file for reading: $sourcescsvfile: $!");
while (my $fline = <$SOURCES>) {
  chomp($fline);

  # skip comment lines, empty lines and the header line
  next if $fline =~ /^\#/ || $fline =~ /^\s*$/ || $fline =~ /TEICORPUSID/;

  # columns used: [0] corpus sigle, [1] full title, [2] simple title,
  #               [4] text type, [5] text language, [6] pubPlace, [7] publisher
  my @flarray = split(/\s*\t+\s*/, $fline);

  # the full title ([1]) and the simple title ([2]) both serve as keys
  for my $title (@flarray[1, 2]) {
    next unless defined $title && length $title;   # short line: no usable key

    $corpussigles{$title}  = $flarray[0];
    $srcfullnames{$title}  = $flarray[1];
    $srcpubplaces{$title}  = $flarray[6];
    $srcpublishers{$title} = $flarray[7];
    $srctexttypes{$title}  = $flarray[4];
    $srctextlangs{$title}  = $flarray[5];
  }
}
close($SOURCES);
Harald Lüngen86cbd932024-09-10 15:52:18 +0300245
246
Harald Lüngendb5e6e72024-09-04 17:41:18 +0300247
Harald Lüngen9d4e0462024-08-23 09:34:22 +0300248#####################
249# M A I N
250#####################
251
Harald Lüngendb5e6e72024-09-04 17:41:18 +0300252#-------------------------------------------------------------------------------------------------------------
253# start twig for input and call start tag handler for root and twig handler for each <text> in the VRT
254#-------------------------------------------------------------------------------------------------------------
Harald Lüngen9d4e0462024-08-23 09:34:22 +0300255
Harald Lüngen9d4e0462024-08-23 09:34:22 +0300256
Harald Lüngenba0354b2024-09-11 16:24:08 +0300257
$twig = XML::Twig->new(
  keep_spaces     => 1,       # this also keeps whitespace at former element boundaries in the output
  keep_atts_order => 1,       # requires Tie::IxHash
  comments        => 'drop',
  start_tag_handlers => {
    # called on the start tag of the root element <texts>
    texts => sub { root(@_, $corpusHeader) },
  },

  twig_handlers => {
    # text => \&text
    # a copy is needed for every <text> because the header is pasted into the
    # text and flushed together with $twig in the <text> handler
    text => sub { text(@_, $textHeader->copy) },
  },
  # dtd_handlers => {   # ToDo for I5
  #   \&set_dtd;
  # }

  output_encoding => $encoding,
);

$twig->parsefile($ARGV[0]);
278
Harald Lüngendb5e6e72024-09-04 17:41:18 +0300279
280
Harald Lüngen9d4e0462024-08-23 09:34:22 +0300281###########
282# END MAIN
283###########
284
285
286
287
288##############################
289# S U B R O U T I N E S
290##############################
291
292# sub set_dtd [
293# my $twig, $dtd = @_;
294# my $internal = qq|\nPUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"\n"DTD/xhtml1-strict.dtd"|;
295#
296# $twig->twig_doctype('html', undef, undef, $internal);
297# }
298
299
300
# Start tag handler for the root element of the VRT-XML input.
#   $twig         - the twig of the input document (not used here)
#   $root         - the root element; renamed to <teiCorpus>
#   $corpusHeader - the corpus header element to be inserted under $root
sub root {
  my ($twig, $root, $corpusHeader) = @_;

  $root->set_gi('teiCorpus');
  $root->set_att("xmlns", 'http://www.tei-c.org/ns/1.0');

  insertCorpusHeader($root, $corpusHeader);
}
309
310
Harald Lüngendb5e6e72024-09-04 17:41:18 +0300311
# Fills title and source description of the corpus header and pastes the header
# as first child of $root, followed by the global $idsDoc element.
#   $root         - the root element of the output document
#   $corpusHeader - the corpus header element taken from the skeleton
# Source name and year are derived from the input file name in $ARGV[0], which
# is expected to look like <source><yyyy>.xml.
sub insertCorpusHeader {
  my ($root, $corpusHeader) = @_;

  #-------------------------------------------------------------
  # take fnsource and year from the current xml input filename
  #-------------------------------------------------------------

  my $fnsource = (split(/\//, $ARGV[0]))[-1] // '';   # last path component

  # $1 is only read after a successful match; otherwise it could be undefined
  # or still hold the capture of an earlier, unrelated match.
  my $year = '';
  if ($fnsource =~ s/([0-9][0-9][0-9][0-9])\.xml$//) {
    $year = $1;
  }
  else {
    warn "$0: cannot take the year from file name '$ARGV[0]' (expected <source><yyyy>.xml)\n";
  }

  # fall back to the name from the file name if the source is not in the table
  my $source = $srcfullnames{$fnsource};
  unless (defined $source) {
    warn "$0: no source metadata found for '$fnsource'\n";
    $source = $fnsource;
  }

  #-----------------------
  # set corpus header
  #-----------------------

  set_title(     $corpusHeader, $source, $year, $kielipankkiCorpus);
  set_sourceDesc($corpusHeader, $source, $year, $kielipankkiCorpus);

  $corpusHeader->paste("first_child", $root);
  $idsDoc      ->paste("after", $corpusHeader);
}
336
Harald Lüngendb5e6e72024-09-04 17:41:18 +0300337
Harald Lüngen9d4e0462024-08-23 09:34:22 +0300338#----------------------------
339# handler &text for <text>
340#----------------------------
341
# Twig handler for each <text> of the VRT-XML input: turns the element into a
# <TEI> or <idsText> element (depending on $TEIFORMAT) with its own header,
# converts <paragraph>/<sentence>/token lines into <p>/<s>/<w> and flushes the
# result to standard output.
#   $twig       - the twig of the input document
#   $text       - the <text> element
#   $textHeader - a copy of the text header skeleton, to be filled for this text
sub text {
  my ($twig, $text, $textHeader) = @_;

  $textcounter++;

  # ToDo: catch all other, unexpected children of root

  #--------------------------------------------------------------------------
  # Get text metadata (attributes of <text>) and create teiHeader for <text>
  #--------------------------------------------------------------------------

  my $textattsref = $text->atts();   # reference to the attribute hash, to be used with '->'

  # createTextHeader returns the $textID:
  my $textID = createTextHeader($text, $textattsref, $textHeader);

  #----------------------------------------
  # create <TEI> or <idsText> from <text>
  #----------------------------------------

  # all attributes of the vrt <text> are deleted after they were saved above
  $text->del_atts;

  if ($TEIFORMAT eq "TEI") {
    $text->set_gi("TEI");
    $text->set_att('xml:id', $textID);
  }
  else {
    $text->set_gi("idsText");
    $text->set_att('version', "1.0");
    # $text->move("last_child", $idsDoc); # does not work because apparently $idsDoc is not under $root at this point
  }

  #------------------------------------------------------------------
  # create the <tei:text>, <body>, <div> elements inside <TEI>
  #------------------------------------------------------------------

  my $ttext_element = XML::Twig::Elt->new('text');
  my $body_element  = XML::Twig::Elt->new('body');
  my $div_element   = XML::Twig::Elt->new('div');

  # set atts
  $div_element  ->set_att("type", "page");      # ToDo: this is specific to KLK
  $ttext_element->set_att("xml:lang", 'fi');    # as in ICC-NOR

  # paste
  $ttext_element->paste('last_child', $text);
  $body_element ->paste('last_child', $ttext_element);
  $div_element  ->paste('last_child', $body_element);

  #-------------------------------
  # create <p> from <paragraph>
  #-------------------------------

  foreach my $paragraph ($text->children('paragraph')) {

    setP($paragraph);
    $paragraph->move('last_child', $div_element);

    #------------------------------
    # create <s> from <sentence>
    #------------------------------

    foreach my $sentence ($paragraph->children('sentence')) {

      setS($sentence);

      #--------------------------------------
      # create <w> (word) from each $line
      #--------------------------------------

      # text() yields the unescaped content. The escaped form (xml_text) must
      # not be used here: the values are stored again with set_text/set_att and
      # escaped on output, which would turn e.g. "&amp;" into "&amp;amp;".
      my @lines = split(/\n+/, $sentence->text);
      $sentence->set_text("\n");

      for my $line (@lines) {   # Todo: check the order
        next if $line eq "";

        my $w_element = XML::Twig::Elt->new('w');
        createW($w_element, $line);
        $w_element->paste('last_child', $sentence);
      } # end words
    } # end sentences
  } # end paragraphs

  # $twig->set_pretty_print( 'record');
  # $twig->flush($OUT);
  # flush() takes an optional filehandle, not a file name; without argument
  # the output goes to standard output
  $twig->flush;
}
Harald Lüngen9d4e0462024-08-23 09:34:22 +0300441
# Fills the text header skeleton with the metadata of one <text> and pastes it
# as first child of the text.
#   $text        - the <text> element
#   $textattsref - reference to the attribute hash of <text>
#   $textHeader  - copy of the text header skeleton (TEI or I5, see $TEIFORMAT)
# Returns the text ID, e.g. "SUK21_JAN.00001"; the counter of the month in
# %doccounter is incremented as a side effect.
# Attributes missing on <text> and values missing in the lookup tables are
# treated as empty strings.
# The skeleton is expected to contain all elements addressed below; the method
# calls fail otherwise.
sub createTextHeader {
  my ($text, $textattsref, $textHeader) = @_;

  # USE 01 binding_id="2246025"
  # USE 02 date="2021-01-15"
  #     03 datefrom="20210115"
  #     04 dateto="20210115"
  #     05 elec_date="_"
  #     06 file=""
  # USE 07 filename_metadata="finclarin_siirto_k2021/0039-5552/2021/alto/2246025_0039-5552_2021-01-15_SK0221_mets.xml"
  # USE 08 filename_orig ="finclarin_siirto_k2021/0039-5552/2021/alto/2246025_0039-5552_2021-01-15_SK0221_page-2021011502210030301.xml
  # USE 09 id="t-bcd0f3fa-bbd3dac4"
  #     10 img_url=""
  # USE 11 issue_date="15.01.2021"
  # USE 12 issue_no="SK0221"
  # USE 13 issue_title="Suomen Kuvalehti"
  # USE 14 label="Suomen Kuvalehti no. SK0221 15.01.2021"
  # USE 16 language="fi"
  # USE 17 page_id="p1"
  # USE 18 page_no="None"
  #     19 part_name="_"
  #     20 publ_id="0039-5552"
  #     21 publ_part=""
  # USE 22 publ_title="Suomen Kuvalehti"
  # USE 23 publ_type="aikakausi"
  # USE 24 sentcount="70"
  # USE 25 sum_lang="|xxx:44|fin:23|eng:3|"
  #     26 timefrom="000000"
  #     27 timeto="235959"
  # USE 28 tokencount="304"
  #     29 version_added="KLK-fi-2021">

  my $BID          = $textattsref->{'binding_id'}        // '';
  my $DATE         = $textattsref->{'date'}              // '';
  my $METAFILENAME = $textattsref->{'filename_metadata'} // '';
  my $ORIGFILENAME = $textattsref->{'filename_orig'}     // '';
  my $ID           = $textattsref->{'id'}                // '';
  my $ISSUEDATE    = $textattsref->{'issue_date'}        // '';
  my $ISSUENO      = $textattsref->{'issue_no'}          // '';
  my $ISSUETITLE   = $textattsref->{'issue_title'}       // '';
  my $LABEL        = $textattsref->{'label'}             // '';
  my $LANGUAGE     = $textattsref->{'language'}          // '';
  my $PAGEID       = $textattsref->{'page_id'}           // '';
  my $PAGENO       = $textattsref->{'page_no'}           // '';
  my $PUBLTITLE    = $textattsref->{'publ_title'}        // '';
  my $PUBLTYPE     = $textattsref->{'publ_type'}         // '';
  my $SENTCOUNT    = $textattsref->{'sentcount'}         // '';
  my $SUMLANG      = $textattsref->{'sum_lang'}          // '';
  my $TOKENCOUNT   = $textattsref->{'tokencount'}        // '';

  #-----------------------------
  # Derived Metadata variables
  #-----------------------------

  # $DATE has the form yyyy-mm-dd
  my @datearray = split(/-/, $DATE);
  my ($year, $mm, $day) = map { $datearray[$_] // '' } 0 .. 2;

  my $mappedType = $mapping{$PUBLTYPE}        // '';
  my $pubPlace   = $srcpubplaces{$PUBLTITLE}  // '';
  my $publisher  = $srcpublishers{$PUBLTITLE} // '';

  #----------------------------------------------------
  # create textSigle to be returned from this function
  #----------------------------------------------------

  # SUK21.JAN.00001

  my $corpusID = "SUK";                                      # ToDo read Table with Source metadata
  my $yy  = length($year) >= 2 ? substr($year, 2, 2) : '';   # last two digits of the year
  my $MMM = $months{$mm} // '';

  my $textID    = $corpusID . $yy . "_" . $MMM . "." . sprintf("%05d", $doccounter{$mm}++);
  my $textSigle = $textID;

  #-----------------------------------------------------------------------
  # CREATE text-teiHeader ACCORDING TO THE SKELETON in $textHeader
  #-----------------------------------------------------------------------

  $textHeader->paste('first_child', $text);

  #-----------------------------------------------
  # <teiHeader>
  #   <fileDesc n="EuReCo-KLK-FIN_[$ID]">
  #     <titleStmt>
  #       <title>[$LABEL, page $PAGENO]</title>

  my $fileDesc = $textHeader->first_child("fileDesc");

  # NOTE(review): the value is built without a separator between the corpus
  # name and $ID, unlike the "_" in the pattern above - confirm which is intended
  $fileDesc->set_att('n', "EuReCo-" . $kielipankkiCorpus . $ID);

  #-----------------
  # titleStmt
  #----------------

  # name of the title element; it differs between TEI and I5 and, in I5,
  # between titleStmt (t.title) and analytic/monogr (h.title)
  my $title     = "title";
  my $titleStmt = $fileDesc->first_child("titleStmt");

  if ($TEIFORMAT eq "I5") {
    $title = "t.title";
    $textSigle =~ tr{_}{/};
    $titleStmt->first_child("textSigle")->set_text($textSigle);
  }

  # Case KLK; PAGENO seems to be "None" most of the time
  my $textTitle = $LABEL . ", Text #" . $textcounter;
  $titleStmt->first_child($title)->set_text($textTitle);

  #-----------------------------------------------
  # <fileDesc>
  #   <sourceDesc>
  #     <biblStruct>
  #       <analytic>
  #         <title type="main">[$LABEL, page $PAGENO]</title>
  #         <date>[$DATE]</date>
  #         <date type="year">TODO</date>
  #         <date type="month">TODO</date>
  #         <date type="day">TODO</date>
  #         <idno type="PAGEID">$PAGEID</idno>
  #         <idno type="BINDINGID">$BID</idno>
  #         <idno type="ID">$ID</idno>
  #         <idno type="KIELIPANKKI_METAFILENAME">$METAFILENAME</idno>
  #         <idno type="KIELIPANKKI_ORIGFILENAME">$ORIGFILENAME</idno>
  #         <textLang>$LANGUAGE</textLang>

  my $analytic = $textHeader->get_xpath("./fileDesc/sourceDesc/biblStruct/analytic", 0);
  if ($TEIFORMAT eq "I5") { $title = "h.title" }

  $analytic->first_child($title)                                     ->set_text($textTitle);
  $analytic->get_xpath('./idno[@type="PAGEID"]', 0)                   ->set_text($PAGEID);
  $analytic->get_xpath('./idno[@type="BINDINGID"]', 0)                ->set_text($BID);
  $analytic->get_xpath('./idno[@type="ID"]', 0)                       ->set_text($ID);
  $analytic->get_xpath('./idno[@type="KIELIPANKKI_METAFILENAME"]', 0) ->set_text($METAFILENAME);
  $analytic->get_xpath('./idno[@type="KIELIPANKKI_ORIGFILENAME"]', 0) ->set_text($ORIGFILENAME);
  if ($TEIFORMAT eq "TEI") {
    $analytic->first_child('textLang')->set_text($LANGUAGE);
  }

  #       <monogr>
  #         <title>$PUBLTITLE</title>
  #         <imprint>
  #           <pubPlace>TODO</pubPlace>
  #           <publisher>TODO</publisher>
  #         </imprint>
  #         <biblScope unit="ISSUETITLE"/>
  #         <biblScope unit="ISSUENO"/>
  #         <biblScope unit="ISSUEDATE"/>
  #         <biblScope unit="pp">$PAGENO</biblScope>

  my $monogr = $textHeader->get_xpath("./fileDesc/sourceDesc/biblStruct/monogr", 0);

  $monogr->first_child($title)->set_text($PUBLTITLE);
  if ($TEIFORMAT eq "TEI") {
    $monogr->get_xpath('./imprint/date[@type="date"]', 0)->set_text($DATE);
  }

  # name of the date elements inside <imprint>
  my $date = "date";
  if ($TEIFORMAT eq "I5") { $date = "pubDate" }

  $monogr->get_xpath('./imprint/' . $date . '[@type="year"]', 0)  ->set_text($year);
  $monogr->get_xpath('./imprint/' . $date . '[@type="month"]', 0) ->set_text($mm);
  $monogr->get_xpath('./imprint/' . $date . '[@type="day"]', 0)   ->set_text($day);

  # imprint is needed for tei validity
  my $imprint = $monogr->first_child("imprint");
  $imprint->first_child("pubPlace") ->set_text($pubPlace);
  $imprint->first_child("publisher")->set_text($publisher);

  $monogr->get_xpath('./biblScope[@unit="ISSUETITLE"]', 0) ->set_text($ISSUETITLE);
  $monogr->get_xpath('./biblScope[@unit="ISSUENO"]', 0)    ->set_text($ISSUENO);
  $monogr->get_xpath('./biblScope[@unit="ISSUEDATE"]', 0)  ->set_text($ISSUEDATE);
  $monogr->get_xpath('./biblScope[@unit="pp"]', 0)         ->set_text($PAGENO);   # caution - PAGENO seems to be "None" most of the time

  my $dateNice      = $day  . "." . $mm . "." . $year;   # dd.mm.yyyy
  my $dateBackwards = $year . "." . $mm . "." . $day;    # yyyy.mm.dd

  if ($TEIFORMAT eq "I5") {
    my $refCompleteText = $textSigle . " " . $PUBLTITLE . " no. " . $ISSUENO . ", [" . $mappedType . "], " . $dateNice;
    my $refShortText    = $textSigle . " " . $PUBLTITLE . ", " . $dateNice;
    $textHeader->get_xpath('./fileDesc/sourceDesc/reference[@type="complete"]', 0)->set_text($refCompleteText);
    $textHeader->get_xpath('./fileDesc/sourceDesc/reference[@type="short"]', 0)   ->set_text($refShortText);
  }

  # <encodingDesc>
  #   <tagsDecl>
  #     <namespace name="http://www.tei-c.org/ns/1.0">
  #       <tagUsage gi="s" occurs="SENTCOUNT"/>
  #       <tagUsage gi="w" occurs="TOKENCOUNT"/>

  # in the I5 skeleton <tagUsage> is addressed directly under <tagsDecl>
  my $namespacePath = "./encodingDesc/tagsDecl/namespace/";
  if ($TEIFORMAT eq "I5") { $namespacePath = "./encodingDesc/tagsDecl/" }

  $textHeader->get_xpath($namespacePath . 'tagUsage[@gi="s"]', 0)->set_att('occurs', $SENTCOUNT);
  $textHeader->get_xpath($namespacePath . 'tagUsage[@gi="w"]', 0)->set_att('occurs', $TOKENCOUNT);

  # <profileDesc>
  #   <langUsage>
  #     <language ident="fi" usage="|xxx:44|fin:23|eng:3|"/>
  #   </langUsage>
  #   <textClass>
  #     <classCode scheme="kielipankki_klk">$PUBLTYPE</classCode>
  #     <classCode scheme="kielipankki_klk_mapped">$mapping{$PUBLTYPE}</classCode>

  if ($TEIFORMAT eq "I5") {
    $textHeader->get_xpath('./profileDesc/creation/creatDate', 0)->set_text($dateBackwards);
  }

  my $language = $textHeader->get_xpath('./profileDesc/langUsage/language', 0);
  $language->set_att('ident', $LANGUAGE);
  $language->set_att('usage', $SUMLANG);
  # @usage should actually hold an integer; it would be best to split up the
  # content of SUMLANG and create several <language> elements. Note that the
  # split pattern has to be /\|/ - a bare "|" is an alternation of two empty
  # patterns and splits the string into single characters.

  $textHeader->get_xpath('./profileDesc/textClass/classCode[@scheme="kielipankki_klk"]', 0)       ->set_text($PUBLTYPE);
  $textHeader->get_xpath('./profileDesc/textClass/classCode[@scheme="kielipankki_klk_mapped"]', 0)->set_text($mappedType);

  # <revisionDesc>
  #   <change when="TODO" who="HL">TEI version for EuReCo</change>

  $textHeader->get_xpath('./revisionDesc/change', 0)->set_att('when', localtime->ymd('-'));

  #-----------------------------------
  # END OF CREATING TEIHEADER
  #-----------------------------------

  return $textID;
}
Harald Lüngen695ac1d2024-09-05 08:55:21 +0300665
# Turns a vrt <paragraph> element into a <p> element.
#   $paragraph - the <paragraph> element, modified in place
sub setP {
  my ($paragraph) = @_;

  $paragraph->set_gi('p');

  # <paragraph id="p-bcd0f3fa-bbd3dac4-815ead7a" sum_lang="|fin:1|">
  # atts of <paragraph>:
  # @id
  # @sum_lang USE: put in xml:lang and prefix the value with "x-" for private value

  # a missing @sum_lang is treated as the empty string
  my $sum_lang = $paragraph->att("sum_lang") // '';
  $paragraph->set_att("xml:lang", "x-" . $sum_lang);
  $paragraph->del_att("sum_lang");
  # $paragraph->change_att_name('id', 'xml:id');
  $paragraph->del_att("id");   # this id is not unique either!!
}
# Turns a vrt <sentence> element into an <s> element.
#   $sentence - the <sentence> element, modified in place
sub setS {
  my ($sentence) = @_;

  $sentence->set_gi('s');

  # the atts of <sentence>:
  # USE 1 @id="s-bcd0f3fa-bbd3dac4-f7429090"
  # USE 2 @lang="fin" -> xml:lang
  # ?   3 @lang_conf="0.6734853"> -> ToDo @cert ?

  # set attrs of <s>; xml:lang is only set if the sentence has a @lang
  # ToDo: convert the value / introduce a hash for lookup (input values: "fin", "xxx", ....)
  my $lang = $sentence->att("lang");
  $sentence->set_att("xml:lang", $lang) if defined $lang;

  # $sentence->change_att_name('id', 'xml:id'); # not unique
  $sentence->del_att('id');
  $sentence->del_att("lang");        # replaced by xml:lang
  $sentence->del_att("lang_conf");   # for the time being
}
699
# Fills a <w> element from one token line of the vrt input.
#   $w_element - the (new) <w> element, modified in place
#   $line      - the token line: tab-separated positional attributes
# Columns missing from the line are treated as empty strings, and a warning is
# issued if fewer than the 6 used columns are present.
sub createW {
  my ($w_element, $line) = @_;

  #---------------------------
  # Get the tags (=columns)
  #---------------------------

  # limit -1 keeps trailing empty columns
  my @tags = split(/\t/, $line, -1);

  if (@tags < 6) {
    warn "createW: expected at least 6 tab-separated columns, found " . scalar(@tags) . " in line: $line\n";
  }
  my ($word, $ref, $lemma, $lemmacomp, $pos, $msd) = map { $tags[$_] // '' } 0 .. 5;

  # set content of <w> i.e. the token
  $w_element->set_text($word);

  # vrt positional-attributes in corpus KLK:
  # USE [0] word
  # USE [1] ref (id for reference of dephead)
  # USE [2] lemma
  # ?   [3] lemmacomp (lemma with compound info - could go in @norm, as tag abuse?)
  # USE [4] pos
  # USE [5] msd
  # USE [6] dephead
  # USE [7] deprel
  #     [8] content (ocr-process)
  #     [9] vpos (ocr-process)
  #     [10] ocr (ocr-process)
  #     [11] cc (ocr-process)
  #     [12] hyph (ocr-process)
  #     [13] style (ocr-process)
  #     [14] lex (korp semantic disambiguation from G"oteborg)

  # set the attributes of <w>:
  $w_element->set_att("n", $ref);
  # $w_element->set_att("id", "w_" . $namearray[4] . $sentence->att("xml:id") . "_" . $tags[1]);
  # an ID put together like this is not unique either...
  $w_element->del_att("id");
  $w_element->set_att("lemma", $lemma);
  # $w_element->set_att("norm", $tags[3]); # tag abuse of @norm
  $w_element->set_att("pos", $pos);
  #TMP $w_element->set_att("head", $tags[6]);
  #TMP $w_element->set_att("deprel", $tags[7]);
  $w_element->set_att("msd", $msd);
}
742
Harald Lüngendb5e6e72024-09-04 17:41:18 +0300743
# Sets the title of the corpus header.
#   $corpusHeader      - the corpus header element
#   $source            - name of the source, e.g. "Aamulehti"
#   $year              - year of the source, e.g. "2021"
#   $kielipankkiCorpus - name of the original corpus, e.g. "klk-fi-v2-vrt"
# An undefined $source or $year is treated as the empty string.
sub set_title {
  my ($corpusHeader, $source, $year, $kielipankkiCorpus) = @_;

  my $cTitleString = ($source // '') . " " . ($year // '') . ", from " . $kielipankkiCorpus . " for EuReCo";

  #<teiHeader>
  #  <fileDesc>
  #    <titleStmt>
  #      <title>[Aamulehti2021] from [klk-fi-v2-vrt for] EuReCo</title>
  #    </titleStmt>
  #    <!-- ... -->
  #  </fileDesc>
  #</teiHeader>

  my $cTitleNode = $corpusHeader->first_child("fileDesc")->first_child("titleStmt")->first_child("title");

  $cTitleNode->set_text($cTitleString);
}
763
# Sets the bibliographic source description (<bibl>) of the corpus header.
#   $corpusHeader      - the corpus header element
#   $source            - name of the source, e.g. "Aamulehti"
#   $year              - year of the source, e.g. "2021"
#   $kielipankkiCorpus - name of the original corpus, e.g. "klk-fi-v2-vrt"
# An undefined $source or $year is treated as the empty string.
sub set_sourceDesc {
  my ($corpusHeader, $source, $year, $kielipankkiCorpus) = @_;

  my $cBiblString = ($source // '') . " " . ($year // '') . ", from " . $kielipankkiCorpus . " for EuReCo";

  #<teiHeader>
  #  <fileDesc>
  #    <!-- ... -->
  #    <sourceDesc>
  #      <bibl>[Aamulehti2021] from [klk-fi-v2-vrt]</bibl>
  #    </sourceDesc>
  #    <!-- ... -->
  #  </fileDesc>
  #</teiHeader>

  my $cBiblNode = $corpusHeader->first_child("fileDesc")->first_child("sourceDesc")->first_child("bibl");

  $cBiblNode->set_text($cBiblString);
}
783
784
785
Harald Lüngen9d4e0462024-08-23 09:34:22 +0300786#################
787## usage_message
788#################
789
790
# Prints the usage message to standard error and terminates the script with a
# non-zero exit status.
# The script accepts exactly one argument and writes its result to standard
# output, so the output file is given as a shell redirection.
sub usage_message {
  print STDERR " Usage: ./vrt2tei.pl <file.vrt.xml> > <outfile>\n";
  print STDERR " <file.vrt.xml> is a VRT file converted to proper XML\n";
  exit 1;
}
796
797