blob: 63dd6fdf9ada9ebf53ecf1d374c29c4eb4cc50ee [file] [log] [blame]
Harald Lüngen9d4e0462024-08-23 09:34:22 +03001#! /usr/bin/perl -w
2
3
4###########################################################################################################################################################
5# vrt2tei.pl
6# eureco
7# leibniz-institut fuer deutsche sprache / csc finland esbo
8# august 2024
9#
10#
# using XML::Twig, see http://www.xmltwig.org/, https://metacpan.org/pod/XML::Twig
12#
# usage: see the usage function below
Harald Lüngencaab0802024-08-23 17:28:22 +030014# Usage: ./vrt2tei.pl <vrtxmlfile.xml> <outfile>
Harald Lüngen9d4e0462024-08-23 09:34:22 +030015# <vrtxmlfile>: xml-ised vrt file
16#
17#
18# TODO:
19# 1 insert dtd spec, or ref to TEI
20
Harald Lüngendb5e6e72024-09-04 17:41:18 +030021# 3a remove the vrt positional attribute comment line / all comment lines
Harald Lüngen9d4e0462024-08-23 09:34:22 +030022# 3b add @head and @deprel to I5 sowie auch @msd
23# 3c bearbeitung von @head und @deprel in tei2korapxml durch Nils?
24# 3d build 30 billion corpus
25
26# 4a take care of IDs
27# 4b see to the values of @xml:lang
28# 5 abfangen von unerwarteten elementen dh andere als <sentence> und <paragraph>
29# 5a wort reihenfolge nochmal checken
30# 6 checks and balances
Harald Lüngen9d4e0462024-08-23 09:34:22 +030031# 7 How to encode Kielipankki and National Library of Finland? in teiCorpus Header
32# 8 construct <idsDoc>s for the months (or go for TEI)
33# 9 parallelisation in bash and application on sub corpora of KLK
34# 10 re-implementation of the gawk code in the perl script
35# 12 re-implement creation of text header from xml file in another twig / parametrize TEI vs I5
36
37
38
39#remember
40#formatted.xml:105613: element w: validity error : No declaration for attribute deprel of element w
41#formatted.xml:105613: element w: validity error : No declaration for attribute head of element w
42
43
44#
45#
46############################################################################################################################################################
47
48
49use strict;
50use warnings;
Harald Lüngen381c2a22024-09-17 09:06:39 +030051#use diagnostics;
Harald Lüngen9d4e0462024-08-23 09:34:22 +030052
Harald Lüngen2551a952024-09-15 08:08:35 +030053use Getopt::Std;
Harald Lüngen9d4e0462024-08-23 09:34:22 +030054use XML::Twig;
55use XML::Generator ':pretty'; # apparently no effect when using flush();
56
57
58use locale; # diese drei Zeilen, damit \b im regex nicht Umlaute und ß matcht.
59use POSIX qw(locale_h); # to be able to use setlocale()
60#setlocale(LC_ALL,'de_DE');
61setlocale(LC_ALL, "fi_FI");
62use utf8;
63use open qw( :std :encoding(UTF-8) );
64
65use Time::Piece;
66use Tie::IxHash;
Harald Lüngen381c2a22024-09-17 09:06:39 +030067use Data::Random::String;
Harald Lüngendb5e6e72024-09-04 17:41:18 +030068
69
Harald Lüngen9d4e0462024-08-23 09:34:22 +030070#----------------------
71# check file arguments:
72#----------------------
73
74# arg0 infile: vrt-xml
Harald Lüngen9d4e0462024-08-23 09:34:22 +030075
# Exactly one command-line argument is expected: the input VRT-XML file.
# The arguments are counted instead of being tested for truth, so that a file
# literally named "0" is accepted and any surplus argument is rejected.
if (@ARGV < 1 || $ARGV[0] eq '') { usage_message() }    # min: arg0, the input file
if (@ARGV > 1)                   { usage_message() }    # max: arg0, the input file
Harald Lüngen9d4e0462024-08-23 09:34:22 +030078
79
Harald Lüngen2551a952024-09-15 08:08:35 +030080#--------------------------
81# get options / auxiliary files
82#--------------------------
83
84
85
86
87
Harald Lüngen9d4e0462024-08-23 09:34:22 +030088####################
89# GLOBAL VARIABLES
90####################
91
# This $encoding is ONLY used for the output (see the twig construction below).
my $encoding = "UTF-8";

# Output flavour: "TEI" or "I5".
#my $TEIFORMAT = "TEI";
my $TEIFORMAT = "I5";

my $textcounter = 0;    # incremented once per <text> in the text handler
my $LASTMONTH   = 0;

# Lookup tables keyed by source title; filled from the sources CSV file.
our %corpussigles  = ();
our %srcfullnames  = ();
our %srcpubplaces  = ();
our %srcpublishers = ();
our %srctexttypes  = ();
our %srctextlangs  = ();

our %expandLang = ();

# Running document number per month (as in DeReKo): "01" .. "12", each starting at 1.
my %doccounter = map { ( sprintf( "%02d", $_ ) => 1 ) } 1 .. 12;

my $sourcescsvfile = "sources_klk_fi_v2_2021_4eureco.csv";

# Header skeleton files; the I5 flavour uses its own pair of skeletons.
my $corpheaderfile = ( $TEIFORMAT eq "I5" )
    ? "i5CorpusHeaderSkeleton.i5.xml"
    : "teiCorpusHeaderSkeleton.tei.xml";
my $textheaderfile = ( $TEIFORMAT eq "I5" )
    ? "i5TextHeaderSkeleton.i5.xml"
    : "teiTextHeaderSkeleton.tei.xml";
Harald Lüngen9d4e0462024-08-23 09:34:22 +0300137
Harald Lüngendb5e6e72024-09-04 17:41:18 +0300138
my $twig = "";    # placeholder; assigned the XML::Twig object for the input document further below

# global variables pertaining to the original corpus:
my $kielipankkiCorpus  = "klk-fi-v2-vrt";
my $kielipankkiLicense = "CLARIN-RES";


# Variables $fnsource and $fnyear are taken from the input file name:
# the last path component must end in a four-digit year followed by ".xml";
# the year goes to $fnyear, whatever precedes it stays in $fnsource.
my @array    = split(/\//, $ARGV[0]);
my $l        = scalar(@array);
my $fnsource = $array[$l-1];
my $fnyear;

# Only trust $1 when the substitution actually matched; otherwise $1 would be
# undefined (or left over from an earlier match) and every later lookup by
# $fnsource / $fnyear would silently use wrong values.
if ($fnsource =~ s/([0-9][0-9][0-9][0-9])\.xml$//) {
    $fnyear = $1;
}
else {
    die("$0: cannot derive source name and year from file name (expected <source><yyyy>.xml): $ARGV[0]");
}
153
154
155# months
# Two-digit month numbers "01" .. "12", shared as keys by both month tables.
my @month_numbers = map { sprintf( "%02d", $_ ) } 1 .. 12;

# month number => three-letter month code
my %months;
@months{@month_numbers} = qw(JAN FEB MAR APR MAY JUN JUL AUG SEP OCT NOV DEC);

# month number => English month name
my %monthnames;
@monthnames{@month_numbers} = qw(
    January February March     April   May      June
    July    August   September October November December
);

# publication type as found in the corpus => German text type label
my %mapping = (
    "aikakausi"   => "Zeitschrift",
    "sanomalehti" => "Zeitung",
);
189
190
191
Harald Lüngen381c2a22024-09-17 09:06:39 +0300192#-------------------------------------------------------------------------------------------
193# read source metadata file (prepared manually => ultimately read the info from CMDI File?)
194# and set variables
195#-------------------------------------------------------------------------------------------
196
# Read the tab-separated source metadata table and fill the global lookup
# hashes. Columns used (0-based): 0 corpus sigle, 1 full title, 2 simple title,
# 4 text type, 5 text language, 6 place of publication, 7 publisher.
# Every record is registered under both its full title and its simple title.
open(my $SOURCES, "<", $sourcescsvfile) || die("$0: cannot open file for reading: $sourcescsvfile: $!");
while (my $fline = <$SOURCES>) {
    chomp($fline);

    # skip comment lines, empty lines and the column header line
    next if ($fline =~ /^\#/ || $fline =~ /^\s*$/ || $fline =~ /TEICORPUSID/);

    my @flarray = split(/\s*\t+\s*/, $fline);    # split each line into array

    my ($sigle, $fullname, $simplename, $texttype, $textlang, $pubplace, $publisher)
        = @flarray[0, 1, 2, 4, 5, 6, 7];

    # full title first, then simple title (same order as the columns)
    for my $key ($fullname, $simplename) {
        next unless defined $key;    # incomplete line: do not register an undefined key

        $corpussigles{$key}  = $sigle;
        $srcfullnames{$key}  = $fullname;
        $srcpubplaces{$key}  = $pubplace;
        $srcpublishers{$key} = $publisher;
        $srctexttypes{$key}  = $texttype;
        $srctextlangs{$key}  = $textlang;
    }
}
close($SOURCES);

# language code => English language name
$expandLang{"fi"} = "Finnish";
$expandLang{"sv"} = "Swedish";
224
Harald Lüngendb5e6e72024-09-04 17:41:18 +0300225#------------------------------------------------------------------
226# read corpusHeaderSkeleton document and get header out of it
227#------------------------------------------------------------------
228
# Parse the corpus header skeleton; its root element is the header that is
# later inserted into the output corpus.
my $teiCorpusHeaderDocTwig = XML::Twig->new(
    keep_spaces     => 1,
    keep_atts_order => 1,
    comments        => 'drop',
);

$teiCorpusHeaderDocTwig->parsefile($corpheaderfile);
my $corpusHeader = $teiCorpusHeaderDocTwig->root;    # header for the corpus, taken from the corpus header skeleton document


#------------------------------------------------------------------
# read textHeaderSkeleton document and get header out of it
#------------------------------------------------------------------

my $teiTextHeaderDocTwig = XML::Twig->new(
    keep_spaces     => 1,
    keep_atts_order => 1,
    comments        => 'drop',
);

$teiTextHeaderDocTwig->parsefile($textheaderfile);
my $textHeader = $teiTextHeaderDocTwig->root;    # header for a text, taken from the text header skeleton document
252
253
# Container <idsDoc> and its <idsHeader>; both are created in every mode, but
# only wired up and filled when the output format is I5.
my $idsDoc       = XML::Twig::Elt->new('idsDoc');
my $idsDocHeader = XML::Twig::Elt->new('idsHeader');

if ($TEIFORMAT eq "I5") {

    # One fresh, empty element per name used in the document header.
    # NOTE(review): 'pubAdress' is kept exactly as written; the I5 DTD
    # presumably spells this element 'pubAddress' - confirm before validating.
    my %elt = map { ( $_ => XML::Twig::Elt->new($_) ) } qw(
        fileDesc titleStmt d.title dokumentSigle
        publicationStmt distributor pubAdress availability pubDate
        sourceDesc biblStruct monogr h.title imprint
    );

    # <dokumentSigle> goes first in <titleStmt>; everything else is appended
    # as last child of its parent, in the order listed.
    $elt{'dokumentSigle'}->paste("first_child", $elt{'titleStmt'});

    my @links = (
        [ $elt{'d.title'},         $elt{'titleStmt'} ],
        [ $elt{'titleStmt'},       $elt{'fileDesc'} ],
        [ $elt{'fileDesc'},        $idsDocHeader ],
        [ $elt{'publicationStmt'}, $elt{'fileDesc'} ],
        [ $elt{'distributor'},     $elt{'publicationStmt'} ],
        [ $elt{'pubAdress'},       $elt{'publicationStmt'} ],
        [ $elt{'availability'},    $elt{'publicationStmt'} ],
        [ $elt{'pubDate'},         $elt{'publicationStmt'} ],
        [ $elt{'sourceDesc'},      $elt{'fileDesc'} ],
        [ $elt{'biblStruct'},      $elt{'sourceDesc'} ],
        [ $elt{'monogr'},          $elt{'biblStruct'} ],
        [ $elt{'h.title'},         $elt{'monogr'} ],
        [ $elt{'imprint'},         $elt{'monogr'} ],
        [ $idsDocHeader,           $idsDoc ],
    );

    foreach my $link (@links) {
        my ($child, $parent) = @{$link};
        $child->paste("last_child", $parent);
    }

    # ToDo set dummy dtitle and docSigle

    $elt{'dokumentSigle'}->set_text($corpussigles{$fnsource} . "/JAN");
    $elt{'d.title'}      ->set_text($srcfullnames{$fnsource} . ", January " . $fnyear);
}
Harald Lüngen2551a952024-09-15 08:08:35 +0300299
300
Harald Lüngendb5e6e72024-09-04 17:41:18 +0300301#----------------------------------
302# read input VRT-XML document
303#----------------------------------
304
# Open the input file up front so that an unreadable file is reported with a
# clear message including the system error. The handle itself does not seem to
# be needed, because parsefile() (see below) is applied to the file name.
# Three-argument open keeps the file name from being interpreted as a mode.
open(my $IN, "<", $ARGV[0]) || die("$0: cannot open file for reading: $ARGV[0]: $!");
307
Harald Lüngen86cbd932024-09-10 15:52:18 +0300308
Harald Lüngendb5e6e72024-09-04 17:41:18 +0300309
Harald Lüngen9d4e0462024-08-23 09:34:22 +0300310#####################
311# M A I N
312#####################
313
Harald Lüngendb5e6e72024-09-04 17:41:18 +0300314#-------------------------------------------------------------------------------------------------------------
315# start twig for input and call start tag handler for root and twig handler for each <text> in the VRT
316#-------------------------------------------------------------------------------------------------------------
Harald Lüngen9d4e0462024-08-23 09:34:22 +0300317
Harald Lüngen9d4e0462024-08-23 09:34:22 +0300318
Harald Lüngenba0354b2024-09-11 16:24:08 +0300319
# Twig for the input document: the start tag of the root element <texts> is
# handled by root(), every completed <text> element by text().
$twig = XML::Twig->new(
    keep_spaces     => 1,         # also keeps whitespace at former element boundaries in the output
    keep_atts_order => 1,         # requires Tie::IxHash
    comments        => 'drop',
    start_tag_handlers => {
        texts => sub { root(@_, $corpusHeader) },
    },

    twig_handlers => {
        # text => \&text
        # a copy is required because the text header is flushed together with $twig in the <text> handler
        text => sub { text(@_, $textHeader->copy) },
    },
    # dtd_handlers => {           # ToDo for I5
    #     \&set_dtd;
    # }

    output_encoding => $encoding,
);

$twig->parsefile($ARGV[0]);
340
Harald Lüngendb5e6e72024-09-04 17:41:18 +0300341
342
Harald Lüngen9d4e0462024-08-23 09:34:22 +0300343###########
344# END MAIN
345###########
346
347
348
349
350##############################
351# S U B R O U T I N E S
352##############################
353
354# sub set_dtd [
355# my $twig, $dtd = @_;
356# my $internal = qq|\nPUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"\n"DTD/xhtml1-strict.dtd"|;
357#
358# $twig->twig_doctype('html', undef, undef, $internal);
359# }
360
361
362
# Start tag handler for the root element of the input document.
#   $twig         - the twig being parsed
#   $root         - the root element
#   $corpusHeader - header element to be inserted by insertCorpusHeader
# Renames the root to <teiCorpus> (TEI) or <idsCorpus> (any other format),
# sets the TEI namespace on it and inserts the corpus header.
sub root {
    my ($twig, $root, $corpusHeader) = @_;

    my $corpus_gi = ($TEIFORMAT eq "TEI") ? 'teiCorpus' : 'idsCorpus';
    $root->set_gi($corpus_gi);

    $root->set_att("xmlns", 'http://www.tei-c.org/ns/1.0');

    &insertCorpusHeader($root, $corpusHeader);
}
377
378
Harald Lüngendb5e6e72024-09-04 17:41:18 +0300379
# Fill the corpus header skeleton and paste it as first child of the root.
#   $root         - root element of the output corpus
#   $corpusHeader - root element of the parsed corpus header skeleton
# In I5 mode the global $idsDoc is additionally pasted right after the header.
sub insertCorpusHeader{
    my ($root, $corpusHeader) = @_;

    #-----------------------
    # set corpus header
    #-----------------------

    set_title($corpusHeader, $srcfullnames{$fnsource}, $fnyear, $kielipankkiCorpus);

    if ($TEIFORMAT eq "TEI") {
        set_sourceDesc($corpusHeader, $srcfullnames{$fnsource}, $fnyear, $kielipankkiCorpus);
    }
    elsif ($TEIFORMAT eq "I5") {
        my $language = $corpusHeader->get_xpath("profileDesc/langUsage/language", 0);

        $corpusHeader->get_xpath("fileDesc/titleStmt/korpusSigle", 0)->set_text($corpussigles{$fnsource});

        # Current four-digit year. The former "localtime[5] + 1900" was parsed
        # as localtime([5] + 1900), i.e. a call with a bogus epoch argument.
        $corpusHeader->get_xpath("fileDesc/publicationStmt/pubDate", 0)->set_text(localtime->year);

        $language->set_text($expandLang{$srctextlangs{$fnsource}});
        $language->set_att('id', $srctextlangs{$fnsource});

        $corpusHeader->get_xpath('encodingDesc/editorialDecl/transduction[@n="2"]', 0)
            ->set_text("I5 version by EuReCo using vrt2tei.pl " . localtime->ymd('-'));

        set_sourceDescI5($corpusHeader);
    }
    else {
        print STDERR "TEIFORMAT is $TEIFORMAT; must be TEI or I5\n";
    }

    $corpusHeader->paste("first_child", $root);

    if ($TEIFORMAT eq "I5") {
        $idsDoc->paste("after", $corpusHeader);
    }
}
410
Harald Lüngendb5e6e72024-09-04 17:41:18 +0300411
Harald Lüngen9d4e0462024-08-23 09:34:22 +0300412#----------------------------
413# handler &text for <text>
414#----------------------------
415
# Twig handler for one <text> element of the input document.
#   $twig       - the twig being parsed
#   $text       - the <text> element
#   $textHeader - a copy of the text header skeleton for this text
# Turns <text> into <TEI> or <idsText>, wraps its paragraphs in
# <text><body><div>, converts paragraphs, sentences and token lines and
# finally flushes the twig.
sub text {
    my ($twig, $text, $textHeader) = @_;

    $textcounter++;

    # ToDo: catch all other, unexpected children of root

    #--------------------------------------------------------------------------
    # Get text metadata (attributes of <text>) and create the header for <text>
    #--------------------------------------------------------------------------

    my $textattsref = $text->atts();    # reference to a hash; to be used with '->'

    &createIdsDoc($textattsref);

    # &createTextHeader returns the $textID:
    my $textID = &createTextHeader($text, $textattsref, $textHeader);

    #----------------------------------------
    # create <TEI> or <idsText> from <text>
    #----------------------------------------

    # the attributes were saved above, so they can be removed now
    $text->del_atts;

    if ($TEIFORMAT ne "TEI") {
        $text->set_gi("idsText");
        $text->set_att('version', "1.0");
        # $text->move("last_child", $idsDoc);    # does not work because apparently $idsDoc is not under $root at this point
    }
    else {
        $text->set_gi("TEI");
        $text->set_att('xml:id', $textID);
    }

    #------------------------------------------------------------------
    # create the <tei:text>, <body>, <div> elements inside <TEI>
    #------------------------------------------------------------------

    my $tei_text = XML::Twig::Elt->new('text');
    $tei_text->set_att("xml:lang", 'fi');    # as in ICC-NOR

    my $body = XML::Twig::Elt->new('body');

    my $page_div = XML::Twig::Elt->new('div');
    $page_div->set_att("type", "page");      # ToDo: this is specific to KLK

    $tei_text->paste('last_child', $text);
    $body    ->paste('last_child', $tei_text);
    $page_div->paste('last_child', $body);

    #-------------------------------
    # create <p> from <paragraph>
    #-------------------------------

    for my $para ($text->children('paragraph')) {

        &setP($para);
        $para->move('last_child', $page_div);

        #------------------------------
        # create <s> from <sentence>
        #------------------------------

        for my $sent ($para->children('sentence')) {

            &setS($sent);

            #--------------------------------------------
            # create <w> (word) from each non-empty line
            #--------------------------------------------

            my @token_lines = grep { $_ ne "" } split(/\n+/, $sent->xml_text);
            $sent->set_text("\n");

            for my $token_line (@token_lines) {    # Todo: check the order
                my $w = XML::Twig::Elt->new('w');
                &createW($w, $token_line);
                $w->paste('last_child', $sent);
            }
        }
    }

    # $twig->set_pretty_print( 'record');
    # $twig->flush($OUT);
    $twig->flush("/dev/stdout");
}
Harald Lüngen9d4e0462024-08-23 09:34:22 +0300519
# Fill the text header skeleton from the attributes of a <text> element and
# paste it as first child of that element.
#   $text        - the <text> element that receives the header
#   $textattsref - reference to the attribute hash of <text>
#   $textHeader  - header skeleton (a copy) to be filled in
# Returns the text ID (e.g. SUK21_JAN.00001); in I5 mode the text sigle written
# into the header is the same string with "_" replaced by "/".
sub createTextHeader{
    my ($text, $textattsref, $textHeader) = @_;

    # USE 01 binding_id="2246025"
    # USE 02 date="2021-01-15"
    #     03 datefrom="20210115"
    #     04 dateto="20210115"
    #     05 elec_date="_"
    #     06 file=""
    # USE 07 filename_metadata="finclarin_siirto_k2021/0039-5552/2021/alto/2246025_0039-5552_2021-01-15_SK0221_mets.xml"
    # USE 08 filename_orig ="finclarin_siirto_k2021/0039-5552/2021/alto/2246025_0039-5552_2021-01-15_SK0221_page-2021011502210030301.xml
    # USE 09 id="t-bcd0f3fa-bbd3dac4"
    #     10 img_url=""
    # USE 11 issue_date="15.01.2021"
    # USE 12 issue_no="SK0221"
    # USE 13 issue_title="Suomen Kuvalehti"
    # USE 14 label="Suomen Kuvalehti no. SK0221 15.01.2021"
    # USE 16 language="fi"
    # USE 17 page_id="p1"
    # USE 18 page_no="None"
    #     19 part_name="_"
    #     20 publ_id="0039-5552"
    #     21 publ_part=""
    # USE 22 publ_title="Suomen Kuvalehti"
    # USE 23 publ_type="aikakausi"
    # USE 24 sentcount="70"
    # USE 25 sum_lang="|xxx:44|fin:23|eng:3|"
    #     26 timefrom="000000"
    #     27 timeto="235959"
    # USE 28 tokencount="304"
    #     29 version_added="KLK-fi-2021">

    # Several of these are currently only referenced by the commented-out
    # statements further down; they are kept so those can be re-enabled.
    my $BID          = $textattsref->{'binding_id'};
    my $DATE         = $textattsref->{'date'};
    my $METAFILENAME = $textattsref->{'filename_metadata'};
    my $ORIGFILENAME = $textattsref->{'filename_orig'};
    my $ID           = $textattsref->{'id'};
    my $ISSUEDATE    = $textattsref->{'issue_date'};
    my $ISSUENO      = $textattsref->{'issue_no'};
    my $ISSUETITLE   = $textattsref->{'issue_title'};
    my $LABEL        = $textattsref->{'label'};
    my $LANGUAGE     = $textattsref->{'language'};
    my $PAGEID       = $textattsref->{'page_id'};
    my $PAGENO       = $textattsref->{'page_no'};
    my $PUBLTITLE    = $textattsref->{'publ_title'};
    my $PUBLTYPE     = $textattsref->{'publ_type'};
    my $SENTCOUNT    = $textattsref->{'sentcount'};
    my $SUMLANG      = $textattsref->{'sum_lang'};
    my $TOKENCOUNT   = $textattsref->{'tokencount'};

    #-----------------------------
    # Derived Metadata variables
    #-----------------------------

    # year, month, day of the "yyyy-mm-dd" date attribute
    my @datearray = split(/-/, $DATE);

    # The former split("|", $SUMLANG) treated "|" as an empty regex alternation
    # and cut the value into single characters; its result, like the split of
    # $ORIGFILENAME, was never used, so both were removed. If the parts of
    # sum_lang are needed, split on /\|/ instead.

    #----------------------------------------------------
    # create textSigle to be returned from this function
    #----------------------------------------------------

    # SUK21.JAN.00001

    my $corpusID = "SUK";                       # ToDo read Table with Source metadata
    my $yy  = substr($datearray[0], 2, 2);      # last two digits of the year
    my $mm  = $datearray[1];                    # two-digit month
    my $MMM = $months{$mm};

    # the per-month counter is advanced for every text
    my $textID    = $corpusID . $yy . "_" . $MMM . "." . sprintf("%05d", $doccounter{$mm}++);
    my $textSigle = $textID;

    #-----------------------------------------------------------------------
    # CREATE text header ACCORDING TO THE SKELETON in $textHeader
    #-----------------------------------------------------------------------

    $textHeader->paste('first_child', $text);

    #-----------------------------------------------
    # <teiHeader>
    #   <fileDesc n="EuReCo-KLK-FIN_[$ID]">
    #     <titleStmt>
    #       <title>[$LABEL, page $PAGENO]</title>

    my $fileDesc = $textHeader->first_child("fileDesc");
    $fileDesc->set_att('n', "EuReCo-". $kielipankkiCorpus . $ID);

    #-----------------
    # titleStmt
    #----------------

    my $title     = "title";
    my $titleStmt = $fileDesc->first_child("titleStmt");

    if ($TEIFORMAT eq "I5") {
        $title = "t.title";
        $textSigle =~ s/_/\//g;
        $titleStmt->first_child("textSigle")->set_text($textSigle);
    }

    # Case KLK: PAGENO seems to be "None" most of the time, hence the text counter
    my $textTitle = $LABEL . ", Text #" . $textcounter;
    $titleStmt->first_child($title)->set_text($textTitle);

    #-----------------------------------------------
    # <fileDesc>
    #   <sourceDesc>
    #     <biblStruct>
    #       <analytic>
    #         <title type="main">[$LABEL, page $PAGENO]</title>
    #         <date>[$DATE]</date>
    #         <date type="year">TODO</date>
    #         <date type="month">TODO</date>
    #         <date type="day">TODO</date>
    #         <idno type="PAGEID">$PAGEID</idno>
    #         <idno type="BINDINGID">$BID</idno>
    #         <idno type="ID">$ID</idno>
    #         <idno type="KIELIPANKKI_METAFILENAME">$METAFILENAME</idno>
    #         <idno type="KIELIPANKKI_ORIGFILENAME">$ORIGFILENAME</idno>
    #         <textLang>$LANGUAGE</textLang>

    my $analytic = $textHeader->get_xpath("./fileDesc/sourceDesc/biblStruct/analytic", 0);
    if ($TEIFORMAT eq "I5") { $title = "h.title" }

    $analytic->first_child($title)->set_text($textTitle);
    #$analytic->get_xpath('./idno[@type="PAGEID"]', 0)                   ->set_text($PAGEID);
    #$analytic->get_xpath('./idno[@type="BINDINGID"]', 0)                ->set_text($BID);
    #$analytic->get_xpath('./idno[@type="ID"]', 0)                       ->set_text($ID);
    #$analytic->get_xpath('./idno[@type="KIELIPANKKI_METAFILENAME"]', 0) ->set_text($METAFILENAME);
    #$analytic->get_xpath('./idno[@type="KIELIPANKKI_ORIGFILENAME"]', 0) ->set_text($ORIGFILENAME);
    if ($TEIFORMAT eq "TEI") {
        $analytic->first_child('textLang')->set_text($LANGUAGE);
    }

    #       <monogr>
    #         <title>$PUBLTITLE</title>
    #         <imprint>
    #           <pubPlace>TODO</pubPlace>
    #           <publisher>TODO</publisher>
    #         </imprint>
    #         <biblScope unit="ISSUETITLE"/>
    #         <biblScope unit="ISSUENO"/>
    #         <biblScope unit="ISSUEDATE"/>
    #         <biblScope unit="pp">$PAGENO</biblScope>

    my $monogr = $textHeader->get_xpath("./fileDesc/sourceDesc/biblStruct/monogr", 0);

    $monogr->first_child($title)->set_text($PUBLTITLE);
    if ($TEIFORMAT eq "TEI") {
        $monogr->get_xpath('./imprint/date[@type="date"]', 0)->set_text($DATE);
    }
    my $date = "date";
    if ($TEIFORMAT eq "I5") { $date = "pubDate" }
    $monogr->get_xpath('./imprint/' . $date . '[@type="year"]', 0) ->set_text($datearray[0]);
    $monogr->get_xpath('./imprint/' . $date . '[@type="month"]', 0)->set_text($datearray[1]);
    $monogr->get_xpath('./imprint/' . $date . '[@type="day"]', 0)  ->set_text($datearray[2]);

    # imprint is needed for tei validity; titles missing from the source table
    # yield empty elements instead of an undefined text value
    my $imprint = $monogr->first_child("imprint");
    $imprint->first_child("pubPlace") ->set_text($srcpubplaces{$PUBLTITLE}  // "");
    $imprint->first_child("publisher")->set_text($srcpublishers{$PUBLTITLE} // "");
    #$monogr->get_xpath('./biblScope[@unit="ISSUETITLE"]', 0) ->set_text($ISSUETITLE);
    #$monogr->get_xpath('./biblScope[@unit="ISSUENO"]', 0)    ->set_text($ISSUENO);
    #$monogr->get_xpath('./biblScope[@unit="ISSUEDATE"]', 0)  ->set_text($ISSUEDATE);
    #$monogr->get_xpath('./biblScope[@unit="pp"]', 0)         ->set_text($PAGENO);   # caution - PAGENO seems to be "None" most of the time

    my $dateNice      = $datearray[2] . "." . $datearray[1] . "." . $datearray[0];   # dd.mm.yyyy
    my $dateBackwards = $datearray[0] . "." . $datearray[1] . "." . $datearray[2];   # yyyy.mm.dd

    if ($TEIFORMAT eq "I5") {
        my $refCompleteText = $textSigle . " " . $PUBLTITLE . " no. " . $ISSUENO . ", [" . $PUBLTYPE . "], " . $dateNice;
        my $refShortText    = $textSigle . " " . $PUBLTITLE . ", " . $dateNice;
        $textHeader->get_xpath('./fileDesc/sourceDesc/reference[@type="complete"]', 0)->set_text($refCompleteText);
        $textHeader->get_xpath('./fileDesc/sourceDesc/reference[@type="short"]', 0)   ->set_text($refShortText);
    }

    #  <encodingDesc>
    #    <tagsDecl>
    #      <namespace name="http://www.tei-c.org/ns/1.0">
    #        <tagUsage gi="s" occurs="SENTCOUNT"/>
    #        <tagUsage gi="w" occurs="TOKENCOUNT"/>

    my $namespacePath = "./encodingDesc/tagsDecl/namespace/";
    if ($TEIFORMAT eq "I5") { $namespacePath = "./encodingDesc/tagsDecl/" }

    $textHeader->get_xpath($namespacePath . 'tagUsage[@gi="s"]', 0)->set_att('occurs', $SENTCOUNT);
    $textHeader->get_xpath($namespacePath . 'tagUsage[@gi="w"]', 0)->set_att('occurs', $TOKENCOUNT);

    #  <profileDesc>
    #    <langUsage>
    #      <language ident="fi" usage="|xxx:44|fin:23|eng:3|"/>
    #    </langUsage>
    #    <textClass>
    #      <classCode scheme="kielipankki_klk">$PUBLTYPE</classCode>
    #      <classCode scheme="kielipankki_klk_mapped">$mapping{$PUBLTYPE}</classCode>

    if ($TEIFORMAT eq "I5") {
        $textHeader->get_xpath('./profileDesc/creation/creatDate', 0)->set_text($dateBackwards);
    }

    my $language = $textHeader->get_xpath('./profileDesc/langUsage/language', 0);
    $language->set_att('ident', $LANGUAGE);
    $language->set_att('usage', $SUMLANG);
    # @usage should actually hold an integer; ideally the content of SUMLANG
    # is split up and several <language> elements are created

    $textHeader->get_xpath('./profileDesc/textClass/classCode[@scheme="kielipankki_klk"]', 0)->set_text($PUBLTYPE);
    # publication types without an entry in %mapping yield an empty element
    $textHeader->get_xpath('./profileDesc/textClass/classCode[@scheme="kielipankki_klk_mapped"]', 0)
        ->set_text(defined $PUBLTYPE ? ($mapping{$PUBLTYPE} // "") : "");

    #  <revisionDesc>
    #    <change when="TODO" who="HL">TEI version for EuReCo</change>

    $textHeader->get_xpath('./revisionDesc/change', 0)->set_att('when', localtime->ymd('-'));

    #-----------------------------------
    # END OF CREATING TEIHEADER
    #-----------------------------------

    return $textID;
}
Harald Lüngen695ac1d2024-09-05 08:55:21 +0300743
# setP: convert a VRT <paragraph> element into a TEI <p> element, in place.
#
# Parameter:
#   $paragraph - element object offering set_gi / att / set_att / del_att
#
# Example input: <paragraph id="p-bcd0f3fa-bbd3dac4-815ead7a" sum_lang="|fin:1|">
# Attributes of <paragraph>:
#   @id        removed (the ids are not unique)
#   @sum_lang  moved to @xml:lang, prefixed with "x-" to mark a private value
sub setP {
    my ($paragraph) = @_;

    $paragraph->set_gi('p');

    # Only write xml:lang when the source carries sum_lang; without this guard
    # the concatenation warns under -w and produces the meaningless value "x-".
    my $sum_lang = $paragraph->att("sum_lang");
    if (defined $sum_lang) {
        $paragraph->set_att("xml:lang", "x-" . $sum_lang);
    }
    $paragraph->del_att("sum_lang");

    # $paragraph->change_att_name('id', 'xml:id');
    $paragraph->del_att("id");    # this id is not unique either
}
# setS: convert a VRT <sentence> element into a TEI <s> element, in place.
#
# Parameter:
#   $sentence - element object offering set_gi / att / set_att / del_att
#
# Attributes of <sentence>:
#   1 @id="s-bcd0f3fa-bbd3dac4-f7429090"  removed (not unique)
#   2 @lang="fin"                         moved to @xml:lang
#   3 @lang_conf="0.6734853"              removed for the time being (ToDo: @cert ?)
sub setS {
    my ($sentence) = @_;

    $sentence->set_gi('s');

    # Copy @lang to @xml:lang only when it is present; otherwise an attribute
    # with an undefined value would be attached to the element.
    # ToDo: convert the value / introduce a hash for lookup
    #       (input values: "fin", "xxx", ....)
    my $lang = $sentence->att("lang");
    if (defined $lang) {
        $sentence->set_att("xml:lang", $lang);
    }

    # $sentence->change_att_name('id', 'xml:id');   # not unique
    $sentence->del_att('id');
    $sentence->del_att("lang");         # replaced by xml:lang
    $sentence->del_att("lang_conf");    # for the time being
}
777
# createW: fill a <w> element from one tab-separated VRT token line.
#
# Parameters:
#   $w_element - the <w> element to populate (set_text / set_att / del_att)
#   $line      - one VRT line; columns are separated by tabs
#
# vrt positional-attributes in corpus KLK:
#   USE [0]  word
#   USE [1]  ref       (id for reference of dephead)
#   USE [2]  lemma
#   ?   [3]  lemmacomp (lemma with compound info - could go in @norm, as tag abuse?)
#   USE [4]  pos
#   USE [5]  msd
#   USE [6]  dephead
#   USE [7]  deprel
#       [8]  content   (ocr-process)
#       [9]  vpos      (ocr-process)
#       [10] ocr       (ocr-process)
#       [11] cc        (ocr-process)
#       [12] hyph      (ocr-process)
#       [13] style     (ocr-process)
#       [14] lex       (korp semantic disambiguation from Göteborg)
#
# Note: neither the real token nor the real lemma is written. Both are
# replaced by one random alphabetic string of the same length as the token
# (the direct use of columns [0] and [2] is disabled).
# NOTE(review): presumably this masks the source text - confirm the intent.
sub createW {
    my ($w_element, $line) = @_;

    #---------------------------
    # Get the tags (=columns)
    #---------------------------
    my @columns = split(/\t/, $line);
    my ($word, $ref, $pos, $msd, $dephead, $deprel) = @columns[0, 1, 4, 5, 6, 7];

    # content of <w>: a random stand-in for the token
    my $stand_in = Data::Random::String->create_random_string(
        length   => length($word),
        contains => 'alpha',
    );
    $w_element->set_text($stand_in);

    # attributes of <w>
    $w_element->set_att("n", $ref);
    # An id assembled from file name, sentence id and token ref would not be
    # unique either, so the id is dropped altogether.
    $w_element->del_att("id");
    # @norm (column [3]) is deliberately not set: it would be tag abuse.
    $w_element->set_att(
        "lemma" => $stand_in,
        "pos"   => $pos,
        "msd"   => $msd,
    );

    # remove condition when part of the official TEI
    $w_element->set_att("head" => $dephead, "deprel" => $deprel)
        if $TEIFORMAT eq "I5";
}
826
Harald Lüngendb5e6e72024-09-04 17:41:18 +0300827
# set_title: write the corpus title into the corpus header.
#
# Parameters:
#   $corpusHeader      - header element (navigated with first_child)
#   $source            - source name, e.g. "Aamulehti"
#   $year              - year of the sub corpus
#   $kielipankkiCorpus - name of the Kielipankki corpus, e.g. "klk-fi-v2-vrt"
#
# Target location:
#   <teiHeader>
#     <fileDesc>
#       <titleStmt>
#         <title>[Aamulehti2021] from [klk-fi-v2-vrt for] EuReCo</title>
#       </titleStmt>
#       <!-- ... -->
#     </fileDesc>
#   </teiHeader>
#
# When $TEIFORMAT is "I5" the title element is <c.title> instead of <title>.
sub set_title {
    my ($corpusHeader, $source, $year, $kielipankkiCorpus) = @_;

    my $title_gi = ($TEIFORMAT eq "I5") ? "c.title" : "title";

    my $title_text = sprintf("%s %s, from %s for EuReCo",
                             $source, $year, $kielipankkiCorpus);

    my $title_stmt = $corpusHeader->first_child("fileDesc")->first_child("titleStmt");
    $title_stmt->first_child($title_gi)->set_text($title_text);
}
853
# set_sourceDesc: write the bibliographic line into the corpus header.
#
# Parameters:
#   $corpusHeader      - header element (navigated with first_child)
#   $source            - source name, e.g. "Aamulehti"
#   $year              - year of the sub corpus
#   $kielipankkiCorpus - name of the Kielipankki corpus, e.g. "klk-fi-v2-vrt"
#
# Target location:
#   <teiHeader>
#     <fileDesc>
#       <!-- ... -->
#       <sourceDesc>
#         <bibl>[source] [year], from [corpus] for EuReCo</bibl>
#       </sourceDesc>
#       <!-- ... -->
#     </fileDesc>
#   </teiHeader>
sub set_sourceDesc {
    my ($corpusHeader, $source, $year, $kielipankkiCorpus) = @_;

    my $bibl_text = "$source $year, from $kielipankkiCorpus for EuReCo";

    # walk down to <bibl> one level at a time
    my $file_desc   = $corpusHeader->first_child("fileDesc");
    my $source_desc = $file_desc->first_child("sourceDesc");
    my $bibl        = $source_desc->first_child("bibl");

    $bibl->set_text($bibl_text);
}
873
# set_sourceDescI5: fill the <sourceDesc> of an I5 corpus header.
#
# Parameter:
#   $corpusHeader - header element (navigated with first_child)
#
# Reads the file-level values $fnsource and $fnyear and the lookup tables
# %srcfullnames, %srcpubplaces, %srcpublishers and %corpussigles.
#
# Target structure:
#   <idsHeader>
#     <fileDesc>
#       <!-- ... -->
#       <sourceDesc>
#         <biblStruct>
#           <monogr>
#             <h.title type="main">[title]</h.title>
#             <imprint>
#               <publisher>[publisher]</publisher>
#               <pubPlace key="DE">[place]</pubPlace>
#             </imprint>
#           </monogr>
#         </biblStruct>
#         <reference type="super" assemblage="regular">[sigle] [title]; [place]: [publisher], [year]</reference>
#       </sourceDesc>
#       <!-- ... -->
#     </fileDesc>
#   </idsHeader>
#
# NOTE(review): an earlier sketch of this structure showed "[title], [year]"
# inside <h.title>, but only the title is written - confirm which is intended.
sub set_sourceDescI5 {
    my ($corpusHeader) = @_;

    my $title     = $srcfullnames{$fnsource};
    my $place     = $srcpubplaces{$title};
    my $publisher = $srcpublishers{$title};
    my $sigle     = $corpussigles{$title};
    my $year      = $fnyear;

    my $source_desc = $corpusHeader->first_child("fileDesc")->first_child("sourceDesc");
    my $monogr      = $source_desc->first_child("biblStruct")->first_child("monogr");
    my $imprint     = $monogr->first_child("imprint");

    $monogr->first_child("h.title")->set_text($title);
    $imprint->first_child("publisher")->set_text($publisher);
    $imprint->first_child("pubPlace")->set_text($place);

    my $reference_text = sprintf("%s %s; %s: %s, %s",
                                 $sigle, $title, $place, $publisher, $year);
    $source_desc->first_child("reference")->set_text($reference_text);
}
912
913
914
# createIdsDoc: print an <idsDoc> header block when a new month begins.
#
# Parameter:
#   $textattsref - hash reference with the attributes of the current text;
#                  the keys 'date' (split on "-", year first, month second)
#                  and 'publ_title' are read
#
# Reads the file-level values $TEIFORMAT, $kielipankkiLicense, %corpussigles,
# %months and %monthnames, and reads and increments $LASTMONTH.
#
# The block is built only when $TEIFORMAT is "I5"; otherwise the string stays
# empty. Whenever the month of the text is exactly $LASTMONTH + 1, $LASTMONTH
# is incremented, and the string (plus a newline) is printed to STDOUT unless
# that month is the first one.
# NOTE(review): presumably the document for month 1 is opened elsewhere - confirm.
sub createIdsDoc {
    my ($textattsref) = @_;

    my $date  = $textattsref->{'date'};
    my $title = $textattsref->{'publ_title'};
    my $sigle = $corpussigles{$title};

    my ($year, $month) = (split(/-/, $date))[0, 1];

    my $doc_id     = $months{$month};
    my $month_name = $monthnames{$month};

    my $idsDocString = "";
    if ($TEIFORMAT eq "I5") {
        $idsDocString = <<"END_IDSDOC";

<idsDoc>
<idsHeader type="document" pattern="text" version="1.1" TEIform="teiHeader">
 <fileDesc>
 <titleStmt>
 <dokumentSigle>$sigle/$doc_id</dokumentSigle>
 <d.title>$title, $month_name $year</d.title>
 </titleStmt>
 <publicationStmt>
 <distributor/>
 <pubAddress/>
 <availability region="world">$kielipankkiLicense</availability>
 <pubDate/>
 </publicationStmt>
 <sourceDesc>
 <biblStruct>
 <monogr>
 <h.title/>
 <imprint/>
 </monogr>
 </biblStruct>
 </sourceDesc>
 </fileDesc>
</idsHeader>
</idsDoc>
END_IDSDOC
    }

    # A new month has started when it directly follows the last one seen.
    if ($month + 0 == $LASTMONTH + 1) {
        printf("%s\n", $idsDocString) if $month + 0 > 1;
        $LASTMONTH++;
    }
}
970
Harald Lüngendb5e6e72024-09-04 17:41:18 +0300971
972
Harald Lüngen9d4e0462024-08-23 09:34:22 +0300973#################
974## usage_message
975#################
976
977
# usage_message: print the command-line synopsis to STDOUT and terminate the
# program (exit without an explicit status).
sub usage_message {
    print " Usage: ./vrt2tei.pl <file.vrt.xml> <outfile>\n"
        . " <file.vrt.xml> is a VRT file converted to proper XML\n";
    exit;
}
983
984