blob: 42bb7d9108d7665b2f1cdadf25ab71cc74298dc5 [file] [log] [blame]
Harald Lüngen9d4e0462024-08-23 09:34:22 +03001#! /usr/bin/perl -w
2
3
4###########################################################################################################################################################
5# vrt2tei.pl
6# eureco
7# leibniz-institut fuer deutsche sprache / csc finland esbo
8# august 2024
9#
10#
# using XML::Twig, see http://www.xmltwig.org/, https://metacpan.org/pod/XML::Twig
12#
# usage: see the usage_message function at the end of this file
# Usage: ./vrt2tei.pl <vrtxmlfile.xml> > <outfile>
Harald Lüngen9d4e0462024-08-23 09:34:22 +030015# <vrtxmlfile>: xml-ised vrt file
16#
17#
18# TODO:
19# 1 insert dtd spec, or ref to TEI
20
Harald Lüngendb5e6e72024-09-04 17:41:18 +030021# 3a remove the vrt positional attribute comment line / all comment lines
Harald Lüngen9d4e0462024-08-23 09:34:22 +030022# 3b add @head and @deprel to I5 sowie auch @msd
23# 3c bearbeitung von @head und @deprel in tei2korapxml durch Nils?
24# 3d build 30 billion corpus
25
26# 4a take care of IDs
27# 4b see to the values of @xml:lang
28# 5 abfangen von unerwarteten elementen dh andere als <sentence> und <paragraph>
29# 5a wort reihenfolge nochmal checken
30# 6 checks and balances
Harald Lüngen9d4e0462024-08-23 09:34:22 +030031# 7 How to encode Kielipankki and National Library of Finland? in teiCorpus Header
32# 8 construct <idsDoc>s for the months (or go for TEI)
33# 9 parallelisation in bash and application on sub corpora of KLK
34# 10 re-implementation of the gawk code in the perl script
35# 12 re-implement creation of text header from xml file in another twig / parametrize TEI vs I5
36
37
38
39#remember
40#formatted.xml:105613: element w: validity error : No declaration for attribute deprel of element w
41#formatted.xml:105613: element w: validity error : No declaration for attribute head of element w
42
43
44#
45#
46############################################################################################################################################################
47
48
49use strict;
50use warnings;
Harald Lüngenba0354b2024-09-11 16:24:08 +030051#use diagnostics;
Harald Lüngen9d4e0462024-08-23 09:34:22 +030052
53use XML::Twig;
54use XML::Generator ':pretty'; # apparently no effect when using flush();
55
56
57use locale; # diese drei Zeilen, damit \b im regex nicht Umlaute und ß matcht.
58use POSIX qw(locale_h); # to be able to use setlocale()
59#setlocale(LC_ALL,'de_DE');
60setlocale(LC_ALL, "fi_FI");
61use utf8;
62use open qw( :std :encoding(UTF-8) );
63
64use Time::Piece;
65use Tie::IxHash;
66
Harald Lüngendb5e6e72024-09-04 17:41:18 +030067
68
Harald Lüngen9d4e0462024-08-23 09:34:22 +030069#----------------------
70# check file arguments:
71#----------------------
72
73# arg0 infile: vrt-xml
Harald Lüngen9d4e0462024-08-23 09:34:22 +030074
# Exactly one argument is accepted: the VRT-XML input file.
# The number of arguments is counted (instead of testing the truthiness of
# $ARGV[0] / $ARGV[1]) so that a file literally named "0" is accepted and any
# surplus argument is rejected.
usage_message() unless @ARGV == 1;
Harald Lüngen9d4e0462024-08-23 09:34:22 +030077
78
79####################
80# GLOBAL VARIABLES
81####################
82
# Encoding of the OUTPUT only; it is passed to the input twig as output_encoding.
my $encoding = 'UTF-8';
#my $encoding = "iso-8859-1";

# Running number of the <text> elements handled so far.
my $textcounter = 0;

# Source metadata tables; they are filled from the sources CSV file and are
# all keyed by the same source key (column 3 of that file).
our %corpussigles  = ();
our %srcfullnames  = ();
our %srcpubplaces  = ();
our %srcpublishers = ();
our %srctexttypes  = ();
our %srctextlangs  = ();

# The two-digit month numbers "01" .. "12", used as keys of the tables below.
my @month_numbers = map { sprintf '%02d', $_ } 1 .. 12;

# One document counter per month, each starting at 1
# (original note: "by the month as in derekox").
my %doccounter = map { ( $_ => 1 ) } @month_numbers;

# Auxiliary input files, looked up relative to the current directory.
my $sourcescsvfile = 'sources_klk_fi_v2_2021_4eureco.csv';
my $corpheaderfile = 'teiCorpusHeaderSkeleton.tei.xml';
my $textheaderfile = 'teiTextHeaderSkeleton.tei.xml';

my $twig               = '';    # replaced by the XML::Twig object for the input file
my $teiCorpusHeaderDoc = '';

# global variables pertaining to the original corpus:
my $kielipankkiCorpus = 'klk-fi-v2-vrt';

# Month number => three-letter month abbreviation used in the text IDs.
my %months;
@months{@month_numbers} = qw(JAN FEB MAR APR MAY JUN JUL AUG SEP OCT NOV DEC);
Harald Lüngendb5e6e72024-09-04 17:41:18 +0300135
136#------------------------------------------------------------------
137# read corpusHeaderSkeleton document and get header out of it
138#------------------------------------------------------------------
139
# Parse the corpus header skeleton. The root element of that document is the
# teiHeader that is later filled in and pasted into the corpus root.
my $teiCorpusHeaderDocTwig = XML::Twig->new(
    keep_spaces     => 1,
    keep_atts_order => 1,
    comments        => 'drop',
);

$teiCorpusHeaderDocTwig->parsefile($corpheaderfile);
my $corpusHeader = $teiCorpusHeaderDocTwig->root;


#------------------------------------------------------------------
# read textHeaderSkeleton document and get header out of it
#------------------------------------------------------------------

# Parse the text header skeleton. Its root element is the teiHeader template
# of which one copy is handed to the <text> handler for every <text>.
my $teiTextHeaderDocTwig = XML::Twig->new(
    keep_spaces     => 1,
    keep_atts_order => 1,
    comments        => 'drop',
);

$teiTextHeaderDocTwig->parsefile($textheaderfile);
my $textHeader = $teiTextHeaderDocTwig->root;
163
164
165#----------------------------------
166# read input VRT-XML document
167#----------------------------------
168
# Fail early, with the system error text, if the input file cannot be read.
# The handle is only used for this check and is closed again right away:
# parsefile() further below is given the file name, not the handle.
# The three-argument form of open keeps special characters in the file name
# from being interpreted as part of the open mode.
open(my $IN, '<', $ARGV[0]) or die("$0: cannot open file for reading: $ARGV[0]: $!");
close($IN);
171
Harald Lüngen86cbd932024-09-10 15:52:18 +0300172#-------------------------------------------------------------------------------------------
173# read source metadata file (prepared manually => ultimately read the info from CMDI File?)
174#-------------------------------------------------------------------------------------------
175
# Read the tab-separated source metadata table into the global lookup hashes.
# All hashes are keyed by column 3 (index 2) of the table; insertCorpusHeader
# looks the key up using the input file name without year and extension.
open(my $SOURCES, '<', $sourcescsvfile)
    or die("$0: cannot open file for reading: $sourcescsvfile: $!");

while (my $fline = <$SOURCES>) {
    chomp($fline);

    # skip comment lines, empty lines and the column header line
    next if $fline =~ /^\#/ || $fline =~ /^\s*$/ || $fline =~ /TEICORPUSID/;

    # columns are separated by one or more tabs, optionally padded with blanks
    my @flarray = split(/\s*\t+\s*/, $fline);

    # a row without the key column cannot be looked up later; report and skip it
    my $sourcekey = $flarray[2];
    unless (defined $sourcekey && length $sourcekey) {
        warn "$0: $sourcescsvfile line $.: no source key in column 3, line skipped\n";
        next;
    }

    $corpussigles{$sourcekey}  = $flarray[0];
    $srcfullnames{$sourcekey}  = $flarray[1];
    $srcpubplaces{$sourcekey}  = $flarray[6];
    $srcpublishers{$sourcekey} = $flarray[7];
    $srctexttypes{$sourcekey}  = $flarray[4];
    $srctextlangs{$sourcekey}  = $flarray[5];
}
close($SOURCES);
Harald Lüngen86cbd932024-09-10 15:52:18 +0300191
192
Harald Lüngendb5e6e72024-09-04 17:41:18 +0300193
Harald Lüngen9d4e0462024-08-23 09:34:22 +0300194#####################
195# M A I N
196#####################
197
Harald Lüngendb5e6e72024-09-04 17:41:18 +0300198#-------------------------------------------------------------------------------------------------------------
199# start twig for input and call start tag handler for root and twig handler for each <text> in the VRT
200#-------------------------------------------------------------------------------------------------------------
Harald Lüngen9d4e0462024-08-23 09:34:22 +0300201
Harald Lüngen9d4e0462024-08-23 09:34:22 +0300202
Harald Lüngenba0354b2024-09-11 16:24:08 +0300203
# Twig for the VRT-XML input: the start tag of the root element <texts> is
# handled by root(), every completely parsed <text> by text().
$twig = XML::Twig->new(
    keep_spaces     => 1,         # also keeps the whitespace at former element boundaries in the output
    keep_atts_order => 1,         # requires Tie::IxHash
    comments        => 'drop',
    start_tag_handlers => {
        texts => sub { root(@_, $corpusHeader) },
    },
    twig_handlers => {
        # Every <text> gets its own copy of the skeleton header, because the
        # header is pasted into <text> and flushed together with it.
        text => sub { text(@_, $textHeader->copy) },
    },
    # dtd_handlers => {          # ToDo for I5
    #    \&set_dtd;
    # }

    output_encoding => $encoding,
);

$twig->parsefile($ARGV[0]);
223
Harald Lüngendb5e6e72024-09-04 17:41:18 +0300224
225
Harald Lüngen9d4e0462024-08-23 09:34:22 +0300226
227
228###########
229# END MAIN
230###########
231
232
233
234
235##############################
236# S U B R O U T I N E S
237##############################
238
239# sub set_dtd [
240# my $twig, $dtd = @_;
241# my $internal = qq|\nPUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"\n"DTD/xhtml1-strict.dtd"|;
242#
243# $twig->twig_doctype('html', undef, undef, $internal);
244# }
245
246
247
# Start-tag handler for the root element of the VRT-XML input.
# Renames the root to <teiCorpus>, puts it into the TEI namespace and has the
# corpus header inserted as its first child.
#   $twig          the twig calling the handler (not used here)
#   $root          the root element
#   $corpusHeader  the teiHeader element taken from the corpus header skeleton
# Returns whatever insertCorpusHeader returns.
sub root {
    my $twig         = shift;
    my $root         = shift;
    my $corpusHeader = shift;

    $root->set_gi('teiCorpus');
    $root->set_att(xmlns => 'http://www.tei-c.org/ns/1.0');

    return insertCorpusHeader($root, $corpusHeader);
}
256
257
Harald Lüngendb5e6e72024-09-04 17:41:18 +0300258
# Fills title and source description of the corpus header and pastes the
# header as first child into the corpus root.
#   $root          root element of the output document
#   $corpusHeader  teiHeader element taken from the corpus header skeleton
# The source key and the year are taken from the name of the input file
# ($ARGV[0]), which is expected to look like <sourcekey><YYYY>.xml.
# Returns the result of pasting the header.
sub insertCorpusHeader {
    my ($root, $corpusHeader) = @_;

    #-------------------------------------------------------------
    # take fnsource and year from the current xml input filename
    #-------------------------------------------------------------

    # last path component of the input file name
    my $fnsource = (split m{/}, $ARGV[0])[-1] // '';

    # Only use $1 if the substitution really matched; otherwise $1 would be
    # undefined or still hold the capture of some earlier match.
    my $year = '';
    if ($fnsource =~ s/([0-9][0-9][0-9][0-9])\.xml$//) {
        $year = $1;
    }
    else {
        warn "$0: cannot take source and year from file name '$ARGV[0]' (expected <source><YYYY>.xml)\n";
    }

    # Fall back to the key from the file name if the source is not listed in
    # the source metadata table.
    my $source = $srcfullnames{$fnsource};
    unless (defined $source) {
        warn "$0: no entry for source '$fnsource' in $sourcescsvfile\n";
        $source = $fnsource;
    }

    #-----------------------
    # set corpus header
    #-----------------------

    set_title(     $corpusHeader, $source, $year, $kielipankkiCorpus);
    set_sourceDesc($corpusHeader, $source, $year, $kielipankkiCorpus);

    return $corpusHeader->paste("first_child", $root);
}
283
Harald Lüngendb5e6e72024-09-04 17:41:18 +0300284
Harald Lüngen9d4e0462024-08-23 09:34:22 +0300285#----------------------------
286# handler &text for <text>
287#----------------------------
288
# Twig handler for <text>: turns one VRT <text> into a <TEI> element and
# flushes it.
#   $twig        the twig calling the handler
#   $text        the <text> element
#   $textHeader  a copy of the teiHeader skeleton for texts
# Result structure:
#   <TEI xml:id="..."><teiHeader/><text xml:lang="fi"><body><div type="page">
#     <p><s><w/>...</s>...</p>...
# Children of <text> other than <paragraph>, and children of <paragraph> other
# than <sentence>, are left where they are (ToDo: catch unexpected elements).
sub text {
    my ($twig, $text, $textHeader) = @_;

    $textcounter++;

    #--------------------------------------------------------------------------
    # Get text metadata (attributes of <text>) and create teiHeader for <text>
    #--------------------------------------------------------------------------

    my $textattsref = $text->atts();    # reference to the attribute hash of <text>

    # createTextHeader pastes the header into <text> and returns the text ID
    my $textID = createTextHeader($text, $textattsref, $textHeader);

    #--------------------------
    # create <TEI> from <text>
    #--------------------------

    # the vrt attributes were evaluated above and are not kept on <TEI>
    $text->del_atts;
    $text->set_gi("TEI");
    $text->set_att('xml:id', $textID);

    #------------------------------------------------------------------
    # create the <tei:text>, <body>, <div> elements inside <TEI>
    #------------------------------------------------------------------

    my $ttext_element = XML::Twig::Elt->new('text');
    my $body_element  = XML::Twig::Elt->new('body');
    my $div_element   = XML::Twig::Elt->new('div');

    $div_element  ->set_att("type", "page");      # ToDo: this is specific to KLK
    $ttext_element->set_att("xml:lang", 'fi');    # as in ICC-NOR

    $ttext_element->paste('last_child', $text);
    $body_element ->paste('last_child', $ttext_element);
    $div_element  ->paste('last_child', $body_element);

    #-------------------------------
    # create <p> from <paragraph>
    #-------------------------------

    foreach my $paragraph ($text->children('paragraph')) {

        setP($paragraph);
        $paragraph->move('last_child', $div_element);

        #------------------------------
        # create <s> from <sentence>
        #------------------------------

        foreach my $sentence ($paragraph->children('sentence')) {

            setS($sentence);

            #--------------------------------------
            # create <w> (word) from each $line
            #--------------------------------------

            # text() returns the unescaped content. With xml_text() the
            # entities (&amp; &lt;) would already be escaped here and would be
            # escaped a second time when the <w> elements are written.
            my @lines = split(/\n+/, $sentence->text);
            $sentence->set_text("\n");

            for my $line (@lines) {    # ToDo: check the word order
                next if $line eq "";

                my $w_element = XML::Twig::Elt->new('w');
                createW($w_element, $line);
                $w_element->paste('last_child', $sentence);
            }
        }
    }

    # $twig->set_pretty_print( 'record');
    # $twig->flush($OUT);
    # NOTE(review): flush() is documented with an optional file handle, not a
    # file name; the call is kept as it was - confirm how the string is treated.
    $twig->flush("/dev/stdout");
}
Harald Lüngen9d4e0462024-08-23 09:34:22 +0300381
# Pastes the text header into <text>, fills it with the metadata of the text
# and returns the ID for the text.
#   $text         the <text> element the header belongs to
#   $textattsref  reference to the hash of the attributes of <text>
#   $textHeader   copy of the teiHeader skeleton for texts; all elements
#                 addressed below must be present in the skeleton
# Reads the globals $kielipankkiCorpus, $textcounter and %months and counts up
# the entry for the month of the text in %doccounter.
#
# Attributes of <text> that are used (with example values):
#   binding_id="2246025"
#   date="2021-01-15"
#   filename_metadata="finclarin_siirto_k2021/0039-5552/2021/alto/2246025_0039-5552_2021-01-15_SK0221_mets.xml"
#   filename_orig="finclarin_siirto_k2021/0039-5552/2021/alto/2246025_0039-5552_2021-01-15_SK0221_page-2021011502210030301.xml"
#   id="t-bcd0f3fa-bbd3dac4"
#   issue_date="15.01.2021"
#   issue_no="SK0221"
#   issue_title="Suomen Kuvalehti"
#   label="Suomen Kuvalehti no. SK0221 15.01.2021"
#   language="fi"
#   page_id="p1"
#   page_no="None"
#   publ_title="Suomen Kuvalehti"
#   publ_type="aikakausi"
#   sentcount="70"
#   sum_lang="|xxx:44|fin:23|eng:3|"
#   tokencount="304"
# Not used: datefrom, dateto, elec_date, file, img_url, part_name, publ_id,
# publ_part, timefrom, timeto, version_added.
sub createTextHeader {
    my ($text, $textattsref, $textHeader) = @_;

    my $BID          = $textattsref->{'binding_id'};
    my $DATE         = $textattsref->{'date'};
    my $METAFILENAME = $textattsref->{'filename_metadata'};
    my $ORIGFILENAME = $textattsref->{'filename_orig'};
    my $ID           = $textattsref->{'id'};
    my $ISSUEDATE    = $textattsref->{'issue_date'};
    my $ISSUENO      = $textattsref->{'issue_no'};
    my $ISSUETITLE   = $textattsref->{'issue_title'};
    my $LABEL        = $textattsref->{'label'};
    my $LANGUAGE     = $textattsref->{'language'};
    my $PAGEID       = $textattsref->{'page_id'};
    my $PAGENO       = $textattsref->{'page_no'};
    my $PUBLTITLE    = $textattsref->{'publ_title'};
    my $PUBLTYPE     = $textattsref->{'publ_type'};
    my $SENTCOUNT    = $textattsref->{'sentcount'};
    my $SUMLANG      = $textattsref->{'sum_lang'};
    my $TOKENCOUNT   = $textattsref->{'tokencount'};

    #-----------------------------
    # Derived Metadata variables
    #-----------------------------

    # year, month and day of a date of the form YYYY-MM-DD
    my @datearray = split("-", $DATE);

    # The title is the label plus the running number of the text
    # (case KLK: PAGENO mostly seems to be "None").
    my $title = $LABEL . ", Text #" . $textcounter;

    #----------------------------------------------------
    # create textSigle to be returned from this function
    #----------------------------------------------------

    # built as <corpusID><yy>_<MMM>.<5-digit counter>, e.g. SUK21_JAN.00001

    my $corpusID = "SUK";                        # ToDo read Table with Source metadata
    my $yy       = substr($datearray[0], 2, 2);  # last two digits of the year
    my $mm       = $datearray[1];
    my $MMM      = $months{$mm};

    my $textID = $corpusID . $yy . "_" . $MMM . "." . sprintf("%05d", $doccounter{$mm}++);

    #-----------------------------------------------------------------------
    # CREATE text-teiHeader ACCORDING TO THE SKELETON in $textHeader
    #-----------------------------------------------------------------------

    $textHeader->paste('first_child', $text);

    #-----------------------------------------------
    # <teiHeader>
    #   <fileDesc n="EuReCo-[$kielipankkiCorpus][$ID]">
    #     <titleStmt>
    #       <title>[$LABEL, Text #$textcounter]</title>

    my $fileDesc = $textHeader->first_child("fileDesc");

    $fileDesc->set_att('n', "EuReCo-". $kielipankkiCorpus . $ID);
    $fileDesc->first_child("titleStmt")->first_child("title")->set_text($title);

    #-----------------------------------------------
    #   <fileDesc>
    #     <sourceDesc>
    #       <biblStruct>
    #         <analytic>
    #           <title type="main">[$LABEL, Text #$textcounter]</title>
    #           <date type="date">$DATE</date>
    #           <date type="year">, <date type="month">, <date type="day">
    #           <idno type="PAGEID">$PAGEID</idno>
    #           <idno type="BINDINGID">$BID</idno>
    #           <idno type="ID">$ID</idno>
    #           <idno type="KIELIPANKKI_METAFILENAME">$METAFILENAME</idno>
    #           <idno type="KIELIPANKKI_ORIGFILENAME">$ORIGFILENAME</idno>
    #           <textLang>$LANGUAGE</textLang>

    my $analytic = $textHeader->get_xpath("./fileDesc/sourceDesc/biblStruct/analytic", 0);

    $analytic->first_child("title")                                     ->set_text($title);
    $analytic->get_xpath('./date[@type="date"]', 0)                     ->set_text($DATE);
    $analytic->get_xpath('./date[@type="year"]', 0)                     ->set_text($datearray[0]);
    $analytic->get_xpath('./date[@type="month"]', 0)                    ->set_text($datearray[1]);
    $analytic->get_xpath('./date[@type="day"]', 0)                      ->set_text($datearray[2]);
    $analytic->get_xpath('./idno[@type="PAGEID"]', 0)                   ->set_text($PAGEID);
    $analytic->get_xpath('./idno[@type="BINDINGID"]', 0)                ->set_text($BID);
    $analytic->get_xpath('./idno[@type="ID"]', 0)                       ->set_text($ID);
    $analytic->get_xpath('./idno[@type="KIELIPANKKI_METAFILENAME"]', 0) ->set_text($METAFILENAME);
    $analytic->get_xpath('./idno[@type="KIELIPANKKI_ORIGFILENAME"]', 0) ->set_text($ORIGFILENAME);
    $analytic->first_child('textLang')                                  ->set_text($LANGUAGE);

    #         <monogr>
    #           <title>$PUBLTITLE</title>
    #           <imprint>
    #             <pubPlace>TODO</pubPlace>
    #             <publisher>TODO</publisher>
    #           </imprint>
    #           <biblScope unit="ISSUETITLE"/>
    #           <biblScope unit="ISSUENO"/>
    #           <biblScope unit="ISSUEDATE"/>
    #           <biblScope unit="pp">$PAGENO</biblScope>

    my $monogr  = $textHeader->get_xpath("./fileDesc/sourceDesc/biblStruct/monogr", 0);
    my $imprint = $monogr->first_child("imprint");    # imprint is needed for tei validity

    $monogr->first_child("title")                            ->set_text($PUBLTITLE);
    $imprint->first_child("pubPlace")                        ->set_text("ToDo");
    $imprint->first_child("publisher")                       ->set_text("ToDo");
    $monogr->get_xpath('./biblScope[@unit="ISSUETITLE"]', 0) ->set_text($ISSUETITLE);
    $monogr->get_xpath('./biblScope[@unit="ISSUENO"]', 0)    ->set_text($ISSUENO);
    $monogr->get_xpath('./biblScope[@unit="ISSUEDATE"]', 0)  ->set_text($ISSUEDATE);
    $monogr->get_xpath('./biblScope[@unit="pp"]', 0)         ->set_text($PAGENO);    # caution: PAGENO mostly seems to be "None"

    #   <encodingDesc>
    #     <tagsDecl>
    #       <namespace name="http://www.tei-c.org/ns/1.0">
    #         <tagUsage gi="s" occurs="SENTCOUNT"/>
    #         <tagUsage gi="w" occurs="TOKENCOUNT"/>

    $textHeader->get_xpath('./encodingDesc/tagsDecl/namespace/tagUsage[@gi="s"]', 0)->set_att('occurs', $SENTCOUNT);
    $textHeader->get_xpath('./encodingDesc/tagsDecl/namespace/tagUsage[@gi="w"]', 0)->set_att('occurs', $TOKENCOUNT);

    #   <profileDesc>
    #     <langUsage>
    #       <language ident="fi" usage="|xxx:44|fin:23|eng:3|"/>
    #     </langUsage>
    #     <textClass>
    #       <classCode scheme="kielipankki_klk">$PUBLTYPE</classCode>
    #       <classCode scheme="kielipankki_klk_mapped">TODO</classCode>

    # ToDo: @usage should actually hold an integer; it would be best to take
    # the content of SUMLANG apart and create several <language> elements
    my $language = $textHeader->get_xpath('./profileDesc/langUsage/language', 0);
    $language->set_att('ident', $LANGUAGE);
    $language->set_att('usage', $SUMLANG);

    $textHeader->get_xpath('./profileDesc/textClass/classCode[@scheme="kielipankki_klk"]', 0)       ->set_text($PUBLTYPE);
    $textHeader->get_xpath('./profileDesc/textClass/classCode[@scheme="kielipankki_klk_mapped"]', 0)->set_text("ToDo");

    #   <revisionDesc>
    #     <change when="[today]" who="HL">TEI version for EuReCo</change>

    $textHeader->get_xpath('./revisionDesc/change', 0)->set_att('when', localtime->ymd('-'));

    return $textID;
}
Harald Lüngen695ac1d2024-09-05 08:55:21 +0300566
# Turns a vrt <paragraph> into a TEI <p>.
#   $paragraph   the <paragraph> element, e.g.
#                <paragraph id="p-bcd0f3fa-bbd3dac4-815ead7a" sum_lang="|fin:1|">
# @sum_lang is moved to @xml:lang, prefixed with "x-" to mark a private value.
# @id is dropped (original note: this id is not unique either).
sub setP {
    my $paragraph = shift;

    $paragraph->set_gi('p');

    my $private_lang = "x-" . $paragraph->att("sum_lang");
    $paragraph->set_att("xml:lang", $private_lang);

    # $paragraph->change_att_name('id', 'xml:id');
    $paragraph->del_att("sum_lang", "id");
}
# Turns a vrt <sentence> into a TEI <s>.
#   $sentence   the <sentence> element; its attributes are
#               @id         e.g. "s-bcd0f3fa-bbd3dac4-f7429090"
#               @lang       e.g. "fin"
#               @lang_conf  e.g. "0.6734853" (ToDo: @cert ?)
# The value of @lang is copied unchanged to @xml:lang (ToDo: convert the
# value / introduce a lookup hash for the input values "fin", "xxx", ...);
# afterwards @id, @lang and @lang_conf are removed.
sub setS {
    my $sentence = shift;

    $sentence->set_gi('s');

    my $lang = $sentence->att("lang");
    $sentence->set_att("xml:lang", $lang);

    # $sentence->change_att_name('id', 'xml:id');  # not unique
    # @lang is replaced by @xml:lang; @lang_conf is dropped for the time being
    $sentence->del_att('id', "lang", "lang_conf");
}
600
# Fills a <w> element from one token line of the vrt.
#   $w_element  the (new) <w> element
#   $line       one line with the tab-separated positional attributes
#
# vrt positional attributes in corpus KLK:
#   USE  [0] word
#   USE  [1] ref       (id for reference of dephead)
#   USE  [2] lemma
#   ?    [3] lemmacomp (lemma with compound info - could go in @norm, as tag abuse?)
#   USE  [4] pos
#   USE  [5] msd
#   USE  [6] dephead
#   USE  [7] deprel
#        [8] content   (ocr-process)
#        [9] vpos      (ocr-process)
#       [10] ocr       (ocr-process)
#       [11] cc        (ocr-process)
#       [12] hyph      (ocr-process)
#       [13] style     (ocr-process)
#       [14] lex       (korp semantic disambiguation from G"oteborg)
# Columns 0, 1, 2, 4 and 5 are written at present; @head (6) and @deprel (7)
# are temporarily switched off.
sub createW {
    my ($w_element, $line) = @_;

    # pick the columns that are used out of the tab-separated line
    my ($word, $ref, $lemma, $pos, $msd) = (split(/\t/, $line))[0, 1, 2, 4, 5];

    # the token itself is the content of <w>
    $w_element->set_text($word);

    $w_element->set_att("n", $ref);
    # $w_element->set_att("id", "w_" . $namearray[4]  . $sentence->att("xml:id") . "_" . $tags[1]);
    # an ID composed like this is not unique either ...
    $w_element->del_att("id");
    $w_element->set_att(
        "lemma" => $lemma,
        # "norm" => $tags[3],   # tag abuse of @norm
        "pos"   => $pos,
        "msd"   => $msd,
    );
#TMP    $w_element->set_att("head",   $tags[6]);
#TMP    $w_element->set_att("deprel", $tags[7]);

}
643
Harald Lüngendb5e6e72024-09-04 17:41:18 +0300644
# Writes the corpus title into the corpus header:
#   <teiHeader>
#     <fileDesc>
#       <titleStmt>
#         <title>[$source] [$year], from [$kielipankkiCorpus] for EuReCo</title>
#   $corpusHeader       the teiHeader element of the corpus
#   $source             name of the source
#   $year               year of the source
#   $kielipankkiCorpus  name of the original corpus
sub set_title {
    my ($corpusHeader, $source, $year, $kielipankkiCorpus) = @_;

    my $fileDesc  = $corpusHeader->first_child("fileDesc");
    my $titleStmt = $fileDesc->first_child("titleStmt");
    my $title     = $titleStmt->first_child("title");

    $title->set_text("$source $year, from $kielipankkiCorpus for EuReCo");
}
664
# Writes the bibliographic line into the source description of the corpus header:
#   <teiHeader>
#     <fileDesc>
#       <sourceDesc>
#         <bibl>[$source] [$year], from [$kielipankkiCorpus] for EuReCo</bibl>
#   $corpusHeader       the teiHeader element of the corpus
#   $source             name of the source
#   $year               year of the source
#   $kielipankkiCorpus  name of the original corpus
sub set_sourceDesc {
    my ($corpusHeader, $source, $year, $kielipankkiCorpus) = @_;

    my $bibl_text = join('', $source, " ", $year, ", from ", $kielipankkiCorpus, " for EuReCo");

    my $bibl = $corpusHeader
        ->first_child("fileDesc")
        ->first_child("sourceDesc")
        ->first_child("bibl");

    $bibl->set_text($bibl_text);
}
684
685
686
Harald Lüngen9d4e0462024-08-23 09:34:22 +0300687#################
688## usage_message
689#################
690
691
# Prints the usage message and ends the program.
# The message goes to STDERR and the exit status is 1, because this sub is
# only called when the script was invoked with the wrong number of arguments.
# The script accepts the input file as its only argument, so the output file
# is shown as a redirection instead of a second argument.
sub usage_message {
    print STDERR " Usage: ./vrt2tei.pl <file.vrt.xml> > <outfile>\n";
    print STDERR "   <file.vrt.xml> is a VRT file converted to proper XML\n";
    exit 1;
}
697
698