blob: 53ad78df20dbfa2c2c3b1670fdc1a71e8922f262 [file] [log] [blame]
Harald Lüngen9d4e0462024-08-23 09:34:22 +03001#! /usr/bin/perl -w
2
3
4###########################################################################################################################################################
5# vrt2tei.pl
6# eureco
7# leibniz-institut fuer deutsche sprache / csc finland esbo
8# august 2024
9#
10#
# using XML::Twig, see http://www.xmltwig.org/, https://metacpan.org/pod/XML::Twig
12#
# usage: see the usage_message subroutine below
# Usage: ./vrt2tei.pl <vrtxmlfile.xml> > <outfile>
#  <vrtxmlfile>: xml-ised vrt file (the only argument; the result is written to STDOUT)
16#
17#
18# TODO:
19# 1 insert dtd spec, or ref to TEI
20
Harald Lüngendb5e6e72024-09-04 17:41:18 +030021# 3a remove the vrt positional attribute comment line / all comment lines
Harald Lüngen9d4e0462024-08-23 09:34:22 +030022# 3b add @head and @deprel to I5 sowie auch @msd
23# 3c bearbeitung von @head und @deprel in tei2korapxml durch Nils?
24# 3d build 30 billion corpus
25
26# 4a take care of IDs
27# 4b see to the values of @xml:lang
28# 5 abfangen von unerwarteten elementen dh andere als <sentence> und <paragraph>
29# 5a wort reihenfolge nochmal checken
30# 6 checks and balances
Harald Lüngen9d4e0462024-08-23 09:34:22 +030031# 7 How to encode Kielipankki and National Library of Finland? in teiCorpus Header
32# 8 construct <idsDoc>s for the months (or go for TEI)
33# 9 parallelisation in bash and application on sub corpora of KLK
34# 10 re-implementation of the gawk code in the perl script
35# 12 re-implement creation of text header from xml file in another twig / parametrize TEI vs I5
36
37
38
39#remember
40#formatted.xml:105613: element w: validity error : No declaration for attribute deprel of element w
41#formatted.xml:105613: element w: validity error : No declaration for attribute head of element w
42
43
44#
45#
46############################################################################################################################################################
47
48
49use strict;
50use warnings;
51
52use XML::Twig;
53use XML::Generator ':pretty'; # apparently no effect when using flush();
54
55
56use locale; # diese drei Zeilen, damit \b im regex nicht Umlaute und ß matcht.
57use POSIX qw(locale_h); # to be able to use setlocale()
58#setlocale(LC_ALL,'de_DE');
59setlocale(LC_ALL, "fi_FI");
60use utf8;
61use open qw( :std :encoding(UTF-8) );
62
63use Time::Piece;
64use Tie::IxHash;
65
Harald Lüngendb5e6e72024-09-04 17:41:18 +030066
67
Harald Lüngen9d4e0462024-08-23 09:34:22 +030068#----------------------
69# check file arguments:
70#----------------------
71
72# arg0 infile: vrt-xml
Harald Lüngen9d4e0462024-08-23 09:34:22 +030073
# Exactly one argument is accepted: the path of the input VRT-XML file.
# The number of arguments is tested instead of the truthiness of $ARGV[0] and
# $ARGV[1], so that an input file literally named "0" is accepted and a second
# argument such as "0" or "" is still rejected.
usage_message() unless @ARGV == 1 && length($ARGV[0]);
Harald Lüngen9d4e0462024-08-23 09:34:22 +030076
77
78####################
79# GLOBAL VARIABLES
80####################
81
# Encoding of the generated output only; it is handed to the input twig
# as its output_encoding option further below.
my $encoding = 'UTF-8';
#my $encoding = "iso-8859-1";

# Running number of the <text> elements handled so far.
my $textcounter = 0;

# Placeholder for the twig of the input document; assigned in the main section.
my $twig = q{};
my $teiCorpusHeaderDoc = q{};

# global variables pertaining to the original corpus:
my $kielipankkiCorpus = 'klk-fi-v2-vrt';
93
94
95
Harald Lüngendb5e6e72024-09-04 17:41:18 +030096
97#------------------------------------------------------------------
98# read corpusHeaderSkeleton document and get header out of it
99#------------------------------------------------------------------
100
# Twig for the corpus header skeleton: whitespace and attribute order are kept
# as found in the skeleton file, comments are dropped.
# Direct method call instead of the indirect object syntax "new XML::Twig(...)",
# which is parsed heuristically and is disabled under "use v5.36".
my $teiCorpusHeaderDocTwig = XML::Twig->new(
    keep_spaces     => 1,
    keep_atts_order => 1,
    comments        => 'drop',
);

# NOTE: the skeleton file name is relative to the current working directory.
$teiCorpusHeaderDocTwig->parsefile("teiCorpusHeaderSkeleton.tei.xml");

# The root element of the skeleton document serves as the teiHeader of the corpus.
my $corpusHeader = $teiCorpusHeaderDocTwig->root;
110
111
112#------------------------------------------------------------------
# read textHeaderSkeleton document and get header out of it
114#------------------------------------------------------------------
115
# Twig for the text header skeleton: whitespace and attribute order are kept
# as found in the skeleton file, comments are dropped.
# Direct method call instead of the indirect object syntax "new XML::Twig(...)",
# which is parsed heuristically and is disabled under "use v5.36".
my $teiTextHeaderDocTwig = XML::Twig->new(
    keep_spaces     => 1,
    keep_atts_order => 1,
    comments        => 'drop',
);

# NOTE: the skeleton file name is relative to the current working directory.
$teiTextHeaderDocTwig->parsefile("teiTextHeaderSkeleton.tei.xml");

# The root element of the skeleton document serves as the template for the
# teiHeader of every text.
my $textHeader = $teiTextHeaderDocTwig->root;
124
125
126#----------------------------------
127# read input VRT-XML document
128#----------------------------------
129
# Check early that the input file can be read. The handle itself is not used
# for parsing, because parsefile() below is given the file name.
# Three-argument open keeps characters such as '>' or '|' in the file name from
# being interpreted as an open mode; $! adds the reason of a failure.
open(my $IN, '<', $ARGV[0]) or die("$0: cannot open file for reading: $ARGV[0]: $!");
132
Harald Lüngendb5e6e72024-09-04 17:41:18 +0300133
Harald Lüngen9d4e0462024-08-23 09:34:22 +0300134#####################
135# M A I N
136#####################
137
Harald Lüngendb5e6e72024-09-04 17:41:18 +0300138#-------------------------------------------------------------------------------------------------------------
139# start twig for input and call start tag handler for root and twig handler for each <text> in the VRT
140#-------------------------------------------------------------------------------------------------------------
Harald Lüngen9d4e0462024-08-23 09:34:22 +0300141
Harald Lüngen9d4e0462024-08-23 09:34:22 +0300142
# Twig for the VRT-XML input. The start tag of the root element <texts> is
# handled by root(), every completed <text> element by text().
# Direct method call instead of the indirect object syntax "new XML::Twig(...)".
$twig = XML::Twig->new(
    keep_spaces        => 1,    # also keeps whitespace at former element boundaries in the output
    keep_atts_order    => 1,    # requires Tie::IxHash
    comments           => 'drop',
    start_tag_handlers => {
        texts => sub { root(@_, $corpusHeader) },
    },
    twig_handlers => {
        # text => \&text
        # A copy is required, because the header is flushed together with
        # $twig in the <text> handler.
        text => sub { text(@_, $textHeader->copy) },
    },
    # dtd_handlers => {             # ToDo for I5
    #     \&set_dtd;
    # }
    output_encoding => $encoding,
);

$twig->parsefile($ARGV[0]);
162
Harald Lüngendb5e6e72024-09-04 17:41:18 +0300163
164
Harald Lüngen9d4e0462024-08-23 09:34:22 +0300165
166
167###########
168# END MAIN
169###########
170
171
172
173
174##############################
175# S U B R O U T I N E S
176##############################
177
178# sub set_dtd [
179# my $twig, $dtd = @_;
180# my $internal = qq|\nPUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"\n"DTD/xhtml1-strict.dtd"|;
181#
182# $twig->twig_doctype('html', undef, undef, $internal);
183# }
184
185
186
# Start-tag handler for the root element of the input.
#
# Parameters:
#   $twig         - the twig of the input document (not used here)
#   $root         - the root element
#   $corpusHeader - the corpus header element that is inserted into $root
#
# Renames the root element to <teiCorpus>, puts it into the TEI namespace and
# lets insertCorpusHeader() add the corpus header.
sub root {
    my ($twig, $root, $corpusHeader) = @_;

    $root->set_gi('teiCorpus');
    $root->set_att(xmlns => 'http://www.tei-c.org/ns/1.0');

    return insertCorpusHeader($root, $corpusHeader);
}
195
196
Harald Lüngendb5e6e72024-09-04 17:41:18 +0300197
# Fills the corpus header with metadata derived from the name of the input
# file and pastes it as the first child of the root element.
#
# Parameters:
#   $root         - the root element of the output document
#   $corpusHeader - the corpus header element to be filled and pasted
#
# The base name of $ARGV[0] is expected to end in "<year>.xml" with a
# four-digit year; what precedes the year is used as the name of the source.
# Reads the file-level variable $kielipankkiCorpus.
# Returns the result of the paste() call.
sub insertCorpusHeader{
    my ($root, $corpusHeader) = @_;

    #---------------------------------------------------------------------------
    # get some metadata for the current output corpus based on source and year
    #---------------------------------------------------------------------------

    # base name of the input file, i.e. the part after the last '/'
    my $source = (split m{/}, $ARGV[0])[-1];
    $source = '' unless defined $source;

    # Cut "<year>.xml" off the base name. The capture is read only if the
    # substitution really matched; otherwise $1 would be undefined or still
    # hold the capture of an earlier, unrelated match.
    my $yy = '';
    if ($source =~ s/([0-9]{4})\.xml$//) {
        $yy = $1;
    }
    else {
        warn "$0: input file name '$source' does not end in <year>.xml, the year is left empty\n";
    }

    #-----------------------
    # set corpus header
    #-----------------------

    set_title(     $corpusHeader, $source, $yy, $kielipankkiCorpus);
    set_sourceDesc($corpusHeader, $source, $yy, $kielipankkiCorpus);

    my $teiCorpusHeader = $corpusHeader->paste("first_child", $root);
    return $teiCorpusHeader;
}
228
Harald Lüngendb5e6e72024-09-04 17:41:18 +0300229
Harald Lüngen9d4e0462024-08-23 09:34:22 +0300230#----------------------------
231# handler &text for <text>
232#----------------------------
233
# Twig handler for one <text> element of the VRT-XML input.
#
# Parameters:
#   $twig       - the twig of the input document
#   $text       - the <text> element that has just been parsed
#   $textHeader - the text header skeleton element to fill for this text
#
# Turns <text> into <TEI>: the attributes of <text> are moved into a teiHeader,
# <paragraph> becomes <p>, <sentence> becomes <s> and every non-empty line of a
# sentence becomes a <w> element. The finished element is flushed to STDOUT.
# Increments the file-level variable $textcounter.
sub text {
    my ($twig, $text, $textHeader) = @_;

    $textcounter++;

    # ToDo: catch all other, unexpected children of root

    #--------------------------------------------------------------------------
    # Get text metadata (attributes of <text>) and create teiHeader for <text>
    #--------------------------------------------------------------------------

    my $textattsref = $text->atts();    # reference to a hash, to be used with '->'

    createTextHeader($text, $textattsref, $textHeader);

    #--------------------------
    # create <TEI> from <text>
    #--------------------------

    # the attributes are deleted only after the header has been built from them
    $text->del_atts;
    $text->set_gi("TEI");

    # ToDo: Generate a proper textsigle in TEI/@xml:id that can be converted into a textsigle

    #------------------------------------------------------------------
    # create the <tei:text>, <body>, <div> elements inside <TEI>
    #------------------------------------------------------------------

    my $ttext_element = XML::Twig::Elt->new('text');
    my $body_element  = XML::Twig::Elt->new('body');
    my $div_element   = XML::Twig::Elt->new('div');

    $div_element  ->set_att("type", "page");      # ToDo: this is specific to KLK
    $ttext_element->set_att("xml:lang", 'fi');    # as in ICC-NOR

    $ttext_element->paste('last_child', $text);
    $body_element ->paste('last_child', $ttext_element);
    $div_element  ->paste('last_child', $body_element);

    #-------------------------------
    # create <p> from <paragraph>
    #-------------------------------

    for my $paragraph ($text->children('paragraph')) {

        setP($paragraph);
        $paragraph->move('last_child', $div_element);

        #------------------------------
        # create <s> from <sentence>
        #------------------------------

        for my $sentence ($paragraph->children('sentence')) {

            setS($sentence);

            #--------------------------------------
            # create <w> (word) from each $line
            #--------------------------------------

            # text() returns the unescaped character data. xml_text() would
            # return it with '&' and '<' already escaped, and the escaping
            # applied again on output to the content and the attributes of <w>
            # would then produce e.g. "&amp;amp;".
            my @lines = split(/\n+/, $sentence->text);
            $sentence->set_text("\n");

            for my $line (@lines) {    # Todo: check the order
                next if $line eq "";

                my $w_element = XML::Twig::Elt->new('w');
                createW($w_element, $line);
                $w_element->paste('last_child', $sentence);
            }
        }
    }

    # $twig->set_pretty_print( 'record');
    # flush() expects a filehandle, not a file name, as its optional first argument.
    return $twig->flush(\*STDOUT);
}
Harald Lüngen9d4e0462024-08-23 09:34:22 +0300327
# Pastes the text header skeleton as the first child of a <text> element and
# fills it with the metadata taken from the attributes of that element.
#
# Parameters:
#   $text        - the <text> element that receives the header
#   $textattsref - reference to the hash of the attributes of <text>
#   $textHeader  - the header skeleton element to be filled
#
# Reads the file-level variables $kielipankkiCorpus and $textcounter.
# Attributes that are missing in $textattsref are treated as empty strings.
# Dies if the skeleton lacks one of the elements addressed below.
sub createTextHeader{
    my ($text, $textattsref, $textHeader) = @_;

    # Attributes of <text> in KLK ("USE" marks the ones that go into the header):
    # USE 01 binding_id="2246025"
    # USE 02 date="2021-01-15"
    #     03 datefrom="20210115"
    #     04 dateto="20210115"
    #     05 elec_date="_"
    #     06 file=""
    # USE 07 filename_metadata="finclarin_siirto_k2021/0039-5552/2021/alto/2246025_0039-5552_2021-01-15_SK0221_mets.xml"
    # USE 08 filename_orig ="finclarin_siirto_k2021/0039-5552/2021/alto/2246025_0039-5552_2021-01-15_SK0221_page-2021011502210030301.xml
    # USE 09 id="t-bcd0f3fa-bbd3dac4"
    #     10 img_url=""
    # USE 11 issue_date="15.01.2021"
    # USE 12 issue_no="SK0221"
    # USE 13 issue_title="Suomen Kuvalehti"
    # USE 14 label="Suomen Kuvalehti no. SK0221 15.01.2021"
    # USE 16 language="fi"
    # USE 17 page_id="p1"
    # USE 18 page_no="None"
    #     19 part_name="_"
    #     20 publ_id="0039-5552"
    #     21 publ_part=""
    # USE 22 publ_title="Suomen Kuvalehti"
    # USE 23 publ_type="aikakausi"
    # USE 24 sentcount="70"
    # USE 25 sum_lang="|xxx:44|fin:23|eng:3|"
    #     26 timefrom="000000"
    #     27 timeto="235959"
    # USE 28 tokencount="304"
    #     29 version_added="KLK-fi-2021">

    # value of an attribute of <text>, or the empty string if it is missing
    my $att = sub {
        my ($name) = @_;
        my $value = $textattsref->{$name};
        return defined $value ? $value : '';
    };

    # the node below $context that $xpath addresses; a missing node is
    # reported with the path instead of a "method on undefined value" error
    my $node = sub {
        my ($context, $xpath) = @_;
        my $found = $context->get_xpath($xpath, 0);
        die "$0: the text header skeleton has no element for '$xpath'" unless defined $found;
        return $found;
    };

    my $BID          = $att->('binding_id');
    my $DATE         = $att->('date');
    my $METAFILENAME = $att->('filename_metadata');
    my $ORIGFILENAME = $att->('filename_orig');
    my $ID           = $att->('id');
    my $ISSUEDATE    = $att->('issue_date');
    my $ISSUENO      = $att->('issue_no');
    my $ISSUETITLE   = $att->('issue_title');
    my $LABEL        = $att->('label');
    my $LANGUAGE     = $att->('language');
    my $PAGEID       = $att->('page_id');
    my $PAGENO       = $att->('page_no');
    my $PUBLTITLE    = $att->('publ_title');
    my $PUBLTYPE     = $att->('publ_type');
    my $SENTCOUNT    = $att->('sentcount');
    my $SUMLANG      = $att->('sum_lang');
    my $TOKENCOUNT   = $att->('tokencount');

    #-----------------------------
    # Derived Metadata variables
    #-----------------------------

    # year, month and day out of a date of the form "2021-01-15"
    my @dateparts = split(/-/, $DATE);
    my ($year, $month, $day) = map { defined $dateparts[$_] ? $dateparts[$_] : '' } 0 .. 2;

    # ToDo: the file name part of $ORIGFILENAME could serve as an ID for the page

    # title used both in titleStmt and in sourceDesc
    my $title = $LABEL . ", Text #" . $textcounter;    # Case KLK; PAGENO mostly seems to be "None"

    #-----------------------------------------------------------------------
    # CREATE text-teiHeader ACCORDING TO THE SKELETON in $textHeader
    #-----------------------------------------------------------------------

    $textHeader->paste('first_child', $text);

    #-----------------------------------------------
    # <teiHeader>
    #   <fileDesc n="EuReCo-KLK-FIN_[$ID]">
    #     <titleStmt>
    #       <title>[$LABEL, page $PAGENO]</title>

    # NOTE(review): corpus name and $ID are joined without a separator, while
    # the pattern above shows a "_" in front of the ID - confirm which is intended.
    $node->($textHeader, './fileDesc')->set_att('n', "EuReCo-". $kielipankkiCorpus . $ID);
    $node->($textHeader, './fileDesc/titleStmt/title')->set_text($title);

    #-----------------------------------------------
    #  <fileDesc>
    #    <sourceDesc>
    #      <biblStruct>
    #        <analytic>
    #          <title type="main">[$LABEL, page $PAGENO]</title>
    #          <date>[$DATE]</date>
    #          <date type="year">TODO</date>
    #          <date type="month">TODO</date>
    #          <date type="day">TODO</date>
    #          <idno type="PAGEID">$PAGEID</idno>
    #          <idno type="BINDINGID">$BID</idno>
    #          <idno type="ID">$ID</idno>
    #          <idno type="KIELIPANKKI_METAFILENAME">$METAFILENAME</idno>
    #          <idno type="KIELIPANKKI_ORIGFILENAME">$ORIGFILENAME</idno>
    #          <textLang>$LANGUAGE</textLang>

    my $analytic = $node->($textHeader, './fileDesc/sourceDesc/biblStruct/analytic');

    $node->($analytic, './title')                                 ->set_text($title);
    $node->($analytic, './date[@type="date"]')                    ->set_text($DATE);
    $node->($analytic, './date[@type="year"]')                    ->set_text($year);
    $node->($analytic, './date[@type="month"]')                   ->set_text($month);
    $node->($analytic, './date[@type="day"]')                     ->set_text($day);
    $node->($analytic, './idno[@type="PAGEID"]')                  ->set_text($PAGEID);
    $node->($analytic, './idno[@type="BINDINGID"]')               ->set_text($BID);
    $node->($analytic, './idno[@type="ID"]')                      ->set_text($ID);
    $node->($analytic, './idno[@type="KIELIPANKKI_METAFILENAME"]')->set_text($METAFILENAME);
    $node->($analytic, './idno[@type="KIELIPANKKI_ORIGFILENAME"]')->set_text($ORIGFILENAME);
    $node->($analytic, './textLang')                              ->set_text($LANGUAGE);

    #        <monogr>
    #          <title>$PUBLTITLE</title>
    #          <imprint>
    #            <pubPlace>TODO</pubPlace>
    #            <publisher>TODO</publisher>
    #          </imprint>
    #          <biblScope unit="ISSUETITLE"/>
    #          <biblScope unit="ISSUENO"/>
    #          <biblScope unit="ISSUEDATE"/>
    #          <biblScope unit="pp">$PAGENO</biblScope>

    my $monogr = $node->($textHeader, './fileDesc/sourceDesc/biblStruct/monogr');

    $node->($monogr, './title')                        ->set_text($PUBLTITLE);
    $node->($monogr, './imprint/pubPlace')             ->set_text("ToDo");    # imprint is needed for tei validity
    $node->($monogr, './imprint/publisher')            ->set_text("ToDo");    # imprint is needed for tei validity
    $node->($monogr, './biblScope[@unit="ISSUETITLE"]')->set_text($ISSUETITLE);
    $node->($monogr, './biblScope[@unit="ISSUENO"]')   ->set_text($ISSUENO);
    $node->($monogr, './biblScope[@unit="ISSUEDATE"]') ->set_text($ISSUEDATE);
    $node->($monogr, './biblScope[@unit="pp"]')        ->set_text($PAGENO);   # caution: PAGENO mostly seems to be "None"

    #  <encodingDesc>
    #    <tagsDecl>
    #      <namespace name="http://www.tei-c.org/ns/1.0">
    #        <tagUsage gi="s" occurs="SENTCOUNT"/>
    #        <tagUsage gi="w" occurs="TOKENCOUNT"/>

    $node->($textHeader, './encodingDesc/tagsDecl/namespace/tagUsage[@gi="s"]')->set_att('occurs', $SENTCOUNT);
    $node->($textHeader, './encodingDesc/tagsDecl/namespace/tagUsage[@gi="w"]')->set_att('occurs', $TOKENCOUNT);

    #  <profileDesc>
    #    <langUsage>
    #      <language ident="fi" usage="|xxx:44|fin:23|eng:3|"/>
    #    </langUsage>
    #    <textClass>
    #      <classCode scheme="kielipankki_klk">$PUBLTYPE</classCode>
    #      <classCode scheme="kielipankki_klk_mapped">TODO</classCode>

    my $language = $node->($textHeader, './profileDesc/langUsage/language');
    $language->set_att('ident', $LANGUAGE);
    $language->set_att('usage', $SUMLANG);
    # @usage should actually hold an integer; best to split up the content of
    # SUMLANG and create several <language> elements

    $node->($textHeader, './profileDesc/textClass/classCode[@scheme="kielipankki_klk"]')       ->set_text($PUBLTYPE);
    $node->($textHeader, './profileDesc/textClass/classCode[@scheme="kielipankki_klk_mapped"]')->set_text("ToDo");

    #  <revisionDesc>
    #    <change when="TODO" who="HL">TEI version for EuReCo</change>

    # the date of the conversion run in the form YYYY-MM-DD
    return $node->($textHeader, './revisionDesc/change')->set_att('when', localtime->ymd('-'));
}
Harald Lüngen695ac1d2024-09-05 08:55:21 +0300496
# Turns a VRT <paragraph> element into a TEI <p> element.
#
# Parameter:
#   $paragraph - the <paragraph> element, changed in place
#
# Example input: <paragraph id="p-bcd0f3fa-bbd3dac4-815ead7a" sum_lang="|fin:1|">
#   @sum_lang - goes into @xml:lang, prefixed with "x-" for a private value
#   @id       - dropped, because these ids are not unique
sub setP {
    my ($paragraph) = @_;

    my $private_lang = "x-" . $paragraph->att("sum_lang");

    $paragraph->set_gi('p');
    $paragraph->set_att("xml:lang", $private_lang);
    $paragraph->del_att("sum_lang");

    # $paragraph->change_att_name('id', 'xml:id');
    return $paragraph->del_att("id");
}
# Turns a VRT <sentence> element into a TEI <s> element.
#
# Parameter:
#   $sentence - the <sentence> element, changed in place
#
# The attributes of <sentence>:
#   @id="s-bcd0f3fa-bbd3dac4-f7429090"  - dropped, because these ids are not unique
#   @lang="fin"                         - copied to @xml:lang, then dropped
#   @lang_conf="0.6734853"              - dropped for the time being (ToDo: @cert ?)
sub setS {
    my ($sentence) = @_;

    # ToDo: convert the value / introduce a hash for lookup (input values: "fin", "xxx", ....)
    my $lang = $sentence->att("lang");

    $sentence->set_gi('s');
    $sentence->set_att("xml:lang", $lang);

    # $sentence->change_att_name('id', 'xml:id');
    $sentence->del_att('id');
    $sentence->del_att("lang");
    return $sentence->del_att("lang_conf");
}
530
# Fills a <w> element from one token line of a VRT sentence.
#
# Parameters:
#   $w_element - the <w> element to fill
#   $line      - one line of the sentence: the tab-separated positional attributes
#
# vrt positional-attributes in corpus KLK:
# USE  [0] word
# USE  [1] ref        (id for reference of dephead)
# USE  [2] lemma
# ?    [3] lemmacomp  (lemma with compound info - could go in @norm, as tag abuse?)
# USE  [4] pos
# USE  [5] msd
# USE  [6] dephead
# USE  [7] deprel
#      [8] content    (ocr-process)
#      [9] vpos       (ocr-process)
#     [10] ocr        (ocr-process)
#     [11] cc         (ocr-process)
#     [12] hyph       (ocr-process)
#     [13] style      (ocr-process)
#     [14] lex        (korp semantic disambiguation from G"oteborg)
sub createW {
    my ($w_element, $line) = @_;

    my @columns = split(/\t/, $line);
    my ($word, $ref, $lemma, $pos, $msd) = @columns[0, 1, 2, 4, 5];

    # the token itself is the content of <w>
    $w_element->set_text($word);

    # the annotations become attributes of <w>
    $w_element->set_att("n", $ref);
    # an id assembled from page, sentence id and $ref would not be unique either
    $w_element->del_att("id");
    $w_element->set_att("lemma", $lemma);
    # column [3] is not used: putting it into @norm would be tag abuse
    $w_element->set_att("pos", $pos);
    return $w_element->set_att("msd", $msd);

    # TMP: columns [6] and [7] are not written to @head and @deprel for now
}
573
Harald Lüngendb5e6e72024-09-04 17:41:18 +0300574
# Writes the title of the corpus into the corpus header.
#
# Parameters:
#   $corpusHeader      - the corpus header element
#   $source            - name of the source
#   $yy                - the year
#   $kielipankkiCorpus - name of the original corpus
#
# Target node:
#   <teiHeader>
#     <fileDesc>
#       <titleStmt>
#         <title>[Aamulehti2021] from [klk-fi-v2-vrt for] EuReCo</title>
#       </titleStmt>
#     </fileDesc>
#   </teiHeader>
sub set_title{
    my ($corpusHeader, $source, $yy, $kielipankkiCorpus) = @_;

    # walk down from the header to fileDesc/titleStmt/title
    my $title_node = $corpusHeader;
    $title_node = $title_node->first_child($_) for qw(fileDesc titleStmt title);

    return $title_node->set_text("$source $yy, from $kielipankkiCorpus for EuReCo");
}
594
# Writes the description of the source into the corpus header.
#
# Parameters:
#   $corpusHeader      - the corpus header element
#   $source            - name of the source
#   $yy                - the year
#   $kielipankkiCorpus - name of the original corpus
#
# Target node:
#   <teiHeader>
#     <fileDesc>
#       <sourceDesc>
#         <bibl>[Aamulehti2021] from [klk-fi-v2-vrt]</bibl>
#       </sourceDesc>
#     </fileDesc>
#   </teiHeader>
sub set_sourceDesc{
    my ($corpusHeader, $source, $yy, $kielipankkiCorpus) = @_;

    # walk down from the header to fileDesc/sourceDesc/bibl
    my $bibl_node = $corpusHeader;
    $bibl_node = $bibl_node->first_child($_) for qw(fileDesc sourceDesc bibl);

    return $bibl_node->set_text("$source $yy, from $kielipankkiCorpus for EuReCo");
}
614
615
616
617
618
619
620
Harald Lüngen9d4e0462024-08-23 09:34:22 +0300621#################
622## usage_message
623#################
624
625
# Prints the usage of the script and terminates it.
#
# The message goes to STDERR, because STDOUT carries the generated TEI
# document and is typically redirected into the output file. The exit status
# is non-zero, so that calling scripts can detect the wrong invocation.
# The script accepts the input file as its only argument; there is no
# output-file argument.
sub usage_message {
    print STDERR " Usage: ./vrt2tei.pl <file.vrt.xml> > <outfile>\n";
    print STDERR " <file.vrt.xml> is a VRT file converted to proper XML\n";
    print STDERR " the TEI output is written to STDOUT\n";
    exit 1;
}
631
632