blob: 67227d2053ef9fb5d89efad4588c386f37d9f6ea [file] [log] [blame]
Harald Lüngen9d4e0462024-08-23 09:34:22 +03001#! /usr/bin/perl -w
2
3
4###########################################################################################################################################################
5# vrt2tei.pl
6# eureco
7# leibniz-institut fuer deutsche sprache / csc finland esbo
8# august 2024
9#
10#
# using XML::Twig, see http://www.xmltwig.org/, https://metacpan.org/pod/XML::Twig
12#
# usage: see the usage_message function below
# Usage: ./vrt2tei.pl <vrtxmlfile.xml> > <outfile>
#   <vrtxmlfile.xml>: xml-ised vrt file (the TEI result is written to stdout)
16#
17#
18# TODO:
19# 1 insert dtd spec, or ref to TEI
20
21# 3a UPLOAD in GITHUB
22# 3b add @head and @deprel to I5 sowie auch @msd
23# 3c bearbeitung von @head und @deprel in tei2korapxml durch Nils?
24# 3d build 30 billion corpus
25
26# 4a take care of IDs
27# 4b see to the values of @xml:lang
28# 5 abfangen von unerwarteten elementen dh andere als <sentence> und <paragraph>
29# 5a wort reihenfolge nochmal checken
30# 6 checks and balances
31# 6a output nach stdout machen
32# 7 How to encode Kielipankki and National Library of Finland? in teiCorpus Header
33# 8 construct <idsDoc>s for the months (or go for TEI)
34# 9 parallelisation in bash and application on sub corpora of KLK
35# 10 re-implementation of the gawk code in the perl script
36# 12 re-implement creation of text header from xml file in another twig / parametrize TEI vs I5
37
38
39
40#remember
41#formatted.xml:105613: element w: validity error : No declaration for attribute deprel of element w
42#formatted.xml:105613: element w: validity error : No declaration for attribute head of element w
43
44
45#
46#
47############################################################################################################################################################
48
49
50use strict;
51use warnings;
52
53use XML::Twig;
54use XML::Generator ':pretty'; # apparently no effect when using flush();
55
56
57use locale; # diese drei Zeilen, damit \b im regex nicht Umlaute und ß matcht.
58use POSIX qw(locale_h); # to be able to use setlocale()
59#setlocale(LC_ALL,'de_DE');
60setlocale(LC_ALL, "fi_FI");
61use utf8;
62use open qw( :std :encoding(UTF-8) );
63
64use Time::Piece;
65use Tie::IxHash;
66
67#----------------------
68# check file arguments:
69#----------------------
70
71# arg0 infile: vrt-xml
Harald Lüngen9d4e0462024-08-23 09:34:22 +030072
# Exactly one argument is accepted: the XML-ised VRT input file.
# Counting @ARGV (instead of testing the truthiness of $ARGV[0]) also
# accepts an input file that happens to be named "0".
usage_message() unless @ARGV == 1;


####################
# GLOBAL VARIABLES
####################

my $encoding = "UTF-8";         # used ONLY for the output, see output_encoding in the twig constructor below
#my $encoding = "iso-8859-1";
my $textcounter = 0;            # number of <text> elements handled so far; incremented in text()


#####################
# M A I N
#####################

# Fail early, with the system error message, if the input file cannot be read.
# The handle itself is not needed because parsefile() (see below) is applied
# to the file name, so it is closed again right away.
open(my $IN, '<', $ARGV[0]) or die("$0: cannot open file for reading: $ARGV[0]: $!");
close($IN);

## open(my $OUT, '>', $ARGV[1]) or die("$0: cannot open file: $ARGV[1]");  # result file handle;
                                                                            # currently not used, the result is flushed to stdout

#-----------------------------------------------------------------------------------
# start twig and call start tag handler for root and twig handler for each <text>
#-----------------------------------------------------------------------------------

my $twig = XML::Twig->new(
    keep_spaces        => 1,             # also keeps whitespace at former element boundaries in the output
    keep_atts_order    => 1,             # requires Tie::IxHash
    pretty_print       => 'indented',
    start_tag_handlers => {
        texts => \&root
    },
    twig_handlers      => {
        text => \&text
    },
    # dtd_handlers     => {              # ToDo for I5
    #     \&set_dtd;
    # }

    output_encoding    => $encoding,
);

$twig->parsefile($ARGV[0]);

##TMP close($OUT);
Harald Lüngen9d4e0462024-08-23 09:34:22 +0300121
122
123###########
124# END MAIN
125###########
126
127
128
129
130##############################
131# S U B R O U T I N E S
132##############################
133
134# sub set_dtd [
135# my $twig, $dtd = @_;
136# my $internal = qq|\nPUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"\n"DTD/xhtml1-strict.dtd"|;
137#
138# $twig->twig_doctype('html', undef, undef, $internal);
139# }
140
141
142
# Start-tag handler for the root element <texts>.
#
# Renames the root to <teiCorpus>, declares the TEI namespace on it and
# inserts the corpus-level <teiHeader> as its first child.
#
# Parameters:
#   $twig - the XML::Twig object (passed by XML::Twig, not used here)
#   $root - the root element whose start tag was just parsed
sub root {
    my ($twig, $root) = @_;

    $root->set_gi('teiCorpus');
    $root->set_att("xmlns", 'http://www.tei-c.org/ns/1.0');

    insertCorpusHeader($root);
}
151
152
# Builds the corpus-level <teiHeader> and inserts it as first child of $root.
#
# The title and <bibl> text are derived from the base name of the input file
# given in $ARGV[0] (directory part and a trailing ".xml" removed).
#
# Parameters:
#   $root - the (already renamed) <teiCorpus> root element
sub insertCorpusHeader {
    my ($root) = @_;

    # base name of the input file: last component of the "/"-separated path
    my $source = (split(/\//, $ARGV[0]))[-1];
    $source =~ s/\.xml\z//;                        # strip the extension only, not a ".xml" inside the name
    $source = $source . " from klk-fi-v2-vrt";     # for the time being; TODO

    my $teiHeader       = $root           ->insert_new_elt("first_child", 'teiHeader');
    my $fileDesc        = $teiHeader      ->insert_new_elt("last_child", 'fileDesc');
    my $profileDesc     = $teiHeader      ->insert_new_elt("last_child", 'profileDesc');

    # fileDesc/titleStmt
    my $titleStmt       = $fileDesc       ->insert_new_elt("last_child", 'titleStmt');
    my $title           = $titleStmt      ->insert_new_elt("last_child", 'title');
    $title->set_text($source . " from KLK-fi-2021 for EuReCo");

    # fileDesc/publicationStmt
    my $publicationStmt = $fileDesc       ->insert_new_elt("last_child", 'publicationStmt');
    my $distributor     = $publicationStmt->insert_new_elt("last_child", 'distributor');
    $distributor->set_text("NOT FOR DISTRIBUTION - to be used locally in EuReCo");

    # fileDesc/sourceDesc
    my $sourceDesc      = $fileDesc       ->insert_new_elt("last_child", 'sourceDesc');
    my $bibl            = $sourceDesc     ->insert_new_elt("last_child", 'bibl');
    $bibl->set_text($source);

    # profileDesc/langUsage
    my $langUsage       = $profileDesc    ->insert_new_elt("last_child", 'langUsage');
    my $language        = $langUsage      ->insert_new_elt("last_child", 'language');
    $language->set_att("ident", 'fi');
    $language->set_text("Finnish");
}
183
184
185#----------------------------
186# handler &text for <text>
187#----------------------------
188
# Twig handler for each completely parsed <text> element.
#
# Turns the VRT <text> into a <TEI> element consisting of a <teiHeader>
# (built from the attributes of <text>) and <text>/<body>/<div type="page">,
# converts <paragraph> to <p>, <sentence> to <s> and every token line of a
# sentence to a <w> element, then flushes the finished element to stdout.
#
# Parameters:
#   $twig - the XML::Twig object
#   $text - the <text> element that was just parsed
#
# Side effect: increments the file-level counter $textcounter.
sub text {
    my ($twig, $text) = @_;

    $textcounter++;    # global variable

    # ToDo: catch all other, unexpected children of root

    #--------------------------------------------------------------------------
    # Get text metadata (attributes of <text>) and create teiHeader for <text>
    #--------------------------------------------------------------------------

    my $textattsref = $text->atts();    # reference to the attribute hash, to be used with '->'

    createTextHeader($text, $textattsref);

    #--------------------------
    # create <TEI> from <text>
    #--------------------------

    # the attributes were evaluated above, so they can be dropped now
    $text->del_atts;
    $text->set_gi("TEI");

    #------------------------------------------------------------------
    # create the <tei:text>, <body>, <div> elements inside <TEI>
    #------------------------------------------------------------------

    my $ttext_element = XML::Twig::Elt->new('text');
    my $body_element  = XML::Twig::Elt->new('body');
    my $div_element   = XML::Twig::Elt->new('div');

    # set atts
    $div_element  ->set_att("type", "page");       # ToDo: this is specific to KLK
    $ttext_element->set_att("xml:lang", 'fi');     # as in ICC-NOR

    # paste: TEI > text > body > div
    $ttext_element->paste('last_child', $text);
    $body_element ->paste('last_child', $ttext_element);
    $div_element  ->paste('last_child', $body_element);

    #-------------------------------
    # create <p> from <paragraph>
    #-------------------------------

    foreach my $paragraph ($text->children('paragraph')) {

        setP($paragraph);
        $paragraph->move('last_child', $div_element);

        #------------------------------
        # create <s> from <sentence>
        #------------------------------

        foreach my $sentence ($paragraph->children('sentence')) {

            setS($sentence);

            #--------------------------------------
            # create <w> (word) from each $line
            #--------------------------------------

            # text() yields the unescaped character data. xml_text() would
            # yield "&amp;" etc., which set_text()/set_att() in createW()
            # would then escape a second time in the output.
            my @lines = split(/\n+/, $sentence->text);
            $sentence->set_text("\n");

            for my $line (@lines) {    # Todo: check the order
                next if $line eq "";

                my $w_element = XML::Twig::Elt->new('w');
                createW($w_element, $line);
                $w_element->paste('last_child', $sentence);
            }    # end words
        }    # end sentences
    }    # end paragraphs

    $twig->set_pretty_print('record');
    # $twig->flush($OUT);
    $twig->flush(\*STDOUT);    # flush() expects a filehandle, not a path string
}
Harald Lüngen9d4e0462024-08-23 09:34:22 +0300273
# Creates the text-level <teiHeader> and pastes it as first child of $text.
#
# Parameters:
#   $text        - the element that receives the header (the VRT <text>,
#                  which the caller afterwards renames to <TEI>)
#   $textattsref - reference to the attribute hash of the VRT <text> element
#
# Reads the file-level counter $textcounter for the titles. Attributes that
# are missing in $textattsref are treated as empty strings, so that the header
# is still built (with empty values) instead of passing undef around.
sub createTextHeader {
    my ($text, $textattsref) = @_;

    # attributes of <text> in KLK; the ones marked USE go into the header:
    #
    # USE 01 binding_id="2246025"
    # USE 02 date="2021-01-15"
    #     03 datefrom="20210115"
    #     04 dateto="20210115"
    #     05 elec_date="_"
    #     06 file=""
    # USE 07 filename_metadata="finclarin_siirto_k2021/0039-5552/2021/alto/2246025_0039-5552_2021-01-15_SK0221_mets.xml"
    # USE 08 filename_orig    ="finclarin_siirto_k2021/0039-5552/2021/alto/2246025_0039-5552_2021-01-15_SK0221_page-2021011502210030301.xml
    # USE 09 id="t-bcd0f3fa-bbd3dac4"
    #     10 img_url=""
    # USE 11 issue_date="15.01.2021"
    # USE 12 issue_no="SK0221"
    # USE 13 issue_title="Suomen Kuvalehti"
    # USE 14 label="Suomen Kuvalehti no. SK0221 15.01.2021"
    # USE 16 language="fi"
    # USE 17 page_id="p1"
    # USE 18 page_no="None"
    #     19 part_name="_"
    #     20 publ_id="0039-5552"
    #     21 publ_part=""
    # USE 22 publ_title="Suomen Kuvalehti"
    # USE 23 publ_type="aikakausi"
    # USE 24 sentcount="70"
    # USE 25 sum_lang="|xxx:44|fin:23|eng:3|"
    #     26 timefrom="000000"
    #     27 timeto="235959"
    # USE 28 tokencount="304"
    #     29 version_added="KLK-fi-2021">

    # attribute lookup that maps a missing attribute to the empty string
    my $att = sub {
        my ($name) = @_;
        my $value = $textattsref->{$name};
        return defined $value ? $value : '';
    };

    my $BID          = $att->('binding_id');
    my $DATE         = $att->('date');
    my $METAFILENAME = $att->('filename_metadata');
    my $ORIGFILENAME = $att->('filename_orig');
    my $ID           = $att->('id');
    my $ISSUEDATE    = $att->('issue_date');
    my $ISSUENO      = $att->('issue_no');
    my $ISSUETITLE   = $att->('issue_title');
    my $LABEL        = $att->('label');
    my $LANGUAGE     = $att->('language');
    my $PAGEID       = $att->('page_id');
    my $PAGENO       = $att->('page_no');
    my $PUBLTITLE    = $att->('publ_title');
    my $PUBLTYPE     = $att->('publ_type');
    my $SENTCOUNT    = $att->('sentcount');
    my $SUMLANG      = $att->('sum_lang');
    my $TOKENCOUNT   = $att->('tokencount');


    #-----------------------------
    # Derived Metadata variables
    #-----------------------------

    # year, month, day from "YYYY-MM-DD"; missing parts become ''
    my @datearray = split(/-/, $DATE);
    my ($YEAR, $MONTH, $DAY) = map { defined $datearray[$_] ? $datearray[$_] : '' } 0 .. 2;

    # the fifth component of filename_orig, split at "." and "/", is used as ID for the page
    my @namearray = split(/[\.\/]/, $ORIGFILENAME);
    my $PAGEFILEID = defined $namearray[4] ? $namearray[4] : '';


    #-----------------------------------------------------------------------
    # CREATE text-teiHeader ACCORDING TO THE SKELETON in klk-header.tei.xml
    #-----------------------------------------------------------------------

    # create <teiHeader> inside <TEI>
    my $teiHeader = XML::Twig::Elt->new('teiHeader');
    $teiHeader->paste('first_child', $text);

    ## insert_new_elt is a combo of new and paste, cf. xml::twig docu:
    ## insert_new_elt ($opt_position, $gi, $opt_atts_hashref, @opt_content)

    my $fileDesc     = $teiHeader->insert_new_elt("last_child", 'fileDesc' => {n => "EuReCo_KLK-fi_" . $PAGEFILEID});
    my $encodingDesc = $teiHeader->insert_new_elt("last_child", 'encodingDesc');
    my $profileDesc  = $teiHeader->insert_new_elt("last_child", 'profileDesc');
    my $revisionDesc = $teiHeader->insert_new_elt("last_child", 'revisionDesc');

    #---------------------
    # fileDesc/titleStmt
    #---------------------
    my $titleStmt = $fileDesc ->insert_new_elt("last_child", 'titleStmt');
    my $title     = $titleStmt->insert_new_elt("last_child", 'title');
    my $respStmt  = $titleStmt->insert_new_elt("last_child", 'respStmt');
    my $resp      = $respStmt ->insert_new_elt("last_child", 'resp');
    my $name      = $respStmt ->insert_new_elt("last_child", 'name');

    # set texts for titleStmt
    # $title->set_text($LABEL . ", page " . $PAGENO);        # caution - PAGENO seems to be "None" most of the time
    $title->set_text($LABEL . ", Text #" . $textcounter);    # at least for Suomen Kuvalehti
    $resp ->set_text("compiled by EuReCo");
    $name ->set_text("EuReCo: HL");

    #--------------------------
    # fileDesc/publicationStmt
    #--------------------------
    my $publicationStmt = $fileDesc       ->insert_new_elt("last_child", 'publicationStmt');
    my $distributor     = $publicationStmt->insert_new_elt("last_child", 'distributor');
    my $note            = $distributor    ->insert_new_elt("last_child", 'note');
    my $availability    = $publicationStmt->insert_new_elt("last_child", 'availability');
    my $licence         = $availability   ->insert_new_elt("last_child", 'licence');

    # set texts for publicationStmt
    $note   ->set_text("NOT FOR DISTRIBUTION - to be used locally in EuReCo");
    $licence->set_text("CLARIN-RES");    # TODO: more detailed licence info as in the KLK metadata record

    #------------------------------
    # fileDesc/sourceDesc/biblStruct
    #------------------------------
    my $sourceDesc = $fileDesc  ->insert_new_elt("last_child", 'sourceDesc');
    my $biblStruct = $sourceDesc->insert_new_elt("last_child", 'biblStruct');

    # fileDesc/sourceDesc/biblStruct/analytic
    my $analytic                = $biblStruct->insert_new_elt("last_child", 'analytic');
    my $analytic_title          = $analytic->insert_new_elt("last_child", 'title' => {type => "main"});
    # my $analytic_date         = $analytic->insert_new_elt("last_child", 'date');
    my $analytic_date_year      = $analytic->insert_new_elt("last_child", 'date' => {type => "year"});
    my $analytic_date_month     = $analytic->insert_new_elt("last_child", 'date' => {type => "month"});
    my $analytic_date_day       = $analytic->insert_new_elt("last_child", 'date' => {type => "day"});
    my $analytic_idno_pageid    = $analytic->insert_new_elt("last_child", 'idno' => {type => "PAGEID"});
    my $analytic_idno_bindingid = $analytic->insert_new_elt("last_child", 'idno' => {type => "BINDINGID"});
    my $analytic_idno_id        = $analytic->insert_new_elt("last_child", 'idno' => {type => "ID"});
    my $analytic_idno_metafile  = $analytic->insert_new_elt("last_child", 'idno' => {type => "KIELIPANKKI_METAFILENAME"});
    my $analytic_idno_origfile  = $analytic->insert_new_elt("last_child", 'idno' => {type => "KIELIPANKKI_ORIGFILENAME"});
    my $analytic_textlang       = $analytic->insert_new_elt("last_child", 'textLang');

    # set texts for analytic
    # $analytic_title       ->set_text($LABEL . ", page " . $PAGENO);    # caution: $PAGENO seems to be "None" most of the time
    $analytic_title         ->set_text($LABEL . ", Text #" . $textcounter);
    # $analytic_date        ->set_text($DATE);
    $analytic_date_year     ->set_text($YEAR);
    $analytic_date_month    ->set_text($MONTH);
    $analytic_date_day      ->set_text($DAY);
    $analytic_idno_pageid   ->set_text($PAGEID);
    $analytic_idno_bindingid->set_text($BID);
    $analytic_idno_id       ->set_text($ID);
    $analytic_idno_metafile ->set_text($METAFILENAME);
    $analytic_idno_origfile ->set_text($ORIGFILENAME);
    $analytic_textlang      ->set_text($LANGUAGE);

    #-------------------------------------
    # fileDesc/sourceDesc/biblStruct/monogr
    #-------------------------------------
    my $monogr               = $biblStruct->insert_new_elt("last_child", 'monogr');
    my $monogr_title         = $monogr ->insert_new_elt("last_child", 'title');
    my $imprint              = $monogr ->insert_new_elt("last_child", 'imprint');      # imprint is needed for validity
    my $pubPlace             = $imprint->insert_new_elt("last_child", 'pubPlace');
    my $publisher            = $imprint->insert_new_elt("last_child", 'publisher');
    my $biblScope_issuetitle = $monogr ->insert_new_elt("last_child", 'biblScope' => {unit => 'ISSUETITLE'});
    my $biblScope_issueno    = $monogr ->insert_new_elt("last_child", 'biblScope' => {unit => 'ISSUENO'});
    my $biblScope_issuedate  = $monogr ->insert_new_elt("last_child", 'biblScope' => {unit => 'ISSUEDATE'});
    my $biblScope_pp         = $monogr ->insert_new_elt("last_child", 'biblScope' => {unit => 'PAGENO'});    # caution: PAGENO is mostly "None"?

    # set texts for monogr
    $monogr_title        ->set_text($PUBLTITLE);
    $pubPlace            ->set_text("TODO");
    $pubPlace            ->set_att("key", 'FI');
    $publisher           ->set_text("TODO");
    $biblScope_issuetitle->set_text($ISSUETITLE);
    $biblScope_issueno   ->set_text($ISSUENO);
    $biblScope_issuedate ->set_text($ISSUEDATE);
    $biblScope_pp        ->set_text($PAGENO);

    #---------------
    # encodingDesc
    #---------------
    my $tagsDecl  = $encodingDesc->insert_new_elt("last_child", 'tagsDecl');
    my $namespace = $tagsDecl    ->insert_new_elt("last_child", 'namespace' => {name => 'http://www.tei-c.org/ns/1.0'});
    $namespace->insert_new_elt("last_child", 'tagUsage' => {gi => 's', occurs => $SENTCOUNT});
    $namespace->insert_new_elt("last_child", 'tagUsage' => {gi => 'w', occurs => $TOKENCOUNT});

    #-------------
    # profileDesc
    #-------------
    my $langUsage = $profileDesc->insert_new_elt("last_child", 'langUsage');
    $langUsage->insert_new_elt("last_child", 'language' => {ident => $LANGUAGE, usage => $SUMLANG});
    # Caution: @usage should actually hold an integer. TODO: split the content
    # of sum_lang (note: at /\|/, a plain "|" pattern splits into single
    # characters) and create one <language> per entry.
    my $textClass    = $profileDesc->insert_new_elt("last_child", 'textClass');
    my $classCode_fi = $textClass  ->insert_new_elt("last_child", 'classCode' => {scheme => "KLK_PUBLTYPE"});
    # my $classCode_en = $textClass->insert_new_elt("last_child", 'classCode' => {scheme => "KLK_PUBLTYPE_MAPPED"});

    #---------------------------
    # set texts for profileDesc
    #---------------------------
    $classCode_fi->set_text($PUBLTYPE);
    # $classCode_en->set_text($PUBLTYPETRANSL);

    #---------------
    # revisionDesc
    #---------------
    my $today  = localtime->ymd('-');    # Time::Piece: current date as YYYY-MM-DD
    my $change = $revisionDesc->insert_new_elt("last_child", 'change' => {when => $today, who => 'HL'});

    # set texts for revisionDesc
    $change->set_text("TEI version for EuReCo");

    ###################################
    # END OF CREATING TEIHEADER
    ###################################
}
474
# Converts a VRT <paragraph> element in place into a TEI <p>.
#
# Example input: <paragraph id="p-bcd0f3fa-bbd3dac4-815ead7a" sum_lang="|fin:1|">
#
# atts of <paragraph>:
#   @id        dropped, because these ids are not unique
#   @sum_lang  moved to @xml:lang, prefixed with "x-" for a private value;
#              if @sum_lang is absent, no @xml:lang is written
#
# Parameters:
#   $paragraph - the <paragraph> element to convert
sub setP {
    my ($paragraph) = @_;

    $paragraph->set_gi('p');

    my $sum_lang = $paragraph->att("sum_lang");
    if (defined $sum_lang) {
        $paragraph->set_att("xml:lang", "x-" . $sum_lang);
    }
    $paragraph->del_att("sum_lang");

    # $paragraph->change_att_name('id', 'xml:id');
    $paragraph->del_att("id");    # this id is not unique either!
}
# Converts a VRT <sentence> element in place into a TEI <s>.
#
# the atts of <sentence>:
#   1 @id="s-bcd0f3fa-bbd3dac4-f7429090"  dropped, because these ids are not unique
#   2 @lang="fin"                         moved to @xml:lang; if @lang is absent,
#                                         no @xml:lang is written
#   3 @lang_conf="0.6734853"              dropped for the time being (ToDo: @cert ?)
#
# Parameters:
#   $sentence - the <sentence> element to convert
sub setS {
    my ($sentence) = @_;

    $sentence->set_gi('s');

    # ToDo: convert the value / introduce a hash for lookup (input values: "fin", "xxx", ....)
    my $lang = $sentence->att("lang");
    if (defined $lang) {
        $sentence->set_att("xml:lang", $lang);
    }

    # $sentence->change_att_name('id', 'xml:id');    # not unique
    $sentence->del_att('id');
    $sentence->del_att("lang");         # replaced by xml:lang
    $sentence->del_att("lang_conf");    # for the time being
}
508
# Fills a <w> element from one token line of a VRT sentence.
#
# The line consists of tab-separated positional attributes:
#
#   USE [0]  word
#   USE [1]  ref        (id for reference of dephead)
#   USE [2]  lemma
#   ?   [3]  lemmacomp  (lemma with compound info - could go in @norm, as tag abuse?)
#   USE [4]  pos
#   USE [5]  msd
#   USE [6]  dephead    (currently disabled, see #TMP below)
#   USE [7]  deprel     (currently disabled, see #TMP below)
#       [8]  content    (ocr-process)
#       [9]  vpos       (ocr-process)
#       [10] ocr        (ocr-process)
#       [11] cc         (ocr-process)
#       [12] hyph       (ocr-process)
#       [13] style      (ocr-process)
#       [14] lex        (korp semantic disambiguation from Goeteborg)
#
# Columns that are missing in a short line are treated as empty strings, so
# the element always gets its text and the attributes @n, @lemma, @pos, @msd.
#
# Parameters:
#   $w_element - the <w> element to fill
#   $line      - one token line
sub createW {
    my ($w_element, $line) = @_;

    #---------------------------
    # Get the tags (=columns)
    #---------------------------

    my @tags = split(/\t/, $line);
    my ($word, $ref, $lemma, $pos, $msd) =
        map { defined $tags[$_] ? $tags[$_] : '' } (0, 1, 2, 4, 5);

    # set content of <w> i.e. the token
    $w_element->set_text($word);

    # set the attributes of <w>:
    $w_element->set_att("n", $ref);
    # $w_element->set_att("id", "w_" . $namearray[4] . $sentence->att("xml:id") . "_" . $tags[1]);
    # an ID assembled like this is not unique either...
    $w_element->del_att("id");
    $w_element->set_att("lemma", $lemma);
    # $w_element->set_att("norm", $tags[3]);    # tag abuse of @norm
    $w_element->set_att("pos", $pos);
    $w_element->set_att("msd", $msd);
#TMP    $w_element->set_att("head",   $tags[6]);
#TMP    $w_element->set_att("deprel", $tags[7]);
}
551
552#################
553## usage_message
554#################
555
556
# Prints the usage message and terminates the script.
#
# The message goes to STDERR because STDOUT carries the TEI result, and the
# exit status is non-zero because this sub is only called on wrong arguments.
# The script takes exactly one argument, so the result file is shown as a
# shell redirection rather than as a second argument.
sub usage_message {
    print STDERR " Usage: ./vrt2tei.pl <file.vrt.xml> > <outfile>\n";
    print STDERR "  <file.vrt.xml> is a VRT file converted to proper XML\n";
    print STDERR "  the TEI result is written to standard output\n";
    exit 1;
}
562
563