blob: 42abd2b0b4676a3a7c5434064ffd502b6ba238c2 [file] [log] [blame]
Harald Lüngen9d4e0462024-08-23 09:34:22 +03001#! /usr/bin/perl -w
2
3
4###########################################################################################################################################################
5# vrt2tei.pl
6# eureco
7# leibniz-institut fuer deutsche sprache / csc finland esbo
8# august 2024
9#
10#
# using XML::Twig, see http://www.xmltwig.org/, https://metacpan.org/pod/XML::Twig
12#
# Usage: ./vrt2tei.pl <vrtxmlfile.xml> > <outfile>     (see also the usage function below)
#   <vrtxmlfile.xml>: xml-ised vrt file (exactly one argument; the result is written to stdout)
16#
17#
18# TODO:
Harald Lüngen9d4e0462024-08-23 09:34:22 +030019
Harald Lüngendb5e6e72024-09-04 17:41:18 +030020# 3a remove the vrt positional attribute comment line / all comment lines
Harald Lüngen9d4e0462024-08-23 09:34:22 +030021# 3b add @head and @deprel to I5 sowie auch @msd
22# 3c bearbeitung von @head und @deprel in tei2korapxml durch Nils?
23# 3d build 30 billion corpus
24
25# 4a take care of IDs
26# 4b see to the values of @xml:lang
27# 5 abfangen von unerwarteten elementen dh andere als <sentence> und <paragraph>
28# 5a wort reihenfolge nochmal checken
29# 6 checks and balances
Harald Lüngen9d4e0462024-08-23 09:34:22 +030030# 7 How to encode Kielipankki and National Library of Finland? in teiCorpus Header
31# 8 construct <idsDoc>s for the months (or go for TEI)
32# 9 parallelisation in bash and application on sub corpora of KLK
33# 10 re-implementation of the gawk code in the perl script
34# 12 re-implement creation of text header from xml file in another twig / parametrize TEI vs I5
35
36
37
38#remember
39#formatted.xml:105613: element w: validity error : No declaration for attribute deprel of element w
40#formatted.xml:105613: element w: validity error : No declaration for attribute head of element w
41
42
43#
44#
45############################################################################################################################################################
46
47
48use strict;
49use warnings;
Harald Lüngen381c2a22024-09-17 09:06:39 +030050#use diagnostics;
Harald Lüngen9d4e0462024-08-23 09:34:22 +030051
Harald Lüngen2551a952024-09-15 08:08:35 +030052use Getopt::Std;
Harald Lüngen9d4e0462024-08-23 09:34:22 +030053use XML::Twig;
54use XML::Generator ':pretty'; # apparently no effect when using flush();
55
56
57use locale; # diese drei Zeilen, damit \b im regex nicht Umlaute und ß matcht.
58use POSIX qw(locale_h); # to be able to use setlocale()
59#setlocale(LC_ALL,'de_DE');
60setlocale(LC_ALL, "fi_FI");
61use utf8;
62use open qw( :std :encoding(UTF-8) );
63
64use Time::Piece;
65use Tie::IxHash;
Harald Lüngen381c2a22024-09-17 09:06:39 +030066use Data::Random::String;
Harald Lüngendb5e6e72024-09-04 17:41:18 +030067
68
Harald Lüngen9d4e0462024-08-23 09:34:22 +030069#----------------------
70# check file arguments:
71#----------------------
72
73# arg0 infile: vrt-xml
Harald Lüngen9d4e0462024-08-23 09:34:22 +030074
# Exactly one command-line argument is accepted: the input VRT-XML file.
# The arguments are counted instead of being tested for truth, so that an
# input file literally named "0" is accepted and a second argument is always
# rejected, even if it is "0" or the empty string.
usage_message() if @ARGV != 1 || $ARGV[0] eq '';
Harald Lüngen9d4e0462024-08-23 09:34:22 +030077
78
Harald Lüngen2551a952024-09-15 08:08:35 +030079#--------------------------
80# get options / auxiliary files
81#--------------------------
82
83
84
85
86
Harald Lüngen9d4e0462024-08-23 09:34:22 +030087####################
88# GLOBAL VARIABLES
89####################
90
# This encoding is used ONLY for the output (see output_encoding of the input twig below).
my $encoding = "UTF-8";

# Target format: either "TEI" or "I5".
#my $TEIFORMAT = "TEI";
my $TEIFORMAT = "I5";

# DOCTYPE declaration, for I5 only
my $DTDDECL = 'idsCorpus PUBLIC "-//IDS//DTD IDS-I5 1.0//EN" "http://corpora.ids-mannheim.de/I5/DTD/i5.dtd"';

my $textcounter = 0;
my $LASTMONTH   = 0;

# Lookup tables for the source metadata; they are filled while reading $sourcescsvfile.
our (%corpusids, %srcfullnames, %srcpubplaces, %srcpublishers, %srctexttypes, %srctextlangs);

# Language code => language name
our %expandLang;

# Running document number per month ("01" .. "12"), by the month as in dereko; each starts at 1.
my %doccounter = map { ( sprintf("%02d", $_) => 1 ) } 1 .. 12;

# global variables pertaining to the original corpus of *all* newspapers:
my $kielipankkiCorpus  = "klk-fi-v2-vrt";
my $kielipankkiLicense = "CLARIN-RES";
my $CountryKey         = "FI";

# Table with metadata about the different sources (newspapers)
my $sourcescsvfile = "sources_klk_fi_v2_2021_4eureco.csv";

# corpusheader and textheader skeletons: the I5 variants for I5, the TEI variants otherwise
my $corpheaderfile = $TEIFORMAT eq "I5" ? "i5CorpusHeaderSkeleton.i5.xml" : "teiCorpusHeaderSkeleton.tei.xml";
my $textheaderfile = $TEIFORMAT eq "I5" ? "i5TextHeaderSkeleton.i5.xml"   : "teiTextHeaderSkeleton.tei.xml";
Harald Lüngen9d4e0462024-08-23 09:34:22 +0300141
Harald Lüngendb5e6e72024-09-04 17:41:18 +0300142
my $twig = "";    # placeholder; the XML::Twig object for the input is assigned in MAIN

# Variables $fnsource and $fnyear are taken from the input file name, which
# must have the form <source><YYYY>.xml (any leading directories are ignored).
# @array and $l are kept as file-level variables in case code further down
# the file refers to them.
my @array    = split(/\//, $ARGV[0]);
my $l        = scalar(@array);
my $fnsource = @array ? $array[-1] : "";

# Strip "<YYYY>.xml" from the file name and keep the year. The capture is only
# read when the substitution really matched; otherwise $fnyear would silently
# be undefined and every sigle derived from it would be broken.
my $fnyear;
if ($fnsource =~ s/([0-9][0-9][0-9][0-9])\.xml$//) {
    $fnyear = $1;
}
else {
    die("$0: input file name must end in <YYYY>.xml: $ARGV[0]\n");
}

my $fnYY = substr($fnyear, 2, 2);    # two-digit year, e.g. "21" for 2021
Harald Lüngen381c2a22024-09-17 09:06:39 +0300153
154
155# months
# Month number ("01" .. "12") => three-letter upper-case abbreviation
my %months;
@months{ map { sprintf("%02d", $_) } 1 .. 12 }
    = qw(JAN FEB MAR APR MAY JUN JUL AUG SEP OCT NOV DEC);

# Month number ("01" .. "12") => full English month name
my %monthnames;
@monthnames{ map { sprintf("%02d", $_) } 1 .. 12 }
    = qw(January February March April May June July August September October November December);

# Finnish term => German term
# NOTE(review): the keys look like values of the VRT attribute publ_type - confirm against the callers
my %mapping = (
    "aikakausi"   => "Zeitschrift",
    "sanomalehti" => "Zeitung",
);
189
190
191
Harald Lüngen381c2a22024-09-17 09:06:39 +0300192#-------------------------------------------------------------------------------------------
193# read source metadata file (prepared manually => ultimately read the info from CMDI File?)
194# and set variables
195#-------------------------------------------------------------------------------------------
196
# Read the tab-separated source metadata table $sourcescsvfile and fill the
# lookup hashes. Columns used (0-based): 0 corpus id, 1 full title,
# 2 simple title, 4 text type, 5 text language, 6 place of publication,
# 7 publisher.
open(my $SOURCES, "<", $sourcescsvfile)
    or die("$0: cannot open file for reading: $sourcescsvfile: $!");

while (my $fline = <$SOURCES>) {
    chomp($fline);

    # skip comment lines, empty lines and the header line
    next if $fline =~ /^\#/ || $fline =~ /^\s*$/ || $fline =~ /TEICORPUSID/;

    my @flarray = split(/\s*\t+\s*/, $fline);    # split each line into an array

    # Every source is registered under its full title ($flarray[1]) as well as
    # under its simple title ($flarray[2]). A title column that is missing in
    # a short line is skipped instead of creating an entry under an undefined key.
    # (ToDo: these hashes could probably be conflated into a hash of hashes or so)
    for my $title (grep { defined } @flarray[1, 2]) {
        $corpusids{$title}     = $flarray[0];
        $srcfullnames{$title}  = $flarray[1];
        $srcpubplaces{$title}  = $flarray[6];
        $srcpublishers{$title} = $flarray[7];
        $srctexttypes{$title}  = $flarray[4];
        $srctextlangs{$title}  = $flarray[5];
    }
}
close($SOURCES);

# language code => language name
$expandLang{"fi"} = "Finnish";
$expandLang{"sv"} = "Swedish";
225
Harald Lüngendb5e6e72024-09-04 17:41:18 +0300226#------------------------------------------------------------------
Harald Lüngen8162ad52024-09-19 10:54:24 +0300227# read corpusHeaderSkeleton document and start a twig for it
Harald Lüngendb5e6e72024-09-04 17:41:18 +0300228#------------------------------------------------------------------
229
# Parse the corpus header skeleton file into a twig of its own. The
# constructor is called as a class method (XML::Twig->new) rather than with
# the indirect object syntax, which perl can mis-parse.
my $teiCorpusHeaderDocTwig = XML::Twig->new(
    keep_spaces     => 1,
    keep_atts_order => 1,
    comments        => 'drop',
);

$teiCorpusHeaderDocTwig->parsefile($corpheaderfile);

# root element of the corpus header skeleton document, i.e. the header for the corpus
my $corpusHeader = $teiCorpusHeaderDocTwig->root;
239
240
241#------------------------------------------------------------------
Harald Lüngen8162ad52024-09-19 10:54:24 +0300242# read textHeaderSkeleton document and start a twig for it
Harald Lüngendb5e6e72024-09-04 17:41:18 +0300243#------------------------------------------------------------------
244
# Parse the text header skeleton file into a twig of its own. The
# constructor is called as a class method (XML::Twig->new) rather than with
# the indirect object syntax, which perl can mis-parse.
my $teiTextHeaderDocTwig = XML::Twig->new(
    keep_spaces     => 1,
    keep_atts_order => 1,
    comments        => 'drop',
);

$teiTextHeaderDocTwig->parsefile($textheaderfile);

# root element of the text header skeleton document, i.e. the header for a single text
my $textHeader = $teiTextHeaderDocTwig->root;
253
254
Harald Lüngen8162ad52024-09-19 10:54:24 +0300255#---------------------------------------------------------
# define a subtree for idsDoc
# for the time being it will only be used for the first
# idsDoc header, to be inserted in the root handler
259#---------------------------------------------------------
260
my $idsDoc       = XML::Twig::Elt->new('idsDoc');
my $idsDocHeader = XML::Twig::Elt->new('idsHeader');

if ($TEIFORMAT eq "I5") {

    # all elements of the document header, addressable by their tag name
    my %elt = (
        'idsDoc'    => $idsDoc,
        'idsHeader' => $idsDocHeader,
        map { ( $_ => XML::Twig::Elt->new($_) ) } qw(
            fileDesc titleStmt d.title dokumentSigle
            publicationStmt distributor pubAddress availability pubDate
            sourceDesc biblStruct monogr h.title imprint
        ),
    );

    # attributes: [ element, attribute name, value ], set in this order
    my @attributes = (
        [ 'idsDoc',    'version', "1.0" ],
        [ 'idsDoc',    'TEIform', "TEI.2" ],
        [ 'idsHeader', 'version', "1.1" ],
        [ 'idsHeader', 'type',    "document" ],
        [ 'idsHeader', 'pattern', "text" ],
        [ 'idsHeader', 'TEIform', "teiHeader" ],
    );
    for my $attribute (@attributes) {
        my ($name, $att, $value) = @$attribute;
        $elt{$name}->set_att($att, $value);
    }

    # tree structure: [ child, position, parent ], pasted in this order
    my @pastes = (
        [ 'dokumentSigle',   'first_child', 'titleStmt' ],
        [ 'd.title',         'last_child',  'titleStmt' ],
        [ 'titleStmt',       'last_child',  'fileDesc' ],
        [ 'fileDesc',        'last_child',  'idsHeader' ],
        [ 'publicationStmt', 'last_child',  'fileDesc' ],
        [ 'distributor',     'last_child',  'publicationStmt' ],
        [ 'pubAddress',      'last_child',  'publicationStmt' ],
        [ 'availability',    'last_child',  'publicationStmt' ],
        [ 'pubDate',         'last_child',  'publicationStmt' ],
        [ 'sourceDesc',      'last_child',  'fileDesc' ],
        [ 'biblStruct',      'last_child',  'sourceDesc' ],
        [ 'monogr',          'last_child',  'biblStruct' ],
        [ 'h.title',         'last_child',  'monogr' ],
        [ 'imprint',         'last_child',  'monogr' ],
        [ 'idsHeader',       'last_child',  'idsDoc' ],
    );
    for my $paste (@pastes) {
        my ($child, $position, $parent) = @$paste;
        $elt{$child}->paste($position, $elt{$parent});
    }

    # sigle and title of the first document, which is always January
    $elt{'dokumentSigle'}->set_text($corpusids{$fnsource} . $fnYY . "/JAN");
    $elt{'d.title'}->set_text($srcfullnames{$fnsource} . ", January " . $fnyear);
}
Harald Lüngen2551a952024-09-15 08:08:35 +0300314
315
Harald Lüngen8162ad52024-09-19 10:54:24 +0300316
Harald Lüngendb5e6e72024-09-04 17:41:18 +0300317#----------------------------------
Harald Lüngen8162ad52024-09-19 10:54:24 +0300318# read the input VRT-XML document
Harald Lüngendb5e6e72024-09-04 17:41:18 +0300319#----------------------------------
320
# Open the input file so that a missing or unreadable file is reported with a
# clear message, including the system error. The three-argument form keeps
# special characters in a user-supplied file name from being interpreted as an
# open mode. The handle itself does not seem to be needed, as parsefile()
# (see below) is applied to the file name.
open(my $IN, "<", $ARGV[0])
    or die("$0: cannot open file for reading: $ARGV[0]: $!");
323
Harald Lüngen86cbd932024-09-10 15:52:18 +0300324
Harald Lüngendb5e6e72024-09-04 17:41:18 +0300325
Harald Lüngen9d4e0462024-08-23 09:34:22 +0300326#####################
327# M A I N
328#####################
329
Harald Lüngendb5e6e72024-09-04 17:41:18 +0300330#-------------------------------------------------------------------------------------------------------------
331# start twig for input and call start tag handler for root and twig handler for each <text> in the VRT
332#-------------------------------------------------------------------------------------------------------------
Harald Lüngen9d4e0462024-08-23 09:34:22 +0300333
Harald Lüngen9d4e0462024-08-23 09:34:22 +0300334
# Twig for the input document. The constructor is called as a class method
# (XML::Twig->new) rather than with the indirect object syntax.
$twig = XML::Twig->new(
    keep_spaces     => 1,         # thereby whitespace at former element boundaries is kept in the output, too
    keep_atts_order => 1,         # requires Tie::IxHash
    comments        => 'drop',

    # called on the start tag of the root element <texts>
    start_tag_handlers => {
        texts => sub { root(@_, $corpusHeader) },
    },

    # called for every completely parsed <text>
    twig_handlers => {
        # a copy is needed because textHeader will be flushed with $twig in the <text> handler
        text => sub { text(@_, $textHeader->copy) },
    },

    output_encoding => $encoding,
);

$twig->parsefile($ARGV[0]);
352
Harald Lüngendb5e6e72024-09-04 17:41:18 +0300353
Harald Lüngen9d4e0462024-08-23 09:34:22 +0300354###########
355# END MAIN
356###########
357
358
359
360
361##############################
362# S U B R O U T I N E S
363##############################
364
#-----------------------------------------------------------------
# start tag handler &root for the root element <texts>
#
# Renames the root element to <idsCorpus> (I5) or <teiCorpus> (TEI),
# sets its attributes and inserts the corpus header.
#
# arguments: $twig          the twig of the input document
#            $root          the root element
#            $corpusHeader  root element of the corpus header skeleton
#-----------------------------------------------------------------

sub root {
    my ($twig, $root, $corpusHeader) = @_;

    if ($TEIFORMAT eq "I5") {
        $twig->set_doctype($DTDDECL);    # the doctype could probably be set anywhere to the twig
        $root->set_gi('idsCorpus');
        $root->set_att('version', "1.0");
        $root->set_att('TEIform', "teiCorpus.2");
    }
    else {
        $root->set_gi('teiCorpus');
        $root->set_att("xmlns", 'http://www.tei-c.org/ns/1.0');
    }

    insertCorpusHeader($root, $corpusHeader);
}
382
383
Harald Lüngendb5e6e72024-09-04 17:41:18 +0300384
#-----------------------------------------------------------------
# Fills in the corpus header skeleton and pastes it as the first
# child of the root element; for I5 the first <idsDoc> is pasted
# right after it.
#
# arguments: $root          the root element of the output document
#            $corpusHeader  root element of the corpus header skeleton
#-----------------------------------------------------------------

sub insertCorpusHeader {
    my ($root, $corpusHeader) = @_;

    # name of the attribute that carries the language code on <language>
    my $ident = "ident";

    #-----------------------
    # set corpus header
    #-----------------------

    set_title($corpusHeader, $srcfullnames{$fnsource}, $fnyear, $kielipankkiCorpus);

    if ($TEIFORMAT eq "TEI") {
        set_sourceDesc($corpusHeader, $srcfullnames{$fnsource}, $fnyear, $kielipankkiCorpus);
    }
    elsif ($TEIFORMAT eq "I5") {
        $corpusHeader->get_xpath("fileDesc/titleStmt/korpusSigle", 0)->set_text($corpusids{$fnsource} . $fnYY);
        $corpusHeader->get_xpath("fileDesc/publicationStmt/pubDate", 0)->set_text((localtime)[5] + 1900);
        $corpusHeader->get_xpath('encodingDesc/editorialDecl/transduction[@n="2"]', 0)
            ->set_text("I5 version by EuReCo using vrt2tei.pl " . localtime->ymd('-'));
        set_sourceDescI5($corpusHeader);
        $ident = "id";
    }
    else {
        # terminated with a newline so that the message does not run into later output on STDERR
        print STDERR "TEIFORMAT is $TEIFORMAT; must be TEI or I5\n";
    }

    $corpusHeader->paste("first_child", $root);

    # language of the source: code as attribute, expanded name as text
    my $language = $corpusHeader->get_xpath('./profileDesc/langUsage/language', 0);
    $language->set_att($ident, $srctextlangs{$fnsource});
    $language->set_text($expandLang{ $srctextlangs{$fnsource} });

    if ($TEIFORMAT eq "I5") {
        $idsDoc->paste("after", $corpusHeader);
    }
}
418
Harald Lüngendb5e6e72024-09-04 17:41:18 +0300419
Harald Lüngen9d4e0462024-08-23 09:34:22 +0300420#----------------------------
421# handler &text for <text>
422#----------------------------
423
# Twig handler for each <text> of the VRT document: turns it into a <TEI>
# (TEI) or <idsText> (I5) element with a header, wraps its paragraphs in
# <text>/<body>/<div>, converts the token lines of every sentence into <w>
# elements and flushes the twig.
#
# arguments: $twig        the twig of the input document
#            $text        the <text> element
#            $textHeader  a copy of the root element of the text header skeleton
sub text {
    my ($twig, $text, $textHeader) = @_;

    $textcounter++;

    # ToDo: catch all other, unexpected children of root

    #--------------------------------------------------------------------------
    # Get text metadata (attributes of <text>) and create teiHeader for <text>
    #--------------------------------------------------------------------------

    my $textattsref = $text->atts();    # $textattsref is now a reference to a hash and should be used with '->'

    # original author's note: this creation of idsDoc will only be called for
    # the 2nd idsDoc (i.e. february) or higher
    createIdsDoc($textattsref);

    # createTextHeader returns the $textID:
    my $textID = createTextHeader($text, $textattsref, $textHeader);

    #----------------------------------------
    # create <TEI> or <idsText> from <text>
    #----------------------------------------

    # rename vrt <text> and delete all attributes after they were saved above
    $text->del_atts;

    if ($TEIFORMAT eq "TEI") {
        $text->set_gi("TEI");
        $text->set_att('xml:id', $textID);
    }
    else {
        $text->set_gi("idsText");
        $text->set_att('version', "1.0");
        # $text->move("last_child", $idsDoc); # does not work because apparently $idsDoc is not under $root at this point
    }

    #------------------------------------------------------------------
    # create the <tei:text>, <body>, <div> elements inside <TEI>
    #------------------------------------------------------------------

    my $ttext_element = XML::Twig::Elt->new('text');
    my $body_element  = XML::Twig::Elt->new('body');
    my $div_element   = XML::Twig::Elt->new('div');

    # set atts
    $div_element->set_att("type", "page");         # ToDo: this is specific to KLK
    $ttext_element->set_att("xml:lang", 'fi');     # as in ICC-NOR

    # paste
    $ttext_element->paste('last_child', $text);
    $body_element->paste('last_child', $ttext_element);
    $div_element->paste('last_child', $body_element);

    #-------------------------------
    # create <p> from <paragraph>
    #-------------------------------

    my @paragraphs = $text->children('paragraph');

    foreach my $paragraph (@paragraphs) {

        setP($paragraph);

        $paragraph->move('last_child', $div_element);

        #------------------------------
        # create <s> from <sentence>
        #------------------------------

        my @sentences = $paragraph->children('sentence');

        foreach my $sentence (@sentences) {

            setS($sentence);

            #--------------------------------------
            # create <w> (word) from each $line
            #--------------------------------------

            # the text content of the sentence is split into lines, then
            # replaced by one <w> element per non-empty line
            my @lines = split(/\n+/, $sentence->xml_text);
            $sentence->set_text("\n");

            for my $line (@lines) {    # Todo: check the order
                next if $line eq "";

                my $w_element = XML::Twig::Elt->new('w');
                createW($w_element, $line);
                $w_element->paste('last_child', $sentence);
            }    # end words
        }    # end sentences
    }    # end paragraphs

    # $twig->set_pretty_print( 'record');
    # $twig->flush($OUT);
    # NOTE(review): flush() is documented to take an optional file handle; a
    # plain string is passed here - confirm that this is intended
    $twig->flush("/dev/stdout");
}
Harald Lüngen9d4e0462024-08-23 09:34:22 +0300527
528sub createTextHeader{
Harald Lüngendb5e6e72024-09-04 17:41:18 +0300529 my ($text, $textattsref, $textHeader) = @_;
Harald Lüngen9d4e0462024-08-23 09:34:22 +0300530
531 # USE 01 binding_id="2246025"
532 # USE 02 date="2021-01-15"
533 # 03 datefrom="20210115"
534 # 04 dateto="20210115"
535 # 05 elec_date="_"
536 # 06 file=""
537 # USE 07 filename_metadata="finclarin_siirto_k2021/0039-5552/2021/alto/2246025_0039-5552_2021-01-15_SK0221_mets.xml"
538 # USE 08 filename_orig ="finclarin_siirto_k2021/0039-5552/2021/alto/2246025_0039-5552_2021-01-15_SK0221_page-2021011502210030301.xml
539 # USE 09 id="t-bcd0f3fa-bbd3dac4"
540 # 10 img_url=""
541 # USE 11 issue_date="15.01.2021"
542 # USE 12 issue_no="SK0221"
543 # USE 13 issue_title="Suomen Kuvalehti"
544 # USE 14 label="Suomen Kuvalehti no. SK0221 15.01.2021"
545 # USE 16 language="fi"
546 # USE 17 page_id="p1"
547 # USE 18 page_no="None"
548 # 19 part_name="_"
549 # 20 publ_id="0039-5552"
550 # 21 publ_part=""
551 # USE 22 publ_title="Suomen Kuvalehti"
552 # USE 23 publ_type="aikakausi"
553 # USE 24 sentcount="70"
554 # USE 25 sum_lang="|xxx:44|fin:23|eng:3|"
555 # 26 timefrom="000000"
556 # 27 timeto="235959"
557 # USE 28 tokencount="304"
558 # 29 version_added="KLK-fi-2021">
559
560
561 my $BID = $textattsref->{'binding_id'};
562 my $DATE = $textattsref->{'date'};
563 my $METAFILENAME = $textattsref->{'filename_metadata'};
564 my $ORIGFILENAME = $textattsref->{'filename_orig'};
565 my $ID = $textattsref->{'id'};
566 my $ISSUEDATE = $textattsref->{'issue_date'};
567 my $ISSUENO = $textattsref->{'issue_no'};
568 my $ISSUETITLE = $textattsref->{'issue_title'};
569 my $LABEL = $textattsref->{'label'};
570 my $LANGUAGE = $textattsref->{'language'};
571 my $PAGEID = $textattsref->{'page_id'};
572 my $PAGENO = $textattsref->{'page_no'};
573 my $PUBLTITLE = $textattsref->{'publ_title'};
574 my $PUBLTYPE = $textattsref->{'publ_type'};
575 my $SENTCOUNT = $textattsref->{'sentcount'};
576 my $SUMLANG = $textattsref->{'sum_lang'};
577 my $TOKENCOUNT = $textattsref->{'tokencount'};
578
Harald Lüngenba0354b2024-09-11 16:24:08 +0300579
Harald Lüngen9d4e0462024-08-23 09:34:22 +0300580 #-----------------------------
581 # Derived Metadata variables
582 #-----------------------------
583
584 my @datearray = split("-", $DATE);
585 my @langarray = split("|", $SUMLANG);
586 my @namearray = split(/[\.\/]/, $ORIGFILENAME); # use $namearray[4] as ID for the page
Harald Lüngendb5e6e72024-09-04 17:41:18 +0300587
Harald Lüngen86cbd932024-09-10 15:52:18 +0300588 #----------------------------------------------------
589 # create textSigle to be returned from this function
590 #----------------------------------------------------
591
592 # SUK21.JAN.00001
593
Harald Lüngen86cbd932024-09-10 15:52:18 +0300594 my $yy = substr($datearray[0], 2, 2); # substr EXPR,OFFSET,LENGTH
595 my $mm = $datearray[1]; # substr EXPR,OFFSET,LENGTH
596 my $MMM = $months{$mm};
597
Harald Lüngen8162ad52024-09-19 10:54:24 +0300598 my $CSIGLE = $corpusids{$fnsource} . $yy;
599
600 my $textID = $CSIGLE . "_" . $MMM . "." . sprintf("%05d", $doccounter{$mm}++);
Harald Lüngen2551a952024-09-15 08:08:35 +0300601 my $textSigle = $textID;
Harald Lüngen86cbd932024-09-10 15:52:18 +0300602
Harald Lüngen695ac1d2024-09-05 08:55:21 +0300603
Harald Lüngen9d4e0462024-08-23 09:34:22 +0300604 #-----------------------------------------------------------------------
Harald Lüngendb5e6e72024-09-04 17:41:18 +0300605 # CREATE text-teiHeader ACCORDING TO THE SKELETON in $textHeader
Harald Lüngen9d4e0462024-08-23 09:34:22 +0300606 #-----------------------------------------------------------------------
607
Harald Lüngen9d4e0462024-08-23 09:34:22 +0300608
Harald Lüngendb5e6e72024-09-04 17:41:18 +0300609 $textHeader->paste('first_child', $text);
610
611 #-----------------------------------------------
612 # <teiHeader>
Harald Lüngen695ac1d2024-09-05 08:55:21 +0300613 # <fileDesc n="EuReCo-KLK-FIN_[$ID]">
Harald Lüngendb5e6e72024-09-04 17:41:18 +0300614 # <titleStmt>
615 # <title>[$LABEL, page $PAGENO]</title>
616
Harald Lüngen8162ad52024-09-19 10:54:24 +0300617 $textHeader->first_child("fileDesc") -> set_att('n', "EuReCo-". $kielipankkiCorpus . "-" . $ID);
Harald Lüngen2551a952024-09-15 08:08:35 +0300618
619
620 #-----------------
621 # titleStmt
622 #----------------
Harald Lüngen695ac1d2024-09-05 08:55:21 +0300623
Harald Lüngen2551a952024-09-15 08:08:35 +0300624 my $title="title";
625 my $titleStmt = $textHeader->first_child("fileDesc")->first_child("titleStmt");
626
627 if($TEIFORMAT eq "I5"){
628 $title = "t.title";
629 $textSigle =~ s/_/\//g;
630 $titleStmt->first_child("textSigle")->set_text($textSigle);
631 };
632
633 $titleStmt->first_child($title)->set_text($LABEL . ", Text #" . $textcounter);
634
Harald Lüngen8162ad52024-09-19 10:54:24 +0300635 # Case KLK: PAGENO scheint meist "None" zu sein
Harald Lüngendb5e6e72024-09-04 17:41:18 +0300636
637 #-----------------------------------------------
638 # <fileDesc>
639 # <sourceDesc>
640 # <biblStruct>
641 # <analytic>
642 # <title type="main">[$LABEL, page $PAGENO]</title>
643 # <date>[$DATE]</date>
644 # <date type="year">TODO</date>
645 # <date type="month">TODO</date>
646 # <date type="day">TODO</date>
647 # <idno type="PAGEID">$PAGEID</idno>
648 # <idno type="BINDINGID">$BID</idno>
649 # <idno type="ID">$ID</idno>
650 # <idno type="KIELIPANKKI_METAFILENAME">$METAFILENAME</idno>
651 # <idno type="KIELIPANKKI_ORIGFILENAME">$ORIGFILENAME</idno>
652 # <textLang>$LANGUAGE</textLang>
Harald Lüngendb5e6e72024-09-04 17:41:18 +0300653
Harald Lüngen8162ad52024-09-19 10:54:24 +0300654
Harald Lüngendb5e6e72024-09-04 17:41:18 +0300655 my $analytic = $textHeader->get_xpath("./fileDesc/sourceDesc/biblStruct/analytic", 0);
Harald Lüngen2551a952024-09-15 08:08:35 +0300656 if($TEIFORMAT eq "I5"){$title="h.title"};
657
658
659 $analytic->first_child($title) ->set_text($LABEL . ", Text #" . $textcounter); # Case KLK; PAGENO scheint meist "None" zu sein
Harald Lüngen381c2a22024-09-17 09:06:39 +0300660 #$analytic->get_xpath('./idno[@type="PAGEID"]', 0) ->set_text($PAGEID);
661 #$analytic->get_xpath('./idno[@type="BINDINGID"]', 0) ->set_text($BID);
662 #$analytic->get_xpath('./idno[@type="ID"]', 0) ->set_text($ID);
663 #$analytic->get_xpath('./idno[@type="KIELIPANKKI_METAFILENAME"]', 0) ->set_text($METAFILENAME);
664 #$analytic->get_xpath('./idno[@type="KIELIPANKKI_ORIGFILENAME"]', 0) ->set_text($ORIGFILENAME);
Harald Lüngen2551a952024-09-15 08:08:35 +0300665 if($TEIFORMAT eq "TEI"){
666 $analytic->first_child('textLang') ->set_text($LANGUAGE);
667 }
Harald Lüngen9d4e0462024-08-23 09:34:22 +0300668
Harald Lüngendb5e6e72024-09-04 17:41:18 +0300669 # <monogr>
670 # <title>$PUBLTITLE</title>
671 # <imprint>
672 # <pubPlace>TODO</pubPlace>
Harald Lüngen695ac1d2024-09-05 08:55:21 +0300673 # <publisher>TODO</publisher>
Harald Lüngendb5e6e72024-09-04 17:41:18 +0300674 # </imprint>
675 # <biblScope unit="ISSUETITLE"/>
676 # <biblScope unit="ISSUENO"/>
677 # <biblScope unit="ISSUEDATE"/>
678 # <biblScope unit="pp">$PAGENO</biblScope>
Harald Lüngen9d4e0462024-08-23 09:34:22 +0300679
Harald Lüngendb5e6e72024-09-04 17:41:18 +0300680 my $monogr = $textHeader->get_xpath("./fileDesc/sourceDesc/biblStruct/monogr", 0);
Harald Lüngen9d4e0462024-08-23 09:34:22 +0300681
Harald Lüngen2551a952024-09-15 08:08:35 +0300682 $monogr->first_child($title) ->set_text($PUBLTITLE);
683 if($TEIFORMAT eq "TEI"){
684 $monogr->get_xpath('./imprint/date[@type="date"]', 0) ->set_text($DATE);
685 }
686 my $date = "date";
687 if($TEIFORMAT eq "I5"){$date="pubDate"};
688 $monogr->get_xpath('./imprint/' . $date . '[@type="year"]', 0) ->set_text($datearray[0]);
689 $monogr->get_xpath('./imprint/' . $date . '[@type="month"]', 0) ->set_text($datearray[1]);
690 $monogr->get_xpath('./imprint/' . $date . '[@type="day"]', 0) ->set_text($datearray[2]);
691 $monogr->first_child("imprint")->first_child("pubPlace") ->set_text($srcpubplaces{$PUBLTITLE}); # imprint is needed for tei validity
Harald Lüngen8162ad52024-09-19 10:54:24 +0300692 $monogr->first_child("imprint")->first_child("pubPlace") ->set_att('key', $CountryKey);
Harald Lüngen2551a952024-09-15 08:08:35 +0300693 $monogr->first_child("imprint")->first_child("publisher") ->set_text($srcpublishers{$PUBLTITLE}); # imprint is needed for tei validity
Harald Lüngen381c2a22024-09-17 09:06:39 +0300694 #$monogr->get_xpath('./biblScope[@unit="ISSUETITLE"]', 0) ->set_text($ISSUETITLE);
695 #$monogr->get_xpath('./biblScope[@unit="ISSUENO"]', 0) ->set_text($ISSUENO);
696 #$monogr->get_xpath('./biblScope[@unit="ISSUEDATE"]', 0) ->set_text($ISSUEDATE);
697 #$monogr->get_xpath('./biblScope[@unit="pp"]', 0) ->set_text($PAGENO); # Achtung - PAGENO scheint meist "None" zu sein
Harald Lüngen9d4e0462024-08-23 09:34:22 +0300698
Harald Lüngen2551a952024-09-15 08:08:35 +0300699 my $dateNice = $datearray[2] . "." . $datearray[1] . "." . $datearray[0];
700 my $dateBackwards = $datearray[0] . "." . $datearray[1] . "." . $datearray[2];
701
702 if($TEIFORMAT eq "I5"){
Harald Lüngen381c2a22024-09-17 09:06:39 +0300703 my $refCompleteText = $textSigle . " " . $PUBLTITLE . " no. " . $ISSUENO . ", [" . $PUBLTYPE . "], " . $dateNice;
Harald Lüngen2551a952024-09-15 08:08:35 +0300704 my $refShortText = $textSigle . " " . $PUBLTITLE . ", " . $dateNice;
705 $textHeader->get_xpath('./fileDesc/sourceDesc/reference[@type="complete"]', 0) -> set_text($refCompleteText);
706 $textHeader->get_xpath('./fileDesc/sourceDesc/reference[@type="short"]' , 0) -> set_text($refShortText);
707 }
708
709
Harald Lüngen695ac1d2024-09-05 08:55:21 +0300710 # <encodingDesc>
711 # <tagsDecl>
712 # <namespace name="http://www.tei-c.org/ns/1.0">
713 # <tagUsage gi="s" occurs="SENTCOUNT"/>
714 # <tagUsage gi="w" occurs="TOKENCOUNT"/>
Harald Lüngen9d4e0462024-08-23 09:34:22 +0300715
Harald Lüngen2551a952024-09-15 08:08:35 +0300716 my $namespacePath="./encodingDesc/tagsDecl/namespace/";
717 if($TEIFORMAT eq "I5"){$namespacePath="./encodingDesc/tagsDecl/"};
718
719 $textHeader->get_xpath($namespacePath . 'tagUsage[@gi="s"]', 0) -> set_att('occurs', $SENTCOUNT);
720 $textHeader->get_xpath($namespacePath . 'tagUsage[@gi="w"]', 0) -> set_att('occurs', $TOKENCOUNT);
721
Harald Lüngen695ac1d2024-09-05 08:55:21 +0300722 # <profileDesc>
723 # <langUsage>
724 # <language ident="fi" usage="|xxx:44|fin:23|eng:3|"/>
725 # </langUsage>
726 # <textClass>
727 # <classCode scheme="kielipankki_klk">$PUBLTYPE</classCode>
Harald Lüngen2551a952024-09-15 08:08:35 +0300728 # <classCode scheme="kielipankki_klk_mapped">$mapping{$PUBLTYPE}</classCode>
Harald Lüngen9d4e0462024-08-23 09:34:22 +0300729
Harald Lüngen2551a952024-09-15 08:08:35 +0300730 if($TEIFORMAT eq "I5"){
731 $textHeader->get_xpath('./profileDesc/creation/creatDate', 0) ->set_text($dateBackwards);
732 }
Harald Lüngen8162ad52024-09-19 10:54:24 +0300733 if($TEIFORMAT eq "TEI"){
734 $textHeader->get_xpath('./profileDesc/langUsage/language', 0) ->set_att('ident', $LANGUAGE);
735 $textHeader->get_xpath('./profileDesc/langUsage/language', 0) ->set_att('usage', $SUMLANG);
736 }
Harald Lüngen695ac1d2024-09-05 08:55:21 +0300737 # in @usage muss eigt. ein integer; am besten inhalt von SUMLANG aufdroeseln und mehrere <language> machen
Harald Lüngen9d4e0462024-08-23 09:34:22 +0300738
Harald Lüngen695ac1d2024-09-05 08:55:21 +0300739 $textHeader->get_xpath('./profileDesc/textClass/classCode[@scheme="kielipankki_klk"]', 0) ->set_text($PUBLTYPE);
Harald Lüngen2551a952024-09-15 08:08:35 +0300740 $textHeader->get_xpath('./profileDesc/textClass/classCode[@scheme="kielipankki_klk_mapped"]', 0)->set_text($mapping{$PUBLTYPE});
Harald Lüngen9d4e0462024-08-23 09:34:22 +0300741
Harald Lüngen695ac1d2024-09-05 08:55:21 +0300742 # <revisionDesc>
743 # <change when="TODO" who="HL">TEI version for EuReCo</change>
Harald Lüngen9d4e0462024-08-23 09:34:22 +0300744
Harald Lüngen695ac1d2024-09-05 08:55:21 +0300745 $textHeader->get_xpath('./revisionDesc/change', 0) ->set_att('when', localtime->ymd('-'));
Harald Lüngendb5e6e72024-09-04 17:41:18 +0300746
Harald Lüngen86cbd932024-09-10 15:52:18 +0300747 return $textID;
Harald Lüngendb5e6e72024-09-04 17:41:18 +0300748
Harald Lüngen86cbd932024-09-10 15:52:18 +0300749
Harald Lüngen695ac1d2024-09-05 08:55:21 +0300750 #-----------------------------------
Harald Lüngen9d4e0462024-08-23 09:34:22 +0300751 # END OF CREATING TEIHEADER
Harald Lüngen695ac1d2024-09-05 08:55:21 +0300752 #-----------------------------------
Harald Lüngen9d4e0462024-08-23 09:34:22 +0300753
754}
Harald Lüngen695ac1d2024-09-05 08:55:21 +0300755
sub setP {
    my ($paragraph) = @_;

    # Convert a VRT <paragraph> element in place into a TEI <p>.
    #
    # Example input element:
    #   <paragraph id="p-bcd0f3fa-bbd3dac4-815ead7a" sum_lang="|fin:1|">
    #
    # Attribute handling:
    #   @sum_lang  its value, prefixed with "x-" (private-use marker),
    #              becomes @xml:lang; the original attribute is removed
    #   @id        removed, because these ids are not unique
    #              (so it is not renamed to xml:id)

    $paragraph->set_gi('p');

    my $language_summary = $paragraph->att("sum_lang");
    $paragraph->set_att("xml:lang", "x-" . $language_summary);

    $paragraph->del_att("sum_lang");
    $paragraph->del_att("id");
}
sub setS {
    my ($sentence) = @_;

    # Convert a VRT <sentence> element in place into a TEI <s>.
    #
    # Attributes of <sentence> and what happens to them:
    #   @id="s-bcd0f3fa-bbd3dac4-f7429090"  removed (not unique, so not renamed to xml:id)
    #   @lang="fin"                         copied to @xml:lang, then removed
    #   @lang_conf="0.6734853"              removed for the time being (ToDo: @cert ?)

    $sentence->set_gi('s');

    # ToDo: convert the value / introduce a hash for lookup
    # (input values: "fin", "xxx", ....)
    my $language = $sentence->att("lang");
    $sentence->set_att("xml:lang", $language);

    for my $obsolete ('id', "lang") {
        $sentence->del_att($obsolete);
    }
    $sentence->del_att("lang_conf");
}
789
sub createW {
    my ($w_element, $line) = @_;

    # Fill a <w> element from one tab-separated VRT token line.
    #
    #   $w_element  the element to populate (text content and attributes)
    #   $line       the VRT line; its tab-separated columns are listed below
    #
    # vrt word and positional-attributes in corpus KLK:
    #   USE [0]  word
    #   USE [1]  ref       (id for reference of dephead)
    #   USE [2]  lemma
    #   ?   [3]  lemmacomp (lemma with compound info - could go in @norm, as tag abuse?)
    #   USE [4]  pos
    #   USE [5]  msd
    #   USE [6]  dephead
    #   USE [7]  deprel
    #       [8]  content   (ocr-process)
    #       [9]  vpos      (ocr-process)
    #       [10] ocr       (ocr-process)
    #       [11] cc        (ocr-process)
    #       [12] hyph      (ocr-process)
    #       [13] style     (ocr-process)
    #       [14] lex       (korp semantic disambiguation from G"oteborg)

    #---------------------------
    # Get the tags (=columns)
    #---------------------------
    my @tags = split(/\t/, $line);

    # The token itself becomes the content of <w>.
    # (A randomised replacement string via Data::Random::String, also for
    # @lemma, was tried here and is currently disabled.)
    my $token = $tags[0];
    $w_element->set_text($token);

    # @n carries the token's reference number within the sentence.
    $w_element->set_att("n", $tags[1]);

    # No id on <w>: an id assembled from file name, sentence id and token
    # number turned out not to be unique either.
    $w_element->del_att("id");

    # Attribute name => column index, applied in this order.
    # Column [3] (lemmacomp) is deliberately not mapped to @norm (tag abuse).
    my @column_of = (
        [ "lemma", 2 ],
        [ "pos",   4 ],
        [ "msd",   5 ],
    );
    for my $mapping (@column_of) {
        my ($att_name, $column) = @{$mapping};
        $w_element->set_att($att_name, $tags[$column]);
    }

    # Dependency attributes are only written for I5;
    # remove the condition when they are part of the official TEI.
    if ($TEIFORMAT eq "I5") {
        $w_element->set_att("head",   $tags[6]);
        $w_element->set_att("deprel", $tags[7]);
    }
}
839
Harald Lüngendb5e6e72024-09-04 17:41:18 +0300840
sub set_title{
    my ($corpusHeader, $source, $year, $kielipankkiCorpus) = @_;

    # Write the corpus title into the corpus header.
    #
    # Target location:
    #   <teiHeader>
    #     <fileDesc>
    #       <titleStmt>
    #         <title>[Aamulehti2021] from [klk-fi-v2-vrt for] EuReCo</title>
    #       </titleStmt>
    #       <!-- ... -->
    #     </fileDesc>
    #   </teiHeader>
    #
    # The title element is named "c.title" when $TEIFORMAT is "I5"
    # and "title" otherwise.

    my $titleElement = ($TEIFORMAT eq "I5") ? "c.title" : "title";

    my $cTitleString = "${source} ${year}, from ${kielipankkiCorpus} for EuReCo";

    my $titleStmt  = $corpusHeader->first_child("fileDesc")->first_child("titleStmt");
    my $cTitleNode = $titleStmt->first_child($titleElement);

    $cTitleNode->set_text($cTitleString);
}
866
sub set_sourceDesc{
    my ($corpusHeader, $source, $year, $kielipankkiCorpus) = @_;

    # Write the bibliographic one-liner into the corpus header.
    #
    # Target location:
    #   <teiHeader>
    #     <fileDesc>
    #       <!-- ... -->
    #       <sourceDesc>
    #         <bibl>[Aamulehti2021] from [klk-fi-v2-vrt]</bibl>
    #       </sourceDesc>
    #       <!-- ... -->
    #     </fileDesc>
    #   </teiHeader>

    my $cBiblString = "${source} ${year}, from ${kielipankkiCorpus} for EuReCo";

    my $sourceDesc = $corpusHeader->first_child("fileDesc")->first_child("sourceDesc");
    my $cBiblNode  = $sourceDesc->first_child("bibl");

    $cBiblNode->set_text($cBiblString);
}
886
sub set_sourceDescI5{
    my ($corpusHeader) = @_;

    # Fill the <sourceDesc> of the I5 corpus header.
    #
    # Target structure:
    #   <idsHeader>
    #     <fileDesc>
    #       <!-- ... -->
    #       <sourceDesc>
    #         <biblStruct>
    #           <monogr>
    #             <h.title type="main">[$PUBLTITLE], [$YEAR]</h.title>
    #             <imprint>
    #               <publisher>[$PUBLISHER]</publisher>
    #               <pubPlace key="[$TL]">[$PUBPLACE]</pubPlace>
    #             </imprint>
    #           </monogr>
    #         </biblStruct>
    #         <reference type="super" assemblage="regular">[$KKK] [$PUBLTITLE]; [$PUBPLACE]: [$PUBLISHER], [$YEAR]</reference>
    #       </sourceDesc>
    #       <!-- ... -->
    #     </fileDesc>
    #   </idsHeader>
    #
    # Note: the code below writes only $PUBLTITLE into <h.title>; the year
    # shown in the sketch above is not appended.

    # Values are looked up via the file-level $fnsource / $fnyear.
    my $PUBLTITLE = $srcfullnames{$fnsource};
    my ($PUBLPLACE, $PUBLISHER) = ($srcpubplaces{$PUBLTITLE}, $srcpublishers{$PUBLTITLE});

    # Corpus sigle = corpus id of the title + characters 3-4 of the year string.
    my $CSIGLE = $corpusids{$PUBLTITLE} . substr($fnyear, 2, 2);

    my $sourceDesc = $corpusHeader->first_child("fileDesc")->first_child("sourceDesc");

    my $cMonogr = $sourceDesc->first_child("biblStruct")->first_child("monogr");
    $cMonogr->first_child("h.title")->set_text($PUBLTITLE);

    my $imprint = $cMonogr->first_child("imprint");
    $imprint->first_child("publisher")->set_text($PUBLISHER);

    my $pubPlace = $imprint->first_child("pubPlace");
    $pubPlace->set_text($PUBLPLACE);
    $pubPlace->set_att('key', $CountryKey);

    my $referenceText = "${CSIGLE} ${PUBLTITLE}; ${PUBLPLACE}: ${PUBLISHER}, ${fnyear}";
    $sourceDesc->first_child("reference")->set_text($referenceText);
}
928
929
930
sub createIdsDoc{ # will only be called for the second idsDoc (i.e. for february) and higher
    my ($textattsref) = @_;

    # Print the skeleton <idsDoc> (with its document-level idsHeader) for the
    # month of the given text, once per month.
    #
    #   $textattsref  hash ref with at least the keys 'date' (split on "-"
    #                 into year, month, ...) and 'publ_title'
    #
    # Side effects: prints to STDOUT and increments the file-level $LASTMONTH
    # when the text's month is exactly $LASTMONTH + 1. Nothing is printed for
    # month 1.
    #
    # NOTE(review): a month without any text stalls the sequence - e.g. going
    # from month 1 straight to month 3 never satisfies the "$LASTMONTH + 1"
    # test again. Confirm whether the input is guaranteed to cover all months.

    my $DATE      = $textattsref->{'date'};
    my $PUBLTITLE = $textattsref->{'publ_title'};

    my @datearray = split("-", $DATE);
    my $MONTH     = $datearray[1];
    my $YEAR      = $datearray[0];
    my $YY        = substr($YEAR, 2, 2);

    my $CSIGLE    = $corpusids{$PUBLTITLE} . $YY;

    my $DOCID     = $months{$MONTH};
    my $MONTHNAME = $monthnames{$MONTH};

    # The header below is assembled as a plain string and printed as is, so
    # the title must be XML-escaped by hand (elsewhere set_text is used for
    # it). An "&" that already starts an entity or character reference is
    # left alone, which keeps the escaping idempotent.
    my $xmlTitle = $PUBLTITLE;
    $xmlTitle =~ s/&(?!(?:amp|lt|gt|quot|apos|#[0-9]+|#x[0-9A-Fa-f]+);)/&amp;/g;
    $xmlTitle =~ s/</&lt;/g;
    $xmlTitle =~ s/>/&gt;/g;

    my $idsDocString="";
    if($TEIFORMAT eq "I5"){
        $idsDocString = <<"END_IDSDOC";

<idsDoc version="1.0" TEIform="TEI.2">
<idsHeader type="document" pattern="text" version="1.1" TEIform="teiHeader">
 <fileDesc>
 <titleStmt>
 <dokumentSigle>$CSIGLE/$DOCID</dokumentSigle>
 <d.title>$xmlTitle, $MONTHNAME $YEAR</d.title>
 </titleStmt>
 <publicationStmt>
 <distributor/>
 <pubAddress/>
 <availability region="world">$kielipankkiLicense</availability>
 <pubDate/>
 </publicationStmt>
 <sourceDesc>
 <biblStruct>
 <monogr>
 <h.title/>
 <imprint/>
 </monogr>
 </biblStruct>
 </sourceDesc>
 </fileDesc>
</idsHeader>
</idsDoc>
END_IDSDOC
    }

    # "+ 0" forces numeric comparison of the zero-padded month string.
    if($MONTH + 0 == $LASTMONTH + 1){
        if($MONTH+0 > 1){
            printf("%s\n", $idsDocString);
        }
        $LASTMONTH++;
    }
}
981
Harald Lüngendb5e6e72024-09-04 17:41:18 +0300982
983
Harald Lüngen9d4e0462024-08-23 09:34:22 +0300984#################
985## usage_message
986#################
987
988
sub usage_message {
    # Print the command-line synopsis to STDOUT and terminate the script.
    print <<'END_USAGE';
 Usage: ./vrt2tei.pl <file.vrt.xml> <outfile>
 <file.vrt.xml> is a VRT file converted to proper XML
END_USAGE
    exit;
}
994
995