blob: 86abb6f71a3eaf1b1087ea27f107afd599e0b6aa [file] [log] [blame]
Harald Lüngenfe838e02024-09-25 09:01:00 +03001## #! /appl/soft/bio/bioperl/5.36.0/bin/perl
2## #! /usr/bin/perl -w
Harald Lüngen9d4e0462024-08-23 09:34:22 +03003
4
5###########################################################################################################################################################
6# vrt2tei.pl
7# eureco
8# leibniz-institut fuer deutsche sprache / csc finland esbo
9# august 2024
10#
11#
# using XML::Twig, see http://www.xmltwig.org/, https://metacpan.org/pod/XML::Twig
13#
# usage: see the usage function below
Harald Lüngencaab0802024-08-23 17:28:22 +030015# Usage: ./vrt2tei.pl <vrtxmlfile.xml> <outfile>
Harald Lüngen9d4e0462024-08-23 09:34:22 +030016# <vrtxmlfile>: xml-ised vrt file
17#
18#
19# TODO:
Harald Lüngen9d4e0462024-08-23 09:34:22 +030020
Harald Lüngenfe838e02024-09-25 09:01:00 +030021# 0 ZIPPEN mit tei2korapxml; zippen mit korAP indexing
Harald Lüngen9d4e0462024-08-23 09:34:22 +030022
Harald Lüngenfe838e02024-09-25 09:01:00 +030023# 2 threading on compute node and application on sub corpora of KLK
24# 2 build 30 billion corpus and index it
25# 3 Optionen
26# 3a parametrize deprel for I5 and if Nils is not ready yet
Harald Lüngen9d4e0462024-08-23 09:34:22 +030027
Harald Lüngenfe838e02024-09-25 09:01:00 +030028#
29
30# 5 abfangen von unerwarteten elementen dh andere als <sentence> und <paragraph>
31# 6 checks and balances, wort reihenfolge nochmal checken?
32# 7 Encode Kielipankki and National Library of Finland? in teiCorpus Header
33# 8 How to encode the CLARIN-RES better - more Info from the CMDI
34# 9 construct <idsDoc>s independent of the order of texts, probably with writing intermediate files to zip
35# 10 re-implementation of the gawk code in the perl script
# 11 Swedish corpus
Harald Lüngen9d4e0462024-08-23 09:34:22 +030037
38#
39#
40############################################################################################################################################################
41
Harald Lüngen9d4e0462024-08-23 09:34:22 +030042use strict;
43use warnings;
Harald Lüngen381c2a22024-09-17 09:06:39 +030044#use diagnostics;
Harald Lüngen9d4e0462024-08-23 09:34:22 +030045
Harald Lüngen2551a952024-09-15 08:08:35 +030046use Getopt::Std;
Harald Lüngen9d4e0462024-08-23 09:34:22 +030047use XML::Twig;
48use XML::Generator ':pretty'; # apparently no effect when using flush();
49
50
51use locale; # diese drei Zeilen, damit \b im regex nicht Umlaute und ß matcht.
52use POSIX qw(locale_h); # to be able to use setlocale()
53#setlocale(LC_ALL,'de_DE');
54setlocale(LC_ALL, "fi_FI");
55use utf8;
56use open qw( :std :encoding(UTF-8) );
57
58use Time::Piece;
59use Tie::IxHash;
Harald Lüngen381c2a22024-09-17 09:06:39 +030060use Data::Random::String;
Harald Lüngendb5e6e72024-09-04 17:41:18 +030061
62
Harald Lüngen9d4e0462024-08-23 09:34:22 +030063
Harald Lüngenfe838e02024-09-25 09:01:00 +030064
65#-------------
66# get options
67#-------------
68
# Package variables filled in by Getopt::Std::getopts():
# -h and -m are boolean flags, -s and -t each take an argument.
our ($opt_h, $opt_m, $opt_s, $opt_t);

# Parse the switches.  getopts() returns false when an unknown switch or a
# switch without its required argument was given; in that case show the
# usage information and stop.
unless (getopts('hms:t:')) {
    usage_message();
    exit -1;
}
76
77
78#--------------------
79# check argument(s)
80#--------------------
81
82# currently one argument: the vrt-xml input file
Harald Lüngen9d4e0462024-08-23 09:34:22 +030083
# Exactly one positional argument is expected: the VRT-XML input file,
# e.g. Suomen_Kuvalehti2021.xml.
#
# - The check is skipped when -h was given, so that a plain help request is
#   still answered by the -h handling further down.
# - @ARGV is counted instead of testing $ARGV[0] / $ARGV[1] for truth, so a
#   file that is literally named "0" is not mistaken for a missing argument.
# - The explicit exit guarantees that the script never carries on with a
#   missing or a surplus argument, whether or not usage_message() exits.
if (!$opt_h && @ARGV != 1) {
    usage_message();
    exit 1;
}
Harald Lüngen9d4e0462024-08-23 09:34:22 +030086
87
Harald Lüngenfe838e02024-09-25 09:01:00 +030088
89#------------------------------------------------------------
90# initialize defaults for options
91#------------------------------------------------------------
my $TEIFORMAT = "tei";    # output flavour; normalised to "TEI" or "I5" below
my $MASK      = 0;        # set to 1 by option -m


#----------------------------------------------------------------------------------------------------------
# interpret the options and check whether their respective argument is meaningful (if applicable)
#----------------------------------------------------------------------------------------------------------


# option -h: display usage info and exit
if ($opt_h) {
    print STDERR usage_message();
    exit 0;
}


# option -t: requested output format
if (defined $opt_t) {
    $TEIFORMAT = $opt_t;
}

# Accept exactly "tei" or "i5" in any letter case.  The pattern is anchored
# so that values which merely contain one of the two words (e.g. "xtei") are
# rejected instead of slipping through unnormalised.
if ($TEIFORMAT !~ /\A(?:tei|i5)\z/i) {
    print STDERR "Error: invalid arg for option -t\n";
    usage_message();
    exit 1;    # an invalid option value is a failure, not a success
}

# Normalise to the upper-case spelling that the rest of the script compares
# against ("TEI" / "I5"); this also covers mixed-case input such as "Tei".
$TEIFORMAT = uc $TEIFORMAT;

# option -m
if ($opt_m) {
    $MASK = 1;
}
127
128
129#-----------------------------------------------
130# OTHER GLOBAL VARIABLES
131#-----------------------------------------------
Harald Lüngen9d4e0462024-08-23 09:34:22 +0300132
# Encoding of the generated output only; it is handed to XML::Twig as
# output_encoding when the twig for the input file is created.
my $encoding = "UTF-8";

# DOCTYPE declaration, used for I5 output
my $DTDDECL = 'idsCorpus PUBLIC "-//IDS//DTD IDS-I5 1.0//EN" "http://corpora.ids-mannheim.de/I5/DTD/i5.dtd"';

my $textcounter = 0;    # incremented once per handled <text>
my $LASTMONTH   = 0;

# Lookup tables for the source metadata; they are filled while the sources
# CSV file is read.
our (
    %corpusids,
    %srcpublids,
    %srcfullnames,
    %srcpubplaces,
    %srcpublishers,
    %srctexttypes,
    %srctextlangs,
);

# language code => language name
our %expandLang;

# One counter per month ("01" .. "12"), each starting at 1 (by the month as
# in DeReKo); used for the running number at the end of a text ID.
my %doccounter = map { ( sprintf('%02d', $_), 1 ) } 1 .. 12;

# global variables pertaining to the original corpus of *all* newspapers:
my $kielipankkiCorpus  = "klk-fi-v2-vrt";
my $kielipankkiLicense = "CLARIN-RES";
my $CountryKey         = "FI";

# Table with metadata about the different sources (newspapers)
my $sourcescsvfile = "sources_klk_fi_v2_2021_4eureco.csv";

# corpus header and text header skeletons, depending on the output format
my ($corpheaderfile, $textheaderfile) =
    $TEIFORMAT eq "I5"
        ? ("i5CorpusHeaderSkeleton.i5.xml",   "i5TextHeaderSkeleton.i5.xml")
        : ("teiCorpusHeaderSkeleton.tei.xml", "teiTextHeaderSkeleton.tei.xml");


# placeholder; replaced by the XML::Twig object for the input document
my $twig = "";
Harald Lüngendb5e6e72024-09-04 17:41:18 +0300184
Harald Lüngenfe838e02024-09-25 09:01:00 +0300185# variables $fnsource and $fnyear derived from the filename
# The input file name is expected to look like <source><YYYY>.xml,
# e.g. Suomen_Kuvalehti2021.xml:
#   $fnsource - file name without directories, without year and extension
#   $fnyear   - the four-digit year
#   $fnYY     - the last two digits of the year
my @array    = split(/\//, $ARGV[0]);
my $l        = scalar(@array);
my $fnsource = $l > 0 ? $array[$l-1] : "";

# Take the year from an explicit, checked match.  Relying on $1 after an
# unchecked substitution would silently yield an undefined (or stale) year
# for a file name that does not follow the naming scheme.
my ($fnyear) = $fnsource =~ /([0-9]{4})\.xml$/
    or die("$0: cannot derive source and year from file name '$ARGV[0]' (expected <source><YYYY>.xml)");
$fnsource =~ s/[0-9]{4}\.xml$//;

my $fnYY = substr($fnyear, 2, 2);
Harald Lüngen381c2a22024-09-17 09:06:39 +0300193
194
195# months
# two-digit month number => three-letter month abbreviation
my %months = qw(
    01 JAN   02 FEB   03 MAR   04 APR
    05 MAY   06 JUN   07 JUL   08 AUG
    09 SEP   10 OCT   11 NOV   12 DEC
);

# two-digit month number => English month name
my %monthnames = qw(
    01 January     02 February    03 March
    04 April       05 May         06 June
    07 July        08 August      09 September
    10 October     11 November    12 December
);

# Finnish publication type => German term
my %mapping = (
    "aikakausi"   => "Zeitschrift",
    "sanomalehti" => "Zeitung",
);
229
230
231
Harald Lüngen381c2a22024-09-17 09:06:39 +0300232#-------------------------------------------------------------------------------------------
233# read source metadata file (prepared manually => ultimately read the info from CMDI File?)
234# and set variables
235#-------------------------------------------------------------------------------------------
236
# Three-argument open keeps the file name from being interpreted as part of
# the open mode; $! tells the user why the file could not be opened.
open(my $SOURCES, '<', $sourcescsvfile)
    or die("$0: cannot open file for reading: $sourcescsvfile: $!");

while (my $fline = <$SOURCES>) {
    chomp($fline);

    # skip comment lines, empty lines and the header line (the one that
    # contains TEICORPUSID)
    next if $fline =~ /^\#/ || $fline =~ /^\s*$/ || $fline =~ /TEICORPUSID/;

    # split each line into its tab-separated fields
    my @flarray = split(/\s*\t+\s*/, $fline);

    # (ToDo: the following hashes could probably be conflated into an array of hashes or so)

    # Register the same record under three keys, in this order (a later key
    # that equals an earlier one simply stores the same values again):
    #   $flarray[3]  full title
    #   $flarray[4]  simple title
    #   $flarray[2]  publication id
    # Keys that are missing from a short line are skipped.
    for my $key (grep { defined } @flarray[3, 4, 2]) {
        $corpusids{$key}     = $flarray[1];
        $srcpublids{$key}    = $flarray[2];
        $srcfullnames{$key}  = $flarray[3];
        $srcpubplaces{$key}  = $flarray[8];
        $srcpublishers{$key} = $flarray[9];
        $srctexttypes{$key}  = $flarray[6];
        $srctextlangs{$key}  = $flarray[7];
    }
}
close($SOURCES);

# language code => language name
$expandLang{"fi"} = "Finnish";
$expandLang{"sv"} = "Swedish";
277
Harald Lüngenfe838e02024-09-25 09:01:00 +0300278
Harald Lüngendb5e6e72024-09-04 17:41:18 +0300279#------------------------------------------------------------------
Harald Lüngen8162ad52024-09-19 10:54:24 +0300280# read corpusHeaderSkeleton document and start a twig for it
Harald Lüngenb557de52024-11-20 16:24:42 +0200281# (since this file need not be streamed, no handlers are needed)
Harald Lüngendb5e6e72024-09-04 17:41:18 +0300282#------------------------------------------------------------------
283
# Direct method call (XML::Twig->new) instead of the indirect object syntax
# "new XML::Twig(...)", which is ambiguous to parse and is disabled by
# "use v5.36".
my $teiCorpusHeaderDocTwig = XML::Twig->new(
    keep_spaces     => 1,
    keep_atts_order => 1,
    comments        => 'drop',
);


$teiCorpusHeaderDocTwig->parsefile($corpheaderfile);
my $corpusHeader = $teiCorpusHeaderDocTwig->root;    # root element of the corpus header skeleton document


#------------------------------------------------------------------
# read textHeaderSkeleton document and start a twig for it
#------------------------------------------------------------------

my $teiTextHeaderDocTwig = XML::Twig->new(
    keep_spaces     => 1,
    keep_atts_order => 1,
    comments        => 'drop',
);

$teiTextHeaderDocTwig->parsefile($textheaderfile);
my $textHeader = $teiTextHeaderDocTwig->root;    # root element of the text header skeleton document
307
308
Harald Lüngen8162ad52024-09-19 10:54:24 +0300309#---------------------------------------------------------
310# define a subtree for idsDoc
311# for the time being it will only be used for the first
312# idsDoc header, to be inserted in the root hander
313#---------------------------------------------------------
314
my $idsDoc       = XML::Twig::Elt->new('idsDoc');
my $idsDocHeader = XML::Twig::Elt->new('idsHeader');

if ($TEIFORMAT eq "I5") {

    # attributes of <idsDoc> and of its <idsHeader>
    $idsDoc->set_att('version', "1.0");
    $idsDoc->set_att('TEIform', "TEI.2");

    $idsDocHeader->set_att('version', "1.1");
    $idsDocHeader->set_att('type',    "document");
    $idsDocHeader->set_att('pattern', "text");
    $idsDocHeader->set_att('TEIform', "teiHeader");

    # Build the header top-down; every new element is appended as the last
    # child of its parent, which yields this structure:
    #
    #   idsHeader
    #     fileDesc
    #       titleStmt        (dokumentSigle, d.title)
    #       publicationStmt  (distributor, pubAddress, availability, pubDate)
    #       sourceDesc
    #         biblStruct
    #           monogr       (h.title, imprint)
    my $file_desc = $idsDocHeader->insert_new_elt('last_child', 'fileDesc');

    my $title_stmt = $file_desc->insert_new_elt('last_child', 'titleStmt');
    my $doc_sigle  = $title_stmt->insert_new_elt('last_child', 'dokumentSigle');
    my $doc_title  = $title_stmt->insert_new_elt('last_child', 'd.title');

    my $publication_stmt = $file_desc->insert_new_elt('last_child', 'publicationStmt');
    for my $tag (qw(distributor pubAddress availability pubDate)) {
        $publication_stmt->insert_new_elt('last_child', $tag);
    }

    my $source_desc = $file_desc->insert_new_elt('last_child', 'sourceDesc');
    my $bibl_struct = $source_desc->insert_new_elt('last_child', 'biblStruct');
    my $monogr      = $bibl_struct->insert_new_elt('last_child', 'monogr');
    for my $tag (qw(h.title imprint)) {
        $monogr->insert_new_elt('last_child', $tag);
    }

    # hang the finished header into <idsDoc>
    $idsDocHeader->paste("last_child", $idsDoc);

    # this subtree is set up for January (sigle .../JAN, title "..., January <year>")
    $doc_sigle->set_text($corpusids{$fnsource} . $fnYY . "/JAN");
    $doc_title->set_text($srcfullnames{$fnsource} . ", January " . $fnyear);
}
Harald Lüngen2551a952024-09-15 08:08:35 +0300368
369
Harald Lüngen8162ad52024-09-19 10:54:24 +0300370
Harald Lüngendb5e6e72024-09-04 17:41:18 +0300371#----------------------------------
Harald Lüngen8162ad52024-09-19 10:54:24 +0300372# read the input VRT-XML document
Harald Lüngendb5e6e72024-09-04 17:41:18 +0300373#----------------------------------
374
# Open the input file once up front so that an unreadable file is reported
# with a clear message.  The handle itself does not seem to be needed, as
# parsefile() (see below) is applied to the file name.
# Three-argument open is used because the name comes from the command line
# and must not be interpreted as part of the open mode; $! gives the reason
# for a failure.
open(my $IN, '<', $ARGV[0])
    or die("$0: cannot open file for reading: $ARGV[0]: $!");
377
Harald Lüngen86cbd932024-09-10 15:52:18 +0300378
Harald Lüngendb5e6e72024-09-04 17:41:18 +0300379
Harald Lüngen9d4e0462024-08-23 09:34:22 +0300380#####################
381# M A I N
382#####################
383
Harald Lüngendb5e6e72024-09-04 17:41:18 +0300384#-------------------------------------------------------------------------------------------------------------
385# start twig for input and call start tag handler for root and twig handler for each <text> in the VRT
386#-------------------------------------------------------------------------------------------------------------
Harald Lüngen9d4e0462024-08-23 09:34:22 +0300387
Harald Lüngen9d4e0462024-08-23 09:34:22 +0300388
# Direct method call (XML::Twig->new) instead of the indirect object syntax
# "new XML::Twig(...)", which is ambiguous to parse and is disabled by
# "use v5.36".
$twig = XML::Twig->new(
    keep_spaces     => 1,    # also keeps whitespace at former element boundaries in the output
    keep_atts_order => 1,    # requires Tie::IxHash
    comments        => 'drop',

    # called on the start tag of the root element <texts>
    start_tag_handlers => {
        texts => sub { root(@_, $corpusHeader) },
    },

    # called for every completely parsed <text>; a copy of the text header
    # skeleton is handed over because the header is flushed together with
    # $twig inside the <text> handler
    twig_handlers => {
        text => sub { text(@_, $textHeader->copy) },
    },

    output_encoding => $encoding,
);

$twig->parsefile($ARGV[0]);
406
Harald Lüngendb5e6e72024-09-04 17:41:18 +0300407
Harald Lüngen9d4e0462024-08-23 09:34:22 +0300408###########
409# END MAIN
410###########
411
412
413
414
415##############################
416# S U B R O U T I N E S
417##############################
418
#-----------------------------------------------------------------
# start tag handler &root for the root element of the input:
# renames the root element according to the output format and
# inserts the corpus header
#   $twig         - the twig of the input document
#   $root         - the root element
#   $corpusHeader - root element of the corpus header skeleton
#-----------------------------------------------------------------
sub root {
    my ($twig, $root, $corpusHeader) = @_;

    if ($TEIFORMAT ne "I5") {
        # TEI: <teiCorpus> in the TEI namespace
        $root->set_gi('teiCorpus');
        $root->set_att("xmlns", 'http://www.tei-c.org/ns/1.0');
    }
    else {
        # I5: <idsCorpus> with DOCTYPE declaration
        $twig->set_doctype($DTDDECL);    # the doctype could probably be set anywhere to the twig
        $root->set_gi('idsCorpus');
        $root->set_att('version', "1.0");
        $root->set_att('TEIform', "teiCorpus.2");
    }

    return insertCorpusHeader($root, $corpusHeader);
}
436
437
Harald Lüngendb5e6e72024-09-04 17:41:18 +0300438
#-----------------------------------------------------------------
# fills the corpus header skeleton with the metadata of the
# current source and pastes it as first child into the root;
# for I5 the prepared <idsDoc> is pasted right after the header
#   $root         - the root element of the output
#   $corpusHeader - root element of the corpus header skeleton
#-----------------------------------------------------------------
sub insertCorpusHeader{
    my ($root, $corpusHeader) = @_;

    # name of the attribute that carries the language code on <language>
    my $lang_att_name = "ident";

    #-----------------------
    # set corpus header
    #-----------------------

    set_title($corpusHeader, $srcfullnames{$fnsource}, $fnyear, $kielipankkiCorpus);

    if ($TEIFORMAT eq "TEI") {
        set_sourceDesc($corpusHeader, $srcfullnames{$fnsource}, $fnyear, $kielipankkiCorpus);
    }
    elsif ($TEIFORMAT eq "I5") {
        $lang_att_name = "id";

        my $sigle_elt = $corpusHeader->get_xpath("fileDesc/titleStmt/korpusSigle", 0);
        $sigle_elt->set_text($corpusids{$fnsource} . $fnYY);

        my $pubdate_elt = $corpusHeader->get_xpath("fileDesc/publicationStmt/pubDate", 0);
        $pubdate_elt->set_text((localtime)[5] + 1900);

        my $transduction_elt = $corpusHeader->get_xpath('encodingDesc/editorialDecl/transduction[@n="2"]', 0);
        $transduction_elt->set_text("I5 version by EuReCo using vrt2tei.pl " . localtime->ymd('-'));

        set_sourceDescI5($corpusHeader);
    }
    else {
        print STDERR "TEIFORMAT is $TEIFORMAT; must be TEI or I5";
    }

    $corpusHeader->paste("first_child", $root);

    # language of the source: code as attribute, expanded name as text
    my $language_elt = $corpusHeader->get_xpath('./profileDesc/langUsage/language', 0);
    $language_elt->set_att($lang_att_name, $srctextlangs{$fnsource});
    $language_elt->set_text($expandLang{$srctextlangs{$fnsource}});

    if ($TEIFORMAT eq "I5") {
        $idsDoc->paste("after", $corpusHeader);
    }
}
472
Harald Lüngendb5e6e72024-09-04 17:41:18 +0300473
Harald Lüngen9d4e0462024-08-23 09:34:22 +0300474#----------------------------
475# handler &text for <text>
476#----------------------------
477
# Twig handler for each completely parsed <text> of the VRT input.
#   $twig       - the twig of the input document
#   $text       - the <text> element
#   $textHeader - a copy of the text header skeleton for this text
# Turns <text> into <TEI> (or <idsText> for I5) with a header and a
# <text>/<body>/<div> structure holding the paragraphs, rebuilds every
# sentence from <w> elements (one per non-empty line of its text content),
# and finally flushes the twig to STDOUT.
sub text {
    my ($twig, $text, $textHeader) = @_;

    $textcounter++;


    # ToDo: catch all other, unexpected children of root

    #--------------------------------------------------------------------------
    # Get text metadata (attributes of <text>) and create teiHeader for <text>
    #--------------------------------------------------------------------------

    my $textattsref = $text->atts();    # reference to the attribute hash, to be used with '->'

    createIdsDoc($textattsref);    # this creation of idsDoc will only be called for the 2nd idsDoc (i.e. february) or higher

    # createTextHeader returns the $textID:
    my $textID = createTextHeader($text, $textattsref, $textHeader);


    #----------------------------------------
    # create <TEI> or <idsText> from <text>
    #----------------------------------------

    # the attributes were saved above, so they can be deleted before <text> is renamed
    $text->del_atts;

    if ($TEIFORMAT eq "TEI") {
        $text->set_gi("TEI");
        $text->set_att('xml:id', $textID);
    }
    else {
        $text->set_gi("idsText");
        $text->set_att('version', "1.0");
        # $text->move("last_child", $idsDoc);  # does not work because apparently $idsDoc is not under $root at this point
    }


    #------------------------------------------------------------------
    # create the <tei:text>, <body>, <div> elements inside <TEI>
    #------------------------------------------------------------------

    my $ttext_element = XML::Twig::Elt->new('text');
    my $body_element  = XML::Twig::Elt->new('body');
    my $div_element   = XML::Twig::Elt->new('div');

    # set atts
    $div_element  ->set_att("type", "page");       # ToDo: this is specific to KLK
    $ttext_element->set_att("xml:lang", 'fi');     # as in ICC-NOR

    # paste
    $ttext_element->paste('last_child', $text);
    $body_element ->paste('last_child', $ttext_element);
    $div_element  ->paste('last_child', $body_element);


    #-------------------------------
    # create <p> from <paragraph>
    #-------------------------------

    foreach my $paragraph ($text->children('paragraph')) {

        setP($paragraph);

        $paragraph->move('last_child', $div_element);

        #------------------------------
        # create <s> from <sentence>
        #------------------------------

        foreach my $sentence ($paragraph->children('sentence')) {

            setS($sentence);

            #--------------------------------------
            # create <w> (word) from each $line
            #--------------------------------------

            my @lines = split(/\n+/, $sentence->xml_text);
            $sentence->set_text("\n");

            for my $line (@lines) {    # ToDo: check the order
                next if $line eq "";

                my $w_element = XML::Twig::Elt->new('w');
                createW($w_element, $line);
                $w_element->paste('last_child', $sentence);
            }    # end words
        }    # end sentences
    }    # end paragraphs

    # $twig->set_pretty_print( 'record');

    # flush() takes an optional filehandle followed by option pairs.  The
    # path string "/dev/stdout" that used to be passed here is not a
    # filehandle, so STDOUT is now passed explicitly as a handle.
    $twig->flush(\*STDOUT);
}
Harald Lüngen9d4e0462024-08-23 09:34:22 +0300581
582sub createTextHeader{
Harald Lüngendb5e6e72024-09-04 17:41:18 +0300583 my ($text, $textattsref, $textHeader) = @_;
Harald Lüngen9d4e0462024-08-23 09:34:22 +0300584
585 # USE 01 binding_id="2246025"
586 # USE 02 date="2021-01-15"
587 # 03 datefrom="20210115"
588 # 04 dateto="20210115"
589 # 05 elec_date="_"
590 # 06 file=""
591 # USE 07 filename_metadata="finclarin_siirto_k2021/0039-5552/2021/alto/2246025_0039-5552_2021-01-15_SK0221_mets.xml"
592 # USE 08 filename_orig ="finclarin_siirto_k2021/0039-5552/2021/alto/2246025_0039-5552_2021-01-15_SK0221_page-2021011502210030301.xml
593 # USE 09 id="t-bcd0f3fa-bbd3dac4"
594 # 10 img_url=""
595 # USE 11 issue_date="15.01.2021"
596 # USE 12 issue_no="SK0221"
597 # USE 13 issue_title="Suomen Kuvalehti"
598 # USE 14 label="Suomen Kuvalehti no. SK0221 15.01.2021"
599 # USE 16 language="fi"
600 # USE 17 page_id="p1"
601 # USE 18 page_no="None"
602 # 19 part_name="_"
Harald Lüngen34c05612025-02-04 15:08:08 +0200603 # USE 20 publ_id="0039-5552"
Harald Lüngen9d4e0462024-08-23 09:34:22 +0300604 # 21 publ_part=""
605 # USE 22 publ_title="Suomen Kuvalehti"
606 # USE 23 publ_type="aikakausi"
607 # USE 24 sentcount="70"
608 # USE 25 sum_lang="|xxx:44|fin:23|eng:3|"
609 # 26 timefrom="000000"
610 # 27 timeto="235959"
611 # USE 28 tokencount="304"
612 # 29 version_added="KLK-fi-2021">
613
614
615 my $BID = $textattsref->{'binding_id'};
616 my $DATE = $textattsref->{'date'};
617 my $METAFILENAME = $textattsref->{'filename_metadata'};
618 my $ORIGFILENAME = $textattsref->{'filename_orig'};
619 my $ID = $textattsref->{'id'};
620 my $ISSUEDATE = $textattsref->{'issue_date'};
621 my $ISSUENO = $textattsref->{'issue_no'};
622 my $ISSUETITLE = $textattsref->{'issue_title'};
623 my $LABEL = $textattsref->{'label'};
624 my $LANGUAGE = $textattsref->{'language'};
625 my $PAGEID = $textattsref->{'page_id'};
626 my $PAGENO = $textattsref->{'page_no'};
627 my $PUBLTITLE = $textattsref->{'publ_title'};
Harald Lüngen34c05612025-02-04 15:08:08 +0200628 my $PUBLID = $textattsref->{'publ_id'};
Harald Lüngen9d4e0462024-08-23 09:34:22 +0300629 my $PUBLTYPE = $textattsref->{'publ_type'};
630 my $SENTCOUNT = $textattsref->{'sentcount'};
631 my $SUMLANG = $textattsref->{'sum_lang'};
632 my $TOKENCOUNT = $textattsref->{'tokencount'};
633
Harald Lüngenba0354b2024-09-11 16:24:08 +0300634
Harald Lüngen9d4e0462024-08-23 09:34:22 +0300635 #-----------------------------
636 # Derived Metadata variables
637 #-----------------------------
638
639 my @datearray = split("-", $DATE);
640 my @langarray = split("|", $SUMLANG);
641 my @namearray = split(/[\.\/]/, $ORIGFILENAME); # use $namearray[4] as ID for the page
Harald Lüngendb5e6e72024-09-04 17:41:18 +0300642
Harald Lüngen86cbd932024-09-10 15:52:18 +0300643 #----------------------------------------------------
644 # create textSigle to be returned from this function
645 #----------------------------------------------------
646
647 # SUK21.JAN.00001
648
Harald Lüngen86cbd932024-09-10 15:52:18 +0300649 my $yy = substr($datearray[0], 2, 2); # substr EXPR,OFFSET,LENGTH
650 my $mm = $datearray[1]; # substr EXPR,OFFSET,LENGTH
651 my $MMM = $months{$mm};
652
Harald Lüngen8162ad52024-09-19 10:54:24 +0300653 my $CSIGLE = $corpusids{$fnsource} . $yy;
654
655 my $textID = $CSIGLE . "_" . $MMM . "." . sprintf("%05d", $doccounter{$mm}++);
Harald Lüngen2551a952024-09-15 08:08:35 +0300656 my $textSigle = $textID;
Harald Lüngen86cbd932024-09-10 15:52:18 +0300657
Harald Lüngen695ac1d2024-09-05 08:55:21 +0300658
Harald Lüngen9d4e0462024-08-23 09:34:22 +0300659 #-----------------------------------------------------------------------
Harald Lüngendb5e6e72024-09-04 17:41:18 +0300660 # CREATE text-teiHeader ACCORDING TO THE SKELETON in $textHeader
Harald Lüngen9d4e0462024-08-23 09:34:22 +0300661 #-----------------------------------------------------------------------
662
Harald Lüngen9d4e0462024-08-23 09:34:22 +0300663
Harald Lüngendb5e6e72024-09-04 17:41:18 +0300664 $textHeader->paste('first_child', $text);
665
666 #-----------------------------------------------
667 # <teiHeader>
Harald Lüngen695ac1d2024-09-05 08:55:21 +0300668 # <fileDesc n="EuReCo-KLK-FIN_[$ID]">
Harald Lüngendb5e6e72024-09-04 17:41:18 +0300669 # <titleStmt>
670 # <title>[$LABEL, page $PAGENO]</title>
671
Harald Lüngen8162ad52024-09-19 10:54:24 +0300672 $textHeader->first_child("fileDesc") -> set_att('n', "EuReCo-". $kielipankkiCorpus . "-" . $ID);
Harald Lüngen2551a952024-09-15 08:08:35 +0300673
674
675 #-----------------
676 # titleStmt
677 #----------------
Harald Lüngen695ac1d2024-09-05 08:55:21 +0300678
Harald Lüngen2551a952024-09-15 08:08:35 +0300679 my $title="title";
680 my $titleStmt = $textHeader->first_child("fileDesc")->first_child("titleStmt");
681
682 if($TEIFORMAT eq "I5"){
683 $title = "t.title";
684 $textSigle =~ s/_/\//g;
685 $titleStmt->first_child("textSigle")->set_text($textSigle);
686 };
687
688 $titleStmt->first_child($title)->set_text($LABEL . ", Text #" . $textcounter);
689
Harald Lüngen8162ad52024-09-19 10:54:24 +0300690 # Case KLK: PAGENO scheint meist "None" zu sein
Harald Lüngendb5e6e72024-09-04 17:41:18 +0300691
692 #-----------------------------------------------
693 # <fileDesc>
694 # <sourceDesc>
695 # <biblStruct>
696 # <analytic>
697 # <title type="main">[$LABEL, page $PAGENO]</title>
698 # <date>[$DATE]</date>
699 # <date type="year">TODO</date>
700 # <date type="month">TODO</date>
701 # <date type="day">TODO</date>
702 # <idno type="PAGEID">$PAGEID</idno>
703 # <idno type="BINDINGID">$BID</idno>
704 # <idno type="ID">$ID</idno>
705 # <idno type="KIELIPANKKI_METAFILENAME">$METAFILENAME</idno>
706 # <idno type="KIELIPANKKI_ORIGFILENAME">$ORIGFILENAME</idno>
707 # <textLang>$LANGUAGE</textLang>
Harald Lüngendb5e6e72024-09-04 17:41:18 +0300708
Harald Lüngen8162ad52024-09-19 10:54:24 +0300709
Harald Lüngendb5e6e72024-09-04 17:41:18 +0300710 my $analytic = $textHeader->get_xpath("./fileDesc/sourceDesc/biblStruct/analytic", 0);
Harald Lüngen2551a952024-09-15 08:08:35 +0300711 if($TEIFORMAT eq "I5"){$title="h.title"};
712
713
714 $analytic->first_child($title) ->set_text($LABEL . ", Text #" . $textcounter); # Case KLK; PAGENO scheint meist "None" zu sein
Harald Lüngen381c2a22024-09-17 09:06:39 +0300715 #$analytic->get_xpath('./idno[@type="PAGEID"]', 0) ->set_text($PAGEID);
716 #$analytic->get_xpath('./idno[@type="BINDINGID"]', 0) ->set_text($BID);
717 #$analytic->get_xpath('./idno[@type="ID"]', 0) ->set_text($ID);
718 #$analytic->get_xpath('./idno[@type="KIELIPANKKI_METAFILENAME"]', 0) ->set_text($METAFILENAME);
719 #$analytic->get_xpath('./idno[@type="KIELIPANKKI_ORIGFILENAME"]', 0) ->set_text($ORIGFILENAME);
Harald Lüngen2551a952024-09-15 08:08:35 +0300720 if($TEIFORMAT eq "TEI"){
721 $analytic->first_child('textLang') ->set_text($LANGUAGE);
722 }
Harald Lüngen9d4e0462024-08-23 09:34:22 +0300723
Harald Lüngendb5e6e72024-09-04 17:41:18 +0300724 # <monogr>
725 # <title>$PUBLTITLE</title>
726 # <imprint>
727 # <pubPlace>TODO</pubPlace>
Harald Lüngen695ac1d2024-09-05 08:55:21 +0300728 # <publisher>TODO</publisher>
Harald Lüngendb5e6e72024-09-04 17:41:18 +0300729 # </imprint>
730 # <biblScope unit="ISSUETITLE"/>
731 # <biblScope unit="ISSUENO"/>
732 # <biblScope unit="ISSUEDATE"/>
733 # <biblScope unit="pp">$PAGENO</biblScope>
Harald Lüngen9d4e0462024-08-23 09:34:22 +0300734
Harald Lüngendb5e6e72024-09-04 17:41:18 +0300735 my $monogr = $textHeader->get_xpath("./fileDesc/sourceDesc/biblStruct/monogr", 0);
Harald Lüngen9d4e0462024-08-23 09:34:22 +0300736
Harald Lüngen2551a952024-09-15 08:08:35 +0300737 $monogr->first_child($title) ->set_text($PUBLTITLE);
738 if($TEIFORMAT eq "TEI"){
739 $monogr->get_xpath('./imprint/date[@type="date"]', 0) ->set_text($DATE);
740 }
741 my $date = "date";
742 if($TEIFORMAT eq "I5"){$date="pubDate"};
743 $monogr->get_xpath('./imprint/' . $date . '[@type="year"]', 0) ->set_text($datearray[0]);
744 $monogr->get_xpath('./imprint/' . $date . '[@type="month"]', 0) ->set_text($datearray[1]);
745 $monogr->get_xpath('./imprint/' . $date . '[@type="day"]', 0) ->set_text($datearray[2]);
Harald Lüngen34c05612025-02-04 15:08:08 +0200746 $monogr->first_child("imprint")->first_child("pubPlace") ->set_text($srcpubplaces{$PUBLID}); # imprint is needed for tei validity
Harald Lüngen8162ad52024-09-19 10:54:24 +0300747 $monogr->first_child("imprint")->first_child("pubPlace") ->set_att('key', $CountryKey);
Harald Lüngen34c05612025-02-04 15:08:08 +0200748 $monogr->first_child("imprint")->first_child("publisher") ->set_text($srcpublishers{$PUBLID}); # imprint is needed for tei validity
Harald Lüngen381c2a22024-09-17 09:06:39 +0300749 #$monogr->get_xpath('./biblScope[@unit="ISSUETITLE"]', 0) ->set_text($ISSUETITLE);
750 #$monogr->get_xpath('./biblScope[@unit="ISSUENO"]', 0) ->set_text($ISSUENO);
751 #$monogr->get_xpath('./biblScope[@unit="ISSUEDATE"]', 0) ->set_text($ISSUEDATE);
752 #$monogr->get_xpath('./biblScope[@unit="pp"]', 0) ->set_text($PAGENO); # Achtung - PAGENO scheint meist "None" zu sein
Harald Lüngen9d4e0462024-08-23 09:34:22 +0300753
Harald Lüngen2551a952024-09-15 08:08:35 +0300754 my $dateNice = $datearray[2] . "." . $datearray[1] . "." . $datearray[0];
755 my $dateBackwards = $datearray[0] . "." . $datearray[1] . "." . $datearray[2];
756
757 if($TEIFORMAT eq "I5"){
Harald Lüngen381c2a22024-09-17 09:06:39 +0300758 my $refCompleteText = $textSigle . " " . $PUBLTITLE . " no. " . $ISSUENO . ", [" . $PUBLTYPE . "], " . $dateNice;
Harald Lüngen2551a952024-09-15 08:08:35 +0300759 my $refShortText = $textSigle . " " . $PUBLTITLE . ", " . $dateNice;
760 $textHeader->get_xpath('./fileDesc/sourceDesc/reference[@type="complete"]', 0) -> set_text($refCompleteText);
761 $textHeader->get_xpath('./fileDesc/sourceDesc/reference[@type="short"]' , 0) -> set_text($refShortText);
762 }
763
764
Harald Lüngen695ac1d2024-09-05 08:55:21 +0300765 # <encodingDesc>
766 # <tagsDecl>
767 # <namespace name="http://www.tei-c.org/ns/1.0">
768 # <tagUsage gi="s" occurs="SENTCOUNT"/>
769 # <tagUsage gi="w" occurs="TOKENCOUNT"/>
Harald Lüngen9d4e0462024-08-23 09:34:22 +0300770
Harald Lüngen2551a952024-09-15 08:08:35 +0300771 my $namespacePath="./encodingDesc/tagsDecl/namespace/";
772 if($TEIFORMAT eq "I5"){$namespacePath="./encodingDesc/tagsDecl/"};
773
774 $textHeader->get_xpath($namespacePath . 'tagUsage[@gi="s"]', 0) -> set_att('occurs', $SENTCOUNT);
775 $textHeader->get_xpath($namespacePath . 'tagUsage[@gi="w"]', 0) -> set_att('occurs', $TOKENCOUNT);
776
Harald Lüngen695ac1d2024-09-05 08:55:21 +0300777 # <profileDesc>
778 # <langUsage>
779 # <language ident="fi" usage="|xxx:44|fin:23|eng:3|"/>
780 # </langUsage>
781 # <textClass>
782 # <classCode scheme="kielipankki_klk">$PUBLTYPE</classCode>
Harald Lüngen2551a952024-09-15 08:08:35 +0300783 # <classCode scheme="kielipankki_klk_mapped">$mapping{$PUBLTYPE}</classCode>
Harald Lüngen9d4e0462024-08-23 09:34:22 +0300784
Harald Lüngen2551a952024-09-15 08:08:35 +0300785 if($TEIFORMAT eq "I5"){
786 $textHeader->get_xpath('./profileDesc/creation/creatDate', 0) ->set_text($dateBackwards);
787 }
Harald Lüngen8162ad52024-09-19 10:54:24 +0300788 if($TEIFORMAT eq "TEI"){
789 $textHeader->get_xpath('./profileDesc/langUsage/language', 0) ->set_att('ident', $LANGUAGE);
790 $textHeader->get_xpath('./profileDesc/langUsage/language', 0) ->set_att('usage', $SUMLANG);
791 }
Harald Lüngen695ac1d2024-09-05 08:55:21 +0300792 # in @usage muss eigt. ein integer; am besten inhalt von SUMLANG aufdroeseln und mehrere <language> machen
Harald Lüngen9d4e0462024-08-23 09:34:22 +0300793
Harald Lüngen695ac1d2024-09-05 08:55:21 +0300794 $textHeader->get_xpath('./profileDesc/textClass/classCode[@scheme="kielipankki_klk"]', 0) ->set_text($PUBLTYPE);
Harald Lüngen2551a952024-09-15 08:08:35 +0300795 $textHeader->get_xpath('./profileDesc/textClass/classCode[@scheme="kielipankki_klk_mapped"]', 0)->set_text($mapping{$PUBLTYPE});
Harald Lüngen9d4e0462024-08-23 09:34:22 +0300796
Harald Lüngen695ac1d2024-09-05 08:55:21 +0300797 # <revisionDesc>
798 # <change when="TODO" who="HL">TEI version for EuReCo</change>
Harald Lüngen9d4e0462024-08-23 09:34:22 +0300799
Harald Lüngen695ac1d2024-09-05 08:55:21 +0300800 $textHeader->get_xpath('./revisionDesc/change', 0) ->set_att('when', localtime->ymd('-'));
Harald Lüngendb5e6e72024-09-04 17:41:18 +0300801
Harald Lüngen86cbd932024-09-10 15:52:18 +0300802 return $textID;
Harald Lüngendb5e6e72024-09-04 17:41:18 +0300803
Harald Lüngen86cbd932024-09-10 15:52:18 +0300804
Harald Lüngen695ac1d2024-09-05 08:55:21 +0300805 #-----------------------------------
Harald Lüngen9d4e0462024-08-23 09:34:22 +0300806 # END OF CREATING TEIHEADER
Harald Lüngen695ac1d2024-09-05 08:55:21 +0300807 #-----------------------------------
Harald Lüngen9d4e0462024-08-23 09:34:22 +0300808
809}
Harald Lüngen695ac1d2024-09-05 08:55:21 +0300810
# Turn a VRT <paragraph> element into a TEI <p> element, in place.
#
# Argument:
#   $paragraph - element object offering set_gi, att, set_att and del_att
#                (presumably an XML::Twig element; the script is built on XML::Twig)
#
# Example input: <paragraph id="p-bcd0f3fa-bbd3dac4-815ead7a" sum_lang="|fin:1|">
#
# Attribute handling:
#   @sum_lang  copied into @xml:lang, prefixed with "x-" to mark a private value
#   @id        dropped, because this id is not unique
sub setP {
    my ($paragraph) = @_;

    $paragraph->set_gi('p');

    my $sum_lang = $paragraph->att("sum_lang");

    # Without a sum_lang value the concatenation would produce the bare
    # prefix "x-", so xml:lang is only written when there is something to carry.
    if (defined $sum_lang && length $sum_lang) {
        $paragraph->set_att("xml:lang", "x-" . $sum_lang);
    }

    $paragraph->del_att("sum_lang");

    # Renaming id to xml:id was abandoned: the id is not unique.
    $paragraph->del_att("id");
}
# Turn a VRT <sentence> element into a TEI <s> element, in place.
#
# Argument:
#   $sentence - element object offering set_gi, att, set_att and del_att
#               (presumably an XML::Twig element; the script is built on XML::Twig)
#
# Attribute handling:
#   @id         e.g. "s-bcd0f3fa-bbd3dac4-f7429090"; dropped, because it is not unique
#   @lang       e.g. "fin"; moved to @xml:lang
#   @lang_conf  e.g. "0.6734853"; dropped for the time being (ToDo: maybe @cert)
sub setS {
    my ($sentence) = @_;

    $sentence->set_gi('s');

    # ToDo: convert the value / introduce a hash for lookup
    # (input values: "fin", "xxx", ....)
    my $lang = $sentence->att("lang");

    # A sentence without @lang would otherwise get an undefined xml:lang value.
    if (defined $lang) {
        $sentence->set_att("xml:lang", $lang);
    }

    $sentence->del_att('id');
    $sentence->del_att("lang");         # replaced by xml:lang
    $sentence->del_att("lang_conf");    # for the time being
}
844
# Populate a <w> element from one VRT token line.
#
# Arguments:
#   $w_element - element to fill; needs set_text, set_att and del_att
#   $line      - one token line with tab-separated columns
#
# Reads the file-level flags $MASK (mask primary data) and $TEIFORMAT.
#
# vrt word and positional attributes in corpus KLK:
#   USE [0]  word
#   USE [1]  ref        (id for reference of dephead)
#   USE [2]  lemma
#   ?   [3]  lemmacomp  (lemma with compound info - could go in @norm, as tag abuse?)
#   USE [4]  pos
#   USE [5]  msd
#   USE [6]  dephead
#   USE [7]  deprel
#       [8]  content    (ocr-process)
#       [9]  vpos       (ocr-process)
#       [10] ocr        (ocr-process)
#       [11] cc         (ocr-process)
#       [12] hyph       (ocr-process)
#       [13] style      (ocr-process)
#       [14] lex        (korp semantic disambiguation from Goeteborg)
sub createW {
    my ($w_element, $line) = @_;

    #---------------------------
    # Get the tags (=columns)
    #---------------------------

    # Limit -1 keeps trailing empty columns instead of silently dropping them.
    # NOTE(review): $line is not chomped here; presumably the caller has
    # already removed the line terminator - confirm.
    my @tags = split(/\t/, $line, -1);

    # A short line must not hand undefined values to the element methods or
    # to the string comparison below, so the used columns default to "".
    for my $col (0 .. 7) {
        $tags[$col] = "" unless defined $tags[$col];
    }
    my ($word, $ref, $lemma, $pos, $msd, $dephead, $deprel) = @tags[0, 1, 2, 4, 5, 6, 7];

    # Set word string and lemma string according to the $MASK flag:
    # when masking, every non-punctuation token is replaced by a random
    # alphabetic string of the same length, used for both word and lemma.
    my $w_string = $word;
    my $l_string = $lemma;
    if ($MASK && ($pos ne "Punct")) {
        $w_string = Data::Random::String->create_random_string(length=>length($word), contains=>'alpha');
        $l_string = $w_string;
    }
    $w_element->set_text($w_string);

    # set the attributes of <w>:
    $w_element->set_att("n", $ref);

    # An id assembled from file name, sentence id and token ref turned out
    # not to be unique either, so any id is removed.
    $w_element->del_att("id");

    $w_element->set_att("lemma", $l_string);

    # Column [3] (lemmacomp) is not written; putting it in @norm would be tag abuse.
    $w_element->set_att("pos", $pos);
    $w_element->set_att("msd", $msd);

    if ($TEIFORMAT eq "I5") {    # remove condition when part of the official TEI
        $w_element->set_att("head",   $dephead);
        $w_element->set_att("deprel", $deprel);
    }
}
902
Harald Lüngendb5e6e72024-09-04 17:41:18 +0300903
# Write the corpus title into the corpus header.
#
# Arguments:
#   $corpusHeader      - header element; must contain fileDesc/titleStmt/<title element>
#   $source            - source name, first part of the title
#   $year              - year, second part of the title
#   $kielipankkiCorpus - name of the Kielipankki corpus the data comes from
#
# Reads the file-level $TEIFORMAT: for "I5" the title element is <c.title>,
# otherwise <title>.
#
# Target structure:
#   <teiHeader>
#     <fileDesc>
#       <titleStmt>
#         <title>[Aamulehti2021] from [klk-fi-v2-vrt for] EuReCo</title>
#       </titleStmt>
#       <!-- ... -->
#     </fileDesc>
#   </teiHeader>
sub set_title {
    my ($corpusHeader, $source, $year, $kielipankkiCorpus) = @_;

    my $title_gi   = ($TEIFORMAT eq "I5") ? "c.title" : "title";
    my $title_text = "$source $year, from $kielipankkiCorpus for EuReCo";

    my $file_desc  = $corpusHeader->first_child("fileDesc");
    my $title_stmt = $file_desc->first_child("titleStmt");
    my $title_node = $title_stmt->first_child($title_gi);

    $title_node->set_text($title_text);
}
929
# Write the bibliographic one-liner into the corpus header's <sourceDesc>.
#
# Arguments:
#   $corpusHeader      - header element; must contain fileDesc/sourceDesc/bibl
#   $source            - source name
#   $year              - year
#   $kielipankkiCorpus - name of the Kielipankki corpus the data comes from
#
# Target structure:
#   <teiHeader>
#     <fileDesc>
#       <!-- ... -->
#       <sourceDesc>
#         <bibl>[Aamulehti2021] from [klk-fi-v2-vrt]</bibl>
#       </sourceDesc>
#       <!-- ... -->
#     </fileDesc>
#   </teiHeader>
sub set_sourceDesc {
    my ($corpusHeader, $source, $year, $kielipankkiCorpus) = @_;

    my $bibl_text = "$source $year, from $kielipankkiCorpus for EuReCo";

    my $file_desc   = $corpusHeader->first_child("fileDesc");
    my $source_desc = $file_desc->first_child("sourceDesc");
    my $bibl_node   = $source_desc->first_child("bibl");

    $bibl_node->set_text($bibl_text);
}
949
# Fill the <sourceDesc> of the I5 corpus header.
#
# Argument:
#   $corpusHeader - header element; must contain
#                   fileDesc/sourceDesc/biblStruct/monogr (with h.title and
#                   imprint/publisher, imprint/pubPlace) and fileDesc/sourceDesc/reference
#
# Reads the file-level variables $fnsource, $fnyear, $CountryKey and the
# lookup hashes %srcpublids, %srcfullnames, %srcpubplaces, %srcpublishers
# and %corpusids.
#
# Target structure:
#   <idsHeader>
#     <fileDesc>
#       <!-- ... -->
#       <sourceDesc>
#         <biblStruct>
#           <monogr>
#             <h.title type="main">[$PUBLTITLE], [$YEAR]</h.title>
#             <imprint>
#               <publisher>[$PUBLISHER]</publisher>
#               <pubPlace key="[$TL]">[$PUBPLACE]</pubPlace>
#             </imprint>
#           </monogr>
#         </biblStruct>
#         <reference type="super" assemblage="regular">[$KKK] [$PUBLTITLE]; [$PUBPLACE]: [$PUBLISHER], [$YEAR]</reference>
#       </sourceDesc>
#       <!-- ... -->
#     </fileDesc>
#   </idsHeader>
sub set_sourceDescI5 {
    my ($corpusHeader) = @_;

    # Publication metadata, looked up via the source name.
    my $publ_id    = $srcpublids{$fnsource};
    my $publ_title = $srcfullnames{$fnsource};
    my $publ_place = $srcpubplaces{$publ_id};
    my $publisher  = $srcpublishers{$publ_id};

    # Corpus sigle = corpus id of the publication + two-digit year.
    my $year         = $fnyear;
    my $corpus_sigle = $corpusids{$publ_id} . substr($fnyear, 2, 2);

    my $source_desc = $corpusHeader->first_child("fileDesc")->first_child("sourceDesc");
    my $monogr      = $source_desc->first_child("biblStruct")->first_child("monogr");

    # Only the title is written here, although the skeleton above shows "[title], [year]".
    $monogr->first_child("h.title")->set_text($publ_title);

    my $imprint = $monogr->first_child("imprint");
    $imprint->first_child("publisher")->set_text($publisher);

    my $pub_place_node = $imprint->first_child("pubPlace");
    $pub_place_node->set_text($publ_place);
    $pub_place_node->set_att('key', $CountryKey);

    my $reference_text = sprintf("%s %s; %s: %s, %s",
        $corpus_sigle, $publ_title, $publ_place, $publisher, $year);
    $source_desc->first_child("reference")->set_text($reference_text);
}
992
993
994
# Print the <idsDoc> wrapper (with its document-level <idsHeader>) for a new month.
#
# Argument:
#   $textattsref - hash reference with the keys 'date' (split on "-" into
#                  year, month, ...), 'publ_title' and 'publ_id'
#
# Reads the file-level $TEIFORMAT, $kielipankkiLicense, %corpusids, %months
# and %monthnames, and reads and increments $LASTMONTH.
#
# Output happens only when the month of the text is exactly $LASTMONTH + 1;
# $LASTMONTH is then advanced. Nothing is printed for month 1, so this will
# only produce output for the second idsDoc (i.e. for february) and higher.
#
# NOTE(review): when $TEIFORMAT is not "I5" the string stays empty, yet a
# bare newline is still printed on a month change - confirm this is intended.
sub createIdsDoc {
    my ($textattsref) = @_;
    my $DATE      = $textattsref->{'date'};
    my $PUBLTITLE = $textattsref->{'publ_title'};
    my $PUBLID    = $textattsref->{'publ_id'};

    my @datearray = split("-", $DATE);
    my $MONTH     = $datearray[1];
    my $YEAR      = $datearray[0];
    my $YY        = substr($YEAR, 2, 2);

    my $CSIGLE    = $corpusids{$PUBLID} . $YY;

    my $DOCID     = $months{$MONTH};
    my $MONTHNAME = $monthnames{$MONTH};

    my $idsDocString = "";
    if ($TEIFORMAT eq "I5") {

        # The header is assembled as plain text, so the title has to be made
        # safe for XML by hand. Ampersands that already start an entity or
        # character reference are left alone, so an escaped title is not
        # escaped twice.
        my $titleXml = defined $PUBLTITLE ? $PUBLTITLE : "";
        $titleXml =~ s/&(?!(?:[A-Za-z][A-Za-z0-9]*|#[0-9]+|#x[0-9A-Fa-f]+);)/&amp;/g;
        $titleXml =~ s/</&lt;/g;
        $titleXml =~ s/>/&gt;/g;

        $idsDocString = "
<idsDoc version=\"1.0\" TEIform=\"TEI.2\">
<idsHeader type=\"document\" pattern=\"text\" version=\"1.1\" TEIform=\"teiHeader\">
 <fileDesc>
 <titleStmt>
 <dokumentSigle>$CSIGLE/$DOCID</dokumentSigle>
 <d.title>$titleXml, $MONTHNAME $YEAR</d.title>
 </titleStmt>
 <publicationStmt>
 <distributor/>
 <pubAddress/>
 <availability region=\"world\">$kielipankkiLicense</availability>
 <pubDate/>
 </publicationStmt>
 <sourceDesc>
 <biblStruct>
 <monogr>
 <h.title/>
 <imprint/>
 </monogr>
 </biblStruct>
 </sourceDesc>
 </fileDesc>
</idsHeader>
</idsDoc>\n";
    }

    # "+ 0" forces numeric comparison, so a month such as "02" compares as 2.
    if ($MONTH + 0 == $LASTMONTH + 1) {
        if ($MONTH + 0 > 1) {
            printf("%s\n", $idsDocString);
        }
        $LASTMONTH++;
    }
}
1047
Harald Lüngendb5e6e72024-09-04 17:41:18 +03001048
1049
Harald Lüngen9d4e0462024-08-23 09:34:22 +03001050#################
1051## usage_message
1052#################
1053
1054
# Print the command-line help to STDERR and terminate the script.
# Takes no arguments and does not return; exit is called without an
# explicit status.
sub usage_message {
    my @usage_lines = (
        "Usage: ./vrt2tei.pl [OPTIONS] <file.vrt.xml>\n",
        " <file.vrt.xml> is a VRT file as proper (well-formed) XML\n",
        " Options:\n",
        " -t (tei|i5) output format, default: tei\n",
        " -m mask primary data\n",
        "\n",
    );

    for my $usage_line (@usage_lines) {
        print STDERR $usage_line;
    }

    exit;
}
1064
1065