blob: 14b12522939b577d7f83765703f592cbc427539d [file] [log] [blame]
Harald Lüngenfe838e02024-09-25 09:01:00 +03001## #! /appl/soft/bio/bioperl/5.36.0/bin/perl
2## #! /usr/bin/perl -w
Harald Lüngen9d4e0462024-08-23 09:34:22 +03003
4
5###########################################################################################################################################################
6# vrt2tei.pl
7# eureco
8# leibniz-institut fuer deutsche sprache / csc finland esbo
9# august 2024
10#
11#
# using XML::Twig, see http://www.xmltwig.org/, https://metacpan.org/pod/XML::Twig
13#
# usage: see the usage function below
Harald Lüngencaab0802024-08-23 17:28:22 +030015# Usage: ./vrt2tei.pl <vrtxmlfile.xml> <outfile>
Harald Lüngen9d4e0462024-08-23 09:34:22 +030016# <vrtxmlfile>: xml-ised vrt file
17#
18#
19# TODO:
Harald Lüngen9d4e0462024-08-23 09:34:22 +030020
Harald Lüngenfe838e02024-09-25 09:01:00 +030021# 0 ZIPPEN mit tei2korapxml; zippen mit korAP indexing
Harald Lüngen9d4e0462024-08-23 09:34:22 +030022
Harald Lüngenfe838e02024-09-25 09:01:00 +030023# 2 threading on compute node and application on sub corpora of KLK
24# 2 build 30 billion corpus and index it
25# 3 Optionen
26# 3a parametrize deprel for I5 and if Nils is not ready yet
Harald Lüngen9d4e0462024-08-23 09:34:22 +030027
Harald Lüngenfe838e02024-09-25 09:01:00 +030028#
29
30# 5 abfangen von unerwarteten elementen dh andere als <sentence> und <paragraph>
31# 6 checks and balances, wort reihenfolge nochmal checken?
32# 7 Encode Kielipankki and National Library of Finland? in teiCorpus Header
33# 8 How to encode the CLARIN-RES better - more Info from the CMDI
34# 9 construct <idsDoc>s independent of the order of texts, probably with writing intermediate files to zip
35# 10 re-implementation of the gawk code in the perl script
# 11 Swedish corpus
Harald Lüngen9d4e0462024-08-23 09:34:22 +030037
38#
39#
40############################################################################################################################################################
41
Harald Lüngen9d4e0462024-08-23 09:34:22 +030042use strict;
43use warnings;
Harald Lüngen381c2a22024-09-17 09:06:39 +030044#use diagnostics;
Harald Lüngen9d4e0462024-08-23 09:34:22 +030045
Harald Lüngen2551a952024-09-15 08:08:35 +030046use Getopt::Std;
Harald Lüngen9d4e0462024-08-23 09:34:22 +030047use XML::Twig;
48use XML::Generator ':pretty'; # apparently no effect when using flush();
49
50
51use locale; # diese drei Zeilen, damit \b im regex nicht Umlaute und ß matcht.
52use POSIX qw(locale_h); # to be able to use setlocale()
53#setlocale(LC_ALL,'de_DE');
54setlocale(LC_ALL, "fi_FI");
55use utf8;
56use open qw( :std :encoding(UTF-8) );
57
58use Time::Piece;
59use Tie::IxHash;
Harald Lüngen381c2a22024-09-17 09:06:39 +030060use Data::Random::String;
Harald Lüngendb5e6e72024-09-04 17:41:18 +030061
62
Harald Lüngen9d4e0462024-08-23 09:34:22 +030063
Harald Lüngenfe838e02024-09-25 09:01:00 +030064
65#-------------
66# get options
67#-------------
68
# Option switches filled in by Getopt::Std::getopts() below.
# Only -h, -m and -t are evaluated in this part of the script; -s is accepted
# by the option string, but its use is not visible here.
our ($opt_h, $opt_m, $opt_s, $opt_t);

# Read the switches and print usage info if some bad option was given.
# In the getopts() spec, switches followed by ':' take an argument; switches
# without ':' are boolean flags.
if (!getopts('hms:t:')) {
    usage_message();
    exit -1;
}


#--------------------
# check argument(s)
#--------------------

# Currently exactly one positional argument is expected: the vrt-xml input
# file, e.g. Suomen_Kuvalehti2021.xml.
usage_message() unless $ARGV[0];    # input file is missing
usage_message() if $ARGV[1];        # more than one argument given


#------------------------------------------------------------
# initialize defaults for options
#------------------------------------------------------------
my $TEIFORMAT = "tei";
my $MASK      = 0;


#----------------------------------------------------------------------------------------------------------
# interpret the options and check whether their respective argument is meaningful (if applicable)
#----------------------------------------------------------------------------------------------------------

# option -h: display usage info and exit
if ($opt_h) {
    print STDERR usage_message();
    exit 0;
}

# option -t: output format, "tei" or "i5" in any capitalisation
if (defined($opt_t)) {
    $TEIFORMAT = $opt_t;
}

# The pattern is anchored so that only the two exact keywords are accepted;
# an unanchored match would also let values such as "xtei" through.
if ($TEIFORMAT !~ /\A(?:tei|i5)\z/i) {
    print STDERR "Error: invalid arg for option -t\n";
    usage_message();
    exit -1;    # an invalid option value is an error, so do not report success
}

# Normalise to the canonical upper-case spelling that the rest of the script
# compares against ("TEI" / "I5"), whatever capitalisation the user typed.
$TEIFORMAT = ($TEIFORMAT =~ /\Atei\z/i) ? "TEI" : "I5";

# option -m
if ($opt_m) {
    $MASK = 1;
}
127
128
129#-----------------------------------------------
130# OTHER GLOBAL VARIABLES
131#-----------------------------------------------
Harald Lüngen9d4e0462024-08-23 09:34:22 +0300132
# This $encoding is ONLY for the output; it is handed to the input twig as
# output_encoding further down.
my $encoding = "UTF-8";

# Doctype declaration, used for I5 output.
my $DTDDECL = 'idsCorpus PUBLIC "-//IDS//DTD IDS-I5 1.0//EN" "http://corpora.ids-mannheim.de/I5/DTD/i5.dtd"';

my $textcounter = 0;    # incremented once per <text> in the text handler
my $LASTMONTH   = 0;

# Metadata tables about the sources; they are filled from the sources CSV
# file below and keyed by source title.
our %corpusids     = ();
our %srcpublids    = ();
our %srcfullnames  = ();
our %srcpubplaces  = ();
our %srcpublishers = ();
our %srctexttypes  = ();
our %srctextlangs  = ();

# Language code => language name.
our %expandLang = ();

# One running document counter per month ("01" .. "12"), each starting at 1
# (by the month as in dereko).
my %doccounter = map { (sprintf("%02d", $_), 1) } 1 .. 12;

# Global variables pertaining to the original corpus of *all* newspapers:
my $kielipankkiCorpus  = "klk-fi-v2-vrt";
my $kielipankkiLicense = "CLARIN-RES";
my $CountryKey         = "FI";

# Table with metadata about the different sources (newspapers).
my $sourcescsvfile = "sources_klk_fi_v2_2021_4eureco.csv";

# Corpus header and text header skeletons; the I5 variants are used when the
# output format is I5, the TEI variants otherwise.
my ($corpheaderfile, $textheaderfile) =
    $TEIFORMAT eq "I5"
    ? ("i5CorpusHeaderSkeleton.i5.xml",   "i5TextHeaderSkeleton.i5.xml")
    : ("teiCorpusHeaderSkeleton.tei.xml", "teiTextHeaderSkeleton.tei.xml");


# Placeholder; replaced by the XML::Twig object for the input in MAIN.
my $twig = "";
Harald Lüngendb5e6e72024-09-04 17:41:18 +0300184
Harald Lüngenfe838e02024-09-25 09:01:00 +0300185# variables $fnsource and $fnyear derived from the filename
# Derive $fnsource (source name) and $fnyear (four-digit year) from the base
# name of the input file, which has to end in <YYYY>.xml,
# e.g. Suomen_Kuvalehti2021.xml => "Suomen_Kuvalehti" and "2021".
my @array = split(/\//, $ARGV[0]);    # path components of the input file
my $l = scalar(@array);
my $fnsource = $array[$l-1];          # last component = base name

# Cut off the trailing "<YYYY>.xml". The capture variable is only read when
# the substitution actually matched; otherwise it would be undefined or left
# over from an earlier match and the derived sigles would be wrong.
my $fnyear;
if ($fnsource =~ s/([0-9][0-9][0-9][0-9])\.xml$//) {
    $fnyear = $1;
}
else {
    die("$0: cannot derive source and year from input file name (expected <source><YYYY>.xml): $ARGV[0]\n");
}

my $fnYY = substr($fnyear, 2, 2);     # two-digit year, e.g. "21"
Harald Lüngen381c2a22024-09-17 09:06:39 +0300193
194
195# months
# Two-digit month number ("01" .. "12") => three-letter upper-case abbreviation.
my %months = do {
    my @abbreviations = qw(JAN FEB MAR APR MAY JUN JUL AUG SEP OCT NOV DEC);
    map { (sprintf("%02d", $_ + 1), $abbreviations[$_]) } 0 .. $#abbreviations;
};

# Two-digit month number ("01" .. "12") => full English month name.
my %monthnames = do {
    my @names = qw(
        January February March     April   May      June
        July    August   September October November December
    );
    map { (sprintf("%02d", $_ + 1), $names[$_]) } 0 .. $#names;
};

# Finnish publication type => German text type label.
my %mapping = (
    "aikakausi"   => "Zeitschrift",
    "sanomalehti" => "Zeitung",
);
229
230
231
Harald Lüngen381c2a22024-09-17 09:06:39 +0300232#-------------------------------------------------------------------------------------------
233# read source metadata file (prepared manually => ultimately read the info from CMDI File?)
234# and set variables
235#-------------------------------------------------------------------------------------------
236
# Three-argument open so that the file name can never be interpreted as part
# of the mode; the system error ($!) is included in the message.
open(my $SOURCES, "<", $sourcescsvfile) || die("$0: cannot open file for reading: $sourcescsvfile: $!");
while (my $fline = <$SOURCES>) {
    chomp($fline);

    # skip comment lines, empty lines and the header line
    next if $fline =~ /^\#/ || $fline =~ /^\s*$/ || $fline =~ /TEICORPUSID/;

    # split each line into its tab-separated columns
    my @flarray = split(/\s*\t+\s*/, $fline);

    # (ToDo: the following hashes could probably be conflated into an array of hashes or so)

    # Every row is registered under two keys: column 2 (the value also stored
    # as the full name) and column 3 (presumably the simple title as it occurs
    # in the input file name - verify against the CSV file).
    # A key column that is missing in a short row is skipped.
    for my $key (grep { defined } @flarray[2, 3]) {
        $corpusids{$key}     = $flarray[0];
        $srcpublids{$key}    = $flarray[1];
        $srcfullnames{$key}  = $flarray[2];
        $srcpubplaces{$key}  = $flarray[7];
        $srcpublishers{$key} = $flarray[8];
        $srctexttypes{$key}  = $flarray[5];
        $srctextlangs{$key}  = $flarray[6];
    }
}
close($SOURCES);

# language code => language name
$expandLang{"fi"} = "Finnish";
$expandLang{"sv"} = "Swedish";
268
Harald Lüngenfe838e02024-09-25 09:01:00 +0300269
Harald Lüngendb5e6e72024-09-04 17:41:18 +0300270#------------------------------------------------------------------
Harald Lüngen8162ad52024-09-19 10:54:24 +0300271# read corpusHeaderSkeleton document and start a twig for it
Harald Lüngenb557de52024-11-20 16:24:42 +0200272# (since this file need not be streamed, no handlers are needed)
Harald Lüngendb5e6e72024-09-04 17:41:18 +0300273#------------------------------------------------------------------
274
# Twig for the corpus header skeleton. Direct method-call syntax is used
# instead of the indirect object form "new XML::Twig(...)".
my $teiCorpusHeaderDocTwig = XML::Twig->new(
    keep_spaces     => 1,
    keep_atts_order => 1,
    comments        => 'drop',
);


$teiCorpusHeaderDocTwig->parsefile($corpheaderfile);
my $corpusHeader = $teiCorpusHeaderDocTwig->root;    # root element of the corpus header skeleton document


#------------------------------------------------------------------
# read textHeaderSkeleton document and start a twig for it
#------------------------------------------------------------------

my $teiTextHeaderDocTwig = XML::Twig->new(
    keep_spaces     => 1,
    keep_atts_order => 1,
    comments        => 'drop',
);

$teiTextHeaderDocTwig->parsefile($textheaderfile);
my $textHeader = $teiTextHeaderDocTwig->root;        # root element of the text header skeleton document
298
299
Harald Lüngen8162ad52024-09-19 10:54:24 +0300300#---------------------------------------------------------
301# define a subtree for idsDoc
302# for the time being it will only be used for the first
# idsDoc header, to be inserted in the root handler
304#---------------------------------------------------------
305
# Root of the idsDoc subtree and its header. Both elements are always
# created; they are only filled and linked together for I5 output.
my $idsDoc       = XML::Twig::Elt->new('idsDoc');
my $idsDocHeader = XML::Twig::Elt->new('idsHeader');

if ($TEIFORMAT eq "I5") {

    # all descendants of the idsHeader, addressed by their tag name
    my %part = map { ($_, XML::Twig::Elt->new($_)) } qw(
        fileDesc titleStmt d.title dokumentSigle
        publicationStmt distributor pubAddress availability pubDate
        sourceDesc biblStruct monogr h.title imprint
    );

    # attributes, set in this fixed order
    for my $att ([ 'version', "1.0" ], [ 'TEIform', "TEI.2" ]) {
        $idsDoc->set_att(@$att);
    }
    for my $att ([ 'version', "1.1" ], [ 'type', "document" ], [ 'pattern', "text" ], [ 'TEIform', "teiHeader" ]) {
        $idsDocHeader->set_att(@$att);
    }

    # the document sigle goes to the front of the titleStmt ...
    $part{'dokumentSigle'}->paste("first_child", $part{'titleStmt'});

    # ... everything else is appended to its parent, in document order
    my @appends = (
        [ 'd.title',         $part{'titleStmt'}       ],
        [ 'titleStmt',       $part{'fileDesc'}        ],
        [ 'fileDesc',        $idsDocHeader            ],
        [ 'publicationStmt', $part{'fileDesc'}        ],
        [ 'distributor',     $part{'publicationStmt'} ],
        [ 'pubAddress',      $part{'publicationStmt'} ],
        [ 'availability',    $part{'publicationStmt'} ],
        [ 'pubDate',         $part{'publicationStmt'} ],
        [ 'sourceDesc',      $part{'fileDesc'}        ],
        [ 'biblStruct',      $part{'sourceDesc'}      ],
        [ 'monogr',          $part{'biblStruct'}      ],
        [ 'h.title',         $part{'monogr'}          ],
        [ 'imprint',         $part{'monogr'}          ],
    );
    for my $append (@appends) {
        my ($tag, $parent) = @$append;
        $part{$tag}->paste("last_child", $parent);
    }

    $idsDocHeader->paste("last_child", $idsDoc);

    # this first idsDoc is hard-wired to January of the year in the file name
    $part{'dokumentSigle'}->set_text($corpusids{$fnsource} . $fnYY . "/JAN");
    $part{'d.title'}      ->set_text($srcfullnames{$fnsource} . ", January " . $fnyear);
}
Harald Lüngen2551a952024-09-15 08:08:35 +0300359
360
Harald Lüngen8162ad52024-09-19 10:54:24 +0300361
Harald Lüngendb5e6e72024-09-04 17:41:18 +0300362#----------------------------------
Harald Lüngen8162ad52024-09-19 10:54:24 +0300363# read the input VRT-XML document
Harald Lüngendb5e6e72024-09-04 17:41:18 +0300364#----------------------------------
365
# Open the input file and initialise the filehandle. The handle does not
# actually seem to be needed, as parsefile() (see below) is applied to the
# file name. Three-argument open keeps the user-supplied file name from being
# interpreted as part of the mode; the system error ($!) is reported as well.
open(my $IN, "<", $ARGV[0]) || die("$0: cannot open file for reading: $ARGV[0]: $!");
368
Harald Lüngen86cbd932024-09-10 15:52:18 +0300369
Harald Lüngendb5e6e72024-09-04 17:41:18 +0300370
Harald Lüngen9d4e0462024-08-23 09:34:22 +0300371#####################
372# M A I N
373#####################
374
Harald Lüngendb5e6e72024-09-04 17:41:18 +0300375#-------------------------------------------------------------------------------------------------------------
376# start twig for input and call start tag handler for root and twig handler for each <text> in the VRT
377#-------------------------------------------------------------------------------------------------------------
Harald Lüngen9d4e0462024-08-23 09:34:22 +0300378
Harald Lüngen9d4e0462024-08-23 09:34:22 +0300379
# Twig for the VRT-XML input. Direct method-call syntax is used instead of
# the indirect object form "new XML::Twig(...)".
$twig = XML::Twig->new(
    keep_spaces     => 1,         # this also keeps the whitespace at former element boundaries in the output
    keep_atts_order => 1,         # requires Tie::IxHash
    comments        => 'drop',

    # called for the start tag of the root element <texts>
    start_tag_handlers => {
        texts => sub { root(@_, $corpusHeader) },
    },

    # called for each completely parsed <text>; a copy of the text header is
    # passed because the header gets flushed together with $twig in the
    # <text> handler
    twig_handlers => {
        text => sub { text(@_, $textHeader->copy) },
    },

    output_encoding => $encoding,
);

$twig->parsefile($ARGV[0]);
397
Harald Lüngendb5e6e72024-09-04 17:41:18 +0300398
Harald Lüngen9d4e0462024-08-23 09:34:22 +0300399###########
400# END MAIN
401###########
402
403
404
405
406##############################
407# S U B R O U T I N E S
408##############################
409
# Start tag handler for the root element of the input.
# Renames the root to the corpus element of the chosen output format and
# inserts the corpus header into it.
#   $twig         - the twig of the input document
#   $root         - the root element
#   $corpusHeader - the corpus header element taken from the skeleton
sub root {
    my ($twig, $root, $corpusHeader) = @_;

    if ($TEIFORMAT ne "I5") {
        # TEI: <teiCorpus> in the TEI namespace
        $root->set_gi('teiCorpus');
        $root->set_att("xmlns", 'http://www.tei-c.org/ns/1.0');
    }
    else {
        # I5: <idsCorpus> plus the doctype declaration
        $twig->set_doctype($DTDDECL);    # the doctype could probably be set anywhere to the twig
        $root->set_gi('idsCorpus');
        $root->set_att('version', "1.0");
        $root->set_att('TEIform', "teiCorpus.2");
    }

    insertCorpusHeader($root, $corpusHeader);
}
427
428
Harald Lüngendb5e6e72024-09-04 17:41:18 +0300429
# Fills in the corpus header and pastes it as first child into the root.
# For I5 the prepared idsDoc subtree is pasted right after the header.
#   $root         - the root element of the output
#   $corpusHeader - the corpus header element taken from the skeleton
sub insertCorpusHeader{
    my ($root, $corpusHeader) = @_;

    my $isI5 = $TEIFORMAT eq "I5";

    # name of the attribute of <language> that takes the language code
    my $ident = $isI5 ? "id" : "ident";

    #-----------------------
    # set corpus header
    #-----------------------

    set_title($corpusHeader, $srcfullnames{$fnsource}, $fnyear, $kielipankkiCorpus);

    if ($isI5) {
        my $sigleElt = $corpusHeader->get_xpath("fileDesc/titleStmt/korpusSigle", 0);
        $sigleElt->set_text($corpusids{$fnsource} . $fnYY);

        my $pubDateElt = $corpusHeader->get_xpath("fileDesc/publicationStmt/pubDate", 0);
        $pubDateElt->set_text((localtime)[5] + 1900);    # current year

        my $transductionElt = $corpusHeader->get_xpath('encodingDesc/editorialDecl/transduction[@n="2"]', 0);
        $transductionElt->set_text("I5 version by EuReCo using vrt2tei.pl " . localtime->ymd('-'));

        set_sourceDescI5($corpusHeader);
    }
    elsif ($TEIFORMAT eq "TEI") {
        set_sourceDesc($corpusHeader, $srcfullnames{$fnsource}, $fnyear, $kielipankkiCorpus);
    }
    else {
        print STDERR "TEIFORMAT is $TEIFORMAT; must be TEI or I5";
    }

    $corpusHeader->paste("first_child", $root);

    # language code as attribute, language name as text
    my $languageElt = $corpusHeader->get_xpath('./profileDesc/langUsage/language', 0);
    $languageElt->set_att($ident, $srctextlangs{$fnsource});
    $languageElt->set_text($expandLang{$srctextlangs{$fnsource}});

    if ($isI5) {
        $idsDoc->paste("after", $corpusHeader);
    }
}
463
Harald Lüngendb5e6e72024-09-04 17:41:18 +0300464
Harald Lüngen9d4e0462024-08-23 09:34:22 +0300465#----------------------------
466# handler &text for <text>
467#----------------------------
468
# Twig handler for each <text> of the input.
# Turns the <text> into a <TEI> (or <idsText> for I5) element consisting of a
# header and <text>/<body>/<div>, converts its paragraphs, sentences and token
# lines, and flushes the result to standard output.
#   $twig       - the twig of the input document
#   $text       - the <text> element
#   $textHeader - a fresh copy of the text header skeleton
sub text {
    my ($twig, $text, $textHeader) = @_;

    $textcounter++;


    # ToDo: catch all other, unexpected children of root

    #--------------------------------------------------------------------------
    # Get text metadata (attributes of <text>) and create teiHeader for <text>
    #--------------------------------------------------------------------------

    my $textattsref = $text->atts();    # $textattsref is now a reference to a hash and should be used with '->'

    createIdsDoc($textattsref);         # this creation of idsDoc will only be called for the 2nd idsDoc (i.e. february) or higher

    # createTextHeader() returns the $textID:
    my $textID = createTextHeader($text, $textattsref, $textHeader);


    #----------------------------------------
    # create <TEI> or <idsText> from <text>
    #----------------------------------------

    # set vrt <text> to <TEI> and delete all attributes after they were saved above
    $text->del_atts;

    if ($TEIFORMAT eq "TEI") {
        $text->set_gi("TEI");
        $text->set_att('xml:id', $textID);
    }
    else {
        $text->set_gi("idsText");
        $text->set_att('version', "1.0");
#       $text->move("last_child", $idsDoc);    # does not work because apparently $idsDoc is not under $root at this point
    }


    #------------------------------------------------------------------
    # create the <tei:text>, <body>, <div> elements inside <TEI>
    #------------------------------------------------------------------

    my $ttext_element = XML::Twig::Elt->new('text');
    my $body_element  = XML::Twig::Elt->new('body');
    my $div_element   = XML::Twig::Elt->new('div');

    # set atts
    $div_element  ->set_att("type", "page");      # ToDo: this is specific to KLK
    $ttext_element->set_att("xml:lang", 'fi');    # as in ICC-NOR; NOTE(review): fixed value, independent of the language attribute of <text>

    # paste
    $ttext_element->paste('last_child', $text);
    $body_element ->paste('last_child', $ttext_element);
    $div_element  ->paste('last_child', $body_element);


    #-------------------------------
    # create <p> from <paragraph>
    #-------------------------------

    my @paragraphs = $text->children('paragraph');

    foreach my $paragraph (@paragraphs) {

        setP($paragraph);

        $paragraph->move('last_child', $div_element);

        #------------------------------
        # create <s> from <sentence>
        #------------------------------

        my @sentences = $paragraph->children('sentence');
        foreach my $sentence (@sentences) {

            setS($sentence);

            #--------------------------------------
            # create <w> (word) from each $line
            #--------------------------------------

            # the content of a sentence is split at line breaks, one line per
            # token; the sentence is then emptied and refilled with <w> elements
            my @lines = split(/\n+/, $sentence->xml_text);
            $sentence->set_text("\n");

            for my $line (@lines) {    # ToDo: check the order
                if ($line ne "") {
                    my $w_element = XML::Twig::Elt->new('w');
                    createW($w_element, $line);
                    $w_element->paste('last_child', $sentence);
                }
            } # end words
        } # end sentences
    } # end paragraphs

    # $twig->set_pretty_print( 'record');

    # flush() takes an optional filehandle followed by options, so standard
    # output is passed as a filehandle rather than as the path "/dev/stdout".
    $twig->flush(\*STDOUT);
}
Harald Lüngen9d4e0462024-08-23 09:34:22 +0300572
573sub createTextHeader{
Harald Lüngendb5e6e72024-09-04 17:41:18 +0300574 my ($text, $textattsref, $textHeader) = @_;
Harald Lüngen9d4e0462024-08-23 09:34:22 +0300575
576 # USE 01 binding_id="2246025"
577 # USE 02 date="2021-01-15"
578 # 03 datefrom="20210115"
579 # 04 dateto="20210115"
580 # 05 elec_date="_"
581 # 06 file=""
582 # USE 07 filename_metadata="finclarin_siirto_k2021/0039-5552/2021/alto/2246025_0039-5552_2021-01-15_SK0221_mets.xml"
583 # USE 08 filename_orig ="finclarin_siirto_k2021/0039-5552/2021/alto/2246025_0039-5552_2021-01-15_SK0221_page-2021011502210030301.xml
584 # USE 09 id="t-bcd0f3fa-bbd3dac4"
585 # 10 img_url=""
586 # USE 11 issue_date="15.01.2021"
587 # USE 12 issue_no="SK0221"
588 # USE 13 issue_title="Suomen Kuvalehti"
589 # USE 14 label="Suomen Kuvalehti no. SK0221 15.01.2021"
590 # USE 16 language="fi"
591 # USE 17 page_id="p1"
592 # USE 18 page_no="None"
593 # 19 part_name="_"
594 # 20 publ_id="0039-5552"
595 # 21 publ_part=""
596 # USE 22 publ_title="Suomen Kuvalehti"
597 # USE 23 publ_type="aikakausi"
598 # USE 24 sentcount="70"
599 # USE 25 sum_lang="|xxx:44|fin:23|eng:3|"
600 # 26 timefrom="000000"
601 # 27 timeto="235959"
602 # USE 28 tokencount="304"
603 # 29 version_added="KLK-fi-2021">
604
605
606 my $BID = $textattsref->{'binding_id'};
607 my $DATE = $textattsref->{'date'};
608 my $METAFILENAME = $textattsref->{'filename_metadata'};
609 my $ORIGFILENAME = $textattsref->{'filename_orig'};
610 my $ID = $textattsref->{'id'};
611 my $ISSUEDATE = $textattsref->{'issue_date'};
612 my $ISSUENO = $textattsref->{'issue_no'};
613 my $ISSUETITLE = $textattsref->{'issue_title'};
614 my $LABEL = $textattsref->{'label'};
615 my $LANGUAGE = $textattsref->{'language'};
616 my $PAGEID = $textattsref->{'page_id'};
617 my $PAGENO = $textattsref->{'page_no'};
618 my $PUBLTITLE = $textattsref->{'publ_title'};
619 my $PUBLTYPE = $textattsref->{'publ_type'};
620 my $SENTCOUNT = $textattsref->{'sentcount'};
621 my $SUMLANG = $textattsref->{'sum_lang'};
622 my $TOKENCOUNT = $textattsref->{'tokencount'};
623
Harald Lüngenba0354b2024-09-11 16:24:08 +0300624
Harald Lüngen9d4e0462024-08-23 09:34:22 +0300625 #-----------------------------
626 # Derived Metadata variables
627 #-----------------------------
628
629 my @datearray = split("-", $DATE);
630 my @langarray = split("|", $SUMLANG);
631 my @namearray = split(/[\.\/]/, $ORIGFILENAME); # use $namearray[4] as ID for the page
Harald Lüngendb5e6e72024-09-04 17:41:18 +0300632
Harald Lüngen86cbd932024-09-10 15:52:18 +0300633 #----------------------------------------------------
634 # create textSigle to be returned from this function
635 #----------------------------------------------------
636
637 # SUK21.JAN.00001
638
Harald Lüngen86cbd932024-09-10 15:52:18 +0300639 my $yy = substr($datearray[0], 2, 2); # substr EXPR,OFFSET,LENGTH
640 my $mm = $datearray[1]; # substr EXPR,OFFSET,LENGTH
641 my $MMM = $months{$mm};
642
Harald Lüngen8162ad52024-09-19 10:54:24 +0300643 my $CSIGLE = $corpusids{$fnsource} . $yy;
644
645 my $textID = $CSIGLE . "_" . $MMM . "." . sprintf("%05d", $doccounter{$mm}++);
Harald Lüngen2551a952024-09-15 08:08:35 +0300646 my $textSigle = $textID;
Harald Lüngen86cbd932024-09-10 15:52:18 +0300647
Harald Lüngen695ac1d2024-09-05 08:55:21 +0300648
Harald Lüngen9d4e0462024-08-23 09:34:22 +0300649 #-----------------------------------------------------------------------
Harald Lüngendb5e6e72024-09-04 17:41:18 +0300650 # CREATE text-teiHeader ACCORDING TO THE SKELETON in $textHeader
Harald Lüngen9d4e0462024-08-23 09:34:22 +0300651 #-----------------------------------------------------------------------
652
Harald Lüngen9d4e0462024-08-23 09:34:22 +0300653
Harald Lüngendb5e6e72024-09-04 17:41:18 +0300654 $textHeader->paste('first_child', $text);
655
656 #-----------------------------------------------
657 # <teiHeader>
Harald Lüngen695ac1d2024-09-05 08:55:21 +0300658 # <fileDesc n="EuReCo-KLK-FIN_[$ID]">
Harald Lüngendb5e6e72024-09-04 17:41:18 +0300659 # <titleStmt>
660 # <title>[$LABEL, page $PAGENO]</title>
661
Harald Lüngen8162ad52024-09-19 10:54:24 +0300662 $textHeader->first_child("fileDesc") -> set_att('n', "EuReCo-". $kielipankkiCorpus . "-" . $ID);
Harald Lüngen2551a952024-09-15 08:08:35 +0300663
664
665 #-----------------
666 # titleStmt
667 #----------------
Harald Lüngen695ac1d2024-09-05 08:55:21 +0300668
Harald Lüngen2551a952024-09-15 08:08:35 +0300669 my $title="title";
670 my $titleStmt = $textHeader->first_child("fileDesc")->first_child("titleStmt");
671
672 if($TEIFORMAT eq "I5"){
673 $title = "t.title";
674 $textSigle =~ s/_/\//g;
675 $titleStmt->first_child("textSigle")->set_text($textSigle);
676 };
677
678 $titleStmt->first_child($title)->set_text($LABEL . ", Text #" . $textcounter);
679
Harald Lüngen8162ad52024-09-19 10:54:24 +0300680 # Case KLK: PAGENO scheint meist "None" zu sein
Harald Lüngendb5e6e72024-09-04 17:41:18 +0300681
682 #-----------------------------------------------
683 # <fileDesc>
684 # <sourceDesc>
685 # <biblStruct>
686 # <analytic>
687 # <title type="main">[$LABEL, page $PAGENO]</title>
688 # <date>[$DATE]</date>
689 # <date type="year">TODO</date>
690 # <date type="month">TODO</date>
691 # <date type="day">TODO</date>
692 # <idno type="PAGEID">$PAGEID</idno>
693 # <idno type="BINDINGID">$BID</idno>
694 # <idno type="ID">$ID</idno>
695 # <idno type="KIELIPANKKI_METAFILENAME">$METAFILENAME</idno>
696 # <idno type="KIELIPANKKI_ORIGFILENAME">$ORIGFILENAME</idno>
697 # <textLang>$LANGUAGE</textLang>
Harald Lüngendb5e6e72024-09-04 17:41:18 +0300698
Harald Lüngen8162ad52024-09-19 10:54:24 +0300699
Harald Lüngendb5e6e72024-09-04 17:41:18 +0300700 my $analytic = $textHeader->get_xpath("./fileDesc/sourceDesc/biblStruct/analytic", 0);
Harald Lüngen2551a952024-09-15 08:08:35 +0300701 if($TEIFORMAT eq "I5"){$title="h.title"};
702
703
704 $analytic->first_child($title) ->set_text($LABEL . ", Text #" . $textcounter); # Case KLK; PAGENO scheint meist "None" zu sein
Harald Lüngen381c2a22024-09-17 09:06:39 +0300705 #$analytic->get_xpath('./idno[@type="PAGEID"]', 0) ->set_text($PAGEID);
706 #$analytic->get_xpath('./idno[@type="BINDINGID"]', 0) ->set_text($BID);
707 #$analytic->get_xpath('./idno[@type="ID"]', 0) ->set_text($ID);
708 #$analytic->get_xpath('./idno[@type="KIELIPANKKI_METAFILENAME"]', 0) ->set_text($METAFILENAME);
709 #$analytic->get_xpath('./idno[@type="KIELIPANKKI_ORIGFILENAME"]', 0) ->set_text($ORIGFILENAME);
Harald Lüngen2551a952024-09-15 08:08:35 +0300710 if($TEIFORMAT eq "TEI"){
711 $analytic->first_child('textLang') ->set_text($LANGUAGE);
712 }
Harald Lüngen9d4e0462024-08-23 09:34:22 +0300713
Harald Lüngendb5e6e72024-09-04 17:41:18 +0300714 # <monogr>
715 # <title>$PUBLTITLE</title>
716 # <imprint>
717 # <pubPlace>TODO</pubPlace>
Harald Lüngen695ac1d2024-09-05 08:55:21 +0300718 # <publisher>TODO</publisher>
Harald Lüngendb5e6e72024-09-04 17:41:18 +0300719 # </imprint>
720 # <biblScope unit="ISSUETITLE"/>
721 # <biblScope unit="ISSUENO"/>
722 # <biblScope unit="ISSUEDATE"/>
723 # <biblScope unit="pp">$PAGENO</biblScope>
Harald Lüngen9d4e0462024-08-23 09:34:22 +0300724
Harald Lüngendb5e6e72024-09-04 17:41:18 +0300725 my $monogr = $textHeader->get_xpath("./fileDesc/sourceDesc/biblStruct/monogr", 0);
Harald Lüngen9d4e0462024-08-23 09:34:22 +0300726
Harald Lüngen2551a952024-09-15 08:08:35 +0300727 $monogr->first_child($title) ->set_text($PUBLTITLE);
728 if($TEIFORMAT eq "TEI"){
729 $monogr->get_xpath('./imprint/date[@type="date"]', 0) ->set_text($DATE);
730 }
731 my $date = "date";
732 if($TEIFORMAT eq "I5"){$date="pubDate"};
733 $monogr->get_xpath('./imprint/' . $date . '[@type="year"]', 0) ->set_text($datearray[0]);
734 $monogr->get_xpath('./imprint/' . $date . '[@type="month"]', 0) ->set_text($datearray[1]);
735 $monogr->get_xpath('./imprint/' . $date . '[@type="day"]', 0) ->set_text($datearray[2]);
736 $monogr->first_child("imprint")->first_child("pubPlace") ->set_text($srcpubplaces{$PUBLTITLE}); # imprint is needed for tei validity
Harald Lüngen8162ad52024-09-19 10:54:24 +0300737 $monogr->first_child("imprint")->first_child("pubPlace") ->set_att('key', $CountryKey);
Harald Lüngen2551a952024-09-15 08:08:35 +0300738 $monogr->first_child("imprint")->first_child("publisher") ->set_text($srcpublishers{$PUBLTITLE}); # imprint is needed for tei validity
Harald Lüngen381c2a22024-09-17 09:06:39 +0300739 #$monogr->get_xpath('./biblScope[@unit="ISSUETITLE"]', 0) ->set_text($ISSUETITLE);
740 #$monogr->get_xpath('./biblScope[@unit="ISSUENO"]', 0) ->set_text($ISSUENO);
741 #$monogr->get_xpath('./biblScope[@unit="ISSUEDATE"]', 0) ->set_text($ISSUEDATE);
742 #$monogr->get_xpath('./biblScope[@unit="pp"]', 0) ->set_text($PAGENO); # Achtung - PAGENO scheint meist "None" zu sein
Harald Lüngen9d4e0462024-08-23 09:34:22 +0300743
Harald Lüngen2551a952024-09-15 08:08:35 +0300744 my $dateNice = $datearray[2] . "." . $datearray[1] . "." . $datearray[0];
745 my $dateBackwards = $datearray[0] . "." . $datearray[1] . "." . $datearray[2];
746
747 if($TEIFORMAT eq "I5"){
Harald Lüngen381c2a22024-09-17 09:06:39 +0300748 my $refCompleteText = $textSigle . " " . $PUBLTITLE . " no. " . $ISSUENO . ", [" . $PUBLTYPE . "], " . $dateNice;
Harald Lüngen2551a952024-09-15 08:08:35 +0300749 my $refShortText = $textSigle . " " . $PUBLTITLE . ", " . $dateNice;
750 $textHeader->get_xpath('./fileDesc/sourceDesc/reference[@type="complete"]', 0) -> set_text($refCompleteText);
751 $textHeader->get_xpath('./fileDesc/sourceDesc/reference[@type="short"]' , 0) -> set_text($refShortText);
752 }
753
754
Harald Lüngen695ac1d2024-09-05 08:55:21 +0300755 # <encodingDesc>
756 # <tagsDecl>
757 # <namespace name="http://www.tei-c.org/ns/1.0">
758 # <tagUsage gi="s" occurs="SENTCOUNT"/>
759 # <tagUsage gi="w" occurs="TOKENCOUNT"/>
Harald Lüngen9d4e0462024-08-23 09:34:22 +0300760
Harald Lüngen2551a952024-09-15 08:08:35 +0300761 my $namespacePath="./encodingDesc/tagsDecl/namespace/";
762 if($TEIFORMAT eq "I5"){$namespacePath="./encodingDesc/tagsDecl/"};
763
764 $textHeader->get_xpath($namespacePath . 'tagUsage[@gi="s"]', 0) -> set_att('occurs', $SENTCOUNT);
765 $textHeader->get_xpath($namespacePath . 'tagUsage[@gi="w"]', 0) -> set_att('occurs', $TOKENCOUNT);
766
Harald Lüngen695ac1d2024-09-05 08:55:21 +0300767 # <profileDesc>
768 # <langUsage>
769 # <language ident="fi" usage="|xxx:44|fin:23|eng:3|"/>
770 # </langUsage>
771 # <textClass>
772 # <classCode scheme="kielipankki_klk">$PUBLTYPE</classCode>
Harald Lüngen2551a952024-09-15 08:08:35 +0300773 # <classCode scheme="kielipankki_klk_mapped">$mapping{$PUBLTYPE}</classCode>
Harald Lüngen9d4e0462024-08-23 09:34:22 +0300774
Harald Lüngen2551a952024-09-15 08:08:35 +0300775 if($TEIFORMAT eq "I5"){
776 $textHeader->get_xpath('./profileDesc/creation/creatDate', 0) ->set_text($dateBackwards);
777 }
Harald Lüngen8162ad52024-09-19 10:54:24 +0300778 if($TEIFORMAT eq "TEI"){
779 $textHeader->get_xpath('./profileDesc/langUsage/language', 0) ->set_att('ident', $LANGUAGE);
780 $textHeader->get_xpath('./profileDesc/langUsage/language', 0) ->set_att('usage', $SUMLANG);
781 }
Harald Lüngen695ac1d2024-09-05 08:55:21 +0300782 # in @usage muss eigt. ein integer; am besten inhalt von SUMLANG aufdroeseln und mehrere <language> machen
Harald Lüngen9d4e0462024-08-23 09:34:22 +0300783
Harald Lüngen695ac1d2024-09-05 08:55:21 +0300784 $textHeader->get_xpath('./profileDesc/textClass/classCode[@scheme="kielipankki_klk"]', 0) ->set_text($PUBLTYPE);
Harald Lüngen2551a952024-09-15 08:08:35 +0300785 $textHeader->get_xpath('./profileDesc/textClass/classCode[@scheme="kielipankki_klk_mapped"]', 0)->set_text($mapping{$PUBLTYPE});
Harald Lüngen9d4e0462024-08-23 09:34:22 +0300786
Harald Lüngen695ac1d2024-09-05 08:55:21 +0300787 # <revisionDesc>
788 # <change when="TODO" who="HL">TEI version for EuReCo</change>
Harald Lüngen9d4e0462024-08-23 09:34:22 +0300789
Harald Lüngen695ac1d2024-09-05 08:55:21 +0300790 $textHeader->get_xpath('./revisionDesc/change', 0) ->set_att('when', localtime->ymd('-'));
Harald Lüngendb5e6e72024-09-04 17:41:18 +0300791
Harald Lüngen86cbd932024-09-10 15:52:18 +0300792 return $textID;
Harald Lüngendb5e6e72024-09-04 17:41:18 +0300793
Harald Lüngen86cbd932024-09-10 15:52:18 +0300794
Harald Lüngen695ac1d2024-09-05 08:55:21 +0300795 #-----------------------------------
Harald Lüngen9d4e0462024-08-23 09:34:22 +0300796 # END OF CREATING TEIHEADER
Harald Lüngen695ac1d2024-09-05 08:55:21 +0300797 #-----------------------------------
Harald Lüngen9d4e0462024-08-23 09:34:22 +0300798
799}
Harald Lüngen695ac1d2024-09-05 08:55:21 +0300800
# setP($paragraph)
#
# Converts a VRT <paragraph> element into a <p> element, in place.
#
#   $paragraph : the element to convert; it only needs the methods set_gi(),
#                att(), set_att() and del_att()
#                (presumably an XML::Twig element -- confirm against the caller).
#
# Example input:
#   <paragraph id="p-bcd0f3fa-bbd3dac4-815ead7a" sum_lang="|fin:1|">
#
# Attribute handling:
#   @sum_lang : moved into @xml:lang, prefixed with "x-" to mark a private value
#   @id       : removed, because this id is not unique
sub setP {
    my ($paragraph) = @_;

    $paragraph->set_gi('p');

    # Only write @xml:lang when there is a value to copy: a missing @sum_lang
    # would otherwise produce the meaningless value "x-" and an
    # "uninitialized value" warning.
    my $sum_lang = $paragraph->att('sum_lang');
    if (defined $sum_lang) {
        $paragraph->set_att('xml:lang', 'x-' . $sum_lang);
    }

    # $paragraph->change_att_name('id', 'xml:id') is not an option: the id is not unique.
    $paragraph->del_att('sum_lang', 'id');
}
# setS($sentence)
#
# Converts a VRT <sentence> element into an <s> element, in place.
#
#   $sentence : the element to convert; it only needs the methods set_gi(),
#               att(), set_att() and del_att()
#               (presumably an XML::Twig element -- confirm against the caller).
#
# Attributes of <sentence> and what happens to them:
#   @id="s-bcd0f3fa-bbd3dac4-f7429090" : removed, because it is not unique
#   @lang="fin"                        : moved into @xml:lang
#   @lang_conf="0.6734853"             : removed for the time being (ToDo: @cert ?)
sub setS {
    my ($sentence) = @_;

    $sentence->set_gi('s');

    # ToDo: convert the value / introduce a hash for lookup
    #       (input values: "fin", "xxx", ....)
    # Only write @xml:lang when @lang is present, so that no attribute with
    # an undefined value is created.
    my $lang = $sentence->att('lang');
    if (defined $lang) {
        $sentence->set_att('xml:lang', $lang);
    }

    # $sentence->change_att_name('id', 'xml:id') is not an option: the id is not unique.
    $sentence->del_att('id', 'lang', 'lang_conf');
}
834
# createW($w_element, $line)
#
# Fills a <w> element from one tab-separated VRT token line.
#
#   $w_element : the element that receives the text and the attributes
#   $line      : one token line; its columns are separated by tabs
#
# Reads the file-level settings $MASK and $TEIFORMAT.
#
# VRT word and positional attributes (columns) in corpus KLK:
#   USE [0]  word
#   USE [1]  ref        (id for reference of dephead)
#   USE [2]  lemma
#    ?  [3]  lemmacomp  (lemma with compound info - could go in @norm, as tag abuse?)
#   USE [4]  pos
#   USE [5]  msd
#   USE [6]  dephead
#   USE [7]  deprel
#       [8]  content    (ocr-process)
#       [9]  vpos       (ocr-process)
#       [10] ocr        (ocr-process)
#       [11] cc         (ocr-process)
#       [12] hyph       (ocr-process)
#       [13] style      (ocr-process)
#       [14] lex        (korp semantic disambiguation from Goeteborg)
sub createW {
    my ($w_element, $line) = @_;

    # Pick the columns that are actually used; column 3 (lemmacomp) is skipped.
    my @columns = split(/\t/, $line);
    my ($word, $ref, $lemma, $pos, $msd, $dephead, $deprel) = @columns[0, 1, 2, 4, 5, 6, 7];

    # Word string and lemma string depend on the $MASK flag: when masking,
    # every token whose pos is not "Punct" gets a random alphabetic string of
    # the same length as the word, and that same string is used as the lemma.
    my ($w_string, $l_string) = ($word, $lemma);
    if ($MASK && ($pos ne "Punct")) {
        $w_string = Data::Random::String->create_random_string(length => length($word), contains => 'alpha');
        $l_string = $w_string;
    }

    $w_element->set_text($w_string);

    # Attributes of <w>.
    $w_element->set_att("n", $ref);

    # No id is kept: an id assembled from file name, sentence id and ref
    # would not be unique either.
    $w_element->del_att("id");

    # Column 3 could go into @norm, but that would be tag abuse, so it is left out.
    $w_element->set_att(
        "lemma" => $l_string,
        "pos"   => $pos,
        "msd"   => $msd,
    );

    # Remove this condition once @head and @deprel are part of the official TEI.
    if ($TEIFORMAT eq "I5") {
        $w_element->set_att(
            "head"   => $dephead,
            "deprel" => $deprel,
        );
    }
}
892
Harald Lüngendb5e6e72024-09-04 17:41:18 +0300893
# set_title($corpusHeader, $source, $year, $kielipankkiCorpus)
#
# Writes the corpus title into the title statement of the corpus header.
#
#   $corpusHeader      : header element that has a fileDesc/titleStmt child path
#   $source            : source name used in the title
#   $year              : year used in the title
#   $kielipankkiCorpus : name of the corpus the data comes from
#
# The title element is <c.title> when $TEIFORMAT is "I5" and <title> otherwise.
#
# Resulting structure:
#   <teiHeader>
#     <fileDesc>
#       <titleStmt>
#         <title>[Aamulehti2021] from [klk-fi-v2-vrt for] EuReCo</title>
#       </titleStmt>
#       <!-- ... -->
#     </fileDesc>
#   </teiHeader>
sub set_title {
    my ($corpusHeader, $source, $year, $kielipankkiCorpus) = @_;

    my $title_gi   = ($TEIFORMAT eq "I5") ? "c.title" : "title";
    my $title_text = "$source $year, from $kielipankkiCorpus for EuReCo";

    my $title_stmt = $corpusHeader->first_child("fileDesc")->first_child("titleStmt");
    $title_stmt->first_child($title_gi)->set_text($title_text);
}
919
# set_sourceDesc($corpusHeader, $source, $year, $kielipankkiCorpus)
#
# Writes the bibliographic string into the <bibl> element of the corpus header.
#
#   $corpusHeader      : header element that has a fileDesc/sourceDesc/bibl child path
#   $source            : source name used in the string
#   $year              : year used in the string
#   $kielipankkiCorpus : name of the corpus the data comes from
#
# Resulting structure:
#   <teiHeader>
#     <fileDesc>
#       <!-- ... -->
#       <sourceDesc>
#         <bibl>[Aamulehti2021] from [klk-fi-v2-vrt]</bibl>
#       </sourceDesc>
#       <!-- ... -->
#     </fileDesc>
#   </teiHeader>
sub set_sourceDesc {
    my ($corpusHeader, $source, $year, $kielipankkiCorpus) = @_;

    my $bibl_text = "$source $year, from $kielipankkiCorpus for EuReCo";

    my $source_desc = $corpusHeader->first_child("fileDesc")->first_child("sourceDesc");
    $source_desc->first_child("bibl")->set_text($bibl_text);
}
939
# set_sourceDescI5($corpusHeader)
#
# Fills the <sourceDesc> of an I5 corpus header.
#
#   $corpusHeader : header element that has a fileDesc/sourceDesc child path with
#                   biblStruct/monogr (h.title, imprint/publisher, imprint/pubPlace)
#                   and a reference element
#
# All values come from file-level data: the publication title is looked up in
# %srcfullnames by $fnsource; place, publisher and corpus id are looked up by
# that title in %srcpubplaces, %srcpublishers and %corpusids; the year is
# $fnyear and the pubPlace key is $CountryKey.
#
# Elements that are filled:
#   <idsHeader>
#     <fileDesc>
#       <!-- ... -->
#       <sourceDesc>
#         <biblStruct>
#           <monogr>
#             <h.title type="main">[publication title]</h.title>
#             <imprint>
#               <publisher>[publisher]</publisher>
#               <pubPlace key="[$CountryKey]">[publication place]</pubPlace>
#             </imprint>
#           </monogr>
#         </biblStruct>
#         <reference type="super" assemblage="regular">[sigle] [title]; [place]: [publisher], [year]</reference>
#       </sourceDesc>
#       <!-- ... -->
#     </fileDesc>
#   </idsHeader>
sub set_sourceDescI5 {
    my ($corpusHeader) = @_;

    my $publ_title = $srcfullnames{$fnsource};
    my $publ_place = $srcpubplaces{$publ_title};
    my $publisher  = $srcpublishers{$publ_title};

    # The corpus sigle is the corpus id followed by the two characters of
    # $fnyear at offset 2 (the last two digits of a four-digit year).
    my $corpus_sigle = $corpusids{$publ_title} . substr($fnyear, 2, 2);

    my $source_desc = $corpusHeader->first_child("fileDesc")->first_child("sourceDesc");
    my $monogr      = $source_desc->first_child("biblStruct")->first_child("monogr");

    $monogr->first_child("h.title")->set_text($publ_title);

    my $imprint = $monogr->first_child("imprint");
    $imprint->first_child("publisher")->set_text($publisher);

    my $pub_place_elt = $imprint->first_child("pubPlace");
    $pub_place_elt->set_text($publ_place);
    $pub_place_elt->set_att('key', $CountryKey);

    my $reference_text = sprintf("%s %s; %s: %s, %s",
        $corpus_sigle, $publ_title, $publ_place, $publisher, $fnyear);
    $source_desc->first_child("reference")->set_text($reference_text);
}
981
982
983
# createIdsDoc($textattsref)
#
# Prints an <idsDoc> skeleton with its document header to STDOUT when a text
# belongs to the month that directly follows $LASTMONTH.
#
#   $textattsref : hash reference; the keys 'date' and 'publ_title' are read.
#                  The date is split on "-" into year (first part) and
#                  month (second part).
#
# Reads the file-level data %corpusids, %months, %monthnames,
# $kielipankkiLicense and $TEIFORMAT, and reads and increments $LASTMONTH.
#
# Nothing is printed for month 1; for the later months the skeleton is printed
# once, at the moment $LASTMONTH is advanced to that month. The skeleton is
# only built when $TEIFORMAT is "I5"; otherwise the printed string is empty.
sub createIdsDoc {
    my ($textattsref) = @_;

    my ($date, $publ_title) = @{$textattsref}{qw(date publ_title)};
    my ($year, $month)      = (split("-", $date))[0, 1];

    # Corpus sigle: corpus id plus the two characters of the year at offset 2.
    my $corpus_sigle = $corpusids{$publ_title} . substr($year, 2, 2);

    my $doc_id     = $months{$month};
    my $month_name = $monthnames{$month};

    my $ids_doc = "";
    if ($TEIFORMAT eq "I5") {
        # The template starts with an empty line.
        $ids_doc = <<"END_IDSDOC";

<idsDoc version="1.0" TEIform="TEI.2">
<idsHeader type="document" pattern="text" version="1.1" TEIform="teiHeader">
<fileDesc>
<titleStmt>
<dokumentSigle>$corpus_sigle/$doc_id</dokumentSigle>
<d.title>$publ_title, $month_name $year</d.title>
</titleStmt>
<publicationStmt>
<distributor/>
<pubAddress/>
<availability region="world">$kielipankkiLicense</availability>
<pubDate/>
</publicationStmt>
<sourceDesc>
<biblStruct>
<monogr>
<h.title/>
<imprint/>
</monogr>
</biblStruct>
</sourceDesc>
</fileDesc>
</idsHeader>
</idsDoc>
END_IDSDOC
    }

    # "+ 0" forces a numeric comparison, so a month like "02" is treated as 2.
    if ($month + 0 == $LASTMONTH + 1) {
        if ($month + 0 > 1) {
            printf("%s\n", $ids_doc);
        }
        $LASTMONTH++;
    }
}
1034
Harald Lüngendb5e6e72024-09-04 17:41:18 +03001035
1036
Harald Lüngen9d4e0462024-08-23 09:34:22 +03001037#################
1038## usage_message
1039#################
1040
1041
# usage_message()
#
# Prints the command line help to STDERR and then ends the program with a
# plain exit (exit status 0).
sub usage_message {
    my @help_lines = (
        "Usage: ./vrt2tei.pl [OPTIONS] <file.vrt.xml>\n",
        " <file.vrt.xml> is a VRT file as proper (well-formed) XML\n",
        " Options:\n",
        " -t (tei|i5) output format, default: tei\n",
        " -m mask primary data\n",
        "\n",
    );

    for my $help_line (@help_lines) {
        print STDERR $help_line;
    }

    exit;
}
1051
1052