blob: 93319a2576aacad155e5429d378b792f7b60a5cb [file] [log] [blame]
Harald Lüngenfe838e02024-09-25 09:01:00 +03001## #! /appl/soft/bio/bioperl/5.36.0/bin/perl
2## #! /usr/bin/perl -w
Harald Lüngen9d4e0462024-08-23 09:34:22 +03003
4
5###########################################################################################################################################################
6# vrt2tei.pl
7# eureco
8# leibniz-institut fuer deutsche sprache / csc finland esbo
9# august 2024
10#
11#
12# using XML::Twig , see http://www.xmltwig.org/, https://metacpan.org/pod/XML::Twig
13#
Harald Lüngenccd84902024-08-27 16:03:47 +030014# usage: see the usage function below
Harald Lüngencaab0802024-08-23 17:28:22 +030015# Usage: ./vrt2tei.pl [-h] [-m] [-s <arg>] [-t tei|i5] <vrtxmlfile.xml>   (exactly one file argument; output is flushed to /dev/stdout)
Harald Lüngen9d4e0462024-08-23 09:34:22 +030016# <vrtxmlfile>: xml-ised vrt file
17#
18#
19# TODO:
Harald Lüngen9d4e0462024-08-23 09:34:22 +030020
Harald Lüngenfe838e02024-09-25 09:01:00 +030021# 0 ZIPPEN mit tei2korapxml; zippen mit korAP indexing
Harald Lüngen9d4e0462024-08-23 09:34:22 +030022
Harald Lüngenfe838e02024-09-25 09:01:00 +030023# 1 bearbeitung von @head und @deprel in tei2korapxml durch Nils?
24# 2 threading on compute node and application on sub corpora of KLK
25# 2 build 30 billion corpus and index it
26# 3 Optionen
27# 3a parametrize deprel for I5 and if Nils is not ready yet
Harald Lüngen9d4e0462024-08-23 09:34:22 +030028
Harald Lüngenfe838e02024-09-25 09:01:00 +030029#
30
31# 5 abfangen von unerwarteten elementen dh andere als <sentence> und <paragraph>
32# 6 checks and balances, wort reihenfolge nochmal checken?
33# 7 Encode Kielipankki and National Library of Finland? in teiCorpus Header
34# 8 How to encode the CLARIN-RES better - more Info from the CMDI
35# 9 construct <idsDoc>s independent of the order of texts, probably with writing intermediate files to zip
36# 10 re-implementation of the gawk code in the perl script
37# 11 Swedish corpus
Harald Lüngen9d4e0462024-08-23 09:34:22 +030038
39#
40#
41############################################################################################################################################################
42
Harald Lüngen9d4e0462024-08-23 09:34:22 +030043use strict;
44use warnings;
Harald Lüngen381c2a22024-09-17 09:06:39 +030045#use diagnostics;
Harald Lüngen9d4e0462024-08-23 09:34:22 +030046
Harald Lüngen2551a952024-09-15 08:08:35 +030047use Getopt::Std;
Harald Lüngen9d4e0462024-08-23 09:34:22 +030048use XML::Twig;
49use XML::Generator ':pretty'; # apparently no effect when using flush();
50
51
52use locale; # diese drei Zeilen, damit \b im regex nicht Umlaute und ß matcht.
53use POSIX qw(locale_h); # to be able to use setlocale()
54#setlocale(LC_ALL,'de_DE');
55setlocale(LC_ALL, "fi_FI");
56use utf8;
57use open qw( :std :encoding(UTF-8) );
58
59use Time::Piece;
60use Tie::IxHash;
Harald Lüngen381c2a22024-09-17 09:06:39 +030061use Data::Random::String;
Harald Lüngendb5e6e72024-09-04 17:41:18 +030062
63
Harald Lüngen9d4e0462024-08-23 09:34:22 +030064
Harald Lüngenfe838e02024-09-25 09:01:00 +030065
66#-------------
67# get options
68#-------------
69
# Option variables filled in by Getopt::Std::getopts():
#   -h        display usage info and exit
#   -m        sets $MASK to 1
#   -s <arg>  takes an argument (not evaluated in this part of the script)
#   -t <arg>  output format: "tei" (default) or "i5", in any letter case
our ($opt_h, $opt_m, $opt_s, $opt_t);

# Read the switches and print usage info if some bad option was given.
# In the getopts() spec, switches followed by ':' take an argument;
# switches without ':' are boolean flags.
if (!getopts('hms:t:')) {
    usage_message();
    exit -1;
}


#--------------------
# check argument(s)
#--------------------

# Currently exactly one argument: the vrt-xml input file.
# NOTE(review): whether the script stops here depends on usage_message(),
# which is defined further down in the file - confirm that it exits.
unless ($ARGV[0]) { usage_message() }    # min: arg0, the input file
if ($ARGV[1])     { usage_message() }    # max: arg0, the input file


#------------------------------------------------------------
# initialize defaults for options
#------------------------------------------------------------
my $TEIFORMAT = "tei";
my $MASK      = 0;


#----------------------------------------------------------------------------------------------------------
# interpret the options and check whether their respective argument is meaningful (if applicable)
#----------------------------------------------------------------------------------------------------------

# option -h: display usage info and exit
if ($opt_h) {
    print STDERR usage_message();
    exit 0;
}

# option -t: requested output format
if (defined($opt_t)) {
    $TEIFORMAT = $opt_t;
}

# Accept exactly "tei" or "i5" (case-insensitive). The pattern is anchored so
# that values merely containing one of the two words are rejected.
if ($TEIFORMAT !~ /\A(?:tei|i5)\z/i) {
    print STDERR "Error: invalid arg for option -t\n";
    usage_message();
    exit -1;    # an invalid option argument is an error, so do not report success
}

# Normalise to the canonical spellings "TEI" / "I5" that the rest of the script
# compares against, whatever letter case was given on the command line.
$TEIFORMAT = ($TEIFORMAT =~ /\Atei\z/i) ? "TEI" : "I5";

# option -m
if ($opt_m) {
    $MASK = 1;
}
128
129
130#-----------------------------------------------
131# OTHER GLOBAL VARIABLES
132#-----------------------------------------------
Harald Lüngen9d4e0462024-08-23 09:34:22 +0300133
# This $encoding is used ONLY for the output (see the output_encoding option of the input twig).
my $encoding = "UTF-8";

# DOCTYPE declaration, used for I5 output
my $DTDDECL = 'idsCorpus PUBLIC "-//IDS//DTD IDS-I5 1.0//EN" "http://corpora.ids-mannheim.de/I5/DTD/i5.dtd"';

my $textcounter = 0;    # incremented once per <text> element in the text handler
my $LASTMONTH   = 0;

# Per-source metadata tables; they are filled from $sourcescsvfile further down.
our %corpusids     = ();
our %srcfullnames  = ();
our %srcpubplaces  = ();
our %srcpublishers = ();
our %srctexttypes  = ();
our %srctextlangs  = ();

# language code => language name
our %expandLang = ();

# One running document counter per month ("01" .. "12"), each starting at 1
# (by the month, as in DeReKo).
my %doccounter = map { ( sprintf("%02d", $_) => 1 ) } 1 .. 12;

# Global variables pertaining to the original corpus of *all* newspapers:
my $kielipankkiCorpus  = "klk-fi-v2-vrt";
my $kielipankkiLicense = "CLARIN-RES";
my $CountryKey         = "FI";

# Table with metadata about the different sources (newspapers)
my $sourcescsvfile = "sources_klk_fi_v2_2021_4eureco.csv";

# Corpus header and text header skeletons; the I5 output format has its own pair.
my ($corpheaderfile, $textheaderfile) =
    ($TEIFORMAT eq "I5")
    ? ("i5CorpusHeaderSkeleton.i5.xml",   "i5TextHeaderSkeleton.i5.xml")
    : ("teiCorpusHeaderSkeleton.tei.xml", "teiTextHeaderSkeleton.tei.xml");


# Placeholder; the twig for the input document is assigned in the main section.
my $twig = "";
Harald Lüngendb5e6e72024-09-04 17:41:18 +0300184
Harald Lüngenfe838e02024-09-25 09:01:00 +0300185# variables $fnsource and $fnyear derived from the filename
# The last path component of the input file is expected to look like
# <source><YYYY>.xml. $fnsource becomes <source>, $fnyear the four-digit year
# and $fnYY its last two digits.
my @array = split(/\//, $ARGV[0]);
my $l = scalar(@array);
my $fnsource = $array[-1];

# Only read the capture group when the substitution really matched; otherwise
# $1 would be undefined or left over from an earlier match, and every sigle
# built from $fnyear / $fnYY would be garbage.
my $fnyear;
if ($fnsource =~ s/([0-9][0-9][0-9][0-9])\.xml$//) {
    $fnyear = $1;
}
else {
    die("$0: cannot derive source and year from input file name (expected <source><YYYY>.xml): $ARGV[0]\n");
}
my $fnYY = substr($fnyear, 2, 2);
Harald Lüngen381c2a22024-09-17 09:06:39 +0300193
194
195# months
# Month lookup tables, both keyed by the two-digit month number "01" .. "12":
#   %months      three-letter upper-case abbreviation
#   %monthnames  full English month name
my (%months, %monthnames);
{
    my @abbreviations = qw(JAN FEB MAR APR MAY JUN JUL AUG SEP OCT NOV DEC);
    my @full_names    = qw(January February March April May June
                           July August September October November December);

    for my $index (0 .. 11) {
        my $mm = sprintf("%02d", $index + 1);
        $months{$mm}     = $abbreviations[$index];
        $monthnames{$mm} = $full_names[$index];
    }
}

# Finnish publication type => German text type label
my %mapping = (
    "aikakausi"   => "Zeitschrift",
    "sanomalehti" => "Zeitung",
);
229
230
231
Harald Lüngen381c2a22024-09-17 09:06:39 +0300232#-------------------------------------------------------------------------------------------
233# read source metadata file (prepared manually => ultimately read the info from CMDI File?)
234# and set variables
235#-------------------------------------------------------------------------------------------
236
# Read the tab-separated source table and fill the per-source metadata hashes.
# Three-argument open keeps the file name from being interpreted as a mode,
# and $! tells the user why opening failed.
open(my $SOURCES, "<", $sourcescsvfile) || die("$0: cannot open file for reading: $sourcescsvfile: $!");
while (my $fline = <$SOURCES>) {
    chomp($fline);

    # skip comment lines, empty lines and any line containing the
    # column-header marker TEICORPUSID
    next if $fline =~ /^\#/ || $fline =~ /^\s*$/ || $fline =~ /TEICORPUSID/;

    my @flarray = split(/\s*\t+\s*/, $fline);    # split each line into its columns

    # Register the row under both its full title (column 1) and its simple
    # title (column 2). Rows too short to have a title are not registered
    # under an undefined key.
    # (ToDo: these hashes could probably be conflated into an array of hashes or so)
    for my $title (grep { defined } @flarray[1, 2]) {
        $corpusids{$title}     = $flarray[0];
        $srcfullnames{$title}  = $flarray[1];
        $srcpubplaces{$title}  = $flarray[6];
        $srcpublishers{$title} = $flarray[7];
        $srctexttypes{$title}  = $flarray[4];
        $srctextlangs{$title}  = $flarray[5];
    }
}
close($SOURCES);

# language code => language name
$expandLang{"fi"} = "Finnish";
$expandLang{"sv"} = "Swedish";
265
Harald Lüngenfe838e02024-09-25 09:01:00 +0300266
Harald Lüngendb5e6e72024-09-04 17:41:18 +0300267#------------------------------------------------------------------
Harald Lüngen8162ad52024-09-19 10:54:24 +0300268# read corpusHeaderSkeleton document and start a twig for it
Harald Lüngendb5e6e72024-09-04 17:41:18 +0300269#------------------------------------------------------------------
270
# Parse the corpus header skeleton. Class->new(...) is used instead of the
# indirect object form "new Class(...)", which is ambiguous to parse and is
# disabled under "use v5.36".
my $teiCorpusHeaderDocTwig = XML::Twig->new(
    keep_spaces     => 1,
    keep_atts_order => 1,
    comments        => 'drop',
);

$teiCorpusHeaderDocTwig->parsefile($corpheaderfile);
my $corpusHeader = $teiCorpusHeaderDocTwig->root;    # root element of the corpus header skeleton document


#------------------------------------------------------------------
# read textHeaderSkeleton document and start a twig for it
#------------------------------------------------------------------

my $teiTextHeaderDocTwig = XML::Twig->new(
    keep_spaces     => 1,
    keep_atts_order => 1,
    comments        => 'drop',
);

$teiTextHeaderDocTwig->parsefile($textheaderfile);
my $textHeader = $teiTextHeaderDocTwig->root;    # root element of the text header skeleton document
294
295
Harald Lüngen8162ad52024-09-19 10:54:24 +0300296#---------------------------------------------------------
297# define a subtree for idsDoc
298# for the time being it will only be used for the first
299# idsDoc header, to be inserted in the root hander
300#---------------------------------------------------------
301
my $idsDoc       = XML::Twig::Elt->new('idsDoc');
my $idsDocHeader = XML::Twig::Elt->new('idsHeader');

# The document header is only built for I5 output. Resulting tree:
#   idsDoc
#     idsHeader
#       fileDesc
#         titleStmt        (dokumentSigle, d.title)
#         publicationStmt  (distributor, pubAddress, availability, pubDate)
#         sourceDesc
#           biblStruct
#             monogr       (h.title, imprint)
if ($TEIFORMAT eq "I5") {

    $idsDoc->set_att('version', "1.0");
    $idsDoc->set_att('TEIform', "TEI.2");

    $idsDocHeader->set_att('version', "1.1");
    $idsDocHeader->set_att('type',    "document");
    $idsDocHeader->set_att('pattern', "text");
    $idsDocHeader->set_att('TEIform', "teiHeader");

    # Elements created so far, by name; each new element is appended as the
    # last child of its (already created) parent, so the table below is in
    # document order.
    my %node = ('idsHeader' => $idsDocHeader);

    for my $spec (
        [ 'fileDesc',        'idsHeader'       ],
        [ 'titleStmt',       'fileDesc'        ],
        [ 'dokumentSigle',   'titleStmt'       ],
        [ 'd.title',         'titleStmt'       ],
        [ 'publicationStmt', 'fileDesc'        ],
        [ 'distributor',     'publicationStmt' ],
        [ 'pubAddress',      'publicationStmt' ],
        [ 'availability',    'publicationStmt' ],
        [ 'pubDate',         'publicationStmt' ],
        [ 'sourceDesc',      'fileDesc'        ],
        [ 'biblStruct',      'sourceDesc'      ],
        [ 'monogr',          'biblStruct'      ],
        [ 'h.title',         'monogr'          ],
        [ 'imprint',         'monogr'          ],
    ) {
        my ($name, $parent) = @{$spec};
        $node{$name} = XML::Twig::Elt->new($name);
        $node{$name}->paste("last_child", $node{$parent});
    }

    $idsDocHeader->paste("last_child", $idsDoc);

    # This first document header is the one for January of the file's year.
    $node{'dokumentSigle'}->set_text($corpusids{$fnsource} . $fnYY . "/JAN");
    $node{'d.title'}->set_text($srcfullnames{$fnsource} . ", January " . $fnyear);
}
Harald Lüngen2551a952024-09-15 08:08:35 +0300355
356
Harald Lüngen8162ad52024-09-19 10:54:24 +0300357
Harald Lüngendb5e6e72024-09-04 17:41:18 +0300358#----------------------------------
Harald Lüngen8162ad52024-09-19 10:54:24 +0300359# read the input VRT-XML document
Harald Lüngendb5e6e72024-09-04 17:41:18 +0300360#----------------------------------
361
# Open the input file up front so that an unreadable file is reported with a
# clear message. The file name comes from the command line, so the
# three-argument form of open is used: it never interprets characters in the
# name as an open mode. The handle itself is not read from in this section;
# parsefile() below is applied to the file name.
open(my $IN, "<", $ARGV[0]) || die("$0: cannot open file for reading: $ARGV[0]: $!");



#####################
# M A I N
#####################

#-------------------------------------------------------------------------------------------------------------
# start twig for input and call start tag handler for root and twig handler for each <text> in the VRT
#-------------------------------------------------------------------------------------------------------------

$twig = XML::Twig->new(
    keep_spaces     => 1,         # this also keeps whitespace at former element boundaries in the output
    keep_atts_order => 1,         # requires Tie::IxHash
    comments        => 'drop',

    # called when the start tag of the root element <texts> is seen
    start_tag_handlers => {
        texts => sub { root(@_, $corpusHeader) },
    },

    # called for every completely parsed <text>; it must receive a copy of the
    # text header skeleton because the header is flushed together with $twig
    # in the <text> handler
    twig_handlers => {
        text => sub { text(@_, $textHeader->copy) },
    },

    output_encoding => $encoding,
);

$twig->parsefile($ARGV[0]);
393
Harald Lüngendb5e6e72024-09-04 17:41:18 +0300394
Harald Lüngen9d4e0462024-08-23 09:34:22 +0300395###########
396# END MAIN
397###########
398
399
400
401
402##############################
403# S U B R O U T I N E S
404##############################
405
# Start tag handler for the root element of the input document.
# Renames the root to the corpus element of the selected output format,
# sets its attributes (and, for I5, the doctype of the twig) and inserts the
# corpus header.
#   $twig          the twig of the input document
#   $root          the root element
#   $corpusHeader  the corpus header element to be inserted
sub root {
    my ($twig, $root, $corpusHeader) = @_;

    if ($TEIFORMAT ne "I5") {
        $root->set_gi('teiCorpus');
        $root->set_att("xmlns", 'http://www.tei-c.org/ns/1.0');
    }
    else {
        $twig->set_doctype($DTDDECL);    # the doctype could probably be set on the twig anywhere
        $root->set_gi('idsCorpus');
        $root->set_att(
            'version' => "1.0",
            'TEIform' => "teiCorpus.2",
        );
    }

    insertCorpusHeader($root, $corpusHeader);
}
423
424
Harald Lüngendb5e6e72024-09-04 17:41:18 +0300425
# Fills in the corpus header and pastes it as first child of the root element.
# For I5 output, the prepared $idsDoc is additionally pasted right after the
# corpus header.
#   $root          root element of the output document
#   $corpusHeader  corpus header element taken from the skeleton
sub insertCorpusHeader {
    my ($root, $corpusHeader) = @_;

    my $sourceName = $srcfullnames{$fnsource};
    my $langCode   = $srctextlangs{$fnsource};

    # name of the attribute on <language> that carries the language code
    my $ident = ($TEIFORMAT eq "I5") ? "id" : "ident";

    #-----------------------
    # set corpus header
    #-----------------------

    set_title($corpusHeader, $sourceName, $fnyear, $kielipankkiCorpus);

    if ($TEIFORMAT eq "TEI") {
        set_sourceDesc($corpusHeader, $sourceName, $fnyear, $kielipankkiCorpus);
    }
    elsif ($TEIFORMAT eq "I5") {
        my $korpusSigle  = $corpusHeader->get_xpath("fileDesc/titleStmt/korpusSigle", 0);
        my $pubDate      = $corpusHeader->get_xpath("fileDesc/publicationStmt/pubDate", 0);
        my $transduction = $corpusHeader->get_xpath('encodingDesc/editorialDecl/transduction[@n="2"]', 0);

        $korpusSigle->set_text($corpusids{$fnsource} . $fnYY);
        $pubDate->set_text((localtime)[5] + 1900);    # current year
        $transduction->set_text("I5 version by EuReCo using vrt2tei.pl " . localtime->ymd('-'));

        set_sourceDescI5($corpusHeader);
    }
    else {
        print STDERR "TEIFORMAT is $TEIFORMAT; must be TEI or I5";
    }

    $corpusHeader->paste("first_child", $root);

    # language code as attribute, expanded language name as text
    my $language = $corpusHeader->get_xpath('./profileDesc/langUsage/language', 0);
    $language->set_att($ident, $langCode);
    $language->set_text($expandLang{$langCode});

    if ($TEIFORMAT eq "I5") {
        $idsDoc->paste("after", $corpusHeader);
    }
}
459
Harald Lüngendb5e6e72024-09-04 17:41:18 +0300460
Harald Lüngen9d4e0462024-08-23 09:34:22 +0300461#----------------------------
462# handler &text for <text>
463#----------------------------
464
# Twig handler for each <text> element of the VRT input.
# Creates the text header, turns <text> into <TEI> (or <idsText>), wraps its
# paragraphs in <text>/<body>/<div>, converts every line of every sentence
# into a <w> element and finally flushes the twig.
#   $twig        the twig of the input document
#   $text        the <text> element
#   $textHeader  the text header element to be filled in for this text
sub text {
    my ($twig, $text, $textHeader) = @_;

    $textcounter++;

    # ToDo: catch all other, unexpected children of root

    #--------------------------------------------------------------------------
    # Get text metadata (attributes of <text>) and create teiHeader for <text>
    #--------------------------------------------------------------------------

    # reference to the attribute hash of <text>; to be used with '->'
    my $textattsref = $text->atts();

    # this creation of idsDoc will only be called for the 2nd idsDoc (i.e. february) or higher
    createIdsDoc($textattsref);

    # createTextHeader returns the $textID
    my $textID = createTextHeader($text, $textattsref, $textHeader);

    #----------------------------------------
    # create <TEI> or <idsText> from <text>
    #----------------------------------------

    # the attributes were saved above, so they can all be deleted now
    $text->del_atts;

    if ($TEIFORMAT ne "TEI") {
        $text->set_gi("idsText");
        $text->set_att('version', "1.0");
        # moving $text under $idsDoc here does not work because apparently
        # $idsDoc is not under $root at this point
    }
    else {
        $text->set_gi("TEI");
        $text->set_att('xml:id', $textID);
    }

    #------------------------------------------------------------------
    # create the <tei:text>, <body>, <div> elements inside <TEI>
    #------------------------------------------------------------------

    my $div_element = XML::Twig::Elt->new('div');
    $div_element->set_att("type", "page");        # ToDo: this is specific to KLK

    my $body_element = XML::Twig::Elt->new('body');
    $div_element->paste('last_child', $body_element);

    my $ttext_element = XML::Twig::Elt->new('text');
    $ttext_element->set_att("xml:lang", 'fi');    # as in ICC-NOR
    $body_element->paste('last_child', $ttext_element);

    $ttext_element->paste('last_child', $text);

    #-------------------------------
    # create <p> from <paragraph>
    #-------------------------------

    for my $para ($text->children('paragraph')) {

        setP($para);
        $para->move('last_child', $div_element);

        #------------------------------
        # create <s> from <sentence>
        #------------------------------

        for my $sent ($para->children('sentence')) {

            setS($sent);

            #--------------------------------------
            # create <w> (word) from each line
            #--------------------------------------

            # take the lines of the sentence text, then replace that text by
            # the <w> elements built from them
            my @tokenLines = split(/\n+/, $sent->xml_text);
            $sent->set_text("\n");

            for my $tokenLine (@tokenLines) {    # Todo: check the order
                next if $tokenLine eq "";

                my $word = XML::Twig::Elt->new('w');
                createW($word, $tokenLine);
                $word->paste('last_child', $sent);
            }
        }
    }

    $twig->flush("/dev/stdout");
}
Harald Lüngen9d4e0462024-08-23 09:34:22 +0300568
569sub createTextHeader{
Harald Lüngendb5e6e72024-09-04 17:41:18 +0300570 my ($text, $textattsref, $textHeader) = @_;
Harald Lüngen9d4e0462024-08-23 09:34:22 +0300571
572 # USE 01 binding_id="2246025"
573 # USE 02 date="2021-01-15"
574 # 03 datefrom="20210115"
575 # 04 dateto="20210115"
576 # 05 elec_date="_"
577 # 06 file=""
578 # USE 07 filename_metadata="finclarin_siirto_k2021/0039-5552/2021/alto/2246025_0039-5552_2021-01-15_SK0221_mets.xml"
579 # USE 08 filename_orig ="finclarin_siirto_k2021/0039-5552/2021/alto/2246025_0039-5552_2021-01-15_SK0221_page-2021011502210030301.xml
580 # USE 09 id="t-bcd0f3fa-bbd3dac4"
581 # 10 img_url=""
582 # USE 11 issue_date="15.01.2021"
583 # USE 12 issue_no="SK0221"
584 # USE 13 issue_title="Suomen Kuvalehti"
585 # USE 14 label="Suomen Kuvalehti no. SK0221 15.01.2021"
586 # USE 16 language="fi"
587 # USE 17 page_id="p1"
588 # USE 18 page_no="None"
589 # 19 part_name="_"
590 # 20 publ_id="0039-5552"
591 # 21 publ_part=""
592 # USE 22 publ_title="Suomen Kuvalehti"
593 # USE 23 publ_type="aikakausi"
594 # USE 24 sentcount="70"
595 # USE 25 sum_lang="|xxx:44|fin:23|eng:3|"
596 # 26 timefrom="000000"
597 # 27 timeto="235959"
598 # USE 28 tokencount="304"
599 # 29 version_added="KLK-fi-2021">
600
601
602 my $BID = $textattsref->{'binding_id'};
603 my $DATE = $textattsref->{'date'};
604 my $METAFILENAME = $textattsref->{'filename_metadata'};
605 my $ORIGFILENAME = $textattsref->{'filename_orig'};
606 my $ID = $textattsref->{'id'};
607 my $ISSUEDATE = $textattsref->{'issue_date'};
608 my $ISSUENO = $textattsref->{'issue_no'};
609 my $ISSUETITLE = $textattsref->{'issue_title'};
610 my $LABEL = $textattsref->{'label'};
611 my $LANGUAGE = $textattsref->{'language'};
612 my $PAGEID = $textattsref->{'page_id'};
613 my $PAGENO = $textattsref->{'page_no'};
614 my $PUBLTITLE = $textattsref->{'publ_title'};
615 my $PUBLTYPE = $textattsref->{'publ_type'};
616 my $SENTCOUNT = $textattsref->{'sentcount'};
617 my $SUMLANG = $textattsref->{'sum_lang'};
618 my $TOKENCOUNT = $textattsref->{'tokencount'};
619
Harald Lüngenba0354b2024-09-11 16:24:08 +0300620
Harald Lüngen9d4e0462024-08-23 09:34:22 +0300621 #-----------------------------
622 # Derived Metadata variables
623 #-----------------------------
624
625 my @datearray = split("-", $DATE);
626 my @langarray = split("|", $SUMLANG);
627 my @namearray = split(/[\.\/]/, $ORIGFILENAME); # use $namearray[4] as ID for the page
Harald Lüngendb5e6e72024-09-04 17:41:18 +0300628
Harald Lüngen86cbd932024-09-10 15:52:18 +0300629 #----------------------------------------------------
630 # create textSigle to be returned from this function
631 #----------------------------------------------------
632
633 # SUK21.JAN.00001
634
Harald Lüngen86cbd932024-09-10 15:52:18 +0300635 my $yy = substr($datearray[0], 2, 2); # substr EXPR,OFFSET,LENGTH
636 my $mm = $datearray[1]; # substr EXPR,OFFSET,LENGTH
637 my $MMM = $months{$mm};
638
Harald Lüngen8162ad52024-09-19 10:54:24 +0300639 my $CSIGLE = $corpusids{$fnsource} . $yy;
640
641 my $textID = $CSIGLE . "_" . $MMM . "." . sprintf("%05d", $doccounter{$mm}++);
Harald Lüngen2551a952024-09-15 08:08:35 +0300642 my $textSigle = $textID;
Harald Lüngen86cbd932024-09-10 15:52:18 +0300643
Harald Lüngen695ac1d2024-09-05 08:55:21 +0300644
Harald Lüngen9d4e0462024-08-23 09:34:22 +0300645 #-----------------------------------------------------------------------
Harald Lüngendb5e6e72024-09-04 17:41:18 +0300646 # CREATE text-teiHeader ACCORDING TO THE SKELETON in $textHeader
Harald Lüngen9d4e0462024-08-23 09:34:22 +0300647 #-----------------------------------------------------------------------
648
Harald Lüngen9d4e0462024-08-23 09:34:22 +0300649
Harald Lüngendb5e6e72024-09-04 17:41:18 +0300650 $textHeader->paste('first_child', $text);
651
652 #-----------------------------------------------
653 # <teiHeader>
Harald Lüngen695ac1d2024-09-05 08:55:21 +0300654 # <fileDesc n="EuReCo-KLK-FIN_[$ID]">
Harald Lüngendb5e6e72024-09-04 17:41:18 +0300655 # <titleStmt>
656 # <title>[$LABEL, page $PAGENO]</title>
657
Harald Lüngen8162ad52024-09-19 10:54:24 +0300658 $textHeader->first_child("fileDesc") -> set_att('n', "EuReCo-". $kielipankkiCorpus . "-" . $ID);
Harald Lüngen2551a952024-09-15 08:08:35 +0300659
660
661 #-----------------
662 # titleStmt
663 #----------------
Harald Lüngen695ac1d2024-09-05 08:55:21 +0300664
Harald Lüngen2551a952024-09-15 08:08:35 +0300665 my $title="title";
666 my $titleStmt = $textHeader->first_child("fileDesc")->first_child("titleStmt");
667
668 if($TEIFORMAT eq "I5"){
669 $title = "t.title";
670 $textSigle =~ s/_/\//g;
671 $titleStmt->first_child("textSigle")->set_text($textSigle);
672 };
673
674 $titleStmt->first_child($title)->set_text($LABEL . ", Text #" . $textcounter);
675
Harald Lüngen8162ad52024-09-19 10:54:24 +0300676 # Case KLK: PAGENO scheint meist "None" zu sein
Harald Lüngendb5e6e72024-09-04 17:41:18 +0300677
678 #-----------------------------------------------
679 # <fileDesc>
680 # <sourceDesc>
681 # <biblStruct>
682 # <analytic>
683 # <title type="main">[$LABEL, page $PAGENO]</title>
684 # <date>[$DATE]</date>
685 # <date type="year">TODO</date>
686 # <date type="month">TODO</date>
687 # <date type="day">TODO</date>
688 # <idno type="PAGEID">$PAGEID</idno>
689 # <idno type="BINDINGID">$BID</idno>
690 # <idno type="ID">$ID</idno>
691 # <idno type="KIELIPANKKI_METAFILENAME">$METAFILENAME</idno>
692 # <idno type="KIELIPANKKI_ORIGFILENAME">$ORIGFILENAME</idno>
693 # <textLang>$LANGUAGE</textLang>
Harald Lüngendb5e6e72024-09-04 17:41:18 +0300694
Harald Lüngen8162ad52024-09-19 10:54:24 +0300695
Harald Lüngendb5e6e72024-09-04 17:41:18 +0300696 my $analytic = $textHeader->get_xpath("./fileDesc/sourceDesc/biblStruct/analytic", 0);
Harald Lüngen2551a952024-09-15 08:08:35 +0300697 if($TEIFORMAT eq "I5"){$title="h.title"};
698
699
700 $analytic->first_child($title) ->set_text($LABEL . ", Text #" . $textcounter); # Case KLK; PAGENO scheint meist "None" zu sein
Harald Lüngen381c2a22024-09-17 09:06:39 +0300701 #$analytic->get_xpath('./idno[@type="PAGEID"]', 0) ->set_text($PAGEID);
702 #$analytic->get_xpath('./idno[@type="BINDINGID"]', 0) ->set_text($BID);
703 #$analytic->get_xpath('./idno[@type="ID"]', 0) ->set_text($ID);
704 #$analytic->get_xpath('./idno[@type="KIELIPANKKI_METAFILENAME"]', 0) ->set_text($METAFILENAME);
705 #$analytic->get_xpath('./idno[@type="KIELIPANKKI_ORIGFILENAME"]', 0) ->set_text($ORIGFILENAME);
Harald Lüngen2551a952024-09-15 08:08:35 +0300706 if($TEIFORMAT eq "TEI"){
707 $analytic->first_child('textLang') ->set_text($LANGUAGE);
708 }
Harald Lüngen9d4e0462024-08-23 09:34:22 +0300709
Harald Lüngendb5e6e72024-09-04 17:41:18 +0300710 # <monogr>
711 # <title>$PUBLTITLE</title>
712 # <imprint>
713 # <pubPlace>TODO</pubPlace>
Harald Lüngen695ac1d2024-09-05 08:55:21 +0300714 # <publisher>TODO</publisher>
Harald Lüngendb5e6e72024-09-04 17:41:18 +0300715 # </imprint>
716 # <biblScope unit="ISSUETITLE"/>
717 # <biblScope unit="ISSUENO"/>
718 # <biblScope unit="ISSUEDATE"/>
719 # <biblScope unit="pp">$PAGENO</biblScope>
Harald Lüngen9d4e0462024-08-23 09:34:22 +0300720
Harald Lüngendb5e6e72024-09-04 17:41:18 +0300721 my $monogr = $textHeader->get_xpath("./fileDesc/sourceDesc/biblStruct/monogr", 0);
Harald Lüngen9d4e0462024-08-23 09:34:22 +0300722
Harald Lüngen2551a952024-09-15 08:08:35 +0300723 $monogr->first_child($title) ->set_text($PUBLTITLE);
724 if($TEIFORMAT eq "TEI"){
725 $monogr->get_xpath('./imprint/date[@type="date"]', 0) ->set_text($DATE);
726 }
727 my $date = "date";
728 if($TEIFORMAT eq "I5"){$date="pubDate"};
729 $monogr->get_xpath('./imprint/' . $date . '[@type="year"]', 0) ->set_text($datearray[0]);
730 $monogr->get_xpath('./imprint/' . $date . '[@type="month"]', 0) ->set_text($datearray[1]);
731 $monogr->get_xpath('./imprint/' . $date . '[@type="day"]', 0) ->set_text($datearray[2]);
732 $monogr->first_child("imprint")->first_child("pubPlace") ->set_text($srcpubplaces{$PUBLTITLE}); # imprint is needed for tei validity
Harald Lüngen8162ad52024-09-19 10:54:24 +0300733 $monogr->first_child("imprint")->first_child("pubPlace") ->set_att('key', $CountryKey);
Harald Lüngen2551a952024-09-15 08:08:35 +0300734 $monogr->first_child("imprint")->first_child("publisher") ->set_text($srcpublishers{$PUBLTITLE}); # imprint is needed for tei validity
Harald Lüngen381c2a22024-09-17 09:06:39 +0300735 #$monogr->get_xpath('./biblScope[@unit="ISSUETITLE"]', 0) ->set_text($ISSUETITLE);
736 #$monogr->get_xpath('./biblScope[@unit="ISSUENO"]', 0) ->set_text($ISSUENO);
737 #$monogr->get_xpath('./biblScope[@unit="ISSUEDATE"]', 0) ->set_text($ISSUEDATE);
738 #$monogr->get_xpath('./biblScope[@unit="pp"]', 0) ->set_text($PAGENO); # Achtung - PAGENO scheint meist "None" zu sein
Harald Lüngen9d4e0462024-08-23 09:34:22 +0300739
Harald Lüngen2551a952024-09-15 08:08:35 +0300740 my $dateNice = $datearray[2] . "." . $datearray[1] . "." . $datearray[0];
741 my $dateBackwards = $datearray[0] . "." . $datearray[1] . "." . $datearray[2];
742
743 if($TEIFORMAT eq "I5"){
Harald Lüngen381c2a22024-09-17 09:06:39 +0300744 my $refCompleteText = $textSigle . " " . $PUBLTITLE . " no. " . $ISSUENO . ", [" . $PUBLTYPE . "], " . $dateNice;
Harald Lüngen2551a952024-09-15 08:08:35 +0300745 my $refShortText = $textSigle . " " . $PUBLTITLE . ", " . $dateNice;
746 $textHeader->get_xpath('./fileDesc/sourceDesc/reference[@type="complete"]', 0) -> set_text($refCompleteText);
747 $textHeader->get_xpath('./fileDesc/sourceDesc/reference[@type="short"]' , 0) -> set_text($refShortText);
748 }
749
750
Harald Lüngen695ac1d2024-09-05 08:55:21 +0300751 # <encodingDesc>
752 # <tagsDecl>
753 # <namespace name="http://www.tei-c.org/ns/1.0">
754 # <tagUsage gi="s" occurs="SENTCOUNT"/>
755 # <tagUsage gi="w" occurs="TOKENCOUNT"/>
Harald Lüngen9d4e0462024-08-23 09:34:22 +0300756
Harald Lüngen2551a952024-09-15 08:08:35 +0300757 my $namespacePath="./encodingDesc/tagsDecl/namespace/";
758 if($TEIFORMAT eq "I5"){$namespacePath="./encodingDesc/tagsDecl/"};
759
760 $textHeader->get_xpath($namespacePath . 'tagUsage[@gi="s"]', 0) -> set_att('occurs', $SENTCOUNT);
761 $textHeader->get_xpath($namespacePath . 'tagUsage[@gi="w"]', 0) -> set_att('occurs', $TOKENCOUNT);
762
Harald Lüngen695ac1d2024-09-05 08:55:21 +0300763 # <profileDesc>
764 # <langUsage>
765 # <language ident="fi" usage="|xxx:44|fin:23|eng:3|"/>
766 # </langUsage>
767 # <textClass>
768 # <classCode scheme="kielipankki_klk">$PUBLTYPE</classCode>
Harald Lüngen2551a952024-09-15 08:08:35 +0300769 # <classCode scheme="kielipankki_klk_mapped">$mapping{$PUBLTYPE}</classCode>
Harald Lüngen9d4e0462024-08-23 09:34:22 +0300770
Harald Lüngen2551a952024-09-15 08:08:35 +0300771 if($TEIFORMAT eq "I5"){
772 $textHeader->get_xpath('./profileDesc/creation/creatDate', 0) ->set_text($dateBackwards);
773 }
Harald Lüngen8162ad52024-09-19 10:54:24 +0300774 if($TEIFORMAT eq "TEI"){
775 $textHeader->get_xpath('./profileDesc/langUsage/language', 0) ->set_att('ident', $LANGUAGE);
776 $textHeader->get_xpath('./profileDesc/langUsage/language', 0) ->set_att('usage', $SUMLANG);
777 }
Harald Lüngen695ac1d2024-09-05 08:55:21 +0300778 # in @usage muss eigt. ein integer; am besten inhalt von SUMLANG aufdroeseln und mehrere <language> machen
Harald Lüngen9d4e0462024-08-23 09:34:22 +0300779
Harald Lüngen695ac1d2024-09-05 08:55:21 +0300780 $textHeader->get_xpath('./profileDesc/textClass/classCode[@scheme="kielipankki_klk"]', 0) ->set_text($PUBLTYPE);
Harald Lüngen2551a952024-09-15 08:08:35 +0300781 $textHeader->get_xpath('./profileDesc/textClass/classCode[@scheme="kielipankki_klk_mapped"]', 0)->set_text($mapping{$PUBLTYPE});
Harald Lüngen9d4e0462024-08-23 09:34:22 +0300782
Harald Lüngen695ac1d2024-09-05 08:55:21 +0300783 # <revisionDesc>
784 # <change when="TODO" who="HL">TEI version for EuReCo</change>
Harald Lüngen9d4e0462024-08-23 09:34:22 +0300785
Harald Lüngen695ac1d2024-09-05 08:55:21 +0300786 $textHeader->get_xpath('./revisionDesc/change', 0) ->set_att('when', localtime->ymd('-'));
Harald Lüngendb5e6e72024-09-04 17:41:18 +0300787
Harald Lüngen86cbd932024-09-10 15:52:18 +0300788 return $textID;
Harald Lüngendb5e6e72024-09-04 17:41:18 +0300789
Harald Lüngen86cbd932024-09-10 15:52:18 +0300790
Harald Lüngen695ac1d2024-09-05 08:55:21 +0300791 #-----------------------------------
Harald Lüngen9d4e0462024-08-23 09:34:22 +0300792 # END OF CREATING TEIHEADER
Harald Lüngen695ac1d2024-09-05 08:55:21 +0300793 #-----------------------------------
Harald Lüngen9d4e0462024-08-23 09:34:22 +0300794
795}
Harald Lüngen695ac1d2024-09-05 08:55:21 +0300796
# setP: turn a VRT <paragraph> element into a TEI <p> element, in place.
#
# Parameter:
#   $paragraph - element object offering set_gi/att/set_att/del_att
#                (an XML::Twig element in this script)
#
# Example input:
#   <paragraph id="p-bcd0f3fa-bbd3dac4-815ead7a" sum_lang="|fin:1|">
#
# Attribute handling:
#   @id        dropped, because it is not unique
#   @sum_lang  moved to @xml:lang, prefixed with "x-" to mark a private value
#
# If @sum_lang is missing, no @xml:lang is written at all (the previous
# behaviour was to emit a bare, meaningless "x-").
sub setP {
    my ($paragraph) = @_;

    $paragraph->set_gi('p');

    my $sum_lang = $paragraph->att("sum_lang");
    if (defined $sum_lang) {
        $paragraph->set_att("xml:lang", "x-" . $sum_lang);
    }
    $paragraph->del_att("sum_lang");

    # $paragraph->change_att_name('id', 'xml:id');
    $paragraph->del_att("id");    # this id is not unique either
}
# setS: turn a VRT <sentence> element into a TEI <s> element, in place.
#
# Parameter:
#   $sentence - element object offering set_gi/att/set_att/del_att
#               (an XML::Twig element in this script)
#
# Attributes of <sentence> and what happens to them:
#   @id="s-bcd0f3fa-bbd3dac4-f7429090"  dropped (not unique)
#   @lang="fin"                         moved to @xml:lang
#   @lang_conf="0.6734853"              dropped for the time being (ToDo: @cert?)
#
# If @lang is missing, no @xml:lang is written (previously an undefined
# value was stored as the attribute value).
sub setS {
    my ($sentence) = @_;

    $sentence->set_gi('s');

    # ToDo: convert the value / introduce a hash for lookup
    #       (input values: "fin", "xxx", ...)
    my $lang = $sentence->att("lang");
    if (defined $lang) {
        $sentence->set_att("xml:lang", $lang);
    }

    # $sentence->change_att_name('id', 'xml:id');   # not unique
    $sentence->del_att('id');
    $sentence->del_att("lang");         # replaced by xml:lang
    $sentence->del_att("lang_conf");    # for the time being
}
830
# createW: fill a <w> element from one VRT token line.
#
# Parameters:
#   $w_element - the element to fill (needs set_text/set_att/del_att)
#   $line      - one VRT token line: tab-separated positional attributes
#
# Reads the file-level flags $MASK and $TEIFORMAT.
#
# VRT word and positional attributes in corpus KLK:
#   USE  [0]  word
#   USE  [1]  ref        (id for reference of dephead)
#   USE  [2]  lemma
#   ?    [3]  lemmacomp  (lemma with compound info - could go in @norm, as tag abuse?)
#   USE  [4]  pos
#   USE  [5]  msd
#   USE  [6]  dephead
#   USE  [7]  deprel
#        [8]  content    (ocr-process)
#        [9]  vpos       (ocr-process)
#        [10] ocr        (ocr-process)
#        [11] cc         (ocr-process)
#        [12] hyph       (ocr-process)
#        [13] style      (ocr-process)
#        [14] lex        (korp semantic disambiguation from G"oteborg)
sub createW {
    my ($w_element, $line) = @_;

    # Split into columns and name the ones that are actually used.
    my @columns = split(/\t/, $line);
    my ($word, $ref, $lemma, $pos, $msd, $dephead, $deprel) = @columns[0, 1, 2, 4, 5, 6, 7];

    # Word and lemma string depend on the $MASK flag: when masking, every
    # token whose pos is not "Punct" is replaced by a random string of the
    # same length, and the lemma receives that same random string.
    my ($w_string, $l_string) = ($word, $lemma);
    if ($MASK && ($pos ne "Punct")) {
        $w_string = Data::Random::String->create_random_string(length => length($word), contains => 'alpha');
        $l_string = $w_string;
    }
    $w_element->set_text($w_string);

    # Attributes of <w>
    $w_element->set_att("n", $ref);

    # $w_element->set_att("id", "w_" . $namearray[4] . $sentence->att("xml:id") . "_" . $tags[1]);
    # an ID assembled this way is not unique either...
    $w_element->del_att("id");

    $w_element->set_att("lemma", $l_string);

    # $w_element->set_att("norm", $tags[3]);   # tag abuse of @norm
    $w_element->set_att("pos", $pos);
    $w_element->set_att("msd", $msd);

    # remove this condition when @head/@deprel are part of the official TEI
    if ($TEIFORMAT eq "I5") {
        $w_element->set_att("head",   $dephead);
        $w_element->set_att("deprel", $deprel);
    }
}
888
Harald Lüngendb5e6e72024-09-04 17:41:18 +0300889
# set_title: write the corpus title into the corpus header.
#
# Parameters:
#   $corpusHeader      - header element (needs first_child)
#   $source            - source name, e.g. "Aamulehti"
#   $year              - year, e.g. "2021"
#   $kielipankkiCorpus - name of the Kielipankki corpus, e.g. "klk-fi-v2-vrt"
#
# Target location:
#   <teiHeader>
#     <fileDesc>
#       <titleStmt>
#         <title>[Aamulehti] [2021], from [klk-fi-v2-vrt] for EuReCo</title>
#       </titleStmt>
#       <!-- ... -->
#     </fileDesc>
#   </teiHeader>
#
# When the file-level $TEIFORMAT is "I5", the element is <c.title> instead
# of <title>.
sub set_title {
    my ($corpusHeader, $source, $year, $kielipankkiCorpus) = @_;

    my $titleElement = ($TEIFORMAT eq "I5") ? "c.title" : "title";

    my $cTitleString = join("", $source, " ", $year, ", from ", $kielipankkiCorpus, " for EuReCo");

    my $titleStmt  = $corpusHeader->first_child("fileDesc")->first_child("titleStmt");
    my $cTitleNode = $titleStmt->first_child($titleElement);

    $cTitleNode->set_text($cTitleString);
}
915
# set_sourceDesc: write the bibliographic one-liner into the corpus header.
#
# Parameters:
#   $corpusHeader      - header element (needs first_child)
#   $source            - source name, e.g. "Aamulehti"
#   $year              - year, e.g. "2021"
#   $kielipankkiCorpus - name of the Kielipankki corpus, e.g. "klk-fi-v2-vrt"
#
# Target location:
#   <teiHeader>
#     <fileDesc>
#       <!-- ... -->
#       <sourceDesc>
#         <bibl>[Aamulehti] [2021], from [klk-fi-v2-vrt] for EuReCo</bibl>
#       </sourceDesc>
#       <!-- ... -->
#     </fileDesc>
#   </teiHeader>
sub set_sourceDesc {
    my ($corpusHeader, $source, $year, $kielipankkiCorpus) = @_;

    my $cBiblString = join("", $source, " ", $year, ", from ", $kielipankkiCorpus, " for EuReCo");

    my $sourceDesc = $corpusHeader->first_child("fileDesc")->first_child("sourceDesc");
    my $cBiblNode  = $sourceDesc->first_child("bibl");

    $cBiblNode->set_text($cBiblString);
}
935
# set_sourceDescI5: fill the <sourceDesc> of an I5 corpus header.
#
# Parameter:
#   $corpusHeader - header element (needs first_child)
#
# Reads the file-level variables $fnsource, $fnyear, $CountryKey and the
# lookup hashes %srcfullnames, %srcpubplaces, %srcpublishers, %corpusids.
#
# Target structure:
#   <idsHeader>
#     <fileDesc>
#       <!-- ... -->
#       <sourceDesc>
#         <biblStruct>
#           <monogr>
#             <h.title type="main">[$PUBLTITLE]</h.title>
#             <imprint>
#               <publisher>[$PUBLISHER]</publisher>
#               <pubPlace key="[$CountryKey]">[$PUBLPLACE]</pubPlace>
#             </imprint>
#           </monogr>
#         </biblStruct>
#         <reference type="super" assemblage="regular">[$CSIGLE] [$PUBLTITLE]; [$PUBLPLACE]: [$PUBLISHER], [$YEAR]</reference>
#       </sourceDesc>
#       <!-- ... -->
#     </fileDesc>
#   </idsHeader>
sub set_sourceDescI5 {
    my ($corpusHeader) = @_;

    my $PUBLTITLE = $srcfullnames{$fnsource};
    my $PUBLPLACE = $srcpubplaces{$PUBLTITLE};
    my $PUBLISHER = $srcpublishers{$PUBLTITLE};
    my $YEAR      = $fnyear;

    # corpus sigle = corpus id of the publication + two characters of the
    # year taken at offset 2
    my $CSIGLE = $corpusids{$PUBLTITLE} . substr($fnyear, 2, 2);

    my $sourceDesc = $corpusHeader->first_child("fileDesc")->first_child("sourceDesc");
    my $cMonogr    = $sourceDesc->first_child("biblStruct")->first_child("monogr");

    $cMonogr->first_child("h.title")->set_text($PUBLTITLE);

    my $imprint = $cMonogr->first_child("imprint");
    $imprint->first_child("publisher")->set_text($PUBLISHER);

    my $pubPlace = $imprint->first_child("pubPlace");
    $pubPlace->set_text($PUBLPLACE);
    $pubPlace->set_att('key', $CountryKey);

    my $referenceText = sprintf("%s %s; %s: %s, %s", $CSIGLE, $PUBLTITLE, $PUBLPLACE, $PUBLISHER, $YEAR);
    $sourceDesc->first_child("reference")->set_text($referenceText);
}
977
978
979
# createIdsDoc: print an <idsDoc> header skeleton when a new month starts.
#
# Parameter:
#   $textattsref - hash reference with the attributes of the current text;
#                  the keys 'date' (split on "-" into year, month, ...) and
#                  'publ_title' are read
#
# Reads the file-level variables $TEIFORMAT, $kielipankkiLicense and the
# lookup hashes %corpusids, %months, %monthnames; reads and increments
# $LASTMONTH.
#
# Output (STDOUT): only when the month of the text is exactly
# $LASTMONTH + 1. In that case $LASTMONTH is incremented, and if the month
# is greater than 1 the skeleton is printed followed by a newline. Outside
# of I5 mode the skeleton is the empty string, so just the newline is
# printed.
#
# Original author's note: will only be called for the second idsDoc
# (i.e. for February) and higher.
#
# NOTE(review): $PUBLTITLE, $MONTHNAME and $kielipankkiLicense are
# interpolated into the XML text without escaping - confirm that these
# values can never contain "&" or "<".
sub createIdsDoc {
    my ($textattsref) = @_;

    my $DATE      = $textattsref->{'date'};
    my $PUBLTITLE = $textattsref->{'publ_title'};

    my @dateparts = split("-", $DATE);
    my $YEAR      = $dateparts[0];
    my $MONTH     = $dateparts[1];
    my $YY        = substr($YEAR, 2, 2);

    my $CSIGLE    = $corpusids{$PUBLTITLE} . $YY;
    my $DOCID     = $months{$MONTH};
    my $MONTHNAME = $monthnames{$MONTH};

    my $idsDocString = "";
    if ($TEIFORMAT eq "I5") {
        $idsDocString = <<"END_OF_IDSDOC";

<idsDoc version="1.0" TEIform="TEI.2">
<idsHeader type="document" pattern="text" version="1.1" TEIform="teiHeader">
  <fileDesc>
    <titleStmt>
      <dokumentSigle>$CSIGLE/$DOCID</dokumentSigle>
      <d.title>$PUBLTITLE, $MONTHNAME $YEAR</d.title>
    </titleStmt>
    <publicationStmt>
      <distributor/>
      <pubAddress/>
      <availability region="world">$kielipankkiLicense</availability>
      <pubDate/>
    </publicationStmt>
    <sourceDesc>
      <biblStruct>
        <monogr>
          <h.title/>
          <imprint/>
        </monogr>
      </biblStruct>
    </sourceDesc>
  </fileDesc>
</idsHeader>
</idsDoc>
END_OF_IDSDOC
    }

    # Nothing to do unless this text opens the month following $LASTMONTH.
    return unless $MONTH + 0 == $LASTMONTH + 1;

    if ($MONTH + 0 > 1) {
        printf("%s\n", $idsDocString);
    }
    $LASTMONTH++;
}
1030
Harald Lüngendb5e6e72024-09-04 17:41:18 +03001031
1032
Harald Lüngen9d4e0462024-08-23 09:34:22 +03001033#################
1034## usage_message
1035#################
1036
1037
# usage_message: print the command-line help to STDERR and terminate the
# program via exit (no explicit exit status is given).
sub usage_message {
    my @usage_lines = (
        "Usage: ./vrt2tei.pl [OPTIONS] <file.vrt.xml>\n",
        " <file.vrt.xml> is a VRT file as proper (well-formed) XML\n",
        " Options:\n",
        " -t (tei|i5) output format, default: tei\n",
        " -m mask primary data\n",
        "\n",
    );

    for my $usage_line (@usage_lines) {
        print STDERR $usage_line;
    }

    exit;
}
1047
1048