 # vrt2tei.pl - EuReCo/kielipankki4eureco - Gitiles

 ## #! /appl/soft/bio/bioperl/5.36.0/bin/perl
 ## #! /usr/bin/perl -w


 ###########################################################################################################################################################
 # vrt2tei.pl
 # eureco
 # leibniz-institut fuer deutsche sprache / csc finland esbo
 # august 2024
 #
 #
 # using XML::Twig , see http://www.xmltwig.org/, https://metacpan.org/pod/XML::Twig
 #
 # usage: see the usage function below
 # Usage:  ./vrt2tei.pl <vrtxmlfile.xml> <outfile>
 #         <vrtxmlfile>: xml-ised vrt file
 #
 #
 # TODO:

 # 0 ZIPPEN mit tei2korapxml; zippen mit korAP indexing

 # 2 threading on compute node and application on sub corpora of KLK
 # 2 build 30 billion corpus and index it
 # 3 Optionen
 # 3a parametrize deprel for I5 and if Nils is not ready yet

 #

 # 5  abfangen von unerwarteten elementen dh andere als <sentence> und <paragraph>
 # 6  checks and balances, wort reihenfolge nochmal checken?
 # 7  Encode Kielipankki and National Library of Finland? in teiCorpus Header
 # 8  How to encode the CLARIN-RES better - more Info from the CMDI
 # 9  construct <idsDoc>s independent of the order of texts, probably with writing intermediate files to zip
 # 10 re-implementation of the gawk code in the perl script
 # 11 Swedish corpus

 #
 #
 ############################################################################################################################################################

 use strict;
 use warnings;
 #use diagnostics;

 use Getopt::Std;
 use XML::Twig;
 use XML::Generator ':pretty';  # apparently no effect when using flush();


 use locale;                 # diese drei Zeilen, damit \b im regex nicht Umlaute und ß matcht.
 use POSIX qw(locale_h);     # to be able to use setlocale()
 #setlocale(LC_ALL,'de_DE');
 setlocale(LC_ALL, "fi_FI");
 use utf8;
 use open qw( :std :encoding(UTF-8) );

 use Time::Piece;
 use Tie::IxHash;
 use Data::Random::String;


 #-------------
 # get options
 #-------------

 # Package variables filled in by Getopt::Std::getopts(), one per switch letter.
 our $opt_h;
 our $opt_m;
 our $opt_s;
 our $opt_t;

 # Parse the switches: -h and -m are boolean flags, -s and -t (marked with ':'
 # in the spec) each take an argument.  When a bad option was given, getopts()
 # returns false: print the usage info and stop.
 unless (getopts('hms:t:')) {
     usage_message();
     exit -1;
 }


 #--------------------
 # check argument(s)
 #--------------------

 # currently one argument: the vrt-xml input file

 # After the switches exactly one argument is expected: the input file,
 # e.g. Suomen_Kuvalehti2021.xml.
 usage_message() if !$ARGV[0];   # the input file is missing
 usage_message() if  $ARGV[1];   # more than the single input file was given


 #------------------------------------------------------------
 # initialize defaults for options
 #------------------------------------------------------------
 my $TEIFORMAT = "tei";   # output flavour; normalised below to exactly "TEI" or "I5"
 my $MASK      = 0;       # set to 1 by option -m (see createW for its effect)


 #----------------------------------------------------------------------------------------------------------
 # interpret the options and check whether their respective argument is meaningful (if applicable)
 #----------------------------------------------------------------------------------------------------------


 # option -h: display usage info and exit
 if ($opt_h) {
     print STDERR usage_message();
     exit 0;
 }


 # option -t: requested output format
 if (defined($opt_t)) {
     $TEIFORMAT = $opt_t;
 }

 # Accept "tei" or "i5" in any letter case, but nothing else.  The pattern is
 # anchored so that values such as "xtei" or "i55" are rejected instead of
 # slipping through to code that only knows "TEI" and "I5".
 if ($TEIFORMAT !~ /\A(?:tei|i5)\z/i) {
     print STDERR "Error: invalid arg for option -t\n";
     usage_message();
     exit -1;   # non-zero, like the exit after a bad switch
 }

 # Normalise every accepted spelling (tei, Tei, TEI, i5, I5) to the two canonical
 # values that the rest of the script compares against.
 $TEIFORMAT = ($TEIFORMAT =~ /\Atei\z/i) ? "TEI" : "I5";

 # option -m
 if ($opt_m) {
     $MASK = 1;
 }


 #-----------------------------------------------
 # OTHER GLOBAL VARIABLES
 #-----------------------------------------------

 # this $encoding is ONLY for the output, see the twig construction for the input below
 my $encoding = "UTF-8";

 # doctype declaration set on the output twig by root() in I5 mode
 my $DTDDECL = 'idsCorpus PUBLIC "-//IDS//DTD IDS-I5 1.0//EN" "http://corpora.ids-mannheim.de/I5/DTD/i5.dtd"';     # for I5

 my $textcounter = 0;   # incremented by the <text> handler for every text
 my $LASTMONTH   = 0;

 # lookup tables filled from the sources csv file, keyed by the title of the source
 our (
     %corpusids,
     %srcpublids,
     %srcfullnames,
     %srcpubplaces,
     %srcpublishers,
     %srctexttypes,
     %srctextlangs,
     %expandLang,
 ) = ();

 # running number of the next text per month ("01" .. "12"), by the month as in dereko
 my %doccounter = map { ( sprintf("%02d", $_) => 1 ) } 1 .. 12;

 # global variables pertaining to the original corpus of *all* newspapers:
 my $kielipankkiCorpus  = "klk-fi-v2-vrt";
 my $kielipankkiLicense = "CLARIN-RES";
 my $CountryKey         = "FI";

 # Table with metadata about the different sources (newspapers)
 my $sourcescsvfile = "sources_klk_fi_v2_2021_4eureco.csv";

 # corpusheader and textheader skeletons, depending on the output format
 my ($corpheaderfile, $textheaderfile) =
     ($TEIFORMAT eq "I5")
         ? ("i5CorpusHeaderSkeleton.i5.xml",   "i5TextHeaderSkeleton.i5.xml")
         : ("teiCorpusHeaderSkeleton.tei.xml", "teiTextHeaderSkeleton.tei.xml");


 # placeholder; assigned the XML::Twig object for the input document in MAIN
 my $twig = "";

 # Variables $fnsource and $fnyear are derived from the name of the input file,
 # which has to look like <source><YYYY>.xml, e.g. Suomen_Kuvalehti2021.xml.
 my @array    = split(/\//, $ARGV[0]);   # path components of the input file
 my $l        = scalar(@array);
 my $fnsource = $array[$l-1];            # file name without directories
 my $fnyear;

 # Strip the trailing "<YYYY>.xml" and keep the year.  $1 is only read when the
 # substitution really matched: after a failed match it would still hold the
 # capture of an earlier, unrelated regex and silently yield a bogus year.
 if (defined($fnsource) && $fnsource =~ s/([0-9]{4})\.xml$//) {
     $fnyear = $1;
 }
 else {
     die("$0: cannot derive source and year from input file name '"
         . (defined($ARGV[0]) ? $ARGV[0] : "")
         . "' (expected <source><YYYY>.xml)\n");
 }

 my $fnYY = substr($fnyear, 2, 2);       # two-digit year, e.g. "21"


 # months
 # month number ("01" .. "12") => three-letter abbreviation
 my %months;
 @months{ map { sprintf("%02d", $_) } 1 .. 12 }
     = qw(JAN FEB MAR APR MAY JUN JUL AUG SEP OCT NOV DEC);

 # month number ("01" .. "12") => full English name
 my %monthnames;
 @monthnames{ map { sprintf("%02d", $_) } 1 .. 12 }
     = qw(January February March April May June
          July August September October November December);

 # publication type as found in the corpus => German text type
 my %mapping = (
     "aikakausi"   => "Zeitschrift",
     "sanomalehti" => "Zeitung",
 );


 #-------------------------------------------------------------------------------------------
 # read source metadata file (prepared manually => ultimately read the info from CMDI File?)
 # and set variables
 #-------------------------------------------------------------------------------------------

 # Three-argument open: the file name is never interpreted as a mode, and the
 # system error ($!) tells why opening failed.
 open(my $SOURCES, "<", $sourcescsvfile)
     or die("$0: cannot open file for reading: $sourcescsvfile: $!");

 while (my $fline = <$SOURCES>) {
     chomp($fline);

     # skip comment lines, empty lines and the first line (the column headers)
     next if $fline =~ /^\#/ || $fline =~ /^\s*$/ || $fline =~ /TEICORPUSID/;

     my @flarray = split(/\s*\t+\s*/, $fline);   # split each line into its tab-separated columns

     # (ToDo: the following hashes could probably be conflated into an array of hashes or so)

     # Every source is registered under two keys: its full title (column 4) and
     # its simple title (column 5).  A title column that is missing from a row
     # is skipped, so no entry is stored under an undefined key.
     for my $title (@flarray[4, 5]) {
         next unless defined($title);

         $corpusids{$title}     = $flarray[2];
         $srcpublids{$title}    = $flarray[3];
         $srcfullnames{$title}  = $flarray[4];
         $srcpubplaces{$title}  = $flarray[9];
         $srcpublishers{$title} = $flarray[10];
         $srctexttypes{$title}  = $flarray[7];
         $srctextlangs{$title}  = $flarray[8];
     }
 }
 close($SOURCES);

 # language code => English name of the language
 $expandLang{"fi"} = "Finnish";
 $expandLang{"sv"} = "Swedish";


 #------------------------------------------------------------------
 # read corpusHeaderSkeleton document and start a twig for it
 # (since this file need not be streamed, no handlers are needed)
 #------------------------------------------------------------------

 # Direct method call instead of the indirect object syntax "new XML::Twig(...)",
 # which perlobj discourages because of its parsing ambiguities.
 my $teiCorpusHeaderDocTwig = XML::Twig->new(
     keep_spaces     => 1,
     keep_atts_order => 1,
     comments        => 'drop',
     );


 $teiCorpusHeaderDocTwig->parsefile($corpheaderfile);
 my $corpusHeader = $teiCorpusHeaderDocTwig->root;                  # root element of the corpus header skeleton document


 #------------------------------------------------------------------
 # read textHeaderSkeleton document and start a twig for it
 #------------------------------------------------------------------

 my $teiTextHeaderDocTwig = XML::Twig->new(
     keep_spaces     => 1,
     keep_atts_order => 1,
     comments        => 'drop',
     );

 $teiTextHeaderDocTwig->parsefile($textheaderfile);
 my $textHeader = $teiTextHeaderDocTwig->root;                      # root element of the text header skeleton document; copied for each <text>


 #---------------------------------------------------------
 # define a subtree for idsDoc
 # for the time being it will only be used for the first
 # idsDoc header, to be inserted in the root handler
 #---------------------------------------------------------

 my $idsDoc       = XML::Twig::Elt->new('idsDoc');
 my $idsDocHeader = XML::Twig::Elt->new('idsHeader');

 if ($TEIFORMAT eq "I5") {

     # create the (still empty) elements of the document header
     my ($docFileDesc, $docTitleStmt, $dtitle, $docSigle,
         $docPublicationStmt, $docDistributor, $docPubAddress, $docAvailability, $docPubDate,
         $docSourceDesc, $docBiblStruct, $docMonogr, $docHTitle, $docImprint)
         = map { XML::Twig::Elt->new($_) }
           qw(fileDesc titleStmt d.title dokumentSigle
              publicationStmt distributor pubAddress availability pubDate
              sourceDesc biblStruct monogr h.title imprint);

     # attributes of <idsDoc> and <idsHeader>: [element, attribute name, value]
     my @attributes = (
         [ $idsDoc,       'version', "1.0"       ],
         [ $idsDoc,       'TEIform', "TEI.2"     ],
         [ $idsDocHeader, 'version', "1.1"       ],
         [ $idsDocHeader, 'type',    "document"  ],
         [ $idsDocHeader, 'pattern', "text"      ],
         [ $idsDocHeader, 'TEIform', "teiHeader" ],
     );
     for my $attribute (@attributes) {
         my ($element, $name, $value) = @$attribute;
         $element->set_att($name, $value);
     }

     # assemble the tree, one step at a time: [child, position, parent]
     my @pastes = (
         [ $docSigle,           "first_child", $docTitleStmt       ],
         [ $dtitle,             "last_child",  $docTitleStmt       ],
         [ $docTitleStmt,       "last_child",  $docFileDesc        ],
         [ $docFileDesc,        "last_child",  $idsDocHeader       ],
         [ $docPublicationStmt, "last_child",  $docFileDesc        ],
         [ $docDistributor,     "last_child",  $docPublicationStmt ],
         [ $docPubAddress,      "last_child",  $docPublicationStmt ],
         [ $docAvailability,    "last_child",  $docPublicationStmt ],
         [ $docPubDate,         "last_child",  $docPublicationStmt ],
         [ $docSourceDesc,      "last_child",  $docFileDesc        ],
         [ $docBiblStruct,      "last_child",  $docSourceDesc      ],
         [ $docMonogr,          "last_child",  $docBiblStruct      ],
         [ $docHTitle,          "last_child",  $docMonogr          ],
         [ $docImprint,         "last_child",  $docMonogr          ],
         [ $idsDocHeader,       "last_child",  $idsDoc             ],
     );
     for my $step (@pastes) {
         my ($child, $position, $parent) = @$step;
         $child->paste($position, $parent);
     }

     # sigle and title of this first document, which is the one for January
     $docSigle->set_text($corpusids{$fnsource} . $fnYY  . "/JAN");
     $dtitle->set_text($srcfullnames{$fnsource} . ", January " . $fnyear);
 }


 #----------------------------------
 # read the input VRT-XML document
 #----------------------------------

 # Open the input file and initialise the filehandle.  The handle itself does not
 # seem to be needed, as parsefile() (see below) is applied to the file name; the
 # open still fails early if the file cannot be read.  The three-argument form
 # opens exactly the given name (no mode or pipe characters are interpreted),
 # and $! reports why opening failed.
 open(my $IN, "<", $ARGV[0]) or die("$0: cannot open file for reading: $ARGV[0]: $!");


 #####################
 #     M A I N
 #####################

 #-------------------------------------------------------------------------------------------------------------
 # start twig for input and call start tag handler for root and twig handler for each <text> in the VRT
 #-------------------------------------------------------------------------------------------------------------


 # Direct method call instead of the indirect object syntax "new XML::Twig(...)".
 $twig = XML::Twig->new(
     keep_spaces     => 1,       # so whitespace at former element boundaries also ends up in the output
     keep_atts_order => 1,       # requires Tie::IxHash
     comments        => 'drop',

     # called at the start tag of the root element <texts>
     start_tag_handlers => {
         texts => sub { root(@_, $corpusHeader) },
     },

     # called for each completely parsed <text>; every text gets its own copy of
     # the header skeleton, because the header is flushed with $twig in the
     # <text> handler
     twig_handlers => {
         text => sub { text(@_, $textHeader->copy) },
     },

     output_encoding => $encoding,
     );

 $twig->parsefile($ARGV[0]);


 ###########
 # END MAIN
 ###########


 ##############################
 #   S U B R O U T I N E S
 ##############################

 # Start-tag handler for the root element: renames it according to the output
 # format, sets the format-specific attributes (and, for I5, the doctype) and
 # inserts the corpus header.
 #   $twig          the twig of the input document
 #   $root          the root element
 #   $corpusHeader  the corpus header read from the skeleton file
 sub root {
     my ($twig, $root, $corpusHeader) = @_;

     if ($TEIFORMAT ne "I5") {
         $root->set_gi('teiCorpus');
         $root->set_att("xmlns", 'http://www.tei-c.org/ns/1.0');
     }
     else {
         $twig->set_doctype($DTDDECL);   # the doctype could probably be set anywhere to the twig
         $root->set_gi('idsCorpus');
         $root->set_att('version', "1.0");
         $root->set_att('TEIform', "teiCorpus.2");
     }

     insertCorpusHeader($root, $corpusHeader);
 }


 # Fills in the corpus header (title, source description, language) and pastes
 # it as first child of the root element; in I5 mode the prepared <idsDoc> is
 # pasted right after the header.
 #   $root          the root element of the output
 #   $corpusHeader  the corpus header read from the skeleton file
 sub insertCorpusHeader{
     my ($root, $corpusHeader) = @_;

     # name of the attribute of <language> that takes the language code
     my $ident = "ident";

     #-----------------------
     # set corpus header
     #-----------------------

     set_title($corpusHeader, $srcfullnames{$fnsource}, $fnyear, $kielipankkiCorpus);

     if ($TEIFORMAT eq "TEI") {
         set_sourceDesc($corpusHeader, $srcfullnames{$fnsource}, $fnyear, $kielipankkiCorpus);
     }
     elsif ($TEIFORMAT eq "I5") {
         $ident = "id";

         my $korpusSigle  = $corpusHeader->get_xpath("fileDesc/titleStmt/korpusSigle", 0);
         $korpusSigle->set_text($corpusids{$fnsource} . $fnYY);

         my $pubDate      = $corpusHeader->get_xpath("fileDesc/publicationStmt/pubDate", 0);
         $pubDate->set_text((localtime)[5] + 1900);

         my $transduction = $corpusHeader->get_xpath('encodingDesc/editorialDecl/transduction[@n="2"]', 0);
         $transduction->set_text("I5 version by EuReCo using vrt2tei.pl " . localtime->ymd('-'));

         set_sourceDescI5($corpusHeader);
     }
     else {
         print STDERR "TEIFORMAT is $TEIFORMAT; must be TEI or I5";
     }

     $corpusHeader->paste("first_child", $root);

     # language of the source: code in the attribute, expanded name as text
     my $language = $corpusHeader->get_xpath('./profileDesc/langUsage/language', 0);
     $language->set_att($ident, $srctextlangs{$fnsource});
     $language->set_text($expandLang{$srctextlangs{$fnsource}});

     if ($TEIFORMAT eq "I5") {
         $idsDoc->paste("after", $corpusHeader);
     }
 }


 #----------------------------
 # handler &text for <text>
 #----------------------------

 # Twig handler for each <text> of the VRT input: converts the element into a
 # <TEI> (TEI mode) or <idsText> (I5 mode) element with its own header, turns
 # <paragraph>/<sentence> into <p>/<s>, turns every token line into a <w>
 # element, and flushes the result to standard output.
 #   $twig        the twig of the input document
 #   $text        the <text> element
 #   $textHeader  a fresh copy of the text header skeleton
 sub text {
     my ($twig, $text, $textHeader) = @_;

     $textcounter++;


     # ToDo: catch all other, unexpected children of root

     #--------------------------------------------------------------------------
     # Get text metadata (attributes of <text>) and create teiHeader for <text>
     #--------------------------------------------------------------------------

     my $textattsref = $text->atts();               # $textattsref is now a reference to a hash and should be used with '->'

     createIdsDoc($textattsref);                    # this creation of idsDoc will only be called for the 2nd idsDoc (i.e. february) or higher


     # createTextHeader returns the $textID:
     my $textID = createTextHeader($text, $textattsref, $textHeader);


     #----------------------------------------
     # create <TEI> or <idsText> from <text>
     #----------------------------------------

     # set vrt <text> to <TEI> and delete all attributes after they were saved above
     $text->del_atts;

     if ($TEIFORMAT eq "TEI") {
         $text->set_gi("TEI");
         $text->set_att('xml:id', $textID);
     }
     else {
         $text->set_gi("idsText");
         $text->set_att('version', "1.0");
 #       $text->move("last_child", $idsDoc);                          # does not work because apparently $idsDoc is not under $root at this point
     }


     #------------------------------------------------------------------
     # create the <tei:text>, <body>, <div> elements inside <TEI>
     #------------------------------------------------------------------

     my $ttext_element = XML::Twig::Elt->new('text');
     my $body_element  = XML::Twig::Elt->new('body');
     my $div_element   = XML::Twig::Elt->new('div');

     # set atts
     $div_element  ->set_att("type", "page");                          # ToDo: this is specific to KLK
     $ttext_element->set_att("xml:lang", 'fi');                        # as in ICC-NOR

     # paste
     $ttext_element->paste('last_child',  $text);
     $body_element ->paste('last_child',  $ttext_element);
     $div_element  ->paste('last_child',  $body_element);


     #-------------------------------
     # create <p> from <paragraph>
     #-------------------------------

     foreach my $paragraph ($text->children('paragraph')) {

         setP($paragraph);

         $paragraph->move('last_child', $div_element);

         #------------------------------
         # create <s> from <sentence>
         #------------------------------

         foreach my $sentence ($paragraph->children('sentence')) {

             setS($sentence);

             #--------------------------------------
             # create <w> (word) from each $line
             #--------------------------------------

             # Read the token lines with text(), i.e. NOT XML-escaped: set_text()
             # and set_att() in createW escape "&" and "<" again on output, so the
             # already escaped result of xml_text() would end up double-escaped
             # (e.g. "&amp;amp;").
             my @lines = split(/\n+/, $sentence->text);
             $sentence->set_text("\n");

             for my $line (@lines) {                                # Todo: check the order
                 next if $line eq "";

                 my $w_element = XML::Twig::Elt->new('w');
                 createW($w_element, $line);
                 $w_element->paste('last_child', $sentence);
             } # end words
         } # end sentences
     } # end paragraphs

     # $twig->set_pretty_print( 'record');
     # $twig->flush($OUT);

     # flush() takes an optional filehandle, not a file name: pass STDOUT itself
     # instead of the string "/dev/stdout".
     $twig->flush(\*STDOUT);
 }

 # Fills the text header skeleton with the metadata of one <text> and pastes it
 # as first child of that <text>.
 #   $text         the <text> element that receives the header
 #   $textattsref  reference to the hash of the attributes of <text>
 #   $textHeader   copy of the text header skeleton (modified in place)
 # Returns the text ID, e.g. SUK21_JAN.00001.
 # Side effect: increments the counter in %doccounter for the month of the text.
 sub createTextHeader{
     my ($text, $textattsref, $textHeader) = @_;

     # USE 01 binding_id="2246025"
     # USE 02 date="2021-01-15"
     #     03 datefrom="20210115"
     #     04 dateto="20210115"
     #     05 elec_date="_"
     #     06 file=""
     # USE 07 filename_metadata="finclarin_siirto_k2021/0039-5552/2021/alto/2246025_0039-5552_2021-01-15_SK0221_mets.xml"
     # USE 08 filename_orig    ="finclarin_siirto_k2021/0039-5552/2021/alto/2246025_0039-5552_2021-01-15_SK0221_page-2021011502210030301.xml
     # USE 09 id="t-bcd0f3fa-bbd3dac4"
     #     10 img_url=""
     # USE 11 issue_date="15.01.2021"
     # USE 12 issue_no="SK0221"
     # USE 13 issue_title="Suomen Kuvalehti"
     # USE 14 label="Suomen Kuvalehti no. SK0221 15.01.2021"
     # USE 16 language="fi"
     # USE 17 page_id="p1"
     # USE 18 page_no="None"
     #     19 part_name="_"
     #     20 publ_id="0039-5552"
     #     21 publ_part=""
     # USE 22 publ_title="Suomen Kuvalehti"
     # USE 23 publ_type="aikakausi"
     # USE 24 sentcount="70"
     # USE 25 sum_lang="|xxx:44|fin:23|eng:3|"
     #     26 timefrom="000000"
     #     27 timeto="235959"
     # USE 28 tokencount="304"
     #     29 version_added="KLK-fi-2021">

     # Some of these values are currently only needed by the disabled (commented
     # out) <idno> and <biblScope> lines further down.
     my $BID          = $textattsref->{'binding_id'};
     my $DATE         = $textattsref->{'date'};
     my $METAFILENAME = $textattsref->{'filename_metadata'};
     my $ORIGFILENAME = $textattsref->{'filename_orig'};
     my $ID           = $textattsref->{'id'};
     my $ISSUEDATE    = $textattsref->{'issue_date'};
     my $ISSUENO      = $textattsref->{'issue_no'};
     my $ISSUETITLE   = $textattsref->{'issue_title'};
     my $LABEL        = $textattsref->{'label'};
     my $LANGUAGE     = $textattsref->{'language'};
     my $PAGEID       = $textattsref->{'page_id'};
     my $PAGENO       = $textattsref->{'page_no'};
     my $PUBLTITLE    = $textattsref->{'publ_title'};
     my $PUBLTYPE     = $textattsref->{'publ_type'};
     my $SENTCOUNT    = $textattsref->{'sentcount'};
     my $SUMLANG      = $textattsref->{'sum_lang'};
     my $TOKENCOUNT   = $textattsref->{'tokencount'};


     #-----------------------------
     # Derived Metadata variables
     #-----------------------------

     my @datearray = split("-", $DATE);               # (year, month, day)

     # The former splits of $SUMLANG and $ORIGFILENAME were never used and have
     # been removed.  NOTE: to split $SUMLANG at its bars, use split(/\|/, ...);
     # a bare "|" is an empty alternation and splits into single characters.

     #----------------------------------------------------
     # create textSigle to be returned from this function
     #----------------------------------------------------

     # SUK21.JAN.00001

     my $yy       = substr($datearray[0], 2, 2);      # substr EXPR,OFFSET,LENGTH
     my $mm       = $datearray[1];
     my $MMM      = $months{$mm};

     my $CSIGLE = $corpusids{$fnsource} . $yy;

     # running number per month; the counter is incremented for the next text
     my $textID = $CSIGLE . "_" . $MMM . "." . sprintf("%05d", $doccounter{$mm}++);
     my $textSigle = $textID;


     #-----------------------------------------------------------------------
     # CREATE text-teiHeader ACCORDING TO THE SKELETON in $textHeader
     #-----------------------------------------------------------------------


     $textHeader->paste('first_child', $text);

     #-----------------------------------------------
     # <teiHeader>
     #   <fileDesc n="EuReCo-KLK-FIN_[$ID]">
     #     <titleStmt>
     #       <title>[$LABEL, page $PAGENO]</title>

     my $fileDesc = $textHeader->first_child("fileDesc");
     $fileDesc->set_att('n', "EuReCo-". $kielipankkiCorpus . "-" . $ID);


     #-----------------
     # titleStmt
     #----------------

     my $title="title";
     my $titleStmt = $fileDesc->first_child("titleStmt");

     if ($TEIFORMAT eq "I5") {
         $title = "t.title";
         $textSigle =~ s/_/\//g;                      # the I5 sigle uses "/" where the text ID has "_"
         $titleStmt->first_child("textSigle")->set_text($textSigle);
     }

     $titleStmt->first_child($title)->set_text($LABEL . ", Text #" . $textcounter);

     # Case KLK:  PAGENO mostly seems to be "None"

     #-----------------------------------------------
     # <fileDesc>
     #  <sourceDesc>
     #   <biblStruct>
     #      <analytic>
     #         <title type="main">[$LABEL, page $PAGENO]</title>
     #         <date>[$DATE]</date>
     #         <date type="year">TODO</date>
     #         <date type="month">TODO</date>
     #         <date type="day">TODO</date>
     #         <idno type="PAGEID">$PAGEID</idno>
     #         <idno type="BINDINGID">$BID</idno>
     #         <idno type="ID">$ID</idno>
     #         <idno type="KIELIPANKKI_METAFILENAME">$METAFILENAME</idno>
     #         <idno type="KIELIPANKKI_ORIGFILENAME">$ORIGFILENAME</idno>
     #         <textLang>$LANGUAGE</textLang>


     my $analytic  = $textHeader->get_xpath("./fileDesc/sourceDesc/biblStruct/analytic", 0);
     if ($TEIFORMAT eq "I5") { $title = "h.title"; }  # from here on the title elements are <h.title> in I5


     $analytic->first_child($title)                    ->set_text($LABEL . ", Text #" . $textcounter);   # Case KLK; PAGENO mostly seems to be "None"
     #$analytic->get_xpath('./idno[@type="PAGEID"]',       0)     ->set_text($PAGEID);
     #$analytic->get_xpath('./idno[@type="BINDINGID"]',    0)     ->set_text($BID);
     #$analytic->get_xpath('./idno[@type="ID"]',           0)     ->set_text($ID);
     #$analytic->get_xpath('./idno[@type="KIELIPANKKI_METAFILENAME"]', 0)     ->set_text($METAFILENAME);
     #$analytic->get_xpath('./idno[@type="KIELIPANKKI_ORIGFILENAME"]', 0)     ->set_text($ORIGFILENAME);
     if ($TEIFORMAT eq "TEI") {
         $analytic->first_child('textLang')                        ->set_text($LANGUAGE);
     }

     #  <monogr>
     #    <title>$PUBLTITLE</title>
     #    <imprint>
     #      <pubPlace>TODO</pubPlace>
     #      <publisher>TODO</publisher>
     #    </imprint>
     #    <biblScope unit="ISSUETITLE"/>
     #    <biblScope unit="ISSUENO"/>
     #    <biblScope unit="ISSUEDATE"/>
     #    <biblScope unit="pp">$PAGENO</biblScope>

     my $monogr  = $textHeader->get_xpath("./fileDesc/sourceDesc/biblStruct/monogr", 0);

     $monogr->first_child($title)                                    ->set_text($PUBLTITLE);
     if ($TEIFORMAT eq "TEI") {
         $monogr->get_xpath('./imprint/date[@type="date"]',   0)         ->set_text($DATE);
     }

     # the date elements in the imprint are <date> in TEI and <pubDate> in I5
     my $date = "date";
     if ($TEIFORMAT eq "I5") { $date = "pubDate"; }
     $monogr->get_xpath('./imprint/' . $date . '[@type="year"]',   0)         ->set_text($datearray[0]);
     $monogr->get_xpath('./imprint/' . $date . '[@type="month"]',  0)         ->set_text($datearray[1]);
     $monogr->get_xpath('./imprint/' . $date . '[@type="day"]',    0)         ->set_text($datearray[2]);

     # imprint is needed for tei validity
     my $imprint  = $monogr->first_child("imprint");
     my $pubPlace = $imprint->first_child("pubPlace");
     $pubPlace->set_text($srcpubplaces{$PUBLTITLE});
     $pubPlace->set_att('key', $CountryKey);
     $imprint->first_child("publisher")->set_text($srcpublishers{$PUBLTITLE});
     #$monogr->get_xpath('./biblScope[@unit="ISSUETITLE"]', 0)        ->set_text($ISSUETITLE);
     #$monogr->get_xpath('./biblScope[@unit="ISSUENO"]', 0)           ->set_text($ISSUENO);
     #$monogr->get_xpath('./biblScope[@unit="ISSUEDATE"]', 0)         ->set_text($ISSUEDATE);
     #$monogr->get_xpath('./biblScope[@unit="pp"]', 0)                ->set_text($PAGENO);        # Caution - PAGENO mostly seems to be "None"

     my $dateNice      = $datearray[2] . "." . $datearray[1] . "." . $datearray[0];   # DD.MM.YYYY
     my $dateBackwards = $datearray[0] . "." . $datearray[1] . "." . $datearray[2];   # YYYY.MM.DD

     if ($TEIFORMAT eq "I5") {
         my $refCompleteText = $textSigle . " " . $PUBLTITLE . " no. " . $ISSUENO . ", [" . $PUBLTYPE . "], " . $dateNice;
         my $refShortText    = $textSigle . " " . $PUBLTITLE .                                                     ", " . $dateNice;
         $textHeader->get_xpath('./fileDesc/sourceDesc/reference[@type="complete"]', 0) -> set_text($refCompleteText);
         $textHeader->get_xpath('./fileDesc/sourceDesc/reference[@type="short"]'   , 0) -> set_text($refShortText);
     }


     #  <encodingDesc>
     #    <tagsDecl>
     #      <namespace name="http://www.tei-c.org/ns/1.0">
     #        <tagUsage gi="s" occurs="SENTCOUNT"/>
     #        <tagUsage gi="w" occurs="TOKENCOUNT"/>

     # in I5 the <tagUsage> elements are looked up directly below <tagsDecl>
     my $namespacePath="./encodingDesc/tagsDecl/namespace/";
     if ($TEIFORMAT eq "I5") { $namespacePath = "./encodingDesc/tagsDecl/"; }

     $textHeader->get_xpath($namespacePath . 'tagUsage[@gi="s"]', 0)      -> set_att('occurs', $SENTCOUNT);
     $textHeader->get_xpath($namespacePath . 'tagUsage[@gi="w"]', 0)      -> set_att('occurs', $TOKENCOUNT);

     #  <profileDesc>
     #    <langUsage>
     #     <language ident="fi" usage="|xxx:44|fin:23|eng:3|"/>
     #   </langUsage>
     #    <textClass>
     #      <classCode scheme="kielipankki_klk">$PUBLTYPE</classCode>
     #      <classCode scheme="kielipankki_klk_mapped">$mapping{$PUBLTYPE}</classCode>

     if ($TEIFORMAT eq "I5") {
         $textHeader->get_xpath('./profileDesc/creation/creatDate', 0)                                   ->set_text($dateBackwards);
     }
     if ($TEIFORMAT eq "TEI") {
         my $language = $textHeader->get_xpath('./profileDesc/langUsage/language', 0);
         $language->set_att('ident', $LANGUAGE);
         $language->set_att('usage', $SUMLANG);
     }
     # @usage should actually hold an integer; best to break up the content of SUMLANG and create several <language> elements

     $textHeader->get_xpath('./profileDesc/textClass/classCode[@scheme="kielipankki_klk"]', 0)       ->set_text($PUBLTYPE);
     $textHeader->get_xpath('./profileDesc/textClass/classCode[@scheme="kielipankki_klk_mapped"]', 0)->set_text($mapping{$PUBLTYPE});

     #  <revisionDesc>
     #    <change when="TODO" who="HL">TEI version for EuReCo</change>

     $textHeader->get_xpath('./revisionDesc/change', 0)                                    ->set_att('when', localtime->ymd('-'));

     #-----------------------------------
     # END OF CREATING TEIHEADER
     #-----------------------------------

     return $textID;
 }

 # Turns a <paragraph> element into a <p> element.
 #   $paragraph  the element to convert (modified in place)
 sub setP {
     my ($paragraph) = @_;

     # <paragraph id="p-bcd0f3fa-bbd3dac4-815ead7a" sum_lang="|fin:1|">
     # atts of <paragraph>:
     #    @id        dropped, because this id is not unique either!
     #    @sum_lang  USE: put in xml:lang and prefix the value with "x-" for private value
     my $private_lang = "x-" . $paragraph->att("sum_lang");

     $paragraph->set_gi('p');
     $paragraph->set_att("xml:lang", $private_lang);

     # $paragraph->change_att_name('id', 'xml:id');
     for my $obsolete ("sum_lang", "id") {
         $paragraph->del_att($obsolete);
     }
 }
 # Turns a <sentence> element into an <s> element.
 #   $sentence  the element to convert (modified in place)
 sub setS {
     my ($sentence) = @_;

     # the atts of <sentence>:
     #    1 @id="s-bcd0f3fa-bbd3dac4-f7429090"   dropped, not unique
     #    2 @lang="fin"                          USE -> xml:lang
     #    3 @lang_conf="0.6734853">              dropped for the time being -> ToDo @cert ?
     my $lang = $sentence->att("lang");  # ToDo: convert the value / introduce a hash for lookup (input values: "fin", "xxx", ....)

     $sentence->set_gi('s');

     # set attrs of <s>
     $sentence->set_att("xml:lang", $lang);

     # $sentence->change_att_name('id', 'xml:id');
     for my $obsolete ('id', "lang", "lang_conf") {
         $sentence->del_att($obsolete);
     }
 }

 # Fills a <w> element with the word form and the annotations of one token line.
 #   $w_element  the (new) <w> element to fill
 #   $line       one line of the sentence: the tab-separated columns of a token
 sub createW {
     my ($w_element, $line) = @_;

     #---------------------------
     # Get the tags (=columns)
     #---------------------------

     # vrt word and positional-attributes in corpus KLK:
     #  USE [0] word
     #  USE [1] ref  (id for reference of dephead)
     #  USE [2] lemma
     #  ?   [3] lemmacomp   (lemma with compound info - could go in @norm, as tag abuse?)
     #  USE [4] pos
     #  USE [5] msd
     #  USE [6] dephead
     #  USE [7] deprel
     #      [8] content   (ocr-process)
     #      [9] vpos      (ocr-process)
     #     [10] ocr       (ocr-process)
     #     [11] cc        (ocr-process)
     #     [12] hyph      (ocr-process)
     #     [13] style     (ocr-process)
     #     [14] lex       (korp semantic disambiguation from G"oteborg)

     my @tags = split(/\t/, $line);
     my ($word, $ref, $lemma, $pos, $msd, $dephead, $deprel) = @tags[0, 1, 2, 4, 5, 6, 7];

     # word string and lemma string according to $MASK flag: when masking, every
     # token except punctuation gets a random alphabetic string of the same
     # length as the word, used for both the word and the lemma
     my ($w_string, $l_string) = ($word, $lemma);
     if ($MASK && ($pos ne "Punct")) {
         $w_string = Data::Random::String->create_random_string(length=>length($word), contains=>'alpha');
         $l_string = $w_string;
     }
     $w_element->set_text($w_string);

     # set the attributes of <w>:
     $w_element->set_att("n", $ref);

     # $w_element->set_att("id",     "w_" . $namearray[4] . $sentence->att("xml:id") . "_" . $tags[1]);
     # an ID put together like this is not unique either...
     $w_element->del_att("id");

     # $w_element->set_att("norm",   $tags[3]);  # tag abuse of @norm
     my @annotations = (
         [ "lemma", $l_string ],
         [ "pos",   $pos      ],
         [ "msd",   $msd      ],
     );
     if ($TEIFORMAT eq "I5") {                   # remove condition when part of the official TEI
         push(@annotations, [ "head", $dephead ], [ "deprel", $deprel ]);
     }
     for my $annotation (@annotations) {
         my ($name, $value) = @$annotation;
         $w_element->set_att($name, $value);
     }
 }


 # Sets the title of the corpus in the corpus header.
 #   $corpusHeader       the corpus header
 #   $source             name of the source
 #   $year               year of the corpus
 #   $kielipankkiCorpus  name of the original corpus
 sub set_title{
     my ($corpusHeader, $source, $year, $kielipankkiCorpus) = @_;

     #<teiHeader>
     #  <fileDesc>
     #    <titleStmt>
     #      <title>[Aamulehti2021] from [klk-fi-v2-vrt for] EuReCo</title>
     #    </titleStmt>
     #    <!-- ... -->
     #  </fileDesc>
     #</teiHeader>

     # the title element is <c.title> in I5 and <title> otherwise
     my $titleElement = ($TEIFORMAT eq "I5") ? "c.title" : "title";

     my $cTitleString = "$source $year, from $kielipankkiCorpus for EuReCo";

     my $fileDesc  = $corpusHeader->first_child("fileDesc");
     my $titleStmt = $fileDesc->first_child("titleStmt");

     $titleStmt->first_child($titleElement)->set_text($cTitleString);
 }

 # set_sourceDesc: write the bibliographic line of the header's source
 # description.
 #
 #   $corpusHeader       header element; first_child() is used to walk down to
 #                       fileDesc / sourceDesc / bibl
 #   $source, $year,
 #   $kielipankkiCorpus  the pieces the text is assembled from
 #
 # Resulting text of <bibl>:
 #   "<source> <year>, from <kielipankkiCorpus> for EuReCo"
 sub set_sourceDesc{
     my ($corpusHeader, $source, $year, $kielipankkiCorpus) = @_;

     my $bibl_text = "${source} ${year}, from ${kielipankkiCorpus} for EuReCo";

     my $source_desc = $corpusHeader->first_child("fileDesc")->first_child("sourceDesc");
     $source_desc->first_child("bibl")->set_text($bibl_text);
 }

 # set_sourceDescI5: fill the I5 source description of the corpus header.
 #
 #   $corpusHeader   header element; first_child() is used to reach the
 #                   elements listed below
 #
 # Values are taken from file-level data:
 #   publication title  = $srcfullnames{$fnsource}
 #   place of publ.     = $srcpubplaces{<title>}
 #   publisher          = $srcpublishers{<title>}
 #   year               = $fnyear
 #   corpus sigle       = $corpusids{<title>} followed by the two characters
 #                        of $fnyear starting at offset 2
 #
 # Elements written below fileDesc / sourceDesc:
 #   biblStruct/monogr/h.title              text: <title>
 #   biblStruct/monogr/imprint/publisher    text: <publisher>
 #   biblStruct/monogr/imprint/pubPlace     text: <place>, @key: $CountryKey
 #   reference                              text: "<sigle> <title>; <place>: <publisher>, <year>"
 sub set_sourceDescI5{
     my ($corpusHeader) = @_;

     my $publ_title = $srcfullnames{$fnsource};
     my $publ_place = $srcpubplaces{$publ_title};
     my $publisher  = $srcpublishers{$publ_title};

     my $year_short = substr($fnyear, 2, 2);
     my $sigle      = $corpusids{$publ_title} . $year_short;

     my $source_desc = $corpusHeader->first_child("fileDesc")->first_child("sourceDesc");
     my $monogr      = $source_desc->first_child("biblStruct")->first_child("monogr");

     $monogr->first_child("h.title")->set_text($publ_title);

     my $imprint = $monogr->first_child("imprint");
     $imprint->first_child("publisher")->set_text($publisher);

     my $pub_place_node = $imprint->first_child("pubPlace");
     $pub_place_node->set_text($publ_place);
     $pub_place_node->set_att('key', $CountryKey);

     $source_desc->first_child("reference")->set_text("${sigle} ${publ_title}; ${publ_place}: ${publisher}, ${fnyear}");
 }


 # createIdsDoc: emit the <idsDoc> stub that opens a new month.
 #
 #   $textattsref   hash reference; the keys 'date' (split on "-" into year and
 #                  month) and 'publ_title' are read
 #
 # Original note: will only be called for the second idsDoc (i.e. for february)
 # and higher.
 #
 # The stub is only built when $TEIFORMAT is "I5"; otherwise the string stays
 # empty.  It is printed (followed by a newline) when the month of the text is
 # exactly $LASTMONTH + 1 and greater than 1.  Whenever the month equals
 # $LASTMONTH + 1, $LASTMONTH is incremented.
 sub createIdsDoc{
     my ($textattsref) = @_;
     my $date_string = $textattsref->{'date'};
     my $PUBLTITLE   = $textattsref->{'publ_title'};

     # date is expected as <year>-<month>-...; only the first two fields are used
     my ($YEAR, $MONTH) = (split("-", $date_string))[0, 1];
     my $year_short     = substr($YEAR, 2, 2);

     my $CSIGLE    = $corpusids{$PUBLTITLE} . $year_short;
     my $DOCID     = $months{$MONTH};
     my $MONTHNAME = $monthnames{$MONTH};

     my $idsDocString = ($TEIFORMAT ne "I5") ? "" : "
 <idsDoc version=\"1.0\" TEIform=\"TEI.2\">
 <idsHeader type=\"document\" pattern=\"text\" version=\"1.1\" TEIform=\"teiHeader\">
   <fileDesc>
     <titleStmt>
       <dokumentSigle>$CSIGLE/$DOCID</dokumentSigle>
       <d.title>$PUBLTITLE, $MONTHNAME $YEAR</d.title>
     </titleStmt>
     <publicationStmt>
       <distributor/>
       <pubAddress/>
       <availability region=\"world\">$kielipankkiLicense</availability>
       <pubDate/>
     </publicationStmt>
     <sourceDesc>
       <biblStruct>
         <monogr>
           <h.title/>
           <imprint/>
         </monogr>
       </biblStruct>
     </sourceDesc>
   </fileDesc>
 </idsHeader>
 </idsDoc>\n";

     # Only react when this text starts the month directly after the last one seen.
     if ($MONTH + 0 == $LASTMONTH + 1) {
         # the first month (january) does not get a stub printed here
         printf("%s\n", $idsDocString) if $MONTH + 0 > 1;
         $LASTMONTH++;
     }
 }


 #################
 ## usage_message
 #################


 # usage_message: print the command line help to STDERR, one line per print
 # call, and terminate the script via exit (no explicit status is passed).
 sub usage_message {
     my @help_lines = (
         "Usage:  ./vrt2tei.pl  [OPTIONS]  <file.vrt.xml>",
         "   <file.vrt.xml> is a VRT file as proper (well-formed) XML",
         "   Options:",
         "       -t (tei|i5)    output format, default: tei",
         "       -m             mask primary data",
         "",
     );

     for my $help_line (@help_lines) {
         print STDERR $help_line . "\n";
     }
     exit;
 }