vrt2tei.pl - EuReCo/kielipankki4eureco - Gitiles

 #! /usr/bin/perl -w


 ###########################################################################################################################################################
 # vrt2tei.pl
 # eureco
 # leibniz-institut fuer deutsche sprache / csc finland esbo
 # august 2024
 #
 #
 # using XML::Twig, see http://www.xmltwig.org/ and https://metacpan.org/pod/XML::Twig
 #
 # usage: see the usage_message function at the end of this file
 # Usage:  ./vrt2tei.pl <vrtxmlfile.xml> > <outfile>
 #         <vrtxmlfile>: xml-ised vrt file; the TEI result is written to standard output
 #
 #
 # TODO:
 # 1  insert dtd spec, or ref to TEI

 # 3a remove the vrt positional attribute comment line / all comment lines
 # 3b add @head and @deprel to I5 sowie auch @msd
 # 3c bearbeitung von @head und @deprel in tei2korapxml durch Nils?
 # 3d build 30 billion corpus

 # 4a take care of IDs
 # 4b see to the values of @xml:lang
 # 5 abfangen von unerwarteten elementen dh andere als <sentence> und <paragraph>
 # 5a wort reihenfolge nochmal checken
 # 6 checks and balances
 # 7  How to encode Kielipankki and National Library of Finland? in teiCorpus Header
 # 8  construct <idsDoc>s for the months (or go for TEI)
 # 9  parallelisation in bash and application on sub corpora of KLK
 # 10  re-implementation of the gawk code in the perl script
 # 12  re-implement creation of text header from xml file in another twig / parametrize TEI vs I5


 #remember
 #formatted.xml:105613: element w: validity error : No declaration for attribute deprel of element w
 #formatted.xml:105613: element w: validity error : No declaration for attribute head of element w


 #
 #
 ############################################################################################################################################################


 use strict;
 use warnings;

 use XML::Twig;
 use XML::Generator ':pretty';  # apparently no effect when using flush();


 use locale;                 # diese drei Zeilen, damit \b im regex nicht Umlaute und ß matcht.
 use POSIX qw(locale_h);     # to be able to use setlocale()
 #setlocale(LC_ALL,'de_DE');
 setlocale(LC_ALL, "fi_FI");
 use utf8;
 use open qw( :std :encoding(UTF-8) );

 use Time::Piece;
 use Tie::IxHash;


 #----------------------
 # check file arguments:
 #----------------------

 # arg0 infile:   vrt-xml

 # Exactly one argument is accepted: the VRT-XML input file. Counting @ARGV
 # (instead of testing the truth of $ARGV[0] and $ARGV[1]) also gives the right
 # answer for arguments such as "0", which are false in Perl.
 usage_message() unless @ARGV == 1 && length $ARGV[0];


 ####################
 # GLOBAL VARIABLES
 ####################

 # Encoding handed to the main twig as output_encoding; it concerns the output
 # only. Alternative kept from the original source: "iso-8859-1".
 my $encoding = 'UTF-8';

 # Counter that the <text> handler increments once per handled <text>.
 my $textcounter = 0;

 # Both start out as empty strings; $twig is assigned the main twig further down.
 my ($twig, $teiCorpusHeaderDoc) = ("", "");

 # global variables pertaining to the original corpus:
 my $kielipankkiCorpus = 'klk-fi-v2-vrt';


 #------------------------------------------------------------------
 # read corpusHeaderSkeleton document and get header out of it
 #------------------------------------------------------------------

 # Parse the corpus header skeleton and keep its root element as the template
 # for the corpus-level header. The file name is relative to the current
 # working directory; comments inside the skeleton are dropped.
 # XML::Twig->new replaces the indirect object form "new XML::Twig", which
 # Perl can misparse.
 my $teiCorpusHeaderDocTwig = XML::Twig->new(
     keep_spaces     => 1,
     keep_atts_order => 1,
     comments        => 'drop',
 );

 $teiCorpusHeaderDocTwig->parsefile("teiCorpusHeaderSkeleton.tei.xml");
 my $corpusHeader = $teiCorpusHeaderDocTwig->root;    # root element of the skeleton (presumably the corpus <teiHeader>)


 #------------------------------------------------------------------
 # read textHeaderSkeleton document and get header out of it
 #------------------------------------------------------------------

 # Parse the text header skeleton and keep its root element as the template
 # for the header of every single text. The file name is relative to the
 # current working directory; comments inside the skeleton are dropped.
 # XML::Twig->new replaces the indirect object form "new XML::Twig", which
 # Perl can misparse.
 my $teiTextHeaderDocTwig = XML::Twig->new(
     keep_spaces     => 1,
     keep_atts_order => 1,
     comments        => 'drop',
 );

 $teiTextHeaderDocTwig->parsefile("teiTextHeaderSkeleton.tei.xml");
 my $textHeader = $teiTextHeaderDocTwig->root;    # root element of the skeleton (presumably the text <teiHeader>)


 #----------------------------------
 # read input VRT-XML document
 #----------------------------------

 # Early check that the input file can be read. The handle is not used for
 # parsing (parsefile() further down works on the file name), so it is closed
 # again right away. Three-argument open keeps characters in the file name from
 # being interpreted as part of the open mode, and $! reports why it failed.
 open(my $IN, '<', $ARGV[0]) or die("$0: cannot open file for reading: $ARGV[0]: $!");
 close($IN);


 #####################
 #     M A I N
 #####################

 #-------------------------------------------------------------------------------------------------------------
 # start twig for input and call start tag handler for root and twig handler for each <text> in the VRT
 #-------------------------------------------------------------------------------------------------------------


 # Build the twig for the VRT-XML input and parse it. root() is called when
 # the start tag of <texts> is seen, text() for every completely parsed <text>.
 # XML::Twig->new replaces the indirect object form "new XML::Twig".
 $twig = XML::Twig->new(
     keep_spaces        => 1,    # also keeps the whitespace at former element boundaries in the output
     keep_atts_order    => 1,    # requires Tie::IxHash
     comments           => 'drop',
     start_tag_handlers => {
         texts => sub { root(@_, $corpusHeader) },
     },
     twig_handlers      => {
         # text => \&text
         text => sub { text(@_, $textHeader) },
     },
     # dtd_handlers =>   {       # ToDo for I5
     #     \&set_dtd;
     # }

     output_encoding    => $encoding,
 );

 $twig->parsefile($ARGV[0]);


 ###########
 # END MAIN
 ###########


 ##############################
 #   S U B R O U T I N E S
 ##############################

 # sub set_dtd [
 #    my $twig, $dtd = @_;
 #    my $internal = qq|\nPUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"\n"DTD/xhtml1-strict.dtd"|;
 #
 #    $twig->twig_doctype('html', undef, undef, $internal);
 #    }


 # Start-tag handler for the root element of the VRT-XML input.
 #
 # Renames the element to <teiCorpus>, declares the TEI namespace on it and
 # hands it to insertCorpusHeader() together with the corpus header element.
 #
 # Arguments (positional): the twig, the root element, the corpus header.
 sub root {
     my $twig         = shift;
     my $root         = shift;
     my $corpusHeader = shift;

     $root->set_gi('teiCorpus');
     $root->set_att("xmlns", 'http://www.tei-c.org/ns/1.0');

     return insertCorpusHeader($root, $corpusHeader);
 }


 # Fill the corpus header with title and source description derived from the
 # name of the input file and paste it as first child of the root element.
 #
 # Parameters:
 #   $root         - the root element of the output (<teiCorpus>)
 #   $corpusHeader - corpus header element; it is modified in place
 #
 # Reads $ARGV[0] (input file name) and the file-level $kielipankkiCorpus.
 # Returns the result of the paste() call.
 sub insertCorpusHeader{
     my ($root, $corpusHeader) = @_;

     #---------------------------------------------------------------------------
     # get some metadata for the current output corpus based on source and year
     #---------------------------------------------------------------------------

     # last path component of the input file name
     my $source = (split(/\//, $ARGV[0]))[-1];
     $source = '' unless defined $source;

     # A trailing "YYYY.xml" is cut off and YYYY kept as the year. $1 is only
     # read when the substitution matched; otherwise it would be undefined or
     # still hold the capture of some earlier, unrelated match.
     my $yy = '';
     if ($source =~ s/([0-9][0-9][0-9][0-9])\.xml$//) {
         $yy = $1;
     }

     #-----------------------
     # set corpus header
     #-----------------------

     set_title(     $corpusHeader, $source, $yy, $kielipankkiCorpus);
     set_sourceDesc($corpusHeader, $source, $yy, $kielipankkiCorpus);

     my $teiCorpusHeader = $corpusHeader->paste("first_child", $root);
     return $teiCorpusHeader;
 }


 #----------------------------
 # handler &text for <text>
 #----------------------------

 # Twig handler for one completely parsed VRT <text> element.
 #
 # Turns the element into <TEI>: a header is created from the attributes of
 # <text> (createTextHeader), a <text><body><div type="page"> structure is
 # added, every <paragraph>/<sentence> becomes <p>/<s>, the token lines of
 # each sentence are replaced by <w> elements, and the twig is flushed.
 #
 # Parameters:
 #   $twig       - the XML::Twig object driving the parse
 #   $text       - the <text> element
 #   $textHeader - header skeleton element passed on to createTextHeader()
 #
 # Increments the file-level $textcounter.
 sub text {
     my ($twig, $text, $textHeader) = @_;

     $textcounter++;

     # ToDo: catch all other, unexpected children of root

     #--------------------------------------------------------------------------
     # Get text metadata (attributes of <text>) and create teiHeader for <text>
     #--------------------------------------------------------------------------

     my $textattsref = $text->atts();    # reference to the attribute hash
     createTextHeader($text, $textattsref, $textHeader);

     #--------------------------
     # create <TEI> from <text>
     #--------------------------

     # the attributes were read above, so they can be removed now
     $text->del_atts;
     $text->set_gi("TEI");

     #------------------------------------------------------------------
     # create the <tei:text>, <body>, <div> elements inside <TEI>
     #------------------------------------------------------------------

     my $ttext_element = XML::Twig::Elt->new('text');
     my $body_element  = XML::Twig::Elt->new('body');
     my $div_element   = XML::Twig::Elt->new('div');

     $div_element  ->set_att("type", "page");       # ToDo: this is specific to KLK
     $ttext_element->set_att("xml:lang", 'fi');     # as in ICC-NOR

     $ttext_element->paste('last_child', $text);
     $body_element ->paste('last_child', $ttext_element);
     $div_element  ->paste('last_child', $body_element);

     #-------------------------------
     # create <p> from <paragraph>
     #-------------------------------

     foreach my $paragraph ($text->children('paragraph')) {

         setP($paragraph);
         $paragraph->move('last_child', $div_element);

         #------------------------------
         # create <s> from <sentence>
         #------------------------------

         foreach my $sentence ($paragraph->children('sentence')) {

             setS($sentence);

             #--------------------------------------
             # create <w> (word) from each $line
             #--------------------------------------

             # text() returns the unescaped character data. xml_text() returns
             # it XML-escaped, and as set_text()/set_att() content is escaped
             # again on output, "&amp;" in the input would end up as
             # "&amp;amp;".
             my @lines = split(/\n+/, $sentence->text);
             $sentence->set_text("\n");

             for my $line (@lines) {                # Todo: check the order of the tokens
                 # skip empty lines and lines of mere whitespace (indentation)
                 next unless $line =~ /\S/;

                 my $w_element = XML::Twig::Elt->new('w');
                 createW($w_element, $line);
                 $w_element->paste('last_child', $sentence);
             } # end words
         } # end sentences
     } # end paragraphs

     # $twig->set_pretty_print( 'record');
     # $twig->flush($OUT);
     # NOTE(review): flush() is documented to take a filehandle; a plain string
     # is passed here - confirm how XML::Twig treats it.
     $twig->flush("/dev/stdout");
 }

 # Fill the text header skeleton with the metadata of one <text> and paste it
 # as first child of that element.
 #
 # Parameters:
 #   $text        - the VRT <text> element that receives the header
 #   $textattsref - hash reference holding the attributes of <text>
 #   $textHeader  - root element of the text header skeleton; modified in place
 #
 # Reads the file-level variables $kielipankkiCorpus and $textcounter.
 # Attributes that are missing from <text> are treated as empty strings.
 # Dies if the skeleton lacks one of the elements addressed below.
 sub createTextHeader{
     my ($text, $textattsref, $textHeader) = @_;

     # Attributes of <text> in KLK with example values; USE = goes into the header
     # USE 01 binding_id="2246025"
     # USE 02 date="2021-01-15"
     #     03 datefrom="20210115"
     #     04 dateto="20210115"
     #     05 elec_date="_"
     #     06 file=""
     # USE 07 filename_metadata="finclarin_siirto_k2021/0039-5552/2021/alto/2246025_0039-5552_2021-01-15_SK0221_mets.xml"
     # USE 08 filename_orig    ="finclarin_siirto_k2021/0039-5552/2021/alto/2246025_0039-5552_2021-01-15_SK0221_page-2021011502210030301.xml
     # USE 09 id="t-bcd0f3fa-bbd3dac4"
     #     10 img_url=""
     # USE 11 issue_date="15.01.2021"
     # USE 12 issue_no="SK0221"
     # USE 13 issue_title="Suomen Kuvalehti"
     # USE 14 label="Suomen Kuvalehti no. SK0221 15.01.2021"
     # USE 16 language="fi"
     # USE 17 page_id="p1"
     # USE 18 page_no="None"
     #     19 part_name="_"
     #     20 publ_id="0039-5552"
     #     21 publ_part=""
     # USE 22 publ_title="Suomen Kuvalehti"
     # USE 23 publ_type="aikakausi"
     # USE 24 sentcount="70"
     # USE 25 sum_lang="|xxx:44|fin:23|eng:3|"
     #     26 timefrom="000000"
     #     27 timeto="235959"
     # USE 28 tokencount="304"
     #     29 version_added="KLK-fi-2021">

     # Copy the attributes that are used; undefined ones become '' so that no
     # undefined value is concatenated or written into the header.
     my %meta = map {
         my $value = $textattsref->{$_};
         ( $_ => ( defined $value ? $value : '' ) );
     } qw(
         binding_id date filename_metadata filename_orig id issue_date
         issue_no issue_title label language page_id page_no publ_title
         publ_type sentcount sum_lang tokencount
     );

     #-----------------------------
     # Derived Metadata variables
     #-----------------------------

     # year, month, day out of "YYYY-MM-DD"; missing parts become ''
     my @datearray = split("-", $meta{date});
     push @datearray, '' while @datearray < 3;

     # Case KLK: page_no mostly seems to be "None", hence the running text number
     my $title = $meta{label} . ", Text #" . $textcounter;

     #-----------------------------------------------------------------------
     # CREATE text-teiHeader ACCORDING TO THE SKELETON in $textHeader
     #-----------------------------------------------------------------------

     # NOTE(review): the same skeleton element is pasted for every <text>;
     # presumably this relies on the previous <TEI> having been flushed and
     # released by then - confirm.
     $textHeader->paste('first_child', $text);

     #-----------------------------------------------
     # <teiHeader>
     #   <fileDesc n="EuReCo-KLK-FIN_[$ID]">
     #     <titleStmt>
     #       <title>[$LABEL, page $PAGENO]</title>

     # NOTE(review): corpus name and id are joined without a separator, unlike
     # the pattern sketched above - confirm which form is wanted.
     my $fileDesc = $textHeader->first_child("fileDesc");
     $fileDesc->set_att('n', "EuReCo-". $kielipankkiCorpus . $meta{id});
     $fileDesc->first_child("titleStmt")->first_child("title")->set_text($title);

     #-----------------------------------------------
     # <fileDesc>
     #  <sourceDesc>
     #   <biblStruct>
     #      <analytic>
     #         <title type="main">[$LABEL, page $PAGENO]</title>
     #         <date>[$DATE]</date>
     #         <date type="year">TODO</date>
     #         <date type="month">TODO</date>
     #         <date type="day">TODO</date>
     #         <idno type="PAGEID">$PAGEID</idno>
     #         <idno type="BINDINGID">$BID</idno>
     #         <idno type="ID">$ID</idno>
     #         <idno type="KIELIPANKKI_METAFILENAME">$METAFILENAME</idno>
     #         <idno type="KIELIPANKKI_ORIGFILENAME">$ORIGFILENAME</idno>
     #         <textLang>$LANGUAGE</textLang>

     my $analytic = $textHeader->get_xpath("./fileDesc/sourceDesc/biblStruct/analytic", 0);

     $analytic->first_child("title")->set_text($title);
     $analytic->get_xpath('./date[@type="date"]',  0)->set_text($meta{date});
     $analytic->get_xpath('./date[@type="year"]',  0)->set_text($datearray[0]);
     $analytic->get_xpath('./date[@type="month"]', 0)->set_text($datearray[1]);
     $analytic->get_xpath('./date[@type="day"]',   0)->set_text($datearray[2]);
     $analytic->get_xpath('./idno[@type="PAGEID"]',    0)->set_text($meta{page_id});
     $analytic->get_xpath('./idno[@type="BINDINGID"]', 0)->set_text($meta{binding_id});
     $analytic->get_xpath('./idno[@type="ID"]',        0)->set_text($meta{id});
     $analytic->get_xpath('./idno[@type="KIELIPANKKI_METAFILENAME"]', 0)->set_text($meta{filename_metadata});
     $analytic->get_xpath('./idno[@type="KIELIPANKKI_ORIGFILENAME"]', 0)->set_text($meta{filename_orig});
     $analytic->first_child('textLang')->set_text($meta{language});

     #  <monogr>
     #    <title>$PUBLTITLE</title>
     #    <imprint>
     #      <pubPlace>TODO</pubPlace>
     #      <publisher>TODO</publisher>
     #    </imprint>
     #    <biblScope unit="ISSUETITLE"/>
     #    <biblScope unit="ISSUENO"/>
     #    <biblScope unit="ISSUEDATE"/>
     #    <biblScope unit="pp">$PAGENO</biblScope>

     my $monogr  = $textHeader->get_xpath("./fileDesc/sourceDesc/biblStruct/monogr", 0);
     my $imprint = $monogr->first_child("imprint");          # imprint is needed for tei validity

     $monogr->first_child("title")->set_text($meta{publ_title});
     $imprint->first_child("pubPlace") ->set_text("ToDo");
     $imprint->first_child("publisher")->set_text("ToDo");
     $monogr->get_xpath('./biblScope[@unit="ISSUETITLE"]', 0)->set_text($meta{issue_title});
     $monogr->get_xpath('./biblScope[@unit="ISSUENO"]',    0)->set_text($meta{issue_no});
     $monogr->get_xpath('./biblScope[@unit="ISSUEDATE"]',  0)->set_text($meta{issue_date});
     $monogr->get_xpath('./biblScope[@unit="pp"]',         0)->set_text($meta{page_no});   # mostly seems to be "None"

     #  <encodingDesc>
     #    <tagsDecl>
     #      <namespace name="http://www.tei-c.org/ns/1.0">
     #        <tagUsage gi="s" occurs="SENTCOUNT"/>
     #        <tagUsage gi="w" occurs="TOKENCOUNT"/>

     $textHeader->get_xpath('./encodingDesc/tagsDecl/namespace/tagUsage[@gi="s"]', 0)->set_att('occurs', $meta{sentcount});
     $textHeader->get_xpath('./encodingDesc/tagsDecl/namespace/tagUsage[@gi="w"]', 0)->set_att('occurs', $meta{tokencount});

     #  <profileDesc>
     #    <langUsage>
     #     <language ident="fi" usage="|xxx:44|fin:23|eng:3|"/>
     #   </langUsage>
     #    <textClass>
     #      <classCode scheme="kielipankki_klk">$PUBLTYPE</classCode>
     #      <classCode scheme="kielipankki_klk_mapped">TODO</classCode>

     # @usage is actually meant to hold an integer; it would be best to split
     # up sum_lang and create several <language> elements
     my $language = $textHeader->get_xpath('./profileDesc/langUsage/language', 0);
     $language->set_att('ident', $meta{language});
     $language->set_att('usage', $meta{sum_lang});

     $textHeader->get_xpath('./profileDesc/textClass/classCode[@scheme="kielipankki_klk"]', 0)       ->set_text($meta{publ_type});
     $textHeader->get_xpath('./profileDesc/textClass/classCode[@scheme="kielipankki_klk_mapped"]', 0)->set_text("ToDo");

     #  <revisionDesc>
     #    <change when="TODO" who="HL">TEI version for EuReCo</change>

     # date of this conversion run, as YYYY-MM-DD
     $textHeader->get_xpath('./revisionDesc/change', 0)->set_att('when', localtime->ymd('-'));
 }

 # Turn a VRT <paragraph> element into a TEI <p> element.
 #
 # Example input: <paragraph id="p-bcd0f3fa-bbd3dac4-815ead7a" sum_lang="|fin:1|">
 #   @sum_lang is moved into @xml:lang, prefixed with "x-" (private value)
 #   @id is removed, because this id is not unique
 sub setP {
     my ($paragraph) = @_;

     my $private_lang = "x-" . $paragraph->att("sum_lang");

     $paragraph->set_gi('p');
     $paragraph->set_att("xml:lang", $private_lang);

     # $paragraph->change_att_name('id', 'xml:id');
     $paragraph->del_att("sum_lang");
     return $paragraph->del_att("id");
 }
 # Turn a VRT <sentence> element into a TEI <s> element.
 #
 # Attributes of <sentence>:
 #    @id="s-bcd0f3fa-bbd3dac4-f7429090"  removed (not unique)
 #    @lang="fin"                         moved to @xml:lang
 #    @lang_conf="0.6734853"              removed for the time being (ToDo: @cert ?)
 sub setS {
     my ($sentence) = @_;

     # ToDo: convert the value / introduce a hash for lookup (input values: "fin", "xxx", ....)
     my $lang = $sentence->att("lang");

     $sentence->set_gi('s');
     $sentence->set_att("xml:lang", $lang);

     # $sentence->change_att_name('id', 'xml:id');
     for my $obsolete ('id', "lang") {
         $sentence->del_att($obsolete);
     }
     return $sentence->del_att("lang_conf");
 }

 # Fill a <w> element from one tab-separated VRT token line.
 #
 # Parameters:
 #   $w_element - the (new) <w> element
 #   $line      - one token line; its columns are the positional attributes
 #
 # vrt positional-attributes in corpus KLK:
 #  USE [0] word
 #  USE [1] ref  (id for reference of dephead)
 #  USE [2] lemma
 #  ?   [3] lemmacomp   (lemma with compound info - could go in @norm, as tag abuse?)
 #  USE [4] pos
 #  USE [5] msd
 #  USE [6] dephead
 #  USE [7] deprel
 #      [8] content   (ocr-process)
 #      [9] vpos      (ocr-process)
 #     [10] ocr       (ocr-process)
 #     [11] cc        (ocr-process)
 #     [12] hyph      (ocr-process)
 #     [13] style     (ocr-process)
 #     [14] lex       (korp semantic disambiguation from G"oteborg)
 sub createW {
     my ($w_element, $line) = @_;

     my @columns = split(/\t/, $line);

     # the token itself is the content of <w>
     $w_element->set_text($columns[0]);

     $w_element->set_att("n", $columns[1]);

     # an id assembled from page, sentence and token number would not be unique
     # either, so <w> carries no id
     $w_element->del_att("id");

     # attribute name => column index; @norm ([3]) would be tag abuse,
     # @head ([6]) and @deprel ([7]) are switched off for the time being
     for my $mapping ( [ "lemma", 2 ], [ "pos", 4 ] ) {
         my ($att_name, $index) = @{$mapping};
         $w_element->set_att($att_name, $columns[$index]);
     }
     return $w_element->set_att("msd", $columns[5]);
 }


 # Write the corpus title into the corpus header.
 #
 # Target element:
 #   <teiHeader>
 #     <fileDesc>
 #       <titleStmt>
 #         <title>[Aamulehti2021] from [klk-fi-v2-vrt for] EuReCo</title>
 #
 # Parameters: corpus header element, source name, year, name of the
 # Kielipankki corpus.
 sub set_title{
     my ($corpusHeader, $source, $yy, $kielipankkiCorpus) = @_;

     # walk down fileDesc/titleStmt/title
     my $node = $corpusHeader;
     for my $child_name ("fileDesc", "titleStmt", "title") {
         $node = $node->first_child($child_name);
     }

     return $node->set_text("$source $yy, from $kielipankkiCorpus for EuReCo");
 }

 # Write the source description into the corpus header.
 #
 # Target element:
 #   <teiHeader>
 #     <fileDesc>
 #       <sourceDesc>
 #         <bibl>[Aamulehti2021] from [klk-fi-v2-vrt]</bibl>
 #
 # Parameters: corpus header element, source name, year, name of the
 # Kielipankki corpus.
 sub set_sourceDesc{
     my ($corpusHeader, $source, $yy, $kielipankkiCorpus) = @_;

     # walk down fileDesc/sourceDesc/bibl
     my $node = $corpusHeader;
     for my $child_name ("fileDesc", "sourceDesc", "bibl") {
         $node = $node->first_child($child_name);
     }

     return $node->set_text("$source $yy, from $kielipankkiCorpus for EuReCo");
 }


 #################
 ## usage_message
 #################


 # Print a short usage note and end the program.
 #
 # The note goes to STDERR and the exit status is 1, as this sub is only
 # called for a wrong number of arguments. The script takes the input file as
 # its only argument; a second argument is rejected by the argument check, so
 # the output has to be redirected instead of being named.
 sub usage_message {
     print STDERR "   Usage:  ./vrt2tei.pl <file.vrt.xml> > <outfile>\n";
     print STDERR "   <file.vrt.xml> is a VRT file converted to proper XML\n";
     exit 1;
 }
	#! /usr/bin/perl -w


	###########################################################################################################################################################
	# vrt2tei.pl
	# eureco
	# leibniz-institut fuer deutsche sprache / csc finland esbo
	# august 2024
	#
	#
	# using XML::Twig, see http://www.xmltwig.org/ and https://metacpan.org/pod/XML::Twig
	#
	# usage: see the usage_message function
	# Usage: ./vrt2tei.pl <vrtxmlfile.xml> > <outfile>
	# <vrtxmlfile>: xml-ised vrt file; the TEI result is written to standard output
	#
	#
	# TODO:
	# 1 insert dtd spec, or ref to TEI

	# 3a remove the vrt positional attribute comment line / all comment lines
	# 3b add @head and @deprel to I5 sowie auch @msd
	# 3c bearbeitung von @head und @deprel in tei2korapxml durch Nils?
	# 3d build 30 billion corpus

	# 4a take care of IDs
	# 4b see to the values of @xml:lang
	# 5 abfangen von unerwarteten elementen dh andere als <sentence> und <paragraph>
	# 5a wort reihenfolge nochmal checken
	# 6 checks and balances
	# 7 How to encode Kielipankki and National Library of Finland? in teiCorpus Header
	# 8 construct <idsDoc>s for the months (or go for TEI)
	# 9 parallelisation in bash and application on sub corpora of KLK
	# 10 re-implementation of the gawk code in the perl script
	# 12 re-implement creation of text header from xml file in another twig / parametrize TEI vs I5



	#remember
	#formatted.xml:105613: element w: validity error : No declaration for attribute deprel of element w
	#formatted.xml:105613: element w: validity error : No declaration for attribute head of element w


	#
	#
	############################################################################################################################################################


	use strict;
	use warnings;

	use XML::Twig;
	use XML::Generator ':pretty'; # apparently no effect when using flush();


	use locale; # diese drei Zeilen, damit \b im regex nicht Umlaute und ß matcht.
	use POSIX qw(locale_h); # to be able to use setlocale()
	#setlocale(LC_ALL,'de_DE');
	setlocale(LC_ALL, "fi_FI");
	use utf8;
	use open qw( :std :encoding(UTF-8) );

	use Time::Piece;
	use Tie::IxHash;



	#----------------------
	# check file arguments:
	#----------------------

	# arg0 infile: vrt-xml

	# Exactly one argument is accepted: the VRT-XML input file. Counting @ARGV
	# (instead of testing the truth of $ARGV[0] and $ARGV[1]) also gives the right
	# answer for arguments such as "0", which are false in Perl.
	usage_message() unless @ARGV == 1 && length $ARGV[0];


	####################
	# GLOBAL VARIABLES
	####################

	# Encoding handed to the main twig as output_encoding; it concerns the output
	# only. Alternative kept from the original source: "iso-8859-1".
	my $encoding = 'UTF-8';

	# Counter that the <text> handler increments once per handled <text>.
	my $textcounter = 0;

	# Both start out as empty strings; $twig is assigned the main twig further down.
	my ($twig, $teiCorpusHeaderDoc) = ("", "");

	# global variables pertaining to the original corpus:
	my $kielipankkiCorpus = 'klk-fi-v2-vrt';




	#------------------------------------------------------------------
	# read corpusHeaderSkeleton document and get header out of it
	#------------------------------------------------------------------

	# Parse the corpus header skeleton and keep its root element as the template
	# for the corpus-level header. The file name is relative to the current
	# working directory; comments inside the skeleton are dropped.
	# XML::Twig->new replaces the indirect object form "new XML::Twig", which
	# Perl can misparse.
	my $teiCorpusHeaderDocTwig = XML::Twig->new(
	    keep_spaces     => 1,
	    keep_atts_order => 1,
	    comments        => 'drop',
	);

	$teiCorpusHeaderDocTwig->parsefile("teiCorpusHeaderSkeleton.tei.xml");
	my $corpusHeader = $teiCorpusHeaderDocTwig->root;    # root element of the skeleton (presumably the corpus <teiHeader>)


	#------------------------------------------------------------------
	# read textHeaderSkeleton document and get header out of it
	#------------------------------------------------------------------

	# Parse the text header skeleton and keep its root element as the template
	# for the header of every single text. The file name is relative to the
	# current working directory; comments inside the skeleton are dropped.
	# XML::Twig->new replaces the indirect object form "new XML::Twig", which
	# Perl can misparse.
	my $teiTextHeaderDocTwig = XML::Twig->new(
	    keep_spaces     => 1,
	    keep_atts_order => 1,
	    comments        => 'drop',
	);

	$teiTextHeaderDocTwig->parsefile("teiTextHeaderSkeleton.tei.xml");
	my $textHeader = $teiTextHeaderDocTwig->root;    # root element of the skeleton (presumably the text <teiHeader>)


	#----------------------------------
	# read input VRT-XML document
	#----------------------------------

	# Early check that the input file can be read. The handle is not used for
	# parsing (parsefile() further down works on the file name), so it is closed
	# again right away. The backslash-escaped "\|\|" was not valid Perl and is
	# replaced by "or"; three-argument open keeps characters in the file name
	# from being interpreted as part of the open mode, and $! reports the reason.
	open(my $IN, '<', $ARGV[0]) or die("$0: cannot open file for reading: $ARGV[0]: $!");
	close($IN);


	#####################
	# M A I N
	#####################

	#-------------------------------------------------------------------------------------------------------------
	# start twig for input and call start tag handler for root and twig handler for each <text> in the VRT
	#-------------------------------------------------------------------------------------------------------------


	# Build the twig for the VRT-XML input and parse it. root() is called when
	# the start tag of <texts> is seen, text() for every completely parsed <text>.
	# XML::Twig->new replaces the indirect object form "new XML::Twig".
	$twig = XML::Twig->new(
	    keep_spaces        => 1,    # also keeps the whitespace at former element boundaries in the output
	    keep_atts_order    => 1,    # requires Tie::IxHash
	    comments           => 'drop',
	    start_tag_handlers => {
	        texts => sub { root(@_, $corpusHeader) },
	    },
	    twig_handlers      => {
	        # text => \&text
	        text => sub { text(@_, $textHeader) },
	    },
	    # dtd_handlers => {         # ToDo for I5
	    #     \&set_dtd;
	    # }

	    output_encoding    => $encoding,
	);

	$twig->parsefile($ARGV[0]);





	###########
	# END MAIN
	###########




	##############################
	# S U B R O U T I N E S
	##############################

	# sub set_dtd [
	# my $twig, $dtd = @_;
	# my $internal = qq\|\nPUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"\n"DTD/xhtml1-strict.dtd"\|;
	#
	# $twig->twig_doctype('html', undef, undef, $internal);
	# }



	# Start-tag handler for the root element of the VRT-XML input.
	#
	# Renames the element to <teiCorpus>, declares the TEI namespace on it and
	# hands it to insertCorpusHeader() together with the corpus header element.
	#
	# Arguments (positional): the twig, the root element, the corpus header.
	sub root {
	    my $twig         = shift;
	    my $root         = shift;
	    my $corpusHeader = shift;

	    $root->set_gi('teiCorpus');
	    $root->set_att("xmlns", 'http://www.tei-c.org/ns/1.0');

	    return insertCorpusHeader($root, $corpusHeader);
	}



	# Fill the corpus header with title and source description derived from the
	# name of the input file and paste it as first child of the root element.
	#
	# Parameters:
	#   $root         - the root element of the output (<teiCorpus>)
	#   $corpusHeader - corpus header element; it is modified in place
	#
	# Reads $ARGV[0] (input file name) and the file-level $kielipankkiCorpus.
	# Returns the result of the paste() call.
	sub insertCorpusHeader{
	    my ($root, $corpusHeader) = @_;

	    #---------------------------------------------------------------------------
	    # get some metadata for the current output corpus based on source and year
	    #---------------------------------------------------------------------------

	    # last path component of the input file name
	    my $source = (split(/\//, $ARGV[0]))[-1];
	    $source = '' unless defined $source;

	    # A trailing "YYYY.xml" is cut off and YYYY kept as the year. $1 is only
	    # read when the substitution matched; otherwise it would be undefined or
	    # still hold the capture of some earlier, unrelated match.
	    my $yy = '';
	    if ($source =~ s/([0-9][0-9][0-9][0-9])\.xml$//) {
	        $yy = $1;
	    }

	    #-----------------------
	    # set corpus header
	    #-----------------------

	    set_title(     $corpusHeader, $source, $yy, $kielipankkiCorpus);
	    set_sourceDesc($corpusHeader, $source, $yy, $kielipankkiCorpus);

	    my $teiCorpusHeader = $corpusHeader->paste("first_child", $root);
	    return $teiCorpusHeader;
	}


	#----------------------------
	# handler &text for <text>
	#----------------------------

	# Twig handler for one completely parsed VRT <text> element.
	#
	# Turns the element into <TEI>: a header is created from the attributes of
	# <text> (createTextHeader), a <text><body><div type="page"> structure is
	# added, every <paragraph>/<sentence> becomes <p>/<s>, the token lines of
	# each sentence are replaced by <w> elements, and the twig is flushed.
	#
	# Parameters:
	#   $twig       - the XML::Twig object driving the parse
	#   $text       - the <text> element
	#   $textHeader - header skeleton element passed on to createTextHeader()
	#
	# Increments the file-level $textcounter.
	sub text {
	    my ($twig, $text, $textHeader) = @_;

	    $textcounter++;

	    # ToDo: catch all other, unexpected children of root

	    # The attributes of <text> hold the text metadata; they are read here
	    # because they are removed below.
	    my $textattsref = $text->atts();    # reference to the attribute hash
	    createTextHeader($text, $textattsref, $textHeader);

	    # create <TEI> from <text>
	    $text->del_atts;
	    $text->set_gi("TEI");

	    # create the <tei:text>, <body>, <div> elements inside <TEI>
	    my $ttext_element = XML::Twig::Elt->new('text');
	    my $body_element  = XML::Twig::Elt->new('body');
	    my $div_element   = XML::Twig::Elt->new('div');

	    $div_element  ->set_att("type", "page");       # ToDo: this is specific to KLK
	    $ttext_element->set_att("xml:lang", 'fi');     # as in ICC-NOR

	    $ttext_element->paste('last_child', $text);
	    $body_element ->paste('last_child', $ttext_element);
	    $div_element  ->paste('last_child', $body_element);

	    # create <p> from <paragraph>
	    foreach my $paragraph ($text->children('paragraph')) {

	        setP($paragraph);
	        $paragraph->move('last_child', $div_element);

	        # create <s> from <sentence>
	        foreach my $sentence ($paragraph->children('sentence')) {

	            setS($sentence);

	            # create <w> (word) from each $line.
	            # text() returns the unescaped character data. xml_text() returns
	            # it XML-escaped, and as set_text()/set_att() content is escaped
	            # again on output, "&amp;" in the input would end up as
	            # "&amp;amp;".
	            my @lines = split(/\n+/, $sentence->text);
	            $sentence->set_text("\n");

	            for my $line (@lines) {                # Todo: check the order of the tokens
	                # skip empty lines and lines of mere whitespace (indentation)
	                next unless $line =~ /\S/;

	                my $w_element = XML::Twig::Elt->new('w');
	                createW($w_element, $line);
	                $w_element->paste('last_child', $sentence);
	            } # end words
	        } # end sentences
	    } # end paragraphs

	    # $twig->set_pretty_print( 'record');
	    # $twig->flush($OUT);
	    # NOTE(review): flush() is documented to take a filehandle; a plain string
	    # is passed here - confirm how XML::Twig treats it.
	    $twig->flush("/dev/stdout");
	}

	# createTextHeader($text, $textattsref, $textHeader)
	#
	# Pastes the teiHeader skeleton $textHeader in as the first child of $text and
	# fills its nodes with the metadata found in the hash behind $textattsref.
	#
	#   $text         element that receives the header as its first child
	#   $textattsref  hash reference holding the attributes of the vrt <text> element
	#   $textHeader   teiHeader skeleton element
	#
	# Also reads the file-level variables $kielipankkiCorpus and $textcounter.
	# NOTE(review): none of the node lookups is checked, so the skeleton presumably
	# has to contain every node addressed below - confirm against the skeleton file.
	sub createTextHeader{
		my ($text, $textattsref, $textHeader) = @_;

		# Attributes of the vrt <text> element (example values); the ones marked USE
		# are copied into the header:
		# USE 01 binding_id="2246025"
		# USE 02 date="2021-01-15"
		#     03 datefrom="20210115"
		#     04 dateto="20210115"
		#     05 elec_date="_"
		#     06 file=""
		# USE 07 filename_metadata="finclarin_siirto_k2021/0039-5552/2021/alto/2246025_0039-5552_2021-01-15_SK0221_mets.xml"
		# USE 08 filename_orig="finclarin_siirto_k2021/0039-5552/2021/alto/2246025_0039-5552_2021-01-15_SK0221_page-2021011502210030301.xml"
		# USE 09 id="t-bcd0f3fa-bbd3dac4"
		#     10 img_url=""
		# USE 11 issue_date="15.01.2021"
		# USE 12 issue_no="SK0221"
		# USE 13 issue_title="Suomen Kuvalehti"
		# USE 14 label="Suomen Kuvalehti no. SK0221 15.01.2021"
		# USE 16 language="fi"
		# USE 17 page_id="p1"
		# USE 18 page_no="None"
		#     19 part_name="_"
		#     20 publ_id="0039-5552"
		#     21 publ_part=""
		# USE 22 publ_title="Suomen Kuvalehti"
		# USE 23 publ_type="aikakausi"
		# USE 24 sentcount="70"
		# USE 25 sum_lang="\|xxx:44\|fin:23\|eng:3\|"
		#     26 timefrom="000000"
		#     27 timeto="235959"
		# USE 28 tokencount="304"
		#     29 version_added="KLK-fi-2021">

		my $BID          = $textattsref->{'binding_id'};
		my $DATE         = $textattsref->{'date'};
		my $METAFILENAME = $textattsref->{'filename_metadata'};
		my $ORIGFILENAME = $textattsref->{'filename_orig'};
		my $ID           = $textattsref->{'id'};
		my $ISSUEDATE    = $textattsref->{'issue_date'};
		my $ISSUENO      = $textattsref->{'issue_no'};
		my $ISSUETITLE   = $textattsref->{'issue_title'};
		my $LABEL        = $textattsref->{'label'};
		my $LANGUAGE     = $textattsref->{'language'};
		my $PAGEID       = $textattsref->{'page_id'};
		my $PAGENO       = $textattsref->{'page_no'};
		my $PUBLTITLE    = $textattsref->{'publ_title'};
		my $PUBLTYPE     = $textattsref->{'publ_type'};
		my $SENTCOUNT    = $textattsref->{'sentcount'};
		my $SUMLANG      = $textattsref->{'sum_lang'};
		my $TOKENCOUNT   = $textattsref->{'tokencount'};

		#-----------------------------
		# Derived Metadata variables
		#-----------------------------

		# $DATE is split at "-"; the parts are written below as year, month and day.
		my @datearray = split("-", $DATE);

		# split() takes a pattern as first argument. The former "\|" is just "|" once
		# the double-quoted string has been evaluated, i.e. an alternation of two empty
		# patterns, which cuts the value into single characters. /\|/ splits at the
		# literal bar; grep drops the empty field in front of a leading bar.
		# @langarray is not used yet (see the note on @usage further down).
		my @langarray = grep { length } split(/\|/, $SUMLANG);

		# Not used yet; the idea is to use $namearray[4] as ID for the page.
		my @namearray = split(/[\.\/]/, $ORIGFILENAME);

		#-----------------------------------------------------------------------
		# CREATE text-teiHeader ACCORDING TO THE SKELETON in $textHeader
		#-----------------------------------------------------------------------

		$textHeader->paste('first_child', $text);

		# Title used both in <titleStmt> and in <analytic>.
		# Case KLK: the text counter is used because PAGENO seems to be "None" most of the time.
		my $titleString = $LABEL . ", Text #" . $textcounter;

		#-----------------------------------------------
		# <teiHeader>
		#   <fileDesc n="EuReCo-KLK-FIN_[$ID]">
		#     <titleStmt>
		#       <title>[$LABEL, page $PAGENO]</title>

		my $fileDesc = $textHeader->first_child("fileDesc");

		$fileDesc->set_att('n', "EuReCo-". $kielipankkiCorpus . $ID);
		$fileDesc->first_child("titleStmt")->first_child("title")->set_text($titleString);

		#-----------------------------------------------
		# <fileDesc>
		#   <sourceDesc>
		#     <biblStruct>
		#       <analytic>
		#         <title type="main">[$LABEL, page $PAGENO]</title>
		#         <date>[$DATE]</date>
		#         <date type="year">TODO</date>
		#         <date type="month">TODO</date>
		#         <date type="day">TODO</date>
		#         <idno type="PAGEID">$PAGEID</idno>
		#         <idno type="BINDINGID">$BID</idno>
		#         <idno type="ID">$ID</idno>
		#         <idno type="KIELIPANKKI_METAFILENAME">$METAFILENAME</idno>
		#         <idno type="KIELIPANKKI_ORIGFILENAME">$ORIGFILENAME</idno>
		#         <textLang>$LANGUAGE</textLang>

		my $analytic = $textHeader->get_xpath("./fileDesc/sourceDesc/biblStruct/analytic", 0);

		$analytic->first_child("title")->set_text($titleString);
		$analytic->get_xpath('./date[@type="date"]', 0)->set_text($DATE);
		$analytic->get_xpath('./date[@type="year"]', 0)->set_text($datearray[0]);
		$analytic->get_xpath('./date[@type="month"]', 0)->set_text($datearray[1]);
		$analytic->get_xpath('./date[@type="day"]', 0)->set_text($datearray[2]);
		$analytic->get_xpath('./idno[@type="PAGEID"]', 0)->set_text($PAGEID);
		$analytic->get_xpath('./idno[@type="BINDINGID"]', 0)->set_text($BID);
		$analytic->get_xpath('./idno[@type="ID"]', 0)->set_text($ID);
		$analytic->get_xpath('./idno[@type="KIELIPANKKI_METAFILENAME"]', 0)->set_text($METAFILENAME);
		$analytic->get_xpath('./idno[@type="KIELIPANKKI_ORIGFILENAME"]', 0)->set_text($ORIGFILENAME);
		$analytic->first_child('textLang')->set_text($LANGUAGE);

		#       <monogr>
		#         <title>$PUBLTITLE</title>
		#         <imprint>
		#           <pubPlace>TODO</pubPlace>
		#           <publisher>TODO</publisher>
		#         </imprint>
		#         <biblScope unit="ISSUETITLE"/>
		#         <biblScope unit="ISSUENO"/>
		#         <biblScope unit="ISSUEDATE"/>
		#         <biblScope unit="pp">$PAGENO</biblScope>

		my $monogr  = $textHeader->get_xpath("./fileDesc/sourceDesc/biblStruct/monogr", 0);
		my $imprint = $monogr->first_child("imprint"); # imprint is needed for tei validity

		$monogr->first_child("title")->set_text($PUBLTITLE);
		$imprint->first_child("pubPlace")->set_text("ToDo");
		$imprint->first_child("publisher")->set_text("ToDo");
		$monogr->get_xpath('./biblScope[@unit="ISSUETITLE"]', 0)->set_text($ISSUETITLE);
		$monogr->get_xpath('./biblScope[@unit="ISSUENO"]', 0)->set_text($ISSUENO);
		$monogr->get_xpath('./biblScope[@unit="ISSUEDATE"]', 0)->set_text($ISSUEDATE);
		# Caution: PAGENO seems to be "None" most of the time
		$monogr->get_xpath('./biblScope[@unit="pp"]', 0)->set_text($PAGENO);

		# <encodingDesc>
		#   <tagsDecl>
		#     <namespace name="http://www.tei-c.org/ns/1.0">
		#       <tagUsage gi="s" occurs="SENTCOUNT"/>
		#       <tagUsage gi="w" occurs="TOKENCOUNT"/>

		$textHeader->get_xpath('./encodingDesc/tagsDecl/namespace/tagUsage[@gi="s"]', 0)->set_att('occurs', $SENTCOUNT);
		$textHeader->get_xpath('./encodingDesc/tagsDecl/namespace/tagUsage[@gi="w"]', 0)->set_att('occurs', $TOKENCOUNT);

		# <profileDesc>
		#   <langUsage>
		#     <language ident="fi" usage="\|xxx:44\|fin:23\|eng:3\|"/>
		#   </langUsage>
		#   <textClass>
		#     <classCode scheme="kielipankki_klk">$PUBLTYPE</classCode>
		#     <classCode scheme="kielipankki_klk_mapped">TODO</classCode>

		my $languageNode = $textHeader->get_xpath('./profileDesc/langUsage/language', 0);

		$languageNode->set_att('ident', $LANGUAGE);
		# @usage should actually hold an integer; best split up the content of
		# SUMLANG and create several <language> elements
		$languageNode->set_att('usage', $SUMLANG);

		$textHeader->get_xpath('./profileDesc/textClass/classCode[@scheme="kielipankki_klk"]', 0)->set_text($PUBLTYPE);
		$textHeader->get_xpath('./profileDesc/textClass/classCode[@scheme="kielipankki_klk_mapped"]', 0)->set_text("ToDo");

		# <revisionDesc>
		#   <change when="TODO" who="HL">TEI version for EuReCo</change>

		# @when receives the current local date as formatted by ymd('-')
		$textHeader->get_xpath('./revisionDesc/change', 0)->set_att('when', localtime->ymd('-'));

		#-----------------------------------
		# END OF CREATING TEIHEADER
		#-----------------------------------

	}

	# setP($paragraph)
	#
	# Turns a vrt <paragraph> element into a <p> element, in place.
	#
	#   <paragraph id="p-bcd0f3fa-bbd3dac4-815ead7a" sum_lang="\|fin:1\|">
	#
	#   @sum_lang  moved to @xml:lang, prefixed with "x-" to mark a private value
	#   @id        removed, because this id is not unique
	sub setP {
		my ($p_elt) = @_;

		$p_elt->set_gi('p');

		# build the private language value before @sum_lang is dropped
		my $private_lang = "x-" . $p_elt->att("sum_lang");
		$p_elt->set_att("xml:lang", $private_lang);

		# (renaming 'id' to 'xml:id' was given up: the id is not unique)
		$p_elt->del_att("sum_lang", "id");
	}
	# setS($sentence)
	#
	# Turns a vrt <sentence> element into an <s> element, in place.
	#
	# Attributes of <sentence>:
	#   USE 1 @id="s-bcd0f3fa-bbd3dac4-f7429090"   removed (not unique)
	#   USE 2 @lang="fin"                          copied to @xml:lang, then removed
	#   ?   3 @lang_conf="0.6734853"               removed for the time being (ToDo: @cert ?)
	sub setS {
		my ($s_elt) = @_;

		$s_elt->set_gi('s');

		# ToDo: convert the value / introduce a hash for lookup (input values: "fin", "xxx", ....)
		my $lang = $s_elt->att("lang");
		$s_elt->set_att("xml:lang", $lang);

		# 'id' is not renamed to 'xml:id' because it is not unique;
		# 'lang' has been replaced by xml:lang
		$s_elt->del_att('id', "lang", "lang_conf");

	}

	# createW($w_element, $line)
	#
	# Fills the <w> element $w_element from one vrt token line: $line is split at
	# tabs, the first column becomes the text of the element and selected further
	# columns become its attributes.
	#
	# vrt positional-attributes in corpus KLK:
	#   USE [0]  word
	#   USE [1]  ref       (id for reference of dephead)
	#   USE [2]  lemma
	#   ?   [3]  lemmacomp (lemma with compound info - could go in @norm, as tag abuse?)
	#   USE [4]  pos
	#   USE [5]  msd
	#   USE [6]  dephead
	#   USE [7]  deprel
	#       [8]  content   (ocr-process)
	#       [9]  vpos      (ocr-process)
	#       [10] ocr       (ocr-process)
	#       [11] cc        (ocr-process)
	#       [12] hyph      (ocr-process)
	#       [13] style     (ocr-process)
	#       [14] lex       (korp semantic disambiguation from G"oteborg)
	sub createW {
		my ($w_element, $line) = @_;

		#---------------------------
		# Get the tags (=columns)
		#---------------------------

		my @columns = split(/\t/, $line);
		my ($word, $ref, $lemma, undef, $pos, $msd) = @columns;

		# the token itself is the content of <w>
		$w_element->set_text($word);

		# attributes of <w>
		$w_element->set_att("n", $ref);

		# $w_element->set_att("id", "w_" . $namearray[4] . $sentence->att("xml:id") . "_" . $tags[1]);
		# an ID assembled this way is not unique either, so no id is kept
		$w_element->del_att("id");

		# $w_element->set_att("norm", $tags[3]); # tag abuse of @norm
		#TMP $w_element->set_att("head", $tags[6]);
		#TMP $w_element->set_att("deprel", $tags[7]);
		$w_element->set_att(
			"lemma" => $lemma,
			"pos"   => $pos,
			"msd"   => $msd,
		);

	}


	# set_title($corpusHeader, $source, $yy, $kielipankkiCorpus)
	#
	# Writes the corpus title into the header $corpusHeader:
	#
	#   <teiHeader>
	#     <fileDesc>
	#       <titleStmt>
	#         <title>[Aamulehti2021] from [klk-fi-v2-vrt for] EuReCo</title>
	#       </titleStmt>
	#       <!-- ... -->
	#     </fileDesc>
	#   </teiHeader>
	#
	# The title text is "<$source> <$yy>, from <$kielipankkiCorpus> for EuReCo".
	sub set_title{
		my ($corpusHeader, $source, $yy, $kielipankkiCorpus) = @_;

		# walk down to fileDesc/titleStmt/title, one level at a time
		my $titleStmt = $corpusHeader->first_child("fileDesc")->first_child("titleStmt");
		my $titleElt  = $titleStmt->first_child("title");

		$titleElt->set_text(join("", $source, " ", $yy, ", from ", $kielipankkiCorpus, " for EuReCo"));

	}

	# set_sourceDesc($corpusHeader, $source, $yy, $kielipankkiCorpus)
	#
	# Writes the bibliographic line of the corpus into the header $corpusHeader:
	#
	#   <teiHeader>
	#     <fileDesc>
	#       <!-- ... -->
	#       <sourceDesc>
	#         <bibl>[Aamulehti2021] from [klk-fi-v2-vrt]</bibl>
	#       </sourceDesc>
	#       <!-- ... -->
	#     </fileDesc>
	#   </teiHeader>
	#
	# The text of <bibl> is "<$source> <$yy>, from <$kielipankkiCorpus> for EuReCo".
	sub set_sourceDesc{
		my ($corpusHeader, $source, $yy, $kielipankkiCorpus) = @_;

		# descend fileDesc -> sourceDesc -> bibl
		my $biblElt = $corpusHeader;
		for my $gi ("fileDesc", "sourceDesc", "bibl") {
			$biblElt = $biblElt->first_child($gi);
		}

		my $biblText = $source . " " . $yy;
		$biblText .= ", from " . $kielipankkiCorpus;
		$biblText .= " for EuReCo";

		$biblElt->set_text($biblText);
	}







	#################
	## usage_message
	#################


	# usage_message()
	#
	# Prints the two-line usage text to the currently selected output handle and
	# ends the program via exit.
	sub usage_message {
		my $usage = " Usage: ./vrt2tei.pl <file.vrt.xml> <outfile>\n"
			. " <file.vrt.xml> is a VRT file converted to proper XML\n";

		print $usage;
		exit;
	}