converter, first version
diff --git a/vrt2tei.pl b/vrt2tei.pl
new file mode 100755
index 0000000..c7789a8
--- /dev/null
+++ b/vrt2tei.pl
@@ -0,0 +1,548 @@
+#! /usr/bin/perl -w
+
+
+###########################################################################################################################################################
+# vrt2tei.pl
+# eureco
+# leibniz-institut fuer deutsche sprache / csc finland esbo
+# august 2024
+# 
+# 
+# using XML::Twig , see http://www.xmltwig.org/, https://metacpan.org/pod/XML::Twig
+# 
+# usage: see the usage_message function below
+# Usage:  ./vrt2tei.pl <vrtxmlfile.xml> <outfile>
+#         <vrtxmlfile>: xml-ised vrt file
+#
+#
+# TODO: 
+# 1  insert dtd spec, or ref to TEI
+
+# 3a UPLOAD in GITHUB
+# 3b add @head and @deprel to I5 sowie auch @msd
+# 3c bearbeitung von @head und @deprel in tei2korapxml durch Nils?
+# 3d build 30 billion corpus
+
+# 4a take care of IDs
+# 4b see to the values of @xml:lang
+# 5 abfangen von unerwarteten elementen dh andere als <sentence> und <paragraph>
+# 5a wort reihenfolge nochmal checken
+# 6 checks and balances
+# 6a output nach stdout machen
+# 7  How to encode Kielipankki and National Library of Finland? in teiCorpus Header
+# 8  construct <idsDoc>s for the months (or go for TEI)
+# 9  parallelisation in bash and application on sub corpora of KLK
+# 10  re-implementation of the gawk code in the perl script
+# 12  re-implement creation of text header from xml file in another twig / parametrize TEI vs I5
+
+
+
+#remember
+#formatted.xml:105613: element w: validity error : No declaration for attribute deprel of element w
+#formatted.xml:105613: element w: validity error : No declaration for attribute head of element w
+
+
+#
+#
+############################################################################################################################################################
+
+
+use strict;
+use warnings;
+
+use XML::Twig; 
+use XML::Generator ':pretty';  # apparently no effect when using flush();
+
+ 
+use locale;                 # diese drei Zeilen, damit \b im regex nicht Umlaute und ß matcht.
+use POSIX qw(locale_h);     # to be able to use setlocale()
+#setlocale(LC_ALL,'de_DE');  
+setlocale(LC_ALL, "fi_FI");  
+use utf8;
+use open qw( :std :encoding(UTF-8) );
+
+use Time::Piece;
+use Tie::IxHash;
+
+#----------------------
+# check file arguments:
+#----------------------
+
+# arg0 infile:   vrt-xml
+# arg1 outfile:  tei
+
+# Exactly two arguments are required: arg0 is the vrt-xml input file, arg1 the
+# tei output file.  Any other number of arguments prints the usage text and exits.
+usage_message() unless @ARGV == 2;
+
+my ($infile, $outfile) = @ARGV;
+
+####################
+# GLOBAL VARIABLES
+####################
+
+# this $encoding is ONLY for the output, see the XML::Twig constructor below
+my $encoding = "UTF-8";
+#my $encoding = "iso-8859-1";
+
+
+#####################
+#     M A I N
+#####################
+
+# Open the result file and initialise the filehandle.  The three-argument form
+# keeps characters of the file name from being read as part of the open mode.
+# The file is opened in append mode, as before: an existing file is extended,
+# not truncated.  $OUT is the handle the text() handler flushes the twig to.
+open(my $OUT, '>>', $outfile)
+    or die("cannot open file: $outfile: $!");
+
+#-----------------------------------------------------------------------------------
+# start twig and call start tag handler for root and twig handler for each <text>
+#-----------------------------------------------------------------------------------
+
+my $twig = XML::Twig->new(
+    keep_spaces        => 1,            # keeps whitespace at former element boundaries in the output
+    keep_atts_order    => 1,            # requires Tie::IxHash
+    pretty_print       => 'indented',
+    start_tag_handlers => {
+        texts => \&root,
+    },
+    twig_handlers      => {
+        text => \&text,
+    },
+    # dtd_handlers =>   {       # ToDo for I5
+    #   \&set_dtd;
+    # }
+    output_encoding    => $encoding,
+);
+
+$twig->parsefile($infile);
+
+# errors of buffered writes only show up when the handle is closed
+close($OUT) or die("cannot close file: $outfile: $!");
+
+
+###########
+# END MAIN
+###########
+
+
+
+
+##############################
+#   S U B R O U T I N E S
+##############################
+
+# sub set_dtd {
+#    my ($twig, $dtd) = @_;
+#    my $internal = qq|\nPUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"\n"DTD/xhtml1-strict.dtd"|;
+#
+#    $twig->twig_doctype('html', undef, undef, $internal);
+#    }
+
+
+
+# Start-tag handler for <texts>: turns the root element into the TEI corpus root.
+sub root {
+    my ($parser, $corpus) = @_;                # $parser is not used here
+    # rename the element, declare the TEI namespace, then add the corpus header
+    $corpus->set_gi('teiCorpus');
+    $corpus->set_att(xmlns => 'http://www.tei-c.org/ns/1.0');
+    insertCorpusHeader($corpus);
+}
+
+
+# Builds the corpus-level <teiHeader> as first child of the given root
+# element: a fileDesc with titleStmt/title, publicationStmt/distributor and
+# sourceDesc/bibl, each of the three leaf elements carrying a fixed text.
+sub insertCorpusHeader{
+    my ($root) =@_;
+    my $header_elt = $root->insert_new_elt(first_child => 'teiHeader');
+    my $desc_elt   = $header_elt->insert_new_elt(last_child => 'fileDesc');
+
+    # the three sections are appended in this order, each with one child
+    $desc_elt->insert_new_elt(last_child => 'titleStmt')
+             ->insert_new_elt(last_child => 'title')
+             ->set_text(" KLK-fi-2021 for EuReCo");
+    $desc_elt->insert_new_elt(last_child => 'publicationStmt')
+             ->insert_new_elt(last_child => 'distributor')
+             ->set_text("NOT FOR DISTRIBUTION - to be used locally in EuReCo");
+    $desc_elt->insert_new_elt(last_child => 'sourceDesc')
+             ->insert_new_elt(last_child => 'bibl')
+             ->set_text("ToDo");
+}
+    
+    
+#----------------------------
+# handler &text for <text>
+#----------------------------
+
+# Twig handler for <text>: turns one VRT <text> into a <TEI> element and
+# writes it out.
+#   $twig : the XML::Twig object driving the parse
+#   $text : the completely parsed <text> element
+# The <text> attributes are moved into a new <teiHeader>, the content is
+# wrapped in <text>/<body>/<div type="page">, and <paragraph>, <sentence>
+# and the token lines become <p>, <s> and <w>.  Finally the twig is flushed
+# to the file-level handle $OUT.  Returns whatever flush() returns.
+sub text {
+    my ($twig, $text) = @_;
+
+    # ToDo: catch all other, unexpected children
+
+    #--------------------------------------------------------------------------
+    # Get text metadata (attributes of <text>) and create teiHeader for <text>
+    #--------------------------------------------------------------------------
+
+    my $textattsref = $text->atts();               # reference to the attribute hash, use with '->'
+
+    createTextHeader($text, $textattsref);
+
+    #--------------------------
+    # create <TEI> from <text>
+    #--------------------------
+
+    # rename vrt <text> to <TEI> and delete all attributes (they were saved above)
+    $text->del_atts;
+    $text->set_gi("TEI");
+
+    #------------------------------------------------------------------
+    # create the <tei:text>, <body>, <div> elements inside <TEI>
+    #------------------------------------------------------------------
+
+    my $ttext_element = XML::Twig::Elt->new('text');
+    my $body_element  = XML::Twig::Elt->new('body');
+    my $div_element   = XML::Twig::Elt->new('div');
+
+    $div_element->set_att("type", "page");                          # ToDo: this is specific to KLK
+
+    # nest them: TEI > text > body > div
+    $ttext_element->paste('last_child',  $text);
+    $body_element ->paste('last_child',  $ttext_element);
+    $div_element  ->paste('last_child',  $body_element);
+
+    #-------------------------------
+    # create <p> from <paragraph>
+    #-------------------------------
+
+    my @paragraphs = $text->children('paragraph');
+
+    foreach my $paragraph (@paragraphs) {
+        setP($paragraph);
+        $paragraph->move('last_child', $div_element);
+
+        #------------------------------
+        # create <s> from <sentence>
+        #------------------------------
+
+        foreach my $sentence ($paragraph->children('sentence')) {
+
+            setS($sentence);
+
+            # One token per line.  text() returns the unescaped characters;
+            # xml_text() returns them already XML-escaped, and they would be
+            # escaped a second time on output ("&amp;" -> "&amp;amp;").
+            my @lines = split(/\n+/, $sentence->text);
+            $sentence->set_text("\n");
+
+            # <w> elements are appended in input order; empty and
+            # whitespace-only lines carry no token and are skipped
+            for my $line (@lines) {
+                next unless $line =~ /\S/;
+                my $w_element = XML::Twig::Elt->new('w');
+                createW($w_element, $line);
+                $w_element->paste('last_child', $sentence);
+            }
+        }
+    }
+
+    $twig->flush($OUT);
+}
+
+
+# Creates the <teiHeader> of one text and pastes it as first child of $text.
+#
+#   $text        : the element that becomes <TEI> (the vrt <text>)
+#   $textattsref : reference to the hash of the attributes of the vrt <text>
+#
+# The header is built according to the skeleton in klk-header.tei.xml: fileDesc,
+# encodingDesc, profileDesc and revisionDesc.  Attributes that are missing in
+# the input are treated as empty strings, so no undefined value is written.
+sub createTextHeader{
+    my ($text, $textattsref) = @_;
+    # attributes of the vrt <text> with example values; USE marks the ones used here:
+    # USE 01 binding_id="2246025"
+    # USE 02 date="2021-01-15"
+    #     03 datefrom="20210115"
+    #     04 dateto="20210115"
+    #     05 elec_date="_"
+    #     06 file=""
+    # USE 07 filename_metadata="finclarin_siirto_k2021/0039-5552/2021/alto/2246025_0039-5552_2021-01-15_SK0221_mets.xml"
+    # USE 08 filename_orig    ="finclarin_siirto_k2021/0039-5552/2021/alto/2246025_0039-5552_2021-01-15_SK0221_page-2021011502210030301.xml"
+    # USE 09 id="t-bcd0f3fa-bbd3dac4"
+    #     10 img_url=""
+    # USE 11 issue_date="15.01.2021"
+    # USE 12 issue_no="SK0221"
+    # USE 13 issue_title="Suomen Kuvalehti"
+    # USE 14 label="Suomen Kuvalehti no. SK0221 15.01.2021"
+    # USE 16 language="fi"
+    # USE 17 page_id="p1"
+    # USE 18 page_no="None"
+    #     19 part_name="_"
+    #     20 publ_id="0039-5552"
+    #     21 publ_part=""
+    # USE 22 publ_title="Suomen Kuvalehti"
+    # USE 23 publ_type="aikakausi"
+    # USE 24 sentcount="70"
+    # USE 25 sum_lang="|xxx:44|fin:23|eng:3|"
+    #     26 timefrom="000000"
+    #     27 timeto="235959"
+    # USE 28 tokencount="304"
+    #     29 version_added="KLK-fi-2021">
+
+    my $BID          = $textattsref->{'binding_id'}        // '';
+    my $DATE         = $textattsref->{'date'}              // '';
+    my $METAFILENAME = $textattsref->{'filename_metadata'} // '';
+    my $ORIGFILENAME = $textattsref->{'filename_orig'}     // '';
+    my $ID           = $textattsref->{'id'}                // '';
+    my $ISSUEDATE    = $textattsref->{'issue_date'}        // '';
+    my $ISSUENO      = $textattsref->{'issue_no'}          // '';
+    my $ISSUETITLE   = $textattsref->{'issue_title'}       // '';
+    my $LABEL        = $textattsref->{'label'}             // '';
+    my $LANGUAGE     = $textattsref->{'language'}          // '';
+    my $PAGEID       = $textattsref->{'page_id'}           // '';
+    my $PAGENO       = $textattsref->{'page_no'}           // '';
+    my $PUBLTITLE    = $textattsref->{'publ_title'}        // '';
+    my $PUBLTYPE     = $textattsref->{'publ_type'}         // '';
+    my $SENTCOUNT    = $textattsref->{'sentcount'}         // '';
+    my $SUMLANG      = $textattsref->{'sum_lang'}          // '';
+    my $TOKENCOUNT   = $textattsref->{'tokencount'}        // '';
+
+    #-----------------------------
+    # Derived Metadata variables
+    #-----------------------------
+
+    my @datearray = split("-", $DATE);               # year, month, day
+    my @namearray = split(/[\.\/]/, $ORIGFILENAME);  # use $namearray[4] as ID for the page
+    # $SUMLANG is used unsplit below.  If it is split one day, the pattern must be
+    # /\|/: a bare "|" is an empty alternation and splits between all characters.
+
+    #-----------------------------------------------------------------------
+    # CREATE text-teiHeader ACCORDING TO THE SKELETON in klk-header.tei.xml
+    #-----------------------------------------------------------------------
+
+    # create <teiHeader> inside <TEI>
+    my $teiHeader = XML::Twig::Elt->new('teiHeader');
+    $teiHeader->paste('first_child', $text);
+
+    ## insert_new_elt is a combo of new and paste, cf. xml::twig docu:
+    ## insert_new_elt ($opt_position, $gi, $opt_atts_hashref, @opt_content)
+
+    my $fileDesc     = $teiHeader->insert_new_elt("first_child", 'fileDesc'            => {n => "EuReCo_KLK-fi_" . ($namearray[4] // '')});
+    my $encodingDesc = $teiHeader->insert_new_elt("last_child", 'encodingDesc');
+    my $profileDesc  = $teiHeader->insert_new_elt("last_child", 'profileDesc');
+    my $revisionDesc = $teiHeader->insert_new_elt("last_child", 'revisionDesc');
+
+    #---------------------
+    # fileDesc/titleStmt
+    #---------------------
+    my $titleStmt = $fileDesc ->insert_new_elt("first_child", 'titleStmt');
+    my $title     = $titleStmt->insert_new_elt("last_child", 'title');
+    my $respStmt  = $titleStmt->insert_new_elt("last_child", 'respStmt');
+    my $resp      = $respStmt ->insert_new_elt("last_child", 'resp');
+    my $name      = $respStmt ->insert_new_elt("last_child", 'name');
+
+    # set texts for titleStmt
+    $title->set_text($LABEL . ", page " . $PAGENO);    # caution - PAGENO mostly seems to be "None"
+    $resp ->set_text("compiled by EuReCo");
+    $name ->set_text("EuReCo: HL");
+
+    #--------------------------
+    # fileDesc/publicationStmt
+    #--------------------------
+    my $publicationStmt = $fileDesc       ->insert_new_elt("last_child", 'publicationStmt');
+    my $distributor     = $publicationStmt->insert_new_elt("last_child", 'distributor');
+    my $note            = $distributor    ->insert_new_elt("last_child", 'note');
+    my $availability    = $publicationStmt->insert_new_elt("last_child", 'availability');
+    my $licence         = $availability   ->insert_new_elt("last_child", 'licence');
+
+    # set texts for publicationStmt
+    $note   ->set_text("NOT FOR DISTRIBUTION - to be used locally in EuReCo");
+    $licence->set_text("CLARIN-RES");  # TODO: more detailed licence info is in the KLK metadata record
+
+    #------------------------------
+    # fileDesc/sourceDesc/biblStruct
+    #------------------------------
+    my $sourceDesc = $fileDesc  ->insert_new_elt("last_child", 'sourceDesc');
+    my $biblStruct   = $sourceDesc->insert_new_elt("last_child", 'biblStruct');
+
+    # fileDesc/sourceDesc/biblStruct/analytic
+    my $analytic                = $biblStruct->insert_new_elt("last_child", 'analytic');
+    my $analytic_title          = $analytic->insert_new_elt("last_child", 'title'        => {type => "main"} );
+#    my $analytic_date           = $analytic->insert_new_elt("last_child", 'date');
+    my $analytic_date_year      = $analytic->insert_new_elt("last_child", 'date'         => {type => "year"});
+    my $analytic_date_month     = $analytic->insert_new_elt("last_child", 'date'         => {type => "month"});
+    my $analytic_date_day       = $analytic->insert_new_elt("last_child", 'date'         => {type => "day"});
+    my $analytic_idno_pageid    = $analytic->insert_new_elt("last_child", 'idno'         => {type => "PAGEID"});
+    my $analytic_idno_bindingid = $analytic->insert_new_elt("last_child", 'idno'         => {type => "BINDINGID"});
+    my $analytic_idno_id        = $analytic->insert_new_elt("last_child", 'idno'         => {type => "ID"});
+    my $analytic_idno_metafile  = $analytic->insert_new_elt("last_child", 'idno'         => {type => "KIELIPANKKI_METAFILENAME"});
+    my $analytic_idno_origfile  = $analytic->insert_new_elt("last_child", 'idno'         => {type => "KIELIPANKKI_ORIGFILENAME"});
+    my $analytic_textlang       = $analytic->insert_new_elt("last_child", 'textLang');
+
+    # set texts for analytic
+    $analytic_title         ->set_text($LABEL . ", page " . $PAGENO);  # caution - PAGENO mostly seems to be "None"
+#    $analytic_date         ->set_text($DATE);
+    $analytic_date_year     ->set_text($datearray[0] // '');
+    $analytic_date_month    ->set_text($datearray[1] // '');
+    $analytic_date_day      ->set_text($datearray[2] // '');
+    $analytic_idno_pageid   ->set_text($PAGEID);
+    $analytic_idno_bindingid->set_text($BID);
+    $analytic_idno_id       ->set_text($ID);
+    $analytic_idno_metafile ->set_text($METAFILENAME);
+    $analytic_idno_origfile ->set_text($ORIGFILENAME);
+    $analytic_textlang      ->set_text($LANGUAGE);
+
+    #-------------------------------------
+    # fileDesc/sourceDesc/biblStruct/monogr
+    #-------------------------------------
+    my $monogr               = $biblStruct->insert_new_elt("last_child", 'monogr');
+    my $monogr_title         = $monogr    ->insert_new_elt("last_child", 'title');
+    my $imprint              = $monogr    ->insert_new_elt("last_child", 'imprint');   # imprint is needed for validity
+    my $pubPlace             = $imprint   ->insert_new_elt("last_child", 'pubPlace');  # imprint is needed for validity
+    my $publisher            = $imprint   ->insert_new_elt("last_child", 'publisher'); # imprint is needed for validity
+    my $biblScope_issuetitle = $monogr    ->insert_new_elt("last_child", 'biblScope'   => {unit => 'ISSUETITLE'} );
+    my $biblScope_issueno    = $monogr    ->insert_new_elt("last_child", 'biblScope'   => {unit => 'ISSUENO'} );
+    my $biblScope_issuedate  = $monogr    ->insert_new_elt("last_child", 'biblScope'   => {unit => 'ISSUEDATE'} );
+    my $biblScope_pp         = $monogr    ->insert_new_elt("last_child", 'biblScope'   => {unit => 'PAGENO'} );      # caution - PAGENO mostly seems to be "None"
+
+    # set texts for monogr
+    $monogr_title        ->set_text($PUBLTITLE);
+    $pubPlace            ->set_text("TODO");
+    $publisher           ->set_text("TODO");
+    $biblScope_issuetitle->set_text($ISSUETITLE);
+    $biblScope_issueno   ->set_text($ISSUENO);
+    $biblScope_issuedate ->set_text($ISSUEDATE);
+    $biblScope_pp        ->set_text($PAGENO);
+
+    #---------------
+    # encodingDesc
+    #---------------
+    my $tagsDecl     = $encodingDesc->insert_new_elt("last_child", 'tagsDecl');
+    my $namespace    = $tagsDecl    ->insert_new_elt("last_child", 'namespace'    => {name => 'http://www.tei-c.org/ns/1.0'});
+    $namespace->insert_new_elt("last_child", 'tagUsage'     => {gi   => 's', occurs => $SENTCOUNT});
+    $namespace->insert_new_elt("last_child", 'tagUsage'     => {gi   => 'w', occurs => $TOKENCOUNT});
+
+    #-------------
+    # profileDesc
+    #-------------
+    my $langUsage   = $profileDesc ->insert_new_elt("last_child", 'langUsage');
+    $langUsage->insert_new_elt("last_child", 'language'      => {ident => $LANGUAGE, usage => $SUMLANG});
+    # caution: @usage should really hold an integer; best to break down SUMLANG and create several <language>
+    my $textClass   = $profileDesc ->insert_new_elt("last_child", 'textClass');
+    my $classCode_fi   = $textClass   ->insert_new_elt("last_child", 'classCode' => {scheme       => "KLK_PUBLTYPE"});
+#    my $classCode_en   = $textClass   ->insert_new_elt("last_child", 'classCode' => {scheme      => "KLK_PUBLTYPE_MAPPED"});
+
+    #---------------------------
+    # set texts for profileDesc
+    #---------------------------
+    $classCode_fi ->set_text($PUBLTYPE);
+#    $classCode_en->set_text($PUBLTYPETRANSL);
+
+    #---------------
+    # revisionDesc
+    #---------------
+    my $change      = $revisionDesc ->insert_new_elt("last_child", 'change'       => {when => localtime->ymd('-'), who => 'HL'  });
+
+    # set texts for revisionDesc
+    $change->set_text("TEI version for EuReCo");
+}
+                                                                                                                                                                                          
+# Turns a vrt <paragraph> into a TEI <p>.
+#   @id       becomes @xml:id
+#   @sum_lang becomes @xml:lang, prefixed with "x-" to mark a private value,
+#             e.g. sum_lang="|fin:1|" gives xml:lang="x-|fin:1|"
+sub setP {
+    my ($para_elt) = @_;
+
+    my $lang_summary = $para_elt->att("sum_lang");
+
+    $para_elt->set_gi('p');
+    $para_elt->set_att("xml:lang" => "x-" . $lang_summary);
+    $para_elt->del_att("sum_lang");
+    $para_elt->change_att_name('id', 'xml:id');
+}
+# Turns a vrt <sentence> into a TEI <s>.
+#   @id        becomes @xml:id
+#   @lang      becomes @xml:lang, the value is copied unchanged (e.g. "fin")
+#              ToDo: convert the value via a lookup hash ("fin", "xxx", ...)
+#   @lang_conf is dropped for the time being (ToDo: @cert ?)
+sub setS {
+    my ($sent_elt) = @_;
+
+    my $lang_code = $sent_elt->att("lang");
+
+    $sent_elt->set_gi('s');
+    $sent_elt->set_att("xml:lang" => $lang_code);
+    $sent_elt->change_att_name('id', 'xml:id');
+
+    # @lang was replaced by @xml:lang above; both old attributes go in one call
+    $sent_elt->del_att("lang", "lang_conf");
+}
+
+# Fills a <w> element from one token line of a vrt <sentence>.
+#
+#   $w_element : the <w> element to fill
+#   $line      : one token line, columns separated by tabs
+#
+# Column [0] becomes the text of <w>, the columns listed in @att_columns
+# become attributes.  A column that is missing in a short line is written as
+# an empty string, so no undefined value reaches XML::Twig.
+sub createW {
+    my ($w_element, $line) = @_;
+
+    # vrt positional-attributes in corpus KLK:
+    #  USE [0] word
+    #  USE [1] ref  (id for reference of dephead)
+    #  USE [2] lemma
+    #  ?   [3] lemmacomp   (lemma with compound info - could go in @norm, as tag abuse?)
+    #  USE [4] pos
+    #  USE [5] msd
+    #  USE [6] dephead
+    #  USE [7] deprel
+    #      [8] content   (ocr-process)
+    #      [9] vpos      (ocr-process)
+    #     [10] ocr       (ocr-process)
+    #     [11] cc        (ocr-process)
+    #     [12] hyph      (ocr-process)
+    #     [13] style     (ocr-process)
+    #     [14] lex       (korp semantic disambiguation from G"oteborg)
+
+    # attribute name and column index; the attributes are set in this order
+    my @att_columns = (['n', 1], ['lemma', 2], ['pos', 4], ['msd', 5], ['head', 6], ['deprel', 7]);
+
+    my @tags = split(/\t/, $line);
+
+    # set content of <w> i.e. the token
+    $w_element->set_text($tags[0] // '');
+
+    # No xml:id is set: an ID built from page name, sentence id and ref is not unique either.
+    for my $att_column (@att_columns) {
+        my ($att_name, $column) = @{$att_column};
+        $w_element->set_att($att_name, $tags[$column] // '');
+    }
+}
+    
+#################
+## usage_message
+#################
+# Prints the usage text to STDERR and terminates with exit status 1, so
+# that a wrong invocation is visible to calling shell scripts.
+sub usage_message {
+    print STDERR "   Usage:  $0 <file.vrt.xml> <outfile>\n";
+    print STDERR "   <file.vrt.xml> is a VRT file converted to proper XML\n";
+    exit 1;
+}
+
+