|  | #! /usr/bin/perl -w | 
|  |  | 
|  |  | 
|  | ########################################################################################################################################################### | 
|  | # vrt2tei.pl | 
|  | # eureco | 
|  | # leibniz-institut fuer deutsche sprache / csc finland esbo | 
|  | # august 2024 | 
|  | # | 
|  | # | 
|  | # using XML::Twig , see http://www.xmltwig.org/, https://metacpan.org/pod/XML::Twig | 
|  | # | 
|  | # usage: see the usage_message function below | 
|  | # Usage:  ./vrt2tei.pl <vrtxmlfile.xml> > <outfile> | 
|  | #         <vrtxmlfile>: xml-ised vrt file (the TEI output is written to STDOUT) | 
|  | # | 
|  | # | 
|  | # TODO: | 
|  | # 1  insert dtd spec, or ref to TEI | 
|  |  | 
|  | # 3a remove the vrt positional attribute comment line / all comment lines | 
|  | # 3b add @head and @deprel to I5 sowie auch @msd | 
|  | # 3c bearbeitung von @head und @deprel in tei2korapxml durch Nils? | 
|  | # 3d build 30 billion corpus | 
|  |  | 
|  | # 4a take care of IDs | 
|  | # 4b see to the values of @xml:lang | 
|  | # 5 abfangen von unerwarteten elementen dh andere als <sentence> und <paragraph> | 
|  | # 5a wort reihenfolge nochmal checken | 
|  | # 6 checks and balances | 
|  | # 7  How to encode Kielipankki and National Library of Finland? in teiCorpus Header | 
|  | # 8  construct <idsDoc>s for the months (or go for TEI) | 
|  | # 9  parallelisation in bash and application on sub corpora of KLK | 
|  | # 10  re-implementation of the gawk code in the perl script | 
|  | # 12  re-implement creation of text header from xml file in another twig / parametrize TEI vs I5 | 
|  |  | 
|  |  | 
|  |  | 
|  | #remember | 
|  | #formatted.xml:105613: element w: validity error : No declaration for attribute deprel of element w | 
|  | #formatted.xml:105613: element w: validity error : No declaration for attribute head of element w | 
|  |  | 
|  |  | 
|  | # | 
|  | # | 
|  | ############################################################################################################################################################ | 
|  |  | 
|  |  | 
|  | use strict; | 
|  | use warnings; | 
|  |  | 
|  | use XML::Twig; | 
|  | use XML::Generator ':pretty';  # apparently no effect when using flush(); | 
|  |  | 
|  |  | 
|  | use locale;                 # diese drei Zeilen, damit \b im regex nicht Umlaute und ß matcht. | 
|  | use POSIX qw(locale_h);     # to be able to use setlocale() | 
|  | #setlocale(LC_ALL,'de_DE'); | 
|  | setlocale(LC_ALL, "fi_FI"); | 
|  | use utf8; | 
|  | use open qw( :std :encoding(UTF-8) ); | 
|  |  | 
|  | use Time::Piece; | 
|  | use Tie::IxHash; | 
|  |  | 
|  |  | 
|  |  | 
|  | #---------------------- | 
|  | # check file arguments: | 
|  | #---------------------- | 
|  |  | 
|  | # arg0 infile:   vrt-xml | 
|  |  | 
# Exactly one non-empty argument is accepted: the VRT-XML input file.
# The argument count is tested instead of the truth of $ARGV[0] / $ARGV[1],
# because a plain truth test misjudges arguments such as "0".
usage_message() unless @ARGV == 1 && length $ARGV[0];


####################
# GLOBAL VARIABLES
####################

my $encoding = "UTF-8";
#my $encoding = "iso-8859-1";             # this $encoding applies ONLY to the output, see the XML::Twig constructor below
my $textcounter = 0;                      # incremented by text() for every <text> element



my $twig="";                              # assigned the XML::Twig object for the input document in MAIN
my $teiCorpusHeaderDoc="";                # NOTE(review): not referenced anywhere in the code visible here
|  |  | 
|  |  | 
|  | #------------------------------------------------------------------ | 
|  | # read corpusHeaderSkeleton document and get header out of it | 
|  | #------------------------------------------------------------------ | 
|  |  | 
# Both skeleton documents are parsed with the same options: whitespace and
# attribute order are kept, comments are dropped.
my %skeletonTwigOptions = (
    comments        => 'drop',
    keep_atts_order => 1,
    keep_spaces     => 1,
);

# corpus level: the root element of the skeleton document is used as the
# header of the output corpus (see insertCorpusHeader)
my $teiCorpusHeaderDocTwig = XML::Twig->new(%skeletonTwigOptions);
$teiCorpusHeaderDocTwig->parsefile("teiCorpusHeaderSkeleton.tei.xml");
my $corpusHeader = $teiCorpusHeaderDocTwig->root;


#------------------------------------------------------------------
# read textHeaderSkeleton document and get header out of it
#------------------------------------------------------------------

# text level: the root element of the skeleton document is used as the
# header of every text (see createTextHeader)
my $teiTextHeaderDocTwig = XML::Twig->new(%skeletonTwigOptions);
$teiTextHeaderDocTwig->parsefile("teiTextHeaderSkeleton.tei.xml");
my $textHeader = $teiTextHeaderDocTwig->root;
|  |  | 
|  |  | 
|  | #---------------------------------- | 
|  | # read input VRT-XML document | 
|  | #---------------------------------- | 
|  |  | 
# Fail early, and with the system error text, when the input file cannot be
# read.  The handle is closed again right away: the document itself is read
# through parsefile(), which is given the file name.
# The three-argument form of open keeps characters in the file name from
# being interpreted as part of the open mode.
open(my $IN, '<', $ARGV[0]) or die("$0: cannot open file for reading: $ARGV[0]: $!");
close($IN);

#-----------------------------------------------------
# global variables pertaining to the original corpus
#-----------------------------------------------------

my $kielipankkiCorpus = "klk-fi-v2-vrt";   # name of the source corpus, used in the corpus header texts
|  |  | 
|  |  | 
|  |  | 
|  |  | 
|  | ##################### | 
|  | #     M A I N | 
|  | ##################### | 
|  |  | 
|  | #------------------------------------------------------------------------------------------------------------- | 
|  | # start twig for input and call start tag handler for root and twig handler for each <text> in the VRT | 
|  | #------------------------------------------------------------------------------------------------------------- | 
|  |  | 
|  |  | 
$twig = XML::Twig->new(
    output_encoding => $encoding,
    comments        => 'drop',
    keep_atts_order => 1,       # requires Tie::IxHash
    keep_spaces     => 1,       # this also puts whitespace at former element boundaries into the output

    # start tag of the root element <texts>: rename it and insert the corpus header
    start_tag_handlers => {
        texts => sub { root(@_, $corpusHeader) },
    },

    # every completely parsed <text>: convert it to <TEI> and flush it
    twig_handlers => {
        # text => \&text
        text => sub { text(@_, $textHeader) },
    },

    # dtd_handlers =>   {       # ToDo for I5
    #     \&set_dtd;
    # }
);

$twig->parsefile($ARGV[0]);
|  |  | 
|  |  | 
|  |  | 
|  |  | 
|  |  | 
|  | ########### | 
|  | # END MAIN | 
|  | ########### | 
|  |  | 
|  |  | 
|  |  | 
|  |  | 
|  | ############################## | 
|  | #   S U B R O U T I N E S | 
|  | ############################## | 
|  |  | 
|  | # sub set_dtd { | 
|  | #    my ($twig, $dtd) = @_; | 
|  | #    my $internal = qq|\nPUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"\n"DTD/xhtml1-strict.dtd"|; | 
|  | # | 
|  | #    $twig->twig_doctype('html', undef, undef, $internal); | 
|  | #    } | 
|  |  | 
|  |  | 
|  |  | 
# root: start-tag handler for the root element of the VRT-XML input.
#
#   $twig          the XML::Twig object (not used here)
#   $root          the root element; it is renamed to <teiCorpus> and gets
#                  the TEI namespace declaration
#   $corpusHeader  corpus header element handed on to insertCorpusHeader()
#
# Returns whatever insertCorpusHeader() returns.
sub root {
    my $twig         = shift;
    my $root         = shift;
    my $corpusHeader = shift;

    $root->set_gi('teiCorpus');
    $root->set_att("xmlns", 'http://www.tei-c.org/ns/1.0');

    insertCorpusHeader($root, $corpusHeader);
}
|  |  | 
|  |  | 
|  |  | 
# insertCorpusHeader: fill the corpus header with metadata derived from the
# input file name and paste it as first child of the root element.
#
#   $root          root element of the output document
#   $corpusHeader  corpus header element (from the header skeleton)
#
# The input file name ($ARGV[0]) is expected to look like
# <source><yyyy>.xml; <source> and <yyyy> go into the title and the source
# description.  Returns the result of the paste() call.
sub insertCorpusHeader {
    my ($root, $corpusHeader) = @_;

    #---------------------------------------------------------------------------
    # get some metadata for the current output corpus based on source and year
    #---------------------------------------------------------------------------

    # last path component of the input file name
    my $source = (split(/\//, $ARGV[0]))[-1];
    $source = "" unless defined $source;

    # $1 is only valid after a successful match, so it is read inside the
    # condition; otherwise a stale or undefined value would end up in the header.
    my $yy = "";
    if ($source =~ s/([0-9][0-9][0-9][0-9])\.xml$//) {
        $yy = $1;
    }
    else {
        warn "$0: cannot derive a year from input file name '$source'\n";
    }

    #-----------------------
    # set corpus header
    #-----------------------

    set_title(     $corpusHeader, $source, $yy, $kielipankkiCorpus);    # to do: also get name of corpus (klk-fi-v2-vrt)
    set_sourceDesc($corpusHeader, $source, $yy, $kielipankkiCorpus);

    return $corpusHeader->paste("first_child", $root);
}
|  |  | 
|  |  | 
|  | #---------------------------- | 
|  | # handler &text for <text> | 
|  | #---------------------------- | 
|  |  | 
# text: twig handler, called for every completely parsed <text> element.
#
#   $twig        the XML::Twig object of the input document
#   $text        the <text> element; it is converted in place into <TEI>
#   $textHeader  text header skeleton, handed on to createTextHeader()
#
# Steps: create the header from the attributes of <text>, rename <text> to
# <TEI>, wrap the content in <text>/<body>/<div type="page">, turn
# <paragraph> into <p>, <sentence> into <s> and every token line into <w>,
# then flush what has been built so far to the selected output handle.
sub text {
    my ($twig, $text, $textHeader) = @_;

    $textcounter++;               # global variable

    # ToDo: catch all other, unexpected children of root

    #--------------------------------------------------------------------------
    # Get text metadata (attributes of <text>) and create teiHeader for <text>
    #--------------------------------------------------------------------------

    my $textattsref = $text->atts();               # reference to the attribute hash, to be used with '->'

    createTextHeader($text, $textattsref, $textHeader);

    #--------------------------
    # create <TEI> from <text>
    #--------------------------

    # the attributes have been evaluated above, so they can be deleted now
    $text->del_atts;
    $text->set_gi("TEI");

    #------------------------------------------------------------------
    # create the <tei:text>, <body>, <div> elements inside <TEI>
    #------------------------------------------------------------------

    my $ttext_element = XML::Twig::Elt->new('text');
    my $body_element  = XML::Twig::Elt->new('body');
    my $div_element   = XML::Twig::Elt->new('div');

    $div_element  ->set_att("type", "page");                          # ToDo: this is specific to KLK
    $ttext_element->set_att("xml:lang", 'fi');                        # as in ICC-NOR

    $ttext_element->paste('last_child',  $text);
    $body_element ->paste('last_child',  $ttext_element);
    $div_element  ->paste('last_child',  $body_element);

    #-------------------------------
    # create <p> from <paragraph>
    #-------------------------------

    my @paragraphs = $text->children('paragraph');

    foreach my $paragraph (@paragraphs) {

        setP($paragraph);

        $paragraph->move('last_child', $div_element);

        #------------------------------
        # create <s> from <sentence>
        #------------------------------

        foreach my $sentence ($paragraph->children('sentence')) {

            setS($sentence);

            #--------------------------------------
            # create <w> (word) from each $line
            #--------------------------------------

            # text() returns the unescaped character data.  xml_text() would
            # return it with & and < already escaped, and set_text()/set_att()
            # escape once more on output (e.g. "&" would come out as
            # "&amp;amp;").
            my @lines = split(/\n+/, $sentence->text);
            $sentence->set_text("\n");

            for my $line (@lines) {                            # ToDo: double-check the word order
                next if $line eq "";

                my $w_element = XML::Twig::Elt->new('w');
                createW($w_element, $line);
                $w_element->paste('last_child', $sentence);
            } # end words
        } # end sentences
    } # end paragraphs

    # $twig->set_pretty_print( 'record');

    # flush() takes an optional filehandle followed by options, not a file
    # name; without arguments it prints to the currently selected handle
    # (STDOUT unless select() was called).
    $twig->flush;
}
|  |  | 
# createTextHeader: insert a header for one text as first child of $text and
# fill it with the metadata found in the attributes of the VRT <text>.
#
#   $text         the element that receives the header as its first child
#   $textattsref  reference to the attribute hash of the VRT <text>
#   $textHeader   text header skeleton; it is copied for every text, the
#                 skeleton itself is left untouched
#
# Reads the global $textcounter for the running number in the titles.
# Dies with a message naming the missing node when the skeleton lacks one
# of the elements that are filled in here.
sub createTextHeader {
    my ($text, $textattsref, $textHeader) = @_;

    # USE 01 binding_id="2246025"
    # USE 02 date="2021-01-15"
    #     03 datefrom="20210115"
    #     04 dateto="20210115"
    #     05 elec_date="_"
    #     06 file=""
    # USE 07 filename_metadata="finclarin_siirto_k2021/0039-5552/2021/alto/2246025_0039-5552_2021-01-15_SK0221_mets.xml"
    # USE 08 filename_orig    ="finclarin_siirto_k2021/0039-5552/2021/alto/2246025_0039-5552_2021-01-15_SK0221_page-2021011502210030301.xml
    # USE 09 id="t-bcd0f3fa-bbd3dac4"
    #     10 img_url=""
    # USE 11 issue_date="15.01.2021"
    # USE 12 issue_no="SK0221"
    # USE 13 issue_title="Suomen Kuvalehti"
    # USE 14 label="Suomen Kuvalehti no. SK0221 15.01.2021"
    # USE 16 language="fi"
    # USE 17 page_id="p1"
    # USE 18 page_no="None"
    #     19 part_name="_"
    #     20 publ_id="0039-5552"
    #     21 publ_part=""
    # USE 22 publ_title="Suomen Kuvalehti"
    # USE 23 publ_type="aikakausi"
    # USE 24 sentcount="70"
    # USE 25 sum_lang="|xxx:44|fin:23|eng:3|"
    #     26 timefrom="000000"
    #     27 timeto="235959"
    # USE 28 tokencount="304"
    #     29 version_added="KLK-fi-2021">

    # A missing attribute yields "" instead of undef, so that the string
    # operations and set_text() calls below never see an undefined value.
    my $att = sub {
        my ($name) = @_;
        my $value = $textattsref->{$name};
        return defined $value ? $value : "";
    };

    my $BID          = $att->('binding_id');
    my $DATE         = $att->('date');
    my $METAFILENAME = $att->('filename_metadata');
    my $ORIGFILENAME = $att->('filename_orig');
    my $ID           = $att->('id');
    my $LABEL        = $att->('label');
    my $LANGUAGE     = $att->('language');
    my $PAGEID       = $att->('page_id');

    # Not written to the header yet; kept for the parts that still have to
    # be ported from the ##TMP reference code at the end of this sub.
    my $ISSUEDATE    = $att->('issue_date');
    my $ISSUENO      = $att->('issue_no');
    my $ISSUETITLE   = $att->('issue_title');
    my $PAGENO       = $att->('page_no');
    my $PUBLTITLE    = $att->('publ_title');
    my $PUBLTYPE     = $att->('publ_type');
    my $SENTCOUNT    = $att->('sentcount');
    my $SUMLANG      = $att->('sum_lang');
    my $TOKENCOUNT   = $att->('tokencount');

    #-----------------------------
    # Derived Metadata variables
    #-----------------------------

    # year, month, day; padded so that all three parts are always defined
    my @datearray = split(/-/, $DATE);
    push(@datearray, "") while @datearray < 3;

    # the first argument of split is a pattern: an unescaped "|" is an empty
    # alternation and would split the string into single characters
    my @langarray = grep { length } split(/\|/, $SUMLANG);
    my @namearray = split(/[\.\/]/, $ORIGFILENAME);  # use $namearray[4] as ID for the page

    #-----------------------------------------------------------------------
    # CREATE text-teiHeader ACCORDING TO THE SKELETON in $textHeader
    #-----------------------------------------------------------------------

    # Every text gets its own copy of the skeleton.  Pasting the one
    # skeleton element again and again would rely on XML::Twig internals:
    # paste() is documented to die for an element that already belongs to
    # a tree.
    my $header = $textHeader->copy;
    $header->paste('first_child', $text);

    # set the text of a skeleton node, with a readable error if it is missing
    my $fill = sub {
        my ($node, $description, $value) = @_;
        die "$0: text header skeleton has no $description\n" unless defined $node;
        $node->set_text($value);
    };

    my $titleString = $LABEL . ", Text #" . $textcounter;    # Case KLK

    #-----------------------------------------------
    # <teiHeader>
    #   <fileDesc n="[EuReCo-KLK-FIN_$ID]">
    #     <titleStmt>
    #       <title>[$LABEL, page $PAGENO]</title>

    $fill->($header->get_xpath("./fileDesc/titleStmt/title", 0), "fileDesc/titleStmt/title", $titleString);

    #-----------------------------------------------
    # <fileDesc>
    #  <sourceDesc>
    #   <biblStruct>
    #      <analytic>
    #         <title type="main">[$LABEL, page $PAGENO]</title>
    #         <date>[$DATE]</date>
    #         <date type="year">TODO</date>
    #         <date type="month">TODO</date>
    #         <date type="day">TODO</date>
    #         <idno type="PAGEID">$PAGEID</idno>
    #         <idno type="BINDINGID">$BID</idno>
    #         <idno type="ID">$ID</idno>
    #         <idno type="KIELIPANKKI_METAFILENAME">$METAFILENAME</idno>
    #         <idno type="KIELIPANKKI_ORIGFILENAME">$ORIGFILENAME</idno>
    #         <textLang>$LANGUAGE</textLang>
    #       </analytic>

    my $analytic = $header->get_xpath("./fileDesc/sourceDesc/biblStruct/analytic", 0);
    die "$0: text header skeleton has no fileDesc/sourceDesc/biblStruct/analytic\n" unless defined $analytic;

    $fill->($analytic->first_child("title"),    "analytic/title",    $titleString);
    $fill->($analytic->first_child("textLang"), "analytic/textLang", $LANGUAGE);

    my @typedNodes = (
        [ 'date', 'year',                     $datearray[0] ],
        [ 'date', 'month',                    $datearray[1] ],
        [ 'date', 'day',                      $datearray[2] ],
        [ 'idno', 'PAGEID',                   $PAGEID       ],
        [ 'idno', 'BINDINGID',                $BID          ],
        [ 'idno', 'ID',                       $ID           ],
        [ 'idno', 'KIELIPANKKI_METAFILENAME', $METAFILENAME ],
        [ 'idno', 'KIELIPANKKI_ORIGFILENAME', $ORIGFILENAME ],
    );
    for my $entry (@typedNodes) {
        my ($gi, $type, $value) = @{$entry};
        my $xpath = './' . $gi . '[@type="' . $type . '"]';
        $fill->($analytic->get_xpath($xpath, 0), "analytic/$gi with type $type", $value);
    }

    #  <monogr>
    #    <title>$PUBLTITLE</title>
    #    <imprint>
    #      <pubPlace>TODO</pubPlace>
    #        <publisher>TODO</publisher>
    #    </imprint>
    #    <biblScope unit="ISSUETITLE"/>
    #    <biblScope unit="ISSUENO"/>
    #    <biblScope unit="ISSUEDATE"/>
    #    <biblScope unit="pp">$PAGENO</biblScope>
    #  <monogr>

    # TODO: <monogr> is looked up but not filled in yet
    my $monogr = $header->get_xpath("./fileDesc/sourceDesc/biblStruct/monogr", 0);

    #-----------------------------------------------------------------------
    # Reference only: the former implementation, which built the header
    # element by element instead of using the skeleton.  It shows which
    # parts (monogr, encodingDesc, profileDesc, revisionDesc) are not yet
    # covered by the skeleton-based code above.
    #-----------------------------------------------------------------------

    ##TMP      # create <teiHeader> inside <TEI>
    ##TMP      my $teiHeader = XML::Twig::Elt->new('teiHeader');
    ##TMP      # $teiHeader->paste('first_child', $text);
    ##TMP
    ##TMP      ## insert_new_elt is a combo of new and paste, cf. xml::twig docu:
    ##TMP      ## insert_new_elt ($opt_position, $gi, $opt_atts_hashref, @opt_content)
    ##TMP
    ##TMP      my $fileDesc     = $teiHeader->insert_new_elt('fileDesc'                           => {n => "EuReCo_KLK-fi_" . $namearray[4]});
    ##TMP      my $encodingDesc = $teiHeader->insert_new_elt("last_child", 'encodingDesc');
    ##TMP      my $profileDesc  = $teiHeader->insert_new_elt("last_child", 'profileDesc');
    ##TMP      my $revisionDesc = $teiHeader->insert_new_elt("last_child", 'revisionDesc');
    ##TMP
    ##TMP      #---------------------
    ##TMP      # fileDesc/titleStmt
    ##TMP      #---------------------
    ##TMP      my $titleStmt = $fileDesc ->insert_new_elt('titleStmt');
    ##TMP      my $title     = $titleStmt->insert_new_elt("last_child", 'title');
    ##TMP      my $respStmt  = $titleStmt->insert_new_elt("last_child", 'respStmt');
    ##TMP      my $resp      = $respStmt ->insert_new_elt("last_child", 'resp');
    ##TMP      my $name      = $respStmt ->insert_new_elt("last_child", 'name');
    ##TMP
    ##TMP      # set texts for titleStmt
    ##TMP      # $title->set_text($LABEL . ", page " . $PAGENO);           # Note - PAGENO seems to be "None" most of the time
    ##TMP      $title->set_text($LABEL . ", Text #" . $textcounter);    # at least for Suomen Kuvalehti
    ##TMP      $resp ->set_text("compiled by EuReCo");
    ##TMP      $name ->set_text("EuReCo: HL");
    ##TMP
    ##TMP      #--------------------------
    ##TMP      # fileDesc/publicationStmt
    ##TMP      #--------------------------
    ##TMP      my $publicationStmt = $fileDesc       ->insert_new_elt("last_child", 'publicationStmt');
    ##TMP      my $distributor     = $publicationStmt->insert_new_elt("last_child", 'distributor');
    ##TMP      my $note            = $distributor    ->insert_new_elt("last_child", 'note');
    ##TMP      my $availability    = $publicationStmt->insert_new_elt("last_child", 'availability');
    ##TMP      my $licence         = $availability   ->insert_new_elt("last_child", 'licence');
    ##TMP
    ##TMP      # set texts for publicationStmt
    ##TMP      $note   ->set_text("NOT FOR DISTRIBUTION - to be used locally in EuReCo");
    ##TMP      $licence->set_text("CLARIN_RES");  # TODO: more detailed licence info in KLK metadata record
    ##TMP
    ##TMP      #------------------------------
    ##TMP      # fileDesc/sourceDesc/biblStruct
    ##TMP      #------------------------------
    ##TMP      my $sourceDesc = $fileDesc  ->insert_new_elt("last_child", 'sourceDesc');
    ##TMP      my $biblStruct   = $sourceDesc->insert_new_elt("last_child", 'biblStruct');
    ##TMP
    ##TMP      # fileDesc/sourceDesc/biblStruct/analytic
    ##TMP      my $analytic                = $biblStruct->insert_new_elt("last_child", 'analytic');
    ##TMP      my $analytic_title          = $analytic->insert_new_elt("last_child", 'title'        => {type => "main"} );
    ##TMP  #    my $analytic_date           = $analytic->insert_new_elt("last_child", 'date');
    ##TMP      my $analytic_date_year      = $analytic->insert_new_elt("last_child", 'date'         => {type => "year"});
    ##TMP      my $analytic_date_month     = $analytic->insert_new_elt("last_child", 'date'         => {type => "month"});
    ##TMP      my $analytic_date_day       = $analytic->insert_new_elt("last_child", 'date'         => {type => "day"});
    ##TMP      my $analytic_idno_pageid    = $analytic->insert_new_elt("last_child", 'idno'         => {type => "PAGEID"});
    ##TMP      my $analytic_idno_bindingid = $analytic->insert_new_elt("last_child", 'idno'         => {type => "BINDINGID"});
    ##TMP      my $analytic_idno_id        = $analytic->insert_new_elt("last_child", 'idno'         => {type => "ID"});
    ##TMP      my $analytic_idno_metafile  = $analytic->insert_new_elt("last_child", 'idno'         => {type => "KIELIPANKKI_METAFILENAME"});
    ##TMP      my $analytic_idno_origfile  = $analytic->insert_new_elt("last_child", 'idno'         => {type => "KIELIPANKKI_ORIGFILENAME"});
    ##TMP      my $analytic_textlang       = $analytic->insert_new_elt("last_child", 'textLang');
    ##TMP
    ##TMP      # set texts for analytic
    ##TMP  #    $analytic_title         ->set_text($LABEL . ", page " . $PAGENO);  # Note: $PAGENO seems to be "None" most of the time
    ##TMP      $analytic_title         ->set_text($LABEL . ", Text #" . $textcounter);  # Note: $PAGENO seems to be "None" most of the time
    ##TMP  #    $analytic_date         ->set_text($DATE);
    ##TMP      $analytic_date_year     ->set_text($datearray[0]);
    ##TMP      $analytic_date_month    ->set_text($datearray[1]);
    ##TMP      $analytic_date_day      ->set_text($datearray[2]);
    ##TMP      $analytic_idno_pageid   ->set_text($PAGEID);
    ##TMP      $analytic_idno_bindingid->set_text($BID);
    ##TMP      $analytic_idno_id       ->set_text($ID);
    ##TMP      $analytic_idno_metafile ->set_text($METAFILENAME);
    ##TMP      $analytic_idno_origfile ->set_text($ORIGFILENAME);
    ##TMP      $analytic_textlang      ->set_text($LANGUAGE);
    ##TMP
    ##TMP      #-------------------------------------
    ##TMP      # fileDesc/sourceDesc/biblStruct/monogr
    ##TMP      #-------------------------------------
    ##TMP      my $monogr               = $biblStruct->insert_new_elt("last_child", 'monogr');
    ##TMP      my $monogr_title         = $monogr    ->insert_new_elt("last_child", 'title');
    ##TMP      my $imprint              = $monogr    ->insert_new_elt("last_child", 'imprint');   # imprint is needed for validity
    ##TMP      my $pubPlace             = $imprint   ->insert_new_elt("last_child", 'pubPlace');  # imprint is needed for validity
    ##TMP      my $publisher            = $imprint   ->insert_new_elt("last_child", 'publisher'); # imprint is needed for validity
    ##TMP      my $biblScope_issuetitle = $monogr    ->insert_new_elt("last_child", 'biblScope'   => {unit => 'ISSUETITLE'} );
    ##TMP      my $biblScope_issueno    = $monogr    ->insert_new_elt("last_child", 'biblScope'   => {unit => 'ISSUENO'} );
    ##TMP      my $biblScope_issuedate  = $monogr    ->insert_new_elt("last_child", 'biblScope'   => {unit => 'ISSUEDATE'} );
    ##TMP      my $biblScope_pp         = $monogr    ->insert_new_elt("last_child", 'biblScope'   => {unit => 'PAGENO'} );      # Note: PAGENO is mostly "None"?
    ##TMP
    ##TMP      # set texts for monogr
    ##TMP      $monogr_title        ->set_text($PUBLTITLE);
    ##TMP      $pubPlace            ->set_text("TODO");
    ##TMP      $pubPlace            ->set_att("key",'FI');
    ##TMP      $publisher           ->set_text("TODO");
    ##TMP      $biblScope_issuetitle->set_text($ISSUETITLE);
    ##TMP      $biblScope_issueno   ->set_text($ISSUENO);
    ##TMP      $biblScope_issuedate ->set_text($ISSUEDATE);
    ##TMP      $biblScope_pp        ->set_text($PAGENO);
    ##TMP
    ##TMP      #---------------
    ##TMP      # encodingDesc
    ##TMP      #---------------
    ##TMP      my $tagsDecl     = $encodingDesc->insert_new_elt("last_child", 'tagsDecl');
    ##TMP      my $namespace    = $tagsDecl    ->insert_new_elt("last_child", 'namespace'    => {name => 'http://www.tei-c.org/ns/1.0'});
    ##TMP      my $tagUsage_s   = $namespace   ->insert_new_elt("last_child", 'tagUsage'     => {gi   => 's', occurs => $SENTCOUNT});
    ##TMP      my $tagUsage_w   = $namespace   ->insert_new_elt("last_child", 'tagUsage'     => {gi   => 'w', occurs => $TOKENCOUNT});
    ##TMP
    ##TMP      #-------------
    ##TMP      # profileDesc
    ##TMP      #-------------
    ##TMP      my $langUsage   = $profileDesc ->insert_new_elt("last_child", 'langUsage');
    ##TMP      my $language    = $langUsage   ->insert_new_elt("last_child", 'language'      => {ident => $LANGUAGE, usage => $SUMLANG});
    ##TMP      # Note: @usage actually requires an integer; best to split up the content of SUMLANG and create several <language> elements
    ##TMP      my $textClass   = $profileDesc ->insert_new_elt("last_child", 'textClass');
    ##TMP      my $classCode_fi   = $textClass   ->insert_new_elt("last_child", 'classCode' => {scheme       => "KLK_PUBLTYPE"});
    ##TMP  #    my $classCode_en   = $textClass   ->insert_new_elt("last_child", 'classCode' => {scheme      => "KLK_PUBLTYPE_MAPPED"});
    ##TMP
    ##TMP      #---------------------------
    ##TMP      # set texts for profileDesc
    ##TMP      #---------------------------
    ##TMP      $classCode_fi ->set_text($PUBLTYPE);
    ##TMP  #    $classCode_en->set_text($PUBLTYPETRANSL);
    ##TMP
    ##TMP      #---------------
    ##TMP      # revisionDesc
    ##TMP      #---------------
    ##TMP      my $change      = $revisionDesc ->insert_new_elt("last_child", 'change'       => {when => localtime->ymd('-'), who => 'HL'  });
    ##TMP
    ##TMP      # set texts for revisionDesc
    ##TMP      $change->set_text("TEI version for EuReCo");

    ###################################
    # END OF CREATING TEIHEADER
    ###################################

    return;
}
|  |  | 
# setP: turn a VRT <paragraph> element into a TEI <p>.
#
#   $paragraph  the element to convert (changed in place)
#
# <paragraph id="p-bcd0f3fa-bbd3dac4-815ead7a" sum_lang="|fin:1|">
# atts of <paragraph>:
#    @id        dropped, because this id is not unique either
#    @sum_lang  moved to @xml:lang, prefixed with "x-" for private value
#
# A paragraph without @sum_lang gets no @xml:lang at all, instead of the
# meaningless value "x-" and an "uninitialized value" warning.
sub setP {
    my ($paragraph) = @_;

    $paragraph->set_gi('p');

    my $sum_lang = $paragraph->att("sum_lang");
    if (defined $sum_lang) {
        $paragraph->set_att("xml:lang", "x-" . $sum_lang);
        $paragraph->del_att("sum_lang");
    }

    # $paragraph->change_att_name('id', 'xml:id');
    $paragraph->del_att("id");
}
# setS: turn a VRT <sentence> element into a TEI <s>.
#
#   $sentence  the element to convert (changed in place)
#
# the atts of <sentence>:
#    1 @id="s-bcd0f3fa-bbd3dac4-f7429090"   dropped (not unique)
#    2 @lang="fin"                          moved to @xml:lang
#    3 @lang_conf="0.6734853"               dropped for the time being; ToDo @cert ?
#
# A sentence without @lang gets no @xml:lang, instead of an attribute with
# an undefined value.
sub setS {
    my ($sentence) = @_;

    $sentence->set_gi('s');

    # ToDo: convert the value / introduce a hash for lookup (input values: "fin", "xxx", ....)
    my $lang = $sentence->att("lang");
    $sentence->set_att("xml:lang", $lang) if defined $lang;

    # $sentence->change_att_name('id', 'xml:id');            # not unique
    $sentence->del_att('id');
    $sentence->del_att("lang");                         # replaced by xml:lang
    $sentence->del_att("lang_conf");                    # for the time being
}
|  |  | 
# createW: fill a <w> element from one token line of the VRT.
#
#   $w_element  the <w> element to fill (changed in place)
#   $line       one token line: tab separated positional attributes
#
# vrt positional-attributes in corpus KLK:
#  USE [0] word
#  USE [1] ref  (id for reference of dephead)
#  USE [2] lemma
#  ?   [3] lemmacomp   (lemma with compound info - could go in @norm, as tag abuse?)
#  USE [4] pos
#  USE [5] msd
#  USE [6] dephead
#  USE [7] deprel
#      [8] content   (ocr-process)
#      [9] vpos      (ocr-process)
#     [10] ocr       (ocr-process)
#     [11] cc        (ocr-process)
#     [12] hyph      (ocr-process)
#     [13] style     (ocr-process)
#     [14] lex       (korp semantic disambiguation from G"oteborg)
#
# A column that is missing from the line does not produce an attribute;
# setting it anyway would store an undefined attribute value.
# Returns $w_element.
sub createW {
    my ($w_element, $line) = @_;

    #---------------------------
    # Get the tags (=columns)
    #---------------------------

    my @tags = split(/\t/, $line);

    # set content of <w> i.e. the token
    $w_element->set_text(defined $tags[0] ? $tags[0] : "");

    # $w_element->set_att("id",     "w_" . $namearray[4] . $sentence->att("xml:id") . "_" . $tags[1]);
    # an ID assembled like this is not unique either...
    $w_element->del_att("id");

    # set the attributes of <w>: attribute name => column index, in output order
    # not used for the time being:
    #    norm   => 3    (tag abuse of @norm)
    #    head   => 6    (TMP)
    #    deprel => 7    (TMP)
    my @columnOf = (
        [ n     => 1 ],
        [ lemma => 2 ],
        [ pos   => 4 ],
        [ msd   => 5 ],
    );

    for my $pair (@columnOf) {
        my ($name, $index) = @{$pair};
        next unless defined $tags[$index];
        $w_element->set_att($name, $tags[$index]);
    }

    return $w_element;
}
|  |  | 
|  |  | 
# set_title: write the corpus title into the corpus header.
#
#   $corpusHeader       header element; the text of its first
#                       fileDesc / titleStmt / title descendant is replaced
#   $source             name of the source (e.g. the newspaper)
#   $yy                 year
#   $kielipankkiCorpus  name of the corpus the data comes from
#
#<teiHeader>
#  <fileDesc>
#    <titleStmt>
#      <title>[Aamulehti2021] from [klk-fi-v2-vrt for] EuReCo</title>
#    </titleStmt>
#    <!-- ... -->
#  </fileDesc>
#</teiHeader>
sub set_title {
    my ($corpusHeader, $source, $yy, $kielipankkiCorpus) = @_;

    # walk down from the header to the <title> element
    my $node = $corpusHeader;
    for my $gi ("fileDesc", "titleStmt", "title") {
        $node = $node->first_child($gi);
    }

    $node->set_text(join("", $source, " ", $yy, ", from ", $kielipankkiCorpus, " for EuReCo"));
}
|  |  | 
# set_sourceDesc: write the bibliographic description of the source into the
# corpus header.
#
#   $corpusHeader       header element; the text of its first
#                       fileDesc / sourceDesc / bibl descendant is replaced
#   $source             name of the source (e.g. the newspaper)
#   $yy                 year
#   $kielipankkiCorpus  name of the corpus the data comes from
#
#<teiHeader>
#  <fileDesc>
#    <!-- ... -->
#    <sourceDesc>
#      <bibl>[Aamulehti2021] from [klk-fi-v2-vrt]</bibl>
#    </sourceDesc>
#    <!-- ... -->
#  </fileDesc>
#</teiHeader>
sub set_sourceDesc {
    my ($corpusHeader, $source, $yy, $kielipankkiCorpus) = @_;

    # walk down from the header to the <bibl> element
    my $node = $corpusHeader;
    for my $gi ("fileDesc", "sourceDesc", "bibl") {
        $node = $node->first_child($gi);
    }

    $node->set_text(join("", $source, " ", $yy, ", from ", $kielipankkiCorpus, " for EuReCo"));
}
|  |  | 
|  |  | 
|  |  | 
|  |  | 
|  |  | 
|  |  | 
|  |  | 
|  | ################# | 
|  | ## usage_message | 
|  | ################# | 
|  |  | 
|  |  | 
# usage_message: explain how to call the script, then terminate.
#
# The script accepts exactly one argument and writes its result to STDOUT,
# so the usage text shows a redirection instead of an output-file argument.
# The message goes to STDERR so that it cannot end up in a redirected output
# file, and the exit status is non-zero so that calling scripts can detect
# the wrong invocation.
sub usage_message {
    print STDERR "   Usage:  ./vrt2tei.pl <file.vrt.xml> > <outfile>\n";
    print STDERR "   <file.vrt.xml> is a VRT file converted to proper XML\n";
    print STDERR "   The TEI output is written to STDOUT\n";
    exit 1;
}
|  |  | 
|  |  |