converter, first version
diff --git a/vrt2tei.pl b/vrt2tei.pl
new file mode 100755
index 0000000..c7789a8
--- /dev/null
+++ b/vrt2tei.pl
@@ -0,0 +1,548 @@
+#! /usr/bin/perl -w
+
+
+###########################################################################################################################################################
+# vrt2tei.pl
+# eureco
+# leibniz-institut fuer deutsche sprache / csc finland esbo
+# august 2024
+#
+#
+# using XML::Twig, see http://www.xmltwig.org/, https://metacpan.org/pod/XML::Twig
+#
+# usage: see below the usage function
+# Usage: ./vrt2tei.pl <vrtxmlfile.xml> <outfile>
+# <vrtxmlfile>: xml-ised vrt file
+#
+#
+# TODO:
+# 1 insert dtd spec, or ref to TEI
+
+# 3a UPLOAD in GITHUB
+# 3b add @head and @deprel to I5 sowie auch @msd
+# 3c bearbeitung von @head und @deprel in tei2korapxml durch Nils?
+# 3d build 30 billion corpus
+
+# 4a take care of IDs
+# 4b see to the values of @xml:lang
+# 5 abfangen von unerwarteten elementen dh andere als <sentence> und <paragraph>
+# 5a wort reihenfolge nochmal checken
+# 6 checks and balances
+# 6a output nach stdout machen
+# 7 How to encode Kielipankki and National Library of Finland? in teiCorpus Header
+# 8 construct <idsDoc>s for the months (or go for TEI)
+# 9 parallelisation in bash and application on sub corpora of KLK
+# 10 re-implementation of the gawk code in the perl script
+# 12 re-implement creation of text header from xml file in another twig / parametrize TEI vs I5
+
+
+
+#remember
+#formatted.xml:105613: element w: validity error : No declaration for attribute deprel of element w
+#formatted.xml:105613: element w: validity error : No declaration for attribute head of element w
+
+
+#
+#
+############################################################################################################################################################
+
+
+use strict;
+use warnings;
+
+use XML::Twig;
+use XML::Generator ':pretty'; # apparently no effect when using flush();
+
+
+use locale; # diese drei Zeilen, damit \b im regex nicht Umlaute und ß matcht.
+use POSIX qw(locale_h); # to be able to use setlocale()
+#setlocale(LC_ALL,'de_DE');
+setlocale(LC_ALL, "fi_FI");
+use utf8;
+use open qw( :std :encoding(UTF-8) );
+
+use Time::Piece;
+use Tie::IxHash;
+
+#----------------------
+# check file arguments:
+#----------------------
+
+# arg0 infile:  vrt-xml
+# arg1 outfile: tei
+# Exactly two arguments are required. Counting @ARGV instead of testing the truth
+# of $ARGV[1]/$ARGV[2] also handles file names that are false in Perl, e.g. "0".
+usage_message() unless @ARGV == 2;
+
+
+
+
+####################
+# GLOBAL VARIABLES
+####################
+
+# Encoding used ONLY for the output written by the twig (see output_encoding below).
+my $encoding = "UTF-8";
+#my $encoding = "iso-8859-1";
+
+#####################
+# M A I N
+#####################
+
+# Open the result file in append mode (as before); three-argument open keeps
+# special characters in the file name from being interpreted as an open mode.
+open(my $OUT, '>>', $ARGV[1]) or die("cannot open file: $ARGV[1]: $!");
+
+#-----------------------------------------------------------------------------------
+# start twig and call start tag handler for root and twig handler for each <text>
+#-----------------------------------------------------------------------------------
+
+my $twig = XML::Twig->new(
+  keep_spaces        => 1,  # also keeps whitespace at former element boundaries in the output
+  keep_atts_order    => 1,  # requires Tie::IxHash
+  pretty_print       => 'indented',
+  start_tag_handlers => {
+      texts => \&root
+  },
+  twig_handlers      => {
+      text  => \&text
+  },
+  # dtd_handlers => {      # ToDo for I5
+  #     \&set_dtd;
+  # }
+  output_encoding    => $encoding,
+);
+
+$twig->parsefile($ARGV[0]);
+
+# The <text> handler flushes after every <text>; this final flush writes what is
+# still pending after the last one, i.e. the closing tag of the root element.
+$twig->flush($OUT);
+
+# Buffered write errors only surface when the handle is closed.
+close($OUT) or die("cannot close file: $ARGV[1]: $!");
+
+
+###########
+# END MAIN
+###########
+
+
+
+
+##############################
+# S U B R O U T I N E S
+##############################
+
+# sub set_dtd [
+# my $twig, $dtd = @_;
+# my $internal = qq|\nPUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"\n"DTD/xhtml1-strict.dtd"|;
+#
+# $twig->twig_doctype('html', undef, undef, $internal);
+# }
+
+
+
+# Start-tag handler for the root <texts>: turn it into <teiCorpus> in the TEI
+# namespace and give it a corpus-level <teiHeader>. $twig is not used here.
+sub root {
+  my ($twig, $corpus) = @_;
+  $corpus->set_att(xmlns => 'http://www.tei-c.org/ns/1.0');
+  $corpus->set_gi('teiCorpus');
+  insertCorpusHeader($corpus);
+}
+
+
+# Insert the corpus-level <teiHeader> as first child of the root element.
+# Layout: teiHeader/fileDesc/{titleStmt/title, publicationStmt/distributor, sourceDesc/bibl};
+# all texts are fixed strings (the <bibl> text is still a "ToDo" placeholder).
+sub insertCorpusHeader{
+  my ($root) = @_;
+
+  my $header   = $root  ->insert_new_elt(first_child => 'teiHeader');
+  my $fileDesc = $header->insert_new_elt(last_child  => 'fileDesc');
+
+  # create the three wrappers first, in document order ...
+  my ($titleStmt, $publicationStmt, $sourceDesc) =
+      map { $fileDesc->insert_new_elt(last_child => $_) } qw(titleStmt publicationStmt sourceDesc);
+
+  # ... then give each wrapper its single child and set the child's text
+  $titleStmt      ->insert_new_elt(last_child => 'title')      ->set_text(" KLK-fi-2021 for EuReCo");
+  $publicationStmt->insert_new_elt(last_child => 'distributor')->set_text("NOT FOR DISTRIBUTION - to be used locally in EuReCo");
+  $sourceDesc     ->insert_new_elt(last_child => 'bibl')       ->set_text("ToDo");
+
+}
+
+
+#----------------------------
+# handler &text for <text>
+#----------------------------
+
+sub text {
+  # Twig handler, called for each complete <text> element of the VRT-XML input.
+  # The element is rewritten in place into a <TEI> document,
+  #
+  #   <TEI>
+  #     <teiHeader>...</teiHeader>
+  #     <text><body><div type="page">
+  #       <p ...><s ...><w ...>token</w>...</s>...</p>...
+  #     </div></body></text>
+  #   </TEI>
+  #
+  # and then flushed to the output file handle $OUT.
+  #
+  # Parameters: $twig - the XML::Twig object
+  #             $text - the <text> element that was just parsed
+  #
+  # ToDo: catch all other, unexpected children
+
+  my ($twig, $text) = @_;
+
+  #--------------------------------------------------------------------------
+  # Get text metadata (attributes of <text>) and create teiHeader for <text>
+  #--------------------------------------------------------------------------
+
+  # atts() hands back a hash reference; its values are read with '->'
+  my $metadata = $text->atts();
+  createTextHeader($text, $metadata);
+
+  #--------------------------
+  # create <TEI> from <text>
+  #--------------------------
+
+  # the attributes were handed to createTextHeader above, so drop them here
+  $text->set_gi("TEI");
+  $text->del_atts;
+
+  #------------------------------------------------------------------
+  # create the <tei:text>, <body>, <div> elements inside <TEI>
+  #------------------------------------------------------------------
+
+  # each new element is appended as last child of the previous one
+  my $tei_text = XML::Twig::Elt->new('text');
+  $tei_text->paste(last_child => $text);
+
+  my $body = XML::Twig::Elt->new('body');
+  $body->paste(last_child => $tei_text);
+
+  my $page_div = XML::Twig::Elt->new('div');
+  $page_div->set_att(type => "page");   # ToDo: this is specific to KLK
+  $page_div->paste(last_child => $body);
+
+  #---------------------------------------------------------------
+  # <paragraph> -> <p>, <sentence> -> <s>, token line -> <w>
+  #---------------------------------------------------------------
+
+  for my $p ($text->children('paragraph')) {
+      setP($p);
+      $p->move(last_child => $page_div);
+
+      for my $s ($p->children('sentence')) {
+
+          setS($s);
+
+          # One token per line. The lines are taken from the sentence before its
+          # content is replaced by a single newline; empty lines are skipped.
+          # NOTE(review): xml_text may return the text XML-escaped, and set_text /
+          # set_att in createW may escape it again on output - confirm with a
+          # token that contains '&'.
+          my @token_lines = grep { $_ ne "" } split(/\n+/, $s->xml_text);
+          $s->set_text("\n");
+
+          for my $token_line (@token_lines) {   # Todo: check the order
+              my $w = XML::Twig::Elt->new('w');
+              createW($w, $token_line);
+              $w->paste(last_child => $s);
+          }
+      }
+  }
+
+  $twig->flush($OUT);
+}
+
+
+sub createTextHeader{
+  my ($text, $textattsref) = @_;
+  # Build the <teiHeader> for one text and paste it as first child of $text.
+  # $text is the vrt <text> element, $textattsref a hash reference with its attributes.
+  # Example attributes ("USE" marks those that are copied into the header):
+  # USE 01 binding_id="2246025"
+  # USE 02 date="2021-01-15"
+  #     03 datefrom="20210115"
+  #     04 dateto="20210115"
+  #     05 elec_date="_"
+  #     06 file=""
+  # USE 07 filename_metadata="finclarin_siirto_k2021/0039-5552/2021/alto/2246025_0039-5552_2021-01-15_SK0221_mets.xml"
+  # USE 08 filename_orig ="finclarin_siirto_k2021/0039-5552/2021/alto/2246025_0039-5552_2021-01-15_SK0221_page-2021011502210030301.xml
+  # USE 09 id="t-bcd0f3fa-bbd3dac4"
+  #     10 img_url=""
+  # USE 11 issue_date="15.01.2021"
+  # USE 12 issue_no="SK0221"
+  # USE 13 issue_title="Suomen Kuvalehti"
+  # USE 14 label="Suomen Kuvalehti no. SK0221 15.01.2021"
+  # USE 16 language="fi"
+  # USE 17 page_id="p1"
+  # USE 18 page_no="None"
+  #     19 part_name="_"
+  #     20 publ_id="0039-5552"
+  #     21 publ_part=""
+  # USE 22 publ_title="Suomen Kuvalehti"
+  # USE 23 publ_type="aikakausi"
+  # USE 24 sentcount="70"
+  # USE 25 sum_lang="|xxx:44|fin:23|eng:3|"
+  #     26 timefrom="000000"
+  #     27 timeto="235959"
+  # USE 28 tokencount="304"
+  #     29 version_added="KLK-fi-2021">
+
+  my $BID          = $textattsref->{'binding_id'};
+  my $DATE         = $textattsref->{'date'};
+  my $METAFILENAME = $textattsref->{'filename_metadata'};
+  my $ORIGFILENAME = $textattsref->{'filename_orig'};
+  my $ID           = $textattsref->{'id'};
+  my $ISSUEDATE    = $textattsref->{'issue_date'};
+  my $ISSUENO      = $textattsref->{'issue_no'};
+  my $ISSUETITLE   = $textattsref->{'issue_title'};
+  my $LABEL        = $textattsref->{'label'};
+  my $LANGUAGE     = $textattsref->{'language'};
+  my $PAGEID       = $textattsref->{'page_id'};
+  my $PAGENO       = $textattsref->{'page_no'};
+  my $PUBLTITLE    = $textattsref->{'publ_title'};
+  my $PUBLTYPE     = $textattsref->{'publ_type'};
+  my $SENTCOUNT    = $textattsref->{'sentcount'};
+  my $SUMLANG      = $textattsref->{'sum_lang'};
+  my $TOKENCOUNT   = $textattsref->{'tokencount'};
+
+  #-----------------------------
+  # Derived Metadata variables
+  #-----------------------------
+  my @datearray = split("-", $DATE);   # (year, month, day) for date="2021-01-15"
+  # The separator must be an escaped regex: split("|", ...) is an alternation of
+  # two empty patterns and would split the value between all of its characters.
+  my @langarray = grep { $_ ne "" } split(/\|/, $SUMLANG);   # not used yet, see <language> below
+  my @namearray = split(/[\.\/]/, $ORIGFILENAME); # use $namearray[4] as ID for the page
+
+  #-----------------------------------------------------------------------
+  # CREATE text-teiHeader ACCORDING TO THE SKELETON in klk-header.tei.xml
+  #-----------------------------------------------------------------------
+
+  # create <teiHeader> inside <TEI>
+  my $teiHeader = XML::Twig::Elt->new('teiHeader');
+  $teiHeader->paste('first_child', $text);
+
+  ## insert_new_elt is a combo of new and paste, cf. xml::twig docu:
+  ## insert_new_elt ($opt_position, $gi, $opt_atts_hashref, @opt_content)
+
+  my $fileDesc     = $teiHeader->insert_new_elt('fileDesc' => {n => "EuReCo_KLK-fi_" . $namearray[4]});
+  my $encodingDesc = $teiHeader->insert_new_elt("last_child", 'encodingDesc');
+  my $profileDesc  = $teiHeader->insert_new_elt("last_child", 'profileDesc');
+  my $revisionDesc = $teiHeader->insert_new_elt("last_child", 'revisionDesc');
+
+  #---------------------
+  # fileDesc/titleStmt
+  #---------------------
+  my $titleStmt = $fileDesc ->insert_new_elt('titleStmt');
+  my $title     = $titleStmt->insert_new_elt("last_child", 'title');
+  my $respStmt  = $titleStmt->insert_new_elt("last_child", 'respStmt');
+  my $resp      = $respStmt ->insert_new_elt("last_child", 'resp');
+  my $name      = $respStmt ->insert_new_elt("last_child", 'name');
+
+  # set texts for titleStmt
+  $title->set_text($LABEL . ", page " . $PAGENO); # caution: PAGENO seems to be "None" most of the time
+  $resp ->set_text("compiled by EuReCo");
+  $name ->set_text("EuReCo: HL");
+
+  #--------------------------
+  # fileDesc/publicationStmt
+  #--------------------------
+  my $publicationStmt = $fileDesc       ->insert_new_elt("last_child", 'publicationStmt');
+  my $distributor     = $publicationStmt->insert_new_elt("last_child", 'distributor');
+  my $note            = $distributor    ->insert_new_elt("last_child", 'note');
+  my $availability    = $publicationStmt->insert_new_elt("last_child", 'availability');
+  my $licence         = $availability   ->insert_new_elt("last_child", 'licence');
+
+  # set texts for publicationStmt
+  $note   ->set_text("NOT FOR DISTRIBUTION - to be used locally in EuReCo");
+  $licence->set_text("CLARIN-RES"); # TODO: more detailed licence info is in the KLK metadata record
+
+  #------------------------------
+  # fileDesc/sourceDesc/biblStruct
+  #------------------------------
+  my $sourceDesc = $fileDesc  ->insert_new_elt("last_child", 'sourceDesc');
+  my $biblStruct = $sourceDesc->insert_new_elt("last_child", 'biblStruct');
+
+  # fileDesc/sourceDesc/biblStruct/analytic
+  my $analytic                = $biblStruct->insert_new_elt("last_child", 'analytic');
+  my $analytic_title          = $analytic->insert_new_elt("last_child", 'title' => {type => "main"} );
+# my $analytic_date           = $analytic->insert_new_elt("last_child", 'date');
+  my $analytic_date_year      = $analytic->insert_new_elt("last_child", 'date' => {type => "year"});
+  my $analytic_date_month     = $analytic->insert_new_elt("last_child", 'date' => {type => "month"});
+  my $analytic_date_day       = $analytic->insert_new_elt("last_child", 'date' => {type => "day"});
+  my $analytic_idno_pageid    = $analytic->insert_new_elt("last_child", 'idno' => {type => "PAGEID"});
+  my $analytic_idno_bindingid = $analytic->insert_new_elt("last_child", 'idno' => {type => "BINDINGID"});
+  my $analytic_idno_id        = $analytic->insert_new_elt("last_child", 'idno' => {type => "ID"});
+  my $analytic_idno_metafile  = $analytic->insert_new_elt("last_child", 'idno' => {type => "KIELIPANKKI_METAFILENAME"});
+  my $analytic_idno_origfile  = $analytic->insert_new_elt("last_child", 'idno' => {type => "KIELIPANKKI_ORIGFILENAME"});
+  my $analytic_textlang       = $analytic->insert_new_elt("last_child", 'textLang');
+
+  # set texts for analytic
+  $analytic_title         ->set_text($LABEL . ", page " . $PAGENO); # caution: $PAGENO seems to be "None" most of the time
+# $analytic_date          ->set_text($DATE);
+  $analytic_date_year     ->set_text($datearray[0]);
+  $analytic_date_month    ->set_text($datearray[1]);
+  $analytic_date_day      ->set_text($datearray[2]);
+  $analytic_idno_pageid   ->set_text($PAGEID);
+  $analytic_idno_bindingid->set_text($BID);
+  $analytic_idno_id       ->set_text($ID);
+  $analytic_idno_metafile ->set_text($METAFILENAME);
+  $analytic_idno_origfile ->set_text($ORIGFILENAME);
+  $analytic_textlang      ->set_text($LANGUAGE);
+
+  #-------------------------------------
+  # fileDesc/sourceDesc/biblStruct/monogr
+  #-------------------------------------
+  my $monogr               = $biblStruct->insert_new_elt("last_child", 'monogr');
+  my $monogr_title         = $monogr ->insert_new_elt("last_child", 'title');
+  my $imprint              = $monogr ->insert_new_elt("last_child", 'imprint');   # imprint is needed for validity
+  my $pubPlace             = $imprint->insert_new_elt("last_child", 'pubPlace');  # imprint is needed for validity
+  my $publisher            = $imprint->insert_new_elt("last_child", 'publisher'); # imprint is needed for validity
+  my $biblScope_issuetitle = $monogr ->insert_new_elt("last_child", 'biblScope' => {unit => 'ISSUETITLE'} );
+  my $biblScope_issueno    = $monogr ->insert_new_elt("last_child", 'biblScope' => {unit => 'ISSUENO'} );
+  my $biblScope_issuedate  = $monogr ->insert_new_elt("last_child", 'biblScope' => {unit => 'ISSUEDATE'} );
+  my $biblScope_pp         = $monogr ->insert_new_elt("last_child", 'biblScope' => {unit => 'PAGENO'} ); # caution: PAGENO is mostly "None" ?
+
+  # set texts for monogr
+  $monogr_title        ->set_text($PUBLTITLE);
+  $pubPlace            ->set_text("TODO");
+  $publisher           ->set_text("TODO");
+  $biblScope_issuetitle->set_text($ISSUETITLE);
+  $biblScope_issueno   ->set_text($ISSUENO);
+  $biblScope_issuedate ->set_text($ISSUEDATE);
+  $biblScope_pp        ->set_text($PAGENO);
+
+  #---------------
+  # encodingDesc
+  #---------------
+  my $tagsDecl   = $encodingDesc->insert_new_elt("last_child", 'tagsDecl');
+  my $namespace  = $tagsDecl    ->insert_new_elt("last_child", 'namespace' => {name => 'http://www.tei-c.org/ns/1.0'});
+  my $tagUsage_s = $namespace   ->insert_new_elt("last_child", 'tagUsage' => {gi => 's', occurs => $SENTCOUNT});
+  my $tagUsage_w = $namespace   ->insert_new_elt("last_child", 'tagUsage' => {gi => 'w', occurs => $TOKENCOUNT});
+
+  #-------------
+  # profileDesc
+  #-------------
+  my $langUsage    = $profileDesc->insert_new_elt("last_child", 'langUsage');
+  my $language     = $langUsage  ->insert_new_elt("last_child", 'language' => {ident => $LANGUAGE, usage => $SUMLANG});
+  # caution: @usage should actually hold an integer; best split up the content of SUMLANG and create several <language> elements
+  my $textClass    = $profileDesc->insert_new_elt("last_child", 'textClass');
+  my $classCode_fi = $textClass  ->insert_new_elt("last_child", 'classCode' => {scheme => "KLK_PUBLTYPE"});
+# my $classCode_en = $textClass  ->insert_new_elt("last_child", 'classCode' => {scheme => "KLK_PUBLTYPE_MAPPED"});
+
+  #---------------------------
+  # set texts for profileDesc
+  #---------------------------
+  $classCode_fi ->set_text($PUBLTYPE);
+# $classCode_en->set_text($PUBLTYPETRANSL);
+
+  #---------------
+  # revisionDesc
+  #---------------
+  my $change = $revisionDesc ->insert_new_elt("last_child", 'change' => {when => localtime->ymd('-'), who => 'HL' });
+
+  # set texts for revisionDesc
+  $change->set_text("TEI version for EuReCo");
+
+
+  ###################################
+  # END OF CREATING TEIHEADER
+  ###################################
+
+}
+
+# Turn a vrt <paragraph> into a TEI <p> (in place), e.g.
+# <paragraph id="p-bcd0f3fa-bbd3dac4-815ead7a" sum_lang="|fin:1|">:
+#   @id -> @xml:id;  @sum_lang -> @xml:lang, prefixed with "x-" (private value)
+sub setP {
+  my ($paragraph) = @_;
+  $paragraph->set_gi('p');
+
+  # Without @sum_lang no @xml:lang is written: concatenating the missing value
+  # would warn about an uninitialized value and produce a bare "x-".
+  my $sum_lang = $paragraph->att("sum_lang");
+  $paragraph->set_att("xml:lang", "x-" . $sum_lang) if defined $sum_lang;
+  $paragraph->del_att("sum_lang");
+  $paragraph->change_att_name('id', 'xml:id');
+}
+# Turn a vrt <sentence> into a TEI <s> (in place).
+#   @id="s-bcd0f3fa-bbd3dac4-f7429090" -> @xml:id
+#   @lang="fin"                        -> @xml:lang
+#   @lang_conf="0.6734853"             -> dropped for the time being (ToDo: @cert ?)
+sub setS {
+  my ($sentence) = @_;
+
+  $sentence->set_gi('s');
+
+  # ToDo: convert the value / introduce a lookup hash (input values: "fin", "xxx", ...)
+  # A sentence without @lang gets no @xml:lang instead of one with an undefined value.
+  my $lang = $sentence->att("lang");
+  $sentence->set_att("xml:lang", $lang) if defined $lang;
+  $sentence->change_att_name('id', 'xml:id');
+  $sentence->del_att("lang");       # replaced by xml:lang
+  $sentence->del_att("lang_conf");  # for the time being
+}
+
+# Fill a <w> element from one token line of the vrt input.
+# Parameters: $w_element - the <w> element to fill
+#             $line      - one token line, columns separated by tabs
+# vrt positional-attributes in corpus KLK:
+# USE [0] word
+# USE [1] ref (id for reference of dephead)
+# USE [2] lemma
+#   ? [3] lemmacomp (lemma with compound info - could go in @norm, as tag abuse?)
+# USE [4] pos
+# USE [5] msd
+# USE [6] dephead
+# USE [7] deprel
+#     [8] content (ocr-process)
+#     [9] vpos (ocr-process)
+#     [10] ocr (ocr-process)
+#     [11] cc (ocr-process)
+#     [12] hyph (ocr-process)
+#     [13] style (ocr-process)
+#     [14] lex (korp semantic disambiguation from G"oteborg)
+sub createW {
+  my ($w_element, $line) = @_;
+
+  # Limit -1 keeps trailing empty columns; a line with fewer than eight columns
+  # is padded with empty strings so that no attribute gets an undefined value.
+  my @tags = split(/\t/, $line, -1);
+  push @tags, ('') x (8 - @tags);
+
+  # set content of <w> i.e. the token
+  $w_element->set_text($tags[0]);
+
+  # set the attributes of <w>:
+  $w_element->set_att("n", $tags[1]);
+  # $w_element->set_att("id", "w_" . $namearray[4] . $sentence->att("xml:id") . "_" . $tags[1]);
+  # an ID assembled like this is not unique either ...
+  $w_element->change_att_name('id', 'xml:id');   # only has an effect if the element carries an @id
+  $w_element->set_att("lemma", $tags[2]);
+  # $w_element->set_att("norm", $tags[3]); # tag abuse of @norm
+  $w_element->set_att("pos", $tags[4]);
+  $w_element->set_att("msd", $tags[5]);
+  $w_element->set_att("head", $tags[6]);
+  $w_element->set_att("deprel", $tags[7]);
+}
+
+#################
+## usage_message
+#################
+# Print the usage text to STDERR and abort with a non-zero exit status,
+# so that calling shell scripts can detect the wrong invocation.
+sub usage_message {
+  print STDERR " Usage: ./vrt2tei.pl <file.vrt.xml> <outfile>\n",
+               " <file.vrt.xml> is a VRT file converted to proper XML\n";
+  exit 1;
+}
+
+