blob: dd1cffa45931a868048bb54e5c82d6b993ac29a3 [file] [log] [blame]
#! /usr/bin/perl -w
###########################################################################################################################################################
# vrt2tei.pl
# eureco
# leibniz-institut fuer deutsche sprache / csc finland esbo
# august 2024
#
#
# using XML::Twig , see http://www.xmltwig.org/, https://metacpan.org/pod/XML::Twig
#
# usage: see the usage_message function below
# Usage: ./vrt2tei.pl <vrtxmlfile.xml> > <outfile>
# <vrtxmlfile>: xml-ised vrt file
#
#
# TODO:
# 1 insert dtd spec, or ref to TEI
# 3a remove the vrt positional attribute comment line / all comment lines
# 3b add @head and @deprel to I5, as well as @msd
# 3c handling of @head and @deprel in tei2korapxml by Nils?
# 3d build 30 billion corpus
# 4a take care of IDs
# 4b see to the values of @xml:lang
# 5 catch unexpected elements, i.e. elements other than <sentence> and <paragraph>
# 5a check the word order once more
# 6 checks and balances
# 7 How to encode Kielipankki and National Library of Finland? in teiCorpus Header
# 8 construct <idsDoc>s for the months (or go for TEI)
# 9 parallelisation in bash and application on sub corpora of KLK
# 10 re-implementation of the gawk code in the perl script
# 12 re-implement creation of text header from xml file in another twig / parametrize TEI vs I5
#remember
#formatted.xml:105613: element w: validity error : No declaration for attribute deprel of element w
#formatted.xml:105613: element w: validity error : No declaration for attribute head of element w
#
#
############################################################################################################################################################
use strict;
use warnings;
use XML::Twig;
use XML::Generator ':pretty'; # apparently no effect when using flush();
use locale; # diese drei Zeilen, damit \b im regex nicht Umlaute und ß matcht.
use POSIX qw(locale_h); # to be able to use setlocale()
#setlocale(LC_ALL,'de_DE');
setlocale(LC_ALL, "fi_FI");
use utf8;
use open qw( :std :encoding(UTF-8) );
use Time::Piece;
use Tie::IxHash;
#----------------------
# check file arguments:
#----------------------
# arg0 infile: vrt-xml
# Exactly one non-empty argument is accepted: the VRT-XML input file.
# The TEI output is written to standard output, so any further argument
# (e.g. an output file name) is rejected. Counting @ARGV instead of testing
# the truth of $ARGV[0] also accepts an input file that is literally named "0".
usage_message() unless @ARGV == 1 && length $ARGV[0];
####################
# GLOBAL VARIABLES
####################
my $encoding = "UTF-8"; # passed as output_encoding to the input twig in MAIN, i.e. it concerns the output only
#my $encoding = "iso-8859-1"; # alternative output encoding (see the twig constructor in MAIN)
my $textcounter = 0; # incremented by text() for every <text>; used in the title of each text header
my $twig=""; # placeholder; replaced in MAIN by the XML::Twig object that parses the input
my $teiCorpusHeaderDoc=""; # NOTE(review): not referenced anywhere else in the visible code
#------------------------------------------------------------------
# read corpusHeaderSkeleton document and get header out of it
#------------------------------------------------------------------
# Parse the corpus header skeleton. The file name is relative, so the
# skeleton is looked up in the current working directory.
# Direct method call instead of the indirect object syntax "new XML::Twig(...)",
# which perl may parse ambiguously.
my $teiCorpusHeaderDocTwig = XML::Twig->new(
    keep_spaces     => 1,
    keep_atts_order => 1,
    comments        => 'drop',
);
$teiCorpusHeaderDocTwig->parsefile("teiCorpusHeaderSkeleton.tei.xml");
# Root element of the skeleton document; it is used as the corpus header
# (set_title() and set_sourceDesc() expect a <fileDesc> child in it).
my $corpusHeader = $teiCorpusHeaderDocTwig->root;
#------------------------------------------------------------------
# read textHeaderSkeleton document and get header out of it
#------------------------------------------------------------------
# Parse the text header skeleton. The file name is relative, so the
# skeleton is looked up in the current working directory.
# Direct method call instead of the indirect object syntax "new XML::Twig(...)",
# which perl may parse ambiguously.
my $teiTextHeaderDocTwig = XML::Twig->new(
    keep_spaces     => 1,
    keep_atts_order => 1,
    comments        => 'drop',
);
$teiTextHeaderDocTwig->parsefile("teiTextHeaderSkeleton.tei.xml");
# Root element of the skeleton document; it is used as the header of every
# single text (createTextHeader() expects a <fileDesc> child in it).
my $textHeader = $teiTextHeaderDocTwig->root;
#----------------------------------
# read input VRT-XML document
#----------------------------------
# Fail early, with the system error message, if the input file cannot be read.
# The handle itself is not needed afterwards because parsefile() in MAIN is
# given the file name, so it is closed again right away instead of staying
# open for the whole run. Three-argument open keeps special characters in the
# file name from being interpreted as an open mode.
open(my $IN, '<', $ARGV[0]) or die("$0: cannot open file for reading: $ARGV[0]: $!");
close($IN);
#-----------------------------------------------------
# global variables pertaining to the original corpus
#-----------------------------------------------------
# Name of the source corpus; insertCorpusHeader() passes it on to set_title()
# and set_sourceDesc(), which put it into the corpus title and <bibl> text.
my $kielipankkiCorpus = "klk-fi-v2-vrt";
#####################
# M A I N
#####################
#-------------------------------------------------------------------------------------------------------------
# start twig for input and call start tag handler for root and twig handler for each <text> in the VRT
#-------------------------------------------------------------------------------------------------------------
# Direct method call instead of the indirect object syntax "new XML::Twig(...)",
# which perl may parse ambiguously.
$twig = XML::Twig->new(
    keep_spaces     => 1,    # whitespace at former element boundaries therefore also ends up in the output
    keep_atts_order => 1,    # requires Tie::IxHash
    comments        => 'drop',
    start_tag_handlers => {
        # called as soon as the start tag of the root element <texts> has been read
        texts => sub { root(@_, $corpusHeader) },
    },
    twig_handlers => {
        # called for every completely parsed <text>
        # text => \&text
        text => sub { text(@_, $textHeader) },
    },
    # dtd_handlers => {                    # ToDo for I5
    #     \&set_dtd;
    # }
    output_encoding => $encoding,
);
$twig->parsefile($ARGV[0]);
###########
# END MAIN
###########
##############################
# S U B R O U T I N E S
##############################
# sub set_dtd [
# my $twig, $dtd = @_;
# my $internal = qq|\nPUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"\n"DTD/xhtml1-strict.dtd"|;
#
# $twig->twig_doctype('html', undef, undef, $internal);
# }
# Start-tag handler for the VRT root element <texts>.
#
# Renames the root element to <teiCorpus>, puts the TEI namespace declaration
# on it and passes it, together with the corpus header, to insertCorpusHeader().
#
#   1st argument - the twig handed in by XML::Twig (not used here)
#   2nd argument - the root element
#   3rd argument - the corpus header element to insert
#
# Returns the return value of insertCorpusHeader().
sub root {
    my (undef, $corpus_root, $header) = @_;

    $corpus_root->set_gi('teiCorpus');
    $corpus_root->set_att(xmlns => 'http://www.tei-c.org/ns/1.0');

    insertCorpusHeader($corpus_root, $header);
}
# Fill the corpus header with metadata derived from the input file name and
# insert it as the first child of the corpus root element.
#
# The last path component of $ARGV[0] is expected to look like
# <source><yyyy>.xml; <source> and <yyyy> become part of the corpus title and
# of the <bibl> text (see set_title() and set_sourceDesc()). If the name does
# not have that form, a warning is printed, the year is left empty and the
# unchanged file name is used as <source>.
#
#   $root         - root element that receives the header as first child
#   $corpusHeader - corpus header element
#
# Reads the file-level $kielipankkiCorpus. Returns the return value of paste().
sub insertCorpusHeader {
    my ($root, $corpusHeader) = @_;

    #---------------------------------------------------------------------------
    # get some metadata for the current output corpus based on source and year
    #---------------------------------------------------------------------------
    my @path_parts = split(/\//, $ARGV[0]);
    my $source     = @path_parts ? $path_parts[-1] : '';

    # Only trust $1 when the substitution really matched: after a failed match
    # it would still hold the capture of some earlier, unrelated match (or be
    # undefined).
    my $yy = '';
    if ($source =~ s/([0-9][0-9][0-9][0-9])\.xml$//) {
        $yy = $1;
    }
    else {
        warn "$0: cannot derive source and year from file name '$ARGV[0]'"
           . " (expected <source><yyyy>.xml)\n";
    }

    #-----------------------
    # set corpus header
    #-----------------------
    set_title(     $corpusHeader, $source, $yy, $kielipankkiCorpus);
    set_sourceDesc($corpusHeader, $source, $yy, $kielipankkiCorpus);

    return $corpusHeader->paste('first_child', $root);
}
#----------------------------
# handler &text for <text>
#----------------------------
# Twig handler for one completely parsed VRT <text> element.
#
# Converts the element in place into a TEI document and flushes it:
#   * the attributes of <text> are handed to createTextHeader(), which inserts
#     the text header, and are then removed;
#   * <text> is renamed to <TEI> and gets a new <text>/<body>/<div> chain;
#   * every <paragraph> child is turned into a <p> and moved into the <div>,
#     every <sentence> child of it into an <s>, and every non-empty line of a
#     sentence into a <w>.
#
#   $twig       - the twig that is parsing the input
#   $text       - the <text> element
#   $textHeader - text header element, passed on to createTextHeader()
#
# Side effects: increments the file-level $textcounter and writes everything
# parsed so far to standard output.
sub text {
    my ($twig, $text, $textHeader) = @_;

    $textcounter++; # global variable

    # ToDo: catch all other, unexpected children of root

    #--------------------------------------------------------------------------
    # Get text metadata (attributes of <text>) and create teiHeader for <text>
    #--------------------------------------------------------------------------
    my $textattsref = $text->atts(); # reference to the attribute hash, to be used with '->'
    createTextHeader($text, $textattsref, $textHeader);

    #--------------------------
    # create <TEI> from <text>
    #--------------------------
    # the attributes were evaluated above and are not wanted on <TEI>
    $text->del_atts;
    $text->set_gi("TEI");

    #------------------------------------------------------------------
    # create the <tei:text>, <body>, <div> elements inside <TEI>
    #------------------------------------------------------------------
    my $ttext_element = XML::Twig::Elt->new('text');
    my $body_element  = XML::Twig::Elt->new('body');
    my $div_element   = XML::Twig::Elt->new('div');

    $div_element  ->set_att("type", "page");    # ToDo: this is specific to KLK
    $ttext_element->set_att("xml:lang", 'fi');  # as in ICC-NOR

    $ttext_element->paste('last_child', $text);
    $body_element ->paste('last_child', $ttext_element);
    $div_element  ->paste('last_child', $body_element);

    #-------------------------------
    # create <p> from <paragraph>
    #-------------------------------
    foreach my $paragraph ($text->children('paragraph')) {
        setP($paragraph);
        $paragraph->move('last_child', $div_element);

        #------------------------------
        # create <s> from <sentence>
        #------------------------------
        foreach my $sentence ($paragraph->children('sentence')) {
            setS($sentence);

            #--------------------------------------
            # create <w> (word) from each $line
            #--------------------------------------
            # text() yields the unescaped character data. xml_text() would
            # yield XML-escaped text, which set_text()/set_att() in createW()
            # escape a second time on output ("&amp;" -> "&amp;amp;").
            my @lines = split(/\n+/, $sentence->text);
            $sentence->set_text("\n");

            for my $line (@lines) { # ToDo: check the order
                next if $line eq "";
                my $w_element = XML::Twig::Elt->new('w');
                createW($w_element, $line);
                $w_element->paste('last_child', $sentence);
            } # end words
        } # end sentences
    } # end paragraphs

    # $twig->set_pretty_print( 'record');
    # flush() expects a filehandle, not a file name; a plain string is not
    # recognised as an output target.
    $twig->flush(\*STDOUT);
}
# Insert the text header as first child of $text and fill it with the
# metadata of the current text.
#
#   $text        - the element that receives the header
#   $textattsref - reference to the hash of the original <text> attributes
#   $textHeader  - header element built from the text header skeleton; the
#                  same element is handed in for every text
#
# Missing attributes and missing date parts are written as empty strings.
# Dies with a readable message if the skeleton lacks one of the elements that
# are filled here. Reads the file-level $textcounter.
sub createTextHeader {
    my ($text, $textattsref, $textHeader) = @_;

    # Attributes of a VRT <text> (example values); "USE" marks the ones meant
    # to go into the header:
    # USE 01 binding_id="2246025"
    # USE 02 date="2021-01-15"
    #     03 datefrom="20210115"
    #     04 dateto="20210115"
    #     05 elec_date="_"
    #     06 file=""
    # USE 07 filename_metadata="finclarin_siirto_k2021/0039-5552/2021/alto/2246025_0039-5552_2021-01-15_SK0221_mets.xml"
    # USE 08 filename_orig ="finclarin_siirto_k2021/0039-5552/2021/alto/2246025_0039-5552_2021-01-15_SK0221_page-2021011502210030301.xml
    # USE 09 id="t-bcd0f3fa-bbd3dac4"
    #     10 img_url=""
    # USE 11 issue_date="15.01.2021"
    # USE 12 issue_no="SK0221"
    # USE 13 issue_title="Suomen Kuvalehti"
    # USE 14 label="Suomen Kuvalehti no. SK0221 15.01.2021"
    # USE 16 language="fi"
    # USE 17 page_id="p1"
    # USE 18 page_no="None"
    #     19 part_name="_"
    #     20 publ_id="0039-5552"
    #     21 publ_part=""
    # USE 22 publ_title="Suomen Kuvalehti"
    # USE 23 publ_type="aikakausi"
    # USE 24 sentcount="70"
    # USE 25 sum_lang="|xxx:44|fin:23|eng:3|"
    #     26 timefrom="000000"
    #     27 timeto="235959"
    # USE 28 tokencount="304"
    #     29 version_added="KLK-fi-2021">

    # attribute value, or the empty string if the attribute is missing
    my $att = sub {
        my ($name) = @_;
        my $value = $textattsref->{$name};
        return defined $value ? $value : '';
    };

    my $BID          = $att->('binding_id');
    my $DATE         = $att->('date');
    my $METAFILENAME = $att->('filename_metadata');
    my $ORIGFILENAME = $att->('filename_orig');
    my $ID           = $att->('id');
    my $LABEL        = $att->('label');
    my $LANGUAGE     = $att->('language');
    my $PAGEID       = $att->('page_id');

    #-----------------------------
    # Derived Metadata variables
    #-----------------------------
    # $DATE is expected as yyyy-mm-dd; parts that are missing stay empty
    my @datearray = split(/-/, $DATE);
    my ($year, $month, $day) = map { defined $datearray[$_] ? $datearray[$_] : '' } 0 .. 2;

    # page_no is mostly "None" in KLK, hence the running text number in the title
    my $title = $LABEL . ", Text #" . $textcounter; # Case KLK

    #-----------------------------------------------------------------------
    # CREATE text-teiHeader ACCORDING TO THE SKELETON in $textHeader
    #-----------------------------------------------------------------------
    # paste() refuses an element that still belongs to a tree. The same header
    # element is re-used for every text, so detach it first in case it is
    # still attached to the previous one.
    $textHeader->cut if $textHeader->parent;
    $textHeader->paste('first_child', $text);

    #-----------------------------------------------
    # <teiHeader>
    #   <fileDesc n="[EuReCo-KLK-FIN_$ID]">
    #     <titleStmt>
    #       <title>[$LABEL, Text #n]</title>
    _header_node($textHeader, './fileDesc/titleStmt/title')->set_text($title);

    #-----------------------------------------------
    #   <fileDesc>
    #     <sourceDesc>
    #       <biblStruct>
    #         <analytic>
    #           <title type="main">[$LABEL, Text #n]</title>
    #           <date type="year">..</date>
    #           <date type="month">..</date>
    #           <date type="day">..</date>
    #           <idno type="PAGEID">$PAGEID</idno>
    #           <idno type="BINDINGID">$BID</idno>
    #           <idno type="ID">$ID</idno>
    #           <idno type="KIELIPANKKI_METAFILENAME">$METAFILENAME</idno>
    #           <idno type="KIELIPANKKI_ORIGFILENAME">$ORIGFILENAME</idno>
    #           <textLang>$LANGUAGE</textLang>
    #         </analytic>
    my $analytic = _header_node($textHeader, './fileDesc/sourceDesc/biblStruct/analytic');

    _header_node($analytic, './title')                                ->set_text($title);
    _header_node($analytic, './date[@type="year"]')                   ->set_text($year);
    _header_node($analytic, './date[@type="month"]')                  ->set_text($month);
    _header_node($analytic, './date[@type="day"]')                    ->set_text($day);
    _header_node($analytic, './idno[@type="PAGEID"]')                 ->set_text($PAGEID);
    _header_node($analytic, './idno[@type="BINDINGID"]')              ->set_text($BID);
    _header_node($analytic, './idno[@type="ID"]')                     ->set_text($ID);
    _header_node($analytic, './idno[@type="KIELIPANKKI_METAFILENAME"]')->set_text($METAFILENAME);
    _header_node($analytic, './idno[@type="KIELIPANKKI_ORIGFILENAME"]')->set_text($ORIGFILENAME);
    _header_node($analytic, './textLang')                             ->set_text($LANGUAGE);

    # TODO: the remaining parts of the header are not filled from the
    # attributes yet. An earlier version of this routine built the whole
    # header element by element and used the following mapping (whether the
    # skeleton contains all of these elements has to be checked first):
    #   fileDesc/@n                                  "EuReCo_KLK-fi_" + 5th part of filename_orig split at "." and "/"
    #   .../biblStruct/monogr/title                  publ_title
    #   .../monogr/imprint/pubPlace                  "TODO", @key="FI"   (imprint is needed for validity)
    #   .../monogr/imprint/publisher                 "TODO"
    #   .../monogr/biblScope[@unit="ISSUETITLE"]     issue_title
    #   .../monogr/biblScope[@unit="ISSUENO"]        issue_no
    #   .../monogr/biblScope[@unit="ISSUEDATE"]      issue_date
    #   .../monogr/biblScope (page)                  page_no   (mostly "None")
    #   encodingDesc/tagsDecl/namespace/tagUsage     @gi="s" @occurs=sentcount, @gi="w" @occurs=tokencount
    #   profileDesc/langUsage/language               @ident=language, @usage=sum_lang
    #                                                (@usage should be an integer: better split sum_lang
    #                                                at "|" and create one <language> per entry)
    #   profileDesc/textClass/classCode[@scheme="KLK_PUBLTYPE"]   publ_type
    #   revisionDesc/change                          @when=current date, @who="HL", text "TEI version for EuReCo"

    ###################################
    # END OF CREATING TEIHEADER
    ###################################
    return;
}

# Return the first node below $parent that matches $xpath; die with a readable
# message if the text header skeleton does not contain such a node.
sub _header_node {
    my ($parent, $xpath) = @_;
    my $node = $parent->get_xpath($xpath, 0);
    die "$0: text header skeleton has no element matching '$xpath'\n"
        unless defined $node;
    return $node;
}
# Turn a VRT <paragraph> element into a TEI <p>, in place.
#
# Example input: <paragraph id="p-bcd0f3fa-bbd3dac4-815ead7a" sum_lang="|fin:1|">
#
#   @sum_lang - its value, prefixed with "x-" (private value), is stored in
#               @xml:lang; the attribute itself is removed
#   @id       - removed, because this id is not unique either
sub setP {
    my ($p_elt) = @_;

    my $lang_summary = $p_elt->att('sum_lang');

    $p_elt->set_gi('p');
    $p_elt->set_att('xml:lang' => 'x-' . $lang_summary);

    $p_elt->del_att('sum_lang');
    # $p_elt->change_att_name('id', 'xml:id');
    $p_elt->del_att('id');
}
# Turn a VRT <sentence> element into a TEI <s>, in place.
#
# Attributes of <sentence>:
#   1 @id="s-bcd0f3fa-bbd3dac4-f7429090"  removed (not unique)
#   2 @lang="fin"                         copied to @xml:lang, then removed
#   3 @lang_conf="0.6734853"              removed for the time being (ToDo: @cert ?)
sub setS {
    my ($s_elt) = @_;

    # ToDo: convert the value / introduce a hash for lookup (input values: "fin", "xxx", ....)
    my $lang_code = $s_elt->att('lang');

    $s_elt->set_gi('s');
    $s_elt->set_att('xml:lang' => $lang_code);

    # $s_elt->change_att_name('id', 'xml:id');
    $s_elt->del_att('id');
    $s_elt->del_att('lang');
    $s_elt->del_att('lang_conf');
}
# Fill a <w> element from one token line of a VRT sentence.
#
#   $w_element - the <w> element to fill
#   $line      - one line of tab-separated positional attributes
#
# Positional attributes (columns) in corpus KLK:
#   USE [0]  word
#   USE [1]  ref       (id for reference of dephead)
#   USE [2]  lemma
#   ?   [3]  lemmacomp (lemma with compound info - could go in @norm, as tag abuse?)
#   USE [4]  pos
#   USE [5]  msd
#   USE [6]  dephead
#   USE [7]  deprel
#       [8]  content   (ocr-process)
#       [9]  vpos      (ocr-process)
#       [10] ocr       (ocr-process)
#       [11] cc        (ocr-process)
#       [12] hyph      (ocr-process)
#       [13] style     (ocr-process)
#       [14] lex       (korp semantic disambiguation from G"oteborg)
sub createW {
    my ($w_element, $line) = @_;

    my @column = split(/\t/, $line);
    my ($word, $ref, $lemma, $pos, $msd) = @column[0, 1, 2, 4, 5];

    # the token itself is the content of <w>
    $w_element->set_text($word);

    $w_element->set_att(n => $ref);
    # An id assembled from page name, sentence id and $ref would not be
    # unique either, so <w> does not get an id at all.
    $w_element->del_att('id');
    $w_element->set_att(lemma => $lemma);
    # @norm for column [3] would be tag abuse and is not set
    $w_element->set_att(pos => $pos);
    $w_element->set_att(msd => $msd);
    # temporarily disabled: @head from column [6], @deprel from column [7]
}
# Write the corpus title into the corpus header.
#
#   $corpusHeader      - corpus header element
#   $source            - name of the source
#   $yy                - year
#   $kielipankkiCorpus - name of the source corpus
#
# Target of the text:
# <teiHeader>
#   <fileDesc>
#     <titleStmt>
#       <title>[Aamulehti2021] from [klk-fi-v2-vrt for] EuReCo</title>
#     </titleStmt>
#     <!-- ... -->
#   </fileDesc>
# </teiHeader>
sub set_title {
    my ($corpusHeader, $source, $yy, $kielipankkiCorpus) = @_;

    my $title_text = "$source $yy, from $kielipankkiCorpus for EuReCo";

    my $file_desc  = $corpusHeader->first_child('fileDesc');
    my $title_stmt = $file_desc->first_child('titleStmt');
    my $title_node = $title_stmt->first_child('title');

    $title_node->set_text($title_text);
}
# Write the bibliographic description of the source into the corpus header.
#
#   $corpusHeader      - corpus header element
#   $source            - name of the source
#   $yy                - year
#   $kielipankkiCorpus - name of the source corpus
#
# Target of the text:
# <teiHeader>
#   <fileDesc>
#     <!-- ... -->
#     <sourceDesc>
#       <bibl>[Aamulehti2021] from [klk-fi-v2-vrt]</bibl>
#     </sourceDesc>
#     <!-- ... -->
#   </fileDesc>
# </teiHeader>
sub set_sourceDesc {
    my ($corpusHeader, $source, $yy, $kielipankkiCorpus) = @_;

    my $bibl_text = "$source $yy, from $kielipankkiCorpus for EuReCo";

    my $file_desc   = $corpusHeader->first_child('fileDesc');
    my $source_desc = $file_desc->first_child('sourceDesc');
    my $bibl_node   = $source_desc->first_child('bibl');

    $bibl_node->set_text($bibl_text);
}
#################
## usage_message
#################
# Print a short usage note and terminate the script.
#
# The note goes to STDERR because STDOUT carries the TEI output and is
# normally redirected into the output file. The exit status is non-zero so
# that calling shell scripts can detect the wrong invocation. The script takes
# the input file as its only argument; the output file is named by redirection.
sub usage_message {
    print STDERR " Usage: ./vrt2tei.pl <file.vrt.xml> > <outfile>\n";
    print STDERR " <file.vrt.xml> is a VRT file converted to proper XML\n";
    print STDERR " the TEI output is written to standard output\n";
    exit 1;
}