blob: 93319a2576aacad155e5429d378b792f7b60a5cb [file] [log] [blame]
## #! /appl/soft/bio/bioperl/5.36.0/bin/perl
## #! /usr/bin/perl -w
###########################################################################################################################################################
# vrt2tei.pl
# eureco
# leibniz-institut fuer deutsche sprache / csc finland esbo
# august 2024
#
#
# using XML::Twig, see http://www.xmltwig.org/, https://metacpan.org/pod/XML::Twig
#
# usage: see the usage_message function below
# Usage: ./vrt2tei.pl [OPTIONS] <vrtxmlfile.xml>   (exactly one input file; the result is written to STDOUT)
# <vrtxmlfile>: xml-ised vrt file
#
#
# TODO:
# 0 ZIPPEN mit tei2korapxml; zippen mit korAP indexing
# 1 bearbeitung von @head und @deprel in tei2korapxml durch Nils?
# 2 threading on compute node and application on sub corpora of KLK
# 2 build 30 billion corpus and index it
# 3 Optionen
# 3a parametrize deprel for I5 and if Nils is not ready yet
#
# 5 abfangen von unerwarteten elementen dh andere als <sentence> und <paragraph>
# 6 checks and balances, wort reihenfolge nochmal checken?
# 7 Encode Kielipankki and National Library of Finland? in teiCorpus Header
# 8 How to encode the CLARIN-RES better - more Info from the CMDI
# 9 construct <idsDoc>s independent of the order of texts, probably with writing intermediate files to zip
# 10 re-implementation of the gawk code in the perl script
# 11 Swedish corpus
#
#
############################################################################################################################################################
use strict;
use warnings;
#use diagnostics;
use Getopt::Std;
use XML::Twig;
use XML::Generator ':pretty'; # apparently no effect when using flush();
use locale; # diese drei Zeilen, damit \b im regex nicht Umlaute und ß matcht.
use POSIX qw(locale_h); # to be able to use setlocale()
#setlocale(LC_ALL,'de_DE');
setlocale(LC_ALL, "fi_FI");
use utf8;
use open qw( :std :encoding(UTF-8) );
use Time::Piece;
use Tie::IxHash;
use Data::Random::String;
#-------------
# get options
#-------------
# -h          display usage info and exit
# -m          mask primary data (word forms and lemmas are replaced by random strings)
# -t <fmt>    output format, "tei" (default) or "i5"
# NOTE(review): -s <arg> is accepted by getopts() but is not used anywhere in this script.
our ($opt_h, $opt_m, $opt_s, $opt_t);
# read switches and print usage info if some bad option was given
# (switches with ':' take an argument; switches without ':' are boolean flags)
if (!getopts('hms:t:')) {
    usage_message(1);    # usage_message() terminates the script itself
    exit 1;              # safety net only
}
# option -h: display usage info and exit successfully
if ($opt_h) {
    usage_message(0);
    exit 0;
}
#--------------------
# check argument(s)
#--------------------
# exactly one argument is expected: the vrt-xml input file
if (@ARGV != 1 || !$ARGV[0]) {
    usage_message(1);
    exit 1;
}
#------------------------------------------------------------
# initialize defaults for options
#------------------------------------------------------------
my $TEIFORMAT = "tei";
my $MASK      = 0;
#----------------------------------------------------------------------------------------------------------
# interpret the options and check whether their respective argument is meaningful (if applicable)
#----------------------------------------------------------------------------------------------------------
# option -t: the value is matched case-insensitively but as a WHOLE string, and is then
# normalised to upper case, because the rest of the script compares against "TEI" / "I5" only
if (defined($opt_t)) {
    $TEIFORMAT = $opt_t;
}
if ($TEIFORMAT !~ /\A(?:tei|i5)\z/i) {
    print STDERR "Error: invalid arg for option -t\n";
    usage_message(1);
    exit 1;
}
$TEIFORMAT = uc($TEIFORMAT);
# option -m
if ($opt_m) {
    $MASK = 1;
}
#-----------------------------------------------
# OTHER GLOBAL VARIABLES
#-----------------------------------------------
my $encoding = "UTF-8"; # this $encoding is ONLY used for the output, see the XML::Twig constructor for the input document
my $DTDDECL = 'idsCorpus PUBLIC "-//IDS//DTD IDS-I5 1.0//EN" "http://corpora.ids-mannheim.de/I5/DTD/i5.dtd"'; # for I5
my $textcounter = 0;    # number of <text> elements handled so far
my $LASTMONTH   = 0;    # bookkeeping for createIdsDoc(): last month that was registered
# lookup tables filled from the sources csv file; keys are the full and the simple source titles
our %corpusids     = ();
our %srcfullnames  = ();
our %srcpubplaces  = ();
our %srcpublishers = ();
our %srctexttypes  = ();
our %srctextlangs  = ();
our %expandLang    = ();
# running text number per month ("01" .. "12"), by the month as in dereko; every counter starts at 1
my %doccounter = map { ( sprintf("%02d", $_) => 1 ) } 1 .. 12;
# global variables pertaining to the original corpus of *all* newspapers:
my $kielipankkiCorpus  = "klk-fi-v2-vrt";
my $kielipankkiLicense = "CLARIN-RES";
my $CountryKey         = "FI";
# Table with metadata about the different sources (newspapers)
my $sourcescsvfile = "sources_klk_fi_v2_2021_4eureco.csv";
# corpusheader and textheader skeletons
my $corpheaderfile = "teiCorpusHeaderSkeleton.tei.xml";
my $textheaderfile = "teiTextHeaderSkeleton.tei.xml";
if ($TEIFORMAT eq "I5") {
    $corpheaderfile = "i5CorpusHeaderSkeleton.i5.xml";
    $textheaderfile = "i5TextHeaderSkeleton.i5.xml";
}
my $twig = "";
# variables $fnsource and $fnyear derived from the filename, which must look like <source><YYYY>.xml
my @array    = split(/\//, $ARGV[0]);
my $fnsource = $array[-1];    # last path component
my $fnyear;
# The capture variable is only read when the substitution succeeded. Otherwise $1 would still
# hold the capture of an earlier successful match in this script and $fnyear would silently
# get a wrong value.
if (defined($fnsource) && $fnsource =~ s/([0-9][0-9][0-9][0-9])\.xml$//) {
    $fnyear = $1;
}
else {
    die("$0: cannot derive source name and year from input file name (expected <source><YYYY>.xml): $ARGV[0]\n");
}
my $fnYY = substr($fnyear, 2, 2);    # two-digit year
# months
# two-digit month numbers "01" .. "12", used as keys of both month tables
my @month_numbers = map { sprintf("%02d", $_) } 1 .. 12;
# month number => three-letter month code
my %months;
@months{@month_numbers} = qw(JAN FEB MAR APR MAY JUN JUL AUG SEP OCT NOV DEC);
# month number => English month name
my %monthnames;
@monthnames{@month_numbers} = qw(
    January February March     April   May      June
    July    August   September October November December
);
# Finnish publication type => German text type label
my %mapping = (
    "aikakausi"   => "Zeitschrift",
    "sanomalehti" => "Zeitung",
);
#-------------------------------------------------------------------------------------------
# read source metadata file (prepared manually => ultimately read the info from CMDI File?)
# and set variables
#
# Expected columns (tab separated), as used below:
#   [0] corpus id, [1] full title, [2] simple title, [4] text type, [5] language,
#   [6] place of publication, [7] publisher
# Every record is registered twice: under its full title and under its simple title.
#-------------------------------------------------------------------------------------------
open(my $SOURCES, '<', $sourcescsvfile)
    or die("$0: cannot open file for reading: $sourcescsvfile: $!");
while (my $fline = <$SOURCES>) {
    chomp($fline);
    # skip comment lines, empty lines and the header line
    next if $fline =~ /^\#/ || $fline =~ /^\s*$/ || $fline =~ "TEICORPUSID";
    my @flarray = split(/\s*\t+\s*/, $fline);    # split each line into array
    # without id and both titles the record cannot be registered at all
    if (@flarray < 3) {
        warn("$0: skipping malformed line $. in $sourcescsvfile (fewer than 3 columns)\n");
        next;
    }
    # set full titles ($flarray[1]) and simple titles ($flarray[2]) as keys:
    # (ToDo: these hashes could probably be conflated into an array of hashes or so)
    for my $titlekey (@flarray[1, 2]) {
        $corpusids{$titlekey}     = $flarray[0];
        $srcfullnames{$titlekey}  = $flarray[1];
        $srcpubplaces{$titlekey}  = $flarray[6];
        $srcpublishers{$titlekey} = $flarray[7];
        $srctexttypes{$titlekey}  = $flarray[4];
        $srctextlangs{$titlekey}  = $flarray[5];
    }
}
close($SOURCES);
# the corpus header is built from the entry of the source named in the input file name;
# point out a missing entry here instead of leaving only "uninitialized value" warnings later
if (!defined($fnsource) || !exists($corpusids{$fnsource})) {
    warn("$0: no metadata found in $sourcescsvfile for source '" . ($fnsource // '') . "'\n");
}
$expandLang{"fi"} = "Finnish";
$expandLang{"sv"} = "Swedish";
#------------------------------------------------------------------
# read the header skeleton documents and start a twig for each
#------------------------------------------------------------------
# both skeletons are parsed with the same settings
my %skeletonTwigOptions = (
    keep_spaces     => 1,
    keep_atts_order => 1,
    comments        => 'drop',
);
#------------------------------------------------------------------
# read corpusHeaderSkeleton document and start a twig for it
#------------------------------------------------------------------
my $teiCorpusHeaderDocTwig = XML::Twig->new(%skeletonTwigOptions);
$teiCorpusHeaderDocTwig->parsefile($corpheaderfile);
my $corpusHeader = $teiCorpusHeaderDocTwig->root;    # getting the header for the corpus out of the corpus header skeleton document
#------------------------------------------------------------------
# read textHeaderSkeleton document and start a twig for it
#------------------------------------------------------------------
my $teiTextHeaderDocTwig = XML::Twig->new(%skeletonTwigOptions);
$teiTextHeaderDocTwig->parsefile($textheaderfile);
my $textHeader = $teiTextHeaderDocTwig->root;        # getting the header for a text out of the text header skeleton document
#---------------------------------------------------------
# define a subtree for idsDoc
# for the time being it will only be used for the first
# idsDoc header, to be inserted in the root handler
#
# resulting structure (I5 only):
#   idsDoc
#     idsHeader
#       fileDesc
#         titleStmt        (dokumentSigle, d.title)
#         publicationStmt  (distributor, pubAddress, availability, pubDate)
#         sourceDesc       (biblStruct > monogr > h.title, imprint)
#---------------------------------------------------------
my $idsDoc       = XML::Twig::Elt->new('idsDoc');
my $idsDocHeader = XML::Twig::Elt->new('idsHeader');
if ($TEIFORMAT eq "I5") {
    # create a new, empty element and append it as the last child of $parent
    my $appendNew = sub {
        my ($parent, $gi) = @_;
        my $elt = XML::Twig::Elt->new($gi);
        $elt->paste("last_child", $parent);
        return $elt;
    };
    $idsDoc->set_att('version' => "1.0", 'TEIform' => "TEI.2");
    $idsDocHeader->set_att(
        'version' => "1.1",
        'type'    => "document",
        'pattern' => "text",
        'TEIform' => "teiHeader",
    );
    $idsDocHeader->paste("last_child", $idsDoc);
    # build the subtree top-down
    my $docFileDesc  = $appendNew->($idsDocHeader, 'fileDesc');
    my $docTitleStmt = $appendNew->($docFileDesc,  'titleStmt');
    my $docSigle     = $appendNew->($docTitleStmt, 'dokumentSigle');
    my $dtitle       = $appendNew->($docTitleStmt, 'd.title');
    my $docPublicationStmt = $appendNew->($docFileDesc, 'publicationStmt');
    for my $gi ('distributor', 'pubAddress', 'availability', 'pubDate') {
        $appendNew->($docPublicationStmt, $gi);
    }
    my $docSourceDesc  = $appendNew->($docFileDesc,   'sourceDesc');
    my $docBiblStruct  = $appendNew->($docSourceDesc, 'biblStruct');
    my $docMonogr      = $appendNew->($docBiblStruct, 'monogr');
    $appendNew->($docMonogr, 'h.title');
    $appendNew->($docMonogr, 'imprint');
    # the first idsDoc is always labelled as the January document of the source
    $docSigle->set_text($corpusids{$fnsource} . $fnYY . "/JAN");
    $dtitle->set_text($srcfullnames{$fnsource} . ", January " . $fnyear);
}
#----------------------------------
# check the input VRT-XML document
#----------------------------------
# The document itself is read by parsefile() below, which works on the file NAME, so no
# file handle is opened here; this is only an early check with a clear error message.
die("$0: cannot open file for reading: $ARGV[0]\n") unless -r $ARGV[0];
#####################
# M A I N
#####################
#-------------------------------------------------------------------------------------------------------------
# start twig for input and call start tag handler for root and twig handler for each <text> in the VRT
#-------------------------------------------------------------------------------------------------------------
$twig = XML::Twig->new(
    keep_spaces        => 1,    # thereby whitespace at former element boundaries is also kept in the output
    keep_atts_order    => 1,    # requires Tie::IxHash
    comments           => 'drop',
    start_tag_handlers => {
        texts => sub { root(@_, $corpusHeader) }
    },
    twig_handlers => {
        # a copy of the text header is handed over because the header is pasted into the
        # current <text> and flushed together with it in the <text> handler
        text => sub { text(@_, $textHeader->copy) }
    },
    output_encoding => $encoding,
);
$twig->parsefile($ARGV[0]);
###########
# END MAIN
###########
##############################
# S U B R O U T I N E S
##############################
# Start tag handler for the root element <texts> of the VRT-XML input.
# Renames the root to the corpus element of the selected output format, sets the
# format specific attributes (and, for I5, the doctype) and inserts the corpus header.
#   $twig          the twig of the input document
#   $root          the root element
#   $corpusHeader  root element of the corpus header skeleton
sub root {
    my ($twig, $root, $corpusHeader) = @_;
    if ($TEIFORMAT ne "I5") {
        $root->set_gi('teiCorpus');
        $root->set_att("xmlns", 'http://www.tei-c.org/ns/1.0');
    }
    else {
        $twig->set_doctype($DTDDECL);    # the doctype could probably be set anywhere to the twig
        $root->set_gi('idsCorpus');
        $root->set_att('version', "1.0");
        $root->set_att('TEIform', "teiCorpus.2");
    }
    insertCorpusHeader($root, $corpusHeader);
}
# Fills the corpus header skeleton with the metadata of the current source and pastes it
# as first child into the root element. For I5 the prepared first <idsDoc> is pasted
# directly after the corpus header.
#   $root          root element of the output document
#   $corpusHeader  root element of the corpus header skeleton
sub insertCorpusHeader {
    my ($root, $corpusHeader) = @_;
    # name of the attribute of <language> that receives the language code
    my $ident = ($TEIFORMAT eq "I5") ? "id" : "ident";
    #-----------------------
    # set corpus header
    #-----------------------
    set_title($corpusHeader, $srcfullnames{$fnsource}, $fnyear, $kielipankkiCorpus);
    if ($TEIFORMAT eq "I5") {
        my $sigleNode        = $corpusHeader->get_xpath("fileDesc/titleStmt/korpusSigle", 0);
        $sigleNode->set_text($corpusids{$fnsource} . $fnYY);
        my $pubDateNode      = $corpusHeader->get_xpath("fileDesc/publicationStmt/pubDate", 0);
        $pubDateNode->set_text((localtime)[5] + 1900);
        my $transductionNode = $corpusHeader->get_xpath('encodingDesc/editorialDecl/transduction[@n="2"]', 0);
        $transductionNode->set_text("I5 version by EuReCo using vrt2tei.pl " . localtime->ymd('-'));
        set_sourceDescI5($corpusHeader);
    }
    elsif ($TEIFORMAT eq "TEI") {
        set_sourceDesc($corpusHeader, $srcfullnames{$fnsource}, $fnyear, $kielipankkiCorpus);
    }
    else {
        print STDERR "TEIFORMAT is $TEIFORMAT; must be TEI or I5";
    }
    $corpusHeader->paste("first_child", $root);
    # language of the source: code in the attribute, expanded name as text
    my $languageNode = $corpusHeader->get_xpath('./profileDesc/langUsage/language', 0);
    $languageNode->set_att($ident, $srctextlangs{$fnsource});
    $languageNode->set_text($expandLang{$srctextlangs{$fnsource}});
    if ($TEIFORMAT eq "I5") {
        $idsDoc->paste("after", $corpusHeader);
    }
}
#----------------------------
# handler &text for <text>
#----------------------------
# Twig handler, called for every completely parsed <text> of the VRT-XML input.
# Turns the <text> into a <TEI> (or <idsText>) element with header, converts
# <paragraph>/<sentence> into <p>/<s>, creates one <w> per token line and
# finally flushes the twig.
#   $twig        the twig of the input document
#   $text        the <text> element
#   $textHeader  a copy of the text header skeleton, to be filled for this text
sub text {
    my ($twig, $text, $textHeader) = @_;
    $textcounter++;
    # ToDo: catch all other, unexpected children of root
    #--------------------------------------------------------------------------
    # Get text metadata (attributes of <text>) and create teiHeader for <text>
    #--------------------------------------------------------------------------
    my $textattsref = $text->atts();    # $textattsref is now a reference to a hash and should be used with '->'
    createIdsDoc($textattsref);         # this creation of idsDoc will only be called for the 2nd idsDoc (i.e. february) or higher
    # createTextHeader returns the $textID:
    my $textID = createTextHeader($text, $textattsref, $textHeader);
    #----------------------------------------
    # create <TEI> or <idsText> from <text>
    #----------------------------------------
    # set vrt <text> to <TEI> and delete all attributes after they were saved above
    $text->del_atts;
    if ($TEIFORMAT eq "TEI") {
        $text->set_gi("TEI");
        $text->set_att('xml:id', $textID);
    }
    else {
        $text->set_gi("idsText");
        $text->set_att('version', "1.0");
        # $text ->move("last_child", $idsDoc); # does not work because apparently $idsDoc is not under $root at this point
    }
    #------------------------------------------------------------------
    # create the <tei:text>, <body>, <div> elements inside <TEI>
    #------------------------------------------------------------------
    my $ttext_element = XML::Twig::Elt->new('text');
    my $body_element  = XML::Twig::Elt->new('body');
    my $div_element   = XML::Twig::Elt->new('div');
    # set atts
    $div_element->set_att("type", "page");        # ToDo: this is specific to KLK
    $ttext_element->set_att("xml:lang", 'fi');    # as in ICC-NOR
    # paste
    $ttext_element->paste('last_child', $text);
    $body_element->paste('last_child', $ttext_element);
    $div_element->paste('last_child', $body_element);
    #-------------------------------
    # create <p> from <paragraph>
    #-------------------------------
    my @paragraphs = $text->children('paragraph');
    foreach my $paragraph (@paragraphs) {
        setP($paragraph);
        $paragraph->move('last_child', $div_element);
        #------------------------------
        # create <s> from <sentence>
        #------------------------------
        my @sentences = $paragraph->children('sentence');
        foreach my $sentence (@sentences) {
            setS($sentence);
            #--------------------------------------
            # create <w> (word) from each $line
            #--------------------------------------
            # The UNescaped text is taken here: the values are written back with set_text() and
            # set_att(), and XML::Twig escapes '&' and '<' itself on output. Starting from the
            # already escaped xml_text would escape such tokens twice (e.g. "&amp;amp;").
            my @lines = split(/\n+/, $sentence->text);
            $sentence->set_text("\n");
            for my $line (@lines) {    # Todo: check the order
                next if $line eq "";
                my $w_element = XML::Twig::Elt->new('w');
                createW($w_element, $line);
                $w_element->paste('last_child', $sentence);
            }    # end words
        }    # end sentences
    }    # end paragraphs
    # $twig->set_pretty_print( 'record');
    # $twig->flush($OUT);
    $twig->flush("/dev/stdout");
}
# Fills the text header skeleton with the metadata of one <text> and pastes it as first
# child into that <text>.
#   $text         the <text> element
#   $textattsref  reference to the hash of the attributes of <text>
#   $textHeader   root element of (a copy of) the text header skeleton
# Returns the text id, e.g. SUK21_JAN.00001 (for I5 the sigle written into the header
# uses '/' instead of '_').
sub createTextHeader{
    my ($text, $textattsref, $textHeader) = @_;
    # Attributes of <text> in KLK ("USE": read into a variable below):
    # USE 01 binding_id="2246025"
    # USE 02 date="2021-01-15"
    #     03 datefrom="20210115"
    #     04 dateto="20210115"
    #     05 elec_date="_"
    #     06 file=""
    # USE 07 filename_metadata="finclarin_siirto_k2021/0039-5552/2021/alto/2246025_0039-5552_2021-01-15_SK0221_mets.xml"
    # USE 08 filename_orig ="finclarin_siirto_k2021/0039-5552/2021/alto/2246025_0039-5552_2021-01-15_SK0221_page-2021011502210030301.xml
    # USE 09 id="t-bcd0f3fa-bbd3dac4"
    #     10 img_url=""
    # USE 11 issue_date="15.01.2021"
    # USE 12 issue_no="SK0221"
    # USE 13 issue_title="Suomen Kuvalehti"
    # USE 14 label="Suomen Kuvalehti no. SK0221 15.01.2021"
    # USE 16 language="fi"
    # USE 17 page_id="p1"
    # USE 18 page_no="None"
    #     19 part_name="_"
    #     20 publ_id="0039-5552"
    #     21 publ_part=""
    # USE 22 publ_title="Suomen Kuvalehti"
    # USE 23 publ_type="aikakausi"
    # USE 24 sentcount="70"
    # USE 25 sum_lang="|xxx:44|fin:23|eng:3|"
    #     26 timefrom="000000"
    #     27 timeto="235959"
    # USE 28 tokencount="304"
    #     29 version_added="KLK-fi-2021">
    # (some of the variables are currently only referenced by the disabled statements below)
    my $BID          = $textattsref->{'binding_id'};
    my $DATE         = $textattsref->{'date'};
    my $METAFILENAME = $textattsref->{'filename_metadata'};
    my $ORIGFILENAME = $textattsref->{'filename_orig'};
    my $ID           = $textattsref->{'id'};
    my $ISSUEDATE    = $textattsref->{'issue_date'};
    my $ISSUENO      = $textattsref->{'issue_no'};
    my $ISSUETITLE   = $textattsref->{'issue_title'};
    my $LABEL        = $textattsref->{'label'};
    my $LANGUAGE     = $textattsref->{'language'};
    my $PAGEID       = $textattsref->{'page_id'};
    my $PAGENO       = $textattsref->{'page_no'};
    my $PUBLTITLE    = $textattsref->{'publ_title'};
    my $PUBLTYPE     = $textattsref->{'publ_type'};
    my $SENTCOUNT    = $textattsref->{'sentcount'};
    my $SUMLANG      = $textattsref->{'sum_lang'};
    my $TOKENCOUNT   = $textattsref->{'tokencount'};
    #-----------------------------
    # Derived Metadata variables
    #-----------------------------
    # @date is expected as YYYY-MM-DD
    my @datearray = split("-", $DATE // '');
    if (@datearray != 3 || !exists($months{$datearray[1]})) {
        warn("$0: text " . ($ID // '?') . ": unexpected value of \@date: '" . ($DATE // '') . "' (expected YYYY-MM-DD)\n");
    }
    # values from the lookup tables; an unknown title or type yields empty element content
    my $pubPlace   = defined($PUBLTITLE) ? ($srcpubplaces{$PUBLTITLE}  // '') : '';
    my $publisher  = defined($PUBLTITLE) ? ($srcpublishers{$PUBLTITLE} // '') : '';
    my $mappedType = defined($PUBLTYPE)  ? ($mapping{$PUBLTYPE}        // '') : '';
    #----------------------------------------------------
    # create textSigle to be returned from this function
    #----------------------------------------------------
    # SUK21.JAN.00001
    my $yy     = substr($datearray[0], 2, 2);    # two-digit year
    my $mm     = $datearray[1];                  # two-digit month
    my $MMM    = $months{$mm};
    my $CSIGLE = $corpusids{$fnsource} . $yy;
    # the running number counts the texts per month
    my $textID    = $CSIGLE . "_" . $MMM . "." . sprintf("%05d", $doccounter{$mm}++);
    my $textSigle = $textID;
    #-----------------------------------------------------------------------
    # CREATE text-teiHeader ACCORDING TO THE SKELETON in $textHeader
    #-----------------------------------------------------------------------
    $textHeader->paste('first_child', $text);
    #-----------------------------------------------
    # <teiHeader>
    #   <fileDesc n="EuReCo-KLK-FIN_[$ID]">
    #     <titleStmt>
    #       <title>[$LABEL, page $PAGENO]</title>
    my $fileDesc = $textHeader->first_child("fileDesc");
    $fileDesc->set_att('n', "EuReCo-". $kielipankkiCorpus . "-" . $ID);
    #-----------------
    # titleStmt
    #----------------
    my $title     = "title";
    my $titleStmt = $fileDesc->first_child("titleStmt");
    if ($TEIFORMAT eq "I5") {
        $title = "t.title";
        $textSigle =~ s/_/\//g;
        $titleStmt->first_child("textSigle")->set_text($textSigle);
    }
    $titleStmt->first_child($title)->set_text($LABEL . ", Text #" . $textcounter);
    # Case KLK: PAGENO mostly seems to be "None"
    #-----------------------------------------------
    # <fileDesc>
    #   <sourceDesc>
    #     <biblStruct>
    #       <analytic>
    #         <title type="main">[$LABEL, page $PAGENO]</title>
    #         <date>[$DATE]</date>
    #         <date type="year">TODO</date>
    #         <date type="month">TODO</date>
    #         <date type="day">TODO</date>
    #         <idno type="PAGEID">$PAGEID</idno>
    #         <idno type="BINDINGID">$BID</idno>
    #         <idno type="ID">$ID</idno>
    #         <idno type="KIELIPANKKI_METAFILENAME">$METAFILENAME</idno>
    #         <idno type="KIELIPANKKI_ORIGFILENAME">$ORIGFILENAME</idno>
    #         <textLang>$LANGUAGE</textLang>
    my $analytic = $textHeader->get_xpath("./fileDesc/sourceDesc/biblStruct/analytic", 0);
    if ($TEIFORMAT eq "I5") { $title = "h.title" }
    $analytic->first_child($title)->set_text($LABEL . ", Text #" . $textcounter);    # Case KLK; PAGENO mostly seems to be "None"
    #$analytic->get_xpath('./idno[@type="PAGEID"]', 0) ->set_text($PAGEID);
    #$analytic->get_xpath('./idno[@type="BINDINGID"]', 0) ->set_text($BID);
    #$analytic->get_xpath('./idno[@type="ID"]', 0) ->set_text($ID);
    #$analytic->get_xpath('./idno[@type="KIELIPANKKI_METAFILENAME"]', 0) ->set_text($METAFILENAME);
    #$analytic->get_xpath('./idno[@type="KIELIPANKKI_ORIGFILENAME"]', 0) ->set_text($ORIGFILENAME);
    if ($TEIFORMAT eq "TEI") {
        $analytic->first_child('textLang')->set_text($LANGUAGE);
    }
    #       <monogr>
    #         <title>$PUBLTITLE</title>
    #         <imprint>
    #           <pubPlace>TODO</pubPlace>
    #           <publisher>TODO</publisher>
    #         </imprint>
    #         <biblScope unit="ISSUETITLE"/>
    #         <biblScope unit="ISSUENO"/>
    #         <biblScope unit="ISSUEDATE"/>
    #         <biblScope unit="pp">$PAGENO</biblScope>
    my $monogr = $textHeader->get_xpath("./fileDesc/sourceDesc/biblStruct/monogr", 0);
    $monogr->first_child($title)->set_text($PUBLTITLE);
    if ($TEIFORMAT eq "TEI") {
        $monogr->get_xpath('./imprint/date[@type="date"]', 0)->set_text($DATE);
    }
    my $date = "date";
    if ($TEIFORMAT eq "I5") { $date = "pubDate" }
    $monogr->get_xpath('./imprint/' . $date . '[@type="year"]', 0)->set_text($datearray[0]);
    $monogr->get_xpath('./imprint/' . $date . '[@type="month"]', 0)->set_text($datearray[1]);
    $monogr->get_xpath('./imprint/' . $date . '[@type="day"]', 0)->set_text($datearray[2]);
    my $imprint = $monogr->first_child("imprint");    # imprint is needed for tei validity
    $imprint->first_child("pubPlace")->set_text($pubPlace);
    $imprint->first_child("pubPlace")->set_att('key', $CountryKey);
    $imprint->first_child("publisher")->set_text($publisher);
    #$monogr->get_xpath('./biblScope[@unit="ISSUETITLE"]', 0) ->set_text($ISSUETITLE);
    #$monogr->get_xpath('./biblScope[@unit="ISSUENO"]', 0) ->set_text($ISSUENO);
    #$monogr->get_xpath('./biblScope[@unit="ISSUEDATE"]', 0) ->set_text($ISSUEDATE);
    #$monogr->get_xpath('./biblScope[@unit="pp"]', 0) ->set_text($PAGENO); # Attention - PAGENO mostly seems to be "None"
    my $dateNice      = $datearray[2] . "." . $datearray[1] . "." . $datearray[0];    # DD.MM.YYYY
    my $dateBackwards = $datearray[0] . "." . $datearray[1] . "." . $datearray[2];    # YYYY.MM.DD
    if ($TEIFORMAT eq "I5") {
        my $refCompleteText = $textSigle . " " . $PUBLTITLE . " no. " . $ISSUENO . ", [" . $PUBLTYPE . "], " . $dateNice;
        my $refShortText    = $textSigle . " " . $PUBLTITLE . ", " . $dateNice;
        $textHeader->get_xpath('./fileDesc/sourceDesc/reference[@type="complete"]', 0)->set_text($refCompleteText);
        $textHeader->get_xpath('./fileDesc/sourceDesc/reference[@type="short"]', 0)->set_text($refShortText);
    }
    # <encodingDesc>
    #   <tagsDecl>
    #     <namespace name="http://www.tei-c.org/ns/1.0">
    #       <tagUsage gi="s" occurs="SENTCOUNT"/>
    #       <tagUsage gi="w" occurs="TOKENCOUNT"/>
    my $namespacePath = "./encodingDesc/tagsDecl/namespace/";
    if ($TEIFORMAT eq "I5") { $namespacePath = "./encodingDesc/tagsDecl/" }
    $textHeader->get_xpath($namespacePath . 'tagUsage[@gi="s"]', 0)->set_att('occurs', $SENTCOUNT);
    $textHeader->get_xpath($namespacePath . 'tagUsage[@gi="w"]', 0)->set_att('occurs', $TOKENCOUNT);
    # <profileDesc>
    #   <langUsage>
    #     <language ident="fi" usage="|xxx:44|fin:23|eng:3|"/>
    #   </langUsage>
    #   <textClass>
    #     <classCode scheme="kielipankki_klk">$PUBLTYPE</classCode>
    #     <classCode scheme="kielipankki_klk_mapped">$mapping{$PUBLTYPE}</classCode>
    if ($TEIFORMAT eq "I5") {
        $textHeader->get_xpath('./profileDesc/creation/creatDate', 0)->set_text($dateBackwards);
    }
    if ($TEIFORMAT eq "TEI") {
        my $language = $textHeader->get_xpath('./profileDesc/langUsage/language', 0);
        $language->set_att('ident', $LANGUAGE);
        $language->set_att('usage', $SUMLANG);
    }
    # ToDo: @usage should actually hold an integer; ideally split up the content of SUMLANG
    # and create several <language> elements
    $textHeader->get_xpath('./profileDesc/textClass/classCode[@scheme="kielipankki_klk"]', 0)->set_text($PUBLTYPE);
    $textHeader->get_xpath('./profileDesc/textClass/classCode[@scheme="kielipankki_klk_mapped"]', 0)->set_text($mappedType);
    # <revisionDesc>
    #   <change when="TODO" who="HL">TEI version for EuReCo</change>
    $textHeader->get_xpath('./revisionDesc/change', 0)->set_att('when', localtime->ymd('-'));
    #-----------------------------------
    # END OF CREATING TEIHEADER
    #-----------------------------------
    return $textID;
}
# Turns a VRT <paragraph> into a <p>.
# <paragraph id="p-bcd0f3fa-bbd3dac4-815ead7a" sum_lang="|fin:1|">
# atts of <paragraph>:
#   @id        dropped (this id is not unique either!)
#   @sum_lang  moved to @xml:lang, the value prefixed with "x-" for private value
sub setP {
    my ($paragraph) = @_;
    my $privateLang = "x-" . $paragraph->att("sum_lang");
    $paragraph->set_gi('p');
    $paragraph->set_att("xml:lang", $privateLang);
    # $paragraph->change_att_name('id', 'xml:id');
    $paragraph->del_att("sum_lang", "id");
}
# Turns a VRT <sentence> into an <s>.
# the atts of <sentence>:
#   1 @id="s-bcd0f3fa-bbd3dac4-f7429090"  dropped (not unique)
#   2 @lang="fin"                          moved to @xml:lang
#   3 @lang_conf="0.6734853"               dropped for the time being (ToDo: @cert ?)
sub setS {
    my ($sentence) = @_;
    # ToDo: convert the value / introduce a hash for lookup (input values: "fin", "xxx", ....)
    my $lang = $sentence->att("lang");
    $sentence->set_gi('s');
    $sentence->set_att("xml:lang", $lang);
    # $sentence->change_att_name('id', 'xml:id'); # not unique
    $sentence->del_att('id', "lang", "lang_conf");
}
# Fills a <w> element from one token line of the VRT.
#   $w_element  the (new) <w> element
#   $line       one token line: tab separated word form and positional attributes
#
# vrt word and positional-attributes in corpus KLK:
# USE [0]  word
# USE [1]  ref (id for reference of dephead)
# USE [2]  lemma
# ?   [3]  lemmacomp (lemma with compound info - could go in @norm, as tag abuse?)
# USE [4]  pos
# USE [5]  msd
# USE [6]  dephead
# USE [7]  deprel
#     [8]  content (ocr-process)
#     [9]  vpos (ocr-process)
#     [10] ocr (ocr-process)
#     [11] cc (ocr-process)
#     [12] hyph (ocr-process)
#     [13] style (ocr-process)
#     [14] lex (korp semantic disambiguation from G"oteborg)
sub createW {
    my ($w_element, $line) = @_;
    #---------------------------
    # Get the tags (=columns)
    #---------------------------
    my @tags = split(/\t/, $line);
    # word string and lemma string: taken from the line unless masking applies
    my ($w_string, $l_string) = ($tags[0], $tags[2]);
    if ($MASK && ($tags[4] ne "Punct")) {
        # masked: a random alphabetic string of the same length stands in for word AND lemma
        $w_string = Data::Random::String->create_random_string(length=>length($tags[0]), contains=>'alpha');
        $l_string = $w_string;
    }
    $w_element->set_text($w_string);
    # set the attributes of <w>:
    $w_element->set_att("n", $tags[1]);
    # $w_element->set_att("id", "w_" . $namearray[4] . $sentence->att("xml:id") . "_" . $tags[1]);
    # an ID assembled like this is not unique either...
    $w_element->del_att("id");
    # $w_element->set_att("norm", $tags[3]); # tag abuse of @norm
    $w_element->set_att(
        "lemma" => $l_string,
        "pos"   => $tags[4],
        "msd"   => $tags[5],
    );
    if ($TEIFORMAT eq "I5") {    # remove condition when part of the official TEI
        $w_element->set_att("head" => $tags[6], "deprel" => $tags[7]);
    }
}
# Writes the corpus title into the corpus header:
#<teiHeader>
#  <fileDesc>
#    <titleStmt>
#      <title>[Aamulehti2021] from [klk-fi-v2-vrt for] EuReCo</title>
#    </titleStmt>
#    <!-- ... -->
#  </fileDesc>
#</teiHeader>
# (for I5 the title element is <c.title>)
#   $corpusHeader       root element of the corpus header
#   $source             name of the source
#   $year               year of the corpus
#   $kielipankkiCorpus  name of the original corpus
sub set_title {
    my ($corpusHeader, $source, $year, $kielipankkiCorpus) = @_;
    my $titleElement = ($TEIFORMAT eq "I5") ? "c.title" : "title";
    my $cTitleString = "$source $year, from $kielipankkiCorpus for EuReCo";
    my $fileDesc     = $corpusHeader->first_child("fileDesc");
    my $titleStmt    = $fileDesc->first_child("titleStmt");
    $titleStmt->first_child($titleElement)->set_text($cTitleString);
}
# Writes the bibliographic note into the source description of the corpus header:
#<teiHeader>
#  <fileDesc>
#    <!-- ... -->
#    <sourceDesc>
#      <bibl>[Aamulehti2021] from [klk-fi-v2-vrt]</bibl>
#    </sourceDesc>
#    <!-- ... -->
#  </fileDesc>
#</teiHeader>
#   $corpusHeader       root element of the corpus header
#   $source             name of the source
#   $year               year of the corpus
#   $kielipankkiCorpus  name of the original corpus
sub set_sourceDesc {
    my ($corpusHeader, $source, $year, $kielipankkiCorpus) = @_;
    my $cBiblString = "$source $year, from $kielipankkiCorpus for EuReCo";
    my $fileDesc    = $corpusHeader->first_child("fileDesc");
    my $sourceDesc  = $fileDesc->first_child("sourceDesc");
    $sourceDesc->first_child("bibl")->set_text($cBiblString);
}
# Fills the source description of the I5 corpus header with the metadata of the source
# that was derived from the input file name:
#<idsHeader>
#  <fileDesc>
#    <!-- ... -->
#    <sourceDesc>
#      <biblStruct>
#        <monogr>
#          <h.title type="main">[$PUBLTITLE], [$YEAR]</h.title>
#          <imprint>
#            <publisher>[$PUBLISHER]</publisher>
#            <pubPlace key="[$TL]">[$PUBPLACE]</pubPlace>
#          </imprint>
#        </monogr>
#      </biblStruct>
#      <reference type="super" assemblage="regular">[$KKK] [$PUBLTITLE]; [$PUBPLACE]: [$PUBLISHER], [$YEAR]</reference>
#    </sourceDesc>
#    <!-- ... -->
#  </fileDesc>
#</idsHeader>
#   $corpusHeader  root element of the I5 corpus header
sub set_sourceDescI5 {
    my ($corpusHeader) = @_;
    my $YEAR      = $fnyear;
    my $YY        = substr($fnyear, 2, 2);
    my $PUBLTITLE = $srcfullnames{$fnsource};
    my $PUBLPLACE = $srcpubplaces{$PUBLTITLE};
    my $PUBLISHER = $srcpublishers{$PUBLTITLE};
    my $CSIGLE    = $corpusids{$PUBLTITLE} . $YY;
    my $cSourceDesc = $corpusHeader->first_child("fileDesc")->first_child("sourceDesc");
    my $cMonogr     = $cSourceDesc->first_child("biblStruct")->first_child("monogr");
    $cMonogr->first_child("h.title")->set_text($PUBLTITLE);
    my $cImprint = $cMonogr->first_child("imprint");
    $cImprint->first_child("publisher")->set_text($PUBLISHER);
    my $cPubPlace = $cImprint->first_child("pubPlace");
    $cPubPlace->set_text($PUBLPLACE);
    $cPubPlace->set_att('key', $CountryKey);
    $cSourceDesc->first_child("reference")->set_text("${CSIGLE} ${PUBLTITLE}; ${PUBLPLACE}: ${PUBLISHER}, ${YEAR}");
}
# Prints the <idsDoc> (with its document header) of a new month to STDOUT as soon as the
# first text of that month is seen. Called for every <text>. The idsDoc of January is not
# printed here: it is inserted once by insertCorpusHeader().
# The texts are expected to be ordered by month; a text of an earlier month than the last
# registered one does not trigger anything.
#   $textattsref  reference to the hash of the attributes of <text> (@date and @publ_title are used)
# NOTE(review): in TEI mode the string is empty, so only an empty line is printed at a change of month.
sub createIdsDoc{
    my ($textattsref) = @_;
    my $DATE      = $textattsref->{'date'};
    my $PUBLTITLE = $textattsref->{'publ_title'};
    my @datearray = split("-", $DATE // '');
    my $MONTH     = $datearray[1];
    my $YEAR      = $datearray[0];
    # The header below is assembled as a string, not via XML::Twig, so text values
    # have to be escaped here to keep the output well-formed.
    my $xmlEscape = sub {
        my ($string) = @_;
        $string = '' unless defined($string);
        $string =~ s/&/&amp;/g;
        $string =~ s/</&lt;/g;
        $string =~ s/>/&gt;/g;
        return $string;
    };
    my $idsDocString = "";
    if ($TEIFORMAT eq "I5") {
        my $YY        = substr($YEAR // '', 2, 2);
        my $CSIGLE    = $xmlEscape->(($corpusids{$PUBLTITLE // ''} // '') . $YY);
        my $DOCID     = $xmlEscape->($months{$MONTH // ''});
        my $MONTHNAME = $xmlEscape->($monthnames{$MONTH // ''});
        my $docTitle  = $xmlEscape->($PUBLTITLE);
        my $docYear   = $xmlEscape->($YEAR);
        my $license   = $xmlEscape->($kielipankkiLicense);
        $idsDocString = "
<idsDoc version=\"1.0\" TEIform=\"TEI.2\">
<idsHeader type=\"document\" pattern=\"text\" version=\"1.1\" TEIform=\"teiHeader\">
<fileDesc>
<titleStmt>
<dokumentSigle>$CSIGLE/$DOCID</dokumentSigle>
<d.title>$docTitle, $MONTHNAME $docYear</d.title>
</titleStmt>
<publicationStmt>
<distributor/>
<pubAddress/>
<availability region=\"world\">$license</availability>
<pubDate/>
</publicationStmt>
<sourceDesc>
<biblStruct>
<monogr>
<h.title/>
<imprint/>
</monogr>
</biblStruct>
</sourceDesc>
</fileDesc>
</idsHeader>
</idsDoc>\n";
    }
    # A new idsDoc is due whenever a LATER month than the last registered one shows up.
    # Comparing with '>' instead of '== $LASTMONTH + 1' keeps this working when a month
    # has no texts at all.
    my $monthNumber = (defined($MONTH) && $MONTH =~ /\A[0-9]+\z/) ? $MONTH + 0 : 0;
    if ($monthNumber > $LASTMONTH) {
        if ($monthNumber > 1) {
            printf("%s\n", $idsDocString);
        }
        $LASTMONTH = $monthNumber;
    }
}
#################
## usage_message
#################
# Prints the usage information to STDERR and terminates the script.
#   $status  (optional) exit status, a non-negative integer; default 0.
#            Error paths can pass a non-zero value.
sub usage_message {
    my ($status) = @_;
    $status = 0 unless defined($status) && $status =~ /\A[0-9]+\z/;
    print STDERR "Usage: ./vrt2tei.pl [OPTIONS] <file.vrt.xml>\n";
    print STDERR " <file.vrt.xml> is a VRT file as proper (well-formed) XML\n";
    print STDERR " Options:\n";
    print STDERR " -h display this usage info and exit\n";
    print STDERR " -t (tei|i5) output format, default: tei\n";
    print STDERR " -m mask primary data\n";
    print STDERR "\n";
    exit($status);
}