idsDoc partitions and case switches for I5
diff --git a/vrt2tei.pl b/vrt2tei.pl
index 6c280ba..63dd6fd 100755
--- a/vrt2tei.pl
+++ b/vrt2tei.pl
@@ -48,7 +48,7 @@
use strict;
use warnings;
-use diagnostics;
+#use diagnostics;
use Getopt::Std;
use XML::Twig;
@@ -64,7 +64,7 @@
use Time::Piece;
use Tie::IxHash;
-
+use Data::Random::String;
#----------------------
@@ -97,6 +97,7 @@
my $textcounter = 0;
+my $LASTMONTH = 0;
our %corpussigles = ();
our %srcfullnames = ();
@@ -105,7 +106,10 @@
our %srctexttypes = ();
our %srctextlangs = ();
-my %doccounter = ( # by the month as in derekox
+our %expandLang = ();
+
+
+my %doccounter = ( # by the month as in dereko
"01" => 1,
"02" => 1,
"03" => 1,
@@ -121,12 +125,13 @@
);
+
my $sourcescsvfile = "sources_klk_fi_v2_2021_4eureco.csv";
my $corpheaderfile = "teiCorpusHeaderSkeleton.tei.xml";
my $textheaderfile = "teiTextHeaderSkeleton.tei.xml";
if($TEIFORMAT eq "I5"){
- # $corpheaderfile = "i5CorpusHeaderSkeleton.i5.xml";
+ $corpheaderfile = "i5CorpusHeaderSkeleton.i5.xml";
$textheaderfile = "i5TextHeaderSkeleton.i5.xml";
}
@@ -134,8 +139,20 @@
my $twig="";
# global variables pertaining to the original corpus :
-my $kielipankkiCorpus = "klk-fi-v2-vrt";
+my $kielipankkiCorpus = "klk-fi-v2-vrt";
+my $kielipankkiLicense = "CLARIN-RES";
+
+# variables $fnsource and $fnyear taken from the filename:
+# the last path component of $ARGV[0] is expected to end in <YYYY>.xml;
+# the four digits become $fnyear, whatever precedes them becomes $fnsource
+my @array = split(/\//, $ARGV[0]);
+my $l = scalar(@array);
+my $fnsource = $array[$l-1];
+
+# Read $1 only if the substitution really matched: after a failed match $1 is
+# either undefined or still holds the capture of an earlier successful match.
+my $fnyear;
+if($fnsource =~ s/([0-9][0-9][0-9][0-9])\.xml$//){
+  $fnyear = $1;
+}
+else{
+  print STDERR "$0: cannot derive source name and year from file name: $ARGV[0]\n";
+}
+
+
+# months
my %months = (
"01" => "JAN",
"02" => "FEB",
@@ -151,12 +168,60 @@
"12" => "DEC",
);
+# Full English month names keyed by two-digit month number; createIdsDoc looks
+# the name up when it builds the <d.title> of a monthly idsDoc header.
+my %monthnames = (
+ "01" => "January",
+ "02" => "February",
+ "03" => "March",
+ "04" => "April",
+ "05" => "May",
+ "06" => "June",
+ "07" => "July",
+ "08" => "August",
+ "09" => "September",
+ "10" => "October",
+ "11" => "November",
+ "12" => "December",
+ );
+
my %mapping = ();
$mapping{"aikakausi"} = "Zeitschrift";
$mapping{"sanomalehti"} = "Zeitung";
+#-------------------------------------------------------------------------------------------
+# read source metadata file (prepared manually => ultimately read the info from CMDI File?)
+# and set variables
+#-------------------------------------------------------------------------------------------
+
+# Every data line is split on tabs. Judging by the hashes filled below, field 0 is the
+# corpus sigle, 1 the full title, 2 the simple title, 4 the text type, 5 the text
+# language, 6 the place of publication and 7 the publisher.
+# Three-argument open keeps the file name from being interpreted as part of the mode;
+# $! tells why the open failed.
+open(my $SOURCES, "<", $sourcescsvfile) || die("$0: cannot open file for reading: $sourcescsvfile: $!");
+while(my $fline = <$SOURCES>){
+  chomp($fline);
+
+  # skip line if empty line or comment line or first line (the one containing TEICORPUSID)
+  next if $fline =~ /^\#/ || $fline =~ /^\s*$/ || $fline =~ /TEICORPUSID/;
+
+  my @flarray = split(/\s*\t+\s*/, $fline); # split each line into array
+
+  # register the source under its full title ($flarray[1]) and also under its
+  # simple title ($flarray[2]), so that either spelling can be used for look-ups:
+  for my $titlekey ($flarray[1], $flarray[2]){
+    $corpussigles{$titlekey}  = $flarray[0];
+    $srcfullnames{$titlekey}  = $flarray[1];
+    $srcpubplaces{$titlekey}  = $flarray[6];
+    $srcpublishers{$titlekey} = $flarray[7];
+    $srctexttypes{$titlekey}  = $flarray[4];
+    $srctextlangs{$titlekey}  = $flarray[5];
+  }
+}
+close($SOURCES);
+
+$expandLang{"fi"} = "Finnish";
+$expandLang{"sv"} = "Swedish";
+
#------------------------------------------------------------------
# read corpusHeaderSkeleton document and get header out of it
#------------------------------------------------------------------
@@ -187,24 +252,50 @@
my $idsDoc = XML::Twig::Elt->new('idsDoc');
-if($TEIFORMAT eq "I5"){
+my $idsDocHeader = XML::Twig::Elt->new('idsHeader');
- my $idsDocHeader = XML::Twig::Elt->new('idsHeader');
+if($TEIFORMAT eq "I5"){
+ # I5 only: build the skeleton of the document-level header and hang it into $idsDoc.
+ # All elements are created empty; only dokumentSigle and d.title receive text (see below).
my $docFileDesc = XML::Twig::Elt->new('fileDesc');
my $docTitleStmt = XML::Twig::Elt->new('titleStmt');
my $dtitle = XML::Twig::Elt->new('d.title');
my $docSigle = XML::Twig::Elt->new('dokumentSigle');
- $docSigle -> paste("first_child", $docTitleStmt);
- $dtitle -> paste("last_child", $docTitleStmt);
- $docTitleStmt -> paste("last_child", $docFileDesc);
- $docFileDesc -> paste("last_child", $idsDocHeader);
+ # children of <publicationStmt>; the element name is pubAddress (double "d"),
+ # the same spelling as in the header string printed by createIdsDoc
+ my $docPublicationStmt = XML::Twig::Elt->new('publicationStmt');
+ my $docDistributor = XML::Twig::Elt->new('distributor');
+ my $docPubAddress = XML::Twig::Elt->new('pubAddress');
+ my $docAvailability = XML::Twig::Elt->new('availability');
+ my $docPubDate = XML::Twig::Elt->new('pubDate');
+
+ # <sourceDesc>/<biblStruct>/<monogr> with <h.title> and <imprint>
+ my $docSourceDesc = XML::Twig::Elt->new('sourceDesc');
+ my $docBiblStruct = XML::Twig::Elt->new('biblStruct');
+ my $docMonogr = XML::Twig::Elt->new('monogr');
+ my $docHTitle = XML::Twig::Elt->new('h.title');
+ my $docImprint = XML::Twig::Elt->new('imprint');
+
+ $docSigle -> paste("first_child", $docTitleStmt);
+ $dtitle -> paste("last_child", $docTitleStmt);
+ $docTitleStmt -> paste("last_child", $docFileDesc);
+ $docFileDesc -> paste("last_child", $idsDocHeader);
+ $docPublicationStmt -> paste("last_child", $docFileDesc);
+ $docDistributor -> paste("last_child", $docPublicationStmt);
+ $docPubAddress -> paste("last_child", $docPublicationStmt);
+
+ $docAvailability -> paste("last_child", $docPublicationStmt);
+ $docPubDate -> paste("last_child", $docPublicationStmt);
+
+ $docSourceDesc -> paste("last_child", $docFileDesc);
+ $docBiblStruct -> paste("last_child", $docSourceDesc);
+ $docMonogr -> paste("last_child", $docBiblStruct);
+ $docHTitle -> paste("last_child", $docMonogr);
+ $docImprint -> paste("last_child", $docMonogr);
+
$idsDocHeader -> paste("last_child", $idsDoc);
# ToDo set dummy dtitle and docSigle
-}
-
+ # preset sigle and title for January; createIdsDoc prints the headers of later months
+ $docSigle->set_text($corpussigles{$fnsource} . "/JAN");
+ $dtitle ->set_text($srcfullnames{$fnsource} . ", January " . $fnyear);
+}
#----------------------------------
@@ -214,35 +305,6 @@
open(my $IN, "< $ARGV[0]") || die("$0: cannot open file for reading: $ARGV[0]"); # open input file and initialise filehandel, actually does not seem to be needed
# as parsefile() (s.b.) is applied to the filename
-#-------------------------------------------------------------------------------------------
-# read source metadata file (prepared manually => ultimately read the info from CMDI File?)
-#-------------------------------------------------------------------------------------------
-
-open(my $SOURCES, "< $sourcescsvfile") || die("$0: cannot open file for reading: $sourcescsvfile");
-while(my $fline = <$SOURCES>){
- chomp($fline);
-
- if ($fline=~/^\#/ || $fline=~/^\s*$/ || $fline =~ "TEICORPUSID"){next;} # skip line if empty line or comment line or first line
- my @flarray = split(/\s*\t+\s*/, $fline); # split each line into array
-
- # set full titles ($flarray[1]) as keys:
- $corpussigles{$flarray[1]} = $flarray[0];
- $srcfullnames{$flarray[1]} = $flarray[1];
- $srcpubplaces{$flarray[1]} = $flarray[6];
- $srcpublishers{$flarray[1]} = $flarray[7];
- $srctexttypes{$flarray[1]} = $flarray[4];
- $srctextlangs{$flarray[1]} = $flarray[5];
-
- # also set simple titles ($flarray[2]) as keys:
- $corpussigles{$flarray[2]} = $flarray[0];
- $srcfullnames{$flarray[2]} = $flarray[1];
- $srcpubplaces{$flarray[2]} = $flarray[6];
- $srcpublishers{$flarray[2]} = $flarray[7];
- $srctexttypes{$flarray[2]} = $flarray[4];
- $srctextlangs{$flarray[2]} = $flarray[5];
-}
-close($SOURCES);
-
#####################
@@ -301,7 +363,13 @@
sub root {
my ($twig, $root, $corpusHeader) =@_;
- $root->set_gi('teiCorpus');
+ if($TEIFORMAT eq "TEI"){
+ $root->set_gi('teiCorpus');
+ }
+ else {
+ $root->set_gi('idsCorpus');
+ }
+
$root->set_att("xmlns", 'http://www.tei-c.org/ns/1.0');
&insertCorpusHeader($root, $corpusHeader);
@@ -312,26 +380,32 @@
sub insertCorpusHeader{
my ($root, $corpusHeader) =@_;
+ # Fills the corpus header $corpusHeader with the metadata of the current source
+ # (looked up via the file-level $fnsource / $fnyear) and pastes it as first child
+ # of $root. For I5 the document element $idsDoc is pasted right after the header.
+ #   $root         - root element of the output document
+ #   $corpusHeader - header element read from the corpus header skeleton
- #-------------------------------------------------------------
- # take fnsource and year from the current xml input filename
- #-------------------------------------------------------------
-
- my @array = split(/\//, $ARGV[0]);
- my $l = scalar(@array);
- my $fnsource = $array[$l-1];
- $fnsource =~ s/([0-9][0-9][0-9][0-9])\.xml$//;
-
- my $year = $1; # $1 containts substring in first bracket in regex above
-
#-----------------------
# set corpus header
#-----------------------
- &set_title( $corpusHeader, $srcfullnames{$fnsource}, $year, $kielipankkiCorpus);
- &set_sourceDesc($corpusHeader, $srcfullnames{$fnsource}, $year, $kielipankkiCorpus);
+ &set_title($corpusHeader, $srcfullnames{$fnsource}, $fnyear, $kielipankkiCorpus);
+
+ if($TEIFORMAT eq "TEI"){
+ &set_sourceDesc($corpusHeader, $srcfullnames{$fnsource}, $fnyear, $kielipankkiCorpus);
+ }
+ elsif ($TEIFORMAT eq "I5"){
+ $corpusHeader->get_xpath("fileDesc/titleStmt/korpusSigle",0) ->set_text($corpussigles{$fnsource});
+ # current four-digit year; "localtime[5]" would hand the array ref [5] to localtime
+ # instead of picking the year field, so ask the Time::Piece object directly
+ $corpusHeader->get_xpath("fileDesc/publicationStmt/pubDate",0)->set_text(localtime->year);
+ $corpusHeader->get_xpath("profileDesc/langUsage/language",0) ->set_text($expandLang{$srctextlangs{$fnsource}});
+ $corpusHeader->get_xpath("profileDesc/langUsage/language",0) ->set_att('id', $srctextlangs{$fnsource});
+ $corpusHeader->get_xpath('encodingDesc/editorialDecl/transduction[@n="2"]',0) ->set_text("I5 version by EuReCo using vrt2tei.pl " . localtime->ymd('-'));
+ &set_sourceDescI5($corpusHeader);
+ }
+ else{
+ # terminate the message with a newline so it does not run into later STDERR output
+ print STDERR "TEIFORMAT is $TEIFORMAT; must be TEI or I5\n";
+ }
$corpusHeader->paste("first_child", $root);
- $idsDoc ->paste("after", $corpusHeader);
+
+ # the idsDoc partition exists in I5 only
+ if($TEIFORMAT eq "I5"){
+ $idsDoc->paste("after", $corpusHeader);
+ }
}
@@ -353,6 +427,10 @@
my $textattsref = $text->atts(); # $textattsref is now a reference to a hash and should be used with '->'
+ &createIdsDoc($textattsref);
+
+
+
# &createTextHeader returns the $textID:
my $textID = &createTextHeader($text, $textattsref, $textHeader);
@@ -569,11 +647,11 @@
$analytic->first_child($title) ->set_text($LABEL . ", Text #" . $textcounter); # Case KLK; PAGENO scheint meist "None" zu sein
- $analytic->get_xpath('./idno[@type="PAGEID"]', 0) ->set_text($PAGEID);
- $analytic->get_xpath('./idno[@type="BINDINGID"]', 0) ->set_text($BID);
- $analytic->get_xpath('./idno[@type="ID"]', 0) ->set_text($ID);
- $analytic->get_xpath('./idno[@type="KIELIPANKKI_METAFILENAME"]', 0) ->set_text($METAFILENAME);
- $analytic->get_xpath('./idno[@type="KIELIPANKKI_ORIGFILENAME"]', 0) ->set_text($ORIGFILENAME);
+ #$analytic->get_xpath('./idno[@type="PAGEID"]', 0) ->set_text($PAGEID);
+ #$analytic->get_xpath('./idno[@type="BINDINGID"]', 0) ->set_text($BID);
+ #$analytic->get_xpath('./idno[@type="ID"]', 0) ->set_text($ID);
+ #$analytic->get_xpath('./idno[@type="KIELIPANKKI_METAFILENAME"]', 0) ->set_text($METAFILENAME);
+ #$analytic->get_xpath('./idno[@type="KIELIPANKKI_ORIGFILENAME"]', 0) ->set_text($ORIGFILENAME);
if($TEIFORMAT eq "TEI"){
$analytic->first_child('textLang') ->set_text($LANGUAGE);
}
@@ -602,16 +680,16 @@
$monogr->get_xpath('./imprint/' . $date . '[@type="day"]', 0) ->set_text($datearray[2]);
$monogr->first_child("imprint")->first_child("pubPlace") ->set_text($srcpubplaces{$PUBLTITLE}); # imprint is needed for tei validity
$monogr->first_child("imprint")->first_child("publisher") ->set_text($srcpublishers{$PUBLTITLE}); # imprint is needed for tei validity
- $monogr->get_xpath('./biblScope[@unit="ISSUETITLE"]', 0) ->set_text($ISSUETITLE);
- $monogr->get_xpath('./biblScope[@unit="ISSUENO"]', 0) ->set_text($ISSUENO);
- $monogr->get_xpath('./biblScope[@unit="ISSUEDATE"]', 0) ->set_text($ISSUEDATE);
- $monogr->get_xpath('./biblScope[@unit="pp"]', 0) ->set_text($PAGENO); # Achtung - PAGENO scheint meist "None" zu sein
+ #$monogr->get_xpath('./biblScope[@unit="ISSUETITLE"]', 0) ->set_text($ISSUETITLE);
+ #$monogr->get_xpath('./biblScope[@unit="ISSUENO"]', 0) ->set_text($ISSUENO);
+ #$monogr->get_xpath('./biblScope[@unit="ISSUEDATE"]', 0) ->set_text($ISSUEDATE);
+ #$monogr->get_xpath('./biblScope[@unit="pp"]', 0) ->set_text($PAGENO); # Achtung - PAGENO scheint meist "None" zu sein
my $dateNice = $datearray[2] . "." . $datearray[1] . "." . $datearray[0];
my $dateBackwards = $datearray[0] . "." . $datearray[1] . "." . $datearray[2];
if($TEIFORMAT eq "I5"){
- my $refCompleteText = $textSigle . " " . $PUBLTITLE . " no. " . $ISSUENO . ", [" . $mapping{$PUBLTYPE} . "], " . $dateNice;
+ my $refCompleteText = $textSigle . " " . $PUBLTITLE . " no. " . $ISSUENO . ", [" . $PUBLTYPE . "], " . $dateNice;
my $refShortText = $textSigle . " " . $PUBLTITLE . ", " . $dateNice;
$textHeader->get_xpath('./fileDesc/sourceDesc/reference[@type="complete"]', 0) -> set_text($refCompleteText);
$textHeader->get_xpath('./fileDesc/sourceDesc/reference[@type="short"]' , 0) -> set_text($refShortText);
@@ -707,7 +785,9 @@
my @tags = split(/\t/, $line);
# set content of <w> i.e. the token
- $w_element->set_text($tags[0]);
+ # $w_element->set_text($tags[0]);
+ my $random_w = Data::Random::String->create_random_string(length=>length($tags[0]), contains=>'alpha');
+ $w_element->set_text($random_w);
# vrt positional-attributes in corpus KLK:
# USE [0] word
@@ -731,19 +811,29 @@
# $w_element->set_att("id", "w_" . $namearray[4] . $sentence->att("xml:id") . "_" . $tags[1]);
# so zusammengebaute ID ist auch nicht eindeutig...
$w_element->del_att("id");
- $w_element->set_att("lemma", $tags[2]);
+ # $w_element->set_att("lemma", $tags[2]);
+ $w_element->set_att("lemma", $random_w);
+
# $w_element->set_att("norm", $tags[3]); # tag abuse of @norm
$w_element->set_att("pos", $tags[4]);
$w_element->set_att("msd", $tags[5]);
-#TMP $w_element->set_att("head", $tags[6]);
-#TMP $w_element->set_att("deprel", $tags[7]);
+ if($TEIFORMAT eq "I5"){ # remove condition when part of the official TEI
+ $w_element->set_att("head", $tags[6]);
+ $w_element->set_att("deprel", $tags[7]);
+ }
}
sub set_title{
my ($corpusHeader, $source, $year, $kielipankkiCorpus) = @_;
+ my $titleElement = "title";
+
+ if($TEIFORMAT eq "I5"){
+ $titleElement = "c.title";
+ }
+
my $cTitleString = $source . " " . $year . ", from ". $kielipankkiCorpus . " for EuReCo";
#<teiHeader>
@@ -755,7 +845,7 @@
# </fileDesc>
#</teiHeader>
- my $cTitleNode = $corpusHeader->first_child("fileDesc")->first_child("titleStmt")->first_child("title");
+ my $cTitleNode = $corpusHeader->first_child("fileDesc")->first_child("titleStmt")->first_child($titleElement);
$cTitleNode->set_text($cTitleString);
@@ -781,6 +871,103 @@
$cBiblNode->set_text($cBiblString);
}
+sub set_sourceDescI5{
+  my ($corpusHeader) = @_;
+
+  # Writes the bibliographic data of the current source (file-level $fnsource,
+  # $fnyear) into the <sourceDesc> of the I5 corpus header $corpusHeader.
+  # The header skeleton must already contain these elements:
+  #
+  #<idsHeader>
+  #  <fileDesc>
+  #    <!-- ... -->
+  #    <sourceDesc>
+  #      <biblStruct>
+  #        <monogr>
+  #          <h.title>[$PUBLTITLE]</h.title>
+  #          <imprint>
+  #            <publisher>[$PUBLISHER]</publisher>
+  #            <pubPlace>[$PUBLPLACE]</pubPlace>
+  #          </imprint>
+  #        </monogr>
+  #      </biblStruct>
+  #      <reference>[$CSIGLE] [$PUBLTITLE]; [$PUBLPLACE]: [$PUBLISHER], [$YEAR]</reference>
+  #    </sourceDesc>
+  #    <!-- ... -->
+  #  </fileDesc>
+  #</idsHeader>
+
+  # the full title is the key for all further look-ups
+  my $PUBLTITLE = $srcfullnames{$fnsource};
+  my ($PUBLPLACE, $PUBLISHER, $CSIGLE) =
+    ($srcpubplaces{$PUBLTITLE}, $srcpublishers{$PUBLTITLE}, $corpussigles{$PUBLTITLE});
+  my $YEAR = $fnyear;
+
+  # walk down to <sourceDesc> once and reuse the node for <monogr> and <reference>
+  my $cSourceDesc = $corpusHeader->first_child("fileDesc")->first_child("sourceDesc");
+  my $cMonogr     = $cSourceDesc->first_child("biblStruct")->first_child("monogr");
+
+  $cMonogr->first_child("h.title")->set_text($PUBLTITLE);
+
+  my $cImprint = $cMonogr->first_child("imprint");
+  $cImprint->first_child("publisher")->set_text($PUBLISHER);
+  $cImprint->first_child("pubPlace") ->set_text($PUBLPLACE);
+
+  my $cReference = "${CSIGLE} ${PUBLTITLE}; ${PUBLPLACE}: ${PUBLISHER}, ${YEAR}";
+  $cSourceDesc->first_child("reference")->set_text($cReference);
+}
+
+
+
+sub createIdsDoc{
+  # Called once per text with a reference to the text's attribute hash.
+  # Whenever a text belongs to a later month than any text seen so far, the month
+  # is recorded in the file-level $LASTMONTH and - for I5 output, and for every
+  # month after January - an <idsDoc> holding only the document header of that
+  # month is printed to STDOUT.
+  #   $textattsref - hash ref; the keys 'date' (YYYY-MM-DD) and 'publ_title' are read
+  my ($textattsref) = @_;
+  my $DATE = $textattsref->{'date'};
+  my $PUBLTITLE = $textattsref->{'publ_title'};
+  my $CSIGLE = $corpussigles{$PUBLTITLE};
+
+  my @datearray = split("-", $DATE);
+  my $MONTH = $datearray[1];
+  my $YEAR = $datearray[0];
+
+  my $DOCID = $months{$MONTH};
+  my $MONTHNAME = $monthnames{$MONTH};
+
+  my $monthno = $MONTH + 0;
+
+  # Compare with ">" rather than "== $LASTMONTH + 1": with the latter a single
+  # month without texts would stop the counter for good and no later month would
+  # ever get its header.
+  if($monthno > $LASTMONTH){
+    # Only I5 has idsDoc partitions (for other formats nothing is printed, not even
+    # an empty line). No header is printed for month 1.
+    if($monthno > 1 && $TEIFORMAT eq "I5"){
+      my $idsDocString = "
+<idsDoc>
+<idsHeader type=\"document\" pattern=\"text\" version=\"1.1\" TEIform=\"teiHeader\">
+ <fileDesc>
+ <titleStmt>
+ <dokumentSigle>$CSIGLE/$DOCID</dokumentSigle>
+ <d.title>$PUBLTITLE, $MONTHNAME $YEAR</d.title>
+ </titleStmt>
+ <publicationStmt>
+ <distributor/>
+ <pubAddress/>
+ <availability region=\"world\">$kielipankkiLicense</availability>
+ <pubDate/>
+ </publicationStmt>
+ <sourceDesc>
+ <biblStruct>
+ <monogr>
+ <h.title/>
+ <imprint/>
+ </monogr>
+ </biblStruct>
+ </sourceDesc>
+ </fileDesc>
+</idsHeader>
+</idsDoc>\n";
+      printf("%s\n", $idsDocString);
+    }
+    $LASTMONTH = $monthno;
+  }
+}
+
#################