dtd handling and some renamings
diff --git a/vrt2tei.pl b/vrt2tei.pl
index 63dd6fd..42abd2b 100755
--- a/vrt2tei.pl
+++ b/vrt2tei.pl
@@ -16,7 +16,6 @@
#
#
# TODO:
-# 1 insert dtd spec, or ref to TEI
# 3a remove the vrt positional attribute comment line / all comment lines
# 3b add @head and @deprel to I5 sowie auch @msd
@@ -94,12 +93,12 @@
#my $TEIFORMAT = "TEI";
my $TEIFORMAT = "I5";
-
+my $DTDDECL = 'idsCorpus PUBLIC "-//IDS//DTD IDS-I5 1.0//EN" "http://corpora.ids-mannheim.de/I5/DTD/i5.dtd"'; # for I5
my $textcounter = 0;
my $LASTMONTH = 0;
-our %corpussigles = ();
+our %corpusids = ();
our %srcfullnames = ();
our %srcpubplaces = ();
our %srcpublishers = ();
@@ -124,10 +123,15 @@
"12" => 1,
);
+# global variables pertaining to the original corpus of *all* newspapers:
+my $kielipankkiCorpus = "klk-fi-v2-vrt";
+my $kielipankkiLicense = "CLARIN-RES";
+my $CountryKey = "FI";
-
+# Table with metadata about the different sources (newspapers)
my $sourcescsvfile = "sources_klk_fi_v2_2021_4eureco.csv";
+# corpusheader and textheader skeletons
my $corpheaderfile = "teiCorpusHeaderSkeleton.tei.xml";
my $textheaderfile = "teiTextHeaderSkeleton.tei.xml";
if($TEIFORMAT eq "I5"){
@@ -138,11 +142,6 @@
my $twig="";
-# global variables pertaining to the original corpus :
-my $kielipankkiCorpus = "klk-fi-v2-vrt";
-my $kielipankkiLicense = "CLARIN-RES";
-
-
# variables $fnsource and $fnyear taken from the filename
my @array = split(/\//, $ARGV[0]);
my $l = scalar(@array);
@@ -150,6 +149,7 @@
$fnsource =~ s/([0-9][0-9][0-9][0-9])\.xml$//;
my $fnyear = $1; # $1 contains substring in first bracket in regex above
+my $fnYY = substr($fnyear, 2, 2);
# months
@@ -202,7 +202,8 @@
my @flarray = split(/\s*\t+\s*/, $fline); # split each line into array
# set full titles ($flarray[1]) as keys:
- $corpussigles{$flarray[1]} = $flarray[0];
+ # (ToDo: these hashes could probably be conflated into an array of hashes or so)
+ $corpusids{$flarray[1]} = $flarray[0];
$srcfullnames{$flarray[1]} = $flarray[1];
$srcpubplaces{$flarray[1]} = $flarray[6];
$srcpublishers{$flarray[1]} = $flarray[7];
@@ -210,7 +211,7 @@
$srctextlangs{$flarray[1]} = $flarray[5];
# also set simple titles ($flarray[2]) as keys:
- $corpussigles{$flarray[2]} = $flarray[0];
+ $corpusids{$flarray[2]} = $flarray[0];
$srcfullnames{$flarray[2]} = $flarray[1];
$srcpubplaces{$flarray[2]} = $flarray[6];
$srcpublishers{$flarray[2]} = $flarray[7];
@@ -223,7 +224,7 @@
$expandLang{"sv"} = "Swedish";
#------------------------------------------------------------------
-# read corpusHeaderSkeleton document and get header out of it
+# read corpusHeaderSkeleton document and start a twig for it
#------------------------------------------------------------------
my $teiCorpusHeaderDocTwig = new XML::Twig(
@@ -238,7 +239,7 @@
#------------------------------------------------------------------
-# read textHeaderSkeleton document adn get header out of it
+# read textHeaderSkeleton document and start a twig for it
#------------------------------------------------------------------
my $teiTextHeaderDocTwig = new XML::Twig(
@@ -251,6 +252,12 @@
my $textHeader = $teiTextHeaderDocTwig->root; # getting the teiHeader for corpus out of the teiTextHeaderSkeleton document
+#---------------------------------------------------------
+# define a subtree for idsDoc
+# for the time being it will only be used for the first
+# idsDoc header, to be inserted in the root handler
+#---------------------------------------------------------
+
my $idsDoc = XML::Twig::Elt->new('idsDoc');
my $idsDocHeader = XML::Twig::Elt->new('idsHeader');
@@ -262,7 +269,7 @@
my $docPublicationStmt = XML::Twig::Elt->new('publicationStmt');
my $docDistributor = XML::Twig::Elt->new('distributor');
- my $docPubAddress = XML::Twig::Elt->new('pubAdress');
+ my $docPubAddress = XML::Twig::Elt->new('pubAddress');
my $docAvailability = XML::Twig::Elt->new('availability');
my $docPubDate = XML::Twig::Elt->new('pubDate');
@@ -271,6 +278,16 @@
my $docMonogr = XML::Twig::Elt->new('monogr');
my $docHTitle = XML::Twig::Elt->new('h.title');
my $docImprint = XML::Twig::Elt->new('imprint');
+
+ $idsDoc -> set_att('version', "1.0");
+ $idsDoc -> set_att('TEIform', "TEI.2");
+
+ $idsDocHeader -> set_att('version', "1.1");
+ $idsDocHeader -> set_att('type', "document");
+ $idsDocHeader -> set_att('pattern', "text");
+ $idsDocHeader -> set_att('TEIform', "teiHeader");
+
+
$docSigle -> paste("first_child", $docTitleStmt);
$dtitle -> paste("last_child", $docTitleStmt);
@@ -291,15 +308,14 @@
$idsDocHeader -> paste("last_child", $idsDoc);
- # ToDo set dummy dtitle and docSigle
-
- $docSigle->set_text($corpussigles{$fnsource} . "/JAN");
+ $docSigle->set_text($corpusids{$fnsource} . $fnYY . "/JAN");
$dtitle ->set_text($srcfullnames{$fnsource} . ", January " . $fnyear);
}
+
#----------------------------------
-# read input VRT-XML document
+# read the input VRT-XML document
#----------------------------------
open(my $IN, "< $ARGV[0]") || die("$0: cannot open file for reading: $ARGV[0]"); # open input file and initialise filehandel, actually does not seem to be needed
@@ -316,7 +332,6 @@
#-------------------------------------------------------------------------------------------------------------
-
$twig = new XML::Twig(
keep_spaces => 1, # dadurch auch whitespaces an ehemeligen elementgrenzen im output
keep_atts_order => 1, # requires Tie::IxHash
@@ -329,17 +344,13 @@
# text => \&text
text => sub{text(@_, $textHeader->copy)} # copy must be because textHeader will be flushed with $twig in the <text> handler;
},
- # dtd_handlers => { # ToDo for I5
- # \&set_dtd;
- # }
-
+
output_encoding => $encoding,
);
$twig->parsefile($ARGV[0]);
-
###########
# END MAIN
###########
@@ -351,27 +362,21 @@
# S U B R O U T I N E S
##############################
-# sub set_dtd [
-# my $twig, $dtd = @_;
-# my $internal = qq|\nPUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"\n"DTD/xhtml1-strict.dtd"|;
-#
-# $twig->twig_doctype('html', undef, undef, $internal);
-# }
-
-
-
sub root {
my ($twig, $root, $corpusHeader) =@_;
- if($TEIFORMAT eq "TEI"){
- $root->set_gi('teiCorpus');
+ if($TEIFORMAT eq "I5") {
+ $twig->set_doctype($DTDDECL); # the doctype could probably be set anywhere to the twig
+ $root->set_gi('idsCorpus');
+ $root->set_att('version', "1.0");
+ $root->set_att('TEIform', "teiCorpus.2");
+
}
else {
- $root->set_gi('idsCorpus');
+ $root->set_gi('teiCorpus');
+ $root->set_att("xmlns", 'http://www.tei-c.org/ns/1.0');
}
- $root->set_att("xmlns", 'http://www.tei-c.org/ns/1.0');
-
&insertCorpusHeader($root, $corpusHeader);
}
@@ -380,6 +385,8 @@
sub insertCorpusHeader{
my ($root, $corpusHeader) =@_;
+ my $ident = "ident";
+
#-----------------------
# set corpus header
#-----------------------
@@ -389,19 +396,20 @@
if($TEIFORMAT eq "TEI"){
&set_sourceDesc($corpusHeader, $srcfullnames{$fnsource}, $fnyear, $kielipankkiCorpus);
}
- elsif ($TEIFORMAT eq "I5"){
- $corpusHeader->get_xpath("fileDesc/titleStmt/korpusSigle",0) ->set_text($corpussigles{$fnsource});
- $corpusHeader->get_xpath("fileDesc/publicationStmt/pubDate",0)->set_text(localtime[5] + 1900);
- $corpusHeader->get_xpath("profileDesc/langUsage/language",0) ->set_text($expandLang{$srctextlangs{$fnsource}});
- $corpusHeader->get_xpath("profileDesc/langUsage/language",0) ->set_att('id', $srctextlangs{$fnsource});
+ elsif($TEIFORMAT eq "I5"){
+ $corpusHeader->get_xpath("fileDesc/titleStmt/korpusSigle",0) ->set_text($corpusids{$fnsource} . $fnYY);
+ $corpusHeader->get_xpath("fileDesc/publicationStmt/pubDate",0)->set_text((localtime)[5] + 1900);
$corpusHeader->get_xpath('encodingDesc/editorialDecl/transduction[@n="2"]',0) ->set_text("I5 version by EuReCo using vrt2tei.pl " . localtime->ymd('-'));
&set_sourceDescI5($corpusHeader);
+ $ident="id";
}
else{
print STDERR "TEIFORMAT is $TEIFORMAT; must be TEI or I5";
}
$corpusHeader->paste("first_child", $root);
+ $corpusHeader->get_xpath('./profileDesc/langUsage/language', 0) ->set_att($ident, $srctextlangs{$fnsource});
+ $corpusHeader->get_xpath('./profileDesc/langUsage/language', 0) ->set_text($expandLang{$srctextlangs{$fnsource}});
if($TEIFORMAT eq "I5"){
$idsDoc->paste("after", $corpusHeader);
@@ -427,7 +435,7 @@
my $textattsref = $text->atts(); # $textattsref is now a reference to a hash and should be used with '->'
- &createIdsDoc($textattsref);
+ &createIdsDoc($textattsref); # this creation of idsDoc will only be called for the 2nd idsDoc (i.e. february) or higher
@@ -583,12 +591,13 @@
# SUK21.JAN.00001
- my $corpusID = "SUK"; # ToDo read Table with Source metadata
my $yy = substr($datearray[0], 2, 2); # substr EXPR,OFFSET,LENGTH
my $mm = $datearray[1]; # substr EXPR,OFFSET,LENGTH
my $MMM = $months{$mm};
- my $textID = $corpusID . $yy . "_" . $MMM . "." . sprintf("%05d", $doccounter{$mm}++);
+ my $CSIGLE = $corpusids{$fnsource} . $yy;
+
+ my $textID = $CSIGLE . "_" . $MMM . "." . sprintf("%05d", $doccounter{$mm}++);
my $textSigle = $textID;
@@ -605,7 +614,7 @@
# <titleStmt>
# <title>[$LABEL, page $PAGENO]</title>
- $textHeader->first_child("fileDesc") -> set_att('n', "EuReCo-". $kielipankkiCorpus . $ID);
+ $textHeader->first_child("fileDesc") -> set_att('n', "EuReCo-". $kielipankkiCorpus . "-" . $ID);
#-----------------
@@ -623,7 +632,7 @@
$titleStmt->first_child($title)->set_text($LABEL . ", Text #" . $textcounter);
- # Case KLK; PAGENO scheint meist "None" zu sein
+ # Case KLK: PAGENO seems to be "None" most of the time
#-----------------------------------------------
# <fileDesc>
@@ -642,6 +651,7 @@
# <idno type="KIELIPANKKI_ORIGFILENAME">$ORIGFILENAME</idno>
# <textLang>$LANGUAGE</textLang>
+
my $analytic = $textHeader->get_xpath("./fileDesc/sourceDesc/biblStruct/analytic", 0);
if($TEIFORMAT eq "I5"){$title="h.title"};
@@ -679,6 +689,7 @@
$monogr->get_xpath('./imprint/' . $date . '[@type="month"]', 0) ->set_text($datearray[1]);
$monogr->get_xpath('./imprint/' . $date . '[@type="day"]', 0) ->set_text($datearray[2]);
$monogr->first_child("imprint")->first_child("pubPlace") ->set_text($srcpubplaces{$PUBLTITLE}); # imprint is needed for tei validity
+ $monogr->first_child("imprint")->first_child("pubPlace") ->set_att('key', $CountryKey);
$monogr->first_child("imprint")->first_child("publisher") ->set_text($srcpublishers{$PUBLTITLE}); # imprint is needed for tei validity
#$monogr->get_xpath('./biblScope[@unit="ISSUETITLE"]', 0) ->set_text($ISSUETITLE);
#$monogr->get_xpath('./biblScope[@unit="ISSUENO"]', 0) ->set_text($ISSUENO);
@@ -719,9 +730,10 @@
if($TEIFORMAT eq "I5"){
$textHeader->get_xpath('./profileDesc/creation/creatDate', 0) ->set_text($dateBackwards);
}
-
- $textHeader->get_xpath('./profileDesc/langUsage/language', 0) ->set_att('ident', $LANGUAGE);
- $textHeader->get_xpath('./profileDesc/langUsage/language', 0) ->set_att('usage', $SUMLANG);
+ if($TEIFORMAT eq "TEI"){
+ $textHeader->get_xpath('./profileDesc/langUsage/language', 0) ->set_att('ident', $LANGUAGE);
+ $textHeader->get_xpath('./profileDesc/langUsage/language', 0) ->set_att('usage', $SUMLANG);
+ }
# in @usage muss eigt. ein integer; am besten inhalt von SUMLANG aufdroeseln und mehrere <language> machen
$textHeader->get_xpath('./profileDesc/textClass/classCode[@scheme="kielipankki_klk"]', 0) ->set_text($PUBLTYPE);
@@ -785,11 +797,12 @@
my @tags = split(/\t/, $line);
# set content of <w> i.e. the token
- # $w_element->set_text($tags[0]);
- my $random_w = Data::Random::String->create_random_string(length=>length($tags[0]), contains=>'alpha');
+ my $random_w = "";
+ # $random_w = Data::Random::String->create_random_string(length=>length($tags[0]), contains=>'alpha');
+ $random_w = $tags[0];
$w_element->set_text($random_w);
- # vrt positional-attributes in corpus KLK:
+ # vrt word and positional-attributes in corpus KLK:
# USE [0] word
# USE [1] ref (id for reference of dephead)
# USE [2] lemma
@@ -811,8 +824,8 @@
# $w_element->set_att("id", "w_" . $namearray[4] . $sentence->att("xml:id") . "_" . $tags[1]);
# so zusammengebaute ID ist auch nicht eindeutig...
$w_element->del_att("id");
- # $w_element->set_att("lemma", $tags[2]);
- $w_element->set_att("lemma", $random_w);
+ $w_element->set_att("lemma", $tags[2]);
+ #$w_element->set_att("lemma", $random_w);
# $w_element->set_att("norm", $tags[3]); # tag abuse of @norm
$w_element->set_att("pos", $tags[4]);
@@ -877,11 +890,13 @@
my $PUBLTITLE = $srcfullnames{$fnsource};
my $PUBLPLACE = $srcpubplaces{$PUBLTITLE};
my $PUBLISHER = $srcpublishers{$PUBLTITLE};
- my $CSIGLE = $corpussigles{$PUBLTITLE};
-
- my $YEAR = $fnyear;
+ my $YEAR = $fnyear;
+ my $YY = substr($fnyear, 2, 2);
+ my $CSIGLE = $corpusids{$PUBLTITLE} . $YY;
+
+
#<idsHeader>
# <fileDesc>
# <!-- ... -->
@@ -891,7 +906,7 @@
# <h.title type="main">[$PUBLTITLE], [$YEAR]</h.title>
# <imprint>
# <publisher>[$PUBLISHER]</publisher>
- # <pubPlace key="DE">[$PUBPLACE]</pubPlace>
+ # <pubPlace key="[$CountryKey]">[$PUBLPLACE]</pubPlace>
# </imprint>
# </monogr>
# </biblStruct>
@@ -905,6 +920,7 @@
$cMonogr->first_child("h.title")->set_text($PUBLTITLE);
$cMonogr->first_child("imprint")->first_child("publisher")->set_text($PUBLISHER);
$cMonogr->first_child("imprint")->first_child("pubPlace") ->set_text($PUBLPLACE);
+ $cMonogr->first_child("imprint")->first_child("pubPlace") ->set_att('key', $CountryKey);
$corpusHeader->first_child("fileDesc")->first_child("sourceDesc")->first_child("reference")->set_text($CSIGLE . " " . $PUBLTITLE . "; " . $PUBLPLACE . ": " . $PUBLISHER . ", " . $YEAR);
@@ -912,29 +928,25 @@
-sub createIdsDoc{
+sub createIdsDoc{ # will only be called for the second idsDoc (i.e. for february) and higher
my ($textattsref) = @_;
my $DATE = $textattsref->{'date'};
my $PUBLTITLE = $textattsref->{'publ_title'};
- my $CSIGLE = $corpussigles{$PUBLTITLE};
- #print STDERR "LASTMONTH: " . $LASTMONTH . "\n";
- #print STDERR "DATE: " . $DATE . "\n";
-
my @datearray = split("-", $DATE);
my $MONTH = $datearray[1];
my $YEAR = $datearray[0];
+ my $YY = substr($YEAR, 2, 2);
+ my $CSIGLE = $corpusids{$PUBLTITLE} . $YY;
+
my $DOCID = $months{$MONTH};
my $MONTHNAME = $monthnames{$MONTH};
-
- #print STDERR "MONTH: " . $MONTH . "\n\n";
-
my $idsDocString="";
if($TEIFORMAT eq "I5"){
$idsDocString = "
-<idsDoc>
+<idsDoc version=\"1.0\" TEIform=\"TEI.2\">
<idsHeader type=\"document\" pattern=\"text\" version=\"1.1\" TEIform=\"teiHeader\">
<fileDesc>
<titleStmt>
@@ -965,7 +977,6 @@
}
$LASTMONTH++;
}
- # printf(STDERR "\n\nNEW MONTH %s\n\n", $months{$MONTH});
}