switches for I5 idsHeader; dummy idsDoc
diff --git a/vrt2tei.pl b/vrt2tei.pl
index 42bb7d9..6c280ba 100755
--- a/vrt2tei.pl
+++ b/vrt2tei.pl
@@ -48,8 +48,9 @@
use strict;
use warnings;
-#use diagnostics;
+use diagnostics;
+use Getopt::Std;
use XML::Twig;
use XML::Generator ':pretty'; # apparently no effect when using flush();
@@ -76,12 +77,25 @@
if ($ARGV[1]) {&usage_message()}; # max arg0, the input file
+#--------------------------
+# get options / auxiliary files
+#--------------------------
+
+
+
+
+
####################
# GLOBAL VARIABLES
####################
-my $encoding = "UTF-8";
-#my $encoding = "iso-8859-1"; # dieses $encoding ist NUR fuer das output s.u. twig funktion
+my $encoding = "UTF-8"; # this $encoding is ONLY for the output, see the twig function below
+
+#my $TEIFORMAT = "TEI";
+my $TEIFORMAT = "I5";
+
+
+
my $textcounter = 0;
our %corpussigles = ();
@@ -108,12 +122,16 @@
my $sourcescsvfile = "sources_klk_fi_v2_2021_4eureco.csv";
+
my $corpheaderfile = "teiCorpusHeaderSkeleton.tei.xml";
my $textheaderfile = "teiTextHeaderSkeleton.tei.xml";
+if($TEIFORMAT eq "I5"){
+ # $corpheaderfile = "i5CorpusHeaderSkeleton.i5.xml";
+ $textheaderfile = "i5TextHeaderSkeleton.i5.xml";
+}
my $twig="";
-my $teiCorpusHeaderDoc="";
# global variables pertaining to the original corpus :
my $kielipankkiCorpus = "klk-fi-v2-vrt";
@@ -133,6 +151,12 @@
"12" => "DEC",
);
+my %mapping = ();
+$mapping{"aikakausi"} = "Zeitschrift";
+$mapping{"sanomalehti"} = "Zeitung";
+
+
+
#------------------------------------------------------------------
# read corpusHeaderSkeleton document and get header out of it
#------------------------------------------------------------------
@@ -162,6 +186,27 @@
my $textHeader = $teiTextHeaderDocTwig->root; # getting the teiHeader for corpus out of the teiTextHeaderSkeleton document
+my $idsDoc = XML::Twig::Elt->new('idsDoc');
+if($TEIFORMAT eq "I5"){
+
+ my $idsDocHeader = XML::Twig::Elt->new('idsHeader');
+ my $docFileDesc = XML::Twig::Elt->new('fileDesc');
+ my $docTitleStmt = XML::Twig::Elt->new('titleStmt');
+ my $dtitle = XML::Twig::Elt->new('d.title');
+ my $docSigle = XML::Twig::Elt->new('dokumentSigle');
+
+ $docSigle -> paste("first_child", $docTitleStmt);
+ $dtitle -> paste("last_child", $docTitleStmt);
+ $docTitleStmt -> paste("last_child", $docFileDesc);
+ $docFileDesc -> paste("last_child", $idsDocHeader);
+ $idsDocHeader -> paste("last_child", $idsDoc);
+
+ # ToDo set dummy dtitle and docSigle
+}
+
+
+
+
#----------------------------------
# read input VRT-XML document
#----------------------------------
@@ -180,6 +225,15 @@
if ($fline=~/^\#/ || $fline=~/^\s*$/ || $fline =~ "TEICORPUSID"){next;} # skip line if empty line or comment line or first line
my @flarray = split(/\s*\t+\s*/, $fline); # split each line into array
+ # set full titles ($flarray[1]) as keys:
+ $corpussigles{$flarray[1]} = $flarray[0];
+ $srcfullnames{$flarray[1]} = $flarray[1];
+ $srcpubplaces{$flarray[1]} = $flarray[6];
+ $srcpublishers{$flarray[1]} = $flarray[7];
+ $srctexttypes{$flarray[1]} = $flarray[4];
+ $srctextlangs{$flarray[1]} = $flarray[5];
+
+ # also set simple titles ($flarray[2]) as keys:
$corpussigles{$flarray[2]} = $flarray[0];
$srcfullnames{$flarray[2]} = $flarray[1];
$srcpubplaces{$flarray[2]} = $flarray[6];
@@ -206,8 +260,9 @@
keep_atts_order => 1, # requires Tie::IxHash
comments => 'drop',
start_tag_handlers => {
- texts => sub{root(@_, $corpusHeader)}
+ texts => sub{root(@_, $corpusHeader)}
},
+
twig_handlers => {
# text => \&text
text => sub{text(@_, $textHeader->copy)} # copy must be because textHeader will be flushed with $twig in the <text> handler;
@@ -223,8 +278,6 @@
-
-
###########
# END MAIN
###########
@@ -268,7 +321,7 @@
my $fnsource = $array[$l-1];
$fnsource =~ s/([0-9][0-9][0-9][0-9])\.xml$//;
- my $year = $1; # $1 now containts substring in first bracket in regex above
+ my $year = $1; # $1 contains the substring captured by the first group in the regex above
#-----------------------
# set corpus header
@@ -277,8 +330,8 @@
&set_title( $corpusHeader, $srcfullnames{$fnsource}, $year, $kielipankkiCorpus);
&set_sourceDesc($corpusHeader, $srcfullnames{$fnsource}, $year, $kielipankkiCorpus);
- my $teiCorpusHeader = $corpusHeader->paste("first_child", $root);
-
+ $corpusHeader->paste("first_child", $root);
+ $idsDoc ->paste("after", $corpusHeader);
}
@@ -303,17 +356,24 @@
# &createTextHeader returns the $textID:
my $textID = &createTextHeader($text, $textattsref, $textHeader);
- #--------------------------
- # create <TEI> from <text>
- #--------------------------
+
+ #----------------------------------------
+ # create <TEI> or <idsText> from <text>
+ #----------------------------------------
# set vrt <text> to <TEI> and delete all attributes after they were were saved above
$text->del_atts;
- $text->set_gi("TEI");
- $text->set_att('xml:id', $textID);
+ if($TEIFORMAT eq "TEI"){
+ $text->set_gi("TEI");
+ $text->set_att('xml:id', $textID);
+ }
+ else {
+ $text ->set_gi("idsText");
+ $text ->set_att('version', "1.0");
+# $text ->move("last_child", $idsDoc); # does not work because apparently $idsDoc is not under $root at this point
-
+ }
@@ -327,7 +387,7 @@
# set atts
$div_element ->set_att("type", "page"); # ToDo: this is specific to KLK
- $ttext_element->set_att("xml:lang", 'fi'); # as in ICC-NOR
+ $ttext_element->set_att("xml:lang", 'fi'); # as in ICC-NOR
# paste
$ttext_element->paste('last_child', $text);
@@ -438,8 +498,6 @@
my @datearray = split("-", $DATE);
my @langarray = split("|", $SUMLANG);
my @namearray = split(/[\.\/]/, $ORIGFILENAME); # use $namearray[4] as ID for the page
-
-
#----------------------------------------------------
# create textSigle to be returned from this function
@@ -453,7 +511,7 @@
my $MMM = $months{$mm};
my $textID = $corpusID . $yy . "_" . $MMM . "." . sprintf("%05d", $doccounter{$mm}++);
-
+ my $textSigle = $textID;
#-----------------------------------------------------------------------
@@ -470,9 +528,24 @@
# <title>[$LABEL, page $PAGENO]</title>
$textHeader->first_child("fileDesc") -> set_att('n', "EuReCo-". $kielipankkiCorpus . $ID);
+
+
+ #-----------------
+ # titleStmt
+ #----------------
- $textHeader->first_child("fileDesc") -> first_child("titleStmt")->first_child("title")
- ->set_text($LABEL . ", Text #" . $textcounter); # Case KLK; PAGENO scheint meist "None" zu sein
+ my $title="title";
+ my $titleStmt = $textHeader->first_child("fileDesc")->first_child("titleStmt");
+
+ if($TEIFORMAT eq "I5"){
+ $title = "t.title";
+ $textSigle =~ s/_/\//g;
+ $titleStmt->first_child("textSigle")->set_text($textSigle);
+ };
+
+ $titleStmt->first_child($title)->set_text($LABEL . ", Text #" . $textcounter);
+
+ # Case KLK; PAGENO seems to be "None" most of the time
#-----------------------------------------------
# <fileDesc>
@@ -492,18 +565,18 @@
# <textLang>$LANGUAGE</textLang>
my $analytic = $textHeader->get_xpath("./fileDesc/sourceDesc/biblStruct/analytic", 0);
-
- $analytic->first_child("title") ->set_text($LABEL . ", Text #" . $textcounter); # Case KLK; PAGENO scheint meist "None" zu sein
- $analytic->get_xpath('./date[@type="date"]', 0) ->set_text($DATE);
- $analytic->get_xpath('./date[@type="year"]', 0) ->set_text($datearray[0]);
- $analytic->get_xpath('./date[@type="month"]', 0) ->set_text($datearray[1]);
- $analytic->get_xpath('./date[@type="day"]', 0) ->set_text($datearray[2]);
+ if($TEIFORMAT eq "I5"){$title="h.title"};
+
+
+ $analytic->first_child($title) ->set_text($LABEL . ", Text #" . $textcounter); # Case KLK; PAGENO seems to be "None" most of the time
$analytic->get_xpath('./idno[@type="PAGEID"]', 0) ->set_text($PAGEID);
$analytic->get_xpath('./idno[@type="BINDINGID"]', 0) ->set_text($BID);
$analytic->get_xpath('./idno[@type="ID"]', 0) ->set_text($ID);
$analytic->get_xpath('./idno[@type="KIELIPANKKI_METAFILENAME"]', 0) ->set_text($METAFILENAME);
$analytic->get_xpath('./idno[@type="KIELIPANKKI_ORIGFILENAME"]', 0) ->set_text($ORIGFILENAME);
- $analytic->first_child('textLang') ->set_text($LANGUAGE);
+ if($TEIFORMAT eq "TEI"){
+ $analytic->first_child('textLang') ->set_text($LANGUAGE);
+ }
# <monogr>
# <title>$PUBLTITLE</title>
@@ -518,37 +591,63 @@
my $monogr = $textHeader->get_xpath("./fileDesc/sourceDesc/biblStruct/monogr", 0);
- $monogr->first_child("title") ->set_text($PUBLTITLE);
- $monogr->first_child("imprint")->first_child("pubPlace") ->set_text("ToDo"); # imprint is needed for tei validity
- $monogr->first_child("imprint")->first_child("publisher") ->set_text("ToDo"); # imprint is needed for tei validity
+ $monogr->first_child($title) ->set_text($PUBLTITLE);
+ if($TEIFORMAT eq "TEI"){
+ $monogr->get_xpath('./imprint/date[@type="date"]', 0) ->set_text($DATE);
+ }
+ my $date = "date";
+ if($TEIFORMAT eq "I5"){$date="pubDate"};
+ $monogr->get_xpath('./imprint/' . $date . '[@type="year"]', 0) ->set_text($datearray[0]);
+ $monogr->get_xpath('./imprint/' . $date . '[@type="month"]', 0) ->set_text($datearray[1]);
+ $monogr->get_xpath('./imprint/' . $date . '[@type="day"]', 0) ->set_text($datearray[2]);
+ $monogr->first_child("imprint")->first_child("pubPlace") ->set_text($srcpubplaces{$PUBLTITLE}); # imprint is needed for tei validity
+ $monogr->first_child("imprint")->first_child("publisher") ->set_text($srcpublishers{$PUBLTITLE}); # imprint is needed for tei validity
$monogr->get_xpath('./biblScope[@unit="ISSUETITLE"]', 0) ->set_text($ISSUETITLE);
$monogr->get_xpath('./biblScope[@unit="ISSUENO"]', 0) ->set_text($ISSUENO);
$monogr->get_xpath('./biblScope[@unit="ISSUEDATE"]', 0) ->set_text($ISSUEDATE);
$monogr->get_xpath('./biblScope[@unit="pp"]', 0) ->set_text($PAGENO); # Achtung - PAGENO scheint meist "None" zu sein
+ my $dateNice = $datearray[2] . "." . $datearray[1] . "." . $datearray[0];
+ my $dateBackwards = $datearray[0] . "." . $datearray[1] . "." . $datearray[2];
+
+ if($TEIFORMAT eq "I5"){
+ my $refCompleteText = $textSigle . " " . $PUBLTITLE . " no. " . $ISSUENO . ", [" . $mapping{$PUBLTYPE} . "], " . $dateNice;
+ my $refShortText = $textSigle . " " . $PUBLTITLE . ", " . $dateNice;
+ $textHeader->get_xpath('./fileDesc/sourceDesc/reference[@type="complete"]', 0) -> set_text($refCompleteText);
+ $textHeader->get_xpath('./fileDesc/sourceDesc/reference[@type="short"]' , 0) -> set_text($refShortText);
+ }
+
+
# <encodingDesc>
# <tagsDecl>
# <namespace name="http://www.tei-c.org/ns/1.0">
# <tagUsage gi="s" occurs="SENTCOUNT"/>
# <tagUsage gi="w" occurs="TOKENCOUNT"/>
- $textHeader->get_xpath('./encodingDesc/tagsDecl/namespace/tagUsage[@gi="s"]', 0) -> set_att('occurs', $SENTCOUNT);
- $textHeader->get_xpath('./encodingDesc/tagsDecl/namespace/tagUsage[@gi="w"]', 0) -> set_att('occurs', $TOKENCOUNT);
-
+ my $namespacePath="./encodingDesc/tagsDecl/namespace/";
+ if($TEIFORMAT eq "I5"){$namespacePath="./encodingDesc/tagsDecl/"};
+
+ $textHeader->get_xpath($namespacePath . 'tagUsage[@gi="s"]', 0) -> set_att('occurs', $SENTCOUNT);
+ $textHeader->get_xpath($namespacePath . 'tagUsage[@gi="w"]', 0) -> set_att('occurs', $TOKENCOUNT);
+
# <profileDesc>
# <langUsage>
# <language ident="fi" usage="|xxx:44|fin:23|eng:3|"/>
# </langUsage>
# <textClass>
# <classCode scheme="kielipankki_klk">$PUBLTYPE</classCode>
- # <classCode scheme="kielipankki_klk_mapped">TODO</classCode>
+ # <classCode scheme="kielipankki_klk_mapped">$mapping{$PUBLTYPE}</classCode>
+ if($TEIFORMAT eq "I5"){
+ $textHeader->get_xpath('./profileDesc/creation/creatDate', 0) ->set_text($dateBackwards);
+ }
+
$textHeader->get_xpath('./profileDesc/langUsage/language', 0) ->set_att('ident', $LANGUAGE);
$textHeader->get_xpath('./profileDesc/langUsage/language', 0) ->set_att('usage', $SUMLANG);
# in @usage muss eigt. ein integer; am besten inhalt von SUMLANG aufdroeseln und mehrere <language> machen
$textHeader->get_xpath('./profileDesc/textClass/classCode[@scheme="kielipankki_klk"]', 0) ->set_text($PUBLTYPE);
- $textHeader->get_xpath('./profileDesc/textClass/classCode[@scheme="kielipankki_klk_mapped"]', 0)->set_text("ToDo");
+ $textHeader->get_xpath('./profileDesc/textClass/classCode[@scheme="kielipankki_klk_mapped"]', 0)->set_text($mapping{$PUBLTYPE});
# <revisionDesc>
# <change when="TODO" who="HL">TEI version for EuReCo</change>