revamped textHeader using external skeleton
diff --git a/vrt2tei.pl b/vrt2tei.pl
index dd1cffa..0356dbe 100755
--- a/vrt2tei.pl
+++ b/vrt2tei.pl
@@ -88,6 +88,11 @@
my $twig="";
my $teiCorpusHeaderDoc="";
+# global variables pertaining to the original corpus:
+my $kielipankkiCorpus = "klk-fi-v2-vrt";
+
+
+
#------------------------------------------------------------------
# read corpusHeaderSkeleton document and get header out of it
@@ -125,14 +130,6 @@
open(my $IN, "< $ARGV[0]") || die("$0: cannot open file for reading: $ARGV[0]"); # open input file and initialise filehandel, actually does not seem to be needed
# as parsefile() (s.b.) is applied to the filename
-#-----------------------------------------------------
-# global variables pertaining to the original corpus
-#-----------------------------------------------------
-
-my $kielipankkiCorpus = "klk-fi-v2-vrt";
-
-
-
#####################
# M A I N
@@ -237,8 +234,9 @@
sub text {
my ($twig, $text, $textHeader) = @_;
- $textcounter++; # global variable
+ $textcounter++;
+
# ToDo: catch all other, unexpected children of root
#--------------------------------------------------------------------------
@@ -382,6 +380,7 @@
+
#-----------------------------------------------------------------------
# CREATE text-teiHeader ACCORDING TO THE SKELETON in $textHeader
#-----------------------------------------------------------------------
@@ -391,12 +390,14 @@
#-----------------------------------------------
# <teiHeader>
- # <fileDesc n="[EuReCo-KLK-FIN_$ID]">
+ # <fileDesc n="EuReCo-[$kielipankkiCorpus][$ID]">
# <titleStmt>
# <title>[$LABEL, page $PAGENO]</title>
- $textHeader->first_child("fileDesc")->first_child("titleStmt")->first_child("title")
- ->set_text($LABEL . ", Text #" . $textcounter); # Case KLK
+ $textHeader->first_child("fileDesc") -> set_att('n', "EuReCo-". $kielipankkiCorpus . $ID);
+
+ $textHeader->first_child("fileDesc") -> first_child("titleStmt")->first_child("title")
+ ->set_text($LABEL . ", Text #" . $textcounter); # Case KLK; PAGENO scheint meist "None" zu sein
#-----------------------------------------------
# <fileDesc>
@@ -414,11 +415,11 @@
# <idno type="KIELIPANKKI_METAFILENAME">$METAFILENAME</idno>
# <idno type="KIELIPANKKI_ORIGFILENAME">$ORIGFILENAME</idno>
# <textLang>$LANGUAGE</textLang>
- # </analytic>
my $analytic = $textHeader->get_xpath("./fileDesc/sourceDesc/biblStruct/analytic", 0);
- $analytic->first_child("title") ->set_text($LABEL . ", Text #" . $textcounter); # Case KLK
+ $analytic->first_child("title") ->set_text($LABEL . ", Text #" . $textcounter); # Case KLK; PAGENO scheint meist "None" zu sein
+ $analytic->get_xpath('./date[@type="date"]', 0) ->set_text($DATE);
$analytic->get_xpath('./date[@type="year"]', 0) ->set_text($datearray[0]);
$analytic->get_xpath('./date[@type="month"]', 0) ->set_text($datearray[1]);
$analytic->get_xpath('./date[@type="day"]', 0) ->set_text($datearray[2]);
@@ -433,162 +434,59 @@
# <title>$PUBLTITLE</title>
# <imprint>
# <pubPlace>TODO</pubPlace>
- # <publisher>TODO</publisher>
+ # <publisher>TODO</publisher>
# </imprint>
# <biblScope unit="ISSUETITLE"/>
# <biblScope unit="ISSUENO"/>
# <biblScope unit="ISSUEDATE"/>
# <biblScope unit="pp">$PAGENO</biblScope>
- # <monogr>
my $monogr = $textHeader->get_xpath("./fileDesc/sourceDesc/biblStruct/monogr", 0);
+ $monogr->first_child("title") ->set_text($PUBLTITLE);
+ $monogr->first_child("imprint")->first_child("pubPlace") ->set_text("ToDo"); # imprint is needed for tei validity
+ $monogr->first_child("imprint")->first_child("publisher") ->set_text("ToDo"); # imprint is needed for tei validity
+ $monogr->get_xpath('./biblScope[@unit="ISSUETITLE"]', 0) ->set_text($ISSUETITLE);
+ $monogr->get_xpath('./biblScope[@unit="ISSUENO"]', 0) ->set_text($ISSUENO);
+ $monogr->get_xpath('./biblScope[@unit="ISSUEDATE"]', 0) ->set_text($ISSUEDATE);
+ $monogr->get_xpath('./biblScope[@unit="pp"]', 0) ->set_text($PAGENO); # Achtung - PAGENO scheint meist "None" zu sein
+ # <encodingDesc>
+ # <tagsDecl>
+ # <namespace name="http://www.tei-c.org/ns/1.0">
+ # <tagUsage gi="s" occurs="SENTCOUNT"/>
+ # <tagUsage gi="w" occurs="TOKENCOUNT"/>
+ $textHeader->get_xpath('./encodingDesc/tagsDecl/namespace/tagUsage[@gi="s"]', 0) -> set_att('occurs', $SENTCOUNT);
+ $textHeader->get_xpath('./encodingDesc/tagsDecl/namespace/tagUsage[@gi="w"]', 0) -> set_att('occurs', $TOKENCOUNT);
+
+ # <profileDesc>
+ # <langUsage>
+ # <language ident="fi" usage="|xxx:44|fin:23|eng:3|"/>
+ # </langUsage>
+ # <textClass>
+ # <classCode scheme="kielipankki_klk">$PUBLTYPE</classCode>
+ # <classCode scheme="kielipankki_klk_mapped">TODO</classCode>
-
-##TMP # create <teiHeader> inside <TEI>
-##TMP my $teiHeader = XML::Twig::Elt->new('teiHeader');
-##TMP # $teiHeader->paste('first_child', $text);
-##TMP
-##TMP ## insert_new_elt is a combo of new and paste, cf. xml::twig docu:
-##TMP ## insert_new_elt ($opt_position, $gi, $opt_atts_hashref, @opt_content)
-##TMP
-##TMP my $fileDesc = $teiHeader->insert_new_elt('fileDesc' => {n => "EuReCo_KLK-fi_" . $namearray[4]});
-##TMP my $encodingDesc = $teiHeader->insert_new_elt("last_child", 'encodingDesc');
-##TMP my $profileDesc = $teiHeader->insert_new_elt("last_child", 'profileDesc');
-##TMP my $revisionDesc = $teiHeader->insert_new_elt("last_child", 'revisionDesc');
-##TMP
-##TMP #---------------------
-##TMP # fileDesc/titleStmt
-##TMP #---------------------
-##TMP my $titleStmt = $fileDesc ->insert_new_elt('titleStmt');
-##TMP my $title = $titleStmt->insert_new_elt("last_child", 'title');
-##TMP my $respStmt = $titleStmt->insert_new_elt("last_child", 'respStmt');
-##TMP my $resp = $respStmt ->insert_new_elt("last_child", 'resp');
-##TMP my $name = $respStmt ->insert_new_elt("last_child", 'name');
-##TMP
-##TMP # set texts for titleStmt
-##TMP # $title->set_text($LABEL . ", page " . $PAGENO); # Achtung - PAGENO scheint meist "None" zu sein
-##TMP $title->set_text($LABEL . ", Text #" . $textcounter); # at least for Suomen Kuvalehti
-##TMP $resp ->set_text("compiled by EuReCo");
-##TMP $name ->set_text("EuReCo: HL");
-##TMP
-##TMP #--------------------------
-##TMP # fileDesc/publicationStmt
-##TMP #--------------------------
-##TMP my $publicationStmt = $fileDesc ->insert_new_elt("last_child", 'publicationStmt');
-##TMP my $distributor = $publicationStmt->insert_new_elt("last_child", 'distributor');
-##TMP my $note = $distributor ->insert_new_elt("last_child", 'note');
-##TMP my $availability = $publicationStmt->insert_new_elt("last_child", 'availability');
-##TMP my $licence = $availability ->insert_new_elt("last_child", 'licence');
-##TMP
-##TMP # set texts for publicationStmt
-##TMP $note ->set_text("NOT FOR DISTRIBUTION - to be used locally in EuReCo");
-##TMP $licence->set_text("CLARIN_RES"); # TODO: Ausfuherlichere Licence info in KLK Metadata Record
-##TMP
-##TMP #------------------------------
-##TMP # fileDesc/sourceDesc/biblStruct
-##TMP #------------------------------
-##TMP my $sourceDesc = $fileDesc ->insert_new_elt("last_child", 'sourceDesc');
-##TMP my $biblStruct = $sourceDesc->insert_new_elt("last_child", 'biblStruct');
-##TMP
-##TMP # fileDesc/sourceDesc/biblStruct/analytic
-##TMP my $analytic = $biblStruct->insert_new_elt("last_child", 'analytic');
-##TMP my $analytic_title = $analytic->insert_new_elt("last_child", 'title' => {type => "main"} );
-##TMP # my $analytic_date = $analytic->insert_new_elt("last_child", 'date');
-##TMP my $analytic_date_year = $analytic->insert_new_elt("last_child", 'date' => {type => "year"});
-##TMP my $analytic_date_month = $analytic->insert_new_elt("last_child", 'date' => {type => "month"});
-##TMP my $analytic_date_day = $analytic->insert_new_elt("last_child", 'date' => {type => "day"});
-##TMP my $analytic_idno_pageid = $analytic->insert_new_elt("last_child", 'idno' => {type => "PAGEID"});
-##TMP my $analytic_idno_bindingid = $analytic->insert_new_elt("last_child", 'idno' => {type => "BINDINGID"});
-##TMP my $analytic_idno_id = $analytic->insert_new_elt("last_child", 'idno' => {type => "ID"});
-##TMP my $analytic_idno_metafile = $analytic->insert_new_elt("last_child", 'idno' => {type => "KIELIPANKKI_METAFILENAME"});
-##TMP my $analytic_idno_origfile = $analytic->insert_new_elt("last_child", 'idno' => {type => "KIELIPANKKI_ORIGFILENAME"});
-##TMP my $analytic_textlang = $analytic->insert_new_elt("last_child", 'textLang');
-##TMP
-##TMP # set texts for analytic
-##TMP # $analytic_title ->set_text($LABEL . ", page " . $PAGENO); # Achtung $PAGENO scheint meist "None zu sein"
-##TMP $analytic_title ->set_text($LABEL . ", Text #" . $textcounter); # Achtung $PAGENO scheint meist "None zu sein"
-##TMP # $analytic_date ->set_text($DATE);
-##TMP $analytic_date_year ->set_text($datearray[0]);
-##TMP $analytic_date_month ->set_text($datearray[1]);
-##TMP $analytic_date_day ->set_text($datearray[2]);
-##TMP $analytic_idno_pageid ->set_text($PAGEID);
-##TMP $analytic_idno_bindingid->set_text($BID);
-##TMP $analytic_idno_id ->set_text($ID);
-##TMP $analytic_idno_metafile ->set_text($METAFILENAME);
-##TMP $analytic_idno_origfile ->set_text($ORIGFILENAME);
-##TMP $analytic_textlang ->set_text($LANGUAGE);
-##TMP
-##TMP #-------------------------------------
-##TMP # fileDesc/sourceDesc/biblStruct/monogr
-##TMP #-------------------------------------
-##TMP my $monogr = $biblStruct->insert_new_elt("last_child", 'monogr');
-##TMP my $monogr_title = $monogr ->insert_new_elt("last_child", 'title');
-##TMP my $imprint = $monogr ->insert_new_elt("last_child", 'imprint'); # imprint is needed for valididty
-##TMP my $pubPlace = $imprint ->insert_new_elt("last_child", 'pubPlace'); # imprint is needed for validity
-##TMP my $publisher = $imprint ->insert_new_elt("last_child", 'publisher'); # imprint is needed for validity
-##TMP my $biblScope_issuetitle = $monogr ->insert_new_elt("last_child", 'biblScope' => {unit => 'ISSUETITLE'} );
-##TMP my $biblScope_issueno = $monogr ->insert_new_elt("last_child", 'biblScope' => {unit => 'ISSUENO'} );
-##TMP my $biblScope_issuedate = $monogr ->insert_new_elt("last_child", 'biblScope' => {unit => 'ISSUEDATE'} );
-##TMP my $biblScope_pp = $monogr ->insert_new_elt("last_child", 'biblScope' => {unit => 'PAGENO'} ); # Achtung PAGENO ist meist "None" ?
-##TMP
-##TMP # set texts for monogr
-##TMP $monogr_title ->set_text($PUBLTITLE);
-##TMP $pubPlace ->set_text("TODO");
-##TMP $pubPlace ->set_att("key",'FI');
-##TMP $publisher ->set_text("TODO");
-##TMP $biblScope_issuetitle->set_text($ISSUETITLE);
-##TMP $biblScope_issueno ->set_text($ISSUENO);
-##TMP $biblScope_issuedate ->set_text($ISSUEDATE);
-##TMP $biblScope_pp ->set_text($PAGENO);
-##TMP
-##TMP #---------------
-##TMP # encodingDesc
-##TMP #---------------
-##TMP my $tagsDecl = $encodingDesc->insert_new_elt("last_child", 'tagsDecl');
-##TMP my $namespace = $tagsDecl ->insert_new_elt("last_child", 'namespace' => {name => 'http://www.tei-c.org/ns/1.0'});
-##TMP my $tagUsage_s = $namespace ->insert_new_elt("last_child", 'tagUsage' => {gi => 's', occurs => $SENTCOUNT});
-##TMP my $tagUsage_w = $namespace ->insert_new_elt("last_child", 'tagUsage' => {gi => 'w', occurs => $TOKENCOUNT});
-##TMP
-##TMP #-------------
-##TMP # profileDesc
-##TMP #-------------
-##TMP my $langUsage = $profileDesc ->insert_new_elt("last_child", 'langUsage');
-##TMP my $language = $langUsage ->insert_new_elt("last_child", 'language' => {ident => $LANGUAGE, usage => $SUMLANG});
-##TMP # Achtung in @usage muss eigt. ein integer; am besten inhalt von SUMLANG aufdroeseln und mehrere <language> machen
-##TMP my $textClass = $profileDesc ->insert_new_elt("last_child", 'textClass');
-##TMP my $classCode_fi = $textClass ->insert_new_elt("last_child", 'classCode' => {scheme => "KLK_PUBLTYPE"});
-##TMP # my $classCode_en = $textClass ->insert_new_elt("last_child", 'classCode' => {scheme => "KLK_PUBLTYPE_MAPPED"});
-##TMP
-##TMP #---------------------------
-##TMP # set texts for profileDesc
-##TMP #---------------------------
-##TMP $classCode_fi ->set_text($PUBLTYPE);
-##TMP # $classCode_en->set_text($PUBLTYPETRANSL);
-##TMP
-##TMP #---------------
-##TMP # revisionDesc
-##TMP #---------------
-##TMP my $change = $revisionDesc ->insert_new_elt("last_child", 'change' => {when => localtime->ymd('-'), who => 'HL' });
-##TMP
-##TMP # set texts for revisionDesc
-##TMP $change->set_text("TEI version for EuReCo");
+ $textHeader->get_xpath('./profileDesc/langUsage/language', 0) ->set_att('ident', $LANGUAGE);
+ $textHeader->get_xpath('./profileDesc/langUsage/language', 0) ->set_att('usage', $SUMLANG);
+ # NOTE: @usage should really hold an integer; ideally split the content of SUMLANG and emit several <language> elements
+ $textHeader->get_xpath('./profileDesc/textClass/classCode[@scheme="kielipankki_klk"]', 0) ->set_text($PUBLTYPE);
+ $textHeader->get_xpath('./profileDesc/textClass/classCode[@scheme="kielipankki_klk_mapped"]', 0)->set_text("ToDo");
+ # <revisionDesc>
+ # <change when="TODO" who="HL">TEI version for EuReCo</change>
-
-
-
+ $textHeader->get_xpath('./revisionDesc/change', 0) ->set_att('when', localtime->ymd('-'));
- ###################################
+ #-----------------------------------
# END OF CREATING TEIHEADER
- ###################################
+ #-----------------------------------
}
-
+
sub setP {
my ($paragraph) = @_;