dtd handling and some renamings

commit: 8162ad5dba48ac36b5ab2bf4209aadc8dea48a51 [log] [tgz]
author: Harald Lüngen <luengen@ids-mannheim.de> Thu Sep 19 10:54:24 2024 +0300
committer: Harald Lüngen <luengen@ids-mannheim.de> Thu Sep 19 10:54:24 2024 +0300
tree: acce87e81e19ef18b5c4d319d0d183194310d770
parent: cb223bd2695fbb2038e03d7a172185582911af75 [diff]
diff --git a/vrt2tei.pl b/vrt2tei.pl
index 63dd6fd..42abd2b 100755
--- a/vrt2tei.pl
+++ b/vrt2tei.pl

@@ -16,7 +16,6 @@
 #
 #
 # TODO: 
-# 1  insert dtd spec, or ref to TEI
 
 # 3a remove the vrt positional attribute comment line / all comment lines
 # 3b add @head and @deprel to I5 sowie auch @msd
@@ -94,12 +93,12 @@
 #my $TEIFORMAT = "TEI";
 my $TEIFORMAT = "I5";
 
-
+my $DTDDECL = 'idsCorpus PUBLIC "-//IDS//DTD IDS-I5 1.0//EN" "http://corpora.ids-mannheim.de/I5/DTD/i5.dtd"';     # for I5
 
 my $textcounter = 0;
 my $LASTMONTH = 0;
 
-our %corpussigles    = ();
+our %corpusids    = ();
 our %srcfullnames    = ();
 our %srcpubplaces    = ();
 our %srcpublishers   = ();
@@ -124,10 +123,15 @@
     "12" => 1,
     );
 
+# global variables pertaining to the original corpus of *all* newspapers:
+my $kielipankkiCorpus  = "klk-fi-v2-vrt";
+my $kielipankkiLicense = "CLARIN-RES";
+my $CountryKey         = "FI";
 
-
+# Table with metadata about the different sources (newspapers)
 my $sourcescsvfile = "sources_klk_fi_v2_2021_4eureco.csv";
 
+# corpusheader and textheader skeletons
 my $corpheaderfile = "teiCorpusHeaderSkeleton.tei.xml";
 my $textheaderfile = "teiTextHeaderSkeleton.tei.xml";
 if($TEIFORMAT eq "I5"){
@@ -138,11 +142,6 @@
 
 my $twig="";   
 
-# global variables pertaining to the original corpus :
-my $kielipankkiCorpus  = "klk-fi-v2-vrt";
-my $kielipankkiLicense = "CLARIN-RES";
-
-
 # variables $fnsource and $fnyear taken from the filename
 my @array = split(/\//, $ARGV[0]);
 my $l = scalar(@array);
@@ -150,6 +149,7 @@
 $fnsource =~ s/([0-9][0-9][0-9][0-9])\.xml$//;
 
 my $fnyear = $1;   # $1 contains substring in first bracket in regex above
+my $fnYY   = substr($fnyear, 2, 2);
 
 
 # months
@@ -202,7 +202,8 @@
     my @flarray = split(/\s*\t+\s*/, $fline);                                  # split each line into array
 
     # set full titles ($flarray[1]) as keys:
-    $corpussigles{$flarray[1]}  = $flarray[0];
+    # (ToDo: these hashes could probably be conflated into an array of hashes or similar)
+    $corpusids{$flarray[1]}     = $flarray[0];
     $srcfullnames{$flarray[1]}  = $flarray[1];
     $srcpubplaces{$flarray[1]}  = $flarray[6];
     $srcpublishers{$flarray[1]} = $flarray[7];
@@ -210,7 +211,7 @@
     $srctextlangs{$flarray[1]}  = $flarray[5];
 
     # also set simple titles ($flarray[2]) as keys: 
-    $corpussigles{$flarray[2]}  = $flarray[0];
+    $corpusids{$flarray[2]}     = $flarray[0];
     $srcfullnames{$flarray[2]}  = $flarray[1];
     $srcpubplaces{$flarray[2]}  = $flarray[6];
     $srcpublishers{$flarray[2]} = $flarray[7];
@@ -223,7 +224,7 @@
 $expandLang{"sv"} = "Swedish";
 
 #------------------------------------------------------------------
-# read corpusHeaderSkeleton document and get header out of it
+# read corpusHeaderSkeleton document and start a twig for it
 #------------------------------------------------------------------
 
 my $teiCorpusHeaderDocTwig = new XML::Twig(
@@ -238,7 +239,7 @@
 
 
 #------------------------------------------------------------------
-# read textHeaderSkeleton document adn get header out of it
+# read textHeaderSkeleton document and start a twig for it
 #------------------------------------------------------------------
 
 my $teiTextHeaderDocTwig = new XML::Twig(
@@ -251,6 +252,12 @@
 my $textHeader = $teiTextHeaderDocTwig->root;                      # getting the teiHeader for corpus out of the teiTextHeaderSkeleton document
 
 
+#---------------------------------------------------------
+# define a subtree for idsDoc
+# for the time being it will only be used for the first
+# idsDoc header, to be inserted in the root handler
+#---------------------------------------------------------
+
 my $idsDoc       = XML::Twig::Elt->new('idsDoc');    
 my $idsDocHeader = XML::Twig::Elt->new('idsHeader');
 
@@ -262,7 +269,7 @@
 
     my $docPublicationStmt = XML::Twig::Elt->new('publicationStmt');
     my $docDistributor     = XML::Twig::Elt->new('distributor');
-    my $docPubAddress      = XML::Twig::Elt->new('pubAdress');
+    my $docPubAddress      = XML::Twig::Elt->new('pubAddress');
     my $docAvailability    = XML::Twig::Elt->new('availability');
     my $docPubDate         = XML::Twig::Elt->new('pubDate');
     
@@ -271,6 +278,16 @@
     my $docMonogr          = XML::Twig::Elt->new('monogr');
     my $docHTitle          = XML::Twig::Elt->new('h.title');
     my $docImprint         = XML::Twig::Elt->new('imprint');
+
+    $idsDoc                    -> set_att('version', "1.0");
+    $idsDoc                    -> set_att('TEIform', "TEI.2");
+
+    $idsDocHeader              -> set_att('version', "1.1");
+    $idsDocHeader              -> set_att('type', "document");
+    $idsDocHeader              -> set_att('pattern', "text");
+    $idsDocHeader              -> set_att('TEIform', "teiHeader");
+
+
     
     $docSigle                  -> paste("first_child", $docTitleStmt);
     $dtitle                    -> paste("last_child",  $docTitleStmt);
@@ -291,15 +308,14 @@
     
     $idsDocHeader           -> paste("last_child",  $idsDoc);
 
-    # ToDo set dummy dtitle and docSigle
-
-    $docSigle->set_text($corpussigles{$fnsource} . "/JAN");
+    $docSigle->set_text($corpusids{$fnsource} . $fnYY  . "/JAN");
     $dtitle  ->set_text($srcfullnames{$fnsource} . ", January " . $fnyear);
 }
 
 
+
 #----------------------------------
-# read input VRT-XML document
+# read the input VRT-XML document
 #----------------------------------
 
 open(my $IN,  "< $ARGV[0]") || die("$0: cannot open file for reading: $ARGV[0]");     # open input  file and initialise filehandel, actually does not seem to be needed
@@ -316,7 +332,6 @@
 #-------------------------------------------------------------------------------------------------------------
 
 
-
 $twig = new XML::Twig(
     keep_spaces => 1,           # dadurch auch whitespaces an ehemeligen elementgrenzen im output
     keep_atts_order => 1,       # requires Tie::IxHash
@@ -329,17 +344,13 @@
 #	text => \&text
 	text =>  sub{text(@_, $textHeader->copy)}    #   copy must be because textHeader will be flushed with $twig in the <text> handler;
     },
-    # dtd_handlers =>   {       # ToDo for I5
-    #	\&set_dtd;
-    # }
-    
+
     output_encoding => $encoding,
     );
 
 $twig->parsefile($ARGV[0]);
 
 
-
 ###########
 # END MAIN
 ###########
@@ -351,27 +362,21 @@
 #   S U B R O U T I N E S
 ##############################
 
-# sub set_dtd [
-#    my $twig, $dtd = @_;
-#    my $internal = qq|\nPUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"\n"DTD/xhtml1-strict.dtd"|;
-#
-#    $twig->twig_doctype('html', undef, undef, $internal);
-#    }
-
-
-
 sub root {
     my ($twig, $root, $corpusHeader) =@_;
 
-    if($TEIFORMAT eq "TEI"){
-	$root->set_gi('teiCorpus');
+    if($TEIFORMAT eq "I5") {
+	$twig->set_doctype($DTDDECL);   # the doctype could probably be set anywhere to the twig
+	$root->set_gi('idsCorpus');
+	$root->set_att('version', "1.0");
+	$root->set_att('TEIform', "teiCorpus.2");
+	
     }
     else {
-	$root->set_gi('idsCorpus');
+	$root->set_gi('teiCorpus');
+	$root->set_att("xmlns", 'http://www.tei-c.org/ns/1.0');
     }
     
-    $root->set_att("xmlns", 'http://www.tei-c.org/ns/1.0');
-
     &insertCorpusHeader($root, $corpusHeader);
 }
 
@@ -380,6 +385,8 @@
 sub insertCorpusHeader{
     my ($root, $corpusHeader) =@_;
 
+    my $ident = "ident";
+
     #-----------------------
     # set corpus header
     #-----------------------
@@ -389,19 +396,20 @@
     if($TEIFORMAT eq "TEI"){
 	&set_sourceDesc($corpusHeader, $srcfullnames{$fnsource}, $fnyear, $kielipankkiCorpus);
     }
-    elsif ($TEIFORMAT eq "I5"){
-	$corpusHeader->get_xpath("fileDesc/titleStmt/korpusSigle",0)  ->set_text($corpussigles{$fnsource});
-	$corpusHeader->get_xpath("fileDesc/publicationStmt/pubDate",0)->set_text(localtime[5] + 1900);
-	$corpusHeader->get_xpath("profileDesc/langUsage/language",0)  ->set_text($expandLang{$srctextlangs{$fnsource}});
-	$corpusHeader->get_xpath("profileDesc/langUsage/language",0)  ->set_att('id', $srctextlangs{$fnsource});
+    elsif($TEIFORMAT eq "I5"){
+	$corpusHeader->get_xpath("fileDesc/titleStmt/korpusSigle",0)  ->set_text($corpusids{$fnsource} . $fnYY);
+	$corpusHeader->get_xpath("fileDesc/publicationStmt/pubDate",0)->set_text((localtime)[5] + 1900);
 	$corpusHeader->get_xpath('encodingDesc/editorialDecl/transduction[@n="2"]',0)  ->set_text("I5 version by EuReCo using vrt2tei.pl " . localtime->ymd('-'));
 	&set_sourceDescI5($corpusHeader);
+	$ident="id";
     }
     else{
 	print STDERR "TEIFORMAT is $TEIFORMAT; must be TEI or I5";
     }
 
     $corpusHeader->paste("first_child", $root);
+    $corpusHeader->get_xpath('./profileDesc/langUsage/language', 0)                                   ->set_att($ident, $srctextlangs{$fnsource});
+    $corpusHeader->get_xpath('./profileDesc/langUsage/language', 0)                                   ->set_text($expandLang{$srctextlangs{$fnsource}});
 
     if($TEIFORMAT eq "I5"){
 	$idsDoc->paste("after",       $corpusHeader);
@@ -427,7 +435,7 @@
     
     my $textattsref = $text->atts();               # $textattsref is now a reference to a hash and should be used with '->'
 
-    &createIdsDoc($textattsref);
+    &createIdsDoc($textattsref);                   # this creation of idsDoc will only be called for the 2nd idsDoc (i.e. february) or higher
 
 
 
@@ -583,12 +591,13 @@
 
     # SUK21.JAN.00001
     
-    my $corpusID = "SUK";                            # ToDo read Table with Source metadata
     my $yy       = substr($datearray[0], 2, 2);      # substr EXPR,OFFSET,LENGTH
     my $mm       = $datearray[1];                    # substr EXPR,OFFSET,LENGTH
     my $MMM      = $months{$mm};
 
-    my $textID = $corpusID . $yy . "_" . $MMM . "." . sprintf("%05d", $doccounter{$mm}++);
+    my $CSIGLE = $corpusids{$fnsource} . $yy;
+    
+    my $textID = $CSIGLE . "_" . $MMM . "." . sprintf("%05d", $doccounter{$mm}++);
     my $textSigle = $textID;
     
     
@@ -605,7 +614,7 @@
     #     <titleStmt>
     #       <title>[$LABEL, page $PAGENO]</title>	
 
-    $textHeader->first_child("fileDesc")   ->  set_att('n', "EuReCo-". $kielipankkiCorpus . $ID);
+    $textHeader->first_child("fileDesc")   ->  set_att('n', "EuReCo-". $kielipankkiCorpus . "-" . $ID);
 
 
     #-----------------
@@ -623,7 +632,7 @@
     
     $titleStmt->first_child($title)->set_text($LABEL . ", Text #" . $textcounter);
 
-    # Case KLK;  PAGENO scheint meist "None" zu sein
+    # Case KLK:  PAGENO scheint meist "None" zu sein
 
     #-----------------------------------------------
     # <fileDesc>
@@ -642,6 +651,7 @@
     #         <idno type="KIELIPANKKI_ORIGFILENAME">$ORIGFILENAME</idno>
     #         <textLang>$LANGUAGE</textLang>
 
+    
     my $analytic  = $textHeader->get_xpath("./fileDesc/sourceDesc/biblStruct/analytic", 0);
     if($TEIFORMAT eq "I5"){$title="h.title"};
     
@@ -679,6 +689,7 @@
     $monogr->get_xpath('./imprint/' . $date . '[@type="month"]',  0)         ->set_text($datearray[1]);
     $monogr->get_xpath('./imprint/' . $date . '[@type="day"]',    0)         ->set_text($datearray[2]);    
     $monogr->first_child("imprint")->first_child("pubPlace")        ->set_text($srcpubplaces{$PUBLTITLE});  # imprint is needed for tei validity
+    $monogr->first_child("imprint")->first_child("pubPlace")        ->set_att('key', $CountryKey);
     $monogr->first_child("imprint")->first_child("publisher")       ->set_text($srcpublishers{$PUBLTITLE}); # imprint is needed for tei validity
     #$monogr->get_xpath('./biblScope[@unit="ISSUETITLE"]', 0)        ->set_text($ISSUETITLE);
     #$monogr->get_xpath('./biblScope[@unit="ISSUENO"]', 0)           ->set_text($ISSUENO);
@@ -719,9 +730,10 @@
     if($TEIFORMAT eq "I5"){
 	$textHeader->get_xpath('./profileDesc/creation/creatDate', 0)                                   ->set_text($dateBackwards);
     }
-    
-    $textHeader->get_xpath('./profileDesc/langUsage/language', 0)                                   ->set_att('ident', $LANGUAGE);
-    $textHeader->get_xpath('./profileDesc/langUsage/language', 0)                                   ->set_att('usage', $SUMLANG);
+    if($TEIFORMAT eq "TEI"){
+	$textHeader->get_xpath('./profileDesc/langUsage/language', 0)                                   ->set_att('ident', $LANGUAGE);
+	$textHeader->get_xpath('./profileDesc/langUsage/language', 0)                                   ->set_att('usage', $SUMLANG);
+    }
     # in @usage muss eigt. ein integer; am besten inhalt von SUMLANG aufdroeseln und mehrere <language> machen
 
     $textHeader->get_xpath('./profileDesc/textClass/classCode[@scheme="kielipankki_klk"]', 0)       ->set_text($PUBLTYPE);
@@ -785,11 +797,12 @@
     my @tags = split(/\t/, $line);
     
     # set content of <w> i.e. the token
-    # $w_element->set_text($tags[0]);
-    my $random_w = Data::Random::String->create_random_string(length=>length($tags[0]), contains=>'alpha');
+    my $random_w = "";
+    # $random_w = Data::Random::String->create_random_string(length=>length($tags[0]), contains=>'alpha');
+    $random_w = $tags[0];
     $w_element->set_text($random_w);
     
-    # vrt positional-attributes in corpus KLK:
+    # vrt word and positional-attributes in corpus KLK:
     #  USE [0] word
     #  USE [1] ref  (id for reference of dephead)
     #  USE [2] lemma
@@ -811,8 +824,8 @@
     # $w_element->set_att("id",     "w_" . $namearray[4] . $sentence->att("xml:id") . "_" . $tags[1]);
     # so zusammengebaute ID ist auch nicht eindeutig...
     $w_element->del_att("id");
-    # $w_element->set_att("lemma",  $tags[2]);
-    $w_element->set_att("lemma",  $random_w);
+    $w_element->set_att("lemma",  $tags[2]);
+    #$w_element->set_att("lemma",  $random_w);
 
     # $w_element->set_att("norm",   $tags[3]);  # tag abuse of @norm
     $w_element->set_att("pos",    $tags[4]);
@@ -877,11 +890,13 @@
     my $PUBLTITLE = $srcfullnames{$fnsource};
     my $PUBLPLACE = $srcpubplaces{$PUBLTITLE};
     my $PUBLISHER = $srcpublishers{$PUBLTITLE};
-    my $CSIGLE    = $corpussigles{$PUBLTITLE};
-
-    my $YEAR  = $fnyear;
     
+    my $YEAR  = $fnyear;
+    my $YY    = substr($fnyear, 2, 2);
 
+    my $CSIGLE    = $corpusids{$PUBLTITLE} . $YY;
+
+    
     #<idsHeader>
     #  <fileDesc>
     #    <!-- ... -->
@@ -891,7 +906,7 @@
     #          <h.title type="main">[$PUBLTITLE], [$YEAR]</h.title>
     #          <imprint>
     #            <publisher>[$PUBLISHER]</publisher>
-    #            <pubPlace key="DE">[$PUBPLACE]</pubPlace>
+    #            <pubPlace key="[$CountryKey]">[$PUBLPLACE]</pubPlace>
     #          </imprint>
     #        </monogr>
     #      </biblStruct>
@@ -905,6 +920,7 @@
     $cMonogr->first_child("h.title")->set_text($PUBLTITLE);
     $cMonogr->first_child("imprint")->first_child("publisher")->set_text($PUBLISHER);
     $cMonogr->first_child("imprint")->first_child("pubPlace") ->set_text($PUBLPLACE);
+    $cMonogr->first_child("imprint")->first_child("pubPlace") ->set_att('key', $CountryKey);
 
     $corpusHeader->first_child("fileDesc")->first_child("sourceDesc")->first_child("reference")->set_text($CSIGLE . " " . $PUBLTITLE . "; " . $PUBLPLACE . ": " . $PUBLISHER . ", " . $YEAR);
 
@@ -912,29 +928,25 @@
 
 
 
-sub createIdsDoc{
+sub createIdsDoc{          # will only be called for the second idsDoc (i.e. for february) and higher
     my ($textattsref) = @_;
     my $DATE      = $textattsref->{'date'};
     my $PUBLTITLE = $textattsref->{'publ_title'};
-    my $CSIGLE    = $corpussigles{$PUBLTITLE};
     
-    #print STDERR "LASTMONTH: " . $LASTMONTH . "\n";
-    #print STDERR "DATE: "     .  $DATE . "\n";
-
     my @datearray = split("-", $DATE);
     my $MONTH = $datearray[1];
     my $YEAR  = $datearray[0];
+    my $YY    = substr($YEAR, 2, 2);
     
+    my $CSIGLE    = $corpusids{$PUBLTITLE} . $YY;
+
     my $DOCID =     $months{$MONTH};
     my $MONTHNAME = $monthnames{$MONTH};
 
-
-    #print STDERR "MONTH: "     .  $MONTH . "\n\n";
-    
     my $idsDocString="";
     if($TEIFORMAT eq "I5"){
 	$idsDocString = "
-<idsDoc>
+<idsDoc version=\"1.0\" TEIform=\"TEI.2\">
 <idsHeader type=\"document\" pattern=\"text\" version=\"1.1\" TEIform=\"teiHeader\">
   <fileDesc>
     <titleStmt>
@@ -965,7 +977,6 @@
 		}
 	$LASTMONTH++;
     }
-    # printf(STDERR "\n\nNEW MONTH   %s\n\n", $months{$MONTH});
 }
commit	8162ad5dba48ac36b5ab2bf4209aadc8dea48a51	[log] [tgz]
author	Harald Lüngen <luengen@ids-mannheim.de>	Thu Sep 19 10:54:24 2024 +0300
committer	Harald Lüngen <luengen@ids-mannheim.de>	Thu Sep 19 10:54:24 2024 +0300
tree	acce87e81e19ef18b5c4d319d0d183194310d770
parent	cb223bd2695fbb2038e03d7a172185582911af75 [diff]