idsDoc partitions and case switches for I5

commit: 381c2a2a7c3a9ad7a33ecb52d0c493ae1366d7b5 [log] [tgz]
author: Harald Lüngen <luengen@ids-mannheim.de> Tue Sep 17 09:06:39 2024 +0300
committer: Harald Lüngen <luengen@ids-mannheim.de> Tue Sep 17 09:06:39 2024 +0300
tree: 191356390a06fb46dc93b0b8b7373027ba024eee
parent: ab5ac5c3bb1ae63e23aaae6857120fc68397ab39 [diff] [blame]
diff --git a/vrt2tei.pl b/vrt2tei.pl
index 6c280ba..63dd6fd 100755
--- a/vrt2tei.pl
+++ b/vrt2tei.pl

@@ -48,7 +48,7 @@
 
 use strict;
 use warnings;
-use diagnostics;
+#use diagnostics;
 
 use Getopt::Std;
 use XML::Twig; 
@@ -64,7 +64,7 @@
 
 use Time::Piece;
 use Tie::IxHash;
-
+use Data::Random::String;
 
 
 #----------------------
@@ -97,6 +97,7 @@
 
 
 my $textcounter = 0;
+my $LASTMONTH = 0;
 
 our %corpussigles    = ();
 our %srcfullnames    = ();
@@ -105,7 +106,10 @@
 our %srctexttypes    = ();
 our %srctextlangs    = ();
 
-my %doccounter = (                        # by the month as in derekox
+our %expandLang      = ();
+
+
+my %doccounter = (                        # by the month as in dereko
     "01" => 1,
     "02" => 1,
     "03" => 1,
@@ -121,12 +125,13 @@
     );
 
 
+
 my $sourcescsvfile = "sources_klk_fi_v2_2021_4eureco.csv";
 
 my $corpheaderfile = "teiCorpusHeaderSkeleton.tei.xml";
 my $textheaderfile = "teiTextHeaderSkeleton.tei.xml";
 if($TEIFORMAT eq "I5"){
-    # $corpheaderfile = "i5CorpusHeaderSkeleton.i5.xml";
+    $corpheaderfile = "i5CorpusHeaderSkeleton.i5.xml";
     $textheaderfile = "i5TextHeaderSkeleton.i5.xml";
 }
 
@@ -134,8 +139,20 @@
 my $twig="";   
 
 # global variables pertaining to the original corpus :
-my $kielipankkiCorpus = "klk-fi-v2-vrt";
+my $kielipankkiCorpus  = "klk-fi-v2-vrt";
+my $kielipankkiLicense = "CLARIN-RES";
 
+
+# variables $fnsource and $fnyear taken from the filename, expected to end in <source><YYYY>.xml
+my @array = split(/\//, defined($ARGV[0]) ? $ARGV[0] : "");
+my $l = scalar(@array);
+my $fnsource = $l ? $array[$l-1] : "";
+# use $1 only if the substitution matched; otherwise it is undef or left over from an earlier match
+my $fnyear = ($fnsource =~ s/([0-9][0-9][0-9][0-9])\.xml$//) ? $1 : "";
+warn("$0: input filename does not end in <YYYY>.xml, corpus year left empty\n") if($fnyear eq "");
+
+
+# months
 my %months = (
      "01" => "JAN",
      "02" => "FEB",
@@ -151,12 +168,60 @@
      "12" => "DEC",
      );
 
+my %monthnames = (      # English month names keyed by two-digit month number
+     "01" => "January",
+     "02" => "February",
+     "03" => "March",
+     "04" => "April",
+     "05" => "May",
+     "06" => "June",
+     "07" => "July",
+     "08" => "August",
+     "09" => "September",
+     "10" => "October",
+     "11" => "November",
+     "12" => "December",
+     );
+
 my %mapping = ();
 $mapping{"aikakausi"}   = "Zeitschrift";
 $mapping{"sanomalehti"} = "Zeitung";
 
 
 
+#-------------------------------------------------------------------------------------------
+# read source metadata file (prepared manually => ultimately read the info from CMDI File?)
+# and set variables
+#
+# Each data line is tab-separated; the columns used are
+#   [0] corpus sigle, [1] full title, [2] simple title,
+#   [4] text type,    [5] text language, [6] place of publication, [7] publisher.
+# Both the full and the simple title become keys of the lookup tables.
+#-------------------------------------------------------------------------------------------
+
+# three-argument open keeps the file name from being parsed as a mode; $! gives the reason
+open(my $SOURCES, "<", $sourcescsvfile) || die("$0: cannot open file for reading: $sourcescsvfile: $!");
+while(my $fline = <$SOURCES>){
+    chomp($fline);
+
+    if ($fline=~/^\#/ || $fline=~/^\s*$/ || $fline =~ "TEICORPUSID"){next;}    # skip line if empty line or comment line or first line
+    my @flarray = split(/\s*\t+\s*/, $fline);                                  # split each line into array
+
+    # set full titles ($flarray[1]) and simple titles ($flarray[2]) as keys:
+    foreach my $titlekey ($flarray[1], $flarray[2]){
+	$corpussigles{$titlekey}  = $flarray[0];
+	$srcfullnames{$titlekey}  = $flarray[1];
+	$srcpubplaces{$titlekey}  = $flarray[6];
+	$srcpublishers{$titlekey} = $flarray[7];
+	$srctexttypes{$titlekey}  = $flarray[4];
+	$srctextlangs{$titlekey}  = $flarray[5];
+    }
+}
+close($SOURCES);
+
+$expandLang{"fi"} = "Finnish";
+$expandLang{"sv"} = "Swedish";
+
 #------------------------------------------------------------------
 # read corpusHeaderSkeleton document and get header out of it
 #------------------------------------------------------------------
@@ -187,24 +252,50 @@
 
 
 my $idsDoc       = XML::Twig::Elt->new('idsDoc');    
-if($TEIFORMAT eq "I5"){
+my $idsDocHeader = XML::Twig::Elt->new('idsHeader');
 
-    my $idsDocHeader = XML::Twig::Elt->new('idsHeader');
+if($TEIFORMAT eq "I5"){
     my $docFileDesc  = XML::Twig::Elt->new('fileDesc');
     my $docTitleStmt = XML::Twig::Elt->new('titleStmt');
     my $dtitle       = XML::Twig::Elt->new('d.title');
     my $docSigle     = XML::Twig::Elt->new('dokumentSigle');
 
-    $docSigle               -> paste("first_child", $docTitleStmt);
-    $dtitle                 -> paste("last_child",  $docTitleStmt);
-    $docTitleStmt           -> paste("last_child",  $docFileDesc);
-    $docFileDesc            -> paste("last_child",  $idsDocHeader);
+    # elements of <publicationStmt>; the address element is spelled 'pubAddress' as in the printed idsDoc template
+    my $docPublicationStmt = XML::Twig::Elt->new('publicationStmt');
+    my $docDistributor     = XML::Twig::Elt->new('distributor');
+    my $docPubAddress      = XML::Twig::Elt->new('pubAddress');
+    my $docAvailability    = XML::Twig::Elt->new('availability');
+    my $docPubDate         = XML::Twig::Elt->new('pubDate');
+    my $docSourceDesc       = XML::Twig::Elt->new('sourceDesc');
+    my $docBiblStruct      = XML::Twig::Elt->new('biblStruct');
+    my $docMonogr          = XML::Twig::Elt->new('monogr');
+    my $docHTitle          = XML::Twig::Elt->new('h.title');
+    my $docImprint         = XML::Twig::Elt->new('imprint');
+    
+    $docSigle                  -> paste("first_child", $docTitleStmt);
+    $dtitle                    -> paste("last_child",  $docTitleStmt);
+    $docTitleStmt              -> paste("last_child",  $docFileDesc);
+    $docFileDesc               -> paste("last_child",  $idsDocHeader);
+    $docPublicationStmt        -> paste("last_child",  $docFileDesc);
+    $docDistributor            -> paste("last_child",  $docPublicationStmt);
+    $docPubAddress             -> paste("last_child",  $docPublicationStmt);
+
+    $docAvailability           -> paste("last_child",  $docPublicationStmt);
+    $docPubDate                -> paste("last_child",  $docPublicationStmt);
+
+    $docSourceDesc             -> paste("last_child",  $docFileDesc);
+    $docBiblStruct             -> paste("last_child",  $docSourceDesc);
+    $docMonogr                 -> paste("last_child",  $docBiblStruct);
+    $docHTitle                 -> paste("last_child",  $docMonogr);
+    $docImprint                -> paste("last_child",  $docMonogr);
+    
     $idsDocHeader           -> paste("last_child",  $idsDoc);
 
     # ToDo set dummy dtitle and docSigle
-}    
 
-
+    $docSigle->set_text($corpussigles{$fnsource} . "/JAN");
+    $dtitle  ->set_text($srcfullnames{$fnsource} . ", January " . $fnyear);
+}
 
 
 #----------------------------------
@@ -214,35 +305,6 @@
 open(my $IN,  "< $ARGV[0]") || die("$0: cannot open file for reading: $ARGV[0]");     # open input  file and initialise filehandel, actually does not seem to be needed
                                                                                       # as parsefile() (s.b.) is applied to the filename
 
-#-------------------------------------------------------------------------------------------
-# read source metadata file (prepared manually => ultimately read the info from CMDI File?)
-#-------------------------------------------------------------------------------------------
-
-open(my $SOURCES, "< $sourcescsvfile") || die("$0: cannot open file for reading: $sourcescsvfile");
-while(my $fline = <$SOURCES>){
-    chomp($fline);
-
-    if ($fline=~/^\#/ || $fline=~/^\s*$/ || $fline =~ "TEICORPUSID"){next;}    # skip line if empty line or comment line or first line
-    my @flarray = split(/\s*\t+\s*/, $fline);                                  # split each line into array
-
-    # set full titles ($flarray[1]) as keys:
-    $corpussigles{$flarray[1]}  = $flarray[0];
-    $srcfullnames{$flarray[1]}  = $flarray[1];
-    $srcpubplaces{$flarray[1]}  = $flarray[6];
-    $srcpublishers{$flarray[1]} = $flarray[7];
-    $srctexttypes{$flarray[1]}  = $flarray[4];
-    $srctextlangs{$flarray[1]}  = $flarray[5];
-
-    # also set simple titles ($flarray[2]) as keys: 
-    $corpussigles{$flarray[2]}  = $flarray[0];
-    $srcfullnames{$flarray[2]}  = $flarray[1];
-    $srcpubplaces{$flarray[2]}  = $flarray[6];
-    $srcpublishers{$flarray[2]} = $flarray[7];
-    $srctexttypes{$flarray[2]}  = $flarray[4];
-    $srctextlangs{$flarray[2]}  = $flarray[5];
-}
-close($SOURCES);
-
 
 
 #####################
@@ -301,7 +363,13 @@
 sub root {
     my ($twig, $root, $corpusHeader) =@_;
 
-    $root->set_gi('teiCorpus');
+    if($TEIFORMAT eq "TEI"){
+	$root->set_gi('teiCorpus');
+    }
+    else {
+	$root->set_gi('idsCorpus');
+    }
+    
     $root->set_att("xmlns", 'http://www.tei-c.org/ns/1.0');
 
     &insertCorpusHeader($root, $corpusHeader);
@@ -312,26 +380,32 @@
 sub insertCorpusHeader{
     my ($root, $corpusHeader) =@_;
 
-    #-------------------------------------------------------------
-    # take fnsource and year from the current xml input filename
-    #-------------------------------------------------------------
-
-    my @array = split(/\//, $ARGV[0]);
-    my $l = scalar(@array);
-    my $fnsource = $array[$l-1];
-    $fnsource =~ s/([0-9][0-9][0-9][0-9])\.xml$//;
-
-    my $year = $1;   # $1 containts substring in first bracket in regex above
-
     #-----------------------
     # set corpus header
     #-----------------------
     
-    &set_title(     $corpusHeader, $srcfullnames{$fnsource}, $year, $kielipankkiCorpus);
-    &set_sourceDesc($corpusHeader, $srcfullnames{$fnsource}, $year, $kielipankkiCorpus);
+    &set_title($corpusHeader, $srcfullnames{$fnsource}, $fnyear, $kielipankkiCorpus);
+    # format-specific header fields; the I5 publication year is the current year taken from Time::Piece
+    if($TEIFORMAT eq "TEI"){
+	&set_sourceDesc($corpusHeader, $srcfullnames{$fnsource}, $fnyear, $kielipankkiCorpus);
+    }
+    elsif ($TEIFORMAT eq "I5"){
+	$corpusHeader->get_xpath("fileDesc/titleStmt/korpusSigle",0)  ->set_text($corpussigles{$fnsource});
+	$corpusHeader->get_xpath("fileDesc/publicationStmt/pubDate",0)->set_text(localtime->year);   # was localtime[5] + 1900, which does not index the time list
+	$corpusHeader->get_xpath("profileDesc/langUsage/language",0)  ->set_text($expandLang{$srctextlangs{$fnsource}});
+	$corpusHeader->get_xpath("profileDesc/langUsage/language",0)  ->set_att('id', $srctextlangs{$fnsource});
+	$corpusHeader->get_xpath('encodingDesc/editorialDecl/transduction[@n="2"]',0)  ->set_text("I5 version by EuReCo using vrt2tei.pl " . localtime->ymd('-'));
+	&set_sourceDescI5($corpusHeader);
+    }
+    else{
+	print STDERR "TEIFORMAT is $TEIFORMAT; must be TEI or I5\n";
+    }
 
     $corpusHeader->paste("first_child", $root);
-    $idsDoc      ->paste("after",       $corpusHeader);
+
+    if($TEIFORMAT eq "I5"){
+	$idsDoc->paste("after",       $corpusHeader);
+    }
 }
     
 
@@ -353,6 +427,10 @@
     
     my $textattsref = $text->atts();               # $textattsref is now a reference to a hash and should be used with '->'
 
+    &createIdsDoc($textattsref);
+
+
+
     # &createTextHeader returns the $textID:
     my $textID = &createTextHeader($text, $textattsref, $textHeader);
     
@@ -569,11 +647,11 @@
     
     
     $analytic->first_child($title)                    ->set_text($LABEL . ", Text #" . $textcounter);   # Case KLK; PAGENO scheint meist "None" zu sein
-    $analytic->get_xpath('./idno[@type="PAGEID"]',       0)     ->set_text($PAGEID);
-    $analytic->get_xpath('./idno[@type="BINDINGID"]',    0)     ->set_text($BID);
-    $analytic->get_xpath('./idno[@type="ID"]',           0)     ->set_text($ID);
-    $analytic->get_xpath('./idno[@type="KIELIPANKKI_METAFILENAME"]', 0)     ->set_text($METAFILENAME);
-    $analytic->get_xpath('./idno[@type="KIELIPANKKI_ORIGFILENAME"]', 0)     ->set_text($ORIGFILENAME);
+    #$analytic->get_xpath('./idno[@type="PAGEID"]',       0)     ->set_text($PAGEID);
+    #$analytic->get_xpath('./idno[@type="BINDINGID"]',    0)     ->set_text($BID);
+    #$analytic->get_xpath('./idno[@type="ID"]',           0)     ->set_text($ID);
+    #$analytic->get_xpath('./idno[@type="KIELIPANKKI_METAFILENAME"]', 0)     ->set_text($METAFILENAME);
+    #$analytic->get_xpath('./idno[@type="KIELIPANKKI_ORIGFILENAME"]', 0)     ->set_text($ORIGFILENAME);
     if($TEIFORMAT eq "TEI"){
 	$analytic->first_child('textLang')                        ->set_text($LANGUAGE);
     }
@@ -602,16 +680,16 @@
     $monogr->get_xpath('./imprint/' . $date . '[@type="day"]',    0)         ->set_text($datearray[2]);    
     $monogr->first_child("imprint")->first_child("pubPlace")        ->set_text($srcpubplaces{$PUBLTITLE});  # imprint is needed for tei validity
     $monogr->first_child("imprint")->first_child("publisher")       ->set_text($srcpublishers{$PUBLTITLE}); # imprint is needed for tei validity
-    $monogr->get_xpath('./biblScope[@unit="ISSUETITLE"]', 0)        ->set_text($ISSUETITLE);
-    $monogr->get_xpath('./biblScope[@unit="ISSUENO"]', 0)           ->set_text($ISSUENO);
-    $monogr->get_xpath('./biblScope[@unit="ISSUEDATE"]', 0)         ->set_text($ISSUEDATE);
-    $monogr->get_xpath('./biblScope[@unit="pp"]', 0)                ->set_text($PAGENO);        # Achtung - PAGENO scheint meist "None" zu sein
+    #$monogr->get_xpath('./biblScope[@unit="ISSUETITLE"]', 0)        ->set_text($ISSUETITLE);
+    #$monogr->get_xpath('./biblScope[@unit="ISSUENO"]', 0)           ->set_text($ISSUENO);
+    #$monogr->get_xpath('./biblScope[@unit="ISSUEDATE"]', 0)         ->set_text($ISSUEDATE);
+    #$monogr->get_xpath('./biblScope[@unit="pp"]', 0)                ->set_text($PAGENO);        # Achtung - PAGENO scheint meist "None" zu sein
 
     my $dateNice      = $datearray[2] . "." . $datearray[1] . "." . $datearray[0]; 
     my $dateBackwards = $datearray[0] . "." . $datearray[1] . "." . $datearray[2]; 
     
     if($TEIFORMAT eq "I5"){
-	my $refCompleteText = $textSigle . " " . $PUBLTITLE . " no. " . $ISSUENO . ", [" . $mapping{$PUBLTYPE} . "], " . $dateNice;
+	my $refCompleteText = $textSigle . " " . $PUBLTITLE . " no. " . $ISSUENO . ", [" . $PUBLTYPE . "], " . $dateNice;
 	my $refShortText    = $textSigle . " " . $PUBLTITLE .                                                     ", " . $dateNice;
 	$textHeader->get_xpath('./fileDesc/sourceDesc/reference[@type="complete"]', 0) -> set_text($refCompleteText);
 	$textHeader->get_xpath('./fileDesc/sourceDesc/reference[@type="short"]'   , 0) -> set_text($refShortText);
@@ -707,7 +785,9 @@
     my @tags = split(/\t/, $line);
     
     # set content of <w> i.e. the token
-    $w_element->set_text($tags[0]);
+    # $w_element->set_text($tags[0]);
+    my $random_w = Data::Random::String->create_random_string(length=>length($tags[0]), contains=>'alpha');
+    $w_element->set_text($random_w);
     
     # vrt positional-attributes in corpus KLK:
     #  USE [0] word
@@ -731,19 +811,29 @@
     # $w_element->set_att("id",     "w_" . $namearray[4] . $sentence->att("xml:id") . "_" . $tags[1]);
     # so zusammengebaute ID ist auch nicht eindeutig...
     $w_element->del_att("id");
-    $w_element->set_att("lemma",  $tags[2]);
+    # $w_element->set_att("lemma",  $tags[2]);
+    $w_element->set_att("lemma",  $random_w);
+
     # $w_element->set_att("norm",   $tags[3]);  # tag abuse of @norm
     $w_element->set_att("pos",    $tags[4]);
     $w_element->set_att("msd",    $tags[5]);
-#TMP    $w_element->set_att("head",   $tags[6]);
-#TMP    $w_element->set_att("deprel", $tags[7]);
     
+    if($TEIFORMAT eq "I5"){                     # remove condition when part of the official TEI
+	$w_element->set_att("head",   $tags[6]);
+	$w_element->set_att("deprel", $tags[7]);
+    }
 }
     
 
 sub set_title{
     my ($corpusHeader, $source, $year, $kielipankkiCorpus) = @_;
 
+    my $titleElement = "title";
+    
+    if($TEIFORMAT eq "I5"){
+	$titleElement = "c.title";
+    }
+    
     my $cTitleString = $source . " " . $year . ", from ". $kielipankkiCorpus . " for EuReCo";    
 
     #<teiHeader>
@@ -755,7 +845,7 @@
     #  </fileDesc>
     #</teiHeader>
 
-    my $cTitleNode = $corpusHeader->first_child("fileDesc")->first_child("titleStmt")->first_child("title");
+    my $cTitleNode = $corpusHeader->first_child("fileDesc")->first_child("titleStmt")->first_child($titleElement);
 
     $cTitleNode->set_text($cTitleString);
     
@@ -781,6 +871,103 @@
     $cBiblNode->set_text($cBiblString);    
 }
 
+# Fill in the <sourceDesc> part of the I5 corpus header.
+#
+# Argument: $corpusHeader, the header element to modify in place.
+# The source is identified by the global $fnsource; its full title, place of
+# publication, publisher and corpus sigle are looked up in %srcfullnames,
+# %srcpubplaces, %srcpublishers and %corpussigles; the year is $fnyear.
+#
+# Elements written (they are expected to exist in the header already):
+#
+#   fileDesc
+#     sourceDesc
+#       biblStruct
+#         monogr
+#           h.title              <- full title
+#           imprint
+#             publisher          <- publisher
+#             pubPlace           <- place of publication
+#       reference                <- "SIGLE TITLE; PLACE: PUBLISHER, YEAR"
+#
+sub set_sourceDescI5{
+    my ($corpusHeader) = @_;
+
+    my $fullTitle = $srcfullnames{$fnsource};
+    my $place     = $srcpubplaces{$fullTitle};
+    my $publisher = $srcpublishers{$fullTitle};
+    my $sigle     = $corpussigles{$fullTitle};
+
+    my $sourceDesc = $corpusHeader->first_child("fileDesc")->first_child("sourceDesc");
+    my $monogr     = $sourceDesc->first_child("biblStruct")->first_child("monogr");
+
+    $monogr->first_child("h.title")->set_text($fullTitle);
+    my $imprint = $monogr->first_child("imprint");
+    $imprint->first_child("publisher")->set_text($publisher);
+    $imprint->first_child("pubPlace") ->set_text($place);
+
+    my $reference = sprintf("%s %s; %s: %s, %s", $sigle, $fullTitle, $place, $publisher, $fnyear);
+    $sourceDesc->first_child("reference")->set_text($reference);
+}
+
+
+
+# Emit the <idsDoc> header that opens a new month partition (I5 only).
+# Reads 'date' (split on "-" into year, month) and 'publ_title' from the text attributes;
+# advances $LASTMONTH when the month is its direct successor and, after January, prints the header to STDOUT.
+# NOTE(review): a month without texts stalls $LASTMONTH (no later header is printed) and the
+# title is interpolated unescaped -- confirm both are intended.
+sub createIdsDoc{
+    my ($textattsref) = @_;
+    my $DATE      = $textattsref->{'date'};
+    return unless defined($DATE);                 # no date => no month to partition on
+    my $PUBLTITLE = $textattsref->{'publ_title'};
+    my $CSIGLE    = $corpussigles{$PUBLTITLE};
+
+    my @datearray = split("-", $DATE);
+    my $MONTH = $datearray[1];
+    my $YEAR  = $datearray[0];
+    return unless defined($MONTH);                # date without a month part
+
+    my $DOCID =     $months{$MONTH};
+    my $MONTHNAME = $monthnames{$MONTH};
+    my $idsDocString="";
+    if($TEIFORMAT eq "I5"){
+	$idsDocString = "
+<idsDoc>
+<idsHeader type=\"document\" pattern=\"text\" version=\"1.1\" TEIform=\"teiHeader\">
+  <fileDesc>
+    <titleStmt>
+      <dokumentSigle>$CSIGLE/$DOCID</dokumentSigle>
+      <d.title>$PUBLTITLE, $MONTHNAME $YEAR</d.title>
+    </titleStmt>
+    <publicationStmt>
+      <distributor/>
+      <pubAddress/>
+      <availability region=\"world\">$kielipankkiLicense</availability>
+      <pubDate/>
+    </publicationStmt>
+    <sourceDesc>
+      <biblStruct>
+        <monogr>
+          <h.title/>
+          <imprint/>
+        </monogr>
+      </biblStruct>
+    </sourceDesc>    
+  </fileDesc>  
+</idsHeader>
+</idsDoc>\n";
+}    
+    if($MONTH + 0 == $LASTMONTH + 1){
+	# print in I5 mode only: otherwise $idsDocString is empty and a bare newline would be emitted
+	if($MONTH+0 > 1 && $TEIFORMAT eq "I5"){
+	    printf("%s\n", $idsDocString);
+	}
+	$LASTMONTH++;
+    }
+}
+
 
 
 #################
commit	381c2a2a7c3a9ad7a33ecb52d0c493ae1366d7b5	[log] [tgz]
author	Harald Lüngen <luengen@ids-mannheim.de>	Tue Sep 17 09:06:39 2024 +0300
committer	Harald Lüngen <luengen@ids-mannheim.de>	Tue Sep 17 09:06:39 2024 +0300
tree	191356390a06fb46dc93b0b8b7373027ba024eee
parent	ab5ac5c3bb1ae63e23aaae6857120fc68397ab39 [diff] [blame]