modified

commit: 86cbd93a296b2118d08502f8d5393936a359b667 [log] [tgz]
author: Harald Lüngen <luengen@ids-mannheim.de> Tue Sep 10 15:52:18 2024 +0300
committer: Harald Lüngen <luengen@ids-mannheim.de> Tue Sep 10 15:52:18 2024 +0300
tree: 7c48fa56c29f8d1e105a809f79fe10a103cbf213
parent: 7abb0b5a344f82b6bf567dbba23f0b11d81f6365 [diff]
diff --git a/klk2eureco.sh b/klk2eureco.sh
index 8b0ddbc..2e0d908 100755
--- a/klk2eureco.sh
+++ b/klk2eureco.sh

@@ -52,9 +52,9 @@
 	x=$s/$SOURCE$YY.xml
 	
  	## echo "  checking wellformedness of $x"
-	## xmllint --noout $x                                              # ToDo: make if condition for continuing only if well-formed
+	## xmllint --noout $x                                                # ToDo: make if condition for continuing only if well-formed
 	# if($R != 0){
-	#      echo "Error: xmllint error with error return code $R" >&2;   # to stderr
+	#      echo "Error: xmllint error with error return code $R" >&2;    
 	#      break; 
 	#  }
 	      
@@ -64,8 +64,7 @@
 	t0="$TEI/$SOURCE/$BASENAME.tei.0.xml"
 	
 	echo "  generating $t using vrt2tei.pl, and prettifying..."
-	./vrt2tei.pl $x | xml_pp > $t                               # xml_pp works but takes ages
-        # xmllint --format $t0 > $t                                  # geht out of memory und --stream machen funktioniert nicht
+	./vrt2tei.pl $x | xml_pp > $t                               # xml_pp works but takes ages; xmllint --format runs out of memory and using --stream does not work
 
 	ls -l $t
 

diff --git a/vrt2tei.pl b/vrt2tei.pl
index 53ad78d..54b4c8d 100755
--- a/vrt2tei.pl
+++ b/vrt2tei.pl

@@ -83,6 +83,25 @@
 #my $encoding = "iso-8859-1";             # dieses $encoding ist NUR fuer das output s.u. twig funktion
 my $textcounter = 0;
 
+my %doccounter = (                        # by the month as in derekox
+    "01" => 1,
+    "02" => 1,
+    "03" => 1,
+    "04" => 1,
+    "05" => 1,
+    "06" => 1,
+    "07" => 1,
+    "08" => 1,
+    "09" => 1,
+    "10" => 1,
+    "11" => 1,
+    "12" => 1,
+    );
+
+
+my $sourcescsvfile = "sources_klk_fi_v2_2021_4eureco.csv";
+my $corpheaderfile = "teiCorpusHeaderSkeleton.tei.xml";
+my $textheaderfile = "teiTextHeaderSkeleton.tei.xml";
 
 
 my $twig="";   
@@ -91,8 +110,20 @@
 # global variables pertaining to the original corpus :
 my $kielipankkiCorpus = "klk-fi-v2-vrt";
 
-
-
+my %months = (
+     "01" => "JAN",
+     "02" => "FEB",
+     "03" => "MAR",
+     "04" => "APR",
+     "05" => "MAY",
+     "06" => "JUN",
+     "07" => "JUL",
+     "08" => "AUG",
+     "09" => "SEP",
+     "10" => "OCT",
+     "11" => "NOV",
+     "12" => "DEC",
+     );
 
 #------------------------------------------------------------------
 # read corpusHeaderSkeleton document and get header out of it
@@ -105,7 +136,7 @@
     );
 
 
-$teiCorpusHeaderDocTwig->parsefile("teiCorpusHeaderSkeleton.tei.xml");
+$teiCorpusHeaderDocTwig->parsefile($corpheaderfile);
 my $corpusHeader = $teiCorpusHeaderDocTwig->root;                  # getting the teiHeader for corpus out of the teiCorpusHeaderSkeleton document
 
 
@@ -119,7 +150,7 @@
     comments => 'drop',
     );
 
-$teiTextHeaderDocTwig->parsefile("teiTextHeaderSkeleton.tei.xml");
+$teiTextHeaderDocTwig->parsefile($textheaderfile);
 my $textHeader = $teiTextHeaderDocTwig->root;                      # getting the teiHeader for corpus out of the teiTextHeaderSkeleton document
 
 
@@ -130,6 +161,38 @@
 open(my $IN,  "< $ARGV[0]") || die("$0: cannot open file for reading: $ARGV[0]");     # open input  file and initialise filehandel, actually does not seem to be needed
                                                                                       # as parsefile() (s.b.) is applied to the filename
 
+#-------------------------------------------------------------------------------------------
+# read source metadata file (prepared manually => ultimately read the info from CMDI File?)
+#-------------------------------------------------------------------------------------------
+
+my $linecount=0;
+open(my $SOURCES, '<', $sourcescsvfile) || die("$0: cannot open file for reading: $sourcescsvfile");
+while(my $line = <$SOURCES>){                                   # <...> reads one line per iteration; assigning the bare handle would loop forever
+    $linecount++;
+    chomp($line);
+    if ( $line=~/^\#/  ||  $line=~/^\s*$/ || $linecount == 1){  # skip comment lines, empty lines and the first line (presumably the column header row)
+	next;
+    };
+    # split each line into array (fields are separated by one or more tabs):
+    my @line = split(/\t+/, $line);
+    # ToDo: store the fields for later lookup, e.g. keyed by source name:
+    # $sources{"Suomen Kuvalehti"}[0]
+}
+close($SOURCES);
+
+# while (my $line = <$seq_fh>) {
+#    chomp $line;
+#    ## skip comments and blank lines and optional repeat of title line
+#    next if $line =~ /^\#/ || $line =~ /^\s*$/ || $line =~ /^\+/;
+#    #split each line into array
+#    my @line = split(/\s+/, $line);
+#    $result{$line[0]}{yeartotal} += $line[1];
+#    $result{$line[0]}{earning} += $line[3] - $line[2];
+# }
+
+    
+
+
 
 #####################
 #     M A I N  
@@ -210,17 +273,17 @@
     my $language="Finnish";
     my $lang_tla="fi";
     
-    my $yy = $1;   # $1 now containts substring in first bracket in regex above
+    my $year = $1;   # $1 now containts substring in first bracket in regex above
 
-    my $ctitle = $source . " " . $yy . ", from ". $kielipankkiCorpus . " for EuReCo";    # to do: also get name of corpus (klk-fi-v2-vrt)
+    my $ctitle = $source . " " . $year . ", from ". $kielipankkiCorpus . " for EuReCo";    # to do: also get name of corpus (klk-fi-v2-vrt)
 
 
     #-----------------------
     # set corpus header
     #-----------------------
     
-    &set_title(     $corpusHeader, $source, $yy, $kielipankkiCorpus);
-    &set_sourceDesc($corpusHeader, $source, $yy, $kielipankkiCorpus);
+    &set_title(     $corpusHeader, $source, $year, $kielipankkiCorpus);
+    &set_sourceDesc($corpusHeader, $source, $year, $kielipankkiCorpus);
 
     my $teiCorpusHeader        = $corpusHeader->paste("first_child", $root);
 
@@ -245,9 +308,9 @@
     
     my $textattsref = $text->atts();               # $textattsref is now a reference to a hash and should be used with '->'
 
-
-    &createTextHeader($text, $textattsref, $textHeader);
-
+    # &createTextHeader returns the $textID:
+    my $textID = &createTextHeader($text, $textattsref, $textHeader);
+    
     #--------------------------
     # create <TEI> from <text>
     #--------------------------
@@ -255,12 +318,11 @@
     # set vrt <text> to <TEI> and delete all attributes after they were were saved above
     $text->del_atts;
     $text->set_gi("TEI");
-
-    # !!!!!!!!!!!!!!!!!!
-    # ToDo: Generate a proper textsigle in TEI/@xml:id that can be converted into a textsigle
-
+    $text->set_att('xml:id', $textID);
     
 
+      
+
     
     
     #------------------------------------------------------------------
@@ -387,6 +449,20 @@
     
 
 
+    #----------------------------------------------------
+    # create textSigle to be returned from this function
+    #----------------------------------------------------
+
+    # e.g. SUK21_JAN.00001  (corpusID + yy + "_" + MMM + "." + 5-digit counter)
+    
+    my $corpusID = "SUK";                            # ToDo read Table with Source metadata
+    my $yy       = substr($datearray[0], 2, 2);      # substr EXPR,OFFSET,LENGTH
+    my $mm       = $datearray[1];                    # substr EXPR,OFFSET,LENGTH
+    my $MMM      = $months{$mm};
+
+    my $textID = $corpusID . $yy . "_" . $MMM . "." . sprintf("%05d", $doccounter{$mm}++);
+
+    
     
     #-----------------------------------------------------------------------
     # CREATE text-teiHeader ACCORDING TO THE SKELETON in $textHeader
@@ -487,7 +563,9 @@
 
     $textHeader->get_xpath('./revisionDesc/change', 0)                                    ->set_att('when', localtime->ymd('-'));
     
+    return $textID;
 
+    
     #-----------------------------------
     # END OF CREATING TEIHEADER
     #-----------------------------------
@@ -573,9 +651,9 @@
     
 
 sub set_title{
-    my ($corpusHeader, $source, $yy, $kielipankkiCorpus) = @_;
+    my ($corpusHeader, $source, $year, $kielipankkiCorpus) = @_;
 
-    my $cTitleString = $source . " " . $yy . ", from ". $kielipankkiCorpus . " for EuReCo";    
+    my $cTitleString = $source . " " . $year . ", from ". $kielipankkiCorpus . " for EuReCo";    
 
     #<teiHeader>
     #  <fileDesc>
@@ -593,9 +671,9 @@
 }
 
 sub set_sourceDesc{
-    my ($corpusHeader, $source, $yy, $kielipankkiCorpus) = @_;
+    my ($corpusHeader, $source, $year, $kielipankkiCorpus) = @_;
 
-    my $cBiblString = $source . " " . $yy . ", from ". $kielipankkiCorpus . " for EuReCo";    
+    my $cBiblString = $source . " " . $year . ", from ". $kielipankkiCorpus . " for EuReCo";    
 
     #<teiHeader>
     #  <fileDesc>
commit	86cbd93a296b2118d08502f8d5393936a359b667	[log] [tgz]
author	Harald Lüngen <luengen@ids-mannheim.de>	Tue Sep 10 15:52:18 2024 +0300
committer	Harald Lüngen <luengen@ids-mannheim.de>	Tue Sep 10 15:52:18 2024 +0300
tree	7c48fa56c29f8d1e105a809f79fe10a103cbf213
parent	7abb0b5a344f82b6bf567dbba23f0b11d81f6365 [diff]