reading the sources file

commit: ba0354b4016d882f29bb9e2e015be59347fff2a8 [log] [tgz]
author: Harald Lüngen <luengen@ids-mannheim.de> Wed Sep 11 16:24:08 2024 +0300
committer: Harald Lüngen <luengen@ids-mannheim.de> Wed Sep 11 16:24:08 2024 +0300
tree: 51bce9a4c2e90061e98c44bc44d4eb3dea77d583
parent: 688dce082267aa20b3455e7cdc20c7cfca3dd623 [diff] [blame]
diff --git a/vrt2tei.pl b/vrt2tei.pl
index 54b4c8d..42bb7d9 100755
--- a/vrt2tei.pl
+++ b/vrt2tei.pl

@@ -48,6 +48,7 @@
 
 use strict;
 use warnings;
+#use diagnostics;
 
 use XML::Twig; 
 use XML::Generator ':pretty';  # apparently no effect when using flush();
@@ -83,6 +84,13 @@
 #my $encoding = "iso-8859-1";             # dieses $encoding ist NUR fuer das output s.u. twig funktion
 my $textcounter = 0;
 
+our %corpussigles    = ();
+our %srcfullnames    = ();
+our %srcpubplaces    = ();
+our %srcpublishers   = ();
+our %srctexttypes    = ();
+our %srctextlangs    = ();
+
 my %doccounter = (                        # by the month as in derekox
     "01" => 1,
     "02" => 1,
@@ -165,32 +173,21 @@
 # read source metadata file (prepared manually => ultimately read the info from CMDI File?)
 #-------------------------------------------------------------------------------------------
 
-my $linecount=0;
-open(my $SOURCES, $sourcescsvfile) || die("$0: cannot open file for reading: $sourcescsvfile");
-while(my $line = $SOURCES){
-    $linecount++;
-    chomp($line);
-    if ( $line=~/^\#/  ||  $line=~/^\s*$/ || $linecount == 1){  # skip line if emmpty line or comment line or first line
-	next;
-    };
-    # split each line into array:
-    my @line = split(/\t+/, $line);
-    
-    # $sources{"Suomen Kuvalehti"}[0]
+open(my $SOURCES, "< $sourcescsvfile") || die("$0: cannot open file for reading: $sourcescsvfile");
+while(my $fline = <$SOURCES>){
+    chomp($fline);
 
+    if ($fline=~/^\#/ || $fline=~/^\s*$/ || $fline =~ "TEICORPUSID"){next;}    # skip empty lines, comment lines, and the header line (any line containing TEICORPUSID)
+    my @flarray = split(/\s*\t+\s*/, $fline);                                  # split each line into array
+
+    $corpussigles{$flarray[2]}  = $flarray[0];
+    $srcfullnames{$flarray[2]}  = $flarray[1];
+    $srcpubplaces{$flarray[2]}  = $flarray[6];
+    $srcpublishers{$flarray[2]} = $flarray[7];
+    $srctexttypes{$flarray[2]}  = $flarray[4];
+    $srctextlangs{$flarray[2]}  = $flarray[5];
 }
-
-# while (my $line = <$seq_fh>) {
-#    chomp $line;
-#    ## skip comments and blank lines and optional repeat of title line
-#    next if $line =~ /^\#/ || $line =~ /^\s*$/ || $line =~ /^\+/;
-#    #split each line into array
-#    my @line = split(/\s+/, $line);
-#    $result{$line[0]}{yeartotal} += $line[1];
-#    $result{$line[0]}{earning} += $line[3] - $line[2];
-# }
-
-    
+close($SOURCES);
 
 
 
@@ -203,6 +200,7 @@
 #-------------------------------------------------------------------------------------------------------------
 
 
+
 $twig = new XML::Twig(
     keep_spaces => 1,           # dadurch auch whitespaces an ehemeligen elementgrenzen im output
     keep_atts_order => 1,       # requires Tie::IxHash
@@ -261,29 +259,23 @@
 sub insertCorpusHeader{
     my ($root, $corpusHeader) =@_;
 
-    #---------------------------------------------------------------------------
-    # get some metadata for the current output corpus based on source and year
-    #---------------------------------------------------------------------------
-    
+    #-------------------------------------------------------------
+    # take fnsource and year from the current xml input filename
+    #-------------------------------------------------------------
+
     my @array = split(/\//, $ARGV[0]);
     my $l = scalar(@array);
-    my $source = $array[$l-1];
-    $source =~ s/([0-9][0-9][0-9][0-9])\.xml$//;
+    my $fnsource = $array[$l-1];
+    $fnsource =~ s/([0-9][0-9][0-9][0-9])\.xml$//;
 
-    my $language="Finnish";
-    my $lang_tla="fi";
-    
     my $year = $1;   # $1 now containts substring in first bracket in regex above
 
-    my $ctitle = $source . " " . $year . ", from ". $kielipankkiCorpus . " for EuReCo";    # to do: also get name of corpus (klk-fi-v2-vrt)
-
-
     #-----------------------
     # set corpus header
     #-----------------------
     
-    &set_title(     $corpusHeader, $source, $year, $kielipankkiCorpus);
-    &set_sourceDesc($corpusHeader, $source, $year, $kielipankkiCorpus);
+    &set_title(     $corpusHeader, $srcfullnames{$fnsource}, $year, $kielipankkiCorpus);
+    &set_sourceDesc($corpusHeader, $srcfullnames{$fnsource}, $year, $kielipankkiCorpus);
 
     my $teiCorpusHeader        = $corpusHeader->paste("first_child", $root);
 
@@ -438,7 +430,7 @@
     my $SUMLANG      = $textattsref->{'sum_lang'};
     my $TOKENCOUNT   = $textattsref->{'tokencount'};
 
-    
+
     #-----------------------------
     # Derived Metadata variables
     #-----------------------------
@@ -692,10 +684,6 @@
 
 
 
-
-
-
-
 #################
 ## usage_message
 #################
commit	ba0354b4016d882f29bb9e2e015be59347fff2a8	[log] [tgz]
author	Harald Lüngen <luengen@ids-mannheim.de>	Wed Sep 11 16:24:08 2024 +0300
committer	Harald Lüngen <luengen@ids-mannheim.de>	Wed Sep 11 16:24:08 2024 +0300
tree	51bce9a4c2e90061e98c44bc44d4eb3dea77d583
parent	688dce082267aa20b3455e7cdc20c7cfca3dd623 [diff] [blame]