reading the sources file
diff --git a/vrt2tei.pl b/vrt2tei.pl
index 54b4c8d..42bb7d9 100755
--- a/vrt2tei.pl
+++ b/vrt2tei.pl
@@ -48,6 +48,7 @@
use strict;
use warnings;
+#use diagnostics;
use XML::Twig;
use XML::Generator ':pretty'; # apparently no effect when using flush();
@@ -83,6 +84,13 @@
#my $encoding = "iso-8859-1"; # dieses $encoding ist NUR fuer das output s.u. twig funktion
my $textcounter = 0;
+our %corpussigles = (); # key: 3rd tab-separated field of a sources-file line; value: 1st field of that line
+our %srcfullnames = (); # same key; value: 2nd field (looked up by input-filename stem when the corpus header is set)
+our %srcpubplaces = (); # same key; value: 7th field
+our %srcpublishers = (); # same key; value: 8th field
+our %srctexttypes = (); # same key; value: 5th field
+our %srctextlangs = (); # same key; value: 6th field
+
my %doccounter = ( # by the month as in derekox
"01" => 1,
"02" => 1,
@@ -165,32 +173,21 @@
# read source metadata file (prepared manually => ultimately read the info from CMDI File?)
#-------------------------------------------------------------------------------------------
-my $linecount=0;
-open(my $SOURCES, $sourcescsvfile) || die("$0: cannot open file for reading: $sourcescsvfile");
-while(my $line = $SOURCES){
- $linecount++;
- chomp($line);
- if ( $line=~/^\#/ || $line=~/^\s*$/ || $linecount == 1){ # skip line if emmpty line or comment line or first line
- next;
- };
- # split each line into array:
- my @line = split(/\t+/, $line);
-
- # $sources{"Suomen Kuvalehti"}[0]
+open(my $SOURCES, '<', $sourcescsvfile) or die("$0: cannot open file for reading: $sourcescsvfile: $!"); # 3-arg open: the filename is never parsed as a mode
+while (my $fline = <$SOURCES>) { # one tab-separated record per line
+    chomp($fline);
+    next if $fline =~ /^\#/ || $fline =~ /^\s*$/ || $fline =~ /TEICORPUSID/; # skip comment lines, empty lines and the column-header line
+    my @flarray = split(/\s*\t+\s*/, $fline); # a run of tabs (plus surrounding blanks) is ONE separator, so fields cannot be empty
+    unless (defined $flarray[2]) { warn("$0: skipping line $. of $sourcescsvfile: no third field\n"); next; } # the 3rd field is the lookup key
+    $corpussigles{$flarray[2]}  = $flarray[0];
+    $srcfullnames{$flarray[2]}  = $flarray[1];
+    $srcpubplaces{$flarray[2]}  = $flarray[6];
+    $srcpublishers{$flarray[2]} = $flarray[7];
+    $srctexttypes{$flarray[2]}  = $flarray[4];
+    $srctextlangs{$flarray[2]}  = $flarray[5];
+}
-
-# while (my $line = <$seq_fh>) {
-# chomp $line;
-# ## skip comments and blank lines and optional repeat of title line
-# next if $line =~ /^\#/ || $line =~ /^\s*$/ || $line =~ /^\+/;
-# #split each line into array
-# my @line = split(/\s+/, $line);
-# $result{$line[0]}{yeartotal} += $line[1];
-# $result{$line[0]}{earning} += $line[3] - $line[2];
-# }
-
-
+close($SOURCES);
@@ -203,6 +200,7 @@
#-------------------------------------------------------------------------------------------------------------
+
$twig = new XML::Twig(
keep_spaces => 1, # dadurch auch whitespaces an ehemeligen elementgrenzen im output
keep_atts_order => 1, # requires Tie::IxHash
@@ -261,29 +259,23 @@
sub insertCorpusHeader{
my ($root, $corpusHeader) =@_;
- #---------------------------------------------------------------------------
- # get some metadata for the current output corpus based on source and year
- #---------------------------------------------------------------------------
-
+ #-------------------------------------------------------------
+ # take fnsource and year from the current xml input filename
+ #-------------------------------------------------------------
+
my @array = split(/\//, $ARGV[0]);
my $l = scalar(@array);
- my $source = $array[$l-1];
- $source =~ s/([0-9][0-9][0-9][0-9])\.xml$//;
+ my $fnsource = $array[$l-1];
+ $fnsource =~ s/([0-9][0-9][0-9][0-9])\.xml$//;
- my $language="Finnish";
- my $lang_tla="fi";
-
my $year = $1; # $1 now containts substring in first bracket in regex above
- my $ctitle = $source . " " . $year . ", from ". $kielipankkiCorpus . " for EuReCo"; # to do: also get name of corpus (klk-fi-v2-vrt)
-
-
#-----------------------
# set corpus header
#-----------------------
- &set_title( $corpusHeader, $source, $year, $kielipankkiCorpus);
- &set_sourceDesc($corpusHeader, $source, $year, $kielipankkiCorpus);
+ &set_title( $corpusHeader, $srcfullnames{$fnsource}, $year, $kielipankkiCorpus);
+ &set_sourceDesc($corpusHeader, $srcfullnames{$fnsource}, $year, $kielipankkiCorpus);
my $teiCorpusHeader = $corpusHeader->paste("first_child", $root);
@@ -438,7 +430,7 @@
my $SUMLANG = $textattsref->{'sum_lang'};
my $TOKENCOUNT = $textattsref->{'tokencount'};
-
+
#-----------------------------
# Derived Metadata variables
#-----------------------------
@@ -692,10 +684,6 @@
-
-
-
-
#################
## usage_message
#################