adapt hashes to new second column with publid in sources_klk_fi_v2_2021_4eureco.csv
diff --git a/vrt2tei.pl b/vrt2tei.pl
index 93319a2..14b1252 100755
--- a/vrt2tei.pl
+++ b/vrt2tei.pl
@@ -20,7 +20,6 @@
# 0 ZIPPEN mit tei2korapxml; zippen mit korAP indexing
-# 1 bearbeitung von @head und @deprel in tei2korapxml durch Nils?
# 2 threading on compute node and application on sub corpora of KLK
# 2 build 30 billion corpus and index it
# 3 Optionen
@@ -82,7 +81,7 @@
# currently one argument: the vrt-xml input file
-unless($ARGV[0]) {&usage_message()} ; # min arg0, the input file
+unless($ARGV[0]) {&usage_message()} ; # min arg0, the input file e.g. Suomen_Kuvalehti2021.xml
if ($ARGV[1]) {&usage_message()}; # max arg0, the input file
@@ -139,6 +138,7 @@
my $LASTMONTH = 0;
our %corpusids = ();
+our %srcpublids = ();
our %srcfullnames = ();
our %srcpubplaces = ();
our %srcpublishers = ();
@@ -241,22 +241,25 @@
if ($fline=~/^\#/ || $fline=~/^\s*$/ || $fline =~ "TEICORPUSID"){next;} # skip line if empty line or comment line or first line
my @flarray = split(/\s*\t+\s*/, $fline); # split each line into array
+ # (ToDo: the following hashes could probably be conflated into a single hash of hashes or so)
+
- # set full titles ($flarray[1]) as keys:
+ # set full titles ($flarray[2]) as keys:
- # (ToDo: these hashes could probably conflated into an array of hashes or so)
- $corpusids{$flarray[1]} = $flarray[0];
- $srcfullnames{$flarray[1]} = $flarray[1];
- $srcpubplaces{$flarray[1]} = $flarray[6];
- $srcpublishers{$flarray[1]} = $flarray[7];
- $srctexttypes{$flarray[1]} = $flarray[4];
- $srctextlangs{$flarray[1]} = $flarray[5];
+ $corpusids{$flarray[2]} = $flarray[0];
+ $srcpublids{$flarray[2]} = $flarray[1];
+ $srcfullnames{$flarray[2]} = $flarray[2];
+ $srcpubplaces{$flarray[2]} = $flarray[7];
+ $srcpublishers{$flarray[2]} = $flarray[8];
+ $srctexttypes{$flarray[2]} = $flarray[5];
+ $srctextlangs{$flarray[2]} = $flarray[6];
- # also set simple titles ($flarray[2]) as keys:
+ # also set simple titles ($flarray[3]) as keys:
- $corpusids{$flarray[2]} = $flarray[0];
- $srcfullnames{$flarray[2]} = $flarray[1];
- $srcpubplaces{$flarray[2]} = $flarray[6];
- $srcpublishers{$flarray[2]} = $flarray[7];
- $srctexttypes{$flarray[2]} = $flarray[4];
- $srctextlangs{$flarray[2]} = $flarray[5];
+ $corpusids{$flarray[3]} = $flarray[0];
+ $srcpublids{$flarray[3]} = $flarray[1];
+ $srcfullnames{$flarray[3]} = $flarray[2];
+ $srcpubplaces{$flarray[3]} = $flarray[7];
+ $srcpublishers{$flarray[3]} = $flarray[8];
+ $srctexttypes{$flarray[3]} = $flarray[5];
+ $srctextlangs{$flarray[3]} = $flarray[6];
}
close($SOURCES);
@@ -266,6 +269,7 @@
#------------------------------------------------------------------
# read corpusHeaderSkeleton document and start a twig for it
+# (since this file need not be streamed, no handlers are needed)
#------------------------------------------------------------------
my $teiCorpusHeaderDocTwig = new XML::Twig(