Merge branch 'master' of https://korap.ids-mannheim.de/gerrit/EuReCo/kielipankki4eureco
diff --git a/klk2eureco.sh b/klk2eureco.sh
index 8b0ddbc..2e0d908 100755
--- a/klk2eureco.sh
+++ b/klk2eureco.sh
@@ -52,9 +52,9 @@
x=$s/$SOURCE$YY.xml
## echo " checking wellformedness of $x"
- ## xmllint --noout $x # ToDo: make if condition for continuing only if well-formed
+ ## xmllint --noout $x # ToDo: make if condition for continuing only if well-formed
# if($R != 0){
- # echo "Error: xmllint error with error return code $R" >&2; # to stderr
+ # echo "Error: xmllint error with error return code $R" >&2;
# break;
# }
@@ -64,8 +64,7 @@
t0="$TEI/$SOURCE/$BASENAME.tei.0.xml"
echo " generating $t using vrt2tei.pl, and prettifying..."
- ./vrt2tei.pl $x | xml_pp > $t # xml_pp works but takes ages
- # xmllint --format $t0 > $t # geht out of memory und --stream machen funktioniert nicht
+ ./vrt2tei.pl $x | xml_pp > $t # xml_pp works but takes ages; xmllint- format geht out of memory und --stream machen funktioniert nicht
ls -l $t
diff --git a/vrt2tei.pl b/vrt2tei.pl
index 53ad78d..54b4c8d 100755
--- a/vrt2tei.pl
+++ b/vrt2tei.pl
@@ -83,6 +83,25 @@
#my $encoding = "iso-8859-1"; # dieses $encoding ist NUR fuer das output s.u. twig funktion
my $textcounter = 0;
+my %doccounter = ( # next running text number per month (keyed by two-digit month, as in derekox); each month starts at 1
+ "01" => 1,
+ "02" => 1,
+ "03" => 1,
+ "04" => 1,
+ "05" => 1,
+ "06" => 1,
+ "07" => 1,
+ "08" => 1,
+ "09" => 1,
+ "10" => 1,
+ "11" => 1,
+ "12" => 1,
+ );
+
+
+my $sourcescsvfile = "sources_klk_fi_v2_2021_4eureco.csv";
+my $corpheaderfile = "teiCorpusHeaderSkeleton.tei.xml";
+my $textheaderfile = "teiTextHeaderSkeleton.tei.xml";
my $twig="";
@@ -91,8 +110,20 @@
# global variables pertaining to the original corpus :
my $kielipankkiCorpus = "klk-fi-v2-vrt";
-
-
+my %months = ( # two-digit month number => three-letter month abbreviation used when building the text ID
+ "01" => "JAN",
+ "02" => "FEB",
+ "03" => "MAR",
+ "04" => "APR",
+ "05" => "MAY",
+ "06" => "JUN",
+ "07" => "JUL",
+ "08" => "AUG",
+ "09" => "SEP",
+ "10" => "OCT",
+ "11" => "NOV",
+ "12" => "DEC",
+ );
#------------------------------------------------------------------
# read corpusHeaderSkeleton document and get header out of it
@@ -105,7 +136,7 @@
);
-$teiCorpusHeaderDocTwig->parsefile("teiCorpusHeaderSkeleton.tei.xml");
+$teiCorpusHeaderDocTwig->parsefile($corpheaderfile);
my $corpusHeader = $teiCorpusHeaderDocTwig->root; # getting the teiHeader for corpus out of the teiCorpusHeaderSkeleton document
@@ -119,7 +150,7 @@
comments => 'drop',
);
-$teiTextHeaderDocTwig->parsefile("teiTextHeaderSkeleton.tei.xml");
+$teiTextHeaderDocTwig->parsefile($textheaderfile);
my $textHeader = $teiTextHeaderDocTwig->root; # getting the teiHeader for corpus out of the teiTextHeaderSkeleton document
@@ -130,6 +161,38 @@
open(my $IN, "< $ARGV[0]") || die("$0: cannot open file for reading: $ARGV[0]"); # open input file and initialise filehandel, actually does not seem to be needed
# as parsefile() (s.b.) is applied to the filename
+#-------------------------------------------------------------------------------------------
+# read source metadata file (prepared manually => ultimately read the info from CMDI File?)
+#-------------------------------------------------------------------------------------------
+
+my $linecount=0; # number of lines read so far from the source metadata file
+open(my $SOURCES, '<', $sourcescsvfile) || die("$0: cannot open file for reading: $sourcescsvfile");
+while(my $line = <$SOURCES>){ # <...> reads one line per iteration and ends the loop at end of file
+ $linecount++;
+ chomp($line);
+ if ( $line=~/^\#/ || $line=~/^\s*$/ || $linecount == 1){ # skip line if empty line or comment line or first line
+ next;
+ };
+ # split each line into its tab-separated fields:
+ my @fields = split(/\t+/, $line);
+ # ToDo: store the fields per source, e.g. $sources{"Suomen Kuvalehti"}[0]
+
+}
+close($SOURCES);
+
+# while (my $line = <$seq_fh>) {
+# chomp $line;
+# ## skip comments and blank lines and optional repeat of title line
+# next if $line =~ /^\#/ || $line =~ /^\s*$/ || $line =~ /^\+/;
+# #split each line into array
+# my @line = split(/\s+/, $line);
+# $result{$line[0]}{yeartotal} += $line[1];
+# $result{$line[0]}{earning} += $line[3] - $line[2];
+# }
+
+
+
+
#####################
# M A I N
@@ -210,17 +273,17 @@
my $language="Finnish";
my $lang_tla="fi";
- my $yy = $1; # $1 now containts substring in first bracket in regex above
+ my $year = $1; # $1 now containts substring in first bracket in regex above
- my $ctitle = $source . " " . $yy . ", from ". $kielipankkiCorpus . " for EuReCo"; # to do: also get name of corpus (klk-fi-v2-vrt)
+ my $ctitle = $source . " " . $year . ", from ". $kielipankkiCorpus . " for EuReCo"; # to do: also get name of corpus (klk-fi-v2-vrt)
#-----------------------
# set corpus header
#-----------------------
- &set_title( $corpusHeader, $source, $yy, $kielipankkiCorpus);
- &set_sourceDesc($corpusHeader, $source, $yy, $kielipankkiCorpus);
+ &set_title( $corpusHeader, $source, $year, $kielipankkiCorpus);
+ &set_sourceDesc($corpusHeader, $source, $year, $kielipankkiCorpus);
my $teiCorpusHeader = $corpusHeader->paste("first_child", $root);
@@ -245,9 +308,9 @@
my $textattsref = $text->atts(); # $textattsref is now a reference to a hash and should be used with '->'
-
- &createTextHeader($text, $textattsref, $textHeader);
-
+ # &createTextHeader returns the $textID:
+ my $textID = &createTextHeader($text, $textattsref, $textHeader);
+
#--------------------------
# create <TEI> from <text>
#--------------------------
@@ -255,12 +318,11 @@
# set vrt <text> to <TEI> and delete all attributes after they were were saved above
$text->del_atts;
$text->set_gi("TEI");
-
- # !!!!!!!!!!!!!!!!!!
- # ToDo: Generate a proper textsigle in TEI/@xml:id that can be converted into a textsigle
-
+ $text->set_att('xml:id', $textID);
+
+
#------------------------------------------------------------------
@@ -387,6 +449,20 @@
+ #----------------------------------------------------
+ # create textSigle to be returned from this function
+ #----------------------------------------------------
+
+ # e.g. SUK21_JAN.00001
+
+ my $corpusID = "SUK"; # ToDo read Table with Source metadata
+ my $yy = substr($datearray[0], 2, 2); # substr EXPR,OFFSET,LENGTH
+ my $mm = $datearray[1]; # substr EXPR,OFFSET,LENGTH
+ my $MMM = $months{$mm};
+
+ my $textID = $corpusID . $yy . "_" . $MMM . "." . sprintf("%05d", $doccounter{$mm}++);
+
+
#-----------------------------------------------------------------------
# CREATE text-teiHeader ACCORDING TO THE SKELETON in $textHeader
@@ -487,7 +563,9 @@
$textHeader->get_xpath('./revisionDesc/change', 0) ->set_att('when', localtime->ymd('-'));
+ return $textID;
+
#-----------------------------------
# END OF CREATING TEIHEADER
#-----------------------------------
@@ -573,9 +651,9 @@
sub set_title{
- my ($corpusHeader, $source, $yy, $kielipankkiCorpus) = @_;
+ my ($corpusHeader, $source, $year, $kielipankkiCorpus) = @_;
- my $cTitleString = $source . " " . $yy . ", from ". $kielipankkiCorpus . " for EuReCo";
+ my $cTitleString = $source . " " . $year . ", from ". $kielipankkiCorpus . " for EuReCo";
#<teiHeader>
# <fileDesc>
@@ -593,9 +671,9 @@
}
sub set_sourceDesc{
- my ($corpusHeader, $source, $yy, $kielipankkiCorpus) = @_;
+ my ($corpusHeader, $source, $year, $kielipankkiCorpus) = @_;
- my $cBiblString = $source . " " . $yy . ", from ". $kielipankkiCorpus . " for EuReCo";
+ my $cBiblString = $source . " " . $year . ", from ". $kielipankkiCorpus . " for EuReCo";
#<teiHeader>
# <fileDesc>