options and switches and their interpretations; case distinctions and update of usage message
diff --git a/vrt2tei.pl b/vrt2tei.pl
index 42abd2b..93319a2 100755
--- a/vrt2tei.pl
+++ b/vrt2tei.pl
@@ -1,4 +1,5 @@
-#! /usr/bin/perl -w
+#!/usr/bin/env perl
+## interpreters used before: /appl/soft/bio/bioperl/5.36.0/bin/perl and /usr/bin/perl -w
###########################################################################################################################################################
@@ -17,34 +18,28 @@
#
# TODO:
-# 3a remove the vrt positional attribute comment line / all comment lines
-# 3b add @head and @deprel to I5 sowie auch @msd
-# 3c bearbeitung von @head und @deprel in tei2korapxml durch Nils?
-# 3d build 30 billion corpus
+# 0 ZIPPEN mit tei2korapxml; zippen mit korAP indexing
-# 4a take care of IDs
-# 4b see to the values of @xml:lang
-# 5 abfangen von unerwarteten elementen dh andere als <sentence> und <paragraph>
-# 5a wort reihenfolge nochmal checken
-# 6 checks and balances
-# 7 How to encode Kielipankki and National Library of Finland? in teiCorpus Header
-# 8 construct <idsDoc>s for the months (or go for TEI)
-# 9 parallelisation in bash and application on sub corpora of KLK
-# 10 re-implementation of the gawk code in the perl script
-# 12 re-implement creation of text header from xml file in another twig / parametrize TEI vs I5
+# 1 bearbeitung von @head und @deprel in tei2korapxml durch Nils?
+# 2 threading on compute node and application on sub corpora of KLK
+# 2 build 30 billion corpus and index it
+# 3 Optionen
+# 3a parametrize deprel for I5 and if Nils is not ready yet
-
-
-#remember
-#formatted.xml:105613: element w: validity error : No declaration for attribute deprel of element w
-#formatted.xml:105613: element w: validity error : No declaration for attribute head of element w
-
+#
+
+# 5 abfangen von unerwarteten elementen dh andere als <sentence> und <paragraph>
+# 6 checks and balances, wort reihenfolge nochmal checken?
+# 7 Encode Kielipankki and National Library of Finland? in teiCorpus Header
+# 8 How to encode the CLARIN-RES better - more Info from the CMDI
+# 9 construct <idsDoc>s independent of the order of texts, probably with writing intermediate files to zip
+# 10 re-implementation of the gawk code in the perl script
+# 11 Swedish corpus
#
#
############################################################################################################################################################
-
use strict;
use warnings;
#use diagnostics;
@@ -66,33 +61,78 @@
use Data::Random::String;
-#----------------------
-# check file arguments:
-#----------------------
-# arg0 infile: vrt-xml
+
+#-------------
+# get options
+#-------------
+
+our ($opt_h, $opt_m, $opt_s, $opt_t);
+
+# read switches and print usage info if some bad option was given
+if (!getopts('hms:t:')) { # switches with ':' take an argument; switches without ':' are boolean flags
+ usage_message(); # prints the usage text and terminates the script
+ exit 1; # safety net in case usage_message ever returns; exit statuses must lie in 0..255
+}
+
+
+#--------------------
+# check argument(s)
+#--------------------
+
+# currently one argument: the vrt-xml input file
unless($ARGV[0]) {&usage_message()} ; # min arg0, the input file
if ($ARGV[1]) {&usage_message()}; # max arg0, the input file
-#--------------------------
-# get options / auxiliary files
-#--------------------------
+
+#------------------------------------------------------------
+# initialize defaults for options
+#------------------------------------------------------------
+my $TEIFORMAT= "tei";
+my $MASK = 0;
+
+
+#----------------------------------------------------------------------------------------------------------
+# interpret the options and check whether their respective argument is meaningful (if applicable)
+#----------------------------------------------------------------------------------------------------------
+
+
+# option -h: display usage info and exit
+if ($opt_h) {
+ usage_message(); # writes the usage text to STDERR itself and terminates the script
+ exit 0; # safety net in case usage_message ever returns
+}
+# option -t: output format
+if (defined($opt_t)) {
+ $TEIFORMAT = $opt_t;
+}
-####################
-# GLOBAL VARIABLES
-####################
+if ($TEIFORMAT !~ /\A(?:tei|i5)\z/i) { # the whole value must be tei or i5; letter case is ignored
+ print STDERR "Error: invalid arg for option -t\n";
+ usage_message(); # prints the usage text and terminates the script
+ exit 1; # safety net in case usage_message ever returns
+}
+# the format is compared against the upper-case spellings TEI and I5 further down,
+$TEIFORMAT = uc($TEIFORMAT); # so normalize every accepted spelling (tei, Tei, i5, ...) here
+
+# option -m: mask primary data
+if ($opt_m) {
+ $MASK = 1;
+}
+
+
+#-----------------------------------------------
+# OTHER GLOBAL VARIABLES
+#-----------------------------------------------
my $encoding = "UTF-8"; # dieses $encoding ist NUR fuer das output s.u. twig funktion
-#my $TEIFORMAT = "TEI";
-my $TEIFORMAT = "I5";
-
my $DTDDECL = 'idsCorpus PUBLIC "-//IDS//DTD IDS-I5 1.0//EN" "http://corpora.ids-mannheim.de/I5/DTD/i5.dtd"'; # for I5
my $textcounter = 0;
@@ -142,7 +182,7 @@
my $twig="";
-# variables $fnsource and $fnyear taken from the filename
+# variables $fnsource and $fnyear derived from the filename
my @array = split(/\//, $ARGV[0]);
my $l = scalar(@array);
my $fnsource = $array[$l-1];
@@ -223,6 +263,7 @@
$expandLang{"fi"} = "Finnish";
$expandLang{"sv"} = "Swedish";
+
#------------------------------------------------------------------
# read corpusHeaderSkeleton document and start a twig for it
#------------------------------------------------------------------
@@ -397,11 +438,11 @@
&set_sourceDesc($corpusHeader, $srcfullnames{$fnsource}, $fnyear, $kielipankkiCorpus);
}
elsif($TEIFORMAT eq "I5"){
+ $ident="id";
$corpusHeader->get_xpath("fileDesc/titleStmt/korpusSigle",0) ->set_text($corpusids{$fnsource} . $fnYY);
$corpusHeader->get_xpath("fileDesc/publicationStmt/pubDate",0)->set_text((localtime)[5] + 1900);
$corpusHeader->get_xpath('encodingDesc/editorialDecl/transduction[@n="2"]',0) ->set_text("I5 version by EuReCo using vrt2tei.pl " . localtime->ymd('-'));
&set_sourceDescI5($corpusHeader);
- $ident="id";
}
else{
print STDERR "TEIFORMAT is $TEIFORMAT; must be TEI or I5";
@@ -796,11 +837,18 @@
my @tags = split(/\t/, $line);
- # set content of <w> i.e. the token
- my $random_w = "";
- # $random_w = Data::Random::String->create_random_string(length=>length($tags[0]), contains=>'alpha');
- $random_w = $tags[0];
- $w_element->set_text($random_w);
+ # set word string and lemma string according to $MASK flag:
+ my $w_string = "";
+ my $l_string = "";
+ if($MASK && ($tags[4] ne "Punct")){
+ $w_string = Data::Random::String->create_random_string(length=>length($tags[0]), contains=>'alpha');
+ $l_string = $w_string;
+ }
+ else {
+ $w_string = $tags[0];
+ $l_string = $tags[2];
+ }
+ $w_element->set_text($w_string);
# vrt word and positional-attributes in corpus KLK:
# USE [0] word
@@ -821,11 +869,12 @@
# set the attributes of <w>:
$w_element->set_att("n", $tags[1]);
+
# $w_element->set_att("id", "w_" . $namearray[4] . $sentence->att("xml:id") . "_" . $tags[1]);
# so zusammengebaute ID ist auch nicht eindeutig...
$w_element->del_att("id");
- $w_element->set_att("lemma", $tags[2]);
- #$w_element->set_att("lemma", $random_w);
+
+ $w_element->set_att("lemma", $l_string);
# $w_element->set_att("norm", $tags[3]); # tag abuse of @norm
$w_element->set_att("pos", $tags[4]);
@@ -987,8 +1036,12 @@
sub usage_message {
- print " Usage: ./vrt2tei.pl <file.vrt.xml> <outfile>\n";
- print " <file.vrt.xml> is a VRT file converted to proper XML\n";
+ # Print the command-line synopsis to STDERR and terminate the script.
+ # Takes no arguments and never returns to the caller.
+ print STDERR "Usage: ./vrt2tei.pl [OPTIONS] <file.vrt.xml>\n";
+ print STDERR " <file.vrt.xml> is a VRT file as proper (well-formed) XML\n";
+ print STDERR " Options:\n";
+ print STDERR " -h display this usage message and exit\n";
+ print STDERR " -t (tei|i5) output format, default: tei\n";
+ print STDERR " -m mask primary data\n";
+ print STDERR "\n";
exit;
}