options and switches and their interpretations; case distinctions and update of usage message

commit: fe838e0dd61ddef3f06fd8ae9534322f30b03f31 [log] [tgz]
author: Harald Lüngen <luengen@ids-mannheim.de> Wed Sep 25 09:01:00 2024 +0300
committer: Harald Lüngen <luengen@ids-mannheim.de> Wed Sep 25 09:01:00 2024 +0300
tree: 5f7d1dc630d8ae681598804ac64e1abff604e14c
parent: 37a0edeb301008620926a8760fd3754f939d53f6 [diff] [blame]
diff --git a/vrt2tei.pl b/vrt2tei.pl
index 42abd2b..93319a2 100755
--- a/vrt2tei.pl
+++ b/vrt2tei.pl

@@ -1,4 +1,5 @@
-#! /usr/bin/perl -w
+## #! /appl/soft/bio/bioperl/5.36.0/bin/perl
+## #! /usr/bin/perl -w
 
 
 ###########################################################################################################################################################
@@ -17,34 +18,28 @@
 #
 # TODO: 
 
-# 3a remove the vrt positional attribute comment line / all comment lines
-# 3b add @head and @deprel to I5 sowie auch @msd
-# 3c bearbeitung von @head und @deprel in tei2korapxml durch Nils?
-# 3d build 30 billion corpus
+# 0 ZIPPEN mit tei2korapxml; zippen mit korAP indexing
 
-# 4a take care of IDs
-# 4b see to the values of @xml:lang
-# 5 abfangen von unerwarteten elementen dh andere als <sentence> und <paragraph>
-# 5a wort reihenfolge nochmal checken
-# 6 checks and balances
-# 7  How to encode Kielipankki and National Library of Finland? in teiCorpus Header
-# 8  construct <idsDoc>s for the months (or go for TEI)
-# 9  parallelisation in bash and application on sub corpora of KLK
-# 10  re-implementation of the gawk code in the perl script
-# 12  re-implement creation of text header from xml file in another twig / parametrize TEI vs I5
+# 1 bearbeitung von @head und @deprel in tei2korapxml durch Nils?
+# 2 threading on compute node and application on sub corpora of KLK
+# 2 build 30 billion corpus and index it
+# 3 Optionen
+# 3a parametrize deprel for I5 and if Nils is not ready yet
 
-
-
-#remember
-#formatted.xml:105613: element w: validity error : No declaration for attribute deprel of element w
-#formatted.xml:105613: element w: validity error : No declaration for attribute head of element w
-
+# 
+    
+# 5  abfangen von unerwarteten elementen dh andere als <sentence> und <paragraph>
+# 6  checks and balances, wort reihenfolge nochmal checken?
+# 7  Encode Kielipankki and National Library of Finland? in teiCorpus Header
+# 8  How to encode the CLARIN-RES better - more Info from the CMDI
+# 9  construct <idsDoc>s independent of the order of texts, probably with writing intermediate files to zip 
+# 10 re-implementation of the gawk code in the perl script
+# 11 Swedish corpus
 
 #
 #
 ############################################################################################################################################################
 
-
 use strict;
 use warnings;
 #use diagnostics;
@@ -66,33 +61,78 @@
 use Data::Random::String;
 
 
-#----------------------
-# check file arguments:
-#----------------------
 
-# arg0 infile:   vrt-xml
+
+#-------------
+# get options 
+#-------------
+
+our ($opt_h, $opt_m, $opt_s, $opt_t);
+
+# read switches and print usage info if some bad option was given
+if (!getopts('hms:t:')) {   # switches with ':' take an argument; switches without ':' are boolean flags
+    &usage_message;
+    exit -1;
+}
+
+
+#--------------------
+# check argument(s)
+#--------------------
+
+# currently one argument: the vrt-xml input file
 
 unless($ARGV[0]) {&usage_message()} ; # min arg0, the input file
 if    ($ARGV[1]) {&usage_message()};  # max arg0, the input file
 
 
-#--------------------------
-# get options / auxiliary files
-#--------------------------
+
+#------------------------------------------------------------
+# initialize defaults for options
+#------------------------------------------------------------
+my $TEIFORMAT= "tei";
+my $MASK     = 0;
+
+
+#----------------------------------------------------------------------------------------------------------
+# interpret the options and check whether their respective argument is meaningful (if applicable)
+#----------------------------------------------------------------------------------------------------------
+
+
+# option -h: display usage info and exit
+if ($opt_h) {
+	print STDERR &usage_message;
+	exit 0;
+}
 
 
 
+# option -t
+if (defined($opt_t)) {
+    $TEIFORMAT = $opt_t;
+}
 
 
-####################
-# GLOBAL VARIABLES 
-####################
+if ($TEIFORMAT !~ /(tei|i5)/i) { # case-insensitive
+    print STDERR "Error: invalid arg for option -t";
+    &usage_message;
+    exit 0;
+}
+if($TEIFORMAT eq "tei"){$TEIFORMAT="TEI"};
+if($TEIFORMAT eq "i5") {$TEIFORMAT="I5"};
+
+# option -m
+if ($opt_m) {
+    $MASK = 1;
+}
+
+
+#-----------------------------------------------
+# OTHER GLOBAL VARIABLES 
+#-----------------------------------------------
 
 my $encoding = "UTF-8";                # dieses $encoding ist NUR fuer das output s.u. twig funktion
 
-#my $TEIFORMAT = "TEI";
-my $TEIFORMAT = "I5";
-
 my $DTDDECL = 'idsCorpus PUBLIC "-//IDS//DTD IDS-I5 1.0//EN" "http://corpora.ids-mannheim.de/I5/DTD/i5.dtd"';     # for I5
 
 my $textcounter = 0;
@@ -142,7 +182,7 @@
 
 my $twig="";   
 
-# variables $fnsource and $fnyear taken from the filename
+# variables $fnsource and $fnyear derived from the filename
 my @array = split(/\//, $ARGV[0]);
 my $l = scalar(@array);
 my $fnsource = $array[$l-1];
@@ -223,6 +263,7 @@
 $expandLang{"fi"} = "Finnish";
 $expandLang{"sv"} = "Swedish";
 
+
 #------------------------------------------------------------------
 # read corpusHeaderSkeleton document and start a twig for it
 #------------------------------------------------------------------
@@ -397,11 +438,11 @@
 	&set_sourceDesc($corpusHeader, $srcfullnames{$fnsource}, $fnyear, $kielipankkiCorpus);
     }
     elsif($TEIFORMAT eq "I5"){
+	$ident="id";
 	$corpusHeader->get_xpath("fileDesc/titleStmt/korpusSigle",0)  ->set_text($corpusids{$fnsource} . $fnYY);
 	$corpusHeader->get_xpath("fileDesc/publicationStmt/pubDate",0)->set_text((localtime)[5] + 1900);
 	$corpusHeader->get_xpath('encodingDesc/editorialDecl/transduction[@n="2"]',0)  ->set_text("I5 version by EuReCo using vrt2tei.pl " . localtime->ymd('-'));
 	&set_sourceDescI5($corpusHeader);
-	$ident="id";
     }
     else{
 	print STDERR "TEIFORMAT is $TEIFORMAT; must be TEI or I5";
@@ -796,11 +837,18 @@
     
     my @tags = split(/\t/, $line);
     
-    # set content of <w> i.e. the token
-    my $random_w = "";
-    # $random_w = Data::Random::String->create_random_string(length=>length($tags[0]), contains=>'alpha');
-    $random_w = $tags[0];
-    $w_element->set_text($random_w);
+    # set word string and lemma string according to $MASK flag:
+    my $w_string = "";
+    my $l_string = "";
+    if($MASK && ($tags[4] ne "Punct")){
+	$w_string = Data::Random::String->create_random_string(length=>length($tags[0]), contains=>'alpha');
+	$l_string = $w_string;
+    }
+    else {
+	$w_string = $tags[0];
+	$l_string = $tags[2];
+    }
+    $w_element->set_text($w_string);
     
     # vrt word and positional-attributes in corpus KLK:
     #  USE [0] word
@@ -821,11 +869,12 @@
     
     # set the attributes of <w>:
     $w_element->set_att("n",      $tags[1]);
+    
     # $w_element->set_att("id",     "w_" . $namearray[4] . $sentence->att("xml:id") . "_" . $tags[1]);
     # so zusammengebaute ID ist auch nicht eindeutig...
     $w_element->del_att("id");
-    $w_element->set_att("lemma",  $tags[2]);
-    #$w_element->set_att("lemma",  $random_w);
+
+    $w_element->set_att("lemma",  $l_string);
 
     # $w_element->set_att("norm",   $tags[3]);  # tag abuse of @norm
     $w_element->set_att("pos",    $tags[4]);
@@ -987,8 +1036,12 @@
 
 
 sub usage_message {
-    print "   Usage:  ./vrt2tei.pl <file.vrt.xml> <outfile>\n";
-    print "   <file.vrt.xml> is a VRT file converted to proper XML\n";
+    print STDERR "Usage:  ./vrt2tei.pl  [OPTIONS]  <file.vrt.xml>\n";
+    print STDERR "   <file.vrt.xml> is a VRT file as proper (well-formed) XML\n";
+    print STDERR "   Options:\n";
+    print STDERR "       -t (tei|i5)    output format, default: tei\n";
+    print STDERR "       -m             mask primary data\n";
+    print STDERR "\n";
     exit;
 }
commit	fe838e0dd61ddef3f06fd8ae9534322f30b03f31	[log] [tgz]
author	Harald Lüngen <luengen@ids-mannheim.de>	Wed Sep 25 09:01:00 2024 +0300
committer	Harald Lüngen <luengen@ids-mannheim.de>	Wed Sep 25 09:01:00 2024 +0300
tree	5f7d1dc630d8ae681598804ac64e1abff604e14c
parent	37a0edeb301008620926a8760fd3754f939d53f6 [diff] [blame]