Cleanup: Improve variable naming Change-Id: I9deafae98caa7a78fa85e0708e619b6460d846dc

commit: d53913c241262532e2f54a3c91a9410dd59f8a5e [log] [tgz]
author: Akron <nils@diewald-online.de> Wed Feb 24 09:50:13 2021 +0100
committer: Akron <nils@diewald-online.de> Wed Feb 24 10:06:04 2021 +0100
tree: e0d1ac8cf67875e1d35abbabeb0b6dd0ec9436e0
parent: dac5d93932bb5f7d81c392b0a67707eede64abe1 [diff] [blame]
diff --git a/script/tei2korapxml b/script/tei2korapxml
index 53c4ea8..c8ad29f 100755
--- a/script/tei2korapxml
+++ b/script/tei2korapxml

@@ -42,19 +42,19 @@
 
 # Parse options from the command line
 GetOptions(
-  "root|r=s"              => \(my $_root_dir = '.'),
+  "root|r=s"              => \(my $root_dir = '.'),
   "input|i=s"             => \(my $input_fname = ''),
   'tokenizer-call|tc=s'   => \(my $tokenizer_call),
   'tokenizer-korap|tk'    => \(my $tokenizer_korap),
-  'tokenizer-internal|ti' => \(my $_GEN_TOK_INT),
+  'tokenizer-internal|ti' => \(my $tokenizer_intern),
   'use-tokenizer-sentence-splits|s' => \(my $use_tokenizer_sentence_splits),
   'inline-tokens=s'       => \(my $inline_tokens = 'tokens#morpho'),
   'inline-structures=s'   => \(my $inline_structures = 'struct#structure'),
   'skip-inline-tokens'    => \(my $skip_inline_tokens = 0),
-  'base-foundry=s'        => \(my $_tok_dir = 'base'),
-  'data-file=s'           => \(my $_data_file = 'data'),
-  'header-file=s'         => \(my $_header_file = 'header'),
-  'tokens-file=s'         => \(my $_tok_file_ext = 'tokens'),
+  'base-foundry=s'        => \(my $base_dir = 'base'),
+  'data-file=s'           => \(my $data_file = 'data'),
+  'header-file=s'         => \(my $header_file = 'header'),
+  'tokens-file=s'         => \(my $tokens_file = 'tokens'),
   'log|l=s'               => \(my $log_level = 'notice'),
   'help|h' => sub {
     pod2usage(
@@ -86,6 +86,10 @@
 # TODO: IDS-specific (and redundant)
 my $_HEADER_TAG = 'idsHeader';
 
+# name of the tag containing all information stored in $_tokens_file
+my $_TOKENS_TAG = 'w';
+
+
 if ($use_tokenizer_sentence_splits && !$tokenizer_korap) {
   die $log->fatal("Sentence splitting is currently only supported by KorAP tokenizer (use -tk to activate it");
 };
@@ -112,16 +116,13 @@
 ##
 
 # Name of the directory and the file containing all inline structure informations
-# except for $_TOKEN_TAG information
+# except for $_TOKENS_TAG information
 my ($_structure_dir, $_structure_file) = split '#', $inline_structures . '#structure';
 
 # Name of the directory and the file containing all inline token informations
 # i.e. tokens of the $_TOKENS_TAG, if $_TOKENS_PROC is set
 my ($_tokens_dir, $_tokens_file) = split '#', $inline_tokens . '#morpho';
 
-# name of the tag containing all information stored in $_tokens_file
-my $_TOKENS_TAG = "w";
-
 # Handling inline annotations (inside $_TOKENS_TAG)
 my $_INLINE_ANNOT = $ENV{KORAPXMLTEI_INLINE} ? 1 : 0;
 
@@ -133,77 +134,81 @@
 my $data = KorAP::XML::TEI::Data->new;
 
 # Initialize zipper
-my $zipper = KorAP::XML::TEI::Zipper->new($_root_dir);
+my $zipper = KorAP::XML::TEI::Zipper->new($root_dir);
 
 
 #
 # ~~~ variables ~~~
 #
 
+my $dir = '';                                        # text     directory (below $root_dir)
 
-my $input_fh;                                        # input file handle (default: stdin)
-
-my $dir;                                             # text     directory (below $_root_dir)
-
-my ( $text_id,
-     $text_id_esc );                                 # '$text_id_esc' = escaped version of $text_id
+# '$text_id_esc' = escaped version of $text_id
+my ($text_id, $text_id_esc);
 
 # these are only used inside recursive function 'retr_info'
-my ( $_IDX,                                          # value is set dependent on DEBUG - for extracting array of child elements from element in $tree_data
-     $e,                                             # element from $tree_data
-     ## variables for handling ~ whitespace related issue ~ (it is sometimes necessary, to correct the from-values for some tags)
-     $add_one,                                       # ...
-     $fval,                                          # ...
-     %ws);                                           # hash for indices of whitespace-nodes (needed to recorrect from-values)
-                                                     # idea: when closing element, check if it's from-index minus 1 refers to a whitespace-node
-                                                     #  (means: 'from-index - 1' is a key in %ws).
-                                                     # if this is _not_ the case, then the from-value is one to high => correct it by substracting 1
+# Index of the array of child elements within an element of $tree_data
+# (the value depends on DEBUG)
+my $child_idx;
+
+# element from $tree_data
+my $e;
+
+# Keeping track of the current positions in the text
+my $pos;
+
+# Default encoding of the text
+my $input_enc = 'UTF-8';
+
+# variables for handling ~ whitespace related issue ~
+# (it is sometimes necessary to correct the from-values for some tags)
+my $add_one;
+my $from = 0;
+
+# text line (needed for whitespace handling)
+my $text_line = 0;
+
+# hash for indices of whitespace-nodes
+# (needed to recorrect from-values)
+# IDEA:
+#   when closing an element, check if its from-index minus 1 refers to a whitespace-node
+#   (meaning: 'from-index - 1' is a key in %ws).
+#   If this is _not_ the case, then the from-value is one
+#   too high => correct it by subtracting 1
+my %ws;
 
 
 #
 # ~~~ main ~~~
 #
 
-# ~ initializations ~
-
 # Include line numbers in elements of $tree_data for debugging
-DEBUG ? ($_IDX = 5) : ($_IDX = 4);
+DEBUG ? ($child_idx = 5) : ($child_idx = 4);
 
-$fval = 0;
 
 # ~ read input and write output (text by text) ~
 
-my $tl = 0; # text line (needed for whitespace handling)
+# Input file handle (default: stdin)
+my $input_fh = *STDIN;
 
-$input_fh = *STDIN;  # input file handle (default: stdin)
-
-# Maybe not necessary
-$data->reset;
-
-$dir = '';
-
-if ( $input_fname ne '' ){
+if ($input_fname ne '') {
   unless (open($input_fh, '<', $input_fname)) {
     die $log->fatal("File '$input_fname' could not be opened.");
   };
-}
+};
 
 # Prevents segfaulting (see notes on segfault prevention)
 binmode $input_fh;
 
-my $sfx;
-my $pos;
-my $input_enc = 'UTF-8';
-my $l = length('</' . $_TEXT_BODY) + 1;
 
-# ~ loop (reading input document) ~
-
+# Reading input document
 MAIN: while ( <$input_fh> ){
 
-  $_ = remove_xml_comments( $input_fh, $_ ); # remove HTML (multi-line) comments (<!--...-->)
+  # remove HTML (multi-line) comments (<!--...-->)
+  $_ = remove_xml_comments( $input_fh, $_ );
 
   # Set input encoding
-  if ( index($_, '<?xml') == 0 && $_ =~ /\sencoding=(['"])([^\1]+?)\1/) {
+  if (index($_, '<?xml') == 0 && $_ =~ /\sencoding=(['"])([^\1]+?)\1/) {
     $input_enc = $2;
     next;
   };
@@ -211,20 +216,20 @@
   $_ = decode($input_enc, $_);
   $_ = replace_entities($_);
 
-  if ( index($_, $_TEXT_BODY) >= 0 && m#^(.*)<${_TEXT_BODY}(?: [^>]*)?>(.*)$# ){
+  # Start of Text body
+  if (index($_, $_TEXT_BODY) >= 0 && m#^(.*)<${_TEXT_BODY}(?: [^>]*)?>(.*)$#){
 
-    # ~ start of text body ~
+    my $suffix = $2;
 
-    $sfx = $2;
-
-    if ($1 !~ /^\s*$/ || $sfx !~ /^\s*$/) {
+    if ($1 !~ /^\s*$/ || $suffix !~ /^\s*$/) {
       die $log->fatal("input line number $.: " .
                         "line with opening text-body tag '${_TEXT_BODY}' " .
                         "contains additional information ... => Aborting (line=$_)");
     };
 
-    # text body data extracted from input document ($input_fh), further processed by XML::LibXML::Reader
-    my $buf_in = '';
+    # Text body data extracted from input document ($input_fh),
+    # further processed by XML::LibXML::Reader
+    my $text_buffer = '';
 
     # Iterate over all lines in the text body
     while (<$input_fh>) {
@@ -233,24 +238,27 @@
       $_ = decode($input_enc, $_);
       $_ = replace_entities($_);
 
-      # ~ end of text body ~
+      # End of text body
       if (($pos = index($_, '</' . $_TEXT_BODY)) >= 0) {
 
         # write data.xml, structure.xml and evtl. morpho.xml and/or tokenization files
 
-        if ((substr($_, 0, $pos) . substr($_, $l + $pos)) !~ /^\s*$/) {
+        if ((substr($_, 0, $pos) . substr($_, length("</$_TEXT_BODY>") + $pos)) !~ /^\s*$/) {
           die $log->fatal("input line number $.: " .
                             "line with closing text-body tag '${_TEXT_BODY}'".
                             " contains additional information ... => Aborting (line=$_)");
         };
 
         if ($dir eq '') {
-          $log->warn("Maybe empty textSigle => skipping this text ...\ndata=" . substr($data->data, 0, 200));
+          $log->warn(
+            "Maybe empty textSigle => skipping this text ...\n" .
+              'data=' . substr($data->data, 0, 200)
+            );
           next MAIN;
         };
 
         my $reader = XML::LibXML::Reader->new(
-          string => "<text>$buf_in</text>",
+          string => "<text>$text_buffer</text>",
           huge => 1
         );
 
@@ -271,38 +279,39 @@
         %ws = ();
 
         # ~ recursion ~
-        retr_info(1, \$tree_data->[2] ); # parse input data
+        retr_info(1, \$tree_data->[2]); # parse input data
 
         if (DEBUG) {
-          $log->debug("Writing (utf8-formatted) xml file $dir/${_data_file}.xml");
+          $log->debug("Writing (utf8-formatted) xml file $dir/${data_file}.xml");
         };
 
-        # ~ write data.xml ~
+        # Write data.xml
         $data->to_zip(
-          $zipper->new_stream("$dir/${_data_file}.xml"),
+          $zipper->new_stream("$dir/${data_file}.xml"),
           $text_id_esc
         );
 
-        # ~ tokenization ~
+        # Tokenize with external tokenizer
         if ($ext_tok) {
 
           # Tokenize and output
           $ext_tok->tokenize($data->data)->to_zip(
-            $zipper->new_stream("$dir/$_tok_dir/${_tok_file_ext}.xml"),
+            $zipper->new_stream("$dir/$base_dir/${tokens_file}.xml"),
             $text_id_esc
           );
         };
 
-        if ($_GEN_TOK_INT) {
+        # Tokenize with internal tokenizer
+        if ($tokenizer_intern) {
 
           # Tokenize and output
           $cons_tok->tokenize($data->data)->to_zip(
-            $zipper->new_stream("$dir/$_tok_dir/" . $cons_tok->name . '.xml'),
+            $zipper->new_stream("$dir/$base_dir/" . $cons_tok->name . '.xml'),
             $text_id_esc
           );
 
           $aggr_tok->tokenize($data->data)->to_zip(
-            $zipper->new_stream("$dir/$_tok_dir/" . $aggr_tok->name . '.xml'),
+            $zipper->new_stream("$dir/$base_dir/" . $aggr_tok->name . '.xml'),
             $text_id_esc
           );
 
@@ -341,31 +350,44 @@
         next MAIN;
       };
 
-      # ~ inside text body ~
 
       # ~ whitespace handling ~
 
       # Fix whitespaces (see notes on whitespace fixing)
 
-      # TODO: Maybe it's best, to keep the stripping of whitespace and to just remove the if-clause and to insert a blank by default (with possibly
-      #   an option on how newlines in primary text should be handled (stripped or replaced by a whitespace)).
+      # TODO:
+      #   Maybe it's best to keep the stripping of whitespace,
+      #   remove the if-clause, and insert a blank by default
+      #   (possibly with an option for how newlines in primary text should
+      #   be handled: stripped or replaced by a whitespace).
 
       # Remove consecutive whitespace at beginning and end (mostly one newline)
       s/^\s+//; s/\s+$//;
 
-      ### NOTE: this is only relevant, if a text consists of more than one line
-      ### TODO: find a better solution, or create a warning, if a text has more than one line ($tl > 1)
-      ###  do testing with 2 different corpora (one with only one-line texts, the other with several lines per text)
-      if (m/<[^>]+>[^<]/) { # line contains at least one tag with at least one character contents
+      # NOTE:
+      #   this is only relevant, if a text consists of more than one line
 
-        $tl++; # counter for text lines
+      # TODO:
+      #   find a better solution, or create a warning, if a text has more
+      #   than one line ($text_line > 1)
 
-        s/^(.)/ $1/ if $tl > 1; # insert blank before 1st character (for 2nd line and consecutive lines)
+      # TODO:
+      #   do testing with 2 different corpora
+      #   (one with only one-line texts, the other with several lines per text)
+
+      # line contains at least one tag with at least one character contents
+      if (m/<[^>]+>[^<]/) {
+
+        # Increment counter for text lines
+        $text_line++;
+
+        # Insert a blank before the 1st character
+        # (for the 2nd and all subsequent lines)
+        s/^(.)/ $1/ if $text_line > 1;
       }
-      ###
 
       # add line to buffer
-      $buf_in .= $_;
+      $text_buffer .= $_;
     };
 
   } elsif (m#^(.*)(\<${_HEADER_TAG}[^>]*?type=["'].*)$#) {
@@ -374,9 +396,10 @@
     my $content = "$2\n";
 
     if ($1 !~ /^\s*$/) {
-      die $log->fatal("input line number $.: " .
-                        "line with opening header tag" .
-                        " is not in expected format ... => Aborting (line=$_)");
+      die $log->fatal(
+        "input line number $.: " .
+          'line with opening header tag is not in expected format ... ' .
+          "=> Aborting (line=$_)");
     };
 
     # Parse header
@@ -386,7 +409,7 @@
     if ($header) {
 
       # Write header to zip
-      my $file = $header->dir . '/' . $_header_file . '.xml';
+      my $file = $header->dir . '/' . $header_file . '.xml';
 
       $log->debug("Writing file $file") if DEBUG;
 
@@ -403,24 +426,29 @@
         # log output for seeing progression
         $log->notice("$0: text_id=$text_id");
 
-        $tl = 0; # reset (needed for ~ whitespace handling ~)
-      }
-    }
-  }
-} #end: while
+        # Reset counter for text lines
+        # (needed for whitespace handling)
+        $text_line = 0;
+      };
+    };
+  };
+};
 
 $zipper->close;
 
 $ext_tok->close if $ext_tok;
 
+close $input_fh;
+
 exit(0);
 
 
 # Recursively called function to handle XML tree data
 sub retr_info {
+
   # recursion level
   # (1 = topmost level inside retr_info() = should always be level of tag $_TEXT_BODY)
-  my $rl = shift;
+  my $depth = shift;
 
   # Iteration through all array elements
   # ($_[0] is a reference to an array reference)
@@ -438,8 +466,8 @@
       # $e->[1] represents the tag name
       # Skip sentences
       if ($use_tokenizer_sentence_splits && $e->[1] eq "s") {
-        if (defined $e->[$_IDX]) {
-          retr_info($rl+1, \$e->[$_IDX]);
+        if (defined $e->[$child_idx]) {
+          retr_info($depth+1, \$e->[$child_idx]);
         }
         next;
       }
@@ -476,12 +504,12 @@
 
 
       # Call function recursively
-      # do no recursion, if $e->[$_IDX] is not defined
+      # do no recursion, if $e->[$child_idx] is not defined
       # (because we have no array of child-nodes, e.g.: <back/>)
-      if (defined $e->[$_IDX]) {
+      if (defined $e->[$child_idx]) {
 
         # Recursion with array of child-nodes
-        retr_info($rl+1, \$e->[$_IDX]);
+        retr_info($depth+1, \$e->[$child_idx]);
       }
 
 
@@ -495,34 +523,40 @@
 
       # Handle structures and tokens
 
-      $fval = $anno->from;
+      $from = $anno->from;
 
       # ~ whitespace related issue ~
-      if ($fval > 0 && not exists $ws{$fval - 1}) {
+      if ($from > 0 && not exists $ws{$from - 1}) {
 
         # ~ previous node was a text-node ~
-        $anno->set_from($fval - 1);
-      }
-
-      # in case this fails, check input
-      if (($fval - 1) > $pos) {
-        die $log->fatal("text_id='$text_id', " .
-                          "processing of structures: " .
-                          "from-value ($fval) is 2 or more greater " .
-                          "than to-value ($pos) => please check. Aborting");
+        $anno->set_from($from - 1);
       };
 
-      # TODO: find example for which this case applies
-      #  maybe this is not necessary anymore, because the above recorrection of the from-value suffices
+      # in case this fails, check input
+      if (($from - 1) > $pos) {
+        die $log->fatal(
+          "text_id='$text_id', " .
+            'processing of structures: ' .
+            "from-value ($from) is 2 or more greater " .
+            "than to-value ($pos) => please check. Aborting"
+          );
+      };
+
+      # TODO:
+      #   find example for which this case applies
+      #   maybe this is not necessary anymore, because the
+      #   above recorrection of the from-value suffices
       #
-      # TODO: check, if it's better to remove this line and change above check to 'if ($fval - 1) >= $pos;
+      # TODO:
+      #   check if it's better to remove this line and
+      #   change the above check to 'if (($from - 1) >= $pos)';
       #   do testing with bigger corpus excerpt (wikipedia?)
-      $anno->set_from($pos) if $fval == $pos + 1;
+      $anno->set_from($pos) if $from == $pos + 1;
       $anno->set_to($pos);
-      $anno->set_level($rl);
+      $anno->set_level($depth);
 
       # Clean up whitespace
-      delete $ws{$fval  - 1} if $fval > 0 && exists $ws{$fval - 1};
+      delete $ws{$from  - 1} if $from > 0 && exists $ws{$from - 1};
 
 
       #~~~~
@@ -829,7 +863,7 @@
 ref($data->[2]->[0]->[3])                                               == HASH  (=> ${$data->[2]->[0]->[3]}{a} == 'v')
 $data->[2]->[0]->[4]                                                    == 1 (line number)
 ref($data->[2]->[0]->[5])                                               == ARRAY (with 2 elements for 'node1' and 'node2')
-                                                                                   # child-nodes of actual node (see $_IDX)
+                                                                                   # child-nodes of actual node (see $child_idx)
 
 ref($data->[2]->[0]->[5]->[0])                                          == ARRAY (with 6 elements)
 $data->[2]->[0]->[5]->[0]->[0]                                          == 1 (=> type == XML_READER_TYPE_ELEMENT)
@@ -905,7 +939,7 @@
 [1]
 Now, what happens, when 2 text-nodes are _not_ seperated by a whitespace-node (e.g.: <w>Augen<c>,</c></w>)?
  In this case, the falsely increased from-value has to be decreased again by 1 when closing the enclosing tag
- (see above code fragment '... not exists $ws{ $fval - 1 } ...').
+ (see above code fragment '... not exists $ws{ $from - 1 } ...').
 
 [2]
 Comparing the 2 examples '<w>fu</w> <w>bar</w>' and '<w>fu</w><w> </w><w>bar</w>', is ' ' in both cases handled as a
commit	d53913c241262532e2f54a3c91a9410dd59f8a5e	[log] [tgz]
author	Akron <nils@diewald-online.de>	Wed Feb 24 09:50:13 2021 +0100
committer	Akron <nils@diewald-online.de>	Wed Feb 24 10:06:04 2021 +0100
tree	e0d1ac8cf67875e1d35abbabeb0b6dd0ec9436e0
parent	dac5d93932bb5f7d81c392b0a67707eede64abe1 [diff] [blame]