Reduce indentation level and test for missing text ids

Change-Id: Ief41eaf446dd083abd3d63701a417279f5d9ec90

commit: dafaa7af418487d298a2726da4a3e70bcb0f3fb0 [log] [tgz]
author: Akron <nils@diewald-online.de> Fri Feb 19 15:17:58 2021 +0100
committer: Akron <nils@diewald-online.de> Tue Feb 23 16:01:43 2021 +0100
tree: dfab2c8676dfae3a41a30cbe571b6230a58b34a1
parent: d20898f99b85916370b6faee9fe2fdb4db015378 [diff] [blame]
diff --git a/script/tei2korapxml b/script/tei2korapxml
index 9aca1f6..8d15173 100755
--- a/script/tei2korapxml
+++ b/script/tei2korapxml

@@ -202,7 +202,7 @@
 # Maybe not necessary
 $data->reset;
 
-$dir = "";
+$dir = '';
 
 if ( $input_fname ne '' ){
   unless (open($input_fh, '<', $input_fname)) {
@@ -266,101 +266,100 @@
                             " contains additional information ... => Aborting (line=$_)");
         };
 
-        if ($dir ne "") {
+        if ($dir eq '') {
+          $log->warn("Maybe empty textSigle => skipping this text ...\ndata=" . substr($data->data, 0, 200));
+          next MAIN;
+        };
 
-          my $reader = XML::LibXML::Reader->new(
-            string => "<text>$buf_in</text>",
-            huge => 1
+        my $reader = XML::LibXML::Reader->new(
+          string => "<text>$buf_in</text>",
+          huge => 1
+        );
+
+        # See notes on whitespace handling
+        my $param = XCT_DOCUMENT_ROOT | XCT_IGNORE_COMMENTS | XCT_ATTRIBUTE_ARRAY;
+
+        # XCT_LINE_NUMBERS is only needed for debugging
+        # (see XML::CompactTree::XS)
+        $param |= XCT_LINE_NUMBERS if DEBUG;
+        my $tree_data = XML::CompactTree::XS::readSubtreeToPerl($reader, $param);
+
+        $structures->reset;
+
+        $tokens->reset if $_TOKENS_PROC;
+
+        # ~ whitespace related issue ~
+        $add_one = 0;
+        %ws = ();
+
+        # ~ recursion ~
+        retr_info(1, \$tree_data->[2] ); # parse input data
+
+        if (DEBUG) {
+          $log->debug("Writing (utf8-formatted) xml file $dir/${_data_file}.xml");
+        };
+
+        # ~ write data.xml ~
+        $data->to_zip(
+          $zipper->new_stream("$dir/${_data_file}.xml"),
+          $text_id_esc
+        );
+
+        # ~ tokenization ~
+        if ($_GEN_TOK_EXT) {
+
+          # Tokenize and output
+          $ext_tok->tokenize($data->data)->to_zip(
+            $zipper->new_stream("$dir/$_tok_dir/${_tok_file_ext}.xml"),
+            $text_id_esc
           );
+        };
 
-          # See notes on whitespace handling
-          my $param = XCT_DOCUMENT_ROOT | XCT_IGNORE_COMMENTS | XCT_ATTRIBUTE_ARRAY;
+        if ($_GEN_TOK_INT) {
 
-          # XCT_LINE_NUMBERS is only needed for debugging
-          # (see XML::CompactTree::XS)
-          $param |= XCT_LINE_NUMBERS if DEBUG;
-          my $tree_data = XML::CompactTree::XS::readSubtreeToPerl($reader, $param);
-
-          $structures->reset;
-
-          $tokens->reset if $_TOKENS_PROC;
-
-          # ~ whitespace related issue ~
-          $add_one = 0;
-          %ws = ();
-
-          # ~ recursion ~
-          retr_info(1, \$tree_data->[2] ); # parse input data
-
-          if (DEBUG) {
-            $log->debug("Writing (utf8-formatted) xml file $dir/${_data_file}.xml");
-          };
-
-          # ~ write data.xml ~
-          $data->to_zip(
-            $zipper->new_stream("$dir/${_data_file}.xml"),
+          # Tokenize and output
+          $cons_tok->tokenize($data->data)->to_zip(
+            $zipper->new_stream("$dir/$_tok_dir/" . $cons_tok->name . '.xml'),
             $text_id_esc
           );
 
-          # ~ tokenization ~
-          if ($_GEN_TOK_EXT) {
+          $aggr_tok->tokenize($data->data)->to_zip(
+            $zipper->new_stream("$dir/$_tok_dir/" . $aggr_tok->name . '.xml'),
+            $text_id_esc
+          );
 
-            # Tokenize and output
-            $ext_tok->tokenize($data->data)->to_zip(
-              $zipper->new_stream("$dir/$_tok_dir/${_tok_file_ext}.xml"),
-              $text_id_esc
-            );
-          };
+          $aggr_tok->reset;
+          $cons_tok->reset;
+        };
 
-          if ($_GEN_TOK_INT) {
-
-            # Tokenize and output
-            $cons_tok->tokenize($data->data)->to_zip(
-              $zipper->new_stream("$dir/$_tok_dir/" . $cons_tok->name . '.xml'),
-              $text_id_esc
-            );
-
-            $aggr_tok->tokenize($data->data)->to_zip(
-              $zipper->new_stream("$dir/$_tok_dir/" . $aggr_tok->name . '.xml'),
-              $text_id_esc
-            );
-
-            $aggr_tok->reset;
-            $cons_tok->reset;
-          };
-
-          if ($use_tokenizer_sentence_splits) {
-            $ext_tok->sentencize_from_previous_input($structures);
-          }
-
-          # ~ write structures ~
-          if (!$structures->empty) {
-            $structures->to_zip(
-              $zipper->new_stream("$dir/$_structure_dir/$_structure_file"),
-              $text_id_esc,
-              2 # = structure serialization
-            );
-          };
-
-          # ~ write tokens ~
-          if ($_TOKENS_PROC && !$tokens->empty) {
-            $tokens->to_zip(
-              $zipper->new_stream("$dir/$_tokens_dir/${_tokens_file}"),
-              $text_id_esc,
-              $_INLINE_ANNOT # Either 0 = tokens without inline or 1 = tokens with inline
-            );
-          };
-
-          $dir = ""; # reinit.
-
-          # Maybe not necessary
-          $data->reset;
-
-        } else { # $dir eq ""
-
-          $log->warn("Maybe empty textSigle => skipping this text ...\ndata=$data");
+        if ($use_tokenizer_sentence_splits) {
+          $ext_tok->sentencize_from_previous_input($structures);
         }
 
+        # ~ write structures ~
+        if (!$structures->empty) {
+          $structures->to_zip(
+            $zipper->new_stream("$dir/$_structure_dir/$_structure_file"),
+            $text_id_esc,
+            2 # = structure serialization
+          );
+        };
+
+        # ~ write tokens ~
+        if ($_TOKENS_PROC && !$tokens->empty) {
+          $tokens->to_zip(
+            $zipper->new_stream("$dir/$_tokens_dir/${_tokens_file}"),
+            $text_id_esc,
+            $_INLINE_ANNOT # Either 0 = tokens without inline or 1 = tokens with inline
+          );
+        };
+
+        # reinit.
+        $dir = '';
+
+        # Maybe not necessary
+        $data->reset;
+
         next MAIN;
       };
commit	dafaa7af418487d298a2726da4a3e70bcb0f3fb0	[log] [tgz]
author	Akron <nils@diewald-online.de>	Fri Feb 19 15:17:58 2021 +0100
committer	Akron <nils@diewald-online.de>	Tue Feb 23 16:01:43 2021 +0100
tree	dfab2c8676dfae3a41a30cbe571b6230a58b34a1
parent	d20898f99b85916370b6faee9fe2fdb4db015378 [diff] [blame]