Minor performance improvement by using index() prior to regex in loops. In certain instances this even replaces regexes. Change-Id: I256d120dd8b88af0fa58ad9d3735409921513101

commit: c1124218984572bec3cda16814d74dc9c7d6e577 [log] [tgz]
author: Akron <nils@diewald-online.de> Fri Jul 31 08:55:38 2020 +0200
committer: Akron <nils@diewald-online.de> Fri Jul 31 20:38:09 2020 +0200
tree: 3033c45f77dc19fc2f74f57c0dc928d3fa25729c
parent: ec2cef29e4bb57178227ee4abf501738f54fcf41 [diff] [blame]
diff --git a/script/tei2korapxml b/script/tei2korapxml
index 60a7ceb..280cacc 100755
--- a/script/tei2korapxml
+++ b/script/tei2korapxml

@@ -245,6 +245,8 @@
   # see in perlfunc: If LAYER is omitted or specified as ":raw" the filehandle is made suitable for passing binary data.
   binmode $input_fh;
 
+  my $pos;
+  my $l = length('</' . $_TEXT_BODY) + 1;
 
   # ~ loop (reading input document) ~
 
@@ -254,19 +256,16 @@
     # must-have, otherwise comments in input could be fatal (e.g.: ...<!--\n<idsHeader...\n-->...)
     remove_xml_comments( $input_fh, $_ ); # remove HTML comments (<!--...-->)
 
-    if ( $data_fl && m#^(.*)</${_TEXT_BODY}>(.*)$# ){
-
+    if ( $data_fl && ($pos = index($_, '</' . $_TEXT_BODY)) >= 0) {
 
       # ~ end of text body ~
 
 
       # write data.xml, structure.xml and evtl. morpho.xml and/or tokenization files (s.a.: $_tok_file_ext, $_tok_file_con, $_tok_file_agg)
 
-      $pfx = $1; $sfx = $2;
-
       die "ERROR ($0): main(): input line number $.: line with closing text-body tag '${_TEXT_BODY}'"
         ." contains additional information ... => Aborting\n\tline=$_"
-          if $pfx !~ /^\s*$/ || $sfx !~ /^\s*$/;
+        if (substr($_, 0, $pos) . substr($_, $l + $pos)) !~ /^\s*$/;
 
       if ( $dir ne "" ){
 
@@ -433,7 +432,7 @@
       # add line to buffer
       $buf_in .= $_;
 
-    } elsif ( m#^(.*)<${_TEXT_BODY}(?: [^>]*)?>(.*)$# ){
+    } elsif ( index($_, $_TEXT_BODY) >= 0 && m#^(.*)<${_TEXT_BODY}(?: [^>]*)?>(.*)$# ){
 
       # ~ start of text body ~
commit	c1124218984572bec3cda16814d74dc9c7d6e577	[log] [tgz]
author	Akron <nils@diewald-online.de>	Fri Jul 31 08:55:38 2020 +0200
committer	Akron <nils@diewald-online.de>	Fri Jul 31 20:38:09 2020 +0200
tree	3033c45f77dc19fc2f74f57c0dc928d3fa25729c
parent	ec2cef29e4bb57178227ee4abf501738f54fcf41 [diff] [blame]