Minor improvement of performance by usind index() prior to regex in loops. In certain instances this even replaces regexes. Change-Id: I256d120dd8b88af0fa58ad9d3735409921513101

commit: c1124218984572bec3cda16814d74dc9c7d6e577 [log] [tgz]
author: Akron <nils@diewald-online.de> Fri Jul 31 08:55:38 2020 +0200
committer: Akron <nils@diewald-online.de> Fri Jul 31 20:38:09 2020 +0200
tree: 3033c45f77dc19fc2f74f57c0dc928d3fa25729c
parent: ec2cef29e4bb57178227ee4abf501738f54fcf41 [diff]
diff --git a/lib/KorAP/XML/TEI/Header.pm b/lib/KorAP/XML/TEI/Header.pm
index 9193928..44e7804 100644
--- a/lib/KorAP/XML/TEI/Header.pm
+++ b/lib/KorAP/XML/TEI/Header.pm

@@ -36,7 +36,7 @@
   my $self = bless [$text, undef, ''], $class;
 
   # Check header types to distinguish between siglen types
-  if ($text =~ m!^<${_HEADER_TAG} [^<]*type="([^"]+)"!) {
+  if ($text =~ m!^<${_HEADER_TAG}\s+[^<]*type="([^"]+)"!) {
     $self->[HEADTYPE] = $1;
   }
 
@@ -56,6 +56,9 @@
 
   my $sig_type = $sig{$self->[HEADTYPE]} // 'textSigle';
 
+  my $pos;
+  my $l = length('</' . $_HEADER_TAG) + 1;
+
   # Iterate over file handle
   while (<$fh>) {
 
@@ -63,40 +66,36 @@
     #   This version keeps comments in header files
 
     # End of header found - finish parsing
-    if ( m!^(.*</${_HEADER_TAG}>)(.*)$! ){
+    if (($pos = index($_, '</' . $_HEADER_TAG)) >= 0) {
 
       # Add to text
-      $self->[TEXT] .= $1;
+      $self->[TEXT] .= substr($_, 0, $l + $pos);
 
       die $log->fatal("Line with tag '</${_HEADER_TAG}>' (L$.) contains additional information")
-        if $2 !~ /^\s*$/;
+        if substr($_, $l + $pos) !~ /^\s*$/;
 
       if ($self->dir eq '') {
         $log->error("Empty '<$sig_type />' (L$.) in header");
         return;
-
       };
 
       return $self;
     };
 
     # Check for sigle in line
-    if ( m!^(.*)<$sig_type(?: [^>]*)?>([^<]*)(.*)$! ){
+    if (index($_, '<' . $sig_type) >= 0) {
 
-      my $pfx = $1;
-      my $sig = $2;
-      my $sfx = $3;
+      unless (m!^\s*<$sig_type[^>]*>([^<]*)</$sig_type>\s*$!) {
+        die $log->fatal("line with '<$sig_type />' (L$.) is not in expected format");
+      };
 
-      die $log->fatal("line with '<$sig_type />' (L$.) is not in expected format")
-        if $pfx !~ /^\s*$/  || $sfx !~ m!^</$sig_type>\s*$! || $sig =~ /^\s*$/;
-
-      $self->[SIGLE] = encode('UTF-8' , $sig);
+      $self->[SIGLE] = encode('UTF-8' , $1);
 
       # Escape sig
       my $sig_esc = decode('UTF-8', $self->sigle_esc);
 
       # replace sigle in header, if there's an escaped version that differs
-      s!(<$sig_type(?: [^>]*)?>)[^<]+</$sig_type>!$1$sig_esc</$sig_type>! if $sig_esc ne $sig;
+      s!$1</$sig_type>!$sig_esc</$sig_type>! if $sig_esc ne $1;
     };
 
     # Add line to header text
@@ -104,6 +103,7 @@
   };
 };
 
+
 # Type of the header
 sub type {
   $_[0]->[HEADTYPE];

diff --git a/script/tei2korapxml b/script/tei2korapxml
index 60a7ceb..280cacc 100755
--- a/script/tei2korapxml
+++ b/script/tei2korapxml

@@ -245,6 +245,8 @@
   # see in perlfunc: If LAYER is omitted or specified as ":raw" the filehandle is made suitable for passing binary data.
   binmode $input_fh;
 
+  my $pos;
+  my $l = length('</' . $_TEXT_BODY) + 1;
 
   # ~ loop (reading input document) ~
 
@@ -254,19 +256,16 @@
     # must-have, otherwise comments in input could be fatal (e.g.: ...<!--\n<idsHeader...\n-->...)
     remove_xml_comments( $input_fh, $_ ); # remove HTML comments (<!--...-->)
 
-    if ( $data_fl && m#^(.*)</${_TEXT_BODY}>(.*)$# ){
-
+    if ( $data_fl && ($pos = index($_, '</' . $_TEXT_BODY)) >= 0) {
 
       # ~ end of text body ~
 
 
       # write data.xml, structure.xml and evtl. morpho.xml and/or tokenization files (s.a.: $_tok_file_ext, $_tok_file_con, $_tok_file_agg)
 
-      $pfx = $1; $sfx = $2;
-
       die "ERROR ($0): main(): input line number $.: line with closing text-body tag '${_TEXT_BODY}'"
         ." contains additional information ... => Aborting\n\tline=$_"
-          if $pfx !~ /^\s*$/ || $sfx !~ /^\s*$/;
+        if (substr($_, 0, $pos) . substr($_, $l + $pos)) !~ /^\s*$/;
 
       if ( $dir ne "" ){
 
@@ -433,7 +432,7 @@
       # add line to buffer
       $buf_in .= $_;
 
-    } elsif ( m#^(.*)<${_TEXT_BODY}(?: [^>]*)?>(.*)$# ){
+    } elsif ( index($_, $_TEXT_BODY) >= 0 && m#^(.*)<${_TEXT_BODY}(?: [^>]*)?>(.*)$# ){
 
       # ~ start of text body ~
commit	c1124218984572bec3cda16814d74dc9c7d6e577	[log] [tgz]
author	Akron <nils@diewald-online.de>	Fri Jul 31 08:55:38 2020 +0200
committer	Akron <nils@diewald-online.de>	Fri Jul 31 20:38:09 2020 +0200
tree	3033c45f77dc19fc2f74f57c0dc928d3fa25729c
parent	ec2cef29e4bb57178227ee4abf501738f54fcf41 [diff]