Minor improvement of performance by usind index() prior to regex
in loops.
In certain instances this even replaces regexes.
Change-Id: I256d120dd8b88af0fa58ad9d3735409921513101
diff --git a/lib/KorAP/XML/TEI/Header.pm b/lib/KorAP/XML/TEI/Header.pm
index 9193928..44e7804 100644
--- a/lib/KorAP/XML/TEI/Header.pm
+++ b/lib/KorAP/XML/TEI/Header.pm
@@ -36,7 +36,7 @@
my $self = bless [$text, undef, ''], $class;
# Check header types to distinguish between siglen types
- if ($text =~ m!^<${_HEADER_TAG} [^<]*type="([^"]+)"!) {
+ if ($text =~ m!^<${_HEADER_TAG}\s+[^<]*type="([^"]+)"!) {
$self->[HEADTYPE] = $1;
}
@@ -56,6 +56,9 @@
my $sig_type = $sig{$self->[HEADTYPE]} // 'textSigle';
+ my $pos;
+ my $l = length('</' . $_HEADER_TAG) + 1;
+
# Iterate over file handle
while (<$fh>) {
@@ -63,40 +66,36 @@
# This version keeps comments in header files
# End of header found - finish parsing
- if ( m!^(.*</${_HEADER_TAG}>)(.*)$! ){
+ if (($pos = index($_, '</' . $_HEADER_TAG)) >= 0) {
# Add to text
- $self->[TEXT] .= $1;
+ $self->[TEXT] .= substr($_, 0, $l + $pos);
die $log->fatal("Line with tag '</${_HEADER_TAG}>' (L$.) contains additional information")
- if $2 !~ /^\s*$/;
+ if substr($_, $l + $pos) !~ /^\s*$/;
if ($self->dir eq '') {
$log->error("Empty '<$sig_type />' (L$.) in header");
return;
-
};
return $self;
};
# Check for sigle in line
- if ( m!^(.*)<$sig_type(?: [^>]*)?>([^<]*)(.*)$! ){
+ if (index($_, '<' . $sig_type) >= 0) {
- my $pfx = $1;
- my $sig = $2;
- my $sfx = $3;
+ unless (m!^\s*<$sig_type[^>]*>([^<]*)</$sig_type>\s*$!) {
+ die $log->fatal("line with '<$sig_type />' (L$.) is not in expected format");
+ };
- die $log->fatal("line with '<$sig_type />' (L$.) is not in expected format")
- if $pfx !~ /^\s*$/ || $sfx !~ m!^</$sig_type>\s*$! || $sig =~ /^\s*$/;
-
- $self->[SIGLE] = encode('UTF-8' , $sig);
+ $self->[SIGLE] = encode('UTF-8' , $1);
# Escape sig
my $sig_esc = decode('UTF-8', $self->sigle_esc);
# replace sigle in header, if there's an escaped version that differs
- s!(<$sig_type(?: [^>]*)?>)[^<]+</$sig_type>!$1$sig_esc</$sig_type>! if $sig_esc ne $sig;
+ s!$1</$sig_type>!$sig_esc</$sig_type>! if $sig_esc ne $1;
};
# Add line to header text
@@ -104,6 +103,7 @@
};
};
+
# Type of the header
sub type {
$_[0]->[HEADTYPE];
diff --git a/script/tei2korapxml b/script/tei2korapxml
index 60a7ceb..280cacc 100755
--- a/script/tei2korapxml
+++ b/script/tei2korapxml
@@ -245,6 +245,8 @@
# see in perlfunc: If LAYER is omitted or specified as ":raw" the filehandle is made suitable for passing binary data.
binmode $input_fh;
+ my $pos;
+ my $l = length('</' . $_TEXT_BODY) + 1;
# ~ loop (reading input document) ~
@@ -254,19 +256,16 @@
# must-have, otherwise comments in input could be fatal (e.g.: ...<!--\n<idsHeader...\n-->...)
remove_xml_comments( $input_fh, $_ ); # remove HTML comments (<!--...-->)
- if ( $data_fl && m#^(.*)</${_TEXT_BODY}>(.*)$# ){
-
+ if ( $data_fl && ($pos = index($_, '</' . $_TEXT_BODY)) >= 0) {
# ~ end of text body ~
# write data.xml, structure.xml and evtl. morpho.xml and/or tokenization files (s.a.: $_tok_file_ext, $_tok_file_con, $_tok_file_agg)
- $pfx = $1; $sfx = $2;
-
die "ERROR ($0): main(): input line number $.: line with closing text-body tag '${_TEXT_BODY}'"
." contains additional information ... => Aborting\n\tline=$_"
- if $pfx !~ /^\s*$/ || $sfx !~ /^\s*$/;
+ if (substr($_, 0, $pos) . substr($_, $l + $pos)) !~ /^\s*$/;
if ( $dir ne "" ){
@@ -433,7 +432,7 @@
# add line to buffer
$buf_in .= $_;
- } elsif ( m#^(.*)<${_TEXT_BODY}(?: [^>]*)?>(.*)$# ){
+ } elsif ( index($_, $_TEXT_BODY) >= 0 && m#^(.*)<${_TEXT_BODY}(?: [^>]*)?>(.*)$# ){
# ~ start of text body ~