Replace recursion and non-essential regexes with index/substr
(as a very minor performance improvement) and fix a bug where
the regex capturing the text after the end of a multi-line comment
did not match the trailing newline, so the newline was dropped.
Change-Id: I573b50b85b7dd2732c2cad3f50e22b0e9e33e2ea
diff --git a/lib/KorAP/XML/TEI.pm b/lib/KorAP/XML/TEI.pm
index 7a2174d..ac63ba8 100644
--- a/lib/KorAP/XML/TEI.pm
+++ b/lib/KorAP/XML/TEI.pm
@@ -9,31 +9,42 @@
# sometimes is not desirable (e.g.: '...<!-- comment -->\n<w>token</w>...' would lead to '... <w>token</w>...' in $buf_in).
# removing comments before processing the line, prevents this situation.
- my ( $pfx, $sfx ) = ('','');
+ my $pfx = '';
+ my $i = 0;
CHECK:
- while ( $html =~ s/<!--.*?-->//g ){}; # remove all comments in actual line
+ $html =~ s/<!--.*?-->//g; # remove all comments in actual line
- if ( $html =~ /^(.*)<!--/ && $html !~ /-->/ ){ # remove comment spanning over several lines
+ # Remove comment spanning over several lines
+ # No closing comment found
+ if ( index($html, '-->') == -1) {
- $pfx = $1;
+ # Opening comment found
+ $i = index($html, '<!--');
+ if ($i != -1) {
+ $pfx = substr($html, 0, $i);
- while ( $html = <$fh> ){
+ # Consume all lines until the closing comment is found
+ while ( $html = <$fh> ){
- if ( $html =~ /-->(.*)$/ ){
- $sfx = $1; last
+ $i = index($html, '-->');
+ if ($i != -1){
+ $html = substr($html, $i + 3);
+ last;
+ }
+
}
+ $html = $pfx . ($html // '');
+ goto CHECK;
}
-
- $html = "$pfx$sfx";
- goto CHECK;
}
- if ( $html =~ s/^\s*$// ){ # get next line and feed it also to this sub, if actual line is empty or only contains whitespace
+ if ( $html =~ /^\s*$/ ){ # get next line and feed it also to this sub, if actual line is empty or only contains whitespace
- $html = <$fh>; delHTMLcom ( $fh, $html );
+ $html = <$fh>;
+ goto CHECK;
}
return $html
diff --git a/t/tei.t b/t/tei.t
index fcfd32e..5022478 100644
--- a/t/tei.t
+++ b/t/tei.t
@@ -38,7 +38,7 @@
seek($fh, 0, 0);
-is(KorAP::XML::TEI::delHTMLcom($fh, 'Dies <!--'), "Dies ist ein Test");
+is(KorAP::XML::TEI::delHTMLcom($fh, 'Dies <!--'), "Dies ist ein Test\n");
close($fh);