Fix a bug in delHTMLcom where comments were left open
Change-Id: I424ac394fa7eaad9a2b62c61761d2de6720870a2
diff --git a/.gitignore b/.gitignore
index 1c83e3e..9071e73 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,3 +2,7 @@
MYMETA*
Makefile
pm_to_blib
+\#*
+*~
+.*
+!.gitignore
diff --git a/lib/KorAP/XML/TEI.pm b/lib/KorAP/XML/TEI.pm
index 12131e7..7a2174d 100644
--- a/lib/KorAP/XML/TEI.pm
+++ b/lib/KorAP/XML/TEI.pm
@@ -9,7 +9,9 @@
# sometimes is not desirable (e.g.: '...<!-- comment -->\n<w>token</w>...' would lead to '... <w>token</w>...' in $buf_in).
# removing comments before processing the line, prevents this situation.
- my ( $pfx, $sfx );
+ my ( $pfx, $sfx ) = ('','');
+
+ CHECK:
while ( $html =~ s/<!--.*?-->//g ){}; # remove all comments in actual line
@@ -26,7 +28,7 @@
}
$html = "$pfx$sfx";
-
+ goto CHECK;
}
if ( $html =~ s/^\s*$// ){ # get next line and feed it also to this sub, if actual line is empty or only contains whitespace
diff --git a/t/tei.t b/t/tei.t
index 292ed19..fcfd32e 100644
--- a/t/tei.t
+++ b/t/tei.t
@@ -27,4 +27,19 @@
is(KorAP::XML::TEI::delHTMLcom($fh, '<!--'), "Test\n");
+seek($fh, 0, 0);
+
+print $fh <<'HTML';
+mehrzeiliger
+Kommentar
+ --><!-- Versuch
+-->ist <!-- a --><!-- b --> ein Test
+HTML
+
+seek($fh, 0, 0);
+
+is(KorAP::XML::TEI::delHTMLcom($fh, 'Dies <!--'), "Dies ist ein Test");
+
+close($fh);
+
done_testing;
diff --git a/xt/benchmark.pl b/xt/benchmark.pl
index 35c0c65..ddd17a2 100644
--- a/xt/benchmark.pl
+++ b/xt/benchmark.pl
@@ -4,7 +4,7 @@
use Dumbbench;
use File::Basename 'dirname';
use File::Spec::Functions qw/catfile rel2abs/;
-use File::Temp ':POSIX';
+use File::Temp 'tempfile';
use FindBin;
use Getopt::Long;
@@ -41,6 +41,15 @@
);
my $result;
+my ($fh, $filename) = tempfile();
+
+print $fh <<'HTML';
+mehrzeiliger
+Kommentar
+ --><!-- Versuch
+-->ist <!-- a --><!-- b --> ein Test
+HTML
+
# Add benchmark instances
$bench->add_instances(
@@ -61,11 +70,26 @@
};
}
),
+ Dumbbench::Instance::PerlSub->new(
+ name => 'delHTMLcom-long',
+ code => sub {
+ for (1..10_000) {
+ $result = KorAP::XML::TEI::delHTMLcom(
+ $fh,
+ "This <!--" . $_
+ );
+ seek($fh, 0, 0);
+ };
+ }
+ ),
);
# Run benchmarks
$bench->run;
+# Clean up
+close($fh);
+
# Output in a single row
if ($columns) {
unless ($no_header) {