Atomize and test comment stripping
Change-Id: Id798c8bac96214f29659b2764f8861539d6f5210
diff --git a/lib/KorAP/XML/TEI.pm b/lib/KorAP/XML/TEI.pm
new file mode 100644
index 0000000..12131e7
--- /dev/null
+++ b/lib/KorAP/XML/TEI.pm
@@ -0,0 +1,40 @@
+package KorAP::XML::TEI;
+use strict;
+use warnings;
+
+sub delHTMLcom { # remove HTML comments
+ my ($fh, $html) = @_;
+
+ # the source code part where $tc is used, leads to the situation, that comments can produce an additional blank, which
+ # sometimes is not desirable (e.g.: '...<!-- comment -->\n<w>token</w>...' would lead to '... <w>token</w>...' in $buf_in).
+ # removing comments before processing the line, prevents this situation.
+
+ my ( $pfx, $sfx );
+
+ while ( $html =~ s/<!--.*?-->//g ){}; # remove all comments in actual line
+
+ if ( $html =~ /^(.*)<!--/ && $html !~ /-->/ ){ # remove comment spanning over several lines
+
+ $pfx = $1;
+
+ while ( $html = <$fh> ){
+
+ if ( $html =~ /-->(.*)$/ ){
+ $sfx = $1; last
+ }
+
+ }
+
+ $html = "$pfx$sfx";
+
+ }
+
+ if ( $html =~ s/^\s*$// ){ # get next line and feed it also to this sub, if actual line is empty or only contains whitespace
+
+ $html = <$fh>; delHTMLcom ( $fh, $html );
+ }
+
+ return $html
+}
+
+1;
diff --git a/script/tei2korapxml b/script/tei2korapxml
index fe8c37d..d50dc24 100755
--- a/script/tei2korapxml
+++ b/script/tei2korapxml
@@ -44,6 +44,12 @@
use IO::Compress::Zip qw(zip $ZipError :constants);
use IPC::Open2 qw(open2);
+use FindBin;
+BEGIN {
+ unshift @INC, "$FindBin::Bin/../lib";
+};
+
+use KorAP::XML::TEI;
our $VERSION = '0.01';
@@ -308,7 +314,7 @@
# TODO: yet not tested fo big amounts of data
# must-have, otherwise comments in input could be fatal (e.g.: ...<!--\n<idsHeader...\n-->...)
- delHTMLcom ( $_ ); # remove HTML comments (<!--...-->)
+ KorAP::XML::TEI::delHTMLcom ( $input_fh, $_ ); # remove HTML comments (<!--...-->)
if ( $data_fl && m#^(.*)</${_TEXT_BODY}>(.*)$# ){
@@ -1473,39 +1479,6 @@
} # end: sub write_tokens
-sub delHTMLcom { # remove HTML comments
-
- # the source code part where $tc is used, leads to the situation, that comments can produce an additional blank, which
- # sometimes is not desirable (e.g.: '...<!-- comment -->\n<w>token</w>...' would lead to '... <w>token</w>...' in $buf_in).
- # removing comments before processing the line, prevents this situation.
-
- my ( $pfx, $sfx );
-
- while ( $_[0] =~ s/<!--.*?-->//g ){}; # remove all comments in actual line
-
- if ( $_[0] =~ /^(.*)<!--/ && $_[0] !~ /-->/ ){ # remove comment spanning over several lines
-
- $pfx = $1;
-
- while ( $_[0] = <> ){
-
- if ( $_[0] =~ /-->(.*)$/ ){
- $sfx = $1; last
- }
-
- }
-
- $_[0] = "$pfx$sfx";
-
- }
-
- if ( $_[0] =~ s/^\s*$// ){ # get next line and feed it also to this sub, if actual line is empty or only contains whitespace
-
- $_[0] = <>; delHTMLcom ( $_[0] );
- }
-}
-
-
## DEPRECATED ($_GEN_TOK_BAS: only IDS-intern)
sub startTokenizer {
$pid = open2($chld_out, $chld_in, 'java -cp '. join(":", ".", glob(&dirname(__FILE__)."/../target/*.jar"))." de.ids_mannheim.korap.tokenizer.KorAPTokenizerImpl");
diff --git a/t/tei.t b/t/tei.t
new file mode 100644
index 0000000..292ed19
--- /dev/null
+++ b/t/tei.t
@@ -0,0 +1,30 @@
+use strict;
+use warnings;
+use Test::More;
+use File::Temp 'tempfile';
+
+use FindBin;
+BEGIN {
+ unshift @INC, "$FindBin::Bin/../lib";
+};
+
+require_ok('KorAP::XML::TEI');
+
+my ($fh, $filename) = tempfile();
+
+print $fh <<'HTML';
+mehrzeiliger
+Kommentar
+ -->
+Test
+HTML
+
+is(KorAP::XML::TEI::delHTMLcom($fh, "hallo"),"hallo");
+is(KorAP::XML::TEI::delHTMLcom($fh, "hallo <!-- Test -->"),"hallo ");
+is(KorAP::XML::TEI::delHTMLcom($fh, "<!-- Test --> hallo")," hallo");
+
+seek($fh, 0, 0);
+
+is(KorAP::XML::TEI::delHTMLcom($fh, '<!--'), "Test\n");
+
+done_testing;
diff --git a/xt/benchmark.pl b/xt/benchmark.pl
index d8e3b2a..35c0c65 100644
--- a/xt/benchmark.pl
+++ b/xt/benchmark.pl
@@ -12,6 +12,8 @@
unshift @INC, "$FindBin::Bin/../lib";
};
+use KorAP::XML::TEI;
+
my $columns = 0;
my $no_header = 0;
GetOptions(
@@ -38,6 +40,8 @@
verbosity => 0
);
+my $result;
+
# Add benchmark instances
$bench->add_instances(
Dumbbench::Instance::PerlSub->new(
@@ -45,7 +49,18 @@
code => sub {
`cat '$file' | perl '$script' > /dev/null 2>&1`
}
- )
+ ),
+ Dumbbench::Instance::PerlSub->new(
+ name => 'delHTMLcom',
+ code => sub {
+ for (1..100_000) {
+ $result = KorAP::XML::TEI::delHTMLcom(
+ \*STDIN,
+ "This <!-- comment --> is a test " . $_
+ );
+ };
+ }
+ ),
);
# Run benchmarks