Merge "Remove the call for select_tokenization as it needlessly doubles the tokenizer check"
diff --git a/lib/KorAP/XML/TEI.pm b/lib/KorAP/XML/TEI.pm
index ac63ba8..2f7f11d 100644
--- a/lib/KorAP/XML/TEI.pm
+++ b/lib/KorAP/XML/TEI.pm
@@ -2,7 +2,11 @@
use strict;
use warnings;
-sub delHTMLcom { # remove HTML comments
+use Exporter 'import';
+our @EXPORT_OK = qw(remove_xml_comments);
+
+# remove xml comments
+sub remove_xml_comments {
my ($fh, $html) = @_;
# the source code part where $tc is used, leads to the situation, that comments can produce an additional blank, which
diff --git a/script/tei2korapxml b/script/tei2korapxml
index 0db0f31..785e976 100755
--- a/script/tei2korapxml
+++ b/script/tei2korapxml
@@ -18,7 +18,7 @@
unshift @INC, "$FindBin::Bin/../lib";
};
-use KorAP::XML::TEI;
+use KorAP::XML::TEI qw'remove_xml_comments';
use KorAP::XML::TEI::Tokenizer::External;
use KorAP::XML::TEI::Tokenizer::Conservative;
use KorAP::XML::TEI::Tokenizer::Aggressive;
@@ -247,6 +247,10 @@
my ( $pfx, $sfx );
+ # TODO:
+ # Replace all uses of $lc with $. or $input_fh->input_line_number,
+ # because otherwise remove_xml_comments will
+ # read further lines from the file handle without incrementing $lc.
my $lc = 0; # line counter
my $tc = 0; # text counter
@@ -279,7 +283,7 @@
# TODO: yet not tested fo big amounts of data
# must-have, otherwise comments in input could be fatal (e.g.: ...<!--\n<idsHeader...\n-->...)
- KorAP::XML::TEI::delHTMLcom ( $input_fh, $_ ); # remove HTML comments (<!--...-->)
+ remove_xml_comments( $input_fh, $_ ); # remove HTML comments (<!--...-->)
if ( $data_fl && m#^(.*)</${_TEXT_BODY}>(.*)$# ){
diff --git a/t/tei.t b/t/tei.t
index 5022478..f9b5959 100644
--- a/t/tei.t
+++ b/t/tei.t
@@ -8,7 +8,7 @@
unshift @INC, "$FindBin::Bin/../lib";
};
-require_ok('KorAP::XML::TEI');
+use_ok('KorAP::XML::TEI', 'remove_xml_comments');
my ($fh, $filename) = tempfile();
@@ -19,13 +19,13 @@
Test
HTML
-is(KorAP::XML::TEI::delHTMLcom($fh, "hallo"),"hallo");
-is(KorAP::XML::TEI::delHTMLcom($fh, "hallo <!-- Test -->"),"hallo ");
-is(KorAP::XML::TEI::delHTMLcom($fh, "<!-- Test --> hallo")," hallo");
+is(remove_xml_comments($fh, "hallo"),"hallo");
+is(remove_xml_comments($fh, "hallo <!-- Test -->"),"hallo ");
+is(remove_xml_comments($fh, "<!-- Test --> hallo")," hallo");
seek($fh, 0, 0);
-is(KorAP::XML::TEI::delHTMLcom($fh, '<!--'), "Test\n");
+is(remove_xml_comments($fh, '<!--'), "Test\n");
seek($fh, 0, 0);
@@ -38,7 +38,7 @@
seek($fh, 0, 0);
-is(KorAP::XML::TEI::delHTMLcom($fh, 'Dies <!--'), "Dies ist ein Test\n");
+is(remove_xml_comments($fh, 'Dies <!--'), "Dies ist ein Test\n");
close($fh);
diff --git a/xt/benchmark.pl b/xt/benchmark.pl
index b27acd6..c1657a6 100644
--- a/xt/benchmark.pl
+++ b/xt/benchmark.pl
@@ -12,7 +12,7 @@
unshift @INC, "$FindBin::Bin/../lib";
};
-use KorAP::XML::TEI;
+use KorAP::XML::TEI 'remove_xml_comments';
use KorAP::XML::TEI::Tokenizer::Aggressive;
use KorAP::XML::TEI::Tokenizer::Conservative;
@@ -84,7 +84,7 @@
name => 'delHTMLcom',
code => sub {
for (1..100_000) {
- $result = KorAP::XML::TEI::delHTMLcom(
+ $result = remove_xml_comments(
\*STDIN,
"This <!-- comment --> is a test " . $_
);
@@ -95,7 +95,7 @@
name => 'delHTMLcom-long',
code => sub {
for (1..10_000) {
- $result = KorAP::XML::TEI::delHTMLcom(
+ $result = remove_xml_comments(
$fh,
"This <!--" . $_
);