Merge "Remove the call for select_tokenization as it needlessly doubles the tokenizer check"
diff --git a/lib/KorAP/XML/TEI.pm b/lib/KorAP/XML/TEI.pm
index ac63ba8..2f7f11d 100644
--- a/lib/KorAP/XML/TEI.pm
+++ b/lib/KorAP/XML/TEI.pm
@@ -2,7 +2,11 @@
 use strict;
 use warnings;
 
-sub delHTMLcom { # remove HTML comments
+use Exporter 'import';
+our @EXPORT_OK = qw(remove_xml_comments);
+
+# remove xml comments
+sub remove_xml_comments {
   my ($fh, $html) = @_;
 
   # the source code part where $tc is used, leads to the situation, that comments can produce an additional blank, which
diff --git a/script/tei2korapxml b/script/tei2korapxml
index 0db0f31..785e976 100755
--- a/script/tei2korapxml
+++ b/script/tei2korapxml
@@ -18,7 +18,7 @@
   unshift @INC, "$FindBin::Bin/../lib";
 };
 
-use KorAP::XML::TEI;
+use KorAP::XML::TEI qw'remove_xml_comments';
 use KorAP::XML::TEI::Tokenizer::External;
 use KorAP::XML::TEI::Tokenizer::Conservative;
 use KorAP::XML::TEI::Tokenizer::Aggressive;
@@ -247,6 +247,10 @@
 
   my ( $pfx, $sfx );
 
+  # TODO:
+  #   Replace all uses of $lc with $. or $input_fh->input_line_number,
+  #   because otherwise remove_xml_comments will
+  #   advance through the input without incrementing the counter.
   my $lc = 0; # line counter
 
   my $tc = 0; # text counter
@@ -279,7 +283,7 @@
 
     # TODO: yet not tested fo big amounts of data
     # must-have, otherwise comments in input could be fatal (e.g.: ...<!--\n<idsHeader...\n-->...)
-    KorAP::XML::TEI::delHTMLcom ( $input_fh, $_ ); # remove HTML comments (<!--...-->)
+    remove_xml_comments( $input_fh, $_ ); # remove HTML comments (<!--...-->)
 
     if ( $data_fl && m#^(.*)</${_TEXT_BODY}>(.*)$# ){
 
diff --git a/t/tei.t b/t/tei.t
index 5022478..f9b5959 100644
--- a/t/tei.t
+++ b/t/tei.t
@@ -8,7 +8,7 @@
   unshift @INC, "$FindBin::Bin/../lib";
 };
 
-require_ok('KorAP::XML::TEI');
+use_ok('KorAP::XML::TEI', 'remove_xml_comments');
 
 my ($fh, $filename) = tempfile();
 
@@ -19,13 +19,13 @@
 Test
 HTML
 
-is(KorAP::XML::TEI::delHTMLcom($fh, "hallo"),"hallo");
-is(KorAP::XML::TEI::delHTMLcom($fh, "hallo <!-- Test -->"),"hallo ");
-is(KorAP::XML::TEI::delHTMLcom($fh, "<!-- Test --> hallo")," hallo");
+is(remove_xml_comments($fh, "hallo"),"hallo");
+is(remove_xml_comments($fh, "hallo <!-- Test -->"),"hallo ");
+is(remove_xml_comments($fh, "<!-- Test --> hallo")," hallo");
 
 seek($fh, 0, 0);
 
-is(KorAP::XML::TEI::delHTMLcom($fh, '<!--'), "Test\n");
+is(remove_xml_comments($fh, '<!--'), "Test\n");
 
 seek($fh, 0, 0);
 
@@ -38,7 +38,7 @@
 
 seek($fh, 0, 0);
 
-is(KorAP::XML::TEI::delHTMLcom($fh, 'Dies <!--'), "Dies ist  ein Test\n");
+is(remove_xml_comments($fh, 'Dies <!--'), "Dies ist  ein Test\n");
 
 close($fh);
 
diff --git a/xt/benchmark.pl b/xt/benchmark.pl
index b27acd6..c1657a6 100644
--- a/xt/benchmark.pl
+++ b/xt/benchmark.pl
@@ -12,7 +12,7 @@
   unshift @INC, "$FindBin::Bin/../lib";
 };
 
-use KorAP::XML::TEI;
+use KorAP::XML::TEI 'remove_xml_comments';
 use KorAP::XML::TEI::Tokenizer::Aggressive;
 use KorAP::XML::TEI::Tokenizer::Conservative;
 
@@ -84,7 +84,7 @@
     name => 'delHTMLcom',
     code => sub {
       for (1..100_000) {
-        $result = KorAP::XML::TEI::delHTMLcom(
+        $result = remove_xml_comments(
           \*STDIN,
           "This <!-- comment --> is a test " . $_
         );
@@ -95,7 +95,7 @@
     name => 'delHTMLcom-long',
     code => sub {
       for (1..10_000) {
-        $result = KorAP::XML::TEI::delHTMLcom(
+        $result = remove_xml_comments(
           $fh,
           "This <!--" . $_
         );