Atomize and test comment stripping

Change-Id: Id798c8bac96214f29659b2764f8861539d6f5210
diff --git a/lib/KorAP/XML/TEI.pm b/lib/KorAP/XML/TEI.pm
new file mode 100644
index 0000000..12131e7
--- /dev/null
+++ b/lib/KorAP/XML/TEI.pm
@@ -0,0 +1,40 @@
+package KorAP::XML::TEI;
+use strict;
+use warnings;
+
+sub delHTMLcom { # remove HTML comments
+  my ($fh, $html) = @_;
+
+  # the source code part where $tc is used, leads to the situation, that comments can produce an additional blank, which
+  # sometimes is not desirable (e.g.: '...<!-- comment -->\n<w>token</w>...' would lead to '... <w>token</w>...' in $buf_in).
+  # removing comments before processing the line, prevents this situation.
+
+  my ( $pfx, $sfx );
+
+  while ( $html =~ s/<!--.*?-->//g ){}; # remove all comments in actual line
+
+  if ( $html =~ /^(.*)<!--/ && $html !~ /-->/ ){ # remove comment spanning over several lines
+
+    $pfx = $1;
+
+    while ( $html = <$fh> ){
+
+      if ( $html =~ /-->(.*)$/ ){
+        $sfx = $1; last
+      }
+
+    }
+
+    $html = "$pfx$sfx";
+
+  }
+
+  if ( $html =~ s/^\s*$// ){ # get next line and feed it also to this sub, if actual line is empty or only contains whitespace
+
+    $html = <$fh>; delHTMLcom ( $fh, $html );
+  }
+
+  return $html
+}
+
+1;
diff --git a/script/tei2korapxml b/script/tei2korapxml
index fe8c37d..d50dc24 100755
--- a/script/tei2korapxml
+++ b/script/tei2korapxml
@@ -44,6 +44,12 @@
 use IO::Compress::Zip qw(zip $ZipError :constants);
 use IPC::Open2 qw(open2);
 
+use FindBin;
+BEGIN {
+  unshift @INC, "$FindBin::Bin/../lib";
+};
+
+use KorAP::XML::TEI;
 
 our $VERSION = '0.01';
 
@@ -308,7 +314,7 @@
 
     # TODO: yet not tested fo big amounts of data
     # must-have, otherwise comments in input could be fatal (e.g.: ...<!--\n<idsHeader...\n-->...)
-    delHTMLcom ( $_ ); # remove HTML comments (<!--...-->)
+    KorAP::XML::TEI::delHTMLcom ( $input_fh, $_ ); # remove HTML comments (<!--...-->)
 
     if ( $data_fl && m#^(.*)</${_TEXT_BODY}>(.*)$# ){
 
@@ -1473,39 +1479,6 @@
 } # end: sub write_tokens
 
 
-sub delHTMLcom { # remove HTML comments
-
-  # the source code part where $tc is used, leads to the situation, that comments can produce an additional blank, which
-  # sometimes is not desirable (e.g.: '...<!-- comment -->\n<w>token</w>...' would lead to '... <w>token</w>...' in $buf_in).
-  # removing comments before processing the line, prevents this situation.
-
-  my ( $pfx, $sfx );
-
-  while ( $_[0] =~ s/<!--.*?-->//g ){}; # remove all comments in actual line
-
-  if ( $_[0] =~ /^(.*)<!--/ && $_[0] !~ /-->/ ){ # remove comment spanning over several lines
-
-    $pfx = $1;
-
-    while ( $_[0] = <> ){
-
-      if ( $_[0] =~ /-->(.*)$/ ){
-        $sfx = $1; last
-      }
-
-    }
-
-    $_[0] = "$pfx$sfx";
-
-  }
-
-  if ( $_[0] =~ s/^\s*$// ){ # get next line and feed it also to this sub, if actual line is empty or only contains whitespace
-
-    $_[0] = <>; delHTMLcom ( $_[0] );
-  }
-}
-
-
 ## DEPRECATED ($_GEN_TOK_BAS: only IDS-intern)
 sub startTokenizer {
   $pid = open2($chld_out, $chld_in, 'java  -cp '. join(":", ".", glob(&dirname(__FILE__)."/../target/*.jar"))." de.ids_mannheim.korap.tokenizer.KorAPTokenizerImpl");
diff --git a/t/tei.t b/t/tei.t
new file mode 100644
index 0000000..292ed19
--- /dev/null
+++ b/t/tei.t
@@ -0,0 +1,30 @@
+use strict;
+use warnings;
+use Test::More;
+use File::Temp 'tempfile';
+
+use FindBin;
+BEGIN {
+  unshift @INC, "$FindBin::Bin/../lib";
+};
+
+require_ok('KorAP::XML::TEI');
+
+my ($fh, $filename) = tempfile();
+
+print $fh <<'HTML';
+mehrzeiliger
+Kommentar
+  -->
+Test
+HTML
+
+is(KorAP::XML::TEI::delHTMLcom($fh, "hallo"),"hallo");
+is(KorAP::XML::TEI::delHTMLcom($fh, "hallo <!-- Test -->"),"hallo ");
+is(KorAP::XML::TEI::delHTMLcom($fh, "<!-- Test --> hallo")," hallo");
+
+seek($fh, 0, 0);
+
+is(KorAP::XML::TEI::delHTMLcom($fh, '<!--'), "Test\n");
+
+done_testing;
diff --git a/xt/benchmark.pl b/xt/benchmark.pl
index d8e3b2a..35c0c65 100644
--- a/xt/benchmark.pl
+++ b/xt/benchmark.pl
@@ -12,6 +12,8 @@
   unshift @INC, "$FindBin::Bin/../lib";
 };
 
+use KorAP::XML::TEI;
+
 my $columns = 0;
 my $no_header = 0;
 GetOptions(
@@ -38,6 +40,8 @@
   verbosity => 0
 );
 
+my $result;
+
 # Add benchmark instances
 $bench->add_instances(
   Dumbbench::Instance::PerlSub->new(
@@ -45,7 +49,18 @@
     code => sub {
       `cat '$file' | perl '$script' > /dev/null 2>&1`
     }
-  )
+  ),
+  Dumbbench::Instance::PerlSub->new(
+    name => 'delHTMLcom',
+    code => sub {
+      for (1..100_000) {
+        $result = KorAP::XML::TEI::delHTMLcom(
+          \*STDIN,
+          "This <!-- comment --> is a test " . $_
+        );
+      };
+    }
+  ),
 );
 
 # Run benchmarks