Send <EOT>\n to external tokenizer for separating texts

This is now the standard for the KorAP tokenizer.

Change-Id: I30c2d6ca82211b1d312364899d4f56ea7908b4f8
diff --git a/lib/KorAP/XML/TEI/Tokenizer/External.pm b/lib/KorAP/XML/TEI/Tokenizer/External.pm
index 8456e97..9417efa 100644
--- a/lib/KorAP/XML/TEI/Tokenizer/External.pm
+++ b/lib/KorAP/XML/TEI/Tokenizer/External.pm
@@ -30,13 +30,9 @@
     return;
   };
 
-  # Send this sequence to separate inputs
-  # TODO: needs to be explored furthermore ...
-  #   '\x03' produces a warning in 't/tokenization-external.t' (WARNING: extra output: 0 1)
-  #   - see discussion in gerrit (3123: Establish tokenizer object for external base tokenization)
-  #   an empty $sep leads to a blocking situation inside t/cmd/tokenizer.pl (right before the while-loop)
-  #$sep //= "\n\x03\n";
-  $sep //= "\n";
+  # Send <EOT> to separate texts (and \n to flush output)
+  # (Default for KorAP-Tokenizer).
+  $sep //= "\x04\n";
 
   my $self = bless {
     chld_in  => undef,
@@ -131,6 +127,10 @@
       $_ = <$out>;
 
       if (defined $_ && $_ ne '') {
+
+        # This warning is sometimes thrown, though not yet replicated in the test suite.
+        # See the discussion in gerrit (3123: Establish tokenizer object for external base tokenization)
+        # for further issues.
         $log->warn("Extra output: $_");
       }
       else {
diff --git a/t/cmd/tokenizer.pl b/t/cmd/tokenizer.pl
index ad286df..bf948e8 100644
--- a/t/cmd/tokenizer.pl
+++ b/t/cmd/tokenizer.pl
@@ -15,9 +15,11 @@
 # Read lines from input and return boundaries
 while (!eof(STDIN)) {
   my $line = <>;
-  $tok->tokenize($line);
-  print join(' ', $tok->boundaries), "\n";
-  $tok->reset;
+  for my $text (split(/\n?\x{04}\n?/, $line)) {
+    $tok->tokenize($text);
+    print join(' ', $tok->boundaries), "\n";
+    $tok->reset;
+  }
 };
 
 1;
diff --git a/t/tokenization-external.t b/t/tokenization-external.t
index 742c656..874f0fe 100644
--- a/t/tokenization-external.t
+++ b/t/tokenization-external.t
@@ -6,6 +6,8 @@
 use Test::XML::Loy;
 
 use FindBin;
+use utf8;
+
 BEGIN {
   unshift @INC, "$FindBin::Bin/../lib";
 };
@@ -22,10 +24,6 @@
 );
 
 $ext->tokenize("Der alte Mann");
-# TODO:
-#   see comments on $sep in 'lib/KorAP/XML/TEI/Tokenizer/External.pm'
-#$ext->tokenize("ging über die Straße");
-
 my $str = $ext->to_string('unknown');
 my $t = Test::XML::Loy->new($str);
 $t->attr_is('layer spanList span:nth-child(1)', 'to', 3);
@@ -35,8 +33,20 @@
 $t->attr_is('layer spanList span:nth-child(3)', 'to', 13);
 $t->element_count_is('layer spanList span', 3);
 
+$ext->tokenize("ging über die Straße");
+$str = $ext->to_string('unknown');
+$t = Test::XML::Loy->new($str);
+$t->attr_is('layer spanList span:nth-child(1)', 'to', 4);
+$t->attr_is('layer spanList span:nth-child(2)', 'from', 5);
+$t->attr_is('layer spanList span:nth-child(2)', 'to', 9);
+$t->attr_is('layer spanList span:nth-child(3)', 'from', 10);
+$t->attr_is('layer spanList span:nth-child(3)', 'to', 13);
+$t->attr_is('layer spanList span:nth-child(4)', 'from', 14);
+$t->attr_is('layer spanList span:nth-child(4)', 'to', 20);
+$t->element_count_is('layer spanList span', 4);
+
 $ext->reset;
-$ext->tokenize("Hu aha\ndas ist cool");
+$ext->tokenize("Hu aha\x{04}\ndas ist cool");
 
 $str = $ext->to_string('unknown');
 $t = Test::XML::Loy->new($str);