Send <EOT>\n to external tokenizer for separating texts
This is now the standard for the KorAP tokenizer.
Change-Id: I30c2d6ca82211b1d312364899d4f56ea7908b4f8
diff --git a/lib/KorAP/XML/TEI/Tokenizer/External.pm b/lib/KorAP/XML/TEI/Tokenizer/External.pm
index 8456e97..9417efa 100644
--- a/lib/KorAP/XML/TEI/Tokenizer/External.pm
+++ b/lib/KorAP/XML/TEI/Tokenizer/External.pm
@@ -30,13 +30,9 @@
return;
};
- # Send this sequence to separate inputs
- # TODO: needs to be explored furthermore ...
- # '\x03' produces a warning in 't/tokenization-external.t' (WARNING: extra output: 0 1)
- # - see discussion in gerrit (3123: Establish tokenizer object for external base tokenization)
- # an empty $sep leads to a blocking situation inside t/cmd/tokenizer.pl (right before the while-loop)
- #$sep //= "\n\x03\n";
- $sep //= "\n";
+ # Send <EOT> to separate texts (and \n to flush output)
+ # (Default for KorAP-Tokenizer).
+ $sep //= "\x04\n";
my $self = bless {
chld_in => undef,
@@ -131,6 +127,10 @@
$_ = <$out>;
if (defined $_ && $_ ne '') {
+
+ # This warning is sometimes thrown, though not yet replicated in the test suite.
+ # See the discussion in gerrit (3123: Establish tokenizer object for external base tokenization)
+ # for further issues.
$log->warn("Extra output: $_");
}
else {
diff --git a/t/cmd/tokenizer.pl b/t/cmd/tokenizer.pl
index ad286df..bf948e8 100644
--- a/t/cmd/tokenizer.pl
+++ b/t/cmd/tokenizer.pl
@@ -15,9 +15,11 @@
# Read lines from input and return boundaries
while (!eof(STDIN)) {
my $line = <>;
- $tok->tokenize($line);
- print join(' ', $tok->boundaries), "\n";
- $tok->reset;
+ for my $text (split(/\n?\x{04}\n?/, $line)) {
+ $tok->tokenize($text);
+ print join(' ', $tok->boundaries), "\n";
+ $tok->reset;
+ }
};
1;
diff --git a/t/tokenization-external.t b/t/tokenization-external.t
index 742c656..874f0fe 100644
--- a/t/tokenization-external.t
+++ b/t/tokenization-external.t
@@ -6,6 +6,8 @@
use Test::XML::Loy;
use FindBin;
+use utf8;
+
BEGIN {
unshift @INC, "$FindBin::Bin/../lib";
};
@@ -22,10 +24,6 @@
);
$ext->tokenize("Der alte Mann");
-# TODO:
-# see comments on $sep in 'lib/KorAP/XML/TEI/Tokenizer/External.pm'
-#$ext->tokenize("ging über die Straße");
-
my $str = $ext->to_string('unknown');
my $t = Test::XML::Loy->new($str);
$t->attr_is('layer spanList span:nth-child(1)', 'to', 3);
@@ -35,8 +33,20 @@
$t->attr_is('layer spanList span:nth-child(3)', 'to', 13);
$t->element_count_is('layer spanList span', 3);
+$ext->tokenize("ging über die Straße");
+$str = $ext->to_string('unknown');
+$t = Test::XML::Loy->new($str);
+$t->attr_is('layer spanList span:nth-child(1)', 'to', 4);
+$t->attr_is('layer spanList span:nth-child(2)', 'from', 5);
+$t->attr_is('layer spanList span:nth-child(2)', 'to', 9);
+$t->attr_is('layer spanList span:nth-child(3)', 'from', 10);
+$t->attr_is('layer spanList span:nth-child(3)', 'to', 13);
+$t->attr_is('layer spanList span:nth-child(4)', 'from', 14);
+$t->attr_is('layer spanList span:nth-child(4)', 'to', 20);
+$t->element_count_is('layer spanList span', 4);
+
$ext->reset;
-$ext->tokenize("Hu aha\ndas ist cool");
+$ext->tokenize("Hu aha\x{04}\ndas ist cool");
$str = $ext->to_string('unknown');
$t = Test::XML::Loy->new($str);