Send <EOT>\n (U+0004 followed by a newline) to the external tokenizer to separate texts
This is now the standard text separator for the KorAP tokenizer.
Change-Id: I30c2d6ca82211b1d312364899d4f56ea7908b4f8
diff --git a/t/cmd/tokenizer.pl b/t/cmd/tokenizer.pl
index ad286df..bf948e8 100644
--- a/t/cmd/tokenizer.pl
+++ b/t/cmd/tokenizer.pl
@@ -15,9 +15,11 @@
# Read lines from input and return boundaries
while (!eof(STDIN)) {
my $line = <>;
- $tok->tokenize($line);
- print join(' ', $tok->boundaries), "\n";
- $tok->reset;
+ for my $text (split(/\n?\x{04}\n?/, $line)) {
+ $tok->tokenize($text);
+ print join(' ', $tok->boundaries), "\n";
+ $tok->reset;
+ }
};
1;
diff --git a/t/tokenization-external.t b/t/tokenization-external.t
index 742c656..874f0fe 100644
--- a/t/tokenization-external.t
+++ b/t/tokenization-external.t
@@ -6,6 +6,8 @@
use Test::XML::Loy;
use FindBin;
+use utf8;
+
BEGIN {
unshift @INC, "$FindBin::Bin/../lib";
};
@@ -22,10 +24,6 @@
);
$ext->tokenize("Der alte Mann");
-# TODO:
-# see comments on $sep in 'lib/KorAP/XML/TEI/Tokenizer/External.pm'
-#$ext->tokenize("ging über die Straße");
-
my $str = $ext->to_string('unknown');
my $t = Test::XML::Loy->new($str);
$t->attr_is('layer spanList span:nth-child(1)', 'to', 3);
@@ -35,8 +33,20 @@
$t->attr_is('layer spanList span:nth-child(3)', 'to', 13);
$t->element_count_is('layer spanList span', 3);
+$ext->tokenize("ging über die Straße");
+$str = $ext->to_string('unknown');
+$t = Test::XML::Loy->new($str);
+$t->attr_is('layer spanList span:nth-child(1)', 'to', 4);
+$t->attr_is('layer spanList span:nth-child(2)', 'from', 5);
+$t->attr_is('layer spanList span:nth-child(2)', 'to', 9);
+$t->attr_is('layer spanList span:nth-child(3)', 'from', 10);
+$t->attr_is('layer spanList span:nth-child(3)', 'to', 13);
+$t->attr_is('layer spanList span:nth-child(4)', 'from', 14);
+$t->attr_is('layer spanList span:nth-child(4)', 'to', 20);
+$t->element_count_is('layer spanList span', 4);
+
$ext->reset;
-$ext->tokenize("Hu aha\ndas ist cool");
+$ext->tokenize("Hu aha\x{04}\ndas ist cool");
$str = $ext->to_string('unknown');
$t = Test::XML::Loy->new($str);