Send <EOT>\n to the external tokenizer to separate texts
This is now the standard for the KorAP tokenizer.
Change-Id: I30c2d6ca82211b1d312364899d4f56ea7908b4f8
diff --git a/t/cmd/tokenizer.pl b/t/cmd/tokenizer.pl
index ad286df..bf948e8 100644
--- a/t/cmd/tokenizer.pl
+++ b/t/cmd/tokenizer.pl
@@ -15,9 +15,11 @@
# Read lines from input and return boundaries
while (!eof(STDIN)) {
my $line = <>;
- $tok->tokenize($line);
- print join(' ', $tok->boundaries), "\n";
- $tok->reset;
+ for my $text (split(/\n?\x{04}\n?/, $line)) {
+ $tok->tokenize($text);
+ print join(' ', $tok->boundaries), "\n";
+ $tok->reset;
+ }
};
1;