Add -s option to use sentence boundaries provided by the KorAP tokenizer
Change-Id: Id3aaa50d7775256e336013cc0fbe56803c125052
diff --git a/t/tokenization-korap.t b/t/tokenization-korap.t
index a4c547e..0ca0719 100644
--- a/t/tokenization-korap.t
+++ b/t/tokenization-korap.t
@@ -17,12 +17,13 @@
};
}
+use_ok('KorAP::XML::TEI::Annotations::Collector');
require_ok('KorAP::XML::TEI::Tokenizer::KorAP');
my $f = dirname(__FILE__);
my $cmd = catfile($f, 'cmd', 'tokenizer.pl');
-my $ext = KorAP::XML::TEI::Tokenizer::KorAP->new();
+my $ext = KorAP::XML::TEI::Tokenizer::KorAP->new(1);
$ext->tokenize("Der alte Mann");
my $str = $ext->to_string('unknown');
@@ -64,6 +65,13 @@
$t->attr_is('layer spanList span:nth-child(14)', 'to', 92);
$t->element_count_is('layer spanList span', 14);
+my $structures = KorAP::XML::TEI::Annotations::Collector->new;
+$ext->sentencize_from_previous_input($structures);
+$t = Test::XML::Loy->new($structures->[-1]->to_string(3));
+$t->attr_is('span', 'from', 6)
+ ->attr_is('span', 'to', 92)
+ ->attr_is('span', 'l', -1, "sentence splitting with korap tokenizer");
+
$string = "Gefunden auf www.wikipedia.de";
$ext->reset;
$ext->tokenize($string);