Add -s option to use sentence boundaries provided by KorAP tokenizer

Change-Id: Id3aaa50d7775256e336013cc0fbe56803c125052

commit: 985da0cafd247c9d6bb891ae862f9c4ac324820d [log] [tgz]
author: Marc Kupietz <kupietz@ids-mannheim.de> Mon Feb 15 19:29:50 2021 +0100
committer: Marc Kupietz <kupietz@ids-mannheim.de> Wed Feb 17 11:57:19 2021 +0100
tree: a0f7ea1889e4e2f62b35b39dbbebe60eab43884c
parent: f7084c4f4c5e24613cf7935be5eb2bf8c92ce804 [diff]
diff --git a/t/script.t b/t/script.t
index e5402dc..81067d9 100644
--- a/t/script.t
+++ b/t/script.t

@@ -175,6 +175,31 @@
     ->element_count_is('spanList span', 227);
 };
 
+subtest 'Sentence split with KorAP tokenizer' => sub {
+
+  eval {
+    require KorAP::XML::TEI::Tokenizer::KorAP;
+    1;
+  } or do {
+    plan skip_all => "KorAP::XML::TEI::Tokenizer::KorAP cannot be used";
+  };
+
+  test_tei2korapxml(
+      file => $file,
+      param => "-tk -s",
+      tmp => 'script_sentence_split'
+  )
+      ->stderr_like(qr!tei2korapxml: .*? text_id=GOE_AGA\.00000!)
+      ->file_readable('GOE/AGA/00000/struct/structure.xml')
+      ->unzip_xml('GOE/AGA/00000/struct/structure.xml')
+      ->text_is('span#s25 fs f', 's')
+      ->attr_is('span#s25', 'l', -1)
+      ->attr_is('span#s25', 'to', 54)
+      ->text_is('span#s30 fs f', 's')
+      ->attr_is('span#s30', 'l', -1)
+      ->attr_is('span#s30', 'from', 1099)
+      ->attr_is('span#s30', 'to', 1266);
+};
 
 subtest 'Test Tokenizations' => sub {
 

diff --git a/t/tokenization-korap.t b/t/tokenization-korap.t
index a4c547e..0ca0719 100644
--- a/t/tokenization-korap.t
+++ b/t/tokenization-korap.t

@@ -17,12 +17,13 @@
   };
 }
 
+use_ok('KorAP::XML::TEI::Annotations::Collector');
 require_ok('KorAP::XML::TEI::Tokenizer::KorAP');
 
 my $f = dirname(__FILE__);
 my $cmd = catfile($f, 'cmd', 'tokenizer.pl');
 
-my $ext = KorAP::XML::TEI::Tokenizer::KorAP->new();
+my $ext = KorAP::XML::TEI::Tokenizer::KorAP->new(1);
 
 $ext->tokenize("Der alte Mann");
 my $str = $ext->to_string('unknown');
@@ -64,6 +65,13 @@
 $t->attr_is('layer spanList span:nth-child(14)', 'to', 92);
 $t->element_count_is('layer spanList span', 14);
 
+my $structures = KorAP::XML::TEI::Annotations::Collector->new;
+$ext->sentencize_from_previous_input($structures);
+$t = Test::XML::Loy->new($structures->[-1]->to_string(3));
+$t->attr_is('span', 'from', 6)
+  ->attr_is('span', 'to', 92)
+  ->attr_is('span', 'l', -1, "sentence splitting with korap tokenizer");
+
 $string = "Gefunden auf www.wikipedia.de";
 $ext->reset;
 $ext->tokenize($string);
commit	985da0cafd247c9d6bb891ae862f9c4ac324820d	[log] [tgz]
author	Marc Kupietz <kupietz@ids-mannheim.de>	Mon Feb 15 19:29:50 2021 +0100
committer	Marc Kupietz <kupietz@ids-mannheim.de>	Wed Feb 17 11:57:19 2021 +0100
tree	a0f7ea1889e4e2f62b35b39dbbebe60eab43884c
parent	f7084c4f4c5e24613cf7935be5eb2bf8c92ce804 [diff]