Add -s option to use sentence boundaries provided by KorAP tokenizer Change-Id: Id3aaa50d7775256e336013cc0fbe56803c125052

commit: 985da0cafd247c9d6bb891ae862f9c4ac324820d [log] [tgz]
author: Marc Kupietz <kupietz@ids-mannheim.de> Mon Feb 15 19:29:50 2021 +0100
committer: Marc Kupietz <kupietz@ids-mannheim.de> Wed Feb 17 11:57:19 2021 +0100
tree: a0f7ea1889e4e2f62b35b39dbbebe60eab43884c
parent: f7084c4f4c5e24613cf7935be5eb2bf8c92ce804 [diff]
diff --git a/Changes b/Changes
index 078fb26..5bd0267 100644
--- a/Changes
+++ b/Changes

@@ -1,3 +1,4 @@
+        - -s option added that uses sentence boundaries provided by the KorAP tokenizer (-tk)
 0.03 2021-01-12
         - Update KorAP-Tokenizer to released 2.0 version
         - Improve test suite for recent version

diff --git a/Readme.pod b/Readme.pod
index d3cca5b..981ae2e 100644
--- a/Readme.pod
+++ b/Readme.pod

@@ -116,6 +116,11 @@
 that will take an I<Aggressive> and a I<conservative>
 approach.
 
+=item B<--use-tokenizer-sentence-splits|-s>
+
+Replace existing sentence boundary information with (or add new) boundaries
+provided by the tokenizer (currently only supported by the KorAP tokenizer, B<-tk>).
+
 =item B<--log|-l>
 
 Loglevel for I<Log::Any>. Defaults to C<notice>.
@@ -140,7 +145,7 @@
 
 =head1 COPYRIGHT AND LICENSE
 
-Copyright (C) 2020, L<IDS Mannheim|https://www.ids-mannheim.de/>
+Copyright (C) 2021, L<IDS Mannheim|https://www.ids-mannheim.de/>
 
 Author: Peter Harders
 

diff --git a/lib/KorAP/XML/TEI/Annotations/Collector.pm b/lib/KorAP/XML/TEI/Annotations/Collector.pm
index ed11d23..a15a98f 100644
--- a/lib/KorAP/XML/TEI/Annotations/Collector.pm
+++ b/lib/KorAP/XML/TEI/Annotations/Collector.pm

@@ -12,6 +12,12 @@
 };
 
 
+# Create a dummy annotation that is not registered and therefore will not be added to output
+sub new_dummy_annotation {
+  my $self = shift;    # drop the invocant so only the caller's arguments reach the constructor
+  return KorAP::XML::TEI::Annotations::Annotation->new(@_);
+};
+
 # Add new annotation to annotation list
 sub add_new_annotation {
   my $self = shift;

diff --git a/lib/KorAP/XML/TEI/Tokenizer/External.pm b/lib/KorAP/XML/TEI/Tokenizer/External.pm
index b7d4c87..02d9ccd 100644
--- a/lib/KorAP/XML/TEI/Tokenizer/External.pm
+++ b/lib/KorAP/XML/TEI/Tokenizer/External.pm

@@ -33,12 +33,15 @@
   $sep //= "\x04\n";
 
   my $self = bless {
-    chld_in  => undef,
-    chld_out => undef,
-    pid      => undef,
-    cmd      => $cmd,
-    select   => undef,
-    sep      => $sep,
+      chld_in         => undef,
+      chld_out        => undef,
+      pid             => undef,
+      cmd             => $cmd,
+      select          => undef,
+      sep             => $sep,
+      sentence_split  => undef,
+      sentence_starts => [],
+      sentence_ends   => [],
   }, $class;
 
   # Initialize tokenizer
@@ -110,9 +113,20 @@
 
     my $out = $self->{chld_out};
     $_ = <$out>;
-
     my @bounds = split;
 
+    if ($self->{sentence_split}) {
+      # With sentence splitting enabled, the sentence boundaries are read from a second output line
+      $_ = <$out>;
+      my @sentence_bounds = split;
+
+      # Bounds come as (start, end) pairs: remember both for sentencize_from_previous_input
+      for (my $i = 0; $i < @sentence_bounds; $i +=  2 ) {
+        push @{$self->{sentence_starts}}, $sentence_bounds[$i];
+        push @{$self->{sentence_ends}}, $sentence_bounds[$i+1];
+      };
+    }
+
     # Serialize all bounds
     my $c = 0;
     for (my $i = 0; $i < @bounds; $i +=  2 ){
@@ -162,5 +176,21 @@
   };
 };
 
+# Turn the sentence bounds collected during tokenization into "s"
+# annotations (from/to taken from the stored bounds, level -1) on
+# $structures, then empty both lists of collected bounds.
+sub sentencize_from_previous_input {
+  my ($self, $structures) = @_;
+
+  for my $i (0 .. $#{$self->{sentence_starts}}) {
+    my $anno = $structures->add_new_annotation("s");
+    $anno->set_from($self->{sentence_starts}[$i]);
+    $anno->set_to($self->{sentence_ends}[$i]);
+    $anno->set_level(-1);
+  }
+  $self->{sentence_starts} = [];
+  $self->{sentence_ends} = [];
+}
+
 
 1;

diff --git a/lib/KorAP/XML/TEI/Tokenizer/KorAP.pm b/lib/KorAP/XML/TEI/Tokenizer/KorAP.pm
index 840e434..bbe38bd 100644
--- a/lib/KorAP/XML/TEI/Tokenizer/KorAP.pm
+++ b/lib/KorAP/XML/TEI/Tokenizer/KorAP.pm

@@ -26,8 +26,10 @@
 
 # Construct a new KorAP Tokenizer
 sub new {
-  my $class = shift;
-  my $self = $class->SUPER::new("$java -jar $tokenizer_jar --no-tokens --positions");
+  my ($class, $sentence_split) = @_;
+  my $self = $class->SUPER::new("$java -jar $tokenizer_jar --no-tokens --positions" .
+      ($sentence_split? " --sentence-boundaries" : ""));
+  $self->{sentence_split} = $sentence_split;
   $self->{name} = 'korap';
   $self->{sep} = "\x{04}\n";
   return bless $self, $class;

diff --git a/script/tei2korapxml b/script/tei2korapxml
index 690cf2d..f15376f 100755
--- a/script/tei2korapxml
+++ b/script/tei2korapxml

@@ -47,6 +47,7 @@
   'tokenizer-call|tc=s' => \(my $tokenizer_call), # Temporary argument for testing purposes
   'tokenizer-korap|tk' => \(my $tokenizer_korap), # use KorAP-tokenizer
   'tokenizer-internal|ti' => \(my $tokenizer_intern), # use intern tokenization (default = no)
+  'use-tokenizer-sentence-splits|s' => (\my $use_tokenizer_sentence_splits), # use KorAP tokenizer to split s (default=no)
   'log|l=s' => \(my $log_level = 'notice'),
   'help|h'    => sub {
     pod2usage(
@@ -89,13 +90,17 @@
 ## extern tokenization
 my $_GEN_TOK_EXT = $tokenizer_call || $tokenizer_korap ? 1 : 0;
 
+if ($use_tokenizer_sentence_splits && !$tokenizer_korap) {   # -s only works together with -tk
+  die $log->fatal("Sentence splitting is currently only supported by KorAP tokenizer (use -tk to activate it)");
+}
+
 my $ext_tok;
 if ($tokenizer_call) {
   $ext_tok = KorAP::XML::TEI::Tokenizer::External->new($tokenizer_call);
 }
 
 elsif ($tokenizer_korap) {
-  $ext_tok = KorAP::XML::TEI::Tokenizer::KorAP->new;
+  $ext_tok = KorAP::XML::TEI::Tokenizer::KorAP->new($use_tokenizer_sentence_splits);
 };
 my $_tok_file_ext  = "tokens.xml";
 ##
@@ -336,6 +341,10 @@
             $cons_tok->reset;
           };
 
+          if ($use_tokenizer_sentence_splits) {
+            $ext_tok->sentencize_from_previous_input($structures);
+          }
+
           # ~ write structures ~
           if (!$structures->empty) {
             $structures->to_zip(
@@ -469,6 +478,11 @@
   # (1 = topmost level inside retr_info() = should always be level of tag $_TEXT_BODY)
   my $rl = shift;
 
+  my $dummy_anno;
+  if ($use_tokenizer_sentence_splits) {
+    $dummy_anno = $structures->new_dummy_annotation();
+  }
+
   #  Notes on how 'XML::CompactTree::XS' works
   #
   #  Example: <node a="v"><node1>some <n/> text</node1><node2>more-text</node2></node>
@@ -570,8 +584,14 @@
 
       # ~ handle structures ~
 
+      my $anno;
+
       # $e->[1] represents the tag name
-      my $anno = $structures->add_new_annotation($e->[1]);
+      if ($use_tokenizer_sentence_splits && $e->[1] eq "s") {
+        $anno = $dummy_anno;
+      } else {
+        $anno = $structures->add_new_annotation($e->[1]);
+      }
 
       # ~ handle tokens ~
 
@@ -875,6 +895,11 @@
 that will take an I<Aggressive> and a I<conservative>
 approach.
 
+=item B<--use-tokenizer-sentence-splits|-s>
+
+Replace existing sentence boundary information with (or add new) boundaries
+provided by the tokenizer (currently only supported by the KorAP tokenizer, B<-tk>).
+
 =item B<--log|-l>
 
 Loglevel for I<Log::Any>. Defaults to C<notice>.

diff --git a/t/script.t b/t/script.t
index e5402dc..81067d9 100644
--- a/t/script.t
+++ b/t/script.t

@@ -175,6 +175,31 @@
     ->element_count_is('spanList span', 227);
 };
 
+# Run the converter with "-tk -s" and check selected <s> spans
+# (ids s25 and s30) in the resulting structure.xml.
+subtest 'Sentence split with KorAP tokenizer' => sub {
+
+  # Skip the whole subtest when the KorAP tokenizer module cannot be loaded
+  unless (eval { require KorAP::XML::TEI::Tokenizer::KorAP; 1 }) {
+    plan skip_all => "KorAP::XML::TEI::Tokenizer::KorAP cannot be used";
+  }
+
+  my $run = test_tei2korapxml(
+    file  => $file,
+    param => "-tk -s",
+    tmp   => 'script_sentence_split'
+  );
+
+  # Conversion must report the text id and produce a readable structure file
+  my $structure = $run
+    ->stderr_like(qr!tei2korapxml: .*? text_id=GOE_AGA\.00000!)
+    ->file_readable('GOE/AGA/00000/struct/structure.xml')
+    ->unzip_xml('GOE/AGA/00000/struct/structure.xml');
+
+  # Both spans are "s" annotations with l = -1 and the expected offsets
+  $structure
+    ->text_is('span#s25 fs f', 's')
+    ->attr_is('span#s25', 'l', -1)
+    ->attr_is('span#s25', 'to', 54)
+    ->text_is('span#s30 fs f', 's')
+    ->attr_is('span#s30', 'l', -1)
+    ->attr_is('span#s30', 'from', 1099)
+    ->attr_is('span#s30', 'to', 1266);
+};
 
 subtest 'Test Tokenizations' => sub {
 

diff --git a/t/tokenization-korap.t b/t/tokenization-korap.t
index a4c547e..0ca0719 100644
--- a/t/tokenization-korap.t
+++ b/t/tokenization-korap.t

@@ -17,12 +17,13 @@
   };
 }
 
+use_ok('KorAP::XML::TEI::Annotations::Collector');
 require_ok('KorAP::XML::TEI::Tokenizer::KorAP');
 
 my $f = dirname(__FILE__);
 my $cmd = catfile($f, 'cmd', 'tokenizer.pl');
 
-my $ext = KorAP::XML::TEI::Tokenizer::KorAP->new();
+my $ext = KorAP::XML::TEI::Tokenizer::KorAP->new(1);
 
 $ext->tokenize("Der alte Mann");
 my $str = $ext->to_string('unknown');
@@ -64,6 +65,13 @@
 $t->attr_is('layer spanList span:nth-child(14)', 'to', 92);
 $t->element_count_is('layer spanList span', 14);
 
+my $structures = KorAP::XML::TEI::Annotations::Collector->new;
+$ext->sentencize_from_previous_input($structures);
+$t = Test::XML::Loy->new($structures->[-1]->to_string(3));
+$t->attr_is('span', 'from', 6)
+  ->attr_is('span', 'to', 92)
+  ->attr_is('span', 'l', -1, "sentence splitting with korap tokenizer");
+
 $string = "Gefunden auf www.wikipedia.de";
 $ext->reset;
 $ext->tokenize($string);
commit	985da0cafd247c9d6bb891ae862f9c4ac324820d	[log] [tgz]
author	Marc Kupietz <kupietz@ids-mannheim.de>	Mon Feb 15 19:29:50 2021 +0100
committer	Marc Kupietz <kupietz@ids-mannheim.de>	Wed Feb 17 11:57:19 2021 +0100
tree	a0f7ea1889e4e2f62b35b39dbbebe60eab43884c
parent	f7084c4f4c5e24613cf7935be5eb2bf8c92ce804 [diff]