Add -s option to use sentence boundaries provided by KorAP tokenizer

Change-Id: Id3aaa50d7775256e336013cc0fbe56803c125052
diff --git a/lib/KorAP/XML/TEI/Tokenizer/External.pm b/lib/KorAP/XML/TEI/Tokenizer/External.pm
index b7d4c87..02d9ccd 100644
--- a/lib/KorAP/XML/TEI/Tokenizer/External.pm
+++ b/lib/KorAP/XML/TEI/Tokenizer/External.pm
@@ -33,12 +33,15 @@
   $sep //= "\x04\n";
 
   my $self = bless {
-    chld_in  => undef,
-    chld_out => undef,
-    pid      => undef,
-    cmd      => $cmd,
-    select   => undef,
-    sep      => $sep,
+      chld_in         => undef,
+      chld_out        => undef,
+      pid             => undef,
+      cmd             => $cmd,
+      select          => undef,
+      sep             => $sep,
+      sentence_split  => undef,
+      sentence_starts => [],
+      sentence_ends   => [],
   }, $class;
 
   # Initialize tokenizer
@@ -110,9 +113,20 @@
 
     my $out = $self->{chld_out};
     $_ = <$out>;
-
     my @bounds = split;
 
+    if ($self->{sentence_split}) {
+      # The tokenizer prints the sentence boundaries on a second line
+      $_ = <$out>;
+      my @sentence_bounds = split;
+
+      # Remember the (start, end) pairs for sentencize_from_previous_input
+      for (my $i = 0; $i < @sentence_bounds; $i +=  2 ) {
+        push @{$self->{sentence_starts}}, $sentence_bounds[$i];
+        push @{$self->{sentence_ends}}, $sentence_bounds[$i+1];
+      };
+    }
+
     # Serialize all bounds
     my $c = 0;
     for (my $i = 0; $i < @bounds; $i +=  2 ){
@@ -162,5 +176,21 @@
   };
 };
 
+# Add one "s" annotation (level -1) to $structures for every sentence
+# boundary pair collected while reading the tokenizer output, then
+# clear the collected boundaries.
+sub sentencize_from_previous_input {
+  my ($self, $structures) = @_;
+
+  for (my $i=0; $i < @{$self->{sentence_starts}}; $i++) {
+    my $anno = $structures->add_new_annotation("s");
+    $anno->set_from($self->{sentence_starts}[$i]);
+    $anno->set_to($self->{sentence_ends}[$i]);
+    $anno->set_level(-1);
+  }
+  $self->{sentence_starts} = [];
+  $self->{sentence_ends} = [];
+}
+
 
 1;
diff --git a/lib/KorAP/XML/TEI/Tokenizer/KorAP.pm b/lib/KorAP/XML/TEI/Tokenizer/KorAP.pm
index 840e434..bbe38bd 100644
--- a/lib/KorAP/XML/TEI/Tokenizer/KorAP.pm
+++ b/lib/KorAP/XML/TEI/Tokenizer/KorAP.pm
@@ -26,8 +26,10 @@
 
 # Construct a new KorAP Tokenizer
 sub new {
-  my $class = shift;
-  my $self = $class->SUPER::new("$java -jar $tokenizer_jar --no-tokens --positions");
+  my ($class, $sentence_split) = @_;
+  my $self = $class->SUPER::new("$java -jar $tokenizer_jar --no-tokens --positions" .
+      ($sentence_split? " --sentence-boundaries" : ""));
+  $self->{sentence_split} = $sentence_split;
   $self->{name} = 'korap';
   $self->{sep} = "\x{04}\n";
   return bless $self, $class;