Add -s option to use sentence boundaries provided by KorAP tokenizer
Change-Id: Id3aaa50d7775256e336013cc0fbe56803c125052
diff --git a/lib/KorAP/XML/TEI/Tokenizer/External.pm b/lib/KorAP/XML/TEI/Tokenizer/External.pm
index b7d4c87..02d9ccd 100644
--- a/lib/KorAP/XML/TEI/Tokenizer/External.pm
+++ b/lib/KorAP/XML/TEI/Tokenizer/External.pm
@@ -33,12 +33,15 @@
$sep //= "\x04\n";
my $self = bless {
- chld_in => undef,
- chld_out => undef,
- pid => undef,
- cmd => $cmd,
- select => undef,
- sep => $sep,
+ chld_in => undef,
+ chld_out => undef,
+ pid => undef,
+ cmd => $cmd,
+ select => undef,
+ sep => $sep,
+ sentence_split => undef,
+ sentence_starts => [],
+ sentence_ends => [],
}, $class;
# Initialize tokenizer
@@ -110,9 +113,20 @@
my $out = $self->{chld_out};
$_ = <$out>;
-
my @bounds = split;
+ if ($self->{sentence_split}) {
+ # With sentence splitting enabled, the sentence boundaries are read from a second line
+ $_ = <$out>;
+ my @sentence_bounds = split;
+
+ # Store the (start, end) pairs until sentencize_from_previous_input() consumes them
+ for (my $i = 0; $i < @sentence_bounds; $i += 2 ) {
+ push @{$self->{sentence_starts}}, $sentence_bounds[$i];
+ push @{$self->{sentence_ends}}, $sentence_bounds[$i+1];
+ };
+ }
+
# Serialize all bounds
my $c = 0;
for (my $i = 0; $i < @bounds; $i += 2 ){
@@ -162,5 +176,22 @@
};
};
+# Add one "s" annotation (level -1) to $structures for every
+# sentence boundary pair collected while tokenizing the previous
+# input, then clear the collected boundaries.
+sub sentencize_from_previous_input {
+ my ($self, $structures) = @_;
+
+ for (my $i=0; $i < @{$self->{sentence_starts}}; $i++) {
+ my $anno = $structures->add_new_annotation("s");
+ $anno->set_from($self->{sentence_starts}[$i]);
+ $anno->set_to($self->{sentence_ends}[$i]);
+ $anno->set_level(-1);
+ }
+ # Reset both lists so the next input does not inherit these boundaries
+ $self->{sentence_starts} = [];
+ $self->{sentence_ends} = [];
+}
+
1;
diff --git a/lib/KorAP/XML/TEI/Tokenizer/KorAP.pm b/lib/KorAP/XML/TEI/Tokenizer/KorAP.pm
index 840e434..bbe38bd 100644
--- a/lib/KorAP/XML/TEI/Tokenizer/KorAP.pm
+++ b/lib/KorAP/XML/TEI/Tokenizer/KorAP.pm
@@ -26,8 +26,10 @@
# Construct a new KorAP Tokenizer
sub new {
- my $class = shift;
- my $self = $class->SUPER::new("$java -jar $tokenizer_jar --no-tokens --positions");
+ my ($class, $sentence_split) = @_;
+ my $self = $class->SUPER::new("$java -jar $tokenizer_jar --no-tokens --positions" .
+ ($sentence_split? " --sentence-boundaries" : ""));
+ $self->{sentence_split} = $sentence_split;
$self->{name} = 'korap';
$self->{sep} = "\x{04}\n";
return bless $self, $class;