Fixed sentence bug in base
Change-Id: Ifa3a13d803049d82160c7e2ffe46e74f58531572
diff --git a/Changes b/Changes
index f4f2d5e..2d56a7f 100644
--- a/Changes
+++ b/Changes
@@ -1,7 +1,9 @@
-0.12 2016-02-27
+0.12 2016-02-28
- Added extract method to korapxml2krill.
- Fixed Mate/Dependency.
- Fixed skip flag in korapxml2krill.
+ - Ignore spans outside the token range
+ (i.e. character offsets end before tokens have started).
0.11 2016-02-23
- Merged korap2krill and korap2krill_dir.
diff --git a/lib/KorAP/XML/Index/Base/Sentences.pm b/lib/KorAP/XML/Index/Base/Sentences.pm
index 5766e8b..6b77d98 100644
--- a/lib/KorAP/XML/Index/Base/Sentences.pm
+++ b/lib/KorAP/XML/Index/Base/Sentences.pm
@@ -13,6 +13,7 @@
cb => sub {
my ($stream, $span) = @_;
my $mtt = $stream->pos($span->p_start);
+
$first = [$span->p_start, $span->o_start] unless defined $first;
$mtt->add(
term => '<>:base/s:s',
diff --git a/lib/KorAP/XML/Tokenizer.pm b/lib/KorAP/XML/Tokenizer.pm
index e57527a..27929ac 100644
--- a/lib/KorAP/XML/Tokenizer.pm
+++ b/lib/KorAP/XML/Tokenizer.pm
@@ -294,10 +294,12 @@
$param{primary} = $self->doc->primary;
+ # Todo: Match and range may be part of stream!
my $spans = KorAP::XML::Tokenizer::Spans->new(
path => $self->path,
range => $self->range,
match => $self->match,
+ stream => $self->stream,
%param
);
@@ -347,6 +349,7 @@
path => $self->path,
range => $self->range,
match => $self->match,
+ stream => $self->stream,
%param
);
@@ -386,7 +389,7 @@
my $layer = shift;
unless ($foundry && $layer) {
- warn 'Unable to add specific module - not enough information given!';
+ $self->log->warn('Unable to add specific module - not enough information given!');
return;
};
diff --git a/lib/KorAP/XML/Tokenizer/Spans.pm b/lib/KorAP/XML/Tokenizer/Spans.pm
index 10ba474..c144bf5 100644
--- a/lib/KorAP/XML/Tokenizer/Spans.pm
+++ b/lib/KorAP/XML/Tokenizer/Spans.pm
@@ -20,6 +20,7 @@
};
+# Parse span file
sub parse {
my $self = shift;
my $path = $self->path . $self->foundry . '/' . $self->layer . '.xml';
@@ -33,14 +34,14 @@
my ($spans, $error);
try {
- local $SIG{__WARN__} = sub {
- $error = 1;
- };
- $spans = xml2hash($file, text => '#text', attr => '-', array => ['span'])->{layer}->{spanList};
+ local $SIG{__WARN__} = sub {
+ $error = 1;
+ };
+ $spans = xml2hash($file, text => '#text', attr => '-', array => ['span'])->{layer}->{spanList};
}
catch {
- $self->log->warn('Span error in ' . $path . ($_ ? ': ' . $_ : ''));
- $error = 1;
+ $self->log->warn('Span error in ' . $path . ($_ ? ': ' . $_ : ''));
+ $error = 1;
};
return if $error;
diff --git a/lib/KorAP/XML/Tokenizer/Units.pm b/lib/KorAP/XML/Tokenizer/Units.pm
index 87f44d5..6b75e42 100644
--- a/lib/KorAP/XML/Tokenizer/Units.pm
+++ b/lib/KorAP/XML/Tokenizer/Units.pm
@@ -3,7 +3,7 @@
use KorAP::XML::Tokenizer::Token;
use Mojo::Base -base;
-has [qw/path foundry layer match range primary/];
+has [qw/path foundry layer match range primary stream/];
has 'should' => 0;
has 'have' => 0;
has 'encoding' => 'utf-8';
@@ -23,33 +23,48 @@
my $span = KorAP::XML::Tokenizer::Span->new;
+
# The span is a milestone
if ($from == $to) {
$span->milestone(1);
};
+ # The span has an id (probably useful)
$span->id($s->{-id}) if $s && $s->{-id};
+ # Set character offsets
$span->o_start($from);
$span->o_end($to);
+ # Get start position (exactly)
my $start = $self->match->startswith($span->o_start);
unless (defined $start) {
- $start = $self->range->after($span->o_start) or return;
+ $start = $self->range->after($span->o_start);
+ return unless defined $start;
};
+ # Set start token position to span
$span->p_start($start);
if ($span->milestone) {
$span->p_end($start);
}
else {
+
+ # Get end position (exactly)
my $end = $self->match->endswith($span->o_end);
unless (defined $end) {
$end = $self->range->before($span->o_end);
return unless defined $end;
+
+ # The next token of end has a character
+ # offset AFTER the given end character offset
+ my $real_start = $self->stream->pos($end)->o_start;
+
+ # Ignore non-milestone elements outside the token stream!
+ return if $to <= $real_start;
};
# $span->p_end($end);
@@ -57,6 +72,7 @@
# EXPERIMENTAL:
return unless $end >= $span->p_start;
+
$span->p_end($end + 1);
}
diff --git a/t/sgbr/base.t b/t/sgbr/base.t
new file mode 100644
index 0000000..6eb7d62
--- /dev/null
+++ b/t/sgbr/base.t
@@ -0,0 +1,39 @@
+use strict;
+use warnings;
+use Test::More;
+use File::Basename 'dirname';
+use File::Spec::Functions 'catdir';
+use Data::Dumper;
+use KorAP::XML::Tokenizer;
+use KorAP::XML::Krill;
+use utf8;
+
+my $path = catdir(dirname(__FILE__), 'CMC-TSK', '2014-09', 3401);
+
+ok(my $doc = KorAP::XML::Krill->new(
+ path => $path . '/'
+), 'Create Document');
+
+ok($doc->parse, 'Parse document');
+
+ok(my $tokens = KorAP::XML::Tokenizer->new(
+ path => $doc->path,
+ doc => $doc,
+ foundry => 'Sgbr',
+ layer => 'Lemma',
+ name => 'tokens'
+), 'Create tokens based on lemmata');
+
+ok($tokens->parse, 'Parse tokenization based on lemmata');
+
+ok($tokens->add('Base', 'Sentences'), 'Add Sentences');
+
+my $stream = $tokens->to_data->{data}->{stream};
+
+is($stream->[0]->[0], '-:base/sentences$<i>1');
+is($stream->[0]->[1], '-:tokens$<i>15');
+is($stream->[0]->[2], '<>:base/s:t$<b>64<i>0<i>115<i>14<b>0');
+is($stream->[0]->[3], '<>:base/s:s$<b>64<i>16<i>114<i>14<b>2');
+is($stream->[0]->[4], '_0$<i>17<i>18');
+
+done_testing;