Fixed tiny offset issue for documents ending with non-tokens
diff --git a/lib/KorAP/Document.pm b/lib/KorAP/Document.pm
index 59a1022..5b4ef73 100644
--- a/lib/KorAP/Document.pm
+++ b/lib/KorAP/Document.pm
@@ -19,9 +19,9 @@
my $self = shift;
my $file = b($self->path . 'data.xml')->slurp;
- state $unable = 'Unable to parse document';
+ my $unable = 'Unable to parse document ' . $self->path; # "my", not "state": a state variable is initialised only once and would keep the first document's path
- $self->log->trace('Parse document ' . $self->path);
+ $self->log->debug('Parse document ' . $self->path);
my $dom = Mojo::DOM->new($file);
@@ -34,11 +34,11 @@
$self->corpus_id($1);
}
else {
- croak $unable;
+ croak $unable . ': ID not parseable';
};
}
else {
- croak $unable;
+ croak $unable . ': No raw_text found or no ID';
};
# Get primary data
diff --git a/lib/KorAP/Index/Base/Sentences.pm b/lib/KorAP/Index/Base/Sentences.pm
new file mode 100644
index 0000000..16e4673
--- /dev/null
+++ b/lib/KorAP/Index/Base/Sentences.pm
@@ -0,0 +1,29 @@
+package KorAP::Index::Base::Sentences;
+use KorAP::Index::Base;
+
+# Add a '<>:s' term per base/sentences span and store the count; returns 1, or nothing if add_spandata is false.
+sub parse {
+  my $self = shift;
+  my $sentence_count = 0;
+  # Invoked with ($stream, $span); the counter bump stays the last statement
+  my $on_span = sub {
+    my ($stream, $span) = @_;
+    $stream->pos($span->p_start)->add(
+      term    => '<>:s',
+      o_start => $span->o_start,
+      o_end   => $span->o_end,
+      p_end   => $span->p_end
+    );
+    $sentence_count++;
+  };
+  return unless $$self->add_spandata(
+    foundry => 'base',
+    layer   => 'sentences',
+    cb      => $on_span
+  );
+  # Expose the total as stream metadata, e.g. '<i>3' for three spans
+  $$self->stream->add_meta('sentences', '<i>' . $sentence_count);
+  return 1;
+};
+
+1;
diff --git a/lib/KorAP/Tokenizer.pm b/lib/KorAP/Tokenizer.pm
index 3b83b5f..3f2c7f2 100644
--- a/lib/KorAP/Tokenizer.pm
+++ b/lib/KorAP/Tokenizer.pm
@@ -49,6 +49,8 @@
my $to = $span->attr('to');
my $token = $doc->primary->data($from, $to);
+ # warn 'Has ' . $from . '->' . $to . "($old)";
+
unless (defined $token) {
$self->log->error("Unable to find substring [$from-$to] in $path");
return;
@@ -89,7 +91,7 @@
# Add token count
$mtts->add_meta('tokens', '<i>' . $have);
- $range->gap($old, $doc->primary->data_length, $have-1) if $doc->primary->data_length >= $old;
+ $range->gap($old, $doc->primary->data_length + 1, $have-1) if $doc->primary->data_length >= ($old - 1);
# Add info
$self->stream($mtts);
diff --git a/lib/KorAP/Tokenizer/Range.pm b/lib/KorAP/Tokenizer/Range.pm
index 110fbc6..d129dc3 100644
--- a/lib/KorAP/Tokenizer/Range.pm
+++ b/lib/KorAP/Tokenizer/Range.pm
@@ -11,11 +11,13 @@
sub set {
my $self = shift;
+ # Disabled debugging aid that would print the arguments: warn 'Set range: ', join(',', @_);
$$self->set_range(@_);
};
sub gap {
my $self = shift;
+ # Stored value has the form "!<n-1>:<n>", n being the third argument. Disabled debugging aid: warn 'Gap range: ', join(',', @_);
$$self->set_range($_[0], $_[1], '!' . ($_[2] - 1) . ':' . $_[2]);
};
@@ -31,6 +33,7 @@
my $found = $$self->lookup( $offset );
unless (defined $found) {
warn 'There is no value for ', $offset;
+ return;
};
if ($found =~ /!(\d+):(\d+)$/) {
diff --git a/script/prepare_index.pl b/script/prepare_index.pl
index 8acdc80..17a5470 100644
--- a/script/prepare_index.pl
+++ b/script/prepare_index.pl
@@ -105,7 +105,6 @@
my $doc = KorAP::Document->new( path => $input );
$doc->parse;
-
my ($token_base_foundry, $token_base_layer) = (qw/OpenNLP Tokens/);
if ($token_base) {
($token_base_foundry, $token_base_layer) = split /#/, $token_base;