Added add_span() method to MultiTermToken
Change-Id: Iabe067079c47ac49995cfc66d18f7d78768bc25e
diff --git a/Changes b/Changes
index a28b5e6..2e14f56 100644
--- a/Changes
+++ b/Changes
@@ -9,6 +9,7 @@
- Remove MultiTerm->add() in favor of
MultiTerm->add_by_term().
- Optimization by reducing calls to _offset().
+ - Introduced add_span() method to MultiTermToken.
0.40 2020-03-03
- Fixed XIP parser.
diff --git a/lib/KorAP/XML/Annotation/Base/Paragraphs.pm b/lib/KorAP/XML/Annotation/Base/Paragraphs.pm
index 922df0c..3f18360 100644
--- a/lib/KorAP/XML/Annotation/Base/Paragraphs.pm
+++ b/lib/KorAP/XML/Annotation/Base/Paragraphs.pm
@@ -10,13 +10,9 @@
layer => 'paragraph',
cb => sub {
my ($stream, $span) = @_;
- my $mtt = $stream->pos($span->get_p_start);
- my $mt = $mtt->add_by_term('<>:base/s:p');
- $mt->set_o_start($span->get_o_start);
- $mt->set_o_end($span->get_o_end);
- $mt->set_p_end($span->get_p_end);
- $mt->set_payload('<b>1');
- $mt->set_pti(64);
+ $stream->pos($span->get_p_start)
+ ->add_span('<>:base/s:p', $span)
+ ->set_payload('<b>1');
$i++;
}
) or return;
diff --git a/lib/KorAP/XML/Annotation/Base/Sentences.pm b/lib/KorAP/XML/Annotation/Base/Sentences.pm
index 9ccdca6..e5b586f 100644
--- a/lib/KorAP/XML/Annotation/Base/Sentences.pm
+++ b/lib/KorAP/XML/Annotation/Base/Sentences.pm
@@ -5,26 +5,14 @@
my $self = shift;
my $i = 0;
- my ($first, $last_p, $last_o);
-
$$self->add_spandata(
foundry => 'base',
layer => 'sentences',
cb => sub {
my ($stream, $span) = @_;
- my $mtt = $stream->pos($span->get_p_start);
-
- $first = [$span->get_p_start, $span->get_o_start] unless defined $first;
-
- my $mt = $mtt->add_by_term('<>:base/s:s');
- $mt->set_o_start($span->get_o_start);
- $mt->set_o_end($span->get_o_end);
- $mt->set_p_end($span->get_p_end);
- $mt->set_payload('<b>2');
- $mt->set_pti(64);
-
- $last_p = $span->get_p_end;
- $last_o = $span->get_o_end;
+ $stream->pos($span->get_p_start)
+ ->add_span('<>:base/s:s', $span)
+ ->set_payload('<b>2');
$i++;
}
) or return;
diff --git a/lib/KorAP/XML/Annotation/Connexor/Phrase.pm b/lib/KorAP/XML/Annotation/Connexor/Phrase.pm
index e5f8012..f1c5f76 100644
--- a/lib/KorAP/XML/Annotation/Connexor/Phrase.pm
+++ b/lib/KorAP/XML/Annotation/Connexor/Phrase.pm
@@ -17,13 +17,9 @@
my $type = $content->{'#text'};
if ($type) {
- my $mt = $stream->pos($span->get_p_start)
- ->add_by_term('<>:cnx/c:' . $type);
- $mt->set_o_start($span->get_o_start);
- $mt->set_o_end($span->get_o_end);
- $mt->set_p_end($span->get_p_end);
- $mt->set_pti(64);
- $mt->set_payload('<b>0'); # Pseudo-depth
+ $stream->pos($span->get_p_start)
+ ->add_span('<>:cnx/c:' . $type, $span)
+ ->set_payload('<b>0'); # Pseudo-depth
};
}
) or return;
diff --git a/lib/KorAP/XML/Annotation/Connexor/Sentences.pm b/lib/KorAP/XML/Annotation/Connexor/Sentences.pm
index 71b1e08..b467e46 100644
--- a/lib/KorAP/XML/Annotation/Connexor/Sentences.pm
+++ b/lib/KorAP/XML/Annotation/Connexor/Sentences.pm
@@ -10,13 +10,9 @@
layer => 'sentences',
cb => sub {
my ($stream, $span) = @_;
- my $mt = $stream->pos($span->get_p_start)
- ->add_by_term('<>:cnx/s:s');
- $mt->set_o_start($span->get_o_start);
- $mt->set_o_end($span->get_o_end);
- $mt->set_p_end($span->get_p_end);
- $mt->set_pti(64);
- $mt->set_payload('<b>0');
+ $stream->pos($span->get_p_start)
+ ->add_span('<>:cnx/s:s', $span)
+ ->set_payload('<b>0');
$i++;
}
) or return;
diff --git a/lib/KorAP/XML/Annotation/CoreNLP/Constituency.pm b/lib/KorAP/XML/Annotation/CoreNLP/Constituency.pm
index 3f33802..3467ae4 100644
--- a/lib/KorAP/XML/Annotation/CoreNLP/Constituency.pm
+++ b/lib/KorAP/XML/Annotation/CoreNLP/Constituency.pm
@@ -1,6 +1,7 @@
package KorAP::XML::Annotation::CoreNLP::Constituency;
use KorAP::XML::Annotation::Base;
use Set::Scalar;
+use feature 'current_sub';
sub parse {
my $self = shift;
@@ -59,14 +60,8 @@
my $type = $f->{'#text'} or return;
# $type is now NPA, NP, NUM ...
- my $term = $mtt->add_by_term('<>:corenlp/c:' . $type);
- $term->set_o_start($span->get_o_start);
- $term->set_o_end($span->get_o_end);
- $term->set_p_end($span->get_p_end);
- $term->set_pti(64);
- $term->set_payload('<b>' . ($level // 0));
-
- my $this = $add_const;
+ $mtt->add_span('<>:corenlp/c:' . $type, $span)
+ ->set_payload('<b>' . ($level // 0));
my $rel = $content->{rel} or return;
$rel = [$rel] unless ref $rel eq 'ARRAY';
@@ -76,7 +71,7 @@
my $subspan = delete $corenlp_const{$_->{-target}} or return;
# This will be called recursively
- $this->($subspan, $level + 1);
+ __SUB__->($subspan, $level + 1);
};
};
diff --git a/lib/KorAP/XML/Annotation/CoreNLP/Sentences.pm b/lib/KorAP/XML/Annotation/CoreNLP/Sentences.pm
index 37fb19a..31ef18c 100644
--- a/lib/KorAP/XML/Annotation/CoreNLP/Sentences.pm
+++ b/lib/KorAP/XML/Annotation/CoreNLP/Sentences.pm
@@ -10,13 +10,9 @@
layer => 'sentences',
cb => sub {
my ($stream, $span) = @_;
- my $mtt = $stream->pos($span->get_p_start);
- my $mt = $mtt->add_by_term('<>:corenlp/s:s');
- $mt->set_o_start($span->get_o_start);
- $mt->set_o_end($span->get_o_end);
- $mt->set_p_end($span->get_p_end);
- $mt->set_pti(64);
- $mt->set_payload('<b>0'); # Could also be 2 for t/p/s
+ $stream->pos($span->get_p_start)
+ ->add_span('<>:corenlp/s:s', $span)
+ ->set_payload('<b>0'); # Could also be 2 for t/p/s
$i++;
}
) or return;
diff --git a/lib/KorAP/XML/Annotation/OpenNLP/Sentences.pm b/lib/KorAP/XML/Annotation/OpenNLP/Sentences.pm
index cf496c2..47aca86 100644
--- a/lib/KorAP/XML/Annotation/OpenNLP/Sentences.pm
+++ b/lib/KorAP/XML/Annotation/OpenNLP/Sentences.pm
@@ -10,13 +10,9 @@
layer => 'sentences',
cb => sub {
my ($stream, $span) = @_;
- my $mt = $stream->pos($span->get_p_start)
- ->add_by_term('<>:opennlp/s:s');
- $mt->set_o_start($span->get_o_start);
- $mt->set_o_end($span->get_o_end);
- $mt->set_p_end($span->get_p_end);
- $mt->set_pti(64);
- $mt->set_payload('<b>0');
+ $stream->pos($span->get_p_start)
+ ->add_span('<>:opennlp/s:s', $span)
+ ->set_payload('<b>0');
$i++;
}
) or return;
diff --git a/lib/KorAP/XML/Annotation/TreeTagger/Sentences.pm b/lib/KorAP/XML/Annotation/TreeTagger/Sentences.pm
index bdfcbc5..b7ca230 100644
--- a/lib/KorAP/XML/Annotation/TreeTagger/Sentences.pm
+++ b/lib/KorAP/XML/Annotation/TreeTagger/Sentences.pm
@@ -10,13 +10,9 @@
layer => 'sentences',
cb => sub {
my ($stream, $span) = @_;
- my $mtt = $stream->pos($span->get_p_start);
- my $mt = $mtt->add_by_term('<>:tt/s:s');
- $mt->set_o_start($span->get_o_start);
- $mt->set_o_end($span->get_o_end);
- $mt->set_p_end($span->get_p_end);
- $mt->set_pti(64);
- $mt->set_payload('<b>0'); # Could be 2 as well t/p/s
+ $stream->pos($span->get_p_start)
+ ->add_span('<>:tt/s:s',$span)
+ ->set_payload('<b>0'); # Could be 2 as well t/p/s
$i++;
}
) or return;
diff --git a/lib/KorAP/XML/Annotation/XIP/Sentences.pm b/lib/KorAP/XML/Annotation/XIP/Sentences.pm
index cc1474c..afb99e6 100644
--- a/lib/KorAP/XML/Annotation/XIP/Sentences.pm
+++ b/lib/KorAP/XML/Annotation/XIP/Sentences.pm
@@ -13,13 +13,9 @@
cb => sub {
my ($stream, $span) = @_;
- my $mt = $stream->pos($span->get_p_start)
- ->add_by_term('<>:xip/s:s');
- $mt->set_o_start($span->get_o_start);
- $mt->set_o_end($span->get_o_end);
- $mt->set_p_end($span->get_p_end);
- $mt->set_pti(64);
- $mt->set_payload('<b>0'); # Could be 2 as well for t/p/s
+ $stream->pos($span->get_p_start)
+ ->add_span('<>:xip/s:s', $span)
+ ->set_payload('<b>0'); # Could be 2 as well for t/p/s
$i++;
}
) or return;
diff --git a/lib/KorAP/XML/Index/MultiTerm.pm b/lib/KorAP/XML/Index/MultiTerm.pm
index db75a38..b72705e 100644
--- a/lib/KorAP/XML/Index/MultiTerm.pm
+++ b/lib/KorAP/XML/Index/MultiTerm.pm
@@ -3,25 +3,23 @@
use warnings;
use MIME::Base64;
-# Todo: This should store only the pti and the payload - with clever access using the pti!
-# Everything should be stored as bytes already (if this is feasible)
-
use constant {
TERM => 0,
O_START => 1,
O_END => 2,
P_START => 3,
P_END => 4,
- STORED_OFFSETS => 5,
- PTI => 6,
- TUI => 7,
- PAYLOAD => 8,
+ PTI => 5,
+ TUI => 6,
+ PAYLOAD => 7,
+ STORED_OFFSETS => 8,
};
# Construct a multiterm object by passing a term
sub new {
- bless [$_[1]], $_[0];
+ my $class = shift;
+ bless [@_], $class;
};
sub set_payload {
diff --git a/lib/KorAP/XML/Index/MultiTermToken.pm b/lib/KorAP/XML/Index/MultiTermToken.pm
index 35f7d7a..55ee251 100644
--- a/lib/KorAP/XML/Index/MultiTermToken.pm
+++ b/lib/KorAP/XML/Index/MultiTermToken.pm
@@ -21,6 +21,22 @@
bless [[]], shift;
};
+
+# Add a span annotation to this token and return the new MultiTerm.
+# $term is the term string; $span supplies the offsets and end position.
+# Getters are forced into scalar context because the constructor is
+# positional: an empty-list return would otherwise shift later fields.
+sub add_span {
+  my ($self, $term, $span) = @_;
+  my $mt = KorAP::XML::Index::MultiTerm->new(
+    $term, scalar($span->get_o_start), scalar($span->get_o_end),
+    undef,                            # p_start is left unset here
+    scalar($span->get_p_end), 64,     # p_end, pti
+  );
+  push(@{$self->[MT]}, $mt);
+  return $mt;
+};
+
sub add_by_term {
my $mt = KorAP::XML::Index::MultiTerm->new($_[1]);
push(@{$_[0]->[MT]}, $mt);
diff --git a/lib/KorAP/XML/Tokenizer/Units.pm b/lib/KorAP/XML/Tokenizer/Units.pm
index 322df2c..48c83c1 100644
--- a/lib/KorAP/XML/Tokenizer/Units.pm
+++ b/lib/KorAP/XML/Tokenizer/Units.pm
@@ -271,7 +271,7 @@
return if !$to;
return unless $to > $from;
- $from ||= 0;
+ $from //= 0;
my $pos = $self->match->lookup($from, $to);