Fixed payloads, sorted tokens, major speed improvements
diff --git a/lib/KorAP/Field/MultiTerm.pm b/lib/KorAP/Field/MultiTerm.pm
index 8210c56..0f93787 100644
--- a/lib/KorAP/Field/MultiTerm.pm
+++ b/lib/KorAP/Field/MultiTerm.pm
@@ -1,11 +1,122 @@
package KorAP::Field::MultiTerm;
-use Mojo::Base -base;
+use strict;
+use warnings;
use MIME::Base64;
-has [qw/p_start p_end o_start o_end term payload/];
-has store_offsets => 1;
+sub new {
+ my $self = bless [], shift;
+ my $i = 0;
+ for (; $i < scalar @_; $i+=2) {
+ if ($_[$i] eq 'term') {
+ $self->term($_[$i+1]);
+ }
+ elsif ($_[$i] eq 'p_start') {
+ $self->p_start($_[$i+1]);
+ }
+ elsif ($_[$i] eq 'p_end') {
+ $self->p_end($_[$i+1]);
+ }
+ elsif ($_[$i] eq 'payload') {
+ $self->payload($_[$i+1]);
+ }
+ elsif ($_[$i] eq 'store_offsets') {
+ $self->store_offsets($_[$i+1]);
+ }
+ elsif ($_[$i] eq 'o_start') {
+ $self->o_start($_[$i+1]);
+ }
+ elsif ($_[$i] eq 'o_end') {
+ $self->o_end($_[$i+1]);
+ };
+ };
+ $self;
+};
+# 0
+sub payload {
+ if (defined $_[1]) {
+ return $_[0]->[0] = $_[1];
+ };
+ $_[0]->[0];
+};
+
+# 1
+sub p_start {
+ if (defined $_[1]) {
+ return $_[0]->[1] = $_[1];
+ };
+ $_[0]->[1];
+};
+
+# 2
+sub p_end {
+ if (defined $_[1]) {
+ return $_[0]->[2] = $_[1];
+ };
+ $_[0]->[2];
+};
+
+# 3
+sub o_start {
+ if (defined $_[1]) {
+ return $_[0]->[3] = $_[1];
+ };
+ $_[0]->[3];
+};
+
+# 4
+sub o_end {
+ if (defined $_[1]) {
+ return $_[0]->[4] = $_[1];
+ };
+ $_[0]->[4];
+};
+
+# 5
+sub term {
+ if (defined $_[1]) {
+ return $_[0]->[5] = $_[1];
+ };
+ $_[0]->[5];
+};
+
+# 6
+sub store_offsets {
+ if (defined $_[1]) {
+ return $_[0]->[6] = $_[1];
+ };
+ $_[0]->[6];
+};
+
+
+# to string based on array
sub to_string {
+ my $string = $_[0]->[5];
+ if (defined $_[0]->[3]) {
+ $string .= '#' .$_[0]->[3] .'-' . $_[0]->[4];
+ };
+
+ my $pl = $_[0]->[1] ? $_[0]->[1] - 1 : $_[0]->[0];
+ if ($_[0]->[2] || $_[0]->[0]) {
+ $string .= '$';
+ if ($_[0]->[2]) {
+ $string .= '<i>' . $_[0]->[2];
+ };
+ if ($_[0]->[0]) {
+ if (index($_[0]->[0], '<') == 0) {
+ $string .= $_[0]->[0];
+ }
+ else {
+ $string .= '<?>' . $_[0]->[0];
+ };
+ };
+ };
+
+ $string;
+};
+
+
+sub to_string_2 {
my $self = shift;
my $string = $self->term;
if (defined $self->o_start) {
@@ -31,6 +142,9 @@
return $string;
};
+
+
+
sub to_solr {
my $self = shift;
my $increment = shift;
diff --git a/lib/KorAP/Field/MultiTermToken.pm b/lib/KorAP/Field/MultiTermToken.pm
index 0d8742b..fb83c1a 100644
--- a/lib/KorAP/Field/MultiTermToken.pm
+++ b/lib/KorAP/Field/MultiTermToken.pm
@@ -1,56 +1,196 @@
package KorAP::Field::MultiTermToken;
use KorAP::Field::MultiTerm;
-use Mojo::Base -base;
use List::MoreUtils 'uniq';
+use strict;
+use warnings;
-has [qw/o_start o_end/];
+# This tries to be highly optimized - it's not supposed to be readable
+
+sub new {
+ bless [], shift;
+};
sub add {
my $self = shift;
my $mt;
unless (ref $_[0] eq 'MultiTerm') {
if (@_ == 1) {
- $mt = KorAP::Field::MultiTerm->new(term => shift());
+ $mt = KorAP::Field::MultiTerm->new(term => $_[0]);
}
else {
$mt = KorAP::Field::MultiTerm->new(@_);
};
}
else {
- $mt = shift;
+ $mt = $_[0];
};
- $self->{mt} //= [];
- push(@{$self->{mt}}, $mt);
- return $mt;
+ $self->[0] //= [];
+ push(@{$self->[0]}, $mt);
+ $mt;
};
-# Return a new term id
+# 0 -> mt
+
+# 1
+sub o_start {
+ if (defined $_[1]) {
+ return $_[0]->[1] = $_[1];
+ };
+ $_[0]->[1];
+};
+
+# 2
+sub o_end {
+ if (defined $_[1]) {
+ return $_[0]->[2] = $_[1];
+ };
+ $_[0]->[2];
+};
+
+# 3: Return a new term id
sub id_counter {
- $_[0]->{id_counter} //= 1;
- return $_[0]->{id_counter}++;
+ $_[0]->[3] //= 1;
+ return $_[0]->[3]++;
};
-
sub surface {
- substr($_[0]->{mt}->[0]->term,2);
+ substr($_[0]->[0]->[0]->term,2);
};
sub lc_surface {
- substr($_[0]->{mt}->[1]->term,2);
+ substr($_[0]->[0]->[1]->term,2);
};
+sub to_array {
+ my $self = shift;
+ [uniq(map($_->to_string, sort _sort @{$self->[0]}))];
+};
+
+
sub to_string {
my $self = shift;
my $string = '[(' . $self->o_start . '-'. $self->o_end . ')';
- $string .= join ('|', map($_->to_string, @{$self->{mt}}));
+ $string .= join ('|', @{$self->to_array});
$string .= ']';
return $string;
};
+# Get relation based positions
+sub _rel_right_pos {
+ # token to token - right token
+ if ($_[0] =~ m/^<i>(\d+)<s>/o) {
+ return ($1, $1);
+ }
+ # token/span to span - right token
+ elsif ($_[0] =~ m/^<i>(\d+)<i>(\d+)<s>/o) {
+ return ($1, $2);
+ }
+ # span to token - right token
+ elsif ($_[0] =~ m/^<b>\d+<i>(\d+)<s>/o) {
+ return ($1, $1);
+ };
+ warn 'Unknown relation format!';
+ return (0,0);
+};
-sub to_array {
- my $self = shift;
- [uniq(map($_->to_string, @{$self->{mt}}))];
+# Sort spans, attributes and relations
+sub _sort {
+
+ # Both are no spans
+ if (index($a->[5], '<>:') != 0 && index($b->[5], '<>:') != 0) {
+
+ # Both are attributes
+ # Order attributes by reference id
+ if (index($a->[5], '@:') == 0 && index($b->[5], '@:') == 0) {
+ my ($a_id) = ($a->[0] =~ m/^<s>(\d+)/);
+ my ($b_id) = ($b->[0] =~ m/^<s>(\d+)/);
+ if ($a_id > $b_id) {
+ return 1;
+ }
+ elsif ($a_id < $b_id) {
+ return -1;
+ }
+ else {
+ return 1;
+ };
+ }
+
+ # Both are relations
+ elsif (
+ (index($a->[5],'<:') == 0 || index($a->[5],'>:') == 0) &&
+ (index($b->[5], '<:') == 0 || index($b->[5],'>:') == 0)) {
+ my $a_end = $a->[2] // 0;
+ my $b_end = $b->[2] // 0;
+
+ # left is p_end
+ if ($a_end < $b_end) {
+ return -1;
+ }
+ elsif ($a_end > $b_end) {
+ return 1;
+ }
+ else {
+ # Check for right positions
+ (my $a_start, $a_end) = _rel_right_pos($a->[0]);
+ (my $b_start, $b_end) = _rel_right_pos($b->[0]);
+ if ($a_start < $b_start) {
+ return -1;
+ }
+ elsif ($a_start > $b_start) {
+ return 1;
+ }
+ elsif ($a_end < $b_end) {
+ return -1;
+ }
+ elsif ($a_end > $b_end) {
+ return 1;
+ }
+ else {
+ return 1;
+ };
+ };
+ };
+
+ # This has to be sorted alphabetically!
+ return $a->[5] cmp $b->[5];
+ }
+
+ # Not identical
+ elsif (index($a->[5], '<>:') != 0) {
+ return $a->[5] cmp $b->[5];
+ }
+ # Not identical
+ elsif (index($b->[5], '<>:') != 0) {
+ return $a->[5] cmp $b->[5];
+ }
+
+ # Sort both spans
+ else {
+ if ($a->[2] < $b->[2]) {
+ return -1;
+ }
+ elsif ($a->[2] > $b->[2]) {
+ return 1;
+ }
+
+ # Check depth
+ else {
+ my ($a_depth) = ($a->[0] =~ m/^<b>(\d+)/);
+ my ($b_depth) = ($b->[0] =~ m/^<b>(\d+)/);
+
+ $a_depth //= 0;
+ $b_depth //= 0;
+ if ($a_depth < $b_depth) {
+ return -1;
+ }
+ elsif ($a_depth > $b_depth) {
+ return 1;
+ }
+ else {
+ return 1;
+ };
+ };
+ };
};
diff --git a/lib/KorAP/Field/MultiTermTokenStream.pm b/lib/KorAP/Field/MultiTermTokenStream.pm
index f9e97a2..47524ab 100644
--- a/lib/KorAP/Field/MultiTermTokenStream.pm
+++ b/lib/KorAP/Field/MultiTermTokenStream.pm
@@ -34,6 +34,7 @@
$_[0]->{mtt};
};
+
sub to_array {
my $self = shift;
[ map { $_->to_array } @{$self->{mtt}} ];