Fixed payloads, sorted tokens, major speed improvements
diff --git a/lib/KorAP/Field/MultiTerm.pm b/lib/KorAP/Field/MultiTerm.pm
index 8210c56..0f93787 100644
--- a/lib/KorAP/Field/MultiTerm.pm
+++ b/lib/KorAP/Field/MultiTerm.pm
@@ -1,11 +1,122 @@
package KorAP::Field::MultiTerm;
-use Mojo::Base -base;
+use strict;
+use warnings;
use MIME::Base64;
-has [qw/p_start p_end o_start o_end term payload/];
-has store_offsets => 1;
+sub new {
+ my $self = bless [], shift;
+ my $i = 0;
+ for (; $i < scalar @_; $i+=2) {
+ if ($_[$i] eq 'term') {
+ $self->term($_[$i+1]);
+ }
+ elsif ($_[$i] eq 'p_start') {
+ $self->p_start($_[$i+1]);
+ }
+ elsif ($_[$i] eq 'p_end') {
+ $self->p_end($_[$i+1]);
+ }
+ elsif ($_[$i] eq 'payload') {
+ $self->payload($_[$i+1]);
+ }
+ elsif ($_[$i] eq 'store_offsets') {
+ $self->store_offsets($_[$i+1]);
+ }
+ elsif ($_[$i] eq 'o_start') {
+ $self->o_start($_[$i+1]);
+ }
+ elsif ($_[$i] eq 'o_end') {
+ $self->o_end($_[$i+1]);
+ };
+ };
+ $self;
+};
+# 0
+sub payload {
+ if (defined $_[1]) {
+ return $_[0]->[0] = $_[1];
+ };
+ $_[0]->[0];
+};
+
+# 1
+sub p_start {
+ if (defined $_[1]) {
+ return $_[0]->[1] = $_[1];
+ };
+ $_[0]->[1];
+};
+
+# 2
+sub p_end {
+ if (defined $_[1]) {
+ return $_[0]->[2] = $_[1];
+ };
+ $_[0]->[2];
+};
+
+# 3
+sub o_start {
+ if (defined $_[1]) {
+ return $_[0]->[3] = $_[1];
+ };
+ $_[0]->[3];
+};
+
+# 4
+sub o_end {
+ if (defined $_[1]) {
+ return $_[0]->[4] = $_[1];
+ };
+ $_[0]->[4];
+};
+
+# 5
+sub term {
+ if (defined $_[1]) {
+ return $_[0]->[5] = $_[1];
+ };
+ $_[0]->[5];
+};
+
+# 6
+sub store_offsets {
+ if (defined $_[1]) {
+ return $_[0]->[6] = $_[1];
+ };
+ $_[0]->[6];
+};
+
+
+# to string based on array
sub to_string {
+ my $string = $_[0]->[5];
+ if (defined $_[0]->[3]) {
+ $string .= '#' .$_[0]->[3] .'-' . $_[0]->[4];
+ };
+
+ my $pl = $_[0]->[1] ? $_[0]->[1] - 1 : $_[0]->[0];
+ if ($_[0]->[2] || $_[0]->[0]) {
+ $string .= '$';
+ if ($_[0]->[2]) {
+ $string .= '<i>' . $_[0]->[2];
+ };
+ if ($_[0]->[0]) {
+ if (index($_[0]->[0], '<') == 0) {
+ $string .= $_[0]->[0];
+ }
+ else {
+ $string .= '<?>' . $_[0]->[0];
+ };
+ };
+ };
+
+ $string;
+};
+
+
+sub to_string_2 {
my $self = shift;
my $string = $self->term;
if (defined $self->o_start) {
@@ -31,6 +142,9 @@
return $string;
};
+
+
+
sub to_solr {
my $self = shift;
my $increment = shift;
diff --git a/lib/KorAP/Field/MultiTermToken.pm b/lib/KorAP/Field/MultiTermToken.pm
index 0d8742b..fb83c1a 100644
--- a/lib/KorAP/Field/MultiTermToken.pm
+++ b/lib/KorAP/Field/MultiTermToken.pm
@@ -1,56 +1,196 @@
package KorAP::Field::MultiTermToken;
use KorAP::Field::MultiTerm;
-use Mojo::Base -base;
use List::MoreUtils 'uniq';
+use strict;
+use warnings;
-has [qw/o_start o_end/];
+# This tries to be highly optimized - it's not supposed to be readable
+
+sub new {
+ bless [], shift;
+};
sub add {
my $self = shift;
my $mt;
unless (ref $_[0] eq 'MultiTerm') {
if (@_ == 1) {
- $mt = KorAP::Field::MultiTerm->new(term => shift());
+ $mt = KorAP::Field::MultiTerm->new(term => $_[0]);
}
else {
$mt = KorAP::Field::MultiTerm->new(@_);
};
}
else {
- $mt = shift;
+ $mt = $_[0];
};
- $self->{mt} //= [];
- push(@{$self->{mt}}, $mt);
- return $mt;
+ $self->[0] //= [];
+ push(@{$self->[0]}, $mt);
+ $mt;
};
-# Return a new term id
+# 0 -> mt
+
+# 1
+sub o_start {
+ if (defined $_[1]) {
+ return $_[0]->[1] = $_[1];
+ };
+ $_[0]->[1];
+};
+
+# 2
+sub o_end {
+ if (defined $_[1]) {
+ return $_[0]->[2] = $_[1];
+ };
+ $_[0]->[2];
+};
+
+# 3: Return a new term id
sub id_counter {
- $_[0]->{id_counter} //= 1;
- return $_[0]->{id_counter}++;
+ $_[0]->[3] //= 1;
+ return $_[0]->[3]++;
};
-
sub surface {
- substr($_[0]->{mt}->[0]->term,2);
+ substr($_[0]->[0]->[0]->term,2);
};
sub lc_surface {
- substr($_[0]->{mt}->[1]->term,2);
+ substr($_[0]->[0]->[1]->term,2);
};
+sub to_array {
+ my $self = shift;
+ [uniq(map($_->to_string, sort _sort @{$self->[0]}))];
+};
+
+
sub to_string {
my $self = shift;
my $string = '[(' . $self->o_start . '-'. $self->o_end . ')';
- $string .= join ('|', map($_->to_string, @{$self->{mt}}));
+ $string .= join ('|', @{$self->to_array});
$string .= ']';
return $string;
};
+# Get relation based positions
+sub _rel_right_pos {
+ # token to token - right token
+ if ($_[0] =~ m/^<i>(\d+)<s>/o) {
+ return ($1, $1);
+ }
+ # token/span to span - right token
+ elsif ($_[0] =~ m/^<i>(\d+)<i>(\d+)<s>/o) {
+ return ($1, $2);
+ }
+ # span to token - right token
+ elsif ($_[0] =~ m/^<b>\d+<i>(\d+)<s>/o) {
+ return ($1, $1);
+ };
+ warn 'Unknown relation format!';
+ return (0,0);
+};
-sub to_array {
- my $self = shift;
- [uniq(map($_->to_string, @{$self->{mt}}))];
+# Sort spans, attributes and relations
+sub _sort {
+
+ # Both are no spans
+ if (index($a->[5], '<>:') != 0 && index($b->[5], '<>:') != 0) {
+
+ # Both are attributes
+ # Order attributes by reference id
+ if (index($a->[5], '@:') == 0 && index($b->[5], '@:') == 0) {
+ my ($a_id) = ($a->[0] =~ m/^<s>(\d+)/);
+ my ($b_id) = ($b->[0] =~ m/^<s>(\d+)/);
+ if ($a_id > $b_id) {
+ return 1;
+ }
+ elsif ($a_id < $b_id) {
+ return -1;
+ }
+ else {
+ return 1;
+ };
+ }
+
+ # Both are relations
+ elsif (
+ (index($a->[5],'<:') == 0 || index($a->[5],'>:') == 0) &&
+ (index($b->[5], '<:') == 0 || index($b->[5],'>:') == 0)) {
+ my $a_end = $a->[2] // 0;
+ my $b_end = $b->[2] // 0;
+
+ # left is p_end
+ if ($a_end < $b_end) {
+ return -1;
+ }
+ elsif ($a_end > $b_end) {
+ return 1;
+ }
+ else {
+ # Check for right positions
+ (my $a_start, $a_end) = _rel_right_pos($a->[0]);
+ (my $b_start, $b_end) = _rel_right_pos($b->[0]);
+ if ($a_start < $b_start) {
+ return -1;
+ }
+ elsif ($a_start > $b_start) {
+ return 1;
+ }
+ elsif ($a_end < $b_end) {
+ return -1;
+ }
+ elsif ($a_end > $b_end) {
+ return 1;
+ }
+ else {
+ return 1;
+ };
+ };
+ };
+
+ # This has to be sorted alphabetically!
+ return $a->[5] cmp $b->[5];
+ }
+
+ # Not identical
+ elsif (index($a->[5], '<>:') != 0) {
+ return $a->[5] cmp $b->[5];
+ }
+ # Not identical
+ elsif (index($b->[5], '<>:') != 0) {
+ return $a->[5] cmp $b->[5];
+ }
+
+ # Sort both spans
+ else {
+ if ($a->[2] < $b->[2]) {
+ return -1;
+ }
+ elsif ($a->[2] > $b->[2]) {
+ return 1;
+ }
+
+ # Check depth
+ else {
+ my ($a_depth) = ($a->[0] =~ m/^<b>(\d+)/);
+ my ($b_depth) = ($b->[0] =~ m/^<b>(\d+)/);
+
+ $a_depth //= 0;
+ $b_depth //= 0;
+ if ($a_depth < $b_depth) {
+ return -1;
+ }
+ elsif ($a_depth > $b_depth) {
+ return 1;
+ }
+ else {
+ return 1;
+ };
+ };
+ };
};
diff --git a/lib/KorAP/Field/MultiTermTokenStream.pm b/lib/KorAP/Field/MultiTermTokenStream.pm
index f9e97a2..47524ab 100644
--- a/lib/KorAP/Field/MultiTermTokenStream.pm
+++ b/lib/KorAP/Field/MultiTermTokenStream.pm
@@ -34,6 +34,7 @@
$_[0]->{mtt};
};
+
sub to_array {
my $self = shift;
[ map { $_->to_array } @{$self->{mtt}} ];
diff --git a/lib/KorAP/Index/Base/Paragraphs.pm b/lib/KorAP/Index/Base/Paragraphs.pm
index 105bd59..e5386f4 100644
--- a/lib/KorAP/Index/Base/Paragraphs.pm
+++ b/lib/KorAP/Index/Base/Paragraphs.pm
@@ -14,7 +14,8 @@
term => '<>:base/s:p',
o_start => $span->o_start,
o_end => $span->o_end,
- p_end => $span->p_end
+ p_end => $span->p_end,
+ payload => '<b>1'
);
$i++;
}
diff --git a/lib/KorAP/Index/Base/Sentences.pm b/lib/KorAP/Index/Base/Sentences.pm
index f1ca6f9..6d485ad 100644
--- a/lib/KorAP/Index/Base/Sentences.pm
+++ b/lib/KorAP/Index/Base/Sentences.pm
@@ -18,7 +18,8 @@
term => '<>:base/s:s',
o_start => $span->o_start,
o_end => $span->o_end,
- p_end => $span->p_end
+ p_end => $span->p_end,
+ payload => '<b>2'
);
$last_p = $span->p_end;
$last_o = $span->o_end;
@@ -31,7 +32,8 @@
term => '<>:base/s:t',
o_start => $first->[1],
p_end => $last_p,
- o_end => $last_o
+ o_end => $last_o,
+ payload => '<b>0'
);
$$self->stream->add_meta('base/sentences', '<i>' . $i);
diff --git a/lib/KorAP/Index/Connexor/Phrase.pm b/lib/KorAP/Index/Connexor/Phrase.pm
index ed36de3..00a1b0d 100644
--- a/lib/KorAP/Index/Connexor/Phrase.pm
+++ b/lib/KorAP/Index/Connexor/Phrase.pm
@@ -22,7 +22,8 @@
term => '<>:cnx/c:' . $type,
o_start => $span->o_start,
o_end => $span->o_end,
- p_end => $span->p_end
+ p_end => $span->p_end,
+ payload => '<b>0' # Pseudo-depth
);
};
}
diff --git a/lib/KorAP/Index/Connexor/Sentences.pm b/lib/KorAP/Index/Connexor/Sentences.pm
index ac6f89f..db95729 100644
--- a/lib/KorAP/Index/Connexor/Sentences.pm
+++ b/lib/KorAP/Index/Connexor/Sentences.pm
@@ -15,7 +15,8 @@
term => '<>:cnx/s:s',
o_start => $span->o_start,
o_end => $span->o_end,
- p_end => $span->p_end
+ p_end => $span->p_end,
+ payload => '<b>2'
);
$i++;
}
diff --git a/lib/KorAP/Index/CoreNLP/Constituency.pm b/lib/KorAP/Index/CoreNLP/Constituency.pm
index 4793bfd..ee37abc 100644
--- a/lib/KorAP/Index/CoreNLP/Constituency.pm
+++ b/lib/KorAP/Index/CoreNLP/Constituency.pm
@@ -1,7 +1,6 @@
package KorAP::Index::CoreNLP::Constituency;
use KorAP::Index::Base;
use Set::Scalar;
-use v5.16;
sub parse {
my $self = shift;
@@ -25,8 +24,14 @@
$rel = [$rel] unless ref $rel eq 'ARRAY';
foreach (@$rel) {
- if ($_->{-label} eq 'dominates' && $_->{-target}) {
- $corenlp_const_noroot->insert($_->{-target});
+ if ($_->{-label} eq 'dominates') {
+ if ($_->{-target}) {
+ $corenlp_const_noroot->insert($_->{-target});
+ }
+ elsif (my $uri = $_->{-uri}) {
+ $uri =~ s/^morpho\.xml#//;
+ $corenlp_const_noroot->insert($uri);
+ };
};
};
}
@@ -34,7 +39,8 @@
my $stream = $$self->stream;
- my $add_const = sub {
+ my $add_const;
+ $add_const = sub {
my $span = shift;
my $level = shift;
my $mtt = $stream->pos($span->p_start);
@@ -53,11 +59,11 @@
p_end => $span->p_end
);
- $term{payload} = '<b>' . $level if $level;
+ $term{payload} = '<b>' . ($level // 0);
$mtt->add(%term);
- my $this = __SUB__;
+ my $this = $add_const;
my $rel = $content->{rel} or return;
$rel = [$rel] unless ref $rel eq 'ARRAY';
@@ -79,7 +85,7 @@
};
sub layer_info {
- ['corenlp/c=const']
+ ['corenlp/c=spans']
}
1;
diff --git a/lib/KorAP/Index/CoreNLP/Sentences.pm b/lib/KorAP/Index/CoreNLP/Sentences.pm
index 0f40213..cacc2b0 100644
--- a/lib/KorAP/Index/CoreNLP/Sentences.pm
+++ b/lib/KorAP/Index/CoreNLP/Sentences.pm
@@ -15,7 +15,8 @@
term => '<>:corenlp/s:s',
o_start => $span->o_start,
o_end => $span->o_end,
- p_end => $span->p_end
+ p_end => $span->p_end,
+ payload => '<b>2'
);
$i++;
}
diff --git a/lib/KorAP/Index/Mate/Morpho.pm b/lib/KorAP/Index/Mate/Morpho.pm
index 035bc37..1a06f63 100644
--- a/lib/KorAP/Index/Mate/Morpho.pm
+++ b/lib/KorAP/Index/Mate/Morpho.pm
@@ -13,69 +13,6 @@
my $content = $token->hash->{fs}->{f};
- my ($found, $pos, $msd, $id);
-
- my $capital = 0;
-
- foreach my $f (@{$content->{fs}->{f}}) {
- #pos
- if (($f->{-name} eq 'pos') && ($found = $f->{'#text'})) {
- $pos = $found;
- }
-
- # lemma
- elsif (($f->{-name} eq 'lemma')
- && ($found = $f->{'#text'})
- && $found ne '--') {
- $mtt->add(term => 'mate/l:' . $found);
- }
-
- # MSD
- elsif (($f->{-name} eq 'msd') &&
- ($found = $f->{'#text'}) &&
- ($found ne '_')) {
- $msd = $found;
- $id = $mtt->id_counter;
- };
- };
-
- $mtt->add(term => 'mate/m:' . $pos . ($id ? ('$<s>' . $id) : ''));
-
- # MSD
- if ($msd) {
- foreach (split '\|', $msd) {
- my ($x, $y) = split "=", $_;
- # case, tense, number, mood, person, degree, gender
- $mtt->add(term => '@:' . $x . ($y ? '=' . $y : '') . '$<s>' . $id);
- };
- };
- }) or return;
-
- return 1;
-};
-
-sub layer_info {
- ['mate/l=tokens', 'mate/m=tokens']
-};
-
-1;
-
-
-__END__
-
-
-sub parse {
- my $self = shift;
-
- $$self->add_tokendata(
- foundry => 'mate',
- layer => 'morpho',
- cb => sub {
- my ($stream, $token) = @_;
- my $mtt = $stream->pos($token->pos);
-
- my $content = $token->hash->{fs}->{f};
-
my $found;
my $capital = 0;
diff --git a/lib/KorAP/Index/Mate/Morpho2.pm b/lib/KorAP/Index/Mate/Morpho2.pm
new file mode 100644
index 0000000..e032f2f
--- /dev/null
+++ b/lib/KorAP/Index/Mate/Morpho2.pm
@@ -0,0 +1,63 @@
+package KorAP::Index::Mate::Morpho;
+use KorAP::Index::Base;
+
+# This attaches morphological information as attributes to the pos
+
+sub parse {
+ my $self = shift;
+
+ $$self->add_tokendata(
+ foundry => 'mate',
+ layer => 'morpho',
+ cb => sub {
+ my ($stream, $token) = @_;
+ my $mtt = $stream->pos($token->pos);
+
+ my $content = $token->hash->{fs}->{f};
+
+ my ($found, $pos, $msd, $id);
+
+ my $capital = 0;
+
+ foreach my $f (@{$content->{fs}->{f}}) {
+ #pos
+ if (($f->{-name} eq 'pos') && ($found = $f->{'#text'})) {
+ $pos = $found;
+ }
+
+ # lemma
+ elsif (($f->{-name} eq 'lemma')
+ && ($found = $f->{'#text'})
+ && $found ne '--') {
+ $mtt->add(term => 'mate/l:' . $found);
+ }
+
+ # MSD
+ elsif (($f->{-name} eq 'msd') &&
+ ($found = $f->{'#text'}) &&
+ ($found ne '_')) {
+ $msd = $found;
+ $id = $mtt->id_counter;
+ };
+ };
+
+ $mtt->add(term => 'mate/m:' . $pos . ($id ? ('$<s>' . $id) : ''));
+
+ # MSD
+ if ($msd) {
+ foreach (split '\|', $msd) {
+ my ($x, $y) = split "=", $_;
+ # case, tense, number, mood, person, degree, gender
+ $mtt->add(term => '@:' . $x . ($y ? '=' . $y : '') . '$<s>' . $id);
+ };
+ };
+ }) or return;
+
+ return 1;
+};
+
+sub layer_info {
+ ['mate/l=tokens', 'mate/m=tokens']
+};
+
+1;
diff --git a/lib/KorAP/Index/OpenNLP/Sentences.pm b/lib/KorAP/Index/OpenNLP/Sentences.pm
index 8710763..f4e84e9 100644
--- a/lib/KorAP/Index/OpenNLP/Sentences.pm
+++ b/lib/KorAP/Index/OpenNLP/Sentences.pm
@@ -15,7 +15,8 @@
term => '<>:opennlp/s:s',
o_start => $span->o_start,
o_end => $span->o_end,
- p_end => $span->p_end
+ p_end => $span->p_end,
+ payload => '<b>2' # t/p/s
);
$i++;
}
diff --git a/lib/KorAP/Index/TreeTagger/Morpho.pm b/lib/KorAP/Index/TreeTagger/Morpho.pm
index 81766c7..989728b 100644
--- a/lib/KorAP/Index/TreeTagger/Morpho.pm
+++ b/lib/KorAP/Index/TreeTagger/Morpho.pm
@@ -22,11 +22,11 @@
$content = $fs->{fs}->{f};
my @val;
- my $certainty = '';
+ my $certainty = 0;
foreach (@$content) {
if ($_->{-name} eq 'certainty') {
$certainty = floor(($_->{'#text'} * 255));
- $certainty = '$<b>' . $certainty if $certainty;
+ $certainty = $certainty if $certainty;
}
else {
push @val, $_
@@ -39,16 +39,20 @@
($found = $_->{'#text'}) &&
($found ne 'UNKNOWN') &&
($found ne '?')) {
- $mtt->add(
- term => 'tt/l:' . $found . $certainty
+ my %term = (
+ term => 'tt/l:' . $found
);
+ $term{payload} = '<b>' . $certainty if $certainty;
+ $mtt->add(%term);
};
# pos
if (($_->{-name} eq 'ctag') && ($found = $_->{'#text'})) {
- $mtt->add(
- term => 'tt/p:' . $found . $certainty
+ my %term = (
+ term => 'tt/p:' . $found
);
+ $term{payload} = '<b>' . $certainty if $certainty;
+ $mtt->add(%term);
};
};
};
diff --git a/lib/KorAP/Index/TreeTagger/Sentences.pm b/lib/KorAP/Index/TreeTagger/Sentences.pm
index 06669ea..37b49e0 100644
--- a/lib/KorAP/Index/TreeTagger/Sentences.pm
+++ b/lib/KorAP/Index/TreeTagger/Sentences.pm
@@ -15,7 +15,8 @@
term => '<>:tt/s:s',
o_start => $span->o_start,
o_end => $span->o_end,
- p_end => $span->p_end
+ p_end => $span->p_end,
+ payload => '<b>2' # Depth is 2 by default t/p/s
);
$i++;
}
diff --git a/lib/KorAP/Index/XIP/Constituency.pm b/lib/KorAP/Index/XIP/Constituency.pm
index 62a4a29..d181afd 100644
--- a/lib/KorAP/Index/XIP/Constituency.pm
+++ b/lib/KorAP/Index/XIP/Constituency.pm
@@ -34,6 +34,7 @@
my $rel = $span->hash->{rel} or return;
$rel = [$rel] unless ref $rel eq 'ARRAY';
+ # Iterate over all relations
foreach (@$rel) {
if ($_->{-label} eq 'dominates') {
@@ -43,6 +44,7 @@
$target = $1;
};
+ # The target may not be addressable
next unless $target;
# It's definately not a root
@@ -57,7 +59,7 @@
# Recursive tree traversal method
my $add_const;
- $add_const= sub {
+ $add_const = sub {
my ($span, $level) = @_;
weaken $xip_const_root;
@@ -68,6 +70,7 @@
my $content = $span->hash;
my $f = $content->{fs}->{f};
+
unless ($f->{-name} eq 'const') {
warn $f->{-id} . ' is no constant';
return;
@@ -89,7 +92,7 @@
);
# Only add level payload if node != root
- $term{payload} = '<b>' . $level if $level;
+ $term{payload} ='<b>' . ($level // 0);
$mtt->add(%term);
@@ -141,7 +144,7 @@
# Layer info
sub layer_info {
- ['xip/c=const']
-}
+ ['xip/c=spans']
+};
1;
diff --git a/lib/KorAP/Index/XIP/Morpho.pm b/lib/KorAP/Index/XIP/Morpho.pm
index 1eae6c6..474bef0 100644
--- a/lib/KorAP/Index/XIP/Morpho.pm
+++ b/lib/KorAP/Index/XIP/Morpho.pm
@@ -60,7 +60,7 @@
};
sub layer_info {
- ['xip/l=lemma', 'xip/p=pos']
+ ['xip/l=tokens', 'xip/p=tokens']
};
diff --git a/lib/KorAP/Index/XIP/Sentences.pm b/lib/KorAP/Index/XIP/Sentences.pm
index f045152..35ab6f1 100644
--- a/lib/KorAP/Index/XIP/Sentences.pm
+++ b/lib/KorAP/Index/XIP/Sentences.pm
@@ -18,7 +18,8 @@
term => '<>:xip/s:s',
o_start => $span->o_start,
o_end => $span->o_end,
- p_end => $span->p_end
+ p_end => $span->p_end,
+ payload => '<b>2'
);
$i++;
}
@@ -29,4 +30,9 @@
return 1;
};
+sub layer_info {
+ ['xip/s=spans'];
+};
+
+
1;