Improve ranking mechanism
Change-Id: I23add5381ba7099485e5883bebad4997962b82ba
diff --git a/lib/Krawfish/Cluster.pm b/lib/Krawfish/Cluster.pm
index 9edd54c..3387e25 100644
--- a/lib/Krawfish/Cluster.pm
+++ b/lib/Krawfish/Cluster.pm
@@ -21,7 +21,7 @@
sub search_for {
my ($self, $query, $cb) = @_;
- # This should probably open multiple websockets/unx-sockets in parallel
+ # This should probably open multiple websockets/unix-sockets in parallel
# https://stackoverflow.com/questions/13417000/synchronous-request-with-websockets
Mojo::IOLoop->delay(
sub {
diff --git a/lib/Krawfish/Index.pm b/lib/Krawfish/Index.pm
index c911021..d3dd5de 100644
--- a/lib/Krawfish/Index.pm
+++ b/lib/Krawfish/Index.pm
@@ -9,6 +9,10 @@
use constant DEBUG => 0;
+# TODO:
+# May need to be renamed to Krawfish::Node
+
+
# This is the central object for index handling on node level.
# A new document will be added by adding the following information:
# - To the dynamic DICTIONARY
@@ -129,7 +133,19 @@
my $dict = $self->dict;
# Add field
- if ($dict->add_field($field_term, $collation)) {
+ if (my $term_id = $dict->add_field($field_term, $collation)) {
+
+ # Field is meant to be sortable
+ if ($collation) {
+
+ # Propagate the field to all segments
+ foreach (@{$self->segments}) {
+
+ # Introduce ranking file
+ $_->field_ranks->introduce_rank($term_id, $collation)
+ };
+ };
+
# Field is added to the dictionary
return 1;
};
diff --git a/lib/Krawfish/Index/Fields.pm b/lib/Krawfish/Index/Fields.pm
index 59acb8f..fe29b6e 100644
--- a/lib/Krawfish/Index/Fields.pm
+++ b/lib/Krawfish/Index/Fields.pm
@@ -1,7 +1,8 @@
package Krawfish::Index::Fields;
use Krawfish::Index::Fields::Doc;
+use Krawfish::Index::Fields::Ranks;
use Krawfish::Index::Fields::Pointer;
-use Krawfish::Index::Rank::Fields;
+# use Krawfish::Index::Rank::Fields;
use Krawfish::Log;
use warnings;
use strict;
@@ -43,7 +44,8 @@
my $class = shift;
bless {
docs => [],
- last_doc_id => -1
+ last_doc_id => -1,
+ ranks => {}
}, $class;
};
@@ -62,7 +64,6 @@
# TODO:
# use Krawfish::Index::Store::V1::Fields->new;
$self->{docs}->[$self->last_doc_id] = Krawfish::Index::Fields::Doc->new($doc);
-
return $doc_id;
};
diff --git a/lib/Krawfish/Index/Fields/Doc.pm b/lib/Krawfish/Index/Fields/Doc.pm
index 71e9ddb..109466d 100644
--- a/lib/Krawfish/Index/Fields/Doc.pm
+++ b/lib/Krawfish/Index/Fields/Doc.pm
@@ -35,7 +35,6 @@
};
} @$fields;
-
# Add field data
my @data = ();
foreach (@sorted_fields) {
@@ -49,6 +48,7 @@
push @data, $_->value if $_->type eq 'int' || $_->type eq 'store';
};
+
push @data, 'EOF';
print_log('fields_doc', 'The fields are ' . join(',', map { defined $_ ? $_ : '?' } @data)) if DEBUG;
diff --git a/lib/Krawfish/Index/Fields/Rank.pm b/lib/Krawfish/Index/Fields/Rank.pm
new file mode 100644
index 0000000..7935d09
--- /dev/null
+++ b/lib/Krawfish/Index/Fields/Rank.pm
@@ -0,0 +1,23 @@
+package Krawfish::Index::Fields::Rank;
+use strict;
+use warnings;
+
+sub new {
+ my $class = shift;
+ bless {
+ collation => shift,
+ prefix => [],
+ suffix => [],
+ plain => []
+ }, $class;
+};
+
+
+# Add an entry to the plain list
+sub add {
+ my $self = shift;
+ my ($value, $doc_id) = @_;
+ push @{$self->{plain}}, [$value, $doc_id];
+};
+
+1;
diff --git a/lib/Krawfish/Index/Fields/Ranks.pm b/lib/Krawfish/Index/Fields/Ranks.pm
new file mode 100644
index 0000000..73a85b9
--- /dev/null
+++ b/lib/Krawfish/Index/Fields/Ranks.pm
@@ -0,0 +1,25 @@
+package Krawfish::Index::Fields::Ranks;
+use Krawfish::Index::Fields::Rank;
+use strict;
+use warnings;
+
+sub new {
+ my $class = shift;
+ bless {}, $class;
+};
+
+# Get the rank by
+sub by {
+ my ($self, $field_id) = @_;
+
+ # Field may be ranked or not
+ return $self->{$field_id};
+};
+
+sub introduce_rank {
+ my ($self, $field_id, $collation) = @_;
+ $self->{$field_id} = Krawfish::Index::Fields::Rank->new($collation);
+};
+
+
+1;
diff --git a/lib/Krawfish/Index/Segment.pm b/lib/Krawfish/Index/Segment.pm
index 72cc5ac..03a3d11 100644
--- a/lib/Krawfish/Index/Segment.pm
+++ b/lib/Krawfish/Index/Segment.pm
@@ -1,5 +1,6 @@
package Krawfish::Index::Segment;
use Krawfish::Index::Fields;
+use Krawfish::Index::Fields::Ranks;
use Krawfish::Index::PostingsLive;
use Krawfish::Index::PostingsList;
use Krawfish::Index::Forward;
@@ -61,9 +62,9 @@
$self->{file}
);
- # $self->{field_ranks} = Krawfish::Index::Fields::Ranks->new(
- # $self->{file}
- #);
+ $self->{field_ranks} = Krawfish::Index::Fields::Ranks->new(
+ $self->{file}
+ );
# Create a list of docid -> uuid mappers
# This may be problematic as uuids may need to be uint64,
@@ -96,9 +97,9 @@
# Alias for last doc
-sub max_rank {
- $_[0]->{live}->next_doc_id - 1;
-};
+#sub max_rank {
+# $_[0]->{live}->next_doc_id - 1;
+#};
# Get subtokens
@@ -125,6 +126,11 @@
};
+sub field_ranks {
+ $_[0]->{field_ranks};
+};
+
+
# Return a postings list based on a term_id
sub postings {
my ($self, $term_id) = @_;
@@ -141,6 +147,7 @@
};
+
# Add a prepared document to the index
sub add {
my ($self, $doc) = @_;
@@ -167,19 +174,26 @@
# Rank fields!
# $self->field_ranks();
- # TODO:
- # Deal with sortables!
-
# $self->invert->add()
# Create term index for fields
my $fields = $doc->fields;
+ my $ranks = $self->field_ranks;
foreach (@$fields) {
next if $_->type eq 'store';
if (DEBUG) {
print_log('seg', 'Added field #' . $_->term_id . ' for doc_id=' . $doc_id);
};
$self->postings($_->term_id)->append($doc_id);
+
+ # The field is sortable
+ if ($_->sortable) {
+
+ # Add field value to ranking
+ my $ranked_by = $ranks->by($_->key_id);
+
+ $ranked_by->add($_->value, $doc_id) if $ranked_by;
+ };
};
diff --git a/lib/Krawfish/Koral/Document.pm b/lib/Krawfish/Koral/Document.pm
index aa96468..945ebe6 100644
--- a/lib/Krawfish/Koral/Document.pm
+++ b/lib/Krawfish/Koral/Document.pm
@@ -63,11 +63,6 @@
};
-sub sortable {
- $_[0]->{sortable};
-};
-
-
# Translate all terms into term_ids and
# add unknown terms to the dictionary
sub identify {
diff --git a/lib/Krawfish/Koral/Document/FieldBase.pm b/lib/Krawfish/Koral/Document/FieldBase.pm
index e8c25cb..e1f3559 100644
--- a/lib/Krawfish/Koral/Document/FieldBase.pm
+++ b/lib/Krawfish/Koral/Document/FieldBase.pm
@@ -11,7 +11,7 @@
sub new {
my $class = shift;
- # key, value, key_id, key_value_id
+ # key, value, key_id, key_value_id, sortable
bless { @_ }, $class;
};
diff --git a/lib/Krawfish/Koral/Document/FieldString.pm b/lib/Krawfish/Koral/Document/FieldString.pm
index 0f67869..68715a6 100644
--- a/lib/Krawfish/Koral/Document/FieldString.pm
+++ b/lib/Krawfish/Koral/Document/FieldString.pm
@@ -16,10 +16,13 @@
sub identify {
my ($self, $dict) = @_;
+ # This will check, if the field is
+ # sortable
return if $self->{key_id} && $self->{key_value_id};
# Get or introduce new key term_id
my $key = '!' . $self->{key};
+
$self->{key_id} = $dict->add_term($key);
# Set sortable
diff --git a/lib/Krawfish/Koral/Document/Fields.pm b/lib/Krawfish/Koral/Document/Fields.pm
index 4c13c57..33833c8 100644
--- a/lib/Krawfish/Koral/Document/Fields.pm
+++ b/lib/Krawfish/Koral/Document/Fields.pm
@@ -60,5 +60,4 @@
};
-
1;
diff --git a/lib/Krawfish/Koral/Meta/Limit.pm b/lib/Krawfish/Koral/Meta/Limit.pm
index ed2dce6..7b3a922 100644
--- a/lib/Krawfish/Koral/Meta/Limit.pm
+++ b/lib/Krawfish/Koral/Meta/Limit.pm
@@ -33,8 +33,13 @@
my ($self, $query) = @_;
# For sampling, limiting has no effect
- if ($query->type eq 'sort') {
- $query->top_k($self->items_per_type);
+ if ($query->type eq 'sample') {
+
+ # WARNING:
+ # This only holds true for the segment level,
+ # on the cluster and node levels,
+ # the limit (at least the top-k) is important.
+ $query->top_k($self->items_per_page);
return $query;
};
diff --git a/lib/Krawfish/Koral/Meta/Node/Aggregate.pm b/lib/Krawfish/Koral/Meta/Node/Aggregate.pm
index 27dacf2..1782fe3 100644
--- a/lib/Krawfish/Koral/Meta/Node/Aggregate.pm
+++ b/lib/Krawfish/Koral/Meta/Node/Aggregate.pm
@@ -3,6 +3,8 @@
use strict;
use warnings;
+# TODO:
+# Identify() should probably first return a Segment::Aggregate object
sub new {
my $class = shift;
diff --git a/lib/Krawfish/Koral/Meta/Node/Sort.pm b/lib/Krawfish/Koral/Meta/Node/Sort.pm
index 20d6957..78d8552 100644
--- a/lib/Krawfish/Koral/Meta/Node/Sort.pm
+++ b/lib/Krawfish/Koral/Meta/Node/Sort.pm
@@ -3,13 +3,19 @@
use strict;
use warnings;
-use constant DEBUG => 1;
+use constant (
+ DEBUG => 1,
+ UNIQUE => 'id'
+);
sub new {
my $class = shift;
if (DEBUG) {
- print_log('kq_n_sort', 'Create sort query with ' . join(', ', map {$_ ? $_ : '?'} @_));
+ print_log(
+ 'kq_n_sort', 'Create sort query with ' .
+ join(', ', map {$_ ? $_ : '?'} @_)
+ );
};
my $self = bless {
@@ -21,6 +27,11 @@
};
+sub type {
+ 'sort';
+};
+
+
# Get identifiers
sub identify {
my ($self, $dict) = @_;
@@ -28,10 +39,10 @@
my @identifier;
foreach (@{$self->{sort}}) {
- # Field may not exist in dictionary
- my $field = $_->identify($dict);
- if ($field) {
- push @identifier, $field;
+ # Criterion may not exist in dictionary
+ my $criterion = $_->identify($dict);
+ if ($criterion) {
+ push @identifier, $criterion;
};
};
@@ -48,6 +59,7 @@
};
+# Stringification
sub to_string {
my $self = shift;
my $str = join(',', map { $_->to_string } @{$self->{sort}});
@@ -67,9 +79,13 @@
sub optimize {
my ($self, $segment) = @_;
- warn 'Sorting not yet implemented';
+ my $query = $self->{query}->optimize($segment);
- return $self->{query}->optimize($segment);
+ if ($query->max_freq == 0) {
+ return Krawfish::Query::Nothing->new;
+ };
+
+ return $self;
};
1;
diff --git a/lib/Krawfish/Koral/Meta/Node/Sort/Sample.pm b/lib/Krawfish/Koral/Meta/Node/Sort/Sample.pm
index 827b2c5..6c71886 100644
--- a/lib/Krawfish/Koral/Meta/Node/Sort/Sample.pm
+++ b/lib/Krawfish/Koral/Meta/Node/Sort/Sample.pm
@@ -7,7 +7,7 @@
my $class = shift;
bless {
query => shift,
- n => shift
+ top_k => shift
}, $class;
};
@@ -15,6 +15,18 @@
'sample'
};
+# Set or get the top_k limitation!
+sub top_k {
+ my $self = shift;
+ if (defined $_[0]) {
+ $self->{top_k} = shift;
+ return $self;
+ };
+ return $self->{top_k};
+};
+
+
+
sub identify {
my ($self, $dict) = @_;
@@ -35,13 +47,13 @@
return Krawfish::Result::Segment::Sort::Sample->new(
$query,
- $self->{n}
+ $self->{top_k}
)
};
sub to_string {
- return 'sample(' . $_[0]->{n} . ':' . $_[0]->{query}->to_string . ')';
+ return 'sample(' . $_[0]->{top_k} . ':' . $_[0]->{query}->to_string . ')';
};
1;
diff --git a/lib/Krawfish/Koral/Meta/Sort.pm b/lib/Krawfish/Koral/Meta/Sort.pm
index a72ea18..0e2cb68 100644
--- a/lib/Krawfish/Koral/Meta/Sort.pm
+++ b/lib/Krawfish/Koral/Meta/Sort.pm
@@ -1,11 +1,15 @@
package Krawfish::Koral::Meta::Sort;
-use Krawfish::Result::Node::Sort;
use Krawfish::Koral::Meta::Node::Sort;
+use Krawfish::Koral::Meta::Sort::Field;
+use Krawfish::Koral::Meta::Type::Key;
use Krawfish::Log;
use strict;
use warnings;
-use constant DEBUG => 1;
+use constant {
+ DEBUG => 1,
+ UNIQUE_ID => 'id'
+};
# TODO:
# Support top_k setting from limit!
@@ -27,11 +31,17 @@
bless {
sort => [@_],
top_k => undef,
- filter => undef
+ filter => undef,
+ unique => UNIQUE_ID
}, $class;
};
+sub type {
+ 'sort';
+};
+
+
# Set or get the top_k limitation!
sub top_k {
my $self = shift;
@@ -43,6 +53,8 @@
};
+# Use sort filter (only possible, in case no aggregation
+# or grouping is applied)
sub filter {
my $self = shift;
if (defined $_[0]) {
@@ -83,10 +95,6 @@
};
-sub type {
- 'sort';
-};
-
# Remove duplicates
sub normalize {
@@ -94,6 +102,13 @@
my @unique;
my %unique;
my $sampling = 0;
+
+ # Add unique sorting to sort array
+ push @{$self->{sort}}, Krawfish::Koral::Meta::Sort::Field->new(
+ Krawfish::Koral::Meta::Type::Key->new($self->{unique})
+ );
+
+ # Normalize sorting
foreach (@{$self->{sort}}) {
# Sampling can't be combined with other sorting
@@ -104,16 +119,21 @@
return $_;
};
+ # Push unique sorting criteria to sorting array
unless (exists $unique{$_->to_string}) {
push @unique, $_;
$unique{$_->to_string} = 1;
};
};
+
+ # Create unique sort
@{$self->{sort}} = @unique;
+
return $self;
};
+# Wrap query object
sub wrap {
my ($self, $query) = @_;
return Krawfish::Koral::Meta::Node::Sort->new(
diff --git a/lib/Krawfish/Koral/Meta/Sort/Field.pm b/lib/Krawfish/Koral/Meta/Sort/Field.pm
index 1928cbc..3e56878 100644
--- a/lib/Krawfish/Koral/Meta/Sort/Field.pm
+++ b/lib/Krawfish/Koral/Meta/Sort/Field.pm
@@ -1,4 +1,5 @@
package Krawfish::Koral::Meta::Sort::Field;
+use Krawfish::Result::Segment::Sort::Field;
use Krawfish::Util::String qw/squote/;
use strict;
use warnings;
@@ -6,7 +7,7 @@
sub new {
my $class = shift;
bless {
- field => shift,
+ field => shift, # Is a Koral::Meta::Type::Key
desc => shift
}, $class;
};
@@ -19,6 +20,20 @@
return $_[0]->{field};
};
+sub desc {
+ return $_[0]->{desc};
+};
+
+sub optimize {
+ my ($self, $segment) = @_;
+
+ return Krawfish::Result::Segment::Sort::Field->new(
+ $segment,
+ $_[0]->{field}, # Should probably be a field_id!
+ $_[0]->{desc}
+ );
+};
+
sub identify {
my ($self, $dict) = @_;
my $field = $self->{field}->identify($dict);
diff --git a/lib/Krawfish/Result/Segment/Sort.pm b/lib/Krawfish/Result/Segment/Sort.pm
index 5489a9d..1d6b4a2 100644
--- a/lib/Krawfish/Result/Segment/Sort.pm
+++ b/lib/Krawfish/Result/Segment/Sort.pm
@@ -50,6 +50,8 @@
# TODO:
# It should be possible to add the sorting criteria.
+
+# Constructor
sub new {
my $class = shift;
my %param = @_;
diff --git a/lib/Krawfish/Result/Segment/Sort/Field.pm b/lib/Krawfish/Result/Segment/Sort/Field.pm
index 85b400b..9ae4cfe 100644
--- a/lib/Krawfish/Result/Segment/Sort/Field.pm
+++ b/lib/Krawfish/Result/Segment/Sort/Field.pm
@@ -12,16 +12,16 @@
my $class = shift;
my $self = bless {
- index => shift,
- field => shift,
+ segment => shift,
+ field => shift,
descending => shift // 0
}, $class;
# Get ranking
- $self->{ranks} = $self->{index}->fields->ranked_by($field) or return;
+ # $self->{ranks} = $self->{index}->fields->ranked_by($field) or return;
# Get maximum rank if descending order
- $self->{max} = $self->{ranks}->max if $self->{descending};
+ # $self->{max} = $self->{ranks}->max if $self->{descending};
return $self;
};
@@ -29,13 +29,13 @@
sub get {
my $self = shift;
- my $max = $ranking->max if $desc;
+# my $max = $ranking->max if $desc;
# Get stored rank
- $rank = $ranking->get(shift);
+# $rank = $ranking->get(shift);
# Revert if maximum rank is set
- return $max ? $max - $rank : $rank;
+# return $max ? $max - $rank : $rank;
};
diff --git a/lib/Krawfish/Result/Segment/Sort/MultiField.pm b/lib/Krawfish/Result/Segment/Sort/MultiField.pm
deleted file mode 100644
index 25159cb..0000000
--- a/lib/Krawfish/Result/Segment/Sort/MultiField.pm
+++ /dev/null
@@ -1,30 +0,0 @@
-package Krawfish::Result::Segment::Sort::MultiField;
-use strict;
-use warnings;
-
-# Sorting criterion for multi value field ranks.
-
-# See Krawfish::Result::Segment::Sort::Field
-
-sub new {
- my $class = shift;
-
- my $self = bless {
- index => shift,
- field => shift,
- descending => shift // 0
- }, $class;
-
- # Get ranking
- # TODO: Depending on the descending value, different rankings need to be loaded
-
- return $self;
-};
-
-
-sub get {
- my $self = shift;
-
- # Get stored rank
- return $self->{ranking}->get(shift);
-};
diff --git a/lib/Krawfish/Result/Sort/Random.pm b/lib/Krawfish/Result/Segment/Sort/Random.pm
similarity index 61%
rename from lib/Krawfish/Result/Sort/Random.pm
rename to lib/Krawfish/Result/Segment/Sort/Random.pm
index c302664..1fedc12 100644
--- a/lib/Krawfish/Result/Sort/Random.pm
+++ b/lib/Krawfish/Result/Segment/Sort/Random.pm
@@ -1 +1,3 @@
+# Return all elements in random order
+
# https://en.wikipedia.org/wiki/Fisher%E2%80%93Yates_shuffle
diff --git a/lib/Krawfish/Result/Segment/Sort/Sample.pm b/lib/Krawfish/Result/Segment/Sort/Sample.pm
index 70def45..247e086 100644
--- a/lib/Krawfish/Result/Segment/Sort/Sample.pm
+++ b/lib/Krawfish/Result/Segment/Sort/Sample.pm
@@ -3,16 +3,21 @@
use strict;
use warnings;
-# https://en.wikipedia.org/wiki/Reservoir_sampling
-# https://webkist.wordpress.com/2008/10/01/reservoir-sampling-in-perl/
-# https://blogs.msdn.microsoft.com/spt/2008/02/05/reservoir-sampling/
-
-# A. Anagnostopoulos, A. Z. Broder, and D. Carmel. Sampling search-engine results. In Proc. of the Fourteenth International World Wide Web Conference, Chiba, Japan, 2005. ACM Press.
-
+# Sort all matches in random order and only return the top_k
+# results. This is implemented using reservoir sampling.
+# Difference to random sorting is, this won't randomly sort all
+# results, making paging possible.
# WARNING:
-# Sorting does not respect current_match of any nested query, that's why
-# sorting is always separated from enriching!
+# Sorting does not respect current_match of any nested query,
+# that's why sorting is always separated from enriching!
+
+# See
+# https://en.wikipedia.org/wiki/Reservoir_sampling
+# https://webkist.wordpress.com/2008/10/01/reservoir-sampling-in-perl/
+# https://blogs.msdn.microsoft.com/spt/2008/02/05/reservoir-sampling/
+# A. Anagnostopoulos, A. Z. Broder, and D. Carmel. Sampling search-engine results. In Proc. of the Fourteenth International World Wide Web Conference, Chiba, Japan, 2005. ACM Press.
+
use constant DEBUG => 1;
diff --git a/lib/Krawfish/Result/Segment/Sort/SubTerm.pm b/lib/Krawfish/Result/Segment/Sort/SubTerm.pm
index b86e964..111af25 100644
--- a/lib/Krawfish/Result/Segment/Sort/SubTerm.pm
+++ b/lib/Krawfish/Result/Segment/Sort/SubTerm.pm
@@ -1,7 +1,9 @@
-package Krawfish::Result::Segment::Sort::Field;
+package Krawfish::Result::Segment::Sort::SubTerm;
use strict;
use warnings;
+# This will sort based on a pre-ranked subterm
+
sub new {
my $class = shift;