Simplified Ranking algorithm to be nestable
Change-Id: I622f6eab5621d7ed26dffed06fe864a44dc21c30
diff --git a/lib/Krawfish/Index/Fields/Rank.pm b/lib/Krawfish/Index/Fields/Rank.pm
index f51abc2..7b7e97f 100644
--- a/lib/Krawfish/Index/Fields/Rank.pm
+++ b/lib/Krawfish/Index/Fields/Rank.pm
@@ -58,6 +58,10 @@
};
+sub max_rank {
+ $_[0]->{max_rank};
+};
+
# Prepare the plain list for merging
# or - for the moment - to become
diff --git a/lib/Krawfish/Meta/Segment/Sort.pm b/lib/Krawfish/Meta/Segment/Sort.pm
index 1c5f64c..dbdde15 100644
--- a/lib/Krawfish/Meta/Segment/Sort.pm
+++ b/lib/Krawfish/Meta/Segment/Sort.pm
@@ -12,7 +12,7 @@
# This is the general sorting implementation based on ranks.
use constant {
- DEBUG => 0,
+ DEBUG => 1,
RANK => 0,
SAME => 1,
VALUE => 2,
@@ -48,6 +48,7 @@
$_[0]->{query}->max_freq;
};
+
# Constructor
sub new {
my $class = shift;
@@ -62,6 +63,11 @@
# This is the sort criterion
my $sort = $param{sort};
+ # Set top_k if not yet set
+ # - to be honest, heap sort is probably not the
+ # best approach for a full search
+ $top_k //= $segment->last_doc;
+
# The maximum ranking value may be used
# by outside filters to know in advance,
# if a document can't be part of the result set
@@ -85,7 +91,7 @@
# Construct
return bless {
- segment => $segment,
+ segment => $segment,
top_k => $top_k,
query => $query,
queue => $queue,
@@ -112,32 +118,32 @@
my $query = $self->{query};
- # Get first sorting criterion
- my ($field, $desc) = @{$self->{fields}->[0]};
-
- # Get ranking
- # This is already either desc or asc (for fields)
# TODO:
# Make this work for terms as well!
- my $ranking = $self->{ranks};
- # Get maximum rank if descending order
- # my $max = $ranking->max if $desc;
+ # TODO:
+ # This currently only works for fields,
+ # because it bundles document ids.
+ # The prebundling of documents may be
+ # done in a separated step.
# Get maximum accepted rank from queue
my $max_rank_ref = $self->{max_rank_ref};
my $last_doc_id = -1;
- my $rank;
my $queue = $self->{queue};
+ my $sort = $self->{sort};
+
# Store the last match buffered
my $match;
if (DEBUG) {
- print_log('p_sort', qq!Next Rank on field "$field"!);
+ print_log('p_sort', qq!Next Rank on field #! . $sort->term_id);
};
+ my $rank;
+
# Pass through all queries
while ($match || ($query->next && ($match = $query->current))) {
@@ -146,55 +152,48 @@
};
# Get stored rank
- $rank = $ranking->get($match->doc_id);
-
- # Revert if maximum rank is set
- # TODO:
- # That should be done in the
- # Krawfish::Meta::Segment::Sort::*-Object!
- # $rank = $max - $rank if $max;
+ $rank = $sort->rank_for($match->doc_id);
if (DEBUG) {
print_log('p_sort', 'Rank for doc id ' . $match->doc_id . " is $rank");
};
# Precheck if the match is relevant
- if ($rank <= $$max_rank_ref) {
+ if ($rank > $$max_rank_ref) {
- # Create new bundle of matches
- my $bundle = Krawfish::Posting::Bundle->new($match->clone);
-
- # Remember doc_id
- $last_doc_id = $match->doc_id;
+ # Document is irrelevant
$match = undef;
-
- # Iterate over next queries
- while ($query->next) {
-
- # New match should join the bundle
- if ($query->current->doc_id == $last_doc_id) {
-
- # Add match to bundle
- $bundle->add($query->current);
- }
-
- # New match is new
- else {
-
- # Remember match for the next tome
- $match = $query->current;
- last;
- };
- };
-
- # Insert into priority queue
- $queue->insert([$rank, 0, $bundle, $bundle->length]) if $bundle;
- }
-
- # Document is irrelevant
- else {
- $match = undef;
+ next;
};
+
+ # Create new bundle of matches
+ my $bundle = Krawfish::Posting::Bundle->new($match->clone);
+
+ # Remember doc_id
+ $last_doc_id = $match->doc_id;
+ $match = undef;
+
+ # Iterate over next queries
+ while ($query->next) {
+
+ # New match should join the bundle
+ if ($query->current->doc_id == $last_doc_id) {
+
+ # Add match to bundle
+ $bundle->add($query->current);
+ }
+
+ # New match is new
+ else {
+
+ # Remember match for the next tome
+ $match = $query->current;
+ last;
+ };
+ };
+
+ # Insert into priority queue
+ $queue->insert([$rank, 0, $bundle, $bundle->length]) if $bundle;
};
print_log('p_sort', 'Get list ranking') if DEBUG;
@@ -204,11 +203,12 @@
};
+
# Move to the next item in the sorted list
sub next {
my $self = shift;
- if ($self->{pos}++ >= $self->{top_k}) {
+ if ($self->{top_k} && $self->{pos}++ >= $self->{top_k}) {
if (DEBUG) {
print_log(
@@ -221,6 +221,7 @@
return;
};
+
# Initialize query - this will do a full run on the first field level!
$self->_init;
@@ -248,10 +249,6 @@
# Get the list values
my $stack = $self->{stack};
- # The result list is empty - sort next items
- # if ($self->{presorted}) {
- # };
-
# This will get the level from the stack
my $level = $#{$stack};
@@ -268,9 +265,9 @@
pop @$stack;
$level--;
- if (DEBUG) {
- print_log('p_sort', "Stack is reduced to level $level with " . Dumper($stack));
- };
+ #if (DEBUG) {
+ # print_log('p_sort', "Stack is reduced to level $level with " . Dumper($stack));
+ #};
};
# There is nothing to sort further
@@ -291,7 +288,6 @@
# here the next strategy should be chosen.
# Either sort in place, or sort using heapsort again.
-
# The first item in the current list has multiple identical ranks
# As long as the first item in the list has duplicates,
# order by the next level
@@ -309,30 +305,33 @@
my @presort = splice(@{$stack->[$level]}, 0, $same - 1);
print_log('p_sort', 'Presort array is ' . _string_array(\@presort)) if DEBUG;
- # TODO: Push presort on the stack!
+
+ # TODO:
+ # Push presort on the stack!
# This is the new top_k!
- # TODO: Check if this is really correct!
+ # TODO:
+ # Check if this is really correct!
my $top_k = $self->{top_k} - ($self->{pos} - 1);
# Get next field to rank on level
# level 0 is preinitialized, so it is one off
- my ($field, $desc) = @{$self->{fields}->[$level + 1]};
+ #my ($field, $desc) = @{$self->{fields}->[$level + 1]};
- if (DEBUG) {
- print_log('p_sort', qq!Next Rank on field "$field"!);
- };
+ #if (DEBUG) {
+ # print_log('p_sort', qq!Next Rank on field "$field"!);
+ #};
- $level++;
+ #$level++;
- # TODO:
- # If the same count is smaller than X (at least top_k - pos)
- # do quicksort or something similar
- # if ($same < $top_k || $same < 128) {
- # }
- # else
- $stack->[$level] = $self->heap_sort($top_k, \@presort, $field, $desc);
- # };
+ ## TODO:
+ ## If the same count is smaller than X (at least top_k - pos)
+ ## do quicksort or something similar
+ ## if ($same < $top_k || $same < 128) {
+ ## }
+ ## else
+ #$stack->[$level] = $self->heap_sort($top_k, \@presort, $field, $desc);
+ ## };
if (DEBUG) {
print_log(
@@ -364,6 +363,59 @@
};
+
+# Return the current match
+sub current {
+
+ if (DEBUG) {
+ print_log('p_sort', 'Current posting is ' . $_[0]->{current}->to_string);
+ };
+
+ $_[0]->{current};
+};
+
+
+# Get the current match object
+sub current_match {
+ my $self = shift;
+ my $current = $self->current or return;
+ my $match = Krawfish::Koral::Result::Match->new(
+ doc_id => $current->doc_id,
+ start => $current->start,
+ end => $current->end,
+ payload => $current->payload,
+ );
+
+ if (DEBUG) {
+ print_log('p_sort', 'Current match is ' . $match->to_string);
+ };
+
+ return $match;
+};
+
+
+# Return the number of duplicates of the current match
+sub duplicate_rank {
+ my $self = shift;
+
+ if (DEBUG) {
+ print_log('p_sort', 'Check for duplicates from index ' . $self->{pos});
+ };
+
+ return $self->{list}->[$self->{pos}]->[1] || 1;
+};
+
+
+sub to_string {
+ my $self = shift;
+ my $str = 'sort(';
+ $str .= $self->{sort}->to_string;
+ $str .= ',0-' . $self->{top_k} if $self->{top_k};
+ $str .= ':' . $self->{query}->to_string;
+ return $str . ')';
+};
+
+
sub _string_array {
my $array = shift;
my $str = '';
@@ -378,11 +430,17 @@
};
+
+
+
+
# Todo:
# Accept an iterator, a ranking, and return an iterator
sub heap_sort {
my ($self, $top_k, $sub_list, $field, $desc) = @_;
+ warn 'DEPRECATED';
+
if (DEBUG) {
print_log('p_sort', 'Heapsort list of length ' . scalar(@$sub_list) .
qq! by field "$field" for top_k = $top_k!);
@@ -425,53 +483,6 @@
};
-# Return the current match
-sub current {
-
- if (DEBUG) {
- print_log('p_sort', 'Current posting is ' . $_[0]->{current}->to_string);
- };
-
- $_[0]->{current};
-};
-
-sub current_match {
- my $self = shift;
- my $current = $self->current or return;
- my $match = Krawfish::Koral::Result::Match->new(
- doc_id => $current->doc_id,
- start => $current->start,
- end => $current->end,
- payload => $current->payload,
- );
-
- if (DEBUG) {
- print_log('p_sort', 'Current match is ' . $match->to_string);
- };
-
- return $match;
-};
-
-# Return the number of duplicates of the current match
-sub duplicate_rank {
- my $self = shift;
-
- if (DEBUG) {
- print_log('p_sort', 'Check for duplicates from index ' . $self->{pos});
- };
-
- return $self->{list}->[$self->{pos}]->[1] || 1;
-};
-
-
-sub to_string {
- my $self = shift;
- my $str = 'sort(';
- $str .= $self->{sort}->to_string;
- $str .= ',0-' . $self->{top_k} if $self->{top_k};
- $str .= ':' . $self->{query}->to_string;
- return $str . ')';
-};
1;
diff --git a/lib/Krawfish/Meta/Segment/Sort/Field.pm b/lib/Krawfish/Meta/Segment/Sort/Field.pm
index ab6c85c..2abe170 100644
--- a/lib/Krawfish/Meta/Segment/Sort/Field.pm
+++ b/lib/Krawfish/Meta/Segment/Sort/Field.pm
@@ -15,6 +15,9 @@
# Probably not only support ranks but all kinds of sorting
# by having a get_lt() API that also works for strings!
+# TODO:
+# Return max rank for unknown fields!
+
sub new {
my $class = shift;
@@ -26,8 +29,9 @@
return unless $rank;
my $self = bless {
- field_id => $field_id,
- desc => $descending
+ field_id => $field_id,
+ desc => $descending,
+ max_rank => $rank->max_rank
}, $class;
# Get fields in descending order
@@ -35,12 +39,12 @@
# This may be a real descending order file
# or a reversed single-valued ascending order file
- $self->{rank} = $rank->descending;
+ $self->{rank} = $rank->descending or return;
}
# Get fields in ascending order
else {
- $self->{rank} = $rank->ascending;
+ $self->{rank} = $rank->ascending or return;
};
return $self;
@@ -64,6 +68,13 @@
$_[0]->{field_id};
};
+sub max_rank {
+ $_[0]->{max_rank}
+}
+
+sub term_id {
+ $_[0]->{field_id};
+};
# TODO:
# This may need to be an inflatable
diff --git a/lib/Krawfish/Query/TermID.pm b/lib/Krawfish/Query/TermID.pm
index 58f35e9..a5e1996 100644
--- a/lib/Krawfish/Query/TermID.pm
+++ b/lib/Krawfish/Query/TermID.pm
@@ -6,8 +6,7 @@
use strict;
use warnings;
-
-use constant DEBUG => 1;
+use constant DEBUG => 0;
# Constructor
sub new {
diff --git a/lib/Krawfish/Util/PrioritySort.pm b/lib/Krawfish/Util/PrioritySort.pm
index 2a4cd11..b9781f1 100644
--- a/lib/Krawfish/Util/PrioritySort.pm
+++ b/lib/Krawfish/Util/PrioritySort.pm
@@ -38,7 +38,7 @@
# with the same rank. For example, multiple matches in a document.
#
use constant {
- DEBUG => 0,
+ DEBUG => 1,
RANK => 0,
SAME => 1, # 0 means: not checked yet!
VALUE => 2
diff --git a/t/meta/segment/sort_field.t b/t/meta/segment/sort_field.t
index d4f13c2..331f462 100644
--- a/t/meta/segment/sort_field.t
+++ b/t/meta/segment/sort_field.t
@@ -45,6 +45,8 @@
age => 7
} => [qw/aa bb/], 'Add complex document');
+ok($index->commit, 'Commit data');
+
my $koral = Krawfish::Koral->new;
my $qb = $koral->query_builder;
my $mb = $koral->meta_builder;
@@ -56,13 +58,36 @@
)
);
+# For simplicity
+$koral->meta(
+ $mb->sort_by(
+ $mb->s_field('id')
+ )
+);
+
+ok(my $query = $koral->to_query, 'Normalize');
+
+ok($query = $query->identify($index->dict)->optimize($index->segment), 'optimize');
+
+is($query->to_string, 'sort(field=#1<,0-4:constr(pos=2:#10,filter(#12,[1])))', 'Stringification');
+
+ok($query->next, 'Move to next match');
+
+
+
+done_testing;
+__END__
+
+
$koral->meta(
$mb->sort_by(
$mb->s_field('author')
)
);
-ok(my $query = $koral->to_query, 'Normalize');
+
+# Check for multiple fields in order
+ok($query = $koral->to_query, 'Normalize');
is($query->to_string, "sort(field='id'<:sort(field='author'<:filter(aabb,[1])))", 'Stringification');
@@ -78,6 +103,7 @@
'sort(field=#1<:sort(field=#2<:constr(pos=2:#10,filter(#12,[1]))))',
'Stringification');
+
diag 'check sorting';
done_testing;
@@ -87,6 +113,9 @@
+
+
+
# TODO: Introduce sortfilter
$koral->meta(