Simplified Ranking algorithm to be nestable Change-Id: I622f6eab5621d7ed26dffed06fe864a44dc21c30

commit: 96c2f14a767e4633086196dc4039afc2b88a9110 [log] [tgz]
author: Akron <nils@diewald-online.de> Thu Sep 28 13:13:15 2017 +0200
committer: Akron <nils@diewald-online.de> Thu Sep 28 13:13:15 2017 +0200
tree: 8f83676d353cfade23e8f7a123435ad49278b31b
parent: 510baba6a0a36ebc1083f6dfd850f137328811b8 [diff]
diff --git a/lib/Krawfish/Index/Fields/Rank.pm b/lib/Krawfish/Index/Fields/Rank.pm
index f51abc2..7b7e97f 100644
--- a/lib/Krawfish/Index/Fields/Rank.pm
+++ b/lib/Krawfish/Index/Fields/Rank.pm

@@ -58,6 +58,10 @@
 };
 
 
+sub max_rank {
+  $_[0]->{max_rank};
+};
+
 
 # Prepare the plain list for merging
 # or - for the moment - to become

diff --git a/lib/Krawfish/Meta/Segment/Sort.pm b/lib/Krawfish/Meta/Segment/Sort.pm
index 1c5f64c..dbdde15 100644
--- a/lib/Krawfish/Meta/Segment/Sort.pm
+++ b/lib/Krawfish/Meta/Segment/Sort.pm

@@ -12,7 +12,7 @@
 # This is the general sorting implementation based on ranks.
 
 use constant {
-  DEBUG   => 0,
+  DEBUG   => 1,
   RANK    => 0,
   SAME    => 1,
   VALUE   => 2,
@@ -48,6 +48,7 @@
   $_[0]->{query}->max_freq;
 };
 
+
 # Constructor
 sub new {
   my $class = shift;
@@ -62,6 +63,11 @@
   # This is the sort criterion
   my $sort     = $param{sort};
 
+  # Set top_k if not yet set
+  # - to be honest, heap sort is probably not the
+  # best approach for a full search
+  $top_k //= $segment->last_doc;
+
   # The maximum ranking value may be used
   # by outside filters to know in advance,
   # if a document can't be part of the result set
@@ -85,7 +91,7 @@
 
   # Construct
   return bless {
-    segment        => $segment,
+    segment      => $segment,
     top_k        => $top_k,
     query        => $query,
     queue        => $queue,
@@ -112,32 +118,32 @@
 
   my $query = $self->{query};
 
-  # Get first sorting criterion
-  my ($field, $desc) = @{$self->{fields}->[0]};
-
-  # Get ranking
-  # This is already either desc or asc (for fields)
   # TODO:
   #   Make this work for terms as well!
-  my $ranking = $self->{ranks};
 
-  # Get maximum rank if descending order
-  # my $max = $ranking->max if $desc;
+  # TODO:
+  #   This currently only works for fields,
+  #   because it bundles document ids.
+  #   The prebundling of documents may be
+  #   done in a separated step.
 
   # Get maximum accepted rank from queue
   my $max_rank_ref = $self->{max_rank_ref};
 
   my $last_doc_id = -1;
-  my $rank;
   my $queue = $self->{queue};
 
+  my $sort = $self->{sort};
+
   # Store the last match buffered
   my $match;
 
   if (DEBUG) {
-    print_log('p_sort', qq!Next Rank on field "$field"!);
+    print_log('p_sort', qq!Next Rank on field #! . $sort->term_id);
   };
 
+  my $rank;
+
   # Pass through all queries
   while ($match || ($query->next && ($match = $query->current))) {
 
@@ -146,55 +152,48 @@
     };
 
     # Get stored rank
-    $rank = $ranking->get($match->doc_id);
-
-    # Revert if maximum rank is set
-    # TODO:
-    #   That should be done in the
-    #   Krawfish::Meta::Segment::Sort::*-Object!
-    # $rank = $max - $rank if $max;
+    $rank = $sort->rank_for($match->doc_id);
 
     if (DEBUG) {
       print_log('p_sort', 'Rank for doc id ' . $match->doc_id . " is $rank");
     };
 
     # Precheck if the match is relevant
-    if ($rank <= $$max_rank_ref) {
+    if ($rank > $$max_rank_ref) {
 
-      # Create new bundle of matches
-      my $bundle = Krawfish::Posting::Bundle->new($match->clone);
-
-      # Remember doc_id
-      $last_doc_id = $match->doc_id;
+      # Document is irrelevant
       $match = undef;
-
-      # Iterate over next queries
-      while ($query->next) {
-
-        # New match should join the bundle
-        if ($query->current->doc_id == $last_doc_id) {
-
-          # Add match to bundle
-          $bundle->add($query->current);
-        }
-
-        # New match is new
-        else {
-
-          # Remember match for the next tome
-          $match = $query->current;
-          last;
-        };
-      };
-
-      # Insert into priority queue
-      $queue->insert([$rank, 0, $bundle, $bundle->length]) if $bundle;
-    }
-
-    # Document is irrelevant
-    else {
-      $match = undef;
+      next;
     };
+
+    # Create new bundle of matches
+    my $bundle = Krawfish::Posting::Bundle->new($match->clone);
+
+    # Remember doc_id
+    $last_doc_id = $match->doc_id;
+    $match = undef;
+
+    # Iterate over next queries
+    while ($query->next) {
+
+      # New match should join the bundle
+      if ($query->current->doc_id == $last_doc_id) {
+
+        # Add match to bundle
+        $bundle->add($query->current);
+      }
+
+      # New match is new
+      else {
+
+        # Remember match for the next tome
+        $match = $query->current;
+        last;
+      };
+    };
+
+    # Insert into priority queue
+    $queue->insert([$rank, 0, $bundle, $bundle->length]) if $bundle;
   };
 
   print_log('p_sort', 'Get list ranking') if DEBUG;
@@ -204,11 +203,12 @@
 };
 
 
+
 # Move to the next item in the sorted list
 sub next {
   my $self = shift;
 
-  if ($self->{pos}++ >= $self->{top_k}) {
+  if ($self->{top_k} && $self->{pos}++ >= $self->{top_k}) {
 
     if (DEBUG) {
       print_log(
@@ -221,6 +221,7 @@
     return;
   };
 
+
   # Initialize query - this will do a full run on the first field level!
   $self->_init;
 
@@ -248,10 +249,6 @@
   # Get the list values
   my $stack = $self->{stack};
 
-  # The result list is empty - sort next items
-  #  if ($self->{presorted}) {
-  #  };
-
   # This will get the level from the stack
   my $level = $#{$stack};
 
@@ -268,9 +265,9 @@
     pop @$stack;
     $level--;
 
-    if (DEBUG) {
-      print_log('p_sort', "Stack is reduced to level $level with " . Dumper($stack));
-    };
+    #if (DEBUG) {
+    #  print_log('p_sort', "Stack is reduced to level $level with " . Dumper($stack));
+    #};
   };
 
   # There is nothing to sort further
@@ -291,7 +288,6 @@
   #   here the next strategy should be chosen.
   #   Either sort in place, or sort using heapsort again.
 
-
   # The first item in the current list has multiple identical ranks
   # As long as the first item in the list has duplicates,
   # order by the next level
@@ -309,30 +305,33 @@
     my @presort = splice(@{$stack->[$level]}, 0, $same - 1);
 
     print_log('p_sort', 'Presort array is ' . _string_array(\@presort)) if DEBUG;
-    # TODO: Push presort on the stack!
+
+    # TODO:
+    #   Push presort on the stack!
 
     # This is the new top_k!
-    # TODO: Check if this is really correct!
+    # TODO:
+    #   Check if this is really correct!
     my $top_k = $self->{top_k} - ($self->{pos} - 1);
 
     # Get next field to rank on level
     # level 0 is preinitialized, so it is one off
-    my ($field, $desc) = @{$self->{fields}->[$level + 1]};
+    #my ($field, $desc) = @{$self->{fields}->[$level + 1]};
 
-    if (DEBUG) {
-      print_log('p_sort', qq!Next Rank on field "$field"!);
-    };
+    #if (DEBUG) {
+    #  print_log('p_sort', qq!Next Rank on field "$field"!);
+    #};
 
-    $level++;
+    #$level++;
 
-    # TODO:
-    #   If the same count is smaller than X (at least top_k - pos)
-    #   do quicksort or something similar
-    # if ($same < $top_k || $same < 128) {
-    # }
-    # else
-    $stack->[$level] = $self->heap_sort($top_k, \@presort, $field, $desc);
-    # };
+    ## TODO:
+    ##   If the same count is smaller than X (at least top_k - pos)
+    ##   do quicksort or something similar
+    ## if ($same < $top_k || $same < 128) {
+    ## }
+    ## else
+    #$stack->[$level] = $self->heap_sort($top_k, \@presort, $field, $desc);
+    ## };
 
     if (DEBUG) {
       print_log(
@@ -364,6 +363,59 @@
 };
 
 
+
+# Return the current match
+sub current {
+
+  if (DEBUG) {
+    print_log('p_sort', 'Current posting is ' . $_[0]->{current}->to_string);
+  };
+
+  $_[0]->{current};
+};
+
+
+# Get the current match object
+sub current_match {
+  my $self = shift;
+  my $current = $self->current or return;
+  my $match = Krawfish::Koral::Result::Match->new(
+    doc_id  => $current->doc_id,
+    start   => $current->start,
+    end     => $current->end,
+    payload => $current->payload,
+  );
+
+  if (DEBUG) {
+    print_log('p_sort', 'Current match is ' . $match->to_string);
+  };
+
+  return $match;
+};
+
+
+# Return the number of duplicates of the current match
+sub duplicate_rank {
+  my $self = shift;
+
+  if (DEBUG) {
+    print_log('p_sort', 'Check for duplicates from index ' . $self->{pos});
+  };
+
+  return $self->{list}->[$self->{pos}]->[1] || 1;
+};
+
+
+sub to_string {
+  my $self = shift;
+  my $str = 'sort(';
+  $str .= $self->{sort}->to_string;
+  $str .= ',0-' . $self->{top_k} if $self->{top_k};
+  $str .= ':' . $self->{query}->to_string;
+  return $str . ')';
+};
+
+
 sub _string_array {
   my $array = shift;
   my $str = '';
@@ -378,11 +430,17 @@
 };
 
 
+
+
+
+
 # Todo:
 #   Accept an iterator, a ranking, and return an iterator
 sub heap_sort {
   my ($self, $top_k, $sub_list, $field, $desc) = @_;
 
+  warn 'DEPRECATED';
+
   if (DEBUG) {
     print_log('p_sort', 'Heapsort list of length ' . scalar(@$sub_list) .
                 qq! by field "$field" for top_k = $top_k!);
@@ -425,53 +483,6 @@
 };
 
 
-# Return the current match
-sub current {
-
-  if (DEBUG) {
-    print_log('p_sort', 'Current posting is ' . $_[0]->{current}->to_string);
-  };
-
-  $_[0]->{current};
-};
-
-sub current_match {
-  my $self = shift;
-  my $current = $self->current or return;
-  my $match = Krawfish::Koral::Result::Match->new(
-    doc_id  => $current->doc_id,
-    start   => $current->start,
-    end     => $current->end,
-    payload => $current->payload,
-  );
-
-  if (DEBUG) {
-    print_log('p_sort', 'Current match is ' . $match->to_string);
-  };
-
-  return $match;
-};
-
-# Return the number of duplicates of the current match
-sub duplicate_rank {
-  my $self = shift;
-
-  if (DEBUG) {
-    print_log('p_sort', 'Check for duplicates from index ' . $self->{pos});
-  };
-
-  return $self->{list}->[$self->{pos}]->[1] || 1;
-};
-
-
-sub to_string {
-  my $self = shift;
-  my $str = 'sort(';
-  $str .= $self->{sort}->to_string;
-  $str .= ',0-' . $self->{top_k} if $self->{top_k};
-  $str .= ':' . $self->{query}->to_string;
-  return $str . ')';
-};
 
 
 1;

diff --git a/lib/Krawfish/Meta/Segment/Sort/Field.pm b/lib/Krawfish/Meta/Segment/Sort/Field.pm
index ab6c85c..2abe170 100644
--- a/lib/Krawfish/Meta/Segment/Sort/Field.pm
+++ b/lib/Krawfish/Meta/Segment/Sort/Field.pm

@@ -15,6 +15,9 @@
 #   Probably not only support ranks but all kinds of sorting
 #   by having a get_lt() API that also works for strings!
 
+# TODO:
+#   Return max rank for unknown fields!
+
 sub new {
   my $class = shift;
 
@@ -26,8 +29,9 @@
   return unless $rank;
 
   my $self = bless {
-    field_id   => $field_id,
-    desc       => $descending
+    field_id => $field_id,
+    desc     => $descending,
+    max_rank => $rank->max_rank
   }, $class;
 
   # Get fields in descending order
@@ -35,12 +39,12 @@
 
     # This may be a real descending order file
     # or a reversed single-valued ascending order file
-    $self->{rank} = $rank->descending;
+    $self->{rank} = $rank->descending or return;
   }
 
   # Get fields in ascending order
   else {
-    $self->{rank} = $rank->ascending;
+    $self->{rank} = $rank->ascending or return;
   };
 
   return $self;
@@ -64,6 +68,13 @@
   $_[0]->{field_id};
 };
 
+sub max_rank {
+  $_[0]->{max_rank}
+}
+
+sub term_id {
+  $_[0]->{field_id};
+};
 
 # TODO:
 #   This may need to be an inflatable

diff --git a/lib/Krawfish/Query/TermID.pm b/lib/Krawfish/Query/TermID.pm
index 58f35e9..a5e1996 100644
--- a/lib/Krawfish/Query/TermID.pm
+++ b/lib/Krawfish/Query/TermID.pm

@@ -6,8 +6,7 @@
 use strict;
 use warnings;
 
-
-use constant DEBUG => 1;
+use constant DEBUG => 0;
 
 # Constructor
 sub new {

diff --git a/lib/Krawfish/Util/PrioritySort.pm b/lib/Krawfish/Util/PrioritySort.pm
index 2a4cd11..b9781f1 100644
--- a/lib/Krawfish/Util/PrioritySort.pm
+++ b/lib/Krawfish/Util/PrioritySort.pm

@@ -38,7 +38,7 @@
 # with the same rank. For example, multiple matches in a document.
 #
 use constant {
-  DEBUG => 0,
+  DEBUG => 1,
   RANK  => 0,
   SAME  => 1, # 0 means: not checked yet!
   VALUE => 2

diff --git a/t/meta/segment/sort_field.t b/t/meta/segment/sort_field.t
index d4f13c2..331f462 100644
--- a/t/meta/segment/sort_field.t
+++ b/t/meta/segment/sort_field.t

@@ -45,6 +45,8 @@
   age => 7
 } => [qw/aa bb/], 'Add complex document');
 
+ok($index->commit, 'Commit data');
+
 my $koral = Krawfish::Koral->new;
 my $qb = $koral->query_builder;
 my $mb = $koral->meta_builder;
@@ -56,13 +58,36 @@
   )
 );
 
+# For simplicity
+$koral->meta(
+  $mb->sort_by(
+    $mb->s_field('id')
+  )
+);
+
+ok(my $query = $koral->to_query, 'Normalize');
+
+ok($query = $query->identify($index->dict)->optimize($index->segment), 'optimize');
+
+is($query->to_string, 'sort(field=#1<,0-4:constr(pos=2:#10,filter(#12,[1])))', 'Stringification');
+
+ok($query->next, 'Move to next match');
+
+
+
+done_testing;
+__END__
+
+
 $koral->meta(
   $mb->sort_by(
     $mb->s_field('author')
   )
 );
 
-ok(my $query = $koral->to_query, 'Normalize');
+
+# Check for multiple fields in order
+ok($query = $koral->to_query, 'Normalize');
 
 is($query->to_string, "sort(field='id'<:sort(field='author'<:filter(aabb,[1])))", 'Stringification');
 
@@ -78,6 +103,7 @@
    'sort(field=#1<:sort(field=#2<:constr(pos=2:#10,filter(#12,[1]))))',
    'Stringification');
 
+
 diag 'check sorting';
 
 done_testing;
@@ -87,6 +113,9 @@
 
 
 
+
+
+
 # TODO: Introduce sortfilter
 
 $koral->meta(
commit	96c2f14a767e4633086196dc4039afc2b88a9110	[log] [tgz]
author	Akron <nils@diewald-online.de>	Thu Sep 28 13:13:15 2017 +0200
committer	Akron <nils@diewald-online.de>	Thu Sep 28 13:13:15 2017 +0200
tree	8f83676d353cfade23e8f7a123435ad49278b31b
parent	510baba6a0a36ebc1083f6dfd850f137328811b8 [diff]