Established second level sorting
Change-Id: Ie9433c62c6cf23a97059ebae214a29ae55681130
diff --git a/lib/Krawfish/Meta/Segment/Bundle.pm b/lib/Krawfish/Meta/Segment/Bundle.pm
new file mode 100644
index 0000000..4cf1812
--- /dev/null
+++ b/lib/Krawfish/Meta/Segment/Bundle.pm
@@ -0,0 +1,32 @@
+package Krawfish::Meta::Segment::Bundle;
+use parent 'Krawfish::Meta';
+
+sub current_bundle {
+ $_[0]->{current_bundle};
+};
+
+sub current_match {
+ $_[0]->{current_match};
+};
+
+sub current {
+ return $_[0]->{query}->current;
+};
+
+
+sub next_bundle {
+ ...
+};
+
+
+# These call methods in Posting::Bundle!
+sub next {
+ ...
+};
+
+sub max_freq {
+ $_[0]->{query}->max_freq;
+};
+
+
+1;
diff --git a/lib/Krawfish/Meta/Segment/BundleDocs.pm b/lib/Krawfish/Meta/Segment/BundleDocs.pm
index 121bc7c..b4bfe08 100644
--- a/lib/Krawfish/Meta/Segment/BundleDocs.pm
+++ b/lib/Krawfish/Meta/Segment/BundleDocs.pm
@@ -1,5 +1,5 @@
package Krawfish::Meta::Segment::BundleDocs;
-use parent 'Krawfish::Meta';
+use parent 'Krawfish::Meta::Segment::Bundle';
use Krawfish::Log;
use Krawfish::Posting::DocBundle;
use strict;
@@ -7,13 +7,17 @@
# Bundle matches in the same document.
+# TODO:
+# The problem with the current approch is, that next_bundle
+# will bundle the next doc - without asking if the doc
+# is relevant.
+
use constant DEBUG => 1;
sub new {
my $class = shift;
bless {
query => shift,
- next_item => undef,
buffer => undef
}, $class;
};
@@ -27,12 +31,41 @@
print_log('d_bundle', 'Get bundle');
};
- return $self->{current_bundle} if $self->{current_bundle};
+ return $self->{current_bundle};
+};
- my $first = $self->{next_item};
- # Need a next() first
- return unless $first;
+# TODO:
+# Implement next doc!
+sub next_doc {
+ ...
+};
+
+
+# Move to next bundle
+sub next_bundle {
+ my $self = shift;
+
+ # Reset current bundle
+ $self->{current_bundle} = undef;
+
+ my $first;
+
+ # There is a bundle on buffer
+ if ($self->{buffer}) {
+ $first = $self->{buffer};
+ $self->{buffer} = undef;
+ }
+
+ # Move forward
+ elsif ($self->{query}->next) {
+ $first = $self->{query}->current or return;
+ }
+
+ # Can't move forward
+ else {
+ return;
+ };
my $bundle = Krawfish::Posting::DocBundle->new($first);
my $query = $self->{query};
@@ -41,7 +74,6 @@
print_log('d_bundle', 'Start bundle with ' . $first->to_string);
};
-
# There is a next entry
my $next;
while ($query->next) {
@@ -60,41 +92,6 @@
};
$self->{current_bundle} = $bundle;
-
- return $bundle;
-};
-
-
-sub max_freq {
- $_[0]->{query}->max_freq;
-};
-
-sub current {
- return $_[0]->{query}->current;
-};
-
-
-# Move to next bundle
-sub next {
- my $self = shift;
- $self->{current_bundle} = undef;
-
- if ($self->{buffer}) {
- $self->{next_item} = $self->{buffer};
- $self->{buffer} = undef;
- }
-
- # Move forward
- elsif ($self->{query}->next) {
- $self->{next_item} = $self->{query}->current;
- }
-
- # Can't move forward
- else {
- $self->{next_item} = undef;
- return;
- };
-
return 1;
};
diff --git a/lib/Krawfish/Meta/Segment/Sort.pm b/lib/Krawfish/Meta/Segment/Sort.pm
index ba49bcb..8add0f5 100644
--- a/lib/Krawfish/Meta/Segment/Sort.pm
+++ b/lib/Krawfish/Meta/Segment/Sort.pm
@@ -50,7 +50,7 @@
my %param = @_;
if (DEBUG) {
- print_log('p_sort', 'Initiate sort object');
+ print_log('sort', 'Initiate sort object');
};
# TODO:
@@ -125,7 +125,7 @@
my $query = $self->{query};
if (DEBUG) {
- print_log('p_sort', 'Initialize sort object');
+ print_log('sort', 'Initialize sort object');
};
# TODO:
@@ -139,35 +139,32 @@
# Get maximum accepted rank from queue
my $max_rank_ref = $self->{max_rank_ref};
-
- my $last_doc_id = -1;
my $queue = $self->{queue};
-
my $sort = $self->{sort};
if (DEBUG) {
- print_log('p_sort', qq!Next Rank on field #! . $sort->term_id);
+ print_log('sort', qq!Next Rank on field #! . $sort->term_id);
};
# Store the last match buffered
my ($match, $rank);
# Init
- $query->next;
+ $query->next_bundle;
# Pass through all queries
- while ($match = $query->current) {
+ while ($match = $query->current_bundle) {
if (DEBUG) {
- print_log('p_sort', 'Get next posting from ' . $query->to_string);
+ print_log('sort', 'Get next posting from ' . $query->to_string);
};
# Get stored rank
$rank = $sort->rank_for($match->doc_id);
if (DEBUG) {
- print_log('p_sort', 'Rank for doc id ' . $match->doc_id . " is $rank");
- print_log('p_sort', "Check rank $rank against max rank " . $$max_rank_ref);
+ print_log('sort', 'Rank for doc id ' . $match->doc_id . " is $rank");
+ print_log('sort', "Check rank $rank against max rank " . $$max_rank_ref);
};
@@ -178,7 +175,7 @@
$match = undef;
if (DEBUG) {
- print_log('p_sort', 'Move to next doc');
+ print_log('sort', 'Move to next doc');
};
# Skip to next document
@@ -193,21 +190,15 @@
$queue->insert([$rank, 0, $bundle, $bundle->matches]) if $bundle;
if (DEBUG) {
- print_log('p_sort', 'Move to next position');
+ print_log('sort', 'Move to next position');
};
- # Move to next match
- $query->next;
+ # Move to next bundle
+ $query->next_bundle;
};
-
my $array = $queue->reverse_array;
- print_log('p_sort', 'Get list ranking of ' . Dumper($array)) if DEBUG;
-
- # Get the rank reference (old)
- # TODO:
- # Remove!
- $self->{stack} = [$array];
+ print_log('sort', 'Get list ranking of ' . Dumper($array)) if DEBUG;
# Get the rank reference (new);
$self->{new_sorted} = $array;
@@ -217,7 +208,7 @@
# Move to the next item in the bundled document list
# and create the next bundle
-sub next {
+sub next_bundle {
my $self = shift;
# Initialize query - this will do a full run on the first field level!
@@ -229,7 +220,7 @@
#
# if (DEBUG) {
# print_log(
- # 'p_sort',
+ # 'sort',
# 'top_k ' . $self->{top_k} . ' is reached at position ' . $self->{pos}
# );
# };
@@ -240,7 +231,7 @@
if ($self->{pos_in_sort} >= @{$self->{new_sorted}}) {
if (DEBUG) {
- print_log('p_sort', 'No more elements in the priority array');
+ print_log('sort', 'No more elements in the priority array');
};
$self->{current_bundle} = undef;
@@ -254,31 +245,35 @@
my $top_bundle = $self->{new_sorted}->[$pos];
if (DEBUG) {
- print_log('p_sort', "Move to next bundle at $pos, which is " . $top_bundle->[VALUE]);
+ print_log('sort', "Move to next bundle at $pos, which is " . $top_bundle->[VALUE]);
};
my $rank = $top_bundle->[RANK];
+ if (DEBUG) {
+ print_log('sort', "Create new bundle from prio at $pos starting with " .
+ $top_bundle->[VALUE]->to_string);
+ };
+
# TODO:
# Add criterion to bundle
my $new_bundle = Krawfish::Posting::Bundle->new($top_bundle->[VALUE]);
-
- if (DEBUG) {
- print_log('p_sort', "Get first bundle at $pos from prio " .
- $top_bundle->[VALUE]->to_string);
- };
-
$pos++;
for (; $pos < @{$self->{new_sorted}}; $pos++) {
$top_bundle = $self->{new_sorted}->[$pos];
if (DEBUG) {
- print_log('p_sort', 'Check follow up from prio: ' . $top_bundle->[VALUE]->to_string);
+ print_log('sort', 'Check follow up from prio: ' . $top_bundle->[VALUE]->to_string);
};
# Add element to bundle
if ($rank == $top_bundle->[RANK]) {
+
+ if (DEBUG) {
+ print_log('sort', "Both items have the same rank $rank");
+ };
+
unless ($new_bundle->add($top_bundle->[VALUE])) {
warn 'Unable to add bundle to new bundle';
};
@@ -293,6 +288,10 @@
# Get position
$self->{pos_in_sort} = $pos;
+ if (DEBUG) {
+ print_log('sort', 'Current bundle is now ' . $new_bundle->to_string);
+ };
+
# Set current bundle
$self->{current_bundle} = $new_bundle;
@@ -302,45 +301,43 @@
};
-sub next_bundle {
- ...
-};
-
sub current_bundle {
- my $self = shift;
- return $self->{current_bundle};
+ $_[0]->{current_bundle};
};
# Return the current match
sub current {
-
- if (DEBUG) {
- print_log('p_sort', 'Current posting is ' . $_[0]->{current}->to_string);
- };
-
- $_[0]->{current};
+ # if (DEBUG) {
+ # print_log('sort', 'Current posting is ' . $_[0]->{current}->to_string);
+ # };
+ #
+ # $_[0]->{current};
+ ...
};
# Get the current match object
sub current_match {
- my $self = shift;
- my $current = $self->current or return;
- my $match = Krawfish::Koral::Result::Match->new(
- doc_id => $current->doc_id,
- start => $current->start,
- end => $current->end,
- payload => $current->payload,
- );
-
- if (DEBUG) {
- print_log('p_sort', 'Current match is ' . $match->to_string);
- };
-
- return $match;
+ # my $self = shift;
+ # my $current = $self->current or return;
+ # my $match = Krawfish::Koral::Result::Match->new(
+ # doc_id => $current->doc_id,
+ # start => $current->start,
+ # end => $current->end,
+ # payload => $current->payload,
+ # );
+ #
+ # if (DEBUG) {
+ # print_log('sort', 'Current match is ' . $match->to_string);
+ # };
+ #
+ # return $match;
+ ...
};
+
+
sub to_string {
my $self = shift;
my $str = 'sort(';
@@ -381,7 +378,7 @@
if (DEBUG) {
print_log(
- 'p_sort',
+ 'sort',
'top_k ' . $self->{top_k} . ' is reached at position ' . $self->{pos}
);
};
@@ -402,7 +399,7 @@
if (DEBUG) {
print_log(
- 'p_sort',
+ 'sort',
'There is already a match in [sorted]: ' . $self->{current}->to_string,
);
};
@@ -412,7 +409,7 @@
# Nothing presorted
elsif (DEBUG) {
- print_log('p_sort', 'There is no match in [sorted]');
+ print_log('sort', 'There is no match in [sorted]');
};
# Get the list values
@@ -421,7 +418,7 @@
# This will get the level from the stack
my $level = $#{$stack};
- print_log('p_sort', "Check stack on current level $level") if DEBUG;
+ print_log('sort', "Check stack on current level $level") if DEBUG;
# If the current list is empty, remove from stack
while (scalar @$stack && (
@@ -429,20 +426,20 @@
!scalar(@{$stack->[$level]->[0]})
)) {
- print_log('p_sort', "Stack is empty at least on level $level") if DEBUG;
+ print_log('sort', "Stack is empty at least on level $level") if DEBUG;
pop @$stack;
$level--;
#if (DEBUG) {
- # print_log('p_sort', "Stack is reduced to level $level with " . Dumper($stack));
+ # print_log('sort', "Stack is reduced to level $level with " . Dumper($stack));
#};
};
# There is nothing to sort further
unless (scalar @$stack) {
- print_log('p_sort', 'There is nothing to sort further') if DEBUG;
+ print_log('sort', 'There is nothing to sort further') if DEBUG;
$self->{current} = undef;
return;
@@ -464,7 +461,7 @@
if (DEBUG) {
print_log(
- 'p_sort',
+ 'sort',
"Found $same matches at first node",
" on level $level in " . _string_array($stack->[$level])
);
@@ -473,7 +470,7 @@
# Get the identical elements from the list
my @presort = splice(@{$stack->[$level]}, 0, $same - 1);
- print_log('p_sort', 'Presort array is ' . _string_array(\@presort)) if DEBUG;
+ print_log('sort', 'Presort array is ' . _string_array(\@presort)) if DEBUG;
# TODO:
# Push presort on the stack!
@@ -488,7 +485,7 @@
#my ($field, $desc) = @{$self->{fields}->[$level + 1]};
#if (DEBUG) {
- # print_log('p_sort', qq!Next Rank on field "$field"!);
+ # print_log('sort', qq!Next Rank on field "$field"!);
#};
#$level++;
@@ -504,7 +501,7 @@
if (DEBUG) {
print_log(
- 'p_sort',
+ 'sort',
"Sorted array",
" on new level $level is " . _string_array($stack->[$level])
);
@@ -514,13 +511,13 @@
# There are matches on the list without identical ranks
# if (DEBUG) {
-# print_log('p_sort', "Stack with level $level is " . Dumper($stack));
+# print_log('sort', "Stack with level $level is " . Dumper($stack));
# };
# Get the top list entry
my $top = shift @{$stack->[$level]};
- print_log('p_sort', 'Push value ' . $top->[VALUE]) if DEBUG;
+ print_log('sort', 'Push value ' . $top->[VALUE]) if DEBUG;
# Push matches to result list
push @{$self->{sorted}}, $top->[VALUE]->unbundle;
@@ -540,7 +537,7 @@
my $self = shift;
if (DEBUG) {
- print_log('p_sort', 'Check for duplicates from index ' . $self->{pos});
+ print_log('sort', 'Check for duplicates from index ' . $self->{pos});
};
return $self->{list}->[$self->{pos}]->[1] || 1;
@@ -561,7 +558,7 @@
warn 'DEPRECATED';
if (DEBUG) {
- print_log('p_sort', 'Heapsort list of length ' . scalar(@$sub_list) .
+ print_log('sort', 'Heapsort list of length ' . scalar(@$sub_list) .
qq! by field "$field" for top_k = $top_k!);
};
diff --git a/lib/Krawfish/Meta/Segment/Sort/Sample.pm b/lib/Krawfish/Meta/Segment/Sort/Sample.pm
index 3b4efa9..48466aa 100644
--- a/lib/Krawfish/Meta/Segment/Sort/Sample.pm
+++ b/lib/Krawfish/Meta/Segment/Sort/Sample.pm
@@ -9,6 +9,9 @@
# Difference to random sorting is, this won't randomly sort all
# results, making paging possible.
+# When the number of matches is known in advance, another
+# approach may be valid.
+
# WARNING:
# Sorting does not respect current_match of any nested query,
# that's why sorting is always separated from enriching!
diff --git a/lib/Krawfish/Meta/Segment/SortAfter.pm b/lib/Krawfish/Meta/Segment/SortAfter.pm
index 8fd7e79..9fe18bd 100644
--- a/lib/Krawfish/Meta/Segment/SortAfter.pm
+++ b/lib/Krawfish/Meta/Segment/SortAfter.pm
@@ -1,5 +1,7 @@
package Krawfish::Meta::Segment::SortAfter;
-use parent 'Krawfish::Meta';
+use parent 'Krawfish::Meta::Segment::Bundle';
+use Data::Dumper;
+use Krawfish::Log;
use strict;
use warnings;
@@ -10,9 +12,20 @@
# (because all matches are already retrieved)
# and immediately stops, when top_k is reached.
#
-# That also means, this respects next etc.
-# and doesn't do all the work in init() phase.
+# That also means, this does all the work in next_bundle()
+# instead of init().
+
+use constant {
+ DEBUG => 1,
+ RANK => 0,
+ SAME => 1,
+ VALUE => 2,
+ MATCHES => 3
+};
+
+
+# Constructor
sub new {
my $class = shift;
my %param = @_;
@@ -36,31 +49,41 @@
$max_rank_ref
);
+ if (DEBUG) {
+ print_log('sort_after', 'Initiate follow up sort');
+ };
+
bless {
- query => $query,
- segment => $segment,
- top_k => $top_k,
- sort => $sort,
- count => 0 # number of (bundled) matches already served
+ query => $query,
+ segment => $segment,
+ top_k => $top_k,
+ sort => $sort,
+ max_rank => $segment->max_rank,
+ count => 0 # number of (bundled) matches already served
}, $class;
};
# Move to next bundle
-sub next {
+sub next_bundle {
my $self = shift;
+ if (DEBUG) {
+ print_log('sort_after', 'Move to next bundle');
+ };
+
+ $_[0]->{current_bundle} = undef;
+
# Already served enough
if ($self->{count} > $self->{top_k}) {
- $_[0]->{current_bundle} = undef;
return;
}
# There are sorted bundles on the buffer
- if (@{$self->{buffer}}) {
+ if ($self->{buffer} && @{$self->{buffer}}) {
# This is also a bundle
- $self->{current_bundle} = shift @{$self->{buffer}};
+ $self->{current_bundle} = shift(@{$self->{buffer}})->[VALUE];
# Move forward by the length of the bundle
$self->{count} += $self->{current_bundle}->size;
@@ -70,14 +93,82 @@
};
# Get a new bundle from the nested query
- if ($self->{query}->next) {
- my $next_bundle = $self->{query}->current_bundle;
+ unless ($self->{query}->next_bundle) {
+ return;
+ };
- # 1. Split the bundle
- # 2. Sort
- # 3. add sorting criterion
+ if (DEBUG) {
+ print_log('Get next bundle from ' . $self->{query}->to_string);
+ };
+
+ my $next_bundle = $self->{query}->current_bundle;
+
+ # Next bundle is already sorted
+ if ($next_bundle->size == 1) {
+ ($self->{current_bundle}) = $next_bundle->unbundle;
+ return 1;
+ };
+
+ # Sort next bundle
+
+ # This should probably check for a simpler sorting
+ # algorithm for small data sets
+ my $rank;
+ my $sort = $self->{sort};
+ my $max_rank_ref = \(my $max_rank = $self->{max_rank});
+
+ if (DEBUG) {
+ print_log('sort_after', 'Sort nested bundle');
+ };
+
+ # Create initial priority queue
+ my $queue = Krawfish::Util::PriorityQueue::PerDoc->new(
+ $self->{top_k} - $self->{count},
+ $max_rank_ref
+ );
+
+ # Unbundle bundle and go through matches
+ # TODO:
+ # This should be an iterator
+ foreach my $posting ($next_bundle->unbundle) {
+
+ if (DEBUG) {
+ print_log('sort_after', 'Get next posting from ' . $self->{query}->to_string);
+ };
+
+ # Get stored rank
+ $rank = $sort->rank_for($posting->doc_id);
+
+ # TODO:
+ # Support next_doc() and preview_doc_id()
+ #
+ # if ($rank > $$max_rank_ref) {
+ # # Document is irrelevant
+ # $match = undef;
+ #
+ # if (DEBUG) {
+ # print_log('sort', 'Move to next doc');
+ # };
+ #
+ # # Skip to next document
+ # $query->next_doc;
+ # CORE::next;
+ # };
+
+ $queue->insert([$rank, 0, $posting, $posting->matches]);
# 4. Push to buffer
};
+
+ my $array = $queue->reverse_array;
+
+ print_log('sort_after', 'Get list ranking of ' . Dumper($array)) if DEBUG;
+
+ # This is also a bundle
+ $self->{current_bundle} = shift(@{$array})->[VALUE];
+
+ print_log('sort_after', 'New current bundle is ' . $self->{current_bundle}->to_string) if DEBUG;
+
+ $self->{buffer} = $array;
};