Improved priorityqueue-per-doc
diff --git a/lib/Krawfish/Index/Store/V1/Dictionary.pm b/lib/Krawfish/Index/Store/V1/Dictionary.pm
index 80b57e2..776c1a6 100644
--- a/lib/Krawfish/Index/Store/V1/Dictionary.pm
+++ b/lib/Krawfish/Index/Store/V1/Dictionary.pm
@@ -12,7 +12,7 @@
HI_KID => 3,
TERM_ID => 4,
TERM_CHAR => '00',
- DEBUG => 1
+ DEBUG => 0
};
# This is a very compact representation of a Ternary Search Tree.
diff --git a/lib/Krawfish/Query/Base/Sorted.pm b/lib/Krawfish/Query/Base/Sorted.pm
index 00eaa3b..ae930d3 100644
--- a/lib/Krawfish/Query/Base/Sorted.pm
+++ b/lib/Krawfish/Query/Base/Sorted.pm
@@ -4,7 +4,7 @@
use strict;
use warnings;
-use constant DEBUG => 1;
+use constant DEBUG => 0;
# TODO:
# Implement as an overwriting ring buffer (FIFO) with
diff --git a/lib/Krawfish/Result/Snippet.pm b/lib/Krawfish/Result/Snippet.pm
index d627fd8..ed0d3d2 100644
--- a/lib/Krawfish/Result/Snippet.pm
+++ b/lib/Krawfish/Result/Snippet.pm
@@ -5,7 +5,7 @@
use strict;
use warnings;
-use constant DEBUG => 1;
+use constant DEBUG => 0;
# TODO:
# - ExpandToSpan
diff --git a/lib/Krawfish/Result/Snippet/Highlights.pm b/lib/Krawfish/Result/Snippet/Highlights.pm
index 9adb80b..db79bca 100644
--- a/lib/Krawfish/Result/Snippet/Highlights.pm
+++ b/lib/Krawfish/Result/Snippet/Highlights.pm
@@ -3,7 +3,7 @@
use strict;
use warnings;
-use constant DEBUG => 1;
+use constant DEBUG => 0;
# -1 is match highlight
# $annotation_nr_counter = 256;
diff --git a/lib/Krawfish/Result/Snippet/Spans.pm b/lib/Krawfish/Result/Snippet/Spans.pm
index 401b09f..112857e 100644
--- a/lib/Krawfish/Result/Snippet/Spans.pm
+++ b/lib/Krawfish/Result/Snippet/Spans.pm
@@ -3,7 +3,7 @@
use strict;
use warnings;
-use constant DEBUG => 1;
+use constant DEBUG => 0;
sub new {
my $class = shift;
diff --git a/lib/Krawfish/Result/Sort/Priority.pm b/lib/Krawfish/Result/Sort/Priority.pm
index dad8705..a3fc4e7 100644
--- a/lib/Krawfish/Result/Sort/Priority.pm
+++ b/lib/Krawfish/Result/Sort/Priority.pm
@@ -5,18 +5,17 @@
use strict;
use warnings;
-use constant DEBUG => 1;
+use constant DEBUG => 0;
sub new {
my $class = shift;
my %param = @_;
- my $query = $param{query};
- my $fields = $param{fields};
+ my $query = $param{query};
+ my $fields = $param{fields};
my $field = $param{field};
- my $desc = $param{desc} ? 1 : 0;
-
- my $top_k = $param{top_k};
+ my $desc = $param{desc} ? 1 : 0;
+ my $top_k = $param{top_k};
# TODO: my $offset = $param{offset};
my $max_rank_ref = $param{max_rank_ref};
@@ -80,7 +79,7 @@
};
# Insert into priority queue
- $queue->insert($rank, $record);
+ $queue->insert([$rank, 0, $record]);
};
# Get the rank reference
diff --git a/lib/Krawfish/Result/Sort/PriorityCascade.pm b/lib/Krawfish/Result/Sort/PriorityCascade.pm
index 6c9228c..22e6ddc 100644
--- a/lib/Krawfish/Result/Sort/PriorityCascade.pm
+++ b/lib/Krawfish/Result/Sort/PriorityCascade.pm
@@ -5,7 +5,7 @@
use strict;
use warnings;
-use constant DEBUG => 1;
+use constant DEBUG => 0;
# TODO:
# Use Krawfish::Util::PrioritySortPerDoc
diff --git a/lib/Krawfish/Util/BucketSort/Bucket.pm b/lib/Krawfish/Util/BucketSort/Bucket.pm
index 65c5615..714c811 100644
--- a/lib/Krawfish/Util/BucketSort/Bucket.pm
+++ b/lib/Krawfish/Util/BucketSort/Bucket.pm
@@ -6,7 +6,7 @@
# TODO:
# The position should be stored in BucketSort, not in the bucket
-use constant DEBUG => 1;
+use constant DEBUG => 0;
# The count is the sum of all counts before
sub new {
diff --git a/lib/Krawfish/Util/PriorityQueue/PerDoc.pm b/lib/Krawfish/Util/PriorityQueue/PerDoc.pm
index 5ddb1de..3493e9d 100644
--- a/lib/Krawfish/Util/PriorityQueue/PerDoc.pm
+++ b/lib/Krawfish/Util/PriorityQueue/PerDoc.pm
@@ -3,6 +3,8 @@
use warnings;
use Krawfish::Log;
+# TODO: Probably rename from IN_DOC to IN_COLL
+
use constant {
DEBUG => 0,
RANK => 0,
@@ -31,20 +33,17 @@
$_[0]->{match_count};
};
-sub insert {
+# TODO: May accept rank, in_doc, value instead of nodes
+# sub insert;
+
+sub incr {
my ($self, $node) = @_;
+ $self->{match_count} += $node->[IN_DOC];
+};
- # Rank is beyond useful
- if ($node->[RANK] > ${$self->{max_rank_ref}}) {
- print_log('prio', "Rank is larger than max rank") if DEBUG;
- return;
- };
-
- if ($self->enqueue($node)) {
- $self->{match_count} += $node->[IN_DOC];
- };
-
- return 1;
+sub decr {
+ my ($self, $node) = @_;
+ $self->{match_count} -= $node->[IN_DOC];
};
@@ -55,16 +54,29 @@
};
-
-1;
-
-__END__
-
+# Return tree stringification
+sub to_tree {
+ my $self = shift;
+ return
+ join('', map {
+ '[' . $_->[RANK] .
+ ($_->[SAME] ? ':' . $_->[SAME] : '') .
+ ($_->[IN_DOC] ? 'x' . $_->[IN_DOC] : '') .
+ ']'
+ } @{$self->{array}}[0..$self->{index}-1]);
+};
sub top_identicals {
my $top = $_[0]->{array}->[0];
+ if ($top->[SAME] > 1) {
+ warn 'Undefined yet!';
+ }
+ else {
+ return $top->[IN_DOC]
+ };
+
# TODO:
# - Go through all same nodes and get
# the sum for all per_doc values.
@@ -78,7 +90,11 @@
};
-sub to_tree;
+1;
+
+
+__END__
+
sub mark_top_duplicates {
...
diff --git a/lib/Krawfish/Util/PrioritySort.pm b/lib/Krawfish/Util/PrioritySort.pm
index b5f91cc..dcb1648 100644
--- a/lib/Krawfish/Util/PrioritySort.pm
+++ b/lib/Krawfish/Util/PrioritySort.pm
@@ -30,7 +30,10 @@
# Check
# https://github.com/apache/lucy/blob/62cdcf930dc871fb95b5c99fc86e93afe7a3e344/core/Lucy/Search/HitQueue.c
# https://github.com/apache/lucy/blob/master/core/Lucy/Util/PriorityQueue.c
-
+#
+# Identicals can mean: Have the same rank. It may also mean: Are part of a collection
+# with the same rank. For example, multiple matches in a document.
+#
use constant {
DEBUG => 0,
RANK => 0,
@@ -66,7 +69,7 @@
return;
};
- return $_[0]->enqueue($_[1]);
+ return $self->enqueue($node);
};
@@ -125,7 +128,9 @@
};
};
- print_log('prio', 'Tree is ' . $self->to_tree) if DEBUG;
+ if (DEBUG) {
+ print_log('prio', 'Tree is ' . $self->to_tree);
+ };
# The rank is identical to the top rank and it's a same
if ($is_same && $node->[RANK] == ${$self->{max_rank_ref}}) {
@@ -148,6 +153,9 @@
}
};
+ # Increment tree node size
+ $self->incr($node);
+
# Remove top nodes
if ($self->length >= $self->{top_k}) {
@@ -178,7 +186,7 @@
${$self->{max_rank_ref}} = $array->[0]->[RANK];
if (DEBUG) {
- print_log('prio', 'Tree is ' . $self->to_tree);
+ print_log('prio', 'Tree with length ' . $self->length . ' is ' . $self->to_tree);
print_log('prio', "New maximum rank is " .$array->[0]->[RANK]);
};
};
@@ -343,6 +351,8 @@
} @{$self->{array}}[0..$self->{index}-1]);
};
+sub incr {};
+sub decr {};
# Remove a single top entry
sub _remove_single_top {
@@ -351,6 +361,9 @@
# Place the last element in the first position and swap
my $array = $self->{array};
+ # Decrement values
+ $self->decr($array->[0]);
+
$array->[0] = $array->[--$self->{index}];
print_log('prio', 'Move last entry ' . $array->[0]->[RANK] . ' to top node') if DEBUG;