Added some ideas for sorting
diff --git a/lib/Krawfish/Index/Dictionary.pm b/lib/Krawfish/Index/Dictionary.pm
index 9389871..e843b8b 100644
--- a/lib/Krawfish/Index/Dictionary.pm
+++ b/lib/Krawfish/Index/Dictionary.pm
@@ -59,10 +59,12 @@
file => $file,
hash => {}, # Contain the dictionary
+ # This will probably be one array in the future
prefix_rank => [],
suffix_rank => [],
+ ranked => 0,
- # Temporary helper arrays for (sub)term_id -> (sub)term mapping
+ # TEMP helper arrays for (sub)term_id -> (sub)term mapping
term_array => [],
subterm_array => [],
@@ -125,6 +127,8 @@
# For hapax legomena, a special null marker will be returned
$subterm_id = $self->{last_subterm_id}++;
+ $self->{ranked} = 0;
+
# TODO: Based on the subterms, the rankings will be processed
# Store subterm for term_id mapping
@@ -133,6 +137,7 @@
# Add subterm to set
$hash->{'~' . $subterm} = $subterm_id;
};
+
return $subterm_id;
};
@@ -162,8 +167,17 @@
};
# Returns a rank value by a certain subterm_id
-sub rank_by_subterm_id;
-sub rev_rank_by_subterm_id;
+sub prefix_rank_by_subterm_id {
+ my ($self, $subterm_id) = @_;
+ $self->process_subterm_ranks;
+ $self->{prefix_rank}->[$subterm_id];
+};
+
+sub suffix_rank_by_subterm_id {
+ my ($self, $subterm_id) = @_;
+ $self->process_subterm_ranks;
+ $self->{suffix_rank}->[$subterm_id];
+};
# if a term has no term_id it also has no rank,
# so this will return the rank of the preceeding term in the dictionary.
@@ -197,6 +211,9 @@
};
sub process_subterm_ranks {
+ return if $_[0]->{ranked};
+ my $self = shift;
+ # TODO:
# For prefix rank:
# Iterate over prefix tree in alphabetical order
# and generate a rank for the subtree of the subterm.
@@ -206,7 +223,25 @@
# and sort by suffix using
# bucketsort or mergesort to
# create a sorted rank
- ...
+
+ # Iterate over all subterms alphabetically
+ my $i = 0;
+ my @subt = grep { index($_, '~') == 0 } keys %{$self->{hash}};
+ foreach my $subterm (sort @subt) {
+
+ # Set subterm_id to prefix rank
+ $self->{prefix_rank}->[$i++] = $self->{hash}->{$subterm};
+ };
+
+ # Iterate over all subterms based on their suffixes
+ $i = 0;
+ foreach my $subterm (sort { reverse($a) cmp reverse($b) } @subt) {
+
+ # Set subterm_id to prefix rank
+ $self->{suffix_rank}->[$i++] = $self->{hash}->{$subterm};
+ };
+
+ $_[0]->{ranked} = 1;
};
diff --git a/lib/Krawfish/Index/Subtokens.pm b/lib/Krawfish/Index/Subtokens.pm
index 7be508a..01e6f05 100644
--- a/lib/Krawfish/Index/Subtokens.pm
+++ b/lib/Krawfish/Index/Subtokens.pm
@@ -98,26 +98,26 @@
my $self = shift;
# Get data to store per segment
- my ($doc_id, $segment, $start_char, $end_char, $term_id, $term) = @_;
+ my ($doc_id, $subtoken, $start_char, $end_char, $subterm_id, $subterm) = @_;
# TODO: THIS IS PROBABLY NOT NECESSARY!
- if ($term) {
+ if ($subterm) {
# Get the first and last characters of the term
- my ($first, $last) = (substr($term, 0, 2), scalar reverse substr($term, -2));
+ my ($first, $last) = (substr($subterm, 0, 2), scalar reverse substr($subterm, -2));
# Store all segments
- $self->{$doc_id . '#' . $segment} = [$start_char, $end_char, $term_id, $first, $last];
+ $self->{$doc_id . '#' . $subtoken} = [$start_char, $end_char, $subterm_id, $first, $last];
if (DEBUG) {
- print_log('segments', "Store segment at [$doc_id,$segment]");
- print_log('segments', ' with ' . join(','),@{$self->{$doc_id . '#' . $segment}});
+ print_log('segments', "Store subtoken at [$doc_id,$subtoken]");
+ print_log('segments', ' with ' . join(','),@{$self->{$doc_id . '#' . $subtoken}});
};
}
# Temporary
else {
# Store all segments
- $self->{$doc_id . '#' . $segment} = [$start_char, $end_char];
+ $self->{$doc_id . '#' . $subtoken} = [$start_char, $end_char];
}
return $self;
@@ -136,7 +136,9 @@
sub append {
my $self = shift;
my ($token, $doc_id, $pos, $end) = @_;
- print_log('toklist', "Appended $token with $doc_id, $pos" . ($end ? "-$end" : '')) if DEBUG;
+ if (DEBUG) {
+ print_log('toklist', "Appended $token with $doc_id, $pos" . ($end ? "-$end" : ''));
+ };
push(@{$self->{array}}, [$doc_id, $pos, $end]);
};
diff --git a/lib/Krawfish/Koral/Meta/Sort.pm b/lib/Krawfish/Koral/Meta/Sort.pm
new file mode 100644
index 0000000..958d95e
--- /dev/null
+++ b/lib/Krawfish/Koral/Meta/Sort.pm
@@ -0,0 +1,60 @@
+package Krawfish::Koral::Meta::Sort;
+use strict;
+use warnings;
+use Krawfish::Result::Sort::FirstPass;
+use Krawfish::Result::Sort;
+use Krawfish::Log;
+
+use constant DEBUG => 0;
+
+# fields => [[asc => 'author', desc => 'title']]
+sub new {
+ my $class = shift;
+ my $query = shift;
+ my @fields = @_;
+ bless {
+ query => $query,
+ fields => \@fields
+ }, $class;
+};
+
+sub plan_for {
+ my ($self, $index) = @_;
+
+ my $field = shift @{$self->{fields}};
+
+ # Initially sort using bucket sort
+ $query = Krawfish::Result::FilterSort->new(
+ $self->{query},
+ ($field->[0] eq 'desc' ? 1 : 0),
+ $field->[1]
+ );
+
+ # Iterate over all fields
+ foreach $field (@{$self->{fields}}) {
+ $query = Krawfish::Result::RankSort->new(
+ $query,
+ ($field->[0] eq 'desc' ? 1 : 0),
+ $field->[1]
+ );
+ };
+
+ # Final sorting based on UID
+ return Krawfish::Result::Sort->new($query, 0, 'uid');
+};
+
+
+sub type { 'sort' };
+
+
+sub to_koral_fragment {
+ ...
+};
+
+
+sub to_string {
+ ...
+};
+
+
+1;
diff --git a/lib/Krawfish/Query/Util/BucketSort.pm b/lib/Krawfish/Query/Util/BucketSort.pm
new file mode 100644
index 0000000..5a1207e
--- /dev/null
+++ b/lib/Krawfish/Query/Util/BucketSort.pm
@@ -0,0 +1,128 @@
+package Krawfish::Query::Util::BucketSort;
+use Krawfish::Query::Util::BucketSort::Bucket;
+use Krawfish::Log;
+use strict;
+use warnings;
+use bytes;
+
+# All K::Q::Util packages may move to K::Util ...
+
+# This implements the datastructure for all bucket sorts algorithms.
+# It will initially
+
+# TODO:
+# Use a variant of insertion sort or (better) tree sort
+# http://jeffreystedfast.blogspot.de/2007/02/binary-insertion-sort.html
+# The most natural way to sort would probably be a variant of bucket sort,
+# but this would perform poorly in case the field to sort has only
+# a single rank, which would be the case, if the corpus query would require
+# this.
+# The best variant may be bucket sort with insertion sort. In case top_n was chosen,
+# this may early make some buckets neglectable.
+# Like this:
+# 1. Create 256 buckets. These buckets have
+# 1.1. A pointer to their contents and
+# 1.2. A counter, how many elements are in there.
+# If the pointer is zero, no elements should
+# be put into the bin (i.e. forget them).
+# 1.3. A bit vector marking equal elements
+# May not be good - better add concrete pointers, because
+# order will change regularly. Maybe only a single
+# marker for duplicates
+# 2. Get a new item from the stream and add it to the bucket in question,
+# in case this bucket does not point to 0.
+# 2.1. Do an insertion sort in the bucket.
+# 2.2. If the elements are equal, put the element behind the equal element
+# and set the duplicate flag.
+# 3. Increment the counter of the bucket.
+# 4. If the number of sorted elements is (n % top_n) == 0
+# iterate through all buckets from the top and calculate the sum
+# of the contents.
+# 4.1. If the sum exceeds top_n, clean up the following bucket pointers
+# and let them point to 0.
+# 4.2. Stop if a pointer is 0.
+# 5. Go to 2 until all elements are consumed.
+# 6. From the top bucket, check all duplicates, and sort them after the next field
+# 7. Go to 6. unless all fields are sorted.
+# 8. Check for remaining duplicates in the buckets and sort them by the uid field.
+# 9. Return top_n in bucket order.
+#
+# It would be great, if the ordered values in the buckets could be used
+# for traversing directly - without copying the structure any further.
+# As the top-buckets will always receive items, they will probably need more space than the others
+# (although, this is not true all the time)
+#
+# To calculate the sum of the preceeding array, vector intrinsics may be useful:
+# http://stackoverflow.com/questions/11872952/simd-the-following-code
+#
+
+# TODO:
+# The implementation may well be a double-linked list where
+# bucket borders are additional links to the list:
+#
+# [buc1=count|valid][buc2=count|valid]...
+# | |----------------|
+# | |
+# v v
+# [rec1|dupl]=[rec2|dupl]=[rec3|dupl]=[rec4|dupl]=...
+#
+# With this, it's possible to first use bucket search
+# for insertion into the list and then perform insertion
+# sort or any other offline sort algorithm per valid bucket.
+# However - the length of the list needs to be transported
+# somehow ... maybe by a separated link with the bucket pointers.
+
+sub new {
+ my $class = shift;
+ my $top_k = shift;
+
+ my @buckets = ();
+ foreach (0 .. 255) {
+ push @buckets, Krawfish::Query::Util::Buckets::Bucket->new;
+ };
+
+ bless {
+ top_k => $top_k,
+ buckets => \@buckets,
+ max_bucket => 256
+ }, $class;
+};
+
+
+sub add {
+ my ($self, $rank, $record) = @_;
+
+ # This returns the first 8 bits from the rank
+ my $bucket_nr = bytes::substr($rank, 0, 1);
+
+ # Check, if bucket is valid
+ return unless $bucket_nr > $self->{max_bucket};
+
+ # Buckets have the structure:
+ # [valid|pointer|counter]
+ my $bucket = $self->{buckets}->[$bucket_nr];
+
+ # Insert (sorted)
+ $bucket->insert($rank, $record);
+
+ # Bucket is in max
+ return 1 if $bucket_nr == $self->{max_bucket};
+
+ # Increment bucket counter
+ while ($bucket = $self->{buckets}->[$bucket_nr++]) {
+
+ # Increnment bucket and invalidate if exceeding
+ if ($bucket->incr > $self->{top_k}) {
+ $self->{max_bucket} = $bucket_nr - 1;
+ last;
+ };
+ };
+
+ return 1;
+};
+
+
+# Returns the top k results in sorted bucket order
+sub next;
+
+1;
diff --git a/lib/Krawfish/Query/Util/BucketSort/Bucket.pm b/lib/Krawfish/Query/Util/BucketSort/Bucket.pm
new file mode 100644
index 0000000..fc3b286
--- /dev/null
+++ b/lib/Krawfish/Query/Util/BucketSort/Bucket.pm
@@ -0,0 +1,36 @@
+package Krawfish::Query::Util::BucketSort::Bucket;
+use Krawfish::Query::Util::BucketSort::Record;
+use Krawfish::Log;
+use strict;
+use warnings;
+
+# The count is the sum of all counts before
+sub new {
+ bless [
+ 0, # count
+ [] # entries
+ ], shift;
+};
+
+
+# Increment counter
+sub incr {
+ return $_[0]->[1]++;
+};
+
+
+# Entries have the structure:
+# [rank
+sub insert {
+ my ($self, $rank, $value) = @_;
+
+ # Increment counter
+ $self->incr;
+
+ # Insert into list
+ # Probably unsorted at first
+ $self->{list}->insert($rank, $value);
+};
+
+
+1;
diff --git a/lib/Krawfish/Query/Util/BucketSort/List.pm b/lib/Krawfish/Query/Util/BucketSort/List.pm
new file mode 100644
index 0000000..e7e3957
--- /dev/null
+++ b/lib/Krawfish/Query/Util/BucketSort/List.pm
@@ -0,0 +1,19 @@
+package Krawfish::Query::BucketSort::List;
+use strict;
+use warnings;
+
+# Implement as a linked list - initially unsorted
+# (because it may not be necessary to sort)!
+# Entries have the structure [rank|value]
+
+sub new {
+ my $class = shift;
+ bless [], $class;
+};
+
+sub insert {
+ my ($self, $rank, $value) = @_;
+ push @$self, [$rank, $value];
+};
+
+1;
diff --git a/lib/Krawfish/Result/Group/Classes.pm b/lib/Krawfish/Result/Group/Classes.pm
index f090144..d555715 100644
--- a/lib/Krawfish/Result/Group/Classes.pm
+++ b/lib/Krawfish/Result/Group/Classes.pm
@@ -14,6 +14,7 @@
END_POS => 2,
};
+
sub new {
my $class = shift;
bless {
diff --git a/lib/Krawfish/Result/Sort.pm b/lib/Krawfish/Result/Sort.pm
index 40b8410..f9e59b9 100644
--- a/lib/Krawfish/Result/Sort.pm
+++ b/lib/Krawfish/Result/Sort.pm
@@ -8,6 +8,8 @@
# Sorting can be optimized by an appended filter, in case there is no need
# for counting all matches and documents.
+# See Krawfish::Query::Util::Buckets
+
# TODO:
# Sort is currently limited to sorting for searching.
# It should also be able to sort groups or sort texts/documents/corpora
@@ -18,51 +20,6 @@
# This could in fact be done manipulating the external virtual corpus filter!
#
# TODO:
-# Use a variant of insertion sort or (better) tree sort
-# http://jeffreystedfast.blogspot.de/2007/02/binary-insertion-sort.html
-# The most natural way to sort would probably be a variant of bucket sort,
-# but this would perform poorly in case the field to sort has only
-# a single rank, which would be the case, if the corpus query would require
-# this.
-# The best variant may be bucket sort with insertion sort. In case top_n was chosen,
-# this may early make some buckets neglectable.
-# Like this:
-# 1. Create 256 buckets. These buckets have
-# 1.1. A pointer to their contents and
-# 1.2. A counter, how many elements are in there.
-# If the pointer is zero, no elements should
-# be put into the bin (i.e. forget them).
-# 1.3. A bit vector marking equal elements
-# May not be good - better add concrete pointers, because
-# order will change regularly. Maybe only a single
-# marker for duplicates
-# 2. Get a new item from the stream and add it to the bucket in question,
-# in case this bucket does not point to 0.
-# 2.1. Do an insertion sort in the bucket.
-# 2.2. If the elements are equal, put the element behind the equal element
-# and set the duplicate flag.
-# 3. Increment the counter of the bucket.
-# 4. If the number of sorted elements is (n % top_n) == 0
-# iterate through all buckets from the top and calculate the sum
-# of the contents.
-# 4.1. If the sum exceeds top_n, clean up the following bucket pointers
-# and let them point to 0.
-# 4.2. Stop if a pointer is 0.
-# 5. Go to 2 until all elements are consumed.
-# 6. From the top bucket, check all duplicates, and sort them after the next field
-# 7. Go to 6. unless all fields are sorted.
-# 8. Check for remaining duplicates in the buckets and sort them by the uid field.
-# 9. Return top_n in bucket order.
-#
-# It would be great, if the ordered values in the buckets could be used
-# for traversing directly - without copying the structure any further.
-# As the top-buckets will always receive items, they will probably need more space than the others
-# (although, this is not true all the time)
-#
-# To calculate the sum of the preceeding array, vector intrinsics may be useful:
-# http://stackoverflow.com/questions/11872952/simd-the-following-code
-#
-# TODO:
# Whenever top X is requested, and there are matches > X with a max_rank Y,
# ignore all matches > Y. This also means, these matches do not need to be
# fine-sorted using secondary fields.
diff --git a/lib/Krawfish/Result/Sort/Alphabet.pm b/lib/Krawfish/Result/Sort/Alphabet.pm
index cc336f2..c80dc7f 100644
--- a/lib/Krawfish/Result/Sort/Alphabet.pm
+++ b/lib/Krawfish/Result/Sort/Alphabet.pm
@@ -1,5 +1,39 @@
+package Krawfish::Result::Sort;
+use Krawfish::Log;
+use strict;
+use warnings;
+
+use constant DEBUG => 0;
+
# Sort by characters of a certain segment (either the first or the last).
# This will require to open the offset file to get the first two characters
# for bucket sorting per token and then request the
# forward index (the offset is already liftet and may be stored in the buckets
# as well) for fine grained sorting!
+
+# TODO:
+# This will need to pass sorting criteria as strings for cluster sorting.
+# At least for the top k matches.
+
+sub new {
+ my $class = shift;
+ my $self = bless {
+ query => shift,
+ index => shift,
+ # top_k => shift
+ }, $class;
+
+ my $dict = $self->dict;
+ my $subt = $self->subtokens;
+
+ while ($query->next) {
+ my $element = $query->current->clone;
+
+ # Get the subtoken info
+ my $x = $subt->get($element->doc_id, $element->start);
+
+ # Add element with id for sorting
+ push @record, [$element, $x->[2]]
+ };
+
+};