Added some ideas for sorting

commit: ba0d3c8afc5b94492a4109e828f6e0a6d8638d5f [log] [tgz]
author: Akron <nils@diewald-online.de> Mon Feb 20 12:45:07 2017 +0100
committer: Akron <nils@diewald-online.de> Mon Feb 20 12:45:07 2017 +0100
tree: d3cf2a926297db29b52d72f10bf051e45418e0c9
parent: 1cb0b2112920c415f029dfb4d4edd18386a91da1 [diff]
diff --git a/lib/Krawfish/Index/Dictionary.pm b/lib/Krawfish/Index/Dictionary.pm
index 9389871..e843b8b 100644
--- a/lib/Krawfish/Index/Dictionary.pm
+++ b/lib/Krawfish/Index/Dictionary.pm

@@ -59,10 +59,12 @@
     file => $file,
     hash => {},   # Contain the dictionary
 
+    # This will probably be one array in the future
     prefix_rank => [],
     suffix_rank => [],
+    ranked => 0,
 
-    # Temporary helper arrays for (sub)term_id -> (sub)term mapping
+    # TEMP helper arrays for (sub)term_id -> (sub)term mapping
     term_array => [],
     subterm_array => [],
 
@@ -125,6 +127,8 @@
     #   For hapax legomena, a special null marker will be returned
     $subterm_id = $self->{last_subterm_id}++;
 
+    $self->{ranked} = 0;
+
     # TODO: Based on the subterms, the rankings will be processed
 
     # Store subterm for term_id mapping
@@ -133,6 +137,7 @@
     # Add subterm to set
     $hash->{'~' . $subterm} = $subterm_id;
   };
+
   return $subterm_id;
 };
 
@@ -162,8 +167,17 @@
 };
 
 # Returns a rank value by a certain subterm_id
-sub rank_by_subterm_id;
-sub rev_rank_by_subterm_id;
+sub prefix_rank_by_subterm_id {
+  my ($self, $subterm_id) = @_;
+  $self->process_subterm_ranks;
+  $self->{prefix_rank}->[$subterm_id];
+};
+
+sub suffix_rank_by_subterm_id {
+  my ($self, $subterm_id) = @_;
+  $self->process_subterm_ranks;
+  $self->{suffix_rank}->[$subterm_id];
+};
 
 # if a term has no term_id it also has no rank,
 # so this will return the rank of the preceeding term in the dictionary.
@@ -197,6 +211,9 @@
 };
 
 sub process_subterm_ranks {
+  return if $_[0]->{ranked};
+  my $self = shift;
+  # TODO:
   # For prefix rank:
   # Iterate over prefix tree in alphabetical order
   # and generate a rank for the subtree of the subterm.
@@ -206,7 +223,25 @@
   # and sort by suffix using
   # bucketsort or mergesort to
   # create a sorted rank
-  ...
+
+  # Iterate over all subterms alphabetically
+  my $i = 0;
+  my @subt = grep { index($_, '~') == 0 } keys %{$self->{hash}};
+  foreach my $subterm (sort @subt) {
+
+    # Set subterm_id to prefix rank
+    $self->{prefix_rank}->[$i++] = $self->{hash}->{$subterm};
+  };
+
+  # Iterate over all subterms based on their suffixes
+  $i = 0;
+  foreach my $subterm (sort { reverse($a) cmp reverse($b) } @subt) {
+
+    # Set subterm_id to prefix rank
+    $self->{suffix_rank}->[$i++] = $self->{hash}->{$subterm};
+  };
+
+  $_[0]->{ranked} = 1;
 };
 
 

diff --git a/lib/Krawfish/Index/Subtokens.pm b/lib/Krawfish/Index/Subtokens.pm
index 7be508a..01e6f05 100644
--- a/lib/Krawfish/Index/Subtokens.pm
+++ b/lib/Krawfish/Index/Subtokens.pm

@@ -98,26 +98,26 @@
   my $self = shift;
 
   # Get data to store per segment
-  my ($doc_id, $segment, $start_char, $end_char, $term_id, $term) = @_;
+  my ($doc_id, $subtoken, $start_char, $end_char, $subterm_id, $subterm) = @_;
 
   # TODO: THIS IS PROBABLY NOT NECESSARY!
-  if ($term) {
+  if ($subterm) {
     # Get the first and last characters of the term
-    my ($first, $last) = (substr($term, 0, 2), scalar reverse substr($term, -2));
+    my ($first, $last) = (substr($subterm, 0, 2), scalar reverse substr($subterm, -2));
 
     # Store all segments
-    $self->{$doc_id . '#' . $segment} = [$start_char, $end_char, $term_id, $first, $last];
+    $self->{$doc_id . '#' . $subtoken} = [$start_char, $end_char, $subterm_id, $first, $last];
 
     if (DEBUG) {
-      print_log('segments', "Store segment at [$doc_id,$segment]");
-      print_log('segments', '  with ' . join(','),@{$self->{$doc_id . '#' . $segment}});
+      print_log('segments', "Store subtoken at [$doc_id,$subtoken]");
+      print_log('segments', '  with ' . join(','),@{$self->{$doc_id . '#' . $subtoken}});
     };
   }
 
   # Temporary
   else {
     # Store all segments
-    $self->{$doc_id . '#' . $segment} = [$start_char, $end_char];
+    $self->{$doc_id . '#' . $subtoken} = [$start_char, $end_char];
   }
 
   return $self;
@@ -136,7 +136,9 @@
 sub append {
   my $self = shift;
   my ($token, $doc_id, $pos, $end) = @_;
-  print_log('toklist', "Appended $token with $doc_id, $pos" . ($end ? "-$end" : '')) if DEBUG;
+  if (DEBUG) {
+    print_log('toklist', "Appended $token with $doc_id, $pos" . ($end ? "-$end" : ''));
+  };
   push(@{$self->{array}}, [$doc_id, $pos, $end]);
 };
 

diff --git a/lib/Krawfish/Koral/Meta/Sort.pm b/lib/Krawfish/Koral/Meta/Sort.pm
new file mode 100644
index 0000000..958d95e
--- /dev/null
+++ b/lib/Krawfish/Koral/Meta/Sort.pm

@@ -0,0 +1,60 @@
+package Krawfish::Koral::Meta::Sort;
+use strict;
+use warnings;
+use Krawfish::Result::Sort::FirstPass;
+use Krawfish::Result::Sort;
+use Krawfish::Log;
+
+use constant DEBUG => 0;
+
+# fields => [[asc => 'author', desc => 'title']]
+sub new {
+  my $class = shift;
+  my $query = shift;
+  my @fields = @_;
+  bless {
+    query => $query,
+    fields => \@fields
+  }, $class;
+};
+
+sub plan_for {
+  my ($self, $index) = @_;
+
+  my $field = shift @{$self->{fields}};
+
+  # Initially sort using bucket sort
+  $query = Krawfish::Result::FilterSort->new(
+    $self->{query},
+    ($field->[0] eq 'desc' ? 1 : 0),
+    $field->[1]
+  );
+
+  # Iterate over all fields
+  foreach $field (@{$self->{fields}}) {
+    $query = Krawfish::Result::RankSort->new(
+      $query,
+      ($field->[0] eq 'desc' ? 1 : 0),
+      $field->[1]
+    );
+  };
+
+  # Final sorting based on UID
+  return Krawfish::Result::Sort->new($query, 0, 'uid');
+};
+
+
+sub type { 'sort' };
+
+
+sub to_koral_fragment {
+  ...
+};
+
+
+sub to_string {
+  ...
+};
+
+
+1;

diff --git a/lib/Krawfish/Query/Util/BucketSort.pm b/lib/Krawfish/Query/Util/BucketSort.pm
new file mode 100644
index 0000000..5a1207e
--- /dev/null
+++ b/lib/Krawfish/Query/Util/BucketSort.pm

@@ -0,0 +1,128 @@
+package Krawfish::Query::Util::BucketSort;
+use Krawfish::Query::Util::BucketSort::Bucket;
+use Krawfish::Log;
+use strict;
+use warnings;
+use bytes;
+
+# All K::Q::Util packages may move to K::Util ...
+
+# This implements the datastructure for all bucket sorts algorithms.
+# It will initially
+
+# TODO:
+#   Use a variant of insertion sort or (better) tree sort
+#   http://jeffreystedfast.blogspot.de/2007/02/binary-insertion-sort.html
+#   The most natural way to sort would probably be a variant of bucket sort,
+#   but this would perform poorly in case the field to sort has only
+#   a single rank, which would be the case, if the corpus query would require
+#   this.
+#   The best variant may be bucket sort with insertion sort. In case top_n was chosen,
+#   this may early make some buckets neglectable.
+#   Like this:
+#   1. Create 256 buckets. These buckets have
+#      1.1. A pointer to their contents and
+#      1.2. A counter, how many elements are in there.
+#           If the pointer is zero, no elements should
+#           be put into the bin (i.e. forget them).
+#      1.3. A bit vector marking equal elements
+#           May not be good - better add concrete pointers, because
+#           order will change regularly. Maybe only a single
+#           marker for duplicates
+#   2. Get a new item from the stream and add it to the bucket in question,
+#      in case this bucket does not point to 0.
+#      2.1. Do an insertion sort in the bucket.
+#      2.2. If the elements are equal, put the element behind the equal element
+#           and set the duplicate flag.
+#   3. Increment the counter of the bucket.
+#   4. If the number of sorted elements is (n % top_n) == 0
+#      iterate through all buckets from the top and calculate the sum
+#      of the contents.
+#      4.1. If the sum exceeds top_n, clean up the following bucket pointers
+#           and let them point to 0.
+#      4.2. Stop if a pointer is 0.
+#   5. Go to 2 until all elements are consumed.
+#   6. From the top bucket, check all duplicates, and sort them after the next field
+#   7. Go to 6. unless all fields are sorted.
+#   8. Check for remaining duplicates in the buckets and sort them by the uid field.
+#   9. Return top_n in bucket order.
+#
+#   It would be great, if the ordered values in the buckets could be used
+#   for traversing directly - without copying the structure any further.
+#   As the top-buckets will always receive items, they will probably need more space than the others
+#   (although, this is not true all the time)
+#
+#   To calculate the sum of the preceeding array, vector intrinsics may be useful:
+#   http://stackoverflow.com/questions/11872952/simd-the-following-code
+#
+
+# TODO:
+#  The implementation may well be a double-linked list where
+#  bucket borders are additional links to the list:
+#
+#  [buc1=count|valid][buc2=count|valid]...
+#      |                 |----------------|
+#      |                                  |
+#      v                                  v
+#  [rec1|dupl]=[rec2|dupl]=[rec3|dupl]=[rec4|dupl]=...
+#
+# With this, it's possible to first use bucket search
+# for insertion into the list and then perform insertion
+# sort or any other offline sort algorithm per valid bucket.
+# However - the length of the list needs to be transported
+# somehow ... maybe by a separated link with the bucket pointers.
+
+sub new {
+  my $class = shift;
+  my $top_k = shift;
+
+  my @buckets = ();
+  foreach (0 .. 255) {
+    push @buckets, Krawfish::Query::Util::Buckets::Bucket->new;
+  };
+
+  bless {
+    top_k => $top_k,
+    buckets => \@buckets,
+    max_bucket => 256
+  }, $class;
+};
+
+
+sub add {
+  my ($self, $rank, $record) = @_;
+
+  # This returns the first 8 bits from the rank
+  my $bucket_nr = bytes::substr($rank, 0, 1);
+
+  # Check, if bucket is valid
+  return unless $bucket_nr > $self->{max_bucket};
+
+  # Buckets have the structure:
+  # [valid|pointer|counter]
+  my $bucket = $self->{buckets}->[$bucket_nr];
+
+  # Insert (sorted)
+  $bucket->insert($rank, $record);
+
+  # Bucket is in max
+  return 1 if $bucket_nr == $self->{max_bucket};
+
+  # Increment bucket counter
+  while ($bucket = $self->{buckets}->[$bucket_nr++]) {
+
+    # Increnment bucket and invalidate if exceeding
+    if ($bucket->incr > $self->{top_k}) {
+      $self->{max_bucket} = $bucket_nr - 1;
+      last;
+    };
+  };
+
+  return 1;
+};
+
+
+# Returns the top k results in sorted bucket order
+sub next;
+
+1;

diff --git a/lib/Krawfish/Query/Util/BucketSort/Bucket.pm b/lib/Krawfish/Query/Util/BucketSort/Bucket.pm
new file mode 100644
index 0000000..fc3b286
--- /dev/null
+++ b/lib/Krawfish/Query/Util/BucketSort/Bucket.pm

@@ -0,0 +1,36 @@
+package Krawfish::Query::Util::BucketSort::Bucket;
+use Krawfish::Query::Util::BucketSort::Record;
+use Krawfish::Log;
+use strict;
+use warnings;
+
+# The count is the sum of all counts before
+sub new {
+  bless [
+    0, # count
+    [] # entries
+  ], shift;
+};
+
+
+# Increment counter
+sub incr {
+  return $_[0]->[1]++;
+};
+
+
+# Entries have the structure:
+# [rank
+sub insert {
+  my ($self, $rank, $value) = @_;
+
+  # Increment counter
+  $self->incr;
+
+  # Insert into list
+  # Probably unsorted at first
+  $self->{list}->insert($rank, $value);
+};
+
+
+1;

diff --git a/lib/Krawfish/Query/Util/BucketSort/List.pm b/lib/Krawfish/Query/Util/BucketSort/List.pm
new file mode 100644
index 0000000..e7e3957
--- /dev/null
+++ b/lib/Krawfish/Query/Util/BucketSort/List.pm

@@ -0,0 +1,19 @@
+package Krawfish::Query::BucketSort::List;
+use strict;
+use warnings;
+
+# Implement as a linked list - initially unsorted
+# (because it may not be necessary to sort)!
+# Entries have the structure [rank|value]
+
+sub new {
+  my $class = shift;
+  bless [], $class;
+};
+
+sub insert {
+  my ($self, $rank, $value) = @_;
+  push @$self, [$rank, $value];
+};
+
+1;

diff --git a/lib/Krawfish/Result/Group/Classes.pm b/lib/Krawfish/Result/Group/Classes.pm
index f090144..d555715 100644
--- a/lib/Krawfish/Result/Group/Classes.pm
+++ b/lib/Krawfish/Result/Group/Classes.pm

@@ -14,6 +14,7 @@
     END_POS => 2,
 };
 
+
 sub new {
   my $class = shift;
   bless {

diff --git a/lib/Krawfish/Result/Sort.pm b/lib/Krawfish/Result/Sort.pm
index 40b8410..f9e59b9 100644
--- a/lib/Krawfish/Result/Sort.pm
+++ b/lib/Krawfish/Result/Sort.pm

@@ -8,6 +8,8 @@
 # Sorting can be optimized by an appended filter, in case there is no need
 # for counting all matches and documents.
 
+# See Krawfish::Query::Util::Buckets
+
 # TODO:
 #  Sort is currently limited to sorting for searching.
 #  It should also be able to sort groups or sort texts/documents/corpora
@@ -18,51 +20,6 @@
 #   This could in fact be done manipulating the external virtual corpus filter!
 #
 # TODO:
-#   Use a variant of insertion sort or (better) tree sort
-#   http://jeffreystedfast.blogspot.de/2007/02/binary-insertion-sort.html
-#   The most natural way to sort would probably be a variant of bucket sort,
-#   but this would perform poorly in case the field to sort has only
-#   a single rank, which would be the case, if the corpus query would require
-#   this.
-#   The best variant may be bucket sort with insertion sort. In case top_n was chosen,
-#   this may early make some buckets neglectable.
-#   Like this:
-#   1. Create 256 buckets. These buckets have
-#      1.1. A pointer to their contents and
-#      1.2. A counter, how many elements are in there.
-#           If the pointer is zero, no elements should
-#           be put into the bin (i.e. forget them).
-#      1.3. A bit vector marking equal elements
-#           May not be good - better add concrete pointers, because
-#           order will change regularly. Maybe only a single
-#           marker for duplicates
-#   2. Get a new item from the stream and add it to the bucket in question,
-#      in case this bucket does not point to 0.
-#      2.1. Do an insertion sort in the bucket.
-#      2.2. If the elements are equal, put the element behind the equal element
-#           and set the duplicate flag.
-#   3. Increment the counter of the bucket.
-#   4. If the number of sorted elements is (n % top_n) == 0
-#      iterate through all buckets from the top and calculate the sum
-#      of the contents.
-#      4.1. If the sum exceeds top_n, clean up the following bucket pointers
-#           and let them point to 0.
-#      4.2. Stop if a pointer is 0.
-#   5. Go to 2 until all elements are consumed.
-#   6. From the top bucket, check all duplicates, and sort them after the next field
-#   7. Go to 6. unless all fields are sorted.
-#   8. Check for remaining duplicates in the buckets and sort them by the uid field.
-#   9. Return top_n in bucket order.
-#
-#   It would be great, if the ordered values in the buckets could be used
-#   for traversing directly - without copying the structure any further.
-#   As the top-buckets will always receive items, they will probably need more space than the others
-#   (although, this is not true all the time)
-#
-#   To calculate the sum of the preceeding array, vector intrinsics may be useful:
-#   http://stackoverflow.com/questions/11872952/simd-the-following-code
-#
-# TODO:
 #   Whenever top X is requested, and there are matches > X with a max_rank Y,
 #   ignore all matches > Y. This also means, these matches do not need to be
 #   fine-sorted using secondary fields.

diff --git a/lib/Krawfish/Result/Sort/Alphabet.pm b/lib/Krawfish/Result/Sort/Alphabet.pm
index cc336f2..c80dc7f 100644
--- a/lib/Krawfish/Result/Sort/Alphabet.pm
+++ b/lib/Krawfish/Result/Sort/Alphabet.pm

@@ -1,5 +1,39 @@
+package Krawfish::Result::Sort;
+use Krawfish::Log;
+use strict;
+use warnings;
+
+use constant DEBUG => 0;
+
 # Sort by characters of a certain segment (either the first or the last).
 # This will require to open the offset file to get the first two characters
 # for bucket sorting per token and then request the
 # forward index (the offset is already liftet and may be stored in the buckets
 # as well) for fine grained sorting!
+
+# TODO:
+#   This will need to pass sorting criteria as strings for cluster sorting.
+#   At least for the top k matches.
+
+sub new {
+  my $class = shift;
+  my $self = bless {
+    query => shift,
+    index => shift,
+    # top_k => shift
+  }, $class;
+
+  my $dict = $self->dict;
+  my $subt = $self->subtokens;
+
+  while ($query->next) {
+    my $element = $query->current->clone;
+
+    # Get the subtoken info
+    my $x = $subt->get($element->doc_id, $element->start);
+
+    # Add element with id for sorting
+    push @record, [$element, $x->[2]]
+  };
+
+};
commit	ba0d3c8afc5b94492a4109e828f6e0a6d8638d5f	[log] [tgz]
author	Akron <nils@diewald-online.de>	Mon Feb 20 12:45:07 2017 +0100
committer	Akron <nils@diewald-online.de>	Mon Feb 20 12:45:07 2017 +0100
tree	d3cf2a926297db29b52d72f10bf051e45418e0c9
parent	1cb0b2112920c415f029dfb4d4edd18386a91da1 [diff]