Introduce concept for cluster search

commit: e0914534bd294164e2498b29283964eae395d04e [log] [tgz]
author: Akron <nils@diewald-online.de> Sat Jul 29 19:53:10 2017 +0200
committer: Akron <nils@diewald-online.de> Sat Jul 29 19:53:10 2017 +0200
tree: 17a11e639be96f9e2d116c0445c66305b6ef6083
parent: 826ec7718a4c28cf8fbc2fb47e0750e69f6ff00d [diff]
diff --git a/lib/Krawfish/Cluster.pm b/lib/Krawfish/Cluster.pm
index 9c0c470..99b05bf 100644
--- a/lib/Krawfish/Cluster.pm
+++ b/lib/Krawfish/Cluster.pm

@@ -1,10 +1,13 @@
 package Krawfish::Cluster;
+use Mojo::IOLoop;
 use strict;
 use warnings;
 
 # Krawfish::Cluster queries to multiple nodes
 # and takes care of failures in responses
 
+# See http://verdi.uwplse.org/
+
 sub new {
   my $class = shift;
   bless {
@@ -12,4 +15,46 @@
   }, $class;
 };
 
+
+# Post the query to all nodes, process the responses and return the result
+sub search_for {
+  my ($self, $query, $cb) = @_; # NOTE(review): $cb is never invoked in this sub - confirm
+
+  # This should probably open multiple websockets/unix-sockets in parallel
+  # https://stackoverflow.com/questions/13417000/synchronous-request-with-websockets
+  Mojo::IOLoop->delay(
+    sub {
+      my $delay = shift;
+      foreach my $node (@{$self->{nodes}}) {
+        $ua->post($node => json => $query => $delay->begin); # NOTE(review): $ua is not declared in this sub - confirm
+      };
+    },
+    sub {
+      my $delay = shift;
+
+      # Iterate over all results
+      foreach (@_) {
+
+        # Responses have a head and a tail section.
+        # In case no aggregation or grouping is done,
+        # there is no head section.
+        # In case there is grouping, there is no
+        # tail.
+        my $response = $_->res->json;
+
+        # Aggregate data, e.g. for grouping
+        $query->process_head($response->{head});
+
+        # Get through the matches
+        # TODO:
+        #   This is, however, bad for merge sort!
+        $query->process_tail($response->{tail});
+      };
+    }
+  )->wait;
+
+  return $query->to_result;
+};
+
+
 1;

diff --git a/lib/Krawfish/Controller/Index.pm b/lib/Krawfish/Controller/Index.pm
index 410a03e..f83ef9a 100644
--- a/lib/Krawfish/Controller/Index.pm
+++ b/lib/Krawfish/Controller/Index.pm

@@ -54,29 +54,17 @@
   # Get nodes object
   my $cluster = Krawfish::Cluster->new;
 
-  # Send to all nodes
-  $node_koral->send(
-    $cluster => (
+  # Send query to all nodes
+  $cluster->search_for(
+    $node_koral => sub {
+      my $response = shift;
 
-      # This sub will be triggered for each node
-      sub {
-        my ($query, $node) = @_;
+      # Add result to response
+      $response->{response} = $query->to_response;
 
-        # Process the head data
-        $query->process_head($node->response->head);
-      },
-
-      # This sub will triggered after all nodes were passed
-      sub {
-        my $query = shift;
-
-        # Add result to response
-        $response->{response} = $query->to_response;
-
-        # Return koral query response
-        return $c->render(json => $response->to_koral_query);
-      }
-    )
+      # Return koral query response
+      return $c->render(json => $response->to_koral_query);
+    }
   );
 };
 

diff --git a/lib/Krawfish/Posting.pm b/lib/Krawfish/Posting.pm
index ec175dd..fd3d912 100644
--- a/lib/Krawfish/Posting.pm
+++ b/lib/Krawfish/Posting.pm

@@ -111,13 +111,15 @@
   return @classes;
 };
 
+
 # Return classes sorted by start position
 sub get_classes_sorted {
   my ($self, $nrs) = @_;
   # The same as get_classes, but ordered by start position
 
   return sort { $a->[1] <=> $b->[1] } $self->get_classes($nrs);
-}
+};
+
 
 # This will be overwritten for at least cached buffers
 # necessary for sorting
@@ -126,6 +128,7 @@
 };
 
 
+# Clone the posting with all information
 sub clone {
   my $self = shift;
   return __PACKAGE__->new(
@@ -136,7 +139,8 @@
   );
 }
 
-# Stringify
+
+# Stringification
 sub to_string {
   my $self = shift;
   my $str = '[' .
@@ -152,6 +156,7 @@
 };
 
 
+# Check if two postings are identical
 sub is_identical {
   my ($self, $comp) = @_;
   return unless $comp;

diff --git a/lib/Krawfish/Posting/Bundle.pm b/lib/Krawfish/Posting/Bundle.pm
index 272da8b..cb8616b 100644
--- a/lib/Krawfish/Posting/Bundle.pm
+++ b/lib/Krawfish/Posting/Bundle.pm

@@ -7,8 +7,8 @@
 # TODO:
 #   This is quite similar to K::P::Group
 
-# This is a container class for multiple
-# Krawfish::Posting objects
+# This is a container class for multiple Krawfish::Posting objects,
+# used for (among others) sorting.
 
 # Constructor
 sub new {

diff --git a/lib/Krawfish/Posting/Match.pm b/lib/Krawfish/Posting/Match.pm
index 299a13b..6062e43 100644
--- a/lib/Krawfish/Posting/Match.pm
+++ b/lib/Krawfish/Posting/Match.pm

@@ -1,10 +1,13 @@
 package Krawfish::Posting::Match;
 use parent 'Krawfish::Posting';
 use Krawfish::Util::String qw/squote/;
-use JSON::XS;
 use warnings;
 use strict;
 
+
+# Matches are returned from searches and can be enriched
+# with various information
+
 # Get or set field to match
 sub fields {
   my $self = shift;
@@ -36,7 +39,22 @@
   };
 };
 
+sub sorting_criteria; # Forward declaration only - no body defined in this hunk
 
+sub snippet; # Forward declaration only
+
+sub segment_id; # Forward declaration only
+
+sub match_id; # Forward declaration only
+
+
+# Serialize to KoralQuery - not implemented yet ('...' dies when called)
+sub to_koral_query {
+  ...
+};
+
+
+# Stringification
 sub to_string {
   my $self = shift;
   my $str = '[';

diff --git a/lib/Krawfish/Result/Cluster.pm b/lib/Krawfish/Result/Cluster.pm
deleted file mode 100644
index 41b974f..0000000
--- a/lib/Krawfish/Result/Cluster.pm
+++ /dev/null

@@ -1 +0,0 @@
-# See http://verdi.uwplse.org/

diff --git a/lib/Krawfish/Result/Group/Fields.pm b/lib/Krawfish/Result/Group/Fields.pm
index 82c442e..15eb364 100644
--- a/lib/Krawfish/Result/Group/Fields.pm
+++ b/lib/Krawfish/Result/Group/Fields.pm

@@ -6,9 +6,9 @@
 use constant DEBUG => 0;
 
 # This will group matches (especially document matches) by field
-# This is useful for document browsing.
+# This is useful e.g. for document browsing per corpus.
 #
-# Because the groupiung is based on ranking, the sorting will be trivial.
+# Because the grouping is based on ranking, the sorting will be trivial.
 
 sub new {
   my $class = shift;

diff --git a/lib/Krawfish/Result/Node/Sort.pm b/lib/Krawfish/Result/Node/Sort.pm
index c60fcd5..0c9aa3d 100644
--- a/lib/Krawfish/Result/Node/Sort.pm
+++ b/lib/Krawfish/Result/Node/Sort.pm

@@ -1,33 +1,84 @@
 package Krawfish::Result::Node::Sort;
+use Krawfish::Util::Heap;
 use strict;
 use warnings;
 
-# This will simply mergesort the inmcoming
-# streams using next and prepare 'criterion'
-# for current.
+# This will sort the incoming results using a heap
+# and the sort criteria.
+# This is obviously less efficient than a dynamic
+# mergesort, but for the moment, it's way simpler.
 
-# May need to return Krawfish::Posting::Sorted with a 'criterion' array.
-
-# Instead of next() followed by current(), this should use
-# next_current() and - for matches - next_match()
+# TODO:
+#   May need to return Krawfish::Posting::Sorted with a 'criterion' array.
+#   Instead of next() followed by current(), this should use
+#   next_current() and - for matches - next_match()
 
 sub new {
   my $class = shift;
-  return bless {
+  my $self = bless {
     query => shift,
-    sort => shift
+    sort => shift,
+    top_k => shift
   }, $class;
+
+  $self->{heap} = Krawfish::Util::Heap->new($self->{top_k});
+
+  # Comparison callback for the heap: orders two items by their 'criterion' arrays
+  $self->{heap}->sort_by(
+    sub {
+      my ($obj_a, $obj_b) = @_;
+
+      my $criterion_a = $obj_a->{criterion};
+      my $criterion_b = $obj_b->{criterion};
+
+      for (my $i = 0; $i < @{$criterion_a}; $i++) {
+        unless (defined $criterion_b->[$i]) { # b has no criterion at this level
+          return 1;
+        };
+        if ($criterion_a->[$i] < $criterion_b->[$i]) {
+          return -1;
+        }
+        elsif ($criterion_a->[$i] > $criterion_b->[$i]) {
+          return 1;
+        };
+      };
+      return -1; # No criterion of a differed from b
+    }
+  );
+
+  return $self;
 };
 
 
 sub to_string {
   my $self = shift;
-  return 'sort(' . join(',', map { $_->to_string } @{$self->{sort}}) . ':' . $self->{query}->to_string . ')';
+  return 'sort(' . # Format: sort(<criterion>,<criterion>,...:<query>)
+    join(',', map { $_->to_string }
+         @{$self->{sort}}) . ':' . $self->{query}->to_string . ')';
 };
 
-sub next {
-  $_[0]->{query}->next;
+
+# Process one tail: enqueue its matches in the heap, then pass the tail on to the nested query
+sub process_tail {
+  my ($self, $tail) = @_;
+
+  # Iterate over all matches
+  foreach my $match (@$tail) {
+
+    # Enqueue as long as the list isn't full (stop at the first false return value)
+    unless ($self->{heap}->enqueue($match)) {
+      last;
+    };
+  };
+
+  $self->{query}->process_tail($tail); # The nested query receives the complete, unmodified tail
 };
 
 
+sub to_result { # Stub: not implemented yet
+  ... # The ellipsis statement dies with "Unimplemented" when executed
+};
+
+
+
 1;

diff --git a/lib/Krawfish/Util/Heap.pm b/lib/Krawfish/Util/Heap.pm
new file mode 100644
index 0000000..e4c46ff
--- /dev/null
+++ b/lib/Krawfish/Util/Heap.pm

@@ -0,0 +1,41 @@
+package Krawfish::Util::Heap;
+use strict;
+use warnings;
+
+# Heap structure for top-k heap sort
+
+# TODO:
+#   Use this as the base for PrioritySort
+
+# Constructor: stores the passed top_k value and a default comparator (string cmp)
+sub new {
+  my $class = shift;
+  bless {
+    top_k => shift,
+    _sort => sub { $_[0] cmp $_[1] }
+  }, $class;
+};
+
+# Get or set the sort method (a comparator callback taking two items).
+# The setter returns the heap object, the getter returns the current callback.
+sub sort_by {
+  my $self = shift;
+  if (@_) {
+    $self->{_sort} = shift;
+    return $self;
+  };
+  return $self->{_sort};
+};
+
+# Stub: not implemented yet ('...' dies with "Unimplemented" when called)
+sub enqueue {
+  ...
+};
+
+# Stub: not implemented yet ('...' dies with "Unimplemented" when called)
+sub dequeue {
+  ...
+};
+
+
+1;
commit	e0914534bd294164e2498b29283964eae395d04e	[log] [tgz]
author	Akron <nils@diewald-online.de>	Sat Jul 29 19:53:10 2017 +0200
committer	Akron <nils@diewald-online.de>	Sat Jul 29 19:53:10 2017 +0200
tree	17a11e639be96f9e2d116c0445c66305b6ef6083
parent	826ec7718a4c28cf8fbc2fb47e0750e69f6ff00d [diff]