First attempt to group instead of searching

commit: 18ff592b72950a89e155f5edc17b13146e6fff17 [log] [tgz]
author: Akron <nils@diewald-online.de> Fri Jan 13 10:09:45 2017 +0100
committer: Akron <nils@diewald-online.de> Fri Jan 13 10:09:45 2017 +0100
tree: bbd2c62f937b6f88db230c470cb68361c6e54204
parent: 063e425948316fa1b54f499ce10df242c1a4b0a6 [diff]
diff --git a/lib/Krawfish/Index.pm b/lib/Krawfish/Index.pm
index 56cda6e..857f1ed 100644
--- a/lib/Krawfish/Index.pm
+++ b/lib/Krawfish/Index.pm

@@ -9,7 +9,7 @@
 use warnings;
 use Scalar::Util qw!blessed!;
 use Mojo::JSON qw/encode_json decode_json/;
-use Mojo::Util qw/slurp/;
+use Mojo::File;
 
 # TODO: Add LiveDocs-PostingsList, that supports deletion
 #
@@ -113,7 +113,7 @@
   my $self = shift;
   my $doc = shift;
   unless (ref $doc) {
-    $doc = decode_json slurp $doc;
+    $doc = decode_json(Mojo::File->new($doc)->slurp);
   };
 
   # Get new doc_id

diff --git a/lib/Krawfish/Index/Dictionary.pm b/lib/Krawfish/Index/Dictionary.pm
index f663d7a..9764827 100644
--- a/lib/Krawfish/Index/Dictionary.pm
+++ b/lib/Krawfish/Index/Dictionary.pm

@@ -5,6 +5,7 @@
 use Krawfish::Index::PostingsList;
 
 # TODO: Use Storable
+# TODO: Support case insensitivity
 
 use constant DEBUG => 0;
 

diff --git a/lib/Krawfish/Index/Segments.pm b/lib/Krawfish/Index/Segments.pm
index 6a5887c..a418114 100644
--- a/lib/Krawfish/Index/Segments.pm
+++ b/lib/Krawfish/Index/Segments.pm

@@ -10,7 +10,14 @@
 #   The segments structure may be augmented with a skiplist
 #   and be a highly optimized position encoding, because character offsets
 #   should normally have values between 0 and 16.
-
+#
+#   It should also contain information about the first two characters
+#   of a term and possibly the last two characters, necessary to bucket sort terms.
+#   The characters are stored as UTF-8 or similar -
+#   it may be beneficial to have the most common characters need the least
+#   bits.
+#   Note that this information needs to store characters and not
+#   bytes, as bytes may not be helpful for sorting!
 
 # Constructor
 sub new {
@@ -42,4 +49,18 @@
   return $self->{$doc_id . '#' . $segment};
 };
 
+
+# Define, how many start characters will be stored
+sub start_char_length {
+  2;
+}
+
+# Define, how many end characters will be stored
+sub end_char_length {
+  2;
+}
+
+# TODO: A Segment has ->start_offset, ->length, ->first_chars, ->last_chars, ->term_id
+# term_id may either be a term-id or a string
+
 1;

diff --git a/lib/Krawfish/Index/TokensList.pm b/lib/Krawfish/Index/TokensList.pm
index 84792cf..a1a5d3a 100644
--- a/lib/Krawfish/Index/TokensList.pm
+++ b/lib/Krawfish/Index/TokensList.pm

@@ -3,6 +3,9 @@
 use strict;
 use warnings;
 
+# DEPRECATED!
+# A forward index will be used instead!
+
 # This is a forward index for tokens
 # This will be used for complex regular expressions,
 # grouping of class results

diff --git a/lib/Krawfish/Koral/Query/Sequence.pm b/lib/Krawfish/Koral/Query/Sequence.pm
index e4f3360..8b32866 100644
--- a/lib/Krawfish/Koral/Query/Sequence.pm
+++ b/lib/Krawfish/Koral/Query/Sequence.pm

@@ -4,6 +4,9 @@
 use strict;
 use warnings;
 
+# TODO: Optimize if there is an identical subquery
+#   in a direct sequence, make this a repetition!!!
+
 use constant DEBUG => 0;
 
 sub new {

diff --git a/lib/Krawfish/Posting.pm b/lib/Krawfish/Posting.pm
index 007bfb5..c462a31 100644
--- a/lib/Krawfish/Posting.pm
+++ b/lib/Krawfish/Posting.pm

@@ -35,12 +35,27 @@
 };
 
 
+sub get_classes {
+  my ($self, $nrs) = @_;
+  # Check payload for relevant class and return start, end
+  # If no nrs are given, return all classes
+  ...
+};
+
+
+sub get_classes_sorted {
+  my ($self, $nrs) = @_;
+  # The same as get_classes, but ordered by start position
+  ...
+}
+
 # This will be overwritten for at least cached buffers
 # necessary for sorting
 sub offset {
   undef;
 };
 
+
 sub clone {
   my $self = shift;
   return __PACKAGE__->new(

diff --git a/lib/Krawfish/Query/Constraint/InDistanceSpan.pm b/lib/Krawfish/Query/Constraint/InDistanceSpan.pm
index 76a0809..60f139f 100644
--- a/lib/Krawfish/Query/Constraint/InDistanceSpan.pm
+++ b/lib/Krawfish/Query/Constraint/InDistanceSpan.pm

@@ -2,6 +2,16 @@
 use strict;
 use warnings;
 
+# The first span and the second span need to be inside
+# spans, maybe in the same (max=0) or with a distance.
+# There are gaps allowed in the distance.
+
+use constant {
+  NEXTA => 1,
+  NEXTB => 2,
+  MATCH => 4,
+};
+
 sub new {
   my $class = shift;
   bless {
@@ -10,31 +20,59 @@
     min => shift,
     max => shift
   }, $class;
-}
+};
 
+sub _init {
+  return if $_[0]->{init}++;
+  print_log('c_dist', 'Init distance span') if DEBUG;
+  $_[0]->{span}->next;
+};
+
+# Check the configuration
 sub check {
   my $self = shift;
   my ($payload, $first, $second) = @_;
 
-  # TODO: init span
+  # Find out ranges
+  my $lower_range = $first->start < $second->start ? $first->start : $second_start;
+  my $upper_range = $first->end > $second->end ? $first->end : $second->end;
+  # my ($start, $end) = $first->start < $second->start ? ($first, $second) : ($second, $first);
 
-  my $span = $self->{span};
+  $self->_init;
 
-  my $buffer = $self->{buffer};
+  my $distance = $self->{span};
+  my $ret_val = 0b0000;
 
   # No current element
-  return 0b0000 unless $span->current;
+  return $ret_val unless $distance->current;
 
   # Move span to correct position
-  while ($span->current->doc_id < $first->doc_id) {
-    $span->next or return NEXTA | NEXTB;
+  while ($distance->current->doc_id < $first->doc_id) {
+    $distance->next or return NEXTA | NEXTB;
   };
 
-  my $current = $span->current or return 0b0000;
+  # There is no correct position ...
+  if ($distance->current->doc_id > $first->doc_id) {
+    return NEXTA | NEXTB;
+  };
 
-  my ($start, $end) = $first->start < $second->start ? ($first, $second) : ($second, $first);
-  
-  if ($first->end > $current->end)
+  my $distance_current = $distance->current;
+  # Doc ID is at the correct position
+  # my $buffer = $self->{buffer};
+
+  # my $current = $span->current or return 0b0000;
+
+  # Forward, until the span's end overlaps the lower range
+  while ($distance->current->end < $lower_range) {
+    $distance->next;
+  };
+
+
+  # Distance is quite complicated - imagine a situation like this:
+  # <1> ... <2> ... [a] ... </2> ... <3> ... <4> ... [b] ... </4></3></1>
+  # ???
+  #
+  # if ($first->end > $current->end)
 };
 
 1;

diff --git a/lib/Krawfish/Result/Aggregate/Content.pm b/lib/Krawfish/Result/Aggregate/Content.pm
new file mode 100644
index 0000000..16a5cec
--- /dev/null
+++ b/lib/Krawfish/Result/Aggregate/Content.pm

@@ -0,0 +1,2 @@
+# Aggregate by content information, for example,
+# based on a certain class

diff --git a/lib/Krawfish/Result/Group.pm b/lib/Krawfish/Result/Group.pm
new file mode 100644
index 0000000..cafd35e
--- /dev/null
+++ b/lib/Krawfish/Result/Group.pm

@@ -0,0 +1,109 @@
+package Krawfish::Result::Limit;
+use Krawfish::Log;
+use strict;
+use warnings;
+
+use constant DEBUG => 0;
+
+# Group snippets based on certain criteria, for example:
+# metadata!
+# - this is an extension to facets, where snippets are grouped
+#   based on a certain facet.
+# - having facets in a first step may improve the distributed aggregation
+#   (as the central node then knows which facets are most or least common)
+# - this grouping doesn't seem beneficial - as the facet view already helps here
+#
+# innertextual!
+# - has a certain identical class on surface
+# - has the same starting characters of a word
+# - has the same ending characters of a word
+# - has the same POS of a certain class (this is actually pretty hard!)
+#   - this may mean to modify the search a bit to lift the posting types
+#     and make a class, like [orth=der & base/p=*]
+#   - At least the postingslist of base/p=* should be merged in parallel!
+#
+# This is already possible in C2 so it needs to be implemented!
+
+# A group has the following structure:
+# {
+#   criterion => [freq, doc_freq]
+# }
+# Where criterion may be a sequence of criteria
+# with class information, like
+#   1:der|2:Baum => []
+
+# Construct grouping function
+sub new {
+  my $class = shift;
+  bless {
+    query => shift,
+
+    # This is a group criterion object, created outside, that defines the criterion
+    criterion => shift,
+    classes => [@_],
+
+    # Group to fill with matches and group info
+    # (as class1=>X, class2=>Y)
+    groups => {}
+  }, $class;
+};
+
+# Go through all matches
+# This could, nonetheless, be implemented like Facets ...
+sub _init {
+  my $self = shift;
+  my $criterion = $self->{criterion};
+  my $query = $self->{query};
+
+  my %groups = ();
+  my $group, $current;
+  my $doc_id = -1;
+
+  while ($query->next) {
+
+    $current = $query->current or last;
+
+    # Potentially create new group
+    $group = ($groups{$criterion->get_group($current)} //= [0,0]);
+
+    # Increment freq
+    $group->[0]++;
+
+    if ($current->doc_id != $doc_id) {
+
+      # Increment doc_freq
+      $group->[1]++;
+
+      $doc_id = $current->doc_id;
+    };
+  };
+
+  return \%groups;
+};
+
+sub next {
+  my $self = shift;
+
+
+  return $criterion->groups;
+};
+
+
+sub current {
+  $_[0]->{query}->current;
+}
+
+
+# May return a hash reference with information
+sub current_group;
+
+
+sub to_string {
+  my $self = shift;
+  my $str = 'collectGroups(';
+  $str .= $self->{criterion}->to_string . ':';
+  $str .= $self->{query}->to_string;
+  $str .= ')';
+};
+
+1;

diff --git a/lib/Krawfish/Result/Group/Characters.pm b/lib/Krawfish/Result/Group/Characters.pm
new file mode 100644
index 0000000..9dc5d83
--- /dev/null
+++ b/lib/Krawfish/Result/Group/Characters.pm

@@ -0,0 +1,57 @@
+package Krawfish::Result::Group::Character;
+use Krawfish::Log;
+use strict;
+use warnings;
+
+use constant DEBUG => 0;
+
+sub new {
+  my $class = shift;
+  bless {
+    segments   => shift, # Krawfish::Index::Segments object
+    from_start => shift,  # boolean - otherwise from end
+    char_count => shift
+    nrs => [@_]
+  }, $class;
+};
+
+
+sub get_group {
+  my ($self, $match) = @_;
+
+  # Get all classes from the match
+  my @classes = $self->get_classes($self->{nrs});
+
+  my $segments = $self->{segments};
+
+  my %group;
+
+  # Classes have nr, start, end
+  foreach my $class (sort { $a->start <=> $b->start } @classes) {
+
+    if ($self->{from_start}) {
+
+      # This will retrieve the segment from the segments stream
+      my $segment = $stream->get($match->doc_id, $class->start);
+
+      if ($segment->)
+
+        # The character count can be satisfied by the
+      my $first_chars = $segment->first_chars;
+
+      if (length($first_chars) <= $self->{char_count} {
+        substr($first_chars);
+      }
+      
+      # Check, if the segment only spans one segment
+      if ($class->end != $class->start+1) {
+        
+      };
+    }
+    else {
+      ...
+    };
+  };
+};
+
+1;

diff --git a/lib/Krawfish/Result/Group/Classes.pm b/lib/Krawfish/Result/Group/Classes.pm
new file mode 100644
index 0000000..a974758
--- /dev/null
+++ b/lib/Krawfish/Result/Group/Classes.pm

@@ -0,0 +1,69 @@
+package Krawfish::Result::Group::Classes;
+use Krawfish::Log;
+use strict;
+use warnings;
+
+# TODO:
+#   The name is somehow misleading, as this will only
+#   group by surface terms.
+
+use constant DEBUG => 0;
+
+sub new {
+  my $class = shift;
+  bless {
+    segments   => shift, # Krawfish::Index::Segments object
+    nrs => [@_],
+    groups => {}
+  }, $class;
+};
+
+
+# This will return a string, reflecting the group name of the list
+sub get_group {
+  my ($self, $match) = @_;
+
+  # Get all classes from the match
+  # Classes need to be sorted by start position
+  # to be retrievable, in case the Segments-Stream
+  # is implemented as a postingslist (probably not)
+  my @classes = $self->get_classes_sorted($self->{nrs});
+
+  my %class_group;
+
+  # Classes have nr, start, end
+  foreach my $class (@classes) {
+
+    # WARNING! CLASSES MAY OVERLAP SO SEGMENTS SHOULD BE CACHED OR BUFFERED!
+
+    # Get start position
+    my $start = $class->start;
+
+    my @seq = ();
+
+    # Receive segment
+    my $seg = $segments->get($match->doc_id, $start);
+
+    # Push term id to sequence
+    push (@seq, $seg->term_id);
+
+    while ($start < $class->end -1) {
+      $seg = $segments->get($match->doc_id, $start++);
+
+      # Push term id to sequence
+      push (@seq, $seg->term_id);
+    };
+
+    $class_group{$class->nr} = join('|', @seq);
+  };
+
+  my $string = '';
+  foreach (sort {$a <=> $b} keys %class_group) {
+    $string .= $_ .':' . class_group{$_} . ';';
+  };
+
+  return $string;
+};
+
+
+1;

diff --git a/lib/Krawfish/Result/Sort/Alphabet.pm b/lib/Krawfish/Result/Sort/Alphabet.pm
new file mode 100644
index 0000000..cc336f2
--- /dev/null
+++ b/lib/Krawfish/Result/Sort/Alphabet.pm

@@ -0,0 +1,5 @@
+# Sort by characters of a certain segment (either the first or the last).
+# This will require to open the offset file to get the first two characters
+# for bucket sorting per token and then request the
+# forward index (the offset is already lifted and may be stored in the buckets
+# as well) for fine grained sorting!
commit	18ff592b72950a89e155f5edc17b13146e6fff17	[log] [tgz]
author	Akron <nils@diewald-online.de>	Fri Jan 13 10:09:45 2017 +0100
committer	Akron <nils@diewald-online.de>	Fri Jan 13 10:09:45 2017 +0100
tree	bbd2c62f937b6f88db230c470cb68361c6e54204
parent	063e425948316fa1b54f499ce10df242c1a4b0a6 [diff]