Among other things, some notes regarding merging of segments

commit: c2a107d84e57584c31756f206ee2f4e306b2f16d [log] [tgz]
author: Akron <nils@diewald-online.de> Wed Apr 19 22:05:29 2017 +0200
committer: Akron <nils@diewald-online.de> Wed Apr 19 22:05:29 2017 +0200
tree: 863d03097c9234308495a6369602054a42a1b279
parent: b4e1d642fb14e3f4f06f8d563a71882241b816ca [diff]
diff --git a/lib/Krawfish/Index/Dictionary.pm b/lib/Krawfish/Index/Dictionary.pm
index b288ac4..c0d0558 100644
--- a/lib/Krawfish/Index/Dictionary.pm
+++ b/lib/Krawfish/Index/Dictionary.pm

@@ -27,6 +27,11 @@
 #     dictionary in memory, write to disc,
 #     and exchange the old dictionary with the new one.
 
+# TODO:
+#   - While field ranks are done using rank files per segment,
+#     surface terms need to be reranked all the time -
+#     Or there are segment-wide ranks as well ...
+
 # TODO: Use Storable
 # TODO: Support case insensitivity
 # TODO: Create forward index with term-ids

diff --git a/lib/Krawfish/Index/Merge.pm b/lib/Krawfish/Index/Merge.pm
new file mode 100644
index 0000000..3e7acaa
--- /dev/null
+++ b/lib/Krawfish/Index/Merge.pm

@@ -0,0 +1,120 @@
+package Krawfish::Index::Merge;
+use strict;
+use warnings;
+
+sub new {
+  # Constructor: store both indices that are to be merged
+  my $class = shift;
+  my %self;
+  @self{qw/index_a index_b/} = @_;
+  bless \%self, $class;
+};
+
+sub merge {
+  my $self = shift;
+
+  # Run all merge steps in order. Merging will:
+  # - Concatenate all postings lists
+  $self->_merge_postings_lists;
+
+  # - concatenate field information
+  #   - This is also necessary for reranking
+  $self->_merge_fields;
+
+  # - concatenate all subtoken lists
+  $self->_merge_subtoken_lists;
+
+  # - Rerank all field ranks
+  #   (ignoring deleted documents)
+  #   - Update pointer file to the dictionary (or maybe not)
+  # - This requires, that the "get_field(x)" is already
+  #   prepared for both indices
+  $self->_rerank_fields;
+
+  # - Concatenate and update primary files / forward index
+  $self->_merge_primary_data;
+};
+
+sub _merge_postings_lists {
+  # Not implemented yet - the body only holds the notes below.
+  # TODO:
+  #   (ignore deleted documents)
+  #   - Add SkipLists to postings lists
+  #   - Update position information in dictionary
+  #     (or rather in the pointer file per segment)
+};
+
+sub _merge_fields {
+  # TODO (not implemented yet - currently a no-op):
+  #   (ignore deleted documents)
+  #   - Take all field information and write it to a new file
+  #   - Update all pointing information that maps doc_id->field_pos
+};
+
+
+sub _merge_subtoken_lists {
+  # TODO (not implemented yet - currently a no-op; ignore deleted documents):
+  #   - Take all subtoken lists and write them to a new file
+  #   - The position offsets to the primary data files should stay intact
+};
+
+sub _merge_primary_data {
+  # TODO (not implemented yet - currently a no-op):
+  #   (ignore deleted documents)
+  #   - Take all primary data and write it to a new file
+  #   - Update all pointing information that maps doc_id->primary_pos
+  #   - The position pointer offsets in the subtoken-lists should stay intact
+};
+
+
+sub _rerank_fields {
+  # TODO (not implemented yet - currently a no-op):
+  #   (ignore deleted documents)
+  #   Case A) At the beginning the mechanism has two field ranks:
+  #
+  #   A  Version:24; Max:4; 3,4,3,1
+  #   B  Version:28; Max:5; 3,5,2,1
+  #
+  #   Both ranks have different dictionary version numbers, which means
+  #   the ranks may differ. To find out whether there is, e.g.,
+  #   a new field ranked between 1 and 2, the field backlog of the
+  #   dictionary is requested with the structure:
+  #
+  #   author:
+  #     V25:
+  #       Goethe: 3 (means: Goethe was inserted before 3!)
+  #     V27:
+  #       Schiller: 9
+  #
+  #   First: The maximum rank is looked up and incremented, to check
+  #   whether the max value needs to be updated. Then it is checked which max value
+  #   is the new max, which is then used as the new max (dictating the bit width).
+  #
+  #   Based on that information the rank list of the older version is updated by
+  #   incrementing old ranks by the number of new ranks in between.
+  #
+  #   Then both ranks are concatenated.
+  #
+  #   In case the max value was introduced by a then-deleted document,
+  #   update the max value (though do not update the bit width again).
+
+
+  #   Case B) At the beginning, one segment has a field rank, the other has none:
+  #
+  #   A  Version:24; Max:4; 3,4,3,1
+  #   B
+  #
+  #   In that case B first needs to get a field rank.
+
+
+  #   Case C) At the beginning, no segment has a field rank:
+  #
+  #   A
+  #   B
+  #
+  #   In that case, concatenate first, then rank.
+  #   Take the maximum rank and use this for encoding.
+};
+
+
+1;

diff --git a/lib/Krawfish/Koral/Meta/Builder.pm b/lib/Krawfish/Koral/Meta/Builder.pm
index 4e7c408..785517c 100644
--- a/lib/Krawfish/Koral/Meta/Builder.pm
+++ b/lib/Krawfish/Koral/Meta/Builder.pm

@@ -1,5 +1,7 @@
 package Krawfish::Koral::Meta::Builder;
 use parent 'Krawfish::Koral::Meta';
+use Krawfish::Koral::Meta::Sort::Field;
+use Krawfish::Koral::Meta::Sort;
 use strict;
 use warnings;
 

diff --git a/lib/Krawfish/Koral/Meta/Sort.pm b/lib/Krawfish/Koral/Meta/Sort.pm
index 1b21441..e688143 100644
--- a/lib/Krawfish/Koral/Meta/Sort.pm
+++ b/lib/Krawfish/Koral/Meta/Sort.pm

@@ -4,6 +4,9 @@
 use Krawfish::Log;
 use Krawfish::Result::Sort;
 
+# All meta-queries need the nesting query for
+# plan_for
+
 use constant DEBUG => 0;
 
 # TODO: Should differ between
@@ -11,18 +14,52 @@
 # and
 # - sort_by_class()
 
-# fields => [[asc => 'author', desc => 'title']]
+# TODO: should support criteria instead
+# criteria => [[field => asc => 'author', field => desc => 'title']]
 sub new {
   my $class = shift;
-  my $query = shift;
-  my @fields = @_;
   bless {
-    query => $query,
-    fields => \@fields,
-    filterable => 0
+    criteria => [@_],  # all remaining arguments become the sort criteria
+    top_k => undef     # initially unset; read and written via top_k()
   }, $class;
 };
 
+sub top_k {
+  my ($self, @value) = @_;
+  # Setter if a value is passed (returning it), getter otherwise
+  return @value ? ($self->{top_k} = $value[0]) : $self->{top_k};
+};
+
+# Order sort (not implemented yet: the ellipsis statement dies with "Unimplemented")
+sub plan_for {
+  my ($self, $index, $query) = @_;
+  ...
+};
+
+
+sub type { return 'sort' };  # constant type name
+
+
+sub to_koral_fragment {
+  ...  # Not implemented yet: the ellipsis statement dies with "Unimplemented"
+};
+
+
+# Stringify sort
+sub to_string {
+  my ($self) = @_;
+
+  # Serialize each criterion in order, without any separator
+  my @parts = map { scalar $_->to_string } @{$self->{criteria}};
+
+  return 'sort(' . join('', @parts) . ')';
+};
+
+
+1;
+
+
+__END__
 
 # Sorting can be optimized by an appended filter, in case there is no need
 # for counting all matches and documents.

diff --git a/lib/Krawfish/Koral/Meta/Sort/Field.pm b/lib/Krawfish/Koral/Meta/Sort/Field.pm
new file mode 100644
index 0000000..27baaf7
--- /dev/null
+++ b/lib/Krawfish/Koral/Meta/Sort/Field.pm

@@ -0,0 +1,19 @@
+package Krawfish::Koral::Meta::Sort::Field;
+use strict;
+use warnings;
+
+sub new {
+  # Constructor: takes the field name and a descending flag
+  my ($class, $field, $desc) = @_;
+
+  my %self = (field => $field, desc => $desc);
+  return bless \%self, $class;
+};
+
+sub to_string {
+  my $self = shift;
+  # Field name followed by '>' if the desc flag is set, '<' otherwise
+  return 'field=' . $self->{field} . ($self->{desc} ? '>' : '<');
+};
+
+1;

diff --git a/lib/Krawfish/Koral/Query/Constraint/Position.pm b/lib/Krawfish/Koral/Query/Constraint/Position.pm
index 5e490ff..569c7b4 100644
--- a/lib/Krawfish/Koral/Query/Constraint/Position.pm
+++ b/lib/Krawfish/Koral/Query/Constraint/Position.pm

@@ -3,6 +3,13 @@
 use strict;
 use warnings;
 
+# TODO:
+#   It should be noted that optimization should
+#   keep skip_position() in mind. So in situations
+#   like <a><b>, the <b> can be skipped to a position
+#   equal to the end of <a>, while <a> can't be skipped
+#   to end at the beginning of <b>.
+
 our %FRAME = (
   precedes => PRECEDES,
   precedesDirectly => PRECEDES_DIRECTLY,

diff --git a/lib/Krawfish/Result/Aggregate.pm b/lib/Krawfish/Result/Aggregate.pm
index 2ba15e0..6a49c5d 100644
--- a/lib/Krawfish/Result/Aggregate.pm
+++ b/lib/Krawfish/Result/Aggregate.pm

@@ -6,6 +6,9 @@
 
 use constant DEBUG => 0;
 
+# TODO: Rename to Krawfish::Result::Segment::Aggregate
+
+
 # TODO:
 #   See https://www.elastic.co/guide/en/elasticsearch/reference/current/search-aggregations.html
 

diff --git a/lib/Krawfish/Result/Sort.pm b/lib/Krawfish/Result/Sort.pm
index 852c378..8aeee7e 100644
--- a/lib/Krawfish/Result/Sort.pm
+++ b/lib/Krawfish/Result/Sort.pm

@@ -7,6 +7,15 @@
 
 use constant DEBUG => 0;
 
+
+# TODO: Rename to Krawfish::Result::Segment::Sort
+#
+# TODO:
+#   Do not sort based on RANKS but based on Criteria.
+#   So, in case the first criterion is a field rank,
+#   use PriorityCascade, otherwise use e.g. Arbitrary PriorityQueue
+
+
 # See Krawfish::Util::Buckets
 
 

diff --git a/lib/Krawfish/Result/Sort/Criterion/Rank.pm b/lib/Krawfish/Result/Sort/Criterion/Rank.pm
new file mode 100644
index 0000000..8345aa7
--- /dev/null
+++ b/lib/Krawfish/Result/Sort/Criterion/Rank.pm

@@ -0,0 +1,42 @@
+package Krawfish::Result::Sort::Criterion::Field;
+use strict;
+use warnings;
+
+# TODO:
+#   The same criterion for K::Result::Node::Field
+#   will introduce field fetching etc.
+
+# Constructor
+sub new {
+  my ($class, $index, $field, $desc) = @_;
+
+  # Fetch the ranking before blessing, as its maximum rank
+  # is needed to invert ranks for descending order
+  my $ranking = $index->fields->ranked_by($field);
+  bless {
+    field => $field, desc => $desc, ranking => $ranking,
+    max => ($desc ? $ranking->max : undef)
+  }, $class;
+};
+
+
+# Get the rank of the match
+sub rank {
+  my $self = shift;
+  my $rank = $self->{ranking}->get(shift->doc_id);
+  # Invert the rank against the maximum, if one is set
+  return $rank unless $self->{max};
+  return $self->{max} - $rank;
+};
+
+
+# Serialize to string
+sub to_string {
+  my ($self) = @_;
+
+  # '>' is appended if the desc flag is set, '<' otherwise
+  my $direction = $self->{desc} ? '>' : '<';
+  return 'field=' . $self->{field} . $direction;
+};
+
+1;

diff --git a/lib/Krawfish/Result/Sort/PriorityCascade.pm b/lib/Krawfish/Result/Sort/PriorityCascade.pm
index 30a92d8..04cc68e 100644
--- a/lib/Krawfish/Result/Sort/PriorityCascade.pm
+++ b/lib/Krawfish/Result/Sort/PriorityCascade.pm

@@ -7,6 +7,8 @@
 use strict;
 use warnings;
 
+# This is only based on criteria that return ranks
+
 use constant {
   DEBUG   => 0,
   RANK    => 0,
@@ -50,6 +52,7 @@
   # It has the structure [[field], [field, 1]]
   # where the second value is the descending marker
   my $fields = $param{fields};
+  # TODO: Change to criterion!
 
   # For final field distinction, use unique field
   push @$fields, [$param{unique}];

diff --git a/lib/Krawfish/Result/SortCriteria.pm b/lib/Krawfish/Result/SortCriteria.pm
new file mode 100644
index 0000000..87b358d
--- /dev/null
+++ b/lib/Krawfish/Result/SortCriteria.pm

@@ -0,0 +1,20 @@
+package Krawfish::Result::SortCriteria;
+use strict;
+use warnings;
+
+# Similar to Snippet, this will add the surface information
+# for all sorting criteria to make sorting possible for
+# cluster sorting.
+
+# sort-criteria : [
+#   {
+#     "@type" : "koral:field"
+#     ...
+#   },
+#   {
+#     "@type" : "koral:string"
+#     ...
+#   }
+# ]
+
+1;

diff --git a/lib/Krawfish/Util/PriorityQueue/Arbitrary.pm b/lib/Krawfish/Util/PriorityQueue/Arbitrary.pm
new file mode 100644
index 0000000..f5392f2
--- /dev/null
+++ b/lib/Krawfish/Util/PriorityQueue/Arbitrary.pm

@@ -0,0 +1,3 @@
+# This will have a priority queue for arbitrary
+# criteria data that will be used on all levels of parallelism
+# (segment, node, and cluster) to sort non-ranked data.

diff --git a/lib/Krawfish/Util/PriorityQueue/PerDoc.pm b/lib/Krawfish/Util/PriorityQueue/PerDoc.pm
index f6fd12b..84fc86f 100644
--- a/lib/Krawfish/Util/PriorityQueue/PerDoc.pm
+++ b/lib/Krawfish/Util/PriorityQueue/PerDoc.pm

@@ -4,6 +4,11 @@
 use warnings;
 use Krawfish::Log;
 
+# TODO: This currently only works with ranks,
+#       it may be more beneficial to work with criteria.
+#       So instead of comparing ranks, a Criterion object will
+#       compare arbitrary data in the "rank"-field
+
 # TODO: Probably rename from IN_DOC to IN_COLL
 # TODO: Probably rename to PriorityQueue::Bundle
 # TODO: Turn reverse_array into an iterator
commit	c2a107d84e57584c31756f206ee2f4e306b2f16d	[log] [tgz]
author	Akron <nils@diewald-online.de>	Wed Apr 19 22:05:29 2017 +0200
committer	Akron <nils@diewald-online.de>	Wed Apr 19 22:05:29 2017 +0200
tree	863d03097c9234308495a6369602054a42a1b279
parent	b4e1d642fb14e3f4f06f8d563a71882241b816ca [diff]