Among other things, some notes regarding merging of segments
diff --git a/lib/Krawfish/Index/Dictionary.pm b/lib/Krawfish/Index/Dictionary.pm
index b288ac4..c0d0558 100644
--- a/lib/Krawfish/Index/Dictionary.pm
+++ b/lib/Krawfish/Index/Dictionary.pm
@@ -27,6 +27,11 @@
# dictionary in memory, write to disc,
# and exchange the old dictionary with the new one.
+# TODO:
+# - While field ranks are done using rank files per segment,
+# surface terms need to be reranked all the time -
+# Or there are segment-wide ranks as well ...
+
# TODO: Use Storable
# TODO: Support case insensitivity
# TODO: Create forward index with term-ids
diff --git a/lib/Krawfish/Index/Merge.pm b/lib/Krawfish/Index/Merge.pm
new file mode 100644
index 0000000..3e7acaa
--- /dev/null
+++ b/lib/Krawfish/Index/Merge.pm
@@ -0,0 +1,120 @@
+package Krawfish::Index::Merge;
+use strict;
+use warnings;
+
+sub new {
+  # Constructor: remembers the two indices handed in under index_a / index_b
+  my $class = shift;
+  my ($first, $second) = @_;
+  my %merger = (index_a => $first, index_b => $second);
+  return bless \%merger, $class;
+};
+
+sub merge {
+  my $self = shift;
+  # Run all merge steps on the invocant, in this order:
+  #   - Concatenate all postings lists
+  $self->_merge_postings_lists;
+
+  #   - Concatenate field information
+  #     (this is also necessary for reranking)
+  $self->_merge_fields;
+
+
+  #   - Concatenate all subtoken lists
+  $self->_merge_subtoken_lists;
+
+  #   - Rerank all field ranks
+  #     (ignoring deleted documents)
+  #   - Update pointer file to the dictionary (or maybe not)
+  #     - This requires that the "get_field(x)" is already
+  #       prepared for both indices
+  $self->_rerank_fields;
+
+  #   - Concatenate and update primary files / forward index
+  $self->_merge_primary_data;
+};
+
+sub _merge_postings_lists {
+ # Empty stub for now; merge() calls it to concatenate all postings lists.
+ # TODO:
+ # - Ignore deleted documents
+ # - Add SkipLists to postings lists
+ # - Update position information in dictionary
+ #   (or rather in the pointer file per segment)
+};
+
+sub _merge_fields {
+ # TODO (empty stub for now; merge() calls it to concatenate field information):
+ # - Ignore deleted documents
+ # - Take all field information and write them in a new file
+ # - Update all pointing information that maps doc_id->field_pos
+};
+
+
+sub _merge_subtoken_lists {
+ # TODO (empty stub for now; ignore deleted documents):
+ # - Take all subtoken lists and write them to a new file
+ # - The position offsets to the primary data files should stay intact
+};
+
+sub _merge_primary_data {
+ # TODO (empty stub for now; merge() calls it as its last step):
+ # - Ignore deleted documents
+ # - Take all primary data and write them in a new file
+ # - Update all pointing information that maps doc_id->primary_pos
+ # - The position pointer offsets in the subtoken-lists should stay intact
+};
+
+
+sub _rerank_fields {
+ # TODO (empty stub for now; the notes below sketch the planned behaviour):
+ # (ignore deleted documents)
+ # Case A) At the beginning the mechanism has two field ranks:
+ #
+ #   A Version:24; Max:4; 3,4,3,1
+ #   B Version:28; Max:5; 3,5,2,1
+ #
+ # Both ranks have different dictionary version numbers, which means
+ # the ranks may differ. To find out whether there is, e.g.,
+ # a new field ranked between 1 and 2, the field backlog of the
+ # dictionary is requested with the structure:
+ #
+ #   author:
+ #     V25:
+ #       Goethe: 3 (means: Goethe was inserted before 3!)
+ #     V27:
+ #       Schiller: 9
+ #
+ # First: The maximum rank is looked up and incremented, to check
+ # whether the max-value needs to be updated. Then it is checked which max-value
+ # is the new max, which is then used as the new max (dictating the bit width).
+ #
+ # Based on that information the rank list of the older version is updated by
+ # incrementing old ranks by the number of new ranks in between.
+ #
+ # Then both ranks are concatenated.
+ #
+ # In case the max value was introduced by a then-deleted document,
+ # update the max value (though do not update the bitwidth again).
+
+
+ # Case B) At the beginning, one segment has a field rank, the other has none:
+ #
+ #   A Version:24; Max:4; 3,4,3,1
+ #   B
+ #
+ # In that case B first needs to get a field rank.
+
+
+ # Case C) At the beginning, no segment has a field rank:
+ #
+ #   A
+ #   B
+ #
+ # In that case, concatenate first, then rank.
+ # Take the maximum rank and use this for encoding.
+};
+
+
+1;
diff --git a/lib/Krawfish/Koral/Meta/Builder.pm b/lib/Krawfish/Koral/Meta/Builder.pm
index 4e7c408..785517c 100644
--- a/lib/Krawfish/Koral/Meta/Builder.pm
+++ b/lib/Krawfish/Koral/Meta/Builder.pm
@@ -1,5 +1,7 @@
package Krawfish::Koral::Meta::Builder;
use parent 'Krawfish::Koral::Meta';
+use Krawfish::Koral::Meta::Sort::Field;
+use Krawfish::Koral::Meta::Sort;
use strict;
use warnings;
diff --git a/lib/Krawfish/Koral/Meta/Sort.pm b/lib/Krawfish/Koral/Meta/Sort.pm
index 1b21441..e688143 100644
--- a/lib/Krawfish/Koral/Meta/Sort.pm
+++ b/lib/Krawfish/Koral/Meta/Sort.pm
@@ -4,6 +4,9 @@
use Krawfish::Log;
use Krawfish::Result::Sort;
+# All meta-queries need the nesting query for
+# plan_for
+
use constant DEBUG => 0;
# TODO: Should differ between
@@ -11,18 +14,52 @@
# and
# - sort_by_class()
-# fields => [[asc => 'author', desc => 'title']]
+# TODO: should support criteria instead
+# criteria => [[field => asc => 'author', field =>desc => 'title']]
sub new {
my $class = shift;
- my $query = shift;
- my @fields = @_;
bless {
- query => $query,
- fields => \@fields,
- filterable => 0
+ criteria => [@_],
+ top_k => undef
}, $class;
};
+sub top_k {
+  my ($self, @args) = @_;
+  # Combined accessor: stores the first argument if one is passed, reads otherwise
+  return @args ? ($self->{top_k} = $args[0]) : $self->{top_k};
+};
+
+# Order sort: receives ($index, $query); the body is only the '...' placeholder
+sub plan_for {
+ my ($self, $index, $query) = @_;
+ ...
+};
+
+
+sub type { return 'sort' };
+
+# Not implemented yet: the body is only the '...' placeholder
+sub to_koral_fragment {
+ ...
+};
+
+
+# Stringify sort
+sub to_string {
+  my ($self) = @_;
+  my @parts = ('sort(');
+
+  # Criteria are serialized back to back, without any separator
+  push @parts, scalar $_->to_string for @{ $self->{criteria} };
+  return join '', @parts, ')';
+};
+
+
+1;
+
+
+__END__
# Sorting can be optimized by an appended filter, in case there is no need
# for counting all matches and documents.
diff --git a/lib/Krawfish/Koral/Meta/Sort/Field.pm b/lib/Krawfish/Koral/Meta/Sort/Field.pm
new file mode 100644
index 0000000..27baaf7
--- /dev/null
+++ b/lib/Krawfish/Koral/Meta/Sort/Field.pm
@@ -0,0 +1,19 @@
+package Krawfish::Koral::Meta::Sort::Field;
+use strict;
+use warnings;
+
+sub new {
+  my ($class, $field, $desc) = @_;
+
+  # $desc is the direction flag that to_string renders as '>' or '<'
+  my %criterion = (field => $field, desc => $desc);
+  return bless \%criterion, $class;
+};
+
+sub to_string {
+  my ($self) = @_;
+  # A trailing '>' is appended if desc is set, '<' otherwise
+  return 'field=' . $self->{field} . ($self->{desc} ? '>' : '<');
+};
+
+1;
diff --git a/lib/Krawfish/Koral/Query/Constraint/Position.pm b/lib/Krawfish/Koral/Query/Constraint/Position.pm
index 5e490ff..569c7b4 100644
--- a/lib/Krawfish/Koral/Query/Constraint/Position.pm
+++ b/lib/Krawfish/Koral/Query/Constraint/Position.pm
@@ -3,6 +3,13 @@
use strict;
use warnings;
+# TODO:
+# It should be noted that optimization should
+# keep skip_position() in mind, so in situations
+# like <a><b>, the <b> can be skipped to a position
+# equal to the end of <a>, while <a> can't be skipped
+# to end at the beginning of <b>.
+
our %FRAME = (
precedes => PRECEDES,
precedesDirectly => PRECEDES_DIRECTLY,
diff --git a/lib/Krawfish/Result/Aggregate.pm b/lib/Krawfish/Result/Aggregate.pm
index 2ba15e0..6a49c5d 100644
--- a/lib/Krawfish/Result/Aggregate.pm
+++ b/lib/Krawfish/Result/Aggregate.pm
@@ -6,6 +6,9 @@
use constant DEBUG => 0;
+# TODO: Rename to Krawfish::Result::Segment::Aggregate
+
+
# TODO:
# See https://www.elastic.co/guide/en/elasticsearch/reference/current/search-aggregations.html
diff --git a/lib/Krawfish/Result/Sort.pm b/lib/Krawfish/Result/Sort.pm
index 852c378..8aeee7e 100644
--- a/lib/Krawfish/Result/Sort.pm
+++ b/lib/Krawfish/Result/Sort.pm
@@ -7,6 +7,15 @@
use constant DEBUG => 0;
+
+# TODO: Rename to Krawfish::Result::Segment::Sort
+#
+# TODO:
+# Do not sort based on RANKS but based on Criteria.
+# So, in case the first criterion is a field rank,
+# use PriorityCascade, otherwise use e.g. Arbitrary PriorityQueue
+
+
# See Krawfish::Util::Buckets
diff --git a/lib/Krawfish/Result/Sort/Criterion/Rank.pm b/lib/Krawfish/Result/Sort/Criterion/Rank.pm
new file mode 100644
index 0000000..8345aa7
--- /dev/null
+++ b/lib/Krawfish/Result/Sort/Criterion/Rank.pm
@@ -0,0 +1,42 @@
+package Krawfish::Result::Sort::Criterion::Field;
+use strict;
+use warnings;
+
+# TODO:
+# The same criterion for K::Result::Node::Field
+# will introduce field fetching etc.
+
+# Constructor
+sub new {
+  my ($class, $index, $field, $desc) = @_;
+  # max is only set for descending order, where rank() uses it to invert ranks
+  my $ranking = $index->fields->ranked_by($field);
+  bless {
+    field   => $field,
+    desc    => $desc,
+    ranking => $ranking,
+    max     => ($desc ? scalar($ranking->max) : undef)
+  }, $class;
+};
+
+
+# Get the rank of the match
+sub rank {
+  my ($self, $match) = @_;
+  my $stored = $self->{ranking}->get($match->doc_id);
+  # Without a true max value the stored rank is returned unchanged
+  return $stored unless $self->{max};
+  return $self->{max} - $stored;
+};
+
+
+# Serialize to string
+sub to_string {
+  my ($self) = @_;
+
+  # The suffix encodes the order: '>' when desc is set, '<' otherwise
+  my $order = $self->{desc} ? '>' : '<';
+  return 'field=' . $self->{field} . $order;
+};
+
+1;
diff --git a/lib/Krawfish/Result/Sort/PriorityCascade.pm b/lib/Krawfish/Result/Sort/PriorityCascade.pm
index 30a92d8..04cc68e 100644
--- a/lib/Krawfish/Result/Sort/PriorityCascade.pm
+++ b/lib/Krawfish/Result/Sort/PriorityCascade.pm
@@ -7,6 +7,8 @@
use strict;
use warnings;
+# This is only based on criteria that return ranks
+
use constant {
DEBUG => 0,
RANK => 0,
@@ -50,6 +52,7 @@
# It has the structure [[field], [field, 1]]
# where the second value is the descending marker
my $fields = $param{fields};
+ # TODO: Change to criterion!
# For final field distinction, use unique field
push @$fields, [$param{unique}];
diff --git a/lib/Krawfish/Result/SortCriteria.pm b/lib/Krawfish/Result/SortCriteria.pm
new file mode 100644
index 0000000..87b358d
--- /dev/null
+++ b/lib/Krawfish/Result/SortCriteria.pm
@@ -0,0 +1,20 @@
+package Krawfish::Result::SortCriteria;
+use strict;
+use warnings;
+
+# Similar to Snippet, this will add the surface information
+# for all sorting criteria to make sorting possible for
+# cluster sorting.
+
+# sort-criteria : [
+# {
+# "@type" : "koral:field"
+# ...
+# },
+# {
+# "@type" : "koral:string"
+# ...
+# }
+# ]
+
+1;
diff --git a/lib/Krawfish/Util/PriorityQueue/Arbitrary.pm b/lib/Krawfish/Util/PriorityQueue/Arbitrary.pm
new file mode 100644
index 0000000..f5392f2
--- /dev/null
+++ b/lib/Krawfish/Util/PriorityQueue/Arbitrary.pm
@@ -0,0 +1,3 @@
+# This will have a priority queue for arbitrary
+# criteria data that will be used on all levels of parallelism
+# (segment, node, and cluster) to sort non-ranked data.
diff --git a/lib/Krawfish/Util/PriorityQueue/PerDoc.pm b/lib/Krawfish/Util/PriorityQueue/PerDoc.pm
index f6fd12b..84fc86f 100644
--- a/lib/Krawfish/Util/PriorityQueue/PerDoc.pm
+++ b/lib/Krawfish/Util/PriorityQueue/PerDoc.pm
@@ -4,6 +4,11 @@
use warnings;
use Krawfish::Log;
+# TODO: This currently only works with ranks,
+# it may be more beneficial to work with criteria.
+# So instead of comparing ranks, a Criterion object will
+# compare arbitrary data in the "rank"-field
+
# TODO: Probably rename from IN_DOC to IN_COLL
# TODO: Probably rename to PriorityQueue::Bundle
# TODO: Turn reverse_array into an iterator