Improve merge policy description and separate dynamic and static segment structures

commit: df4b40598ada4a049443eb83ed50348e48d6a696 [log] [tgz]
author: Akron <nils@diewald-online.de> Fri Apr 21 14:51:28 2017 +0200
committer: Akron <nils@diewald-online.de> Fri Apr 21 14:51:28 2017 +0200
tree: 5b8eec73f4dd84da355195cc145c711b036298f4
parent: c2a107d84e57584c31756f206ee2f4e306b2f16d [diff]
diff --git a/lib/Krawfish/Index/Dictionary.pm b/lib/Krawfish/Index/Dictionary.pm
index c0d0558..56ee86f 100644
--- a/lib/Krawfish/Index/Dictionary.pm
+++ b/lib/Krawfish/Index/Dictionary.pm

@@ -5,27 +5,40 @@
 use Krawfish::Index::PostingsList;
 
 # TODO:
+#   This should be the base class for K::I::Dictionary::Dynamic
+#   and K::II::Dictionary::Static
+
+# TODO:
 #   In production the dictionary will be implemented using
 #   two data structures:
 #   - A dynamic TST (either balancing or self-optimizing)
-#   - A static complete TST (compact, fast (de)serializale,
+#   - A static TST (compact, fast (de)serializale,
 #     cache-optimized, small)
-#   - The dynamic tree is used to add new terms.
-#     It potentially can also delete terms or mark terms (nodes)
-#     as being deleted.
+#   - The dynamic tree is used to add new terms
+#     in case this is needed for the dynamic segment
 #   - The dynamic and the static tree are searchable
-#     (though it's acceptable if the dynamic TST is slower)
+#     (though it's acceptable if the dynamic TST is slower
+#     or less optimized regarding memory consumption)
 #   - The dynamic and the static trees support reverse lookup
 #     (that is, retrieving the term by a term id)
-#   - The static tree does not support adding or
-#     deleting of nodes.
+#   - The static tree does not support adding.
+#   - Both trees support deleting of nodes:
+#     - The dynamic dictionary directly
+#     - The static dictionary by marking branches as not
+#       followable
 #   - The dynamic tree can be serialized to a static tree
-#   - The dynamic tree can merge (while being serialized)
+#   - The dynamic tree can be merged (while being serialized)
 #     with a static tree.
-#   - Whenever the dynamic tree contains a reasonable
-#     amount of terms, it can merge with a second static
-#     dictionary in memory, write to disc,
-#     and exchange the old dictionary with the new one.
+#   - When both trees are merged, removed terms are ignored.
+#   - The dynamic dictionary will be merged with the static dictionary
+#     whenever a dynamic segment is merged with a static segment
+#     and comes with a non-empty dynamic dictionary
+
+# TODO:
+#   - The dictionary is cleaned up only once in a while.
+#     Then all terms with no postingslists in any segment
+#     Will be marked as removed, so they will be ignored when the
+#     static dictionary is merged with the dynamic dictionary.
 
 # TODO:
 #   - While field ranks are done using rank files per segment,
@@ -49,6 +62,12 @@
 #   rank value needs to be incremented.
 #   However, keep in mind: That only works for fields
 #   with the same collation mechanism as the dictionary.
+#   Maybe it's better to have redundant rank-lists
+#   (including surface forms) per field,
+#   that are only used for ranking and reranking
+#   (but never for live-searching, so they can be stored
+#   compressed on disk and can be decompressed for reranking)
+
 #
 # TODO:
 #   For surface terms in subtoken-boundaries (ONLY),

diff --git a/lib/Krawfish/Index/Dynamic.pm b/lib/Krawfish/Index/Dynamic.pm
new file mode 100644
index 0000000..ccfc2de
--- /dev/null
+++ b/lib/Krawfish/Index/Dynamic.pm

@@ -0,0 +1,23 @@
+package Krawfish::Index::Dynamic;
+use parent 'Krawfish::Index';
+
+# The dynamic index segment can easily
+# add new documents.
+# It will use the global dictionary and for
+# new terms it adds them to a secondary
+# dictionary.
+
+
+# TODO:
+#   This needs to have a different ranking
+#   strategy.
+
+sub add;
+
+sub dynamic_dict;
+
+sub is_dynamic { 1 };
+
+sub is_static  { 0 };
+
+1;

diff --git a/lib/Krawfish/Index/FieldsRank.pm b/lib/Krawfish/Index/FieldsRank.pm
index 52b899d..358d362 100644
--- a/lib/Krawfish/Index/FieldsRank.pm
+++ b/lib/Krawfish/Index/FieldsRank.pm

@@ -1,4 +1,5 @@
 package Krawfish::Index::FieldsRank;
+use parent 'Krawfish::Index::Rank';
 use strict;
 use warnings;
 
@@ -8,76 +9,13 @@
 # a certain document ID.
 #
 # This is defined per Segment.
-#
+
 # TODO:
-#   There are four possible rank value types:
-# - value is even: The rank
-# - value is odd: The prerank, means,
-#   it is sorted based on the rank and takes the place
-#   between the two rank values, but it may occurr
-#   multiple types for different values.
-# - value is 0: Not yet ranked
-# - value is MAX_DOCS: Not available for this document
+#   There are two possible rank value types:
+# 1 VALUE IS >= 1 && <= MAX_RANK:
+#   The rank in the segment
+# 2 VALUE IS 0: Not available for this document
 #   (e.g. "author" is not defined for this document)
-#   TODO:
-#     MAX_RANK+1 does not work very well, because it may change on every
-#     RERANK ... although - it may be updated as well on every rerank,
-#     so this may not be a problem.
-#
-# TODO:
-#   Ranking is simple:
-#     1. rank all values by sorting them - give them rank numbers.
-#   If new values are added:
-#     1. Sort all new values separately
-#        Start at position i = MAX_RANK/2
-#        while there are values in the list:
-#        2. Look up, if the smallest value is already present
-#           LOOKUP
-#           yes -> ignore, keep position as i
-#                  but for rank-integration, keep the rank in
-#                  a special data strukture like ranked_cache
-#           no -> go to 3
-#        # HOW TO SEARCH HERE!?
-#        3. Get the preceeding and following rank values and
-#           define the middle value for the value as a prerank
-#           and set position value to 0.
-#        4. For all next values to prerank, look, if they have the same
-#           surrounding ranks (just check, if the greater ranked value is also greater
-#           as the current value).
-#           yes ->
-#             Check, if the value is identical
-#             yes -> add known prerank and same position value, go to 2
-#             no -> add known prerank and an incremented position value, go to 2
-#           no -> go to 2
-#   For merging, just go linearly through all
-#   Ranks in a merge sort way. Whenever there is a value that needs to be integrated
-#   from the prerank list, increment all values.
-#   HOW:
-#     The ranked list will be iterated in document order
-#     The precached_rank needs to have a special data structure with an API:
-#     ->update_rank(rank), that will take a rank of the ranked list
-#     and returns the rank incremented by the number of preranks before this rank.
-#     in case, no rank was inserted before, this will be the same.
-#     Then, all new documents with preranked or ranked but not integrated yet
-#     values will be appended to the rank stream.
-#
-# TODO:
-#   Use something like that:
-#   http://pempek.net/articles/2013/08/03/bit-packing-with-packedarray/
-#   https://github.com/gpakosz/PackedArray
-#
-# TODO:
-#   $max_rank is important, because it indicates
-#   how many bits per doc are necessary to encode
-#   the rank!
-#
-# TODO:
-#   In case, a field is only set for a couple of documents, a different
-#   strategy may be valid.
-#
-# TODO:
-#   Rank 0 may be used to indicate a field that is not ranked yet.
-#
 
 use constant {
   NOT_RANKED_YET => 0

diff --git a/lib/Krawfish/Index/Merge.pm b/lib/Krawfish/Index/Merge.pm
index 3e7acaa..59f0903 100644
--- a/lib/Krawfish/Index/Merge.pm
+++ b/lib/Krawfish/Index/Merge.pm

@@ -2,6 +2,14 @@
 use strict;
 use warnings;
 
+# There are two types of segments:
+# a) multiple static segments
+# b) One dynamic segment
+#
+# All new documents are added to the dynamic index,
+# But searches are done 
+
+
 sub new {
   my ($class, $index_a, $index_b) = @_;
   bless {
@@ -20,6 +28,8 @@
   #   - This is also necessary for reranking
   $self->_merge_fields;
 
+  # - concatenate docid->uuid field mappers
+  $self->_merge_identifier_lists;
 
   # - concatenate all subtoken lists
   $self->_merge_subtoken_lists;
@@ -33,6 +43,32 @@
 
   # - Concatenate and update primary files / forward index
   $self->_merge_primary_data;
+
+  # In case the second index is dynamic, also
+  # Merge the dictionaries
+  if ($index_b->is_dynamic) {
+    $index_a->dict->merge($index_b->dynamic_dict);
+  };
+
+  # Launch the newly created index
+  $self->_launch;
+};
+
+sub _launch {
+  # TODO:
+  #   - If the dictionary is new
+  #       - lock the whole index
+  #       - Switch to the new dictionary
+  #       - remove the old dictionary
+  #       - remove segment A
+  #       - remove segment B
+  #       - activate the new segment
+  #     else
+  #       - lock segment A
+  #       - lock segment B
+  #       - activate the new segment
+  #       - remove segment A
+  #       - remove segment B
 };
 
 sub _merge_postings_lists {
@@ -41,7 +77,10 @@
   #   - Add SkipLists to postings lists
   #   - Update position information in dictionary
   #     (or rather in the pointer file per segment)
-  #
+  #   - Decrement all document ids for deleted documents
+  #   - Add the maximum doc_id of the first segment to
+  #     all documents of the second index
+  #   - Calculate new freq value
 };
 
 sub _merge_fields {

diff --git a/lib/Krawfish/Index/Rank.pm b/lib/Krawfish/Index/Rank.pm
new file mode 100644
index 0000000..88a377a
--- /dev/null
+++ b/lib/Krawfish/Index/Rank.pm

@@ -0,0 +1,69 @@
+package Krawfish::Index::Rank;
+use strict;
+use warnings;
+
+# This should be the base class for FieldsRank
+# and TermRank
+
+
+# TODO:
+#   Maybe each segment should have a rank-file that does not
+#   store the ranks per doc, but all fields
+#   in sorted order with their ranks.
+#   This file will only be consulted for reranking,
+#   so it may be compressed on disk and potentially
+#   stored with incremental encoding/front coding
+#   The same should be true for TermRank.
+
+# TODO:
+#   Ranking is simple:
+#     1. rank all values by sorting them - give them rank numbers.
+#   If new values are added:
+#     1. Sort all new values separately
+#        Start at position i = MAX_RANK/2
+#        while there are values in the list:
+#        2. Look up, if the smallest value is already present
+#           LOOKUP
+#           yes -> ignore, keep position as i
+#                  but for rank-integration, keep the rank in
+#                  a special data strukture like ranked_cache
+#           no -> go to 3
+#        # HOW TO SEARCH HERE!?
+#        3. Get the preceeding and following rank values and
+#           define the middle value for the value as a prerank
+#           and set position value to 0.
+#        4. For all next values to prerank, look, if they have the same
+#           surrounding ranks (just check, if the greater ranked value is also greater
+#           as the current value).
+#           yes ->
+#             Check, if the value is identical
+#             yes -> add known prerank and same position value, go to 2
+#             no -> add known prerank and an incremented position value, go to 2
+#           no -> go to 2
+#   For merging, just go linearly through all
+#   Ranks in a merge sort way. Whenever there is a value that needs to be integrated
+#   from the prerank list, increment all values.
+#   HOW:
+#     The ranked list will be iterated in document order
+#     The precached_rank needs to have a special data structure with an API:
+#     ->update_rank(rank), that will take a rank of the ranked list
+#     and returns the rank incremented by the number of preranks before this rank.
+#     in case, no rank was inserted before, this will be the same.
+#     Then, all new documents with preranked or ranked but not integrated yet
+#     values will be appended to the rank stream.
+#
+# TODO:
+#   Use something like that:
+#   http://pempek.net/articles/2013/08/03/bit-packing-with-packedarray/
+#   https://github.com/gpakosz/PackedArray
+#
+# TODO:
+#   $max_rank is important, because it indicates
+#   how many bits per doc are necessary to encode
+#   the rank!
+#
+# TODO:
+#   In case, a field is only set for a couple of documents, a different
+#   strategy may be valid.
+
+1;

diff --git a/lib/Krawfish/Index/Static.pm b/lib/Krawfish/Index/Static.pm
new file mode 100644
index 0000000..80aab26
--- /dev/null
+++ b/lib/Krawfish/Index/Static.pm

@@ -0,0 +1,20 @@
+package Krawfish::Index::Static;
+use parent 'Krawfish::Index';
+
+# Multiple static index segments can
+# only delete documents (by adding entries
+# to the deleted documents posting list),
+# but not add new documents.
+# But static documents can be merged
+# (with static segments and with the dynamic
+# segment).
+
+sub new;
+
+# This will use Krawfish::Index::Merge
+sub merge;
+
+sub is_dynamic { 0 };
+sub is_static  { 1 };
+
+1;

diff --git a/lib/Krawfish/Index/TermRank.pm b/lib/Krawfish/Index/TermRank.pm
index f6e4914..234f4e8 100644
--- a/lib/Krawfish/Index/TermRank.pm
+++ b/lib/Krawfish/Index/TermRank.pm

@@ -1,9 +1,13 @@
 package Krawfish::Index::TermRank;
+use parent 'Krawfish::Index::Rank';
 use strict;
 use warnings;
 
-# While FieldsRank is defined per Segment, TermRank is defined per
-# Dictionary.
+# While FieldsRank is defined per Segment,
+# TermRank is defined per Dictionary.
+# That means per node there are two Term-Ranks
+# per direction (prefix and suffix):
+# One static and one dynamic.
 #
 # TODO:
 #   may be renamed to SubTermRank
@@ -11,4 +15,14 @@
 # TODO:
 #   should have a similar API as FieldsRank!
 
+# TODO:
+#   There are two possible rank value types:
+# 1 VALUE IS EVEN: The global rank from
+#   the static dictionary
+# 2 VALUE IS ODD: The prerank, means,
+#   it is sorted based on the rank and takes the place
+#   between the two rank values, but it may occurr
+#   multiple types for different values.
+#   This comes from the dynamic dictionary.
+
 1;
commit	df4b40598ada4a049443eb83ed50348e48d6a696	[log] [tgz]
author	Akron <nils@diewald-online.de>	Fri Apr 21 14:51:28 2017 +0200
committer	Akron <nils@diewald-online.de>	Fri Apr 21 14:51:28 2017 +0200
tree	5b8eec73f4dd84da355195cc145c711b036298f4
parent	c2a107d84e57584c31756f206ee2f4e306b2f16d [diff]