Improve merge policy description and separate dynamic and static segment structures
diff --git a/lib/Krawfish/Index/Dictionary.pm b/lib/Krawfish/Index/Dictionary.pm
index c0d0558..56ee86f 100644
--- a/lib/Krawfish/Index/Dictionary.pm
+++ b/lib/Krawfish/Index/Dictionary.pm
@@ -5,27 +5,40 @@
use Krawfish::Index::PostingsList;
# TODO:
+# This should be the base class for K::I::Dictionary::Dynamic
+# and K::II::Dictionary::Static
+
+# TODO:
# In production the dictionary will be implemented using
# two data structures:
# - A dynamic TST (either balancing or self-optimizing)
-# - A static complete TST (compact, fast (de)serializale,
+# - A static TST (compact, fast (de)serializale,
# cache-optimized, small)
-# - The dynamic tree is used to add new terms.
-# It potentially can also delete terms or mark terms (nodes)
-# as being deleted.
+# - The dynamic tree is used to add new terms
+# in case this is needed for the dynamic segment
# - The dynamic and the static tree are searchable
-# (though it's acceptable if the dynamic TST is slower)
+# (though it's acceptable if the dynamic TST is slower
+# or less optimized regarding memory consumption)
# - The dynamic and the static trees support reverse lookup
# (that is, retrieving the term by a term id)
-# - The static tree does not support adding or
-# deleting of nodes.
+# - The static tree does not support adding.
+# - Both trees support deleting of nodes:
+# - The dynamic dictionary directly
+# - The static dictionary by marking branches as not
+# followable
# - The dynamic tree can be serialized to a static tree
-# - The dynamic tree can merge (while being serialized)
+# - The dynamic tree can be merged (while being serialized)
# with a static tree.
-# - Whenever the dynamic tree contains a reasonable
-# amount of terms, it can merge with a second static
-# dictionary in memory, write to disc,
-# and exchange the old dictionary with the new one.
+# - When both trees are merged, removed terms are ignored.
+# - The dynamic dictionary will be merged with the static dictionary
+# whenever a dynamic segment is merged with a static segment
+# and comes with a non-empty dynamic dictionary
+
+# TODO:
+# - The dictionary is cleaned up only once in a while.
+# Then all terms with no postingslists in any segment
+# Will be marked as removed, so they will be ignored when the
+# static dictionary is merged with the dynamic dictionary.
# TODO:
# - While field ranks are done using rank files per segment,
@@ -49,6 +62,12 @@
# rank value needs to be incremented.
# However, keep in mind: That only works for fields
# with the same collation mechanism as the dictionary.
+# Maybe it's better to have redundant rank-lists
+# (including surface forms) per field,
+# that are only used for ranking and reranking
+# (but never for live-searching, so they can be stored
+# compressed on disk and can be decompressed for reranking)
+
#
# TODO:
# For surface terms in subtoken-boundaries (ONLY),
diff --git a/lib/Krawfish/Index/Dynamic.pm b/lib/Krawfish/Index/Dynamic.pm
new file mode 100644
index 0000000..ccfc2de
--- /dev/null
+++ b/lib/Krawfish/Index/Dynamic.pm
@@ -0,0 +1,23 @@
+package Krawfish::Index::Dynamic;
+use parent 'Krawfish::Index';
+
+# The dynamic index segment can easily
+# add new documents.
+# It will use the global dictionary and for
+# new terms it adds them to a secondary
+# dictionary.
+
+
+# TODO:
+# This needs to have a different ranking
+# strategy.
+
+sub add;
+
+sub dynamic_dict;
+
+sub is_dynamic { 1 };
+
+sub is_static { 0 };
+
+1;
diff --git a/lib/Krawfish/Index/FieldsRank.pm b/lib/Krawfish/Index/FieldsRank.pm
index 52b899d..358d362 100644
--- a/lib/Krawfish/Index/FieldsRank.pm
+++ b/lib/Krawfish/Index/FieldsRank.pm
@@ -1,4 +1,5 @@
package Krawfish::Index::FieldsRank;
+use parent 'Krawfish::Index::Rank';
use strict;
use warnings;
@@ -8,76 +9,13 @@
# a certain document ID.
#
# This is defined per Segment.
-#
+
# TODO:
-# There are four possible rank value types:
-# - value is even: The rank
-# - value is odd: The prerank, means,
-# it is sorted based on the rank and takes the place
-# between the two rank values, but it may occurr
-# multiple types for different values.
-# - value is 0: Not yet ranked
-# - value is MAX_DOCS: Not available for this document
+# There are two possible rank value types:
+# 1 VALUE IS >= 1 && <= MAX_RANK:
+# The rank in the segment
+# 2 VALUE IS 0: Not available for this document
# (e.g. "author" is not defined for this document)
-# TODO:
-# MAX_RANK+1 does not work very well, because it may change on every
-# RERANK ... although - it may be updated as well on every rerank,
-# so this may not be a problem.
-#
-# TODO:
-# Ranking is simple:
-# 1. rank all values by sorting them - give them rank numbers.
-# If new values are added:
-# 1. Sort all new values separately
-# Start at position i = MAX_RANK/2
-# while there are values in the list:
-# 2. Look up, if the smallest value is already present
-# LOOKUP
-# yes -> ignore, keep position as i
-# but for rank-integration, keep the rank in
-# a special data strukture like ranked_cache
-# no -> go to 3
-# # HOW TO SEARCH HERE!?
-# 3. Get the preceeding and following rank values and
-# define the middle value for the value as a prerank
-# and set position value to 0.
-# 4. For all next values to prerank, look, if they have the same
-# surrounding ranks (just check, if the greater ranked value is also greater
-# as the current value).
-# yes ->
-# Check, if the value is identical
-# yes -> add known prerank and same position value, go to 2
-# no -> add known prerank and an incremented position value, go to 2
-# no -> go to 2
-# For merging, just go linearly through all
-# Ranks in a merge sort way. Whenever there is a value that needs to be integrated
-# from the prerank list, increment all values.
-# HOW:
-# The ranked list will be iterated in document order
-# The precached_rank needs to have a special data structure with an API:
-# ->update_rank(rank), that will take a rank of the ranked list
-# and returns the rank incremented by the number of preranks before this rank.
-# in case, no rank was inserted before, this will be the same.
-# Then, all new documents with preranked or ranked but not integrated yet
-# values will be appended to the rank stream.
-#
-# TODO:
-# Use something like that:
-# http://pempek.net/articles/2013/08/03/bit-packing-with-packedarray/
-# https://github.com/gpakosz/PackedArray
-#
-# TODO:
-# $max_rank is important, because it indicates
-# how many bits per doc are necessary to encode
-# the rank!
-#
-# TODO:
-# In case, a field is only set for a couple of documents, a different
-# strategy may be valid.
-#
-# TODO:
-# Rank 0 may be used to indicate a field that is not ranked yet.
-#
use constant {
NOT_RANKED_YET => 0
diff --git a/lib/Krawfish/Index/Merge.pm b/lib/Krawfish/Index/Merge.pm
index 3e7acaa..59f0903 100644
--- a/lib/Krawfish/Index/Merge.pm
+++ b/lib/Krawfish/Index/Merge.pm
@@ -2,6 +2,14 @@
use strict;
use warnings;
+# There are two types of segments:
+# a) multiple static segments
+# b) One dynamic segment
+#
+# All new documents are added to the dynamic index,
+# But searches are done
+
+
sub new {
my ($class, $index_a, $index_b) = @_;
bless {
@@ -20,6 +28,8 @@
# - This is also necessary for reranking
$self->_merge_fields;
+ # - concatenate docid->uuid field mappers
+ $self->_merge_identifier_lists;
# - concatenate all subtoken lists
$self->_merge_subtoken_lists;
@@ -33,6 +43,32 @@
# - Concatenate and update primary files / forward index
$self->_merge_primary_data;
+
+ # In case the second index is dynamic, also
+ # Merge the dictionaries
+ if ($index_b->is_dynamic) {
+ $index_a->dict->merge($index_b->dynamic_dict);
+ };
+
+ # Launch the newly created index
+ $self->_launch;
+};
+
+sub _launch {
+ # TODO:
+ # - If the dictionary is new
+ # - lock the whole index
+ # - Switch to the new dictionary
+ # - remove the old dictionary
+ # - remove segment A
+ # - remove segment B
+ # - activate the new segment
+ # else
+ # - lock segment A
+ # - lock segment B
+ # - activate the new segment
+ # - remove segment A
+ # - remove segment B
};
sub _merge_postings_lists {
@@ -41,7 +77,10 @@
# - Add SkipLists to postings lists
# - Update position information in dictionary
# (or rather in the pointer file per segment)
- #
+ # - Decrement all document ids for deleted documents
+ # - Add the maximum doc_id of the first segment to
+ # all documents of the second index
+ # - Calculate new freq value
};
sub _merge_fields {
diff --git a/lib/Krawfish/Index/Rank.pm b/lib/Krawfish/Index/Rank.pm
new file mode 100644
index 0000000..88a377a
--- /dev/null
+++ b/lib/Krawfish/Index/Rank.pm
@@ -0,0 +1,69 @@
+package Krawfish::Index::Rank;
+use strict;
+use warnings;
+
+# This should be the base class for FieldsRank
+# and TermRank
+
+
+# TODO:
+# Maybe each segment should have a rank-file that does not
+# store the ranks per doc, but all fields
+# in sorted order with their ranks.
+# This file will only be consulted for reranking,
+# so it may be compressed on disk and potentially
+# stored with incremental encoding/front coding
+# The same should be true for TermRank.
+
+# TODO:
+# Ranking is simple:
+# 1. rank all values by sorting them - give them rank numbers.
+# If new values are added:
+# 1. Sort all new values separately
+# Start at position i = MAX_RANK/2
+# while there are values in the list:
+# 2. Look up, if the smallest value is already present
+# LOOKUP
+# yes -> ignore, keep position as i
+# but for rank-integration, keep the rank in
+# a special data strukture like ranked_cache
+# no -> go to 3
+# # HOW TO SEARCH HERE!?
+# 3. Get the preceeding and following rank values and
+# define the middle value for the value as a prerank
+# and set position value to 0.
+# 4. For all next values to prerank, look, if they have the same
+# surrounding ranks (just check, if the greater ranked value is also greater
+# as the current value).
+# yes ->
+# Check, if the value is identical
+# yes -> add known prerank and same position value, go to 2
+# no -> add known prerank and an incremented position value, go to 2
+# no -> go to 2
+# For merging, just go linearly through all
+# Ranks in a merge sort way. Whenever there is a value that needs to be integrated
+# from the prerank list, increment all values.
+# HOW:
+# The ranked list will be iterated in document order
+# The precached_rank needs to have a special data structure with an API:
+# ->update_rank(rank), that will take a rank of the ranked list
+# and returns the rank incremented by the number of preranks before this rank.
+# in case, no rank was inserted before, this will be the same.
+# Then, all new documents with preranked or ranked but not integrated yet
+# values will be appended to the rank stream.
+#
+# TODO:
+# Use something like that:
+# http://pempek.net/articles/2013/08/03/bit-packing-with-packedarray/
+# https://github.com/gpakosz/PackedArray
+#
+# TODO:
+# $max_rank is important, because it indicates
+# how many bits per doc are necessary to encode
+# the rank!
+#
+# TODO:
+# In case, a field is only set for a couple of documents, a different
+# strategy may be valid.
+
+1;
diff --git a/lib/Krawfish/Index/Static.pm b/lib/Krawfish/Index/Static.pm
new file mode 100644
index 0000000..80aab26
--- /dev/null
+++ b/lib/Krawfish/Index/Static.pm
@@ -0,0 +1,20 @@
+package Krawfish::Index::Static;
+use parent 'Krawfish::Index';
+
+# Multiple static index segments can
+# only delete documents (by adding entries
+# to the deleted documents posting list),
+# but not add new documents.
+# But static documents can be merged
+# (with static segments and with the dynamic
+# segment).
+
+sub new;
+
+# This will use Krawfish::Index::Merge
+sub merge;
+
+sub is_dynamic { 0 };
+sub is_static { 1 };
+
+1;
diff --git a/lib/Krawfish/Index/TermRank.pm b/lib/Krawfish/Index/TermRank.pm
index f6e4914..234f4e8 100644
--- a/lib/Krawfish/Index/TermRank.pm
+++ b/lib/Krawfish/Index/TermRank.pm
@@ -1,9 +1,13 @@
package Krawfish::Index::TermRank;
+use parent 'Krawfish::Index::Rank';
use strict;
use warnings;
-# While FieldsRank is defined per Segment, TermRank is defined per
-# Dictionary.
+# While FieldsRank is defined per Segment,
+# TermRank is defined per Dictionary.
+# That means per node there are two Term-Ranks
+# per direction (prefix and suffix):
+# One static and one dynamic.
#
# TODO:
# may be renamed to SubTermRank
@@ -11,4 +15,14 @@
# TODO:
# should have a similar API as FieldsRank!
+# TODO:
+# There are two possible rank value types:
+# 1 VALUE IS EVEN: The global rank from
+# the static dictionary
+# 2 VALUE IS ODD: The prerank, means,
+# it is sorted based on the rank and takes the place
+# between the two rank values, but it may occurr
+# multiple types for different values.
+# This comes from the dynamic dictionary.
+
1;