Removed Subtokens class (unnecessary due to Forward::*)

commit: 4037dd492504c5c33cbfb977794861755619fe70 [log] [tgz]
author: Akron <nils@diewald-online.de> Thu Aug 17 12:05:37 2017 +0200
committer: Akron <nils@diewald-online.de> Thu Aug 17 12:05:37 2017 +0200
tree: 131c058133fe5c5f6cf20068ecfacebdbc07334d
parent: 4a46e6eb698e5fd7f670b4434b3d5b6bc5bc945a [diff]
diff --git a/lib/Krawfish/Index/Merge.pm b/lib/Krawfish/Index/Merge.pm
index 59f0903..4ef971b 100644
--- a/lib/Krawfish/Index/Merge.pm
+++ b/lib/Krawfish/Index/Merge.pm

@@ -2,6 +2,12 @@
 use strict;
 use warnings;
 
+# TODO:
+#   This is not an actual implementation,
+#   just a guide to keep notes what is necessary
+#   to merge two segments and two dictionaries.
+
+
 # There are two types of segments:
 # a) multiple static segments
 # b) One dynamic segment
@@ -28,11 +34,15 @@
   #   - This is also necessary for reranking
   $self->_merge_fields;
 
+  $self->_merge_forward;
+
+  $self->_merge_live;
+
   # - concatenate docid->uuid field mappers
-  $self->_merge_identifier_lists;
+  # $self->_merge_identifier_lists;
 
   # - concatenate all subtoken lists
-  $self->_merge_subtoken_lists;
+  # $self->_merge_subtoken_lists;
 
   # - Rerank all field ranks
   #   (ignoring deleted documents)
@@ -42,7 +52,7 @@
   $self->_rerank_fields;
 
   # - Concatenate and update primary files / forward index
-  $self->_merge_primary_data;
+  # $self->_merge_primary_data;
 
   # In case the second index is dynamic, also
   # Merge the dictionaries

diff --git a/lib/Krawfish/Index/Segment.pm b/lib/Krawfish/Index/Segment.pm
index 4bd4ee1..e23dcaf 100644
--- a/lib/Krawfish/Index/Segment.pm
+++ b/lib/Krawfish/Index/Segment.pm

@@ -1,5 +1,4 @@
 package Krawfish::Index::Segment;
-use Krawfish::Index::Subtokens;
 use Krawfish::Index::Fields;
 use Krawfish::Index::PostingsLive;
 use Krawfish::Index::PostingsList;
@@ -30,6 +29,8 @@
 #           accessible by their sub_term_id position in the list:
 #  ([leaf-backref][prefix-rank][suffix-rank])*
 
+# TODO:
+#   Store token information per foundry/layer tokens
 
 use constant DEBUG => 0;
 
@@ -42,11 +43,6 @@
 
   print_log('seg', 'Instantiate new segment') if DEBUG;
 
-  # Load subtokens list
-  $self->{subtokens} = Krawfish::Index::Subtokens->new(
-    $self->{file}
-  );
-
   # Load fields
   $self->{fields} = Krawfish::Index::Fields->new(
     $self->{file}

diff --git a/lib/Krawfish/Index/Subtokens.pm b/lib/Krawfish/Index/Subtokens.pm
deleted file mode 100644
index 2424b10..0000000
--- a/lib/Krawfish/Index/Subtokens.pm
+++ /dev/null

@@ -1,193 +0,0 @@
-package Krawfish::Index::Subtokens;
-use Krawfish::Log;
-use strict;
-use warnings;
-
-# See Krawfish::Index::Tokens
-
-# There is only one subtoken list for the documents
-
-# TODO:
-#   With a forward index, the subtokens offsets will no longer
-#   point to character positions in the primary text but to
-#   subtoken positions in the forward index!
-
-# The Subtokens list (not different for different tokenizations)
-# has the following job:
-#
-# * Return forward index offsets for a certain subtoken
-#   (for the current forward index implementation, only the
-#    start offset is necessary)
-#   API: ->get_offset($doc_id, $pos)
-#
-# * Get the term_id at a certain position
-#   API: ->get_term_id($doc_id, $pos)
-#
-#   IDEA: This will get the offset and look into the forward index
-#         for the $term_id. Or it will directly store the $term_id
-#         in the stream.
-#
-# * Get the surface form from the dictionary as fast as possible
-#   This will first find the $term_ids in the forward index and resolve them
-#   (potentially).
-#   API: ->get_surface($doc_id, $pos)
-#        ->get_surface($doc_id, $pos, $length)
-#
-# * Get the prefix and suffix rank of the surface form for fast
-#   sorting. All terms should be preranked in prefix and suffix order
-#   for the standard collation.
-#   API: ->get_prefix_rank($doc_id, $pos)
-#        ->get_suffix_rank($doc_id, $pos)
-#
-#   IDEA:  It would be nice to have the ranks being stored in the dictionary
-#          to avoid redundancy. So this would be implemented as first
-#          looking up the $term_id in the forward_index, then retrieving
-#          the rank based on the $term_id in the dictionary.
-#          As these sorting parts of the dictionary are not necessary all the time,
-#          they may not need to reside in memory all the time.
-#
-#   REQUIREMENT: For on-the-fly sorting it may be beneficial to have a fast
-#                incremental ->next() and ->skip_to() like method.
-#                For finding the offset for a match, a get() should
-#                be allowed to be slow.
-#                A good argument for fast on-the-fly-sorting is also
-#                grouping, which needs fast access to the term_ids, so
-#                they may need to be stored redundantly
-#
-#   DATASTRUCTURE: [doc_id]([delta_varint_offset][term_id_varint])*
-#                  Augmented with a SkipList.
-#
-# TODO:
-#   This may be implemented using a postings list, but inside positions,
-#   it should be possible to move backwards as well.
-#   The tokens structure may be augmented with a skiplist
-#   and be a highly optimized position encoding.
-#
-#   It should also contain information about the first two characters
-#   of a term and possibly the last two characters, necessary to bucket sort terms.
-#   The characters are stored as UTF-8 or similar -
-#   it may be beneficial to have the most common characters need the least
-#   bits.
-#   Note that this information needs to store characters and not
-#   bytes, as bytes may not be helpful for sorting!
-#   This may as well use a prefix_rank and a suffix_rank for bucket-sorting.
-#
-#   In addition, the surface term_id needs to be accessible fastly!
-#   TODO: Term-IDs may be better stored in a separate file, to keep the file small.
-
-
-use constant DEBUG => 0;
-
-
-# Constructor
-sub new {
-  my $class = shift;
-  bless {
-    file => shift,
-
-    # Define, how many start characters will be stored
-    # This is useful for alphabetic sorting
-    # start_char_length => shift // 2,
-
-    # Define, how many start characters will be stored
-    # This is useful for alphabetic sorting
-    # end_char_length => shift // 2,
-
-    array => [],
-    pos => -1,
-  }, $class;
-};
-
-
-# TODO: Better store length ...
-# Store offsets
-sub store {
-  my $self = shift;
-
-  # Get data to store per segment
-  my ($doc_id, $subtoken, $start_char, $end_char, $subterm_id, $subterm) = @_;
-
-  # TODO:
-  #   THIS IS PROBABLY NOT NECESSARY!
-  if ($subterm) {
-    # Get the first and last characters of the term
-    my ($first, $last) = (substr($subterm, 0, 2), scalar reverse substr($subterm, -2));
-
-    # Store all segments
-    $self->{$doc_id . '#' . $subtoken} = [$start_char, $end_char, $subterm_id, $first, $last];
-
-    if (DEBUG) {
-      print_log('stokens', "Store subtoken at [$doc_id,$subtoken]");
-      print_log('stokens', '  with ' . join(','),@{$self->{$doc_id . '#' . $subtoken}});
-    };
-  }
-
-  # Temporary
-  else {
-
-    # Store all segments
-    $self->{$doc_id . '#' . $subtoken} = [$start_char, $end_char];
-  }
-
-  return $self;
-};
-
-
-# Get offsets
-# TODO: Support caching!
-sub get {
-  my $self = shift;
-  my ($doc_id, $segment) = @_;
-  return $self->{$doc_id . '#' . $segment};
-};
-
-
-sub append {
-  my $self = shift;
-  my ($token, $doc_id, $pos, $end) = @_;
-  if (DEBUG) {
-    print_log('toklist', "Appended $token with $doc_id, $pos" . ($end ? "-$end" : ''));
-  };
-  push(@{$self->{array}}, [$doc_id, $pos, $end]);
-};
-
-
-sub next;
-
-
-sub pos {
-  return $_[0]->{pos};
-};
-
-
-sub token {
-  return $_[0]->{array}->[$_[0]->pos];
-};
-
-
-sub freq;
-
-
-sub skip_to_doc;
-
-
-sub skip_to_pos;
-
-
-# This may require random access for sorting based on term ids
-sub get_term_id {
-  my $self = shift;
-  my ($doc_id, $pos) = @_;
-  ...
-};
-
-
-# This may require random access for sorting based on term ids
-sub get_term_ids {
-  my $self = shift;
-  my ($doc_id, $start, $end) = @_;
-  ...
-};
-
-
-1;

diff --git a/lib/Krawfish/Index/Tokens.pm b/lib/Krawfish/Index/Tokens.pm
index 85cf22d..c67193d 100644
--- a/lib/Krawfish/Index/Tokens.pm
+++ b/lib/Krawfish/Index/Tokens.pm

@@ -3,11 +3,9 @@
 use strict;
 use warnings;
 
-# See Krawfish::Index::Subtokens
-
 # There is one token list per tokenization
 
-# The Tokens list has the following jobs:
+# The Token list has the following jobs:
 #
 # * Check if the number of tokens between two subtokens is
 #   in a certain range
@@ -21,13 +19,22 @@
 #
 # * Get the number of tokens per doc_id
 #   API: ->count($doc_id)
-#        or ->freq_doc($doc_id)
+#        or ->freq_in_doc($doc_id)
 #   (Necessary for Result::Aggregate::TokenFreq)
 #
 # * Get the maximum number of subtokens a token
 #   of this foundry can have (necessary for Constraint::InBetween)
 #   ->max_subtokens;
 
+# TODO:
+#   A possible structure would be (without including skip data)
+#   ([doc-id]([pos-delta][length-varint])*)
+#   where only start positions are stored for lengths > 1.
+#   Skip data should be included only for every ~128 entries
+#   and each skipblock should have a previous pointer
+#   to make skipping back possible.
+
+
 # Get an array of start positions that are in the range of min/max
 # Start with the lowest
 sub extend_to_left {
@@ -57,7 +64,7 @@
   ...
 };
 
-sub freq_doc;
+sub freq_in_doc;
 
 sub max_subtokens;
commit	4037dd492504c5c33cbfb977794861755619fe70	[log] [tgz]
author	Akron <nils@diewald-online.de>	Thu Aug 17 12:05:37 2017 +0200
committer	Akron <nils@diewald-online.de>	Thu Aug 17 12:05:37 2017 +0200
tree	131c058133fe5c5f6cf20068ecfacebdbc07334d
parent	4a46e6eb698e5fd7f670b4434b3d5b6bc5bc945a [diff]