Introduced a store and renamed segments to subtokens

commit: 6a749735268e1113253f2806bf9a3167e27ebc2f [log] [tgz]
author: Akron <nils@diewald-online.de> Tue Feb 14 14:43:06 2017 +0100
committer: Akron <nils@diewald-online.de> Tue Feb 14 14:43:06 2017 +0100
tree: d61a8a009606cd2d7122936f4b12be40473be44d
parent: d2f9e69eeca2c6a648722f69cf9b5edeaa3ff5c6 [diff]
diff --git a/lib/Krawfish/Index.pm b/lib/Krawfish/Index.pm
index 355ed11..de554e4 100644
--- a/lib/Krawfish/Index.pm
+++ b/lib/Krawfish/Index.pm

@@ -1,6 +1,6 @@
 package Krawfish::Index;
 use Krawfish::Index::Dictionary;
-use Krawfish::Index::Segments;
+use Krawfish::Index::Subtokens;
 use Krawfish::Index::PrimaryData;
 use Krawfish::Index::Fields;
 use Krawfish::Cache;
@@ -24,9 +24,9 @@
 # TODO: Maybe 65.535 documents are enough per segment ...
 
 # TODO: Build a forward index
-# TODO: With a forward index, the segments offsets will no longer
+# TODO: With a forward index, the subtoken offsets will no longer
 #   point to character positions in the primary text but to
-#   segment positions in the forward index!
+#   subtoken positions in the forward index!
 
 # TODO:
 #   Reranking a field is not necessary, if the field value is already given.
@@ -56,7 +56,7 @@
   );
 
   # Load offsets
-  $self->{segments} = Krawfish::Index::Segments->new(
+  $self->{subtokens} = Krawfish::Index::Subtokens->new(
     $self->{file}
   );
 
@@ -106,9 +106,9 @@
 };
 
 
-# Get segments
-sub segments {
-  $_[0]->{segments};
+# Get subtokens
+sub subtokens {
+  $_[0]->{subtokens};
 };
 
 
@@ -177,27 +177,27 @@
     $post_list->append($doc_id);
   };
 
-  my $segments = $self->segments;
+  my $subtokens = $self->subtokens;
 
-  # The primary text is necessary for the segments index as well as
+  # The primary text is necessary for the subtoken index as well as
   # for the forward index
   my $primary = $doc->{primaryData};
 
-  # Store segments
-  if ($doc->{segments}) {
+  # Store subtokens
+  if ($doc->{subtokens}) {
 
-    print_log('index', 'Store segments') if DEBUG;
+    print_log('index', 'Store subtokens') if DEBUG;
 
-    # Store all segment offsets
-    foreach my $seg (@{$doc->{segments}}) {
+    # Store all subtoken offsets
+    foreach my $seg (@{$doc->{subtokens}}) {
 
-      # Get start and end of the segment
+      # Get start and end of the subtoken
       my ($start, $end) = @{$seg->{offsets}};
 
       if (DEBUG) {
         print_log(
           'index',
-          'Store segment: ' . $doc_id . ':' . $pos . '=' . join('-', $start, $end)
+          'Store subtoken: ' . $doc_id . ':' . $pos . '=' . join('-', $start, $end)
         );
       };
 
@@ -207,14 +207,14 @@
 
       # TODO: There may be a prefix necessary for surface forms
       # TODO: This may in fact be not necessary at all -
-      #   The segments may have their own IDs
+      #   The subtokens may have their own IDs
       #   And the terms do not need to be stored in the dictionary for retrieval ...
       my $term_id = $dict->add('*' . $term)->term_id;
 
       print_log('index', 'Surface form has term_id ' . $term_id) if DEBUG;
 
-      # Store information to segment
-      $segments->store($doc_id, $pos++, $start, $end, $term_id, $term);
+      # Store information to subtoken
+      $subtokens->store($doc_id, $pos++, $start, $end, $term_id, $term);
     };
   };
 
@@ -248,15 +248,15 @@
       };
 
       # Append posting to postings list
-      my @segments = _segments($item);
+      my @subtokens = _subtokens($item);
 
-      # No segments defined
-      unless (scalar @segments) {
-        push @segments, $pos;
+      # No subtokens defined
+      unless (scalar @subtokens) {
+        push @subtokens, $pos;
 
         # Store offsets
         if ($item->{offsets}) {
-          $segments->store($doc_id, $pos, @{$item->{offsets}});
+          $subtokens->store($doc_id, $pos, @{$item->{offsets}});
         };
         $pos++;
       };
@@ -264,7 +264,7 @@
       # Add token terms
       foreach (@keys) {
         my $post_list = $dict->add($_);
-        $post_list->append($doc_id, @segments);
+        $post_list->append($doc_id, @subtokens);
       };
     }
 
@@ -279,9 +279,9 @@
       # Append posting to posting list
       $post_list->append(
         $doc_id,
-        $item->{segments}->[0],
-        # The end is AFTER the second segment
-        $item->{segments}->[-1] + 1
+        $item->{subtokens}->[0],
+        # The end is AFTER the last subtoken
+        $item->{subtokens}->[-1] + 1
       );
     };
   };
@@ -308,19 +308,19 @@
 }
 
 
-# Return segment list or nothing
-sub _segments {
+# Return subtoken list or nothing
+sub _subtokens {
   my $item = shift;
   my @posting;
 
-  if ($item->{segments}) {
+  if ($item->{subtokens}) {
 
     # Remove!
-    push @posting, $item->{segments}->[0];
+    push @posting, $item->{subtokens}->[0];
 
-    if ($item->{segments}->[1]) {
-      # The end is AFTER the second segment
-      push @posting, $item->{segments}->[1] + 1;
+    if ($item->{subtokens}->[1]) {
+      # The end is AFTER the second subtoken
+      push @posting, $item->{subtokens}->[1] + 1;
     };
 
     return @posting;

diff --git a/lib/Krawfish/Index/ForwardIndex.pm b/lib/Krawfish/Index/ForwardIndex.pm
index 047567c..b93275e 100644
--- a/lib/Krawfish/Index/ForwardIndex.pm
+++ b/lib/Krawfish/Index/ForwardIndex.pm

@@ -12,6 +12,13 @@
 #   merge. Then, convert the forward index based on this table without
 #   dictionary lookup.
 #
+# TODO:
+#   This is great for retrieving pagebreaks, annotations, primary data,
+#   perhaps help on regex ...
+#   But can this help to expand the context of a match to a certain element context?
+#   Probably by retrieving the data with a certain maximum offset (say left 100 subtokens, right 100 subtokens)
+#   and first check for the expanding element start on the left, then move to the right.
+#
 sub new {
   my $class = shift;
   bless {
@@ -32,6 +39,10 @@
   return substr($self->{forward}->[$doc_id], $offset, $end - $offset);
 };
 
+sub get_expanded {
+  ...
+};
+
 
 # Return a stream of elements (primary text and annotations)
 sub get_annotated {
@@ -40,7 +51,17 @@
   ...
 };
 
+# Return a stream of elements (primary text and annotations)
+# that is within a certain element
+sub get_annotated_expanded {
+  my $self = shift;
+  my ($doc_id, $offset, $length, $foundry, $max_exp, $layer, $element) = @_;
+  ...
+};
+
+
 # Return the surface string only
+# This should be as fast as possible, as it is used for aggregations
 sub get_surface {
   my ($self, $doc_id, $offset, $length) = @_;
   ...

diff --git a/lib/Krawfish/Index/Segments.pm b/lib/Krawfish/Index/Segments.pm
deleted file mode 100644
index ce0896a..0000000
--- a/lib/Krawfish/Index/Segments.pm
+++ /dev/null

@@ -1,83 +0,0 @@
-package Krawfish::Index::Segments;
-use Krawfish::Log;
-use strict;
-use warnings;
-
-# Store offsets for direct access using doc id and pos
-# - in addition store term ids and characters for presorting
-
-# TODO:
-#   This may be implemented using a postings list, but inside positions,
-#   it should be possible to move backwards as well.
-#   The segments structure may be augmented with a skiplist
-#   and be a highly optimized position encoding, because character offsets
-#   should normally have values between 0 and 16.
-#
-#   It should also contain information about the first two characters
-#   of a term and possibly the last two characters, necessary to bucket sort terms.
-#   The characters are stored as UTF-8 or similar -
-#   it may be beneficial to have the most common characters need the least
-#   bits.
-#   Note that this information needs to store characters and not
-#   bytes, as bytes may not be helpful for sorting!
-#
-#   In addition, the term_id needs to be stored!
-
-# TODO: Term-IDs may be better stored in a separate file, to keep the file small.
-
-use constant DEBUG => 0;
-
-# Constructor
-sub new {
-  my $class = shift;
-  bless {
-    file => shift,
-
-    # Define, how many start characters will be stored
-    start_char_length => shift // 2,
-
-    # Define, how many start characters will be stored
-    end_char_length => shift // 2
-  }, $class;
-};
-
-# TODO: Better store length ...
-# Store offsets
-sub store {
-  my $self = shift;
-
-  # Get data to store per segment
-  my ($doc_id, $segment, $start_char, $end_char, $term_id, $term) = @_;
-
-  if ($term) {
-    # Get the first and last characters of the term
-    my ($first, $last) = (substr($term, 0, 2), scalar reverse substr($term, -2));
-
-    # Store all segments
-    $self->{$doc_id . '#' . $segment} = [$start_char, $end_char, $term_id, $first, $last];
-
-    if (DEBUG) {
-      print_log('segments', "Store segment at [$doc_id,$segment]");
-      print_log('segments', '  with ' . join(','),@{$self->{$doc_id . '#' . $segment}});
-    };
-  }
-
-  # Temporary
-  else {
-    # Store all segments
-    $self->{$doc_id . '#' . $segment} = [$start_char, $end_char];
-  }
-
-  return $self;
-};
-
-
-# Get offsets
-# TODO: Support caching!
-sub get {
-  my $self = shift;
-  my ($doc_id, $segment) = @_;
-  return $self->{$doc_id . '#' . $segment};
-};
-
-1;

diff --git a/lib/Krawfish/Index/Store/ForwardIndex.pm b/lib/Krawfish/Index/Store/1/ForwardIndex.pm
similarity index 82%
rename from lib/Krawfish/Index/Store/ForwardIndex.pm
rename to lib/Krawfish/Index/Store/1/ForwardIndex.pm
index 9648651..7e3eb5e 100644
--- a/lib/Krawfish/Index/Store/ForwardIndex.pm
+++ b/lib/Krawfish/Index/Store/1/ForwardIndex.pm

@@ -1,10 +1,11 @@
-package Krawfish::Index::Store::ForwardIndex;
+package Krawfish::Index::Store::V1::ForwardIndex;
 use Krawfish::Index::Store::Util qw/enc_string
                                     dec_string
                                     enc_varint
                                     dec_varint/;
 use strict;
 use warnings;
+use Data::BitStream;
 
 # TODO:
 #   The store should be versioned!
@@ -68,13 +69,24 @@
 # Flush the buffer
 sub _flush {
   my $self = shift;
+
+  # Calculate the subtoken length
+  # TODO: Store in 2 bytes
+  my $length = length(
+    $self->{buffer} . $self->{plain_tail}
+  );
+
+  # Add subtoken to stream
   $self->{stream} .=
     SUBTOKEN_MARKER .
-    (length($self->{buffer} . $self->{plain_tail}) + 1) .
+    $length .
     $self->{buffer} .
     PLAIN_MARKER .
-    $self->{plain_tail};
+    $self->{plain_tail} .
+    $length;
 
+  # TODO: For next() add PLAIN_MARKER and 2x length
+  # TODO: For previous() add SUBTOKEN_MARKER, PLAIN_MARKER and 1x length
   $self->{buffer} = '';
   $self->{plain_tail} = '';
   $self->{plain_pos} = 0;
@@ -104,6 +116,15 @@
   }
 };
 
+# Stub (dies at the "..." statement): look up the subtoken at $offset. TODO: May return a subtoken object
+sub get {
+  my ($self, $offset) = @_;
+
+  # TODO: Check for SUBTOKEN_MARKER. NOTE(review): _flush() empties {buffer} and appends to {stream} - confirm which one to read
+  # Read length. NOTE(review): 4-arg substr() also overwrites the character at $offset with "3" - confirm intent
+  my $subtoken_length = substr($self->{buffer}, $offset, 1, 3);
+  ...
+};
 
 # Add plain string
 # for example punctuation, whitespace etc.

diff --git a/lib/Krawfish/Index/Store/1/ForwardPointer.pm b/lib/Krawfish/Index/Store/1/ForwardPointer.pm
new file mode 100644
index 0000000..019f9d7
--- /dev/null
+++ b/lib/Krawfish/Index/Store/1/ForwardPointer.pm

@@ -0,0 +1,31 @@
+package Krawfish::Index::Store::V1::ForwardPointer;
+use strict;
+use warnings;
+
+sub new {
+  my $class = shift;
+  bless {
+    offset => 0,
+    index => shift,
+    current => undef
+  }, $class;
+};
+
+sub current {
+  return $_[0]->{current};
+};
+
+# Fetch the subtoken at $offset from the wrapped index (stored under 'index'
+# by new(); there is no 'buffer' key). Remembers $offset on success, resets
+# the stored offset to 0 and returns nothing on a miss.
+sub get {
+  my ($self, $offset) = @_;
+  my $subtoken = $self->{index}->get($offset);
+  $self->{offset} = $subtoken ? $offset : 0;
+  return $subtoken ? $subtoken : ();
+};
+
+
+sub next {}
+
+1;

diff --git a/lib/Krawfish/Index/Store/Stream.pm b/lib/Krawfish/Index/Store/1/Stream.pm
similarity index 92%
rename from lib/Krawfish/Index/Store/Stream.pm
rename to lib/Krawfish/Index/Store/1/Stream.pm
index c295954..3b34186 100644
--- a/lib/Krawfish/Index/Store/Stream.pm
+++ b/lib/Krawfish/Index/Store/1/Stream.pm

@@ -1,4 +1,4 @@
-package Krawfish::Index::Store::Stream;
+package Krawfish::Index::Store::V1::Stream;
 use strict;
 use warnings;
 

diff --git a/lib/Krawfish/Index/Store/Util.pm b/lib/Krawfish/Index/Store/1/Util.pm
similarity index 95%
rename from lib/Krawfish/Index/Store/Util.pm
rename to lib/Krawfish/Index/Store/1/Util.pm
index a5a78d4..1a20618 100644
--- a/lib/Krawfish/Index/Store/Util.pm
+++ b/lib/Krawfish/Index/Store/1/Util.pm

@@ -1,4 +1,4 @@
-package Krawfish::Index::Store::Util;
+package Krawfish::Index::Store::V1::Util;
 use parent 'Exporter';
 use strict;
 use warnings;

diff --git a/lib/Krawfish/Index/Subtokens.pm b/lib/Krawfish/Index/Subtokens.pm
new file mode 100644
index 0000000..34dfd6d
--- /dev/null
+++ b/lib/Krawfish/Index/Subtokens.pm

@@ -0,0 +1,155 @@
+package Krawfish::Index::Subtokens;
+use Krawfish::Log;
+use strict;
+use warnings;
+
+# See Krawfish::Index::Tokens
+
+# The Subtokens list (not different for different tokenizations)
+# has the following job:
+#
+# * Return forward index offsets for a certain subtoken
+#   (for the current forward index implementation, only the
+#    start offset is necessary)
+#   API: ->get($doc_id, $pos)
+#
+# * Get the surface form from the forward index as fast as possible
+#   This will first find the offsets and then collect the term_ids from
+#   the forward index and resolve the term_ids (potentially).
+#   API: ->get_surface($doc_id, $pos)
+#        ->get_surface($doc_id, $pos, $length)
+#
+# * Get the start and end characters of the surface form for fast
+#   sorting. All terms should be preranked in prefix and suffix order
+#   for the standard collation.
+#   API: ->get_prefix_rank($doc_id, $pos)
+#        ->get_suffix_rank($doc_id, $pos)
+
+
+# TODO:
+#   This may be implemented using a postings list, but inside positions,
+#   it should be possible to move backwards as well.
+#   The subtokens structure may be augmented with a skiplist
+#   and be a highly optimized position encoding, because character offsets
+#   should normally have values between 0 and 16.
+#
+#   It should also contain information about the first two characters
+#   of a term and possibly the last two characters, necessary to bucket sort terms.
+#   The characters are stored as UTF-8 or similar -
+#   it may be beneficial to have the most common characters need the least
+#   bits.
+#   Note that this information needs to store characters and not
+#   bytes, as bytes may not be helpful for sorting!
+#
+#   In addition, the term_id needs to be stored!
+
+# TODO: Term-IDs may be better stored in a separate file, to keep the file small.
+
+# The following APIs are needed:
+# ->get_plus('opennlp', 2,4)
+# That is needed to get the subtokens used for
+# extensions
+
+# This is a special PostingsList to store the length of tokens
+# in subtokens
+#
+# It may also be used for extensions and distances with tokens
+# (instead of subtokens)
+#
+# That's why this postingslist has a special API for extensions
+# and word distances.
+#
+# Structure may be: ([docid-delta]([seg-pos-delta][length-varbit])*)*
+#
+# The problem is, this won't make it possible to go back and forth.
+
+
+use constant DEBUG => 0;
+
+# Constructor: new($file, $start_char_length, $end_char_length)
+sub new {
+  my $class = shift;
+  bless {
+    file => shift,
+
+    # Define, how many start characters will be stored (defaults to 2)
+    # This is useful for alphabetic sorting
+    start_char_length => shift // 2,
+
+    # Define, how many end characters will be stored (defaults to 2)
+    # This is useful for alphabetic sorting
+    end_char_length => shift // 2,
+
+    array => [],
+    pos => -1,
+  }, $class;
+};
+
+# TODO: Better store length ...
+# Store offsets (and, if a term is given, its id and sort characters) for the
+# subtoken at [$doc_id, $segment]. Returns the object itself.
+sub store {
+  my $self = shift;
+
+  # Get data to store per subtoken
+  my ($doc_id, $segment, $start_char, $end_char, $term_id, $term) = @_;
+  my $key = $doc_id . '#' . $segment;
+
+  # Test for definedness and length, so a surface form like "0" is kept
+  if (defined $term && length $term) {
+
+    # Get the first two characters and the (reversed) last two characters
+    my ($first, $last) = (substr($term, 0, 2), scalar reverse substr($term, -2));
+    $self->{$key} = [$start_char, $end_char, $term_id, $first, $last];
+
+    if (DEBUG) {
+      print_log('segments', "Store segment at [$doc_id,$segment]");
+      print_log('segments', '  with ' . join(',', @{$self->{$key}}));
+    };
+  }
+
+  # Temporary: without a term only the offsets are stored
+  else {
+    $self->{$key} = [$start_char, $end_char];
+  }
+  return $self;
+};
+
+
+# Get offsets
+# TODO: Support caching!
+sub get {
+  my $self = shift;
+  my ($doc_id, $segment) = @_;
+  return $self->{$doc_id . '#' . $segment};
+};
+
+# Push a [$doc_id, $pos, $end] entry onto the internal array ($token is only used in the debug log)
+sub append {
+  my $self = shift;
+  my ($token, $doc_id, $pos, $end) = @_;
+  print_log('toklist', "Appended $token with $doc_id, $pos" . ($end ? "-$end" : '')) if DEBUG;
+  push(@{$self->{array}}, [$doc_id, $pos, $end]);
+};
+
+sub next;
+
+sub pos {
+  return $_[0]->{pos};
+};
+
+sub token {
+  return $_[0]->{array}->[$_[0]->pos];
+};
+
+
+sub freq;
+
+sub skip_to_doc;
+
+sub skip_to_pos;
+
+
+
+
+1;

diff --git a/lib/Krawfish/Index/Tokens.pm b/lib/Krawfish/Index/Tokens.pm
new file mode 100644
index 0000000..b2755a5
--- /dev/null
+++ b/lib/Krawfish/Index/Tokens.pm

@@ -0,0 +1,53 @@
+package Krawfish::Index::Tokens;
+use Krawfish::Log;
+use strict;
+use warnings;
+
+# See Krawfish::Index::Subtokens
+
+# The Tokens list has the following jobs:
+#
+# * Check if the number of tokens between two subtokens is
+#   in a certain range
+#   API: ->count($doc_id, $pos, $length, $min, $max)
+#   May as well be extensible for queries like
+#   a []{2,7} b
+#
+# * Add tokens to both sides for extension queries
+#   API: ->extend_to_left($doc_id, $pos, $min, $max)
+#   API: ->extend_to_right($doc_id, $pos, $min, $max)
+#
+# * Get the number of tokens per doc_id
+#   API: ->count($doc_id)
+#        or ->freq($doc_id)
+#
+
+# Get an array of start positions that are in the range of min/max
+# Start with the lowest
+sub extend_to_left {
+  my ($self, $start, $min, $max) = @_;
+  # Returns an array of start positions
+  ...
+};
+
+# Get an array of end positions that are in the range of min/max
+# Start with the lowest
+sub extend_to_right {
+  my ($self, $end, $min, $max) = @_;
+  # Returns an array of end positions
+  ...
+};
+
+# Check if the number of tokens between end and start is in the given range.
+# Not implemented yet: the "..." statement dies when the method is called.
+#
+# This is necessary for token distance
+# a []{2,3} b
+sub count {
+  my ($self, $end, $start, $min, $max) = @_;
+  # First check if this is even possible based on subtokens
+  # then check on tokens
+  ...
+}
+
+1; # A module has to return a true value to be loadable via use/require

diff --git a/lib/Krawfish/Index/TokensList.pm b/lib/Krawfish/Index/TokensList.pm
deleted file mode 100644
index 3eb3d74..0000000
--- a/lib/Krawfish/Index/TokensList.pm
+++ /dev/null

@@ -1,87 +0,0 @@
-package Krawfish::Index::TokensList;
-use strict;
-use warnings;
-
-use constant DEBUG => 0;
-
-# This is a special PostingsList to store the length of tokens
-# in segments
-#
-# It may also be used for extensions and distances with tokens
-# (instead of segments)
-#
-# That's why this postingslist has a special API for extensions
-# and word distances.
-#
-# Structure may be: ([docid-delta]([seg-pos-delta][length-varbit])*)*
-#
-# The problem is, this won't make it possible to go back and forth.
-
-sub new {
-  my $class = shift;
-  bless {
-    array => [],
-    pos => -1,
-    index_file => shift,
-    foundry => shift
-  }, $class;
-}
-
-sub append {
-  my $self = shift;
-  my ($token, $doc_id, $pos, $end) = @_;
-  print_log('toklist', "Appended $token with $doc_id, $pos" . ($end ? "-$end" : '')) if DEBUG;
-  push(@{$self->{array}}, [$doc_id, $pos, $end]);
-};
-
-sub next;
-
-sub pos {
-  return $_[0]->{pos};
-};
-
-sub token {
-  return $_[0]->{array}->[$_[0]->pos];
-};
-
-
-sub freq;
-
-sub skip_to_doc;
-
-sub skip_to_pos;
-
-
-# Get an array of start positions that are in the range of min/max
-# Start with the lowest
-sub extend_to_left {
-  my ($self, $start, $min, $max) = @_;
-  # Returns an array of start positions
-  ...
-};
-
-# Get an array of end positions that are in the range of min/max
-# Start with the lowest
-sub extend_to_right {
-  my ($self, $end, $min, $max) = @_;
-  # Returns an array of end positions
-  ...
-};
-
-# Check if the number of tokens between end and start
-# is in the given range.
-#
-# This is necessary for token distance
-# a []{2,3} b
-sub check_tokens_between {
-  my ($self, $end, $start, $min, $max) = @_;
-
-  # First check if this is even possible based on segments
-  # then check on tokens
-  ...
-}
-
-
-1;
-
-__END__

diff --git a/lib/Krawfish/Koral/Query.pm b/lib/Krawfish/Koral/Query.pm
index 2973b3f..995358f 100644
--- a/lib/Krawfish/Koral/Query.pm
+++ b/lib/Krawfish/Koral/Query.pm

@@ -30,6 +30,7 @@
 #########################################
 
 # Prepare a query for an index
+# TODO: Rename to compile()
 sub prepare_for {
   my ($self, $index) = @_;
 

diff --git a/lib/Krawfish/Posting/Snippet.pm b/lib/Krawfish/Posting/Snippet.pm
index 6f21567..78db1e3 100644
--- a/lib/Krawfish/Posting/Snippet.pm
+++ b/lib/Krawfish/Posting/Snippet.pm

@@ -11,20 +11,20 @@
   my $self = shift;
 
   my $offsets = $self->index->offsets;
-  my $start_segment = $offsets->get(
+  my $start_subtoken = $offsets->get(
     $self->doc_id,
     $self->start
   );
 
-  my $end_segment = $offsets->get(
+  my $end_subtoken = $offsets->get(
     $self->doc_id,
     $self->end
   );
 
   return $self->index->primary->get(
     $self->doc_id,
-    $start_segment,
-    $end_segment
+    $start_subtoken,
+    $end_subtoken
   );
 };
 

diff --git a/lib/Krawfish/Query/Extension.pm b/lib/Krawfish/Query/Extension.pm
index eba903e..b0e7923 100644
--- a/lib/Krawfish/Query/Extension.pm
+++ b/lib/Krawfish/Query/Extension.pm

@@ -4,7 +4,7 @@
 use strict;
 use warnings;
 
-# This query adds segments to the left or the right
+# This query adds subtokens to the left or the right
 # of a matching span
 
 

diff --git a/lib/Krawfish/Result/Aggregate/Values.pm b/lib/Krawfish/Result/Aggregate/Values.pm
index 292b6fc..9d26508 100644
--- a/lib/Krawfish/Result/Aggregate/Values.pm
+++ b/lib/Krawfish/Result/Aggregate/Values.pm

@@ -40,7 +40,7 @@
   if ($value_current->doc_id < $current->doc_id) {
 
     # Skip to the requested doc_id (or beyond)
-    $value_current = $values->skip_to($current->doc_id);
+    $value_current = $values->skip_doc($current->doc_id);
   };
 
   if ($current_value->doc_id == $current->doc_id) {

diff --git a/lib/Krawfish/Result/Group/Classes.pm b/lib/Krawfish/Result/Group/Classes.pm
index 94e161b..a3800a3 100644
--- a/lib/Krawfish/Result/Group/Classes.pm
+++ b/lib/Krawfish/Result/Group/Classes.pm

@@ -30,36 +30,36 @@
 
   # Get all classes from the match
   # Classes need to be sorted by start position
-  # to be retrievable, in case the Segments-Stream
+  # to be retrievable, in case the subtokens-Stream
   # is implemented as a postingslist (probably not)
   my @classes = $match->get_classes_sorted($self->{nrs});
 
-  my $segments = $self->{index}->segments;
+  my $subtokens = $self->{index}->subtokens;
 
   my %class_group;
 
   # Classes have nr, start, end
   foreach my $class (@classes) {
 
-    # WARNING! CLASSES MAY OVERLAP SO SEGMENTS SHOULD BE CACHED OR BUFFERED!
+    # WARNING! CLASSES MAY OVERLAP SO SUBTOKENS SHOULD BE CACHED OR BUFFERED!
 
     # Get start position
     my $start = $class->[START_POS];
 
     my @seq = ();
 
-    # Receive segment
-    my $seg = $segments->get($match->doc_id, $start);
+    # Receive subtoken
+    my $subt = $subtokens->get($match->doc_id, $start);
 
-    # Push term id to segment
-    # TODO: A segment should have accessors
-    push (@seq, $seg->[2]);
+    # Push term id to subtoken
+    # TODO: A subtoken should have accessors
+    push (@seq, $subt->[2]);
 
     while ($start < ($class->[END_POS] -1)) {
-      $seg = $segments->get($match->doc_id, ++$start);
+      $subt = $subtokens->get($match->doc_id, ++$start);
 
-      # Push term id to segment
-      push (@seq, $seg->[2]);
+      # Push term id to subtoken
+      push (@seq, $subt->[2]);
     };
 
     # Class not yet set

diff --git a/lib/Krawfish/Result/Snippet.pm b/lib/Krawfish/Result/Snippet.pm
index 6bf6c33..772c940 100644
--- a/lib/Krawfish/Result/Snippet.pm
+++ b/lib/Krawfish/Result/Snippet.pm

@@ -20,12 +20,12 @@
     index => $param{index}
   }, $class;
 
-  $self->{segments} = $self->{index}->segments;
+  $self->{subtokens} = $self->{index}->subtokens;
 
   # Create highlight object
   $self->{highlights} = Krawfish::Result::Snippet::Highlights->new(
     $param{highlights},
-    $self->{segments}
+    $self->{subtokens}
   );
 
   return $self;
commit	6a749735268e1113253f2806bf9a3167e27ebc2f	[log] [tgz]
author	Akron <nils@diewald-online.de>	Tue Feb 14 14:43:06 2017 +0100
committer	Akron <nils@diewald-online.de>	Tue Feb 14 14:43:06 2017 +0100
tree	d61a8a009606cd2d7122936f4b12be40473be44d
parent	d2f9e69eeca2c6a648722f69cf9b5edeaa3ff5c6 [diff]