Introduced a store and renamed segments to subtokens
diff --git a/lib/Krawfish/Index.pm b/lib/Krawfish/Index.pm
index 355ed11..de554e4 100644
--- a/lib/Krawfish/Index.pm
+++ b/lib/Krawfish/Index.pm
@@ -1,6 +1,6 @@
package Krawfish::Index;
use Krawfish::Index::Dictionary;
-use Krawfish::Index::Segments;
+use Krawfish::Index::Subtokens;
use Krawfish::Index::PrimaryData;
use Krawfish::Index::Fields;
use Krawfish::Cache;
@@ -24,9 +24,9 @@
# TODO: Maybe 65.535 documents are enough per segment ...
# TODO: Build a forward index
-# TODO: With a forward index, the segments offsets will no longer
+# TODO: With a forward index, the subtoken offsets will no longer
# point to character positions in the primary text but to
-# segment positions in the forward index!
+# subtoken positions in the forward index!
# TODO:
# Reranking a field is not necessary, if the field value is already given.
@@ -56,7 +56,7 @@
);
# Load offsets
- $self->{segments} = Krawfish::Index::Segments->new(
+ $self->{subtokens} = Krawfish::Index::Subtokens->new(
$self->{file}
);
@@ -106,9 +106,9 @@
};
-# Get segments
-sub segments {
- $_[0]->{segments};
+# Get subtokens
+sub subtokens {
+ $_[0]->{subtokens};
};
@@ -177,27 +177,27 @@
$post_list->append($doc_id);
};
- my $segments = $self->segments;
+ my $subtokens = $self->subtokens;
- # The primary text is necessary for the segments index as well as
+ # The primary text is necessary for the subtoken index as well as
# for the forward index
my $primary = $doc->{primaryData};
- # Store segments
- if ($doc->{segments}) {
+ # Store subtokens
+ if ($doc->{subtokens}) {
- print_log('index', 'Store segments') if DEBUG;
+ print_log('index', 'Store subtokens') if DEBUG;
- # Store all segment offsets
- foreach my $seg (@{$doc->{segments}}) {
+ # Store all subtoken offsets
+ foreach my $seg (@{$doc->{subtokens}}) {
- # Get start and end of the segment
+ # Get start and end of the subtoken
my ($start, $end) = @{$seg->{offsets}};
if (DEBUG) {
print_log(
'index',
- 'Store segment: ' . $doc_id . ':' . $pos . '=' . join('-', $start, $end)
+ 'Store subtoken: ' . $doc_id . ':' . $pos . '=' . join('-', $start, $end)
);
};
@@ -207,14 +207,14 @@
# TODO: There may be a prefix necessary for surface forms
# TODO: This may in fact be not necessary at all -
- # The segments may have their own IDs
+ # The subtokens may have their own IDs
# And the terms do not need to be stored in the dictionary for retrieval ...
my $term_id = $dict->add('*' . $term)->term_id;
print_log('index', 'Surface form has term_id ' . $term_id) if DEBUG;
- # Store information to segment
- $segments->store($doc_id, $pos++, $start, $end, $term_id, $term);
+ # Store information to subtoken
+ $subtokens->store($doc_id, $pos++, $start, $end, $term_id, $term);
};
};
@@ -248,15 +248,15 @@
};
# Append posting to postings list
- my @segments = _segments($item);
+ my @subtokens = _subtokens($item);
- # No segments defined
- unless (scalar @segments) {
- push @segments, $pos;
+ # No subtokens defined
+ unless (scalar @subtokens) {
+ push @subtokens, $pos;
# Store offsets
if ($item->{offsets}) {
- $segments->store($doc_id, $pos, @{$item->{offsets}});
+ $subtokens->store($doc_id, $pos, @{$item->{offsets}});
};
$pos++;
};
@@ -264,7 +264,7 @@
# Add token terms
foreach (@keys) {
my $post_list = $dict->add($_);
- $post_list->append($doc_id, @segments);
+ $post_list->append($doc_id, @subtokens);
};
}
@@ -279,9 +279,9 @@
# Append posting to posting list
$post_list->append(
$doc_id,
- $item->{segments}->[0],
- # The end is AFTER the second segment
- $item->{segments}->[-1] + 1
+ $item->{subtokens}->[0],
+ # The end is AFTER the last subtoken
+ $item->{subtokens}->[-1] + 1
);
};
};
@@ -308,19 +308,19 @@
}
-# Return segment list or nothing
-sub _segments {
+# Return subtoken list or nothing
+sub _subtokens {
my $item = shift;
my @posting;
- if ($item->{segments}) {
+ if ($item->{subtokens}) {
# Remove!
- push @posting, $item->{segments}->[0];
+ push @posting, $item->{subtokens}->[0];
- if ($item->{segments}->[1]) {
- # The end is AFTER the second segment
- push @posting, $item->{segments}->[1] + 1;
+ if ($item->{subtokens}->[1]) {
+ # The end is AFTER the second subtoken
+ push @posting, $item->{subtokens}->[1] + 1;
};
return @posting;
diff --git a/lib/Krawfish/Index/ForwardIndex.pm b/lib/Krawfish/Index/ForwardIndex.pm
index 047567c..b93275e 100644
--- a/lib/Krawfish/Index/ForwardIndex.pm
+++ b/lib/Krawfish/Index/ForwardIndex.pm
@@ -12,6 +12,13 @@
# merge. Then, convert the forward index based on this table without
# dictionary lookup.
#
+# TODO:
+# This is great for retrieving pagebreaks, annotations, primary data,
+# perhaps help on regex ...
+# But can this help to expand the context of a match to a certain element context?
+# Probably by retrieving the data with a certain maximum offset (say left 100 subtokens, right 100 subtokens)
+# and first check for the expanding element start on the left, then move to the right.
+#
sub new {
my $class = shift;
bless {
@@ -32,6 +39,10 @@
return substr($self->{forward}->[$doc_id], $offset, $end - $offset);
};
+# TODO: Not implemented yet - the "..." placeholder dies when this is called
+sub get_expanded {
+ ...
+};
# Return a stream of elements (primary text and annotations)
sub get_annotated {
@@ -40,7 +51,17 @@
...
};
+# Return a stream of elements (primary text and annotations)
+# that is within a certain element (not implemented yet: "..." dies when called)
+sub get_annotated_expanded {
+ my $self = shift;
+ my ($doc_id, $offset, $length, $foundry, $max_exp, $layer, $element) = @_;
+ ...
+};
+
+
# Return the surface string only
+# This should be as fast as possible, as it is used for aggregations
sub get_surface {
my ($self, $doc_id, $offset, $length) = @_;
...
diff --git a/lib/Krawfish/Index/Segments.pm b/lib/Krawfish/Index/Segments.pm
deleted file mode 100644
index ce0896a..0000000
--- a/lib/Krawfish/Index/Segments.pm
+++ /dev/null
@@ -1,83 +0,0 @@
-package Krawfish::Index::Segments;
-use Krawfish::Log;
-use strict;
-use warnings;
-
-# Store offsets for direct access using doc id and pos
-# - in addition store term ids and characters for presorting
-
-# TODO:
-# This may be implemented using a postings list, but inside positions,
-# it should be possible to move backwards as well.
-# The segments structure may be augmented with a skiplist
-# and be a highly optimized position encoding, because character offsets
-# should normally have values between 0 and 16.
-#
-# It should also contain information about the first two characters
-# of a term and possibly the last two characters, necessary to bucket sort terms.
-# The characters are stored as UTF-8 or similar -
-# it may be beneficial to have the most common characters need the least
-# bits.
-# Note that this information needs to store characters and not
-# bytes, as bytes may not be helpful for sorting!
-#
-# In addition, the term_id needs to be stored!
-
-# TODO: Term-IDs may be better stored in a separate file, to keep the file small.
-
-use constant DEBUG => 0;
-
-# Constructor
-sub new {
- my $class = shift;
- bless {
- file => shift,
-
- # Define, how many start characters will be stored
- start_char_length => shift // 2,
-
- # Define, how many start characters will be stored
- end_char_length => shift // 2
- }, $class;
-};
-
-# TODO: Better store length ...
-# Store offsets
-sub store {
- my $self = shift;
-
- # Get data to store per segment
- my ($doc_id, $segment, $start_char, $end_char, $term_id, $term) = @_;
-
- if ($term) {
- # Get the first and last characters of the term
- my ($first, $last) = (substr($term, 0, 2), scalar reverse substr($term, -2));
-
- # Store all segments
- $self->{$doc_id . '#' . $segment} = [$start_char, $end_char, $term_id, $first, $last];
-
- if (DEBUG) {
- print_log('segments', "Store segment at [$doc_id,$segment]");
- print_log('segments', ' with ' . join(','),@{$self->{$doc_id . '#' . $segment}});
- };
- }
-
- # Temporary
- else {
- # Store all segments
- $self->{$doc_id . '#' . $segment} = [$start_char, $end_char];
- }
-
- return $self;
-};
-
-
-# Get offsets
-# TODO: Support caching!
-sub get {
- my $self = shift;
- my ($doc_id, $segment) = @_;
- return $self->{$doc_id . '#' . $segment};
-};
-
-1;
diff --git a/lib/Krawfish/Index/Store/ForwardIndex.pm b/lib/Krawfish/Index/Store/1/ForwardIndex.pm
similarity index 82%
rename from lib/Krawfish/Index/Store/ForwardIndex.pm
rename to lib/Krawfish/Index/Store/1/ForwardIndex.pm
index 9648651..7e3eb5e 100644
--- a/lib/Krawfish/Index/Store/ForwardIndex.pm
+++ b/lib/Krawfish/Index/Store/1/ForwardIndex.pm
@@ -1,10 +1,11 @@
-package Krawfish::Index::Store::ForwardIndex;
+package Krawfish::Index::Store::V1::ForwardIndex;
use Krawfish::Index::Store::Util qw/enc_string
dec_string
enc_varint
dec_varint/;
use strict;
use warnings;
+use Data::BitStream;
# TODO:
# The store should be versioned!
@@ -68,13 +69,24 @@
# Flush the buffer
sub _flush {
my $self = shift;
+
+ # Calculate the subtoken length
+ # TODO: Store in 2 bytes
+ my $length = length(
+ $self->{buffer} . $self->{plain_tail}
+ );
+
+ # Add subtoken to stream
$self->{stream} .=
SUBTOKEN_MARKER .
- (length($self->{buffer} . $self->{plain_tail}) + 1) .
+ $length .
$self->{buffer} .
PLAIN_MARKER .
- $self->{plain_tail};
+ $self->{plain_tail} .
+ $length;
+ # TODO: For next() add PLAIN_MARKER and 2x length
+ # TODO: For previous() add SUBTOKEN_MARKER, PLAIN_MARKER and 1x length
$self->{buffer} = '';
$self->{plain_tail} = '';
$self->{plain_pos} = 0;
@@ -104,6 +116,15 @@
}
};
+# Read the subtoken at $offset (unfinished). TODO: May return a subtoken object
+sub get {
+  my ($self, $offset) = @_;
+  # NOTE(review): _flush() resets {buffer} and appends to {stream} - confirm
+  # that {buffer} is the right source. TODO: Check for SUBTOKEN_MARKER
+  # Read the length with 3-arg substr; a 4th argument would overwrite the data
+  my $subtoken_length = substr($self->{buffer}, $offset, 1);
+  ...
+};
# Add plain string
# for example punctuation, whitespace etc.
diff --git a/lib/Krawfish/Index/Store/1/ForwardPointer.pm b/lib/Krawfish/Index/Store/1/ForwardPointer.pm
new file mode 100644
index 0000000..019f9d7
--- /dev/null
+++ b/lib/Krawfish/Index/Store/1/ForwardPointer.pm
@@ -0,0 +1,31 @@
+package Krawfish::Index::Store::V1::ForwardPointer;
+use strict;
+use warnings;
+
+# Constructor: takes one argument, which is kept in the 'index' slot
+sub new {
+  my ($class, $index) = @_;
+
+  # Start at offset 0 with no current subtoken
+  my %self = (offset => 0, index => $index, current => undef);
+  return bless \%self, $class;
+};
+# Return the value of the 'current' slot (initialised to undef in new())
+sub current {
+ return $_[0]->{current};
+};
+
+# Move to $offset: return the subtoken found there, or nothing (offset reset to 0)
+sub get {
+  my ($self, $offset) = @_;
+  # The constructor stores its argument under 'index'; no 'buffer' slot exists
+  my $subtoken = $self->{index}->get($offset);
+  $self->{offset} = $subtoken ? $offset : 0;
+  return $subtoken if $subtoken;
+  return;
+};
+
+# TODO: Not implemented yet - currently an empty sub
+sub next {}
+
+1;
diff --git a/lib/Krawfish/Index/Store/Stream.pm b/lib/Krawfish/Index/Store/1/Stream.pm
similarity index 92%
rename from lib/Krawfish/Index/Store/Stream.pm
rename to lib/Krawfish/Index/Store/1/Stream.pm
index c295954..3b34186 100644
--- a/lib/Krawfish/Index/Store/Stream.pm
+++ b/lib/Krawfish/Index/Store/1/Stream.pm
@@ -1,4 +1,4 @@
-package Krawfish::Index::Store::Stream;
+package Krawfish::Index::Store::V1::Stream;
use strict;
use warnings;
diff --git a/lib/Krawfish/Index/Store/Util.pm b/lib/Krawfish/Index/Store/1/Util.pm
similarity index 95%
rename from lib/Krawfish/Index/Store/Util.pm
rename to lib/Krawfish/Index/Store/1/Util.pm
index a5a78d4..1a20618 100644
--- a/lib/Krawfish/Index/Store/Util.pm
+++ b/lib/Krawfish/Index/Store/1/Util.pm
@@ -1,4 +1,4 @@
-package Krawfish::Index::Store::Util;
+package Krawfish::Index::Store::V1::Util;
use parent 'Exporter';
use strict;
use warnings;
diff --git a/lib/Krawfish/Index/Subtokens.pm b/lib/Krawfish/Index/Subtokens.pm
new file mode 100644
index 0000000..34dfd6d
--- /dev/null
+++ b/lib/Krawfish/Index/Subtokens.pm
@@ -0,0 +1,155 @@
+package Krawfish::Index::Subtokens;
+use Krawfish::Log;
+use strict;
+use warnings;
+
+# See Krawfish::Index::Tokens
+
+# The Subtokens list (not different for different tokenizations)
+# has the following job:
+#
+# * Return forward index offsets for a certain subtoken
+# (for the current forward index implementation, only the
+# start offset is necessary)
+# API: ->get($doc_id, $pos)
+#
+# * Get the surface form from the forward index as fast as possible
+# This will first find the offsets and then collect the term_ids from
+# the forward index and resolve the term_ids (potentially).
+# API: ->get_surface($doc_id, $pos)
+# ->get_surface($doc_id, $pos, $length)
+#
+# * Get the start and end characters of the surface form for fast
+# sorting. All terms should be preranked in prefix and suffix order
+# for the standard collation.
+# API: ->get_prefix_rank($doc_id, $pos)
+# ->get_suffix_rank($doc_id, $pos)
+
+
+# TODO:
+# This may be implemented using a postings list, but inside positions,
+# it should be possible to move backwards as well.
+# The subtokens structure may be augmented with a skiplist
+# and be a highly optimized position encoding, because character offsets
+# should normally have values between 0 and 16.
+#
+# It should also contain information about the first two characters
+# of a term and possibly the last two characters, necessary to bucket sort terms.
+# The characters are stored as UTF-8 or similar -
+# it may be beneficial to have the most common characters need the least
+# bits.
+# Note that this information needs to store characters and not
+# bytes, as bytes may not be helpful for sorting!
+#
+# In addition, the term_id needs to be stored!
+
+# TODO: Term-IDs may be better stored in a separate file, to keep the file small.
+
+# The following APIs are needed:
+# ->get_plus('opennlp', 2,4)
+# That is needed to get the subtokens used for
+# extensions
+
+# This is a special PostingsList to store the length of tokens
+# in subtokens
+#
+# It may also be used for extensions and distances with tokens
+# (instead of subtokens)
+#
+# That's why this postingslist has a special API for extensions
+# and word distances.
+#
+# Structure may be: ([docid-delta]([seg-pos-delta][length-varbit])*)*
+#
+# The problem is, this won't make it possible to go back and forth.
+
+
+use constant DEBUG => 0;
+
+# Constructor - arguments: file, start_char_length (default 2), end_char_length (default 2)
+sub new {
+ my $class = shift;
+ bless {
+ file => shift,
+
+ # Define, how many start characters will be stored
+ # This is useful for alphabetic sorting
+ start_char_length => shift // 2,
+
+ # Define, how many end characters will be stored
+ # This is useful for alphabetic sorting
+ end_char_length => shift // 2,
+ # In-memory list filled by append(), and the cursor into it (initially -1)
+ array => [],
+ pos => -1,
+ }, $class;
+};
+
+# TODO: Better store length ...
+# Store the offsets of one subtoken under the key "<doc_id>#<position>".
+# If a term is given, its term id and its first and last two characters
+# are stored as well. Returns the object itself.
+sub store {
+  my $self = shift;
+
+  # Get data to store per subtoken
+  my ($doc_id, $segment, $start_char, $end_char, $term_id, $term) = @_;
+  my $key = $doc_id . '#' . $segment;
+
+  # Test for a non-empty string: a plain truth test would skip the term "0"
+  if (defined $term && length $term) {
+    # Get the first and last characters of the term (the last ones reversed)
+    my ($first, $last) = (substr($term, 0, 2), scalar reverse substr($term, -2));
+    $self->{$key} = [$start_char, $end_char, $term_id, $first, $last];
+
+    if (DEBUG) {
+      print_log('segments', "Store segment at [$doc_id,$segment]");
+      print_log('segments', ' with ' . join(',', @{$self->{$key}}));
+    };
+  }
+
+  # Temporary: without a term only the offsets are stored
+  else {
+    $self->{$key} = [$start_char, $end_char];
+  }
+  return $self;
+};
+
+
+# Look up what store() recorded for a document id and position
+# TODO: Support caching!
+sub get {
+  my ($self, $doc_id, $position) = @_;
+  my $key = join '#', $doc_id, $position;
+  return $self->{$key};
+};
+
+
+# Append a [doc_id, pos, end] entry for a token to the in-memory list
+sub append {
+  my ($self, $token, $doc_id, $pos, $end) = @_;
+  DEBUG and print_log('toklist', "Appended $token with $doc_id, $pos" . ($end ? "-$end" : ''));
+  push @{$self->{array}}, [$doc_id, $pos, $end];
+};
+
+sub next;
+# Return the current cursor position (initialised to -1 in the constructor)
+sub pos {
+ return $_[0]->{pos};
+};
+# Return the entry at the cursor; undef while the cursor is negative, because index -1 would address the LAST entry
+sub token {
+  my $cursor = $_[0]->pos;
+  return $cursor < 0 ? undef : $_[0]->{array}->[$cursor];
+};
+
+sub freq;
+
+sub skip_to_doc;
+
+sub skip_to_pos;
+
+
+
+
+1;
diff --git a/lib/Krawfish/Index/Tokens.pm b/lib/Krawfish/Index/Tokens.pm
new file mode 100644
index 0000000..b2755a5
--- /dev/null
+++ b/lib/Krawfish/Index/Tokens.pm
@@ -0,0 +1,53 @@
+package Krawfish::Index::Tokens;
+use Krawfish::Log;
+use strict;
+use warnings;
+
+# See Krawfish::Index::Subtokens
+
+# The Tokens list has the following jobs:
+#
+# * Check if the number of tokens between two subtokens is
+# in a certain range
+# API: ->count($doc_id, $pos, $length, $min, $max)
+# May as well be extensible for queries like
+# a []{2,7} b
+#
+# * Add tokens to both sides for extension queries
+# API: ->extend_to_left($doc_id, $pos, $min, $max)
+# API: ->extend_to_right($doc_id, $pos, $min, $max)
+#
+# * Get the number of tokens per doc_id
+# API: ->count($doc_id)
+# or ->freq($doc_id)
+#
+
+# Get an array of start positions that are in the range of min/max,
+# starting with the lowest. Not implemented yet: "..." dies when called.
+sub extend_to_left {
+ my ($self, $start, $min, $max) = @_;
+ # NOTE(review): the API sketch in the file header also lists $doc_id - confirm
+ ...
+};
+
+# Get an array of end positions that are in the range of min/max,
+# starting with the lowest. Not implemented yet: "..." dies when called.
+sub extend_to_right {
+ my ($self, $end, $min, $max) = @_;
+ # NOTE(review): the API sketch in the file header also lists $doc_id - confirm
+ ...
+};
+
+# Check if the number of tokens between end and start is in the given
+# range (not implemented yet). This is necessary for token distance
+# a []{2,3} b
+sub count {
+  my ($self, $end, $start, $min, $max) = @_;
+
+  # First check if this is even possible based on subtokens
+  # then check on tokens
+  ...
+}
+
+# A module file has to end with a true value, otherwise require/use fails
+1;
diff --git a/lib/Krawfish/Index/TokensList.pm b/lib/Krawfish/Index/TokensList.pm
deleted file mode 100644
index 3eb3d74..0000000
--- a/lib/Krawfish/Index/TokensList.pm
+++ /dev/null
@@ -1,87 +0,0 @@
-package Krawfish::Index::TokensList;
-use strict;
-use warnings;
-
-use constant DEBUG => 0;
-
-# This is a special PostingsList to store the length of tokens
-# in segments
-#
-# It may also be used for extensions and distances with tokens
-# (instead of segments)
-#
-# That's why this postingslist has a special API for extensions
-# and word distances.
-#
-# Structure may be: ([docid-delta]([seg-pos-delta][length-varbit])*)*
-#
-# The problem is, this won't make it possible to go back and forth.
-
-sub new {
- my $class = shift;
- bless {
- array => [],
- pos => -1,
- index_file => shift,
- foundry => shift
- }, $class;
-}
-
-sub append {
- my $self = shift;
- my ($token, $doc_id, $pos, $end) = @_;
- print_log('toklist', "Appended $token with $doc_id, $pos" . ($end ? "-$end" : '')) if DEBUG;
- push(@{$self->{array}}, [$doc_id, $pos, $end]);
-};
-
-sub next;
-
-sub pos {
- return $_[0]->{pos};
-};
-
-sub token {
- return $_[0]->{array}->[$_[0]->pos];
-};
-
-
-sub freq;
-
-sub skip_to_doc;
-
-sub skip_to_pos;
-
-
-# Get an array of start positions that are in the range of min/max
-# Start with the lowest
-sub extend_to_left {
- my ($self, $start, $min, $max) = @_;
- # Returns an array of start positions
- ...
-};
-
-# Get an array of end positions that are in the range of min/max
-# Start with the lowest
-sub extend_to_right {
- my ($self, $end, $min, $max) = @_;
- # Returns an array of end positions
- ...
-};
-
-# Check if the number of tokens between end and start
-# is in the given range.
-#
-# This is necessary for token distance
-# a []{2,3} b
-sub check_tokens_between {
- my ($self, $end, $start, $min, $max) = @_;
-
- # First check if this is even possible based on segments
- # then check on tokens
- ...
-}
-
-
-1;
-
-__END__
diff --git a/lib/Krawfish/Koral/Query.pm b/lib/Krawfish/Koral/Query.pm
index 2973b3f..995358f 100644
--- a/lib/Krawfish/Koral/Query.pm
+++ b/lib/Krawfish/Koral/Query.pm
@@ -30,6 +30,7 @@
#########################################
# Prepare a query for an index
+# TODO: Rename to compile()
sub prepare_for {
my ($self, $index) = @_;
diff --git a/lib/Krawfish/Posting/Snippet.pm b/lib/Krawfish/Posting/Snippet.pm
index 6f21567..78db1e3 100644
--- a/lib/Krawfish/Posting/Snippet.pm
+++ b/lib/Krawfish/Posting/Snippet.pm
@@ -11,20 +11,20 @@
my $self = shift;
my $offsets = $self->index->offsets;
- my $start_segment = $offsets->get(
+ my $start_subtoken = $offsets->get(
$self->doc_id,
$self->start
);
- my $end_segment = $offsets->get(
+ my $end_subtoken = $offsets->get(
$self->doc_id,
$self->end
);
return $self->index->primary->get(
$self->doc_id,
- $start_segment,
- $end_segment
+ $start_subtoken,
+ $end_subtoken
);
};
diff --git a/lib/Krawfish/Query/Extension.pm b/lib/Krawfish/Query/Extension.pm
index eba903e..b0e7923 100644
--- a/lib/Krawfish/Query/Extension.pm
+++ b/lib/Krawfish/Query/Extension.pm
@@ -4,7 +4,7 @@
use strict;
use warnings;
-# This query adds segments to the left or the right
+# This query adds subtokens to the left or the right
# of a matching span
diff --git a/lib/Krawfish/Result/Aggregate/Values.pm b/lib/Krawfish/Result/Aggregate/Values.pm
index 292b6fc..9d26508 100644
--- a/lib/Krawfish/Result/Aggregate/Values.pm
+++ b/lib/Krawfish/Result/Aggregate/Values.pm
@@ -40,7 +40,7 @@
if ($value_current->doc_id < $current->doc_id) {
# Skip to the requested doc_id (or beyond)
- $value_current = $values->skip_to($current->doc_id);
+ $value_current = $values->skip_doc($current->doc_id);
};
if ($current_value->doc_id == $current->doc_id) {
diff --git a/lib/Krawfish/Result/Group/Classes.pm b/lib/Krawfish/Result/Group/Classes.pm
index 94e161b..a3800a3 100644
--- a/lib/Krawfish/Result/Group/Classes.pm
+++ b/lib/Krawfish/Result/Group/Classes.pm
@@ -30,36 +30,36 @@
# Get all classes from the match
# Classes need to be sorted by start position
- # to be retrievable, in case the Segments-Stream
+ # to be retrievable, in case the subtokens-Stream
# is implemented as a postingslist (probably not)
my @classes = $match->get_classes_sorted($self->{nrs});
- my $segments = $self->{index}->segments;
+ my $subtokens = $self->{index}->subtokens;
my %class_group;
# Classes have nr, start, end
foreach my $class (@classes) {
- # WARNING! CLASSES MAY OVERLAP SO SEGMENTS SHOULD BE CACHED OR BUFFERED!
+ # WARNING! CLASSES MAY OVERLAP SO SUBTOKENS SHOULD BE CACHED OR BUFFERED!
# Get start position
my $start = $class->[START_POS];
my @seq = ();
- # Receive segment
- my $seg = $segments->get($match->doc_id, $start);
+ # Receive subtoken
+ my $subt = $subtokens->get($match->doc_id, $start);
- # Push term id to segment
- # TODO: A segment should have accessors
- push (@seq, $seg->[2]);
+ # Push term id to subtoken
+ # TODO: A subtoken should have accessors
+ push (@seq, $subt->[2]);
while ($start < ($class->[END_POS] -1)) {
- $seg = $segments->get($match->doc_id, ++$start);
+ $subt = $subtokens->get($match->doc_id, ++$start);
- # Push term id to segment
- push (@seq, $seg->[2]);
+ # Push term id to subtoken
+ push (@seq, $subt->[2]);
};
# Class not yet set
diff --git a/lib/Krawfish/Result/Snippet.pm b/lib/Krawfish/Result/Snippet.pm
index 6bf6c33..772c940 100644
--- a/lib/Krawfish/Result/Snippet.pm
+++ b/lib/Krawfish/Result/Snippet.pm
@@ -20,12 +20,12 @@
index => $param{index}
}, $class;
- $self->{segments} = $self->{index}->segments;
+ $self->{subtokens} = $self->{index}->subtokens;
# Create highlight object
$self->{highlights} = Krawfish::Result::Snippet::Highlights->new(
$param{highlights},
- $self->{segments}
+ $self->{subtokens}
);
return $self;