lib/Krawfish/Index/Subtokens.pm - KorAP/Krawfish-prototype - Gitiles

 package Krawfish::Index::Subtokens;
 use Krawfish::Log;
 use strict;
 use warnings;

 # See Krawfish::Index::Tokens

 # There is only one subtoken list for the documents

 # TODO:
 #   With a forward index, the subtokens offsets will no longer
 #   point to character positions in the primary text but to
 #   subtoken positions in the forward index!

 # The Subtokens list (not different for different tokenizations)
 # has the following job:
 #
 # * Return forward index offsets for a certain subtoken
 #   (for the current forward index implementation, only the
 #    start offset is necessary)
 #   API: ->get_offset($doc_id, $pos)
 #
 # * Get the term_id at a certain position
 #   API: ->get_term_id($doc_id, $pos)
 #
 #   IDEA: This will get the offset and look into the forward index
 #         for the $term_id. Or it will directly store the $term_id
 #         in the stream.
 #
 # * Get the surface form from the dictionary as fast as possible
 #   This will first find the $term_ids in the forward index and resolve them
 #   (potentially).
 #   API: ->get_surface($doc_id, $pos)
 #        ->get_surface($doc_id, $pos, $length)
 #
 # * Get the prefix and suffix rank of the surface form for fast
 #   sorting. All terms should be preranked in prefix and suffix order
 #   for the standard collation.
 #   API: ->get_prefix_rank($doc_id, $pos)
 #        ->get_suffix_rank($doc_id, $pos)
 #
 #   IDEA:  It would be nice to have the ranks being stored in the dictionary
 #          to avoid redundancy. So this would be implemented as first
 #          looking up the $term_id in the forward_index, then retrieving
 #          the rank based on the $term_id in the dictionary.
 #          As these sorting parts of the dictionary are not necessary all the time,
 #          they may not need to reside in memory all the time.
 #
 #   REQUIREMENT: For on-the-fly sorting it may be beneficial to have a fast
 #                incremental ->next() and ->skip_to() like method.
 #                For finding the offset for a match, a get() should
 #                be allowed to be slow.
 #                A good argument for fast on-the-fly-sorting is also
 #                grouping, which needs fast access to the term_ids, so
 #                they may need to be stored redundantly
 #
 #   DATASTRUCTURE: [doc_id]([delta_varint_offset][term_id_varint])*
 #                  Augmented with a SkipList.
 #
 # TODO:
 #   This may be implemented using a postings list, but inside positions,
 #   it should be possible to move backwards as well.
 #   The tokens structure may be augmented with a skiplist
 #   and be a highly optimized position encoding.
 #
 #   It should also contain information about the first two characters
 #   of a term and possibly the last two characters, necessary to bucket sort terms.
 #   The characters are stored as UTF-8 or similar -
 #   it may be beneficial to have the most common characters need the least
 #   bits.
 #   Note that this information needs to store characters and not
 #   bytes, as bytes may not be helpful for sorting!
 #   This may as well use a prefix_rank and a suffix_rank for bucket-sorting.
 #
 #   In addition, the surface term_id needs to be accessible fastly!
 #   TODO: Term-IDs may be better stored in a separate file, to keep the file small.


 use constant DEBUG => 0;


 # Constructor
 sub new {
   my $class = shift;
   bless {
     file => shift,

     # Define, how many start characters will be stored
     # This is useful for alphabetic sorting
     # start_char_length => shift // 2,

     # Define, how many start characters will be stored
     # This is useful for alphabetic sorting
     # end_char_length => shift // 2,

     array => [],
     pos => -1,
   }, $class;
 };


 # TODO: Better store length ...
 # Store offsets
 sub store {
   my $self = shift;

   # Get data to store per segment
   my ($doc_id, $subtoken, $start_char, $end_char, $subterm_id, $subterm) = @_;

   # TODO:
   #   THIS IS PROBABLY NOT NECESSARY!
   if ($subterm) {
     # Get the first and last characters of the term
     my ($first, $last) = (substr($subterm, 0, 2), scalar reverse substr($subterm, -2));

     # Store all segments
     $self->{$doc_id . '#' . $subtoken} = [$start_char, $end_char, $subterm_id, $first, $last];

     if (DEBUG) {
       print_log('stokens', "Store subtoken at [$doc_id,$subtoken]");
       print_log('stokens', '  with ' . join(','),@{$self->{$doc_id . '#' . $subtoken}});
     };
   }

   # Temporary
   else {

     # Store all segments
     $self->{$doc_id . '#' . $subtoken} = [$start_char, $end_char];
   }

   return $self;
 };


 # Get offsets
 # TODO: Support caching!
 sub get {
   my $self = shift;
   my ($doc_id, $segment) = @_;
   return $self->{$doc_id . '#' . $segment};
 };


 sub append {
   my $self = shift;
   my ($token, $doc_id, $pos, $end) = @_;
   if (DEBUG) {
     print_log('toklist', "Appended $token with $doc_id, $pos" . ($end ? "-$end" : ''));
   };
   push(@{$self->{array}}, [$doc_id, $pos, $end]);
 };


 sub next;


 sub pos {
   return $_[0]->{pos};
 };


 sub token {
   return $_[0]->{array}->[$_[0]->pos];
 };


 sub freq;


 sub skip_to_doc;


 sub skip_to_pos;


 # This may require random access for sorting based on term ids
 sub get_term_id {
   my $self = shift;
   my ($doc_id, $pos) = @_;
   ...
 };


 # This may require random access for sorting based on term ids
 sub get_term_ids {
   my $self = shift;
   my ($doc_id, $start, $end) = @_;
   ...
 };


 1;
	package Krawfish::Index::Subtokens;
	use Krawfish::Log;
	use strict;
	use warnings;

	# See Krawfish::Index::Tokens

	# There is only one subtoken list for the documents

	# TODO:
	# With a forward index, the subtokens offsets will no longer
	# point to character positions in the primary text but to
	# subtoken positions in the forward index!

	# The Subtokens list (not different for different tokenizations)
	# has the following job:
	#
	# * Return forward index offsets for a certain subtoken
	# (for the current forward index implementation, only the
	# start offset is necessary)
	# API: ->get_offset($doc_id, $pos)
	#
	# * Get the term_id at a certain position
	# API: ->get_term_id($doc_id, $pos)
	#
	# IDEA: This will get the offset and look into the forward index
	# for the $term_id. Or it will directly store the $term_id
	# in the stream.
	#
	# * Get the surface form from the dictionary as fast as possible
	# This will first find the $term_ids in the forward index and resolve them
	# (potentially).
	# API: ->get_surface($doc_id, $pos)
	# ->get_surface($doc_id, $pos, $length)
	#
	# * Get the prefix and suffix rank of the surface form for fast
	# sorting. All terms should be preranked in prefix and suffix order
	# for the standard collation.
	# API: ->get_prefix_rank($doc_id, $pos)
	# ->get_suffix_rank($doc_id, $pos)
	#
	# IDEA: It would be nice to have the ranks being stored in the dictionary
	# to avoid redundancy. So this would be implemented as first
	# looking up the $term_id in the forward_index, then retrieving
	# the rank based on the $term_id in the dictionary.
	# As these sorting parts of the dictionary are not necessary all the time,
	# they may not need to reside in memory all the time.
	#
	# REQUIREMENT: For on-the-fly sorting it may be beneficial to have a fast
	# incremental ->next() and ->skip_to() like method.
	# For finding the offset for a match, a get() should
	# be allowed to be slow.
	# A good argument for fast on-the-fly-sorting is also
	# grouping, which needs fast access to the term_ids, so
	# they may need to be stored redundantly
	#
	# DATASTRUCTURE: [doc_id]([delta_varint_offset][term_id_varint])*
	# Augmented with a SkipList.
	#
	# TODO:
	# This may be implemented using a postings list, but inside positions,
	# it should be possible to move backwards as well.
	# The tokens structure may be augmented with a skiplist
	# and be a highly optimized position encoding.
	#
	# It should also contain information about the first two characters
	# of a term and possibly the last two characters, necessary to bucket sort terms.
	# The characters are stored as UTF-8 or similar -
	# it may be beneficial to have the most common characters need the least
	# bits.
	# Note that this information needs to store characters and not
	# bytes, as bytes may not be helpful for sorting!
	# This may as well use a prefix_rank and a suffix_rank for bucket-sorting.
	#
	# In addition, the surface term_id needs to be accessible fastly!
	# TODO: Term-IDs may be better stored in a separate file, to keep the file small.


	use constant DEBUG => 0;


	# Constructor
	sub new {
	my $class = shift;
	bless {
	file => shift,

	# Define, how many start characters will be stored
	# This is useful for alphabetic sorting
	# start_char_length => shift // 2,

	# Define, how many start characters will be stored
	# This is useful for alphabetic sorting
	# end_char_length => shift // 2,

	array => [],
	pos => -1,
	}, $class;
	};


	# TODO: Better store length ...
	# Store offsets
	sub store {
	my $self = shift;

	# Get data to store per segment
	my ($doc_id, $subtoken, $start_char, $end_char, $subterm_id, $subterm) = @_;

	# TODO:
	# THIS IS PROBABLY NOT NECESSARY!
	if ($subterm) {
	# Get the first and last characters of the term
	my ($first, $last) = (substr($subterm, 0, 2), scalar reverse substr($subterm, -2));

	# Store all segments
	$self->{$doc_id . '#' . $subtoken} = [$start_char, $end_char, $subterm_id, $first, $last];

	if (DEBUG) {
	print_log('stokens', "Store subtoken at [$doc_id,$subtoken]");
	print_log('stokens', ' with ' . join(','),@{$self->{$doc_id . '#' . $subtoken}});
	};
	}

	# Temporary
	else {

	# Store all segments
	$self->{$doc_id . '#' . $subtoken} = [$start_char, $end_char];
	}

	return $self;
	};


	# Get offsets
	# TODO: Support caching!
	sub get {
	my $self = shift;
	my ($doc_id, $segment) = @_;
	return $self->{$doc_id . '#' . $segment};
	};


	sub append {
	my $self = shift;
	my ($token, $doc_id, $pos, $end) = @_;
	if (DEBUG) {
	print_log('toklist', "Appended $token with $doc_id, $pos" . ($end ? "-$end" : ''));
	};
	push(@{$self->{array}}, [$doc_id, $pos, $end]);
	};


	sub next;


	sub pos {
	return $_[0]->{pos};
	};


	sub token {
	return $_[0]->{array}->[$_[0]->pos];
	};


	sub freq;


	sub skip_to_doc;


	sub skip_to_pos;


	# This may require random access for sorting based on term ids
	sub get_term_id {
	my $self = shift;
	my ($doc_id, $pos) = @_;
	...
	};


	# This may require random access for sorting based on term ids
	sub get_term_ids {
	my $self = shift;
	my ($doc_id, $start, $end) = @_;
	...
	};


	1;