# lib/Krawfish/Index/Store/V1/Fields.pm - KorAP/Krawfish-prototype - Gitiles

 package Krawfish::Index::Store::V1::Fields;
 use parent 'Krawfish::Index::Store::V1::Stream';
 use Krawfish::Index::Store::V1::Util qw/enc_string dec_string/;
 use Krawfish::Log;
 use strict;
 use warnings;

 # TODO:
 #   field names should have term_ids
 #   all values should be stored in a sequential order
 #   augmented by a skip list.
 #
 #   [i:doc_id][i:doc_field_length]([i:field_id][i:value_length][str:value])*
 #
 #   The fields are stored in ascending field_id order,
 #   so it's fast to find the correct value.
 #   The first bit of the field length may indicate,
 #   if the field is a string or a numerical value.
 #
 #   This may also have a next() and skip_doc() API
 #   to move to the expected document in a sequential way,
 #   which may be the case for Aggregate::Values. (Although,
 #   this may be better to be stored in a different mechanism.)
 #   In that case, a pointer mechanism is required.
 #   Another good use-case is the fast collection of text siglen
 #   for the virtualcorpus->textsiglen-vector method.

 # Constructor - tie the object to a file.
 #
 # Arguments:
 #   $file - the file to tie to (kept under the 'file' key)
 #   $dict - the dictionary (kept under the 'dictionary' key)
 #
 # Returns the new object blessed into the invoking class.
 sub new {
   my $class = shift;
   my ($file, $dict) = @_;

   my %self = (
     file       => $file,
     dictionary => $dict
   );

   return bless \%self, $class;
 };


 # Store information on a document.
 # The doc_id needs to be greater than the last doc_id.
 #
 # Arguments:
 #   $doc_id - the id of the document
 #   @_      - a list of (field name => string value) pairs
 #
 # The field names are translated to term_ids using the dictionary
 # passed to new(). The values are serialized in ascending term_id
 # order and the result is handed to _append().
 #
 # Returns whatever _append() returns.
 sub store {
   my $self = shift;
   my $doc_id = shift;

   # Expected structure is:
   # field name => str
   my %raw_fields = @_;
   my %fields = ();

   # The constructor keeps the dictionary under the 'dictionary' key
   my $dict = $self->{dictionary};

   # Translate field names to term_ids
   foreach my $name (keys %raw_fields) {
     $fields{$dict->term_id_by_term($name)} = $raw_fields{$name};
   };

   my $bytes = '';

   # Sort term_ids numerically - a plain sort compares the ids
   # as strings and would place e.g. 10 before 9
   foreach my $field_id (sort { $a <=> $b } keys %fields) {

     # NOTE(review): the field id and the value length are appended
     # as decimal strings, not as fixed-width integers like the
     # format description suggests - confirm the intended encoding.
     $bytes .= $field_id;

     # Encode the value belonging to this field
     my $value = enc_string($fields{$field_id});
     $bytes .= length($value);
     $bytes .= $value;
   };

   # Append bytes to stream
   $self->_append($doc_id, $bytes);
 };


 # Move to the document with the given doc_id using skip_doc()
 # and return the value skip_doc() yields.
 #
 # Arguments:
 #   $doc_id - the id of the requested document
 sub get_fields {
   my ($self, $doc_id) = @_;

   # skip_doc() is evaluated in scalar context
   return scalar $self->skip_doc($doc_id);
 };

 1;
	package Krawfish::Index::Store::V1::Fields;
	use parent 'Krawfish::Index::Store::V1::Stream';
	use Krawfish::Index::Store::V1::Util qw/enc_string dec_string/;
	use Krawfish::Log;
	use strict;
	use warnings;

	# TODO:
	# field names should have term_ids
	# all values should be stored in a sequential order
	# augmented by a skip list.
	#
	# [i:doc_id][i:doc_field_length]([i:field_id][i:value_length][str:value])*
	#
	# The fields are stored in ascending field_id order,
	# so it's fast to find the correct value.
	# The first bit of the field length may indicate,
	# if the field is a string or a numerical value.
	#
	# This may also have a next() and skip_doc() API
	# to move to the expected document in a sequential way,
	# which may be the case for Aggregate::Values. (Although,
	# this may be better to be stored in a different mechanism.)
	# In that case, a pointer mechanism is required.
	# Another good use-case is the fast collection of text siglen
	# for the virtualcorpus->textsiglen-vector method.

	# Constructor - tie the object to a file.
	#
	# Arguments:
	#   $file - the file to tie to (kept under the 'file' key)
	#   $dict - the dictionary (kept under the 'dictionary' key)
	#
	# Returns the new object blessed into the invoking class.
	sub new {
	  my $class = shift;
	  my ($file, $dict) = @_;

	  my %self = (
	    file       => $file,
	    dictionary => $dict
	  );

	  return bless \%self, $class;
	};


	# Store information on a document.
	# The doc_id needs to be greater than the last doc_id.
	#
	# Arguments:
	#   $doc_id - the id of the document
	#   @_      - a list of (field name => string value) pairs
	#
	# The field names are translated to term_ids using the dictionary
	# passed to new(). The values are serialized in ascending term_id
	# order and the result is handed to _append().
	#
	# Returns whatever _append() returns.
	sub store {
	  my $self = shift;
	  my $doc_id = shift;

	  # Expected structure is:
	  # field name => str
	  my %raw_fields = @_;
	  my %fields = ();

	  # The constructor keeps the dictionary under the 'dictionary' key
	  my $dict = $self->{dictionary};

	  # Translate field names to term_ids
	  foreach my $name (keys %raw_fields) {
	    $fields{$dict->term_id_by_term($name)} = $raw_fields{$name};
	  };

	  my $bytes = '';

	  # Sort term_ids numerically - a plain sort compares the ids
	  # as strings and would place e.g. 10 before 9
	  foreach my $field_id (sort { $a <=> $b } keys %fields) {

	    # NOTE(review): the field id and the value length are appended
	    # as decimal strings, not as fixed-width integers like the
	    # format description suggests - confirm the intended encoding.
	    $bytes .= $field_id;

	    # Encode the value belonging to this field
	    my $value = enc_string($fields{$field_id});
	    $bytes .= length($value);
	    $bytes .= $value;
	  };

	  # Append bytes to stream
	  $self->_append($doc_id, $bytes);
	};


	# Move to the document with the given doc_id using skip_doc()
	# and return the value skip_doc() yields.
	#
	# Arguments:
	#   $doc_id - the id of the requested document
	sub get_fields {
	  my ($self, $doc_id) = @_;

	  # skip_doc() is evaluated in scalar context
	  return scalar $self->skip_doc($doc_id);
	};

	1;