blob: 8b4d9458aaa9054945b77f07eb29d8370db8b95f [file] [log] [blame]
package Krawfish::Index::Store::V1::Fields;
use parent 'Krawfish::Index::Store::V1::Stream';
use Krawfish::Index::Store::V1::Util qw/enc_string dec_string/;
use Krawfish::Log;
use strict;
use warnings;
# TODO:
# field names should have term_ids
# all values should be stored in a sequential order
# augmented by a skip list.
#
# [i:doc_id][i:doc_field_length]([i:field_id][i:value_length][str:value])*
#
# The fields are stored in ascending field_id order,
# so its fast to find the correct value.
# The first bit of the field length may indicate,
# if the field is a string or a numerical value.
#
# This may also have a next() and skip_doc() API
# to move to the expected document in a sequential way,
# which may be the case for Aggregate::Values. (Although,
# this may be better to be stored in a different mechanism.)
# In that case, a pointer mechanism is required.
# Another good use-case is the fast collection of text siglen
# for the virtualcorpus->textsiglen-vector method.
# Tie to a file
sub new {
my ($class, $file, $dict) = @_;
bless {
file => $file,
dictionary => $dict
}, $class;
};
# Store information on a document
# The doc_id needs to be greater than the last doc_id
sub store {
my $self = shift;
my $doc_id = shift;
# Expected structure is:
# field_id => str
my %raw_fields = @_;
my %fields = ();
# Translate field names to term_ids
foreach (keys %raw_fields) {
$fields{$self->{dict}->term_id_by_term($_)} = $raw_fields{$_};
};
my $bytes = '';
# Sort term_ids numerical
foreach (sort keys %fields) {
$bytes .= $_;
my $value = enc_string $value;
$bytes .= length($value);
$bytes .= $value;
};
# Append byte to stream
$self->_append($doc_id, $bytes);
};
sub get_fields {
my $self = shift;
my $doc_id = shift;
my $current = $self->skip_doc($doc_id);
};
1;