blob: 8f6c64543f9a40e23f1c44ef15076dc113177076 [file] [log] [blame]
package Krawfish::Koral::Result::Aggregate::Fields;
use strict;
use warnings;
use Role::Tiny::With;
use Krawfish::Log;
use Krawfish::Util::Bits;
use Krawfish::Util::Constants qw/:PREFIX/;
with 'Krawfish::Koral::Result::Inflatable';
with 'Krawfish::Koral::Result::Aggregate';
# This remembers facets for multiple classes,
# both using ids and terms
# TODO:
# The first field level should be initiated
# beforehand, so it is not necessary to check
# this level on each doc.
# TODO:
# This should accept an order field, to reconstruct the requested
# field order after aggregation!
# TODO:
# It may be beneficial to deal with Koral::Type here,
# so inflate() would be an action directly done in Koral::Type
# TODO:
# Rename stringifications to aFields!
# TODO:
# Support flags in constructor
use constant DEBUG => 0;
# Constructor
sub new {
my $class = shift;
bless {
flags => shift,
fields => {},
cache => [],
freq => 0
}, $class;
};
# Increment the field frequency for each field in the current doc
sub incr_doc {
my ($self, $key_id, $field_id, $flags) = @_;
# TODO:
# It may be easier to create the structure
# in flush(), similar to K::R::Group::Fields
# Structure is
# {
# field1 => {
# key1 => {
# flag1 => [0,0],
# flag2 => [0,0]
# }
# key2 => {
# flag1 => [0,0],
# flag2 => [0,0]
# }
# },
# field2 => {
# key1 => {
# flag1 => [0,0],
# flag2 => [0,0]
# }
# }
# }
# Get fields per flags
my $fields = $self->{fields};
# Field may already exist
my $field_key_freq = ($fields->{$key_id} //= {});
# Initialize frequency
my $field_freq = ($field_key_freq->{$field_id} //= {});
# Initialize frequency
my $field_flag_freq = ($field_freq->{$flags} //= [0,0]);
if (DEBUG) {
print_log(
'p_a_fields',
"Increment doc frequency for #" . $key_id . '=#' . "$field_id for flag $flags"
);
};
# Remember the frequency
# The problem here is, that they are only loosely coupled to the field
# frequency of the field. This may be problematic
push @{$self->{cache}}, $field_flag_freq;
};
# Increment the field frequency for each field per match
sub incr_match {
$_[0]->{freq}++;
if (DEBUG) {
print_log('p_a_fields', 'Increment match frequency');
};
};
# Flush all frequency information remembered
# per document
sub flush {
my $self = shift;
if ($self->{freq}) {
# Iterate over cached frequency object
foreach my $field_flag_freq (@{$self->{cache}}) {
if (DEBUG) {
print_log(
'p_a_fields',
'Increase frequency on cached references '.
'with ' . $self->{freq}
);
};
# Increment doc freq
$field_flag_freq->[0]++;
# Increment match freq
$field_flag_freq->[1] += $self->{freq};
};
$self->{freq} = 0;
$self->{cache} = [];
if (DEBUG) {
print_log('p_a_fields', 'Flush field frequency for all remembered frequencies');
};
};
};
# On finish flush the cache
sub on_finish {
$_[0]->flush;
$_[0];
};
# Merge aggregation data
sub merge {
my ($self, $aggr) = @_;
foreach my $field (keys %{$aggr->{fields}}) {
$self->{fields}->{$field} //= {};
foreach my $key (keys %{$aggr->{fields}->{$field}}) {
my $flags = $aggr->{fields}->{$field}->{$key};
my $self_flags = ($self->{fields}->{$field}->{$key} //= {});
foreach my $flag (keys %$flags) {
$self_flags->{$flag} //= [0,0];
$self_flags->{$flag}->[0] += $flags->{$flag}->[0];
$self_flags->{$flag}->[1] += $flags->{$flag}->[1];
}
};
};
};
# Translate this to terms
sub inflate {
my ($self, $dict) = @_;
# Get fields
my $fields = $self->{fields};
my %fields = ();
# Iterate over field identifier
foreach my $field_id (keys %$fields) {
# Request the term from the dictionary
my $field_term = $dict->term_by_term_id($field_id);
# Remove the term marker
# TODO:
# this may be a direct feature of the dictionary instead
$field_term = substr($field_term,1); # ~ s/^!//;
my $aggr = ($fields{$field_term} //= {});
# Get facets for field
my $values = $fields->{$field_id};
foreach my $value (keys %$values) {
# Get the field term
my $field = $dict->term_by_term_id($value);
# Remove the first character
# TODO:
# This may be a direct feature of the dictionary instead
$field =~ s/^.$field_term://;
# These are flag sorted values!
$aggr->{$field} = $values->{$value};
};
};
$self->{fields_terms} = \%fields;
$self;
};
# Generate class ordering
sub _to_classes {
my $self = shift;
# Order field terms by classes
# Doing this beforehand on_finish would be costly
my @classes;
my $fields = $self->{fields_terms};
# Iterate over fields
foreach my $key (keys %$fields) {
# Iterate over field values
foreach my $field (keys %{$fields->{$key}}) {
# Iterate over flags
my $freqs = $fields->{$key}->{$field};
foreach my $flag (keys %$freqs) {
# Iterate over classes
foreach my $class (flags_to_classes($flag)) {
# Store all data below class information
$classes[$class] //= {};
my $key = ($classes[$class]->{$key} //= {});
my $field = ($key->{$field} //= [0,0]);
$field->[0] += $freqs->{$flag}->[0];
$field->[1] += $freqs->{$flag}->[1];
};
};
};
};
return \@classes;
};
# Stringification
sub to_string {
my ($self, $ids) = @_;
my $str = '[fields=';
# IDs not supported
if ($ids) {
# warn 'ID based stringification currently not supported';
return $str . '#?]';
};
# No terms yet
unless ($self->{fields_terms}) {
# warn 'ID based stringification currently not supported';
return $str . '#?]';
};
my @classes = @{$self->_to_classes};
my $first = 0;
foreach (my $i = 0; $i < @classes; $i++) {
$str .= $i == 0 ? 'total' : 'inCorpus-' . $i;
$str .= ':[';
my $fields = $classes[$i];
foreach my $field (sort keys %$fields) {
$str .= $field . '=';
my $values = $fields->{$field};
foreach (sort keys %$values) {
$str .= $_ . ':';
my $freq = $values->{$_};
$str .= '['.$freq->[0].','.$freq->[1].'],';
};
chop $str;
$str .= ';';
};
chop $str;
$str .= '];';
};
chop $str;
return $str . ']';
};
# Key to add to KoralQuery
sub key {
'fields';
};
# Serialize to KQ
sub to_koral_fragment {
my $self = shift;
my $aggr = {
'@type' => 'koral:aggregation',
'aggregation' => 'aggregation:fields'
};
# No terms yet
unless ($self->{fields_terms}) {
warn 'ID based stringification currently not supported';
return;
};
# TODO:
# Use order field to recreate initial field order!
my @classes = @{$self->_to_classes};
my $first = 0;
foreach (my $i = 0; $i < @classes; $i++) {
my $per_field = $aggr->{$i == 0 ? 'total' : 'inCorpus-' . $i} = {};
my $fields = $classes[$i];
# Store per field
foreach my $field (sort keys %$fields) {
# Get values
my $values = $fields->{$field};
# Iterate over values
foreach (keys %$values) {
my $freq = $values->{$_};
my $per_value = $per_field->{$field} //= {};
$per_value->{$_} = {
docs => $freq->[0],
matches => $freq->[1]
};
};
};
};
return $aggr;
};
1;
__END__