# blob: 4bdce0f01fa816d27dcd163a272af995e7be4c56 [file] [log] [blame]
package Krawfish::Result::Group;
use Krawfish::Log;
use strict;
use warnings;
use constant DEBUG => 0;
# TODO: Use Krawfish::Posting::Group;
# Group matches based on certain criteria, for example:
# - for record matches
# - metadata!
# - This is useful to group document matches for corpus browsing!
# - for span matches
# - metadata
# - this is an extension to facets, where snippet frequencies are grouped
# based on a certain facet.
# - having facets in a first step may improve the distributed aggregation
# (as the central node then knows which facets are most or least common)
# - this grouping doesn't seem beneficial - as the facet view already helps here
#
# - innertextual!
# - has a certain identical class on surface
# - has the same starting characters of a word
# - has the same ending characters of a word
# - has the same POS of a certain class (this is actually pretty hard!)
# - this may mean to modify the search a bit to lift the posting types
# and make a class, like [orth=der & base/p=*]
# - At least the postingslist of base/p=* should be merged in parallel!
#
# This is already possible in C2 so it needs to be implemented!
# A group has the following structure:
# {
# criterion => [freq, doc_freq]
# }
# Where criterion is a classed sequence of criteria
# with class information, like
# 1:der|2:Baum => []
# Sometimes it may indicate tokens instead of classes though ...
# Construct grouping function
# Create a grouping result that wraps a query and a group criterion.
#
# Parameters:
#   $query     - object whose matches will be grouped (iterated in _init)
#   $criterion - group criterion object, created outside, that maps a
#                match to its group
#   $index     - accepted for interface compatibility; not stored here
#
# Returns the blessed object. No matches are consumed at construction.
sub new {
  my ($class, $query, $criterion, $index) = @_;

  my %self = (
    query     => $query,
    criterion => $criterion,

    # Cursor into the group list; -1 means "before the first group"
    pos       => -1,

    # Filled on first iteration with one entry per group
    groups    => []
  );

  return bless \%self, $class;
};
# Go through all matches
# This could, nonetheless, be implemented like Facets ...
# Consume the wrapped query once and build the sorted list of groups.
#
# Every match of the query is mapped to a group key by the criterion's
# get_group method. For each key two counters are kept:
#   freq     - number of matches in the group
#   doc_freq - number of documents contributing at least one match
#
# The result is stored in $self->{groups} as a list of
#   [criterion, freq, doc_freq]
# sorted by descending freq; groups with equal freq are ordered by
# criterion key so the order is reproducible between runs.
#
# Returns 1 on the first call and nothing on every later call.
sub _init {
  my $self = shift;

  # Only the first call does any work
  return if $self->{init}++;

  my $criterion = $self->{criterion};
  my $query = $self->{query};

  # Group key => [freq, doc_freq]
  my %groups = ();

  # Group key => doc_id of the last match counted for that group.
  # The document change has to be tracked per group: with a single
  # shared marker, a group whose first match in a document follows
  # a match of another group would never be credited for that document.
  # Assumes the matches of one document arrive consecutively
  # (as the shared marker did before) - TODO confirm against the query API
  my %last_doc_id = ();

  # Iterate over all matches
  while ($query->next) {

    # Get current match if there is any
    my $current = $query->current or last;

    # Potentially create new group
    my $key = $criterion->get_group($current);
    my $group = ($groups{$key} //= [0,0]);

    # Increment freq
    $group->[0]++;

    my $doc_id = $current->doc_id;
    if (!exists $last_doc_id{$key} || $last_doc_id{$key} != $doc_id) {

      # Increment doc_freq
      $group->[1]++;
      $last_doc_id{$key} = $doc_id;
    };
  };

  # Value is stored as [criterion, freq, doc_freq]
  # Sorted by freq by default, ties resolved by criterion key
  my @array = map {
    [$_, $groups{$_}->[0], $groups{$_}->[1]]
  } sort {
    $groups{$b}->[0] <=> $groups{$a}->[0] or $a cmp $b
  } keys %groups;

  # Store for retrieval
  $self->{groups} = \@array;
  return 1;
};
# Number of groups currently stored in the object.
#
# This only counts what is in $self->{groups}; it does not trigger
# the collection of matches itself.
sub freq {
  my ($self) = @_;
  my $groups = $self->{groups};
  return scalar @$groups;
};
# Advance the cursor to the next group.
#
# Makes sure the groups have been collected, then moves the position
# forward by one. Returns 1 while the position before the move was in
# front of the last group, and nothing otherwise. The position is
# advanced on every call, including calls after the last group.
sub next {
  my ($self) = @_;

  # Collect the groups on first use
  $self->_init;

  # Remember the old position and move on unconditionally
  my $previous = $self->{pos}++;
  my $last_index = $self->freq - 1;

  return 1 if $previous < $last_index;
  return;
};
# Forward declaration only: no body for current is given at this point,
# so calling it fails at runtime unless a definition is supplied elsewhere.
# NOTE(review): presumably a placeholder mirroring the query iterator
# interface; current_group is the accessor that is implemented - confirm.
sub current;
# Return a hash reference with information
# Describe the group at the current cursor position.
#
# Looks up the entry at $self->{pos} in the group list and passes its
# elements - stored as (criterion, freq, doc_freq) - to the criterion's
# to_hash method, whose return value is handed back unchanged.
sub current_group {
  my ($self) = @_;

  my $criterion = $self->{criterion};
  my $entry = $self->{groups}->[ $self->{pos} ];

  # Let the criterion turn the raw entry into its hash form
  return $criterion->to_hash(@{$entry});
};
# Serialize the grouping as
#   groupBy(<criterion>:<query>)
# using the to_string methods of the criterion and the wrapped query.
sub to_string {
  my ($self) = @_;

  # Stringify the criterion first, then the query
  my $criterion_string = $self->{criterion}->to_string;
  my $query_string = $self->{query}->to_string;

  return 'groupBy(' . $criterion_string . ':' . $query_string . ')';
};
1;