# lib/Krawfish/Result/Group.pm - KorAP/Krawfish-prototype - Gitiles

 package Krawfish::Result::Group;
 use Krawfish::Log;
 use strict;
 use warnings;

 use constant DEBUG => 0;

 # TODO: Use Krawfish::Posting::Group;

 # Group matches based on certain criteria, for example:
 # - for record matches
 #   - metadata!
 #   - This is useful to group document matches for corpus browsing!
 # - for span matches
 #   - metadata
 #     - this is an extension to facets, where snippet frequencies are grouped
 #       based on a certain facet.
 #     - having facets in a first step may improve the distributed aggregation
 #       (as the central node then knows which facets are most or least common)
 #     - this grouping doesn't seem beneficial - as the facet view already helps here
 #
 #   - innertextual!
 # - has a certain identical class on surface
 # - has the same starting characters of a word
 # - has the same ending characters of a word
 # - has the same POS of a certain class (this is actually pretty hard!)
 #   - this may mean to modify the search a bit to lift the posting types
 #     and make a class, like [orth=der & base/p=*]
 #   - At least the postingslist of base/p=* should be merged in parallel!
 #
 # This is already possible in C2 so it needs to be implemented!

 # A group has the following structure:
 # {
 #   criterion => [freq, doc_freq]
 # }
 # Where criterion is a classed sequence of criteria
 # with class information, like
 #   1:der|2:Baum => []
 # Sometimes it may indicate tokens instead of classes though ...

 # Construct grouping function
 sub new {
   # Constructor.
   #   $query     - object whose matches are to be grouped
   #   $criterion - group criterion object, created by the caller
   # Any further positional argument (historically an index) is accepted
   # but not used.
   my ($class, $query, $criterion) = @_;

   my %self = (
     query     => $query,
     criterion => $criterion,

     # Cursor into the group list; -1 means "before the first group"
     pos       => -1,

     # Filled by _init with [criterion, freq, doc_freq] entries
     groups    => [],
   );

   return bless \%self, $class;
 };


 # Go through all matches
 # This could, nonetheless, be implemented like Facets ...
 sub _init {
   # Consume the wrapped query once and build the sorted group list.
   #
   # Guarded by $self->{init}, so the query is iterated at most once per
   # object. Each match is mapped to a group key through
   # $criterion->get_group($match); per key the number of matches (freq)
   # and the number of distinct documents (doc_freq) are counted.
   #
   # Result: $self->{groups} holds an array reference of
   # [criterion, freq, doc_freq] entries, ordered by descending freq and,
   # for equal freq, by key, so the order is reproducible.
   #
   # Returns 1 on the first call and nothing on every later call.
   my $self = shift;

   return if $self->{init}++;

   my $criterion = $self->{criterion};
   my $query     = $self->{query};

   # key => [freq, doc_freq]
   my %groups;

   # key => doc_id of the most recent match counted for that key.
   # This has to be tracked per group: with a single shared "last doc_id",
   # a second group matching in the same document would never get that
   # document counted.
   # NOTE(review): assumes matches of one document arrive consecutively
   # (the original single-variable check relied on the same) - confirm.
   my %last_doc;

   # Iterate over all matches
   while ($query->next) {

     # Get current match if there is any
     my $current = $query->current or last;

     my $key    = $criterion->get_group($current);
     my $doc_id = $current->doc_id;

     # Potentially create new group
     my $group = ($groups{$key} //= [0, 0]);

     # Increment freq
     $group->[0]++;

     # Increment doc_freq on the first match of this group in this document
     if (!exists $last_doc{$key} || $last_doc{$key} != $doc_id) {
       $group->[1]++;
       $last_doc{$key} = $doc_id;
     };
   };

   # Value is stored as [criterion, freq, doc_freq]
   # Sorted by freq (descending); ties are broken by key
   my @array = map { [$_, @{ $groups{$_} }] }
     sort { $groups{$b}->[0] <=> $groups{$a}->[0] or $a cmp $b }
     keys %groups;

   # Store for retrieval
   $self->{groups} = \@array;
   return 1;
 };


 sub freq {
   # Number of entries currently held in $self->{groups}.
   # This method does not call _init itself, so it only reflects groups
   # that have already been collected.
   my ($self) = @_;
   return scalar @{ $self->{groups} };
 };

 sub next {
   # Advance the cursor to the next group.
   # Makes sure the groups are collected first, then moves $self->{pos}
   # forward by one (this happens even when the end was already reached).
   # Returns 1 if the new position points at a group, nothing otherwise.
   my ($self) = @_;

   $self->_init;

   my $before     = $self->{pos}++;
   my $last_index = $self->freq - 1;

   return 1 if $before < $last_index;
   return;
 };


 # Forward declaration only: no body for current() is defined in the
 # visible code. The group at the cursor is read through current_group().
 # NOTE(review): calling current() fails unless a definition is supplied
 # elsewhere - confirm whether one is intended.
 sub current;


 # Return a hash reference with information
 sub current_group {
   # Describe the group at the current cursor position.
   # The stored [criterion, freq, doc_freq] entry is flattened and handed
   # to the criterion object's to_hash; whatever that call yields is
   # returned (presumably a hash reference - verify against the criterion
   # class).
   my ($self) = @_;

   # Entry at the cursor
   my $entry = $self->{groups}[ $self->{pos} ];

   my $criterion = $self->{criterion};
   return $criterion->to_hash(@{$entry});
 };


 sub to_string {
   # Serialize as "groupBy(<criterion>:<query>)", using the to_string
   # methods of the criterion and the query objects.
   my ($self) = @_;

   my $criterion = $self->{criterion}->to_string;
   my $query     = $self->{query}->to_string;

   return 'groupBy(' . $criterion . ':' . $query . ')';
 };

 1;
	package Krawfish::Result::Group;
	use Krawfish::Log;
	use strict;
	use warnings;

	use constant DEBUG => 0;

	# TODO: Use Krawfish::Posting::Group;

	# Group matches based on certain criteria, for example:
	# - for record matches
	# - metadata!
	# - This is useful to group document matches for corpus browsing!
	# - for span matches
	# - metadata
	# - this is an extension to facets, where snippet frequencies are grouped
	# based on a certain facet.
	# - having facets in a first step may improve the distributed aggregation
	# (as the central node then knows which facets are most or least common)
	# - this grouping doesn't seem beneficial - as the facet view already helps here
	#
	# - innertextual!
	# - has a certain identical class on surface
	# - has the same starting characters of a word
	# - has the same ending characters of a word
	# - has the same POS of a certain class (this is actually pretty hard!)
	# - this may mean to modify the search a bit to lift the posting types
	# and make a class, like [orth=der & base/p=*]
	# - At least the postingslist of base/p=* should be merged in parallel!
	#
	# This is already possible in C2 so it needs to be implemented!

	# A group has the following structure:
	# {
	# criterion => [freq, doc_freq]
	# }
	# Where criterion is a classed sequence of criteria
	# with class information, like
	# 1:der|2:Baum => []
	# Sometimes it may indicate tokens instead of classes though ...

	# Construct grouping function
	sub new {
	  # Constructor.
	  #   $query     - object whose matches are to be grouped
	  #   $criterion - group criterion object, created by the caller
	  # Any further positional argument (historically an index) is accepted
	  # but not used.
	  my ($class, $query, $criterion) = @_;

	  my %self = (
	    query     => $query,
	    criterion => $criterion,

	    # Cursor into the group list; -1 means "before the first group"
	    pos       => -1,

	    # Filled by _init with [criterion, freq, doc_freq] entries
	    groups    => [],
	  );

	  return bless \%self, $class;
	};


	# Go through all matches
	# This could, nonetheless, be implemented like Facets ...
	sub _init {
	  # Consume the wrapped query once and build the sorted group list.
	  #
	  # Guarded by $self->{init}, so the query is iterated at most once per
	  # object. Each match is mapped to a group key through
	  # $criterion->get_group($match); per key the number of matches (freq)
	  # and the number of distinct documents (doc_freq) are counted.
	  #
	  # Result: $self->{groups} holds an array reference of
	  # [criterion, freq, doc_freq] entries, ordered by descending freq and,
	  # for equal freq, by key, so the order is reproducible.
	  #
	  # Returns 1 on the first call and nothing on every later call.
	  my $self = shift;

	  return if $self->{init}++;

	  my $criterion = $self->{criterion};
	  my $query     = $self->{query};

	  # key => [freq, doc_freq]
	  my %groups;

	  # key => doc_id of the most recent match counted for that key.
	  # This has to be tracked per group: with a single shared "last doc_id",
	  # a second group matching in the same document would never get that
	  # document counted.
	  # NOTE(review): assumes matches of one document arrive consecutively
	  # (the original single-variable check relied on the same) - confirm.
	  my %last_doc;

	  # Iterate over all matches
	  while ($query->next) {

	    # Get current match if there is any
	    my $current = $query->current or last;

	    my $key    = $criterion->get_group($current);
	    my $doc_id = $current->doc_id;

	    # Potentially create new group
	    my $group = ($groups{$key} //= [0, 0]);

	    # Increment freq
	    $group->[0]++;

	    # Increment doc_freq on the first match of this group in this document
	    if (!exists $last_doc{$key} || $last_doc{$key} != $doc_id) {
	      $group->[1]++;
	      $last_doc{$key} = $doc_id;
	    };
	  };

	  # Value is stored as [criterion, freq, doc_freq]
	  # Sorted by freq (descending); ties are broken by key
	  my @array = map { [$_, @{ $groups{$_} }] }
	    sort { $groups{$b}->[0] <=> $groups{$a}->[0] or $a cmp $b }
	    keys %groups;

	  # Store for retrieval
	  $self->{groups} = \@array;
	  return 1;
	};


	sub freq {
	  # Number of entries currently held in $self->{groups}.
	  # This method does not call _init itself, so it only reflects groups
	  # that have already been collected.
	  my ($self) = @_;
	  return scalar @{ $self->{groups} };
	};

	sub next {
	  # Advance the cursor to the next group.
	  # Makes sure the groups are collected first, then moves $self->{pos}
	  # forward by one (this happens even when the end was already reached).
	  # Returns 1 if the new position points at a group, nothing otherwise.
	  my ($self) = @_;

	  $self->_init;

	  my $before     = $self->{pos}++;
	  my $last_index = $self->freq - 1;

	  return 1 if $before < $last_index;
	  return;
	};


	# Forward declaration only: no body for current() is defined in the
	# visible code. The group at the cursor is read through current_group().
	# NOTE(review): calling current() fails unless a definition is supplied
	# elsewhere - confirm whether one is intended.
	sub current;


	# Return a hash reference with information
	sub current_group {
	  # Describe the group at the current cursor position.
	  # The stored [criterion, freq, doc_freq] entry is flattened and handed
	  # to the criterion object's to_hash; whatever that call yields is
	  # returned (presumably a hash reference - verify against the criterion
	  # class).
	  my ($self) = @_;

	  # Entry at the cursor
	  my $entry = $self->{groups}[ $self->{pos} ];

	  my $criterion = $self->{criterion};
	  return $criterion->to_hash(@{$entry});
	};


	sub to_string {
	  # Serialize as "groupBy(<criterion>:<query>)", using the to_string
	  # methods of the criterion and the query objects.
	  my ($self) = @_;

	  my $criterion = $self->{criterion}->to_string;
	  my $query     = $self->{query}->to_string;

	  return 'groupBy(' . $criterion . ':' . $query . ')';
	};

	1;