Move aggregation methods to Result::Segment
diff --git a/lib/Krawfish/Result/Segment/Aggregate.pm b/lib/Krawfish/Result/Segment/Aggregate.pm
new file mode 100644
index 0000000..1df6c08
--- /dev/null
+++ b/lib/Krawfish/Result/Segment/Aggregate.pm
@@ -0,0 +1,98 @@
+package Krawfish::Result::Segment::Aggregate;
+use parent 'Krawfish::Result';
+# use Hash::Merge qw( merge );
+use strict;
+use warnings;
+
+use constant DEBUG => 0;
+
+# Aggregate values of matches per document and
+# per match.
+
+# TODO:
+# See https://www.elastic.co/guide/en/elasticsearch/reference/current/search-aggregations.html
+
+
+# TODO: Sort all ops for each_match and each_doc support
+sub new {
+ my $class = shift;
+ my $result = {};
+ return bless {
+ last_doc_id => -1,
+ query => shift,
+ ops => shift,
+ result => $result,
+ last_doc_id => -1,
+ finished => 0
+ }, $class;
+};
+
+sub result {
+ $_[0]->{result};
+};
+
+
+# Iterate to the next result
+sub next {
+ my $self = shift;
+
+ # Get container object
+ my $result = $self->result;
+
+ # There is a next match
+ if ($self->{query}->next) {
+
+ # Get the current posting
+ my $current = $self->{query}->current;
+
+ if ($current->doc_id != $self->{last_doc_id}) {
+
+ # Collect data of current operation
+ foreach (@{$self->{ops}}) {
+ $_->each_doc($current, $result);
+ };
+
+ # Set last doc to current doc
+ $self->{last_doc_id} = $current->doc_id;
+ };
+
+ # Collect data of current operation
+ foreach (@{$self->{ops}}) {
+ $_->each_match($current, $result);
+ };
+
+ return 1;
+ };
+
+ # Release on_finish event
+ unless ($self->{finished}) {
+ foreach (@{$self->{ops}}) {
+ $_->on_finish($result);
+ };
+ $self->{finished} = 1;
+ };
+
+ return 0;
+};
+
+
+sub current {
+ return $_[0]->{query}->current;
+};
+
+
+sub to_string {
+ my $self = shift;
+ my $str = 'aggregate(';
+ $str .= '[' . join(',', map { $_->to_string } @{$self->{ops}}) . ']:';
+ $str .= $self->{query}->to_string;
+ return $str . ')';
+};
+
+# Shorthand for "search through"
+sub finalize {
+ while ($_[0]->next) {};
+ return 1;
+};
+
+1;
diff --git a/lib/Krawfish/Result/Segment/Aggregate/Base.pm b/lib/Krawfish/Result/Segment/Aggregate/Base.pm
new file mode 100644
index 0000000..f8e43da
--- /dev/null
+++ b/lib/Krawfish/Result/Segment/Aggregate/Base.pm
@@ -0,0 +1,20 @@
+package Krawfish::Result::Segment::Aggregate::Base;
+use strict;
+use warnings;
+
+sub new {
+ my $class = shift;
+ bless \(my $self = ''), $class;
+};
+
+sub each_doc {};
+
+sub each_match {};
+
+sub on_finish {};
+
+sub to_string {
+ ...
+};
+
+1;
diff --git a/lib/Krawfish/Result/Segment/Aggregate/Classes.pm b/lib/Krawfish/Result/Segment/Aggregate/Classes.pm
new file mode 100644
index 0000000..ebce6e8
--- /dev/null
+++ b/lib/Krawfish/Result/Segment/Aggregate/Classes.pm
@@ -0,0 +1,2 @@
+# Count frequencies on class occurrences
+# Will check all matches and count frequencies per class
diff --git a/lib/Krawfish/Result/Segment/Aggregate/Content.pm b/lib/Krawfish/Result/Segment/Aggregate/Content.pm
new file mode 100644
index 0000000..7f3b54f
--- /dev/null
+++ b/lib/Krawfish/Result/Segment/Aggregate/Content.pm
@@ -0,0 +1,4 @@
+# Aggregate by content information, for example,
+# based on a certain class
+#
+# Aggregate on rank!
diff --git a/lib/Krawfish/Result/Segment/Aggregate/Count.pm b/lib/Krawfish/Result/Segment/Aggregate/Count.pm
new file mode 100644
index 0000000..04c1c29
--- /dev/null
+++ b/lib/Krawfish/Result/Segment/Aggregate/Count.pm
@@ -0,0 +1,29 @@
+package Krawfish::Result::Segment::Aggregate::Count;
+use parent 'Krawfish::Result::Segment::Aggregate::Base';
+use Krawfish::Log;
+use strict;
+use warnings;
+
+# TODO: Rename to frequencies
+# TODO: Support virtual corpus classes
+
+use constant DEBUG => 0;
+
+# Add to totalResources immediately
+sub each_doc {
+ $_[2]->{totalResources}++;
+};
+
+
+# Add to totalResults immediately
+sub each_match {
+ $_[2]->{totalResults}++;
+};
+
+
+# Stringification
+sub to_string {
+ 'count'
+};
+
+1;
diff --git a/lib/Krawfish/Result/Segment/Aggregate/Facets.pm b/lib/Krawfish/Result/Segment/Aggregate/Facets.pm
new file mode 100644
index 0000000..106ddfd
--- /dev/null
+++ b/lib/Krawfish/Result/Segment/Aggregate/Facets.pm
@@ -0,0 +1,134 @@
+package Krawfish::Result::Segment::Aggregate::Facets;
+use parent 'Krawfish::Result::Segment::Aggregate::Base';
+use Krawfish::Log;
+use strict;
+use warnings;
+
+use constant DEBUG => 0;
+
+# TODO: It may be beneficial to store example documents in the
+# field ranks, too - so they don't need to be collected on the way ...
+# See Group::Fields as well.
+#
+# TODO:
+# Field aggregates should be sortable either <asc> or <desc>,
+# and should have a count limitation, may be even a start_index and an items_per_page
+
+sub new {
+ my $class = shift;
+ my $self = bless {
+ index => shift,
+ field => shift,
+
+ # TODO: May as well be groups ...
+ buckets => [], # The buckets in memory
+ freq => undef
+ }, $class;
+};
+
+sub _init {
+ return if $_[0]->{rank};
+
+ my $self = shift;
+
+ print_log('aggr_facets', 'Load ranks for ' . $self->{field}) if DEBUG;
+
+ # Load the ranked list - may be too large for memory!
+ $self->{rank} = $self->{index}->fields->ranked_by($self->{field});
+};
+
+
+# On every doc
+sub each_doc {
+ my $self = shift;
+ $self->_init;
+ my $current = shift;
+
+ my $doc_id = $current->doc_id;
+
+ # Get the document rank
+ my $rank = $self->{rank}->get($doc_id);
+
+ # Rank exists
+ # TODO:
+ # Check if zero don't mean, the field
+ # is not ranked yet!
+ if ($rank != 0) {
+
+ # This will contain 'doc_freq', 'freq', and an example 'doc_id'
+ $self->{freq} = $self->{buckets}->[$rank] //= [0,0, $doc_id];
+ $self->{freq}->[0]++;
+
+ print_log('aggr_facets', $self->{field} . ' has frequencies') if DEBUG;
+ }
+
+ # Do not check rank
+ else {
+ $_[0]->{freq} = undef;
+ };
+};
+
+
+# On every match
+sub each_match {
+ if ($_[0]->{freq}) {
+ $_[0]->{freq}->[1]++;
+ };
+};
+
+
+# finish the results
+sub on_finish {
+ my ($self, $result) = @_;
+
+ # Get fields
+ my $fields = $self->{index}->fields;
+ my $field = $self->{field};
+
+ my %facets = ();
+
+ # Iterate over all ranked buckets of the field
+ foreach my $rank (grep { defined $_ } @{$self->{buckets}}) {
+
+ print_log('aggr_facets', "Get rank $rank for $field") if DEBUG;
+
+ # Get information from rank
+ my ($doc_freq, $freq, $example_doc_id) = @$rank;
+
+ # This rank occurrs in the query
+ if ($doc_freq) {
+
+ # Get the field name of the frequency
+ my $field_value = $fields->get($example_doc_id, $field);
+
+ # Set facet information
+ # May need the field key prepended
+ $facets{$field_value} = [$doc_freq, $freq];
+ };
+ };
+
+ # Return facets
+ # Example structure for year
+ # {
+ # 1997 => [4, 67],
+ # 1998 => [5, 89],
+ # 1999 => [3, 20]
+ # }
+ my $facet_result = ($result->{facets} //= {});
+ $facet_result->{$self->{field}} = \%facets;
+};
+
+sub to_string {
+ return 'facet:' . _squote($_[0]->{field});
+};
+
+
+# From Mojo::Util
+sub _squote {
+ my $str = shift;
+ $str =~ s/(['\\])/\\$1/g;
+ return qq{'$str'};
+};
+
+
+1;
diff --git a/lib/Krawfish/Result/Segment/Aggregate/Length.pm b/lib/Krawfish/Result/Segment/Aggregate/Length.pm
new file mode 100644
index 0000000..7f9572b
--- /dev/null
+++ b/lib/Krawfish/Result/Segment/Aggregate/Length.pm
@@ -0,0 +1,56 @@
+package Krawfish::Result::Segment::Aggregate::Length;
+use parent 'Krawfish::Result::Segment::Aggregate::Base';
+use Krawfish::Log;
+use strict;
+use warnings;
+
+use constant DEBUG => 0;
+
+# This will check the segments length -
+# currently other word lengths are not supported
+
+# See https://en.wikipedia.org/wiki/Selection_algorithm
+# for algorithms to find the median or similar.
+
+sub new {
+ my $class = shift;
+ bless {
+ min => 32_000,
+ max => 0,
+ sum => 0,
+ freq => 0
+ }, $class;
+};
+
+
+# On every match
+sub each_match {
+ my ($self, $current) = @_;
+ my $length = $current->end - $current->start;
+ $self->{min} = $length < $self->{min} ? $length : $self->{min};
+ $self->{max} = $length > $self->{max} ? $length : $self->{max};
+ $self->{sum} += $length;
+ $self->{freq}++;
+};
+
+
+# Finish the aggregation
+sub on_finish {
+ my ($self, $result) = @_;
+
+ return if $self->{freq} == 0;
+ $result->{length} = {
+ min => $self->{min},
+ max => $self->{max},
+ sum => $self->{sum},
+ avg => $self->{sum} / $self->{freq}
+ };
+};
+
+
+# Stringification
+sub to_string {
+ 'length'
+};
+
+1;
diff --git a/lib/Krawfish/Result/Segment/Aggregate/Values.pm b/lib/Krawfish/Result/Segment/Aggregate/Values.pm
new file mode 100644
index 0000000..82f4a82
--- /dev/null
+++ b/lib/Krawfish/Result/Segment/Aggregate/Values.pm
@@ -0,0 +1,100 @@
+package Krawfish::Result::Segment::Aggregate::Values;
+use parent 'Krawfish::Result::Segment::Aggregate::Base';
+use Krawfish::Log;
+use strict;
+use warnings;
+
+# TODO: Rename to FieldCalc or FieldSum
+
+use constant {
+ DEBUG => 0,
+ MIN_INIT_VALUE => 32_000
+};
+
+sub new {
+ my $class = shift;
+ my $index = shift;
+ my $self = bless {
+ index => $index, # Index
+
+ # This need to be a numerical fields!
+ fields => shift,
+ # TODO: May need to be translated into field_term_ids
+
+ # TODO:
+ # It may be more efficient to store a list of numerical
+ # field values here (e.g. sentence)
+ fields_obj => $index->fields,
+ aggregate => {}
+ }, $class;
+
+ # Initiate aggregation maps
+ foreach (@{$self->{fields}}) {
+ $self->{aggregate}->{$_} = {
+ min => MIN_INIT_VALUE,
+ max => 0,
+ sum => 0,
+ freq => 0
+ };
+ };
+
+ return $self;
+};
+
+
+# Release for each doc
+sub each_doc {
+ my ($self, $current, $result) = @_;
+
+ my $fields = $self->{fields_obj};
+
+ # my $value_current = $values->current;
+
+ # Current value has to catch up to the current doc
+ # if ($value_current->doc_id < $current->doc_id) {
+
+ # Skip to the requested doc_id (or beyond)
+ # $value_current = $values->skip_doc($current->doc_id);
+ # };
+
+ # Get document fields
+ my $doc_fields = $fields->get($current->doc_id);
+
+ # Get aggregation information
+ my $aggr = $self->{aggregate};
+
+ foreach my $field (@{$self->{fields}}) {
+
+ # Get field value
+ my $value = $doc_fields->{$field};
+
+ next unless defined $value;
+
+ # Get field in aggregation
+ my $field_aggr = $aggr->{$field};
+
+ $field_aggr->{min} = $field_aggr->{min} < $value ? $field_aggr->{min} : $value;
+ $field_aggr->{max} = $field_aggr->{max} > $value ? $field_aggr->{max} : $value;
+ $field_aggr->{sum} += $value;
+ $field_aggr->{freq}++;
+ };
+};
+
+
+# Stringification
+sub to_string {
+ return 'values:' . $_[0]->{field};
+};
+
+
+# Finish the aggregation
+sub on_finish {
+ my ($self, $result) = @_;
+ my $aggr = ($result->{aggregate} = $self->{aggregate});
+ foreach (values %{$aggr}) {
+ next unless $_->{freq};
+ $_->{avg} = $_->{sum} / $_->{freq};
+ };
+};
+
+1;