Move aggregation methods to Result::Segment

commit: e9d59c88c737af2b4fd534653d53f620080a491c [log] [tgz]
author: Akron <nils@diewald-online.de> Fri May 12 19:33:43 2017 +0200
committer: Akron <nils@diewald-online.de> Fri May 12 19:33:43 2017 +0200
tree: 361aaf4b05f2a9ef23f48868e3f9130b3046e1a1
parent: dd02499a0497e5a2052a62c13b5302ae3eff19ac [diff]
diff --git a/lib/Krawfish/Result/Segment/Aggregate.pm b/lib/Krawfish/Result/Segment/Aggregate.pm
new file mode 100644
index 0000000..1df6c08
--- /dev/null
+++ b/lib/Krawfish/Result/Segment/Aggregate.pm

@@ -0,0 +1,98 @@
+package Krawfish::Result::Segment::Aggregate;
+use parent 'Krawfish::Result';
+# use Hash::Merge qw( merge );
+use strict;
+use warnings;
+
+use constant DEBUG => 0;
+
+# Aggregate values of matches per document and
+# per match.
+
+# TODO:
+#   See https://www.elastic.co/guide/en/elasticsearch/reference/current/search-aggregations.html
+
+
+# TODO: Sort all ops for each_match and each_doc support
+sub new {
+  my $class = shift;
+  my $result = {};
+  return bless {
+    last_doc_id => -1,
+    query => shift,
+    ops => shift,
+    result => $result,
+    last_doc_id => -1,
+    finished => 0
+  }, $class;
+};
+
+sub result {
+  $_[0]->{result};
+};
+
+
+# Iterate to the next result
+sub next {
+  my $self = shift;
+
+  # Get container object
+  my $result = $self->result;
+
+  # There is a next match
+  if ($self->{query}->next) {
+
+    # Get the current posting
+    my $current = $self->{query}->current;
+
+    if ($current->doc_id != $self->{last_doc_id}) {
+
+      # Collect data of current operation
+      foreach (@{$self->{ops}}) {
+        $_->each_doc($current, $result);
+      };
+
+      # Set last doc to current doc
+      $self->{last_doc_id} = $current->doc_id;
+    };
+
+    # Collect data of current operation
+    foreach (@{$self->{ops}}) {
+      $_->each_match($current, $result);
+    };
+
+    return 1;
+  };
+
+  # Release on_finish event
+  unless ($self->{finished}) {
+    foreach (@{$self->{ops}}) {
+      $_->on_finish($result);
+    };
+    $self->{finished} = 1;
+  };
+
+  return 0;
+};
+
+
+sub current {
+  return $_[0]->{query}->current;
+};
+
+
+sub to_string {
+  my $self = shift;
+  my $str = 'aggregate(';
+  $str .= '[' . join(',', map { $_->to_string } @{$self->{ops}}) . ']:';
+  $str .= $self->{query}->to_string;
+  return $str . ')';
+};
+
+# Shorthand for "search through"
+sub finalize {
+  while ($_[0]->next) {};
+  return 1;
+};
+
+1;

diff --git a/lib/Krawfish/Result/Segment/Aggregate/Base.pm b/lib/Krawfish/Result/Segment/Aggregate/Base.pm
new file mode 100644
index 0000000..f8e43da
--- /dev/null
+++ b/lib/Krawfish/Result/Segment/Aggregate/Base.pm

@@ -0,0 +1,20 @@
+package Krawfish::Result::Segment::Aggregate::Base;
+use strict;
+use warnings;
+
+sub new {
+  my $class = shift;
+  bless \(my $self = ''), $class;
+};
+
+sub each_doc {};
+
+sub each_match {};
+
+sub on_finish {};
+
+sub to_string {
+  ...
+};
+
+1;

diff --git a/lib/Krawfish/Result/Segment/Aggregate/Classes.pm b/lib/Krawfish/Result/Segment/Aggregate/Classes.pm
new file mode 100644
index 0000000..ebce6e8
--- /dev/null
+++ b/lib/Krawfish/Result/Segment/Aggregate/Classes.pm

@@ -0,0 +1,2 @@
+# Count frequencies on class occurrences
+# Will check all matches and count frequencies per class

diff --git a/lib/Krawfish/Result/Segment/Aggregate/Content.pm b/lib/Krawfish/Result/Segment/Aggregate/Content.pm
new file mode 100644
index 0000000..7f3b54f
--- /dev/null
+++ b/lib/Krawfish/Result/Segment/Aggregate/Content.pm

@@ -0,0 +1,4 @@
+# Aggregate by content information, for example,
+# based on a certain class
+#
+# Aggregate on rank!

diff --git a/lib/Krawfish/Result/Segment/Aggregate/Count.pm b/lib/Krawfish/Result/Segment/Aggregate/Count.pm
new file mode 100644
index 0000000..04c1c29
--- /dev/null
+++ b/lib/Krawfish/Result/Segment/Aggregate/Count.pm

@@ -0,0 +1,29 @@
+package Krawfish::Result::Segment::Aggregate::Count;
+use parent 'Krawfish::Result::Segment::Aggregate::Base';
+use Krawfish::Log;
+use strict;
+use warnings;
+
+# TODO: Rename to frequencies
+# TODO: Support virtual corpus classes
+
+use constant DEBUG => 0;
+
+# Add to totalResources immediately
+sub each_doc {
+  $_[2]->{totalResources}++;
+};
+
+
+# Add to totalResults immediately
+sub each_match {
+  $_[2]->{totalResults}++;
+};
+
+
+# Stringification
+sub to_string {
+  'count'
+};
+
+1;

diff --git a/lib/Krawfish/Result/Segment/Aggregate/Facets.pm b/lib/Krawfish/Result/Segment/Aggregate/Facets.pm
new file mode 100644
index 0000000..106ddfd
--- /dev/null
+++ b/lib/Krawfish/Result/Segment/Aggregate/Facets.pm

@@ -0,0 +1,134 @@
+package Krawfish::Result::Segment::Aggregate::Facets;
+use parent 'Krawfish::Result::Segment::Aggregate::Base';
+use Krawfish::Log;
+use strict;
+use warnings;
+
+use constant DEBUG => 0;
+
+# TODO: It may be beneficial to store example documents in the
+#   field ranks, too - so they don't need to be collected on the way ...
+#   See Group::Fields as well.
+#
+# TODO:
+#   Field aggregates should be sortable either <asc> or <desc>,
+#   and should have a count limitation, may be even a start_index and an items_per_page
+
+sub new {
+  my $class = shift;
+  my $self = bless {
+    index   => shift,
+    field   => shift,
+
+    # TODO: May as well be groups ...
+    buckets => [], # The buckets in memory
+    freq    => undef
+  }, $class;
+};
+
+sub _init {
+  return if $_[0]->{rank};
+
+  my $self = shift;
+
+  print_log('aggr_facets', 'Load ranks for ' . $self->{field}) if DEBUG;
+
+  # Load the ranked list - may be too large for memory!
+  $self->{rank} = $self->{index}->fields->ranked_by($self->{field});
+};
+
+
+# On every doc
+sub each_doc {
+  my $self = shift;
+  $self->_init;
+  my $current = shift;
+
+  my $doc_id = $current->doc_id;
+
+  # Get the document rank
+  my $rank = $self->{rank}->get($doc_id);
+
+  # Rank exists
+  # TODO:
+  #   Check if zero don't mean, the field
+  #   is not ranked yet!
+  if ($rank != 0) {
+
+    # This will contain 'doc_freq', 'freq', and an example 'doc_id'
+    $self->{freq} = $self->{buckets}->[$rank] //= [0,0, $doc_id];
+    $self->{freq}->[0]++;
+
+    print_log('aggr_facets', $self->{field} . ' has frequencies') if DEBUG;
+  }
+
+  # Do not check rank
+  else {
+    $_[0]->{freq} = undef;
+  };
+};
+
+
+# On every match
+sub each_match {
+  if ($_[0]->{freq}) {
+    $_[0]->{freq}->[1]++;
+  };
+};
+
+
+# finish the results
+sub on_finish {
+  my ($self, $result) = @_;
+
+  # Get fields
+  my $fields = $self->{index}->fields;
+  my $field = $self->{field};
+
+  my %facets = ();
+
+  # Iterate over all ranked buckets of the field
+  foreach my $rank (grep { defined $_ } @{$self->{buckets}}) {
+
+    print_log('aggr_facets', "Get rank $rank for $field") if DEBUG;
+
+    # Get information from rank
+    my ($doc_freq, $freq, $example_doc_id) = @$rank;
+
+    # This rank occurrs in the query
+    if ($doc_freq) {
+
+      # Get the field name of the frequency
+      my $field_value = $fields->get($example_doc_id, $field);
+
+      # Set facet information
+      # May need the field key prepended
+      $facets{$field_value} = [$doc_freq, $freq];
+    };
+  };
+
+  # Return facets
+  # Example structure for year
+  # {
+  #   1997 => [4, 67],
+  #   1998 => [5, 89],
+  #   1999 => [3, 20]
+  # }
+  my $facet_result = ($result->{facets} //= {});
+  $facet_result->{$self->{field}} = \%facets;
+};
+
+sub to_string {
+  return 'facet:' . _squote($_[0]->{field});
+};
+
+
+# From Mojo::Util
+sub _squote {
+  my $str = shift;
+  $str =~ s/(['\\])/\\$1/g;
+  return qq{'$str'};
+};
+
+
+1;

diff --git a/lib/Krawfish/Result/Segment/Aggregate/Length.pm b/lib/Krawfish/Result/Segment/Aggregate/Length.pm
new file mode 100644
index 0000000..7f9572b
--- /dev/null
+++ b/lib/Krawfish/Result/Segment/Aggregate/Length.pm

@@ -0,0 +1,56 @@
+package Krawfish::Result::Segment::Aggregate::Length;
+use parent 'Krawfish::Result::Segment::Aggregate::Base';
+use Krawfish::Log;
+use strict;
+use warnings;
+
+use constant DEBUG => 0;
+
+# This will check the segments length -
+# currently other word lengths are not supported
+
+# See https://en.wikipedia.org/wiki/Selection_algorithm
+# for algorithms to find the median or similar.
+
+sub new {
+  my $class = shift;
+  bless {
+    min  => 32_000,
+    max  => 0,
+    sum  => 0,
+    freq => 0
+  }, $class;
+};
+
+
+# On every match
+sub each_match {
+  my ($self, $current) = @_;
+  my $length = $current->end - $current->start;
+  $self->{min} = $length < $self->{min} ? $length : $self->{min};
+  $self->{max} = $length > $self->{max} ? $length : $self->{max};
+  $self->{sum} += $length;
+  $self->{freq}++;
+};
+
+
+# Finish the aggregation
+sub on_finish {
+  my ($self, $result) = @_;
+
+  return if $self->{freq} == 0;
+  $result->{length} = {
+    min  => $self->{min},
+    max  => $self->{max},
+    sum  => $self->{sum},
+    avg  => $self->{sum} / $self->{freq}
+  };
+};
+
+
+# Stringification
+sub to_string {
+  'length'
+};
+
+1;

diff --git a/lib/Krawfish/Result/Segment/Aggregate/Values.pm b/lib/Krawfish/Result/Segment/Aggregate/Values.pm
new file mode 100644
index 0000000..82f4a82
--- /dev/null
+++ b/lib/Krawfish/Result/Segment/Aggregate/Values.pm

@@ -0,0 +1,100 @@
+package Krawfish::Result::Segment::Aggregate::Values;
+use parent 'Krawfish::Result::Segment::Aggregate::Base';
+use Krawfish::Log;
+use strict;
+use warnings;
+
+# TODO: Rename to FieldCalc or FieldSum
+
+use constant {
+  DEBUG          => 0,
+  MIN_INIT_VALUE => 32_000
+};
+
+sub new {
+  my $class = shift;
+  my $index = shift;
+  my $self = bless {
+    index => $index, # Index
+
+    # This need to be a numerical fields!
+    fields => shift,
+    # TODO: May need to be translated into field_term_ids
+
+    # TODO:
+    #   It may be more efficient to store a list of numerical
+    #   field values here (e.g. sentence)
+    fields_obj  => $index->fields,
+    aggregate => {}
+  }, $class;
+
+  # Initiate aggregation maps
+  foreach (@{$self->{fields}}) {
+    $self->{aggregate}->{$_} = {
+      min   => MIN_INIT_VALUE,
+      max   => 0,
+      sum   => 0,
+      freq => 0
+    };
+  };
+
+  return $self;
+};
+
+
+# Release for each doc
+sub each_doc {
+  my ($self, $current, $result) = @_;
+
+  my $fields = $self->{fields_obj};
+
+  # my $value_current = $values->current;
+
+  # Current value has to catch up to the current doc
+  # if ($value_current->doc_id < $current->doc_id) {
+
+    # Skip to the requested doc_id (or beyond)
+    # $value_current = $values->skip_doc($current->doc_id);
+  # };
+
+  # Get document fields
+  my $doc_fields = $fields->get($current->doc_id);
+
+  # Get aggregation information
+  my $aggr = $self->{aggregate};
+
+  foreach my $field (@{$self->{fields}}) {
+
+    # Get field value
+    my $value = $doc_fields->{$field};
+
+    next unless defined $value;
+
+    # Get field in aggregation
+    my $field_aggr = $aggr->{$field};
+
+    $field_aggr->{min} = $field_aggr->{min} < $value ? $field_aggr->{min} : $value;
+    $field_aggr->{max} = $field_aggr->{max} > $value ? $field_aggr->{max} : $value;
+    $field_aggr->{sum} += $value;
+    $field_aggr->{freq}++;
+  };
+};
+
+
+# Stringification
+sub to_string {
+  return 'values:' . $_[0]->{field};
+};
+
+
+# Finish the aggregation
+sub on_finish {
+  my ($self, $result) = @_;
+  my $aggr = ($result->{aggregate} = $self->{aggregate});
+  foreach (values %{$aggr}) {
+    next unless $_->{freq};
+    $_->{avg} = $_->{sum} / $_->{freq};
+  };
+};
+
+1;
commit	e9d59c88c737af2b4fd534653d53f620080a491c	[log] [tgz]
author	Akron <nils@diewald-online.de>	Fri May 12 19:33:43 2017 +0200
committer	Akron <nils@diewald-online.de>	Fri May 12 19:33:43 2017 +0200
tree	361aaf4b05f2a9ef23f48868e3f9130b3046e1a1
parent	dd02499a0497e5a2052a62c13b5302ae3eff19ac [diff]