Merge groups on node level
Change-Id: Ibbfa35eb0ce51341eabbabe56f68dab8e8fae9aa
diff --git a/lib/Krawfish/Compile/Node.pm b/lib/Krawfish/Compile/Node.pm
index 3d588b9..364f0cf 100644
--- a/lib/Krawfish/Compile/Node.pm
+++ b/lib/Krawfish/Compile/Node.pm
@@ -18,6 +18,9 @@
# Add a timeout! Just in case ...!
# TODO:
+# Merge warnings, errors, messages!
+
+# TODO:
# Introduce max_rank_ref!
# This may be less efficient than a dynamic
@@ -32,7 +35,7 @@
# - Krawfish::MultiNodes::*
-use constant DEBUG => 0;
+use constant DEBUG => 1;
# Constructor
@@ -58,7 +61,7 @@
my $segment_query = $query->optimize($seg);
if (DEBUG) {
- print_log('node', 'Add query ' . $segment_query->to_string . ' to merge');
+ print_log('cmp_node', 'Add query ' . $segment_query->to_string . ' to merge');
};
# There are results expected
@@ -69,7 +72,13 @@
$self->{segment_queries} = \@segment_queries;
- # Add criterion comparation method here
+ # Query does not require sorted result
+ if (Role::Tiny::does_role($query, 'Krawfish::Koral::Compile::Node::Group')) {
+ $self->{top_k} = 0;
+ return $self;
+ };
+
+ # Add criterion comparation method
$self->{prio} = Array::Queue::Priority->new(
sort_cb => sub {
my ($match_a, $match_b) = @_;
@@ -83,6 +92,7 @@
return $crit_a->compare($crit_b);
}
);
+
return $self;
};
@@ -94,16 +104,17 @@
return if $self->{init}++;
if (DEBUG) {
- print_log('node', 'Initialize sorting queue');
+ print_log('cmp_node', 'Initialize node response');
};
+ # Priority queue if sorting is required, per default with size $n
+ my $prio = $self->{prio};
+
my $i = 0;
my $n = scalar @{$self->{segment_queries}};
- # Priority queue, per default with size $n
- my $prio = $self->{prio};
-
- # Iterate over all segments until the prio is full
+ # Iterate over all segments - either for grouping
+ # or (in case of sorting) until the prio is full
#
# TODO:
# This needs to be done in parallel, as the initial
@@ -113,11 +124,23 @@
# Get query from segment
my $seg_q = $self->{segment_queries}->[$i];
+ # Do grouping!
+ unless ($prio) {
+
+ if (DEBUG) {
+ print_log('cmp_node', "Finalize query at channel $i");
+ };
+
+ # Search through all results
+ $seg_q->finalize;
+ next;
+ };
+
# There is a next item from the segment
if ($seg_q->next) {
if (DEBUG) {
- print_log('node', "Init query at channel $i");
+ print_log('cmp_node', "Init query at channel $i");
};
# Enqueue and remember the segment/channel
@@ -125,7 +148,7 @@
$prio->add([$seg_q->current_match, $i]);
if (DEBUG) {
- print_log('node', "Added match " . $seg_q->current_match->to_string);
+ print_log('cmp_node', "Added match " . $seg_q->current_match->to_string);
};
}
@@ -133,7 +156,7 @@
else {
if (DEBUG) {
- print_log('node', "Remove query at channel $i");
+ print_log('cmp_node', "Remove query at channel $i");
};
# Remove segment query
@@ -144,17 +167,19 @@
};
};
+ return unless $self->{prio};
+
# Resize the priority queue
# $prio->size($n);
if (DEBUG) {
print_log(
- 'node',
+ 'cmp_node',
'Array: ' . join(',', map { $_->[0]->to_string } @{$prio->queue})
);
};
- $self->{prio} = $prio;
+ # $self->{prio} = $prio;
};
@@ -165,7 +190,7 @@
$self->_init;
# There is no next
- return if $self->{pos} > $self->{top_k} -1;
+ return if !$self->{prio} || $self->{pos} > $self->{top_k} -1;
# Get next match from list
# TODO: dequeue
@@ -194,7 +219,7 @@
if (DEBUG) {
print_log(
- 'node',
+ 'cmp_node',
'Array: ' . join(',', map { $_->[0]->to_string } @{$self->{prio}->queue})
);
};
@@ -220,13 +245,13 @@
my $result = $self->result;
- print_log('node','Compile result') if DEBUG;
+ print_log('cmp_node','Compile result') if DEBUG;
my $k = $self->{top_k};
# Get next match from list
# TODO: dequeue
- while ($k--) {
+ while ($k-- > 0) {
my $entry = $self->{prio}->remove;
# No more entries
@@ -256,22 +281,69 @@
};
+# Group data
+sub group {
+ my $self = shift;
+
+ $self->_init;
+
+ if (DEBUG) {
+ print_log('cmp_node', 'Group data');
+ };
+
+ my $result = $self->result;
+
+ if (DEBUG && $result->{group}) {
+ print_log('cmp_node', 'Group is already done is already done');
+ };
+
+ # Aggregation already collected
+ return $result if $result->group;
+
+ # Iterate over all queries
+ foreach my $seg_q (@{$self->{segment_queries}}) {
+
+ # Check for compilation role
+ if (Role::Tiny::does_role($seg_q, 'Krawfish::Compile::Segment::Group')) {
+ if (DEBUG) {
+ print_log('cmp_node', 'Add result from ' . ref($seg_q));
+ };
+
+ # Merge aggregations
+ my $group = $seg_q->group;
+
+ if (DEBUG) {
+ use Data::Dumper;
+ print_log('cmp_node', 'Merge result: ' . ref($group) . ':' . $group->to_string);
+ };
+
+ # Merge group
+ $result->merge_group($group);
+
+ if (DEBUG) {
+ print_log('cmp_node', 'Groups merged');
+ };
+ };
+ };
+
+ return $result;
+};
+
+
# Get aggregation data only
-# TODO:
-# Identical with ::Compile
sub aggregate {
my $self = shift;
$self->_init;
if (DEBUG) {
- print_log('node', 'Aggregate data');
+ print_log('cmp_node', 'Aggregate data');
};
my $result = $self->result;
if (DEBUG && @{$result->{aggregation}}) {
- print_log('node', 'Aggregation is already done');
+ print_log('cmp_node', 'Aggregation is already done');
};
# Aggregation already collected
@@ -283,19 +355,19 @@
# Check for compilation role
if (Role::Tiny::does_role($seg_q, 'Krawfish::Compile::Segment')) {
if (DEBUG) {
- print_log('node', 'Add result from ' . ref($seg_q));
+ print_log('cmp_node', 'Add result from ' . ref($seg_q));
};
# Merge aggregations
my $aggregate = $seg_q->aggregate;
if (DEBUG) {
use Data::Dumper;
- print_log('node', 'Merge result ' . $aggregate->to_string);
+ print_log('cmp_node', 'Merge result ' . $aggregate->to_string);
};
$result->merge_aggregation($aggregate);
if (DEBUG) {
- print_log('node', 'Result merged');
+ print_log('cmp_node', 'Result merged');
};
};
};
diff --git a/lib/Krawfish/Compile/Segment.pm b/lib/Krawfish/Compile/Segment.pm
index 73b7d72..101a717 100644
--- a/lib/Krawfish/Compile/Segment.pm
+++ b/lib/Krawfish/Compile/Segment.pm
@@ -30,7 +30,7 @@
if (DEBUG) {
print_log(
- 'compile',
+ 'cmp_seg',
'Current match requested by ' . ref($self)
);
};
@@ -39,7 +39,7 @@
if (DEBUG) {
print_log(
- 'compile',
+ 'cmp_seg',
'Current match is ' . $match->to_string
);
};
@@ -53,7 +53,7 @@
my $self = shift;
if (DEBUG) {
- print_log('compile', 'Get current from ' . ref $self);
+ print_log('cmp_seg', 'Get current from ' . ref $self);
};
return $self->{current} // $self->{query}->current;
@@ -68,7 +68,7 @@
sub match_from_query {
my $self = shift;
- print_log('compile', 'Get match from query as ' . ref($self)) if DEBUG;
+ print_log('cmp_seg', 'Get match from query as ' . ref($self)) if DEBUG;
my $match;
@@ -84,14 +84,14 @@
# Not yet defined
unless ($match) {
- print_log('compile', 'No match found from ' . ref($self->{query})) if DEBUG;
+ print_log('cmp_seg', 'No match found from ' . ref($self->{query})) if DEBUG;
# Get current object
my $current = $self->current;
unless ($current) {
print_log(
- 'compile',
+ 'cmp_seg',
'No current definable from ' .
ref($self)) if DEBUG;
return;
@@ -99,7 +99,7 @@
if (DEBUG) {
print_log(
- 'compile',
+ 'cmp_seg',
'Current posting is from '. $self->{query}->to_string
);
};
@@ -156,7 +156,7 @@
# This is rather for testing purposes
if (DEBUG) {
- print_log('compile', 'Compile aggregation with ' . ref($self));
+ print_log('cmp_seg', 'Compile aggregation with ' . ref($self));
};
# Get result object
@@ -166,7 +166,7 @@
while ($self->next) {
if (DEBUG) {
print_log(
- 'compile',
+ 'cmp_seg',
'Add match ' . $self->current_match->to_string
);
};
@@ -179,7 +179,7 @@
if (DEBUG) {
print_log(
- 'compile',
+ 'cmp_seg',
'Result is ' . $result
);
};
@@ -214,12 +214,12 @@
my $query = $self->{query};
if (DEBUG) {
- print_log('compile', 'Check if ' . ref($query) . ' does compiling');
+ print_log('cmp_seg', 'Check if ' . ref($query) . ' does compiling');
};
if (Role::Tiny::does_role($query, __PACKAGE__)) {
if (DEBUG) {
- print_log('compile', 'Add result from ' . ref($query));
+ print_log('cmp_seg', 'Add result from ' . ref($query));
};
$query->result($result)->aggregate;
};
diff --git a/lib/Krawfish/Compile/Segment/Group.pm b/lib/Krawfish/Compile/Segment/Group.pm
index 877a81c..8b06ba1 100644
--- a/lib/Krawfish/Compile/Segment/Group.pm
+++ b/lib/Krawfish/Compile/Segment/Group.pm
@@ -8,7 +8,7 @@
requires qw/group/;
-use constant DEBUG => 0;
+use constant DEBUG => 1;
# Override to compile data
@@ -23,14 +23,7 @@
my $result = $self->result;
# Add all results
- while ($self->next) {
- if (DEBUG) {
- print_log(
- 'compile',
- 'Check match ' . $self->current->to_string
- );
- };
- };
+ $self->finalize;
# Set group to result
$result->group(
@@ -54,13 +47,27 @@
# Get group
-# TODO:
-# rename to group_result
sub group {
$_[0]->{group};
};
+# Finalize query
+sub finalize {
+ my $self = shift;
+
+ if (DEBUG) {
+ print_log(
+ 'group',
+ 'Finalize query for grouping'
+ );
+ };
+
+ while ($self->next) {};
+ return $self;
+};
+
+
# Get current posting
sub current {
return $_[0]->{query}->current;
diff --git a/lib/Krawfish/Compile/Segment/Group/ClassFrequencies.pm b/lib/Krawfish/Compile/Segment/Group/ClassFrequencies.pm
index 57bbe0b..417fc13 100644
--- a/lib/Krawfish/Compile/Segment/Group/ClassFrequencies.pm
+++ b/lib/Krawfish/Compile/Segment/Group/ClassFrequencies.pm
@@ -3,7 +3,7 @@
use Krawfish::Log;
use strict;
use warnings;
-use Role::Tiny;
+use Role::Tiny::With;
with 'Krawfish::Compile::Segment::Group';
@@ -55,10 +55,8 @@
};
-# Shorthand for "search through"
-sub finalize {
- while ($_[0]->next) {};
- return $_[0];
+sub clone {
+ ...
};
diff --git a/lib/Krawfish/Compile/Segment/Group/Fields.pm b/lib/Krawfish/Compile/Segment/Group/Fields.pm
index 7e1bd2e..0a6019e 100644
--- a/lib/Krawfish/Compile/Segment/Group/Fields.pm
+++ b/lib/Krawfish/Compile/Segment/Group/Fields.pm
@@ -4,11 +4,11 @@
use Krawfish::Log;
use strict;
use warnings;
-use Role::Tiny;
+use Role::Tiny::With;
with 'Krawfish::Compile::Segment::Group';
-use constant DEBUG => 0;
+use constant DEBUG => 1;
# This will group matches (especially document matches) by field
# This is useful e.g. for document browsing per corpus.
@@ -42,6 +42,17 @@
};
+# Clone query
+sub clone {
+ my $self = shift;
+ return __PACKAGE__->new(
+ $self->{field_obj},
+ $self->{query},
+ $self->{field_keys}
+ );
+};
+
+
# Initialize field pointer
sub _init {
return if $_[0]->{field_pointer};
@@ -135,6 +146,13 @@
# Key identifier are matching
elsif ($field_keys[$key_pos]->key_id == $field_objs[$val_pos]->key_id) {
+ if (DEBUG) {
+ print_log(
+ 'g_fields',
+ 'Key at ' . $key_pos . ' is ' . $field_keys[$key_pos]->key_id .
+ ' which is equal to ' . $field_objs[$val_pos]->key_id);
+ };
+
# Add key to pattern
$patterns[$key_pos] //= [];
push @{$patterns[$key_pos]}, $field_objs[$val_pos]->term_id;
diff --git a/lib/Krawfish/Compile/Segment/Group/Spans.pm b/lib/Krawfish/Compile/Segment/Group/Spans.pm
index 7279e68..c08b25b 100644
--- a/lib/Krawfish/Compile/Segment/Group/Spans.pm
+++ b/lib/Krawfish/Compile/Segment/Group/Spans.pm
@@ -2,7 +2,7 @@
use Krawfish::Log;
use strict;
use warnings;
-use Role::Tiny;
+use Role::Tiny::With;
with 'Krawfish::Compile::Segment::Group';
@@ -58,4 +58,9 @@
return $slice_start . '_' . $slice_end;
};
+
+sub clone {
+ ...
+};
+
1;
diff --git a/lib/Krawfish/Compile/Segment/Group/TermExistence.pm b/lib/Krawfish/Compile/Segment/Group/TermExistence.pm
index 8736432..d692377 100644
--- a/lib/Krawfish/Compile/Segment/Group/TermExistence.pm
+++ b/lib/Krawfish/Compile/Segment/Group/TermExistence.pm
@@ -1,7 +1,7 @@
package Krawfish::Compile::Segment::Group::TermExistence;
use strict;
use warnings;
-use Role::Tiny;
+use Role::Tiny::With;
with 'Krawfish::Compile::Segment::Group';
@@ -21,6 +21,10 @@
...
};
+sub clone {
+ ...
+};
+
# TODO:
# Think about when next() is called, as it needs to be called on term_ids as well ...
diff --git a/lib/Krawfish/Koral/Compile/Node/Group.pm b/lib/Krawfish/Koral/Compile/Node/Group.pm
new file mode 100644
index 0000000..d34c5ea
--- /dev/null
+++ b/lib/Krawfish/Koral/Compile/Node/Group.pm
@@ -0,0 +1,10 @@
+package Krawfish::Koral::Compile::Node::Group;
+use strict;
+use warnings;
+use Role::Tiny;
+
+requires qw/to_string
+ identify
+ optimize/;
+
+1;
diff --git a/lib/Krawfish/Koral/Compile/Node/Group/ClassFrequencies.pm b/lib/Krawfish/Koral/Compile/Node/Group/ClassFrequencies.pm
index 93bf0c7..3abe798 100644
--- a/lib/Krawfish/Koral/Compile/Node/Group/ClassFrequencies.pm
+++ b/lib/Krawfish/Koral/Compile/Node/Group/ClassFrequencies.pm
@@ -2,10 +2,14 @@
use Krawfish::Compile::Segment::Group::ClassFrequencies;
use Krawfish::Util::String qw/squote/;
use Krawfish::Compile::Segment::Nowhere;
+use Role::Tiny::With;
use strict;
use warnings;
+with 'Krawfish::Koral::Compile::Node::Group';
+
+
# Create new enrichment object for fields
sub new {
my $class = shift;
diff --git a/lib/Krawfish/Koral/Compile/Node/Group/Fields.pm b/lib/Krawfish/Koral/Compile/Node/Group/Fields.pm
index b2dfda4..e7aedb7 100644
--- a/lib/Krawfish/Koral/Compile/Node/Group/Fields.pm
+++ b/lib/Krawfish/Koral/Compile/Node/Group/Fields.pm
@@ -2,10 +2,14 @@
use Krawfish::Compile::Segment::Group::Fields;
use Krawfish::Util::String qw/squote/;
use Krawfish::Compile::Segment::Nowhere;
+use Role::Tiny::With;
use strict;
use warnings;
+with 'Krawfish::Koral::Compile::Node::Group';
+
+
# Create new enrichment object for fields
sub new {
my $class = shift;
diff --git a/lib/Krawfish/Koral/Result.pm b/lib/Krawfish/Koral/Result.pm
index 9eaa5e9..5a37452 100644
--- a/lib/Krawfish/Koral/Result.pm
+++ b/lib/Krawfish/Koral/Result.pm
@@ -80,7 +80,6 @@
print_log('k_result', 'Add aggregation data for ' . $new_aggr->key);
};
-
# Introduce aggregation
$self->add_aggregation($new_aggr);
};
@@ -88,6 +87,34 @@
return;
};
+# Merge groups
+sub merge_group {
+ my ($self, $group) = @_;
+
+ if (DEBUG) {
+ print_log('k_result', 'Merge group data');
+ };
+
+ # Merge with existing group
+ if ($self->{group}) {
+
+ if ($self->{group}->key ne $group->key) {
+ $self->add_error(000 => 'Groups are not compatible');
+ delete $self->{group};
+ return;
+ };
+
+ # Merge
+ $self->{group}->merge($group);
+ }
+
+ # Establish first group
+ else {
+ $self->{group} = $group;
+ };
+
+ return;
+};
# Get aggregations
sub aggregation {
diff --git a/lib/Krawfish/Koral/Result/Aggregate.pm b/lib/Krawfish/Koral/Result/Aggregate.pm
index 695ce97..956aafb 100644
--- a/lib/Krawfish/Koral/Result/Aggregate.pm
+++ b/lib/Krawfish/Koral/Result/Aggregate.pm
@@ -3,7 +3,13 @@
use warnings;
use Role::Tiny;
-requires qw/key merge/;
+# TODO: Identical to Result::Group
+
+requires qw/key
+ merge
+ inflate
+ to_string
+ to_koral_fragment/;
# Finish the calculation
sub on_finish {
diff --git a/lib/Krawfish/Koral/Result/Group.pm b/lib/Krawfish/Koral/Result/Group.pm
index b199c31..2d81bc4 100644
--- a/lib/Krawfish/Koral/Result/Group.pm
+++ b/lib/Krawfish/Koral/Result/Group.pm
@@ -1,18 +1,20 @@
package Krawfish::Koral::Result::Group;
-use Role::Tiny::With;
+use Role::Tiny;
use strict;
use warnings;
-with 'Krawfish::Koral::Report';
-with 'Krawfish::Koral::Result::Inflatable';
+# TODO: Identical to Result::Aggregate
+
+requires qw/key
+ merge
+ inflate
+ to_string
+ to_koral_fragment/;
# This will be returned by a Group search
# It needs a to_hash method,
# does not require start, end etc ...
-# TODO:
-# This is quite similar to K::P::Bundle
-
# With a witness, the group has:
# {
# criterion => [freq, doc_freq, match]
@@ -31,17 +33,9 @@
# criterion => [freq, doc_freq, match, freq, doc_freq, match, freq, doc_freq, match ...]
# }
-
-sub freq {
- ...
-};
-
-sub doc_freq {
- ...
-};
-
-sub to_hash {
- ...
+sub on_finish {
+ $_[0];
};
1;
+
diff --git a/lib/Krawfish/Koral/Result/Group/Fields.pm b/lib/Krawfish/Koral/Result/Group/Fields.pm
index 868a0bc..765b2d5 100644
--- a/lib/Krawfish/Koral/Result/Group/Fields.pm
+++ b/lib/Krawfish/Koral/Result/Group/Fields.pm
@@ -1,5 +1,6 @@
package Krawfish::Koral::Result::Group::Fields;
use Krawfish::Util::PatternList qw/pattern_list/;
+use Data::Dumper;
use Role::Tiny::With;
use Krawfish::Util::Bits;
use Krawfish::Log;
@@ -8,8 +9,9 @@
use warnings;
with 'Krawfish::Koral::Result::Inflatable';
+with 'Krawfish::Koral::Result::Group';
-use constant DEBUG => 0;
+use constant DEBUG => 1;
# Group on a sequence of field values
@@ -89,12 +91,42 @@
};
};
+
# On finish, flush the cache
sub on_finish {
$_[0]->flush;
$_[0];
};
+
+# Merge groups
+sub merge {
+ my ($self, $group) = @_;
+ my $est_group = $self->{group};
+ my $new_group = $group->{group};
+
+ # Get groups
+ foreach my $signature (keys %{$new_group}) {
+ $est_group->{$signature} //= {};
+
+ if (DEBUG) {
+ print_log('p_g_fields','Result: ' . Dumper $new_group);
+ };
+
+ # Iterate over all existing groups
+ foreach my $flag (keys %{$new_group->{$signature}}) {
+
+ my $value = ($est_group->{$signature}->{$flag} //= [0,0]);
+ my $freq = $new_group->{$signature}->{$flag};
+
+ $value->[0] += $freq->[0];
+ $value->[1] += $freq->[1];
+ };
+ };
+};
+
+
+
# Translate this to terms
sub inflate {
my ($self, $dict) = @_;