blob: c2799976fdd580102a64398b88357860f08cd49f [file] [log] [blame]
Akron5cf5fca2017-10-09 19:01:47 +02001package Krawfish::Koral::Compile;
2use Krawfish::Koral::Compile::Builder;
Akron304fdd52017-04-05 01:47:46 +02003use Krawfish::Log;
Akronc3657bf2016-10-31 00:15:43 +01004use strict;
5use warnings;
6
# Creation of compilation queries
8
# WARNING! / TODO!
#   An enrichment for fields or snippets (or, better, any enrichment)
#   can never wrap around a presort query, because the relevant
#   data structures and algorithms require the results to be in
#   doc_id order!
Akron78c49502017-07-27 16:00:36 +020014
# WARNING!
#   It's important to remember that a sortFilter can't be shared in parallel
#   processing - especially for fields, as segment rankings can differ!
18
# TODO:
#   There are presort and postsort queries.
#   Presort queries don't respect current_query,
#   while postsort queries do!
#   Postsort queries only work on the cluster level.
24
# TODO:
#   When a group filter is added,
#   sorting (among other things) does not work.
#   This has to be thought through.
29
# TODO:
#   Remove corpus classes in case they are not used.
#   This requires a method like ->used_classes.
Akrond15e2bb2017-08-11 18:23:14 +020033
# TODO:
#   Support aggregations on groups!
36
# Rank of each compile operation type.
# normalize() sorts the operations ascending by this rank and
# wrap() applies them in reverse, so the type with the lowest
# rank ends up as the outermost wrapper of the query.
our %COMPILE_ORDER;
{
  # Listed from lowest (1) to highest rank
  my @types_in_order = qw/
    limit
    cluster_merge
    node_merge
    sort
    sample
    enrich
    group_aggregate
    group
    aggregate
    filter
  /;

  @COMPILE_ORDER{@types_in_order} = (1 .. scalar @types_in_order);
};
49
Akrona588d072017-10-13 14:45:34 +020050
# Debug flag - in this file it is only referenced
# from the disabled unique sorting code in normalize()
use constant { DEBUG => 0 };
Akron304fdd52017-04-05 01:47:46 +020052
Akrona588d072017-10-13 14:45:34 +020053
# Constructor
# Accepts a list of operations and stores them
# (in the given order) in a blessed array
sub new {
  my ($class, @operations) = @_;
  return bless \@operations, $class;
};
59
Akrona588d072017-10-13 14:45:34 +020060
# Stringification
# Serializes all operations (passing $id on to each of them)
# and joins the results with commas
sub to_string {
  my $self = shift;
  my $id = shift;

  my @serialized;
  foreach my $op ($self->operations) {
    push @serialized, $op->to_string($id);
  };

  return join(',', @serialized);
};
66
Akron78c49502017-07-27 16:00:36 +020067
# Get builder object
# Returns a fresh Krawfish::Koral::Compile::Builder on each call
sub builder {
  my $builder_class = 'Krawfish::Koral::Compile::Builder';
  return $builder_class->new;
};
72
73
# Get or set operations
# Without arguments, the stored operations are returned
# (in scalar context: their number).
# With arguments, the stored operations are replaced by the
# passed ones and the object itself is returned.
sub operations {
  my ($self, @operations) = @_;

  # Getter
  return @{$self} unless @operations;

  # Setter
  @{$self} = @operations;
  return $self;
};
83
84
# Add operation
# Appends a single operation to the end of the operation list
# and returns the new number of operations
sub add {
  my $self = shift;
  my $op = shift;
  return push(@{$self}, $op);
};
90
91
# Normalize compile object
#
# Sorts the operations by %COMPILE_ORDER, merges consecutive
# operations of the same mergeable type (the first operation
# takes precedence), normalizes the remaining operations and
# passes the limit as top_k to the first sort or sample operation,
# as long as no aggregate or group operation is involved.
#
# Returns the compile object itself. An empty operation list
# is left untouched.
sub normalize {
  my $self = shift;

  my @compile = $self->operations;

  # Nothing to normalize - without this guard the normalization
  # of the (non-existing) last operation would die
  return $self unless @compile;

  # TODO:
  #   Add unique sorting per default - unless it's a group query.
  #   This was sketched as
  #     push @compile, $self->builder->sort_by(
  #       $self->builder->s_field(UNIQUE_FIELD)
  #     );
  #   but is currently disabled.

  # 1. Introduce required information
  # The limit defines top_k, while aggregate and
  # group operations disable sort filtering
  my $top_k = 0;
  my $sort_filtering = 1;
  foreach my $op (@compile) {
    my $type = $op->type;

    if ($type eq 'limit') {
      $top_k = $op->start_index + $op->items_per_page;
    }
    elsif ($type eq 'aggregate' || $type eq 'group') {
      $sort_filtering = 0;
    };
  };

  # Sort objects based on a defined order
  @compile = sort {
    $COMPILE_ORDER{$a->type} <=> $COMPILE_ORDER{$b->type}
  } @compile;

  # 2. Find identical types and merge
  #    enrich+enrich => enrich
  #    sort+sort     => sort ...
  my %mergeable = map { $_ => 1 } qw/enrich sort aggregate group_aggregate/;

  for (my $i = 1; $i < @compile; $i++) {
    my $type = $compile[$i]->type;

    # Types differ - nothing can be merged into the previous
    # operation anymore, so it can be normalized
    if ($type ne $compile[$i-1]->type) {
      $compile[$i-1] = $compile[$i-1]->normalize;
      CORE::next;
    };

    # 3. Consecutive types are identical - join
    if ($mergeable{$type}) {

      # The first operations have higher precedence
      $compile[$i-1]->operations(
        $compile[$i-1]->operations,
        $compile[$i]->operations
      );

      # Remove merged object and revisit the same index,
      # as the following operation moved up
      splice(@compile, $i, 1);
      $i--;
    }

    # TODO:
    #   Make single field values work
    #   - start_index
    #   - count

    # Duplicate operation that can't be merged
    # NOTE(review): The earlier one of two non-mergeable duplicates
    #   is never normalized - confirm if this is intended.
    else {
      warn 'Unable to deal with unknown compile operation ' . $type;
    };
  };

  # Normalize last operation
  $compile[-1] = $compile[-1]->normalize;

  # 4. Optimize
  # No aggregation or group queries and a limit is given =>
  # add a sort filter and top_k to the first sort operation
  # or top_k to the first sample operation
  if ($sort_filtering && $top_k) {
    foreach my $op (@compile) {
      my $type = $op->type;

      if ($type eq 'sort') {

        # Activate sort_filter option
        $op->filter(1);

        # Set top_k option!
        $op->top_k($top_k);
        last;
      }

      elsif ($type eq 'sample') {

        # Set top_k option!
        $op->top_k($top_k);
        last;
      };
    };
  };

  # Set operations
  $self->operations(@compile);

  return $self;
};
227
Akron89440982017-07-28 14:48:28 +0200228
# Translate all fields to term ids
# Replaces every operation by the result of its identify()
# call with the passed dictionary and returns the object itself
sub identify {
  my $self = shift;
  my $dict = shift;

  # $op is an alias, so the assignment replaces the stored operation
  foreach my $op (@{$self}) {
    $op = $op->identify($dict);
  };

  return $self;
};
239
240
# Wrap operations in a single query object
# The operations are applied from last to first, so the
# first operation becomes the outermost wrapper.
# Returns the wrapped query.
sub wrap {
  my $self = shift;
  my $wrapped = shift;

  my @pending = $self->operations;
  while (@pending) {
    my $op = pop @pending;
    $wrapped = $op->wrap($wrapped);
  };

  return $wrapped;
};
249
250
Akronc3657bf2016-10-31 00:15:43 +01002511;
252
Akronb00c2be2017-08-16 14:45:07 +0200253
Akronc3657bf2016-10-31 00:15:43 +0100254__END__