blob: 1722147cbaf7d83f351538f4065c0e7c9cd0efd9 [file] [log] [blame]
Akron5cf5fca2017-10-09 19:01:47 +02001package Krawfish::Koral::Compile;
2use Krawfish::Koral::Compile::Builder;
Akron304fdd52017-04-05 01:47:46 +02003use Krawfish::Log;
Akronc3657bf2016-10-31 00:15:43 +01004use strict;
5use warnings;
6
# Creation of compilation query
8
# WARNING! / TODO!
#   An enrichment for fields or snippets (or rather any enrichment)
#   can never wrap around a presort query, because the relevant
#   data structures and algorithms require the results to be in
#   doc_id order!
Akron78c49502017-07-27 16:00:36 +020014
# WARNING!
#   It's important to remember that a sortFilter can't be shared in
#   parallel processing - especially for fields, as segment rankings
#   can differ!
18
# TODO:
#   There are presort and postsort queries.
#   Presort queries don't respect current_query,
#   while postsort queries do!
#   Postsort queries only work on the cluster level.
24
# TODO:
#   When a group filter is added,
#   sorting etc. does not work.
#   This has to be thought through.
29
# TODO:
#   Remove corpus classes, in case they are not used.
#   This requires a method like ->used_classes
Akrond15e2bb2017-08-11 18:23:14 +020033
# Rank of each compile operation type. normalize() sorts the
# operations ascending by this rank, so a type with a lower
# number precedes a type with a higher number.
our %COMPILE_ORDER;
@COMPILE_ORDER{qw/limit sort sample enrich aggregate group filter/} = (1 .. 7);
43
Akrona588d072017-10-13 14:45:34 +020044
# Compile time constants:
#   DEBUG        - debug switch (0 means off)
#   UNIQUE_FIELD - name of the field 'id'
use constant DEBUG        => 0;
use constant UNIQUE_FIELD => 'id';
49
Akrona588d072017-10-13 14:45:34 +020050
# Constructor
#
# Accepts a list of compile operations and returns an object
# based on an array reference holding these operations.
sub new {
  my ($class, @operations) = @_;
  return bless [@operations], $class;
};
56
Akrona588d072017-10-13 14:45:34 +020057
# Stringification
#
# Serializes every operation with the passed $id parameter
# and joins the results with a comma.
sub to_string {
  my ($self, $id) = @_;

  my @parts;
  foreach my $op ($self->operations) {
    push @parts, $op->to_string($id);
  };

  return join ',', @parts;
};
63
Akron78c49502017-07-27 16:00:36 +020064
# Get builder object
#
# Returns a new Krawfish::Koral::Compile::Builder on every call.
sub builder {
  my $builder_class = 'Krawfish::Koral::Compile::Builder';
  return $builder_class->new;
};
69
70
# Get or set operations
#
# Without arguments, the list of operations is returned.
# With arguments, the operations are replaced by the passed
# list and the invocant is returned.
sub operations {
  my ($self, @ops) = @_;

  # Getter
  return @$self unless @ops;

  # Setter
  @$self = @ops;
  return $self;
};
80
81
# Normalize compile object
#
# Brings the compile operations into the order defined in
# %COMPILE_ORDER, merges consecutive operations of the types
# 'enrich', 'aggregate' and 'sort', normalizes the remaining
# operations and - in case a limit is given and neither an
# aggregation nor a grouping is requested - passes the top_k
# value to the first sort or sample operation.
#
# Returns the invocant with the normalized operations set.
# An object without any operations is returned unchanged.
sub normalize {
  my $self = shift;

  my @compile = $self->operations;

  # Nothing to normalize - without this guard the normalization
  # of the last operation below would be called on an
  # undefined value
  return $self unless @compile;

  # TODO:
  #   Add unique sorting per default - unless it's a group query
  #   (currently disabled)
  #
  # unless (grep { $_->type eq 'group' } @compile) {
  #   my $mb = $self->builder;
  #   push @compile,
  #     $mb->sort_by($mb->s_field(UNIQUE_FIELD));
  #
  #   if (DEBUG) {
  #     print_log('kq_compile', 'Added unique field ' . UNIQUE_FIELD . ' to order');
  #   };
  # };

  # 1. Introduce required information
  #    e.g. sort(field) => fields(field)
  #
  # TODO:
  #   For all sort fields, it may be beneficial to
  #   retrieve the fields as well - as they need
  #   to be retrieved nonetheless for search criteria
  #   (currently disabled)
  #
  # if ($op->type eq 'sort') {
  #   my $mb = $self->builder;
  #   push @compile,
  #     $mb->enrich($mb->e_fields($op->fields));
  #
  #   if (DEBUG) {
  #     print_log('kq_compile', 'Added sorting ' .
  #       join(',', map { $_->to_string } $op->fields) .
  #       ' to fields');
  #   };
  # };

  # Aggregations and groups invalidate the sort filter
  # optimization, a limit defines the number of top items
  # (if there are multiple limits, the last one in the
  # original order wins)
  my $top_k = 0;
  my $sort_filtering = 1;
  foreach my $op (@compile) {
    my $type = $op->type;

    if ($type eq 'limit') {
      $top_k = $op->start_index + $op->items_per_page;
    }

    # There is at least one aggregation or group option
    elsif ($type eq 'aggregate' || $type eq 'group') {
      $sort_filtering = 0;
    };
  };

  # Sort objects based on a defined order
  @compile = sort {
    $COMPILE_ORDER{$a->type} <=> $COMPILE_ORDER{$b->type}
  } @compile;

  # 2. Find identical types and merge
  #    fields+fields => fields
  #    sort+sort => sort ...
  #    and take the first value for single values
  #    start_index=0 + start_index=2 => start_index=0
  #
  # 3. Remove duplicates
  #    aggr_freq + aggr_freq => - aggr_freq
  for (my $i = 1; $i < @compile; $i++) {
    my $type = $compile[$i]->type;

    # Consecutive types are identical, join
    if ($type eq $compile[$i-1]->type) {

      # Join fields, aggregations or sort criteria
      if ($type eq 'enrich' || $type eq 'aggregate' || $type eq 'sort') {

        # The first operations have higher precedence
        $compile[$i-1]->operations(
          $compile[$i-1]->operations,
          $compile[$i]->operations
        );

        # Remove merged object and check the same
        # index again in the next iteration
        splice(@compile, $i, 1);
        $i--;
      }

      # TODO:
      #   Make single field values work
      #   - start_index
      #   - count

      # Unknown operation
      # NOTE(review): The earlier one of two operations, that can't
      # be merged, is never passed to normalize - confirm that this
      # is intended
      else {
        warn 'Unable to deal with unknown compile operation ' . $type;
      };

      # Don't normalize, as further operations
      # of the same type may follow
      CORE::next;
    };

    # Normalize when no longer consecutive operations
    # can be expected
    $compile[$i-1] = $compile[$i-1]->normalize;
  };

  # Normalize last operation
  $compile[-1] = $compile[-1]->normalize;

  # 4. Optimize
  #    No aggregation or group queries and a limit is given =>
  #    add a sort filter and top_k to the first sort operation
  #    or add top_k to the first sample operation
  if ($sort_filtering && $top_k) {
    foreach my $op (@compile) {
      my $type = $op->type;

      if ($type eq 'sort') {

        # Activate sort_filter option
        $op->filter(1);

        # Set top_k option!
        $op->top_k($top_k);
        last;
      }

      elsif ($type eq 'sample') {

        # Set top_k option!
        $op->top_k($top_k);
        last;
      };
    };
  };

  # Set operations
  $self->operations(@compile);

  return $self;
};
234
Akron89440982017-07-28 14:48:28 +0200235
# Translate all fields to term ids
#
# Replaces every operation by the result of its identify
# method, that is called with the passed dictionary.
# Returns the invocant.
sub identify {
  my ($self, $dict) = @_;

  # The loop variable is an alias, so the assignment
  # replaces the element of the object in place
  foreach my $op (@$self) {
    $op = $op->identify($dict);
  };

  return $self;
};
246
247
# Wrap operations in a single query object
#
# The operations wrap the passed query from the last to the
# first, so the first operation becomes the outermost wrapper.
# Returns the wrapped query (or the passed query, if there
# are no operations).
sub wrap {
  my ($self, $query) = @_;

  my @ops = $self->operations;
  for my $idx (reverse 0 .. $#ops) {
    $query = $ops[$idx]->wrap($query);
  };

  return $query;
};
256
257
# Send to segments
#
# TODO:
#   Not yet implemented - calling this method
#   throws an 'Unimplemented' exception.
sub to_segment {
  ...
};
262
263
# Optimize query
#
# TODO:
#   Not yet implemented - calling this method
#   throws an 'Unimplemented' exception.
sub optimize {
  ...
};
Akron78c49502017-07-27 16:00:36 +0200268
Akron304fdd52017-04-05 01:47:46 +0200269
Akronc3657bf2016-10-31 00:15:43 +01002701;
271
Akronb00c2be2017-08-16 14:45:07 +0200272
Akronc3657bf2016-10-31 00:15:43 +0100273__END__