| Akron | 5cf5fca | 2017-10-09 19:01:47 +0200 | [diff] [blame] | 1 | package Krawfish::Koral::Compile; |
| 2 | use Krawfish::Koral::Compile::Builder; |
| Akron | 304fdd5 | 2017-04-05 01:47:46 +0200 | [diff] [blame] | 3 | use Krawfish::Log; |
| Akron | c3657bf | 2016-10-31 00:15:43 +0100 | [diff] [blame] | 4 | use strict; |
| 5 | use warnings; |
| 6 | |
| Akron | a588d07 | 2017-10-13 14:45:34 +0200 | [diff] [blame] | 7 | # Creation of compilation query |
| 8 | |
| Akron | d15e2bb | 2017-08-11 18:23:14 +0200 | [diff] [blame] | 9 | # WARNING! / TODO! |
| 10 | # An enrichment for fields or snippets (better any enrichments) |
| Akron | 09ab24b | 2017-08-24 12:45:39 +0200 | [diff] [blame] | 11 | # can never wrap around a presort query, because the relevant |
| Akron | a588d07 | 2017-10-13 14:45:34 +0200 | [diff] [blame] | 12 | # data structures and algorithms require the results to be in |
| 13 | # doc_id order! |
| Akron | 78c4950 | 2017-07-27 16:00:36 +0200 | [diff] [blame] | 14 | |
| Akron | 4401f27 | 2017-08-18 16:54:30 +0200 | [diff] [blame] | 15 | # WARNING! |
| 16 | # It's important to remember that sortFilter can't be shared in parallel |
| 17 | # processing - especially for fields, as segment rankings can differ! |
| 18 | |
| Akron | c1ed58c | 2017-08-04 17:26:30 +0200 | [diff] [blame] | 19 | # TODO: |
| Akron | 09ab24b | 2017-08-24 12:45:39 +0200 | [diff] [blame] | 20 | # There are presort and postsort queries. |
| 21 | # Presortqueries don't respect current_query, |
| 22 | # while postsortqueries do! |
| 23 | # Postsortqueries only work on the clusterlevel. |
| 24 | |
| 25 | # TODO: |
| Akron | c1ed58c | 2017-08-04 17:26:30 +0200 | [diff] [blame] | 26 | # When a group filter is added, |
| 27 | # sorting does not work etc. |
| 28 | # This has to be thought through |
| 29 | |
| Akron | 015093d | 2017-10-24 18:47:44 +0200 | [diff] [blame] | 30 | # TODO: |
| 31 | # Remove corpus classes, in case they are not used. |
| 32 | # This requires a method like ->used_classes |
| Akron | d15e2bb | 2017-08-11 18:23:14 +0200 | [diff] [blame] | 33 | |
| Akron | 9687f47 | 2018-02-23 21:17:45 +0100 | [diff] [blame] | 34 | # TODO: |
| 35 | # Support aggregations on groups! |
| 36 | |
# Rank of each operation type in a normalized operation list.
# normalize() sorts operations ascending by this rank, and wrap()
# applies them in reverse, so the type with the lowest rank ends
# up as the outermost wrapper of the query.
our %COMPILE_ORDER;
{
  # Operation types, listed from rank 1 upwards
  my @types_by_rank = qw/
    limit
    cluster_merge
    node_merge
    sort
    sample
    enrich
    group_aggregate
    group
    aggregate
    filter
  /;

  @COMPILE_ORDER{@types_by_rank} = (1 .. scalar @types_by_rank);
}


# Debugging switch (disabled)
use constant DEBUG => 0;
| Akron | 304fdd5 | 2017-04-05 01:47:46 +0200 | [diff] [blame] | 52 | |
| Akron | a588d07 | 2017-10-13 14:45:34 +0200 | [diff] [blame] | 53 | |
# Constructor
#
# All arguments following the class name become the initial
# operations of the array based object.
sub new {
  my ($class, @operations) = @_;
  return bless \@operations, $class;
}
| 59 | |
| Akron | a588d07 | 2017-10-13 14:45:34 +0200 | [diff] [blame] | 60 | |
# Stringification
#
# Serializes every operation with its own to_string method (the
# $id parameter is passed through unchanged) and joins the results
# with commas.
sub to_string {
  my ($self, $id) = @_;

  my @serialized;
  foreach my $op ($self->operations) {
    push @serialized, $op->to_string($id);
  }

  return join ',', @serialized;
}
| 66 | |
| Akron | 78c4950 | 2017-07-27 16:00:36 +0200 | [diff] [blame] | 67 | |
# Get builder object
#
# Every call creates a fresh Krawfish::Koral::Compile::Builder;
# the invocant is not consulted.
sub builder {
  return Krawfish::Koral::Compile::Builder->new();
}
| 72 | |
| 73 | |
# Get or set operations
#
# Without arguments, the stored operations are returned (in scalar
# context this yields their number). With arguments, the stored
# operations are replaced by the arguments and the object itself
# is returned.
sub operations {
  my ($self, @new_operations) = @_;

  # Getter
  return @$self unless @new_operations;

  # Setter
  @$self = @new_operations;
  return $self;
}
| 83 | |
| 84 | |
# Add operation
#
# Appends a single operation to the end of the list. The return
# value is the one of push, i.e. the new number of operations.
sub add {
  my $self = shift;
  my $op   = shift;
  return push @{$self}, $op;
}
| 90 | |
| 91 | |
# Normalize compile object
#
# Brings the list of compile operations into a canonical form:
#   1. Collect global information (the result window requested by
#      a limit, and whether aggregations or groups are involved).
#   2. Order the operations according to %COMPILE_ORDER.
#   3. Merge consecutive operations of the same mergeable type and
#      normalize every remaining operation.
#   4. Pass the filter flag and top_k on to the first sort or
#      sample operation, if a limit is given and neither
#      aggregations nor groups are requested.
#
# Returns the object itself with the normalized operations set.
sub normalize {
  my $self = shift;

  my @compile = $self->operations;

  # Nothing to normalize. Without this guard the normalization of
  # the last operation below would call a method on undef.
  return $self unless @compile;

  # Types for which consecutive operations are joined into one
  my %mergeable = map { $_ => 1 } qw/enrich sort aggregate group_aggregate/;

  # 1. Introduce required information
  # Aggregations and groups invalidate sort filtering,
  # a limit defines the number of top results required
  my $sort_filtering = 1;
  my $top_k = 0;
  foreach my $op (@compile) {
    my $type = $op->type;

    # There is at least one aggregation or group option
    if ($type eq 'aggregate' || $type eq 'group') {
      $sort_filtering = 0;
    }

    # The last limit operation defines the window
    elsif ($type eq 'limit') {
      $top_k = $op->start_index + $op->items_per_page;
    };
  };

  # Disabled:
  # Add unique sorting per default - unless it's a group query
  # unless (grep { $_->type eq 'group' } @compile) {
  #   my $mb = $self->builder;
  #   push @compile,
  #     $mb->sort_by($mb->s_field(UNIQUE_FIELD));
  #
  #   if (DEBUG) {
  #     print_log('kq_compile', 'Added unique field ' . UNIQUE_FIELD . ' to order');
  #   };
  # };

  # Sort objects based on a defined order
  @compile = sort {
    $COMPILE_ORDER{$a->type} <=> $COMPILE_ORDER{$b->type}
  } @compile;

  # 2. Find identical types and merge
  #    fields+fields => fields
  #    sort+sort => sort ...
  #
  # TODO:
  #   Make single field values work
  #   (take the first value for single values, e.g.
  #   start_index=0 + start_index=2 => start_index=0)
  #   - start_index
  #   - count
  my $i = 1;
  while ($i < @compile) {
    my ($prev, $curr) = @compile[$i - 1, $i];
    my $type = $curr->type;

    # Consecutive types are identical
    if ($type eq $prev->type) {

      # Join fields, sortings or aggregations
      if ($mergeable{$type}) {

        # The first operations have higher precedence
        $prev->operations(
          $prev->operations,
          $curr->operations
        );

        # Remove merged object and compare the merged operation
        # with its new successor - the index is not advanced,
        # as further operations of the same type may follow
        splice(@compile, $i, 1);
        CORE::next;
      };

      # Operations of this type can't be joined - both are kept
      warn 'Unable to deal with unknown compile operation ' . $type;
    };

    # 3. Normalize when no longer consecutive operations
    #    can be merged into the preceding operation
    $compile[$i - 1] = $prev->normalize;
    $i++;
  };

  # Normalize last operation
  $compile[-1] = $compile[-1]->normalize;

  # 4. Optimize
  # No aggregation or group queries and a limit is given =>
  # activate the sort filter and set top_k on the first sort
  # operation, or set top_k on the first sample operation,
  # whichever comes first
  if ($sort_filtering && $top_k) {
    foreach my $op (@compile) {
      my $type = $op->type;

      if ($type eq 'sort') {

        # Activate sort_filter option
        $op->filter(1);

        # Set top_k option!
        $op->top_k($top_k);
        last;
      }

      elsif ($type eq 'sample') {

        # Set top_k option!
        $op->top_k($top_k);
        last;
      };
    };
  };

  # Set operations
  $self->operations(@compile);

  return $self;
};
| 227 | |
| Akron | 8944098 | 2017-07-28 14:48:28 +0200 | [diff] [blame] | 228 | |
# Translate all fields to term ids
#
# Each stored operation is replaced in place by whatever its own
# identify method returns for the given dictionary. Returns the
# object itself.
sub identify {
  my ($self, $dict) = @_;

  # The loop variable aliases the stored element,
  # so the assignment replaces the operation in the list
  foreach my $op (@$self) {
    $op = $op->identify($dict);
  }

  return $self;
}
| 239 | |
| 240 | |
# Wrap operations in a single query object
#
# The operations are applied from the last to the first, each one
# wrapping the result of the previous step - the first operation
# therefore becomes the outermost wrapper. Returns the wrapped
# query (or the query unchanged, if there are no operations).
sub wrap {
  my ($self, $query) = @_;

  my @pending = $self->operations;
  while (@pending) {
    my $op = pop @pending;
    $query = $op->wrap($query);
  }

  return $query;
}
| 249 | |
| 250 | |
| Akron | c3657bf | 2016-10-31 00:15:43 +0100 | [diff] [blame] | 251 | 1; |
| 252 | |
| Akron | b00c2be | 2017-08-16 14:45:07 +0200 | [diff] [blame] | 253 | |
| Akron | c3657bf | 2016-10-31 00:15:43 +0100 | [diff] [blame] | 254 | __END__ |