| Akron | 5cf5fca | 2017-10-09 19:01:47 +0200 | [diff] [blame] | 1 | package Krawfish::Koral::Compile; |
| 2 | use Krawfish::Koral::Compile::Builder; |
| Akron | 304fdd5 | 2017-04-05 01:47:46 +0200 | [diff] [blame] | 3 | use Krawfish::Log; |
| Akron | c3657bf | 2016-10-31 00:15:43 +0100 | [diff] [blame] | 4 | use strict; |
| 5 | use warnings; |
| 6 | |
| Akron | a588d07 | 2017-10-13 14:45:34 +0200 | [diff] [blame] | 7 | # Creation of compilation query |
| 8 | |
| Akron | d15e2bb | 2017-08-11 18:23:14 +0200 | [diff] [blame] | 9 | # WARNING! / TODO! |
| 10 | # An enrichment for fields or snippets (or rather: any enrichment) |
| Akron | 09ab24b | 2017-08-24 12:45:39 +0200 | [diff] [blame] | 11 | # can never wrap around a presort query, because the relevant |
| Akron | a588d07 | 2017-10-13 14:45:34 +0200 | [diff] [blame] | 12 | # data structures and algorithms require the results to be in |
| 13 | # doc_id order! |
| Akron | 78c4950 | 2017-07-27 16:00:36 +0200 | [diff] [blame] | 14 | |
| Akron | 4401f27 | 2017-08-18 16:54:30 +0200 | [diff] [blame] | 15 | # WARNING! |
| 16 | # It's important to remember that sortFilter can't be shared in parallel |
| 17 | # processing - especially for fields, as segment rankings can differ! |
| 18 | |
| Akron | c1ed58c | 2017-08-04 17:26:30 +0200 | [diff] [blame] | 19 | # TODO: |
| Akron | 09ab24b | 2017-08-24 12:45:39 +0200 | [diff] [blame] | 20 | # There are presort and postsort queries. |
| 21 | # Presort queries don't respect current_query, |
| 22 | # while postsort queries do! |
| 23 | # Postsort queries only work on the cluster level. |
| 24 | |
| 25 | # TODO: |
| Akron | c1ed58c | 2017-08-04 17:26:30 +0200 | [diff] [blame] | 26 | # When a group filter is added, |
| 27 | # sorting does not work etc. |
| 28 | # This has to be thought through |
| 29 | |
| Akron | 015093d | 2017-10-24 18:47:44 +0200 | [diff] [blame] | 30 | # TODO: |
| 31 | # Remove corpus classes, in case they are not used. |
| 32 | # This requires a method like ->used_classes |
| Akron | d15e2bb | 2017-08-11 18:23:14 +0200 | [diff] [blame] | 33 | |
# Rank of each operation type in the normalized operation list.
# normalize() sorts the operations ascending by this rank,
# so 'limit' comes first and 'filter' comes last.
our %COMPILE_ORDER;
@COMPILE_ORDER{qw/limit sort sample enrich aggregate group filter/} = (1 .. 7);
| 43 | |
| Akron | a588d07 | 2017-10-13 14:45:34 +0200 | [diff] [blame] | 44 | |
# Compile-time constants.
# DEBUG guards the (currently disabled) debug logging in normalize().
# UNIQUE_FIELD names the field used by the (currently disabled)
# default unique sorting in normalize().
use constant DEBUG        => 0;
use constant UNIQUE_FIELD => 'id';
| 49 | |
| Akron | a588d07 | 2017-10-13 14:45:34 +0200 | [diff] [blame] | 50 | |
# Constructor
#
# Takes the class name followed by a (possibly empty) list of
# operations and returns an array-based object holding a copy
# of that list.
sub new {
  my ($class, @operations) = @_;
  return bless \@operations, $class;
};
| 56 | |
| Akron | a588d07 | 2017-10-13 14:45:34 +0200 | [diff] [blame] | 57 | |
# Stringification
#
# Serializes every operation via its own to_string method
# (the optional $id argument is passed through unchanged)
# and joins the results with commas.
sub to_string {
  my ($self, $id) = @_;

  my @serialized;
  foreach my $op ($self->operations) {
    push @serialized, $op->to_string($id);
  };

  return join(',', @serialized);
};
| 63 | |
| Akron | 78c4950 | 2017-07-27 16:00:36 +0200 | [diff] [blame] | 64 | |
# Get builder object
#
# Returns a fresh Krawfish::Koral::Compile::Builder instance
# on every call; arguments are ignored.
sub builder {
  return 'Krawfish::Koral::Compile::Builder'->new;
};
| 69 | |
| 70 | |
# Get or set operations
#
# Without arguments: returns the list of operations
# (the number of operations in scalar context).
# With arguments: replaces all operations by the given list
# and returns the invocant.
sub operations {
  my ($self, @replacement) = @_;

  # Getter
  return @$self unless @replacement;

  # Setter
  @$self = @replacement;
  return $self;
};
| 80 | |
| 81 | |
# Normalize compile object
#
# Brings the list of operations into a canonical form:
#   1. Collect global information (group or aggregation present,
#      size of the requested limit window)
#   2. Sort the operations by %COMPILE_ORDER
#   3. Merge consecutive operations of the same type
#      (enrich, aggregate, sort) and normalize the remaining ones
#   4. If neither aggregation nor grouping is requested and a limit
#      is given, pass top_k to the first sort or sample operation
#      (a sort operation additionally gets its filter activated)
#
# Returns the invocant. A compile object without any operations
# is returned unchanged.
sub normalize {
  my $self = shift;

  my @compile = $self->operations;

  # Nothing to normalize. Without this guard the final
  # $compile[-1]->normalize would be invoked on undef and die.
  return $self unless @compile;

  # Only referenced by the currently disabled code below
  my $mb = $self->builder;

  # Collect information from all operations:
  # - a group query invalidates some compile operations
  # - aggregations and groups prohibit sort filtering
  # - a limit defines how many top results are required
  #   (with several limits, the last one in the list counts)
  my $group_query    = 0;
  my $sort_filtering = 1;
  my $top_k          = 0;

  foreach my $op (@compile) {
    my $type = $op->type;

    if ($type eq 'group') {
      $group_query    = 1;
      $sort_filtering = 0;
    }
    elsif ($type eq 'aggregate') {
      $sort_filtering = 0;
    }
    elsif ($type eq 'limit') {
      $top_k = $op->start_index + $op->items_per_page;
    }
  };

  # Disabled:
  # Add unique sorting per default - unless it's a group query
  # unless ($group_query) {
  #   push @compile,
  #     $mb->sort_by($mb->s_field(UNIQUE_FIELD));
  #
  #   if (DEBUG) {
  #     print_log('kq_compile', 'Added unique field ' . UNIQUE_FIELD . ' to order');
  #   };
  # };

  # Disabled:
  # 1. Introduce required information
  #    e.g. sort(field) => fields(field)
  # For all sort fields, it may be beneficial to
  # retrieve the fields as well - as they need
  # to be retrieved nonetheless for search criteria
  # if ($compile[$i]->type eq 'sort') {
  #
  #   my $mb = $self->builder;
  #   push @compile,
  #     $mb->enrich($mb->e_fields($compile[$i]->fields));
  #
  #   if (DEBUG) {
  #     print_log('kq_compile', 'Added sorting ' .
  #       join(',', map {$_->to_string } $compile[$i]->fields) .
  #       ' to fields');
  #   };
  # }

  # Sort objects based on a defined order
  @compile = sort {
    $COMPILE_ORDER{$a->type} <=> $COMPILE_ORDER{$b->type}
  } @compile;

  # 2. Find identical types and merge
  #    fields+fields => fields
  #    sort+sort => sort ...
  #    and take the first value for single values
  #    start_index=0 + start_index=2 => start_index=0
  #
  # 3. Remove duplicates
  #    aggr_freq + aggr_freq => - aggr_freq
  my %mergeable = map { $_ => 1 } qw/enrich aggregate sort/;

  for (my $i = 1; $i < @compile; $i++) {
    my ($previous, $current) = @compile[$i - 1, $i];

    # Consecutive types are identical, join
    if ($current->type eq $previous->type) {

      # Join fields, aggregations or sortings
      if ($mergeable{$current->type}) {

        # The first operations have higher precedence
        $previous->operations(
          $previous->operations,
          $current->operations
        );

        # Remove merged object and revisit the same index,
        # as the next element has moved into this position
        splice(@compile, $i, 1);
        $i--;
      }

      # TODO:
      #   Make single field values work
      #   - start_index
      #   - count

      # Unknown operation
      else {
        warn 'Unable to deal with unknown compile operation ' . $current->type;
      };

      # Don't normalize nonmerged data
      CORE::next;
    };

    # Normalize when no longer consecutive operations
    # can be expected
    $compile[$i - 1] = $previous->normalize;
  };

  # Normalize last operation
  $compile[-1] = $compile[-1]->normalize;

  # 4. Optimize
  #    No aggregation or group queries =>
  #    add a sort filter to sort
  #    If a limit is given, add top_k to sort
  if ($sort_filtering && $top_k) {
    foreach my $op (@compile) {
      my $type = $op->type;

      if ($type eq 'sort') {

        # Activate sort_filter option
        $op->filter(1);

        # Set top_k option!
        $op->top_k($top_k);
        last;
      }
      elsif ($type eq 'sample') {

        # Set top_k option!
        $op->top_k($top_k);
        last;
      }
    };
  };

  # Set operations
  $self->operations(@compile);

  return $self;
};
| 234 | |
| Akron | 8944098 | 2017-07-28 14:48:28 +0200 | [diff] [blame] | 235 | |
# Translate all fields to term ids
#
# Replaces every operation by the result of calling its identify
# method with the given dictionary. Returns the invocant.
sub identify {
  my ($self, $dict) = @_;

  # The loop variable aliases the array element,
  # so the assignment replaces the operation in place
  foreach my $op (@$self) {
    $op = $op->identify($dict);
  };

  return $self;
};
| 246 | |
| 247 | |
# Wrap operations in a single query object
#
# Applies the operations from last to first, so the first
# operation in the list ends up as the outermost wrapper.
# Returns the wrapped query (or the query itself, if there
# are no operations).
sub wrap {
  my ($self, $query) = @_;

  my @pending = $self->operations;
  while (@pending) {
    my $op = pop @pending;
    $query = $op->wrap($query);
  };

  return $query;
};
| 256 | |
| 257 | |
# Send to segments
#
# Not implemented yet: the body is Perl's yada-yada statement,
# which dies with "Unimplemented" when the method is called.
sub to_segment {
  ...
};
| 262 | |
| 263 | |
# Optimize query
#
# Not implemented yet: the body is Perl's yada-yada statement,
# which dies with "Unimplemented" when the method is called.
sub optimize {
  ...
};
| Akron | 78c4950 | 2017-07-27 16:00:36 +0200 | [diff] [blame] | 268 | |
| Akron | 304fdd5 | 2017-04-05 01:47:46 +0200 | [diff] [blame] | 269 | |
| Akron | c3657bf | 2016-10-31 00:15:43 +0100 | [diff] [blame] | 270 | 1; |
| 271 | |
| Akron | b00c2be | 2017-08-16 14:45:07 +0200 | [diff] [blame] | 272 | |
| Akron | c3657bf | 2016-10-31 00:15:43 +0100 | [diff] [blame] | 273 | __END__ |