| Akron | dc8dceb | 2017-08-22 20:25:39 +0200 | [diff] [blame] | 1 | package Krawfish::Result::Segment::Group::Fields; |
| 2 | use parent 'Krawfish::Result'; |
| 3 | use Krawfish::Posting::Group::Fields; |
| Akron | eb12ac2 | 2017-01-19 00:05:12 +0100 | [diff] [blame] | 4 | use Krawfish::Log; |
| 5 | use strict; |
| 6 | use warnings; |
| 7 | |
| 8 | use constant DEBUG => 0; |
| 9 | |
# This will group matches (especially document matches) by field.
# This is useful e.g. for document browsing per corpus.
#
# Because the grouping is based on ranking, the sorting will be trivial.
#
# TODO:
#   For some mechanisms, it is not necessary to count all occurrences,
#   e.g. to get all keywords used in a certain virtual corpus or all
#   used annotations.
| Akron | eb12ac2 | 2017-01-19 00:05:12 +0100 | [diff] [blame] | 19 | |
# Constructor.
#
# Parameters:
#   $field_obj - object that provides a field pointer via ->pointer
#   $query     - nested query to iterate over
#   $fields    - array reference of fields to group on; entries that are
#                references are resolved with ->term_id, all others are
#                taken over verbatim
#
# Returns the blessed object with an initialized group collector.
sub new {
  my ($class, $field_obj, $query, $fields) = @_;

  # Normalize all requested fields to their identifiers
  my @field_keys = map { ref($_) ? $_->term_id : $_ } @$fields;

  my %state = (
    field_obj   => $field_obj,
    query       => $query,
    field_keys  => \@field_keys,
    last_doc_id => -1
  );

  # Initialize group object on the very same key list
  $state{groups} = Krawfish::Posting::Group::Fields->new($state{field_keys});

  return bless \%state, $class;
};
| 35 | |
| 36 | |
# Lazily create the pointer on the fields.
# Does nothing if the pointer was already created by an earlier call.
sub _init {
  my ($self) = @_;

  # Pointer is already available
  return if $self->{field_pointer};

  if (DEBUG) {
    print_log('g_fields', 'Create pointer on fields');
  };

  # Load the ranked list - may be too large for memory!
  my $field_obj = $self->{field_obj};
  $self->{field_pointer} = $field_obj->pointer;
};
| 48 | |
| 49 | |
# Serialize the operation as 'gFields(#key1,#key2:<query>)',
# where <query> is the string representation of the nested query.
sub to_string {
  my ($self) = @_;

  # Prefix every field key with a hash sign
  my @keys = map { '#' . $_ } @{$self->{field_keys}};
  my $key_str = join(',', @keys);

  # Stringify the nested query
  my $query_str = $self->{query}->to_string;

  return 'gFields(' . $key_str . ':' . $query_str . ')';
};
| 56 | |
| 57 | |
# Shorthand for "search through":
# Call next() until it reports no further result and
# return the object itself.
sub finalize {
  my $self = $_[0];

  # Nothing to do per result - next() does all the bookkeeping
  while ($self->next) {
  };

  return $self;
};
| 63 | |
| 64 | |
# Iterate to the next result.
#
# Advances the nested query to its next match and updates the group
# collector: When the match is in a new document, the cached group
# information is flushed, the field pointer is moved to the document and
# the field values for the requested keys are registered as patterns
# (incr_doc). Every consumed match is counted (incr_match).
#
# Matches in documents the field pointer has no entry for are skipped.
#
# Returns 1 if a match was consumed and 0 if the query is exhausted
# (the cached group information is flushed in that case).
sub next {
  my $self = shift;

  $self->_init;

  my $groups  = $self->{groups};
  my $pointer = $self->{field_pointer};
  my $query   = $self->{query};

  # Loop over the matches, so matches in documents without fields
  # can be skipped without relying on a loop in the caller
  MATCH:
  while ($query->next) {

    # Get the current posting
    my $doc_id = $query->current->doc_id;

    if ($doc_id != $self->{last_doc_id}) {

      # Flush old information
      $groups->flush;

      # There are no fields for this doc
      next MATCH if $pointer->skip_doc($doc_id) != $doc_id;

      # Due to multivalued fields,
      # a document can yield a permutation of
      # patterns, so we recognize this
      my @patterns = ();
      my @field_keys = @{$self->{field_keys}};

      # Ignore stored fields
      my @field_objs = grep { $_->type ne 'store' } $pointer->fields(@field_keys);

      my ($key_pos, $val_pos) = (0,0);

      # Iterate through both lists and create a pattern.
      # Pattern may occur because fields can have multiple values.
      # NOTE(review): This merge presumably expects keys and fields to be
      # sorted ascending by key id - confirm against the callers.
      while ($key_pos < @field_keys) {
        my $field = $field_objs[$val_pos];

        # There are no more values for the position or
        # the next value belongs to a later key
        if (!$field || $field_keys[$key_pos] < $field->key_id) {

          # Initialize the slot first - dereferencing an undefined
          # slot in boolean context is fatal under strict refs
          my $slot = ($patterns[$key_pos] //= []);

          # Add ignorable null term
          push @$slot, 0 unless @$slot;
          $key_pos++;
        }

        # Key identifier are matching
        elsif ($field_keys[$key_pos] == $field->key_id) {

          # Add key to pattern
          $patterns[$key_pos] //= [];
          push @{$patterns[$key_pos]}, $field->term_id;
          $val_pos++;
        }

        # $field_keys[$key_pos] > $field->key_id
        else {

          # Value for a key that is not requested - ignore
          $val_pos++;
        };
      };

      # Register the patterns for the document
      $groups->incr_doc(\@patterns);

      # Set last doc to current doc
      $self->{last_doc_id} = $doc_id;
    };

    # Add to frequencies
    $groups->incr_match;

    return 1;
  };

  # Flush cached results
  $groups->flush;

  return 0;
};
| 159 | |
| 160 | |
# Get the current posting of the nested query
sub current {
  my $self = shift;
  my $query = $self->{query};
  return $query->current;
};
| 164 | |
| 165 | |
# Get collection, i.e. the group object the results are collected in
sub collection {
  my ($self) = @_;
  return $self->{groups};
};
| 170 | |
| 171 | |
| Akron | dc8dceb | 2017-08-22 20:25:39 +0200 | [diff] [blame] | 172 | 1; |
| Akron | 09ab24b | 2017-08-24 12:45:39 +0200 | [diff] [blame] | 173 | |
| 174 | |
| Akron | dc8dceb | 2017-08-22 20:25:39 +0200 | [diff] [blame] | 175 | __END__ |