| Akron | bc9d54c | 2017-01-14 02:27:45 +0100 | [diff] [blame] | 1 | package Krawfish::Result::Group; |
| Akron | 18ff592 | 2017-01-13 10:09:45 +0100 | [diff] [blame] | 2 | use Krawfish::Log; |
| 3 | use strict; |
| 4 | use warnings; |
| 5 | |
| Akron | 8118151 | 2017-01-19 09:52:34 +0100 | [diff] [blame] | 6 | use constant DEBUG => 0; |
| Akron | 555de3b | 2017-01-17 00:27:29 +0100 | [diff] [blame] | 7 | |
| 8 | # TODO: Use Krawfish::Posting::Group; |
| Akron | 18ff592 | 2017-01-13 10:09:45 +0100 | [diff] [blame] | 9 | |
| Akron | eb12ac2 | 2017-01-19 00:05:12 +0100 | [diff] [blame] | 10 | # Group matches based on certain criteria, for example: |
| 11 | # - for record matches |
| 12 | # - metadata! |
| 13 | # - This is useful to group document matches for corpus browsing! |
| Akron | 1f3feac | 2017-05-05 17:05:45 +0200 | [diff] [blame] | 14 | # - BUT: This would probably need a witness mechanism, so for a match, |
| 15 | # some fields can be loaded, e.g. a matching document sigle will return |
| 16 | # the document title. |
| Akron | eb12ac2 | 2017-01-19 00:05:12 +0100 | [diff] [blame] | 17 | # - for span matches |
| 18 | # - metadata |
| 19 | # - this is an extension to facets, where snippet frequencies are grouped |
| 20 | # based on a certain facet. |
| 21 | # - having facets in a first step may improve the distributed aggregation |
| 22 | # (as the central node then knows which facets are most or least common) |
| 23 | # - this grouping doesn't seem beneficial - as the facet view already helps here |
| Akron | 18ff592 | 2017-01-13 10:09:45 +0100 | [diff] [blame] | 24 | # |
| Akron | eb12ac2 | 2017-01-19 00:05:12 +0100 | [diff] [blame] | 25 | # - innertextual! |
| Akron | 18ff592 | 2017-01-13 10:09:45 +0100 | [diff] [blame] | 26 | # - has a certain identical class on surface |
| 27 | # - has the same starting characters of a word |
| 28 | # - has the same ending characters of a word |
| 29 | # - has the same POS of a certain class (this is actually pretty hard!) |
| 30 | # - this may mean to modify the search a bit to lift the posting types |
| 31 | # and make a class, like [orth=der & base/p=*] |
| 32 | # - At least the postingslist of base/p=* should be merged in parallel! |
| 33 | # |
| 34 | # This is already possible in C2 so it needs to be implemented! |
| 35 | |
| Akron | 1f3feac | 2017-05-05 17:05:45 +0200 | [diff] [blame] | 36 | # A group has the following structure for matches: |
| Akron | 18ff592 | 2017-01-13 10:09:45 +0100 | [diff] [blame] | 37 | # { |
| 38 | # criterion => [freq, doc_freq] |
| 39 | # } |
| Akron | 1f3feac | 2017-05-05 17:05:45 +0200 | [diff] [blame] | 40 | # |
| 41 | # For docs, freq and doc_freq are identical |
| 42 | # |
| Akron | bc9d54c | 2017-01-14 02:27:45 +0100 | [diff] [blame] | 43 | # Where criterion is a classed sequence of criteria |
| Akron | 18ff592 | 2017-01-13 10:09:45 +0100 | [diff] [blame] | 44 | # with class information, like |
| 45 | # 1:der|2:Baum => [] |
| Akron | bc9d54c | 2017-01-14 02:27:45 +0100 | [diff] [blame] | 46 | # Sometimes it may indicate tokens instead of classes though ... |
| Akron | 1f3feac | 2017-05-05 17:05:45 +0200 | [diff] [blame] | 47 | # |
| 48 | # With a witness, the group has: |
| 49 | # { |
| 50 | # criterion => [freq, doc_freq, match] |
| 51 | # } |
| 52 | # The match can be anything - so it may even be a first example snippet. |
| 53 | # |
| Akron | 97a7cba | 2017-05-26 13:39:06 +0200 | [diff] [blame] | 54 | # But with multiple class() corpora, there may be more: |
| Akron | 1f3feac | 2017-05-05 17:05:45 +0200 | [diff] [blame] | 55 | # |
| 56 | # { |
| 57 | # criterion => [freq, doc_freq, freq, doc_freq, freq, doc_freq, ...] |
| 58 | # } |
| Akron | 97a7cba | 2017-05-26 13:39:06 +0200 | [diff] [blame] | 59 | # |
| 60 | # or even |
| 61 | # |
| 62 | # { |
| 63 | # criterion => [freq, doc_freq, match, freq, doc_freq, match, freq, doc_freq, match ...] |
| 64 | # } |
| Akron | 1f3feac | 2017-05-05 17:05:45 +0200 | [diff] [blame] | 65 | |
| 66 | |
| 67 | |
| 68 | # WARNING! |
| 69 | # This kind of result can not be limited or sorted on an earlier level, |
| 70 | # as the number of matches is only clear after everything is aggregated. |
| Akron | 18ff592 | 2017-01-13 10:09:45 +0100 | [diff] [blame] | 71 | |
# Construct a grouping result.
#
# Arguments (after the class name):
#   $query     - stored as-is in the 'query' slot; iterated later by _init
#   $criterion - group criterion object, created outside, stored in the
#                'criterion' slot
#
# Returns the new object, blessed into the given class. The position
# starts at -1 and the group list starts out empty.
sub new {
  my ($class, $query, $criterion) = @_;

  my %self = (
    query     => $query,

    # This is a group criterion object, created outside,
    # that defines the criterion
    criterion => $criterion,

    # Position in the group list; -1 means "not yet moved"
    pos       => -1,

    # Group to fill with matches and group info
    # (as class1=>X, class2=>Y)
    groups    => [],
  );

  return bless \%self, $class;
};
| 89 | |
| Akron | 555de3b | 2017-01-17 00:27:29 +0100 | [diff] [blame] | 90 | |
# Go through all matches and aggregate them into groups.
# This could, nonetheless, be implemented like Facets ...
#
# Runs at most once per object (guarded by the 'init' slot). For every
# match delivered by the 'query' object, the group key is requested from
# the 'criterion' object (get_group), and per key the number of matches
# (freq) and the number of documents (doc_freq) are counted.
#
# The result is stored in the 'groups' slot as a list of
#   [criterion, freq, doc_freq]
# entries, sorted by descending freq. Groups with equal freq are ordered
# by their criterion string, so the order does not depend on hash order.
#
# Returns 1 after aggregation, and nothing if aggregation already ran.
#
# NOTE(review): doc_freq counting assumes that the query delivers all
# matches of a document consecutively - confirm against the query API.
sub _init {
  my $self = shift;

  # Aggregate only once
  return if $self->{init}++;

  my $criterion = $self->{criterion};
  my $query     = $self->{query};

  # criterion => [freq, doc_freq]
  my %groups;

  # criterion => doc_id of the last match counted for this group.
  # This has to be remembered per group: with a single shared doc_id,
  # a group first seen in a document that was already counted for
  # another group would never get its doc_freq incremented.
  my %last_doc_id;

  # Iterate over all matches
  while ($query->next) {

    # Get current match if there is any
    my $current = $query->current or last;

    my $key    = $criterion->get_group($current);
    my $doc_id = $current->doc_id;

    # Potentially create new group
    my $group = ($groups{$key} //= [0, 0]);

    # TODO: Should work with classes!
    #   Like
    #   foreach my $nr ($match->get_corpus_classes) {
    #     $group->[$nr * 2]++;
    #   }

    # Increment freq
    $group->[0]++;

    # First match of this group in this document
    if (!defined $last_doc_id{$key} || $last_doc_id{$key} != $doc_id) {

      # Increment doc_freq
      $group->[1]++;

      # TODO: If requested, add a witness!

      $last_doc_id{$key} = $doc_id;
    };
  };

  # Value is stored as [criterion, freq, doc_freq]
  # Sorted by freq by default, ties resolved by criterion string
  my @sorted_keys = sort {
    $groups{$b}->[0] <=> $groups{$a}->[0] || $a cmp $b
  } keys %groups;

  my @array = map { [$_, $groups{$_}->[0], $groups{$_}->[1]] } @sorted_keys;

  # Store for retrieval
  $self->{groups} = \@array;
  return 1;
};
| 148 | |
| 149 | |
# Number of entries currently stored in the 'groups' slot.
sub freq {
  my ($self) = @_;
  my $groups = $self->{groups};
  return scalar @$groups;
};
| 154 | |
| Akron | 97a7cba | 2017-05-26 13:39:06 +0200 | [diff] [blame] | 155 | |
# Move the position to the next group.
#
# Calls _init first, so the groups are aggregated on the first call.
# Returns 1 if the position before the move was smaller than the index
# of the last group, and nothing otherwise. The position counter is
# incremented on every call, regardless of the return value.
sub next {
  my $self = shift;

  $self->_init;

  # Remember the old position and advance in one step
  my $previous   = $self->{pos}++;
  my $last_index = $self->freq - 1;

  return 1 if $previous < $last_index;
  return;
};
| 165 | |
| 166 | |
# Forward declaration of current(); no body for it is defined in the visible part of this file.
| Akron | 555de3b | 2017-01-17 00:27:29 +0100 | [diff] [blame] | 167 | sub current; |
| Akron | 18ff592 | 2017-01-13 10:09:45 +0100 | [diff] [blame] | 168 | |
| 169 | |
# Return information on the group at the current position.
#
# The entry at the current position of the 'groups' slot is flattened
# and handed to the to_hash method of the criterion object; whatever
# to_hash returns is passed through (presumably a hash reference -
# verify against the criterion class).
#
# Returns nothing if there is no current group, i.e. if the position
# is negative or behind the last group.
sub current_group {
  my $self = shift;

  my $pos    = $self->{pos};
  my $groups = $self->{groups};

  # A negative index would silently wrap around to the last group,
  # and an index behind the end would yield undef, which can't be
  # dereferenced below
  return if $pos < 0 || $pos > $#{$groups};

  # Make a hash from criterion
  return $self->{criterion}->to_hash(@{ $groups->[$pos] });
};
| Akron | 18ff592 | 2017-01-13 10:09:45 +0100 | [diff] [blame] | 178 | |
| 179 | |
# Serialize the grouping as
#   groupBy(<criterion>:<query>)
# using the to_string methods of the criterion and the query object.
sub to_string {
  my ($self) = @_;

  # Scalar assignments keep both calls in scalar context
  my $criterion_str = $self->{criterion}->to_string;
  my $query_str     = $self->{query}->to_string;

  return join '', 'groupBy(', $criterion_str, ':', $query_str, ')';
};
| 187 | |
| 188 | 1; |