| Akron | 09ab24b | 2017-08-24 12:45:39 +0200 | [diff] [blame] | 1 | package Krawfish::Result::Segment::Sort::Sample; |
| 2 | use Krawfish::Log; |
| 3 | use strict; |
| 4 | use warnings; |
| 5 | |
| Akron | 7db79e2 | 2016-12-08 23:02:32 +0100 | [diff] [blame] | 6 | # https://en.wikipedia.org/wiki/Reservoir_sampling |
| Akron | 09ab24b | 2017-08-24 12:45:39 +0200 | [diff] [blame] | 7 | # https://webkist.wordpress.com/2008/10/01/reservoir-sampling-in-perl/ |
| 8 | # https://blogs.msdn.microsoft.com/spt/2008/02/05/reservoir-sampling/ |
| Akron | 373df82 | 2016-12-28 15:25:14 +0100 | [diff] [blame] | 9 | |
| 10 | # A. Anagnostopoulos, A. Z. Broder, and D. Carmel. Sampling search-engine results. In Proc. of the Fourteenth International World Wide Web Conference, Chiba, Japan, 2005. ACM Press. |
| Akron | 09ab24b | 2017-08-24 12:45:39 +0200 | [diff] [blame] | 11 | |
| 12 | |
| 13 | # WARNING: |
| 14 | # Sorting does not respect current_match of any nested query, that's why |
| 15 | # sorting is always separated from enriching! |
| 16 | |
| 17 | use constant DEBUG => 1; |
| 18 | |
| 19 | # Create a sample sort of k elements in the list |
| 20 | sub new { |
| 21 | my $class = shift; |
| 22 | bless { |
| 23 | query => shift, |
| 24 | n => shift, # Size of the sample |
| 25 | k => 0, # Items already seen |
| 26 | reservoir => [], |
| 27 | current => undef |
| 28 | }, $class; |
| 29 | }; |
| 30 | |
| 31 | |
| 32 | sub max_freq { |
| 33 | my $self = shift; |
| 34 | my $n = $self->{query}->max_freq; |
| 35 | $n = $n < $self->{n} ? $n : $self->{n}; |
| 36 | return $n; |
| 37 | }; |
| 38 | |
| 39 | # Initialize reservoir |
| 40 | sub _init { |
| 41 | my $self = shift; |
| 42 | |
| 43 | return if $self->{k}; |
| 44 | |
| Akron | d913514 | 2017-08-25 15:45:17 +0200 | [diff] [blame^] | 45 | if (DEBUG) { |
| 46 | print_log('r_s_sample', 'Initialize sampling, meaning iterate over all items'); |
| 47 | }; |
| 48 | |
| 49 | while ($self->{query}->next) { |
| Akron | 09ab24b | 2017-08-24 12:45:39 +0200 | [diff] [blame] | 50 | |
| 51 | # Seen next item |
| 52 | $self->{k}++; |
| 53 | |
| Akron | d913514 | 2017-08-25 15:45:17 +0200 | [diff] [blame^] | 54 | if (DEBUG) { |
| 55 | print_log('r_s_sample', 'Found item ' . $self->{k}); |
| 56 | }; |
| 57 | |
| Akron | 09ab24b | 2017-08-24 12:45:39 +0200 | [diff] [blame] | 58 | # The reservoir is not filled up yet |
| 59 | if ($self->{k} <= $self->{n}) { |
| 60 | |
| Akron | d913514 | 2017-08-25 15:45:17 +0200 | [diff] [blame^] | 61 | if (DEBUG) { |
| 62 | print_log('r_s_sample', 'Add item ' . $self->{k} . ' to reservoir'); |
| 63 | }; |
| 64 | |
| Akron | 09ab24b | 2017-08-24 12:45:39 +0200 | [diff] [blame] | 65 | # Add current match to reservoir |
| 66 | my $current = $self->{query}->current; |
| 67 | push @{$self->{reservoir}}, $current; |
| 68 | } |
| 69 | |
| 70 | # Check if the item should replace another item in the reservoir |
| 71 | elsif (rand(1) <= ($self->{n}/$self->{k})) { |
| 72 | |
| Akron | d913514 | 2017-08-25 15:45:17 +0200 | [diff] [blame^] | 73 | my $item = int(rand($self->{n})); |
| 74 | |
| 75 | if (DEBUG) { |
| 76 | print_log('r_s_sample', $self->{n} . ' == ' . scalar @{$self->{reservoir}}); |
| 77 | print_log('r_s_sample', "Overwrite item $item with item " . $self->{k}); |
| 78 | }; |
| 79 | |
| Akron | 09ab24b | 2017-08-24 12:45:39 +0200 | [diff] [blame] | 80 | # Replace random match in reservoir |
| 81 | my $current = $self->{query}->current; |
| 82 | |
| 83 | # TODO: |
| 84 | # Check if $self->{n} is here equivalent to scalar @{$self->{reservoir}} |
| Akron | d913514 | 2017-08-25 15:45:17 +0200 | [diff] [blame^] | 85 | $self->{reservoir}->[$item] = $current; |
| Akron | 09ab24b | 2017-08-24 12:45:39 +0200 | [diff] [blame] | 86 | } |
| Akron | d913514 | 2017-08-25 15:45:17 +0200 | [diff] [blame^] | 87 | |
| 88 | elsif (DEBUG) { |
| 89 | print_log('r_s_sample', 'Ignore item ' . $self->{k}); |
| 90 | }; |
| Akron | 09ab24b | 2017-08-24 12:45:39 +0200 | [diff] [blame] | 91 | }; |
| 92 | |
| 93 | return; |
| 94 | }; |
| 95 | |
| 96 | |
| 97 | # Move to next item |
| 98 | sub next { |
| 99 | my $self = shift; |
| 100 | |
| 101 | # Fill reservoir |
| 102 | $self->_init; |
| 103 | |
| 104 | # Get match from reservoir |
| 105 | my $current = shift @{$self->{reservoir}}; |
| 106 | |
| 107 | # There is no more match in reservoir |
| 108 | unless ($current) { |
| 109 | $self->{current} = undef; |
| 110 | return; |
| 111 | }; |
| 112 | |
| 113 | # Set current match |
| 114 | $self->{current} = $current; |
| 115 | return 1; |
| 116 | }; |
| 117 | |
| 118 | |
| 119 | sub current { |
| 120 | $_[0]->{current}; |
| 121 | }; |
| 122 | |
| 123 | |
| 124 | sub match_from_query { |
| 125 | ... |
| 126 | }; |
| 127 | |
| 128 | |
| 129 | sub current_match { |
| 130 | my $self = shift; |
| 131 | my $current = $self->current or return; |
| 132 | my $match = Krawfish::Posting::Match->new( |
| 133 | doc_id => $current->doc_id, |
| 134 | start => $current->start, |
| 135 | end => $current->end, |
| 136 | payload => $current->payload, |
| 137 | ); |
| 138 | |
| 139 | if (DEBUG) { |
| 140 | print_log('sort_sample', 'Current match is ' . $match->to_string); |
| 141 | }; |
| 142 | |
| 143 | return $match; |
| 144 | }; |
| 145 | |
| 146 | sub to_string { |
| 147 | 'sample(' . $_[0]->{n} . ':' . $_[0]->{query}->to_string . ')'; |
| 148 | }; |
| 149 | |
| 150 | |
| 151 | 1; |