| Akron | 09ab24b | 2017-08-24 12:45:39 +0200 | [diff] [blame^] | 1 | package Krawfish::Result::Segment::Sort::Sample; |
| 2 | use Krawfish::Log; |
| 3 | use strict; |
| 4 | use warnings; |
| 5 | |
| Akron | 7db79e2 | 2016-12-08 23:02:32 +0100 | [diff] [blame] | 6 | # https://en.wikipedia.org/wiki/Reservoir_sampling |
| Akron | 09ab24b | 2017-08-24 12:45:39 +0200 | [diff] [blame^] | 7 | # https://webkist.wordpress.com/2008/10/01/reservoir-sampling-in-perl/ |
| 8 | # https://blogs.msdn.microsoft.com/spt/2008/02/05/reservoir-sampling/ |
| Akron | 373df82 | 2016-12-28 15:25:14 +0100 | [diff] [blame] | 9 | |
| 10 | # A. Anagnostopoulos, A. Z. Broder, and D. Carmel. Sampling search-engine results. In Proc. of the Fourteenth International World Wide Web Conference, Chiba, Japan, 2005. ACM Press. |
| Akron | 09ab24b | 2017-08-24 12:45:39 +0200 | [diff] [blame^] | 11 | |
| 12 | |
| 13 | # WARNING: |
| 14 | # Sorting does not respect current_match of any nested query, that's why |
| 15 | # sorting is always separated from enriching! |
| 16 | |
| 17 | use constant DEBUG => 1; |
| 18 | |
| 19 | # Create a sample sort of k elements in the list |
| 20 | sub new { |
| 21 | my $class = shift; |
| 22 | bless { |
| 23 | query => shift, |
| 24 | n => shift, # Size of the sample |
| 25 | k => 0, # Items already seen |
| 26 | reservoir => [], |
| 27 | current => undef |
| 28 | }, $class; |
| 29 | }; |
| 30 | |
| 31 | |
| 32 | sub max_freq { |
| 33 | my $self = shift; |
| 34 | my $n = $self->{query}->max_freq; |
| 35 | $n = $n < $self->{n} ? $n : $self->{n}; |
| 36 | return $n; |
| 37 | }; |
| 38 | |
| 39 | # Initialize reservoir |
| 40 | sub _init { |
| 41 | my $self = shift; |
| 42 | |
| 43 | return if $self->{k}; |
| 44 | |
| 45 | if ($self->{query}->next) { |
| 46 | |
| 47 | # Seen next item |
| 48 | $self->{k}++; |
| 49 | |
| 50 | # The reservoir is not filled up yet |
| 51 | if ($self->{k} <= $self->{n}) { |
| 52 | |
| 53 | # Add current match to reservoir |
| 54 | my $current = $self->{query}->current; |
| 55 | push @{$self->{reservoir}}, $current; |
| 56 | } |
| 57 | |
| 58 | # Check if the item should replace another item in the reservoir |
| 59 | elsif (rand(1) <= ($self->{n}/$self->{k})) { |
| 60 | |
| 61 | # Replace random match in reservoir |
| 62 | my $current = $self->{query}->current; |
| 63 | |
| 64 | # TODO: |
| 65 | # Check if $self->{n} is here equivalent to scalar @{$self->{reservoir}} |
| 66 | $self->{reservoir}->[rand($self->{n})] = $current; |
| 67 | } |
| 68 | }; |
| 69 | |
| 70 | return; |
| 71 | }; |
| 72 | |
| 73 | |
| 74 | # Move to next item |
| 75 | sub next { |
| 76 | my $self = shift; |
| 77 | |
| 78 | # Fill reservoir |
| 79 | $self->_init; |
| 80 | |
| 81 | # Get match from reservoir |
| 82 | my $current = shift @{$self->{reservoir}}; |
| 83 | |
| 84 | # There is no more match in reservoir |
| 85 | unless ($current) { |
| 86 | $self->{current} = undef; |
| 87 | return; |
| 88 | }; |
| 89 | |
| 90 | # Set current match |
| 91 | $self->{current} = $current; |
| 92 | return 1; |
| 93 | }; |
| 94 | |
| 95 | |
| 96 | sub current { |
| 97 | $_[0]->{current}; |
| 98 | }; |
| 99 | |
| 100 | |
| 101 | sub match_from_query { |
| 102 | ... |
| 103 | }; |
| 104 | |
| 105 | |
| 106 | sub current_match { |
| 107 | my $self = shift; |
| 108 | my $current = $self->current or return; |
| 109 | my $match = Krawfish::Posting::Match->new( |
| 110 | doc_id => $current->doc_id, |
| 111 | start => $current->start, |
| 112 | end => $current->end, |
| 113 | payload => $current->payload, |
| 114 | ); |
| 115 | |
| 116 | if (DEBUG) { |
| 117 | print_log('sort_sample', 'Current match is ' . $match->to_string); |
| 118 | }; |
| 119 | |
| 120 | return $match; |
| 121 | }; |
| 122 | |
| 123 | sub to_string { |
| 124 | 'sample(' . $_[0]->{n} . ':' . $_[0]->{query}->to_string . ')'; |
| 125 | }; |
| 126 | |
| 127 | |
| 128 | 1; |