blob: 70def45c688312a4c4e562e328816c9bd774b9e2 [file] [log] [blame]
Akron09ab24b2017-08-24 12:45:39 +02001package Krawfish::Result::Segment::Sort::Sample;
2use Krawfish::Log;
3use strict;
4use warnings;
5
Akron7db79e22016-12-08 23:02:32 +01006# https://en.wikipedia.org/wiki/Reservoir_sampling
Akron09ab24b2017-08-24 12:45:39 +02007# https://webkist.wordpress.com/2008/10/01/reservoir-sampling-in-perl/
8# https://blogs.msdn.microsoft.com/spt/2008/02/05/reservoir-sampling/
Akron373df822016-12-28 15:25:14 +01009
10# A. Anagnostopoulos, A. Z. Broder, and D. Carmel. Sampling search-engine results. In Proc. of the Fourteenth International World Wide Web Conference, Chiba, Japan, 2005. ACM Press.
Akron09ab24b2017-08-24 12:45:39 +020011
12
13# WARNING:
14# Sorting does not respect current_match of any nested query, that's why
15# sorting is always separated from enriching!
16
17use constant DEBUG => 1;
18
19# Create a sample sort of k elements in the list
20sub new {
21 my $class = shift;
22 bless {
23 query => shift,
24 n => shift, # Size of the sample
25 k => 0, # Items already seen
26 reservoir => [],
27 current => undef
28 }, $class;
29};
30
31
32sub max_freq {
33 my $self = shift;
34 my $n = $self->{query}->max_freq;
35 $n = $n < $self->{n} ? $n : $self->{n};
36 return $n;
37};
38
39# Initialize reservoir
40sub _init {
41 my $self = shift;
42
43 return if $self->{k};
44
Akrond9135142017-08-25 15:45:17 +020045 if (DEBUG) {
46 print_log('r_s_sample', 'Initialize sampling, meaning iterate over all items');
47 };
48
49 while ($self->{query}->next) {
Akron09ab24b2017-08-24 12:45:39 +020050
51 # Seen next item
52 $self->{k}++;
53
Akrond9135142017-08-25 15:45:17 +020054 if (DEBUG) {
55 print_log('r_s_sample', 'Found item ' . $self->{k});
56 };
57
Akron09ab24b2017-08-24 12:45:39 +020058 # The reservoir is not filled up yet
59 if ($self->{k} <= $self->{n}) {
60
Akrond9135142017-08-25 15:45:17 +020061 if (DEBUG) {
62 print_log('r_s_sample', 'Add item ' . $self->{k} . ' to reservoir');
63 };
64
Akron09ab24b2017-08-24 12:45:39 +020065 # Add current match to reservoir
66 my $current = $self->{query}->current;
67 push @{$self->{reservoir}}, $current;
68 }
69
70 # Check if the item should replace another item in the reservoir
71 elsif (rand(1) <= ($self->{n}/$self->{k})) {
72
Akrond9135142017-08-25 15:45:17 +020073 my $item = int(rand($self->{n}));
74
75 if (DEBUG) {
76 print_log('r_s_sample', $self->{n} . ' == ' . scalar @{$self->{reservoir}});
77 print_log('r_s_sample', "Overwrite item $item with item " . $self->{k});
78 };
79
Akron09ab24b2017-08-24 12:45:39 +020080 # Replace random match in reservoir
81 my $current = $self->{query}->current;
82
83 # TODO:
84 # Check if $self->{n} is here equivalent to scalar @{$self->{reservoir}}
Akrond9135142017-08-25 15:45:17 +020085 $self->{reservoir}->[$item] = $current;
Akron09ab24b2017-08-24 12:45:39 +020086 }
Akrond9135142017-08-25 15:45:17 +020087
88 elsif (DEBUG) {
89 print_log('r_s_sample', 'Ignore item ' . $self->{k});
90 };
Akron09ab24b2017-08-24 12:45:39 +020091 };
92
93 return;
94};
95
96
97# Move to next item
98sub next {
99 my $self = shift;
100
101 # Fill reservoir
102 $self->_init;
103
104 # Get match from reservoir
105 my $current = shift @{$self->{reservoir}};
106
107 # There is no more match in reservoir
108 unless ($current) {
109 $self->{current} = undef;
110 return;
111 };
112
113 # Set current match
114 $self->{current} = $current;
115 return 1;
116};
117
118
119sub current {
120 $_[0]->{current};
121};
122
123
124sub match_from_query {
125 ...
126};
127
128
129sub current_match {
130 my $self = shift;
131 my $current = $self->current or return;
132 my $match = Krawfish::Posting::Match->new(
133 doc_id => $current->doc_id,
134 start => $current->start,
135 end => $current->end,
136 payload => $current->payload,
137 );
138
139 if (DEBUG) {
140 print_log('sort_sample', 'Current match is ' . $match->to_string);
141 };
142
143 return $match;
144};
145
146sub to_string {
147 'sample(' . $_[0]->{n} . ':' . $_[0]->{query}->to_string . ')';
148};
149
150
1511;