blob: 8bb1a16143767221ce287de8456e1b30f9a07aa2 [file] [log] [blame]
Akron09ab24b2017-08-24 12:45:39 +02001package Krawfish::Result::Segment::Sort::Sample;
2use Krawfish::Log;
3use strict;
4use warnings;
5
Akron7db79e22016-12-08 23:02:32 +01006# https://en.wikipedia.org/wiki/Reservoir_sampling
Akron09ab24b2017-08-24 12:45:39 +02007# https://webkist.wordpress.com/2008/10/01/reservoir-sampling-in-perl/
8# https://blogs.msdn.microsoft.com/spt/2008/02/05/reservoir-sampling/
Akron373df822016-12-28 15:25:14 +01009
10# A. Anagnostopoulos, A. Z. Broder, and D. Carmel. Sampling search-engine results. In Proc. of the Fourteenth International World Wide Web Conference, Chiba, Japan, 2005. ACM Press.
Akron09ab24b2017-08-24 12:45:39 +020011
12
13# WARNING:
14# Sorting does not respect current_match of any nested query, that's why
15# sorting is always separated from enriching!
16
17use constant DEBUG => 1;
18
19# Create a sample sort of k elements in the list
20sub new {
21 my $class = shift;
22 bless {
23 query => shift,
24 n => shift, # Size of the sample
25 k => 0, # Items already seen
26 reservoir => [],
27 current => undef
28 }, $class;
29};
30
31
32sub max_freq {
33 my $self = shift;
34 my $n = $self->{query}->max_freq;
35 $n = $n < $self->{n} ? $n : $self->{n};
36 return $n;
37};
38
39# Initialize reservoir
40sub _init {
41 my $self = shift;
42
43 return if $self->{k};
44
45 if ($self->{query}->next) {
46
47 # Seen next item
48 $self->{k}++;
49
50 # The reservoir is not filled up yet
51 if ($self->{k} <= $self->{n}) {
52
53 # Add current match to reservoir
54 my $current = $self->{query}->current;
55 push @{$self->{reservoir}}, $current;
56 }
57
58 # Check if the item should replace another item in the reservoir
59 elsif (rand(1) <= ($self->{n}/$self->{k})) {
60
61 # Replace random match in reservoir
62 my $current = $self->{query}->current;
63
64 # TODO:
65 # Check if $self->{n} is here equivalent to scalar @{$self->{reservoir}}
66 $self->{reservoir}->[rand($self->{n})] = $current;
67 }
68 };
69
70 return;
71};
72
73
74# Move to next item
75sub next {
76 my $self = shift;
77
78 # Fill reservoir
79 $self->_init;
80
81 # Get match from reservoir
82 my $current = shift @{$self->{reservoir}};
83
84 # There is no more match in reservoir
85 unless ($current) {
86 $self->{current} = undef;
87 return;
88 };
89
90 # Set current match
91 $self->{current} = $current;
92 return 1;
93};
94
95
96sub current {
97 $_[0]->{current};
98};
99
100
101sub match_from_query {
102 ...
103};
104
105
106sub current_match {
107 my $self = shift;
108 my $current = $self->current or return;
109 my $match = Krawfish::Posting::Match->new(
110 doc_id => $current->doc_id,
111 start => $current->start,
112 end => $current->end,
113 payload => $current->payload,
114 );
115
116 if (DEBUG) {
117 print_log('sort_sample', 'Current match is ' . $match->to_string);
118 };
119
120 return $match;
121};
122
123sub to_string {
124 'sample(' . $_[0]->{n} . ':' . $_[0]->{query}->to_string . ')';
125};
126
127
1281;