Added preliminary filtering
diff --git a/lib/Krawfish/Koral/Meta.pm b/lib/Krawfish/Koral/Meta.pm
index 5587b2a..d5eb52c 100644
--- a/lib/Krawfish/Koral/Meta.pm
+++ b/lib/Krawfish/Koral/Meta.pm
@@ -6,7 +6,7 @@
# WARNING! / TODO!
# An enrichment for fields or snippets (better any enrichments)
-# can never wrap around a sort query, because the relevant
+# can never wrap around a presort query, because the relevant
# data structures and algorithms require the results to be in doc_id order!
# WARNING!
@@ -14,6 +14,12 @@
# processing - especially for fields, as segment rankings can differ!
# TODO:
+# There are presort and postsort queries.
+# Presort queries don't respect current_query,
+# while postsort queries do!
+# Postsort queries only work on the cluster level.
+
+# TODO:
# When a group filter is added,
# sorting does not work etc.
# This has to be thought through
@@ -22,10 +28,11 @@
our %META_ORDER = (
limit => 1,
sort => 2,
- enrich => 3,
- aggregate => 4,
- group => 5,
- filter => 6
+ sample => 3,
+ enrich => 4,
+ aggregate => 5,
+ group => 6,
+ filter => 7
);
use constant {
@@ -197,6 +204,12 @@
# Set top_k option!
$_->top_k($top_k) if $top_k;
last;
+ }
+
+ elsif ($_->type eq 'sample') {
+ # Set top_k option!
+ $_->top_k($top_k) if $top_k;
+ last;
};
};
};
diff --git a/lib/Krawfish/Koral/Meta/Builder.pm b/lib/Krawfish/Koral/Meta/Builder.pm
index dfcb757..1d67e61 100644
--- a/lib/Krawfish/Koral/Meta/Builder.pm
+++ b/lib/Krawfish/Koral/Meta/Builder.pm
@@ -9,6 +9,7 @@
use Krawfish::Koral::Meta::Limit;
use Krawfish::Koral::Meta::Sort;
use Krawfish::Koral::Meta::Sort::Field;
+use Krawfish::Koral::Meta::Sort::Sample;
use Krawfish::Koral::Meta::Aggregate::Frequencies;
use Krawfish::Koral::Meta::Aggregate::Fields;
use Krawfish::Koral::Meta::Aggregate::Length;
@@ -150,6 +151,13 @@
);
};
+
+# Create a sample criterion - the argument is the size of the sample
+sub s_sample {
+  my (undef, $size) = @_;
+  return Krawfish::Koral::Meta::Sort::Sample->new($size);
+};
+
# TODO:
# s_class (sort by the surface form of a class, necessary for concordances)
diff --git a/lib/Krawfish/Koral/Meta/Limit.pm b/lib/Krawfish/Koral/Meta/Limit.pm
index cf458bb..ed2dce6 100644
--- a/lib/Krawfish/Koral/Meta/Limit.pm
+++ b/lib/Krawfish/Koral/Meta/Limit.pm
@@ -32,6 +32,12 @@
sub wrap {
my ($self, $query) = @_;
+ # For sampling, limiting has no effect
+ if ($query->type eq 'sort') {
+ $query->top_k($self->items_per_type);
+ return $query;
+ };
+
return Krawfish::Koral::Meta::Node::Limit->new(
$query,
$self->start_index,
diff --git a/lib/Krawfish/Koral/Meta/Node/Limit.pm b/lib/Krawfish/Koral/Meta/Node/Limit.pm
index 0ffc401..862611e 100644
--- a/lib/Krawfish/Koral/Meta/Node/Limit.pm
+++ b/lib/Krawfish/Koral/Meta/Node/Limit.pm
@@ -1,4 +1,8 @@
package Krawfish::Koral::Meta::Node::Limit;
+
+# TODO:
+# Move this to segment
+use Krawfish::Result::Limit;
use Krawfish::Log;
use strict;
use warnings;
@@ -38,4 +42,20 @@
};
+# Optimize the nested query for the segment and limit its matches
+sub optimize {
+  my $self = shift;
+  my $segment = shift;
+
+  my $inner = $self->{query}->optimize($segment);
+
+  # Nothing to limit if the maximum frequency is 0
+  return Krawfish::Query::Nothing->new if $inner->max_freq == 0;
+
+  # Pass start index and items per page to Krawfish::Result::Limit
+  return Krawfish::Result::Limit->new(
+    $inner, @{$self}{qw/start_index items_per_page/}
+  );
+};
+
1;
diff --git a/lib/Krawfish/Koral/Meta/Node/Sort/Sample.pm b/lib/Krawfish/Koral/Meta/Node/Sort/Sample.pm
new file mode 100644
index 0000000..827b2c5
--- /dev/null
+++ b/lib/Krawfish/Koral/Meta/Node/Sort/Sample.pm
@@ -0,0 +1,61 @@
+package Krawfish::Koral::Meta::Node::Sort::Sample;
+use Krawfish::Result::Segment::Sort::Sample;
+use strict;
+use warnings;
+
+# Meta node wrapping a nested query, from which
+# a sample of n matches is taken.
+
+# Constructor - expects the nested query
+# and the size of the sample
+sub new {
+  my ($class, $query, $n) = @_;
+  return bless {
+    query => $query,
+    n     => $n
+  }, $class;
+};
+
+# The type of this node
+sub type {
+  return 'sample';
+};
+
+# Pass the dictionary on to the nested query
+# and keep whatever it returns in its place
+sub identify {
+  my $self = shift;
+  my $dict = shift;
+
+  my $identified = $self->{query}->identify($dict);
+  $self->{query} = $identified;
+
+  $self;
+};
+
+# Optimize the nested query for the segment and
+# wrap it in the sampler for this segment
+sub optimize {
+  my $self = shift;
+  my $segment = shift;
+
+  my $optimized = $self->{query}->optimize($segment);
+
+  # Nothing to sample if the maximum frequency is 0
+  # NOTE(review): Krawfish::Query::Nothing is not loaded
+  # in this package - presumably loaded elsewhere, confirm
+  return Krawfish::Query::Nothing->new if $optimized->max_freq == 0;
+
+  return Krawfish::Result::Segment::Sort::Sample->new(
+    $optimized, $self->{n}
+  );
+};
+
+# Serialize the node with its sample size and nested query
+sub to_string {
+  my $self = shift;
+  my $inner = $self->{query}->to_string;
+  return "sample($self->{n}:$inner)";
+};
+
+1;
diff --git a/lib/Krawfish/Koral/Meta/Sort.pm b/lib/Krawfish/Koral/Meta/Sort.pm
index f4da685..a72ea18 100644
--- a/lib/Krawfish/Koral/Meta/Sort.pm
+++ b/lib/Krawfish/Koral/Meta/Sort.pm
@@ -10,6 +10,11 @@
# TODO:
# Support top_k setting from limit!
+# TODO:
+# Not all sortings are compatible,
+# e.g. sample cannot be mixed with
+# another sorting!
+
sub new {
my $class = shift;
@@ -88,7 +93,17 @@
my $self = shift;
my @unique;
my %unique;
+ my $sampling = 0;
foreach (@{$self->{sort}}) {
+
+ # Sampling can't be combined with other sorting
+ # mechanisms - and it can't be filtered,
+ # so return directly
+ if ($_->type eq 'sample') {
+ $_->top_k($self->top_k);
+ return $_;
+ };
+
unless (exists $unique{$_->to_string}) {
push @unique, $_;
$unique{$_->to_string} = 1;
diff --git a/lib/Krawfish/Koral/Meta/Sort/Field.pm b/lib/Krawfish/Koral/Meta/Sort/Field.pm
index d5f23a1..1928cbc 100644
--- a/lib/Krawfish/Koral/Meta/Sort/Field.pm
+++ b/lib/Krawfish/Koral/Meta/Sort/Field.pm
@@ -11,6 +11,10 @@
}, $class;
};
+sub type {
+  return 'field'; # Type identifier of this sort criterion
+};
+
sub field {
return $_[0]->{field};
};
diff --git a/lib/Krawfish/Koral/Meta/Sort/Sample.pm b/lib/Krawfish/Koral/Meta/Sort/Sample.pm
new file mode 100644
index 0000000..e9a2932
--- /dev/null
+++ b/lib/Krawfish/Koral/Meta/Sort/Sample.pm
@@ -0,0 +1,57 @@
+package Krawfish::Koral::Meta::Sort::Sample;
+use Krawfish::Koral::Meta::Node::Sort::Sample;
+use Krawfish::Util::String qw/squote/;
+use strict;
+use warnings;
+
+# Sort criterion of type 'sample'.
+# The size of the sample is stored as top_k.
+
+# Constructor - accepts the sample size (defaults to 0)
+sub new {
+  my ($class, $top_k) = @_;
+  return bless {
+    top_k => $top_k // 0
+  }, $class;
+};
+
+# Get the top_k limitation, or set it to a defined
+# value (the setter returns the object itself)
+sub top_k {
+  my ($self, $value) = @_;
+  return $self->{top_k} unless defined $value;
+  $self->{top_k} = $value;
+  return $self;
+};
+
+# The type of this criterion
+sub type {
+  return 'sample';
+};
+
+# Stringification
+sub to_string {
+  my $self = shift;
+  return "sample=$self->{top_k}";
+};
+
+# Nothing to normalize - returns the object itself
+sub normalize {
+  return shift;
+};
+
+# Nothing to identify - returns the object itself
+sub identify {
+  return shift;
+};
+
+# Wrap the query in a sample node of size top_k
+sub wrap {
+  my $self = shift;
+  my $query = shift;
+  return Krawfish::Koral::Meta::Node::Sort::Sample->new(
+    $query, $self->{top_k}
+  );
+};
+
+1;
diff --git a/lib/Krawfish/Posting/Group/Fields.pm b/lib/Krawfish/Posting/Group/Fields.pm
index 6d734cb..2e6ab70 100644
--- a/lib/Krawfish/Posting/Group/Fields.pm
+++ b/lib/Krawfish/Posting/Group/Fields.pm
@@ -6,6 +6,10 @@
use constant DEBUG => 1;
+# TODO:
+# In addition to the group name
+# create a signature that is universal for each group
+
sub new {
my $class = shift;
bless {
@@ -133,4 +137,21 @@
};
+sub to_koral_query {
+ # TODO: Not implemented yet - should create groups like
+ # {
+ # "@type":"koral:collection",
+ # "groupedBy":"groupedBy:fields", # or "aggregatedBy", "sortedBy"
+ # "labels":[...],
+ # "items":[
+ # {
+ # "@type":"koral:item",
+ # // "signature":"ab47mhjhjgfjuizgtzurzt",
+ # "cols":[...]
+ # }
+ # ]
+ # }
+ ...
+};
+
1;
diff --git a/lib/Krawfish/Result.pm b/lib/Krawfish/Result.pm
index 014e4f8..a5d6375 100644
--- a/lib/Krawfish/Result.pm
+++ b/lib/Krawfish/Result.pm
@@ -9,12 +9,13 @@
sub current_match {
my $self = shift;
- return unless defined $self->{doc_id};
+ my $current = $self->current; # Current posting, as returned by current()
+ return unless $current; # No current posting, so there is no match
return Krawfish::Posting::Match->new(
- doc_id => $self->{doc_id},
- start => $self->{start},
- end => $self->{end},
- payload => $self->{payload}
+ doc_id => $current->doc_id,
+ start => $current->start,
+ end => $current->end,
+ payload => $current->payload
);
};
diff --git a/lib/Krawfish/Result/Limit.pm b/lib/Krawfish/Result/Limit.pm
index 62349f9..a13fdc8 100644
--- a/lib/Krawfish/Result/Limit.pm
+++ b/lib/Krawfish/Result/Limit.pm
@@ -1,4 +1,5 @@
package Krawfish::Result::Limit;
+use parent 'Krawfish::Result';
use Krawfish::Log;
use strict;
use warnings;
@@ -47,12 +48,6 @@
};
# May return a hash reference with information
-sub current_match {
- ...
-};
-
-
-# May return a hash reference with information
sub current_group {
...
};
diff --git a/lib/Krawfish/Result/Segment/Group/Fields.pm b/lib/Krawfish/Result/Segment/Group/Fields.pm
index a83840c..7d44491 100644
--- a/lib/Krawfish/Result/Segment/Group/Fields.pm
+++ b/lib/Krawfish/Result/Segment/Group/Fields.pm
@@ -66,9 +66,6 @@
my $groups = $self->{groups};
my $pointer = $self->{field_pointer};
- # Get container object
- # my $collection = $self->collection;
-
# There is a next match
if ($self->{query}->next) {
@@ -139,9 +136,6 @@
# This adds
$groups->incr_doc(\@patterns);
- # TODO: Add lists
- # $self->{current_group} = $groups->add();
-
# Set last doc to current doc
$self->{last_doc_id} = $current->doc_id;
};
@@ -152,14 +146,7 @@
return 1;
};
- # Release on_finish event
- #unless ($self->{finished}) {
- # foreach (@{$self->{ops}}) {
- # $_->on_finish($collection);
- # };
- # $self->{finished} = 1;
- #};
-
+ # Flush cached results
$groups->flush;
return 0;
@@ -171,136 +158,13 @@
};
+# Get collection
sub collection {
$_[0]->{groups};
};
-sub on_finish {
- my ($self, $collection) = @_;
- $self->{groups}->flush;
- $collection->{fields} = $self->{groups};
-};
-
-
1;
+
+
__END__
-
-
-# Initialize group fetching
-sub _init {
- return if $_[0]->{ranks};
-
- my $self = shift;
-
- print_log('group_fields', 'Get ranks for fields') if DEBUG;
-
- # Get fields object
- my $fields = $self->{index}->fields;
-
- # Lift ranks for each relevant field
- # (may already be liftet for another job ...)
- # and initialize example docs
- my $ranks = ($self->{ranks} = []);
- my $example_docs = ($self->{example_docs} = []);
- my $i = 0;
- my @fields = ();
- foreach my $field (@{$self->{fields}}) {
-
- print_log('group_fields', "Lift the ranks for '$field'") if DEBUG;
-
- # Fetch rank
- if (my $rankings = $fields->ranked_by($field)) {
- push @$ranks, $rankings;
- $self->{example_docs}->[$i] = [];
- push @fields, $field;
- };
-
- $i++;
- };
-
- # In case they were no-ranked fields requested, the field request needs to be rewritten.
- # WARNING: This needs to be notified to the user somehow ...
- $self->{fields} = \@fields;
-};
-
-
-# Get the group signature for each match
-# May well be renamed to "get_signature"
-sub get_group {
- my $self = shift;
- $self->_init;
-
- my $current = shift;
- my $doc_id = $current->doc_id;
-
- # Create a string with all necessary field information
- my @group = ();
- my $i = 0;
- my $example_docs = $self->{example_docs};
-
- # Iterate over all rankings
- foreach my $rankings (@{$self->{ranks}}) {
-
- # Get the rank of the match
- my $rank = $rankings->get($doc_id);
-
- # Store example document to later retrieve surface field
- $example_docs->[$i++]->[$rank] //= $doc_id;
-
- # push rank to signature
- push @group, $rank;
- };
-
- # Create signature string
- return join('___', @group);
-};
-
-
-# return group info as hash
-sub to_hash {
- my ($self, $signature, $doc_freq, $freq) = @_;
-
- # Get field titles
- my $fields = $self->{fields};
- my $fields_obj = $self->{index}->fields;
- my $example_docs = $self->{example_docs};
-
- # Get field values
- my @ranks = split('___', $signature);
-
- # Store frequency information
- my %hash = (
- doc_freq => $doc_freq
- );
- $hash{freq} = $freq if defined $freq;
-
- print_log('group_field', "Create hash for $signature") if DEBUG;
-
- # Iterate over all ranks in the signature
- # - this will be identical to the number of fields requested
- for (my $i = 0; $i < scalar @ranks; $i++) {
-
- # Get rankings
- my $rank = $ranks[$i];
-
- my $doc_id = $example_docs->[$i]->[$rank];
-
- print_log('group_field', "Example doc is $doc_id") if DEBUG;
-
- # Get field title
- my $field_title = $fields_obj->get(
- $example_docs->[$i]->[$rank],
- $fields->[$i]
- );
-
- # Set field title and value
- $hash{$fields->[$i]} = $field_title;
- };
-
- return \%hash;
-};
-
-
-
-1;
diff --git a/lib/Krawfish/Result/Segment/Sort/Sample.pm b/lib/Krawfish/Result/Segment/Sort/Sample.pm
index f4fa065..8bb1a16 100644
--- a/lib/Krawfish/Result/Segment/Sort/Sample.pm
+++ b/lib/Krawfish/Result/Segment/Sort/Sample.pm
@@ -1,3 +1,142 @@
+package Krawfish::Result::Segment::Sort::Sample;
+use Krawfish::Posting::Match;
+use Krawfish::Log;
+use strict;
+use warnings;
+
# https://en.wikipedia.org/wiki/Reservoir_sampling
+# https://webkist.wordpress.com/2008/10/01/reservoir-sampling-in-perl/
+# https://blogs.msdn.microsoft.com/spt/2008/02/05/reservoir-sampling/
# A. Anagnostopoulos, A. Z. Broder, and D. Carmel. Sampling search-engine results. In Proc. of the Fourteenth International World Wide Web Conference, Chiba, Japan, 2005. ACM Press.
+
+# Takes a uniform random sample of up to n matches
+# from a nested query using reservoir sampling.
+
+# WARNING:
+# Sorting does not respect current_match of any nested query, that's why
+# sorting is always separated from enriching!
+
+use constant DEBUG => 1;
+
+# Create a sampler that keeps at most n matches of the query.
+# Expects the nested query and the size of the sample.
+sub new {
+  my $class = shift;
+  bless {
+    query     => shift,
+    n         => shift, # Size of the sample
+    k         => 0,     # Items already seen
+    filled    => 0,     # True once the nested query was consumed
+    reservoir => [],
+    current   => undef
+  }, $class;
+};
+
+# Returns the smaller value of the nested query's
+# max_freq and the size of the sample
+sub max_freq {
+  my $self = shift;
+  my $n = $self->{query}->max_freq;
+  $n = $n < $self->{n} ? $n : $self->{n};
+  return $n;
+};
+
+# Fill the reservoir by consuming the nested query completely.
+# This is done only once, even if the nested query has no matches.
+sub _init {
+  my $self = shift;
+
+  return if $self->{filled};
+  $self->{filled} = 1;
+
+  my $query     = $self->{query};
+  my $reservoir = $self->{reservoir};
+  my $n         = int($self->{n} // 0);
+
+  # All matches need to be seen - checking only a single match
+  # would always return the first match as the whole sample
+  while ($query->next) {
+
+    # Seen next item
+    my $seen = ++$self->{k};
+
+    # The reservoir is not filled up yet
+    if ($seen <= $n) {
+
+      # NOTE(review): assumes current() returns a posting that stays
+      # valid after the nested query moves on - confirm
+      push @$reservoir, $query->current;
+    }
+
+    # Draw a slot from all items seen so far and replace only if the
+    # slot is part of the reservoir (probability n/seen). The reservoir
+    # has exactly n items at this point, so each slot below n exists,
+    # and for n <= 0 nothing is ever added.
+    else {
+      my $slot = int(rand($seen));
+      $reservoir->[$slot] = $query->current if $slot < $n;
+    };
+  };
+
+  return;
+};
+
+# Move to the next match of the sample.
+# Returns 1 if there is a match, otherwise nothing.
+sub next {
+  my $self = shift;
+
+  # Fill reservoir
+  $self->_init;
+
+  # Get match from reservoir
+  my $current = shift @{$self->{reservoir}};
+
+  # There is no more match in reservoir
+  unless ($current) {
+    $self->{current} = undef;
+    return;
+  };
+
+  # Set current match
+  $self->{current} = $current;
+  return 1;
+};
+
+# Get the current posting of the sample - undefined before
+# the first and after the last successful call to next()
+sub current {
+  $_[0]->{current};
+};
+
+# TODO: Not implemented yet
+sub match_from_query {
+  ...
+};
+
+# Create a match object from the current posting,
+# returns nothing if there is no current posting
+sub current_match {
+  my $self = shift;
+  my $current = $self->current or return;
+  my $match = Krawfish::Posting::Match->new(
+    doc_id  => $current->doc_id,
+    start   => $current->start,
+    end     => $current->end,
+    payload => $current->payload,
+  );
+
+  if (DEBUG) {
+    print_log('sort_sample', 'Current match is ' . $match->to_string);
+  };
+
+  return $match;
+};
+
+# Stringification
+sub to_string {
+  'sample(' . $_[0]->{n} . ':' . $_[0]->{query}->to_string . ')';
+};
+
+1;