Added and-group and bitstream template
diff --git a/lib/Krawfish/Index.pm b/lib/Krawfish/Index.pm
index 5bfc0d5..541b148 100644
--- a/lib/Krawfish/Index.pm
+++ b/lib/Krawfish/Index.pm
@@ -282,16 +282,96 @@
__END__
-sub apply {
- my $self = shift;
- $self->{index} = shift;
+
+
+# Search the index with the query, corpus and meta data of a Koral object.
+# Returns the Koral object; the callback $cb is accepted but not used yet.
+sub search {
+  my ($self, $koral, $cb) = @_;
+
+  my $query  = $koral->query;
+  my $corpus = $koral->corpus;
+  my $meta   = $koral->meta;
+
+  # Results
+  my $result = $koral->result;
+
+  my $search = $query->filter_by($corpus)->plan_for($self);
+
+  # Augment with facets
+  if ($meta->facets) {
+    $search = $meta->facets($search);
+  };
+
+  # Augment with sorting
+  if ($meta->sort) {
+    $search = $meta->sort($search);
+  };
+
+  my $count = 0;
+  while ($search->next) {
+    my $posting = $search->current;
+
+    # Populate the match from the posting; the index to consult is $self
+    $result->add_match($posting, $self);
+
+    # Stop once $meta->count matches are collected (not one more)
+    last if ++$count >= $meta->count;
+  };
+
+  # Total result count may already be available after sorting
+  # Otherwise count
+  if (!$meta->total_results && !$meta->cutoff) {
+    $count++ while $search->next;
+    $meta->total_results($count);
+  };
+
+  return $koral;
+};
+
+sub get_fields {
+ my ($self, $doc_id, $fields) = @_; # Stub: only unpacks its arguments, nothing is looked up yet
+};
+
+# Intended to return the posting's start and end position when embedded
+# in a span, e.g. <base/s=s>. Stub: so far it only unpacks its arguments.
+sub get_context_by_query {
+  my ($self, $posting, $query) = @_;
+};
+
+# Collect, for every term returned by $dict->terms($terms), the entries
+# ending no later than $posting. Returns a hash ref: term => array ref of clones.
+sub get_annotations {
+  my ($self, $posting, $terms) = @_;
+  my %anno = ();
+  my $dict = $self->dict;
+  foreach my $term ($dict->terms($terms)) {
+    my $term_list = $dict->get($term);
+
+    # Skip to the correct document and the first position
+    if ($term_list->next($posting->doc_id, $posting->start)) {
+      # Init annotation
+      my $anno = ($anno{$term} //= []);
+
+      # Iterate over all annotations
+      while ($term_list->current->end <= $posting->end) {
+        # Remember the annotations
+        push @$anno, $term_list->current->clone;
+
+        # Leave the loop (not only this iteration) once exhausted
+        $term_list->next or last;
+      }
+    }
+
+    # Close (and forget) termlist, also when nothing was found
+    $term_list->close;
+  };
+
+  return \%anno;
};
-sub filter_by {
- my $self = shift;
- $self->{filter} = shift;
-};
+
+
sub items_per_page;
diff --git a/lib/Krawfish/Index/BitStream.pm b/lib/Krawfish/Index/BitStream.pm
new file mode 100644
index 0000000..e2cb5d7
--- /dev/null
+++ b/lib/Krawfish/Index/BitStream.pm
@@ -0,0 +1,102 @@
+package Krawfish::Index::BitStream;
+use strict;
+use warnings;
+
+
+# TODO:
+#   Vint should be as simple as possible
+# TODO:
+#   BitStream should support multiple pointers,
+#   and the stream should be closed once no pointer
+#   points to it any longer
+# TODO:
+#   BitStream may be loaded from a file and may
+#   load further elements once it exceeds the boundaries
+#   of the current element
+
+sub new {
+  my ($class) = @_;
+  my %self = (
+    stream => [],  # May contain multiple elements
+    finger => [],  # Finger registry, to know when the bitstream can be closed
+    start  => 0,   # File offset of the bitstream
+    length => 0,   # Length of the segment in the file
+  );
+  return bless \%self, $class;
+};
+
+# To be overridden:
+# describes the compression scheme
+sub schema;
+
+sub current {
+  # Return the data at the current position,
+  # with all values converted
+  # according to the schema
+  ...
+};
+
+# Get the next item, based on the current schema
+# Skip entries are ignored
+sub next {
+  my ($self, $offset) = @_;
+  ...
+};
+
+sub next_pos;
+
+sub next_doc;
+
+
+# Skip to or beyond a certain doc id and to or before a certain position
+sub skip_to {
+  my ($self, $offset, $doc_id, $pos) = @_;
+  # The offset comes from the finger position in the byte stream
+  # (placeholder body, nothing is implemented yet)
+  ...
+};
+
+# Add bytes at the end of the stream
+sub add_bytes {
+  my $self = shift;
+  ...
+};
+
+
+# Set bytes at a certain byte offset in the stream
+# This is necessary to augment the stream with skip entries
+sub set_bytes {
+  my ($self, $offset, $length) = @_;
+  ...
+};
+
+
+########################
+# Conversion functions #
+########################
+
+# Encode variable integer
+sub enc_vint {
+  ...
+};
+
+
+# Decode variable integer
+sub dec_vint {
+  ...
+};
+
+# Encode simple 16
+sub enc_simple_16 {
+  ...
+};
+
+# Decode simple 16
+sub dec_simple_16 {
+  ...
+};
+
+1;
+
+
+__END__
diff --git a/lib/Krawfish/Index/Dictionary.pm b/lib/Krawfish/Index/Dictionary.pm
index 3ecc8ef..45e4e6a 100644
--- a/lib/Krawfish/Index/Dictionary.pm
+++ b/lib/Krawfish/Index/Dictionary.pm
@@ -37,6 +37,7 @@
};
# Return terms of the term dictionary
+# TODO: This should return an iterator
sub terms {
my ($self, $re) = @_;
diff --git a/lib/Krawfish/Index/PostingPointer.pm b/lib/Krawfish/Index/PostingPointer.pm
index 4526028..ac210c9 100644
--- a/lib/Krawfish/Index/PostingPointer.pm
+++ b/lib/Krawfish/Index/PostingPointer.pm
@@ -6,7 +6,7 @@
# Points to a position in a postings list
# TODO: Return different posting types
-# Using current
+# Using current
sub new {
my $class = shift;
@@ -23,7 +23,7 @@
};
sub term {
- $_[0]->{list}->{term};
+ $_[0]->{list}->term;
};
sub next {
@@ -45,6 +45,10 @@
$_[0]->{list};
};
+sub close {
+ ... # Not implemented yet: the ellipsis statement dies with "Unimplemented" when called
+};
+
# sub skip_doc_to;
# sub skip_pos_to;
diff --git a/lib/Krawfish/Index/PostingsLive.pm b/lib/Krawfish/Index/PostingsLive.pm
new file mode 100644
index 0000000..fefbda3
--- /dev/null
+++ b/lib/Krawfish/Index/PostingsLive.pm
@@ -0,0 +1,50 @@
+package Krawfish::Index::PostingsLive;
+use strict;
+use warnings;
+
+# TODO: Has a "delete" method and otherwise works identically to PostingsList and PostingPointer
+
+sub new {
+  my ($class, $index_file, $max) = @_;
+  bless {
+    index_file => $index_file,
+    deletes    => [],
+    pointers   => [],
+    max        => $max # Maximum number of documents
+  }, $class;
+};
+# Register a deleted entry (presumably a doc id); the list is kept in numerical order
+sub delete {
+  my $self = shift;
+  push @{$self->{deletes}}, shift;
+  $self->{deletes} = [sort { $a <=> $b } @{$self->{deletes}}];
+};
+# Number of live entries: the maximum minus the registered deletes
+sub freq {
+  $_[0]->{max} - scalar @{$_[0]->{deletes}};
+};
+# NOTE(review): Krawfish::Index::PostingLivePointer is not loaded in this file - confirm name and loading
+sub pointer {
+  Krawfish::Index::PostingLivePointer->new($_[0]);
+};
+
+sub to_string {
+  '...';
+};
+
+
+# Pointer actions
+sub next {
+  my $self = shift;
+  my $pos = $self->{pos}++;
+  # Consult the deletes only if some were registered (the array ref itself is always true)
+  if ($pos + 1 < $self->freq) {
+    if (@{$self->{deletes}}) {
+      ...
+    }
+  };
+
+  return;
+};
+
+1;
diff --git a/lib/Krawfish/Koral.pm b/lib/Krawfish/Koral.pm
index 7f74f51..ce23965 100644
--- a/lib/Krawfish/Koral.pm
+++ b/lib/Krawfish/Koral.pm
@@ -91,27 +91,3 @@
__END__
-sub search {
- my $self = shift;
- my $callback = shift;
- my $token = Krawfish::Query::Token->new(
- $self->{index},
- $term
- );
-
- # Filter the results
- if ($self->filter_by) {
-
- # Filter the result
- $token->filter_by($self->filter_by);
- };
-
- # Apply Sorting here
-
- # Iterate over all matches
- while ($self->next) {
-
- # Call callback with match
- $callback->($self->current) or return;
- };
-};
diff --git a/lib/Krawfish/Koral/Corpus/FieldGroup.pm b/lib/Krawfish/Koral/Corpus/FieldGroup.pm
index a971d92..e527c58 100644
--- a/lib/Krawfish/Koral/Corpus/FieldGroup.pm
+++ b/lib/Krawfish/Koral/Corpus/FieldGroup.pm
@@ -2,6 +2,7 @@
use parent 'Krawfish::Koral::Corpus';
use Krawfish::Log;
use Krawfish::Corpus::Or;
+use Krawfish::Corpus::And;
use strict;
use warnings;
@@ -63,7 +64,19 @@
}
elsif ($self->operation eq 'and') {
- ...
+
+ print_log('kq_fgroup', 'Prepare and-group') if DEBUG;
+
+ # Filter out all terms that do not occur
+ for (; $i < @$ops; $i++) {
+ my $option = $ops->[$i]->plan_for($index);
+ if ($option->freq != 0) {
+ $query = Krawfish::Corpus::And->new(
+ $query,
+ $option
+ )
+ };
+ };
};
if ($query->freq == 0) {
diff --git a/lib/Krawfish/Koral/Query.pm b/lib/Krawfish/Koral/Query.pm
index b9b0721..0a783d4 100644
--- a/lib/Krawfish/Koral/Query.pm
+++ b/lib/Krawfish/Koral/Query.pm
@@ -53,6 +53,7 @@
};
# Plan a query for an index (to be overwritten)
+# TODO: Rename to_primitive(index)
sub plan_for;
sub is_any { $_[0]->{any} // 0 };
diff --git a/lib/Krawfish/Koral/Result.pm b/lib/Krawfish/Koral/Result.pm
new file mode 100644
index 0000000..4ce4857
--- /dev/null
+++ b/lib/Krawfish/Koral/Result.pm
@@ -0,0 +1,43 @@
+package Krawfish::Koral::Result;
+use strict;
+use warnings;
+
+1;
+
+__END__
+
+sub add_match {
+  my ($self, $posting, $index) = @_;
+  # Draft (below __END__, not compiled): builds a match for $posting using $index
+  my $match = Krawfish::Koral::Result::Match->new($posting);
+
+  my $meta = $self->meta;
+  if ($meta->fields) {
+    $match->fields(
+      $index->get_fields($posting->doc_id, $meta->fields)
+    );
+  };
+
+  # Expand match to, e.g., <base/s=s>
+  if ($meta->expansion) {
+    my ($start, $end) = $index->get_context_by_query(
+      $posting,
+      $meta->expansion
+    );
+  };
+
+  # Expand context to, e.g., <base/s=p>
+  if ($meta->context) {
+    my ($start, $end) = $index->get_context_by_query($posting, $meta->context);
+  };
+  # NOTE(review): the expanded positions above are not used yet
+  if ($meta->snippet) {
+    $self->get_snippet(
+      posting => $posting,
+      highlights => $meta->highlights,
+      snippet_context => $meta->context,
+      match_context => $meta->expansion,
+      annotations => $match->annotations
+    );
+  };
+};
diff --git a/lib/Krawfish/Query.pm b/lib/Krawfish/Query.pm
index fb62891..0e7e6b9 100644
--- a/lib/Krawfish/Query.pm
+++ b/lib/Krawfish/Query.pm
@@ -15,10 +15,14 @@
};
# Overwrite
+# TODO: Accepts a target doc
+# TODO: Returns the doc_id of the current posting
sub next;
# Forward to next start position
-sub next_pos;
+sub next_greater_start;
+
+
sub skip_doc {
my $self = shift;
@@ -29,6 +33,14 @@
return $self->{doc_id};
};
+# In Lucene, this is exemplified as follows:
+# int advance(int target) {
+# int doc;
+# while ((doc = nextDoc()) < target) {
+# }
+# return doc;
+# }
+
sub freq {
-1;
};
diff --git a/lib/Krawfish/Query/Base/Dual.pm b/lib/Krawfish/Query/Base/Dual.pm
index 8b7cfbf..59752cc 100644
--- a/lib/Krawfish/Query/Base/Dual.pm
+++ b/lib/Krawfish/Query/Base/Dual.pm
@@ -17,6 +17,12 @@
@EXPORT = qw/NEXTA NEXTB MATCH/;
+# TODO: In addition to NEXTA and NEXTB, there should be flags for:
+# NEXTX to STARTY (Position skipping)
+# NEXTX to ENDY (Position skipping)
+# NEXTX to ENDX (Position skipping)
+# NEXTX to STARTX+1 (Position skipping)
+
# TODO: Improve by skipping to the same document
sub new {
diff --git a/t/corpus/and.t b/t/corpus/and.t
new file mode 100644
index 0000000..c5ac26a
--- /dev/null
+++ b/t/corpus/and.t
@@ -0,0 +1,57 @@
+use Test::More;
+use Test::Krawfish;
+use strict;
+use warnings;
+
+use_ok('Krawfish::Koral::Corpus::Builder');
+use_ok('Krawfish::Index');
+
+my $index = Krawfish::Index->new;
+ok_index($index, {id => 2, author => 'Peter', age => 4} => [qw/aa bb/], 'Add complex document');
+ok_index($index, {id => 3, author => 'Peter', age => 3} => [qw/aa bb/], 'Add complex document');
+ok_index($index, {id => 5, author => 'Peter', age => 4} => [qw/aa bb/], 'Add complex document');
+
+ok(my $cb = Krawfish::Koral::Corpus::Builder->new, 'Create CorpusBuilder');
+
+ok(my $query = $cb->field_and(
+ $cb->string('author')->eq('Peter'),
+ $cb->string('age')->eq('4')
+), 'Create corpus query');
+
+is($query->to_string, 'author=Peter&age=4', 'Stringification');
+
+ok(my $plan = $query->plan_for($index), 'Planning');
+
+is($plan->to_string, "and('author:Peter','age:4')", 'Stringification');
+
+ok($plan->next, 'Init vc'); # ids 2 and 5 (age 4) were indexed first and third
+is($plan->current->to_string, '[0]', 'First doc');
+ok($plan->next, 'More next');
+is($plan->current->to_string, '[2]', 'Second doc');
+ok(!$plan->next, 'No more next');
+
+
+# Complex virtual corpus
+ok($query = $cb->field_or(
+ $cb->field_and(
+ $cb->string('author')->eq('Peter'),
+ $cb->string('age')->eq(3)
+ ),
+ $cb->string('id')->eq(2)
+), 'Create corpus query');
+
+is($query->to_string, '(author=Peter&age=3)|id=2', 'Stringification');
+
+ok($plan = $query->plan_for($index), 'Planning');
+
+is($plan->to_string, "or(and('author:Peter','age:3'),'id:2')", 'Stringification');
+
+ok($plan->next, 'Init vc'); # id 2 was indexed first, id 3 (age 3) second
+is($plan->current->to_string, '[0]', 'First doc');
+ok($plan->next, 'More next');
+is($plan->current->to_string, '[1]', 'Second doc');
+ok(!$plan->next, 'No more next');
+
+
+done_testing;
+__END__
diff --git a/t/corpus/or.t b/t/corpus/or.t
index 87e4043..efbe871 100644
--- a/t/corpus/or.t
+++ b/t/corpus/or.t
@@ -30,6 +30,24 @@
is($plan->current->to_string, '[1]', 'First doc');
ok(!$plan->next, 'No more next doc');
+
+ok_index($index, {id => 7} => [qw/aa bb/], 'Add complex document');
+ok_index($index, {id => 9} => [qw/aa bb/], 'Add complex document');
+
+ok($query = $cb->field_or(
+ $cb->string('id')->eq('3'),
+ $cb->string('id')->eq('2'),
+ $cb->string('id')->eq('9')
+), 'Create corpus query');
+
+is($query->to_string, 'id=3|id=2|id=9', 'Stringification');
+
+ok($plan = $query->plan_for($index), 'Planning');
+
+is($plan->to_string, "or(or('id:3','id:2'),'id:9')", 'Stringification');
+
+matches($plan, [qw/[0] [1] [4]/], 'Matches');
+
diag 'Test further';
done_testing;