First attempt to group instead of searching
diff --git a/lib/Krawfish/Index.pm b/lib/Krawfish/Index.pm
index 56cda6e..857f1ed 100644
--- a/lib/Krawfish/Index.pm
+++ b/lib/Krawfish/Index.pm
@@ -9,7 +9,7 @@
use warnings;
use Scalar::Util qw!blessed!;
use Mojo::JSON qw/encode_json decode_json/;
-use Mojo::Util qw/slurp/;
+use Mojo::File;
# TODO: Add LiveDocs-PostingsList, that supports deletion
#
@@ -113,7 +113,7 @@
my $self = shift;
my $doc = shift;
unless (ref $doc) {
- $doc = decode_json slurp $doc;
+ $doc = decode_json(Mojo::File->new($doc)->slurp);
};
# Get new doc_id
diff --git a/lib/Krawfish/Index/Dictionary.pm b/lib/Krawfish/Index/Dictionary.pm
index f663d7a..9764827 100644
--- a/lib/Krawfish/Index/Dictionary.pm
+++ b/lib/Krawfish/Index/Dictionary.pm
@@ -5,6 +5,7 @@
use Krawfish::Index::PostingsList;
# TODO: Use Storable
+# TODO: Support case insensitivity
use constant DEBUG => 0;
diff --git a/lib/Krawfish/Index/Segments.pm b/lib/Krawfish/Index/Segments.pm
index 6a5887c..a418114 100644
--- a/lib/Krawfish/Index/Segments.pm
+++ b/lib/Krawfish/Index/Segments.pm
@@ -10,7 +10,14 @@
# The segments structure may be augmented with a skiplist
# and be a highly optimized position encoding, because character offsets
# should normally have values between 0 and 16.
-
+#
+# It should also contain information about the first two characters
+# of a term and possibly the last two characters, necessary to bucket sort terms.
+# The characters are stored as UTF-8 or similar -
+# it may be beneficial to have the most common characters need the least
+# bits.
+# Note that this information needs to store characters and not
+# bytes, as bytes may not be helpful for sorting!
# Constructor
sub new {
@@ -42,4 +49,18 @@
return $self->{$doc_id . '#' . $segment};
};
+
+# Number of leading characters of a term that are stored
+sub start_char_length {
+  return 2;
+}
+
+# Define, how many end characters will be stored
+sub end_char_length {
+  return 2;
+}
+
+# TODO: A Segment has ->start_offset, ->length, ->first_chars, ->last_chars, ->term_id
+# term_id may either be a term-id or a string
+
1;
diff --git a/lib/Krawfish/Index/TokensList.pm b/lib/Krawfish/Index/TokensList.pm
index 84792cf..a1a5d3a 100644
--- a/lib/Krawfish/Index/TokensList.pm
+++ b/lib/Krawfish/Index/TokensList.pm
@@ -3,6 +3,9 @@
use strict;
use warnings;
+# DEPRECATED!
+# A forward index will be used instead!
+
# This is a forward index for tokens
# This will be used for complex regular expressions,
# grouping of class results
diff --git a/lib/Krawfish/Koral/Query/Sequence.pm b/lib/Krawfish/Koral/Query/Sequence.pm
index e4f3360..8b32866 100644
--- a/lib/Krawfish/Koral/Query/Sequence.pm
+++ b/lib/Krawfish/Koral/Query/Sequence.pm
@@ -4,6 +4,9 @@
use strict;
use warnings;
+# TODO: Optimize if there is an identical subquery
+# in a direct sequence, make this a repetition!!!
+
use constant DEBUG => 0;
sub new {
diff --git a/lib/Krawfish/Posting.pm b/lib/Krawfish/Posting.pm
index 007bfb5..c462a31 100644
--- a/lib/Krawfish/Posting.pm
+++ b/lib/Krawfish/Posting.pm
@@ -35,12 +35,27 @@
};
+# Return the classes stored in the payload of this posting.
+# $nrs selects the class numbers of interest (presumably an array
+# reference - confirm against the callers); without $nrs all
+# classes are meant to be returned.
+# NOTE(review): Not implemented yet - the '...' statement dies with
+# "Unimplemented" as soon as the method is called.
+sub get_classes {
+ my ($self, $nrs) = @_;
+ # Check payload for relevant class and return start, end
+ # If no nrs are given, return all classes
+ ...
+};
+
+
+# Return the classes of this posting ordered by their start position.
+# Takes the same optional $nrs selection as get_classes.
+# NOTE(review): Not implemented yet - the '...' statement dies with
+# "Unimplemented" as soon as the method is called.
+sub get_classes_sorted {
+ my ($self, $nrs) = @_;
+ # The same as get_classes, but ordered by start position
+ ...
+}
+
# This will be overwritten for at least cached buffers
# necessary for sorting
sub offset {
undef;
};
+
sub clone {
my $self = shift;
return __PACKAGE__->new(
diff --git a/lib/Krawfish/Query/Constraint/InDistanceSpan.pm b/lib/Krawfish/Query/Constraint/InDistanceSpan.pm
index 76a0809..60f139f 100644
--- a/lib/Krawfish/Query/Constraint/InDistanceSpan.pm
+++ b/lib/Krawfish/Query/Constraint/InDistanceSpan.pm
@@ -2,6 +2,16 @@
use strict;
use warnings;
+# The first span and the second span need to be inside
+# spans, either in the same span (max=0) or in spans with a distance.
+# There are gaps allowed in the distance.
+
+# Logging helper and debug switch - both are needed by the
+# "print_log(...) if DEBUG" statement in _init(), otherwise the
+# bareword DEBUG does not compile under strict.
+# NOTE(review): print_log is presumably exported by Krawfish::Log,
+# as in the other modules - confirm.
+use Krawfish::Log;
+use constant DEBUG => 0;
+
+# Bit flags that are combined in the return value of check()
+use constant {
+  NEXTA => 1,
+  NEXTB => 2,
+  MATCH => 4,
+};
+
sub new {
my $class = shift;
bless {
@@ -10,31 +20,59 @@
min => shift,
max => shift
}, $class;
-}
+};
+# Advance the span stream once - but only on the very first call
+sub _init {
+  my $self = shift;
+
+  # An earlier call already did the work
+  return if $self->{init}++;
+
+  print_log('c_dist', 'Init distance span') if DEBUG;
+  return $self->{span}->next;
+};
+
+# Check the configuration
+#
+# Arguments are the payload and the two postings ($first, $second)
+# that need to be checked against the span stream in $self->{span}.
+# The visible return values are bitmasks built from the flags
+# NEXTA, NEXTB and MATCH (0b0000 or NEXTA | NEXTB so far).
+# NOTE(review): The method is unfinished - once the span stream is
+# positioned, no match decision is made and nothing is returned
+# explicitly.
sub check {
my $self = shift;
my ($payload, $first, $second) = @_;
- # TODO: init span
+ # Find out ranges
+ my $lower_range = $first->start < $second->start ? $first->start : $second->start;
+ my $upper_range = $first->end > $second->end ? $first->end : $second->end;
+ # my ($start, $end) = $first->start < $second->start ? ($first, $second) : ($second, $first);
- my $span = $self->{span};
+ $self->_init;
- my $buffer = $self->{buffer};
+ my $distance = $self->{span};
+ my $ret_val = 0b0000;
# No current element
- return 0b0000 unless $span->current;
+ return $ret_val unless $distance->current;
# Move span to correct position
- while ($span->current->doc_id < $first->doc_id) {
- $span->next or return NEXTA | NEXTB;
+ while ($distance->current->doc_id < $first->doc_id) {
+ $distance->next or return NEXTA | NEXTB;
};
- my $current = $span->current or return 0b0000;
+ # There is no correct position ...
+ if ($distance->current->doc_id > $first->doc_id) {
+ return NEXTA | NEXTB;
+ };
- my ($start, $end) = $first->start < $second->start ? ($first, $second) : ($second, $first);
-
- if ($first->end > $current->end)
+ my $distance_current = $distance->current;
+ # Doc ID is at the correct position
+ # my $buffer = $self->{buffer};
+
+ # my $current = $span->current or return 0b0000;
+
+ # Forward, until the spans end overlaps the lower range
+ # NOTE(review): Unlike the loop above, the result of next() is not
+ # checked here and the span may move on to another document - confirm
+ # how an exhausted span stream should be handled.
+ while ($distance->current->end < $lower_range) {
+ $distance->next;
+ };
+
+
+ # Distance is quite complicated imagine a situation like this:
+ # <1> ... <2> ... [a] ... </2> ... <3> ... <4> ... [b] ... </4></3></1>
+ # ???
+ #
+ # if ($first->end > $current->end)
};
1;
diff --git a/lib/Krawfish/Result/Aggregate/Content.pm b/lib/Krawfish/Result/Aggregate/Content.pm
new file mode 100644
index 0000000..16a5cec
--- /dev/null
+++ b/lib/Krawfish/Result/Aggregate/Content.pm
@@ -0,0 +1,2 @@
+# Aggregate by content information, for example,
+# based on a certain class
diff --git a/lib/Krawfish/Result/Group.pm b/lib/Krawfish/Result/Group.pm
new file mode 100644
index 0000000..cafd35e
--- /dev/null
+++ b/lib/Krawfish/Result/Group.pm
@@ -0,0 +1,109 @@
+package Krawfish::Result::Group;
+use Krawfish::Log;
+use strict;
+use warnings;
+
+use constant DEBUG => 0;
+
+# Group snippets based on certain criteria, for example:
+# metadata!
+# - this is an extension to facets, where snippets are grouped
+# based on a certain facet.
+# - having facets in a first step may improve the distributed aggregation
+# (as the central node then knows, which facets are most or least common)
+# - this grouping doesn't seem beneficial - as the facet view already helps here
+#
+# innertextual!
+# - has a certain identical class on surface
+# - has the same starting characters of a word
+# - has the same ending characters of a word
+# - has the same POS of a certain class (this is actually pretty hard!)
+# - this may mean to modify the search a bit to lift the posting types
+# and make a class, like [orth=der & base/p=*]
+# - At least the postingslist of base/p=* should be merged in parallel!
+#
+# This is already possible in C2 so it needs to be implemented!
+
+# A group has the following structure:
+# {
+# criterion => [freq, doc_freq]
+# }
+# Where criterion may be a sequence of criteria
+# with class information, like
+# 1:der|2:Baum => []
+
+# Construct grouping function
+#
+# Expects the query, the group criterion object and an optional
+# list of classes.
+sub new {
+  my ($class, $query, $criterion, @classes) = @_;
+
+  my %self = (
+    query => $query,
+
+    # This is a group criterion object, created outside, that defines the criterion
+    criterion => $criterion,
+    classes => [@classes],
+
+    # Group to fill with matches and group info
+    # (as class1=>X, class2=>Y)
+    groups => {}
+  );
+
+  return bless \%self, $class;
+};
+
+# Go through all matches of the query and count them per group.
+# This could, nonetheless, be implemented like Facets ...
+#
+# Returns a hash reference of the form
+#   { group_name => [freq, doc_freq] }
+# where the group name is the value the criterion's get_group()
+# returns for a match.
+sub _init {
+  my $self = shift;
+  my $criterion = $self->{criterion};
+  my $query = $self->{query};
+
+  my %groups = ();
+
+  # Last document a group was seen in. The document frequency
+  # has to be tracked per group - a single counter for all groups
+  # would miss every further group occurring in the same document.
+  my %last_doc_of = ();
+
+  while ($query->next) {
+
+    my $current = $query->current or last;
+
+    my $name = $criterion->get_group($current);
+
+    # Potentially create new group
+    my $group = ($groups{$name} //= [0,0]);
+
+    # Increment freq
+    $group->[0]++;
+
+    # Increment doc_freq the first time the group occurs in a document
+    # NOTE(review): assumes that matches arrive ordered by doc_id - confirm
+    my $doc_id = $current->doc_id;
+    if (!defined $last_doc_of{$name} || $last_doc_of{$name} != $doc_id) {
+      $group->[1]++;
+      $last_doc_of{$name} = $doc_id;
+    };
+  };
+
+  return \%groups;
+};
+
+# Return the groups of the criterion.
+# NOTE(review): The criterion was referenced through an undeclared
+# variable, which does not compile under strict; it is now taken from
+# the object. This presumably requires the criterion class to provide
+# a groups() method - confirm, or return the result of _init() instead.
+sub next {
+  my $self = shift;
+  my $criterion = $self->{criterion};
+
+  return $criterion->groups;
+};
+
+
+# Hand over the current match of the wrapped query
+sub current {
+  my $self = shift;
+  return $self->{query}->current;
+}
+
+
+# May return a hash reference with information
+sub current_group;
+
+
+# Serialize the operation as "collectGroups(<criterion>:<query>)"
+sub to_string {
+  my $self = shift;
+  return 'collectGroups('
+    . $self->{criterion}->to_string . ':'
+    . $self->{query}->to_string
+    . ')';
+};
+
+1;
diff --git a/lib/Krawfish/Result/Group/Characters.pm b/lib/Krawfish/Result/Group/Characters.pm
new file mode 100644
index 0000000..9dc5d83
--- /dev/null
+++ b/lib/Krawfish/Result/Group/Characters.pm
@@ -0,0 +1,57 @@
+package Krawfish::Result::Group::Characters;
+use Krawfish::Log;
+use strict;
+use warnings;
+
+use constant DEBUG => 0;
+
+# Construct a criterion that groups matches by the characters
+# at the start or at the end of certain classes.
+sub new {
+  my $class = shift;
+  bless {
+    segments   => shift, # Krawfish::Index::Segments object
+    from_start => shift, # boolean - otherwise from end
+    char_count => shift, # number of characters to group by
+    nrs        => [@_]   # class numbers to take into account
+  }, $class;
+};
+
+
+# Return a string reflecting the group name of the match, built from
+# the start characters of each requested class ("nr:chars;" per class,
+# ordered by class number).
+#
+# NOTE(review): This method was committed unfinished and did not
+# compile (dangling condition on the segment, unbalanced parenthesis,
+# substr() without offset, undeclared $stream). The version below keeps
+# the visible intent only - grouping from the end is still
+# unimplemented and the string format is an assumption, confirm both.
+sub get_group {
+  my ($self, $match) = @_;
+
+  # Get all classes from the match
+  my @classes = $match->get_classes($self->{nrs});
+
+  my $segments = $self->{segments};
+
+  my %group;
+
+  # Classes have nr, start, end
+  foreach my $class (sort { $a->start <=> $b->start } @classes) {
+
+    if ($self->{from_start}) {
+
+      # This will retrieve the segment from the segments stream
+      my $segment = $segments->get($match->doc_id, $class->start);
+
+      # TODO: The original had an unfinished check on the segment here
+
+      # Cut the stored start characters down to the requested count -
+      # substr() returns the whole string, if it is shorter.
+      # TODO: If char_count exceeds the number of stored characters,
+      #   the term itself needs to be consulted.
+      $group{$class->nr} = substr($segment->first_chars, 0, $self->{char_count});
+
+      # TODO: Check, if the class only spans one segment
+      #   ($class->end != $class->start + 1)
+    }
+    else {
+      ...
+    };
+  };
+
+  # Serialize ordered by class number
+  return join(
+    '',
+    map { $_ . ':' . $group{$_} . ';' } sort { $a <=> $b } keys %group
+  );
+};
+
+1;
diff --git a/lib/Krawfish/Result/Group/Classes.pm b/lib/Krawfish/Result/Group/Classes.pm
new file mode 100644
index 0000000..a974758
--- /dev/null
+++ b/lib/Krawfish/Result/Group/Classes.pm
@@ -0,0 +1,69 @@
+package Krawfish::Result::Group::Classes;
+use Krawfish::Log;
+use strict;
+use warnings;
+
+# TODO:
+# The name is somehow misleading, as this will only
+# group by surface terms.
+
+use constant DEBUG => 0;
+
+# Construct the criterion from a segments object and a list of
+# class numbers
+sub new {
+  my ($class, $segments, @nrs) = @_;
+
+  return bless {
+    segments => $segments, # Krawfish::Index::Segments object
+    nrs      => \@nrs,
+    groups   => {}
+  }, $class;
+};
+
+
+# This will return a string, reflecting the group name of the match.
+# The string has the form "nr:term|term;nr:term;" - one entry per
+# class, ordered by class number, with the term ids of all segments
+# of the class joined by "|".
+sub get_group {
+  my ($self, $match) = @_;
+
+  my $segments = $self->{segments};
+
+  # Get all classes from the match
+  # Classes need to be sorted by start position
+  # to be retrievable, in case the Segments-Stream
+  # is implemented as a postingslist (probably not)
+  my @classes = $match->get_classes_sorted($self->{nrs});
+
+  my %class_group;
+
+  # Classes have nr, start, end
+  foreach my $class (@classes) {
+
+    # WARNING! CLASSES MAY OVERLAP SO SEGMENTS SHOULD BE CACHED OR BUFFERED!
+
+    # Get start position
+    my $start = $class->start;
+
+    # Receive first segment and remember its term id
+    my @seq = ($segments->get($match->doc_id, $start)->term_id);
+
+    # Receive all following segments of the class.
+    # The position has to be incremented before fetching, otherwise
+    # the first segment is fetched twice and the last one never.
+    # NOTE(review): assumes that the end position is exclusive - confirm
+    while (++$start < $class->end) {
+      push(@seq, $segments->get($match->doc_id, $start)->term_id);
+    };
+
+    $class_group{$class->nr} = join('|', @seq);
+  };
+
+  # Serialize ordered by class number
+  my $string = '';
+  foreach (sort {$a <=> $b} keys %class_group) {
+    $string .= $_ . ':' . $class_group{$_} . ';';
+  };
+
+  return $string;
+};
+
+
+1;
diff --git a/lib/Krawfish/Result/Sort/Alphabet.pm b/lib/Krawfish/Result/Sort/Alphabet.pm
new file mode 100644
index 0000000..cc336f2
--- /dev/null
+++ b/lib/Krawfish/Result/Sort/Alphabet.pm
@@ -0,0 +1,5 @@
+# Sort by characters of a certain segment (either the first or the last).
+# This will require to open the offset file to get the first two characters
+# for bucket sorting per token and then request the
+# forward index (the offset is already lifted and may be stored in the buckets
+# as well) for fine grained sorting!