Rename Meta to Compile
Change-Id: Ief55855e2642a64672a840ece9d9fd876a49d373
diff --git a/lib/Krawfish/Compile/Segment/Aggregate.pm b/lib/Krawfish/Compile/Segment/Aggregate.pm
new file mode 100644
index 0000000..0ad04fc
--- /dev/null
+++ b/lib/Krawfish/Compile/Segment/Aggregate.pm
@@ -0,0 +1,136 @@
+package Krawfish::Compile::Segment::Aggregate;
+use parent 'Krawfish::Compile';
+use strict;
+use warnings;
+
+use constant DEBUG => 0;
+
+# TODO:
+# It may be better to have one aggregator per match
+# and one aggregator per doc.
+
+# Aggregate values of matches per document and
+# per match.
+
+# TODO:
+# See https://www.elastic.co/guide/en/elasticsearch/reference/current/search-aggregations.html
+
+# TODO: Sort all ops for each_match and each_doc support
+sub new {
+ my $class = shift;
+ my $result = {};
+ my $self = bless {
+ query => shift,
+ ops => shift,
+
+ last_doc_id => -1,
+ collection => $result,
+ last_doc_id => -1,
+ finished => 0
+ }, $class;
+
+ # The aggregation needs to trigger on each match
+ my (@each_doc, @each_match);
+ foreach my $op (@{$self->{ops}}) {
+ if ($op->can('each_match')) {
+ push @each_match, $op;
+ };
+
+ # The aggregation needs to trigger on each doc
+ if ($op->can('each_doc')) {
+ push @each_doc, $op;
+ };
+ };
+
+ $self->{each_doc} = \@each_doc;
+ $self->{each_match} = \@each_match;
+
+ return $self;
+};
+
+
+sub collection {
+ $_[0]->{collection};
+};
+
+
+# TODO:
+# Add collection data to result document
+sub collect {
+ my ($self, $result) = @_;
+
+ # Add collect
+ $result->add_collection($self->collection);
+
+ # Collect result from nested query
+ $self->{query}->collect($result);
+ return $result;
+};
+
+
+# Iterate to the next result
+sub next {
+ my $self = shift;
+
+ # Get container object
+ my $collection = $self->collection;
+
+ # There is a next match
+ # TODO:
+ # If there is no operand per match, only use next_doc
+ if ($self->{query}->next) {
+
+ # Get the current posting
+ my $current = $self->{query}->current;
+
+ if ($current->doc_id != $self->{last_doc_id}) {
+
+ # Collect data of current operation
+ foreach (@{$self->{each_doc}}) {
+ $_->each_doc($current, $collection);
+ };
+
+ # Set last doc to current doc
+ $self->{last_doc_id} = $current->doc_id;
+ };
+
+ # Collect data of current operation
+ foreach (@{$self->{each_match}}) {
+ $_->each_match($current, $collection);
+ };
+
+ return 1;
+ };
+
+ # Release on_finish event
+ unless ($self->{finished}) {
+ foreach (@{$self->{ops}}) {
+ $_->on_finish($collection);
+ };
+ $self->{finished} = 1;
+ };
+
+ return 0;
+};
+
+
+sub current {
+ return $_[0]->{query}->current;
+};
+
+
+sub to_string {
+ my $self = shift;
+ my $str = 'aggr(';
+ $str .= '[' . join(',', map { $_->to_string } @{$self->{ops}}) . ']:';
+ $str .= $self->{query}->to_string;
+ return $str . ')';
+};
+
+# Shorthand for "search through"
+sub finalize {
+ while ($_[0]->next) {};
+ return $_[0];
+};
+
+1;
diff --git a/lib/Krawfish/Compile/Segment/Aggregate/Base.pm b/lib/Krawfish/Compile/Segment/Aggregate/Base.pm
new file mode 100644
index 0000000..efc4278
--- /dev/null
+++ b/lib/Krawfish/Compile/Segment/Aggregate/Base.pm
@@ -0,0 +1,32 @@
+package Krawfish::Compile::Segment::Aggregate::Base;
+use strict;
+use warnings;
+
+# Does not need anything in the object
+sub new {
+ my $class = shift;
+ bless \(my $self = ''), $class;
+};
+
+# Per default do nothing
+sub each_doc {
+};
+
+# Per default do nothing
+sub each_match {
+};
+
+# Per default do nothing
+sub on_finish {
+};
+
+# Not implemented on base
+sub to_string {
+ ...
+};
+
+sub collection {
+ ...
+};
+
+1;
diff --git a/lib/Krawfish/Compile/Segment/Aggregate/Fields.pm b/lib/Krawfish/Compile/Segment/Aggregate/Fields.pm
new file mode 100644
index 0000000..2887f70
--- /dev/null
+++ b/lib/Krawfish/Compile/Segment/Aggregate/Fields.pm
@@ -0,0 +1,158 @@
+package Krawfish::Compile::Segment::Aggregate::Fields;
+use parent 'Krawfish::Compile::Segment::Aggregate::Base';
+use Krawfish::Koral::Result::Aggregate::Fields;
+use Krawfish::Util::String qw/squote/;
+use Krawfish::Log;
+use strict;
+use warnings;
+
+use constant DEBUG => 0;
+
+
+# TODO:
+# Simplify the counting by mapping the requested fields to
+# an array, that points to a map.
+
+# TODO:
+# Look for fast int => int hash maps
+# http://java-performance.info/implementing-world-fastest-java-int-to-int-hash-map/
+# http://eternallyconfuzzled.com/tuts/algorithms/jsw_tut_hashing.aspx
+# https://gist.github.com/badboy/6267743
+
+# TODO:
+# Support corpus classes!
+
+# TODO:
+# In case the field has no rank, because it is a multivalued field,
+# a different mechanism has to be used!
+
+# TODO: It may be beneficial to store example documents in the
+# field ranks, too - so they don't need to be collected on the way ...
+# See Group::Fields as well.
+# For this, add a "witness" field
+
+# TODO:
+# Field aggregates should be sortable either <asc> or <desc>,
+# and should have a count limitation, may be even a start_index and an items_per_page
+
+
+sub new {
+ my $class = shift;
+ bless {
+ field_obj => shift,
+ field_keys => [map { ref($_) ? $_->term_id : $_ } @{shift()}],
+
+ # TODO:
+ # This needs to be an object, so it can be inflated again!
+ # collection => {}, # The buckets in memory
+
+ aggregation => Krawfish::Koral::Result::Aggregate::Fields->new,
+
+ freq => 0,
+ field_freqs => {}
+ }, $class;
+};
+
+
+# Initialize field pointer
+sub _init {
+ return if $_[0]->{field_pointer};
+
+ my $self = shift;
+
+ print_log('aggr_fields', 'Create pointer on fields') if DEBUG;
+
+ # Load the ranked list - may be too large for memory!
+ $self->{field_pointer} = $self->{field_obj}->pointer;
+};
+
+
+# On every doc
+sub each_doc {
+ my ($self, $current) = @_;
+
+ $self->_init;
+
+ print_log('aggr_fields', 'Aggregate on fields') if DEBUG;
+
+ my $doc_id = $current->doc_id;
+
+ my $pointer = $self->{field_pointer};
+
+ # Set match frequencies to all remembered doc frequencies
+ my $aggr = $self->{aggregation};
+
+ # Skip to document in question
+ # TODO:
+ # skip_doc should ALWAYS return either the document or NOMOREDOC!
+
+ if ($pointer->skip_doc($doc_id) == $doc_id) {
+
+ $aggr->flush;
+
+ # my $coll = $self->{collection};
+
+ # Get all requested fields
+ # my @fields;
+
+ if (DEBUG) {
+ print_log('aggr_fields', 'Look for frequencies for key ids ' .
+ join(', ', map { '#' . $_ } @{$self->{field_keys}}) . " in doc $doc_id");
+ };
+
+ # Iterate over all fields
+ foreach my $field ($pointer->fields(@{$self->{field_keys}})) {
+
+ # This should probably be a method in the fields pointer!
+ next if $field->type eq 'store';
+
+ # Increment occurrence
+ $aggr->incr_doc($field->key_id, $field->term_id);
+
+ if (DEBUG) {
+ print_log('aggr_fields', '#' . $field->term_id . ' has frequencies');
+ };
+ };
+ }
+
+ # Do not check rank
+ else {
+ $aggr->flush;
+ };
+};
+
+
+# On every match
+sub each_match {
+ $_[0]->{aggregation}->incr_match;
+};
+
+
+# finish the results
+sub on_finish {
+ my ($self, $collection) = @_;
+
+ $self->{aggregation}->flush;
+
+ $collection->{fields} = $self->{aggregation};
+};
+
+
+sub collection {
+ # Return fields
+ # Example structure for year
+ # {
+ # 1997 => [4, 67],
+ # 1998 => [5, 89],
+ # 1999 => [3, 20]
+ # }
+ $_[0]->{collection};
+};
+
+
+sub to_string {
+ return 'fields:' . join(',', map { '#' . $_ } @{$_[0]->{field_keys}});
+};
+
+
+1;
diff --git a/lib/Krawfish/Compile/Segment/Aggregate/Frequencies.pm b/lib/Krawfish/Compile/Segment/Aggregate/Frequencies.pm
new file mode 100644
index 0000000..099c092
--- /dev/null
+++ b/lib/Krawfish/Compile/Segment/Aggregate/Frequencies.pm
@@ -0,0 +1,31 @@
+package Krawfish::Compile::Segment::Aggregate::Frequencies;
+use parent 'Krawfish::Compile::Segment::Aggregate::Base';
+use Krawfish::Log;
+use strict;
+use warnings;
+
+# Count the frequencies of all matches of the query
+# per doc and per match
+
+# TODO:
+# Support virtual corpus classes
+
+# Add to totalResources immediately
+sub each_doc {
+ $_[2]->{totalResources}++;
+};
+
+
+# Add to totalResults immediately
+sub each_match {
+ $_[2]->{totalResults}++;
+};
+
+
+# Stringification
+sub to_string {
+ 'freq'
+};
+
+1;
+
diff --git a/lib/Krawfish/Compile/Segment/Aggregate/Length.pm b/lib/Krawfish/Compile/Segment/Aggregate/Length.pm
new file mode 100644
index 0000000..87f9274
--- /dev/null
+++ b/lib/Krawfish/Compile/Segment/Aggregate/Length.pm
@@ -0,0 +1,59 @@
+package Krawfish::Compile::Segment::Aggregate::Length;
+use parent 'Krawfish::Compile::Segment::Aggregate::Base';
+use Krawfish::Log;
+use strict;
+use warnings;
+
+# This will check the hits length in subtokens -
+# currently other word lengths are not supported
+
+# See https://en.wikipedia.org/wiki/Selection_algorithm
+# for algorithms to find the median or similar.
+
+
+use constant DEBUG => 0;
+
+sub new {
+ my $class = shift;
+ bless {
+ segment => shift,
+ query => shift,
+ min => 32_000,
+ max => 0,
+ sum => 0,
+ freq => 0
+ }, $class;
+};
+
+
+# On every match
+sub each_match {
+ my ($self, $current) = @_;
+ my $length = $current->end - $current->start;
+ $self->{min} = $length < $self->{min} ? $length : $self->{min};
+ $self->{max} = $length > $self->{max} ? $length : $self->{max};
+ $self->{sum} += $length;
+ $self->{freq}++;
+};
+
+
+# Finish the aggregation
+sub on_finish {
+ my ($self, $collection) = @_;
+
+ return if $self->{freq} == 0;
+ $collection->{length} = {
+ min => $self->{min},
+ max => $self->{max},
+ sum => $self->{sum},
+ avg => $self->{sum} / $self->{freq}
+ };
+};
+
+
+# Stringification
+sub to_string {
+ 'length'
+};
+
+1;
diff --git a/lib/Krawfish/Compile/Segment/Aggregate/TermExistence.pm b/lib/Krawfish/Compile/Segment/Aggregate/TermExistence.pm
new file mode 100644
index 0000000..fe9ed04
--- /dev/null
+++ b/lib/Krawfish/Compile/Segment/Aggregate/TermExistence.pm
@@ -0,0 +1,8 @@
+# Similar to TermFreq, but stops
+# calculating after one match in the virtual corpus.
+# This probably requires a single list of term queries, that can be
+# closed, once a match occurs.
+
+# Probably better suited in Group
+
+__END__
diff --git a/lib/Krawfish/Compile/Segment/Aggregate/TermFreq.pm b/lib/Krawfish/Compile/Segment/Aggregate/TermFreq.pm
new file mode 100644
index 0000000..d76ebe3
--- /dev/null
+++ b/lib/Krawfish/Compile/Segment/Aggregate/TermFreq.pm
@@ -0,0 +1,67 @@
+package Krawfish::Compile::Segment::Aggregate::TermFreq;
+use parent 'Krawfish::Compile::Segment::Aggregate::Base';
+use Krawfish::Util::String qw/squote/;
+use Krawfish::Log;
+use strict;
+use warnings;
+
+# Counts the frequency for each term in a TermFrequency
+# query. This is necessary for co-occurrence search and the
+# Glemm service.
+
+
+# TODO:
+# This is rather a group query than an aggregation query.
+
+use constant DEBUG => 0;
+
+sub new {
+ my $class = shift;
+ my $self = bless {
+ index => shift,
+ term_query => shift,
+ freq => 0
+ }, $class;
+
+ # The term never occurs
+ unless ($self->{term}->next) {
+ $self->{term_query} = undef;
+ };
+
+ return $self;
+};
+
+sub each_doc {
+ my ($self, $current) = @_;
+
+ return unless $self->{term_query};
+
+ # Get the current doc_id from the VC
+ my $doc_id = $current->doc_id;
+
+ my $term = $self->{term_query};
+
+ # Check, if the term occurs in the doc
+ if ($term->current->doc_id == $doc_id || $term->skip_doc($doc_id) == $doc_id) {
+
+ # Add frequency in document to result
+ $self->{freq} += $term->freq_in_doc;
+ };
+};
+
+
+# Finish the result
+sub on_finish {
+ my ($self, $result) = @_;
+
+ my $term = $self->{term_query}->term;
+ my $freq = ($result->{freq} //= {});
+ $frew->{$term} = $self->{freq};
+};
+
+# Stringification
+sub to_string {
+ return 'tfreq:' . squote($self->{term_query}->term);
+};
+
+1;
diff --git a/lib/Krawfish/Compile/Segment/Aggregate/TokenFreq.pm b/lib/Krawfish/Compile/Segment/Aggregate/TokenFreq.pm
new file mode 100644
index 0000000..e165bd3
--- /dev/null
+++ b/lib/Krawfish/Compile/Segment/Aggregate/TokenFreq.pm
@@ -0,0 +1 @@
+# Similar to TermFreq but will count the number of tokens per document.
diff --git a/lib/Krawfish/Compile/Segment/Aggregate/Values.pm b/lib/Krawfish/Compile/Segment/Aggregate/Values.pm
new file mode 100644
index 0000000..71b0711
--- /dev/null
+++ b/lib/Krawfish/Compile/Segment/Aggregate/Values.pm
@@ -0,0 +1,107 @@
+package Krawfish::Compile::Segment::Aggregate::Values;
+use parent 'Krawfish::Compile::Segment::Aggregate::Base';
+use Krawfish::Koral::Result::Aggregate::Values;
+use Krawfish::Log;
+use strict;
+use warnings;
+
+# TODO:
+# Rename to FieldCalc or FieldSum
+
+# TODO:
+# Support corpus classes
+
+# TODO:
+# This is rather a group query or better:
+# An aggregation on groups!
+
+use constant {
+ DEBUG => 0
+};
+
+sub new {
+ my $class = shift;
+ my $self = bless {
+ fields_obj => shift,
+
+ # This needs to be numerical field ids!
+ field_ids => [map { ref($_) ? $_->term_id : $_ } @{shift()}],
+ }, $class;
+
+ # Initialize aggregator
+ $self->{aggregation} = Krawfish::Koral::Result::Aggregate::Values->new(
+ $self->{field_ids}
+ );
+
+ return $self;
+};
+
+
+# Initialize field pointer
+sub _init {
+ return if $_[0]->{field_pointer};
+
+ my $self = shift;
+
+ $self->{field_pointer} = $self->{fields_obj}->pointer;
+};
+
+
+# Release for each doc
+sub each_doc {
+ my ($self, $current, $result) = @_;
+
+ $self->_init;
+
+ if (DEBUG) {
+ print_log('aggr_values', 'Aggregate on field values');
+ };
+
+ # Get current document
+ my $doc_id = $current->doc_id;
+
+ my $pointer = $self->{field_pointer};
+
+ # Get aggregation information
+ my $aggr = $self->{aggregation};
+
+ # Move fields pointer to current document
+ if ($pointer->skip_doc($doc_id) == $doc_id) {
+
+ # collect values
+ my @values = $pointer->int_fields(@{$self->{field_ids}}) or return;
+
+ # Aggregate all values
+ foreach my $field (@values) {
+
+ # Aggregate value
+ $aggr->add($field->key_id, $field->value);
+ };
+ };
+};
+
+
+sub collection {
+ $_[0]->{collection};
+};
+
+
+# Stringification
+sub to_string {
+ return 'values:' . join(',', @{$_[0]->{field_ids}});
+};
+
+
+# Finish the aggregation
+sub on_finish {
+ my ($self, $collection) = @_;
+
+ # Summarize collection
+ $self->{aggregation}->summarize;
+
+ # Maybe push to collection instead
+ $collection->{values} = $self->{aggregation};
+};
+
+
+1;
diff --git a/lib/Krawfish/Compile/Segment/Bundle.pm b/lib/Krawfish/Compile/Segment/Bundle.pm
new file mode 100644
index 0000000..6336697
--- /dev/null
+++ b/lib/Krawfish/Compile/Segment/Bundle.pm
@@ -0,0 +1,117 @@
+package Krawfish::Compile::Segment::Bundle;
+use parent 'Krawfish::Compile';
+use Krawfish::Log;
+use strict;
+use warnings;
+
+
+# This class represents bundles of postings
+# (or bundles of bundles of postings) sorted by a certain criterion.
+
+
+use constant DEBUG => 0;
+
+
+# Bundle the current match
+sub current_bundle {
+ my $self = shift;
+
+ if (DEBUG) {
+ print_log('bundle', 'Get bundle');
+ };
+
+ return $self->{current_bundle};
+};
+
+
+# Get current match
+sub current_match {
+ $_[0]->{current_match};
+};
+
+
+# Get next bundle
+# This needs to be overwritten!
+sub next_bundle {
+ ...
+};
+
+
+
+# Move to the next posting in the list,
+# maybe in nested bundles.
+# These calls methods in Posting::Bundle!
+sub next {
+ my $self = shift;
+
+ if (DEBUG) {
+ print_log('bundle', 'Move to next posting');
+ };
+
+ # Get current bundle
+ my $bundle = $self->current_bundle;
+
+ # Check next in bundle
+ while (!$bundle || !$bundle->next) {
+
+ if (DEBUG) {
+ if (!$bundle) {
+ print_log('bundle', 'Current bundle does not exist yet or there is none');
+ }
+ else {
+ print_log('bundle', 'There is no more entry in current bundle');
+ };
+
+ print_log('bundle', 'Move to next bundle');
+ };
+
+
+ # There are more bundles
+ if ($self->next_bundle) {
+ $bundle = $self->current_bundle;
+ if (DEBUG) {
+ print_log('bundle', 'Current bundle to check is ' . $bundle->to_string);
+ };
+ }
+
+ # There are no more bundles
+ else {
+
+ if (DEBUG) {
+ print_log('bundle', 'No more bundles');
+ };
+
+ $self->{current} = undef;
+ return 0;
+ };
+ };
+
+ $self->{current} = $bundle->current;
+
+ if (DEBUG) {
+ print_log('bundle', 'Set current posting to ' . $self->{current}->to_string);
+ };
+
+ return 1;
+};
+
+
+# Return the current match
+sub current {
+ my $self = shift;
+ if (DEBUG) {
+ print_log('bundle', 'Current posting is ' . $self->{current}->to_string);
+ };
+
+ $self->{current};
+};
+
+
+
+# Get frequency
+sub max_freq {
+ $_[0]->{query}->max_freq;
+};
+
+
+1;
diff --git a/lib/Krawfish/Compile/Segment/BundleDocs.pm b/lib/Krawfish/Compile/Segment/BundleDocs.pm
new file mode 100644
index 0000000..db13935
--- /dev/null
+++ b/lib/Krawfish/Compile/Segment/BundleDocs.pm
@@ -0,0 +1,139 @@
+package Krawfish::Compile::Segment::BundleDocs;
+use parent 'Krawfish::Compile::Segment::Bundle';
+use Krawfish::Log;
+use Krawfish::Posting::List;
+use strict;
+use warnings;
+
+# Bundle matches in the same document.
+
+# TODO:
+# The problem with the current approch is, that next_bundle
+# will bundle the next doc - without asking if the doc
+# is relevant.
+
+use constant DEBUG => 0;
+
+sub new {
+ my $class = shift;
+ bless {
+ query => shift,
+ buffer => undef
+ }, $class;
+};
+
+
+# Clone query
+sub clone {
+ my $self = shift;
+ __PACKAGE__->new(
+ $self->{query}->clone
+ );
+};
+
+
+# TODO:
+# Implement next doc!
+sub next_doc {
+ ...
+};
+
+
+# Move to next bundle
+sub next_bundle {
+ my $self = shift;
+
+ # Reset current bundle
+ $self->{current_bundle} = undef;
+
+ my $first;
+
+ # There is a bundle on buffer
+ if ($self->{buffer}) {
+
+
+ if (DEBUG) {
+ print_log('d_bundle', 'There is a buffered posting for first');
+ };
+
+ $first = $self->{buffer};
+ $self->{buffer} = undef;
+ }
+
+ # Move forward
+ elsif ($self->{query}->next) {
+
+ if (DEBUG) {
+ print_log('d_bundle', 'Get new posting for first');
+ };
+
+ unless ($first = $self->{query}->current) {
+
+ if (DEBUG) {
+ print_log('d_bundle', 'There is no more posting');
+ };
+
+ return;
+ };
+ }
+
+ # Can't move forward
+ else {
+
+ if (DEBUG) {
+ print_log('d_bundle', 'No postings to move forward');
+ };
+
+ return;
+ };
+
+ my $bundle = Krawfish::Posting::List->new($first);
+ my $query = $self->{query};
+
+ if (DEBUG) {
+ print_log('d_bundle', 'Start bundle with ' . $first->to_string);
+ };
+
+ # There is a next entry
+ my $next;
+ while ($query->next) {
+ $next = $query->current;
+
+ # Documents are identical - bundle
+ if ($next->doc_id == $first->doc_id) {
+
+ if (DEBUG) {
+ print_log('d_bundle', 'Posting ' . $next->to_string . ' has identical doc id');
+ };
+
+ $bundle->add($next);
+ }
+
+ # Remember the next bundle
+ else {
+
+ if (DEBUG) {
+ print_log('d_bundle', 'Posting ' . $next->to_string . ' has different doc id');
+ };
+
+ $self->{buffer} = $next;
+ last;
+ };
+ };
+
+ if (DEBUG) {
+ print_log('d_bundle', 'Current bundle is ' . $bundle->to_string);
+ };
+
+ $self->{current_bundle} = $bundle;
+ return 1;
+};
+
+
+# Stringification
+sub to_string {
+ 'bundleDocs(' . $_[0]->{query}->to_string . ')';
+};
+
+
+1;
diff --git a/lib/Krawfish/Compile/Segment/Enrich/Context.pm b/lib/Krawfish/Compile/Segment/Enrich/Context.pm
new file mode 100644
index 0000000..09feebd
--- /dev/null
+++ b/lib/Krawfish/Compile/Segment/Enrich/Context.pm
@@ -0,0 +1,113 @@
+package Krawfish::Compile::Segment::Enrich::Context;
+use parent 'Krawfish::Compile';
+use Krawfish::Log;
+use strict;
+use warnings;
+
+use constant DEBUG => 0;
+
+
+# DEPRECATED!!!
+
+
+# This will add context (only surface forms) to each match
+
+# TODO:
+# Context always needs to be left AND right, because
+# at least the surrounding elements context will expand
+# both sides at the same time!
+
+
+sub new {
+ my $class = shift;
+ bless {
+ forward_obj => shift,
+ query => shift,
+
+ # TODO:
+ # Should support
+ # - surrounding elements
+ # - left elements / right elements
+ # - left tokens / right tokens
+ # - left characters / right characters
+ contextualize => shift, # Accept context object
+ match => undef
+ }, $class;
+};
+
+# Initialize forward counter
+sub _init {
+ return if $_[0]->{forward_pointer};
+
+ my $self = shift;
+
+ print_log('e_context', 'Create forward pointer') if DEBUG;
+
+ # Load the ranked list - may be too large for memory!
+ $self->{forward_pointer} = $self->{forward_obj}->pointer;
+};
+
+
+sub current_match {
+ my $self = shift;
+
+ $self->_init;
+
+ # Match is already set
+ if ($self->{match}) {
+ if (DEBUG) {
+ print_log(
+ 'e_context',
+ 'Match already defined ' . $self->{match}->to_string
+ );
+ };
+ return $self->{match};
+ };
+
+ # TODO:
+ # may simply be $self->{query}->current_match
+ my $match = $self->match_from_query;
+
+ # Get forward pointer
+ my $forward = $self->{forward_pointer};
+
+ my $doc_id = $match->doc_id;
+
+ unless ($forward->skip_doc($doc_id) == $doc_id) {
+
+ # TODO: This should never happen!
+ return;
+ };
+
+ # Get the context
+ # TODO:
+ # This may be retrieved as part of the snippet!
+ my $left_context = $self->{contextualize}->left_context($match, $forward);
+ my $right_context = $self->{contextualize}->right_context($match, $forward);
+
+ # Add context to match
+ $match->add(
+ Krawfish::Posting::Match::Context->new(
+ left => $left_context,
+ right => $right_context
+ ));
+
+ return $match;
+};
+
+
+# Next match
+sub next {
+ my $self = shift;
+ $self->{match} = undef;
+ return $self->{query}->next;
+};
+
+
+sub to_string {
+ my $str = 'enrichContext(' . $self->{contextualize}->to_string . ':';
+ $str .= $_[0]->{query}->to_string;
+ return $str . ')';
+};
+
+1;
diff --git a/lib/Krawfish/Compile/Segment/Enrich/Fields.pm b/lib/Krawfish/Compile/Segment/Enrich/Fields.pm
new file mode 100644
index 0000000..a201418
--- /dev/null
+++ b/lib/Krawfish/Compile/Segment/Enrich/Fields.pm
@@ -0,0 +1,135 @@
+package Krawfish::Compile::Segment::Enrich::Fields;
+use parent 'Krawfish::Compile';
+use Krawfish::Koral::Result::Enrich::Fields;
+use Krawfish::Log;
+use strict;
+use warnings;
+
+use constant DEBUG => 0;
+
+# This will enrich each match with specific field information
+# Needs to be called on the segment level
+
+# Constructor
+sub new {
+ my $class = shift;
+ bless {
+ field_obj => shift,
+ query => shift,
+
+ # Could be treated like in Aggregate/Fields or Group/Fields!
+ fields => shift, # Expects to be numerical sorted field identifier
+ match => undef,
+ pointer => undef,
+ last_doc_id => -1
+ }, $class;
+};
+
+sub _init {
+ my $self = shift;
+
+ # Pointer is already initiated
+ return if $self->{init}++;
+
+ # Match is already set
+ if (DEBUG) {
+ print_log(
+ 'r_fields',
+ 'Initiate pointer to fields'
+ );
+ };
+
+ $self->{pointer} = $self->{field_obj}->pointer;
+ return;
+};
+
+
+sub pointer {
+ $_[0]->{pointer};
+};
+
+
+# Get current match
+sub current_match {
+ my $self = shift;
+
+ $self->_init;
+
+ # Match is already set
+ if ($self->{match}) {
+ if (DEBUG) {
+ print_log(
+ 'r_fields',
+ 'Match already defined ' . $self->{match}->to_string
+ );
+ };
+ return $self->{match};
+ };
+
+ my $match = $self->match_from_query;
+
+
+ # Match is in the same document as before
+ if ($match->doc_id == $self->{last_doc_id}) {
+
+ # Create an enrichment
+ $match->add(
+ Krawfish::Koral::Result::Enrich::Fields->new(@{$self->{last_fields}})
+ );
+
+ # The document has no associated fields
+ return ($self->{match} = $match);
+ };
+
+ # Retrieve from data
+
+ # Get fields object
+ my $fields = $self->{pointer};
+
+ # Move to document in field stream
+ my $fields_doc_id = $fields->skip_doc($match->doc_id);
+ if ($fields_doc_id != $match->doc_id) {
+
+ if (DEBUG) {
+ print_log('r_fields', 'Match doc id #' . $match->doc_id .
+ ' mismatches fields doc id #' . $fields_doc_id);
+ };
+
+ # The document has no associated fields
+ return ($self->{match} = $match);
+ };
+
+ # Get the fields from the fields stream
+ my @fields = $fields->fields(@{$self->{fields}});
+
+ $self->{last_doc_id} = $match->doc_id;
+ $self->{last_fields} = [@fields];
+
+ # Create an enrichment
+ $match->add(
+ Krawfish::Koral::Result::Enrich::Fields->new(@fields)
+ );
+
+ $self->{match} = $match;
+
+ print_log('r_fields', 'Match now contains data for ' . join(', ', @fields)) if DEBUG;
+
+ return $match;
+};
+
+
+# Next match
+sub next {
+ my $self = shift;
+ $self->{match} = undef;
+ return $self->{query}->next;
+};
+
+
+sub to_string {
+ my $str = 'enrichFields(' . join(',', @{$_[0]->{fields}}) . ':';
+ $str .= $_[0]->{query}->to_string;
+ return $str . ')';
+};
+
+1;
diff --git a/lib/Krawfish/Compile/Segment/Enrich/Snippet.pm b/lib/Krawfish/Compile/Segment/Enrich/Snippet.pm
new file mode 100644
index 0000000..8de0052
--- /dev/null
+++ b/lib/Krawfish/Compile/Segment/Enrich/Snippet.pm
@@ -0,0 +1,126 @@
+package Krawfish::Compile::Segment::Enrich::Snippet;
+use parent 'Krawfish::Compile';
+use Krawfish::Koral::Result::Enrich::Snippet;
+# use Krawfish::Compile::Segment::Enrich::Snippet::Highlights;
+use Krawfish::Log;
+use strict;
+use warnings;
+
+use constant DEBUG => 0;
+
+# TODO:
+# It may be more efficient to first collect all required
+# annotations (for decoration, context, hit etc.) and
+# then iterate over left context, hit, right context
+# and get all annotations per token at a time
+
+
+sub new {
+ my $class = shift;
+ # query
+ # fwd_obj
+ # left
+ # right
+ # hit
+ return bless { @_ }, $class;
+};
+
+
+# Initialize forward index
+sub _init {
+ return if $_[0]->{_init}++;
+
+ my $self = shift;
+ $self->{fwd_pointer} = $self->{fwd_obj}->pointer;
+};
+
+
+# Iterated through the ordered matches
+sub next {
+ my $self = shift;
+ $self->_init;
+ $self->{match} = undef;
+ return $self->{query}->next;
+};
+
+
+# Return the current match
+sub current_match {
+ my $self = shift;
+
+ print_log('c_snippet', 'Get current match') if DEBUG;
+
+ # Match is already set
+ return $self->{match} if $self->{match};
+
+ # Get current match from query
+ my $match = $self->match_from_query;
+
+ print_log('c_snippet', 'match is ' . $match) if DEBUG;
+
+ # Get forward query
+ my $forward = $self->{fwd_pointer};
+
+ # TODO:
+ # Fetch preceding context
+
+ my $doc_id = $match->doc_id;
+ unless ($forward->skip_doc($doc_id) == $doc_id) {
+
+ # TODO: This should never happen!
+ return;
+ };
+
+ if (DEBUG) {
+ print_log('c_snippet', 'Move to match doc position');
+ };
+
+
+ # Move pointer to start position of match
+ unless ($forward->skip_pos($match->start)) {
+
+ # This should never happen!
+ return;
+ };
+
+ # Get data from hit
+ my $hit_data = $self->{hit}->content($match, $forward);
+
+ if (DEBUG) {
+ print_log('c_snippet', 'Move to match position');
+ };
+
+ # Create snippet posting
+ my $snippet = Krawfish::Koral::Result::Enrich::Snippet->new(
+ hit_ids => $hit_data
+ );
+
+ # Add snippet to match
+ $match->add($snippet);
+
+ # Deal with left
+ # Deal with hit
+ # Deal with right
+
+ $self->{match} = $match;
+ return $match;
+};
+
+
+# Stringification
+sub to_string {
+ my $self = shift;
+ my $str = 'snippet(';
+ if ($self->{left}) {
+ $str .= $self->{left}->to_string . ',';
+ };
+ if ($self->{right}) {
+ $str .= $self->{right}->to_string . ',';
+ };
+ $str .= $self->{hit}->to_string . ':';
+ $str .= $self->{query}->to_string;
+ return $str . ')';
+};
+
+
+1;
diff --git a/lib/Krawfish/Compile/Segment/Enrich/Snippet/Context.pm b/lib/Krawfish/Compile/Segment/Enrich/Snippet/Context.pm
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/lib/Krawfish/Compile/Segment/Enrich/Snippet/Context.pm
diff --git a/lib/Krawfish/Compile/Segment/Enrich/Snippet/Context/Span.pm b/lib/Krawfish/Compile/Segment/Enrich/Snippet/Context/Span.pm
new file mode 100644
index 0000000..4ebd8ce
--- /dev/null
+++ b/lib/Krawfish/Compile/Segment/Enrich/Snippet/Context/Span.pm
@@ -0,0 +1,145 @@
+package Krawfish::Compile::Segment::Enrich::Snippet::Context::Span;
+use Krawfish::Posting::Forward;
+use strict;
+use warnings;
+
+sub new {
+ my $class = shift;
+ # foundry_id, layer_id, anno_id, count
+ #
+ # The number of elements left or right
+ # to the match. Defaults to 0
+ # (so: only expand to the element)
+ # count
+
+ # Maximum number of tokens
+ # max
+ bless { @_ }, $class;
+};
+
+
+# Get left context
+sub left {
+ my ($self, $match, $pointer) = @_;
+
+ # The context as an array of preceeding strings and term_ids
+ my @context = ();
+
+ # Do not search beyond maximum tokens
+ my $max = $self->{max_left};
+ my $count = $self->{count_left};
+ my $last_match = undef;
+
+ # Get the start position of the match
+ # Move to that position (may be before current pointer position
+ unless ($pointer->skip_pos($match->start)) {
+ warn "pointer currently can't be repositioned";
+ };
+
+ # Remember the match position
+ my $remember_position = $pointer->current;
+
+ # As long as it is allowed, iterate through tokens
+ while ($max-- > 0) {
+
+ # Get current forward token
+ my $current = $pointer->current;
+ my $pos = $current->pos;
+
+ # Check if the token is relevant for annotation
+ if (my $anno = $current->annotation(
+ $self->{foundry_id},
+ $self->{layer_id},
+ $self->{anno_id},
+ )) {
+ # Annotation was found - get span length!
+
+ # Get the first annotation (that has the shortest span length)
+ # TODO:
+ # It may be beneficial to check, if there is a better length,
+ # e.g. one that surrounds the match
+ my $anno_data = $anno->[-1];
+
+ # The first part is the end position
+ my $anno_end = $anno_data->[0];
+
+ # TODO:
+ # DO something with $anno_end
+
+ # Element was found
+ if ($count-- < 0) {
+
+ # The span is not the start of the match
+ # (in which the part is already in the match)
+ if ($pos != $match->start) {
+ # Add token to context
+ unshift @context, Krawfish::Posting::Forward->new(
+ term_id => $current->term_id,
+
+ # Preceding data may need to be cut!
+ preceding_data => $current->preceding_data
+ );
+ };
+
+ # The requirement is fulfilled
+ $last_match = undef;
+ last;
+ };
+
+ # remember last match to gracefully cut all matches
+ $last_match = $max;
+ }
+
+ # Element is not found at position
+ else {
+
+ # The token is part of the match (and therefore not part of the context)
+ if ($pos == $match->start) {
+ # Add token to context
+ unshift @context, Krawfish::Posting::Forward->new(
+ term_id => 0,
+ preceding_data => $current->preceding_data
+ );
+ };
+
+ # Add token to context
+ unshift @context, Krawfish::Posting::Forward->new(
+ term_id => $current->term_id,
+ preceding_data => $current->preceding_data
+ );
+
+ # Maximum tokens exceeded
+ if ($max-- < 1) {
+ last;
+ };
+
+ # Move backwards
+ $pointer->prev;
+ };
+ };
+
+ # There is a last match to trim
+ if ($last_match) {
+ # TODO:
+ # Cut the context array to the last match
+ # TODO:
+ # Cut the last preceding data part as well!
+ };
+
+ # TODO
+ # Revert array
+};
+
+sub right {
+ ...
+};
+
+sub to_string {
+ my $self = shift;
+ my $str = 'span(';
+ $str .= '#' . $self->{foundry_id} . '/#' . $self->{layer_id} . '=#' . $self->{anno_id} . ',';
+ return $str . $self->{count} . ',' . $self->{max} . ')';
+};
+
+
+1;
diff --git a/lib/Krawfish/Compile/Segment/Enrich/Snippet/Element.pm b/lib/Krawfish/Compile/Segment/Enrich/Snippet/Element.pm
new file mode 100644
index 0000000..163213a
--- /dev/null
+++ b/lib/Krawfish/Compile/Segment/Enrich/Snippet/Element.pm
@@ -0,0 +1,24 @@
+package Krawfish::Compile::Segment::Snippet::Element;
+use Krawfish::Log;
+use strict;
+use warnings;
+
+sub start_char {
+ ...
+};
+sub end_char {
+ ...
+};
+
+sub open_html {
+ ...
+};
+sub close_html {
+ ...
+};
+
+1;
+
+__END__
+
+1;
diff --git a/lib/Krawfish/Compile/Segment/Enrich/Snippet/Element/Pagebreak.pm b/lib/Krawfish/Compile/Segment/Enrich/Snippet/Element/Pagebreak.pm
new file mode 100644
index 0000000..036d69d
--- /dev/null
+++ b/lib/Krawfish/Compile/Segment/Enrich/Snippet/Element/Pagebreak.pm
@@ -0,0 +1,30 @@
+package Krawfish::Compile::Segment::Enrich::Snippet::Element::Pagebreak;
+use parent 'Krawfish::Compile::Segment::Enrich::Snippet::Element';
+use strict;
+use warnings;
+
+sub new {
+ my $self = shift;
+ bless {
+ start_char => shift,
+ page_after => shift
+ }, $class;
+};
+
+sub start_char {
+ return $_[0]->{start_char};
+};
+
+sub end_char {
+ return $_[0]->{end_char};
+};
+
+sub open_html {
+ return '<span class="pagebreak" data-page-after="' . $self->{page_after} . '"></span>';
+};
+
+sub close_html {
+ return '';
+};
+
+1;
diff --git a/lib/Krawfish/Compile/Segment/Enrich/Snippet/Highlights.pm b/lib/Krawfish/Compile/Segment/Enrich/Snippet/Highlights.pm
new file mode 100644
index 0000000..0f8bfb3
--- /dev/null
+++ b/lib/Krawfish/Compile/Segment/Enrich/Snippet/Highlights.pm
@@ -0,0 +1,564 @@
+package Krawfish::Compile::Segment::Enrich::Snippet::Highlights;
+use Krawfish::Log;
+use strict;
+use warnings;
+
+use constant DEBUG => 0;
+
+# -1 is match highlight
+# $annotation_nr_counter = 256;
+# $relation_number_counter = 2048;
+# $identifier_number_counter = -2;
+
+# private HashMap<Integer, String> annotationNumber = new HashMap<>(16);
+# private HashMap<Integer, Relation> relationNumber = new HashMap<>(16);
+# private HashMap<Integer, Integer> identifierNumber = new HashMap<>(16);
+
+sub new {
+ my $class = shift;
+ bless {
+ highlights => shift,
+ segments => shift,
+ list => [], # Combined array
+ stack => [] # Stack for balancing the elements
+ }, $class;
+};
+
+
+sub clear {
+ ...
+};
+
+
+sub add_open {
+ my $self = shift;
+};
+
+sub process {
+ ...
+};
+
+1;
+
+__END__
+
+
+sub parse_simple {
+ my $self = shift;
+
+ my $segments = $self->{segments};
+
+ # TODO:
+ # In Krill, offsets are collected in advance,
+ # but I guess it's cleaner to do on the fly
+
+ print_log('c_highl', 'Process highlight stack') if DEBUG;
+
+ my @highlights = ();
+
+ # my $start_seg = $segments->get($match->doc_id, $match->start);
+ # my $end_seg = $segments->get($match->doc_id, $match->end - 1);
+
+ # Add match as highlight
+ push @highlights, _highlight($match->start, $match->end, -1);
+
+ # TODO:
+ # Check that highlights are between these values
+ # my $start_pos = $match->start;
+ # my $end_pos = $match->end;
+
+ # TODO:
+ # Filter multiple identifiers, that may be introduced and would
+ # result in invalid xml
+ # this._filterMultipleIdentifiers();
+
+ my @open_list = sort _open_sort @highlights;
+ my @close_list = sort _close_sort @highlights;
+
+ # Final highlight stack
+ my @stack = ();
+
+ # Create sorted stack unless both lists are empty
+ while (scalar @open_list || scalar @close_list) {
+
+ # Shortcut for list ending
+ if (!scalar @open_list) {
+ push @stack, map { $_->[4] = } @close_list;
+ last;
+ }
+
+ # Not sure about this - but may happen
+ elsif (!scalar @close_list) {
+ last;
+ };
+
+ # Type 0: Textual data
+ # Type 1: Opening
+ # Type 2: Closing
+
+ # Check if the opening tag starts before the closing tag ends
+ if ($open_list[0]->[0] < $close_list[0]->[1]) {
+
+ # Clone highlight
+ my $element = [@{(shift(@open_list))}];
+
+ # Set element to be terminal
+ $element->[3] = 1; # terminal
+ $element->[4] = 1; # Opening
+ push @stack, $element;
+ }
+
+ # No - then close
+ else {
+ my $element = shift(@close_list);
+ $element->[4] = 2; # closing
+ push @stack, $element;
+ };
+ };
+
+ $self->{stack} = \@stack; # is a position stack!
+
+ # TODO:
+ # Problem to solve is now discontinuing elements!
+
+ return $self;
+};
+
+
+sub _highlight {
+ my ($start, $end, $class, $terminal) = @_;
+ return [$start, $end, $class, $terminal // 0];
+};
+
+# Sort opening tags by start, end and class number
+sub _open_sort {
+
+ # Compare start position
+ if ($a->start > $b->start) {
+ return 1;
+ }
+ elsif ($a->start == $b->start) {
+ # Compare end position
+ if ($a->end > $b->end) {
+ return -1;
+ }
+ elsif ($a->end == $b->end) {
+ # Compare class number
+ if ($a->[2] > $b->[2]) {
+ return 1;
+ }
+ elsif ($a->[2] < $b->[2]) {
+ return -1;
+ };
+ return 0;
+ };
+ }
+ return -1;
+};
+
+
+# Sort closing tags by end and start
+sub _close_sort {
+
+ # Compare end positions
+ if ($a->[1] > $b->[1]) {
+ return 1;
+ }
+ elsif ($a->[1] == $b->[1]) {
+
+ # Compare start position
+ if ($a->[0] < $b->[0]) {
+ return 1;
+ }
+ elsif ($a->[0] == $b->[0]) {
+ return 0;
+ };
+ return -1;
+ };
+ return -1;
+};
+
+
+
+
+
+1;
+
+
+
+
+__END__
+
+sub parse {
+ my ($self, $match) = @_;
+
+ my $segments = $self->{segments};
+
+ # Collect offsets for match
+ # TODO: In Krill, offsets are collected in advance,
+ # but I guess it's cleaner to do on the fly
+ my $start_seg = $segments->get($match->doc_id, $match->start);
+ my $end_seg = $segments->get($match->doc_id, $match->end - 1);
+
+ # TODO: Collect offsets for inner match
+
+ # match number
+
+ # TODO: Parse identifier string
+
+ # $self->add_open(0, $start_seg->[0]);
+ # $self->add_close(0, $end_seg->[1]);
+
+ # foreach (@highlights) {
+ # if ($_->start >= $match->start && $_->end <= $self->end) {
+ #
+ # };
+ # }
+
+ my $stack = $self->_process_highlight_stack;
+ return $self->{list};
+};
+
+
+sub _process_highlight_spans {
+ my $self = shift;
+
+ # TODO:
+ # Check potential start and end characters here
+
+ my $identifier = undef;
+
+ # my $array = $self->_process_offset_chars($match->doc_id);
+
+ # foreach my $highlight (@{$self->highlights}) {
+ # my $start = $self->{segments}->get($match->doc_id, $highlight->start);
+ # my $end = $self->{segments}->get($match->doc_id, $highlight->end);
+
+ # return if $start < 0 || $end < 0;
+ # $self->{span}->add($start, $end, $highlight->nr);
+ # };
+};
+
+
+# TODO: Process context, primary data
+sub _process_offset_chars {
+# my $self = shift;
+ # if ($context) {}
+};
+
+sub _process_highlight_stack {
+ my $self = shift;
+
+ print_log('c_highl', 'Process highlight stack') if DEBUG;
+
+ my @open_list = ();
+ my @close_list = ();
+
+ # TODO:
+ # Filter multiple identifiers, that may be introduced and would
+ # result in invalid xml
+ # this._filterMultipleIdentifiers();
+
+ my @highlights = @_;
+
+ push @open_list, @highlights;
+ push @close_list, @highlights;
+
+ @open_list = sort _open_sort @open_list;
+ @close_list = sort _close_sort @open_list;
+
+ my @stack = ();
+
+ # Create sorted stack unless both lists are empty
+ while (scalar @open_list || scalar @close_list) {
+ if (!scalar @open_list) {
+ push @stack, @close_list;
+ last;
+ }
+
+ # Not sure about this - but may happen
+ elsif (!scalar @close_list) {
+ last;
+ };
+
+ if ($open_list[0]->start < $close_list[0]->end) {
+
+ my $e = (shift(@open_list))->clone;
+ $e->[3] = 1;
+ push @stack, $e;
+ }
+ else {
+ push @stack, shift(@close_list)
+ };
+ };
+ return \@stack;
+};
+
+sub add_close {
+ my $self = shift;
+ my ($nr, $end) = @_;
+
+ $self->{temp_stack} = [];
+
+ # Check if there is an opening tag
+ unless ($self->{stack}->[0]) {
+ warn 'Nothing to close on stack';
+ return;
+ };
+
+ if (DEBUG) {
+ print_log(
+ 'c_highl',
+ "Stack for checkinmg with class $nr is " .
+ join('|', @{$self->{stack}})
+ );
+ };
+
+
+ # Class number of the last element
+ my $eold = pop @{$self->{stack}};
+
+ my $last_combinator;
+
+ # the closing element is not balanced, i.e. the last element differs
+ while ($eold != $nr) {
+
+ # Get last element
+ $last_combinator = $self->{list}->[-1];
+
+ if (DEBUG) {
+ print_log(
+ 'c_highl',
+ 'Closing element is unbalanced - ' .
+ $eold . ' != ' . $nr . ' with last combinator ' .
+ join('|',
+ $last_combinator->{type},
+ $last_combinator->{nr},
+ $last_combinator->{chars}
+ )
+ );
+ };
+
+ # combinator is opening and the number is not equal to the last
+ # element on the balanceStack
+ if ($last_combinator->{type} == 1 && $last_combinator->{nr} == $eold) {
+
+ # Remove the last element - it's empty and uninteresting!
+ pop @{$self->{list}};
+ }
+
+ # combinator is either closing (??) or another opener
+ else {
+
+ print_log('c_highl', "Close element a) $eold") if DEBUG;
+
+ # Add close element of the unclosed element
+ # This will be continued
+ push @{$self->{list}}, Krawfish::Collection::Snippet::Highlights::Combinator->new_node(
+ 2, $eold, 0
+ );
+ };
+
+ # add this element number temporarily on the stack
+ push @{$self->{temp_stack}}, $eold;
+
+ # Check next element
+ $eold = pop @{$self->{stack}};
+ };
+
+ # Get last combinator on the stack
+ $last_combinator = $self->{list}->[-1];
+
+ if (DEBUG) {
+ print_log(
+ 'c_highl',
+ "LastComb: " .
+ join('|',
+ $last_combinator->{type},
+ $last_combinator->{nr},
+ $last_combinator->{chars}
+ ) .
+ " for $nr"
+ );
+ };
+
+ if ($last_combinator->{type} == 1 && $last_combinator->{nr} == $nr) {
+
+ while ($last_combinator->{type} == 1 && $last_combinator->{nr} == $nr) {
+ # Remove the damn thing - It's empty and uninteresting!
+ pop @{$self->{list}};
+ $last_combinator = $self->{list}->[-1];
+ };
+ }
+
+ else {
+ print_log('c_highl', "Close element b) $nr") if DEBUG;
+
+ # Add closer
+ push @{$self->{list}}, Krawfish::Collection::Snippet::Highlights::Combinator->new_node(
+ 2, $eold, 1
+ );
+ };
+
+ for my $e (@{$self->{temp_stack}}) {
+ print_log('c_highl', "Reopen element $e") if DEBUG;
+ push @{$self->{list}}, Krawfish::Collection::Snippet::Highlights::Combinator->new_node(
+ 1, $e
+ );
+
+ push @{$self->{stack}}, $e;
+ };
+};
+
+
+sub get_first {
+ $_[0]->{list}->[0];
+};
+
+
+sub get_last {
+ $_[0]->{list}->[-1];
+};
+
+
+sub get {
+ $_[0]->{list}->[$_[1]];
+};
+
+
+sub size {
+ scalar @{$_[0]->{list}}
+};
+
+
+# Add textual element
+sub add_string {
+ my ($self, $string) = @_;
+ my $element = Krawfish::Collection::Snippet::Highlights::Combinator->new_text(
+ $string
+ );
+ push @{$self->{list}}, $element;
+};
+
+
+# Open element
+sub add_open {
+ my ($self, $number) = @_;
+
+ my $element = Krawfish::Collection::Snippet::Highlights::Combinator->new_node(
+ 1 => $number
+ );
+ push @{$self->{list}}, $element;
+ push @{$self->{stack}}, $number;
+};
+
+
+sub to_string {
+ my $self = shift;
+ my $str = '';
+ foreach (@{$self->{list}}) {
+ $str .= $_->to_string . "\n";
+ };
+ return $str;
+};
+
+
+package Krawfish::Collection::Snippet::Highlights::Combinator;
+use strict;
+use warnings;
+
+# Type 0: Textual data
+# Type 1: Opening
+# Type 2: Closing
+
+# Constructor for nodes
+sub new_node {
+ my $class = shift;
+ my $self = bless {
+ type => shift, # byte
+ nr => shift, # integer
+ terminal => shift // 1, # boolean
+ chars => ''
+ }, $class;
+
+ # Terminal elements are closed and won't be reopened
+
+ return $self;
+};
+
+
+# Constructor for textual data
+sub new_text {
+ my $class = shift;
+ bless {
+ type => 0,
+ chars => shift,
+ nr => 0,
+ terminal => 1
+ }, $class;
+};
+
+
+# TODO: This may not be set here
+sub to_bracket {
+ my $self = shift;
+ my $match = shift;
+
+ my $str = '';
+
+ # Closing bracket
+ if ($self->{type} == 2) {
+
+ # Close matching element
+ if ($self->{nr} == -1) {
+ return ']';
+ };
+
+ # Close matching highlight, relation, span ...
+ return '}';
+ }
+
+ elsif ($self->{type} == 1) {
+ if ($self->{nr} == -1) {
+ $str .= '[';
+ }
+
+ # Is identifier
+ elsif ($self->{nr} < -1) {
+ $str .= '{#' . $match->class_id($self->{nr}) . ':';
+ }
+
+ # Highlight, relation, Span
+ else {
+ $str .= '{';
+
+ # Todo: Use highlight directive
+
+ if ($self->{nr} >= 256) {
+
+ # Is an annotation?
+ if ($self->{nr} < 2048) {
+ $str .= $match->annotation_id($self->{nr});
+ }
+
+ # Relation
+ else {
+ my $rel = $match->relation_id($self->{nr});
+ $str .= $rel->annotation;
+ $str .= '>';
+ }
+ }
+
+ # Highlight
+ elsif ($self->{nr} != 0) {
+ $str .= $self->{nr} . ':';
+ }
+
+ return $str;
+ };
+
+ return $self->{chars};
+ };
+};
+
+1;
diff --git a/lib/Krawfish/Compile/Segment/Enrich/Snippet/Hit.pm b/lib/Krawfish/Compile/Segment/Enrich/Snippet/Hit.pm
new file mode 100644
index 0000000..a22fd95
--- /dev/null
+++ b/lib/Krawfish/Compile/Segment/Enrich/Snippet/Hit.pm
@@ -0,0 +1,54 @@
+package Krawfish::Compile::Segment::Enrich::Snippet::Hit;
+use Krawfish::Koral::Document::Subtoken;
+use Krawfish::Log;
+use strict;
+use warnings;
+
+use constant DEBUG => 0;
+
+sub new {
+ my $class = shift;
+ bless {
+ @_
+ }, $class;
+};
+
+sub content {
+ my ($self, $match, $forward) = @_;
+
+ if ($match->start != $forward->pos) {
+ warn 'The current position is not at the start position of the match';
+ return;
+ };
+
+ my @data;
+ my $length = $match->end - $match->start;
+ while ($length > 0) {
+
+ # Get the current token
+ my $current = $forward->current;
+
+ # Add token to text
+ push @data, Krawfish::Koral::Document::Subtoken->new_by_term_id(
+ $current->preceding_data,
+ $current->term_id
+ );
+
+ # Get the surface data
+ $length--;
+ $forward->next or last;
+ };
+
+ if (DEBUG) {
+ print_log('c_snippet', 'Add snippet match data: ' . join(',', @data));
+ };
+
+ return \@data;
+};
+
+sub to_string {
+ 'hit';
+};
+
+
+1;
diff --git a/lib/Krawfish/Compile/Segment/Enrich/Snippet/Spans.pm b/lib/Krawfish/Compile/Segment/Enrich/Snippet/Spans.pm
new file mode 100644
index 0000000..f7ad380
--- /dev/null
+++ b/lib/Krawfish/Compile/Segment/Enrich/Snippet/Spans.pm
@@ -0,0 +1,32 @@
+package Krawfish::Compile::Segment::Enrich::Snippet::Spans;
+use Krawfish::Log;
+use strict;
+use warnings;
+
+use constant DEBUG => 0;
+
+sub new {
+ my $class = shift;
+ bless {
+ elements => [],
+ text => ''
+ }, $class;
+};
+
+sub add_element {
+ my ($self, $element) = @_;
+ push @{$self->{elements}}, $element;
+};
+
+sub add_text {
+ my ($self, $text) = @_;
+ $self->{text} .= $text;
+ return $self;
+};
+
+sub to_html {
+ my $self = shift;
+ return $self->{text};
+};
+
+1;
diff --git a/lib/Krawfish/Compile/Segment/Enrich/SortCriterion.pm b/lib/Krawfish/Compile/Segment/Enrich/SortCriterion.pm
new file mode 100644
index 0000000..b388f0e
--- /dev/null
+++ b/lib/Krawfish/Compile/Segment/Enrich/SortCriterion.pm
@@ -0,0 +1,73 @@
+package Krawfish::Compile::Segment::Enrich::SortCriterion;
+use parent 'Krawfish::Compile';
+use warnings;
+use strict;
+
+# Enrich an item with sort criteria.
+# This is necessary to sort items beyond the segment.
+# The problem with this enrichment is,
+# that it needs to augment the sorted items after sorting,
+# so they are not in a proper order to go through
+# the fields lists (for example) to collect the field values
+# or through the forward index to collect term_ids (though
+# this may be a different API).
+#
+# A proper way to do this would be to go through the sorted
+# lists and create a new sorted list in doc order (or to somehow
+# keep match order) to make it possible to enrich with all
+# sorting criteria.
+#
+# 1. For Fields: Create a list of all docs to enrich in doc_id order
+# (Ignore duplicates)
+# 2. Prepare all requested fields in field order
+# 3. Go through all fields and collect values or term_ids
+# 4. Create criterion vectors per match based on these information
+#
+# But:
+# It may very well be possible to only enrich if required
+# on the node level.
+#
+# On the node level, the relevant criteria (top_k) will be inflated,
+# taken the ordering into account (which means following matches may
+# have a lot of criteria in common.
+
+
+sub new {
+ my $class = shift;
+ bless {
+ query => shift,
+
+ # Store all criteria in sorted order,
+ # which may include terms and fields.
+ # This will also keep the direction
+ # and possibly the collation.
+ criteria => shift
+ }, $class
+};
+
+sub _init {
+ my $self = shift;
+
+ return if $self->{init}++;
+
+ # TODO:
+ # Go through all criteria and collect required field IDs.
+ # Bring required field IDs in order.
+ # Create an array for field_id => criterion_position to
+ # map the surface term to the criterion after fetching.
+ # Remember the criterion position for optional term sorting.
+};
+
+
+sub current_match {
+ # TODO:
+ # Create an empty list for sorting criteria.
+ # a) Retrieve for the document id all the relevant fields
+ # if there are fields to retrieve.
+ # Add in the position of the criteria list.
+ # b) The surface term is already retrieved and enriched.
+ # Add in the position of the criteria list.
+};
+
+
+1;
diff --git a/lib/Krawfish/Compile/Segment/Enrich/TermFreq.pm b/lib/Krawfish/Compile/Segment/Enrich/TermFreq.pm
new file mode 100644
index 0000000..452a385
--- /dev/null
+++ b/lib/Krawfish/Compile/Segment/Enrich/TermFreq.pm
@@ -0,0 +1,2 @@
+# Enrich all matches (per doc) with frequencies of terms
+# in the specific document, e.g. number of token, number of sentences etc.
diff --git a/lib/Krawfish/Compile/Segment/Enrich/Terms.pm b/lib/Krawfish/Compile/Segment/Enrich/Terms.pm
new file mode 100644
index 0000000..a5299a0
--- /dev/null
+++ b/lib/Krawfish/Compile/Segment/Enrich/Terms.pm
@@ -0,0 +1,234 @@
+package Krawfish::Compile::Segment::Enrich::Terms;
+use Krawfish::Koral::Result::Enrich::Terms;
+use parent 'Krawfish::Compile';
+use Krawfish::Log;
+use strict;
+use warnings;
+
+# TODO:
+# Potentially rename to ::Terms! or ::Classes!
+
+# Enrich each match with all term ids for a specific region and
+# for a specific class
+
+# TODO:
+# Make this usable for Krawfish::Compile::Group::Classes
+# and Krawfish::Compile::Sort::Classes
+# by supporting $nrs instead of $nr
+#
+# TODO:
+# Classes may overlap, so subtokens should be cached/buffered
+# or rather memoized!
+# This also means it's necessary to retrieve term-ids without gaps
+# so the next match can retrieve the term ids overlapping
+#
+# TODO:
+# For a class there may be more than one start+end values.
+# For sorting, gaps should be recognized and marked
+# with a term_id=0.
+# For sorting it's important to remember that!
+
+use constant {
+ DEBUG => 0,
+ NR => 0,
+ START_POS => 1,
+ END_POS => 2,
+};
+
+
+sub new {
+ my $class = shift;
+ bless {
+ forward_obj => shift,
+ query => shift,
+ nrs => shift // [0],
+ match => undef
+ }, $class;
+};
+
+sub to_string {
+ my $self = shift;
+ 'terms(' . join(',', @{$self->{nrs}}) . ':'
+ . $self->{query}->to_string . ')'
+};
+
+
+sub _init {
+ my $self = shift;
+
+ return if $self->{init}++;
+
+ if (DEBUG) {
+ print_log(
+ 'r_terms',
+ 'Initiate pointer to forward index'
+ );
+ };
+
+ # The pointer can move backwards if necessary
+ $self->{pointer} = $self->{forward_obj}->pointer;
+};
+
+
+sub pointer {
+ $_[0]->{pointer};
+};
+
+
+
+# Next match
+sub next {
+ my $self = shift;
+ $self->{match} = undef;
+ return $self->{query}->next;
+};
+
+
+# Get the current match
+sub current_match {
+ my $self = shift;
+
+ $self->_init;
+
+ # Current match is already defined
+ if ($self->{match}) {
+
+ # Return match
+ return $self->{match};
+ };
+
+ # Get match based on current query position
+ my $match = $self->match_from_query;
+
+ if (DEBUG) {
+ print_log(
+ 'r_terms',
+ 'Get match from query'
+ );
+ };
+
+ # Get classes of the match
+ my @classes = $match->get_classes($self->{nrs});
+
+ # No classes found in match
+ return $match unless @classes;
+
+ # This only contains classes requested,
+ # but potentially multiple times
+
+ # First retrieve term ids
+ my $start = $classes[0]->[START_POS];
+ my $end = $classes[0]->[END_POS];
+ foreach (@classes[1 .. $#classes]) {
+ $start = $_->[START_POS] if $_->[START_POS] < $start;
+ $end = $_->[END_POS] if $_->[END_POS] > $end;
+ };
+
+ if (DEBUG) {
+ print_log(
+ 'r_terms',
+ "Retrieve subtokens for class position $start-$end"
+ );
+ };
+
+ # TODO:
+ # Instead of using pointer directly,
+ # this should use a forward buffer
+ # with a yet to be defined API
+
+ my $pointer = $self->pointer;
+
+ # Skip to current document
+ my @term_ids = ();
+
+ my $doc_id = $match->doc_id;
+ if ($pointer->skip_doc($doc_id) == $doc_id &&
+
+ # Skip to current position
+ $pointer->skip_pos($start)) {
+
+ if (DEBUG) {
+ print_log('r_terms', "Pointer is repositioned");
+ };
+
+
+ # Collect all relevant subtoken term ids
+ for (my $i = $start; $i < $end; $i++) {
+ # Add terms to list
+ my $current = $pointer->current or return $match;
+ push @term_ids, $current->term_id;
+
+ # Move to next subtoken
+ $pointer->next;
+ };
+ }
+
+ # Document not available
+ else {
+ # Nothing to add
+ return $match;
+ };
+
+ if (DEBUG) {
+ print_log(
+ 'r_terms',
+ 'Retrieved terms are ' . join(',', @term_ids)
+ );
+ };
+
+ # Add lists of term_ids
+ # Structure is
+ # {
+ # class1 => [id,id,0,id,id],
+ # class2 => [...]
+ # }
+ # WARNING:
+ # Gaps in classes are marked with 0!
+ my %term_id_per_class;
+ foreach my $class (@classes) {
+
+ if (DEBUG) {
+ print_log(
+ 'r_terms',
+ 'Add term ids for class ' . $class->[NR] .
+ ' with theoretical start at ' . $start
+ );
+ };
+
+ # Get the term vector of the class
+ my $term_ids = ($term_id_per_class{$class->[NR]} //= []);
+
+ # Foreach position, set the term_id
+ foreach my $pos ($class->[START_POS] .. ($class->[END_POS] - 1)) {
+
+ # Get the position without offset
+ $pos = $start - $pos;
+
+ # Copy the term id from the retrieved list
+ $term_ids->[$pos] = $term_ids[$pos];
+ };
+ };
+
+ # Because this may introduce zeros at the beginning,
+ # all classes need to be trimmed again
+ foreach my $class_nr (keys %term_id_per_class) {
+ while (defined $term_id_per_class{$class_nr}->[0] &&
+ $term_id_per_class{$class_nr}->[0] == 0) {
+ shift @{$term_id_per_class{$class_nr}};
+ };
+ };
+
+ # Add term id information per class
+ $match->add(
+ Krawfish::Koral::Result::Enrich::Terms->new(
+ \%term_id_per_class
+ )
+ );
+
+ # Set match
+ $self->{match} = $match;
+};
+
+
+
+1;
diff --git a/lib/Krawfish/Compile/Segment/EnrichGroup/Values.pm b/lib/Krawfish/Compile/Segment/EnrichGroup/Values.pm
new file mode 100644
index 0000000..f94461c
--- /dev/null
+++ b/lib/Krawfish/Compile/Segment/EnrichGroup/Values.pm
@@ -0,0 +1,4 @@
+# Add per group values from fields,
+# like in a group on documents add the min and max values
+# of a field, e.g. the date span, or the total number
+# of sentences in a corpus.
diff --git a/lib/Krawfish/Compile/Segment/Group/AnnotationClasses.pm b/lib/Krawfish/Compile/Segment/Group/AnnotationClasses.pm
new file mode 100644
index 0000000..bba8b9e
--- /dev/null
+++ b/lib/Krawfish/Compile/Segment/Group/AnnotationClasses.pm
@@ -0,0 +1,11 @@
+# This should make it possible to search for classes
+# and group based on the annotations at the certain range.
+# This, however, is probably quite tricky as
+# there is no simple position based forward index with
+# term_ids for annotations, meaning that this
+# has to check the annotations in the complete forward index,
+# probably making this unusable slow.
+# but who knows ...
+
+# A query like
+# group_by_annotation_classes(1,"opennlp","p","Der {1:[]} Mann")
diff --git a/lib/Krawfish/Compile/Segment/Group/Characters.pm b/lib/Krawfish/Compile/Segment/Group/Characters.pm
new file mode 100644
index 0000000..e5b1da7
--- /dev/null
+++ b/lib/Krawfish/Compile/Segment/Group/Characters.pm
@@ -0,0 +1,66 @@
+package Krawfish::Compile::Segment::Group::Character;
+use Krawfish::Log;
+use strict;
+use warnings;
+
+
+# This groups on prefix or suffixes of subterms.
+# Necessary to support "Ansicht nach Wortendungen" for example.
+# It's possible to first group on terms and then - per term,
+# request the term surface in the dictionary and group by
+# the result.
+
+
+use constant DEBUG => 0;
+
+sub new {
+ my $class = shift;
+ bless {
+ segments => shift, # Krawfish::Index::Segments object
+ # TODO: May as well be a subtoken object
+ from_start => shift, # boolean - otherwise from end
+ char_count => shift
+ nrs => [@_]
+ }, $class;
+};
+
+
+sub get_group {
+ my ($self, $match) = @_;
+
+ # Get all classes from the match
+ my @classes = $match->get_classes($self->{nrs});
+
+ my $segments = $self->{segments};
+
+ my %group;
+
+ # Classes have nr, start, end
+ foreach my $class (sort { $a->start <=> $b->start } @classes) {
+
+ if ($self->{from_start}) {
+
+ # This will retrieve the segment from the segments stream
+ my $segment = $stream->get($match->doc_id, $class->start);
+
+ if ($segment->)
+
+ # The character count can be satisfied by the
+ my $first_chars = $segment->first_chars;
+
+ if (length($first_chars) <= $self->{char_count} {
+ substr($first_chars);
+ }
+
+ # Check, if the segment only spans one segment
+ if ($class->end != $class->start+1) {
+
+ };
+ }
+ else {
+ ...
+ };
+ };
+};
+
+1;
diff --git a/lib/Krawfish/Compile/Segment/Group/ClassFrequencies.pm b/lib/Krawfish/Compile/Segment/Group/ClassFrequencies.pm
new file mode 100644
index 0000000..529267f
--- /dev/null
+++ b/lib/Krawfish/Compile/Segment/Group/ClassFrequencies.pm
@@ -0,0 +1,215 @@
+package Krawfish::Compile::Segment::Group::ClassFrequencies;
+use parent 'Krawfish::Compile';
+use Krawfish::Koral::Result::Group::ClassFrequencies;
+use Krawfish::Log;
+use strict;
+use warnings;
+
+use constant DEBUG => 0;
+
+# Aggregate by content information, for example,
+# based on a certain class
+#
+# TODO:
+# Currently this only works for surface term_ids, but it may very well collect
+# arbitrary annotations! In that case, multiple annotations and different
+# annotation lengths have to be taken into account.
+#
+# TODO:
+# The special case of class 0 needs to be treated.
+#
+# TODO:
+# Support virtual corpus classes
+
+sub new {
+ my $class = shift;
+ my $self = bless {
+ forward_obj => shift,
+ query => shift,
+ classes => shift,
+ class_freq => {},
+ term_cache => {},
+ last_doc_id => -1
+ }, $class;
+
+ $self->{groups} = Krawfish::Koral::Result::Group::ClassFrequencies->new(
+ $self->{classes}
+ );
+
+ return $self;
+};
+
+
+# Initialize forward counter
+sub _init {
+ return if $_[0]->{forward_pointer};
+
+ my $self = shift;
+
+ print_log('g_class_freq', 'Create forward pointer') if DEBUG;
+
+ # Load the ranked list - may be too large for memory!
+ $self->{forward_pointer} = $self->{forward_obj}->pointer;
+};
+
+
+# Shorthand for "search through"
+sub finalize {
+ while ($_[0]->next) {};
+ return $_[0];
+};
+
+
+# Move to next match
+sub next {
+ my $self = shift;
+
+ $self->_init;
+
+ # No more matches
+ return unless $self->{query}->next;
+
+ # Get the current posting
+ my $current = $self->{query}->current;
+
+ my $groups = $self->{groups};
+ my $pointer = $self->{forward_pointer};
+
+ # Get the current doc_id and move to it
+ my $doc_id = $current->doc_id;
+
+ # Current doc_id differ - move forward
+ if ($doc_id != $self->{last_doc_id}) {
+
+ # Skip to doc
+ if ($pointer->skip_doc($doc_id) != $doc_id) {
+
+ # This should never happen, as for all docs there is a
+ # forward index!
+ return;
+ };
+
+ if (DEBUG) {
+ print_log('g_class_freq', "Moved forward index to $doc_id");
+ };
+
+ # Remember the last document
+ $self->{last_doc_id} = $doc_id;
+ };
+
+ # Remember terms (for overlap)
+ # with the structure pos -> term_id
+ # my $cache = $self->{term_cache};
+
+ # Collect classes - have the structure
+ # $classes[1] = [...term_id,0,0,term_id,...]
+ my @classes = ();
+ foreach (@{$self->{classes}}) {
+ $classes[$_] = []; # TODO: Length is normally the length of the match
+ };
+ my @term_cache = ();
+
+ # Get offset to simplify classes
+ my $offset = $current->start;
+
+ # TODO:
+ # There may be overlaps within following
+ # matches, so under certain circumstances the pointer
+ # needs to move backwards.
+ # It may be enough to do that at the beginning here.
+ # Or instead of skip_pos, the forward pointer can
+ # reposition automatically.
+
+ # Get class payloads
+ my @class_infos = $current->get_classes($self->{classes});
+
+ # Retrieve the requested classes for the current posting
+ foreach my $class_info (@class_infos) {
+
+ # Check class info
+ my ($nr, $start, $end) = @{$class_info};
+
+ # Iterate over the class and collect term ids
+ for (my $i = $start; $i < $end; $i++) {
+
+ # The relative position of the class in the match
+ my $rel_pos = $i - $offset;
+
+ # Check if term already retrieved
+ if ($term_cache[$rel_pos]) {
+
+ # Copy to class
+ $classes[$nr]->[$rel_pos] = $term_cache[$rel_pos];
+ }
+
+ # TODO:
+ # Check for term cache
+ # if ($term_cache->{$start})
+
+ # Retrieve term id
+ else {
+
+ # Collect all terms from start to end
+ if ($pointer->skip_pos($i)) {
+
+ # Add term id to class in correct order
+ my $term_id = $pointer->current->term_id;
+
+ if (DEBUG) {
+ print_log(
+ 'g_class_freq',
+ "Term id at position $i is #" . $term_id);
+ };
+
+
+ # Set term_id at relative position in term_cache
+ $term_cache[$rel_pos] = $classes[$nr]->[$rel_pos] = $term_id;
+ };
+ }
+ };
+ };
+
+ if (DEBUG) {
+ print_log('g_class_freq', 'term cache is ' . join(',', @term_cache));
+ };
+
+ # The signature has the structure [class, term_id*]+
+ my @sig = ();
+
+ # Iterate over all classes
+ foreach my $nr (@{$self->{classes}}) {
+ push @sig, $nr;
+
+ foreach (@{$classes[$nr]}) {
+ push @sig, $_ if defined $_; # Add all set term_ids
+ };
+
+ push @sig, 0;
+ };
+
+ # Increment per match
+ $self->{groups}->incr_match(\@sig);
+
+ return 1;
+};
+
+
+sub current {
+ return $_[0]->{query}->current;
+};
+
+
+# Get collection
+sub collection {
+ $_[0]->{groups};
+};
+
+
+sub to_string {
+ my $self = shift;
+ my $str = 'gClassFreq(' . join(',', @{$self->{classes}}) .
+ ':' . $self->{query}->to_string . ')';
+ return $str;
+};
+
+1;
diff --git a/lib/Krawfish/Compile/Segment/Group/Classes.pm b/lib/Krawfish/Compile/Segment/Group/Classes.pm
new file mode 100644
index 0000000..af286ed
--- /dev/null
+++ b/lib/Krawfish/Compile/Segment/Group/Classes.pm
@@ -0,0 +1,121 @@
+package Krawfish::Compile::Segment::Group::Classes;
+use Krawfish::Log;
+use strict;
+use warnings;
+
+# TODO:
+# The name is somehow misleading, as this will only
+# group by surface terms.
+
+# TODO:
+# Refer to Krawfish::Compile::Segment::TermIDs!
+
+use constant {
+ DEBUG => 0,
+ NR => 0,
+ START_POS => 1,
+ END_POS => 2,
+};
+
+
+sub new {
+ my $class = shift;
+ bless {
+ segment => shift,
+ nrs => @_ ? [sort @_] : undef,
+ groups => {},
+ }, $class;
+};
+
+
+# Get the group signature for each match
+sub get_group {
+ my ($self, $match) = @_;
+
+ # Get all classes from the match
+ # Classes need to be sorted by start position
+ # to be retrievable, in case the subtokens-Stream
+ # is implemented as a postingslist (probably not)
+ my @classes = $match->get_classes_sorted($self->{nrs});
+
+ my $subtokens = $self->{segment}->subtokens;
+
+ my %class_group;
+
+ # Classes have nr, start, end
+ foreach my $class (@classes) {
+
+ # WARNING! CLASSES MAY OVERLAP SO SUBTOKENS SHOULD BE CACHED OR BUFFERED!
+
+ # Get start position
+ my $start = $class->[START_POS];
+
+ my @seq = ();
+
+ # Receive subtoken
+ my $subt = $subtokens->get($match->doc_id, $start);
+
+ # Push term id to subtoken
+ # TODO: A subtoken should have accessors
+ push (@seq, $subt->[2]);
+
+ while ($start < ($class->[END_POS] -1)) {
+ $subt = $subtokens->get($match->doc_id, ++$start);
+
+ # Push subterm id to subtoken
+ push (@seq, $subt->[2]);
+ };
+
+ # Class not yet set
+ unless ($class_group{$class->[NR]}) {
+ $class_group{$class->[NR]} = join('___', @seq);
+ }
+
+ # There is a gap in the class, potentially an overlap!
+ # TODO: Resolve overlaps!
+ # TODO: Mark gaps!
+ else {
+ $class_group{$class->[NR]} .= '___' . join('___', @seq);
+ };
+ };
+
+ my $string = '';
+ foreach (sort {$a <=> $b} keys %class_group) {
+ $string .= $_ .':' . $class_group{$_} . ';';
+ };
+
+ return $string;
+};
+
+
+# Return group info as hash
+sub to_hash {
+ my ($self, $signature, $doc_freq, $freq) = @_;
+
+ # TODO:
+ # This can't work!
+ # Get dictionary object to convert terms to term id
+ # my $dict = $self->{segment}->dict;
+
+ my %hash = ();
+ while ($signature =~ /\G(\d+):(.+?);/g) {
+
+ # if (DEBUG) {
+ # print_log('g_class', "Build class $1 for signature $2");
+ # };
+
+ $hash{"class_$1"} = [ split('___', $2)];
+ };
+ $hash{freq} = $freq if defined $freq;
+ $hash{doc_freq} = $doc_freq;
+ return \%hash;
+};
+
+
+sub to_string {
+ my $str = 'classes';
+ $str .= $_[0]->{nrs} ? '[' . join(',', @{$_[0]->{nrs}}) . ']' : '';
+ return $str;
+};
+
+1;
diff --git a/lib/Krawfish/Compile/Segment/Group/Fields.pm b/lib/Krawfish/Compile/Segment/Group/Fields.pm
new file mode 100644
index 0000000..75884b2
--- /dev/null
+++ b/lib/Krawfish/Compile/Segment/Group/Fields.pm
@@ -0,0 +1,175 @@
+package Krawfish::Compile::Segment::Group::Fields;
+use parent 'Krawfish::Compile';
+use Krawfish::Koral::Result::Group::Fields;
+use Krawfish::Log;
+use strict;
+use warnings;
+
+use constant DEBUG => 0;
+
+# This will group matches (especially document matches) by field
+# This is useful e.g. for document browsing per corpus.
+#
+# Because the grouping is based on ranking, the sorting will be trivial.
+#
+# TODO:
+# For some mechanisms, it is not necessary to count all occurrences,
+# e.g. to get all keywords used in a certain virtual corpus or all
+# used annotations.
+
+sub new {
+ my $class = shift;
+ my ($field_obj, $query, $fields) = @_;
+ my $self = bless {
+ field_obj => $field_obj,
+ query => $query,
+ field_keys => [map { ref($_) ? $_->term_id : $_ } @$fields],
+ last_doc_id => -1
+ }, $class;
+
+ # Initialize group object
+ $self->{groups} = Krawfish::Koral::Result::Group::Fields->new($self->{field_keys});
+
+ return $self;
+};
+
+
+# Initialize field pointer
+sub _init {
+ return if $_[0]->{field_pointer};
+
+ my $self = shift;
+
+ print_log('g_fields', 'Create pointer on fields') if DEBUG;
+
+ # Load the ranked list - may be too large for memory!
+ $self->{field_pointer} = $self->{field_obj}->pointer;
+};
+
+
+sub to_string {
+ my $self = shift;
+ my $str = 'gFields(' . join(',', map { '#' . $_ } @{$self->{field_keys}}) .
+ ':' . $self->{query}->to_string . ')';
+ return $str;
+};
+
+
+# Shorthand for "search through"
+sub finalize {
+ while ($_[0]->next) {};
+ return $_[0];
+};
+
+
+# Iterate to the next result
+sub next {
+ my $self = shift;
+
+ $self->_init;
+
+ my $groups = $self->{groups};
+ my $pointer = $self->{field_pointer};
+
+ # There is a next match
+ if ($self->{query}->next) {
+
+ # Get the current posting
+ my $current = $self->{query}->current;
+
+ if ($current->doc_id != $self->{last_doc_id}) {
+
+ # Flush old information
+ $groups->flush;
+
+ my $doc_id = $pointer->skip_doc($current->doc_id);
+
+ # There are no fields for this doc
+ next if $doc_id != $current->doc_id;
+
+ # Due to multivalued fields,
+ # a document can yield a permutation of
+ # patterns, so we recognize this
+ my @patterns = ();
+ my @field_keys = @{$self->{field_keys}};
+
+ # Ignore stored fields
+ my @field_objs = grep { $_->type ne 'store' } $pointer->fields(@field_keys);
+
+ my ($key_pos, $val_pos) = (0,0);
+
+ # Iterate through both lists and create a pattern
+ # Pattern may occur because fields can have multiple values
+ while ($key_pos < @field_keys) {
+
+ # There are no more values for the position
+ if (!$field_objs[$val_pos]) {
+ # Add ignorable null term
+ unless (@{$patterns[$key_pos]}) {
+ push @{$patterns[$key_pos]}, 0;
+ };
+ $key_pos++;
+ }
+
+ # Key identifier are matching
+ elsif ($field_keys[$key_pos] == $field_objs[$val_pos]->key_id) {
+
+ # Add key to pattern
+ $patterns[$key_pos] //= [];
+ push @{$patterns[$key_pos]}, $field_objs[$val_pos]->term_id;
+ $val_pos++;
+ }
+
+ # Forward key position
+ elsif ($field_keys[$key_pos] < $field_objs[$val_pos]->key_id) {
+
+ # Add ignorable null term
+ unless (@{$patterns[$key_pos]}) {
+ push @{$patterns[$key_pos]}, 0;
+ };
+ $key_pos++;
+ }
+
+ # $field_keys[$key_pos] > $field_objs[$val_pos]->key_id
+ else {
+
+ # I don't know if this can happen
+ $val_pos++;
+ };
+ };
+
+ # This adds
+ $groups->incr_doc(\@patterns);
+
+ # Set last doc to current doc
+ $self->{last_doc_id} = $current->doc_id;
+ };
+
+ # Add to frequencies
+ $groups->incr_match;
+
+ return 1;
+ };
+
+ # Flush cached results
+ $groups->flush;
+
+ return 0;
+};
+
+
+sub current {
+ return $_[0]->{query}->current;
+};
+
+
+# Get collection
+sub collection {
+ $_[0]->{groups};
+};
+
+
+1;
+
+
+__END__
diff --git a/lib/Krawfish/Compile/Segment/Group/Spans.pm b/lib/Krawfish/Compile/Segment/Group/Spans.pm
new file mode 100644
index 0000000..7085162
--- /dev/null
+++ b/lib/Krawfish/Compile/Segment/Group/Spans.pm
@@ -0,0 +1,59 @@
+package Krawfish::Compile::Segment::Group::Spans;
+use parent 'Krawfish::Compile';
+use Krawfish::Log;
+use strict;
+use warnings;
+
+# This may be generalizable, but for the moment
+# It should make it possible to group the span positions
+# of a query based on a nesting query.
+#
+# The idea is to make the following possible:
+# Search for a term in sentences (like "{1:contains(<s>, {2:'baum'})}") and
+# based on the position and length of 1 and 2,
+# a result like
+#
+# 0: 5
+# 1: 7
+# 100: 2
+#
+# can be returned, where each class 1 is sliced in
+# 100 pieces and for each piece there is a dot, in case
+# class 2 occurs in that slice.
+#
+# By doing that it's easy to visualize the position of expressions
+# in sentences or documents etc.
+#
+# For example to answer questions like 'where in documents does
+# the phrase "Herzlichen Dank" occur?'
+#
+# If the span spans more than 1 slice, the result can be
+#
+# 0_2: 1
+# 0_3: 4
+# 4: 6
+#
+# etc. In case the second class is not nested in the first
+# class, this is not counted at all (as this would result
+# in weird data regarding the slice sizes).
+
+sub new {
+ my $class = shift;
+ my %param = @_;
+ bless {
+ slices => $param{slices} // 100,
+ wrap_clas => $param{wrap_class} // 1,
+ embedded_class => $param{embedded_class} // 2
+ }, $class;
+};
+
+# Get the group signature for each match
+# May well be renamed to get_signature
+sub get_group {
+ my $self = shift;
+ my $slice_start = 0;
+ my $slice_end = 0;
+ return $slice_start . '_' . $slice_end;
+};
+
+1;
diff --git a/lib/Krawfish/Compile/Segment/Group/TermExistence.pm b/lib/Krawfish/Compile/Segment/Group/TermExistence.pm
new file mode 100644
index 0000000..009c6ed
--- /dev/null
+++ b/lib/Krawfish/Compile/Segment/Group/TermExistence.pm
@@ -0,0 +1,159 @@
+package Krawfish::Compile::Segment::Group::TermExistence;
+use parent 'Krawfish::Compile';
+use strict;
+use warnings;
+
+# The query works similar to Or-query, but only accepts term ids.
+
+sub new {
+ my $class = shift;
+ bless {
+ term_id => shift, # Term Query
+ term_ids => shift, # Optional TermExistence-Query
+ filter => undef,
+ existence => []
+ }, $class;
+};
+
+sub _init {
+ ...
+};
+
+
+# TODO:
+# Think about when next() is called, as it needs to be called on term_ids as well ...
+# Mabe this should be done in _init as a while query somehow.
+sub next {
+ my $self = shift;
+
+ # Get the current document in the VC
+ my $filter = $self->{filter};
+ my $doc_id = $filter->doc_id;
+
+ # The next document to look for in the VC
+ my $next_doc_id;
+
+
+ # Check the single term_id for existence
+
+ # The simple term does not exist
+ my $term = $self->{term_id};
+ if (!$term) {
+ # Do nothing
+ }
+
+ # Should never happen
+ elsif (!$term->current) {
+ $self->{term_id} = undef;
+ }
+
+ # Term exists and can be checked
+ else {
+
+ # Is the VC document beyond the current document id
+ if ($doc_id > $term->doc_id) {
+
+ # Move the term document to the VC document
+ $term->skip_doc($doc_id);
+ };
+
+ # Are both terms in the same document?
+ if ($term->doc_id == $doc_id) {
+
+ # Add this term to existence
+ $self->exists($term->term_id);
+
+ # Close posting
+ $term->close;
+
+ # Do not check any further
+ $self->{term_id} = undef;
+ }
+
+ # Current term document is beyond current VC doc
+ else {
+ $next_doc_id = $term->doc_id;
+ };
+ };
+
+
+ # Check the complex term_ids for existence
+
+ my $terms = $self->{term_ids};
+
+ if (!$terms) {
+ # Do nothing
+ }
+
+ # Should never happen
+ elsif (!$terms->current) {
+ $self->{term_ids} = undef;
+ }
+
+ else {
+
+ # When there is a complex query, move on
+ if ($doc_id > $terms->doc_id) {
+ $terms->skip_doc($doc_id);
+ };
+
+ # There are no further matches
+ unless ($terms->current) {
+
+ # Merge existence values
+ $self->exists($terms->existence);
+ $terms->close;
+ $self->{term_ids} = undef;
+ }
+
+ # Current terms are beyond current VC doc
+ else {
+
+ # Remember the next relevant document id
+ if (!$next_doc_id || $next_doc_id > $term->doc_id) {
+ $next_doc_id = $term->doc_id;
+ };
+ };
+ };
+
+ # There is a next document id defined - move on
+ if (defined $next_doc_id) {
+
+ # Move the VC stream to the next relevant position
+ if ($filter->skip_doc($next_doc_id)) {
+
+ # It's fine
+ return 1;
+ };
+ };
+
+ return 0;
+};
+
+
+# Add term ids to existence list
+sub exists {
+ my ($self, $term_id) = @_;
+
+ if (ref $term_id) {
+ push @{$terms->existence}, @$term_id;
+ }
+ else {
+ push @{$terms->existence}, $term_id;
+ };
+};
+
+
+# Return list of existing term ids
+sub existence {
+ return $self->{existence}
+};
+
+
+sub filter_by {
+ ...
+ # It is relevant to filter The query - but one filter may be enough
+};
+
+
+1;
diff --git a/lib/Krawfish/Compile/Segment/Sort.pm b/lib/Krawfish/Compile/Segment/Sort.pm
new file mode 100644
index 0000000..3c69732
--- /dev/null
+++ b/lib/Krawfish/Compile/Segment/Sort.pm
@@ -0,0 +1,377 @@
+package Krawfish::Compile::Segment::Sort;
+use parent 'Krawfish::Compile::Segment::Bundle';
+use Krawfish::Util::String qw/squote/;
+use Krawfish::Util::PriorityQueue::PerDoc;
+use Krawfish::Koral::Result::Match;
+use Krawfish::Posting::Bundle;
+use Krawfish::Log;
+use Data::Dumper;
+use strict;
+use warnings;
+
+# This is the general sorting implementation based on ranks.
+#
+# It establishes a top-k HeapSort approach and expects bundles
+# of matches to sort by a rank. It returns bundles.
+#
+# A given segment-wide $max_rank_ref can be used to ignore documents
+# during search in a sort filter.
+
+# TODO:
+# Currently this is limited to fields and works different to subterms.
+# So this may need to be renamed to Sort/ByField and Sort/ByFieldAfter.
+
+# TODO:
+# It's possible that fields return a rank of 0, indicating that
+# the field does not exist for the document.
+# They will always be sorted at the end.
+
+# TODO:
+# Ranks should respect the ranking mechanism of FieldsRan,
+# where only even values are fine and odd values need
+# to be sorted in a separate step (this is still open for discussion).
+
+# TODO:
+# It may be beneficial to have the binary heap space limited
+# and do a quickselect whenever the heap is full - to prevent full
+# sort, see
+# http://lemire.me/blog/2017/06/14/quickselect-versus-binary-heap-for-top-k-queries/
+# https://plus.google.com/+googleguava/posts/QMD74vZ5dxc
+# although
+# http://lemire.me/blog/2017/06/21/top-speed-for-top-k-queries/
+# says its irrelevant
+
+
+use constant {
+ DEBUG => 0,
+ RANK => 0,
+ SAME => 1,
+ VALUE => 2,
+ MATCHES => 3
+};
+
+
+# Constructor
+sub new {
+ my $class = shift;
+ my %param = @_;
+
+ if (DEBUG) {
+ print_log('sort', 'Initiate sort object');
+ };
+
+ # TODO:
+ # Check for mandatory parameters
+ #
+ my $query = $param{query};
+
+ unless ($query->isa('Krawfish::Compile::Segment::Bundle')) {
+ warn 'The query is no bundled query';
+ return;
+ };
+
+ my $segment = $param{segment};
+ my $top_k = $param{top_k};
+
+ # This is the sort criterion
+ my $sort = $param{sort};
+
+ # Set top_k if not yet set
+ # - to be honest, heap sort is probably not the
+ # best approach for a full search
+ $top_k //= $segment->max_rank;
+
+ # The maximum ranking value may be used
+ # by outside filters to know in advance,
+ # if a document can't be part of the result set
+ my $max_rank_ref;
+ if (defined $param{max_rank_ref}) {
+
+ # Get reference from definition
+ $max_rank_ref = $param{max_rank_ref};
+ }
+ else {
+
+ # Create a new reference
+ $max_rank_ref = \(my $max_rank = $segment->max_rank);
+ };
+
+ # Create initial priority queue
+ my $queue = Krawfish::Util::PriorityQueue::PerDoc->new(
+ $top_k,
+ $max_rank_ref
+ );
+
+ # Construct
+ return bless {
+ segment => $segment,
+ top_k => $top_k,
+ query => $query,
+ queue => $queue,
+ max_rank_ref => $max_rank_ref,
+
+ # TODO:
+ # Rename to criterion
+ sort => $sort,
+
+ buffer => undef,
+ pos_in_sort => 0,
+
+ # Default starts
+ stack => [], # All lists on a stack
+ sorted => [],
+ pos => 0 # Number of matches already served
+ }, $class;
+};
+
+sub clone {
+ my $self = shift;
+ __PACKAGE__->new(
+ query => $self->{query}->clone,
+ segment => $self->{segment},
+ top_k => $self->{top_k},
+ sort => $self->{sort},
+ max_rank_ref => $self->{max_rank_ref}
+ );
+};
+
+
+# Initialize the sorting - this will do a full run!
+sub _init {
+ my $self = shift;
+
+ # Result already initiated
+ return if $self->{init}++;
+
+ my $query = $self->{query};
+
+ if (DEBUG) {
+ print_log('sort', 'Initialize sort object');
+ };
+
+ # TODO:
+ # Make this work for terms as well!
+
+ # TODO:
+ # This currently only works for fields,
+ # because it bundles document ids.
+ # The prebundling of documents may be
+ # done in a separated step.
+
+ # Get maximum accepted rank from queue
+ my $max_rank_ref = $self->{max_rank_ref};
+ my $queue = $self->{queue};
+ my $sort = $self->{sort};
+
+ if (DEBUG) {
+ print_log('sort', qq!Next Rank on field #! . $sort->term_id);
+ };
+
+ # Store the last match buffered
+ my ($match, $rank);
+
+ # Init
+ $query->next_bundle;
+
+ # Pass through all queries
+ while ($match = $query->current_bundle) {
+
+ if (DEBUG) {
+ print_log('sort', 'Get next posting from ' . $query->to_string);
+ };
+
+ # Get stored rank
+ $rank = $sort->rank_for($match->doc_id);
+
+ if (DEBUG) {
+ print_log('sort', 'Rank for doc id ' . $match->doc_id . " is $rank");
+ print_log('sort', "Check rank $rank against max rank " . $$max_rank_ref);
+ };
+
+
+ # Precheck if the match is relevant
+ if ($rank > $$max_rank_ref) {
+
+ # Document is irrelevant
+ $match = undef;
+
+ if (DEBUG) {
+ print_log('sort', 'Move to next doc');
+ };
+
+ # Skip to next document
+ $query->next_doc;
+ CORE::next;
+ };
+
+ # Get current bundle
+ my $bundle = $query->current_bundle;
+
+ # Insert bundle into priority queue with length information
+ $queue->insert([$rank, 0, $bundle, $bundle->matches]) if $bundle;
+
+ if (DEBUG) {
+ print_log('sort', 'Move to next bundle');
+ };
+
+ # Move to next bundle
+ $query->next_bundle;
+ };
+
+ my $array = $queue->reverse_array;
+ if (DEBUG && 0) {
+ print_log('sort', 'Get list ranking of ' . Dumper($array));
+ };
+
+ # Get the rank reference (new);
+ $self->{buffer} = $array;
+};
+
+
+
+# Move to the next item in the bundled document list
+# and create the next bundle
+sub next_bundle {
+ my $self = shift;
+
+ # Initialize query - this will do a full run on the first field level!
+ $self->_init;
+
+ if ($self->{pos_in_sort} >= @{$self->{buffer}}) {
+ if (DEBUG) {
+ print_log('sort', 'No more elements in the priority array');
+ };
+
+ $self->{current_bundle} = undef;
+ return;
+ };
+
+ # Set current bundle
+ $self->{current_bundle} = $self->get_bundle_from_buffer;
+
+ # Remember the number of entries
+ $self->{pos} += $self->{current_bundle}->matches;
+ return 1;
+};
+
+
+# Get the top bundle from buffer
+sub get_bundle_from_buffer {
+ my $self = shift;
+
+ # Iterate over the next elements with identical ranks
+ my $pos = $self->{pos_in_sort};
+
+ # Get the top entry
+ my $top_bundle = $self->{buffer}->[$pos];
+
+ if (DEBUG) {
+ print_log('sort_after', "Move to next bundle at $pos, which is " .
+ $top_bundle->[VALUE]);
+ };
+
+ my $rank = $top_bundle->[RANK];
+
+ if (DEBUG) {
+ print_log('sort_after', "Create new bundle from prio at $pos starting with " .
+ $top_bundle->[VALUE]->to_string);
+ };
+
+ # Initiate new bundle
+ my $new_bundle = Krawfish::Posting::Bundle->new($top_bundle->[VALUE]);
+
+ $pos++;
+ for (; $pos < @{$self->{buffer}}; $pos++) {
+ $top_bundle = $self->{buffer}->[$pos];
+
+ if (DEBUG) {
+ print_log('sort', 'Check follow up from prio: ' . $top_bundle->[VALUE]->to_string);
+ };
+
+ # Add element to bundle
+ if ($rank == $top_bundle->[RANK]) {
+
+ if (DEBUG) {
+ print_log('sort', "Both items have the same rank $rank");
+ };
+
+ unless ($new_bundle->add($top_bundle->[VALUE])) {
+ warn 'Unable to add bundle to new bundle';
+ };
+ }
+
+ # Stop
+ else {
+ last;
+ };
+ };
+
+ # Get position
+ $self->{pos_in_sort} = $pos;
+
+ if (DEBUG) {
+ print_log('sort', 'Current bundle is now ' . $new_bundle->to_string);
+ };
+
+ return $new_bundle;
+};
+
+
+# Get the current match object
+sub current_match {
+ # my $self = shift;
+ # my $current = $self->current or return;
+ # my $match = Krawfish::Koral::Result::Match->new(
+ # doc_id => $current->doc_id,
+ # start => $current->start,
+ # end => $current->end,
+ # payload => $current->payload,
+ # );
+ #
+ # if (DEBUG) {
+ # print_log('sort', 'Current match is ' . $match->to_string);
+ # };
+ #
+ # return $match;
+ ...
+};
+
+
+# Return the current posting
+sub current {
+ my $self = shift;
+ if (DEBUG) {
+ print_log('sort', 'Current posting is ' . $self->{current}->to_string);
+ };
+
+ $self->{current};
+};
+
+
+sub to_string {
+ my $self = shift;
+ my $str = 'sort(';
+ $str .= $self->{sort}->to_string;
+ $str .= ',0-' . $self->{top_k} if $self->{top_k};
+ $str .= ':' . $self->{query}->to_string;
+ return $str . ')';
+};
+
+
+sub _string_array {
+ my $array = shift;
+ my $str = '';
+ foreach (@$array) {
+ $str .= '[';
+ $str .= 'R:' . $_->[RANK] . ';';
+ $str .= ($_->[SAME] ? 'S:' . $_->[SAME] . ';' : '');
+ $str .= ($_->[MATCHES] ? 'M:' . $_->[MATCHES] : '');
+ $str .= ']';
+ };
+ return $str;
+};
+
+
+1;
+
+__END__
diff --git a/lib/Krawfish/Compile/Segment/Sort/Alphabet.pm b/lib/Krawfish/Compile/Segment/Sort/Alphabet.pm
new file mode 100644
index 0000000..25d805f
--- /dev/null
+++ b/lib/Krawfish/Compile/Segment/Sort/Alphabet.pm
@@ -0,0 +1,58 @@
+package Krawfish::Compile::Segment::Sort::Alphabet;
+use Krawfish::Log;
+use strict;
+use warnings;
+
+warn 'NOT USED YET';
+
+use constant DEBUG => 0;
+
+# Sort result alphabetically.
+# Requires a class or the match - and based on the start + end, the term_ids are requested
+# and added per match as an array.
+# Then the matches are sorted one term_id position after the other.
+#
+# In case the ordering is reverse alphabetically, the term_id array is reversed as well.
+#
+# In case the term_id array has no equal length, the shorter array is preferred.
+#
+# EXAMPLE:
+# match1: [term_1, term_2, term_3]
+# match2: [term_1, term_2, term_3]
+#
+# This is necessary for all alphabetical sortings!
+
+
+# ---- old:
+# Sort by characters of a certain segment (either the first or the last).
+# This will require to open the offset file to get the first two characters
+# for bucket sorting per token and then request the
+# forward index (the offset is already liftet and may be stored in the buckets
+# as well) for fine grained sorting!
+
+# TODO:
+# This will need to pass sorting criteria as strings for cluster sorting.
+# At least for the top k matches.
+
+sub new {
+ my $class = shift;
+ my $self = bless {
+ query => shift,
+ index => shift,
+ # top_k => shift
+ }, $class;
+
+ my $dict = $self->dict;
+ my $subt = $self->subtokens;
+
+ while ($query->next) {
+ my $element = $query->current->clone;
+
+ # Get the subtoken info
+ my $x = $subt->get($element->doc_id, $element->start);
+
+ # Add element with id for sorting
+ push @record, [$element, $x->[2]]
+ };
+
+};
diff --git a/lib/Krawfish/Compile/Segment/Sort/Criterion/Rank.pm b/lib/Krawfish/Compile/Segment/Sort/Criterion/Rank.pm
new file mode 100644
index 0000000..ac6710d
--- /dev/null
+++ b/lib/Krawfish/Compile/Segment/Sort/Criterion/Rank.pm
@@ -0,0 +1,44 @@
+package Krawfish::Compile::Sort::Criterion::Field;
+use strict;
+use warnings;
+
+warn 'NOT USED YET';
+
+# TODO:
+# The same criterion for K::Result::Node::Field
+# will introduce field fetching etc.
+
+# Constructor
+sub new {
+ my $self = shift;
+ my ($index, $field, $desc) = @_;
+
+ bless {
+ field => $field,
+ desc => $desc,
+ ranking => $index->fields->ranked_by($field),
+ max => $self->{ranking}->max if $desc
+ }, $class;
+};
+
+
+# Get the rank of the match
+sub rank {
+ my ($self, $match) = @_;
+
+ # Get rank from match
+ my $rank = $self->{ranking}->get($match->doc_id);
+ return $self->{max} ? ($self->{max} - $rank) : $rank;
+};
+
+
+# Serialize to string
+sub to_string {
+ my $self = shift;
+ my $str = 'field=';
+ $str .= $self->{field};
+ $str .= $self->{desc} ? '>' : '<';
+ return $str;
+};
+
+1;
diff --git a/lib/Krawfish/Compile/Segment/Sort/Field.pm b/lib/Krawfish/Compile/Segment/Sort/Field.pm
new file mode 100644
index 0000000..ac441e4
--- /dev/null
+++ b/lib/Krawfish/Compile/Segment/Sort/Field.pm
@@ -0,0 +1,90 @@
+package Krawfish::Compile::Segment::Sort::Field;
+use Krawfish::Log;
+use strict;
+use warnings;
+
+use constant DEBUG => 0;
+
+# TODO:
+# Use this an instantiate it directly with
+# a ranking!
+
+# Sorting criterion for field ranks.
+
+# TODO:
+# Probably not only support ranks but all kinds of sorting
+# by having a get_lt() API that also works for strings!
+
+# TODO:
+# Return max rank for unknown fields!
+
+sub new {
+ my $class = shift;
+
+ my ($segment, $field_id, $descending) = @_;
+
+ # Get ranking
+ my $rank = $segment->field_ranks->by($field_id);
+
+ return unless $rank;
+
+ my $self = bless {
+ field_id => $field_id,
+ desc => $descending,
+ max_rank => $rank->max_rank
+ }, $class;
+
+ # Get fields in descending order
+ if ($descending) {
+
+ # This may be a real descending order file
+ # or a reversed single-valued ascending order file
+ $self->{rank} = $rank->descending or return;
+ }
+
+ # Get fields in ascending order
+ else {
+ $self->{rank} = $rank->ascending or return;
+ };
+
+ return $self;
+};
+
+sub type {
+ 'field';
+};
+
+
+# Get the rank for this criterion
+sub rank_for {
+ my ($self, $doc_id) = @_;
+
+ return $self->{rank}->rank_for($doc_id);
+
+ # Get rank if rank is littler than value
+ # my $value = shift;
+ # return $self->{rank}->rank_doc_lt($doc_id, $value);
+ # my $max = $ranking->max if $desc;
+};
+
+
+sub criterion {
+ $_[0]->{field_id};
+};
+
+sub max_rank {
+ $_[0]->{max_rank}
+}
+
+sub term_id {
+ $_[0]->{field_id};
+};
+
+# TODO:
+# This may need to be an inflatable
+sub to_string {
+ my $self = shift;
+ return 'field=#' . $self->{field_id} . ($_->{desc} ? '>' : '<')
+};
+
+1;
diff --git a/lib/Krawfish/Compile/Segment/Sort/Filter.pm b/lib/Krawfish/Compile/Segment/Sort/Filter.pm
new file mode 100644
index 0000000..9f723d9
--- /dev/null
+++ b/lib/Krawfish/Compile/Segment/Sort/Filter.pm
@@ -0,0 +1,122 @@
+package Krawfish::Compile::Segment::Sort::Filter;
+use parent 'Krawfish::Corpus';
+use Krawfish::Log;
+use strict;
+use warnings;
+
+warn 'NOT USED YET';
+
+use constant DEBUG => 0;
+
+# This is a corpus query implementation
+#
+# TODO: Better move to Krawfish::Corpus
+
+sub new {
+ my $class = shift;
+ my %param = @_;
+
+ my $ranking = $param{index}->fields->ranked_by($param{field});
+ my $max = $ranking->max if $param{desc};
+
+ bless {
+ query => $param{query},
+ max_rank_ref => $param{max_rank_ref},
+ field => $param{field},
+ desc => $param{desc},
+ ranking => $ranking,
+ max => $max,
+ init => 0
+ }, $class;
+};
+
+
+# Forward to next document
+sub next {
+ my $self = shift;
+
+ my $query = $self->{query};
+
+ # Get next document
+ while ($query->next) {
+
+ # Check object
+ return 1 if $self->_check;
+ };
+
+ # No next
+ return;
+};
+
+
+# Check the document id for the rank
+sub _check {
+ my $self = shift;
+
+ # Maximum rank reference
+ my $max_rank_ref = $self->{max_rank_ref};
+
+ # Get the current doc_id
+ my $query = $self->{query};
+ my $current = $query->current;
+ my $doc_id = $current->doc_id;
+
+ # Get rank for field
+ my $rank = $self->{ranking}->get($doc_id);
+
+ # Invert rank if descending field is required
+ $rank = $self->{max} - $rank if $self->{max};
+
+ if (DEBUG) {
+ print_log('vc_sort_filter', 'Current posting is ' . $current->to_string);
+ };
+
+ # Rank is smaller then required
+ if ($rank <= $$max_rank_ref) {
+
+ # Document is fine
+ $self->{current} = $current;
+ return 1;
+ };
+
+ if (DEBUG) {
+ print_log('vc_sort_filter', $current->to_string . ' is filtered out');
+ };
+
+ $self->{current} = undef;
+ return;
+};
+
+
+# Get current document
+sub current {
+ $_[0]->{current};
+};
+
+
+# Skip to the relevant document
+sub skip_doc {
+ my ($self, $doc_id) = @_;
+
+ my $query = $self->{query};
+
+ # Skip the document
+ if ($query->skip_doc($doc_id)) {
+
+ # Return the document id, if it matches
+ return $doc_id if $query->_check;
+
+ # Get the next matching element
+ if ($self->next) {
+
+ # return the document id
+ return $self->{current}->doc_id;
+ };
+ };
+
+ # Fail
+ return;
+};
+
+
+1;
diff --git a/lib/Krawfish/Compile/Segment/Sort/No.pm b/lib/Krawfish/Compile/Segment/Sort/No.pm
new file mode 100644
index 0000000..61f3b90
--- /dev/null
+++ b/lib/Krawfish/Compile/Segment/Sort/No.pm
@@ -0,0 +1,35 @@
+package Krawfish::Compile::Segment::Sort::No;
+use strict;
+use warnings;
+
+# This is a dummy sorting criterion that is
+# used for fields that are either not sortable
+# or for fields not in the dictionary.
+# As the first may be an indication for an error
+# in the index design,
+
+sub new {
+ my $class = shift;
+ return bless {
+ field => shift,
+ desc => shift
+ }, $class;
+};
+
+
+# Get the rank for this criterion
+sub rank_for {
+ return 0;
+};
+
+
+sub criterion {
+ $_[0]->{field};
+};
+
+sub to_string {
+ my $self = shift;
+ return 'field=' . $self->{field} . ($_->{desc} ? '>' : '<')
+};
+
+1;
diff --git a/lib/Krawfish/Compile/Segment/Sort/Priority.pm b/lib/Krawfish/Compile/Segment/Sort/Priority.pm
new file mode 100644
index 0000000..926676d
--- /dev/null
+++ b/lib/Krawfish/Compile/Segment/Sort/Priority.pm
@@ -0,0 +1,149 @@
+package Krawfish::Compile::Segment::Sort::Priority;
+use Krawfish::Util::PriorityQueue;
+use Krawfish::Log;
+use Data::Dumper;
+use strict;
+use warnings;
+
+# WARNING!
+# THIS IS DEPRECATED IN FAVOR OF Segment::Sort and Segment::SortAfter
+
+
+use constant DEBUG => 0;
+
+sub new {
+ my $class = shift;
+ my %param = @_;
+
+ my $query = $param{query};
+ my $fields = $param{fields};
+ my $field = $param{field};
+ my $desc = $param{desc} ? 1 : 0;
+ my $top_k = $param{top_k};
+
+ my $max_rank_ref = $param{max_rank_ref};
+
+ # Create priority queue
+ my $queue = Krawfish::Util::PrioritySort->new($top_k, $max_rank_ref);
+
+ return bless {
+ field_rank => $fields->ranked_by($field),
+ field => $field,
+ desc => $desc,
+ query => $query,
+ queue => $queue,
+ list => undef,
+ pos => -1
+ }, $class;
+};
+
+
+# Init queue
+sub _init {
+ my $self = shift;
+
+ return if $self->{init}++;
+
+ my $field_rank = $self->{field_rank};
+
+ my $max;
+ # Get maximum rank if descending order
+ if ($self->{desc}) {
+ $max = $field_rank->max;
+ };
+
+ my $query = $self->{query};
+ my $queue = $self->{queue};
+ my $last_doc_id = -1;
+ my $rank;
+
+ # Pass through all queries
+ while ($query->next) {
+
+ if (DEBUG) {
+ print_log('p_sort', 'Get next posting from ' . $query->to_string);
+ };
+
+ # Clone record
+ my $record = $query->current->clone;
+
+ # Fetch rank if doc_id changes
+ if ($record->doc_id != $last_doc_id) {
+
+ # Get stored rank
+ $rank = $field_rank->get($record->doc_id);
+
+ # Revert if maximum rank is set
+ $rank = $max - $rank if $max;
+ };
+
+ if (DEBUG) {
+ print_log('p_sort', 'Rank for doc id ' . $record->doc_id . " is $rank");
+ };
+
+ # Insert into priority queue
+ $queue->insert([$rank, 0, $record]);
+ };
+
+ # Get the rank reference
+ $self->{list} = $queue->reverse_array;
+ $self->{length} = $queue->length;
+};
+
+
+# Get next element from list
+sub next {
+ my $self = shift;
+ $self->_init;
+ if ($self->{pos}++ < $self->{length}) {
+ return 1;
+ };
+ return;
+};
+
+
+# Get current element
+sub current {
+ my $self = shift;
+
+ # 2 is the index of the value
+ if (DEBUG) {
+ print_log('p_sort', 'Get match from index ' . $self->{pos});
+ };
+
+ return $self->{list}->[$self->{pos}]->[2];
+};
+
+
+# Return the number of duplicates of the current match
+sub duplicate_rank {
+ my $self = shift;
+
+ if (DEBUG) {
+ print_log('p_sort', 'Check for duplicates from index ' . $self->{pos});
+ };
+
+ return $self->{list}->[$self->{pos}]->[1] || 1;
+};
+
+
+# This returns an additional data structure with key/value pairs
+# in sorted order to document the sort criteria.
+# Like: [[class_1 => 'cba'], [author => 'Goethe']]...
+# This is necessary for the cluster-merge-sort
+sub current_sort {
+ ...
+};
+
+
+sub to_string {
+ my $self = shift;
+ my $str = 'prioritySort(';
+ $str .= $self->{desc} ? '^' : 'v';
+ $str .= ',' . $self->{field} . ':';
+ $str .= $self->{query}->to_string;
+ return $str . ')';
+};
+
+
+1;
diff --git a/lib/Krawfish/Compile/Segment/Sort/PriorityCascade.pm b/lib/Krawfish/Compile/Segment/Sort/PriorityCascade.pm
new file mode 100644
index 0000000..58806e4
--- /dev/null
+++ b/lib/Krawfish/Compile/Segment/Sort/PriorityCascade.pm
@@ -0,0 +1,470 @@
+package Krawfish::Compile::Segment::Sort::PriorityCascade;
+use parent 'Krawfish::Compile';
+use Krawfish::Util::String qw/squote/;
+use Krawfish::Util::PriorityQueue::PerDoc;
+use Krawfish::Koral::Result;
+use Krawfish::Posting::Bundle;
+use Krawfish::Log;
+use Data::Dumper;
+use strict;
+use warnings;
+
+# WARNING!
+# THIS IS DEPRECATED IN FAVOR OF Segment::Sort and Segment::SortAfter
+
+# This is only based on criteria that return ranks
+
+use constant {
+ DEBUG => 0,
+ RANK => 0,
+ SAME => 1,
+ VALUE => 2,
+ MATCHES => 3
+};
+
+# TODO:
+# my $offset = $param{offset};
+# This may however not work in a multi-segment
+# or cluster scenario - so let's forget about it
+
+# TODO:
+# It's possible that fields return a rank of 0, indicating that
+# the field is not yet ranked.
+# In that case these fields have to be looked up, in case they are
+# potentially in the result set (meaning they are ranked before/after
+# the last accepted rank field). If so, they need to be remembered.
+# After a sort turn, the non-ranked fields are sorted in the ranked
+# fields. The field can be reranked any time.
+
+# TODO:
+# Ranks should respect the ranking mechanism of FieldsRank and
+# TermRank, where only even values are fine and odd values need
+# to be sorted in a separate step.
+
+sub new {
+ my $class = shift;
+ my %param = @_;
+
+ # TODO:
+ # Check for mandatory parameters
+ my $query = $param{query};
+
+ # This is the index element
+ my $index = $param{index};
+ my $top_k = $param{top_k};
+
+ # This is the fields element
+ # It has the structure [[field], [field, 1]]
+ # where the second value is the descending marker
+ my $fields = $param{fields};
+ # TODO: Change to criterion!
+
+ # For final field distinction, use unique field
+ push @$fields, [$param{unique}];
+
+ # The maximum ranking value may be used
+ # by outside filters to know in advance,
+ # if a document can't be part of the result set
+ my $max_rank_ref;
+ if (defined $param{max_rank_ref}) {
+
+ # Get reference from definition
+ $max_rank_ref = $param{max_rank_ref};
+ }
+ else {
+
+ # Create a new reference
+ $max_rank_ref = \(my $max_rank = $index->max_rank);
+ };
+
+ # Create initial priority queue
+ my $queue = Krawfish::Util::PriorityQueue::PerDoc->new(
+ $top_k,
+ $max_rank_ref
+ );
+
+ # Construct
+ return bless {
+ fields => $fields,
+ index => $index,
+ top_k => $top_k,
+ query => $query,
+ queue => $queue,
+ max_rank_ref => $max_rank_ref,
+ stack => [], # All lists on a stack
+ sorted => [],
+ pos => 0
+ }, $class;
+};
+
+
+# Initialize the sorting - this will do a full run!
+sub _init {
+ my $self = shift;
+
+ # Result already initiated
+ return if $self->{init}++;
+
+ my $query = $self->{query};
+
+ # Get first sorting criterion
+ my ($field, $desc) = @{$self->{fields}->[0]};
+
+ # Get ranking
+ my $ranking = $self->{index}->fields->ranked_by($field);
+
+ # Get maximum rank if descending order
+ my $max = $ranking->max if $desc;
+
+ # Get maximum accepted rank from queue
+ my $max_rank_ref = $self->{max_rank_ref};
+
+ my $last_doc_id = -1;
+ my $rank;
+ my $queue = $self->{queue};
+
+ # Store the last match buffered
+ my $match;
+
+ if (DEBUG) {
+ print_log('p_sort', qq!Next Rank on field "$field"!);
+ };
+
+ # Pass through all queries
+ while ($match || ($query->next && ($match = $query->current))) {
+
+ if (DEBUG) {
+ print_log('p_sort', 'Get next posting from ' . $query->to_string);
+ };
+
+ # Get stored rank
+ $rank = $ranking->get($match->doc_id);
+
+ # Revert if maximum rank is set
+ $rank = $max - $rank if $max;
+
+ if (DEBUG) {
+ print_log('p_sort', 'Rank for doc id ' . $match->doc_id . " is $rank");
+ };
+
+ # Precheck if the match is relevant
+ if ($rank <= $$max_rank_ref) {
+
+ # Create new bundle of matches
+ my $bundle = Krawfish::Posting::Bundle->new($match->clone);
+
+ # Remember doc_id
+ $last_doc_id = $match->doc_id;
+ $match = undef;
+
+ # Iterate over next queries
+ while ($query->next) {
+
+ # New match should join the bundle
+ if ($query->current->doc_id == $last_doc_id) {
+
+ # Add match to bundle
+ $bundle->add($query->current);
+ }
+
+ # New match is new
+ else {
+
+ # Remember match for the next tome
+ $match = $query->current;
+ last;
+ };
+ };
+
+ # Insert into priority queue
+ $queue->insert([$rank, 0, $bundle, $bundle->length]) if $bundle;
+ }
+
+ # Document is irrelevant
+ else {
+ $match = undef;
+ };
+ };
+
+ print_log('p_sort', 'Get list ranking') if DEBUG;
+
+ # Get the rank reference
+ $self->{stack} = [$queue->reverse_array];
+};
+
+
+# Move to the next item in the sorted list
+sub next {
+ my $self = shift;
+
+ if ($self->{pos}++ >= $self->{top_k}) {
+
+ if (DEBUG) {
+ print_log(
+ 'p_sort',
+ 'top_k ' . $self->{top_k} . ' is reached at position ' . $self->{pos}
+ );
+ };
+
+ $self->{current} = undef;
+ return;
+ };
+
+ # Initialize query - this will do a full run on the first field level!
+ $self->_init;
+
+ # There are sorted results in the result list
+ if (scalar @{$self->{sorted}}) {
+
+ # Make this current
+ $self->{current} = shift @{$self->{sorted}};
+
+ if (DEBUG) {
+ print_log(
+ 'p_sort',
+ 'There is already a match in [sorted]: ' . $self->{current}->to_string,
+ );
+ };
+
+ return 1;
+ }
+
+ # Nothing presorted
+ elsif (DEBUG) {
+ print_log('p_sort', 'There is no match in [sorted]');
+ };
+
+ # Get the list values
+ my $stack = $self->{stack};
+
+ # The result list is empty - sort next items
+ # if ($self->{presorted}) {
+ # };
+
+ # This will get the level from the stack
+ my $level = $#{$stack};
+
+ print_log('p_sort', "Check stack on current level $level") if DEBUG;
+
+ # If the current list is empty, remove from stack
+ while (scalar @$stack && (
+ !scalar(@{$stack->[$level]}) ||
+ !scalar(@{$stack->[$level]->[0]})
+ )) {
+
+ print_log('p_sort', "Stack is empty at least on level $level") if DEBUG;
+
+ pop @$stack;
+ $level--;
+
+ if (DEBUG) {
+ print_log('p_sort', "Stack is reduced to level $level with " . Dumper($stack));
+ };
+ };
+
+ # There is nothing to sort further
+ unless (scalar @$stack) {
+
+ print_log('p_sort', 'There is nothing to sort further') if DEBUG;
+
+ $self->{current} = undef;
+ return;
+ };
+
+ # while (my $same = $list->[0]->[SAME]) {
+ # $list = $self->heap_sort();
+ # };
+
+ # TODO:
+ # Depending on how many identical ranks exist,
+ # here the next strategy should be chosen.
+ # Either sort in place, or sort using heapsort again.
+
+
+ # The first item in the current list has multiple identical ranks
+ # As long as the first item in the list has duplicates,
+ # order by the next level
+ while ((my $same = ($stack->[$level]->[0]->[SAME] // 1)) > 1) {
+
+ if (DEBUG) {
+ print_log(
+ 'p_sort',
+ "Found $same matches at first node",
+ " on level $level in " . _string_array($stack->[$level])
+ );
+ };
+
+ # Get the identical elements from the list
+ my @presort = splice(@{$stack->[$level]}, 0, $same - 1);
+
+ print_log('p_sort', 'Presort array is ' . _string_array(\@presort)) if DEBUG;
+ # TODO: Push presort on the stack!
+
+ # This is the new top_k!
+ # TODO: Check if this is really correct!
+ my $top_k = $self->{top_k} - ($self->{pos} - 1);
+
+ # Get next field to rank on level
+ # level 0 is preinitialized, so it is one off
+ my ($field, $desc) = @{$self->{fields}->[$level + 1]};
+
+ if (DEBUG) {
+ print_log('p_sort', qq!Next Rank on field "$field"!);
+ };
+
+ $level++;
+
+ # TODO:
+ # If the same count is smaller than X (at least top_k - pos)
+ # do quicksort or something similar
+ # if ($same < $top_k || $same < 128) {
+ # }
+ # else
+ $stack->[$level] = $self->heap_sort($top_k, \@presort, $field, $desc);
+ # };
+
+ if (DEBUG) {
+ print_log(
+ 'p_sort',
+ "Sorted array",
+ " on new level $level is " . _string_array($stack->[$level])
+ );
+ };
+ };
+
+ # There are matches on the list without identical ranks
+
+ if (DEBUG) {
+ print_log('p_sort', "Stack with level $level is " . Dumper($stack));
+ };
+
+ # Get the top list entry
+ my $top = shift @{$stack->[$level]};
+
+ print_log('p_sort', 'Push value ' . $top->[VALUE]) if DEBUG;
+
+ # Push matches to result list
+ push @{$self->{sorted}}, $top->[VALUE]->unbundle;
+
+ # Make the first match the current
+ # TODO: Be aware! This is a BUNDLE!
+ $self->{current} = shift @{$self->{sorted}};
+ return 1;
+};
+
+
+sub _string_array {
+ my $array = shift;
+ my $str = '';
+ foreach (@$array) {
+ $str .= '[';
+ $str .= 'R:' . $_->[RANK] . ';';
+ $str .= ($_->[SAME] ? 'S:' . $_->[SAME] . ';' : '');
+ $str .= ($_->[MATCHES] ? 'M:' . $_->[MATCHES] : '');
+ $str .= ']';
+ };
+ return $str;
+};
+
+
+# Todo:
+# Accept an iterator, a ranking, and return an iterator
+sub heap_sort {
+ my ($self, $top_k, $sub_list, $field, $desc) = @_;
+
+ if (DEBUG) {
+ print_log('p_sort', 'Heapsort list of length ' . scalar(@$sub_list) .
+ qq! by field "$field" for top_k = $top_k!);
+ };
+
+ my $index = $self->{index};
+ my $ranking = $index->fields->ranked_by($field);
+
+ # Get maximum rank if descending order
+ my $max = $ranking->max if $desc;
+
+ # Get maximum rank
+ my $max_rank = $index->max_rank;
+ my $max_rank_ref = \$max_rank;
+
+ # Create new priority queue
+ my $queue = Krawfish::Util::PriorityQueue::PerDoc->new(
+ $top_k,
+ $max_rank_ref
+ );
+
+ my $rank;
+
+ # Iterate over list
+ foreach (@$sub_list) {
+ my $bundle = $_->[VALUE];
+
+ # Get stored rank
+ $rank = $ranking->get($bundle->doc_id);
+
+ # Revert if maximum rank is set
+ $rank = $max - $rank if $max;
+
+ # Insert into queue
+ $queue->insert([$rank, 0, $bundle, $bundle->length]);
+ };
+
+ # Return reverse list
+ return $queue->reverse_array;
+};
+
+
+# Return the current match
+sub current {
+
+ if (DEBUG) {
+ print_log('p_sort', 'Current posting is ' . $_[0]->{current}->to_string);
+ };
+
+ $_[0]->{current};
+};
+
+sub current_match {
+ my $self = shift;
+ my $current = $self->current or return;
+ my $match = Krawfish::Koral::Result::Match->new(
+ doc_id => $current->doc_id,
+ start => $current->start,
+ end => $current->end,
+ payload => $current->payload,
+ );
+
+ if (DEBUG) {
+ print_log('p_sort', 'Current match is ' . $match->to_string);
+ };
+
+ return $match;
+};
+
+# Return the number of duplicates of the current match
+sub duplicate_rank {
+ my $self = shift;
+
+ if (DEBUG) {
+ print_log('p_sort', 'Check for duplicates from index ' . $self->{pos});
+ };
+
+ return $self->{list}->[$self->{pos}]->[1] || 1;
+};
+
+
+sub to_string {
+ my $self = shift;
+ my $str = 'resultSorted([';
+ $str .= join(',', map { squote($_->[0]) . ($_->[1] ? '>' : '<') } @{$self->{fields}});
+ $str .= ']';
+ $str .= ',0-' . $self->{top_k} if $self->{top_k};
+ $str .= ':' . $self->{query}->to_string;
+ return $str . ')';
+};
+
+
+1;
+
+__END__
+
diff --git a/lib/Krawfish/Compile/Segment/Sort/Random.pm b/lib/Krawfish/Compile/Segment/Sort/Random.pm
new file mode 100644
index 0000000..038ca8d
--- /dev/null
+++ b/lib/Krawfish/Compile/Segment/Sort/Random.pm
@@ -0,0 +1,5 @@
+# Return all elements in random order
+# This may, however, not be necessary having Sample
+
+# https://en.wikipedia.org/wiki/Fisher%E2%80%93Yates_shuffle
+# https://lemire.me/blog/2017/09/26/benchmarking-algorithms-to-visit-all-values-in-an-array-in-random-order/
diff --git a/lib/Krawfish/Compile/Segment/Sort/Sample.pm b/lib/Krawfish/Compile/Segment/Sort/Sample.pm
new file mode 100644
index 0000000..674527e
--- /dev/null
+++ b/lib/Krawfish/Compile/Segment/Sort/Sample.pm
@@ -0,0 +1,160 @@
+package Krawfish::Compile::Segment::Sort::Sample;
+use Krawfish::Koral::Result::Match;
+use Krawfish::Log;
+use strict;
+use warnings;
+
+# Sort all matches in random order and only return the top_k
+# results. This is implemented using reservoir sampling.
+# Difference to random sorting is, this won't randomly sort all
+# results, making paging possible.
+
+# When the number of matches is known in advance, another
+# approach may be valid.
+
+# WARNING:
+# Sorting does not respect current_match of any nested query,
+# that's why sorting is always separated from enriching!
+
+# See
+# https://en.wikipedia.org/wiki/Reservoir_sampling
+# https://webkist.wordpress.com/2008/10/01/reservoir-sampling-in-perl/
+# https://blogs.msdn.microsoft.com/spt/2008/02/05/reservoir-sampling/
+# A. Anagnostopoulos, A. Z. Broder, and D. Carmel. Sampling search-engine results. In Proc. of the Fourteenth International World Wide Web Conference, Chiba, Japan, 2005. ACM Press.
+
+
+use constant DEBUG => 1;
+
+# Create a sample sort of k elements in the list
+sub new {
+ my $class = shift;
+ bless {
+ query => shift,
+ n => shift, # Size of the sample
+ k => 0, # Items already seen
+ reservoir => [],
+ current => undef
+ }, $class;
+};
+
+
+sub max_freq {
+ my $self = shift;
+ my $n = $self->{query}->max_freq;
+ $n = $n < $self->{n} ? $n : $self->{n};
+ return $n;
+};
+
+# Initialize reservoir
+sub _init {
+ my $self = shift;
+
+ return if $self->{k};
+
+ if (DEBUG) {
+ print_log('r_s_sample', 'Initialize sampling, meaning iterate over all items');
+ };
+
+ while ($self->{query}->next) {
+
+ # Seen next item
+ $self->{k}++;
+
+ if (DEBUG) {
+ print_log('r_s_sample', 'Found item ' . $self->{k});
+ };
+
+ # The reservoir is not filled up yet
+ if ($self->{k} <= $self->{n}) {
+
+ if (DEBUG) {
+ print_log('r_s_sample', 'Add item ' . $self->{k} . ' to reservoir');
+ };
+
+ # Add current match to reservoir
+ my $current = $self->{query}->current;
+ push @{$self->{reservoir}}, $current;
+ }
+
+ # Check if the item should replace another item in the reservoir
+ elsif (rand(1) <= ($self->{n}/$self->{k})) {
+
+ my $item = int(rand($self->{n}));
+
+ if (DEBUG) {
+ print_log('r_s_sample', $self->{n} . ' == ' . scalar @{$self->{reservoir}});
+ print_log('r_s_sample', "Overwrite item $item with item " . $self->{k});
+ };
+
+ # Replace random match in reservoir
+ my $current = $self->{query}->current;
+
+ # TODO:
+ # Check if $self->{n} is here equivalent to scalar @{$self->{reservoir}}
+ $self->{reservoir}->[$item] = $current;
+ }
+
+ elsif (DEBUG) {
+ print_log('r_s_sample', 'Ignore item ' . $self->{k});
+ };
+ };
+
+ return;
+};
+
+
+# Move to next item
+sub next {
+ my $self = shift;
+
+ # Fill reservoir
+ $self->_init;
+
+ # Get match from reservoir
+ my $current = shift @{$self->{reservoir}};
+
+ # There is no more match in reservoir
+ unless ($current) {
+ $self->{current} = undef;
+ return;
+ };
+
+ # Set current match
+ $self->{current} = $current;
+ return 1;
+};
+
+
+sub current {
+ $_[0]->{current};
+};
+
+
+sub match_from_query {
+ ...
+};
+
+
+sub current_match {
+ my $self = shift;
+ my $current = $self->current or return;
+ my $match = Krawfish::Koral::Result::Match->new(
+ doc_id => $current->doc_id,
+ start => $current->start,
+ end => $current->end,
+ payload => $current->payload,
+ );
+
+ if (DEBUG) {
+ print_log('sort_sample', 'Current match is ' . $match->to_string);
+ };
+
+ return $match;
+};
+
+sub to_string {
+ 'sample(' . $_[0]->{n} . ':' . $_[0]->{query}->to_string . ')';
+};
+
+
+1;
diff --git a/lib/Krawfish/Compile/Segment/Sort/Simple.pm b/lib/Krawfish/Compile/Segment/Sort/Simple.pm
new file mode 100644
index 0000000..5f438d2
--- /dev/null
+++ b/lib/Krawfish/Compile/Segment/Sort/Simple.pm
@@ -0,0 +1,85 @@
+package Krawfish::Compile::Segment::Sort::Simple;
+use Krawfish::Log;
+use strict;
+use warnings;
+
+warn 'NOT USED YET';
+
+# This should be used fur subsequent
+# sorting following the first pass
+
+# May use insertion sort for small numbers
+# of duplicates.
+
+# This may very well be a PrioritySort,
+# so initially there is a very simple
+# querier that only add rank and same elements
+# and subsequential they are ranked
+
+sub new {
+ my $class = shift;
+ my %param = @_;
+
+ my $query = $param{query};
+ my $fields = $param{fields};
+ my $field = $param{field};
+ my $desc = $param{desc} ? 1 : 0;
+
+ my $top_k = $param{top_k};
+
+ return bless {
+ field_rank => $fields->ranked_by($field),
+ field => $field,
+ desc => $desc,
+ query => $query,
+ queue => $queue,
+ list => undef,
+ pos => -1
+ }, $class;
+};
+
+sub next {
+
+ # TODO:
+ # In case the sorting before
+ # results in a very bad configuration
+ # (lots of duplicates in the final pos),
+ # choose a different strategy!
+
+ my $field_rank = $self->{field_rank};
+
+ my $max;
+ # Get maximum rank if descending order
+ if ($self->{desc}) {
+ $max = $field_rank->max;
+ };
+
+ my $query = $self->{query};
+
+ while ($query->next) {
+ if (DEBUG) {
+ print_log('s_sort', 'Get next posting from ' . $query->to_string);
+ };
+
+ # The rank is totally fine
+ if ($query->duplicate_rank == 1) {
+ $self->{pos} = 0;
+ $self->{list} = [$query->current];
+ return 1;
+ }
+
+ # The rank has many duplicates
+ else {
+
+ # Sort elements!
+ my $elements = $query->duplicate_rank;
+ for (1..$elements) {
+ $query->next;
+
+# # Clone record
+# my $record = $query->current->clone;
+
+ };
+ };
+ };
+};
diff --git a/lib/Krawfish/Compile/Segment/Sort/SubTerm.pm b/lib/Krawfish/Compile/Segment/Sort/SubTerm.pm
new file mode 100644
index 0000000..dc79207
--- /dev/null
+++ b/lib/Krawfish/Compile/Segment/Sort/SubTerm.pm
@@ -0,0 +1,65 @@
+package Krawfish::Compile::Segment::Sort::SubTerm;
+use strict;
+use warnings;
+
+# This will sort based on a pre-ranked subterm
+# or rather a subterm list for a class
+#
+# A given node-wide vector_ref can be used to limit
+# the list of terms to check.
+#
+# As classes are in order, a sortafter on subterms
+# for further classes are only relevant in case
+# there are matches with identical ranks on this class.
+
+sub new {
+ my $class = shift;
+
+ # TODO:
+ # Possibly remember the collation
+ my $self = bless {
+ index => shift,
+ suffix => shift // 0,
+ descending => shift // 0,
+ class => shift // 0,
+ max_rank_vector_ref => shift // []
+ }, $class;
+
+ # Get ranking
+ $self->{dict} = $self->{index}->dictionary or return;
+
+ # Get maximum rank if descending order
+ $self->{max} = $self->{ranks}->max if $self->{descending};
+
+ return $self;
+};
+
+
+# Check for the rank of the match if it is smaller
+# than the given rank.
+sub rank_lt {
+ my ($self, $match) = shift;
+
+ # TODO:
+ # For the requested class(es),
+ # retrieve the subterm_ids.
+ # This is similar to Enrich::Snippet retrieval,
+ # as classes may have overlaps.
+ # go through all terms in either left-to-right (prefix)
+ # or right-to-left (suffix) order and rank as long as
+ # the terms are littler than the rank vector
+
+ my $rank;
+ if ($self->{suffix}) {
+ $rank = $self->{dict}->suffix_rank_by_subterm_id($subterm_id);
+ }
+ else {
+ $rank = $self->{dict}->prefix_rank_by_subterm_id($subterm_id);
+ };
+
+ # Revert if maximum rank is set
+ return $max ? $max - $rank : $rank;
+};
+
+
+1;
diff --git a/lib/Krawfish/Compile/Segment/Sort/Substring.pm b/lib/Krawfish/Compile/Segment/Sort/Substring.pm
new file mode 100644
index 0000000..c20e454
--- /dev/null
+++ b/lib/Krawfish/Compile/Segment/Sort/Substring.pm
@@ -0,0 +1,36 @@
+package Krawfish::Compile::Segment::Sort::Substring;
+use Krawfish::Log;
+use strict;
+use warnings;
+
+warn 'NOT USED YET';
+
+# To support C2-Wort-Type sorting based on word endings,
+# It's necessary to support sorting based on substrings.
+#
+# EXAMPLE:
+# Sort based on the last two characters of a word in the
+# correct order: substring(-2,2)
+#
+# match1: D[er] al[te] Ma[nn]
+# match2: D[er] gu[te] Schäf[er]
+#
+# This requires that all terms of a class are fetched from
+# the dictionary (or at least X characters).
+# Equal sequences will receive the same rank and can then be
+# sorted alphabetically in the next run.
+# Equal may mean that this is case insensitive.
+
+sub new {
+ my $class = shift;
+ bless {
+ offset => shift, # Supports negative offset
+ length => shift,
+ reverse => shift, # The substring is read from right to left
+ caseinsensitive => shift,
+ resolve_diacritiques => shift
+ }, $class;
+};
+
+
+1;
diff --git a/lib/Krawfish/Compile/Segment/SortAfter.pm b/lib/Krawfish/Compile/Segment/SortAfter.pm
new file mode 100644
index 0000000..d1f3642
--- /dev/null
+++ b/lib/Krawfish/Compile/Segment/SortAfter.pm
@@ -0,0 +1,196 @@
+package Krawfish::Compile::Segment::SortAfter;
+use parent 'Krawfish::Compile::Segment::Sort';
+use Data::Dumper;
+use Krawfish::Log;
+use strict;
+use warnings;
+
+# This sorting query is similar to
+# Krawfish::Compile::Segment::Sort,
+# But it already expects sorted, bundled postings,
+# does not support $max_rank_ref
+# (because all matches are already retrieved),
+# and immediately stops, when top_k is reached.
+#
+# That also means, this does all the work in next_bundle()
+# instead of init().
+
+
+use constant {
+ DEBUG => 0,
+ RANK => 0,
+ SAME => 1,
+ VALUE => 2,
+ MATCHES => 3
+};
+
+
+# Constructor
+sub new {
+ my $class = shift;
+ my %param = @_;
+
+ my $query = $param{query};
+ my $segment = $param{segment};
+ my $top_k = $param{top_k};
+
+ # This is the sort criterion
+ my $sort = $param{sort};
+
+ $top_k //= $segment->max_rank;
+
+ if (DEBUG) {
+ print_log('sort_after', 'Initiate follow up sort');
+ };
+
+ bless {
+ query => $query,
+ segment => $segment,
+ top_k => $top_k,
+ sort => $sort,
+ max_rank => $segment->max_rank,
+ pos_in_sort => 0, # Current position in sorted heap
+ pos => 0 # Number of (bundled) matches already served
+ }, $class;
+};
+
+
+# Move to next bundle
+sub next_bundle {
+ my $self = shift;
+
+ if (DEBUG) {
+ print_log('sort_after', 'Move to next bundle');
+ };
+
+ $_[0]->{current_bundle} = undef;
+
+ # Already served enough
+ if ($self->{pos} > $self->{top_k}) {
+ return;
+ }
+
+ # There are sorted bundles on the buffer
+ if ($self->{buffer}) {
+
+ # The buffer is not exceeded yet
+ if ($self->{pos_in_sort} < @{$self->{buffer}}) {
+
+ $self->{current_bundle} = $self->get_bundle_from_buffer;
+
+ # Get the number of matches in the bundle
+ $self->{pos} += $self->{current_bundle}->matches;
+
+ # Fine
+ return 1;
+ };
+
+ # Buffer is exceeded - reset
+ $self->{buffer} = undef;
+ $self->{pos_in_sort} = 0;
+ };
+
+ # Get a new bundle from the nested query
+ unless ($self->{query}->next_bundle) {
+ return;
+ };
+
+ if (DEBUG) {
+ print_log('Get next bundle from ' . $self->{query}->to_string);
+ };
+
+ my $next_bundle = $self->{query}->current_bundle;
+
+ # Next bundle is already sorted
+ if ($next_bundle->size == 1) {
+
+ # Do nothing
+ $self->{current_bundle} = $next_bundle;
+ return 1;
+ };
+
+ # Sort next bundle
+
+ # This should probably check for a simpler sorting
+ # algorithm for small data sets
+ my $rank;
+ my $sort = $self->{sort};
+ my $max_rank_ref = \(my $max_rank = $self->{max_rank});
+
+ if (DEBUG) {
+ print_log('sort_after', 'Sort nested bundle');
+ };
+
+ # Create initial priority queue
+ my $queue = Krawfish::Util::PriorityQueue::PerDoc->new(
+ $self->{top_k} - $self->{pos},
+ $max_rank_ref
+ );
+
+ # Unbundle bundle and go through matches
+ for (my $i = 0; $i < $next_bundle->size; $i++) {
+
+ # Get item from list
+ my $posting = $next_bundle->item($i);
+
+ if (DEBUG) {
+ print_log('sort_after', 'Get next posting from ' . $self->{query}->to_string);
+ };
+
+ # Get stored rank
+ $rank = $sort->rank_for($posting->doc_id);
+
+ # Checking for $$max_rank_ref is not useful here,
+ # as the bundles are already bundled and skipping bundles
+ # using next_doc() and preview_doc_id() is not beneficial.
+
+ $queue->insert([$rank, 0, $posting, $posting->matches]);
+ };
+
+ # Get the sorted array (which has still the ranking structure etc.)
+ my $array = $queue->reverse_array;
+
+ print_log('sort_after', 'Get list ranking of ' . Dumper($array)) if DEBUG;
+
+ if (DEBUG) {
+ print_log(
+ 'sort_after',
+ 'New current bundle is ' . $self->{current_bundle}->to_string
+ );
+ };
+
+ # Store the sorted bundle in the buffer
+ $self->{buffer} = $array;
+
+ # Set current bundle
+ $self->{current_bundle} = $self->get_bundle_from_buffer;
+
+ # Remember the number of entries
+ $self->{pos} += $self->{current_bundle}->matches;
+ return 1;
+};
+
+
+# Clone query
+sub clone {
+ my $self = shift;
+ __PACKAGE__->new(
+ query => $self->{query}->clone,
+ segment => $self->{segment},
+ top_k => $self->{top_k},
+ sort => $self->{sort}
+ );
+};
+
+
+# Stringification
+sub to_string {
+ my $self = shift;
+ my $str = 'sort(';
+ $str .= $self->{sort}->to_string;
+ $str .= ',0-' . $self->{top_k} if $self->{top_k};
+ $str .= ':' . $self->{query}->to_string;
+ return $str . ')';
+};
+
+1;