Cleanup for publication
Change-Id: Ibc49a45239cfedf61ffc223713eb90ab205f771e
diff --git a/lib/Krawfish/Compile.pm b/lib/Krawfish/Compile.pm
index 7848cc5..705b46b 100644
--- a/lib/Krawfish/Compile.pm
+++ b/lib/Krawfish/Compile.pm
@@ -5,6 +5,9 @@
use strict;
use warnings;
+# Krawfish::Compile is the base class for all Compile queries.
+
+
use constant DEBUG => 0;
sub current_match {
diff --git a/lib/Krawfish/Compile/Cluster/Limit.pm b/lib/Krawfish/Compile/Cluster/Limit.pm
index c87a1bd..ca51453 100644
--- a/lib/Krawfish/Compile/Cluster/Limit.pm
+++ b/lib/Krawfish/Compile/Cluster/Limit.pm
@@ -47,6 +47,7 @@
$_[0]->{query}->current;
};
+
# May return a hash reference with information
sub current_group {
...
diff --git a/lib/Krawfish/Compile/Node.pm b/lib/Krawfish/Compile/Node.pm
index f913ecb..4a1adf3 100644
--- a/lib/Krawfish/Compile/Node.pm
+++ b/lib/Krawfish/Compile/Node.pm
@@ -30,6 +30,7 @@
return $self;
};
+
# Get or set rank reference value
# This is useful for sorting coordination between processes
sub max_rank_reference {
diff --git a/lib/Krawfish/Compile/Node/Aggregate.pm b/lib/Krawfish/Compile/Node/Aggregate.pm
index 57f73e7..cc6ffe9 100644
--- a/lib/Krawfish/Compile/Node/Aggregate.pm
+++ b/lib/Krawfish/Compile/Node/Aggregate.pm
@@ -33,7 +33,8 @@
};
-# This will read all header information from the nodes and aggregate the date
+# This will read all header information from the nodes
+# and aggregate the data
sub process_head {
my ($self, $head) = @_;
diff --git a/lib/Krawfish/Compile/Node/Enrich/Fields.pm b/lib/Krawfish/Compile/Node/Enrich/Fields.pm
index e50b438..b2f53f0 100644
--- a/lib/Krawfish/Compile/Node/Enrich/Fields.pm
+++ b/lib/Krawfish/Compile/Node/Enrich/Fields.pm
@@ -4,10 +4,11 @@
use strict;
use warnings;
-# Koral::Node::Fields does actually nothing. It's just a wrapper
-# However - it may very well - like snippets - first collect matches and
-# then resend request to the cluster for more information,
-# like
+# Koral::Node::Fields does actually nothing.
+# It's just a wrapper.
+# However - it may very well - like snippets - first
+# collect matches and then resend request to the
+# cluster for more information.
# TODO:
# Fields should be part of the snippet generation mechanism!
@@ -23,6 +24,8 @@
}, $class;
};
+
+# Stringification
sub to_string {
my $self = shift;
return 'fields(' . join(',', map { $_->to_string } @{$self->{fields}}) .
@@ -30,6 +33,7 @@
};
+# Move to next posting
sub next {
$_[0]->{query}->next;
};
diff --git a/lib/Krawfish/Compile/Node/Sort.pm b/lib/Krawfish/Compile/Node/Sort.pm
index 1fbf950..1ddc706 100644
--- a/lib/Krawfish/Compile/Node/Sort.pm
+++ b/lib/Krawfish/Compile/Node/Sort.pm
@@ -9,7 +9,8 @@
# mergesort, but for the moment, it's way simpler.
# TODO:
-# May need to return Krawfish::Posting::Sorted with a 'criterion' array.
+# May need to return Krawfish::Posting::Sorted
+# with a 'criterion' array.
# Instead of next() followed by current(), this should use
# next_current() and - for matches - next_match()
diff --git a/lib/Krawfish/Compile/Remote/Sort.pm b/lib/Krawfish/Compile/Remote/Sort.pm
index 660f4e0..8292a67 100644
--- a/lib/Krawfish/Compile/Remote/Sort.pm
+++ b/lib/Krawfish/Compile/Remote/Sort.pm
@@ -17,6 +17,10 @@
# After the results are returned, the results somehow should be validated
# to defend rogue nodes.
+# The mechanism requires that the collation of the sorting is forwarded
+# as well. This will override rank sorting on terms and fields
+# (and is probably slow).
+
sub new {
my $class = shift;
bless {
@@ -25,6 +29,8 @@
}, $class;
};
+
+# Move to next posting
sub next {
...
};
diff --git a/lib/Krawfish/Controller/Index.pm b/lib/Krawfish/Controller/Index.pm
index c5e2632..be02178 100644
--- a/lib/Krawfish/Controller/Index.pm
+++ b/lib/Krawfish/Controller/Index.pm
@@ -15,9 +15,11 @@
# replicant.
# 2. $cluster->import($primary, $secondary);
# if one of them fails, choose another one.
+ # 3. Return a unique commit-ID
};
+# Receive information regarding a specific commit
sub commit_info {
my $c = shift;
my $commit_id = $c->stash('commit_id');
@@ -26,11 +28,10 @@
unless ($commit_id) {
...
};
-
- # List information on one special commit
...
};
+
# The search API first searches for matches, then retrieves information
# per match identical to the match API
sub search {
diff --git a/lib/Krawfish/Corpus.pm b/lib/Krawfish/Corpus.pm
index ed4d34b..3746fc5 100644
--- a/lib/Krawfish/Corpus.pm
+++ b/lib/Krawfish/Corpus.pm
@@ -3,6 +3,8 @@
use strict;
use warnings;
+# Krawfish::Corpus is the base class for all corpus queries.
+
# Current span object
sub current {
my $self = shift;
@@ -13,4 +15,10 @@
};
+# Overwrite query object method: next_doc delegates to next
+sub next_doc {
+ return $_[0]->next;
+};
+
+
1;
diff --git a/lib/Krawfish/Corpus/And.pm b/lib/Krawfish/Corpus/And.pm
index 11d8bbe..0b5bbff 100644
--- a/lib/Krawfish/Corpus/And.pm
+++ b/lib/Krawfish/Corpus/And.pm
@@ -7,7 +7,7 @@
use warnings;
# TODO:
-# Create a version of AndWithFlags
+# Support class flags
use constant DEBUG => 0;
@@ -16,12 +16,13 @@
bless {
first => shift,
second => shift,
- doc_id => undef
+ doc_id => undef,
+ flags => 0b0000_0000_0000_0000
}, $class;
};
-sub init {
+sub _init {
return if $_[0]->{init}++;
$_[0]->{first}->next;
$_[0]->{second}->next;
@@ -39,7 +40,7 @@
sub next {
my $self = shift;
- $self->init;
+ $self->_init;
if (DEBUG) {
print_log(
diff --git a/lib/Krawfish/Corpus/AndNot.pm b/lib/Krawfish/Corpus/AndNot.pm
index 9f3b592..d1c8340 100644
--- a/lib/Krawfish/Corpus/AndNot.pm
+++ b/lib/Krawfish/Corpus/AndNot.pm
@@ -15,12 +15,14 @@
};
-sub init {
+# Initialize query
+sub _init {
return if $_[0]->{init}++;
$_[0]->{first}->next;
$_[0]->{second}->next;
};
+
# Clone query
sub clone {
my $self = shift;
@@ -31,9 +33,10 @@
};
+# Move to next posting
sub next {
my $self = shift;
- $self->init;
+ $self->_init;
my $first = $self->{first}->current;
my $second = $self->{second}->current;
@@ -43,7 +46,9 @@
# No first operand
return unless $first;
- print_log('vc_andnot', 'There is a first current ' . $first->to_string) if DEBUG;
+ if (DEBUG) {
+ print_log('vc_andnot', 'There is a first current ' . $first->to_string);
+ };
while ($first && $second) {
@@ -51,7 +56,10 @@
if ($first->doc_id == $second->doc_id) {
if (DEBUG) {
- print_log('vc_andnot', 'Both operands have the same doc_id: ' . $first->doc_id);
+ print_log(
+ 'vc_andnot',
+ 'Both operands have the same doc_id: ' . $first->doc_id
+ );
};
$self->{first}->next;
@@ -100,11 +108,14 @@
return 0;
};
+
+# Get maximum frequency
sub max_freq {
$_[0]->{first}->max_freq;
};
+# Stringification
sub to_string {
my $self = shift;
return 'andNot(' . $self->{first}->to_string . ',' . $self->{second}->to_string . ')';
diff --git a/lib/Krawfish/Corpus/AndWithFlags.pm b/lib/Krawfish/Corpus/AndWithFlags.pm
deleted file mode 100644
index 7e95f5f..0000000
--- a/lib/Krawfish/Corpus/AndWithFlags.pm
+++ /dev/null
@@ -1,36 +0,0 @@
-package Krawfish::Corpus::AndWithFlags;
-use parent 'Krawfish::Corpus::And';
-use Krawfish::Posting::DocWithFlags;
-use Krawfish::Log;
-use strict;
-use warnings;
-
-# "and with flags" queries are similar
-# to "and" queries, but they respect flags
-# and are therefore not cachable
-
-sub new {
- my $class = shift;
- bless {
- first => shift,
- second => shift,
- flags => 0b0000_0000_0000_0000
- }, $class;
-};
-
-
-sub current {
- my $self = shift;
- return unless defined $self->{doc_id};
- return Krawfish::Posting::DocWithFlags->new(
- $self->{doc_id},
- $self->{flags}
- );
-};
-
-
-sub next {
- ...;
-};
-
-1;
diff --git a/lib/Krawfish/Corpus/Cache.pm b/lib/Krawfish/Corpus/Cache.pm
index 183e20f..ee34263 100644
--- a/lib/Krawfish/Corpus/Cache.pm
+++ b/lib/Krawfish/Corpus/Cache.pm
@@ -61,10 +61,13 @@
};
+# Clone query
sub clone {
...
};
+
+# Move to next posting
sub next {
my $self = shift;
@@ -86,6 +89,7 @@
};
+# Skip document
sub skip_doc {
my ($self, $doc_id) = @_;
@@ -103,13 +107,17 @@
};
+# Stringification
sub to_string {
my $self = shift;
return 'cache(' . $self->{span}->to_string . ')';
};
+
+# Get maximum frequency
sub max_freq {
$_[0]->{span}->max_freq;
};
+
1;
diff --git a/lib/Krawfish/Corpus/Class.pm b/lib/Krawfish/Corpus/Class.pm
index c1f64ad..cd40c91 100644
--- a/lib/Krawfish/Corpus/Class.pm
+++ b/lib/Krawfish/Corpus/Class.pm
@@ -43,6 +43,8 @@
}, $class;
};
+
+# Clone query
sub clone {
my $self = shift;
__PACKAGE__->new(
@@ -83,11 +85,13 @@
};
+# Skip to target document
sub skip_doc {
...
};
+# Stringification
sub to_string {
my $self = shift;
my $str = 'class(';
@@ -96,6 +100,8 @@
return $str;
};
+
+# Get maximum frequency
sub max_freq {
$_[0]->{corpus}->max_freq;
};
diff --git a/lib/Krawfish/Corpus/DocVector.pm b/lib/Krawfish/Corpus/DocVector.pm
index fd23209..7e9185a 100644
--- a/lib/Krawfish/Corpus/DocVector.pm
+++ b/lib/Krawfish/Corpus/DocVector.pm
@@ -19,6 +19,7 @@
}, $class;
};
+
# The query is built by iterating through all terms
# in the dictionary and fetching the relevant doc_ids per
# segment. While doing that, the doc_ids are sorted and
@@ -27,18 +28,26 @@
...
};
+
+# Move to next posting
sub next {
...
};
+
+# Get current posting
sub current {
...
};
+
+# Get maximum frequency
sub max_freq {
...
};
+
+# Stringification
sub to_string {
...
};
diff --git a/lib/Krawfish/Corpus/FieldID.pm b/lib/Krawfish/Corpus/FieldID.pm
index ccfeaed..025678d 100644
--- a/lib/Krawfish/Corpus/FieldID.pm
+++ b/lib/Krawfish/Corpus/FieldID.pm
@@ -20,6 +20,8 @@
}, $class;
};
+
+# Clone query
sub clone {
my $self = shift;
return __PACKAGE__->new(
@@ -28,6 +30,8 @@
);
};
+
+# Move to next posting
sub next {
my $self = shift;
@@ -42,11 +46,13 @@
};
+# Get term identifier
sub term_id {
$_[0]->{term_id};
};
+# Get current posting
sub current {
my $postings = $_[0]->{postings};
return if $postings->pos == -1;
@@ -56,10 +62,14 @@
);
}
+
+# Get maximum frequency
sub max_freq {
$_[0]->{postings}->freq;
};
+
+# Stringification
sub to_string {
return '#' . $_[0]->term_id;
};
diff --git a/lib/Krawfish/Corpus/Or.pm b/lib/Krawfish/Corpus/Or.pm
index e5007a7..c673804 100644
--- a/lib/Krawfish/Corpus/Or.pm
+++ b/lib/Krawfish/Corpus/Or.pm
@@ -6,15 +6,21 @@
use constant DEBUG => 0;
+# TODO:
+# Support class flags
+
sub new {
my $class = shift;
bless {
first => shift,
second => shift,
- doc_id => -1
+ doc_id => -1,
+ flags => 0b0000_0000_0000_0000
}, $class;
};
+
+# Clone query object
sub clone {
my $self = shift;
__PACKAGE__->new(
@@ -23,15 +29,19 @@
);
};
-sub init {
+
+# Initialize query
+sub _init {
return if $_[0]->{init}++;
$_[0]->{first}->next;
$_[0]->{second}->next;
};
+
+# Move to next posting
sub next {
my $self = shift;
- $self->init;
+ $self->_init;
my $first = $self->{first}->current;
my $second = $self->{second}->current;
@@ -101,12 +111,14 @@
};
+# Stringification
sub to_string {
my $self = shift;
return 'or(' . $self->{first}->to_string . ',' . $self->{second}->to_string . ')';
};
+# Get maximum frequency
sub max_freq {
my $self = shift;
$self->{first}->max_freq + $self->{second}->max_freq;
diff --git a/lib/Krawfish/Corpus/OrWithFlags.pm b/lib/Krawfish/Corpus/OrWithFlags.pm
deleted file mode 100644
index bba59f1..0000000
--- a/lib/Krawfish/Corpus/OrWithFlags.pm
+++ /dev/null
@@ -1,128 +0,0 @@
-package Krawfish::Corpus::OrWithFlags;
-use parent 'Krawfish::Corpus::Or';
-use Krawfish::Posting::DocWithFlags;
-use Krawfish::Log;
-use strict;
-use warnings;
-
-# "or with classes" queries are similar
-# to "or" queries, but they respect flags
-# and are therefore not cachable
-
-sub new {
- my $class = shift;
- bless {
- first => shift,
- second => shift,
- first_current => undef,
- second_current => undef,
- doc_id => -1,
- flags => 0b0000_0000_0000_0000
- }, $class;
-};
-
-
-sub current {
- my $self = shift;
- return unless defined $self->{doc_id};
- return Krawfish::Posting::DocWithFlags->new(
- $self->{doc_id},
- $self->{flags}
- );
-};
-
-sub next {
- ...;
-};
-
-sub max_freq {
- ...
-};
-
-1;
-
-__END__
-
-
-sub next {
- $self = shift;
- $self->init;
-
- my $first = $self->{first}->current;
- my $second = $self->{second}->current;
-
- print_log('vc_or_flags', 'Check postings') if DEBUG;
-
- # First doc matches
- $self->{flags} |= $first->flags if $first;
-
- # Second doc matches
- $self->{flags} |= $second->flags if $second;
-};
-
-
-{
- # Iterate to positions
- while ($first || $second) {
-
- # First span is no longer available
- if (!$first) {
- $curr = 'second';
- }
-
- # Second span is no longer available
- elsif (!$second) {
- print_log('vc_or_flags', 'Current is first operand (b)') if DEBUG;
- $curr = 'first';
- }
-
- elsif ($first->doc_id < $second->doc_id) {
- print_log('vc_or_flags', 'Current is first operand (1)') if DEBUG;
- $curr = 'first';
- }
- elsif ($first->doc_id > $second->doc_id) {
- print_log('vc_or_flags', 'Current is second operand (1)') if DEBUG;
- $curr = 'second';
- }
- else {
- print_log('vc_or_flags', 'Current is first operand (4)') if DEBUG;
- $curr = 'first';
- };
-
- # Get the current posting of the respective operand
- my $curr_post = $self->{$curr}->current;
-
- # Only return unique identifier
- if ($self->{doc_id} == $curr_post->doc_id) {
-
- if (DEBUG) {
- print_log('vc_or_flags', 'Document ID already returned: '. $self->{doc_id});
- };
-
- # Forward
- $self->{$curr}->next;
-
- # Set current docs
- $first = $self->{first}->current;
- $second = $self->{second}->current;
-
- CORE::next;
- };
-
- $self->{doc_id} = $curr_post->doc_id;
-
- if (DEBUG) {
- print_log('vc_or_flags', 'Current doc is ' . $self->current->to_string);
- print_log('vc_or_flags', "Next on $curr operand");
- };
-
- $self->{$curr}->next;
- return 1;
- };
-
- $self->{doc_id} = undef;
- return;
- };
-};
-
-1;
diff --git a/lib/Krawfish/Index.pm b/lib/Krawfish/Index.pm
index 1b34f7d..272dbb9 100644
--- a/lib/Krawfish/Index.pm
+++ b/lib/Krawfish/Index.pm
@@ -9,10 +9,6 @@
use constant DEBUG => 0;
-# TODO:
-# May need to be renamed to Krawfish::Node
-
-
# This is the central object for index handling on node level.
# A new document will be added by adding the following information:
# - To the dynamic DICTIONARY
@@ -57,6 +53,9 @@
# Dynamic dictionaries can be merged with static indices once in a while.
# TODO:
+# May need to be renamed to Krawfish::Node
+
+# TODO:
# Create Importer class
#
# TODO:
diff --git a/lib/Krawfish/Index/Dictionary.pm b/lib/Krawfish/Index/Dictionary.pm
index e9f5885..88f5cba 100644
--- a/lib/Krawfish/Index/Dictionary.pm
+++ b/lib/Krawfish/Index/Dictionary.pm
@@ -6,9 +6,6 @@
use warnings;
use Krawfish::Log;
-# TODO:
-# Create a central prefix constant class!
-
# This class is the basic dictionary class. It provides a
# homogeneous interface to K::I::Dictionary::Dynamic and
# K::I::Dictionary::Static (versioned).
@@ -39,6 +36,54 @@
# fields: +
# fieldkeys: !
#
+
+# SURFACE RANKING
+# ===============
+# The dictionary contains all ranking information for surface forms.
+# When a surface form is added, the information on the ranking in
+# the dynamic dictionary is stored as empty epsilon information initially.
+# (See the store variant of the static dictionary).
+# Every new term in the dynamic dictionary is added to a list of
+# terms with attached term ids.
+# Identical surface terms may have different sorting keys (following
+# UTS #10) - in that case, before the ranking transition, another transition
+# is added to branch on multiple sorting keys, e.g.
+# bank-[COLL-DE]-[RANKS1]-#term-id-1
+# \-[COLL-EN]-[RANKS2]-#term-id-2
+# (Or the collations are appended to the ranking level)
+# In the forward index, the different term-ids result in different rankings,
+# though they result in identical surface terms.
+# That way, different languages can be sorted at the same time solely based
+# on their sorting key.
+# BE AWARE: For checking, if a term is identical to another term,
+# the collation must be stored in a byte at the rank transition.
+# Because this may introduce quite a lot of problems, it's subject to change.
+#
+# On MERGE
+# 1 The dictionaries are merged
+# 2 The list of new terms is sorted both in prefix and
+# suffix order according to their UTS #10 sorting keys
+# 3 The sorted new term list in prefix order is merged with the
+# sorted list in prefix order of the static dictionary
+# 4 When a new term is first found to be merged in,
+# the term gets the prefix rank in the merged static dictionary
+# 5 All following terms are updated in the static dictionary
+# accordingly
+# (which is fast, because term-id lookup + one up is reasonably
+# fast in memory)
+# 6 Do 2-5 for the suffix ordered list
+#
+# The dynamic new term list (unsorted) has the following structure:
+# ([sorting-key][term_id])* # though, this may be redundant
+# The static sorted lists have the following structure:
+# ([sorting-key-with-front-coding][term_id])*
+# Ranks are stored at the pre-terminal level in the dictionary.
+#
+# Ranking information is stored on the node level
+# [term_id] -> [RANK]
+# ->rank_by_term_id(term_id)
+# ->rev_rank_by_term_id(term_id)
+
# add_term:
# First the static dictionary will do a look-up if the term exists,
# then the dynamic dictionary will do an insert_or_search, meaning
@@ -171,9 +216,21 @@
# requested, for example, by the term_id API for co-occurrence search.
# That's why all subterms need to be stored as well.
+# TODO:
+# There are two possible rank value types:
+# 1 VALUE IS EVEN: The global rank from
+# the static dictionary
+# 2 VALUE IS ODD: The prerank, means,
+# it is sorted based on the rank and takes the place
+# between the two rank values, but it may occur
+# multiple times for different values.
+# This comes from the dynamic dictionary.
+
use constant DEBUG => 0;
+
+# Constructor
sub new {
my $class = shift;
my $file = shift;
@@ -210,6 +267,7 @@
};
+# Add arbitrary term to dictionary
sub add_term {
my ($self, $term) = @_;
@@ -238,10 +296,6 @@
};
-sub collations {
- $_[0]->{collations};
-};
-
# Add a field to the index
# TODO:
# Currently the collation only accepts a locale
@@ -293,6 +347,11 @@
};
+# Get collations object
+sub collations {
+ $_[0]->{collations};
+};
+
# Get the collation of the base or the field id or undef, if not sortable
sub collation {
diff --git a/lib/Krawfish/Index/Dictionary/Collation.pm b/lib/Krawfish/Index/Dictionary/Collation.pm
index 3e2ecf4..6d1465f 100644
--- a/lib/Krawfish/Index/Dictionary/Collation.pm
+++ b/lib/Krawfish/Index/Dictionary/Collation.pm
@@ -5,6 +5,7 @@
# This is just a convenience wrapper for Unicode::Collate::Locale
+# Constructor
sub new {
my ($class, $locale) = @_;
@@ -17,6 +18,8 @@
bless \$coll, $class;
};
+
+# Get sort key for value
sub sort_key {
my ($self, $value) = @_;
return $$self->getSortKey($value);
diff --git a/lib/Krawfish/Index/Dictionary/Collations.pm b/lib/Krawfish/Index/Dictionary/Collations.pm
index efb8bb2..b95c294 100644
--- a/lib/Krawfish/Index/Dictionary/Collations.pm
+++ b/lib/Krawfish/Index/Dictionary/Collations.pm
@@ -3,6 +3,29 @@
use strict;
use warnings;
+# COLLATIONS
+# ==========
+# Sortable fields need to be initialized before documents using
+# this field are added. The dictionary will have a "sortable" flag
+# on a pre-terminal edge in the dictionary that is retrievable.
+# When a field is requested that is not sortable, an error is raised
+# when the sorting is initialized.
+# The collation file is sorted by field-term-id and probably quite short
+# and kept in memory
+#
+# ([sortable-field-id][collation])*
+#
+# When a new field is initialized, this list is immediately updated.
+#
+# Collation information is stored on the node level
+# [term_id] -> [COLLATION]
+# ->init_field(field, collation)
+# ->collation_by_field_id(field_id)
+#
+# Because collation for fields is also stored per segment, this is not
+# requested often.
+
+
# Get the collation based on the locale
# This currently does not support collation ids!
sub new {
@@ -16,7 +39,8 @@
# Get collation
sub get {
my ($self, $locale) = @_;
- return $self->{$locale} //= Krawfish::Index::Dictionary::Collation->new($locale);
+ return $self->{$locale} //=
+ Krawfish::Index::Dictionary::Collation->new($locale);
};
diff --git a/lib/Krawfish/Index/Fields.pm b/lib/Krawfish/Index/Fields.pm
index 5b46dda..f56aae9 100644
--- a/lib/Krawfish/Index/Fields.pm
+++ b/lib/Krawfish/Index/Fields.pm
@@ -39,6 +39,7 @@
# are bad, for example!)
+# Constructor
sub new {
my $class = shift;
bless {
@@ -62,7 +63,8 @@
# TODO:
# use Krawfish::Index::Store::V1::Fields->new;
- $self->{docs}->[$self->last_doc_id] = Krawfish::Index::Fields::Doc->new($doc);
+ $self->{docs}->[$self->last_doc_id] =
+ Krawfish::Index::Fields::Doc->new($doc);
return $doc_id;
};
@@ -82,53 +84,6 @@
};
-
-
-# TODO:
-# Make this part of Krawfish::Index::Fields::Rank!
-#
-# TODO:
-# Unused yet!
-#
-sub ranked_by {
- my ($self, $field) = @_;
-
- warn 'DEPRECATED';
-
- print_log(
- 'fields',
- 'Get rank vector for ' . $field
- ) if DEBUG;
-
- # TODO:
- # Currently ranks are set absolutely - but they should be set
- # multiple times to make sorts for multiple fields
- #
- # TODO: Check if the field needs to be sorted
- # numerically or based on a collation
-
- my $ranks = $self->{ranks};
-
- # Lookup at disk
- return $ranks->{$field} if $ranks->{$field};
-
- # Add rank
- $ranks->{$field} = Krawfish::Index::Rank::Fields->new(
- [grep { defined $_ } map { $_->{$field} } @{$self->{array}}]
- );
-
- if (DEBUG) {
- print_log(
- 'fields',
- 'Return rank vector for ' . $field
- );
- };
-
- # Return ranked list
- return $ranks->{$field};
-};
-
-
1;
diff --git a/lib/Krawfish/Index/Fields/Direction.pm b/lib/Krawfish/Index/Fields/Direction.pm
index 9bec3fa..a7196b5 100644
--- a/lib/Krawfish/Index/Fields/Direction.pm
+++ b/lib/Krawfish/Index/Fields/Direction.pm
@@ -5,6 +5,8 @@
use constant DEBUG => 0;
+
+# Constructor
sub new {
my $class = shift;
my $self = bless [], $class;
@@ -17,16 +19,21 @@
return $self;
};
+
+# Load ranking
sub load {
my ($self, $list) = @_;
@$self = @$list;
};
+
+# Reset ranking
sub reset {
@{$_[0]} = ();
};
+# Get rank for a specific document id
sub rank_for {
my ($self, $doc_id) = @_;
diff --git a/lib/Krawfish/Index/Fields/Doc.pm b/lib/Krawfish/Index/Fields/Doc.pm
index ca43993..896022f 100644
--- a/lib/Krawfish/Index/Fields/Doc.pm
+++ b/lib/Krawfish/Index/Fields/Doc.pm
@@ -7,6 +7,7 @@
use constant DEBUG => 0;
+# Constructor
sub new {
my $class = shift;
my $doc = shift;
@@ -57,8 +58,10 @@
};
+# Serialize to stream
sub to_stream {
...
};
+
1;
diff --git a/lib/Krawfish/Index/Fields/Pointer.pm b/lib/Krawfish/Index/Fields/Pointer.pm
index ec6ab87..c923494 100644
--- a/lib/Krawfish/Index/Fields/Pointer.pm
+++ b/lib/Krawfish/Index/Fields/Pointer.pm
@@ -23,6 +23,8 @@
# Multiple aggregations (e.g. values and facets) will currently
# use multiple pointers, though this could be optimized.
+
+# Constructor
sub new {
my $class = shift;
bless {
@@ -36,26 +38,32 @@
};
+# Get frequency of documents.
+# May be loaded on initialization.
sub freq {
$_[0]->{list}->last_doc_id + 1;
};
+# Return current doc id
sub doc_id {
$_[0]->{doc_id};
};
+# Get current position in list
sub pos {
$_[0]->{pos};
};
+# Move to next document
sub next_doc {
warn 'Not supported';
};
+# Potentially close pointer
sub close {
...
};
@@ -83,7 +91,7 @@
};
-# This returns only int-values - so it may need to be renamed
+# Get integer fields only
sub int_fields {
my $self = shift;
@@ -104,8 +112,11 @@
unless (defined $key_ids[$key_pos]) {
if (DEBUG) {
- print_log('f_point', 'There are no more fields to fetch ' .
- 'at keypos ' . $key_pos . ' in doc_id ' . $self->{doc_id});
+ print_log(
+ 'f_point',
+ 'There are no more fields to fetch ' .
+ 'at keypos ' . $key_pos . ' in doc_id ' . $self->{doc_id}
+ );
};
last;
};
@@ -279,12 +290,4 @@
};
-sub field_terms {
- my $self = shift;
- warn 'probably wrong!';
- return map { $_->term_id } $self->fields(@_);
-};
-
-
-
1;
diff --git a/lib/Krawfish/Index/Fields/Rank.pm b/lib/Krawfish/Index/Fields/Rank.pm
index 3b931fa..7cf656c 100644
--- a/lib/Krawfish/Index/Fields/Rank.pm
+++ b/lib/Krawfish/Index/Fields/Rank.pm
@@ -18,6 +18,29 @@
# using the max_rank rather than storing
# the inverse data redundantly
+# TODO:
+# For encoding dense but not diverse field ranks use something like that:
+# http://pempek.net/articles/2013/08/03/bit-packing-with-packedarray/
+# https://github.com/gpakosz/PackedArray
+# That's why max_rank is important, because it indicates
+# how many bits per doc are necessary to encode
+# the rank!
+
+# TODO:
+# In case, a field is only set for a couple of documents, a different
+# strategy may be valid.
+
+# TODO:
+# Think about a different design, where the field lists are stored on the
+# node level:
+# [collation]([field-term-with-front-coding][term_id])
+# Now, the new terms will be merged in the list and the new segment will incorporate
+# the new ranking.
+# When a new term is added, it is added as
+# ([term][term_id][doc_id])*
+# ...
+
+# Constructor
sub new {
my $class = shift;
@@ -70,12 +93,11 @@
};
$self->{max_rank} = undef;
-# $self->{asc}->reset;
-# $self->{desc}->reset;
$self->{sorted} = [];
};
+# Get the maximum rank
sub max_rank {
$_[0]->{max_rank};
};
@@ -199,15 +221,18 @@
};
+# Get ascending ranking
sub ascending {
$_[0]->{asc};
};
+# Get descending ranking
sub descending {
$_[0]->{desc};
}
+
# Stringification
sub to_string {
my $self = shift;
@@ -227,6 +252,8 @@
return sort { $a->[0] cmp $b->[0] } @$plain;
};
+
+# Numerical sorting
sub _numsort_fields {
my $plain = shift;
@@ -234,4 +261,5 @@
return sort { $a->[0] <=> $b->[0] } @$plain;
};
+
1;
diff --git a/lib/Krawfish/Index/Fields/Ranks.pm b/lib/Krawfish/Index/Fields/Ranks.pm
index a53f761..718ed1f 100644
--- a/lib/Krawfish/Index/Fields/Ranks.pm
+++ b/lib/Krawfish/Index/Fields/Ranks.pm
@@ -6,6 +6,47 @@
use constant DEBUG => 0;
+# FIELD RANKING
+# =============
+# Each segment contains all ranking information for sortable fields.
+# When a document is added to the dynamic segment, all sortable fields
+# are recognized with their sorting keys and the attached doc id.
+# Each static segment has a rank file per field with the length of
+# the segment's doc vector including a forward rank and a backward rank.
+# To make reranking possible on merging, each static segment also has a
+# sorted list of sorting keys with all documents the field is attached to.
+# To deal with multivalued fields (e.g. keywords), the ranking file has
+# two fields: One for forward sorting, one for backwards sorting.
+#
+# On MERGE
+# 1 Sort the dynamic field list in alphabetically/numerical order
+# (respect a chosen collation)
+# 2 Merge all postingslists, forward indices etc.
+# 3 Merge the dynamic field list with the static field list
+# 4 Iterate through the new list from beginning to the end to
+# fill the forward ranking list. Increment starting with 1.
+# The first occurrence of a doc_id is taken.
+# The maximum rank is remembered.
+# 5 Iterate through the new list from beginning to the end to
+# fill the reverse ranking list. Decrement starting with the maximum rank.
+# The last occurrence of a doc_id is taken.
+# 6 Based on the relation between maximum rank and the length of the
+# document vector, the ranking file is encoded and stored.
+# The number of unset documents may also be taken into account for encoding.
+#
+# The sorted lists have the following structure:
+# [collation]([sort-key-with-front-coding|value-as-delta][num-doc-ids-varint][doc_id]*)*
+# The dynamic field list (unsorted) has the following structure:
+# ([field-term][doc_id])*
+# The static ranking lists have the following structure:
+# ([rank][revrank]){MAX_DOC_ID}
+#
+# Ranking information is stored on the segment level
+# [doc_id] -> [RANK]
+# ->rank_by_doc_id(doc_id)
+# ->rev_rank_by_doc_id(doc_id)
+
+
# TODO:
# Instead of 'by()', implement
# 'ascending()' and 'descending()!'
@@ -13,7 +54,7 @@
# has multiple values in the ranks overfiew
-
+# Constructor
sub new {
my $class = shift;
@@ -25,6 +66,7 @@
bless {}, $class;
};
+
# Get the rank by field
sub by {
my ($self, $field_id) = @_;
@@ -66,7 +108,7 @@
};
-
+# Stringification
sub to_string {
my $self = shift;
return join(';', map { $_ . ':' . $self->{$_}->to_string } keys %$self);
diff --git a/lib/Krawfish/Index/Forward.pm b/lib/Krawfish/Index/Forward.pm
index 90b68c7..779e9dc 100644
--- a/lib/Krawfish/Index/Forward.pm
+++ b/lib/Krawfish/Index/Forward.pm
@@ -10,8 +10,9 @@
# This represents a forward index of the data,
# accessible by document ID and subtoken offset.
-# Merging the forward index is pretty simple, as it only needs to be indexed
-# on the document level and then simply be appended.
+# Merging the forward index is pretty simple, as it only
+# needs to be indexed on the document level and then
+# simply be appended.
# TODO:
# This is great for retrieving pagebreaks, annotations, primary data,
@@ -19,7 +20,7 @@
# But can this help to expand the context of a match to a certain element context?
# Probably by retrieving the data with a certain maximum offset (say left 100 subtokens, right 100 subtokens)
# and first check for the expanding element start on the left, then move to the right.
-#
+
# TODO:
# In case the term IDs are retrieved for surface sorting,
# it may be useful to not have much data in memory.
@@ -29,11 +30,11 @@
# The forward index needs fast access to documents and positions,
# to get term ids from contexts for use in the co-occurrence analysis.
-
# TODO:
# This API needs to be backed up by a store version.
# use Krawfish::Index::Store::V1::ForwardIndex;
+# Constructor
sub new {
my $class = shift;
@@ -64,7 +65,8 @@
};
-# Get doc from list (as long as the list provides random access to docs)
+# Get doc from list
+# (as long as the list provides random access to docs)
sub doc {
my ($self, $doc_id) = @_;
print_log('fwd', 'Get document for id ' . $doc_id) if DEBUG;
diff --git a/lib/Krawfish/Index/Forward/Doc.pm b/lib/Krawfish/Index/Forward/Doc.pm
index 6c5cb43..1fb415c 100644
--- a/lib/Krawfish/Index/Forward/Doc.pm
+++ b/lib/Krawfish/Index/Forward/Doc.pm
@@ -3,15 +3,15 @@
use warnings;
use strict;
-use constant DEBUG => 0;
+# Represent a document in the forward index
+use constant DEBUG => 0;
# TODO:
# The forward index may need to contain casefolded terms as well,
# so grouping on terms can support casefolding.
-
-
+# Constructor
sub new {
my $class = shift;
my $doc = shift;
@@ -95,6 +95,7 @@
};
+# Stringification
sub to_string {
my $self = shift;
my ($offset, $length) = @_;
diff --git a/lib/Krawfish/Index/Forward/Pointer.pm b/lib/Krawfish/Index/Forward/Pointer.pm
index ecf989f..3dd57b6 100644
--- a/lib/Krawfish/Index/Forward/Pointer.pm
+++ b/lib/Krawfish/Index/Forward/Pointer.pm
@@ -4,6 +4,8 @@
use warnings;
use strict;
+# Pointer in the list of documents.
+
# WARNING:
# This currently is not combined with live documents per default
@@ -24,7 +26,7 @@
#
# ->current # The current subtoken object
-
+# Constructor
sub new {
my $class = shift;
bless {
@@ -41,6 +43,9 @@
}, $class;
};
+
+# Get the number of documents in the index.
+# May be passed in the initialization phase
sub freq {
my $freq = $_[0]->{list}->last_doc_id + 1;
@@ -69,13 +74,14 @@
$_[0]->{cur};
};
+
# Move to next document
sub next_doc {
...
};
-# Close stream
+# Potentially close stream
sub close {
...
};
@@ -83,31 +89,31 @@
# Skip to relevant document
sub skip_doc {
- my ($self, $doc_id) = @_;
+ my ($self, $target_doc_id) = @_;
if (DEBUG) {
- print_log('fwd_point', "Skip from " . $self->{doc_id} . " to $doc_id");
+ print_log('fwd_point', "Skip from " . $self->{doc_id} . " to $target_doc_id");
};
# Pointer already in requested document
- if ($self->{doc_id} == $doc_id) {
+ if ($self->{doc_id} == $target_doc_id) {
if (DEBUG) {
print_log('fwd_point', 'Document already in position');
};
- return $doc_id;
+ return $target_doc_id;
}
# Pointer needs to skip
- elsif ($self->{doc_id} < $doc_id && $doc_id < $self->freq) {
+ elsif ($self->{doc_id} < $target_doc_id && $target_doc_id < $self->freq) {
if (DEBUG) {
- print_log('fwd_point', 'Get document for id ' . $doc_id);
+ print_log('fwd_point', 'Get document for id ' . $target_doc_id);
};
- $self->{doc_id} = $doc_id;
- $self->{doc} = $self->{list}->doc($doc_id);
+ $self->{doc_id} = $target_doc_id;
+ $self->{doc} = $self->{list}->doc($target_doc_id);
$self->{cur} = 0;
$self->{pos} = -1;
@@ -115,7 +121,7 @@
delete $self->{prev};
delete $self->{next};
- return $doc_id;
+ return $target_doc_id;
};
return NOMORE;
};
@@ -123,21 +129,21 @@
# Skip to relevant position
sub skip_pos {
- my ($self, $pos) = @_;
+ my ($self, $target_pos) = @_;
# TODO:
# There need to be a way to skip back in a document,
# though it's probably sufficient to
# go ->prev() without skipping
- return 0 if $pos < $self->{pos};
+ return 0 if $target_pos < $self->{pos};
if (DEBUG) {
- print_log('fwd_point', "Skip position to $pos");
+ print_log('fwd_point', "Skip position to $target_pos");
};
# TODO:
# This should use skip lists!
- while ($pos > $self->{pos}) {
+ while ($target_pos > $self->{pos}) {
$self->next or return 0;
};
@@ -146,6 +152,7 @@
# Get the current token
+# As this does not return a posting, this may be renamed!
sub current {
my $self = shift;
@@ -183,7 +190,7 @@
};
-# Get the next token
+# Move to the next posting
sub next {
my $self = shift;
@@ -221,7 +228,7 @@
};
-# Get the previous token
+# Move to the previous token
sub prev {
my $self = shift;
@@ -248,5 +255,4 @@
};
-
1;
diff --git a/lib/Krawfish/Index/Merge.pm b/lib/Krawfish/Index/Merge.pm
index f6881de..18636f5 100644
--- a/lib/Krawfish/Index/Merge.pm
+++ b/lib/Krawfish/Index/Merge.pm
@@ -12,10 +12,10 @@
# a) multiple static segments
# b) One dynamic segment
#
-# All new documents are added to the dynamic index,
-# But searches are done
+# All new documents are added to the dynamic index.
+# Constructor
sub new {
my ($class, $segment_a, $segment_b) = @_;
bless {
@@ -24,6 +24,8 @@
}, $class;
};
+
+# Merge segments
sub merge {
# Merging will:
@@ -82,6 +84,7 @@
# - remove segment B
};
+
sub _merge_postings_lists {
# TODO:
# (ignore deleted documents)
@@ -94,6 +97,7 @@
# - Calculate new freq value
};
+
sub _merge_fields {
# TODO:
# (ignore deleted documents)
@@ -108,6 +112,7 @@
# - The position offsets to the primary data files should stay intact
};
+
sub _merge_primary_data {
# TODO:
# (ignore deleted documents)
diff --git a/lib/Krawfish/Index/PostingLivePointer.pm b/lib/Krawfish/Index/PostingLivePointer.pm
index aa78315..196da53 100644
--- a/lib/Krawfish/Index/PostingLivePointer.pm
+++ b/lib/Krawfish/Index/PostingLivePointer.pm
@@ -4,19 +4,19 @@
use strict;
use warnings;
+# Points to a position in a live list
+
# TODO:
# The pointer should copy the list of deletes,
# so a new delete during searching doesn't interfere with the list!
-use constant {
- DEBUG => 0
-};
+# TODO:
+# Use Stream::Finger instead of PostingPointer
-# Points to a position in a live list
-
-# TODO: Use Stream::Finger instead of PostingPointer
+use constant DEBUG => 0;
+# Constructor
sub new {
my $class = shift;
my $self = bless {
@@ -51,7 +51,7 @@
};
-# Forward position
+# Move to next live posting
sub next {
my $self = shift;
@@ -74,7 +74,9 @@
# meaning it hits the stop marker
while ($doc_id >= $list->[$self->{pos}]) {
- print_log('live_p', 'Current doc_id is either deleted or beyond') if DEBUG;
+ if (DEBUG) {
+ print_log('live_p', 'Current doc_id is either deleted or beyond');
+ };
if ($doc_id == $list->[$self->{pos}]) {
@@ -107,17 +109,17 @@
};
+# Move to next document
sub next_doc {
$_[0]->next;
};
+# Skip to target document
sub skip_doc {
- my $self = shift;
+ my ($self, $target_doc_id) = @_;
- my $doc_id = shift;
-
- if ($doc_id >= $self->{next_doc_id} || $doc_id < $self->{doc_id}) {
+ if ($target_doc_id >= $self->{next_doc_id} || $target_doc_id < $self->{doc_id}) {
$self->{doc_id} = $self->{next_doc_id};
return;
};
@@ -125,13 +127,13 @@
my $list = $self->{list_copy};
# Move through deletion list until doc_id is valid
- while ($list->[$self->{pos}] <= $doc_id) {
+ while ($list->[$self->{pos}] <= $target_doc_id) {
# Requested document is deleted
- if ($list->[$self->{pos}] == $doc_id) {
+ if ($list->[$self->{pos}] == $target_doc_id) {
# Goto next doc
- $doc_id++;
+ $target_doc_id++;
};
# Move to next deletion list position
@@ -139,21 +141,28 @@
};
# TODO: Can this happen?
- return if $doc_id >= $self->{next_doc_id};
+ return if $target_doc_id >= $self->{next_doc_id};
# Set document id
- return $self->{doc_id} = $doc_id;
+ return $self->{doc_id} = $target_doc_id;
};
+
+# Get next document identifier
+# (probably implement similar to sorted queries with previews)
sub next_doc_id {
$_[0]->{next_doc_id};
};
+
+# Stringification
sub to_string {
'[1]';
};
-sub configuration {
+
+# Get configuration string
+sub to_config_string {
my $self = shift;
my @del = @{$self->{list_copy}};
my $pos = $del[$self->{pos}];
@@ -176,11 +185,14 @@
};
+# Get the number of postings in the document
+# (always 1)
sub freq_in_doc {
1;
};
+# Get position
sub pos {
return $_[0]->{pos};
};
@@ -198,6 +210,7 @@
};
+# Potentially close posting list
sub close {
...
};
diff --git a/lib/Krawfish/Index/PostingPointer.pm b/lib/Krawfish/Index/PostingPointer.pm
index 62aaff9..7b5939e 100644
--- a/lib/Krawfish/Index/PostingPointer.pm
+++ b/lib/Krawfish/Index/PostingPointer.pm
@@ -7,23 +7,33 @@
use strict;
use warnings;
+# Moving pointer in a posting list.
+
use constant {
DEBUG => 0,
DOC_ID => 0
};
-# TODO: Implement skipping efficiently!!!
-# TODO: Implement next_doc efficiently!!!
-# TODO: Implement freq_in_doc efficiently!!!
-# TODO: Add direct access to doc_id!
+# TODO:
+# Implement skipping efficiently!!!
-# TODO: Use Stream::Finger instead of PostingPointer
+# TODO:
+# Implement next_doc efficiently!!!
-# Points to a position in a postings list
+# TODO:
+# Implement freq_in_doc efficiently!!!
-# TODO: Return different posting types
-# Using current
+# TODO:
+# Add direct access to doc_id!
+# TODO:
+# Use Stream::Finger instead of PostingPointer
+
+# TODO:
+# Return different posting types using current
+
+
+# Constructor
sub new {
my $class = shift;
bless {
@@ -32,23 +42,22 @@
}, $class;
};
+
+# Get frequency of the list
+# (probably copy value when posting pointer is lifted)
sub freq {
$_[0]->{list}->freq;
};
-# Get the term from the list
-sub term {
- $_[0]->{list}->term;
-};
-
-
+# Get the term id
+# (probably copy value when posting pointer is lifted)
sub term_id {
$_[0]->{list}->term_id;
};
-# Forward position
+# Move to next posting
sub next {
my $self = shift;
my $pos = $self->{pos}++;
@@ -86,6 +95,7 @@
};
+# Get the current position in the list
sub pos {
return $_[0]->{pos};
};
@@ -104,34 +114,37 @@
};
+# Potentially close pointer
sub close {
...
};
-#sub list {
-# return $_[0]->{list};
-#};
-
-
-# Skip to a certain document, return the current
-# doc_id
+# Skip to a certain document,
+# return the new doc_id
sub skip_doc {
- my ($self, $doc_id) = @_;
+ my ($self, $target_doc_id) = @_;
+
+ # TODO:
+ # Return NOMORE in case there are no more postings.
print_log('ppointer', refaddr($self) . ': TEMP SLOW Skip to chosen document') if DEBUG;
- while (!$self->current || $self->current->doc_id < $doc_id) {
+ while (!$self->current || $self->current->doc_id < $target_doc_id) {
$self->next or return;
};
+
return $self->current->doc_id;
};
+# Skip to a certain position in the list
sub skip_pos {
- my ($self, $pos) = @_;
- print_log('ppointer', refaddr($self) . ': TEMP SLOW Skip to chosen position or after')
- if DEBUG;
+ my ($self, $target_pos) = @_;
+
+ if (DEBUG) {
+ print_log('ppointer', refaddr($self) . ': TEMP SLOW Skip to chosen position or after');
+ };
unless ($self->current) {
$self->next or return;
@@ -140,7 +153,7 @@
my $current = $self->current;
my $start_doc_id = $current->doc_id;
- while ($start_doc_id == $current->doc_id && $current->start <= $pos) {
+ while ($start_doc_id == $current->doc_id && $current->start <= $target_pos) {
$self->next or return;
$current = $self->current;
};
@@ -148,4 +161,5 @@
return $current->start;
};
+
1;
diff --git a/lib/Krawfish/Index/Postings/Coordinator.pm b/lib/Krawfish/Index/Postings/Coordinator.pm
index a104dc2..6a0aa5d 100644
--- a/lib/Krawfish/Index/Postings/Coordinator.pm
+++ b/lib/Krawfish/Index/Postings/Coordinator.pm
@@ -4,8 +4,10 @@
use strict;
# The PostingsCoordinator loads the postings
-# index and lifts the relevant postings using mmap
+# index and lifts the relevant postings using mmap.
+
+# Constructor
sub new {
my ($class, $file, $size) = @_;
@@ -85,6 +87,8 @@
return $lifted;
};
+
+# Merge postings lists
sub merge {
# Merge the postingslists of two segments
# by iterating over both coordination files and
diff --git a/lib/Krawfish/Index/Postings/Empty.pm b/lib/Krawfish/Index/Postings/Empty.pm
index 325b142..5b23394 100644
--- a/lib/Krawfish/Index/Postings/Empty.pm
+++ b/lib/Krawfish/Index/Postings/Empty.pm
@@ -1,9 +1,13 @@
package Krawfish::Index::Postings::Empty;
+use parent 'Krawfish::Index::PostingPointer';
use strict;
use warnings;
-# This list is empty
+# Represent an empty posting list
+
+
+# Constructor
sub new {
my $class = shift;
my $term_id = shift;
@@ -11,26 +15,35 @@
};
+# Get frequency of postings in list
sub freq {
0;
};
+
+# Get associated term id
sub term_id {
${$_[0]};
};
+
# Stringification
sub to_string {
'#' . $_[0]->term_id;
};
+
+# Move to posting at a certain position
sub at {
return;
};
+
+# Lift a pointer into the empty list
sub pointer {
warn q!You can't point into an empty list!;
return;
};
+
1;
diff --git a/lib/Krawfish/Index/Postings/Lift.pm b/lib/Krawfish/Index/Postings/Lift.pm
index f5f13aa..a32846d 100644
--- a/lib/Krawfish/Index/Postings/Lift.pm
+++ b/lib/Krawfish/Index/Postings/Lift.pm
@@ -29,6 +29,12 @@
#
# Regarding the overall structure, see
# http://www.atire.org/index.php?title=Index_Structure
+#
+# See
+# https://stackoverflow.com/questions/9817233/why-mmap-is-faster-than-sequential-io
+# http://lkml.iu.edu/hypermail/linux/kernel/0802.0/1496.html
+# http://lkml.iu.edu/hypermail/linux/kernel/0802.0/1496.html
+# https://marc.info/?l=linux-kernel&m=95496636207616&w=2
use constant {
DEBUG => 0,
@@ -40,12 +46,6 @@
LIST => 5
};
-# See
-# https://stackoverflow.com/questions/9817233/why-mmap-is-faster-than-sequential-io
-# http://lkml.iu.edu/hypermail/linux/kernel/0802.0/1496.html
-# http://lkml.iu.edu/hypermail/linux/kernel/0802.0/1496.html
-# https://marc.info/?l=linux-kernel&m=95496636207616&w=2
-
# Construct a new lifter
sub new {
@@ -115,4 +115,5 @@
};
};
+
1;
diff --git a/lib/Krawfish/Index/PostingsList.pm b/lib/Krawfish/Index/PostingsList.pm
index d21b01d..6b0dfb1 100644
--- a/lib/Krawfish/Index/PostingsList.pm
+++ b/lib/Krawfish/Index/PostingsList.pm
@@ -7,14 +7,24 @@
use constant DEBUG => 0;
# TODO:
+# Per segment has the information of frequency, length,
+# and position in segment.
+
+# TODO:
+# Check if there is a relation to Posting::List.
+
+# TODO:
# Use different PostingsList (or rather different PostingPointer)
# for different term types
-#
-# TODO: Split postinglists, so they have different sizes,
-# that may be fragmented.
-# TODO: Return K::P::Data for at()
+# TODO:
+# Split postinglists, so they have different sizes,
+# that may be fragmented.
+# TODO:
+# Return K::P::Data for at()
+
+# Constructor
sub new {
my ($class, $index_file, $term_id) = @_;
@@ -49,14 +59,21 @@
};
+# Get the term id associated with this postings list
sub term_id {
return $_[0]->{term_id};
};
+
+# Get item at certain position
+# TODO:
+# maybe rename to item(), see Posting::Bundle
sub at {
return $_[0]->{array}->[$_[1]];
};
+
+# Get new pointer
sub pointer {
my $self = shift;
# TODO:
@@ -66,6 +83,8 @@
Krawfish::Index::PostingPointer->new($self);
};
+
+# Stringification
sub to_string {
my $self = shift;
join(',', map { '[' . $_ . ']' } @{$self->{array}});
diff --git a/lib/Krawfish/Index/PostingsLive.pm b/lib/Krawfish/Index/PostingsLive.pm
index b3f28cc..0104833 100644
--- a/lib/Krawfish/Index/PostingsLive.pm
+++ b/lib/Krawfish/Index/PostingsLive.pm
@@ -4,6 +4,7 @@
use strict;
use warnings;
+# Point to live documents.
# Similar interface as Krawfish::Index::PostingsList,
# but has a "delete" method.
@@ -11,7 +12,7 @@
# In addition, this will store the maximum
# number of documents.
-
+# Constructor
sub new {
my ($class, $index_file) = @_;
bless {
@@ -74,6 +75,7 @@
};
+# Lift new pointer to live documents
sub pointer {
my $self = shift;
# This requires a list copy, so chenages in the list
@@ -84,6 +86,8 @@
);
};
+
+# Stringification
sub to_string {
my $self = shift;
'~' . join(',', map { '[' . $_ . ']' } @{$self->{deletes}});
diff --git a/lib/Krawfish/Index/QueryStore.pm b/lib/Krawfish/Index/QueryStore.pm
index 7107457..4092ed8 100644
--- a/lib/Krawfish/Index/QueryStore.pm
+++ b/lib/Krawfish/Index/QueryStore.pm
@@ -18,6 +18,8 @@
# virtual corpus), the subquery can be wrapped in a
# cached query
+
+# Constructor
sub new {
my $class = shift;
bless {
@@ -84,3 +86,6 @@
}
};
};
+
+
+1;
diff --git a/lib/Krawfish/Index/Rank.pm b/lib/Krawfish/Index/Rank.pm
deleted file mode 100644
index 544617a..0000000
--- a/lib/Krawfish/Index/Rank.pm
+++ /dev/null
@@ -1,155 +0,0 @@
-package Krawfish::Index::Rank;
-use strict;
-use warnings;
-
-# Base class for ranking of fields and subterms
-
-# Strategy:
-#
-# SURFACE RANKING
-# ===============
-# The dictionary contains all ranking information for surface forms.
-# When a surface form is added, the information on the ranking in
-# the dynamic dictionary is stored as empty epsilon information initially.
-# (See the store variant of the static dictionary).
-# Every new term in the dynamic dictionary is added to a list of
-# terms with attached term ids.
-# Identical surface terms may have different sorting keys (following
-# UTS #10) - in that case, before the ranking transition, another transition
-# is added to branch on multiple sorting keys, e.g.
-# bank-[COLL-DE]-[RANKS1]-#term-id-1
-# \-[COLL-EN]-[RANKS2]-#term-id-2
-# (Or the collations are appended to the ranking level)
-# In the forward index, the different term-ids result in different rankings,
-# though they result in identical surface terms.
-# That way, different languages can be sorted at the same time solely based
-# on their sorting key.
-# BE AWARE: For checking, if a term is identical to another term,
-# the collation must be stored in a byte at the rank transition.
-# Because this may introduce quie a lot of problems, it's up to changes.
-#
-# On MERGE
-# 1 The dictionaries are merged
-# 2 The list of new terms is sorted both in prefix and
-# suffix order according to their UTS #10 sorting keys
-# 3 The sorted new term list in prefix order is merged with the
-# sorted list in prefix order of the static dictionary
-# 4 When a new term is first found to be merged in,
-# the term gets the prefix rank in the merged static dictionary
-# 5 All following terms are updated in the static dictionary
-# accordingly
-# (which is fast, because term-id lookup + one up is reasonable
-# fast in memory)
-# 6 Do 2-5 for the suffix ordered list
-#
-# The dynamic new term list (unsorted) has the following structure:
-# ([sorting-key][term_id])* # though, this may be redundant
-# The static sorted lists have the following structure:
-# ([sorting-key-with-front-coding][term_id])*
-# Ranks are stored at the pre-terminal level in the dictionary.
-#
-# Ranking information is stored on the node level
-# [term_id] -> [RANK]
-# ->rank_by_term_id(term_id)
-# ->rev_rank_by_term_id(term_id)
-#
-#
-# FIELD RANKING
-# =============
-# Each segment contains all ranking information for sortable fields.
-# When a document is added to the dynamic segment, all sortable fields
-# are recognized with their sorting keys and the attached doc id.
-# Each static segment has a rank file per field with the length of
-# the segment's doc vector including a forward rank and a backward rank.
-# To make reranking possible on merging, each static segment also has a
-# sorted list of sorting keys with all documents the field is attached to.
-# To deal with multivalued fields (e.g. keywords), the ranking file has
-# two fields: One for forward sorting, one for backwards sorting.
-#
-# On MERGE
-# 1 Sort the dynamic field list in alphabetically/numerical order
-# (respect a chosen collation)
-# 2 Merge all postingslists, forward indices etc.
-# 3 Merge the dynamic field list with the static field list
-# 4 Iterate through the new list from beginning to the end to
-# fill the forward ranking list. Increment starting with 1.
-# The first occurrence of a doc_id is taken.
-# The maximum rank is remembered.
-# 5 Iterate through the new list from beginning to the end to
-# fill the reverse ranking list. Decrement stating with the maximum rank.
-# The last occurrence of a doc_id is taken.
-# 6 Based on the relation between maximum rank and the length of the
-# document vector, the ranking file is encoded and stored.
-# The number of unset documents may also be taken into account for encoding.
-#
-# The sorted lists have the following structure:
-# [collation]([sort-key-with-front-coding|value-as-delta][num-doc-ids-varint][doc_id]*)*
-# The dynamic field list (unsorted) has the following structure:
-# ([field-term][doc_id])*
-# The static ranking lists have the following structure:
-# ([rank][revrank]){MAX_DOC_ID}
-#
-# Ranking information is stored on the segment level
-# [doc_id] -> [RANK]
-# ->rank_by_doc_id(doc_id)
-# ->rev_rank_by_doc_id(doc_id)
-#
-#
-# COLLATIONS
-# ==========
-# Sortable fields need to be initialized before documents using
-# this field are added. The dictionary will have a "sortable" flag
-# on a pre-terminal edge in the dictionary that is retrievable.
-# when a field is requested, that is not sortable, an error is raised
-# when the sorting is initialized.
-# The collation file is sorted by field-term-id and probably quite short
-# and kept in memory
-#
-# ([sortable-field-id][collation])*
-#
-# When a new field is initialized, this list is immediately updated.
-#
-# Collation information is stored on the node level
-# [term_id] -> [COLLATION]
-# ->init_field(field, collation)
-# ->collation_by_field_id(field_id)
-#
-# Because collation for fields is also stored per segment, this is not
-# requested often.
-
-
-# TODO:
-# For encoding dense but not diverse field ranks use something like that:
-# http://pempek.net/articles/2013/08/03/bit-packing-with-packedarray/
-# https://github.com/gpakosz/PackedArray
-# That's why max_rank is important, because it indicates
-# how many bits per doc are necessary to encode
-# the rank!
-#
-# TODO:
-# In case, a field is only set for a couple of documents, a different
-# strategy may be valid.
-
-# TODO:
-# Think about a different design, where the field lists are stored on the
-# node level:
-# [collation]([field-term-with-front-coding][term_id])
-# Now, the new terms will be merged in the list and the new segment will incorporate
-# the new ranking.
-# When a new term is added, it is added as
-# ([term][term_id][doc_id])*
-# ...
-
-sub max {
- $_[0]->{max};
-};
-
-
-# Needs to be implemented
-# in the child modules
-sub merge {
- ...
-};
-
-
-1;
diff --git a/lib/Krawfish/Index/Rank/SubTerms.pm b/lib/Krawfish/Index/Rank/SubTerms.pm
deleted file mode 100644
index 0dcf95f..0000000
--- a/lib/Krawfish/Index/Rank/SubTerms.pm
+++ /dev/null
@@ -1,27 +0,0 @@
-package Krawfish::Index::Rank::SubTerms;
-use parent 'Krawfish::Index::Rank';
-use strict;
-use warnings;
-
-warn 'DEPRECATED';
-
-# While FieldsRank is defined per Segment,
-# TermRank is defined per Dictionary.
-# That means per node there are two Term-Ranks
-# per direction (prefix and suffix):
-# One static and one dynamic.
-#
-# TODO:
-# should have a similar API as FieldsRank!
-
-# TODO:
-# There are two possible rank value types:
-# 1 VALUE IS EVEN: The global rank from
-# the static dictionary
-# 2 VALUE IS ODD: The prerank, means,
-# it is sorted based on the rank and takes the place
-# between the two rank values, but it may occurr
-# multiple types for different values.
-# This comes from the dynamic dictionary.
-
-1;
diff --git a/lib/Krawfish/Index/Segment.pm b/lib/Krawfish/Index/Segment.pm
index e339cbd..d93494c 100644
--- a/lib/Krawfish/Index/Segment.pm
+++ b/lib/Krawfish/Index/Segment.pm
@@ -10,11 +10,10 @@
use strict;
use warnings;
-# Return segment information for term ids
+# Return segment information.
# This is the base for dynamic and
# static segment stores.
-#
# TERMS: The dictionary will have one value lists with data,
# accessible by their term_id position in the list:
#
@@ -38,6 +37,8 @@
use constant DEBUG => 0;
+
+# Constructor
sub new {
my $class = shift;
my $file = shift;
@@ -84,12 +85,6 @@
};
-#sub add_sortable {
-# my ($self, $field) = @_;
-# $self->{sortable}->{$field}++;
-#};
-
-
# Get the last document index
sub last_doc {
$_[0]->{live}->next_doc_id - 1;
@@ -126,6 +121,7 @@
};
+# Get field ranks
sub field_ranks {
$_[0]->{field_ranks};
};
@@ -254,4 +250,5 @@
return 1;
};
+
1;
diff --git a/lib/Krawfish/Index/Store/Dynamic/Dictionary.pm b/lib/Krawfish/Index/Store/Dynamic/Dictionary.pm
index d0cfca5..a5e623f 100644
--- a/lib/Krawfish/Index/Store/Dynamic/Dictionary.pm
+++ b/lib/Krawfish/Index/Store/Dynamic/Dictionary.pm
@@ -35,10 +35,14 @@
# Code is based on Tree::Ternary
+# Constructor
+# Represent dictionary as an array
sub new {
bless [], shift;
};
+
+# Insert new term
sub insert {
# Iterative implementation of string insertion.
my ($self, $term, $term_id) = @_;
@@ -86,6 +90,7 @@
};
+# Search in dictionary
sub search {
#
# Iterative implementation of the string search.
@@ -151,17 +156,22 @@
};
-
+# Lookup terms by prefix
+# Returns an iterator
sub prefix_lookup {
my ($self, $prefix, $top_k) = @_;
...
};
+
+# Update dictionary structure
+# (maybe not necessary)
sub update {
my ($self, $prefix, $so_strategy) = @_;
...
};
+
# Remove least significant term
sub remove_lst {
my $self = shift;
@@ -184,4 +194,5 @@
...
};
+
1;
diff --git a/lib/Krawfish/Index/Store/V1/Dictionary.pm b/lib/Krawfish/Index/Store/V1/Dictionary.pm
index a34dd02..b81ce36 100644
--- a/lib/Krawfish/Index/Store/V1/Dictionary.pm
+++ b/lib/Krawfish/Index/Store/V1/Dictionary.pm
@@ -3,6 +3,8 @@
use strict;
use warnings;
+# This is a naive implementation!
+
# This is a compact array based trie representation.
# On each letter node, binary search and linear search can be done over
# an alphabetically sorted list.
@@ -17,8 +19,6 @@
# The term_id array points to the '00' terminal nodes of the tree structure.
-
-
# TODO:
# It may be useful to check for big file limitations
# https://www.codeproject.com/articles/563200/indexer-index-large-collections-by-different-keys
@@ -29,7 +29,7 @@
# In Atire (http://atire.org/index.php?title=Index_Structure) the
# dictionary is split into a top part (first 4 characters) and a
# second part.
-#
+
# TODO:
# Ranks for terms should be added at a prefinal level for surface
# terms with an epsilon character to ignore
@@ -40,11 +40,20 @@
#
# That way a lookup for a rank based on term id is very fast
# and not very costly (as the term id array access is O(1))!
-#
+
# TODO:
# The information if a field is sortable, should also be added
# to a preterminal epsilon edge to all field-ids
+# TODO:
+# Use linear search for small arrays, see
+# https://schani.wordpress.com/2010/04/30/linear-vs-binary-search/
+# Because most arrays are small, prefer linear search over binary search
+
+# TODO: Support collations
+# - https://msdn.microsoft.com/en-us/library/ms143726.aspx
+# - http://userguide.icu-project.org/collation
+
# This is necessary to deal with the dynamic structure
use constant {
SPLIT_CHAR => 0,
@@ -58,14 +67,6 @@
DEBUG => 0
};
-# TODO:
-# Use linear search for small arrays, see
-# https://schani.wordpress.com/2010/04/30/linear-vs-binary-search/
-# Because most arrays are small, prefer linear search over binary search
-
-# TODO: Support collations
-# - https://msdn.microsoft.com/en-us/library/ms143726.aspx
-# - http://userguide.icu-project.org/collation
# from_array
sub new {
@@ -97,9 +98,10 @@
# Search for a term and return a term id
+# Alternatively returns iterator
sub search {
- my $self = shift;
- my $term = shift;
+ my ($self, $term) = @_;
+
my @term = (split('', $term), TERM_CHAR);
my $consumed = 0;
@@ -165,38 +167,54 @@
return;
};
+
+# Search case-insensitively
+# Returns iterator
sub search_case_insensitive {
...
};
+
+# Search ignoring diacritics
+# Returns iterator
sub search_diacritic_insensitive {
...
};
+
+# Search with k errors
+# Returns iterator
sub search_approximative {
...
};
+
+# Search using regular expression
+# Returns iterator
sub search_regex {
...
};
+
# Merge static tree with dynamic tree
sub merge {
...
};
+
# Return iterator of term ids
# TODO:
# Be aware, this is only in collation
# order of the insertion, that may not be very helpful.
-sub in_prefix_order {
- ...
-};
+# sub in_prefix_order {
+# ...
+# };
-sub in_suffix_order {
- ...
-};
+
+# May not be helpful
+# sub in_suffix_order {
+# ...
+# };
@@ -209,7 +227,7 @@
};
-# write a header
+# Write a header
sub to_file {
...
};
@@ -230,6 +248,7 @@
};
+# Convert tree representation to array representation
# P.S. I tried to use only one field for double linking,
# but this didn't work so well
sub convert_to_array {
@@ -323,6 +342,7 @@
};
+# Get term from term id
# Move to top, character by character
sub term_by_term_id {
my ($self, $term_id) = @_;
diff --git a/lib/Krawfish/Index/Store/V1/Fields.pm b/lib/Krawfish/Index/Store/V1/Fields.pm
index 7a01210..6df977f 100644
--- a/lib/Krawfish/Index/Store/V1/Fields.pm
+++ b/lib/Krawfish/Index/Store/V1/Fields.pm
@@ -43,7 +43,8 @@
# Another good use-case is the fast collection of text siglen
# for the virtualcorpus->textsiglen-vector method.
-# Tie to a file
+
+# Constructor tied to a file
sub new {
my ($class, $file, $dict) = @_;
bless {
@@ -84,10 +85,10 @@
};
+# Get fields by doc
sub get_fields {
- my $self = shift;
- my $doc_id = shift;
- my $current = $self->skip_doc($doc_id);
+ my ($self, $target_doc_id) = @_;
+ my $current = $self->skip_doc($target_doc_id);
};
1;
diff --git a/lib/Krawfish/Index/Store/V1/ForwardIndex.pm b/lib/Krawfish/Index/Store/V1/ForwardIndex.pm
index a579939..67acbd1 100644
--- a/lib/Krawfish/Index/Store/V1/ForwardIndex.pm
+++ b/lib/Krawfish/Index/Store/V1/ForwardIndex.pm
@@ -46,6 +46,7 @@
WS_SCHEME => 1 # Short string compression scheme optimized for whitespace
};
+# Constructor
sub new {
my $class = shift;
my $short_string_compression_scheme = shift;
@@ -59,10 +60,13 @@
bless \$stream, $class;
};
+
+# Get current position
sub pos {
...
};
+
# Add term by id
sub add_term_id {
my ($foundry_id, $layer_id, $term_id) = @_;
@@ -105,6 +109,7 @@
$self->{plain_pos} = 0;
};
+
# Add an annotation
sub add_term {
my ($foundry_id, $layer_id, $term) = @_;
@@ -129,6 +134,8 @@
}
};
+
+# Get token
# TODO: May return a subtoken object
sub get {
my ($self, $offset) = @_;
@@ -139,6 +146,7 @@
...
};
+
# Add plain string
# for example punctuation, whitespace etc.
sub add_plain {
diff --git a/lib/Krawfish/Index/Store/V1/ForwardPointer.pm b/lib/Krawfish/Index/Store/V1/ForwardPointer.pm
index f5170d2..b40dfe1 100644
--- a/lib/Krawfish/Index/Store/V1/ForwardPointer.pm
+++ b/lib/Krawfish/Index/Store/V1/ForwardPointer.pm
@@ -7,6 +7,7 @@
# this needs fast access to documents AND positions.
# It needs next() and previous() methods.
+# Constructor
sub new {
my $class = shift;
bless {
@@ -16,10 +17,14 @@
}, $class;
};
+
+# Get current token
sub current {
return $_[0]->{current};
};
+
+# Get posting by offset
sub get {
my ($self, $offset) = @_;
if (my $subtoken = $self->{buffer}->get($offset)) {
@@ -31,12 +36,16 @@
};
+# Move to next token
sub next {
...
}
+
+# Move to previous token
sub previous {
...
};
+
1;
diff --git a/lib/Krawfish/Index/Store/V1/Stream.pm b/lib/Krawfish/Index/Store/V1/Stream.pm
index 4e91e1e..da37193 100644
--- a/lib/Krawfish/Index/Store/V1/Stream.pm
+++ b/lib/Krawfish/Index/Store/V1/Stream.pm
@@ -22,18 +22,24 @@
};
};
+# Initialize stream
sub _init {
# Load first X bytes from file
};
+
+# Skip to or beyond certain document
sub skip_doc {
...
};
+
+# Move to next posting
sub next {
...
};
+
# This appends a byte sequence to a stream
# and updates the skiplist
sub append {
diff --git a/lib/Krawfish/Index/Store/V1/Tokens.pm b/lib/Krawfish/Index/Store/V1/Tokens.pm
index 9914844..0e13a81 100644
--- a/lib/Krawfish/Index/Store/V1/Tokens.pm
+++ b/lib/Krawfish/Index/Store/V1/Tokens.pm
@@ -16,15 +16,17 @@
# ([seg-pos:delta-int][length:uniint])*
# )*
#
-# The difference will only be stored, if it is > 1 (so if a token is greater
-# than one subtoken).
+# The difference will only be stored, if it is > 1
+# (so if a token is greater than one subtoken).
+
+# Constructor
sub new {
my ($class, $file, $foundry) = @_;
bless {
- file => $file,
+ file => $file,
foundry => $foundry,
- doc_id => -1
+ doc_id => -1
}, $class;
};
@@ -47,6 +49,7 @@
$_[0]->{max_token_length}
};
+
# Check if the number of tokens between end and start
# is in the given range.
#
@@ -62,6 +65,7 @@
...
};
+
# Get an array of start positions that are in the range of min/max
# Start with the lowest
sub extend_to_left {
@@ -72,6 +76,7 @@
...
};
+
# Get an array of end positions that are in the range of min/max
# Start with the lowest
sub extend_to_right {
diff --git a/lib/Krawfish/Index/Stream.pm b/lib/Krawfish/Index/Stream.pm
index 8ea5446..777460f 100644
--- a/lib/Krawfish/Index/Stream.pm
+++ b/lib/Krawfish/Index/Stream.pm
@@ -2,20 +2,25 @@
use strict;
use warnings;
+# THIS IS CURRENTLY UNUSED!
+
+# This is similar to Reference Queries, so the mechanism
+# should probably be identical.
+
# TODO:
# Vint should be as simple as possible
+
# TODO:
# BitStream should support multiple pointers,
# And the stream should be closed, once no pointers
# point to it any longer
+
# TODO:
# BitStream may be loaded from a file and may
# load further elements, once it exceeds the boundaries
# of the current element
-#
-# This is similar to Reference Queries, so the mechanism
-# should probably be identical.
-#
+
+# Constructor
sub new {
my $class = shift;
bless {
diff --git a/lib/Krawfish/Index/Stream/Finger.pm b/lib/Krawfish/Index/Stream/Finger.pm
index 08d1a5d..7b7b10d 100644
--- a/lib/Krawfish/Index/Stream/Finger.pm
+++ b/lib/Krawfish/Index/Stream/Finger.pm
@@ -2,7 +2,9 @@
use strict;
use warnings;
-# New stream finger
+# THIS IS CURRENTLY UNUSED!
+
+# Constructor
sub new {
my $class = shift;
bless {
@@ -13,6 +15,7 @@
}, $class;
};
+
# Forward in posting stream
sub next {
my $self = shift;
@@ -24,19 +27,24 @@
};
-sub skip_to {
+# Skip to certain document in stream
+sub skip_doc {
...
};
+
+# Move to next position in stream
sub next_pos {
...
};
+# Move to next document in stream
sub next_doc {
...
};
-# Get the current posting object
+
+# Get the current posting
sub current {
my $self = shift;
return $self->{stream}->posting(@{$self->{current}});
@@ -57,4 +65,5 @@
$_[0]->{delta};
};
+
1;
diff --git a/lib/Krawfish/Index/Stream/Span.pm b/lib/Krawfish/Index/Stream/Span.pm
index 05de891..6167956 100644
--- a/lib/Krawfish/Index/Stream/Span.pm
+++ b/lib/Krawfish/Index/Stream/Span.pm
@@ -4,6 +4,8 @@
use strict;
use warnings;
+# THIS IS CURRENTLY UNUSED!
+
# This is a PostingsList-Example for Spans
# Add entry to bitstream
@@ -83,6 +85,7 @@
};
+# Get token posting
sub posting {
shift;
return Krawfish::Posting::Token->new(@_);
diff --git a/lib/Krawfish/Index/Tokens.pm b/lib/Krawfish/Index/Tokens.pm
index fd2fce6..52f51ec 100644
--- a/lib/Krawfish/Index/Tokens.pm
+++ b/lib/Krawfish/Index/Tokens.pm
@@ -3,6 +3,8 @@
use strict;
use warnings;
+# THIS IS CURRENTLY UNUSED!
+
# There is one token list per tokenization
# The Token list has the following jobs:
@@ -43,6 +45,7 @@
...
};
+
# Get an array of end positions that are in the range of min/max
# Start with the lowest
sub extend_to_right {
@@ -51,6 +54,7 @@
...
};
+
# Check if the number of tokens between end and start
# is in the given range.
#
@@ -64,18 +68,24 @@
...
};
+
# Returns the number of tokens per foundry
sub freq_in_doc {
...
};
+
+# Get the maximum number of subtokens per token
sub max_subtokens {
...
};
-sub skip_doc {
+# Skip to target document
+sub skip_doc {
+ ...
};
+
1;
diff --git a/lib/Krawfish/Koral.pm b/lib/Krawfish/Koral.pm
index a61ce0d..6f822c0 100644
--- a/lib/Krawfish/Koral.pm
+++ b/lib/Krawfish/Koral.pm
@@ -12,12 +12,9 @@
use constant DEBUG => 0;
+# Parse a koral query object and transform to an
+# actual index query.
-
-
-# Parse a koral query and transform to an actual
-# index query.
-#
# Procession order for query and corpus:
# a) parse (cluster)
# b) normalize and finalize (cluster)
@@ -358,6 +355,26 @@
};
+# Stringification
+sub to_string {
+ my ($self, $id) = @_;
+ my $str = '';
+
+ my @list = ();
+
+ if ($self->compile) {
+ push @list, 'compile=[' . $self->compile->to_string($id) . ']';
+ };
+ if ($self->corpus) {
+ push @list, 'corpus=[' . $self->corpus->to_string($id) . ']';
+ };
+ if ($self->query) {
+ push @list, 'query=[' . $self->query->to_string($id) . ']';
+ };
+
+ return join(',', @list);
+};
+
@@ -411,26 +428,6 @@
};
-# Stringification
-sub to_string {
- my ($self, $id) = @_;
- my $str = '';
-
- my @list = ();
-
- if ($self->compile) {
- push @list, 'compile=[' . $self->compile->to_string($id) . ']';
- };
- if ($self->corpus) {
- push @list, 'corpus=[' . $self->corpus->to_string($id) . ']';
- };
- if ($self->query) {
- push @list, 'query=[' . $self->query->to_string($id) . ']';
- };
-
- return join(',', @list);
-};
-
1;
diff --git a/lib/Krawfish/Koral/Compile.pm b/lib/Krawfish/Koral/Compile.pm
index bc2f97a..15f3c9c 100644
--- a/lib/Krawfish/Koral/Compile.pm
+++ b/lib/Krawfish/Koral/Compile.pm
@@ -4,10 +4,13 @@
use strict;
use warnings;
+# Creation of compilation query
+
# WARNING! / TODO!
# An enrichment for fields or snippets (better any enrichments)
# can never wrap around a presort query, because the relevant
-# data structures and algorithms require the results to be in doc_id order!
+# data structures and algorithms require the results to be in
+# doc_id order!
# WARNING!
# It's important to remember that sortFilter can't be shared in parallel
@@ -35,22 +38,28 @@
filter => 7
);
+
use constant {
DEBUG => 0,
UNIQUE_FIELD => 'id'
};
+
+# Constructor
sub new {
my $class = shift;
bless [@_], $class;
};
+
+# Stringification
sub to_string {
my ($self, $id) = @_;
return join(',', map { $_->to_string($id) } $self->operations);
};
+# Get builder object
sub builder {
return Krawfish::Koral::Compile::Builder->new;
};
@@ -89,14 +98,14 @@
};
# Add unique sorting per default - unless it's a group query
- #unless ($group_query) {
- # push @compile,
- # $mb->sort_by($mb->s_field(UNIQUE_FIELD));
+ # unless ($group_query) {
+ # push @compile,
+ # $mb->sort_by($mb->s_field(UNIQUE_FIELD));
#
- # if (DEBUG) {
- # print_log('kq_compile', 'Added unique field ' . UNIQUE_FIELD . ' to order');
- # };
- #};
+ # if (DEBUG) {
+ # print_log('kq_compile', 'Added unique field ' . UNIQUE_FIELD . ' to order');
+ # };
+ # };
# 1. Introduce required information
@@ -243,11 +252,13 @@
};
+# Send to segments
sub to_segment {
...
};
+# Optimize query
sub optimize {
...
};
diff --git a/lib/Krawfish/Koral/Compile/Aggregate.pm b/lib/Krawfish/Koral/Compile/Aggregate.pm
index bbaa62d..3ea165d 100644
--- a/lib/Krawfish/Koral/Compile/Aggregate.pm
+++ b/lib/Krawfish/Koral/Compile/Aggregate.pm
@@ -18,12 +18,15 @@
'values' => 4
);
+
+# Constructor
sub new {
my $class = shift;
bless [@_], $class;
};
+# Aggregation type
sub type {
'aggregate';
};
@@ -107,6 +110,7 @@
};
+# Stringification
sub to_string {
my ($self, $id) = @_;
return 'aggr=[' . join(',', map { $_->to_string($id) } @$self) . ']';
diff --git a/lib/Krawfish/Koral/Compile/Builder.pm b/lib/Krawfish/Koral/Compile/Builder.pm
index 00ea651..ae4115d 100644
--- a/lib/Krawfish/Koral/Compile/Builder.pm
+++ b/lib/Krawfish/Koral/Compile/Builder.pm
@@ -27,11 +27,8 @@
use Krawfish::Koral::Compile::Type::Key;
use Scalar::Util qw/blessed/;
-sub new {
- my $class = shift;
- bless [], $class;
-};
-
+# Build compile query
+#
# $koral->compile(
# $mb->aggregate(
# $mb->a_frequencies,
@@ -47,11 +44,20 @@
# $mb->fields('author')
# $mb->snippet('')
+# Constructor
+sub new {
+ my $class = shift;
+ bless [], $class;
+};
+
+
+# Aggregate
sub aggregate {
my $self = shift;
return Krawfish::Koral::Compile::Aggregate->new(@_);
};
+
# Some aggregation types
# Aggregate frequencies
sub a_frequencies {
@@ -131,12 +137,17 @@
};
+# Enrich with span context
sub e_span_context {
shift;
my ($term, $count) = @_;
- return Krawfish::Koral::Compile::Enrich::Snippet::Context::Span->new($term, $count);
+ return Krawfish::Koral::Compile::Enrich::Snippet::Context::Span->new(
+ $term,
+ $count
+ );
};
+
# Enrich with Term lists per class
sub e_terms {
shift;
@@ -193,7 +204,8 @@
};
# TODO:
-# s_class (sort by the surface form of a class, necessary for concordances)
+# s_class
+# (sort by the surface form of a class, necessary for concordances)
sub limit {
diff --git a/lib/Krawfish/Koral/Corpus.pm b/lib/Krawfish/Koral/Corpus.pm
index 4bdc481..eb1128f 100644
--- a/lib/Krawfish/Koral/Corpus.pm
+++ b/lib/Krawfish/Koral/Corpus.pm
@@ -9,8 +9,9 @@
use constant DEBUG => 0;
-# Creation of virtual corpus
+# Base object for virtual corpus queries
+# Constructor
sub new {
my $class = shift;
bless {}, $class;
@@ -33,6 +34,7 @@
};
+# Get operands
sub operands {
my $self = shift;
if (@_) {
@@ -59,7 +61,7 @@
# Optimize for an index
sub optimize {
- ...
+ warn 'override';
};
@@ -128,6 +130,7 @@
};
+# Corpus is negative
sub is_negative {
my $self = shift;
if (scalar @_ == 1) {
@@ -137,6 +140,7 @@
};
+# Toggle negativity of corpus
sub toggle_negative {
my $self = shift;
$self->is_negative($self->is_negative ? 0 : 1);
@@ -155,12 +159,6 @@
# Matches nowhere
-# (in the sequence sense of "der >alte*< Mann")
-sub is_null {
- 0;
-};
-
-
sub is_nowhere {
my $self = shift;
if (defined $_[0]) {
@@ -170,7 +168,17 @@
};
-sub is_leaf { 0 };
+# Matches nowhere
+# (in the sequence sense of "der >alte*< Mann")
+sub is_null {
+ 0;
+};
+
+
+# Query is leaf
+sub is_leaf {
+ 0;
+};
# Create KoralQuery builder
@@ -183,18 +191,25 @@
# Query Application methods #
#############################
+# Deserialize
sub from_koral {
- ...
+ warn 'override';
};
+
+# Serialize
sub to_koral_fragment {
- ...
+ warn 'override';
};
+
+# Stringification
sub to_string {
- ...
+ warn 'override';
};
+
+# Serialize to neutral string
sub to_neutral {
$_[0]->to_string;
};
diff --git a/lib/Krawfish/Koral/Document.pm b/lib/Krawfish/Koral/Document.pm
index 4e75325..ac41d21 100644
--- a/lib/Krawfish/Koral/Document.pm
+++ b/lib/Krawfish/Koral/Document.pm
@@ -19,13 +19,12 @@
# the keys will be translated to term_ids and the document
# can be added with all freq_in_doc information
+# foundry and layer may need separate term_ids
+# so they are exceptionally small.
# TODO:
# Don't forget to deal with TUIs!
-# foundry and layer may need separated term_ids so they are exceptional small.
-
-
use constant DEBUG => 0;
# Parse the document and create an inverted index file
@@ -108,14 +107,11 @@
# In that way it's faster to retrieve presorted fields
# for enrichment!
-
# Prepare field for sorting
- #if ($field->{sortable}) {
-
- # Which entries need to be sorted?
- # $sortable{$field->{key}}++;
- #};
-
+ # if ($field->{sortable}) {
+ # # Which entries need to be sorted?
+ # $sortable{$field->{key}}++;
+ # };
# Prepare for summarization
if (!$field->{type} || $field->{type} eq 'type:string') {
@@ -360,26 +356,3 @@
__END__
-
-
-
-sub to_list {
- my ($self, $doc_id, $replicant_id) = @_;
-};
-
-
-sub add {
- # This will add the doc_id to id-field and
- # this will add the replicant field (either __1:1 or __2:node_name).
-};
-
-
-sub to_forward_index {
- # Only works after identification!
- # This should, however, use a K::I::Store class!
-};
-
-
-1;
-
-__END__
diff --git a/lib/Krawfish/Koral/Info.pm b/lib/Krawfish/Koral/Info.pm
index 652e793..37320e6 100644
--- a/lib/Krawfish/Koral/Info.pm
+++ b/lib/Krawfish/Koral/Info.pm
@@ -9,6 +9,7 @@
use constant DEBUG => 0;
+
# Add error
sub error {
my $self = shift;
@@ -16,18 +17,23 @@
return $self->_info('error', @_);
};
+
+# Add warning
sub warning {
my $self = shift;
print_log('info', 'Warning: ' . join(' ', @_)) if DEBUG;
return $self->_info('warning', @_);
};
+
+# Add message
sub message {
my $self = shift;
print_log('info', 'Message: ' . join(' ', @_)) if DEBUG;
return $self->_info('message', @_);
};
+
# Is there an error?
sub has_error {
return 1 if $_[0]->{error};
@@ -50,6 +56,7 @@
# Copy information from another object
+# (may also be called as a plain function)
sub copy_info_from {
my ($self, $obj) = @_;
@@ -76,12 +83,14 @@
};
+# Merge infos with a new object
sub merge_info {
my ($self, $target) = @_;
copy_info_from($target, $self);
};
+# Information
sub _info {
my $self = shift;
my ($type, $code, $msg, @param) = @_;
@@ -92,4 +101,5 @@
return $self;
};
+
1;
diff --git a/lib/Krawfish/Koral/Query.pm b/lib/Krawfish/Koral/Query.pm
index 8622ad0..37bd848 100644
--- a/lib/Krawfish/Koral/Query.pm
+++ b/lib/Krawfish/Koral/Query.pm
@@ -9,6 +9,8 @@
use warnings;
use strict;
+# Base class for span queries
+
# TODO:
# - extended_* may be queried
# automatically without parameter
@@ -17,11 +19,14 @@
# TODO:
# This is now double with Krawfish::Koral!
+
use constant {
CONTEXT => 'http://korap.ids-mannheim.de/ns/koral/0.6/context.jsonld',
DEBUG => 0
};
+
+# Constructor
sub new {
my $class = shift;
my $self = bless {
@@ -42,8 +47,9 @@
};
+# Override type
sub type {
- ...
+ warn 'override';
};
@@ -52,10 +58,9 @@
#########################################
-
# Normalize the query
sub normalize {
- ...
+ warn 'override';
};
@@ -65,7 +70,7 @@
};
-
+# Translate to ids
# TODO:
# If "nowhere" returns, optimize away
# before ->optimize().
@@ -81,8 +86,6 @@
};
-
-
# Check for cached subqueries
sub cache {
$_[0];
@@ -162,7 +165,7 @@
# Returns a list of classes used by the query,
# e.g. in a focus() context.
sub uses_classes {
- ...
+ warn 'override';
};
@@ -218,20 +221,6 @@
};
-#sub replace_references {
-# my ($self, $refs) = @_;
-# my $sig = $self->signature;
-#
-# # Subquery is identical to given query
-# if ($refs->{$sig}) {
-# ...
-# }
-# else {
-# $refs->{$sig} = $self->operand;
-# };
-#};
-
-
# Matches everything
sub is_anywhere {
my $self = shift;
@@ -242,7 +231,6 @@
};
-
# Is optional
sub is_optional {
my $self = shift;
@@ -339,14 +327,14 @@
# Get the minimum tokens the query spans
sub min_span {
- ...
+ warn 'override';
};
# Get the maximum tokens the query spans
# -1 means arbitrary
sub max_span {
- ...
+ warn 'override';
};
@@ -386,12 +374,13 @@
};
-# Overwritten
+# Serialize
sub to_koral_fragment {
- ...
+ warn 'override';
};
+# Serialize
sub to_koral_query {
my $self = shift;
my $koral = $self->to_koral_fragment;
@@ -399,9 +388,10 @@
$koral;
};
-# Overwritten
+
+# Stringification
sub to_string {
- ...
+ warn 'override';
};
@@ -420,7 +410,7 @@
# TODO: Returns a value of complexity of the query,
# that can be used to decide, if a query should be cached.
sub complexity {
- ...
+ warn 'override';
};
@@ -448,8 +438,18 @@
};
+# sub replace_references {
+# my ($self, $refs) = @_;
+# my $sig = $self->signature;
+#
+# # Subquery is identical to given query
+# if ($refs->{$sig}) {
+# ...
+# }
+# else {
+# $refs->{$sig} = $self->operand;
+# };
+# };
+
+
1;
-
-
-__END__
-
diff --git a/lib/Krawfish/Log.pm b/lib/Krawfish/Log.pm
index e60880a..9958de2 100644
--- a/lib/Krawfish/Log.pm
+++ b/lib/Krawfish/Log.pm
@@ -3,6 +3,8 @@
use strict;
use warnings;
+# Simple log mechanism
+
our @EXPORT = 'print_log';
sub print_log {
diff --git a/lib/Krawfish/Posting.pm b/lib/Krawfish/Posting.pm
index e83ca35..ed29d65 100644
--- a/lib/Krawfish/Posting.pm
+++ b/lib/Krawfish/Posting.pm
@@ -4,6 +4,10 @@
use strict;
use warnings;
+# Krawfish::Posting is the base class for all
+# span based postings.
+# May better be named "Krawfish::Posting::Span"
+
# Constructor
sub new {
my $class = shift;
@@ -64,6 +68,19 @@
};
+# Check if two postings are identical
+# WARNING:
+# This should compare payloads separately,
+# because classes may be in different order,
+# though resulting in identical postings
+sub same_as {
+ my ($self, $comp) = @_;
+ return unless $comp;
+ return $self->to_string eq $comp->to_string;
+};
+
+
+# Return all classes in the payload
sub get_classes {
my ($self, $nrs) = @_;
@@ -157,12 +174,4 @@
};
-# Check if two postings are identical
-sub same_as {
- my ($self, $comp) = @_;
- return unless $comp;
- return $self->to_string eq $comp->to_string;
-};
-
-
1;
diff --git a/lib/Krawfish/Posting/Bundle.pm b/lib/Krawfish/Posting/Bundle.pm
index cb0e5eb..7a33257 100644
--- a/lib/Krawfish/Posting/Bundle.pm
+++ b/lib/Krawfish/Posting/Bundle.pm
@@ -28,6 +28,7 @@
current => undef
}, shift;
+ # Add passed items
foreach (@_) {
unless ($self->add($_)) {
warn "$_ is not a valid match object";
@@ -111,7 +112,7 @@
};
-# Stringify bundle
+# Stringification
sub to_string {
my $self = shift;
return '[' . join ('|', map { $_->to_string } @{$self->{list}}) . ']';
@@ -120,7 +121,8 @@
# The bundle may contain multiple items and these
# items may contain bundles.
-# Current will contain a single posting that may become a match.
+# Current will contain a single posting that may
+# become a match.
sub current {
return $_[0]->{current};
};
@@ -205,13 +207,18 @@
return 1;
};
+
+# Reset internal position in bundle
sub reset {
$_[0]->{pos} = -1;
};
+
+# Get item in list
sub item {
my ($self, $item) = @_;
$self->{list}->[$item];
};
+
1;
diff --git a/lib/Krawfish/Posting/Data.pm b/lib/Krawfish/Posting/Data.pm
index b0ebf17..30c9e7e 100644
--- a/lib/Krawfish/Posting/Data.pm
+++ b/lib/Krawfish/Posting/Data.pm
@@ -3,15 +3,24 @@
use strict;
use warnings;
+# Represent arbitrary posting data, that may be cast to
+# other posting types
+
+
+# Constructor
sub new {
my ($class, $data) = @_;
bless [@$data], $class;
};
+
+# Document id in posting
sub doc_id {
$_[0]->[0];
};
+
+# Stringification
sub to_string {
my $self = shift;
my $str = '[' . $self->doc_id;
diff --git a/lib/Krawfish/Posting/Doc.pm b/lib/Krawfish/Posting/Doc.pm
index 9a69af5..4d0a69b 100644
--- a/lib/Krawfish/Posting/Doc.pm
+++ b/lib/Krawfish/Posting/Doc.pm
@@ -2,8 +2,9 @@
use strict;
use warnings;
-# Make identical with DocWithFlags!
+# Document based posting
+# Constructor
sub new {
my $class = shift;
my $id = shift;
@@ -15,11 +16,16 @@
return ${$_[0]};
};
+
+# Get flags
sub flags {
};
+
+# Stringification
sub to_string {
'[' . ${$_[0]} . ']';
};
+
1;
diff --git a/lib/Krawfish/Posting/Forward.pm b/lib/Krawfish/Posting/Forward.pm
index fc65687..77b542d 100644
--- a/lib/Krawfish/Posting/Forward.pm
+++ b/lib/Krawfish/Posting/Forward.pm
@@ -3,12 +3,14 @@
use strict;
use warnings;
+# Posting in the Forward index
+
# API:
-# ->preceding_data # The whitespace data before the subtoken
-# ->subterm_id # The current subterm identifier
-# ->annotations # Get all annotations as terms
+# ->preceding_data # The whitespace data before the subtoken
+# ->subterm_id # The current subterm identifier
+# ->annotations # Get all annotations as terms
# ->annotations(
-# foundry # TODO: Think of more complex options!
+# foundry # TODO: Think of more complex options!
# )
# TODO:
@@ -27,6 +29,7 @@
use constant DEBUG => 0;
+
# Constructor
sub new {
my $class = shift;
@@ -158,6 +161,7 @@
return \@anno;
};
+
# Stringification
sub to_string {
my $str = '[' . ($_[0]->doc_id // '?') . ':#' . $_[0]->term_id;
diff --git a/lib/Krawfish/Posting/List.pm b/lib/Krawfish/Posting/List.pm
index 29ab674..aa73d42 100644
--- a/lib/Krawfish/Posting/List.pm
+++ b/lib/Krawfish/Posting/List.pm
@@ -3,7 +3,7 @@
use warnings;
use strict;
-# This is a sorted bundle of matches.
+# This is a sorted bundle of postings.
sub matches {
return $_[0]->size;
diff --git a/lib/Krawfish/Posting/Payload.pm b/lib/Krawfish/Posting/Payload.pm
index 5c63000..25b7ae8 100644
--- a/lib/Krawfish/Posting/Payload.pm
+++ b/lib/Krawfish/Posting/Payload.pm
@@ -4,6 +4,10 @@
use warnings;
use Scalar::Util qw/blessed/;
+
+# Class representing payload data
+
+
use constant {
PTI_CLASS => 0
};
@@ -12,30 +16,39 @@
@EXPORT = qw/PTI_CLASS/;
+
+# Constructor
sub new {
my $class = shift;
bless [], $class;
};
+
+# Get length of payload
sub length {
scalar @{$_[0]};
};
+
+# Copy data from other payload
sub copy_from {
- my $self = shift;
- my $payload = shift;
+ my ($self, $payload) = @_;
foreach (@$payload) {
$self->add(@$_);
};
return $self;
};
+
+# Add data to payload
sub add {
my $self = shift;
push @{$self}, [@_];
return $self;
};
+
+# Clone payload
sub clone {
my $self = shift;
my $new = __PACKAGE__->new;
@@ -46,7 +59,7 @@
};
-# Stringify
+# Stringification
sub to_string {
my $self = shift;
return join ('|', map { join(',', @{$_}) } @$self );
diff --git a/lib/Krawfish/Posting/Sorted.pm b/lib/Krawfish/Posting/Sorted.pm
deleted file mode 100644
index ce14570..0000000
--- a/lib/Krawfish/Posting/Sorted.pm
+++ /dev/null
@@ -1,27 +0,0 @@
-package Krawfish::Posting::Sorted;
-use parent 'Krawfish::Posting';
-use strict;
-use warnings;
-
-# Sorted may be bundled!
-# Probably use K::P::Bundle instead!
-
-# This posting iterator is returned by the HeapSort system.
-
-sub doc_id {
- ...
-};
-
-sub matches {
- ...
-};
-
-sub rank {
- ...
-};
-
-sub same {
- ...
-};
-
-1;
diff --git a/lib/Krawfish/Posting/Token.pm b/lib/Krawfish/Posting/Token.pm
index 49a8e26..d2bb9ee 100644
--- a/lib/Krawfish/Posting/Token.pm
+++ b/lib/Krawfish/Posting/Token.pm
@@ -3,6 +3,9 @@
use strict;
use warnings;
+# Posting representing a single token
+
+# Constructor
sub new {
my $class = shift;
bless [@_], $class;
diff --git a/lib/Krawfish/Query.pm b/lib/Krawfish/Query.pm
index e87eecb..f28fb40 100644
--- a/lib/Krawfish/Query.pm
+++ b/lib/Krawfish/Query.pm
@@ -4,6 +4,9 @@
use strict;
use warnings;
+
+# Krawfish::Query is the base class for all span queries.
+
use constant DEBUG => 0;
# Current span object
@@ -21,42 +24,22 @@
};
+# Move to next posting
# Overwrite
-# TODO: Accepts a target doc
-# TODO: Returns the doc_id of the current posting
+# Returns true if advancing succeeded
sub next {
...
};
-# Clone query
-sub clone {
- warn $_[0];
- ...
-};
-
-
-# Per default every operation is complex
-sub complex {
- return 1;
-};
-
-
-# TODO:
-# This is a value that should probably be stored
-# at span-beginnings and can help to jump through very long
-# sequences of spans
-sub max_length {
- ...
-};
-
-
# This is only relevant for term posting lists
sub next_doc {
my $self = shift;
my $current_doc_id = $self->current->doc_id;
- print_log('query', refaddr($self) . ": go to next doc following $current_doc_id") if DEBUG;
+ if (DEBUG) {
+ print_log('query', refaddr($self) . ": go to next doc following $current_doc_id");
+ };
do {
$self->next or return;
@@ -66,39 +49,41 @@
};
-sub freq_in_doc {
- warn 'freq_in_doc only supported for term queries (see PostingPointer)';
-};
-
-
-# Skip to (or beyond) a certain document id
+# Overwrite
+# Skip to (or beyond) a certain doc id.
+# This should be overwritten by more efficient methods.
sub skip_doc {
- my ($self, $doc_id) = @_;
+ my ($self, $target_doc_id) = @_;
- print_log('query', refaddr($self) . ': skip to doc id ' . $doc_id) if DEBUG;
+ print_log('query', refaddr($self) . ': skip to doc id ' . $target_doc_id) if DEBUG;
- while (!$self->current || $self->current->doc_id < $doc_id) {
+ while (!$self->current || $self->current->doc_id < $target_doc_id) {
$self->next_doc or return;
};
+ # TODO:
+ # Return NOMORE in case no more
+ # documents exist
return $self->current->doc_id;
};
-# Skip to (or beyond) a certain position
+# Skip to (or beyond) a certain position in the doc.
# Returns true, if the new current is positioned
# in the same document beyond the given pos.
# Otherwise returns false.
+# TODO:
+# This behaviour should be improved!
sub skip_pos {
- my ($self, $pos) = @_;
+ my ($self, $target_pos) = @_;
my $current = $self->current or return;
my $doc_id = $current->doc_id;
while (($current = $self->current) && $current->doc_id == $doc_id) {
- if ($current->start < $pos) {
+ if ($current->start < $target_pos) {
print_log('query', "Skip " . $current->to_string .
- " to pos $pos in doc id $doc_id") if DEBUG;
+ " to pos $target_pos in doc id $doc_id") if DEBUG;
$self->next;
}
else {
@@ -129,7 +114,7 @@
# Forward the second span to advance to the document of the first span
else {
- print_log('filter', 'Forward second') if DEBUG;
+ print_log('query', 'Forward second') if DEBUG;
$second->skip_doc($first_c->doc_id) or return;
$second_c = $second->current;
};
@@ -138,16 +123,36 @@
return 1;
};
-# In Lucene it's exemplified:
-# int advance(int target) {
-# int doc;
-# while ((doc = nextDoc()) < target) {
-# }
-# return doc;
-# }
+
+# Clone query
+# (Not implemented yet)
+sub clone {
+ warn $_[0];
+ ...
+};
-# The maximum possible frequency of the query
+# Per default every operation is complex
+sub complex {
+ return 1;
+};
+
+
+# TODO:
+# This is a value that should probably be stored
+# at span-beginnings and can help to jump through very long
+# sequences of spans
+sub max_length {
+ ...
+};
+
+
+sub freq_in_doc {
+ warn 'freq_in_doc only supported for term queries (see PostingPointer)';
+};
+
+
+# Get maximum possible frequency of the query
sub max_freq {
warn 'Not implemented for this query: ' . blessed $_[0];
};
@@ -158,13 +163,16 @@
warn 'Not implemented by default';
};
+
+# Stringification
# Overwrite
sub to_string {
...
};
-# Override in Krawfish::Collection
+# Get current match
+# Override
sub current_match {
return undef;
};
diff --git a/lib/Krawfish/Query/Base/Dual.pm b/lib/Krawfish/Query/Base/Dual.pm
index 79e1518..6ddf3f2 100644
--- a/lib/Krawfish/Query/Base/Dual.pm
+++ b/lib/Krawfish/Query/Base/Dual.pm
@@ -9,6 +9,8 @@
our @EXPORT;
+# Base query for combination query with two operands
+
# TODO:
# Wrap second query in a buffered query instead of
# dealing with buffer resizing etc. here!
@@ -32,6 +34,8 @@
@EXPORT = qw/NEXTA NEXTB MATCH/;
+
+# Constructor
sub new {
my $class = shift;
bless {
@@ -43,7 +47,7 @@
# Initialize both spans
-sub init {
+sub _init {
return if $_[0]->{init}++;
if (DEBUG) {
print_log('dual', 'Init dual spans: ' . $_[0]->{first}->to_string . ' and ' .
@@ -55,10 +59,10 @@
};
-# This will advance the two spans
+# Move to next posting
sub next {
my $self = shift;
- $self->init;
+ $self->_init;
my ($first, $second);
@@ -111,15 +115,12 @@
return;
};
-
# There is a first and a second operand
-
# TODO:
# Check if second may not be at the end
# of the buffer
-
# Both elements are in the same document
if ($first->doc_id == $second->doc_id) {
@@ -383,5 +384,3 @@
1;
-
-__END__
diff --git a/lib/Krawfish/Query/Base/Sorted.pm b/lib/Krawfish/Query/Base/Sorted.pm
index f2dcb51..81579db 100644
--- a/lib/Krawfish/Query/Base/Sorted.pm
+++ b/lib/Krawfish/Query/Base/Sorted.pm
@@ -6,6 +6,8 @@
use constant DEBUG => 0;
+# Base query for queries that may be unsorted.
+
# TODO:
# Implement using Krawfish::Util::Heap
@@ -25,6 +27,8 @@
# Elements have:
# <size><data>
+
+# Constructor
sub new {
my $class = shift;
bless {
@@ -40,7 +44,7 @@
};
-# Next sorted element
+# Move to next sorted posting
sub next {
my $self = shift;
@@ -86,11 +90,13 @@
return $self->buffer_shift;
};
+
# Return index to last added element
sub buffer_last {
...
};
+
# Points to the latest freed element in the buffer
# (normally this is -1 to first)
sub buffer_recent {
@@ -103,17 +109,20 @@
return $_[0]->{first};
};
-# sub buffer_push;
+
sub buffer_shift {
...
};
+
sub buffer_get {
...
};
+
sub buffer_insert_after {
my ($self, $index, $element) = @_;
};
+
1;
diff --git a/lib/Krawfish/Query/Cache.pm b/lib/Krawfish/Query/Cache.pm
index db882ef..af32043 100644
--- a/lib/Krawfish/Query/Cache.pm
+++ b/lib/Krawfish/Query/Cache.pm
@@ -5,6 +5,8 @@
use strict;
use warnings;
+# Cache implementation for queries
+
sub new {
my $class = shift;
my $self = bless {
@@ -15,18 +17,25 @@
return $self;
};
+
+# Move to next posting
# The doc_ids are not stored as deltas,
# so sorting with offstes is supported
sub next {
...
};
+
+# Get maximum frequency
sub max_freq {
...
};
+
+# Clone query
sub clone {
...
};
+
1;
diff --git a/lib/Krawfish/Query/Class.pm b/lib/Krawfish/Query/Class.pm
index bd976fb..b01943e 100644
--- a/lib/Krawfish/Query/Class.pm
+++ b/lib/Krawfish/Query/Class.pm
@@ -6,6 +6,8 @@
use constant DEBUG => 0;
+
+# Constructor
sub new {
my $class = shift;
bless {
@@ -14,6 +16,8 @@
}, $class;
};
+
+# Clone query
sub clone {
my $self = shift;
__PACKAGE__->new(
@@ -23,6 +27,7 @@
};
+# Move to next posting
sub next {
my $self = shift;
@@ -51,11 +56,13 @@
};
+# Get maximum frequency
sub max_freq {
$_[0]->{span}->max_freq;
};
+# Stringification
sub to_string {
my $self = shift;
my $str = 'class(';
@@ -65,10 +72,12 @@
};
+# Filter query by VC
sub filter_by {
my ($self, $corpus) = @_;
$self->{span} = $self->{span}->filter_by($corpus);
return $self;
};
+
1;
diff --git a/lib/Krawfish/Query/Constraint/ClassDistance.pm b/lib/Krawfish/Query/Constraint/ClassDistance.pm
index 28e0c24..a5da8e2 100644
--- a/lib/Krawfish/Query/Constraint/ClassDistance.pm
+++ b/lib/Krawfish/Query/Constraint/ClassDistance.pm
@@ -3,20 +3,27 @@
use warnings;
# This is no real check,
-# it simply marks the distance between two spans using a class payload
+# it simply marks the distance between two spans
+# using a class payload
+
+# Constructor
sub new {
my $class = shift;
my $nr = shift;
bless \$nr, $class;
};
+
+# Clone query
sub clone {
__PACKAGE__->new(
${$_[0]}
);
};
+
+# Check configuration
sub check {
my $self = shift;
my ($first, $second) = @_;
@@ -44,6 +51,8 @@
return 0b0111;
};
+
+# Stringification
sub to_string {
'class=' . (0 + ${$_[0]});
};
diff --git a/lib/Krawfish/Query/Constraint/Depth.pm b/lib/Krawfish/Query/Constraint/Depth.pm
index 1d3408c..cf8d88c 100644
--- a/lib/Krawfish/Query/Constraint/Depth.pm
+++ b/lib/Krawfish/Query/Constraint/Depth.pm
@@ -4,7 +4,7 @@
use strict;
use warnings;
-# TODO: THIS IS CURRENTLY JUST A MOCKUP
+# THIS IS CURRENTLY JUST A MOCKUP
# Check the nodes for depth
# The relevant information is in the last added
@@ -14,6 +14,8 @@
# direct parent: min==max=-1
# ancestor: min=0, max=256
+
+# Constructor
sub new {
my $class = shift;
bless {
@@ -23,6 +25,7 @@
};
+# Clone query
sub clone {
__PACKAGE__->new(
$_[0]->{min},
@@ -30,7 +33,8 @@
);
};
-# Overwrite
+
+# Check configuration
sub check {
my $self = shift;
my ($first, $second) = @_;
@@ -43,4 +47,5 @@
return NEXTA | NEXTB;
};
+
1;
diff --git a/lib/Krawfish/Query/Constraint/InBetween.pm b/lib/Krawfish/Query/Constraint/InBetween.pm
index c5c36bb..bbccf2d 100644
--- a/lib/Krawfish/Query/Constraint/InBetween.pm
+++ b/lib/Krawfish/Query/Constraint/InBetween.pm
@@ -12,16 +12,17 @@
# the inbetweens.
#
# Example:
-# [orth=Der][opennlp]{2,3}[orth=Mann]
-# To not allow gaps, use
-# [orth=Der][opennlp]{!2,3}[orth=Mann]
+# [orth=Der][opennlp]{2,3}[orth=Mann]
+# To not allow gaps, use
+# [orth=Der][opennlp]{!2,3}[orth=Mann]
-# TODO: Order may not be defined!
-
# TODO:
-# If min=0, a shortcircuit result is returned and following
-# constraints are ignored
+# Order may not be defined!
+
+# TODO:
+# If min=0, a shortcircuit result is returned
+# and following constraints are ignored
use constant {
NEXTA => 1,
@@ -44,6 +45,8 @@
}, $class;
};
+
+# Clone query
sub clone {
my $self = shift;
__PACKAGE__->new(
@@ -54,8 +57,9 @@
);
};
+
# Initialize foundry
-sub init {
+sub _init {
# If foundry is set, load token class and receive
# max_subtokens
...
@@ -115,6 +119,3 @@
1;
-
-
-__END__
diff --git a/lib/Krawfish/Query/Constraint/InDistanceSpan.pm b/lib/Krawfish/Query/Constraint/InDistanceSpan.pm
index 7df9472..227614b 100644
--- a/lib/Krawfish/Query/Constraint/InDistanceSpan.pm
+++ b/lib/Krawfish/Query/Constraint/InDistanceSpan.pm
@@ -12,6 +12,8 @@
MATCH => 4,
};
+
+# Constructor
sub new {
my $class = shift;
bless {
@@ -22,16 +24,21 @@
}, $class;
};
+
+# Clone constraint
sub clone {
...
};
+
+# Initialize
sub _init {
return if $_[0]->{init}++;
print_log('c_dist', 'Init distance span') if DEBUG;
$_[0]->{span}->next;
};
+
# Check the configuration
sub check {
my $self = shift;
@@ -79,4 +86,5 @@
# if ($first->end > $current->end)
};
+
1;
diff --git a/lib/Krawfish/Query/Constraint/NotBetween.pm b/lib/Krawfish/Query/Constraint/NotBetween.pm
index fac7543..393a1b6 100644
--- a/lib/Krawfish/Query/Constraint/NotBetween.pm
+++ b/lib/Krawfish/Query/Constraint/NotBetween.pm
@@ -6,7 +6,8 @@
# Check, if a negative token is in between.
# Like [orth=Der][orth!=alte][orth=Mann].
#
-# TODO: Support optional flag
+# TODO:
+# Support optional flag
use constant {
NEXTA => 1,
@@ -18,6 +19,7 @@
};
+# Constructor
sub new {
my $class = shift;
bless {
@@ -27,14 +29,16 @@
};
+# Clone query
sub clone {
__PACKAGE__->new(
$_[0]->{query}->clone
);
};
-# Initialize in-between query
-sub init {
+
+# Initialize
+sub _init {
my $self = shift;
return if $self->{init}++;
print_log('notC', 'Init notBetween query') if DEBUG;
@@ -42,6 +46,7 @@
};
+# Check configuration
sub check {
my $self = shift;
my ($first, $second, $payload) = @_;
@@ -53,7 +58,7 @@
$second = $temp;
};
- $self->init;
+ $self->_init;
# TODO:
# Use buffer API here
@@ -132,6 +137,8 @@
return ALL_MATCH;
};
+
+# Stringification
sub to_string {
my $self = shift;
'notBetween=' . $self->{query}->to_string;
diff --git a/lib/Krawfish/Query/Constraint/Position.pm b/lib/Krawfish/Query/Constraint/Position.pm
index 9a3ea7e..46fb5d0 100644
--- a/lib/Krawfish/Query/Constraint/Position.pm
+++ b/lib/Krawfish/Query/Constraint/Position.pm
@@ -44,6 +44,7 @@
our (@EXPORT, @next_a, @next_b);
+# Constructor
sub new {
my $class = shift;
bless {
@@ -51,6 +52,8 @@
}, $class;
};
+
+# Clone constraint
sub clone {
__PACKAGE__->new(
$_[0]->{frames}
@@ -452,6 +455,7 @@
};
+# Stringification
sub to_string {
'pos=' . (0 + $_[0]->{frames});
};
diff --git a/lib/Krawfish/Query/Constraints.pm b/lib/Krawfish/Query/Constraints.pm
index 15c7fc9..b68ad6d 100644
--- a/lib/Krawfish/Query/Constraints.pm
+++ b/lib/Krawfish/Query/Constraints.pm
@@ -6,6 +6,14 @@
use strict;
use warnings;
+# TODO:
+# Improve by skipping to the same document
+#
+# TODO:
+# The check probably needs more than just the span
+# information, e.g. to get the max_length() of
+# a span for skip_pos() stuff.
+
use constant {
NEXTA => 1,
NEXTB => 2,
@@ -14,11 +22,8 @@
DEBUG => 0
};
-# TODO: Improve by skipping to the same document
-# TODO:
-# The check probably needs more than just the span information,
-# e.g. to get the max_length() of a span for skip_pos() stuff.
+# Constructor
sub new {
my $class = shift;
bless {
@@ -27,12 +32,14 @@
second => shift,
# TODO:
- # Second operand should be nested in buffer by Dual
+ # Second operand should be nested
+ # in buffer by Dual
buffer => Krawfish::Util::Buffer->new
}, $class;
};
+# Clone query
sub clone {
my $self = shift;
__PACKAGE__->new(
@@ -43,7 +50,6 @@
};
-
# Check all constraints sequentially
sub check {
my $self = shift;
@@ -97,7 +103,7 @@
};
-# The maximum frequency is the minimum of both query frequencies
+# Get maximum frequency of query
sub max_freq {
my $self = shift;
min($self->{first}->max_freq, $self->{second}->max_freq);
@@ -122,6 +128,7 @@
};
+# Stringification
sub to_string {
my $self = shift;
my $str = 'constr(';
diff --git a/lib/Krawfish/Query/Exclusion.pm b/lib/Krawfish/Query/Exclusion.pm
index 10fee76..6b61dee 100644
--- a/lib/Krawfish/Query/Exclusion.pm
+++ b/lib/Krawfish/Query/Exclusion.pm
@@ -130,13 +130,13 @@
};
-# Return the maximum frequency of the first operand
+# Get maximum frequency
sub max_freq {
$_[0]->{first}->max_freq;
};
-# Filter exclusion by a corpus
+# Filter query by a VC
sub filter_by {
my ($self, $corpus) = @_;
diff --git a/lib/Krawfish/Query/Extension.pm b/lib/Krawfish/Query/Extension.pm
index 2a8b504..ed8a250 100644
--- a/lib/Krawfish/Query/Extension.pm
+++ b/lib/Krawfish/Query/Extension.pm
@@ -9,6 +9,8 @@
#
# Support gaps like with Constraint::InBetween
+
+# Constructor
sub new {
my $class = shift;
bless {
@@ -21,15 +23,19 @@
};
+# Clone query
sub clone {
...
};
+
# Check the configuration
sub check {
...
};
+
+# Stringification
sub to_string {
my $self = shift;
my $string ='ext(';
@@ -39,14 +45,13 @@
};
-
+# Get maximum frequency
sub max_freq {
- # TODO:
- ...
+ $_[0]->{span}->max_freq;
};
-# Filter extension by a corpus
+# Filter query by VC
sub filter_by {
...
};
diff --git a/lib/Krawfish/Query/Filter.pm b/lib/Krawfish/Query/Filter.pm
index 2f572dc..4a24052 100644
--- a/lib/Krawfish/Query/Filter.pm
+++ b/lib/Krawfish/Query/Filter.pm
@@ -9,6 +9,8 @@
# Filters a term to check, if it is
# in a supported document
+
+# Constructor
sub new {
my $class = shift;
bless {
@@ -18,6 +20,7 @@
};
+# Clone query
sub clone {
my $self = shift;
__PACKAGE__->new(
@@ -26,8 +29,9 @@
);
};
+
# Initialize spans
-sub init {
+sub _init {
return if $_[0]->{init}++;
print_log('filter', 'Init filter spans') if DEBUG;
$_[0]->{span}->next;
@@ -39,7 +43,7 @@
sub next {
my $self = shift;
- $self->init;
+ $self->_init;
print_log('filter', 'Check next valid span') if DEBUG;
@@ -89,6 +93,8 @@
return 1;
};
+
+# Stringification
sub to_string {
my $self = shift;
my $str = 'filter(';
@@ -100,28 +106,21 @@
# Get the maximum frequency of the term
sub max_freq {
- my $self = shift;
- # my $freq = 0;
-
- # $self->init;
-
- # print_log('filter', 'Count valid spans') if DEBUG;
-
- # Iterate over all docs and collect frequencies
- #while ($self->{span}->same_doc($self->{docs})) {
- # $freq += $self->{span}->freq_in_doc;
- # $self->{span}->next_doc or last;
- #};
-
- return $self->{span}->max_freq;
+ return $_[0]->{span}->max_freq;
};
+# Filter query by VC
sub filter_by {
my ($self, $corpus) = @_;
- # TODO: Check always that the query isn't moved forward yet!
- $self->{docs} = Krawfish::Corpus::And->new($self->{docs}, $corpus->clone);
+ # TODO:
+ # Always check that the query hasn't
+ # been moved forward yet!
+ $self->{docs} = Krawfish::Corpus::And->new(
+ $self->{docs},
+ $corpus->clone
+ );
$self;
};
diff --git a/lib/Krawfish/Query/Length.pm b/lib/Krawfish/Query/Length.pm
index 3c52ff0..8f8458e 100644
--- a/lib/Krawfish/Query/Length.pm
+++ b/lib/Krawfish/Query/Length.pm
@@ -5,7 +5,8 @@
use constant DEBUG => 0;
-# TODO: This should respect different tokenizations!
+# TODO:
+# This should respect different tokenizations!
# Constructor
sub new {
@@ -19,6 +20,8 @@
}, $class;
};
+
+# Clone query
sub clone {
my $self = shift;
__PACKAGE__->new(
@@ -29,7 +32,8 @@
);
};
-# Overwrite
+
+# Move to next posting
sub next {
my $self = shift;
@@ -51,7 +55,12 @@
# min and max are identical
if ($self->{min} == $self->{max} && $length == $self->{min}) {
- print_log('length', "! Length $length has the length " . $self->{min}) if DEBUG;
+ if (DEBUG) {
+ print_log(
+ 'length',
+ "! Length $length has the length " . $self->{min}
+ );
+ };
$self->{current} = $current;
return 1;
@@ -93,11 +102,13 @@
};
+# Get maximum frequency
sub max_freq {
$_[0]->{span}->max_freq;
};
+# Filter query by VC
sub filter_by {
my ($self, $corpus) = @_;
$self->{span} = $self->{span}->filter_by($corpus);
diff --git a/lib/Krawfish/Query/Match.pm b/lib/Krawfish/Query/Match.pm
index ab51666..6ebffdf 100644
--- a/lib/Krawfish/Query/Match.pm
+++ b/lib/Krawfish/Query/Match.pm
@@ -4,8 +4,13 @@
use strict;
use warnings;
+
+# Get posting by doc id plus position and length.
+
use constant DEBUG => 0;
+
+# Constructor
sub new {
my $class = shift;
bless {
@@ -15,6 +20,8 @@
}, $class;
};
+
+# Clone query
sub clone {
my $self = shift;
__PACKAGE__->new(
@@ -24,7 +31,9 @@
);
};
-sub init {
+
+# Initialize
+sub _init {
return if $_[0]->{init}++;
if (DEBUG) {
print_log('match', 'Init ' . $_[0]->{doc}->to_string);
@@ -33,11 +42,11 @@
};
-# Forward to next match
+# Move to next posting
sub next {
my $self = shift;
- $self->init;
+ $self->_init;
print_log('match', 'Check next valid match') if DEBUG;
@@ -69,34 +78,45 @@
};
-# Match can only occur once (although this requires a filter!)
+# Get maximum frequency
sub max_freq {
+ # Match can only occur once
+ # (although this requires a filter!)
1;
};
+# Stringification
sub to_string {
my $self = shift;
return '[[' . $self->{doc}->to_string . ':' . $self->start . '-' . $self->end . ']]';
};
+# Get start position
sub start {
$_[0]->{start};
};
+# Get end position
sub end {
$_[0]->{end};
};
-# This is useful to, e.g., make sure the document is live
+# Filter query by VC
+# This is useful to, e.g.,
+# make sure the document is live
sub filter_by {
my ($self, $corpus) = @_;
- # TODO: Check always that the query isn't moved forward yet!
- $self->{doc} = Krawfish::Corpus::And->new($self->{doc}, $corpus->clone);
+ # TODO:
+ # Always check that the query hasn't been moved forward yet!
+ $self->{doc} = Krawfish::Corpus::And->new(
+ $self->{doc},
+ $corpus->clone
+ );
$self;
};
diff --git a/lib/Krawfish/Query/Nowhere.pm b/lib/Krawfish/Query/Nowhere.pm
index 6fb1577..b2cbf6c 100644
--- a/lib/Krawfish/Query/Nowhere.pm
+++ b/lib/Krawfish/Query/Nowhere.pm
@@ -10,31 +10,44 @@
bless \$var, $class;
};
+
+# Get current posting
sub current {
return;
};
+
+# Clone query
sub clone {
__PACKAGE__->new;
};
+# Move to next posting
sub next {
return;
};
+
+# Skip to target document (invalid)
sub skip_doc {
return;
};
+
+# Get maximum frequency
sub max_freq {
0
};
+
+# Stringification
sub to_string {
'[0]';
};
+
+# Filter query by VC (invalid)
sub filter_by {
return;
};
diff --git a/lib/Krawfish/Query/Or.pm b/lib/Krawfish/Query/Or.pm
index 17c4329..7e3fe92 100644
--- a/lib/Krawfish/Query/Or.pm
+++ b/lib/Krawfish/Query/Or.pm
@@ -6,6 +6,8 @@
use constant DEBUG => 0;
+
+# Constructor
sub new {
my $class = shift;
bless {
@@ -14,6 +16,8 @@
}, $class;
};
+
+# Clone query
sub clone {
my $self = shift;
__PACKAGE__->new(
@@ -22,7 +26,9 @@
);
};
-sub init {
+
+# Initialize
+sub _init {
return if $_[0]->{init}++;
if (DEBUG) {
print_log(
@@ -35,9 +41,11 @@
};
+# Move to next posting
sub next {
my $self = shift;
- $self->init;
+
+ $self->_init;
my $first = $self->{first}->current;
my $second = $self->{second}->current;
@@ -121,7 +129,7 @@
};
-# Maximum frequency
+# Get maximum frequency
sub max_freq {
my $self = shift;
@@ -150,7 +158,7 @@
};
-# Filter the query
+# Filter query by VC
sub filter_by {
my ($self, $corpus) = @_;
diff --git a/lib/Krawfish/Query/Reference.pm b/lib/Krawfish/Query/Reference.pm
index ab0cba1..89af754 100644
--- a/lib/Krawfish/Query/Reference.pm
+++ b/lib/Krawfish/Query/Reference.pm
@@ -18,6 +18,7 @@
# The ring buffer query is well suited for this.
+# Constructor
sub new {
my $class = shift;
@@ -32,29 +33,31 @@
};
-sub new {
- my $self = shift;
-};
-
-
+# Clone query
sub clone {
...
};
+
+# Move to next posting
sub next {
...
};
+# Stringification
sub to_string {
my $self = shift;
};
+
+# Get maximum frequency
sub max_freq {
...
};
+# Filter query by VC
sub filter_by {
...
};
diff --git a/lib/Krawfish/Query/Repetition.pm b/lib/Krawfish/Query/Repetition.pm
index 22610b2..3ca4634 100644
--- a/lib/Krawfish/Query/Repetition.pm
+++ b/lib/Krawfish/Query/Repetition.pm
@@ -8,13 +8,17 @@
use constant DEBUG => 0;
-# TODO: Support next_pos, in case current start position can not succeed
-# e.g. in case of position
+# TODO:
+# Support next_pos, in case current start
+# position can not succeed, e.g. in case of position
-# TODO: Support steps:
-# []{1,30,2}
-# means valid: [][], [][][][], [][][][][], ...
+# TODO:
+# Support steps:
+# []{1,30,2}
+# means valid: [][], [][][][], [][][][][], ...
+
+# Constructor
sub new {
my $class = shift;
bless {
@@ -26,6 +30,7 @@
};
+# Clone query
sub clone {
my $self = shift;
__PACKAGE__->new(
@@ -35,8 +40,9 @@
);
};
+
# Initialize spans and buffer
-sub init {
+sub _init {
return if $_[0]->{init}++;
$_[0]->{span}->next;
print_log('repeat', 'Init span') if DEBUG;
@@ -48,9 +54,11 @@
};
+# Move to next posting
sub next {
my $self = shift;
- $self->init;
+
+ $self->_init;
# Get the buffer
my $buffer = $self->{buffer};
@@ -175,6 +183,7 @@
};
+# Stringification
sub to_string {
my $self = shift;
my $str = 'rep(';
@@ -184,8 +193,10 @@
};
-# The maximum frequency is based on the occurrence of the span,
-# multiplied by the difference of min and max values, so
+# Get maximum frequency, based on the occurrence
+# of the span, multiplied by the number of valid
+# repetition counts (max - min + 1), so
+#
# freq([a]{3}) == freq([a])
# freq([a]{1,2}) == freq([a])*2
sub max_freq {
@@ -193,6 +204,8 @@
$self->{span}->max_freq * ($self->{max} - $self->{min} + 1)
};
+
+# Filter query by VC
sub filter_by {
my ($self, $corpus) = @_;
$self->{span} = $self->{span}->filter_by($corpus);
diff --git a/lib/Krawfish/Query/TermID.pm b/lib/Krawfish/Query/TermID.pm
index a5e1996..bc50bf5 100644
--- a/lib/Krawfish/Query/TermID.pm
+++ b/lib/Krawfish/Query/TermID.pm
@@ -24,7 +24,7 @@
};
-# Clone the query
+# Clone query
sub clone {
my $self = shift;
__PACKAGE__->new(
@@ -34,8 +34,7 @@
};
-# Skip to next position
-# This will initialize the posting list
+# Move to next posting
sub next {
my $self = shift;
@@ -49,7 +48,7 @@
};
-# Return current object
+# Get current posting
sub current {
my $postings = $_[0]->{postings};
return if $postings->pos == -1;
@@ -60,20 +59,23 @@
);
};
-# This parameter is relevant, as it is requested e.g. from termFreq
-# to count all frequencies per requested term
+
+# This parameter is relevant, as it is requested
+# e.g. from termFreq to count all frequencies
+# per requested term
sub term_id {
$_[0]->{term_id};
};
-# Get the frequency of the term
+# Get maximum frequency
sub max_freq {
$_[0]->{postings}->freq;
};
-# Get the frequency of the term in the current document
+# Get the frequency of the term in
+# the current document
sub freq_in_doc {
$_[0]->{postings}->freq_in_doc;
};
@@ -85,18 +87,19 @@
};
-# Skip to a certain document
+# Skip to target doc id
sub skip_doc {
$_[0]->{postings}->skip_doc($_[1]);
};
-# The value is simple
+# Complexity of the query
sub complex {
0;
};
-# Filter this query by a corpus
+
+# Filter query by VC
sub filter_by {
my ($self, $corpus) = @_;
return Krawfish::Query::Filter->new(
@@ -104,4 +107,5 @@
);
};
+
1;
diff --git a/lib/Krawfish/Query/Unique.pm b/lib/Krawfish/Query/Unique.pm
index 5ed9bcd..e117f7c 100644
--- a/lib/Krawfish/Query/Unique.pm
+++ b/lib/Krawfish/Query/Unique.pm
@@ -4,8 +4,11 @@
use strict;
use warnings;
+# Filter duplicate postings
+
use constant DEBUG => 0;
+# Constructor
sub new {
my $class = shift;
bless {
@@ -14,6 +17,8 @@
};
};
+
+# Clone query
sub clone {
my $self = shift;
__PACKAGE__->new(
@@ -21,6 +26,8 @@
);
};
+
+# Move to next posting
sub next {
my $self = shift;
@@ -52,20 +59,25 @@
return;
};
+
+# Stringification
sub to_string {
return 'unique(' . $_[0]->{span}->to_string . ')';
};
+# Get maximum frequency
sub max_freq {
$_[0]->{span}->max_freq;
};
+# Filter query by VC
sub filter_by {
my ($self, $corpus) = @_;
$self->{span} = $self->{span}->filter_by($corpus);
return $self;
};
+
1;
diff --git a/lib/Krawfish/Util/PriorityQueue/PerDoc.pm b/lib/Krawfish/Util/PriorityQueue/PerDoc.pm
index 1b8c0b1..cb6d4ea 100644
--- a/lib/Krawfish/Util/PriorityQueue/PerDoc.pm
+++ b/lib/Krawfish/Util/PriorityQueue/PerDoc.pm
@@ -4,19 +4,27 @@
use warnings;
use Krawfish::Log;
-# TODO: Simplify to not require max_rank_ref!
+# TODO:
+# Simplify to not require max_rank_ref!
-# TODO: Add reset method, so the PQ can be reused
-# in SortAfter!
+# TODO:
+# Add reset method, so the PQ can be reused
+# in SortAfter!
-# TODO: This currently only works with ranks,
-# it may be more beneficial to work with criteria.
-# So instead to compare ranks, a Criterion object will
-# compare arbitrary data in the "rank"-field
+# TODO:
+# This currently only works with ranks,
+# it may be more beneficial to work with criteria.
+# So instead of comparing ranks, a Criterion object will
+# compare arbitrary data in the "rank"-field
-# TODO: Probably rename from IN_DOC to IN_COLL
-# TODO: Probably rename to PriorityQueue::Bundle
-# TODO: Turn reverse_array into an iterator
+# TODO:
+# Probably rename from IN_DOC to IN_COLL
+
+# TODO:
+# Probably rename to PriorityQueue::Bundle
+
+# TODO:
+# Turn reverse_array into an iterator
use constant {
DEBUG => 0,
diff --git a/lib/Krawfish/Util/SortedList.pm b/lib/Krawfish/Util/SortedList.pm
index c66cfda..71bbf50 100644
--- a/lib/Krawfish/Util/SortedList.pm
+++ b/lib/Krawfish/Util/SortedList.pm
@@ -1,6 +1,6 @@
package Krawfish::Util::SortedList;
use parent 'Krawfish::Query';
-use Krawfish::Posting::Sorted;
+use Krawfish::Posting::List;
use strict;
use warnings;
@@ -13,6 +13,7 @@
};
+# Constructor
sub new {
my $class = shift;
bless {
@@ -21,14 +22,20 @@
}, $class;
};
+
+# Get list length
sub length {
scalar @{$_[0]->{list}};
};
+
+# Move to next item in list
sub next {
$_[0]->{pos}++ < $_[0]->length;
};
+
+# Get current item
sub current {
my $self = shift;
return $self->{list}->[$self->{pos}];
diff --git a/lib/Krawfish/Util/String.pm b/lib/Krawfish/Util/String.pm
index 399cfd7..2a8f4a0 100644
--- a/lib/Krawfish/Util/String.pm
+++ b/lib/Krawfish/Util/String.pm
@@ -7,11 +7,15 @@
use parent 'Exporter';
use utf8;
-our @EXPORT = qw/fold_case remove_diacritics normalize_nfkc squote/;
-
# Helper package for unicode handling
+our @EXPORT = qw/fold_case
+ remove_diacritics
+ normalize_nfkc
+ squote/;
+
+
# Fold case of a term
sub fold_case {
fc $_[0];
@@ -56,4 +60,5 @@
return qq{'$str'};
};
+
1;