Added Koral::Document for ForwardIndex creation
diff --git a/lib/Krawfish/Index.pm b/lib/Krawfish/Index.pm
index a948c0b..2ebfa92 100644
--- a/lib/Krawfish/Index.pm
+++ b/lib/Krawfish/Index.pm
@@ -69,7 +69,8 @@
# Maybe logarithmic merge
# https://www.youtube.com/watch?v=VNjf2dxWH2Y&spfreload=5
-# TODO: Maybe 65.535 documents are enough per segment ...
+# TODO:
+# Maybe 65.535 documents are enough per segment ...
# Construct a new index object
@@ -120,6 +121,16 @@
sub add {
my ($self, $doc, $replicant_id) = @_;
+ # TODO:
+ # The document should first be converted into inverted index form
+ # using a hash with
+ #
+ # +field => title,
+ # *term => [postings*]
+ #
+ # Then, when the document is added to certain nodes,
+ # the keys will be translated to term_ids and the document
+ # can be added with all freq_in_doc information
unless (ref $doc) {
$doc = decode_json(Mojo::File->new($doc)->slurp);
};
diff --git a/lib/Krawfish/Index/Dictionary.pm b/lib/Krawfish/Index/Dictionary.pm
index 304b075..e51452e 100644
--- a/lib/Krawfish/Index/Dictionary.pm
+++ b/lib/Krawfish/Index/Dictionary.pm
@@ -15,10 +15,14 @@
#
# Terms have different prefixes:
#
-# terms: *
-# subterms: ~
-# fields: +
-# fieldkeys: !
+# terms: *
+# (casefolded) '
+# subterms: ~
+# annotations
+# token #
+# span <>
+# fields: +
+# fieldkeys: !
#
# add_term:
# First the static dictionary will do a look-up if the term exists,
@@ -26,6 +30,10 @@
# an existing term will return the term_id and a non-existing term
# will be added, returning a term_id.
#
+# For casefolded search, it may be necessary to index the casefolded
+# string as well, with the same term_id (though the reverse lookup
+# is not possible). The same MAY be useful for diacriticless search.
+#
# When a new subterm is added, a binary search on the static
# rank file is done to find the alphabetically preceeding term of
# the new term. This rank is then added to the dynamic rank.
@@ -45,7 +53,14 @@
# First a look-up to the static dictionary is done.
# In case of failure, a lookup to the dynamic dictionary is done.
#
-# range_search (regex/approx/wildcard):
+# range_search:
+# Returns an array of valid term_ids.
+# Required searches are:
+# - casefolded
+# - without_diacritics
+# - regular expression
+# - approximate matching
+# - wildcards
# Both dictionaries are searched (maybe in parallel).
#
# term_id_to_term:
@@ -55,7 +70,7 @@
# This feature may be more complicated when term_ids can be reused
# (see delete).
#
-# delete
+# delete:
# Not necessary, but could be implemented in both dictionaries.
# In the dynamic dictionary the branches are removed.
# In the static dictionary the term (and potentially branches)
@@ -64,12 +79,12 @@
# ignored. Though - this feature has not really any benefits,
# as old term_ids can't be reused.
#
-# merge
+# merge:
# Merges the static dictionary with the dynamic dictionary and
# creates a new static dictionary.
# This will also merge the ranks.
#
-# rank_subterm
+# rank_subterm:
# Returns the numerical rank of a subterm in alphabetic order.
# The static dictionary will return a simple even numerical rank
# (calculated on merge). The dynamic dictionary will
@@ -104,6 +119,11 @@
# Alternatively, if trigrams are indexed, this would of course
# look for 'ig$'.
+# TODO:
+# Although subterms will not be requested by the user, they are
+# requested, for example, by the term_id API for co-occurrence search.
+# That's why all subterms need to be stored as well.
+
use constant DEBUG => 0;
sub new {
diff --git a/lib/Krawfish/Index/Forward/Annotation.pm b/lib/Krawfish/Index/Forward/Annotation.pm
new file mode 100644
index 0000000..a2f9b15
--- /dev/null
+++ b/lib/Krawfish/Index/Forward/Annotation.pm
@@ -0,0 +1,64 @@
+package Krawfish::Index::Forward::Annotation;
+use Krawfish::Util::String qw/squote/;
+use warnings;
+use strict;
+
+# A single annotation of a subtoken in the forward index,
+# consisting of a term and a list of data values.
+#
+# TODO:
+#   This should contain type, foundry, layer, key, value ... etc.
+
+
+# Construct a new annotation.
+# The first argument is the term, all further arguments are kept as data.
+sub new {
+  my ($class, $term, @data) = @_;
+  return bless {
+    term => $term,
+    data => [@data]
+  }, $class;
+};
+
+
+# Get the term
+sub term {
+  return $_[0]->{term};
+};
+
+
+# Get the data values as an array reference
+sub data {
+  return $_[0]->{data};
+};
+
+
+# Translate the term to a term_id using the given dictionary.
+# A term unknown to the dictionary is added.
+# Returns the annotation itself.
+sub identify {
+  my ($self, $dict) = @_;
+
+  # Only add the term in case the lookup fails
+  $self->{term_id} = $dict->term_id_by_term($self->{term})
+    // $dict->add_term($self->{term});
+
+  return $self;
+};
+
+
+# Stringify as term_id (or the quoted term, if not identified yet),
+# followed by '$' and the comma separated data values.
+sub to_string {
+  my $self = shift;
+
+  # Test for definedness like identify() does,
+  # so a term_id of 0 is not taken for a missing term_id
+  my $str = defined $self->{term_id}
+    ? $self->{term_id}
+    : squote($self->{term});
+
+  return $str . '$' . join(',', @{$self->{data}});
+};
+
+1;
diff --git a/lib/Krawfish/Index/Forward/FieldInt.pm b/lib/Krawfish/Index/Forward/FieldInt.pm
new file mode 100644
index 0000000..12b5787
--- /dev/null
+++ b/lib/Krawfish/Index/Forward/FieldInt.pm
@@ -0,0 +1,65 @@
+package Krawfish::Index::Forward::FieldInt;
+use Krawfish::Util::String qw/squote/;
+use warnings;
+use strict;
+
+# A metadata field with an integer value in the forward index
+
+
+# Construct a new field from a key and a value
+sub new {
+  my ($class, $key, $value) = @_;
+  return bless {
+    key   => $key,
+    value => $value
+  }, $class;
+};
+
+
+# Translate the field key (prefixed with '!') and the combination of
+# key and value (prefixed with '+') to term_ids using the given dictionary.
+# Terms unknown to the dictionary are added.
+# Returns the field itself.
+sub identify {
+  my ($self, $dict) = @_;
+
+  my $key  = '!' . $self->{key};
+  my $term = '+' . $self->{key} . ':' . $self->{value};
+
+  # Get key term_id
+  my $key_id = $dict->term_id_by_term($key);
+
+  # The key is already known to the dictionary
+  if (defined $key_id) {
+    $self->{key_id} = $key_id;
+
+    # Get the term identifier or add the term, if it does not exist yet
+    $self->{key_value_id} = $dict->term_id_by_term($term)
+      // $dict->add_term($term);
+  }
+
+  # The key is unknown - add both terms without a further lookup
+  else {
+    $self->{key_id}       = $dict->add_term($key);
+    $self->{key_value_id} = $dict->add_term($term);
+  };
+
+  return $self;
+};
+
+
+# Stringify as key_id=key_value_id(value), or as quoted key and value,
+# if the field is not identified yet.
+sub to_string {
+  my $self = shift;
+
+  # Test for definedness like identify() does,
+  # so a key_id of 0 is not taken for a missing key_id
+  unless (defined $self->{key_id}) {
+    return squote($self->{key}) . '=' . $self->{value};
+  };
+
+  return $self->{key_id} . '=' . $self->{key_value_id} . '(' . $self->{value} . ')';
+};
+
+1;
diff --git a/lib/Krawfish/Index/Forward/FieldString.pm b/lib/Krawfish/Index/Forward/FieldString.pm
new file mode 100644
index 0000000..01fedde
--- /dev/null
+++ b/lib/Krawfish/Index/Forward/FieldString.pm
@@ -0,0 +1,65 @@
+package Krawfish::Index::Forward::FieldString;
+use Krawfish::Util::String qw/squote/;
+use warnings;
+use strict;
+
+# A metadata field with a string value in the forward index
+
+
+# Construct a new field from a key and a value
+sub new {
+  my ($class, $key, $value) = @_;
+  return bless {
+    key   => $key,
+    value => $value
+  }, $class;
+};
+
+
+# Translate the field key (prefixed with '!') and the combination of
+# key and value (prefixed with '+') to term_ids using the given dictionary.
+# Terms unknown to the dictionary are added.
+# Returns the field itself.
+sub identify {
+  my ($self, $dict) = @_;
+
+  my $key  = '!' . $self->{key};
+  my $term = '+' . $self->{key} . ':' . $self->{value};
+
+  # Get key term_id
+  my $key_id = $dict->term_id_by_term($key);
+
+  # The key is already known to the dictionary
+  if (defined $key_id) {
+    $self->{key_id} = $key_id;
+
+    # Get the term identifier or add the term, if it does not exist yet
+    $self->{key_value_id} = $dict->term_id_by_term($term)
+      // $dict->add_term($term);
+  }
+
+  # The key is unknown - add both terms without a further lookup
+  else {
+    $self->{key_id}       = $dict->add_term($key);
+    $self->{key_value_id} = $dict->add_term($term);
+  };
+
+  return $self;
+};
+
+
+# Stringify as key_id=key_value_id, or as quoted key and quoted value,
+# if the field is not identified yet.
+sub to_string {
+  my $self = shift;
+
+  # Test for definedness like identify() does,
+  # so a key_id of 0 is not taken for a missing key_id
+  unless (defined $self->{key_id}) {
+    return squote($self->{key}) . '=' . squote($self->{value});
+  };
+
+  return $self->{key_id} . '=' . $self->{key_value_id};
+};
+
+1;
diff --git a/lib/Krawfish/Index/Forward/Fields.pm b/lib/Krawfish/Index/Forward/Fields.pm
new file mode 100644
index 0000000..347a91b
--- /dev/null
+++ b/lib/Krawfish/Index/Forward/Fields.pm
@@ -0,0 +1,47 @@
+package Krawfish::Index::Forward::Fields;
+use Krawfish::Index::Forward::FieldString;
+use Krawfish::Index::Forward::FieldInt;
+use warnings;
+use strict;
+
+# List of the metadata fields of a document in the forward index
+
+
+# Construct an empty list of fields
+sub new {
+  my ($class) = @_;
+  return bless [], $class;
+};
+
+
+# Append a field with a string value
+sub add_string {
+  my ($self, $key, $value) = @_;
+  push @$self, Krawfish::Index::Forward::FieldString->new($key, $value);
+};
+
+
+# Append a field with an integer value
+sub add_int {
+  my ($self, $key, $value) = @_;
+  push @$self, Krawfish::Index::Forward::FieldInt->new($key, $value);
+};
+
+
+# Stringify all fields, separated by ';'
+sub to_string {
+  my ($self) = @_;
+  my @strings = map { $_->to_string } @$self;
+  return join(';', @strings);
+};
+
+
+# Call identify() on every field with the given dictionary.
+# Returns the list itself.
+sub identify {
+  my ($self, $dict) = @_;
+  $_->identify($dict) for @$self;
+  return $self;
+};
+
+1;
diff --git a/lib/Krawfish/Index/Forward/Stream.pm b/lib/Krawfish/Index/Forward/Stream.pm
new file mode 100644
index 0000000..01bb531
--- /dev/null
+++ b/lib/Krawfish/Index/Forward/Stream.pm
@@ -0,0 +1,49 @@
+package Krawfish::Index::Forward::Stream;
+use Krawfish::Index::Forward::Subtoken;
+use warnings;
+use strict;
+
+# This is one single stream of the forward index,
+# i.e. a list of subtokens indexed by their position.
+
+
+# Construct an empty stream
+sub new {
+  my ($class) = @_;
+  return bless [], $class;
+};
+
+
+# Get the subtoken at a position.
+# If further arguments are passed, a new subtoken is created
+# from them and stored at the position first.
+sub subtoken {
+  my ($self, $pos, @args) = @_;
+  if (@args) {
+    $self->[$pos] = Krawfish::Index::Forward::Subtoken->new(@args);
+  };
+  return $self->[$pos];
+};
+
+
+# Stringify all subtokens, each prefixed by its position in parentheses
+sub to_string {
+  my ($self) = @_;
+  my $str = '';
+  foreach my $pos (0 .. $#$self) {
+    $str .= '(' . $pos . ')' . $self->[$pos]->to_string;
+  };
+  return $str;
+};
+
+
+# Call identify() on every subtoken with the given dictionary.
+# Returns the stream itself.
+sub identify {
+  my ($self, $dict) = @_;
+  $_->identify($dict) for @$self;
+  return $self;
+};
+
+
+1;
diff --git a/lib/Krawfish/Index/Forward/Subtoken.pm b/lib/Krawfish/Index/Forward/Subtoken.pm
new file mode 100644
index 0000000..4a00e7b
--- /dev/null
+++ b/lib/Krawfish/Index/Forward/Subtoken.pm
@@ -0,0 +1,79 @@
+package Krawfish::Index::Forward::Subtoken;
+use Krawfish::Index::Forward::Annotation;
+use Krawfish::Util::String qw/squote/;
+use warnings;
+use strict;
+
+# This represents a single token in a forward index
+
+
+# Construct a new subtoken from the preceding data and the subterm surface
+sub new {
+  my ($class, $preceding, $subterm) = @_;
+  return bless {
+    preceding => $preceding,
+    subterm   => $subterm,
+    anno      => []
+  }, $class;
+};
+
+
+# Preceding bytes of the subterm
+sub preceding {
+  return $_[0]->{preceding};
+};
+
+
+# The subterm surface
+sub subterm {
+  return $_[0]->{subterm};
+};
+
+
+# Add an annotation - the arguments are passed to the annotation constructor
+sub add_annotation {
+  my $self = shift;
+  push @{$self->{anno}}, Krawfish::Index::Forward::Annotation->new(@_);
+};
+
+
+# Translate the subterm (prefixed with '*') and all annotations
+# to term_ids using the given dictionary.
+# A subterm unknown to the dictionary is added.
+# Returns the subtoken itself.
+sub identify {
+  my ($self, $dict) = @_;
+
+  my $term = '*' . $self->{subterm};
+
+  # Only add the term in case the lookup fails
+  $self->{subterm_id} = $dict->term_id_by_term($term) // $dict->add_term($term);
+
+  $_->identify($dict) foreach @{$self->{anno}};
+
+  return $self;
+};
+
+
+# Stringification:
+# The preceding data, followed by the subterm_id (or the quoted subterm,
+# if not identified yet) and all annotations in square brackets.
+sub to_string {
+  my $self = shift;
+
+  # Test for definedness instead of truth, so preceding data like '0'
+  # is not dropped and a subterm_id of 0 is not taken for a missing one
+  my $str = ($self->{preceding} // '') . '[';
+  $str .= defined $self->{subterm_id}
+    ? $self->{subterm_id}
+    : squote($self->{subterm});
+
+  if (@{$self->{anno}}) {
+    $str .= ';' . join(';', map { $_->to_string } @{$self->{anno}});
+  };
+
+  return "$str]";
+};
+
+
+1;
diff --git a/lib/Krawfish/Index/ForwardIndex.pm b/lib/Krawfish/Index/ForwardIndex.pm
index c942bb9..608f9d9 100644
--- a/lib/Krawfish/Index/ForwardIndex.pm
+++ b/lib/Krawfish/Index/ForwardIndex.pm
@@ -2,6 +2,12 @@
use strict;
use warnings;
+
+# WARNING!
+# This module is deprecated! Use the classes in Krawfish::Index::Forward::* instead.
+
+
+
# This represents a forward index of the data,
# accessible by document ID and byte offset.
#
diff --git a/lib/Krawfish/Koral/Document.pm b/lib/Krawfish/Koral/Document.pm
index 9dcb7c9..0a70d2e 100644
--- a/lib/Krawfish/Koral/Document.pm
+++ b/lib/Krawfish/Koral/Document.pm
@@ -1,81 +1,305 @@
package Krawfish::Koral::Document;
-use Krawfish::Koral::Query::Token;
+use Krawfish::Index::Forward::Stream;
+use Krawfish::Index::Forward::Fields;
+use Krawfish::Log;
+use Mojo::File;
+use Mojo::JSON qw/encode_json decode_json/;
use strict;
use warnings;
+use List::MoreUtils qw/uniq/;
-# Representation of a document
+# Parses a document and creates a simple forward index list.
+#
+# primary='...',
+# fields=[+field => title],
+# terms=[*term => [postings*]]
+#
+# Then, when the document is added to certain nodes,
+# the keys will be translated to term_ids and the document
+# can be added with all freq_in_doc information
+
+# Debug logging is switched off by default
+use constant DEBUG => 0;
+
+# Construct a document from a decoded JSON structure or a JSON file path
sub new {
my $class = shift;
- my $self = bless {}, $class;
- return $self unless @_;
+ my $self = bless {
+ primary => '',
+ sortable => {},
+ stream => Krawfish::Index::Forward::Stream->new,
+ fields => Krawfish::Index::Forward::Fields->new
+ }, $class;
- my $koral = shift;
- if ($koral->{primaryData}) {
- $self->primary_data($koral->{primaryData});
+ my $doc = shift;
+ # A non-reference argument is treated as the path of a JSON file
+ unless (ref $doc) {
+ $doc = decode_json(Mojo::File->new($doc)->slurp);
};
- # Parse segments
- if ($koral->{segments}) {
-
- # Todo: Parse and sort
- $self->{segments} = $koral->{segments};
- };
-
- # Parse annotations
- if ($koral->{annotations}) {
-
- # TODO: All annotations need to be wrapped
- my @annotations = ();
- foreach my $item (@{$koral->{annotations}}) {
- if ($item->{'@type'} eq 'koral:token') {
- my $token = Krawfish::Koral::Query::Token->new($item);
-
- unless (scalar $item->{segments}) {
- }
- };
- };
- };
+ # Parse the document
+ $self->_parse($doc);
return $self;
};
-# Primary data
+# Return the primary data of the document
sub primary_data {
+  return shift->{primary};
+};
+
+# Return the subtoken stream of the document
+sub stream {
+  return shift->{stream};
+};
+
+# Return the metadata fields of the document
+sub fields {
+  return shift->{fields};
+};
+
+# Return the hash reference keyed by the sortable field keys
+sub sortable {
+  return shift->{sortable};
+};
+
+# Call identify() on the fields and on the stream with the given dictionary
+sub identify {
+  my ($self, $dict) = @_;
+  $self->{$_} = $self->{$_}->identify($dict) for qw/fields stream/;
+  return $self;
+};
+
+# Stringify the fields in square brackets, followed by the subtoken stream
+sub to_string {
my $self = shift;
- if (@_) {
- $self->{primary_data} = shift;
+  return join('', '[', $self->fields->to_string, ']', $self->stream->to_string);
+};
+
+
+# Parse the decoded document and fill the primary data, the metadata
+# fields and the subtoken stream (including all annotations).
+# Annotations that can't be stored are skipped with a warning.
+sub _parse {
+  my ($self, $doc) = @_;
+
+  # Get the document part
+  # This may - in the future - support multiple documents at once
+  $doc = $doc->{document};
+
+  my $primary = '';
+  my $stream = $self->stream;
+  my $fields = $self->fields;
+
+  # Remember the primary data for the creation
+  # of the forward index
+  if ($doc->{primaryData}) {
+    $primary = $doc->{primaryData};
};
- return $self->{primary_data};
-};
+
+  $self->{primary} = $primary;
+
+  # Add metadata fields
+  my $pos = 0;
+  my %sortable;
+  foreach my $field (@{$doc->{fields}}) {
+
+    # TODO:
+    #   Presort fields based on their field_key_id!
+    #   In that way it's faster to retrieve presorted fields
+    #   for enrichment!
-# Segments
-sub segments {
- my $self = shift;
- if (@_) {
- $self->{segments} = shift;
+    # Prepare field for sorting
+    if ($field->{sortable}) {
+
+      # Which entries need to be sorted?
+      $sortable{$field->{key}}++;
+    };
+
+    # Prepare for summarization
+    if ($field->{type} && $field->{type} eq 'type:integer') {
+      $fields->add_int($field->{key}, $field->{value});
+    }
+    else {
+      $fields->add_string($field->{key}, $field->{value});
+    };
+
+    # This will later be indexed for search as well as retrieval in
+    # the forward index.
};
- return $self->{segments};
+
+  # Store the sortable keys (TODO: Check that the required unique field is given)
+  $self->{sortable} = \%sortable;
+
+  my $primary_index = 0;
+
+  # Get all subtokens
+  if ($doc->{subtokens}) {
+
+    print_log('doc', 'Parse subtokens') if DEBUG;
+
+    # Get all subtoken offsets
+    foreach my $subtoken (@{$doc->{subtokens}}) {
+
+      # Get start and end of the subtoken
+      my ($start, $end) = @{$subtoken->{offsets}};
+
+      if (DEBUG) {
+        print_log(
+          'doc',
+          'Store subtoken: ' . $pos . '=' . join('-', $start, $end)
+        );
+      };
+
+      # Get the term surface from the primary text
+      # TODO:
+      #   Ensure that the offsets are valid!
+      my $preceding = substr($primary, $primary_index, $start - $primary_index);
+      my $term = substr($primary, $start, $end - $start);
+      $primary_index = $end;
+
+      print_log('doc', 'Surface form is ' . $term) if DEBUG;
+
+      $stream->subtoken($pos, $preceding, $term);
+      $pos++;
+    };
+  };
+
+  # Store the primary data following the last subtoken ('0' is valid data)
+  if ($primary_index) {
+    my $preceding = substr($primary, $primary_index);
+    $stream->subtoken($pos, $preceding, '') if defined $preceding && length $preceding;
+
+    # TODO: Probably not a good idea
+    $primary_index = 0;
+  };
+
+  # Get all annotations.
+  # Tokens without subtoken references are numbered consecutively.
+  $pos = 0;
+  foreach my $item (@{$doc->{annotations}}) {
+
+    # Add token term to term dictionary
+    if ($item->{'@type'} eq 'koral:token') {
+
+      unless ($item->{wrap}) {
+        warn 'No wrap defined in KoralQuery';
+        next;
+      };
+
+      # Create key string
+      my $wrap = $item->{wrap};
+      my @keys;
+
+      # Token wraps a koral:termGroup
+      if ($wrap->{'@type'} && $wrap->{'@type'} eq 'koral:termGroup') {
+        foreach (@{$wrap->{operands}}) {
+          push @keys, _term($_);
+        };
+      }
+
+      # Token wraps a single koral:term
+      else {
+        push @keys, _term($wrap);
+      };
+
+      # Append posting to postings list
+      my @subtoken_offset = _subtokens($item);
+
+      # There are no reference subtokens defined
+      unless (scalar @subtoken_offset) {
+
+        # Use the current position for storing
+        push @subtoken_offset, $pos;
+
+        # But there are offsets defined
+        if ($item->{offsets}) {
+
+          # Get character definitions
+          my ($start, $end) = @{$item->{offsets}};
+
+          # Get the term surface from the primary text
+          # TODO:
+          #   Ensure that the offsets are valid!
+          my $preceding = substr($primary, $primary_index, $start - $primary_index);
+          my $term = substr($primary, $start, $end - $start);
+          $primary_index = $end;
+
+          $stream->subtoken($pos, $preceding, $term);
+        };
+        $pos++;
+      };
+
+      # Add token annotations, if the referenced subtoken exists
+      my $length = $subtoken_offset[1] ? ($subtoken_offset[1]-$subtoken_offset[0]-1) : 0;
+      if (my $subtoken = $stream->subtoken($subtoken_offset[0])) {
+        $subtoken->add_annotation('#' . $_, $length) foreach @keys;
+      }
+      else {
+        warn 'No subtoken found for koral:token';
+      };
+    }
+
+    # Add span term to dictionary
+    elsif ($item->{'@type'} eq 'koral:span') {
+
+      # Create key string
+      my $key = '<>' . _term($item->{wrap});
+
+      # Add span to forward stream, if the first subtoken exists
+      my $span = $item->{subtokens} // [];
+      my $length = $span->[1] ? ($span->[-1] - $span->[0]) : 0;
+      my $subtoken = defined $span->[0] ? $stream->subtoken($span->[0]) : undef;
+      warn 'No subtoken found for koral:span' unless $subtoken;
+      $subtoken->add_annotation($key, $length) if $subtoken;
+    };
+  };
+
+  # Store the primary data following the last token ('0' is valid data)
+  if ($primary_index) {
+    my $preceding = substr($primary, $primary_index);
+    $stream->subtoken($pos, $preceding, '') if defined $preceding && length $preceding;
+
+    # TODO: Probably not a good idea
+    $primary_index = 0;
+  };
};
-sub annotations {
-};
-# Return segment list or nothing
-sub _segment_list {
+# TODO: Use from_koral()->term
+# Potentially with a prefix
+sub _term {
+  my $item = shift;
+
+  # Without a foundry, the term consists of the key only
+  my $prefix = '';
+
+  # Create term for term dictionary
+  if ($item->{foundry}) {
+    $prefix  = $item->{foundry};
+    $prefix .= '/' . $item->{layer} if $item->{layer};
+    $prefix .= '=';
+  };
+  return $prefix . ($item->{key} // '');
+}
+
+
+# Return subtoken list or nothing
+sub _subtokens {
my $item = shift;
my @posting;
- if ($item->{segments}) {
+ if ($item->{subtokens}) {
- @posting = ($item->{segments}->[0]);
+ # Remove!
+ push @posting, $item->{subtokens}->[0];
- if ($item->{segments}->[1]) {
-
- # The end is AFTER the second segment
- push @posting, $item->{segments}->[1];
+ if ($item->{subtokens}->[1]) {
+ # The end is AFTER the second subtoken
+ push @posting, $item->{subtokens}->[1] + 1;
};
return @posting;
@@ -86,3 +310,29 @@
1;
+
+
+__END__
+
+
+
+sub to_list {
+ my ($self, $doc_id, $replicant_id) = @_;
+};
+
+
+sub add {
+ # This will add the doc_id to id-field and
+ # this will add the replicant field (either __1:1 or __2:node_name).
+};
+
+
+sub to_forward_index {
+ # Only works after identification!
+ # This should, however, use a K::I::Store class!
+};
+
+
+1;
+
+__END__