Added Koral::Document for ForwardIndex creation

commit: dd10fb91819cf3ac77a5032c145e22a8322b79f4 [log] [tgz]
author: Akron <nils@diewald-online.de> Tue Aug 08 20:19:46 2017 +0200
committer: Akron <nils@diewald-online.de> Tue Aug 08 20:19:46 2017 +0200
tree: 565cceab31198f4ff088fdb95c3ad08845614c6f
parent: be96e283934339bbc0222019dc5e44fb139e120b [diff]
diff --git a/lib/Krawfish/Index.pm b/lib/Krawfish/Index.pm
index a948c0b..2ebfa92 100644
--- a/lib/Krawfish/Index.pm
+++ b/lib/Krawfish/Index.pm

@@ -69,7 +69,8 @@
 #   Maybe logarithmic merge
 #   https://www.youtube.com/watch?v=VNjf2dxWH2Y&spfreload=5
 
-# TODO: Maybe 65.535 documents are enough per segment ...
+# TODO:
+#   Maybe 65,535 (2^16 - 1) documents are enough per segment ...
 
 
 # Construct a new index object
@@ -120,6 +121,16 @@
 sub add {
   my ($self, $doc, $replicant_id) = @_;
 
+  # TODO:
+  #   The document should first be converted into inverted index form
+  #   using a hash with
+  #
+  #   +field => title,
+  #   *term => [postings*]
+  #
+  #   Then, when the document is added to certain nodes,
+  #   the keys will be translated to term_ids and the document
+  #   can be added with all freq_in_doc information
   unless (ref $doc) {
     $doc = decode_json(Mojo::File->new($doc)->slurp);
   };

diff --git a/lib/Krawfish/Index/Dictionary.pm b/lib/Krawfish/Index/Dictionary.pm
index 304b075..e51452e 100644
--- a/lib/Krawfish/Index/Dictionary.pm
+++ b/lib/Krawfish/Index/Dictionary.pm

@@ -15,10 +15,14 @@
 #
 # Terms have different prefixes:
 #
-#   terms:     *
-#   subterms:  ~
-#   fields:    +
-#   fieldkeys: !
+#   terms:       *
+#   (casefolded) '
+#   subterms:    ~
+#   annotations
+#     token      #
+#     span       <>
+#   fields:      +
+#   fieldkeys:   !
 #
 # add_term:
 #   First the static dictionary will do a look-up if the term exists,
@@ -26,6 +30,10 @@
 #   an existing term will return the term_id and a non-existing term
 #   will be added, returning a term_id.
 #
+#   For casefolded search, it may be necessary to index the casefolded
+#   string as well, with the same term_id (though the reverse lookup
+#   is not possible then). The same MAY be useful for diacriticless search.
+#
 #   When a new subterm is added, a binary search on the static
 #   rank file is done to find the alphabetically preceeding term of
 #   the new term. This rank is then added to the dynamic rank.
@@ -45,7 +53,14 @@
 #   First a look-up to the static dictionary is done.
 #   In case of failure, a lookup to the dynamic dictionary is done.
 #
-# range_search (regex/approx/wildcard):
+# range_search:
+#   Returns an array of valid term_ids.
+#   Required searches are:
+#     - casefolded
+#     - without_diacritics
+#     - regular expression
+#     - approximate matching
+#     - wildcards
 #   Both dictionaries are searched (maybe in parallel).
 #
 # term_id_to_term:
@@ -55,7 +70,7 @@
 #   This feature may be more complicated when term_ids can be reused
 #   (see delete).
 #
-# delete
+# delete:
 #   Not necessary, but could be implemented in both dictionaries.
 #   In the dynamic dictionary the branches are removed.
 #   In the static dictionary the term (and potentially branches)
@@ -64,12 +79,12 @@
 #   ignored. Though - this feature has not really any benefits,
 #   as old term_ids can't be reused.
 #
-# merge
+# merge:
 #   Merges the static dictionary with the dynamic dictionary and
 #   creates a new static dictionary.
 #   This will also merge the ranks.
 #
-# rank_subterm
+# rank_subterm:
 #   Returns the numerical rank of a subterm in alphabetic order.
 #   The static dictionary will return a simple even numerical rank
 #   (calculated on merge). The dynamic dictionary will
@@ -104,6 +119,11 @@
 #   Alternatively, if trigrams are indexed, this would of course
 #   look for 'ig$'.
 
+# TODO:
+#   Although subterms will not be requested by the user, they are
+#   requested, for example, by the term_id API for co-occurrence search.
+#   That's why all subterms need to be stored as well.
+
 use constant DEBUG => 0;
 
 sub new {

diff --git a/lib/Krawfish/Index/Forward/Annotation.pm b/lib/Krawfish/Index/Forward/Annotation.pm
new file mode 100644
index 0000000..a2f9b15
--- /dev/null
+++ b/lib/Krawfish/Index/Forward/Annotation.pm

@@ -0,0 +1,66 @@
+package Krawfish::Index::Forward::Annotation;
+use Krawfish::Util::String qw/squote/;
+use warnings;
+use strict;
+
+# A single annotation attached to a subtoken of the forward index,
+# consisting of a term and a list of associated data.
+
+# TODO:
+#   This should contain type, foundry, layer, key, value ... etc.
+
+# Construct a new annotation.
+# The first argument is the term, all following
+# arguments are stored as the data list.
+sub new {
+  my $class = shift;
+  my $term = shift;
+  return bless {
+    term => $term,
+    data => [@_]
+  }, $class;
+};
+
+
+# Get the term
+sub term {
+  return $_[0]->{term};
+};
+
+
+# Get the data list as an array reference
+sub data {
+  return $_[0]->{data};
+};
+
+
+# Translate the term to a term_id using the given dictionary.
+# Terms unknown to the dictionary are added.
+# Returns the annotation itself.
+sub identify {
+  my ($self, $dict) = @_;
+
+  # Only add the term if the dictionary does not know it yet
+  $self->{term_id} = $dict->term_id_by_term($self->{term})
+    // $dict->add_term($self->{term});
+
+  return $self;
+};
+
+
+# Stringification:
+#   The term_id (or, before identification, the term passed
+#   through squote()), followed by '$' and the comma-separated data.
+sub to_string {
+  my $self = shift;
+
+  # Test for definedness (as identify() does), so a term_id
+  # of 0 is not mistaken for a missing identifier
+  my $str = defined $self->{term_id}
+    ? $self->{term_id}
+    : squote($self->{term});
+
+  return $str . '$' . join(',', @{$self->{data}});
+};
+
+1;

diff --git a/lib/Krawfish/Index/Forward/FieldInt.pm b/lib/Krawfish/Index/Forward/FieldInt.pm
new file mode 100644
index 0000000..12b5787
--- /dev/null
+++ b/lib/Krawfish/Index/Forward/FieldInt.pm

@@ -0,0 +1,55 @@
+package Krawfish::Index::Forward::FieldInt;
+use Krawfish::Util::String qw/squote/;
+use warnings;
+use strict;
+
+# An integer metadata field (a key/value pair)
+# of a document in the forward index.
+
+# Construct a new field from a key and a value
+sub new {
+  my ($class, $key, $value) = @_;
+  return bless {
+    key => $key,
+    value => $value
+  }, $class;
+};
+
+
+# Translate the key and the key/value term to term_ids
+# using the given dictionary. Terms unknown to the
+# dictionary are added. Returns the field itself.
+sub identify {
+  my ($self, $dict) = @_;
+
+  # Field keys are prefixed by '!', field terms by '+'
+  my $key  = '!' . $self->{key};
+  my $term = '+' . $self->{key} . ':' . $self->{value};
+
+  # Get the key identifier or add the key, if not given yet
+  $self->{key_id} = $dict->term_id_by_term($key)
+    // $dict->add_term($key);
+
+  # Get the term identifier or add the term, if not given yet
+  $self->{key_value_id} = $dict->term_id_by_term($term)
+    // $dict->add_term($term);
+
+  return $self;
+};
+
+
+# Stringification:
+#   Before identification: key=value (with the key passed through squote())
+#   After identification:  key_id=key_value_id(value)
+sub to_string {
+  my $self = shift;
+
+  # Test for definedness (as identify() does), so a key_id
+  # of 0 is not mistaken for a missing identifier
+  unless (defined $self->{key_id}) {
+    return squote($self->{key}) . '=' . $self->{value};
+  };
+  return $self->{key_id} . '=' . $self->{key_value_id} . '(' . $self->{value} . ')';
+};
+
+1;

diff --git a/lib/Krawfish/Index/Forward/FieldString.pm b/lib/Krawfish/Index/Forward/FieldString.pm
new file mode 100644
index 0000000..01fedde
--- /dev/null
+++ b/lib/Krawfish/Index/Forward/FieldString.pm

@@ -0,0 +1,55 @@
+package Krawfish::Index::Forward::FieldString;
+use Krawfish::Util::String qw/squote/;
+use warnings;
+use strict;
+
+# A string metadata field (a key/value pair)
+# of a document in the forward index.
+
+# Construct a new field from a key and a value
+sub new {
+  my ($class, $key, $value) = @_;
+  return bless {
+    key => $key,
+    value => $value
+  }, $class;
+};
+
+
+# Translate the key and the key/value term to term_ids
+# using the given dictionary. Terms unknown to the
+# dictionary are added. Returns the field itself.
+sub identify {
+  my ($self, $dict) = @_;
+
+  # Field keys are prefixed by '!', field terms by '+'
+  my $key  = '!' . $self->{key};
+  my $term = '+' . $self->{key} . ':' . $self->{value};
+
+  # Get the key identifier or add the key, if not given yet
+  $self->{key_id} = $dict->term_id_by_term($key)
+    // $dict->add_term($key);
+
+  # Get the term identifier or add the term, if not given yet
+  $self->{key_value_id} = $dict->term_id_by_term($term)
+    // $dict->add_term($term);
+
+  return $self;
+};
+
+
+# Stringification:
+#   Before identification: key=value (both passed through squote())
+#   After identification:  key_id=key_value_id
+sub to_string {
+  my $self = shift;
+
+  # Test for definedness (as identify() does), so a key_id
+  # of 0 is not mistaken for a missing identifier
+  unless (defined $self->{key_id}) {
+    return squote($self->{key}) . '=' . squote($self->{value});
+  };
+  return $self->{key_id} . '=' . $self->{key_value_id};
+};
+
+1;

diff --git a/lib/Krawfish/Index/Forward/Fields.pm b/lib/Krawfish/Index/Forward/Fields.pm
new file mode 100644
index 0000000..347a91b
--- /dev/null
+++ b/lib/Krawfish/Index/Forward/Fields.pm

@@ -0,0 +1,46 @@
+package Krawfish::Index::Forward::Fields;
+use Krawfish::Index::Forward::FieldString;
+use Krawfish::Index::Forward::FieldInt;
+use warnings;
+use strict;
+
+# The list of metadata fields of a document in the forward index.
+# The object is a blessed array of field objects.
+
+# Construct an empty list of fields
+sub new {
+  my $class = shift;
+  return bless [], $class;
+};
+
+
+# Append a string field with the given key and value
+sub add_string {
+  my ($self, $key, $value) = @_;
+  push @$self, Krawfish::Index::Forward::FieldString->new($key, $value);
+};
+
+
+# Append an integer field with the given key and value
+sub add_int {
+  my ($self, $key, $value) = @_;
+  push @$self, Krawfish::Index::Forward::FieldInt->new($key, $value);
+};
+
+
+# Stringification: The strings of all fields, joined by ';'
+sub to_string {
+  my $self = shift;
+  return join ';', map { $_->to_string } @$self;
+};
+
+
+# Identify all fields using the given dictionary.
+# Returns the list itself.
+sub identify {
+  my ($self, $dict) = @_;
+  $_->identify($dict) for @$self;
+  return $self;
+};
+
+1;

diff --git a/lib/Krawfish/Index/Forward/Stream.pm b/lib/Krawfish/Index/Forward/Stream.pm
new file mode 100644
index 0000000..01bb531
--- /dev/null
+++ b/lib/Krawfish/Index/Forward/Stream.pm

@@ -0,0 +1,63 @@
+package Krawfish::Index::Forward::Stream;
+use Krawfish::Index::Forward::Subtoken;
+use warnings;
+use strict;
+
+# This is one single stream of the forward index.
+# The object is a blessed array of subtokens, indexed by position.
+
+# Construct an empty stream
+sub new {
+  my $class = shift;
+  return bless [], $class;
+};
+
+
+# Get or set a subtoken.
+# With a position only, the subtoken at this position is returned
+# (or undef, if there is none). With further arguments, a new
+# subtoken is constructed from them, stored at the position
+# and returned.
+sub subtoken {
+  my ($self, $pos, @args) = @_;
+  if (@args) {
+    $self->[$pos] = Krawfish::Index::Forward::Subtoken->new(@args);
+  };
+  return $self->[$pos];
+};
+
+
+# Stringification:
+#   The strings of all subtokens, each prefixed
+#   by its position in parentheses.
+sub to_string {
+  my $self = shift;
+  my @parts;
+
+  # Subtokens can be set at arbitrary positions, so the stream
+  # may contain holes. These are skipped instead of dying.
+  foreach my $pos (0 .. $#{$self}) {
+    next unless defined $self->[$pos];
+    push @parts, '(' . $pos . ')' . $self->[$pos]->to_string;
+  };
+
+  return join '', @parts;
+};
+
+
+# Identify all subtokens using the given dictionary.
+# Returns the stream itself.
+sub identify {
+  my ($self, $dict) = @_;
+
+  foreach my $subtoken (@$self) {
+    # Skip holes in the stream (see to_string)
+    next unless defined $subtoken;
+    $subtoken->identify($dict);
+  };
+
+  return $self;
+};
+
+
+1;

diff --git a/lib/Krawfish/Index/Forward/Subtoken.pm b/lib/Krawfish/Index/Forward/Subtoken.pm
new file mode 100644
index 0000000..4a00e7b
--- /dev/null
+++ b/lib/Krawfish/Index/Forward/Subtoken.pm

@@ -0,0 +1,84 @@
+package Krawfish::Index::Forward::Subtoken;
+use Krawfish::Index::Forward::Annotation;
+use Krawfish::Util::String qw/squote/;
+use warnings;
+use strict;
+
+# This represents a single token in a forward index
+
+# Construct a new subtoken from the text preceding
+# the subterm and the surface of the subterm itself.
+sub new {
+  my ($class, $preceding, $subterm) = @_;
+  return bless {
+    preceding => $preceding,
+    subterm => $subterm,
+    anno => []
+  }, $class;
+};
+
+
+# Preceding bytes of the subterm
+sub preceding {
+  return $_[0]->{preceding};
+};
+
+
+# The subterm surface
+sub subterm {
+  return $_[0]->{subterm};
+};
+
+
+# Add an annotation.
+# All arguments are passed to the annotation constructor.
+sub add_annotation {
+  my $self = shift;
+  push @{$self->{anno}}, Krawfish::Index::Forward::Annotation->new(@_);
+};
+
+
+# Translate the subterm (prefixed by '*') and all annotations
+# to term_ids using the given dictionary. Terms unknown to the
+# dictionary are added. Returns the subtoken itself.
+sub identify {
+  my ($self, $dict) = @_;
+
+  my $term = '*' . $self->{subterm};
+
+  # Only add the term if the dictionary does not know it yet
+  $self->{subterm_id} = $dict->term_id_by_term($term)
+    // $dict->add_term($term);
+
+  foreach my $anno (@{$self->{anno}}) {
+    $anno->identify($dict);
+  };
+
+  return $self;
+};
+
+
+# Stringification:
+#   The preceding text, followed in square brackets by the
+#   subterm_id (or, before identification, the subterm passed
+#   through squote()) and all annotations, separated by ';'.
+sub to_string {
+  my $self = shift;
+
+  # Test for definedness instead of truth, so a preceding
+  # text of '0' is kept and a subterm_id of 0 is not
+  # mistaken for a missing identifier
+  my $str = ($self->{preceding} // '') . '[';
+  $str .= defined $self->{subterm_id}
+    ? $self->{subterm_id}
+    : squote($self->{subterm});
+
+  if (@{$self->{anno}}) {
+    $str .= ';' . join(';', map { $_->to_string } @{$self->{anno}});
+  };
+
+  return "$str]";
+};
+
+
+1;

diff --git a/lib/Krawfish/Index/ForwardIndex.pm b/lib/Krawfish/Index/ForwardIndex.pm
index c942bb9..608f9d9 100644
--- a/lib/Krawfish/Index/ForwardIndex.pm
+++ b/lib/Krawfish/Index/ForwardIndex.pm

@@ -2,6 +2,12 @@
 use strict;
 use warnings;
 
+
+# WARNING!
+# This is deprecated! Use everything in Krawfish::Index::Forward::*
+
+
+
 # This represents a forward index of the data,
 # accessible by document ID and byte offset.
 #

diff --git a/lib/Krawfish/Koral/Document.pm b/lib/Krawfish/Koral/Document.pm
index 9dcb7c9..0a70d2e 100644
--- a/lib/Krawfish/Koral/Document.pm
+++ b/lib/Krawfish/Koral/Document.pm

@@ -1,81 +1,305 @@
 package Krawfish::Koral::Document;
-use Krawfish::Koral::Query::Token;
+use Krawfish::Index::Forward::Stream;
+use Krawfish::Index::Forward::Fields;
+use Krawfish::Log;
+use Mojo::File;
+use Mojo::JSON qw/encode_json decode_json/;
 use strict;
 use warnings;
+use List::MoreUtils qw/uniq/;
 
-# Representation of a document
+# Parses a document and creates a simple forward index list.
+#
+#   primary='...',
+#   fields=[+field => title],
+#   terms=[*term => [postings*]]
+#
+#   Then, when the document is added to certain nodes,
+#   the keys will be translated to term_ids and the document
+#   can be added with all freq_in_doc information
+
+
+use constant DEBUG => 1;
+
+# Parse the document and create a forward index representation
 sub new {
   my $class = shift;
-  my $self = bless {}, $class;
 
-  return $self unless @_;
+  my $self = bless {
+    primary => '',
+    sortable => {},
+    stream => Krawfish::Index::Forward::Stream->new,
+    fields => Krawfish::Index::Forward::Fields->new
+  }, $class;
 
-  my $koral = shift;
-  if ($koral->{primaryData}) {
-    $self->primary_data($koral->{primaryData});
+  my $doc = shift;
+
+  unless (ref $doc) {
+    $doc = decode_json(Mojo::File->new($doc)->slurp);
   };
 
-  # Parse segments
-  if ($koral->{segments}) {
-
-    # Todo: Parse and sort
-    $self->{segments} = $koral->{segments};
-  };
-
-  # Parse annotations
-  if ($koral->{annotations}) {
-
-    # TODO: All annotations need to be wrapped
-    my @annotations = ();
-    foreach my $item (@{$koral->{annotations}}) {
-      if ($item->{'@type'} eq 'koral:token') {
-        my $token = Krawfish::Koral::Query::Token->new($item);
-
-        unless (scalar $item->{segments}) {
-        }
-      };
-    };
-  };
+  # Parse the document
+  $self->_parse($doc);
 
   return $self;
 };
 
 
-# Primary data
+# Get the primary data
 sub primary_data {
+  $_[0]->{primary};
+};
+
+
+sub stream {
+  $_[0]->{stream};
+};
+
+
+sub fields {
+  $_[0]->{fields};
+};
+
+
+sub sortable {
+  $_[0]->{sortable};
+};
+
+sub identify {
+  my ($self, $dict) = @_;
+  $self->{fields} = $self->{fields}->identify($dict);
+  $self->{stream} = $self->{stream}->identify($dict);
+  return $self;
+};
+
+
+sub to_string {
   my $self = shift;
-  if (@_) {
-    $self->{primary_data} = shift;
+  return '[' . $self->fields->to_string . ']' . $self->stream->to_string;
+};
+
+
+# Parse the file and create a token-ordered document
+sub _parse {
+  my ($self, $doc) = @_;
+
+  # Get the document part
+  # This may - in the future - support multiple documents at once
+  $doc = $doc->{document};
+
+  my $primary = '';
+  my $stream = $self->stream;
+  my $fields = $self->fields;
+
+  # Remember the primary data for the creation
+  # of the forward index
+  if ($doc->{primaryData}) {
+    $primary = $doc->{primaryData};
   };
-  return $self->{primary_data};
-};
+
+  $self->{primary} = $primary;
+
+  # Add metadata fields
+  my $pos = 0;
+  my %sortable;
+  foreach my $field (@{$doc->{fields}}) {
+
+    # TODO:
+    #   Presort fields based on their field_key_id!
+    #   In that way it's faster to retrieve presorted fields
+    #   for enrichment!
 
 
-# Segments
-sub segments {
-  my $self = shift;
-  if (@_) {
-    $self->{segments} = shift;
+    # Prepare field for sorting
+    if ($field->{sortable}) {
+
+      # Which entries need to be sorted?
+      $sortable{$field->{key}}++;
+    };
+
+
+    # Prepare for summarization
+    if ($field->{type} && $field->{type} eq 'type:integer') {
+      $fields->add_int($field->{key}, $field->{value});
+    }
+    else {
+      $fields->add_string($field->{key}, $field->{value});
+    };
+
+    # This will later be indexed for search as well as retrieval in
+    # the forward index.
   };
-  return $self->{segments};
+
+  # Remember the sortable field keys (TODO: check that the required unique field is given)
+  $self->{sortable} = \%sortable;
+
+  my $primary_index = 0;
+
+  # Get all subtokens
+  if ($doc->{subtokens}) {
+
+    print_log('doc', 'Parse subtokens') if DEBUG;
+
+    # Get all subtoken offsets
+    foreach my $subtoken (@{$doc->{subtokens}}) {
+
+      # Get start and end of the subtoken
+      my ($start, $end) = @{$subtoken->{offsets}};
+
+      if (DEBUG) {
+        print_log(
+          'doc',
+          'Store subtoken: ' . $pos . '=' . join('-', $start, $end)
+        );
+      };
+
+      # Get the term surface from the primary text
+      # TODO:
+      #   Ensure that the offsets are valid!
+      my $preceding = substr($primary, $primary_index, $start - $primary_index);
+      my $term      = substr($primary, $start, $end - $start);
+      $primary_index = $end;
+
+      print_log('doc', 'Surface form is ' . $term) if DEBUG;
+
+      $stream->subtoken($pos, $preceding, $term);
+      $pos++;
+    };
+  };
+
+
+  # There are tokens indexed by subtokens
+  if ($primary_index) {
+    my $preceding = substr($primary, $primary_index);
+    $stream->subtoken($pos, $preceding, '') if $preceding;
+
+    # TODO: Probably not a good idea
+    $primary_index = 0;
+  };
+
+
+  # Get all annotations
+  $pos = 0;
+  my $end;
+  foreach my $item (@{$doc->{annotations}}) {
+
+    # Add token term to term dictionary
+    if ($item->{'@type'} eq 'koral:token') {
+
+      unless ($item->{wrap}) {
+        warn 'No wrap defined in KoralQuery';
+        next;
+      };
+
+      # Create key string
+      my $wrap = $item->{wrap};
+      my @keys;
+
+      # Token wraps a koral:termGroup
+      if ($wrap->{'@type'} && $wrap->{'@type'} eq 'koral:termGroup')  {
+        foreach (@{$wrap->{operands}}) {
+          push @keys, _term($_);
+        };
+      }
+
+      # Token wraps a single koral:term
+      else {
+        push @keys, _term($wrap);
+      };
+
+      # Append posting to postings list
+      my @subtoken_offset = _subtokens($item);
+
+      # There are no reference subtokens defined
+      unless (scalar @subtoken_offset) {
+
+        # Use the current position for storing
+        push @subtoken_offset, $pos;
+
+        # But there are offsets defined
+        if ($item->{offsets}) {
+
+          # Get character definitions
+          my ($start, $end) = @{$item->{offsets}};
+
+          # Get the term surface from the primary text
+          # TODO:
+          #   Ensure that the offsets are valid!
+          my $preceding = substr($primary, $primary_index, $start - $primary_index);
+          my $term = substr($primary, $start, $end - $start);
+          $primary_index = $end;
+
+          $stream->subtoken($pos, $preceding, $term);
+        };
+        $pos++;
+      };
+
+      # Add token terms
+      foreach (@keys) {
+
+        # Add token annotation
+        my $length = $subtoken_offset[1] ? ($subtoken_offset[1]-$subtoken_offset[0]-1) : 0;
+        $stream->subtoken($subtoken_offset[0])->add_annotation('#' . $_, $length);
+      };
+    }
+
+    # Add span term to dictionary
+    elsif ($item->{'@type'} eq 'koral:span') {
+
+      # Create key string
+      my $key = '<>' . _term($item->{wrap});
+
+
+      # Add span to forward stream
+      my $length = $item->{subtokens}->[1] ? (
+        $item->{subtokens}->[-1] - $item->{subtokens}->[0]
+      ) : 0;
+      $stream->subtoken($item->{subtokens}->[0])->add_annotation($key, $length);
+    };
+  };
+
+  # There are tokens indexed by subtokens
+  if ($primary_index) {
+    my $preceding = substr($primary, $primary_index);
+    $stream->subtoken($pos, $preceding, '') if $preceding;
+
+    # TODO: Probably not a good idea
+    $primary_index = 0;
+  };
 };
 
-sub annotations {
-};
 
-# Return segment list or nothing
-sub _segment_list {
+# TODO: Use from_koral()->term
+# Potentially with a prefix
+sub _term {
+  my $item = shift;
+
+  my $key = '';
+  # Create term for term dictionary
+  if ($item->{foundry}) {
+    $key .= $item->{foundry};
+    if ($item->{layer}) {
+      $key .= '/' . $item->{layer};
+    }
+    $key .= '=';
+  };
+  return $key . ($item->{key} // '');
+}
+
+
+# Return subtoken list or nothing
+sub _subtokens {
   my $item = shift;
   my @posting;
 
-  if ($item->{segments}) {
+  if ($item->{subtokens}) {
 
-    @posting = ($item->{segments}->[0]);
+    # Remove!
+    push @posting, $item->{subtokens}->[0];
 
-   if ($item->{segments}->[1]) {
-
-     # The end is AFTER the second segment
-      push @posting, $item->{segments}->[1];
+    if ($item->{subtokens}->[1]) {
+      # The end is AFTER the second subtoken
+      push @posting, $item->{subtokens}->[1] + 1;
     };
 
     return @posting;
@@ -86,3 +310,29 @@
 
 
 1;
+
+
+__END__
+
+
+
+sub to_list {
+  my ($self, $doc_id, $replicant_id) = @_;
+};
+
+
+sub add {
+  # This will add the doc_id to id-field and
+  # this will add the replicant field (either __1:1 or __2:node_name).
+};
+
+
+sub to_forward_index {
+  # Only works after identification!
+  # This should, however, use a K::I::Store class!
+};
+
+
+1;
+
+__END__
commit	dd10fb91819cf3ac77a5032c145e22a8322b79f4	[log] [tgz]
author	Akron <nils@diewald-online.de>	Tue Aug 08 20:19:46 2017 +0200
committer	Akron <nils@diewald-online.de>	Tue Aug 08 20:19:46 2017 +0200
tree	565cceab31198f4ff088fdb95c3ad08845614c6f
parent	be96e283934339bbc0222019dc5e44fb139e120b [diff]