Added Koral::Document for ForwardIndex creation

commit: dd10fb91819cf3ac77a5032c145e22a8322b79f4 [log] [tgz]
author: Akron <nils@diewald-online.de> Tue Aug 08 20:19:46 2017 +0200
committer: Akron <nils@diewald-online.de> Tue Aug 08 20:19:46 2017 +0200
tree: 565cceab31198f4ff088fdb95c3ad08845614c6f
parent: be96e283934339bbc0222019dc5e44fb139e120b [diff] [blame]
diff --git a/lib/Krawfish/Koral/Document.pm b/lib/Krawfish/Koral/Document.pm
index 9dcb7c9..0a70d2e 100644
--- a/lib/Krawfish/Koral/Document.pm
+++ b/lib/Krawfish/Koral/Document.pm

@@ -1,81 +1,305 @@
 package Krawfish::Koral::Document;
-use Krawfish::Koral::Query::Token;
+use Krawfish::Index::Forward::Stream;
+use Krawfish::Index::Forward::Fields;
+use Krawfish::Log;
+use Mojo::File;
+use Mojo::JSON qw/encode_json decode_json/;
 use strict;
 use warnings;
+use List::MoreUtils qw/uniq/;
 
-# Representation of a document
+# Parses a document and creates a simple forward index list.
+#
+#   primary='...',
+#   fields=[+field => title],
+#   terms=[*term => [postings*]]
+#
+#   Then, when the document is added to certain nodes,
+#   the keys will be translated to term_ids and the document
+#   can be added with all freq_in_doc information
+
+
+use constant DEBUG => 1;
+
+# Parse the document and create a forward index representation
 sub new {
   my $class = shift;
-  my $self = bless {}, $class;
 
-  return $self unless @_;
+  my $self = bless {
+    primary => '',
+    sortable => {},
+    stream => Krawfish::Index::Forward::Stream->new,
+    fields => Krawfish::Index::Forward::Fields->new
+  }, $class;
 
-  my $koral = shift;
-  if ($koral->{primaryData}) {
-    $self->primary_data($koral->{primaryData});
+  my $doc = shift;
+
+  unless (ref $doc) {
+    $doc = decode_json(Mojo::File->new($doc)->slurp);
   };
 
-  # Parse segments
-  if ($koral->{segments}) {
-
-    # Todo: Parse and sort
-    $self->{segments} = $koral->{segments};
-  };
-
-  # Parse annotations
-  if ($koral->{annotations}) {
-
-    # TODO: All annotations need to be wrapped
-    my @annotations = ();
-    foreach my $item (@{$koral->{annotations}}) {
-      if ($item->{'@type'} eq 'koral:token') {
-        my $token = Krawfish::Koral::Query::Token->new($item);
-
-        unless (scalar $item->{segments}) {
-        }
-      };
-    };
-  };
+  # Parse the document
+  $self->_parse($doc);
 
   return $self;
 };
 
 
-# Primary data
+# Get the primary data
 sub primary_data {
+  $_[0]->{primary};
+};
+
+
+sub stream {
+  $_[0]->{stream};
+};
+
+
+sub fields {
+  $_[0]->{fields};
+};
+
+
+sub sortable {
+  $_[0]->{sortable};
+};
+
+sub identify {
+  my ($self, $dict) = @_;
+  $self->{fields} = $self->{fields}->identify($dict);
+  $self->{stream} = $self->{stream}->identify($dict);
+  return $self;
+};
+
+
+sub to_string {
   my $self = shift;
-  if (@_) {
-    $self->{primary_data} = shift;
+  return '[' . $self->fields->to_string . ']' . $self->stream->to_string;
+};
+
+
+# Parse the file and create a token-ordered document
+sub _parse {
+  my ($self, $doc) = @_;
+
+  # Get the document part
+  # This may - in the future - support multiple documents at once
+  $doc = $doc->{document};
+
+  my $primary = '';
+  my $stream = $self->stream;
+  my $fields = $self->fields;
+
+  # Remember the primary data for the creation
+  # of the forward index
+  if ($doc->{primaryData}) {
+    $primary = $doc->{primaryData};
   };
-  return $self->{primary_data};
-};
+
+  $self->{primary} = $primary;
+
+  # Add metadata fields
+  my $pos = 0;
+  my %sortable;
+  foreach my $field (@{$doc->{fields}}) {
+
+    # TODO:
+    #   Presort fields based on their field_key_id!
+    #   In that way it's faster to retrieve presorted fields
+    #   for enrichment!
 
 
-# Segments
-sub segments {
-  my $self = shift;
-  if (@_) {
-    $self->{segments} = shift;
+    # Prepare field for sorting
+    if ($field->{sortable}) {
+
+      # Which entries need to be sorted?
+      $sortable{$field->{key}}++;
+    };
+
+
+    # Prepare for summarization
+    if ($field->{type} && $field->{type} eq 'type:integer') {
+      $fields->add_int($field->{key}, $field->{value});
+    }
+    else {
+      $fields->add_string($field->{key}, $field->{value});
+    };
+
+    # This will later be indexed for search as well as retrieval in
+    # the forward index.
   };
-  return $self->{segments};
+
+  # Remember the sortable field keys (TODO: check that the required unique field is given)
+  $self->{sortable} = \%sortable;
+
+  my $primary_index = 0;
+
+  # Get all subtokens
+  if ($doc->{subtokens}) {
+
+    print_log('doc', 'Parse subtokens') if DEBUG;
+
+    # Get all subtoken offsets
+    foreach my $subtoken (@{$doc->{subtokens}}) {
+
+      # Get start and end of the subtoken
+      my ($start, $end) = @{$subtoken->{offsets}};
+
+      if (DEBUG) {
+        print_log(
+          'doc',
+          'Store subtoken: ' . $pos . '=' . join('-', $start, $end)
+        );
+      };
+
+      # Get the term surface from the primary text
+      # TODO:
+      #   Ensure that the offsets are valid!
+      my $preceding = substr($primary, $primary_index, $start - $primary_index);
+      my $term      = substr($primary, $start, $end - $start);
+      $primary_index = $end;
+
+      print_log('doc', 'Surface form is ' . $term) if DEBUG;
+
+      $stream->subtoken($pos, $preceding, $term);
+      $pos++;
+    };
+  };
+
+
+  # Add any primary data remaining after the last subtoken
+  if ($primary_index) {
+    my $preceding = substr($primary, $primary_index);
+    $stream->subtoken($pos, $preceding, '') if $preceding;
+
+    # TODO: Probably not a good idea
+    $primary_index = 0;
+  };
+
+
+  # Get all annotations
+  $pos = 0;
+  my $end;
+  foreach my $item (@{$doc->{annotations}}) {
+
+    # Add token term to term dictionary
+    if ($item->{'@type'} eq 'koral:token') {
+
+      unless ($item->{wrap}) {
+        warn 'No wrap defined in KoralQuery';
+        next;
+      };
+
+      # Create key string
+      my $wrap = $item->{wrap};
+      my @keys;
+
+      # Token wraps a koral:termGroup
+      if ($wrap->{'@type'} && $wrap->{'@type'} eq 'koral:termGroup')  {
+        foreach (@{$wrap->{operands}}) {
+          push @keys, _term($_);
+        };
+      }
+
+      # Token wraps a single koral:term
+      else {
+        push @keys, _term($wrap);
+      };
+
+      # Get the subtoken positions referenced by the token (if any)
+      my @subtoken_offset = _subtokens($item);
+
+      # There are no reference subtokens defined
+      unless (scalar @subtoken_offset) {
+
+        # Use the current position for storing
+        push @subtoken_offset, $pos;
+
+        # But there are offsets defined
+        if ($item->{offsets}) {
+
+          # Get character definitions
+          my ($start, $end) = @{$item->{offsets}};
+
+          # Get the term surface from the primary text
+          # TODO:
+          #   Ensure that the offsets are valid!
+          my $preceding = substr($primary, $primary_index, $start - $primary_index);
+          my $term = substr($primary, $start, $end - $start);
+          $primary_index = $end;
+
+          $stream->subtoken($pos, $preceding, $term);
+        };
+        $pos++;
+      };
+
+      # Add token terms
+      foreach (@keys) {
+
+        # Add token annotation
+        my $length = $subtoken_offset[1] ? ($subtoken_offset[1]-$subtoken_offset[0]-1) : 0;
+        $stream->subtoken($subtoken_offset[0])->add_annotation('#' . $_, $length);
+      };
+    }
+
+    # Add span term to dictionary
+    elsif ($item->{'@type'} eq 'koral:span') {
+
+      # Create key string
+      my $key = '<>' . _term($item->{wrap});
+
+
+      # Add span to forward stream
+      my $length = $item->{subtokens}->[1] ? (
+        $item->{subtokens}->[-1] - $item->{subtokens}->[0]
+      ) : 0;
+      $stream->subtoken($item->{subtokens}->[0])->add_annotation($key, $length);
+    };
+  };
+
+  # Add any primary data remaining after the last token defined by offsets
+  if ($primary_index) {
+    my $preceding = substr($primary, $primary_index);
+    $stream->subtoken($pos, $preceding, '') if $preceding;
+
+    # TODO: Probably not a good idea
+    $primary_index = 0;
+  };
 };
 
-sub annotations {
-};
 
-# Return segment list or nothing
-sub _segment_list {
+# TODO: Use from_koral()->term
+# Potentially with a prefix
+sub _term {
+  my $item = shift;
+
+  my $key = '';
+  # Create term for term dictionary
+  if ($item->{foundry}) {
+    $key .= $item->{foundry};
+    if ($item->{layer}) {
+      $key .= '/' . $item->{layer};
+    }
+    $key .= '=';
+  };
+  return $key . ($item->{key} // '');
+}
+
+
+# Return subtoken list or nothing
+sub _subtokens {
   my $item = shift;
   my @posting;
 
-  if ($item->{segments}) {
+  if ($item->{subtokens}) {
 
-    @posting = ($item->{segments}->[0]);
+    # Remove!
+    push @posting, $item->{subtokens}->[0];
 
-   if ($item->{segments}->[1]) {
-
-     # The end is AFTER the second segment
-      push @posting, $item->{segments}->[1];
+    if ($item->{subtokens}->[1]) {
+      # The end is AFTER the second subtoken
+      push @posting, $item->{subtokens}->[1] + 1;
     };
 
     return @posting;
@@ -86,3 +310,29 @@
 
 
 1;
+
+
+__END__
+
+
+
+sub to_list {
+  my ($self, $doc_id, $replicant_id) = @_;
+};
+
+
+sub add {
+  # This will add the doc_id to id-field and
+  # this will add the replicant field (either __1:1 or __2:node_name).
+};
+
+
+sub to_forward_index {
+  # Only works after identification!
+  # This should, however, use a K::I::Store class!
+};
+
+
+1;
+
+__END__
commit	dd10fb91819cf3ac77a5032c145e22a8322b79f4	[log] [tgz]
author	Akron <nils@diewald-online.de>	Tue Aug 08 20:19:46 2017 +0200
committer	Akron <nils@diewald-online.de>	Tue Aug 08 20:19:46 2017 +0200
tree	565cceab31198f4ff088fdb95c3ad08845614c6f
parent	be96e283934339bbc0222019dc5e44fb139e120b [diff] [blame]