Added Koral::Document for ForwardIndex creation
diff --git a/lib/Krawfish/Koral/Document.pm b/lib/Krawfish/Koral/Document.pm
index 9dcb7c9..0a70d2e 100644
--- a/lib/Krawfish/Koral/Document.pm
+++ b/lib/Krawfish/Koral/Document.pm
@@ -1,81 +1,305 @@
package Krawfish::Koral::Document;
-use Krawfish::Koral::Query::Token;
+use Krawfish::Index::Forward::Stream;
+use Krawfish::Index::Forward::Fields;
+use Krawfish::Log;
+use Mojo::File;
+use Mojo::JSON qw/encode_json decode_json/;
use strict;
use warnings;
+use List::MoreUtils qw/uniq/;
-# Representation of a document
+# Parses a document and creates a simple forward index list.
+#
+# primary='...',
+# fields=[+field => title],
+# terms=[*term => [postings*]]
+#
+# Then, when the document is added to certain nodes,
+# the keys will be translated to term_ids and the document
+# can be added with all freq_in_doc information
+
+
+use constant DEBUG => 1;
+
+# Parse the document and create a forward index representation
sub new {
my $class = shift;
- my $self = bless {}, $class;
- return $self unless @_;
+ my $self = bless {
+ primary => '',
+ sortable => {},
+ stream => Krawfish::Index::Forward::Stream->new,
+ fields => Krawfish::Index::Forward::Fields->new
+ }, $class;
- my $koral = shift;
- if ($koral->{primaryData}) {
- $self->primary_data($koral->{primaryData});
+ my $doc = shift;
+
+ unless (ref $doc) {
+ $doc = decode_json(Mojo::File->new($doc)->slurp);
};
- # Parse segments
- if ($koral->{segments}) {
-
- # Todo: Parse and sort
- $self->{segments} = $koral->{segments};
- };
-
- # Parse annotations
- if ($koral->{annotations}) {
-
- # TODO: All annotations need to be wrapped
- my @annotations = ();
- foreach my $item (@{$koral->{annotations}}) {
- if ($item->{'@type'} eq 'koral:token') {
- my $token = Krawfish::Koral::Query::Token->new($item);
-
- unless (scalar $item->{segments}) {
- }
- };
- };
- };
+ # Parse the document
+ $self->_parse($doc);
return $self;
};
-# Primary data
+# Get the primary data
sub primary_data {
+ $_[0]->{primary};
+};
+
+
+sub stream {
+ $_[0]->{stream};
+};
+
+
+sub fields {
+ $_[0]->{fields};
+};
+
+
+sub sortable {
+ $_[0]->{sortable};
+};
+
+sub identify {
+ my ($self, $dict) = @_;
+ $self->{fields} = $self->{fields}->identify($dict);
+ $self->{stream} = $self->{stream}->identify($dict);
+ return $self;
+};
+
+
+sub to_string {
my $self = shift;
- if (@_) {
- $self->{primary_data} = shift;
+ return '[' . $self->fields->to_string . ']' . $self->stream->to_string;
+};
+
+
+# Parse the file and create a token-ordered document
+sub _parse {
+ my ($self, $doc) = @_;
+
+ # Get the document part
+ # This may - in the future - support multiple documents at once
+ $doc = $doc->{document};
+
+ my $primary = '';
+ my $stream = $self->stream;
+ my $fields = $self->fields;
+
+ # Remember the primary data for the creation
+ # of the forward index
+ if ($doc->{primaryData}) {
+ $primary = $doc->{primaryData};
};
- return $self->{primary_data};
-};
+
+ $self->{primary} = $primary;
+
+ # Add metadata fields
+ my $pos = 0;
+ my %sortable;
+ foreach my $field (@{$doc->{fields}}) {
+
+ # TODO:
+ # Presort fields based on their field_key_id!
+ # In that way it's faster to retrieve presorted fields
+ # for enrichment!
-# Segments
-sub segments {
- my $self = shift;
- if (@_) {
- $self->{segments} = shift;
+ # Prepare field for sorting
+ if ($field->{sortable}) {
+
+ # Which entries need to be sorted?
+ $sortable{$field->{key}}++;
+ };
+
+
+ # Prepare for summarization
+ if ($field->{type} && $field->{type} eq 'type:integer') {
+ $fields->add_int($field->{key}, $field->{value});
+ }
+ else {
+ $fields->add_string($field->{key}, $field->{value});
+ };
+
+ # This will later be indexed for search as well as retrieval in
+ # the forward index.
};
- return $self->{segments};
+
+ # Remember the sortable field keys (TODO: check that the required unique field is given)
+ $self->{sortable} = \%sortable;
+
+ my $primary_index = 0;
+
+ # Get all subtokens
+ if ($doc->{subtokens}) {
+
+ print_log('doc', 'Parse subtokens') if DEBUG;
+
+ # Get all subtoken offsets
+ foreach my $subtoken (@{$doc->{subtokens}}) {
+
+ # Get start and end of the subtoken
+ my ($start, $end) = @{$subtoken->{offsets}};
+
+ if (DEBUG) {
+ print_log(
+ 'doc',
+ 'Store subtoken: ' . $pos . '=' . join('-', $start, $end)
+ );
+ };
+
+ # Get the term surface from the primary text
+ # TODO:
+ # Ensure that the offsets are valid!
+ my $preceding = substr($primary, $primary_index, $start - $primary_index);
+ my $term = substr($primary, $start, $end - $start);
+ $primary_index = $end;
+
+ print_log('doc', 'Surface form is ' . $term) if DEBUG;
+
+ $stream->subtoken($pos, $preceding, $term);
+ $pos++;
+ };
+ };
+
+
+ # Subtokens consumed primary data - store the text following the last subtoken
+ if ($primary_index) {
+ my $preceding = substr($primary, $primary_index);
+ $stream->subtoken($pos, $preceding, '') if $preceding;
+
+ # TODO: Probably not a good idea
+ $primary_index = 0;
+ };
+
+
+ # Get all annotations
+ $pos = 0;
+ my $end;
+ foreach my $item (@{$doc->{annotations}}) {
+
+ # Add token annotations to the forward stream
+ if ($item->{'@type'} eq 'koral:token') {
+
+ unless ($item->{wrap}) {
+ warn 'No wrap defined in KoralQuery';
+ next;
+ };
+
+ # Create key string
+ my $wrap = $item->{wrap};
+ my @keys;
+
+ # Token wraps a koral:termGroup
+ if ($wrap->{'@type'} && $wrap->{'@type'} eq 'koral:termGroup') {
+ foreach (@{$wrap->{operands}}) {
+ push @keys, _term($_);
+ };
+ }
+
+ # Token wraps a single koral:term
+ else {
+ push @keys, _term($wrap);
+ };
+
+ # Get the subtoken positions the token refers to
+ my @subtoken_offset = _subtokens($item);
+
+ # There are no reference subtokens defined
+ unless (scalar @subtoken_offset) {
+
+ # Use the current position for storing
+ push @subtoken_offset, $pos;
+
+ # But there are offsets defined
+ if ($item->{offsets}) {
+
+ # Get character definitions
+ my ($start, $end) = @{$item->{offsets}};
+
+ # Get the term surface from the primary text
+ # TODO:
+ # Ensure that the offsets are valid!
+ my $preceding = substr($primary, $primary_index, $start - $primary_index);
+ my $term = substr($primary, $start, $end - $start);
+ $primary_index = $end;
+
+ $stream->subtoken($pos, $preceding, $term);
+ };
+ $pos++;
+ };
+
+ # Add token terms
+ foreach (@keys) {
+
+ # Add token annotation
+ my $length = $subtoken_offset[1] ? ($subtoken_offset[1]-$subtoken_offset[0]-1) : 0;
+ $stream->subtoken($subtoken_offset[0])->add_annotation('#' . $_, $length);
+ };
+ }
+
+ # Add span annotation to the forward stream
+ elsif ($item->{'@type'} eq 'koral:span') {
+
+ # Create key string
+ my $key = '<>' . _term($item->{wrap});
+
+
+ # Add span to forward stream
+ my $length = $item->{subtokens}->[1] ? (
+ $item->{subtokens}->[-1] - $item->{subtokens}->[0]
+ ) : 0;
+ $stream->subtoken($item->{subtokens}->[0])->add_annotation($key, $length);
+ };
+ };
+
+ # Tokens with own offsets consumed primary data - store the text following the last one
+ if ($primary_index) {
+ my $preceding = substr($primary, $primary_index);
+ $stream->subtoken($pos, $preceding, '') if $preceding;
+
+ # TODO: Probably not a good idea
+ $primary_index = 0;
+ };
};
-sub annotations {
-};
-# Return segment list or nothing
-sub _segment_list {
+# TODO: Use from_koral()->term
+# Potentially with a prefix
+sub _term {
+ my $item = shift;
+
+ my $key = '';
+ # Create term for term dictionary
+ if ($item->{foundry}) {
+ $key .= $item->{foundry};
+ if ($item->{layer}) {
+ $key .= '/' . $item->{layer};
+ }
+ $key .= '=';
+ };
+ return $key . ($item->{key} // '');
+}
+
+
+# Return subtoken list or nothing
+sub _subtokens {
my $item = shift;
my @posting;
- if ($item->{segments}) {
+ if ($item->{subtokens}) {
- @posting = ($item->{segments}->[0]);
+ # Remove!
+ push @posting, $item->{subtokens}->[0];
- if ($item->{segments}->[1]) {
-
- # The end is AFTER the second segment
- push @posting, $item->{segments}->[1];
+ if ($item->{subtokens}->[1]) {
+ # The end is AFTER the second subtoken
+ push @posting, $item->{subtokens}->[1] + 1;
};
return @posting;
@@ -86,3 +310,29 @@
1;
+
+
+__END__
+
+
+
+sub to_list {
+ my ($self, $doc_id, $replicant_id) = @_;
+};
+
+
+sub add {
+ # This will add the doc_id to id-field and
+ # this will add the replicant field (either __1:1 or __2:node_name).
+};
+
+
+sub to_forward_index {
+ # Only works after identification!
+ # This should, however, use a K::I::Store class!
+};
+
+
+1;
+
+__END__