blob: ac41d214fbeb8a71bc1f18a58da4effa22ca7111 [file] [log] [blame]
package Krawfish::Koral::Document;
use Krawfish::Koral::Document::Stream;
use Krawfish::Koral::Document::Fields;
use Krawfish::Koral::Query::Term;
use Krawfish::Log;
use Mojo::File;
use Mojo::JSON qw/encode_json decode_json/;
use strict;
use warnings;
use List::MoreUtils qw/uniq/;
# Parses a document and creates a simple forward index list.
#
# primary='...',
# fields=[+field => title],
# terms=[*term => [postings*]]
#
# Then, when the document is added to certain nodes,
# the keys will be translated to term_ids and the document
# can be added with all freq_in_doc information
# foundry and layer may need separated term_ids
# so they are exceptional small.
# TODO:
# Don't forget to deal with TUIs!
use constant DEBUG => 0;
# Parse the document and create an inverted index file
sub new {
my $class = shift;
my $self = bless {
# sortable => {},
stream => Krawfish::Koral::Document::Stream->new,
fields => Krawfish::Koral::Document::Fields->new
}, $class;
my $doc = shift;
unless (ref $doc) {
$doc = decode_json(Mojo::File->new($doc)->slurp);
};
# Parse the document
$self->_parse($doc);
return $self;
};
# Get the stream object
sub stream {
$_[0]->{stream};
};
# Get the fields object
sub fields {
$_[0]->{fields};
};
# Translate all terms into term_ids and
# add unknown terms to the dictionary
sub identify {
my ($self, $dict) = @_;
$self->{fields} = $self->{fields}->identify($dict);
$self->{stream} = $self->{stream}->identify($dict);
return $self;
};
# Stringification
sub to_string {
my ($self, $id) = @_;
return '[' . $self->fields->to_string($id) . ']' . $self->stream->to_string($id);
};
# Parse the file and create a token-ordered document
sub _parse {
my ($self, $doc) = @_;
# Get the document part
# This may - in the future - support multiple documents at once
$doc = $doc->{document};
my $primary = '';
my $stream = $self->stream;
my $fields = $self->fields;
# Remember the primary data for the creation
# of the forward index
if ($doc->{primaryData}) {
$primary = $doc->{primaryData};
};
# Add metadata fields
my $pos = 0;
# my %sortable;
foreach my $field (@{$doc->{fields}}) {
# TODO:
# Presort fields based on their field_key_id!
# In that way it's faster to retrieve presorted fields
# for enrichment!
# Prepare field for sorting
# if ($field->{sortable}) {
# # Which entries need to be sorted?
# $sortable{$field->{key}}++;
# };
# Prepare for summarization
if (!$field->{type} || $field->{type} eq 'type:string') {
if (ref $field->{value} && ref $field->{value} eq 'ARRAY') {
if (DEBUG) {
print_log('doc', 'Field ' . $field->{key} . ' is multivalued');
};
my $key = $field->{key};
# Iterate over all field values and add the value
foreach my $value (@{$field->{value}}) {
$fields->add_string($key, $value);
};
}
else {
$fields->add_string($field->{key}, $field->{value});
};
}
elsif ($field->{type} eq 'type:integer') {
$fields->add_int($field->{key}, $field->{value});
}
elsif ($field->{type} eq 'type:store') {
$fields->add_store($field->{key}, $field->{value});
}
else {
warn 'unknown field type: ' . $field->{type};
};
# This will later be indexed for search as well as retrieval in
# the forward index.
};
# Check that the unique field is given, as this is required
# $self->{sortable} = \%sortable;
my $primary_index = 0;
# Get all subtokens
if ($doc->{subtokens}) {
print_log('doc', 'Parse subtokens') if DEBUG;
# Get all subtoken offsets
foreach my $subtoken (@{$doc->{subtokens}}) {
# Get start and end of the subtoken
my ($start, $end) = @{$subtoken->{offsets}};
if (DEBUG) {
print_log(
'doc',
'Store subtoken: ' . $pos . '=' . join('-', $start, $end)
);
};
# Get the term surface from the primary text
# TODO:
# Ensure that the offsets are valid!
my $preceding = substr($primary, $primary_index, $start - $primary_index) // '';
my $term = substr($primary, $start, $end - $start);
$primary_index = $end;
print_log('doc', 'Surface form is ' . $term) if DEBUG;
$stream->subtoken($pos, $preceding, $term);
$pos++;
};
};
# There are tokens indexed by subtokens
if ($primary_index) {
my $preceding = substr($primary, $primary_index);
$stream->subtoken($pos, $preceding, '') if $preceding;
# TODO: Probably not a good idea
$primary_index = 0;
};
# Get all annotations
$pos = 0;
my $end;
foreach my $item (@{$doc->{annotations}}) {
# Add token term to term dictionary
if ($item->{'@type'} eq 'koral:token') {
unless ($item->{wrap}) {
warn 'No wrap defined in KoralQuery';
CORE::next;
};
# Create key string
my $wrap = $item->{wrap};
my @keys;
# Token wraps a koral:termGroup
if ($wrap->{'@type'} && $wrap->{'@type'} eq 'koral:termGroup') {
foreach (@{$wrap->{operands}}) {
push @keys, _term($_);
};
}
# Token wraps a single koral:term
else {
push @keys, _term($wrap);
};
# Append posting to postings list
my @subtoken_offset = _subtokens($item);
# There are no reference subtokens defined
unless (scalar @subtoken_offset) {
# Use the current position for storing
push @subtoken_offset, $pos;
# But there are offsets defined
if ($item->{offsets}) {
# Get character definitions
my ($start, $end) = @{$item->{offsets}};
# Get the term surface from the primary text
# TODO:
# Ensure that the offsets are valid!
my $preceding = substr($primary, $primary_index, $start - $primary_index);
my $term = substr($primary, $start, $end - $start);
$primary_index = $end;
$stream->subtoken($pos, $preceding, $term);
};
$pos++;
};
# Add token terms
foreach (@keys) {
# Add token annotation
# my $length = $subtoken_offset[1] ? ($subtoken_offset[1]-$subtoken_offset[0]-1) : 0;
$stream->subtoken(
$subtoken_offset[0]
)->add_annotation($_, $subtoken_offset[1] ? $subtoken_offset[1] : $subtoken_offset[0] + 1);
};
}
# Add span term to dictionary
elsif ($item->{'@type'} eq 'koral:span') {
# Create key string
my $term = _term($item->{wrap});
$term->term_type('span');
# Add span to forward stream
#my $length = $item->{subtokens}->[1] ? (
# $item->{subtokens}->[-1] - $item->{subtokens}->[0]
#) : 0;
$stream->subtoken($item->{subtokens}->[0])->add_annotation(
$term,
$item->{subtokens}->[-1] + 1
);
};
};
# There are tokens indexed by subtokens
if ($primary_index) {
my $preceding = substr($primary, $primary_index);
$stream->subtoken($pos, $preceding, '') if $preceding;
# TODO: Probably not a good idea
$primary_index = 0;
};
};
# TODO: Use from_koral()->term
# Potentially with a prefix
sub _term {
my $item = shift;
my $term = Krawfish::Koral::Query::Term->new;
if ($item->{foundry}) {
$term->foundry($item->{foundry});
};
if ($item->{layer}) {
$term->layer($item->{layer});
};
if ($item->{key}) {
$term->key($item->{key});
};
if ($item->{value}) {
$term->value($item->{value});
};
# Make token default term type
$term->term_type('token');
return $term;
#my $key = '';
## Create term for term dictionary
#if ($item->{foundry}) {
# $key .= $item->{foundry};
# if ($item->{layer}) {
# $key .= '/' . $item->{layer};
# }
# $key .= '=';
#};
#return $key . ($item->{key} // '');
}
# Return subtoken list or nothing
sub _subtokens {
my $item = shift;
my @posting;
if ($item->{subtokens}) {
# Remove!
push @posting, $item->{subtokens}->[0];
if ($item->{subtokens}->[1]) {
# The end is AFTER the second subtoken
push @posting, $item->{subtokens}->[1] + 1;
};
return @posting;
};
return;
};
1;
__END__