| package Krawfish::Koral::Document; |
| use Krawfish::Koral::Document::Stream; |
| use Krawfish::Koral::Document::Fields; |
| use Krawfish::Koral::Query::Term; |
| use Krawfish::Log; |
| use Mojo::File; |
| use Mojo::JSON qw/encode_json decode_json/; |
| use strict; |
| use warnings; |
| use List::MoreUtils qw/uniq/; |
| |
| # Parses a document and creates a simple forward index list. |
| # |
| # primary='...', |
| # fields=[+field => title], |
| # terms=[*term => [postings*]] |
| # |
| # Then, when the document is added to certain nodes, |
| # the keys will be translated to term_ids and the document |
| # can be added with all freq_in_doc information |
| |
| # foundry and layer may need separated term_ids |
| # so they are exceptional small. |
| |
| # TODO: |
| # Don't forget to deal with TUIs! |
| |
| use constant DEBUG => 0; |
| |
| # Parse the document and create an inverted index file |
| sub new { |
| my $class = shift; |
| |
| my $self = bless { |
| # sortable => {}, |
| stream => Krawfish::Koral::Document::Stream->new, |
| fields => Krawfish::Koral::Document::Fields->new |
| }, $class; |
| |
| my $doc = shift; |
| |
| unless (ref $doc) { |
| $doc = decode_json(Mojo::File->new($doc)->slurp); |
| }; |
| |
| # Parse the document |
| $self->_parse($doc); |
| |
| return $self; |
| }; |
| |
| |
| # Get the stream object |
| sub stream { |
| $_[0]->{stream}; |
| }; |
| |
| |
| # Get the fields object |
| sub fields { |
| $_[0]->{fields}; |
| }; |
| |
| |
| # Translate all terms into term_ids and |
| # add unknown terms to the dictionary |
| sub identify { |
| my ($self, $dict) = @_; |
| $self->{fields} = $self->{fields}->identify($dict); |
| $self->{stream} = $self->{stream}->identify($dict); |
| return $self; |
| }; |
| |
| |
| # Stringification |
| sub to_string { |
| my ($self, $id) = @_; |
| return '[' . $self->fields->to_string($id) . ']' . $self->stream->to_string($id); |
| }; |
| |
| |
| # Parse the file and create a token-ordered document |
| sub _parse { |
| my ($self, $doc) = @_; |
| |
| # Get the document part |
| # This may - in the future - support multiple documents at once |
| $doc = $doc->{document}; |
| |
| my $primary = ''; |
| my $stream = $self->stream; |
| my $fields = $self->fields; |
| |
| # Remember the primary data for the creation |
| # of the forward index |
| if ($doc->{primaryData}) { |
| $primary = $doc->{primaryData}; |
| }; |
| |
| # Add metadata fields |
| my $pos = 0; |
| # my %sortable; |
| foreach my $field (@{$doc->{fields}}) { |
| |
| # TODO: |
| # Presort fields based on their field_key_id! |
| # In that way it's faster to retrieve presorted fields |
| # for enrichment! |
| |
| # Prepare field for sorting |
| # if ($field->{sortable}) { |
| # # Which entries need to be sorted? |
| # $sortable{$field->{key}}++; |
| # }; |
| |
| # Prepare for summarization |
| if (!$field->{type} || $field->{type} eq 'type:string') { |
| if (ref $field->{value} && ref $field->{value} eq 'ARRAY') { |
| |
| if (DEBUG) { |
| print_log('doc', 'Field ' . $field->{key} . ' is multivalued'); |
| }; |
| |
| my $key = $field->{key}; |
| |
| # Iterate over all field values and add the value |
| foreach my $value (@{$field->{value}}) { |
| $fields->add_string($key, $value); |
| }; |
| } |
| else { |
| $fields->add_string($field->{key}, $field->{value}); |
| }; |
| } |
| elsif ($field->{type} eq 'type:integer') { |
| $fields->add_int($field->{key}, $field->{value}); |
| } |
| elsif ($field->{type} eq 'type:store') { |
| $fields->add_store($field->{key}, $field->{value}); |
| } |
| else { |
| warn 'unknown field type: ' . $field->{type}; |
| }; |
| |
| # This will later be indexed for search as well as retrieval in |
| # the forward index. |
| }; |
| |
| # Check that the unique field is given, as this is required |
| # $self->{sortable} = \%sortable; |
| |
| my $primary_index = 0; |
| |
| # Get all subtokens |
| if ($doc->{subtokens}) { |
| |
| print_log('doc', 'Parse subtokens') if DEBUG; |
| |
| # Get all subtoken offsets |
| foreach my $subtoken (@{$doc->{subtokens}}) { |
| |
| # Get start and end of the subtoken |
| my ($start, $end) = @{$subtoken->{offsets}}; |
| |
| if (DEBUG) { |
| print_log( |
| 'doc', |
| 'Store subtoken: ' . $pos . '=' . join('-', $start, $end) |
| ); |
| }; |
| |
| # Get the term surface from the primary text |
| # TODO: |
| # Ensure that the offsets are valid! |
| my $preceding = substr($primary, $primary_index, $start - $primary_index) // ''; |
| my $term = substr($primary, $start, $end - $start); |
| $primary_index = $end; |
| |
| print_log('doc', 'Surface form is ' . $term) if DEBUG; |
| |
| $stream->subtoken($pos, $preceding, $term); |
| $pos++; |
| }; |
| }; |
| |
| |
| # There are tokens indexed by subtokens |
| if ($primary_index) { |
| my $preceding = substr($primary, $primary_index); |
| $stream->subtoken($pos, $preceding, '') if $preceding; |
| |
| # TODO: Probably not a good idea |
| $primary_index = 0; |
| }; |
| |
| |
| # Get all annotations |
| $pos = 0; |
| my $end; |
| foreach my $item (@{$doc->{annotations}}) { |
| |
| # Add token term to term dictionary |
| if ($item->{'@type'} eq 'koral:token') { |
| |
| unless ($item->{wrap}) { |
| warn 'No wrap defined in KoralQuery'; |
| CORE::next; |
| }; |
| |
| # Create key string |
| my $wrap = $item->{wrap}; |
| my @keys; |
| |
| # Token wraps a koral:termGroup |
| if ($wrap->{'@type'} && $wrap->{'@type'} eq 'koral:termGroup') { |
| foreach (@{$wrap->{operands}}) { |
| push @keys, _term($_); |
| }; |
| } |
| |
| # Token wraps a single koral:term |
| else { |
| push @keys, _term($wrap); |
| }; |
| |
| # Append posting to postings list |
| my @subtoken_offset = _subtokens($item); |
| |
| # There are no reference subtokens defined |
| unless (scalar @subtoken_offset) { |
| |
| # Use the current position for storing |
| push @subtoken_offset, $pos; |
| |
| # But there are offsets defined |
| if ($item->{offsets}) { |
| |
| # Get character definitions |
| my ($start, $end) = @{$item->{offsets}}; |
| |
| # Get the term surface from the primary text |
| # TODO: |
| # Ensure that the offsets are valid! |
| my $preceding = substr($primary, $primary_index, $start - $primary_index); |
| my $term = substr($primary, $start, $end - $start); |
| $primary_index = $end; |
| |
| $stream->subtoken($pos, $preceding, $term); |
| }; |
| $pos++; |
| }; |
| |
| # Add token terms |
| foreach (@keys) { |
| |
| # Add token annotation |
| # my $length = $subtoken_offset[1] ? ($subtoken_offset[1]-$subtoken_offset[0]-1) : 0; |
| $stream->subtoken( |
| $subtoken_offset[0] |
| )->add_annotation($_, $subtoken_offset[1] ? $subtoken_offset[1] : $subtoken_offset[0] + 1); |
| }; |
| } |
| |
| # Add span term to dictionary |
| elsif ($item->{'@type'} eq 'koral:span') { |
| |
| # Create key string |
| my $term = _term($item->{wrap}); |
| $term->term_type('span'); |
| |
| # Add span to forward stream |
| #my $length = $item->{subtokens}->[1] ? ( |
| # $item->{subtokens}->[-1] - $item->{subtokens}->[0] |
| #) : 0; |
| $stream->subtoken($item->{subtokens}->[0])->add_annotation( |
| $term, |
| $item->{subtokens}->[-1] + 1 |
| ); |
| }; |
| }; |
| |
| # There are tokens indexed by subtokens |
| if ($primary_index) { |
| my $preceding = substr($primary, $primary_index); |
| $stream->subtoken($pos, $preceding, '') if $preceding; |
| |
| # TODO: Probably not a good idea |
| $primary_index = 0; |
| }; |
| }; |
| |
| |
| # TODO: Use from_koral()->term |
| # Potentially with a prefix |
| sub _term { |
| my $item = shift; |
| my $term = Krawfish::Koral::Query::Term->new; |
| |
| if ($item->{foundry}) { |
| $term->foundry($item->{foundry}); |
| }; |
| |
| if ($item->{layer}) { |
| $term->layer($item->{layer}); |
| }; |
| |
| if ($item->{key}) { |
| $term->key($item->{key}); |
| }; |
| |
| if ($item->{value}) { |
| $term->value($item->{value}); |
| }; |
| |
| # Make token default term type |
| $term->term_type('token'); |
| |
| return $term; |
| |
| #my $key = ''; |
| ## Create term for term dictionary |
| #if ($item->{foundry}) { |
| # $key .= $item->{foundry}; |
| # if ($item->{layer}) { |
| # $key .= '/' . $item->{layer}; |
| # } |
| # $key .= '='; |
| #}; |
| #return $key . ($item->{key} // ''); |
| } |
| |
| |
| # Return subtoken list or nothing |
| sub _subtokens { |
| my $item = shift; |
| my @posting; |
| |
| if ($item->{subtokens}) { |
| |
| # Remove! |
| push @posting, $item->{subtokens}->[0]; |
| |
| if ($item->{subtokens}->[1]) { |
| # The end is AFTER the second subtoken |
| push @posting, $item->{subtokens}->[1] + 1; |
| }; |
| |
| return @posting; |
| }; |
| |
| return; |
| }; |
| |
| |
| 1; |
| |
| |
| __END__ |