| package Krawfish::Index; |
| use Krawfish::Index::Dictionary; |
| use Krawfish::Index::Subtokens; |
| use Krawfish::Index::PrimaryData; |
| use Krawfish::Index::Fields; |
| use Krawfish::Index::PostingsLive; |
| use Krawfish::Cache; |
| use Krawfish::Log; |
| use strict; |
| use warnings; |
| use Scalar::Util qw!blessed!; |
| use Mojo::JSON qw/encode_json decode_json/; |
| use Mojo::File; |
| |
| # TODO: This should be a base class for K::I::Static and K::I::Dynamic |
| |
| # TODO: Add LiveDocs-PostingsList, that supports deletion |
| # TODO: Live should store the last_doc value |
| |
| # |
| # TODO: Support multiple tokenized texts for parallel corpora |
| # |
| # TODO: Create Importer class |
| # |
| # TODO: Support Main Index and Auxiliary Indices with merging |
| # https://www.youtube.com/watch?v=98E1h_u4xGk |
| # |
| # TODO: Maybe logarithmic merge |
| # https://www.youtube.com/watch?v=VNjf2dxWH2Y&spfreload=5 |
| |
| # TODO: Maybe 65.535 documents are enough per segment ... |
| |
| # TODO: Build a forward index |
| # TODO: With a forward index, the subtokens offsets will no longer |
| # point to character positions in the primary text but to |
| # subtoken positions in the forward index! |
| |
| # TODO: |
| # Currently ranking is not collation based. It should be possible |
| # to define a collation per field and |
| # use one collation for prefix and suffix sorting. |
| # It may be beneficial to make a different sorting possible (though it's |
| # probably acceptable to make it slow) |
| # Use http://userguide.icu-project.org/collation |
| |
| # TODO: |
| # Reranking a field is not necessary, if the field value is already given. |
| # In that case, look up the dictionary if the value is already given, |
| # take the example doc of that field value and add the rank of that |
| # doc for the new doc. |
| # If the field is not yet given, take the next or previous value in dictionary |
| # order and use the rank to rerank the field (see K::I::Dictionary). |
| # BUT: This only works if the field has the same collation as the |
| # dictionary! |
| |
| # TODO: |
| # field names should have term_ids, so should foundries and layers, but |
| # probably not field values and annotation values. |
| # terms may have term_ids and subterms should have subterm_ids |
| |
| |
| use constant DEBUG => 0; |
| |
| |
| sub new { |
| my $class = shift; |
| my $file = shift; |
| my $self = bless { |
| file => $file |
| }, $class; |
| |
| print_log('index', 'Instantiate new index') if DEBUG; |
| |
| # Load dictionary |
| $self->{dict} = Krawfish::Index::Dictionary->new( |
| $self->{file} |
| ); |
| |
| # Load offsets |
| $self->{subtokens} = Krawfish::Index::Subtokens->new( |
| $self->{file} |
| ); |
| |
| # Load primary |
| $self->{primary} = Krawfish::Index::PrimaryData->new( |
| $self->{file} |
| ); |
| |
| # Load fields |
| $self->{fields} = Krawfish::Index::Fields->new( |
| $self->{file} |
| ); |
| |
| # Load live document pointer |
| $self->{live} = Krawfish::Index::PostingsLive->new( |
| $self->{file} |
| ); |
| |
| # Create a list of docid -> uuid mappers |
| # This may be problematic as uuids may need to be uint64, |
| # this can grow for a segment with 65.000 docs up to ~ 500kb |
| # Or ~ 7MB for 1,000,000 documents |
| # But this means it's possible to store |
| # 18.446.744.073.709.551.615 documents in the index |
| $self->{identifier} = []; |
| |
| # Collect fields to sort |
| $self->{sortable} = {}; |
| |
| # Collect values to sum |
| $self->{summable} = {}; |
| |
| # Add cache |
| $self->{cache} = Krawfish::Cache->new; |
| |
| return $self; |
| }; |
| |
| |
| # Get last document index |
| sub last_doc { |
| $_[0]->{live}->last_doc; |
| }; |
| |
| |
| # Alias for last doc |
| sub max_rank { |
| $_[0]->{live}->last_doc; |
| }; |
| |
| |
| # Get term dictionary |
| sub dict { |
| $_[0]->{dict}; |
| }; |
| |
| |
| # Get info |
| sub info { |
| $_[0]->{info}; |
| }; |
| |
| |
| # Get subtokens |
| sub subtokens { |
| $_[0]->{subtokens}; |
| }; |
| |
| |
| # Get live documents |
| sub live { |
| $_[0]->{live}; |
| }; |
| |
| |
| # Get primary |
| sub primary { |
| $_[0]->{primary}; |
| }; |
| |
| |
| # Get fields |
| sub fields { |
| $_[0]->{fields}; |
| }; |
| |
| |
| # Get field values for addition |
| sub field_values { |
| $_[0]->{field_values}; |
| }; |
| |
| |
| # Add document to the index |
| # TODO: Expect a KoralQuery document |
| # TODO: This should be specific to Krawfish::Index::Dynamic; |
| # TODO: Support update as a insert_after_delete |
| sub add { |
| my $self = shift; |
| my $doc = shift; |
| unless (ref $doc) { |
| $doc = decode_json(Mojo::File->new($doc)->slurp); |
| }; |
| |
| # Get new doc_id |
| my $doc_id = $self->live->incr; |
| |
| # Get document |
| $doc = $doc->{document}; |
| |
| # Store primary data |
| if ($doc->{primaryData}) { |
| |
| # TODO: This may, in the future, contain the forward index instead |
| $self->primary->store($doc_id, $doc->{primaryData}); |
| |
| print_log('index', 'Store primary data "' . $doc->{primaryData} . '"') if DEBUG; |
| }; |
| |
| my $pos = 0; |
| |
| # Store identifier for mappings |
| # But what is the purpose of the identifier? |
| # Isn't it okay to be slow here ... ? |
| if ($doc->{id}) { |
| $self->{identifier}->[$doc_id] = $doc->{id}; |
| }; |
| |
| my $dict = $self->{dict}; |
| |
| # Add metadata fields |
| my $fields = $self->fields; |
| foreach my $field (@{$doc->{fields}}) { |
| |
| # TODO: |
| # Also store 'id' as a field value |
| |
| # Add to document field (retrieval) |
| $fields->store($doc_id, $field->{key}, $field->{value}); |
| |
| # Prepare for summarization |
| # if ($field->{type} eq 'type:integer') { |
| # }; |
| |
| # Prepare field for sorting |
| if ($field->{sortable}) { |
| |
| # Which entries need to be sorted? |
| $self->{sortable}->{$field->{key}}++; |
| }; |
| |
| # Prepare field for summing |
| # if ($field->{summable}) { |
| # |
| # # Which entries need to be summable |
| # $self->{summable}->{$field->{key}}++; |
| # }; |
| |
| # Add to postings lists (search) |
| my $term = $field->{key} . ':' . $field->{value}; |
| my $post_list = $dict->add_term('+' . $term); |
| $post_list->append($doc_id); |
| }; |
| |
| my $subtokens = $self->subtokens; |
| |
| # The primary text is necessary for the subtoken index as well as |
| # for the forward index |
| my $primary = $doc->{primaryData}; |
| |
| # Store subtokens |
| if ($doc->{subtokens}) { |
| |
| print_log('index', 'Store subtokens') if DEBUG; |
| |
| # Store all subtoken offsets |
| foreach my $seg (@{$doc->{subtokens}}) { |
| |
| # Get start and end of the subtoken |
| my ($start, $end) = @{$seg->{offsets}}; |
| |
| if (DEBUG) { |
| print_log( |
| 'index', |
| 'Store subtoken: ' . $doc_id . ':' . $pos . '=' . join('-', $start, $end) |
| ); |
| }; |
| |
| # Get the term surface from the primary text |
| # TODO: Ensure that the offsets are valid! |
| my $term = substr($primary, $start, $end - $start); |
| |
| # TODO: There may be a prefix necessary for surface forms |
| # TODO: This may in fact be not necessary at all - |
| # The subtokens may have their own IDs |
| # And the terms do not need to be stored in the dictionary for retrieval ... |
| |
| # Add as a subterm |
| my $subterm_id = $dict->add_subterm($term); |
| |
| # TODO: |
| # Check somehow, if the term is new. If so, then { |
| # TODO: Store case insensitive term |
| # $dict->add_subterm_casefolded(fold_case($term), $subterm_id); |
| # $dict->add_subterm_without_diacritics(remove_diacritics($term), $subterm_id); |
| # } |
| |
| print_log('index', 'Surface form has subterm_id ' . $subterm_id) if DEBUG; |
| |
| # Store information to subtoken |
| $subtokens->store( |
| $doc_id, |
| $pos++, |
| $start, |
| $end, |
| $subterm_id, |
| $term # Probably not necessary! |
| ); |
| }; |
| }; |
| |
| # Get all tokens |
| $pos = 0; |
| my $end; |
| foreach my $item (@{$doc->{annotations}}) { |
| |
| # Add token term to term dictionary |
| if ($item->{'@type'} eq 'koral:token') { |
| |
| unless ($item->{wrap}) { |
| warn 'No wrap defined in KoralQuery'; |
| next; |
| }; |
| |
| # Create key string |
| my $wrap = $item->{wrap}; |
| my @keys; |
| |
| # Token wraps a koral:termGroup |
| if ($wrap->{'@type'} && $wrap->{'@type'} eq 'koral:termGroup') { |
| foreach (@{$wrap->{operands}}) { |
| push @keys, _term($_); |
| }; |
| } |
| |
| # Token wraps a single koral:term |
| else { |
| push @keys, _term($wrap); |
| }; |
| |
| # Append posting to postings list |
| my @subtokens = _subtokens($item); |
| |
| # No subtokens defined |
| unless (scalar @subtokens) { |
| push @subtokens, $pos; |
| |
| # Store offsets |
| if ($item->{offsets}) { |
| $subtokens->store($doc_id, $pos, @{$item->{offsets}}); |
| }; |
| $pos++; |
| }; |
| |
| # Add token terms |
| foreach (@keys) { |
| my $post_list = $dict->add_term($_); |
| $post_list->append($doc_id, @subtokens); |
| }; |
| } |
| |
| # Add span term to dictionary |
| elsif ($item->{'@type'} eq 'koral:span') { |
| |
| # Create key string |
| my $key = '<>' . _term($item->{wrap}); |
| |
| my $post_list = $dict->add_term($key); |
| |
| # Append posting to posting list |
| $post_list->append( |
| $doc_id, |
| $item->{subtokens}->[0], |
| # The end is AFTER the second subtoken |
| $item->{subtokens}->[-1] + 1 |
| ); |
| }; |
| }; |
| |
| return $doc_id; |
| }; |
| |
| |
| # TODO: Use from_koral()->term |
| # Potentially with a prefix |
| sub _term { |
| my $item = shift; |
| |
| my $key = ''; |
| # Create term for term dictionary |
| if ($item->{foundry}) { |
| $key .= $item->{foundry}; |
| if ($item->{layer}) { |
| $key .= '/' . $item->{layer}; |
| } |
| $key .= '='; |
| }; |
| return $key . ($item->{key} // ''); |
| } |
| |
| |
| # Return subtoken list or nothing |
| sub _subtokens { |
| my $item = shift; |
| my @posting; |
| |
| if ($item->{subtokens}) { |
| |
| # Remove! |
| push @posting, $item->{subtokens}->[0]; |
| |
| if ($item->{subtokens}->[1]) { |
| # The end is AFTER the second subtoken |
| push @posting, $item->{subtokens}->[1] + 1; |
| }; |
| |
| return @posting; |
| }; |
| |
| return; |
| }; |
| |
| |
| # Apply (aka search) the index |
| sub apply { |
| my $self = shift; |
| my $koral = shift; |
| |
| # Necessary for filtering |
| my $corpus = $koral->corpus->prepare_for($self) or return; |
| |
| # Add VC to query as a constraint |
| my $query = $koral->query->prepare_for($self, $corpus) or return; |
| |
| # Get meta information |
| my $meta = $koral->meta->prepare_for($self) or return; |
| |
| my $cb = shift; |
| my @result = (); |
| |
| # No callback - push to array |
| unless ($cb) { |
| while ($query->next) { |
| push @result, $query->current; |
| }; |
| return @result; |
| }; |
| |
| # Push callback |
| while ($query->next) { |
| $cb->($query->current); |
| }; |
| |
| }; |
| |
| |
| |
| 1; |
| |
| |
| __END__ |
| |
| |
| |
| # Search using meta data |
| # Can also be used to collect with a callback |
| # |
| sub search { |
| my ($self, $koral, $cb) = @_; |
| |
| my $query = $koral->query; |
| my $corpus = $koral->corpus; |
| my $meta = $koral->meta; |
| |
| # Initiate result object |
| my $result = $koral->result; |
| |
| # Get filtered search object |
| my $search = $query->filter_by($corpus)->plan_for($self); |
| |
| # Augment with facets |
| # Will add to result info |
| if ($meta->facets) { |
| $search = $meta->facets($search); |
| }; |
| |
| # Augment with counting |
| # Will add to result info |
| if ($meta->count) { |
| $search = $meta->count($search); |
| }; |
| |
| # Augment with sorting |
| if ($meta->sorted_by) { |
| $search = $meta->sorted_by($search); |
| }; |
| |
| # Augment with limitations |
| if ($meta->limit) { |
| $search = $meta->limit($search); |
| }; |
| |
| # Augment with field collector |
| # Will modify current match |
| $search = $meta->fields($search); |
| |
| # Augment with id creator |
| # Will modify current match |
| $search = $meta->id_create($search); |
| |
| # Augment with snmippet creator |
| # Will modify current match |
| $search = $meta->snippets($search); |
| |
| # Iterate over all matches |
| while ($search->next) { |
| |
| # Based on the information, this will populate the match |
| $result->add_match($search->current_match); |
| }; |
| |
| return $koral; |
| }; |
| |
| sub get_fields { |
| my ($self, $doc_id, $fields) = @_; |
| ... |
| }; |
| |
| # This returns the posting's start and end position |
| # when embedded in a span, e.g. <base/s=s> |
| sub get_context_by_query { |
| my ($self, $posting, $query) = @_ |
| }; |
| |
| sub get_annotations { |
| my ($self, $posting, $terms) = @_; |
| |
| my %anno = (); |
| |
| my $dict = $self->dict; |
| foreach my $term ($dict->terms($terms)) { |
| my $term_list = $dict->get($term); |
| |
| # Skip to the correct document and the first position |
| next unless $term_list->next($posting->doc_id, $posting->start); |
| |
| # Init annotation |
| my $anno = ($anno{$term} //= []); |
| |
| # Iterate over all annotations |
| while ($term_list->current->end <= $posting->end) { |
| |
| # Remember the annotations |
| push @$anno, $term_list->current->clone; |
| |
| $term_list->next or next; |
| } |
| |
| # Close (and forget) termlist |
| $term_list->close; |
| }; |
| |
| return \%anno; |
| }; |
| |
| |
| |
| |
| |
| sub items_per_page; |
| |
| sub start_page; |
| |
| sub apply { |
| my $self = shift; |
| my $query = $self->plan; |
| my $cb = shift; |
| my @result = (); |
| |
| # No callback - push to array |
| unless ($cb) { |
| while ($query->next) { |
| push @result, $query->current; |
| }; |
| return @result; |
| }; |
| |
| # Push callback |
| while ($query->next) { |
| $cb->($query->current); |
| }; |
| }; |