blob: c6e7d4eee996202908e98a71a9668d8367fcbb1a [file] [log] [blame]
package Krawfish::Index::Dictionary;
use Krawfish::Index::Dictionary::Collations;
use Krawfish::Util::String qw/normalize_nfkc/;
use strict;
use warnings;
use Krawfish::Log;
# TODO:
# Create a central prefix constant class!
# This class is the basic dictionary class. It provides a
# homogeneous interface to K::I::Dictionary::Dynamic and
# K::I::Dictionary::Static (versioned).
#
# All terms need to be searched and retrieved using
# NFD normalization. NFD will leave all terms intact for
# recreation (term_by_term_id) while making it possible to
# ignore diacritica during search.
#
# Dynamic:
# - A dynamic TST (either balancing or self-optimizing)
# Static:
# - A static TST (compact, fast (de)serializale,
# cache-optimized, small)
#
# Terms have different prefixes:
#
# terms: *
# (casefolded) '
# subterms: ~
# foundry: ^
# layer: &
# annotations
# token # (not yet supported)
# span <>
# relations <, >
# attributes @
# fields: +
# fieldkeys: !
#
# add_term:
# First the static dictionary will do a look-up if the term exists,
# then the dynamic dictionary will do an insert_or_search, meaning
# an existing term will return the term_id and a non-existing term
# will be added, returning a term_id.
#
# For casefolded search, it may be necessary to index the casefolded
# string as well, with the same term_id (though, the reverse lookup)
# is not possible. The same MAY be useful for diacriticless search.
#
# When a new subterm is added, a binary search on the static
# rank file is done to find the alphabetically preceeding term of
# the new term. This rank is then added to the dynamic rank.
# This is done for the prefix list as well as for the suffix list.
# It's beneficial to collect a bunch of new terms before doing the
# ranking, so the new terms are ordered alphabetically in advance
# before finding the preceding terms in the subterm rank file.
# In that way, each preceeding match using binary search will narrow
# the following binary search by moving the starting index.
# (THIS MAY BE OLD INFORMATION - consult Krawfish::Index::Rank)
#
# A data structure that supports relational ordering (let's say
# the previous term has the rank 3 and the folowing has the term
# 4 and you want to add it as 3.5) would be nice, but I wasn't
# able to find one that's precise in all circumstances.
#
# add_field($term):
# Identical to add_term(), but returns the term_id and a boolean value
# if the field was set to be sortable.
#
# term_to_term_id:
# First a look-up to the static dictionary is done.
# In case of failure, a lookup to the dynamic dictionary is done.
#
# search:
# Returns an array of valid term_ids.
# Required searches are:
# - casefolded
# - without_diacritics
# - regular expression
# - approximate matching
# - (wildcards)
# Both dictionaries are searched (maybe in parallel).
#
# term_id_to_term:
# In case the term_id is larger than the largest term_id of the
# static dictionary, make a reverse lookup in the dynamic dictionary,
# otherwise make a reverse lookup in the static dictionary.
# This feature may be more complicated when term_ids can be reused
# (see delete).
#
# delete:
# Not necessary, but could be implemented in both dictionaries.
# In the dynamic dictionary the branches are removed.
# In the static dictionary the term (and potentially branches)
# are marked as deleted.
# When both dictionaries are merged, removed terms may be
# ignored. Though - this feature has not really any benefits,
# as old term_ids can't be reused.
#
# merge:
# Merges the static dictionary with the dynamic dictionary and
# creates a new static dictionary.
# This will also merge the ranks.
#
# rank_subterm_prefix:
# Returns the numerical rank of a subterm in collated order.
# The static dictionary will return a simple even numerical rank
# (calculated on merge). The dynamic dictionary will
# return a simple odd numerical rank. Because odd ranks are not
# guaranteed to be unique, they need to be treated special
# in sort algorithms etc.
#
# rank_subterm_suffix
# see rank_subterm, but uses the reverse list.
# TODO:
# Create a fieldrange index
# like http://epic.awi.de/17813/1/Sch2007br.pdf
# This could be especially useful for dates with a specific date
# format, like [year-byte][year-byte][month-byte][day-byte][hour-byte][minute-byte][sec-byte]
#
# TODO:
# This should also take into account multi-ranges like described in
# https://github.com/KorAP/Krill/issues/17
#
# http://search.cpan.org/~davidiam/Set-SegmentTree-0.01/lib/Set/SegmentTree.pm
# https://en.wikipedia.org/wiki/Segment_tree
# TODO:
# collect_term_ids:
# This may be beneficial for co-occurrence search etc.
# Accepts a list of alphabetically sorted strings.
# The list will be prepared as a stream of common suffixes,
# like 'Banane', -4, 'jo', -3, 'ptist' ...
# In that way, a term search does not have to start at the
# dictionary root,
# but can go up just the necessary steps again and search
# further.
# As sorting and prefix-preparing may be quite time consuming,
# this may not be an option everytime.
# In addition, the collation needs to be identical.
# TODO:
# Currently all terms have a term_id, which may limit the whole dictionary
# to a finite number of terms (although this number can be pretty high
# and is only limited to a single node).
# The easiest (and recommended) solution is to treat hapax legomena
# special.
# TODO:
# At the moment, all terms have a same term_id mapping,
# though different term types may have different term_id mappings.
# TODO:
# Support case insensitivity
# TODO:
# Support aliases (e.g. a surface term may have the
# same postings list throughout foundries)
# TODO:
# Add suffix_search("ig"), in case the user searches for ".*ig".
# This may e.g. use a binary search in the suffix ranking.
# Alternatively, if trigrams are indexed, this would of course
# look for 'ig$'.
# TODO:
# Although subterms will not be requested by the user, they are
# requested, for example, by the term_id API for co-occurrence search.
# That's why all subterms need to be stored as well.
use constant DEBUG => 1;
sub new {
my $class = shift;
my $file = shift;
bless {
file => $file,
hash => {}, # Contain the dictionary
# Better:
# Only have
# dynamic => undef,
# static => undef,
# This will probably be one array in the future
prefix_rank => [],
suffix_rank => [],
ranked => 0,
# TEMP helper arrays for (sub)term_id -> (sub)term mapping
term_array => [],
subterm_array => [],
# Bookkeeping for (sub)term_ids
last_term_id => 1,
last_subterm_id => 1,
# There may be different IDs for
# fieldvalues, fieldkeys, terms, subterms ... etc.
# Collations store the collation id or the field => collation association
text_collation => undef,
field_collation => {},
collations => Krawfish::Index::Dictionary::Collations->new
}, $class;
};
sub add_term {
my ($self, $term) = @_;
# Normalize term to nfkc
$term = normalize_nfkc($term);
print_log('dict', "Added term $term") if DEBUG;
my $hash = $self->{hash};
# Term not in dictionary yet
unless (exists $hash->{$term}) {
# Increment term_id
# TODO: This may infact fail, as term_ids are limited in size.
# For hapax legomena, a special null marker will be returned
my $term_id = $self->{last_term_id}++;
# Set term to term_id
$hash->{$term} = $term_id;
# Store term for term_id mapping
$self->{term_array}->[$term_id] = $term;
};
return $hash->{$term};
};
sub collations {
$_[0]->{collations};
};
# Add a field to the index
# TODO:
# Currently the collation only accepts a locale
# but it should - in the future - probably accept a more
# parametric definition
sub add_field {
my ($self, $term, $locale) = @_;
if (DEBUG) {
print_log('dict', "Add field $term" . ($locale ? " with $locale" : ''));
};
# Check if term exists
my $term_id = $self->term_id_by_term('!' . $term);
# The term already exists
if ($term_id) {
# Check which collation was stored
my $stored_locale = $self->collation($term_id);
# The field was not meant to be sortable
if (!defined $locale && !defined $stored_locale) {
# Everything is fine
return $term_id;
}
# The field was meant to be sortable
elsif (defined $locale && defined $stored_locale &&
$locale eq $stored_locale) {
# The collations are compatible
return $term_id;
};
# Collations are not compatible
return;
};
# The term does not exist yet
$term_id = $self->add_term('!' . $term);
if (DEBUG) {
print_log('dict', "Add new collation $locale to field $term_id");
};
$self->{field_collation}->{$term_id} = $locale;
return $term_id;
};
# Get the collation of the base or the field id or undef, if not sortable
sub collation {
my ($self, $field_id) = @_;
my $locale = undef;
unless ($field_id) {
$locale = $self->{text_collation};
}
else {
$locale = $self->{field_collation}->{$field_id};
};
# Return the collation object
return unless $locale;
return 'NUM' if $locale ne 'NUM';
return $self->{collations}->get($locale);
};
# Add subterm to dictionary
# The subterm does not need to be searched, so the mechanism is only used
# to guarantee that subterms are unique.
# There may be a better data structure for this, though.
sub add_subterm {
my ($self, $subterm) = @_;
warn 'DEPRECATED';
print_log('dict', "Added subterm $subterm") if DEBUG;
my $hash = $self->{hash};
my $subterm_id;
# Subterm not in dictionary yet
unless ($subterm_id = $hash->{'~' . $subterm}) {
# Increment sub_term_id
# TODO: This may infact fail, as term_ids are limited in size.
# For hapax legomena, a special null marker will be returned
$subterm_id = $self->{last_subterm_id}++;
$self->{ranked} = 0;
# TODO: Based on the subterms, the rankings will be processed
# Store subterm for term_id mapping
$self->{subterm_array}->[$subterm_id] = $subterm;
# Add subterm to set
$hash->{'~' . $subterm} = $subterm_id;
};
return $subterm_id;
};
# Return the term from a term_id
# This needs to be fast (can't be done like this)
sub term_by_term_id {
my ($self, $term_id) = @_;
print_log('dict', 'Try to retrieve term id ' . $term_id) if DEBUG;
return $self->{term_array}->[$term_id];
};
sub subterm_by_subterm_id {
warn 'DEPRECATED';
my ($self, $subterm_id) = @_;
print_log('dict', 'Try to retrieve subterm id ' . $subterm_id) if DEBUG;
return $self->{subterm_array}->[$subterm_id];
};
# Returns a rank value by a certain subterm id
sub prefix_rank_by_subterm_id {
my ($self, $subterm_id) = @_;
$self->process_subterm_ranks;
$self->{prefix_rank}->[$subterm_id];
};
# Returns a rank value by a certain subterm id
sub suffix_rank_by_subterm_id {
my ($self, $subterm_id) = @_;
$self->process_subterm_ranks;
$self->{suffix_rank}->[$subterm_id];
};
# Returns the term id by a term
sub term_id_by_term {
my ($self, $term) = @_;
# Normalize term to nfkc
$term = normalize_nfkc($term);
print_log('dict', 'Try to retrieve term ' . $term) if DEBUG;
return $self->{hash}->{$term};
};
# Return the last charachters of a term
# - beneficial for substring sorting
sub suffix_by_term_id {
my ($self, $term_id, $length) = @_;
...
};
# Return a list of term_ids based on a regular expression
sub term_ids {
my ($self, $re) = @_;
if ($re) {
my $hash = $self->{hash};
return sort map { $hash->{$_} } grep { $_ =~ $re } keys %$hash;
};
return values %{$self->{hash}};
};
# Returns the terms based on the internal sorting
sub terms {
my $self = shift;
return sort keys %{$self->{hash}};
};
# This should be implemented in the dynamic dictionary only
sub process_subterm_ranks {
return if $_[0]->{ranked};
my $self = shift;
# Accept a collation for sorting
# see
# - https://dev.mysql.com/doc/refman/5.7/en/adding-collation.html
# - http://www.unicode.org/reports/tr10/tr10-30.html
my $collation = shift;
# TODO:
# For prefix rank:
# Iterate over prefix tree in in-node order
# and generate a rank for the subtree of the subterm.
# ...
# For suffix rank:
# Iterate over prefix tree
# and sort by suffix using
# bucketsort or mergesort to
# create a sorted rank
# Iterate over all subterms alphabetically
my $i = 0;
my @subt = grep { index($_, '~') == 0 } keys %{$self->{hash}};
foreach my $subterm (sort @subt) {
# Set subterm_id to prefix rank
$self->{prefix_rank}->[$i++] = $self->{hash}->{$subterm};
};
# Iterate over all subterms based on their suffixes
$i = 0;
foreach my $subterm (sort { reverse($a) cmp reverse($b) } @subt) {
# Set subterm_id to prefix rank
$self->{suffix_rank}->[$i++] = $self->{hash}->{$subterm};
};
$_[0]->{ranked} = 1;
};
1;