lib/Krawfish/Index/Dictionary.pm - KorAP/Krawfish-prototype - Gitiles

 package Krawfish::Index::Dictionary;
 use Krawfish::Index::Dictionary::Collations;
 use Krawfish::Util::String qw/normalize_nfkc/;
 use strict;
 use warnings;
 use Krawfish::Log;

 # TODO:
 #   Create a central prefix constant class!

 # This class is the basic dictionary class. It provides a
 # homogeneous interface to K::I::Dictionary::Dynamic and
 # K::I::Dictionary::Static (versioned).
 #
 # All terms need to be searched and retrieved using
 # NFD normalization. NFD will leave all terms intact for
 # recreation (term_by_term_id) while making it possible to
 # ignore diacritica during search.
 #
 #   Dynamic:
 #   - A dynamic TST (either balancing or self-optimizing)
 #   Static:
 #   - A static TST (compact, fast (de)serializale,
 #     cache-optimized, small)
 #
 # Terms have different prefixes:
 #
 #   terms:       *
 #   (casefolded) '
 #   subterms:    ~
 #   foundry:     ^
 #   layer:       &
 #   annotations
 #     token      #   (not yet supported)
 #     span       <>
 #     relations  <, >
 #     attributes @
 #   fields:      +
 #   fieldkeys:   !
 #
 # add_term:
 #   First the static dictionary will do a look-up if the term exists,
 #   then the dynamic dictionary will do an insert_or_search, meaning
 #   an existing term will return the term_id and a non-existing term
 #   will be added, returning a term_id.
 #
 #   For casefolded search, it may be necessary to index the casefolded
 #   string as well, with the same term_id (though, the reverse lookup)
 #   is not possible. The same MAY be useful for diacriticless search.
 #
 #   When a new subterm is added, a binary search on the static
 #   rank file is done to find the alphabetically preceeding term of
 #   the new term. This rank is then added to the dynamic rank.
 #   This is done for the prefix list as well as for the suffix list.
 #   It's beneficial to collect a bunch of new terms before doing the
 #   ranking, so the new terms are ordered alphabetically in advance
 #   before finding the preceding terms in the subterm rank file.
 #   In that way, each preceeding match using binary search will narrow
 #   the following binary search by moving the starting index.
 #   (THIS MAY BE OLD INFORMATION - consult Krawfish::Index::Rank)
 #
 #   A data structure that supports relational ordering (let's say
 #   the previous term has the rank 3 and the folowing has the term
 #   4 and you want to add it as 3.5) would be nice, but I wasn't
 #   able to find one that's precise in all circumstances.
 #
 # add_field($term):
 #   Identical to add_term(), but returns the term_id and a boolean value
 #   if the field was set to be sortable.
 #
 # term_to_term_id:
 #   First a look-up to the static dictionary is done.
 #   In case of failure, a lookup to the dynamic dictionary is done.
 #
 # search:
 #   Returns an array of valid term_ids.
 #   Required searches are:
 #     - casefolded
 #     - without_diacritics
 #     - regular expression
 #     - approximate matching
 #     - (wildcards)
 #   Both dictionaries are searched (maybe in parallel).
 #
 # term_id_to_term:
 #   In case the term_id is larger than the largest term_id of the
 #   static dictionary, make a reverse lookup in the dynamic dictionary,
 #   otherwise make a reverse lookup in the static dictionary.
 #   This feature may be more complicated when term_ids can be reused
 #   (see delete).
 #
 # delete:
 #   Not necessary, but could be implemented in both dictionaries.
 #   In the dynamic dictionary the branches are removed.
 #   In the static dictionary the term (and potentially branches)
 #   are marked as deleted.
 #   When both dictionaries are merged, removed terms may be
 #   ignored. Though - this feature has not really any benefits,
 #   as old term_ids can't be reused.
 #
 # merge:
 #   Merges the static dictionary with the dynamic dictionary and
 #   creates a new static dictionary.
 #   This will also merge the ranks.
 #
 # rank_subterm_prefix:
 #   Returns the numerical rank of a subterm in collated order.
 #   The static dictionary will return a simple even numerical rank
 #   (calculated on merge). The dynamic dictionary will
 #   return a simple odd numerical rank. Because odd ranks are not
 #   guaranteed to be unique, they need to be treated special
 #   in sort algorithms etc.
 #
 # rank_subterm_suffix
 #   see rank_subterm, but uses the reverse list.

 # TODO:
 #   Create a fieldrange index
 #   like http://epic.awi.de/17813/1/Sch2007br.pdf
 #   This could be especially useful for dates with a specific date
 #   format, like [year-byte][year-byte][month-byte][day-byte][hour-byte][minute-byte][sec-byte]
 #
 # TODO:
 #   This should also take into account multi-ranges like described in
 #   https://github.com/KorAP/Krill/issues/17
 #
 #   http://search.cpan.org/~davidiam/Set-SegmentTree-0.01/lib/Set/SegmentTree.pm
 #   https://en.wikipedia.org/wiki/Segment_tree

 # TODO:
 #   collect_term_ids:
 #     This may be beneficial for co-occurrence search etc.
 #     Accepts a list of alphabetically sorted strings.
 #     The list will be prepared as a stream of common suffixes,
 #     like 'Banane', -4, 'jo', -3, 'ptist' ...
 #     In that way, a term search does not have to start at the
 #     dictionary root,
 #     but can go up just the necessary steps again and search
 #     further.
 #     As sorting and prefix-preparing may be quite time consuming,
 #     this may not be an option everytime.
 #     In addition, the collation needs to be identical.

 # TODO:
 #   Currently all terms have a term_id, which may limit the whole dictionary
 #   to a finite number of terms (although this number can be pretty high
 #   and is only limited to a single node).
 #   The easiest (and recommended) solution is to treat hapax legomena
 #   special.

 # TODO:
 #   At the moment, all terms have a same term_id mapping,
 #   though different term types may have different term_id mappings.

 # TODO:
 #   Support case insensitivity

 # TODO:
 #   Support aliases (e.g. a surface term may have the
 #   same postings list throughout foundries)

 # TODO:
 #   Add suffix_search("ig"), in case the user searches for ".*ig".
 #   This may e.g. use a binary search in the suffix ranking.
 #   Alternatively, if trigrams are indexed, this would of course
 #   look for 'ig$'.

 # TODO:
 #   Although subterms will not be requested by the user, they are
 #   requested, for example, by the term_id API for co-occurrence search.
 #   That's why all subterms need to be stored as well.


 use constant DEBUG => 1;

 sub new {
   my $class = shift;
   my $file = shift;
   bless {
     file => $file,
     hash => {},   # Contain the dictionary

     # Better:
     # Only have
     # dynamic => undef,
     # static => undef,

     # This will probably be one array in the future
     prefix_rank => [],
     suffix_rank => [],
     ranked => 0,

     # TEMP helper arrays for (sub)term_id -> (sub)term mapping
     term_array => [],
     subterm_array => [],

     # Bookkeeping for (sub)term_ids
     last_term_id => 1,
     last_subterm_id => 1,

     # There may be different IDs for
     # fieldvalues, fieldkeys, terms, subterms ... etc.

     # Collations store the collation id or the field => collation association
     text_collation => undef,
     field_collation => {},
     collations => Krawfish::Index::Dictionary::Collations->new
   }, $class;
 };


 sub add_term {
   my ($self, $term) = @_;

   # Normalize term to nfkc
   $term = normalize_nfkc($term);

   print_log('dict', "Added term $term") if DEBUG;

   my $hash = $self->{hash};

   # Term not in dictionary yet
   unless (exists $hash->{$term}) {

     # Increment term_id
     # TODO: This may infact fail, as term_ids are limited in size.
     #   For hapax legomena, a special null marker will be returned
     my $term_id = $self->{last_term_id}++;

     # Set term to term_id
     $hash->{$term} = $term_id;

     # Store term for term_id mapping
     $self->{term_array}->[$term_id] = $term;
   };
   return $hash->{$term};
 };


 sub collations {
   $_[0]->{collations};
 };

 # Add a field to the index
 # TODO:
 #   Currently the collation only accepts a locale
 #   but it should - in the future - probably accept a more
 #   parametric definition
 sub add_field {
   my ($self, $term, $locale) = @_;

   if (DEBUG) {
     print_log('dict', "Add field $term" . ($locale ? " with $locale" : ''));
   };

   # Check if term exists
   my $term_id = $self->term_id_by_term('!' . $term);

   # The term already exists
   if ($term_id) {

     # Check which collation was stored
     my $stored_locale = $self->collation($term_id);

     # The field was not meant to be sortable
     if (!defined $locale && !defined $stored_locale) {

       # Everything is fine
       return $term_id;
     }

     # The field was meant to be sortable
     elsif (defined $locale && defined $stored_locale &&
           $locale eq $stored_locale) {

       # The collations are compatible
       return $term_id;
     };

     # Collations are not compatible
     return;
   };

   # The term does not exist yet
   $term_id = $self->add_term('!' . $term);

   if (DEBUG) {
     print_log('dict', "Add new collation $locale to field $term_id");
   };
   $self->{field_collation}->{$term_id} = $locale;
   return $term_id;
 };


 # Get the collation of the base or the field id or undef, if not sortable
 sub collation {
   my ($self, $field_id) = @_;
   my $locale = undef;
   unless ($field_id) {
     $locale = $self->{text_collation};
   }
   else {
     $locale = $self->{field_collation}->{$field_id};
   };

   # Return the collation object
   return unless $locale;
   return 'NUM' if $locale ne 'NUM';
   return $self->{collations}->get($locale);
 };


 # Add subterm to dictionary
 # The subterm does not need to be searched, so the mechanism is only used
 # to guarantee that subterms are unique.
 # There may be a better data structure for this, though.
 sub add_subterm {
   my ($self, $subterm) = @_;

   warn 'DEPRECATED';

   print_log('dict', "Added subterm $subterm") if DEBUG;

   my $hash = $self->{hash};
   my $subterm_id;

   # Subterm not in dictionary yet
   unless ($subterm_id = $hash->{'~' . $subterm}) {

     # Increment sub_term_id
     # TODO: This may infact fail, as term_ids are limited in size.
     #   For hapax legomena, a special null marker will be returned
     $subterm_id = $self->{last_subterm_id}++;

     $self->{ranked} = 0;

     # TODO: Based on the subterms, the rankings will be processed

     # Store subterm for term_id mapping
     $self->{subterm_array}->[$subterm_id] = $subterm;

     # Add subterm to set
     $hash->{'~' . $subterm} = $subterm_id;
   };

   return $subterm_id;
 };


 # Return the term from a term_id
 # This needs to be fast (can't be done like this)
 sub term_by_term_id {
   my ($self, $term_id) = @_;
   print_log('dict', 'Try to retrieve term id ' . $term_id) if DEBUG;
   return $self->{term_array}->[$term_id];
 };


 sub subterm_by_subterm_id {
   warn 'DEPRECATED';
   my ($self, $subterm_id) = @_;
   print_log('dict', 'Try to retrieve subterm id ' . $subterm_id) if DEBUG;
   return $self->{subterm_array}->[$subterm_id];

 };


 # Returns a rank value by a certain subterm id
 sub prefix_rank_by_subterm_id {
   my ($self, $subterm_id) = @_;
   $self->process_subterm_ranks;
   $self->{prefix_rank}->[$subterm_id];
 };


 # Returns a rank value by a certain subterm id
 sub suffix_rank_by_subterm_id {
   my ($self, $subterm_id) = @_;
   $self->process_subterm_ranks;
   $self->{suffix_rank}->[$subterm_id];
 };


 # Returns the term id by a term
 sub term_id_by_term {
   my ($self, $term) = @_;

   # Normalize term to nfkc
   $term = normalize_nfkc($term);

   print_log('dict', 'Try to retrieve term ' . $term) if DEBUG;
   return $self->{hash}->{$term};
 };


 # Return the last charachters of a term
 # - beneficial for substring sorting
 sub suffix_by_term_id {
   my ($self, $term_id, $length) = @_;
   ...
 };


 # Return a list of term_ids based on a regular expression
 sub term_ids {
   my ($self, $re) = @_;

   if ($re) {
     my $hash = $self->{hash};

     return sort map { $hash->{$_} } grep { $_ =~ $re } keys %$hash;
   };

   return values %{$self->{hash}};
 };


 # Returns the terms based on the internal sorting
 sub terms {
   my $self = shift;
   return sort keys %{$self->{hash}};
 };


 # This should be implemented in the dynamic dictionary only
 sub process_subterm_ranks {
   return if $_[0]->{ranked};
   my $self = shift;

   # Accept a collation for sorting
   # see
   # - https://dev.mysql.com/doc/refman/5.7/en/adding-collation.html
   # - http://www.unicode.org/reports/tr10/tr10-30.html
   my $collation = shift;

   # TODO:
   # For prefix rank:
   # Iterate over prefix tree in in-node order
   # and generate a rank for the subtree of the subterm.
   # ...
   # For suffix rank:
   # Iterate over prefix tree
   # and sort by suffix using
   # bucketsort or mergesort to
   # create a sorted rank

   # Iterate over all subterms alphabetically
   my $i = 0;
   my @subt = grep { index($_, '~') == 0 } keys %{$self->{hash}};
   foreach my $subterm (sort @subt) {

     # Set subterm_id to prefix rank
     $self->{prefix_rank}->[$i++] = $self->{hash}->{$subterm};
   };

   # Iterate over all subterms based on their suffixes
   $i = 0;
   foreach my $subterm (sort { reverse($a) cmp reverse($b) } @subt) {

     # Set subterm_id to prefix rank
     $self->{suffix_rank}->[$i++] = $self->{hash}->{$subterm};
   };

   $_[0]->{ranked} = 1;
 };


 1;
	package Krawfish::Index::Dictionary;
	use Krawfish::Index::Dictionary::Collations;
	use Krawfish::Util::String qw/normalize_nfkc/;
	use strict;
	use warnings;
	use Krawfish::Log;

	# TODO:
	# Create a central prefix constant class!

	# This class is the basic dictionary class. It provides a
	# homogeneous interface to K::I::Dictionary::Dynamic and
	# K::I::Dictionary::Static (versioned).
	#
	# All terms need to be searched and retrieved using
	# NFD normalization. NFD will leave all terms intact for
	# recreation (term_by_term_id) while making it possible to
	# ignore diacritica during search.
	#
	# Dynamic:
	# - A dynamic TST (either balancing or self-optimizing)
	# Static:
	# - A static TST (compact, fast (de)serializale,
	# cache-optimized, small)
	#
	# Terms have different prefixes:
	#
	# terms: *
	# (casefolded) '
	# subterms: ~
	# foundry: ^
	# layer: &
	# annotations
	# token # (not yet supported)
	# span <>
	# relations <, >
	# attributes @
	# fields: +
	# fieldkeys: !
	#
	# add_term:
	# First the static dictionary will do a look-up if the term exists,
	# then the dynamic dictionary will do an insert_or_search, meaning
	# an existing term will return the term_id and a non-existing term
	# will be added, returning a term_id.
	#
	# For casefolded search, it may be necessary to index the casefolded
	# string as well, with the same term_id (though, the reverse lookup)
	# is not possible. The same MAY be useful for diacriticless search.
	#
	# When a new subterm is added, a binary search on the static
	# rank file is done to find the alphabetically preceeding term of
	# the new term. This rank is then added to the dynamic rank.
	# This is done for the prefix list as well as for the suffix list.
	# It's beneficial to collect a bunch of new terms before doing the
	# ranking, so the new terms are ordered alphabetically in advance
	# before finding the preceding terms in the subterm rank file.
	# In that way, each preceeding match using binary search will narrow
	# the following binary search by moving the starting index.
	# (THIS MAY BE OLD INFORMATION - consult Krawfish::Index::Rank)
	#
	# A data structure that supports relational ordering (let's say
	# the previous term has the rank 3 and the folowing has the term
	# 4 and you want to add it as 3.5) would be nice, but I wasn't
	# able to find one that's precise in all circumstances.
	#
	# add_field($term):
	# Identical to add_term(), but returns the term_id and a boolean value
	# if the field was set to be sortable.
	#
	# term_to_term_id:
	# First a look-up to the static dictionary is done.
	# In case of failure, a lookup to the dynamic dictionary is done.
	#
	# search:
	# Returns an array of valid term_ids.
	# Required searches are:
	# - casefolded
	# - without_diacritics
	# - regular expression
	# - approximate matching
	# - (wildcards)
	# Both dictionaries are searched (maybe in parallel).
	#
	# term_id_to_term:
	# In case the term_id is larger than the largest term_id of the
	# static dictionary, make a reverse lookup in the dynamic dictionary,
	# otherwise make a reverse lookup in the static dictionary.
	# This feature may be more complicated when term_ids can be reused
	# (see delete).
	#
	# delete:
	# Not necessary, but could be implemented in both dictionaries.
	# In the dynamic dictionary the branches are removed.
	# In the static dictionary the term (and potentially branches)
	# are marked as deleted.
	# When both dictionaries are merged, removed terms may be
	# ignored. Though - this feature has not really any benefits,
	# as old term_ids can't be reused.
	#
	# merge:
	# Merges the static dictionary with the dynamic dictionary and
	# creates a new static dictionary.
	# This will also merge the ranks.
	#
	# rank_subterm_prefix:
	# Returns the numerical rank of a subterm in collated order.
	# The static dictionary will return a simple even numerical rank
	# (calculated on merge). The dynamic dictionary will
	# return a simple odd numerical rank. Because odd ranks are not
	# guaranteed to be unique, they need to be treated special
	# in sort algorithms etc.
	#
	# rank_subterm_suffix
	# see rank_subterm, but uses the reverse list.

	# TODO:
	# Create a fieldrange index
	# like http://epic.awi.de/17813/1/Sch2007br.pdf
	# This could be especially useful for dates with a specific date
	# format, like [year-byte][year-byte][month-byte][day-byte][hour-byte][minute-byte][sec-byte]
	#
	# TODO:
	# This should also take into account multi-ranges like described in
	# https://github.com/KorAP/Krill/issues/17
	#
	# http://search.cpan.org/~davidiam/Set-SegmentTree-0.01/lib/Set/SegmentTree.pm
	# https://en.wikipedia.org/wiki/Segment_tree

	# TODO:
	# collect_term_ids:
	# This may be beneficial for co-occurrence search etc.
	# Accepts a list of alphabetically sorted strings.
	# The list will be prepared as a stream of common suffixes,
	# like 'Banane', -4, 'jo', -3, 'ptist' ...
	# In that way, a term search does not have to start at the
	# dictionary root,
	# but can go up just the necessary steps again and search
	# further.
	# As sorting and prefix-preparing may be quite time consuming,
	# this may not be an option everytime.
	# In addition, the collation needs to be identical.

	# TODO:
	# Currently all terms have a term_id, which may limit the whole dictionary
	# to a finite number of terms (although this number can be pretty high
	# and is only limited to a single node).
	# The easiest (and recommended) solution is to treat hapax legomena
	# special.

	# TODO:
	# At the moment, all terms have a same term_id mapping,
	# though different term types may have different term_id mappings.

	# TODO:
	# Support case insensitivity

	# TODO:
	# Support aliases (e.g. a surface term may have the
	# same postings list throughout foundries)

	# TODO:
	# Add suffix_search("ig"), in case the user searches for ".*ig".
	# This may e.g. use a binary search in the suffix ranking.
	# Alternatively, if trigrams are indexed, this would of course
	# look for 'ig$'.

	# TODO:
	# Although subterms will not be requested by the user, they are
	# requested, for example, by the term_id API for co-occurrence search.
	# That's why all subterms need to be stored as well.


	use constant DEBUG => 1;

	sub new {
	my $class = shift;
	my $file = shift;
	bless {
	file => $file,
	hash => {}, # Contain the dictionary

	# Better:
	# Only have
	# dynamic => undef,
	# static => undef,

	# This will probably be one array in the future
	prefix_rank => [],
	suffix_rank => [],
	ranked => 0,

	# TEMP helper arrays for (sub)term_id -> (sub)term mapping
	term_array => [],
	subterm_array => [],

	# Bookkeeping for (sub)term_ids
	last_term_id => 1,
	last_subterm_id => 1,

	# There may be different IDs for
	# fieldvalues, fieldkeys, terms, subterms ... etc.

	# Collations store the collation id or the field => collation association
	text_collation => undef,
	field_collation => {},
	collations => Krawfish::Index::Dictionary::Collations->new
	}, $class;
	};


	sub add_term {
	my ($self, $term) = @_;

	# Normalize term to nfkc
	$term = normalize_nfkc($term);

	print_log('dict', "Added term $term") if DEBUG;

	my $hash = $self->{hash};

	# Term not in dictionary yet
	unless (exists $hash->{$term}) {

	# Increment term_id
	# TODO: This may infact fail, as term_ids are limited in size.
	# For hapax legomena, a special null marker will be returned
	my $term_id = $self->{last_term_id}++;

	# Set term to term_id
	$hash->{$term} = $term_id;

	# Store term for term_id mapping
	$self->{term_array}->[$term_id] = $term;
	};
	return $hash->{$term};
	};


	sub collations {
	$_[0]->{collations};
	};

	# Add a field to the index
	# TODO:
	# Currently the collation only accepts a locale
	# but it should - in the future - probably accept a more
	# parametric definition
	sub add_field {
	my ($self, $term, $locale) = @_;

	if (DEBUG) {
	print_log('dict', "Add field $term" . ($locale ? " with $locale" : ''));
	};

	# Check if term exists
	my $term_id = $self->term_id_by_term('!' . $term);

	# The term already exists
	if ($term_id) {

	# Check which collation was stored
	my $stored_locale = $self->collation($term_id);

	# The field was not meant to be sortable
	if (!defined $locale && !defined $stored_locale) {

	# Everything is fine
	return $term_id;
	}

	# The field was meant to be sortable
	elsif (defined $locale && defined $stored_locale &&
	$locale eq $stored_locale) {

	# The collations are compatible
	return $term_id;
	};

	# Collations are not compatible
	return;
	};

	# The term does not exist yet
	$term_id = $self->add_term('!' . $term);

	if (DEBUG) {
	print_log('dict', "Add new collation $locale to field $term_id");
	};
	$self->{field_collation}->{$term_id} = $locale;
	return $term_id;
	};



	# Get the collation of the base or the field id or undef, if not sortable
	sub collation {
	my ($self, $field_id) = @_;
	my $locale = undef;
	unless ($field_id) {
	$locale = $self->{text_collation};
	}
	else {
	$locale = $self->{field_collation}->{$field_id};
	};

	# Return the collation object
	return unless $locale;
	return 'NUM' if $locale ne 'NUM';
	return $self->{collations}->get($locale);
	};



	# Add subterm to dictionary
	# The subterm does not need to be searched, so the mechanism is only used
	# to guarantee that subterms are unique.
	# There may be a better data structure for this, though.
	sub add_subterm {
	my ($self, $subterm) = @_;

	warn 'DEPRECATED';

	print_log('dict', "Added subterm $subterm") if DEBUG;

	my $hash = $self->{hash};
	my $subterm_id;

	# Subterm not in dictionary yet
	unless ($subterm_id = $hash->{'~' . $subterm}) {

	# Increment sub_term_id
	# TODO: This may infact fail, as term_ids are limited in size.
	# For hapax legomena, a special null marker will be returned
	$subterm_id = $self->{last_subterm_id}++;

	$self->{ranked} = 0;

	# TODO: Based on the subterms, the rankings will be processed

	# Store subterm for term_id mapping
	$self->{subterm_array}->[$subterm_id] = $subterm;

	# Add subterm to set
	$hash->{'~' . $subterm} = $subterm_id;
	};

	return $subterm_id;
	};


	# Return the term from a term_id
	# This needs to be fast (can't be done like this)
	sub term_by_term_id {
	my ($self, $term_id) = @_;
	print_log('dict', 'Try to retrieve term id ' . $term_id) if DEBUG;
	return $self->{term_array}->[$term_id];
	};


	sub subterm_by_subterm_id {
	warn 'DEPRECATED';
	my ($self, $subterm_id) = @_;
	print_log('dict', 'Try to retrieve subterm id ' . $subterm_id) if DEBUG;
	return $self->{subterm_array}->[$subterm_id];

	};


	# Returns a rank value by a certain subterm id
	sub prefix_rank_by_subterm_id {
	my ($self, $subterm_id) = @_;
	$self->process_subterm_ranks;
	$self->{prefix_rank}->[$subterm_id];
	};


	# Returns a rank value by a certain subterm id
	sub suffix_rank_by_subterm_id {
	my ($self, $subterm_id) = @_;
	$self->process_subterm_ranks;
	$self->{suffix_rank}->[$subterm_id];
	};


	# Returns the term id by a term
	sub term_id_by_term {
	my ($self, $term) = @_;

	# Normalize term to nfkc
	$term = normalize_nfkc($term);

	print_log('dict', 'Try to retrieve term ' . $term) if DEBUG;
	return $self->{hash}->{$term};
	};




	# Return the last charachters of a term
	# - beneficial for substring sorting
	sub suffix_by_term_id {
	my ($self, $term_id, $length) = @_;
	...
	};


	# Return a list of term_ids based on a regular expression
	sub term_ids {
	my ($self, $re) = @_;

	if ($re) {
	my $hash = $self->{hash};

	return sort map { $hash->{$_} } grep { $_ =~ $re } keys %$hash;
	};

	return values %{$self->{hash}};
	};


	# Returns the terms based on the internal sorting
	sub terms {
	my $self = shift;
	return sort keys %{$self->{hash}};
	};


	# This should be implemented in the dynamic dictionary only
	sub process_subterm_ranks {
	return if $_[0]->{ranked};
	my $self = shift;

	# Accept a collation for sorting
	# see
	# - https://dev.mysql.com/doc/refman/5.7/en/adding-collation.html
	# - http://www.unicode.org/reports/tr10/tr10-30.html
	my $collation = shift;

	# TODO:
	# For prefix rank:
	# Iterate over prefix tree in in-node order
	# and generate a rank for the subtree of the subterm.
	# ...
	# For suffix rank:
	# Iterate over prefix tree
	# and sort by suffix using
	# bucketsort or mergesort to
	# create a sorted rank

	# Iterate over all subterms alphabetically
	my $i = 0;
	my @subt = grep { index($_, '~') == 0 } keys %{$self->{hash}};
	foreach my $subterm (sort @subt) {

	# Set subterm_id to prefix rank
	$self->{prefix_rank}->[$i++] = $self->{hash}->{$subterm};
	};

	# Iterate over all subterms based on their suffixes
	$i = 0;
	foreach my $subterm (sort { reverse($a) cmp reverse($b) } @subt) {

	# Set subterm_id to prefix rank
	$self->{suffix_rank}->[$i++] = $self->{hash}->{$subterm};
	};

	$_[0]->{ranked} = 1;
	};


	1;