added case folding and diacritic removal
diff --git a/lib/Krawfish/Index.pm b/lib/Krawfish/Index.pm
index c76c249..5daecea 100644
--- a/lib/Krawfish/Index.pm
+++ b/lib/Krawfish/Index.pm
@@ -225,6 +225,13 @@
# Add as a subterm
my $subterm_id = $dict->add_subterm($term);
+ # TODO:
+ # Check somehow, if the term is new. If so, then {
+ # TODO: Store case insensitive term
+ # $dict->add_subterm_casefolded(fold_case($term), $subterm_id);
+ # $dict->add_subterm_without_diacritics(remove_diacritics($term), $subterm_id);
+ # }
+
print_log('index', 'Surface form has subterm_id ' . $subterm_id) if DEBUG;
# Store information to subtoken
diff --git a/lib/Krawfish/Index/Store/Dynamic/Dictionary.pm b/lib/Krawfish/Index/Store/Dynamic/Dictionary.pm
index 4b7ac01..208c48c 100644
--- a/lib/Krawfish/Index/Store/Dynamic/Dictionary.pm
+++ b/lib/Krawfish/Index/Store/Dynamic/Dictionary.pm
@@ -7,6 +7,9 @@
# term(leaf-node)
#
# TODO:
+# Add alias transitions, that point to a list of term ids.
+#
+# TODO:
# The self-optimizing application should also
# be used for autosuggestions.
# This will be a separate datastructure
@@ -26,7 +29,8 @@
EQ_KID => 2,
HI_KID => 3,
TERM_ID => 4,
- TERM_CHAR => '00'
+ TERM_CHAR => '00',
+ ALIAS_CHAR => '01'
};
# Code is based on Tree::Ternary
@@ -125,7 +129,26 @@
};
-sub search_i;
+# TODO:
+# Insert a term and store a term_id as an alias.
+# If the term already exist, add the term_id to the term id array.
+# This is useful for casefolded terms, that may refold to multiple
+# term_ids (therefore useful for case insensitive searching).
+# Or for accent insensitive searches.
+# Another use case are cached regular expressions, like /.+?ratu.+?/,
+# that are costly to search the dictionary for, but may easily be stored as an alias collection!
+sub insert_alias {
+ my ($self, $term, $term_id) = @_;
+ ...
+};
+
+
+# This will return an array of term ids,
+# in case the term is stored as an alias.
+# Otherwise the array has only one item.
+sub search_alias;
+
+
sub prefix_lookup {
my ($self, $prefix, $top_k) = @_;
diff --git a/lib/Krawfish/Index/Store/V1/Dictionary.pm b/lib/Krawfish/Index/Store/V1/Dictionary.pm
index bea59a1..64ca8cf 100644
--- a/lib/Krawfish/Index/Store/V1/Dictionary.pm
+++ b/lib/Krawfish/Index/Store/V1/Dictionary.pm
@@ -13,7 +13,7 @@
HI_KID => 3,
TERM_ID => 4,
TERM_CHAR => '00',
- DEBUG => 0
+ DEBUG => 1
};
# TODO:
@@ -64,6 +64,16 @@
};
+# read with header
+# This may first be done in the parent dictionary module
+# and then be delegated to the right version
+sub from_file;
+
+
+# write a header
+sub to_file;
+
+
# Search for string
# TODO:
# Support collation my $diff = collation_cmp($char, $node_char);
@@ -91,6 +101,7 @@
print_log('v1_dict', "$char vs $node_char") if DEBUG;
+ # Check for right child
if ($char lt $node_char) {
print_log('v1_dict', "$char < $node_char") if DEBUG;
@@ -181,12 +192,14 @@
# - Use a stack or something similar to store the info and
# keep next/previous-diff-xor for the eq nodes.
# - All characters need to be treated as UCS2 (2-byte encoding)
+ # or utf-32 (4-byte encoding)
my $dynamic_node = shift;
# Init with the first offset
- my $parent_offset = 0;
- my $node_offset = 0;
- my $curr_offset = 0;
+ my $parent_offset = 0;
+ my $node_offset = 0;
+ my $curr_offset = 0;
+ my $gparent_offset = 0;
# Iterate over the tree breadth-first
# TODO:
@@ -196,32 +209,54 @@
# In that case unique suffixes are stored in one page,
# which may be faster.
#
- my @queue = ([$node_offset, $parent_offset, $dynamic_node]);
+ my @queue = ([$node_offset, $parent_offset, $dynamic_node, $gparent_offset]);
my @array = ();
my @term_ids = ();
+ my $parent_of_bst = 0;
+
# Iterate in a breadth-first manner
while (scalar(@queue) > 0) {
# Get offset information
# - The node offset is the position of the parent pointer
# - The offset is the position of the parent root
- ($node_offset, $parent_offset, $dynamic_node) = @{shift @queue};
+ ($node_offset, $parent_offset, $dynamic_node, $gparent_offset) = @{shift @queue};
# Get the current letter node as an array
my ($length, $chars) = _complete_level_sort($dynamic_node);
- # TODO:
- # Fix the parent nodes xor-eq-value
-
# The next node array starts with a length indicator
push @array, $length;
# The current root position
$curr_offset = $#array;
+ # Fix the parent nodes xor-eq-value
if ($parent_offset > 0) {
+
+ if (DEBUG) {
+ # print_log('v1_dict', "The current bst parent is $parent_of_bst") if DEBUG;
+
+ print_log('v1_dict', "Fix the current parent eq node");
+ print_log('v1_dict',
+ ' ' . $array[$parent_offset - 1] .
+ " at $node_offset/$curr_offset with $gparent_offset: " .
+ $gparent_offset .
+ " xor $parent_offset = " .
+ ($gparent_offset ^ $parent_offset),
+ );
+ };
+
+ # TODO:
$array[$parent_offset] ^= $curr_offset; # = $curr_offset
+ }
+
+ elsif (DEBUG) {
+ print_log(
+ 'v1_dict',
+ "curr_offset is $curr_offset with parent_offset = $parent_offset"
+ );
};
foreach (@$chars) {
@@ -240,6 +275,7 @@
# O(1) for term id request and O(n) for term retrieval
if ($_->[SPLIT_CHAR] eq TERM_CHAR) {
# push @array, $parent_offset ^ $_->[TERM_ID]; #
+
push @array, $curr_offset ^ $_->[TERM_ID];
# store leaf node in term_id array
@@ -247,11 +283,16 @@
}
# Push temporary eq
else {
-# push @array, $parent_offset; # $curr_offset; # It's rather initialized 0 ^ $curr_offset
+ # push @array, $parent_offset; # $curr_offset;
+ # It's rather initialized 0 ^ $curr_offset
push @array, $curr_offset; # It's rather initialized 0 ^ $curr_offset
- push @queue, [$curr_offset, $#array, $_->[EQ_KID]];
+
+ # Push the current EQ node on the queue
+ push @queue, [$curr_offset, $#array, $_->[EQ_KID], $parent_offset];
};
};
+
+ $parent_of_bst = $curr_offset;
};
return \@array, \@term_ids;
diff --git a/lib/Krawfish/Util/String.pm b/lib/Krawfish/Util/String.pm
new file mode 100644
index 0000000..f104cd3
--- /dev/null
+++ b/lib/Krawfish/Util/String.pm
@@ -0,0 +1,33 @@
+package Krawfish::Util::String;
+use strict;
+use warnings;
+
+# Potentially use Unicode::UCD instead
+use Unicode::CaseFold;
+use Unicode::Normalize qw/getCombinClass normalize/;
+
+use parent 'Exporter';
+
+our @EXPORT = qw/fold_case remove_diacritics normalize_nfkc/;
+
+sub fold_case {
+ fc $_[0];
+};
+
+# http://archives.miloush.net/michkap/archive/2007/05/14/2629747.html
+# http://stackoverflow.com/questions/249087/how-do-i-remove-diacritics-accents-from-a-string-in-net#249126
+# http://stackoverflow.com/questions/2992066/code-to-strip-diacritical-marks-using-icu
+sub remove_diacritics {
+ my $norm = normalize('D',$_[0]);
+
+ # Check character properties with
+ # Unicode::Properties 'uniprops';
+ $norm =~ s/\p{InCombiningDiacriticalMarks}//g;
+ return normalize('C', $norm);
+};
+
+sub normalize_nfkc {
+ return normalize('KC',$_[0]);
+};
+
+1;
diff --git a/t/index/dictionary_static.t b/t/index/dictionary_static.t
index 89aed91..3d14567 100644
--- a/t/index/dictionary_static.t
+++ b/t/index/dictionary_static.t
@@ -21,12 +21,14 @@
is_deeply(Krawfish::Index::Store::V1::Dictionary::_complete_order(10), [7,4,9,2,6,8,10,1,3,5], 'Find a length order');
-
-#print join(',', $tst->store),"\n";
-
ok(my $complete = Krawfish::Index::Store::V1::Dictionary->from_dynamic($tst),
'Construct new complete tst');
+diag $complete->to_string(5);
+
+done_testing;
+__END__
+
ok(!$complete->search('a'), 'a is not part of the TST');
ok(!$complete->search('ba'), 'a is not part of the TST');
is($complete->search('ab'), 3, 'ab is part of the TST');
@@ -34,7 +36,6 @@
is($complete->search('abc'), 1, 'abc is part of the TST');
is($complete->search('bc'), 4, 'ab is part of the TST');
-#diag $complete->to_string(11);
#is($complete->term_by_term_id(4), '', 'Term by term id');
done_testing;
diff --git a/t/util/string.t b/t/util/string.t
new file mode 100644
index 0000000..5c0f294
--- /dev/null
+++ b/t/util/string.t
@@ -0,0 +1,24 @@
+#!/url/bin/env perl
+use strict;
+use warnings;
+use Test::More;
+use utf8;
+use Mojo::Util qw/encode decode/;
+
+use_ok('Krawfish::Util::String');
+
+is(fold_case('aaa'), 'aaa', 'Case fold 1');
+is(fold_case('AAA'), 'aaa', 'Case fold 2');
+is(fold_case('AaA'), 'aaa', 'Case fold 3');
+
+is(fold_case('aäa'), 'aäa', 'Case fold 4');
+is(fold_case('aÄß'), 'aäss', 'Case fold 5');
+is(fold_case('a-Äß'), 'a-äss', 'Case fold 6');
+is(fold_case('ÄÖÜß'), 'äöüss', 'Case fold 7');
+
+is(remove_diacritics('Česká'), 'Ceska', 'Removed diacritics');
+is(remove_diacritics('Äößa'), 'Aoßa', 'Removed diacritics');
+
+done_testing;
+__END__
+