added case folding and diacritic removal

commit: 1972c12d4114e7e1dcfeef83f4012b33c4ac3f55 [log] [tgz]
author: Akron <nils@diewald-online.de> Tue Mar 14 17:46:55 2017 +0100
committer: Akron <nils@diewald-online.de> Tue Mar 14 17:46:55 2017 +0100
tree: d76fc22f71fa759614b598511072b6618e3ba21e
parent: e7585ceb1bafc8a7fe5b37b35269bc0f900425e2 [diff]
diff --git a/lib/Krawfish/Index.pm b/lib/Krawfish/Index.pm
index c76c249..5daecea 100644
--- a/lib/Krawfish/Index.pm
+++ b/lib/Krawfish/Index.pm

@@ -225,6 +225,13 @@
       # Add as a subterm
       my $subterm_id = $dict->add_subterm($term);
 
+      # TODO:
+      #   Check somehow, if the term is new. If so, then {
+      #     TODO: Store case insensitive term
+      #       $dict->add_subterm_casefolded(fold_case($term), $subterm_id);
+      #       $dict->add_subterm_without_diacritics(remove_diacritics($term), $subterm_id);
+      #   }
+
       print_log('index', 'Surface form has subterm_id ' . $subterm_id) if DEBUG;
 
       # Store information to subtoken

diff --git a/lib/Krawfish/Index/Store/Dynamic/Dictionary.pm b/lib/Krawfish/Index/Store/Dynamic/Dictionary.pm
index 4b7ac01..208c48c 100644
--- a/lib/Krawfish/Index/Store/Dynamic/Dictionary.pm
+++ b/lib/Krawfish/Index/Store/Dynamic/Dictionary.pm

@@ -7,6 +7,9 @@
 #   term(leaf-node)
 #
 # TODO:
+#   Add alias transitions, that point to a list of term ids.
+#
+# TODO:
 #   The self-optimizing application should also
 #   be used for autosuggestions.
 #   This will be a separate datastructure
@@ -26,7 +29,8 @@
   EQ_KID => 2,
   HI_KID => 3,
   TERM_ID => 4,
-  TERM_CHAR => '00'
+  TERM_CHAR => '00',
+  ALIAS_CHAR => '01'
 };
 
 # Code is based on Tree::Ternary
@@ -125,7 +129,26 @@
 };
 
 
-sub search_i;
+# TODO:
+#   Insert a term and store a term_id as an alias.
+#   If the term already exist, add the term_id to the term id array.
+#   This is useful for casefolded terms, that may refold to multiple
+#   term_ids (therefore useful for case insensitive searching).
+#   Or for accent insensitive searches.
+#   Another use case are cached regular expressions, like /.+?ratu.+?/,
+#   that are costly to search the dictionary for, but may easily be stored as an alias collection!
+sub insert_alias {
+  my ($self, $term, $term_id) = @_;
+  ...
+};
+
+
+# This will return an array of term ids,
+# in case the term is stored as an alias.
+# Otherwise the array has only one item.
+sub search_alias;
+
+
 
 sub prefix_lookup {
   my ($self, $prefix, $top_k) = @_;

diff --git a/lib/Krawfish/Index/Store/V1/Dictionary.pm b/lib/Krawfish/Index/Store/V1/Dictionary.pm
index bea59a1..64ca8cf 100644
--- a/lib/Krawfish/Index/Store/V1/Dictionary.pm
+++ b/lib/Krawfish/Index/Store/V1/Dictionary.pm

@@ -13,7 +13,7 @@
   HI_KID     => 3,
   TERM_ID    => 4,
   TERM_CHAR  => '00',
-  DEBUG      => 0
+  DEBUG      => 1
 };
 
 # TODO:
@@ -64,6 +64,16 @@
 };
 
 
+# read with header
+# This may first be done in the parent dictionary module
+# and then be delegated to the right version
+sub from_file;
+
+
+# write a header
+sub to_file;
+
+
 # Search for string
 # TODO:
 #   Support collation my $diff = collation_cmp($char, $node_char);
@@ -91,6 +101,7 @@
 
     print_log('v1_dict', "$char vs $node_char") if DEBUG;
 
+    # Check for right child
     if ($char lt $node_char) {
       print_log('v1_dict', "$char < $node_char") if DEBUG;
 
@@ -181,12 +192,14 @@
   #   - Use a stack or something similar to store the info and
   #     keep next/previous-diff-xor for the eq nodes.
   #   - All characters need to be treated as UCS2 (2-byte encoding)
+  #     or utf-32 (4-byte encoding)
   my $dynamic_node = shift;
 
   # Init with the first offset
-  my $parent_offset = 0;
-  my $node_offset = 0;
-  my $curr_offset = 0;
+  my $parent_offset  = 0;
+  my $node_offset    = 0;
+  my $curr_offset    = 0;
+  my $gparent_offset = 0;
 
   # Iterate over the tree breadth-first
   # TODO:
@@ -196,32 +209,54 @@
   #   In that case unique suffixes are stored in one page,
   #   which may be faster.
   #
-  my @queue = ([$node_offset, $parent_offset, $dynamic_node]);
+  my @queue = ([$node_offset, $parent_offset, $dynamic_node, $gparent_offset]);
   my @array = ();
   my @term_ids = ();
 
+  my $parent_of_bst = 0;
+
   # Iterate in a breadth-first manner
   while (scalar(@queue) > 0) {
 
     # Get offset information
     # - The node offset is the position of the parent pointer
     # - The offset is the position of the parent root
-    ($node_offset, $parent_offset, $dynamic_node) = @{shift @queue};
+    ($node_offset, $parent_offset, $dynamic_node, $gparent_offset) = @{shift @queue};
 
     # Get the current letter node as an array
     my ($length, $chars) = _complete_level_sort($dynamic_node);
 
-    # TODO:
-    #   Fix the parent nodes xor-eq-value
-
     # The next node array starts with a length indicator
     push @array, $length;
 
     # The current root position
     $curr_offset = $#array;
 
+    # Fix the parent nodes xor-eq-value
     if ($parent_offset > 0) {
+
+      if (DEBUG) {
+        # print_log('v1_dict', "The current bst parent is $parent_of_bst") if DEBUG;
+
+        print_log('v1_dict', "Fix the current parent eq node");
+        print_log('v1_dict',
+                  '  ' . $array[$parent_offset - 1] .
+                    " at $node_offset/$curr_offset with $gparent_offset:     " .
+                    $gparent_offset .
+                    " xor $parent_offset = " .
+                    ($gparent_offset ^ $parent_offset),
+          );
+      };
+
+      # TODO:
       $array[$parent_offset] ^= $curr_offset; # = $curr_offset
+    }
+
+    elsif (DEBUG) {
+      print_log(
+        'v1_dict',
+        "curr_offset is $curr_offset with parent_offset = $parent_offset"
+      );
     };
 
     foreach (@$chars) {
@@ -240,6 +275,7 @@
       #   O(1) for term id request and O(n) for term retrieval
       if ($_->[SPLIT_CHAR] eq TERM_CHAR) {
         # push @array, $parent_offset ^ $_->[TERM_ID]; #
+
         push @array, $curr_offset ^ $_->[TERM_ID];
 
         # store leaf node in term_id array
@@ -247,11 +283,16 @@
       }
       # Push temporary eq
       else {
-#        push @array, $parent_offset; # $curr_offset; # It's rather initialized 0 ^ $curr_offset
+        #        push @array, $parent_offset; # $curr_offset;
+        # It's rather initialized 0 ^ $curr_offset
         push @array, $curr_offset; # It's rather initialized 0 ^ $curr_offset
-        push @queue, [$curr_offset, $#array, $_->[EQ_KID]];
+
+        # Push the current EQ node on the queue
+        push @queue, [$curr_offset, $#array, $_->[EQ_KID], $parent_offset];
       };
     };
+
+    $parent_of_bst = $curr_offset;
   };
 
   return \@array, \@term_ids;

diff --git a/lib/Krawfish/Util/String.pm b/lib/Krawfish/Util/String.pm
new file mode 100644
index 0000000..f104cd3
--- /dev/null
+++ b/lib/Krawfish/Util/String.pm

@@ -0,0 +1,33 @@
+package Krawfish::Util::String;
+use strict;
+use warnings;
+
+# Potentially use Unicode::UCD instead
+use Unicode::CaseFold;
+use Unicode::Normalize qw/getCombinClass normalize/;
+
+use parent 'Exporter';
+
+our @EXPORT = qw/fold_case remove_diacritics normalize_nfkc/;
+
+sub fold_case {
+  fc $_[0];
+};
+
+# http://archives.miloush.net/michkap/archive/2007/05/14/2629747.html
+# http://stackoverflow.com/questions/249087/how-do-i-remove-diacritics-accents-from-a-string-in-net#249126
+# http://stackoverflow.com/questions/2992066/code-to-strip-diacritical-marks-using-icu
+sub remove_diacritics {
+  my $norm = normalize('D',$_[0]);
+
+  # Check character properties with
+  # Unicode::Properties 'uniprops';
+  $norm =~ s/\p{InCombiningDiacriticalMarks}//g;
+  return normalize('C', $norm);
+};
+
+sub normalize_nfkc {
+  return normalize('KC',$_[0]);
+};
+
+1;

diff --git a/t/index/dictionary_static.t b/t/index/dictionary_static.t
index 89aed91..3d14567 100644
--- a/t/index/dictionary_static.t
+++ b/t/index/dictionary_static.t

@@ -21,12 +21,14 @@
 is_deeply(Krawfish::Index::Store::V1::Dictionary::_complete_order(10), [7,4,9,2,6,8,10,1,3,5], 'Find a length order');
 
 
-
-#print join(',', $tst->store),"\n";
-
 ok(my $complete = Krawfish::Index::Store::V1::Dictionary->from_dynamic($tst),
    'Construct new complete tst');
 
+diag $complete->to_string(5);
+
+done_testing;
+__END__
+
 ok(!$complete->search('a'), 'a is not part of the TST');
 ok(!$complete->search('ba'), 'a is not part of the TST');
 is($complete->search('ab'), 3, 'ab is part of the TST');
@@ -34,7 +36,6 @@
 is($complete->search('abc'), 1, 'abc is part of the TST');
 is($complete->search('bc'), 4, 'ab is part of the TST');
 
-#diag $complete->to_string(11);
 #is($complete->term_by_term_id(4), '', 'Term by term id');
 
 done_testing;

diff --git a/t/util/string.t b/t/util/string.t
new file mode 100644
index 0000000..5c0f294
--- /dev/null
+++ b/t/util/string.t

@@ -0,0 +1,24 @@
+#!/url/bin/env perl
+use strict;
+use warnings;
+use Test::More;
+use utf8;
+use Mojo::Util qw/encode decode/;
+
+use_ok('Krawfish::Util::String');
+
+is(fold_case('aaa'), 'aaa', 'Case fold 1');
+is(fold_case('AAA'), 'aaa', 'Case fold 2');
+is(fold_case('AaA'), 'aaa', 'Case fold 3');
+
+is(fold_case('aäa'), 'aäa', 'Case fold 4');
+is(fold_case('aÄß'), 'aäss', 'Case fold 5');
+is(fold_case('a-Äß'), 'a-äss', 'Case fold 6');
+is(fold_case('ÄÖÜß'), 'äöüss', 'Case fold 7');
+
+is(remove_diacritics('Česká'), 'Ceska', 'Removed diacritics');
+is(remove_diacritics('Äößa'), 'Aoßa', 'Removed diacritics');
+
+done_testing;
+__END__
+
commit	1972c12d4114e7e1dcfeef83f4012b33c4ac3f55	[log] [tgz]
author	Akron <nils@diewald-online.de>	Tue Mar 14 17:46:55 2017 +0100
committer	Akron <nils@diewald-online.de>	Tue Mar 14 17:46:55 2017 +0100
tree	d76fc22f71fa759614b598511072b6618e3ba21e
parent	e7585ceb1bafc8a7fe5b37b35269bc0f900425e2 [diff]