added case folding and diacritic removal

commit: 1972c12d4114e7e1dcfeef83f4012b33c4ac3f55 [log] [tgz]
author: Akron <nils@diewald-online.de> Tue Mar 14 17:46:55 2017 +0100
committer: Akron <nils@diewald-online.de> Tue Mar 14 17:46:55 2017 +0100
tree: d76fc22f71fa759614b598511072b6618e3ba21e
parent: e7585ceb1bafc8a7fe5b37b35269bc0f900425e2 [diff]
diff --git a/lib/Krawfish/Index/Store/Dynamic/Dictionary.pm b/lib/Krawfish/Index/Store/Dynamic/Dictionary.pm
index 4b7ac01..208c48c 100644
--- a/lib/Krawfish/Index/Store/Dynamic/Dictionary.pm
+++ b/lib/Krawfish/Index/Store/Dynamic/Dictionary.pm

@@ -7,6 +7,9 @@
 #   term(leaf-node)
 #
 # TODO:
+#   Add alias transitions, that point to a list of term ids.
+#
+# TODO:
 #   The self-optimizing application should also
 #   be used for autosuggestions.
 #   This will be a separate datastructure
@@ -26,7 +29,8 @@
   EQ_KID => 2,
   HI_KID => 3,
   TERM_ID => 4,
-  TERM_CHAR => '00'
+  TERM_CHAR => '00',
+  ALIAS_CHAR => '01'
 };
 
 # Code is based on Tree::Ternary
@@ -125,7 +129,26 @@
 };
 
 
-sub search_i;
+# TODO:
+#   Insert a term and store a term_id as an alias.
+#   If the term already exist, add the term_id to the term id array.
+#   This is useful for casefolded terms, that may refold to multiple
+#   term_ids (therefore useful for case insensitive searching).
+#   Or for accent insensitive searches.
+#   Another use case are cached regular expressions, like /.+?ratu.+?/,
+#   that are costly to search the dictionary for, but may easily be stored as an alias collection!
+sub insert_alias {
+  my ($self, $term, $term_id) = @_;
+  ...
+};
+
+
+# This will return an array of term ids,
+# in case the term is stored as an alias.
+# Otherwise the array has only one item.
+sub search_alias;
+
+
 
 sub prefix_lookup {
   my ($self, $prefix, $top_k) = @_;

diff --git a/lib/Krawfish/Index/Store/V1/Dictionary.pm b/lib/Krawfish/Index/Store/V1/Dictionary.pm
index bea59a1..64ca8cf 100644
--- a/lib/Krawfish/Index/Store/V1/Dictionary.pm
+++ b/lib/Krawfish/Index/Store/V1/Dictionary.pm

@@ -13,7 +13,7 @@
   HI_KID     => 3,
   TERM_ID    => 4,
   TERM_CHAR  => '00',
-  DEBUG      => 0
+  DEBUG      => 1
 };
 
 # TODO:
@@ -64,6 +64,16 @@
 };
 
 
+# read with header
+# This may first be done in the parent dictionary module
+# and then be delegated to the right version
+sub from_file;
+
+
+# write a header
+sub to_file;
+
+
 # Search for string
 # TODO:
 #   Support collation my $diff = collation_cmp($char, $node_char);
@@ -91,6 +101,7 @@
 
     print_log('v1_dict', "$char vs $node_char") if DEBUG;
 
+    # Check for right child
     if ($char lt $node_char) {
       print_log('v1_dict', "$char < $node_char") if DEBUG;
 
@@ -181,12 +192,14 @@
   #   - Use a stack or something similar to store the info and
   #     keep next/previous-diff-xor for the eq nodes.
   #   - All characters need to be treated as UCS2 (2-byte encoding)
+  #     or utf-32 (4-byte encoding)
   my $dynamic_node = shift;
 
   # Init with the first offset
-  my $parent_offset = 0;
-  my $node_offset = 0;
-  my $curr_offset = 0;
+  my $parent_offset  = 0;
+  my $node_offset    = 0;
+  my $curr_offset    = 0;
+  my $gparent_offset = 0;
 
   # Iterate over the tree breadth-first
   # TODO:
@@ -196,32 +209,54 @@
   #   In that case unique suffixes are stored in one page,
   #   which may be faster.
   #
-  my @queue = ([$node_offset, $parent_offset, $dynamic_node]);
+  my @queue = ([$node_offset, $parent_offset, $dynamic_node, $gparent_offset]);
   my @array = ();
   my @term_ids = ();
 
+  my $parent_of_bst = 0;
+
   # Iterate in a breadth-first manner
   while (scalar(@queue) > 0) {
 
     # Get offset information
     # - The node offset is the position of the parent pointer
     # - The offset is the position of the parent root
-    ($node_offset, $parent_offset, $dynamic_node) = @{shift @queue};
+    ($node_offset, $parent_offset, $dynamic_node, $gparent_offset) = @{shift @queue};
 
     # Get the current letter node as an array
     my ($length, $chars) = _complete_level_sort($dynamic_node);
 
-    # TODO:
-    #   Fix the parent nodes xor-eq-value
-
     # The next node array starts with a length indicator
     push @array, $length;
 
     # The current root position
     $curr_offset = $#array;
 
+    # Fix the parent nodes xor-eq-value
     if ($parent_offset > 0) {
+
+      if (DEBUG) {
+        # print_log('v1_dict', "The current bst parent is $parent_of_bst") if DEBUG;
+
+        print_log('v1_dict', "Fix the current parent eq node");
+        print_log('v1_dict',
+                  '  ' . $array[$parent_offset - 1] .
+                    " at $node_offset/$curr_offset with $gparent_offset:     " .
+                    $gparent_offset .
+                    " xor $parent_offset = " .
+                    ($gparent_offset ^ $parent_offset),
+          );
+      };
+
+      # TODO:
       $array[$parent_offset] ^= $curr_offset; # = $curr_offset
+    }
+
+    elsif (DEBUG) {
+      print_log(
+        'v1_dict',
+        "curr_offset is $curr_offset with parent_offset = $parent_offset"
+      );
     };
 
     foreach (@$chars) {
@@ -240,6 +275,7 @@
       #   O(1) for term id request and O(n) for term retrieval
       if ($_->[SPLIT_CHAR] eq TERM_CHAR) {
         # push @array, $parent_offset ^ $_->[TERM_ID]; #
+
         push @array, $curr_offset ^ $_->[TERM_ID];
 
         # store leaf node in term_id array
@@ -247,11 +283,16 @@
       }
       # Push temporary eq
       else {
-#        push @array, $parent_offset; # $curr_offset; # It's rather initialized 0 ^ $curr_offset
+        #        push @array, $parent_offset; # $curr_offset;
+        # It's rather initialized 0 ^ $curr_offset
         push @array, $curr_offset; # It's rather initialized 0 ^ $curr_offset
-        push @queue, [$curr_offset, $#array, $_->[EQ_KID]];
+
+        # Push the current EQ node on the queue
+        push @queue, [$curr_offset, $#array, $_->[EQ_KID], $parent_offset];
       };
     };
+
+    $parent_of_bst = $curr_offset;
   };
 
   return \@array, \@term_ids;
commit	1972c12d4114e7e1dcfeef83f4012b33c4ac3f55	[log] [tgz]
author	Akron <nils@diewald-online.de>	Tue Mar 14 17:46:55 2017 +0100
committer	Akron <nils@diewald-online.de>	Tue Mar 14 17:46:55 2017 +0100
tree	d76fc22f71fa759614b598511072b6618e3ba21e
parent	e7585ceb1bafc8a7fe5b37b35269bc0f900425e2 [diff]