Added some ideas for the forward index

commit: d2f9e69eeca2c6a648722f69cf9b5edeaa3ff5c6 [log] [tgz]
author: Akron <nils@diewald-online.de> Thu Feb 09 14:44:02 2017 +0100
committer: Akron <nils@diewald-online.de> Thu Feb 09 14:44:02 2017 +0100
tree: a09bba600e65fbfe3383dea4821b093c3c0e8e32
parent: 6ff7b485d3205b4a0a16d07c3a629b8a4375b226 [diff]
diff --git a/lib/Krawfish/Index/ForwardIndex.pm b/lib/Krawfish/Index/ForwardIndex.pm
new file mode 100644
index 0000000..047567c
--- /dev/null
+++ b/lib/Krawfish/Index/ForwardIndex.pm

@@ -0,0 +1,56 @@
+package Krawfish::Index::ForwardIndex;
+use strict;
+use warnings;
+
+# This represents a forward index of the data,
+# accessible by document ID and byte offset.
+#
+# In the end this should replace the primary data.
+#
+# TODO:
+#   On merge, first make a termID->termID-mapping based on the dictionary
+#   merge. Then, convert the forward index based on this table without
+#   dictionary lookup.
+#
+# Construct a new forward index.
+#
+# Parameters:
+#   $file - kept under the 'file' key of the object
+#
+# Returns the blessed index object; its 'forward' list,
+# which holds the document texts by document ID, starts out empty.
+sub new {
+  my ($class, $file) = @_;
+  my %self = (
+    file    => $file,
+    forward => []
+  );
+  return bless \%self, $class;
+};
+
+# Remember the text of a document under its document ID.
+# A text stored before under the same ID is overwritten.
+#
+# Parameters:
+#   $doc_id - numerical ID of the document (index into the 'forward' list)
+#   $text   - the text to store
+#
+# Returns the stored text.
+sub store {
+  my ($self, $doc_id, $text) = @_;
+  return $self->{forward}->[$doc_id] = $text;
+};
+
+# Retrieve a slice of a stored document text.
+#
+# Parameters:
+#   $doc_id - ID the text was stored under
+#   $offset - start position of the slice
+#   $end    - position at which the slice stops (exclusive);
+#             if omitted, the slice runs to the end of the text
+#
+# Returns the substring, or undef if no text is stored for $doc_id.
+sub get {
+  my ($self, $doc_id, $offset, $end) = @_;
+
+  my $text = $self->{forward}->[$doc_id];
+
+  # Unknown document: do not run substr() on undef, which would
+  # only emit warnings. A scalar undef (instead of an empty list)
+  # keeps the one-value return shape in list context.
+  return undef unless defined $text;
+
+  # Without an end position, return everything from $offset on
+  return substr($text, $offset) unless defined $end;
+
+  return substr($text, $offset, $end - $offset);
+};
+
+
+# Stream the elements of a document span, i.e. the primary text
+# together with its annotations.
+#
+# Parameters: $doc_id, $offset, $length, $foundry, $layer
+#
+# Not implemented yet - the body is a placeholder ("..."),
+# so calling this method dies with "Unimplemented".
+sub get_annotated {
+  my ($self, $doc_id, $offset, $length, $foundry, $layer) = @_;
+  ...
+};
+
+# Fetch the surface string only, without annotations.
+#
+# Parameters: $doc_id, $offset, $length
+#
+# Not implemented yet - the body is a placeholder ("..."),
+# so calling this method dies with "Unimplemented".
+sub get_surface {
+  my $self = shift;
+  my ($doc_id, $offset, $length) = @_;
+  ...
+};
+
+# Add a document to the index in the form of an annotated stream.
+#
+# Parameters: $doc_id, $stream
+#
+# Not implemented yet - the body is a placeholder ("..."),
+# so calling this method dies with "Unimplemented".
+sub add_stream {
+  my ($self, $doc_id, $stream) = @_;
+  ...
+};
+
+1;

diff --git a/lib/Krawfish/Index/Store/ForwardIndex.pm b/lib/Krawfish/Index/Store/ForwardIndex.pm
new file mode 100644
index 0000000..9648651
--- /dev/null
+++ b/lib/Krawfish/Index/Store/ForwardIndex.pm

@@ -0,0 +1,120 @@
+package Krawfish::Index::Store::ForwardIndex;
+use Krawfish::Index::Store::Util qw/enc_string
+                                    dec_string
+                                    enc_varint
+                                    dec_varint/;
+use strict;
+use warnings;
+
+# TODO:
+#   The store should be versioned!
+#
+# TODO:
+#   This should probably be renamed to ForwardStream,
+#   while the index needs to contain an index pointing to the
+#   offsets for the documents in question!
+#
+# TODO:
+#   This should store all document data using
+#   term-IDs (where possible).
+#   Structure like
+#   [length][subtoken-surface-token-ID][foundry-layer-ID][term] ...
+#   [length][plain-text]
+#   [length][subtoken-surface-token-ID][foundry-layer-ID][term] ...
+#
+#   The plain text contains blanks, commata, etc.
+#   The subtokens point to byte offsets in the highly compressed forward index.
+#
+
+
+# Markers written to the stream
+
+# Introduces a flushed subtoken record
+use constant SUBTOKEN_MARKER    => 0b0000_0000;
+
+# Introduces a plain string added via add_plain
+use constant PLAIN_TOKEN_MARKER => 0b1111_0000;
+
+# Precedes references to plain strings and the plain strings themselves
+use constant PLAIN_MARKER       => 0b1111_1111;
+
+# Short string compression scheme optimized for whitespace
+use constant WS_SCHEME          => 1;
+
+# Create a new, empty forward stream.
+#
+# Parameters:
+#   $short_string_compression_scheme - scheme handed to enc_string()
+#     for the term strings added via add_term()
+#
+# Returns the blessed stream object.
+sub new {
+  my ($class, $short_string_compression_scheme) = @_;
+
+  # All state lives in one hash that is blessed exactly once
+  return bless {
+    buffer     => '', # Contains subtokens
+    plain_tail => '', # Contains plain strings
+    plain_pos  => 0,  # Running number of the next plain string
+    stream     => '', # Everything flushed so far
+    compression_scheme => $short_string_compression_scheme
+  }, $class;
+};
+
+# Forward declaration only - no body for pos() is defined in this file.
+# NOTE(review): presumably meant to report the current position in the
+# stream; the name is shared with Perl's builtin pos() - TODO confirm.
+sub pos;
+
+# Add a term by its term ID to the buffer.
+#
+# Parameters:
+#   $foundry_id - ID of the foundry of the term
+#   $layer_id   - ID of the layer of the term
+#   $term_id    - ID of the term itself
+#
+# NOTE(review): a surface term is assumed to be signalled by
+# foundry ID and layer ID both being false (0 or undef) - confirm.
+sub add_term_id {
+  my ($self, $foundry_id, $layer_id, $term_id) = @_;
+
+  if (!$foundry_id && !$layer_id) {
+    # The term_id is a surface term,
+    # meaning this adds a new subtoken marker
+    $self->_flush;
+  }
+  else {
+    # Annotation: the term is preceded by its foundry and layer
+    $self->{buffer} .= $foundry_id . $layer_id;
+  }
+
+  $self->{buffer} .= enc_varint($term_id);
+};
+
+
+# Flush the buffer: append the buffered subtoken as one record
+# to the stream and reset the buffer state.
+#
+# A record consists of, in this order:
+#   SUBTOKEN_MARKER, length, buffer, PLAIN_MARKER, plain_tail
+# where length counts buffer and plain_tail, plus one.
+#
+# NOTE(review): markers and length are numbers and are concatenated
+# in their decimal string form - confirm whether bytes are intended.
+sub _flush {
+  my $self = shift;
+
+  my $buffer = $self->{buffer};
+  my $tail   = $self->{plain_tail};
+
+  # Nothing is buffered (e.g. before the first term or between two
+  # plain strings): do not append an empty record to the stream
+  return if $buffer eq '' && $tail eq '';
+
+  my $length = length($buffer . $tail) + 1;
+
+  $self->{stream} .=
+    SUBTOKEN_MARKER .
+    $length .
+    $buffer .
+    PLAIN_MARKER .
+    $tail;
+
+  $self->{buffer}     = '';
+  $self->{plain_tail} = '';
+  $self->{plain_pos}  = 0;
+};
+
+# Add an annotation by its term string.
+#
+# The encoded string is appended to the plain tail; the buffer only
+# receives a PLAIN_MARKER followed by the running number of the string.
+#
+# Parameters:
+#   $foundry_id - ID of the foundry of the term
+#   $layer_id   - ID of the layer of the term
+#   $term       - the term string, encoded with the compression
+#                 scheme passed to the constructor
+#
+# NOTE(review): a surface term is assumed to be signalled by
+# foundry ID and layer ID both being false (0 or undef) - confirm.
+sub add_term {
+  my ($self, $foundry_id, $layer_id, $term) = @_;
+
+  if (!$foundry_id && !$layer_id) {
+    # The term is a surface term,
+    # meaning this adds a new subtoken marker
+    $self->_flush;
+  }
+  else {
+    # Annotation: the term is preceded by its foundry and layer
+    $self->{buffer} .= $foundry_id . $layer_id;
+  }
+
+  # Both cases reference the string from the buffer
+  # and store the string itself in the plain tail
+  $self->{buffer}     .= PLAIN_MARKER . enc_varint($self->{plain_pos}++);
+  $self->{plain_tail} .= PLAIN_MARKER . enc_string(
+    $term,
+    $self->{compression_scheme}
+  );
+};
+
+
+# Write a plain string (for example punctuation, whitespace etc.)
+# to the stream.
+#
+# The buffer is flushed first. The string is then encoded with the
+# whitespace scheme and appended to the stream behind a PLAIN_TOKEN_MARKER.
+sub add_plain {
+  my $self   = shift;
+  my $string = shift;
+
+  $self->_flush;
+
+  my $encoded = enc_string($string, WS_SCHEME);
+  $self->{stream} .= PLAIN_TOKEN_MARKER . $encoded;
+};
+
+
+1;

diff --git a/lib/Krawfish/Index/Store/Util.pm b/lib/Krawfish/Index/Store/Util.pm
new file mode 100644
index 0000000..a5a78d4
--- /dev/null
+++ b/lib/Krawfish/Index/Store/Util.pm

@@ -0,0 +1,42 @@
+package Krawfish::Index::Store::Util;
+use parent 'Exporter';
+use strict;
+use warnings;
+
+our @EXPORT_OK = qw/enc_varint
+                    dec_varint
+                    enc_string
+                    dec_string/;
+
+# Encode a short string.
+#
+# The result is not allowed to contain the markers of
+# Krawfish::Index::Store::ForwardIndex
+#
+# See, e.g.
+#   https://github.com/antirez/smaz
+#   https://en.wikipedia.org/wiki/Standard_Compression_Scheme_for_Unicode
+#   https://tools.ietf.org/html/rfc1978
+#   http://ed-von-schleck.github.io/shoco/
+#
+# The first parameter is the string to encode.
+# The second parameter is the compression scheme, that may vary based
+# on the language, or the data type (e.g. plain data)
+#
+# Not implemented yet: warns and returns the string unchanged.
+sub enc_string ($$) {
+  my ($string, $scheme) = @_;
+  warn 'Short string encoding not implemented yet';
+  return $string;
+};
+
+# Decode a short string.
+# The first parameter is the encoded string, the second parameter
+# is the compression scheme.
+#
+# Not implemented yet: warns and returns the string unchanged.
+sub dec_string ($$) {
+  my ($encoded, $scheme) = @_;
+  warn 'Short string encoding not implemented yet';
+  return $encoded;
+};
+
+# Encode an integer as a varint.
+#
+# Not implemented yet: warns and returns the value unchanged.
+sub enc_varint ($) {
+  my $value = shift;
+  warn 'varint encoding not implemented yet';
+  return $value;
+};
+
+# Decode a varint to an integer.
+#
+# Not implemented yet: warns and returns the value unchanged.
+sub dec_varint ($) {
+  my $value = shift;
+  warn 'varint encoding not implemented yet';
+  return $value;
+};
+
+1;
commit	d2f9e69eeca2c6a648722f69cf9b5edeaa3ff5c6	[log] [tgz]
author	Akron <nils@diewald-online.de>	Thu Feb 09 14:44:02 2017 +0100
committer	Akron <nils@diewald-online.de>	Thu Feb 09 14:44:02 2017 +0100
tree	a09bba600e65fbfe3383dea4821b093c3c0e8e32
parent	6ff7b485d3205b4a0a16d07c3a629b8a4375b226 [diff]