Added some ideas for the forward index
diff --git a/lib/Krawfish/Index/ForwardIndex.pm b/lib/Krawfish/Index/ForwardIndex.pm
new file mode 100644
index 0000000..047567c
--- /dev/null
+++ b/lib/Krawfish/Index/ForwardIndex.pm
@@ -0,0 +1,56 @@
+package Krawfish::Index::ForwardIndex;
+use strict;
+use warnings;
+
+# This represents a forward index of the data,
+# accessible by document ID and byte offset.
+#
+# In the end this should replace the primary data.
+#
+# TODO:
+# On merge, first make a termID->termID-mapping based on the dictionary
+# merge. Then, convert the forward index based on this table without
+# dictionary lookup.
+#
+sub new {
+  # Set up the index for the given file; the file itself is not touched here
+  my ($class, $file) = @_;
+  # Document texts are held in an array indexed by document ID
+  my $self = { file => $file, forward => [] };
+  return bless $self, $class;
+};
+
+sub store {
+  my ($self, $doc_id, $text) = @_;
+  # Any text stored earlier under this document ID is replaced
+  $self->{forward}[$doc_id] = $text;
+};
+
+# Return the text of a document from $offset up to (excluding) $end
+sub get {
+  my ($self, $doc_id, $offset, $end) = @_;
+  my $text = $self->{forward}->[$doc_id] // return; # Nothing stored for this document
+  return substr($text, $offset, ($end // length $text) - $offset); # No $end: read to the end
+};
+
+# Return a stream of elements (primary text and annotations); not implemented yet
+sub get_annotated {
+  my ($self, $doc_id, $offset, $length, $foundry, $layer) = @_;
+  # Placeholder statement: dies with "Unimplemented" when called
+  ...
+};
+
+# Return the surface string only; not implemented yet
+sub get_surface {
+ my ($self, $doc_id, $offset, $length) = @_;
+ ... # Placeholder statement: dies with "Unimplemented" when called
+};
+
+# Add the document as an annotated stream; not implemented yet
+sub add_stream {
+  my ($self, $doc_id, $stream) = @_;
+  # Placeholder statement: dies with "Unimplemented" when called
+  ...
+};
+
+1;
diff --git a/lib/Krawfish/Index/Store/ForwardIndex.pm b/lib/Krawfish/Index/Store/ForwardIndex.pm
new file mode 100644
index 0000000..9648651
--- /dev/null
+++ b/lib/Krawfish/Index/Store/ForwardIndex.pm
@@ -0,0 +1,120 @@
+package Krawfish::Index::Store::ForwardIndex;
+use Krawfish::Index::Store::Util qw/enc_string
+ dec_string
+ enc_varint
+ dec_varint/;
+use strict;
+use warnings;
+
+# TODO:
+# The store should be versioned!
+#
+# TODO:
+# This should probably be renamed to ForwardStream,
+# while the index needs to contain an index pointing to the
+# offsets for the documents in question!
+#
+# TODO:
+# This should store all document data using
+# term-IDs (where possible).
+# Structure like
+# [length][subtoken-surface-token-ID][foundry-layer-ID][term] ...
+# [length][plain-text]
+# [length][subtoken-surface-token-ID][foundry-layer-ID][term] ...
+#
+# The plain text contains blanks, commas, etc.
+# The subtokens point to byte offsets in the highly compressed forward index.
+#
+
+
+use constant {
+ SUBTOKEN_MARKER => 0b0000_0000, # Written by _flush in front of every subtoken
+ PLAIN_TOKEN_MARKER => 0b1111_0000, # Written by add_plain in front of a plain string
+ PLAIN_MARKER => 0b1111_1111, # Precedes plain-string positions and data inside a subtoken
+ WS_SCHEME => 1 # Short string compression scheme optimized for whitespace
+};
+
+# Construct an empty store using the given short string compression scheme
+sub new {
+  my ($class, $short_string_compression_scheme) = @_;
+  # The state hash itself becomes the object, as all methods access $self->{...}
+  return bless {
+    buffer => '', # Contains subtokens
+    plain_tail => '', # Contains plain strings
+    plain_pos => 0, # Running position of the next plain string
+    stream => '', # Receives everything that was flushed
+    compression_scheme => $short_string_compression_scheme
+  }, $class;
+};
+
+sub pos; # Forward declaration only; no body is defined in this file
+
+# Add a term by its term ID; expects the foundry ID, the layer ID and the term ID
+sub add_term_id {
+  my ($self, $foundry_id, $layer_id, $term_id) = @_;
+  # No foundry and no layer (both 0) is taken to mark a surface term
+  # (assumption; TODO confirm), meaning this adds a new subtoken marker
+  if ($foundry_id == 0 && $layer_id == 0) {
+    $self->_flush;
+    $self->{buffer} .= enc_varint($term_id);
+  }
+  else {
+    $self->{buffer} .= $foundry_id . $layer_id;
+    $self->{buffer} .= enc_varint($term_id);
+  }
+};
+
+
+# Flush the buffer: append the pending subtoken to the stream and reset the state
+sub _flush {
+  my ($self) = @_;
+  my ($terms, $tail) = @{$self}{qw/buffer plain_tail/};
+
+  # NOTE(review): the numeric markers and the length are joined as decimal strings
+  my $size = length($terms . $tail) + 1;
+  $self->{stream} .= join '', SUBTOKEN_MARKER, $size, $terms, PLAIN_MARKER, $tail;
+
+  # Start the next subtoken with empty buffers
+  $self->{buffer} = '';
+  $self->{plain_tail} = '';
+  $self->{plain_pos} = 0;
+};
+
+# Add an annotation whose term is given as a string instead of a term ID.
+# Expects the foundry ID, the layer ID and the term string; the string is
+# encoded with the compression scheme passed to the constructor.
+sub add_term {
+  my ($self, $foundry_id, $layer_id, $term) = @_;
+
+  # No foundry and no layer (both 0) is taken to mark a surface term
+  # (assumption; TODO confirm), which starts a new subtoken
+  if ($foundry_id == 0 && $layer_id == 0) {
+    $self->_flush;
+  }
+  else {
+    $self->{buffer} .= $foundry_id . $layer_id;
+  }
+
+  # The buffer references the string by its running position,
+  # while the encoded string itself is collected in the plain tail
+  $self->{buffer} .= PLAIN_MARKER . enc_varint($self->{plain_pos}++);
+  $self->{plain_tail} .= PLAIN_MARKER . enc_string(
+    $term,
+    $self->{compression_scheme}
+  );
+};
+
+
+# Add a plain string (for example punctuation or whitespace),
+# encoded with the whitespace-optimized scheme
+sub add_plain {
+  my $self = shift;
+  my $string = shift;
+
+  # Flush the pending subtoken first, so the plain string follows it in the stream
+  $self->_flush;
+  $self->{stream} .= PLAIN_TOKEN_MARKER . enc_string($string, WS_SCHEME);
+};
+
+
+1;
diff --git a/lib/Krawfish/Index/Store/Util.pm b/lib/Krawfish/Index/Store/Util.pm
new file mode 100644
index 0000000..a5a78d4
--- /dev/null
+++ b/lib/Krawfish/Index/Store/Util.pm
@@ -0,0 +1,42 @@
+package Krawfish::Index::Store::Util;
+use parent 'Exporter';
+use strict;
+use warnings;
+
+our @EXPORT_OK = qw/enc_varint
+ dec_varint
+ enc_string
+ dec_string/;
+
+# This is not allowed to contain the markers of
+# Krawfish::Index::Store::ForwardIndex
+#
+# See, e.g.
+# https://github.com/antirez/smaz
+# https://en.wikipedia.org/wiki/Standard_Compression_Scheme_for_Unicode
+# https://tools.ietf.org/html/rfc1978
+# http://ed-von-schleck.github.io/shoco/
+#
+# The second parameter is the compression scheme, that may vary based on the language,
+# or the data type (e.g. plain data)
+sub enc_string ($$) { # Placeholder: the compression scheme is ignored
+  warn 'Short string encoding not implemented yet';
+  return shift; # The string is passed through unchanged
+};
+
+sub dec_string ($$) { # Placeholder: the compression scheme is ignored
+  warn 'Short string encoding not implemented yet';
+  return shift; # The string is passed through unchanged
+};
+
+sub enc_varint ($) { # Placeholder: no varint conversion takes place
+  warn 'varint encoding not implemented yet';
+  return shift; # The value is passed through unchanged
+};
+
+sub dec_varint ($) { # Placeholder: no varint conversion takes place
+  warn 'varint encoding not implemented yet';
+  return shift; # The value is passed through unchanged
+};
+
+1;