Rename Meta to Compile
Change-Id: Ief55855e2642a64672a840ece9d9fd876a49d373
diff --git a/lib/Krawfish/Compile/Segment/Group/AnnotationClasses.pm b/lib/Krawfish/Compile/Segment/Group/AnnotationClasses.pm
new file mode 100644
index 0000000..bba8b9e
--- /dev/null
+++ b/lib/Krawfish/Compile/Segment/Group/AnnotationClasses.pm
@@ -0,0 +1,11 @@
+# This should make it possible to search for classes
+# and group based on the annotations at the certain range.
+# This, however, is probably quite tricky as
+# there is no simple position based forward index with
+# term_ids for annotations, meaning that this
+# has to check the annotations in the complete forward index,
+# probably making this unusable slow.
+# but who knows ...
+
+# A query like
+# group_by_annotation_classes(1,"opennlp","p","Der {1:[]} Mann")
diff --git a/lib/Krawfish/Compile/Segment/Group/Characters.pm b/lib/Krawfish/Compile/Segment/Group/Characters.pm
new file mode 100644
index 0000000..e5b1da7
--- /dev/null
+++ b/lib/Krawfish/Compile/Segment/Group/Characters.pm
@@ -0,0 +1,66 @@
+package Krawfish::Compile::Segment::Group::Character;
+use Krawfish::Log;
+use strict;
+use warnings;
+
+
+# This groups on prefix or suffixes of subterms.
+# Necessary to support "Ansicht nach Wortendungen" for example.
+# It's possible to first group on terms and then - per term,
+# request the term surface in the dictionary and group by
+# the result.
+
+
+use constant DEBUG => 0;
+
+sub new {
+ my $class = shift;
+ bless {
+ segments => shift, # Krawfish::Index::Segments object
+ # TODO: May as well be a subtoken object
+ from_start => shift, # boolean - otherwise from end
+ char_count => shift
+ nrs => [@_]
+ }, $class;
+};
+
+
+sub get_group {
+ my ($self, $match) = @_;
+
+ # Get all classes from the match
+ my @classes = $match->get_classes($self->{nrs});
+
+ my $segments = $self->{segments};
+
+ my %group;
+
+ # Classes have nr, start, end
+ foreach my $class (sort { $a->start <=> $b->start } @classes) {
+
+ if ($self->{from_start}) {
+
+ # This will retrieve the segment from the segments stream
+ my $segment = $stream->get($match->doc_id, $class->start);
+
+ if ($segment->)
+
+ # The character count can be satisfied by the
+ my $first_chars = $segment->first_chars;
+
+ if (length($first_chars) <= $self->{char_count} {
+ substr($first_chars);
+ }
+
+ # Check, if the segment only spans one segment
+ if ($class->end != $class->start+1) {
+
+ };
+ }
+ else {
+ ...
+ };
+ };
+};
+
+1;
diff --git a/lib/Krawfish/Compile/Segment/Group/ClassFrequencies.pm b/lib/Krawfish/Compile/Segment/Group/ClassFrequencies.pm
new file mode 100644
index 0000000..529267f
--- /dev/null
+++ b/lib/Krawfish/Compile/Segment/Group/ClassFrequencies.pm
@@ -0,0 +1,215 @@
+package Krawfish::Compile::Segment::Group::ClassFrequencies;
+use parent 'Krawfish::Compile';
+use Krawfish::Koral::Result::Group::ClassFrequencies;
+use Krawfish::Log;
+use strict;
+use warnings;
+
+use constant DEBUG => 0;
+
+# Aggregate by content information, for example,
+# based on a certain class
+#
+# TODO:
+# Currently this only works for surface term_ids, but it may very well collect
+# arbitrary annotations! In that case, multiple annotations and different
+# annotation lengths have to be taken into account.
+#
+# TODO:
+# The special case of class 0 needs to be treated.
+#
+# TODO:
+# Support virtual corpus classes
+
+sub new {
+ my $class = shift;
+ my $self = bless {
+ forward_obj => shift,
+ query => shift,
+ classes => shift,
+ class_freq => {},
+ term_cache => {},
+ last_doc_id => -1
+ }, $class;
+
+ $self->{groups} = Krawfish::Koral::Result::Group::ClassFrequencies->new(
+ $self->{classes}
+ );
+
+ return $self;
+};
+
+
+# Initialize forward counter
+sub _init {
+ return if $_[0]->{forward_pointer};
+
+ my $self = shift;
+
+ print_log('g_class_freq', 'Create forward pointer') if DEBUG;
+
+ # Load the ranked list - may be too large for memory!
+ $self->{forward_pointer} = $self->{forward_obj}->pointer;
+};
+
+
+# Shorthand for "search through"
+sub finalize {
+ while ($_[0]->next) {};
+ return $_[0];
+};
+
+
+# Move to next match
+sub next {
+ my $self = shift;
+
+ $self->_init;
+
+ # No more matches
+ return unless $self->{query}->next;
+
+ # Get the current posting
+ my $current = $self->{query}->current;
+
+ my $groups = $self->{groups};
+ my $pointer = $self->{forward_pointer};
+
+ # Get the current doc_id and move to it
+ my $doc_id = $current->doc_id;
+
+ # Current doc_id differ - move forward
+ if ($doc_id != $self->{last_doc_id}) {
+
+ # Skip to doc
+ if ($pointer->skip_doc($doc_id) != $doc_id) {
+
+ # This should never happen, as for all docs there is a
+ # forward index!
+ return;
+ };
+
+ if (DEBUG) {
+ print_log('g_class_freq', "Moved forward index to $doc_id");
+ };
+
+ # Remember the last document
+ $self->{last_doc_id} = $doc_id;
+ };
+
+ # Remember terms (for overlap)
+ # with the structure pos -> term_id
+ # my $cache = $self->{term_cache};
+
+ # Collect classes - have the structure
+ # $classes[1] = [...term_id,0,0,term_id,...]
+ my @classes = ();
+ foreach (@{$self->{classes}}) {
+ $classes[$_] = []; # TODO: Length is normally the length of the match
+ };
+ my @term_cache = ();
+
+ # Get offset to simplify classes
+ my $offset = $current->start;
+
+ # TODO:
+ # There may be overlaps within following
+ # matches, so under certain circumstances the pointer
+ # needs to move backwards.
+ # It may be enough to do that at the beginning here.
+ # Or instead of skip_pos, the forward pointer can
+ # reposition automatically.
+
+ # Get class payloads
+ my @class_infos = $current->get_classes($self->{classes});
+
+ # Retrieve the requested classes for the current posting
+ foreach my $class_info (@class_infos) {
+
+ # Check class info
+ my ($nr, $start, $end) = @{$class_info};
+
+ # Iterate over the class and collect term ids
+ for (my $i = $start; $i < $end; $i++) {
+
+ # The relative position of the class in the match
+ my $rel_pos = $i - $offset;
+
+ # Check if term already retrieved
+ if ($term_cache[$rel_pos]) {
+
+ # Copy to class
+ $classes[$nr]->[$rel_pos] = $term_cache[$rel_pos];
+ }
+
+ # TODO:
+ # Check for term cache
+ # if ($term_cache->{$start})
+
+ # Retrieve term id
+ else {
+
+ # Collect all terms from start to end
+ if ($pointer->skip_pos($i)) {
+
+ # Add term id to class in correct order
+ my $term_id = $pointer->current->term_id;
+
+ if (DEBUG) {
+ print_log(
+ 'g_class_freq',
+ "Term id at position $i is #" . $term_id);
+ };
+
+
+ # Set term_id at relative position in term_cache
+ $term_cache[$rel_pos] = $classes[$nr]->[$rel_pos] = $term_id;
+ };
+ }
+ };
+ };
+
+ if (DEBUG) {
+ print_log('g_class_freq', 'term cache is ' . join(',', @term_cache));
+ };
+
+ # The signature has the structure [class, term_id*]+
+ my @sig = ();
+
+ # Iterate over all classes
+ foreach my $nr (@{$self->{classes}}) {
+ push @sig, $nr;
+
+ foreach (@{$classes[$nr]}) {
+ push @sig, $_ if defined $_; # Add all set term_ids
+ };
+
+ push @sig, 0;
+ };
+
+ # Increment per match
+ $self->{groups}->incr_match(\@sig);
+
+ return 1;
+};
+
+
+sub current {
+ return $_[0]->{query}->current;
+};
+
+
+# Get collection
+sub collection {
+ $_[0]->{groups};
+};
+
+
+sub to_string {
+ my $self = shift;
+ my $str = 'gClassFreq(' . join(',', @{$self->{classes}}) .
+ ':' . $self->{query}->to_string . ')';
+ return $str;
+};
+
+1;
diff --git a/lib/Krawfish/Compile/Segment/Group/Classes.pm b/lib/Krawfish/Compile/Segment/Group/Classes.pm
new file mode 100644
index 0000000..af286ed
--- /dev/null
+++ b/lib/Krawfish/Compile/Segment/Group/Classes.pm
@@ -0,0 +1,121 @@
+package Krawfish::Compile::Segment::Group::Classes;
+use Krawfish::Log;
+use strict;
+use warnings;
+
+# TODO:
+# The name is somehow misleading, as this will only
+# group by surface terms.
+
+# TODO:
+# Refer to Krawfish::Compile::Segment::TermIDs!
+
+use constant {
+ DEBUG => 0,
+ NR => 0,
+ START_POS => 1,
+ END_POS => 2,
+};
+
+
+sub new {
+ my $class = shift;
+ bless {
+ segment => shift,
+ nrs => @_ ? [sort @_] : undef,
+ groups => {},
+ }, $class;
+};
+
+
+# Get the group signature for each match
+sub get_group {
+ my ($self, $match) = @_;
+
+ # Get all classes from the match
+ # Classes need to be sorted by start position
+ # to be retrievable, in case the subtokens-Stream
+ # is implemented as a postingslist (probably not)
+ my @classes = $match->get_classes_sorted($self->{nrs});
+
+ my $subtokens = $self->{segment}->subtokens;
+
+ my %class_group;
+
+ # Classes have nr, start, end
+ foreach my $class (@classes) {
+
+ # WARNING! CLASSES MAY OVERLAP SO SUBTOKENS SHOULD BE CACHED OR BUFFERED!
+
+ # Get start position
+ my $start = $class->[START_POS];
+
+ my @seq = ();
+
+ # Receive subtoken
+ my $subt = $subtokens->get($match->doc_id, $start);
+
+ # Push term id to subtoken
+ # TODO: A subtoken should have accessors
+ push (@seq, $subt->[2]);
+
+ while ($start < ($class->[END_POS] -1)) {
+ $subt = $subtokens->get($match->doc_id, ++$start);
+
+ # Push subterm id to subtoken
+ push (@seq, $subt->[2]);
+ };
+
+ # Class not yet set
+ unless ($class_group{$class->[NR]}) {
+ $class_group{$class->[NR]} = join('___', @seq);
+ }
+
+ # There is a gap in the class, potentially an overlap!
+ # TODO: Resolve overlaps!
+ # TODO: Mark gaps!
+ else {
+ $class_group{$class->[NR]} .= '___' . join('___', @seq);
+ };
+ };
+
+ my $string = '';
+ foreach (sort {$a <=> $b} keys %class_group) {
+ $string .= $_ .':' . $class_group{$_} . ';';
+ };
+
+ return $string;
+};
+
+
+# Return group info as hash
+sub to_hash {
+ my ($self, $signature, $doc_freq, $freq) = @_;
+
+ # TODO:
+ # This can't work!
+ # Get dictionary object to convert terms to term id
+ # my $dict = $self->{segment}->dict;
+
+ my %hash = ();
+ while ($signature =~ /\G(\d+):(.+?);/g) {
+
+ # if (DEBUG) {
+ # print_log('g_class', "Build class $1 for signature $2");
+ # };
+
+ $hash{"class_$1"} = [ split('___', $2)];
+ };
+ $hash{freq} = $freq if defined $freq;
+ $hash{doc_freq} = $doc_freq;
+ return \%hash;
+};
+
+
+sub to_string {
+ my $str = 'classes';
+ $str .= $_[0]->{nrs} ? '[' . join(',', @{$_[0]->{nrs}}) . ']' : '';
+ return $str;
+};
+
+1;
diff --git a/lib/Krawfish/Compile/Segment/Group/Fields.pm b/lib/Krawfish/Compile/Segment/Group/Fields.pm
new file mode 100644
index 0000000..75884b2
--- /dev/null
+++ b/lib/Krawfish/Compile/Segment/Group/Fields.pm
@@ -0,0 +1,175 @@
+package Krawfish::Compile::Segment::Group::Fields;
+use parent 'Krawfish::Compile';
+use Krawfish::Koral::Result::Group::Fields;
+use Krawfish::Log;
+use strict;
+use warnings;
+
+use constant DEBUG => 0;
+
+# This will group matches (especially document matches) by field
+# This is useful e.g. for document browsing per corpus.
+#
+# Because the grouping is based on ranking, the sorting will be trivial.
+#
+# TODO:
+# For some mechanisms, it is not necessary to count all occurrences,
+# e.g. to get all keywords used in a certain virtual corpus or all
+# used annotations.
+
+sub new {
+ my $class = shift;
+ my ($field_obj, $query, $fields) = @_;
+ my $self = bless {
+ field_obj => $field_obj,
+ query => $query,
+ field_keys => [map { ref($_) ? $_->term_id : $_ } @$fields],
+ last_doc_id => -1
+ }, $class;
+
+ # Initialize group object
+ $self->{groups} = Krawfish::Koral::Result::Group::Fields->new($self->{field_keys});
+
+ return $self;
+};
+
+
+# Initialize field pointer
+sub _init {
+ return if $_[0]->{field_pointer};
+
+ my $self = shift;
+
+ print_log('g_fields', 'Create pointer on fields') if DEBUG;
+
+ # Load the ranked list - may be too large for memory!
+ $self->{field_pointer} = $self->{field_obj}->pointer;
+};
+
+
+sub to_string {
+ my $self = shift;
+ my $str = 'gFields(' . join(',', map { '#' . $_ } @{$self->{field_keys}}) .
+ ':' . $self->{query}->to_string . ')';
+ return $str;
+};
+
+
+# Shorthand for "search through"
+sub finalize {
+ while ($_[0]->next) {};
+ return $_[0];
+};
+
+
+# Iterate to the next result
+sub next {
+ my $self = shift;
+
+ $self->_init;
+
+ my $groups = $self->{groups};
+ my $pointer = $self->{field_pointer};
+
+ # There is a next match
+ if ($self->{query}->next) {
+
+ # Get the current posting
+ my $current = $self->{query}->current;
+
+ if ($current->doc_id != $self->{last_doc_id}) {
+
+ # Flush old information
+ $groups->flush;
+
+ my $doc_id = $pointer->skip_doc($current->doc_id);
+
+ # There are no fields for this doc
+ next if $doc_id != $current->doc_id;
+
+ # Due to multivalued fields,
+ # a document can yield a permutation of
+ # patterns, so we recognize this
+ my @patterns = ();
+ my @field_keys = @{$self->{field_keys}};
+
+ # Ignore stored fields
+ my @field_objs = grep { $_->type ne 'store' } $pointer->fields(@field_keys);
+
+ my ($key_pos, $val_pos) = (0,0);
+
+ # Iterate through both lists and create a pattern
+ # Pattern may occur because fields can have multiple values
+ while ($key_pos < @field_keys) {
+
+ # There are no more values for the position
+ if (!$field_objs[$val_pos]) {
+ # Add ignorable null term
+ unless (@{$patterns[$key_pos]}) {
+ push @{$patterns[$key_pos]}, 0;
+ };
+ $key_pos++;
+ }
+
+ # Key identifier are matching
+ elsif ($field_keys[$key_pos] == $field_objs[$val_pos]->key_id) {
+
+ # Add key to pattern
+ $patterns[$key_pos] //= [];
+ push @{$patterns[$key_pos]}, $field_objs[$val_pos]->term_id;
+ $val_pos++;
+ }
+
+ # Forward key position
+ elsif ($field_keys[$key_pos] < $field_objs[$val_pos]->key_id) {
+
+ # Add ignorable null term
+ unless (@{$patterns[$key_pos]}) {
+ push @{$patterns[$key_pos]}, 0;
+ };
+ $key_pos++;
+ }
+
+ # $field_keys[$key_pos] > $field_objs[$val_pos]->key_id
+ else {
+
+ # I don't know if this can happen
+ $val_pos++;
+ };
+ };
+
+ # This adds
+ $groups->incr_doc(\@patterns);
+
+ # Set last doc to current doc
+ $self->{last_doc_id} = $current->doc_id;
+ };
+
+ # Add to frequencies
+ $groups->incr_match;
+
+ return 1;
+ };
+
+ # Flush cached results
+ $groups->flush;
+
+ return 0;
+};
+
+
+sub current {
+ return $_[0]->{query}->current;
+};
+
+
+# Get collection
+sub collection {
+ $_[0]->{groups};
+};
+
+
+1;
+
+
+__END__
diff --git a/lib/Krawfish/Compile/Segment/Group/Spans.pm b/lib/Krawfish/Compile/Segment/Group/Spans.pm
new file mode 100644
index 0000000..7085162
--- /dev/null
+++ b/lib/Krawfish/Compile/Segment/Group/Spans.pm
@@ -0,0 +1,59 @@
+package Krawfish::Compile::Segment::Group::Spans;
+use parent 'Krawfish::Compile';
+use Krawfish::Log;
+use strict;
+use warnings;
+
+# This may be generalizable, but for the moment
+# It should make it possible to group the span positions
+# of a query based on a nesting query.
+#
+# The idea is to make the following possible:
+# Search for a term in sentences (like "{1:contains(<s>, {2:'baum'})}") and
+# based on the position and length of 1 and 2,
+# a result like
+#
+# 0: 5
+# 1: 7
+# 100: 2
+#
+# can be returned, where each class 1 is sliced in
+# 100 pieces and for each piece there is a dot, in case
+# class 2 occurs in that slice.
+#
+# By doing that it's easy to visualize the position of expressions
+# in sentences or documents etc.
+#
+# For example to answer questions like 'where in documents does
+# the phrase "Herzlichen Dank" occur?'
+#
+# If the span spans more than 1 slice, the result can be
+#
+# 0_2: 1
+# 0_3: 4
+# 4: 6
+#
+# etc. In case the second class is not nested in the first
+# class, this is not counted at all (as this would result
+# in weird data regarding the slice sizes).
+
+sub new {
+ my $class = shift;
+ my %param = @_;
+ bless {
+ slices => $param{slices} // 100,
+ wrap_clas => $param{wrap_class} // 1,
+ embedded_class => $param{embedded_class} // 2
+ }, $class;
+};
+
+# Get the group signature for each match
+# May well be renamed to get_signature
+sub get_group {
+ my $self = shift;
+ my $slice_start = 0;
+ my $slice_end = 0;
+ return $slice_start . '_' . $slice_end;
+};
+
+1;
diff --git a/lib/Krawfish/Compile/Segment/Group/TermExistence.pm b/lib/Krawfish/Compile/Segment/Group/TermExistence.pm
new file mode 100644
index 0000000..009c6ed
--- /dev/null
+++ b/lib/Krawfish/Compile/Segment/Group/TermExistence.pm
@@ -0,0 +1,159 @@
+package Krawfish::Compile::Segment::Group::TermExistence;
+use parent 'Krawfish::Compile';
+use strict;
+use warnings;
+
+# The query works similar to Or-query, but only accepts term ids.
+
+sub new {
+ my $class = shift;
+ bless {
+ term_id => shift, # Term Query
+ term_ids => shift, # Optional TermExistence-Query
+ filter => undef,
+ existence => []
+ }, $class;
+};
+
+sub _init {
+ ...
+};
+
+
+# TODO:
+# Think about when next() is called, as it needs to be called on term_ids as well ...
+# Mabe this should be done in _init as a while query somehow.
+sub next {
+ my $self = shift;
+
+ # Get the current document in the VC
+ my $filter = $self->{filter};
+ my $doc_id = $filter->doc_id;
+
+ # The next document to look for in the VC
+ my $next_doc_id;
+
+
+ # Check the single term_id for existence
+
+ # The simple term does not exist
+ my $term = $self->{term_id};
+ if (!$term) {
+ # Do nothing
+ }
+
+ # Should never happen
+ elsif (!$term->current) {
+ $self->{term_id} = undef;
+ }
+
+ # Term exists and can be checked
+ else {
+
+ # Is the VC document beyond the current document id
+ if ($doc_id > $term->doc_id) {
+
+ # Move the term document to the VC document
+ $term->skip_doc($doc_id);
+ };
+
+ # Are both terms in the same document?
+ if ($term->doc_id == $doc_id) {
+
+ # Add this term to existence
+ $self->exists($term->term_id);
+
+ # Close posting
+ $term->close;
+
+ # Do not check any further
+ $self->{term_id} = undef;
+ }
+
+ # Current term document is beyond current VC doc
+ else {
+ $next_doc_id = $term->doc_id;
+ };
+ };
+
+
+ # Check the complex term_ids for existence
+
+ my $terms = $self->{term_ids};
+
+ if (!$terms) {
+ # Do nothing
+ }
+
+ # Should never happen
+ elsif (!$terms->current) {
+ $self->{term_ids} = undef;
+ }
+
+ else {
+
+ # When there is a complex query, move on
+ if ($doc_id > $terms->doc_id) {
+ $terms->skip_doc($doc_id);
+ };
+
+ # There are no further matches
+ unless ($terms->current) {
+
+ # Merge existence values
+ $self->exists($terms->existence);
+ $terms->close;
+ $self->{term_ids} = undef;
+ }
+
+ # Current terms are beyond current VC doc
+ else {
+
+ # Remember the next relevant document id
+ if (!$next_doc_id || $next_doc_id > $term->doc_id) {
+ $next_doc_id = $term->doc_id;
+ };
+ };
+ };
+
+ # There is a next document id defined - move on
+ if (defined $next_doc_id) {
+
+ # Move the VC stream to the next relevant position
+ if ($filter->skip_doc($next_doc_id)) {
+
+ # It's fine
+ return 1;
+ };
+ };
+
+ return 0;
+};
+
+
+# Add term ids to existence list
+sub exists {
+ my ($self, $term_id) = @_;
+
+ if (ref $term_id) {
+ push @{$terms->existence}, @$term_id;
+ }
+ else {
+ push @{$terms->existence}, $term_id;
+ };
+};
+
+
+# Return list of existing term ids
+sub existence {
+ return $self->{existence}
+};
+
+
+sub filter_by {
+ ...
+ # It is relevant to filter The query - but one filter may be enough
+};
+
+
+1;