Rename Meta to Compile Change-Id: Ief55855e2642a64672a840ece9d9fd876a49d373

commit: 5cf5fca327157df9d7c3aa74cc651e52b630d4db [log] [tgz]
author: Akron <nils@diewald-online.de> Mon Oct 09 19:01:47 2017 +0200
committer: Akron <nils@diewald-online.de> Mon Oct 09 19:01:47 2017 +0200
tree: 5f23c44dba64aeee9fe659f91393663c58d27d4a
parent: a3e70a02ac3490431340042e7915ee697ec188f2 [diff]
diff --git a/lib/Krawfish/Compile/Segment/Group/AnnotationClasses.pm b/lib/Krawfish/Compile/Segment/Group/AnnotationClasses.pm
new file mode 100644
index 0000000..bba8b9e
--- /dev/null
+++ b/lib/Krawfish/Compile/Segment/Group/AnnotationClasses.pm

@@ -0,0 +1,11 @@
+# This should make it possible to search for classes
+# and group based on the annotations at the certain range.
+# This, however, is probably quite tricky as
+# there is no simple position based forward index with
+# term_ids for annotations, meaning that this
+# has to check the annotations in the complete forward index,
+# probably making this unusable slow.
+# but who knows ...
+
+# A query like
+# group_by_annotation_classes(1,"opennlp","p","Der {1:[]} Mann")

diff --git a/lib/Krawfish/Compile/Segment/Group/Characters.pm b/lib/Krawfish/Compile/Segment/Group/Characters.pm
new file mode 100644
index 0000000..e5b1da7
--- /dev/null
+++ b/lib/Krawfish/Compile/Segment/Group/Characters.pm

@@ -0,0 +1,66 @@
+package Krawfish::Compile::Segment::Group::Character;
+use Krawfish::Log;
+use strict;
+use warnings;
+
+
+# This groups on prefix or suffixes of subterms.
+# Necessary to support "Ansicht nach Wortendungen" for example.
+# It's possible to first group on terms and then - per term,
+# request the term surface in the dictionary and group by
+# the result.
+
+
+use constant DEBUG => 0;
+
+sub new {
+  my $class = shift;
+  bless {
+    segments   => shift, # Krawfish::Index::Segments object
+    # TODO: May as well be a subtoken object
+    from_start => shift,  # boolean - otherwise from end
+    char_count => shift
+    nrs => [@_]
+  }, $class;
+};
+
+
+sub get_group {
+  my ($self, $match) = @_;
+
+  # Get all classes from the match
+  my @classes = $match->get_classes($self->{nrs});
+
+  my $segments = $self->{segments};
+
+  my %group;
+
+  # Classes have nr, start, end
+  foreach my $class (sort { $a->start <=> $b->start } @classes) {
+
+    if ($self->{from_start}) {
+
+      # This will retrieve the segment from the segments stream
+      my $segment = $stream->get($match->doc_id, $class->start);
+
+      if ($segment->)
+
+        # The character count can be satisfied by the
+      my $first_chars = $segment->first_chars;
+
+      if (length($first_chars) <= $self->{char_count} {
+        substr($first_chars);
+      }
+      
+      # Check, if the segment only spans one segment
+      if ($class->end != $class->start+1) {
+        
+      };
+    }
+    else {
+      ...
+    };
+  };
+};
+
+1;

diff --git a/lib/Krawfish/Compile/Segment/Group/ClassFrequencies.pm b/lib/Krawfish/Compile/Segment/Group/ClassFrequencies.pm
new file mode 100644
index 0000000..529267f
--- /dev/null
+++ b/lib/Krawfish/Compile/Segment/Group/ClassFrequencies.pm

@@ -0,0 +1,215 @@
+package Krawfish::Compile::Segment::Group::ClassFrequencies;
+use parent 'Krawfish::Compile';
+use Krawfish::Koral::Result::Group::ClassFrequencies;
+use Krawfish::Log;
+use strict;
+use warnings;
+
+use constant DEBUG => 0;
+
+# Aggregate by content information, for example,
+# based on a certain class
+#
+# TODO:
+#   Currently this only works for surface term_ids, but it may very well collect
+#   arbitrary annotations! In that case, multiple annotations and different
+#   annotation lengths have to be taken into account.
+#
+# TODO:
+#   The special case of class 0 needs to be treated.
+#
+# TODO:
+#   Support virtual corpus classes
+
+sub new {
+  my $class = shift;
+  my $self = bless {
+    forward_obj => shift,
+    query       => shift,
+    classes     => shift,
+    class_freq  => {},
+    term_cache  => {},
+    last_doc_id => -1
+  }, $class;
+
+  $self->{groups} = Krawfish::Koral::Result::Group::ClassFrequencies->new(
+    $self->{classes}
+  );
+
+  return $self;
+};
+
+
+# Initialize forward counter
+sub _init {
+  return if $_[0]->{forward_pointer};
+
+  my $self = shift;
+
+  print_log('g_class_freq', 'Create forward pointer') if DEBUG;
+
+  # Load the ranked list - may be too large for memory!
+  $self->{forward_pointer} = $self->{forward_obj}->pointer;
+};
+
+
+# Shorthand for "search through"
+sub finalize {
+  while ($_[0]->next) {};
+  return $_[0];
+};
+
+
+# Move to next match
+sub next {
+  my $self = shift;
+
+  $self->_init;
+
+  # No more matches
+  return unless $self->{query}->next;
+
+  # Get the current posting
+  my $current = $self->{query}->current;
+
+  my $groups = $self->{groups};
+  my $pointer = $self->{forward_pointer};
+
+  # Get the current doc_id and move to it
+  my $doc_id = $current->doc_id;
+
+  # Current doc_id differ - move forward
+  if ($doc_id != $self->{last_doc_id}) {
+
+    # Skip to doc
+    if ($pointer->skip_doc($doc_id) != $doc_id) {
+
+      # This should never happen, as for all docs there is a
+      # forward index!
+      return;
+    };
+
+    if (DEBUG) {
+      print_log('g_class_freq', "Moved forward index to $doc_id");
+    };
+
+    # Remember the last document
+    $self->{last_doc_id} = $doc_id;
+  };
+
+  # Remember terms (for overlap)
+  # with the structure pos -> term_id
+  # my $cache = $self->{term_cache};
+
+  # Collect classes - have the structure
+  # $classes[1] = [...term_id,0,0,term_id,...]
+  my @classes = ();
+  foreach (@{$self->{classes}}) {
+    $classes[$_] = []; # TODO: Length is normally the length of the match
+  };
+  my @term_cache = ();
+
+  # Get offset to simplify classes
+  my $offset = $current->start;
+
+  # TODO:
+  #   There may be overlaps within following
+  #   matches, so under certain circumstances the pointer
+  #   needs to move backwards.
+  #   It may be enough to do that at the beginning here.
+  #   Or instead of skip_pos, the forward pointer can
+  #   reposition automatically.
+
+  # Get class payloads
+  my @class_infos = $current->get_classes($self->{classes});
+
+  # Retrieve the requested classes for the current posting
+  foreach my $class_info (@class_infos) {
+
+    # Check class info
+    my ($nr, $start, $end) = @{$class_info};
+
+    # Iterate over the class and collect term ids
+    for (my $i = $start; $i < $end; $i++) {
+
+      # The relative position of the class in the match
+      my $rel_pos = $i - $offset;
+
+      # Check if term already retrieved
+      if ($term_cache[$rel_pos]) {
+
+        # Copy to class
+        $classes[$nr]->[$rel_pos] = $term_cache[$rel_pos];
+      }
+
+      # TODO:
+      #   Check for term cache
+      #   if ($term_cache->{$start})
+
+      # Retrieve term id
+      else {
+
+        # Collect all terms from start to end
+        if ($pointer->skip_pos($i)) {
+
+          # Add term id to class in correct order
+          my $term_id = $pointer->current->term_id;
+
+          if (DEBUG) {
+            print_log(
+              'g_class_freq',
+              "Term id at position $i is #" . $term_id);
+          };
+
+
+          # Set term_id at relative position in term_cache
+          $term_cache[$rel_pos] = $classes[$nr]->[$rel_pos] = $term_id;
+        };
+      }
+    };
+  };
+
+  if (DEBUG) {
+    print_log('g_class_freq', 'term cache is ' . join(',', @term_cache));
+  };
+
+  # The signature has the structure [class, term_id*]+
+  my @sig = ();
+
+  # Iterate over all classes
+  foreach my $nr (@{$self->{classes}}) {
+    push @sig, $nr;
+
+    foreach (@{$classes[$nr]}) {
+      push @sig, $_ if defined $_; # Add all set term_ids
+    };
+
+    push @sig, 0;
+  };
+
+  # Increment per match
+  $self->{groups}->incr_match(\@sig);
+
+  return 1;
+};
+
+
+sub current {
+  return $_[0]->{query}->current;
+};
+
+
+# Get collection
+sub collection {
+  $_[0]->{groups};
+};
+
+
+sub to_string {
+  my $self = shift;
+  my $str = 'gClassFreq(' . join(',', @{$self->{classes}}) .
+    ':' . $self->{query}->to_string . ')';
+  return $str;
+};
+
+1;

diff --git a/lib/Krawfish/Compile/Segment/Group/Classes.pm b/lib/Krawfish/Compile/Segment/Group/Classes.pm
new file mode 100644
index 0000000..af286ed
--- /dev/null
+++ b/lib/Krawfish/Compile/Segment/Group/Classes.pm

@@ -0,0 +1,121 @@
+package Krawfish::Compile::Segment::Group::Classes;
+use Krawfish::Log;
+use strict;
+use warnings;
+
+# TODO:
+#   The name is somehow misleading, as this will only
+#   group by surface terms.
+
+# TODO:
+#   Refer to Krawfish::Compile::Segment::TermIDs!
+
+use constant {
+  DEBUG => 0,
+    NR => 0,
+    START_POS => 1,
+    END_POS => 2,
+};
+
+
+sub new {
+  my $class = shift;
+  bless {
+    segment => shift,
+    nrs => @_ ? [sort @_] : undef,
+    groups => {},
+  }, $class;
+};
+
+
+# Get the group signature for each match
+sub get_group {
+  my ($self, $match) = @_;
+
+  # Get all classes from the match
+  # Classes need to be sorted by start position
+  # to be retrievable, in case the subtokens-Stream
+  # is implemented as a postingslist (probably not)
+  my @classes = $match->get_classes_sorted($self->{nrs});
+
+  my $subtokens = $self->{segment}->subtokens;
+
+  my %class_group;
+
+  # Classes have nr, start, end
+  foreach my $class (@classes) {
+
+    # WARNING! CLASSES MAY OVERLAP SO SUBTOKENS SHOULD BE CACHED OR BUFFERED!
+
+    # Get start position
+    my $start = $class->[START_POS];
+
+    my @seq = ();
+
+    # Receive subtoken
+    my $subt = $subtokens->get($match->doc_id, $start);
+
+    # Push term id to subtoken
+    # TODO: A subtoken should have accessors
+    push (@seq, $subt->[2]);
+
+    while ($start < ($class->[END_POS] -1)) {
+      $subt = $subtokens->get($match->doc_id, ++$start);
+
+      # Push subterm id to subtoken
+      push (@seq, $subt->[2]);
+    };
+
+    # Class not yet set
+    unless ($class_group{$class->[NR]}) {
+      $class_group{$class->[NR]} = join('___', @seq);
+    }
+
+    # There is a gap in the class, potentially an overlap!
+    # TODO: Resolve overlaps!
+    # TODO: Mark gaps!
+    else {
+      $class_group{$class->[NR]} .= '___' . join('___', @seq);
+    };
+  };
+
+  my $string = '';
+  foreach (sort {$a <=> $b} keys %class_group) {
+    $string .= $_ .':' . $class_group{$_} . ';';
+  };
+
+  return $string;
+};
+
+
+# Return group info as hash
+sub to_hash {
+  my ($self, $signature, $doc_freq, $freq) = @_;
+
+  # TODO:
+  #   This can't work!
+  # Get dictionary object to convert terms to term id
+  # my $dict = $self->{segment}->dict;
+
+  my %hash = ();
+  while ($signature =~ /\G(\d+):(.+?);/g) {
+
+  #  if (DEBUG) {
+  #    print_log('g_class', "Build class $1 for signature $2");
+  #  };
+
+    $hash{"class_$1"} = [ split('___', $2)];
+  };
+  $hash{freq} = $freq if defined $freq;
+  $hash{doc_freq} = $doc_freq;
+  return \%hash;
+};
+
+
+sub to_string {
+  my $str = 'classes';
+  $str .= $_[0]->{nrs} ? '[' . join(',', @{$_[0]->{nrs}}) . ']' : '';
+  return $str;
+};
+
+1;

diff --git a/lib/Krawfish/Compile/Segment/Group/Fields.pm b/lib/Krawfish/Compile/Segment/Group/Fields.pm
new file mode 100644
index 0000000..75884b2
--- /dev/null
+++ b/lib/Krawfish/Compile/Segment/Group/Fields.pm

@@ -0,0 +1,175 @@
+package Krawfish::Compile::Segment::Group::Fields;
+use parent 'Krawfish::Compile';
+use Krawfish::Koral::Result::Group::Fields;
+use Krawfish::Log;
+use strict;
+use warnings;
+
+use constant DEBUG => 0;
+
+# This will group matches (especially document matches) by field
+# This is useful e.g. for document browsing per corpus.
+#
+# Because the grouping is based on ranking, the sorting will be trivial.
+#
+# TODO:
+#   For some mechanisms, it is not necessary to count all occurrences,
+#   e.g. to get all keywords used in a certain virtual corpus or all
+#   used annotations.
+
+sub new {
+  my $class = shift;
+  my ($field_obj, $query, $fields) = @_;
+  my $self = bless {
+    field_obj  => $field_obj,
+    query      => $query,
+    field_keys => [map { ref($_) ? $_->term_id : $_ } @$fields],
+    last_doc_id => -1
+  }, $class;
+
+  # Initialize group object
+  $self->{groups} = Krawfish::Koral::Result::Group::Fields->new($self->{field_keys});
+
+  return $self;
+};
+
+
+# Initialize field pointer
+sub _init {
+  return if $_[0]->{field_pointer};
+
+  my $self = shift;
+
+  print_log('g_fields', 'Create pointer on fields') if DEBUG;
+
+  # Load the ranked list - may be too large for memory!
+  $self->{field_pointer} = $self->{field_obj}->pointer;
+};
+
+
+sub to_string {
+  my $self = shift;
+  my $str = 'gFields(' . join(',', map { '#' . $_ } @{$self->{field_keys}}) .
+    ':' . $self->{query}->to_string . ')';
+  return $str;
+};
+
+
+# Shorthand for "search through"
+sub finalize {
+  while ($_[0]->next) {};
+  return $_[0];
+};
+
+
+# Iterate to the next result
+sub next {
+  my $self = shift;
+
+  $self->_init;
+
+  my $groups = $self->{groups};
+  my $pointer = $self->{field_pointer};
+
+  # There is a next match
+  if ($self->{query}->next) {
+
+    # Get the current posting
+    my $current = $self->{query}->current;
+
+    if ($current->doc_id != $self->{last_doc_id}) {
+
+      # Flush old information
+      $groups->flush;
+
+      my $doc_id = $pointer->skip_doc($current->doc_id);
+
+      # There are no fields for this doc
+      next if $doc_id != $current->doc_id;
+
+      # Due to multivalued fields,
+      # a document can yield a permutation of
+      # patterns, so we recognize this
+      my @patterns = ();
+      my @field_keys = @{$self->{field_keys}};
+
+      # Ignore stored fields
+      my @field_objs = grep { $_->type ne 'store' } $pointer->fields(@field_keys);
+
+      my ($key_pos, $val_pos) = (0,0);
+
+      # Iterate through both lists and create a pattern
+      # Pattern may occur because fields can have multiple values
+      while ($key_pos < @field_keys) {
+
+        # There are no more values for the position
+        if (!$field_objs[$val_pos]) {
+          # Add ignorable null term
+          unless (@{$patterns[$key_pos]}) {
+            push @{$patterns[$key_pos]}, 0;
+          };
+          $key_pos++;
+        }
+
+        # Key identifier are matching
+        elsif ($field_keys[$key_pos] == $field_objs[$val_pos]->key_id) {
+
+          # Add key to pattern
+          $patterns[$key_pos] //= [];
+          push @{$patterns[$key_pos]}, $field_objs[$val_pos]->term_id;
+          $val_pos++;
+        }
+
+        # Forward key position
+        elsif ($field_keys[$key_pos] < $field_objs[$val_pos]->key_id) {
+
+          # Add ignorable null term
+          unless (@{$patterns[$key_pos]}) {
+            push @{$patterns[$key_pos]}, 0;
+          };
+          $key_pos++;
+        }
+
+        # $field_keys[$key_pos] > $field_objs[$val_pos]->key_id
+        else {
+
+          # I don't know if this can happen
+          $val_pos++;
+        };
+      };
+
+      # This adds
+      $groups->incr_doc(\@patterns);
+
+      # Set last doc to current doc
+      $self->{last_doc_id} = $current->doc_id;
+    };
+
+    # Add to frequencies
+    $groups->incr_match;
+
+    return 1;
+  };
+
+  # Flush cached results
+  $groups->flush;
+
+  return 0;
+};
+
+
+sub current {
+  return $_[0]->{query}->current;
+};
+
+
+# Get collection
+sub collection {
+  $_[0]->{groups};
+};
+
+
+1;
+
+
+__END__

diff --git a/lib/Krawfish/Compile/Segment/Group/Spans.pm b/lib/Krawfish/Compile/Segment/Group/Spans.pm
new file mode 100644
index 0000000..7085162
--- /dev/null
+++ b/lib/Krawfish/Compile/Segment/Group/Spans.pm

@@ -0,0 +1,59 @@
+package Krawfish::Compile::Segment::Group::Spans;
+use parent 'Krawfish::Compile';
+use Krawfish::Log;
+use strict;
+use warnings;
+
+# This may be generalizable, but for the moment
+# It should make it possible to group the span positions
+# of a query based on a nesting query.
+#
+# The idea is to make the following possible:
+# Search for a term in sentences (like "{1:contains(<s>, {2:'baum'})}") and
+# based on the position and length of 1 and 2,
+# a result like
+#
+#     0: 5
+#     1: 7
+#   100: 2
+#
+# can be returned, where each class 1 is sliced in
+# 100 pieces and for each piece there is a dot, in case
+# class 2 occurs in that slice.
+#
+# By doing that it's easy to visualize the position of expressions
+# in sentences or documents etc.
+#
+# For example to answer questions like 'where in documents does
+# the phrase "Herzlichen Dank" occur?'
+#
+# If the span spans more than 1 slice, the result can be
+#
+#   0_2: 1
+#   0_3: 4
+#   4: 6
+#
+# etc. In case the second class is not nested in the first
+# class, this is not counted at all (as this would result
+# in weird data regarding the slice sizes).
+
+sub new {
+  my $class = shift;
+  my %param = @_;
+  bless {
+    slices => $param{slices} // 100,
+    wrap_clas => $param{wrap_class} // 1,
+    embedded_class => $param{embedded_class} // 2
+  }, $class;
+};
+
+# Get the group signature for each match
+# May well be renamed to get_signature
+sub get_group {
+  my $self = shift;
+  my $slice_start = 0;
+  my $slice_end = 0;
+  return $slice_start . '_' . $slice_end;
+};
+
+1;

diff --git a/lib/Krawfish/Compile/Segment/Group/TermExistence.pm b/lib/Krawfish/Compile/Segment/Group/TermExistence.pm
new file mode 100644
index 0000000..009c6ed
--- /dev/null
+++ b/lib/Krawfish/Compile/Segment/Group/TermExistence.pm

@@ -0,0 +1,159 @@
+package Krawfish::Compile::Segment::Group::TermExistence;
+use parent 'Krawfish::Compile';
+use strict;
+use warnings;
+
+# The query works similar to Or-query, but only accepts term ids.
+
+sub new {
+  my $class = shift;
+  bless {
+    term_id   => shift,  # Term Query
+    term_ids  => shift,  # Optional TermExistence-Query
+    filter    => undef,
+    existence => []
+  }, $class;
+};
+
+sub _init {
+  ...
+};
+
+
+# TODO:
+#   Think about when next() is called, as it needs to be called on term_ids as well ...
+#   Mabe this should be done in _init as a while query somehow.
+sub next {
+  my $self = shift;
+
+  # Get the current document in the VC
+  my $filter = $self->{filter};
+  my $doc_id = $filter->doc_id;
+
+  # The next document to look for in the VC
+  my $next_doc_id;
+
+
+  # Check the single term_id for existence
+
+  # The simple term does not exist
+  my $term = $self->{term_id};
+  if (!$term) {
+    # Do nothing
+  }
+
+  # Should never happen
+  elsif (!$term->current) {
+    $self->{term_id} = undef;
+  }
+
+  # Term exists and can be checked
+  else {
+
+    # Is the VC document beyond the current document id
+    if ($doc_id > $term->doc_id) {
+
+      # Move the term document to the VC document
+      $term->skip_doc($doc_id);
+    };
+
+    # Are both terms in the same document?
+    if ($term->doc_id == $doc_id) {
+
+      # Add this term to existence
+      $self->exists($term->term_id);
+
+      # Close posting
+      $term->close;
+
+      # Do not check any further
+      $self->{term_id} = undef;
+    }
+
+    # Current term document is beyond current VC doc
+    else {
+      $next_doc_id = $term->doc_id;
+    };
+  };
+
+
+  # Check the complex term_ids for existence
+
+  my $terms = $self->{term_ids};
+
+  if (!$terms) {
+    # Do nothing
+  }
+
+  # Should never happen
+  elsif (!$terms->current) {
+    $self->{term_ids} = undef;
+  }
+
+  else {
+
+    # When there is a complex query, move on
+    if ($doc_id > $terms->doc_id) {
+      $terms->skip_doc($doc_id);
+    };
+
+    # There are no further matches
+    unless ($terms->current) {
+
+      # Merge existence values
+      $self->exists($terms->existence);
+      $terms->close;
+      $self->{term_ids} = undef;
+    }
+
+    # Current terms are beyond current VC doc
+    else {
+
+      # Remember the next relevant document id
+      if (!$next_doc_id || $next_doc_id > $term->doc_id) {
+        $next_doc_id = $term->doc_id;
+      };
+    };
+  };
+
+  # There is a next document id defined - move on
+  if (defined $next_doc_id) {
+
+    # Move the VC stream to the next relevant position
+    if ($filter->skip_doc($next_doc_id)) {
+
+      # It's fine
+      return 1;
+    };
+  };
+
+  return 0;
+};
+
+
+# Add term ids to existence list
+sub exists {
+  my ($self, $term_id) = @_;
+
+  if (ref $term_id) {
+    push @{$terms->existence}, @$term_id;
+  }
+  else {
+    push @{$terms->existence}, $term_id;
+  };
+};
+
+
+# Return list of existing term ids
+sub existence {
+  return $self->{existence}
+};
+
+
+sub filter_by {
+  ...
+    # It is relevant to filter The query - but one filter may be enough
+};
+
+
+1;
commit	5cf5fca327157df9d7c3aa74cc651e52b630d4db	[log] [tgz]
author	Akron <nils@diewald-online.de>	Mon Oct 09 19:01:47 2017 +0200
committer	Akron <nils@diewald-online.de>	Mon Oct 09 19:01:47 2017 +0200
tree	5f23c44dba64aeee9fe659f91393663c58d27d4a
parent	a3e70a02ac3490431340042e7915ee697ec188f2 [diff]