Cleanup and additions regarding requirements of COSMAS II Change-Id: Id55b9861838aed2978f095d18d83284b4ab1af08

commit: 31e088b2db9300f4132e02bd4a774a004199a3d5 [log] [tgz]
author: Akron <nils@diewald-online.de> Fri Sep 29 14:48:49 2017 +0200
committer: Akron <nils@diewald-online.de> Fri Sep 29 14:48:49 2017 +0200
tree: 8cf94a18ea590815aa49ea797c6492c1e82bdb90
parent: bc287e5fc3c59ee42a61df60246bb9606b6faf83 [diff]
diff --git a/lib/Krawfish/Meta/Segment/Aggregate/Length.pm b/lib/Krawfish/Meta/Segment/Aggregate/Length.pm
index 5b7fb6d..ef92dec 100644
--- a/lib/Krawfish/Meta/Segment/Aggregate/Length.pm
+++ b/lib/Krawfish/Meta/Segment/Aggregate/Length.pm

@@ -4,7 +4,7 @@
 use strict;
 use warnings;
 
-# This will check the segments length -
+# This will check the hits length in subtokens -
 # currently other word lengths are not supported
 
 # See https://en.wikipedia.org/wiki/Selection_algorithm

diff --git a/lib/Krawfish/Meta/Segment/Aggregate/TermExistence.pm b/lib/Krawfish/Meta/Segment/Aggregate/TermExistence.pm
index f1e8594..fe9ed04 100644
--- a/lib/Krawfish/Meta/Segment/Aggregate/TermExistence.pm
+++ b/lib/Krawfish/Meta/Segment/Aggregate/TermExistence.pm

@@ -3,4 +3,6 @@
 # This probably requires a single list of term queries, that can be
 # closed, once a match occurs.
 
+# Probably better suited in Group
+
 __END__

diff --git a/lib/Krawfish/Meta/Segment/Aggregate/Values.pm b/lib/Krawfish/Meta/Segment/Aggregate/Values.pm
index 0f60e6c..9523887 100644
--- a/lib/Krawfish/Meta/Segment/Aggregate/Values.pm
+++ b/lib/Krawfish/Meta/Segment/Aggregate/Values.pm

@@ -11,6 +11,10 @@
 # TODO:
 #   Support corpus classes
 
+# TODO:
+#   This is rather a group query or better:
+#   An aggregation on groups!
+
 use constant {
   DEBUG          => 1
 };

diff --git a/lib/Krawfish/Meta/Segment/Enrich/Terms.pm b/lib/Krawfish/Meta/Segment/Enrich/Terms.pm
index 527bb16..0076566 100644
--- a/lib/Krawfish/Meta/Segment/Enrich/Terms.pm
+++ b/lib/Krawfish/Meta/Segment/Enrich/Terms.pm

@@ -6,7 +6,7 @@
 use warnings;
 
 # TODO:
-#   Potentially rename to ::Terms!
+#   Potentially rename to ::Terms! or ::Classes!
 
 # Enrich each match with all term ids for a specific region and
 # for a specific class

diff --git a/lib/Krawfish/Meta/Segment/EnrichGroup/Values.pm b/lib/Krawfish/Meta/Segment/EnrichGroup/Values.pm
new file mode 100644
index 0000000..f94461c
--- /dev/null
+++ b/lib/Krawfish/Meta/Segment/EnrichGroup/Values.pm

@@ -0,0 +1,4 @@
+# Add per group values from fields,
+# like in a group on documents add the min and max values
+# of a field, e.g. the date span, or the total number
+# of sentences in a corpus.

diff --git a/lib/Krawfish/Meta/Segment/Group/AnnotationClasses.pm b/lib/Krawfish/Meta/Segment/Group/AnnotationClasses.pm
new file mode 100644
index 0000000..bba8b9e
--- /dev/null
+++ b/lib/Krawfish/Meta/Segment/Group/AnnotationClasses.pm

@@ -0,0 +1,11 @@
+# This should make it possible to search for classes
+# and group based on the annotations at a certain range.
+# This, however, is probably quite tricky as
+# there is no simple position based forward index with
+# term_ids for annotations, meaning that this
+# has to check the annotations in the complete forward index,
+# probably making this unusably slow.
+# but who knows ...
+
+# A query like
+# group_by_annotation_classes(1,"opennlp","p","Der {1:[]} Mann")

diff --git a/lib/Krawfish/Meta/Segment/Group/Characters.pm b/lib/Krawfish/Meta/Segment/Group/Characters.pm
new file mode 100644
index 0000000..0594f17
--- /dev/null
+++ b/lib/Krawfish/Meta/Segment/Group/Characters.pm

@@ -0,0 +1,66 @@
+# The package name has to match the file path
+# (lib/Krawfish/Meta/Segment/Group/Characters.pm), otherwise
+# loading the module by its file-derived name will not
+# provide the class.
+package Krawfish::Meta::Segment::Group::Characters;
+use Krawfish::Log;
+use strict;
+use warnings;
+
+
+# This groups on prefixes or suffixes of subterms.
+# Necessary to support "Ansicht nach Wortendungen" for example.
+# It's possible to first group on terms and then - per term,
+# request the term surface in the dictionary and group by
+# the result.
+
+
+use constant DEBUG => 0;
+
+# Construct a new character grouping object.
+#
+# Positional parameters:
+#   segments   - Krawfish::Index::Segments object
+#                (TODO: May as well be a subtoken object)
+#   from_start - boolean: take characters from the start,
+#                otherwise from the end
+#   char_count - number of characters to group on
+#   nrs        - all remaining arguments, stored as an array reference
+#                and later passed to get_classes() of the match
+sub new {
+  my ($class, $segments, $from_start, $char_count, @nrs) = @_;
+  return bless {
+    segments   => $segments,
+    from_start => $from_start,
+    char_count => $char_count,
+    nrs        => [@nrs]
+  }, $class;
+};
+
+
+# Get the group for a match (work in progress).
+#
+# Parameters:
+#   $match - match object; has to provide doc_id() and get_classes()
+#
+# For each requested class (ordered by start position) the segment
+# at the start position of the class is fetched and its leading
+# characters are cut down to char_count characters.
+#
+# TODO:
+#   The characters are not yet collected in %group and nothing
+#   meaningful is returned. Grouping from the end
+#   (from_start is false) is not implemented.
+sub get_group {
+  my ($self, $match) = @_;
+
+  # Get all classes from the match
+  my @classes = $match->get_classes($self->{nrs});
+
+  my $segments   = $self->{segments};
+  my $char_count = $self->{char_count};
+
+  my %group;
+
+  # Classes have nr, start, end
+  foreach my $class (sort { $a->start <=> $b->start } @classes) {
+
+    if ($self->{from_start}) {
+
+      # This will retrieve the segment from the segments stream
+      my $segment = $segments->get($match->doc_id, $class->start);
+
+      # NOTE(review): The draft had an unfinished check on $segment
+      # at this point - skipping missing segments is an assumption.
+      next unless $segment;
+
+      my $first_chars = $segment->first_chars;
+
+      # The character count can be satisfied by the first characters.
+      # NOTE(review): The draft compared with '<=', which would make
+      # the truncation a no-op - confirm the intended condition.
+      if (length($first_chars) >= $char_count) {
+        $first_chars = substr($first_chars, 0, $char_count);
+      };
+
+      # Check, if the class only spans one segment
+      if ($class->end != $class->start + 1) {
+        # TODO: Classes spanning multiple segments are not handled yet
+      };
+    }
+    else {
+      ...
+    };
+  };
+};
+
+1;

diff --git a/lib/Krawfish/Meta/Segment/Group/Spans.pm b/lib/Krawfish/Meta/Segment/Group/Spans.pm
new file mode 100644
index 0000000..debaa69
--- /dev/null
+++ b/lib/Krawfish/Meta/Segment/Group/Spans.pm

@@ -0,0 +1,59 @@
+package Krawfish::Meta::Segment::Group::Spans;
+use parent 'Krawfish::Meta';
+use Krawfish::Log;
+use strict;
+use warnings;
+
+# This may be generalizable, but for the moment
+# It should make it possible to group the span positions
+# of a query based on a nesting query.
+#
+# The idea is to make the following possible:
+# Search for a term in sentences (like "{1:contains(<s>, {2:'baum'})}") and
+# based on the position and length of 1 and 2,
+# a result like
+#
+#     0: 5
+#     1: 7
+#   100: 2
+#
+# can be returned, where each class 1 is sliced in
+# 100 pieces and for each piece there is a dot, in case
+# class 2 occurs in that slice.
+#
+# By doing that it's easy to visualize the position of expressions
+# in sentences or documents etc.
+#
+# For example to answer questions like 'where in documents does
+# the phrase "Herzlichen Dank" occur?'
+#
+# If the span spans more than 1 slice, the result can be
+#
+#   0_2: 1
+#   0_3: 4
+#   4: 6
+#
+# etc. In case the second class is not nested in the first
+# class, this is not counted at all (as this would result
+# in weird data regarding the slice sizes).
+
+# Construct a new span grouping object.
+#
+# Named parameters:
+#   slices         - number of slices (defaults to 100)
+#   wrap_class     - number of the wrapping class (defaults to 1)
+#   embedded_class - number of the embedded class (defaults to 2)
+sub new {
+  my ($class, %param) = @_;
+  return bless {
+    slices         => $param{slices}         // 100,
+    wrap_class     => $param{wrap_class}     // 1,
+    embedded_class => $param{embedded_class} // 2
+  }, $class;
+};
+
+# Get the group signature for each match
+# May well be renamed to get_signature
+# Build the group signature of a match: slice start and slice end,
+# joined by an underscore. Both positions are currently fixed to 0.
+sub get_group {
+  my ($self) = @_;
+  my ($slice_start, $slice_end) = (0, 0);
+  return join '_', $slice_start, $slice_end;
+};
+
+1;

diff --git a/lib/Krawfish/Meta/Segment/Group/TermExistence.pm b/lib/Krawfish/Meta/Segment/Group/TermExistence.pm
new file mode 100644
index 0000000..3570166
--- /dev/null
+++ b/lib/Krawfish/Meta/Segment/Group/TermExistence.pm

@@ -0,0 +1,159 @@
+package Krawfish::Meta::Segment::Group::TermExistence;
+use parent 'Krawfish::Meta';
+use strict;
+use warnings;
+
+# The query works similar to Or-query, but only accepts term ids.
+
+# Construct a new term existence query.
+#
+# Positional parameters:
+#   term_id  - Term Query
+#   term_ids - Optional TermExistence-Query
+#
+# The filter starts undefined and the existence list starts empty.
+sub new {
+  my ($class, $term_id, $term_ids) = @_;
+
+  my %self = (
+    term_id   => $term_id,
+    term_ids  => $term_ids,
+    filter    => undef,
+    existence => [],
+  );
+
+  return bless \%self, $class;
+};
+
+# Initialization hook - not implemented yet.
+# The yada-yada statement below throws an "Unimplemented"
+# exception as soon as this method is called.
+sub _init {
+  ...
+};
+
+
+# TODO:
+#   Think about when next() is called, as it needs to be called on term_ids as well ...
+#   Maybe this should be done in _init as a while query somehow.
+# Check the term postings against the current document of the
+# filter (VC) and move the filter on to the next relevant document.
+#
+# The single term posting (term_id) and the optional nested
+# term existence query (term_ids) are both moved forward to the
+# current filter document. A single term found in the filter document
+# is added to the existence list and its posting is closed.
+# A finished nested query has its existence list merged and is closed.
+#
+# Returns 1 if a next relevant document id was found and the filter
+# could be moved there (skip_doc returned true), otherwise 0.
+sub next {
+  my $self = shift;
+
+  # Get the current document in the VC
+  my $filter = $self->{filter};
+  my $doc_id = $filter->doc_id;
+
+  # The next document to look for in the VC
+  my $next_doc_id;
+
+
+  # Check the single term_id for existence
+
+  # The simple term does not exist
+  my $term = $self->{term_id};
+  if (!$term) {
+    # Do nothing
+  }
+
+  # Should never happen
+  elsif (!$term->current) {
+    $self->{term_id} = undef;
+  }
+
+  # Term exists and can be checked
+  else {
+
+    # Is the VC document beyond the current document id
+    if ($doc_id > $term->doc_id) {
+
+      # Move the term document to the VC document
+      $term->skip_doc($doc_id);
+    };
+
+    # Are both terms in the same document?
+    if ($term->doc_id == $doc_id) {
+
+      # Add this term to existence
+      $self->exists($term->term_id);
+
+      # Close posting
+      $term->close;
+
+      # Do not check any further
+      $self->{term_id} = undef;
+    }
+
+    # Current term document is beyond current VC doc
+    else {
+      $next_doc_id = $term->doc_id;
+    };
+  };
+
+
+  # Check the complex term_ids for existence
+
+  my $terms = $self->{term_ids};
+
+  if (!$terms) {
+    # Do nothing
+  }
+
+  # Should never happen
+  elsif (!$terms->current) {
+    $self->{term_ids} = undef;
+  }
+
+  else {
+
+    # When there is a complex query, move on
+    if ($doc_id > $terms->doc_id) {
+      $terms->skip_doc($doc_id);
+    };
+
+    # There are no further matches
+    unless ($terms->current) {
+
+      # Merge existence values
+      $self->exists($terms->existence);
+      $terms->close;
+      $self->{term_ids} = undef;
+    }
+
+    # Current terms are beyond current VC doc
+    else {
+
+      # Ask the complex query (not the single term, which may be
+      # undefined or already closed) for its current document
+      my $terms_doc_id = $terms->doc_id;
+
+      # Remember the next relevant document id - check for
+      # definedness, as 0 may be a valid document id
+      if (!defined $next_doc_id || $next_doc_id > $terms_doc_id) {
+        $next_doc_id = $terms_doc_id;
+      };
+    };
+  };
+
+  # There is a next document id defined - move on
+  if (defined $next_doc_id) {
+
+    # Move the VC stream to the next relevant position
+    if ($filter->skip_doc($next_doc_id)) {
+
+      # It's fine
+      return 1;
+    };
+  };
+
+  return 0;
+};
+
+
+# Add term ids to existence list
+# Add term ids to the existence list of this object.
+#
+# Parameters:
+#   $term_id - either a single term id or an array reference
+#              of term ids (all of them are appended)
+sub exists {
+  my ($self, $term_id) = @_;
+
+  # Collect in the object's own list, that is returned by existence()
+  push @{ $self->{existence} }, ref $term_id ? @$term_id : $term_id;
+};
+
+
+# Return list of existing term ids
+# Return the list of existing term ids as stored in the
+# object (an array reference).
+sub existence {
+  my $self = shift;
+  return $self->{existence};
+};
+
+
+# Not implemented yet - the yada-yada statement below throws an
+# "Unimplemented" exception as soon as this method is called.
+# NOTE(review): Presumably meant to set the filter (VC) that is
+# read by next() - confirm.
+sub filter_by {
+  ...
+    # It is relevant to filter the query - but one filter may be enough
+};
+
+
+1;
commit	31e088b2db9300f4132e02bd4a774a004199a3d5	[log] [tgz]
author	Akron <nils@diewald-online.de>	Fri Sep 29 14:48:49 2017 +0200
committer	Akron <nils@diewald-online.de>	Fri Sep 29 14:48:49 2017 +0200
tree	8cf94a18ea590815aa49ea797c6492c1e82bdb90
parent	bc287e5fc3c59ee42a61df60246bb9606b6faf83 [diff]