Cleanup and additions regarding requirements of COSMAS II
Change-Id: Id55b9861838aed2978f095d18d83284b4ab1af08
diff --git a/lib/Krawfish/Meta/Segment/Aggregate/Length.pm b/lib/Krawfish/Meta/Segment/Aggregate/Length.pm
index 5b7fb6d..ef92dec 100644
--- a/lib/Krawfish/Meta/Segment/Aggregate/Length.pm
+++ b/lib/Krawfish/Meta/Segment/Aggregate/Length.pm
@@ -4,7 +4,7 @@
use strict;
use warnings;
-# This will check the segments length -
+# This will check the hits length in subtokens -
# currently other word lengths are not supported
# See https://en.wikipedia.org/wiki/Selection_algorithm
diff --git a/lib/Krawfish/Meta/Segment/Aggregate/TermExistence.pm b/lib/Krawfish/Meta/Segment/Aggregate/TermExistence.pm
index f1e8594..fe9ed04 100644
--- a/lib/Krawfish/Meta/Segment/Aggregate/TermExistence.pm
+++ b/lib/Krawfish/Meta/Segment/Aggregate/TermExistence.pm
@@ -3,4 +3,6 @@
# This probably requires a single list of term queries, that can be
# closed, once a match occurs.
+# Probably better suited in Group
+
__END__
diff --git a/lib/Krawfish/Meta/Segment/Aggregate/Values.pm b/lib/Krawfish/Meta/Segment/Aggregate/Values.pm
index 0f60e6c..9523887 100644
--- a/lib/Krawfish/Meta/Segment/Aggregate/Values.pm
+++ b/lib/Krawfish/Meta/Segment/Aggregate/Values.pm
@@ -11,6 +11,10 @@
# TODO:
# Support corpus classes
+# TODO:
+# This is rather a group query or better:
+# An aggregation on groups!
+
use constant {
DEBUG => 1
};
diff --git a/lib/Krawfish/Meta/Segment/Enrich/Terms.pm b/lib/Krawfish/Meta/Segment/Enrich/Terms.pm
index 527bb16..0076566 100644
--- a/lib/Krawfish/Meta/Segment/Enrich/Terms.pm
+++ b/lib/Krawfish/Meta/Segment/Enrich/Terms.pm
@@ -6,7 +6,7 @@
use warnings;
# TODO:
-# Potentially rename to ::Terms!
+# Potentially rename to ::Terms! or ::Classes!
# Enrich each match with all term ids for a specific region and
# for a specific class
diff --git a/lib/Krawfish/Meta/Segment/EnrichGroup/Values.pm b/lib/Krawfish/Meta/Segment/EnrichGroup/Values.pm
new file mode 100644
index 0000000..f94461c
--- /dev/null
+++ b/lib/Krawfish/Meta/Segment/EnrichGroup/Values.pm
@@ -0,0 +1,4 @@
+# Add per group values from fields,
+# like in a group on documents add the min and max values
+# of a field, e.g. the date span, or the total number
+# of sentences in a corpus.
diff --git a/lib/Krawfish/Meta/Segment/Group/AnnotationClasses.pm b/lib/Krawfish/Meta/Segment/Group/AnnotationClasses.pm
new file mode 100644
index 0000000..bba8b9e
--- /dev/null
+++ b/lib/Krawfish/Meta/Segment/Group/AnnotationClasses.pm
@@ -0,0 +1,11 @@
+# This should make it possible to search for classes
+# and group based on the annotations in a certain range.
+# This, however, is probably quite tricky as
+# there is no simple position based forward index with
+# term_ids for annotations, meaning that this
+# has to check the annotations in the complete forward index,
+# probably making this unusably slow.
+# But who knows ...
+
+# A query like
+# group_by_annotation_classes(1,"opennlp","p","Der {1:[]} Mann")
diff --git a/lib/Krawfish/Meta/Segment/Group/Characters.pm b/lib/Krawfish/Meta/Segment/Group/Characters.pm
new file mode 100644
index 0000000..0594f17
--- /dev/null
+++ b/lib/Krawfish/Meta/Segment/Group/Characters.pm
@@ -0,0 +1,66 @@
+package Krawfish::Meta::Segment::Group::Characters;
+use Krawfish::Log;
+use strict;
+use warnings;
+
+# The package name has to match the file path (Group/Characters.pm).
+#
+# This groups on prefixes or suffixes of subterms.
+# Necessary to support "Ansicht nach Wortendungen" for example.
+# It's possible to first group on terms and then - per term -
+# request the term surface in the dictionary and group by
+# the result.
+
+use constant DEBUG => 0;
+
+sub new {
+  my $class = shift;
+  bless {
+    segments   => shift, # Krawfish::Index::Segments object
+                         # TODO: May as well be a subtoken object
+    from_start => shift, # boolean - otherwise from end
+    char_count => shift, # compared to the length of the first characters
+    nrs        => [@_]   # all remaining arguments, passed to get_classes()
+  }, $class;
+};
+
+
+# Get the group signature of a match based on the characters of its classes.
+# NOTE: Work in progress - no group is built or returned yet.
+sub get_group {
+  my ($self, $match) = @_;
+
+  # Get all classes from the match
+  my @classes = $match->get_classes($self->{nrs});
+  my $segments = $self->{segments};
+  my %group;
+
+  # Classes have nr, start, end
+  foreach my $class (sort { $a->start <=> $b->start } @classes) {
+
+    if ($self->{from_start}) {
+
+      # This will retrieve the segment from the segments stream
+      my $segment = $segments->get($match->doc_id, $class->start);
+
+      # TODO: Check the segment - the condition was left unfinished
+
+      # The character count may be satisfied by the first characters
+      my $first_chars = $segment->first_chars;
+
+      if (length($first_chars) <= $self->{char_count}) {
+        # TODO: Cut the characters - the substr() call was left unfinished
+      }
+
+      # Check, if the class spans more than one segment
+      if ($class->end != $class->start+1) {
+        # TODO: Presumably the following segments need to be respected
+      };
+    }
+    else {
+      ... # Grouping from the end is not implemented yet
+    };
+  };
+};
+
+1;
diff --git a/lib/Krawfish/Meta/Segment/Group/Spans.pm b/lib/Krawfish/Meta/Segment/Group/Spans.pm
new file mode 100644
index 0000000..debaa69
--- /dev/null
+++ b/lib/Krawfish/Meta/Segment/Group/Spans.pm
@@ -0,0 +1,59 @@
+package Krawfish::Meta::Segment::Group::Spans;
+use parent 'Krawfish::Meta';
+use Krawfish::Log;
+use strict;
+use warnings;
+
+# This may be generalizable, but for the moment
+# it should make it possible to group the span positions
+# of a query based on a nesting query.
+#
+# The idea is to make the following possible:
+# Search for a term in sentences (like "{1:contains(<s>, {2:'baum'})}") and
+# based on the position and length of 1 and 2,
+# a result like
+#
+# 0: 5
+# 1: 7
+# 100: 2
+#
+# can be returned, where each class 1 is sliced in
+# 100 pieces and for each piece there is a dot, in case
+# class 2 occurs in that slice.
+#
+# By doing that it's easy to visualize the position of expressions
+# in sentences or documents etc.
+#
+# For example to answer questions like 'where in documents does
+# the phrase "Herzlichen Dank" occur?'
+#
+# If the span spans more than 1 slice, the result can be
+#
+# 0_2: 1
+# 0_3: 4
+# 4: 6
+#
+# etc. In case the second class is not nested in the first
+# class, this is not counted at all (as this would result
+# in weird data regarding the slice sizes).
+
+# Accepts the named parameters slices, wrap_class and embedded_class
+sub new {
+  my ($class, %param) = @_;
+  bless {
+    slices         => $param{slices} // 100,       # slices per wrapping span
+    wrap_class     => $param{wrap_class} // 1,     # class nr of the wrapping span
+    embedded_class => $param{embedded_class} // 2  # class nr of the nested span
+  }, $class;
+};
+
+# Get the group signature for each match
+# May well be renamed to get_signature
+sub get_group {
+  my ($self) = @_;
+
+  # Placeholder: both slice boundaries are fixed to 0 for the moment
+  return join '_', 0, 0;
+};
+
+1;
diff --git a/lib/Krawfish/Meta/Segment/Group/TermExistence.pm b/lib/Krawfish/Meta/Segment/Group/TermExistence.pm
new file mode 100644
index 0000000..3570166
--- /dev/null
+++ b/lib/Krawfish/Meta/Segment/Group/TermExistence.pm
@@ -0,0 +1,159 @@
+package Krawfish::Meta::Segment::Group::TermExistence;
+use parent 'Krawfish::Meta';
+use strict;
+use warnings;
+
+# The query works similarly to an Or-query, but only accepts term ids.
+
+sub new {
+  my ($class, $term_id, $term_ids) = @_;
+  return bless {
+    existence => [],        # filled by exists()
+    filter    => undef,     # not set by the constructor
+    term_id   => $term_id,  # Term Query
+    term_ids  => $term_ids  # Optional TermExistence-Query
+  }, $class;
+};
+
+sub _init {
+  ... # Not implemented yet - the ellipsis statement dies when executed
+};
+
+
+# TODO:
+# Think about when next() is called, as it needs to be called on term_ids as well ...
+# Maybe this should be done in _init as a while query somehow.
+# Check the term postings against the current filter document and move the
+# filter to the next candidate. Returns 1 if the filter was moved, else 0.
+sub next {
+  my $self = shift;
+
+  # Get the current document in the VC
+  my $filter = $self->{filter};
+  my $doc_id = $filter->doc_id;
+
+  # The next document to look for in the VC
+  my $next_doc_id;
+
+  # Check the single term_id for existence
+
+  # The simple term does not exist
+  my $term = $self->{term_id};
+  if (!$term) {
+    # Do nothing
+  }
+
+  # Should never happen
+  elsif (!$term->current) {
+    $self->{term_id} = undef;
+  }
+
+  # Term exists and can be checked
+  else {
+
+    # Is the VC document beyond the current document id
+    if ($doc_id > $term->doc_id) {
+
+      # Move the term document to the VC document
+      $term->skip_doc($doc_id);
+    };
+
+    # Are both terms in the same document?
+    if ($term->doc_id == $doc_id) {
+
+      # Add this term to existence
+      $self->exists($term->term_id);
+
+      # Close posting
+      $term->close;
+
+      # Do not check any further
+      $self->{term_id} = undef;
+    }
+
+    # Current term document is beyond current VC doc
+    else {
+      $next_doc_id = $term->doc_id;
+    };
+  };
+
+  # Check the complex term_ids for existence
+
+  my $terms = $self->{term_ids};
+
+  if (!$terms) {
+    # Do nothing
+  }
+
+  # Should never happen
+  elsif (!$terms->current) {
+    $self->{term_ids} = undef;
+  }
+
+  else {
+
+    # When there is a complex query, move on
+    if ($doc_id > $terms->doc_id) {
+      $terms->skip_doc($doc_id);
+    };
+
+    # There are no further matches
+    unless ($terms->current) {
+
+      # Merge existence values
+      $self->exists($terms->existence);
+      $terms->close;
+      $self->{term_ids} = undef;
+    }
+
+    # Current terms are beyond current VC doc
+    else {
+      # Remember the next relevant document id - this needs to check the
+      # complex query, as the single term may be undefined at this point
+      if (!defined $next_doc_id || $next_doc_id > $terms->doc_id) {
+        $next_doc_id = $terms->doc_id;
+      };
+    };
+  };
+
+  # There is a next document id defined - move on
+  if (defined $next_doc_id) {
+
+    # Move the VC stream to the next relevant position
+    if ($filter->skip_doc($next_doc_id)) {
+
+      # It's fine
+      return 1;
+    };
+  };
+
+  return 0;
+};
+
+
+# Add term ids to existence list
+sub exists {
+  my ($self, $term_id) = @_;
+
+  # An array reference holds multiple ids (like a merged existence list),
+  # anything else is treated as a single term id
+  my @ids = ref $term_id ? @$term_id : ($term_id);
+
+  # Collect the ids in the list of this object
+  push @{$self->{existence}}, @ids;
+};
+
+# Return the collected term ids as an array reference
+sub existence {
+  my $self = shift;
+  return $self->{existence};
+};
+
+
+sub filter_by {
+  ... # Not implemented yet - the ellipsis statement dies when executed
+  # It is relevant to filter the query - but one filter may be enough
+};
+
+
+1;