Improve snippet generation with different markup classes
Change-Id: I96cc269b49c2cadcd692ae9030b077b4a8a25b70
diff --git a/lib/Krawfish/Koral/Document.pm b/lib/Krawfish/Koral/Document.pm
index bfc465a..74297f7 100644
--- a/lib/Krawfish/Koral/Document.pm
+++ b/lib/Krawfish/Koral/Document.pm
@@ -26,6 +26,13 @@
# TODO:
# Don't forget to deal with TUIs!
+# TODO:
+# Add character extensions to the forward index only
+
+# TODO:
+# Fields need - depending on the type -
+# a prefix AND a postfix!
+
use constant DEBUG => 0;
# Parse the document and create an inverted index file
diff --git a/lib/Krawfish/Koral/Document/Annotation.pm b/lib/Krawfish/Koral/Document/Annotation.pm
index 4108dde..a4c5573 100644
--- a/lib/Krawfish/Koral/Document/Annotation.pm
+++ b/lib/Krawfish/Koral/Document/Annotation.pm
@@ -5,6 +5,11 @@
use strict;
use Krawfish::Koral::Query::Term;
+# TODO:
+# Have common methods with
+# Krawfish::Koral::Result::Enrich::Snippet::Markup
+
+
# Accepts a Krawfish::Koral::Query::Term object
sub new {
my $class = shift;
diff --git a/lib/Krawfish/Koral/Result/Enrich/Snippet.pm b/lib/Krawfish/Koral/Result/Enrich/Snippet.pm
index a03ac06..af7935a 100644
--- a/lib/Krawfish/Koral/Result/Enrich/Snippet.pm
+++ b/lib/Krawfish/Koral/Result/Enrich/Snippet.pm
@@ -5,6 +5,22 @@
with 'Krawfish::Koral::Result::Inflatable';
+# The structure of a match is as follows:
+#
+# <context>
+# <more />
+# ... # Pure text and decorations
+# <focus> # Possible extension to elements
+# ... # Pure text, decorations and annotations
+# <hit> # The concrete hit
+# ... # Pure text, decorations, annotations and highlights
+# </hit>
+# ... # Pure text, decorations and annotations
+# </focus>
+# ... # Pure text and decorations
+# <more />
+# </context>
+
# TODO:
# Make sure this works for right-to-left (RTL) language scripts as well!
@@ -14,10 +30,18 @@
sub new {
my $class = shift;
+ # stream
+ # stream_offset
+ # doc_id
+
# match_ids
- bless {
+ my $self = bless {
@_
}, $class;
+ # The object is built from the passed key/value pairs;
+ # the annotation list defaults to an empty array
+ $self->{annotations} //= [];
+ return $self;
};
@@ -36,50 +60,6 @@
};
-# Set context end position
-sub context_end {
- my $self = shift;
- if (@_) {
- $self->{context_end} = shift;
- return $self;
- };
- return $self->{context_end};
-};
-
-
-# Set extension end position
-sub extension_end {
- my $self = shift;
- if (@_) {
- $self->{extension_end} = shift;
- return $self;
- };
- return $self->{extension_end};
-};
-
-
-# Set context start position
-sub hit_start {
- my $self = shift;
- if (@_) {
- $self->{hit_start} = shift;
- return $self;
- };
- return $self->{hit_start};
-};
-
-
-# Set context end position
-sub hit_end {
- my $self = shift;
- if (@_) {
- $self->{hit_end} = shift;
- return $self;
- };
- return $self->{hit_end};
-};
-
-
# Set doc id
sub doc_id {
my $self = shift;
@@ -91,28 +71,6 @@
};
-# Add highlight to snippet
-sub add_highlight {
- my ($self, $highlight) = @_;
- my $hls = ($self->{highlights} //= []);
- push @$hls, $highlight;
-};
-
-
-# Add annotations to be retrieved in hit
-sub add_annotation {
- ...
-};
-
-
-# All annotations to be retrieved in hit
-sub annotations_sorted {
- # TODO:
- # Sort all requested annotations numerically by
- # foundry_id > layer_id > anno_id!
- return ();
-};
-
# This stores a Krawfish::Koral::Document::Stream
# with the stream_offset subtoken at 0
sub stream {
@@ -142,6 +100,7 @@
my $str = $self->key . ':' . $self->stream->to_string($id);
};
+
# Key for KQ serialization
sub key {
'snippet'
@@ -156,4 +115,163 @@
};
+sub _order_markup {
+  my ($self, $stream) = @_;
+  # Merge opening and closing tags into one ordered list (based on processHighlightStack() in Krill).
+  # NOTE: still a stub - @open and @close are never filled, so an empty list is returned.
+  # 1. Take all markup and split into opening and closing tags
+  #    - Milestones are only added as starts
+  my (@open, @close);
+  # 2. Sort the open tags:
+  #    - by start position
+  #    - by start character extension
+  #    - by end position
+  #    - by class number
+  # 3. Sort the closing tags
+  #    - by end position
+  #    - by end character extension
+  #    - by start position
+  #    - by class number
+  # 4. Create a stack or a list of the doubled length of
+  #    the opening list
+  my @stack;
+
+  while (@open || @close) {
+
+    # No more open tags - drain the closers front to back to keep their sorted order
+    if (!@open) {
+      push @stack, shift @close;
+      next;
+    }
+
+    # No more end tags
+    elsif (!@close) {
+      last;
+    };
+
+    # The opener starts before the closer ends (assumes elements provide start() and end())
+    if ($open[0]->start < $close[0]->end) {
+      push @stack, shift @open;
+    }
+
+    # First let the closer end
+    else {
+      push(@stack, shift(@close));
+    };
+  };
+
+  return @stack;
+
+  # 5. (Not reached yet, see the return above) Iterate over the stream and add all annotations.
+  #    Stream is:
+  #    Krawfish::Koral::Document::Stream
+  #    with surface annotations only
+  my $length = $self->stream->length;
+  while ($length > 0) {
+    ...
+  };
+};
+
+# Add a markup object to the annotation list and record hit, context or focus boundaries
+sub add {
+  my $self = shift;
+  my $e = shift;
+  # Objects not consuming the Markup role are ignored
+  # Add markup objects
+  if (Role::Tiny::does_role($e, 'Krawfish::Koral::Result::Enrich::Snippet::Markup')) {
+
+    # Add the hit boundaries
+    if (Role::Tiny::does_role($e, 'Krawfish::Koral::Result::Enrich::Snippet::Hit')) {
+      $self->hit_start($e->start);
+      $self->hit_end($e->end);
+    }
+
+    # Context information
+    elsif (Role::Tiny::does_role($e, 'Krawfish::Koral::Result::Enrich::Snippet::Context')) {
+      $self->context_start($e->start);
+      $self->context_end($e->end);
+    }
+
+    # Scope extended by, e.g., spans
+    elsif (Role::Tiny::does_role($e, 'Krawfish::Koral::Result::Enrich::Snippet::Focus')) {
+      $self->focus_start($e->start);
+      $self->focus_end($e->end);
+    };
+
+    # Push the element itself - both arguments were shifted off @_, so $_[0] is not the element
+    push @{$self->{annotations}}, $e;
+  };
+};
+
+
+
+# Get or set the context start position
+# (the setter returns the object itself)
+sub context_start {
+  my ($self, @value) = @_;
+  # Without arguments, act as a getter
+  return $self->{context_start} unless @value;
+  $self->{context_start} = $value[0];
+  return $self;
+};
+
+
+# Get or set the context end position (setter returns the object)
+sub context_end {
+  my ($self, @args) = @_;
+  if (!@args) {
+    return $self->{context_end};
+  };
+  $self->{context_end} = shift @args;
+  return $self;
+};
+
+
+
+# Get or set the focus start position
+# (the setter returns the object itself)
+sub focus_start {
+  my ($self, @value) = @_;
+  # Without arguments, act as a getter
+  return $self->{focus_start} unless @value;
+  $self->{focus_start} = $value[0];
+  return $self;
+};
+
+
+# Get or set the focus end position (setter returns the object)
+sub focus_end {
+  my ($self, @args) = @_;
+  if (!@args) {
+    return $self->{focus_end};
+  };
+  $self->{focus_end} = shift @args;
+  return $self;
+};
+
+
+# Get or set the hit start position
+# (the setter returns the object itself)
+sub hit_start {
+  my ($self, @value) = @_;
+  # Without arguments, act as a getter
+  return $self->{hit_start} unless @value;
+  $self->{hit_start} = $value[0];
+  return $self;
+};
+
+
+# Get or set the hit end position (setter returns the object)
+sub hit_end {
+  my ($self, @args) = @_;
+  if (!@args) {
+    return $self->{hit_end};
+  };
+  $self->{hit_end} = shift @args;
+  return $self;
+};
+
+
+
+
1;
diff --git a/lib/Krawfish/Koral/Result/Enrich/Snippet/Annotation.pm b/lib/Krawfish/Koral/Result/Enrich/Snippet/Annotation.pm
new file mode 100644
index 0000000..1150bb6
--- /dev/null
+++ b/lib/Krawfish/Koral/Result/Enrich/Snippet/Annotation.pm
@@ -0,0 +1,28 @@
+package Krawfish::Koral::Result::Enrich::Snippet::Annotation;
+use strict;
+use warnings;
+use Role::Tiny;
+
+# TODO:
+# This role needs the term identifier
+# role!
+
+sub foundry {}; # Stub: empty body, returns nothing yet
+
+sub layer {}; # Stub: empty body, returns nothing yet
+
+sub key {}; # Stub: empty body, returns nothing yet
+
+sub value {}; # Stub: empty body, returns nothing yet
+
+# Get or set the certainty of the annotation
+# (the setter returns the object itself)
+sub certainty {
+  my ($self, @value) = @_;
+  # Without arguments, act as a getter
+  return $self->{certainty} unless @value;
+  $self->{certainty} = $value[0];
+  return $self;
+};
+
+1;
diff --git a/lib/Krawfish/Koral/Result/Enrich/Snippet/Attribute.pm b/lib/Krawfish/Koral/Result/Enrich/Snippet/Attribute.pm
new file mode 100644
index 0000000..de7cc7a
--- /dev/null
+++ b/lib/Krawfish/Koral/Result/Enrich/Snippet/Attribute.pm
@@ -0,0 +1,23 @@
+package Krawfish::Koral::Result::Enrich::Snippet::Attribute;
+use strict;
+use warnings;
+use Role::Tiny;
+
+with 'Krawfish::Koral::Result::Enrich::Snippet::Markup';
+with 'Krawfish::Koral::Result::Enrich::Snippet::Annotation';
+
+
+# Get or set the value behind ref_tui (the setter returns the object)
+sub ref_tui {
+ my $self = shift;
+ if (@_) {
+ $self->{target_start} = shift; # NOTE(review): stored under 'target_start', not 'ref_tui' - confirm intended
+ return $self;
+ };
+ return $self->{target_start}; # NOTE(review): same key as above - confirm intended
+};
+
+
+
+
+1;
diff --git a/lib/Krawfish/Koral/Result/Enrich/Snippet/Focus.pm b/lib/Krawfish/Koral/Result/Enrich/Snippet/Focus.pm
new file mode 100644
index 0000000..265bd97
--- /dev/null
+++ b/lib/Krawfish/Koral/Result/Enrich/Snippet/Focus.pm
@@ -0,0 +1,11 @@
+package Krawfish::Koral::Result::Enrich::Snippet::Focus;
+use strict;
+use warnings;
+use Role::Tiny;
+use Krawfish::Log;
+
+with 'Krawfish::Koral::Result::Enrich::Snippet::Markup';
+
+use constant DEBUG => 0;
+
+1;
diff --git a/lib/Krawfish/Koral/Result/Enrich/Snippet/Highlight.pm b/lib/Krawfish/Koral/Result/Enrich/Snippet/Highlight.pm
new file mode 100644
index 0000000..b70df94
--- /dev/null
+++ b/lib/Krawfish/Koral/Result/Enrich/Snippet/Highlight.pm
@@ -0,0 +1,23 @@
+package Krawfish::Koral::Result::Enrich::Snippet::Highlight;
+use strict;
+use warnings;
+use Role::Tiny;
+use Krawfish::Log;
+
+with 'Krawfish::Koral::Result::Enrich::Snippet::Markup';
+
+use constant DEBUG => 0;
+
+
+# Get or set the class number of the highlight (setter returns the object)
+sub number {
+  my ($self, @args) = @_;
+  if (!@args) {
+    return $self->{number};
+  };
+  $self->{number} = shift @args;
+  return $self;
+};
+
+
+1;
diff --git a/lib/Krawfish/Koral/Result/Enrich/Snippet/Hit.pm b/lib/Krawfish/Koral/Result/Enrich/Snippet/Hit.pm
new file mode 100644
index 0000000..8819d6f
--- /dev/null
+++ b/lib/Krawfish/Koral/Result/Enrich/Snippet/Hit.pm
@@ -0,0 +1,19 @@
+package Krawfish::Koral::Result::Enrich::Snippet::Hit;
+use strict;
+use warnings;
+use Role::Tiny;
+use Krawfish::Log;
+
+with 'Krawfish::Koral::Result::Enrich::Snippet::Markup';
+
+use constant DEBUG => 0;
+
+
+# Bracket notation: '[' for an opening hit tag, ']' otherwise
+sub to_brackets {
+  return '[' if $_[0]->is_opening;
+  return ']';
+};
+
+
+1;
diff --git a/lib/Krawfish/Koral/Result/Enrich/Snippet/Markup.pm b/lib/Krawfish/Koral/Result/Enrich/Snippet/Markup.pm
new file mode 100644
index 0000000..403dce1
--- /dev/null
+++ b/lib/Krawfish/Koral/Result/Enrich/Snippet/Markup.pm
@@ -0,0 +1,87 @@
+package Krawfish::Koral::Result::Enrich::Snippet::Markup;
+use strict;
+use warnings;
+use Role::Tiny;
+
+requires qw/start
+ end
+ start_char
+ end_char/;
+
+# TODO:
+# Have common methods with
+# Krawfish::Koral::Document::Annotation
+
+# TODO:
+# This is the base class for
+# - hit
+# - highlight
+# - relation
+# - anchor
+# - Annotation
+
+# TODO:
+# All these roles may very well
+# be under Koral - as index data types.
+
+sub new { # Constructor: blesses the passed key/value pairs into the class
+  my ($class, @param) = @_;
+  return bless { @param }, $class;
+};
+
+
+# Get or set the start position
+# (the setter returns the object itself)
+sub start {
+  my ($self, @value) = @_;
+  # Without arguments, act as a getter
+  return $self->{start} unless @value;
+  $self->{start} = $value[0];
+  return $self;
+};
+
+
+# Get or set the end position (setter returns the object)
+sub end {
+  my ($self, @args) = @_;
+  if (!@args) {
+    return $self->{end};
+  };
+  $self->{end} = shift @args;
+  return $self;
+};
+
+
+# Get or set the start character value
+# (the setter returns the object itself)
+sub start_char {
+  my ($self, @value) = @_;
+  # Without arguments, act as a getter
+  return $self->{start_char} unless @value;
+  $self->{start_char} = $value[0];
+  return $self;
+};
+
+
+# Get or set the end character value (setter returns the object)
+sub end_char {
+  my ($self, @args) = @_;
+  if (!@args) {
+    return $self->{end_char};
+  };
+  $self->{end_char} = shift @args;
+  return $self;
+};
+
+
+# Get or set whether the element occurs as an opening tag
+# (the setter returns the object itself)
+sub is_opening {
+  my ($self, @flag) = @_;
+  # Without arguments, act as a getter
+  return $self->{opening} if @flag == 0;
+  $self->{opening} = $flag[0];
+  return $self;
+};
+
+1;
diff --git a/lib/Krawfish/Koral/Result/Enrich/Snippet/Milestone.pm b/lib/Krawfish/Koral/Result/Enrich/Snippet/Milestone.pm
new file mode 100644
index 0000000..7d997dc
--- /dev/null
+++ b/lib/Krawfish/Koral/Result/Enrich/Snippet/Milestone.pm
@@ -0,0 +1,20 @@
+package Krawfish::Koral::Result::Enrich::Snippet::Milestone;
+use strict;
+use warnings;
+use Role::Tiny;
+use Krawfish::Log;
+
+with 'Krawfish::Koral::Result::Enrich::Snippet::Markup';
+with 'Krawfish::Koral::Result::Enrich::Snippet::Annotation';
+
+use constant DEBUG => 0;
+
+# The milestone element always is embedded before
+# the actual position
+
+# The end of a milestone is always its start position
+sub end {
+  return shift->start;
+};
+
+1;
diff --git a/lib/Krawfish/Koral/Result/Enrich/Snippet/Relation.pm b/lib/Krawfish/Koral/Result/Enrich/Snippet/Relation.pm
new file mode 100644
index 0000000..10a78b7
--- /dev/null
+++ b/lib/Krawfish/Koral/Result/Enrich/Snippet/Relation.pm
@@ -0,0 +1,61 @@
+package Krawfish::Koral::Result::Enrich::Snippet::Relation;
+use strict;
+use warnings;
+use Role::Tiny;
+use Krawfish::Log;
+
+with 'Krawfish::Koral::Result::Enrich::Snippet::Markup';
+with 'Krawfish::Koral::Result::Enrich::Snippet::TUI';
+with 'Krawfish::Koral::Result::Enrich::Snippet::Annotation';
+
+use constant DEBUG => 0;
+
+sub left_to_right {
+  return $_[0]->{left_to_right}; # Read-only accessor, takes the invocant directly from @_
+};
+
+# Get or set the start position of the right part
+# (kept under the 'target_start' key; the setter returns the object)
+sub right_start {
+  my ($self, @value) = @_;
+  # Without arguments, act as a getter
+  return $self->{target_start} unless @value;
+  $self->{target_start} = $value[0];
+  return $self;
+};
+
+
+# Get or set the end position of the right part (kept under 'target_end')
+sub right_end {
+  my ($self, @args) = @_;
+  if (!@args) {
+    return $self->{target_end};
+  };
+  $self->{target_end} = shift @args;
+  return $self;
+};
+
+
+# Get or set the TUI of the source
+# (the setter returns the object itself)
+sub source_tui {
+  my ($self, @value) = @_;
+  # Without arguments, act as a getter
+  return $self->{source_tui} unless @value;
+  $self->{source_tui} = $value[0];
+  return $self;
+};
+
+
+# Get or set the TUI of the target (setter returns the object)
+sub target_tui {
+  my ($self, @args) = @_;
+  if (!@args) {
+    return $self->{target_tui};
+  };
+  $self->{target_tui} = shift @args;
+  return $self;
+};
+
+
+1;
diff --git a/lib/Krawfish/Koral/Result/Enrich/Snippet/Span.pm b/lib/Krawfish/Koral/Result/Enrich/Snippet/Span.pm
new file mode 100644
index 0000000..374ff81
--- /dev/null
+++ b/lib/Krawfish/Koral/Result/Enrich/Snippet/Span.pm
@@ -0,0 +1,26 @@
+package Krawfish::Koral::Result::Enrich::Snippet::Span;
+use strict;
+use warnings;
+use Role::Tiny;
+use Krawfish::Log;
+
+with 'Krawfish::Koral::Result::Enrich::Snippet::Markup';
+with 'Krawfish::Koral::Result::Enrich::Snippet::TUI';
+with 'Krawfish::Koral::Result::Enrich::Snippet::Certainty'; # NOTE(review): no Certainty role is added by this change (certainty() is defined in ...::Snippet::Annotation) - confirm the role exists
+
+# Spans are used for token as well as span annotations,
+# therefore even tokens can have a depth information
+
+use constant DEBUG => 0;
+
+# Get or set the depth information
+# (the setter returns the object itself)
+sub depth {
+  my ($self, @value) = @_;
+  # Without arguments, act as a getter
+  return $self->{depth} unless @value;
+  $self->{depth} = $value[0];
+  return $self;
+};
+
+1;
diff --git a/lib/Krawfish/Koral/Result/Enrich/Snippet/TUI.pm b/lib/Krawfish/Koral/Result/Enrich/Snippet/TUI.pm
new file mode 100644
index 0000000..34218ad
--- /dev/null
+++ b/lib/Krawfish/Koral/Result/Enrich/Snippet/TUI.pm
@@ -0,0 +1,16 @@
+package Krawfish::Koral::Result::Enrich::Snippet::TUI;
+use strict;
+use warnings;
+use Role::Tiny;
+
+# Get or set the token unique identifier (setter returns the object)
+sub tui {
+  my ($self, @args) = @_;
+  if (!@args) {
+    return $self->{tui};
+  };
+  $self->{tui} = shift @args;
+  return $self;
+};
+
+1;