Improve checks regarding annotation boundaries
Resolves #6
Change-Id: I3a41d239ea44923720bb7d557de26f30f285ae67
diff --git a/Changes b/Changes
index 2c4883f..d04ebd3 100644
--- a/Changes
+++ b/Changes
@@ -1,3 +1,6 @@
+2.4.2 2023-02-10
+ - Improve checks for numerical annotation bounds.
+
2.4.1 2023-02-07
- Fix test.
diff --git a/lib/KorAP/XML/TEI/Annotations/Annotation.pm b/lib/KorAP/XML/TEI/Annotations/Annotation.pm
index 305de49..892c3fd 100644
--- a/lib/KorAP/XML/TEI/Annotations/Annotation.pm
+++ b/lib/KorAP/XML/TEI/Annotations/Annotation.pm
@@ -3,6 +3,7 @@
use warnings;
use Log::Any '$log';
use KorAP::XML::TEI 'escape_xml';
+use Scalar::Util qw'looks_like_number';
# TODO:
# Make these parameters passable from the script
@@ -37,6 +38,23 @@
# Create a new annotation object
sub new {
my $class = shift;
+
+ if (defined $_[1]) {
+ unless (looks_like_number($_[1])) {
+ return;
+ };
+
+ if (defined $_[2]) {
+ unless (looks_like_number($_[2])) {
+ return;
+ };
+
+ if (defined $_[3] && !looks_like_number($_[3])) {
+ return;
+ };
+ };
+ };
+
my $self = bless [@_], $class;
# Ensure minimum length for pushing attributes
@@ -47,7 +65,12 @@
# Set 'from'
sub set_from {
- $_[0]->[FROM] = $_[1];
+ if (looks_like_number($_[1])) {
+ $_[0]->[FROM] = $_[1];
+ return 1;
+ };
+ $log->fatal('Passed non-numeric value as annotation start');
+ return;
};
@@ -59,7 +82,12 @@
# Set 'to'
sub set_to {
- $_[0]->[TO] = $_[1];
+ if (looks_like_number($_[1])) {
+ $_[0]->[TO] = $_[1];
+ return 1;
+ };
+ $log->fatal('Passed non-numeric value as annotation end');
+ return;
};
@@ -73,7 +101,12 @@
sub set_level {
# Insert information about depth of element in XML-tree
# (top element = level 1)
- $_[0]->[LEVEL] = $_[1];
+ if (looks_like_number($_[1])) {
+ $_[0]->[LEVEL] = $_[1];
+ return 1;
+ };
+ $log->fatal('Passed non-numeric value as annotation level');
+ return;
};
diff --git a/lib/KorAP/XML/TEI/Annotations/Collector.pm b/lib/KorAP/XML/TEI/Annotations/Collector.pm
index 887bbb0..6fd69da 100644
--- a/lib/KorAP/XML/TEI/Annotations/Collector.pm
+++ b/lib/KorAP/XML/TEI/Annotations/Collector.pm
@@ -14,7 +14,7 @@
# Add new annotation to annotation list
sub add_new_annotation {
my $self = shift;
- my $token = KorAP::XML::TEI::Annotations::Annotation->new(@_);
+ my $token = KorAP::XML::TEI::Annotations::Annotation->new(@_) or return;
push @$self, $token;
return $token;
};
diff --git a/lib/KorAP/XML/TEI/Tokenizer/External.pm b/lib/KorAP/XML/TEI/Tokenizer/External.pm
index 92f0c31..e0f4f29 100644
--- a/lib/KorAP/XML/TEI/Tokenizer/External.pm
+++ b/lib/KorAP/XML/TEI/Tokenizer/External.pm
@@ -6,6 +6,7 @@
use IO::Select;
use IPC::Open2 qw(open2);
use Encode qw(encode);
+use Scalar::Util qw'looks_like_number';
# This tokenizer starts an external process for
# tokenization. It writes the data to tokenize
@@ -126,6 +127,9 @@
# Serialize all bounds
my $c = 0;
for (my $i = 0; $i < @bounds; $i += 2 ){
+ unless (looks_like_number($bounds[$i]) && looks_like_number($bounds[$i+1])) {
+ die $log->fatal("Token bounds not numerical from external tokenizer ('$text_id')");
+ };
$output .= qq! <span id="t_$c" from="! . $bounds[$i] . '" to="' .
$bounds[$i+1] . qq!" />\n!;
$c++;
@@ -184,9 +188,9 @@
my ($self, $structures) = @_;
for (my $i=0; $i < @{$self->{sentence_starts}}; $i++) {
- my $anno = $structures->add_new_annotation("s");
- $anno->set_from($self->{sentence_starts}[$i]);
- $anno->set_to($self->{sentence_endss}[$i]);
+ my $anno = $structures->add_new_annotation('s');
+ $anno->set_from($self->{sentence_starts}[$i]) or die $log->fatal('Sentence boundaries not numerical');
+ $anno->set_to($self->{sentence_endss}[$i]) or die $log->fatal('Sentence boundaries not numerical');
$anno->set_level(-1);
}
$self->{sentence_starts} = [];
diff --git a/t/annotation.t b/t/annotation.t
index 1176c98..998e7ea 100644
--- a/t/annotation.t
+++ b/t/annotation.t
@@ -78,6 +78,11 @@
->text_is('span > fs > f > fs f:nth-of-type(3)', 'C & A')
};
+subtest 'Wrong annotations' => sub {
+ ok(!KorAP::XML::TEI::Annotations::Annotation->new('p','Error',0));
+ ok(!KorAP::XML::TEI::Annotations::Annotation->new('p',0,'Occurred'));
+ ok(!KorAP::XML::TEI::Annotations::Annotation->new('p',0,5,'Fehler'));
+};
done_testing;
diff --git a/t/annotations-collect.t b/t/annotations-collect.t
index 543e693..aa67937 100644
--- a/t/annotations-collect.t
+++ b/t/annotations-collect.t
@@ -44,7 +44,21 @@
my $anno = KorAP::XML::TEI::Annotations::Annotation->new('x4', 20 => 21);
-$t->add_annotation($anno);
+ok($anno);
+
+ok($t->add_annotation($anno));
+
+$loy = Test::XML::Loy->new($t->to_string('text',0))
+ ->attr_is('layer', 'docid', 'text')
+ ->attr_is('span#s0', 'to', '8')
+ ->attr_is('span#s1', 'to', '14')
+ ->attr_is('span#s1', 'l', '2')
+ ->attr_is('span#s2', 'to', '20')
+ ->attr_is('span#s3', 'from', '20')
+ ->attr_is('span#s3', 'to', '21')
+;
+
+ok(!$t->add_new_annotation('x1','error','occurred'));
$loy = Test::XML::Loy->new($t->to_string('text',0))
->attr_is('layer', 'docid', 'text')