Warn and fail on wrong token order
Change-Id: I573d974c8c27f1af9a450ffab0d6f6e61e358376
diff --git a/Changes b/Changes
index ec0a887..42adaf8 100644
--- a/Changes
+++ b/Changes
@@ -1,6 +1,7 @@
-0.44 2022-02-07
+0.44 2022-02-17
- Improve Gingko Metadata support.
- Fix data-URIs by always refering to UTF-8.
+ - Warn and fail on wrong token order.
0.43 2022-01-17
- Fix temporary extract handling when defined
diff --git a/lib/KorAP/XML/Tokenizer.pm b/lib/KorAP/XML/Tokenizer.pm
index 3496d69..482bcf7 100644
--- a/lib/KorAP/XML/Tokenizer.pm
+++ b/lib/KorAP/XML/Tokenizer.pm
@@ -116,10 +116,19 @@
# my (@non_word_tokens);
my $p = $doc->primary;
+ my $old_end = 0;
foreach my $span (@$tokens) {
my $from = $span->{'-from'};
my $to = $span->{'-to'};
+ if ($from < $old_end) {
+ $self->error("Tokens duplicated or not in order");
+ $log->warn("Token positions not in order in [$from-$to] in $path");
+ return;
+ };
+
+ $old_end = $to;
+
# Get the subring from primary data
my $token = $p->data($from, $to);
diff --git a/t/corpus/artificial/opennlp/tokens_wrong.xml b/t/corpus/artificial/opennlp/tokens_wrong.xml
new file mode 100644
index 0000000..781dc6e
--- /dev/null
+++ b/t/corpus/artificial/opennlp/tokens_wrong.xml
@@ -0,0 +1,24 @@
+<?xml version="1.0" encoding="UTF-8"?><?xml-model href="span.rng" type="application/xml" schematypens="http://relaxng.org/ns/structure/1.0"?><layer xmlns="http://ids-mannheim.de/ns/KorAP" docid="ART_00001" VERSION="KorAP-0.4">
+<spanList>
+ <span id="s_7" from="0" to="3"/>
+ <span id="s_8" from="4" to="11"/>
+ <span id="s_9" from="12" to="23"/>
+ <span id="s_10" from="24" to="30"/>
+ <span id="s_11" from="31" to="35"/>
+ <span id="s_21" from="89" to="96"/>
+ <span id="s_15" from="52" to="63"/>
+ <span id="s_16" from="64" to="73"/>
+ <span id="s_17" from="74" to="77"/>
+ <span id="s_18" from="77" to="78"/>
+ <span id="s_22" from="97" to="101"/>
+ <span id="s_12" from="36" to="39"/>
+ <span id="s_13" from="40" to="47"/>
+ <span id="s_14" from="48" to="51"/>
+ <span id="s_25" from="124" to="128"/>
+ <span id="s_26" from="128" to="129"/>
+ <span id="s_19" from="79" to="84"/>
+ <span id="s_20" from="85" to="88"/>
+ <span id="s_23" from="102" to="111"/>
+ <span id="s_24" from="112" to="123"/>
+ </spanList>
+</layer>
diff --git a/t/tokens_artificial.t b/t/tokens_artificial.t
new file mode 100644
index 0000000..8daec81
--- /dev/null
+++ b/t/tokens_artificial.t
@@ -0,0 +1,37 @@
+use strict;
+use warnings;
+use utf8;
+use Test::More;
+use Benchmark ':hireswallclock';
+use Mojo::DOM;
+use Mojo::File;
+use Mojo::ByteStream 'b';
+use Data::Dumper;
+use File::Basename 'dirname';
+use File::Spec::Functions 'catdir';
+use lib 'lib', '../lib';
+
+use_ok('KorAP::XML::Krill');
+use_ok('KorAP::XML::Tokenizer');
+
+# ART: artificial corpus document located next to this test file
+my $path = catdir(dirname(__FILE__), 'corpus','artificial');
+ok(my $doc = KorAP::XML::Krill->new( path => $path . '/' ), 'Load Korap::Document');
+
+ok($doc->parse, 'Parse document');
+
+# Layer 'tokens_wrong' presumably resolves to the opennlp/tokens_wrong.xml fixture
+my $tokens = KorAP::XML::Tokenizer->new(
+  path => $doc->path,
+  doc => $doc,
+  foundry => 'OpenNLP',
+  layer => 'tokens_wrong',
+  name => 'Tokens'
+);
+
+# Order is wrong! The tokenizer is expected to abort and return a false value.
+ok(!$tokens->parse, 'Parsing tokens in wrong order fails');
+
+done_testing;
+__END__
+