Warn and fail on wrong token order
Change-Id: I573d974c8c27f1af9a450ffab0d6f6e61e358376
diff --git a/lib/KorAP/XML/Tokenizer.pm b/lib/KorAP/XML/Tokenizer.pm
index 3496d69..482bcf7 100644
--- a/lib/KorAP/XML/Tokenizer.pm
+++ b/lib/KorAP/XML/Tokenizer.pm
@@ -116,10 +116,19 @@
# my (@non_word_tokens);
my $p = $doc->primary;
+ my $old_end = 0;
foreach my $span (@$tokens) {
my $from = $span->{'-from'};
my $to = $span->{'-to'};
+ if ($from < $old_end) {
+ $self->error("Tokens duplicated or not in order");
+ $log->warn("Token positions not in order in [$from-$to] in $path");
+ return;
+ };
+
+ $old_end = $to;
+
# Get the subring from primary data
my $token = $p->data($from, $to);