Improve offset error identification

Change-Id: Id5c9dadb043a3adff9ec2c70f3fd957c38fa9ee8
diff --git a/Changes b/Changes
index 77350ea..55212ca 100644
--- a/Changes
+++ b/Changes
@@ -1,6 +1,7 @@
         - korapxml2conllu:
             - the sigle-pattern option now affects the entire sigle
             - handle docid attributes correctly if they are in a different line than their parent element <layer>
+            - improve identification of offset errors (also warn when a retrieved token is empty)
 
 0.5.0 2022-09-29
         - korapxml2conllu:
diff --git a/script/korapxml2conllu b/script/korapxml2conllu
index b771474..9f19987 100755
--- a/script/korapxml2conllu
+++ b/script/korapxml2conllu
@@ -215,7 +215,7 @@
       }
 #      $log->debug("found span: $current_id $current_from $current_to");
       $token = substr($plain_texts{$docid}, $current_from, $current_to - $current_from);
-      if (!defined $token) {
+      if (!defined $token || length($token) == 0) {
         $log->warn("$docid: could not retrieve token at $current_from-$current_to/", length($plain_texts{$docid}), " - ending with: ", substr($plain_texts{$docid},length($plain_texts{$docid})-10));
         $token = "_";
       }
diff --git a/t/data/nkjp-fail.zip b/t/data/nkjp-fail.zip
new file mode 100644
index 0000000..94fcc5f
--- /dev/null
+++ b/t/data/nkjp-fail.zip
Binary files differ
diff --git a/t/test.t b/t/test.t
index 112846d..52ac9d0 100644
--- a/t/test.t
+++ b/t/test.t
@@ -1,6 +1,6 @@
 use strict;
 use warnings;
-use Test::More tests => 59;
+use Test::More tests => 61;
 use Test::Script;
 use Test::TempDir::Tiny;
 use File::Copy;
@@ -212,4 +212,7 @@
 script_stdout_like("WDF19/A0000/13072/tree_tagger/morpho.xml", "--sigle-pattern to specify a text sigle pattern extracts the right texts");
 script_stdout_unlike("WDF19/A0000/14247/tree_tagger/morpho.xml", "--sigle-pattern to specify a text sigle pattern does not extract the wrong texts");
 
+script_runs([ 'script/korapxml2conllu', "t/data/nkjp-fail.zip" ], "Runs korapxml2conllu on nkjp-fail test data");
+script_stderr_like("could not retrieve token at 1297-1298/ 1297  - ending with:  e! upadku.", "Offset error");
+
 done_testing;