Improve offset error identification
Change-Id: Id5c9dadb043a3adff9ec2c70f3fd957c38fa9ee8
diff --git a/Changes b/Changes
index 77350ea..55212ca 100644
--- a/Changes
+++ b/Changes
@@ -1,6 +1,7 @@
- korapxml2conllu:
- the sigle-pattern option now affects the entire sigle
- handle docid attributes correctly if they are in a different line than their parent element <layer>
+ - improve identification of offset errors: empty (zero-length) tokens are now also reported and replaced by "_"
0.5.0 2022-09-29
- korapxml2conllu:
diff --git a/script/korapxml2conllu b/script/korapxml2conllu
index b771474..9f19987 100755
--- a/script/korapxml2conllu
+++ b/script/korapxml2conllu
@@ -215,7 +215,7 @@
}
# $log->debug("found span: $current_id $current_from $current_to");
$token = substr($plain_texts{$docid}, $current_from, $current_to - $current_from);
- if (!defined $token) {
+ if (!defined $token || length($token) == 0) {
$log->warn("$docid: could not retrieve token at $current_from-$current_to/", length($plain_texts{$docid}), " - ending with: ", substr($plain_texts{$docid},length($plain_texts{$docid})-10));
$token = "_";
}
diff --git a/t/data/nkjp-fail.zip b/t/data/nkjp-fail.zip
new file mode 100644
index 0000000..94fcc5f
--- /dev/null
+++ b/t/data/nkjp-fail.zip
Binary files differ
diff --git a/t/test.t b/t/test.t
index 112846d..52ac9d0 100644
--- a/t/test.t
+++ b/t/test.t
@@ -1,6 +1,6 @@
use strict;
use warnings;
-use Test::More tests => 59;
+use Test::More tests => 61;
use Test::Script;
use Test::TempDir::Tiny;
use File::Copy;
@@ -212,4 +212,7 @@
script_stdout_like("WDF19/A0000/13072/tree_tagger/morpho.xml", "--sigle-pattern to specify a text sigle pattern extracts the right texts");
script_stdout_unlike("WDF19/A0000/14247/tree_tagger/morpho.xml", "--sigle-pattern to specify a text sigle pattern does not extract the wrong texts");
+script_runs([ 'script/korapxml2conllu', "t/data/nkjp-fail.zip" ], "Runs korapxml2conllu on nkjp-fail test data");
+script_stderr_like("could not retrieve token at 1297-1298/ 1297 - ending with: e! upadku.", "Offset error");
+
done_testing;