Handle lonely docid attributes in NKJP KorAP-XML
Change-Id: Id439a98ac46e2bbd33e65b1719cca39cd11b6377
diff --git a/Changes b/Changes
index f87e8c3..7116bf3 100644
--- a/Changes
+++ b/Changes
@@ -1,3 +1,6 @@
+ - korapxml2conllu:
+ - handle docid attributes correctly if they are on a different line than their parent element <layer>
+
0.5.0 2022-09-29
- korapxml2conllu:
- --word2vec|lm-training-data option added to print word2vec input format
diff --git a/script/korapxml2conllu b/script/korapxml2conllu
index 5e91c06..9cb25f8 100755
--- a/script/korapxml2conllu
+++ b/script/korapxml2conllu
@@ -159,7 +159,7 @@
}
$filename=$1 if(!eof(MORPHO_OR_TOKENPIPE) && /\s+inflating:\s+(.*)/);
}
- } elsif(m@^\s*<layer\s+.*docid="([^"]+)"@) {
+ } elsif(m@(?:^|\s)docid="([^"]+)"@) {
last if($test && $text_no++ > 3);
if(!$first) {
closeDoc(0);
@@ -340,8 +340,11 @@
$docid=$1;
$log->debug("Getting plain text for $docid");
$text_started=0;
- } elsif(/<layer[^>]+docid="([^"]*)/) {
+ } elsif(/<raw_text\b/) {
+ $text_started=0;
+ } elsif(/(?:^|\s)docid="([^"]*)/) {
$docid=$1;
+ $log->debug("Getting plain text for $docid");
} elsif(m@<span @) {
($current_id) = /id="[^0-9]*([^\"]*)"/;
($current_from) = /from="([^\"]*)"/;
diff --git a/t/data/nkjp.zip b/t/data/nkjp.zip
new file mode 100644
index 0000000..2ac910c
--- /dev/null
+++ b/t/data/nkjp.zip
Binary files differ
diff --git a/t/test.t b/t/test.t
index 7954e6d..ac3550c 100644
--- a/t/test.t
+++ b/t/test.t
@@ -1,6 +1,6 @@
use strict;
use warnings;
-use Test::More tests => 50;
+use Test::More tests => 53;
use Test::Script;
use Test::TempDir::Tiny;
use File::Copy;
@@ -200,4 +200,7 @@
script_stderr_like "WARNING: No valid input document.*token offsets missing", "Warn on missing token offsets";
script_stderr_like qr@WARNING: No valid input document.*text.id .*missing@, "Warn on missing text ids";
+script_runs([ 'script/korapxml2conllu', "t/data/nkjp.zip" ], "Runs korapxml2conllu on nkjp test data");
+script_stderr_unlike("Use of uninitialized value", "Handles lonely docid parameters (line separated from layer elements)");
+script_stdout_like("\n9\twesołości\twesołość\tsubst\tsubst\tsg:gen:f", "Correctly converts nkjp annotations");
done_testing;