Handle docid attributes on their own line in NKJP KorAP-XML

Change-Id: Id439a98ac46e2bbd33e65b1719cca39cd11b6377
diff --git a/Changes b/Changes
index f87e8c3..7116bf3 100644
--- a/Changes
+++ b/Changes
@@ -1,3 +1,6 @@
+        - korapxml2conllu:
+            - handle docid attributes correctly when they are on a different line than their parent <layer> element
+
 0.5.0 2022-09-29
         - korapxml2conllu:
             - --word2vec|lm-training-data option added to print word2vec input format
diff --git a/script/korapxml2conllu b/script/korapxml2conllu
index 5e91c06..9cb25f8 100755
--- a/script/korapxml2conllu
+++ b/script/korapxml2conllu
@@ -159,7 +159,7 @@
         }
         $filename=$1 if(!eof(MORPHO_OR_TOKENPIPE) && /\s+inflating:\s+(.*)/);
       }
-    } elsif(m@^\s*<layer\s+.*docid="([^"]+)"@) {
+    } elsif(m@(?:^|\s)docid="([^"]+)"@) {
       last if($test && $text_no++ > 3);
       if(!$first) {
         closeDoc(0);
@@ -340,8 +340,11 @@
       $docid=$1;
       $log->debug("Getting plain text for $docid");
       $text_started=0;
-    } elsif(/<layer[^>]+docid="([^"]*)/) {
+    } elsif(/<raw_text\b/) {
+      $text_started=0;
+    } elsif(/(?:^|\s)docid="([^"]*)/) {
       $docid=$1;
+      $log->debug("Getting plain text for $docid");
     } elsif(m@<span @) {
         ($current_id) = /id="[^0-9]*([^\"]*)"/;
         ($current_from) = /from="([^\"]*)"/;
diff --git a/t/data/nkjp.zip b/t/data/nkjp.zip
new file mode 100644
index 0000000..2ac910c
--- /dev/null
+++ b/t/data/nkjp.zip
Binary files differ
diff --git a/t/test.t b/t/test.t
index 7954e6d..ac3550c 100644
--- a/t/test.t
+++ b/t/test.t
@@ -1,6 +1,6 @@
 use strict;
 use warnings;
-use Test::More tests => 50;
+use Test::More tests => 53;
 use Test::Script;
 use Test::TempDir::Tiny;
 use File::Copy;
@@ -200,4 +200,7 @@
 script_stderr_like "WARNING: No valid input document.*token offsets missing", "Warn on missing token offsets";
 script_stderr_like qr@WARNING: No valid input document.*text.id .*missing@,   "Warn on missing text ids";
 
+script_runs([ 'script/korapxml2conllu', "t/data/nkjp.zip" ], "Runs korapxml2conllu on nkjp test data");
+script_stderr_unlike("Use of uninitialized value", "Handles lonely docid parameters (line separated from layer elements)");
+script_stdout_like("\n9\twesołości\twesołość\tsubst\tsubst\tsg:gen:f", "Correctly converts nkjp annotations");
 done_testing;