Handle docid attributes split from the <layer> tag line in NKJP KorAP-XML
Change-Id: Id439a98ac46e2bbd33e65b1719cca39cd11b6377
diff --git a/script/korapxml2conllu b/script/korapxml2conllu
index 5e91c06..9cb25f8 100755
--- a/script/korapxml2conllu
+++ b/script/korapxml2conllu
@@ -159,7 +159,7 @@
}
$filename=$1 if(!eof(MORPHO_OR_TOKENPIPE) && /\s+inflating:\s+(.*)/);
}
- } elsif(m@^\s*<layer\s+.*docid="([^"]+)"@) {
+ } elsif(m@(?:^|\s)docid="([^"]+)"@) {
last if($test && $text_no++ > 3);
if(!$first) {
closeDoc(0);
@@ -340,8 +340,11 @@
$docid=$1;
$log->debug("Getting plain text for $docid");
$text_started=0;
- } elsif(/<layer[^>]+docid="([^"]*)/) {
+ } elsif(/<raw_text\b/) {
+ $text_started=0;
+ } elsif(/(?:^|\s)docid="([^"]*)/) {
$docid=$1;
+ $log->debug("Getting plain text for $docid");
} elsif(m@<span @) {
($current_id) = /id="[^0-9]*([^\"]*)"/;
($current_from) = /from="([^\"]*)"/;