conllu2korapxml: write UPOS column as upos feature, take pos from XPOS column
e.g.:
<f name="pos">VVFIN</f>
<f name="upos">VERB</f>
resolves #5
Change-Id: I7913c28d02b73036b28663396f96900bfe9f7b9a
diff --git a/script/conllu2korapxml b/script/conllu2korapxml
index 29df253..b63ca14 100755
--- a/script/conllu2korapxml
+++ b/script/conllu2korapxml
@@ -177,14 +177,20 @@
</span>
@;
}
- my $pos = $parsed[3];
+ my $pos = $parsed[4];
+ my $upos = $parsed[3];
$pos =~ s/\|.*//;
$morpho .= qq( <span id="s${s}_n$t" from="$spansFrom[$t]" to="$spansTo[$t]">
<fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
<f name="lex">
<fs>
- <f name="pos">$pos</f>
);
+ if($pos ne "_") {
+ $morpho .= qq( <f name="pos">$pos</f>\n);
+ }
+ if($upos ne "_") {
+ $morpho .= qq( <f name="upos">$upos</f>\n);
+ }
$morpho .= qq( <f name="lemma">$parsed[2]</f>\n) if($parsed[2] ne "_" || $parsed[1] eq '_');
$morpho .= qq( <f name="msd">$parsed[5]</f>\n) if($parsed[5] ne "_");
if($parsed[9] ne "_") {
diff --git a/t/test.t b/t/test.t
index 9c50e52..2820038 100644
--- a/t/test.t
+++ b/t/test.t
@@ -1,6 +1,6 @@
use strict;
use warnings;
-use Test::More tests => 68;
+use Test::More tests => 72;
use Test::Script;
use Test::TempDir::Tiny;
use File::Copy;
@@ -196,6 +196,12 @@
like($zipcontent, qr@GOE/AGA/00000/ud/dependency\.xml@, "conllu2korapxml UDPipe input conversion contains dependency layer with foundry name 'ud'");
like($zipcontent, qr@rw-rw-rw-.*GOE/AGA/00000/ud/morpho\.xml@, "KorAP-XML zip contents have read and write permissions");
+$zipcontent = `$UNZIP -c $zipfile`;
+like($zipcontent, qr/.*<f name="upos">VERB<\/f>.*/, "conllu2korapxml extracts upos tags.");
+like($zipcontent, qr/.*<f name="pos">VVFIN<\/f>.*/, "conllu2korapxml extracts (x)pos tags.");
+unlike($zipcontent, qr/.*<f name="pos">_<\/f>.*/, "conllu2korapxml ignores _ pos tags.");
+unlike($zipcontent, qr/.*<f name="upos">_<\/f>.*/, "conllu2korapxml ignores _ upos tags.");
+
script_runs([ 'script/conllu2korapxml', 't/data/deu-deps.conllu' ], "Runs conllu2korap with UDPipe input");
script_stderr_unlike "fileparse(): need a valid pathname", "Ignore sent_id and newdoc id";
script_stderr_like "WARNING: No valid input document.*token offsets missing", "Warn on missing token offsets";