c2k: include lemma only if !='_' (unless token is also '_')
Change-Id: Id1fd52a61177d47286bad858a561ccdc7c2df64e
diff --git a/Changes b/Changes
index b250585..be573b3 100644
--- a/Changes
+++ b/Changes
@@ -1,5 +1,6 @@
- korapxml2conllu: use morpho.xml if present when run on base zips
- korapxml2conllu: new option -c <columns>
+ - conllu2korapxml: ignore _-lemmas
0.4.1 2021-07-31
- korapxml2conllu: fix patterns not extracted for last texts in archive
diff --git a/script/conllu2korapxml b/script/conllu2korapxml
index b4aa17a..1fdad0d 100755
--- a/script/conllu2korapxml
+++ b/script/conllu2korapxml
@@ -142,6 +142,7 @@
<fs>
<f name="pos">$parsed[3]</f>
);
+ $morpho .= qq( <f name="lemma">$parsed[2]</f>\n) if($parsed[2] ne "_" || $parsed[1] eq '_');
$morpho .= qq( <f name="msd">$parsed[5]</f>\n) if($parsed[5] ne "_");
if($parsed[9] ne "_") {
if ($parsed[9] =~ /[0-9.e]+/) {
diff --git a/t/test.t b/t/test.t
index 5d0152f..648bf20 100644
--- a/t/test.t
+++ b/t/test.t
@@ -1,6 +1,6 @@
use strict;
use warnings;
-use Test::More tests => 30;
+use Test::More tests => 33;
use Test::Script;
use Test::TempDir::Tiny;
use File::Copy;
@@ -105,4 +105,20 @@
script_stdout_like "\n# posting/id = i.12610_4_5", "Extracts directly adjacent postings from morpho zips (2)";
script_stdout_like "\n# posting/id = i.14548_9_1", "Extracts last postings in morpho zip";
+# "_" lemmas must not yield <f name="lemma"> (no space before '='); pos must still be written.
+$zipfile = "$test_tempdir/without_lemma.zip";
+script_runs([ 'script/conllu2korapxml', "t/data/without_lemma.tsv" ], {stdout => \$zipcontent},
+    "Converts t/data/without_lemma.tsv to KorAP-XML zip");
+open($fh, ">", $zipfile) or fail("cannot open file $zipfile for writing");
+print $fh $zipcontent;
+close($fh);
+chomp(my $UNZIP = `sh -c 'command -v unzip'`);
+SKIP: {
+    # Skip instead of a top-level 'return' (a runtime error) so the planned test count still holds.
+    skip('No unzip executable found in PATH.', 2) if $UNZIP eq '';
+    $zipcontent = `$UNZIP -c $zipfile`;
+    # NOTE(review): the '|' in the pos pattern below is an unescaped alternation - confirm intent.
+    unlike($zipcontent, qr/name="lemma"/, "conllu2korapxml igores _ lemmas.");
+    like($zipcontent, qr/.*<f name="pos">NN|NN<\/f>.*/, "conllu2korapxml does not ignore pos for _ lemmas.");
+}
done_testing;