Split morphological features in NKJP

Change-Id: I239ced09485f4507d4e04475cee510cdca1a6c32
diff --git a/Changes b/Changes
index c6c3553..9693f7d 100644
--- a/Changes
+++ b/Changes
@@ -1,6 +1,7 @@
-0.48 2022-11-10
+0.48 2022-11-15
         - Improve support for text siglen including
           underscore in corpus parts.
+        - Split morphological features in NKJP.
 
 0.47 2022-08-08
         - Support for preferred language transformation.
diff --git a/lib/KorAP/XML/Annotation/NKJP/Morpho.pm b/lib/KorAP/XML/Annotation/NKJP/Morpho.pm
index b6a7304..a618e3f 100644
--- a/lib/KorAP/XML/Annotation/NKJP/Morpho.pm
+++ b/lib/KorAP/XML/Annotation/NKJP/Morpho.pm
@@ -2,6 +2,45 @@
 use KorAP::XML::Annotation::Base;
 use Data::Dumper;
 
+our %morpho = (  # msd feature value => category name; parse() emits 'nkjp/m:<category>:<value>'
+  sg => 'number',
+  pl => 'number',
+  nom => 'case',
+  gen => 'case',
+  dat => 'case',
+  acc => 'case',
+  inst => 'case',
+  loc => 'case',
+  voc => 'case',
+  m1 => 'gender',  # m1/m2/m3: presumably the masculine subgenders of the NKJP tagset (TODO confirm)
+  m2 => 'gender',
+  m3 => 'gender',
+  f => 'gender',
+  n => 'gender',
+  pri => 'person',  # pri/sec/ter: presumably first/second/third person (TODO confirm)
+  sec => 'person',
+  ter => 'person',
+  pos => 'degree',  # pos/com/sup: presumably positive/comparative/superlative (TODO confirm)
+  com => 'degree',
+  sup => 'degree',
+  imperf => 'aspect',
+  perf => 'aspect',
+  aff => 'negation',  # aff/neg: presumably affirmative/negated (TODO confirm)
+  neg => 'negation',
+  akc => 'accent',  # presumably NKJP "accentability": accented vs. non-accented (TODO confirm)
+  nakc => 'accent',
+  praep => 'postprep',  # presumably NKJP "post-prepositionality" (TODO confirm)
+  npraep => 'postprep',
+  congr => 'accomm',  # presumably NKJP "accommodability": agreeing vs. governing (TODO confirm)
+  rec => 'accomm',
+  nagl => 'agglut',  # presumably NKJP "agglutination" (TODO confirm)
+  agl => 'agglut',
+  wok => 'vocal',  # presumably NKJP "vocalicity" (TODO confirm)
+  nwok => 'vocal',
+  pun => 'fullstopp',  # NOTE(review): looks like short for "fullstoppedness", not a typo of "fullstop" (confirm)
+  npun => 'fullstopp',
+);  # values missing from this table are only logged as unknown by parse(); no term is added for them
+
 sub parse {
   my $self = shift;
 
@@ -59,7 +98,15 @@
             # msd tag
             elsif (($name eq 'msd')
                      && ($found = $f->{'#text'})) {
-              $mtt->add_by_term('nkjp/m:' . $found);
+              foreach (split(':',$found)) {
+                if (exists $morpho{$_}) {
+                  $mtt->add_by_term('nkjp/m:' . $morpho{$_} . ':' . $_);
+                }
+
+                else {
+                  $$self->log->warn('Unknown morphological feature: ' . $_);
+                };
+              };
             };
           };
         };
diff --git a/t/real/nkjp.t b/t/real/nkjp.t
index aa9f5df..66ed637 100644
--- a/t/real/nkjp.t
+++ b/t/real/nkjp.t
@@ -100,7 +100,9 @@
 like($token, qr!<>:dereko\/s:seg\$<b>64!);
 like($token, qr!i:ładu!);
 like($token, qr!nkjp\/l:ład!);
-like($token, qr!nkjp\/m:sg:gen:m3!);
+like($token, qr!nkjp\/m:number:sg!);  # one term per feature; replaces the combined nkjp/m:sg:gen:m3
+like($token, qr!nkjp\/m:case:gen!);
+like($token, qr!nkjp\/m:gender:m3!);
 like($token, qr!nkjp\/p:subst!);
 like($token, qr!s:ładu!);
 
@@ -170,7 +172,10 @@
 like($token, qr!_5\$<i>23<i>28!);
 like($token, qr!i:takie!);
 like($token, qr!nkjp/l:taki!);
-like($token, qr!nkjp/m:sg:nom:n:pos!);
+like($token, qr!nkjp/m:number:sg!);  # one term per feature; replaces the combined nkjp/m:sg:nom:n:pos
+like($token, qr!nkjp/m:case:nom!);
+like($token, qr!nkjp/m:gender:n!);
+like($token, qr!nkjp/m:degree:pos!);
 like($token, qr!nkjp/p:adj!);
 like($token, qr!s:takie!);
 like($token, qr!nkjp/ov:takie!);
@@ -185,7 +190,9 @@
 like($token, qr!_67\$<i>464<i>475!);
 like($token, qr!i:kierkegaard!);
 like($token, qr!nkjp/l:Kierkegaard!);
-like($token, qr!nkjp/m:sg:nom:m1!);
+like($token, qr!nkjp/m:number:sg!);  # one term per feature; replaces the combined nkjp/m:sg:nom:m1
+like($token, qr!nkjp/m:case:nom!);
+like($token, qr!nkjp/m:gender:m1!);
 like($token, qr!nkjp/ne:persName:surname!);
 like($token, qr!nkjp/p:subst!);
 like($token, qr!s:Kierkegaard!);