Split morphological features in NKJP
Change-Id: I239ced09485f4507d4e04475cee510cdca1a6c32
diff --git a/Changes b/Changes
index c6c3553..9693f7d 100644
--- a/Changes
+++ b/Changes
@@ -1,6 +1,7 @@
-0.48 2022-11-10
+0.48 2022-11-15
- Improve support for text siglen including
underscore in corpus parts.
+ - Split morphological features in NKJP.
0.47 2022-08-08
- Support for preferred language transformation.
diff --git a/lib/KorAP/XML/Annotation/NKJP/Morpho.pm b/lib/KorAP/XML/Annotation/NKJP/Morpho.pm
index b6a7304..a618e3f 100644
--- a/lib/KorAP/XML/Annotation/NKJP/Morpho.pm
+++ b/lib/KorAP/XML/Annotation/NKJP/Morpho.pm
@@ -2,6 +2,45 @@
use KorAP::XML::Annotation::Base;
use Data::Dumper;
+our %morpho = ( # Lookup: msd feature value => category name; parse emits 'nkjp/m:<category>:<value>' per known value
+ sg => 'number',
+ pl => 'number',
+ nom => 'case',
+ gen => 'case',
+ dat => 'case',
+ acc => 'case',
+ inst => 'case',
+ loc => 'case',
+ voc => 'case',
+ m1 => 'gender',
+ m2 => 'gender',
+ m3 => 'gender',
+ f => 'gender',
+ n => 'gender',
+ pri => 'person',
+ sec => 'person',
+ ter => 'person',
+ pos => 'degree',
+ com => 'degree',
+ sup => 'degree',
+ imperf => 'aspect',
+ perf => 'aspect',
+ aff => 'negation',
+ neg => 'negation',
+ akc => 'accent',
+ nakc => 'accent',
+ praep => 'postprep', # presumably NKJP "post-prepositionality" - confirm against the tagset
+ npraep => 'postprep',
+ congr => 'accomm', # presumably NKJP "accommodability" - confirm against the tagset
+ rec => 'accomm',
+ nagl => 'agglut', # presumably NKJP "agglutination" - confirm against the tagset
+ agl => 'agglut',
+ wok => 'vocal', # presumably NKJP "vocalicity" - confirm against the tagset
+ nwok => 'vocal',
+ pun => 'fullstopp', # presumably NKJP "fullstoppedness" - confirm against the tagset
+ npun => 'fullstopp',
+); # Values missing from this table are only logged as unknown by parse, not indexed
+
sub parse {
my $self = shift;
@@ -59,7 +98,15 @@
# msd tag
elsif (($name eq 'msd')
&& ($found = $f->{'#text'})) {
- $mtt->add_by_term('nkjp/m:' . $found);
+ foreach (split(':',$found)) {
+ if (exists $morpho{$_}) {
+ $mtt->add_by_term('nkjp/m:' . $morpho{$_} . ':' . $_);
+ }
+
+ else {
+ $$self->log->warn('Unknown morphological feature: ' . $_);
+ };
+ };
};
};
};
diff --git a/t/real/nkjp.t b/t/real/nkjp.t
index aa9f5df..66ed637 100644
--- a/t/real/nkjp.t
+++ b/t/real/nkjp.t
@@ -100,7 +100,9 @@
like($token, qr!<>:dereko\/s:seg\$<b>64!);
like($token, qr!i:ładu!);
like($token, qr!nkjp\/l:ład!);
-like($token, qr!nkjp\/m:sg:gen:m3!);
+like($token, qr!nkjp\/m:number:sg!);
+like($token, qr!nkjp\/m:case:gen!);
+like($token, qr!nkjp\/m:gender:m3!);
like($token, qr!nkjp\/p:subst!);
like($token, qr!s:ładu!);
@@ -170,7 +172,10 @@
like($token, qr!_5\$<i>23<i>28!);
like($token, qr!i:takie!);
like($token, qr!nkjp/l:taki!);
-like($token, qr!nkjp/m:sg:nom:n:pos!);
+like($token, qr!nkjp/m:number:sg!);
+like($token, qr!nkjp/m:case:nom!);
+like($token, qr!nkjp/m:gender:n!);
+like($token, qr!nkjp/m:degree:pos!);
like($token, qr!nkjp/p:adj!);
like($token, qr!s:takie!);
like($token, qr!nkjp/ov:takie!);
@@ -185,7 +190,9 @@
like($token, qr!_67\$<i>464<i>475!);
like($token, qr!i:kierkegaard!);
like($token, qr!nkjp/l:Kierkegaard!);
-like($token, qr!nkjp/m:sg:nom:m1!);
+like($token, qr!nkjp/m:number:sg!);
+like($token, qr!nkjp/m:case:nom!);
+like($token, qr!nkjp/m:gender:m1!);
like($token, qr!nkjp/ne:persName:surname!);
like($token, qr!nkjp/p:subst!);
like($token, qr!s:Kierkegaard!);