Support named entities for NKJP
Change-Id: I71bd002625849c34628d99c518571484e6083ea0
diff --git a/lib/KorAP/XML/Annotation/NKJP/NamedEntities.pm b/lib/KorAP/XML/Annotation/NKJP/NamedEntities.pm
new file mode 100644
index 0000000..e4b6e3c
--- /dev/null
+++ b/lib/KorAP/XML/Annotation/NKJP/NamedEntities.pm
@@ -0,0 +1,67 @@
+package KorAP::XML::Annotation::NKJP::NamedEntities;
+use KorAP::XML::Annotation::Base;
+
+# Import NKJP named entities from the token data of foundry 'nkjp',
+# layer 'named', and add them to the token stream as
+# 'nkjp/ne:<type>' or 'nkjp/ne:<type>:<subtype>' terms.
+# Returns 1, or nothing if add_tokendata() returns a false value.
+sub parse {
+  my $self = shift;
+  my $model = shift; # Read but currently unused
+
+  $$self->add_tokendata(
+    foundry => 'nkjp',
+    layer => 'named',
+    cb => sub {
+      my ($stream, $token) = @_;
+      my $mtt = $stream->pos($token->get_pos);
+
+      # The features may be a single hash or an array of hashes;
+      # anything else (e.g. plain text) cannot carry an entity
+      my $content = $token->get_hash->{fs}->{f} or return;
+      $content = [$content] if ref $content eq 'HASH';
+      return unless ref $content eq 'ARRAY';
+
+      foreach my $c (@$content) {
+        # Only the 'ne' feature is relevant; unnamed or
+        # non-hash entries are skipped without warnings
+        next unless ref $c eq 'HASH';
+        next unless defined $c->{-name} && $c->{-name} eq 'ne';
+
+        my $found = $c->{fs};
+        next unless ref $found eq 'HASH';
+
+        my $ents = $found->{f};
+        $ents = [$ents] if ref $ents eq 'HASH';
+        next unless ref $ents eq 'ARRAY';
+
+        # Collect the symbol values of type and subtype
+        my ($type, $subtype);
+        foreach my $ent (@$ents) {
+          next unless ref $ent eq 'HASH' && $ent->{'-name'};
+
+          if ($ent->{'-name'} eq 'type') {
+            $type = $ent->{symbol}->{'-value'};
+          }
+          elsif ($ent->{'-name'} eq 'subtype') {
+            $subtype = $ent->{symbol}->{'-value'};
+          };
+        };
+
+        # A subtype is only emitted together with a type
+        next unless $type;
+        my $term = 'nkjp/ne:' . $type;
+        $term .= ':' . $subtype if $subtype;
+        $mtt->add_by_term($term);
+      };
+    }) or return;
+
+  return 1;
+};
+
+# Layer information reported for this annotation.
+sub layer_info {
+  ['nkjp/ne=tokens'];
+};
+
+1;
diff --git a/t/real/corpus/NKJP/NKJP/KolakowskiOco/nkjp/morpho.xml b/t/real/corpus/NKJP/NKJP/KolakowskiOco/nkjp/morpho.xml
index bdef55e..17019a5 100644
--- a/t/real/corpus/NKJP/NKJP/KolakowskiOco/nkjp/morpho.xml
+++ b/t/real/corpus/NKJP/NKJP/KolakowskiOco/nkjp/morpho.xml
@@ -2989,6 +2989,13 @@
</fs>
<fs type="alt">
<f name="lemma">to</f>
+ <f name="pos">conj</f>
+ <f name="msd">
+ <symbol value="0"/>
+ </f>
+ </fs>
+ <fs type="alt">
+ <f name="lemma">to</f>
<f name="pos">part</f>
<f name="msd">
<symbol value="0"/>
@@ -3672,6 +3679,13 @@
<symbol value="0"/>
</f>
</fs>
+ <fs type="alt">
+ <f name="lemma">to</f>
+ <f name="pos">conj</f>
+ <f name="msd">
+ <symbol value="0"/>
+ </f>
+ </fs>
<fs type="alt" n="choice">
<f name="lemma">to</f>
<f name="pos">part</f>
diff --git a/t/real/corpus/NKJP/NKJP/KolakowskiOco/nkjp/named.xml b/t/real/corpus/NKJP/NKJP/KolakowskiOco/nkjp/named.xml
index df1ee28..b40d1d6 100644
--- a/t/real/corpus/NKJP/NKJP/KolakowskiOco/nkjp/named.xml
+++ b/t/real/corpus/NKJP/NKJP/KolakowskiOco/nkjp/named.xml
@@ -8,34 +8,30 @@
<fs xmlns="http://www.tei-c.org/ns/1.0" type="ne">
<f name="ne"><!-- _Kierkegaard-->
<fs>
- <f name="complex-ent">
- <fs type="complex-ent">
- <f name="type">
- <symbol value="persName"/>
- </f>
- <f name="subtype">
- <symbol value="surname"/>
- </f>
- <f name="nkjp-named">
- <fs type="named">
- <f name="type">
- <symbol value="persName"/>
- </f>
- <f name="subtype">
- <symbol value="surname"/>
- </f>
- <f name="orth">
- <string>Kierkegaard</string>
- </f>
- <f name="base">
- <string>Kierkegaard</string>
- </f>
- <f name="certainty">
- <symbol value="high"/>
- </f>
- </fs>
- </f>
- </fs>
+ <f name="type">
+ <symbol value="persName"/>
+ </f>
+ <f name="subtype">
+ <symbol value="surname"/>
+ </f>
+ </fs>
+ </f>
+ <f name="nkjp">
+ <fs type="named">
+ <f name="type">
+ <symbol value="persName"/>
+ </f>
+ <f name="subtype">
+ <symbol value="surname"/>
+ </f>
+ <f name="orth">
+ <string>Kierkegaard</string>
+ </f>
+ <f name="base">
+ <string>Kierkegaard</string>
+ </f>
+ <f name="certainty">
+ <symbol value="high"/>
</f>
</fs>
</f>
diff --git a/t/real/nkjp.t b/t/real/nkjp.t
index cdd46e8..8652037 100644
--- a/t/real/nkjp.t
+++ b/t/real/nkjp.t
@@ -15,6 +15,7 @@
use File::Spec::Functions 'catdir';
use_ok('KorAP::XML::Krill');
+use_ok('KorAP::XML::Annotation::NKJP::NamedEntities');
my $path = catdir(dirname(__FILE__), 'corpus','NKJP','NKJP','KOT');
@@ -52,7 +53,7 @@
doc => $doc,
foundry => $token_base_foundry,
layer => $token_base_layer,
- name => 'tokens'
+ name => 'tokens',
);
ok($tokens, 'Token Object is fine');
ok($tokens->parse, 'Token parsing is fine');
@@ -65,7 +66,7 @@
## Base
ok($tokens->add('DeReKo', 'Structure', 'base_sentences_paragraphs'));
-ok($tokens->add('NKJP', 'Morpho'), 'Add Gingko');
+ok($tokens->add('NKJP', 'Morpho'), 'Add Morpho');
$output = $tokens->to_data;
@@ -128,13 +129,14 @@
## Base
ok($tokens->add('DeReKo', 'Structure', 'base_sentences_paragraphs'));
-ok($tokens->add('NKJP', 'Morpho'), 'Add Gingko');
+ok($tokens->add('NKJP', 'Morpho'), 'Add Morpho');
+ok($tokens->add('NKJP', 'NamedEntities'), 'Add NamedEntities');
$output = $tokens->to_data;
-is($output->{data}->{foundries}, 'dereko dereko/structure dereko/structure/base_sentences_paragraphs nkjp nkjp/morpho', 'Foundries');
+is($output->{data}->{foundries}, 'dereko dereko/structure dereko/structure/base_sentences_paragraphs nkjp nkjp/morpho nkjp/namedentities', 'Foundries');
-is($output->{data}->{layerInfos}, 'dereko/s=spans nkjp/l=tokens nkjp/m=tokens nkjp/p=tokens', 'layerInfos');
+is($output->{data}->{layerInfos}, 'dereko/s=spans nkjp/l=tokens nkjp/m=tokens nkjp/ne=tokens nkjp/p=tokens', 'layerInfos');
$token = join('||', @{$output->{data}->{stream}->[5]});
@@ -146,6 +148,19 @@
like($token, qr!nkjp/p:adj!);
like($token, qr!s:takie!);
+$token = join('||', @{$output->{data}->{stream}->[67]});
+
+like($token, qr!<>:dereko/s:seg\$<b>64<i>464<i>475<i>68<b>4<s>1!);
+like($token, qr!\@:dereko\/s:corresp:ann_segmentation\.xml\\#segm_2\.2-seg\$<b>17<s>1<i>68!);
+like($token, qr!\@:dereko\/s:id:morph_2\.2-seg\$<b>17<s>1<i>68!);
+like($token, qr!_67\$<i>464<i>475!);
+like($token, qr!i:kierkegaard!);
+like($token, qr!nkjp/l:Kierkegaard!);
+like($token, qr!nkjp/m:sg:nom:m1!);
+like($token, qr!nkjp/ne:persName:surname!);
+like($token, qr!nkjp/p:subst!);
+like($token, qr!s:Kierkegaard!);
+
done_testing;
__END__