Support named entities for NKJP

Change-Id: I71bd002625849c34628d99c518571484e6083ea0
diff --git a/lib/KorAP/XML/Annotation/NKJP/NamedEntities.pm b/lib/KorAP/XML/Annotation/NKJP/NamedEntities.pm
new file mode 100644
index 0000000..e4b6e3c
--- /dev/null
+++ b/lib/KorAP/XML/Annotation/NKJP/NamedEntities.pm
@@ -0,0 +1,81 @@
+package KorAP::XML::Annotation::NKJP::NamedEntities;
+use KorAP::XML::Annotation::Base;
+
+# Import NKJP named entities from the 'named' layer of the
+# 'nkjp' foundry.
+#
+# For every token, each feature named 'ne' is inspected and its
+# 'type' and (optional) 'subtype' symbol values are added to the
+# token as the term 'nkjp/ne:TYPE:SUBTYPE' or 'nkjp/ne:TYPE'.
+# Features with other names are ignored.
+#
+# Returns 1 on success and nothing if the token data
+# could not be added.
+sub parse {
+  my $self = shift;
+
+  $$self->add_tokendata(
+    foundry => 'nkjp',
+    layer => 'named',
+    cb => sub {
+      my ($stream, $token) = @_;
+      my $mtt = $stream->pos($token->get_pos);
+
+      my $content = $token->get_hash->{fs}->{f} or return;
+
+      # Normalize a single feature (hash) to a list of features
+      if (ref $content eq 'HASH') {
+        $content = [$content];
+      };
+
+      # Anything else than a list can't contain entities
+      return unless ref $content eq 'ARRAY';
+
+      foreach my $c (@$content) {
+
+        # Only features explicitly named 'ne' are relevant -
+        # unnamed or malformed features are skipped silently
+        next unless ref $c eq 'HASH';
+        next unless defined $c->{'-name'} && $c->{'-name'} eq 'ne';
+
+        my $found = $c->{fs};
+        next unless ref $found eq 'HASH';
+
+        # Normalize a single entity feature to a list as well
+        my $ents = $found->{f};
+        if (ref $ents eq 'HASH') {
+          $ents = [$ents];
+        };
+        next unless ref $ents eq 'ARRAY';
+
+        my ($type, $subtype);
+        foreach my $ent (@$ents) {
+          next unless ref $ent eq 'HASH' && $ent->{'-name'};
+          next unless ref $ent->{symbol} eq 'HASH';
+
+          if ($ent->{'-name'} eq 'type') {
+            $type = $ent->{symbol}->{'-value'};
+          }
+          elsif ($ent->{'-name'} eq 'subtype') {
+            $subtype = $ent->{symbol}->{'-value'};
+          };
+        };
+
+        # The subtype is optional, but meaningless without a type
+        if ($type && $subtype) {
+          $mtt->add_by_term('nkjp/ne:' . $type . ':' . $subtype);
+        } elsif ($type) {
+          $mtt->add_by_term('nkjp/ne:' . $type);
+        };
+      };
+    }) or return;
+
+  return 1;
+};
+
+# Layer information for the annotations added by parse()
+sub layer_info {
+  return ['nkjp/ne=tokens'];
+};
+
+1;
diff --git a/t/real/corpus/NKJP/NKJP/KolakowskiOco/nkjp/morpho.xml b/t/real/corpus/NKJP/NKJP/KolakowskiOco/nkjp/morpho.xml
index bdef55e..17019a5 100644
--- a/t/real/corpus/NKJP/NKJP/KolakowskiOco/nkjp/morpho.xml
+++ b/t/real/corpus/NKJP/NKJP/KolakowskiOco/nkjp/morpho.xml
@@ -2989,6 +2989,13 @@
                </fs>
                <fs type="alt">
                   <f name="lemma">to</f>
+                  <f name="pos">conj</f>
+                  <f name="msd">
+                     <symbol value="0"/>
+                  </f>
+               </fs>
+               <fs type="alt">
+                  <f name="lemma">to</f>
                   <f name="pos">part</f>
                   <f name="msd">
                      <symbol value="0"/>
@@ -3672,6 +3679,13 @@
                      <symbol value="0"/>
                   </f>
                </fs>
+               <fs type="alt">
+                  <f name="lemma">to</f>
+                  <f name="pos">conj</f>
+                  <f name="msd">
+                     <symbol value="0"/>
+                  </f>
+               </fs>
                <fs type="alt" n="choice">
                   <f name="lemma">to</f>
                   <f name="pos">part</f>
diff --git a/t/real/corpus/NKJP/NKJP/KolakowskiOco/nkjp/named.xml b/t/real/corpus/NKJP/NKJP/KolakowskiOco/nkjp/named.xml
index df1ee28..b40d1d6 100644
--- a/t/real/corpus/NKJP/NKJP/KolakowskiOco/nkjp/named.xml
+++ b/t/real/corpus/NKJP/NKJP/KolakowskiOco/nkjp/named.xml
@@ -8,34 +8,30 @@
          <fs xmlns="http://www.tei-c.org/ns/1.0" type="ne">
             <f name="ne"><!-- _Kierkegaard-->
                <fs>
-                  <f name="complex-ent">
-                     <fs type="complex-ent">
-                        <f name="type">
-                           <symbol value="persName"/>
-                        </f>
-                        <f name="subtype">
-                           <symbol value="surname"/>
-                        </f>
-                        <f name="nkjp-named">
-                           <fs type="named">
-                              <f name="type">
-                                 <symbol value="persName"/>
-                              </f>
-                              <f name="subtype">
-                                 <symbol value="surname"/>
-                              </f>
-                              <f name="orth">
-                                 <string>Kierkegaard</string>
-                              </f>
-                              <f name="base">
-                                 <string>Kierkegaard</string>
-                              </f>
-                              <f name="certainty">
-                                 <symbol value="high"/>
-                              </f>
-                           </fs>
-                        </f>
-                     </fs>
+                  <f name="type">
+                     <symbol value="persName"/>
+                  </f>
+                  <f name="subtype">
+                     <symbol value="surname"/>
+                  </f>
+               </fs>
+            </f>
+            <f name="nkjp">
+               <fs type="named">
+                  <f name="type">
+                     <symbol value="persName"/>
+                  </f>
+                  <f name="subtype">
+                     <symbol value="surname"/>
+                  </f>
+                  <f name="orth">
+                     <string>Kierkegaard</string>
+                  </f>
+                  <f name="base">
+                     <string>Kierkegaard</string>
+                  </f>
+                  <f name="certainty">
+                     <symbol value="high"/>
                   </f>
                </fs>
             </f>
diff --git a/t/real/nkjp.t b/t/real/nkjp.t
index cdd46e8..8652037 100644
--- a/t/real/nkjp.t
+++ b/t/real/nkjp.t
@@ -15,6 +15,7 @@
 use File::Spec::Functions 'catdir';
 
 use_ok('KorAP::XML::Krill');
+use_ok('KorAP::XML::Annotation::NKJP::NamedEntities');
 
 my $path = catdir(dirname(__FILE__), 'corpus','NKJP','NKJP','KOT');
 
@@ -52,7 +53,7 @@
   doc => $doc,
   foundry => $token_base_foundry,
   layer => $token_base_layer,
-  name => 'tokens'
+  name => 'tokens',
 );
 ok($tokens, 'Token Object is fine');
 ok($tokens->parse, 'Token parsing is fine');
@@ -65,7 +66,7 @@
 
 ## Base
 ok($tokens->add('DeReKo', 'Structure', 'base_sentences_paragraphs'));
-ok($tokens->add('NKJP', 'Morpho'), 'Add Gingko');
+ok($tokens->add('NKJP', 'Morpho'), 'Add Morpho');
 
 $output = $tokens->to_data;
 
@@ -128,13 +129,14 @@
 
 ## Base
 ok($tokens->add('DeReKo', 'Structure', 'base_sentences_paragraphs'));
-ok($tokens->add('NKJP', 'Morpho'), 'Add Gingko');
+ok($tokens->add('NKJP', 'Morpho'), 'Add Morpho');
+ok($tokens->add('NKJP', 'NamedEntities'), 'Add NamedEntities');
 
 $output = $tokens->to_data;
 
-is($output->{data}->{foundries}, 'dereko dereko/structure dereko/structure/base_sentences_paragraphs nkjp nkjp/morpho', 'Foundries');
+is($output->{data}->{foundries}, 'dereko dereko/structure dereko/structure/base_sentences_paragraphs nkjp nkjp/morpho nkjp/namedentities', 'Foundries');
 
-is($output->{data}->{layerInfos}, 'dereko/s=spans nkjp/l=tokens nkjp/m=tokens nkjp/p=tokens', 'layerInfos');
+is($output->{data}->{layerInfos}, 'dereko/s=spans nkjp/l=tokens nkjp/m=tokens nkjp/ne=tokens nkjp/p=tokens', 'layerInfos');
 
 $token = join('||', @{$output->{data}->{stream}->[5]});
 
@@ -146,6 +148,19 @@
 like($token, qr!nkjp/p:adj!);
 like($token, qr!s:takie!);
 
+$token = join('||', @{$output->{data}->{stream}->[67]});
+
+like($token, qr!<>:dereko/s:seg\$<b>64<i>464<i>475<i>68<b>4<s>1!);
+like($token, qr!\@:dereko\/s:corresp:ann_segmentation\.xml\\#segm_2\.2-seg\$<b>17<s>1<i>68!);
+like($token, qr!\@:dereko\/s:id:morph_2\.2-seg\$<b>17<s>1<i>68!);
+like($token, qr!_67\$<i>464<i>475!);
+like($token, qr!i:kierkegaard!);
+like($token, qr!nkjp/l:Kierkegaard!);
+like($token, qr!nkjp/m:sg:nom:m1!);
+like($token, qr!nkjp/ne:persName:surname!);
+like($token, qr!nkjp/p:subst!);
+like($token, qr!s:Kierkegaard!);
+
 done_testing;
 __END__