Support for NKJP taxonomy

Change-Id: I2d2ac7f2faffa5d0f48477a126bf341ad217c14d
diff --git a/Changes b/Changes
index e270c1a..8c3c5ed 100644
--- a/Changes
+++ b/Changes
@@ -1,5 +1,6 @@
 0.47 2022-07-27
         - Support for preferred language transformation.
+        - Support for NKJP taxonomies.
 
 0.46 2022-07-21
         - Support NKJP Meta, Morpho and NamedEntities.
diff --git a/lib/KorAP/XML/Meta/NKJP.pm b/lib/KorAP/XML/Meta/NKJP.pm
new file mode 100644
index 0000000..a4b302a
--- /dev/null
+++ b/lib/KorAP/XML/Meta/NKJP.pm
@@ -0,0 +1,81 @@
+package KorAP::XML::Meta::NKJP;
+use KorAP::XML::Meta::Base;
+use KorAP::XML::Meta::I5;
+
+my $squish = \&KorAP::XML::Meta::I5::_squish;
+
+our %taxonomy = ();
+
+# Parse NKJP metadata on top of the generic I5 parsing.
+# For $type 'corpus' the taxonomies declared in the header are collected
+# into the package-level %taxonomy; for $type 'text' the catRef elements
+# are resolved against them into K_nkjp_type and K_nkjp_channel.
+# Returns 0 if the I5 parsing fails and 1 otherwise.
+sub parse {
+  my ($self, $dom, $type) = @_;
+
+  # Parse using the parent I5 class
+  return 0 unless KorAP::XML::Meta::I5::parse($self, $dom, $type);
+
+  my $lang = lc($self->lang // 'pl');
+
+  if ($type eq 'corpus') {
+    # Forget the categories of a previously parsed corpus
+    %taxonomy = ();
+
+    $dom->find('encodingDesc > classDecl > taxonomy')->each(
+      sub {
+        my $tax_id = $_->attr('xml:id') or return;
+
+        $_->find('category')->each(
+          sub {
+            my $cat_id = $_->attr('xml:id') or return;
+
+            # No description in the requested language: skip the
+            # category rather than calling all_text on undef
+            my $desc = $_->find('> desc')->first(
+              sub { my $code = $_->attr('xml:lang'); $code && lc($code) eq $lang }
+            ) or return;
+
+            # Keys carry a leading '#' to match the catRef targets
+            $taxonomy{$tax_id}->{'#' . $cat_id} = $desc->all_text;
+          }
+        );
+      }
+    );
+  }
+
+  elsif ($type eq 'text') {
+
+    # Delete old interpretation
+    delete $self->{K_text_class};
+
+    # Supported classification schemes and the fields they fill
+    my %field_of = (
+      '#taxonomy-NKJP-type'    => 'K_nkjp_type',
+      '#taxonomy-NKJP-channel' => 'K_nkjp_channel'
+    );
+
+    if (my $text_class = $dom->at('textClass')) {
+      # Dereference categories
+      $text_class->find('catRef')->each(
+        sub {
+          my $target = $_->attr('target') or return;
+          my $scheme = $_->attr('scheme') or return;
+          my $field  = $field_of{$scheme} or return;
+
+          # The field is initialized even if nothing can be resolved
+          my $values = $self->{$field} //= [];
+
+          # Look up without autovivifying unknown taxonomies
+          my $resolved = ($taxonomy{substr($scheme, 1)} // {})->{$target};
+          push(@$values, split(/,\s+/, $resolved)) if $resolved;
+        }
+      );
+    };
+  };
+
+  return 1;
+};
+
+1;
diff --git a/t/real/nkjp.t b/t/real/nkjp.t
index 8f94f44..7f651c0 100644
--- a/t/real/nkjp.t
+++ b/t/real/nkjp.t
@@ -16,11 +16,12 @@
 
 use_ok('KorAP::XML::Krill');
 use_ok('KorAP::XML::Meta::I5');
+use_ok('KorAP::XML::Meta::NKJP');
 use_ok('KorAP::XML::Annotation::NKJP::NamedEntities');
 
 my $path = catdir(dirname(__FILE__), 'corpus','NKJP','NKJP','KOT');
 
-ok(my $doc = KorAP::XML::Krill->new( path => $path . '/' ), 'Load Korap::Document');
+ok(my $doc = KorAP::XML::Krill->new( path => $path . '/', meta_type => 'NKJP' ), 'Load Korap::Document');
 ok($doc->parse, 'Parse document');
 
 is($doc->text_sigle, 'NKJP/NKJP/KOT', 'Correct text sigle');
@@ -32,7 +33,12 @@
 is($meta->{T_title}, 'TEI P5 encoded version of sample(s) of "Kot"', 'Title');
 is($meta->{T_corpus_title}, 'Narodowy Korpus Języka Polskiego -- podkorpus zawierający 1 milion słów', 'Title');
 
-ok($doc = KorAP::XML::Krill->new( path => $path . '/', lang => 'en' ), 'Load Korap::Document');
+is($meta->{K_nkjp_channel}->[0], 'miesiecznik', 'NKJP-Channel');
+ok(!$meta->{K_nkjp_channel}->[1], 'NKJP-Channel');
+is($meta->{K_nkjp_type}->[0], 'publicystyka i wiadomości prasowe', 'NKJP-Type');
+ok(!$meta->{K_nkjp_type}->[1], 'NKJP-Type');
+
+ok($doc = KorAP::XML::Krill->new( path => $path . '/', meta_type => 'NKJP', lang => 'en' ), 'Load Korap::Document');
 ok($doc->parse, 'Parse document');
 $meta = $doc->meta;
 
@@ -51,6 +57,10 @@
 ok(!$meta->{S_text_domain}, 'No Text Domain');
 ok(!$meta->{S_text_column}, 'No Text Column');
 
+is($meta->{K_nkjp_channel}->[0], 'monthly', 'NKJP-Channel');
+ok(!$meta->{K_nkjp_channel}->[1], 'NKJP-Channel');
+is($meta->{K_nkjp_type}->[0], 'journalism', 'NKJP-Type');
+ok(!$meta->{K_nkjp_type}->[1], 'NKJP-Type');
 
 # Tokenization
 use_ok('KorAP::XML::Tokenizer');
@@ -98,7 +108,7 @@
 # KolakowskiOco
 $path = catdir(dirname(__FILE__), 'corpus','NKJP','NKJP','KolakowskiOco');
 
-ok($doc = KorAP::XML::Krill->new( path => $path . '/' ), 'Load Korap::Document');
+ok($doc = KorAP::XML::Krill->new( path => $path . '/', meta_type => 'NKJP', lang => 'pl'), 'Load Korap::Document');
 ok($doc->parse, 'Parse document');
 
 is($doc->text_sigle, 'NKJP/NKJP/KolakowskiOco', 'Correct text sigle');
@@ -120,6 +130,12 @@
 ok(!$meta->{S_text_domain}, 'No Text Domain');
 ok(!$meta->{S_text_column}, 'No Text Column');
 
+is($meta->{K_nkjp_channel}->[0], 'książka', 'NKJP-Channel');
+ok(!$meta->{K_nkjp_channel}->[1], 'NKJP-Channel');
+is($meta->{K_nkjp_type}->[0], 'literatura piękna', 'NKJP-Type');
+ok(!$meta->{K_nkjp_type}->[1], 'NKJP-Type');
+
+
 # Get tokenization
 $tokens = KorAP::XML::Tokenizer->new(
   path => $doc->path,