Support for NKJP taxonomy
Change-Id: I2d2ac7f2faffa5d0f48477a126bf341ad217c14d
diff --git a/Changes b/Changes
index e270c1a..8c3c5ed 100644
--- a/Changes
+++ b/Changes
@@ -1,5 +1,6 @@
0.47 2022-07-27
- Support for preferred language transformation.
+ - Support for NKJP taxonomies.
0.46 2022-07-21
- Support NKJP Meta, Morpho and NamedEntities.
diff --git a/lib/KorAP/XML/Meta/NKJP.pm b/lib/KorAP/XML/Meta/NKJP.pm
new file mode 100644
index 0000000..a4b302a
--- /dev/null
+++ b/lib/KorAP/XML/Meta/NKJP.pm
@@ -0,0 +1,81 @@
+package KorAP::XML::Meta::NKJP;
+use KorAP::XML::Meta::Base;
+use KorAP::XML::Meta::I5;
+
+my $squish = \&KorAP::XML::Meta::I5::_squish;
+
+our %taxonomy = ();
+
+# Parse NKJP metadata on top of the I5 parser.
+# 'corpus': rebuild %taxonomy from descs matching $self->lang (default 'pl').
+# 'text': resolve catRef targets into K_nkjp_type and K_nkjp_channel.
+# Returns 0 if the I5 parser fails, 1 otherwise.
+sub parse {
+  my ($self, $dom, $type) = @_;
+
+  # Parse using the parent I5 class
+  return 0 unless KorAP::XML::Meta::I5::parse($self, $dom, $type);
+
+  my $lang = lc($self->lang // 'pl');
+
+  if ($type eq 'corpus') {
+    # Reset the package-level table read by the 'text' branch
+    %taxonomy = ();
+
+    $dom->find('encodingDesc > classDecl > taxonomy')->each(
+      sub {
+        my $tax_id = $_->attr('xml:id') or return;
+
+        $_->find('category')->each(
+          sub {
+            my $cat_id = $_->attr('xml:id') or return;
+
+            # Skip categories without a description in the requested
+            # language instead of calling all_text on undef
+            my $desc = $_->find('> desc')->first(
+              sub {
+                my $desc_lang = $_->attr('xml:lang');
+                $desc_lang && lc($desc_lang) eq $lang;
+              }
+            ) or return;
+
+            $taxonomy{$tax_id}->{'#' . $cat_id} = $desc->all_text;
+          }
+        );
+      }
+    );
+  }
+
+  elsif ($type eq 'text') {
+    # Delete old interpretation
+    delete $self->{K_text_class};
+
+    my $text_class = $dom->at('textClass') or return 1;
+
+    # Only the NKJP type and channel taxonomies are evaluated
+    my %field_for = (
+      '#taxonomy-NKJP-type'    => 'K_nkjp_type',
+      '#taxonomy-NKJP-channel' => 'K_nkjp_channel',
+    );
+
+    # Dereference categories
+    $text_class->find('catRef')->each(
+      sub {
+        my $target = $_->attr('target') or return;
+        my $scheme = $_->attr('scheme') or return;
+        my $field  = $field_for{$scheme} or return;
+
+        # The field is created even if the target is not resolvable
+        my $values = $self->{$field} //= [];
+
+        # Taxonomy ids equal the scheme without the leading '#'
+        my $resolved = $taxonomy{substr($scheme, 1)}->{$target};
+        push(@$values, split(/,\s+/, $resolved)) if $resolved;
+      }
+    );
+  };
+
+  return 1;
+};
+
+1;
diff --git a/t/real/nkjp.t b/t/real/nkjp.t
index 8f94f44..7f651c0 100644
--- a/t/real/nkjp.t
+++ b/t/real/nkjp.t
@@ -16,11 +16,12 @@
use_ok('KorAP::XML::Krill');
use_ok('KorAP::XML::Meta::I5');
+use_ok('KorAP::XML::Meta::NKJP');
use_ok('KorAP::XML::Annotation::NKJP::NamedEntities');
my $path = catdir(dirname(__FILE__), 'corpus','NKJP','NKJP','KOT');
-ok(my $doc = KorAP::XML::Krill->new( path => $path . '/' ), 'Load Korap::Document');
+ok(my $doc = KorAP::XML::Krill->new( path => $path . '/', meta_type => 'NKJP' ), 'Load Korap::Document');
ok($doc->parse, 'Parse document');
is($doc->text_sigle, 'NKJP/NKJP/KOT', 'Correct text sigle');
@@ -32,7 +33,12 @@
is($meta->{T_title}, 'TEI P5 encoded version of sample(s) of "Kot"', 'Title');
is($meta->{T_corpus_title}, 'Narodowy Korpus Języka Polskiego -- podkorpus zawierający 1 milion słów', 'Title');
-ok($doc = KorAP::XML::Krill->new( path => $path . '/', lang => 'en' ), 'Load Korap::Document');
+is($meta->{K_nkjp_channel}->[0], 'miesiecznik', 'NKJP-Channel');
+ok(!$meta->{K_nkjp_channel}->[1], 'NKJP-Channel');
+is($meta->{K_nkjp_type}->[0], 'publicystyka i wiadomości prasowe', 'NKJP-Type');
+ok(!$meta->{K_nkjp_type}->[1], 'NKJP-Type');
+
+ok($doc = KorAP::XML::Krill->new( path => $path . '/', meta_type => 'NKJP', lang => 'en' ), 'Load Korap::Document');
ok($doc->parse, 'Parse document');
$meta = $doc->meta;
@@ -51,6 +57,10 @@
ok(!$meta->{S_text_domain}, 'No Text Domain');
ok(!$meta->{S_text_column}, 'No Text Column');
+is($meta->{K_nkjp_channel}->[0], 'monthly', 'NKJP-Channel');
+ok(!$meta->{K_nkjp_channel}->[1], 'NKJP-Channel');
+is($meta->{K_nkjp_type}->[0], 'journalism', 'NKJP-Type');
+ok(!$meta->{K_nkjp_type}->[1], 'NKJP-Type');
# Tokenization
use_ok('KorAP::XML::Tokenizer');
@@ -98,7 +108,7 @@
# KolakowskiOco
$path = catdir(dirname(__FILE__), 'corpus','NKJP','NKJP','KolakowskiOco');
-ok($doc = KorAP::XML::Krill->new( path => $path . '/' ), 'Load Korap::Document');
+ok($doc = KorAP::XML::Krill->new( path => $path . '/', meta_type => 'NKJP', lang => 'pl'), 'Load Korap::Document');
ok($doc->parse, 'Parse document');
is($doc->text_sigle, 'NKJP/NKJP/KolakowskiOco', 'Correct text sigle');
@@ -120,6 +130,12 @@
ok(!$meta->{S_text_domain}, 'No Text Domain');
ok(!$meta->{S_text_column}, 'No Text Column');
+is($meta->{K_nkjp_channel}->[0], 'książka', 'NKJP-Channel');
+ok(!$meta->{K_nkjp_channel}->[1], 'NKJP-Channel');
+is($meta->{K_nkjp_type}->[0], 'literatura piękna', 'NKJP-Type');
+ok(!$meta->{K_nkjp_type}->[1], 'NKJP-Type');
+
+
# Get tokenization
$tokens = KorAP::XML::Tokenizer->new(
path => $doc->path,