Support switch for preferred language transformation Change-Id: I7bda578f386e4b454eaa9bf100f3c258e10f74c2

commit: 64f7faead93871ffa3e3612c823b267846ad35d4 [log] [tgz]
author: Akron <nils@diewald-online.de> Wed Jul 27 12:45:33 2022 +0200
committer: Akron <nils@diewald-online.de> Wed Jul 27 12:45:33 2022 +0200
tree: 320ce42676d571fae1871c0e7b101d3d15fca4da
parent: e85a7765eb640570eb5b1bf6f62ef9d0e02f3639 [diff]
diff --git a/Changes b/Changes
index 2a267f1..e270c1a 100644
--- a/Changes
+++ b/Changes

@@ -1,3 +1,6 @@
+0.47 2022-07-27
+        - Support for preferred language transformation.
+
 0.46 2022-07-21
         - Support NKJP Meta, Morpho and NamedEntities.
 

diff --git a/lib/KorAP/XML/Batch/File.pm b/lib/KorAP/XML/Batch/File.pm
index 297fc06..d8f94dd 100644
--- a/lib/KorAP/XML/Batch/File.pm
+++ b/lib/KorAP/XML/Batch/File.pm

@@ -20,6 +20,7 @@
     layer           => $param{layer}     || 'Tokens',
     anno            => $param{anno}      || [[]],
     log             => $param{log}       || $log,
+    lang            => $param{lang},
     koral           => $param{koral},
     non_word_tokens => $param{non_word_tokens},
     non_verbal_tokens => $param{non_verbal_tokens},
@@ -42,7 +43,8 @@
     path      => $input,
     meta_type => $self->{meta_type},
     cache     => $self->{cache},
-    log       => $self->{log}
+    log       => $self->{log},
+    lang      => $self->{lang}
   );
 
   # Parse document

diff --git a/lib/KorAP/XML/Krill.pm b/lib/KorAP/XML/Krill.pm
index d8b72ab..564cc14 100644
--- a/lib/KorAP/XML/Krill.pm
+++ b/lib/KorAP/XML/Krill.pm

@@ -16,12 +16,13 @@
 
 our @EXPORT_OK = qw(get_file_name get_file_name_from_glob);
 
-our $VERSION = '0.46';
+our $VERSION = '0.47';
 
 has 'path';
 has [qw/text_sigle doc_sigle corpus_sigle/];
 has 'meta_type' => 'I5';
 has 'cache';
+has 'lang';
 
 has log => sub {
   return $log;
@@ -140,7 +141,8 @@
       corpus_sigle => $self->corpus_sigle,
       doc_sigle    => $self->doc_sigle,
       text_sigle   => $self->text_sigle,
-      cache        => $self->cache
+      cache        => $self->cache,
+      lang         => $self->lang
     );
 
     # Associate meta object

diff --git a/lib/KorAP/XML/Meta/Base.pm b/lib/KorAP/XML/Meta/Base.pm
index 8f6900b..a4269a7 100644
--- a/lib/KorAP/XML/Meta/Base.pm
+++ b/lib/KorAP/XML/Meta/Base.pm

@@ -39,13 +39,17 @@
 
 sub cache {
   $_[0]->{_cache};
-}
+};
+
+sub lang {
+  $_[0]->{_lang};
+};
 
 sub new {
   my $class = shift;
   my %hash = @_;
   my $copy = {};
-  foreach (qw/log cache corpus_sigle doc_sigle text_sigle/) {
+  foreach (qw/log cache lang corpus_sigle doc_sigle text_sigle/) {
     $copy->{'_' . $_} = $hash{$_};
   };
 

diff --git a/lib/KorAP/XML/Meta/I5.pm b/lib/KorAP/XML/Meta/I5.pm
index fd52192..a675e4f 100644
--- a/lib/KorAP/XML/Meta/I5.pm
+++ b/lib/KorAP/XML/Meta/I5.pm

@@ -69,6 +69,8 @@
 sub parse {
   my ($self, $dom, $type) = @_;
 
+  my $lang = $self->lang;
+
   # Parse text sigle
   if ($type eq 'text' && !$self->text_sigle) {
     my $v = $dom->at('textSigle');
@@ -106,8 +108,20 @@
     # There is an analytic element
 
     # Get title, subtitle, author, editor
-    my $title     = $analytic->at('h\.title[type=main]');
-    my $sub_title = $analytic->at('h\.title[type=sub]');
+    my $titles = $analytic->find('h\.title[type=main]');
+    my $title;
+    if ($lang) {
+      $title = $titles->first(sub{ $_->attr('xml:lang') && lc($_->attr('xml:lang')) eq lc($lang) });
+    };
+    $title = $titles->first unless $title;
+
+    my $sub_title;
+    $titles    = $analytic->find('h\.title[type=sub]');
+    if ($lang) {
+      $sub_title = $titles->first(sub{ $_->attr('xml:lang') && lc($_->attr('xml:lang')) eq lc($lang) });
+    };
+    $sub_title = $titles->first unless $sub_title;
+
     my $author    = $analytic->at('h\.author');
     my $editor    = $analytic->at('editor');
 
@@ -181,16 +195,24 @@
   };
 
   # Not in analytic
-  my $title;
+  my ($titles, $title);
   if ($type eq 'corpus') {
 
     # Corpus title not yet given
     unless ($self->{T_corpus_title}) {
-      if ($title = $dom->at('fileDesc > titleStmt > c\.title')) {
-        $title = _squish($title->all_text);
+      if ($titles = $dom->find('fileDesc > titleStmt > c\.title')) {
+        if ($lang) {
+          $title = $titles->first(sub{ $_->attr('xml:lang') && lc($_->attr('xml:lang')) eq lc($lang) });
+        };
+
+        $title = $titles->first unless $title;
 
         if ($title) {
-          $self->{T_corpus_title} = _remove_prefix($title, $self->corpus_sigle);
+          $title = _squish($title->all_text);
+
+          if ($title) {
+            $self->{T_corpus_title} = _remove_prefix($title, $self->corpus_sigle);
+          };
         };
       };
     };
@@ -199,11 +221,19 @@
   # doc title
   elsif ($type eq 'doc') {
     unless ($self->{T_doc_title}) {
-      if ($title = $dom->at('fileDesc > titleStmt > d\.title')) {
-        $title = _squish($title->all_text);
+      if ($titles = $dom->find('fileDesc > titleStmt > d\.title')) {
+        if ($lang) {
+          $title = $titles->first(sub{ $_->attr('xml:lang') && lc($_->attr('xml:lang')) eq lc($lang) });
+        };
+
+        $title = $titles->first unless $title;
 
         if ($title) {
-          $self->{T_doc_title} = _remove_prefix($title, $self->doc_sigle);
+          $title = _squish($title->all_text);
+
+          if ($title) {
+            $self->{T_doc_title} = _remove_prefix($title, $self->doc_sigle);
+          };
         };
       };
     };
@@ -212,12 +242,21 @@
   # text title
   elsif ($type eq 'text') {
     unless ($self->{T_title}) {
-      if ($title = $dom->at('fileDesc > titleStmt > t\.title')) {
-        $title = _squish($title->all_text);
-        if ($title) {
-          $self->{T_title} = _remove_prefix($title, $self->text_sigle);
+      if ($titles = $dom->find('fileDesc > titleStmt > t\.title')) {
+        if ($lang) {
+          $title = $titles->first(sub{ $_->attr('xml:lang') && lc($_->attr('xml:lang')) eq lc($lang) });
         };
-      }
+
+        $title = $titles->first unless $title;
+
+        if ($title) {
+          $title = _squish($title->all_text);
+
+          if ($title) {
+            $self->{T_title} = _remove_prefix($title, $self->text_sigle);
+          };
+        };
+      };
     };
   };
 

diff --git a/script/korapxml2krill b/script/korapxml2krill
index e909b09..7e19644 100644
--- a/script/korapxml2krill
+++ b/script/korapxml2krill

@@ -161,9 +161,12 @@
 #
 # 2022/07/21
 # - Support for NKJP
+#
+# 2022/07/27
+# - Support for preferred language transformation
 # ----------------------------------------------------------
 
-our $LAST_CHANGE = '2022/07/21';
+our $LAST_CHANGE = '2022/07/27';
 our $LOCAL = $FindBin::Bin;
 our $KORAL_VERSION = 0.03;
 our $VERSION_MSG = <<"VERSION";
@@ -200,6 +203,7 @@
   'sigle|sg=s'  => \@sigle,
   'cache|c=s'   => \($cfg{cache_file}),
   'config|cfg=s' => \(my $cfg_file),
+  'lang=s'        => \($cfg{lang}),
   'log|l=s'     => \($cfg{log}),
   'anno|a=s'    => \@anno,
   'primary|p!'  => sub {
@@ -252,7 +256,7 @@
 
   foreach (qw!output cache-size input-base token overwrite
               meta base-sentences base-paragraphs base-pagebreaks
-              gzip to-tar log cache non-word-tokens
+              gzip to-tar log lang cache non-word-tokens
               non-verbal-tokens sequential-extraction
               temporary-extract cache-init
               koral extract-dir jobs!) {
@@ -555,10 +559,10 @@
   koral     => ($cfg{koral} // $KORAL_VERSION),
   anno      => \@filtered_anno,
   non_word_tokens   => ($cfg{non_word_tokens}   // 0),
-  non_verbal_tokens => ($cfg{non_verbal_tokens} // 0)
+  non_verbal_tokens => ($cfg{non_verbal_tokens} // 0),
+  lang      => $cfg{lang},
 );
 
-
 # Auto adjust jobs
 if ($jobs eq '-1') {
   my $cores = 1;
@@ -1376,6 +1380,15 @@
 In case the C<Text> path is omitted, the whole document will be extracted.
 On the document level, the postfix wildcard C<*> is supported.
 
+=item B<--lang>
+
+Preferred language for metadata fields. In case multiple titles are
+given (on any level) with different C<xml:lang> attributes,
+the language given is preferred.
+Because titles may have different sources and different priorities,
+non-specific language titles may still be preferred in case the title
+source has a higher priority.
+
 
 =item B<--log|-l>
 

diff --git a/t/real/nkjp.t b/t/real/nkjp.t
index 8652037..8f94f44 100644
--- a/t/real/nkjp.t
+++ b/t/real/nkjp.t

@@ -15,6 +15,7 @@
 use File::Spec::Functions 'catdir';
 
 use_ok('KorAP::XML::Krill');
+use_ok('KorAP::XML::Meta::I5');
 use_ok('KorAP::XML::Annotation::NKJP::NamedEntities');
 
 my $path = catdir(dirname(__FILE__), 'corpus','NKJP','NKJP','KOT');
@@ -29,6 +30,15 @@
 my $meta = $doc->meta;
 
 is($meta->{T_title}, 'TEI P5 encoded version of sample(s) of "Kot"', 'Title');
+is($meta->{T_corpus_title}, 'Narodowy Korpus Języka Polskiego -- podkorpus zawierający 1 milion słów', 'Title');
+
+ok($doc = KorAP::XML::Krill->new( path => $path . '/', lang => 'en' ), 'Load Korap::Document');
+ok($doc->parse, 'Parse document');
+$meta = $doc->meta;
+
+is($meta->{T_title}, 'TEI P5 encoded version of sample(s) of "Kot"', 'Title');
+is($meta->{T_corpus_title}, 'National Corpus of Polish -- the 1 million word subcorpus', 'Language sensitive Title');
+
 ok(!$meta->{T_sub_title}, 'SubTitle');
 ok(!$meta->{T_author}, 'Author');
 ok(!$meta->{A_editor}, 'Editor');

diff --git a/t/script/single.t b/t/script/single.t
index 301357c..01d119f 100644
--- a/t/script/single.t
+++ b/t/script/single.t

@@ -237,6 +237,38 @@
 ok(!-f $output, 'Output does not exist');
 
 
+# Koral version
+$input = catdir($f, '..', 'real', 'corpus', 'NKJP', 'NKJP', 'KOT');
+$call = join(
+  ' ',
+  'perl', $script,
+  '--input' => $input,
+  '--output' => $output,
+  '--cache' => $cache,
+  '-t' => 'NKJP#Morpho',
+  '-l' => 'INFO',
+  '--lang' => 'en'
+);
+
+$call .= ' -w ';
+
+stderr_like(
+  sub {
+    system($call);
+  },
+  qr!The code took!,
+  $call
+);
+
+ok(-f $output, 'Output does exist');
+ok(($file = Mojo::File->new($output)->slurp), 'Slurp data');
+ok(($json = decode_json $file), 'decode json');
+is($json->{corpusTitle}, 'National Corpus of Polish -- the 1 million word subcorpus', 'Title');
+
+
+
+
+
 done_testing;
 __END__
commit	64f7faead93871ffa3e3612c823b267846ad35d4	[log] [tgz]
author	Akron <nils@diewald-online.de>	Wed Jul 27 12:45:33 2022 +0200
committer	Akron <nils@diewald-online.de>	Wed Jul 27 12:45:33 2022 +0200
tree	320ce42676d571fae1871c0e7b101d3d15fca4da
parent	e85a7765eb640570eb5b1bf6f62ef9d0e02f3639 [diff]