Support switch for preferred language transformation
Change-Id: I7bda578f386e4b454eaa9bf100f3c258e10f74c2
diff --git a/Changes b/Changes
index 2a267f1..e270c1a 100644
--- a/Changes
+++ b/Changes
@@ -1,3 +1,6 @@
+0.47 2022-07-27
+ - Support for preferred language transformation.
+
0.46 2022-07-21
- Support NKJP Meta, Morpho and NamedEntities.
diff --git a/lib/KorAP/XML/Batch/File.pm b/lib/KorAP/XML/Batch/File.pm
index 297fc06..d8f94dd 100644
--- a/lib/KorAP/XML/Batch/File.pm
+++ b/lib/KorAP/XML/Batch/File.pm
@@ -20,6 +20,7 @@
layer => $param{layer} || 'Tokens',
anno => $param{anno} || [[]],
log => $param{log} || $log,
+ lang => $param{lang},
koral => $param{koral},
non_word_tokens => $param{non_word_tokens},
non_verbal_tokens => $param{non_verbal_tokens},
@@ -42,7 +43,8 @@
path => $input,
meta_type => $self->{meta_type},
cache => $self->{cache},
- log => $self->{log}
+ log => $self->{log},
+ lang => $self->{lang}
);
# Parse document
diff --git a/lib/KorAP/XML/Krill.pm b/lib/KorAP/XML/Krill.pm
index d8b72ab..564cc14 100644
--- a/lib/KorAP/XML/Krill.pm
+++ b/lib/KorAP/XML/Krill.pm
@@ -16,12 +16,13 @@
our @EXPORT_OK = qw(get_file_name get_file_name_from_glob);
-our $VERSION = '0.46';
+our $VERSION = '0.47';
has 'path';
has [qw/text_sigle doc_sigle corpus_sigle/];
has 'meta_type' => 'I5';
has 'cache';
+has 'lang';
has log => sub {
return $log;
@@ -140,7 +141,8 @@
corpus_sigle => $self->corpus_sigle,
doc_sigle => $self->doc_sigle,
text_sigle => $self->text_sigle,
- cache => $self->cache
+ cache => $self->cache,
+ lang => $self->lang
);
# Associate meta object
diff --git a/lib/KorAP/XML/Meta/Base.pm b/lib/KorAP/XML/Meta/Base.pm
index 8f6900b..a4269a7 100644
--- a/lib/KorAP/XML/Meta/Base.pm
+++ b/lib/KorAP/XML/Meta/Base.pm
@@ -39,13 +39,17 @@
sub cache {
$_[0]->{_cache};
-}
+};
+
+sub lang {
+ $_[0]->{_lang};
+};
sub new {
my $class = shift;
my %hash = @_;
my $copy = {};
- foreach (qw/log cache corpus_sigle doc_sigle text_sigle/) {
+ foreach (qw/log cache lang corpus_sigle doc_sigle text_sigle/) {
$copy->{'_' . $_} = $hash{$_};
};
diff --git a/lib/KorAP/XML/Meta/I5.pm b/lib/KorAP/XML/Meta/I5.pm
index fd52192..a675e4f 100644
--- a/lib/KorAP/XML/Meta/I5.pm
+++ b/lib/KorAP/XML/Meta/I5.pm
@@ -69,6 +69,8 @@
sub parse {
my ($self, $dom, $type) = @_;
+ my $lang = $self->lang;
+
# Parse text sigle
if ($type eq 'text' && !$self->text_sigle) {
my $v = $dom->at('textSigle');
@@ -106,8 +108,20 @@
# There is an analytic element
# Get title, subtitle, author, editor
- my $title = $analytic->at('h\.title[type=main]');
- my $sub_title = $analytic->at('h\.title[type=sub]');
+ my $titles = $analytic->find('h\.title[type=main]');
+ my $title;
+ if ($lang) {
+ $title = $titles->first(sub{ $_->attr('xml:lang') && lc($_->attr('xml:lang')) eq lc($lang) });
+ };
+ $title = $titles->first unless $title;
+
+ my $sub_title;
+ $titles = $analytic->find('h\.title[type=sub]');
+ if ($lang) {
+ $sub_title = $titles->first(sub{ $_->attr('xml:lang') && lc($_->attr('xml:lang')) eq lc($lang) });
+ };
+ $sub_title = $titles->first unless $sub_title;
+
my $author = $analytic->at('h\.author');
my $editor = $analytic->at('editor');
@@ -181,16 +195,24 @@
};
# Not in analytic
- my $title;
+ my ($titles, $title);
if ($type eq 'corpus') {
# Corpus title not yet given
unless ($self->{T_corpus_title}) {
- if ($title = $dom->at('fileDesc > titleStmt > c\.title')) {
- $title = _squish($title->all_text);
+ if ($titles = $dom->find('fileDesc > titleStmt > c\.title')) {
+ if ($lang) {
+ $title = $titles->first(sub{ $_->attr('xml:lang') && lc($_->attr('xml:lang')) eq lc($lang) });
+ };
+
+ $title = $titles->first unless $title;
if ($title) {
- $self->{T_corpus_title} = _remove_prefix($title, $self->corpus_sigle);
+ $title = _squish($title->all_text);
+
+ if ($title) {
+ $self->{T_corpus_title} = _remove_prefix($title, $self->corpus_sigle);
+ };
};
};
};
@@ -199,11 +221,19 @@
# doc title
elsif ($type eq 'doc') {
unless ($self->{T_doc_title}) {
- if ($title = $dom->at('fileDesc > titleStmt > d\.title')) {
- $title = _squish($title->all_text);
+ if ($titles = $dom->find('fileDesc > titleStmt > d\.title')) {
+ if ($lang) {
+ $title = $titles->first(sub{ $_->attr('xml:lang') && lc($_->attr('xml:lang')) eq lc($lang) });
+ };
+
+ $title = $titles->first unless $title;
if ($title) {
- $self->{T_doc_title} = _remove_prefix($title, $self->doc_sigle);
+ $title = _squish($title->all_text);
+
+ if ($title) {
+ $self->{T_doc_title} = _remove_prefix($title, $self->doc_sigle);
+ };
};
};
};
@@ -212,12 +242,21 @@
# text title
elsif ($type eq 'text') {
unless ($self->{T_title}) {
- if ($title = $dom->at('fileDesc > titleStmt > t\.title')) {
- $title = _squish($title->all_text);
- if ($title) {
- $self->{T_title} = _remove_prefix($title, $self->text_sigle);
+ if ($titles = $dom->find('fileDesc > titleStmt > t\.title')) {
+ if ($lang) {
+ $title = $titles->first(sub{ $_->attr('xml:lang') && lc($_->attr('xml:lang')) eq lc($lang) });
};
- }
+
+ $title = $titles->first unless $title;
+
+ if ($title) {
+ $title = _squish($title->all_text);
+
+ if ($title) {
+ $self->{T_title} = _remove_prefix($title, $self->text_sigle);
+ };
+ };
+ };
};
};
diff --git a/script/korapxml2krill b/script/korapxml2krill
index e909b09..7e19644 100644
--- a/script/korapxml2krill
+++ b/script/korapxml2krill
@@ -161,9 +161,12 @@
#
# 2022/07/21
# - Support for NKJP
+#
+# 2022/07/27
+# - Support for preferred language transformation
# ----------------------------------------------------------
-our $LAST_CHANGE = '2022/07/21';
+our $LAST_CHANGE = '2022/07/27';
our $LOCAL = $FindBin::Bin;
our $KORAL_VERSION = 0.03;
our $VERSION_MSG = <<"VERSION";
@@ -200,6 +203,7 @@
'sigle|sg=s' => \@sigle,
'cache|c=s' => \($cfg{cache_file}),
'config|cfg=s' => \(my $cfg_file),
+ 'lang=s' => \($cfg{lang}),
'log|l=s' => \($cfg{log}),
'anno|a=s' => \@anno,
'primary|p!' => sub {
@@ -252,7 +256,7 @@
foreach (qw!output cache-size input-base token overwrite
meta base-sentences base-paragraphs base-pagebreaks
- gzip to-tar log cache non-word-tokens
+ gzip to-tar log lang cache non-word-tokens
non-verbal-tokens sequential-extraction
temporary-extract cache-init
koral extract-dir jobs!) {
@@ -555,10 +559,10 @@
koral => ($cfg{koral} // $KORAL_VERSION),
anno => \@filtered_anno,
non_word_tokens => ($cfg{non_word_tokens} // 0),
- non_verbal_tokens => ($cfg{non_verbal_tokens} // 0)
+ non_verbal_tokens => ($cfg{non_verbal_tokens} // 0),
+ lang => $cfg{lang},
);
-
# Auto adjust jobs
if ($jobs eq '-1') {
my $cores = 1;
@@ -1376,6 +1380,15 @@
In case the C<Text> path is omitted, the whole document will be extracted.
On the document level, the postfix wildcard C<*> is supported.
+=item B<--lang>
+
+Preferred language for metadata fields. In case multiple titles are
+given (on any level) with different C<xml:lang> attributes,
+the language given is preferred.
+Because titles may have different sources and different priorities,
+non-specific language titles may still be preferred in case the title
+source has a higher priority.
+
=item B<--log|-l>
diff --git a/t/real/nkjp.t b/t/real/nkjp.t
index 8652037..8f94f44 100644
--- a/t/real/nkjp.t
+++ b/t/real/nkjp.t
@@ -15,6 +15,7 @@
use File::Spec::Functions 'catdir';
use_ok('KorAP::XML::Krill');
+use_ok('KorAP::XML::Meta::I5');
use_ok('KorAP::XML::Annotation::NKJP::NamedEntities');
my $path = catdir(dirname(__FILE__), 'corpus','NKJP','NKJP','KOT');
@@ -29,6 +30,15 @@
my $meta = $doc->meta;
is($meta->{T_title}, 'TEI P5 encoded version of sample(s) of "Kot"', 'Title');
+is($meta->{T_corpus_title}, 'Narodowy Korpus Języka Polskiego -- podkorpus zawierający 1 milion słów', 'Title');
+
+ok($doc = KorAP::XML::Krill->new( path => $path . '/', lang => 'en' ), 'Load Korap::Document');
+ok($doc->parse, 'Parse document');
+$meta = $doc->meta;
+
+is($meta->{T_title}, 'TEI P5 encoded version of sample(s) of "Kot"', 'Title');
+is($meta->{T_corpus_title}, 'National Corpus of Polish -- the 1 million word subcorpus', 'Language sensitive Title');
+
ok(!$meta->{T_sub_title}, 'SubTitle');
ok(!$meta->{T_author}, 'Author');
ok(!$meta->{A_editor}, 'Editor');
diff --git a/t/script/single.t b/t/script/single.t
index 301357c..01d119f 100644
--- a/t/script/single.t
+++ b/t/script/single.t
@@ -237,6 +237,38 @@
ok(!-f $output, 'Output does not exist');
+# Koral version
+$input = catdir($f, '..', 'real', 'corpus', 'NKJP', 'NKJP', 'KOT');
+$call = join(
+ ' ',
+ 'perl', $script,
+ '--input' => $input,
+ '--output' => $output,
+ '--cache' => $cache,
+ '-t' => 'NKJP#Morpho',
+ '-l' => 'INFO',
+ '--lang' => 'en'
+);
+
+$call .= ' -w ';
+
+stderr_like(
+ sub {
+ system($call);
+ },
+ qr!The code took!,
+ $call
+);
+
+ok(-f $output, 'Output does exist');
+ok(($file = Mojo::File->new($output)->slurp), 'Slurp data');
+ok(($json = decode_json $file), 'decode json');
+is($json->{corpusTitle}, 'National Corpus of Polish -- the 1 million word subcorpus', 'Title');
+
+
+
+
+
done_testing;
__END__