Support --lang switch for preferred language transformation
Change-Id: I7bda578f386e4b454eaa9bf100f3c258e10f74c2
diff --git a/script/korapxml2krill b/script/korapxml2krill
index e909b09..7e19644 100644
--- a/script/korapxml2krill
+++ b/script/korapxml2krill
@@ -161,9 +161,12 @@
#
# 2022/07/21
# - Support for NKJP
+#
+# 2022/07/27
+# - Support for preferred language transformation
# ----------------------------------------------------------
-our $LAST_CHANGE = '2022/07/21';
+our $LAST_CHANGE = '2022/07/27';
our $LOCAL = $FindBin::Bin;
our $KORAL_VERSION = 0.03;
our $VERSION_MSG = <<"VERSION";
@@ -200,6 +203,7 @@
'sigle|sg=s' => \@sigle,
'cache|c=s' => \($cfg{cache_file}),
'config|cfg=s' => \(my $cfg_file),
+ 'lang=s' => \($cfg{lang}),
'log|l=s' => \($cfg{log}),
'anno|a=s' => \@anno,
'primary|p!' => sub {
@@ -252,7 +256,7 @@
foreach (qw!output cache-size input-base token overwrite
meta base-sentences base-paragraphs base-pagebreaks
- gzip to-tar log cache non-word-tokens
+ gzip to-tar log lang cache non-word-tokens
non-verbal-tokens sequential-extraction
temporary-extract cache-init
koral extract-dir jobs!) {
@@ -555,10 +559,10 @@
koral => ($cfg{koral} // $KORAL_VERSION),
anno => \@filtered_anno,
non_word_tokens => ($cfg{non_word_tokens} // 0),
- non_verbal_tokens => ($cfg{non_verbal_tokens} // 0)
+ non_verbal_tokens => ($cfg{non_verbal_tokens} // 0),
+ lang => $cfg{lang},
);
-
# Auto adjust jobs
if ($jobs eq '-1') {
my $cores = 1;
@@ -1376,6 +1380,15 @@
In case the C<Text> path is omitted, the whole document will be extracted.
On the document level, the postfix wildcard C<*> is supported.
+=item B<--lang>
+
+Preferred language for metadata fields. In case multiple titles are
+given (on any level) with different C<xml:lang> attributes,
+the title in the given language is preferred.
+Because titles may have different sources with different priorities,
+a title not marked with the given language may still be preferred
+in case its source has a higher priority.
+
=item B<--log|-l>