Support file extensions in base tokenization file
Change-Id: I02a2bcba87998c99d496e1e394093ae9483ddb04
diff --git a/Changes b/Changes
index 133d7f2..0d50b3c 100644
--- a/Changes
+++ b/Changes
@@ -1,5 +1,6 @@
-0.32 2017-07-04
+0.32 2017-10-24
- Fixed tar building process in script.
+ - Support file extensions in base tokenization parameter.
0.31 2017-06-30
- Fixed exit codes in script.
diff --git a/lib/KorAP/XML/Tokenizer.pm b/lib/KorAP/XML/Tokenizer.pm
index 40e6c63..74fc083 100644
--- a/lib/KorAP/XML/Tokenizer.pm
+++ b/lib/KorAP/XML/Tokenizer.pm
@@ -48,9 +48,15 @@
sub parse {
my $self = shift;
+ my $layer_file = lc($self->layer);
+
+ if (index($layer_file, '.') < 0) {
+ $layer_file .= '.xml';
+ };
+
# Create new token stream
my $mtts = KorAP::XML::Index::MultiTermTokenStream->new;
- my $path = $self->path . lc($self->foundry) . '/' . lc($self->layer) . '.xml';
+ my $path = $self->path . lc($self->foundry) . '/' . $layer_file;
unless (-e $path) {
$self->log->warn('Unable to load base tokenization: ' . $path);
diff --git a/script/korapxml2krill b/script/korapxml2krill
index c264436..c859920 100644
--- a/script/korapxml2krill
+++ b/script/korapxml2krill
@@ -513,7 +513,10 @@
};
# Get tokenization basis
-my ($token_base_foundry, $token_base_layer) = split(/#/, $token_base) if ($token_base);
+my ($token_base_foundry, $token_base_layer) = split(/#/, $token_base || '');
+# (An unset base splits to the empty list, so both parts stay undefined.)
+# Remove file extension - skipped when no layer part was given
+$token_base_layer =~ s/\.xml$//i if defined $token_base_layer;
# TODO: This should not be initialized for batch
my $cache = Cache::FastMmap->new(
diff --git a/t/script/base.t b/t/script/base.t
index 326f4e6..08baa64 100644
--- a/t/script/base.t
+++ b/t/script/base.t
@@ -29,7 +29,7 @@
'--input' => $input,
'--output' => $output,
'--cache' => $cache,
- '-t' => 'Base#tokens_aggr',
+ '-t' => 'Base#tokens_aggr.xml',
'-bs' => 'DeReKo#Structure',
'-bp' => 'DeReKo#Structure',
'-l' => 'INFO'