Added configuration file support (--config) to script
Change-Id: Ia76b3096465deeed53a57f09540f5d8834970cea
diff --git a/Changes b/Changes
index 808cc10..bea5911 100644
--- a/Changes
+++ b/Changes
@@ -1,3 +1,9 @@
+0.27 2017-04-07
+ - Support configuration files.
+
+0.26 2017-04-06
+ - Support wildcards on input.
+
0.25 2017-03-14
- Updated to Mojolicious 7.20
- Fixed meta treatment in case analytic and monogr
diff --git a/MANIFEST b/MANIFEST
index 826961f..b1b3551 100755
--- a/MANIFEST
+++ b/MANIFEST
@@ -101,6 +101,8 @@
t/script/usage.t
t/script/extract.t
t/script/archive.t
+t/script/config.t
+t/script/base.t
t/corpus/archive.zip
t/corpus/archive_rei.zip
t/corpus/BZK/header.xml
diff --git a/Makefile.PL b/Makefile.PL
index 4473073..a2fc10b 100644
--- a/Makefile.PL
+++ b/Makefile.PL
@@ -37,7 +37,8 @@
'bytes' => 0,
'Pod::Usage' => 0,
'Cache::FastMmap' => 1.40,
- 'Sys::Info' => 0.78
+ 'Sys::Info' => 0.78,
+ 'Config::Simple' => 4.58
},
MIN_PERL_VERSION => '5.014',
test => {
diff --git a/Readme.pod b/Readme.pod
index e1f1d8f..b2cbd45 100644
--- a/Readme.pod
+++ b/Readme.pod
@@ -203,6 +203,23 @@
Can be flagged using C<--no-cache-delete> as well.
Defaults to C<true>.
+=item B<--config|-cfg>
+
+Configure the parameters of your call in a file
+of whitespace-separated key-value pairs, one pair per line:
+
+ overwrite 1
+ token DeReKo#Structure
+ ...
+
+Supported parameters are:
+C<overwrite>, C<gzip>, C<jobs>,
+C<token>, C<log>, C<cache>, C<cache-size>, C<cache-delete>, C<cache-init>,
+C<meta>, C<output>, C<base-sentences>, C<base-paragraphs>,
+C<base-pagebreaks>, C<skip> (semicolon separated), C<sigle>
+(semicolon separated), C<anno> (semicolon separated).
+
+
=item B<--sigle|-sg>
Extract the given texts.
diff --git a/lib/KorAP/XML/Krill.pm b/lib/KorAP/XML/Krill.pm
index 1435796..8e5ae8f 100644
--- a/lib/KorAP/XML/Krill.pm
+++ b/lib/KorAP/XML/Krill.pm
@@ -16,7 +16,7 @@
use Data::Dumper;
use File::Spec::Functions qw/catdir catfile catpath splitdir splitpath rel2abs/;
-our $VERSION = '0.26';
+our $VERSION = '0.27';
has 'path';
has [qw/text_sigle doc_sigle corpus_sigle/];
diff --git a/script/korapxml2krill b/script/korapxml2krill
index af1da02..a439fff 100644
--- a/script/korapxml2krill
+++ b/script/korapxml2krill
@@ -16,6 +16,7 @@
use KorAP::XML::Archive;
use KorAP::XML::Tokenizer;
use KorAP::XML::Batch::File;
+use Config::Simple;
use Parallel::ForkManager;
use v5.10;
use Sys::Info;
@@ -97,9 +98,12 @@
# 2017/04/06
# - added support for wildcards in input
#
+# 2017/04/07
+# - support configuration option
+#
# ----------------------------------------------------------
-our $LAST_CHANGE = '2017/04/06';
+our $LAST_CHANGE = '2017/04/07';
our $LOCAL = $FindBin::Bin;
our $VERSION_MSG = <<"VERSION";
Version $KorAP::XML::Krill::VERSION - diewald\@ids-mannheim.de - $LAST_CHANGE
@@ -121,22 +125,23 @@
'output|o=s' => \(my $output),
'overwrite|w' => \(my $overwrite),
'meta|m=s' => \(my $meta),
- 'token|t=s' => \(my $token_base = 'OpenNLP#tokens'),
- 'base-sentences|bs=s' => \(my $base_sentences = ''),
- 'base-paragraphs|bp=s' => \(my $base_paragraphs = ''),
- 'base-pagebreaks|bpb=s' => \(my $base_pagebreaks = ''),
+ 'token|t=s' => \(my $token_base),
+ 'base-sentences|bs=s' => \(my $base_sentences),
+ 'base-paragraphs|bp=s' => \(my $base_paragraphs),
+ 'base-pagebreaks|bpb=s' => \(my $base_pagebreaks),
'gzip|z' => \(my $gzip),
'skip|s=s' => \@skip,
'sigle|sg=s' => \@sigle,
- 'cache|c=s' => \(my $cache_file = 'korapxml2krill.cache'),
- 'log|l=s' => \(my $log_level = 'ERROR'),
+ 'cache|c=s' => \(my $cache_file),
+ 'config|cfg=s' => \(my $cfg_file),
+ 'log|l=s' => \(my $log_level),
'anno|a=s' => \@anno,
'primary|p!' => \(my $primary),
'pretty|y' => \(my $pretty),
- 'jobs|j=i' => \(my $jobs = 0),
- 'cache-size|cs=s' => \(my $cache_size = '50m'),
- 'cache-delete|cd!' => \(my $cache_delete = 1),
- 'cache-init|ci!' => \(my $cache_init = 1),
+ 'jobs|j=i' => \(my $jobs),
+ 'cache-size|cs=s' => \(my $cache_size),
+ 'cache-delete|cd!' => \(my $cache_delete),
+ 'cache-init|ci!' => \(my $cache_init),
'help|h' => sub {
pod2usage(
-sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
@@ -154,6 +159,113 @@
}
);
+# Load from configuration.
+# Values given on the command line always take precedence: a value from
+# the configuration file is only used for options not set there.
+if (defined $cfg_file) {
+  if (-e $cfg_file) {
+
+    print "Reading config from $cfg_file\n";
+
+    my %config;
+
+    # import_from() returns undef on failure - report the problem
+    # instead of silently continuing with the defaults only
+    Config::Simple->import_from($cfg_file, \%config)
+      or warn "Unable to read config from $cfg_file: " .
+        (Config::Simple->error // 'unknown error') . "\n";
+
+    # Scalar options: configuration key => variable filled by GetOptions
+    my %scalar_for = (
+      'overwrite'       => \$overwrite,
+      'gzip'            => \$gzip,
+      'jobs'            => \$jobs,
+      'token'           => \$token_base,
+      'cache'           => \$cache_file,
+      'cache-size'      => \$cache_size,
+      'cache-delete'    => \$cache_delete,
+      'cache-init'      => \$cache_init,
+      'meta'            => \$meta,
+      'output'          => \$output,
+      'base-sentences'  => \$base_sentences,
+      'base-paragraphs' => \$base_paragraphs,
+      'base-pagebreaks' => \$base_pagebreaks,
+      'log'             => \$log_level
+    );
+
+    # Only fill options that are still undefined after GetOptions
+    foreach my $key (keys %scalar_for) {
+      ${ $scalar_for{$key} } //= $config{$key};
+    };
+
+    # List options: given as one semicolon separated value in the file
+    my %list_for = (
+      'skip'  => \@skip,
+      'sigle' => \@sigle,
+      'anno'  => \@anno
+    );
+
+    # Only fill lists that received no value on the command line
+    foreach my $key (keys %list_for) {
+      next if scalar(@{ $list_for{$key} }) || !defined $config{$key};
+      @{ $list_for{$key} } = split /\s*;\s*/, $config{$key};
+    };
+  }
+  else {
+    # Keep running with defaults, but tell the user the file was not used
+    warn "Config file $cfg_file does not exist - ignoring\n";
+  };
+};
+
+# Set defaults for all options that were given neither on the
+# command line nor in the configuration file
+$token_base //= 'OpenNLP#tokens';
+$cache_file //= 'korapxml2krill.cache';
+$cache_size //= '50m';
+$jobs //= 0;
+$cache_delete //= 1;
+$cache_init //= 1;
+$log_level //= 'ERROR';
+$base_sentences //= '';
+$base_paragraphs //= '';
+$base_pagebreaks //= '';
+
$base_sentences = lc $base_sentences;
$base_paragraphs = lc $base_paragraphs;
$base_pagebreaks = lc $base_pagebreaks;
@@ -183,10 +295,10 @@
my $log = Log::Log4perl->get_logger('main');
-if ($jobs == -1) {
+if ($jobs eq '-1') { # -1: derive the job count from the CPU core count (five jobs per core)
state $cores = Sys::Info->new->device('CPU')->count;
$jobs = ceil(5 * $cores);
- $log->info("Run using $jobs jobs");
+ $log->info("Run using $jobs jobs on $cores cores");
};
@@ -351,7 +463,7 @@
if (scalar(@new_input) > scalar(@input)) {
@input = sort { length($a) <=> length($b) } @new_input;
- print 'Input rewritten to ' . join(',', @input);
+ print 'Input rewritten to ' . join(', ', @input)."\n";
};
};
@@ -870,6 +982,23 @@
Can be flagged using C<--no-cache-delete> as well.
Defaults to C<true>.
+=item B<--config|-cfg>
+
+Configure the parameters of your call in a file
+of whitespace-separated key-value pairs, one pair per line:
+
+ overwrite 1
+ token DeReKo#Structure
+ ...
+
+Supported parameters are:
+C<overwrite>, C<gzip>, C<jobs>,
+C<token>, C<log>, C<cache>, C<cache-size>, C<cache-delete>, C<cache-init>,
+C<meta>, C<output>, C<base-sentences>, C<base-paragraphs>,
+C<base-pagebreaks>, C<skip> (semicolon separated), C<sigle>
+(semicolon separated), C<anno> (semicolon separated).
+
+
=item B<--sigle|-sg>
Extract the given texts.
diff --git a/t/script/config.t b/t/script/config.t
new file mode 100644
index 0000000..8af59c5
--- /dev/null
+++ b/t/script/config.t
@@ -0,0 +1,67 @@
+#!/usr/bin/env perl
+use strict;
+use warnings;
+
+use File::Basename 'dirname';
+use File::Spec::Functions qw/catdir catfile/;
+use File::Temp qw/ :POSIX tempfile/;
+use Mojo::File;
+use Test::More;
+use Test::Output qw/combined_from/;
+use Data::Dumper;
+# Test that korapxml2krill reads its options from a file passed via --config
+my $f = dirname(__FILE__);
+# Temporary config file (removed on exit) holding the options under test
+my ($fh, $cfg_file) = tempfile(UNLINK => 1);
+
+print $fh <<CFG;
+overwrite 0
+token OpenNLP#tokens
+base-sentences DeReKo#Structure
+base-paragraphs DeReKo#Structure
+base-pagebreaks DeReKo#Structure
+jobs -1
+meta I5
+gzip 1
+log DEBUG
+CFG
+
+close($fh);
+
+# Path for script
+my $script = catfile($f, '..', '..', 'script', 'korapxml2krill');
+
+# Path for input
+my $input = "'".catfile($f, '..', 'corpus', 'archives', 'wpd15*.zip') . "'";
+
+# Temporary output
+my $output = File::Temp->newdir(CLEANUP => 0);
+# Single shell command; the quoted wildcard is left for the script to expand
+my $call = join(
+  ' ',
+  'perl', $script,
+  'archive',
+  '--config' => $cfg_file,
+  '--input' => $input,
+  '--output' => $output
+);
+
+like($call, qr!config!, 'Call string');
+# Run the script, capturing STDOUT and STDERR together
+my $stdout = combined_from(sub { system($call) });
+
+like($stdout, qr!Reading config from!, 'Config');
+
+# Processed using gzip
+like($stdout, qr!Processed .+?WPD15-A00-00081\.json\.gz!, 'Gzip');
+
+# Check log level
+like($stdout, qr!Unable to parse KorAP::XML::Annotation::Glemm::Morpho!, 'Check log level');
+
+# Check wildcard input
+like($stdout, qr!Input rewritten to .+?wpd15-single\.zip,.+?wpd15-single\.malt\.zip,.+?wpd15-single\.corenlp\.zip,.+?wpd15-single\.opennlp\.zip,.+?wpd15-single\.mdparser\.zip,.+?wpd15-single\.tree_tagger\.zip!is, 'Wildcards');
+# "jobs -1" lets the script derive the job count from the CPU core count
+like($stdout, qr!Run using \d+ jobs on \d+ cores!, 'Jobs');
+
+done_testing;
+__END__