Added configuration to script

Change-Id: Ia76b3096465deeed53a57f09540f5d8834970cea
diff --git a/Changes b/Changes
index 808cc10..bea5911 100644
--- a/Changes
+++ b/Changes
@@ -1,3 +1,9 @@
+0.27 2017-04-07
+        - Support configuration files.
+
+0.26 2017-04-06
+        - Support wildcards on input.
+
 0.25 2017-03-14
         - Updated to Mojolicious 7.20
         - Fixed meta treatment in case analytic and monogr
diff --git a/MANIFEST b/MANIFEST
index 826961f..b1b3551 100755
--- a/MANIFEST
+++ b/MANIFEST
@@ -101,6 +101,8 @@
 t/script/usage.t
 t/script/extract.t
 t/script/archive.t
+t/script/config.t
+t/script/base.t
 t/corpus/archive.zip
 t/corpus/archive_rei.zip
 t/corpus/BZK/header.xml
diff --git a/Makefile.PL b/Makefile.PL
index 4473073..a2fc10b 100644
--- a/Makefile.PL
+++ b/Makefile.PL
@@ -37,7 +37,8 @@
     'bytes'           => 0,
     'Pod::Usage'      => 0,
     'Cache::FastMmap' => 1.40,
-    'Sys::Info'       => 0.78
+    'Sys::Info'       => 0.78,
+    'Config::Simple'  => 4.58
   },
   MIN_PERL_VERSION => '5.014',
   test => {
diff --git a/Readme.pod b/Readme.pod
index e1f1d8f..b2cbd45 100644
--- a/Readme.pod
+++ b/Readme.pod
@@ -203,6 +203,23 @@
 Can be flagged using C<--no-cache-delete> as well.
 Defaults to C<true>.
 
+=item B<--config|-cfg>
+
+Read the parameters of your call from a file of whitespace-separated
+key-value pairs (options given on the command line take precedence):
+
+  overwrite 1
+  token     DeReKo#Structure
+  ...
+
+Supported parameters are:
+C<overwrite>, C<gzip>, C<jobs>, C<token>, C<log>,
+C<cache>, C<cache-size>, C<cache-delete>, C<cache-init>, C<meta>,
+C<output>, C<base-sentences>, C<base-paragraphs>,
+C<base-pagebreaks>, C<skip> (semicolon separated), C<sigle>
+(semicolon separated), C<anno> (semicolon separated).
+
+
 =item B<--sigle|-sg>
 
 Extract the given texts.
diff --git a/lib/KorAP/XML/Krill.pm b/lib/KorAP/XML/Krill.pm
index 1435796..8e5ae8f 100644
--- a/lib/KorAP/XML/Krill.pm
+++ b/lib/KorAP/XML/Krill.pm
@@ -16,7 +16,7 @@
 use Data::Dumper;
 use File::Spec::Functions qw/catdir catfile catpath splitdir splitpath rel2abs/;
 
-our $VERSION = '0.26';
+our $VERSION = '0.27';
 
 has 'path';
 has [qw/text_sigle doc_sigle corpus_sigle/];
diff --git a/script/korapxml2krill b/script/korapxml2krill
index af1da02..a439fff 100644
--- a/script/korapxml2krill
+++ b/script/korapxml2krill
@@ -16,6 +16,7 @@
 use KorAP::XML::Archive;
 use KorAP::XML::Tokenizer;
 use KorAP::XML::Batch::File;
+use Config::Simple;
 use Parallel::ForkManager;
 use v5.10;
 use Sys::Info;
@@ -97,9 +98,12 @@
 # 2017/04/06
 # - added support for wildcards in input
 #
+# 2017/04/07
+# - support configuration option
+#
 # ----------------------------------------------------------
 
-our $LAST_CHANGE = '2017/04/06';
+our $LAST_CHANGE = '2017/04/07';
 our $LOCAL = $FindBin::Bin;
 our $VERSION_MSG = <<"VERSION";
 Version $KorAP::XML::Krill::VERSION - diewald\@ids-mannheim.de - $LAST_CHANGE
@@ -121,22 +125,23 @@
   'output|o=s'  => \(my $output),
   'overwrite|w' => \(my $overwrite),
   'meta|m=s'    => \(my $meta),
-  'token|t=s'   => \(my $token_base = 'OpenNLP#tokens'),
-  'base-sentences|bs=s' => \(my $base_sentences = ''),
-  'base-paragraphs|bp=s' => \(my $base_paragraphs = ''),
-  'base-pagebreaks|bpb=s' => \(my $base_pagebreaks = ''),
+  'token|t=s'   => \(my $token_base),
+  'base-sentences|bs=s'   => \(my $base_sentences),
+  'base-paragraphs|bp=s'  => \(my $base_paragraphs),
+  'base-pagebreaks|bpb=s' => \(my $base_pagebreaks),
   'gzip|z'      => \(my $gzip),
   'skip|s=s'    => \@skip,
   'sigle|sg=s'  => \@sigle,
-  'cache|c=s'   => \(my $cache_file = 'korapxml2krill.cache'),
-  'log|l=s'     => \(my $log_level = 'ERROR'),
+  'cache|c=s'   => \(my $cache_file),
+  'config|cfg=s' => \(my $cfg_file),
+  'log|l=s'     => \(my $log_level),
   'anno|a=s'    => \@anno,
   'primary|p!'  => \(my $primary),
   'pretty|y'    => \(my $pretty),
-  'jobs|j=i'    => \(my $jobs = 0),
-  'cache-size|cs=s'  => \(my $cache_size = '50m'),
-  'cache-delete|cd!' => \(my $cache_delete = 1),
-  'cache-init|ci!'   => \(my $cache_init = 1),
+  'jobs|j=i'    => \(my $jobs),
+  'cache-size|cs=s'  => \(my $cache_size),
+  'cache-delete|cd!' => \(my $cache_delete),
+  'cache-init|ci!'   => \(my $cache_init),
   'help|h'      => sub {
     pod2usage(
       -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
@@ -154,6 +159,113 @@
   }
 );
 
+# Load from configuration (options given on the command line take precedence)
+warn "Config file $cfg_file does not exist\n" if $cfg_file && ! -e $cfg_file;
+if ($cfg_file && -e $cfg_file) {
+  print "Reading config from $cfg_file\n";
+
+  my %config;
+  Config::Simple->import_from($cfg_file, \%config)
+    or warn "Unable to parse $cfg_file: " . (Config::Simple->error() // 'unknown error') . "\n";
+
+  # Overwrite
+  if (!defined($overwrite) && defined $config{overwrite}) {
+    $overwrite = $config{overwrite};
+  };
+
+  # Gzip
+  if (!defined($gzip) && defined $config{gzip}) {
+    $gzip = $config{gzip};
+  };
+
+  # Jobs
+  if (!defined($jobs) && defined $config{jobs}) {
+    $jobs = $config{jobs};
+  };
+
+  # Token base
+  if (!defined($token_base) && defined $config{token}) {
+    $token_base = $config{token};
+  };
+
+  # Cache file
+  if (!defined($cache_file) && defined $config{cache}) {
+    $cache_file = $config{cache};
+  };
+
+  # Cache size
+  if (!defined($cache_size) && defined $config{'cache-size'}) {
+    $cache_size = $config{'cache-size'};
+  };
+
+  # Cache delete
+  if (!defined($cache_delete) && defined $config{'cache-delete'}) {
+    $cache_delete = $config{'cache-delete'};
+  };
+
+  # Cache init
+  if (!defined($cache_init) && defined $config{'cache-init'}) {
+    $cache_init = $config{'cache-init'};
+  };
+
+  # Meta
+  if (!defined($meta) && defined $config{'meta'}) {
+    $meta = $config{'meta'};
+  };
+
+  # Output
+  if (!defined($output) && defined $config{'output'}) {
+    $output = $config{'output'};
+  };
+
+  # Base-sentences
+  if (!defined($base_sentences) && defined $config{'base-sentences'}) {
+    $base_sentences = $config{'base-sentences'};
+  };
+
+  # Base-paragraphs
+  if (!defined($base_paragraphs) && defined $config{'base-paragraphs'}) {
+    $base_paragraphs = $config{'base-paragraphs'};
+  };
+
+  # Base-pagebreaks
+  if (!defined($base_pagebreaks) && defined $config{'base-pagebreaks'}) {
+    $base_pagebreaks = $config{'base-pagebreaks'};
+  };
+
+  # Log
+  if (!defined($log_level) && defined $config{'log'}) {
+    $log_level = $config{'log'};
+  };
+
+  # Skip (semicolon separated list, only used if no --skip was passed)
+  if (!scalar(@skip) && defined $config{'skip'}) {
+    @skip = split /\s*;\s*/, $config{'skip'};
+  };
+
+  # Sigle (semicolon separated list, only used if no --sigle was passed)
+  if (!scalar(@sigle) && defined $config{'sigle'}) {
+    @sigle = split /\s*;\s*/, $config{'sigle'};
+  };
+
+  # Anno (semicolon separated list, only used if no --anno was passed)
+  if (!scalar(@anno) && defined $config{'anno'}) {
+    @anno = split /\s*;\s*/, $config{'anno'};
+  };
+};
+
+# Set defaults for options given neither on the command line nor in the config
+$token_base      //= 'OpenNLP#tokens';
+$cache_file      //= 'korapxml2krill.cache';
+$cache_size      //= '50m';
+$jobs            //= 0;
+$cache_delete    //= 1;
+$cache_init      //= 1;
+$log_level       //= 'ERROR';
+$base_sentences  //= '';
+$base_paragraphs //= '';
+$base_pagebreaks //= '';
+
 $base_sentences  = lc $base_sentences;
 $base_paragraphs = lc $base_paragraphs;
 $base_pagebreaks = lc $base_pagebreaks;
@@ -183,10 +295,10 @@
 my $log = Log::Log4perl->get_logger('main');
 
 
-if ($jobs == -1) {
+if ($jobs eq '-1') {
   state $cores = Sys::Info->new->device('CPU')->count;
   $jobs = ceil(5 * $cores);
-  $log->info("Run using $jobs jobs");
+  $log->info("Run using $jobs jobs on $cores cores");
 };
 
 
@@ -351,7 +463,7 @@
 
   if (scalar(@new_input) > scalar(@input)) {
     @input = sort { length($a) <=> length($b) } @new_input;
-    print 'Input rewritten to ' . join(',', @input);
+    print 'Input rewritten to ' . join(', ', @input)."\n";
   };
 };
 
@@ -870,6 +982,23 @@
 Can be flagged using C<--no-cache-delete> as well.
 Defaults to C<true>.
 
+=item B<--config|-cfg>
+
+Read the parameters of your call from a file of whitespace-separated
+key-value pairs (options given on the command line take precedence):
+
+  overwrite 1
+  token     DeReKo#Structure
+  ...
+
+Supported parameters are:
+C<overwrite>, C<gzip>, C<jobs>, C<token>, C<log>,
+C<cache>, C<cache-size>, C<cache-delete>, C<cache-init>, C<meta>,
+C<output>, C<base-sentences>, C<base-paragraphs>,
+C<base-pagebreaks>, C<skip> (semicolon separated), C<sigle>
+(semicolon separated), C<anno> (semicolon separated).
+
+
 =item B<--sigle|-sg>
 
 Extract the given texts.
diff --git a/t/script/config.t b/t/script/config.t
new file mode 100644
index 0000000..8af59c5
--- /dev/null
+++ b/t/script/config.t
@@ -0,0 +1,67 @@
+#!/usr/bin/env perl
+use strict;
+use warnings;
+
+use File::Basename 'dirname';
+use File::Spec::Functions qw/catdir catfile/;
+use File::Temp qw/ :POSIX tempfile/;
+use Mojo::File;
+use Test::More;
+use Test::Output qw/combined_from/;
+use Data::Dumper;
+
+my $f = dirname(__FILE__);
+
+my ($fh, $cfg_file) = tempfile(UNLINK => 1); # remove the config file on exit
+
+print $fh <<CFG;
+overwrite       0
+token           OpenNLP#tokens
+base-sentences  DeReKo#Structure
+base-paragraphs DeReKo#Structure
+base-pagebreaks DeReKo#Structure
+jobs            -1
+meta            I5
+gzip            1
+log             DEBUG
+CFG
+
+close($fh);
+
+# Path to the script under test
+my $script = catfile($f, '..', '..', 'script', 'korapxml2krill');
+
+# Input path, single-quoted so the wildcard is expanded by the script, not the shell
+my $input = "'".catfile($f, '..', 'corpus', 'archives', 'wpd15*.zip') . "'";
+
+# Temporary output directory (kept after the run)
+my $output = File::Temp->newdir(CLEANUP => 0);
+
+my $call = join(
+  ' ',
+  'perl', $script,
+  'archive',
+  '--config' => $cfg_file,
+  '--input' => $input,
+  '--output' => $output
+);
+
+like($call, qr!config!, 'Call string');
+
+my $stdout = combined_from(sub { system($call) });
+
+like($stdout, qr!Reading config from!, 'Config');
+
+# Processed using gzip
+like($stdout, qr!Processed .+?WPD15-A00-00081\.json\.gz!, 'Gzip');
+
+# Check log level
+like($stdout, qr!Unable to parse KorAP::XML::Annotation::Glemm::Morpho!, 'Check log level');
+
+# Check wildcard input
+like($stdout, qr!Input rewritten to .+?wpd15-single\.zip,.+?wpd15-single\.malt\.zip,.+?wpd15-single\.corenlp\.zip,.+?wpd15-single\.opennlp\.zip,.+?wpd15-single\.mdparser\.zip,.+?wpd15-single\.tree_tagger\.zip!is, 'Wildcards');
+
+like($stdout, qr!Run using \d+ jobs on \d+ cores!, 'Jobs');
+
+done_testing;
+__END__