Added configuration to script

Change-Id: Ia76b3096465deeed53a57f09540f5d8834970cea
diff --git a/script/korapxml2krill b/script/korapxml2krill
index af1da02..a439fff 100644
--- a/script/korapxml2krill
+++ b/script/korapxml2krill
@@ -16,6 +16,7 @@
 use KorAP::XML::Archive;
 use KorAP::XML::Tokenizer;
 use KorAP::XML::Batch::File;
+use Config::Simple;
 use Parallel::ForkManager;
 use v5.10;
 use Sys::Info;
@@ -97,9 +98,12 @@
 # 2017/04/06
 # - added support for wildcards in input
 #
+# 2017/04/07
+# - support configuration option
+#
 # ----------------------------------------------------------
 
-our $LAST_CHANGE = '2017/04/06';
+our $LAST_CHANGE = '2017/04/07';
 our $LOCAL = $FindBin::Bin;
 our $VERSION_MSG = <<"VERSION";
 Version $KorAP::XML::Krill::VERSION - diewald\@ids-mannheim.de - $LAST_CHANGE
@@ -121,22 +125,23 @@
   'output|o=s'  => \(my $output),
   'overwrite|w' => \(my $overwrite),
   'meta|m=s'    => \(my $meta),
-  'token|t=s'   => \(my $token_base = 'OpenNLP#tokens'),
-  'base-sentences|bs=s' => \(my $base_sentences = ''),
-  'base-paragraphs|bp=s' => \(my $base_paragraphs = ''),
-  'base-pagebreaks|bpb=s' => \(my $base_pagebreaks = ''),
+  'token|t=s'   => \(my $token_base),
+  'base-sentences|bs=s'   => \(my $base_sentences),
+  'base-paragraphs|bp=s'  => \(my $base_paragraphs),
+  'base-pagebreaks|bpb=s' => \(my $base_pagebreaks),
   'gzip|z'      => \(my $gzip),
   'skip|s=s'    => \@skip,
   'sigle|sg=s'  => \@sigle,
-  'cache|c=s'   => \(my $cache_file = 'korapxml2krill.cache'),
-  'log|l=s'     => \(my $log_level = 'ERROR'),
+  'cache|c=s'   => \(my $cache_file),
+  'config|cfg=s' => \(my $cfg_file),
+  'log|l=s'     => \(my $log_level),
   'anno|a=s'    => \@anno,
   'primary|p!'  => \(my $primary),
   'pretty|y'    => \(my $pretty),
-  'jobs|j=i'    => \(my $jobs = 0),
-  'cache-size|cs=s'  => \(my $cache_size = '50m'),
-  'cache-delete|cd!' => \(my $cache_delete = 1),
-  'cache-init|ci!'   => \(my $cache_init = 1),
+  'jobs|j=i'    => \(my $jobs),
+  'cache-size|cs=s'  => \(my $cache_size),
+  'cache-delete|cd!' => \(my $cache_delete),
+  'cache-init|ci!'   => \(my $cache_init),
   'help|h'      => sub {
     pod2usage(
       -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
@@ -154,6 +159,113 @@
   }
 );
 
+# Load from configuration (values given on the command line take precedence)
+warn "Config file $cfg_file not found\n" if $cfg_file && !-e $cfg_file;
+if ($cfg_file && -e $cfg_file) {
+  print "Reading config from $cfg_file\n";
+
+  my %config;
+  Config::Simple->import_from($cfg_file, \%config)
+    or warn "Unable to read config from $cfg_file\n";
+
+  # Overwrite
+  if (!defined($overwrite) && defined $config{overwrite}) {
+    $overwrite = $config{overwrite};
+  };
+
+  # Gzip
+  if (!defined($gzip) && defined $config{gzip}) {
+    $gzip = $config{gzip};
+  };
+
+  # Jobs
+  if (!defined($jobs) && defined $config{jobs}) {
+    $jobs = $config{jobs};
+  };
+
+  # Token base
+  if (!defined($token_base) && defined $config{token}) {
+    $token_base = $config{token};
+  };
+
+  # Cache file
+  if (!defined($cache_file) && defined $config{cache}) {
+    $cache_file = $config{cache};
+  };
+
+  # Cache size
+  if (!defined($cache_size) && defined $config{'cache-size'}) {
+    $cache_size = $config{'cache-size'};
+  };
+
+  # Cache delete
+  if (!defined($cache_delete) && defined $config{'cache-delete'}) {
+    $cache_delete = $config{'cache-delete'} ;
+  };
+
+  # Cache init
+  if (!(defined $cache_init) && defined $config{'cache-init'}) {
+    $cache_init = $config{'cache-init'} ;
+  };
+
+  # Meta
+  if (!(defined $meta) && defined $config{'meta'}) {
+    $meta = $config{'meta'} ;
+  };
+
+  # Output
+  if (!(defined $output) && defined $config{'output'}) {
+    $output = $config{'output'} ;
+  };
+
+  # Base-sentences
+  if (!(defined $base_sentences) && defined $config{'base-sentences'}) {
+    $base_sentences = $config{'base-sentences'} ;
+  };
+
+  # Base-paragraphs
+  if (!(defined $base_paragraphs) && defined $config{'base-paragraphs'}) {
+    $base_paragraphs = $config{'base-paragraphs'} ;
+  };
+
+  # Base-pagebreaks
+  if (!(defined $base_pagebreaks) && defined $config{'base-pagebreaks'}) {
+    $base_pagebreaks = $config{'base-pagebreaks'} ;
+  };
+
+  # Log
+  if (!(defined $log_level) && defined $config{'log'}) {
+    $log_level = $config{'log'} ;
+  };
+
+  # Skip (semicolon separated list, only used if no --skip was passed)
+  if (!scalar(@skip) && defined $config{'skip'}) {
+    @skip = split /\s*;\s*/, $config{'skip'} ;
+  };
+
+  # Sigle (semicolon separated list, only used if no --sigle was passed)
+  if (!scalar(@sigle) && defined $config{'sigle'}) {
+    @sigle = split /\s*;\s*/, $config{'sigle'} ;
+  };
+
+  # Anno (semicolon separated list, only used if no --anno was passed)
+  if (!scalar(@anno) && defined $config{'anno'}) {
+    @anno = split /\s*;\s*/, $config{'anno'} ;
+  };
+};
+
+# Set defaults for options given neither on the command line nor in the config
+$token_base      //= 'OpenNLP#tokens';
+$cache_file      //= 'korapxml2krill.cache';
+$cache_size      //= '50m';
+$jobs            //= 0;
+$cache_delete    //= 1;
+$cache_init      //= 1;
+$log_level       //= 'ERROR';
+$base_sentences  //= '';
+$base_paragraphs //= '';
+$base_pagebreaks //= '';
+
 $base_sentences  = lc $base_sentences;
 $base_paragraphs = lc $base_paragraphs;
 $base_pagebreaks = lc $base_pagebreaks;
@@ -183,10 +295,10 @@
 my $log = Log::Log4perl->get_logger('main');
 
 
-if ($jobs == -1) {
+if ($jobs eq '-1') { # NOTE(review): presumably eq because $jobs may be a string from the config file - confirm
   state $cores = Sys::Info->new->device('CPU')->count;
   $jobs = ceil(5 * $cores);
-  $log->info("Run using $jobs jobs");
+  $log->info("Run using $jobs jobs on $cores cores");
 };
 
 
@@ -351,7 +463,7 @@
 
   if (scalar(@new_input) > scalar(@input)) {
     @input = sort { length($a) <=> length($b) } @new_input;
-    print 'Input rewritten to ' . join(',', @input);
+    print 'Input rewritten to ' . join(', ', @input)."\n";
   };
 };
 
@@ -870,6 +982,23 @@
 Can be flagged using C<--no-cache-delete> as well.
 Defaults to C<true>.
 
+=item B<--config|-cfg>
+
+Configure the parameters of your call in a file of key-value pairs
+with whitespace separator (command line options take precedence):
+
+  overwrite 1
+  token     DeReKo#Structure
+  ...
+
+Supported parameters are:
+C<overwrite>, C<gzip>, C<jobs>,
+C<token>, C<log>, C<cache>, C<cache-size>, C<cache-delete>, C<cache-init>,
+C<meta>, C<output>, C<base-sentences>, C<base-paragraphs>,
+C<base-pagebreaks>, C<skip> (semicolon separated), C<sigle>
+(semicolon separated), C<anno> (semicolon separated).
+
+
 =item B<--sigle|-sg>
 
 Extract the given texts.