Support serial conversion and input-base

Change-Id: I3225847e31c307e0b5bd3e108ae0f306d71d4395
diff --git a/Changes b/Changes
index 420f1df..cce79da 100644
--- a/Changes
+++ b/Changes
@@ -1,6 +1,8 @@
-0.27 2017-04-07
+0.27 2017-04-10
         - Support configuration files.
         - Support temporary extraction.
+        - Support serial conversion.
+        - Support input-base.
 
 0.26 2017-04-06
         - Support wildcards on input.
diff --git a/MANIFEST b/MANIFEST
index b1b3551..812c953 100755
--- a/MANIFEST
+++ b/MANIFEST
@@ -98,6 +98,7 @@
 t/sgbr/pos.t
 t/sgbr/token.t
 t/script/single.t
+t/script/serial.t
 t/script/usage.t
 t/script/extract.t
 t/script/archive.t
diff --git a/Makefile.PL b/Makefile.PL
index a2fc10b..ab696f0 100644
--- a/Makefile.PL
+++ b/Makefile.PL
@@ -38,7 +38,9 @@
     'Pod::Usage'      => 0,
     'Cache::FastMmap' => 1.40,
     'Sys::Info'       => 0.78,
-    'Config::Simple'  => 4.58
+    'Config::Simple'  => 4.58,
+    'String::Random'  => 0.29,
+    'File::Path'      => 2.12
   },
   MIN_PERL_VERSION => '5.014',
   test => {
diff --git a/script/korapxml2krill b/script/korapxml2krill
index 93e5eac..2ac6ddb 100644
--- a/script/korapxml2krill
+++ b/script/korapxml2krill
@@ -23,7 +23,9 @@
 use Sys::Info::Constants qw( :device_cpu );
 use File::Glob ':bsd_glob';
 use File::Temp qw/tempdir/;
-
+use File::Path qw(remove_tree make_path);
+use Mojo::Collection 'c';
+use String::Random qw(random_string);
 
 # use KorAP::XML::ForkPool;
 # TODO: use Parallel::Loops
@@ -104,6 +106,9 @@
 # - support configuration option
 # - support for temporary extraction
 #
+# 2017/04/10
+# - support serial processing
+# - support input root
 # ----------------------------------------------------------
 
 our $LAST_CHANGE = '2017/04/07';
@@ -112,12 +117,17 @@
 Version $KorAP::XML::Krill::VERSION - diewald\@ids-mannheim.de - $LAST_CHANGE
 VERSION
 
+# Prototypes
+sub get_file_name_from_glob($);
+sub get_file_name($);
+
 # Parse comand
 my $cmd;
 our @ARGV;
 if ($ARGV[0] && index($ARGV[0], '-') != 0) {
   $cmd = shift @ARGV;
 };
+my @keep_argv = @ARGV;
 
 my (@skip, @sigle, @anno, @input);
 my $text;
@@ -125,6 +135,7 @@
 # Parse options from the command line
 GetOptions(
   'input|i=s'   => \@input,
+  'input-base|ib=s' => \(my $input_base),
   'output|o=s'  => \(my $output),
   'overwrite|w' => \(my $overwrite),
   'meta|m=s'    => \(my $meta),
@@ -163,11 +174,9 @@
   }
 );
 
+
 # Load from configuration
 if ($cfg_file && -e $cfg_file) {
-
-  print "Reading config from $cfg_file\n";
-
   my %config;
 
   Config::Simple->import_from($cfg_file, \%config);
@@ -187,6 +196,11 @@
     $jobs = $config{jobs};
   };
 
+  # Input root base directory
+  if (!defined($input_base) && defined $config{'input-base'}) {
+    $input_base = $config{'input-base'};
+  };
+
   # temporary-extract
   if (!defined($extract_dir) && defined $config{'temporary-extract'}) {
     $extract_dir = $config{'temporary-extract'};
@@ -263,6 +277,7 @@
   };
 };
 
+
 # Set default token base
 $token_base      //= 'OpenNLP#tokens';
 $cache_file      //= 'korapxml2krill.cache';
@@ -279,6 +294,21 @@
 $base_paragraphs = lc $base_paragraphs;
 $base_pagebreaks = lc $base_pagebreaks;
 
+
+# Initialize log4perl object
+Log::Log4perl->init({
+  'log4perl.rootLogger' => uc($log_level) . ', STDERR',
+  'log4perl.appender.STDERR' => 'Log::Log4perl::Appender::ScreenColoredLevels',
+  'log4perl.appender.STDERR.layout' => 'PatternLayout',
+  'log4perl.appender.STDERR.layout.ConversionPattern' => '[%r] %F %L %c - %m%n'
+});
+
+my $log = Log::Log4perl->get_logger('main');
+
+
+print "Reading config from $cfg_file\n" if $cfg_file;
+
+
 my %ERROR_HASH = (
   -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
   -verbose  => 99,
@@ -293,16 +323,6 @@
 # Gzip has no effect, if no output is given
 pod2usage(%ERROR_HASH) if $gzip && !$output;
 
-# Initialize log4perl object
-Log::Log4perl->init({
-  'log4perl.rootLogger' => uc($log_level) . ', STDERR',
-  'log4perl.appender.STDERR' => 'Log::Log4perl::Appender::ScreenColoredLevels',
-  'log4perl.appender.STDERR.layout' => 'PatternLayout',
-  'log4perl.appender.STDERR.layout.ConversionPattern' => '[%r] %F %L %c - %m%n'
-});
-
-my $log = Log::Log4perl->get_logger('main');
-
 
 if ($jobs eq '-1') {
   state $cores = Sys::Info->new->device('CPU')->count;
@@ -311,6 +331,58 @@
 };
 
 
+# Start serial processing
+if ($cmd eq 'serial') {
+
+  if ($output && (!-e $output || !-d $output)) {
+    print "Directory '$output' does not exist.\n\n";
+    exit(0);
+  };
+
+  # Remove all inputs
+  my $remove_next = 0;
+  @keep_argv = @{c(@keep_argv)->grep(
+    sub {
+      # Input flag
+      if ($_ eq '-i' || $_ eq '--input' || $_ eq '--output' || $_ eq '-o') {
+        $remove_next = 1;
+        return 0;
+      }
+
+      # input value
+      elsif ($remove_next) {
+        $remove_next = 0;
+        return 0;
+      };
+
+      # Pass parameter
+      return 1;
+    }
+  )->to_array};
+
+
+  # Iterate over all inputs
+  foreach (@input) {
+
+    my $new_out = catdir($output, get_file_name_from_glob($_));
+
+    # Create new path
+    unless (make_path($new_out)) {
+      $log->error("Can\'t create path $new_out");
+      exit(0);
+    };
+
+    # Create archive command
+    my @archive_cmd = ($^X, $0, 'archive', @keep_argv, '-i', $_, '-o', $new_out);
+    print "Start serial processing of $_ to $new_out\n";
+
+    # Start archiving
+    system @archive_cmd;
+  };
+
+  exit(0);
+};
+
 my %skip;
 $skip{lc($_)} = 1 foreach @skip;
 
@@ -432,7 +504,6 @@
   anno      => \@filtered_anno
 );
 
-
 # Get file name based on path information
 sub get_file_name ($) {
   my $i = $input[0];
@@ -450,6 +521,21 @@
   return $file;
 };
 
+
+sub get_file_name_from_glob ($) {
+  my $glob = shift;
+  $glob =~ s/\.zip$//;          # Remove file extension
+  $glob =~ s{\/([^\/]+?)$}{$1}; # Remove path unix style
+  $glob =~ s{\\([^\\]+?)$}{$1}; # Remove path windows style
+  $glob =~ s/[\*\?]//g;         # Remove arbitrary fills
+  $glob =~ s/[\{\}\[\]]/-/g;    # Remove class and multiple brackets
+  $glob =~ s/\-\-+/-/g;         # Remove sequences of binding characters
+  $glob =~ s/^-//;              # Clean beginning
+  $glob =~ s/-$//;              # Clean end
+  return $glob;
+};
+
+
 # Convert sigle to path construct
 s!^\s*([^_]+?)_([^\.]+?)\.(.+?)\s*$!$1/$2/$3! foreach @sigle;
 
@@ -460,19 +546,25 @@
   };
 };
 
-# Glob files
+
+# Glob and prefix files
 if (@input) {
+
   my @new_input = ();
 
   # Iterate over all inputs
-  foreach (@input) {
-    push (@new_input, bsd_glob($_));
+  foreach my $wild_card (@input) {
+
+    # Prefix with input root
+    $wild_card = $input_base ? catfile($input_base, $wild_card) : $wild_card;
+
+    push (@new_input, bsd_glob($wild_card));
   };
 
-  if (scalar(@new_input) > scalar(@input)) {
-    @input = sort { length($a) <=> length($b) } @new_input;
-    print 'Input rewritten to ' . join(', ', @input)."\n";
-  };
+  # Sort files by length
+  @input = sort { length($a) <=> length($b) } @new_input;
+
+  print 'Input is ' . join(', ', @input)."\n";
 };
 
 
@@ -605,9 +697,6 @@
       );
       print "extracted.\n";
     };
-
-    print "\n";
-    # exit(1);
   }
 
   # Can't create archive object
@@ -640,9 +729,14 @@
 
       # Create a temporary directory
       if ($extract_dir eq ':temp:') {
-        $extract_dir = tempdir(CLEANUP => 1);
+        $extract_dir = tempdir(CLEANUP => 0);
+        print "Temporarily extract to $extract_dir\n";
       };
 
+      # Add some random extra to avoid clashes with multiple archives
+      $extract_dir = catdir($extract_dir, random_string('cccccc'));
+
+      # Extract to temprary directory
       if ($archive->extract_all($extract_dir, $jobs)) {
         @input = ($extract_dir);
       }
@@ -816,11 +910,20 @@
   # Delete cache file
   unlink($cache_file) if $cache_delete;
 
+  print timestr(timediff(Benchmark->new, $t))."\n";
   print "Done.\n";
-  print timestr(timediff(Benchmark->new, $t))."\n\n";
 };
 
 
+# Cleanup temporary extraction directory
+if ($extract_dir) {
+  my $objects = remove_tree($extract_dir, { safe => 1 });
+  print "Removed directory $extract_dir with $objects objects.\n";
+};
+
+
+print "\n";
+
 __END__
 
 =pod
@@ -877,6 +980,16 @@
 
 Extracts KorAP-XML documents from a zip file.
 
+=item B<serial>
+
+  $ korapxml2krill serial -i <archive1> -i <archive2> -o <directory> -cfg <config-file>
+
+Convert archives sequentially. The inputs are not merged but treated
+as they are (so they may be premerged or globs).
+the C<--out> directory is treated as the base directory where subdirectories
+are created based on the archive name.
+
+
 =back
 
 
@@ -918,6 +1031,11 @@
 B<The root folder switch using the hash sign is experimental and
 may vanish in future versions.>
 
+=item B<--input-base|-ib> <directory>
+
+The base directory for inputs.
+
+
 =item B<--output|-o> <directory|file>
 
 Output folder for archive processing or
@@ -1038,7 +1156,7 @@
   ...
 
 Supported parameters are:
-C<overwrite>, C<gzip>, C<jobs>,
+C<overwrite>, C<gzip>, C<jobs>, C<input-base>,
 C<token>, C<log>, C<cache>, C<cache-size>, C<cache-delete>, C<meta>,
 C<output>, C<base-sentences>, C<temp-extract>, C<base-paragraphs>,
 C<base-pagebreaks>, C<skip> (semicolon separated), C<sigle>
diff --git a/t/script/archive.t b/t/script/archive.t
index 17ab672..69546c0 100644
--- a/t/script/archive.t
+++ b/t/script/archive.t
@@ -185,7 +185,7 @@
   sub {
     system($call);
   },
-  qr!Input rewritten to .+?wpd15-single\.zip,.+?wpd15-single\.malt\.zip,.+?wpd15-single\.corenlp\.zip,.+?wpd15-single\.opennlp\.zip,.+?wpd15-single\.mdparser\.zip,.+?wpd15-single\.tree_tagger\.zip!is,
+  qr!Input is .+?wpd15-single\.zip,.+?wpd15-single\.malt\.zip,.+?wpd15-single\.corenlp\.zip,.+?wpd15-single\.opennlp\.zip,.+?wpd15-single\.mdparser\.zip,.+?wpd15-single\.tree_tagger\.zip!is,
   $call
 );
 
diff --git a/t/script/config.t b/t/script/config.t
index 8af59c5..8a5f46e 100644
--- a/t/script/config.t
+++ b/t/script/config.t
@@ -14,7 +14,9 @@
 
 my ($fh, $cfg_file) = tempfile();
 
-print $fh <<CFG;
+my $input_base = catdir($f, '..', 'corpus', 'archives');
+
+print $fh <<"CFG";
 overwrite       0
 token           OpenNLP#tokens
 base-sentences  DeReKo#Structure
@@ -24,6 +26,7 @@
 meta            I5
 gzip            1
 log             DEBUG
+input-base      $input_base
 CFG
 
 close($fh);
@@ -32,7 +35,7 @@
 my $script = catfile($f, '..', '..', 'script', 'korapxml2krill');
 
 # Path for input
-my $input = "'".catfile($f, '..', 'corpus', 'archives', 'wpd15*.zip') . "'";
+my $input = "'".catfile('wpd15*.zip') . "'";
 
 # Temporary output
 my $output = File::Temp->newdir(CLEANUP => 0);
@@ -59,7 +62,7 @@
 like($stdout, qr!Unable to parse KorAP::XML::Annotation::Glemm::Morpho!, 'Check log level');
 
 # Check wildcard input
-like($stdout, qr!Input rewritten to .+?wpd15-single\.zip,.+?wpd15-single\.malt\.zip,.+?wpd15-single\.corenlp\.zip,.+?wpd15-single\.opennlp\.zip,.+?wpd15-single\.mdparser\.zip,.+?wpd15-single\.tree_tagger\.zip!is, 'Wildcards');
+like($stdout, qr!Input is .+?wpd15-single\.zip,.+?wpd15-single\.malt\.zip,.+?wpd15-single\.corenlp\.zip,.+?wpd15-single\.opennlp\.zip,.+?wpd15-single\.mdparser\.zip,.+?wpd15-single\.tree_tagger\.zip!is, 'Wildcards');
 
 like($stdout, qr!Run using \d+ jobs on \d+ cores!, 'Jobs');
 
diff --git a/t/script/serial.t b/t/script/serial.t
new file mode 100644
index 0000000..70febc3
--- /dev/null
+++ b/t/script/serial.t
@@ -0,0 +1,39 @@
+use strict;
+use warnings;
+use Test::More;
+use Test::Output;
+use File::Basename 'dirname';
+use File::Spec::Functions qw/catdir catfile/;
+use File::Temp qw/tempdir/;
+
+my $f = dirname(__FILE__);
+my $script = catfile($f, '..', '..', 'script', 'korapxml2krill');
+
+my $input_base = catdir($f, '..', 'corpus', 'archives');
+
+# Temporary output
+my $output = File::Temp->newdir(CLEANUP => 0);
+
+my $call = join(
+  ' ',
+  'perl', $script,
+  'serial',
+  '-i' => '"ngafy*.zip"',
+  '-i' => '"tree*.zip"',
+  '-ib' => $input_base,
+  '-o' => $output,
+  '-l' => 'WARN'
+);
+
+# Test without parameters
+combined_like(
+  sub {
+    system($call);
+  },
+  qr!Start serial processing of ngafy\*\.zip!s,
+  $call
+);
+
+
+done_testing;
+__END__