Support serial conversion and input-base

Change-Id: I3225847e31c307e0b5bd3e108ae0f306d71d4395
diff --git a/script/korapxml2krill b/script/korapxml2krill
index 93e5eac..2ac6ddb 100644
--- a/script/korapxml2krill
+++ b/script/korapxml2krill
@@ -23,7 +23,9 @@
 use Sys::Info::Constants qw( :device_cpu );
 use File::Glob ':bsd_glob';
 use File::Temp qw/tempdir/;
-
+use File::Path qw(remove_tree make_path);
+use Mojo::Collection 'c';
+use String::Random qw(random_string);
 
 # use KorAP::XML::ForkPool;
 # TODO: use Parallel::Loops
@@ -104,6 +106,9 @@
 # - support configuration option
 # - support for temporary extraction
 #
+# 2017/04/10
+# - support serial processing
+# - support input root
 # ----------------------------------------------------------
 
 our $LAST_CHANGE = '2017/04/07';
@@ -112,12 +117,17 @@
 Version $KorAP::XML::Krill::VERSION - diewald\@ids-mannheim.de - $LAST_CHANGE
 VERSION
 
+# Prototypes
+sub get_file_name_from_glob($);
+sub get_file_name($);
+
 # Parse comand
 my $cmd;
 our @ARGV;
 if ($ARGV[0] && index($ARGV[0], '-') != 0) {
   $cmd = shift @ARGV;
 };
+my @keep_argv = @ARGV;
 
 my (@skip, @sigle, @anno, @input);
 my $text;
@@ -125,6 +135,7 @@
 # Parse options from the command line
 GetOptions(
   'input|i=s'   => \@input,
+  'input-base|ib=s' => \(my $input_base),
   'output|o=s'  => \(my $output),
   'overwrite|w' => \(my $overwrite),
   'meta|m=s'    => \(my $meta),
@@ -163,11 +174,9 @@
   }
 );
 
+
 # Load from configuration
 if ($cfg_file && -e $cfg_file) {
-
-  print "Reading config from $cfg_file\n";
-
   my %config;
 
   Config::Simple->import_from($cfg_file, \%config);
@@ -187,6 +196,11 @@
     $jobs = $config{jobs};
   };
 
+  # Input root base directory
+  if (!defined($input_base) && defined $config{'input-base'}) {
+    $input_base = $config{'input-base'};
+  };
+
   # temporary-extract
   if (!defined($extract_dir) && defined $config{'temporary-extract'}) {
     $extract_dir = $config{'temporary-extract'};
@@ -263,6 +277,7 @@
   };
 };
 
+
 # Set default token base
 $token_base      //= 'OpenNLP#tokens';
 $cache_file      //= 'korapxml2krill.cache';
@@ -279,6 +294,21 @@
 $base_paragraphs = lc $base_paragraphs;
 $base_pagebreaks = lc $base_pagebreaks;
 
+
+# Initialize log4perl object
+Log::Log4perl->init({
+  'log4perl.rootLogger' => uc($log_level) . ', STDERR',
+  'log4perl.appender.STDERR' => 'Log::Log4perl::Appender::ScreenColoredLevels',
+  'log4perl.appender.STDERR.layout' => 'PatternLayout',
+  'log4perl.appender.STDERR.layout.ConversionPattern' => '[%r] %F %L %c - %m%n'
+});
+
+my $log = Log::Log4perl->get_logger('main');
+
+
+print "Reading config from $cfg_file\n" if $cfg_file;
+
+
 my %ERROR_HASH = (
   -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
   -verbose  => 99,
@@ -293,16 +323,6 @@
 # Gzip has no effect, if no output is given
 pod2usage(%ERROR_HASH) if $gzip && !$output;
 
-# Initialize log4perl object
-Log::Log4perl->init({
-  'log4perl.rootLogger' => uc($log_level) . ', STDERR',
-  'log4perl.appender.STDERR' => 'Log::Log4perl::Appender::ScreenColoredLevels',
-  'log4perl.appender.STDERR.layout' => 'PatternLayout',
-  'log4perl.appender.STDERR.layout.ConversionPattern' => '[%r] %F %L %c - %m%n'
-});
-
-my $log = Log::Log4perl->get_logger('main');
-
 
 if ($jobs eq '-1') {
   state $cores = Sys::Info->new->device('CPU')->count;
@@ -311,6 +331,58 @@
 };
 
 
+# Start serial processing
+if ($cmd eq 'serial') {
+
+  if ($output && (!-e $output || !-d $output)) {
+    print "Directory '$output' does not exist.\n\n";
+    exit(0);
+  };
+
+  # Remove all inputs
+  my $remove_next = 0;
+  @keep_argv = @{c(@keep_argv)->grep(
+    sub {
+      # Input flag
+      if ($_ eq '-i' || $_ eq '--input' || $_ eq '--output' || $_ eq '-o') {
+        $remove_next = 1;
+        return 0;
+      }
+
+      # input value
+      elsif ($remove_next) {
+        $remove_next = 0;
+        return 0;
+      };
+
+      # Pass parameter
+      return 1;
+    }
+  )->to_array};
+
+
+  # Iterate over all inputs
+  foreach (@input) {
+
+    my $new_out = catdir($output, get_file_name_from_glob($_));
+
+    # Create new path
+    unless (make_path($new_out)) {
+      $log->error("Can\'t create path $new_out");
+      exit(0);
+    };
+
+    # Create archive command
+    my @archive_cmd = ($^X, $0, 'archive', @keep_argv, '-i', $_, '-o', $new_out);
+    print "Start serial processing of $_ to $new_out\n";
+
+    # Start archiving
+    system @archive_cmd;
+  };
+
+  exit(0);
+};
+
 my %skip;
 $skip{lc($_)} = 1 foreach @skip;
 
@@ -432,7 +504,6 @@
   anno      => \@filtered_anno
 );
 
-
 # Get file name based on path information
 sub get_file_name ($) {
   my $i = $input[0];
@@ -450,6 +521,21 @@
   return $file;
 };
 
+
+sub get_file_name_from_glob ($) {
+  my $glob = shift;
+  $glob =~ s/\.zip$//;          # Remove file extension
+  $glob =~ s{\/([^\/]+?)$}{$1}; # Remove path unix style
+  $glob =~ s{\\([^\\]+?)$}{$1}; # Remove path windows style
+  $glob =~ s/[\*\?]//g;         # Remove arbitrary fills
+  $glob =~ s/[\{\}\[\]]/-/g;    # Remove class and multiple brackets
+  $glob =~ s/\-\-+/-/g;         # Remove sequences of binding characters
+  $glob =~ s/^-//;              # Clean beginning
+  $glob =~ s/-$//;              # Clean end
+  return $glob;
+};
+
+
 # Convert sigle to path construct
 s!^\s*([^_]+?)_([^\.]+?)\.(.+?)\s*$!$1/$2/$3! foreach @sigle;
 
@@ -460,19 +546,25 @@
   };
 };
 
-# Glob files
+
+# Glob and prefix files
 if (@input) {
+
   my @new_input = ();
 
   # Iterate over all inputs
-  foreach (@input) {
-    push (@new_input, bsd_glob($_));
+  foreach my $wild_card (@input) {
+
+    # Prefix with input root
+    $wild_card = $input_base ? catfile($input_base, $wild_card) : $wild_card;
+
+    push (@new_input, bsd_glob($wild_card));
   };
 
-  if (scalar(@new_input) > scalar(@input)) {
-    @input = sort { length($a) <=> length($b) } @new_input;
-    print 'Input rewritten to ' . join(', ', @input)."\n";
-  };
+  # Sort files by length
+  @input = sort { length($a) <=> length($b) } @new_input;
+
+  print 'Input is ' . join(', ', @input)."\n";
 };
 
 
@@ -605,9 +697,6 @@
       );
       print "extracted.\n";
     };
-
-    print "\n";
-    # exit(1);
   }
 
   # Can't create archive object
@@ -640,9 +729,14 @@
 
       # Create a temporary directory
       if ($extract_dir eq ':temp:') {
-        $extract_dir = tempdir(CLEANUP => 1);
+        $extract_dir = tempdir(CLEANUP => 0);
+        print "Temporarily extract to $extract_dir\n";
       };
 
+      # Add some random extra to avoid clashes with multiple archives
+      $extract_dir = catdir($extract_dir, random_string('cccccc'));
+
+      # Extract to temprary directory
       if ($archive->extract_all($extract_dir, $jobs)) {
         @input = ($extract_dir);
       }
@@ -816,11 +910,20 @@
   # Delete cache file
   unlink($cache_file) if $cache_delete;
 
+  print timestr(timediff(Benchmark->new, $t))."\n";
   print "Done.\n";
-  print timestr(timediff(Benchmark->new, $t))."\n\n";
 };
 
 
+# Cleanup temporary extraction directory
+if ($extract_dir) {
+  my $objects = remove_tree($extract_dir, { safe => 1 });
+  print "Removed directory $extract_dir with $objects objects.\n";
+};
+
+
+print "\n";
+
 __END__
 
 =pod
@@ -877,6 +980,16 @@
 
 Extracts KorAP-XML documents from a zip file.
 
+=item B<serial>
+
+  $ korapxml2krill serial -i <archive1> -i <archive2> -o <directory> -cfg <config-file>
+
+Convert archives sequentially. The inputs are not merged but treated
+as they are (so they may be premerged or globs).
+the C<--out> directory is treated as the base directory where subdirectories
+are created based on the archive name.
+
+
 =back
 
 
@@ -918,6 +1031,11 @@
 B<The root folder switch using the hash sign is experimental and
 may vanish in future versions.>
 
+=item B<--input-base|-ib> <directory>
+
+The base directory for inputs.
+
+
 =item B<--output|-o> <directory|file>
 
 Output folder for archive processing or
@@ -1038,7 +1156,7 @@
   ...
 
 Supported parameters are:
-C<overwrite>, C<gzip>, C<jobs>,
+C<overwrite>, C<gzip>, C<jobs>, C<input-base>,
 C<token>, C<log>, C<cache>, C<cache-size>, C<cache-delete>, C<meta>,
 C<output>, C<base-sentences>, C<temp-extract>, C<base-paragraphs>,
 C<base-pagebreaks>, C<skip> (semicolon separated), C<sigle>