Merged executables

Change-Id: I429bbf0edac82d26899e86d6912b405810819e88
diff --git a/script/korapxml2krill_dir b/script/korapxml2krill_dir
index b955dcf..0b010db 100644
--- a/script/korapxml2krill_dir
+++ b/script/korapxml2krill_dir
@@ -3,274 +3,11 @@
 use warnings;
 use lib 'lib';
 use FindBin;
-use File::Temp;
-use File::Spec::Functions qw/catfile catdir/;
-use Getopt::Long;
-use Directory::Iterator;
-use KorAP::XML::Krill;
-use KorAP::XML::Archive;
-use Benchmark qw/:hireswallclock/;
-use Parallel::ForkManager;
 
-my $local = $FindBin::Bin;
+our $LOCAL = $FindBin::Bin; # directory this script was started from; the korapxml2krill executable is looked up in it
+our @ARGV; # command-line arguments, forwarded unchanged to korapxml2krill at the end of this script
 
-# Changes
-# 2013/11/25
-# - Initial release
-#
-# 2016/02/04
-# - Rename to korapxml2krill_dir
-#
-# 2016/02/12
-# - Support overwrite
-#
-# 2016/02/14
-# - Added version information
-# - Added support for archive files
-#
-# 2016/02/15
-# - Fixed temporary directory bug
-# - Improved skipping before unzipping
-# - Added EXPERIMENTAL concurrency support
-
-sub printversion {
-  print "Version " . $KorAP::XML::Krill::VERSION . "\n\n";
-  exit(1);
-};
-
-sub printhelp {
-  print <<'EOHELP';
-
-Merge foundry data based on a tokenization and create indexer friendly documents
-for whole directories.
-
-Call:
-korapxml2krill_dir -z --input <directory> --output <directory>
-
-  --input|-i <directory|file>     Directory or archive file of documents to index
-  --output|-o <directory>         Name of output folder
-  --overwrite|-w                  Overwrite files that already exist
-  --token|-t <foundry>[#<layer>]  Define the default tokenization by specifying
-                                  the name of the foundry and optionally the name
-                                  of the layer. Defaults to OpenNLP#tokens.
-  --skip|-s <foundry>[#<layer>]   Skip specific foundries by specifying the name
-                                  or specific layers by defining the name
-                                  with a # in front of the foundry,
-                                  e.g. Mate#Morpho. Alternatively you can skip #ALL.
-                                  Can be set multiple times.
-  --allow|-a <foundry>#<layer>    Allow specific foundries and layers by defining them
-                                  combining the foundry name with a # and the layer name.
-  --primary|-p                    Output primary data or not. Defaults to true.
-                                  Can be flagged using --no-primary as well.
-  --jobs|-j                       Define the number of concurrent jobs in seperated forks,
-                                  defaults to 0. This is EXPERIMENTAL!
-  --human|-m                      Represent the data human friendly,
-                                  while the output defaults to JSON
-  --pretty|-y                     Pretty print json output
-  --gzip|-z                       Compress the output
-                                  (expects a defined output file)
-  --log|-l                        The Log4perl log level, defaults to ERROR.
-  --help|-h                       Print this document (optional)
-  --version|-v                    Print version information
-
-diewald@ids-mannheim.de, 2016/02/15
-
-EOHELP
-
-  exit(defined $_[0] ? $_[0] : 0);
-};
-
-my ($input, $output, $text, $gzip, $log_level, @skip,
-    $token_base, $primary, @allow, $pretty,
-    $overwrite);
-my $jobs = 0;
-GetOptions(
-  'input|i=s'   => \$input,
-  'output|o=s'  => \$output,
-  'human|m'     => \$text,
-  'overwrite|w' => \$overwrite,
-  'token|t=s'   => \$token_base,
-  'gzip|z'      => \$gzip,
-  'skip|s=s'    => \@skip,
-  'log|l=s'     => \$log_level,
-  'allow|a=s'   => \@allow,
-  'primary|p!'  => \$primary,
-  'pretty|y'    => \$pretty,
-  'jobs|j=i'    => \$jobs,
-  'help|h'      => sub { printhelp },
-  'version|v'   => sub { printversion }
-);
-
-printhelp(1) if !$input || !$output;
-
-sub get_file_name {
-  my $file = shift;
-  $file =~ s/^?\/?$input//;
-  $file =~ tr/\//-/;
-  $file =~ s{^-+}{};
-  return $file;
-};
-
-# write file
-sub write_file {
-  my $anno = shift;
-  my $file = get_file_name($anno);
-
-  # TODO: This should be done directly with a data structure! KorAP::XML::Wrap
-
-  my $call = 'perl ' . $local . '/korapxml2krill -i ' .
-    $anno . ' -o ' . $output . '/' . $file . '.json';
-  $call .= '.gz -z' if $gzip;
-  $call .= ' -m' if $text;
-  $call .= ' -w' if $overwrite;
-  $call .= ' -t ' . $token_base if $token_base;
-  $call .= ' -l ' . $log_level if $log_level;
-  $call .= ' --no-primary ' if $primary;
-  $call .= ' -y ' . $pretty if $pretty;
-  $call .= ' -a ' . $_ foreach @allow;
-  $call .= ' -s ' . $_ foreach @skip;
-  system($call);
-  return "$file";
-};
-
-# Zero means: everything runs in the parent process
-my $pool = Parallel::ForkManager->new($jobs);
-
-my $count = 0;
-my $iter = 0;
-
-# Report on fork message
-$pool->run_on_finish (
-  sub {
-    my ($pid, $code) = shift;
-    my $data = pop;
-    print 'Convert ['. ($jobs > 0 ? "$pid:" : '') .
-      ($iter++) . "/$count]" .
-	($code ? " $code" : '') .
-	  " $$data\n";
-  }
-);
-
-my $t;
-print "Reading data ...\n";
-
-# Input is a directory
-if (-d $input) {
-  my $it = Directory::Iterator->new($input);
-  my @dirs;
-  my $dir;
-
-  while (1) {
-    if (!$it->is_directory && ($dir = $it->get) && $dir =~ s{/data\.xml$}{}) {
-      push @dirs, $dir;
-      $it->prune;
-    };
-    last unless $it->next;
-  };
-
-  print "Start processing ...\n";
-  $t = Benchmark->new;
-  $count = scalar @dirs;
-
- DIRECTORY_LOOP:
-  for (my $i = 0; $i < $count; $i++) {
-
-    unless ($overwrite) {
-      my $filename = catfile(
-	$output,
-	get_file_name($dirs[$i]) . '.json' . ($gzip ? '.gz' : '')
-      );
-
-      if (-e $filename) {
-	$iter++;
-	print "Skip $filename\n";
-	next;
-      };
-    };
-
-    # Get the next fork
-    my $pid = $pool->start and next DIRECTORY_LOOP;
-    my $msg;
-
-    $msg = write_file($dirs[$i]);
-    $pool->finish(0, \$msg);
-  };
-}
-
-# Input is a file
-elsif (-f($input) && (my $archive = KorAP::XML::Archive->new($input))) {
-  unless ($archive->test_unzip) {
-    print "Unzip is not installed or incompatible.\n\n";
-    exit(1);
-  };
-
-  unless ($archive->test) {
-    print "Zip archive not compatible.\n\n";
-    exit(1);
-  };
-
-  print "Start processing ...\n";
-  $t = Benchmark->new;
-  my @dirs = $archive->list_texts;
-  $count = scalar @dirs;
-
- ARCHIVE_LOOP:
-  for (my $i = 0; $i < $count; $i++) {
-
-    # Split path information
-    my ($prefix, $corpus, $doc, $text) = $archive->split_path($dirs[$i]);
-
-    unless ($overwrite) {
-      my $filename = catfile(
-	$output,
-	get_file_name(catdir($doc, $text)) . '.json' . ($gzip ? '.gz' : '')
-      );
-
-      if (-e $filename) {
-	$iter++;
-	print "Skip $filename\n";
-	next;
-      };
-    };
-
-    # Get the next fork
-    my $pid = $pool->start and next ARCHIVE_LOOP;
-
-    # Create temporary file
-    my $temp = File::Temp->newdir;
-
-    my $msg;
-
-    # Extract from archive
-    if ($archive->extract($dirs[$i], $temp)) {
-
-      # Create corpus directory
-      $input = catdir("$temp", $corpus);
-
-      # Temporary directory
-      my $dir = catdir($input, $doc, $text);
-
-      # Write file
-      $msg = write_file($dir);
-
-      $temp = undef;
-      $pool->finish(0, \$msg);
-    }
-    else {
-
-      $temp = undef;
-      $msg = "Unable to extract " . $dirs[$i] . "\n";
-      $pool->finish(1, \$msg);
-    };
-  };
-}
-
-else {
-  print "Input is neither a directory nor an archive.\n\n";
-};
-
-$pool->wait_all_children;
-
-print timestr(timediff(Benchmark->new, $t))."\n\n";
+warn "korapxml2krill_dir is DEPRECATED. Please use korapxml2krill"; # deprecation notice on STDERR before delegating
+system('perl', "$LOCAL/korapxml2krill", 'archive', @ARGV); # list form bypasses the shell: arguments (and a $LOCAL path) containing spaces or metacharacters reach the 'archive' command verbatim
 
 __END__