Improve corpuslist generation

Change-Id: Ic21aed1bc4723cd611d5989024621e865fb422f0
diff --git a/bin/corpuslist_to_korapxml2krill b/bin/corpuslist_to_korapxml2krill
index a74dd9e..9b4a041 100755
--- a/bin/corpuslist_to_korapxml2krill
+++ b/bin/corpuslist_to_korapxml2krill
@@ -1,15 +1,38 @@
 #!/usr/bin/env perl
 use strict;
 use warnings;
+use feature 'say';
+use Getopt::Long qw(GetOptions :config no_auto_abbrev);
+use Pod::Usage;
 
-my $package_size = 12;
-my $worker = 8;
+our $VERSION = '0.0.1';
+our $VERSION_MSG = "\ncorpuslist_to_korapxml2krill - v$VERSION\n";
+
+
+GetOptions(
+  'batch|b=i'  => \(my $package_size = 12),
+  'worker|w=i'  => \(my $worker = 8),
+  'conversion-config|ccfg=s' => \(my $cfg = '/export/netapp/korap/krill-json/dereko-2021-1/dereko-2021-1.cfg' ),
+  'log-dir|ld=s' => \(my $log_dir = '/opt/korap/process-dereko-2021-1'),
+  'log-prefix|lp=s' => \(my $log_prefix = 'dereko-2021-1'),
+  'slack=s' => \(my $slack = 'dereko'),
+  'comment=s' => \(my $comment = 'w-23-gesamt'),
+  'help|h' => sub {
+    pod2usage(
+      -verbose  => 99,
+      -sections => 'NAME|DESCRIPTION|SYNOPSIS|ARGUMENTS|OPTIONS',
+      -msg      => $VERSION_MSG,
+      -output   => '-'
+    )
+  }
+);
 
 my $corpus_c = $package_size;
 my $worker_c = 1;
 my $job_c = 1;
 my @lines = ();
 
+
 # Iterate over the list of corpora
 foreach my $line (<STDIN>) {
 
@@ -27,31 +50,65 @@
   };
 };
 
+
+# Write remaining commands
 if (@lines) {
   write_command(\@lines);
 };
 
+
+# Write a single command for a batch
 sub write_command {
   my $lines = shift;
+
+  my $from = $lines->[0];
+  my $to = $lines->[-1];
+
   if ($worker_c > $worker) {
     $worker_c = 1;
   };
+
   print '# ' . $job_c++ . '. Worker ' . $worker_c . ' - 10.0.10.' . (52 + $worker_c) . "\n";
-  $worker_c++;
+
+  if ($slack) {
+    print '( ( ';
+  };
+
   print init();
+
   foreach (@$lines) {
     print corpus($_);
   };
-  print to_log($lines->[0], $lines->[-1]);
+
+  print to_log($from, $to);
+
+  if ($slack) {
+    say ' ; \\';
+    print 'slack.js -c ' . $slack . ' ';
+    print '"Done: Worker ' . $worker_c . " conversion $from-$to";
+    if ($comment) {
+      print ' (' . $comment . ')';
+    };
+    print '"  ) & )'."\n\n";
+  }
+
+  else {
+    print ' & ', "\n\n";
+  };
+
+  print "\n";
+
+  $worker_c++;
 };
 
 
 # Initialize the command
 sub init {
   return 'korapxml2krill serial \\' . "\n" .
-    '  -cfg \'/export/netapp/korap/krill-json/dereko-2021-1/dereko-2021-1.cfg\' \\' . "\n";
+    '  -cfg \'' . $cfg . '\' \\' . "\n";
 };
 
+# One line per corpus
 sub corpus {
   my $corpus = shift;
   return '  -i "' . $corpus . '.*zip" \\' . "\n";
@@ -60,8 +117,8 @@
 # End with log command
 sub to_log {
   my ($from, $to) = @_;
-  return '  &> "/opt/korap/process-dereko-2021-1/dereko-2021-1-' .
-    $from . '-' . $to . '.log" &' . "\n\n";
+  return '  &> "' . $log_dir . '/' . $log_prefix . '-' .
+    $from . '-' . $to . '.log"';
 };
 
 __END__
@@ -74,4 +131,36 @@
 
   $ cat corpuslist.txt | perl corpuslist_to_korapxml2krill > conversion_tasks.txt
 
+=head1 OPTIONS
+
+=over 2
+
+=item B<--batch|-b>
+
+Batch size, i.e. how many corpora are converted per worker
+
+=item B<--worker|-w>
+
+Number of workers to use. The worker index counts up and
+restarts at 1 once every worker has been assigned a batch.
+
+=item B<--log-dir|-ld>
+
+(Local) directory to store log files.
+
+=item B<--log-prefix|-lp>
+
+Prefix of the log file name. The first and the last corpus
+of the batch are appended to it to form the file name.
+
+=item B<--slack>
+
+Slack channel to notify once the conversion of a batch is done.
+
+=item B<--comment>
+
+Comment that is appended to every report message.
+
+=back
+
 =cut