Initial commit of corpuslist_to_korapxml2krill

This script generates a list of commands to manually split korapxml2krill
conversion tasks across multiple workers.
It is not yet parameterizable.

Change-Id: Ia1e10a65d1d2719f4178217cc5499d53ee09d812
diff --git a/bin/corpuslist_to_korapxml2krill b/bin/corpuslist_to_korapxml2krill
new file mode 100644
index 0000000..a74dd9e
--- /dev/null
+++ b/bin/corpuslist_to_korapxml2krill
@@ -0,0 +1,115 @@
+#!/usr/bin/env perl
+use strict;
+use warnings;
+
+# Number of corpora passed to a single korapxml2krill call
+my $package_size = 12;
+
+# Number of workers the calls are distributed to
+my $worker = 8;
+
+# Worker the next command is assigned to
+my $worker_c = 1;
+
+# Running number of the next command
+my $job_c = 1;
+
+# Corpora collected for the current package
+my @lines = ();
+
+# Iterate over the list of corpora line by line
+while (my $line = <STDIN>) {
+
+  # Parse corpus - lines that do not point to a zip archive
+  # are skipped and do not count towards the package size
+  next unless $line =~ m!\/([^\/]+?)\.zip\n*$!;
+  push @lines, $1;
+
+  # Write command as soon as the package is complete
+  if (@lines >= $package_size) {
+    write_command(\@lines);
+    @lines = ();
+  };
+};
+
+# Write command for the remaining corpora
+if (@lines) {
+  write_command(\@lines);
+};
+
+
+# Print the command for a package of corpora and assign
+# it to the next worker in turn.
+# Expects an array reference of corpus names.
+sub write_command {
+  my $lines = shift;
+
+  # Nothing to convert
+  return unless @$lines;
+
+  # Start again with the first worker
+  if ($worker_c > $worker) {
+    $worker_c = 1;
+  };
+
+  # Header comment with job number, worker and worker address
+  print '# ' . $job_c++ . '. Worker ' . $worker_c . ' - 10.0.10.' . (52 + $worker_c) . "\n";
+  $worker_c++;
+
+  print init();
+  foreach (@$lines) {
+    print corpus($_);
+  };
+  print to_log($lines->[0], $lines->[-1]);
+};
+
+
+# Initialize the command
+# Returns the first lines of the call including the configuration file.
+sub init {
+  return 'korapxml2krill serial \\' . "\n" .
+    '  -cfg \'/export/netapp/korap/krill-json/dereko-2021-1/dereko-2021-1.cfg\' \\' . "\n";
+};
+
+
+# Add a corpus to the command
+# Expects the corpus name and returns an input parameter. The pattern
+# is quoted, so the shell passes it to korapxml2krill unexpanded.
+sub corpus {
+  my $corpus = shift;
+  return '  -i "' . $corpus . '.*zip" \\' . "\n";
+};
+
+
+# End with log command
+# Expects the names of the first and the last corpus of the package,
+# that are used to name the log file. The trailing '&' sends the
+# call to the background.
+sub to_log {
+  my ($from, $to) = @_;
+  return '  &> "/opt/korap/process-dereko-2021-1/dereko-2021-1-' .
+    $from . '-' . $to . '.log" &' . "\n\n";
+};
+
+__END__
+
+=pod
+
+=encoding utf8
+
+=head1 NAME
+
+corpuslist_to_korapxml2krill - Split korapxml2krill conversion tasks on workers
+
+=head1 SYNOPSIS
+
+  $ cat corpuslist.txt | perl corpuslist_to_korapxml2krill > conversion_tasks.txt
+
+=head1 DESCRIPTION
+
+Reads a list of paths to corpus archives (C</path/to/CORPUS.zip>, one per line)
+from standard input and prints C<korapxml2krill serial> calls to standard output.
+Each call converts up to 12 corpora and the calls are assigned to 8 workers in turn.
+Package size, number of workers and paths are not yet parameterizable.
+
+=cut