Initial commit of corpuslist_to_index

This script generates a list of untar+index+cleanup tasks, grouped
into packages of corpora. Working in packages helps to avoid blowing up
the filesystem by untarring too much data at once, and makes it possible
to check the index logs once in a while to prevent index corruption.
The paths and the package size are not yet parameterizable.

Change-Id: Ifaeebcae41596b4220092673d8cddb4106a150ec
diff --git a/bin/corpuslist_to_index b/bin/corpuslist_to_index
new file mode 100644
index 0000000..e174343
--- /dev/null
+++ b/bin/corpuslist_to_index
@@ -0,0 +1,77 @@
+#!/usr/bin/env perl
+use strict;
+use warnings;
+use feature 'say';
+
+# Number of corpora that are untarred and indexed together in one package.
+my $package_size = 15;
+
+# Corpus names collected for the package currently being assembled.
+my @lines = ();
+
+# Locations used by the generated commands (not yet parameterizable).
+my $index_dir = '/opt/korap/index-2020-03-30';
+my $netapp_index_dir = '/export/netapp/korap/korap-head-02/index-2020-03-30/index';
+
+# Read the corpus list line by line instead of slurping all of STDIN.
+while (my $line = <STDIN>) {
+
+  # Only lines naming a "<corpus>.zip" count towards a package; counting
+  # skipped lines as well could emit short or even empty packages.
+  next unless $line =~ m!\/([^\/]+?)\.zip\n*$!;
+  push @lines, $1;
+
+  # Emit the commands as soon as the package is full.
+  if (@lines >= $package_size) {
+    write_command(\@lines);
+    @lines = ();
+  };
+};
+
+# Emit the remaining, incomplete package.
+write_command(\@lines) if @lines;
+
+# Print the commands for one package of corpus names (array reference):
+# one untar per corpus, then indexing and cleanup. Empty packages are skipped.
+sub write_command {
+  my $lines = shift;
+  return unless @$lines;
+  print corpus($_) for @$lines;
+  to_log($lines->[0], $lines->[-1]);
+};
+
+# Build the backgrounded, line-continued untar command for a single corpus
+sub corpus {
+  my $corpus = shift;
+  my $archive = '/export/netapp/korap/krill-json/dereko-2021-1/' . $corpus . '.tar';
+  return "tar -C $index_dir/json/ -xvf $archive & \\\n";
+};
+
+# Print the post-untar commands of one package: json listing count, indexer
+# run and cleanup. $from/$to (first/last corpus) only name the indexer log.
+sub to_log {
+  my ($from, $to) = @_;
+  # Derive the json directory from $index_dir, as the untar command does
+  my $json_dir = $index_dir . '/json';
+  say "echo 'ok'\n";
+  say "ls -l $json_dir/ | wc -l\n";
+  say 'java -jar Krill-Indexer.jar \\';
+  say '  -c ' . $index_dir . '/kustvakt-lite.conf \\';
+  say '  -i ' . $json_dir . ' \\';
+  say "  -o $netapp_index_dir \\";
+  say '  &> ' . $index_dir . '/index.netapp.' . $from . '-' . $to . '.log &' . "\n";
+  say "rm -r $json_dir/";
+  say "mkdir $json_dir/\n";
+};
+
+__END__
+
+=pod
+
+=encoding utf8
+
+=head1 SYNOPSIS
+
+  $ cat corpuslist.txt | perl corpuslist_to_index > index_tasks.txt
+
+=cut