Make index generator more adjustable

Change-Id: I1dbc51e1b64f62c6e783f912472d38a6d91dd306
diff --git a/bin/corpuslist_to_index b/bin/corpuslist_to_index
index 3287ad0..b96e969 100644
--- a/bin/corpuslist_to_index
+++ b/bin/corpuslist_to_index
@@ -7,18 +7,23 @@
 
 use Getopt::Long qw(GetOptions :config no_auto_abbrev);
 use Pod::Usage;
+use Cwd;
+
+use File::Spec::Functions qw'catdir catfile';
 
 our $VERSION = '0.0.2';
 our $VERSION_MSG = "\ncorpuslist_to_index - v$VERSION\n";
 
-my $cmd;
-our @ARGV;
-if ($ARGV[0] && index($ARGV[0], '-') != 0) {
-  $cmd = shift @ARGV;
-};
+my @cmds = ('untar', 'count_json', 'to_index');
 
 GetOptions(
   'batch|b=i'  => \(my $package_size = 15),
+  'base-dir|bd=s' => \(my $base_dir = Cwd::cwd()), # '/opt/korap/index-2020-03-30'),
+  'json-dir|jd=s' => \(my $json_dir = 'json'),
+  'tar-dir|td=s' => \(my $tar_dir = '/export/netapp/korap/krill-json/dereko-2021-1/'),
+  'index-dir|id=s' => \(my $index_dir = '/export/netapp/korap/korap-head-02/index-2020-03-30/index'),
+  'index-conf|ic=s' => \(my $index_conf = 'kustvakt.conf'),
+  'cmd|c=s' => \@cmds,
   'help|h' => sub {
     pod2usage(
       -verbose  => 99,
@@ -29,13 +34,37 @@
   }
 );
 
+# Make paths absolute: rel2abs(PATH, BASE) resolves PATH against $base_dir
+$json_dir   = File::Spec->rel2abs( $json_dir, $base_dir) if $json_dir;
+$tar_dir    = File::Spec->rel2abs( $tar_dir, $base_dir) if $tar_dir;
+$index_conf = File::Spec->rel2abs( $index_conf, $base_dir) if $index_conf;
+
+
 use feature 'say';
 
 my $corpus_c = $package_size;
 my @lines = ();
 
-my $index_dir = '/opt/korap/index-2020-03-30';
-my $netapp_index_dir = '/export/netapp/korap/korap-head-02/index-2020-03-30/index';
+# Supported commands and their generators, in execution order
+my @dispatch = (
+  [untar      => \&_untar],
+  [count_json => \&_count_json],
+  [to_index   => \&_to_index],
+);
+
+# GetOptions appends every --cmd value to the pre-filled @cmds, so a
+# name can occur repeatedly; a set keeps each action to one run per batch
+my %requested = map { $_ => 1 } @cmds;
+
+# Selected generators, always ordered untar, count_json, to_index
+my @actions =
+  map  { $_->[1] }
+  grep { delete $requested{ $_->[0] } }
+  @dispatch;
+
+# Names left over are unsupported; warn on STDERR, keep STDOUT clean
+warn "Unknown command '$_' ignored\n" for sort keys %requested;
+
 
 # Iterate over the list of corpora
 foreach my $line (<STDIN>) {
@@ -45,51 +74,72 @@
     push @lines, $1;
   };
 
-  # Write command
+  # Write command(s)
   if (--$corpus_c <= 0) {
-    write_command(\@lines);
+
+    # write actions
+    foreach (@actions) {
+      $_->(\@lines, 0);
+    };
+
     $corpus_c = $package_size;
     @lines = ();
   };
 };
 
+
+# Flush the remaining corpora that did not fill a whole batch
 if (@lines) {
-  write_command(\@lines);
-};
-
-sub write_command {
-  my $lines = shift;
-  foreach (@$lines) {
-    print corpus($_);
+  foreach (@actions) {
+    $_->(\@lines, 1);    # 1 is received by the actions as $final
   };
-  to_log($lines->[0], $lines->[-1]);
 };
 
 
-# Initialize the command
-sub corpus {
-  my $corpus = shift;
-  return 'tar -C ' . $index_dir . '/json/ -xvf ' .
-    '/export/netapp/korap/krill-json/dereko-2021-1/' . $corpus . '.tar & \\'."\n";
+# Print the tar extraction commands for one batch, chained with `&`
+sub _untar {
+  my ($lines, $final) = @_;    # $final is accepted but not used here
+
+  # One `tar -x` into the json directory per corpus archive
+  my @tars = map {
+    sprintf('tar -C %s -xvf %s', $json_dir, catfile($tar_dir, $_ . '.tar'))
+  } @$lines;
+
+  # Commands are separated by ` & \` plus newline; the last one carries
+  # no `&`, only a single trailing blank (absent for an empty batch).
+  my $run = join(" & \\\n", @tars);
+  $run .= ' ' if @tars;
+  print $run . "\n\n";
 };
 
-sub to_log {
-  my ($from, $to) = @_;
 
-  say "echo 'ok'\n";
+# Print a shell snippet that line-counts `ls -l` of the json directory
+sub _count_json {
+  # Arguments are ignored; each list element becomes one output line
+  my @out = ('# Count files', 'ls -l ' . $json_dir . ' | wc -l', "#\n");
+  say for @out;
+};
 
-  say "ls -l /opt/korap/index-2020-03-30/json/ | wc -l\n";
+
+# Print the Krill-Indexer call for one batch, then the json cleanup commands
+sub _to_index {
+  my ($lines, $final) = @_;  # $final: 1 only for the leftover batch
+  my $from = $lines->[0];    # first corpus of the batch (log file name)
+  my $to = $lines->[-1];     # last corpus of the batch (log file name)
 
   say 'java -jar Krill-Indexer.jar \\';
-  say '  -c ' . $index_dir . '/kustvakt-lite.conf \\';
-  say '  -i ' . $index_dir . '/json \\';
-  say "  -o $netapp_index_dir \\";
-  say '  &> ' . $index_dir . '/index.netapp.' . $from . '-' . $to . '.log &' . "\n";
+  say '  -c ' . $index_conf . ' \\';
+  say '  -i ' . $json_dir . ' \\';
+  say '  -o ' . $index_dir . ' \\';
+  say '  &> ' . catfile($base_dir, 'index.' . $from . '-' . $to . '.log') . ' & ' . "\n";
 
-  say "rm -r /opt/korap/index-2020-03-30/json/";
-  say "mkdir /opt/korap/index-2020-03-30/json/\n";
+  # Remove the json directory; recreate it unless this is the final batch
+  say 'rm -r ' . $json_dir;
+  say 'mkdir ' . $json_dir unless $final;
+  say "\n";
 };
 
+
 __END__
 
 =pod
@@ -108,6 +158,36 @@
 
 Batch size, i.e. how many corpora are converted, extracted, indexed, ... at a time.
 
+=item B<--base-dir|-bd>
+
+Base directory against which relative paths of other options are resolved (defaults to the current working directory).
+
+=item B<--json-dir|-jd>
+
+Directory into which the json files from the tar archives are extracted.
+
+=item B<--cmd|-c>
+
+Commands to process. The option can be given multiple times.
+
+Supported commands include:
+
+=over 4
+
+=item B<untar>
+
+Untar a tar package to a certain directory.
+
+=item B<count_json>
+
+Count all extracted json files after untar.
+
+=item B<to_index>
+
+This will index all files and cleanup the json folder afterwards.
+
+=back
+
 =back
 
 =cut