#!/usr/bin/env perl
use strict;
use warnings;

# This script will generate commands to convert and index
# a list of corpora with korapxml2krill and Krill

use Getopt::Long qw(GetOptions :config no_auto_abbrev);
use Pod::Usage;
use Cwd;

use File::Spec::Functions qw'catdir catfile';

our $VERSION = '0.0.2';
our $VERSION_MSG = "\ncorpuslist_to_index - v$VERSION\n";

# Commands that are run when no --cmd option is passed
my @default_cmds = ('untar', 'count_json', 'to_index');

# Commands requested on the command line.
# Getopt::Long appends every --cmd value to the array it is given, so
# the array has to start out empty - pre-seeding it with the defaults
# would make it impossible to restrict the commands and would register
# an explicitly requested default command twice
my @cmds = ();

# Optional prefix for the Slack message, e.g. 'Worker 05 (10.0.10.57): '
my $who = '';

GetOptions(
  'batch|b=i'  => \(my $package_size = 15),
  'base-dir|bd=s' => \(my $base_dir = Cwd::cwd()), # '/opt/korap/index-2020-03-30'),
  'json-dir|jd=s' => \(my $json_dir = 'json'),
  'tar-dir|td=s' => \(my $tar_dir = '/export/netapp/korap/krill-json/dereko-2021-1/'),
  'index-dir|id=s' => \(my $index_dir = '/export/netapp/korap/korap-head-02/index-2020-03-30/index'),
  'index-conf|ic=s' => \(my $index_conf = 'kustvakt.conf'),
  'slack=s' => \(my $slack = ''),
  'cleanup!' => \(my $cleanup_json = 1),
  'cmd|c=s' => \@cmds,
  'comment=s' => \(my $comment),
  'help|h' => sub {
    pod2usage(
      -verbose  => 99,
      -sections => 'NAME|DESCRIPTION|SYNOPSIS|ARGUMENTS|OPTIONS',
      -msg      => $VERSION_MSG,
      -output   => '-'
    )
  }
)
  # Stop on unknown or malformed options instead of silently generating
  # commands (including 'rm -r') based on the default settings
  or pod2usage(
    -verbose => 1,
    -exitval => 2
  );

# Run the complete pipeline unless specific commands were requested
@cmds = @default_cmds unless @cmds;

# Make paths absolute
# The base directory itself is resolved against the current working
# directory first, so the log file path built from it is absolute as well
$base_dir = File::Spec->rel2abs($base_dir);

# All other locations are resolved against the base directory.
# Paths that are already absolute are kept (in canonical form), empty
# values are left untouched
foreach my $path_ref (\$json_dir, \$tar_dir, \$index_dir, \$index_conf) {
  next unless $$path_ref;
  $$path_ref = File::Spec->rel2abs($$path_ref, $base_dir);
};


use feature 'say';

# Batch bookkeeping for the loop over the corpus list:
# the batch counter and the corpora belonging to the current batch
my $corpus_c = $package_size;
my @lines = ();

# Map every supported command to the sub printing its shell commands
my %action_for = (
  untar      => \&_untar,
  count_json => \&_count_json,
  to_index   => \&_to_index
);

# The actions of a batch are always written in this order,
# independent of the order in which the commands were requested
my @action_order = ('untar', 'count_json', 'to_index');

# Remember each requested command once, so a repeated command does
# not register its action multiple times
my %requested = ();
foreach my $cmd (@cmds) {
  if (exists $action_for{$cmd}) {
    $requested{$cmd} = 1;
  }
  else {
    # Report typos on STDERR - STDOUT is reserved for the generated commands
    warn "Unknown command '$cmd' ignored\n";
  };
};

my @actions = ();
foreach my $cmd (@action_order) {
  push @actions, $action_for{$cmd} if $requested{$cmd};
};


# Iterate over the list of corpora
# Each line is expected to contain the path to a zip file; the corpus
# name is the file name without the extension. Lines not matching
# (e.g. blank lines) are skipped and do not count towards a batch.
my @corpora = ();
while (defined(my $line = <STDIN>)) {

  # Parse corpus (tolerating trailing whitespace and CRLF line endings)
  push @corpora, $1 if $line =~ m!/([^/]+?)\.zip\s*\z!;
};

# Number of corpora per batch - a size below 1 is treated as 1
$corpus_c = $package_size > 0 ? $package_size : 1;

# Write command(s) batch by batch
while (@corpora) {
  @lines = splice(@corpora, 0, $corpus_c);

  # The last batch is flagged as final, no matter if it is complete
  # or only contains the remaining corpora
  my $is_final = @corpora ? 0 : 1;

  # write actions
  foreach my $action (@actions) {
    $action->(\@lines, $is_final);
  };
};


# Untar json files
#
# Prints one 'tar' call per corpus of the batch, extracting
# <tar-dir>/<corpus>.tar into the json directory. The calls are joined
# with ' & \' and a newline, so all but the last call are sent to the
# background.
#
# Expects a reference to the list of corpus names as the first argument.
# Further arguments are ignored.
sub _untar {
  my ($lines) = @_;

  # An empty batch has nothing to extract
  return unless @$lines;

  my @tar_calls = ();
  foreach my $corpus (@$lines) {
    push @tar_calls,
      'tar -C ' . $json_dir . ' -xvf ' . catfile($tar_dir, $corpus . '.tar');
  };

  # Join instead of appending the separator to every call and
  # chopping it off the last one again
  print join(' & \\' . "\n", @tar_calls) . " \n\n";
  return;
};


# Count json files
#
# Prints a shell command counting the entries of the json directory,
# framed by comment lines. Arguments are ignored.
sub _count_json {
  say '# Count files';

  # 'ls -l' starts its listing with a 'total' line, which 'wc -l' would
  # count as an additional file - 'ls -1' prints one entry per line only
  say 'ls -1 ' . $json_dir . ' | wc -l';
  say "#\n";
  return;
};


# Index json files
#
# Prints the Krill-Indexer call for one batch. The indexer reads from
# the json directory, writes to the index directory and logs to
# 'index.<first>-<last>.log' in the base directory, where <first> and
# <last> are the first and the last corpus of the batch.
# With a slack channel set, the call is wrapped in a subshell that
# reports to the channel afterwards. Unless cleanup is disabled, the
# commands to remove the json directory (and to recreate it, if the
# batch is not the final one) are printed as well.
#
# Expects a reference to the list of corpus names and the final flag.
sub _to_index {
  my ($lines, $final) = @_;

  # The batch is identified by its first and last corpus
  my $first = $lines->[0];
  my $last  = $lines->[-1];

  my $log_file = catfile($base_dir, 'index.' . $first . '-' . $last . '.log');

  # Collect the whole output and print it at once
  my $out = '';

  # Open the subshells for the slack report
  $out .= '( ( ' if $slack;

  # The indexer call is spread over multiple lines using line continuations
  $out .= join(
    ' \\' . "\n",
    'java -jar Krill-Indexer.jar',
    '  -c ' . $index_conf,
    '  -i ' . $json_dir,
    '  -o ' . $index_dir,
    '  &> ' . $log_file
  );

  if ($slack) {

    # Report after the indexer has finished
    my $message = 'Done: ' . $who . "indexation $first-$last";
    $message .= ' (' . $comment . ')' if $comment;

    $out .= ' ; \\' . "\n";
    $out .= 'slack.js -c ' . $slack . ' "' . $message . '"  ) & )' . "\n\n";
  }
  else {
    $out .= ' & ' . "\n\n";
  };

  # Cleanup json directory
  if ($cleanup_json) {
    $out .= 'rm -r ' . $json_dir . "\n";
    $out .= 'mkdir ' . $json_dir . "\n" unless $final;
    $out .= "\n\n";
  };

  print $out;
};


__END__

=pod

=encoding utf8

=head1 SYNOPSIS

  $ cat corpuslist.txt | perl corpuslist_to_index > index_tasks.txt

=head1 OPTIONS

=over 2

=item B<--batch|-b>

Batch size, i.e. how many corpora are converted, extracted, indexed, ... at a time.

=item B<--base-dir|-bd>

Base directory on which all other directory definitions operate.

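=item B<--json-dir|-jd>

Directory into which the json files are extracted from the tar archives.

=item B<--tar-dir|-td>

Directory containing the tar archives (one C<.tar> file per corpus).

=item B<--index-dir|-id>

Directory of the index, passed to the Krill indexer as output directory.

=item B<--index-conf|-ic>

Configuration file passed to the Krill indexer.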

=item B<--cmd|-c>

Commands to process. The option can be passed multiple times.

Supported commands include:

=over 4

=item B<untar>

Untar a tar package to a certain directory.

=item B<count_json>

Count all extracted json files after untar.

=item B<to_index>

This will index all files and cleanup the json folder afterwards.

=back

=item B<--slack>

Give a slack channel for reporting.

=item B<--comment>

Comment that is appended to all slack reports.

=item B<--(no)cleanup>

Clean up the json folder after indexation.

=back

=cut
