#!/usr/bin/env perl
use strict;
use warnings;
use feature 'say';
use Getopt::Long qw(GetOptions :config no_auto_abbrev);
use Pod::Usage;

our $VERSION = '0.0.1';
our $VERSION_MSG = "\ncorpuslist_to_korapxml2krill - v$VERSION\n";


# Parse the command line. All defaults point at the dereko-2021-1 setup.
# GetOptions returns false on unknown or malformed options; in that case
# print the usage and exit with status 2 instead of silently generating
# commands from the default values.
GetOptions(
  # Number of corpora that are bundled into one generated command
  'batch|b=i'  => \(my $package_size = 12),
  # Number of workers the generated commands are distributed over
  'worker|w=i'  => \(my $worker = 8),
  # Configuration file handed to korapxml2krill via -cfg
  'conversion-config|ccfg=s' => \(my $cfg = '/export/netapp/korap/krill-json/dereko-2021-1/dereko-2021-1.cfg' ),
  # Directory and file name prefix of the per-batch log files
  'log-dir|ld=s' => \(my $log_dir = '/opt/korap/process-dereko-2021-1'),
  'log-prefix|lp=s' => \(my $log_prefix = 'dereko-2021-1'),
  # Slack channel passed to slack.js; a false value disables the notification
  'slack=s' => \(my $slack = 'dereko'),
  # Optional remark appended to every slack message
  'comment=s' => \(my $comment = 'w-23-gesamt'),
  'help|h' => sub {
    pod2usage(
      -verbose  => 99,
      -sections => 'NAME|DESCRIPTION|SYNOPSIS|ARGUMENTS|OPTIONS',
      -msg      => $VERSION_MSG,
      -output   => '-'
    )
  }
) or pod2usage(2);

# Counters shared with write_command(): the worker the next command is
# assigned to and the running number of the next command.
my $worker_c = 1;
my $job_c = 1;

# Corpora collected for the batch that is currently being filled
my @lines = ();


# Iterate over the list of corpora, one line at a time
while (defined(my $line = <STDIN>)) {

  # Parse corpus: the file name (without ".zip") after the last slash.
  # Trailing whitespace (including a CR from CRLF input) is tolerated.
  # Lines that do not name a zip file are skipped and do not count
  # towards the batch size.
  next unless $line =~ m!\/([^\/]+?)\.zip\s*$!;
  push @lines, $1;

  # Write command as soon as the batch is full
  if (@lines >= $package_size) {
    write_command(\@lines);
    @lines = ();
  };
};


# Write remaining commands (last, incomplete batch)
if (@lines) {
  write_command(\@lines);
};


# Write a single command for a batch
# Write a single command for a batch.
#
# Expects a reference to an array of corpus names and prints to STDOUT:
#   - a comment line with the job number, the worker number and the
#     worker address (10.0.10.<52 + worker number>),
#   - one backgrounded "korapxml2krill serial" call with one -i argument
#     per corpus, whose output is redirected to a log file named after the
#     first and last corpus of the batch,
#   - if a slack channel is set, a "slack.js" call chained after the
#     conversion inside the same backgrounded subshell.
# Advances the job counter and the worker counter afterwards.
# An empty (or missing) batch prints nothing and leaves both counters alone.
sub write_command {
  my $lines = shift;

  # An empty batch has no first/last corpus: a command for it would carry
  # an undefined log name and use up a job number and a worker slot.
  return unless $lines && @$lines;

  my ($from, $to) = @{$lines}[0, -1];

  # Workers are assigned in rotation: start over after the last one
  $worker_c = 1 if $worker_c > $worker;

  print '# ' . $job_c++ . '. Worker ' . $worker_c . ' - 10.0.10.' . (52 + $worker_c) . "\n";

  # With notification, conversion and slack call share one subshell
  print '( ( ' if $slack;

  print init();
  print corpus($_) for @$lines;
  print to_log($from, $to);

  if ($slack) {
    # The comment is optional and goes in parentheses behind the range
    my $note = $comment ? ' (' . $comment . ')' : '';

    print ' ; \\' . "\n";
    print 'slack.js -c ' . $slack . ' '
      . '"Done: Worker ' . $worker_c . " conversion $from-$to"
      . $note
      . '"  ) & )' . "\n\n";
  }

  else {
    # Without notification the conversion itself is sent to the background
    print ' & ' . "\n\n";
  };

  print "\n";

  $worker_c++;
};


# Initialize the command
# Start of every generated command: the korapxml2krill call in serial
# mode plus the single-quoted configuration file. Both lines end with a
# backslash so that the shell command continues on the following line.
sub init {
  my @head = (
    'korapxml2krill serial \\',
    '  -cfg \'' . $cfg . '\' \\',
  );
  return join('', map { $_ . "\n" } @head);
}

# One line per corpus
# Input argument line for one corpus: the corpus name followed by the
# ".*zip" pattern, in double quotes, ending with a backslash and a newline
# so that the shell command continues on the following line.
sub corpus {
  my ($name) = @_;
  return qq{  -i "${name}.*zip" \\\n};
}

# End with log command
# End of the conversion call: redirect stdout and stderr ("&>") to the
# log file "<log-dir>/<log-prefix>-<from>-<to>.log", where <from> and <to>
# are the two corpus names passed in. No trailing newline is added.
sub to_log {
  my ($from, $to) = @_;
  my $file = join('-', $log_dir . '/' . $log_prefix, $from, $to) . '.log';
  return '  &> "' . $file . '"';
}

__END__

=pod

=encoding utf8

=head1 SYNOPSIS

  $ cat corpuslist.txt | perl corpuslist_to_korapxml2krill > conversion_tasks.txt

=head1 OPTIONS

=over 2

=item B<--batch|-b>

Batch size, i.e. how many corpora are converted per worker

=item B<--worker|-w>

How many workers should be used. The worker number counts up
and starts at 1 again once all workers have their batches.

=item B<--log-dir|-ld>

(Local) directory to store log files.

=item B<--log-prefix|-lp>

Prefix of the log file name. The name is completed with
the first and last corpus of the batch.

=item B<--slack>

Slack channel to report finished batches to. An empty value disables reporting.

=item B<--comment>

Comment that is appended to all reports.

=back

=cut
