Improve corpuslist generation
Change-Id: Ic21aed1bc4723cd611d5989024621e865fb422f0
diff --git a/bin/corpuslist_to_korapxml2krill b/bin/corpuslist_to_korapxml2krill
index a74dd9e..9b4a041 100755
--- a/bin/corpuslist_to_korapxml2krill
+++ b/bin/corpuslist_to_korapxml2krill
@@ -1,15 +1,38 @@
#!/usr/bin/env perl
use strict;
use warnings;
+use feature 'say';
+use Getopt::Long qw(GetOptions :config no_auto_abbrev);
+use Pod::Usage;
-my $package_size = 12;
-my $worker = 8;
+our $VERSION = '0.0.1';
+our $VERSION_MSG = "\ncorpuslist_to_korapxml2krill - v$VERSION\n";
+
+
+GetOptions(
+ 'batch|b=i' => \(my $package_size = 12),
+ 'worker|w=i' => \(my $worker = 8),
+ 'conversion-config|ccfg=s' => \(my $cfg = '/export/netapp/korap/krill-json/dereko-2021-1/dereko-2021-1.cfg' ),
+ 'log-dir|ld=s' => \(my $log_dir = '/opt/korap/process-dereko-2021-1'),
+ 'log-prefix|lp=s' => \(my $log_prefix = 'dereko-2021-1'),
+ 'slack=s' => \(my $slack = 'dereko'),
+ 'comment=s' => \(my $comment = 'w-23-gesamt'),
+ 'help|h' => sub {
+ pod2usage(
+ -verbose => 99,
+ -sections => 'NAME|DESCRIPTION|SYNOPSIS|ARGUMENTS|OPTIONS',
+ -msg => $VERSION_MSG,
+ -output => '-'
+ )
+ }
+);
my $corpus_c = $package_size;
my $worker_c = 1;
my $job_c = 1;
my @lines = ();
+
# Iterate over the list of corpora
foreach my $line (<STDIN>) {
@@ -27,31 +50,65 @@
};
};
+
+# Write remaining commands
if (@lines) {
write_command(\@lines);
};
+
+# Write a single command for a batch
sub write_command {
my $lines = shift;
+
+ my $from = $lines->[0];
+ my $to = $lines->[-1];
+
if ($worker_c > $worker) {
$worker_c = 1;
};
+
print '# ' . $job_c++ . '. Worker ' . $worker_c . ' - 10.0.10.' . (52 + $worker_c) . "\n";
- $worker_c++;
+
+ if ($slack) {
+ print '( ( ';
+ };
+
print init();
+
foreach (@$lines) {
print corpus($_);
};
- print to_log($lines->[0], $lines->[-1]);
+
+ print to_log($from, $to);
+
+ if ($slack) {
+ say ' ; \\';
+ print 'slack.js -c ' . $slack . ' ';
+ print '"Done: Worker ' . $worker_c . " conversion $from-$to";
+ if ($comment) {
+ print ' (' . $comment . ')';
+ };
+ print '" ) & )'."\n\n";
+ }
+
+ else {
+ print ' & ', "\n\n";
+ };
+
+ print "\n";
+
+ $worker_c++;
};
# Initialize the command
sub init {
return 'korapxml2krill serial \\' . "\n" .
- ' -cfg \'/export/netapp/korap/krill-json/dereko-2021-1/dereko-2021-1.cfg\' \\' . "\n";
+ ' -cfg \'' . $cfg . '\' \\' . "\n";
};
+# One line per corpus
sub corpus {
my $corpus = shift;
return ' -i "' . $corpus . '.*zip" \\' . "\n";
@@ -60,8 +117,8 @@
# End with log command
sub to_log {
my ($from, $to) = @_;
- return ' &> "/opt/korap/process-dereko-2021-1/dereko-2021-1-' .
- $from . '-' . $to . '.log" &' . "\n\n";
+ return ' &> "' . $log_dir . '/' . $log_prefix . '-' .
+ $from . '-' . $to . '.log"';
};
__END__
@@ -74,4 +131,36 @@
$ cat corpuslist.txt | perl corpuslist_to_korapxml2krill > conversion_tasks.txt
+=head1 OPTIONS
+
+=over 2
+
+=item B<--batch|-b>
+
+Batch size, i.e. how many corpora are converted per worker job (default: 12).
+
+=item B<--worker|-w>
+
+How many workers should be used (default: 8). Worker numbers count up
+and start at 1 again once all workers have their batches.
+
+=item B<--log-dir|-ld>
+
+(Local) directory to store log files.
+
+=item B<--log-prefix|-lp>
+
+Prefix of the log file name. The name is completed with the
+first and last corpus of the batch.
+
+=item B<--slack>
+
+Slack channel used to report finished batches (default: dereko).
+
+=item B<--comment>
+
+Comment that is appended to every Slack report (default: w-23-gesamt).
+
+=back
+
=cut