Merge "Add script to count commits for reports"
diff --git a/bin/corpuslist_to_index b/bin/corpuslist_to_index
new file mode 100644
index 0000000..e174343
--- /dev/null
+++ b/bin/corpuslist_to_index
@@ -0,0 +1,91 @@
+#!/usr/bin/env perl
+use strict;
+use warnings;
+use feature 'say';
+
+# Number of corpora that are unpacked and indexed as one package
+my $package_size = 15;
+
+# Corpora collected for the package currently being filled
+my @lines = ();
+
+my $index_dir = '/opt/korap/index-2020-03-30';
+my $netapp_index_dir = '/export/netapp/korap/korap-head-02/index-2020-03-30/index';
+
+# Iterate over the list of corpora
+while (defined(my $line = <STDIN>)) {
+
+  # Parse corpus. Lines that do not name a zip archive are skipped
+  # and do not count towards the package size.
+  next unless $line =~ m!\/([^\/]+?)\.zip\n*$!;
+  push @lines, $1;
+
+  # Write command as soon as the package is complete
+  if (@lines >= $package_size) {
+    write_command(\@lines);
+    @lines = ();
+  }
+}
+
+# Write the remaining, incomplete package
+write_command(\@lines) if @lines;
+
+
+# Print all commands for one package of corpora.
+# Expects an array reference of corpus names; nothing
+# is printed for an empty package.
+sub write_command {
+  my $lines = shift;
+  return unless @$lines;
+
+  print corpus($_) foreach @$lines;
+  to_log($lines->[0], $lines->[-1]);
+}
+
+
+# Return the command that unpacks the archive of a single corpus.
+# The command is sent to the background and ends with a line
+# continuation, so a further command has to follow.
+sub corpus {
+  my $corpus = shift;
+  return 'tar -C ' . $index_dir . '/json/ -xvf ' .
+    '/export/netapp/korap/krill-json/dereko-2021-1/' . $corpus . '.tar & \\'."\n";
+}
+
+
+# Print the commands that follow the unpacking of a package:
+# count the directory listing, index the files and reset the
+# json directory. $from and $to are the first and the last corpus
+# of the package and are only used to name the log file.
+sub to_log {
+  my ($from, $to) = @_;
+
+  # Terminates the chain of backgrounded tar commands
+  say "echo 'ok'\n";
+
+  say "ls -l $index_dir/json/ | wc -l\n";
+
+  say 'java -jar Krill-Indexer.jar \\';
+  say ' -c ' . $index_dir . '/kustvakt-lite.conf \\';
+  say ' -i ' . $index_dir . '/json \\';
+  say " -o $netapp_index_dir \\";
+  say ' &> ' . $index_dir . '/index.netapp.' . $from . '-' . $to . '.log &' . "\n";
+
+  # NOTE(review): the indexer above is started in the background,
+  # so the generated blocks are presumably meant to be run one
+  # at a time - confirm before running the output unattended.
+  say "rm -r $index_dir/json/";
+  say "mkdir $index_dir/json/\n";
+}
+
+__END__
+
+=pod
+
+=encoding utf8
+
+=head1 SYNOPSIS
+
+ $ cat corpuslist.txt | perl corpuslist_to_index > index_tasks.txt
+
+=cut
diff --git a/bin/corpuslist_to_korapxml2krill b/bin/corpuslist_to_korapxml2krill
new file mode 100644
index 0000000..a74dd9e
--- /dev/null
+++ b/bin/corpuslist_to_korapxml2krill
@@ -0,0 +1,87 @@
+#!/usr/bin/env perl
+use strict;
+use warnings;
+
+# Number of corpora converted by a single job
+my $package_size = 12;
+
+# Number of workers the jobs are assigned to in turn
+my $worker = 8;
+
+my $worker_c = 1;
+my $job_c = 1;
+my @lines = ();
+
+# Iterate over the list of corpora
+while (defined(my $line = <STDIN>)) {
+
+  # Parse corpus. Lines that do not name a zip archive are skipped
+  # and do not count towards the package size.
+  next unless $line =~ m!\/([^\/]+?)\.zip\n*$!;
+  push @lines, $1;
+
+  # Write command as soon as the package is complete
+  if (@lines >= $package_size) {
+    write_command(\@lines);
+    @lines = ();
+  }
+}
+
+# Write the remaining, incomplete package
+write_command(\@lines) if @lines;
+
+
+# Print the conversion job for one package of corpora, headed
+# by a comment line naming the job number and the worker.
+# Expects an array reference of corpus names; for an empty
+# package nothing is printed and no job number or worker is used up.
+sub write_command {
+  my $lines = shift;
+  return unless @$lines;
+
+  # Start over with the first worker after the last one
+  $worker_c = 1 if $worker_c > $worker;
+
+  print '# ' . $job_c++ . '. Worker ' . $worker_c . ' - 10.0.10.' . (52 + $worker_c) . "\n";
+  $worker_c++;
+
+  print init();
+  print corpus($_) foreach @$lines;
+  print to_log($lines->[0], $lines->[-1]);
+}
+
+
+# Initialize the command
+sub init {
+  return 'korapxml2krill serial \\' . "\n" .
+    ' -cfg \'/export/netapp/korap/krill-json/dereko-2021-1/dereko-2021-1.cfg\' \\' . "\n";
+}
+
+
+# Add a single corpus as an input pattern to the command
+sub corpus {
+  my $corpus = shift;
+  return ' -i "' . $corpus . '.*zip" \\' . "\n";
+}
+
+
+# End with log command: the output is redirected to a log file
+# named after the first and the last corpus of the package and
+# the command is sent to the background
+sub to_log {
+  my ($from, $to) = @_;
+  return ' &> "/opt/korap/process-dereko-2021-1/dereko-2021-1-' .
+    $from . '-' . $to . '.log" &' . "\n\n";
+}
+
+__END__
+
+=pod
+
+=encoding utf8
+
+=head1 SYNOPSIS
+
+ $ cat corpuslist.txt | perl corpuslist_to_korapxml2krill > conversion_tasks.txt
+
+=cut
diff --git a/bin/slim_korapxml2krill_log b/bin/slim_korapxml2krill_log
new file mode 100644
index 0000000..f0146cf
--- /dev/null
+++ b/bin/slim_korapxml2krill_log
@@ -0,0 +1,72 @@
+#!/usr/bin/env perl
+use Mojo::Base -strict;
+use Mojo::File qw'path';
+
+# Write a slimmed copy of a korapxml2krill log file next to the
+# original, with the suffix ".log" replaced by "-slim.log".
+
+die "Usage: $0 <logfile>\n" unless defined $ARGV[0];
+
+# Failures counted since the last "Done." line
+my ($unable, $unable_substring, $unable_offsets) = (0,0,0);
+
+# Open the log first, so a wrong file name does not leave
+# an empty slim file behind
+my $file = path($ARGV[0]);
+my $fh = $file->open('<');
+
+my $out_fh = path($file->dirname)->child(
+  $file->basename('.log') . '-slim.log'
+)->open('>');
+
+# Iterate over file
+while (defined(my $line = <$fh>)) {
+
+  # Drop progress messages, except for those containing ":1/"
+  next if $line =~ m! Processed! && $line !~ m!:1\/!;
+
+  # Count the failures instead of repeating them
+  if ($line =~ m! Unable to process !) {
+    $unable++;
+    next;
+  }
+  elsif ($line =~ m! Tokenization with failing offsets !) {
+    $unable_offsets++;
+    next;
+  }
+  elsif ($line =~ m! Unable to find substring !) {
+    $unable_substring++;
+    next;
+  }
+
+  # Summarize and reset the counted failures
+  if ($line =~ m!^Done\.$!) {
+    my $str = 'Done.';
+    $str .= ' [!Process: ' . $unable . ']' if $unable;
+    $str .= ' [!Offsets: ' . $unable_offsets . ']' if $unable_offsets;
+    $str .= ' [!Substring: ' . $unable_substring . ']' if $unable_substring;
+    ($unable, $unable_substring, $unable_offsets) = (0,0,0);
+    print $out_fh "## $str\n";
+    next;
+  }
+
+  # Drop further lines that are not wanted in the slim log
+  next if $line =~ m!substr outside of string!;
+  next if $line =~ m!with failing offsets!;
+  next if $line =~ m! in \/opt\/korap!;
+
+  print $out_fh $line;
+}
+
+$out_fh->close or die "Unable to write slim log: $!\n";
+$fh->close;
+
+__END__
+
+=pod
+
+=head1 SYNOPSIS
+
+ $ slim_korapxml2krill_log mylog.log
+
+=cut