blob: db4f856481c20bdf1f569ea247105b57b482fb16 [file] [log] [blame]
#/usr/bin/env perl
use strict;
use warnings;
use File::Basename 'dirname';
use File::Spec::Functions qw/catdir catfile/;
use File::Temp qw/:POSIX/;
use Mojo::File;
use Mojo::Util qw/quote/;
use Mojo::JSON qw/decode_json/;
use IO::Uncompress::Gunzip;
use Test::More;
use Test::Output qw/:stdout :stderr :functions/;
use Data::Dumper;
use KorAP::XML::Archive;
use utf8;
my $f = dirname(__FILE__);
my $script = catfile($f, '..', '..', 'script', 'korapxml2krill');
my $call = join(
' ',
'perl', $script,
'archive'
);
unless (KorAP::XML::Archive::test_unzip) {
plan skip_all => 'unzip not found';
};
# Test without parameters
stdout_like(
sub {
system($call);
},
qr!archive.+?\$ korapxml2krill!s,
$call
);
my $input = catfile($f, '..', 'corpus', 'archive.zip');
ok(-f $input, 'Input archive found');
my $output = File::Temp->newdir(CLEANUP => 0);
$output->unlink_on_destroy(0);
my $cache = tmpnam();
ok(-d $output, 'Output directory exists');
$call = join(
' ',
'perl', $script,
'archive',
'--input' => '' . $input,
'--output' => $output,
'--cache' => $cache,
'-t' => 'Base#tokens_aggr',
'-m' => 'Sgbr'
);
# Test without compression
my $json;
{
local $SIG{__WARN__} = sub {};
my $out = stdout_from(sub { system($call); });
like($out, qr!TEST-BSP-1\.json!s, $call);
$out =~ m!Processed (.+?\.json)!;
$json = $1;
};
ok(-f $json, 'Json file exists');
ok((my $file = Mojo::File->new($json)->slurp), 'Slurp data');
ok(($json = decode_json $file), 'decode json');
is($json->{data}->{tokenSource}, 'base#tokens_aggr', 'Title');
is($json->{data}->{foundries}, 'base base/paragraphs base/sentences dereko dereko/structure sgbr sgbr/lemma sgbr/morpho', 'Foundries');
is($json->{sgbrKodex}, 'M', 'Kodex meta data');
# Use directory
$input = catdir($f, '..', 'annotation', 'corpus');
$call = join(
' ',
'perl', $script,
'archive',
'--input' => $input,
'--output' => $output,
'--cache' => $cache,
'-t' => 'Tree_Tagger#Tokens',
'-j' => 4 # 4 jobs!
);
my ($json_1, $json_2);
{
local $SIG{__WARN__} = sub {};
# That's not really stable on slow machines!
my $out = stdout_from(sub { system($call); });
ok($out =~ m!\[\$(\d+?):1\/2\]!s, $call . ' pid 1');
my $pid1 = $1;
ok($out =~ m!\[\$(\d+?):2\/2\]!s, $call . ' pid 2');
my $pid2 = $1;
isnt($pid1, $pid2, 'No PID match');
ok($out =~ m!Processed .+?\/corpus-doc-0001\.json!s, $call);
ok($out =~ m!Processed .+?\/corpus-doc-0002\.json!s, $call);
ok(-d $output, 'Temporary directory still exists');
my $json_1 = catfile($output, 'corpus-doc-0001.json');
ok(-f $json_1, 'Json file exists 1');
my $json_2 = catfile($output, 'corpus-doc-0002.json');
ok(-f $json_2, 'Json file exists 2');
ok(($file = Mojo::File->new($json_1)->slurp), 'Slurp data');
ok(($json_1 = decode_json $file), 'decode json');
is($json_1->{data}->{tokenSource}, 'tree_tagger#tokens', 'TokenSource');
is($json_1->{data}->{foundries}, 'base base/paragraphs base/sentences connexor connexor/morpho connexor/phrase connexor/sentences connexor/syntax corenlp corenlp/constituency corenlp/morpho corenlp/sentences dereko dereko/structure glemm glemm/morpho mate mate/dependency mate/morpho opennlp opennlp/morpho opennlp/sentences treetagger treetagger/morpho treetagger/sentences xip xip/constituency xip/morpho xip/sentences', 'Foundries');
is($json_1->{textSigle}, 'Corpus/Doc/0001', 'Sigle');
ok(-f $json_2, 'Json file exists');
ok(($file = Mojo::File->new($json_2)->slurp), 'Slurp data');
ok(($json_2 = decode_json $file), 'decode json');
is($json_2->{data}->{tokenSource}, 'tree_tagger#tokens', 'TokenSource');
is($json_2->{data}->{foundries}, 'base base/paragraphs base/sentences dereko dereko/structure malt malt/dependency treetagger treetagger/morpho treetagger/sentences', 'Foundries');
is($json_2->{textSigle}, 'Corpus/Doc/0002', 'Sigle');
};
ok(-d $output, 'Ouput directory exists');
$input = catfile($f, '..', 'corpus', 'WDD15', 'A79', '83946');
$call = join(
' ',
'perl', $script,
'--input' => $input,
'--cache' => $cache
);
# Test without compression
{
local $SIG{__WARN__} = sub {};
my $out = stderr_from(sub { system($call); });
like($out, qr!no base tokenization!s, $call);
};
my $input_quotes = catfile($f, '..', 'corpus', 'archive_quotes.zip');
$call = join(
' ',
'perl', $script,
'archive',
'--input' => $input_quotes,
'--output' => $output,
'--cache' => $cache,
'-t' => 'Base#tokens_aggr'
);
# Test without parameters
stdout_like(
sub {
system($call);
},
qr!Done\.!is,
$call
);
unlink($output);
$input_quotes = "'".catfile($f, '..', 'corpus', 'archives', 'wpd15*.zip') . "'";
$call = join(
' ',
'perl', $script,
'archive',
'--input' => $input_quotes,
'--output' => $output,
'--cache' => $cache,
'-t' => 'Base#tokens_aggr'
);
# Test without parameters
stdout_like(
sub {
system($call);
},
qr!Input is .+?wpd15-single\.zip,.+?wpd15-single\.malt\.zip,.+?wpd15-single\.corenlp\.zip,.+?wpd15-single\.opennlp\.zip,.+?wpd15-single\.mdparser\.zip,.+?wpd15-single\.tree_tagger\.zip!is,
$call
);
done_testing;
__END__