blob: 69546c09a567c6bf75d2d33bd616043c642c2b02 [file] [log] [blame]
#!/usr/bin/env perl
2use strict;
3use warnings;
4use File::Basename 'dirname';
5use File::Spec::Functions qw/catdir catfile/;
6use File::Temp qw/tempdir/;
Akron3ec0a1c2017-01-18 14:41:55 +01007use Mojo::File;
Akron821db3d2017-04-06 21:19:31 +02008use Mojo::Util qw/quote/;
Akron7d4cdd82016-08-17 21:39:45 +02009use Mojo::JSON qw/decode_json/;
10use IO::Uncompress::Gunzip;
11use Test::More;
12use Test::Output qw/:stdout :stderr :functions/;
13use Data::Dumper;
Nils Diewaldb3e9ccd2016-10-24 15:16:52 +020014use KorAP::XML::Archive;
Akron7d4cdd82016-08-17 21:39:45 +020015use utf8;
16
# Directory of this test file; the script under test lives two levels up
my $f      = dirname(__FILE__);
my $script = catfile($f, qw/.. .. script korapxml2krill/);

# Base command line: the "archive" command without any options
my $call = "perl $script archive";

# Skip the entire test file if the unzip check fails
plan(skip_all => 'unzip not found')
  unless KorAP::XML::Archive::test_unzip();
29
# Test without parameters: the output has to mention "archive"
# followed by an example invocation ("$ korapxml2krill")
stdout_like { system($call) } qr!archive.+?\$ korapxml2krill!s, $call;
38
# Zipped test corpus used as input for the first conversion
my $input = catfile($f, qw/.. corpus archive.zip/);
ok(-f $input, 'Input archive found');

# Temporary output directory, created with automatic removal switched off
my $output = File::Temp->newdir(CLEANUP => 0);
$output->unlink_on_destroy(0);
ok(-d $output, 'Output directory exists');
46
# Convert the zipped archive, using the aggregated base tokenization (-t)
# and the Sgbr meta option (-m)
$call = join(
  ' ',
  'perl', $script,
  'archive',
  '--input' => '' . $input,
  '--output' => $output,
  '-t' => 'Base#tokens_aggr',
  '-m' => 'Sgbr'
);

# Test without compression
my $json;
{
  # Swallow Perl warnings raised in this process while the script runs
  local $SIG{__WARN__} = sub {};
  my $out = stdout_from { system($call) };

  like($out, qr!TEST-BSP-1\.json!s, $call);

  # Capture in list context: if the output contains no "Processed" line,
  # $json stays undefined instead of receiving a stale $1
  ($json) = $out =~ m!Processed (.+?\.json)!;
};

# The path reported by the script has to point to an existing file
ok(defined $json && -f $json, 'Json file exists');

# $file is declared at file scope on purpose - later tests reuse it
ok((my $file = Mojo::File->new($json)->slurp), 'Slurp data');

# From here on $json holds the decoded document instead of the file name
ok(($json = decode_json $file), 'decode json');

is($json->{data}->{tokenSource}, 'base#tokens_aggr', 'Title');
is($json->{data}->{foundries}, 'base base/paragraphs base/sentences dereko dereko/structure sgbr sgbr/lemma sgbr/morpho', 'Foundries');
is($json->{sgbrKodex}, 'M', 'Kodex meta data');
76
Akron3ec48972016-08-17 23:24:52 +020077
# Use directory: convert an unpacked corpus directory with several jobs
$input = catdir($f, '..', 'annotation', 'corpus');

$call = join(
  ' ',
  'perl', $script,
  'archive',
  '--input' => $input,
  '--output' => $output,
  '-t' => 'Tree_Tagger#Tokens',
  '-j' => 4 # 4 jobs!
);

{
  # Swallow Perl warnings raised in this process while the script runs
  local $SIG{__WARN__} = sub {};

  # That's not really stable on slow machines!
  my $out = stdout_from { system($call) };

  # The output is expected to contain one "[$<pid>:<n>/2]" marker per text.
  # Capturing in list context keeps a failed second match from silently
  # reusing the PID captured by the first one.
  my ($pid1) = $out =~ m!\[\$(\d+?):1\/2\]!s;
  ok(defined $pid1, $call . ' pid 1');
  my ($pid2) = $out =~ m!\[\$(\d+?):2\/2\]!s;
  ok(defined $pid2, $call . ' pid 2');

  # Both texts have to be reported with different process ids
  isnt($pid1, $pid2, 'No PID match');

  like($out, qr!Processed .+?\/corpus-doc-0001\.json!s, $call);
  like($out, qr!Processed .+?\/corpus-doc-0002\.json!s, $call);

  # Both result files have to exist in the output directory
  ok(-d $output, 'Temporary directory still exists');
  my $json_1 = catfile($output, 'corpus-doc-0001.json');
  ok(-f $json_1, 'Json file exists 1');
  my $json_2 = catfile($output, 'corpus-doc-0002.json');
  ok(-f $json_2, 'Json file exists 2');

  # Raw file content; $json_1 and $json_2 are replaced by the decoded data
  my $file;

  # First document
  ok(($file = Mojo::File->new($json_1)->slurp), 'Slurp data');
  ok(($json_1 = decode_json $file), 'decode json');

  is($json_1->{data}->{tokenSource}, 'tree_tagger#tokens', 'TokenSource');
  is($json_1->{data}->{foundries}, 'base base/paragraphs base/sentences connexor connexor/morpho connexor/phrase connexor/sentences connexor/syntax corenlp corenlp/constituency corenlp/morpho corenlp/sentences dereko dereko/structure glemm glemm/morpho mate mate/dependency mate/morpho opennlp opennlp/morpho opennlp/sentences treetagger treetagger/morpho treetagger/sentences xip xip/constituency xip/morpho xip/sentences', 'Foundries');
  is($json_1->{textSigle}, 'Corpus/Doc/0001', 'Sigle');

  # Second document ($json_2 still holds the file name at this point)
  ok(-f $json_2, 'Json file exists');
  ok(($file = Mojo::File->new($json_2)->slurp), 'Slurp data');
  ok(($json_2 = decode_json $file), 'decode json');

  is($json_2->{data}->{tokenSource}, 'tree_tagger#tokens', 'TokenSource');
  is($json_2->{data}->{foundries}, 'base base/paragraphs base/sentences dereko dereko/structure malt malt/dependency treetagger treetagger/morpho treetagger/sentences', 'Foundries');
  is($json_2->{textSigle}, 'Corpus/Doc/0002', 'Sigle');
};

ok(-d $output, 'Ouput directory exists');
Akron89df4fa2016-11-04 14:35:37 +0100132
133
# Call the script on a single text directory, without a command name
# and without an output option
$input = catfile($f, qw/.. corpus WDD15 A79 83946/);
$call  = "perl $script --input $input";

# The run has to complain on STDERR about the missing base tokenization
{
  local $SIG{__WARN__} = sub {};
  my $stderr = stderr_from { system($call) };

  like($stderr, qr!no base tokenization!s, $call);
};
148
# Second test archive (presumably containing quote characters in its
# file names, judging by its name - TODO confirm)
my $input_quotes = catfile($f, qw/.. corpus archive_quotes.zip/);
$call = "perl $script archive --input $input_quotes --output $output -t Base#tokens_aggr";

# The conversion has to run through and report "Done."
stdout_like { system($call) } qr!Done\.!is, $call;
167
Akron89df4fa2016-11-04 14:35:37 +0100168
# NOTE: the output directory is not removed at this point. unlink() does
# not delete directories, and the test below still writes to $output.
# Cleanup happens after the last test instead.

# Input given as a wildcard pattern. It is wrapped in single quotes,
# presumably so that the shell passes the pattern on unexpanded.
$input_quotes = "'".catfile($f, '..', 'corpus', 'archives', 'wpd15*.zip') . "'";

$call = join(
  ' ',
  'perl', $script,
  'archive',
  '--input' => $input_quotes,
  '--output' => $output,
  '-t' => 'Base#tokens_aggr'
);

# Test with a wildcard: all matching archives have to be listed as input
stdout_like(
  sub {
    system($call);
  },
  qr!Input is .+?wpd15-single\.zip,.+?wpd15-single\.malt\.zip,.+?wpd15-single\.corenlp\.zip,.+?wpd15-single\.opennlp\.zip,.+?wpd15-single\.mdparser\.zip,.+?wpd15-single\.tree_tagger\.zip!is,
  $call
);

# Remove the temporary output directory now that no test needs it anymore:
# re-enable File::Temp's cleanup and release the object to trigger it
$output->unlink_on_destroy(1);
undef $output;

done_testing;
__END__