blob: b0244ca662d08b6f7094dc70516a5a6c63f4d06d [file] [log] [blame]
#!/usr/bin/env perl
2use strict;
3use warnings;
4use File::Basename 'dirname';
5use File::Spec::Functions qw/catdir catfile/;
Akrond5bb4342017-06-19 11:50:49 +02006use File::Temp qw/:POSIX/;
Akron3ec0a1c2017-01-18 14:41:55 +01007use Mojo::File;
Akron821db3d2017-04-06 21:19:31 +02008use Mojo::Util qw/quote/;
Akron7d4cdd82016-08-17 21:39:45 +02009use Mojo::JSON qw/decode_json/;
10use IO::Uncompress::Gunzip;
11use Test::More;
Akron63d03ee2019-02-13 18:49:38 +010012use Test::Output qw/:stdout :stderr :combined :functions/;
Akron7d4cdd82016-08-17 21:39:45 +020013use Data::Dumper;
Nils Diewaldb3e9ccd2016-10-24 15:16:52 +020014use KorAP::XML::Archive;
Akron7d4cdd82016-08-17 21:39:45 +020015use utf8;
16
# Script tests can be switched off through the environment
plan skip_all => 'Skip script tests' if $ENV{SKIP_SCRIPT};

# Resolve the converter script relative to the location of this test file
my $f      = dirname(__FILE__);
my $script = catfile($f, '..', '..', 'script', 'korapxml2krill');

# Plain invocation of the "archive" command without any further arguments
my $call = join(' ', 'perl', $script, 'archive');

# Nothing below can run when the unzip check of KorAP::XML::Archive fails
plan skip_all => 'unzip not found' unless KorAP::XML::Archive::test_unzip();
33
# Test without parameters:
# STDOUT has to mention "archive" followed by a "$ korapxml2krill" line
stdout_like(
  sub { system($call) },
  qr!archive.+?\$ korapxml2krill!s,
  $call
);
42
# Zipped test corpus used as input
my $input = catfile($f, '..', 'corpus', 'archive.zip');
ok(-f $input, 'Input archive found');

# Temporary output directory; automatic cleanup is switched off
# so the directory outlives the File::Temp object
my $output = File::Temp->newdir(CLEANUP => 0);
$output->unlink_on_destroy(0);
ok(-d $output, 'Output directory exists');

# Name of a temporary file handed to the script as --cache
my $cache = tmpnam();
51
# Convert the zip archive using the options -t Base#tokens_aggr and -m Sgbr
$call = join(
  ' ',
  'perl', $script,
  'archive',
  '--input' => '' . $input,
  '--output' => $output,
  '--cache' => $cache,
  '-t' => 'Base#tokens_aggr',
  '-m' => 'Sgbr'
);

# Test without compression
my $json;
{
  # Silence warnings raised while the script is running
  local $SIG{__WARN__} = sub {};
  my $out = stdout_from(sub { system($call); });

  like($out, qr!TEST-BSP-1\.json!s, $call);

  # Take the first reported file name straight from the match.
  # In list context a failed match assigns undef instead of
  # leaving a stale $1 from some earlier match in $json.
  ($json) = $out =~ m!Processed (.+?\.json)!;
};

# Guard with defined() so a missing "Processed" line fails the test
# cleanly instead of warning about an uninitialized value in -f
ok(defined($json) && -f $json, 'Json file exists');
ok((my $file = Mojo::File->new($json)->slurp), 'Slurp data');

# From here on $json holds the decoded document instead of the path
ok(($json = decode_json $file), 'decode json');

is($json->{data}->{tokenSource}, 'base#tokens_aggr', 'Title');
is($json->{data}->{foundries}, 'base base/paragraphs base/sentences dereko dereko/structure sgbr sgbr/lemma sgbr/morpho', 'Foundries');
is($json->{sgbrKodex}, 'M', 'Kodex meta data');
82
Akron3ec48972016-08-17 23:24:52 +020083
# Use directory
$input = catdir($f, '..', 'annotation', 'corpus');

$call = join(
  ' ',
  'perl', $script,
  'archive',
  '--input' => $input,
  '--output' => $output,
  '--cache' => $cache,
  '-t' => 'Tree_Tagger#Tokens',
  '-j' => 4 # 4 jobs!
);

# Filled inside the block below: first the file path, then the decoded JSON
my ($json_1, $json_2);

{
  # Silence warnings raised while the script is running
  local $SIG{__WARN__} = sub {};

  # That's not really stable on slow machines!
  my $out = stdout_from(sub { system($call); });

  # The output carries markers of the form "[$<pid>:<n>/3]".
  # Capture each PID directly from its own match, so a failed match
  # yields undef rather than the stale $1 of the previous match.
  my ($pid1) = $out =~ m!\[\$(\d+?):1\/3\]!s;
  ok(defined $pid1, $call . ' pid 1');
  my ($pid2) = $out =~ m!\[\$(\d+?):2\/3\]!s;
  ok(defined $pid2, $call . ' pid 2');
  my ($pid3) = $out =~ m!\[\$(\d+?):3\/3\]!s;
  ok(defined $pid3, $call . ' pid 3');

  # Each of the three documents has to be reported with a different PID
  isnt($pid1, $pid2, 'No PID match');
  isnt($pid2, $pid3, 'No PID match');
  isnt($pid1, $pid3, 'No PID match');

  # like() prints the unmatched output on failure, unlike ok()
  like($out, qr!Processed .+?\/corpus-doc-0001\.json!s, $call);
  like($out, qr!Processed .+?\/corpus-doc-0002\.json!s, $call);
  like($out, qr!Processed .+?\/corpus-doc-0003\.json!s, $call);

  ok(-d $output, 'Temporary directory still exists');

  # Assign to the variables declared above instead of shadowing them
  $json_1 = catfile($output, 'corpus-doc-0001.json');
  ok(-f $json_1, 'Json file exists 1');
  $json_2 = catfile($output, 'corpus-doc-0002.json');
  ok(-f $json_2, 'Json file exists 2');

  ok(($file = Mojo::File->new($json_1)->slurp), 'Slurp data');
  ok(($json_1 = decode_json $file), 'decode json');

  is($json_1->{data}->{tokenSource}, 'tree_tagger#tokens', 'TokenSource');
  is($json_1->{data}->{foundries}, 'base base/paragraphs base/sentences connexor connexor/morpho connexor/phrase connexor/sentences connexor/syntax corenlp corenlp/constituency corenlp/morpho corenlp/sentences dereko dereko/structure glemm glemm/morpho mate mate/dependency mate/morpho opennlp opennlp/morpho opennlp/sentences spacy spacy/morpho treetagger treetagger/morpho treetagger/sentences xip xip/constituency xip/morpho xip/sentences', 'Foundries');
  is($json_1->{textSigle}, 'Corpus/Doc/0001', 'Sigle');

  ok(-f $json_2, 'Json file exists');
  ok(($file = Mojo::File->new($json_2)->slurp), 'Slurp data');
  ok(($json_2 = decode_json $file), 'decode json');

  is($json_2->{data}->{tokenSource}, 'tree_tagger#tokens', 'TokenSource');
  is($json_2->{data}->{foundries}, 'base base/paragraphs base/sentences dereko dereko/structure malt malt/dependency treetagger treetagger/morpho treetagger/sentences', 'Foundries');
  is($json_2->{textSigle}, 'Corpus/Doc/0002', 'Sigle');
};

ok(-d $output, 'Ouput directory exists');
Akron89df4fa2016-11-04 14:35:37 +0100144
145
# Ignore -te when archive is a directory
my $temp_extract = tmpnam();

# Same options as the directory run, plus a temporary extraction path
my @te_options = (
  '--input' => $input,
  '--output' => $output,
  '--cache' => $cache,
  '-t' => 'Tree_Tagger#Tokens',
  '-j' => 4, # 4 jobs!
  '-te' => $temp_extract
);

$call = join(' ', 'perl', $script, 'archive', @te_options);

{
  # Silence warnings raised while the script is running
  local $SIG{__WARN__} = sub {};

  # STDOUT and STDERR are captured together here
  my $out = combined_from(sub { system($call); });

  # The documents still have to be reported as processed
  ok($out =~ m!Processed .+?\/corpus-doc-0001\.json!s, $call);
  ok($out =~ m!Processed .+?\/corpus-doc-0002\.json!s, $call);
};
169
170
# Convert the archive_quotes.zip fixture
my $input_quotes = catfile($f, '..', 'corpus', 'archive_quotes.zip');

my @quote_options = (
  '--input' => $input_quotes,
  '--output' => $output,
  '--cache' => $cache,
  '-t' => 'Base#tokens_aggr'
);

$call = join(' ', 'perl', $script, 'archive', @quote_options);

# The run has to report "Done." (matched case-insensitively) on STDOUT
stdout_like(
  sub { system($call) },
  qr!Done\.!is,
  $call
);
190
Akron89df4fa2016-11-04 14:35:37 +0100191
# NOTE(review): $output is a directory and unlink() normally does not
# remove directories, so this call presumably has no effect - confirm intent
unlink($output);

# Wildcard input: the pattern is wrapped in single quotes, so the script
# receives the pattern itself (assuming a POSIX-like shell runs the command)
$input_quotes = q{'} . catfile($f, '..', 'corpus', 'archives', 'wpd15*.zip') . q{'};

my @wildcard_options = (
  '--input' => $input_quotes,
  '--output' => $output,
  '--cache' => $cache,
  '-t' => 'Base#tokens_aggr'
);

$call = join(' ', 'perl', $script, 'archive', @wildcard_options);

# The script has to list all six wpd15-single archives as its input
stdout_like(
  sub { system($call) },
  qr!Input is .+?wpd15-single\.zip,.+?wpd15-single\.malt\.zip,.+?wpd15-single\.corenlp\.zip,.+?wpd15-single\.opennlp\.zip,.+?wpd15-single\.mdparser\.zip,.+?wpd15-single\.tree_tagger\.zip!is,
  $call
);
214
Akron31a08cb2019-02-20 20:43:26 +0100215
# Test with sigles
$input = catfile($f, '..', 'corpus', 'archive.zip');
ok(-f $input, 'Input archive found');

# NOTE(review): $output is a directory and unlink() normally does not
# remove directories, so this call presumably has no effect - confirm intent
unlink($output);

# Pass --sigle twice (TEST/BSP/2 and TEST/BSP/5)
$call = join(
  ' ',
  'perl', $script,
  'archive',
  '--input' => '' . $input,
  '--output' => $output,
  '--sigle' => 'TEST/BSP/2',
  '--sigle' => 'TEST/BSP/5',
  '-t' => 'Base#tokens_aggr',
  '-m' => 'Sgbr'
);

{
  # Silence warnings raised while the script is running
  local $SIG{__WARN__} = sub {};
  my $out = stdout_from(sub { system($call); });

  like($out, qr!TEST-BSP-1\.json!s, $call);

  # Take the first reported file name straight from the match.
  # In list context a failed match assigns undef instead of
  # leaving a stale $1 from some earlier match in $json.
  ($json) = $out =~ m!Processed (.+?\.json)!;
};

# Guard with defined() so a missing "Processed" line fails the test
# cleanly instead of warning about an uninitialized value in -f
ok(defined($json) && -f $json, 'Json file exists');
245
# Test quiet
# $input is reused as it was last assigned earlier in this file

my @quiet_options = (
  '--input' => '' . $input,
  '--quiet',
  '--output' => $output,
  '--sigle' => 'TEST/BSP/2',
  '--sigle' => 'TEST/BSP/5',
  '-t' => 'Base#tokens_aggr',
  '-m' => 'Sgbr'
);

$call = join(' ', 'perl', $script, 'archive', @quiet_options);

{
  # Silence warnings raised while the script is running
  local $SIG{__WARN__} = sub {};
  my $out = stdout_from(sub { system($call); });

  # With --quiet only a single newline is expected on STDOUT
  is($out, "\n", $call);
};

done_testing;
__END__