#!/usr/bin/env perl
2use strict;
3use warnings;
4use File::Basename 'dirname';
5use File::Spec::Functions qw/catdir catfile/;
6use File::Temp qw/tempdir/;
Akron3ec0a1c2017-01-18 14:41:55 +01007use Mojo::File;
Akron7d4cdd82016-08-17 21:39:45 +02008use Mojo::JSON qw/decode_json/;
9use IO::Uncompress::Gunzip;
10use Test::More;
11use Test::Output qw/:stdout :stderr :functions/;
12use Data::Dumper;
Nils Diewaldb3e9ccd2016-10-24 15:16:52 +020013use KorAP::XML::Archive;
Akron7d4cdd82016-08-17 21:39:45 +020014use utf8;
15
# Resolve the converter script relative to the directory of this test file
my $f = dirname(__FILE__);
my $script = catfile($f, '..', '..', 'script', 'korapxml2krill');

# Command line for the bare "archive" command, without any options
my $call = "perl $script archive";

# Skip the whole file when KorAP::XML::Archive reports
# that unzip is not available
plan skip_all => 'unzip not found'
  unless KorAP::XML::Archive::test_unzip();
28
# Test without parameters:
# stdout of the bare "archive" call has to contain "archive" followed
# later by "$ korapxml2krill" (presumably the usage text - confirm
# against the script)
stdout_like { system($call) }
  qr!archive.+?\$ korapxml2krill!s,
  $call;
37
my $input = catfile($f, '..', 'corpus', 'archive.zip');
ok(-f $input, 'Input archive found');

# The output directory is shared by all following conversions,
# so automatic removal on object destruction is switched off
my $output = File::Temp->newdir(CLEANUP => 0);
$output->unlink_on_destroy(0);

ok(-d $output, 'Output directory exists');

$call = join(
  ' ',
  'perl', $script,
  'archive',
  '--input' => $input,
  '--output' => $output,
  '-t' => 'Base#tokens_aggr',
  '-m' => 'Sgbr'
);

# Test without compression
my $json;
{
  # Silence warnings while the external script runs
  local $SIG{__WARN__} = sub {};
  my $out = stdout_from(sub { system($call); });

  like($out, qr!TEST-BSP-1\.json!s, $call);

  # Capture in list context: on a failed match $json stays undefined
  # instead of picking up whatever $1 happened to contain
  ($json) = $out =~ m!Processed (.+?\.json)!;
};

ok(defined $json && -f $json, 'Json file exists');

# $file stays declared at file scope, as later parts of the script reuse it
my $file;

SKIP: {
  # Without a readable file, slurping would abort the whole script
  # instead of reporting a test failure
  skip 'No converted JSON file to inspect', 5
    unless defined $json && -f $json;

  ok(($file = Mojo::File->new($json)->slurp), 'Slurp data');

  # From here on $json holds the decoded document, not the file name
  ok(($json = decode_json $file), 'decode json');

  is($json->{data}->{tokenSource}, 'base#tokens_aggr', 'TokenSource');
  is($json->{data}->{foundries}, 'base base/paragraphs base/sentences dereko dereko/structure sgbr sgbr/lemma sgbr/morpho', 'Foundries');
  is($json->{sgbrKodex}, 'M', 'Kodex meta data');
};
75
Akron3ec48972016-08-17 23:24:52 +020076
# Use directory
$input = catdir($f, '..', 'annotation', 'corpus');

$call = join(
  ' ',
  'perl', $script,
  'archive',
  '--input' => $input,
  '--output' => $output,
  '-t' => 'Tree_Tagger#Tokens',
  '-j' => 4 # 4 jobs!
);

{
  # Silence warnings while the external script runs
  local $SIG{__WARN__} = sub {};

  # That's not really stable on slow machines!
  my $out = stdout_from(sub { system($call); });

  # Capture the process ids in list context, so a failed match yields
  # undef instead of a stale $1 from the previous successful match
  my ($pid1) = $out =~ m!\[\$(\d+?):1\/2\]!s;
  ok(defined $pid1, $call . ' pid 1');
  my ($pid2) = $out =~ m!\[\$(\d+?):2\/2\]!s;
  ok(defined $pid2, $call . ' pid 2');

  isnt($pid1, $pid2, 'No PID match');

  like($out, qr!Processed .+?\/corpus-doc-0001\.json!s, $call);
  like($out, qr!Processed .+?\/corpus-doc-0002\.json!s, $call);

  ok(-d $output, 'Temporary directory still exists');

  # Paths of the expected result files; the decoded documents are kept
  # in separate variables so a path is never confused with its content
  my $path_1 = catfile($output, 'corpus-doc-0001.json');
  ok(-f $path_1, 'Json file exists 1');
  my $path_2 = catfile($output, 'corpus-doc-0002.json');
  ok(-f $path_2, 'Json file exists 2');

  # First document
  my $content;
  ok(($content = Mojo::File->new($path_1)->slurp), 'Slurp data');
  my $json_1;
  ok(($json_1 = decode_json $content), 'decode json');

  is($json_1->{data}->{tokenSource}, 'tree_tagger#tokens', 'TokenSource');
  is($json_1->{data}->{foundries}, 'base base/paragraphs base/sentences connexor connexor/morpho connexor/phrase connexor/sentences connexor/syntax corenlp corenlp/constituency corenlp/morpho corenlp/sentences dereko dereko/structure glemm glemm/morpho mate mate/dependency mate/morpho opennlp opennlp/morpho opennlp/sentences treetagger treetagger/morpho treetagger/sentences xip xip/constituency xip/morpho xip/sentences', 'Foundries');
  is($json_1->{textSigle}, 'Corpus/Doc/0001', 'Sigle');

  # Second document
  ok(-f $path_2, 'Json file exists');
  ok(($content = Mojo::File->new($path_2)->slurp), 'Slurp data');
  my $json_2;
  ok(($json_2 = decode_json $content), 'decode json');

  is($json_2->{data}->{tokenSource}, 'tree_tagger#tokens', 'TokenSource');
  is($json_2->{data}->{foundries}, 'base base/paragraphs base/sentences dereko dereko/structure malt malt/dependency treetagger treetagger/morpho treetagger/sentences', 'Foundries');
  is($json_2->{textSigle}, 'Corpus/Doc/0002', 'Sigle');
};

ok(-d $output, 'Ouput directory exists');
Akron89df4fa2016-11-04 14:35:37 +0100131
132
$input = catfile($f, '..', 'corpus', 'WDD15', 'A79', '83946');

# Call the script with nothing but an --input path (no subcommand)
$call = "perl $script --input $input";

# The missing base tokenization has to be reported on stderr
{
  local $SIG{__WARN__} = sub {};
  my $err = stderr_from { system($call) };

  like($err, qr!no base tokenization!s, $call);
}
147
# Convert the archive_quotes.zip fixture into the shared output directory
my $input_quotes = catfile($f, '..', 'corpus', 'archive_quotes.zip');
$call = join ' ',
  'perl', $script, 'archive',
  '--input', $input_quotes,
  '--output', $output,
  '-t', 'Base#tokens_aggr';

# The run has to report "Done." on stdout (matched case-insensitively)
stdout_like { system($call) }
  qr!Done\.!is,
  $call;
166
Akron89df4fa2016-11-04 14:35:37 +0100167
# Remove the temporary output directory.
# unlink() only deletes files and fails on directories, so the tree is
# removed explicitly (automatic cleanup was disabled on creation).
require File::Path;
File::Path::remove_tree("$output");

done_testing;
__END__
171__END__