blob: 0aec043ca41f0c7fe4b3d470a4f5ad666eadc60a [file] [log] [blame]
#!/usr/bin/env perl
2use strict;
3use warnings;
4use File::Basename 'dirname';
5use File::Spec::Functions qw/catdir catfile/;
6use File::Temp qw/ :POSIX /;
7use Mojo::Util qw/slurp/;
8use Mojo::JSON qw/decode_json/;
9use IO::Uncompress::Gunzip;
10use Test::More;
Akron5f51d422016-08-16 16:26:43 +020011use Test::Output;
12use Data::Dumper;
Akrone1dbc382016-07-08 22:24:52 +020013
# Locate the converter script and the example document relative to
# the directory of this test file.
my $f = dirname(__FILE__);
my $script = catfile($f, '..', '..', 'script', 'korapxml2krill');

my $input = catdir($f, '..', 'annotation', 'corpus', 'doc', '0001');
ok(-d $input, 'Input directory found');

# tmpnam() only hands back a file name; the file must not exist yet.
my $output = tmpnam();

ok(!-f $output, 'Output does not exist');

# Command line for the first run.
# $^X is used instead of a bare 'perl', so the script is executed by
# the same interpreter that runs this test rather than by whichever
# perl happens to be first in PATH.
my $call = join(
  ' ',
  $^X, $script,
  '--input' => $input,
  '--output' => $output,
  '-t' => 'OpenNLP#Tokens',
  '-l' => 'INFO'
);
32
# Test without compression:
# run the script and expect its timing message on STDERR
stderr_like(
  sub {
    system($call);
  },
  qr!The code took!,
  $call
);

# The output file is read back and decoded as JSON
ok(-f $output, 'Output does exist');
ok((my $file = slurp $output), 'Slurp data');
ok((my $json = decode_json $file), 'decode json');

# Check metadata, token source, foundries, primary text and first
# stream entry. Each assertion is labelled after the field it checks.
is($json->{textType}, 'Zeitung: Tageszeitung', 'text type');
is($json->{title}, 'Beispiel Text', 'Title');
is($json->{data}->{tokenSource}, 'opennlp#tokens', 'TokenSource');
is($json->{data}->{foundries}, 'base base/paragraphs base/sentences connexor connexor/morpho connexor/phrase connexor/sentences connexor/syntax corenlp corenlp/constituency corenlp/morpho corenlp/sentences dereko dereko/structure glemm glemm/morpho mate mate/dependency mate/morpho opennlp opennlp/morpho opennlp/sentences treetagger treetagger/morpho treetagger/sentences xip xip/constituency xip/morpho xip/sentences', 'Foundries');
like($json->{data}->{text}, qr/^Zum letzten kulturellen/, 'Text');
is($json->{data}->{stream}->[0]->[0], '-:base/paragraphs$<i>1', 'Paragraphs');
Akrone1dbc382016-07-08 22:24:52 +020052
# Delete output of the previous run
unlink $output;
ok(!-f $output, 'Output does not exist');

# Same call as before, with the -z flag appended
$call .= ' -z';

# Test with compression
stderr_like(
  sub { system($call); },
  qr!The code took!,
  $call
);

ok(-f $output, 'Output does exist');

# Uncompress the data using a buffer.
# Transparent => 0 makes the constructor fail on input that is not
# gzip, instead of passing the raw bytes through.
my $gz = IO::Uncompress::Gunzip->new($output, Transparent => 0);
ok($gz, 'Output is gzip compressed')
  or die "Unable to gunzip $output: $IO::Uncompress::Gunzip::GunzipError";

# read() returns the number of bytes read, 0 at the end of the data
# and a negative value on error - so only continue on positive values,
# otherwise a read error would never terminate the loop.
$file = '';
my ($buffer, $status);
while (($status = $gz->read($buffer)) > 0) {
  $file .= $buffer;
}
is($status, 0, 'Compressed data read completely')
  or diag "gunzip failed: $IO::Uncompress::Gunzip::GunzipError";
$gz->close;

ok($json = decode_json($file), 'decode json');

# The decoded document has to match the uncompressed run
is($json->{textType}, 'Zeitung: Tageszeitung', 'text type');
is($json->{title}, 'Beispiel Text', 'Title');
is($json->{data}->{tokenSource}, 'opennlp#tokens', 'TokenSource');
is($json->{data}->{foundries}, 'base base/paragraphs base/sentences connexor connexor/morpho connexor/phrase connexor/sentences connexor/syntax corenlp corenlp/constituency corenlp/morpho corenlp/sentences dereko dereko/structure glemm glemm/morpho mate mate/dependency mate/morpho opennlp opennlp/morpho opennlp/sentences treetagger treetagger/morpho treetagger/sentences xip xip/constituency xip/morpho xip/sentences', 'Foundries');
like($json->{data}->{text}, qr/^Zum letzten kulturellen/, 'Text');
is($json->{data}->{stream}->[0]->[0], '-:base/paragraphs$<i>1', 'Paragraphs');
83
# Delete output of the previous run
unlink $output;
ok(!-f $output, 'Output does not exist');

# Use a different token source and skip all annotations,
# except for DeReKo#Structure and Mate#Dependency.
# $^X runs the script with the interpreter that executes this test.
$call = join(
  ' ',
  $^X, $script,
  '--input' => $input,
  '--output' => $output,
  '-t' => 'CoreNLP#Tokens',
  '-s' => '#all',
  '-a' => 'DeReKo#Structure',
  '-a' => 'Mate#Dependency',
  '-l' => 'INFO'
);

stderr_like(
  sub {
    system($call);
  },
  qr!The code took!,
  $call
);

ok(-f $output, 'Output does exist');
ok(($file = slurp $output), 'Slurp data');
ok(($json = decode_json $file), 'decode json');

is($json->{textType}, 'Zeitung: Tageszeitung', 'text type');

is($json->{title}, 'Beispiel Text', 'Title');
is($json->{data}->{tokenSource}, 'corenlp#tokens', 'TokenSource');

# Only the explicitly requested foundries are expected
is($json->{data}->{foundries}, 'dereko dereko/structure mate mate/dependency', 'Foundries');

like($json->{data}->{text}, qr/^Zum letzten kulturellen/, 'Text');
is($json->{data}->{stream}->[0]->[0], '-:tokens$<i>20', 'Tokens');
122
Akrone2b902d2016-08-16 16:50:11 +0200123
# Check overwrite:
# the output file of the previous run is still in place at this point.
# $^X runs the script with the interpreter that executes this test.
$call = join(
  ' ',
  $^X, $script,
  '--input' => $input,
  '--output' => $output,
  '-t' => 'CoreNLP#Tokens',
  '-s' => '#all',
  '-a' => 'DeReKo#Structure',
  '-a' => 'Mate#Dependency',
  '-l' => 'DEBUG'
);

# Without -w the script is expected to report the existing file
ok(-f $output, 'Output does exist');
stderr_like(
  sub {
    system($call);
  },
  qr!already exists!,
  $call
);

$call .= ' -w ';

# With -w no such message should be written to STDERR
stderr_unlike(
  sub {
    system($call);
  },
  qr!already exists!,
  $call
);

# Remove the temporary output, so the test leaves nothing behind
unlink $output;
ok(!-f $output, 'Output does not exist');

# TODO: Test meta
# TODO: Test sigle!

done_testing;
__END__
Akron5f51d422016-08-16 16:26:43 +0200162