#!/usr/bin/env perl
# Tests for the korapxml2krill command line script on a single document:
#   1. usage output when called without arguments,
#   2. conversion to an uncompressed JSON file,
#   3. conversion to a gzip compressed JSON file (-z),
#   4. conversion with a different token source and a restricted
#      set of annotation layers (-s / -a).
use strict;
use warnings;
use File::Basename 'dirname';
use File::Spec::Functions qw/catdir catfile/;
use File::Temp qw/ :POSIX /;
use Mojo::Util qw/slurp/;
use Mojo::JSON qw/decode_json/;
use IO::Uncompress::Gunzip;
use Test::More;
use Test::Output;
use Data::Dumper;

my $f = dirname(__FILE__);
my $script = catfile($f, '..', '..', 'script', 'korapxml2krill');
ok(-f $script, 'Script found');

# Without any arguments the script is expected to print its usage
# message to STDOUT. $^X is the interpreter running this test, so the
# child process uses the same perl (and the same installed modules).
stdout_like(
  sub { system($^X, $script) },
  qr!Usage.+?korapxml2krill!s,
  'Usage output'
);

my $input = catdir($f, '..', 'annotation', 'corpus', 'doc', '0001');
ok(-d $input, 'Input directory found');

# Only a file name is needed here - the script has to create the file
my $output = tmpnam();

ok(!-f $output, 'Output does not exist');

# Foundries expected when no annotation layer is skipped
my $all_foundries = join ' ', qw(
  base base/paragraphs base/sentences
  connexor connexor/morpho connexor/phrase
  connexor/sentences connexor/syntax
  corenlp corenlp/constituency corenlp/morpho corenlp/sentences
  dereko dereko/structure
  glemm glemm/morpho
  mate mate/dependency mate/morpho
  opennlp opennlp/morpho opennlp/sentences
  treetagger treetagger/morpho treetagger/sentences
  xip xip/constituency xip/morpho xip/sentences
);

# The command is kept as a list and handed to system() in list form,
# so no shell is ever involved: paths containing whitespace stay intact
# and values like '#all' can never be read as a shell comment.
# The joined string $call is only used as the test description.
my @call = (
  $^X, $script,
  '--input'  => $input,
  '--output' => $output,
  '-t' => 'OpenNLP#Tokens',
  '-l' => 'INFO'
);
my $call = join(' ', @call);

# Test without compression
stderr_like(
  sub { system(@call) },
  qr!The code took!,
  $call
);

ok(-f $output, 'Output does exist');
ok((my $file = slurp $output), 'Slurp data');
ok((my $json = decode_json $file), 'decode json');
is($json->{textType}, 'Zeitung: Tageszeitung', 'text type');
is($json->{title}, 'Beispiel Text', 'Title');
is($json->{data}->{tokenSource}, 'opennlp#tokens', 'TokenSource');
is($json->{data}->{foundries}, $all_foundries, 'Foundries');
like($json->{data}->{text}, qr/^Zum letzten kulturellen/, 'Text');
is($json->{data}->{stream}->[0]->[0], '-:base/paragraphs$<i>1', 'Paragraphs');

# Delete output
unlink $output;
ok(!-f $output, 'Output does not exist');

push @call, '-z';
$call = join(' ', @call);

# Test with compression
stderr_like(
  sub { system(@call) },
  qr!The code took!,
  $call
);

ok(-f $output, 'Output does exist');

# Uncompress the data using a buffer.
# Transparent => 0 makes the constructor fail for input that is not
# gzip compressed, so a successful open also proves that -z was applied.
my $gz = IO::Uncompress::Gunzip->new($output, Transparent => 0)
  or die "Unable to gunzip $output: $IO::Uncompress::Gunzip::GunzipError";

$file = '';
my $buffer = '';
my $status;

# read() returns the number of bytes read, 0 at end of file and a
# negative value on error - stop in both of the latter cases
while (($status = $gz->read($buffer)) > 0) {
  $file .= $buffer;
};
is($status, 0, 'Uncompressed until end of file');
$gz->close;

ok($json = decode_json($file), 'decode json');

is($json->{textType}, 'Zeitung: Tageszeitung', 'text type');
is($json->{title}, 'Beispiel Text', 'Title');
is($json->{data}->{tokenSource}, 'opennlp#tokens', 'TokenSource');
is($json->{data}->{foundries}, $all_foundries, 'Foundries');
like($json->{data}->{text}, qr/^Zum letzten kulturellen/, 'Text');
is($json->{data}->{stream}->[0]->[0], '-:base/paragraphs$<i>1', 'Paragraphs');

# Delete output
unlink $output;
ok(!-f $output, 'Output does not exist');

# Use a different token source and skip all annotations,
# except for DeReKo#Structure and Mate#Dependency
@call = (
  $^X, $script,
  '--input'  => $input,
  '--output' => $output,
  '-t' => 'CoreNLP#Tokens',
  '-s' => '#all',
  '-a' => 'DeReKo#Structure',
  '-a' => 'Mate#Dependency',
  '-l' => 'INFO'
);
$call = join(' ', @call);

stderr_like(
  sub { system(@call) },
  qr!The code took!,
  $call
);

ok(-f $output, 'Output does exist');
ok(($file = slurp $output), 'Slurp data');
ok(($json = decode_json $file), 'decode json');

is($json->{textType}, 'Zeitung: Tageszeitung', 'text type');

is($json->{title}, 'Beispiel Text', 'Title');
is($json->{data}->{tokenSource}, 'corenlp#tokens', 'TokenSource');
is($json->{data}->{foundries}, 'dereko dereko/structure mate mate/dependency', 'Foundries');

like($json->{data}->{text}, qr/^Zum letzten kulturellen/, 'Text');
is($json->{data}->{stream}->[0]->[0], '-:tokens$<i>20', 'Tokens');

# Delete output, so no temporary file is left behind
unlink $output;
ok(!-f $output, 'Output does not exist');

# TODO: Test overwrite!!!
# TODO: Test meta
# TODO: Test sigle!
# TODO: Test help
# TODO: Test version

done_testing;
__END__