#!/usr/bin/env perl
use strict;
use warnings;

use Test::More;
use File::Basename 'dirname';
use File::Spec::Functions 'catdir';
use File::Temp qw/ :POSIX /;
use IO::Uncompress::Gunzip qw/gunzip $GunzipError/;
use Data::Dumper;
use Mojo::File;
use Mojo::JSON qw/decode_json/;

# Load the class under test
use_ok('KorAP::XML::Batch::File');

# Converter that takes its base tokenization from the OpenNLP token layer.
# 'overwrite' is switched on here and switched off again in the final check.
ok(my $bf = KorAP::XML::Batch::File->new(
  overwrite => 1,
  foundry   => 'OpenNLP',
  layer     => 'Tokens'
), 'Construct new batch file object');

# Fixture document located next to this test file
my $path = catdir(dirname(__FILE__), 'annotation', 'corpus', 'doc', '0001');

# tmpnam() only yields a file name; process() is expected to create the file
my $output = tmpnam();
ok($bf->process($path => $output), 'Process file');

ok(-f $output, 'File exists');

ok(my $file = Mojo::File->new($output)->slurp, 'Slurp data');

# The parentheses matter: decode_json has no prototype, so a call without
# them would swallow the test description as a second argument.
ok(my $json = decode_json($file), 'decode json');

# Metadata
is($json->{textType}, 'Zeitung: Tageszeitung', 'text type');
is($json->{title}, 'Beispiel Text', 'Title');

# Data section
is($json->{data}->{tokenSource}, 'opennlp#tokens', 'Token source');
is($json->{data}->{foundries}, '', 'Foundries');
like($json->{data}->{text}, qr/^Zum letzten kulturellen/, 'Text start');
is($json->{data}->{stream}->[0]->[0], '-:tokens$<i>18', 'Tokens');
is($json->{data}->{stream}->[0]->[1], '<>:base/s:t$<b>64<i>0<i>129<i>18<b>0', 'Data');

# This output is not needed again - the following sections request
# fresh temporary names - so do not leave it behind.
unlink $output;

# Generate with Gzip
$bf->{gzip} = 1;

# $path still points at the fixture document; only a new output name is needed
$output = tmpnam();
ok($bf->process($path => $output), 'Process file');

# Inflate the whole file in one call. A single read() on a Gunzip handle is
# only documented to deliver one block of data, and an unchecked constructor
# failure would die on an undefined object instead of failing a test.
my $out;
ok(gunzip($output => \$out), 'Uncompress')
  or diag("gunzip failed: $GunzipError");

# Parenthesized so the description reaches ok() rather than decode_json()
ok($json = decode_json($out), 'decode json');

# The content has to match the uncompressed conversion
is($json->{textType}, 'Zeitung: Tageszeitung', 'text type');
is($json->{title}, 'Beispiel Text', 'Title');
is($json->{data}->{tokenSource}, 'opennlp#tokens', 'Token source');
is($json->{data}->{foundries}, '', 'Foundries');
like($json->{data}->{text}, qr/^Zum letzten kulturellen/, 'Text start');
is($json->{data}->{stream}->[0]->[0], '-:tokens$<i>18', 'Tokens');
is($json->{data}->{stream}->[0]->[1], '<>:base/s:t$<b>64<i>0<i>129<i>18<b>0', 'Data');

# The compressed output is not reused by the following section
unlink $output;

# Generate with annotations
$bf->{gzip} = 0;

# Each entry is a [foundry, layer] pair to be added to the output
$bf->{anno} = [
  ['CoreNLP', 'Morpho'],
  ['OpenNLP', 'Morpho']
];
$output = tmpnam();
ok($bf->process($path => $output), 'Process file');
ok($file = Mojo::File->new($output)->slurp, 'Slurp data');

# Parenthesized so the description reaches ok() rather than decode_json()
ok($json = decode_json($file), 'decode json');

is($json->{textType}, 'Zeitung: Tageszeitung', 'text type');
is($json->{title}, 'Beispiel Text', 'Title');
is($json->{data}->{tokenSource}, 'opennlp#tokens', 'Token source');
is($json->{data}->{foundries}, 'corenlp corenlp/morpho opennlp opennlp/morpho', 'Foundries');
like($json->{data}->{text}, qr/^Zum letzten kulturellen/, 'Text start');
is($json->{data}->{stream}->[0]->[0], '-:tokens$<i>18', 'Tokens');

# Terms at the first stream position
my $token = $json->{data}->{stream}->[0];

like($json->{data}->{text}, qr/Ende Schuljahr eingestellt wird\.$/, 'Primary text');

is($token->[1], '<>:base/s:t$<b>64<i>0<i>129<i>18<b>0', 'base/s');
is($token->[2], '_0$<i>0<i>3', 'position');
is($token->[3], 'corenlp/p:APPRART', 'corenlp');
is($token->[5], 'opennlp/p:APPRART', 'opennlp');

# Terms at the last stream position
$token = $json->{data}->{stream}->[-1];

is($token->[1], 'corenlp/p:VAFIN', 'corenlp');
is($token->[3], 'opennlp/p:VAFIN', 'opennlp');

# Check layer and foundry for base tokenization
# NOTE(review): anno is reset to a single empty entry - presumably this
# means "no additional annotation layers"; confirm against the class.
$bf->{anno} = [[]];
$bf->{foundry} = 'CoreNLP';
$bf->{layer} = 'Tokens';

# Reuses the existing $output; overwrite is still enabled at this point
ok($bf->process($path => $output), 'Process file');
ok(-f $output, 'File exists');
ok($file = Mojo::File->new($output)->slurp, 'Slurp data');

# Parenthesized so the description reaches ok() rather than decode_json()
ok($json = decode_json($file), 'decode json');

is($json->{data}->{tokenSource}, 'corenlp#tokens', 'Token source');

# Without the pretty flag the first key follows the opening brace directly
like($file, qr/^\{"/, 'No pretty printing');

# Check pretty printing
$bf->{pretty} = 1;
ok($bf->process($path => $output), 'Process file');
ok(-f $output, 'File exists');
ok($file = Mojo::File->new($output)->slurp, 'Slurp data');

# With the pretty flag, whitespace (\s already covers newlines) separates
# the opening brace from the first key
like($file, qr/^\{\s+"/, 'Pretty printing');

# Check overwriting
$bf->{overwrite} = 0;

# $output already exists from the preceding conversions, so process() is
# expected to return -1 instead of writing the file again
is($bf->process($path => $output), -1, 'Existing file is not overwritten');

# Last use of the temporary output - clean it up
unlink $output;

done_testing;
__END__