#!/usr/bin/env perl
# Tests for KorAP::XML::Batch::File: converts the fixture document under
# annotation/corpus/doc/0001 (relative to this file) and inspects the JSON
# written by process() for plain output, gzipped output, output with extra
# annotation layers, a different base tokenization, and the overwrite guard.
use strict;
use warnings;
use Test::More;
use File::Basename 'dirname';
use File::Spec::Functions qw/catdir catfile/;
use File::Temp qw/ :POSIX tempdir /;
use Mojo::File;
use Mojo::JSON qw/decode_json/;
use IO::Uncompress::Gunzip qw/gunzip $GunzipError/;
use Data::Dumper;

use_ok('KorAP::XML::Batch::File');

# Assert the document-level fields shared by every conversion of the
# fixture document. Only the expected foundries string differs per run.
#
#   $json      - decoded JSON structure written by process()
#   $foundries - expected value of data.foundries
sub base_document_ok {
  my ($json, $foundries) = @_;

  # Report failures at the caller's line rather than inside this helper.
  local $Test::Builder::Level = $Test::Builder::Level + 1;

  is($json->{textType}, 'Zeitung: Tageszeitung', 'text type');
  is($json->{title}, 'Beispiel Text', 'Title');
  is($json->{data}->{tokenSource}, 'opennlp#tokens', 'Token source');
  is($json->{data}->{foundries}, $foundries, 'Foundries');
  like($json->{data}->{text}, qr/^Zum letzten kulturellen/, 'Primary text start');
  is($json->{data}->{stream}->[0]->[0], '-:tokens$<i>18', 'Tokens');
  return;
}

ok(my $bf = KorAP::XML::Batch::File->new(
  overwrite => 1,
  foundry => 'OpenNLP',
  layer => 'Tokens'
), 'Construct new batch file object');

my $path = catdir(dirname(__FILE__), 'annotation', 'corpus', 'doc', '0001');

# All output goes into a private directory that is removed on exit, so the
# test neither leaks files nor relies on a merely "probably unused" name.
my $tmpdir = tempdir(CLEANUP => 1);

my $output = catfile($tmpdir, 'plain.json');
ok($bf->process($path => $output), 'Process file');

ok(-f $output, 'File exists');

ok(my $file = Mojo::File->new($output)->slurp, 'Slurp data');

# The parentheses are required: decode_json has no prototype, so without
# them it would take the test description as a second argument.
ok(my $json = decode_json($file), 'decode json');

base_document_ok($json, '');
is($json->{data}->{stream}->[0]->[1], '<>:base/s:t$<b>64<i>0<i>129<i>18<b>0', 'Data');

# Generate with Gzip
$bf->{gzip} = 1;

$output = catfile($tmpdir, 'gzipped.json.gz');
ok($bf->process($path => $output), 'Process file');

# Decompress the whole file in one call; a failure is reported as a
# failing test with the library's error message.
my $out;
ok(gunzip($output => \$out), 'Uncompress')
  or diag("gunzip failed: $GunzipError");

ok($json = decode_json($out), 'decode json');

base_document_ok($json, '');
is($json->{data}->{stream}->[0]->[1], '<>:base/s:t$<b>64<i>0<i>129<i>18<b>0', 'Data');

# Generate with annotations
$bf->{gzip} = 0;
$bf->{anno} = [
  ['CoreNLP', 'Morpho'],
  ['OpenNLP', 'Morpho']
];
$output = catfile($tmpdir, 'annotated.json');
ok($bf->process($path => $output), 'Process file');
ok($file = Mojo::File->new($output)->slurp, 'Slurp data');
ok($json = decode_json($file), 'decode json');

base_document_ok($json, 'corenlp corenlp/morpho opennlp opennlp/morpho');

my $token = $json->{data}->{stream}->[0];

like($json->{data}->{text}, qr/Ende Schuljahr eingestellt wird\.$/, 'Primary text');

# First token: span and position entries followed by both morpho layers.
is($token->[1], '<>:base/s:t$<b>64<i>0<i>129<i>18<b>0', 'base/s');
is($token->[2], '_0$<i>0<i>3', 'position');
is($token->[3], 'corenlp/p:APPRART', 'corenlp');
is($token->[5], 'opennlp/p:APPRART', 'opennlp');

# Last token
$token = $json->{data}->{stream}->[-1];

is($token->[1], 'corenlp/p:VAFIN', 'corenlp');
is($token->[3], 'opennlp/p:VAFIN', 'opennlp');

# Check layer and foundry for base tokenization
# No primary data
$bf->{anno} = [[]];
$bf->{foundry} = 'CoreNLP';
$bf->{layer} = 'Tokens';

# Deliberately reuses the previous output path: overwrite is still enabled.
ok($bf->process($path => $output), 'Process file');
ok(-f $output, 'File exists');
ok($file = Mojo::File->new($output)->slurp, 'Slurp data');
ok($json = decode_json($file), 'decode json');

is($json->{data}->{tokenSource}, 'corenlp#tokens', 'Token source');

like($file, qr/^\{"/, 'No pretty printing');

# Check overwriting: with overwrite disabled and the output file already
# present, process() is expected to return -1.
$bf->{overwrite} = 0;

is($bf->process($path => $output), -1, 'Process file');

done_testing;
__END__
Akron | cdf0e00 | 2016-07-08 16:42:04 +0200 | [diff] [blame] | 115 | |
| 116 | |
| 117 | |
| 118 | |