Support non-verbal annotations
Change-Id: I6cc0e7c8279f523d3c4b66b14125866ec0be1695
diff --git a/t/real/agd.t b/t/real/agd.t
index 498f4e2..6ba7ebe 100644
--- a/t/real/agd.t
+++ b/t/real/agd.t
@@ -53,7 +53,7 @@
foundry => $token_base_foundry,
layer => $token_base_layer,
name => 'tokens',
- non_word_tokens => 1
+ non_verbal_tokens => 1
);
ok($tokens, 'Token Object is fine');
@@ -71,9 +71,9 @@
is($output->{version}, '0.03', 'version');
is($output->{data}->{foundries}, '', 'Foundries');
is($output->{data}->{layerInfos}, '', 'layerInfos');
-is($output->{data}->{stream}->[1]->[2], 's:ku', 'data');
-is($output->{data}->{stream}->[2]->[2], 's:sqn', 'data');
-is($output->{data}->{stream}->[3]->[2], 's:alxv', 'data');
+is($output->{data}->{stream}->[0]->[4], 's:ku', 'data');
+is($output->{data}->{stream}->[1]->[2], 's:sqn', 'data');
+is($output->{data}->{stream}->[2]->[2], 's:alxv', 'data');
is($output->{textSigle}, 'AGD/DOC/00001', 'Correct text sigle');
is($output->{docSigle}, 'AGD/DOC', 'Correct document sigle');
is($output->{corpusSigle}, 'AGD', 'Correct corpus sigle');
@@ -103,7 +103,7 @@
is($output->{data}->{layerInfos}, 'dereko/s=spans dgd/l=tokens dgd/p=tokens dgd/para=tokens',
'layerInfos');
-my $third_token = join('||', @{$output->{data}->{stream}->[3]});
+my $third_token = join('||', @{$output->{data}->{stream}->[2]});
like($third_token, qr!dgd/l:alui!);
like($third_token, qr!dgd/p:VMGWY!);
like($third_token, qr!i:alxv!);
@@ -116,13 +116,18 @@
# Offsets are suboptimal set, but good enough
$first_token = join('||', @{$output->{data}->{stream}->[0]});
-like($first_token, qr!<>:base/s:s\$<b>64<i>0<i>16<i>3<b>1!);
+like($first_token, qr!<>:base/s:s\$<b>64<i>0<i>16<i>2<b>1!);
my $token = join('||', @{$output->{data}->{stream}->[1]});
+like($token, qr!<>:base/s:s\$<b>64<i>16<i>23<i>4<b>1!);
+$token = join('||', @{$output->{data}->{stream}->[2]});
unlike($token, qr!<>:base/s:s!);
-$token = join('||', @{$output->{data}->{stream}->[2]});
-like($token, qr!<>:base/s:s\$<b>64<i>16<i>23<i>5<b>1!);
+$token = join('||', @{$output->{data}->{stream}->[3]});
+like($token, qr!<>:base/s:s\$<b>64<i>23<i>27<i>5<b>1!);
+
+$token = join('||', @{$output->{data}->{stream}->[5]});
+like($token, qr!dgd/para:pause!);
done_testing;
__END__
diff --git a/t/script/non_verbal_tokens.t b/t/script/non_verbal_tokens.t
new file mode 100644
index 0000000..fa460e4
--- /dev/null
+++ b/t/script/non_verbal_tokens.t
@@ -0,0 +1,99 @@
+#!/usr/bin/env perl
+use strict;
+use warnings;
+use File::Basename 'dirname';
+use File::Spec::Functions qw/catdir catfile/;
+use File::Temp qw/ :POSIX /;
+use Mojo::File;
+use Mojo::JSON qw/decode_json/;
+use IO::Uncompress::Gunzip;
+use Test::More;
+use Test::Output;
+use Data::Dumper;
+use utf8;
+
+# Checks the korapxml2krill token stream with and without -nvt.
+my $f = dirname(__FILE__);
+my $script = catfile($f, '..', '..', 'script', 'korapxml2krill');
+
+my $input = catdir($f, '..', 'corpus', 'AGD-scrambled', 'DOC', '00001');
+ok(-d $input, 'Input directory found');
+
+my $output = tmpnam();
+my $cache = tmpnam();
+
+ok(!-f $output, 'Output does not exist');
+
+my $call = join(
+  ' ',
+  'perl', $script,
+  '--input' => $input,
+  '--output' => $output,
+  '--cache' => $cache,
+  '-t' => 'DGD#Annot',
+  '-l' => 'INFO'
+);
+
+# First run: no -nvt flag
+stderr_like(
+  sub {
+    system($call);
+  },
+  qr!The code took!,
+  $call
+);
+
+ok(-f $output, 'Output does exist');
+ok((my $file = Mojo::File->new($output)->slurp), 'Slurp data');
+ok((my $json = decode_json $file), 'decode json');
+
+is($json->{textSigle}, 'AGD/DOC/00001', 'text sigle');
+is($json->{title}, 'FOLK_E_00321_SE_01_T_01_DF_01', 'Title');
+is($json->{data}->{tokenSource}, 'dgd#annot', 'TokenSource');
+is($json->{data}->{foundries}, 'dereko dereko/structure dgd dgd/morpho', 'Foundries');
+my $stream = $json->{data}->{stream};
+my $token = $stream->[4];
+is($token->[3], 'dgd/l:pui', 'Token');
+$token = $stream->[5];
+is($token->[13], 'dgd/l:xui', 'Token');
+
+$call = join(
+  ' ',
+  'perl', $script,
+  '--input' => $input,
+  '--output' => $output,
+  '--cache' => $cache,
+  '-t' => 'DGD#annot',
+  '-l' => 'INFO',
+  '-w' => '',
+  '-nvt' => ''
+);
+
+# Second run: same output path, adds -w and -nvt
+stderr_like(
+  sub {
+    system($call);
+  },
+  qr!The code took!,
+  $call
+);
+
+ok(-f $output, 'Output does exist');
+ok(($file = Mojo::File->new($output)->slurp), 'Slurp data');
+ok(($json = decode_json $file), 'decode json');
+$stream = $json->{data}->{stream};
+
+$token = $stream->[4];
+is($token->[3], 'dgd/l:pui', 'Token');
+
+# Expected with -nvt: index 5 carries the pause annotation, xui moves to 6
+$token = $stream->[5];
+is($token->[14], 'dgd/para:pause$<b>128<s>5', 'Token');
+
+$token = $stream->[6];
+is($token->[1], 'dgd/l:xui', 'Token');
+
+# tmpnam() yields names only (see the !-f check); remove files the runs left
+unlink($output, $cache);
+done_testing;
+__END__