Restructure test suite to prepare CPAN release
Change-Id: If3033774f50d33b2e5b3344e3927fd534cef4dfb
diff --git a/t/real/script/archive.t b/t/real/script/archive.t
new file mode 100644
index 0000000..721f1bc
--- /dev/null
+++ b/t/real/script/archive.t
@@ -0,0 +1,47 @@
+#!/usr/bin/env perl
+use strict;
+use warnings;
+use File::Basename 'dirname';
+use File::Spec::Functions qw/catdir catfile/;
+use File::Temp qw/:POSIX/;
+use Mojo::File;
+use Mojo::Util qw/quote/;
+use Mojo::JSON qw/decode_json/;
+use IO::Uncompress::Gunzip;
+use Test::More;
+use Test::Output qw/:stdout :stderr :combined :functions/;
+use Data::Dumper;
+use KorAP::XML::Archive;
+use utf8;
+
+# Skip when script or real-corpus tests are disabled via the environment
+if ($ENV{SKIP_SCRIPT} || $ENV{SKIP_REAL}) {
+  plan skip_all => 'Skip script/real tests';
+};
+
+my $f = dirname(__FILE__);
+my $script = catfile($f, '..', '..', '..', 'script', 'korapxml2krill');
+
+# tmpnam() only returns a file name; the path is removed again below
+my $cache = tmpnam();
+
+my $input = catfile($f, '..', 'corpus', 'WDD15', 'A79', '83946');
+
+# Run the script with the interpreter that executes this test ($^X)
+my $call = join(
+  ' ',
+  $^X, $script,
+  '--input' => $input,
+  '--cache' => $cache
+);
+
+# No --output is passed; the combined output must report missing tokens
+{
+  local $SIG{__WARN__} = sub {};
+  my $out = combined_from(sub { system($call); });
+  like($out, qr!No tokens found!s, $call);
+};
+
+unlink $cache;
+done_testing;
+__END__
diff --git a/t/real/script/base.t b/t/real/script/base.t
new file mode 100644
index 0000000..62e9e15
--- /dev/null
+++ b/t/real/script/base.t
@@ -0,0 +1,75 @@
+#!/usr/bin/env perl
+use strict;
+use warnings;
+use File::Basename 'dirname';
+use File::Spec::Functions qw/catdir catfile/;
+use File::Temp qw/ :POSIX /;
+use Mojo::File;
+use Mojo::JSON qw/decode_json/;
+use IO::Uncompress::Gunzip;
+use Test::More;
+use Test::Output;
+use Data::Dumper;
+use utf8;
+
+# Skip when script or real-corpus tests are disabled via the environment
+if ($ENV{SKIP_SCRIPT} || $ENV{SKIP_REAL}) {
+  plan skip_all => 'Skip script/real tests';
+};
+
+my $f = dirname(__FILE__);
+my $script = catfile($f, '..', '..', '..', 'script', 'korapxml2krill');
+
+my $input = catdir($f, '..', 'corpus', 'GOE2', 'AGA', '03828');
+ok(-d $input, 'Input directory found');
+
+# tmpnam() only returns file names; both paths are removed at the end
+my $output = tmpnam();
+my $cache = tmpnam();
+
+ok(!-f $output, 'Output does not exist');
+
+# Run the script with the interpreter that executes this test ($^X)
+my $call = join(
+  ' ',
+  $^X, $script,
+  '--input' => $input,
+  '--output' => $output,
+  '--cache' => $cache,
+  '-t' => 'Base#tokens_aggr.xml',
+  '-bs' => 'DeReKo#Structure',
+  '-bp' => 'DeReKo#Structure',
+  '-l' => 'INFO'
+);
+
+# Test without compression
+stderr_like(sub { system($call) }, qr!The code took!, $call);
+
+ok(-f $output, 'Output does exist');
+ok((my $file = Mojo::File->new($output)->slurp), 'Slurp data');
+ok((my $json = decode_json $file), 'decode json');
+is($json->{textType}, 'Autobiographie', 'text type');
+is($json->{title}, 'Autobiographische Einzelheiten', 'Title');
+is($json->{data}->{tokenSource}, 'base#tokens_aggr', 'Token source');
+is($json->{data}->{foundries}, 'dereko dereko/structure dereko/structure/base-sentences-paragraphs marmot marmot/morpho', 'Foundries');
+
+# First entry of the token stream
+my $stream = $json->{data}->{stream};
+my $token = $stream->[0];
+is($token->[0], '-:base/paragraphs$<i>14', 'Paragraphs');
+is($token->[1], '-:base/sentences$<i>215', 'Sentences');
+
+is($token->[5], '<>:base/s:s$<b>64<i>0<i>30<i>2<b>2', 'struct');
+is($token->[7], '<>:dereko/s:s$<b>64<i>0<i>30<i>2<b>4', 'struct');
+is($token->[8], '<>:base/s:t$<b>64<i>0<i>35242<i>5239<b>0', 'struct');
+
+# Fifth entry of the token stream
+$token = $stream->[4];
+is($token->[0], '<>:base/s:s$<b>64<i>53<i>254<i>32<b>2', 'struct');
+is($token->[1], '<>:dereko/s:s$<b>64<i>53<i>254<i>32<b>5<s>1', 'struct');
+is($token->[2], '<>:base/s:p$<b>64<i>53<i>3299<i>504<b>1', 'struct');
+is($token->[3], '<>:dereko/s:p$<b>64<i>53<i>3299<i>504<b>4', 'struct');
+
+unlink $output, $cache;
+done_testing;
+__END__
diff --git a/t/real/script/non_verbal_tokens.t b/t/real/script/non_verbal_tokens.t
new file mode 100644
index 0000000..b97bfbf
--- /dev/null
+++ b/t/real/script/non_verbal_tokens.t
@@ -0,0 +1,103 @@
+#!/usr/bin/env perl
+use strict;
+use warnings;
+use File::Basename 'dirname';
+use File::Spec::Functions qw/catdir catfile/;
+use File::Temp qw/ :POSIX /;
+use Mojo::File;
+use Mojo::JSON qw/decode_json/;
+use IO::Uncompress::Gunzip;
+use Test::More;
+use Test::Output;
+use Data::Dumper;
+use utf8;
+
+# Skip when script or real-corpus tests are disabled via the environment
+if ($ENV{SKIP_SCRIPT} || $ENV{SKIP_REAL}) {
+  plan skip_all => 'Skip script/real tests';
+};
+
+my $f = dirname(__FILE__);
+my $script = catfile($f, '..', '..', '..', 'script', 'korapxml2krill');
+
+my $input = catdir($f, '..', 'corpus', 'AGD-scrambled', 'DOC', '00001');
+ok(-d $input, 'Input directory found');
+
+# tmpnam() only returns file names; both paths are removed at the end
+my $output = tmpnam();
+my $cache = tmpnam();
+
+ok(!-f $output, 'Output does not exist');
+
+# Run the script with the interpreter that executes this test ($^X)
+my $call = join(
+  ' ',
+  $^X, $script,
+  '--input' => $input,
+  '--output' => $output,
+  '--cache' => $cache,
+  '-t' => 'DGD#Annot',
+  '-l' => 'INFO'
+);
+
+# First run: without the -nvt flag
+stderr_like(
+  sub {
+    system($call);
+  },
+  qr!The code took!,
+  $call
+);
+
+ok(-f $output, 'Output does exist');
+ok((my $file = Mojo::File->new($output)->slurp), 'Slurp data');
+ok((my $json = decode_json $file), 'decode json');
+
+is($json->{textSigle}, 'AGD/DOC/00001', 'text sigle');
+is($json->{title}, 'FOLK_E_00321_SE_01_T_01_DF_01', 'Title');
+is($json->{data}->{tokenSource}, 'dgd#annot', 'Token source');
+is($json->{data}->{foundries}, 'dereko dereko/structure dgd dgd/morpho', 'Foundries');
+my $stream = $json->{data}->{stream};
+my $token = $stream->[4];
+is($token->[3], 'dgd/l:pui', 'Token');
+$token = $stream->[5];
+is($token->[15], 'dgd/l:xui', 'Token');
+
+# Second run against the same output path, adding the -w and -nvt flags
+$call = join(
+  ' ',
+  $^X, $script,
+  '--input' => $input,
+  '--output' => $output,
+  '--cache' => $cache,
+  '-t' => 'DGD#annot',
+  '-l' => 'INFO',
+  '-w' => '',
+  '-nvt' => ''
+);
+
+stderr_like(
+  sub {
+    system($call);
+  },
+  qr!The code took!,
+  $call
+);
+
+ok(-f $output, 'Output does exist');
+ok(($file = Mojo::File->new($output)->slurp), 'Slurp data');
+ok(($json = decode_json $file), 'decode json');
+$stream = $json->{data}->{stream};
+
+$token = $stream->[4];
+is($token->[3], 'dgd/l:pui', 'Token');
+
+$token = $stream->[5];
+is($token->[5], 'dgd/para:pause$<b>128<s>2', 'Token');
+
+$token = $stream->[6];
+is($token->[13], 'dgd/l:xui', 'Token');
+
+unlink $output, $cache;
+done_testing;
+__END__
diff --git a/t/real/script/non_word_tokens.t b/t/real/script/non_word_tokens.t
new file mode 100644
index 0000000..5458468
--- /dev/null
+++ b/t/real/script/non_word_tokens.t
@@ -0,0 +1,99 @@
+#!/usr/bin/env perl
+use strict;
+use warnings;
+use File::Basename 'dirname';
+use File::Spec::Functions qw/catdir catfile/;
+use File::Temp qw/ :POSIX /;
+use Mojo::File;
+use Mojo::JSON qw/decode_json/;
+use IO::Uncompress::Gunzip;
+use Test::More;
+use Test::Output;
+use Data::Dumper;
+use utf8;
+
+if ($ENV{SKIP_SCRIPT} || $ENV{SKIP_REAL}) {
+  plan skip_all => 'Skip script/real tests';
+};
+
+my $f = dirname(__FILE__);
+my $script = catfile($f, '..', '..', '..', 'script', 'korapxml2krill');
+
+my $input = catdir($f, '..', 'corpus', 'WPD', '00001');
+ok(-d $input, 'Input directory found');
+
+# tmpnam() only returns file names; both paths are removed at the end
+my $output = tmpnam();
+my $cache = tmpnam();
+
+ok(!-f $output, 'Output does not exist');
+
+# Run the script with the interpreter that executes this test ($^X)
+my $call = join(
+  ' ',
+  $^X, $script,
+  '--input' => $input,
+  '--output' => $output,
+  '--cache' => $cache,
+  '-t' => 'OpenNLP#tokens',
+  '-l' => 'INFO'
+);
+
+# First run: without the -nwt flag
+stderr_like(
+  sub {
+    system($call);
+  },
+  qr!The code took!,
+  $call
+);
+
+ok(-f $output, 'Output does exist');
+ok((my $file = Mojo::File->new($output)->slurp), 'Slurp data');
+ok((my $json = decode_json $file), 'decode json');
+is($json->{textSigle}, 'WPD/AAA/00001', 'text sigle');
+is($json->{title}, 'A', 'Title');
+is($json->{data}->{tokenSource}, 'opennlp#tokens', 'Token source');
+is($json->{data}->{foundries}, 'base base/paragraphs base/sentences connexor connexor/morpho connexor/phrase connexor/sentences connexor/syntax corenlp corenlp/sentences dereko dereko/structure mate mate/dependency mate/morpho opennlp opennlp/morpho opennlp/sentences treetagger treetagger/morpho treetagger/sentences xip xip/constituency xip/morpho xip/sentences', 'Foundries');
+my $stream = $json->{data}->{stream};
+my $token = $stream->[12];
+is($token->[16], 's:Vokal', 'Token');
+$token = $stream->[13];
+is($token->[23], 's:Der', 'Token');
+
+# Second run against the same output path, adding the -w and -nwt flags
+$call = join(
+  ' ',
+  $^X, $script,
+  '--input' => $input,
+  '--output' => $output,
+  '--cache' => $cache,
+  '-t' => 'OpenNLP#tokens',
+  '-l' => 'INFO',
+  '-w' => '',
+  '-nwt' => ''
+);
+
+stderr_like(
+  sub {
+    system($call);
+  },
+  qr!The code took!,
+  $call
+);
+
+ok(-f $output, 'Output does exist');
+ok(($file = Mojo::File->new($output)->slurp), 'Slurp data');
+ok(($json = decode_json $file), 'decode json');
+$stream = $json->{data}->{stream};
+$token = $stream->[12];
+is($token->[17], 's:Vokal', 'Token');
+$token = $stream->[13];
+is($token->[7], 's:.', 'Token');
+is($token->[11], 'xip/p:PUNCT', 'Token');
+$token = $stream->[14];
+is($token->[23], 's:Der', 'Token');
+
+unlink $output, $cache;
+done_testing;
+__END__
diff --git a/t/real/script/single.t b/t/real/script/single.t
new file mode 100644
index 0000000..8acba83
--- /dev/null
+++ b/t/real/script/single.t
@@ -0,0 +1,59 @@
+#!/usr/bin/env perl
+use strict;
+use warnings;
+use File::Basename 'dirname';
+use File::Spec::Functions qw/catdir catfile/;
+use File::Temp qw/:POSIX/;
+use Mojo::File;
+use Mojo::JSON qw/decode_json/;
+use IO::Uncompress::Gunzip;
+use Test::More;
+use Test::Output;
+use Data::Dumper;
+use utf8;
+
+if ($ENV{SKIP_SCRIPT} || $ENV{SKIP_REAL}) {
+  plan skip_all => 'Skip script/real tests';
+};
+
+# tmpnam() only returns file names; both paths are removed at the end
+my $output = tmpnam();
+my $cache = tmpnam();
+
+my $f = dirname(__FILE__);
+my $script = catfile($f, '..', '..', '..', 'script', 'korapxml2krill');
+
+# AGA with base info
+my $input = catdir($f, '..', 'corpus', 'GOE2', 'AGA', '03828');
+ok(-d $input, 'Input directory found');
+
+ok(!-f $output, 'Output does not exist');
+
+my $call = join(
+  ' ',
+  $^X, $script, # $^X: the interpreter that executes this test
+  '--input' => $input,
+  '--output' => $output,
+  '--cache' => $cache,
+  '-t' => 'base#tokens_aggr',
+  '-bs' => 'DeReKo#Structure',
+  '-bp' => 'DeReKo#Structure',
+  '-bpb' => 'DeReKo#Structure',
+  '-l' => 'INFO'
+);
+
+stderr_like(
+  sub {
+    system($call);
+  },
+  qr!The code took!,
+  $call
+);
+ok(-f $output, 'Output does exist');
+ok((my $file = Mojo::File->new($output)->slurp), 'Slurp data');
+ok((my $json = decode_json $file), 'decode json');
+is($json->{title}, 'Autobiographische Einzelheiten', 'title');
+is($json->{data}->{stream}->[0]->[-1], '~:base/s:pb$<i>529<i>0', 'Pagebreak annotation');
+
+unlink $output, $cache;
+done_testing;