Blame - t/batch_file.t - KorAP/KorAP-XML-Krill

blob: 947e1ef95e967ce7117ed687b352fc8cdae63603 [file] [log] [blame]

#!/usr/bin/env perl
# Tests for KorAP::XML::Batch::File.
#
# The same sample document (annotation/corpus/doc/0001) is processed with
# several configurations of one batch object: plain JSON output, gzipped
# output, output with additional annotation layers, output without primary
# text, pretty-printed output, and finally with overwriting disabled.
use strict;
use warnings;
use Test::More;
use File::Basename 'dirname';
use File::Spec::Functions 'catdir';
use File::Temp qw/ :POSIX /;
use Mojo::Util qw/slurp/;
use Mojo::JSON qw/decode_json/;
use IO::Uncompress::Gunzip qw/gunzip $GunzipError/;
use Data::Dumper;

use_ok('KorAP::XML::Batch::File');

# Names handed out by tmpnam() are remembered so the files written to them
# can be removed when the script exits, even if a test dies half-way.
my @tmp_files;
END { unlink grep { -e $_ } @tmp_files }

# Return a fresh temporary file name and register it for cleanup.
sub new_output {
  my $name = tmpnam();
  push @tmp_files, $name;
  return $name;
}

# Run the assertions shared by every full conversion of the sample document.
#
#   $json      - decoded JSON document (hash reference)
#   $foundries - expected value of data->foundries for this run
#
# Failures are reported at the caller's line rather than inside this helper.
sub check_document {
  my ($json, $foundries) = @_;
  local $Test::Builder::Level = $Test::Builder::Level + 1;

  is($json->{textType}, 'Zeitung: Tageszeitung', 'Text type');
  is($json->{title}, 'Beispiel Text', 'Title');
  is($json->{data}->{tokenSource}, 'opennlp#tokens', 'Token source');
  is($json->{data}->{foundries}, $foundries, 'Foundries');
  like($json->{data}->{text}, qr/^Zum letzten kulturellen/, 'Primary text start');

  my $first = $json->{data}->{stream}->[0];
  is($first->[0], '-:tokens$<i>18', 'Tokens');
  is($first->[1], '<>:base/s:t$<b>64<i>0<i>129<i>17<b>0', 'base/s:t');
  return;
}

ok(my $bf = KorAP::XML::Batch::File->new(
  overwrite => 1,
  foundry   => 'OpenNLP',
  layer     => 'Tokens'
), 'Construct new batch file object');

my $path = catdir(dirname(__FILE__), 'annotation', 'corpus', 'doc', '0001');

# Generate plain JSON
my $output = new_output();
ok($bf->process($path => $output), 'Process file');
ok(-f $output, 'File exists');
ok(my $file = slurp $output, 'Slurp data');
ok(my $json = decode_json $file, 'Decode JSON');

check_document($json, '');

# Generate with Gzip
$bf->{gzip} = 1;

$output = new_output();
ok($bf->process($path => $output), 'Process file with gzip');

# The functional interface inflates the whole file in one call and returns
# false on any error (a single ->read on the object only yields one block).
my $out;
ok(gunzip($output => \$out), 'Uncompress')
  or diag "gunzip failed: $GunzipError";

ok($json = decode_json $out, 'Decode JSON');

check_document($json, '');

# Generate with annotations
$bf->{gzip} = 0;
$bf->{anno} = [
  ['CoreNLP', 'Morpho'],
  ['OpenNLP', 'Morpho']
];
$output = new_output();
ok($bf->process($path => $output), 'Process file with annotations');
ok($file = slurp $output, 'Slurp data');
ok($json = decode_json $file, 'Decode JSON');

check_document($json, 'corenlp corenlp/morpho opennlp opennlp/morpho');

like($json->{data}->{text}, qr/Ende Schuljahr eingestellt wird\.$/, 'Primary text end');

# First token of the stream
my $token = $json->{data}->{stream}->[0];

is($token->[2], '_0$<i>0<i>3', 'position');
is($token->[3], 'corenlp/p:APPRART', 'corenlp');
is($token->[5], 'opennlp/p:APPRART', 'opennlp');

# Last token of the stream
$token = $json->{data}->{stream}->[-1];

is($token->[1], 'corenlp/p:VAFIN', 'corenlp');
is($token->[3], 'opennlp/p:VAFIN', 'opennlp');

# Check layer and foundry for base tokenization
# No primary data
$bf->{anno} = [[]];
$bf->{primary} = 0;
$bf->{foundry} = 'CoreNLP';
$bf->{layer} = 'Tokens';

# $output still names the file of the previous run; overwrite is enabled.
ok($bf->process($path => $output), 'Process file without primary data');
ok(-f $output, 'File exists');
ok($file = slurp $output, 'Slurp data');
ok($json = decode_json $file, 'Decode JSON');

ok(!$json->{data}->{text}, 'No Primary text');
is($json->{data}->{tokenSource}, 'corenlp#tokens', 'Token source');

like($file, qr/^\{"/, 'No pretty printing');

# Check pretty printing
$bf->{pretty} = 1;
ok($bf->process($path => $output), 'Process file with pretty printing');
ok(-f $output, 'File exists');
ok($file = slurp $output, 'Slurp data');
like($file, qr/^\{\n\s+"/, 'Pretty printing');

# Check overwriting
$bf->{overwrite} = 0;

# The output file exists from the run above, so processing must report failure.
ok(!$bf->process($path => $output), 'Process fails without overwrite');

done_testing;
__END__
Akron	cdf0e00	2016-07-08 16:42:04 +0200	[diff] [blame]	124
				125
				126
				127