Blame - t/script/single.t - KorAP/KorAP-XML-Krill

blob: a1d6c581e263e0a75710ac9769488afef2de5f81 [file] [log] [blame]

#!/usr/bin/env perl
# Script-level test for korapxml2krill: converts single texts and checks
# the generated JSON.
use strict;
use warnings;
use File::Basename 'dirname';
use File::Spec::Functions qw/catdir catfile/;
# The :POSIX tag exports tmpnam(), which is used for the output file name
use File::Temp qw/ :POSIX /;
use Mojo::File;
use Mojo::JSON qw/decode_json/;
use IO::Uncompress::Gunzip;
use Test::More;
use Test::Output;
use Data::Dumper;
# Some expected values below are non-ASCII string literals
use utf8;
Akron	e1dbc38	2016-07-08 22:24:52 +0200	[diff] [blame]	14
				15	my $f = dirname(__FILE__);
				16	my $script = catfile($f, '..', '..', 'script', 'korapxml2krill');
Akron	5f51d42	2016-08-16 16:26:43 +0200	[diff] [blame]	17
				18	my $input = catdir($f, '..', 'annotation', 'corpus', 'doc', '0001');
Akron	e1dbc38	2016-07-08 22:24:52 +0200	[diff] [blame]	19	ok(-d $input, 'Input directory found');
				20
Akron	5f51d42	2016-08-16 16:26:43 +0200	[diff] [blame]	21	my $output = tmpnam();
Akron	e1dbc38	2016-07-08 22:24:52 +0200	[diff] [blame]	22
Akron	5f51d42	2016-08-16 16:26:43 +0200	[diff] [blame]	23	ok(!-f $output, 'Output does not exist');
Akron	e1dbc38	2016-07-08 22:24:52 +0200	[diff] [blame]	24
# Build the command line for a plain (uncompressed) conversion.
# The joined string is also used as the description of the STDERR test.
my $call = join(
  ' ',
  'perl', $script,
  '--input' => $input,
  '--output' => $output,
  '-t' => 'OpenNLP#Tokens',
  '-l' => 'INFO'
);

# Test without compression
stderr_like(
  sub {
    system($call);
  },
  qr!The code took!,
  $call
);

# The run has to create the output file, which must contain valid JSON
ok(-f $output, 'Output does exist');
ok((my $file = Mojo::File->new($output)->slurp), 'Slurp data');
ok((my $json = decode_json $file), 'decode json');

# Meta data
is($json->{textType}, 'Zeitung: Tageszeitung', 'text type');
is($json->{title}, 'Beispiel Text', 'Title');

# Data section: token source, available foundries, primary text and stream
is($json->{data}->{tokenSource}, 'opennlp#tokens', 'TokenSource');
is($json->{data}->{foundries}, 'base base/paragraphs base/sentences connexor connexor/morpho connexor/phrase connexor/sentences connexor/syntax corenlp corenlp/constituency corenlp/morpho corenlp/sentences dereko dereko/structure glemm glemm/morpho mate mate/dependency mate/morpho opennlp opennlp/morpho opennlp/sentences treetagger treetagger/morpho treetagger/sentences xip xip/constituency xip/morpho xip/sentences', 'Foundries');
like($json->{data}->{text}, qr/^Zum letzten kulturellen/, 'Text');
is($json->{data}->{stream}->[0]->[0], '-:base/paragraphs$<i>1', 'Paragraphs');
Akron	e1dbc38	2016-07-08 22:24:52 +0200	[diff] [blame]	53
# Delete output
unlink $output;
ok(!-f $output, 'Output does not exist');

# Repeat the previous command with the -z switch appended
$call .= ' -z';

# Test with compression
stderr_like(
  sub { system($call); },
  qr!The code took!,
  $call
);

ok(-f $output, 'Output does exist');

# Uncompress the data using a buffer.
# With Transparent => 0 the constructor refuses input that is not gzip,
# so a defined handle also shows that the output really is compressed.
my $gz = IO::Uncompress::Gunzip->new($output, Transparent => 0);
ok($gz, 'Output is gzip compressed')
  or diag("gunzip failed: $IO::Uncompress::Gunzip::GunzipError");

$file = '';
if ($gz) {
  my $buffer;
  my $status;

  # read() returns the number of bytes read, 0 at EOF and a negative
  # value on error - only a positive status may continue the loop,
  # otherwise a read error would spin forever
  while (($status = $gz->read($buffer)) > 0) {
    $file .= $buffer;
  };
  is($status, 0, 'Gzip stream read up to EOF');
  $gz->close;
};

ok($json = decode_json($file), 'decode json');

# The decompressed JSON has to match the uncompressed conversion
is($json->{textType}, 'Zeitung: Tageszeitung', 'text type');
is($json->{title}, 'Beispiel Text', 'Title');
is($json->{data}->{tokenSource}, 'opennlp#tokens', 'TokenSource');
is($json->{data}->{foundries}, 'base base/paragraphs base/sentences connexor connexor/morpho connexor/phrase connexor/sentences connexor/syntax corenlp corenlp/constituency corenlp/morpho corenlp/sentences dereko dereko/structure glemm glemm/morpho mate mate/dependency mate/morpho opennlp opennlp/morpho opennlp/sentences treetagger treetagger/morpho treetagger/sentences xip xip/constituency xip/morpho xip/sentences', 'Foundries');
like($json->{data}->{text}, qr/^Zum letzten kulturellen/, 'Text');
is($json->{data}->{stream}->[0]->[0], '-:base/paragraphs$<i>1', 'Paragraphs');

# Delete output
is(unlink($output), 1, 'Unlink successful');
ok(!-e $output, 'Output does not exist');
Akron	5f51d42	2016-08-16 16:26:43 +0200	[diff] [blame]	89
				90	# Use a different token source and skip all annotations,
				91	# except for DeReKo#Structure and Mate#Dependency
				92	$call = join(
				93	' ',
				94	'perl', $script,
				95	'--input' => $input,
				96	'--output' => $output,
				97	'-t' => 'CoreNLP#Tokens',
				98	'-s' => '#all',
				99	'-a' => 'DeReKo#Structure',
				100	'-a' => 'Mate#Dependency',
				101	'-l' => 'INFO'
				102	);
				103
				104	stderr_like(
				105	sub {
				106	system($call);
				107	},
				108	qr!The code took!,
				109	$call
				110	);
				111
				112	ok(-f $output, 'Output does exist');
Akron	3ec0a1c	2017-01-18 14:41:55 +0100	[diff] [blame]	113	ok(($file = Mojo::File->new($output)->slurp), 'Slurp data');
Akron	5f51d42	2016-08-16 16:26:43 +0200	[diff] [blame]	114	ok(($json = decode_json $file), 'decode json');
				115
				116	is($json->{textType}, 'Zeitung: Tageszeitung', 'text type');
				117
				118	is($json->{title}, 'Beispiel Text', 'Title');
				119	is($json->{data}->{tokenSource}, 'corenlp#tokens', 'TokenSource');
				120	is($json->{data}->{foundries}, 'dereko dereko/structure mate mate/dependency', 'Foundries');
				121
				122	like($json->{data}->{text}, qr/^Zum letzten kulturellen/, 'Foundries');
				123	is($json->{data}->{stream}->[0]->[0], '-:tokens$<i>20', 'Tokens');
				124
Akron	e2b902d	2016-08-16 16:50:11 +0200	[diff] [blame]	125
				126	# Check overwrite
				127	$call = join(
				128	' ',
				129	'perl', $script,
				130	'--input' => $input,
				131	'--output' => $output,
				132	'-t' => 'CoreNLP#Tokens',
				133	'-s' => '#all',
				134	'-a' => 'DeReKo#Structure',
				135	'-a' => 'Mate#Dependency',
				136	'-l' => 'DEBUG'
				137	);
				138
				139	ok(-f $output, 'Output does exist');
				140	stderr_like(
				141	sub {
				142	system($call);
				143	},
				144	qr!already exists!,
				145	$call
				146	);
				147
				148	$call .= ' -w ';
				149
				150	stderr_unlike(
				151	sub {
				152	system($call);
				153	},
				154	qr!already exists!,
				155	$call
				156	);
				157
# Check meta data switch

# Delete output
unlink $output;
ok(!-f $output, 'Output does not exist');

$input = catdir($f, '..', 'sgbr', 'PRO-DUD', 'BSP-2013-01', '32');

# Convert a text from the sgbr test corpus: pass 'Sgbr' to the
# meta data switch (-m) and use Base#Tokens_aggr as the token source
$call = join(
  ' ',
  'perl', $script,
  '--input' => $input,
  '--output' => $output,
  '-m' => 'Sgbr',
  '-t' => 'Base#Tokens_aggr',
  '-l' => 'INFO'
);

stderr_like(
  sub {
    system($call);
  },
  qr!The code took!,
  $call
);

ok(-f $output, 'Output does exist');
ok(($file = Mojo::File->new($output)->slurp), 'Slurp data');
ok(($json = decode_json $file), 'decode json');

# Primary data
is($json->{data}->{text}, 'Selbst ist der Jeck', 'Text');
is($json->{data}->{tokenSource}, 'base#tokens_aggr', 'TokenSource');

# Meta data fields expected for this text
is($json->{pubPlace}, 'Stadtingen', 'pubPlace');
is($json->{textSigle}, 'PRO-DUD/BSP-2013-01/32', 'textSigle');
is($json->{docSigle}, 'PRO-DUD/BSP-2013-01', 'docSigle');
is($json->{corpusSigle}, 'PRO-DUD', 'corpusSigle');
is($json->{sgbrKodex}, 'T', 'sgbrKodex');
is($json->{author}, 'unbekannt', 'Author');
is($json->{language}, 'de', 'Language');
is($json->{docTitle}, 'Korpus zur Beobachtung des Schreibgebrauchs im Deutschen', 'docTitle');
is($json->{funder}, 'Bundesministerium für Bildung und Forschung', 'funder');
is($json->{title}, 'Nur Platt, kein Deutsch', 'title');
is($json->{pubDate}, '20130126', 'pubDate');
is($json->{docSubTitle}, 'Subkorpus Ortsblatt, Jahrgang 2013, Monat Januar', 'docSubTitle');
is($json->{keywords}, 'sgbrKodex:T', 'keywords');
is($json->{publisher}, 'Dorfblatt GmbH', 'publisher');
				206
Akron	636bd9c	2017-02-09 17:13:00 +0100	[diff] [blame^]	207
				208
				209	# AGA with base info
				210	unlink $output;
				211	ok(!-f $output, 'Output does not exist');
				212	$input = catdir($f, '..', 'corpus', 'GOE2', 'AGA', '03828');
				213	ok(-d $input, 'Input directory found');
				214
				215	ok(!-f $output, 'Output does not exist');
				216
				217	$call = join(
				218	' ',
				219	'perl', $script,
				220	'--input' => $input,
				221	'--output' => $output,
				222	'-t' => 'base#tokens_aggr',
				223	'-bs' => 'DeReKo#Structure',
				224	'-bp' => 'DeReKo#Structure',
				225	'-bpb' => 'DeReKo#Structure',
				226	'-l' => 'INFO'
				227	);
				228
				229	stderr_like(
				230	sub {
				231	system($call);
				232	},
				233	qr!The code took!,
				234	$call
				235	);
				236	ok(-f $output, 'Output does exist');
				237	ok(($file = Mojo::File->new($output)->slurp), 'Slurp data');
				238	ok(($json = decode_json $file), 'decode json');
				239
				240	is($json->{title}, 'Autobiographische Einzelheiten', 'title');
				241	is($json->{data}->{stream}->[0]->[-1], '~:base/s:pb$<i>529<i>0', 'Pagebreak annotation');
				242
Akron	e1dbc38	2016-07-08 22:24:52 +0200	[diff] [blame]	243	done_testing;
				244	__END__
Akron	5f51d42	2016-08-16 16:26:43 +0200	[diff] [blame]	245