#!/usr/bin/env perl
# source ~/perl5/perlbrew/etc/bashrc
# perlbrew switch perl-blead@korap
use strict;
use warnings;
use utf8;
use Test::More;
use Benchmark ':hireswallclock';
use lib 'lib', '../lib';
use Scalar::Util qw/weaken/;

use File::Basename 'dirname';
use File::Spec::Functions 'catdir';

use_ok('KorAP::Document');

my $path = catdir(dirname(__FILE__), 'artificial');
ok(my $doc = KorAP::Document->new( path => $path . '/' ), 'Load KorAP::Document');
is($doc->path, $path . '/', 'Path');
ok($doc->parse, 'Parse document');

sub new_tokenizer {
  my $x = $doc;
  weaken $x;
  return KorAP::Tokenizer->new(
    path => $x->path,
    doc => $x,
    foundry => 'OpenNLP',
    layer => 'Tokens',
    name => 'tokens'
  );
};
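
# Helper to build a fresh OpenNLP tokenizer for $doc. The copied
# reference is weakened first, presumably to avoid a circular
# reference between the document and the tokenizer that holds it
# (doc => $x), which would otherwise keep both objects alive.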

is($doc->primary->data,
   'Zum letzten kulturellen Anlass lädt die Leitung des Schulheimes Hofbergli ein, '.
   'bevor der Betrieb Ende Schuljahr eingestellt wird.', 'Primary data');

is($doc->primary->data_length, 129, 'Primary data length');

is($doc->primary->data(0,3), 'Zum', 'Get primary data');

# Get tokens
use_ok('KorAP::Tokenizer');
# Get tokenization
ok(my $tokens = KorAP::Tokenizer->new(
  path => $doc->path,
  doc => $doc,
  foundry => 'OpenNLP',
  layer => 'Tokens',
  name => 'tokens'
), 'New Tokenizer');
ok($tokens->parse, 'Parse');

is($tokens->foundry, 'OpenNLP', 'Foundry');

is($tokens->doc->id, 'ART_00001', 'Doc id');
is($tokens->should, 20, 'Should');
is($tokens->have, 18, 'Have');
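
# Note on should (20) vs. have (18): the foundry's tokenization
# presumably contains 20 tokens including the punctuation marks
# (',' and '.'), of which only the 18 word tokens are indexed.
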
is($tokens->name, 'tokens', 'Name');
is($tokens->layer, 'Tokens', 'Layer');

is($tokens->stream->pos(0)->to_string, '[(0-3)s:Zum|i:zum|_0#0-3|-:tokens$<i>18]', 'Token is correct');
is($tokens->stream->pos(1)->to_string, '[(4-11)s:letzten|i:letzten|_1#4-11]', 'Token is correct');
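
# Reading the serialized token, as far as these assertions show:
#   (0-3)           character offsets of the token
#   s:Zum           surface form
#   i:zum           lower-cased form
#   _0#0-3          token position 0 with its character offsets
#   -:tokens$<i>18  meta term on the first token: total token count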

my $i = 2;
foreach ([12,23, 'kulturellen'],
         [24,30, 'Anlass'],
         [31,35, 'lädt'],
         [36,39, 'die'],
         [40,47, 'Leitung'],
         [48,51, 'des'],
         [52,63, 'Schulheimes'],
         [64,73, 'Hofbergli'],
         [74,77, 'ein'],
         [79,84, 'bevor'],
         [85,88, 'der'],
         [89,96, 'Betrieb'],
         [97,101, 'Ende'],
         [102,111, 'Schuljahr'],
         [112,123, 'eingestellt'],
         [124,128, 'wird']) {
  is($tokens->stream->pos($i++)->to_string,
     '[('.$_->[0].'-'.$_->[1].')'.
     's:'.$_->[2].'|i:'.lc($_->[2]).'|'.
     '_'.($i-1).'#'.$_->[0].'-'.$_->[1].']',
     'Token is correct');
};

ok(!$tokens->stream->pos($i++), 'No more tokens');

# Add OpenNLP/Morpho
ok($tokens->add('OpenNLP', 'Morpho'), 'Add OpenNLP/Morpho');

$i = 0;
foreach (qw/APPRART ADJA ADJA NN VVFIN ART NN ART NN NE PTKVZ KOUS ART NN NN NN VVPP VAFIN/) {
  like($tokens->stream->pos($i++)->to_string,
       qr!\|opennlp/p:$_!,
       'Annotation (OpenNLP) is correct');
};
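
# The expected tag sequence is matched as a regex per token; the tags
# (APPRART, VVFIN, PTKVZ etc.) are presumably from the STTS tagset.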

# Add OpenNLP/Sentences
ok($tokens->add('OpenNLP', 'Sentences'), 'Add OpenNLP/Sentences');

is($tokens->stream->pos(0)->to_string, '[(0-3)s:Zum|i:zum|_0#0-3|-:tokens$<i>18|opennlp/p:APPRART|<>:opennlp/s#0-129$<i>17|-:opennlp/sentences$<i>1]', 'Correct sentence');
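
# <>:opennlp/s#0-129$<i>17 appears to be a span term: a sentence
# covering characters 0 to 129 with a payload of 17 (presumably the
# last token position in the span); -:opennlp/sentences$<i>1 records
# the sentence count.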


# New instantiation
ok($tokens = KorAP::Tokenizer->new(
  path => $doc->path,
  doc => $doc,
  foundry => 'OpenNLP',
  layer => 'Tokens',
  name => 'tokens'
), 'New Tokenizer');

ok($tokens->parse, 'Parse');

# Add Base/Sentences
ok($tokens->add('Base', 'Sentences'), 'Add Base/Sentences');

# Add Base/Paragraphs
ok($tokens->add('Base', 'Paragraphs'), 'Add Base/Paragraphs');

is($tokens->stream->pos(0)->to_string,
   '[(0-3)s:Zum|i:zum|_0#0-3|-:tokens$<i>18|<>:base/s#0-129$<i>17|<>:base/text#0-129$<i>17|-:base/sentences$<i>1|-:base/paragraphs$<i>0]',
   'Correct base annotation');


# New instantiation
ok($tokens = new_tokenizer, 'New Tokenizer');

ok($tokens->parse, 'Parse');

# Add CoreNLP/NamedEntities
ok($tokens->add('CoreNLP', 'NamedEntities', 'ne_dewac_175m_600'), 'Add CoreNLP/NamedEntities');
ok($tokens->add('CoreNLP', 'NamedEntities', 'ne_hgc_175m_600'), 'Add CoreNLP/NamedEntities');

is($tokens->stream->pos(9)->to_string,
   '[(64-73)s:Hofbergli|i:hofbergli|_9#64-73|corenlp/ne_dewac_175m_600:I-LOC|corenlp/ne_hgc_175m_600:I-LOC]',
   'Correct NamedEntities annotation');
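
# Both models (ne_dewac_175m_600 and ne_hgc_175m_600) tag 'Hofbergli'
# as I-LOC, presumably an IOB-style location label.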


# New instantiation
ok($tokens = new_tokenizer, 'New Tokenizer');
ok($tokens->parse, 'Parse');

# Add CoreNLP/Morpho
ok($tokens->add('CoreNLP', 'Morpho'), 'Add CoreNLP/Morpho');

is($tokens->stream->pos(0)->to_string,
   '[(0-3)s:Zum|i:zum|_0#0-3|-:tokens$<i>18|corenlp/p:APPRART]',
   'Correct corenlp annotation');

$i = 0;
foreach (qw/APPRART ADJ ADJA NN VVFIN ART NN ART NN NE PTKVZ KOUS ART NN NN NN VVPP VAFIN/) {
  like($tokens->stream->pos($i++)->to_string,
       qr!\|corenlp/p:$_!,
       'Annotation (CoreNLP) is correct');
};

# Add CoreNLP/Sentences
ok($tokens->add('CoreNLP', 'Sentences'), 'Add CoreNLP/Sentences');

is($tokens->stream->pos(0)->to_string,
   '[(0-3)s:Zum|i:zum|_0#0-3|-:tokens$<i>18|corenlp/p:APPRART|<>:corenlp/s#0-129$<i>17|-:corenlp/sentences$<i>1]',
   'Correct corenlp annotation');


# New instantiation
ok($tokens = new_tokenizer, 'New Tokenizer');
ok($tokens->parse, 'Parse');

# Add Connexor/Sentences
ok($tokens->add('Connexor', 'Sentences'), 'Add Connexor/Sentences');

is($tokens->stream->pos(0)->to_string,
   '[(0-3)s:Zum|i:zum|_0#0-3|-:tokens$<i>18|<>:cnx/s#0-129$<i>17|-:cnx/sentences$<i>1]',
   'Correct cnx annotation');
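
# Foundry names are abbreviated in the term prefixes: Connexor
# appears as cnx/ (and TreeTagger as tt/ in the scratch code below).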



# Todo: CoreNLP/Constituency!
# Todo: Connexor/Morpho
# Todo: Connexor/Phrase
# Todo: Connexor/Syntax


done_testing;
__END__
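
# Unreachable scratch code follows (kept after __END__), apparently
# from a variant of this test running a WPD document (WPD_AAA.00001)
# against the full layer set.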



# Connexor
my @layers;
push(@layers, ['Connexor', 'Morpho']);
push(@layers, ['Connexor', 'Syntax']);
push(@layers, ['Connexor', 'Phrase']);
push(@layers, ['Connexor', 'Sentences']);

# TreeTagger
push(@layers, ['TreeTagger', 'Morpho']);
push(@layers, ['TreeTagger', 'Sentences']);

# Mate
# push(@layers, ['Mate', 'Morpho']);
push(@layers, ['Mate', 'Dependency']);

# XIP
push(@layers, ['XIP', 'Morpho']);
push(@layers, ['XIP', 'Constituency']);
push(@layers, ['XIP', 'Dependency']);
push(@layers, ['XIP', 'Sentences']);



# Metadata
is($doc->title, 'A', 'title');
ok(!$doc->sub_title, 'subTitle');

is($doc->id, 'WPD_AAA.00001', 'ID');
is($doc->corpus_id, 'WPD', 'corpusID');
is($doc->pub_date, '20050328', 'pubDate');
is($doc->pub_place, 'URL:http://de.wikipedia.org', 'pubPlace');
is($doc->text_class->[0], 'freizeit-unterhaltung', 'TextClass');
is($doc->text_class->[1], 'reisen', 'TextClass');
is($doc->text_class->[2], 'wissenschaft', 'TextClass');
is($doc->text_class->[3], 'populaerwissenschaft', 'TextClass');
ok(!$doc->text_class->[4], 'TextClass');
is($doc->author->[0], 'Ruru', 'author');
is($doc->author->[1], 'Jens.Ol', 'author');
is($doc->author->[2], 'Aglarech', 'author');
ok(!$doc->author->[3], 'author');

# Get tokens
use_ok('KorAP::Tokenizer');
# Get tokenization
ok(my $tokens = KorAP::Tokenizer->new(
  path => $doc->path,
  doc => $doc,
  foundry => 'OpenNLP',
  layer => 'Tokens',
  name => 'tokens'
), 'New Tokenizer');
ok($tokens->parse, 'Parse');

is($tokens->path, $path . '/', 'Path');
is($tokens->foundry, 'OpenNLP', 'Foundry');
is($tokens->doc->id, 'WPD_AAA.00001', 'Doc id');
is($tokens->should, 1068, 'Should');
is($tokens->have, 923, 'Have');
is($tokens->name, 'tokens', 'Name');
is($tokens->layer, 'Tokens', 'Layer');

is($tokens->stream->pos(118)->to_string, '[(763-768)s:Linie|i:linie|_118#763-768]', 'Token is correct');
# Add Mate/Morpho
ok($tokens->add('Mate', 'Morpho'), 'Add Mate');

is($tokens->stream->pos(118)->to_string, '[(763-768)s:Linie|i:linie|_118#763-768|mate/l:linie|mate/p:NN|mate/m:case:acc|mate/m:number:sg|mate/m:gender:fem]', 'with Mate');

# Add Base/Sentences
ok($tokens->add('Base', 'Sentences'), 'Add Sentences');

is($tokens->stream->pos(0)->to_string, '[(0-1)s:A|i:a|_0#0-1|-:tokens$<i>923|mate/p:XY|<>:base/s#0-74$<i>13|<>:base/text#0-6083$<i>923|-:sentences$<i>96]', 'Startinfo');

foreach (@layers) {
  ok($tokens->add(@$_), 'Add '. join(', ', @$_));
};

is($tokens->stream->pos(0)->to_string, '[(0-1)s:A|i:a|_0#0-1|-:tokens$<i>923|mate/p:XY|<>:base/s#0-74$<i>13|<>:base/text#0-6083$<i>923|-:sentences$<i>96|<>:base/para#0-224$<i>34|-:paragraphs$<i>76|opennlp/p:NE|<>:opennlp/s#0-74$<i>13|<>:corenlp/s#0-6$<i>2|cnx/l:A|cnx/p:N|cnx/syn:@NH|<>:cnx/s#0-74$<i>13|tt/l:A|tt/p:NN|tt/l:A|tt/p:FM|<>:tt/s#0-6083$<i>923|>:mate/d:PNC$<i>2|xip/p:SYMBOL|xip/l:A|<>:xip/c:TOP#0-74$<i>13|<>:xip/c:MC#0-73$<i>13<b>1|>:xip/d:SUBJ$<i>3|<:xip/d:COORD$<i>1|<>:xip/s#0-74$<i>13]', 'Startinfo');


is($tokens->stream->pos(118)->to_string,
   '[(763-768)s:Linie|i:linie|_118#763-768|'.
   'mate/l:linie|mate/p:NN|mate/m:case:acc|mate/m:number:sg|mate/m:gender:fem|'.
   'opennlp/p:NN|'.
   'cnx/l:linie|cnx/p:N|cnx/syn:@NH|'.
   'tt/l:Linie|tt/p:NN|'.
   '<:mate/d:NK$<i>116|<:mate/d:NK$<i>117|>:mate/d:NK$<i>115|'.
   'xip/p:NOUN|xip/l:Linie|<>:xip/c:NOUN#763-768$<i>119|<:xip/d:DETERM$<i>116|<:xip/d:NMOD$<i>117]', 'with All');

is($tokens->layer_info, 'cnx/c=const cnx/l=lemma cnx/m=msd cnx/p=pos mate/d=dep mate/l=lemma mate/m=msd mate/p=pos opennlp/p=pos tt/l=lemma tt/p=pos xip/c=const xip/d=dep xip/l=lemma xip/p=pos', 'Layer info');

is($tokens->support, 'base base/paragraphs base/sentences connexor connexor/morpho connexor/phrase connexor/sentences connexor/syntax corenlp corenlp/namedentities corenlp/namedentities corenlp/namedentities/ne_dewac_175m_600 corenlp/namedentities/ne_hgc_175m_600 corenlp/sentences mate mate/dependency mate/morpho opennlp opennlp/morpho opennlp/sentences treetagger treetagger/morpho treetagger/sentences xip xip/constituency xip/dependency xip/morpho xip/sentences', 'Support');

done_testing;

__END__