Akron | 414ec95 | 2020-08-03 15:48:43 +0200 | [diff] [blame] | 1 | #!/usr/bin/env perl |
| 2 | use strict; |
| 3 | use warnings; |
| 4 | use utf8; |
| 5 | use Test::More; |
| 6 | use JSON::XS; |
| 7 | use Benchmark ':hireswallclock'; |
| 8 | use lib 'lib', '../lib'; |
| 9 | |
| 10 | use File::Basename 'dirname'; |
| 11 | use File::Spec::Functions 'catdir'; |
| 12 | |
# Skip the whole file when tests against real corpus data are disabled
plan skip_all => 'Skip real tests' if $ENV{SKIP_REAL};

use_ok('KorAP::XML::Krill');
| 18 | |
# Turn a serialized token stream like '[(0-1)s:A|i:a|...]' into a set
# (hash mapping each annotation to 1), dropping the '(start-end)' offset
# prefix, so streams can be compared independent of annotation order.
sub _t2h {
  my $token = shift;
  # Strip the enclosing brackets and the leading '(start-end)' offsets
  $token =~ s/^\[\(\d+?-\d+?\)(.+?)\]$/$1/;
  my %set = map { $_ => 1 } split(/\|/, $token);
  return \%set;
};
| 28 | |
# Foundry/layer combinations that will be added to the token stream below.
# Commented-out entries are either added separately in the test flow or
# intentionally unsupported.
my @layers = (
  # ['Base', 'Sentences'],
  ['Base', 'Paragraphs'],

  # OpenNLP
  ['OpenNLP', 'Morpho'],
  ['OpenNLP', 'Sentences'],

  # CoreNLP
  ['CoreNLP', 'NamedEntities', 'ne_dewac_175m_600'],
  ['CoreNLP', 'NamedEntities', 'ne_hgc_175m_600'],
  ['CoreNLP', 'Sentences'],

  # ['DeReKo', 'Structure'],

  # ['Glemm', 'Morpho'],

  # Mate
  # ['Mate', 'Morpho'],
  ['Mate', 'Dependency'],

  # ['Malt', 'Dependency'],

  # Connexor
  ['Connexor', 'Morpho'],
  ['Connexor', 'Syntax'],
  ['Connexor', 'Phrase'],
  ['Connexor', 'Sentences'],

  # TODO: OpenNLP

  # TreeTagger
  ['TreeTagger', 'Morpho'],
  ['TreeTagger', 'Sentences'],

  # XIP
  ['XIP', 'Morpho'],
  ['XIP', 'Constituency'],
  # ['XIP', 'Dependency'],  # Will be skipped
  ['XIP', 'Sentences'],
);
| 74 | |
my $path = catdir(dirname(__FILE__), qw(corpus WPD 00001));

# Loading with and without a trailing slash should behave identically
ok(my $doc = KorAP::XML::Krill->new(path => $path . '/'), 'Load Korap::Document');
like($doc->path, qr!\Q$path\E/$!, 'Path');

ok($doc = KorAP::XML::Krill->new(path => $path), 'Load Korap::Document');
like($doc->path, qr!\Q$path\E/$!, 'Path');

ok($doc->parse, 'Parse document');

# Metadata
my $meta = $doc->meta;
is($meta->{T_title}, 'A', 'title');
ok(!$meta->{T_sub_title}, 'subTitle');

is($doc->text_sigle, 'WPD/AAA/00001', 'ID');
is($doc->corpus_sigle, 'WPD', 'corpusID');

is($meta->{D_pub_date}, '20050328', 'pubDate');
is($meta->{S_pub_place}, 'URL:http://de.wikipedia.org', 'pubPlace');

# Exactly four text classes are expected, in this order
my @exp_classes = qw(freizeit-unterhaltung reisen wissenschaft populaerwissenschaft);
is($meta->{K_text_class}->[$_], $exp_classes[$_], 'TextClass') for 0 .. $#exp_classes;
ok(!$meta->{K_text_class}->[4], 'TextClass');
is($meta->{T_author}, 'Ruru; Jens.Ol; Aglarech; u.a.', 'author');
| 100 | |
# Get tokens
use_ok('KorAP::XML::Tokenizer');

# Get tokenization based on the OpenNLP token layer
my $tokens = KorAP::XML::Tokenizer->new(
  path    => $doc->path,
  doc     => $doc,
  foundry => 'OpenNLP',
  layer   => 'Tokens',
  name    => 'tokens'
);
ok($tokens, 'New Tokenizer');
ok($tokens->parse, 'Parse');

like($tokens->path, qr!\Q$path\E/$!, 'Path');
is($tokens->foundry, 'OpenNLP', 'Foundry');
is($tokens->doc->text_sigle, 'WPD/AAA/00001', 'Doc id');
is($tokens->should, 1068, 'Should');
is($tokens->have, 923, 'Have');
is($tokens->name, 'tokens', 'Name');
is($tokens->layer, 'Tokens', 'Layer');

# Spot-check a single token's serialization
is_deeply(
  _t2h($tokens->stream->pos(118)->to_string),
  _t2h('[(763-768)s:Linie|i:linie|_118$<i>763<i>768]'),
  'Token is correct'
);
| 125 | |
# Add Mate morphology and verify the enriched token
ok($tokens->add('Mate', 'Morpho'), 'Add Mate');

is_deeply(
  _t2h($tokens->stream->pos(118)->to_string),
  _t2h('[(763-768)s:Linie|i:linie|_118$<i>763<i>768|mate/l:linie|mate/p:NN|mate/m:case:acc|mate/m:number:sg|mate/m:gender:fem]'),
  'with Mate'
);

# Add base sentence boundaries and verify the stream's start token
ok($tokens->add('Base', 'Sentences'), 'Add Sentences');

is_deeply(
  _t2h($tokens->stream->pos(0)->to_string),
  _t2h('[(0-1)s:A|i:a|_0$<i>0<i>1|-:tokens$<i>923|mate/p:XY|<>:base/s:s$<b>64<i>0<i>74<i>13<b>2|<>:base/s:t$<b>64<i>0<i>6083<i>923<b>0|-:base/sentences$<i>96]'),
  'Startinfo'
);

# Add all remaining foundry/layer combinations
for my $layer (@layers) {
  ok($tokens->add(@$layer), 'Add ' . join(', ', @$layer));
};
| 147 | |
# Expected serialization of the first token after all layers were added.
# Commented-out pieces belong to layers that are skipped or added elsewhere.
my $s = join '', (
  '[(0-1)s:A|i:a|_0$<i>0<i>1|',
  '-:tokens$<i>923|',
  'mate/p:XY|',
  '<>:base/s:s$<b>64<i>0<i>74<i>13<b>2|',
  '<>:base/s:t$<b>64<i>0<i>6083<i>923<b>0|',
  '-:base/sentences$<i>96|',
  '<>:base/s:p$<b>64<i>0<i>224<i>34<b>1|',
  '-:base/paragraphs$<i>76|',
  'opennlp/p:NE|',
  '<>:opennlp/s:s$<b>64<i>0<i>74<i>13<b>0|',
  '-:opennlp/sentences$<i>50|',
  '<>:corenlp/s:s$<b>64<i>0<i>6<i>2<b>0|',
  '-:corenlp/sentences$<i>67|',
  'cnx/l:A|',
  'cnx/p:N|',
  'cnx/syn:@NH|',
  '<>:cnx/c:np$<b>64<i>0<i>1<i>1<b>0|',
  '<>:cnx/s:s$<b>64<i>0<i>74<i>13<b>0|',
  '-:cnx/sentences$<i>63|',
  # 'tt/l:A|',
  'tt/p:NN$<b>129<b>199|',
  'tt/l:A$<b>129<b>253|',
  'tt/p:FM$<b>129<b>54|',
  '<>:tt/s:s$<b>64<i>0<i>6083<i>923<b>0|',
  '-:tt/sentences$<i>1|',
  '>:mate/d:PNC$<b>32<i>2|',  # <s>0<s>0|
  # 'mate/d:&&&$<b>128<s>1|',
  'xip/p:SYMBOL|',
  'xip/l:A|',
  '<>:xip/c:TOP$<b>64<i>0<i>74<i>13<b>0|',
  '<>:xip/c:MC$<b>64<i>0<i>73<i>13<b>1|',
  '<>:xip/c:NP$<b>64<i>0<i>1<i>1<b>2|',
  '<>:xip/c:NPA$<b>64<i>0<i>1<i>1<b>3|',
  '<>:xip/c:NOUN$<b>64<i>0<i>1<i>1<b>4|',
  '<>:xip/c:SYMBOL$<b>64<i>0<i>1<i>1<b>5|',
  # '>:xip/d:SUBJ$<i>3|',
  # '<:xip/d:COORD$<i>1|',
  '<>:xip/s:s$<b>64<i>0<i>74<i>13<b>0|',
  '-:xip/sentences$<i>65]',
);
| 188 | |
{
  # Silence warnings emitted while serializing the full stream
  local $SIG{__WARN__} = sub {};
  my $got = _t2h($tokens->stream->pos(0)->to_string);
  is_deeply($got, _t2h($s), 'Startinfo');
};
| 196 | |
| 197 | |
# Reported layer information (xip/d=rels is missing on purpose)
my $exp_layer_info = join ' ', qw(
  base/s=spans cnx/c=spans cnx/l=tokens cnx/m=tokens cnx/p=tokens
  cnx/s=spans cnx/syn=tokens corenlp/ne=tokens corenlp/s=spans
  mate/d=rels mate/l=tokens mate/m=tokens mate/p=tokens
  opennlp/p=tokens opennlp/s=spans tt/l=tokens tt/p=tokens tt/s=spans
  xip/c=spans xip/l=tokens xip/p=tokens xip/s=spans
);
is($tokens->layer_info, $exp_layer_info, 'Layer info');

# Supported foundries/layers (xip/dependency is missing on purpose)
my $exp_support = join ' ', qw(
  base base/paragraphs base/sentences
  connexor connexor/morpho connexor/phrase connexor/sentences connexor/syntax
  corenlp corenlp/namedentities corenlp/namedentities
  corenlp/namedentities/ne_dewac_175m_600 corenlp/namedentities/ne_hgc_175m_600
  corenlp/sentences
  mate mate/dependency mate/morpho
  opennlp opennlp/morpho opennlp/sentences
  treetagger treetagger/morpho treetagger/sentences
  xip xip/constituency xip/morpho xip/sentences
);
is($tokens->support, $exp_support, 'Support');

done_testing;
__END__