Blame - script/prepare_index.pl - KorAP/KorAP-XML-Krill

blob: f0dc39891db767fad8438fe6b7d711a890697a26 [file] [log] [blame]

Nils Diewald	2db9ad0	2013-10-29 19:26:43 +0000	[diff] [blame^]	1	#!/usr/bin/env perl
				2	use strict;
				3	use warnings;
				4	use v5.16;
				5	use lib 'lib', '../lib';
				6	use Set::Scalar;
				7	use Mojo::DOM;
				8	use Mojo::Util qw/encode decode/;
				9	use Mojo::ByteStream 'b';
				10
				11	use Log::Log4perl;
				12	Log::Log4perl->init("script/log4perl.conf");
				13
				14	use KorAP::Document;
				15	use KorAP::Tokenizer;
				16
				17
				18	# Call perl script/prepare_index.pl WPD/AAA/00001
				19
				20	sub parse_doc {
				21	my $doc = KorAP::Document->new(
				22	path => shift . '/'
				23	);
				24
				25	$doc->parse;
				26
				27	my $tokens = KorAP::Tokenizer->new(
				28	path => $doc->path,
				29	doc => $doc,
				30	foundry => 'connexor',
				31	layer => 'tokens'
				32	);
				33
				34	$tokens->parse;
				35
				36	my $i = 0;
				37	$tokens->add_spandata(
				38	foundry => 'connexor',
				39	layer => 'sentences',
				40	#skip => 1,
				41	cb => sub {
				42	my ($stream, $span) = @_;
				43	my $mtt = $stream->pos($span->p_start);
				44	$mtt->add(
				45	term => '<>:s',
				46	o_start => $span->o_start,
				47	o_end => $span->o_end,
				48	p_end => $span->p_end
				49	);
				50	$i++;
				51	}
				52	);
				53
				54	$tokens->stream->add_meta('s', '<i>' . $i);
				55
				56	$i = 0;
				57	$tokens->add_spandata(
				58	foundry => 'base',
				59	layer => 'paragraph',
				60	#skip => 1,
				61	cb => sub {
				62	my ($stream, $span) = @_;
				63	my $mtt = $stream->pos($span->p_start);
				64	$mtt->add(
				65	term => '<>:p',
				66	o_start => $span->o_start,
				67	o_end => $span->o_end,
				68	p_end => $span->p_end
				69	);
				70	$i++;
				71	}
				72	);
				73	$tokens->stream->add_meta('p', '<i>' . $i);
				74
				75	$tokens->add_tokendata(
				76	foundry => 'opennlp',
				77	layer => 'morpho',
				78	#skip => 1,
				79	cb => sub {
				80	my ($stream, $token) = @_;
				81	my $mtt = $stream->pos($token->pos);
				82	my $content = $token->content;
				83
				84	my $found;
				85
				86	# syntax
				87	if (($found = $content->at('f[name="pos"]')) && ($found = $found->text)) {
				88	$mtt->add(
				89	term => 'opennlp_p:' . $found
				90	);
				91	};
				92	});
				93
				94
				95	my $model = 'ne_dewac_175m_600';
				96	$tokens->add_tokendata(
				97	foundry => 'corenlp',
				98	#skip => 1,
				99	layer => $model,
				100	cb => sub {
				101	my ($stream, $token) = @_;
				102	my $mtt = $stream->pos($token->pos);
				103	my $content = $token->content;
				104
				105	my $found;
				106
				107	if (($found = $content->at('f[name=ne] f[name=ent]')) && ($found = $found->text)) {
				108	$mtt->add(
				109	term => 'corenlp_' . $model . ':' . $found
				110	);
				111	};
				112	});
				113
				114	$model = 'ne_hgc_175m_600';
				115	$tokens->add_tokendata(
				116	foundry => 'corenlp',
				117	#skip => 1,
				118	layer => $model,
				119	cb => sub {
				120	my ($stream, $token) = @_;
				121	my $mtt = $stream->pos($token->pos);
				122	my $content = $token->content;
				123
				124	my $found;
				125
				126	if (($found = $content->at('f[name=ne] f[name=ent]')) && ($found = $found->text)) {
				127	$mtt->add(
				128	term => 'corenlp_' . $model . ':' . $found
				129	);
				130	};
				131	});
				132
				133	$tokens->add_tokendata(
				134	foundry => 'connexor',
				135	layer => 'morpho',
				136	#skip => 1,
				137	cb => sub {
				138	my ($stream, $token) = @_;
				139	my $mtt = $stream->pos($token->pos);
				140	my $content = $token->content;
				141
				142	my $found;
				143
				144	# Lemma
				145	if (($found = $content->at('f[name="lemma"]')) && ($found = $found->text)) {
				146	if (index($found, "\N{U+00a0}") >= 0) {
				147	$found = b($found)->decode;
				148	foreach (split(/\x{00A0}/, $found)) {
				149	$mtt->add(
				150	term => 'cnx_l:' . b($_)->encode
				151	);
				152	}
				153	}
				154	else {
				155	$mtt->add(
				156	term => 'cnx_l:' . $found # b($found)->encode
				157	);
				158	};
				159	};
				160
				161	# POS
				162	if (($found = $content->at('f[name="pos"]')) && ($found = $found->text)) {
				163	$mtt->add(
				164	term => 'cnx_p:' . $found
				165	);
				166	};
				167
				168	# MSD
				169	# Todo: Look in the description!
				170	if (($found = $content->at('f[name="msd"]')) && ($found = $found->text)) {
				171	foreach (split(':', $found)) {
				172	$mtt->add(
				173	term => 'cnx_m:' . $_
				174	);
				175	};
				176	};
				177	}
				178	);
				179
				180	$tokens->add_tokendata(
				181	foundry => 'connexor',
				182	layer => 'syntax',
				183	#skip => 1,
				184	cb => sub {
				185	my ($stream, $token) = @_;
				186	my $mtt = $stream->pos($token->pos);
				187	my $content = $token->content;
				188
				189	my $found;
				190
				191	# syntax
				192	if (($found = $content->at('f[name="pos"]')) && ($found = $found->text)) {
				193	$mtt->add(
				194	term => 'cnx_syn:' . $found
				195	);
				196	};
				197	});
				198
				199	$tokens->add_spandata(
				200	foundry => 'connexor',
				201	layer => 'phrase',
				202	#skip => 1,
				203	cb => sub {
				204	my ($stream, $span) = @_;
				205
				206	my $type = $span->content->at('f[name=pos]');
				207	if ($type && ($type = $type->text)) {
				208	my $mtt = $stream->pos($span->p_start);
				209	$mtt->add(
				210	term => '<>:cnx_const:' . $type,
				211	o_start => $span->o_start,
				212	o_end => $span->o_end,
				213	p_end => $span->p_end
				214	);
				215	};
				216	}
				217	);
				218
				219	$tokens->add_tokendata(
				220	foundry => 'tree_tagger',
				221	#skip => 1,
				222	layer => 'morpho',
				223	cb => sub {
				224	my ($stream, $token) = @_;
				225	my $mtt = $stream->pos($token->pos);
				226	my $content = $token->content;
				227
				228	my $found;
				229
				230	# lemma
				231	if (($found = $content->at('f[name="lemma"]')) &&
				232	($found = $found->text) && $found ne 'UNKNOWN') {
				233	$mtt->add(
				234	term => 'tt_l:' . $found
				235	);
				236	};
				237
				238	# pos
				239	if (($found = $content->at('f[name="ctag"]')) && ($found = $found->text)) {
				240	$mtt->add(
				241	term => 'tt_p:' . $found
				242	);
				243	};
				244	});
				245
				246	$tokens->add_tokendata(
				247	foundry => 'mate',
				248	layer => 'morpho',
				249	cb => sub {
				250	my ($stream, $token) = @_;
				251	my $mtt = $stream->pos($token->pos);
				252	my $content = $token->content;
				253
				254	my $found;
				255
				256	my $capital = 0;
				257
				258	# pos
				259	if (($found = $content->at('f[name="pos"]')) &&
				260	($found = $found->text)) {
				261	$mtt->add(term => 'mate_p:' . $found
				262	);
				263	};
				264
				265	# lemma
				266	if (($found = $content->at('f[name="lemma"]'))
				267	&& ($found = $found->text)
				268	&& $found ne '--') {
				269	$mtt->add(term => 'mate_l:' . b($found)->decode('latin-1')->encode->to_string);
				270	};
				271
				272	# MSD
				273	if (($found = $content->at('f[name="msd"]')) &&
				274	($found = $found->text) &&
				275	($found ne '_')) {
				276	foreach (split '\\|', $found) {
				277	my ($x, $y) = split "=", $_;
				278	# case, tense, number, mood, person, degree, gender
				279	$mtt->add(term => 'mate_m:' . $x . ':' . $y);
				280	};
				281	};
				282	});
				283
				284
				285	$tokens->add_tokendata(
				286	foundry => 'xip',
				287	#skip => 1,
				288	layer => 'morpho',
				289	encoding => 'bytes',
				290	cb => sub {
				291	my ($stream, $token) = @_;
				292	my $mtt = $stream->pos($token->pos);
				293	my $content = $token->content;
				294
				295	my $found;
				296
				297	my $capital = 0;
				298	# pos
				299	if (($found = $content->at('f[name="pos"]')) && ($found = $found->text)) {
				300	$mtt->add(
				301	term => 'xip_p:' . $found
				302	);
				303
				304	$capital = 1 if $found eq 'NOUN';
				305	};
				306
				307	# lemma
				308	if (($found = $content->at('f[name="lemma"]')) && ($found = $found->text)) {
				309	my (@token) = split('#', $found);
				310
				311	my $full = '';
				312	foreach (@token) {
				313	$full .= $_;
				314	$_ =~ s{/\w+$}{};
				315	$mtt->add(term => 'xip_l:' . $_);
				316	};
				317	if (@token > 1) {
				318	$full =~ s{/}{}g;
				319	$full = lc $full;
				320	$full = $capital ? ucfirst($full) : $full;
				321	$mtt->add(term => 'xip_l:' . $full);
				322	};
				323	};
				324	});
				325
				326
				327	# Collect all spans and check for roots
				328	my %xip_const;
				329	my $xip_const_root = Set::Scalar->new;
				330	my $xip_const_noroot = Set::Scalar->new;
				331
				332	# First run:
				333	$tokens->add_spandata(
				334	foundry => 'xip',
				335	layer => 'constituency',
				336	encoding => 'bytes',
				337	#skip => 1,
				338	cb => sub {
				339	my ($stream, $span) = @_;
				340
				341	$xip_const{$span->id} = $span;
				342	$xip_const_root->insert($span->id);
				343
				344	$span->content->find('rel[label=dominates][target]')->each(
				345	sub {
				346	my $rel = shift;
				347	$xip_const_noroot->insert($rel->attr('target'));
				348	}
				349	);
				350	}
				351	);
				352
				353	my $stream = $tokens->stream;
				354
				355	my $add_const = sub {
				356	my $span = shift;
				357	my $level = shift;
				358	my $mtt = $stream->pos($span->p_start);
				359
				360	my $content = $span->content;
				361	my $type = $content->at('f[name=const]');
				362	if ($type && ($type = $type->text)) {
				363	# $type is now NPA, NP, NUM
				364	my %term = (
				365	term => '<>:xip_const:' . $type,
				366	o_start => $span->o_start,
				367	o_end => $span->o_end,
				368	p_end => $span->p_end
				369	);
				370
				371	$term{payload} = '<s>' . $level if $level;
				372
				373	$mtt->add(%term);
				374
				375	my $this = __SUB__;
				376
				377	$content->find('rel[label=dominates][target]')->each(
				378	sub {
				379	my $subspan = delete $xip_const{$_[0]->attr('target')} or return;
				380	$this->($subspan, $level + 1);
				381	}
				382	);
				383	};
				384	};
				385
				386	my $diff = $xip_const_root->difference($xip_const_noroot);
				387	foreach ($diff->members) {
				388	my $obj = delete $xip_const{$_} or next;
				389	$add_const->($obj, 0);
				390	};
				391
				392	# Todo: Add mate-morpho
				393	# Todo: Add mate-dependency
				394	# Todo: Add xip-dependency
				395
				396	print $tokens->stream->to_string;
				397	};
				398
				399	if ($ARGV[0]) {
				400	parse_doc($ARGV[0]);
				401	};
				402
				403
				404
				405	__END__