Blame - lib/Krawfish/Index.pm - KorAP/Krawfish-prototype

blob: 9d4b6a1498dc4908646a2c0d7d4079c93e05ac57 [file] [log] [blame]

Akron	5f52153	2016-10-21 19:30:23 +0200	[diff] [blame]	1	package Krawfish::Index;
				2	use Krawfish::Index::Dictionary;
Akron	6a74973	2017-02-14 14:43:06 +0100	[diff] [blame]	3	use Krawfish::Index::Subtokens;
Akron	71fc14c	2016-10-31 23:44:43 +0100	[diff] [blame]	4	use Krawfish::Index::PrimaryData;
Akron	b01495a	2016-11-21 22:29:41 +0100	[diff] [blame]	5	use Krawfish::Index::Fields;
Akron	2ea61aa	2017-06-03 16:30:23 +0200	[diff] [blame]	6	use Krawfish::Index::PostingsLive;
Akron	91b0e47	2016-12-05 17:07:50 +0100	[diff] [blame]	7	use Krawfish::Cache;
Akron	c001d36	2016-12-12 19:07:52 +0100	[diff] [blame]	8	use Krawfish::Log;
Akron	5f52153	2016-10-21 19:30:23 +0200	[diff] [blame]	9	use strict;
				10	use warnings;
Akron	c3657bf	2016-10-31 00:15:43 +0100	[diff] [blame]	11	use Scalar::Util qw!blessed!;
Akron	5f52153	2016-10-21 19:30:23 +0200	[diff] [blame]	12	use Mojo::JSON qw/encode_json decode_json/;
Akron	18ff592	2017-01-13 10:09:45 +0100	[diff] [blame]	13	use Mojo::File;
Akron	5f52153	2016-10-21 19:30:23 +0200	[diff] [blame]	14
Akron	df4b405	2017-04-21 14:51:28 +0200	[diff] [blame]	15	# TODO: This should be a base class for K::I::Static and K::I::Dynamic
				16
Akron	93271d8	2016-11-24 09:18:41 +0100	[diff] [blame]	17	# TODO: Add LiveDocs-PostingsList, that supports deletion
Akron	2ea61aa	2017-06-03 16:30:23 +0200	[diff] [blame]	18	# TODO: Live should store the last_doc value
				19
Akron	93271d8	2016-11-24 09:18:41 +0100	[diff] [blame]	20	#
Akron	42d3f6a	2017-04-02 17:46:48 +0200	[diff] [blame]	21	# TODO: Support multiple tokenized texts for parallel corpora
				22	#
Akron	c3657bf	2016-10-31 00:15:43 +0100	[diff] [blame]	23	# TODO: Create Importer class
				24	#
Akron	5f52153	2016-10-21 19:30:23 +0200	[diff] [blame]	25	# TODO: Support Main Index and Auxiliary Indices with merging
				26	# https://www.youtube.com/watch?v=98E1h_u4xGk
Akron	c3657bf	2016-10-31 00:15:43 +0100	[diff] [blame]	27	#
Akron	5f52153	2016-10-21 19:30:23 +0200	[diff] [blame]	28	# TODO: Maybe logarithmic merge
				29	# https://www.youtube.com/watch?v=VNjf2dxWH2Y&spfreload=5
				30
Akron	2ee89f1	2016-12-07 18:33:52 +0100	[diff] [blame]	31	# TODO: Maybe 65.535 documents are enough per segment ...
				32
Akron	6e13a06	2017-01-13 11:55:28 +0100	[diff] [blame]	33	# TODO: Build a forward index
Akron	6a74973	2017-02-14 14:43:06 +0100	[diff] [blame]	34	# TODO: With a forward index, the subtokens offsets will no longer
Akron	6e13a06	2017-01-13 11:55:28 +0100	[diff] [blame]	35	# point to character positions in the primary text but to
Akron	6a74973	2017-02-14 14:43:06 +0100	[diff] [blame]	36	# subtoken positions in the forward index!
Akron	6e13a06	2017-01-13 11:55:28 +0100	[diff] [blame]	37
Akron	6ff7b48	2017-02-09 01:29:29 +0100	[diff] [blame]	38	# TODO:
Akron	9315367	2017-03-07 20:43:12 +0100	[diff] [blame]	39	# Currently ranking is not collation based. It should be possible
				40	# to define a collation per field and
				41	# use one collation for prefix and suffix sorting.
				42	# It may be beneficial to make a different sorting possible (though it's
				43	# probably acceptable to make it slow)
				44	# Use http://userguide.icu-project.org/collation
				45
				46	# TODO:
Akron	6ff7b48	2017-02-09 01:29:29 +0100	[diff] [blame]	47	# Reranking a field is not necessary, if the field value is already given.
				48	# In that case, look up the dictionary if the value is already given,
				49	# take the example doc of that field value and add the rank of that
				50	# doc for the new doc.
				51	# If the field is not yet given, take the next or previous value in dictionary
				52	# order and use the rank to rerank the field (see K::I::Dictionary).
				53	# BUT: This only works if the field has the same collation as the
				54	# dictionary!
				55
Akron	4b5257e	2017-04-05 17:50:22 +0200	[diff] [blame]	56	# TODO:
				57	# field names should have term_ids, so should foundries and layers, but
				58	# probably not field values and annotation values.
				59	# terms may have term_ids and subterms should have subterm_ids
				60
				61
Akron	3e648a0	2017-02-24 20:19:15 +0100	[diff] [blame]	62	use constant DEBUG => 0;
Akron	c001d36	2016-12-12 19:07:52 +0100	[diff] [blame]	63
				64
# Construct a new index object.
#
# Expects the index file as its only argument. The same file is handed
# to every component (dictionary, subtokens, primary data, fields and
# live documents) on construction.
sub new {
  my ($class, $file) = @_;

  my $self = bless { file => $file }, $class;

  print_log('index', 'Instantiate new index') if DEBUG;

  # The components are created in this order, each with the index file:
  # term dictionary, subtoken offsets, primary data,
  # metadata fields and the live document pointer
  my @components = (
    [dict      => 'Krawfish::Index::Dictionary'],
    [subtokens => 'Krawfish::Index::Subtokens'],
    [primary   => 'Krawfish::Index::PrimaryData'],
    [fields    => 'Krawfish::Index::Fields'],
    [live      => 'Krawfish::Index::PostingsLive']
  );

  foreach my $component (@components) {
    my ($slot, $module) = @$component;
    $self->{$slot} = $module->new($self->{file});
  };

  # List of docid -> uuid mappers.
  # This may be problematic as uuids may need to be uint64,
  # this can grow for a segment with 65.000 docs up to ~ 500kb
  # Or ~ 7MB for 1,000,000 documents
  # But this means it's possible to store
  # 18.446.744.073.709.551.615 documents in the index
  $self->{identifier} = [];

  # Fields that need to be sorted
  $self->{sortable} = {};

  # Values that need to be summed
  $self->{summable} = {};

  # Cache
  $self->{cache} = Krawfish::Cache->new;

  return $self;
};
				118
Akron	6ccf810	2016-10-26 12:41:07 +0200	[diff] [blame]	119
# Get the last document index, which is one less than
# the next document id reported by the live documents list
sub last_doc {
  my $self = shift;
  return $self->{live}->next_doc_id - 1;
};
				124
Akron	4b5257e	2017-04-05 17:50:22 +0200	[diff] [blame]	125
# Get the maximum rank. This yields the same value as the last
# document index: the next document id of the live list minus one
sub max_rank {
  my $self = shift;
  return $self->{live}->next_doc_id - 1;
};
Akron	6ccf810	2016-10-26 12:41:07 +0200	[diff] [blame]	130
Akron	4b5257e	2017-04-05 17:50:22 +0200	[diff] [blame]	131
# Accessor for the term dictionary
sub dict {
  my $self = shift;
  return $self->{dict};
};
				136
Akron	2ee89f1	2016-12-07 18:33:52 +0100	[diff] [blame]	137
				138	# Get info
Akron	ddf077a	2016-11-05 15:00:00 +0100	[diff] [blame]	139	sub info {
				140	$_[0]->{info};
				141	};
Akron	c3657bf	2016-10-31 00:15:43 +0100	[diff] [blame]	142
Akron	2ee89f1	2016-12-07 18:33:52 +0100	[diff] [blame]	143
# Accessor for the subtokens list
sub subtokens {
  my $self = shift;
  return $self->{subtokens};
};
				148
				149
# Accessor for the live documents list
sub live {
  my $self = shift;
  return $self->{live};
};
				154
				155
# Accessor for the primary data store
sub primary {
  my $self = shift;
  return $self->{primary};
};
				160
Akron	2ee89f1	2016-12-07 18:33:52 +0100	[diff] [blame]	161
# Accessor for the metadata fields store
sub fields {
  my $self = shift;
  return $self->{fields};
};
				166
Akron	71fc14c	2016-10-31 23:44:43 +0100	[diff] [blame]	167
# Accessor for the field values used for addition
sub field_values {
  my $self = shift;
  return $self->{field_values};
};
				172
				173
# Add a document to the index and return the new document id.
#
# The document is either passed as an already decoded structure
# (any reference) or as the path to a JSON file, which is read and
# decoded. The data to index is taken from the 'document' key and
# may contain 'primaryData', 'id', 'fields', 'subtokens' and
# 'annotations'.
#
# Annotations of type 'koral:token' without a wrap and annotations of
# type 'koral:span' without subtokens are skipped with a warning.
#
# TODO: Expect a KoralQuery document
# TODO: This should be specific to Krawfish::Index::Dynamic;
# TODO: Support update as an insert_after_delete
sub add {
  my $self = shift;
  my $doc = shift;

  # A plain scalar is treated as the path to a JSON file
  unless (ref $doc) {
    $doc = decode_json(Mojo::File->new($doc)->slurp);
  };

  # Get new doc_id
  my $doc_id = $self->live->incr;

  # Get document
  $doc = $doc->{document};

  # Store primary data
  if ($doc->{primaryData}) {

    # TODO: This may, in the future, contain the forward index instead
    $self->primary->store($doc_id, $doc->{primaryData});

    print_log('index', 'Store primary data "' . $doc->{primaryData} . '"') if DEBUG;
  };

  # Store identifier for mappings
  # But what is the purpose of the identifier?
  # Isn't it okay to be slow here ... ?
  if ($doc->{id}) {
    $self->{identifier}->[$doc_id] = $doc->{id};
  };

  my $dict = $self->{dict};

  # Add metadata fields
  my $fields = $self->fields;
  foreach my $field (@{$doc->{fields}}) {

    # TODO:
    #   Also store 'id' as a field value

    # Add to document field (retrieval)
    $fields->store($doc_id, $field->{key}, $field->{value});

    # Prepare for summarization
    # if ($field->{type} eq 'type:integer') {
    # };

    # Prepare field for sorting
    if ($field->{sortable}) {

      # Which entries need to be sorted?
      $self->{sortable}->{$field->{key}}++;
    };

    # Prepare field for summing
    # if ($field->{summable}) {
    #
    #   # Which entries need to be summable
    #   $self->{summable}->{$field->{key}}++;
    # };

    # Add to postings lists (search)
    my $term = $field->{key} . ':' . $field->{value};
    my $post_list = $dict->add_term('+' . $term);
    $post_list->append($doc_id);
  };

  my $subtokens = $self->subtokens;

  # The primary text is necessary for the subtoken index as well as
  # for the forward index
  my $primary = $doc->{primaryData};

  # Running subtoken position
  my $pos = 0;

  # Store subtokens
  if ($doc->{subtokens}) {

    print_log('index', 'Store subtokens') if DEBUG;

    # Store all subtoken offsets
    foreach my $seg (@{$doc->{subtokens}}) {

      # Get start and end of the subtoken
      my ($start, $end) = @{$seg->{offsets}};

      if (DEBUG) {
        print_log(
          'index',
          'Store subtoken: ' . $doc_id . ':' . $pos . '=' . join('-', $start, $end)
        );
      };

      # Get the term surface from the primary text
      # TODO: Ensure that the offsets are valid!
      my $term = substr($primary, $start, $end - $start);

      # TODO: There may be a prefix necessary for surface forms
      # TODO: This may in fact be not necessary at all -
      #   The subtokens may have their own IDs
      #   And the terms do not need to be stored in the dictionary for retrieval ...

      # Add as a subterm
      my $subterm_id = $dict->add_subterm($term);

      # TODO:
      #   Check somehow, if the term is new. If so, then {
      #     TODO: Store case insensitive term
      #     $dict->add_subterm_casefolded(fold_case($term), $subterm_id);
      #     $dict->add_subterm_without_diacritics(remove_diacritics($term), $subterm_id);
      #   }

      print_log('index', 'Surface form has subterm_id ' . $subterm_id) if DEBUG;

      # Store information to subtoken
      $subtokens->store(
        $doc_id,
        $pos++,
        $start,
        $end,
        $subterm_id,
        $term # Probably not necessary!
      );
    };
  };

  # Get all tokens - positions start over for annotations
  $pos = 0;
  foreach my $item (@{$doc->{annotations}}) {

    # A missing type matches neither branch
    # (and does not raise a warning on comparison)
    my $type = $item->{'@type'} // '';

    # Add token term to term dictionary
    if ($type eq 'koral:token') {

      unless ($item->{wrap}) {
        warn 'No wrap defined in KoralQuery';
        next;
      };

      # Create key strings
      my $wrap = $item->{wrap};
      my @keys;

      # Token wraps a koral:termGroup
      if ($wrap->{'@type'} && $wrap->{'@type'} eq 'koral:termGroup') {
        foreach (@{$wrap->{operands}}) {
          push @keys, _term($_);
        };
      }

      # Token wraps a single koral:term
      else {
        push @keys, _term($wrap);
      };

      # Get the subtoken positions of the posting
      my @subtokens = _subtokens($item);

      # No subtokens defined - use the running position
      unless (scalar @subtokens) {
        push @subtokens, $pos;

        # Store offsets
        if ($item->{offsets}) {
          $subtokens->store($doc_id, $pos, @{$item->{offsets}});
        };
        $pos++;
      };

      # Add token terms
      foreach (@keys) {
        my $post_list = $dict->add_term($_);
        $post_list->append($doc_id, @subtokens);
      };
    }

    # Add span term to dictionary
    elsif ($type eq 'koral:span') {

      # A span can't be positioned without subtokens
      my $span = $item->{subtokens};
      unless ($span && @$span) {
        warn 'No subtokens defined for span in KoralQuery';
        next;
      };

      # Create key string
      my $key = '<>' . _term($item->{wrap});

      my $post_list = $dict->add_term($key);

      # Append posting to posting list
      $post_list->append(
        $doc_id,
        $span->[0],
        # The end is AFTER the last subtoken
        $span->[-1] + 1
      );
    };
  };

  return $doc_id;
};
				372
Akron	1e46919	2016-10-24 12:59:21 +0200	[diff] [blame]	373
# Create the term string for the term dictionary from a term object.
#
# The key is prefixed by 'foundry=' or 'foundry/layer=' in case
# a foundry is given. A layer without a foundry is ignored.
# A missing key results in an empty key part.
#
# TODO: Use from_koral()->term
#       Potentially with a prefix
sub _term {
  my ($item) = @_;

  my $prefix = '';

  if ($item->{foundry}) {
    $prefix = $item->{foundry};
    $prefix .= '/' . $item->{layer} if $item->{layer};
    $prefix .= '=';
  };

  return $prefix . ($item->{key} // '');
}
				390
Akron	c3657bf	2016-10-31 00:15:43 +0100	[diff] [blame]	391
# Return the subtoken positions of an annotation as a list,
# or nothing, if the annotation has no subtokens.
#
# The list contains the first subtoken position and, in case a second
# subtoken is given, the position AFTER that second subtoken.
# (The original code marked the first element with "Remove!".)
sub _subtokens {
  my $item = shift;

  my $subtokens = $item->{subtokens};

  # No positions given - an empty list counts as no subtokens as well
  return unless $subtokens && @$subtokens;

  my @posting = ($subtokens->[0]);

  # Check for definedness instead of truth,
  # as position 0 is a valid position
  if (defined $subtokens->[1]) {

    # The end is AFTER the second subtoken
    push @posting, $subtokens->[1] + 1;
  };

  return @posting;
};
				412
Akron	c3657bf	2016-10-31 00:15:43 +0100	[diff] [blame]	413
# Apply (aka search) the index.
#
# Prepares corpus, query and meta of the passed koral object for this
# index and stops without a result as soon as one preparation fails.
# Without a callback, all current elements of the query are returned
# as a list. With a callback, the callback is invoked for each current
# element instead.
sub apply {
  my ($self, $koral, $cb) = @_;

  # Necessary for filtering
  my $corpus = $koral->corpus->prepare_for($self);
  return unless $corpus;

  # Add VC to query as a constraint
  my $query = $koral->query->prepare_for($self, $corpus);
  return unless $query;

  # Meta information needs to be preparable as well
  $koral->meta->prepare_for($self) or return;

  # No callback - collect in an array
  if (!$cb) {
    my @matches;
    while ($query->next) {
      push @matches, $query->current;
    };
    return @matches;
  };

  # Pass everything to the callback
  while ($query->next) {
    $cb->($query->current);
  };
};
				445
				446
				447
Akron	5f52153	2016-10-21 19:30:23 +0200	[diff] [blame]	448	1;
Akron	c3657bf	2016-10-31 00:15:43 +0100	[diff] [blame]	449
				450
				451	__END__
				452
Akron	349747d	2016-12-05 11:05:53 +0100	[diff] [blame]	453
				454
				455	# Search using meta data
				456	# Can also be used to collect with a callback
Akron	7db79e2	2016-12-08 23:02:32 +0100	[diff] [blame]	457	#
Akron	349747d	2016-12-05 11:05:53 +0100	[diff] [blame]	458	sub search {
				459	my ($self, $koral, $cb) = @_;
				460
				461	my $query = $koral->query;
				462	my $corpus = $koral->corpus;
				463	my $meta = $koral->meta;
				464
Akron	27fb743	2016-12-11 18:07:32 +0100	[diff] [blame]	465	# Initiate result object
Akron	349747d	2016-12-05 11:05:53 +0100	[diff] [blame]	466	my $result = $koral->result;
				467
Akron	27fb743	2016-12-11 18:07:32 +0100	[diff] [blame]	468	# Get filtered search object
Akron	349747d	2016-12-05 11:05:53 +0100	[diff] [blame]	469	my $search = $query->filter_by($corpus)->plan_for($self);
				470
				471	# Augment with facets
Akron	27fb743	2016-12-11 18:07:32 +0100	[diff] [blame]	472	# Will add to result info
Akron	349747d	2016-12-05 11:05:53 +0100	[diff] [blame]	473	if ($meta->facets) {
				474	$search = $meta->facets($search);
				475	};
				476
Akron	27fb743	2016-12-11 18:07:32 +0100	[diff] [blame]	477	# Augment with counting
				478	# Will add to result info
				479	if ($meta->count) {
				480	$search = $meta->count($search);
				481	};
				482
Akron	349747d	2016-12-05 11:05:53 +0100	[diff] [blame]	483	# Augment with sorting
Akron	2ee89f1	2016-12-07 18:33:52 +0100	[diff] [blame]	484	if ($meta->sorted_by) {
				485	$search = $meta->sorted_by($search);
Akron	349747d	2016-12-05 11:05:53 +0100	[diff] [blame]	486	};
				487
Akron	27fb743	2016-12-11 18:07:32 +0100	[diff] [blame]	488	# Augment with limitations
				489	if ($meta->limit) {
				490	$search = $meta->limit($search);
Akron	349747d	2016-12-05 11:05:53 +0100	[diff] [blame]	491	};
				492
Akron	27fb743	2016-12-11 18:07:32 +0100	[diff] [blame]	493	# Augment with field collector
				494	# Will modify current match
				495	$search = $meta->fields($search);
				496
				497	# Augment with id creator
				498	# Will modify current match
				499	$search = $meta->id_create($search);
				500
				501	# Augment with snmippet creator
				502	# Will modify current match
				503	$search = $meta->snippets($search);
				504
				505	# Iterate over all matches
				506	while ($search->next) {
				507
				508	# Based on the information, this will populate the match
				509	$result->add_match($search->current_match);
Akron	349747d	2016-12-05 11:05:53 +0100	[diff] [blame]	510	};
				511
				512	return $koral;
				513	};
				514
				515	sub get_fields {
				516	my ($self, $doc_id, $fields) = @_;
Akron	4b5257e	2017-04-05 17:50:22 +0200	[diff] [blame]	517	...
Akron	349747d	2016-12-05 11:05:53 +0100	[diff] [blame]	518	};
				519
				520	# This returns the posting's start and end position
				521	# when embedded in a span, e.g. <base/s=s>
				522	sub get_context_by_query {
				523	my ($self, $posting, $query) = @_
				524	};
				525
				526	sub get_annotations {
				527	my ($self, $posting, $terms) = @_;
				528
				529	my %anno = ();
				530
				531	my $dict = $self->dict;
				532	foreach my $term ($dict->terms($terms)) {
				533	my $term_list = $dict->get($term);
				534
				535	# Skip to the correct document and the first position
				536	next unless $term_list->next($posting->doc_id, $posting->start);
				537
				538	# Init annotation
				539	my $anno = ($anno{$term} //= []);
				540
				541	# Iterate over all annotations
				542	while ($term_list->current->end <= $posting->end) {
				543
				544	# Remember the annotations
				545	push @$anno, $term_list->current->clone;
				546
				547	$term_list->next or next;
				548	}
				549
				550	# Close (and forget) termlist
				551	$term_list->close;
				552	};
				553
				554	return \%anno;
Akron	c3657bf	2016-10-31 00:15:43 +0100	[diff] [blame]	555	};
				556
				557
Akron	349747d	2016-12-05 11:05:53 +0100	[diff] [blame]	558
				559
Akron	c3657bf	2016-10-31 00:15:43 +0100	[diff] [blame]	560
				561	sub items_per_page;
				562
				563	sub start_page;
				564
				565	sub apply {
				566	my $self = shift;
				567	my $query = $self->plan;
				568	my $cb = shift;
				569	my @result = ();
				570
				571	# No callback - push to array
				572	unless ($cb) {
				573	while ($query->next) {
				574	push @result, $query->current;
				575	};
				576	return @result;
				577	};
				578
				579	# Push callback
				580	while ($query->next) {
				581	$cb->($query->current);
				582	};
				583	};