blob: 9d4b6a1498dc4908646a2c0d7d4079c93e05ac57 [file] [log] [blame]
Akron5f521532016-10-21 19:30:23 +02001package Krawfish::Index;
2use Krawfish::Index::Dictionary;
Akron6a749732017-02-14 14:43:06 +01003use Krawfish::Index::Subtokens;
Akron71fc14c2016-10-31 23:44:43 +01004use Krawfish::Index::PrimaryData;
Akronb01495a2016-11-21 22:29:41 +01005use Krawfish::Index::Fields;
Akron2ea61aa2017-06-03 16:30:23 +02006use Krawfish::Index::PostingsLive;
Akron91b0e472016-12-05 17:07:50 +01007use Krawfish::Cache;
Akronc001d362016-12-12 19:07:52 +01008use Krawfish::Log;
Akron5f521532016-10-21 19:30:23 +02009use strict;
10use warnings;
Akronc3657bf2016-10-31 00:15:43 +010011use Scalar::Util qw!blessed!;
Akron5f521532016-10-21 19:30:23 +020012use Mojo::JSON qw/encode_json decode_json/;
Akron18ff5922017-01-13 10:09:45 +010013use Mojo::File;
Akron5f521532016-10-21 19:30:23 +020014
Akrondf4b4052017-04-21 14:51:28 +020015# TODO: This should be a base class for K::I::Static and K::I::Dynamic
16
Akron93271d82016-11-24 09:18:41 +010017# TODO: Add LiveDocs-PostingsList, that supports deletion
Akron2ea61aa2017-06-03 16:30:23 +020018# TODO: Live should store the last_doc value
19
Akron93271d82016-11-24 09:18:41 +010020#
Akron42d3f6a2017-04-02 17:46:48 +020021# TODO: Support multiple tokenized texts for parallel corpora
22#
Akronc3657bf2016-10-31 00:15:43 +010023# TODO: Create Importer class
24#
Akron5f521532016-10-21 19:30:23 +020025# TODO: Support Main Index and Auxiliary Indices with merging
26# https://www.youtube.com/watch?v=98E1h_u4xGk
Akronc3657bf2016-10-31 00:15:43 +010027#
Akron5f521532016-10-21 19:30:23 +020028# TODO: Maybe logarithmic merge
29# https://www.youtube.com/watch?v=VNjf2dxWH2Y&spfreload=5
30
Akron2ee89f12016-12-07 18:33:52 +010031# TODO: Maybe 65.535 documents are enough per segment ...
32
Akron6e13a062017-01-13 11:55:28 +010033# TODO: Build a forward index
Akron6a749732017-02-14 14:43:06 +010034# TODO: With a forward index, the subtokens offsets will no longer
Akron6e13a062017-01-13 11:55:28 +010035# point to character positions in the primary text but to
Akron6a749732017-02-14 14:43:06 +010036# subtoken positions in the forward index!
Akron6e13a062017-01-13 11:55:28 +010037
Akron6ff7b482017-02-09 01:29:29 +010038# TODO:
Akron93153672017-03-07 20:43:12 +010039# Currently ranking is not collation based. It should be possible
40# to define a collation per field and
41# use one collation for prefix and suffix sorting.
42# It may be beneficial to make a different sorting possible (though it's
43# probably acceptable to make it slow)
44# Use http://userguide.icu-project.org/collation
45
46# TODO:
Akron6ff7b482017-02-09 01:29:29 +010047# Reranking a field is not necessary, if the field value is already given.
48# In that case, look up the dictionary if the value is already given,
49# take the example doc of that field value and add the rank of that
50# doc for the new doc.
51# If the field is not yet given, take the next or previous value in dictionary
52# order and use the rank to rerank the field (see K::I::Dictionary).
53# BUT: This only works if the field has the same collation as the
54# dictionary!
55
Akron4b5257e2017-04-05 17:50:22 +020056# TODO:
57# field names should have term_ids, so should foundries and layers, but
58# probably not field values and annotation values.
59# terms may have term_ids and subterms should have subterm_ids
60
61
Akron3e648a02017-02-24 20:19:15 +010062use constant DEBUG => 0;
Akronc001d362016-12-12 19:07:52 +010063
64
# Construct a new index object.
#
# Parameters:
#   $class - the class to bless into
#   $file  - value handed unchanged to every sub-index constructor
#            (presumably the index file or base path - TODO confirm)
#
# Returns the blessed index object.
sub new {
  my ($class, $file) = @_;

  print_log('index', 'Instantiate new index') if DEBUG;

  # The sub-indices are instantiated in the order they are listed,
  # each one receiving the same $file argument.
  return bless {
    file => $file,

    # Term dictionary
    dict => Krawfish::Index::Dictionary->new($file),

    # Subtoken offsets
    subtokens => Krawfish::Index::Subtokens->new($file),

    # Primary data
    primary => Krawfish::Index::PrimaryData->new($file),

    # Metadata fields
    fields => Krawfish::Index::Fields->new($file),

    # Live document pointer
    live => Krawfish::Index::PostingsLive->new($file),

    # List of doc_id -> uuid mappings.
    # This may be problematic as uuids may need to be uint64,
    # this can grow for a segment with 65.000 docs up to ~ 500kb
    # or ~ 7MB for 1,000,000 documents.
    # But this means it's possible to store
    # 18.446.744.073.709.551.615 documents in the index.
    identifier => [],

    # Names of fields that need to be sorted
    sortable => {},

    # Names of fields whose values need to be summed
    summable => {},

    # Cache
    cache => Krawfish::Cache->new,
  }, $class;
}
118
Akron6ccf8102016-10-26 12:41:07 +0200119
# Get the last document index, i.e. the id preceding the next
# doc_id reported by the live documents list.
sub last_doc {
  my ($self) = @_;
  return $self->{live}->next_doc_id - 1;
}
124
Akron4b5257e2017-04-05 17:50:22 +0200125
# Get the maximum rank. This is an alias for the last document index:
# it evaluates the very same expression (next doc_id minus one).
sub max_rank {
  my ($self) = @_;
  return $self->{live}->next_doc_id - 1;
}
Akron6ccf8102016-10-26 12:41:07 +0200130
Akron4b5257e2017-04-05 17:50:22 +0200131
# Get the term dictionary object stored in the 'dict' slot
sub dict {
  my ($self) = @_;
  return $self->{dict};
}
136
Akron2ee89f12016-12-07 18:33:52 +0100137
# Get the content of the 'info' slot.
# NOTE(review): the constructor in this file never fills this slot,
# so this returns undef unless something else sets it - confirm usage.
sub info {
  my ($self) = @_;
  return $self->{info};
}
Akronc3657bf2016-10-31 00:15:43 +0100142
Akron2ee89f12016-12-07 18:33:52 +0100143
# Get the subtokens index object stored in the 'subtokens' slot
sub subtokens {
  my ($self) = @_;
  return $self->{subtokens};
}
148
149
# Get the live documents list stored in the 'live' slot
sub live {
  my ($self) = @_;
  return $self->{live};
}
154
155
# Get the primary data store kept in the 'primary' slot
sub primary {
  my ($self) = @_;
  return $self->{primary};
}
160
Akron2ee89f12016-12-07 18:33:52 +0100161
# Get the metadata fields store kept in the 'fields' slot
sub fields {
  my ($self) = @_;
  return $self->{fields};
}
166
Akron71fc14c2016-10-31 23:44:43 +0100167
# Get field values for addition (the 'field_values' slot).
# NOTE(review): the constructor in this file never fills this slot,
# so this returns undef unless something else sets it - confirm usage.
sub field_values {
  my ($self) = @_;
  return $self->{field_values};
}
172
173
# Add a document to the index.
#
# Parameters:
#   $doc - either a reference to an already decoded structure that
#          carries the document under the 'document' key, or a plain
#          scalar that is treated as the path of a JSON file to slurp
#          and decode.
#
# Returns the doc_id assigned to the document.
#
# The following parts of the document are read: 'primaryData', 'id',
# 'fields' (each with 'key', 'value' and optionally 'sortable'),
# 'subtokens' (each with 'offsets') and 'annotations' (items of type
# 'koral:token' or 'koral:span').
#
# TODO: Expect a KoralQuery document
# TODO: This should be specific to Krawfish::Index::Dynamic;
# TODO: Support update as a insert_after_delete
sub add {
  my $self = shift;
  my $doc  = shift;

  # A non-reference is a file name - load and decode the JSON file
  unless (ref $doc) {
    $doc = decode_json(Mojo::File->new($doc)->slurp);
  }

  # Get new doc_id
  my $doc_id = $self->live->incr;

  # Get document
  $doc = $doc->{document};

  # The primary text is necessary for the subtoken index as well as
  # for the forward index
  my $primary = $doc->{primaryData};

  # Store primary data
  if ($primary) {

    # TODO: This may, in the future, contain the forward index instead
    $self->primary->store($doc_id, $primary);

    print_log('index', 'Store primary data "' . $primary . '"') if DEBUG;
  }

  # Store identifier for mappings
  # But what is the purpose of the identifier?
  # Isn't it okay to be slow here ... ?
  #
  # The check is for a defined, non-empty value instead of plain
  # truthiness, so that an identifier of 0 is not silently dropped.
  my $identifier = $doc->{id};
  if (defined $identifier && length $identifier) {
    $self->{identifier}->[$doc_id] = $identifier;
  }

  my $dict = $self->{dict};

  # Add metadata fields
  my $fields = $self->fields;
  foreach my $field (@{$doc->{fields}}) {

    # TODO:
    # Also store 'id' as a field value

    # Add to document field (retrieval)
    $fields->store($doc_id, $field->{key}, $field->{value});

    # Prepare for summarization
    # if ($field->{type} eq 'type:integer') {
    # };

    # Prepare field for sorting
    if ($field->{sortable}) {

      # Which entries need to be sorted?
      $self->{sortable}->{$field->{key}}++;
    }

    # Prepare field for summing
    # if ($field->{summable}) {
    #
    #   # Which entries need to be summable
    #   $self->{summable}->{$field->{key}}++;
    # };

    # Add to postings lists (search)
    # Field terms are prefixed with '+' in the dictionary
    my $field_term = $field->{key} . ':' . $field->{value};
    my $post_list  = $dict->add_term('+' . $field_term);
    $post_list->append($doc_id);
  }

  my $subtokens = $self->subtokens;

  # Running subtoken position inside the document
  my $pos = 0;

  # Store subtokens
  if ($doc->{subtokens}) {

    print_log('index', 'Store subtokens') if DEBUG;

    # Store all subtoken offsets
    foreach my $seg (@{$doc->{subtokens}}) {

      # Get start and end of the subtoken
      my ($start, $end) = @{$seg->{offsets}};

      if (DEBUG) {
        print_log(
          'index',
          'Store subtoken: ' . $doc_id . ':' . $pos . '=' . join('-', $start, $end)
        );
      }

      # Get the term surface from the primary text
      # TODO: Ensure that the offsets are valid!
      my $term = substr($primary, $start, $end - $start);

      # TODO: There may be a prefix necessary for surface forms
      # TODO: This may in fact be not necessary at all -
      #   The subtokens may have their own IDs
      #   And the terms do not need to be stored in the dictionary for retrieval ...

      # Add as a subterm
      my $subterm_id = $dict->add_subterm($term);

      # TODO:
      # Check somehow, if the term is new. If so, then {
      #   TODO: Store case insensitive term
      #   $dict->add_subterm_casefolded(fold_case($term), $subterm_id);
      #   $dict->add_subterm_without_diacritics(remove_diacritics($term), $subterm_id);
      # }

      print_log('index', 'Surface form has subterm_id ' . $subterm_id) if DEBUG;

      # Store information to subtoken
      $subtokens->store(
        $doc_id,
        $pos++,
        $start,
        $end,
        $subterm_id,
        $term # Probably not necessary!
      );
    }
  }

  # Get all tokens
  # Positions are counted from the start again for the annotations
  $pos = 0;
  foreach my $item (@{$doc->{annotations}}) {

    # An annotation without a type matches neither branch below;
    # defaulting to '' avoids comparing an undefined value.
    my $type = $item->{'@type'} // '';

    # Add token term to term dictionary
    if ($type eq 'koral:token') {

      unless ($item->{wrap}) {
        warn 'No wrap defined in KoralQuery';
        next;
      }

      # Create key strings
      my $wrap = $item->{wrap};
      my @keys;

      # Token wraps a koral:termGroup
      if ($wrap->{'@type'} && $wrap->{'@type'} eq 'koral:termGroup') {
        foreach my $operand (@{$wrap->{operands}}) {
          push @keys, _term($operand);
        }
      }

      # Token wraps a single koral:term
      else {
        push @keys, _term($wrap);
      }

      # Positions of the posting to append to the postings lists
      my @subtokens = _subtokens($item);

      # No subtokens defined - fall back to the running position
      unless (scalar @subtokens) {
        push @subtokens, $pos;

        # Store offsets
        if ($item->{offsets}) {
          $subtokens->store($doc_id, $pos, @{$item->{offsets}});
        }
        $pos++;
      }

      # Add token terms
      foreach my $key (@keys) {
        my $post_list = $dict->add_term($key);
        $post_list->append($doc_id, @subtokens);
      }
    }

    # Add span term to dictionary
    elsif ($type eq 'koral:span') {

      # A span needs at least one subtoken to have a start and an end.
      # Skip it otherwise instead of appending an undefined position.
      my $span = $item->{subtokens};
      unless (ref $span eq 'ARRAY' && @$span) {
        warn 'No subtokens defined for span in KoralQuery';
        next;
      }

      # Create key string - span terms are prefixed with '<>'
      my $key = '<>' . _term($item->{wrap});

      my $post_list = $dict->add_term($key);

      # Append posting to posting list
      $post_list->append(
        $doc_id,
        $span->[0],
        # The end is AFTER the last listed subtoken
        $span->[-1] + 1
      );
    }
  }

  return $doc_id;
}
372
Akron1e469192016-10-24 12:59:21 +0200373
# Build the dictionary term string for a term structure.
#
# The result is "foundry/layer=key", "foundry=key" or just "key",
# depending on which of 'foundry' and 'layer' are set. A layer without
# a foundry is ignored, a missing key is treated as the empty string.
#
# TODO: Use from_koral()->term
#       Potentially with a prefix
sub _term {
  my ($term) = @_;

  my $surface = $term->{key} // '';
  my $foundry = $term->{foundry};

  # Without a foundry there is no prefix at all
  return $surface unless $foundry;

  my $prefix = $foundry;
  $prefix .= '/' . $term->{layer} if $term->{layer};

  return $prefix . '=' . $surface;
}
390
Akronc3657bf2016-10-31 00:15:43 +0100391
# Return the subtoken positions of an item as a posting list,
# or nothing if the item has no 'subtokens' entry.
#
# The list consists of the first subtoken position and - if a second
# subtoken is given - the position AFTER that second subtoken.
#
# The second subtoken is checked for definedness instead of truthiness,
# so that a second position of 0 (as in [0, 0]) is not dropped.
#
# NOTE: The original source marks this code with "Remove!".
sub _subtokens {
  my $item = shift;

  my $subtokens = $item->{subtokens};
  return unless $subtokens;

  my @posting = ($subtokens->[0]);

  # The end is AFTER the second subtoken
  if (defined $subtokens->[1]) {
    push @posting, $subtokens->[1] + 1;
  }

  return @posting;
}
412
Akronc3657bf2016-10-31 00:15:43 +0100413
# Apply (aka search) the index.
#
# Parameters:
#   $koral - object providing corpus(), query() and meta(), each of
#            which returns something that responds to prepare_for()
#   $cb    - optional code reference that is invoked with every match
#
# Returns the list of all matches if no callback is given. With a
# callback every match is passed to the callback and an empty list
# (undef in scalar context) is returned. Returns early with nothing
# if any of the prepare_for() calls yields a false value.
sub apply {
  my ($self, $koral, $cb) = @_;

  # Necessary for filtering
  my $corpus = $koral->corpus->prepare_for($self) or return;

  # Add VC to query as a constraint
  my $query = $koral->query->prepare_for($self, $corpus) or return;

  # Get meta information
  # The result is not used further below, but a failing preparation
  # still aborts the search.
  my $meta = $koral->meta->prepare_for($self) or return;

  # No callback - collect all matches
  unless ($cb) {
    my @result;
    while ($query->next) {
      push @result, $query->current;
    }
    return @result;
  }

  # Pass every match to the callback
  while ($query->next) {
    $cb->($query->current);
  }

  # Return explicitly - the value of a sub that ends in a loop
  # is unspecified otherwise.
  return;
}
445
446
447
Akron5f521532016-10-21 19:30:23 +02004481;
Akronc3657bf2016-10-31 00:15:43 +0100449
450
451__END__
452
Akron349747d2016-12-05 11:05:53 +0100453
454
455# Search using meta data
456# Can also be used to collect with a callback
Akron7db79e22016-12-08 23:02:32 +0100457#
Akron349747d2016-12-05 11:05:53 +0100458sub search {
459 my ($self, $koral, $cb) = @_;
460
461 my $query = $koral->query;
462 my $corpus = $koral->corpus;
463 my $meta = $koral->meta;
464
Akron27fb7432016-12-11 18:07:32 +0100465 # Initiate result object
Akron349747d2016-12-05 11:05:53 +0100466 my $result = $koral->result;
467
Akron27fb7432016-12-11 18:07:32 +0100468 # Get filtered search object
Akron349747d2016-12-05 11:05:53 +0100469 my $search = $query->filter_by($corpus)->plan_for($self);
470
471 # Augment with facets
Akron27fb7432016-12-11 18:07:32 +0100472 # Will add to result info
Akron349747d2016-12-05 11:05:53 +0100473 if ($meta->facets) {
474 $search = $meta->facets($search);
475 };
476
Akron27fb7432016-12-11 18:07:32 +0100477 # Augment with counting
478 # Will add to result info
479 if ($meta->count) {
480 $search = $meta->count($search);
481 };
482
Akron349747d2016-12-05 11:05:53 +0100483 # Augment with sorting
Akron2ee89f12016-12-07 18:33:52 +0100484 if ($meta->sorted_by) {
485 $search = $meta->sorted_by($search);
Akron349747d2016-12-05 11:05:53 +0100486 };
487
Akron27fb7432016-12-11 18:07:32 +0100488 # Augment with limitations
489 if ($meta->limit) {
490 $search = $meta->limit($search);
Akron349747d2016-12-05 11:05:53 +0100491 };
492
Akron27fb7432016-12-11 18:07:32 +0100493 # Augment with field collector
494 # Will modify current match
495 $search = $meta->fields($search);
496
497 # Augment with id creator
498 # Will modify current match
499 $search = $meta->id_create($search);
500
501 # Augment with snippet creator
502 # Will modify current match
503 $search = $meta->snippets($search);
504
505 # Iterate over all matches
506 while ($search->next) {
507
508 # Based on the information, this will populate the match
509 $result->add_match($search->current_match);
Akron349747d2016-12-05 11:05:53 +0100510 };
511
512 return $koral;
513};
514
515sub get_fields {
516 my ($self, $doc_id, $fields) = @_;
Akron4b5257e2017-04-05 17:50:22 +0200517 ...
Akron349747d2016-12-05 11:05:53 +0100518};
519
520# This returns the posting's start and end position
521# when embedded in a span, e.g. <base/s=s>
522sub get_context_by_query {
523 my ($self, $posting, $query) = @_
524};
525
526sub get_annotations {
527 my ($self, $posting, $terms) = @_;
528
529 my %anno = ();
530
531 my $dict = $self->dict;
532 foreach my $term ($dict->terms($terms)) {
533 my $term_list = $dict->get($term);
534
535 # Skip to the correct document and the first position
536 next unless $term_list->next($posting->doc_id, $posting->start);
537
538 # Init annotation
539 my $anno = ($anno{$term} //= []);
540
541 # Iterate over all annotations
542 while ($term_list->current->end <= $posting->end) {
543
544 # Remember the annotations
545 push @$anno, $term_list->current->clone;
546
547 $term_list->next or next;
548 }
549
550 # Close (and forget) termlist
551 $term_list->close;
552 };
553
554 return \%anno;
Akronc3657bf2016-10-31 00:15:43 +0100555};
556
557
Akron349747d2016-12-05 11:05:53 +0100558
559
Akronc3657bf2016-10-31 00:15:43 +0100560
561sub items_per_page;
562
563sub start_page;
564
565sub apply {
566 my $self = shift;
567 my $query = $self->plan;
568 my $cb = shift;
569 my @result = ();
570
571 # No callback - push to array
572 unless ($cb) {
573 while ($query->next) {
574 push @result, $query->current;
575 };
576 return @result;
577 };
578
579 # Push callback
580 while ($query->next) {
581 $cb->($query->current);
582 };
583};