| Akron | 5f52153 | 2016-10-21 19:30:23 +0200 | [diff] [blame] | 1 | package Krawfish::Index; |
| 2 | use Krawfish::Index::Dictionary; |
| Akron | 6a74973 | 2017-02-14 14:43:06 +0100 | [diff] [blame] | 3 | use Krawfish::Index::Subtokens; |
| Akron | 71fc14c | 2016-10-31 23:44:43 +0100 | [diff] [blame] | 4 | use Krawfish::Index::PrimaryData; |
| Akron | b01495a | 2016-11-21 22:29:41 +0100 | [diff] [blame] | 5 | use Krawfish::Index::Fields; |
| Akron | 2ea61aa | 2017-06-03 16:30:23 +0200 | [diff] [blame] | 6 | use Krawfish::Index::PostingsLive; |
| Akron | 91b0e47 | 2016-12-05 17:07:50 +0100 | [diff] [blame] | 7 | use Krawfish::Cache; |
| Akron | c001d36 | 2016-12-12 19:07:52 +0100 | [diff] [blame] | 8 | use Krawfish::Log; |
| Akron | 5f52153 | 2016-10-21 19:30:23 +0200 | [diff] [blame] | 9 | use strict; |
| 10 | use warnings; |
| Akron | c3657bf | 2016-10-31 00:15:43 +0100 | [diff] [blame] | 11 | use Scalar::Util qw!blessed!; |
| Akron | 5f52153 | 2016-10-21 19:30:23 +0200 | [diff] [blame] | 12 | use Mojo::JSON qw/encode_json decode_json/; |
| Akron | 18ff592 | 2017-01-13 10:09:45 +0100 | [diff] [blame] | 13 | use Mojo::File; |
| Akron | 5f52153 | 2016-10-21 19:30:23 +0200 | [diff] [blame] | 14 | |
| Akron | df4b405 | 2017-04-21 14:51:28 +0200 | [diff] [blame] | 15 | # TODO: This should be a base class for K::I::Static and K::I::Dynamic |
| 16 | |
| Akron | 93271d8 | 2016-11-24 09:18:41 +0100 | [diff] [blame] | 17 | # TODO: Add LiveDocs-PostingsList, that supports deletion |
| Akron | 2ea61aa | 2017-06-03 16:30:23 +0200 | [diff] [blame] | 18 | # TODO: Live should store the last_doc value |
| 19 | |
| Akron | 93271d8 | 2016-11-24 09:18:41 +0100 | [diff] [blame] | 20 | # |
| Akron | 42d3f6a | 2017-04-02 17:46:48 +0200 | [diff] [blame] | 21 | # TODO: Support multiple tokenized texts for parallel corpora |
| 22 | # |
| Akron | c3657bf | 2016-10-31 00:15:43 +0100 | [diff] [blame] | 23 | # TODO: Create Importer class |
| 24 | # |
| Akron | 5f52153 | 2016-10-21 19:30:23 +0200 | [diff] [blame] | 25 | # TODO: Support Main Index and Auxiliary Indices with merging |
| 26 | # https://www.youtube.com/watch?v=98E1h_u4xGk |
| Akron | c3657bf | 2016-10-31 00:15:43 +0100 | [diff] [blame] | 27 | # |
| Akron | 5f52153 | 2016-10-21 19:30:23 +0200 | [diff] [blame] | 28 | # TODO: Maybe logarithmic merge |
| 29 | # https://www.youtube.com/watch?v=VNjf2dxWH2Y&spfreload=5 |
| 30 | |
| Akron | 2ee89f1 | 2016-12-07 18:33:52 +0100 | [diff] [blame] | 31 | # TODO: Maybe 65.535 documents are enough per segment ... |
| 32 | |
| Akron | 6e13a06 | 2017-01-13 11:55:28 +0100 | [diff] [blame] | 33 | # TODO: Build a forward index |
| Akron | 6a74973 | 2017-02-14 14:43:06 +0100 | [diff] [blame] | 34 | # TODO: With a forward index, the subtokens offsets will no longer |
| Akron | 6e13a06 | 2017-01-13 11:55:28 +0100 | [diff] [blame] | 35 | # point to character positions in the primary text but to |
| Akron | 6a74973 | 2017-02-14 14:43:06 +0100 | [diff] [blame] | 36 | # subtoken positions in the forward index! |
| Akron | 6e13a06 | 2017-01-13 11:55:28 +0100 | [diff] [blame] | 37 | |
| Akron | 6ff7b48 | 2017-02-09 01:29:29 +0100 | [diff] [blame] | 38 | # TODO: |
| Akron | 9315367 | 2017-03-07 20:43:12 +0100 | [diff] [blame] | 39 | # Currently ranking is not collation based. It should be possible |
| 40 | # to define a collation per field and |
| 41 | # use one collation for prefix and suffix sorting. |
| 42 | # It may be beneficial to make a different sorting possible (though it's |
| 43 | # probably acceptable to make it slow) |
| 44 | # Use http://userguide.icu-project.org/collation |
| 45 | |
| 46 | # TODO: |
| Akron | 6ff7b48 | 2017-02-09 01:29:29 +0100 | [diff] [blame] | 47 | # Reranking a field is not necessary, if the field value is already given. |
| 48 | # In that case, look up the dictionary if the value is already given, |
| 49 | # take the example doc of that field value and add the rank of that |
| 50 | # doc for the new doc. |
| 51 | # If the field is not yet given, take the next or previous value in dictionary |
| 52 | # order and use the rank to rerank the field (see K::I::Dictionary). |
| 53 | # BUT: This only works if the field has the same collation as the |
| 54 | # dictionary! |
| 55 | |
| Akron | 4b5257e | 2017-04-05 17:50:22 +0200 | [diff] [blame] | 56 | # TODO: |
| 57 | # field names should have term_ids, so should foundries and layers, but |
| 58 | # probably not field values and annotation values. |
| 59 | # terms may have term_ids and subterms should have subterm_ids |
| 60 | |
| 61 | |
| Akron | 3e648a0 | 2017-02-24 20:19:15 +0100 | [diff] [blame] | 62 | use constant DEBUG => 0; |
| Akron | c001d36 | 2016-12-12 19:07:52 +0100 | [diff] [blame] | 63 | |
| 64 | |
# Construct a new index.
#
# Parameters:
#   $class - the class to bless into
#   $file  - passed unchanged to the constructor of every index
#            component (dictionary, subtokens, primary data, fields,
#            live documents) and kept in the 'file' slot
#
# Returns the blessed index object.
sub new {
  my ($class, $file) = @_;

  my $self = bless { file => $file }, $class;

  print_log('index', 'Instantiate new index') if DEBUG;

  # Slot name and implementing class of each component, in the
  # order in which they are instantiated
  my @components = (
    [dict      => 'Krawfish::Index::Dictionary'],   # Term dictionary
    [subtokens => 'Krawfish::Index::Subtokens'],    # Subtoken offsets
    [primary   => 'Krawfish::Index::PrimaryData'],  # Primary data
    [fields    => 'Krawfish::Index::Fields'],       # Metadata fields
    [live      => 'Krawfish::Index::PostingsLive']  # Live document pointer
  );

  foreach my $component (@components) {
    my ($slot, $impl) = @$component;
    $self->{$slot} = $impl->new($self->{file});
  };

  # Create a list of docid -> uuid mappers
  # This may be problematic as uuids may need to be uint64,
  # this can grow for a segment with 65.000 docs up to ~ 500kb
  # Or ~ 7MB for 1,000,000 documents
  # But this means it's possible to store
  # 18.446.744.073.709.551.615 documents in the index
  $self->{identifier} = [];

  # Collect fields to sort
  $self->{sortable} = {};

  # Collect values to sum
  $self->{summable} = {};

  # Add cache
  $self->{cache} = Krawfish::Cache->new;

  return $self;
};
| 118 | |
| Akron | 6ccf810 | 2016-10-26 12:41:07 +0200 | [diff] [blame] | 119 | |
# Get the last document index, i.e. one less than the next
# document id reported by the live documents list
sub last_doc {
  my ($self) = @_;
  return $self->{live}->next_doc_id() - 1;
};
| 124 | |
| Akron | 4b5257e | 2017-04-05 17:50:22 +0200 | [diff] [blame] | 125 | |
# Get the maximum rank - computed exactly like the last document
# index: one less than the next document id of the live list
sub max_rank {
  my ($self) = @_;
  return $self->{live}->next_doc_id() - 1;
};
| Akron | 6ccf810 | 2016-10-26 12:41:07 +0200 | [diff] [blame] | 130 | |
| Akron | 4b5257e | 2017-04-05 17:50:22 +0200 | [diff] [blame] | 131 | |
# Get the term dictionary object stored in the 'dict' slot
sub dict {
  my ($self) = @_;
  return $self->{dict};
};
| 136 | |
| Akron | 2ee89f1 | 2016-12-07 18:33:52 +0100 | [diff] [blame] | 137 | |
# Get the value stored in the 'info' slot
# NOTE(review): the constructor visible in this file does not
#   populate this slot - confirm where it is set
sub info {
  my ($self) = @_;
  return $self->{info};
};
| Akron | c3657bf | 2016-10-31 00:15:43 +0100 | [diff] [blame] | 142 | |
| Akron | 2ee89f1 | 2016-12-07 18:33:52 +0100 | [diff] [blame] | 143 | |
# Get the subtokens object stored in the 'subtokens' slot
sub subtokens {
  my ($self) = @_;
  return $self->{subtokens};
};
| 148 | |
| 149 | |
# Get the live documents object stored in the 'live' slot
sub live {
  my ($self) = @_;
  return $self->{live};
};
| 154 | |
| 155 | |
# Get the primary data object stored in the 'primary' slot
sub primary {
  my ($self) = @_;
  return $self->{primary};
};
| 160 | |
| Akron | 2ee89f1 | 2016-12-07 18:33:52 +0100 | [diff] [blame] | 161 | |
# Get the fields object stored in the 'fields' slot
sub fields {
  my ($self) = @_;
  return $self->{fields};
};
| 166 | |
| Akron | 71fc14c | 2016-10-31 23:44:43 +0100 | [diff] [blame] | 167 | |
# Get field values for addition (the 'field_values' slot)
# NOTE(review): the constructor visible in this file does not
#   populate this slot - confirm where it is set
sub field_values {
  my ($self) = @_;
  return $self->{field_values};
};
| 172 | |
| 173 | |
# Add document to the index and return the newly assigned doc_id.
#
# Parameters:
#   $doc - either a reference to a decoded structure with a
#          'document' entry, or (if not a reference) the path of a
#          JSON file that is slurped and decoded
#
# Steps:
#   1. Fetch a new doc_id from the live documents list
#   2. Store the primary data (if given)
#   3. Remember the document identifier (if given)
#   4. Store every metadata field for retrieval and append the doc_id
#      to the postings list of the term '+key:value'
#   5. Store all subtokens together with the subterm id of their
#      surface form
#   6. Append postings for all 'koral:token' and 'koral:span'
#      annotations
#
# TODO: Expect a KoralQuery document
# TODO: This should be specific to Krawfish::Index::Dynamic;
# TODO: Support update as an insert_after_delete
sub add {
  my ($self, $doc) = @_;

  # A non-reference argument is taken as the path of a JSON file
  unless (ref $doc) {
    $doc = decode_json(Mojo::File->new($doc)->slurp);
  };

  # Get new doc_id
  my $doc_id = $self->live->incr;

  # Get document
  $doc = $doc->{document};

  # Store primary data
  if ($doc->{primaryData}) {

    # TODO: This may, in the future, contain the forward index instead
    $self->primary->store($doc_id, $doc->{primaryData});

    print_log('index', 'Store primary data "' . $doc->{primaryData} . '"') if DEBUG;
  };

  # Running subtoken position (reset before the annotations are read)
  my $pos = 0;

  # Store identifier for mappings
  # But what is the purpose of the identifier?
  # Isn't it okay to be slow here ... ?
  if ($doc->{id}) {
    $self->{identifier}->[$doc_id] = $doc->{id};
  };

  my $dict = $self->{dict};

  # Add metadata fields
  my $fields = $self->fields;
  foreach my $field (@{$doc->{fields}}) {

    # TODO:
    #   Also store 'id' as a field value

    # Add to document field (retrieval)
    $fields->store($doc_id, $field->{key}, $field->{value});

    # Prepare for summarization
    # if ($field->{type} eq 'type:integer') {
    # };

    # Prepare field for sorting
    if ($field->{sortable}) {

      # Which entries need to be sorted?
      $self->{sortable}->{$field->{key}}++;
    };

    # Prepare field for summing
    # if ($field->{summable}) {
    #
    #   # Which entries need to be summable
    #   $self->{summable}->{$field->{key}}++;
    # };

    # Add to postings lists (search)
    # Field terms are prefixed with '+' in the dictionary
    my $term = $field->{key} . ':' . $field->{value};
    my $post_list = $dict->add_term('+' . $term);
    $post_list->append($doc_id);
  };

  my $subtokens = $self->subtokens;

  # The primary text is necessary for the subtoken index as well as
  # for the forward index
  my $primary = $doc->{primaryData};

  # Store subtokens
  if ($doc->{subtokens}) {

    print_log('index', 'Store subtokens') if DEBUG;

    # Store all subtoken offsets
    foreach my $seg (@{$doc->{subtokens}}) {

      # Get start and end of the subtoken
      my ($start, $end) = @{$seg->{offsets}};

      if (DEBUG) {
        print_log(
          'index',
          'Store subtoken: ' . $doc_id . ':' . $pos . '=' . join('-', $start, $end)
        );
      };

      # Get the term surface from the primary text
      # TODO: Ensure that the offsets are valid!
      my $term = substr($primary, $start, $end - $start);

      # TODO: There may be a prefix necessary for surface forms
      # TODO: This may in fact be not necessary at all -
      #   The subtokens may have their own IDs
      #   And the terms do not need to be stored in the dictionary for retrieval ...

      # Add as a subterm
      my $subterm_id = $dict->add_subterm($term);

      # TODO:
      #   Check somehow, if the term is new. If so, then {
      #     TODO: Store case insensitive term
      #     $dict->add_subterm_casefolded(fold_case($term), $subterm_id);
      #     $dict->add_subterm_without_diacritics(remove_diacritics($term), $subterm_id);
      #   }

      print_log('index', 'Surface form has subterm_id ' . $subterm_id) if DEBUG;

      # Store information to subtoken
      $subtokens->store(
        $doc_id,
        $pos++,
        $start,
        $end,
        $subterm_id,
        $term # Probably not necessary!
      );
    };
  };

  # Get all tokens
  $pos = 0;
  foreach my $item (@{$doc->{annotations}}) {

    # Items without a type are ignored; the default avoids
    # "uninitialized value" warnings in the comparisons below
    my $type = $item->{'@type'} // '';

    # Add token term to term dictionary
    if ($type eq 'koral:token') {

      unless ($item->{wrap}) {
        warn 'No wrap defined in KoralQuery';
        next;
      };

      # Create key string
      my $wrap = $item->{wrap};
      my @keys;

      # Token wraps a koral:termGroup
      if ($wrap->{'@type'} && $wrap->{'@type'} eq 'koral:termGroup') {
        foreach my $operand (@{$wrap->{operands}}) {
          push @keys, _term($operand);
        };
      }

      # Token wraps a single koral:term
      else {
        push @keys, _term($wrap);
      };

      # Append posting to postings list
      my @subtokens = _subtokens($item);

      # No subtokens defined - fall back to the running position,
      # which is only advanced in this case
      unless (scalar @subtokens) {
        push @subtokens, $pos;

        # Store offsets
        if ($item->{offsets}) {
          $subtokens->store($doc_id, $pos, @{$item->{offsets}});
        };
        $pos++;
      };

      # Add token terms
      foreach my $key (@keys) {
        my $post_list = $dict->add_term($key);
        $post_list->append($doc_id, @subtokens);
      };
    }

    # Add span term to dictionary
    elsif ($type eq 'koral:span') {

      # Create key string - span terms are prefixed with '<>'
      # NOTE(review): unlike tokens, a span without 'wrap' is not
      #   skipped - confirm whether that is intended
      my $key = '<>' . _term($item->{wrap});

      my $post_list = $dict->add_term($key);

      # Append posting to posting list
      $post_list->append(
        $doc_id,
        $item->{subtokens}->[0],
        # The end is AFTER the last subtoken
        $item->{subtokens}->[-1] + 1
      );
    };
  };

  return $doc_id;
};
| 372 | |
| Akron | 1e46919 | 2016-10-24 12:59:21 +0200 | [diff] [blame] | 373 | |
# Create the dictionary term string for a term structure:
#   foundry/layer=key, foundry=key or just key
# The layer is only respected if a foundry is given, and a
# missing key results in an empty key part.
#
# TODO: Use from_koral()->term
#       Potentially with a prefix
sub _term {
  my ($term) = @_;

  my $prefix = '';

  # Foundry (and optional layer) precede the key, separated by '='
  if ($term->{foundry}) {
    $prefix = $term->{foundry};
    $prefix .= '/' . $term->{layer} if $term->{layer};
    $prefix .= '=';
  };

  return $prefix . ($term->{key} // '');
};
| 390 | |
| Akron | c3657bf | 2016-10-31 00:15:43 +0100 | [diff] [blame] | 391 | |
# Return subtoken list or nothing.
# The list contains the first subtoken and - if a second, true
# value is given - the position AFTER the second subtoken.
# NOTE(review): the original code carried a bare "Remove!" remark
#   on the first push - intent unclear, confirm with the author
sub _subtokens {
  my ($item) = @_;

  # No subtokens defined - return nothing
  my $range = $item->{subtokens} or return;

  my @posting = ($range->[0]);

  # The end is AFTER the second subtoken
  push @posting, $range->[1] + 1 if $range->[1];

  return @posting;
};
| 412 | |
| Akron | c3657bf | 2016-10-31 00:15:43 +0100 | [diff] [blame] | 413 | |
# Apply (aka search) the index
#
# Parameters:
#   $koral - object providing corpus(), query() and meta(), each of
#            which is prepared for this index via prepare_for()
#   $cb    - optional callback, called once per match with the
#            current match of the query
#
# Returns:
#   - nothing (empty list / undef), if any of the preparations
#     returns a false value
#   - without a callback: the list of all matches
#   - with a callback: nothing (empty list / undef), after the
#     callback was called for every match
sub apply {
  my ($self, $koral, $cb) = @_;

  # Necessary for filtering
  my $corpus = $koral->corpus->prepare_for($self) or return;

  # Add VC to query as a constraint
  my $query = $koral->query->prepare_for($self, $corpus) or return;

  # Get meta information - the prepared object is not used yet,
  # but a failing preparation still aborts the search
  $koral->meta->prepare_for($self) or return;

  # No callback - push to array
  unless ($cb) {
    my @result = ();
    while ($query->next) {
      push @result, $query->current;
    };
    return @result;
  };

  # Push callback
  while ($query->next) {
    $cb->($query->current);
  };

  # Return explicitly - the value of a sub ending in a loop
  # is unspecified
  return;
};
| 445 | |
| 446 | |
| 447 | |
| Akron | 5f52153 | 2016-10-21 19:30:23 +0200 | [diff] [blame] | 448 | 1; |
| Akron | c3657bf | 2016-10-31 00:15:43 +0100 | [diff] [blame] | 449 | |
| 450 | |
| 451 | __END__ |
| 452 | |
| Akron | 349747d | 2016-12-05 11:05:53 +0100 | [diff] [blame] | 453 | |
| 454 | |
| 455 | # Search using meta data |
| 456 | # Can also be used to collect with a callback |
| Akron | 7db79e2 | 2016-12-08 23:02:32 +0100 | [diff] [blame] | 457 | # |
| Akron | 349747d | 2016-12-05 11:05:53 +0100 | [diff] [blame] | 458 | sub search { |
| 459 | my ($self, $koral, $cb) = @_; |
| 460 | |
| 461 | my $query = $koral->query; |
| 462 | my $corpus = $koral->corpus; |
| 463 | my $meta = $koral->meta; |
| 464 | |
| Akron | 27fb743 | 2016-12-11 18:07:32 +0100 | [diff] [blame] | 465 | # Initiate result object |
| Akron | 349747d | 2016-12-05 11:05:53 +0100 | [diff] [blame] | 466 | my $result = $koral->result; |
| 467 | |
| Akron | 27fb743 | 2016-12-11 18:07:32 +0100 | [diff] [blame] | 468 | # Get filtered search object |
| Akron | 349747d | 2016-12-05 11:05:53 +0100 | [diff] [blame] | 469 | my $search = $query->filter_by($corpus)->plan_for($self); |
| 470 | |
| 471 | # Augment with facets |
| Akron | 27fb743 | 2016-12-11 18:07:32 +0100 | [diff] [blame] | 472 | # Will add to result info |
| Akron | 349747d | 2016-12-05 11:05:53 +0100 | [diff] [blame] | 473 | if ($meta->facets) { |
| 474 | $search = $meta->facets($search); |
| 475 | }; |
| 476 | |
| Akron | 27fb743 | 2016-12-11 18:07:32 +0100 | [diff] [blame] | 477 | # Augment with counting |
| 478 | # Will add to result info |
| 479 | if ($meta->count) { |
| 480 | $search = $meta->count($search); |
| 481 | }; |
| 482 | |
| Akron | 349747d | 2016-12-05 11:05:53 +0100 | [diff] [blame] | 483 | # Augment with sorting |
| Akron | 2ee89f1 | 2016-12-07 18:33:52 +0100 | [diff] [blame] | 484 | if ($meta->sorted_by) { |
| 485 | $search = $meta->sorted_by($search); |
| Akron | 349747d | 2016-12-05 11:05:53 +0100 | [diff] [blame] | 486 | }; |
| 487 | |
| Akron | 27fb743 | 2016-12-11 18:07:32 +0100 | [diff] [blame] | 488 | # Augment with limitations |
| 489 | if ($meta->limit) { |
| 490 | $search = $meta->limit($search); |
| Akron | 349747d | 2016-12-05 11:05:53 +0100 | [diff] [blame] | 491 | }; |
| 492 | |
| Akron | 27fb743 | 2016-12-11 18:07:32 +0100 | [diff] [blame] | 493 | # Augment with field collector |
| 494 | # Will modify current match |
| 495 | $search = $meta->fields($search); |
| 496 | |
| 497 | # Augment with id creator |
| 498 | # Will modify current match |
| 499 | $search = $meta->id_create($search); |
| 500 | |
| 501 | # Augment with snippet creator |
| 502 | # Will modify current match |
| 503 | $search = $meta->snippets($search); |
| 504 | |
| 505 | # Iterate over all matches |
| 506 | while ($search->next) { |
| 507 | |
| 508 | # Based on the information, this will populate the match |
| 509 | $result->add_match($search->current_match); |
| Akron | 349747d | 2016-12-05 11:05:53 +0100 | [diff] [blame] | 510 | }; |
| 511 | |
| 512 | return $koral; |
| 513 | }; |
| 514 | |
| 515 | sub get_fields { |
| 516 | my ($self, $doc_id, $fields) = @_; |
| Akron | 4b5257e | 2017-04-05 17:50:22 +0200 | [diff] [blame] | 517 | ... |
| Akron | 349747d | 2016-12-05 11:05:53 +0100 | [diff] [blame] | 518 | }; |
| 519 | |
| 520 | # This returns the posting's start and end position |
| 521 | # when embedded in a span, e.g. <base/s=s> |
| 522 | sub get_context_by_query { |
| 523 | my ($self, $posting, $query) = @_ |
| 524 | }; |
| 525 | |
# Collect the annotations of the given terms inside a posting.
# NOTE: This sub is located after __END__ in this file, so perl
#   does not compile it - it is a draft.
#
# Parameters:
#   $posting - object providing doc_id(), start() and end()
#   $terms   - passed to the dictionary's terms() method to get the
#              list of terms to look up
#
# Returns a hash reference mapping each term, whose term list could
# be positioned at the posting, to an array reference of cloned
# annotations ending at or before the end of the posting.
sub get_annotations {
  my ($self, $posting, $terms) = @_;

  my %anno = ();

  my $dict = $self->dict;
  foreach my $term ($dict->terms($terms)) {
    my $term_list = $dict->get($term);

    # Skip to the correct document and the first position
    next unless $term_list->next($posting->doc_id, $posting->start);

    # Init annotation
    my $anno = ($anno{$term} //= []);

    # Iterate over all annotations
    while ($term_list->current->end <= $posting->end) {

      # Remember the annotations
      push @$anno, $term_list->current->clone;

      # Leave the loop when the list is exhausted - a 'next' would
      # re-evaluate the loop condition on the exhausted list
      $term_list->next or last;
    }

    # Close (and forget) termlist
    $term_list->close;
  };

  return \%anno;
};
| 556 | |
| 557 | |
| Akron | 349747d | 2016-12-05 11:05:53 +0100 | [diff] [blame] | 558 | |
| 559 | |
| Akron | c3657bf | 2016-10-31 00:15:43 +0100 | [diff] [blame] | 560 | |
| 561 | sub items_per_page; |
| 562 | |
| 563 | sub start_page; |
| 564 | |
| 565 | sub apply { |
| 566 | my $self = shift; |
| 567 | my $query = $self->plan; |
| 568 | my $cb = shift; |
| 569 | my @result = (); |
| 570 | |
| 571 | # No callback - push to array |
| 572 | unless ($cb) { |
| 573 | while ($query->next) { |
| 574 | push @result, $query->current; |
| 575 | }; |
| 576 | return @result; |
| 577 | }; |
| 578 | |
| 579 | # Push callback |
| 580 | while ($query->next) { |
| 581 | $cb->($query->current); |
| 582 | }; |
| 583 | }; |