blob: bfc465aee7163c88a6004b50f068b12adfe0b8dd [file] [log] [blame]
Akronf0d514a2016-11-01 14:16:25 +01001package Krawfish::Koral::Document;
Akron4a46e6e2017-08-16 17:49:16 +02002use Krawfish::Koral::Document::Stream;
Akron18829342017-11-03 18:49:22 +01003use Krawfish::Koral::Document::Subtoken;
Akron4a46e6e2017-08-16 17:49:16 +02004use Krawfish::Koral::Document::Fields;
Akron14ff0c42017-08-09 20:49:52 +02005use Krawfish::Koral::Query::Term;
Akrondd10fb92017-08-08 20:19:46 +02006use Krawfish::Log;
7use Mojo::File;
8use Mojo::JSON qw/encode_json decode_json/;
Akron927789d2016-11-01 02:33:04 +01009use strict;
10use warnings;
Akrondd10fb92017-08-08 20:19:46 +020011use List::MoreUtils qw/uniq/;
Akron927789d2016-11-01 02:33:04 +010012
Akrondd10fb92017-08-08 20:19:46 +020013# Parses a document and creates a simple forward index list.
14#
15# primary='...',
16# fields=[+field => title],
17# terms=[*term => [postings*]]
18#
19# Then, when the document is added to certain nodes,
20# the keys will be translated to term_ids and the document
21# can be added with all freq_in_doc information
22
# foundry and layer may need separate term_ids
# so they are exceptionally small.
Akrondd10fb92017-08-08 20:19:46 +020025
Akron14ff0c42017-08-09 20:49:52 +020026# TODO:
27# Don't forget to deal with TUIs!
28
Akron94256e62017-10-10 17:29:18 +020029use constant DEBUG => 0;
Akrondd10fb92017-08-08 20:19:46 +020030
# Construct a new document object and parse the passed document.
#
# The document is either an already decoded data structure (any reference)
# or the name of a file, whose content is slurped and decoded as JSON.
# The parsed data ends up in the stream and fields objects.
sub new {
  my ($class, $doc) = @_;

  my %object = (
    # sortable => {},
    stream => Krawfish::Koral::Document::Stream->new,
    fields => Krawfish::Koral::Document::Fields->new
  );
  my $self = bless \%object, $class;

  # Everything that is not a reference is treated as a file name
  $doc = decode_json(Mojo::File->new($doc)->slurp) unless ref $doc;

  # Parse the document
  $self->_parse($doc);

  return $self;
};
52
53
# Accessor: return whatever is stored in the 'stream' slot of the object
sub stream {
  my $self = shift;
  return $self->{stream};
};
58
59
# Accessor: return whatever is stored in the 'fields' slot of the object
sub fields {
  my $self = shift;
  return $self->{fields};
};
64
65
# Call identify($dict) on the fields object and on the stream object
# (in that order) and replace both slots with the respective results.
# According to the original note, this translates all terms into
# term_ids and adds unknown terms to the dictionary.
# Returns the document object itself.
sub identify {
  my $self = shift;
  my $dict = shift;

  foreach my $part (qw/fields stream/) {
    $self->{$part} = $self->{$part}->identify($dict);
  };

  return $self;
};
74
75
# Stringification:
# The serialized fields in square brackets, directly followed by the
# serialized stream. The optional $id argument is handed over unchanged
# to the to_string() methods of both objects.
sub to_string {
  my ($self, $id) = @_;

  my $fields_string = $self->fields->to_string($id);
  my $stream_string = $self->stream->to_string($id);

  return join '', '[', $fields_string, ']', $stream_string;
};
81
Akrondd10fb92017-08-08 20:19:46 +020082
# Parse the decoded document and create a token-ordered document.
#
# Expects a hash reference with a 'document' entry, of which
# the following entries are read:
#   primaryData - the primary text
#   fields      - list of metadata fields (key, value and optional type)
#   subtokens   - list of subtokens with character 'offsets'
#   annotations - list of koral:token and koral:span annotations
#
# Metadata is added to the fields object, subtokens and annotations
# are added to the stream object. Nothing is returned.
sub _parse {
  my ($self, $doc) = @_;

  # Get the document part
  # This may - in the future - support multiple documents at once
  $doc = $doc->{document};

  my $stream = $self->stream;
  my $fields = $self->fields;

  # Remember the primary data for the creation of the forward index.
  # Only undefined primary data is replaced, so a text like "0" survives.
  my $primary = $doc->{primaryData} // '';

  # Add metadata fields
  # my %sortable;
  foreach my $field (@{$doc->{fields}}) {
    my $type = $field->{type};

    # TODO:
    #   Presort fields based on their field_key_id!
    #   In that way it's faster to retrieve presorted fields
    #   for enrichment!

    # Prepare field for sorting
    # if ($field->{sortable}) {
    #   # Which entries need to be sorted?
    #   $sortable{$field->{key}}++;
    # };

    # Strings are the default type
    if (!$type || $type eq 'type:string') {
      my $value = $field->{value};

      if (ref $value && ref $value eq 'ARRAY') {

        if (DEBUG) {
          print_log('doc', 'Field ' . $field->{key} . ' is multivalued');
        };

        my $key = $field->{key};

        # Iterate over all field values and add each value
        foreach my $single (@$value) {
          $fields->add_string($key, $single);
        };
      }
      else {
        $fields->add_string($field->{key}, $value);
      };
    }
    elsif ($type eq 'type:integer') {
      $fields->add_int($field->{key}, $field->{value});
    }
    elsif ($type eq 'type:store') {
      $fields->add_store($field->{key}, $field->{value});
    }
    else {
      warn 'unknown field type: ' . $type;
    };

    # This will later be indexed for search as well as retrieval in
    # the forward index.
  };

  # TODO:
  #   Check that the unique field is given, as this is required
  # $self->{sortable} = \%sortable;

  # Store the text following the last subtoken as an additional subtoken
  # that only has preceding data. The check is based on the length,
  # so a trailing "0" is not dropped.
  my $flush_trailing = sub {
    my ($position, $offset) = @_;
    my $rest = substr($primary, $offset);

    # Nothing follows the last subtoken
    return unless defined $rest && length $rest;

    $stream->subtoken(
      $position,
      Krawfish::Koral::Document::Subtoken->new(
        preceding => $rest
      ));
    return;
  };

  my $pos = 0;
  my $primary_index = 0;

  # Get all subtokens
  if ($doc->{subtokens}) {

    print_log('doc', 'Parse subtokens') if DEBUG;

    # Get all subtoken offsets
    foreach my $subtoken (@{$doc->{subtokens}}) {

      # Get start and end of the subtoken
      my ($start, $end) = @{$subtoken->{offsets}};

      if (DEBUG) {
        print_log(
          'doc',
          'Store subtoken: ' . $pos . '=' . join('-', $start, $end)
        );
      };

      # Get the term surface from the primary text
      # TODO:
      #   Ensure that the offsets are valid!
      my $preceding = substr($primary, $primary_index, $start - $primary_index) // '';
      my $term = substr($primary, $start, $end - $start);
      $primary_index = $end;

      print_log('doc', 'Surface form is ' . $term) if DEBUG;

      $stream->subtoken(
        $pos,
        Krawfish::Koral::Document::Subtoken->new(
          preceding => $preceding,
          subterm => $term
        ));
      $pos++;
    };
  };

  # There are tokens indexed by subtokens
  if ($primary_index) {
    $flush_trailing->($pos, $primary_index);

    # TODO: Probably not a good idea
    $primary_index = 0;
  };

  # Get all annotations
  $pos = 0;
  foreach my $item (@{$doc->{annotations}}) {

    # A missing type matches neither branch
    my $type = $item->{'@type'} // '';

    # Add token term to term dictionary
    if ($type eq 'koral:token') {

      my $wrap = $item->{wrap};
      unless ($wrap) {
        warn 'No wrap defined in KoralQuery';
        CORE::next;
      };

      # Create the terms
      my @keys;

      # Token wraps a koral:termGroup
      if ($wrap->{'@type'} && $wrap->{'@type'} eq 'koral:termGroup') {
        foreach my $operand (@{$wrap->{operands}}) {
          push @keys, _term($operand);
        };
      }

      # Token wraps a single koral:term
      else {
        push @keys, _term($wrap);
      };

      # Get the referenced subtokens (start and optional end)
      my @subtoken_offset = _subtokens($item);

      # There are no reference subtokens defined
      unless (scalar @subtoken_offset) {

        # Use the current position for storing
        push @subtoken_offset, $pos;

        # But there are offsets defined
        if ($item->{offsets}) {

          # Get character definitions
          my ($start, $end) = @{$item->{offsets}};

          # Get the term surface from the primary text
          # TODO:
          #   Ensure that the offsets are valid!
          # The preceding data falls back to the empty string,
          # in the same way as it is done for subtokens
          my $preceding = substr($primary, $primary_index, $start - $primary_index) // '';
          my $term = substr($primary, $start, $end - $start);
          $primary_index = $end;

          $stream->subtoken(
            $pos,
            Krawfish::Koral::Document::Subtoken->new(
              preceding => $preceding,
              subterm => $term
            ));
        };
        $pos++;
      };

      # Add token annotations.
      # Without an explicit end the annotation ends after the start subtoken
      foreach my $key (@keys) {
        $stream->subtoken(
          $subtoken_offset[0]
        )->add_annotation(
          $key,
          $subtoken_offset[1] ? $subtoken_offset[1] : $subtoken_offset[0] + 1
        );
      };
    }

    # Add span term to dictionary
    elsif ($type eq 'koral:span') {

      # Spans without a wrap are skipped in the same way as tokens
      unless ($item->{wrap}) {
        warn 'No wrap defined in KoralQuery';
        CORE::next;
      };

      # Spans can only be positioned by their subtokens
      my $subtokens = $item->{subtokens};
      unless ($subtokens && @$subtokens) {
        warn 'No subtokens defined in KoralQuery';
        CORE::next;
      };

      # Create the term
      my $term = _term($item->{wrap});
      $term->term_type('span');

      # Add span to forward stream -
      # the end is AFTER the last referenced subtoken
      $stream->subtoken($subtokens->[0])->add_annotation(
        $term,
        $subtokens->[-1] + 1
      );
    };
  };

  # There are tokens indexed by offsets
  if ($primary_index) {
    $flush_trailing->($pos, $primary_index);

    # TODO: Probably not a good idea
    $primary_index = 0;
  };

  return;
};
316
Akronf0d514a2016-11-01 14:16:25 +0100317
# Create a Krawfish::Koral::Query::Term object from a koral term structure.
#
# Expects a hash reference, of which the entries 'foundry', 'layer',
# 'key' and 'value' are passed to the setters of the same name.
# Entries are only ignored if they are undefined or empty - so that
# legitimate values like "0" (e.g. the surface form of the token "0")
# are not lost.
# The term type is always set to 'token'; callers may overwrite it.
# Returns the term object.
#
# TODO: Use from_koral()->term
#   Potentially with a prefix
sub _term {
  my $item = shift;
  my $term = Krawfish::Koral::Query::Term->new;

  # The order of the setter calls is foundry, layer, key, value
  foreach my $attribute (qw/foundry layer key value/) {
    my $content = $item->{$attribute};

    # A truth check would wrongly skip "0"
    CORE::next unless defined $content && length $content;

    $term->$attribute($content);
  };

  # Make token default term type
  $term->term_type('token');

  return $term;
}
356
357
# Return the subtoken positions of an annotation or nothing.
#
# Expects a hash reference. If its 'subtokens' entry contains at least
# one defined position, a list is returned containing the first position
# and - in case a second position is given - the position AFTER the
# second subtoken. The second position is checked for definedness,
# so the valid position 0 is respected.
# Returns an empty list if no subtokens are given (this includes an
# empty subtoken list), so callers can fall back to another position.
sub _subtokens {
  my $item = shift;
  my $subtokens = $item->{subtokens};

  # No (usable) subtokens defined
  return unless $subtokens && defined $subtokens->[0];

  my @posting = ($subtokens->[0]);

  # The end is AFTER the second subtoken
  if (defined $subtokens->[1]) {
    push @posting, $subtokens->[1] + 1;
  };

  return @posting;
};
378
Akron927789d2016-11-01 02:33:04 +0100379
3801;
Akrondd10fb92017-08-08 20:19:46 +0200381
382
383__END__