blob: ac41d214fbeb8a71bc1f18a58da4effa22ca7111 [file] [log] [blame]
Akronf0d514a2016-11-01 14:16:25 +01001package Krawfish::Koral::Document;
Akron4a46e6e2017-08-16 17:49:16 +02002use Krawfish::Koral::Document::Stream;
3use Krawfish::Koral::Document::Fields;
Akron14ff0c42017-08-09 20:49:52 +02004use Krawfish::Koral::Query::Term;
Akrondd10fb92017-08-08 20:19:46 +02005use Krawfish::Log;
6use Mojo::File;
7use Mojo::JSON qw/encode_json decode_json/;
Akron927789d2016-11-01 02:33:04 +01008use strict;
9use warnings;
Akrondd10fb92017-08-08 20:19:46 +020010use List::MoreUtils qw/uniq/;
Akron927789d2016-11-01 02:33:04 +010011
# Parses a document and creates a simple forward index list.
#
#   primary='...',
#   fields=[+field => title],
#   terms=[*term => [postings*]]
#
# Then, when the document is added to certain nodes,
# the keys will be translated to term_ids and the document
# can be added with all freq_in_doc information.
21
# Foundry and layer may need separate term_ids,
# so they are exceptionally small.
Akrondd10fb92017-08-08 20:19:46 +020024
# TODO:
#   Don't forget to deal with TUIs!
27
Akron94256e62017-10-10 17:29:18 +020028use constant DEBUG => 0;
Akrondd10fb92017-08-08 20:19:46 +020029
# Construct a new document object and parse the passed document.
#
# Parameters:
#   $doc - either a reference to an already decoded document structure,
#          or a plain scalar that is treated as the name of a file,
#          whose content is slurped and decoded as JSON
#
# Returns the blessed object, after its stream and fields objects
# were filled by _parse().
#
# Errors raised by Mojo::File (unreadable file) or decode_json
# (invalid JSON) are not caught here.
sub new {
  my ($class, $doc) = @_;

  my $self = bless {
    # sortable => {},
    stream => Krawfish::Koral::Document::Stream->new,
    fields => Krawfish::Koral::Document::Fields->new
  }, $class;

  # Anything that is not a reference is a file name
  unless (ref $doc) {
    $doc = decode_json(Mojo::File->new($doc)->slurp);
  };

  # Parse the document
  $self->_parse($doc);

  return $self;
};
51
52
# Get the stream object
#
# Returns whatever is stored in the 'stream' slot of the object
# (set by new() and replaced by identify()).
sub stream {
  my $self = shift;
  return $self->{stream};
};
57
58
# Get the fields object
#
# Returns whatever is stored in the 'fields' slot of the object
# (set by new() and replaced by identify()).
sub fields {
  my $self = shift;
  return $self->{fields};
};
63
64
# Translate all terms into term_ids and
# add unknown terms to the dictionary
#
# Parameters:
#   $dict - the dictionary, passed on unchanged to the identify()
#           methods of the fields and the stream object
#
# The fields and stream slots are replaced by whatever the
# respective identify() call returns (fields first, then stream).
#
# Returns the document object itself.
sub identify {
  my ($self, $dict) = @_;
  $self->{fields} = $self->{fields}->identify($dict);
  $self->{stream} = $self->{stream}->identify($dict);
  return $self;
};
73
74
# Stringification
#
# Parameters:
#   $id - passed on unchanged to the to_string() methods of the
#         fields and the stream object
#         (presumably switches to a term_id based output - TODO confirm)
#
# Returns the fields serialization in square brackets,
# directly followed by the stream serialization.
sub to_string {
  my ($self, $id) = @_;
  return '[' . $self->fields->to_string($id) . ']' . $self->stream->to_string($id);
};
80
Akrondd10fb92017-08-08 20:19:46 +020081
# Parse the decoded document and create a token-ordered document.
#
# Parameters:
#   $doc - hash reference with a 'document' entry. The entries
#          'primaryData', 'fields', 'subtokens' and 'annotations'
#          of that entry are read.
#
# Side effects:
#   - Every entry in 'fields' is added to the fields object using
#     add_string(), add_int() or add_store(), depending on its type.
#     An unknown type only triggers a warning.
#   - Every entry in 'subtokens' is passed to $stream->subtoken()
#     together with its surface form and the text preceding it,
#     both cut from the primary data.
#   - Every 'koral:token' and 'koral:span' entry in 'annotations'
#     is added as an annotation to the subtoken it starts at.
#
# Returns nothing.
sub _parse {
  my ($self, $doc) = @_;

  # Get the document part
  # This may - in the future - support multiple documents at once
  $doc = $doc->{document};

  my $stream = $self->stream;
  my $fields = $self->fields;

  # Remember the primary data for the creation
  # of the forward index. Check for definedness instead of truth,
  # so a primary text of '0' is not discarded.
  my $primary = $doc->{primaryData} // '';

  # Add metadata fields
  my $pos = 0;
  # my %sortable;
  foreach my $field (@{$doc->{fields}}) {

    # TODO:
    #   Presort fields based on their field_key_id!
    #   In that way it's faster to retrieve presorted fields
    #   for enrichment!

    # Prepare field for sorting
    # if ($field->{sortable}) {
    #   # Which entries need to be sorted?
    #   $sortable{$field->{key}}++;
    # };

    # Prepare for summarization.
    # Fields without a type are treated as strings.
    if (!$field->{type} || $field->{type} eq 'type:string') {
      if (ref $field->{value} && ref $field->{value} eq 'ARRAY') {

        if (DEBUG) {
          print_log('doc', 'Field ' . $field->{key} . ' is multivalued');
        };

        my $key = $field->{key};

        # Iterate over all field values and add the value
        foreach my $value (@{$field->{value}}) {
          $fields->add_string($key, $value);
        };
      }
      else {
        $fields->add_string($field->{key}, $field->{value});
      };
    }
    elsif ($field->{type} eq 'type:integer') {
      $fields->add_int($field->{key}, $field->{value});
    }
    elsif ($field->{type} eq 'type:store') {
      $fields->add_store($field->{key}, $field->{value});
    }
    else {
      warn 'unknown field type: ' . $field->{type};
    };

    # This will later be indexed for search as well as retrieval in
    # the forward index.
  };

  # Check that the unique field is given, as this is required
  # $self->{sortable} = \%sortable;

  # Character offset in the primary data up to which
  # the text was already passed to the stream
  my $primary_index = 0;

  # Pass the primary text following the last surface form to the
  # stream. This is required after the subtokens and after the
  # annotations, as both may consume primary data.
  my $flush_trailing = sub {

    # Nothing was consumed from the primary data
    return unless $primary_index;

    my $trailing = substr($primary, $primary_index);

    # Check for length instead of truth,
    # so a trailing '0' is not dropped
    if (defined $trailing && length $trailing) {
      $stream->subtoken($pos, $trailing, '');
    };

    # TODO: Probably not a good idea
    $primary_index = 0;
    return;
  };

  # Get all subtokens
  if ($doc->{subtokens}) {

    print_log('doc', 'Parse subtokens') if DEBUG;

    # Get all subtoken offsets
    foreach my $subtoken (@{$doc->{subtokens}}) {

      # Get start and end of the subtoken
      my ($start, $end) = @{$subtoken->{offsets}};

      if (DEBUG) {
        print_log(
          'doc',
          'Store subtoken: ' . $pos . '=' . join('-', $start, $end)
        );
      };

      # Get the term surface from the primary text
      # TODO:
      #   Ensure that the offsets are valid!
      my $preceding = substr($primary, $primary_index, $start - $primary_index) // '';
      my $term = substr($primary, $start, $end - $start);
      $primary_index = $end;

      print_log('doc', 'Surface form is ' . $term) if DEBUG;

      $stream->subtoken($pos, $preceding, $term);
      $pos++;
    };
  };

  # There are tokens indexed by subtokens
  $flush_trailing->();

  # Get all annotations
  $pos = 0;
  foreach my $item (@{$doc->{annotations}}) {

    # Annotations without a type match none of the branches
    my $type = $item->{'@type'} // '';

    # Add token term to term dictionary
    if ($type eq 'koral:token') {

      unless ($item->{wrap}) {
        warn 'No wrap defined in KoralQuery';
        CORE::next;
      };

      # Create key string
      my $wrap = $item->{wrap};
      my @keys;

      # Token wraps a koral:termGroup
      if ($wrap->{'@type'} && $wrap->{'@type'} eq 'koral:termGroup') {
        foreach my $operand (@{$wrap->{operands}}) {
          push @keys, _term($operand);
        };
      }

      # Token wraps a single koral:term
      else {
        push @keys, _term($wrap);
      };

      # Append posting to postings list
      my @subtoken_offset = _subtokens($item);

      # There are no reference subtokens defined
      unless (scalar @subtoken_offset) {

        # Use the current position for storing
        push @subtoken_offset, $pos;

        # But there are offsets defined
        if ($item->{offsets}) {

          # Get character definitions
          my ($start, $end) = @{$item->{offsets}};

          # Get the term surface from the primary text
          # TODO:
          #   Ensure that the offsets are valid!
          # The preceding text defaults to the empty string,
          # in the same way as for subtokens.
          my $preceding = substr($primary, $primary_index, $start - $primary_index) // '';
          my $term = substr($primary, $start, $end - $start);
          $primary_index = $end;

          $stream->subtoken($pos, $preceding, $term);
        };
        $pos++;
      };

      # Without an explicit end, the annotation
      # ends directly after its first subtoken
      my $anno_end = $subtoken_offset[1]
        ? $subtoken_offset[1]
        : $subtoken_offset[0] + 1;

      # Add token terms
      foreach my $key (@keys) {

        # Add token annotation
        $stream->subtoken($subtoken_offset[0])->add_annotation($key, $anno_end);
      };
    }

    # Add span term to dictionary
    elsif ($type eq 'koral:span') {

      # Create key string
      my $term = _term($item->{wrap});
      $term->term_type('span');

      # Add span to forward stream.
      # The end is AFTER the last subtoken.
      $stream->subtoken($item->{subtokens}->[0])->add_annotation(
        $term,
        $item->{subtokens}->[-1] + 1
      );
    };
  };

  # There are tokens indexed by annotation offsets
  $flush_trailing->();

  return;
};
291
Akronf0d514a2016-11-01 14:16:25 +0100292
# Create a term object from a koral:term structure.
#
# Parameters:
#   $item - hash reference, of which the entries
#           'foundry', 'layer', 'key' and 'value' are read
#
# Returns a new Krawfish::Koral::Query::Term object with all given
# attributes set and the term type 'token'. Callers that need
# another term type have to change it afterwards.
#
# TODO: Use from_koral()->term
#   Potentially with a prefix
sub _term {
  my $item = shift;
  my $term = Krawfish::Koral::Query::Term->new;

  # Copy all given attributes.
  # Check for definedness and length instead of truth,
  # so a valid key or value of '0' is not dropped.
  foreach my $attr (qw/foundry layer key value/) {
    my $given = $item->{$attr};
    CORE::next unless defined $given && length $given;
    $term->$attr($given);
  };

  # Make token default term type
  $term->term_type('token');

  return $term;

  #my $key = '';
  ## Create term for term dictionary
  #if ($item->{foundry}) {
  #  $key .= $item->{foundry};
  #  if ($item->{layer}) {
  #    $key .= '/' . $item->{layer};
  #  }
  #  $key .= '=';
  #};
  #return $key . ($item->{key} // '');
};
331
332
# Return subtoken list or nothing
#
# Parameters:
#   $item - hash reference, of which the 'subtokens' entry is read
#
# Returns
#   - the empty list, if no subtokens are referenced
#     (missing entry or empty list),
#   - the first subtoken, if only one is referenced, or
#   - the first subtoken and the position AFTER the second one.
sub _subtokens {
  my $item = shift;

  my $refs = $item->{subtokens};

  # An empty list references nothing, so the caller
  # has to fall back to the current position
  return unless $refs && @$refs;

  my @posting = ($refs->[0]);

  # Check for definedness, as 0 is a valid subtoken position
  if (defined $refs->[1]) {

    # The end is AFTER the second subtoken
    push @posting, $refs->[1] + 1;
  };

  return @posting;
};
353
Akron927789d2016-11-01 02:33:04 +0100354
3551;
Akrondd10fb92017-08-08 20:19:46 +0200356
357
358__END__