blob: aa96468a91f8690e6394dffde43ff0938aa25ffd [file] [log] [blame]
Akronf0d514a2016-11-01 14:16:25 +01001package Krawfish::Koral::Document;
Akron4a46e6e2017-08-16 17:49:16 +02002use Krawfish::Koral::Document::Stream;
3use Krawfish::Koral::Document::Fields;
Akron14ff0c42017-08-09 20:49:52 +02004use Krawfish::Koral::Query::Term;
Akrondd10fb92017-08-08 20:19:46 +02005use Krawfish::Log;
6use Mojo::File;
7use Mojo::JSON qw/encode_json decode_json/;
Akron927789d2016-11-01 02:33:04 +01008use strict;
9use warnings;
Akrondd10fb92017-08-08 20:19:46 +020010use List::MoreUtils qw/uniq/;
Akron927789d2016-11-01 02:33:04 +010011
Akrondd10fb92017-08-08 20:19:46 +020012# Parses a document and creates a simple forward index list.
13#
14# primary='...',
15# fields=[+field => title],
16# terms=[*term => [postings*]]
17#
18# Then, when the document is added to certain nodes,
19# the keys will be translated to term_ids and the document
20# can be added with all freq_in_doc information
21
22
Akron14ff0c42017-08-09 20:49:52 +020023# TODO:
24# Don't forget to deal with TUIs!
25
# Foundry and layer may need separate term_ids so they are exceptionally small.
27
28
Akrondd10fb92017-08-08 20:19:46 +020029use constant DEBUG => 1;
30
# Constructor
#
# Accepts either an already decoded document (any reference) or
# a plain scalar, which is treated as the path of a JSON file that
# is read and decoded first. The document is parsed immediately.
#
# Returns the new document object.
sub new {
  my ($class, $doc) = @_;

  # Start with an empty stream, empty fields
  # and no sortable keys
  my %self = (
    sortable => {},
    stream   => Krawfish::Koral::Document::Stream->new,
    fields   => Krawfish::Koral::Document::Fields->new
  );
  my $self = bless \%self, $class;

  # Anything that is not a reference is a file name
  if (!ref $doc) {
    my $json = Mojo::File->new($doc)->slurp;
    $doc = decode_json($json);
  };

  # Fill stream, fields and sortable keys
  $self->_parse($doc);

  return $self;
};
52
53
# Accessor for the stream object of the document
sub stream {
  my $self = shift;
  return $self->{stream};
};
58
59
# Accessor for the fields object of the document
sub fields {
  my $self = shift;
  return $self->{fields};
};
64
65
# Accessor for the sortable keys.
# This is a hash reference; _parse() counts in it, per field key,
# the fields that are flagged as sortable.
sub sortable {
  my $self = shift;
  return $self->{sortable};
};
69
Akron14ff0c42017-08-09 20:49:52 +020070
# Translate all terms into term_ids and
# add unknown terms to the dictionary.
#
# The fields and the stream are each replaced by whatever
# their own identify() method returns for the given dictionary.
# Returns the document itself.
sub identify {
  my ($self, $dict) = @_;

  # Fields first, then the stream
  foreach my $part (qw/fields stream/) {
    my $identified = $self->{$part}->identify($dict);
    $self->{$part} = $identified;
  };

  return $self;
};
79
80
# Stringification
#
# The serialized fields are wrapped in square brackets and
# directly followed by the serialized stream.
sub to_string {
  my $self = shift;

  my $fields_string = $self->fields->to_string;
  my $stream_string = $self->stream->to_string;

  return "[$fields_string]$stream_string";
};
86
87
# Parse the file and create a token-ordered document.
#
# Expects the decoded document as a hash reference with a
# "document" entry, from which the following keys are read:
#
#   primaryData - the primary text
#   fields      - list of metadata fields
#                 (with key, value, type and sortable)
#   subtokens   - list of subtokens with character offsets
#   annotations - list of koral:token and koral:span items
#
# The fields are added to the fields object, subtokens and
# annotations are added to the stream object and the keys of all
# sortable fields are remembered in the document.
sub _parse {
  my ($self, $doc) = @_;

  # Get the document part
  # This may - in the future - support multiple documents at once
  $doc = $doc->{document};

  my $stream = $self->stream;
  my $fields = $self->fields;

  # Remember the primary data for the creation
  # of the forward index. Only a missing text is replaced
  # by the empty string - a text like "0" is kept.
  my $primary = $doc->{primaryData} // '';

  # Add metadata fields
  my %sortable;
  foreach my $field (@{$doc->{fields}}) {

    # TODO:
    #   Presort fields based on their field_key_id!
    #   In that way it's faster to retrieve presorted fields
    #   for enrichment!

    # Prepare field for sorting
    if ($field->{sortable}) {

      # Which entries need to be sorted?
      $sortable{$field->{key}}++;
    };

    # Prepare for summarization
    # Fields without a type are treated as strings
    if (!$field->{type} || $field->{type} eq 'type:string') {
      $fields->add_string($field->{key}, $field->{value});
    }
    elsif ($field->{type} eq 'type:integer') {
      $fields->add_int($field->{key}, $field->{value});
    }
    elsif ($field->{type} eq 'type:store') {
      $fields->add_store($field->{key}, $field->{value});
    }
    else {
      warn 'unknown field type: ' . $field->{type};
    };

    # This will later be indexed for search as well as retrieval in
    # the forward index.
  };

  # Remember the keys of all sortable fields
  # TODO:
  #   Check that the unique field is given, as this is required
  $self->{sortable} = \%sortable;

  # Current subtoken position
  my $pos = 0;

  # Character offset in the primary text up to which
  # the text was already added to the stream
  my $primary_index = 0;

  # Add the primary text following the last added surface form
  # as a final subtoken without a surface form.
  # Does nothing, if no surface form was taken from the primary text.
  my $flush_trailing = sub {
    return unless $primary_index;

    my $trailing = substr($primary, $primary_index);

    # Check for content instead of truth, so a trailing "0" is kept
    if (defined $trailing && length $trailing) {
      $stream->subtoken($pos, $trailing, '');
    };

    # TODO: Probably not a good idea
    $primary_index = 0;
    return;
  };

  # Get all subtokens
  if ($doc->{subtokens}) {

    print_log('doc', 'Parse subtokens') if DEBUG;

    # Get all subtoken offsets
    foreach my $subtoken (@{$doc->{subtokens}}) {

      # Get start and end of the subtoken
      my ($start, $end) = @{$subtoken->{offsets}};

      if (DEBUG) {
        print_log(
          'doc',
          'Store subtoken: ' . $pos . '=' . join('-', $start, $end)
        );
      };

      # Get the term surface from the primary text
      # TODO:
      #   Ensure that the offsets are valid!
      my $preceding = substr($primary, $primary_index, $start - $primary_index) // '';
      my $term = substr($primary, $start, $end - $start);
      $primary_index = $end;

      print_log('doc', 'Surface form is ' . $term) if DEBUG;

      $stream->subtoken($pos, $preceding, $term);
      $pos++;
    };
  };

  # There are tokens indexed by subtokens
  $flush_trailing->();

  # Get all annotations
  $pos = 0;
  foreach my $item (@{$doc->{annotations}}) {

    # Items without a type match none of the branches
    my $type = $item->{'@type'} // '';

    # Add token term to term dictionary
    if ($type eq 'koral:token') {

      unless ($item->{wrap}) {
        warn 'No wrap defined in KoralQuery';
        next;
      };

      # Create key string
      my $wrap = $item->{wrap};
      my @keys;

      # Token wraps a koral:termGroup
      if ($wrap->{'@type'} && $wrap->{'@type'} eq 'koral:termGroup') {
        foreach my $operand (@{$wrap->{operands}}) {
          push @keys, _term($operand);
        };
      }

      # Token wraps a single koral:term
      else {
        push @keys, _term($wrap);
      };

      # Append posting to postings list
      my @subtoken_offset = _subtokens($item);

      # There are no reference subtokens defined
      unless (scalar @subtoken_offset) {

        # Use the current position for storing
        push @subtoken_offset, $pos;

        # But there are offsets defined
        if ($item->{offsets}) {

          # Get character definitions
          my ($start, $end) = @{$item->{offsets}};

          # Get the term surface from the primary text
          # (treated the same way as in the subtoken loop)
          # TODO:
          #   Ensure that the offsets are valid!
          my $preceding = substr($primary, $primary_index, $start - $primary_index) // '';
          my $term = substr($primary, $start, $end - $start);
          $primary_index = $end;

          $stream->subtoken($pos, $preceding, $term);
        };
        $pos++;
      };

      # Add token terms
      foreach my $key (@keys) {

        # Add token annotation - without an explicit end
        # the annotation ends after the first subtoken
        $stream->subtoken(
          $subtoken_offset[0]
        )->add_annotation(
          $key,
          $subtoken_offset[1] ? $subtoken_offset[1] : $subtoken_offset[0] + 1
        );
      };
    }

    # Add span term to dictionary
    elsif ($type eq 'koral:span') {

      # A span can only be stored relative to subtokens
      my $subtokens = $item->{subtokens};
      unless (ref $subtokens eq 'ARRAY' && @$subtokens) {
        warn 'No subtokens defined for span in KoralQuery';
        next;
      };

      # Create key string
      my $term = _term($item->{wrap});
      $term->term_type('span');

      # Add span to forward stream,
      # ending after the last subtoken
      $stream->subtoken($subtokens->[0])->add_annotation(
        $term,
        $subtokens->[-1] + 1
      );
    };
  };

  # There are tokens indexed by offsets
  $flush_trailing->();

  return;
};
285
Akronf0d514a2016-11-01 14:16:25 +0100286
# Create a term object from a term-like hash, as found in the
# wrap of an annotation or in the operands of a term group.
#
# The entries foundry, layer, key and value are copied to the
# term, if they carry content. Returns the term object.
#
# TODO: Use from_koral()->term
#   Potentially with a prefix
sub _term {
  my $item = shift;
  my $term = Krawfish::Koral::Query::Term->new;

  foreach my $attr (qw/foundry layer key value/) {
    my $content = $item->{$attr};

    # Check for content instead of truth,
    # so a literal "0" (e.g. as a key) is not dropped
    next unless defined $content && length $content;

    $term->$attr($content);
  };

  return $term;
}
322
323
# Return subtoken list or nothing.
#
# Reads the subtokens entry of an annotation item and returns
# the first subtoken, followed by the position AFTER the second
# subtoken, if a second one is given.
# Returns the empty list (in list context), if the item has no
# subtokens entry, if the entry is not an array reference or if
# the array is empty.
sub _subtokens {
  my $item = shift;

  my $subtokens = $item->{subtokens};

  # Without at least one subtoken there is nothing to return,
  # so callers can rely on the first value being given
  return unless ref $subtokens eq 'ARRAY' && @$subtokens;

  my @posting = ($subtokens->[0]);

  # The end is AFTER the second subtoken
  if (defined $subtokens->[1]) {
    push @posting, $subtokens->[1] + 1;
  };

  return @posting;
};
344
Akron927789d2016-11-01 02:33:04 +0100345
3461;
Akrondd10fb92017-08-08 20:19:46 +0200347
348
349__END__
350
351
352
353sub to_list {
354 my ($self, $doc_id, $replicant_id) = @_;
355};
356
357
358sub add {
359 # This will add the doc_id to id-field and
360 # this will add the replicant field (either __1:1 or __2:node_name).
361};
362
363
364sub to_forward_index {
365 # Only works after identification!
366 # This should, however, use a K::I::Store class!
367};
368
369
3701;
371
372__END__