| Akron | 71fc14c | 2016-10-31 23:44:43 +0100 | [diff] [blame] | 1 | package Krawfish::Index::Fields; |
| Akron | d6a87ff | 2017-08-11 00:17:30 +0200 | [diff] [blame] | 2 | use Krawfish::Index::Fields::Doc; |
| Akron | f703f6f | 2017-08-25 21:20:52 +0200 | [diff] [blame] | 3 | use Krawfish::Index::Fields::Ranks; |
| Akron | d6a87ff | 2017-08-11 00:17:30 +0200 | [diff] [blame] | 4 | use Krawfish::Index::Fields::Pointer; |
| Akron | 8781e6b | 2016-12-09 02:04:17 +0100 | [diff] [blame] | 5 | use Krawfish::Log; |
| Akron | 71fc14c | 2016-10-31 23:44:43 +0100 | [diff] [blame] | 6 | use warnings; |
| Akron | d6a87ff | 2017-08-11 00:17:30 +0200 | [diff] [blame] | 7 | use strict; |
| Akron | 71fc14c | 2016-10-31 23:44:43 +0100 | [diff] [blame] | 8 | |
| Akron | dc8dceb | 2017-08-22 20:25:39 +0200 | [diff] [blame] | 9 | use constant DEBUG => 0; |
| Akron | 8781e6b | 2016-12-09 02:04:17 +0100 | [diff] [blame] | 10 | |
| Akron | c1ed58c | 2017-08-04 17:26:30 +0200 | [diff] [blame] | 11 | |
| 12 | # TODO: |
| 13 | # Reranking a field is not necessary, if the field value is already given. |
| 14 | # In that case, look up the dictionary if the value is already given, |
| 15 | # take the example doc of that field value and add the rank of that |
| 16 | # doc for the new doc. |
| 17 | # If the field is not yet given, take the next or previous value in dictionary |
| 18 | # order and use the rank to rerank the field (see K::I::Dictionary). |
| 19 | # BUT: This only works if the field has the same collation as the |
| 20 | # dictionary! |
| 21 | |
| Akron | 4a46e6e | 2017-08-16 17:49:16 +0200 | [diff] [blame] | 22 | |
| 23 | # Merging the fields index is pretty simple, as it only needs to be indexed |
| 24 | # on the document level and then simply be appended. |
| 25 | |
| Akron | 2ee89f1 | 2016-12-07 18:33:52 +0100 | [diff] [blame] | 26 | # Sort documents by a field and attach a numerical rank. |
| Akron | 7db79e2 | 2016-12-08 23:02:32 +0100 | [diff] [blame] | 27 | # Returns the maximum rank and a vector of ranks at doc id position. |
| Akron | 2ee89f1 | 2016-12-07 18:33:52 +0100 | [diff] [blame] | 28 | # Ranks can be set multiple times. |
| Akron | 7db79e2 | 2016-12-08 23:02:32 +0100 | [diff] [blame] | 29 | # |
| 30 | # TODO: |
| 31 | # These ranks may also be used for facet search, because |
| 32 | # remembering the ranks and increment their values will |
| 33 | # return the most common k facets of the field quickly. |
| 34 | # Returning the fields per rank, however, may become |
| 35 | # a linear search for the first rank in the ranked fields, |
| 36 | # which may be slow. |
| 37 | # But nonetheless, the max_rank field may also give a hint, |
| 38 | # if the field is good for faceting! (unique ranks per field |
| 39 | # are bad, for example!) |
| Akron | d6a87ff | 2017-08-11 00:17:30 +0200 | [diff] [blame] | 40 | |
| 41 | |
# Constructor
#
# Creates an empty fields index with three slots:
#   docs        - array reference, filled by add() and read by doc()
#   last_doc_id - highest document identifier handed out so far;
#                 -1 as long as no document was added
#   ranks       - hash reference, initially empty
#                 (no sub in this file reads or writes it)
sub new {
  my ($class) = @_;

  my %fields = (
    docs        => [],
    last_doc_id => -1,
    ranks       => {},
  );

  return bless \%fields, $class;
};
| 51 | |
| Akron | b00c2be | 2017-08-16 14:45:07 +0200 | [diff] [blame] | 52 | |
# Get last document identifier aka max_doc_id.
# Returns the value of the last_doc_id slot, which new() initializes
# to -1 and add() increments once per added document.
sub last_doc_id {
  my ($self) = @_;
  return $self->{last_doc_id};
};
| 57 | |
| 58 | |
# Add a document to the fields index.
#
# Accepts a Krawfish::Koral::Document, wraps it in a
# Krawfish::Index::Fields::Doc and stores it at the next free
# position of the docs list.
#
# Returns the document identifier under which the document was stored,
# i.e. the value that doc() needs to retrieve it again
# (0 for the first document added to a fresh index).
sub add {
  my ($self, $doc) = @_;

  # Pre-increment: the identifier has to be the NEW last_doc_id.
  # A post-increment would return the old value (-1 for the first
  # document) while the document is stored one position later.
  my $doc_id = ++$self->{last_doc_id};

  # TODO:
  #   use Krawfish::Index::Store::V1::Fields->new;
  $self->{docs}->[$doc_id] = Krawfish::Index::Fields::Doc->new($doc);

  return $doc_id;
};
| 70 | |
| Akron | b00c2be | 2017-08-16 14:45:07 +0200 | [diff] [blame] | 71 | |
# Get doc from list (as long as the list provides random access to docs).
#
# Takes a document identifier and returns whatever is stored at that
# position of the docs list (undef if nothing is stored there).
# Writes a log line first when the DEBUG constant is enabled.
sub doc {
  my $self   = shift;
  my $doc_id = shift;

  if (DEBUG) {
    print_log('fields', 'Get document for id ' . $doc_id);
  }

  return $self->{docs}->[$doc_id];
};
| 78 | |
| 79 | |
# Get a pointer for this fields index.
# Returns a new Krawfish::Index::Fields::Pointer that is constructed
# with this object as its only argument.
sub pointer {
  my ($self) = @_;
  return Krawfish::Index::Fields::Pointer->new($self);
};
| 85 | |
| 86 | |
| Akron | 71fc14c | 2016-10-31 23:44:43 +0100 | [diff] [blame] | 87 | 1; |
| Akron | d6a87ff | 2017-08-11 00:17:30 +0200 | [diff] [blame] | 88 | |
| 89 | |
| 90 | __END__ |