| Akron | 71fc14c | 2016-10-31 23:44:43 +0100 | [diff] [blame] | 1 | package Krawfish::Index::Fields; |
| Akron | 2bdc9cb | 2017-04-21 15:25:54 +0200 | [diff] [blame] | 2 | use Krawfish::Index::Rank::Fields; |
| Akron | 8781e6b | 2016-12-09 02:04:17 +0100 | [diff] [blame] | 3 | use Krawfish::Log; |
| Akron | 71fc14c | 2016-10-31 23:44:43 +0100 | [diff] [blame] | 4 | use strict; |
| 5 | use warnings; |
| 6 | |
| Akron | 8781e6b | 2016-12-09 02:04:17 +0100 | [diff] [blame] | 7 | use constant DEBUG => 0; |
| 8 | |
# Construct a new field store.
# Takes the class name followed by two positional arguments,
# which are kept under the keys 'file' and 'identifier'.
# The doc array and the rank cache start out empty.
sub new {
  my ($class, $file, $identifier) = @_;

  my %self = (
    file       => $file,
    array      => [],  # doc array
    ranks      => {},  # ranked lists
    identifier => $identifier
  );

  return bless \%self, $class;
};
| 18 | |
| Akron | dd02499 | 2017-05-07 13:02:06 +0200 | [diff] [blame] | 19 | # Todo: Probably store multiple key/values at once |
| 20 | |
# Store a single field value for a document.
# Arguments are the document id, the field key and the field value.
# Any cached rank for the field key is dropped.
# Evaluates to the stored value.
sub store {
  my ($self, $doc_id, $key, $value) = @_;

  # Make sure the document has a field hash
  my $docs = $self->{array};
  $docs->[$doc_id] //= {};

  # The cached rank of this field is no longer valid
  delete $self->{ranks}->{$key};

  if (DEBUG) {
    print_log(
      'fields',
      'Store field ' . $key . ':' . $value . ' for ' . $doc_id
    );
  };

  # TODO:
  #   This needs to have information whether it's a string
  #   or an integer (mainly for sorting)
  return $docs->[$doc_id]->{$key} = $value;
};
| 42 | |
| Akron | 4b5257e | 2017-04-05 17:50:22 +0200 | [diff] [blame] | 43 | |
# Get the field value of a document.
# Called with a document id only, the whole field hash of the
# document is returned as it is kept in the doc array.
# Called with a document id and a field name, only the value
# stored for that field is returned.
sub get {
  my ($self, $doc_id, @wanted) = @_;

  # Keep the looked up document in a local scalar
  my $doc = $self->{array}->[$doc_id];

  # Get all fields
  return $doc unless @wanted;

  # Get specific field
  my $field = $wanted[0];

  if (DEBUG) {
    print_log(
      'fields',
      'Get field ' . $field . ' for ' . $doc_id
    );
  };

  return $doc->{$field};
};
| 63 | |
| Akron | 2ee89f1 | 2016-12-07 18:33:52 +0100 | [diff] [blame] | 64 | |
# Return the doc array (an array reference with one
# field hash per document id position).
sub docs {
  my ($self) = @_;
  return $self->{array};
};
| 69 | |
| 70 | |
# Build (or fetch from the cache) the rank for a field.
#
# All defined values stored for the field are collected from the
# doc array and passed as an array reference to
# Krawfish::Index::Rank::Fields->new. The resulting object is
# cached per field name and returned; store() drops the cache
# entry of a field whenever a value for that field is stored.
# Ranks can be requested multiple times.
#
# TODO:
#   These ranks may also be used for facet search, because
#   remembering the ranks and increment their values will
#   return the most common k facets of the field quickly.
#   Returning the fields per rank, however, may become
#   a linear search for the first rank in the ranked fields,
#   which may be slow.
#   But nonetheless, the max_rank field may also give a hint,
#   if the field is good for faceting! (unique ranks per field
#   are bad, for example!)
#
sub ranked_by {
  my ($self, $field) = @_;

  print_log(
    'fields',
    'Get rank vector for ' . $field
  ) if DEBUG;

  # TODO:
  #   Currently ranks are set absolutely - but they should be set
  #   multiple times to make sorts for multiple fields
  #
  # TODO: Check if the field needs to be sorted
  #   numerically or based on a collation

  my $ranks = $self->{ranks};

  # Reuse the cached rank if there is one
  return $ranks->{$field} if $ranks->{$field};

  # Collect all defined values of the field.
  # The doc array is indexed by doc id and may contain empty
  # slots, so these are skipped instead of being dereferenced.
  my @values;
  foreach my $doc (@{$self->{array}}) {
    next unless $doc;

    my $value = $doc->{$field};
    push @values, $value if defined $value;
  };

  # Add rank
  $ranks->{$field} = Krawfish::Index::Rank::Fields->new(\@values);

  if (DEBUG) {
    print_log(
      'fields',
      'Return rank vector for ' . $field
    );
  };

  # Return ranked list
  return $ranks->{$field};
};
| 124 | |
| 125 | |
| Akron | 71fc14c | 2016-10-31 23:44:43 +0100 | [diff] [blame] | 126 | 1; |