# blob: b54ea4b224edadc706de45ea217fcf3c9bc5420b
package Krawfish::Index::Fields;
use strict;
use warnings;

use Krawfish::Index::Rank::Fields;
use Krawfish::Log;

# Compile-time switch: when true, the methods below emit trace
# messages through print_log().
use constant DEBUG => 0;
8
# Construct a new, empty field store.
#
# Positional parameters:
#   $file       - kept unchanged under the 'file' key
#   $identifier - kept unchanged under the 'identifier' key
#
# Returns the blessed object. Missing arguments are stored as undef.
sub new {
  my ($class, $file, $identifier) = @_;

  return bless {
    file       => $file,
    array      => [],          # doc array, indexed by doc id
    ranks      => {},          # ranked lists, keyed by field name
    identifier => $identifier
  }, $class;
};
18
# Todo: Probably store multiple key/values at once

# Store one field value for a document.
#
# Positional parameters:
#   $doc_id - index into the doc array
#   $key    - field name
#   $value  - field value
#
# Side effects: creates the document's field hash if it does not
# exist yet and drops any cached ranking for $key, so the next
# ranking of that field is rebuilt.
#
# Returns the stored value.
sub store {
  my ($self, $doc_id, $key, $value) = @_;

  # Preset fields with doc_id
  my $fields = ($self->{array}->[$doc_id] //= {});

  # Delete cached ranks
  delete $self->{ranks}->{$key};

  print_log(
    'fields',
    'Store field ' . $key . ':' . $value . ' for ' . $doc_id
  ) if DEBUG;

  # TODO:
  #   This needs to have information whether it's a string
  #   or an integer (mainly for sorting)
  return $fields->{$key} = $value;
};
42
Akron4b5257e2017-04-05 17:50:22 +020043
# Get the field value of a document.
#
# Positional parameters:
#   $doc_id - index into the doc array
#   $field  - (optional) name of a single field to fetch
#
# Returns the value of $field when a field name is passed, otherwise
# the document's whole field hash (the stored reference, not a copy).
# Yields undef when the document or the field is unknown.
sub get {
  my ($self, $doc_id, @request) = @_;
  my $doc = $self->{array}->[$doc_id];

  # Get all fields
  return $doc unless @request;

  # Get specific field
  my $field = $request[0];

  print_log(
    'fields',
    'Get field ' . $field . ' for ' . $doc_id
  ) if DEBUG;

  return $doc->{$field};
};
63
Akron2ee89f12016-12-07 18:33:52 +010064
# Return documents by array.
#
# Returns the reference to the internal doc array itself (not a
# copy); each occupied slot holds that document's field hash.
sub docs {
  my $self = shift;
  return $self->{array};
};
69
70
# Sort documents by a field and attach a numerical rank.
# Returns the maximum rank and a vector of ranks at doc id position.
# Ranks can be set multiple times
#
# TODO:
#   These ranks may also be used for facet search, because
#   remembering the ranks and increment their values will
#   return the most common k facets of the field quickly.
#   Returning the fields per rank, however, may become
#   a linear search for the first rank in the ranked fields,
#   which may be slow.
#   But nonetheless, the max_rank field may also give a hint,
#   if the field is good for faceting! (unique ranks per field
#   are bad, for example!)
#
# TODO:
#   Return object
#
# Positional parameters:
#   $field - name of the field to rank by
#
# Returns the Krawfish::Index::Rank::Fields object for $field. The
# object is cached per field in $self->{ranks}; store() deletes the
# cache entry of the field it writes.
sub ranked_by {
  my ($self, $field) = @_;

  print_log(
    'fields',
    'Get rank vector for ' . $field
  ) if DEBUG;

  # TODO:
  #   Currently ranks are set absolutely - but they should be set
  #   multiple times to make sorts for multiple fields
  #
  # TODO: Check if the field needs to be sorted
  #   numerically or based on a collation

  my $ranks = $self->{ranks};

  # Reuse the ranking cached in memory, if there is one
  return $ranks->{$field} if $ranks->{$field};

  # Collect the defined values of the field over all documents.
  # Empty document slots are skipped before dereferencing: inside map,
  # $_ aliases the array element, and $_->{$field} on an undefined
  # element would autovivify it into an empty hash, silently changing
  # the doc array.
  # NOTE(review): documents without the field are dropped, so list
  # positions do not correspond to doc ids - confirm this is what
  # Krawfish::Index::Rank::Fields expects.
  my @values =
    grep { defined $_ }
    map  { defined $_ ? $_->{$field} : () }
    @{ $self->{array} };

  # Add rank
  $ranks->{$field} = Krawfish::Index::Rank::Fields->new(\@values);

  print_log(
    'fields',
    'Return rank vector for ' . $field
  ) if DEBUG;

  # Return ranked list
  return $ranks->{$field};
};
124
125
# A module file must end with a true value.
1;