blob: 4e2d498824db0cab811fdbdf584d68ca7ea5cc29 [file] [log] [blame]
Akrondc8dceb2017-08-22 20:25:39 +02001package Krawfish::Result::Segment::Group::Fields;
2use parent 'Krawfish::Result';
3use Krawfish::Posting::Group::Fields;
Akroneb12ac22017-01-19 00:05:12 +01004use Krawfish::Log;
5use strict;
6use warnings;
7
8use constant DEBUG => 0;
9
# This will group matches (especially document matches) by field.
# This is useful e.g. for document browsing per corpus.
#
# Because the grouping is based on ranking, the sorting will be trivial.
#
# TODO:
#   For some mechanisms, it is not necessary to count all occurrences,
#   e.g. to get all keywords used in a certain virtual corpus or all
#   used annotations.
Akroneb12ac22017-01-19 00:05:12 +010019
# Construct a new field grouping operation.
#
# Parameters:
#   $field_obj - object that is asked for a pointer (via ->pointer)
#                the first time the pointer is needed
#   $query     - nested query whose matches are grouped
#   $fields    - array reference of field keys; entries that are
#                references are replaced by the value of their
#                ->term_id method, all other entries are used as given
#
# Returns the blessed object. The group collection is created right away
# and receives the very same key list that is stored in the object.
sub new {
  my ($class, $field_obj, $query, $fields) = @_;

  # Resolve every requested field to its key
  my $field_keys = [ map { ref($_) ? $_->term_id : $_ } @$fields ];

  my %state = (
    field_obj   => $field_obj,
    query       => $query,
    field_keys  => $field_keys,
    last_doc_id => -1
  );
  my $self = bless \%state, $class;

  # Initialize group object
  my $groups = Krawfish::Posting::Group::Fields->new($field_keys);
  $self->{groups} = $groups;

  return $self;
};
35
36
# Lazily create the pointer on the fields.
#
# Does nothing if the object already holds a field pointer. Otherwise the
# pointer is requested from the field object and stored for later calls.
sub _init {
  my $self = shift;

  # The pointer was created by an earlier call
  return if $self->{field_pointer};

  if (DEBUG) {
    print_log('g_fields', 'Create pointer on fields');
  };

  # Load the ranked list - may be too large for memory!
  $self->{field_pointer} = $self->{field_obj}->pointer;
};
48
49
# Serialize the operation in the form
#   gFields(#KEY1,#KEY2,...:QUERY)
# where each KEY is one of the stored field keys and QUERY is the
# serialization of the nested query.
sub to_string {
  my ($self) = @_;

  # Every key is marked with a leading hash sign
  my $keys = join ',', map { '#' . $_ } @{ $self->{field_keys} };

  # Serialize the nested query
  my $query = $self->{query}->to_string;

  return sprintf 'gFields(%s:%s)', $keys, $query;
};
56
57
# Shorthand for "search through":
# Call next() until it reports no further result and return the object.
sub finalize {
  my ($self) = @_;

  while ($self->next) {
    # All work is done by next() itself
  };

  return $self;
};
63
64
# Iterate to the next result.
#
# Moves the nested query forward to the next match and registers it in the
# group collection. Whenever a match lies in another document than the
# last registered one, the collection is flushed, the field pointer is
# moved to the new document and the field value patterns of that document
# are passed to the collection.
#
# Matches in documents the field pointer can not be positioned on
# (no fields for the document) are skipped.
#
# Returns 1 if a match was registered, and 0 if the nested query has no
# further matches (the collection is flushed once more in that case).
sub next {
  my $self = shift;

  $self->_init;

  my $groups  = $self->{groups};
  my $pointer = $self->{field_pointer};
  my $query   = $self->{query};

  # Document found to have no fields during this call,
  # so further matches in it need no flush and no pointer movement
  my $skipped_doc_id;

  # This has to be a real loop: Skipping a match is done by loop control,
  # which must not leave the subroutine.
  MATCH: while ($query->next) {

    # Get the current posting
    my $current = $query->current;
    my $doc_id  = $current->doc_id;

    # The match is in a new document
    if ($doc_id != $self->{last_doc_id}) {

      # The document is already known to have no fields
      next MATCH if defined $skipped_doc_id && $doc_id == $skipped_doc_id;

      # Flush old information
      $groups->flush;

      # There are no fields for this doc
      if ($pointer->skip_doc($doc_id) != $doc_id) {
        $skipped_doc_id = $doc_id;
        next MATCH;
      };

      my @field_keys = @{$self->{field_keys}};

      # Ignore stored fields
      my @field_objs = grep { $_->type ne 'store' } $pointer->fields(@field_keys);

      # Due to multivalued fields, a document can yield
      # a permutation of patterns, so all values per key are passed
      $groups->incr_doc(_field_patterns(\@field_keys, \@field_objs));

      # Set last doc to current doc
      $self->{last_doc_id} = $doc_id;
    };

    # Add to frequencies
    $groups->incr_match;

    return 1;
  };

  # Flush cached results
  $groups->flush;

  return 0;
};


# Align the requested field keys with the field values of a document.
#
# Parameters:
#   $field_keys - array reference of the requested key identifiers
#   $field_objs - array reference of field objects,
#                 each responding to ->key_id and ->term_id
#
# NOTE(review): Both lists are walked in parallel exactly once, so they are
# presumably expected in ascending key order - confirm against the pointer.
#
# Returns an array reference with one entry per requested key (at the same
# position), each being an array reference of the term identifiers found
# for that key. A key without any value is represented by [0], the
# ignorable null term.
sub _field_patterns {
  my ($field_keys, $field_objs) = @_;

  my @patterns;
  my $val_pos = 0;

  foreach my $key_pos (0 .. $#{$field_keys}) {
    my $key = $field_keys->[$key_pos];

    # Pass values of keys that precede the requested key
    $val_pos++ while $val_pos < @$field_objs
      && $field_objs->[$val_pos]->key_id < $key;

    # Collect all values of the requested key
    my @term_ids;
    while ($val_pos < @$field_objs
             && $field_objs->[$val_pos]->key_id == $key) {
      push @term_ids, $field_objs->[$val_pos]->term_id;
      $val_pos++;
    };

    # Add the ignorable null term if there is no value. The slot is always
    # set explicitly, as dereferencing an unset slot is fatal under strict.
    $patterns[$key_pos] = @term_ids ? \@term_ids : [0];
  };

  return \@patterns;
};
159
160
# Get the current posting, which is the current posting of the nested query
sub current {
  my $self = shift;
  return $self->{query}->current;
};
164
165
# Get collection, i.e. the group object created by the constructor
sub collection {
  my ($self) = @_;
  return $self->{groups};
};
170
171
Akrondc8dceb2017-08-22 20:25:39 +02001721;
Akron09ab24b2017-08-24 12:45:39 +0200173
174
Akrondc8dceb2017-08-22 20:25:39 +0200175__END__