blob: e2db10dcea291e7fe7a6faa2c409b1194d2f05e4 [file] [log] [blame]
Akronbc9d54c2017-01-14 02:27:45 +01001package Krawfish::Result::Group;
Akron18ff5922017-01-13 10:09:45 +01002use Krawfish::Log;
3use strict;
4use warnings;
5
Akron81181512017-01-19 09:52:34 +01006use constant DEBUG => 0;
Akron555de3b2017-01-17 00:27:29 +01007
8# TODO: Use Krawfish::Posting::Group;
Akron18ff5922017-01-13 10:09:45 +01009
Akroneb12ac22017-01-19 00:05:12 +010010# Group matches based on certain criteria, for example:
11# - for record matches
12# - metadata!
13# - This is useful to group document matches for corpus browsing!
Akron1f3feac2017-05-05 17:05:45 +020014# - BUT: This would probably need a witness mechanism, so for a match,
15# some fields can be loaded, e.g. a matching document sigle will return
16# the document title.
Akroneb12ac22017-01-19 00:05:12 +010017# - for span matches
# - metadata
19# - this is an extension to facets, where snippet frequencies are grouped
20# based on a certain facet.
21# - having facets in a first step may improve the distributed aggregation
# (as the central node then knows which facets are most or least common)
23# - this grouping doesn't seem beneficial - as the facet view already helps here
Akron18ff5922017-01-13 10:09:45 +010024#
Akroneb12ac22017-01-19 00:05:12 +010025# - innertextual!
Akron18ff5922017-01-13 10:09:45 +010026# - has a certain identical class on surface
27# - has the same starting characters of a word
28# - has the same ending characters of a word
29# - has the same POS of a certain class (this is actually pretty hard!)
30# - this may mean to modify the search a bit to lift the posting types
31# and make a class, like [orth=der & base/p=*]
32# - At least the postingslist of base/p=* should be merged in parallel!
33#
34# This is already possible in C2 so it needs to be implemented!
35
Akron1f3feac2017-05-05 17:05:45 +020036# A group has the following structure for matches:
Akron18ff5922017-01-13 10:09:45 +010037# {
38# criterion => [freq, doc_freq]
39# }
Akron1f3feac2017-05-05 17:05:45 +020040#
41# For docs, freq and doc_freq are identical
42#
Akronbc9d54c2017-01-14 02:27:45 +010043# Where criterion is a classed sequence of criteria
Akron18ff5922017-01-13 10:09:45 +010044# with class information, like
45# 1:der|2:Baum => []
Akronbc9d54c2017-01-14 02:27:45 +010046# Sometimes it may indicate tokens instead of classes though ...
Akron1f3feac2017-05-05 17:05:45 +020047#
48# With a witness, the group has:
49# {
50# criterion => [freq, doc_freq, match]
51# }
52# The match can be anything - so it may even be a first example snippet.
53#
# But with multiple class() corpora, there may be more:
Akron1f3feac2017-05-05 17:05:45 +020055#
56# {
57# criterion => [freq, doc_freq, freq, doc_freq, freq, doc_freq, ...]
58# }
Akron97a7cba2017-05-26 13:39:06 +020059#
60# or even
61#
62# {
63# criterion => [freq, doc_freq, match, freq, doc_freq, match, freq, doc_freq, match ...]
64# }
Akron1f3feac2017-05-05 17:05:45 +020065
66
67
68# WARNING!
69# This kind of result can not be limited or sorted on an earlier level,
70# as the number of matches is only clear after everything is aggregated.
Akron18ff5922017-01-13 10:09:45 +010071
# Constructor for the grouping result.
#
# Arguments (positional):
#   $query     - the wrapped query object; it is stored untouched here and
#                only iterated later, when the groups are aggregated
#   $criterion - group criterion object, created by the caller, that
#                defines the criterion matches are grouped by
#
# Returns the blessed object. No matches are consumed at construction time.
sub new {
  my ($class, $query, $criterion) = @_;

  my %self = (
    query     => $query,
    criterion => $criterion,

    # Cursor into the list of groups; -1 means "before the first group"
    pos       => -1,

    # Group to fill with matches and group info
    # (as class1=>X, class2=>Y)
    groups    => [],
  );

  return bless \%self, $class;
}
89
Akron555de3b2017-01-17 00:27:29 +010090
# Aggregate all matches of the wrapped query into groups.
# This could, nonetheless, be implemented like Facets ...
#
# Runs at most once per object (guarded by the 'init' counter); later calls
# return immediately with an empty return value.
#
# For every match the criterion object yields a group key via get_group().
# Per key two counters are kept:
#   freq     - number of matches that fell into the group
#   doc_freq - number of distinct documents contributing to the group
#
# The document last counted is remembered PER GROUP. Tracking a single
# last-seen doc_id across all groups would skip the doc_freq increment for
# every group but the first one hit inside a document.
# NOTE(review): like the previous implementation, this assumes that matches
# of one document are delivered consecutively by the query - confirm.
#
# On completion $self->{groups} holds an array reference of
#   [criterion, freq, doc_freq]
# entries, sorted by descending freq; equal frequencies are ordered by
# criterion string so that the result is deterministic.
#
# Returns 1 after the aggregation was performed.
sub _init {
  my $self = shift;

  # Only aggregate once
  return if $self->{init}++;

  my $criterion = $self->{criterion};
  my $query     = $self->{query};

  # group key => [freq, doc_freq]
  my %groups;

  # group key => doc_id of the last match counted for doc_freq
  my %last_doc_of;

  # Iterate over all matches of the query
  while ($query->next) {

    # Get current match if there is any
    my $current = $query->current or last;

    my $key    = $criterion->get_group($current);
    my $doc_id = $current->doc_id;

    # Potentially create new group
    my $group = ($groups{$key} //= [0, 0]);

    # TODO: Should work with classes!
    # Like
    # foreach my $nr ($match->get_corpus_classes) {
    #   $group->[$nr * 2]++;
    # }

    # Increment freq
    $group->[0]++;

    # Increment doc_freq once per document and group
    if (!defined $last_doc_of{$key} || $last_doc_of{$key} != $doc_id) {
      $group->[1]++;

      # TODO: If requested, add a witness!

      $last_doc_of{$key} = $doc_id;
    }
  }

  # Sorted by freq by default, ties broken by criterion string
  my @order = sort {
    $groups{$b}->[0] <=> $groups{$a}->[0]
      or $a cmp $b
  } keys %groups;

  # Value is stored as [criterion, freq, doc_freq] for retrieval
  $self->{groups} = [
    map { [ $_, $groups{$_}->[0], $groups{$_}->[1] ] } @order
  ];

  return 1;
}
148
149
# Number of entries currently stored in the group list.
# Only reads $self->{groups}; it does not trigger the aggregation itself.
sub freq {
  my ($self) = @_;
  my $groups = $self->{groups};
  return scalar @$groups;
}
154
Akron97a7cba2017-05-26 13:39:06 +0200155
# Advance the cursor to the next group.
#
# Makes sure the groups are aggregated first, then moves the position
# value forward by one. The position is incremented on every call, even
# when there is no further group.
#
# Returns 1 if the new position addresses a group, otherwise nothing.
sub next {
  my ($self) = @_;

  $self->_init;

  # Remember the position before moving on
  my $before = $self->{pos}++;

  return 1 if $before < $self->freq - 1;
  return;
}
165
166
# Forward declaration only: the name is announced, but no body for
# current() is visible in this file.
sub current;


# Return the information on the group at the current position.
#
# The stored entry [criterion, freq, doc_freq] is flattened and handed to
# the criterion object's to_hash(); whatever that method produces is
# returned (described as a hash reference by the original author).
#
# Returns nothing if there is no current group, i.e. before the first
# next() or after the list is exhausted. Without this guard a negative
# position would address the list from its end, and an out-of-range
# position would die when dereferencing an undefined entry.
sub current_group {
  my $self = shift;

  my $pos    = $self->{pos};
  my $groups = $self->{groups};

  # Position is outside of the aggregated list
  return if $pos < 0 || $pos > $#{$groups};

  # Make a hash from criterion
  return $self->{criterion}->to_hash(@{ $groups->[$pos] });
}
Akron18ff5922017-01-13 10:09:45 +0100178
179
# Serialize the grouping as
#   groupBy(<criterion>:<query>)
# using the to_string() output of the criterion and of the wrapped query.
sub to_string {
  my ($self) = @_;

  # Scalar assignments keep both calls in scalar context
  my $criterion_str = $self->{criterion}->to_string;
  my $query_str     = $self->{query}->to_string;

  return 'groupBy(' . $criterion_str . ':' . $query_str . ')';
}
187
1881;