| Akron | 71fc14c | 2016-10-31 23:44:43 +0100 | [diff] [blame] | 1 | package Krawfish::Index::TokensList; |
| 2 | use strict; |
| 3 | use warnings; |
| 4 | |
| Akron | 555de3b | 2017-01-17 00:27:29 +0100 | [diff] [blame] | 5 | use constant DEBUG => 0; |
| Akron | 18ff592 | 2017-01-13 10:09:45 +0100 | [diff] [blame] | 6 | |
| Akron | 555de3b | 2017-01-17 00:27:29 +0100 | [diff] [blame] | 7 | # This is a special PostingsList to store the length of tokens |
| 8 | # in segments |
| Akron | 71fc14c | 2016-10-31 23:44:43 +0100 | [diff] [blame] | 9 | # |
| 10 | # It may also be used for extensions and distances with tokens |
| 11 | # (instead of segments) |
| 12 | # |
| Akron | 555de3b | 2017-01-17 00:27:29 +0100 | [diff] [blame] | 13 | # That's why this postingslist has a special API for extensions |
| 14 | # and word distances. |
| Akron | 71fc14c | 2016-10-31 23:44:43 +0100 | [diff] [blame] | 15 | # |
| Akron | 555de3b | 2017-01-17 00:27:29 +0100 | [diff] [blame] | 16 | # Structure may be: ([docid-delta]([seg-pos-delta][length-varbit])*)* |
| Akron | 71fc14c | 2016-10-31 23:44:43 +0100 | [diff] [blame] | 17 | # |
| Akron | 555de3b | 2017-01-17 00:27:29 +0100 | [diff] [blame] | 18 | # The problem is, this won't make it possible to go back and forth. |
| Akron | 71fc14c | 2016-10-31 23:44:43 +0100 | [diff] [blame] | 19 | |
# Constructor.
#
# Arguments (positional):
#   $index_file - stored under the 'index_file' key
#   $foundry    - stored under the 'foundry' key
#
# The new object starts with an empty posting array and a cursor
# ('pos') of -1.
sub new {
    my ($class, $index_file, $foundry) = @_;

    my %state = (
        array      => [],
        pos        => -1,
        index_file => $index_file,
        foundry    => $foundry,
    );

    return bless \%state, $class;
}
| 29 | |
# Add one posting to the end of the in-memory list.
#
# Arguments (positional):
#   $token  - only used for the debug message, it is not stored
#   $doc_id - first field of the stored entry
#   $pos    - second field of the stored entry
#   $end    - third field of the stored entry (may be undef)
#
# Returns the value of push(), i.e. the new number of entries.
sub append {
    my ($self, $token, $doc_id, $pos, $end) = @_;

    # Logging is compiled away while the DEBUG constant is false
    if (DEBUG) {
        print_log('toklist', "Appended $token with $doc_id, $pos" . ($end ? "-$end" : ''));
    }

    return push @{ $self->{array} }, [$doc_id, $pos, $end];
}
| 36 | |
# Forward declaration only: no body for next() is visible in this file.
# NOTE(review): presumably meant to advance the cursor kept in {pos} - confirm.
| 37 | sub next; |
| 38 | |
# Accessor for the current cursor value stored under the 'pos' key.
# (Shares its name with the pos() builtin, so call it as a method.)
sub pos {
    my $self = shift;
    return $self->{pos};
}
| 42 | |
# Return the entry at the current cursor position, i.e. the
# [doc_id, pos, end] array reference stored by append().
#
# Returns undef when the cursor does not point at an entry. This
# includes the initial cursor value of -1: a negative index would
# otherwise count from the END of the array and hand back the last
# appended entry before iteration has even started.
sub token {
    my $self = shift;
    my $pos  = $self->pos;

    # A single undef (not an empty list) keeps the return shape identical
    # to what an out-of-range array lookup yields.
    return( (defined $pos && $pos < 0) ? undef : $self->{array}->[$pos] );
}
| 46 | |
| 47 | |
# Forward declaration only: no body for freq() is visible in this file.
| 48 | sub freq; |
| Akron | 71fc14c | 2016-10-31 23:44:43 +0100 | [diff] [blame] | 49 | |
# Forward declaration only: no body for skip_to_doc() is visible in this file.
| 50 | sub skip_to_doc; |
| 51 | |
# Forward declaration only: no body for skip_to_pos() is visible in this file.
| 52 | sub skip_to_pos; |
| 53 | |
| Akron | 555de3b | 2017-01-17 00:27:29 +0100 | [diff] [blame] | 54 | |
# Intended contract: collect the start positions that lie within the
# min/max range to the left of $start, lowest position first, and
# return them as an array.
#
# Not implemented yet - the body is the '...' placeholder, so every
# call dies with "Unimplemented".
sub extend_to_left {
    my $self = shift;
    my ($start, $min, $max) = @_;

    # TODO: return an array of start positions
    ...
}
| 62 | |
# Intended contract: collect the end positions that lie within the
# min/max range to the right of $end, lowest position first, and
# return them as an array.
#
# Not implemented yet - the body is the '...' placeholder, so every
# call dies with "Unimplemented".
sub extend_to_right {
    my $self = shift;
    my ($end, $min, $max) = @_;

    # TODO: return an array of end positions
    ...
}
| 70 | |
# Intended contract: tell whether the number of tokens lying between
# $end and $start falls into the range $min..$max.
#
# Needed for token distance queries such as
#   a []{2,3} b
#
# Not implemented yet - the body is the '...' placeholder, so every
# call dies with "Unimplemented".
sub check_tokens_between {
    my $self = shift;
    my ($end, $start, $min, $max) = @_;

    # TODO: rule out impossible cases on segment level first,
    # only then count the tokens
    ...
}
| 83 | |
| 84 | |
| Akron | 71fc14c | 2016-10-31 23:44:43 +0100 | [diff] [blame] | 85 | 1; |
| Akron | 555de3b | 2017-01-17 00:27:29 +0100 | [diff] [blame] | 86 | |
| 87 | __END__ |