blob: 7b6477ad96683d28b3c35605c867e35748dbaef9 [file] [log] [blame]
Akron14ff0c42017-08-09 20:49:52 +02001package Krawfish::Index::Forward;
Akron1563b0c2017-08-10 19:58:04 +02002use Krawfish::Index::Forward::Pointer;
Akron14ff0c42017-08-09 20:49:52 +02003use Krawfish::Index::Forward::Doc;
Akron1563b0c2017-08-10 19:58:04 +02004use Krawfish::Log;
Akron14ff0c42017-08-09 20:49:52 +02005use warnings;
6use strict;
7
Akron1563b0c2017-08-10 19:58:04 +02008use constant DEBUG => 1;
9
Akron4a46e6e2017-08-16 17:49:16 +020010# This represents a forward index of the data,
11# accessible by document ID and subtoken offset.
12
13# Merging the forward index is pretty simple, as it only needs to be indexed
14# on the document level and then simply be appended.
15
16# TODO:
17# This is great for retrieving pagebreaks, annotations, primary data,
18# perhaps help on regex ...
19# But can this help to expand the context of a match to a certain element context?
20# Probably by retrieving the data with a certain maximum offset (say left 100 subtokens, right 100 subtokens)
21# and first check for the expanding element start on the left, then move to the right.
22#
23# TODO:
24# In case the term IDs are retrieved for surface sorting,
25# it may be useful to not have much data in memory.
26# Look into K::I::Subtokens for use of $term_ids there. It may not be crucial though.
27
28# TODO:
29# The forward index needs fast access to documents and positions,
30# to get term ids from contexts for use in the co-occurrence analysis.
31
32
Akron14ff0c42017-08-09 20:49:52 +020033# TODO:
34# This API needs to be backed up by a store version.
Akron4a46e6e2017-08-16 17:49:16 +020035# use Krawfish::Index::Store::V1::ForwardIndex;
Akron14ff0c42017-08-09 20:49:52 +020036
sub new {
  # Construct an empty forward index.
  #
  # Returns:
  #   A new object blessed into the invoking class, holding an empty
  #   document list and a last document id of -1 (no document added yet).
  my ($class) = @_;

  my %self = (
    docs        => [],
    last_doc_id => -1,
  );

  return bless \%self, $class;
};
45
46
47# Get last document identifier aka max_doc_id
sub last_doc_id {
  # Read-only accessor for the stored last_doc_id value.
  my ($self) = @_;
  return $self->{last_doc_id};
};
51
52
53# Accept a Krawfish::Koral::Document object
sub add {
  # Append a document to the forward index.
  #
  # Parameters:
  #   $doc - the document to index; it is handed unchanged to
  #          Krawfish::Index::Forward::Doc->new.
  # Returns:
  #   The identifier assigned to the document, which is also the
  #   position of the new entry in the docs list.
  my ($self, $doc) = @_;

  # Pre-increment: the id that is returned has to be the very same
  # slot the document is stored in. A post-increment would hand back
  # the previous value (-1 for the first document) instead.
  my $doc_id = ++$self->{last_doc_id};

  # TODO:
  #   use Krawfish::Index::Store::V1::ForwardIndex->new;
  $self->{docs}->[$doc_id] = Krawfish::Index::Forward::Doc->new($doc);

  return $doc_id;
};
65
66
# Get doc from list (as long as the list provides random access to docs)
sub doc {
  # Look up the entry stored under $doc_id in the docs list.
  #
  # Parameters:
  #   $doc_id - index into the docs list.
  # Returns:
  #   Whatever is stored at that index (undef if the slot is empty).
  my $self   = shift;
  my $doc_id = shift;

  if (DEBUG) {
    print_log('fwd', 'Get document for id ' . $doc_id);
  };

  return $self->{docs}->[$doc_id];
};
73
# Create a new pointer object on this forward index
sub pointer {
  # Build a Krawfish::Index::Forward::Pointer, passing this index
  # object to its constructor, and return the result.
  my ($self) = @_;
  return Krawfish::Index::Forward::Pointer->new($self);
};
79
80
811;