Blame - lib/Krawfish/Index/Forward.pm - KorAP/Krawfish-prototype

blob: 7b6477ad96683d28b3c35605c867e35748dbaef9 [file] [log] [blame]

Akron	14ff0c4	2017-08-09 20:49:52 +0200	[diff] [blame]	1	package Krawfish::Index::Forward;
Akron	1563b0c	2017-08-10 19:58:04 +0200	[diff] [blame]	2	use Krawfish::Index::Forward::Pointer;
Akron	14ff0c4	2017-08-09 20:49:52 +0200	[diff] [blame]	3	use Krawfish::Index::Forward::Doc;
Akron	1563b0c	2017-08-10 19:58:04 +0200	[diff] [blame]	4	use Krawfish::Log;
Akron	14ff0c4	2017-08-09 20:49:52 +0200	[diff] [blame]	5	use warnings;
				6	use strict;
				7
Akron	1563b0c	2017-08-10 19:58:04 +0200	[diff] [blame]	8	use constant DEBUG => 1;
				9
Akron	4a46e6e	2017-08-16 17:49:16 +0200	[diff] [blame^]	10	# This represents a forward index of the data,
				11	# accessible by document ID and subtoken offset.
				12
				13	# Merging the forward index is pretty simple, as it only needs to be indexed
				14	# on the document level and then simply be appended.
				15
				16	# TODO:
				17	# This is great for retrieving pagebreaks, annotations, primary data,
				18	# perhaps help on regex ...
				19	# But can this help to expand the context of a match to a certain element context?
				20	# Probably by retrieving the data with a certain maximum offset (say left 100 subtokens, right 100 subtokens)
				21	# and first check for the expanding element start on the left, then move to the right.
				22	#
				23	# TODO:
				24	# In case the term IDs are retrieved for surface sorting,
				25	# it may be useful to not have much data in memory.
				26	# Look into K::I::Subtokens for use of $term_ids there. It may not be crucial though.
				27
				28	# TODO:
				29	# The forward index needs fast access to documents and positions,
				30	# to get term ids from contexts for use in the co-occurrence analysis.
				31
				32
Akron	14ff0c4	2017-08-09 20:49:52 +0200	[diff] [blame]	33	# TODO:
				34	# This API needs to be backed up by a store version.
Akron	4a46e6e	2017-08-16 17:49:16 +0200	[diff] [blame^]	35	# use Krawfish::Index::Store::V1::ForwardIndex;
Akron	14ff0c4	2017-08-09 20:49:52 +0200	[diff] [blame]	36
# Constructor: create an empty forward index.
#
# The index starts without any documents; last_doc_id is
# initialised to -1 and docs to an empty list.
sub new {
  my ($class) = @_;

  my %index = (
    docs        => [],
    last_doc_id => -1,
  );

  return bless \%index, $class;
}
				45
				46
				47	# Get last document identifier aka max_doc_id
				48	sub last_doc_id {
				49	$_[0]->{last_doc_id};
				50	};
				51
				52
				53	# Accept a Krawfish::Koral::Document object
				54	sub add {
				55	my ($self, $doc) = @_;
				56	my $doc_id = $self->{last_doc_id}++;
				57
Akron	1563b0c	2017-08-10 19:58:04 +0200	[diff] [blame]	58	# TODO:
				59	# use Krawfish::Index::Store::V1::ForwardIndex->new;
				60	$self->{docs}->[$self->last_doc_id] =
				61	Krawfish::Index::Forward::Doc->new($doc);
Akron	14ff0c4	2017-08-09 20:49:52 +0200	[diff] [blame]	62
				63	return $doc_id;
				64	};
				65
				66
# Fetch the forward document stored under the given document identifier
# (works as long as the list provides random access to docs).
# Yields undef if nothing is stored in that slot.
sub doc {
  my ($self, $doc_id) = @_;

  if (DEBUG) {
    print_log('fwd', 'Get document for id ' . $doc_id);
  }

  my $docs = $self->{docs};
  return $docs->[$doc_id];
}
				73
# Create a new Krawfish::Index::Forward::Pointer object
# that is constructed with this forward index.
sub pointer {
  my ($self) = @_;

  my $pointer = Krawfish::Index::Forward::Pointer->new($self);
  return $pointer;
}
				79
				80
				81	1;