lib/KorAP/XML/Tokenizer/Range.pm - KorAP/KorAP-XML-Krill - Gitiles

 package KorAP::XML::Tokenizer::Range;
 use strict;
 use warnings;
 use Array::IntSpan;
 use Carp 'carp';

 # Pattern for the gap markers written by gap(): a '!' followed by two
 # optionally signed integers separated by ':' at the end of the string.
 # Capture 1 is read by before(), capture 2 by after().
 our $SPAN_RE = qr/!([-+]?\d+):([-+]?\d+)$/;

 # When set to a true value, before() and after() report offsets
 # without a stored value via carp.
 our $debug = 0;

 # Constructor. The object is a blessed reference to a scalar that
 # holds a fresh Array::IntSpan instance; methods reach it as $$self.
 sub new {
   my $class = shift;
   my $span = Array::IntSpan->new;
   return bless \$span, $class;
 };


 # Store a value for a span of integers.
 # All arguments after the invocant (documented as start offset,
 # end offset and token position) are passed on unchanged to
 # set_range() of the wrapped span object; its result is returned.
 sub set {
   my $self = shift;
   return $$self->set_range(@_);
 };


 # Mark the span from x to y as a gap.
 # The stored value is the string "!P:N", where N is the third
 # argument and P is N - 1. The result of set_range() is returned.
 sub gap {
   my ($self, $from, $to, $next) = @_;
   my $prev = $next - 1;
   my $marker = join ':', "!$prev", $next;
   return $$self->set_range($from, $to, $marker);
 };


 # Fetch the value stored for an offset, treating gaps as unset.
 # Returns nothing (undef or the empty list, depending on context)
 # when the wrapped span has no value for the offset or when the
 # value is a gap marker, i.e. a string starting with '!'.
 sub lookup {
   my ($self, $offset) = @_;
   my $value = $$self->lookup($offset);

   return unless defined $value;
   return if substr($value, 0, 1) eq '!';
   return $value;
 };


 # Resolve the token position before a character offset.
 # Returns nothing if no value is stored for the offset; the
 # returned position is never smaller than 0.
 sub before {
   my ($self, $offset) = @_;

   # Query the wrapped span object directly instead of the lookup()
   # method of this class, because gap markers are needed here
   my $hit = $$self->lookup($offset);

   # No value stored for this offset
   if (!defined $hit) {
     carp "There is no value for $offset" if $debug;
     return;
   };

   # Not a gap marker: step back one position from the stored one.
   # This may be inaccurate, but lifts recall.
   return($hit > 1 ? $hit - 1 : 0) unless $hit =~ $SPAN_RE;

   # Gap marker: its first number is the preceding position,
   # negative numbers are clamped to 0
   my $prev = $1;
   return $prev >= 0 ? $prev : 0;
 };


 # Resolve the token position after a character offset.
 # Returns nothing if no value is stored for the offset.
 sub after {
   my ($self, $offset) = @_;

   # Direct query on the wrapped span object, so that gap markers
   # are visible
   my $hit = $$self->lookup($offset);

   if (!defined $hit) {
     carp "There is no value for $offset" if $debug;
     return;
   };

   # Gap marker: its second number is the following position
   return $2 if $hit =~ $SPAN_RE;

   # Not a gap: the stored position is likely wrong here. The most
   # common non-gap and non-exact-match is the situation of
   # "y<e> z</e>", where e is in the range of the y token -
   # so the next position is returned.
   return $hit + 1;
 };


 # Serialize the ranges returned by get_range(0, 100, '...') of the
 # wrapped span object. Every entry is rendered as its elements
 # joined by commas inside square brackets; the entries are
 # concatenated and '...' is appended.
 sub to_string {
   my ($self) = @_;
   my $ranges = $$self->get_range(0, 100, '...');

   my $out = '';
   foreach my $entry (@$ranges) {
     $out .= '[' . join(',', @$entry) . ']';
   };

   return $out . '...';
 };

 1;


 __END__

 =pod

 This module is used for mapping character offsets to positions in
 the stream of tokens.

 =head1 set

   $range->set(34, 46, 2);

 Start-offset, end-offset, position in token stream.

 =head1 gap

   $range->gap(47, 49, 5);

 Start-offset, end-offset, position of the token following the gap.
 The position preceding the gap is derived as this value minus one.

 =head1 before

   my $pos = $range->before(15);

 Return the token position in the token stream
 before the character offset.
 Be aware: The smallest before-position is 0.

 =head1 after

   my $pos = $range->after(34);

 Return the token position in the token stream
 following the character offset.
 If the character offset lies inside a token,
 the position after that token (its position plus one) is returned.

 =head1 to_string

   print $range->to_string;

 Serialize the first 100 character positions
 in a string representation.
	package KorAP::XML::Tokenizer::Range;
	use strict;
	use warnings;
	use Array::IntSpan;
	use Carp 'carp';

	# Pattern for the gap markers written by gap(): a '!' followed by two
	# optionally signed integers separated by ':' at the end of the string.
	# Capture 1 is read by before(), capture 2 by after().
	our $SPAN_RE = qr/!([-+]?\d+):([-+]?\d+)$/;

	# When set to a true value, before() and after() report offsets
	# without a stored value via carp.
	our $debug = 0;

	# Constructor. The object is a blessed reference to a scalar that
	# holds a fresh Array::IntSpan instance; methods reach it as $$self.
	sub new {
	  my ($class) = @_;
	  my $span = Array::IntSpan->new;
	  my $self = \$span;
	  return bless $self, $class;
	};


	# Store a value for a span of integers.
	# All arguments after the invocant (documented as start offset,
	# end offset and token position) are passed on unchanged to
	# set_range() of the wrapped span object; its result is returned.
	sub set {
	  my $self = shift;
	  my $span = $$self;
	  return $span->set_range(@_);
	};


	# Mark the span from x to y as a gap.
	# The stored value is the string "!P:N", where N is the third
	# argument and P is N - 1. The result of set_range() is returned.
	sub gap {
	  my ($self, $from, $to, $next) = @_;
	  my $prev = $next - 1;
	  my $marker = join ':', "!$prev", $next;
	  return $$self->set_range($from, $to, $marker);
	};


	# Lookup range - ignore gaps!
	# Returns the value the wrapped span object holds for the given
	# offset. Returns nothing (undef or the empty list, depending on
	# context) when no value is stored or when the value is a gap
	# marker, i.e. a string starting with '!'.
	sub lookup {
	  my ($self, $offset) = @_;
	  my $value = $$self->lookup($offset);

	  # The logical-or had been mangled into an escaped "\|\|",
	  # which is not valid Perl
	  return if !defined $value || index($value, '!') == 0;
	  return $value;
	};


	# Resolve the token position before a character offset.
	# Returns nothing if no value is stored for the offset; the
	# returned position is never smaller than 0.
	sub before {
	  my ($self, $offset) = @_;

	  # Query the wrapped span object directly instead of the lookup()
	  # method of this class, because gap markers are needed here
	  my $hit = $$self->lookup($offset);

	  # No value stored for this offset
	  if (!defined $hit) {
	    carp "There is no value for $offset" if $debug;
	    return;
	  };

	  # Not a gap marker: step back one position from the stored one.
	  # This may be inaccurate, but lifts recall.
	  return($hit > 1 ? $hit - 1 : 0) unless $hit =~ $SPAN_RE;

	  # Gap marker: its first number is the preceding position,
	  # negative numbers are clamped to 0
	  my $prev = $1;
	  return $prev >= 0 ? $prev : 0;
	};


	# Resolve the token position after a character offset.
	# Returns nothing if no value is stored for the offset.
	sub after {
	  my ($self, $offset) = @_;

	  # Direct query on the wrapped span object, so that gap markers
	  # are visible
	  my $hit = $$self->lookup($offset);

	  if (!defined $hit) {
	    carp "There is no value for $offset" if $debug;
	    return;
	  };

	  # Gap marker: its second number is the following position
	  return $2 if $hit =~ $SPAN_RE;

	  # Not a gap: the stored position is likely wrong here. The most
	  # common non-gap and non-exact-match is the situation of
	  # "y<e> z</e>", where e is in the range of the y token -
	  # so the next position is returned.
	  return $hit + 1;
	};


	# Serialize the ranges returned by get_range(0, 100, '...') of the
	# wrapped span object. Every entry is rendered as its elements
	# joined by commas inside square brackets; the entries are
	# concatenated and '...' is appended.
	sub to_string {
	  my ($self) = @_;
	  my $ranges = $$self->get_range(0, 100, '...');

	  my @chunks;
	  for my $entry (@$ranges) {
	    push @chunks, '[' . join(',', @$entry) . ']';
	  };

	  return join('', @chunks, '...');
	};

	1;


	__END__

	=pod

	This module is used for mapping character offsets to positions in
	the stream of tokens.

	=head1 set

	$range->set(34, 46, 2);

	Start-offset, end-offset, position in token stream.

	=head1 gap

	$range->gap(47, 49, 5);

	Start-offset, end-offset, position of the token following the gap.
	The position preceding the gap is derived as this value minus one.

	=head1 before

	my $pos = $range->before(15);

	Return the token position in the token stream
	before the character offset.
	Be aware: The smallest before-position is 0.

	=head1 after

	my $pos = $range->after(34);

	Return the token position in the token stream
	following the character offset.
	If the character offset lies inside a token,
	the position after that token (its position plus one) is returned.

	=head1 to_string

	print $range->to_string;

	Serialize the first 100 character positions
	in a string representation.