blob: 8b3e9925a4d6c63973ec572353457dfa0fa8ce0a [file] [log] [blame]
package KorAP::XML::Tokenizer::Range;
use strict;
use warnings;
use Array::IntSpan;
use Carp 'carp';
our $SPAN_RE = qr/!([-+]?\d+):([-+]?\d+)$/;
our $debug = 0;
sub new {
my $range = Array::IntSpan->new;
bless \$range, shift;
};
# Set integer range from x to y with z
sub set {
${shift()}->set_range(@_);
};
# Set gap in range from x to y with !z-1:z
sub gap {
${shift()}->set_range(
$_[0], $_[1],
'!' . ($_[2] - 1) . ':' . $_[2]
);
};
# Lookup range - ignore gaps!
sub lookup {
my $x = ${$_[0]}->lookup( $_[1] );
return if (!defined $x || index($x, '!') == 0);
return $x;
};
# Lookup the position before the character offset
sub before {
my $self = shift;
my $offset = shift;
# Be aware - this uses the array-lookup, not the object method!
my $found = $$self->lookup( $offset );
# Nothing set here
unless (defined $found) {
carp "There is no value for $offset" if $debug;
return;
};
# Hit a gap,
# return preceding match
if ($found =~ $SPAN_RE) {
return $1 >= 0 ? $1 : 0;
}
else {
# Didn't hit a gap
# this however may be inaccurate
# but lifts recall
return $found > 1 ? $found - 1 : 0;
};
};
# Lookup the position after the character offset
sub after {
my $self = shift;
my $offset = shift;
my $found = $$self->lookup( $offset );
unless (defined $found) {
carp "There is no value for $offset" if $debug;
return;
};
if ($found =~ $SPAN_RE) {
return $2;
}
else {
# The current position is likely wrong. The most common non-gap
# and non-exact-match is in the situation of
# "y<e> z</e>", where e is in the range of the y token.
$found + 1;
};
};
sub to_string {
my $self = shift;
return join(
'',
map {'['.join(',',@$_).']'}
@{$$self->get_range(0,100,'...')}
) . '...';
};
1;
__END__
=pod
This module is used for mapping character offsets to positions in
the stream of tokens.
=head1 set
$range->set(34, 46, 2);
Start-offset, end-offset, position in token stream.
=head1 gap
$range->gap(47, 49, 5);
Start-offset, end-offset, preceding position in token stream.
=head1 before
my $pos = $range->before(15);
Return the token position in the token stream
before the character offset.
Be aware: The smallest before-position is 0.
=head1 after
my $pos = $range->after(34);
Return the token position in the token stream
following the character offset.
In case, the character offset is part of a token,
the current token position is returned.
=head1 to_string
print $range->to_string;
Serialize the first 100 character positions
in a string representation.