| package KorAP::XML::Document::Primary; |
| use strict; |
| use warnings; |
| use Carp qw/croak carp/; |
| use Mojo::ByteStream 'b'; |
| use feature 'state'; |
| use Packed::Array; |
| use utf8; |
| |
| # our $QUOT = b("„“”")->decode; |
| our $QUOT_RE = qr/[„“”]/; |
| |
| # Constructor |
| sub new { |
| my $class = shift; |
| bless [$_[0]], $class; |
| }; |
| |
| |
| # Get the data as a substring |
| sub data { |
| my ($self, $from, $to) = @_; |
| |
| # Get range data from primary |
| return substr($self->[0], $from) if $from && !$to; |
| |
| # Get full data |
| return $self->[0] unless $to; |
| |
| # Return substring |
| return (substr($self->[0], $from, $to - $from) // undef); |
| }; |
| |
| |
| # Get the data using byte ofsets |
| sub data_bytes { |
| my ($self, $from, $to) = @_; |
| |
| use bytes; |
| |
| # Only start offset defined |
| if ($from && !$to) { |
| return b(substr($self->[0], $from))->decode; |
| }; |
| |
| # No offset defined |
| return b($self->[0])->decode unless $to; |
| |
| # Get the substring based on offsets |
| my $substr = substr($self->[0], $from, $to - $from); |
| |
| # Decode |
| return b($substr)->decode if defined $substr; |
| |
| # No data |
| return; |
| }; |
| |
| |
| # The length of the primary text in characters |
| sub data_length { |
| my $self = shift; |
| return $self->[1] if $self->[1]; |
| $self->[1] = length($self->[0]); |
| return $self->[1]; |
| }; |
| |
| |
| # Get correct offset |
| sub bytes2chars { |
| my $self = shift; |
| unless ($self->[2]) { |
| $self->[2] = $self->_calc_chars($self->[0]); |
| }; |
| return $self->[2]->[shift]; |
| }; |
| |
| |
| # Get correct offset |
| sub xip2chars { |
| my $self = shift; |
| unless ($self->[3]) { |
| my $buffer = $self->[0]; |
| |
| # Hacky work around: replace fancy quotation marks for XIP |
| $buffer =~ s{$QUOT_RE}{"}g; |
| |
| $self->[3] = $self->_calc_chars($buffer); |
| }; |
| return $self->[3]->[shift]; |
| }; |
| |
| |
| # Calculate character offsets |
| sub _calc_chars { |
| use bytes; |
| my ($self, $text) = @_; |
| |
| tie my @array, 'Packed::Array'; |
| |
| state $leading = pack( 'B8', '10000000' ); |
| state $start = pack( 'B8', '01000000' ); |
| |
| my ($i, $j) = (0,0); |
| my $c; |
| |
| # Init array |
| my $l = length($text); |
| $array[$l-1] = 0; |
| |
| # Iterate over every character |
| while ($i <= $l) { |
| |
| # Get actual character |
| $c = substr($text, $i, 1); |
| |
| # store character position |
| $array[$i++] = $j; |
| |
| # This is the start of a multibyte sequence |
| if (ord($c & $leading) && ord($c & $start)) { |
| |
| # Get the next byte - expecting a following character |
| $c = substr($text, $i, 1); |
| |
| # Character is part of a multibyte |
| while (ord($c & $leading)) { |
| |
| # Set count |
| $array[$i] = (ord($c & $start)) ? ++$j : $j; |
| |
| # Get next character |
| $c = substr($text, ++$i, 1); |
| }; |
| }; |
| |
| $j++; |
| }; |
| return \@array; |
| }; |
| |
| |
| 1; |
| |
| |
| __END__ |
| |
| =pod |
| |
| =head1 NAME |
| |
| KorAP::XML::Document::Primary |
| |
| =head1 SYNOPSIS |
| |
| my $text = KorAP::XML::Document::Primary('Das ist mein Text'); |
| print $text->data(2,5); |
| print $text->data_length; |
| |
| |
| =head1 DESCRIPTION |
| |
| Represent textual data with annotated character and byte offsets. |
| |
| |
| =head1 ATTRIBUTES |
| |
| =head2 data_length |
| |
| print $text->data_length; |
| |
| The textual length in number of characters. |
| |
| |
| =head1 METHODS |
| |
| =head2 data |
| |
| print $text->data; |
| print $text->data(4); |
| print $text->data(5,17); |
| |
| Return the textual data as a substring. Accepts a starting offset and the length of |
| the requested data. The data will be wrapped in an utf-8 encoded L<Mojo::ByteStream>. |
| |
| =head2 bytes2chars |
| |
| print $text->bytes2chars(40); |
| |
| Calculates the character offset based on a given byte offset. |
| |
| =cut |