Clean up primary data handling (named array slots, drop unused Carp imports)
Change-Id: Id1188637806ba5fd29294b0aa01cbdefb7b3b62e
diff --git a/lib/KorAP/XML/Document/Primary.pm b/lib/KorAP/XML/Document/Primary.pm
index bb7e9f5..e760873 100644
--- a/lib/KorAP/XML/Document/Primary.pm
+++ b/lib/KorAP/XML/Document/Primary.pm
@@ -1,12 +1,17 @@
package KorAP::XML::Document::Primary;
use strict;
use warnings;
-use Carp qw/croak carp/;
use Mojo::ByteStream 'b';
use feature 'state';
use Packed::Array;
use utf8;
+use constant {
+ DATA => 0,
+ BYTES => 1,
+ XIP => 2
+};
+
# our $QUOT = b("„“”")->decode;
our $QUOT_RE = qr/[„“”]/;
@@ -22,15 +27,15 @@
my ($self, $from, $to) = @_;
# Get range data from primary
- return substr($self->[0], $from) if $from && !$to;
+ return substr($self->[DATA], $from) if $from && !$to;
# Get full data
- return $self->[0] unless $to;
+ return $self->[DATA] unless $to;
return if $to > $self->data_length;
# Return substring
- return (substr($self->[0], $from, $to - $from) // undef);
+ return (substr($self->[DATA], $from, $to - $from) // undef);
};
@@ -42,14 +47,14 @@
# Only start offset defined
if ($from && !$to) {
- return b(substr($self->[0], $from))->decode;
+ return b(substr($self->[DATA], $from))->decode;
};
# No offset defined
- return b($self->[0])->decode unless $to;
+ return b($self->[DATA])->decode unless $to;
# Get the substring based on offsets
- my $substr = substr($self->[0], $from, $to - $from);
+ my $substr = substr($self->[DATA], $from, $to - $from);
# Decode
return b($substr)->decode if defined $substr;
@@ -61,42 +66,34 @@
# The length of the primary text in characters
sub data_length {
- my $self = shift;
- return $self->[1] if $self->[1];
- $self->[1] = length($self->[0]);
- return $self->[1];
+ length($_[0]->[DATA]);
};
# Get correct offset
sub bytes2chars {
my $self = shift;
- unless ($self->[2]) {
- $self->[2] = $self->_calc_chars($self->[0]);
+ unless ($self->[BYTES]) {
+ $self->[BYTES] = _calc_chars($self->[DATA]);
};
- return $self->[2]->[shift];
+ return $self->[BYTES]->[shift];
};
# Get correct offset
sub xip2chars {
my $self = shift;
- unless ($self->[3]) {
- my $buffer = $self->[0];
-
+ unless ($self->[XIP]) {
# Hacky work around: replace fancy quotation marks for XIP
- $buffer =~ s{$QUOT_RE}{"}g;
-
- $self->[3] = $self->_calc_chars($buffer);
+ $self->[XIP] = _calc_chars($self->[DATA] =~ s{$QUOT_RE}{"}gr);
};
- return $self->[3]->[shift];
+ return $self->[XIP]->[shift];
};
# Calculate character offsets
sub _calc_chars {
use bytes;
- my ($self, $text) = @_;
tie my @array, 'Packed::Array';
@@ -107,14 +104,14 @@
my $c;
# Init array
- my $l = length($text);
+ my $l = length($_[0]);
$array[$l-1] = 0;
# Iterate over every character
while ($i <= $l) {
# Get actual character
- $c = substr($text, $i, 1);
+ $c = substr($_[0], $i, 1);
# store character position
$array[$i++] = $j;
@@ -123,16 +120,16 @@
if (ord($c & $leading) && ord($c & $start)) {
# Get the next byte - expecting a following character
- $c = substr($text, $i, 1);
+ $c = substr($_[0], $i, 1);
# Character is part of a multibyte
while (ord($c & $leading)) {
- # Set count
- $array[$i] = (ord($c & $start)) ? ++$j : $j;
+ # Set count
+ $array[$i] = (ord($c & $start)) ? ++$j : $j;
- # Get next character
- $c = substr($text, ++$i, 1);
+ # Get next character
+ $c = substr($_[0], ++$i, 1);
};
};
diff --git a/lib/KorAP/XML/Meta/Base.pm b/lib/KorAP/XML/Meta/Base.pm
index de9ad44..4a61d87 100644
--- a/lib/KorAP/XML/Meta/Base.pm
+++ b/lib/KorAP/XML/Meta/Base.pm
@@ -55,6 +55,7 @@
return join(' ', @{$self->{$_[0]} // []});
};
+
# Check if cached
# Cache differently!
sub is_cached {
@@ -93,7 +94,7 @@
if ($self->doc_sigle) {
$new{doc_sigle} = $self->doc_sigle;
if ($self->text_sigle) {
- $new{text_sigle} = $self->text_sigle;
+ $new{text_sigle} = $self->text_sigle;
}
}
};
diff --git a/lib/KorAP/XML/Tokenizer/Spans.pm b/lib/KorAP/XML/Tokenizer/Spans.pm
index fa91bc5..2722aec 100644
--- a/lib/KorAP/XML/Tokenizer/Spans.pm
+++ b/lib/KorAP/XML/Tokenizer/Spans.pm
@@ -48,10 +48,10 @@
return if $error;
if (ref $spans && $spans->{span}) {
- $spans = $spans->{span};
+ $spans = $spans->{span};
}
else {
- return [];
+ return [];
};
diff --git a/lib/KorAP/XML/Tokenizer/Tokens.pm b/lib/KorAP/XML/Tokenizer/Tokens.pm
index 3881979..50367d4 100644
--- a/lib/KorAP/XML/Tokenizer/Tokens.pm
+++ b/lib/KorAP/XML/Tokenizer/Tokens.pm
@@ -3,7 +3,6 @@
use Mojo::ByteStream 'b';
use Mojo::File;
use KorAP::XML::Tokenizer::Token;
-use Carp qw/croak carp/;
use File::Spec::Functions qw/catdir catfile/;
use XML::Fast;
use Try::Tiny;
@@ -15,7 +14,6 @@
sub parse {
my $self = shift;
- # my $path = $self->path . $self->foundry . '/' . $self->layer . '.xml';
my $path = catfile($self->path, $self->foundry, $self->layer . '.xml');
# Legacy data support
@@ -38,32 +36,31 @@
# Bug workaround
if ($self->foundry eq 'glemm') {
if (index($file, "</span\n") > 0 || index($file, "</span\r") > 0) {
- $file =~ s!</span[\n\r]!</span>\n!g;
+ $file =~ s!</span[\n\r]!</span>\n!g;
};
};
-# my $spans = Mojo::DOM->new($file);
-# $spans->xml(1);
+ # my $spans = Mojo::DOM->new($file);
+ # $spans->xml(1);
my ($spans, $error);
try {
- local $SIG{__WARN__} = sub {
- $error = 1;
- };
- $spans = xml2hash($file, text => '#text', attr => '-')->{layer}->{spanList};
- }
- catch {
- $self->log->warn('Span error in ' . $path . ($_ ? ': ' . $_ : ''));
+ local $SIG{__WARN__} = sub {
$error = 1;
+ };
+ $spans = xml2hash($file, text => '#text', attr => '-')->{layer}->{spanList};
+ } catch {
+ $self->log->warn('Span error in ' . $path . ($_ ? ': ' . $_ : ''));
+ $error = 1;
};
return if $error;
if (ref $spans && $spans->{span}) {
- $spans = $spans->{span};
+ $spans = $spans->{span};
}
else {
- return [];
+ return [];
};
$spans = [$spans] if ref $spans ne 'ARRAY';