blob: 78e3f8e86e66e15413b05bf7c12f40e7950f294d [file] [log] [blame]
package KorAP::XML::Tokenizer::Tokens;
use Mojo::Base 'KorAP::XML::Tokenizer::Units';
use Mojo::ByteStream 'b';
use Mojo::File;
use KorAP::XML::Tokenizer::Token;
use File::Spec::Functions qw/catdir catfile/;
use XML::Fast;
use Try::Tiny;
use Log::Any qw($log);
has 'log' => sub {
$log;
};
sub parse {
my $self = shift;
my $path = catfile($self->path, $self->foundry, $self->layer . '.xml');
# Legacy data support
unless (-e $path) {
if ($self->layer eq 'namedentities') {
$path = $self->path . $self->foundry . '/ne_combined.xml';
return unless -e $path;
}
elsif ($self->layer eq 'morpho' && $self->foundry eq 'glemm') {
$path = catfile($self->path, $self->foundry, 'glemm.xml');
return unless -e $path;
}
else {
return;
};
};
my $file = b(Mojo::File->new($path)->slurp);
# Bug workaround
if ($self->foundry eq 'glemm') {
if (index($file, "</span\n") > 0 || index($file, "</span\r") > 0) {
$file =~ s!</span[\n\r]!</span>\n!g;
};
};
# my $spans = Mojo::DOM->new($file);
# $spans->xml(1);
my ($spans, $error);
try {
local $SIG{__WARN__} = sub {
$error = 1;
};
$spans = xml2hash($file, text => '#text', attr => '-')->{layer}->{spanList};
} catch {
$log->warn('Span error in ' . $path . ($_ ? ': ' . $_ : ''));
$error = 1;
};
return if $error;
if (ref $spans && $spans->{span}) {
$spans = $spans->{span};
}
else {
return [];
};
$spans = [$spans] if ref $spans ne 'ARRAY';
my ($should, $have) = (0,0);
my @tokens;
foreach my $s (@$spans) {
$should++;
my $token = $self->token(
$s->{-from},
$s->{-to},
$s
) or next;
$have++;
push(@tokens, $token);
};
$self->should($should);
$self->have($have);
return \@tokens;
};
1;