blob: 2722aec803055486441df239ba6363d0f9e0f024 [file] [log] [blame]
Akrone4c2e412016-01-28 15:10:50 +01001package KorAP::XML::Tokenizer::Spans;
Nils Diewald21a3e1a2014-04-28 18:48:16 +00002use strict;
3use warnings;
Akrone4c2e412016-01-28 15:10:50 +01004use KorAP::XML::Log;
Akron69a4a2f2016-01-17 12:55:50 +01005use Data::Dumper;
Akrone4c2e412016-01-28 15:10:50 +01006use Mojo::Base 'KorAP::XML::Tokenizer::Units';
Akron3ec0a1c2017-01-18 14:41:55 +01007use Mojo::File;
Akrone4c2e412016-01-28 15:10:50 +01008use KorAP::XML::Tokenizer::Span;
Nils Diewald2db9ad02013-10-29 19:26:43 +00009use Mojo::ByteStream 'b';
Nils Diewald7364d1f2013-11-05 19:26:35 +000010use XML::Fast;
Nils Diewald092178e2013-11-26 16:18:48 +000011use Try::Tiny;
Nils Diewald2db9ad02013-10-29 19:26:43 +000012
# Accessor 'range'; no default value is set here.
# NOTE(review): not read or written anywhere in this file - presumably
# used by callers or subclasses; confirm.
has 'range';

# Logger used for warnings of this class.
#
# If Log::Log4perl is initialized, the Log4perl logger for this package
# is returned. Otherwise a KorAP::XML::Log object is used as fallback.
# Both loggers are kept in 'state' variables, so each one is created
# only once per process.
has 'log' => sub {
  if (Log::Log4perl->initialized()) {
    state $log4perl = Log::Log4perl->get_logger(__PACKAGE__);

    # Return here - otherwise the fallback logger below would
    # always win and the Log4perl logger would never be used
    return $log4perl;
  };

  state $fallback = KorAP::XML::Log->new;
  return $fallback;
};
22
Nils Diewaldff6d0782014-06-10 18:26:36 +000023
# Parse span file
#
# Reads the file '<path><foundry>/<layer>.xml', takes all 'span'
# elements below layer/spanList and turns each of them into a span
# by calling $self->span(from, to, raw_span).
#
# Returns
#   - nothing, if the file does not exist or if the XML parser
#     died or emitted a warning (a message is logged in all cases),
#   - an empty array reference, if the file contains no spans,
#   - an array reference of all created spans otherwise.
#
# When spans were found, 'should' is set to the number of span
# elements in the file and 'have' to the number of spans that
# could be created.
sub parse {
  my $self = shift;
  my $path = $self->path . $self->foundry . '/' . $self->layer . '.xml';

  unless (-e $path) {
    $self->log->warn('Unable to load file ' . $path);
    return;
  };

  my $file = b(Mojo::File->new($path)->slurp);

  my ($spans, $error);
  try {

    # Warnings of the parser are treated as errors. They are logged,
    # so a failing file does not vanish without any diagnostic.
    local $SIG{__WARN__} = sub {
      my $msg = shift // '';
      chomp $msg;
      $self->log->warn('Span warning in ' . $path . ($msg ? ': ' . $msg : ''));
      $error = 1;
    };

    # 'array' forces 'span' to be an array, even for a single element
    $spans = xml2hash(
      $file,
      text  => '#text',
      attr  => '-',
      array => ['span']
    )->{layer}->{spanList};
  }
  catch {
    $self->log->warn('Span error in ' . $path . ($_ ? ': ' . $_ : ''));
    $error = 1;
  };

  return if $error;

  # No span list or an empty span list
  if (ref $spans && $spans->{span}) {
    $spans = $spans->{span};
  }
  else {
    return [];
  };

  # Normalize a single span to a list of spans
  $spans = [$spans] if ref $spans ne 'ARRAY';

  # $should counts all spans in the file,
  # $have counts the spans that were successfully created
  my ($should, $have) = (0, 0);

  my @spans;

  foreach my $s (@$spans) {

    $should++;

    # Skip spans that can't be created
    my $span = $self->span(
      $s->{-from},
      $s->{-to},
      $s
    ) or next;

    $have++;

    push(@spans, $span);
  };

  $self->should($should);
  $self->have($have);

  return \@spans;
};
86
871;