| Akron | e4c2e41 | 2016-01-28 15:10:50 +0100 | [diff] [blame] | 1 | package KorAP::XML::Tokenizer::Spans; |
| Nils Diewald | 21a3e1a | 2014-04-28 18:48:16 +0000 | [diff] [blame] | 2 | use strict; |
| 3 | use warnings; |
| Akron | e4c2e41 | 2016-01-28 15:10:50 +0100 | [diff] [blame] | 4 | use KorAP::XML::Log; |
| Akron | 69a4a2f | 2016-01-17 12:55:50 +0100 | [diff] [blame] | 5 | use Data::Dumper; |
| Akron | e4c2e41 | 2016-01-28 15:10:50 +0100 | [diff] [blame] | 6 | use Mojo::Base 'KorAP::XML::Tokenizer::Units'; |
| Akron | 3ec0a1c | 2017-01-18 14:41:55 +0100 | [diff] [blame] | 7 | use Mojo::File; |
| Akron | e4c2e41 | 2016-01-28 15:10:50 +0100 | [diff] [blame] | 8 | use KorAP::XML::Tokenizer::Span; |
| Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 9 | use Mojo::ByteStream 'b'; |
| Nils Diewald | 7364d1f | 2013-11-05 19:26:35 +0000 | [diff] [blame] | 10 | use XML::Fast; |
| Nils Diewald | 092178e | 2013-11-26 16:18:48 +0000 | [diff] [blame] | 11 | use Try::Tiny; |
| Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 12 | |
| Nils Diewald | 7364d1f | 2013-11-05 19:26:35 +0000 | [diff] [blame] | 13 | has 'range'; |
| Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 14 | |
# Default builder for the 'log' attribute.
#
# Returns the Log::Log4perl logger for this package if Log4perl has been
# initialized, otherwise a KorAP::XML::Log instance. Each logger is kept
# in a state variable, so it is created at most once.
has 'log' => sub {
  if (Log::Log4perl->initialized()) {
    state $log4perl = Log::Log4perl->get_logger(__PACKAGE__);

    # The return has to happen inside this branch: a state variable
    # declared here is scoped to this block and is not visible to the
    # fallback code below.
    return $log4perl;
  };

  # Fallback logger when Log4perl is not initialized
  state $log = KorAP::XML::Log->new;
  return $log;
};
| 22 | |
| Nils Diewald | ff6d078 | 2014-06-10 18:26:36 +0000 | [diff] [blame] | 23 | |
# Parse span file
#
# Reads the file '<path><foundry>/<layer>.xml', takes the spans found
# below layer/spanList and passes each one to $self->span(from, to, span).
#
# Returns:
#   - nothing (bare return), if the file does not exist or parsing
#     raised an exception or a warning
#   - an empty array reference, if the file contains no spans
#   - otherwise an array reference with all true values returned
#     by $self->span
#
# Side effects: when spans are present, $self->should is set to the number
# of spans found and $self->have to the number accepted by $self->span.
# Problems are reported via $self->log->warn.
sub parse {
  my $self = shift;
  my $path = $self->path . $self->foundry . '/' . $self->layer . '.xml';

  unless (-e $path) {
    $self->log->warn('Unable to load file ' . $path);
    return;
  };

  my $file = b(Mojo::File->new($path)->slurp);

  my ($spans, $error);
  try {

    # Any warning raised while parsing is treated as a failure.
    # The message is logged, so the aborted parse can be diagnosed.
    local $SIG{__WARN__} = sub {
      my $msg = shift // '';
      chomp $msg;
      $self->log->warn(
        'Span error in ' . $path . ($msg ne '' ? ': ' . $msg : '')
      );
      $error = 1;
    };

    $spans = xml2hash(
      $file,
      text  => '#text',
      attr  => '-',
      array => ['span']
    )->{layer}->{spanList};
  }
  catch {
    $self->log->warn('Span error in ' . $path . ($_ ? ': ' . $_ : ''));
    $error = 1;
  };

  return if $error;

  # Only a hash with a 'span' entry carries spans - anything else
  # (undefined, plain text, other reference types) means "no spans"
  return [] unless ref($spans) eq 'HASH' && $spans->{span};

  $spans = $spans->{span};

  # Normalize a single span to a list of spans
  $spans = [$spans] if ref $spans ne 'ARRAY';

  my ($should, $have) = (0, 0);
  my @spans;

  foreach my $s (@$spans) {

    # Count every span found in the file ...
    $should++;

    my $span = $self->span(
      $s->{-from},
      $s->{-to},
      $s
    ) or next;

    # ... and every span that was accepted
    $have++;

    push(@spans, $span);
  };

  $self->should($should);
  $self->have($have);

  return \@spans;
};
| 86 | |
| 87 | 1; |